]> git.saurik.com Git - apple/icu.git/blob - icuSources/i18n/rulebasedcollator.cpp
ICU-62107.0.1.tar.gz
[apple/icu.git] / icuSources / i18n / rulebasedcollator.cpp
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 * Copyright (C) 1996-2015, International Business Machines
6 * Corporation and others. All Rights Reserved.
7 *******************************************************************************
8 * rulebasedcollator.cpp
9 *
10 * (replaced the former tblcoll.cpp)
11 *
12 * created on: 2012feb14 with new and old collation code
13 * created by: Markus W. Scherer
14 */
15
16 #include "unicode/utypes.h"
17
18 #if !UCONFIG_NO_COLLATION
19
20 #include "unicode/coll.h"
21 #include "unicode/coleitr.h"
22 #include "unicode/localpointer.h"
23 #include "unicode/locid.h"
24 #include "unicode/sortkey.h"
25 #include "unicode/tblcoll.h"
26 #include "unicode/ucol.h"
27 #include "unicode/uiter.h"
28 #include "unicode/uloc.h"
29 #include "unicode/uniset.h"
30 #include "unicode/unistr.h"
31 #include "unicode/usetiter.h"
32 #include "unicode/utf8.h"
33 #include "unicode/uversion.h"
34 #include "bocsu.h"
35 #include "charstr.h"
36 #include "cmemory.h"
37 #include "collation.h"
38 #include "collationcompare.h"
39 #include "collationdata.h"
40 #include "collationdatareader.h"
41 #include "collationfastlatin.h"
42 #include "collationiterator.h"
43 #include "collationkeys.h"
44 #include "collationroot.h"
45 #include "collationsets.h"
46 #include "collationsettings.h"
47 #include "collationtailoring.h"
48 #include "cstring.h"
49 #include "uassert.h"
50 #include "ucol_imp.h"
51 #include "uhash.h"
52 #include "uitercollationiterator.h"
53 #include "ustr_imp.h"
54 #include "utf16collationiterator.h"
55 #include "utf8collationiterator.h"
56 #include "uvectr64.h"
57
58 U_NAMESPACE_BEGIN
59
60 namespace {
61
62 class FixedSortKeyByteSink : public SortKeyByteSink {
63 public:
64 FixedSortKeyByteSink(char *dest, int32_t destCapacity)
65 : SortKeyByteSink(dest, destCapacity) {}
66 virtual ~FixedSortKeyByteSink();
67
68 private:
69 virtual void AppendBeyondCapacity(const char *bytes, int32_t n, int32_t length);
70 virtual UBool Resize(int32_t appendCapacity, int32_t length);
71 };
72
73 FixedSortKeyByteSink::~FixedSortKeyByteSink() {}
74
75 void
76 FixedSortKeyByteSink::AppendBeyondCapacity(const char *bytes, int32_t /*n*/, int32_t length) {
77 // buffer_ != NULL && bytes != NULL && n > 0 && appended_ > capacity_
78 // Fill the buffer completely.
79 int32_t available = capacity_ - length;
80 if (available > 0) {
81 uprv_memcpy(buffer_ + length, bytes, available);
82 }
83 }
84
85 UBool
86 FixedSortKeyByteSink::Resize(int32_t /*appendCapacity*/, int32_t /*length*/) {
87 return FALSE;
88 }
89
90 } // namespace
91
92 // Not in an anonymous namespace, so that it can be a friend of CollationKey.
93 class CollationKeyByteSink : public SortKeyByteSink {
94 public:
95 CollationKeyByteSink(CollationKey &key)
96 : SortKeyByteSink(reinterpret_cast<char *>(key.getBytes()), key.getCapacity()),
97 key_(key) {}
98 virtual ~CollationKeyByteSink();
99
100 private:
101 virtual void AppendBeyondCapacity(const char *bytes, int32_t n, int32_t length);
102 virtual UBool Resize(int32_t appendCapacity, int32_t length);
103
104 CollationKey &key_;
105 };
106
107 CollationKeyByteSink::~CollationKeyByteSink() {}
108
109 void
110 CollationKeyByteSink::AppendBeyondCapacity(const char *bytes, int32_t n, int32_t length) {
111 // buffer_ != NULL && bytes != NULL && n > 0 && appended_ > capacity_
112 if (Resize(n, length)) {
113 uprv_memcpy(buffer_ + length, bytes, n);
114 }
115 }
116
117 UBool
118 CollationKeyByteSink::Resize(int32_t appendCapacity, int32_t length) {
119 if (buffer_ == NULL) {
120 return FALSE; // allocation failed before already
121 }
122 int32_t newCapacity = 2 * capacity_;
123 int32_t altCapacity = length + 2 * appendCapacity;
124 if (newCapacity < altCapacity) {
125 newCapacity = altCapacity;
126 }
127 if (newCapacity < 200) {
128 newCapacity = 200;
129 }
130 uint8_t *newBuffer = key_.reallocate(newCapacity, length);
131 if (newBuffer == NULL) {
132 SetNotOk();
133 return FALSE;
134 }
135 buffer_ = reinterpret_cast<char *>(newBuffer);
136 capacity_ = newCapacity;
137 return TRUE;
138 }
139
140 RuleBasedCollator::RuleBasedCollator(const RuleBasedCollator &other)
141 : Collator(other),
142 data(other.data),
143 settings(other.settings),
144 tailoring(other.tailoring),
145 cacheEntry(other.cacheEntry),
146 validLocale(other.validLocale),
147 explicitlySetAttributes(other.explicitlySetAttributes),
148 actualLocaleIsSameAsValid(other.actualLocaleIsSameAsValid) {
149 settings->addRef();
150 cacheEntry->addRef();
151 }
152
153 RuleBasedCollator::RuleBasedCollator(const uint8_t *bin, int32_t length,
154 const RuleBasedCollator *base, UErrorCode &errorCode)
155 : data(NULL),
156 settings(NULL),
157 tailoring(NULL),
158 cacheEntry(NULL),
159 validLocale(""),
160 explicitlySetAttributes(0),
161 actualLocaleIsSameAsValid(FALSE) {
162 if(U_FAILURE(errorCode)) { return; }
163 if(bin == NULL || length == 0 || base == NULL) {
164 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
165 return;
166 }
167 const CollationTailoring *root = CollationRoot::getRoot(errorCode);
168 if(U_FAILURE(errorCode)) { return; }
169 if(base->tailoring != root) {
170 errorCode = U_UNSUPPORTED_ERROR;
171 return;
172 }
173 LocalPointer<CollationTailoring> t(new CollationTailoring(base->tailoring->settings));
174 if(t.isNull() || t->isBogus()) {
175 errorCode = U_MEMORY_ALLOCATION_ERROR;
176 return;
177 }
178 CollationDataReader::read(base->tailoring, bin, length, *t, errorCode);
179 if(U_FAILURE(errorCode)) { return; }
180 t->actualLocale.setToBogus();
181 adoptTailoring(t.orphan(), errorCode);
182 }
183
184 RuleBasedCollator::RuleBasedCollator(const CollationCacheEntry *entry)
185 : data(entry->tailoring->data),
186 settings(entry->tailoring->settings),
187 tailoring(entry->tailoring),
188 cacheEntry(entry),
189 validLocale(entry->validLocale),
190 explicitlySetAttributes(0),
191 actualLocaleIsSameAsValid(FALSE) {
192 settings->addRef();
193 cacheEntry->addRef();
194 }
195
196 RuleBasedCollator::~RuleBasedCollator() {
197 SharedObject::clearPtr(settings);
198 SharedObject::clearPtr(cacheEntry);
199 }
200
201 void
202 RuleBasedCollator::adoptTailoring(CollationTailoring *t, UErrorCode &errorCode) {
203 if(U_FAILURE(errorCode)) {
204 t->deleteIfZeroRefCount();
205 return;
206 }
207 U_ASSERT(settings == NULL && data == NULL && tailoring == NULL && cacheEntry == NULL);
208 cacheEntry = new CollationCacheEntry(t->actualLocale, t);
209 if(cacheEntry == NULL) {
210 errorCode = U_MEMORY_ALLOCATION_ERROR;
211 t->deleteIfZeroRefCount();
212 return;
213 }
214 data = t->data;
215 settings = t->settings;
216 settings->addRef();
217 tailoring = t;
218 cacheEntry->addRef();
219 validLocale = t->actualLocale;
220 actualLocaleIsSameAsValid = FALSE;
221 }
222
223 Collator *
224 RuleBasedCollator::clone() const {
225 return new RuleBasedCollator(*this);
226 }
227
228 RuleBasedCollator &RuleBasedCollator::operator=(const RuleBasedCollator &other) {
229 if(this == &other) { return *this; }
230 SharedObject::copyPtr(other.settings, settings);
231 tailoring = other.tailoring;
232 SharedObject::copyPtr(other.cacheEntry, cacheEntry);
233 data = tailoring->data;
234 validLocale = other.validLocale;
235 explicitlySetAttributes = other.explicitlySetAttributes;
236 actualLocaleIsSameAsValid = other.actualLocaleIsSameAsValid;
237 return *this;
238 }
239
// ICU's RTTI implementation macro for this class.
// NOTE(review): presumably supplies the class-ID functions declared in the
// header; see the macro's definition in unicode/uobject.h to confirm.
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedCollator)
241
242 UBool
243 RuleBasedCollator::operator==(const Collator& other) const {
244 if(this == &other) { return TRUE; }
245 if(!Collator::operator==(other)) { return FALSE; }
246 const RuleBasedCollator &o = static_cast<const RuleBasedCollator &>(other);
247 if(*settings != *o.settings) { return FALSE; }
248 if(data == o.data) { return TRUE; }
249 UBool thisIsRoot = data->base == NULL;
250 UBool otherIsRoot = o.data->base == NULL;
251 U_ASSERT(!thisIsRoot || !otherIsRoot); // otherwise their data pointers should be ==
252 if(thisIsRoot != otherIsRoot) { return FALSE; }
253 if((thisIsRoot || !tailoring->rules.isEmpty()) &&
254 (otherIsRoot || !o.tailoring->rules.isEmpty())) {
255 // Shortcut: If both collators have valid rule strings, then compare those.
256 if(tailoring->rules == o.tailoring->rules) { return TRUE; }
257 }
258 // Different rule strings can result in the same or equivalent tailoring.
259 // The rule strings are optional in ICU resource bundles, although included by default.
260 // cloneBinary() drops the rule string.
261 UErrorCode errorCode = U_ZERO_ERROR;
262 LocalPointer<UnicodeSet> thisTailored(getTailoredSet(errorCode));
263 LocalPointer<UnicodeSet> otherTailored(o.getTailoredSet(errorCode));
264 if(U_FAILURE(errorCode)) { return FALSE; }
265 if(*thisTailored != *otherTailored) { return FALSE; }
266 // For completeness, we should compare all of the mappings;
267 // or we should create a list of strings, sort it with one collator,
268 // and check if both collators compare adjacent strings the same
269 // (order & strength, down to quaternary); or similar.
270 // Testing equality of collators seems unusual.
271 return TRUE;
272 }
273
274 int32_t
275 RuleBasedCollator::hashCode() const {
276 int32_t h = settings->hashCode();
277 if(data->base == NULL) { return h; } // root collator
278 // Do not rely on the rule string, see comments in operator==().
279 UErrorCode errorCode = U_ZERO_ERROR;
280 LocalPointer<UnicodeSet> set(getTailoredSet(errorCode));
281 if(U_FAILURE(errorCode)) { return 0; }
282 UnicodeSetIterator iter(*set);
283 while(iter.next() && !iter.isString()) {
284 h ^= data->getCE32(iter.getCodepoint());
285 }
286 return h;
287 }
288
289 void
290 RuleBasedCollator::setLocales(const Locale &requested, const Locale &valid,
291 const Locale &actual) {
292 if(actual == tailoring->actualLocale) {
293 actualLocaleIsSameAsValid = FALSE;
294 } else {
295 U_ASSERT(actual == valid);
296 actualLocaleIsSameAsValid = TRUE;
297 }
298 // Do not modify tailoring.actualLocale:
299 // We cannot be sure that that would be thread-safe.
300 validLocale = valid;
301 (void)requested; // Ignore, see also ticket #10477.
302 }
303
304 Locale
305 RuleBasedCollator::getLocale(ULocDataLocaleType type, UErrorCode& errorCode) const {
306 if(U_FAILURE(errorCode)) {
307 return Locale::getRoot();
308 }
309 switch(type) {
310 case ULOC_ACTUAL_LOCALE:
311 return actualLocaleIsSameAsValid ? validLocale : tailoring->actualLocale;
312 case ULOC_VALID_LOCALE:
313 case ULOC_REQUESTED_LOCALE: // Apple: keep treating as ULOC_VALID_LOCALE, apps depend on it <rdar://problem/19546211>
314 return validLocale;
315 default:
316 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
317 return Locale::getRoot();
318 }
319 }
320
321 const char *
322 RuleBasedCollator::internalGetLocaleID(ULocDataLocaleType type, UErrorCode &errorCode) const {
323 if(U_FAILURE(errorCode)) {
324 return NULL;
325 }
326 const Locale *result;
327 switch(type) {
328 case ULOC_ACTUAL_LOCALE:
329 result = actualLocaleIsSameAsValid ? &validLocale : &tailoring->actualLocale;
330 break;
331 case ULOC_VALID_LOCALE:
332 case ULOC_REQUESTED_LOCALE: // Apple: keep treating as ULOC_VALID_LOCALE, apps depend on it <rdar://problem/19546211>
333 result = &validLocale;
334 break;
335 default:
336 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
337 return NULL;
338 }
339 if(result->isBogus()) { return NULL; }
340 const char *id = result->getName();
341 return id[0] == 0 ? "root" : id;
342 }
343
344 const UnicodeString&
345 RuleBasedCollator::getRules() const {
346 return tailoring->rules;
347 }
348
349 void
350 RuleBasedCollator::getRules(UColRuleOption delta, UnicodeString &buffer) const {
351 if(delta == UCOL_TAILORING_ONLY) {
352 buffer = tailoring->rules;
353 return;
354 }
355 // UCOL_FULL_RULES
356 buffer.remove();
357 CollationLoader::appendRootRules(buffer);
358 buffer.append(tailoring->rules).getTerminatedBuffer();
359 }
360
361 void
362 RuleBasedCollator::getVersion(UVersionInfo version) const {
363 uprv_memcpy(version, tailoring->version, U_MAX_VERSION_LENGTH);
364 version[0] += (UCOL_RUNTIME_VERSION << 4) + (UCOL_RUNTIME_VERSION >> 4);
365 }
366
367 UnicodeSet *
368 RuleBasedCollator::getTailoredSet(UErrorCode &errorCode) const {
369 if(U_FAILURE(errorCode)) { return NULL; }
370 UnicodeSet *tailored = new UnicodeSet();
371 if(tailored == NULL) {
372 errorCode = U_MEMORY_ALLOCATION_ERROR;
373 return NULL;
374 }
375 if(data->base != NULL) {
376 TailoredSet(tailored).forData(data, errorCode);
377 if(U_FAILURE(errorCode)) {
378 delete tailored;
379 return NULL;
380 }
381 }
382 return tailored;
383 }
384
385 void
386 RuleBasedCollator::internalGetContractionsAndExpansions(
387 UnicodeSet *contractions, UnicodeSet *expansions,
388 UBool addPrefixes, UErrorCode &errorCode) const {
389 if(U_FAILURE(errorCode)) { return; }
390 if(contractions != NULL) {
391 contractions->clear();
392 }
393 if(expansions != NULL) {
394 expansions->clear();
395 }
396 ContractionsAndExpansions(contractions, expansions, NULL, addPrefixes).forData(data, errorCode);
397 }
398
399 void
400 RuleBasedCollator::internalAddContractions(UChar32 c, UnicodeSet &set, UErrorCode &errorCode) const {
401 if(U_FAILURE(errorCode)) { return; }
402 ContractionsAndExpansions(&set, NULL, NULL, FALSE).forCodePoint(data, c, errorCode);
403 }
404
405 const CollationSettings &
406 RuleBasedCollator::getDefaultSettings() const {
407 return *tailoring->settings;
408 }
409
410 UColAttributeValue
411 RuleBasedCollator::getAttribute(UColAttribute attr, UErrorCode &errorCode) const {
412 if(U_FAILURE(errorCode)) { return UCOL_DEFAULT; }
413 int32_t option;
414 switch(attr) {
415 case UCOL_FRENCH_COLLATION:
416 option = CollationSettings::BACKWARD_SECONDARY;
417 break;
418 case UCOL_ALTERNATE_HANDLING:
419 return settings->getAlternateHandling();
420 case UCOL_CASE_FIRST:
421 return settings->getCaseFirst();
422 case UCOL_CASE_LEVEL:
423 option = CollationSettings::CASE_LEVEL;
424 break;
425 case UCOL_NORMALIZATION_MODE:
426 option = CollationSettings::CHECK_FCD;
427 break;
428 case UCOL_STRENGTH:
429 return (UColAttributeValue)settings->getStrength();
430 case UCOL_HIRAGANA_QUATERNARY_MODE:
431 // Deprecated attribute, unsettable.
432 return UCOL_OFF;
433 case UCOL_NUMERIC_COLLATION:
434 option = CollationSettings::NUMERIC;
435 break;
436 default:
437 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
438 return UCOL_DEFAULT;
439 }
440 return ((settings->options & option) == 0) ? UCOL_OFF : UCOL_ON;
441 }
442
443 void
444 RuleBasedCollator::setAttribute(UColAttribute attr, UColAttributeValue value,
445 UErrorCode &errorCode) {
446 UColAttributeValue oldValue = getAttribute(attr, errorCode);
447 if(U_FAILURE(errorCode)) { return; }
448 if(value == oldValue) {
449 setAttributeExplicitly(attr);
450 return;
451 }
452 const CollationSettings &defaultSettings = getDefaultSettings();
453 if(settings == &defaultSettings) {
454 if(value == UCOL_DEFAULT) {
455 setAttributeDefault(attr);
456 return;
457 }
458 }
459 CollationSettings *ownedSettings = SharedObject::copyOnWrite(settings);
460 if(ownedSettings == NULL) {
461 errorCode = U_MEMORY_ALLOCATION_ERROR;
462 return;
463 }
464
465 switch(attr) {
466 case UCOL_FRENCH_COLLATION:
467 ownedSettings->setFlag(CollationSettings::BACKWARD_SECONDARY, value,
468 defaultSettings.options, errorCode);
469 break;
470 case UCOL_ALTERNATE_HANDLING:
471 ownedSettings->setAlternateHandling(value, defaultSettings.options, errorCode);
472 break;
473 case UCOL_CASE_FIRST:
474 ownedSettings->setCaseFirst(value, defaultSettings.options, errorCode);
475 break;
476 case UCOL_CASE_LEVEL:
477 ownedSettings->setFlag(CollationSettings::CASE_LEVEL, value,
478 defaultSettings.options, errorCode);
479 break;
480 case UCOL_NORMALIZATION_MODE:
481 ownedSettings->setFlag(CollationSettings::CHECK_FCD, value,
482 defaultSettings.options, errorCode);
483 break;
484 case UCOL_STRENGTH:
485 ownedSettings->setStrength(value, defaultSettings.options, errorCode);
486 break;
487 case UCOL_HIRAGANA_QUATERNARY_MODE:
488 // Deprecated attribute. Check for valid values but do not change anything.
489 if(value != UCOL_OFF && value != UCOL_ON && value != UCOL_DEFAULT) {
490 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
491 }
492 break;
493 case UCOL_NUMERIC_COLLATION:
494 ownedSettings->setFlag(CollationSettings::NUMERIC, value, defaultSettings.options, errorCode);
495 break;
496 default:
497 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
498 break;
499 }
500 if(U_FAILURE(errorCode)) { return; }
501 setFastLatinOptions(*ownedSettings);
502 if(value == UCOL_DEFAULT) {
503 setAttributeDefault(attr);
504 } else {
505 setAttributeExplicitly(attr);
506 }
507 }
508
509 Collator &
510 RuleBasedCollator::setMaxVariable(UColReorderCode group, UErrorCode &errorCode) {
511 if(U_FAILURE(errorCode)) { return *this; }
512 // Convert the reorder code into a MaxVariable number, or UCOL_DEFAULT=-1.
513 int32_t value;
514 if(group == UCOL_REORDER_CODE_DEFAULT) {
515 value = UCOL_DEFAULT;
516 } else if(UCOL_REORDER_CODE_FIRST <= group && group <= UCOL_REORDER_CODE_CURRENCY) {
517 value = group - UCOL_REORDER_CODE_FIRST;
518 } else {
519 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
520 return *this;
521 }
522 CollationSettings::MaxVariable oldValue = settings->getMaxVariable();
523 if(value == oldValue) {
524 setAttributeExplicitly(ATTR_VARIABLE_TOP);
525 return *this;
526 }
527 const CollationSettings &defaultSettings = getDefaultSettings();
528 if(settings == &defaultSettings) {
529 if(value == UCOL_DEFAULT) {
530 setAttributeDefault(ATTR_VARIABLE_TOP);
531 return *this;
532 }
533 }
534 CollationSettings *ownedSettings = SharedObject::copyOnWrite(settings);
535 if(ownedSettings == NULL) {
536 errorCode = U_MEMORY_ALLOCATION_ERROR;
537 return *this;
538 }
539
540 if(group == UCOL_REORDER_CODE_DEFAULT) {
541 group = (UColReorderCode)(UCOL_REORDER_CODE_FIRST + defaultSettings.getMaxVariable());
542 }
543 uint32_t varTop = data->getLastPrimaryForGroup(group);
544 U_ASSERT(varTop != 0);
545 ownedSettings->setMaxVariable(value, defaultSettings.options, errorCode);
546 if(U_FAILURE(errorCode)) { return *this; }
547 ownedSettings->variableTop = varTop;
548 setFastLatinOptions(*ownedSettings);
549 if(value == UCOL_DEFAULT) {
550 setAttributeDefault(ATTR_VARIABLE_TOP);
551 } else {
552 setAttributeExplicitly(ATTR_VARIABLE_TOP);
553 }
554 return *this;
555 }
556
557 UColReorderCode
558 RuleBasedCollator::getMaxVariable() const {
559 return (UColReorderCode)(UCOL_REORDER_CODE_FIRST + settings->getMaxVariable());
560 }
561
562 uint32_t
563 RuleBasedCollator::getVariableTop(UErrorCode & /*errorCode*/) const {
564 return settings->variableTop;
565 }
566
567 uint32_t
568 RuleBasedCollator::setVariableTop(const UChar *varTop, int32_t len, UErrorCode &errorCode) {
569 if(U_FAILURE(errorCode)) { return 0; }
570 if(varTop == NULL && len !=0) {
571 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
572 return 0;
573 }
574 if(len < 0) { len = u_strlen(varTop); }
575 if(len == 0) {
576 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
577 return 0;
578 }
579 UBool numeric = settings->isNumeric();
580 int64_t ce1, ce2;
581 if(settings->dontCheckFCD()) {
582 UTF16CollationIterator ci(data, numeric, varTop, varTop, varTop + len);
583 ce1 = ci.nextCE(errorCode);
584 ce2 = ci.nextCE(errorCode);
585 } else {
586 FCDUTF16CollationIterator ci(data, numeric, varTop, varTop, varTop + len);
587 ce1 = ci.nextCE(errorCode);
588 ce2 = ci.nextCE(errorCode);
589 }
590 if(ce1 == Collation::NO_CE || ce2 != Collation::NO_CE) {
591 errorCode = U_CE_NOT_FOUND_ERROR;
592 return 0;
593 }
594 setVariableTop((uint32_t)(ce1 >> 32), errorCode);
595 return settings->variableTop;
596 }
597
598 uint32_t
599 RuleBasedCollator::setVariableTop(const UnicodeString &varTop, UErrorCode &errorCode) {
600 return setVariableTop(varTop.getBuffer(), varTop.length(), errorCode);
601 }
602
603 void
604 RuleBasedCollator::setVariableTop(uint32_t varTop, UErrorCode &errorCode) {
605 if(U_FAILURE(errorCode)) { return; }
606 if(varTop != settings->variableTop) {
607 // Pin the variable top to the end of the reordering group which contains it.
608 // Only a few special groups are supported.
609 int32_t group = data->getGroupForPrimary(varTop);
610 if(group < UCOL_REORDER_CODE_FIRST || UCOL_REORDER_CODE_CURRENCY < group) {
611 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
612 return;
613 }
614 uint32_t v = data->getLastPrimaryForGroup(group);
615 U_ASSERT(v != 0 && v >= varTop);
616 varTop = v;
617 if(varTop != settings->variableTop) {
618 CollationSettings *ownedSettings = SharedObject::copyOnWrite(settings);
619 if(ownedSettings == NULL) {
620 errorCode = U_MEMORY_ALLOCATION_ERROR;
621 return;
622 }
623 ownedSettings->setMaxVariable(group - UCOL_REORDER_CODE_FIRST,
624 getDefaultSettings().options, errorCode);
625 if(U_FAILURE(errorCode)) { return; }
626 ownedSettings->variableTop = varTop;
627 setFastLatinOptions(*ownedSettings);
628 }
629 }
630 if(varTop == getDefaultSettings().variableTop) {
631 setAttributeDefault(ATTR_VARIABLE_TOP);
632 } else {
633 setAttributeExplicitly(ATTR_VARIABLE_TOP);
634 }
635 }
636
637 int32_t
638 RuleBasedCollator::getReorderCodes(int32_t *dest, int32_t capacity,
639 UErrorCode &errorCode) const {
640 if(U_FAILURE(errorCode)) { return 0; }
641 if(capacity < 0 || (dest == NULL && capacity > 0)) {
642 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
643 return 0;
644 }
645 int32_t length = settings->reorderCodesLength;
646 if(length == 0) { return 0; }
647 if(length > capacity) {
648 errorCode = U_BUFFER_OVERFLOW_ERROR;
649 return length;
650 }
651 uprv_memcpy(dest, settings->reorderCodes, length * 4);
652 return length;
653 }
654
655 void
656 RuleBasedCollator::setReorderCodes(const int32_t *reorderCodes, int32_t length,
657 UErrorCode &errorCode) {
658 if(U_FAILURE(errorCode)) { return; }
659 if(length < 0 || (reorderCodes == NULL && length > 0)) {
660 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
661 return;
662 }
663 if(length == 1 && reorderCodes[0] == UCOL_REORDER_CODE_NONE) {
664 length = 0;
665 }
666 if(length == settings->reorderCodesLength &&
667 uprv_memcmp(reorderCodes, settings->reorderCodes, length * 4) == 0) {
668 return;
669 }
670 const CollationSettings &defaultSettings = getDefaultSettings();
671 if(length == 1 && reorderCodes[0] == UCOL_REORDER_CODE_DEFAULT) {
672 if(settings != &defaultSettings) {
673 CollationSettings *ownedSettings = SharedObject::copyOnWrite(settings);
674 if(ownedSettings == NULL) {
675 errorCode = U_MEMORY_ALLOCATION_ERROR;
676 return;
677 }
678 ownedSettings->copyReorderingFrom(defaultSettings, errorCode);
679 setFastLatinOptions(*ownedSettings);
680 }
681 return;
682 }
683 CollationSettings *ownedSettings = SharedObject::copyOnWrite(settings);
684 if(ownedSettings == NULL) {
685 errorCode = U_MEMORY_ALLOCATION_ERROR;
686 return;
687 }
688 ownedSettings->setReordering(*data, reorderCodes, length, errorCode);
689 setFastLatinOptions(*ownedSettings);
690 }
691
692 void
693 RuleBasedCollator::setFastLatinOptions(CollationSettings &ownedSettings) const {
694 ownedSettings.fastLatinOptions = CollationFastLatin::getOptions(
695 data, ownedSettings,
696 ownedSettings.fastLatinPrimaries, UPRV_LENGTHOF(ownedSettings.fastLatinPrimaries));
697 }
698
699 UCollationResult
700 RuleBasedCollator::compare(const UnicodeString &left, const UnicodeString &right,
701 UErrorCode &errorCode) const {
702 if(U_FAILURE(errorCode)) { return UCOL_EQUAL; }
703 return doCompare(left.getBuffer(), left.length(),
704 right.getBuffer(), right.length(), errorCode);
705 }
706
707 UCollationResult
708 RuleBasedCollator::compare(const UnicodeString &left, const UnicodeString &right,
709 int32_t length, UErrorCode &errorCode) const {
710 if(U_FAILURE(errorCode) || length == 0) { return UCOL_EQUAL; }
711 if(length < 0) {
712 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
713 return UCOL_EQUAL;
714 }
715 int32_t leftLength = left.length();
716 int32_t rightLength = right.length();
717 if(leftLength > length) { leftLength = length; }
718 if(rightLength > length) { rightLength = length; }
719 return doCompare(left.getBuffer(), leftLength,
720 right.getBuffer(), rightLength, errorCode);
721 }
722
723 UCollationResult
724 RuleBasedCollator::compare(const UChar *left, int32_t leftLength,
725 const UChar *right, int32_t rightLength,
726 UErrorCode &errorCode) const {
727 if(U_FAILURE(errorCode)) { return UCOL_EQUAL; }
728 if((left == NULL && leftLength != 0) || (right == NULL && rightLength != 0)) {
729 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
730 return UCOL_EQUAL;
731 }
732 // Make sure both or neither strings have a known length.
733 // We do not optimize for mixed length/termination.
734 if(leftLength >= 0) {
735 if(rightLength < 0) { rightLength = u_strlen(right); }
736 } else {
737 if(rightLength >= 0) { leftLength = u_strlen(left); }
738 }
739 return doCompare(left, leftLength, right, rightLength, errorCode);
740 }
741
742 UCollationResult
743 RuleBasedCollator::compareUTF8(const StringPiece &left, const StringPiece &right,
744 UErrorCode &errorCode) const {
745 if(U_FAILURE(errorCode)) { return UCOL_EQUAL; }
746 const uint8_t *leftBytes = reinterpret_cast<const uint8_t *>(left.data());
747 const uint8_t *rightBytes = reinterpret_cast<const uint8_t *>(right.data());
748 if((leftBytes == NULL && !left.empty()) || (rightBytes == NULL && !right.empty())) {
749 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
750 return UCOL_EQUAL;
751 }
752 return doCompare(leftBytes, left.length(), rightBytes, right.length(), errorCode);
753 }
754
755 UCollationResult
756 RuleBasedCollator::internalCompareUTF8(const char *left, int32_t leftLength,
757 const char *right, int32_t rightLength,
758 UErrorCode &errorCode) const {
759 if(U_FAILURE(errorCode)) { return UCOL_EQUAL; }
760 if((left == NULL && leftLength != 0) || (right == NULL && rightLength != 0)) {
761 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
762 return UCOL_EQUAL;
763 }
764 // Make sure both or neither strings have a known length.
765 // We do not optimize for mixed length/termination.
766 if(leftLength >= 0) {
767 if(rightLength < 0) { rightLength = uprv_strlen(right); }
768 } else {
769 if(rightLength >= 0) { leftLength = uprv_strlen(left); }
770 }
771 return doCompare(reinterpret_cast<const uint8_t *>(left), leftLength,
772 reinterpret_cast<const uint8_t *>(right), rightLength, errorCode);
773 }
774
775 namespace {
776
777 /**
778 * Abstract iterator for identical-level string comparisons.
779 * Returns FCD code points and handles temporary switching to NFD.
780 */
781 class NFDIterator : public UObject {
782 public:
783 NFDIterator() : index(-1), length(0) {}
784 virtual ~NFDIterator() {}
785 /**
786 * Returns the next code point from the internal normalization buffer,
787 * or else the next text code point.
788 * Returns -1 at the end of the text.
789 */
790 UChar32 nextCodePoint() {
791 if(index >= 0) {
792 if(index == length) {
793 index = -1;
794 } else {
795 UChar32 c;
796 U16_NEXT_UNSAFE(decomp, index, c);
797 return c;
798 }
799 }
800 return nextRawCodePoint();
801 }
802 /**
803 * @param nfcImpl
804 * @param c the last code point returned by nextCodePoint() or nextDecomposedCodePoint()
805 * @return the first code point in c's decomposition,
806 * or c itself if it was decomposed already or if it does not decompose
807 */
808 UChar32 nextDecomposedCodePoint(const Normalizer2Impl &nfcImpl, UChar32 c) {
809 if(index >= 0) { return c; }
810 decomp = nfcImpl.getDecomposition(c, buffer, length);
811 if(decomp == NULL) { return c; }
812 index = 0;
813 U16_NEXT_UNSAFE(decomp, index, c);
814 return c;
815 }
816 protected:
817 /**
818 * Returns the next text code point in FCD order.
819 * Returns -1 at the end of the text.
820 */
821 virtual UChar32 nextRawCodePoint() = 0;
822 private:
823 const UChar *decomp;
824 UChar buffer[4];
825 int32_t index;
826 int32_t length;
827 };
828
829 class UTF16NFDIterator : public NFDIterator {
830 public:
831 UTF16NFDIterator(const UChar *text, const UChar *textLimit) : s(text), limit(textLimit) {}
832 protected:
833 virtual UChar32 nextRawCodePoint() {
834 if(s == limit) { return U_SENTINEL; }
835 UChar32 c = *s++;
836 if(limit == NULL && c == 0) {
837 s = NULL;
838 return U_SENTINEL;
839 }
840 UChar trail;
841 if(U16_IS_LEAD(c) && s != limit && U16_IS_TRAIL(trail = *s)) {
842 ++s;
843 c = U16_GET_SUPPLEMENTARY(c, trail);
844 }
845 return c;
846 }
847
848 const UChar *s;
849 const UChar *limit;
850 };
851
852 class FCDUTF16NFDIterator : public UTF16NFDIterator {
853 public:
854 FCDUTF16NFDIterator(const Normalizer2Impl &nfcImpl, const UChar *text, const UChar *textLimit)
855 : UTF16NFDIterator(NULL, NULL) {
856 UErrorCode errorCode = U_ZERO_ERROR;
857 const UChar *spanLimit = nfcImpl.makeFCD(text, textLimit, NULL, errorCode);
858 if(U_FAILURE(errorCode)) { return; }
859 if(spanLimit == textLimit || (textLimit == NULL && *spanLimit == 0)) {
860 s = text;
861 limit = spanLimit;
862 } else {
863 str.setTo(text, (int32_t)(spanLimit - text));
864 {
865 ReorderingBuffer buffer(nfcImpl, str);
866 if(buffer.init(str.length(), errorCode)) {
867 nfcImpl.makeFCD(spanLimit, textLimit, &buffer, errorCode);
868 }
869 }
870 if(U_SUCCESS(errorCode)) {
871 s = str.getBuffer();
872 limit = s + str.length();
873 }
874 }
875 }
876 private:
877 UnicodeString str;
878 };
879
880 class UTF8NFDIterator : public NFDIterator {
881 public:
882 UTF8NFDIterator(const uint8_t *text, int32_t textLength)
883 : s(text), pos(0), length(textLength) {}
884 protected:
885 virtual UChar32 nextRawCodePoint() {
886 if(pos == length || (s[pos] == 0 && length < 0)) { return U_SENTINEL; }
887 UChar32 c;
888 U8_NEXT_OR_FFFD(s, pos, length, c);
889 return c;
890 }
891
892 const uint8_t *s;
893 int32_t pos;
894 int32_t length;
895 };
896
897 class FCDUTF8NFDIterator : public NFDIterator {
898 public:
899 FCDUTF8NFDIterator(const CollationData *data, const uint8_t *text, int32_t textLength)
900 : u8ci(data, FALSE, text, 0, textLength) {}
901 protected:
902 virtual UChar32 nextRawCodePoint() {
903 UErrorCode errorCode = U_ZERO_ERROR;
904 return u8ci.nextCodePoint(errorCode);
905 }
906 private:
907 FCDUTF8CollationIterator u8ci;
908 };
909
910 class UIterNFDIterator : public NFDIterator {
911 public:
912 UIterNFDIterator(UCharIterator &it) : iter(it) {}
913 protected:
914 virtual UChar32 nextRawCodePoint() {
915 return uiter_next32(&iter);
916 }
917 private:
918 UCharIterator &iter;
919 };
920
921 class FCDUIterNFDIterator : public NFDIterator {
922 public:
923 FCDUIterNFDIterator(const CollationData *data, UCharIterator &it, int32_t startIndex)
924 : uici(data, FALSE, it, startIndex) {}
925 protected:
926 virtual UChar32 nextRawCodePoint() {
927 UErrorCode errorCode = U_ZERO_ERROR;
928 return uici.nextCodePoint(errorCode);
929 }
930 private:
931 FCDUIterCollationIterator uici;
932 };
933
934 UCollationResult compareNFDIter(const Normalizer2Impl &nfcImpl,
935 NFDIterator &left, NFDIterator &right) {
936 for(;;) {
937 // Fetch the next FCD code point from each string.
938 UChar32 leftCp = left.nextCodePoint();
939 UChar32 rightCp = right.nextCodePoint();
940 if(leftCp == rightCp) {
941 if(leftCp < 0) { break; }
942 continue;
943 }
944 // If they are different, then decompose each and compare again.
945 if(leftCp < 0) {
946 leftCp = -2; // end of string
947 } else if(leftCp == 0xfffe) {
948 leftCp = -1; // U+FFFE: merge separator
949 } else {
950 leftCp = left.nextDecomposedCodePoint(nfcImpl, leftCp);
951 }
952 if(rightCp < 0) {
953 rightCp = -2; // end of string
954 } else if(rightCp == 0xfffe) {
955 rightCp = -1; // U+FFFE: merge separator
956 } else {
957 rightCp = right.nextDecomposedCodePoint(nfcImpl, rightCp);
958 }
959 if(leftCp < rightCp) { return UCOL_LESS; }
960 if(leftCp > rightCp) { return UCOL_GREATER; }
961 }
962 return UCOL_EQUAL;
963 }
964
965 } // namespace
966
// Compares two UTF-16 strings and returns their collation order.
// A negative leftLength selects the NUL-terminated code path (rightLength is
// then not consulted); otherwise both lengths are used as explicit lengths.
// Steps, in order:
//  1. Skip the code-unit-identical prefix; return UCOL_EQUAL if nothing differs.
//  2. If a unit following the prefix is "unsafe backward", shrink the prefix.
//  3. Try CollationFastLatin::compareUTF16() when fastLatinOptions allows it.
//  4. On BAIL_OUT_RESULT, compare with full collation iterators.
//  5. If still equal and strength >= UCOL_IDENTICAL, compare NFD code points.
// NOTE(review): presumably callers pass both lengths negative or both
// non-negative -- confirm against the callers.
967 UCollationResult
968 RuleBasedCollator::doCompare(const UChar *left, int32_t leftLength,
969 const UChar *right, int32_t rightLength,
970 UErrorCode &errorCode) const {
971 // U_FAILURE(errorCode) checked by caller.
// Same pointer and same length: trivially equal.
972 if(left == right && leftLength == rightLength) {
973 return UCOL_EQUAL;
974 }
975
976 // Identical-prefix test.
977 const UChar *leftLimit;
978 const UChar *rightLimit;
979 int32_t equalPrefixLength = 0;
980 if(leftLength < 0) {
// NUL-terminated: both limits stay NULL, and the loop ends at the first
// differing unit or returns when the shared terminator is reached.
981 leftLimit = NULL;
982 rightLimit = NULL;
983 UChar c;
984 while((c = left[equalPrefixLength]) == right[equalPrefixLength]) {
985 if(c == 0) { return UCOL_EQUAL; }
986 ++equalPrefixLength;
987 }
988 } else {
// Explicit lengths: stop at the first differing unit or at the end of either string.
989 leftLimit = left + leftLength;
990 rightLimit = right + rightLength;
991 for(;;) {
992 if(equalPrefixLength == leftLength) {
993 if(equalPrefixLength == rightLength) { return UCOL_EQUAL; }
994 break;
995 } else if(equalPrefixLength == rightLength ||
996 left[equalPrefixLength] != right[equalPrefixLength]) {
997 break;
998 }
999 ++equalPrefixLength;
1000 }
1001 }
1002
1003 UBool numeric = settings->isNumeric();
1004 if(equalPrefixLength > 0) {
// Only look at the unit after the prefix if that string has one
// (equalPrefixLength != length); in NUL-terminated mode the length is negative,
// so the unit at that index (possibly the terminator) is always inspected.
1005 if((equalPrefixLength != leftLength &&
1006 data->isUnsafeBackward(left[equalPrefixLength], numeric)) ||
1007 (equalPrefixLength != rightLength &&
1008 data->isUnsafeBackward(right[equalPrefixLength], numeric))) {
1009 // Identical prefix: Back up to the start of a contraction or reordering sequence.
// Only `left` is read while backing up: the prefix is identical in both strings.
1010 while(--equalPrefixLength > 0 &&
1011 data->isUnsafeBackward(left[equalPrefixLength], numeric)) {}
1012 }
1013 // Notes:
1014 // - A longer string can compare equal to a prefix of it if only ignorables follow.
1015 // - With a backward level, a longer string can compare less-than a prefix of it.
1016
1017 // Pass the actual start of each string into the CollationIterators,
1018 // plus the equalPrefixLength position,
1019 // so that prefix matches back into the equal prefix work.
1020 }
1021
// Fast path: attempted only when fastLatinOptions is non-negative and the first
// unit after the prefix in each string (if any) is <= LATIN_MAX.
1022 int32_t result;
1023 int32_t fastLatinOptions = settings->fastLatinOptions;
1024 if(fastLatinOptions >= 0 &&
1025 (equalPrefixLength == leftLength ||
1026 left[equalPrefixLength] <= CollationFastLatin::LATIN_MAX) &&
1027 (equalPrefixLength == rightLength ||
1028 right[equalPrefixLength] <= CollationFastLatin::LATIN_MAX)) {
1029 if(leftLength >= 0) {
1030 result = CollationFastLatin::compareUTF16(data->fastLatinTable,
1031 settings->fastLatinPrimaries,
1032 fastLatinOptions,
1033 left + equalPrefixLength,
1034 leftLength - equalPrefixLength,
1035 right + equalPrefixLength,
1036 rightLength - equalPrefixLength);
1037 } else {
1038 result = CollationFastLatin::compareUTF16(data->fastLatinTable,
1039 settings->fastLatinPrimaries,
1040 fastLatinOptions,
1041 left + equalPrefixLength, -1,
1042 right + equalPrefixLength, -1);
1043 }
1044 } else {
1045 result = CollationFastLatin::BAIL_OUT_RESULT;
1046 }
1047
// Full comparison when the fast path was skipped or bailed out.
// The iterators get the real string start plus the position after the equal prefix.
1048 if(result == CollationFastLatin::BAIL_OUT_RESULT) {
1049 if(settings->dontCheckFCD()) {
1050 UTF16CollationIterator leftIter(data, numeric,
1051 left, left + equalPrefixLength, leftLimit);
1052 UTF16CollationIterator rightIter(data, numeric,
1053 right, right + equalPrefixLength, rightLimit);
1054 result = CollationCompare::compareUpToQuaternary(leftIter, rightIter, *settings, errorCode);
1055 } else {
1056 FCDUTF16CollationIterator leftIter(data, numeric,
1057 left, left + equalPrefixLength, leftLimit);
1058 FCDUTF16CollationIterator rightIter(data, numeric,
1059 right, right + equalPrefixLength, rightLimit);
1060 result = CollationCompare::compareUpToQuaternary(leftIter, rightIter, *settings, errorCode);
1061 }
1062 }
// Finished unless the strings are equal so far, the strength includes the
// identical level, and no error occurred.
1063 if(result != UCOL_EQUAL || settings->getStrength() < UCOL_IDENTICAL || U_FAILURE(errorCode)) {
1064 return (UCollationResult)result;
1065 }
1066
1067 // Note: If NUL-terminated, we could get the actual limits from the iterators now.
1068 // That would complicate the iterators a bit, NUL-terminated strings are only a C convenience,
1069 // and the benefit seems unlikely to be measurable.
1070
1071 // Compare identical level.
// Only the text after the equal prefix is compared here.
1072 const Normalizer2Impl &nfcImpl = data->nfcImpl;
1073 left += equalPrefixLength;
1074 right += equalPrefixLength;
1075 if(settings->dontCheckFCD()) {
1076 UTF16NFDIterator leftIter(left, leftLimit);
1077 UTF16NFDIterator rightIter(right, rightLimit);
1078 return compareNFDIter(nfcImpl, leftIter, rightIter);
1079 } else {
1080 FCDUTF16NFDIterator leftIter(nfcImpl, left, leftLimit);
1081 FCDUTF16NFDIterator rightIter(nfcImpl, right, rightLimit);
1082 return compareNFDIter(nfcImpl, leftIter, rightIter);
1083 }
1084 }
1085
// Compares two UTF-8 strings and returns their collation order.
// A negative leftLength selects the NUL-terminated code path; otherwise both
// lengths are used as explicit byte lengths.
// Same overall sequence as the UTF-16 overload, with two UTF-8 specifics:
// the equal prefix is first trimmed back to a code point boundary, and
// code points are decoded with U8_NEXT_OR_FFFD / U8_PREV_OR_FFFD before
// asking data->isUnsafeBackward().
1086 UCollationResult
1087 RuleBasedCollator::doCompare(const uint8_t *left, int32_t leftLength,
1088 const uint8_t *right, int32_t rightLength,
1089 UErrorCode &errorCode) const {
1090 // U_FAILURE(errorCode) checked by caller.
// Same pointer and same length: trivially equal.
1091 if(left == right && leftLength == rightLength) {
1092 return UCOL_EQUAL;
1093 }
1094
1095 // Identical-prefix test.
1096 int32_t equalPrefixLength = 0;
1097 if(leftLength < 0) {
// NUL-terminated: stop at the first differing byte, or return at the shared terminator.
1098 uint8_t c;
1099 while((c = left[equalPrefixLength]) == right[equalPrefixLength]) {
1100 if(c == 0) { return UCOL_EQUAL; }
1101 ++equalPrefixLength;
1102 }
1103 } else {
// Explicit lengths: stop at the first differing byte or at the end of either string.
1104 for(;;) {
1105 if(equalPrefixLength == leftLength) {
1106 if(equalPrefixLength == rightLength) { return UCOL_EQUAL; }
1107 break;
1108 } else if(equalPrefixLength == rightLength ||
1109 left[equalPrefixLength] != right[equalPrefixLength]) {
1110 break;
1111 }
1112 ++equalPrefixLength;
1113 }
1114 }
1115 // Back up to the start of a partially-equal code point.
// Triggered when the byte after the prefix in either string is a trail byte;
// only `left` is scanned backward because the prefix bytes are identical.
1116 if(equalPrefixLength > 0 &&
1117 ((equalPrefixLength != leftLength && U8_IS_TRAIL(left[equalPrefixLength])) ||
1118 (equalPrefixLength != rightLength && U8_IS_TRAIL(right[equalPrefixLength])))) {
1119 while(--equalPrefixLength > 0 && U8_IS_TRAIL(left[equalPrefixLength])) {}
1120 }
1121
1122 UBool numeric = settings->isNumeric();
1123 if(equalPrefixLength > 0) {
// Decode the code point following the prefix in each string (when there is one)
// and check whether it is unsafe to start the comparison there.
1124 UBool unsafe = FALSE;
1125 if(equalPrefixLength != leftLength) {
1126 int32_t i = equalPrefixLength;
1127 UChar32 c;
1128 U8_NEXT_OR_FFFD(left, i, leftLength, c);
1129 unsafe = data->isUnsafeBackward(c, numeric);
1130 }
1131 if(!unsafe && equalPrefixLength != rightLength) {
1132 int32_t i = equalPrefixLength;
1133 UChar32 c;
1134 U8_NEXT_OR_FFFD(right, i, rightLength, c);
1135 unsafe = data->isUnsafeBackward(c, numeric);
1136 }
1137 if(unsafe) {
1138 // Identical prefix: Back up to the start of a contraction or reordering sequence.
// U8_PREV_OR_FFFD moves equalPrefixLength back by one code point per iteration.
1139 UChar32 c;
1140 do {
1141 U8_PREV_OR_FFFD(left, 0, equalPrefixLength, c);
1142 } while(equalPrefixLength > 0 && data->isUnsafeBackward(c, numeric));
1143 }
1144 // See the notes in the UTF-16 version.
1145
1146 // Pass the actual start of each string into the CollationIterators,
1147 // plus the equalPrefixLength position,
1148 // so that prefix matches back into the equal prefix work.
1149 }
1150
// Fast path: attempted only when fastLatinOptions is non-negative and the first
// byte after the prefix in each string (if any) is <= LATIN_MAX_UTF8_LEAD.
1151 int32_t result;
1152 int32_t fastLatinOptions = settings->fastLatinOptions;
1153 if(fastLatinOptions >= 0 &&
1154 (equalPrefixLength == leftLength ||
1155 left[equalPrefixLength] <= CollationFastLatin::LATIN_MAX_UTF8_LEAD) &&
1156 (equalPrefixLength == rightLength ||
1157 right[equalPrefixLength] <= CollationFastLatin::LATIN_MAX_UTF8_LEAD)) {
1158 if(leftLength >= 0) {
1159 result = CollationFastLatin::compareUTF8(data->fastLatinTable,
1160 settings->fastLatinPrimaries,
1161 fastLatinOptions,
1162 left + equalPrefixLength,
1163 leftLength - equalPrefixLength,
1164 right + equalPrefixLength,
1165 rightLength - equalPrefixLength);
1166 } else {
1167 result = CollationFastLatin::compareUTF8(data->fastLatinTable,
1168 settings->fastLatinPrimaries,
1169 fastLatinOptions,
1170 left + equalPrefixLength, -1,
1171 right + equalPrefixLength, -1);
1172 }
1173 } else {
1174 result = CollationFastLatin::BAIL_OUT_RESULT;
1175 }
1176
// Full comparison when the fast path was skipped or bailed out.
// The iterators get the whole string plus the index after the equal prefix.
1177 if(result == CollationFastLatin::BAIL_OUT_RESULT) {
1178 if(settings->dontCheckFCD()) {
1179 UTF8CollationIterator leftIter(data, numeric, left, equalPrefixLength, leftLength);
1180 UTF8CollationIterator rightIter(data, numeric, right, equalPrefixLength, rightLength);
1181 result = CollationCompare::compareUpToQuaternary(leftIter, rightIter, *settings, errorCode);
1182 } else {
1183 FCDUTF8CollationIterator leftIter(data, numeric, left, equalPrefixLength, leftLength);
1184 FCDUTF8CollationIterator rightIter(data, numeric, right, equalPrefixLength, rightLength);
1185 result = CollationCompare::compareUpToQuaternary(leftIter, rightIter, *settings, errorCode);
1186 }
1187 }
// Finished unless the strings are equal so far, the strength includes the
// identical level, and no error occurred.
1188 if(result != UCOL_EQUAL || settings->getStrength() < UCOL_IDENTICAL || U_FAILURE(errorCode)) {
1189 return (UCollationResult)result;
1190 }
1191
1192 // Note: If NUL-terminated, we could get the actual limits from the iterators now.
1193 // That would complicate the iterators a bit, NUL-terminated strings are only a C convenience,
1194 // and the benefit seems unlikely to be measurable.
1195
1196 // Compare identical level.
1197 const Normalizer2Impl &nfcImpl = data->nfcImpl;
1198 left += equalPrefixLength;
1199 right += equalPrefixLength;
// Shrink explicit lengths by the skipped prefix. Negative (NUL-terminated)
// lengths are left alone. With leftLength == 0 the prefix loop above breaks
// immediately, so equalPrefixLength is 0 and skipping the subtraction is harmless.
1200 if(leftLength > 0) {
1201 leftLength -= equalPrefixLength;
1202 rightLength -= equalPrefixLength;
1203 }
1204 if(settings->dontCheckFCD()) {
1205 UTF8NFDIterator leftIter(left, leftLength);
1206 UTF8NFDIterator rightIter(right, rightLength);
1207 return compareNFDIter(nfcImpl, leftIter, rightIter);
1208 } else {
1209 FCDUTF8NFDIterator leftIter(data, left, leftLength);
1210 FCDUTF8NFDIterator rightIter(data, right, rightLength);
1211 return compareNFDIter(nfcImpl, leftIter, rightIter);
1212 }
1213 }
1214
// Compares the text delivered by two UCharIterators and returns the collation order.
// Returns UCOL_EQUAL immediately if errorCode already indicates a failure or if
// both arguments are the same iterator object.
// Both iterators are read from their current positions and are moved by this function.
1215 UCollationResult
1216 RuleBasedCollator::compare(UCharIterator &left, UCharIterator &right,
1217 UErrorCode &errorCode) const {
1218 if(U_FAILURE(errorCode) || &left == &right) { return UCOL_EQUAL; }
1219 UBool numeric = settings->isNumeric();
1220
1221 // Identical-prefix test.
1222 int32_t equalPrefixLength = 0;
1223 {
1224 UChar32 leftUnit;
1225 UChar32 rightUnit;
// Read one unit from each iterator per step; a negative value from both
// at the same step means both texts ended together.
1226 while((leftUnit = left.next(&left)) == (rightUnit = right.next(&right))) {
1227 if(leftUnit < 0) { return UCOL_EQUAL; }
1228 ++equalPrefixLength;
1229 }
1230
1231 // Back out the code units that differed, for the real collation comparison.
// A negative unit was not an actual code unit, so there is nothing to back out for it.
1232 if(leftUnit >= 0) { left.previous(&left); }
1233 if(rightUnit >= 0) { right.previous(&right); }
1234
1235 if(equalPrefixLength > 0) {
1236 if((leftUnit >= 0 && data->isUnsafeBackward(leftUnit, numeric)) ||
1237 (rightUnit >= 0 && data->isUnsafeBackward(rightUnit, numeric))) {
1238 // Identical prefix: Back up to the start of a contraction or reordering sequence.
// Both iterators step back together; only the left unit is tested
// because the prefix units are identical.
1239 do {
1240 --equalPrefixLength;
1241 leftUnit = left.previous(&left);
1242 right.previous(&right);
1243 } while(equalPrefixLength > 0 && data->isUnsafeBackward(leftUnit, numeric));
1244 }
1245 // See the notes in the UTF-16 version.
1246 }
1247 }
1248
// Full comparison. The FCD-checking iterators are additionally given
// equalPrefixLength as their start index.
1249 UCollationResult result;
1250 if(settings->dontCheckFCD()) {
1251 UIterCollationIterator leftIter(data, numeric, left);
1252 UIterCollationIterator rightIter(data, numeric, right);
1253 result = CollationCompare::compareUpToQuaternary(leftIter, rightIter, *settings, errorCode);
1254 } else {
1255 FCDUIterCollationIterator leftIter(data, numeric, left, equalPrefixLength);
1256 FCDUIterCollationIterator rightIter(data, numeric, right, equalPrefixLength);
1257 result = CollationCompare::compareUpToQuaternary(leftIter, rightIter, *settings, errorCode);
1258 }
// Finished unless the texts are equal so far, the strength includes the
// identical level, and no error occurred.
1259 if(result != UCOL_EQUAL || settings->getStrength() < UCOL_IDENTICAL || U_FAILURE(errorCode)) {
1260 return result;
1261 }
1262
1263 // Compare identical level.
// The comparison above consumed the iterators; reposition both at
// equalPrefixLength (relative to UITER_ZERO) before reading them again.
1264 left.move(&left, equalPrefixLength, UITER_ZERO);
1265 right.move(&right, equalPrefixLength, UITER_ZERO);
1266 const Normalizer2Impl &nfcImpl = data->nfcImpl;
1267 if(settings->dontCheckFCD()) {
1268 UIterNFDIterator leftIter(left);
1269 UIterNFDIterator rightIter(right);
1270 return compareNFDIter(nfcImpl, leftIter, rightIter);
1271 } else {
1272 FCDUIterNFDIterator leftIter(data, left, equalPrefixLength);
1273 FCDUIterNFDIterator rightIter(data, right, equalPrefixLength);
1274 return compareNFDIter(nfcImpl, leftIter, rightIter);
1275 }
1276 }
1277
1278 CollationKey &
1279 RuleBasedCollator::getCollationKey(const UnicodeString &s, CollationKey &key,
1280 UErrorCode &errorCode) const {
1281 return getCollationKey(s.getBuffer(), s.length(), key, errorCode);
1282 }
1283
1284 CollationKey &
1285 RuleBasedCollator::getCollationKey(const UChar *s, int32_t length, CollationKey& key,
1286 UErrorCode &errorCode) const {
1287 if(U_FAILURE(errorCode)) {
1288 return key.setToBogus();
1289 }
1290 if(s == NULL && length != 0) {
1291 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
1292 return key.setToBogus();
1293 }
1294 key.reset(); // resets the "bogus" state
1295 CollationKeyByteSink sink(key);
1296 writeSortKey(s, length, sink, errorCode);
1297 if(U_FAILURE(errorCode)) {
1298 key.setToBogus();
1299 } else if(key.isBogus()) {
1300 errorCode = U_MEMORY_ALLOCATION_ERROR;
1301 } else {
1302 key.setLength(sink.NumberOfBytesAppended());
1303 }
1304 return key;
1305 }
1306
1307 int32_t
1308 RuleBasedCollator::getSortKey(const UnicodeString &s,
1309 uint8_t *dest, int32_t capacity) const {
1310 return getSortKey(s.getBuffer(), s.length(), dest, capacity);
1311 }
1312
1313 int32_t
1314 RuleBasedCollator::getSortKey(const UChar *s, int32_t length,
1315 uint8_t *dest, int32_t capacity) const {
1316 if((s == NULL && length != 0) || capacity < 0 || (dest == NULL && capacity > 0)) {
1317 return 0;
1318 }
1319 uint8_t noDest[1] = { 0 };
1320 if(dest == NULL) {
1321 // Distinguish pure preflighting from an allocation error.
1322 dest = noDest;
1323 capacity = 0;
1324 }
1325 FixedSortKeyByteSink sink(reinterpret_cast<char *>(dest), capacity);
1326 UErrorCode errorCode = U_ZERO_ERROR;
1327 writeSortKey(s, length, sink, errorCode);
1328 return U_SUCCESS(errorCode) ? sink.NumberOfBytesAppended() : 0;
1329 }
1330
1331 void
1332 RuleBasedCollator::writeSortKey(const UChar *s, int32_t length,
1333 SortKeyByteSink &sink, UErrorCode &errorCode) const {
1334 if(U_FAILURE(errorCode)) { return; }
1335 const UChar *limit = (length >= 0) ? s + length : NULL;
1336 UBool numeric = settings->isNumeric();
1337 CollationKeys::LevelCallback callback;
1338 if(settings->dontCheckFCD()) {
1339 UTF16CollationIterator iter(data, numeric, s, s, limit);
1340 CollationKeys::writeSortKeyUpToQuaternary(iter, data->compressibleBytes, *settings,
1341 sink, Collation::PRIMARY_LEVEL,
1342 callback, TRUE, errorCode);
1343 } else {
1344 FCDUTF16CollationIterator iter(data, numeric, s, s, limit);
1345 CollationKeys::writeSortKeyUpToQuaternary(iter, data->compressibleBytes, *settings,
1346 sink, Collation::PRIMARY_LEVEL,
1347 callback, TRUE, errorCode);
1348 }
1349 if(settings->getStrength() == UCOL_IDENTICAL) {
1350 writeIdenticalLevel(s, limit, sink, errorCode);
1351 }
1352 static const char terminator = 0; // TERMINATOR_BYTE
1353 sink.Append(&terminator, 1);
1354 }
1355
1356 void
1357 RuleBasedCollator::writeIdenticalLevel(const UChar *s, const UChar *limit,
1358 SortKeyByteSink &sink, UErrorCode &errorCode) const {
1359 // NFD quick check
1360 const UChar *nfdQCYesLimit = data->nfcImpl.decompose(s, limit, NULL, errorCode);
1361 if(U_FAILURE(errorCode)) { return; }
1362 sink.Append(Collation::LEVEL_SEPARATOR_BYTE);
1363 UChar32 prev = 0;
1364 if(nfdQCYesLimit != s) {
1365 prev = u_writeIdenticalLevelRun(prev, s, (int32_t)(nfdQCYesLimit - s), sink);
1366 }
1367 // Is there non-NFD text?
1368 int32_t destLengthEstimate;
1369 if(limit != NULL) {
1370 if(nfdQCYesLimit == limit) { return; }
1371 destLengthEstimate = (int32_t)(limit - nfdQCYesLimit);
1372 } else {
1373 // s is NUL-terminated
1374 if(*nfdQCYesLimit == 0) { return; }
1375 destLengthEstimate = -1;
1376 }
1377 UnicodeString nfd;
1378 data->nfcImpl.decompose(nfdQCYesLimit, limit, nfd, destLengthEstimate, errorCode);
1379 u_writeIdenticalLevelRun(prev, nfd.getBuffer(), nfd.length(), sink);
1380 }
1381
1382 namespace {
1383
1384 /**
1385 * internalNextSortKeyPart() calls CollationKeys::writeSortKeyUpToQuaternary()
1386 * with an instance of this callback class.
1387 * When another level is about to be written, the callback
1388 * records the level and the number of bytes that will be written until
1389 * the sink (which is actually a FixedSortKeyByteSink) fills up.
1390 *
1391 * When internalNextSortKeyPart() is called again, it restarts with the last level
1392 * and ignores as many bytes as were written previously for that level.
1393 */
1394 class PartLevelCallback : public CollationKeys::LevelCallback {
1395 public:
1396 PartLevelCallback(const SortKeyByteSink &s)
1397 : sink(s), level(Collation::PRIMARY_LEVEL) {
1398 levelCapacity = sink.GetRemainingCapacity();
1399 }
1400 virtual ~PartLevelCallback() {}
1401 virtual UBool needToWrite(Collation::Level l) {
1402 if(!sink.Overflowed()) {
1403 // Remember a level that will be at least partially written.
1404 level = l;
1405 levelCapacity = sink.GetRemainingCapacity();
1406 return TRUE;
1407 } else {
1408 return FALSE;
1409 }
1410 }
1411 Collation::Level getLevel() const { return level; }
1412 int32_t getLevelCapacity() const { return levelCapacity; }
1413
1414 private:
1415 const SortKeyByteSink &sink;
1416 Collation::Level level;
1417 int32_t levelCapacity;
1418 };
1419
1420 } // namespace
1421
1422 int32_t
1423 RuleBasedCollator::internalNextSortKeyPart(UCharIterator *iter, uint32_t state[2],
1424 uint8_t *dest, int32_t count, UErrorCode &errorCode) const {
1425 if(U_FAILURE(errorCode)) { return 0; }
1426 if(iter == NULL || state == NULL || count < 0 || (count > 0 && dest == NULL)) {
1427 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
1428 return 0;
1429 }
1430 if(count == 0) { return 0; }
1431
1432 FixedSortKeyByteSink sink(reinterpret_cast<char *>(dest), count);
1433 sink.IgnoreBytes((int32_t)state[1]);
1434 iter->move(iter, 0, UITER_START);
1435
1436 Collation::Level level = (Collation::Level)state[0];
1437 if(level <= Collation::QUATERNARY_LEVEL) {
1438 UBool numeric = settings->isNumeric();
1439 PartLevelCallback callback(sink);
1440 if(settings->dontCheckFCD()) {
1441 UIterCollationIterator ci(data, numeric, *iter);
1442 CollationKeys::writeSortKeyUpToQuaternary(ci, data->compressibleBytes, *settings,
1443 sink, level, callback, FALSE, errorCode);
1444 } else {
1445 FCDUIterCollationIterator ci(data, numeric, *iter, 0);
1446 CollationKeys::writeSortKeyUpToQuaternary(ci, data->compressibleBytes, *settings,
1447 sink, level, callback, FALSE, errorCode);
1448 }
1449 if(U_FAILURE(errorCode)) { return 0; }
1450 if(sink.NumberOfBytesAppended() > count) {
1451 state[0] = (uint32_t)callback.getLevel();
1452 state[1] = (uint32_t)callback.getLevelCapacity();
1453 return count;
1454 }
1455 // All of the normal levels are done.
1456 if(settings->getStrength() == UCOL_IDENTICAL) {
1457 level = Collation::IDENTICAL_LEVEL;
1458 iter->move(iter, 0, UITER_START);
1459 }
1460 // else fall through to setting ZERO_LEVEL
1461 }
1462
1463 if(level == Collation::IDENTICAL_LEVEL) {
1464 int32_t levelCapacity = sink.GetRemainingCapacity();
1465 UnicodeString s;
1466 for(;;) {
1467 UChar32 c = iter->next(iter);
1468 if(c < 0) { break; }
1469 s.append((UChar)c);
1470 }
1471 const UChar *sArray = s.getBuffer();
1472 writeIdenticalLevel(sArray, sArray + s.length(), sink, errorCode);
1473 if(U_FAILURE(errorCode)) { return 0; }
1474 if(sink.NumberOfBytesAppended() > count) {
1475 state[0] = (uint32_t)level;
1476 state[1] = (uint32_t)levelCapacity;
1477 return count;
1478 }
1479 }
1480
1481 // ZERO_LEVEL: Fill the remainder of dest with 00 bytes.
1482 state[0] = (uint32_t)Collation::ZERO_LEVEL;
1483 state[1] = 0;
1484 int32_t length = sink.NumberOfBytesAppended();
1485 int32_t i = length;
1486 while(i < count) { dest[i++] = 0; }
1487 return length;
1488 }
1489
1490 void
1491 RuleBasedCollator::internalGetCEs(const UnicodeString &str, UVector64 &ces,
1492 UErrorCode &errorCode) const {
1493 if(U_FAILURE(errorCode)) { return; }
1494 const UChar *s = str.getBuffer();
1495 const UChar *limit = s + str.length();
1496 UBool numeric = settings->isNumeric();
1497 if(settings->dontCheckFCD()) {
1498 UTF16CollationIterator iter(data, numeric, s, s, limit);
1499 int64_t ce;
1500 while((ce = iter.nextCE(errorCode)) != Collation::NO_CE) {
1501 ces.addElement(ce, errorCode);
1502 }
1503 } else {
1504 FCDUTF16CollationIterator iter(data, numeric, s, s, limit);
1505 int64_t ce;
1506 while((ce = iter.nextCE(errorCode)) != Collation::NO_CE) {
1507 ces.addElement(ce, errorCode);
1508 }
1509 }
1510 }
1511
1512 namespace {
1513
1514 void appendSubtag(CharString &s, char letter, const char *subtag, int32_t length,
1515 UErrorCode &errorCode) {
1516 if(U_FAILURE(errorCode) || length == 0) { return; }
1517 if(!s.isEmpty()) {
1518 s.append('_', errorCode);
1519 }
1520 s.append(letter, errorCode);
1521 for(int32_t i = 0; i < length; ++i) {
1522 s.append(uprv_toupper(subtag[i]), errorCode);
1523 }
1524 }
1525
1526 void appendAttribute(CharString &s, char letter, UColAttributeValue value,
1527 UErrorCode &errorCode) {
1528 if(U_FAILURE(errorCode)) { return; }
1529 if(!s.isEmpty()) {
1530 s.append('_', errorCode);
1531 }
1532 static const char *valueChars = "1234...........IXO..SN..LU......";
1533 s.append(letter, errorCode);
1534 s.append(valueChars[value], errorCode);
1535 }
1536
1537 } // namespace
1538
1539 int32_t
1540 RuleBasedCollator::internalGetShortDefinitionString(const char *locale,
1541 char *buffer, int32_t capacity,
1542 UErrorCode &errorCode) const {
1543 if(U_FAILURE(errorCode)) { return 0; }
1544 if(buffer == NULL ? capacity != 0 : capacity < 0) {
1545 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
1546 return 0;
1547 }
1548 if(locale == NULL) {
1549 locale = internalGetLocaleID(ULOC_VALID_LOCALE, errorCode);
1550 }
1551
1552 char resultLocale[ULOC_FULLNAME_CAPACITY + 1];
1553 int32_t length = ucol_getFunctionalEquivalent(resultLocale, ULOC_FULLNAME_CAPACITY,
1554 "collation", locale,
1555 NULL, &errorCode);
1556 if(U_FAILURE(errorCode)) { return 0; }
1557 if(length == 0) {
1558 uprv_strcpy(resultLocale, "root");
1559 } else {
1560 resultLocale[length] = 0;
1561 }
1562
1563 // Append items in alphabetic order of their short definition letters.
1564 CharString result;
1565 char subtag[ULOC_KEYWORD_AND_VALUES_CAPACITY];
1566
1567 if(attributeHasBeenSetExplicitly(UCOL_ALTERNATE_HANDLING)) {
1568 appendAttribute(result, 'A', getAttribute(UCOL_ALTERNATE_HANDLING, errorCode), errorCode);
1569 }
1570 // ATTR_VARIABLE_TOP not supported because 'B' was broken.
1571 // See ICU tickets #10372 and #10386.
1572 if(attributeHasBeenSetExplicitly(UCOL_CASE_FIRST)) {
1573 appendAttribute(result, 'C', getAttribute(UCOL_CASE_FIRST, errorCode), errorCode);
1574 }
1575 if(attributeHasBeenSetExplicitly(UCOL_NUMERIC_COLLATION)) {
1576 appendAttribute(result, 'D', getAttribute(UCOL_NUMERIC_COLLATION, errorCode), errorCode);
1577 }
1578 if(attributeHasBeenSetExplicitly(UCOL_CASE_LEVEL)) {
1579 appendAttribute(result, 'E', getAttribute(UCOL_CASE_LEVEL, errorCode), errorCode);
1580 }
1581 if(attributeHasBeenSetExplicitly(UCOL_FRENCH_COLLATION)) {
1582 appendAttribute(result, 'F', getAttribute(UCOL_FRENCH_COLLATION, errorCode), errorCode);
1583 }
1584 // Note: UCOL_HIRAGANA_QUATERNARY_MODE is deprecated and never changes away from default.
1585 length = uloc_getKeywordValue(resultLocale, "collation", subtag, UPRV_LENGTHOF(subtag), &errorCode);
1586 appendSubtag(result, 'K', subtag, length, errorCode);
1587 length = uloc_getLanguage(resultLocale, subtag, UPRV_LENGTHOF(subtag), &errorCode);
1588 appendSubtag(result, 'L', subtag, length, errorCode);
1589 if(attributeHasBeenSetExplicitly(UCOL_NORMALIZATION_MODE)) {
1590 appendAttribute(result, 'N', getAttribute(UCOL_NORMALIZATION_MODE, errorCode), errorCode);
1591 }
1592 length = uloc_getCountry(resultLocale, subtag, UPRV_LENGTHOF(subtag), &errorCode);
1593 appendSubtag(result, 'R', subtag, length, errorCode);
1594 if(attributeHasBeenSetExplicitly(UCOL_STRENGTH)) {
1595 appendAttribute(result, 'S', getAttribute(UCOL_STRENGTH, errorCode), errorCode);
1596 }
1597 length = uloc_getVariant(resultLocale, subtag, UPRV_LENGTHOF(subtag), &errorCode);
1598 appendSubtag(result, 'V', subtag, length, errorCode);
1599 length = uloc_getScript(resultLocale, subtag, UPRV_LENGTHOF(subtag), &errorCode);
1600 appendSubtag(result, 'Z', subtag, length, errorCode);
1601
1602 if(U_FAILURE(errorCode)) { return 0; }
1603 if(result.length() <= capacity) {
1604 uprv_memcpy(buffer, result.data(), result.length());
1605 }
1606 return u_terminateChars(buffer, capacity, result.length(), &errorCode);
1607 }
1608
1609 UBool
1610 RuleBasedCollator::isUnsafe(UChar32 c) const {
1611 return data->isUnsafeBackward(c, settings->isNumeric());
1612 }
1613
1614 void U_CALLCONV
1615 RuleBasedCollator::computeMaxExpansions(const CollationTailoring *t, UErrorCode &errorCode) {
1616 t->maxExpansions = CollationElementIterator::computeMaxExpansions(t->data, errorCode);
1617 }
1618
1619 UBool
1620 RuleBasedCollator::initMaxExpansions(UErrorCode &errorCode) const {
1621 umtx_initOnce(tailoring->maxExpansionsInitOnce, computeMaxExpansions, tailoring, errorCode);
1622 return U_SUCCESS(errorCode);
1623 }
1624
1625 CollationElementIterator *
1626 RuleBasedCollator::createCollationElementIterator(const UnicodeString& source) const {
1627 UErrorCode errorCode = U_ZERO_ERROR;
1628 if(!initMaxExpansions(errorCode)) { return NULL; }
1629 CollationElementIterator *cei = new CollationElementIterator(source, this, errorCode);
1630 if(U_FAILURE(errorCode)) {
1631 delete cei;
1632 return NULL;
1633 }
1634 return cei;
1635 }
1636
1637 CollationElementIterator *
1638 RuleBasedCollator::createCollationElementIterator(const CharacterIterator& source) const {
1639 UErrorCode errorCode = U_ZERO_ERROR;
1640 if(!initMaxExpansions(errorCode)) { return NULL; }
1641 CollationElementIterator *cei = new CollationElementIterator(source, this, errorCode);
1642 if(U_FAILURE(errorCode)) {
1643 delete cei;
1644 return NULL;
1645 }
1646 return cei;
1647 }
1648
1649 int32_t
1650 RuleBasedCollator::getMaxExpansion(int32_t order) const {
1651 UErrorCode errorCode = U_ZERO_ERROR;
1652 (void)initMaxExpansions(errorCode);
1653 return CollationElementIterator::getMaxExpansion(tailoring->maxExpansions, order);
1654 }
1655
1656 U_NAMESPACE_END
1657
1658 #endif // !UCONFIG_NO_COLLATION