]> git.saurik.com Git - apple/icu.git/blob - icuSources/i18n/rulebasedcollator.cpp
ICU-66108.tar.gz
[apple/icu.git] / icuSources / i18n / rulebasedcollator.cpp
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 * Copyright (C) 1996-2015, International Business Machines
6 * Corporation and others. All Rights Reserved.
7 *******************************************************************************
8 * rulebasedcollator.cpp
9 *
10 * (replaced the former tblcoll.cpp)
11 *
12 * created on: 2012feb14 with new and old collation code
13 * created by: Markus W. Scherer
14 */
15
16 #include "unicode/utypes.h"
17
18 #if !UCONFIG_NO_COLLATION
19
20 #include "unicode/coll.h"
21 #include "unicode/coleitr.h"
22 #include "unicode/localpointer.h"
23 #include "unicode/locid.h"
24 #include "unicode/sortkey.h"
25 #include "unicode/tblcoll.h"
26 #include "unicode/ucol.h"
27 #include "unicode/uiter.h"
28 #include "unicode/uloc.h"
29 #include "unicode/uniset.h"
30 #include "unicode/unistr.h"
31 #include "unicode/usetiter.h"
32 #include "unicode/utf8.h"
33 #include "unicode/uversion.h"
34 #include "bocsu.h"
35 #include "charstr.h"
36 #include "cmemory.h"
37 #include "collation.h"
38 #include "collationcompare.h"
39 #include "collationdata.h"
40 #include "collationdatareader.h"
41 #include "collationfastlatin.h"
42 #include "collationiterator.h"
43 #include "collationkeys.h"
44 #include "collationroot.h"
45 #include "collationsets.h"
46 #include "collationsettings.h"
47 #include "collationtailoring.h"
48 #include "cstring.h"
49 #include "uassert.h"
50 #include "ucol_imp.h"
51 #include "uhash.h"
52 #include "uitercollationiterator.h"
53 #include "ustr_imp.h"
54 #include "utf16collationiterator.h"
55 #include "utf8collationiterator.h"
56 #include "uvectr64.h"
57
58 U_NAMESPACE_BEGIN
59
60 namespace {
61
62 class FixedSortKeyByteSink : public SortKeyByteSink {
63 public:
64 FixedSortKeyByteSink(char *dest, int32_t destCapacity)
65 : SortKeyByteSink(dest, destCapacity) {}
66 virtual ~FixedSortKeyByteSink();
67
68 private:
69 virtual void AppendBeyondCapacity(const char *bytes, int32_t n, int32_t length);
70 virtual UBool Resize(int32_t appendCapacity, int32_t length);
71 };
72
73 FixedSortKeyByteSink::~FixedSortKeyByteSink() {}
74
75 void
76 FixedSortKeyByteSink::AppendBeyondCapacity(const char *bytes, int32_t /*n*/, int32_t length) {
77 // buffer_ != NULL && bytes != NULL && n > 0 && appended_ > capacity_
78 // Fill the buffer completely.
79 int32_t available = capacity_ - length;
80 if (available > 0) {
81 uprv_memcpy(buffer_ + length, bytes, available);
82 }
83 }
84
85 UBool
86 FixedSortKeyByteSink::Resize(int32_t /*appendCapacity*/, int32_t /*length*/) {
87 return FALSE;
88 }
89
90 } // namespace
91
92 // Not in an anonymous namespace, so that it can be a friend of CollationKey.
93 class CollationKeyByteSink : public SortKeyByteSink {
94 public:
95 CollationKeyByteSink(CollationKey &key)
96 : SortKeyByteSink(reinterpret_cast<char *>(key.getBytes()), key.getCapacity()),
97 key_(key) {}
98 virtual ~CollationKeyByteSink();
99
100 private:
101 virtual void AppendBeyondCapacity(const char *bytes, int32_t n, int32_t length);
102 virtual UBool Resize(int32_t appendCapacity, int32_t length);
103
104 CollationKey &key_;
105 };
106
107 CollationKeyByteSink::~CollationKeyByteSink() {}
108
109 void
110 CollationKeyByteSink::AppendBeyondCapacity(const char *bytes, int32_t n, int32_t length) {
111 // buffer_ != NULL && bytes != NULL && n > 0 && appended_ > capacity_
112 if (Resize(n, length)) {
113 uprv_memcpy(buffer_ + length, bytes, n);
114 }
115 }
116
117 UBool
118 CollationKeyByteSink::Resize(int32_t appendCapacity, int32_t length) {
119 if (buffer_ == NULL) {
120 return FALSE; // allocation failed before already
121 }
122 int32_t newCapacity = 2 * capacity_;
123 int32_t altCapacity = length + 2 * appendCapacity;
124 if (newCapacity < altCapacity) {
125 newCapacity = altCapacity;
126 }
127 if (newCapacity < 200) {
128 newCapacity = 200;
129 }
130 uint8_t *newBuffer = key_.reallocate(newCapacity, length);
131 if (newBuffer == NULL) {
132 SetNotOk();
133 return FALSE;
134 }
135 buffer_ = reinterpret_cast<char *>(newBuffer);
136 capacity_ = newCapacity;
137 return TRUE;
138 }
139
140 RuleBasedCollator::RuleBasedCollator(const RuleBasedCollator &other)
141 : Collator(other),
142 data(other.data),
143 settings(other.settings),
144 tailoring(other.tailoring),
145 cacheEntry(other.cacheEntry),
146 validLocale(other.validLocale),
147 explicitlySetAttributes(other.explicitlySetAttributes),
148 actualLocaleIsSameAsValid(other.actualLocaleIsSameAsValid) {
149 settings->addRef();
150 cacheEntry->addRef();
151 }
152
153 RuleBasedCollator::RuleBasedCollator(const uint8_t *bin, int32_t length,
154 const RuleBasedCollator *base, UErrorCode &errorCode)
155 : data(NULL),
156 settings(NULL),
157 tailoring(NULL),
158 cacheEntry(NULL),
159 validLocale(""),
160 explicitlySetAttributes(0),
161 actualLocaleIsSameAsValid(FALSE) {
162 if(U_FAILURE(errorCode)) { return; }
163 if(bin == NULL || length == 0 || base == NULL) {
164 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
165 return;
166 }
167 const CollationTailoring *root = CollationRoot::getRoot(errorCode);
168 if(U_FAILURE(errorCode)) { return; }
169 if(base->tailoring != root) {
170 errorCode = U_UNSUPPORTED_ERROR;
171 return;
172 }
173 LocalPointer<CollationTailoring> t(new CollationTailoring(base->tailoring->settings));
174 if(t.isNull() || t->isBogus()) {
175 errorCode = U_MEMORY_ALLOCATION_ERROR;
176 return;
177 }
178 CollationDataReader::read(base->tailoring, bin, length, *t, errorCode);
179 if(U_FAILURE(errorCode)) { return; }
180 t->actualLocale.setToBogus();
181 adoptTailoring(t.orphan(), errorCode);
182 }
183
184 RuleBasedCollator::RuleBasedCollator(const CollationCacheEntry *entry)
185 : data(entry->tailoring->data),
186 settings(entry->tailoring->settings),
187 tailoring(entry->tailoring),
188 cacheEntry(entry),
189 validLocale(entry->validLocale),
190 explicitlySetAttributes(0),
191 actualLocaleIsSameAsValid(FALSE) {
192 settings->addRef();
193 cacheEntry->addRef();
194 }
195
196 RuleBasedCollator::~RuleBasedCollator() {
197 SharedObject::clearPtr(settings);
198 SharedObject::clearPtr(cacheEntry);
199 }
200
201 void
202 RuleBasedCollator::adoptTailoring(CollationTailoring *t, UErrorCode &errorCode) {
203 if(U_FAILURE(errorCode)) {
204 t->deleteIfZeroRefCount();
205 return;
206 }
207 U_ASSERT(settings == NULL && data == NULL && tailoring == NULL && cacheEntry == NULL);
208 cacheEntry = new CollationCacheEntry(t->actualLocale, t);
209 if(cacheEntry == NULL) {
210 errorCode = U_MEMORY_ALLOCATION_ERROR;
211 t->deleteIfZeroRefCount();
212 return;
213 }
214 data = t->data;
215 settings = t->settings;
216 settings->addRef();
217 tailoring = t;
218 cacheEntry->addRef();
219 validLocale = t->actualLocale;
220 actualLocaleIsSameAsValid = FALSE;
221 }
222
223 RuleBasedCollator *
224 RuleBasedCollator::clone() const {
225 return new RuleBasedCollator(*this);
226 }
227
228 RuleBasedCollator &RuleBasedCollator::operator=(const RuleBasedCollator &other) {
229 if(this == &other) { return *this; }
230 SharedObject::copyPtr(other.settings, settings);
231 tailoring = other.tailoring;
232 SharedObject::copyPtr(other.cacheEntry, cacheEntry);
233 data = tailoring->data;
234 validLocale = other.validLocale;
235 explicitlySetAttributes = other.explicitlySetAttributes;
236 actualLocaleIsSameAsValid = other.actualLocaleIsSameAsValid;
237 return *this;
238 }
239
240 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedCollator)
241
242 UBool
243 RuleBasedCollator::operator==(const Collator& other) const {
244 if(this == &other) { return TRUE; }
245 if(!Collator::operator==(other)) { return FALSE; }
246 const RuleBasedCollator &o = static_cast<const RuleBasedCollator &>(other);
247 if(*settings != *o.settings) { return FALSE; }
248 if(data == o.data) { return TRUE; }
249 UBool thisIsRoot = data->base == NULL;
250 UBool otherIsRoot = o.data->base == NULL;
251 U_ASSERT(!thisIsRoot || !otherIsRoot); // otherwise their data pointers should be ==
252 if(thisIsRoot != otherIsRoot) { return FALSE; }
253 if((thisIsRoot || !tailoring->rules.isEmpty()) &&
254 (otherIsRoot || !o.tailoring->rules.isEmpty())) {
255 // Shortcut: If both collators have valid rule strings, then compare those.
256 if(tailoring->rules == o.tailoring->rules) { return TRUE; }
257 }
258 // Different rule strings can result in the same or equivalent tailoring.
259 // The rule strings are optional in ICU resource bundles, although included by default.
260 // cloneBinary() drops the rule string.
261 UErrorCode errorCode = U_ZERO_ERROR;
262 LocalPointer<UnicodeSet> thisTailored(getTailoredSet(errorCode));
263 LocalPointer<UnicodeSet> otherTailored(o.getTailoredSet(errorCode));
264 if(U_FAILURE(errorCode)) { return FALSE; }
265 if(*thisTailored != *otherTailored) { return FALSE; }
266 // For completeness, we should compare all of the mappings;
267 // or we should create a list of strings, sort it with one collator,
268 // and check if both collators compare adjacent strings the same
269 // (order & strength, down to quaternary); or similar.
270 // Testing equality of collators seems unusual.
271 return TRUE;
272 }
273
274 int32_t
275 RuleBasedCollator::hashCode() const {
276 int32_t h = settings->hashCode();
277 if(data->base == NULL) { return h; } // root collator
278 // Do not rely on the rule string, see comments in operator==().
279 UErrorCode errorCode = U_ZERO_ERROR;
280 LocalPointer<UnicodeSet> set(getTailoredSet(errorCode));
281 if(U_FAILURE(errorCode)) { return 0; }
282 UnicodeSetIterator iter(*set);
283 while(iter.next() && !iter.isString()) {
284 h ^= data->getCE32(iter.getCodepoint());
285 }
286 return h;
287 }
288
289 void
290 RuleBasedCollator::setLocales(const Locale &requested, const Locale &valid,
291 const Locale &actual) {
292 if(actual == tailoring->actualLocale) {
293 actualLocaleIsSameAsValid = FALSE;
294 } else {
295 U_ASSERT(actual == valid);
296 actualLocaleIsSameAsValid = TRUE;
297 }
298 // Do not modify tailoring.actualLocale:
299 // We cannot be sure that that would be thread-safe.
300 validLocale = valid;
301 (void)requested; // Ignore, see also ticket #10477.
302 }
303
304 Locale
305 RuleBasedCollator::getLocale(ULocDataLocaleType type, UErrorCode& errorCode) const {
306 if(U_FAILURE(errorCode)) {
307 return Locale::getRoot();
308 }
309 switch(type) {
310 case ULOC_ACTUAL_LOCALE:
311 return actualLocaleIsSameAsValid ? validLocale : tailoring->actualLocale;
312 case ULOC_VALID_LOCALE:
313 case ULOC_REQUESTED_LOCALE: // Apple: keep treating as ULOC_VALID_LOCALE, apps depend on it <rdar://problem/19546211>
314 return validLocale;
315 default:
316 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
317 return Locale::getRoot();
318 }
319 }
320
321 const char *
322 RuleBasedCollator::internalGetLocaleID(ULocDataLocaleType type, UErrorCode &errorCode) const {
323 if(U_FAILURE(errorCode)) {
324 return NULL;
325 }
326 const Locale *result;
327 switch(type) {
328 case ULOC_ACTUAL_LOCALE:
329 result = actualLocaleIsSameAsValid ? &validLocale : &tailoring->actualLocale;
330 break;
331 case ULOC_VALID_LOCALE:
332 case ULOC_REQUESTED_LOCALE: // Apple: keep treating as ULOC_VALID_LOCALE, apps depend on it <rdar://problem/19546211>
333 result = &validLocale;
334 break;
335 default:
336 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
337 return NULL;
338 }
339 if(result->isBogus()) { return NULL; }
340 const char *id = result->getName();
341 return id[0] == 0 ? "root" : id;
342 }
343
344 const UnicodeString&
345 RuleBasedCollator::getRules() const {
346 return tailoring->rules;
347 }
348
349 void
350 RuleBasedCollator::getRules(UColRuleOption delta, UnicodeString &buffer) const {
351 if(delta == UCOL_TAILORING_ONLY) {
352 buffer = tailoring->rules;
353 return;
354 }
355 // UCOL_FULL_RULES
356 buffer.remove();
357 CollationLoader::appendRootRules(buffer);
358 buffer.append(tailoring->rules).getTerminatedBuffer();
359 }
360
361 void
362 RuleBasedCollator::getVersion(UVersionInfo version) const {
363 uprv_memcpy(version, tailoring->version, U_MAX_VERSION_LENGTH);
364 if (version[1]==0x68 /* uca 13.0 */ && (version[2]&0xC0)==0 /*uca x.x.0*/) {
365 version[2]|=0x40; // uca x.x.1, Apple hack to bump UCA version for ICU 66.1 final
366 }
367 version[0] += (UCOL_RUNTIME_VERSION << 4) + (UCOL_RUNTIME_VERSION >> 4);
368 }
369
370 UnicodeSet *
371 RuleBasedCollator::getTailoredSet(UErrorCode &errorCode) const {
372 if(U_FAILURE(errorCode)) { return NULL; }
373 UnicodeSet *tailored = new UnicodeSet();
374 if(tailored == NULL) {
375 errorCode = U_MEMORY_ALLOCATION_ERROR;
376 return NULL;
377 }
378 if(data->base != NULL) {
379 TailoredSet(tailored).forData(data, errorCode);
380 if(U_FAILURE(errorCode)) {
381 delete tailored;
382 return NULL;
383 }
384 }
385 return tailored;
386 }
387
388 void
389 RuleBasedCollator::internalGetContractionsAndExpansions(
390 UnicodeSet *contractions, UnicodeSet *expansions,
391 UBool addPrefixes, UErrorCode &errorCode) const {
392 if(U_FAILURE(errorCode)) { return; }
393 if(contractions != NULL) {
394 contractions->clear();
395 }
396 if(expansions != NULL) {
397 expansions->clear();
398 }
399 ContractionsAndExpansions(contractions, expansions, NULL, addPrefixes).forData(data, errorCode);
400 }
401
402 void
403 RuleBasedCollator::internalAddContractions(UChar32 c, UnicodeSet &set, UErrorCode &errorCode) const {
404 if(U_FAILURE(errorCode)) { return; }
405 ContractionsAndExpansions(&set, NULL, NULL, FALSE).forCodePoint(data, c, errorCode);
406 }
407
408 const CollationSettings &
409 RuleBasedCollator::getDefaultSettings() const {
410 return *tailoring->settings;
411 }
412
413 UColAttributeValue
414 RuleBasedCollator::getAttribute(UColAttribute attr, UErrorCode &errorCode) const {
415 if(U_FAILURE(errorCode)) { return UCOL_DEFAULT; }
416 int32_t option;
417 switch(attr) {
418 case UCOL_FRENCH_COLLATION:
419 option = CollationSettings::BACKWARD_SECONDARY;
420 break;
421 case UCOL_ALTERNATE_HANDLING:
422 return settings->getAlternateHandling();
423 case UCOL_CASE_FIRST:
424 return settings->getCaseFirst();
425 case UCOL_CASE_LEVEL:
426 option = CollationSettings::CASE_LEVEL;
427 break;
428 case UCOL_NORMALIZATION_MODE:
429 option = CollationSettings::CHECK_FCD;
430 break;
431 case UCOL_STRENGTH:
432 return (UColAttributeValue)settings->getStrength();
433 case UCOL_HIRAGANA_QUATERNARY_MODE:
434 // Deprecated attribute, unsettable.
435 return UCOL_OFF;
436 case UCOL_NUMERIC_COLLATION:
437 option = CollationSettings::NUMERIC;
438 break;
439 default:
440 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
441 return UCOL_DEFAULT;
442 }
443 return ((settings->options & option) == 0) ? UCOL_OFF : UCOL_ON;
444 }
445
446 void
447 RuleBasedCollator::setAttribute(UColAttribute attr, UColAttributeValue value,
448 UErrorCode &errorCode) {
449 UColAttributeValue oldValue = getAttribute(attr, errorCode);
450 if(U_FAILURE(errorCode)) { return; }
451 if(value == oldValue) {
452 setAttributeExplicitly(attr);
453 return;
454 }
455 const CollationSettings &defaultSettings = getDefaultSettings();
456 if(settings == &defaultSettings) {
457 if(value == UCOL_DEFAULT) {
458 setAttributeDefault(attr);
459 return;
460 }
461 }
462 CollationSettings *ownedSettings = SharedObject::copyOnWrite(settings);
463 if(ownedSettings == NULL) {
464 errorCode = U_MEMORY_ALLOCATION_ERROR;
465 return;
466 }
467
468 switch(attr) {
469 case UCOL_FRENCH_COLLATION:
470 ownedSettings->setFlag(CollationSettings::BACKWARD_SECONDARY, value,
471 defaultSettings.options, errorCode);
472 break;
473 case UCOL_ALTERNATE_HANDLING:
474 ownedSettings->setAlternateHandling(value, defaultSettings.options, errorCode);
475 break;
476 case UCOL_CASE_FIRST:
477 ownedSettings->setCaseFirst(value, defaultSettings.options, errorCode);
478 break;
479 case UCOL_CASE_LEVEL:
480 ownedSettings->setFlag(CollationSettings::CASE_LEVEL, value,
481 defaultSettings.options, errorCode);
482 break;
483 case UCOL_NORMALIZATION_MODE:
484 ownedSettings->setFlag(CollationSettings::CHECK_FCD, value,
485 defaultSettings.options, errorCode);
486 break;
487 case UCOL_STRENGTH:
488 ownedSettings->setStrength(value, defaultSettings.options, errorCode);
489 break;
490 case UCOL_HIRAGANA_QUATERNARY_MODE:
491 // Deprecated attribute. Check for valid values but do not change anything.
492 if(value != UCOL_OFF && value != UCOL_ON && value != UCOL_DEFAULT) {
493 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
494 }
495 break;
496 case UCOL_NUMERIC_COLLATION:
497 ownedSettings->setFlag(CollationSettings::NUMERIC, value, defaultSettings.options, errorCode);
498 break;
499 default:
500 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
501 break;
502 }
503 if(U_FAILURE(errorCode)) { return; }
504 setFastLatinOptions(*ownedSettings);
505 if(value == UCOL_DEFAULT) {
506 setAttributeDefault(attr);
507 } else {
508 setAttributeExplicitly(attr);
509 }
510 }
511
512 Collator &
513 RuleBasedCollator::setMaxVariable(UColReorderCode group, UErrorCode &errorCode) {
514 if(U_FAILURE(errorCode)) { return *this; }
515 // Convert the reorder code into a MaxVariable number, or UCOL_DEFAULT=-1.
516 int32_t value;
517 if(group == UCOL_REORDER_CODE_DEFAULT) {
518 value = UCOL_DEFAULT;
519 } else if(UCOL_REORDER_CODE_FIRST <= group && group <= UCOL_REORDER_CODE_CURRENCY) {
520 value = group - UCOL_REORDER_CODE_FIRST;
521 } else {
522 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
523 return *this;
524 }
525 CollationSettings::MaxVariable oldValue = settings->getMaxVariable();
526 if(value == oldValue) {
527 setAttributeExplicitly(ATTR_VARIABLE_TOP);
528 return *this;
529 }
530 const CollationSettings &defaultSettings = getDefaultSettings();
531 if(settings == &defaultSettings) {
532 if(value == UCOL_DEFAULT) {
533 setAttributeDefault(ATTR_VARIABLE_TOP);
534 return *this;
535 }
536 }
537 CollationSettings *ownedSettings = SharedObject::copyOnWrite(settings);
538 if(ownedSettings == NULL) {
539 errorCode = U_MEMORY_ALLOCATION_ERROR;
540 return *this;
541 }
542
543 if(group == UCOL_REORDER_CODE_DEFAULT) {
544 group = (UColReorderCode)(UCOL_REORDER_CODE_FIRST + defaultSettings.getMaxVariable());
545 }
546 uint32_t varTop = data->getLastPrimaryForGroup(group);
547 U_ASSERT(varTop != 0);
548 ownedSettings->setMaxVariable(value, defaultSettings.options, errorCode);
549 if(U_FAILURE(errorCode)) { return *this; }
550 ownedSettings->variableTop = varTop;
551 setFastLatinOptions(*ownedSettings);
552 if(value == UCOL_DEFAULT) {
553 setAttributeDefault(ATTR_VARIABLE_TOP);
554 } else {
555 setAttributeExplicitly(ATTR_VARIABLE_TOP);
556 }
557 return *this;
558 }
559
560 UColReorderCode
561 RuleBasedCollator::getMaxVariable() const {
562 return (UColReorderCode)(UCOL_REORDER_CODE_FIRST + settings->getMaxVariable());
563 }
564
565 uint32_t
566 RuleBasedCollator::getVariableTop(UErrorCode & /*errorCode*/) const {
567 return settings->variableTop;
568 }
569
570 uint32_t
571 RuleBasedCollator::setVariableTop(const UChar *varTop, int32_t len, UErrorCode &errorCode) {
572 if(U_FAILURE(errorCode)) { return 0; }
573 if(varTop == NULL && len !=0) {
574 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
575 return 0;
576 }
577 if(len < 0) { len = u_strlen(varTop); }
578 if(len == 0) {
579 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
580 return 0;
581 }
582 UBool numeric = settings->isNumeric();
583 int64_t ce1, ce2;
584 if(settings->dontCheckFCD()) {
585 UTF16CollationIterator ci(data, numeric, varTop, varTop, varTop + len);
586 ce1 = ci.nextCE(errorCode);
587 ce2 = ci.nextCE(errorCode);
588 } else {
589 FCDUTF16CollationIterator ci(data, numeric, varTop, varTop, varTop + len);
590 ce1 = ci.nextCE(errorCode);
591 ce2 = ci.nextCE(errorCode);
592 }
593 if(ce1 == Collation::NO_CE || ce2 != Collation::NO_CE) {
594 errorCode = U_CE_NOT_FOUND_ERROR;
595 return 0;
596 }
597 setVariableTop((uint32_t)(ce1 >> 32), errorCode);
598 return settings->variableTop;
599 }
600
601 uint32_t
602 RuleBasedCollator::setVariableTop(const UnicodeString &varTop, UErrorCode &errorCode) {
603 return setVariableTop(varTop.getBuffer(), varTop.length(), errorCode);
604 }
605
606 void
607 RuleBasedCollator::setVariableTop(uint32_t varTop, UErrorCode &errorCode) {
608 if(U_FAILURE(errorCode)) { return; }
609 if(varTop != settings->variableTop) {
610 // Pin the variable top to the end of the reordering group which contains it.
611 // Only a few special groups are supported.
612 int32_t group = data->getGroupForPrimary(varTop);
613 if(group < UCOL_REORDER_CODE_FIRST || UCOL_REORDER_CODE_CURRENCY < group) {
614 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
615 return;
616 }
617 uint32_t v = data->getLastPrimaryForGroup(group);
618 U_ASSERT(v != 0 && v >= varTop);
619 varTop = v;
620 if(varTop != settings->variableTop) {
621 CollationSettings *ownedSettings = SharedObject::copyOnWrite(settings);
622 if(ownedSettings == NULL) {
623 errorCode = U_MEMORY_ALLOCATION_ERROR;
624 return;
625 }
626 ownedSettings->setMaxVariable(group - UCOL_REORDER_CODE_FIRST,
627 getDefaultSettings().options, errorCode);
628 if(U_FAILURE(errorCode)) { return; }
629 ownedSettings->variableTop = varTop;
630 setFastLatinOptions(*ownedSettings);
631 }
632 }
633 if(varTop == getDefaultSettings().variableTop) {
634 setAttributeDefault(ATTR_VARIABLE_TOP);
635 } else {
636 setAttributeExplicitly(ATTR_VARIABLE_TOP);
637 }
638 }
639
640 int32_t
641 RuleBasedCollator::getReorderCodes(int32_t *dest, int32_t capacity,
642 UErrorCode &errorCode) const {
643 if(U_FAILURE(errorCode)) { return 0; }
644 if(capacity < 0 || (dest == NULL && capacity > 0)) {
645 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
646 return 0;
647 }
648 int32_t length = settings->reorderCodesLength;
649 if(length == 0) { return 0; }
650 if(length > capacity) {
651 errorCode = U_BUFFER_OVERFLOW_ERROR;
652 return length;
653 }
654 uprv_memcpy(dest, settings->reorderCodes, length * 4);
655 return length;
656 }
657
658 void
659 RuleBasedCollator::setReorderCodes(const int32_t *reorderCodes, int32_t length,
660 UErrorCode &errorCode) {
661 if(U_FAILURE(errorCode)) { return; }
662 if(length < 0 || (reorderCodes == NULL && length > 0)) {
663 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
664 return;
665 }
666 if(length == 1 && reorderCodes[0] == UCOL_REORDER_CODE_NONE) {
667 length = 0;
668 }
669 if(length == settings->reorderCodesLength &&
670 uprv_memcmp(reorderCodes, settings->reorderCodes, length * 4) == 0) {
671 return;
672 }
673 const CollationSettings &defaultSettings = getDefaultSettings();
674 if(length == 1 && reorderCodes[0] == UCOL_REORDER_CODE_DEFAULT) {
675 if(settings != &defaultSettings) {
676 CollationSettings *ownedSettings = SharedObject::copyOnWrite(settings);
677 if(ownedSettings == NULL) {
678 errorCode = U_MEMORY_ALLOCATION_ERROR;
679 return;
680 }
681 ownedSettings->copyReorderingFrom(defaultSettings, errorCode);
682 setFastLatinOptions(*ownedSettings);
683 }
684 return;
685 }
686 CollationSettings *ownedSettings = SharedObject::copyOnWrite(settings);
687 if(ownedSettings == NULL) {
688 errorCode = U_MEMORY_ALLOCATION_ERROR;
689 return;
690 }
691 ownedSettings->setReordering(*data, reorderCodes, length, errorCode);
692 setFastLatinOptions(*ownedSettings);
693 }
694
695 void
696 RuleBasedCollator::setFastLatinOptions(CollationSettings &ownedSettings) const {
697 ownedSettings.fastLatinOptions = CollationFastLatin::getOptions(
698 data, ownedSettings,
699 ownedSettings.fastLatinPrimaries, UPRV_LENGTHOF(ownedSettings.fastLatinPrimaries));
700 }
701
702 UCollationResult
703 RuleBasedCollator::compare(const UnicodeString &left, const UnicodeString &right,
704 UErrorCode &errorCode) const {
705 if(U_FAILURE(errorCode)) { return UCOL_EQUAL; }
706 return doCompare(left.getBuffer(), left.length(),
707 right.getBuffer(), right.length(), errorCode);
708 }
709
710 UCollationResult
711 RuleBasedCollator::compare(const UnicodeString &left, const UnicodeString &right,
712 int32_t length, UErrorCode &errorCode) const {
713 if(U_FAILURE(errorCode) || length == 0) { return UCOL_EQUAL; }
714 if(length < 0) {
715 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
716 return UCOL_EQUAL;
717 }
718 int32_t leftLength = left.length();
719 int32_t rightLength = right.length();
720 if(leftLength > length) { leftLength = length; }
721 if(rightLength > length) { rightLength = length; }
722 return doCompare(left.getBuffer(), leftLength,
723 right.getBuffer(), rightLength, errorCode);
724 }
725
726 UCollationResult
727 RuleBasedCollator::compare(const UChar *left, int32_t leftLength,
728 const UChar *right, int32_t rightLength,
729 UErrorCode &errorCode) const {
730 if(U_FAILURE(errorCode)) { return UCOL_EQUAL; }
731 if((left == NULL && leftLength != 0) || (right == NULL && rightLength != 0)) {
732 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
733 return UCOL_EQUAL;
734 }
735 // Make sure both or neither strings have a known length.
736 // We do not optimize for mixed length/termination.
737 if(leftLength >= 0) {
738 if(rightLength < 0) { rightLength = u_strlen(right); }
739 } else {
740 if(rightLength >= 0) { leftLength = u_strlen(left); }
741 }
742 return doCompare(left, leftLength, right, rightLength, errorCode);
743 }
744
745 UCollationResult
746 RuleBasedCollator::compareUTF8(const StringPiece &left, const StringPiece &right,
747 UErrorCode &errorCode) const {
748 if(U_FAILURE(errorCode)) { return UCOL_EQUAL; }
749 const uint8_t *leftBytes = reinterpret_cast<const uint8_t *>(left.data());
750 const uint8_t *rightBytes = reinterpret_cast<const uint8_t *>(right.data());
751 if((leftBytes == NULL && !left.empty()) || (rightBytes == NULL && !right.empty())) {
752 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
753 return UCOL_EQUAL;
754 }
755 return doCompare(leftBytes, left.length(), rightBytes, right.length(), errorCode);
756 }
757
758 UCollationResult
759 RuleBasedCollator::internalCompareUTF8(const char *left, int32_t leftLength,
760 const char *right, int32_t rightLength,
761 UErrorCode &errorCode) const {
762 if(U_FAILURE(errorCode)) { return UCOL_EQUAL; }
763 if((left == NULL && leftLength != 0) || (right == NULL && rightLength != 0)) {
764 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
765 return UCOL_EQUAL;
766 }
767 // Make sure both or neither strings have a known length.
768 // We do not optimize for mixed length/termination.
769 if(leftLength >= 0) {
770 if(rightLength < 0) { rightLength = static_cast<int32_t>(uprv_strlen(right)); }
771 } else {
772 if(rightLength >= 0) { leftLength = static_cast<int32_t>(uprv_strlen(left)); }
773 }
774 return doCompare(reinterpret_cast<const uint8_t *>(left), leftLength,
775 reinterpret_cast<const uint8_t *>(right), rightLength, errorCode);
776 }
777
778 namespace {
779
780 /**
781 * Abstract iterator for identical-level string comparisons.
782 * Returns FCD code points and handles temporary switching to NFD.
783 */
784 class NFDIterator : public UObject {
785 public:
786 NFDIterator() : index(-1), length(0) {}
787 virtual ~NFDIterator() {}
788 /**
789 * Returns the next code point from the internal normalization buffer,
790 * or else the next text code point.
791 * Returns -1 at the end of the text.
792 */
793 UChar32 nextCodePoint() {
794 if(index >= 0) {
795 if(index == length) {
796 index = -1;
797 } else {
798 UChar32 c;
799 U16_NEXT_UNSAFE(decomp, index, c);
800 return c;
801 }
802 }
803 return nextRawCodePoint();
804 }
805 /**
806 * @param nfcImpl
807 * @param c the last code point returned by nextCodePoint() or nextDecomposedCodePoint()
808 * @return the first code point in c's decomposition,
809 * or c itself if it was decomposed already or if it does not decompose
810 */
811 UChar32 nextDecomposedCodePoint(const Normalizer2Impl &nfcImpl, UChar32 c) {
812 if(index >= 0) { return c; }
813 decomp = nfcImpl.getDecomposition(c, buffer, length);
814 if(decomp == NULL) { return c; }
815 index = 0;
816 U16_NEXT_UNSAFE(decomp, index, c);
817 return c;
818 }
819 protected:
820 /**
821 * Returns the next text code point in FCD order.
822 * Returns -1 at the end of the text.
823 */
824 virtual UChar32 nextRawCodePoint() = 0;
825 private:
826 const UChar *decomp;
827 UChar buffer[4];
828 int32_t index;
829 int32_t length;
830 };
831
832 class UTF16NFDIterator : public NFDIterator {
833 public:
834 UTF16NFDIterator(const UChar *text, const UChar *textLimit) : s(text), limit(textLimit) {}
835 protected:
836 virtual UChar32 nextRawCodePoint() {
837 if(s == limit) { return U_SENTINEL; }
838 UChar32 c = *s++;
839 if(limit == NULL && c == 0) {
840 s = NULL;
841 return U_SENTINEL;
842 }
843 UChar trail;
844 if(U16_IS_LEAD(c) && s != limit && U16_IS_TRAIL(trail = *s)) {
845 ++s;
846 c = U16_GET_SUPPLEMENTARY(c, trail);
847 }
848 return c;
849 }
850
851 const UChar *s;
852 const UChar *limit;
853 };
854
855 class FCDUTF16NFDIterator : public UTF16NFDIterator {
856 public:
857 FCDUTF16NFDIterator(const Normalizer2Impl &nfcImpl, const UChar *text, const UChar *textLimit)
858 : UTF16NFDIterator(NULL, NULL) {
859 UErrorCode errorCode = U_ZERO_ERROR;
860 const UChar *spanLimit = nfcImpl.makeFCD(text, textLimit, NULL, errorCode);
861 if(U_FAILURE(errorCode)) { return; }
862 if(spanLimit == textLimit || (textLimit == NULL && *spanLimit == 0)) {
863 s = text;
864 limit = spanLimit;
865 } else {
866 str.setTo(text, (int32_t)(spanLimit - text));
867 {
868 ReorderingBuffer r_buffer(nfcImpl, str);
869 if(r_buffer.init(str.length(), errorCode)) {
870 nfcImpl.makeFCD(spanLimit, textLimit, &r_buffer, errorCode);
871 }
872 }
873 if(U_SUCCESS(errorCode)) {
874 s = str.getBuffer();
875 limit = s + str.length();
876 }
877 }
878 }
879 private:
880 UnicodeString str;
881 };
882
883 class UTF8NFDIterator : public NFDIterator {
884 public:
885 UTF8NFDIterator(const uint8_t *text, int32_t textLength)
886 : s(text), pos(0), length(textLength) {}
887 protected:
888 virtual UChar32 nextRawCodePoint() {
889 if(pos == length || (s[pos] == 0 && length < 0)) { return U_SENTINEL; }
890 UChar32 c;
891 U8_NEXT_OR_FFFD(s, pos, length, c);
892 return c;
893 }
894
895 const uint8_t *s;
896 int32_t pos;
897 int32_t length;
898 };
899
900 class FCDUTF8NFDIterator : public NFDIterator {
901 public:
902 FCDUTF8NFDIterator(const CollationData *data, const uint8_t *text, int32_t textLength)
903 : u8ci(data, FALSE, text, 0, textLength) {}
904 protected:
905 virtual UChar32 nextRawCodePoint() {
906 UErrorCode errorCode = U_ZERO_ERROR;
907 return u8ci.nextCodePoint(errorCode);
908 }
909 private:
910 FCDUTF8CollationIterator u8ci;
911 };
912
913 class UIterNFDIterator : public NFDIterator {
914 public:
915 UIterNFDIterator(UCharIterator &it) : iter(it) {}
916 protected:
917 virtual UChar32 nextRawCodePoint() {
918 return uiter_next32(&iter);
919 }
920 private:
921 UCharIterator &iter;
922 };
923
924 class FCDUIterNFDIterator : public NFDIterator {
925 public:
926 FCDUIterNFDIterator(const CollationData *data, UCharIterator &it, int32_t startIndex)
927 : uici(data, FALSE, it, startIndex) {}
928 protected:
929 virtual UChar32 nextRawCodePoint() {
930 UErrorCode errorCode = U_ZERO_ERROR;
931 return uici.nextCodePoint(errorCode);
932 }
933 private:
934 FCDUIterCollationIterator uici;
935 };
936
937 UCollationResult compareNFDIter(const Normalizer2Impl &nfcImpl,
938 NFDIterator &left, NFDIterator &right) {
939 for(;;) {
940 // Fetch the next FCD code point from each string.
941 UChar32 leftCp = left.nextCodePoint();
942 UChar32 rightCp = right.nextCodePoint();
943 if(leftCp == rightCp) {
944 if(leftCp < 0) { break; }
945 continue;
946 }
947 // If they are different, then decompose each and compare again.
948 if(leftCp < 0) {
949 leftCp = -2; // end of string
950 } else if(leftCp == 0xfffe) {
951 leftCp = -1; // U+FFFE: merge separator
952 } else {
953 leftCp = left.nextDecomposedCodePoint(nfcImpl, leftCp);
954 }
955 if(rightCp < 0) {
956 rightCp = -2; // end of string
957 } else if(rightCp == 0xfffe) {
958 rightCp = -1; // U+FFFE: merge separator
959 } else {
960 rightCp = right.nextDecomposedCodePoint(nfcImpl, rightCp);
961 }
962 if(leftCp < rightCp) { return UCOL_LESS; }
963 if(leftCp > rightCp) { return UCOL_GREATER; }
964 }
965 return UCOL_EQUAL;
966 }
967
968 } // namespace
969
970 UCollationResult
971 RuleBasedCollator::doCompare(const UChar *left, int32_t leftLength,
972 const UChar *right, int32_t rightLength,
973 UErrorCode &errorCode) const {
974 // U_FAILURE(errorCode) checked by caller.
975 if(left == right && leftLength == rightLength) {
976 return UCOL_EQUAL;
977 }
978
979 // Identical-prefix test.
980 const UChar *leftLimit;
981 const UChar *rightLimit;
982 int32_t equalPrefixLength = 0;
983 if(leftLength < 0) {
984 leftLimit = NULL;
985 rightLimit = NULL;
986 UChar c;
987 while((c = left[equalPrefixLength]) == right[equalPrefixLength]) {
988 if(c == 0) { return UCOL_EQUAL; }
989 ++equalPrefixLength;
990 }
991 } else {
992 leftLimit = left + leftLength;
993 rightLimit = right + rightLength;
994 for(;;) {
995 if(equalPrefixLength == leftLength) {
996 if(equalPrefixLength == rightLength) { return UCOL_EQUAL; }
997 break;
998 } else if(equalPrefixLength == rightLength ||
999 left[equalPrefixLength] != right[equalPrefixLength]) {
1000 break;
1001 }
1002 ++equalPrefixLength;
1003 }
1004 }
1005
1006 UBool numeric = settings->isNumeric();
1007 if(equalPrefixLength > 0) {
1008 if((equalPrefixLength != leftLength &&
1009 data->isUnsafeBackward(left[equalPrefixLength], numeric)) ||
1010 (equalPrefixLength != rightLength &&
1011 data->isUnsafeBackward(right[equalPrefixLength], numeric))) {
1012 // Identical prefix: Back up to the start of a contraction or reordering sequence.
1013 while(--equalPrefixLength > 0 &&
1014 data->isUnsafeBackward(left[equalPrefixLength], numeric)) {}
1015 }
1016 // Notes:
1017 // - A longer string can compare equal to a prefix of it if only ignorables follow.
1018 // - With a backward level, a longer string can compare less-than a prefix of it.
1019
1020 // Pass the actual start of each string into the CollationIterators,
1021 // plus the equalPrefixLength position,
1022 // so that prefix matches back into the equal prefix work.
1023 }
1024
1025 int32_t result;
1026 int32_t fastLatinOptions = settings->fastLatinOptions;
1027 if(fastLatinOptions >= 0 &&
1028 (equalPrefixLength == leftLength ||
1029 left[equalPrefixLength] <= CollationFastLatin::LATIN_MAX) &&
1030 (equalPrefixLength == rightLength ||
1031 right[equalPrefixLength] <= CollationFastLatin::LATIN_MAX)) {
1032 if(leftLength >= 0) {
1033 result = CollationFastLatin::compareUTF16(data->fastLatinTable,
1034 settings->fastLatinPrimaries,
1035 fastLatinOptions,
1036 left + equalPrefixLength,
1037 leftLength - equalPrefixLength,
1038 right + equalPrefixLength,
1039 rightLength - equalPrefixLength);
1040 } else {
1041 result = CollationFastLatin::compareUTF16(data->fastLatinTable,
1042 settings->fastLatinPrimaries,
1043 fastLatinOptions,
1044 left + equalPrefixLength, -1,
1045 right + equalPrefixLength, -1);
1046 }
1047 } else {
1048 result = CollationFastLatin::BAIL_OUT_RESULT;
1049 }
1050
1051 if(result == CollationFastLatin::BAIL_OUT_RESULT) {
1052 if(settings->dontCheckFCD()) {
1053 UTF16CollationIterator leftIter(data, numeric,
1054 left, left + equalPrefixLength, leftLimit);
1055 UTF16CollationIterator rightIter(data, numeric,
1056 right, right + equalPrefixLength, rightLimit);
1057 result = CollationCompare::compareUpToQuaternary(leftIter, rightIter, *settings, errorCode);
1058 } else {
1059 FCDUTF16CollationIterator leftIter(data, numeric,
1060 left, left + equalPrefixLength, leftLimit);
1061 FCDUTF16CollationIterator rightIter(data, numeric,
1062 right, right + equalPrefixLength, rightLimit);
1063 result = CollationCompare::compareUpToQuaternary(leftIter, rightIter, *settings, errorCode);
1064 }
1065 }
1066 if(result != UCOL_EQUAL || settings->getStrength() < UCOL_IDENTICAL || U_FAILURE(errorCode)) {
1067 return (UCollationResult)result;
1068 }
1069
1070 // Note: If NUL-terminated, we could get the actual limits from the iterators now.
1071 // That would complicate the iterators a bit, NUL-terminated strings are only a C convenience,
1072 // and the benefit seems unlikely to be measurable.
1073
1074 // Compare identical level.
1075 const Normalizer2Impl &nfcImpl = data->nfcImpl;
1076 left += equalPrefixLength;
1077 right += equalPrefixLength;
1078 if(settings->dontCheckFCD()) {
1079 UTF16NFDIterator leftIter(left, leftLimit);
1080 UTF16NFDIterator rightIter(right, rightLimit);
1081 return compareNFDIter(nfcImpl, leftIter, rightIter);
1082 } else {
1083 FCDUTF16NFDIterator leftIter(nfcImpl, left, leftLimit);
1084 FCDUTF16NFDIterator rightIter(nfcImpl, right, rightLimit);
1085 return compareNFDIter(nfcImpl, leftIter, rightIter);
1086 }
1087 }
1088
1089 UCollationResult
1090 RuleBasedCollator::doCompare(const uint8_t *left, int32_t leftLength,
1091 const uint8_t *right, int32_t rightLength,
1092 UErrorCode &errorCode) const {
1093 // U_FAILURE(errorCode) checked by caller.
1094 if(left == right && leftLength == rightLength) {
1095 return UCOL_EQUAL;
1096 }
1097
1098 // Identical-prefix test.
1099 int32_t equalPrefixLength = 0;
1100 if(leftLength < 0) {
1101 uint8_t c;
1102 while((c = left[equalPrefixLength]) == right[equalPrefixLength]) {
1103 if(c == 0) { return UCOL_EQUAL; }
1104 ++equalPrefixLength;
1105 }
1106 } else {
1107 for(;;) {
1108 if(equalPrefixLength == leftLength) {
1109 if(equalPrefixLength == rightLength) { return UCOL_EQUAL; }
1110 break;
1111 } else if(equalPrefixLength == rightLength ||
1112 left[equalPrefixLength] != right[equalPrefixLength]) {
1113 break;
1114 }
1115 ++equalPrefixLength;
1116 }
1117 }
1118 // Back up to the start of a partially-equal code point.
1119 if(equalPrefixLength > 0 &&
1120 ((equalPrefixLength != leftLength && U8_IS_TRAIL(left[equalPrefixLength])) ||
1121 (equalPrefixLength != rightLength && U8_IS_TRAIL(right[equalPrefixLength])))) {
1122 while(--equalPrefixLength > 0 && U8_IS_TRAIL(left[equalPrefixLength])) {}
1123 }
1124
1125 UBool numeric = settings->isNumeric();
1126 if(equalPrefixLength > 0) {
1127 UBool unsafe = FALSE;
1128 if(equalPrefixLength != leftLength) {
1129 int32_t i = equalPrefixLength;
1130 UChar32 c;
1131 U8_NEXT_OR_FFFD(left, i, leftLength, c);
1132 unsafe = data->isUnsafeBackward(c, numeric);
1133 }
1134 if(!unsafe && equalPrefixLength != rightLength) {
1135 int32_t i = equalPrefixLength;
1136 UChar32 c;
1137 U8_NEXT_OR_FFFD(right, i, rightLength, c);
1138 unsafe = data->isUnsafeBackward(c, numeric);
1139 }
1140 if(unsafe) {
1141 // Identical prefix: Back up to the start of a contraction or reordering sequence.
1142 UChar32 c;
1143 do {
1144 U8_PREV_OR_FFFD(left, 0, equalPrefixLength, c);
1145 } while(equalPrefixLength > 0 && data->isUnsafeBackward(c, numeric));
1146 }
1147 // See the notes in the UTF-16 version.
1148
1149 // Pass the actual start of each string into the CollationIterators,
1150 // plus the equalPrefixLength position,
1151 // so that prefix matches back into the equal prefix work.
1152 }
1153
1154 int32_t result;
1155 int32_t fastLatinOptions = settings->fastLatinOptions;
1156 if(fastLatinOptions >= 0 &&
1157 (equalPrefixLength == leftLength ||
1158 left[equalPrefixLength] <= CollationFastLatin::LATIN_MAX_UTF8_LEAD) &&
1159 (equalPrefixLength == rightLength ||
1160 right[equalPrefixLength] <= CollationFastLatin::LATIN_MAX_UTF8_LEAD)) {
1161 if(leftLength >= 0) {
1162 result = CollationFastLatin::compareUTF8(data->fastLatinTable,
1163 settings->fastLatinPrimaries,
1164 fastLatinOptions,
1165 left + equalPrefixLength,
1166 leftLength - equalPrefixLength,
1167 right + equalPrefixLength,
1168 rightLength - equalPrefixLength);
1169 } else {
1170 result = CollationFastLatin::compareUTF8(data->fastLatinTable,
1171 settings->fastLatinPrimaries,
1172 fastLatinOptions,
1173 left + equalPrefixLength, -1,
1174 right + equalPrefixLength, -1);
1175 }
1176 } else {
1177 result = CollationFastLatin::BAIL_OUT_RESULT;
1178 }
1179
1180 if(result == CollationFastLatin::BAIL_OUT_RESULT) {
1181 if(settings->dontCheckFCD()) {
1182 UTF8CollationIterator leftIter(data, numeric, left, equalPrefixLength, leftLength);
1183 UTF8CollationIterator rightIter(data, numeric, right, equalPrefixLength, rightLength);
1184 result = CollationCompare::compareUpToQuaternary(leftIter, rightIter, *settings, errorCode);
1185 } else {
1186 FCDUTF8CollationIterator leftIter(data, numeric, left, equalPrefixLength, leftLength);
1187 FCDUTF8CollationIterator rightIter(data, numeric, right, equalPrefixLength, rightLength);
1188 result = CollationCompare::compareUpToQuaternary(leftIter, rightIter, *settings, errorCode);
1189 }
1190 }
1191 if(result != UCOL_EQUAL || settings->getStrength() < UCOL_IDENTICAL || U_FAILURE(errorCode)) {
1192 return (UCollationResult)result;
1193 }
1194
1195 // Note: If NUL-terminated, we could get the actual limits from the iterators now.
1196 // That would complicate the iterators a bit, NUL-terminated strings are only a C convenience,
1197 // and the benefit seems unlikely to be measurable.
1198
1199 // Compare identical level.
1200 const Normalizer2Impl &nfcImpl = data->nfcImpl;
1201 left += equalPrefixLength;
1202 right += equalPrefixLength;
1203 if(leftLength > 0) {
1204 leftLength -= equalPrefixLength;
1205 rightLength -= equalPrefixLength;
1206 }
1207 if(settings->dontCheckFCD()) {
1208 UTF8NFDIterator leftIter(left, leftLength);
1209 UTF8NFDIterator rightIter(right, rightLength);
1210 return compareNFDIter(nfcImpl, leftIter, rightIter);
1211 } else {
1212 FCDUTF8NFDIterator leftIter(data, left, leftLength);
1213 FCDUTF8NFDIterator rightIter(data, right, rightLength);
1214 return compareNFDIter(nfcImpl, leftIter, rightIter);
1215 }
1216 }
1217
1218 UCollationResult
1219 RuleBasedCollator::compare(UCharIterator &left, UCharIterator &right,
1220 UErrorCode &errorCode) const {
1221 if(U_FAILURE(errorCode) || &left == &right) { return UCOL_EQUAL; }
1222 UBool numeric = settings->isNumeric();
1223
1224 // Identical-prefix test.
1225 int32_t equalPrefixLength = 0;
1226 {
1227 UChar32 leftUnit;
1228 UChar32 rightUnit;
1229 while((leftUnit = left.next(&left)) == (rightUnit = right.next(&right))) {
1230 if(leftUnit < 0) { return UCOL_EQUAL; }
1231 ++equalPrefixLength;
1232 }
1233
1234 // Back out the code units that differed, for the real collation comparison.
1235 if(leftUnit >= 0) { left.previous(&left); }
1236 if(rightUnit >= 0) { right.previous(&right); }
1237
1238 if(equalPrefixLength > 0) {
1239 if((leftUnit >= 0 && data->isUnsafeBackward(leftUnit, numeric)) ||
1240 (rightUnit >= 0 && data->isUnsafeBackward(rightUnit, numeric))) {
1241 // Identical prefix: Back up to the start of a contraction or reordering sequence.
1242 do {
1243 --equalPrefixLength;
1244 leftUnit = left.previous(&left);
1245 right.previous(&right);
1246 } while(equalPrefixLength > 0 && data->isUnsafeBackward(leftUnit, numeric));
1247 }
1248 // See the notes in the UTF-16 version.
1249 }
1250 }
1251
1252 UCollationResult result;
1253 if(settings->dontCheckFCD()) {
1254 UIterCollationIterator leftIter(data, numeric, left);
1255 UIterCollationIterator rightIter(data, numeric, right);
1256 result = CollationCompare::compareUpToQuaternary(leftIter, rightIter, *settings, errorCode);
1257 } else {
1258 FCDUIterCollationIterator leftIter(data, numeric, left, equalPrefixLength);
1259 FCDUIterCollationIterator rightIter(data, numeric, right, equalPrefixLength);
1260 result = CollationCompare::compareUpToQuaternary(leftIter, rightIter, *settings, errorCode);
1261 }
1262 if(result != UCOL_EQUAL || settings->getStrength() < UCOL_IDENTICAL || U_FAILURE(errorCode)) {
1263 return result;
1264 }
1265
1266 // Compare identical level.
1267 left.move(&left, equalPrefixLength, UITER_ZERO);
1268 right.move(&right, equalPrefixLength, UITER_ZERO);
1269 const Normalizer2Impl &nfcImpl = data->nfcImpl;
1270 if(settings->dontCheckFCD()) {
1271 UIterNFDIterator leftIter(left);
1272 UIterNFDIterator rightIter(right);
1273 return compareNFDIter(nfcImpl, leftIter, rightIter);
1274 } else {
1275 FCDUIterNFDIterator leftIter(data, left, equalPrefixLength);
1276 FCDUIterNFDIterator rightIter(data, right, equalPrefixLength);
1277 return compareNFDIter(nfcImpl, leftIter, rightIter);
1278 }
1279 }
1280
1281 CollationKey &
1282 RuleBasedCollator::getCollationKey(const UnicodeString &s, CollationKey &key,
1283 UErrorCode &errorCode) const {
1284 return getCollationKey(s.getBuffer(), s.length(), key, errorCode);
1285 }
1286
1287 CollationKey &
1288 RuleBasedCollator::getCollationKey(const UChar *s, int32_t length, CollationKey& key,
1289 UErrorCode &errorCode) const {
1290 if(U_FAILURE(errorCode)) {
1291 return key.setToBogus();
1292 }
1293 if(s == NULL && length != 0) {
1294 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
1295 return key.setToBogus();
1296 }
1297 key.reset(); // resets the "bogus" state
1298 CollationKeyByteSink sink(key);
1299 writeSortKey(s, length, sink, errorCode);
1300 if(U_FAILURE(errorCode)) {
1301 key.setToBogus();
1302 } else if(key.isBogus()) {
1303 errorCode = U_MEMORY_ALLOCATION_ERROR;
1304 } else {
1305 key.setLength(sink.NumberOfBytesAppended());
1306 }
1307 return key;
1308 }
1309
1310 int32_t
1311 RuleBasedCollator::getSortKey(const UnicodeString &s,
1312 uint8_t *dest, int32_t capacity) const {
1313 return getSortKey(s.getBuffer(), s.length(), dest, capacity);
1314 }
1315
1316 int32_t
1317 RuleBasedCollator::getSortKey(const UChar *s, int32_t length,
1318 uint8_t *dest, int32_t capacity) const {
1319 if((s == NULL && length != 0) || capacity < 0 || (dest == NULL && capacity > 0)) {
1320 return 0;
1321 }
1322 uint8_t noDest[1] = { 0 };
1323 if(dest == NULL) {
1324 // Distinguish pure preflighting from an allocation error.
1325 dest = noDest;
1326 capacity = 0;
1327 }
1328 FixedSortKeyByteSink sink(reinterpret_cast<char *>(dest), capacity);
1329 UErrorCode errorCode = U_ZERO_ERROR;
1330 writeSortKey(s, length, sink, errorCode);
1331 return U_SUCCESS(errorCode) ? sink.NumberOfBytesAppended() : 0;
1332 }
1333
1334 void
1335 RuleBasedCollator::writeSortKey(const UChar *s, int32_t length,
1336 SortKeyByteSink &sink, UErrorCode &errorCode) const {
1337 if(U_FAILURE(errorCode)) { return; }
1338 const UChar *limit = (length >= 0) ? s + length : NULL;
1339 UBool numeric = settings->isNumeric();
1340 CollationKeys::LevelCallback callback;
1341 if(settings->dontCheckFCD()) {
1342 UTF16CollationIterator iter(data, numeric, s, s, limit);
1343 CollationKeys::writeSortKeyUpToQuaternary(iter, data->compressibleBytes, *settings,
1344 sink, Collation::PRIMARY_LEVEL,
1345 callback, TRUE, errorCode);
1346 } else {
1347 FCDUTF16CollationIterator iter(data, numeric, s, s, limit);
1348 CollationKeys::writeSortKeyUpToQuaternary(iter, data->compressibleBytes, *settings,
1349 sink, Collation::PRIMARY_LEVEL,
1350 callback, TRUE, errorCode);
1351 }
1352 if(settings->getStrength() == UCOL_IDENTICAL) {
1353 writeIdenticalLevel(s, limit, sink, errorCode);
1354 }
1355 static const char terminator = 0; // TERMINATOR_BYTE
1356 sink.Append(&terminator, 1);
1357 }
1358
1359 void
1360 RuleBasedCollator::writeIdenticalLevel(const UChar *s, const UChar *limit,
1361 SortKeyByteSink &sink, UErrorCode &errorCode) const {
1362 // NFD quick check
1363 const UChar *nfdQCYesLimit = data->nfcImpl.decompose(s, limit, NULL, errorCode);
1364 if(U_FAILURE(errorCode)) { return; }
1365 sink.Append(Collation::LEVEL_SEPARATOR_BYTE);
1366 UChar32 prev = 0;
1367 if(nfdQCYesLimit != s) {
1368 prev = u_writeIdenticalLevelRun(prev, s, (int32_t)(nfdQCYesLimit - s), sink);
1369 }
1370 // Is there non-NFD text?
1371 int32_t destLengthEstimate;
1372 if(limit != NULL) {
1373 if(nfdQCYesLimit == limit) { return; }
1374 destLengthEstimate = (int32_t)(limit - nfdQCYesLimit);
1375 } else {
1376 // s is NUL-terminated
1377 if(*nfdQCYesLimit == 0) { return; }
1378 destLengthEstimate = -1;
1379 }
1380 UnicodeString nfd;
1381 data->nfcImpl.decompose(nfdQCYesLimit, limit, nfd, destLengthEstimate, errorCode);
1382 u_writeIdenticalLevelRun(prev, nfd.getBuffer(), nfd.length(), sink);
1383 }
1384
1385 namespace {
1386
1387 /**
1388 * internalNextSortKeyPart() calls CollationKeys::writeSortKeyUpToQuaternary()
1389 * with an instance of this callback class.
1390 * When another level is about to be written, the callback
1391 * records the level and the number of bytes that will be written until
1392 * the sink (which is actually a FixedSortKeyByteSink) fills up.
1393 *
1394 * When internalNextSortKeyPart() is called again, it restarts with the last level
1395 * and ignores as many bytes as were written previously for that level.
1396 */
1397 class PartLevelCallback : public CollationKeys::LevelCallback {
1398 public:
1399 PartLevelCallback(const SortKeyByteSink &s)
1400 : sink(s), level(Collation::PRIMARY_LEVEL) {
1401 levelCapacity = sink.GetRemainingCapacity();
1402 }
1403 virtual ~PartLevelCallback() {}
1404 virtual UBool needToWrite(Collation::Level l) {
1405 if(!sink.Overflowed()) {
1406 // Remember a level that will be at least partially written.
1407 level = l;
1408 levelCapacity = sink.GetRemainingCapacity();
1409 return TRUE;
1410 } else {
1411 return FALSE;
1412 }
1413 }
1414 Collation::Level getLevel() const { return level; }
1415 int32_t getLevelCapacity() const { return levelCapacity; }
1416
1417 private:
1418 const SortKeyByteSink &sink;
1419 Collation::Level level;
1420 int32_t levelCapacity;
1421 };
1422
1423 } // namespace
1424
1425 int32_t
1426 RuleBasedCollator::internalNextSortKeyPart(UCharIterator *iter, uint32_t state[2],
1427 uint8_t *dest, int32_t count, UErrorCode &errorCode) const {
1428 if(U_FAILURE(errorCode)) { return 0; }
1429 if(iter == NULL || state == NULL || count < 0 || (count > 0 && dest == NULL)) {
1430 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
1431 return 0;
1432 }
1433 if(count == 0) { return 0; }
1434
1435 FixedSortKeyByteSink sink(reinterpret_cast<char *>(dest), count);
1436 sink.IgnoreBytes((int32_t)state[1]);
1437 iter->move(iter, 0, UITER_START);
1438
1439 Collation::Level level = (Collation::Level)state[0];
1440 if(level <= Collation::QUATERNARY_LEVEL) {
1441 UBool numeric = settings->isNumeric();
1442 PartLevelCallback callback(sink);
1443 if(settings->dontCheckFCD()) {
1444 UIterCollationIterator ci(data, numeric, *iter);
1445 CollationKeys::writeSortKeyUpToQuaternary(ci, data->compressibleBytes, *settings,
1446 sink, level, callback, FALSE, errorCode);
1447 } else {
1448 FCDUIterCollationIterator ci(data, numeric, *iter, 0);
1449 CollationKeys::writeSortKeyUpToQuaternary(ci, data->compressibleBytes, *settings,
1450 sink, level, callback, FALSE, errorCode);
1451 }
1452 if(U_FAILURE(errorCode)) { return 0; }
1453 if(sink.NumberOfBytesAppended() > count) {
1454 state[0] = (uint32_t)callback.getLevel();
1455 state[1] = (uint32_t)callback.getLevelCapacity();
1456 return count;
1457 }
1458 // All of the normal levels are done.
1459 if(settings->getStrength() == UCOL_IDENTICAL) {
1460 level = Collation::IDENTICAL_LEVEL;
1461 iter->move(iter, 0, UITER_START);
1462 }
1463 // else fall through to setting ZERO_LEVEL
1464 }
1465
1466 if(level == Collation::IDENTICAL_LEVEL) {
1467 int32_t levelCapacity = sink.GetRemainingCapacity();
1468 UnicodeString s;
1469 for(;;) {
1470 UChar32 c = iter->next(iter);
1471 if(c < 0) { break; }
1472 s.append((UChar)c);
1473 }
1474 const UChar *sArray = s.getBuffer();
1475 writeIdenticalLevel(sArray, sArray + s.length(), sink, errorCode);
1476 if(U_FAILURE(errorCode)) { return 0; }
1477 if(sink.NumberOfBytesAppended() > count) {
1478 state[0] = (uint32_t)level;
1479 state[1] = (uint32_t)levelCapacity;
1480 return count;
1481 }
1482 }
1483
1484 // ZERO_LEVEL: Fill the remainder of dest with 00 bytes.
1485 state[0] = (uint32_t)Collation::ZERO_LEVEL;
1486 state[1] = 0;
1487 int32_t length = sink.NumberOfBytesAppended();
1488 int32_t i = length;
1489 while(i < count) { dest[i++] = 0; }
1490 return length;
1491 }
1492
1493 void
1494 RuleBasedCollator::internalGetCEs(const UnicodeString &str, UVector64 &ces,
1495 UErrorCode &errorCode) const {
1496 if(U_FAILURE(errorCode)) { return; }
1497 const UChar *s = str.getBuffer();
1498 const UChar *limit = s + str.length();
1499 UBool numeric = settings->isNumeric();
1500 if(settings->dontCheckFCD()) {
1501 UTF16CollationIterator iter(data, numeric, s, s, limit);
1502 int64_t ce;
1503 while((ce = iter.nextCE(errorCode)) != Collation::NO_CE) {
1504 ces.addElement(ce, errorCode);
1505 }
1506 } else {
1507 FCDUTF16CollationIterator iter(data, numeric, s, s, limit);
1508 int64_t ce;
1509 while((ce = iter.nextCE(errorCode)) != Collation::NO_CE) {
1510 ces.addElement(ce, errorCode);
1511 }
1512 }
1513 }
1514
1515 namespace {
1516
1517 void appendSubtag(CharString &s, char letter, const char *subtag, int32_t length,
1518 UErrorCode &errorCode) {
1519 if(U_FAILURE(errorCode) || length == 0) { return; }
1520 if(!s.isEmpty()) {
1521 s.append('_', errorCode);
1522 }
1523 s.append(letter, errorCode);
1524 for(int32_t i = 0; i < length; ++i) {
1525 s.append(uprv_toupper(subtag[i]), errorCode);
1526 }
1527 }
1528
1529 void appendAttribute(CharString &s, char letter, UColAttributeValue value,
1530 UErrorCode &errorCode) {
1531 if(U_FAILURE(errorCode)) { return; }
1532 if(!s.isEmpty()) {
1533 s.append('_', errorCode);
1534 }
1535 static const char *valueChars = "1234...........IXO..SN..LU......";
1536 s.append(letter, errorCode);
1537 s.append(valueChars[value], errorCode);
1538 }
1539
1540 } // namespace
1541
1542 int32_t
1543 RuleBasedCollator::internalGetShortDefinitionString(const char *locale,
1544 char *buffer, int32_t capacity,
1545 UErrorCode &errorCode) const {
1546 if(U_FAILURE(errorCode)) { return 0; }
1547 if(buffer == NULL ? capacity != 0 : capacity < 0) {
1548 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
1549 return 0;
1550 }
1551 if(locale == NULL) {
1552 locale = internalGetLocaleID(ULOC_VALID_LOCALE, errorCode);
1553 }
1554
1555 char resultLocale[ULOC_FULLNAME_CAPACITY + 1];
1556 int32_t length = ucol_getFunctionalEquivalent(resultLocale, ULOC_FULLNAME_CAPACITY,
1557 "collation", locale,
1558 NULL, &errorCode);
1559 if(U_FAILURE(errorCode)) { return 0; }
1560 if(length == 0) {
1561 uprv_strcpy(resultLocale, "root");
1562 } else {
1563 resultLocale[length] = 0;
1564 }
1565
1566 // Append items in alphabetic order of their short definition letters.
1567 CharString result;
1568 char subtag[ULOC_KEYWORD_AND_VALUES_CAPACITY];
1569
1570 if(attributeHasBeenSetExplicitly(UCOL_ALTERNATE_HANDLING)) {
1571 appendAttribute(result, 'A', getAttribute(UCOL_ALTERNATE_HANDLING, errorCode), errorCode);
1572 }
1573 // ATTR_VARIABLE_TOP not supported because 'B' was broken.
1574 // See ICU tickets #10372 and #10386.
1575 if(attributeHasBeenSetExplicitly(UCOL_CASE_FIRST)) {
1576 appendAttribute(result, 'C', getAttribute(UCOL_CASE_FIRST, errorCode), errorCode);
1577 }
1578 if(attributeHasBeenSetExplicitly(UCOL_NUMERIC_COLLATION)) {
1579 appendAttribute(result, 'D', getAttribute(UCOL_NUMERIC_COLLATION, errorCode), errorCode);
1580 }
1581 if(attributeHasBeenSetExplicitly(UCOL_CASE_LEVEL)) {
1582 appendAttribute(result, 'E', getAttribute(UCOL_CASE_LEVEL, errorCode), errorCode);
1583 }
1584 if(attributeHasBeenSetExplicitly(UCOL_FRENCH_COLLATION)) {
1585 appendAttribute(result, 'F', getAttribute(UCOL_FRENCH_COLLATION, errorCode), errorCode);
1586 }
1587 // Note: UCOL_HIRAGANA_QUATERNARY_MODE is deprecated and never changes away from default.
1588 length = uloc_getKeywordValue(resultLocale, "collation", subtag, UPRV_LENGTHOF(subtag), &errorCode);
1589 appendSubtag(result, 'K', subtag, length, errorCode);
1590 length = uloc_getLanguage(resultLocale, subtag, UPRV_LENGTHOF(subtag), &errorCode);
1591 if (length == 0) {
1592 appendSubtag(result, 'L', "root", 4, errorCode);
1593 } else {
1594 appendSubtag(result, 'L', subtag, length, errorCode);
1595 }
1596 if(attributeHasBeenSetExplicitly(UCOL_NORMALIZATION_MODE)) {
1597 appendAttribute(result, 'N', getAttribute(UCOL_NORMALIZATION_MODE, errorCode), errorCode);
1598 }
1599 length = uloc_getCountry(resultLocale, subtag, UPRV_LENGTHOF(subtag), &errorCode);
1600 appendSubtag(result, 'R', subtag, length, errorCode);
1601 if(attributeHasBeenSetExplicitly(UCOL_STRENGTH)) {
1602 appendAttribute(result, 'S', getAttribute(UCOL_STRENGTH, errorCode), errorCode);
1603 }
1604 length = uloc_getVariant(resultLocale, subtag, UPRV_LENGTHOF(subtag), &errorCode);
1605 appendSubtag(result, 'V', subtag, length, errorCode);
1606 length = uloc_getScript(resultLocale, subtag, UPRV_LENGTHOF(subtag), &errorCode);
1607 appendSubtag(result, 'Z', subtag, length, errorCode);
1608
1609 if(U_FAILURE(errorCode)) { return 0; }
1610 if(result.length() <= capacity) {
1611 uprv_memcpy(buffer, result.data(), result.length());
1612 }
1613 return u_terminateChars(buffer, capacity, result.length(), &errorCode);
1614 }
1615
1616 UBool
1617 RuleBasedCollator::isUnsafe(UChar32 c) const {
1618 return data->isUnsafeBackward(c, settings->isNumeric());
1619 }
1620
1621 void U_CALLCONV
1622 RuleBasedCollator::computeMaxExpansions(const CollationTailoring *t, UErrorCode &errorCode) {
1623 t->maxExpansions = CollationElementIterator::computeMaxExpansions(t->data, errorCode);
1624 }
1625
1626 UBool
1627 RuleBasedCollator::initMaxExpansions(UErrorCode &errorCode) const {
1628 umtx_initOnce(tailoring->maxExpansionsInitOnce, computeMaxExpansions, tailoring, errorCode);
1629 return U_SUCCESS(errorCode);
1630 }
1631
1632 CollationElementIterator *
1633 RuleBasedCollator::createCollationElementIterator(const UnicodeString& source) const {
1634 UErrorCode errorCode = U_ZERO_ERROR;
1635 if(!initMaxExpansions(errorCode)) { return NULL; }
1636 CollationElementIterator *cei = new CollationElementIterator(source, this, errorCode);
1637 if(U_FAILURE(errorCode)) {
1638 delete cei;
1639 return NULL;
1640 }
1641 return cei;
1642 }
1643
1644 CollationElementIterator *
1645 RuleBasedCollator::createCollationElementIterator(const CharacterIterator& source) const {
1646 UErrorCode errorCode = U_ZERO_ERROR;
1647 if(!initMaxExpansions(errorCode)) { return NULL; }
1648 CollationElementIterator *cei = new CollationElementIterator(source, this, errorCode);
1649 if(U_FAILURE(errorCode)) {
1650 delete cei;
1651 return NULL;
1652 }
1653 return cei;
1654 }
1655
1656 int32_t
1657 RuleBasedCollator::getMaxExpansion(int32_t order) const {
1658 UErrorCode errorCode = U_ZERO_ERROR;
1659 (void)initMaxExpansions(errorCode);
1660 return CollationElementIterator::getMaxExpansion(tailoring->maxExpansions, order);
1661 }
1662
1663 U_NAMESPACE_END
1664
1665 #endif // !UCONFIG_NO_COLLATION