]> git.saurik.com Git - apple/icu.git/blame_incremental - icuSources/i18n/rulebasedcollator.cpp
ICU-57131.0.1.tar.gz
[apple/icu.git] / icuSources / i18n / rulebasedcollator.cpp
... / ...
CommitLineData
1/*
2*******************************************************************************
3* Copyright (C) 1996-2015, International Business Machines
4* Corporation and others. All Rights Reserved.
5*******************************************************************************
6* rulebasedcollator.cpp
7*
8* (replaced the former tblcoll.cpp)
9*
10* created on: 2012feb14 with new and old collation code
11* created by: Markus W. Scherer
12*/
13
14#include "unicode/utypes.h"
15
16#if !UCONFIG_NO_COLLATION
17
18#include "unicode/coll.h"
19#include "unicode/coleitr.h"
20#include "unicode/localpointer.h"
21#include "unicode/locid.h"
22#include "unicode/sortkey.h"
23#include "unicode/tblcoll.h"
24#include "unicode/ucol.h"
25#include "unicode/uiter.h"
26#include "unicode/uloc.h"
27#include "unicode/uniset.h"
28#include "unicode/unistr.h"
29#include "unicode/usetiter.h"
30#include "unicode/utf8.h"
31#include "unicode/uversion.h"
32#include "bocsu.h"
33#include "charstr.h"
34#include "cmemory.h"
35#include "collation.h"
36#include "collationcompare.h"
37#include "collationdata.h"
38#include "collationdatareader.h"
39#include "collationfastlatin.h"
40#include "collationiterator.h"
41#include "collationkeys.h"
42#include "collationroot.h"
43#include "collationsets.h"
44#include "collationsettings.h"
45#include "collationtailoring.h"
46#include "cstring.h"
47#include "uassert.h"
48#include "ucol_imp.h"
49#include "uhash.h"
50#include "uitercollationiterator.h"
51#include "ustr_imp.h"
52#include "utf16collationiterator.h"
53#include "utf8collationiterator.h"
54#include "uvectr64.h"
55
56U_NAMESPACE_BEGIN
57
58namespace {
59
60class FixedSortKeyByteSink : public SortKeyByteSink {
61public:
62 FixedSortKeyByteSink(char *dest, int32_t destCapacity)
63 : SortKeyByteSink(dest, destCapacity) {}
64 virtual ~FixedSortKeyByteSink();
65
66private:
67 virtual void AppendBeyondCapacity(const char *bytes, int32_t n, int32_t length);
68 virtual UBool Resize(int32_t appendCapacity, int32_t length);
69};
70
71FixedSortKeyByteSink::~FixedSortKeyByteSink() {}
72
73void
74FixedSortKeyByteSink::AppendBeyondCapacity(const char *bytes, int32_t /*n*/, int32_t length) {
75 // buffer_ != NULL && bytes != NULL && n > 0 && appended_ > capacity_
76 // Fill the buffer completely.
77 int32_t available = capacity_ - length;
78 if (available > 0) {
79 uprv_memcpy(buffer_ + length, bytes, available);
80 }
81}
82
83UBool
84FixedSortKeyByteSink::Resize(int32_t /*appendCapacity*/, int32_t /*length*/) {
85 return FALSE;
86}
87
88} // namespace
89
90// Not in an anonymous namespace, so that it can be a friend of CollationKey.
91class CollationKeyByteSink : public SortKeyByteSink {
92public:
93 CollationKeyByteSink(CollationKey &key)
94 : SortKeyByteSink(reinterpret_cast<char *>(key.getBytes()), key.getCapacity()),
95 key_(key) {}
96 virtual ~CollationKeyByteSink();
97
98private:
99 virtual void AppendBeyondCapacity(const char *bytes, int32_t n, int32_t length);
100 virtual UBool Resize(int32_t appendCapacity, int32_t length);
101
102 CollationKey &key_;
103};
104
105CollationKeyByteSink::~CollationKeyByteSink() {}
106
107void
108CollationKeyByteSink::AppendBeyondCapacity(const char *bytes, int32_t n, int32_t length) {
109 // buffer_ != NULL && bytes != NULL && n > 0 && appended_ > capacity_
110 if (Resize(n, length)) {
111 uprv_memcpy(buffer_ + length, bytes, n);
112 }
113}
114
115UBool
116CollationKeyByteSink::Resize(int32_t appendCapacity, int32_t length) {
117 if (buffer_ == NULL) {
118 return FALSE; // allocation failed before already
119 }
120 int32_t newCapacity = 2 * capacity_;
121 int32_t altCapacity = length + 2 * appendCapacity;
122 if (newCapacity < altCapacity) {
123 newCapacity = altCapacity;
124 }
125 if (newCapacity < 200) {
126 newCapacity = 200;
127 }
128 uint8_t *newBuffer = key_.reallocate(newCapacity, length);
129 if (newBuffer == NULL) {
130 SetNotOk();
131 return FALSE;
132 }
133 buffer_ = reinterpret_cast<char *>(newBuffer);
134 capacity_ = newCapacity;
135 return TRUE;
136}
137
138RuleBasedCollator::RuleBasedCollator(const RuleBasedCollator &other)
139 : Collator(other),
140 data(other.data),
141 settings(other.settings),
142 tailoring(other.tailoring),
143 cacheEntry(other.cacheEntry),
144 validLocale(other.validLocale),
145 explicitlySetAttributes(other.explicitlySetAttributes),
146 actualLocaleIsSameAsValid(other.actualLocaleIsSameAsValid) {
147 settings->addRef();
148 cacheEntry->addRef();
149}
150
151RuleBasedCollator::RuleBasedCollator(const uint8_t *bin, int32_t length,
152 const RuleBasedCollator *base, UErrorCode &errorCode)
153 : data(NULL),
154 settings(NULL),
155 tailoring(NULL),
156 cacheEntry(NULL),
157 validLocale(""),
158 explicitlySetAttributes(0),
159 actualLocaleIsSameAsValid(FALSE) {
160 if(U_FAILURE(errorCode)) { return; }
161 if(bin == NULL || length == 0 || base == NULL) {
162 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
163 return;
164 }
165 const CollationTailoring *root = CollationRoot::getRoot(errorCode);
166 if(U_FAILURE(errorCode)) { return; }
167 if(base->tailoring != root) {
168 errorCode = U_UNSUPPORTED_ERROR;
169 return;
170 }
171 LocalPointer<CollationTailoring> t(new CollationTailoring(base->tailoring->settings));
172 if(t.isNull() || t->isBogus()) {
173 errorCode = U_MEMORY_ALLOCATION_ERROR;
174 return;
175 }
176 CollationDataReader::read(base->tailoring, bin, length, *t, errorCode);
177 if(U_FAILURE(errorCode)) { return; }
178 t->actualLocale.setToBogus();
179 adoptTailoring(t.orphan(), errorCode);
180}
181
182RuleBasedCollator::RuleBasedCollator(const CollationCacheEntry *entry)
183 : data(entry->tailoring->data),
184 settings(entry->tailoring->settings),
185 tailoring(entry->tailoring),
186 cacheEntry(entry),
187 validLocale(entry->validLocale),
188 explicitlySetAttributes(0),
189 actualLocaleIsSameAsValid(FALSE) {
190 settings->addRef();
191 cacheEntry->addRef();
192}
193
194RuleBasedCollator::~RuleBasedCollator() {
195 SharedObject::clearPtr(settings);
196 SharedObject::clearPtr(cacheEntry);
197}
198
199void
200RuleBasedCollator::adoptTailoring(CollationTailoring *t, UErrorCode &errorCode) {
201 if(U_FAILURE(errorCode)) {
202 t->deleteIfZeroRefCount();
203 return;
204 }
205 U_ASSERT(settings == NULL && data == NULL && tailoring == NULL && cacheEntry == NULL);
206 cacheEntry = new CollationCacheEntry(t->actualLocale, t);
207 if(cacheEntry == NULL) {
208 errorCode = U_MEMORY_ALLOCATION_ERROR;
209 t->deleteIfZeroRefCount();
210 return;
211 }
212 data = t->data;
213 settings = t->settings;
214 settings->addRef();
215 tailoring = t;
216 cacheEntry->addRef();
217 validLocale = t->actualLocale;
218 actualLocaleIsSameAsValid = FALSE;
219}
220
221Collator *
222RuleBasedCollator::clone() const {
223 return new RuleBasedCollator(*this);
224}
225
226RuleBasedCollator &RuleBasedCollator::operator=(const RuleBasedCollator &other) {
227 if(this == &other) { return *this; }
228 SharedObject::copyPtr(other.settings, settings);
229 tailoring = other.tailoring;
230 SharedObject::copyPtr(other.cacheEntry, cacheEntry);
231 data = tailoring->data;
232 validLocale = other.validLocale;
233 explicitlySetAttributes = other.explicitlySetAttributes;
234 actualLocaleIsSameAsValid = other.actualLocaleIsSameAsValid;
235 return *this;
236}
237
// Invokes ICU's RTTI implementation macro for RuleBasedCollator.
// NOTE(review): presumably this defines the class-ID functions; confirm in the macro's header.
238UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedCollator)
239
240UBool
241RuleBasedCollator::operator==(const Collator& other) const {
242 if(this == &other) { return TRUE; }
243 if(!Collator::operator==(other)) { return FALSE; }
244 const RuleBasedCollator &o = static_cast<const RuleBasedCollator &>(other);
245 if(*settings != *o.settings) { return FALSE; }
246 if(data == o.data) { return TRUE; }
247 UBool thisIsRoot = data->base == NULL;
248 UBool otherIsRoot = o.data->base == NULL;
249 U_ASSERT(!thisIsRoot || !otherIsRoot); // otherwise their data pointers should be ==
250 if(thisIsRoot != otherIsRoot) { return FALSE; }
251 if((thisIsRoot || !tailoring->rules.isEmpty()) &&
252 (otherIsRoot || !o.tailoring->rules.isEmpty())) {
253 // Shortcut: If both collators have valid rule strings, then compare those.
254 if(tailoring->rules == o.tailoring->rules) { return TRUE; }
255 }
256 // Different rule strings can result in the same or equivalent tailoring.
257 // The rule strings are optional in ICU resource bundles, although included by default.
258 // cloneBinary() drops the rule string.
259 UErrorCode errorCode = U_ZERO_ERROR;
260 LocalPointer<UnicodeSet> thisTailored(getTailoredSet(errorCode));
261 LocalPointer<UnicodeSet> otherTailored(o.getTailoredSet(errorCode));
262 if(U_FAILURE(errorCode)) { return FALSE; }
263 if(*thisTailored != *otherTailored) { return FALSE; }
264 // For completeness, we should compare all of the mappings;
265 // or we should create a list of strings, sort it with one collator,
266 // and check if both collators compare adjacent strings the same
267 // (order & strength, down to quaternary); or similar.
268 // Testing equality of collators seems unusual.
269 return TRUE;
270}
271
272int32_t
273RuleBasedCollator::hashCode() const {
274 int32_t h = settings->hashCode();
275 if(data->base == NULL) { return h; } // root collator
276 // Do not rely on the rule string, see comments in operator==().
277 UErrorCode errorCode = U_ZERO_ERROR;
278 LocalPointer<UnicodeSet> set(getTailoredSet(errorCode));
279 if(U_FAILURE(errorCode)) { return 0; }
280 UnicodeSetIterator iter(*set);
281 while(iter.next() && !iter.isString()) {
282 h ^= data->getCE32(iter.getCodepoint());
283 }
284 return h;
285}
286
287void
288RuleBasedCollator::setLocales(const Locale &requested, const Locale &valid,
289 const Locale &actual) {
290 if(actual == tailoring->actualLocale) {
291 actualLocaleIsSameAsValid = FALSE;
292 } else {
293 U_ASSERT(actual == valid);
294 actualLocaleIsSameAsValid = TRUE;
295 }
296 // Do not modify tailoring.actualLocale:
297 // We cannot be sure that that would be thread-safe.
298 validLocale = valid;
299 (void)requested; // Ignore, see also ticket #10477.
300}
301
302Locale
303RuleBasedCollator::getLocale(ULocDataLocaleType type, UErrorCode& errorCode) const {
304 if(U_FAILURE(errorCode)) {
305 return Locale::getRoot();
306 }
307 switch(type) {
308 case ULOC_ACTUAL_LOCALE:
309 return actualLocaleIsSameAsValid ? validLocale : tailoring->actualLocale;
310 case ULOC_VALID_LOCALE:
311 case ULOC_REQUESTED_LOCALE: // Apple: keep treating as ULOC_VALID_LOCALE, apps depend on it <rdar://problem/19546211>
312 return validLocale;
313 default:
314 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
315 return Locale::getRoot();
316 }
317}
318
319const char *
320RuleBasedCollator::internalGetLocaleID(ULocDataLocaleType type, UErrorCode &errorCode) const {
321 if(U_FAILURE(errorCode)) {
322 return NULL;
323 }
324 const Locale *result;
325 switch(type) {
326 case ULOC_ACTUAL_LOCALE:
327 result = actualLocaleIsSameAsValid ? &validLocale : &tailoring->actualLocale;
328 break;
329 case ULOC_VALID_LOCALE:
330 case ULOC_REQUESTED_LOCALE: // Apple: keep treating as ULOC_VALID_LOCALE, apps depend on it <rdar://problem/19546211>
331 result = &validLocale;
332 break;
333 default:
334 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
335 return NULL;
336 }
337 if(result->isBogus()) { return NULL; }
338 const char *id = result->getName();
339 return id[0] == 0 ? "root" : id;
340}
341
342const UnicodeString&
343RuleBasedCollator::getRules() const {
344 return tailoring->rules;
345}
346
347void
348RuleBasedCollator::getRules(UColRuleOption delta, UnicodeString &buffer) const {
349 if(delta == UCOL_TAILORING_ONLY) {
350 buffer = tailoring->rules;
351 return;
352 }
353 // UCOL_FULL_RULES
354 buffer.remove();
355 CollationLoader::appendRootRules(buffer);
356 buffer.append(tailoring->rules).getTerminatedBuffer();
357}
358
359void
360RuleBasedCollator::getVersion(UVersionInfo version) const {
361 uprv_memcpy(version, tailoring->version, U_MAX_VERSION_LENGTH);
362 version[0] += (UCOL_RUNTIME_VERSION << 4) + (UCOL_RUNTIME_VERSION >> 4);
363}
364
365UnicodeSet *
366RuleBasedCollator::getTailoredSet(UErrorCode &errorCode) const {
367 if(U_FAILURE(errorCode)) { return NULL; }
368 UnicodeSet *tailored = new UnicodeSet();
369 if(tailored == NULL) {
370 errorCode = U_MEMORY_ALLOCATION_ERROR;
371 return NULL;
372 }
373 if(data->base != NULL) {
374 TailoredSet(tailored).forData(data, errorCode);
375 if(U_FAILURE(errorCode)) {
376 delete tailored;
377 return NULL;
378 }
379 }
380 return tailored;
381}
382
383void
384RuleBasedCollator::internalGetContractionsAndExpansions(
385 UnicodeSet *contractions, UnicodeSet *expansions,
386 UBool addPrefixes, UErrorCode &errorCode) const {
387 if(U_FAILURE(errorCode)) { return; }
388 if(contractions != NULL) {
389 contractions->clear();
390 }
391 if(expansions != NULL) {
392 expansions->clear();
393 }
394 ContractionsAndExpansions(contractions, expansions, NULL, addPrefixes).forData(data, errorCode);
395}
396
397void
398RuleBasedCollator::internalAddContractions(UChar32 c, UnicodeSet &set, UErrorCode &errorCode) const {
399 if(U_FAILURE(errorCode)) { return; }
400 ContractionsAndExpansions(&set, NULL, NULL, FALSE).forCodePoint(data, c, errorCode);
401}
402
403const CollationSettings &
404RuleBasedCollator::getDefaultSettings() const {
405 return *tailoring->settings;
406}
407
408UColAttributeValue
409RuleBasedCollator::getAttribute(UColAttribute attr, UErrorCode &errorCode) const {
410 if(U_FAILURE(errorCode)) { return UCOL_DEFAULT; }
411 int32_t option;
412 switch(attr) {
413 case UCOL_FRENCH_COLLATION:
414 option = CollationSettings::BACKWARD_SECONDARY;
415 break;
416 case UCOL_ALTERNATE_HANDLING:
417 return settings->getAlternateHandling();
418 case UCOL_CASE_FIRST:
419 return settings->getCaseFirst();
420 case UCOL_CASE_LEVEL:
421 option = CollationSettings::CASE_LEVEL;
422 break;
423 case UCOL_NORMALIZATION_MODE:
424 option = CollationSettings::CHECK_FCD;
425 break;
426 case UCOL_STRENGTH:
427 return (UColAttributeValue)settings->getStrength();
428 case UCOL_HIRAGANA_QUATERNARY_MODE:
429 // Deprecated attribute, unsettable.
430 return UCOL_OFF;
431 case UCOL_NUMERIC_COLLATION:
432 option = CollationSettings::NUMERIC;
433 break;
434 default:
435 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
436 return UCOL_DEFAULT;
437 }
438 return ((settings->options & option) == 0) ? UCOL_OFF : UCOL_ON;
439}
440
441void
442RuleBasedCollator::setAttribute(UColAttribute attr, UColAttributeValue value,
443 UErrorCode &errorCode) {
444 UColAttributeValue oldValue = getAttribute(attr, errorCode);
445 if(U_FAILURE(errorCode)) { return; }
446 if(value == oldValue) {
447 setAttributeExplicitly(attr);
448 return;
449 }
450 const CollationSettings &defaultSettings = getDefaultSettings();
451 if(settings == &defaultSettings) {
452 if(value == UCOL_DEFAULT) {
453 setAttributeDefault(attr);
454 return;
455 }
456 }
457 CollationSettings *ownedSettings = SharedObject::copyOnWrite(settings);
458 if(ownedSettings == NULL) {
459 errorCode = U_MEMORY_ALLOCATION_ERROR;
460 return;
461 }
462
463 switch(attr) {
464 case UCOL_FRENCH_COLLATION:
465 ownedSettings->setFlag(CollationSettings::BACKWARD_SECONDARY, value,
466 defaultSettings.options, errorCode);
467 break;
468 case UCOL_ALTERNATE_HANDLING:
469 ownedSettings->setAlternateHandling(value, defaultSettings.options, errorCode);
470 break;
471 case UCOL_CASE_FIRST:
472 ownedSettings->setCaseFirst(value, defaultSettings.options, errorCode);
473 break;
474 case UCOL_CASE_LEVEL:
475 ownedSettings->setFlag(CollationSettings::CASE_LEVEL, value,
476 defaultSettings.options, errorCode);
477 break;
478 case UCOL_NORMALIZATION_MODE:
479 ownedSettings->setFlag(CollationSettings::CHECK_FCD, value,
480 defaultSettings.options, errorCode);
481 break;
482 case UCOL_STRENGTH:
483 ownedSettings->setStrength(value, defaultSettings.options, errorCode);
484 break;
485 case UCOL_HIRAGANA_QUATERNARY_MODE:
486 // Deprecated attribute. Check for valid values but do not change anything.
487 if(value != UCOL_OFF && value != UCOL_ON && value != UCOL_DEFAULT) {
488 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
489 }
490 break;
491 case UCOL_NUMERIC_COLLATION:
492 ownedSettings->setFlag(CollationSettings::NUMERIC, value, defaultSettings.options, errorCode);
493 break;
494 default:
495 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
496 break;
497 }
498 if(U_FAILURE(errorCode)) { return; }
499 setFastLatinOptions(*ownedSettings);
500 if(value == UCOL_DEFAULT) {
501 setAttributeDefault(attr);
502 } else {
503 setAttributeExplicitly(attr);
504 }
505}
506
507Collator &
508RuleBasedCollator::setMaxVariable(UColReorderCode group, UErrorCode &errorCode) {
509 if(U_FAILURE(errorCode)) { return *this; }
510 // Convert the reorder code into a MaxVariable number, or UCOL_DEFAULT=-1.
511 int32_t value;
512 if(group == UCOL_REORDER_CODE_DEFAULT) {
513 value = UCOL_DEFAULT;
514 } else if(UCOL_REORDER_CODE_FIRST <= group && group <= UCOL_REORDER_CODE_CURRENCY) {
515 value = group - UCOL_REORDER_CODE_FIRST;
516 } else {
517 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
518 return *this;
519 }
520 CollationSettings::MaxVariable oldValue = settings->getMaxVariable();
521 if(value == oldValue) {
522 setAttributeExplicitly(ATTR_VARIABLE_TOP);
523 return *this;
524 }
525 const CollationSettings &defaultSettings = getDefaultSettings();
526 if(settings == &defaultSettings) {
527 if(value == UCOL_DEFAULT) {
528 setAttributeDefault(ATTR_VARIABLE_TOP);
529 return *this;
530 }
531 }
532 CollationSettings *ownedSettings = SharedObject::copyOnWrite(settings);
533 if(ownedSettings == NULL) {
534 errorCode = U_MEMORY_ALLOCATION_ERROR;
535 return *this;
536 }
537
538 if(group == UCOL_REORDER_CODE_DEFAULT) {
539 group = (UColReorderCode)(UCOL_REORDER_CODE_FIRST + defaultSettings.getMaxVariable());
540 }
541 uint32_t varTop = data->getLastPrimaryForGroup(group);
542 U_ASSERT(varTop != 0);
543 ownedSettings->setMaxVariable(value, defaultSettings.options, errorCode);
544 if(U_FAILURE(errorCode)) { return *this; }
545 ownedSettings->variableTop = varTop;
546 setFastLatinOptions(*ownedSettings);
547 if(value == UCOL_DEFAULT) {
548 setAttributeDefault(ATTR_VARIABLE_TOP);
549 } else {
550 setAttributeExplicitly(ATTR_VARIABLE_TOP);
551 }
552 return *this;
553}
554
555UColReorderCode
556RuleBasedCollator::getMaxVariable() const {
557 return (UColReorderCode)(UCOL_REORDER_CODE_FIRST + settings->getMaxVariable());
558}
559
560uint32_t
561RuleBasedCollator::getVariableTop(UErrorCode & /*errorCode*/) const {
562 return settings->variableTop;
563}
564
565uint32_t
566RuleBasedCollator::setVariableTop(const UChar *varTop, int32_t len, UErrorCode &errorCode) {
567 if(U_FAILURE(errorCode)) { return 0; }
568 if(varTop == NULL && len !=0) {
569 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
570 return 0;
571 }
572 if(len < 0) { len = u_strlen(varTop); }
573 if(len == 0) {
574 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
575 return 0;
576 }
577 UBool numeric = settings->isNumeric();
578 int64_t ce1, ce2;
579 if(settings->dontCheckFCD()) {
580 UTF16CollationIterator ci(data, numeric, varTop, varTop, varTop + len);
581 ce1 = ci.nextCE(errorCode);
582 ce2 = ci.nextCE(errorCode);
583 } else {
584 FCDUTF16CollationIterator ci(data, numeric, varTop, varTop, varTop + len);
585 ce1 = ci.nextCE(errorCode);
586 ce2 = ci.nextCE(errorCode);
587 }
588 if(ce1 == Collation::NO_CE || ce2 != Collation::NO_CE) {
589 errorCode = U_CE_NOT_FOUND_ERROR;
590 return 0;
591 }
592 setVariableTop((uint32_t)(ce1 >> 32), errorCode);
593 return settings->variableTop;
594}
595
596uint32_t
597RuleBasedCollator::setVariableTop(const UnicodeString &varTop, UErrorCode &errorCode) {
598 return setVariableTop(varTop.getBuffer(), varTop.length(), errorCode);
599}
600
601void
602RuleBasedCollator::setVariableTop(uint32_t varTop, UErrorCode &errorCode) {
603 if(U_FAILURE(errorCode)) { return; }
604 if(varTop != settings->variableTop) {
605 // Pin the variable top to the end of the reordering group which contains it.
606 // Only a few special groups are supported.
607 int32_t group = data->getGroupForPrimary(varTop);
608 if(group < UCOL_REORDER_CODE_FIRST || UCOL_REORDER_CODE_CURRENCY < group) {
609 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
610 return;
611 }
612 uint32_t v = data->getLastPrimaryForGroup(group);
613 U_ASSERT(v != 0 && v >= varTop);
614 varTop = v;
615 if(varTop != settings->variableTop) {
616 CollationSettings *ownedSettings = SharedObject::copyOnWrite(settings);
617 if(ownedSettings == NULL) {
618 errorCode = U_MEMORY_ALLOCATION_ERROR;
619 return;
620 }
621 ownedSettings->setMaxVariable(group - UCOL_REORDER_CODE_FIRST,
622 getDefaultSettings().options, errorCode);
623 if(U_FAILURE(errorCode)) { return; }
624 ownedSettings->variableTop = varTop;
625 setFastLatinOptions(*ownedSettings);
626 }
627 }
628 if(varTop == getDefaultSettings().variableTop) {
629 setAttributeDefault(ATTR_VARIABLE_TOP);
630 } else {
631 setAttributeExplicitly(ATTR_VARIABLE_TOP);
632 }
633}
634
635int32_t
636RuleBasedCollator::getReorderCodes(int32_t *dest, int32_t capacity,
637 UErrorCode &errorCode) const {
638 if(U_FAILURE(errorCode)) { return 0; }
639 if(capacity < 0 || (dest == NULL && capacity > 0)) {
640 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
641 return 0;
642 }
643 int32_t length = settings->reorderCodesLength;
644 if(length == 0) { return 0; }
645 if(length > capacity) {
646 errorCode = U_BUFFER_OVERFLOW_ERROR;
647 return length;
648 }
649 uprv_memcpy(dest, settings->reorderCodes, length * 4);
650 return length;
651}
652
653void
654RuleBasedCollator::setReorderCodes(const int32_t *reorderCodes, int32_t length,
655 UErrorCode &errorCode) {
656 if(U_FAILURE(errorCode)) { return; }
657 if(length < 0 || (reorderCodes == NULL && length > 0)) {
658 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
659 return;
660 }
661 if(length == 1 && reorderCodes[0] == UCOL_REORDER_CODE_NONE) {
662 length = 0;
663 }
664 if(length == settings->reorderCodesLength &&
665 uprv_memcmp(reorderCodes, settings->reorderCodes, length * 4) == 0) {
666 return;
667 }
668 const CollationSettings &defaultSettings = getDefaultSettings();
669 if(length == 1 && reorderCodes[0] == UCOL_REORDER_CODE_DEFAULT) {
670 if(settings != &defaultSettings) {
671 CollationSettings *ownedSettings = SharedObject::copyOnWrite(settings);
672 if(ownedSettings == NULL) {
673 errorCode = U_MEMORY_ALLOCATION_ERROR;
674 return;
675 }
676 ownedSettings->copyReorderingFrom(defaultSettings, errorCode);
677 setFastLatinOptions(*ownedSettings);
678 }
679 return;
680 }
681 CollationSettings *ownedSettings = SharedObject::copyOnWrite(settings);
682 if(ownedSettings == NULL) {
683 errorCode = U_MEMORY_ALLOCATION_ERROR;
684 return;
685 }
686 ownedSettings->setReordering(*data, reorderCodes, length, errorCode);
687 setFastLatinOptions(*ownedSettings);
688}
689
690void
691RuleBasedCollator::setFastLatinOptions(CollationSettings &ownedSettings) const {
692 ownedSettings.fastLatinOptions = CollationFastLatin::getOptions(
693 data, ownedSettings,
694 ownedSettings.fastLatinPrimaries, UPRV_LENGTHOF(ownedSettings.fastLatinPrimaries));
695}
696
697UCollationResult
698RuleBasedCollator::compare(const UnicodeString &left, const UnicodeString &right,
699 UErrorCode &errorCode) const {
700 if(U_FAILURE(errorCode)) { return UCOL_EQUAL; }
701 return doCompare(left.getBuffer(), left.length(),
702 right.getBuffer(), right.length(), errorCode);
703}
704
705UCollationResult
706RuleBasedCollator::compare(const UnicodeString &left, const UnicodeString &right,
707 int32_t length, UErrorCode &errorCode) const {
708 if(U_FAILURE(errorCode) || length == 0) { return UCOL_EQUAL; }
709 if(length < 0) {
710 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
711 return UCOL_EQUAL;
712 }
713 int32_t leftLength = left.length();
714 int32_t rightLength = right.length();
715 if(leftLength > length) { leftLength = length; }
716 if(rightLength > length) { rightLength = length; }
717 return doCompare(left.getBuffer(), leftLength,
718 right.getBuffer(), rightLength, errorCode);
719}
720
721UCollationResult
722RuleBasedCollator::compare(const UChar *left, int32_t leftLength,
723 const UChar *right, int32_t rightLength,
724 UErrorCode &errorCode) const {
725 if(U_FAILURE(errorCode)) { return UCOL_EQUAL; }
726 if((left == NULL && leftLength != 0) || (right == NULL && rightLength != 0)) {
727 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
728 return UCOL_EQUAL;
729 }
730 // Make sure both or neither strings have a known length.
731 // We do not optimize for mixed length/termination.
732 if(leftLength >= 0) {
733 if(rightLength < 0) { rightLength = u_strlen(right); }
734 } else {
735 if(rightLength >= 0) { leftLength = u_strlen(left); }
736 }
737 return doCompare(left, leftLength, right, rightLength, errorCode);
738}
739
740UCollationResult
741RuleBasedCollator::compareUTF8(const StringPiece &left, const StringPiece &right,
742 UErrorCode &errorCode) const {
743 if(U_FAILURE(errorCode)) { return UCOL_EQUAL; }
744 const uint8_t *leftBytes = reinterpret_cast<const uint8_t *>(left.data());
745 const uint8_t *rightBytes = reinterpret_cast<const uint8_t *>(right.data());
746 if((leftBytes == NULL && !left.empty()) || (rightBytes == NULL && !right.empty())) {
747 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
748 return UCOL_EQUAL;
749 }
750 return doCompare(leftBytes, left.length(), rightBytes, right.length(), errorCode);
751}
752
753UCollationResult
754RuleBasedCollator::internalCompareUTF8(const char *left, int32_t leftLength,
755 const char *right, int32_t rightLength,
756 UErrorCode &errorCode) const {
757 if(U_FAILURE(errorCode)) { return UCOL_EQUAL; }
758 if((left == NULL && leftLength != 0) || (right == NULL && rightLength != 0)) {
759 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
760 return UCOL_EQUAL;
761 }
762 // Make sure both or neither strings have a known length.
763 // We do not optimize for mixed length/termination.
764 if(leftLength >= 0) {
765 if(rightLength < 0) { rightLength = uprv_strlen(right); }
766 } else {
767 if(rightLength >= 0) { leftLength = uprv_strlen(left); }
768 }
769 return doCompare(reinterpret_cast<const uint8_t *>(left), leftLength,
770 reinterpret_cast<const uint8_t *>(right), rightLength, errorCode);
771}
772
773namespace {
774
775/**
776 * Abstract iterator for identical-level string comparisons.
777 * Returns FCD code points and handles temporary switching to NFD.
778 */
779class NFDIterator : public UObject {
780public:
781 NFDIterator() : index(-1), length(0) {}
782 virtual ~NFDIterator() {}
783 /**
784 * Returns the next code point from the internal normalization buffer,
785 * or else the next text code point.
786 * Returns -1 at the end of the text.
787 */
788 UChar32 nextCodePoint() {
789 if(index >= 0) {
790 if(index == length) {
791 index = -1;
792 } else {
793 UChar32 c;
794 U16_NEXT_UNSAFE(decomp, index, c);
795 return c;
796 }
797 }
798 return nextRawCodePoint();
799 }
800 /**
801 * @param nfcImpl
802 * @param c the last code point returned by nextCodePoint() or nextDecomposedCodePoint()
803 * @return the first code point in c's decomposition,
804 * or c itself if it was decomposed already or if it does not decompose
805 */
806 UChar32 nextDecomposedCodePoint(const Normalizer2Impl &nfcImpl, UChar32 c) {
807 if(index >= 0) { return c; }
808 decomp = nfcImpl.getDecomposition(c, buffer, length);
809 if(decomp == NULL) { return c; }
810 index = 0;
811 U16_NEXT_UNSAFE(decomp, index, c);
812 return c;
813 }
814protected:
815 /**
816 * Returns the next text code point in FCD order.
817 * Returns -1 at the end of the text.
818 */
819 virtual UChar32 nextRawCodePoint() = 0;
820private:
821 const UChar *decomp;
822 UChar buffer[4];
823 int32_t index;
824 int32_t length;
825};
826
827class UTF16NFDIterator : public NFDIterator {
828public:
829 UTF16NFDIterator(const UChar *text, const UChar *textLimit) : s(text), limit(textLimit) {}
830protected:
831 virtual UChar32 nextRawCodePoint() {
832 if(s == limit) { return U_SENTINEL; }
833 UChar32 c = *s++;
834 if(limit == NULL && c == 0) {
835 s = NULL;
836 return U_SENTINEL;
837 }
838 UChar trail;
839 if(U16_IS_LEAD(c) && s != limit && U16_IS_TRAIL(trail = *s)) {
840 ++s;
841 c = U16_GET_SUPPLEMENTARY(c, trail);
842 }
843 return c;
844 }
845
846 const UChar *s;
847 const UChar *limit;
848};
849
850class FCDUTF16NFDIterator : public UTF16NFDIterator {
851public:
852 FCDUTF16NFDIterator(const Normalizer2Impl &nfcImpl, const UChar *text, const UChar *textLimit)
853 : UTF16NFDIterator(NULL, NULL) {
854 UErrorCode errorCode = U_ZERO_ERROR;
855 const UChar *spanLimit = nfcImpl.makeFCD(text, textLimit, NULL, errorCode);
856 if(U_FAILURE(errorCode)) { return; }
857 if(spanLimit == textLimit || (textLimit == NULL && *spanLimit == 0)) {
858 s = text;
859 limit = spanLimit;
860 } else {
861 str.setTo(text, (int32_t)(spanLimit - text));
862 {
863 ReorderingBuffer buffer(nfcImpl, str);
864 if(buffer.init(str.length(), errorCode)) {
865 nfcImpl.makeFCD(spanLimit, textLimit, &buffer, errorCode);
866 }
867 }
868 if(U_SUCCESS(errorCode)) {
869 s = str.getBuffer();
870 limit = s + str.length();
871 }
872 }
873 }
874private:
875 UnicodeString str;
876};
877
878class UTF8NFDIterator : public NFDIterator {
879public:
880 UTF8NFDIterator(const uint8_t *text, int32_t textLength)
881 : s(text), pos(0), length(textLength) {}
882protected:
883 virtual UChar32 nextRawCodePoint() {
884 if(pos == length || (s[pos] == 0 && length < 0)) { return U_SENTINEL; }
885 UChar32 c;
886 U8_NEXT_OR_FFFD(s, pos, length, c);
887 return c;
888 }
889
890 const uint8_t *s;
891 int32_t pos;
892 int32_t length;
893};
894
895class FCDUTF8NFDIterator : public NFDIterator {
896public:
897 FCDUTF8NFDIterator(const CollationData *data, const uint8_t *text, int32_t textLength)
898 : u8ci(data, FALSE, text, 0, textLength) {}
899protected:
900 virtual UChar32 nextRawCodePoint() {
901 UErrorCode errorCode = U_ZERO_ERROR;
902 return u8ci.nextCodePoint(errorCode);
903 }
904private:
905 FCDUTF8CollationIterator u8ci;
906};
907
908class UIterNFDIterator : public NFDIterator {
909public:
910 UIterNFDIterator(UCharIterator &it) : iter(it) {}
911protected:
912 virtual UChar32 nextRawCodePoint() {
913 return uiter_next32(&iter);
914 }
915private:
916 UCharIterator &iter;
917};
918
919class FCDUIterNFDIterator : public NFDIterator {
920public:
921 FCDUIterNFDIterator(const CollationData *data, UCharIterator &it, int32_t startIndex)
922 : uici(data, FALSE, it, startIndex) {}
923protected:
924 virtual UChar32 nextRawCodePoint() {
925 UErrorCode errorCode = U_ZERO_ERROR;
926 return uici.nextCodePoint(errorCode);
927 }
928private:
929 FCDUIterCollationIterator uici;
930};
931
932UCollationResult compareNFDIter(const Normalizer2Impl &nfcImpl,
933 NFDIterator &left, NFDIterator &right) {
934 for(;;) {
935 // Fetch the next FCD code point from each string.
936 UChar32 leftCp = left.nextCodePoint();
937 UChar32 rightCp = right.nextCodePoint();
938 if(leftCp == rightCp) {
939 if(leftCp < 0) { break; }
940 continue;
941 }
942 // If they are different, then decompose each and compare again.
943 if(leftCp < 0) {
944 leftCp = -2; // end of string
945 } else if(leftCp == 0xfffe) {
946 leftCp = -1; // U+FFFE: merge separator
947 } else {
948 leftCp = left.nextDecomposedCodePoint(nfcImpl, leftCp);
949 }
950 if(rightCp < 0) {
951 rightCp = -2; // end of string
952 } else if(rightCp == 0xfffe) {
953 rightCp = -1; // U+FFFE: merge separator
954 } else {
955 rightCp = right.nextDecomposedCodePoint(nfcImpl, rightCp);
956 }
957 if(leftCp < rightCp) { return UCOL_LESS; }
958 if(leftCp > rightCp) { return UCOL_GREATER; }
959 }
960 return UCOL_EQUAL;
961}
962
963} // namespace
964
965UCollationResult
966RuleBasedCollator::doCompare(const UChar *left, int32_t leftLength,
967 const UChar *right, int32_t rightLength,
968 UErrorCode &errorCode) const {
969 // U_FAILURE(errorCode) checked by caller.
970 if(left == right && leftLength == rightLength) {
971 return UCOL_EQUAL;
972 }
973
974 // Identical-prefix test.
975 const UChar *leftLimit;
976 const UChar *rightLimit;
977 int32_t equalPrefixLength = 0;
978 if(leftLength < 0) {
979 leftLimit = NULL;
980 rightLimit = NULL;
981 UChar c;
982 while((c = left[equalPrefixLength]) == right[equalPrefixLength]) {
983 if(c == 0) { return UCOL_EQUAL; }
984 ++equalPrefixLength;
985 }
986 } else {
987 leftLimit = left + leftLength;
988 rightLimit = right + rightLength;
989 for(;;) {
990 if(equalPrefixLength == leftLength) {
991 if(equalPrefixLength == rightLength) { return UCOL_EQUAL; }
992 break;
993 } else if(equalPrefixLength == rightLength ||
994 left[equalPrefixLength] != right[equalPrefixLength]) {
995 break;
996 }
997 ++equalPrefixLength;
998 }
999 }
1000
1001 UBool numeric = settings->isNumeric();
1002 if(equalPrefixLength > 0) {
1003 if((equalPrefixLength != leftLength &&
1004 data->isUnsafeBackward(left[equalPrefixLength], numeric)) ||
1005 (equalPrefixLength != rightLength &&
1006 data->isUnsafeBackward(right[equalPrefixLength], numeric))) {
1007 // Identical prefix: Back up to the start of a contraction or reordering sequence.
1008 while(--equalPrefixLength > 0 &&
1009 data->isUnsafeBackward(left[equalPrefixLength], numeric)) {}
1010 }
1011 // Notes:
1012 // - A longer string can compare equal to a prefix of it if only ignorables follow.
1013 // - With a backward level, a longer string can compare less-than a prefix of it.
1014
1015 // Pass the actual start of each string into the CollationIterators,
1016 // plus the equalPrefixLength position,
1017 // so that prefix matches back into the equal prefix work.
1018 }
1019
1020 int32_t result;
1021 int32_t fastLatinOptions = settings->fastLatinOptions;
1022 if(fastLatinOptions >= 0 &&
1023 (equalPrefixLength == leftLength ||
1024 left[equalPrefixLength] <= CollationFastLatin::LATIN_MAX) &&
1025 (equalPrefixLength == rightLength ||
1026 right[equalPrefixLength] <= CollationFastLatin::LATIN_MAX)) {
1027 if(leftLength >= 0) {
1028 result = CollationFastLatin::compareUTF16(data->fastLatinTable,
1029 settings->fastLatinPrimaries,
1030 fastLatinOptions,
1031 left + equalPrefixLength,
1032 leftLength - equalPrefixLength,
1033 right + equalPrefixLength,
1034 rightLength - equalPrefixLength);
1035 } else {
1036 result = CollationFastLatin::compareUTF16(data->fastLatinTable,
1037 settings->fastLatinPrimaries,
1038 fastLatinOptions,
1039 left + equalPrefixLength, -1,
1040 right + equalPrefixLength, -1);
1041 }
1042 } else {
1043 result = CollationFastLatin::BAIL_OUT_RESULT;
1044 }
1045
1046 if(result == CollationFastLatin::BAIL_OUT_RESULT) {
1047 if(settings->dontCheckFCD()) {
1048 UTF16CollationIterator leftIter(data, numeric,
1049 left, left + equalPrefixLength, leftLimit);
1050 UTF16CollationIterator rightIter(data, numeric,
1051 right, right + equalPrefixLength, rightLimit);
1052 result = CollationCompare::compareUpToQuaternary(leftIter, rightIter, *settings, errorCode);
1053 } else {
1054 FCDUTF16CollationIterator leftIter(data, numeric,
1055 left, left + equalPrefixLength, leftLimit);
1056 FCDUTF16CollationIterator rightIter(data, numeric,
1057 right, right + equalPrefixLength, rightLimit);
1058 result = CollationCompare::compareUpToQuaternary(leftIter, rightIter, *settings, errorCode);
1059 }
1060 }
1061 if(result != UCOL_EQUAL || settings->getStrength() < UCOL_IDENTICAL || U_FAILURE(errorCode)) {
1062 return (UCollationResult)result;
1063 }
1064
1065 // Note: If NUL-terminated, we could get the actual limits from the iterators now.
1066 // That would complicate the iterators a bit, NUL-terminated strings are only a C convenience,
1067 // and the benefit seems unlikely to be measurable.
1068
1069 // Compare identical level.
1070 const Normalizer2Impl &nfcImpl = data->nfcImpl;
1071 left += equalPrefixLength;
1072 right += equalPrefixLength;
1073 if(settings->dontCheckFCD()) {
1074 UTF16NFDIterator leftIter(left, leftLimit);
1075 UTF16NFDIterator rightIter(right, rightLimit);
1076 return compareNFDIter(nfcImpl, leftIter, rightIter);
1077 } else {
1078 FCDUTF16NFDIterator leftIter(nfcImpl, left, leftLimit);
1079 FCDUTF16NFDIterator rightIter(nfcImpl, right, rightLimit);
1080 return compareNFDIter(nfcImpl, leftIter, rightIter);
1081 }
1082}
1083
1084UCollationResult
1085RuleBasedCollator::doCompare(const uint8_t *left, int32_t leftLength,
1086 const uint8_t *right, int32_t rightLength,
1087 UErrorCode &errorCode) const {
1088 // U_FAILURE(errorCode) checked by caller.
1089 if(left == right && leftLength == rightLength) {
1090 return UCOL_EQUAL;
1091 }
1092
1093 // Identical-prefix test.
1094 int32_t equalPrefixLength = 0;
1095 if(leftLength < 0) {
1096 uint8_t c;
1097 while((c = left[equalPrefixLength]) == right[equalPrefixLength]) {
1098 if(c == 0) { return UCOL_EQUAL; }
1099 ++equalPrefixLength;
1100 }
1101 } else {
1102 for(;;) {
1103 if(equalPrefixLength == leftLength) {
1104 if(equalPrefixLength == rightLength) { return UCOL_EQUAL; }
1105 break;
1106 } else if(equalPrefixLength == rightLength ||
1107 left[equalPrefixLength] != right[equalPrefixLength]) {
1108 break;
1109 }
1110 ++equalPrefixLength;
1111 }
1112 }
1113 // Back up to the start of a partially-equal code point.
1114 if(equalPrefixLength > 0 &&
1115 ((equalPrefixLength != leftLength && U8_IS_TRAIL(left[equalPrefixLength])) ||
1116 (equalPrefixLength != rightLength && U8_IS_TRAIL(right[equalPrefixLength])))) {
1117 while(--equalPrefixLength > 0 && U8_IS_TRAIL(left[equalPrefixLength])) {}
1118 }
1119
1120 UBool numeric = settings->isNumeric();
1121 if(equalPrefixLength > 0) {
1122 UBool unsafe = FALSE;
1123 if(equalPrefixLength != leftLength) {
1124 int32_t i = equalPrefixLength;
1125 UChar32 c;
1126 U8_NEXT_OR_FFFD(left, i, leftLength, c);
1127 unsafe = data->isUnsafeBackward(c, numeric);
1128 }
1129 if(!unsafe && equalPrefixLength != rightLength) {
1130 int32_t i = equalPrefixLength;
1131 UChar32 c;
1132 U8_NEXT_OR_FFFD(right, i, rightLength, c);
1133 unsafe = data->isUnsafeBackward(c, numeric);
1134 }
1135 if(unsafe) {
1136 // Identical prefix: Back up to the start of a contraction or reordering sequence.
1137 UChar32 c;
1138 do {
1139 U8_PREV_OR_FFFD(left, 0, equalPrefixLength, c);
1140 } while(equalPrefixLength > 0 && data->isUnsafeBackward(c, numeric));
1141 }
1142 // See the notes in the UTF-16 version.
1143
1144 // Pass the actual start of each string into the CollationIterators,
1145 // plus the equalPrefixLength position,
1146 // so that prefix matches back into the equal prefix work.
1147 }
1148
1149 int32_t result;
1150 int32_t fastLatinOptions = settings->fastLatinOptions;
1151 if(fastLatinOptions >= 0 &&
1152 (equalPrefixLength == leftLength ||
1153 left[equalPrefixLength] <= CollationFastLatin::LATIN_MAX_UTF8_LEAD) &&
1154 (equalPrefixLength == rightLength ||
1155 right[equalPrefixLength] <= CollationFastLatin::LATIN_MAX_UTF8_LEAD)) {
1156 if(leftLength >= 0) {
1157 result = CollationFastLatin::compareUTF8(data->fastLatinTable,
1158 settings->fastLatinPrimaries,
1159 fastLatinOptions,
1160 left + equalPrefixLength,
1161 leftLength - equalPrefixLength,
1162 right + equalPrefixLength,
1163 rightLength - equalPrefixLength);
1164 } else {
1165 result = CollationFastLatin::compareUTF8(data->fastLatinTable,
1166 settings->fastLatinPrimaries,
1167 fastLatinOptions,
1168 left + equalPrefixLength, -1,
1169 right + equalPrefixLength, -1);
1170 }
1171 } else {
1172 result = CollationFastLatin::BAIL_OUT_RESULT;
1173 }
1174
1175 if(result == CollationFastLatin::BAIL_OUT_RESULT) {
1176 if(settings->dontCheckFCD()) {
1177 UTF8CollationIterator leftIter(data, numeric, left, equalPrefixLength, leftLength);
1178 UTF8CollationIterator rightIter(data, numeric, right, equalPrefixLength, rightLength);
1179 result = CollationCompare::compareUpToQuaternary(leftIter, rightIter, *settings, errorCode);
1180 } else {
1181 FCDUTF8CollationIterator leftIter(data, numeric, left, equalPrefixLength, leftLength);
1182 FCDUTF8CollationIterator rightIter(data, numeric, right, equalPrefixLength, rightLength);
1183 result = CollationCompare::compareUpToQuaternary(leftIter, rightIter, *settings, errorCode);
1184 }
1185 }
1186 if(result != UCOL_EQUAL || settings->getStrength() < UCOL_IDENTICAL || U_FAILURE(errorCode)) {
1187 return (UCollationResult)result;
1188 }
1189
1190 // Note: If NUL-terminated, we could get the actual limits from the iterators now.
1191 // That would complicate the iterators a bit, NUL-terminated strings are only a C convenience,
1192 // and the benefit seems unlikely to be measurable.
1193
1194 // Compare identical level.
1195 const Normalizer2Impl &nfcImpl = data->nfcImpl;
1196 left += equalPrefixLength;
1197 right += equalPrefixLength;
1198 if(leftLength > 0) {
1199 leftLength -= equalPrefixLength;
1200 rightLength -= equalPrefixLength;
1201 }
1202 if(settings->dontCheckFCD()) {
1203 UTF8NFDIterator leftIter(left, leftLength);
1204 UTF8NFDIterator rightIter(right, rightLength);
1205 return compareNFDIter(nfcImpl, leftIter, rightIter);
1206 } else {
1207 FCDUTF8NFDIterator leftIter(data, left, leftLength);
1208 FCDUTF8NFDIterator rightIter(data, right, rightLength);
1209 return compareNFDIter(nfcImpl, leftIter, rightIter);
1210 }
1211}
1212
1213UCollationResult
1214RuleBasedCollator::compare(UCharIterator &left, UCharIterator &right,
1215 UErrorCode &errorCode) const {
1216 if(U_FAILURE(errorCode) || &left == &right) { return UCOL_EQUAL; }
1217 UBool numeric = settings->isNumeric();
1218
1219 // Identical-prefix test.
1220 int32_t equalPrefixLength = 0;
1221 {
1222 UChar32 leftUnit;
1223 UChar32 rightUnit;
1224 while((leftUnit = left.next(&left)) == (rightUnit = right.next(&right))) {
1225 if(leftUnit < 0) { return UCOL_EQUAL; }
1226 ++equalPrefixLength;
1227 }
1228
1229 // Back out the code units that differed, for the real collation comparison.
1230 if(leftUnit >= 0) { left.previous(&left); }
1231 if(rightUnit >= 0) { right.previous(&right); }
1232
1233 if(equalPrefixLength > 0) {
1234 if((leftUnit >= 0 && data->isUnsafeBackward(leftUnit, numeric)) ||
1235 (rightUnit >= 0 && data->isUnsafeBackward(rightUnit, numeric))) {
1236 // Identical prefix: Back up to the start of a contraction or reordering sequence.
1237 do {
1238 --equalPrefixLength;
1239 leftUnit = left.previous(&left);
1240 right.previous(&right);
1241 } while(equalPrefixLength > 0 && data->isUnsafeBackward(leftUnit, numeric));
1242 }
1243 // See the notes in the UTF-16 version.
1244 }
1245 }
1246
1247 UCollationResult result;
1248 if(settings->dontCheckFCD()) {
1249 UIterCollationIterator leftIter(data, numeric, left);
1250 UIterCollationIterator rightIter(data, numeric, right);
1251 result = CollationCompare::compareUpToQuaternary(leftIter, rightIter, *settings, errorCode);
1252 } else {
1253 FCDUIterCollationIterator leftIter(data, numeric, left, equalPrefixLength);
1254 FCDUIterCollationIterator rightIter(data, numeric, right, equalPrefixLength);
1255 result = CollationCompare::compareUpToQuaternary(leftIter, rightIter, *settings, errorCode);
1256 }
1257 if(result != UCOL_EQUAL || settings->getStrength() < UCOL_IDENTICAL || U_FAILURE(errorCode)) {
1258 return result;
1259 }
1260
1261 // Compare identical level.
1262 left.move(&left, equalPrefixLength, UITER_ZERO);
1263 right.move(&right, equalPrefixLength, UITER_ZERO);
1264 const Normalizer2Impl &nfcImpl = data->nfcImpl;
1265 if(settings->dontCheckFCD()) {
1266 UIterNFDIterator leftIter(left);
1267 UIterNFDIterator rightIter(right);
1268 return compareNFDIter(nfcImpl, leftIter, rightIter);
1269 } else {
1270 FCDUIterNFDIterator leftIter(data, left, equalPrefixLength);
1271 FCDUIterNFDIterator rightIter(data, right, equalPrefixLength);
1272 return compareNFDIter(nfcImpl, leftIter, rightIter);
1273 }
1274}
1275
1276CollationKey &
1277RuleBasedCollator::getCollationKey(const UnicodeString &s, CollationKey &key,
1278 UErrorCode &errorCode) const {
1279 return getCollationKey(s.getBuffer(), s.length(), key, errorCode);
1280}
1281
1282CollationKey &
1283RuleBasedCollator::getCollationKey(const UChar *s, int32_t length, CollationKey& key,
1284 UErrorCode &errorCode) const {
1285 if(U_FAILURE(errorCode)) {
1286 return key.setToBogus();
1287 }
1288 if(s == NULL && length != 0) {
1289 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
1290 return key.setToBogus();
1291 }
1292 key.reset(); // resets the "bogus" state
1293 CollationKeyByteSink sink(key);
1294 writeSortKey(s, length, sink, errorCode);
1295 if(U_FAILURE(errorCode)) {
1296 key.setToBogus();
1297 } else if(key.isBogus()) {
1298 errorCode = U_MEMORY_ALLOCATION_ERROR;
1299 } else {
1300 key.setLength(sink.NumberOfBytesAppended());
1301 }
1302 return key;
1303}
1304
1305int32_t
1306RuleBasedCollator::getSortKey(const UnicodeString &s,
1307 uint8_t *dest, int32_t capacity) const {
1308 return getSortKey(s.getBuffer(), s.length(), dest, capacity);
1309}
1310
1311int32_t
1312RuleBasedCollator::getSortKey(const UChar *s, int32_t length,
1313 uint8_t *dest, int32_t capacity) const {
1314 if((s == NULL && length != 0) || capacity < 0 || (dest == NULL && capacity > 0)) {
1315 return 0;
1316 }
1317 uint8_t noDest[1] = { 0 };
1318 if(dest == NULL) {
1319 // Distinguish pure preflighting from an allocation error.
1320 dest = noDest;
1321 capacity = 0;
1322 }
1323 FixedSortKeyByteSink sink(reinterpret_cast<char *>(dest), capacity);
1324 UErrorCode errorCode = U_ZERO_ERROR;
1325 writeSortKey(s, length, sink, errorCode);
1326 return U_SUCCESS(errorCode) ? sink.NumberOfBytesAppended() : 0;
1327}
1328
1329void
1330RuleBasedCollator::writeSortKey(const UChar *s, int32_t length,
1331 SortKeyByteSink &sink, UErrorCode &errorCode) const {
1332 if(U_FAILURE(errorCode)) { return; }
1333 const UChar *limit = (length >= 0) ? s + length : NULL;
1334 UBool numeric = settings->isNumeric();
1335 CollationKeys::LevelCallback callback;
1336 if(settings->dontCheckFCD()) {
1337 UTF16CollationIterator iter(data, numeric, s, s, limit);
1338 CollationKeys::writeSortKeyUpToQuaternary(iter, data->compressibleBytes, *settings,
1339 sink, Collation::PRIMARY_LEVEL,
1340 callback, TRUE, errorCode);
1341 } else {
1342 FCDUTF16CollationIterator iter(data, numeric, s, s, limit);
1343 CollationKeys::writeSortKeyUpToQuaternary(iter, data->compressibleBytes, *settings,
1344 sink, Collation::PRIMARY_LEVEL,
1345 callback, TRUE, errorCode);
1346 }
1347 if(settings->getStrength() == UCOL_IDENTICAL) {
1348 writeIdenticalLevel(s, limit, sink, errorCode);
1349 }
1350 static const char terminator = 0; // TERMINATOR_BYTE
1351 sink.Append(&terminator, 1);
1352}
1353
1354void
1355RuleBasedCollator::writeIdenticalLevel(const UChar *s, const UChar *limit,
1356 SortKeyByteSink &sink, UErrorCode &errorCode) const {
1357 // NFD quick check
1358 const UChar *nfdQCYesLimit = data->nfcImpl.decompose(s, limit, NULL, errorCode);
1359 if(U_FAILURE(errorCode)) { return; }
1360 sink.Append(Collation::LEVEL_SEPARATOR_BYTE);
1361 UChar32 prev = 0;
1362 if(nfdQCYesLimit != s) {
1363 prev = u_writeIdenticalLevelRun(prev, s, (int32_t)(nfdQCYesLimit - s), sink);
1364 }
1365 // Is there non-NFD text?
1366 int32_t destLengthEstimate;
1367 if(limit != NULL) {
1368 if(nfdQCYesLimit == limit) { return; }
1369 destLengthEstimate = (int32_t)(limit - nfdQCYesLimit);
1370 } else {
1371 // s is NUL-terminated
1372 if(*nfdQCYesLimit == 0) { return; }
1373 destLengthEstimate = -1;
1374 }
1375 UnicodeString nfd;
1376 data->nfcImpl.decompose(nfdQCYesLimit, limit, nfd, destLengthEstimate, errorCode);
1377 u_writeIdenticalLevelRun(prev, nfd.getBuffer(), nfd.length(), sink);
1378}
1379
1380namespace {
1381
1382/**
1383 * internalNextSortKeyPart() calls CollationKeys::writeSortKeyUpToQuaternary()
1384 * with an instance of this callback class.
1385 * When another level is about to be written, the callback
1386 * records the level and the number of bytes that will be written until
1387 * the sink (which is actually a FixedSortKeyByteSink) fills up.
1388 *
1389 * When internalNextSortKeyPart() is called again, it restarts with the last level
1390 * and ignores as many bytes as were written previously for that level.
1391 */
1392class PartLevelCallback : public CollationKeys::LevelCallback {
1393public:
1394 PartLevelCallback(const SortKeyByteSink &s)
1395 : sink(s), level(Collation::PRIMARY_LEVEL) {
1396 levelCapacity = sink.GetRemainingCapacity();
1397 }
1398 virtual ~PartLevelCallback() {}
1399 virtual UBool needToWrite(Collation::Level l) {
1400 if(!sink.Overflowed()) {
1401 // Remember a level that will be at least partially written.
1402 level = l;
1403 levelCapacity = sink.GetRemainingCapacity();
1404 return TRUE;
1405 } else {
1406 return FALSE;
1407 }
1408 }
1409 Collation::Level getLevel() const { return level; }
1410 int32_t getLevelCapacity() const { return levelCapacity; }
1411
1412private:
1413 const SortKeyByteSink &sink;
1414 Collation::Level level;
1415 int32_t levelCapacity;
1416};
1417
1418} // namespace
1419
1420int32_t
1421RuleBasedCollator::internalNextSortKeyPart(UCharIterator *iter, uint32_t state[2],
1422 uint8_t *dest, int32_t count, UErrorCode &errorCode) const {
1423 if(U_FAILURE(errorCode)) { return 0; }
1424 if(iter == NULL || state == NULL || count < 0 || (count > 0 && dest == NULL)) {
1425 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
1426 return 0;
1427 }
1428 if(count == 0) { return 0; }
1429
1430 FixedSortKeyByteSink sink(reinterpret_cast<char *>(dest), count);
1431 sink.IgnoreBytes((int32_t)state[1]);
1432 iter->move(iter, 0, UITER_START);
1433
1434 Collation::Level level = (Collation::Level)state[0];
1435 if(level <= Collation::QUATERNARY_LEVEL) {
1436 UBool numeric = settings->isNumeric();
1437 PartLevelCallback callback(sink);
1438 if(settings->dontCheckFCD()) {
1439 UIterCollationIterator ci(data, numeric, *iter);
1440 CollationKeys::writeSortKeyUpToQuaternary(ci, data->compressibleBytes, *settings,
1441 sink, level, callback, FALSE, errorCode);
1442 } else {
1443 FCDUIterCollationIterator ci(data, numeric, *iter, 0);
1444 CollationKeys::writeSortKeyUpToQuaternary(ci, data->compressibleBytes, *settings,
1445 sink, level, callback, FALSE, errorCode);
1446 }
1447 if(U_FAILURE(errorCode)) { return 0; }
1448 if(sink.NumberOfBytesAppended() > count) {
1449 state[0] = (uint32_t)callback.getLevel();
1450 state[1] = (uint32_t)callback.getLevelCapacity();
1451 return count;
1452 }
1453 // All of the normal levels are done.
1454 if(settings->getStrength() == UCOL_IDENTICAL) {
1455 level = Collation::IDENTICAL_LEVEL;
1456 iter->move(iter, 0, UITER_START);
1457 }
1458 // else fall through to setting ZERO_LEVEL
1459 }
1460
1461 if(level == Collation::IDENTICAL_LEVEL) {
1462 int32_t levelCapacity = sink.GetRemainingCapacity();
1463 UnicodeString s;
1464 for(;;) {
1465 UChar32 c = iter->next(iter);
1466 if(c < 0) { break; }
1467 s.append((UChar)c);
1468 }
1469 const UChar *sArray = s.getBuffer();
1470 writeIdenticalLevel(sArray, sArray + s.length(), sink, errorCode);
1471 if(U_FAILURE(errorCode)) { return 0; }
1472 if(sink.NumberOfBytesAppended() > count) {
1473 state[0] = (uint32_t)level;
1474 state[1] = (uint32_t)levelCapacity;
1475 return count;
1476 }
1477 }
1478
1479 // ZERO_LEVEL: Fill the remainder of dest with 00 bytes.
1480 state[0] = (uint32_t)Collation::ZERO_LEVEL;
1481 state[1] = 0;
1482 int32_t length = sink.NumberOfBytesAppended();
1483 int32_t i = length;
1484 while(i < count) { dest[i++] = 0; }
1485 return length;
1486}
1487
1488void
1489RuleBasedCollator::internalGetCEs(const UnicodeString &str, UVector64 &ces,
1490 UErrorCode &errorCode) const {
1491 if(U_FAILURE(errorCode)) { return; }
1492 const UChar *s = str.getBuffer();
1493 const UChar *limit = s + str.length();
1494 UBool numeric = settings->isNumeric();
1495 if(settings->dontCheckFCD()) {
1496 UTF16CollationIterator iter(data, numeric, s, s, limit);
1497 int64_t ce;
1498 while((ce = iter.nextCE(errorCode)) != Collation::NO_CE) {
1499 ces.addElement(ce, errorCode);
1500 }
1501 } else {
1502 FCDUTF16CollationIterator iter(data, numeric, s, s, limit);
1503 int64_t ce;
1504 while((ce = iter.nextCE(errorCode)) != Collation::NO_CE) {
1505 ces.addElement(ce, errorCode);
1506 }
1507 }
1508}
1509
1510namespace {
1511
1512void appendSubtag(CharString &s, char letter, const char *subtag, int32_t length,
1513 UErrorCode &errorCode) {
1514 if(U_FAILURE(errorCode) || length == 0) { return; }
1515 if(!s.isEmpty()) {
1516 s.append('_', errorCode);
1517 }
1518 s.append(letter, errorCode);
1519 for(int32_t i = 0; i < length; ++i) {
1520 s.append(uprv_toupper(subtag[i]), errorCode);
1521 }
1522}
1523
1524void appendAttribute(CharString &s, char letter, UColAttributeValue value,
1525 UErrorCode &errorCode) {
1526 if(U_FAILURE(errorCode)) { return; }
1527 if(!s.isEmpty()) {
1528 s.append('_', errorCode);
1529 }
1530 static const char *valueChars = "1234...........IXO..SN..LU......";
1531 s.append(letter, errorCode);
1532 s.append(valueChars[value], errorCode);
1533}
1534
1535} // namespace
1536
1537int32_t
1538RuleBasedCollator::internalGetShortDefinitionString(const char *locale,
1539 char *buffer, int32_t capacity,
1540 UErrorCode &errorCode) const {
1541 if(U_FAILURE(errorCode)) { return 0; }
1542 if(buffer == NULL ? capacity != 0 : capacity < 0) {
1543 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
1544 return 0;
1545 }
1546 if(locale == NULL) {
1547 locale = internalGetLocaleID(ULOC_VALID_LOCALE, errorCode);
1548 }
1549
1550 char resultLocale[ULOC_FULLNAME_CAPACITY + 1];
1551 int32_t length = ucol_getFunctionalEquivalent(resultLocale, ULOC_FULLNAME_CAPACITY,
1552 "collation", locale,
1553 NULL, &errorCode);
1554 if(U_FAILURE(errorCode)) { return 0; }
1555 if(length == 0) {
1556 uprv_strcpy(resultLocale, "root");
1557 } else {
1558 resultLocale[length] = 0;
1559 }
1560
1561 // Append items in alphabetic order of their short definition letters.
1562 CharString result;
1563 char subtag[ULOC_KEYWORD_AND_VALUES_CAPACITY];
1564
1565 if(attributeHasBeenSetExplicitly(UCOL_ALTERNATE_HANDLING)) {
1566 appendAttribute(result, 'A', getAttribute(UCOL_ALTERNATE_HANDLING, errorCode), errorCode);
1567 }
1568 // ATTR_VARIABLE_TOP not supported because 'B' was broken.
1569 // See ICU tickets #10372 and #10386.
1570 if(attributeHasBeenSetExplicitly(UCOL_CASE_FIRST)) {
1571 appendAttribute(result, 'C', getAttribute(UCOL_CASE_FIRST, errorCode), errorCode);
1572 }
1573 if(attributeHasBeenSetExplicitly(UCOL_NUMERIC_COLLATION)) {
1574 appendAttribute(result, 'D', getAttribute(UCOL_NUMERIC_COLLATION, errorCode), errorCode);
1575 }
1576 if(attributeHasBeenSetExplicitly(UCOL_CASE_LEVEL)) {
1577 appendAttribute(result, 'E', getAttribute(UCOL_CASE_LEVEL, errorCode), errorCode);
1578 }
1579 if(attributeHasBeenSetExplicitly(UCOL_FRENCH_COLLATION)) {
1580 appendAttribute(result, 'F', getAttribute(UCOL_FRENCH_COLLATION, errorCode), errorCode);
1581 }
1582 // Note: UCOL_HIRAGANA_QUATERNARY_MODE is deprecated and never changes away from default.
1583 length = uloc_getKeywordValue(resultLocale, "collation", subtag, UPRV_LENGTHOF(subtag), &errorCode);
1584 appendSubtag(result, 'K', subtag, length, errorCode);
1585 length = uloc_getLanguage(resultLocale, subtag, UPRV_LENGTHOF(subtag), &errorCode);
1586 appendSubtag(result, 'L', subtag, length, errorCode);
1587 if(attributeHasBeenSetExplicitly(UCOL_NORMALIZATION_MODE)) {
1588 appendAttribute(result, 'N', getAttribute(UCOL_NORMALIZATION_MODE, errorCode), errorCode);
1589 }
1590 length = uloc_getCountry(resultLocale, subtag, UPRV_LENGTHOF(subtag), &errorCode);
1591 appendSubtag(result, 'R', subtag, length, errorCode);
1592 if(attributeHasBeenSetExplicitly(UCOL_STRENGTH)) {
1593 appendAttribute(result, 'S', getAttribute(UCOL_STRENGTH, errorCode), errorCode);
1594 }
1595 length = uloc_getVariant(resultLocale, subtag, UPRV_LENGTHOF(subtag), &errorCode);
1596 appendSubtag(result, 'V', subtag, length, errorCode);
1597 length = uloc_getScript(resultLocale, subtag, UPRV_LENGTHOF(subtag), &errorCode);
1598 appendSubtag(result, 'Z', subtag, length, errorCode);
1599
1600 if(U_FAILURE(errorCode)) { return 0; }
1601 if(result.length() <= capacity) {
1602 uprv_memcpy(buffer, result.data(), result.length());
1603 }
1604 return u_terminateChars(buffer, capacity, result.length(), &errorCode);
1605}
1606
1607UBool
1608RuleBasedCollator::isUnsafe(UChar32 c) const {
1609 return data->isUnsafeBackward(c, settings->isNumeric());
1610}
1611
1612void
1613RuleBasedCollator::computeMaxExpansions(const CollationTailoring *t, UErrorCode &errorCode) {
1614 t->maxExpansions = CollationElementIterator::computeMaxExpansions(t->data, errorCode);
1615}
1616
1617UBool
1618RuleBasedCollator::initMaxExpansions(UErrorCode &errorCode) const {
1619 umtx_initOnce(tailoring->maxExpansionsInitOnce, computeMaxExpansions, tailoring, errorCode);
1620 return U_SUCCESS(errorCode);
1621}
1622
1623CollationElementIterator *
1624RuleBasedCollator::createCollationElementIterator(const UnicodeString& source) const {
1625 UErrorCode errorCode = U_ZERO_ERROR;
1626 if(!initMaxExpansions(errorCode)) { return NULL; }
1627 CollationElementIterator *cei = new CollationElementIterator(source, this, errorCode);
1628 if(U_FAILURE(errorCode)) {
1629 delete cei;
1630 return NULL;
1631 }
1632 return cei;
1633}
1634
1635CollationElementIterator *
1636RuleBasedCollator::createCollationElementIterator(const CharacterIterator& source) const {
1637 UErrorCode errorCode = U_ZERO_ERROR;
1638 if(!initMaxExpansions(errorCode)) { return NULL; }
1639 CollationElementIterator *cei = new CollationElementIterator(source, this, errorCode);
1640 if(U_FAILURE(errorCode)) {
1641 delete cei;
1642 return NULL;
1643 }
1644 return cei;
1645}
1646
1647int32_t
1648RuleBasedCollator::getMaxExpansion(int32_t order) const {
1649 UErrorCode errorCode = U_ZERO_ERROR;
1650 (void)initMaxExpansions(errorCode);
1651 return CollationElementIterator::getMaxExpansion(tailoring->maxExpansions, order);
1652}
1653
1654U_NAMESPACE_END
1655
1656#endif // !UCONFIG_NO_COLLATION