1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 *******************************************************************************
5 * Copyright (C) 1996-2015, International Business Machines
6 * Corporation and others. All Rights Reserved.
7 *******************************************************************************
8 * rulebasedcollator.cpp
10 * (replaced the former tblcoll.cpp)
12 * created on: 2012feb14 with new and old collation code
13 * created by: Markus W. Scherer
16 #include "unicode/utypes.h"
18 #if !UCONFIG_NO_COLLATION
20 #include "unicode/coll.h"
21 #include "unicode/coleitr.h"
22 #include "unicode/localpointer.h"
23 #include "unicode/locid.h"
24 #include "unicode/sortkey.h"
25 #include "unicode/tblcoll.h"
26 #include "unicode/ucol.h"
27 #include "unicode/uiter.h"
28 #include "unicode/uloc.h"
29 #include "unicode/uniset.h"
30 #include "unicode/unistr.h"
31 #include "unicode/usetiter.h"
32 #include "unicode/utf8.h"
33 #include "unicode/uversion.h"
37 #include "collation.h"
38 #include "collationcompare.h"
39 #include "collationdata.h"
40 #include "collationdatareader.h"
41 #include "collationfastlatin.h"
42 #include "collationiterator.h"
43 #include "collationkeys.h"
44 #include "collationroot.h"
45 #include "collationsets.h"
46 #include "collationsettings.h"
47 #include "collationtailoring.h"
52 #include "uitercollationiterator.h"
54 #include "utf16collationiterator.h"
55 #include "utf8collationiterator.h"
62 class FixedSortKeyByteSink
: public SortKeyByteSink
{
64 FixedSortKeyByteSink(char *dest
, int32_t destCapacity
)
65 : SortKeyByteSink(dest
, destCapacity
) {}
66 virtual ~FixedSortKeyByteSink();
69 virtual void AppendBeyondCapacity(const char *bytes
, int32_t n
, int32_t length
);
70 virtual UBool
Resize(int32_t appendCapacity
, int32_t length
);
73 FixedSortKeyByteSink::~FixedSortKeyByteSink() {}
76 FixedSortKeyByteSink::AppendBeyondCapacity(const char *bytes
, int32_t /*n*/, int32_t length
) {
77 // buffer_ != NULL && bytes != NULL && n > 0 && appended_ > capacity_
78 // Fill the buffer completely.
79 int32_t available
= capacity_
- length
;
81 uprv_memcpy(buffer_
+ length
, bytes
, available
);
86 FixedSortKeyByteSink::Resize(int32_t /*appendCapacity*/, int32_t /*length*/) {
92 // Not in an anonymous namespace, so that it can be a friend of CollationKey.
93 class CollationKeyByteSink
: public SortKeyByteSink
{
95 CollationKeyByteSink(CollationKey
&key
)
96 : SortKeyByteSink(reinterpret_cast<char *>(key
.getBytes()), key
.getCapacity()),
98 virtual ~CollationKeyByteSink();
101 virtual void AppendBeyondCapacity(const char *bytes
, int32_t n
, int32_t length
);
102 virtual UBool
Resize(int32_t appendCapacity
, int32_t length
);
107 CollationKeyByteSink::~CollationKeyByteSink() {}
110 CollationKeyByteSink::AppendBeyondCapacity(const char *bytes
, int32_t n
, int32_t length
) {
111 // buffer_ != NULL && bytes != NULL && n > 0 && appended_ > capacity_
112 if (Resize(n
, length
)) {
113 uprv_memcpy(buffer_
+ length
, bytes
, n
);
118 CollationKeyByteSink::Resize(int32_t appendCapacity
, int32_t length
) {
119 if (buffer_
== NULL
) {
120 return FALSE
; // allocation failed before already
122 int32_t newCapacity
= 2 * capacity_
;
123 int32_t altCapacity
= length
+ 2 * appendCapacity
;
124 if (newCapacity
< altCapacity
) {
125 newCapacity
= altCapacity
;
127 if (newCapacity
< 200) {
130 uint8_t *newBuffer
= key_
.reallocate(newCapacity
, length
);
131 if (newBuffer
== NULL
) {
135 buffer_
= reinterpret_cast<char *>(newBuffer
);
136 capacity_
= newCapacity
;
140 RuleBasedCollator::RuleBasedCollator(const RuleBasedCollator
&other
)
143 settings(other
.settings
),
144 tailoring(other
.tailoring
),
145 cacheEntry(other
.cacheEntry
),
146 validLocale(other
.validLocale
),
147 explicitlySetAttributes(other
.explicitlySetAttributes
),
148 actualLocaleIsSameAsValid(other
.actualLocaleIsSameAsValid
) {
150 cacheEntry
->addRef();
153 RuleBasedCollator::RuleBasedCollator(const uint8_t *bin
, int32_t length
,
154 const RuleBasedCollator
*base
, UErrorCode
&errorCode
)
160 explicitlySetAttributes(0),
161 actualLocaleIsSameAsValid(FALSE
) {
162 if(U_FAILURE(errorCode
)) { return; }
163 if(bin
== NULL
|| length
== 0 || base
== NULL
) {
164 errorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
167 const CollationTailoring
*root
= CollationRoot::getRoot(errorCode
);
168 if(U_FAILURE(errorCode
)) { return; }
169 if(base
->tailoring
!= root
) {
170 errorCode
= U_UNSUPPORTED_ERROR
;
173 LocalPointer
<CollationTailoring
> t(new CollationTailoring(base
->tailoring
->settings
));
174 if(t
.isNull() || t
->isBogus()) {
175 errorCode
= U_MEMORY_ALLOCATION_ERROR
;
178 CollationDataReader::read(base
->tailoring
, bin
, length
, *t
, errorCode
);
179 if(U_FAILURE(errorCode
)) { return; }
180 t
->actualLocale
.setToBogus();
181 adoptTailoring(t
.orphan(), errorCode
);
184 RuleBasedCollator::RuleBasedCollator(const CollationCacheEntry
*entry
)
185 : data(entry
->tailoring
->data
),
186 settings(entry
->tailoring
->settings
),
187 tailoring(entry
->tailoring
),
189 validLocale(entry
->validLocale
),
190 explicitlySetAttributes(0),
191 actualLocaleIsSameAsValid(FALSE
) {
193 cacheEntry
->addRef();
196 RuleBasedCollator::~RuleBasedCollator() {
197 SharedObject::clearPtr(settings
);
198 SharedObject::clearPtr(cacheEntry
);
202 RuleBasedCollator::adoptTailoring(CollationTailoring
*t
, UErrorCode
&errorCode
) {
203 if(U_FAILURE(errorCode
)) {
204 t
->deleteIfZeroRefCount();
207 U_ASSERT(settings
== NULL
&& data
== NULL
&& tailoring
== NULL
&& cacheEntry
== NULL
);
208 cacheEntry
= new CollationCacheEntry(t
->actualLocale
, t
);
209 if(cacheEntry
== NULL
) {
210 errorCode
= U_MEMORY_ALLOCATION_ERROR
;
211 t
->deleteIfZeroRefCount();
215 settings
= t
->settings
;
218 cacheEntry
->addRef();
219 validLocale
= t
->actualLocale
;
220 actualLocaleIsSameAsValid
= FALSE
;
224 RuleBasedCollator::clone() const {
225 return new RuleBasedCollator(*this);
228 RuleBasedCollator
&RuleBasedCollator::operator=(const RuleBasedCollator
&other
) {
229 if(this == &other
) { return *this; }
230 SharedObject::copyPtr(other
.settings
, settings
);
231 tailoring
= other
.tailoring
;
232 SharedObject::copyPtr(other
.cacheEntry
, cacheEntry
);
233 data
= tailoring
->data
;
234 validLocale
= other
.validLocale
;
235 explicitlySetAttributes
= other
.explicitlySetAttributes
;
236 actualLocaleIsSameAsValid
= other
.actualLocaleIsSameAsValid
;
240 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedCollator
)
243 RuleBasedCollator::operator==(const Collator
& other
) const {
244 if(this == &other
) { return TRUE
; }
245 if(!Collator::operator==(other
)) { return FALSE
; }
246 const RuleBasedCollator
&o
= static_cast<const RuleBasedCollator
&>(other
);
247 if(*settings
!= *o
.settings
) { return FALSE
; }
248 if(data
== o
.data
) { return TRUE
; }
249 UBool thisIsRoot
= data
->base
== NULL
;
250 UBool otherIsRoot
= o
.data
->base
== NULL
;
251 U_ASSERT(!thisIsRoot
|| !otherIsRoot
); // otherwise their data pointers should be ==
252 if(thisIsRoot
!= otherIsRoot
) { return FALSE
; }
253 if((thisIsRoot
|| !tailoring
->rules
.isEmpty()) &&
254 (otherIsRoot
|| !o
.tailoring
->rules
.isEmpty())) {
255 // Shortcut: If both collators have valid rule strings, then compare those.
256 if(tailoring
->rules
== o
.tailoring
->rules
) { return TRUE
; }
258 // Different rule strings can result in the same or equivalent tailoring.
259 // The rule strings are optional in ICU resource bundles, although included by default.
260 // cloneBinary() drops the rule string.
261 UErrorCode errorCode
= U_ZERO_ERROR
;
262 LocalPointer
<UnicodeSet
> thisTailored(getTailoredSet(errorCode
));
263 LocalPointer
<UnicodeSet
> otherTailored(o
.getTailoredSet(errorCode
));
264 if(U_FAILURE(errorCode
)) { return FALSE
; }
265 if(*thisTailored
!= *otherTailored
) { return FALSE
; }
266 // For completeness, we should compare all of the mappings;
267 // or we should create a list of strings, sort it with one collator,
268 // and check if both collators compare adjacent strings the same
269 // (order & strength, down to quaternary); or similar.
270 // Testing equality of collators seems unusual.
275 RuleBasedCollator::hashCode() const {
276 int32_t h
= settings
->hashCode();
277 if(data
->base
== NULL
) { return h
; } // root collator
278 // Do not rely on the rule string, see comments in operator==().
279 UErrorCode errorCode
= U_ZERO_ERROR
;
280 LocalPointer
<UnicodeSet
> set(getTailoredSet(errorCode
));
281 if(U_FAILURE(errorCode
)) { return 0; }
282 UnicodeSetIterator
iter(*set
);
283 while(iter
.next() && !iter
.isString()) {
284 h
^= data
->getCE32(iter
.getCodepoint());
290 RuleBasedCollator::setLocales(const Locale
&requested
, const Locale
&valid
,
291 const Locale
&actual
) {
292 if(actual
== tailoring
->actualLocale
) {
293 actualLocaleIsSameAsValid
= FALSE
;
295 U_ASSERT(actual
== valid
);
296 actualLocaleIsSameAsValid
= TRUE
;
298 // Do not modify tailoring.actualLocale:
299 // We cannot be sure that that would be thread-safe.
301 (void)requested
; // Ignore, see also ticket #10477.
305 RuleBasedCollator::getLocale(ULocDataLocaleType type
, UErrorCode
& errorCode
) const {
306 if(U_FAILURE(errorCode
)) {
307 return Locale::getRoot();
310 case ULOC_ACTUAL_LOCALE
:
311 return actualLocaleIsSameAsValid
? validLocale
: tailoring
->actualLocale
;
312 case ULOC_VALID_LOCALE
:
313 case ULOC_REQUESTED_LOCALE
: // Apple: keep treating as ULOC_VALID_LOCALE, apps depend on it <rdar://problem/19546211>
316 errorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
317 return Locale::getRoot();
322 RuleBasedCollator::internalGetLocaleID(ULocDataLocaleType type
, UErrorCode
&errorCode
) const {
323 if(U_FAILURE(errorCode
)) {
326 const Locale
*result
;
328 case ULOC_ACTUAL_LOCALE
:
329 result
= actualLocaleIsSameAsValid
? &validLocale
: &tailoring
->actualLocale
;
331 case ULOC_VALID_LOCALE
:
332 case ULOC_REQUESTED_LOCALE
: // Apple: keep treating as ULOC_VALID_LOCALE, apps depend on it <rdar://problem/19546211>
333 result
= &validLocale
;
336 errorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
339 if(result
->isBogus()) { return NULL
; }
340 const char *id
= result
->getName();
341 return id
[0] == 0 ? "root" : id
;
345 RuleBasedCollator::getRules() const {
346 return tailoring
->rules
;
350 RuleBasedCollator::getRules(UColRuleOption delta
, UnicodeString
&buffer
) const {
351 if(delta
== UCOL_TAILORING_ONLY
) {
352 buffer
= tailoring
->rules
;
357 CollationLoader::appendRootRules(buffer
);
358 buffer
.append(tailoring
->rules
).getTerminatedBuffer();
362 RuleBasedCollator::getVersion(UVersionInfo version
) const {
363 uprv_memcpy(version
, tailoring
->version
, U_MAX_VERSION_LENGTH
);
364 if (version
[1]==0x68 /* uca 13.0 */ && (version
[2]&0xC0)==0 /*uca x.x.0*/) {
365 version
[2]|=0x40; // uca x.x.1, Apple hack to bump UCA version for ICU 66.1 final
367 version
[0] += (UCOL_RUNTIME_VERSION
<< 4) + (UCOL_RUNTIME_VERSION
>> 4);
371 RuleBasedCollator::getTailoredSet(UErrorCode
&errorCode
) const {
372 if(U_FAILURE(errorCode
)) { return NULL
; }
373 UnicodeSet
*tailored
= new UnicodeSet();
374 if(tailored
== NULL
) {
375 errorCode
= U_MEMORY_ALLOCATION_ERROR
;
378 if(data
->base
!= NULL
) {
379 TailoredSet(tailored
).forData(data
, errorCode
);
380 if(U_FAILURE(errorCode
)) {
389 RuleBasedCollator::internalGetContractionsAndExpansions(
390 UnicodeSet
*contractions
, UnicodeSet
*expansions
,
391 UBool addPrefixes
, UErrorCode
&errorCode
) const {
392 if(U_FAILURE(errorCode
)) { return; }
393 if(contractions
!= NULL
) {
394 contractions
->clear();
396 if(expansions
!= NULL
) {
399 ContractionsAndExpansions(contractions
, expansions
, NULL
, addPrefixes
).forData(data
, errorCode
);
403 RuleBasedCollator::internalAddContractions(UChar32 c
, UnicodeSet
&set
, UErrorCode
&errorCode
) const {
404 if(U_FAILURE(errorCode
)) { return; }
405 ContractionsAndExpansions(&set
, NULL
, NULL
, FALSE
).forCodePoint(data
, c
, errorCode
);
408 const CollationSettings
&
409 RuleBasedCollator::getDefaultSettings() const {
410 return *tailoring
->settings
;
414 RuleBasedCollator::getAttribute(UColAttribute attr
, UErrorCode
&errorCode
) const {
415 if(U_FAILURE(errorCode
)) { return UCOL_DEFAULT
; }
418 case UCOL_FRENCH_COLLATION
:
419 option
= CollationSettings::BACKWARD_SECONDARY
;
421 case UCOL_ALTERNATE_HANDLING
:
422 return settings
->getAlternateHandling();
423 case UCOL_CASE_FIRST
:
424 return settings
->getCaseFirst();
425 case UCOL_CASE_LEVEL
:
426 option
= CollationSettings::CASE_LEVEL
;
428 case UCOL_NORMALIZATION_MODE
:
429 option
= CollationSettings::CHECK_FCD
;
432 return (UColAttributeValue
)settings
->getStrength();
433 case UCOL_HIRAGANA_QUATERNARY_MODE
:
434 // Deprecated attribute, unsettable.
436 case UCOL_NUMERIC_COLLATION
:
437 option
= CollationSettings::NUMERIC
;
440 errorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
443 return ((settings
->options
& option
) == 0) ? UCOL_OFF
: UCOL_ON
;
447 RuleBasedCollator::setAttribute(UColAttribute attr
, UColAttributeValue value
,
448 UErrorCode
&errorCode
) {
449 UColAttributeValue oldValue
= getAttribute(attr
, errorCode
);
450 if(U_FAILURE(errorCode
)) { return; }
451 if(value
== oldValue
) {
452 setAttributeExplicitly(attr
);
455 const CollationSettings
&defaultSettings
= getDefaultSettings();
456 if(settings
== &defaultSettings
) {
457 if(value
== UCOL_DEFAULT
) {
458 setAttributeDefault(attr
);
462 CollationSettings
*ownedSettings
= SharedObject::copyOnWrite(settings
);
463 if(ownedSettings
== NULL
) {
464 errorCode
= U_MEMORY_ALLOCATION_ERROR
;
469 case UCOL_FRENCH_COLLATION
:
470 ownedSettings
->setFlag(CollationSettings::BACKWARD_SECONDARY
, value
,
471 defaultSettings
.options
, errorCode
);
473 case UCOL_ALTERNATE_HANDLING
:
474 ownedSettings
->setAlternateHandling(value
, defaultSettings
.options
, errorCode
);
476 case UCOL_CASE_FIRST
:
477 ownedSettings
->setCaseFirst(value
, defaultSettings
.options
, errorCode
);
479 case UCOL_CASE_LEVEL
:
480 ownedSettings
->setFlag(CollationSettings::CASE_LEVEL
, value
,
481 defaultSettings
.options
, errorCode
);
483 case UCOL_NORMALIZATION_MODE
:
484 ownedSettings
->setFlag(CollationSettings::CHECK_FCD
, value
,
485 defaultSettings
.options
, errorCode
);
488 ownedSettings
->setStrength(value
, defaultSettings
.options
, errorCode
);
490 case UCOL_HIRAGANA_QUATERNARY_MODE
:
491 // Deprecated attribute. Check for valid values but do not change anything.
492 if(value
!= UCOL_OFF
&& value
!= UCOL_ON
&& value
!= UCOL_DEFAULT
) {
493 errorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
496 case UCOL_NUMERIC_COLLATION
:
497 ownedSettings
->setFlag(CollationSettings::NUMERIC
, value
, defaultSettings
.options
, errorCode
);
500 errorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
503 if(U_FAILURE(errorCode
)) { return; }
504 setFastLatinOptions(*ownedSettings
);
505 if(value
== UCOL_DEFAULT
) {
506 setAttributeDefault(attr
);
508 setAttributeExplicitly(attr
);
513 RuleBasedCollator::setMaxVariable(UColReorderCode group
, UErrorCode
&errorCode
) {
514 if(U_FAILURE(errorCode
)) { return *this; }
515 // Convert the reorder code into a MaxVariable number, or UCOL_DEFAULT=-1.
517 if(group
== UCOL_REORDER_CODE_DEFAULT
) {
518 value
= UCOL_DEFAULT
;
519 } else if(UCOL_REORDER_CODE_FIRST
<= group
&& group
<= UCOL_REORDER_CODE_CURRENCY
) {
520 value
= group
- UCOL_REORDER_CODE_FIRST
;
522 errorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
525 CollationSettings::MaxVariable oldValue
= settings
->getMaxVariable();
526 if(value
== oldValue
) {
527 setAttributeExplicitly(ATTR_VARIABLE_TOP
);
530 const CollationSettings
&defaultSettings
= getDefaultSettings();
531 if(settings
== &defaultSettings
) {
532 if(value
== UCOL_DEFAULT
) {
533 setAttributeDefault(ATTR_VARIABLE_TOP
);
537 CollationSettings
*ownedSettings
= SharedObject::copyOnWrite(settings
);
538 if(ownedSettings
== NULL
) {
539 errorCode
= U_MEMORY_ALLOCATION_ERROR
;
543 if(group
== UCOL_REORDER_CODE_DEFAULT
) {
544 group
= (UColReorderCode
)(UCOL_REORDER_CODE_FIRST
+ defaultSettings
.getMaxVariable());
546 uint32_t varTop
= data
->getLastPrimaryForGroup(group
);
547 U_ASSERT(varTop
!= 0);
548 ownedSettings
->setMaxVariable(value
, defaultSettings
.options
, errorCode
);
549 if(U_FAILURE(errorCode
)) { return *this; }
550 ownedSettings
->variableTop
= varTop
;
551 setFastLatinOptions(*ownedSettings
);
552 if(value
== UCOL_DEFAULT
) {
553 setAttributeDefault(ATTR_VARIABLE_TOP
);
555 setAttributeExplicitly(ATTR_VARIABLE_TOP
);
561 RuleBasedCollator::getMaxVariable() const {
562 return (UColReorderCode
)(UCOL_REORDER_CODE_FIRST
+ settings
->getMaxVariable());
566 RuleBasedCollator::getVariableTop(UErrorCode
& /*errorCode*/) const {
567 return settings
->variableTop
;
571 RuleBasedCollator::setVariableTop(const UChar
*varTop
, int32_t len
, UErrorCode
&errorCode
) {
572 if(U_FAILURE(errorCode
)) { return 0; }
573 if(varTop
== NULL
&& len
!=0) {
574 errorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
577 if(len
< 0) { len
= u_strlen(varTop
); }
579 errorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
582 UBool numeric
= settings
->isNumeric();
584 if(settings
->dontCheckFCD()) {
585 UTF16CollationIterator
ci(data
, numeric
, varTop
, varTop
, varTop
+ len
);
586 ce1
= ci
.nextCE(errorCode
);
587 ce2
= ci
.nextCE(errorCode
);
589 FCDUTF16CollationIterator
ci(data
, numeric
, varTop
, varTop
, varTop
+ len
);
590 ce1
= ci
.nextCE(errorCode
);
591 ce2
= ci
.nextCE(errorCode
);
593 if(ce1
== Collation::NO_CE
|| ce2
!= Collation::NO_CE
) {
594 errorCode
= U_CE_NOT_FOUND_ERROR
;
597 setVariableTop((uint32_t)(ce1
>> 32), errorCode
);
598 return settings
->variableTop
;
602 RuleBasedCollator::setVariableTop(const UnicodeString
&varTop
, UErrorCode
&errorCode
) {
603 return setVariableTop(varTop
.getBuffer(), varTop
.length(), errorCode
);
607 RuleBasedCollator::setVariableTop(uint32_t varTop
, UErrorCode
&errorCode
) {
608 if(U_FAILURE(errorCode
)) { return; }
609 if(varTop
!= settings
->variableTop
) {
610 // Pin the variable top to the end of the reordering group which contains it.
611 // Only a few special groups are supported.
612 int32_t group
= data
->getGroupForPrimary(varTop
);
613 if(group
< UCOL_REORDER_CODE_FIRST
|| UCOL_REORDER_CODE_CURRENCY
< group
) {
614 errorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
617 uint32_t v
= data
->getLastPrimaryForGroup(group
);
618 U_ASSERT(v
!= 0 && v
>= varTop
);
620 if(varTop
!= settings
->variableTop
) {
621 CollationSettings
*ownedSettings
= SharedObject::copyOnWrite(settings
);
622 if(ownedSettings
== NULL
) {
623 errorCode
= U_MEMORY_ALLOCATION_ERROR
;
626 ownedSettings
->setMaxVariable(group
- UCOL_REORDER_CODE_FIRST
,
627 getDefaultSettings().options
, errorCode
);
628 if(U_FAILURE(errorCode
)) { return; }
629 ownedSettings
->variableTop
= varTop
;
630 setFastLatinOptions(*ownedSettings
);
633 if(varTop
== getDefaultSettings().variableTop
) {
634 setAttributeDefault(ATTR_VARIABLE_TOP
);
636 setAttributeExplicitly(ATTR_VARIABLE_TOP
);
641 RuleBasedCollator::getReorderCodes(int32_t *dest
, int32_t capacity
,
642 UErrorCode
&errorCode
) const {
643 if(U_FAILURE(errorCode
)) { return 0; }
644 if(capacity
< 0 || (dest
== NULL
&& capacity
> 0)) {
645 errorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
648 int32_t length
= settings
->reorderCodesLength
;
649 if(length
== 0) { return 0; }
650 if(length
> capacity
) {
651 errorCode
= U_BUFFER_OVERFLOW_ERROR
;
654 uprv_memcpy(dest
, settings
->reorderCodes
, length
* 4);
659 RuleBasedCollator::setReorderCodes(const int32_t *reorderCodes
, int32_t length
,
660 UErrorCode
&errorCode
) {
661 if(U_FAILURE(errorCode
)) { return; }
662 if(length
< 0 || (reorderCodes
== NULL
&& length
> 0)) {
663 errorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
666 if(length
== 1 && reorderCodes
[0] == UCOL_REORDER_CODE_NONE
) {
669 if(length
== settings
->reorderCodesLength
&&
670 uprv_memcmp(reorderCodes
, settings
->reorderCodes
, length
* 4) == 0) {
673 const CollationSettings
&defaultSettings
= getDefaultSettings();
674 if(length
== 1 && reorderCodes
[0] == UCOL_REORDER_CODE_DEFAULT
) {
675 if(settings
!= &defaultSettings
) {
676 CollationSettings
*ownedSettings
= SharedObject::copyOnWrite(settings
);
677 if(ownedSettings
== NULL
) {
678 errorCode
= U_MEMORY_ALLOCATION_ERROR
;
681 ownedSettings
->copyReorderingFrom(defaultSettings
, errorCode
);
682 setFastLatinOptions(*ownedSettings
);
686 CollationSettings
*ownedSettings
= SharedObject::copyOnWrite(settings
);
687 if(ownedSettings
== NULL
) {
688 errorCode
= U_MEMORY_ALLOCATION_ERROR
;
691 ownedSettings
->setReordering(*data
, reorderCodes
, length
, errorCode
);
692 setFastLatinOptions(*ownedSettings
);
696 RuleBasedCollator::setFastLatinOptions(CollationSettings
&ownedSettings
) const {
697 ownedSettings
.fastLatinOptions
= CollationFastLatin::getOptions(
699 ownedSettings
.fastLatinPrimaries
, UPRV_LENGTHOF(ownedSettings
.fastLatinPrimaries
));
703 RuleBasedCollator::compare(const UnicodeString
&left
, const UnicodeString
&right
,
704 UErrorCode
&errorCode
) const {
705 if(U_FAILURE(errorCode
)) { return UCOL_EQUAL
; }
706 return doCompare(left
.getBuffer(), left
.length(),
707 right
.getBuffer(), right
.length(), errorCode
);
711 RuleBasedCollator::compare(const UnicodeString
&left
, const UnicodeString
&right
,
712 int32_t length
, UErrorCode
&errorCode
) const {
713 if(U_FAILURE(errorCode
) || length
== 0) { return UCOL_EQUAL
; }
715 errorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
718 int32_t leftLength
= left
.length();
719 int32_t rightLength
= right
.length();
720 if(leftLength
> length
) { leftLength
= length
; }
721 if(rightLength
> length
) { rightLength
= length
; }
722 return doCompare(left
.getBuffer(), leftLength
,
723 right
.getBuffer(), rightLength
, errorCode
);
727 RuleBasedCollator::compare(const UChar
*left
, int32_t leftLength
,
728 const UChar
*right
, int32_t rightLength
,
729 UErrorCode
&errorCode
) const {
730 if(U_FAILURE(errorCode
)) { return UCOL_EQUAL
; }
731 if((left
== NULL
&& leftLength
!= 0) || (right
== NULL
&& rightLength
!= 0)) {
732 errorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
735 // Make sure both or neither strings have a known length.
736 // We do not optimize for mixed length/termination.
737 if(leftLength
>= 0) {
738 if(rightLength
< 0) { rightLength
= u_strlen(right
); }
740 if(rightLength
>= 0) { leftLength
= u_strlen(left
); }
742 return doCompare(left
, leftLength
, right
, rightLength
, errorCode
);
746 RuleBasedCollator::compareUTF8(const StringPiece
&left
, const StringPiece
&right
,
747 UErrorCode
&errorCode
) const {
748 if(U_FAILURE(errorCode
)) { return UCOL_EQUAL
; }
749 const uint8_t *leftBytes
= reinterpret_cast<const uint8_t *>(left
.data());
750 const uint8_t *rightBytes
= reinterpret_cast<const uint8_t *>(right
.data());
751 if((leftBytes
== NULL
&& !left
.empty()) || (rightBytes
== NULL
&& !right
.empty())) {
752 errorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
755 return doCompare(leftBytes
, left
.length(), rightBytes
, right
.length(), errorCode
);
759 RuleBasedCollator::internalCompareUTF8(const char *left
, int32_t leftLength
,
760 const char *right
, int32_t rightLength
,
761 UErrorCode
&errorCode
) const {
762 if(U_FAILURE(errorCode
)) { return UCOL_EQUAL
; }
763 if((left
== NULL
&& leftLength
!= 0) || (right
== NULL
&& rightLength
!= 0)) {
764 errorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
767 // Make sure both or neither strings have a known length.
768 // We do not optimize for mixed length/termination.
769 if(leftLength
>= 0) {
770 if(rightLength
< 0) { rightLength
= static_cast<int32_t>(uprv_strlen(right
)); }
772 if(rightLength
>= 0) { leftLength
= static_cast<int32_t>(uprv_strlen(left
)); }
774 return doCompare(reinterpret_cast<const uint8_t *>(left
), leftLength
,
775 reinterpret_cast<const uint8_t *>(right
), rightLength
, errorCode
);
781 * Abstract iterator for identical-level string comparisons.
782 * Returns FCD code points and handles temporary switching to NFD.
784 class NFDIterator
: public UObject
{
786 NFDIterator() : index(-1), length(0) {}
787 virtual ~NFDIterator() {}
789 * Returns the next code point from the internal normalization buffer,
790 * or else the next text code point.
791 * Returns -1 at the end of the text.
793 UChar32
nextCodePoint() {
795 if(index
== length
) {
799 U16_NEXT_UNSAFE(decomp
, index
, c
);
803 return nextRawCodePoint();
807 * @param c the last code point returned by nextCodePoint() or nextDecomposedCodePoint()
808 * @return the first code point in c's decomposition,
809 * or c itself if it was decomposed already or if it does not decompose
811 UChar32
nextDecomposedCodePoint(const Normalizer2Impl
&nfcImpl
, UChar32 c
) {
812 if(index
>= 0) { return c
; }
813 decomp
= nfcImpl
.getDecomposition(c
, buffer
, length
);
814 if(decomp
== NULL
) { return c
; }
816 U16_NEXT_UNSAFE(decomp
, index
, c
);
821 * Returns the next text code point in FCD order.
822 * Returns -1 at the end of the text.
824 virtual UChar32
nextRawCodePoint() = 0;
832 class UTF16NFDIterator
: public NFDIterator
{
834 UTF16NFDIterator(const UChar
*text
, const UChar
*textLimit
) : s(text
), limit(textLimit
) {}
836 virtual UChar32
nextRawCodePoint() {
837 if(s
== limit
) { return U_SENTINEL
; }
839 if(limit
== NULL
&& c
== 0) {
844 if(U16_IS_LEAD(c
) && s
!= limit
&& U16_IS_TRAIL(trail
= *s
)) {
846 c
= U16_GET_SUPPLEMENTARY(c
, trail
);
855 class FCDUTF16NFDIterator
: public UTF16NFDIterator
{
857 FCDUTF16NFDIterator(const Normalizer2Impl
&nfcImpl
, const UChar
*text
, const UChar
*textLimit
)
858 : UTF16NFDIterator(NULL
, NULL
) {
859 UErrorCode errorCode
= U_ZERO_ERROR
;
860 const UChar
*spanLimit
= nfcImpl
.makeFCD(text
, textLimit
, NULL
, errorCode
);
861 if(U_FAILURE(errorCode
)) { return; }
862 if(spanLimit
== textLimit
|| (textLimit
== NULL
&& *spanLimit
== 0)) {
866 str
.setTo(text
, (int32_t)(spanLimit
- text
));
868 ReorderingBuffer
r_buffer(nfcImpl
, str
);
869 if(r_buffer
.init(str
.length(), errorCode
)) {
870 nfcImpl
.makeFCD(spanLimit
, textLimit
, &r_buffer
, errorCode
);
873 if(U_SUCCESS(errorCode
)) {
875 limit
= s
+ str
.length();
883 class UTF8NFDIterator
: public NFDIterator
{
885 UTF8NFDIterator(const uint8_t *text
, int32_t textLength
)
886 : s(text
), pos(0), length(textLength
) {}
888 virtual UChar32
nextRawCodePoint() {
889 if(pos
== length
|| (s
[pos
] == 0 && length
< 0)) { return U_SENTINEL
; }
891 U8_NEXT_OR_FFFD(s
, pos
, length
, c
);
900 class FCDUTF8NFDIterator
: public NFDIterator
{
902 FCDUTF8NFDIterator(const CollationData
*data
, const uint8_t *text
, int32_t textLength
)
903 : u8ci(data
, FALSE
, text
, 0, textLength
) {}
905 virtual UChar32
nextRawCodePoint() {
906 UErrorCode errorCode
= U_ZERO_ERROR
;
907 return u8ci
.nextCodePoint(errorCode
);
910 FCDUTF8CollationIterator u8ci
;
913 class UIterNFDIterator
: public NFDIterator
{
915 UIterNFDIterator(UCharIterator
&it
) : iter(it
) {}
917 virtual UChar32
nextRawCodePoint() {
918 return uiter_next32(&iter
);
924 class FCDUIterNFDIterator
: public NFDIterator
{
926 FCDUIterNFDIterator(const CollationData
*data
, UCharIterator
&it
, int32_t startIndex
)
927 : uici(data
, FALSE
, it
, startIndex
) {}
929 virtual UChar32
nextRawCodePoint() {
930 UErrorCode errorCode
= U_ZERO_ERROR
;
931 return uici
.nextCodePoint(errorCode
);
934 FCDUIterCollationIterator uici
;
937 UCollationResult
compareNFDIter(const Normalizer2Impl
&nfcImpl
,
938 NFDIterator
&left
, NFDIterator
&right
) {
940 // Fetch the next FCD code point from each string.
941 UChar32 leftCp
= left
.nextCodePoint();
942 UChar32 rightCp
= right
.nextCodePoint();
943 if(leftCp
== rightCp
) {
944 if(leftCp
< 0) { break; }
947 // If they are different, then decompose each and compare again.
949 leftCp
= -2; // end of string
950 } else if(leftCp
== 0xfffe) {
951 leftCp
= -1; // U+FFFE: merge separator
953 leftCp
= left
.nextDecomposedCodePoint(nfcImpl
, leftCp
);
956 rightCp
= -2; // end of string
957 } else if(rightCp
== 0xfffe) {
958 rightCp
= -1; // U+FFFE: merge separator
960 rightCp
= right
.nextDecomposedCodePoint(nfcImpl
, rightCp
);
962 if(leftCp
< rightCp
) { return UCOL_LESS
; }
963 if(leftCp
> rightCp
) { return UCOL_GREATER
; }
971 RuleBasedCollator::doCompare(const UChar
*left
, int32_t leftLength
,
972 const UChar
*right
, int32_t rightLength
,
973 UErrorCode
&errorCode
) const {
974 // U_FAILURE(errorCode) checked by caller.
975 if(left
== right
&& leftLength
== rightLength
) {
979 // Identical-prefix test.
980 const UChar
*leftLimit
;
981 const UChar
*rightLimit
;
982 int32_t equalPrefixLength
= 0;
987 while((c
= left
[equalPrefixLength
]) == right
[equalPrefixLength
]) {
988 if(c
== 0) { return UCOL_EQUAL
; }
992 leftLimit
= left
+ leftLength
;
993 rightLimit
= right
+ rightLength
;
995 if(equalPrefixLength
== leftLength
) {
996 if(equalPrefixLength
== rightLength
) { return UCOL_EQUAL
; }
998 } else if(equalPrefixLength
== rightLength
||
999 left
[equalPrefixLength
] != right
[equalPrefixLength
]) {
1002 ++equalPrefixLength
;
1006 UBool numeric
= settings
->isNumeric();
1007 if(equalPrefixLength
> 0) {
1008 if((equalPrefixLength
!= leftLength
&&
1009 data
->isUnsafeBackward(left
[equalPrefixLength
], numeric
)) ||
1010 (equalPrefixLength
!= rightLength
&&
1011 data
->isUnsafeBackward(right
[equalPrefixLength
], numeric
))) {
1012 // Identical prefix: Back up to the start of a contraction or reordering sequence.
1013 while(--equalPrefixLength
> 0 &&
1014 data
->isUnsafeBackward(left
[equalPrefixLength
], numeric
)) {}
1017 // - A longer string can compare equal to a prefix of it if only ignorables follow.
1018 // - With a backward level, a longer string can compare less-than a prefix of it.
1020 // Pass the actual start of each string into the CollationIterators,
1021 // plus the equalPrefixLength position,
1022 // so that prefix matches back into the equal prefix work.
1026 int32_t fastLatinOptions
= settings
->fastLatinOptions
;
1027 if(fastLatinOptions
>= 0 &&
1028 (equalPrefixLength
== leftLength
||
1029 left
[equalPrefixLength
] <= CollationFastLatin::LATIN_MAX
) &&
1030 (equalPrefixLength
== rightLength
||
1031 right
[equalPrefixLength
] <= CollationFastLatin::LATIN_MAX
)) {
1032 if(leftLength
>= 0) {
1033 result
= CollationFastLatin::compareUTF16(data
->fastLatinTable
,
1034 settings
->fastLatinPrimaries
,
1036 left
+ equalPrefixLength
,
1037 leftLength
- equalPrefixLength
,
1038 right
+ equalPrefixLength
,
1039 rightLength
- equalPrefixLength
);
1041 result
= CollationFastLatin::compareUTF16(data
->fastLatinTable
,
1042 settings
->fastLatinPrimaries
,
1044 left
+ equalPrefixLength
, -1,
1045 right
+ equalPrefixLength
, -1);
1048 result
= CollationFastLatin::BAIL_OUT_RESULT
;
1051 if(result
== CollationFastLatin::BAIL_OUT_RESULT
) {
1052 if(settings
->dontCheckFCD()) {
1053 UTF16CollationIterator
leftIter(data
, numeric
,
1054 left
, left
+ equalPrefixLength
, leftLimit
);
1055 UTF16CollationIterator
rightIter(data
, numeric
,
1056 right
, right
+ equalPrefixLength
, rightLimit
);
1057 result
= CollationCompare::compareUpToQuaternary(leftIter
, rightIter
, *settings
, errorCode
);
1059 FCDUTF16CollationIterator
leftIter(data
, numeric
,
1060 left
, left
+ equalPrefixLength
, leftLimit
);
1061 FCDUTF16CollationIterator
rightIter(data
, numeric
,
1062 right
, right
+ equalPrefixLength
, rightLimit
);
1063 result
= CollationCompare::compareUpToQuaternary(leftIter
, rightIter
, *settings
, errorCode
);
1066 if(result
!= UCOL_EQUAL
|| settings
->getStrength() < UCOL_IDENTICAL
|| U_FAILURE(errorCode
)) {
1067 return (UCollationResult
)result
;
1070 // Note: If NUL-terminated, we could get the actual limits from the iterators now.
1071 // That would complicate the iterators a bit, NUL-terminated strings are only a C convenience,
1072 // and the benefit seems unlikely to be measurable.
1074 // Compare identical level.
1075 const Normalizer2Impl
&nfcImpl
= data
->nfcImpl
;
1076 left
+= equalPrefixLength
;
1077 right
+= equalPrefixLength
;
1078 if(settings
->dontCheckFCD()) {
1079 UTF16NFDIterator
leftIter(left
, leftLimit
);
1080 UTF16NFDIterator
rightIter(right
, rightLimit
);
1081 return compareNFDIter(nfcImpl
, leftIter
, rightIter
);
1083 FCDUTF16NFDIterator
leftIter(nfcImpl
, left
, leftLimit
);
1084 FCDUTF16NFDIterator
rightIter(nfcImpl
, right
, rightLimit
);
1085 return compareNFDIter(nfcImpl
, leftIter
, rightIter
);
1090 RuleBasedCollator::doCompare(const uint8_t *left
, int32_t leftLength
,
1091 const uint8_t *right
, int32_t rightLength
,
1092 UErrorCode
&errorCode
) const {
1093 // U_FAILURE(errorCode) checked by caller.
1094 if(left
== right
&& leftLength
== rightLength
) {
1098 // Identical-prefix test.
1099 int32_t equalPrefixLength
= 0;
1100 if(leftLength
< 0) {
1102 while((c
= left
[equalPrefixLength
]) == right
[equalPrefixLength
]) {
1103 if(c
== 0) { return UCOL_EQUAL
; }
1104 ++equalPrefixLength
;
1108 if(equalPrefixLength
== leftLength
) {
1109 if(equalPrefixLength
== rightLength
) { return UCOL_EQUAL
; }
1111 } else if(equalPrefixLength
== rightLength
||
1112 left
[equalPrefixLength
] != right
[equalPrefixLength
]) {
1115 ++equalPrefixLength
;
1118 // Back up to the start of a partially-equal code point.
1119 if(equalPrefixLength
> 0 &&
1120 ((equalPrefixLength
!= leftLength
&& U8_IS_TRAIL(left
[equalPrefixLength
])) ||
1121 (equalPrefixLength
!= rightLength
&& U8_IS_TRAIL(right
[equalPrefixLength
])))) {
1122 while(--equalPrefixLength
> 0 && U8_IS_TRAIL(left
[equalPrefixLength
])) {}
1125 UBool numeric
= settings
->isNumeric();
1126 if(equalPrefixLength
> 0) {
1127 UBool unsafe
= FALSE
;
1128 if(equalPrefixLength
!= leftLength
) {
1129 int32_t i
= equalPrefixLength
;
1131 U8_NEXT_OR_FFFD(left
, i
, leftLength
, c
);
1132 unsafe
= data
->isUnsafeBackward(c
, numeric
);
1134 if(!unsafe
&& equalPrefixLength
!= rightLength
) {
1135 int32_t i
= equalPrefixLength
;
1137 U8_NEXT_OR_FFFD(right
, i
, rightLength
, c
);
1138 unsafe
= data
->isUnsafeBackward(c
, numeric
);
1141 // Identical prefix: Back up to the start of a contraction or reordering sequence.
1144 U8_PREV_OR_FFFD(left
, 0, equalPrefixLength
, c
);
1145 } while(equalPrefixLength
> 0 && data
->isUnsafeBackward(c
, numeric
));
1147 // See the notes in the UTF-16 version.
1149 // Pass the actual start of each string into the CollationIterators,
1150 // plus the equalPrefixLength position,
1151 // so that prefix matches back into the equal prefix work.
1155 int32_t fastLatinOptions
= settings
->fastLatinOptions
;
1156 if(fastLatinOptions
>= 0 &&
1157 (equalPrefixLength
== leftLength
||
1158 left
[equalPrefixLength
] <= CollationFastLatin::LATIN_MAX_UTF8_LEAD
) &&
1159 (equalPrefixLength
== rightLength
||
1160 right
[equalPrefixLength
] <= CollationFastLatin::LATIN_MAX_UTF8_LEAD
)) {
1161 if(leftLength
>= 0) {
1162 result
= CollationFastLatin::compareUTF8(data
->fastLatinTable
,
1163 settings
->fastLatinPrimaries
,
1165 left
+ equalPrefixLength
,
1166 leftLength
- equalPrefixLength
,
1167 right
+ equalPrefixLength
,
1168 rightLength
- equalPrefixLength
);
1170 result
= CollationFastLatin::compareUTF8(data
->fastLatinTable
,
1171 settings
->fastLatinPrimaries
,
1173 left
+ equalPrefixLength
, -1,
1174 right
+ equalPrefixLength
, -1);
1177 result
= CollationFastLatin::BAIL_OUT_RESULT
;
1180 if(result
== CollationFastLatin::BAIL_OUT_RESULT
) {
1181 if(settings
->dontCheckFCD()) {
1182 UTF8CollationIterator
leftIter(data
, numeric
, left
, equalPrefixLength
, leftLength
);
1183 UTF8CollationIterator
rightIter(data
, numeric
, right
, equalPrefixLength
, rightLength
);
1184 result
= CollationCompare::compareUpToQuaternary(leftIter
, rightIter
, *settings
, errorCode
);
1186 FCDUTF8CollationIterator
leftIter(data
, numeric
, left
, equalPrefixLength
, leftLength
);
1187 FCDUTF8CollationIterator
rightIter(data
, numeric
, right
, equalPrefixLength
, rightLength
);
1188 result
= CollationCompare::compareUpToQuaternary(leftIter
, rightIter
, *settings
, errorCode
);
1191 if(result
!= UCOL_EQUAL
|| settings
->getStrength() < UCOL_IDENTICAL
|| U_FAILURE(errorCode
)) {
1192 return (UCollationResult
)result
;
1195 // Note: If NUL-terminated, we could get the actual limits from the iterators now.
1196 // That would complicate the iterators a bit, NUL-terminated strings are only a C convenience,
1197 // and the benefit seems unlikely to be measurable.
1199 // Compare identical level.
1200 const Normalizer2Impl
&nfcImpl
= data
->nfcImpl
;
1201 left
+= equalPrefixLength
;
1202 right
+= equalPrefixLength
;
1203 if(leftLength
> 0) {
1204 leftLength
-= equalPrefixLength
;
1205 rightLength
-= equalPrefixLength
;
1207 if(settings
->dontCheckFCD()) {
1208 UTF8NFDIterator
leftIter(left
, leftLength
);
1209 UTF8NFDIterator
rightIter(right
, rightLength
);
1210 return compareNFDIter(nfcImpl
, leftIter
, rightIter
);
1212 FCDUTF8NFDIterator
leftIter(data
, left
, leftLength
);
1213 FCDUTF8NFDIterator
rightIter(data
, right
, rightLength
);
1214 return compareNFDIter(nfcImpl
, leftIter
, rightIter
);
1219 RuleBasedCollator::compare(UCharIterator
&left
, UCharIterator
&right
,
1220 UErrorCode
&errorCode
) const {
1221 if(U_FAILURE(errorCode
) || &left
== &right
) { return UCOL_EQUAL
; }
1222 UBool numeric
= settings
->isNumeric();
1224 // Identical-prefix test.
1225 int32_t equalPrefixLength
= 0;
1229 while((leftUnit
= left
.next(&left
)) == (rightUnit
= right
.next(&right
))) {
1230 if(leftUnit
< 0) { return UCOL_EQUAL
; }
1231 ++equalPrefixLength
;
1234 // Back out the code units that differed, for the real collation comparison.
1235 if(leftUnit
>= 0) { left
.previous(&left
); }
1236 if(rightUnit
>= 0) { right
.previous(&right
); }
1238 if(equalPrefixLength
> 0) {
1239 if((leftUnit
>= 0 && data
->isUnsafeBackward(leftUnit
, numeric
)) ||
1240 (rightUnit
>= 0 && data
->isUnsafeBackward(rightUnit
, numeric
))) {
1241 // Identical prefix: Back up to the start of a contraction or reordering sequence.
1243 --equalPrefixLength
;
1244 leftUnit
= left
.previous(&left
);
1245 right
.previous(&right
);
1246 } while(equalPrefixLength
> 0 && data
->isUnsafeBackward(leftUnit
, numeric
));
1248 // See the notes in the UTF-16 version.
1252 UCollationResult result
;
1253 if(settings
->dontCheckFCD()) {
1254 UIterCollationIterator
leftIter(data
, numeric
, left
);
1255 UIterCollationIterator
rightIter(data
, numeric
, right
);
1256 result
= CollationCompare::compareUpToQuaternary(leftIter
, rightIter
, *settings
, errorCode
);
1258 FCDUIterCollationIterator
leftIter(data
, numeric
, left
, equalPrefixLength
);
1259 FCDUIterCollationIterator
rightIter(data
, numeric
, right
, equalPrefixLength
);
1260 result
= CollationCompare::compareUpToQuaternary(leftIter
, rightIter
, *settings
, errorCode
);
1262 if(result
!= UCOL_EQUAL
|| settings
->getStrength() < UCOL_IDENTICAL
|| U_FAILURE(errorCode
)) {
1266 // Compare identical level.
1267 left
.move(&left
, equalPrefixLength
, UITER_ZERO
);
1268 right
.move(&right
, equalPrefixLength
, UITER_ZERO
);
1269 const Normalizer2Impl
&nfcImpl
= data
->nfcImpl
;
1270 if(settings
->dontCheckFCD()) {
1271 UIterNFDIterator
leftIter(left
);
1272 UIterNFDIterator
rightIter(right
);
1273 return compareNFDIter(nfcImpl
, leftIter
, rightIter
);
1275 FCDUIterNFDIterator
leftIter(data
, left
, equalPrefixLength
);
1276 FCDUIterNFDIterator
rightIter(data
, right
, equalPrefixLength
);
1277 return compareNFDIter(nfcImpl
, leftIter
, rightIter
);
1282 RuleBasedCollator::getCollationKey(const UnicodeString
&s
, CollationKey
&key
,
1283 UErrorCode
&errorCode
) const {
1284 return getCollationKey(s
.getBuffer(), s
.length(), key
, errorCode
);
1288 RuleBasedCollator::getCollationKey(const UChar
*s
, int32_t length
, CollationKey
& key
,
1289 UErrorCode
&errorCode
) const {
1290 if(U_FAILURE(errorCode
)) {
1291 return key
.setToBogus();
1293 if(s
== NULL
&& length
!= 0) {
1294 errorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
1295 return key
.setToBogus();
1297 key
.reset(); // resets the "bogus" state
1298 CollationKeyByteSink
sink(key
);
1299 writeSortKey(s
, length
, sink
, errorCode
);
1300 if(U_FAILURE(errorCode
)) {
1302 } else if(key
.isBogus()) {
1303 errorCode
= U_MEMORY_ALLOCATION_ERROR
;
1305 key
.setLength(sink
.NumberOfBytesAppended());
1311 RuleBasedCollator::getSortKey(const UnicodeString
&s
,
1312 uint8_t *dest
, int32_t capacity
) const {
1313 return getSortKey(s
.getBuffer(), s
.length(), dest
, capacity
);
1317 RuleBasedCollator::getSortKey(const UChar
*s
, int32_t length
,
1318 uint8_t *dest
, int32_t capacity
) const {
1319 if((s
== NULL
&& length
!= 0) || capacity
< 0 || (dest
== NULL
&& capacity
> 0)) {
1322 uint8_t noDest
[1] = { 0 };
1324 // Distinguish pure preflighting from an allocation error.
1328 FixedSortKeyByteSink
sink(reinterpret_cast<char *>(dest
), capacity
);
1329 UErrorCode errorCode
= U_ZERO_ERROR
;
1330 writeSortKey(s
, length
, sink
, errorCode
);
1331 return U_SUCCESS(errorCode
) ? sink
.NumberOfBytesAppended() : 0;
1335 RuleBasedCollator::writeSortKey(const UChar
*s
, int32_t length
,
1336 SortKeyByteSink
&sink
, UErrorCode
&errorCode
) const {
1337 if(U_FAILURE(errorCode
)) { return; }
1338 const UChar
*limit
= (length
>= 0) ? s
+ length
: NULL
;
1339 UBool numeric
= settings
->isNumeric();
1340 CollationKeys::LevelCallback callback
;
1341 if(settings
->dontCheckFCD()) {
1342 UTF16CollationIterator
iter(data
, numeric
, s
, s
, limit
);
1343 CollationKeys::writeSortKeyUpToQuaternary(iter
, data
->compressibleBytes
, *settings
,
1344 sink
, Collation::PRIMARY_LEVEL
,
1345 callback
, TRUE
, errorCode
);
1347 FCDUTF16CollationIterator
iter(data
, numeric
, s
, s
, limit
);
1348 CollationKeys::writeSortKeyUpToQuaternary(iter
, data
->compressibleBytes
, *settings
,
1349 sink
, Collation::PRIMARY_LEVEL
,
1350 callback
, TRUE
, errorCode
);
1352 if(settings
->getStrength() == UCOL_IDENTICAL
) {
1353 writeIdenticalLevel(s
, limit
, sink
, errorCode
);
1355 static const char terminator
= 0; // TERMINATOR_BYTE
1356 sink
.Append(&terminator
, 1);
1360 RuleBasedCollator::writeIdenticalLevel(const UChar
*s
, const UChar
*limit
,
1361 SortKeyByteSink
&sink
, UErrorCode
&errorCode
) const {
1363 const UChar
*nfdQCYesLimit
= data
->nfcImpl
.decompose(s
, limit
, NULL
, errorCode
);
1364 if(U_FAILURE(errorCode
)) { return; }
1365 sink
.Append(Collation::LEVEL_SEPARATOR_BYTE
);
1367 if(nfdQCYesLimit
!= s
) {
1368 prev
= u_writeIdenticalLevelRun(prev
, s
, (int32_t)(nfdQCYesLimit
- s
), sink
);
1370 // Is there non-NFD text?
1371 int32_t destLengthEstimate
;
1373 if(nfdQCYesLimit
== limit
) { return; }
1374 destLengthEstimate
= (int32_t)(limit
- nfdQCYesLimit
);
1376 // s is NUL-terminated
1377 if(*nfdQCYesLimit
== 0) { return; }
1378 destLengthEstimate
= -1;
1381 data
->nfcImpl
.decompose(nfdQCYesLimit
, limit
, nfd
, destLengthEstimate
, errorCode
);
1382 u_writeIdenticalLevelRun(prev
, nfd
.getBuffer(), nfd
.length(), sink
);
1388 * internalNextSortKeyPart() calls CollationKeys::writeSortKeyUpToQuaternary()
1389 * with an instance of this callback class.
1390 * When another level is about to be written, the callback
1391 * records the level and the number of bytes that will be written until
1392 * the sink (which is actually a FixedSortKeyByteSink) fills up.
1394 * When internalNextSortKeyPart() is called again, it restarts with the last level
1395 * and ignores as many bytes as were written previously for that level.
1397 class PartLevelCallback
: public CollationKeys::LevelCallback
{
1399 PartLevelCallback(const SortKeyByteSink
&s
)
1400 : sink(s
), level(Collation::PRIMARY_LEVEL
) {
1401 levelCapacity
= sink
.GetRemainingCapacity();
1403 virtual ~PartLevelCallback() {}
1404 virtual UBool
needToWrite(Collation::Level l
) {
1405 if(!sink
.Overflowed()) {
1406 // Remember a level that will be at least partially written.
1408 levelCapacity
= sink
.GetRemainingCapacity();
1414 Collation::Level
getLevel() const { return level
; }
1415 int32_t getLevelCapacity() const { return levelCapacity
; }
1418 const SortKeyByteSink
&sink
;
1419 Collation::Level level
;
1420 int32_t levelCapacity
;
1426 RuleBasedCollator::internalNextSortKeyPart(UCharIterator
*iter
, uint32_t state
[2],
1427 uint8_t *dest
, int32_t count
, UErrorCode
&errorCode
) const {
1428 if(U_FAILURE(errorCode
)) { return 0; }
1429 if(iter
== NULL
|| state
== NULL
|| count
< 0 || (count
> 0 && dest
== NULL
)) {
1430 errorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
1433 if(count
== 0) { return 0; }
1435 FixedSortKeyByteSink
sink(reinterpret_cast<char *>(dest
), count
);
1436 sink
.IgnoreBytes((int32_t)state
[1]);
1437 iter
->move(iter
, 0, UITER_START
);
1439 Collation::Level level
= (Collation::Level
)state
[0];
1440 if(level
<= Collation::QUATERNARY_LEVEL
) {
1441 UBool numeric
= settings
->isNumeric();
1442 PartLevelCallback
callback(sink
);
1443 if(settings
->dontCheckFCD()) {
1444 UIterCollationIterator
ci(data
, numeric
, *iter
);
1445 CollationKeys::writeSortKeyUpToQuaternary(ci
, data
->compressibleBytes
, *settings
,
1446 sink
, level
, callback
, FALSE
, errorCode
);
1448 FCDUIterCollationIterator
ci(data
, numeric
, *iter
, 0);
1449 CollationKeys::writeSortKeyUpToQuaternary(ci
, data
->compressibleBytes
, *settings
,
1450 sink
, level
, callback
, FALSE
, errorCode
);
1452 if(U_FAILURE(errorCode
)) { return 0; }
1453 if(sink
.NumberOfBytesAppended() > count
) {
1454 state
[0] = (uint32_t)callback
.getLevel();
1455 state
[1] = (uint32_t)callback
.getLevelCapacity();
1458 // All of the normal levels are done.
1459 if(settings
->getStrength() == UCOL_IDENTICAL
) {
1460 level
= Collation::IDENTICAL_LEVEL
;
1461 iter
->move(iter
, 0, UITER_START
);
1463 // else fall through to setting ZERO_LEVEL
1466 if(level
== Collation::IDENTICAL_LEVEL
) {
1467 int32_t levelCapacity
= sink
.GetRemainingCapacity();
1470 UChar32 c
= iter
->next(iter
);
1471 if(c
< 0) { break; }
1474 const UChar
*sArray
= s
.getBuffer();
1475 writeIdenticalLevel(sArray
, sArray
+ s
.length(), sink
, errorCode
);
1476 if(U_FAILURE(errorCode
)) { return 0; }
1477 if(sink
.NumberOfBytesAppended() > count
) {
1478 state
[0] = (uint32_t)level
;
1479 state
[1] = (uint32_t)levelCapacity
;
1484 // ZERO_LEVEL: Fill the remainder of dest with 00 bytes.
1485 state
[0] = (uint32_t)Collation::ZERO_LEVEL
;
1487 int32_t length
= sink
.NumberOfBytesAppended();
1489 while(i
< count
) { dest
[i
++] = 0; }
1494 RuleBasedCollator::internalGetCEs(const UnicodeString
&str
, UVector64
&ces
,
1495 UErrorCode
&errorCode
) const {
1496 if(U_FAILURE(errorCode
)) { return; }
1497 const UChar
*s
= str
.getBuffer();
1498 const UChar
*limit
= s
+ str
.length();
1499 UBool numeric
= settings
->isNumeric();
1500 if(settings
->dontCheckFCD()) {
1501 UTF16CollationIterator
iter(data
, numeric
, s
, s
, limit
);
1503 while((ce
= iter
.nextCE(errorCode
)) != Collation::NO_CE
) {
1504 ces
.addElement(ce
, errorCode
);
1507 FCDUTF16CollationIterator
iter(data
, numeric
, s
, s
, limit
);
1509 while((ce
= iter
.nextCE(errorCode
)) != Collation::NO_CE
) {
1510 ces
.addElement(ce
, errorCode
);
1517 void appendSubtag(CharString
&s
, char letter
, const char *subtag
, int32_t length
,
1518 UErrorCode
&errorCode
) {
1519 if(U_FAILURE(errorCode
) || length
== 0) { return; }
1521 s
.append('_', errorCode
);
1523 s
.append(letter
, errorCode
);
1524 for(int32_t i
= 0; i
< length
; ++i
) {
1525 s
.append(uprv_toupper(subtag
[i
]), errorCode
);
1529 void appendAttribute(CharString
&s
, char letter
, UColAttributeValue value
,
1530 UErrorCode
&errorCode
) {
1531 if(U_FAILURE(errorCode
)) { return; }
1533 s
.append('_', errorCode
);
1535 static const char *valueChars
= "1234...........IXO..SN..LU......";
1536 s
.append(letter
, errorCode
);
1537 s
.append(valueChars
[value
], errorCode
);
1543 RuleBasedCollator::internalGetShortDefinitionString(const char *locale
,
1544 char *buffer
, int32_t capacity
,
1545 UErrorCode
&errorCode
) const {
1546 if(U_FAILURE(errorCode
)) { return 0; }
1547 if(buffer
== NULL
? capacity
!= 0 : capacity
< 0) {
1548 errorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
1551 if(locale
== NULL
) {
1552 locale
= internalGetLocaleID(ULOC_VALID_LOCALE
, errorCode
);
1555 char resultLocale
[ULOC_FULLNAME_CAPACITY
+ 1];
1556 int32_t length
= ucol_getFunctionalEquivalent(resultLocale
, ULOC_FULLNAME_CAPACITY
,
1557 "collation", locale
,
1559 if(U_FAILURE(errorCode
)) { return 0; }
1561 uprv_strcpy(resultLocale
, "root");
1563 resultLocale
[length
] = 0;
1566 // Append items in alphabetic order of their short definition letters.
1568 char subtag
[ULOC_KEYWORD_AND_VALUES_CAPACITY
];
1570 if(attributeHasBeenSetExplicitly(UCOL_ALTERNATE_HANDLING
)) {
1571 appendAttribute(result
, 'A', getAttribute(UCOL_ALTERNATE_HANDLING
, errorCode
), errorCode
);
1573 // ATTR_VARIABLE_TOP not supported because 'B' was broken.
1574 // See ICU tickets #10372 and #10386.
1575 if(attributeHasBeenSetExplicitly(UCOL_CASE_FIRST
)) {
1576 appendAttribute(result
, 'C', getAttribute(UCOL_CASE_FIRST
, errorCode
), errorCode
);
1578 if(attributeHasBeenSetExplicitly(UCOL_NUMERIC_COLLATION
)) {
1579 appendAttribute(result
, 'D', getAttribute(UCOL_NUMERIC_COLLATION
, errorCode
), errorCode
);
1581 if(attributeHasBeenSetExplicitly(UCOL_CASE_LEVEL
)) {
1582 appendAttribute(result
, 'E', getAttribute(UCOL_CASE_LEVEL
, errorCode
), errorCode
);
1584 if(attributeHasBeenSetExplicitly(UCOL_FRENCH_COLLATION
)) {
1585 appendAttribute(result
, 'F', getAttribute(UCOL_FRENCH_COLLATION
, errorCode
), errorCode
);
1587 // Note: UCOL_HIRAGANA_QUATERNARY_MODE is deprecated and never changes away from default.
1588 length
= uloc_getKeywordValue(resultLocale
, "collation", subtag
, UPRV_LENGTHOF(subtag
), &errorCode
);
1589 appendSubtag(result
, 'K', subtag
, length
, errorCode
);
1590 length
= uloc_getLanguage(resultLocale
, subtag
, UPRV_LENGTHOF(subtag
), &errorCode
);
1592 appendSubtag(result
, 'L', "root", 4, errorCode
);
1594 appendSubtag(result
, 'L', subtag
, length
, errorCode
);
1596 if(attributeHasBeenSetExplicitly(UCOL_NORMALIZATION_MODE
)) {
1597 appendAttribute(result
, 'N', getAttribute(UCOL_NORMALIZATION_MODE
, errorCode
), errorCode
);
1599 length
= uloc_getCountry(resultLocale
, subtag
, UPRV_LENGTHOF(subtag
), &errorCode
);
1600 appendSubtag(result
, 'R', subtag
, length
, errorCode
);
1601 if(attributeHasBeenSetExplicitly(UCOL_STRENGTH
)) {
1602 appendAttribute(result
, 'S', getAttribute(UCOL_STRENGTH
, errorCode
), errorCode
);
1604 length
= uloc_getVariant(resultLocale
, subtag
, UPRV_LENGTHOF(subtag
), &errorCode
);
1605 appendSubtag(result
, 'V', subtag
, length
, errorCode
);
1606 length
= uloc_getScript(resultLocale
, subtag
, UPRV_LENGTHOF(subtag
), &errorCode
);
1607 appendSubtag(result
, 'Z', subtag
, length
, errorCode
);
1609 if(U_FAILURE(errorCode
)) { return 0; }
1610 if(result
.length() <= capacity
) {
1611 uprv_memcpy(buffer
, result
.data(), result
.length());
1613 return u_terminateChars(buffer
, capacity
, result
.length(), &errorCode
);
1617 RuleBasedCollator::isUnsafe(UChar32 c
) const {
1618 return data
->isUnsafeBackward(c
, settings
->isNumeric());
1622 RuleBasedCollator::computeMaxExpansions(const CollationTailoring
*t
, UErrorCode
&errorCode
) {
1623 t
->maxExpansions
= CollationElementIterator::computeMaxExpansions(t
->data
, errorCode
);
1627 RuleBasedCollator::initMaxExpansions(UErrorCode
&errorCode
) const {
1628 umtx_initOnce(tailoring
->maxExpansionsInitOnce
, computeMaxExpansions
, tailoring
, errorCode
);
1629 return U_SUCCESS(errorCode
);
1632 CollationElementIterator
*
1633 RuleBasedCollator::createCollationElementIterator(const UnicodeString
& source
) const {
1634 UErrorCode errorCode
= U_ZERO_ERROR
;
1635 if(!initMaxExpansions(errorCode
)) { return NULL
; }
1636 CollationElementIterator
*cei
= new CollationElementIterator(source
, this, errorCode
);
1637 if(U_FAILURE(errorCode
)) {
1644 CollationElementIterator
*
1645 RuleBasedCollator::createCollationElementIterator(const CharacterIterator
& source
) const {
1646 UErrorCode errorCode
= U_ZERO_ERROR
;
1647 if(!initMaxExpansions(errorCode
)) { return NULL
; }
1648 CollationElementIterator
*cei
= new CollationElementIterator(source
, this, errorCode
);
1649 if(U_FAILURE(errorCode
)) {
1657 RuleBasedCollator::getMaxExpansion(int32_t order
) const {
1658 UErrorCode errorCode
= U_ZERO_ERROR
;
1659 (void)initMaxExpansions(errorCode
);
1660 return CollationElementIterator::getMaxExpansion(tailoring
->maxExpansions
, order
);
1665 #endif // !UCONFIG_NO_COLLATION