1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 *******************************************************************************
5 * Copyright (C) 1996-2015, International Business Machines
6 * Corporation and others. All Rights Reserved.
7 *******************************************************************************
8 * rulebasedcollator.cpp
10 * (replaced the former tblcoll.cpp)
12 * created on: 2012feb14 with new and old collation code
13 * created by: Markus W. Scherer
16 #include "unicode/utypes.h"
18 #if !UCONFIG_NO_COLLATION
20 #include "unicode/coll.h"
21 #include "unicode/coleitr.h"
22 #include "unicode/localpointer.h"
23 #include "unicode/locid.h"
24 #include "unicode/sortkey.h"
25 #include "unicode/tblcoll.h"
26 #include "unicode/ucol.h"
27 #include "unicode/uiter.h"
28 #include "unicode/uloc.h"
29 #include "unicode/uniset.h"
30 #include "unicode/unistr.h"
31 #include "unicode/usetiter.h"
32 #include "unicode/utf8.h"
33 #include "unicode/uversion.h"
37 #include "collation.h"
38 #include "collationcompare.h"
39 #include "collationdata.h"
40 #include "collationdatareader.h"
41 #include "collationfastlatin.h"
42 #include "collationiterator.h"
43 #include "collationkeys.h"
44 #include "collationroot.h"
45 #include "collationsets.h"
46 #include "collationsettings.h"
47 #include "collationtailoring.h"
52 #include "uitercollationiterator.h"
54 #include "utf16collationiterator.h"
55 #include "utf8collationiterator.h"
62 class FixedSortKeyByteSink
: public SortKeyByteSink
{
64 FixedSortKeyByteSink(char *dest
, int32_t destCapacity
)
65 : SortKeyByteSink(dest
, destCapacity
) {}
66 virtual ~FixedSortKeyByteSink();
69 virtual void AppendBeyondCapacity(const char *bytes
, int32_t n
, int32_t length
);
70 virtual UBool
Resize(int32_t appendCapacity
, int32_t length
);
73 FixedSortKeyByteSink::~FixedSortKeyByteSink() {}
76 FixedSortKeyByteSink::AppendBeyondCapacity(const char *bytes
, int32_t /*n*/, int32_t length
) {
77 // buffer_ != NULL && bytes != NULL && n > 0 && appended_ > capacity_
78 // Fill the buffer completely.
79 int32_t available
= capacity_
- length
;
81 uprv_memcpy(buffer_
+ length
, bytes
, available
);
86 FixedSortKeyByteSink::Resize(int32_t /*appendCapacity*/, int32_t /*length*/) {
92 // Not in an anonymous namespace, so that it can be a friend of CollationKey.
93 class CollationKeyByteSink
: public SortKeyByteSink
{
95 CollationKeyByteSink(CollationKey
&key
)
96 : SortKeyByteSink(reinterpret_cast<char *>(key
.getBytes()), key
.getCapacity()),
98 virtual ~CollationKeyByteSink();
101 virtual void AppendBeyondCapacity(const char *bytes
, int32_t n
, int32_t length
);
102 virtual UBool
Resize(int32_t appendCapacity
, int32_t length
);
107 CollationKeyByteSink::~CollationKeyByteSink() {}
110 CollationKeyByteSink::AppendBeyondCapacity(const char *bytes
, int32_t n
, int32_t length
) {
111 // buffer_ != NULL && bytes != NULL && n > 0 && appended_ > capacity_
112 if (Resize(n
, length
)) {
113 uprv_memcpy(buffer_
+ length
, bytes
, n
);
118 CollationKeyByteSink::Resize(int32_t appendCapacity
, int32_t length
) {
119 if (buffer_
== NULL
) {
120 return FALSE
; // allocation failed before already
122 int32_t newCapacity
= 2 * capacity_
;
123 int32_t altCapacity
= length
+ 2 * appendCapacity
;
124 if (newCapacity
< altCapacity
) {
125 newCapacity
= altCapacity
;
127 if (newCapacity
< 200) {
130 uint8_t *newBuffer
= key_
.reallocate(newCapacity
, length
);
131 if (newBuffer
== NULL
) {
135 buffer_
= reinterpret_cast<char *>(newBuffer
);
136 capacity_
= newCapacity
;
140 RuleBasedCollator::RuleBasedCollator(const RuleBasedCollator
&other
)
143 settings(other
.settings
),
144 tailoring(other
.tailoring
),
145 cacheEntry(other
.cacheEntry
),
146 validLocale(other
.validLocale
),
147 explicitlySetAttributes(other
.explicitlySetAttributes
),
148 actualLocaleIsSameAsValid(other
.actualLocaleIsSameAsValid
) {
150 cacheEntry
->addRef();
153 RuleBasedCollator::RuleBasedCollator(const uint8_t *bin
, int32_t length
,
154 const RuleBasedCollator
*base
, UErrorCode
&errorCode
)
160 explicitlySetAttributes(0),
161 actualLocaleIsSameAsValid(FALSE
) {
162 if(U_FAILURE(errorCode
)) { return; }
163 if(bin
== NULL
|| length
== 0 || base
== NULL
) {
164 errorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
167 const CollationTailoring
*root
= CollationRoot::getRoot(errorCode
);
168 if(U_FAILURE(errorCode
)) { return; }
169 if(base
->tailoring
!= root
) {
170 errorCode
= U_UNSUPPORTED_ERROR
;
173 LocalPointer
<CollationTailoring
> t(new CollationTailoring(base
->tailoring
->settings
));
174 if(t
.isNull() || t
->isBogus()) {
175 errorCode
= U_MEMORY_ALLOCATION_ERROR
;
178 CollationDataReader::read(base
->tailoring
, bin
, length
, *t
, errorCode
);
179 if(U_FAILURE(errorCode
)) { return; }
180 t
->actualLocale
.setToBogus();
181 adoptTailoring(t
.orphan(), errorCode
);
184 RuleBasedCollator::RuleBasedCollator(const CollationCacheEntry
*entry
)
185 : data(entry
->tailoring
->data
),
186 settings(entry
->tailoring
->settings
),
187 tailoring(entry
->tailoring
),
189 validLocale(entry
->validLocale
),
190 explicitlySetAttributes(0),
191 actualLocaleIsSameAsValid(FALSE
) {
193 cacheEntry
->addRef();
196 RuleBasedCollator::~RuleBasedCollator() {
197 SharedObject::clearPtr(settings
);
198 SharedObject::clearPtr(cacheEntry
);
202 RuleBasedCollator::adoptTailoring(CollationTailoring
*t
, UErrorCode
&errorCode
) {
203 if(U_FAILURE(errorCode
)) {
204 t
->deleteIfZeroRefCount();
207 U_ASSERT(settings
== NULL
&& data
== NULL
&& tailoring
== NULL
&& cacheEntry
== NULL
);
208 cacheEntry
= new CollationCacheEntry(t
->actualLocale
, t
);
209 if(cacheEntry
== NULL
) {
210 errorCode
= U_MEMORY_ALLOCATION_ERROR
;
211 t
->deleteIfZeroRefCount();
215 settings
= t
->settings
;
218 cacheEntry
->addRef();
219 validLocale
= t
->actualLocale
;
220 actualLocaleIsSameAsValid
= FALSE
;
224 RuleBasedCollator::clone() const {
225 return new RuleBasedCollator(*this);
228 RuleBasedCollator
&RuleBasedCollator::operator=(const RuleBasedCollator
&other
) {
229 if(this == &other
) { return *this; }
230 SharedObject::copyPtr(other
.settings
, settings
);
231 tailoring
= other
.tailoring
;
232 SharedObject::copyPtr(other
.cacheEntry
, cacheEntry
);
233 data
= tailoring
->data
;
234 validLocale
= other
.validLocale
;
235 explicitlySetAttributes
= other
.explicitlySetAttributes
;
236 actualLocaleIsSameAsValid
= other
.actualLocaleIsSameAsValid
;
240 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedCollator
)
243 RuleBasedCollator::operator==(const Collator
& other
) const {
244 if(this == &other
) { return TRUE
; }
245 if(!Collator::operator==(other
)) { return FALSE
; }
246 const RuleBasedCollator
&o
= static_cast<const RuleBasedCollator
&>(other
);
247 if(*settings
!= *o
.settings
) { return FALSE
; }
248 if(data
== o
.data
) { return TRUE
; }
249 UBool thisIsRoot
= data
->base
== NULL
;
250 UBool otherIsRoot
= o
.data
->base
== NULL
;
251 U_ASSERT(!thisIsRoot
|| !otherIsRoot
); // otherwise their data pointers should be ==
252 if(thisIsRoot
!= otherIsRoot
) { return FALSE
; }
253 if((thisIsRoot
|| !tailoring
->rules
.isEmpty()) &&
254 (otherIsRoot
|| !o
.tailoring
->rules
.isEmpty())) {
255 // Shortcut: If both collators have valid rule strings, then compare those.
256 if(tailoring
->rules
== o
.tailoring
->rules
) { return TRUE
; }
258 // Different rule strings can result in the same or equivalent tailoring.
259 // The rule strings are optional in ICU resource bundles, although included by default.
260 // cloneBinary() drops the rule string.
261 UErrorCode errorCode
= U_ZERO_ERROR
;
262 LocalPointer
<UnicodeSet
> thisTailored(getTailoredSet(errorCode
));
263 LocalPointer
<UnicodeSet
> otherTailored(o
.getTailoredSet(errorCode
));
264 if(U_FAILURE(errorCode
)) { return FALSE
; }
265 if(*thisTailored
!= *otherTailored
) { return FALSE
; }
266 // For completeness, we should compare all of the mappings;
267 // or we should create a list of strings, sort it with one collator,
268 // and check if both collators compare adjacent strings the same
269 // (order & strength, down to quaternary); or similar.
270 // Testing equality of collators seems unusual.
275 RuleBasedCollator::hashCode() const {
276 int32_t h
= settings
->hashCode();
277 if(data
->base
== NULL
) { return h
; } // root collator
278 // Do not rely on the rule string, see comments in operator==().
279 UErrorCode errorCode
= U_ZERO_ERROR
;
280 LocalPointer
<UnicodeSet
> set(getTailoredSet(errorCode
));
281 if(U_FAILURE(errorCode
)) { return 0; }
282 UnicodeSetIterator
iter(*set
);
283 while(iter
.next() && !iter
.isString()) {
284 h
^= data
->getCE32(iter
.getCodepoint());
290 RuleBasedCollator::setLocales(const Locale
&requested
, const Locale
&valid
,
291 const Locale
&actual
) {
292 if(actual
== tailoring
->actualLocale
) {
293 actualLocaleIsSameAsValid
= FALSE
;
295 U_ASSERT(actual
== valid
);
296 actualLocaleIsSameAsValid
= TRUE
;
298 // Do not modify tailoring.actualLocale:
299 // We cannot be sure that that would be thread-safe.
301 (void)requested
; // Ignore, see also ticket #10477.
305 RuleBasedCollator::getLocale(ULocDataLocaleType type
, UErrorCode
& errorCode
) const {
306 if(U_FAILURE(errorCode
)) {
307 return Locale::getRoot();
310 case ULOC_ACTUAL_LOCALE
:
311 return actualLocaleIsSameAsValid
? validLocale
: tailoring
->actualLocale
;
312 case ULOC_VALID_LOCALE
:
313 case ULOC_REQUESTED_LOCALE
: // Apple: keep treating as ULOC_VALID_LOCALE, apps depend on it <rdar://problem/19546211>
316 errorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
317 return Locale::getRoot();
322 RuleBasedCollator::internalGetLocaleID(ULocDataLocaleType type
, UErrorCode
&errorCode
) const {
323 if(U_FAILURE(errorCode
)) {
326 const Locale
*result
;
328 case ULOC_ACTUAL_LOCALE
:
329 result
= actualLocaleIsSameAsValid
? &validLocale
: &tailoring
->actualLocale
;
331 case ULOC_VALID_LOCALE
:
332 case ULOC_REQUESTED_LOCALE
: // Apple: keep treating as ULOC_VALID_LOCALE, apps depend on it <rdar://problem/19546211>
333 result
= &validLocale
;
336 errorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
339 if(result
->isBogus()) { return NULL
; }
340 const char *id
= result
->getName();
341 return id
[0] == 0 ? "root" : id
;
345 RuleBasedCollator::getRules() const {
346 return tailoring
->rules
;
350 RuleBasedCollator::getRules(UColRuleOption delta
, UnicodeString
&buffer
) const {
351 if(delta
== UCOL_TAILORING_ONLY
) {
352 buffer
= tailoring
->rules
;
357 CollationLoader::appendRootRules(buffer
);
358 buffer
.append(tailoring
->rules
).getTerminatedBuffer();
362 RuleBasedCollator::getVersion(UVersionInfo version
) const {
363 uprv_memcpy(version
, tailoring
->version
, U_MAX_VERSION_LENGTH
);
364 version
[0] += (UCOL_RUNTIME_VERSION
<< 4) + (UCOL_RUNTIME_VERSION
>> 4);
368 RuleBasedCollator::getTailoredSet(UErrorCode
&errorCode
) const {
369 if(U_FAILURE(errorCode
)) { return NULL
; }
370 UnicodeSet
*tailored
= new UnicodeSet();
371 if(tailored
== NULL
) {
372 errorCode
= U_MEMORY_ALLOCATION_ERROR
;
375 if(data
->base
!= NULL
) {
376 TailoredSet(tailored
).forData(data
, errorCode
);
377 if(U_FAILURE(errorCode
)) {
386 RuleBasedCollator::internalGetContractionsAndExpansions(
387 UnicodeSet
*contractions
, UnicodeSet
*expansions
,
388 UBool addPrefixes
, UErrorCode
&errorCode
) const {
389 if(U_FAILURE(errorCode
)) { return; }
390 if(contractions
!= NULL
) {
391 contractions
->clear();
393 if(expansions
!= NULL
) {
396 ContractionsAndExpansions(contractions
, expansions
, NULL
, addPrefixes
).forData(data
, errorCode
);
400 RuleBasedCollator::internalAddContractions(UChar32 c
, UnicodeSet
&set
, UErrorCode
&errorCode
) const {
401 if(U_FAILURE(errorCode
)) { return; }
402 ContractionsAndExpansions(&set
, NULL
, NULL
, FALSE
).forCodePoint(data
, c
, errorCode
);
405 const CollationSettings
&
406 RuleBasedCollator::getDefaultSettings() const {
407 return *tailoring
->settings
;
411 RuleBasedCollator::getAttribute(UColAttribute attr
, UErrorCode
&errorCode
) const {
412 if(U_FAILURE(errorCode
)) { return UCOL_DEFAULT
; }
415 case UCOL_FRENCH_COLLATION
:
416 option
= CollationSettings::BACKWARD_SECONDARY
;
418 case UCOL_ALTERNATE_HANDLING
:
419 return settings
->getAlternateHandling();
420 case UCOL_CASE_FIRST
:
421 return settings
->getCaseFirst();
422 case UCOL_CASE_LEVEL
:
423 option
= CollationSettings::CASE_LEVEL
;
425 case UCOL_NORMALIZATION_MODE
:
426 option
= CollationSettings::CHECK_FCD
;
429 return (UColAttributeValue
)settings
->getStrength();
430 case UCOL_HIRAGANA_QUATERNARY_MODE
:
431 // Deprecated attribute, unsettable.
433 case UCOL_NUMERIC_COLLATION
:
434 option
= CollationSettings::NUMERIC
;
437 errorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
440 return ((settings
->options
& option
) == 0) ? UCOL_OFF
: UCOL_ON
;
444 RuleBasedCollator::setAttribute(UColAttribute attr
, UColAttributeValue value
,
445 UErrorCode
&errorCode
) {
446 UColAttributeValue oldValue
= getAttribute(attr
, errorCode
);
447 if(U_FAILURE(errorCode
)) { return; }
448 if(value
== oldValue
) {
449 setAttributeExplicitly(attr
);
452 const CollationSettings
&defaultSettings
= getDefaultSettings();
453 if(settings
== &defaultSettings
) {
454 if(value
== UCOL_DEFAULT
) {
455 setAttributeDefault(attr
);
459 CollationSettings
*ownedSettings
= SharedObject::copyOnWrite(settings
);
460 if(ownedSettings
== NULL
) {
461 errorCode
= U_MEMORY_ALLOCATION_ERROR
;
466 case UCOL_FRENCH_COLLATION
:
467 ownedSettings
->setFlag(CollationSettings::BACKWARD_SECONDARY
, value
,
468 defaultSettings
.options
, errorCode
);
470 case UCOL_ALTERNATE_HANDLING
:
471 ownedSettings
->setAlternateHandling(value
, defaultSettings
.options
, errorCode
);
473 case UCOL_CASE_FIRST
:
474 ownedSettings
->setCaseFirst(value
, defaultSettings
.options
, errorCode
);
476 case UCOL_CASE_LEVEL
:
477 ownedSettings
->setFlag(CollationSettings::CASE_LEVEL
, value
,
478 defaultSettings
.options
, errorCode
);
480 case UCOL_NORMALIZATION_MODE
:
481 ownedSettings
->setFlag(CollationSettings::CHECK_FCD
, value
,
482 defaultSettings
.options
, errorCode
);
485 ownedSettings
->setStrength(value
, defaultSettings
.options
, errorCode
);
487 case UCOL_HIRAGANA_QUATERNARY_MODE
:
488 // Deprecated attribute. Check for valid values but do not change anything.
489 if(value
!= UCOL_OFF
&& value
!= UCOL_ON
&& value
!= UCOL_DEFAULT
) {
490 errorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
493 case UCOL_NUMERIC_COLLATION
:
494 ownedSettings
->setFlag(CollationSettings::NUMERIC
, value
, defaultSettings
.options
, errorCode
);
497 errorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
500 if(U_FAILURE(errorCode
)) { return; }
501 setFastLatinOptions(*ownedSettings
);
502 if(value
== UCOL_DEFAULT
) {
503 setAttributeDefault(attr
);
505 setAttributeExplicitly(attr
);
510 RuleBasedCollator::setMaxVariable(UColReorderCode group
, UErrorCode
&errorCode
) {
511 if(U_FAILURE(errorCode
)) { return *this; }
512 // Convert the reorder code into a MaxVariable number, or UCOL_DEFAULT=-1.
514 if(group
== UCOL_REORDER_CODE_DEFAULT
) {
515 value
= UCOL_DEFAULT
;
516 } else if(UCOL_REORDER_CODE_FIRST
<= group
&& group
<= UCOL_REORDER_CODE_CURRENCY
) {
517 value
= group
- UCOL_REORDER_CODE_FIRST
;
519 errorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
522 CollationSettings::MaxVariable oldValue
= settings
->getMaxVariable();
523 if(value
== oldValue
) {
524 setAttributeExplicitly(ATTR_VARIABLE_TOP
);
527 const CollationSettings
&defaultSettings
= getDefaultSettings();
528 if(settings
== &defaultSettings
) {
529 if(value
== UCOL_DEFAULT
) {
530 setAttributeDefault(ATTR_VARIABLE_TOP
);
534 CollationSettings
*ownedSettings
= SharedObject::copyOnWrite(settings
);
535 if(ownedSettings
== NULL
) {
536 errorCode
= U_MEMORY_ALLOCATION_ERROR
;
540 if(group
== UCOL_REORDER_CODE_DEFAULT
) {
541 group
= (UColReorderCode
)(UCOL_REORDER_CODE_FIRST
+ defaultSettings
.getMaxVariable());
543 uint32_t varTop
= data
->getLastPrimaryForGroup(group
);
544 U_ASSERT(varTop
!= 0);
545 ownedSettings
->setMaxVariable(value
, defaultSettings
.options
, errorCode
);
546 if(U_FAILURE(errorCode
)) { return *this; }
547 ownedSettings
->variableTop
= varTop
;
548 setFastLatinOptions(*ownedSettings
);
549 if(value
== UCOL_DEFAULT
) {
550 setAttributeDefault(ATTR_VARIABLE_TOP
);
552 setAttributeExplicitly(ATTR_VARIABLE_TOP
);
558 RuleBasedCollator::getMaxVariable() const {
559 return (UColReorderCode
)(UCOL_REORDER_CODE_FIRST
+ settings
->getMaxVariable());
563 RuleBasedCollator::getVariableTop(UErrorCode
& /*errorCode*/) const {
564 return settings
->variableTop
;
568 RuleBasedCollator::setVariableTop(const UChar
*varTop
, int32_t len
, UErrorCode
&errorCode
) {
569 if(U_FAILURE(errorCode
)) { return 0; }
570 if(varTop
== NULL
&& len
!=0) {
571 errorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
574 if(len
< 0) { len
= u_strlen(varTop
); }
576 errorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
579 UBool numeric
= settings
->isNumeric();
581 if(settings
->dontCheckFCD()) {
582 UTF16CollationIterator
ci(data
, numeric
, varTop
, varTop
, varTop
+ len
);
583 ce1
= ci
.nextCE(errorCode
);
584 ce2
= ci
.nextCE(errorCode
);
586 FCDUTF16CollationIterator
ci(data
, numeric
, varTop
, varTop
, varTop
+ len
);
587 ce1
= ci
.nextCE(errorCode
);
588 ce2
= ci
.nextCE(errorCode
);
590 if(ce1
== Collation::NO_CE
|| ce2
!= Collation::NO_CE
) {
591 errorCode
= U_CE_NOT_FOUND_ERROR
;
594 setVariableTop((uint32_t)(ce1
>> 32), errorCode
);
595 return settings
->variableTop
;
599 RuleBasedCollator::setVariableTop(const UnicodeString
&varTop
, UErrorCode
&errorCode
) {
600 return setVariableTop(varTop
.getBuffer(), varTop
.length(), errorCode
);
604 RuleBasedCollator::setVariableTop(uint32_t varTop
, UErrorCode
&errorCode
) {
605 if(U_FAILURE(errorCode
)) { return; }
606 if(varTop
!= settings
->variableTop
) {
607 // Pin the variable top to the end of the reordering group which contains it.
608 // Only a few special groups are supported.
609 int32_t group
= data
->getGroupForPrimary(varTop
);
610 if(group
< UCOL_REORDER_CODE_FIRST
|| UCOL_REORDER_CODE_CURRENCY
< group
) {
611 errorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
614 uint32_t v
= data
->getLastPrimaryForGroup(group
);
615 U_ASSERT(v
!= 0 && v
>= varTop
);
617 if(varTop
!= settings
->variableTop
) {
618 CollationSettings
*ownedSettings
= SharedObject::copyOnWrite(settings
);
619 if(ownedSettings
== NULL
) {
620 errorCode
= U_MEMORY_ALLOCATION_ERROR
;
623 ownedSettings
->setMaxVariable(group
- UCOL_REORDER_CODE_FIRST
,
624 getDefaultSettings().options
, errorCode
);
625 if(U_FAILURE(errorCode
)) { return; }
626 ownedSettings
->variableTop
= varTop
;
627 setFastLatinOptions(*ownedSettings
);
630 if(varTop
== getDefaultSettings().variableTop
) {
631 setAttributeDefault(ATTR_VARIABLE_TOP
);
633 setAttributeExplicitly(ATTR_VARIABLE_TOP
);
638 RuleBasedCollator::getReorderCodes(int32_t *dest
, int32_t capacity
,
639 UErrorCode
&errorCode
) const {
640 if(U_FAILURE(errorCode
)) { return 0; }
641 if(capacity
< 0 || (dest
== NULL
&& capacity
> 0)) {
642 errorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
645 int32_t length
= settings
->reorderCodesLength
;
646 if(length
== 0) { return 0; }
647 if(length
> capacity
) {
648 errorCode
= U_BUFFER_OVERFLOW_ERROR
;
651 uprv_memcpy(dest
, settings
->reorderCodes
, length
* 4);
656 RuleBasedCollator::setReorderCodes(const int32_t *reorderCodes
, int32_t length
,
657 UErrorCode
&errorCode
) {
658 if(U_FAILURE(errorCode
)) { return; }
659 if(length
< 0 || (reorderCodes
== NULL
&& length
> 0)) {
660 errorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
663 if(length
== 1 && reorderCodes
[0] == UCOL_REORDER_CODE_NONE
) {
666 if(length
== settings
->reorderCodesLength
&&
667 uprv_memcmp(reorderCodes
, settings
->reorderCodes
, length
* 4) == 0) {
670 const CollationSettings
&defaultSettings
= getDefaultSettings();
671 if(length
== 1 && reorderCodes
[0] == UCOL_REORDER_CODE_DEFAULT
) {
672 if(settings
!= &defaultSettings
) {
673 CollationSettings
*ownedSettings
= SharedObject::copyOnWrite(settings
);
674 if(ownedSettings
== NULL
) {
675 errorCode
= U_MEMORY_ALLOCATION_ERROR
;
678 ownedSettings
->copyReorderingFrom(defaultSettings
, errorCode
);
679 setFastLatinOptions(*ownedSettings
);
683 CollationSettings
*ownedSettings
= SharedObject::copyOnWrite(settings
);
684 if(ownedSettings
== NULL
) {
685 errorCode
= U_MEMORY_ALLOCATION_ERROR
;
688 ownedSettings
->setReordering(*data
, reorderCodes
, length
, errorCode
);
689 setFastLatinOptions(*ownedSettings
);
693 RuleBasedCollator::setFastLatinOptions(CollationSettings
&ownedSettings
) const {
694 ownedSettings
.fastLatinOptions
= CollationFastLatin::getOptions(
696 ownedSettings
.fastLatinPrimaries
, UPRV_LENGTHOF(ownedSettings
.fastLatinPrimaries
));
700 RuleBasedCollator::compare(const UnicodeString
&left
, const UnicodeString
&right
,
701 UErrorCode
&errorCode
) const {
702 if(U_FAILURE(errorCode
)) { return UCOL_EQUAL
; }
703 return doCompare(left
.getBuffer(), left
.length(),
704 right
.getBuffer(), right
.length(), errorCode
);
708 RuleBasedCollator::compare(const UnicodeString
&left
, const UnicodeString
&right
,
709 int32_t length
, UErrorCode
&errorCode
) const {
710 if(U_FAILURE(errorCode
) || length
== 0) { return UCOL_EQUAL
; }
712 errorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
715 int32_t leftLength
= left
.length();
716 int32_t rightLength
= right
.length();
717 if(leftLength
> length
) { leftLength
= length
; }
718 if(rightLength
> length
) { rightLength
= length
; }
719 return doCompare(left
.getBuffer(), leftLength
,
720 right
.getBuffer(), rightLength
, errorCode
);
724 RuleBasedCollator::compare(const UChar
*left
, int32_t leftLength
,
725 const UChar
*right
, int32_t rightLength
,
726 UErrorCode
&errorCode
) const {
727 if(U_FAILURE(errorCode
)) { return UCOL_EQUAL
; }
728 if((left
== NULL
&& leftLength
!= 0) || (right
== NULL
&& rightLength
!= 0)) {
729 errorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
732 // Make sure both or neither strings have a known length.
733 // We do not optimize for mixed length/termination.
734 if(leftLength
>= 0) {
735 if(rightLength
< 0) { rightLength
= u_strlen(right
); }
737 if(rightLength
>= 0) { leftLength
= u_strlen(left
); }
739 return doCompare(left
, leftLength
, right
, rightLength
, errorCode
);
743 RuleBasedCollator::compareUTF8(const StringPiece
&left
, const StringPiece
&right
,
744 UErrorCode
&errorCode
) const {
745 if(U_FAILURE(errorCode
)) { return UCOL_EQUAL
; }
746 const uint8_t *leftBytes
= reinterpret_cast<const uint8_t *>(left
.data());
747 const uint8_t *rightBytes
= reinterpret_cast<const uint8_t *>(right
.data());
748 if((leftBytes
== NULL
&& !left
.empty()) || (rightBytes
== NULL
&& !right
.empty())) {
749 errorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
752 return doCompare(leftBytes
, left
.length(), rightBytes
, right
.length(), errorCode
);
756 RuleBasedCollator::internalCompareUTF8(const char *left
, int32_t leftLength
,
757 const char *right
, int32_t rightLength
,
758 UErrorCode
&errorCode
) const {
759 if(U_FAILURE(errorCode
)) { return UCOL_EQUAL
; }
760 if((left
== NULL
&& leftLength
!= 0) || (right
== NULL
&& rightLength
!= 0)) {
761 errorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
764 // Make sure both or neither strings have a known length.
765 // We do not optimize for mixed length/termination.
766 if(leftLength
>= 0) {
767 if(rightLength
< 0) { rightLength
= uprv_strlen(right
); }
769 if(rightLength
>= 0) { leftLength
= uprv_strlen(left
); }
771 return doCompare(reinterpret_cast<const uint8_t *>(left
), leftLength
,
772 reinterpret_cast<const uint8_t *>(right
), rightLength
, errorCode
);
778 * Abstract iterator for identical-level string comparisons.
779 * Returns FCD code points and handles temporary switching to NFD.
781 class NFDIterator
: public UObject
{
783 NFDIterator() : index(-1), length(0) {}
784 virtual ~NFDIterator() {}
786 * Returns the next code point from the internal normalization buffer,
787 * or else the next text code point.
788 * Returns -1 at the end of the text.
790 UChar32
nextCodePoint() {
792 if(index
== length
) {
796 U16_NEXT_UNSAFE(decomp
, index
, c
);
800 return nextRawCodePoint();
804 * @param c the last code point returned by nextCodePoint() or nextDecomposedCodePoint()
805 * @return the first code point in c's decomposition,
806 * or c itself if it was decomposed already or if it does not decompose
808 UChar32
nextDecomposedCodePoint(const Normalizer2Impl
&nfcImpl
, UChar32 c
) {
809 if(index
>= 0) { return c
; }
810 decomp
= nfcImpl
.getDecomposition(c
, buffer
, length
);
811 if(decomp
== NULL
) { return c
; }
813 U16_NEXT_UNSAFE(decomp
, index
, c
);
818 * Returns the next text code point in FCD order.
819 * Returns -1 at the end of the text.
821 virtual UChar32
nextRawCodePoint() = 0;
829 class UTF16NFDIterator
: public NFDIterator
{
831 UTF16NFDIterator(const UChar
*text
, const UChar
*textLimit
) : s(text
), limit(textLimit
) {}
833 virtual UChar32
nextRawCodePoint() {
834 if(s
== limit
) { return U_SENTINEL
; }
836 if(limit
== NULL
&& c
== 0) {
841 if(U16_IS_LEAD(c
) && s
!= limit
&& U16_IS_TRAIL(trail
= *s
)) {
843 c
= U16_GET_SUPPLEMENTARY(c
, trail
);
852 class FCDUTF16NFDIterator
: public UTF16NFDIterator
{
854 FCDUTF16NFDIterator(const Normalizer2Impl
&nfcImpl
, const UChar
*text
, const UChar
*textLimit
)
855 : UTF16NFDIterator(NULL
, NULL
) {
856 UErrorCode errorCode
= U_ZERO_ERROR
;
857 const UChar
*spanLimit
= nfcImpl
.makeFCD(text
, textLimit
, NULL
, errorCode
);
858 if(U_FAILURE(errorCode
)) { return; }
859 if(spanLimit
== textLimit
|| (textLimit
== NULL
&& *spanLimit
== 0)) {
863 str
.setTo(text
, (int32_t)(spanLimit
- text
));
865 ReorderingBuffer
buffer(nfcImpl
, str
);
866 if(buffer
.init(str
.length(), errorCode
)) {
867 nfcImpl
.makeFCD(spanLimit
, textLimit
, &buffer
, errorCode
);
870 if(U_SUCCESS(errorCode
)) {
872 limit
= s
+ str
.length();
880 class UTF8NFDIterator
: public NFDIterator
{
882 UTF8NFDIterator(const uint8_t *text
, int32_t textLength
)
883 : s(text
), pos(0), length(textLength
) {}
885 virtual UChar32
nextRawCodePoint() {
886 if(pos
== length
|| (s
[pos
] == 0 && length
< 0)) { return U_SENTINEL
; }
888 U8_NEXT_OR_FFFD(s
, pos
, length
, c
);
897 class FCDUTF8NFDIterator
: public NFDIterator
{
899 FCDUTF8NFDIterator(const CollationData
*data
, const uint8_t *text
, int32_t textLength
)
900 : u8ci(data
, FALSE
, text
, 0, textLength
) {}
902 virtual UChar32
nextRawCodePoint() {
903 UErrorCode errorCode
= U_ZERO_ERROR
;
904 return u8ci
.nextCodePoint(errorCode
);
907 FCDUTF8CollationIterator u8ci
;
910 class UIterNFDIterator
: public NFDIterator
{
912 UIterNFDIterator(UCharIterator
&it
) : iter(it
) {}
914 virtual UChar32
nextRawCodePoint() {
915 return uiter_next32(&iter
);
921 class FCDUIterNFDIterator
: public NFDIterator
{
923 FCDUIterNFDIterator(const CollationData
*data
, UCharIterator
&it
, int32_t startIndex
)
924 : uici(data
, FALSE
, it
, startIndex
) {}
926 virtual UChar32
nextRawCodePoint() {
927 UErrorCode errorCode
= U_ZERO_ERROR
;
928 return uici
.nextCodePoint(errorCode
);
931 FCDUIterCollationIterator uici
;
934 UCollationResult
compareNFDIter(const Normalizer2Impl
&nfcImpl
,
935 NFDIterator
&left
, NFDIterator
&right
) {
937 // Fetch the next FCD code point from each string.
938 UChar32 leftCp
= left
.nextCodePoint();
939 UChar32 rightCp
= right
.nextCodePoint();
940 if(leftCp
== rightCp
) {
941 if(leftCp
< 0) { break; }
944 // If they are different, then decompose each and compare again.
946 leftCp
= -2; // end of string
947 } else if(leftCp
== 0xfffe) {
948 leftCp
= -1; // U+FFFE: merge separator
950 leftCp
= left
.nextDecomposedCodePoint(nfcImpl
, leftCp
);
953 rightCp
= -2; // end of string
954 } else if(rightCp
== 0xfffe) {
955 rightCp
= -1; // U+FFFE: merge separator
957 rightCp
= right
.nextDecomposedCodePoint(nfcImpl
, rightCp
);
959 if(leftCp
< rightCp
) { return UCOL_LESS
; }
960 if(leftCp
> rightCp
) { return UCOL_GREATER
; }
968 RuleBasedCollator::doCompare(const UChar
*left
, int32_t leftLength
,
969 const UChar
*right
, int32_t rightLength
,
970 UErrorCode
&errorCode
) const {
971 // U_FAILURE(errorCode) checked by caller.
972 if(left
== right
&& leftLength
== rightLength
) {
976 // Identical-prefix test.
977 const UChar
*leftLimit
;
978 const UChar
*rightLimit
;
979 int32_t equalPrefixLength
= 0;
984 while((c
= left
[equalPrefixLength
]) == right
[equalPrefixLength
]) {
985 if(c
== 0) { return UCOL_EQUAL
; }
989 leftLimit
= left
+ leftLength
;
990 rightLimit
= right
+ rightLength
;
992 if(equalPrefixLength
== leftLength
) {
993 if(equalPrefixLength
== rightLength
) { return UCOL_EQUAL
; }
995 } else if(equalPrefixLength
== rightLength
||
996 left
[equalPrefixLength
] != right
[equalPrefixLength
]) {
1003 UBool numeric
= settings
->isNumeric();
1004 if(equalPrefixLength
> 0) {
1005 if((equalPrefixLength
!= leftLength
&&
1006 data
->isUnsafeBackward(left
[equalPrefixLength
], numeric
)) ||
1007 (equalPrefixLength
!= rightLength
&&
1008 data
->isUnsafeBackward(right
[equalPrefixLength
], numeric
))) {
1009 // Identical prefix: Back up to the start of a contraction or reordering sequence.
1010 while(--equalPrefixLength
> 0 &&
1011 data
->isUnsafeBackward(left
[equalPrefixLength
], numeric
)) {}
1014 // - A longer string can compare equal to a prefix of it if only ignorables follow.
1015 // - With a backward level, a longer string can compare less-than a prefix of it.
1017 // Pass the actual start of each string into the CollationIterators,
1018 // plus the equalPrefixLength position,
1019 // so that prefix matches back into the equal prefix work.
1023 int32_t fastLatinOptions
= settings
->fastLatinOptions
;
1024 if(fastLatinOptions
>= 0 &&
1025 (equalPrefixLength
== leftLength
||
1026 left
[equalPrefixLength
] <= CollationFastLatin::LATIN_MAX
) &&
1027 (equalPrefixLength
== rightLength
||
1028 right
[equalPrefixLength
] <= CollationFastLatin::LATIN_MAX
)) {
1029 if(leftLength
>= 0) {
1030 result
= CollationFastLatin::compareUTF16(data
->fastLatinTable
,
1031 settings
->fastLatinPrimaries
,
1033 left
+ equalPrefixLength
,
1034 leftLength
- equalPrefixLength
,
1035 right
+ equalPrefixLength
,
1036 rightLength
- equalPrefixLength
);
1038 result
= CollationFastLatin::compareUTF16(data
->fastLatinTable
,
1039 settings
->fastLatinPrimaries
,
1041 left
+ equalPrefixLength
, -1,
1042 right
+ equalPrefixLength
, -1);
1045 result
= CollationFastLatin::BAIL_OUT_RESULT
;
1048 if(result
== CollationFastLatin::BAIL_OUT_RESULT
) {
1049 if(settings
->dontCheckFCD()) {
1050 UTF16CollationIterator
leftIter(data
, numeric
,
1051 left
, left
+ equalPrefixLength
, leftLimit
);
1052 UTF16CollationIterator
rightIter(data
, numeric
,
1053 right
, right
+ equalPrefixLength
, rightLimit
);
1054 result
= CollationCompare::compareUpToQuaternary(leftIter
, rightIter
, *settings
, errorCode
);
1056 FCDUTF16CollationIterator
leftIter(data
, numeric
,
1057 left
, left
+ equalPrefixLength
, leftLimit
);
1058 FCDUTF16CollationIterator
rightIter(data
, numeric
,
1059 right
, right
+ equalPrefixLength
, rightLimit
);
1060 result
= CollationCompare::compareUpToQuaternary(leftIter
, rightIter
, *settings
, errorCode
);
1063 if(result
!= UCOL_EQUAL
|| settings
->getStrength() < UCOL_IDENTICAL
|| U_FAILURE(errorCode
)) {
1064 return (UCollationResult
)result
;
1067 // Note: If NUL-terminated, we could get the actual limits from the iterators now.
1068 // That would complicate the iterators a bit, NUL-terminated strings are only a C convenience,
1069 // and the benefit seems unlikely to be measurable.
1071 // Compare identical level.
1072 const Normalizer2Impl
&nfcImpl
= data
->nfcImpl
;
1073 left
+= equalPrefixLength
;
1074 right
+= equalPrefixLength
;
1075 if(settings
->dontCheckFCD()) {
1076 UTF16NFDIterator
leftIter(left
, leftLimit
);
1077 UTF16NFDIterator
rightIter(right
, rightLimit
);
1078 return compareNFDIter(nfcImpl
, leftIter
, rightIter
);
1080 FCDUTF16NFDIterator
leftIter(nfcImpl
, left
, leftLimit
);
1081 FCDUTF16NFDIterator
rightIter(nfcImpl
, right
, rightLimit
);
1082 return compareNFDIter(nfcImpl
, leftIter
, rightIter
);
1087 RuleBasedCollator::doCompare(const uint8_t *left
, int32_t leftLength
,
1088 const uint8_t *right
, int32_t rightLength
,
1089 UErrorCode
&errorCode
) const {
1090 // U_FAILURE(errorCode) checked by caller.
1091 if(left
== right
&& leftLength
== rightLength
) {
1095 // Identical-prefix test.
1096 int32_t equalPrefixLength
= 0;
1097 if(leftLength
< 0) {
1099 while((c
= left
[equalPrefixLength
]) == right
[equalPrefixLength
]) {
1100 if(c
== 0) { return UCOL_EQUAL
; }
1101 ++equalPrefixLength
;
1105 if(equalPrefixLength
== leftLength
) {
1106 if(equalPrefixLength
== rightLength
) { return UCOL_EQUAL
; }
1108 } else if(equalPrefixLength
== rightLength
||
1109 left
[equalPrefixLength
] != right
[equalPrefixLength
]) {
1112 ++equalPrefixLength
;
1115 // Back up to the start of a partially-equal code point.
1116 if(equalPrefixLength
> 0 &&
1117 ((equalPrefixLength
!= leftLength
&& U8_IS_TRAIL(left
[equalPrefixLength
])) ||
1118 (equalPrefixLength
!= rightLength
&& U8_IS_TRAIL(right
[equalPrefixLength
])))) {
1119 while(--equalPrefixLength
> 0 && U8_IS_TRAIL(left
[equalPrefixLength
])) {}
1122 UBool numeric
= settings
->isNumeric();
1123 if(equalPrefixLength
> 0) {
1124 UBool unsafe
= FALSE
;
1125 if(equalPrefixLength
!= leftLength
) {
1126 int32_t i
= equalPrefixLength
;
1128 U8_NEXT_OR_FFFD(left
, i
, leftLength
, c
);
1129 unsafe
= data
->isUnsafeBackward(c
, numeric
);
1131 if(!unsafe
&& equalPrefixLength
!= rightLength
) {
1132 int32_t i
= equalPrefixLength
;
1134 U8_NEXT_OR_FFFD(right
, i
, rightLength
, c
);
1135 unsafe
= data
->isUnsafeBackward(c
, numeric
);
1138 // Identical prefix: Back up to the start of a contraction or reordering sequence.
1141 U8_PREV_OR_FFFD(left
, 0, equalPrefixLength
, c
);
1142 } while(equalPrefixLength
> 0 && data
->isUnsafeBackward(c
, numeric
));
1144 // See the notes in the UTF-16 version.
1146 // Pass the actual start of each string into the CollationIterators,
1147 // plus the equalPrefixLength position,
1148 // so that prefix matches back into the equal prefix work.
1152 int32_t fastLatinOptions
= settings
->fastLatinOptions
;
1153 if(fastLatinOptions
>= 0 &&
1154 (equalPrefixLength
== leftLength
||
1155 left
[equalPrefixLength
] <= CollationFastLatin::LATIN_MAX_UTF8_LEAD
) &&
1156 (equalPrefixLength
== rightLength
||
1157 right
[equalPrefixLength
] <= CollationFastLatin::LATIN_MAX_UTF8_LEAD
)) {
1158 if(leftLength
>= 0) {
1159 result
= CollationFastLatin::compareUTF8(data
->fastLatinTable
,
1160 settings
->fastLatinPrimaries
,
1162 left
+ equalPrefixLength
,
1163 leftLength
- equalPrefixLength
,
1164 right
+ equalPrefixLength
,
1165 rightLength
- equalPrefixLength
);
1167 result
= CollationFastLatin::compareUTF8(data
->fastLatinTable
,
1168 settings
->fastLatinPrimaries
,
1170 left
+ equalPrefixLength
, -1,
1171 right
+ equalPrefixLength
, -1);
1174 result
= CollationFastLatin::BAIL_OUT_RESULT
;
1177 if(result
== CollationFastLatin::BAIL_OUT_RESULT
) {
1178 if(settings
->dontCheckFCD()) {
1179 UTF8CollationIterator
leftIter(data
, numeric
, left
, equalPrefixLength
, leftLength
);
1180 UTF8CollationIterator
rightIter(data
, numeric
, right
, equalPrefixLength
, rightLength
);
1181 result
= CollationCompare::compareUpToQuaternary(leftIter
, rightIter
, *settings
, errorCode
);
1183 FCDUTF8CollationIterator
leftIter(data
, numeric
, left
, equalPrefixLength
, leftLength
);
1184 FCDUTF8CollationIterator
rightIter(data
, numeric
, right
, equalPrefixLength
, rightLength
);
1185 result
= CollationCompare::compareUpToQuaternary(leftIter
, rightIter
, *settings
, errorCode
);
1188 if(result
!= UCOL_EQUAL
|| settings
->getStrength() < UCOL_IDENTICAL
|| U_FAILURE(errorCode
)) {
1189 return (UCollationResult
)result
;
1192 // Note: If NUL-terminated, we could get the actual limits from the iterators now.
1193 // That would complicate the iterators a bit, NUL-terminated strings are only a C convenience,
1194 // and the benefit seems unlikely to be measurable.
1196 // Compare identical level.
1197 const Normalizer2Impl
&nfcImpl
= data
->nfcImpl
;
1198 left
+= equalPrefixLength
;
1199 right
+= equalPrefixLength
;
1200 if(leftLength
> 0) {
1201 leftLength
-= equalPrefixLength
;
1202 rightLength
-= equalPrefixLength
;
1204 if(settings
->dontCheckFCD()) {
1205 UTF8NFDIterator
leftIter(left
, leftLength
);
1206 UTF8NFDIterator
rightIter(right
, rightLength
);
1207 return compareNFDIter(nfcImpl
, leftIter
, rightIter
);
1209 FCDUTF8NFDIterator
leftIter(data
, left
, leftLength
);
1210 FCDUTF8NFDIterator
rightIter(data
, right
, rightLength
);
1211 return compareNFDIter(nfcImpl
, leftIter
, rightIter
);
1216 RuleBasedCollator::compare(UCharIterator
&left
, UCharIterator
&right
,
1217 UErrorCode
&errorCode
) const {
1218 if(U_FAILURE(errorCode
) || &left
== &right
) { return UCOL_EQUAL
; }
1219 UBool numeric
= settings
->isNumeric();
1221 // Identical-prefix test.
1222 int32_t equalPrefixLength
= 0;
1226 while((leftUnit
= left
.next(&left
)) == (rightUnit
= right
.next(&right
))) {
1227 if(leftUnit
< 0) { return UCOL_EQUAL
; }
1228 ++equalPrefixLength
;
1231 // Back out the code units that differed, for the real collation comparison.
1232 if(leftUnit
>= 0) { left
.previous(&left
); }
1233 if(rightUnit
>= 0) { right
.previous(&right
); }
1235 if(equalPrefixLength
> 0) {
1236 if((leftUnit
>= 0 && data
->isUnsafeBackward(leftUnit
, numeric
)) ||
1237 (rightUnit
>= 0 && data
->isUnsafeBackward(rightUnit
, numeric
))) {
1238 // Identical prefix: Back up to the start of a contraction or reordering sequence.
1240 --equalPrefixLength
;
1241 leftUnit
= left
.previous(&left
);
1242 right
.previous(&right
);
1243 } while(equalPrefixLength
> 0 && data
->isUnsafeBackward(leftUnit
, numeric
));
1245 // See the notes in the UTF-16 version.
1249 UCollationResult result
;
1250 if(settings
->dontCheckFCD()) {
1251 UIterCollationIterator
leftIter(data
, numeric
, left
);
1252 UIterCollationIterator
rightIter(data
, numeric
, right
);
1253 result
= CollationCompare::compareUpToQuaternary(leftIter
, rightIter
, *settings
, errorCode
);
1255 FCDUIterCollationIterator
leftIter(data
, numeric
, left
, equalPrefixLength
);
1256 FCDUIterCollationIterator
rightIter(data
, numeric
, right
, equalPrefixLength
);
1257 result
= CollationCompare::compareUpToQuaternary(leftIter
, rightIter
, *settings
, errorCode
);
1259 if(result
!= UCOL_EQUAL
|| settings
->getStrength() < UCOL_IDENTICAL
|| U_FAILURE(errorCode
)) {
1263 // Compare identical level.
1264 left
.move(&left
, equalPrefixLength
, UITER_ZERO
);
1265 right
.move(&right
, equalPrefixLength
, UITER_ZERO
);
1266 const Normalizer2Impl
&nfcImpl
= data
->nfcImpl
;
1267 if(settings
->dontCheckFCD()) {
1268 UIterNFDIterator
leftIter(left
);
1269 UIterNFDIterator
rightIter(right
);
1270 return compareNFDIter(nfcImpl
, leftIter
, rightIter
);
1272 FCDUIterNFDIterator
leftIter(data
, left
, equalPrefixLength
);
1273 FCDUIterNFDIterator
rightIter(data
, right
, equalPrefixLength
);
1274 return compareNFDIter(nfcImpl
, leftIter
, rightIter
);
1279 RuleBasedCollator::getCollationKey(const UnicodeString
&s
, CollationKey
&key
,
1280 UErrorCode
&errorCode
) const {
1281 return getCollationKey(s
.getBuffer(), s
.length(), key
, errorCode
);
1285 RuleBasedCollator::getCollationKey(const UChar
*s
, int32_t length
, CollationKey
& key
,
1286 UErrorCode
&errorCode
) const {
1287 if(U_FAILURE(errorCode
)) {
1288 return key
.setToBogus();
1290 if(s
== NULL
&& length
!= 0) {
1291 errorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
1292 return key
.setToBogus();
1294 key
.reset(); // resets the "bogus" state
1295 CollationKeyByteSink
sink(key
);
1296 writeSortKey(s
, length
, sink
, errorCode
);
1297 if(U_FAILURE(errorCode
)) {
1299 } else if(key
.isBogus()) {
1300 errorCode
= U_MEMORY_ALLOCATION_ERROR
;
1302 key
.setLength(sink
.NumberOfBytesAppended());
1308 RuleBasedCollator::getSortKey(const UnicodeString
&s
,
1309 uint8_t *dest
, int32_t capacity
) const {
1310 return getSortKey(s
.getBuffer(), s
.length(), dest
, capacity
);
1314 RuleBasedCollator::getSortKey(const UChar
*s
, int32_t length
,
1315 uint8_t *dest
, int32_t capacity
) const {
1316 if((s
== NULL
&& length
!= 0) || capacity
< 0 || (dest
== NULL
&& capacity
> 0)) {
1319 uint8_t noDest
[1] = { 0 };
1321 // Distinguish pure preflighting from an allocation error.
1325 FixedSortKeyByteSink
sink(reinterpret_cast<char *>(dest
), capacity
);
1326 UErrorCode errorCode
= U_ZERO_ERROR
;
1327 writeSortKey(s
, length
, sink
, errorCode
);
1328 return U_SUCCESS(errorCode
) ? sink
.NumberOfBytesAppended() : 0;
1332 RuleBasedCollator::writeSortKey(const UChar
*s
, int32_t length
,
1333 SortKeyByteSink
&sink
, UErrorCode
&errorCode
) const {
1334 if(U_FAILURE(errorCode
)) { return; }
1335 const UChar
*limit
= (length
>= 0) ? s
+ length
: NULL
;
1336 UBool numeric
= settings
->isNumeric();
1337 CollationKeys::LevelCallback callback
;
1338 if(settings
->dontCheckFCD()) {
1339 UTF16CollationIterator
iter(data
, numeric
, s
, s
, limit
);
1340 CollationKeys::writeSortKeyUpToQuaternary(iter
, data
->compressibleBytes
, *settings
,
1341 sink
, Collation::PRIMARY_LEVEL
,
1342 callback
, TRUE
, errorCode
);
1344 FCDUTF16CollationIterator
iter(data
, numeric
, s
, s
, limit
);
1345 CollationKeys::writeSortKeyUpToQuaternary(iter
, data
->compressibleBytes
, *settings
,
1346 sink
, Collation::PRIMARY_LEVEL
,
1347 callback
, TRUE
, errorCode
);
1349 if(settings
->getStrength() == UCOL_IDENTICAL
) {
1350 writeIdenticalLevel(s
, limit
, sink
, errorCode
);
1352 static const char terminator
= 0; // TERMINATOR_BYTE
1353 sink
.Append(&terminator
, 1);
1357 RuleBasedCollator::writeIdenticalLevel(const UChar
*s
, const UChar
*limit
,
1358 SortKeyByteSink
&sink
, UErrorCode
&errorCode
) const {
1360 const UChar
*nfdQCYesLimit
= data
->nfcImpl
.decompose(s
, limit
, NULL
, errorCode
);
1361 if(U_FAILURE(errorCode
)) { return; }
1362 sink
.Append(Collation::LEVEL_SEPARATOR_BYTE
);
1364 if(nfdQCYesLimit
!= s
) {
1365 prev
= u_writeIdenticalLevelRun(prev
, s
, (int32_t)(nfdQCYesLimit
- s
), sink
);
1367 // Is there non-NFD text?
1368 int32_t destLengthEstimate
;
1370 if(nfdQCYesLimit
== limit
) { return; }
1371 destLengthEstimate
= (int32_t)(limit
- nfdQCYesLimit
);
1373 // s is NUL-terminated
1374 if(*nfdQCYesLimit
== 0) { return; }
1375 destLengthEstimate
= -1;
1378 data
->nfcImpl
.decompose(nfdQCYesLimit
, limit
, nfd
, destLengthEstimate
, errorCode
);
1379 u_writeIdenticalLevelRun(prev
, nfd
.getBuffer(), nfd
.length(), sink
);
1385 * internalNextSortKeyPart() calls CollationKeys::writeSortKeyUpToQuaternary()
1386 * with an instance of this callback class.
1387 * When another level is about to be written, the callback
1388 * records the level and the number of bytes that will be written until
1389 * the sink (which is actually a FixedSortKeyByteSink) fills up.
1391 * When internalNextSortKeyPart() is called again, it restarts with the last level
1392 * and ignores as many bytes as were written previously for that level.
1394 class PartLevelCallback
: public CollationKeys::LevelCallback
{
1396 PartLevelCallback(const SortKeyByteSink
&s
)
1397 : sink(s
), level(Collation::PRIMARY_LEVEL
) {
1398 levelCapacity
= sink
.GetRemainingCapacity();
1400 virtual ~PartLevelCallback() {}
1401 virtual UBool
needToWrite(Collation::Level l
) {
1402 if(!sink
.Overflowed()) {
1403 // Remember a level that will be at least partially written.
1405 levelCapacity
= sink
.GetRemainingCapacity();
1411 Collation::Level
getLevel() const { return level
; }
1412 int32_t getLevelCapacity() const { return levelCapacity
; }
1415 const SortKeyByteSink
&sink
;
1416 Collation::Level level
;
1417 int32_t levelCapacity
;
1423 RuleBasedCollator::internalNextSortKeyPart(UCharIterator
*iter
, uint32_t state
[2],
1424 uint8_t *dest
, int32_t count
, UErrorCode
&errorCode
) const {
1425 if(U_FAILURE(errorCode
)) { return 0; }
1426 if(iter
== NULL
|| state
== NULL
|| count
< 0 || (count
> 0 && dest
== NULL
)) {
1427 errorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
1430 if(count
== 0) { return 0; }
1432 FixedSortKeyByteSink
sink(reinterpret_cast<char *>(dest
), count
);
1433 sink
.IgnoreBytes((int32_t)state
[1]);
1434 iter
->move(iter
, 0, UITER_START
);
1436 Collation::Level level
= (Collation::Level
)state
[0];
1437 if(level
<= Collation::QUATERNARY_LEVEL
) {
1438 UBool numeric
= settings
->isNumeric();
1439 PartLevelCallback
callback(sink
);
1440 if(settings
->dontCheckFCD()) {
1441 UIterCollationIterator
ci(data
, numeric
, *iter
);
1442 CollationKeys::writeSortKeyUpToQuaternary(ci
, data
->compressibleBytes
, *settings
,
1443 sink
, level
, callback
, FALSE
, errorCode
);
1445 FCDUIterCollationIterator
ci(data
, numeric
, *iter
, 0);
1446 CollationKeys::writeSortKeyUpToQuaternary(ci
, data
->compressibleBytes
, *settings
,
1447 sink
, level
, callback
, FALSE
, errorCode
);
1449 if(U_FAILURE(errorCode
)) { return 0; }
1450 if(sink
.NumberOfBytesAppended() > count
) {
1451 state
[0] = (uint32_t)callback
.getLevel();
1452 state
[1] = (uint32_t)callback
.getLevelCapacity();
1455 // All of the normal levels are done.
1456 if(settings
->getStrength() == UCOL_IDENTICAL
) {
1457 level
= Collation::IDENTICAL_LEVEL
;
1458 iter
->move(iter
, 0, UITER_START
);
1460 // else fall through to setting ZERO_LEVEL
1463 if(level
== Collation::IDENTICAL_LEVEL
) {
1464 int32_t levelCapacity
= sink
.GetRemainingCapacity();
1467 UChar32 c
= iter
->next(iter
);
1468 if(c
< 0) { break; }
1471 const UChar
*sArray
= s
.getBuffer();
1472 writeIdenticalLevel(sArray
, sArray
+ s
.length(), sink
, errorCode
);
1473 if(U_FAILURE(errorCode
)) { return 0; }
1474 if(sink
.NumberOfBytesAppended() > count
) {
1475 state
[0] = (uint32_t)level
;
1476 state
[1] = (uint32_t)levelCapacity
;
1481 // ZERO_LEVEL: Fill the remainder of dest with 00 bytes.
1482 state
[0] = (uint32_t)Collation::ZERO_LEVEL
;
1484 int32_t length
= sink
.NumberOfBytesAppended();
1486 while(i
< count
) { dest
[i
++] = 0; }
1491 RuleBasedCollator::internalGetCEs(const UnicodeString
&str
, UVector64
&ces
,
1492 UErrorCode
&errorCode
) const {
1493 if(U_FAILURE(errorCode
)) { return; }
1494 const UChar
*s
= str
.getBuffer();
1495 const UChar
*limit
= s
+ str
.length();
1496 UBool numeric
= settings
->isNumeric();
1497 if(settings
->dontCheckFCD()) {
1498 UTF16CollationIterator
iter(data
, numeric
, s
, s
, limit
);
1500 while((ce
= iter
.nextCE(errorCode
)) != Collation::NO_CE
) {
1501 ces
.addElement(ce
, errorCode
);
1504 FCDUTF16CollationIterator
iter(data
, numeric
, s
, s
, limit
);
1506 while((ce
= iter
.nextCE(errorCode
)) != Collation::NO_CE
) {
1507 ces
.addElement(ce
, errorCode
);
1514 void appendSubtag(CharString
&s
, char letter
, const char *subtag
, int32_t length
,
1515 UErrorCode
&errorCode
) {
1516 if(U_FAILURE(errorCode
) || length
== 0) { return; }
1518 s
.append('_', errorCode
);
1520 s
.append(letter
, errorCode
);
1521 for(int32_t i
= 0; i
< length
; ++i
) {
1522 s
.append(uprv_toupper(subtag
[i
]), errorCode
);
1526 void appendAttribute(CharString
&s
, char letter
, UColAttributeValue value
,
1527 UErrorCode
&errorCode
) {
1528 if(U_FAILURE(errorCode
)) { return; }
1530 s
.append('_', errorCode
);
1532 static const char *valueChars
= "1234...........IXO..SN..LU......";
1533 s
.append(letter
, errorCode
);
1534 s
.append(valueChars
[value
], errorCode
);
1540 RuleBasedCollator::internalGetShortDefinitionString(const char *locale
,
1541 char *buffer
, int32_t capacity
,
1542 UErrorCode
&errorCode
) const {
1543 if(U_FAILURE(errorCode
)) { return 0; }
1544 if(buffer
== NULL
? capacity
!= 0 : capacity
< 0) {
1545 errorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
1548 if(locale
== NULL
) {
1549 locale
= internalGetLocaleID(ULOC_VALID_LOCALE
, errorCode
);
1552 char resultLocale
[ULOC_FULLNAME_CAPACITY
+ 1];
1553 int32_t length
= ucol_getFunctionalEquivalent(resultLocale
, ULOC_FULLNAME_CAPACITY
,
1554 "collation", locale
,
1556 if(U_FAILURE(errorCode
)) { return 0; }
1558 uprv_strcpy(resultLocale
, "root");
1560 resultLocale
[length
] = 0;
1563 // Append items in alphabetic order of their short definition letters.
1565 char subtag
[ULOC_KEYWORD_AND_VALUES_CAPACITY
];
1567 if(attributeHasBeenSetExplicitly(UCOL_ALTERNATE_HANDLING
)) {
1568 appendAttribute(result
, 'A', getAttribute(UCOL_ALTERNATE_HANDLING
, errorCode
), errorCode
);
1570 // ATTR_VARIABLE_TOP not supported because 'B' was broken.
1571 // See ICU tickets #10372 and #10386.
1572 if(attributeHasBeenSetExplicitly(UCOL_CASE_FIRST
)) {
1573 appendAttribute(result
, 'C', getAttribute(UCOL_CASE_FIRST
, errorCode
), errorCode
);
1575 if(attributeHasBeenSetExplicitly(UCOL_NUMERIC_COLLATION
)) {
1576 appendAttribute(result
, 'D', getAttribute(UCOL_NUMERIC_COLLATION
, errorCode
), errorCode
);
1578 if(attributeHasBeenSetExplicitly(UCOL_CASE_LEVEL
)) {
1579 appendAttribute(result
, 'E', getAttribute(UCOL_CASE_LEVEL
, errorCode
), errorCode
);
1581 if(attributeHasBeenSetExplicitly(UCOL_FRENCH_COLLATION
)) {
1582 appendAttribute(result
, 'F', getAttribute(UCOL_FRENCH_COLLATION
, errorCode
), errorCode
);
1584 // Note: UCOL_HIRAGANA_QUATERNARY_MODE is deprecated and never changes away from default.
1585 length
= uloc_getKeywordValue(resultLocale
, "collation", subtag
, UPRV_LENGTHOF(subtag
), &errorCode
);
1586 appendSubtag(result
, 'K', subtag
, length
, errorCode
);
1587 length
= uloc_getLanguage(resultLocale
, subtag
, UPRV_LENGTHOF(subtag
), &errorCode
);
1588 appendSubtag(result
, 'L', subtag
, length
, errorCode
);
1589 if(attributeHasBeenSetExplicitly(UCOL_NORMALIZATION_MODE
)) {
1590 appendAttribute(result
, 'N', getAttribute(UCOL_NORMALIZATION_MODE
, errorCode
), errorCode
);
1592 length
= uloc_getCountry(resultLocale
, subtag
, UPRV_LENGTHOF(subtag
), &errorCode
);
1593 appendSubtag(result
, 'R', subtag
, length
, errorCode
);
1594 if(attributeHasBeenSetExplicitly(UCOL_STRENGTH
)) {
1595 appendAttribute(result
, 'S', getAttribute(UCOL_STRENGTH
, errorCode
), errorCode
);
1597 length
= uloc_getVariant(resultLocale
, subtag
, UPRV_LENGTHOF(subtag
), &errorCode
);
1598 appendSubtag(result
, 'V', subtag
, length
, errorCode
);
1599 length
= uloc_getScript(resultLocale
, subtag
, UPRV_LENGTHOF(subtag
), &errorCode
);
1600 appendSubtag(result
, 'Z', subtag
, length
, errorCode
);
1602 if(U_FAILURE(errorCode
)) { return 0; }
1603 if(result
.length() <= capacity
) {
1604 uprv_memcpy(buffer
, result
.data(), result
.length());
1606 return u_terminateChars(buffer
, capacity
, result
.length(), &errorCode
);
1610 RuleBasedCollator::isUnsafe(UChar32 c
) const {
1611 return data
->isUnsafeBackward(c
, settings
->isNumeric());
1615 RuleBasedCollator::computeMaxExpansions(const CollationTailoring
*t
, UErrorCode
&errorCode
) {
1616 t
->maxExpansions
= CollationElementIterator::computeMaxExpansions(t
->data
, errorCode
);
1620 RuleBasedCollator::initMaxExpansions(UErrorCode
&errorCode
) const {
1621 umtx_initOnce(tailoring
->maxExpansionsInitOnce
, computeMaxExpansions
, tailoring
, errorCode
);
1622 return U_SUCCESS(errorCode
);
1625 CollationElementIterator
*
1626 RuleBasedCollator::createCollationElementIterator(const UnicodeString
& source
) const {
1627 UErrorCode errorCode
= U_ZERO_ERROR
;
1628 if(!initMaxExpansions(errorCode
)) { return NULL
; }
1629 CollationElementIterator
*cei
= new CollationElementIterator(source
, this, errorCode
);
1630 if(U_FAILURE(errorCode
)) {
1637 CollationElementIterator
*
1638 RuleBasedCollator::createCollationElementIterator(const CharacterIterator
& source
) const {
1639 UErrorCode errorCode
= U_ZERO_ERROR
;
1640 if(!initMaxExpansions(errorCode
)) { return NULL
; }
1641 CollationElementIterator
*cei
= new CollationElementIterator(source
, this, errorCode
);
1642 if(U_FAILURE(errorCode
)) {
1650 RuleBasedCollator::getMaxExpansion(int32_t order
) const {
1651 UErrorCode errorCode
= U_ZERO_ERROR
;
1652 (void)initMaxExpansions(errorCode
);
1653 return CollationElementIterator::getMaxExpansion(tailoring
->maxExpansions
, order
);
1658 #endif // !UCONFIG_NO_COLLATION