/*
*******************************************************************************
* Copyright (C) 1996-2015, International Business Machines
* Corporation and others.  All Rights Reserved.
*******************************************************************************
* rulebasedcollator.cpp
*
* (replaced the former tblcoll.cpp)
*
* created on: 2012feb14 with new and old collation code
* created by: Markus W. Scherer
*/
14 #include "unicode/utypes.h"
16 #if !UCONFIG_NO_COLLATION
18 #include "unicode/coll.h"
19 #include "unicode/coleitr.h"
20 #include "unicode/localpointer.h"
21 #include "unicode/locid.h"
22 #include "unicode/sortkey.h"
23 #include "unicode/tblcoll.h"
24 #include "unicode/ucol.h"
25 #include "unicode/uiter.h"
26 #include "unicode/uloc.h"
27 #include "unicode/uniset.h"
28 #include "unicode/unistr.h"
29 #include "unicode/usetiter.h"
30 #include "unicode/utf8.h"
31 #include "unicode/uversion.h"
35 #include "collation.h"
36 #include "collationcompare.h"
37 #include "collationdata.h"
38 #include "collationdatareader.h"
39 #include "collationfastlatin.h"
40 #include "collationiterator.h"
41 #include "collationkeys.h"
42 #include "collationroot.h"
43 #include "collationsets.h"
44 #include "collationsettings.h"
45 #include "collationtailoring.h"
50 #include "uitercollationiterator.h"
52 #include "utf16collationiterator.h"
53 #include "utf8collationiterator.h"
60 class FixedSortKeyByteSink
: public SortKeyByteSink
{
62 FixedSortKeyByteSink(char *dest
, int32_t destCapacity
)
63 : SortKeyByteSink(dest
, destCapacity
) {}
64 virtual ~FixedSortKeyByteSink();
67 virtual void AppendBeyondCapacity(const char *bytes
, int32_t n
, int32_t length
);
68 virtual UBool
Resize(int32_t appendCapacity
, int32_t length
);
71 FixedSortKeyByteSink::~FixedSortKeyByteSink() {}
74 FixedSortKeyByteSink::AppendBeyondCapacity(const char *bytes
, int32_t /*n*/, int32_t length
) {
75 // buffer_ != NULL && bytes != NULL && n > 0 && appended_ > capacity_
76 // Fill the buffer completely.
77 int32_t available
= capacity_
- length
;
79 uprv_memcpy(buffer_
+ length
, bytes
, available
);
84 FixedSortKeyByteSink::Resize(int32_t /*appendCapacity*/, int32_t /*length*/) {
90 // Not in an anonymous namespace, so that it can be a friend of CollationKey.
91 class CollationKeyByteSink
: public SortKeyByteSink
{
93 CollationKeyByteSink(CollationKey
&key
)
94 : SortKeyByteSink(reinterpret_cast<char *>(key
.getBytes()), key
.getCapacity()),
96 virtual ~CollationKeyByteSink();
99 virtual void AppendBeyondCapacity(const char *bytes
, int32_t n
, int32_t length
);
100 virtual UBool
Resize(int32_t appendCapacity
, int32_t length
);
105 CollationKeyByteSink::~CollationKeyByteSink() {}
108 CollationKeyByteSink::AppendBeyondCapacity(const char *bytes
, int32_t n
, int32_t length
) {
109 // buffer_ != NULL && bytes != NULL && n > 0 && appended_ > capacity_
110 if (Resize(n
, length
)) {
111 uprv_memcpy(buffer_
+ length
, bytes
, n
);
116 CollationKeyByteSink::Resize(int32_t appendCapacity
, int32_t length
) {
117 if (buffer_
== NULL
) {
118 return FALSE
; // allocation failed before already
120 int32_t newCapacity
= 2 * capacity_
;
121 int32_t altCapacity
= length
+ 2 * appendCapacity
;
122 if (newCapacity
< altCapacity
) {
123 newCapacity
= altCapacity
;
125 if (newCapacity
< 200) {
128 uint8_t *newBuffer
= key_
.reallocate(newCapacity
, length
);
129 if (newBuffer
== NULL
) {
133 buffer_
= reinterpret_cast<char *>(newBuffer
);
134 capacity_
= newCapacity
;
138 RuleBasedCollator::RuleBasedCollator(const RuleBasedCollator
&other
)
141 settings(other
.settings
),
142 tailoring(other
.tailoring
),
143 cacheEntry(other
.cacheEntry
),
144 validLocale(other
.validLocale
),
145 explicitlySetAttributes(other
.explicitlySetAttributes
),
146 actualLocaleIsSameAsValid(other
.actualLocaleIsSameAsValid
) {
148 cacheEntry
->addRef();
151 RuleBasedCollator::RuleBasedCollator(const uint8_t *bin
, int32_t length
,
152 const RuleBasedCollator
*base
, UErrorCode
&errorCode
)
158 explicitlySetAttributes(0),
159 actualLocaleIsSameAsValid(FALSE
) {
160 if(U_FAILURE(errorCode
)) { return; }
161 if(bin
== NULL
|| length
== 0 || base
== NULL
) {
162 errorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
165 const CollationTailoring
*root
= CollationRoot::getRoot(errorCode
);
166 if(U_FAILURE(errorCode
)) { return; }
167 if(base
->tailoring
!= root
) {
168 errorCode
= U_UNSUPPORTED_ERROR
;
171 LocalPointer
<CollationTailoring
> t(new CollationTailoring(base
->tailoring
->settings
));
172 if(t
.isNull() || t
->isBogus()) {
173 errorCode
= U_MEMORY_ALLOCATION_ERROR
;
176 CollationDataReader::read(base
->tailoring
, bin
, length
, *t
, errorCode
);
177 if(U_FAILURE(errorCode
)) { return; }
178 t
->actualLocale
.setToBogus();
179 adoptTailoring(t
.orphan(), errorCode
);
182 RuleBasedCollator::RuleBasedCollator(const CollationCacheEntry
*entry
)
183 : data(entry
->tailoring
->data
),
184 settings(entry
->tailoring
->settings
),
185 tailoring(entry
->tailoring
),
187 validLocale(entry
->validLocale
),
188 explicitlySetAttributes(0),
189 actualLocaleIsSameAsValid(FALSE
) {
191 cacheEntry
->addRef();
194 RuleBasedCollator::~RuleBasedCollator() {
195 SharedObject::clearPtr(settings
);
196 SharedObject::clearPtr(cacheEntry
);
200 RuleBasedCollator::adoptTailoring(CollationTailoring
*t
, UErrorCode
&errorCode
) {
201 if(U_FAILURE(errorCode
)) {
202 t
->deleteIfZeroRefCount();
205 U_ASSERT(settings
== NULL
&& data
== NULL
&& tailoring
== NULL
&& cacheEntry
== NULL
);
206 cacheEntry
= new CollationCacheEntry(t
->actualLocale
, t
);
207 if(cacheEntry
== NULL
) {
208 errorCode
= U_MEMORY_ALLOCATION_ERROR
;
209 t
->deleteIfZeroRefCount();
213 settings
= t
->settings
;
216 cacheEntry
->addRef();
217 validLocale
= t
->actualLocale
;
218 actualLocaleIsSameAsValid
= FALSE
;
222 RuleBasedCollator::clone() const {
223 return new RuleBasedCollator(*this);
226 RuleBasedCollator
&RuleBasedCollator::operator=(const RuleBasedCollator
&other
) {
227 if(this == &other
) { return *this; }
228 SharedObject::copyPtr(other
.settings
, settings
);
229 tailoring
= other
.tailoring
;
230 SharedObject::copyPtr(other
.cacheEntry
, cacheEntry
);
231 data
= tailoring
->data
;
232 validLocale
= other
.validLocale
;
233 explicitlySetAttributes
= other
.explicitlySetAttributes
;
234 actualLocaleIsSameAsValid
= other
.actualLocaleIsSameAsValid
;
238 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedCollator
)
241 RuleBasedCollator::operator==(const Collator
& other
) const {
242 if(this == &other
) { return TRUE
; }
243 if(!Collator::operator==(other
)) { return FALSE
; }
244 const RuleBasedCollator
&o
= static_cast<const RuleBasedCollator
&>(other
);
245 if(*settings
!= *o
.settings
) { return FALSE
; }
246 if(data
== o
.data
) { return TRUE
; }
247 UBool thisIsRoot
= data
->base
== NULL
;
248 UBool otherIsRoot
= o
.data
->base
== NULL
;
249 U_ASSERT(!thisIsRoot
|| !otherIsRoot
); // otherwise their data pointers should be ==
250 if(thisIsRoot
!= otherIsRoot
) { return FALSE
; }
251 if((thisIsRoot
|| !tailoring
->rules
.isEmpty()) &&
252 (otherIsRoot
|| !o
.tailoring
->rules
.isEmpty())) {
253 // Shortcut: If both collators have valid rule strings, then compare those.
254 if(tailoring
->rules
== o
.tailoring
->rules
) { return TRUE
; }
256 // Different rule strings can result in the same or equivalent tailoring.
257 // The rule strings are optional in ICU resource bundles, although included by default.
258 // cloneBinary() drops the rule string.
259 UErrorCode errorCode
= U_ZERO_ERROR
;
260 LocalPointer
<UnicodeSet
> thisTailored(getTailoredSet(errorCode
));
261 LocalPointer
<UnicodeSet
> otherTailored(o
.getTailoredSet(errorCode
));
262 if(U_FAILURE(errorCode
)) { return FALSE
; }
263 if(*thisTailored
!= *otherTailored
) { return FALSE
; }
264 // For completeness, we should compare all of the mappings;
265 // or we should create a list of strings, sort it with one collator,
266 // and check if both collators compare adjacent strings the same
267 // (order & strength, down to quaternary); or similar.
268 // Testing equality of collators seems unusual.
273 RuleBasedCollator::hashCode() const {
274 int32_t h
= settings
->hashCode();
275 if(data
->base
== NULL
) { return h
; } // root collator
276 // Do not rely on the rule string, see comments in operator==().
277 UErrorCode errorCode
= U_ZERO_ERROR
;
278 LocalPointer
<UnicodeSet
> set(getTailoredSet(errorCode
));
279 if(U_FAILURE(errorCode
)) { return 0; }
280 UnicodeSetIterator
iter(*set
);
281 while(iter
.next() && !iter
.isString()) {
282 h
^= data
->getCE32(iter
.getCodepoint());
288 RuleBasedCollator::setLocales(const Locale
&requested
, const Locale
&valid
,
289 const Locale
&actual
) {
290 if(actual
== tailoring
->actualLocale
) {
291 actualLocaleIsSameAsValid
= FALSE
;
293 U_ASSERT(actual
== valid
);
294 actualLocaleIsSameAsValid
= TRUE
;
296 // Do not modify tailoring.actualLocale:
297 // We cannot be sure that that would be thread-safe.
299 (void)requested
; // Ignore, see also ticket #10477.
303 RuleBasedCollator::getLocale(ULocDataLocaleType type
, UErrorCode
& errorCode
) const {
304 if(U_FAILURE(errorCode
)) {
305 return Locale::getRoot();
308 case ULOC_ACTUAL_LOCALE
:
309 return actualLocaleIsSameAsValid
? validLocale
: tailoring
->actualLocale
;
310 case ULOC_VALID_LOCALE
:
311 case ULOC_REQUESTED_LOCALE
: // Apple: keep treating as ULOC_VALID_LOCALE, apps depend on it <rdar://problem/19546211>
314 errorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
315 return Locale::getRoot();
320 RuleBasedCollator::internalGetLocaleID(ULocDataLocaleType type
, UErrorCode
&errorCode
) const {
321 if(U_FAILURE(errorCode
)) {
324 const Locale
*result
;
326 case ULOC_ACTUAL_LOCALE
:
327 result
= actualLocaleIsSameAsValid
? &validLocale
: &tailoring
->actualLocale
;
329 case ULOC_VALID_LOCALE
:
330 case ULOC_REQUESTED_LOCALE
: // Apple: keep treating as ULOC_VALID_LOCALE, apps depend on it <rdar://problem/19546211>
331 result
= &validLocale
;
334 errorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
337 if(result
->isBogus()) { return NULL
; }
338 const char *id
= result
->getName();
339 return id
[0] == 0 ? "root" : id
;
343 RuleBasedCollator::getRules() const {
344 return tailoring
->rules
;
348 RuleBasedCollator::getRules(UColRuleOption delta
, UnicodeString
&buffer
) const {
349 if(delta
== UCOL_TAILORING_ONLY
) {
350 buffer
= tailoring
->rules
;
355 CollationLoader::appendRootRules(buffer
);
356 buffer
.append(tailoring
->rules
).getTerminatedBuffer();
360 RuleBasedCollator::getVersion(UVersionInfo version
) const {
361 uprv_memcpy(version
, tailoring
->version
, U_MAX_VERSION_LENGTH
);
362 version
[0] += (UCOL_RUNTIME_VERSION
<< 4) + (UCOL_RUNTIME_VERSION
>> 4);
366 RuleBasedCollator::getTailoredSet(UErrorCode
&errorCode
) const {
367 if(U_FAILURE(errorCode
)) { return NULL
; }
368 UnicodeSet
*tailored
= new UnicodeSet();
369 if(tailored
== NULL
) {
370 errorCode
= U_MEMORY_ALLOCATION_ERROR
;
373 if(data
->base
!= NULL
) {
374 TailoredSet(tailored
).forData(data
, errorCode
);
375 if(U_FAILURE(errorCode
)) {
384 RuleBasedCollator::internalGetContractionsAndExpansions(
385 UnicodeSet
*contractions
, UnicodeSet
*expansions
,
386 UBool addPrefixes
, UErrorCode
&errorCode
) const {
387 if(U_FAILURE(errorCode
)) { return; }
388 if(contractions
!= NULL
) {
389 contractions
->clear();
391 if(expansions
!= NULL
) {
394 ContractionsAndExpansions(contractions
, expansions
, NULL
, addPrefixes
).forData(data
, errorCode
);
398 RuleBasedCollator::internalAddContractions(UChar32 c
, UnicodeSet
&set
, UErrorCode
&errorCode
) const {
399 if(U_FAILURE(errorCode
)) { return; }
400 ContractionsAndExpansions(&set
, NULL
, NULL
, FALSE
).forCodePoint(data
, c
, errorCode
);
403 const CollationSettings
&
404 RuleBasedCollator::getDefaultSettings() const {
405 return *tailoring
->settings
;
409 RuleBasedCollator::getAttribute(UColAttribute attr
, UErrorCode
&errorCode
) const {
410 if(U_FAILURE(errorCode
)) { return UCOL_DEFAULT
; }
413 case UCOL_FRENCH_COLLATION
:
414 option
= CollationSettings::BACKWARD_SECONDARY
;
416 case UCOL_ALTERNATE_HANDLING
:
417 return settings
->getAlternateHandling();
418 case UCOL_CASE_FIRST
:
419 return settings
->getCaseFirst();
420 case UCOL_CASE_LEVEL
:
421 option
= CollationSettings::CASE_LEVEL
;
423 case UCOL_NORMALIZATION_MODE
:
424 option
= CollationSettings::CHECK_FCD
;
427 return (UColAttributeValue
)settings
->getStrength();
428 case UCOL_HIRAGANA_QUATERNARY_MODE
:
429 // Deprecated attribute, unsettable.
431 case UCOL_NUMERIC_COLLATION
:
432 option
= CollationSettings::NUMERIC
;
435 errorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
438 return ((settings
->options
& option
) == 0) ? UCOL_OFF
: UCOL_ON
;
442 RuleBasedCollator::setAttribute(UColAttribute attr
, UColAttributeValue value
,
443 UErrorCode
&errorCode
) {
444 UColAttributeValue oldValue
= getAttribute(attr
, errorCode
);
445 if(U_FAILURE(errorCode
)) { return; }
446 if(value
== oldValue
) {
447 setAttributeExplicitly(attr
);
450 const CollationSettings
&defaultSettings
= getDefaultSettings();
451 if(settings
== &defaultSettings
) {
452 if(value
== UCOL_DEFAULT
) {
453 setAttributeDefault(attr
);
457 CollationSettings
*ownedSettings
= SharedObject::copyOnWrite(settings
);
458 if(ownedSettings
== NULL
) {
459 errorCode
= U_MEMORY_ALLOCATION_ERROR
;
464 case UCOL_FRENCH_COLLATION
:
465 ownedSettings
->setFlag(CollationSettings::BACKWARD_SECONDARY
, value
,
466 defaultSettings
.options
, errorCode
);
468 case UCOL_ALTERNATE_HANDLING
:
469 ownedSettings
->setAlternateHandling(value
, defaultSettings
.options
, errorCode
);
471 case UCOL_CASE_FIRST
:
472 ownedSettings
->setCaseFirst(value
, defaultSettings
.options
, errorCode
);
474 case UCOL_CASE_LEVEL
:
475 ownedSettings
->setFlag(CollationSettings::CASE_LEVEL
, value
,
476 defaultSettings
.options
, errorCode
);
478 case UCOL_NORMALIZATION_MODE
:
479 ownedSettings
->setFlag(CollationSettings::CHECK_FCD
, value
,
480 defaultSettings
.options
, errorCode
);
483 ownedSettings
->setStrength(value
, defaultSettings
.options
, errorCode
);
485 case UCOL_HIRAGANA_QUATERNARY_MODE
:
486 // Deprecated attribute. Check for valid values but do not change anything.
487 if(value
!= UCOL_OFF
&& value
!= UCOL_ON
&& value
!= UCOL_DEFAULT
) {
488 errorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
491 case UCOL_NUMERIC_COLLATION
:
492 ownedSettings
->setFlag(CollationSettings::NUMERIC
, value
, defaultSettings
.options
, errorCode
);
495 errorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
498 if(U_FAILURE(errorCode
)) { return; }
499 setFastLatinOptions(*ownedSettings
);
500 if(value
== UCOL_DEFAULT
) {
501 setAttributeDefault(attr
);
503 setAttributeExplicitly(attr
);
508 RuleBasedCollator::setMaxVariable(UColReorderCode group
, UErrorCode
&errorCode
) {
509 if(U_FAILURE(errorCode
)) { return *this; }
510 // Convert the reorder code into a MaxVariable number, or UCOL_DEFAULT=-1.
512 if(group
== UCOL_REORDER_CODE_DEFAULT
) {
513 value
= UCOL_DEFAULT
;
514 } else if(UCOL_REORDER_CODE_FIRST
<= group
&& group
<= UCOL_REORDER_CODE_CURRENCY
) {
515 value
= group
- UCOL_REORDER_CODE_FIRST
;
517 errorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
520 CollationSettings::MaxVariable oldValue
= settings
->getMaxVariable();
521 if(value
== oldValue
) {
522 setAttributeExplicitly(ATTR_VARIABLE_TOP
);
525 const CollationSettings
&defaultSettings
= getDefaultSettings();
526 if(settings
== &defaultSettings
) {
527 if(value
== UCOL_DEFAULT
) {
528 setAttributeDefault(ATTR_VARIABLE_TOP
);
532 CollationSettings
*ownedSettings
= SharedObject::copyOnWrite(settings
);
533 if(ownedSettings
== NULL
) {
534 errorCode
= U_MEMORY_ALLOCATION_ERROR
;
538 if(group
== UCOL_REORDER_CODE_DEFAULT
) {
539 group
= (UColReorderCode
)(UCOL_REORDER_CODE_FIRST
+ defaultSettings
.getMaxVariable());
541 uint32_t varTop
= data
->getLastPrimaryForGroup(group
);
542 U_ASSERT(varTop
!= 0);
543 ownedSettings
->setMaxVariable(value
, defaultSettings
.options
, errorCode
);
544 if(U_FAILURE(errorCode
)) { return *this; }
545 ownedSettings
->variableTop
= varTop
;
546 setFastLatinOptions(*ownedSettings
);
547 if(value
== UCOL_DEFAULT
) {
548 setAttributeDefault(ATTR_VARIABLE_TOP
);
550 setAttributeExplicitly(ATTR_VARIABLE_TOP
);
556 RuleBasedCollator::getMaxVariable() const {
557 return (UColReorderCode
)(UCOL_REORDER_CODE_FIRST
+ settings
->getMaxVariable());
561 RuleBasedCollator::getVariableTop(UErrorCode
& /*errorCode*/) const {
562 return settings
->variableTop
;
566 RuleBasedCollator::setVariableTop(const UChar
*varTop
, int32_t len
, UErrorCode
&errorCode
) {
567 if(U_FAILURE(errorCode
)) { return 0; }
568 if(varTop
== NULL
&& len
!=0) {
569 errorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
572 if(len
< 0) { len
= u_strlen(varTop
); }
574 errorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
577 UBool numeric
= settings
->isNumeric();
579 if(settings
->dontCheckFCD()) {
580 UTF16CollationIterator
ci(data
, numeric
, varTop
, varTop
, varTop
+ len
);
581 ce1
= ci
.nextCE(errorCode
);
582 ce2
= ci
.nextCE(errorCode
);
584 FCDUTF16CollationIterator
ci(data
, numeric
, varTop
, varTop
, varTop
+ len
);
585 ce1
= ci
.nextCE(errorCode
);
586 ce2
= ci
.nextCE(errorCode
);
588 if(ce1
== Collation::NO_CE
|| ce2
!= Collation::NO_CE
) {
589 errorCode
= U_CE_NOT_FOUND_ERROR
;
592 setVariableTop((uint32_t)(ce1
>> 32), errorCode
);
593 return settings
->variableTop
;
597 RuleBasedCollator::setVariableTop(const UnicodeString
&varTop
, UErrorCode
&errorCode
) {
598 return setVariableTop(varTop
.getBuffer(), varTop
.length(), errorCode
);
602 RuleBasedCollator::setVariableTop(uint32_t varTop
, UErrorCode
&errorCode
) {
603 if(U_FAILURE(errorCode
)) { return; }
604 if(varTop
!= settings
->variableTop
) {
605 // Pin the variable top to the end of the reordering group which contains it.
606 // Only a few special groups are supported.
607 int32_t group
= data
->getGroupForPrimary(varTop
);
608 if(group
< UCOL_REORDER_CODE_FIRST
|| UCOL_REORDER_CODE_CURRENCY
< group
) {
609 errorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
612 uint32_t v
= data
->getLastPrimaryForGroup(group
);
613 U_ASSERT(v
!= 0 && v
>= varTop
);
615 if(varTop
!= settings
->variableTop
) {
616 CollationSettings
*ownedSettings
= SharedObject::copyOnWrite(settings
);
617 if(ownedSettings
== NULL
) {
618 errorCode
= U_MEMORY_ALLOCATION_ERROR
;
621 ownedSettings
->setMaxVariable(group
- UCOL_REORDER_CODE_FIRST
,
622 getDefaultSettings().options
, errorCode
);
623 if(U_FAILURE(errorCode
)) { return; }
624 ownedSettings
->variableTop
= varTop
;
625 setFastLatinOptions(*ownedSettings
);
628 if(varTop
== getDefaultSettings().variableTop
) {
629 setAttributeDefault(ATTR_VARIABLE_TOP
);
631 setAttributeExplicitly(ATTR_VARIABLE_TOP
);
636 RuleBasedCollator::getReorderCodes(int32_t *dest
, int32_t capacity
,
637 UErrorCode
&errorCode
) const {
638 if(U_FAILURE(errorCode
)) { return 0; }
639 if(capacity
< 0 || (dest
== NULL
&& capacity
> 0)) {
640 errorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
643 int32_t length
= settings
->reorderCodesLength
;
644 if(length
== 0) { return 0; }
645 if(length
> capacity
) {
646 errorCode
= U_BUFFER_OVERFLOW_ERROR
;
649 uprv_memcpy(dest
, settings
->reorderCodes
, length
* 4);
654 RuleBasedCollator::setReorderCodes(const int32_t *reorderCodes
, int32_t length
,
655 UErrorCode
&errorCode
) {
656 if(U_FAILURE(errorCode
)) { return; }
657 if(length
< 0 || (reorderCodes
== NULL
&& length
> 0)) {
658 errorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
661 if(length
== 1 && reorderCodes
[0] == UCOL_REORDER_CODE_NONE
) {
664 if(length
== settings
->reorderCodesLength
&&
665 uprv_memcmp(reorderCodes
, settings
->reorderCodes
, length
* 4) == 0) {
668 const CollationSettings
&defaultSettings
= getDefaultSettings();
669 if(length
== 1 && reorderCodes
[0] == UCOL_REORDER_CODE_DEFAULT
) {
670 if(settings
!= &defaultSettings
) {
671 CollationSettings
*ownedSettings
= SharedObject::copyOnWrite(settings
);
672 if(ownedSettings
== NULL
) {
673 errorCode
= U_MEMORY_ALLOCATION_ERROR
;
676 ownedSettings
->copyReorderingFrom(defaultSettings
, errorCode
);
677 setFastLatinOptions(*ownedSettings
);
681 CollationSettings
*ownedSettings
= SharedObject::copyOnWrite(settings
);
682 if(ownedSettings
== NULL
) {
683 errorCode
= U_MEMORY_ALLOCATION_ERROR
;
686 ownedSettings
->setReordering(*data
, reorderCodes
, length
, errorCode
);
687 setFastLatinOptions(*ownedSettings
);
691 RuleBasedCollator::setFastLatinOptions(CollationSettings
&ownedSettings
) const {
692 ownedSettings
.fastLatinOptions
= CollationFastLatin::getOptions(
694 ownedSettings
.fastLatinPrimaries
, UPRV_LENGTHOF(ownedSettings
.fastLatinPrimaries
));
698 RuleBasedCollator::compare(const UnicodeString
&left
, const UnicodeString
&right
,
699 UErrorCode
&errorCode
) const {
700 if(U_FAILURE(errorCode
)) { return UCOL_EQUAL
; }
701 return doCompare(left
.getBuffer(), left
.length(),
702 right
.getBuffer(), right
.length(), errorCode
);
706 RuleBasedCollator::compare(const UnicodeString
&left
, const UnicodeString
&right
,
707 int32_t length
, UErrorCode
&errorCode
) const {
708 if(U_FAILURE(errorCode
) || length
== 0) { return UCOL_EQUAL
; }
710 errorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
713 int32_t leftLength
= left
.length();
714 int32_t rightLength
= right
.length();
715 if(leftLength
> length
) { leftLength
= length
; }
716 if(rightLength
> length
) { rightLength
= length
; }
717 return doCompare(left
.getBuffer(), leftLength
,
718 right
.getBuffer(), rightLength
, errorCode
);
722 RuleBasedCollator::compare(const UChar
*left
, int32_t leftLength
,
723 const UChar
*right
, int32_t rightLength
,
724 UErrorCode
&errorCode
) const {
725 if(U_FAILURE(errorCode
)) { return UCOL_EQUAL
; }
726 if((left
== NULL
&& leftLength
!= 0) || (right
== NULL
&& rightLength
!= 0)) {
727 errorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
730 // Make sure both or neither strings have a known length.
731 // We do not optimize for mixed length/termination.
732 if(leftLength
>= 0) {
733 if(rightLength
< 0) { rightLength
= u_strlen(right
); }
735 if(rightLength
>= 0) { leftLength
= u_strlen(left
); }
737 return doCompare(left
, leftLength
, right
, rightLength
, errorCode
);
741 RuleBasedCollator::compareUTF8(const StringPiece
&left
, const StringPiece
&right
,
742 UErrorCode
&errorCode
) const {
743 if(U_FAILURE(errorCode
)) { return UCOL_EQUAL
; }
744 const uint8_t *leftBytes
= reinterpret_cast<const uint8_t *>(left
.data());
745 const uint8_t *rightBytes
= reinterpret_cast<const uint8_t *>(right
.data());
746 if((leftBytes
== NULL
&& !left
.empty()) || (rightBytes
== NULL
&& !right
.empty())) {
747 errorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
750 return doCompare(leftBytes
, left
.length(), rightBytes
, right
.length(), errorCode
);
754 RuleBasedCollator::internalCompareUTF8(const char *left
, int32_t leftLength
,
755 const char *right
, int32_t rightLength
,
756 UErrorCode
&errorCode
) const {
757 if(U_FAILURE(errorCode
)) { return UCOL_EQUAL
; }
758 if((left
== NULL
&& leftLength
!= 0) || (right
== NULL
&& rightLength
!= 0)) {
759 errorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
762 // Make sure both or neither strings have a known length.
763 // We do not optimize for mixed length/termination.
764 if(leftLength
>= 0) {
765 if(rightLength
< 0) { rightLength
= uprv_strlen(right
); }
767 if(rightLength
>= 0) { leftLength
= uprv_strlen(left
); }
769 return doCompare(reinterpret_cast<const uint8_t *>(left
), leftLength
,
770 reinterpret_cast<const uint8_t *>(right
), rightLength
, errorCode
);
776 * Abstract iterator for identical-level string comparisons.
777 * Returns FCD code points and handles temporary switching to NFD.
779 class NFDIterator
: public UObject
{
781 NFDIterator() : index(-1), length(0) {}
782 virtual ~NFDIterator() {}
784 * Returns the next code point from the internal normalization buffer,
785 * or else the next text code point.
786 * Returns -1 at the end of the text.
788 UChar32
nextCodePoint() {
790 if(index
== length
) {
794 U16_NEXT_UNSAFE(decomp
, index
, c
);
798 return nextRawCodePoint();
802 * @param c the last code point returned by nextCodePoint() or nextDecomposedCodePoint()
803 * @return the first code point in c's decomposition,
804 * or c itself if it was decomposed already or if it does not decompose
806 UChar32
nextDecomposedCodePoint(const Normalizer2Impl
&nfcImpl
, UChar32 c
) {
807 if(index
>= 0) { return c
; }
808 decomp
= nfcImpl
.getDecomposition(c
, buffer
, length
);
809 if(decomp
== NULL
) { return c
; }
811 U16_NEXT_UNSAFE(decomp
, index
, c
);
816 * Returns the next text code point in FCD order.
817 * Returns -1 at the end of the text.
819 virtual UChar32
nextRawCodePoint() = 0;
827 class UTF16NFDIterator
: public NFDIterator
{
829 UTF16NFDIterator(const UChar
*text
, const UChar
*textLimit
) : s(text
), limit(textLimit
) {}
831 virtual UChar32
nextRawCodePoint() {
832 if(s
== limit
) { return U_SENTINEL
; }
834 if(limit
== NULL
&& c
== 0) {
839 if(U16_IS_LEAD(c
) && s
!= limit
&& U16_IS_TRAIL(trail
= *s
)) {
841 c
= U16_GET_SUPPLEMENTARY(c
, trail
);
850 class FCDUTF16NFDIterator
: public UTF16NFDIterator
{
852 FCDUTF16NFDIterator(const Normalizer2Impl
&nfcImpl
, const UChar
*text
, const UChar
*textLimit
)
853 : UTF16NFDIterator(NULL
, NULL
) {
854 UErrorCode errorCode
= U_ZERO_ERROR
;
855 const UChar
*spanLimit
= nfcImpl
.makeFCD(text
, textLimit
, NULL
, errorCode
);
856 if(U_FAILURE(errorCode
)) { return; }
857 if(spanLimit
== textLimit
|| (textLimit
== NULL
&& *spanLimit
== 0)) {
861 str
.setTo(text
, (int32_t)(spanLimit
- text
));
863 ReorderingBuffer
buffer(nfcImpl
, str
);
864 if(buffer
.init(str
.length(), errorCode
)) {
865 nfcImpl
.makeFCD(spanLimit
, textLimit
, &buffer
, errorCode
);
868 if(U_SUCCESS(errorCode
)) {
870 limit
= s
+ str
.length();
878 class UTF8NFDIterator
: public NFDIterator
{
880 UTF8NFDIterator(const uint8_t *text
, int32_t textLength
)
881 : s(text
), pos(0), length(textLength
) {}
883 virtual UChar32
nextRawCodePoint() {
884 if(pos
== length
|| (s
[pos
] == 0 && length
< 0)) { return U_SENTINEL
; }
886 U8_NEXT_OR_FFFD(s
, pos
, length
, c
);
895 class FCDUTF8NFDIterator
: public NFDIterator
{
897 FCDUTF8NFDIterator(const CollationData
*data
, const uint8_t *text
, int32_t textLength
)
898 : u8ci(data
, FALSE
, text
, 0, textLength
) {}
900 virtual UChar32
nextRawCodePoint() {
901 UErrorCode errorCode
= U_ZERO_ERROR
;
902 return u8ci
.nextCodePoint(errorCode
);
905 FCDUTF8CollationIterator u8ci
;
908 class UIterNFDIterator
: public NFDIterator
{
910 UIterNFDIterator(UCharIterator
&it
) : iter(it
) {}
912 virtual UChar32
nextRawCodePoint() {
913 return uiter_next32(&iter
);
919 class FCDUIterNFDIterator
: public NFDIterator
{
921 FCDUIterNFDIterator(const CollationData
*data
, UCharIterator
&it
, int32_t startIndex
)
922 : uici(data
, FALSE
, it
, startIndex
) {}
924 virtual UChar32
nextRawCodePoint() {
925 UErrorCode errorCode
= U_ZERO_ERROR
;
926 return uici
.nextCodePoint(errorCode
);
929 FCDUIterCollationIterator uici
;
932 UCollationResult
compareNFDIter(const Normalizer2Impl
&nfcImpl
,
933 NFDIterator
&left
, NFDIterator
&right
) {
935 // Fetch the next FCD code point from each string.
936 UChar32 leftCp
= left
.nextCodePoint();
937 UChar32 rightCp
= right
.nextCodePoint();
938 if(leftCp
== rightCp
) {
939 if(leftCp
< 0) { break; }
942 // If they are different, then decompose each and compare again.
944 leftCp
= -2; // end of string
945 } else if(leftCp
== 0xfffe) {
946 leftCp
= -1; // U+FFFE: merge separator
948 leftCp
= left
.nextDecomposedCodePoint(nfcImpl
, leftCp
);
951 rightCp
= -2; // end of string
952 } else if(rightCp
== 0xfffe) {
953 rightCp
= -1; // U+FFFE: merge separator
955 rightCp
= right
.nextDecomposedCodePoint(nfcImpl
, rightCp
);
957 if(leftCp
< rightCp
) { return UCOL_LESS
; }
958 if(leftCp
> rightCp
) { return UCOL_GREATER
; }
966 RuleBasedCollator::doCompare(const UChar
*left
, int32_t leftLength
,
967 const UChar
*right
, int32_t rightLength
,
968 UErrorCode
&errorCode
) const {
969 // U_FAILURE(errorCode) checked by caller.
970 if(left
== right
&& leftLength
== rightLength
) {
974 // Identical-prefix test.
975 const UChar
*leftLimit
;
976 const UChar
*rightLimit
;
977 int32_t equalPrefixLength
= 0;
982 while((c
= left
[equalPrefixLength
]) == right
[equalPrefixLength
]) {
983 if(c
== 0) { return UCOL_EQUAL
; }
987 leftLimit
= left
+ leftLength
;
988 rightLimit
= right
+ rightLength
;
990 if(equalPrefixLength
== leftLength
) {
991 if(equalPrefixLength
== rightLength
) { return UCOL_EQUAL
; }
993 } else if(equalPrefixLength
== rightLength
||
994 left
[equalPrefixLength
] != right
[equalPrefixLength
]) {
1001 UBool numeric
= settings
->isNumeric();
1002 if(equalPrefixLength
> 0) {
1003 if((equalPrefixLength
!= leftLength
&&
1004 data
->isUnsafeBackward(left
[equalPrefixLength
], numeric
)) ||
1005 (equalPrefixLength
!= rightLength
&&
1006 data
->isUnsafeBackward(right
[equalPrefixLength
], numeric
))) {
1007 // Identical prefix: Back up to the start of a contraction or reordering sequence.
1008 while(--equalPrefixLength
> 0 &&
1009 data
->isUnsafeBackward(left
[equalPrefixLength
], numeric
)) {}
1012 // - A longer string can compare equal to a prefix of it if only ignorables follow.
1013 // - With a backward level, a longer string can compare less-than a prefix of it.
1015 // Pass the actual start of each string into the CollationIterators,
1016 // plus the equalPrefixLength position,
1017 // so that prefix matches back into the equal prefix work.
1021 int32_t fastLatinOptions
= settings
->fastLatinOptions
;
1022 if(fastLatinOptions
>= 0 &&
1023 (equalPrefixLength
== leftLength
||
1024 left
[equalPrefixLength
] <= CollationFastLatin::LATIN_MAX
) &&
1025 (equalPrefixLength
== rightLength
||
1026 right
[equalPrefixLength
] <= CollationFastLatin::LATIN_MAX
)) {
1027 if(leftLength
>= 0) {
1028 result
= CollationFastLatin::compareUTF16(data
->fastLatinTable
,
1029 settings
->fastLatinPrimaries
,
1031 left
+ equalPrefixLength
,
1032 leftLength
- equalPrefixLength
,
1033 right
+ equalPrefixLength
,
1034 rightLength
- equalPrefixLength
);
1036 result
= CollationFastLatin::compareUTF16(data
->fastLatinTable
,
1037 settings
->fastLatinPrimaries
,
1039 left
+ equalPrefixLength
, -1,
1040 right
+ equalPrefixLength
, -1);
1043 result
= CollationFastLatin::BAIL_OUT_RESULT
;
1046 if(result
== CollationFastLatin::BAIL_OUT_RESULT
) {
1047 if(settings
->dontCheckFCD()) {
1048 UTF16CollationIterator
leftIter(data
, numeric
,
1049 left
, left
+ equalPrefixLength
, leftLimit
);
1050 UTF16CollationIterator
rightIter(data
, numeric
,
1051 right
, right
+ equalPrefixLength
, rightLimit
);
1052 result
= CollationCompare::compareUpToQuaternary(leftIter
, rightIter
, *settings
, errorCode
);
1054 FCDUTF16CollationIterator
leftIter(data
, numeric
,
1055 left
, left
+ equalPrefixLength
, leftLimit
);
1056 FCDUTF16CollationIterator
rightIter(data
, numeric
,
1057 right
, right
+ equalPrefixLength
, rightLimit
);
1058 result
= CollationCompare::compareUpToQuaternary(leftIter
, rightIter
, *settings
, errorCode
);
1061 if(result
!= UCOL_EQUAL
|| settings
->getStrength() < UCOL_IDENTICAL
|| U_FAILURE(errorCode
)) {
1062 return (UCollationResult
)result
;
1065 // Note: If NUL-terminated, we could get the actual limits from the iterators now.
1066 // That would complicate the iterators a bit, NUL-terminated strings are only a C convenience,
1067 // and the benefit seems unlikely to be measurable.
1069 // Compare identical level.
1070 const Normalizer2Impl
&nfcImpl
= data
->nfcImpl
;
1071 left
+= equalPrefixLength
;
1072 right
+= equalPrefixLength
;
1073 if(settings
->dontCheckFCD()) {
1074 UTF16NFDIterator
leftIter(left
, leftLimit
);
1075 UTF16NFDIterator
rightIter(right
, rightLimit
);
1076 return compareNFDIter(nfcImpl
, leftIter
, rightIter
);
1078 FCDUTF16NFDIterator
leftIter(nfcImpl
, left
, leftLimit
);
1079 FCDUTF16NFDIterator
rightIter(nfcImpl
, right
, rightLimit
);
1080 return compareNFDIter(nfcImpl
, leftIter
, rightIter
);
1085 RuleBasedCollator::doCompare(const uint8_t *left
, int32_t leftLength
,
1086 const uint8_t *right
, int32_t rightLength
,
1087 UErrorCode
&errorCode
) const {
1088 // U_FAILURE(errorCode) checked by caller.
1089 if(left
== right
&& leftLength
== rightLength
) {
1093 // Identical-prefix test.
1094 int32_t equalPrefixLength
= 0;
1095 if(leftLength
< 0) {
1097 while((c
= left
[equalPrefixLength
]) == right
[equalPrefixLength
]) {
1098 if(c
== 0) { return UCOL_EQUAL
; }
1099 ++equalPrefixLength
;
1103 if(equalPrefixLength
== leftLength
) {
1104 if(equalPrefixLength
== rightLength
) { return UCOL_EQUAL
; }
1106 } else if(equalPrefixLength
== rightLength
||
1107 left
[equalPrefixLength
] != right
[equalPrefixLength
]) {
1110 ++equalPrefixLength
;
1113 // Back up to the start of a partially-equal code point.
1114 if(equalPrefixLength
> 0 &&
1115 ((equalPrefixLength
!= leftLength
&& U8_IS_TRAIL(left
[equalPrefixLength
])) ||
1116 (equalPrefixLength
!= rightLength
&& U8_IS_TRAIL(right
[equalPrefixLength
])))) {
1117 while(--equalPrefixLength
> 0 && U8_IS_TRAIL(left
[equalPrefixLength
])) {}
1120 UBool numeric
= settings
->isNumeric();
1121 if(equalPrefixLength
> 0) {
1122 UBool unsafe
= FALSE
;
1123 if(equalPrefixLength
!= leftLength
) {
1124 int32_t i
= equalPrefixLength
;
1126 U8_NEXT_OR_FFFD(left
, i
, leftLength
, c
);
1127 unsafe
= data
->isUnsafeBackward(c
, numeric
);
1129 if(!unsafe
&& equalPrefixLength
!= rightLength
) {
1130 int32_t i
= equalPrefixLength
;
1132 U8_NEXT_OR_FFFD(right
, i
, rightLength
, c
);
1133 unsafe
= data
->isUnsafeBackward(c
, numeric
);
1136 // Identical prefix: Back up to the start of a contraction or reordering sequence.
1139 U8_PREV_OR_FFFD(left
, 0, equalPrefixLength
, c
);
1140 } while(equalPrefixLength
> 0 && data
->isUnsafeBackward(c
, numeric
));
1142 // See the notes in the UTF-16 version.
1144 // Pass the actual start of each string into the CollationIterators,
1145 // plus the equalPrefixLength position,
1146 // so that prefix matches back into the equal prefix work.
1150 int32_t fastLatinOptions
= settings
->fastLatinOptions
;
1151 if(fastLatinOptions
>= 0 &&
1152 (equalPrefixLength
== leftLength
||
1153 left
[equalPrefixLength
] <= CollationFastLatin::LATIN_MAX_UTF8_LEAD
) &&
1154 (equalPrefixLength
== rightLength
||
1155 right
[equalPrefixLength
] <= CollationFastLatin::LATIN_MAX_UTF8_LEAD
)) {
1156 if(leftLength
>= 0) {
1157 result
= CollationFastLatin::compareUTF8(data
->fastLatinTable
,
1158 settings
->fastLatinPrimaries
,
1160 left
+ equalPrefixLength
,
1161 leftLength
- equalPrefixLength
,
1162 right
+ equalPrefixLength
,
1163 rightLength
- equalPrefixLength
);
1165 result
= CollationFastLatin::compareUTF8(data
->fastLatinTable
,
1166 settings
->fastLatinPrimaries
,
1168 left
+ equalPrefixLength
, -1,
1169 right
+ equalPrefixLength
, -1);
1172 result
= CollationFastLatin::BAIL_OUT_RESULT
;
1175 if(result
== CollationFastLatin::BAIL_OUT_RESULT
) {
1176 if(settings
->dontCheckFCD()) {
1177 UTF8CollationIterator
leftIter(data
, numeric
, left
, equalPrefixLength
, leftLength
);
1178 UTF8CollationIterator
rightIter(data
, numeric
, right
, equalPrefixLength
, rightLength
);
1179 result
= CollationCompare::compareUpToQuaternary(leftIter
, rightIter
, *settings
, errorCode
);
1181 FCDUTF8CollationIterator
leftIter(data
, numeric
, left
, equalPrefixLength
, leftLength
);
1182 FCDUTF8CollationIterator
rightIter(data
, numeric
, right
, equalPrefixLength
, rightLength
);
1183 result
= CollationCompare::compareUpToQuaternary(leftIter
, rightIter
, *settings
, errorCode
);
1186 if(result
!= UCOL_EQUAL
|| settings
->getStrength() < UCOL_IDENTICAL
|| U_FAILURE(errorCode
)) {
1187 return (UCollationResult
)result
;
1190 // Note: If NUL-terminated, we could get the actual limits from the iterators now.
1191 // That would complicate the iterators a bit, NUL-terminated strings are only a C convenience,
1192 // and the benefit seems unlikely to be measurable.
1194 // Compare identical level.
1195 const Normalizer2Impl
&nfcImpl
= data
->nfcImpl
;
1196 left
+= equalPrefixLength
;
1197 right
+= equalPrefixLength
;
1198 if(leftLength
> 0) {
1199 leftLength
-= equalPrefixLength
;
1200 rightLength
-= equalPrefixLength
;
1202 if(settings
->dontCheckFCD()) {
1203 UTF8NFDIterator
leftIter(left
, leftLength
);
1204 UTF8NFDIterator
rightIter(right
, rightLength
);
1205 return compareNFDIter(nfcImpl
, leftIter
, rightIter
);
1207 FCDUTF8NFDIterator
leftIter(data
, left
, leftLength
);
1208 FCDUTF8NFDIterator
rightIter(data
, right
, rightLength
);
1209 return compareNFDIter(nfcImpl
, leftIter
, rightIter
);
1214 RuleBasedCollator::compare(UCharIterator
&left
, UCharIterator
&right
,
1215 UErrorCode
&errorCode
) const {
1216 if(U_FAILURE(errorCode
) || &left
== &right
) { return UCOL_EQUAL
; }
1217 UBool numeric
= settings
->isNumeric();
1219 // Identical-prefix test.
1220 int32_t equalPrefixLength
= 0;
1224 while((leftUnit
= left
.next(&left
)) == (rightUnit
= right
.next(&right
))) {
1225 if(leftUnit
< 0) { return UCOL_EQUAL
; }
1226 ++equalPrefixLength
;
1229 // Back out the code units that differed, for the real collation comparison.
1230 if(leftUnit
>= 0) { left
.previous(&left
); }
1231 if(rightUnit
>= 0) { right
.previous(&right
); }
1233 if(equalPrefixLength
> 0) {
1234 if((leftUnit
>= 0 && data
->isUnsafeBackward(leftUnit
, numeric
)) ||
1235 (rightUnit
>= 0 && data
->isUnsafeBackward(rightUnit
, numeric
))) {
1236 // Identical prefix: Back up to the start of a contraction or reordering sequence.
1238 --equalPrefixLength
;
1239 leftUnit
= left
.previous(&left
);
1240 right
.previous(&right
);
1241 } while(equalPrefixLength
> 0 && data
->isUnsafeBackward(leftUnit
, numeric
));
1243 // See the notes in the UTF-16 version.
1247 UCollationResult result
;
1248 if(settings
->dontCheckFCD()) {
1249 UIterCollationIterator
leftIter(data
, numeric
, left
);
1250 UIterCollationIterator
rightIter(data
, numeric
, right
);
1251 result
= CollationCompare::compareUpToQuaternary(leftIter
, rightIter
, *settings
, errorCode
);
1253 FCDUIterCollationIterator
leftIter(data
, numeric
, left
, equalPrefixLength
);
1254 FCDUIterCollationIterator
rightIter(data
, numeric
, right
, equalPrefixLength
);
1255 result
= CollationCompare::compareUpToQuaternary(leftIter
, rightIter
, *settings
, errorCode
);
1257 if(result
!= UCOL_EQUAL
|| settings
->getStrength() < UCOL_IDENTICAL
|| U_FAILURE(errorCode
)) {
1261 // Compare identical level.
1262 left
.move(&left
, equalPrefixLength
, UITER_ZERO
);
1263 right
.move(&right
, equalPrefixLength
, UITER_ZERO
);
1264 const Normalizer2Impl
&nfcImpl
= data
->nfcImpl
;
1265 if(settings
->dontCheckFCD()) {
1266 UIterNFDIterator
leftIter(left
);
1267 UIterNFDIterator
rightIter(right
);
1268 return compareNFDIter(nfcImpl
, leftIter
, rightIter
);
1270 FCDUIterNFDIterator
leftIter(data
, left
, equalPrefixLength
);
1271 FCDUIterNFDIterator
rightIter(data
, right
, equalPrefixLength
);
1272 return compareNFDIter(nfcImpl
, leftIter
, rightIter
);
1277 RuleBasedCollator::getCollationKey(const UnicodeString
&s
, CollationKey
&key
,
1278 UErrorCode
&errorCode
) const {
1279 return getCollationKey(s
.getBuffer(), s
.length(), key
, errorCode
);
1283 RuleBasedCollator::getCollationKey(const UChar
*s
, int32_t length
, CollationKey
& key
,
1284 UErrorCode
&errorCode
) const {
1285 if(U_FAILURE(errorCode
)) {
1286 return key
.setToBogus();
1288 if(s
== NULL
&& length
!= 0) {
1289 errorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
1290 return key
.setToBogus();
1292 key
.reset(); // resets the "bogus" state
1293 CollationKeyByteSink
sink(key
);
1294 writeSortKey(s
, length
, sink
, errorCode
);
1295 if(U_FAILURE(errorCode
)) {
1297 } else if(key
.isBogus()) {
1298 errorCode
= U_MEMORY_ALLOCATION_ERROR
;
1300 key
.setLength(sink
.NumberOfBytesAppended());
1306 RuleBasedCollator::getSortKey(const UnicodeString
&s
,
1307 uint8_t *dest
, int32_t capacity
) const {
1308 return getSortKey(s
.getBuffer(), s
.length(), dest
, capacity
);
1312 RuleBasedCollator::getSortKey(const UChar
*s
, int32_t length
,
1313 uint8_t *dest
, int32_t capacity
) const {
1314 if((s
== NULL
&& length
!= 0) || capacity
< 0 || (dest
== NULL
&& capacity
> 0)) {
1317 uint8_t noDest
[1] = { 0 };
1319 // Distinguish pure preflighting from an allocation error.
1323 FixedSortKeyByteSink
sink(reinterpret_cast<char *>(dest
), capacity
);
1324 UErrorCode errorCode
= U_ZERO_ERROR
;
1325 writeSortKey(s
, length
, sink
, errorCode
);
1326 return U_SUCCESS(errorCode
) ? sink
.NumberOfBytesAppended() : 0;
1330 RuleBasedCollator::writeSortKey(const UChar
*s
, int32_t length
,
1331 SortKeyByteSink
&sink
, UErrorCode
&errorCode
) const {
1332 if(U_FAILURE(errorCode
)) { return; }
1333 const UChar
*limit
= (length
>= 0) ? s
+ length
: NULL
;
1334 UBool numeric
= settings
->isNumeric();
1335 CollationKeys::LevelCallback callback
;
1336 if(settings
->dontCheckFCD()) {
1337 UTF16CollationIterator
iter(data
, numeric
, s
, s
, limit
);
1338 CollationKeys::writeSortKeyUpToQuaternary(iter
, data
->compressibleBytes
, *settings
,
1339 sink
, Collation::PRIMARY_LEVEL
,
1340 callback
, TRUE
, errorCode
);
1342 FCDUTF16CollationIterator
iter(data
, numeric
, s
, s
, limit
);
1343 CollationKeys::writeSortKeyUpToQuaternary(iter
, data
->compressibleBytes
, *settings
,
1344 sink
, Collation::PRIMARY_LEVEL
,
1345 callback
, TRUE
, errorCode
);
1347 if(settings
->getStrength() == UCOL_IDENTICAL
) {
1348 writeIdenticalLevel(s
, limit
, sink
, errorCode
);
1350 static const char terminator
= 0; // TERMINATOR_BYTE
1351 sink
.Append(&terminator
, 1);
1355 RuleBasedCollator::writeIdenticalLevel(const UChar
*s
, const UChar
*limit
,
1356 SortKeyByteSink
&sink
, UErrorCode
&errorCode
) const {
1358 const UChar
*nfdQCYesLimit
= data
->nfcImpl
.decompose(s
, limit
, NULL
, errorCode
);
1359 if(U_FAILURE(errorCode
)) { return; }
1360 sink
.Append(Collation::LEVEL_SEPARATOR_BYTE
);
1362 if(nfdQCYesLimit
!= s
) {
1363 prev
= u_writeIdenticalLevelRun(prev
, s
, (int32_t)(nfdQCYesLimit
- s
), sink
);
1365 // Is there non-NFD text?
1366 int32_t destLengthEstimate
;
1368 if(nfdQCYesLimit
== limit
) { return; }
1369 destLengthEstimate
= (int32_t)(limit
- nfdQCYesLimit
);
1371 // s is NUL-terminated
1372 if(*nfdQCYesLimit
== 0) { return; }
1373 destLengthEstimate
= -1;
1376 data
->nfcImpl
.decompose(nfdQCYesLimit
, limit
, nfd
, destLengthEstimate
, errorCode
);
1377 u_writeIdenticalLevelRun(prev
, nfd
.getBuffer(), nfd
.length(), sink
);
1383 * internalNextSortKeyPart() calls CollationKeys::writeSortKeyUpToQuaternary()
1384 * with an instance of this callback class.
1385 * When another level is about to be written, the callback
1386 * records the level and the number of bytes that will be written until
1387 * the sink (which is actually a FixedSortKeyByteSink) fills up.
1389 * When internalNextSortKeyPart() is called again, it restarts with the last level
1390 * and ignores as many bytes as were written previously for that level.
1392 class PartLevelCallback
: public CollationKeys::LevelCallback
{
1394 PartLevelCallback(const SortKeyByteSink
&s
)
1395 : sink(s
), level(Collation::PRIMARY_LEVEL
) {
1396 levelCapacity
= sink
.GetRemainingCapacity();
1398 virtual ~PartLevelCallback() {}
1399 virtual UBool
needToWrite(Collation::Level l
) {
1400 if(!sink
.Overflowed()) {
1401 // Remember a level that will be at least partially written.
1403 levelCapacity
= sink
.GetRemainingCapacity();
1409 Collation::Level
getLevel() const { return level
; }
1410 int32_t getLevelCapacity() const { return levelCapacity
; }
1413 const SortKeyByteSink
&sink
;
1414 Collation::Level level
;
1415 int32_t levelCapacity
;
1421 RuleBasedCollator::internalNextSortKeyPart(UCharIterator
*iter
, uint32_t state
[2],
1422 uint8_t *dest
, int32_t count
, UErrorCode
&errorCode
) const {
1423 if(U_FAILURE(errorCode
)) { return 0; }
1424 if(iter
== NULL
|| state
== NULL
|| count
< 0 || (count
> 0 && dest
== NULL
)) {
1425 errorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
1428 if(count
== 0) { return 0; }
1430 FixedSortKeyByteSink
sink(reinterpret_cast<char *>(dest
), count
);
1431 sink
.IgnoreBytes((int32_t)state
[1]);
1432 iter
->move(iter
, 0, UITER_START
);
1434 Collation::Level level
= (Collation::Level
)state
[0];
1435 if(level
<= Collation::QUATERNARY_LEVEL
) {
1436 UBool numeric
= settings
->isNumeric();
1437 PartLevelCallback
callback(sink
);
1438 if(settings
->dontCheckFCD()) {
1439 UIterCollationIterator
ci(data
, numeric
, *iter
);
1440 CollationKeys::writeSortKeyUpToQuaternary(ci
, data
->compressibleBytes
, *settings
,
1441 sink
, level
, callback
, FALSE
, errorCode
);
1443 FCDUIterCollationIterator
ci(data
, numeric
, *iter
, 0);
1444 CollationKeys::writeSortKeyUpToQuaternary(ci
, data
->compressibleBytes
, *settings
,
1445 sink
, level
, callback
, FALSE
, errorCode
);
1447 if(U_FAILURE(errorCode
)) { return 0; }
1448 if(sink
.NumberOfBytesAppended() > count
) {
1449 state
[0] = (uint32_t)callback
.getLevel();
1450 state
[1] = (uint32_t)callback
.getLevelCapacity();
1453 // All of the normal levels are done.
1454 if(settings
->getStrength() == UCOL_IDENTICAL
) {
1455 level
= Collation::IDENTICAL_LEVEL
;
1456 iter
->move(iter
, 0, UITER_START
);
1458 // else fall through to setting ZERO_LEVEL
1461 if(level
== Collation::IDENTICAL_LEVEL
) {
1462 int32_t levelCapacity
= sink
.GetRemainingCapacity();
1465 UChar32 c
= iter
->next(iter
);
1466 if(c
< 0) { break; }
1469 const UChar
*sArray
= s
.getBuffer();
1470 writeIdenticalLevel(sArray
, sArray
+ s
.length(), sink
, errorCode
);
1471 if(U_FAILURE(errorCode
)) { return 0; }
1472 if(sink
.NumberOfBytesAppended() > count
) {
1473 state
[0] = (uint32_t)level
;
1474 state
[1] = (uint32_t)levelCapacity
;
1479 // ZERO_LEVEL: Fill the remainder of dest with 00 bytes.
1480 state
[0] = (uint32_t)Collation::ZERO_LEVEL
;
1482 int32_t length
= sink
.NumberOfBytesAppended();
1484 while(i
< count
) { dest
[i
++] = 0; }
1489 RuleBasedCollator::internalGetCEs(const UnicodeString
&str
, UVector64
&ces
,
1490 UErrorCode
&errorCode
) const {
1491 if(U_FAILURE(errorCode
)) { return; }
1492 const UChar
*s
= str
.getBuffer();
1493 const UChar
*limit
= s
+ str
.length();
1494 UBool numeric
= settings
->isNumeric();
1495 if(settings
->dontCheckFCD()) {
1496 UTF16CollationIterator
iter(data
, numeric
, s
, s
, limit
);
1498 while((ce
= iter
.nextCE(errorCode
)) != Collation::NO_CE
) {
1499 ces
.addElement(ce
, errorCode
);
1502 FCDUTF16CollationIterator
iter(data
, numeric
, s
, s
, limit
);
1504 while((ce
= iter
.nextCE(errorCode
)) != Collation::NO_CE
) {
1505 ces
.addElement(ce
, errorCode
);
1512 void appendSubtag(CharString
&s
, char letter
, const char *subtag
, int32_t length
,
1513 UErrorCode
&errorCode
) {
1514 if(U_FAILURE(errorCode
) || length
== 0) { return; }
1516 s
.append('_', errorCode
);
1518 s
.append(letter
, errorCode
);
1519 for(int32_t i
= 0; i
< length
; ++i
) {
1520 s
.append(uprv_toupper(subtag
[i
]), errorCode
);
1524 void appendAttribute(CharString
&s
, char letter
, UColAttributeValue value
,
1525 UErrorCode
&errorCode
) {
1526 if(U_FAILURE(errorCode
)) { return; }
1528 s
.append('_', errorCode
);
1530 static const char *valueChars
= "1234...........IXO..SN..LU......";
1531 s
.append(letter
, errorCode
);
1532 s
.append(valueChars
[value
], errorCode
);
1538 RuleBasedCollator::internalGetShortDefinitionString(const char *locale
,
1539 char *buffer
, int32_t capacity
,
1540 UErrorCode
&errorCode
) const {
1541 if(U_FAILURE(errorCode
)) { return 0; }
1542 if(buffer
== NULL
? capacity
!= 0 : capacity
< 0) {
1543 errorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
1546 if(locale
== NULL
) {
1547 locale
= internalGetLocaleID(ULOC_VALID_LOCALE
, errorCode
);
1550 char resultLocale
[ULOC_FULLNAME_CAPACITY
+ 1];
1551 int32_t length
= ucol_getFunctionalEquivalent(resultLocale
, ULOC_FULLNAME_CAPACITY
,
1552 "collation", locale
,
1554 if(U_FAILURE(errorCode
)) { return 0; }
1556 uprv_strcpy(resultLocale
, "root");
1558 resultLocale
[length
] = 0;
1561 // Append items in alphabetic order of their short definition letters.
1563 char subtag
[ULOC_KEYWORD_AND_VALUES_CAPACITY
];
1565 if(attributeHasBeenSetExplicitly(UCOL_ALTERNATE_HANDLING
)) {
1566 appendAttribute(result
, 'A', getAttribute(UCOL_ALTERNATE_HANDLING
, errorCode
), errorCode
);
1568 // ATTR_VARIABLE_TOP not supported because 'B' was broken.
1569 // See ICU tickets #10372 and #10386.
1570 if(attributeHasBeenSetExplicitly(UCOL_CASE_FIRST
)) {
1571 appendAttribute(result
, 'C', getAttribute(UCOL_CASE_FIRST
, errorCode
), errorCode
);
1573 if(attributeHasBeenSetExplicitly(UCOL_NUMERIC_COLLATION
)) {
1574 appendAttribute(result
, 'D', getAttribute(UCOL_NUMERIC_COLLATION
, errorCode
), errorCode
);
1576 if(attributeHasBeenSetExplicitly(UCOL_CASE_LEVEL
)) {
1577 appendAttribute(result
, 'E', getAttribute(UCOL_CASE_LEVEL
, errorCode
), errorCode
);
1579 if(attributeHasBeenSetExplicitly(UCOL_FRENCH_COLLATION
)) {
1580 appendAttribute(result
, 'F', getAttribute(UCOL_FRENCH_COLLATION
, errorCode
), errorCode
);
1582 // Note: UCOL_HIRAGANA_QUATERNARY_MODE is deprecated and never changes away from default.
1583 length
= uloc_getKeywordValue(resultLocale
, "collation", subtag
, UPRV_LENGTHOF(subtag
), &errorCode
);
1584 appendSubtag(result
, 'K', subtag
, length
, errorCode
);
1585 length
= uloc_getLanguage(resultLocale
, subtag
, UPRV_LENGTHOF(subtag
), &errorCode
);
1586 appendSubtag(result
, 'L', subtag
, length
, errorCode
);
1587 if(attributeHasBeenSetExplicitly(UCOL_NORMALIZATION_MODE
)) {
1588 appendAttribute(result
, 'N', getAttribute(UCOL_NORMALIZATION_MODE
, errorCode
), errorCode
);
1590 length
= uloc_getCountry(resultLocale
, subtag
, UPRV_LENGTHOF(subtag
), &errorCode
);
1591 appendSubtag(result
, 'R', subtag
, length
, errorCode
);
1592 if(attributeHasBeenSetExplicitly(UCOL_STRENGTH
)) {
1593 appendAttribute(result
, 'S', getAttribute(UCOL_STRENGTH
, errorCode
), errorCode
);
1595 length
= uloc_getVariant(resultLocale
, subtag
, UPRV_LENGTHOF(subtag
), &errorCode
);
1596 appendSubtag(result
, 'V', subtag
, length
, errorCode
);
1597 length
= uloc_getScript(resultLocale
, subtag
, UPRV_LENGTHOF(subtag
), &errorCode
);
1598 appendSubtag(result
, 'Z', subtag
, length
, errorCode
);
1600 if(U_FAILURE(errorCode
)) { return 0; }
1601 if(result
.length() <= capacity
) {
1602 uprv_memcpy(buffer
, result
.data(), result
.length());
1604 return u_terminateChars(buffer
, capacity
, result
.length(), &errorCode
);
1608 RuleBasedCollator::isUnsafe(UChar32 c
) const {
1609 return data
->isUnsafeBackward(c
, settings
->isNumeric());
1613 RuleBasedCollator::computeMaxExpansions(const CollationTailoring
*t
, UErrorCode
&errorCode
) {
1614 t
->maxExpansions
= CollationElementIterator::computeMaxExpansions(t
->data
, errorCode
);
1618 RuleBasedCollator::initMaxExpansions(UErrorCode
&errorCode
) const {
1619 umtx_initOnce(tailoring
->maxExpansionsInitOnce
, computeMaxExpansions
, tailoring
, errorCode
);
1620 return U_SUCCESS(errorCode
);
1623 CollationElementIterator
*
1624 RuleBasedCollator::createCollationElementIterator(const UnicodeString
& source
) const {
1625 UErrorCode errorCode
= U_ZERO_ERROR
;
1626 if(!initMaxExpansions(errorCode
)) { return NULL
; }
1627 CollationElementIterator
*cei
= new CollationElementIterator(source
, this, errorCode
);
1628 if(U_FAILURE(errorCode
)) {
1635 CollationElementIterator
*
1636 RuleBasedCollator::createCollationElementIterator(const CharacterIterator
& source
) const {
1637 UErrorCode errorCode
= U_ZERO_ERROR
;
1638 if(!initMaxExpansions(errorCode
)) { return NULL
; }
1639 CollationElementIterator
*cei
= new CollationElementIterator(source
, this, errorCode
);
1640 if(U_FAILURE(errorCode
)) {
1648 RuleBasedCollator::getMaxExpansion(int32_t order
) const {
1649 UErrorCode errorCode
= U_ZERO_ERROR
;
1650 (void)initMaxExpansions(errorCode
);
1651 return CollationElementIterator::getMaxExpansion(tailoring
->maxExpansions
, order
);
1656 #endif // !UCONFIG_NO_COLLATION