2 ******************************************************************************
3 * Copyright (C) 1996-2012, International Business Machines Corporation and
4 * others. All Rights Reserved.
5 ******************************************************************************
11 * Created by: Helena Shih
13 * Modification History:
15 * Date Name Description
16 * 2/5/97 aliu Added streamIn and streamOut methods. Added
17 * constructor which reads RuleBasedCollator object from
18 * a binary file. Added writeToFile method which streams
19 * RuleBasedCollator out to a binary file. The streamIn
20 * and streamOut methods use istream and ostream objects
22 * 2/11/97 aliu Moved declarations out of for loop initializer.
23 * Added Mac compatibility #ifdef for ios::nocreate.
24 * 2/12/97 aliu Modified to use TableCollationData sub-object to
25 * hold invariant data.
26 * 2/13/97 aliu Moved several methods into this class from Collation.
27 * Added a private RuleBasedCollator(Locale&) constructor,
28 * to be used by Collator::getInstance(). General
29 * clean up. Made use of UErrorCode variables consistent.
30 * 2/20/97 helena Added clone, operator==, operator!=, operator=, and copy
31 * constructor and getDynamicClassID.
32 * 3/5/97 aliu Changed compaction cycle to improve performance. We
33 * use the maximum allowable value which is kBlockCount.
34 * Modified getRules() to load rules dynamically. Changed
35 * constructFromFile() call to accomodate this (added
36 * parameter to specify whether binary loading is to
38 * 05/06/97 helena Added memory allocation error check.
39 * 6/20/97 helena Java class name change.
40 * 6/23/97 helena Adding comments to make code more readable.
41 * 09/03/97 helena Added createCollationKeyValues().
42 * 06/26/98 erm Changes for CollationKeys using byte arrays.
43 * 08/10/98 erm Synched with 1.2 version of RuleBasedCollator.java
44 * 04/23/99 stephen Removed EDecompositionMode, merged with
46 * 06/14/99 stephen Removed kResourceBundleSuffix
47 * 06/22/99 stephen Fixed logic in constructFromFile() since .ctx
48 * files are no longer used.
49 * 11/02/99 helena Collator performance enhancements. Special case
50 * for NO_OP situations.
51 * 11/17/99 srl More performance enhancements. Inlined some internal functions.
52 * 12/15/99 aliu Update to support Thai collation. Move NormalizerIterator
53 * to implementation file.
54 * 01/29/01 synwee Modified into a C++ wrapper calling C APIs (ucol.h)
57 #include <typeinfo> // for 'typeid' to work
59 #include "unicode/utypes.h"
61 #if !UCONFIG_NO_COLLATION
63 #include "unicode/tblcoll.h"
64 #include "unicode/coleitr.h"
65 #include "unicode/ures.h"
66 #include "unicode/uset.h"
75 /* public RuleBasedCollator constructor ---------------------------------- */
80 * Copy constructor, aliasing, not write-through
82 RuleBasedCollator::RuleBasedCollator(const RuleBasedCollator
& that
)
85 , isWriteThroughAlias(FALSE
)
88 RuleBasedCollator::operator=(that
);
91 RuleBasedCollator::RuleBasedCollator(const UnicodeString
& rules
,
96 UCOL_DEFAULT_STRENGTH
,
101 RuleBasedCollator::RuleBasedCollator(const UnicodeString
& rules
,
102 ECollationStrength collationStrength
,
103 UErrorCode
& status
) : dataIsOwned(FALSE
)
106 getUCollationStrength(collationStrength
),
111 RuleBasedCollator::RuleBasedCollator(const UnicodeString
& rules
,
112 UColAttributeValue decompositionMode
,
113 UErrorCode
& status
) :
117 UCOL_DEFAULT_STRENGTH
,
122 RuleBasedCollator::RuleBasedCollator(const UnicodeString
& rules
,
123 ECollationStrength collationStrength
,
124 UColAttributeValue decompositionMode
,
125 UErrorCode
& status
) : dataIsOwned(FALSE
)
128 getUCollationStrength(collationStrength
),
132 RuleBasedCollator::RuleBasedCollator(const uint8_t *bin
, int32_t length
,
133 const RuleBasedCollator
*base
,
134 UErrorCode
&status
) :
136 isWriteThroughAlias(FALSE
)
138 ucollator
= ucol_openBinary(bin
, length
, base
->ucollator
, &status
);
142 RuleBasedCollator::setRuleStringFromCollator()
145 const UChar
*r
= ucol_getRules(ucollator
, &length
);
147 if (r
&& length
> 0) {
148 // alias the rules string
149 urulestring
.setTo(TRUE
, r
, length
);
152 urulestring
.truncate(0); // Clear string.
156 // not aliasing, not write-through
158 RuleBasedCollator::construct(const UnicodeString
& rules
,
159 UColAttributeValue collationStrength
,
160 UColAttributeValue decompositionMode
,
163 ucollator
= ucol_openRules(rules
.getBuffer(), rules
.length(),
164 decompositionMode
, collationStrength
,
167 dataIsOwned
= TRUE
; // since we own a collator now, we need to get rid of it
168 isWriteThroughAlias
= FALSE
;
170 if(ucollator
== NULL
) {
171 if(U_SUCCESS(status
)) {
172 status
= U_MEMORY_ALLOCATION_ERROR
;
177 setRuleStringFromCollator();
180 /* RuleBasedCollator public destructor ----------------------------------- */
182 RuleBasedCollator::~RuleBasedCollator()
186 ucol_close(ucollator
);
191 /* RuleBaseCollator public methods --------------------------------------- */
193 UBool
RuleBasedCollator::operator==(const Collator
& that
) const
195 /* only checks for address equals here */
196 if (Collator::operator==(that
))
199 if (typeid(*this) != typeid(that
))
200 return FALSE
; /* not the same class */
202 RuleBasedCollator
& thatAlias
= (RuleBasedCollator
&)that
;
204 // weiv: use C function, commented code below is wrong
205 return ucol_equals(this->ucollator
, thatAlias
.ucollator
);
207 synwee : orginal code does not check for data compatibility
210 if (ucollator != thatAlias.ucollator)
217 UBool
RuleBasedCollator::operator!=(const Collator
& other
) const
219 return !(*this == other
);
222 // aliasing, not write-through
223 RuleBasedCollator
& RuleBasedCollator::operator=(const RuleBasedCollator
& that
)
229 ucol_close(ucollator
);
232 urulestring
.truncate(0); // empty the rule string
234 isWriteThroughAlias
= FALSE
;
236 UErrorCode intStatus
= U_ZERO_ERROR
;
237 int32_t buffersize
= U_COL_SAFECLONE_BUFFERSIZE
;
238 ucollator
= ucol_safeClone(that
.ucollator
, NULL
, &buffersize
,
240 if (U_SUCCESS(intStatus
)) {
241 setRuleStringFromCollator();
247 // aliasing, not write-through
248 Collator
* RuleBasedCollator::clone() const
250 return new RuleBasedCollator(*this);
254 CollationElementIterator
* RuleBasedCollator::createCollationElementIterator
255 (const UnicodeString
& source
) const
257 UErrorCode status
= U_ZERO_ERROR
;
258 CollationElementIterator
*result
= new CollationElementIterator(source
, this,
260 if (U_FAILURE(status
)) {
269 * Create a CollationElementIterator object that will iterate over the
270 * elements in a string, using the collation rules defined in this
273 CollationElementIterator
* RuleBasedCollator::createCollationElementIterator
274 (const CharacterIterator
& source
) const
276 UErrorCode status
= U_ZERO_ERROR
;
277 CollationElementIterator
*result
= new CollationElementIterator(source
, this,
280 if (U_FAILURE(status
)) {
289 * Return a string representation of this collator's rules. The string can
290 * later be passed to the constructor that takes a UnicodeString argument,
291 * which will construct a collator that's functionally identical to this one.
292 * You can also allow users to edit the string in order to change the collation
293 * data, or you can print it out for inspection, or whatever.
295 const UnicodeString
& RuleBasedCollator::getRules() const
300 void RuleBasedCollator::getRules(UColRuleOption delta
, UnicodeString
&buffer
)
302 int32_t rulesize
= ucol_getRulesEx(ucollator
, delta
, NULL
, -1);
305 UChar
*rules
= (UChar
*) uprv_malloc( sizeof(UChar
) * (rulesize
) );
307 ucol_getRulesEx(ucollator
, delta
, rules
, rulesize
);
308 buffer
.setTo(rules
, rulesize
);
310 } else { // couldn't allocate
320 RuleBasedCollator::getTailoredSet(UErrorCode
&status
) const
322 if(U_FAILURE(status
)) {
325 return (UnicodeSet
*)ucol_getTailoredSet(this->ucollator
, &status
);
329 void RuleBasedCollator::getVersion(UVersionInfo versionInfo
) const
331 if (versionInfo
!=NULL
){
332 ucol_getVersion(ucollator
, versionInfo
);
336 Collator::EComparisonResult
RuleBasedCollator::compare(
337 const UnicodeString
& source
,
338 const UnicodeString
& target
,
339 int32_t length
) const
341 UErrorCode status
= U_ZERO_ERROR
;
342 return getEComparisonResult(compare(source
.getBuffer(), uprv_min(length
,source
.length()), target
.getBuffer(), uprv_min(length
,target
.length()), status
));
345 UCollationResult
RuleBasedCollator::compare(
346 const UnicodeString
& source
,
347 const UnicodeString
& target
,
349 UErrorCode
&status
) const
351 return compare(source
.getBuffer(), uprv_min(length
,source
.length()), target
.getBuffer(), uprv_min(length
,target
.length()), status
);
354 Collator::EComparisonResult
RuleBasedCollator::compare(const UChar
* source
,
355 int32_t sourceLength
,
357 int32_t targetLength
)
360 return getEComparisonResult(ucol_strcoll(ucollator
, source
, sourceLength
,
361 target
, targetLength
));
364 UCollationResult
RuleBasedCollator::compare(const UChar
* source
,
365 int32_t sourceLength
,
367 int32_t targetLength
,
368 UErrorCode
&status
) const
370 if(U_SUCCESS(status
)) {
371 return ucol_strcoll(ucollator
, source
, sourceLength
, target
, targetLength
);
378 * Compare two strings using this collator
380 Collator::EComparisonResult
RuleBasedCollator::compare(
381 const UnicodeString
& source
,
382 const UnicodeString
& target
) const
384 return getEComparisonResult(ucol_strcoll(ucollator
, source
.getBuffer(), source
.length(),
385 target
.getBuffer(), target
.length()));
388 UCollationResult
RuleBasedCollator::compare(
389 const UnicodeString
& source
,
390 const UnicodeString
& target
,
391 UErrorCode
&status
) const
393 if(U_SUCCESS(status
)) {
394 return ucol_strcoll(ucollator
, source
.getBuffer(), source
.length(),
395 target
.getBuffer(), target
.length());
401 UCollationResult
RuleBasedCollator::compare(UCharIterator
&sIter
,
402 UCharIterator
&tIter
,
403 UErrorCode
&status
) const {
404 if(U_SUCCESS(status
)) {
405 return ucol_strcollIter(ucollator
, &sIter
, &tIter
, &status
);
412 * Retrieve a collation key for the specified string. The key can be compared
413 * with other collation keys using a bitwise comparison (e.g. memcmp) to find
414 * the ordering of their respective source strings. This is handy when doing a
415 * sort, where each sort key must be compared many times.
417 * The basic algorithm here is to find all of the collation elements for each
418 * character in the source string, convert them to an ASCII representation, and
419 * put them into the collation key. But it's trickier than that. Each
420 * collation element in a string has three components: primary ('A' vs 'B'),
421 * secondary ('u' vs '\u00FC'), and tertiary ('A' vs 'a'), and a primary difference
422 * at the end of a string takes precedence over a secondary or tertiary
423 * difference earlier in the string.
425 * To account for this, we put all of the primary orders at the beginning of
426 * the string, followed by the secondary and tertiary orders. Each set of
427 * orders is terminated by nulls so that a key for a string which is a initial
428 * substring of another key will compare less without any special case.
430 * Here's a hypothetical example, with the collation element represented as a
431 * three-digit number, one digit for primary, one for secondary, etc.
433 * String: A a B \u00C9
434 * Collation Elements: 101 100 201 511
435 * Collation Key: 1125<null>0001<null>1011<null>
437 * To make things even trickier, secondary differences (accent marks) are
438 * compared starting at the *end* of the string in languages with French
439 * secondary ordering. But when comparing the accent marks on a single base
440 * character, they are compared from the beginning. To handle this, we reverse
441 * all of the accents that belong to each base character, then we reverse the
442 * entire string of secondary orderings at the end.
444 CollationKey
& RuleBasedCollator::getCollationKey(
445 const UnicodeString
& source
,
446 CollationKey
& sortkey
,
447 UErrorCode
& status
) const
449 return getCollationKey(source
.getBuffer(), source
.length(), sortkey
, status
);
452 CollationKey
& RuleBasedCollator::getCollationKey(const UChar
* source
,
454 CollationKey
& sortkey
,
455 UErrorCode
& status
) const
457 if (U_FAILURE(status
)) {
458 return sortkey
.setToBogus();
460 if (sourceLen
< -1 || (source
== NULL
&& sourceLen
!= 0)) {
461 status
= U_ILLEGAL_ARGUMENT_ERROR
;
462 return sortkey
.setToBogus();
466 sourceLen
= u_strlen(source
);
468 if (sourceLen
== 0) {
469 return sortkey
.reset();
473 int32_t resultCapacity
;
474 if (sortkey
.fCapacity
>= (sourceLen
* 3)) {
475 // Try to reuse the CollationKey.fBytes.
476 result
= sortkey
.fBytes
;
477 resultCapacity
= sortkey
.fCapacity
;
482 int32_t resultLen
= ucol_getSortKeyWithAllocation(ucollator
, source
, sourceLen
,
483 result
, resultCapacity
, &status
);
485 if (U_SUCCESS(status
)) {
486 if (result
== sortkey
.fBytes
) {
487 sortkey
.setLength(resultLen
);
489 sortkey
.adopt(result
, resultCapacity
, resultLen
);
492 if (result
!= sortkey
.fBytes
) {
495 sortkey
.setToBogus();
501 * Return the maximum length of any expansion sequences that end with the
502 * specified comparison order.
503 * @param order a collation order returned by previous or next.
504 * @return the maximum length of any expansion seuences ending with the
505 * specified order or 1 if collation order does not occur at the end of any
506 * expansion sequence.
507 * @see CollationElementIterator#getMaxExpansion
509 int32_t RuleBasedCollator::getMaxExpansion(int32_t order
) const
512 UCOL_GETMAXEXPANSION(ucollator
, (uint32_t)order
, result
);
516 uint8_t* RuleBasedCollator::cloneRuleData(int32_t &length
,
519 return ucol_cloneRuleData(ucollator
, &length
, &status
);
523 int32_t RuleBasedCollator::cloneBinary(uint8_t *buffer
, int32_t capacity
, UErrorCode
&status
)
525 return ucol_cloneBinary(ucollator
, buffer
, capacity
, &status
);
528 void RuleBasedCollator::setAttribute(UColAttribute attr
,
529 UColAttributeValue value
,
532 if (U_FAILURE(status
))
535 ucol_setAttribute(ucollator
, attr
, value
, &status
);
538 UColAttributeValue
RuleBasedCollator::getAttribute(UColAttribute attr
,
541 if (U_FAILURE(status
))
543 return ucol_getAttribute(ucollator
, attr
, &status
);
546 uint32_t RuleBasedCollator::setVariableTop(const UChar
*varTop
, int32_t len
, UErrorCode
&status
) {
548 return ucol_setVariableTop(ucollator
, varTop
, len
, &status
);
551 uint32_t RuleBasedCollator::setVariableTop(const UnicodeString varTop
, UErrorCode
&status
) {
553 return ucol_setVariableTop(ucollator
, varTop
.getBuffer(), varTop
.length(), &status
);
556 void RuleBasedCollator::setVariableTop(const uint32_t varTop
, UErrorCode
&status
) {
558 ucol_restoreVariableTop(ucollator
, varTop
, &status
);
561 uint32_t RuleBasedCollator::getVariableTop(UErrorCode
&status
) const {
562 return ucol_getVariableTop(ucollator
, &status
);
565 Collator
* RuleBasedCollator::safeClone(void)
567 UErrorCode intStatus
= U_ZERO_ERROR
;
568 int32_t buffersize
= U_COL_SAFECLONE_BUFFERSIZE
;
569 UCollator
*ucol
= ucol_safeClone(ucollator
, NULL
, &buffersize
,
571 if (U_FAILURE(intStatus
)) {
575 RuleBasedCollator
*result
= new RuleBasedCollator();
576 // Null pointer check
577 if (result
!= NULL
) {
578 result
->ucollator
= ucol
;
579 result
->dataIsOwned
= TRUE
;
580 result
->isWriteThroughAlias
= FALSE
;
581 setRuleStringFromCollator();
588 int32_t RuleBasedCollator::getSortKey(const UnicodeString
& source
,
589 uint8_t *result
, int32_t resultLength
)
592 return ucol_getSortKey(ucollator
, source
.getBuffer(), source
.length(), result
, resultLength
);
595 int32_t RuleBasedCollator::getSortKey(const UChar
*source
,
596 int32_t sourceLength
, uint8_t *result
,
597 int32_t resultLength
) const
599 return ucol_getSortKey(ucollator
, source
, sourceLength
, result
, resultLength
);
602 Collator::ECollationStrength
RuleBasedCollator::getStrength(void) const
604 UErrorCode intStatus
= U_ZERO_ERROR
;
605 return getECollationStrength(ucol_getAttribute(ucollator
, UCOL_STRENGTH
,
609 void RuleBasedCollator::setStrength(ECollationStrength newStrength
)
612 UErrorCode intStatus
= U_ZERO_ERROR
;
613 UCollationStrength strength
= getUCollationStrength(newStrength
);
614 ucol_setAttribute(ucollator
, UCOL_STRENGTH
, strength
, &intStatus
);
617 int32_t RuleBasedCollator::getReorderCodes(int32_t *dest
,
618 int32_t destCapacity
,
619 UErrorCode
& status
) const
621 return ucol_getReorderCodes(ucollator
, dest
, destCapacity
, &status
);
624 void RuleBasedCollator::setReorderCodes(const int32_t *reorderCodes
,
625 int32_t reorderCodesLength
,
629 ucol_setReorderCodes(ucollator
, reorderCodes
, reorderCodesLength
, &status
);
632 int32_t RuleBasedCollator::getEquivalentReorderCodes(int32_t reorderCode
,
634 int32_t destCapacity
,
637 return ucol_getEquivalentReorderCodes(reorderCode
, dest
, destCapacity
, &status
);
641 * Create a hash code for this collation. Just hash the main rule table -- that
642 * should be good enough for almost any use.
644 int32_t RuleBasedCollator::hashCode() const
647 const UChar
*rules
= ucol_getRules(ucollator
, &length
);
648 return ustr_hashUCharsN(rules
, length
);
652 * return the locale of this collator
654 const Locale
RuleBasedCollator::getLocale(ULocDataLocaleType type
, UErrorCode
&status
) const {
655 const char *result
= ucol_getLocaleByType(ucollator
, type
, &status
);
661 return Locale(result
);
666 RuleBasedCollator::setLocales(const Locale
& requestedLocale
, const Locale
& validLocale
, const Locale
& actualLocale
) {
668 char* rloc
= uprv_strdup(requestedLocale
.getName());
670 char* vloc
= uprv_strdup(validLocale
.getName());
672 char* aloc
= uprv_strdup(actualLocale
.getName());
674 ucol_setReqValidLocales(ucollator
, rloc
, vloc
, aloc
);
683 // RuleBaseCollatorNew private constructor ----------------------------------
685 RuleBasedCollator::RuleBasedCollator()
686 : dataIsOwned(FALSE
), isWriteThroughAlias(FALSE
), ucollator(NULL
)
690 RuleBasedCollator::RuleBasedCollator(const Locale
& desiredLocale
,
692 : dataIsOwned(FALSE
), isWriteThroughAlias(FALSE
), ucollator(NULL
)
694 if (U_FAILURE(status
))
698 Try to load, in order:
699 1. The desired locale's collation.
700 2. A fallback of the desired locale.
701 3. The default locale's collation.
702 4. A fallback of the default locale.
703 5. The default collation rules, which contains en_US collation rules.
705 To reiterate, we try:
707 language+country+variant
711 language+country+variant
714 Root: (aka DEFAULTRULES)
715 steps 1-5 are handled by resource bundle fallback mechanism.
716 however, in a very unprobable situation that no resource bundle
717 data exists, step 5 is repeated with hardcoded default rules.
720 setUCollator(desiredLocale
, status
);
722 if (U_FAILURE(status
))
724 status
= U_ZERO_ERROR
;
726 setUCollator(kRootLocaleName
, status
);
727 if (status
== U_ZERO_ERROR
) {
728 status
= U_USING_DEFAULT_WARNING
;
732 if (U_SUCCESS(status
))
734 setRuleStringFromCollator();
739 RuleBasedCollator::setUCollator(const char *locale
,
742 if (U_FAILURE(status
)) {
745 if (ucollator
&& dataIsOwned
)
746 ucol_close(ucollator
);
747 ucollator
= ucol_open_internal(locale
, &status
);
749 isWriteThroughAlias
= FALSE
;
754 RuleBasedCollator::checkOwned() {
755 if (!(dataIsOwned
|| isWriteThroughAlias
)) {
756 UErrorCode status
= U_ZERO_ERROR
;
757 ucollator
= ucol_safeClone(ucollator
, NULL
, NULL
, &status
);
758 setRuleStringFromCollator();
760 isWriteThroughAlias
= FALSE
;
765 int32_t RuleBasedCollator::internalGetShortDefinitionString(const char *locale
,
768 UErrorCode
&status
) const {
769 /* simply delegate */
770 return ucol_getShortDefinitionString(ucollator
, locale
, buffer
, capacity
, &status
);
774 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedCollator
)
778 #endif /* #if !UCONFIG_NO_COLLATION */