2 ******************************************************************************
3 * Copyright (C) 1996-2012, International Business Machines Corporation and
4 * others. All Rights Reserved.
5 ******************************************************************************
11 * Created by: Helena Shih
13 * Modification History:
15 * Date Name Description
16 * 2/5/97 aliu Added streamIn and streamOut methods. Added
17 * constructor which reads RuleBasedCollator object from
18 * a binary file. Added writeToFile method which streams
19 * RuleBasedCollator out to a binary file. The streamIn
20 * and streamOut methods use istream and ostream objects
22 * 2/11/97 aliu Moved declarations out of for loop initializer.
23 * Added Mac compatibility #ifdef for ios::nocreate.
24 * 2/12/97 aliu Modified to use TableCollationData sub-object to
25 * hold invariant data.
26 * 2/13/97 aliu Moved several methods into this class from Collation.
27 * Added a private RuleBasedCollator(Locale&) constructor,
28 * to be used by Collator::getInstance(). General
29 * clean up. Made use of UErrorCode variables consistent.
30 * 2/20/97 helena Added clone, operator==, operator!=, operator=, and copy
31 * constructor and getDynamicClassID.
32 * 3/5/97 aliu Changed compaction cycle to improve performance. We
33 * use the maximum allowable value which is kBlockCount.
34 * Modified getRules() to load rules dynamically. Changed
35 * constructFromFile() call to accommodate this (added
36 * parameter to specify whether binary loading is to
38 * 05/06/97 helena Added memory allocation error check.
39 * 6/20/97 helena Java class name change.
40 * 6/23/97 helena Adding comments to make code more readable.
41 * 09/03/97 helena Added createCollationKeyValues().
42 * 06/26/98 erm Changes for CollationKeys using byte arrays.
43 * 08/10/98 erm Synched with 1.2 version of RuleBasedCollator.java
44 * 04/23/99 stephen Removed EDecompositionMode, merged with
46 * 06/14/99 stephen Removed kResourceBundleSuffix
47 * 06/22/99 stephen Fixed logic in constructFromFile() since .ctx
48 * files are no longer used.
49 * 11/02/99 helena Collator performance enhancements. Special case
50 * for NO_OP situations.
51 * 11/17/99 srl More performance enhancements. Inlined some internal functions.
52 * 12/15/99 aliu Update to support Thai collation. Move NormalizerIterator
53 * to implementation file.
54 * 01/29/01 synwee Modified into a C++ wrapper calling C APIs (ucol.h)
57 #include "unicode/utypes.h"
59 #if !UCONFIG_NO_COLLATION
61 #include "unicode/tblcoll.h"
62 #include "unicode/coleitr.h"
63 #include "unicode/ures.h"
64 #include "unicode/uset.h"
73 /* public RuleBasedCollator constructor ---------------------------------- */
78 * Copy constructor, aliasing, not write-through
// Copy constructor. The visible code initializes isWriteThroughAlias to FALSE
// and then performs the copy by calling RuleBasedCollator::operator=(that).
// NOTE(review): the remaining member initializers are not visible in this view.
80 RuleBasedCollator::RuleBasedCollator(const RuleBasedCollator
& that
)
83 , isWriteThroughAlias(FALSE
)
86 RuleBasedCollator::operator=(that
);
// Constructor taking a rule string. UCOL_DEFAULT_STRENGTH appears as an
// argument in the body, presumably the strength handed to construct() -- TODO
// confirm; the surrounding call is not visible in this view.
89 RuleBasedCollator::RuleBasedCollator(const UnicodeString
& rules
,
94 UCOL_DEFAULT_STRENGTH
,
// Constructor taking a rule string and an ECollationStrength.
// dataIsOwned starts out FALSE; collationStrength is cast to UColAttributeValue
// before being passed on (the receiving call is not visible in this view,
// presumably construct() -- TODO confirm).
99 RuleBasedCollator::RuleBasedCollator(const UnicodeString
& rules
,
100 ECollationStrength collationStrength
,
101 UErrorCode
& status
) : dataIsOwned(FALSE
)
104 (UColAttributeValue
)collationStrength
,
// Constructor taking a rule string and a decomposition mode.
// UCOL_DEFAULT_STRENGTH appears as an argument in the body, presumably the
// strength handed to construct() -- TODO confirm; the initializer list and the
// surrounding call are not visible in this view.
109 RuleBasedCollator::RuleBasedCollator(const UnicodeString
& rules
,
110 UColAttributeValue decompositionMode
,
111 UErrorCode
& status
) :
115 UCOL_DEFAULT_STRENGTH
,
// Constructor taking a rule string, an ECollationStrength and a decomposition
// mode. dataIsOwned starts out FALSE; collationStrength is cast to
// UColAttributeValue before being passed on (the receiving call is not visible
// in this view, presumably construct() -- TODO confirm).
120 RuleBasedCollator::RuleBasedCollator(const UnicodeString
& rules
,
121 ECollationStrength collationStrength
,
122 UColAttributeValue decompositionMode
,
123 UErrorCode
& status
) : dataIsOwned(FALSE
)
126 (UColAttributeValue
)collationStrength
,
// Constructor that opens the collator from a binary image.
// ucollator is obtained from ucol_openBinary(bin, length, base->ucollator,
// &status); isWriteThroughAlias is initialized to FALSE.
// NOTE(review): base is dereferenced without a null check in the visible code,
// so callers are expected to pass a non-null base -- confirm against the header.
130 RuleBasedCollator::RuleBasedCollator(const uint8_t *bin
, int32_t length
,
131 const RuleBasedCollator
*base
,
132 UErrorCode
&status
) :
134 isWriteThroughAlias(FALSE
)
136 ucollator
= ucol_openBinary(bin
, length
, base
->ucollator
, &status
);
// Refreshes urulestring from the wrapped collator.
// The rules are fetched with ucol_getRules; when a non-null pointer with a
// positive length comes back, urulestring is set to alias that buffer (setTo
// with TRUE as first argument). The truncate(0) call clears the string,
// presumably in the else branch -- the branch keyword is not visible here.
140 RuleBasedCollator::setRuleStringFromCollator()
143 const UChar
*r
= ucol_getRules(ucollator
, &length
);
145 if (r
&& length
> 0) {
146 // alias the rules string
147 urulestring
.setTo(TRUE
, r
, length
);
150 urulestring
.truncate(0); // Clear string.
154 // not aliasing, not write-through
// Builds the wrapped collator from a rule string.
// ucollator is created by ucol_openRules from the rule buffer and length with
// the given decomposition mode and strength. Afterwards dataIsOwned is set to
// TRUE and isWriteThroughAlias to FALSE. If ucollator is NULL while status
// still reports success, status is changed to U_MEMORY_ALLOCATION_ERROR.
// setRuleStringFromCollator() is the last visible statement.
// NOTE(review): the remaining parameters and the branch structure around the
// final call are not visible in this view.
156 RuleBasedCollator::construct(const UnicodeString
& rules
,
157 UColAttributeValue collationStrength
,
158 UColAttributeValue decompositionMode
,
161 ucollator
= ucol_openRules(rules
.getBuffer(), rules
.length(),
162 decompositionMode
, collationStrength
,
165 dataIsOwned
= TRUE
; // since we own a collator now, we need to get rid of it
166 isWriteThroughAlias
= FALSE
;
168 if(ucollator
== NULL
) {
169 if(U_SUCCESS(status
)) {
170 status
= U_MEMORY_ALLOCATION_ERROR
;
175 setRuleStringFromCollator();
178 /* RuleBasedCollator public destructor ----------------------------------- */
// Destructor: releases the wrapped collator with ucol_close(ucollator).
// NOTE(review): any condition guarding the close is not visible in this view.
180 RuleBasedCollator::~RuleBasedCollator()
184 ucol_close(ucollator
);
189 /* RuleBasedCollator public methods -------------------------------------- */
// Equality comparison against another Collator.
// Returns FALSE when Collator::operator==(that) fails (the other object is not
// of the same class). Otherwise that is cast to RuleBasedCollator and the
// result of ucol_equals on the two wrapped collators is returned.
// NOTE(review): the address-equality shortcut mentioned by the inline comment
// is not visible in this view.
191 UBool
RuleBasedCollator::operator==(const Collator
& that
) const
193 /* only checks for address equals here */
197 if (!Collator::operator==(that
)) {
198 return FALSE
; /* not the same class */
201 RuleBasedCollator
& thatAlias
= (RuleBasedCollator
&)that
;
203 return ucol_equals(this->ucollator
, thatAlias
.ucollator
);
206 // aliasing, not write-through
// Assignment operator.
// Self-assignment returns *this immediately. Otherwise that.ucollator is
// cloned with ucol_safeClone using a local error code; if the clone fails the
// function returns *this before the current collator is closed. On success
// the current ucollator is passed to ucol_close, isWriteThroughAlias is set to
// FALSE and the rule string is refreshed via setRuleStringFromCollator().
// NOTE(review): the statements that install the clone and the final return
// are not visible in this view.
207 RuleBasedCollator
& RuleBasedCollator::operator=(const RuleBasedCollator
& that
)
209 if (this == &that
) { return *this; }
211 UErrorCode intStatus
= U_ZERO_ERROR
;
212 int32_t buffersize
= U_COL_SAFECLONE_BUFFERSIZE
;
213 UCollator
*ucol
= ucol_safeClone(that
.ucollator
, NULL
, &buffersize
, &intStatus
);
214 if (U_FAILURE(intStatus
)) { return *this; }
217 ucol_close(ucollator
);
221 isWriteThroughAlias
= FALSE
;
222 setRuleStringFromCollator();
226 // aliasing, not write-through
// Creates a heap-allocated copy of this collator via the copy constructor.
// The visible check detects a copy that was allocated but whose ucollator is
// NULL, i.e. the internal ucol_safeClone() failed.
// NOTE(review): the handling of that case and the return statement are not
// visible in this view.
227 Collator
* RuleBasedCollator::clone() const
229 RuleBasedCollator
* coll
= new RuleBasedCollator(*this);
230 // There is a small chance that the internal ucol_safeClone() call fails.
231 if (coll
!= NULL
&& coll
->ucollator
== NULL
) {
// Creates a CollationElementIterator over a UnicodeString for this collator.
// The iterator is allocated with new, using a local UErrorCode that starts at
// U_ZERO_ERROR and is tested with U_FAILURE afterwards.
// NOTE(review): the failure handling and the return statement are not visible
// in this view.
239 CollationElementIterator
* RuleBasedCollator::createCollationElementIterator
240 (const UnicodeString
& source
) const
242 UErrorCode status
= U_ZERO_ERROR
;
243 CollationElementIterator
*result
= new CollationElementIterator(source
, this,
245 if (U_FAILURE(status
)) {
254 * Create a CollationElementIterator object that will iterate over the
255 * elements in a string, using the collation rules defined in this
// Creates a CollationElementIterator over a CharacterIterator for this
// collator. The iterator is allocated with new, using a local UErrorCode that
// starts at U_ZERO_ERROR and is tested with U_FAILURE afterwards.
// NOTE(review): the failure handling and the return statement are not visible
// in this view.
258 CollationElementIterator
* RuleBasedCollator::createCollationElementIterator
259 (const CharacterIterator
& source
) const
261 UErrorCode status
= U_ZERO_ERROR
;
262 CollationElementIterator
*result
= new CollationElementIterator(source
, this,
265 if (U_FAILURE(status
)) {
274 * Return a string representation of this collator's rules. The string can
275 * later be passed to the constructor that takes a UnicodeString argument,
276 * which will construct a collator that's functionally identical to this one.
277 * You can also allow users to edit the string in order to change the collation
278 * data, or you can print it out for inspection, or whatever.
// Accessor returning a const reference to a UnicodeString holding the rules.
// NOTE(review): the body is not visible in this view; presumably it returns
// the urulestring member -- TODO confirm.
280 const UnicodeString
& RuleBasedCollator::getRules() const
// Copies the rules selected by delta into buffer.
// A first ucol_getRulesEx call with a NULL destination yields the required
// size; a UChar array of that size is allocated with uprv_malloc, filled by a
// second ucol_getRulesEx call and copied into buffer with setTo(rules,
// rulesize). The else branch covers the allocation-failure case.
// NOTE(review): the size and null checks, the release of the temporary array
// and the bodies of the else branches are not visible in this view.
285 void RuleBasedCollator::getRules(UColRuleOption delta
, UnicodeString
&buffer
)
287 int32_t rulesize
= ucol_getRulesEx(ucollator
, delta
, NULL
, -1);
290 UChar
*rules
= (UChar
*) uprv_malloc( sizeof(UChar
) * (rulesize
) );
292 ucol_getRulesEx(ucollator
, delta
, rules
, rulesize
);
293 buffer
.setTo(rules
, rulesize
);
295 } else { // couldn't allocate
// Returns the set of tailored code points.
// status is checked with U_FAILURE first (the early-exit body is not visible
// in this view); otherwise the result of ucol_getTailoredSet on the wrapped
// collator is cast to UnicodeSet* and returned.
305 RuleBasedCollator::getTailoredSet(UErrorCode
&status
) const
307 if(U_FAILURE(status
)) {
310 return (UnicodeSet
*)ucol_getTailoredSet(this->ucollator
, &status
);
// Fills versionInfo with the collator version by delegating to
// ucol_getVersion; nothing is written when versionInfo is NULL.
314 void RuleBasedCollator::getVersion(UVersionInfo versionInfo
) const
316 if (versionInfo
!=NULL
){
317 ucol_getVersion(ucollator
, versionInfo
);
322 * Compare two strings using this collator
// Compares at most length code units of source and target.
// Forwards to the UChar* overload, clamping each length to
// uprv_min(length, string.length()).
// NOTE(review): the declaration of the length parameter is not visible in this
// view.
324 UCollationResult
RuleBasedCollator::compare(
325 const UnicodeString
& source
,
326 const UnicodeString
& target
,
328 UErrorCode
&status
) const
330 return compare(source
.getBuffer(), uprv_min(length
,source
.length()), target
.getBuffer(), uprv_min(length
,target
.length()), status
);
// Compares two UChar buffers with explicit lengths.
// When status reports success the result of ucol_strcoll on the wrapped
// collator is returned.
// NOTE(review): the target parameter declaration and the value returned on a
// failing status are not visible in this view.
333 UCollationResult
RuleBasedCollator::compare(const UChar
* source
,
334 int32_t sourceLength
,
336 int32_t targetLength
,
337 UErrorCode
&status
) const
339 if(U_SUCCESS(status
)) {
340 return ucol_strcoll(ucollator
, source
, sourceLength
, target
, targetLength
);
// Compares two UnicodeStrings in full.
// When status reports success the result of ucol_strcoll over the two string
// buffers and lengths is returned.
// NOTE(review): the value returned on a failing status is not visible in this
// view.
346 UCollationResult
RuleBasedCollator::compare(
347 const UnicodeString
& source
,
348 const UnicodeString
& target
,
349 UErrorCode
&status
) const
351 if(U_SUCCESS(status
)) {
352 return ucol_strcoll(ucollator
, source
.getBuffer(), source
.length(),
353 target
.getBuffer(), target
.length());
// Compares the text delivered by two UCharIterators.
// When status reports success the result of ucol_strcollIter is returned;
// status is passed through to that call.
// NOTE(review): the value returned on a failing status is not visible in this
// view.
359 UCollationResult
RuleBasedCollator::compare(UCharIterator
&sIter
,
360 UCharIterator
&tIter
,
361 UErrorCode
&status
) const {
362 if(U_SUCCESS(status
)) {
363 return ucol_strcollIter(ucollator
, &sIter
, &tIter
, &status
);
370 * Retrieve a collation key for the specified string. The key can be compared
371 * with other collation keys using a bitwise comparison (e.g. memcmp) to find
372 * the ordering of their respective source strings. This is handy when doing a
373 * sort, where each sort key must be compared many times.
375 * The basic algorithm here is to find all of the collation elements for each
376 * character in the source string, convert them to an ASCII representation, and
377 * put them into the collation key. But it's trickier than that. Each
378 * collation element in a string has three components: primary ('A' vs 'B'),
379 * secondary ('u' vs '\u00FC'), and tertiary ('A' vs 'a'), and a primary difference
380 * at the end of a string takes precedence over a secondary or tertiary
381 * difference earlier in the string.
383 * To account for this, we put all of the primary orders at the beginning of
384 * the string, followed by the secondary and tertiary orders. Each set of
385 * orders is terminated by nulls so that a key for a string which is an initial
386 * substring of another key will compare less without any special case.
388 * Here's a hypothetical example, with the collation element represented as a
389 * three-digit number, one digit for primary, one for secondary, etc.
391 * String: A a B \u00C9
392 * Collation Elements: 101 100 201 511
393 * Collation Key: 1125<null>0001<null>1011<null>
395 * To make things even trickier, secondary differences (accent marks) are
396 * compared starting at the *end* of the string in languages with French
397 * secondary ordering. But when comparing the accent marks on a single base
398 * character, they are compared from the beginning. To handle this, we reverse
399 * all of the accents that belong to each base character, then we reverse the
400 * entire string of secondary orderings at the end.
// Computes the collation key of a UnicodeString by forwarding its buffer and
// length to the UChar* overload; returns the reference that overload returns.
402 CollationKey
& RuleBasedCollator::getCollationKey(
403 const UnicodeString
& source
,
404 CollationKey
& sortkey
,
405 UErrorCode
& status
) const
407 return getCollationKey(source
.getBuffer(), source
.length(), sortkey
, status
);
// Computes the collation key of a UChar buffer into sortkey.
//  - A failing incoming status returns sortkey.setToBogus().
//  - sourceLen below -1, or a NULL source with a non-zero length, sets status
//    to U_ILLEGAL_ARGUMENT_ERROR and returns a bogus key.
//  - sourceLen may be recomputed with u_strlen(source); the guarding condition
//    is not visible here (presumably sourceLen == -1 -- TODO confirm).
//  - A zero length returns sortkey.reset().
//  - Otherwise ucol_getCollationKey fills the key; on success the key length
//    is set to the returned length, and setToBogus() is the alternative path.
// NOTE(review): the sourceLen parameter declaration, the else keyword and the
// final return are not visible in this view.
410 CollationKey
& RuleBasedCollator::getCollationKey(const UChar
* source
,
412 CollationKey
& sortkey
,
413 UErrorCode
& status
) const
415 if (U_FAILURE(status
)) {
416 return sortkey
.setToBogus();
418 if (sourceLen
< -1 || (source
== NULL
&& sourceLen
!= 0)) {
419 status
= U_ILLEGAL_ARGUMENT_ERROR
;
420 return sortkey
.setToBogus();
424 sourceLen
= u_strlen(source
);
426 if (sourceLen
== 0) {
427 return sortkey
.reset();
430 int32_t resultLen
= ucol_getCollationKey(ucollator
, source
, sourceLen
, sortkey
, status
);
432 if (U_SUCCESS(status
)) {
433 sortkey
.setLength(resultLen
);
435 sortkey
.setToBogus();
441 * Return the maximum length of any expansion sequences that end with the
442 * specified comparison order.
443 * @param order a collation order returned by previous or next.
444 * @return the maximum length of any expansion sequences ending with the
445 * specified order or 1 if collation order does not occur at the end of any
446 * expansion sequence.
447 * @see CollationElementIterator#getMaxExpansion
// Looks up the maximum expansion length for a collation order using the
// UCOL_GETMAXEXPANSION macro, with order cast to uint32_t and the answer
// stored in result.
// NOTE(review): the declaration and return of result are not visible in this
// view.
449 int32_t RuleBasedCollator::getMaxExpansion(int32_t order
) const
452 UCOL_GETMAXEXPANSION(ucollator
, (uint32_t)order
, result
);
// Returns the value of ucol_cloneRuleData for the wrapped collator; length and
// status are passed by address so the C function can write to them.
// NOTE(review): the status parameter declaration is not visible in this view.
456 uint8_t* RuleBasedCollator::cloneRuleData(int32_t &length
,
459 return ucol_cloneRuleData(ucollator
, &length
, &status
);
// Delegates to ucol_cloneBinary with the caller-supplied buffer and capacity
// and returns its int32_t result; errors are reported through status.
463 int32_t RuleBasedCollator::cloneBinary(uint8_t *buffer
, int32_t capacity
, UErrorCode
&status
)
465 return ucol_cloneBinary(ucollator
, buffer
, capacity
, &status
);
// Sets a collation attribute on the wrapped collator through
// ucol_setAttribute. status is tested with U_FAILURE beforehand.
// NOTE(review): the early-exit statement, the status parameter declaration and
// any other statements before the delegate call are not visible in this view.
468 void RuleBasedCollator::setAttribute(UColAttribute attr
,
469 UColAttributeValue value
,
472 if (U_FAILURE(status
))
475 ucol_setAttribute(ucollator
, attr
, value
, &status
);
// Reads a collation attribute from the wrapped collator through
// ucol_getAttribute. status is tested with U_FAILURE beforehand.
// NOTE(review): the value returned on a failing status is not visible in this
// view.
478 UColAttributeValue
RuleBasedCollator::getAttribute(UColAttribute attr
,
479 UErrorCode
&status
) const
481 if (U_FAILURE(status
))
483 return ucol_getAttribute(ucollator
, attr
, &status
);
// Sets the variable top from a UChar buffer of len code units by delegating to
// ucol_setVariableTop and returns the uint32_t that call produces.
// NOTE(review): a statement between the signature and the return is not
// visible in this view.
486 uint32_t RuleBasedCollator::setVariableTop(const UChar
*varTop
, int32_t len
, UErrorCode
&status
) {
488 return ucol_setVariableTop(ucollator
, varTop
, len
, &status
);
// Sets the variable top from a UnicodeString by passing its buffer and length
// to ucol_setVariableTop and returns the uint32_t that call produces.
// NOTE(review): a statement between the signature and the return is not
// visible in this view.
491 uint32_t RuleBasedCollator::setVariableTop(const UnicodeString
&varTop
, UErrorCode
&status
) {
493 return ucol_setVariableTop(ucollator
, varTop
.getBuffer(), varTop
.length(), &status
);
// Sets the variable top from a previously obtained uint32_t value by
// delegating to ucol_restoreVariableTop.
// NOTE(review): a statement between the signature and the delegate call is not
// visible in this view.
496 void RuleBasedCollator::setVariableTop(uint32_t varTop
, UErrorCode
&status
) {
498 ucol_restoreVariableTop(ucollator
, varTop
, &status
);
// Returns the variable top reported by ucol_getVariableTop for the wrapped
// collator; errors are reported through status.
501 uint32_t RuleBasedCollator::getVariableTop(UErrorCode
&status
) const {
502 return ucol_getVariableTop(ucollator
, &status
);
// Writes the sort key of a UnicodeString into result (capacity resultLength)
// by delegating to ucol_getSortKey; returns that function's int32_t result.
505 int32_t RuleBasedCollator::getSortKey(const UnicodeString
& source
,
506 uint8_t *result
, int32_t resultLength
)
509 return ucol_getSortKey(ucollator
, source
.getBuffer(), source
.length(), result
, resultLength
);
// Writes the sort key of a UChar buffer into result (capacity resultLength)
// by delegating to ucol_getSortKey; returns that function's int32_t result.
512 int32_t RuleBasedCollator::getSortKey(const UChar
*source
,
513 int32_t sourceLength
, uint8_t *result
,
514 int32_t resultLength
) const
516 return ucol_getSortKey(ucollator
, source
, sourceLength
, result
, resultLength
);
// Copies the reorder codes of the wrapped collator into dest (capacity
// destCapacity) via ucol_getReorderCodes and returns that function's result.
519 int32_t RuleBasedCollator::getReorderCodes(int32_t *dest
,
520 int32_t destCapacity
,
521 UErrorCode
& status
) const
523 return ucol_getReorderCodes(ucollator
, dest
, destCapacity
, &status
);
// Installs the given reorder codes on the wrapped collator through
// ucol_setReorderCodes.
// NOTE(review): the status parameter declaration and any statements before the
// delegate call are not visible in this view.
526 void RuleBasedCollator::setReorderCodes(const int32_t *reorderCodes
,
527 int32_t reorderCodesLength
,
531 ucol_setReorderCodes(ucollator
, reorderCodes
, reorderCodesLength
, &status
);
// Returns the result of ucol_getEquivalentReorderCodes for reorderCode,
// writing up to destCapacity codes into dest. The delegate call does not take
// the wrapped ucollator as an argument.
// NOTE(review): the dest and status parameter declarations are not visible in
// this view.
534 int32_t RuleBasedCollator::getEquivalentReorderCodes(int32_t reorderCode
,
536 int32_t destCapacity
,
539 return ucol_getEquivalentReorderCodes(reorderCode
, dest
, destCapacity
, &status
);
543 * Create a hash code for this collation. Just hash the main rule table -- that
544 * should be good enough for almost any use.
// Hash code derived from the rule string: the rules and their length are
// obtained from ucol_getRules and hashed with ustr_hashUCharsN.
// NOTE(review): the declaration of length is not visible in this view.
546 int32_t RuleBasedCollator::hashCode() const
549 const UChar
*rules
= ucol_getRules(ucollator
, &length
);
550 return ustr_hashUCharsN(rules
, length
);
554 * return the locale of this collator
// Returns the locale of the requested type as a Locale object built from the
// name reported by ucol_getLocaleByType.
// NOTE(review): statements between the lookup and the final return are not
// visible in this view.
556 Locale
RuleBasedCollator::getLocale(ULocDataLocaleType type
, UErrorCode
&status
) const {
557 const char *result
= ucol_getLocaleByType(ucollator
, type
, &status
);
563 return Locale(result
);
// Records the requested, valid and actual locales on the wrapped collator.
// Each locale name is duplicated with uprv_strdup and the three copies are
// handed to ucol_setReqValidLocales.
// NOTE(review): the return type, any checks on the uprv_strdup results and any
// cleanup are not visible in this view.
568 RuleBasedCollator::setLocales(const Locale
& requestedLocale
, const Locale
& validLocale
, const Locale
& actualLocale
) {
570 char* rloc
= uprv_strdup(requestedLocale
.getName());
572 char* vloc
= uprv_strdup(validLocale
.getName());
574 char* aloc
= uprv_strdup(actualLocale
.getName());
576 ucol_setReqValidLocales(ucollator
, rloc
, vloc
, aloc
);
585 // RuleBasedCollator private constructor ------------------------------------
// Default constructor: starts with no wrapped collator (ucollator is NULL),
// dataIsOwned FALSE and isWriteThroughAlias FALSE.
587 RuleBasedCollator::RuleBasedCollator()
588 : dataIsOwned(FALSE
), isWriteThroughAlias(FALSE
), ucollator(NULL
)
// Constructor that loads the collator for a locale.
// Members start as dataIsOwned FALSE, isWriteThroughAlias FALSE and ucollator
// NULL. setUCollator(desiredLocale, status) is tried first. If that leaves a
// failure in status, status is reset to U_ZERO_ERROR and the load is retried
// with kRootLocaleName; a clean U_ZERO_ERROR from the retry is replaced by
// U_USING_DEFAULT_WARNING. On success the rule string is refreshed via
// setRuleStringFromCollator().
// NOTE(review): the status parameter declaration, the early exit for an
// incoming failure and the block delimiters are not visible in this view.
592 RuleBasedCollator::RuleBasedCollator(const Locale
& desiredLocale
,
594 : dataIsOwned(FALSE
), isWriteThroughAlias(FALSE
), ucollator(NULL
)
596 if (U_FAILURE(status
))
600 Try to load, in order:
601 1. The desired locale's collation.
602 2. A fallback of the desired locale.
603 3. The default locale's collation.
604 4. A fallback of the default locale.
605 5. The default collation rules, which contains en_US collation rules.
607 To reiterate, we try:
609 language+country+variant
613 language+country+variant
616 Root: (aka DEFAULTRULES)
617 steps 1-5 are handled by resource bundle fallback mechanism.
618 however, in a very unprobable situation that no resource bundle
619 data exists, step 5 is repeated with hardcoded default rules.
622 setUCollator(desiredLocale
, status
);
624 if (U_FAILURE(status
))
626 status
= U_ZERO_ERROR
;
628 setUCollator(kRootLocaleName
, status
);
629 if (status
== U_ZERO_ERROR
) {
630 status
= U_USING_DEFAULT_WARNING
;
634 if (U_SUCCESS(status
))
636 setRuleStringFromCollator();
// Replaces the wrapped collator with one opened for the given locale name.
// status is tested with U_FAILURE first. An existing ucollator is closed only
// when dataIsOwned is set; the new one comes from ucol_open_internal(locale,
// &status). isWriteThroughAlias is then set to FALSE.
// NOTE(review): the return type, the status parameter declaration, the early
// exit body and any update of dataIsOwned are not visible in this view.
641 RuleBasedCollator::setUCollator(const char *locale
,
644 if (U_FAILURE(status
)) {
647 if (ucollator
&& dataIsOwned
)
648 ucol_close(ucollator
);
649 ucollator
= ucol_open_internal(locale
, &status
);
651 isWriteThroughAlias
= FALSE
;
// Makes sure this object works on its own collator before it is modified.
// When neither dataIsOwned nor isWriteThroughAlias is set, ucollator is
// replaced by the result of ucol_safeClone, the rule string is refreshed and
// isWriteThroughAlias is set to FALSE.
// NOTE(review): the local status from ucol_safeClone is not inspected in the
// visible code; the return type and any update of dataIsOwned are not visible
// in this view.
656 RuleBasedCollator::checkOwned() {
657 if (!(dataIsOwned
|| isWriteThroughAlias
)) {
658 UErrorCode status
= U_ZERO_ERROR
;
659 ucollator
= ucol_safeClone(ucollator
, NULL
, NULL
, &status
);
660 setRuleStringFromCollator();
662 isWriteThroughAlias
= FALSE
;
// Writes the short definition string of the wrapped collator into buffer by
// delegating to ucol_getShortDefinitionString and returns its int32_t result.
// NOTE(review): the buffer and capacity parameter declarations are not visible
// in this view.
667 int32_t RuleBasedCollator::internalGetShortDefinitionString(const char *locale
,
670 UErrorCode
&status
) const {
671 /* simply delegate */
672 return ucol_getShortDefinitionString(ucollator
, locale
, buffer
, capacity
, &status
);
676 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedCollator
)
680 #endif /* #if !UCONFIG_NO_COLLATION */