2 ******************************************************************************
3 * Copyright (C) {1996-2003}, International Business Machines Corporation and *
4 * others. All Rights Reserved. *
5 ******************************************************************************
11 * Created by: Helena Shih
13 * Modification History:
15 * Date Name Description
16 * 2/5/97 aliu Added streamIn and streamOut methods. Added
17 * constructor which reads RuleBasedCollator object from
18 * a binary file. Added writeToFile method which streams
19 * RuleBasedCollator out to a binary file. The streamIn
20 * and streamOut methods use istream and ostream objects
22 * 2/11/97 aliu Moved declarations out of for loop initializer.
23 * Added Mac compatibility #ifdef for ios::nocreate.
24 * 2/12/97 aliu Modified to use TableCollationData sub-object to
25 * hold invariant data.
26 * 2/13/97 aliu Moved several methods into this class from Collation.
27 * Added a private RuleBasedCollator(Locale&) constructor,
28 * to be used by Collator::getInstance(). General
29 * clean up. Made use of UErrorCode variables consistent.
30 * 2/20/97 helena Added clone, operator==, operator!=, operator=, and copy
31 * constructor and getDynamicClassID.
32 * 3/5/97 aliu Changed compaction cycle to improve performance. We
33 * use the maximum allowable value which is kBlockCount.
34 * Modified getRules() to load rules dynamically. Changed
35 * constructFromFile() call to accommodate this (added
36 * parameter to specify whether binary loading is to
38 * 05/06/97 helena Added memory allocation error check.
39 * 6/20/97 helena Java class name change.
40 * 6/23/97 helena Adding comments to make code more readable.
41 * 09/03/97 helena Added createCollationKeyValues().
42 * 06/26/98 erm Changes for CollationKeys using byte arrays.
43 * 08/10/98 erm Synched with 1.2 version of RuleBasedCollator.java
44 * 04/23/99 stephen Removed EDecompositionMode, merged with
46 * 06/14/99 stephen Removed kResourceBundleSuffix
47 * 06/22/99 stephen Fixed logic in constructFromFile() since .ctx
48 * files are no longer used.
49 * 11/02/99 helena Collator performance enhancements. Special case
50 * for NO_OP situations.
51 * 11/17/99 srl More performance enhancements. Inlined some internal functions.
52 * 12/15/99 aliu Update to support Thai collation. Move NormalizerIterator
53 * to implementation file.
54 * 01/29/01 synwee Modified into a C++ wrapper calling C APIs (ucol.h)
57 #include "unicode/utypes.h"
59 #if !UCONFIG_NO_COLLATION
61 #include "unicode/tblcoll.h"
62 #include "unicode/coleitr.h"
63 #include "unicode/resbund.h"
64 #include "unicode/uset.h"
71 /* public RuleBasedCollator constructor ---------------------------------- */
76 * Copy constructor, aliasing, not write-through
78 RuleBasedCollator::RuleBasedCollator(const RuleBasedCollator
& that
)
81 , isWriteThroughAlias(FALSE
)
82 , ucollator(that
.ucollator
)
83 , urulestring(that
.urulestring
)
87 RuleBasedCollator::RuleBasedCollator(const UnicodeString
& rules
,
92 UCOL_DEFAULT_STRENGTH
,
97 RuleBasedCollator::RuleBasedCollator(const UnicodeString
& rules
,
98 ECollationStrength collationStrength
,
99 UErrorCode
& status
) : dataIsOwned(FALSE
)
102 getUCollationStrength(collationStrength
),
107 RuleBasedCollator::RuleBasedCollator(const UnicodeString
& rules
,
108 UColAttributeValue decompositionMode
,
109 UErrorCode
& status
) :
113 UCOL_DEFAULT_STRENGTH
,
118 RuleBasedCollator::RuleBasedCollator(const UnicodeString
& rules
,
119 ECollationStrength collationStrength
,
120 UColAttributeValue decompositionMode
,
121 UErrorCode
& status
) : dataIsOwned(FALSE
)
124 getUCollationStrength(collationStrength
),
130 RuleBasedCollator::setRuleStringFromCollator(UErrorCode
& status
)
133 if (U_SUCCESS(status
))
136 const UChar
*r
= ucol_getRules(ucollator
, &length
);
139 // alias the rules string
140 urulestring
= new UnicodeString(TRUE
, r
, length
);
143 urulestring
= new UnicodeString();
146 if (urulestring
== 0) {
147 status
= U_MEMORY_ALLOCATION_ERROR
;
153 // not aliasing, not write-through
155 RuleBasedCollator::construct(const UnicodeString
& rules
,
156 UColAttributeValue collationStrength
,
157 UColAttributeValue decompositionMode
,
161 ucollator
= ucol_openRules(rules
.getBuffer(), rules
.length(),
162 decompositionMode
, collationStrength
,
165 dataIsOwned
= TRUE
; // since we own a collator now, we need to get rid of it
166 isWriteThroughAlias
= FALSE
;
168 setRuleStringFromCollator(status
);
171 /* RuleBasedCollator public destructor ----------------------------------- */
173 RuleBasedCollator::~RuleBasedCollator()
177 ucol_close(ucollator
);
184 /* RuleBasedCollator public methods -------------------------------------- */
186 UBool
RuleBasedCollator::operator==(const Collator
& that
) const
188 /* only checks for address equals here */
189 if (Collator::operator==(that
))
192 if (getDynamicClassID() != that
.getDynamicClassID())
193 return FALSE
; /* not the same class */
195 RuleBasedCollator
& thatAlias
= (RuleBasedCollator
&)that
;
197 // weiv: use C function, commented code below is wrong
198 return ucol_equals(this->ucollator
, thatAlias
.ucollator
);
200 synwee : original code does not check for data compatibility
203 if (ucollator != thatAlias.ucollator)
210 // aliasing, not write-through
211 RuleBasedCollator
& RuleBasedCollator::operator=(const RuleBasedCollator
& that
)
217 ucol_close(ucollator
);
223 isWriteThroughAlias
= FALSE
;
224 ucollator
= that
.ucollator
;
225 urulestring
= that
.urulestring
;
230 // aliasing, not write-through
231 Collator
* RuleBasedCollator::clone() const
233 return new RuleBasedCollator(*this);
236 CollationElementIterator
* RuleBasedCollator::createCollationElementIterator
237 (const UnicodeString
& source
) const
239 UErrorCode status
= U_ZERO_ERROR
;
240 CollationElementIterator
*result
= new CollationElementIterator(source
, this,
242 if (U_FAILURE(status
)) {
251 * Create a CollationElementIterator object that will iterate over the
252 * elements in a string, using the collation rules defined in this
255 CollationElementIterator
* RuleBasedCollator::createCollationElementIterator
256 (const CharacterIterator
& source
) const
258 UErrorCode status
= U_ZERO_ERROR
;
259 CollationElementIterator
*result
= new CollationElementIterator(source
, this,
262 if (U_FAILURE(status
)) {
271 * Return a string representation of this collator's rules. The string can
272 * later be passed to the constructor that takes a UnicodeString argument,
273 * which will construct a collator that's functionally identical to this one.
274 * You can also allow users to edit the string in order to change the collation
275 * data, or you can print it out for inspection, or whatever.
277 const UnicodeString
& RuleBasedCollator::getRules() const
279 return (*urulestring
);
282 void RuleBasedCollator::getRules(UColRuleOption delta
, UnicodeString
&buffer
)
284 int32_t rulesize
= ucol_getRulesEx(ucollator
, delta
, NULL
, -1);
287 UChar
*rules
= (UChar
*) uprv_malloc( sizeof(UChar
) * (rulesize
) );
289 ucol_getRulesEx(ucollator
, delta
, rules
, rulesize
);
290 buffer
.setTo(rules
, rulesize
);
292 } else { // couldn't allocate
302 RuleBasedCollator::getTailoredSet(UErrorCode
&status
) const
304 if(U_FAILURE(status
)) {
307 return (UnicodeSet
*)ucol_getTailoredSet(this->ucollator
, &status
);
311 void RuleBasedCollator::getVersion(UVersionInfo versionInfo
) const
313 if (versionInfo
!=NULL
){
314 ucol_getVersion(ucollator
, versionInfo
);
318 Collator::EComparisonResult
RuleBasedCollator::compare(
319 const UnicodeString
& source
,
320 const UnicodeString
& target
,
321 int32_t length
) const
323 UErrorCode status
= U_ZERO_ERROR
;
324 return getEComparisonResult(compare(source
.getBuffer(), uprv_min(length
,source
.length()), target
.getBuffer(), uprv_min(length
,target
.length()), status
));
327 UCollationResult
RuleBasedCollator::compare(
328 const UnicodeString
& source
,
329 const UnicodeString
& target
,
331 UErrorCode
&status
) const
333 return compare(source
.getBuffer(), uprv_min(length
,source
.length()), target
.getBuffer(), uprv_min(length
,target
.length()), status
);
336 Collator::EComparisonResult
RuleBasedCollator::compare(const UChar
* source
,
337 int32_t sourceLength
,
339 int32_t targetLength
)
342 return getEComparisonResult(ucol_strcoll(ucollator
, source
, sourceLength
,
343 target
, targetLength
));
346 UCollationResult
RuleBasedCollator::compare(const UChar
* source
,
347 int32_t sourceLength
,
349 int32_t targetLength
,
350 UErrorCode
&status
) const
352 if(U_SUCCESS(status
)) {
353 return ucol_strcoll(ucollator
, source
, sourceLength
, target
, targetLength
);
360 * Compare two strings using this collator
362 Collator::EComparisonResult
RuleBasedCollator::compare(
363 const UnicodeString
& source
,
364 const UnicodeString
& target
) const
366 return getEComparisonResult(ucol_strcoll(ucollator
, source
.getBuffer(), source
.length(),
367 target
.getBuffer(), target
.length()));
370 UCollationResult
RuleBasedCollator::compare(
371 const UnicodeString
& source
,
372 const UnicodeString
& target
,
373 UErrorCode
&status
) const
375 if(U_SUCCESS(status
)) {
376 return ucol_strcoll(ucollator
, source
.getBuffer(), source
.length(),
377 target
.getBuffer(), target
.length());
384 * Retrieve a collation key for the specified string. The key can be compared
385 * with other collation keys using a bitwise comparison (e.g. memcmp) to find
386 * the ordering of their respective source strings. This is handy when doing a
387 * sort, where each sort key must be compared many times.
389 * The basic algorithm here is to find all of the collation elements for each
390 * character in the source string, convert them to an ASCII representation, and
391 * put them into the collation key. But it's trickier than that. Each
392 * collation element in a string has three components: primary ('A' vs 'B'),
393 * secondary ('u' vs 'ü'), and tertiary ('A' vs 'a'), and a primary difference
394 * at the end of a string takes precedence over a secondary or tertiary
395 * difference earlier in the string.
397 * To account for this, we put all of the primary orders at the beginning of
398 * the string, followed by the secondary and tertiary orders. Each set of
399 * orders is terminated by nulls so that a key for a string which is an initial
400 * substring of another key will compare less without any special case.
402 * Here's a hypothetical example, with the collation element represented as a
403 * three-digit number, one digit for primary, one for secondary, etc.
406 * Collation Elements: 101 100 201 511
407 * Collation Key: 1125<null>0001<null>1011<null>
409 * To make things even trickier, secondary differences (accent marks) are
410 * compared starting at the *end* of the string in languages with French
411 * secondary ordering. But when comparing the accent marks on a single base
412 * character, they are compared from the beginning. To handle this, we reverse
413 * all of the accents that belong to each base character, then we reverse the
414 * entire string of secondary orderings at the end.
416 CollationKey
& RuleBasedCollator::getCollationKey(
417 const UnicodeString
& source
,
418 CollationKey
& sortkey
,
419 UErrorCode
& status
) const
421 return getCollationKey(source
.getBuffer(), source
.length(), sortkey
, status
);
424 CollationKey
& RuleBasedCollator::getCollationKey(const UChar
* source
,
426 CollationKey
& sortkey
,
427 UErrorCode
& status
) const
429 if (U_FAILURE(status
))
431 return sortkey
.setToBogus();
434 if ((!source
) || (sourceLen
== 0)) {
435 return sortkey
.reset();
439 int32_t resultLen
= ucol_getSortKeyWithAllocation(ucollator
,
443 sortkey
.adopt(result
, resultLen
);
448 * Return the maximum length of any expansion sequences that end with the
449 * specified comparison order.
450 * @param order a collation order returned by previous or next.
451 * @return the maximum length of any expansion sequences ending with the
452 * specified order or 1 if collation order does not occur at the end of any
453 * expansion sequence.
454 * @see CollationElementIterator#getMaxExpansion
456 int32_t RuleBasedCollator::getMaxExpansion(int32_t order
) const
459 UCOL_GETMAXEXPANSION(ucollator
, (uint32_t)order
, result
);
463 uint8_t* RuleBasedCollator::cloneRuleData(int32_t &length
,
466 return ucol_cloneRuleData(ucollator
, &length
, &status
);
469 void RuleBasedCollator::setAttribute(UColAttribute attr
,
470 UColAttributeValue value
,
473 if (U_FAILURE(status
))
476 ucol_setAttribute(ucollator
, attr
, value
, &status
);
479 UColAttributeValue
RuleBasedCollator::getAttribute(UColAttribute attr
,
482 if (U_FAILURE(status
))
484 return ucol_getAttribute(ucollator
, attr
, &status
);
487 uint32_t RuleBasedCollator::setVariableTop(const UChar
*varTop
, int32_t len
, UErrorCode
&status
) {
489 return ucol_setVariableTop(ucollator
, varTop
, len
, &status
);
492 uint32_t RuleBasedCollator::setVariableTop(const UnicodeString varTop
, UErrorCode
&status
) {
494 return ucol_setVariableTop(ucollator
, varTop
.getBuffer(), varTop
.length(), &status
);
497 void RuleBasedCollator::setVariableTop(const uint32_t varTop
, UErrorCode
&status
) {
499 ucol_restoreVariableTop(ucollator
, varTop
, &status
);
502 uint32_t RuleBasedCollator::getVariableTop(UErrorCode
&status
) const {
503 return ucol_getVariableTop(ucollator
, &status
);
506 Collator
* RuleBasedCollator::safeClone(void)
508 UErrorCode intStatus
= U_ZERO_ERROR
;
509 int32_t buffersize
= U_COL_SAFECLONE_BUFFERSIZE
;
510 UCollator
*ucol
= ucol_safeClone(ucollator
, NULL
, &buffersize
,
512 if (U_FAILURE(intStatus
)) {
516 UnicodeString
*r
= new UnicodeString(*urulestring
);
517 RuleBasedCollator
*result
= new RuleBasedCollator(ucol
, r
);
518 result
->dataIsOwned
= TRUE
;
519 result
->isWriteThroughAlias
= FALSE
;
525 int32_t RuleBasedCollator::getSortKey(const UnicodeString
& source
,
526 uint8_t *result
, int32_t resultLength
)
529 return ucol_getSortKey(ucollator
, source
.getBuffer(), source
.length(), result
, resultLength
);
532 int32_t RuleBasedCollator::getSortKey(const UChar
*source
,
533 int32_t sourceLength
, uint8_t *result
,
534 int32_t resultLength
) const
536 return ucol_getSortKey(ucollator
, source
, sourceLength
, result
, resultLength
);
539 Collator::ECollationStrength
RuleBasedCollator::getStrength(void) const
541 UErrorCode intStatus
= U_ZERO_ERROR
;
542 return getECollationStrength(ucol_getAttribute(ucollator
, UCOL_STRENGTH
,
546 void RuleBasedCollator::setStrength(ECollationStrength newStrength
)
549 UErrorCode intStatus
= U_ZERO_ERROR
;
550 UCollationStrength strength
= getUCollationStrength(newStrength
);
551 ucol_setAttribute(ucollator
, UCOL_STRENGTH
, strength
, &intStatus
);
555 * Create a hash code for this collation. Just hash the main rule table -- that
556 * should be good enough for almost any use.
558 int32_t RuleBasedCollator::hashCode() const
561 const UChar
*rules
= ucol_getRules(ucollator
, &length
);
562 return uhash_hashUCharsN(rules
, length
);
566 * return the locale of this collator
568 const Locale
RuleBasedCollator::getLocale(ULocDataLocaleType type
, UErrorCode
&status
) const {
569 const char *result
= ucol_getLocale(ucollator
, type
, &status
);
575 return Locale(result
);
580 RuleBasedCollator::setLocales(const Locale
& requestedLocale
, const Locale
& validLocale
) {
582 size_t rlen
= uprv_strlen(requestedLocale
.getName());
583 char* rloc
= (char *)uprv_malloc((rlen
+1)*sizeof(char));
585 uprv_strcpy(rloc
, requestedLocale
.getName());
586 size_t vlen
= uprv_strlen(validLocale
.getName());
587 char* vloc
= (char*)uprv_malloc((vlen
+1)*sizeof(char));
589 uprv_strcpy(vloc
, validLocale
.getName());
590 ucol_setReqValidLocales(ucollator
, rloc
, vloc
);
597 // RuleBasedCollator private constructors ---------------------------------
599 RuleBasedCollator::RuleBasedCollator()
600 : dataIsOwned(FALSE
), isWriteThroughAlias(FALSE
), ucollator(0), urulestring(0)
604 RuleBasedCollator::RuleBasedCollator(UCollator
*collator
,
606 : dataIsOwned(FALSE
), isWriteThroughAlias(FALSE
), urulestring(0)
608 ucollator
= collator
;
612 RuleBasedCollator::RuleBasedCollator(const Locale
& desiredLocale
,
613 UErrorCode
& status
) :
614 dataIsOwned(FALSE
), ucollator(0), urulestring(0)
616 if (U_FAILURE(status
))
620 Try to load, in order:
621 1. The desired locale's collation.
622 2. A fallback of the desired locale.
623 3. The default locale's collation.
624 4. A fallback of the default locale.
625 5. The default collation rules, which contains en_US collation rules.
627 To reiterate, we try:
629 language+country+variant
633 language+country+variant
636 Root: (aka DEFAULTRULES)
637 steps 1-5 are handled by resource bundle fallback mechanism.
638 however, in the very improbable situation that no resource bundle
639 data exists, step 5 is repeated with hardcoded default rules.
642 setUCollator(desiredLocale
, status
);
644 if (U_FAILURE(status
))
646 status
= U_ZERO_ERROR
;
648 setUCollator(kRootLocaleName
, status
);
649 if (status
== U_ZERO_ERROR
) {
650 status
= U_USING_DEFAULT_WARNING
;
654 if (U_SUCCESS(status
))
657 const UChar
*r
= ucol_getRules(ucollator
, &length
);
659 // alias the rules string
660 urulestring
= new UnicodeString(TRUE
, r
, length
);
663 urulestring
= new UnicodeString();
666 if (urulestring
== 0) {
667 status
= U_MEMORY_ALLOCATION_ERROR
;
671 isWriteThroughAlias
= FALSE
;
678 RuleBasedCollator::setUCollator(const char *locale
,
681 if (U_FAILURE(status
))
683 if (ucollator
&& dataIsOwned
)
684 ucol_close(ucollator
);
685 ucollator
= ucol_open_internal(locale
, &status
);
687 isWriteThroughAlias
= FALSE
;
692 RuleBasedCollator::checkOwned() {
693 if (!(dataIsOwned
|| isWriteThroughAlias
)) {
694 UErrorCode status
= U_ZERO_ERROR
;
695 ucollator
= ucol_safeClone(ucollator
, NULL
, NULL
, &status
);
696 setRuleStringFromCollator(status
);
698 isWriteThroughAlias
= FALSE
;
702 /* RuleBasedCollator private data members -------------------------------- */
706 * These should probably be enums (<=0xffff) or #defines (>0xffff)
707 * for better performance.
708 * Include ucol_imp.h and use its constants if possible.
709 * Only used in coleitr.h?!
713 /* need look up in .commit() */
714 const int32_t RuleBasedCollator::CHARINDEX
= 0x70000000;
715 /* Expand index follows */
716 const int32_t RuleBasedCollator::EXPANDCHARINDEX
= 0x7E000000;
717 /* contract indexes follows */
718 const int32_t RuleBasedCollator::CONTRACTCHARINDEX
= 0x7F000000;
719 /* unmapped character values */
720 const int32_t RuleBasedCollator::UNMAPPED
= 0xFFFFFFFF;
721 /* primary strength increment */
722 const int32_t RuleBasedCollator::PRIMARYORDERINCREMENT
= 0x00010000;
723 /* secondary strength increment */
724 const int32_t RuleBasedCollator::SECONDARYORDERINCREMENT
= 0x00000100;
725 /* tertiary strength increment */
726 const int32_t RuleBasedCollator::TERTIARYORDERINCREMENT
= 0x00000001;
727 /* mask off anything but primary order */
728 const int32_t RuleBasedCollator::PRIMARYORDERMASK
= 0xffff0000;
729 /* mask off anything but secondary order */
730 const int32_t RuleBasedCollator::SECONDARYORDERMASK
= 0x0000ff00;
731 /* mask off anything but tertiary order */
732 const int32_t RuleBasedCollator::TERTIARYORDERMASK
= 0x000000ff;
733 /* mask off ignorable char order */
734 const int32_t RuleBasedCollator::IGNORABLEMASK
= 0x0000ffff;
735 /* use only the primary difference */
736 const int32_t RuleBasedCollator::PRIMARYDIFFERENCEONLY
= 0xffff0000;
737 /* use only the primary and secondary difference */
738 const int32_t RuleBasedCollator::SECONDARYDIFFERENCEONLY
= 0xffffff00;
739 /* primary order shift */
740 const int32_t RuleBasedCollator::PRIMARYORDERSHIFT
= 16;
741 /* secondary order shift */
742 const int32_t RuleBasedCollator::SECONDARYORDERSHIFT
= 8;
743 /* starting value for collation elements */
744 const int32_t RuleBasedCollator::COLELEMENTSTART
= 0x02020202;
745 /* testing mask for primary low element */
746 const int32_t RuleBasedCollator::PRIMARYLOWZEROMASK
= 0x00FF0000;
747 /* resetting value for secondaries and tertiaries */
748 const int32_t RuleBasedCollator::RESETSECONDARYTERTIARY
= 0x00000202;
749 /* resetting value for tertiaries */
750 const int32_t RuleBasedCollator::RESETTERTIARY
= 0x00000002;
752 const int32_t RuleBasedCollator::PRIMIGNORABLE
= 0x0202;
754 /* unique file id for parity check */
755 const int16_t RuleBasedCollator::FILEID
= 0x5443;
756 /* binary collation file extension */
757 const char RuleBasedCollator::kFilenameSuffix
[] = ".col";
758 /* class id ? Value is irrelevant */
759 const char RuleBasedCollator::fgClassID
= 0;
763 #endif /* #if !UCONFIG_NO_COLLATION */