]> git.saurik.com Git - apple/icu.git/blame_incremental - icuSources/i18n/tblcoll.cpp
ICU-461.18.tar.gz
[apple/icu.git] / icuSources / i18n / tblcoll.cpp
... / ...
CommitLineData
1/*
2 ******************************************************************************
3 * Copyright (C) 1996-2010, International Business Machines Corporation and
4 * others. All Rights Reserved.
5 ******************************************************************************
6 */
7
8/**
9 * File tblcoll.cpp
10 *
11 * Created by: Helena Shih
12 *
13 * Modification History:
14 *
15 * Date Name Description
16 * 2/5/97 aliu Added streamIn and streamOut methods. Added
17 * constructor which reads RuleBasedCollator object from
18 * a binary file. Added writeToFile method which streams
19 * RuleBasedCollator out to a binary file. The streamIn
20 * and streamOut methods use istream and ostream objects
21 * in binary mode.
22 * 2/11/97 aliu Moved declarations out of for loop initializer.
23 * Added Mac compatibility #ifdef for ios::nocreate.
24 * 2/12/97 aliu Modified to use TableCollationData sub-object to
25 * hold invariant data.
26 * 2/13/97 aliu Moved several methods into this class from Collation.
27 * Added a private RuleBasedCollator(Locale&) constructor,
28 * to be used by Collator::getInstance(). General
29 * clean up. Made use of UErrorCode variables consistent.
30 * 2/20/97 helena Added clone, operator==, operator!=, operator=, and copy
31 * constructor and getDynamicClassID.
32 * 3/5/97 aliu Changed compaction cycle to improve performance. We
33 * use the maximum allowable value which is kBlockCount.
34 * Modified getRules() to load rules dynamically. Changed
35 * constructFromFile() call to accommodate this (added
36 * parameter to specify whether binary loading is to
37 * take place).
38 * 05/06/97 helena Added memory allocation error check.
39 * 6/20/97 helena Java class name change.
40 * 6/23/97 helena Adding comments to make code more readable.
41 * 09/03/97 helena Added createCollationKeyValues().
42 * 06/26/98 erm Changes for CollationKeys using byte arrays.
43 * 08/10/98 erm Synched with 1.2 version of RuleBasedCollator.java
44 * 04/23/99 stephen Removed EDecompositionMode, merged with
45 * Normalizer::EMode
46 * 06/14/99 stephen Removed kResourceBundleSuffix
47 * 06/22/99 stephen Fixed logic in constructFromFile() since .ctx
48 * files are no longer used.
49 * 11/02/99 helena Collator performance enhancements. Special case
50 * for NO_OP situations.
51 * 11/17/99 srl More performance enhancements. Inlined some internal functions.
52 * 12/15/99 aliu Update to support Thai collation. Move NormalizerIterator
53 * to implementation file.
54 * 01/29/01 synwee Modified into a C++ wrapper calling C APIs (ucol.h)
55 */
56
57#include <typeinfo> // for 'typeid' to work
58
59#include "unicode/utypes.h"
60
61#if !UCONFIG_NO_COLLATION
62
63#include "unicode/tblcoll.h"
64#include "unicode/coleitr.h"
65#include "unicode/ures.h"
66#include "unicode/uset.h"
67#include "ucol_imp.h"
68#include "uresimp.h"
69#include "uhash.h"
70#include "cmemory.h"
71#include "cstring.h"
72#include "putilimp.h"
73
74/* public RuleBasedCollator constructor ---------------------------------- */
75
76U_NAMESPACE_BEGIN
77
78/**
79* Copy constructor, aliasing, not write-through
80*/
81RuleBasedCollator::RuleBasedCollator(const RuleBasedCollator& that)
82: Collator(that)
83, dataIsOwned(FALSE)
84, isWriteThroughAlias(FALSE)
85, ucollator(NULL)
86{
87 RuleBasedCollator::operator=(that);
88}
89
90RuleBasedCollator::RuleBasedCollator(const UnicodeString& rules,
91 UErrorCode& status) :
92dataIsOwned(FALSE)
93{
94 construct(rules,
95 UCOL_DEFAULT_STRENGTH,
96 UCOL_DEFAULT,
97 status);
98}
99
100RuleBasedCollator::RuleBasedCollator(const UnicodeString& rules,
101 ECollationStrength collationStrength,
102 UErrorCode& status) : dataIsOwned(FALSE)
103{
104 construct(rules,
105 getUCollationStrength(collationStrength),
106 UCOL_DEFAULT,
107 status);
108}
109
110RuleBasedCollator::RuleBasedCollator(const UnicodeString& rules,
111 UColAttributeValue decompositionMode,
112 UErrorCode& status) :
113dataIsOwned(FALSE)
114{
115 construct(rules,
116 UCOL_DEFAULT_STRENGTH,
117 decompositionMode,
118 status);
119}
120
121RuleBasedCollator::RuleBasedCollator(const UnicodeString& rules,
122 ECollationStrength collationStrength,
123 UColAttributeValue decompositionMode,
124 UErrorCode& status) : dataIsOwned(FALSE)
125{
126 construct(rules,
127 getUCollationStrength(collationStrength),
128 decompositionMode,
129 status);
130}
131RuleBasedCollator::RuleBasedCollator(const uint8_t *bin, int32_t length,
132 const RuleBasedCollator *base,
133 UErrorCode &status) :
134dataIsOwned(TRUE),
135isWriteThroughAlias(FALSE)
136{
137 ucollator = ucol_openBinary(bin, length, base->ucollator, &status);
138}
139
140void
141RuleBasedCollator::setRuleStringFromCollator()
142{
143 int32_t length;
144 const UChar *r = ucol_getRules(ucollator, &length);
145
146 if (r && length > 0) {
147 // alias the rules string
148 urulestring.setTo(TRUE, r, length);
149 }
150 else {
151 urulestring.truncate(0); // Clear string.
152 }
153}
154
155// not aliasing, not write-through
156void
157RuleBasedCollator::construct(const UnicodeString& rules,
158 UColAttributeValue collationStrength,
159 UColAttributeValue decompositionMode,
160 UErrorCode& status)
161{
162 ucollator = ucol_openRules(rules.getBuffer(), rules.length(),
163 decompositionMode, collationStrength,
164 NULL, &status);
165
166 dataIsOwned = TRUE; // since we own a collator now, we need to get rid of it
167 isWriteThroughAlias = FALSE;
168
169 if(ucollator == NULL) {
170 if(U_SUCCESS(status)) {
171 status = U_MEMORY_ALLOCATION_ERROR;
172 }
173 return; // Failure
174 }
175
176 setRuleStringFromCollator();
177}
178
179/* RuleBasedCollator public destructor ----------------------------------- */
180
181RuleBasedCollator::~RuleBasedCollator()
182{
183 if (dataIsOwned)
184 {
185 ucol_close(ucollator);
186 }
187 ucollator = 0;
188}
189
190/* RuleBaseCollator public methods --------------------------------------- */
191
192UBool RuleBasedCollator::operator==(const Collator& that) const
193{
194 /* only checks for address equals here */
195 if (Collator::operator==(that))
196 return TRUE;
197
198 if (typeid(*this) != typeid(that))
199 return FALSE; /* not the same class */
200
201 RuleBasedCollator& thatAlias = (RuleBasedCollator&)that;
202
203 // weiv: use C function, commented code below is wrong
204 return ucol_equals(this->ucollator, thatAlias.ucollator);
205 /*
206 synwee : orginal code does not check for data compatibility
207 */
208 /*
209 if (ucollator != thatAlias.ucollator)
210 return FALSE;
211
212 return TRUE;
213 */
214}
215
216UBool RuleBasedCollator::operator!=(const Collator& other) const
217{
218 return !(*this == other);
219}
220
221// aliasing, not write-through
222RuleBasedCollator& RuleBasedCollator::operator=(const RuleBasedCollator& that)
223{
224 if (this != &that)
225 {
226 if (dataIsOwned)
227 {
228 ucol_close(ucollator);
229 }
230
231 urulestring.truncate(0); // empty the rule string
232 dataIsOwned = TRUE;
233 isWriteThroughAlias = FALSE;
234
235 UErrorCode intStatus = U_ZERO_ERROR;
236 int32_t buffersize = U_COL_SAFECLONE_BUFFERSIZE;
237 ucollator = ucol_safeClone(that.ucollator, NULL, &buffersize,
238 &intStatus);
239 if (U_SUCCESS(intStatus)) {
240 setRuleStringFromCollator();
241 }
242 }
243 return *this;
244}
245
246// aliasing, not write-through
247Collator* RuleBasedCollator::clone() const
248{
249 return new RuleBasedCollator(*this);
250}
251
252CollationElementIterator* RuleBasedCollator::createCollationElementIterator
253 (const UnicodeString& source) const
254{
255 UErrorCode status = U_ZERO_ERROR;
256 CollationElementIterator *result = new CollationElementIterator(source, this,
257 status);
258 if (U_FAILURE(status)) {
259 delete result;
260 return NULL;
261 }
262
263 return result;
264}
265
266/**
267* Create a CollationElementIterator object that will iterate over the
268* elements in a string, using the collation rules defined in this
269* RuleBasedCollator
270*/
271CollationElementIterator* RuleBasedCollator::createCollationElementIterator
272 (const CharacterIterator& source) const
273{
274 UErrorCode status = U_ZERO_ERROR;
275 CollationElementIterator *result = new CollationElementIterator(source, this,
276 status);
277
278 if (U_FAILURE(status)) {
279 delete result;
280 return NULL;
281 }
282
283 return result;
284}
285
286/**
287* Return a string representation of this collator's rules. The string can
288* later be passed to the constructor that takes a UnicodeString argument,
289* which will construct a collator that's functionally identical to this one.
290* You can also allow users to edit the string in order to change the collation
291* data, or you can print it out for inspection, or whatever.
292*/
293const UnicodeString& RuleBasedCollator::getRules() const
294{
295 return urulestring;
296}
297
298void RuleBasedCollator::getRules(UColRuleOption delta, UnicodeString &buffer)
299{
300 int32_t rulesize = ucol_getRulesEx(ucollator, delta, NULL, -1);
301
302 if (rulesize > 0) {
303 UChar *rules = (UChar*) uprv_malloc( sizeof(UChar) * (rulesize) );
304 if(rules != NULL) {
305 ucol_getRulesEx(ucollator, delta, rules, rulesize);
306 buffer.setTo(rules, rulesize);
307 uprv_free(rules);
308 } else { // couldn't allocate
309 buffer.remove();
310 }
311 }
312 else {
313 buffer.remove();
314 }
315}
316
317UnicodeSet *
318RuleBasedCollator::getTailoredSet(UErrorCode &status) const
319{
320 if(U_FAILURE(status)) {
321 return NULL;
322 }
323 return (UnicodeSet *)ucol_getTailoredSet(this->ucollator, &status);
324}
325
326
327void RuleBasedCollator::getVersion(UVersionInfo versionInfo) const
328{
329 if (versionInfo!=NULL){
330 ucol_getVersion(ucollator, versionInfo);
331 }
332}
333
334Collator::EComparisonResult RuleBasedCollator::compare(
335 const UnicodeString& source,
336 const UnicodeString& target,
337 int32_t length) const
338{
339 UErrorCode status = U_ZERO_ERROR;
340 return getEComparisonResult(compare(source.getBuffer(), uprv_min(length,source.length()), target.getBuffer(), uprv_min(length,target.length()), status));
341}
342
343UCollationResult RuleBasedCollator::compare(
344 const UnicodeString& source,
345 const UnicodeString& target,
346 int32_t length,
347 UErrorCode &status) const
348{
349 return compare(source.getBuffer(), uprv_min(length,source.length()), target.getBuffer(), uprv_min(length,target.length()), status);
350}
351
352Collator::EComparisonResult RuleBasedCollator::compare(const UChar* source,
353 int32_t sourceLength,
354 const UChar* target,
355 int32_t targetLength)
356 const
357{
358 return getEComparisonResult(ucol_strcoll(ucollator, source, sourceLength,
359 target, targetLength));
360}
361
362UCollationResult RuleBasedCollator::compare(const UChar* source,
363 int32_t sourceLength,
364 const UChar* target,
365 int32_t targetLength,
366 UErrorCode &status) const
367{
368 if(U_SUCCESS(status)) {
369 return ucol_strcoll(ucollator, source, sourceLength, target, targetLength);
370 } else {
371 return UCOL_EQUAL;
372 }
373}
374
375/**
376* Compare two strings using this collator
377*/
378Collator::EComparisonResult RuleBasedCollator::compare(
379 const UnicodeString& source,
380 const UnicodeString& target) const
381{
382 return getEComparisonResult(ucol_strcoll(ucollator, source.getBuffer(), source.length(),
383 target.getBuffer(), target.length()));
384}
385
386UCollationResult RuleBasedCollator::compare(
387 const UnicodeString& source,
388 const UnicodeString& target,
389 UErrorCode &status) const
390{
391 if(U_SUCCESS(status)) {
392 return ucol_strcoll(ucollator, source.getBuffer(), source.length(),
393 target.getBuffer(), target.length());
394 } else {
395 return UCOL_EQUAL;
396 }
397}
398
399UCollationResult RuleBasedCollator::compare(UCharIterator &sIter,
400 UCharIterator &tIter,
401 UErrorCode &status) const {
402 if(U_SUCCESS(status)) {
403 return ucol_strcollIter(ucollator, &sIter, &tIter, &status);
404 } else {
405 return UCOL_EQUAL;
406 }
407}
408
409/**
410* Retrieve a collation key for the specified string. The key can be compared
411* with other collation keys using a bitwise comparison (e.g. memcmp) to find
412* the ordering of their respective source strings. This is handy when doing a
413* sort, where each sort key must be compared many times.
414*
415* The basic algorithm here is to find all of the collation elements for each
416* character in the source string, convert them to an ASCII representation, and
417* put them into the collation key. But it's trickier than that. Each
418* collation element in a string has three components: primary ('A' vs 'B'),
419* secondary ('u' vs '\u00FC'), and tertiary ('A' vs 'a'), and a primary difference
420* at the end of a string takes precedence over a secondary or tertiary
421* difference earlier in the string.
422*
423* To account for this, we put all of the primary orders at the beginning of
424* the string, followed by the secondary and tertiary orders. Each set of
425* orders is terminated by nulls so that a key for a string which is a initial
426* substring of another key will compare less without any special case.
427*
428* Here's a hypothetical example, with the collation element represented as a
429* three-digit number, one digit for primary, one for secondary, etc.
430*
431* String: A a B \u00C9
432* Collation Elements: 101 100 201 511
433* Collation Key: 1125<null>0001<null>1011<null>
434*
435* To make things even trickier, secondary differences (accent marks) are
436* compared starting at the *end* of the string in languages with French
437* secondary ordering. But when comparing the accent marks on a single base
438* character, they are compared from the beginning. To handle this, we reverse
439* all of the accents that belong to each base character, then we reverse the
440* entire string of secondary orderings at the end.
441*/
442CollationKey& RuleBasedCollator::getCollationKey(
443 const UnicodeString& source,
444 CollationKey& sortkey,
445 UErrorCode& status) const
446{
447 return getCollationKey(source.getBuffer(), source.length(), sortkey, status);
448}
449
450CollationKey& RuleBasedCollator::getCollationKey(const UChar* source,
451 int32_t sourceLen,
452 CollationKey& sortkey,
453 UErrorCode& status) const
454{
455 if (U_FAILURE(status))
456 {
457 return sortkey.setToBogus();
458 }
459
460 if ((!source) || (sourceLen == 0)) {
461 return sortkey.reset();
462 }
463
464 uint8_t *result;
465 int32_t resultLen = ucol_getSortKeyWithAllocation(ucollator,
466 source, sourceLen,
467 &result,
468 &status);
469 sortkey.adopt(result, resultLen);
470 return sortkey;
471}
472
473/**
474 * Return the maximum length of any expansion sequences that end with the
475 * specified comparison order.
476 * @param order a collation order returned by previous or next.
477 * @return the maximum length of any expansion seuences ending with the
478 * specified order or 1 if collation order does not occur at the end of any
479 * expansion sequence.
480 * @see CollationElementIterator#getMaxExpansion
481 */
482int32_t RuleBasedCollator::getMaxExpansion(int32_t order) const
483{
484 uint8_t result;
485 UCOL_GETMAXEXPANSION(ucollator, (uint32_t)order, result);
486 return result;
487}
488
489uint8_t* RuleBasedCollator::cloneRuleData(int32_t &length,
490 UErrorCode &status)
491{
492 return ucol_cloneRuleData(ucollator, &length, &status);
493}
494
495
496int32_t RuleBasedCollator::cloneBinary(uint8_t *buffer, int32_t capacity, UErrorCode &status)
497{
498 return ucol_cloneBinary(ucollator, buffer, capacity, &status);
499}
500
501void RuleBasedCollator::setAttribute(UColAttribute attr,
502 UColAttributeValue value,
503 UErrorCode &status)
504{
505 if (U_FAILURE(status))
506 return;
507 checkOwned();
508 ucol_setAttribute(ucollator, attr, value, &status);
509}
510
511UColAttributeValue RuleBasedCollator::getAttribute(UColAttribute attr,
512 UErrorCode &status)
513{
514 if (U_FAILURE(status))
515 return UCOL_DEFAULT;
516 return ucol_getAttribute(ucollator, attr, &status);
517}
518
519uint32_t RuleBasedCollator::setVariableTop(const UChar *varTop, int32_t len, UErrorCode &status) {
520 checkOwned();
521 return ucol_setVariableTop(ucollator, varTop, len, &status);
522}
523
524uint32_t RuleBasedCollator::setVariableTop(const UnicodeString varTop, UErrorCode &status) {
525 checkOwned();
526 return ucol_setVariableTop(ucollator, varTop.getBuffer(), varTop.length(), &status);
527}
528
529void RuleBasedCollator::setVariableTop(const uint32_t varTop, UErrorCode &status) {
530 checkOwned();
531 ucol_restoreVariableTop(ucollator, varTop, &status);
532}
533
534uint32_t RuleBasedCollator::getVariableTop(UErrorCode &status) const {
535 return ucol_getVariableTop(ucollator, &status);
536}
537
538Collator* RuleBasedCollator::safeClone(void)
539{
540 UErrorCode intStatus = U_ZERO_ERROR;
541 int32_t buffersize = U_COL_SAFECLONE_BUFFERSIZE;
542 UCollator *ucol = ucol_safeClone(ucollator, NULL, &buffersize,
543 &intStatus);
544 if (U_FAILURE(intStatus)) {
545 return NULL;
546 }
547
548 RuleBasedCollator *result = new RuleBasedCollator();
549 // Null pointer check
550 if (result != NULL) {
551 result->ucollator = ucol;
552 result->dataIsOwned = TRUE;
553 result->isWriteThroughAlias = FALSE;
554 setRuleStringFromCollator();
555 }
556
557 return result;
558}
559
560
561int32_t RuleBasedCollator::getSortKey(const UnicodeString& source,
562 uint8_t *result, int32_t resultLength)
563 const
564{
565 return ucol_getSortKey(ucollator, source.getBuffer(), source.length(), result, resultLength);
566}
567
568int32_t RuleBasedCollator::getSortKey(const UChar *source,
569 int32_t sourceLength, uint8_t *result,
570 int32_t resultLength) const
571{
572 return ucol_getSortKey(ucollator, source, sourceLength, result, resultLength);
573}
574
575Collator::ECollationStrength RuleBasedCollator::getStrength(void) const
576{
577 UErrorCode intStatus = U_ZERO_ERROR;
578 return getECollationStrength(ucol_getAttribute(ucollator, UCOL_STRENGTH,
579 &intStatus));
580}
581
582void RuleBasedCollator::setStrength(ECollationStrength newStrength)
583{
584 checkOwned();
585 UErrorCode intStatus = U_ZERO_ERROR;
586 UCollationStrength strength = getUCollationStrength(newStrength);
587 ucol_setAttribute(ucollator, UCOL_STRENGTH, strength, &intStatus);
588}
589
590int32_t RuleBasedCollator::getReorderCodes(int32_t *dest,
591 int32_t destCapacity,
592 UErrorCode& status) const
593{
594 return ucol_getReorderCodes(ucollator, dest, destCapacity, &status);
595}
596
597void RuleBasedCollator::setReorderCodes(const int32_t *reorderCodes,
598 int32_t reorderCodesLength,
599 UErrorCode& status)
600{
601 ucol_setReorderCodes(ucollator, reorderCodes, reorderCodesLength, &status);
602}
603
604
605/**
606* Create a hash code for this collation. Just hash the main rule table -- that
607* should be good enough for almost any use.
608*/
609int32_t RuleBasedCollator::hashCode() const
610{
611 int32_t length;
612 const UChar *rules = ucol_getRules(ucollator, &length);
613 return uhash_hashUCharsN(rules, length);
614}
615
616/**
617* return the locale of this collator
618*/
619const Locale RuleBasedCollator::getLocale(ULocDataLocaleType type, UErrorCode &status) const {
620 const char *result = ucol_getLocaleByType(ucollator, type, &status);
621 if(result == NULL) {
622 Locale res("");
623 res.setToBogus();
624 return res;
625 } else {
626 return Locale(result);
627 }
628}
629
630void
631RuleBasedCollator::setLocales(const Locale& requestedLocale, const Locale& validLocale, const Locale& actualLocale) {
632 checkOwned();
633 char* rloc = uprv_strdup(requestedLocale.getName());
634 if (rloc) {
635 char* vloc = uprv_strdup(validLocale.getName());
636 if (vloc) {
637 char* aloc = uprv_strdup(actualLocale.getName());
638 if (aloc) {
639 ucol_setReqValidLocales(ucollator, rloc, vloc, aloc);
640 return;
641 }
642 uprv_free(vloc);
643 }
644 uprv_free(rloc);
645 }
646}
647
648// RuleBaseCollatorNew private constructor ----------------------------------
649
650RuleBasedCollator::RuleBasedCollator()
651 : dataIsOwned(FALSE), isWriteThroughAlias(FALSE), ucollator(NULL)
652{
653}
654
655RuleBasedCollator::RuleBasedCollator(const Locale& desiredLocale,
656 UErrorCode& status)
657 : dataIsOwned(FALSE), isWriteThroughAlias(FALSE), ucollator(NULL)
658{
659 if (U_FAILURE(status))
660 return;
661
662 /*
663 Try to load, in order:
664 1. The desired locale's collation.
665 2. A fallback of the desired locale.
666 3. The default locale's collation.
667 4. A fallback of the default locale.
668 5. The default collation rules, which contains en_US collation rules.
669
670 To reiterate, we try:
671 Specific:
672 language+country+variant
673 language+country
674 language
675 Default:
676 language+country+variant
677 language+country
678 language
679 Root: (aka DEFAULTRULES)
680 steps 1-5 are handled by resource bundle fallback mechanism.
681 however, in a very unprobable situation that no resource bundle
682 data exists, step 5 is repeated with hardcoded default rules.
683 */
684
685 setUCollator(desiredLocale, status);
686
687 if (U_FAILURE(status))
688 {
689 status = U_ZERO_ERROR;
690
691 setUCollator(kRootLocaleName, status);
692 if (status == U_ZERO_ERROR) {
693 status = U_USING_DEFAULT_WARNING;
694 }
695 }
696
697 if (U_SUCCESS(status))
698 {
699 setRuleStringFromCollator();
700 }
701}
702
703void
704RuleBasedCollator::setUCollator(const char *locale,
705 UErrorCode &status)
706{
707 if (U_FAILURE(status))
708 return;
709 if (ucollator && dataIsOwned)
710 ucol_close(ucollator);
711 ucollator = ucol_open_internal(locale, &status);
712 dataIsOwned = TRUE;
713 isWriteThroughAlias = FALSE;
714}
715
716
717void
718RuleBasedCollator::checkOwned() {
719 if (!(dataIsOwned || isWriteThroughAlias)) {
720 UErrorCode status = U_ZERO_ERROR;
721 ucollator = ucol_safeClone(ucollator, NULL, NULL, &status);
722 setRuleStringFromCollator();
723 dataIsOwned = TRUE;
724 isWriteThroughAlias = FALSE;
725 }
726}
727
// NOTE(review): presumably expands to the class-ID / RTTI support functions
// for RuleBasedCollator -- confirm against the macro's definition in uobject.h.
728UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedCollator)
729
730U_NAMESPACE_END
731
732#endif /* #if !UCONFIG_NO_COLLATION */