]> git.saurik.com Git - apple/icu.git/blob - icuSources/i18n/tblcoll.cpp
ICU-461.13.tar.gz
[apple/icu.git] / icuSources / i18n / tblcoll.cpp
1 /*
2 ******************************************************************************
3 * Copyright (C) 1996-2010, International Business Machines Corporation and
4 * others. All Rights Reserved.
5 ******************************************************************************
6 */
7
8 /**
9 * File tblcoll.cpp
10 *
11 * Created by: Helena Shih
12 *
13 * Modification History:
14 *
15 * Date Name Description
16 * 2/5/97 aliu Added streamIn and streamOut methods. Added
17 * constructor which reads RuleBasedCollator object from
18 * a binary file. Added writeToFile method which streams
19 * RuleBasedCollator out to a binary file. The streamIn
20 * and streamOut methods use istream and ostream objects
21 * in binary mode.
22 * 2/11/97 aliu Moved declarations out of for loop initializer.
23 * Added Mac compatibility #ifdef for ios::nocreate.
24 * 2/12/97 aliu Modified to use TableCollationData sub-object to
25 * hold invariant data.
26 * 2/13/97 aliu Moved several methods into this class from Collation.
27 * Added a private RuleBasedCollator(Locale&) constructor,
28 * to be used by Collator::getInstance(). General
29 * clean up. Made use of UErrorCode variables consistent.
30 * 2/20/97 helena Added clone, operator==, operator!=, operator=, and copy
31 * constructor and getDynamicClassID.
32 * 3/5/97 aliu Changed compaction cycle to improve performance. We
33 * use the maximum allowable value which is kBlockCount.
34 * Modified getRules() to load rules dynamically. Changed
35 * constructFromFile() call to accommodate this (added
36 * parameter to specify whether binary loading is to
37 * take place).
38 * 05/06/97 helena Added memory allocation error check.
39 * 6/20/97 helena Java class name change.
40 * 6/23/97 helena Adding comments to make code more readable.
41 * 09/03/97 helena Added createCollationKeyValues().
42 * 06/26/98 erm Changes for CollationKeys using byte arrays.
43 * 08/10/98 erm Synched with 1.2 version of RuleBasedCollator.java
44 * 04/23/99 stephen Removed EDecompositionMode, merged with
45 * Normalizer::EMode
46 * 06/14/99 stephen Removed kResourceBundleSuffix
47 * 06/22/99 stephen Fixed logic in constructFromFile() since .ctx
48 * files are no longer used.
49 * 11/02/99 helena Collator performance enhancements. Special case
50 * for NO_OP situations.
51 * 11/17/99 srl More performance enhancements. Inlined some internal functions.
52 * 12/15/99 aliu Update to support Thai collation. Move NormalizerIterator
53 * to implementation file.
54 * 01/29/01 synwee Modified into a C++ wrapper calling C APIs (ucol.h)
55 */
56
57 #include <typeinfo> // for 'typeid' to work
58
59 #include "unicode/utypes.h"
60
61 #if !UCONFIG_NO_COLLATION
62
63 #include "unicode/tblcoll.h"
64 #include "unicode/coleitr.h"
65 #include "unicode/ures.h"
66 #include "unicode/uset.h"
67 #include "ucol_imp.h"
68 #include "uresimp.h"
69 #include "uhash.h"
70 #include "cmemory.h"
71 #include "cstring.h"
72 #include "putilimp.h"
73
74 /* public RuleBasedCollator constructor ---------------------------------- */
75
76 U_NAMESPACE_BEGIN
77
78 /**
79 * Copy constructor, aliasing, not write-through
80 */
81 RuleBasedCollator::RuleBasedCollator(const RuleBasedCollator& that)
82 : Collator(that)
83 , dataIsOwned(FALSE)
84 , isWriteThroughAlias(FALSE)
85 , ucollator(NULL)
86 {
87 RuleBasedCollator::operator=(that);
88 }
89
90 RuleBasedCollator::RuleBasedCollator(const UnicodeString& rules,
91 UErrorCode& status) :
92 dataIsOwned(FALSE)
93 {
94 construct(rules,
95 UCOL_DEFAULT_STRENGTH,
96 UCOL_DEFAULT,
97 status);
98 }
99
100 RuleBasedCollator::RuleBasedCollator(const UnicodeString& rules,
101 ECollationStrength collationStrength,
102 UErrorCode& status) : dataIsOwned(FALSE)
103 {
104 construct(rules,
105 getUCollationStrength(collationStrength),
106 UCOL_DEFAULT,
107 status);
108 }
109
110 RuleBasedCollator::RuleBasedCollator(const UnicodeString& rules,
111 UColAttributeValue decompositionMode,
112 UErrorCode& status) :
113 dataIsOwned(FALSE)
114 {
115 construct(rules,
116 UCOL_DEFAULT_STRENGTH,
117 decompositionMode,
118 status);
119 }
120
121 RuleBasedCollator::RuleBasedCollator(const UnicodeString& rules,
122 ECollationStrength collationStrength,
123 UColAttributeValue decompositionMode,
124 UErrorCode& status) : dataIsOwned(FALSE)
125 {
126 construct(rules,
127 getUCollationStrength(collationStrength),
128 decompositionMode,
129 status);
130 }
131 RuleBasedCollator::RuleBasedCollator(const uint8_t *bin, int32_t length,
132 const RuleBasedCollator *base,
133 UErrorCode &status) :
134 dataIsOwned(TRUE),
135 isWriteThroughAlias(FALSE)
136 {
137 ucollator = ucol_openBinary(bin, length, base->ucollator, &status);
138 }
139
140 void
141 RuleBasedCollator::setRuleStringFromCollator()
142 {
143 int32_t length;
144 const UChar *r = ucol_getRules(ucollator, &length);
145
146 if (r && length > 0) {
147 // alias the rules string
148 urulestring.setTo(TRUE, r, length);
149 }
150 else {
151 urulestring.truncate(0); // Clear string.
152 }
153 }
154
155 // not aliasing, not write-through
156 void
157 RuleBasedCollator::construct(const UnicodeString& rules,
158 UColAttributeValue collationStrength,
159 UColAttributeValue decompositionMode,
160 UErrorCode& status)
161 {
162 ucollator = ucol_openRules(rules.getBuffer(), rules.length(),
163 decompositionMode, collationStrength,
164 NULL, &status);
165
166 dataIsOwned = TRUE; // since we own a collator now, we need to get rid of it
167 isWriteThroughAlias = FALSE;
168
169 if(ucollator == NULL) {
170 if(U_SUCCESS(status)) {
171 status = U_MEMORY_ALLOCATION_ERROR;
172 }
173 return; // Failure
174 }
175
176 setRuleStringFromCollator();
177 }
178
179 /* RuleBasedCollator public destructor ----------------------------------- */
180
181 RuleBasedCollator::~RuleBasedCollator()
182 {
183 if (dataIsOwned)
184 {
185 ucol_close(ucollator);
186 }
187 ucollator = 0;
188 }
189
190 /* RuleBaseCollator public methods --------------------------------------- */
191
192 UBool RuleBasedCollator::operator==(const Collator& that) const
193 {
194 /* only checks for address equals here */
195 if (Collator::operator==(that))
196 return TRUE;
197
198 if (typeid(*this) != typeid(that))
199 return FALSE; /* not the same class */
200
201 RuleBasedCollator& thatAlias = (RuleBasedCollator&)that;
202
203 // weiv: use C function, commented code below is wrong
204 return ucol_equals(this->ucollator, thatAlias.ucollator);
205 /*
206 synwee : orginal code does not check for data compatibility
207 */
208 /*
209 if (ucollator != thatAlias.ucollator)
210 return FALSE;
211
212 return TRUE;
213 */
214 }
215
216 UBool RuleBasedCollator::operator!=(const Collator& other) const
217 {
218 return !(*this == other);
219 }
220
221 // aliasing, not write-through
222 RuleBasedCollator& RuleBasedCollator::operator=(const RuleBasedCollator& that)
223 {
224 if (this != &that)
225 {
226 if (dataIsOwned)
227 {
228 ucol_close(ucollator);
229 }
230
231 urulestring.truncate(0); // empty the rule string
232 dataIsOwned = TRUE;
233 isWriteThroughAlias = FALSE;
234
235 UErrorCode intStatus = U_ZERO_ERROR;
236 int32_t buffersize = U_COL_SAFECLONE_BUFFERSIZE;
237 ucollator = ucol_safeClone(that.ucollator, NULL, &buffersize,
238 &intStatus);
239 if (U_SUCCESS(intStatus)) {
240 setRuleStringFromCollator();
241 }
242 }
243 return *this;
244 }
245
246 // aliasing, not write-through
247 Collator* RuleBasedCollator::clone() const
248 {
249 return new RuleBasedCollator(*this);
250 }
251
252 CollationElementIterator* RuleBasedCollator::createCollationElementIterator
253 (const UnicodeString& source) const
254 {
255 UErrorCode status = U_ZERO_ERROR;
256 CollationElementIterator *result = new CollationElementIterator(source, this,
257 status);
258 if (U_FAILURE(status)) {
259 delete result;
260 return NULL;
261 }
262
263 return result;
264 }
265
266 /**
267 * Create a CollationElementIterator object that will iterate over the
268 * elements in a string, using the collation rules defined in this
269 * RuleBasedCollator
270 */
271 CollationElementIterator* RuleBasedCollator::createCollationElementIterator
272 (const CharacterIterator& source) const
273 {
274 UErrorCode status = U_ZERO_ERROR;
275 CollationElementIterator *result = new CollationElementIterator(source, this,
276 status);
277
278 if (U_FAILURE(status)) {
279 delete result;
280 return NULL;
281 }
282
283 return result;
284 }
285
286 /**
287 * Return a string representation of this collator's rules. The string can
288 * later be passed to the constructor that takes a UnicodeString argument,
289 * which will construct a collator that's functionally identical to this one.
290 * You can also allow users to edit the string in order to change the collation
291 * data, or you can print it out for inspection, or whatever.
292 */
293 const UnicodeString& RuleBasedCollator::getRules() const
294 {
295 return urulestring;
296 }
297
298 void RuleBasedCollator::getRules(UColRuleOption delta, UnicodeString &buffer)
299 {
300 int32_t rulesize = ucol_getRulesEx(ucollator, delta, NULL, -1);
301
302 if (rulesize > 0) {
303 UChar *rules = (UChar*) uprv_malloc( sizeof(UChar) * (rulesize) );
304 if(rules != NULL) {
305 ucol_getRulesEx(ucollator, delta, rules, rulesize);
306 buffer.setTo(rules, rulesize);
307 uprv_free(rules);
308 } else { // couldn't allocate
309 buffer.remove();
310 }
311 }
312 else {
313 buffer.remove();
314 }
315 }
316
317 UnicodeSet *
318 RuleBasedCollator::getTailoredSet(UErrorCode &status) const
319 {
320 if(U_FAILURE(status)) {
321 return NULL;
322 }
323 return (UnicodeSet *)ucol_getTailoredSet(this->ucollator, &status);
324 }
325
326
327 void RuleBasedCollator::getVersion(UVersionInfo versionInfo) const
328 {
329 if (versionInfo!=NULL){
330 ucol_getVersion(ucollator, versionInfo);
331 }
332 }
333
334 Collator::EComparisonResult RuleBasedCollator::compare(
335 const UnicodeString& source,
336 const UnicodeString& target,
337 int32_t length) const
338 {
339 UErrorCode status = U_ZERO_ERROR;
340 return getEComparisonResult(compare(source.getBuffer(), uprv_min(length,source.length()), target.getBuffer(), uprv_min(length,target.length()), status));
341 }
342
343 UCollationResult RuleBasedCollator::compare(
344 const UnicodeString& source,
345 const UnicodeString& target,
346 int32_t length,
347 UErrorCode &status) const
348 {
349 return compare(source.getBuffer(), uprv_min(length,source.length()), target.getBuffer(), uprv_min(length,target.length()), status);
350 }
351
352 Collator::EComparisonResult RuleBasedCollator::compare(const UChar* source,
353 int32_t sourceLength,
354 const UChar* target,
355 int32_t targetLength)
356 const
357 {
358 return getEComparisonResult(ucol_strcoll(ucollator, source, sourceLength,
359 target, targetLength));
360 }
361
362 UCollationResult RuleBasedCollator::compare(const UChar* source,
363 int32_t sourceLength,
364 const UChar* target,
365 int32_t targetLength,
366 UErrorCode &status) const
367 {
368 if(U_SUCCESS(status)) {
369 return ucol_strcoll(ucollator, source, sourceLength, target, targetLength);
370 } else {
371 return UCOL_EQUAL;
372 }
373 }
374
375 /**
376 * Compare two strings using this collator
377 */
378 Collator::EComparisonResult RuleBasedCollator::compare(
379 const UnicodeString& source,
380 const UnicodeString& target) const
381 {
382 return getEComparisonResult(ucol_strcoll(ucollator, source.getBuffer(), source.length(),
383 target.getBuffer(), target.length()));
384 }
385
386 UCollationResult RuleBasedCollator::compare(
387 const UnicodeString& source,
388 const UnicodeString& target,
389 UErrorCode &status) const
390 {
391 if(U_SUCCESS(status)) {
392 return ucol_strcoll(ucollator, source.getBuffer(), source.length(),
393 target.getBuffer(), target.length());
394 } else {
395 return UCOL_EQUAL;
396 }
397 }
398
399 UCollationResult RuleBasedCollator::compare(UCharIterator &sIter,
400 UCharIterator &tIter,
401 UErrorCode &status) const {
402 if(U_SUCCESS(status)) {
403 return ucol_strcollIter(ucollator, &sIter, &tIter, &status);
404 } else {
405 return UCOL_EQUAL;
406 }
407 }
408
409 /**
410 * Retrieve a collation key for the specified string. The key can be compared
411 * with other collation keys using a bitwise comparison (e.g. memcmp) to find
412 * the ordering of their respective source strings. This is handy when doing a
413 * sort, where each sort key must be compared many times.
414 *
415 * The basic algorithm here is to find all of the collation elements for each
416 * character in the source string, convert them to an ASCII representation, and
417 * put them into the collation key. But it's trickier than that. Each
418 * collation element in a string has three components: primary ('A' vs 'B'),
419 * secondary ('u' vs '\u00FC'), and tertiary ('A' vs 'a'), and a primary difference
420 * at the end of a string takes precedence over a secondary or tertiary
421 * difference earlier in the string.
422 *
423 * To account for this, we put all of the primary orders at the beginning of
424 * the string, followed by the secondary and tertiary orders. Each set of
425 * orders is terminated by nulls so that a key for a string which is an initial
426 * substring of another key will compare less without any special case.
427 *
428 * Here's a hypothetical example, with the collation element represented as a
429 * three-digit number, one digit for primary, one for secondary, etc.
430 *
431 * String: A a B \u00C9
432 * Collation Elements: 101 100 201 511
433 * Collation Key: 1125<null>0001<null>1011<null>
434 *
435 * To make things even trickier, secondary differences (accent marks) are
436 * compared starting at the *end* of the string in languages with French
437 * secondary ordering. But when comparing the accent marks on a single base
438 * character, they are compared from the beginning. To handle this, we reverse
439 * all of the accents that belong to each base character, then we reverse the
440 * entire string of secondary orderings at the end.
441 */
442 CollationKey& RuleBasedCollator::getCollationKey(
443 const UnicodeString& source,
444 CollationKey& sortkey,
445 UErrorCode& status) const
446 {
447 return getCollationKey(source.getBuffer(), source.length(), sortkey, status);
448 }
449
450 CollationKey& RuleBasedCollator::getCollationKey(const UChar* source,
451 int32_t sourceLen,
452 CollationKey& sortkey,
453 UErrorCode& status) const
454 {
455 if (U_FAILURE(status))
456 {
457 return sortkey.setToBogus();
458 }
459
460 if ((!source) || (sourceLen == 0)) {
461 return sortkey.reset();
462 }
463
464 uint8_t *result;
465 int32_t resultLen = ucol_getSortKeyWithAllocation(ucollator,
466 source, sourceLen,
467 &result,
468 &status);
469 sortkey.adopt(result, resultLen);
470 return sortkey;
471 }
472
473 /**
474 * Return the maximum length of any expansion sequences that end with the
475 * specified comparison order.
476 * @param order a collation order returned by previous or next.
477 * @return the maximum length of any expansion seuences ending with the
478 * specified order or 1 if collation order does not occur at the end of any
479 * expansion sequence.
480 * @see CollationElementIterator#getMaxExpansion
481 */
482 int32_t RuleBasedCollator::getMaxExpansion(int32_t order) const
483 {
484 uint8_t result;
485 UCOL_GETMAXEXPANSION(ucollator, (uint32_t)order, result);
486 return result;
487 }
488
489 uint8_t* RuleBasedCollator::cloneRuleData(int32_t &length,
490 UErrorCode &status)
491 {
492 return ucol_cloneRuleData(ucollator, &length, &status);
493 }
494
495
496 int32_t RuleBasedCollator::cloneBinary(uint8_t *buffer, int32_t capacity, UErrorCode &status)
497 {
498 return ucol_cloneBinary(ucollator, buffer, capacity, &status);
499 }
500
501 void RuleBasedCollator::setAttribute(UColAttribute attr,
502 UColAttributeValue value,
503 UErrorCode &status)
504 {
505 if (U_FAILURE(status))
506 return;
507 checkOwned();
508 ucol_setAttribute(ucollator, attr, value, &status);
509 }
510
511 UColAttributeValue RuleBasedCollator::getAttribute(UColAttribute attr,
512 UErrorCode &status)
513 {
514 if (U_FAILURE(status))
515 return UCOL_DEFAULT;
516 return ucol_getAttribute(ucollator, attr, &status);
517 }
518
519 uint32_t RuleBasedCollator::setVariableTop(const UChar *varTop, int32_t len, UErrorCode &status) {
520 checkOwned();
521 return ucol_setVariableTop(ucollator, varTop, len, &status);
522 }
523
524 uint32_t RuleBasedCollator::setVariableTop(const UnicodeString varTop, UErrorCode &status) {
525 checkOwned();
526 return ucol_setVariableTop(ucollator, varTop.getBuffer(), varTop.length(), &status);
527 }
528
529 void RuleBasedCollator::setVariableTop(const uint32_t varTop, UErrorCode &status) {
530 checkOwned();
531 ucol_restoreVariableTop(ucollator, varTop, &status);
532 }
533
534 uint32_t RuleBasedCollator::getVariableTop(UErrorCode &status) const {
535 return ucol_getVariableTop(ucollator, &status);
536 }
537
538 Collator* RuleBasedCollator::safeClone(void)
539 {
540 UErrorCode intStatus = U_ZERO_ERROR;
541 int32_t buffersize = U_COL_SAFECLONE_BUFFERSIZE;
542 UCollator *ucol = ucol_safeClone(ucollator, NULL, &buffersize,
543 &intStatus);
544 if (U_FAILURE(intStatus)) {
545 return NULL;
546 }
547
548 RuleBasedCollator *result = new RuleBasedCollator();
549 // Null pointer check
550 if (result != NULL) {
551 result->ucollator = ucol;
552 result->dataIsOwned = TRUE;
553 result->isWriteThroughAlias = FALSE;
554 setRuleStringFromCollator();
555 }
556
557 return result;
558 }
559
560
561 int32_t RuleBasedCollator::getSortKey(const UnicodeString& source,
562 uint8_t *result, int32_t resultLength)
563 const
564 {
565 return ucol_getSortKey(ucollator, source.getBuffer(), source.length(), result, resultLength);
566 }
567
568 int32_t RuleBasedCollator::getSortKey(const UChar *source,
569 int32_t sourceLength, uint8_t *result,
570 int32_t resultLength) const
571 {
572 return ucol_getSortKey(ucollator, source, sourceLength, result, resultLength);
573 }
574
575 Collator::ECollationStrength RuleBasedCollator::getStrength(void) const
576 {
577 UErrorCode intStatus = U_ZERO_ERROR;
578 return getECollationStrength(ucol_getAttribute(ucollator, UCOL_STRENGTH,
579 &intStatus));
580 }
581
582 void RuleBasedCollator::setStrength(ECollationStrength newStrength)
583 {
584 checkOwned();
585 UErrorCode intStatus = U_ZERO_ERROR;
586 UCollationStrength strength = getUCollationStrength(newStrength);
587 ucol_setAttribute(ucollator, UCOL_STRENGTH, strength, &intStatus);
588 }
589
590 int32_t RuleBasedCollator::getReorderCodes(int32_t *dest,
591 int32_t destCapacity,
592 UErrorCode& status) const
593 {
594 return ucol_getReorderCodes(ucollator, dest, destCapacity, &status);
595 }
596
597 void RuleBasedCollator::setReorderCodes(const int32_t *reorderCodes,
598 int32_t reorderCodesLength,
599 UErrorCode& status)
600 {
601 ucol_setReorderCodes(ucollator, reorderCodes, reorderCodesLength, &status);
602 }
603
604
605 /**
606 * Create a hash code for this collation. Just hash the main rule table -- that
607 * should be good enough for almost any use.
608 */
609 int32_t RuleBasedCollator::hashCode() const
610 {
611 int32_t length;
612 const UChar *rules = ucol_getRules(ucollator, &length);
613 return uhash_hashUCharsN(rules, length);
614 }
615
616 /**
617 * return the locale of this collator
618 */
619 const Locale RuleBasedCollator::getLocale(ULocDataLocaleType type, UErrorCode &status) const {
620 const char *result = ucol_getLocaleByType(ucollator, type, &status);
621 if(result == NULL) {
622 Locale res("");
623 res.setToBogus();
624 return res;
625 } else {
626 return Locale(result);
627 }
628 }
629
630 void
631 RuleBasedCollator::setLocales(const Locale& requestedLocale, const Locale& validLocale, const Locale& actualLocale) {
632 checkOwned();
633 char* rloc = uprv_strdup(requestedLocale.getName());
634 if (rloc) {
635 char* vloc = uprv_strdup(validLocale.getName());
636 if (vloc) {
637 char* aloc = uprv_strdup(actualLocale.getName());
638 if (aloc) {
639 ucol_setReqValidLocales(ucollator, rloc, vloc, aloc);
640 return;
641 }
642 uprv_free(vloc);
643 }
644 uprv_free(rloc);
645 }
646 }
647
648 // RuleBaseCollatorNew private constructor ----------------------------------
649
650 RuleBasedCollator::RuleBasedCollator()
651 : dataIsOwned(FALSE), isWriteThroughAlias(FALSE), ucollator(NULL)
652 {
653 }
654
655 RuleBasedCollator::RuleBasedCollator(const Locale& desiredLocale,
656 UErrorCode& status)
657 : dataIsOwned(FALSE), isWriteThroughAlias(FALSE), ucollator(NULL)
658 {
659 if (U_FAILURE(status))
660 return;
661
662 /*
663 Try to load, in order:
664 1. The desired locale's collation.
665 2. A fallback of the desired locale.
666 3. The default locale's collation.
667 4. A fallback of the default locale.
668 5. The default collation rules, which contains en_US collation rules.
669
670 To reiterate, we try:
671 Specific:
672 language+country+variant
673 language+country
674 language
675 Default:
676 language+country+variant
677 language+country
678 language
679 Root: (aka DEFAULTRULES)
680 steps 1-5 are handled by resource bundle fallback mechanism.
681 however, in a very unprobable situation that no resource bundle
682 data exists, step 5 is repeated with hardcoded default rules.
683 */
684
685 setUCollator(desiredLocale, status);
686
687 if (U_FAILURE(status))
688 {
689 status = U_ZERO_ERROR;
690
691 setUCollator(kRootLocaleName, status);
692 if (status == U_ZERO_ERROR) {
693 status = U_USING_DEFAULT_WARNING;
694 }
695 }
696
697 if (U_SUCCESS(status))
698 {
699 setRuleStringFromCollator();
700 }
701 }
702
703 void
704 RuleBasedCollator::setUCollator(const char *locale,
705 UErrorCode &status)
706 {
707 if (U_FAILURE(status))
708 return;
709 if (ucollator && dataIsOwned)
710 ucol_close(ucollator);
711 ucollator = ucol_open_internal(locale, &status);
712 dataIsOwned = TRUE;
713 isWriteThroughAlias = FALSE;
714 }
715
716
717 void
718 RuleBasedCollator::checkOwned() {
719 if (!(dataIsOwned || isWriteThroughAlias)) {
720 UErrorCode status = U_ZERO_ERROR;
721 ucollator = ucol_safeClone(ucollator, NULL, NULL, &status);
722 setRuleStringFromCollator();
723 dataIsOwned = TRUE;
724 isWriteThroughAlias = FALSE;
725 }
726 }
727
728 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedCollator)
729
730 U_NAMESPACE_END
731
732 #endif /* #if !UCONFIG_NO_COLLATION */