]> git.saurik.com Git - apple/icu.git/blob - icuSources/i18n/tblcoll.cpp
ICU-400.37.tar.gz
[apple/icu.git] / icuSources / i18n / tblcoll.cpp
1 /*
2 ******************************************************************************
3 * Copyright (C) 1996-2008, International Business Machines Corporation and *
4 * others. All Rights Reserved. *
5 ******************************************************************************
6 */
7
8 /**
9 * File tblcoll.cpp
10 *
11 * Created by: Helena Shih
12 *
13 * Modification History:
14 *
15 * Date Name Description
16 * 2/5/97 aliu Added streamIn and streamOut methods. Added
17 * constructor which reads RuleBasedCollator object from
18 * a binary file. Added writeToFile method which streams
19 * RuleBasedCollator out to a binary file. The streamIn
20 * and streamOut methods use istream and ostream objects
21 * in binary mode.
22 * 2/11/97 aliu Moved declarations out of for loop initializer.
23 * Added Mac compatibility #ifdef for ios::nocreate.
24 * 2/12/97 aliu Modified to use TableCollationData sub-object to
25 * hold invariant data.
26 * 2/13/97 aliu Moved several methods into this class from Collation.
27 * Added a private RuleBasedCollator(Locale&) constructor,
28 * to be used by Collator::getInstance(). General
29 * clean up. Made use of UErrorCode variables consistent.
30 * 2/20/97 helena Added clone, operator==, operator!=, operator=, and copy
31 * constructor and getDynamicClassID.
32 * 3/5/97 aliu Changed compaction cycle to improve performance. We
33 * use the maximum allowable value which is kBlockCount.
34 * Modified getRules() to load rules dynamically. Changed
35 * constructFromFile() call to accomodate this (added
36 * parameter to specify whether binary loading is to
37 * take place).
38 * 05/06/97 helena Added memory allocation error check.
39 * 6/20/97 helena Java class name change.
40 * 6/23/97 helena Adding comments to make code more readable.
41 * 09/03/97 helena Added createCollationKeyValues().
42 * 06/26/98 erm Changes for CollationKeys using byte arrays.
43 * 08/10/98 erm Synched with 1.2 version of RuleBasedCollator.java
44 * 04/23/99 stephen Removed EDecompositionMode, merged with
45 * Normalizer::EMode
46 * 06/14/99 stephen Removed kResourceBundleSuffix
47 * 06/22/99 stephen Fixed logic in constructFromFile() since .ctx
48 * files are no longer used.
49 * 11/02/99 helena Collator performance enhancements. Special case
50 * for NO_OP situations.
51 * 11/17/99 srl More performance enhancements. Inlined some internal functions.
52 * 12/15/99 aliu Update to support Thai collation. Move NormalizerIterator
53 * to implementation file.
54 * 01/29/01 synwee Modified into a C++ wrapper calling C APIs (ucol.h)
55 */
56
57 #include "unicode/utypes.h"
58
59 #if !UCONFIG_NO_COLLATION
60
61 #include "unicode/tblcoll.h"
62 #include "unicode/coleitr.h"
63 #include "unicode/ures.h"
64 #include "unicode/uset.h"
65 #include "ucol_imp.h"
66 #include "uresimp.h"
67 #include "uhash.h"
68 #include "cmemory.h"
69 #include "cstring.h"
70 #include "putilimp.h"
71
72 /* public RuleBasedCollator constructor ---------------------------------- */
73
74 U_NAMESPACE_BEGIN
75
76 /**
77 * Copy constructor, aliasing, not write-through
78 */
79 RuleBasedCollator::RuleBasedCollator(const RuleBasedCollator& that)
80 : Collator(that)
81 , dataIsOwned(FALSE)
82 , isWriteThroughAlias(FALSE)
83 , ucollator(NULL)
84 {
85 RuleBasedCollator::operator=(that);
86 }
87
88 RuleBasedCollator::RuleBasedCollator(const UnicodeString& rules,
89 UErrorCode& status) :
90 dataIsOwned(FALSE)
91 {
92 construct(rules,
93 UCOL_DEFAULT_STRENGTH,
94 UCOL_DEFAULT,
95 status);
96 }
97
98 RuleBasedCollator::RuleBasedCollator(const UnicodeString& rules,
99 ECollationStrength collationStrength,
100 UErrorCode& status) : dataIsOwned(FALSE)
101 {
102 construct(rules,
103 getUCollationStrength(collationStrength),
104 UCOL_DEFAULT,
105 status);
106 }
107
108 RuleBasedCollator::RuleBasedCollator(const UnicodeString& rules,
109 UColAttributeValue decompositionMode,
110 UErrorCode& status) :
111 dataIsOwned(FALSE)
112 {
113 construct(rules,
114 UCOL_DEFAULT_STRENGTH,
115 decompositionMode,
116 status);
117 }
118
119 RuleBasedCollator::RuleBasedCollator(const UnicodeString& rules,
120 ECollationStrength collationStrength,
121 UColAttributeValue decompositionMode,
122 UErrorCode& status) : dataIsOwned(FALSE)
123 {
124 construct(rules,
125 getUCollationStrength(collationStrength),
126 decompositionMode,
127 status);
128 }
129 RuleBasedCollator::RuleBasedCollator(const uint8_t *bin, int32_t length,
130 const RuleBasedCollator *base,
131 UErrorCode &status) :
132 dataIsOwned(TRUE),
133 isWriteThroughAlias(FALSE)
134 {
135 ucollator = ucol_openBinary(bin, length, base->ucollator, &status);
136 }
137
138 void
139 RuleBasedCollator::setRuleStringFromCollator()
140 {
141 int32_t length;
142 const UChar *r = ucol_getRules(ucollator, &length);
143
144 if (r && length > 0) {
145 // alias the rules string
146 urulestring.setTo(TRUE, r, length);
147 }
148 else {
149 urulestring.truncate(0); // Clear string.
150 }
151 }
152
153 // not aliasing, not write-through
154 void
155 RuleBasedCollator::construct(const UnicodeString& rules,
156 UColAttributeValue collationStrength,
157 UColAttributeValue decompositionMode,
158 UErrorCode& status)
159 {
160 ucollator = ucol_openRules(rules.getBuffer(), rules.length(),
161 decompositionMode, collationStrength,
162 NULL, &status);
163
164 dataIsOwned = TRUE; // since we own a collator now, we need to get rid of it
165 isWriteThroughAlias = FALSE;
166
167 if(ucollator == NULL) {
168 if(U_SUCCESS(status)) {
169 status = U_MEMORY_ALLOCATION_ERROR;
170 }
171 return; // Failure
172 }
173
174 setRuleStringFromCollator();
175 }
176
177 /* RuleBasedCollator public destructor ----------------------------------- */
178
179 RuleBasedCollator::~RuleBasedCollator()
180 {
181 if (dataIsOwned)
182 {
183 ucol_close(ucollator);
184 }
185 ucollator = 0;
186 }
187
188 /* RuleBaseCollator public methods --------------------------------------- */
189
190 UBool RuleBasedCollator::operator==(const Collator& that) const
191 {
192 /* only checks for address equals here */
193 if (Collator::operator==(that))
194 return TRUE;
195
196 if (getDynamicClassID() != that.getDynamicClassID())
197 return FALSE; /* not the same class */
198
199 RuleBasedCollator& thatAlias = (RuleBasedCollator&)that;
200
201 // weiv: use C function, commented code below is wrong
202 return ucol_equals(this->ucollator, thatAlias.ucollator);
203 /*
204 synwee : orginal code does not check for data compatibility
205 */
206 /*
207 if (ucollator != thatAlias.ucollator)
208 return FALSE;
209
210 return TRUE;
211 */
212 }
213
214 UBool RuleBasedCollator::operator!=(const Collator& other) const
215 {
216 return !(*this == other);
217 }
218
219 // aliasing, not write-through
220 RuleBasedCollator& RuleBasedCollator::operator=(const RuleBasedCollator& that)
221 {
222 if (this != &that)
223 {
224 if (dataIsOwned)
225 {
226 ucol_close(ucollator);
227 }
228
229 urulestring.truncate(0); // empty the rule string
230 dataIsOwned = TRUE;
231 isWriteThroughAlias = FALSE;
232
233 UErrorCode intStatus = U_ZERO_ERROR;
234 int32_t buffersize = U_COL_SAFECLONE_BUFFERSIZE;
235 ucollator = ucol_safeClone(that.ucollator, NULL, &buffersize,
236 &intStatus);
237 if (U_SUCCESS(intStatus)) {
238 setRuleStringFromCollator();
239 }
240 }
241 return *this;
242 }
243
244 // aliasing, not write-through
245 Collator* RuleBasedCollator::clone() const
246 {
247 return new RuleBasedCollator(*this);
248 }
249
250 CollationElementIterator* RuleBasedCollator::createCollationElementIterator
251 (const UnicodeString& source) const
252 {
253 UErrorCode status = U_ZERO_ERROR;
254 CollationElementIterator *result = new CollationElementIterator(source, this,
255 status);
256 if (U_FAILURE(status)) {
257 delete result;
258 return NULL;
259 }
260
261 return result;
262 }
263
264 /**
265 * Create a CollationElementIterator object that will iterate over the
266 * elements in a string, using the collation rules defined in this
267 * RuleBasedCollator
268 */
269 CollationElementIterator* RuleBasedCollator::createCollationElementIterator
270 (const CharacterIterator& source) const
271 {
272 UErrorCode status = U_ZERO_ERROR;
273 CollationElementIterator *result = new CollationElementIterator(source, this,
274 status);
275
276 if (U_FAILURE(status)) {
277 delete result;
278 return NULL;
279 }
280
281 return result;
282 }
283
284 /**
285 * Return a string representation of this collator's rules. The string can
286 * later be passed to the constructor that takes a UnicodeString argument,
287 * which will construct a collator that's functionally identical to this one.
288 * You can also allow users to edit the string in order to change the collation
289 * data, or you can print it out for inspection, or whatever.
290 */
291 const UnicodeString& RuleBasedCollator::getRules() const
292 {
293 return urulestring;
294 }
295
296 void RuleBasedCollator::getRules(UColRuleOption delta, UnicodeString &buffer)
297 {
298 int32_t rulesize = ucol_getRulesEx(ucollator, delta, NULL, -1);
299
300 if (rulesize > 0) {
301 UChar *rules = (UChar*) uprv_malloc( sizeof(UChar) * (rulesize) );
302 if(rules != NULL) {
303 ucol_getRulesEx(ucollator, delta, rules, rulesize);
304 buffer.setTo(rules, rulesize);
305 uprv_free(rules);
306 } else { // couldn't allocate
307 buffer.remove();
308 }
309 }
310 else {
311 buffer.remove();
312 }
313 }
314
315 UnicodeSet *
316 RuleBasedCollator::getTailoredSet(UErrorCode &status) const
317 {
318 if(U_FAILURE(status)) {
319 return NULL;
320 }
321 return (UnicodeSet *)ucol_getTailoredSet(this->ucollator, &status);
322 }
323
324
325 void RuleBasedCollator::getVersion(UVersionInfo versionInfo) const
326 {
327 if (versionInfo!=NULL){
328 ucol_getVersion(ucollator, versionInfo);
329 }
330 }
331
332 Collator::EComparisonResult RuleBasedCollator::compare(
333 const UnicodeString& source,
334 const UnicodeString& target,
335 int32_t length) const
336 {
337 UErrorCode status = U_ZERO_ERROR;
338 return getEComparisonResult(compare(source.getBuffer(), uprv_min(length,source.length()), target.getBuffer(), uprv_min(length,target.length()), status));
339 }
340
341 UCollationResult RuleBasedCollator::compare(
342 const UnicodeString& source,
343 const UnicodeString& target,
344 int32_t length,
345 UErrorCode &status) const
346 {
347 return compare(source.getBuffer(), uprv_min(length,source.length()), target.getBuffer(), uprv_min(length,target.length()), status);
348 }
349
350 Collator::EComparisonResult RuleBasedCollator::compare(const UChar* source,
351 int32_t sourceLength,
352 const UChar* target,
353 int32_t targetLength)
354 const
355 {
356 return getEComparisonResult(ucol_strcoll(ucollator, source, sourceLength,
357 target, targetLength));
358 }
359
360 UCollationResult RuleBasedCollator::compare(const UChar* source,
361 int32_t sourceLength,
362 const UChar* target,
363 int32_t targetLength,
364 UErrorCode &status) const
365 {
366 if(U_SUCCESS(status)) {
367 return ucol_strcoll(ucollator, source, sourceLength, target, targetLength);
368 } else {
369 return UCOL_EQUAL;
370 }
371 }
372
373 /**
374 * Compare two strings using this collator
375 */
376 Collator::EComparisonResult RuleBasedCollator::compare(
377 const UnicodeString& source,
378 const UnicodeString& target) const
379 {
380 return getEComparisonResult(ucol_strcoll(ucollator, source.getBuffer(), source.length(),
381 target.getBuffer(), target.length()));
382 }
383
384 UCollationResult RuleBasedCollator::compare(
385 const UnicodeString& source,
386 const UnicodeString& target,
387 UErrorCode &status) const
388 {
389 if(U_SUCCESS(status)) {
390 return ucol_strcoll(ucollator, source.getBuffer(), source.length(),
391 target.getBuffer(), target.length());
392 } else {
393 return UCOL_EQUAL;
394 }
395 }
396
397 /**
398 * Retrieve a collation key for the specified string. The key can be compared
399 * with other collation keys using a bitwise comparison (e.g. memcmp) to find
400 * the ordering of their respective source strings. This is handy when doing a
401 * sort, where each sort key must be compared many times.
402 *
403 * The basic algorithm here is to find all of the collation elements for each
404 * character in the source string, convert them to an ASCII representation, and
405 * put them into the collation key. But it's trickier than that. Each
406 * collation element in a string has three components: primary ('A' vs 'B'),
407 * secondary ('u' vs '\u00FC'), and tertiary ('A' vs 'a'), and a primary difference
408 * at the end of a string takes precedence over a secondary or tertiary
409 * difference earlier in the string.
410 *
411 * To account for this, we put all of the primary orders at the beginning of
412 * the string, followed by the secondary and tertiary orders. Each set of
413 * orders is terminated by nulls so that a key for a string which is a initial
414 * substring of another key will compare less without any special case.
415 *
416 * Here's a hypothetical example, with the collation element represented as a
417 * three-digit number, one digit for primary, one for secondary, etc.
418 *
419 * String: A a B \u00C9
420 * Collation Elements: 101 100 201 511
421 * Collation Key: 1125<null>0001<null>1011<null>
422 *
423 * To make things even trickier, secondary differences (accent marks) are
424 * compared starting at the *end* of the string in languages with French
425 * secondary ordering. But when comparing the accent marks on a single base
426 * character, they are compared from the beginning. To handle this, we reverse
427 * all of the accents that belong to each base character, then we reverse the
428 * entire string of secondary orderings at the end.
429 */
430 CollationKey& RuleBasedCollator::getCollationKey(
431 const UnicodeString& source,
432 CollationKey& sortkey,
433 UErrorCode& status) const
434 {
435 return getCollationKey(source.getBuffer(), source.length(), sortkey, status);
436 }
437
438 CollationKey& RuleBasedCollator::getCollationKey(const UChar* source,
439 int32_t sourceLen,
440 CollationKey& sortkey,
441 UErrorCode& status) const
442 {
443 if (U_FAILURE(status))
444 {
445 return sortkey.setToBogus();
446 }
447
448 if ((!source) || (sourceLen == 0)) {
449 return sortkey.reset();
450 }
451
452 uint8_t *result;
453 int32_t resultLen = ucol_getSortKeyWithAllocation(ucollator,
454 source, sourceLen,
455 &result,
456 &status);
457 sortkey.adopt(result, resultLen);
458 return sortkey;
459 }
460
461 /**
462 * Return the maximum length of any expansion sequences that end with the
463 * specified comparison order.
464 * @param order a collation order returned by previous or next.
465 * @return the maximum length of any expansion seuences ending with the
466 * specified order or 1 if collation order does not occur at the end of any
467 * expansion sequence.
468 * @see CollationElementIterator#getMaxExpansion
469 */
470 int32_t RuleBasedCollator::getMaxExpansion(int32_t order) const
471 {
472 uint8_t result;
473 UCOL_GETMAXEXPANSION(ucollator, (uint32_t)order, result);
474 return result;
475 }
476
477 uint8_t* RuleBasedCollator::cloneRuleData(int32_t &length,
478 UErrorCode &status)
479 {
480 return ucol_cloneRuleData(ucollator, &length, &status);
481 }
482
483
484 int32_t RuleBasedCollator::cloneBinary(uint8_t *buffer, int32_t capacity, UErrorCode &status)
485 {
486 return ucol_cloneBinary(ucollator, buffer, capacity, &status);
487 }
488
489 void RuleBasedCollator::setAttribute(UColAttribute attr,
490 UColAttributeValue value,
491 UErrorCode &status)
492 {
493 if (U_FAILURE(status))
494 return;
495 checkOwned();
496 ucol_setAttribute(ucollator, attr, value, &status);
497 }
498
499 UColAttributeValue RuleBasedCollator::getAttribute(UColAttribute attr,
500 UErrorCode &status)
501 {
502 if (U_FAILURE(status))
503 return UCOL_DEFAULT;
504 return ucol_getAttribute(ucollator, attr, &status);
505 }
506
507 uint32_t RuleBasedCollator::setVariableTop(const UChar *varTop, int32_t len, UErrorCode &status) {
508 checkOwned();
509 return ucol_setVariableTop(ucollator, varTop, len, &status);
510 }
511
512 uint32_t RuleBasedCollator::setVariableTop(const UnicodeString varTop, UErrorCode &status) {
513 checkOwned();
514 return ucol_setVariableTop(ucollator, varTop.getBuffer(), varTop.length(), &status);
515 }
516
517 void RuleBasedCollator::setVariableTop(const uint32_t varTop, UErrorCode &status) {
518 checkOwned();
519 ucol_restoreVariableTop(ucollator, varTop, &status);
520 }
521
522 uint32_t RuleBasedCollator::getVariableTop(UErrorCode &status) const {
523 return ucol_getVariableTop(ucollator, &status);
524 }
525
526 Collator* RuleBasedCollator::safeClone(void)
527 {
528 UErrorCode intStatus = U_ZERO_ERROR;
529 int32_t buffersize = U_COL_SAFECLONE_BUFFERSIZE;
530 UCollator *ucol = ucol_safeClone(ucollator, NULL, &buffersize,
531 &intStatus);
532 if (U_FAILURE(intStatus)) {
533 return NULL;
534 }
535
536 RuleBasedCollator *result = new RuleBasedCollator();
537 // Null pointer check
538 if (result != NULL) {
539 result->ucollator = ucol;
540 result->dataIsOwned = TRUE;
541 result->isWriteThroughAlias = FALSE;
542 setRuleStringFromCollator();
543 }
544
545 return result;
546 }
547
548
549 int32_t RuleBasedCollator::getSortKey(const UnicodeString& source,
550 uint8_t *result, int32_t resultLength)
551 const
552 {
553 return ucol_getSortKey(ucollator, source.getBuffer(), source.length(), result, resultLength);
554 }
555
556 int32_t RuleBasedCollator::getSortKey(const UChar *source,
557 int32_t sourceLength, uint8_t *result,
558 int32_t resultLength) const
559 {
560 return ucol_getSortKey(ucollator, source, sourceLength, result, resultLength);
561 }
562
563 Collator::ECollationStrength RuleBasedCollator::getStrength(void) const
564 {
565 UErrorCode intStatus = U_ZERO_ERROR;
566 return getECollationStrength(ucol_getAttribute(ucollator, UCOL_STRENGTH,
567 &intStatus));
568 }
569
570 void RuleBasedCollator::setStrength(ECollationStrength newStrength)
571 {
572 checkOwned();
573 UErrorCode intStatus = U_ZERO_ERROR;
574 UCollationStrength strength = getUCollationStrength(newStrength);
575 ucol_setAttribute(ucollator, UCOL_STRENGTH, strength, &intStatus);
576 }
577
578 /**
579 * Create a hash code for this collation. Just hash the main rule table -- that
580 * should be good enough for almost any use.
581 */
582 int32_t RuleBasedCollator::hashCode() const
583 {
584 int32_t length;
585 const UChar *rules = ucol_getRules(ucollator, &length);
586 return uhash_hashUCharsN(rules, length);
587 }
588
589 /**
590 * return the locale of this collator
591 */
592 const Locale RuleBasedCollator::getLocale(ULocDataLocaleType type, UErrorCode &status) const {
593 const char *result = ucol_getLocale(ucollator, type, &status);
594 if(result == NULL) {
595 Locale res("");
596 res.setToBogus();
597 return res;
598 } else {
599 return Locale(result);
600 }
601 }
602
603 void
604 RuleBasedCollator::setLocales(const Locale& requestedLocale, const Locale& validLocale, const Locale& actualLocale) {
605 checkOwned();
606 char* rloc = uprv_strdup(requestedLocale.getName());
607 if (rloc) {
608 char* vloc = uprv_strdup(validLocale.getName());
609 if (vloc) {
610 char* aloc = uprv_strdup(actualLocale.getName());
611 if (aloc) {
612 ucol_setReqValidLocales(ucollator, rloc, vloc, aloc);
613 return;
614 }
615 uprv_free(vloc);
616 }
617 uprv_free(rloc);
618 }
619 }
620
621 // RuleBaseCollatorNew private constructor ----------------------------------
622
623 RuleBasedCollator::RuleBasedCollator()
624 : dataIsOwned(FALSE), isWriteThroughAlias(FALSE), ucollator(NULL)
625 {
626 }
627
628 RuleBasedCollator::RuleBasedCollator(const Locale& desiredLocale,
629 UErrorCode& status)
630 : dataIsOwned(FALSE), isWriteThroughAlias(FALSE), ucollator(NULL)
631 {
632 if (U_FAILURE(status))
633 return;
634
635 /*
636 Try to load, in order:
637 1. The desired locale's collation.
638 2. A fallback of the desired locale.
639 3. The default locale's collation.
640 4. A fallback of the default locale.
641 5. The default collation rules, which contains en_US collation rules.
642
643 To reiterate, we try:
644 Specific:
645 language+country+variant
646 language+country
647 language
648 Default:
649 language+country+variant
650 language+country
651 language
652 Root: (aka DEFAULTRULES)
653 steps 1-5 are handled by resource bundle fallback mechanism.
654 however, in a very unprobable situation that no resource bundle
655 data exists, step 5 is repeated with hardcoded default rules.
656 */
657
658 setUCollator(desiredLocale, status);
659
660 if (U_FAILURE(status))
661 {
662 status = U_ZERO_ERROR;
663
664 setUCollator(kRootLocaleName, status);
665 if (status == U_ZERO_ERROR) {
666 status = U_USING_DEFAULT_WARNING;
667 }
668 }
669
670 if (U_SUCCESS(status))
671 {
672 setRuleStringFromCollator();
673 }
674 }
675
676 void
677 RuleBasedCollator::setUCollator(const char *locale,
678 UErrorCode &status)
679 {
680 if (U_FAILURE(status))
681 return;
682 if (ucollator && dataIsOwned)
683 ucol_close(ucollator);
684 ucollator = ucol_open_internal(locale, &status);
685 dataIsOwned = TRUE;
686 isWriteThroughAlias = FALSE;
687 }
688
689
690 void
691 RuleBasedCollator::checkOwned() {
692 if (!(dataIsOwned || isWriteThroughAlias)) {
693 UErrorCode status = U_ZERO_ERROR;
694 ucollator = ucol_safeClone(ucollator, NULL, NULL, &status);
695 setRuleStringFromCollator();
696 dataIsOwned = TRUE;
697 isWriteThroughAlias = FALSE;
698 }
699 }
700
701 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedCollator)
702
703 U_NAMESPACE_END
704
705 #endif /* #if !UCONFIG_NO_COLLATION */