1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 *******************************************************************************
5 * Copyright (C) 2009-2014, International Business Machines Corporation and
6 * others. All Rights Reserved.
7 *******************************************************************************
10 #include "unicode/utypes.h"
12 #if !UCONFIG_NO_COLLATION
14 #include "unicode/alphaindex.h"
15 #include "unicode/coll.h"
16 #include "unicode/localpointer.h"
17 #include "unicode/normalizer2.h"
18 #include "unicode/tblcoll.h"
19 #include "unicode/uchar.h"
20 #include "unicode/ulocdata.h"
21 #include "unicode/uniset.h"
22 #include "unicode/uobject.h"
23 #include "unicode/usetiter.h"
24 #include "unicode/utf16.h"
40 * Prefix string for Chinese index buckets.
41 * See http://unicode.org/repos/cldr/trunk/specs/ldml/tr35-collation.html#Collation_Indexes
43 const UChar BASE
[1] = { 0xFDD0 };
44 const int32_t BASE_LENGTH
= 1;
46 UBool
isOneLabelBetterThanOther(const Normalizer2
&nfkdNormalizer
,
47 const UnicodeString
&one
, const UnicodeString
&other
);
51 static int32_t U_CALLCONV
52 collatorComparator(const void *context
, const void *left
, const void *right
);
54 static int32_t U_CALLCONV
55 recordCompareFn(const void *context
, const void *left
, const void *right
);
57 // UVector<Record *> support function, delete a Record.
58 static void U_CALLCONV
59 alphaIndex_deleteRecord(void *obj
) {
60 delete static_cast<AlphabeticIndex::Record
*>(obj
);
65 UnicodeString
*ownedString(const UnicodeString
&s
, LocalPointer
<UnicodeString
> &owned
,
66 UErrorCode
&errorCode
) {
67 if (U_FAILURE(errorCode
)) { return NULL
; }
68 if (owned
.isValid()) {
69 return owned
.orphan();
71 UnicodeString
*p
= new UnicodeString(s
);
73 errorCode
= U_MEMORY_ALLOCATION_ERROR
;
78 inline UnicodeString
*getString(const UVector
&list
, int32_t i
) {
79 return static_cast<UnicodeString
*>(list
[i
]);
82 inline AlphabeticIndex::Bucket
*getBucket(const UVector
&list
, int32_t i
) {
83 return static_cast<AlphabeticIndex::Bucket
*>(list
[i
]);
86 inline AlphabeticIndex::Record
*getRecord(const UVector
&list
, int32_t i
) {
87 return static_cast<AlphabeticIndex::Record
*>(list
[i
]);
91 * Like Java Collections.binarySearch(List, String, Comparator).
93 * @return the index>=0 where the item was found,
94 * or the index<0 for inserting the string at ~index in sorted order
96 int32_t binarySearch(const UVector
&list
, const UnicodeString
&s
, const Collator
&coll
) {
97 if (list
.size() == 0) { return ~0; }
99 int32_t limit
= list
.size();
101 int32_t i
= (start
+ limit
) / 2;
102 const UnicodeString
*si
= static_cast<UnicodeString
*>(list
.elementAt(i
));
103 UErrorCode errorCode
= U_ZERO_ERROR
;
104 UCollationResult cmp
= coll
.compare(s
, *si
, errorCode
);
105 if (cmp
== UCOL_EQUAL
) {
107 } else if (cmp
< 0) {
109 return ~start
; // insert s before *si
114 return ~(start
+ 1); // insert s after *si
123 // The BucketList is not in the anonymous namespace because only Clang
124 // seems to support its use in other classes from there.
125 // However, we also don't need U_I18N_API because it is not used from outside the i18n library.
126 class BucketList
: public UObject
{
128 BucketList(UVector
*bucketList
, UVector
*publicBucketList
)
129 : bucketList_(bucketList
), immutableVisibleList_(publicBucketList
) {
130 int32_t displayIndex
= 0;
131 for (int32_t i
= 0; i
< publicBucketList
->size(); ++i
) {
132 getBucket(*publicBucketList
, i
)->displayIndex_
= displayIndex
++;
136 // The virtual destructor must not be inline.
137 // See ticket #8454 for details.
138 virtual ~BucketList();
140 int32_t getBucketCount() const {
141 return immutableVisibleList_
->size();
144 int32_t getBucketIndex(const UnicodeString
&name
, const Collator
&collatorPrimaryOnly
,
145 UErrorCode
&errorCode
) {
148 int32_t limit
= bucketList_
->size();
149 while ((start
+ 1) < limit
) {
150 int32_t i
= (start
+ limit
) / 2;
151 const AlphabeticIndex::Bucket
*bucket
= getBucket(*bucketList_
, i
);
152 UCollationResult nameVsBucket
=
153 collatorPrimaryOnly
.compare(name
, bucket
->lowerBoundary_
, errorCode
);
154 if (nameVsBucket
< 0) {
160 const AlphabeticIndex::Bucket
*bucket
= getBucket(*bucketList_
, start
);
161 if (bucket
->displayBucket_
!= NULL
) {
162 bucket
= bucket
->displayBucket_
;
164 return bucket
->displayIndex_
;
167 /** All of the buckets, visible and invisible. */
168 UVector
*bucketList_
;
169 /** Just the visible buckets. */
170 UVector
*immutableVisibleList_
;
173 BucketList::~BucketList() {
175 if (immutableVisibleList_
!= bucketList_
) {
176 delete immutableVisibleList_
;
180 AlphabeticIndex::ImmutableIndex::~ImmutableIndex() {
182 delete collatorPrimaryOnly_
;
186 AlphabeticIndex::ImmutableIndex::getBucketCount() const {
187 return buckets_
->getBucketCount();
191 AlphabeticIndex::ImmutableIndex::getBucketIndex(
192 const UnicodeString
&name
, UErrorCode
&errorCode
) const {
193 return buckets_
->getBucketIndex(name
, *collatorPrimaryOnly_
, errorCode
);
196 const AlphabeticIndex::Bucket
*
197 AlphabeticIndex::ImmutableIndex::getBucket(int32_t index
) const {
198 if (0 <= index
&& index
< buckets_
->getBucketCount()) {
199 return icu::getBucket(*buckets_
->immutableVisibleList_
, index
);
205 AlphabeticIndex::AlphabeticIndex(const Locale
&locale
, UErrorCode
&status
)
207 labelsIterIndex_(-1), itemsIterIndex_(0), currentBucket_(NULL
),
209 initialLabels_(NULL
), firstCharsInScripts_(NULL
),
210 collator_(NULL
), collatorPrimaryOnly_(NULL
),
212 init(&locale
, status
);
216 AlphabeticIndex::AlphabeticIndex(RuleBasedCollator
*collator
, UErrorCode
&status
)
218 labelsIterIndex_(-1), itemsIterIndex_(0), currentBucket_(NULL
),
220 initialLabels_(NULL
), firstCharsInScripts_(NULL
),
221 collator_(collator
), collatorPrimaryOnly_(NULL
),
228 AlphabeticIndex::~AlphabeticIndex() {
230 delete collatorPrimaryOnly_
;
231 delete firstCharsInScripts_
;
234 delete initialLabels_
;
238 AlphabeticIndex
&AlphabeticIndex::addLabels(const UnicodeSet
&additions
, UErrorCode
&status
) {
239 if (U_FAILURE(status
)) {
242 initialLabels_
->addAll(additions
);
248 AlphabeticIndex
&AlphabeticIndex::addLabels(const Locale
&locale
, UErrorCode
&status
) {
249 addIndexExemplars(locale
, status
);
255 AlphabeticIndex::ImmutableIndex
*AlphabeticIndex::buildImmutableIndex(UErrorCode
&errorCode
) {
256 if (U_FAILURE(errorCode
)) { return NULL
; }
257 // In C++, the ImmutableIndex must own its copy of the BucketList,
258 // even if it contains no records, for proper memory management.
259 // We could clone the buckets_ if they are not NULL,
260 // but that would be worth it only if this method is called multiple times,
261 // or called after using the old-style bucket iterator API.
262 LocalPointer
<BucketList
> immutableBucketList(createBucketList(errorCode
));
263 LocalPointer
<RuleBasedCollator
> coll(collatorPrimaryOnly_
->clone());
264 if (immutableBucketList
.isNull() || coll
.isNull()) {
265 errorCode
= U_MEMORY_ALLOCATION_ERROR
;
268 ImmutableIndex
*immIndex
= new ImmutableIndex(immutableBucketList
.getAlias(), coll
.getAlias());
269 if (immIndex
== NULL
) {
270 errorCode
= U_MEMORY_ALLOCATION_ERROR
;
273 // The ImmutableIndex adopted its parameter objects.
274 immutableBucketList
.orphan();
279 int32_t AlphabeticIndex::getBucketCount(UErrorCode
&status
) {
281 if (U_FAILURE(status
)) {
284 return buckets_
->getBucketCount();
288 int32_t AlphabeticIndex::getRecordCount(UErrorCode
&status
) {
289 if (U_FAILURE(status
) || inputList_
== NULL
) {
292 return inputList_
->size();
295 void AlphabeticIndex::initLabels(UVector
&indexCharacters
, UErrorCode
&errorCode
) const {
296 const Normalizer2
*nfkdNormalizer
= Normalizer2::getNFKDInstance(errorCode
);
297 if (U_FAILURE(errorCode
)) { return; }
299 const UnicodeString
&firstScriptBoundary
= *getString(*firstCharsInScripts_
, 0);
300 const UnicodeString
&overflowBoundary
=
301 *getString(*firstCharsInScripts_
, firstCharsInScripts_
->size() - 1);
303 // We make a sorted array of elements.
304 // Some of the input may be redundant.
305 // That is, we might have c, ch, d, where "ch" sorts just like "c", "h".
306 // We filter out those cases.
307 UnicodeSetIterator
iter(*initialLabels_
);
308 while (iter
.next()) {
309 const UnicodeString
*item
= &iter
.getString();
310 LocalPointer
<UnicodeString
> ownedItem
;
312 int32_t itemLength
= item
->length();
313 if (!item
->hasMoreChar32Than(0, itemLength
, 1)) {
314 checkDistinct
= FALSE
;
315 } else if(item
->charAt(itemLength
- 1) == 0x2a && // '*'
316 item
->charAt(itemLength
- 2) != 0x2a) {
317 // Use a label if it is marked with one trailing star,
318 // even if the label string sorts the same when all contractions are suppressed.
319 ownedItem
.adoptInstead(new UnicodeString(*item
, 0, itemLength
- 1));
320 item
= ownedItem
.getAlias();
322 errorCode
= U_MEMORY_ALLOCATION_ERROR
;
325 checkDistinct
= FALSE
;
327 checkDistinct
= TRUE
;
329 if (collatorPrimaryOnly_
->compare(*item
, firstScriptBoundary
, errorCode
) < 0) {
330 // Ignore a primary-ignorable or non-alphabetic index character.
331 } else if (collatorPrimaryOnly_
->compare(*item
, overflowBoundary
, errorCode
) >= 0) {
332 // Ignore an index character that will land in the overflow bucket.
333 } else if (checkDistinct
&&
334 collatorPrimaryOnly_
->compare(*item
, separated(*item
), errorCode
) == 0) {
335 // Ignore a multi-code point index character that does not sort distinctly
336 // from the sequence of its separate characters.
338 int32_t insertionPoint
= binarySearch(indexCharacters
, *item
, *collatorPrimaryOnly_
);
339 if (insertionPoint
< 0) {
340 indexCharacters
.insertElementAt(
341 ownedString(*item
, ownedItem
, errorCode
), ~insertionPoint
, errorCode
);
343 const UnicodeString
&itemAlreadyIn
= *getString(indexCharacters
, insertionPoint
);
344 if (isOneLabelBetterThanOther(*nfkdNormalizer
, *item
, itemAlreadyIn
)) {
345 indexCharacters
.setElementAt(
346 ownedString(*item
, ownedItem
, errorCode
), insertionPoint
);
351 if (U_FAILURE(errorCode
)) { return; }
353 // if the result is still too large, cut down to maxLabelCount_ elements, by removing every nth element
355 int32_t size
= indexCharacters
.size() - 1;
356 if (size
> maxLabelCount_
) {
359 for (int32_t i
= 0; i
< indexCharacters
.size();) {
361 int32_t bump
= count
* maxLabelCount_
/ size
;
363 indexCharacters
.removeElementAt(i
);
374 const UnicodeString
&fixLabel(const UnicodeString
¤t
, UnicodeString
&temp
) {
375 if (!current
.startsWith(BASE
, BASE_LENGTH
)) {
378 UChar rest
= current
.charAt(BASE_LENGTH
);
379 if (0x2800 < rest
&& rest
<= 0x28FF) { // stroke count
380 int32_t count
= rest
-0x2800;
381 temp
.setTo((UChar
)(0x30 + count
% 10));
384 temp
.insert(0, (UChar
)(0x30 + count
% 10));
387 temp
.insert(0, (UChar
)(0x30 + count
));
390 return temp
.append((UChar
)0x5283);
392 return temp
.setTo(current
, BASE_LENGTH
);
395 UBool
hasMultiplePrimaryWeights(
396 const RuleBasedCollator
&coll
, uint32_t variableTop
,
397 const UnicodeString
&s
, UVector64
&ces
, UErrorCode
&errorCode
) {
398 ces
.removeAllElements();
399 coll
.internalGetCEs(s
, ces
, errorCode
);
400 if (U_FAILURE(errorCode
)) { return FALSE
; }
401 UBool seenPrimary
= FALSE
;
402 for (int32_t i
= 0; i
< ces
.size(); ++i
) {
403 int64_t ce
= ces
.elementAti(i
);
404 uint32_t p
= (uint32_t)(ce
>> 32);
405 if (p
> variableTop
) {
406 // not primary ignorable
418 BucketList
*AlphabeticIndex::createBucketList(UErrorCode
&errorCode
) const {
419 // Initialize indexCharacters.
420 UVector
indexCharacters(errorCode
);
421 indexCharacters
.setDeleter(uprv_deleteUObject
);
422 initLabels(indexCharacters
, errorCode
);
423 if (U_FAILURE(errorCode
)) { return NULL
; }
425 // Variables for hasMultiplePrimaryWeights().
426 UVector64
ces(errorCode
);
427 uint32_t variableTop
;
428 if (collatorPrimaryOnly_
->getAttribute(UCOL_ALTERNATE_HANDLING
, errorCode
) == UCOL_SHIFTED
) {
429 variableTop
= collatorPrimaryOnly_
->getVariableTop(errorCode
);
433 UBool hasInvisibleBuckets
= FALSE
;
435 // Helper arrays for Chinese Pinyin collation.
436 Bucket
*asciiBuckets
[26] = {
437 NULL
, NULL
, NULL
, NULL
, NULL
, NULL
, NULL
, NULL
, NULL
, NULL
, NULL
, NULL
, NULL
,
438 NULL
, NULL
, NULL
, NULL
, NULL
, NULL
, NULL
, NULL
, NULL
, NULL
, NULL
, NULL
, NULL
440 Bucket
*pinyinBuckets
[26] = {
441 NULL
, NULL
, NULL
, NULL
, NULL
, NULL
, NULL
, NULL
, NULL
, NULL
, NULL
, NULL
, NULL
,
442 NULL
, NULL
, NULL
, NULL
, NULL
, NULL
, NULL
, NULL
, NULL
, NULL
, NULL
, NULL
, NULL
444 UBool hasPinyin
= FALSE
;
446 LocalPointer
<UVector
> bucketList(new UVector(errorCode
), errorCode
);
447 if (U_FAILURE(errorCode
)) {
450 bucketList
->setDeleter(uprv_deleteUObject
);
453 Bucket
*bucket
= new Bucket(getUnderflowLabel(), emptyString_
, U_ALPHAINDEX_UNDERFLOW
);
454 if (bucket
== NULL
) {
455 errorCode
= U_MEMORY_ALLOCATION_ERROR
;
458 bucketList
->addElement(bucket
, errorCode
);
459 if (U_FAILURE(errorCode
)) { return NULL
; }
463 // fix up the list, adding underflow, additions, overflow
464 // Insert inflow labels as needed.
465 int32_t scriptIndex
= -1;
466 const UnicodeString
*scriptUpperBoundary
= &emptyString_
;
467 for (int32_t i
= 0; i
< indexCharacters
.size(); ++i
) {
468 UnicodeString
¤t
= *getString(indexCharacters
, i
);
469 if (collatorPrimaryOnly_
->compare(current
, *scriptUpperBoundary
, errorCode
) >= 0) {
470 // We crossed the script boundary into a new script.
471 const UnicodeString
&inflowBoundary
= *scriptUpperBoundary
;
472 UBool skippedScript
= FALSE
;
474 scriptUpperBoundary
= getString(*firstCharsInScripts_
, ++scriptIndex
);
475 if (collatorPrimaryOnly_
->compare(current
, *scriptUpperBoundary
, errorCode
) < 0) {
478 skippedScript
= TRUE
;
480 if (skippedScript
&& bucketList
->size() > 1) {
481 // We are skipping one or more scripts,
482 // and we are not just getting out of the underflow label.
483 bucket
= new Bucket(getInflowLabel(), inflowBoundary
, U_ALPHAINDEX_INFLOW
);
484 if (bucket
== NULL
) {
485 errorCode
= U_MEMORY_ALLOCATION_ERROR
;
488 bucketList
->addElement(bucket
, errorCode
);
491 // Add a bucket with the current label.
492 bucket
= new Bucket(fixLabel(current
, temp
), current
, U_ALPHAINDEX_NORMAL
);
493 if (bucket
== NULL
) {
494 errorCode
= U_MEMORY_ALLOCATION_ERROR
;
497 bucketList
->addElement(bucket
, errorCode
);
498 // Remember ASCII and Pinyin buckets for Pinyin redirects.
500 if (current
.length() == 1 && 0x41 <= (c
= current
.charAt(0)) && c
<= 0x5A) { // A-Z
501 asciiBuckets
[c
- 0x41] = bucket
;
502 } else if (current
.length() == BASE_LENGTH
+ 1 && current
.startsWith(BASE
, BASE_LENGTH
) &&
503 0x41 <= (c
= current
.charAt(BASE_LENGTH
)) && c
<= 0x5A) {
504 pinyinBuckets
[c
- 0x41] = bucket
;
507 // Check for multiple primary weights.
508 if (!current
.startsWith(BASE
, BASE_LENGTH
) &&
509 hasMultiplePrimaryWeights(*collatorPrimaryOnly_
, variableTop
, current
,
511 current
.charAt(current
.length() - 1) != 0xFFFF /* !current.endsWith("\uffff") */) {
512 // "AE-ligature" or "Sch" etc.
513 for (int32_t j
= bucketList
->size() - 2;; --j
) {
514 Bucket
*singleBucket
= getBucket(*bucketList
, j
);
515 if (singleBucket
->labelType_
!= U_ALPHAINDEX_NORMAL
) {
516 // There is no single-character bucket since the last
517 // underflow or inflow label.
520 if (singleBucket
->displayBucket_
== NULL
&&
521 !hasMultiplePrimaryWeights(*collatorPrimaryOnly_
, variableTop
,
522 singleBucket
->lowerBoundary_
,
524 // Add an invisible bucket that redirects strings greater than the expansion
525 // to the previous single-character bucket.
526 // For example, after ... Q R S Sch we add Sch\uFFFF->S
527 // and after ... Q R S Sch Sch\uFFFF St we add St\uFFFF->S.
528 bucket
= new Bucket(emptyString_
,
529 UnicodeString(current
).append((UChar
)0xFFFF),
530 U_ALPHAINDEX_NORMAL
);
531 if (bucket
== NULL
) {
532 errorCode
= U_MEMORY_ALLOCATION_ERROR
;
535 bucket
->displayBucket_
= singleBucket
;
536 bucketList
->addElement(bucket
, errorCode
);
537 hasInvisibleBuckets
= TRUE
;
543 if (U_FAILURE(errorCode
)) { return NULL
; }
544 if (bucketList
->size() == 1) {
545 // No real labels, show only the underflow label.
546 BucketList
*bl
= new BucketList(bucketList
.getAlias(), bucketList
.getAlias());
548 errorCode
= U_MEMORY_ALLOCATION_ERROR
;
555 bucket
= new Bucket(getOverflowLabel(), *scriptUpperBoundary
, U_ALPHAINDEX_OVERFLOW
);
556 if (bucket
== NULL
) {
557 errorCode
= U_MEMORY_ALLOCATION_ERROR
;
560 bucketList
->addElement(bucket
, errorCode
); // final
563 // Redirect Pinyin buckets.
564 Bucket
*asciiBucket
= NULL
;
565 for (int32_t i
= 0; i
< 26; ++i
) {
566 if (asciiBuckets
[i
] != NULL
) {
567 asciiBucket
= asciiBuckets
[i
];
569 if (pinyinBuckets
[i
] != NULL
&& asciiBucket
!= NULL
) {
570 pinyinBuckets
[i
]->displayBucket_
= asciiBucket
;
571 hasInvisibleBuckets
= TRUE
;
576 if (U_FAILURE(errorCode
)) { return NULL
; }
577 if (!hasInvisibleBuckets
) {
578 BucketList
*bl
= new BucketList(bucketList
.getAlias(), bucketList
.getAlias());
580 errorCode
= U_MEMORY_ALLOCATION_ERROR
;
586 // Merge inflow buckets that are visually adjacent.
587 // Iterate backwards: Merge inflow into overflow rather than the other way around.
588 int32_t i
= bucketList
->size() - 1;
589 Bucket
*nextBucket
= getBucket(*bucketList
, i
);
591 bucket
= getBucket(*bucketList
, i
);
592 if (bucket
->displayBucket_
!= NULL
) {
593 continue; // skip invisible buckets
595 if (bucket
->labelType_
== U_ALPHAINDEX_INFLOW
) {
596 if (nextBucket
->labelType_
!= U_ALPHAINDEX_NORMAL
) {
597 bucket
->displayBucket_
= nextBucket
;
604 LocalPointer
<UVector
> publicBucketList(new UVector(errorCode
), errorCode
);
605 if (U_FAILURE(errorCode
)) {
608 // Do not call publicBucketList->setDeleter():
609 // This vector shares its objects with the bucketList.
610 for (int32_t j
= 0; j
< bucketList
->size(); ++j
) {
611 bucket
= getBucket(*bucketList
, j
);
612 if (bucket
->displayBucket_
== NULL
) {
613 publicBucketList
->addElement(bucket
, errorCode
);
616 if (U_FAILURE(errorCode
)) { return NULL
; }
617 BucketList
*bl
= new BucketList(bucketList
.getAlias(), publicBucketList
.getAlias());
619 errorCode
= U_MEMORY_ALLOCATION_ERROR
;
623 publicBucketList
.orphan();
628 * Creates an index, and buckets and sorts the list of records into the index.
630 void AlphabeticIndex::initBuckets(UErrorCode
&errorCode
) {
631 if (U_FAILURE(errorCode
) || buckets_
!= NULL
) {
634 buckets_
= createBucketList(errorCode
);
635 if (U_FAILURE(errorCode
) || inputList_
== NULL
|| inputList_
->isEmpty()) {
639 // Sort the records by name.
640 // Stable sort preserves input order of collation duplicates.
641 inputList_
->sortWithUComparator(recordCompareFn
, collator_
, errorCode
);
643 // Now, we traverse all of the input, which is now sorted.
644 // If the item doesn't go in the current bucket, we find the next bucket that contains it.
645 // This makes the process order n*log(n), since we just sort the list and then do a linear process.
646 // However, if the user adds an item at a time and then gets the buckets, this isn't efficient, so
647 // we need to improve it for that case.
649 Bucket
*currentBucket
= getBucket(*buckets_
->bucketList_
, 0);
650 int32_t bucketIndex
= 1;
652 const UnicodeString
*upperBoundary
;
653 if (bucketIndex
< buckets_
->bucketList_
->size()) {
654 nextBucket
= getBucket(*buckets_
->bucketList_
, bucketIndex
++);
655 upperBoundary
= &nextBucket
->lowerBoundary_
;
658 upperBoundary
= NULL
;
660 for (int32_t i
= 0; i
< inputList_
->size(); ++i
) {
661 Record
*r
= getRecord(*inputList_
, i
);
662 // if the current bucket isn't the right one, find the one that is
663 // We have a special flag for the last bucket so that we don't look any further
664 while (upperBoundary
!= NULL
&&
665 collatorPrimaryOnly_
->compare(r
->name_
, *upperBoundary
, errorCode
) >= 0) {
666 currentBucket
= nextBucket
;
667 // now reset the boundary that we compare against
668 if (bucketIndex
< buckets_
->bucketList_
->size()) {
669 nextBucket
= getBucket(*buckets_
->bucketList_
, bucketIndex
++);
670 upperBoundary
= &nextBucket
->lowerBoundary_
;
672 upperBoundary
= NULL
;
675 // now put the record into the bucket.
676 Bucket
*bucket
= currentBucket
;
677 if (bucket
->displayBucket_
!= NULL
) {
678 bucket
= bucket
->displayBucket_
;
680 if (bucket
->records_
== NULL
) {
681 bucket
->records_
= new UVector(errorCode
);
682 if (bucket
->records_
== NULL
) {
683 errorCode
= U_MEMORY_ALLOCATION_ERROR
;
687 bucket
->records_
->addElement(r
, errorCode
);
691 void AlphabeticIndex::clearBuckets() {
692 if (buckets_
!= NULL
) {
695 internalResetBucketIterator();
699 void AlphabeticIndex::internalResetBucketIterator() {
700 labelsIterIndex_
= -1;
701 currentBucket_
= NULL
;
705 void AlphabeticIndex::addIndexExemplars(const Locale
&locale
, UErrorCode
&status
) {
706 LocalULocaleDataPointer
uld(ulocdata_open(locale
.getName(), &status
));
707 if (U_FAILURE(status
)) {
711 UnicodeSet exemplars
;
712 ulocdata_getExemplarSet(uld
.getAlias(), exemplars
.toUSet(), 0, ULOCDATA_ES_INDEX
, &status
);
713 if (U_SUCCESS(status
)) {
714 initialLabels_
->addAll(exemplars
);
717 status
= U_ZERO_ERROR
; // Clear out U_MISSING_RESOURCE_ERROR
719 // The locale data did not include explicit Index characters.
720 // Synthesize a set of them from the locale's standard exemplar characters.
721 ulocdata_getExemplarSet(uld
.getAlias(), exemplars
.toUSet(), 0, ULOCDATA_ES_STANDARD
, &status
);
722 if (U_FAILURE(status
)) {
726 // question: should we add auxiliary exemplars?
727 if (exemplars
.containsSome(0x61, 0x7A) /* a-z */ || exemplars
.isEmpty()) {
728 exemplars
.add(0x61, 0x7A);
730 if (exemplars
.containsSome(0xAC00, 0xD7A3)) { // Hangul syllables
731 // cut down to small list
732 exemplars
.remove(0xAC00, 0xD7A3).
733 add(0xAC00).add(0xB098).add(0xB2E4).add(0xB77C).
734 add(0xB9C8).add(0xBC14).add(0xC0AC).add(0xC544).
735 add(0xC790).add(0xCC28).add(0xCE74).add(0xD0C0).
736 add(0xD30C).add(0xD558);
738 if (exemplars
.containsSome(0x1200, 0x137F)) { // Ethiopic block
739 // cut down to small list
740 // make use of the fact that Ethiopic is allocated in 8's, where
741 // the base is 0 mod 8.
742 UnicodeSet
ethiopic(UnicodeString(u
"[ሀለሐመሠረሰሸቀቈቐቘበቨተቸኀኈነኘአከኰኸዀወዐዘዠየደዸጀገጐጘጠጨጰጸፀፈፐፘ]"), status
);
743 ethiopic
.retainAll(exemplars
);
744 exemplars
.remove(u
'ሀ', 0x137F).addAll(ethiopic
);
747 // Upper-case any that aren't already so.
748 // (We only do this for synthesized index characters.)
749 UnicodeSetIterator
it(exemplars
);
750 UnicodeString upperC
;
752 const UnicodeString
&exemplarC
= it
.getString();
754 upperC
.toUpper(locale
);
755 initialLabels_
->add(upperC
);
759 UBool
AlphabeticIndex::addChineseIndexCharacters(UErrorCode
&errorCode
) {
760 UnicodeSet contractions
;
761 collatorPrimaryOnly_
->internalAddContractions(BASE
[0], contractions
, errorCode
);
762 if (U_FAILURE(errorCode
) || contractions
.isEmpty()) { return FALSE
; }
763 initialLabels_
->addAll(contractions
);
764 UnicodeSetIterator
iter(contractions
);
765 while (iter
.next()) {
766 const UnicodeString
&s
= iter
.getString();
767 U_ASSERT (s
.startsWith(BASE
, BASE_LENGTH
));
768 UChar c
= s
.charAt(s
.length() - 1);
769 if (0x41 <= c
&& c
<= 0x5A) { // A-Z
770 // There are Pinyin labels, add ASCII A-Z labels as well.
771 initialLabels_
->add(0x41, 0x5A); // A-Z
780 * Return the string with interspersed CGJs. Input must have more than 2 codepoints.
782 static const UChar CGJ
= 0x034F;
783 UnicodeString
AlphabeticIndex::separated(const UnicodeString
&item
) {
784 UnicodeString result
;
785 if (item
.length() == 0) {
790 UChar32 cp
= item
.char32At(i
);
792 i
= item
.moveIndex32(i
, 1);
793 if (i
>= item
.length()) {
802 UBool
AlphabeticIndex::operator==(const AlphabeticIndex
& /* other */) const {
807 UBool
AlphabeticIndex::operator!=(const AlphabeticIndex
& /* other */) const {
812 const RuleBasedCollator
&AlphabeticIndex::getCollator() const {
817 const UnicodeString
&AlphabeticIndex::getInflowLabel() const {
821 const UnicodeString
&AlphabeticIndex::getOverflowLabel() const {
822 return overflowLabel_
;
826 const UnicodeString
&AlphabeticIndex::getUnderflowLabel() const {
827 return underflowLabel_
;
831 AlphabeticIndex
&AlphabeticIndex::setInflowLabel(const UnicodeString
&label
, UErrorCode
&/*status*/) {
832 inflowLabel_
= label
;
838 AlphabeticIndex
&AlphabeticIndex::setOverflowLabel(const UnicodeString
&label
, UErrorCode
&/*status*/) {
839 overflowLabel_
= label
;
845 AlphabeticIndex
&AlphabeticIndex::setUnderflowLabel(const UnicodeString
&label
, UErrorCode
&/*status*/) {
846 underflowLabel_
= label
;
852 int32_t AlphabeticIndex::getMaxLabelCount() const {
853 return maxLabelCount_
;
857 AlphabeticIndex
&AlphabeticIndex::setMaxLabelCount(int32_t maxLabelCount
, UErrorCode
&status
) {
858 if (U_FAILURE(status
)) {
861 if (maxLabelCount
<= 0) {
862 status
= U_ILLEGAL_ARGUMENT_ERROR
;
865 maxLabelCount_
= maxLabelCount
;
872 // init() - Common code for constructors.
875 void AlphabeticIndex::init(const Locale
*locale
, UErrorCode
&status
) {
876 if (U_FAILURE(status
)) { return; }
877 if (locale
== NULL
&& collator_
== NULL
) {
878 status
= U_ILLEGAL_ARGUMENT_ERROR
;
882 initialLabels_
= new UnicodeSet();
883 if (initialLabels_
== NULL
) {
884 status
= U_MEMORY_ALLOCATION_ERROR
;
888 inflowLabel_
.setTo((UChar
)0x2026); // Ellipsis
889 overflowLabel_
= inflowLabel_
;
890 underflowLabel_
= inflowLabel_
;
892 if (collator_
== NULL
) {
893 Collator
*coll
= Collator::createInstance(*locale
, status
);
894 if (U_FAILURE(status
)) {
899 status
= U_MEMORY_ALLOCATION_ERROR
;
902 collator_
= dynamic_cast<RuleBasedCollator
*>(coll
);
903 if (collator_
== NULL
) {
905 status
= U_UNSUPPORTED_ERROR
;
909 collatorPrimaryOnly_
= collator_
->clone();
910 if (collatorPrimaryOnly_
== NULL
) {
911 status
= U_MEMORY_ALLOCATION_ERROR
;
914 collatorPrimaryOnly_
->setAttribute(UCOL_STRENGTH
, UCOL_PRIMARY
, status
);
915 firstCharsInScripts_
= firstStringsInScript(status
);
916 if (U_FAILURE(status
)) { return; }
917 firstCharsInScripts_
->sortWithUComparator(collatorComparator
, collatorPrimaryOnly_
, status
);
918 // Guard against a degenerate collator where
919 // some script boundary strings are primary ignorable.
921 if (U_FAILURE(status
)) { return; }
922 if (firstCharsInScripts_
->isEmpty()) {
923 // AlphabeticIndex requires some non-ignorable script boundary strings.
924 status
= U_ILLEGAL_ARGUMENT_ERROR
;
927 if (collatorPrimaryOnly_
->compare(
928 *static_cast<UnicodeString
*>(firstCharsInScripts_
->elementAt(0)),
929 emptyString_
, status
) == UCOL_EQUAL
) {
930 firstCharsInScripts_
->removeElementAt(0);
936 // Chinese index characters, which are specific to each of the several Chinese tailorings,
937 // take precedence over the single locale data exemplar set per language.
938 if (!addChineseIndexCharacters(status
) && locale
!= NULL
) {
939 addIndexExemplars(*locale
, status
);
945 // Comparison function for UVector<UnicodeString *> sorting with a collator.
947 static int32_t U_CALLCONV
948 collatorComparator(const void *context
, const void *left
, const void *right
) {
949 const UElement
*leftElement
= static_cast<const UElement
*>(left
);
950 const UElement
*rightElement
= static_cast<const UElement
*>(right
);
951 const UnicodeString
*leftString
= static_cast<const UnicodeString
*>(leftElement
->pointer
);
952 const UnicodeString
*rightString
= static_cast<const UnicodeString
*>(rightElement
->pointer
);
954 if (leftString
== rightString
) {
955 // Catches case where both are NULL
958 if (leftString
== NULL
) {
961 if (rightString
== NULL
) {
964 const Collator
*col
= static_cast<const Collator
*>(context
);
965 UErrorCode errorCode
= U_ZERO_ERROR
;
966 return col
->compare(*leftString
, *rightString
, errorCode
);
970 // Comparison function for UVector<Record *> sorting with a collator.
972 static int32_t U_CALLCONV
973 recordCompareFn(const void *context
, const void *left
, const void *right
) {
974 const UElement
*leftElement
= static_cast<const UElement
*>(left
);
975 const UElement
*rightElement
= static_cast<const UElement
*>(right
);
976 const AlphabeticIndex::Record
*leftRec
= static_cast<const AlphabeticIndex::Record
*>(leftElement
->pointer
);
977 const AlphabeticIndex::Record
*rightRec
= static_cast<const AlphabeticIndex::Record
*>(rightElement
->pointer
);
978 const Collator
*col
= static_cast<const Collator
*>(context
);
979 UErrorCode errorCode
= U_ZERO_ERROR
;
980 return col
->compare(leftRec
->name_
, rightRec
->name_
, errorCode
);
983 UVector
*AlphabeticIndex::firstStringsInScript(UErrorCode
&status
) {
984 if (U_FAILURE(status
)) {
987 LocalPointer
<UVector
> dest(new UVector(status
), status
);
988 if (U_FAILURE(status
)) {
991 dest
->setDeleter(uprv_deleteUObject
);
992 // Fetch the script-first-primary contractions which are defined in the root collator.
993 // They all start with U+FDD1.
995 collatorPrimaryOnly_
->internalAddContractions(0xFDD1, set
, status
);
996 if (U_FAILURE(status
)) {
1000 status
= U_UNSUPPORTED_ERROR
;
1003 UnicodeSetIterator
iter(set
);
1004 while (iter
.next()) {
1005 const UnicodeString
&boundary
= iter
.getString();
1006 uint32_t gcMask
= U_GET_GC_MASK(boundary
.char32At(1));
1007 if ((gcMask
& (U_GC_L_MASK
| U_GC_CN_MASK
)) == 0) {
1008 // Ignore boundaries for the special reordering groups.
1009 // Take only those for "real scripts" (where the sample character is a Letter,
1010 // and the one for unassigned implicit weights (Cn).
1013 UnicodeString
*s
= new UnicodeString(boundary
);
1015 status
= U_MEMORY_ALLOCATION_ERROR
;
1018 dest
->addElement(s
, status
);
1020 return dest
.orphan();
1027 * Returns true if one index character string is "better" than the other.
1028 * Shorter NFKD is better, and otherwise NFKD-binary-less-than is
1029 * better, and otherwise binary-less-than is better.
1031 UBool
isOneLabelBetterThanOther(const Normalizer2
&nfkdNormalizer
,
1032 const UnicodeString
&one
, const UnicodeString
&other
) {
1033 // This is called with primary-equal strings, but never with one.equals(other).
1034 UErrorCode status
= U_ZERO_ERROR
;
1035 UnicodeString n1
= nfkdNormalizer
.normalize(one
, status
);
1036 UnicodeString n2
= nfkdNormalizer
.normalize(other
, status
);
1037 if (U_FAILURE(status
)) { return FALSE
; }
1038 int32_t result
= n1
.countChar32() - n2
.countChar32();
1042 result
= n1
.compareCodePointOrder(n2
);
1046 return one
.compareCodePointOrder(other
) < 0;
1052 // Constructor & Destructor for AlphabeticIndex::Record
1054 // Records are internal only, instances are not directly surfaced in the public API.
1055 // This class is mostly struct-like, with all public fields.
1057 AlphabeticIndex::Record::Record(const UnicodeString
&name
, const void *data
)
1058 : name_(name
), data_(data
) {}
1060 AlphabeticIndex::Record::~Record() {
1064 AlphabeticIndex
& AlphabeticIndex::addRecord(const UnicodeString
&name
, const void *data
, UErrorCode
&status
) {
1065 if (U_FAILURE(status
)) {
1068 if (inputList_
== NULL
) {
1069 inputList_
= new UVector(status
);
1070 if (inputList_
== NULL
) {
1071 status
= U_MEMORY_ALLOCATION_ERROR
;
1074 inputList_
->setDeleter(alphaIndex_deleteRecord
);
1076 Record
*r
= new Record(name
, data
);
1078 status
= U_MEMORY_ALLOCATION_ERROR
;
1081 inputList_
->addElement(r
, status
);
1085 //std::cout << "added record: name = \"" << r->name_.toUTF8String(ss) << "\"" <<
1086 // " sortingName = \"" << r->sortingName_.toUTF8String(ss2) << "\"" << std::endl;
1091 AlphabeticIndex
&AlphabeticIndex::clearRecords(UErrorCode
&status
) {
1092 if (U_SUCCESS(status
) && inputList_
!= NULL
&& !inputList_
->isEmpty()) {
1093 inputList_
->removeAllElements();
1099 int32_t AlphabeticIndex::getBucketIndex(const UnicodeString
&name
, UErrorCode
&status
) {
1100 initBuckets(status
);
1101 if (U_FAILURE(status
)) {
1104 return buckets_
->getBucketIndex(name
, *collatorPrimaryOnly_
, status
);
1108 int32_t AlphabeticIndex::getBucketIndex() const {
1109 return labelsIterIndex_
;
1113 UBool
AlphabeticIndex::nextBucket(UErrorCode
&status
) {
1114 if (U_FAILURE(status
)) {
1117 if (buckets_
== NULL
&& currentBucket_
!= NULL
) {
1118 status
= U_ENUM_OUT_OF_SYNC_ERROR
;
1121 initBuckets(status
);
1122 if (U_FAILURE(status
)) {
1126 if (labelsIterIndex_
>= buckets_
->getBucketCount()) {
1127 labelsIterIndex_
= buckets_
->getBucketCount();
1130 currentBucket_
= getBucket(*buckets_
->immutableVisibleList_
, labelsIterIndex_
);
1131 resetRecordIterator();
1135 const UnicodeString
&AlphabeticIndex::getBucketLabel() const {
1136 if (currentBucket_
!= NULL
) {
1137 return currentBucket_
->label_
;
1139 return emptyString_
;
1144 UAlphabeticIndexLabelType
AlphabeticIndex::getBucketLabelType() const {
1145 if (currentBucket_
!= NULL
) {
1146 return currentBucket_
->labelType_
;
1148 return U_ALPHAINDEX_NORMAL
;
1153 int32_t AlphabeticIndex::getBucketRecordCount() const {
1154 if (currentBucket_
!= NULL
&& currentBucket_
->records_
!= NULL
) {
1155 return currentBucket_
->records_
->size();
1162 AlphabeticIndex
&AlphabeticIndex::resetBucketIterator(UErrorCode
&status
) {
1163 if (U_FAILURE(status
)) {
1166 internalResetBucketIterator();
1171 UBool
AlphabeticIndex::nextRecord(UErrorCode
&status
) {
1172 if (U_FAILURE(status
)) {
1175 if (currentBucket_
== NULL
) {
1176 // We are trying to iterate over the items in a bucket, but there is no
1177 // current bucket from the enumeration of buckets.
1178 status
= U_INVALID_STATE_ERROR
;
1181 if (buckets_
== NULL
) {
1182 status
= U_ENUM_OUT_OF_SYNC_ERROR
;
1185 if (currentBucket_
->records_
== NULL
) {
1189 if (itemsIterIndex_
>= currentBucket_
->records_
->size()) {
1190 itemsIterIndex_
= currentBucket_
->records_
->size();
1197 const UnicodeString
&AlphabeticIndex::getRecordName() const {
1198 const UnicodeString
*retStr
= &emptyString_
;
1199 if (currentBucket_
!= NULL
&& currentBucket_
->records_
!= NULL
&&
1200 itemsIterIndex_
>= 0 &&
1201 itemsIterIndex_
< currentBucket_
->records_
->size()) {
1202 Record
*item
= static_cast<Record
*>(currentBucket_
->records_
->elementAt(itemsIterIndex_
));
1203 retStr
= &item
->name_
;
1208 const void *AlphabeticIndex::getRecordData() const {
1209 const void *retPtr
= NULL
;
1210 if (currentBucket_
!= NULL
&& currentBucket_
->records_
!= NULL
&&
1211 itemsIterIndex_
>= 0 &&
1212 itemsIterIndex_
< currentBucket_
->records_
->size()) {
1213 Record
*item
= static_cast<Record
*>(currentBucket_
->records_
->elementAt(itemsIterIndex_
));
1214 retPtr
= item
->data_
;
1220 AlphabeticIndex
& AlphabeticIndex::resetRecordIterator() {
1221 itemsIterIndex_
= -1;
1227 AlphabeticIndex::Bucket::Bucket(const UnicodeString
&label
,
1228 const UnicodeString
&lowerBoundary
,
1229 UAlphabeticIndexLabelType type
)
1230 : label_(label
), lowerBoundary_(lowerBoundary
), labelType_(type
),
1231 displayBucket_(NULL
), displayIndex_(-1),
1236 AlphabeticIndex::Bucket::~Bucket() {
1242 #endif // !UCONFIG_NO_COLLATION