1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 *******************************************************************************
5 * Copyright (C) 2009-2014, International Business Machines Corporation and
6 * others. All Rights Reserved.
7 *******************************************************************************
10 #include "unicode/utypes.h"
12 #if !UCONFIG_NO_COLLATION
14 #include "unicode/alphaindex.h"
15 #include "unicode/coll.h"
16 #include "unicode/localpointer.h"
17 #include "unicode/normalizer2.h"
18 #include "unicode/tblcoll.h"
19 #include "unicode/uchar.h"
20 #include "unicode/ulocdata.h"
21 #include "unicode/uniset.h"
22 #include "unicode/uobject.h"
23 #include "unicode/usetiter.h"
24 #include "unicode/utf16.h"
40 * Prefix string for Chinese index buckets.
41 * See http://unicode.org/repos/cldr/trunk/specs/ldml/tr35-collation.html#Collation_Indexes
43 const UChar BASE
[1] = { 0xFDD0 };
44 const int32_t BASE_LENGTH
= 1;
46 UBool
isOneLabelBetterThanOther(const Normalizer2
&nfkdNormalizer
,
47 const UnicodeString
&one
, const UnicodeString
&other
);
51 static int32_t U_CALLCONV
52 collatorComparator(const void *context
, const void *left
, const void *right
);
54 static int32_t U_CALLCONV
55 recordCompareFn(const void *context
, const void *left
, const void *right
);
57 // UVector<Record *> support function, delete a Record.
58 static void U_CALLCONV
59 alphaIndex_deleteRecord(void *obj
) {
60 delete static_cast<AlphabeticIndex::Record
*>(obj
);
65 UnicodeString
*ownedString(const UnicodeString
&s
, LocalPointer
<UnicodeString
> &owned
,
66 UErrorCode
&errorCode
) {
67 if (U_FAILURE(errorCode
)) { return NULL
; }
68 if (owned
.isValid()) {
69 return owned
.orphan();
71 UnicodeString
*p
= new UnicodeString(s
);
73 errorCode
= U_MEMORY_ALLOCATION_ERROR
;
78 inline UnicodeString
*getString(const UVector
&list
, int32_t i
) {
79 return static_cast<UnicodeString
*>(list
[i
]);
82 inline AlphabeticIndex::Bucket
*getBucket(const UVector
&list
, int32_t i
) {
83 return static_cast<AlphabeticIndex::Bucket
*>(list
[i
]);
86 inline AlphabeticIndex::Record
*getRecord(const UVector
&list
, int32_t i
) {
87 return static_cast<AlphabeticIndex::Record
*>(list
[i
]);
91 * Like Java Collections.binarySearch(List, String, Comparator).
93 * @return the index>=0 where the item was found,
94 * or the index<0 for inserting the string at ~index in sorted order
96 int32_t binarySearch(const UVector
&list
, const UnicodeString
&s
, const Collator
&coll
) {
97 if (list
.size() == 0) { return ~0; }
99 int32_t limit
= list
.size();
101 int32_t i
= (start
+ limit
) / 2;
102 const UnicodeString
*si
= static_cast<UnicodeString
*>(list
.elementAt(i
));
103 UErrorCode errorCode
= U_ZERO_ERROR
;
104 UCollationResult cmp
= coll
.compare(s
, *si
, errorCode
);
105 if (cmp
== UCOL_EQUAL
) {
107 } else if (cmp
< 0) {
109 return ~start
; // insert s before *si
114 return ~(start
+ 1); // insert s after *si
123 // The BucketList is not in the anonymous namespace because only Clang
124 // seems to support its use in other classes from there.
125 // However, we also don't need U_I18N_API because it is not used from outside the i18n library.
126 class BucketList
: public UObject
{
128 BucketList(UVector
*bucketList
, UVector
*publicBucketList
)
129 : bucketList_(bucketList
), immutableVisibleList_(publicBucketList
) {
130 int32_t displayIndex
= 0;
131 for (int32_t i
= 0; i
< publicBucketList
->size(); ++i
) {
132 getBucket(*publicBucketList
, i
)->displayIndex_
= displayIndex
++;
136 // The virtual destructor must not be inline.
137 // See ticket #8454 for details.
138 virtual ~BucketList();
140 int32_t getBucketCount() const {
141 return immutableVisibleList_
->size();
144 int32_t getBucketIndex(const UnicodeString
&name
, const Collator
&collatorPrimaryOnly
,
145 UErrorCode
&errorCode
) {
148 int32_t limit
= bucketList_
->size();
149 while ((start
+ 1) < limit
) {
150 int32_t i
= (start
+ limit
) / 2;
151 const AlphabeticIndex::Bucket
*bucket
= getBucket(*bucketList_
, i
);
152 UCollationResult nameVsBucket
=
153 collatorPrimaryOnly
.compare(name
, bucket
->lowerBoundary_
, errorCode
);
154 if (nameVsBucket
< 0) {
160 const AlphabeticIndex::Bucket
*bucket
= getBucket(*bucketList_
, start
);
161 if (bucket
->displayBucket_
!= NULL
) {
162 bucket
= bucket
->displayBucket_
;
164 return bucket
->displayIndex_
;
167 /** All of the buckets, visible and invisible. */
168 UVector
*bucketList_
;
169 /** Just the visible buckets. */
170 UVector
*immutableVisibleList_
;
173 BucketList::~BucketList() {
175 if (immutableVisibleList_
!= bucketList_
) {
176 delete immutableVisibleList_
;
180 AlphabeticIndex::ImmutableIndex::~ImmutableIndex() {
182 delete collatorPrimaryOnly_
;
186 AlphabeticIndex::ImmutableIndex::getBucketCount() const {
187 return buckets_
->getBucketCount();
191 AlphabeticIndex::ImmutableIndex::getBucketIndex(
192 const UnicodeString
&name
, UErrorCode
&errorCode
) const {
193 return buckets_
->getBucketIndex(name
, *collatorPrimaryOnly_
, errorCode
);
196 const AlphabeticIndex::Bucket
*
197 AlphabeticIndex::ImmutableIndex::getBucket(int32_t index
) const {
198 if (0 <= index
&& index
< buckets_
->getBucketCount()) {
199 return icu::getBucket(*buckets_
->immutableVisibleList_
, index
);
205 AlphabeticIndex::AlphabeticIndex(const Locale
&locale
, UErrorCode
&status
)
207 labelsIterIndex_(-1), itemsIterIndex_(0), currentBucket_(NULL
),
209 initialLabels_(NULL
), firstCharsInScripts_(NULL
),
210 collator_(NULL
), collatorPrimaryOnly_(NULL
),
212 init(&locale
, status
);
216 AlphabeticIndex::AlphabeticIndex(RuleBasedCollator
*collator
, UErrorCode
&status
)
218 labelsIterIndex_(-1), itemsIterIndex_(0), currentBucket_(NULL
),
220 initialLabels_(NULL
), firstCharsInScripts_(NULL
),
221 collator_(collator
), collatorPrimaryOnly_(NULL
),
228 AlphabeticIndex::~AlphabeticIndex() {
230 delete collatorPrimaryOnly_
;
231 delete firstCharsInScripts_
;
234 delete initialLabels_
;
238 AlphabeticIndex
&AlphabeticIndex::addLabels(const UnicodeSet
&additions
, UErrorCode
&status
) {
239 if (U_FAILURE(status
)) {
242 initialLabels_
->addAll(additions
);
248 AlphabeticIndex
&AlphabeticIndex::addLabels(const Locale
&locale
, UErrorCode
&status
) {
249 addIndexExemplars(locale
, status
);
255 AlphabeticIndex::ImmutableIndex
*AlphabeticIndex::buildImmutableIndex(UErrorCode
&errorCode
) {
256 if (U_FAILURE(errorCode
)) { return NULL
; }
257 // In C++, the ImmutableIndex must own its copy of the BucketList,
258 // even if it contains no records, for proper memory management.
259 // We could clone the buckets_ if they are not NULL,
260 // but that would be worth it only if this method is called multiple times,
261 // or called after using the old-style bucket iterator API.
262 LocalPointer
<BucketList
> immutableBucketList(createBucketList(errorCode
));
263 LocalPointer
<RuleBasedCollator
> coll(
264 static_cast<RuleBasedCollator
*>(collatorPrimaryOnly_
->clone()));
265 if (immutableBucketList
.isNull() || coll
.isNull()) {
266 errorCode
= U_MEMORY_ALLOCATION_ERROR
;
269 ImmutableIndex
*immIndex
= new ImmutableIndex(immutableBucketList
.getAlias(), coll
.getAlias());
270 if (immIndex
== NULL
) {
271 errorCode
= U_MEMORY_ALLOCATION_ERROR
;
274 // The ImmutableIndex adopted its parameter objects.
275 immutableBucketList
.orphan();
280 int32_t AlphabeticIndex::getBucketCount(UErrorCode
&status
) {
282 if (U_FAILURE(status
)) {
285 return buckets_
->getBucketCount();
289 int32_t AlphabeticIndex::getRecordCount(UErrorCode
&status
) {
290 if (U_FAILURE(status
) || inputList_
== NULL
) {
293 return inputList_
->size();
296 void AlphabeticIndex::initLabels(UVector
&indexCharacters
, UErrorCode
&errorCode
) const {
297 const Normalizer2
*nfkdNormalizer
= Normalizer2::getNFKDInstance(errorCode
);
298 if (U_FAILURE(errorCode
)) { return; }
300 const UnicodeString
&firstScriptBoundary
= *getString(*firstCharsInScripts_
, 0);
301 const UnicodeString
&overflowBoundary
=
302 *getString(*firstCharsInScripts_
, firstCharsInScripts_
->size() - 1);
304 // We make a sorted array of elements.
305 // Some of the input may be redundant.
306 // That is, we might have c, ch, d, where "ch" sorts just like "c", "h".
307 // We filter out those cases.
308 UnicodeSetIterator
iter(*initialLabels_
);
309 while (iter
.next()) {
310 const UnicodeString
*item
= &iter
.getString();
311 LocalPointer
<UnicodeString
> ownedItem
;
313 int32_t itemLength
= item
->length();
314 if (!item
->hasMoreChar32Than(0, itemLength
, 1)) {
315 checkDistinct
= FALSE
;
316 } else if(item
->charAt(itemLength
- 1) == 0x2a && // '*'
317 item
->charAt(itemLength
- 2) != 0x2a) {
318 // Use a label if it is marked with one trailing star,
319 // even if the label string sorts the same when all contractions are suppressed.
320 ownedItem
.adoptInstead(new UnicodeString(*item
, 0, itemLength
- 1));
321 item
= ownedItem
.getAlias();
323 errorCode
= U_MEMORY_ALLOCATION_ERROR
;
326 checkDistinct
= FALSE
;
328 checkDistinct
= TRUE
;
330 if (collatorPrimaryOnly_
->compare(*item
, firstScriptBoundary
, errorCode
) < 0) {
331 // Ignore a primary-ignorable or non-alphabetic index character.
332 } else if (collatorPrimaryOnly_
->compare(*item
, overflowBoundary
, errorCode
) >= 0) {
333 // Ignore an index character that will land in the overflow bucket.
334 } else if (checkDistinct
&&
335 collatorPrimaryOnly_
->compare(*item
, separated(*item
), errorCode
) == 0) {
336 // Ignore a multi-code point index character that does not sort distinctly
337 // from the sequence of its separate characters.
339 int32_t insertionPoint
= binarySearch(indexCharacters
, *item
, *collatorPrimaryOnly_
);
340 if (insertionPoint
< 0) {
341 indexCharacters
.insertElementAt(
342 ownedString(*item
, ownedItem
, errorCode
), ~insertionPoint
, errorCode
);
344 const UnicodeString
&itemAlreadyIn
= *getString(indexCharacters
, insertionPoint
);
345 if (isOneLabelBetterThanOther(*nfkdNormalizer
, *item
, itemAlreadyIn
)) {
346 indexCharacters
.setElementAt(
347 ownedString(*item
, ownedItem
, errorCode
), insertionPoint
);
352 if (U_FAILURE(errorCode
)) { return; }
354 // if the result is still too large, cut down to maxLabelCount_ elements, by removing every nth element
356 int32_t size
= indexCharacters
.size() - 1;
357 if (size
> maxLabelCount_
) {
360 for (int32_t i
= 0; i
< indexCharacters
.size();) {
362 int32_t bump
= count
* maxLabelCount_
/ size
;
364 indexCharacters
.removeElementAt(i
);
375 const UnicodeString
&fixLabel(const UnicodeString
¤t
, UnicodeString
&temp
) {
376 if (!current
.startsWith(BASE
, BASE_LENGTH
)) {
379 UChar rest
= current
.charAt(BASE_LENGTH
);
380 if (0x2800 < rest
&& rest
<= 0x28FF) { // stroke count
381 int32_t count
= rest
-0x2800;
382 temp
.setTo((UChar
)(0x30 + count
% 10));
385 temp
.insert(0, (UChar
)(0x30 + count
% 10));
388 temp
.insert(0, (UChar
)(0x30 + count
));
391 return temp
.append((UChar
)0x5283);
393 return temp
.setTo(current
, BASE_LENGTH
);
396 UBool
hasMultiplePrimaryWeights(
397 const RuleBasedCollator
&coll
, uint32_t variableTop
,
398 const UnicodeString
&s
, UVector64
&ces
, UErrorCode
&errorCode
) {
399 ces
.removeAllElements();
400 coll
.internalGetCEs(s
, ces
, errorCode
);
401 if (U_FAILURE(errorCode
)) { return FALSE
; }
402 UBool seenPrimary
= FALSE
;
403 for (int32_t i
= 0; i
< ces
.size(); ++i
) {
404 int64_t ce
= ces
.elementAti(i
);
405 uint32_t p
= (uint32_t)(ce
>> 32);
406 if (p
> variableTop
) {
407 // not primary ignorable
419 BucketList
*AlphabeticIndex::createBucketList(UErrorCode
&errorCode
) const {
420 // Initialize indexCharacters.
421 UVector
indexCharacters(errorCode
);
422 indexCharacters
.setDeleter(uprv_deleteUObject
);
423 initLabels(indexCharacters
, errorCode
);
424 if (U_FAILURE(errorCode
)) { return NULL
; }
426 // Variables for hasMultiplePrimaryWeights().
427 UVector64
ces(errorCode
);
428 uint32_t variableTop
;
429 if (collatorPrimaryOnly_
->getAttribute(UCOL_ALTERNATE_HANDLING
, errorCode
) == UCOL_SHIFTED
) {
430 variableTop
= collatorPrimaryOnly_
->getVariableTop(errorCode
);
434 UBool hasInvisibleBuckets
= FALSE
;
436 // Helper arrays for Chinese Pinyin collation.
437 Bucket
*asciiBuckets
[26] = {
438 NULL
, NULL
, NULL
, NULL
, NULL
, NULL
, NULL
, NULL
, NULL
, NULL
, NULL
, NULL
, NULL
,
439 NULL
, NULL
, NULL
, NULL
, NULL
, NULL
, NULL
, NULL
, NULL
, NULL
, NULL
, NULL
, NULL
441 Bucket
*pinyinBuckets
[26] = {
442 NULL
, NULL
, NULL
, NULL
, NULL
, NULL
, NULL
, NULL
, NULL
, NULL
, NULL
, NULL
, NULL
,
443 NULL
, NULL
, NULL
, NULL
, NULL
, NULL
, NULL
, NULL
, NULL
, NULL
, NULL
, NULL
, NULL
445 UBool hasPinyin
= FALSE
;
447 LocalPointer
<UVector
> bucketList(new UVector(errorCode
), errorCode
);
448 if (U_FAILURE(errorCode
)) {
451 bucketList
->setDeleter(uprv_deleteUObject
);
454 Bucket
*bucket
= new Bucket(getUnderflowLabel(), emptyString_
, U_ALPHAINDEX_UNDERFLOW
);
455 if (bucket
== NULL
) {
456 errorCode
= U_MEMORY_ALLOCATION_ERROR
;
459 bucketList
->addElement(bucket
, errorCode
);
460 if (U_FAILURE(errorCode
)) { return NULL
; }
464 // fix up the list, adding underflow, additions, overflow
465 // Insert inflow labels as needed.
466 int32_t scriptIndex
= -1;
467 const UnicodeString
*scriptUpperBoundary
= &emptyString_
;
468 for (int32_t i
= 0; i
< indexCharacters
.size(); ++i
) {
469 UnicodeString
¤t
= *getString(indexCharacters
, i
);
470 if (collatorPrimaryOnly_
->compare(current
, *scriptUpperBoundary
, errorCode
) >= 0) {
471 // We crossed the script boundary into a new script.
472 const UnicodeString
&inflowBoundary
= *scriptUpperBoundary
;
473 UBool skippedScript
= FALSE
;
475 scriptUpperBoundary
= getString(*firstCharsInScripts_
, ++scriptIndex
);
476 if (collatorPrimaryOnly_
->compare(current
, *scriptUpperBoundary
, errorCode
) < 0) {
479 skippedScript
= TRUE
;
481 if (skippedScript
&& bucketList
->size() > 1) {
482 // We are skipping one or more scripts,
483 // and we are not just getting out of the underflow label.
484 bucket
= new Bucket(getInflowLabel(), inflowBoundary
, U_ALPHAINDEX_INFLOW
);
485 if (bucket
== NULL
) {
486 errorCode
= U_MEMORY_ALLOCATION_ERROR
;
489 bucketList
->addElement(bucket
, errorCode
);
492 // Add a bucket with the current label.
493 bucket
= new Bucket(fixLabel(current
, temp
), current
, U_ALPHAINDEX_NORMAL
);
494 if (bucket
== NULL
) {
495 errorCode
= U_MEMORY_ALLOCATION_ERROR
;
498 bucketList
->addElement(bucket
, errorCode
);
499 // Remember ASCII and Pinyin buckets for Pinyin redirects.
501 if (current
.length() == 1 && 0x41 <= (c
= current
.charAt(0)) && c
<= 0x5A) { // A-Z
502 asciiBuckets
[c
- 0x41] = bucket
;
503 } else if (current
.length() == BASE_LENGTH
+ 1 && current
.startsWith(BASE
, BASE_LENGTH
) &&
504 0x41 <= (c
= current
.charAt(BASE_LENGTH
)) && c
<= 0x5A) {
505 pinyinBuckets
[c
- 0x41] = bucket
;
508 // Check for multiple primary weights.
509 if (!current
.startsWith(BASE
, BASE_LENGTH
) &&
510 hasMultiplePrimaryWeights(*collatorPrimaryOnly_
, variableTop
, current
,
512 current
.charAt(current
.length() - 1) != 0xFFFF /* !current.endsWith("\uffff") */) {
513 // "AE-ligature" or "Sch" etc.
514 for (int32_t i
= bucketList
->size() - 2;; --i
) {
515 Bucket
*singleBucket
= getBucket(*bucketList
, i
);
516 if (singleBucket
->labelType_
!= U_ALPHAINDEX_NORMAL
) {
517 // There is no single-character bucket since the last
518 // underflow or inflow label.
521 if (singleBucket
->displayBucket_
== NULL
&&
522 !hasMultiplePrimaryWeights(*collatorPrimaryOnly_
, variableTop
,
523 singleBucket
->lowerBoundary_
,
525 // Add an invisible bucket that redirects strings greater than the expansion
526 // to the previous single-character bucket.
527 // For example, after ... Q R S Sch we add Sch\uFFFF->S
528 // and after ... Q R S Sch Sch\uFFFF St we add St\uFFFF->S.
529 bucket
= new Bucket(emptyString_
,
530 UnicodeString(current
).append((UChar
)0xFFFF),
531 U_ALPHAINDEX_NORMAL
);
532 if (bucket
== NULL
) {
533 errorCode
= U_MEMORY_ALLOCATION_ERROR
;
536 bucket
->displayBucket_
= singleBucket
;
537 bucketList
->addElement(bucket
, errorCode
);
538 hasInvisibleBuckets
= TRUE
;
544 if (U_FAILURE(errorCode
)) { return NULL
; }
545 if (bucketList
->size() == 1) {
546 // No real labels, show only the underflow label.
547 BucketList
*bl
= new BucketList(bucketList
.getAlias(), bucketList
.getAlias());
549 errorCode
= U_MEMORY_ALLOCATION_ERROR
;
556 bucket
= new Bucket(getOverflowLabel(), *scriptUpperBoundary
, U_ALPHAINDEX_OVERFLOW
);
557 if (bucket
== NULL
) {
558 errorCode
= U_MEMORY_ALLOCATION_ERROR
;
561 bucketList
->addElement(bucket
, errorCode
); // final
564 // Redirect Pinyin buckets.
565 Bucket
*asciiBucket
= NULL
;
566 for (int32_t i
= 0; i
< 26; ++i
) {
567 if (asciiBuckets
[i
] != NULL
) {
568 asciiBucket
= asciiBuckets
[i
];
570 if (pinyinBuckets
[i
] != NULL
&& asciiBucket
!= NULL
) {
571 pinyinBuckets
[i
]->displayBucket_
= asciiBucket
;
572 hasInvisibleBuckets
= TRUE
;
577 if (U_FAILURE(errorCode
)) { return NULL
; }
578 if (!hasInvisibleBuckets
) {
579 BucketList
*bl
= new BucketList(bucketList
.getAlias(), bucketList
.getAlias());
581 errorCode
= U_MEMORY_ALLOCATION_ERROR
;
587 // Merge inflow buckets that are visually adjacent.
588 // Iterate backwards: Merge inflow into overflow rather than the other way around.
589 int32_t i
= bucketList
->size() - 1;
590 Bucket
*nextBucket
= getBucket(*bucketList
, i
);
592 bucket
= getBucket(*bucketList
, i
);
593 if (bucket
->displayBucket_
!= NULL
) {
594 continue; // skip invisible buckets
596 if (bucket
->labelType_
== U_ALPHAINDEX_INFLOW
) {
597 if (nextBucket
->labelType_
!= U_ALPHAINDEX_NORMAL
) {
598 bucket
->displayBucket_
= nextBucket
;
605 LocalPointer
<UVector
> publicBucketList(new UVector(errorCode
), errorCode
);
606 if (U_FAILURE(errorCode
)) {
609 // Do not call publicBucketList->setDeleter():
610 // This vector shares its objects with the bucketList.
611 for (int32_t i
= 0; i
< bucketList
->size(); ++i
) {
612 bucket
= getBucket(*bucketList
, i
);
613 if (bucket
->displayBucket_
== NULL
) {
614 publicBucketList
->addElement(bucket
, errorCode
);
617 if (U_FAILURE(errorCode
)) { return NULL
; }
618 BucketList
*bl
= new BucketList(bucketList
.getAlias(), publicBucketList
.getAlias());
620 errorCode
= U_MEMORY_ALLOCATION_ERROR
;
624 publicBucketList
.orphan();
629 * Creates an index, and buckets and sorts the list of records into the index.
631 void AlphabeticIndex::initBuckets(UErrorCode
&errorCode
) {
632 if (U_FAILURE(errorCode
) || buckets_
!= NULL
) {
635 buckets_
= createBucketList(errorCode
);
636 if (U_FAILURE(errorCode
) || inputList_
== NULL
|| inputList_
->isEmpty()) {
640 // Sort the records by name.
641 // Stable sort preserves input order of collation duplicates.
642 inputList_
->sortWithUComparator(recordCompareFn
, collator_
, errorCode
);
644 // Now, we traverse all of the input, which is now sorted.
645 // If the item doesn't go in the current bucket, we find the next bucket that contains it.
646 // This makes the process order n*log(n), since we just sort the list and then do a linear process.
647 // However, if the user adds an item at a time and then gets the buckets, this isn't efficient, so
648 // we need to improve it for that case.
650 Bucket
*currentBucket
= getBucket(*buckets_
->bucketList_
, 0);
651 int32_t bucketIndex
= 1;
653 const UnicodeString
*upperBoundary
;
654 if (bucketIndex
< buckets_
->bucketList_
->size()) {
655 nextBucket
= getBucket(*buckets_
->bucketList_
, bucketIndex
++);
656 upperBoundary
= &nextBucket
->lowerBoundary_
;
659 upperBoundary
= NULL
;
661 for (int32_t i
= 0; i
< inputList_
->size(); ++i
) {
662 Record
*r
= getRecord(*inputList_
, i
);
663 // if the current bucket isn't the right one, find the one that is
664 // We have a special flag for the last bucket so that we don't look any further
665 while (upperBoundary
!= NULL
&&
666 collatorPrimaryOnly_
->compare(r
->name_
, *upperBoundary
, errorCode
) >= 0) {
667 currentBucket
= nextBucket
;
668 // now reset the boundary that we compare against
669 if (bucketIndex
< buckets_
->bucketList_
->size()) {
670 nextBucket
= getBucket(*buckets_
->bucketList_
, bucketIndex
++);
671 upperBoundary
= &nextBucket
->lowerBoundary_
;
673 upperBoundary
= NULL
;
676 // now put the record into the bucket.
677 Bucket
*bucket
= currentBucket
;
678 if (bucket
->displayBucket_
!= NULL
) {
679 bucket
= bucket
->displayBucket_
;
681 if (bucket
->records_
== NULL
) {
682 bucket
->records_
= new UVector(errorCode
);
683 if (bucket
->records_
== NULL
) {
684 errorCode
= U_MEMORY_ALLOCATION_ERROR
;
688 bucket
->records_
->addElement(r
, errorCode
);
692 void AlphabeticIndex::clearBuckets() {
693 if (buckets_
!= NULL
) {
696 internalResetBucketIterator();
700 void AlphabeticIndex::internalResetBucketIterator() {
701 labelsIterIndex_
= -1;
702 currentBucket_
= NULL
;
706 void AlphabeticIndex::addIndexExemplars(const Locale
&locale
, UErrorCode
&status
) {
707 LocalULocaleDataPointer
uld(ulocdata_open(locale
.getName(), &status
));
708 if (U_FAILURE(status
)) {
712 UnicodeSet exemplars
;
713 ulocdata_getExemplarSet(uld
.getAlias(), exemplars
.toUSet(), 0, ULOCDATA_ES_INDEX
, &status
);
714 if (U_SUCCESS(status
)) {
715 initialLabels_
->addAll(exemplars
);
718 status
= U_ZERO_ERROR
; // Clear out U_MISSING_RESOURCE_ERROR
720 // The locale data did not include explicit Index characters.
721 // Synthesize a set of them from the locale's standard exemplar characters.
722 ulocdata_getExemplarSet(uld
.getAlias(), exemplars
.toUSet(), 0, ULOCDATA_ES_STANDARD
, &status
);
723 if (U_FAILURE(status
)) {
727 // question: should we add auxiliary exemplars?
728 if (exemplars
.containsSome(0x61, 0x7A) /* a-z */ || exemplars
.isEmpty()) {
729 exemplars
.add(0x61, 0x7A);
731 if (exemplars
.containsSome(0xAC00, 0xD7A3)) { // Hangul syllables
732 // cut down to small list
733 exemplars
.remove(0xAC00, 0xD7A3).
734 add(0xAC00).add(0xB098).add(0xB2E4).add(0xB77C).
735 add(0xB9C8).add(0xBC14).add(0xC0AC).add(0xC544).
736 add(0xC790).add(0xCC28).add(0xCE74).add(0xD0C0).
737 add(0xD30C).add(0xD558);
739 if (exemplars
.containsSome(0x1200, 0x137F)) { // Ethiopic block
740 // cut down to small list
741 // make use of the fact that Ethiopic is allocated in 8's, where
742 // the base is 0 mod 8.
743 UnicodeSet
ethiopic(UnicodeString(u
"[ሀለሐመሠረሰሸቀቈቐቘበቨተቸኀኈነኘአከኰኸዀወዐዘዠየደዸጀገጐጘጠጨጰጸፀፈፐፘ]"), status
);
744 ethiopic
.retainAll(exemplars
);
745 exemplars
.remove(u
'ሀ', 0x137F).addAll(ethiopic
);
748 // Upper-case any that aren't already so.
749 // (We only do this for synthesized index characters.)
750 UnicodeSetIterator
it(exemplars
);
751 UnicodeString upperC
;
753 const UnicodeString
&exemplarC
= it
.getString();
755 upperC
.toUpper(locale
);
756 initialLabels_
->add(upperC
);
760 UBool
AlphabeticIndex::addChineseIndexCharacters(UErrorCode
&errorCode
) {
761 UnicodeSet contractions
;
762 collatorPrimaryOnly_
->internalAddContractions(BASE
[0], contractions
, errorCode
);
763 if (U_FAILURE(errorCode
) || contractions
.isEmpty()) { return FALSE
; }
764 initialLabels_
->addAll(contractions
);
765 UnicodeSetIterator
iter(contractions
);
766 while (iter
.next()) {
767 const UnicodeString
&s
= iter
.getString();
768 U_ASSERT (s
.startsWith(BASE
, BASE_LENGTH
));
769 UChar c
= s
.charAt(s
.length() - 1);
770 if (0x41 <= c
&& c
<= 0x5A) { // A-Z
771 // There are Pinyin labels, add ASCII A-Z labels as well.
772 initialLabels_
->add(0x41, 0x5A); // A-Z
781 * Return the string with interspersed CGJs. Input must have more than 2 codepoints.
783 static const UChar CGJ
= 0x034F;
784 UnicodeString
AlphabeticIndex::separated(const UnicodeString
&item
) {
785 UnicodeString result
;
786 if (item
.length() == 0) {
791 UChar32 cp
= item
.char32At(i
);
793 i
= item
.moveIndex32(i
, 1);
794 if (i
>= item
.length()) {
803 UBool
AlphabeticIndex::operator==(const AlphabeticIndex
& /* other */) const {
808 UBool
AlphabeticIndex::operator!=(const AlphabeticIndex
& /* other */) const {
813 const RuleBasedCollator
&AlphabeticIndex::getCollator() const {
818 const UnicodeString
&AlphabeticIndex::getInflowLabel() const {
822 const UnicodeString
&AlphabeticIndex::getOverflowLabel() const {
823 return overflowLabel_
;
827 const UnicodeString
&AlphabeticIndex::getUnderflowLabel() const {
828 return underflowLabel_
;
832 AlphabeticIndex
&AlphabeticIndex::setInflowLabel(const UnicodeString
&label
, UErrorCode
&/*status*/) {
833 inflowLabel_
= label
;
839 AlphabeticIndex
&AlphabeticIndex::setOverflowLabel(const UnicodeString
&label
, UErrorCode
&/*status*/) {
840 overflowLabel_
= label
;
846 AlphabeticIndex
&AlphabeticIndex::setUnderflowLabel(const UnicodeString
&label
, UErrorCode
&/*status*/) {
847 underflowLabel_
= label
;
853 int32_t AlphabeticIndex::getMaxLabelCount() const {
854 return maxLabelCount_
;
858 AlphabeticIndex
&AlphabeticIndex::setMaxLabelCount(int32_t maxLabelCount
, UErrorCode
&status
) {
859 if (U_FAILURE(status
)) {
862 if (maxLabelCount
<= 0) {
863 status
= U_ILLEGAL_ARGUMENT_ERROR
;
866 maxLabelCount_
= maxLabelCount
;
873 // init() - Common code for constructors.
876 void AlphabeticIndex::init(const Locale
*locale
, UErrorCode
&status
) {
877 if (U_FAILURE(status
)) { return; }
878 if (locale
== NULL
&& collator_
== NULL
) {
879 status
= U_ILLEGAL_ARGUMENT_ERROR
;
883 initialLabels_
= new UnicodeSet();
884 if (initialLabels_
== NULL
) {
885 status
= U_MEMORY_ALLOCATION_ERROR
;
889 inflowLabel_
.setTo((UChar
)0x2026); // Ellipsis
890 overflowLabel_
= inflowLabel_
;
891 underflowLabel_
= inflowLabel_
;
893 if (collator_
== NULL
) {
894 Collator
*coll
= Collator::createInstance(*locale
, status
);
895 if (U_FAILURE(status
)) {
900 status
= U_MEMORY_ALLOCATION_ERROR
;
903 collator_
= dynamic_cast<RuleBasedCollator
*>(coll
);
904 if (collator_
== NULL
) {
906 status
= U_UNSUPPORTED_ERROR
;
910 collatorPrimaryOnly_
= static_cast<RuleBasedCollator
*>(collator_
->clone());
911 if (collatorPrimaryOnly_
== NULL
) {
912 status
= U_MEMORY_ALLOCATION_ERROR
;
915 collatorPrimaryOnly_
->setAttribute(UCOL_STRENGTH
, UCOL_PRIMARY
, status
);
916 firstCharsInScripts_
= firstStringsInScript(status
);
917 if (U_FAILURE(status
)) { return; }
918 firstCharsInScripts_
->sortWithUComparator(collatorComparator
, collatorPrimaryOnly_
, status
);
919 // Guard against a degenerate collator where
920 // some script boundary strings are primary ignorable.
922 if (U_FAILURE(status
)) { return; }
923 if (firstCharsInScripts_
->isEmpty()) {
924 // AlphabeticIndex requires some non-ignorable script boundary strings.
925 status
= U_ILLEGAL_ARGUMENT_ERROR
;
928 if (collatorPrimaryOnly_
->compare(
929 *static_cast<UnicodeString
*>(firstCharsInScripts_
->elementAt(0)),
930 emptyString_
, status
) == UCOL_EQUAL
) {
931 firstCharsInScripts_
->removeElementAt(0);
937 // Chinese index characters, which are specific to each of the several Chinese tailorings,
938 // take precedence over the single locale data exemplar set per language.
939 if (!addChineseIndexCharacters(status
) && locale
!= NULL
) {
940 addIndexExemplars(*locale
, status
);
946 // Comparison function for UVector<UnicodeString *> sorting with a collator.
948 static int32_t U_CALLCONV
949 collatorComparator(const void *context
, const void *left
, const void *right
) {
950 const UElement
*leftElement
= static_cast<const UElement
*>(left
);
951 const UElement
*rightElement
= static_cast<const UElement
*>(right
);
952 const UnicodeString
*leftString
= static_cast<const UnicodeString
*>(leftElement
->pointer
);
953 const UnicodeString
*rightString
= static_cast<const UnicodeString
*>(rightElement
->pointer
);
955 if (leftString
== rightString
) {
956 // Catches case where both are NULL
959 if (leftString
== NULL
) {
962 if (rightString
== NULL
) {
965 const Collator
*col
= static_cast<const Collator
*>(context
);
966 UErrorCode errorCode
= U_ZERO_ERROR
;
967 return col
->compare(*leftString
, *rightString
, errorCode
);
971 // Comparison function for UVector<Record *> sorting with a collator.
973 static int32_t U_CALLCONV
974 recordCompareFn(const void *context
, const void *left
, const void *right
) {
975 const UElement
*leftElement
= static_cast<const UElement
*>(left
);
976 const UElement
*rightElement
= static_cast<const UElement
*>(right
);
977 const AlphabeticIndex::Record
*leftRec
= static_cast<const AlphabeticIndex::Record
*>(leftElement
->pointer
);
978 const AlphabeticIndex::Record
*rightRec
= static_cast<const AlphabeticIndex::Record
*>(rightElement
->pointer
);
979 const Collator
*col
= static_cast<const Collator
*>(context
);
980 UErrorCode errorCode
= U_ZERO_ERROR
;
981 return col
->compare(leftRec
->name_
, rightRec
->name_
, errorCode
);
984 UVector
*AlphabeticIndex::firstStringsInScript(UErrorCode
&status
) {
985 if (U_FAILURE(status
)) {
988 LocalPointer
<UVector
> dest(new UVector(status
), status
);
989 if (U_FAILURE(status
)) {
992 dest
->setDeleter(uprv_deleteUObject
);
993 // Fetch the script-first-primary contractions which are defined in the root collator.
994 // They all start with U+FDD1.
996 collatorPrimaryOnly_
->internalAddContractions(0xFDD1, set
, status
);
997 if (U_FAILURE(status
)) {
1000 if (set
.isEmpty()) {
1001 status
= U_UNSUPPORTED_ERROR
;
1004 UnicodeSetIterator
iter(set
);
1005 while (iter
.next()) {
1006 const UnicodeString
&boundary
= iter
.getString();
1007 uint32_t gcMask
= U_GET_GC_MASK(boundary
.char32At(1));
1008 if ((gcMask
& (U_GC_L_MASK
| U_GC_CN_MASK
)) == 0) {
1009 // Ignore boundaries for the special reordering groups.
1010 // Take only those for "real scripts" (where the sample character is a Letter,
1011 // and the one for unassigned implicit weights (Cn).
1014 UnicodeString
*s
= new UnicodeString(boundary
);
1016 status
= U_MEMORY_ALLOCATION_ERROR
;
1019 dest
->addElement(s
, status
);
1021 return dest
.orphan();
1028 * Returns true if one index character string is "better" than the other.
1029 * Shorter NFKD is better, and otherwise NFKD-binary-less-than is
1030 * better, and otherwise binary-less-than is better.
1032 UBool
isOneLabelBetterThanOther(const Normalizer2
&nfkdNormalizer
,
1033 const UnicodeString
&one
, const UnicodeString
&other
) {
1034 // This is called with primary-equal strings, but never with one.equals(other).
1035 UErrorCode status
= U_ZERO_ERROR
;
1036 UnicodeString n1
= nfkdNormalizer
.normalize(one
, status
);
1037 UnicodeString n2
= nfkdNormalizer
.normalize(other
, status
);
1038 if (U_FAILURE(status
)) { return FALSE
; }
1039 int32_t result
= n1
.countChar32() - n2
.countChar32();
1043 result
= n1
.compareCodePointOrder(n2
);
1047 return one
.compareCodePointOrder(other
) < 0;
1053 // Constructor & Destructor for AlphabeticIndex::Record
1055 // Records are internal only, instances are not directly surfaced in the public API.
1056 // This class is mostly struct-like, with all public fields.
1058 AlphabeticIndex::Record::Record(const UnicodeString
&name
, const void *data
)
1059 : name_(name
), data_(data
) {}
1061 AlphabeticIndex::Record::~Record() {
1065 AlphabeticIndex
& AlphabeticIndex::addRecord(const UnicodeString
&name
, const void *data
, UErrorCode
&status
) {
1066 if (U_FAILURE(status
)) {
1069 if (inputList_
== NULL
) {
1070 inputList_
= new UVector(status
);
1071 if (inputList_
== NULL
) {
1072 status
= U_MEMORY_ALLOCATION_ERROR
;
1075 inputList_
->setDeleter(alphaIndex_deleteRecord
);
1077 Record
*r
= new Record(name
, data
);
1079 status
= U_MEMORY_ALLOCATION_ERROR
;
1082 inputList_
->addElement(r
, status
);
1086 //std::cout << "added record: name = \"" << r->name_.toUTF8String(ss) << "\"" <<
1087 // " sortingName = \"" << r->sortingName_.toUTF8String(ss2) << "\"" << std::endl;
1092 AlphabeticIndex
&AlphabeticIndex::clearRecords(UErrorCode
&status
) {
1093 if (U_SUCCESS(status
) && inputList_
!= NULL
&& !inputList_
->isEmpty()) {
1094 inputList_
->removeAllElements();
1100 int32_t AlphabeticIndex::getBucketIndex(const UnicodeString
&name
, UErrorCode
&status
) {
1101 initBuckets(status
);
1102 if (U_FAILURE(status
)) {
1105 return buckets_
->getBucketIndex(name
, *collatorPrimaryOnly_
, status
);
1109 int32_t AlphabeticIndex::getBucketIndex() const {
1110 return labelsIterIndex_
;
1114 UBool
AlphabeticIndex::nextBucket(UErrorCode
&status
) {
1115 if (U_FAILURE(status
)) {
1118 if (buckets_
== NULL
&& currentBucket_
!= NULL
) {
1119 status
= U_ENUM_OUT_OF_SYNC_ERROR
;
1122 initBuckets(status
);
1123 if (U_FAILURE(status
)) {
1127 if (labelsIterIndex_
>= buckets_
->getBucketCount()) {
1128 labelsIterIndex_
= buckets_
->getBucketCount();
1131 currentBucket_
= getBucket(*buckets_
->immutableVisibleList_
, labelsIterIndex_
);
1132 resetRecordIterator();
1136 const UnicodeString
&AlphabeticIndex::getBucketLabel() const {
1137 if (currentBucket_
!= NULL
) {
1138 return currentBucket_
->label_
;
1140 return emptyString_
;
1145 UAlphabeticIndexLabelType
AlphabeticIndex::getBucketLabelType() const {
1146 if (currentBucket_
!= NULL
) {
1147 return currentBucket_
->labelType_
;
1149 return U_ALPHAINDEX_NORMAL
;
1154 int32_t AlphabeticIndex::getBucketRecordCount() const {
1155 if (currentBucket_
!= NULL
&& currentBucket_
->records_
!= NULL
) {
1156 return currentBucket_
->records_
->size();
1163 AlphabeticIndex
&AlphabeticIndex::resetBucketIterator(UErrorCode
&status
) {
1164 if (U_FAILURE(status
)) {
1167 internalResetBucketIterator();
1172 UBool
AlphabeticIndex::nextRecord(UErrorCode
&status
) {
1173 if (U_FAILURE(status
)) {
1176 if (currentBucket_
== NULL
) {
1177 // We are trying to iterate over the items in a bucket, but there is no
1178 // current bucket from the enumeration of buckets.
1179 status
= U_INVALID_STATE_ERROR
;
1182 if (buckets_
== NULL
) {
1183 status
= U_ENUM_OUT_OF_SYNC_ERROR
;
1186 if (currentBucket_
->records_
== NULL
) {
1190 if (itemsIterIndex_
>= currentBucket_
->records_
->size()) {
1191 itemsIterIndex_
= currentBucket_
->records_
->size();
1198 const UnicodeString
&AlphabeticIndex::getRecordName() const {
1199 const UnicodeString
*retStr
= &emptyString_
;
1200 if (currentBucket_
!= NULL
&& currentBucket_
->records_
!= NULL
&&
1201 itemsIterIndex_
>= 0 &&
1202 itemsIterIndex_
< currentBucket_
->records_
->size()) {
1203 Record
*item
= static_cast<Record
*>(currentBucket_
->records_
->elementAt(itemsIterIndex_
));
1204 retStr
= &item
->name_
;
1209 const void *AlphabeticIndex::getRecordData() const {
1210 const void *retPtr
= NULL
;
1211 if (currentBucket_
!= NULL
&& currentBucket_
->records_
!= NULL
&&
1212 itemsIterIndex_
>= 0 &&
1213 itemsIterIndex_
< currentBucket_
->records_
->size()) {
1214 Record
*item
= static_cast<Record
*>(currentBucket_
->records_
->elementAt(itemsIterIndex_
));
1215 retPtr
= item
->data_
;
1221 AlphabeticIndex
& AlphabeticIndex::resetRecordIterator() {
1222 itemsIterIndex_
= -1;
1228 AlphabeticIndex::Bucket::Bucket(const UnicodeString
&label
,
1229 const UnicodeString
&lowerBoundary
,
1230 UAlphabeticIndexLabelType type
)
1231 : label_(label
), lowerBoundary_(lowerBoundary
), labelType_(type
),
1232 displayBucket_(NULL
), displayIndex_(-1),
1237 AlphabeticIndex::Bucket::~Bucket() {
1243 #endif // !UCONFIG_NO_COLLATION