2 *******************************************************************************
3 * Copyright (C) 2009-2014, International Business Machines Corporation and
4 * others. All Rights Reserved.
5 *******************************************************************************
8 #include "unicode/utypes.h"
10 #if !UCONFIG_NO_COLLATION
12 #include "unicode/alphaindex.h"
13 #include "unicode/coll.h"
14 #include "unicode/localpointer.h"
15 #include "unicode/normalizer2.h"
16 #include "unicode/tblcoll.h"
17 #include "unicode/uchar.h"
18 #include "unicode/ulocdata.h"
19 #include "unicode/uniset.h"
20 #include "unicode/uobject.h"
21 #include "unicode/usetiter.h"
22 #include "unicode/utf16.h"
38 * Prefix string for Chinese index buckets.
39 * See http://unicode.org/repos/cldr/trunk/specs/ldml/tr35-collation.html#Collation_Indexes
41 const UChar BASE
[1] = { 0xFDD0 };
42 const int32_t BASE_LENGTH
= 1;
44 UBool
isOneLabelBetterThanOther(const Normalizer2
&nfkdNormalizer
,
45 const UnicodeString
&one
, const UnicodeString
&other
);
49 static int32_t U_CALLCONV
50 collatorComparator(const void *context
, const void *left
, const void *right
);
52 static int32_t U_CALLCONV
53 recordCompareFn(const void *context
, const void *left
, const void *right
);
55 // UVector<Record *> support function, delete a Record.
56 static void U_CALLCONV
57 alphaIndex_deleteRecord(void *obj
) {
58 delete static_cast<AlphabeticIndex::Record
*>(obj
);
63 UnicodeString
*ownedString(const UnicodeString
&s
, LocalPointer
<UnicodeString
> &owned
,
64 UErrorCode
&errorCode
) {
65 if (U_FAILURE(errorCode
)) { return NULL
; }
66 if (owned
.isValid()) {
67 return owned
.orphan();
69 UnicodeString
*p
= new UnicodeString(s
);
71 errorCode
= U_MEMORY_ALLOCATION_ERROR
;
76 inline UnicodeString
*getString(const UVector
&list
, int32_t i
) {
77 return static_cast<UnicodeString
*>(list
[i
]);
80 inline AlphabeticIndex::Bucket
*getBucket(const UVector
&list
, int32_t i
) {
81 return static_cast<AlphabeticIndex::Bucket
*>(list
[i
]);
84 inline AlphabeticIndex::Record
*getRecord(const UVector
&list
, int32_t i
) {
85 return static_cast<AlphabeticIndex::Record
*>(list
[i
]);
89 * Like Java Collections.binarySearch(List, String, Comparator).
91 * @return the index>=0 where the item was found,
92 * or the index<0 for inserting the string at ~index in sorted order
94 int32_t binarySearch(const UVector
&list
, const UnicodeString
&s
, const Collator
&coll
) {
95 if (list
.size() == 0) { return ~0; }
97 int32_t limit
= list
.size();
99 int32_t i
= (start
+ limit
) / 2;
100 const UnicodeString
*si
= static_cast<UnicodeString
*>(list
.elementAt(i
));
101 UErrorCode errorCode
= U_ZERO_ERROR
;
102 UCollationResult cmp
= coll
.compare(s
, *si
, errorCode
);
103 if (cmp
== UCOL_EQUAL
) {
105 } else if (cmp
< 0) {
107 return ~start
; // insert s before *si
112 return ~(start
+ 1); // insert s after *si
121 // The BucketList is not in the anonymous namespace because only Clang
122 // seems to support its use in other classes from there.
123 // However, we also don't need U_I18N_API because it is not used from outside the i18n library.
124 class BucketList
: public UObject
{
126 BucketList(UVector
*bucketList
, UVector
*publicBucketList
)
127 : bucketList_(bucketList
), immutableVisibleList_(publicBucketList
) {
128 int32_t displayIndex
= 0;
129 for (int32_t i
= 0; i
< publicBucketList
->size(); ++i
) {
130 getBucket(*publicBucketList
, i
)->displayIndex_
= displayIndex
++;
134 // The virtual destructor must not be inline.
135 // See ticket #8454 for details.
136 virtual ~BucketList();
138 int32_t getBucketCount() const {
139 return immutableVisibleList_
->size();
142 int32_t getBucketIndex(const UnicodeString
&name
, const Collator
&collatorPrimaryOnly
,
143 UErrorCode
&errorCode
) {
146 int32_t limit
= bucketList_
->size();
147 while ((start
+ 1) < limit
) {
148 int32_t i
= (start
+ limit
) / 2;
149 const AlphabeticIndex::Bucket
*bucket
= getBucket(*bucketList_
, i
);
150 UCollationResult nameVsBucket
=
151 collatorPrimaryOnly
.compare(name
, bucket
->lowerBoundary_
, errorCode
);
152 if (nameVsBucket
< 0) {
158 const AlphabeticIndex::Bucket
*bucket
= getBucket(*bucketList_
, start
);
159 if (bucket
->displayBucket_
!= NULL
) {
160 bucket
= bucket
->displayBucket_
;
162 return bucket
->displayIndex_
;
165 /** All of the buckets, visible and invisible. */
166 UVector
*bucketList_
;
167 /** Just the visible buckets. */
168 UVector
*immutableVisibleList_
;
171 BucketList::~BucketList() {
173 if (immutableVisibleList_
!= bucketList_
) {
174 delete immutableVisibleList_
;
178 AlphabeticIndex::ImmutableIndex::~ImmutableIndex() {
180 delete collatorPrimaryOnly_
;
184 AlphabeticIndex::ImmutableIndex::getBucketCount() const {
185 return buckets_
->getBucketCount();
189 AlphabeticIndex::ImmutableIndex::getBucketIndex(
190 const UnicodeString
&name
, UErrorCode
&errorCode
) const {
191 return buckets_
->getBucketIndex(name
, *collatorPrimaryOnly_
, errorCode
);
194 const AlphabeticIndex::Bucket
*
195 AlphabeticIndex::ImmutableIndex::getBucket(int32_t index
) const {
196 if (0 <= index
&& index
< buckets_
->getBucketCount()) {
197 return icu::getBucket(*buckets_
->immutableVisibleList_
, index
);
203 AlphabeticIndex::AlphabeticIndex(const Locale
&locale
, UErrorCode
&status
)
205 labelsIterIndex_(-1), itemsIterIndex_(0), currentBucket_(NULL
),
207 initialLabels_(NULL
), firstCharsInScripts_(NULL
),
208 collator_(NULL
), collatorPrimaryOnly_(NULL
),
210 init(&locale
, status
);
214 AlphabeticIndex::AlphabeticIndex(RuleBasedCollator
*collator
, UErrorCode
&status
)
216 labelsIterIndex_(-1), itemsIterIndex_(0), currentBucket_(NULL
),
218 initialLabels_(NULL
), firstCharsInScripts_(NULL
),
219 collator_(collator
), collatorPrimaryOnly_(NULL
),
226 AlphabeticIndex::~AlphabeticIndex() {
228 delete collatorPrimaryOnly_
;
229 delete firstCharsInScripts_
;
232 delete initialLabels_
;
236 AlphabeticIndex
&AlphabeticIndex::addLabels(const UnicodeSet
&additions
, UErrorCode
&status
) {
237 if (U_FAILURE(status
)) {
240 initialLabels_
->addAll(additions
);
246 AlphabeticIndex
&AlphabeticIndex::addLabels(const Locale
&locale
, UErrorCode
&status
) {
247 addIndexExemplars(locale
, status
);
253 AlphabeticIndex::ImmutableIndex
*AlphabeticIndex::buildImmutableIndex(UErrorCode
&errorCode
) {
254 if (U_FAILURE(errorCode
)) { return NULL
; }
255 // In C++, the ImmutableIndex must own its copy of the BucketList,
256 // even if it contains no records, for proper memory management.
257 // We could clone the buckets_ if they are not NULL,
258 // but that would be worth it only if this method is called multiple times,
259 // or called after using the old-style bucket iterator API.
260 LocalPointer
<BucketList
> immutableBucketList(createBucketList(errorCode
));
261 LocalPointer
<RuleBasedCollator
> coll(
262 static_cast<RuleBasedCollator
*>(collatorPrimaryOnly_
->clone()));
263 if (immutableBucketList
.isNull() || coll
.isNull()) {
264 errorCode
= U_MEMORY_ALLOCATION_ERROR
;
267 ImmutableIndex
*immIndex
= new ImmutableIndex(immutableBucketList
.getAlias(), coll
.getAlias());
268 if (immIndex
== NULL
) {
269 errorCode
= U_MEMORY_ALLOCATION_ERROR
;
272 // The ImmutableIndex adopted its parameter objects.
273 immutableBucketList
.orphan();
278 int32_t AlphabeticIndex::getBucketCount(UErrorCode
&status
) {
280 if (U_FAILURE(status
)) {
283 return buckets_
->getBucketCount();
287 int32_t AlphabeticIndex::getRecordCount(UErrorCode
&status
) {
288 if (U_FAILURE(status
) || inputList_
== NULL
) {
291 return inputList_
->size();
294 void AlphabeticIndex::initLabels(UVector
&indexCharacters
, UErrorCode
&errorCode
) const {
295 const Normalizer2
*nfkdNormalizer
= Normalizer2::getNFKDInstance(errorCode
);
296 if (U_FAILURE(errorCode
)) { return; }
298 const UnicodeString
&firstScriptBoundary
= *getString(*firstCharsInScripts_
, 0);
299 const UnicodeString
&overflowBoundary
=
300 *getString(*firstCharsInScripts_
, firstCharsInScripts_
->size() - 1);
302 // We make a sorted array of elements.
303 // Some of the input may be redundant.
304 // That is, we might have c, ch, d, where "ch" sorts just like "c", "h".
305 // We filter out those cases.
306 UnicodeSetIterator
iter(*initialLabels_
);
307 while (iter
.next()) {
308 const UnicodeString
*item
= &iter
.getString();
309 LocalPointer
<UnicodeString
> ownedItem
;
311 int32_t itemLength
= item
->length();
312 if (!item
->hasMoreChar32Than(0, itemLength
, 1)) {
313 checkDistinct
= FALSE
;
314 } else if(item
->charAt(itemLength
- 1) == 0x2a && // '*'
315 item
->charAt(itemLength
- 2) != 0x2a) {
316 // Use a label if it is marked with one trailing star,
317 // even if the label string sorts the same when all contractions are suppressed.
318 ownedItem
.adoptInstead(new UnicodeString(*item
, 0, itemLength
- 1));
319 item
= ownedItem
.getAlias();
321 errorCode
= U_MEMORY_ALLOCATION_ERROR
;
324 checkDistinct
= FALSE
;
326 checkDistinct
= TRUE
;
328 if (collatorPrimaryOnly_
->compare(*item
, firstScriptBoundary
, errorCode
) < 0) {
329 // Ignore a primary-ignorable or non-alphabetic index character.
330 } else if (collatorPrimaryOnly_
->compare(*item
, overflowBoundary
, errorCode
) >= 0) {
331 // Ignore an index character that will land in the overflow bucket.
332 } else if (checkDistinct
&&
333 collatorPrimaryOnly_
->compare(*item
, separated(*item
), errorCode
) == 0) {
334 // Ignore a multi-code point index character that does not sort distinctly
335 // from the sequence of its separate characters.
337 int32_t insertionPoint
= binarySearch(indexCharacters
, *item
, *collatorPrimaryOnly_
);
338 if (insertionPoint
< 0) {
339 indexCharacters
.insertElementAt(
340 ownedString(*item
, ownedItem
, errorCode
), ~insertionPoint
, errorCode
);
342 const UnicodeString
&itemAlreadyIn
= *getString(indexCharacters
, insertionPoint
);
343 if (isOneLabelBetterThanOther(*nfkdNormalizer
, *item
, itemAlreadyIn
)) {
344 indexCharacters
.setElementAt(
345 ownedString(*item
, ownedItem
, errorCode
), insertionPoint
);
350 if (U_FAILURE(errorCode
)) { return; }
352 // if the result is still too large, cut down to maxLabelCount_ elements, by removing every nth element
354 int32_t size
= indexCharacters
.size() - 1;
355 if (size
> maxLabelCount_
) {
358 for (int32_t i
= 0; i
< indexCharacters
.size();) {
360 int32_t bump
= count
* maxLabelCount_
/ size
;
362 indexCharacters
.removeElementAt(i
);
373 const UnicodeString
&fixLabel(const UnicodeString
¤t
, UnicodeString
&temp
) {
374 if (!current
.startsWith(BASE
, BASE_LENGTH
)) {
377 UChar rest
= current
.charAt(BASE_LENGTH
);
378 if (0x2800 < rest
&& rest
<= 0x28FF) { // stroke count
379 int32_t count
= rest
-0x2800;
380 temp
.setTo((UChar
)(0x30 + count
% 10));
383 temp
.insert(0, (UChar
)(0x30 + count
% 10));
386 temp
.insert(0, (UChar
)(0x30 + count
));
389 return temp
.append((UChar
)0x5283);
391 return temp
.setTo(current
, BASE_LENGTH
);
394 UBool
hasMultiplePrimaryWeights(
395 const RuleBasedCollator
&coll
, uint32_t variableTop
,
396 const UnicodeString
&s
, UVector64
&ces
, UErrorCode
&errorCode
) {
397 ces
.removeAllElements();
398 coll
.internalGetCEs(s
, ces
, errorCode
);
399 if (U_FAILURE(errorCode
)) { return FALSE
; }
400 UBool seenPrimary
= FALSE
;
401 for (int32_t i
= 0; i
< ces
.size(); ++i
) {
402 int64_t ce
= ces
.elementAti(i
);
403 uint32_t p
= (uint32_t)(ce
>> 32);
404 if (p
> variableTop
) {
405 // not primary ignorable
417 BucketList
*AlphabeticIndex::createBucketList(UErrorCode
&errorCode
) const {
418 // Initialize indexCharacters.
419 UVector
indexCharacters(errorCode
);
420 indexCharacters
.setDeleter(uprv_deleteUObject
);
421 initLabels(indexCharacters
, errorCode
);
422 if (U_FAILURE(errorCode
)) { return NULL
; }
424 // Variables for hasMultiplePrimaryWeights().
425 UVector64
ces(errorCode
);
426 uint32_t variableTop
;
427 if (collatorPrimaryOnly_
->getAttribute(UCOL_ALTERNATE_HANDLING
, errorCode
) == UCOL_SHIFTED
) {
428 variableTop
= collatorPrimaryOnly_
->getVariableTop(errorCode
);
432 UBool hasInvisibleBuckets
= FALSE
;
434 // Helper arrays for Chinese Pinyin collation.
435 Bucket
*asciiBuckets
[26] = {
436 NULL
, NULL
, NULL
, NULL
, NULL
, NULL
, NULL
, NULL
, NULL
, NULL
, NULL
, NULL
, NULL
,
437 NULL
, NULL
, NULL
, NULL
, NULL
, NULL
, NULL
, NULL
, NULL
, NULL
, NULL
, NULL
, NULL
439 Bucket
*pinyinBuckets
[26] = {
440 NULL
, NULL
, NULL
, NULL
, NULL
, NULL
, NULL
, NULL
, NULL
, NULL
, NULL
, NULL
, NULL
,
441 NULL
, NULL
, NULL
, NULL
, NULL
, NULL
, NULL
, NULL
, NULL
, NULL
, NULL
, NULL
, NULL
443 UBool hasPinyin
= FALSE
;
445 LocalPointer
<UVector
> bucketList(new UVector(errorCode
), errorCode
);
446 if (U_FAILURE(errorCode
)) {
449 bucketList
->setDeleter(uprv_deleteUObject
);
452 Bucket
*bucket
= new Bucket(getUnderflowLabel(), emptyString_
, U_ALPHAINDEX_UNDERFLOW
);
453 if (bucket
== NULL
) {
454 errorCode
= U_MEMORY_ALLOCATION_ERROR
;
457 bucketList
->addElement(bucket
, errorCode
);
458 if (U_FAILURE(errorCode
)) { return NULL
; }
462 // fix up the list, adding underflow, additions, overflow
463 // Insert inflow labels as needed.
464 int32_t scriptIndex
= -1;
465 const UnicodeString
*scriptUpperBoundary
= &emptyString_
;
466 for (int32_t i
= 0; i
< indexCharacters
.size(); ++i
) {
467 UnicodeString
¤t
= *getString(indexCharacters
, i
);
468 if (collatorPrimaryOnly_
->compare(current
, *scriptUpperBoundary
, errorCode
) >= 0) {
469 // We crossed the script boundary into a new script.
470 const UnicodeString
&inflowBoundary
= *scriptUpperBoundary
;
471 UBool skippedScript
= FALSE
;
473 scriptUpperBoundary
= getString(*firstCharsInScripts_
, ++scriptIndex
);
474 if (collatorPrimaryOnly_
->compare(current
, *scriptUpperBoundary
, errorCode
) < 0) {
477 skippedScript
= TRUE
;
479 if (skippedScript
&& bucketList
->size() > 1) {
480 // We are skipping one or more scripts,
481 // and we are not just getting out of the underflow label.
482 bucket
= new Bucket(getInflowLabel(), inflowBoundary
, U_ALPHAINDEX_INFLOW
);
483 if (bucket
== NULL
) {
484 errorCode
= U_MEMORY_ALLOCATION_ERROR
;
487 bucketList
->addElement(bucket
, errorCode
);
490 // Add a bucket with the current label.
491 bucket
= new Bucket(fixLabel(current
, temp
), current
, U_ALPHAINDEX_NORMAL
);
492 if (bucket
== NULL
) {
493 errorCode
= U_MEMORY_ALLOCATION_ERROR
;
496 bucketList
->addElement(bucket
, errorCode
);
497 // Remember ASCII and Pinyin buckets for Pinyin redirects.
499 if (current
.length() == 1 && 0x41 <= (c
= current
.charAt(0)) && c
<= 0x5A) { // A-Z
500 asciiBuckets
[c
- 0x41] = bucket
;
501 } else if (current
.length() == BASE_LENGTH
+ 1 && current
.startsWith(BASE
, BASE_LENGTH
) &&
502 0x41 <= (c
= current
.charAt(BASE_LENGTH
)) && c
<= 0x5A) {
503 pinyinBuckets
[c
- 0x41] = bucket
;
506 // Check for multiple primary weights.
507 if (!current
.startsWith(BASE
, BASE_LENGTH
) &&
508 hasMultiplePrimaryWeights(*collatorPrimaryOnly_
, variableTop
, current
,
510 current
.charAt(current
.length() - 1) != 0xFFFF /* !current.endsWith("\uffff") */) {
511 // "AE-ligature" or "Sch" etc.
512 for (int32_t i
= bucketList
->size() - 2;; --i
) {
513 Bucket
*singleBucket
= getBucket(*bucketList
, i
);
514 if (singleBucket
->labelType_
!= U_ALPHAINDEX_NORMAL
) {
515 // There is no single-character bucket since the last
516 // underflow or inflow label.
519 if (singleBucket
->displayBucket_
== NULL
&&
520 !hasMultiplePrimaryWeights(*collatorPrimaryOnly_
, variableTop
,
521 singleBucket
->lowerBoundary_
,
523 // Add an invisible bucket that redirects strings greater than the expansion
524 // to the previous single-character bucket.
525 // For example, after ... Q R S Sch we add Sch\uFFFF->S
526 // and after ... Q R S Sch Sch\uFFFF St we add St\uFFFF->S.
527 bucket
= new Bucket(emptyString_
,
528 UnicodeString(current
).append((UChar
)0xFFFF),
529 U_ALPHAINDEX_NORMAL
);
530 if (bucket
== NULL
) {
531 errorCode
= U_MEMORY_ALLOCATION_ERROR
;
534 bucket
->displayBucket_
= singleBucket
;
535 bucketList
->addElement(bucket
, errorCode
);
536 hasInvisibleBuckets
= TRUE
;
542 if (U_FAILURE(errorCode
)) { return NULL
; }
543 if (bucketList
->size() == 1) {
544 // No real labels, show only the underflow label.
545 BucketList
*bl
= new BucketList(bucketList
.getAlias(), bucketList
.getAlias());
547 errorCode
= U_MEMORY_ALLOCATION_ERROR
;
554 bucket
= new Bucket(getOverflowLabel(), *scriptUpperBoundary
, U_ALPHAINDEX_OVERFLOW
);
555 if (bucket
== NULL
) {
556 errorCode
= U_MEMORY_ALLOCATION_ERROR
;
559 bucketList
->addElement(bucket
, errorCode
); // final
562 // Redirect Pinyin buckets.
563 Bucket
*asciiBucket
= NULL
;
564 for (int32_t i
= 0; i
< 26; ++i
) {
565 if (asciiBuckets
[i
] != NULL
) {
566 asciiBucket
= asciiBuckets
[i
];
568 if (pinyinBuckets
[i
] != NULL
&& asciiBucket
!= NULL
) {
569 pinyinBuckets
[i
]->displayBucket_
= asciiBucket
;
570 hasInvisibleBuckets
= TRUE
;
575 if (U_FAILURE(errorCode
)) { return NULL
; }
576 if (!hasInvisibleBuckets
) {
577 BucketList
*bl
= new BucketList(bucketList
.getAlias(), bucketList
.getAlias());
579 errorCode
= U_MEMORY_ALLOCATION_ERROR
;
585 // Merge inflow buckets that are visually adjacent.
586 // Iterate backwards: Merge inflow into overflow rather than the other way around.
587 int32_t i
= bucketList
->size() - 1;
588 Bucket
*nextBucket
= getBucket(*bucketList
, i
);
590 bucket
= getBucket(*bucketList
, i
);
591 if (bucket
->displayBucket_
!= NULL
) {
592 continue; // skip invisible buckets
594 if (bucket
->labelType_
== U_ALPHAINDEX_INFLOW
) {
595 if (nextBucket
->labelType_
!= U_ALPHAINDEX_NORMAL
) {
596 bucket
->displayBucket_
= nextBucket
;
603 LocalPointer
<UVector
> publicBucketList(new UVector(errorCode
), errorCode
);
604 if (U_FAILURE(errorCode
)) {
607 // Do not call publicBucketList->setDeleter():
608 // This vector shares its objects with the bucketList.
609 for (int32_t i
= 0; i
< bucketList
->size(); ++i
) {
610 bucket
= getBucket(*bucketList
, i
);
611 if (bucket
->displayBucket_
== NULL
) {
612 publicBucketList
->addElement(bucket
, errorCode
);
615 if (U_FAILURE(errorCode
)) { return NULL
; }
616 BucketList
*bl
= new BucketList(bucketList
.getAlias(), publicBucketList
.getAlias());
618 errorCode
= U_MEMORY_ALLOCATION_ERROR
;
622 publicBucketList
.orphan();
627 * Creates an index, and buckets and sorts the list of records into the index.
629 void AlphabeticIndex::initBuckets(UErrorCode
&errorCode
) {
630 if (U_FAILURE(errorCode
) || buckets_
!= NULL
) {
633 buckets_
= createBucketList(errorCode
);
634 if (U_FAILURE(errorCode
) || inputList_
== NULL
|| inputList_
->isEmpty()) {
638 // Sort the records by name.
639 // Stable sort preserves input order of collation duplicates.
640 inputList_
->sortWithUComparator(recordCompareFn
, collator_
, errorCode
);
642 // Now, we traverse all of the input, which is now sorted.
643 // If the item doesn't go in the current bucket, we find the next bucket that contains it.
644 // This makes the process order n*log(n), since we just sort the list and then do a linear process.
645 // However, if the user adds an item at a time and then gets the buckets, this isn't efficient, so
646 // we need to improve it for that case.
648 Bucket
*currentBucket
= getBucket(*buckets_
->bucketList_
, 0);
649 int32_t bucketIndex
= 1;
651 const UnicodeString
*upperBoundary
;
652 if (bucketIndex
< buckets_
->bucketList_
->size()) {
653 nextBucket
= getBucket(*buckets_
->bucketList_
, bucketIndex
++);
654 upperBoundary
= &nextBucket
->lowerBoundary_
;
657 upperBoundary
= NULL
;
659 for (int32_t i
= 0; i
< inputList_
->size(); ++i
) {
660 Record
*r
= getRecord(*inputList_
, i
);
661 // if the current bucket isn't the right one, find the one that is
662 // We have a special flag for the last bucket so that we don't look any further
663 while (upperBoundary
!= NULL
&&
664 collatorPrimaryOnly_
->compare(r
->name_
, *upperBoundary
, errorCode
) >= 0) {
665 currentBucket
= nextBucket
;
666 // now reset the boundary that we compare against
667 if (bucketIndex
< buckets_
->bucketList_
->size()) {
668 nextBucket
= getBucket(*buckets_
->bucketList_
, bucketIndex
++);
669 upperBoundary
= &nextBucket
->lowerBoundary_
;
671 upperBoundary
= NULL
;
674 // now put the record into the bucket.
675 Bucket
*bucket
= currentBucket
;
676 if (bucket
->displayBucket_
!= NULL
) {
677 bucket
= bucket
->displayBucket_
;
679 if (bucket
->records_
== NULL
) {
680 bucket
->records_
= new UVector(errorCode
);
681 if (bucket
->records_
== NULL
) {
682 errorCode
= U_MEMORY_ALLOCATION_ERROR
;
686 bucket
->records_
->addElement(r
, errorCode
);
690 void AlphabeticIndex::clearBuckets() {
691 if (buckets_
!= NULL
) {
694 internalResetBucketIterator();
698 void AlphabeticIndex::internalResetBucketIterator() {
699 labelsIterIndex_
= -1;
700 currentBucket_
= NULL
;
704 void AlphabeticIndex::addIndexExemplars(const Locale
&locale
, UErrorCode
&status
) {
705 LocalULocaleDataPointer
uld(ulocdata_open(locale
.getName(), &status
));
706 if (U_FAILURE(status
)) {
710 UnicodeSet exemplars
;
711 ulocdata_getExemplarSet(uld
.getAlias(), exemplars
.toUSet(), 0, ULOCDATA_ES_INDEX
, &status
);
712 if (U_SUCCESS(status
)) {
713 initialLabels_
->addAll(exemplars
);
716 status
= U_ZERO_ERROR
; // Clear out U_MISSING_RESOURCE_ERROR
718 // The locale data did not include explicit Index characters.
719 // Synthesize a set of them from the locale's standard exemplar characters.
720 ulocdata_getExemplarSet(uld
.getAlias(), exemplars
.toUSet(), 0, ULOCDATA_ES_STANDARD
, &status
);
721 if (U_FAILURE(status
)) {
725 // question: should we add auxiliary exemplars?
726 if (exemplars
.containsSome(0x61, 0x7A) /* a-z */ || exemplars
.size() == 0) {
727 exemplars
.add(0x61, 0x7A);
729 if (exemplars
.containsSome(0xAC00, 0xD7A3)) { // Hangul syllables
730 // cut down to small list
731 exemplars
.remove(0xAC00, 0xD7A3).
732 add(0xAC00).add(0xB098).add(0xB2E4).add(0xB77C).
733 add(0xB9C8).add(0xBC14).add(0xC0AC).add(0xC544).
734 add(0xC790).add(0xCC28).add(0xCE74).add(0xD0C0).
735 add(0xD30C).add(0xD558);
737 if (exemplars
.containsSome(0x1200, 0x137F)) { // Ethiopic block
738 // cut down to small list
739 // make use of the fact that Ethiopic is allocated in 8's, where
740 // the base is 0 mod 8.
742 UNICODE_STRING_SIMPLE("[[:Block=Ethiopic:]&[:Script=Ethiopic:]]"), status
);
743 UnicodeSetIterator
it(ethiopic
);
744 while (it
.next() && !it
.isString()) {
745 if ((it
.getCodepoint() & 0x7) != 0) {
746 exemplars
.remove(it
.getCodepoint());
751 // Upper-case any that aren't already so.
752 // (We only do this for synthesized index characters.)
753 UnicodeSetIterator
it(exemplars
);
754 UnicodeString upperC
;
756 const UnicodeString
&exemplarC
= it
.getString();
758 upperC
.toUpper(locale
);
759 initialLabels_
->add(upperC
);
763 UBool
AlphabeticIndex::addChineseIndexCharacters(UErrorCode
&errorCode
) {
764 UnicodeSet contractions
;
765 collatorPrimaryOnly_
->internalAddContractions(BASE
[0], contractions
, errorCode
);
766 if (U_FAILURE(errorCode
) || contractions
.isEmpty()) { return FALSE
; }
767 initialLabels_
->addAll(contractions
);
768 UnicodeSetIterator
iter(contractions
);
769 while (iter
.next()) {
770 const UnicodeString
&s
= iter
.getString();
771 U_ASSERT (s
.startsWith(BASE
, BASE_LENGTH
));
772 UChar c
= s
.charAt(s
.length() - 1);
773 if (0x41 <= c
&& c
<= 0x5A) { // A-Z
774 // There are Pinyin labels, add ASCII A-Z labels as well.
775 initialLabels_
->add(0x41, 0x5A); // A-Z
784 * Return the string with interspersed CGJs. Input must have more than 2 codepoints.
786 static const UChar CGJ
= 0x034F;
787 UnicodeString
AlphabeticIndex::separated(const UnicodeString
&item
) {
788 UnicodeString result
;
789 if (item
.length() == 0) {
794 UChar32 cp
= item
.char32At(i
);
796 i
= item
.moveIndex32(i
, 1);
797 if (i
>= item
.length()) {
806 UBool
AlphabeticIndex::operator==(const AlphabeticIndex
& /* other */) const {
811 UBool
AlphabeticIndex::operator!=(const AlphabeticIndex
& /* other */) const {
816 const RuleBasedCollator
&AlphabeticIndex::getCollator() const {
821 const UnicodeString
&AlphabeticIndex::getInflowLabel() const {
825 const UnicodeString
&AlphabeticIndex::getOverflowLabel() const {
826 return overflowLabel_
;
830 const UnicodeString
&AlphabeticIndex::getUnderflowLabel() const {
831 return underflowLabel_
;
835 AlphabeticIndex
&AlphabeticIndex::setInflowLabel(const UnicodeString
&label
, UErrorCode
&/*status*/) {
836 inflowLabel_
= label
;
842 AlphabeticIndex
&AlphabeticIndex::setOverflowLabel(const UnicodeString
&label
, UErrorCode
&/*status*/) {
843 overflowLabel_
= label
;
849 AlphabeticIndex
&AlphabeticIndex::setUnderflowLabel(const UnicodeString
&label
, UErrorCode
&/*status*/) {
850 underflowLabel_
= label
;
856 int32_t AlphabeticIndex::getMaxLabelCount() const {
857 return maxLabelCount_
;
861 AlphabeticIndex
&AlphabeticIndex::setMaxLabelCount(int32_t maxLabelCount
, UErrorCode
&status
) {
862 if (U_FAILURE(status
)) {
865 if (maxLabelCount
<= 0) {
866 status
= U_ILLEGAL_ARGUMENT_ERROR
;
869 maxLabelCount_
= maxLabelCount
;
876 // init() - Common code for constructors.
879 void AlphabeticIndex::init(const Locale
*locale
, UErrorCode
&status
) {
880 if (U_FAILURE(status
)) { return; }
881 if (locale
== NULL
&& collator_
== NULL
) {
882 status
= U_ILLEGAL_ARGUMENT_ERROR
;
886 initialLabels_
= new UnicodeSet();
887 if (initialLabels_
== NULL
) {
888 status
= U_MEMORY_ALLOCATION_ERROR
;
892 inflowLabel_
.setTo((UChar
)0x2026); // Ellipsis
893 overflowLabel_
= inflowLabel_
;
894 underflowLabel_
= inflowLabel_
;
896 if (collator_
== NULL
) {
897 Collator
*coll
= Collator::createInstance(*locale
, status
);
898 if (U_FAILURE(status
)) {
903 status
= U_MEMORY_ALLOCATION_ERROR
;
906 collator_
= dynamic_cast<RuleBasedCollator
*>(coll
);
907 if (collator_
== NULL
) {
909 status
= U_UNSUPPORTED_ERROR
;
913 collatorPrimaryOnly_
= static_cast<RuleBasedCollator
*>(collator_
->clone());
914 if (collatorPrimaryOnly_
== NULL
) {
915 status
= U_MEMORY_ALLOCATION_ERROR
;
918 collatorPrimaryOnly_
->setAttribute(UCOL_STRENGTH
, UCOL_PRIMARY
, status
);
919 firstCharsInScripts_
= firstStringsInScript(status
);
920 if (U_FAILURE(status
)) { return; }
921 firstCharsInScripts_
->sortWithUComparator(collatorComparator
, collatorPrimaryOnly_
, status
);
922 // Guard against a degenerate collator where
923 // some script boundary strings are primary ignorable.
925 if (U_FAILURE(status
)) { return; }
926 if (firstCharsInScripts_
->isEmpty()) {
927 // AlphabeticIndex requires some non-ignorable script boundary strings.
928 status
= U_ILLEGAL_ARGUMENT_ERROR
;
931 if (collatorPrimaryOnly_
->compare(
932 *static_cast<UnicodeString
*>(firstCharsInScripts_
->elementAt(0)),
933 emptyString_
, status
) == UCOL_EQUAL
) {
934 firstCharsInScripts_
->removeElementAt(0);
940 // Chinese index characters, which are specific to each of the several Chinese tailorings,
941 // take precedence over the single locale data exemplar set per language.
942 if (!addChineseIndexCharacters(status
) && locale
!= NULL
) {
943 addIndexExemplars(*locale
, status
);
949 // Comparison function for UVector<UnicodeString *> sorting with a collator.
951 static int32_t U_CALLCONV
952 collatorComparator(const void *context
, const void *left
, const void *right
) {
953 const UElement
*leftElement
= static_cast<const UElement
*>(left
);
954 const UElement
*rightElement
= static_cast<const UElement
*>(right
);
955 const UnicodeString
*leftString
= static_cast<const UnicodeString
*>(leftElement
->pointer
);
956 const UnicodeString
*rightString
= static_cast<const UnicodeString
*>(rightElement
->pointer
);
958 if (leftString
== rightString
) {
959 // Catches case where both are NULL
962 if (leftString
== NULL
) {
965 if (rightString
== NULL
) {
968 const Collator
*col
= static_cast<const Collator
*>(context
);
969 UErrorCode errorCode
= U_ZERO_ERROR
;
970 return col
->compare(*leftString
, *rightString
, errorCode
);
974 // Comparison function for UVector<Record *> sorting with a collator.
976 static int32_t U_CALLCONV
977 recordCompareFn(const void *context
, const void *left
, const void *right
) {
978 const UElement
*leftElement
= static_cast<const UElement
*>(left
);
979 const UElement
*rightElement
= static_cast<const UElement
*>(right
);
980 const AlphabeticIndex::Record
*leftRec
= static_cast<const AlphabeticIndex::Record
*>(leftElement
->pointer
);
981 const AlphabeticIndex::Record
*rightRec
= static_cast<const AlphabeticIndex::Record
*>(rightElement
->pointer
);
982 const Collator
*col
= static_cast<const Collator
*>(context
);
983 UErrorCode errorCode
= U_ZERO_ERROR
;
984 return col
->compare(leftRec
->name_
, rightRec
->name_
, errorCode
);
987 UVector
*AlphabeticIndex::firstStringsInScript(UErrorCode
&status
) {
988 if (U_FAILURE(status
)) {
991 LocalPointer
<UVector
> dest(new UVector(status
), status
);
992 if (U_FAILURE(status
)) {
995 dest
->setDeleter(uprv_deleteUObject
);
996 // Fetch the script-first-primary contractions which are defined in the root collator.
997 // They all start with U+FDD1.
999 collatorPrimaryOnly_
->internalAddContractions(0xFDD1, set
, status
);
1000 if (U_FAILURE(status
)) {
1003 if (set
.isEmpty()) {
1004 status
= U_UNSUPPORTED_ERROR
;
1007 UnicodeSetIterator
iter(set
);
1008 while (iter
.next()) {
1009 const UnicodeString
&boundary
= iter
.getString();
1010 uint32_t gcMask
= U_GET_GC_MASK(boundary
.char32At(1));
1011 if ((gcMask
& (U_GC_L_MASK
| U_GC_CN_MASK
)) == 0) {
1012 // Ignore boundaries for the special reordering groups.
1013 // Take only those for "real scripts" (where the sample character is a Letter,
1014 // and the one for unassigned implicit weights (Cn).
1017 UnicodeString
*s
= new UnicodeString(boundary
);
1019 status
= U_MEMORY_ALLOCATION_ERROR
;
1022 dest
->addElement(s
, status
);
1024 return dest
.orphan();
1031 * Returns true if one index character string is "better" than the other.
1032 * Shorter NFKD is better, and otherwise NFKD-binary-less-than is
1033 * better, and otherwise binary-less-than is better.
1035 UBool
isOneLabelBetterThanOther(const Normalizer2
&nfkdNormalizer
,
1036 const UnicodeString
&one
, const UnicodeString
&other
) {
1037 // This is called with primary-equal strings, but never with one.equals(other).
1038 UErrorCode status
= U_ZERO_ERROR
;
1039 UnicodeString n1
= nfkdNormalizer
.normalize(one
, status
);
1040 UnicodeString n2
= nfkdNormalizer
.normalize(other
, status
);
1041 if (U_FAILURE(status
)) { return FALSE
; }
1042 int32_t result
= n1
.countChar32() - n2
.countChar32();
1046 result
= n1
.compareCodePointOrder(n2
);
1050 return one
.compareCodePointOrder(other
) < 0;
1056 // Constructor & Destructor for AlphabeticIndex::Record
1058 // Records are internal only, instances are not directly surfaced in the public API.
1059 // This class is mostly struct-like, with all public fields.
1061 AlphabeticIndex::Record::Record(const UnicodeString
&name
, const void *data
)
1062 : name_(name
), data_(data
) {}
1064 AlphabeticIndex::Record::~Record() {
1068 AlphabeticIndex
& AlphabeticIndex::addRecord(const UnicodeString
&name
, const void *data
, UErrorCode
&status
) {
1069 if (U_FAILURE(status
)) {
1072 if (inputList_
== NULL
) {
1073 inputList_
= new UVector(status
);
1074 if (inputList_
== NULL
) {
1075 status
= U_MEMORY_ALLOCATION_ERROR
;
1078 inputList_
->setDeleter(alphaIndex_deleteRecord
);
1080 Record
*r
= new Record(name
, data
);
1082 status
= U_MEMORY_ALLOCATION_ERROR
;
1085 inputList_
->addElement(r
, status
);
1089 //std::cout << "added record: name = \"" << r->name_.toUTF8String(ss) << "\"" <<
1090 // " sortingName = \"" << r->sortingName_.toUTF8String(ss2) << "\"" << std::endl;
1095 AlphabeticIndex
&AlphabeticIndex::clearRecords(UErrorCode
&status
) {
1096 if (U_SUCCESS(status
) && inputList_
!= NULL
&& !inputList_
->isEmpty()) {
1097 inputList_
->removeAllElements();
1103 int32_t AlphabeticIndex::getBucketIndex(const UnicodeString
&name
, UErrorCode
&status
) {
1104 initBuckets(status
);
1105 if (U_FAILURE(status
)) {
1108 return buckets_
->getBucketIndex(name
, *collatorPrimaryOnly_
, status
);
1112 int32_t AlphabeticIndex::getBucketIndex() const {
1113 return labelsIterIndex_
;
1117 UBool
AlphabeticIndex::nextBucket(UErrorCode
&status
) {
1118 if (U_FAILURE(status
)) {
1121 if (buckets_
== NULL
&& currentBucket_
!= NULL
) {
1122 status
= U_ENUM_OUT_OF_SYNC_ERROR
;
1125 initBuckets(status
);
1126 if (U_FAILURE(status
)) {
1130 if (labelsIterIndex_
>= buckets_
->getBucketCount()) {
1131 labelsIterIndex_
= buckets_
->getBucketCount();
1134 currentBucket_
= getBucket(*buckets_
->immutableVisibleList_
, labelsIterIndex_
);
1135 resetRecordIterator();
1139 const UnicodeString
&AlphabeticIndex::getBucketLabel() const {
1140 if (currentBucket_
!= NULL
) {
1141 return currentBucket_
->label_
;
1143 return emptyString_
;
1148 UAlphabeticIndexLabelType
AlphabeticIndex::getBucketLabelType() const {
1149 if (currentBucket_
!= NULL
) {
1150 return currentBucket_
->labelType_
;
1152 return U_ALPHAINDEX_NORMAL
;
1157 int32_t AlphabeticIndex::getBucketRecordCount() const {
1158 if (currentBucket_
!= NULL
&& currentBucket_
->records_
!= NULL
) {
1159 return currentBucket_
->records_
->size();
1166 AlphabeticIndex
&AlphabeticIndex::resetBucketIterator(UErrorCode
&status
) {
1167 if (U_FAILURE(status
)) {
1170 internalResetBucketIterator();
1175 UBool
AlphabeticIndex::nextRecord(UErrorCode
&status
) {
1176 if (U_FAILURE(status
)) {
1179 if (currentBucket_
== NULL
) {
1180 // We are trying to iterate over the items in a bucket, but there is no
1181 // current bucket from the enumeration of buckets.
1182 status
= U_INVALID_STATE_ERROR
;
1185 if (buckets_
== NULL
) {
1186 status
= U_ENUM_OUT_OF_SYNC_ERROR
;
1189 if (currentBucket_
->records_
== NULL
) {
1193 if (itemsIterIndex_
>= currentBucket_
->records_
->size()) {
1194 itemsIterIndex_
= currentBucket_
->records_
->size();
1201 const UnicodeString
&AlphabeticIndex::getRecordName() const {
1202 const UnicodeString
*retStr
= &emptyString_
;
1203 if (currentBucket_
!= NULL
&& currentBucket_
->records_
!= NULL
&&
1204 itemsIterIndex_
>= 0 &&
1205 itemsIterIndex_
< currentBucket_
->records_
->size()) {
1206 Record
*item
= static_cast<Record
*>(currentBucket_
->records_
->elementAt(itemsIterIndex_
));
1207 retStr
= &item
->name_
;
1212 const void *AlphabeticIndex::getRecordData() const {
1213 const void *retPtr
= NULL
;
1214 if (currentBucket_
!= NULL
&& currentBucket_
->records_
!= NULL
&&
1215 itemsIterIndex_
>= 0 &&
1216 itemsIterIndex_
< currentBucket_
->records_
->size()) {
1217 Record
*item
= static_cast<Record
*>(currentBucket_
->records_
->elementAt(itemsIterIndex_
));
1218 retPtr
= item
->data_
;
1224 AlphabeticIndex
& AlphabeticIndex::resetRecordIterator() {
1225 itemsIterIndex_
= -1;
1231 AlphabeticIndex::Bucket::Bucket(const UnicodeString
&label
,
1232 const UnicodeString
&lowerBoundary
,
1233 UAlphabeticIndexLabelType type
)
1234 : label_(label
), lowerBoundary_(lowerBoundary
), labelType_(type
),
1235 displayBucket_(NULL
), displayIndex_(-1),
1240 AlphabeticIndex::Bucket::~Bucket() {
1246 #endif // !UCONFIG_NO_COLLATION