]> git.saurik.com Git - apple/icu.git/blame - icuSources/i18n/alphaindex.cpp
ICU-511.35.tar.gz
[apple/icu.git] / icuSources / i18n / alphaindex.cpp
CommitLineData
4388f060
A
1/*
2*******************************************************************************
51004dcb
A
3* Copyright (C) 2009-2013, International Business Machines Corporation and
4* others. All Rights Reserved.
4388f060
A
5*******************************************************************************
6*/
7
4388f060
A
8#include "unicode/utypes.h"
9
10#if !UCONFIG_NO_COLLATION && !UCONFIG_NO_NORMALIZATION
11
12#include "unicode/alphaindex.h"
51004dcb 13#include "unicode/coleitr.h"
4388f060 14#include "unicode/coll.h"
51004dcb 15#include "unicode/localpointer.h"
4388f060 16#include "unicode/normalizer2.h"
4388f060
A
17#include "unicode/tblcoll.h"
18#include "unicode/ulocdata.h"
19#include "unicode/uniset.h"
20#include "unicode/uobject.h"
4388f060 21#include "unicode/usetiter.h"
4388f060
A
22#include "unicode/utf16.h"
23
51004dcb 24#include "cmemory.h"
4388f060 25#include "cstring.h"
4388f060 26#include "uassert.h"
4388f060
A
27#include "uvector.h"
28
29//#include <string>
30//#include <iostream>
51004dcb
A
31
32#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
33
4388f060
A
34U_NAMESPACE_BEGIN
35
51004dcb 36namespace {
4388f060 37
51004dcb
A
38/**
39 * Prefix string for Chinese index buckets.
40 * See http://unicode.org/repos/cldr/trunk/specs/ldml/tr35-collation.html#Collation_Indexes
41 */
42const UChar BASE[1] = { 0xFDD0 };
43const int32_t BASE_LENGTH = 1;
44
45UBool isOneLabelBetterThanOther(const Normalizer2 &nfkdNormalizer,
46 const UnicodeString &one, const UnicodeString &other);
47
48} // namespace
4388f060
A
49
50static int32_t U_CALLCONV
51004dcb 51collatorComparator(const void *context, const void *left, const void *right);
4388f060
A
52
53static int32_t U_CALLCONV
54recordCompareFn(const void *context, const void *left, const void *right);
55
4388f060
A
56// UVector<Record *> support function, delete a Record.
57static void U_CALLCONV
58alphaIndex_deleteRecord(void *obj) {
59 delete static_cast<AlphabeticIndex::Record *>(obj);
60}
61
51004dcb 62namespace {
4388f060 63
51004dcb
A
64UnicodeString *ownedString(const UnicodeString &s, LocalPointer<UnicodeString> &owned,
65 UErrorCode &errorCode) {
66 if (U_FAILURE(errorCode)) { return NULL; }
67 if (owned.isValid()) {
68 return owned.orphan();
69 }
70 UnicodeString *p = new UnicodeString(s);
71 if (p == NULL) {
72 errorCode = U_MEMORY_ALLOCATION_ERROR;
73 }
74 return p;
75}
4388f060 76
51004dcb
A
77inline UnicodeString *getString(const UVector &list, int32_t i) {
78 return static_cast<UnicodeString *>(list[i]);
79}
4388f060 80
51004dcb
A
81inline AlphabeticIndex::Bucket *getBucket(const UVector &list, int32_t i) {
82 return static_cast<AlphabeticIndex::Bucket *>(list[i]);
83}
4388f060 84
51004dcb
A
85inline AlphabeticIndex::Record *getRecord(const UVector &list, int32_t i) {
86 return static_cast<AlphabeticIndex::Record *>(list[i]);
87}
88
89/**
90 * Like Java Collections.binarySearch(List, String, Comparator).
91 *
92 * @return the index>=0 where the item was found,
93 * or the index<0 for inserting the string at ~index in sorted order
94 */
95int32_t binarySearch(const UVector &list, const UnicodeString &s, const Collator &coll) {
96 if (list.size() == 0) { return ~0; }
97 int32_t start = 0;
98 int32_t limit = list.size();
99 for (;;) {
100 int32_t i = (start + limit) / 2;
101 const UnicodeString *si = static_cast<UnicodeString *>(list.elementAt(i));
102 UErrorCode errorCode = U_ZERO_ERROR;
103 UCollationResult cmp = coll.compare(s, *si, errorCode);
104 if (cmp == UCOL_EQUAL) {
105 return i;
106 } else if (cmp < 0) {
107 if (i == start) {
108 return ~start; // insert s before *si
109 }
110 limit = i;
111 } else {
112 if (i == start) {
113 return ~(start + 1); // insert s after *si
114 }
115 start = i;
116 }
4388f060
A
117 }
118}
119
51004dcb
A
120} // namespace
121
122// The BucketList is not in the anonymous namespace because only Clang
123// seems to support its use in other classes from there.
124// However, we also don't need U_I18N_API because it is not used from outside the i18n library.
125class BucketList : public UObject {
126public:
127 BucketList(UVector *bucketList, UVector *publicBucketList)
128 : bucketList_(bucketList), immutableVisibleList_(publicBucketList) {
129 int32_t displayIndex = 0;
130 for (int32_t i = 0; i < publicBucketList->size(); ++i) {
131 getBucket(*publicBucketList, i)->displayIndex_ = displayIndex++;
132 }
133 }
4388f060 134
51004dcb
A
135 // The virtual destructor must not be inline.
136 // See ticket #8454 for details.
137 virtual ~BucketList();
138
139 int32_t getBucketCount() const {
140 return immutableVisibleList_->size();
4388f060 141 }
4388f060 142
51004dcb
A
143 int32_t getBucketIndex(const UnicodeString &name, const Collator &collatorPrimaryOnly,
144 UErrorCode &errorCode) {
145 // binary search
146 int32_t start = 0;
147 int32_t limit = bucketList_->size();
148 while ((start + 1) < limit) {
149 int32_t i = (start + limit) / 2;
150 const AlphabeticIndex::Bucket *bucket = getBucket(*bucketList_, i);
151 UCollationResult nameVsBucket =
152 collatorPrimaryOnly.compare(name, bucket->lowerBoundary_, errorCode);
153 if (nameVsBucket < 0) {
154 limit = i;
155 } else {
156 start = i;
157 }
158 }
159 const AlphabeticIndex::Bucket *bucket = getBucket(*bucketList_, start);
160 if (bucket->displayBucket_ != NULL) {
161 bucket = bucket->displayBucket_;
162 }
163 return bucket->displayIndex_;
4388f060 164 }
51004dcb
A
165
166 /** All of the buckets, visible and invisible. */
167 UVector *bucketList_;
168 /** Just the visible buckets. */
169 UVector *immutableVisibleList_;
170};
171
172BucketList::~BucketList() {
173 delete bucketList_;
174 if (immutableVisibleList_ != bucketList_) {
175 delete immutableVisibleList_;
4388f060 176 }
51004dcb
A
177}
178
179AlphabeticIndex::ImmutableIndex::~ImmutableIndex() {
180 delete buckets_;
181 delete collatorPrimaryOnly_;
182}
183
184int32_t
185AlphabeticIndex::ImmutableIndex::getBucketCount() const {
186 return buckets_->getBucketCount();
187}
188
189int32_t
190AlphabeticIndex::ImmutableIndex::getBucketIndex(
191 const UnicodeString &name, UErrorCode &errorCode) const {
192 return buckets_->getBucketIndex(name, *collatorPrimaryOnly_, errorCode);
193}
194
195const AlphabeticIndex::Bucket *
196AlphabeticIndex::ImmutableIndex::getBucket(int32_t index) const {
197 if (0 <= index && index < buckets_->getBucketCount()) {
198 return icu::getBucket(*buckets_->immutableVisibleList_, index);
199 } else {
200 return NULL;
4388f060 201 }
4388f060
A
202}
203
51004dcb
A
204AlphabeticIndex::AlphabeticIndex(const Locale &locale, UErrorCode &status)
205 : inputList_(NULL),
206 labelsIterIndex_(-1), itemsIterIndex_(0), currentBucket_(NULL),
207 maxLabelCount_(99),
208 initialLabels_(NULL), firstCharsInScripts_(NULL),
209 collator_(NULL), collatorPrimaryOnly_(NULL),
210 buckets_(NULL) {
211 init(&locale, status);
212}
213
214
215AlphabeticIndex::AlphabeticIndex(RuleBasedCollator *collator, UErrorCode &status)
216 : inputList_(NULL),
217 labelsIterIndex_(-1), itemsIterIndex_(0), currentBucket_(NULL),
218 maxLabelCount_(99),
219 initialLabels_(NULL), firstCharsInScripts_(NULL),
220 collator_(collator), collatorPrimaryOnly_(NULL),
221 buckets_(NULL) {
222 init(NULL, status);
223}
224
225
4388f060
A
226
227AlphabeticIndex::~AlphabeticIndex() {
4388f060
A
228 delete collator_;
229 delete collatorPrimaryOnly_;
51004dcb
A
230 delete firstCharsInScripts_;
231 delete buckets_;
232 delete inputList_;
4388f060
A
233 delete initialLabels_;
234}
235
236
237AlphabeticIndex &AlphabeticIndex::addLabels(const UnicodeSet &additions, UErrorCode &status) {
238 if (U_FAILURE(status)) {
239 return *this;
240 }
241 initialLabels_->addAll(additions);
51004dcb 242 clearBuckets();
4388f060
A
243 return *this;
244}
245
246
247AlphabeticIndex &AlphabeticIndex::addLabels(const Locale &locale, UErrorCode &status) {
51004dcb
A
248 addIndexExemplars(locale, status);
249 clearBuckets();
4388f060
A
250 return *this;
251}
252
253
51004dcb
A
254AlphabeticIndex::ImmutableIndex *AlphabeticIndex::buildImmutableIndex(UErrorCode &errorCode) {
255 if (U_FAILURE(errorCode)) { return NULL; }
256 // In C++, the ImmutableIndex must own its copy of the BucketList,
257 // even if it contains no records, for proper memory management.
258 // We could clone the buckets_ if they are not NULL,
259 // but that would be worth it only if this method is called multiple times,
260 // or called after using the old-style bucket iterator API.
261 LocalPointer<BucketList> immutableBucketList(createBucketList(errorCode));
262 LocalPointer<RuleBasedCollator> coll(
263 static_cast<RuleBasedCollator *>(collatorPrimaryOnly_->clone()));
264 if (immutableBucketList.isNull() || coll.isNull()) {
265 errorCode = U_MEMORY_ALLOCATION_ERROR;
266 return NULL;
267 }
268 ImmutableIndex *immIndex = new ImmutableIndex(immutableBucketList.getAlias(), coll.getAlias());
269 if (immIndex == NULL) {
270 errorCode = U_MEMORY_ALLOCATION_ERROR;
271 return NULL;
272 }
273 // The ImmutableIndex adopted its parameter objects.
274 immutableBucketList.orphan();
275 coll.orphan();
276 return immIndex;
277}
278
4388f060 279int32_t AlphabeticIndex::getBucketCount(UErrorCode &status) {
51004dcb 280 initBuckets(status);
4388f060
A
281 if (U_FAILURE(status)) {
282 return 0;
283 }
51004dcb 284 return buckets_->getBucketCount();
4388f060
A
285}
286
287
288int32_t AlphabeticIndex::getRecordCount(UErrorCode &status) {
51004dcb 289 if (U_FAILURE(status) || inputList_ == NULL) {
4388f060
A
290 return 0;
291 }
51004dcb 292 return inputList_->size();
4388f060
A
293}
294
51004dcb
A
295void AlphabeticIndex::initLabels(UVector &indexCharacters, UErrorCode &errorCode) const {
296 const Normalizer2 *nfkdNormalizer = Normalizer2::getNFKDInstance(errorCode);
297 if (U_FAILURE(errorCode)) { return; }
298
299 const UnicodeString &firstScriptBoundary = *getString(*firstCharsInScripts_, 0);
300 const UnicodeString &overflowBoundary =
301 *getString(*firstCharsInScripts_, firstCharsInScripts_->size() - 1);
302
303 // We make a sorted array of elements.
304 // Some of the input may be redundant.
305 // That is, we might have c, ch, d, where "ch" sorts just like "c", "h".
306 // We filter out those cases.
307 UnicodeSetIterator iter(*initialLabels_);
308 while (iter.next()) {
309 const UnicodeString *item = &iter.getString();
310 LocalPointer<UnicodeString> ownedItem;
311 UBool checkDistinct;
312 int32_t itemLength = item->length();
313 if (!item->hasMoreChar32Than(0, itemLength, 1)) {
314 checkDistinct = FALSE;
315 } else if(item->charAt(itemLength - 1) == 0x2a && // '*'
316 item->charAt(itemLength - 2) != 0x2a) {
317 // Use a label if it is marked with one trailing star,
318 // even if the label string sorts the same when all contractions are suppressed.
319 ownedItem.adoptInstead(new UnicodeString(*item, 0, itemLength - 1));
320 item = ownedItem.getAlias();
321 if (item == NULL) {
322 errorCode = U_MEMORY_ALLOCATION_ERROR;
323 return;
4388f060 324 }
51004dcb 325 checkDistinct = FALSE;
4388f060 326 } else {
51004dcb
A
327 checkDistinct = TRUE;
328 }
329 if (collatorPrimaryOnly_->compare(*item, firstScriptBoundary, errorCode) < 0) {
330 // Ignore a primary-ignorable or non-alphabetic index character.
331 } else if (collatorPrimaryOnly_->compare(*item, overflowBoundary, errorCode) >= 0) {
332 // Ignore an index characters that will land in the overflow bucket.
333 } else if (checkDistinct &&
334 collatorPrimaryOnly_->compare(*item, separated(*item), errorCode) == 0) {
335 // Ignore a multi-code point index character that does not sort distinctly
336 // from the sequence of its separate characters.
337 } else {
338 int32_t insertionPoint = binarySearch(indexCharacters, *item, *collatorPrimaryOnly_);
339 if (insertionPoint < 0) {
340 indexCharacters.insertElementAt(
341 ownedString(*item, ownedItem, errorCode), ~insertionPoint, errorCode);
342 } else {
343 const UnicodeString &itemAlreadyIn = *getString(indexCharacters, insertionPoint);
344 if (isOneLabelBetterThanOther(*nfkdNormalizer, *item, itemAlreadyIn)) {
345 indexCharacters.setElementAt(
346 ownedString(*item, ownedItem, errorCode), insertionPoint);
347 }
348 }
4388f060
A
349 }
350 }
51004dcb 351 if (U_FAILURE(errorCode)) { return; }
4388f060 352
51004dcb 353 // if the result is still too large, cut down to maxCount elements, by removing every nth element
4388f060 354
51004dcb 355 int32_t size = indexCharacters.size() - 1;
4388f060 356 if (size > maxLabelCount_) {
4388f060
A
357 int32_t count = 0;
358 int32_t old = -1;
51004dcb 359 for (int32_t i = 0; i < indexCharacters.size();) {
4388f060 360 ++count;
51004dcb 361 int32_t bump = count * maxLabelCount_ / size;
4388f060 362 if (bump == old) {
51004dcb 363 indexCharacters.removeElementAt(i);
4388f060 364 } else {
4388f060 365 old = bump;
51004dcb 366 ++i;
4388f060
A
367 }
368 }
4388f060 369 }
51004dcb 370}
4388f060 371
51004dcb
A
372namespace {
373
374const UnicodeString &fixLabel(const UnicodeString &current, UnicodeString &temp) {
375 if (!current.startsWith(BASE, BASE_LENGTH)) {
376 return current;
377 }
378 UChar rest = current.charAt(BASE_LENGTH);
379 if (0x2800 < rest && rest <= 0x28FF) { // stroke count
380 int32_t count = rest-0x2800;
381 temp.setTo((UChar)(0x30 + count % 10));
382 if (count >= 10) {
383 count /= 10;
384 temp.insert(0, (UChar)(0x30 + count % 10));
385 if (count >= 10) {
386 count /= 10;
387 temp.insert(0, (UChar)(0x30 + count));
388 }
389 }
390 return temp.append((UChar)0x5283);
391 }
392 return temp.setTo(current, BASE_LENGTH);
4388f060
A
393}
394
51004dcb
A
395UBool hasMultiplePrimaryWeights(
396 CollationElementIterator &cei, int32_t variableTop,
397 const UnicodeString &s, UErrorCode &errorCode) {
398 cei.setText(s, errorCode);
399 UBool seenPrimary = FALSE;
400 for (;;) {
401 int32_t ce32 = cei.next(errorCode);
402 if (ce32 == CollationElementIterator::NULLORDER) {
403 break;
404 }
405 int32_t p = CollationElementIterator::primaryOrder(ce32);
406 if (p > variableTop && (ce32 & 0xc0) != 0xc0) {
407 // not primary ignorable, and not a continuation CE
408 if (seenPrimary) {
409 return TRUE;
4388f060 410 }
51004dcb 411 seenPrimary = TRUE;
4388f060 412 }
51004dcb
A
413 }
414 return FALSE;
4388f060
A
415}
416
51004dcb 417} // namespace
4388f060 418
51004dcb
A
419BucketList *AlphabeticIndex::createBucketList(UErrorCode &errorCode) const {
420 // Initialize indexCharacters.
421 UVector indexCharacters(errorCode);
422 indexCharacters.setDeleter(uprv_deleteUObject);
423 initLabels(indexCharacters, errorCode);
424 if (U_FAILURE(errorCode)) { return NULL; }
425
426 // Variables for hasMultiplePrimaryWeights().
427 LocalPointer<CollationElementIterator> cei(
428 collatorPrimaryOnly_->createCollationElementIterator(emptyString_));
429 if (cei.isNull()) {
430 errorCode = U_MEMORY_ALLOCATION_ERROR;
431 return NULL;
432 }
433 int32_t variableTop;
434 if (collatorPrimaryOnly_->getAttribute(UCOL_ALTERNATE_HANDLING, errorCode) == UCOL_SHIFTED) {
435 variableTop = CollationElementIterator::primaryOrder(
436 (int32_t)collatorPrimaryOnly_->getVariableTop(errorCode));
437 } else {
438 variableTop = 0;
439 }
440 UBool hasInvisibleBuckets = FALSE;
441
442 // Helper arrays for Chinese Pinyin collation.
443 Bucket *asciiBuckets[26] = {
444 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
445 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL
446 };
447 Bucket *pinyinBuckets[26] = {
448 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
449 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL
450 };
451 UBool hasPinyin = FALSE;
452
453 LocalPointer<UVector> bucketList(new UVector(errorCode));
454 if (bucketList.isNull()) {
455 errorCode = U_MEMORY_ALLOCATION_ERROR;
456 return NULL;
457 }
458 bucketList->setDeleter(uprv_deleteUObject);
459
460 // underflow bucket
461 Bucket *bucket = new Bucket(getUnderflowLabel(), emptyString_, U_ALPHAINDEX_UNDERFLOW);
462 if (bucket == NULL) {
463 errorCode = U_MEMORY_ALLOCATION_ERROR;
464 return NULL;
465 }
466 bucketList->addElement(bucket, errorCode);
467 if (U_FAILURE(errorCode)) { return NULL; }
468
469 UnicodeString temp;
470
471 // fix up the list, adding underflow, additions, overflow
472 // Insert inflow labels as needed.
473 int32_t scriptIndex = -1;
474 const UnicodeString *scriptUpperBoundary = &emptyString_;
475 for (int32_t i = 0; i < indexCharacters.size(); ++i) {
476 UnicodeString &current = *getString(indexCharacters, i);
477 if (collatorPrimaryOnly_->compare(current, *scriptUpperBoundary, errorCode) >= 0) {
478 // We crossed the script boundary into a new script.
479 const UnicodeString &inflowBoundary = *scriptUpperBoundary;
480 UBool skippedScript = FALSE;
481 for (;;) {
482 scriptUpperBoundary = getString(*firstCharsInScripts_, ++scriptIndex);
483 if (collatorPrimaryOnly_->compare(current, *scriptUpperBoundary, errorCode) < 0) {
484 break;
485 }
486 skippedScript = TRUE;
487 }
488 if (skippedScript && bucketList->size() > 1) {
489 // We are skipping one or more scripts,
490 // and we are not just getting out of the underflow label.
491 bucket = new Bucket(getInflowLabel(), inflowBoundary, U_ALPHAINDEX_INFLOW);
492 if (bucket == NULL) {
493 errorCode = U_MEMORY_ALLOCATION_ERROR;
494 return NULL;
495 }
496 bucketList->addElement(bucket, errorCode);
497 }
498 }
499 // Add a bucket with the current label.
500 bucket = new Bucket(fixLabel(current, temp), current, U_ALPHAINDEX_NORMAL);
501 if (bucket == NULL) {
502 errorCode = U_MEMORY_ALLOCATION_ERROR;
503 return NULL;
504 }
505 bucketList->addElement(bucket, errorCode);
506 // Remember ASCII and Pinyin buckets for Pinyin redirects.
507 UChar c;
508 if (current.length() == 1 && 0x41 <= (c = current.charAt(0)) && c <= 0x5A) { // A-Z
509 asciiBuckets[c - 0x41] = bucket;
510 } else if (current.length() == BASE_LENGTH + 1 && current.startsWith(BASE, BASE_LENGTH) &&
511 0x41 <= (c = current.charAt(BASE_LENGTH)) && c <= 0x5A) {
512 pinyinBuckets[c - 0x41] = bucket;
513 hasPinyin = TRUE;
514 }
515 // Check for multiple primary weights.
516 if (!current.startsWith(BASE, BASE_LENGTH) &&
517 hasMultiplePrimaryWeights(*cei, variableTop, current, errorCode) &&
518 current.charAt(current.length() - 1) != 0xFFFF /* !current.endsWith("\uffff") */) {
519 // "AE-ligature" or "Sch" etc.
520 for (int32_t i = bucketList->size() - 2;; --i) {
521 Bucket *singleBucket = getBucket(*bucketList, i);
522 if (singleBucket->labelType_ != U_ALPHAINDEX_NORMAL) {
523 // There is no single-character bucket since the last
524 // underflow or inflow label.
525 break;
526 }
527 if (singleBucket->displayBucket_ == NULL &&
528 !hasMultiplePrimaryWeights(
529 *cei, variableTop, singleBucket->lowerBoundary_, errorCode)) {
530 // Add an invisible bucket that redirects strings greater than the expansion
531 // to the previous single-character bucket.
532 // For example, after ... Q R S Sch we add Sch\uFFFF->S
533 // and after ... Q R S Sch Sch\uFFFF St we add St\uFFFF->S.
534 bucket = new Bucket(emptyString_,
535 UnicodeString(current).append((UChar)0xFFFF),
536 U_ALPHAINDEX_NORMAL);
537 if (bucket == NULL) {
538 errorCode = U_MEMORY_ALLOCATION_ERROR;
539 return NULL;
540 }
541 bucket->displayBucket_ = singleBucket;
542 bucketList->addElement(bucket, errorCode);
543 hasInvisibleBuckets = TRUE;
544 break;
545 }
546 }
547 }
548 }
549 if (U_FAILURE(errorCode)) { return NULL; }
550 if (bucketList->size() == 1) {
551 // No real labels, show only the underflow label.
552 BucketList *bl = new BucketList(bucketList.getAlias(), bucketList.getAlias());
553 if (bl == NULL) {
554 errorCode = U_MEMORY_ALLOCATION_ERROR;
555 return NULL;
556 }
557 bucketList.orphan();
558 return bl;
559 }
560 // overflow bucket
561 bucket = new Bucket(getOverflowLabel(), *scriptUpperBoundary, U_ALPHAINDEX_OVERFLOW);
562 if (bucket == NULL) {
563 errorCode = U_MEMORY_ALLOCATION_ERROR;
564 return NULL;
565 }
566 bucketList->addElement(bucket, errorCode); // final
567
568 if (hasPinyin) {
569 // Redirect Pinyin buckets.
570 Bucket *asciiBucket = NULL;
571 for (int32_t i = 0; i < 26; ++i) {
572 if (asciiBuckets[i] != NULL) {
573 asciiBucket = asciiBuckets[i];
574 }
575 if (pinyinBuckets[i] != NULL && asciiBucket != NULL) {
576 pinyinBuckets[i]->displayBucket_ = asciiBucket;
577 hasInvisibleBuckets = TRUE;
578 }
579 }
580 }
581
582 if (U_FAILURE(errorCode)) { return NULL; }
583 if (!hasInvisibleBuckets) {
584 BucketList *bl = new BucketList(bucketList.getAlias(), bucketList.getAlias());
585 if (bl == NULL) {
586 errorCode = U_MEMORY_ALLOCATION_ERROR;
587 return NULL;
588 }
589 bucketList.orphan();
590 return bl;
591 }
592 // Merge inflow buckets that are visually adjacent.
593 // Iterate backwards: Merge inflow into overflow rather than the other way around.
594 int32_t i = bucketList->size() - 1;
595 Bucket *nextBucket = getBucket(*bucketList, i);
596 while (--i > 0) {
597 bucket = getBucket(*bucketList, i);
598 if (bucket->displayBucket_ != NULL) {
599 continue; // skip invisible buckets
600 }
601 if (bucket->labelType_ == U_ALPHAINDEX_INFLOW) {
602 if (nextBucket->labelType_ != U_ALPHAINDEX_NORMAL) {
603 bucket->displayBucket_ = nextBucket;
604 continue;
605 }
606 }
607 nextBucket = bucket;
608 }
609
610 LocalPointer<UVector> publicBucketList(new UVector(errorCode));
611 if (bucketList.isNull()) {
612 errorCode = U_MEMORY_ALLOCATION_ERROR;
613 return NULL;
614 }
615 // Do not call publicBucketList->setDeleter():
616 // This vector shares its objects with the bucketList.
617 for (int32_t i = 0; i < bucketList->size(); ++i) {
618 bucket = getBucket(*bucketList, i);
619 if (bucket->displayBucket_ == NULL) {
620 publicBucketList->addElement(bucket, errorCode);
621 }
622 }
623 if (U_FAILURE(errorCode)) { return NULL; }
624 BucketList *bl = new BucketList(bucketList.getAlias(), publicBucketList.getAlias());
625 if (bl == NULL) {
626 errorCode = U_MEMORY_ALLOCATION_ERROR;
627 return NULL;
628 }
629 bucketList.orphan();
630 publicBucketList.orphan();
631 return bl;
632}
633
634/**
635 * Creates an index, and buckets and sorts the list of records into the index.
636 */
637void AlphabeticIndex::initBuckets(UErrorCode &errorCode) {
638 if (U_FAILURE(errorCode) || buckets_ != NULL) {
639 return;
640 }
641 buckets_ = createBucketList(errorCode);
642 if (U_FAILURE(errorCode) || inputList_ == NULL || inputList_->isEmpty()) {
4388f060
A
643 return;
644 }
645
51004dcb
A
646 // Sort the records by name.
647 // Stable sort preserves input order of collation duplicates.
648 inputList_->sortWithUComparator(recordCompareFn, collator_, errorCode);
649
650 // Now, we traverse all of the input, which is now sorted.
651 // If the item doesn't go in the current bucket, we find the next bucket that contains it.
652 // This makes the process order n*log(n), since we just sort the list and then do a linear process.
653 // However, if the user adds an item at a time and then gets the buckets, this isn't efficient, so
654 // we need to improve it for that case.
655
656 Bucket *currentBucket = getBucket(*buckets_->bucketList_, 0);
657 int32_t bucketIndex = 1;
658 Bucket *nextBucket;
659 const UnicodeString *upperBoundary;
660 if (bucketIndex < buckets_->bucketList_->size()) {
661 nextBucket = getBucket(*buckets_->bucketList_, bucketIndex++);
662 upperBoundary = &nextBucket->lowerBoundary_;
663 } else {
664 nextBucket = NULL;
665 upperBoundary = NULL;
666 }
667 for (int32_t i = 0; i < inputList_->size(); ++i) {
668 Record *r = getRecord(*inputList_, i);
669 // if the current bucket isn't the right one, find the one that is
670 // We have a special flag for the last bucket so that we don't look any further
671 while (upperBoundary != NULL &&
672 collatorPrimaryOnly_->compare(r->name_, *upperBoundary, errorCode) >= 0) {
673 currentBucket = nextBucket;
674 // now reset the boundary that we compare against
675 if (bucketIndex < buckets_->bucketList_->size()) {
676 nextBucket = getBucket(*buckets_->bucketList_, bucketIndex++);
677 upperBoundary = &nextBucket->lowerBoundary_;
4388f060 678 } else {
51004dcb 679 upperBoundary = NULL;
4388f060 680 }
4388f060 681 }
51004dcb
A
682 // now put the record into the bucket.
683 Bucket *bucket = currentBucket;
684 if (bucket->displayBucket_ != NULL) {
685 bucket = bucket->displayBucket_;
686 }
687 if (bucket->records_ == NULL) {
688 bucket->records_ = new UVector(errorCode);
689 if (bucket->records_ == NULL) {
690 errorCode = U_MEMORY_ALLOCATION_ERROR;
691 return;
692 }
693 }
694 bucket->records_->addElement(r, errorCode);
4388f060 695 }
51004dcb 696}
4388f060 697
51004dcb
A
698void AlphabeticIndex::clearBuckets() {
699 if (buckets_ != NULL) {
700 delete buckets_;
701 buckets_ = NULL;
702 internalResetBucketIterator();
703 }
4388f060
A
704}
705
51004dcb
A
706void AlphabeticIndex::internalResetBucketIterator() {
707 labelsIterIndex_ = -1;
708 currentBucket_ = NULL;
709}
4388f060 710
51004dcb
A
711
712void AlphabeticIndex::addIndexExemplars(const Locale &locale, UErrorCode &status) {
713 if (U_FAILURE(status)) { return; }
714 // Chinese index characters, which are specific to each of the several Chinese tailorings,
715 // take precedence over the single locale data exemplar set per language.
716 const char *language = locale.getLanguage();
717 if (uprv_strcmp(language, "zh") == 0 || uprv_strcmp(language, "ja") == 0 ||
718 uprv_strcmp(language, "ko") == 0) {
719 // TODO: This should be done regardless of the language, but it's expensive.
720 // We should add a Collator function (can be @internal)
721 // to enumerate just the contractions that start with a given code point or string.
722 if (addChineseIndexCharacters(status) || U_FAILURE(status)) {
723 return;
724 }
725 }
726
727 LocalULocaleDataPointer uld(ulocdata_open(locale.getName(), &status));
4388f060
A
728 if (U_FAILURE(status)) {
729 return;
730 }
731
4388f060
A
732 UnicodeSet exemplars;
733 ulocdata_getExemplarSet(uld.getAlias(), exemplars.toUSet(), 0, ULOCDATA_ES_INDEX, &status);
734 if (U_SUCCESS(status)) {
51004dcb 735 initialLabels_->addAll(exemplars);
4388f060
A
736 return;
737 }
738 status = U_ZERO_ERROR; // Clear out U_MISSING_RESOURCE_ERROR
739
51004dcb 740 // The locale data did not include explicit Index characters.
4388f060 741 // Synthesize a set of them from the locale's standard exemplar characters.
4388f060
A
742 ulocdata_getExemplarSet(uld.getAlias(), exemplars.toUSet(), 0, ULOCDATA_ES_STANDARD, &status);
743 if (U_FAILURE(status)) {
744 return;
745 }
746
51004dcb
A
747 // question: should we add auxiliary exemplars?
748 if (exemplars.containsSome(0x61, 0x7A) /* a-z */ || exemplars.size() == 0) {
749 exemplars.add(0x61, 0x7A);
750 }
751 if (exemplars.containsSome(0xAC00, 0xD7A3)) { // Hangul syllables
752 // cut down to small list
753 exemplars.remove(0xAC00, 0xD7A3).
754 add(0xAC00).add(0xB098).add(0xB2E4).add(0xB77C).
755 add(0xB9C8).add(0xBC14).add(0xC0AC).add(0xC544).
756 add(0xC790).add(0xCC28).add(0xCE74).add(0xD0C0).
757 add(0xD30C).add(0xD558);
758 }
759 if (exemplars.containsSome(0x1200, 0x137F)) { // Ethiopic block
760 // cut down to small list
761 // make use of the fact that Ethiopic is allocated in 8's, where
762 // the base is 0 mod 8.
763 UnicodeSet ethiopic(
764 UNICODE_STRING_SIMPLE("[[:Block=Ethiopic:]&[:Script=Ethiopic:]]"), status);
765 UnicodeSetIterator it(ethiopic);
766 while (it.next() && !it.isString()) {
767 if ((it.getCodepoint() & 0x7) != 0) {
768 exemplars.remove(it.getCodepoint());
769 }
770 }
771 }
772
4388f060
A
773 // Upper-case any that aren't already so.
774 // (We only do this for synthesized index characters.)
4388f060
A
775 UnicodeSetIterator it(exemplars);
776 UnicodeString upperC;
4388f060
A
777 while (it.next()) {
778 const UnicodeString &exemplarC = it.getString();
779 upperC = exemplarC;
780 upperC.toUpper(locale);
51004dcb 781 initialLabels_->add(upperC);
4388f060 782 }
51004dcb 783}
4388f060 784
51004dcb
A
785UBool AlphabeticIndex::addChineseIndexCharacters(UErrorCode &errorCode) {
786 UnicodeSet contractions;
787 ucol_getContractionsAndExpansions(collatorPrimaryOnly_->getUCollator(),
788 contractions.toUSet(), NULL, FALSE, &errorCode);
789 if (U_FAILURE(errorCode)) { return FALSE; }
790 UnicodeString firstHanBoundary;
791 UBool hasPinyin = FALSE;
792 UnicodeSetIterator iter(contractions);
793 while (iter.next()) {
794 const UnicodeString &s = iter.getString();
795 if (s.startsWith(BASE, BASE_LENGTH)) {
796 initialLabels_->add(s);
797 if (firstHanBoundary.isEmpty() ||
798 collatorPrimaryOnly_->compare(s, firstHanBoundary, errorCode) < 0) {
799 firstHanBoundary = s;
800 }
801 UChar c = s.charAt(s.length() - 1);
802 if (0x41 <= c && c <= 0x5A) { // A-Z
803 hasPinyin = TRUE;
804 }
805 }
4388f060 806 }
51004dcb
A
807 if (hasPinyin) {
808 initialLabels_->add(0x41, 0x5A); // A-Z
809 }
810 if (!firstHanBoundary.isEmpty()) {
811 // The hardcoded list of script boundaries includes U+4E00
812 // which is tailored to not be the first primary
813 // in all Chinese tailorings except "unihan".
814 // Replace U+4E00 with the first boundary string from the tailoring.
815 // TODO: This becomes obsolete when the root collator gets
816 // reliable script-first-primary mappings.
817 int32_t hanIndex = binarySearch(
818 *firstCharsInScripts_, UnicodeString((UChar)0x4E00), *collatorPrimaryOnly_);
819 if (hanIndex >= 0) {
820 UnicodeString *fh = new UnicodeString(firstHanBoundary);
821 if (fh == NULL) {
822 errorCode = U_MEMORY_ALLOCATION_ERROR;
823 return FALSE;
4388f060 824 }
51004dcb 825 firstCharsInScripts_->setElementAt(fh, hanIndex);
4388f060 826 }
51004dcb
A
827 return TRUE;
828 } else {
829 return FALSE;
4388f060 830 }
4388f060
A
831}
832
833
834/*
835 * Return the string with interspersed CGJs. Input must have more than 2 codepoints.
836 */
51004dcb 837static const UChar CGJ = 0x034F;
4388f060
A
838UnicodeString AlphabeticIndex::separated(const UnicodeString &item) {
839 UnicodeString result;
840 if (item.length() == 0) {
841 return result;
842 }
843 int32_t i = 0;
844 for (;;) {
845 UChar32 cp = item.char32At(i);
846 result.append(cp);
847 i = item.moveIndex32(i, 1);
848 if (i >= item.length()) {
849 break;
850 }
851 result.append(CGJ);
852 }
853 return result;
854}
855
856
857UBool AlphabeticIndex::operator==(const AlphabeticIndex& /* other */) const {
858 return FALSE;
859}
860
861
862UBool AlphabeticIndex::operator!=(const AlphabeticIndex& /* other */) const {
863 return FALSE;
864}
865
866
867const RuleBasedCollator &AlphabeticIndex::getCollator() const {
868 // There are no known non-RuleBasedCollator collators, and none ever expected.
869 // But, in case that changes, better a null pointer than a wrong type.
870 return *dynamic_cast<RuleBasedCollator *>(collator_);
871}
872
873
874const UnicodeString &AlphabeticIndex::getInflowLabel() const {
875 return inflowLabel_;
876}
877
878const UnicodeString &AlphabeticIndex::getOverflowLabel() const {
879 return overflowLabel_;
880}
881
882
883const UnicodeString &AlphabeticIndex::getUnderflowLabel() const {
884 return underflowLabel_;
885}
886
887
888AlphabeticIndex &AlphabeticIndex::setInflowLabel(const UnicodeString &label, UErrorCode &/*status*/) {
889 inflowLabel_ = label;
51004dcb 890 clearBuckets();
4388f060
A
891 return *this;
892}
893
894
895AlphabeticIndex &AlphabeticIndex::setOverflowLabel(const UnicodeString &label, UErrorCode &/*status*/) {
896 overflowLabel_ = label;
51004dcb 897 clearBuckets();
4388f060
A
898 return *this;
899}
900
901
902AlphabeticIndex &AlphabeticIndex::setUnderflowLabel(const UnicodeString &label, UErrorCode &/*status*/) {
903 underflowLabel_ = label;
51004dcb 904 clearBuckets();
4388f060
A
905 return *this;
906}
907
908
909int32_t AlphabeticIndex::getMaxLabelCount() const {
910 return maxLabelCount_;
911}
912
913
914AlphabeticIndex &AlphabeticIndex::setMaxLabelCount(int32_t maxLabelCount, UErrorCode &status) {
915 if (U_FAILURE(status)) {
916 return *this;
917 }
918 if (maxLabelCount <= 0) {
919 status = U_ILLEGAL_ARGUMENT_ERROR;
920 return *this;
921 }
922 maxLabelCount_ = maxLabelCount;
51004dcb 923 clearBuckets();
4388f060
A
924 return *this;
925}
926
927
4388f060
A
928//
929// init() - Common code for constructors.
930//
931
51004dcb
A
932void AlphabeticIndex::init(const Locale *locale, UErrorCode &status) {
933 if (U_FAILURE(status)) { return; }
934 if (locale == NULL && collator_ == NULL) {
935 status = U_ILLEGAL_ARGUMENT_ERROR;
4388f060
A
936 return;
937 }
51004dcb 938
4388f060 939 initialLabels_ = new UnicodeSet();
51004dcb
A
940 if (initialLabels_ == NULL) {
941 status = U_MEMORY_ALLOCATION_ERROR;
942 return;
943 }
4388f060 944
51004dcb 945 inflowLabel_.setTo((UChar)0x2026); // Ellipsis
4388f060
A
946 overflowLabel_ = inflowLabel_;
947 underflowLabel_ = inflowLabel_;
948
51004dcb
A
949 if (collator_ == NULL) {
950 collator_ = static_cast<RuleBasedCollator *>(Collator::createInstance(*locale, status));
951 if (U_FAILURE(status)) { return; }
952 if (collator_ == NULL) {
953 status = U_MEMORY_ALLOCATION_ERROR;
954 return;
955 }
956 }
957 collatorPrimaryOnly_ = static_cast<RuleBasedCollator *>(collator_->clone());
958 if (collatorPrimaryOnly_ == NULL) {
959 status = U_MEMORY_ALLOCATION_ERROR;
4388f060
A
960 return;
961 }
51004dcb
A
962 collatorPrimaryOnly_->setAttribute(UCOL_STRENGTH, UCOL_PRIMARY, status);
963 firstCharsInScripts_ = firstStringsInScript(status);
964 if (U_FAILURE(status)) { return; }
965 firstCharsInScripts_->sortWithUComparator(collatorComparator, collatorPrimaryOnly_, status);
966 UnicodeString _4E00((UChar)0x4E00);
967 UnicodeString _1100((UChar)0x1100);
968 UnicodeString _1112((UChar)0x1112);
969 if (collatorPrimaryOnly_->compare(_4E00, _1112, status) <= 0 &&
970 collatorPrimaryOnly_->compare(_1100, _4E00, status) <= 0) {
971 // The standard Korean tailoring sorts Hanja (Han characters)
972 // as secondary differences from Hangul syllables.
973 // This makes U+4E00 not useful as a Han-script boundary.
974 // TODO: This becomes obsolete when the root collator gets
975 // reliable script-first-primary mappings.
976 int32_t hanIndex = binarySearch(
977 *firstCharsInScripts_, _4E00, *collatorPrimaryOnly_);
978 if (hanIndex >= 0) {
979 firstCharsInScripts_->removeElementAt(hanIndex);
4388f060 980 }
51004dcb
A
981 }
982 // Guard against a degenerate collator where
983 // some script boundary strings are primary ignorable.
984 for (;;) {
985 if (U_FAILURE(status)) { return; }
986 if (firstCharsInScripts_->isEmpty()) {
987 // AlphabeticIndex requires some non-ignorable script boundary strings.
988 status = U_ILLEGAL_ARGUMENT_ERROR;
989 return;
4388f060 990 }
51004dcb
A
991 if (collatorPrimaryOnly_->compare(
992 *static_cast<UnicodeString *>(firstCharsInScripts_->elementAt(0)),
993 emptyString_, status) == UCOL_EQUAL) {
994 firstCharsInScripts_->removeElementAt(0);
995 } else {
996 break;
4388f060
A
997 }
998 }
4388f060 999
51004dcb
A
1000 if (locale != NULL) {
1001 addIndexExemplars(*locale, status);
4388f060 1002 }
4388f060
A
1003}
1004
1005
1006//
1007// Comparison function for UVector<UnicodeString *> sorting with a collator.
1008//
1009static int32_t U_CALLCONV
51004dcb 1010collatorComparator(const void *context, const void *left, const void *right) {
4388f060
A
1011 const UElement *leftElement = static_cast<const UElement *>(left);
1012 const UElement *rightElement = static_cast<const UElement *>(right);
1013 const UnicodeString *leftString = static_cast<const UnicodeString *>(leftElement->pointer);
1014 const UnicodeString *rightString = static_cast<const UnicodeString *>(rightElement->pointer);
4388f060
A
1015
1016 if (leftString == rightString) {
1017 // Catches case where both are NULL
1018 return 0;
1019 }
1020 if (leftString == NULL) {
1021 return 1;
1022 };
1023 if (rightString == NULL) {
1024 return -1;
1025 }
51004dcb
A
1026 const Collator *col = static_cast<const Collator *>(context);
1027 UErrorCode errorCode = U_ZERO_ERROR;
1028 return col->compare(*leftString, *rightString, errorCode);
4388f060
A
1029}
1030
1031//
1032// Comparison function for UVector<Record *> sorting with a collator.
1033//
1034static int32_t U_CALLCONV
1035recordCompareFn(const void *context, const void *left, const void *right) {
1036 const UElement *leftElement = static_cast<const UElement *>(left);
1037 const UElement *rightElement = static_cast<const UElement *>(right);
1038 const AlphabeticIndex::Record *leftRec = static_cast<const AlphabeticIndex::Record *>(leftElement->pointer);
1039 const AlphabeticIndex::Record *rightRec = static_cast<const AlphabeticIndex::Record *>(rightElement->pointer);
1040 const Collator *col = static_cast<const Collator *>(context);
51004dcb
A
1041 UErrorCode errorCode = U_ZERO_ERROR;
1042 return col->compare(leftRec->name_, rightRec->name_, errorCode);
4388f060
A
1043}
1044
1045
51004dcb
A
1046/**
1047 * This list contains one character per script that has the
1048 * lowest primary weight for that script in the root collator.
1049 * This list will be copied and sorted to account for script reordering.
1050 *
1051 * <p>TODO: This is fragile. If the first character of a script is tailored
1052 * so that it does not map to the script's lowest primary weight any more,
1053 * then the buckets will be off.
1054 * There are hacks in the code to handle the known CJK tailorings of U+4E00.
1055 *
1056 * <p>We use "A" not "a" because the en_US_POSIX tailoring sorts A primary-before a.
1057 *
1058 * Keep this in sync with HACK_FIRST_CHARS_IN_SCRIPTS in
1059 * ICU4J main/classes/collate/src/com/ibm/icu/text/AlphabeticIndex.java
1060 */
1061static const UChar HACK_FIRST_CHARS_IN_SCRIPTS[] = {
1062 0x41, 0, 0x03B1, 0,
1063 0x2C81, 0, 0x0430, 0, 0x2C30, 0, 0x10D0, 0, 0x0561, 0, 0x05D0, 0, 0xD802, 0xDD00, 0, 0x0800, 0, 0x0621, 0, 0x0710, 0,
1064 0x0780, 0, 0x07CA, 0, 0x2D30, 0, 0x1200, 0, 0x0950, 0, 0x0985, 0, 0x0A74, 0, 0x0AD0, 0, 0x0B05, 0, 0x0BD0, 0,
1065 0x0C05, 0, 0x0C85, 0, 0x0D05, 0, 0x0D85, 0,
1066 0xAAF2, 0, // Meetei Mayek
1067 0xA800, 0, 0xA882, 0, 0xD804, 0xDC83, 0,
1068 U16_LEAD(0x111C4), U16_TRAIL(0x111C4), 0, // Sharada
1069 U16_LEAD(0x11680), U16_TRAIL(0x11680), 0, // Takri
1070 0x1B83, 0,
1071 0xD802, 0xDE00, 0, 0x0E01, 0,
1072 0x0EDE, 0, // Lao
1073 0xAA80, 0, 0x0F40, 0, 0x1C00, 0, 0xA840, 0, 0x1900, 0, 0x1700, 0, 0x1720, 0,
1074 0x1740, 0, 0x1760, 0, 0x1A00, 0, 0xA930, 0, 0xA90A, 0, 0x1000, 0,
1075 U16_LEAD(0x11103), U16_TRAIL(0x11103), 0, // Chakma
1076 0x1780, 0, 0x1950, 0, 0x1980, 0, 0x1A20, 0,
1077 0xAA00, 0, 0x1B05, 0, 0xA984, 0, 0x1880, 0, 0x1C5A, 0, 0x13A0, 0, 0x1401, 0, 0x1681, 0, 0x16A0, 0, 0xD803, 0xDC00, 0,
1078 0xA500, 0, 0xA6A0, 0, 0x1100, 0, 0x3041, 0, 0x30A1, 0, 0x3105, 0, 0xA000, 0, 0xA4F8, 0,
1079 U16_LEAD(0x16F00), U16_TRAIL(0x16F00), 0, // Miao
1080 0xD800, 0xDE80, 0,
1081 0xD800, 0xDEA0, 0, 0xD802, 0xDD20, 0, 0xD800, 0xDF00, 0, 0xD800, 0xDF30, 0, 0xD801, 0xDC28, 0, 0xD801, 0xDC50, 0,
1082 0xD801, 0xDC80, 0,
1083 U16_LEAD(0x110D0), U16_TRAIL(0x110D0), 0, // Sora Sompeng
1084 0xD800, 0xDC00, 0, 0xD802, 0xDC00, 0, 0xD802, 0xDE60, 0, 0xD802, 0xDF00, 0, 0xD802, 0xDC40, 0,
1085 0xD802, 0xDF40, 0, 0xD802, 0xDF60, 0, 0xD800, 0xDF80, 0, 0xD800, 0xDFA0, 0, 0xD808, 0xDC00, 0, 0xD80C, 0xDC00, 0,
1086 U16_LEAD(0x109A0), U16_TRAIL(0x109A0), 0, // Meroitic Cursive
1087 U16_LEAD(0x10980), U16_TRAIL(0x10980), 0, // Meroitic Hieroglyphs
1088 0x4E00, 0,
1089 // TODO: The overflow bucket's lowerBoundary string should be the
1090 // first item after the last reordering group in the collator's script order.
1091 // This should normally be the first Unicode code point
1092 // that is unassigned (U+0378 in Unicode 6.3) and untailored.
1093 // However, at least up to ICU 51 the Hani reordering group includes
1094 // unassigned code points,
1095 // and there is no stable string for the start of the trailing-weights range.
1096 // The only known string that sorts "high" is U+FFFF.
1097 // When ICU separates Hani vs. unassigned reordering groups, we need to fix this,
1098 // and fix relevant test code.
1099 // Ideally, FractionalUCA.txt will have a "script first primary"
1100 // for unassigned code points.
1101 0xFFFF, 0
1102};
4388f060
A
1103
1104UVector *AlphabeticIndex::firstStringsInScript(UErrorCode &status) {
1105 if (U_FAILURE(status)) {
1106 return NULL;
1107 }
1108 UVector *dest = new UVector(status);
1109 if (dest == NULL) {
51004dcb 1110 status = U_MEMORY_ALLOCATION_ERROR;
4388f060
A
1111 return NULL;
1112 }
1113 dest->setDeleter(uprv_deleteUObject);
1114 const UChar *src = HACK_FIRST_CHARS_IN_SCRIPTS;
51004dcb 1115 const UChar *limit = src + LENGTHOF(HACK_FIRST_CHARS_IN_SCRIPTS);
4388f060
A
1116 do {
1117 if (U_FAILURE(status)) {
1118 return dest;
1119 }
1120 UnicodeString *str = new UnicodeString(src, -1);
1121 if (str == NULL) {
1122 status = U_MEMORY_ALLOCATION_ERROR;
51004dcb 1123 return dest;
4388f060 1124 }
51004dcb
A
1125 dest->addElement(str, status);
1126 src += str->length() + 1;
4388f060 1127 } while (src < limit);
4388f060
A
1128 return dest;
1129}
1130
1131
51004dcb 1132namespace {
4388f060
A
1133
1134/**
51004dcb
A
1135 * Returns true if one index character string is "better" than the other.
1136 * Shorter NFKD is better, and otherwise NFKD-binary-less-than is
1137 * better, and otherwise binary-less-than is better.
4388f060 1138 */
51004dcb
A
1139UBool isOneLabelBetterThanOther(const Normalizer2 &nfkdNormalizer,
1140 const UnicodeString &one, const UnicodeString &other) {
1141 // This is called with primary-equal strings, but never with one.equals(other).
1142 UErrorCode status = U_ZERO_ERROR;
1143 UnicodeString n1 = nfkdNormalizer.normalize(one, status);
1144 UnicodeString n2 = nfkdNormalizer.normalize(other, status);
1145 if (U_FAILURE(status)) { return FALSE; }
1146 int32_t result = n1.countChar32() - n2.countChar32();
4388f060 1147 if (result != 0) {
51004dcb 1148 return result < 0;
4388f060 1149 }
4388f060
A
1150 result = n1.compareCodePointOrder(n2);
1151 if (result != 0) {
51004dcb 1152 return result < 0;
4388f060 1153 }
51004dcb 1154 return one.compareCodePointOrder(other) < 0;
4388f060
A
1155}
1156
51004dcb 1157} // namespace
4388f060
A
1158
1159//
1160// Constructor & Destructor for AlphabeticIndex::Record
1161//
1162// Records are internal only, instances are not directly surfaced in the public API.
1163// This class is mostly struct-like, with all public fields.
1164
51004dcb
A
1165AlphabeticIndex::Record::Record(const UnicodeString &name, const void *data)
1166 : name_(name), data_(data) {}
1167
4388f060
A
1168AlphabeticIndex::Record::~Record() {
1169}
1170
1171
1172AlphabeticIndex & AlphabeticIndex::addRecord(const UnicodeString &name, const void *data, UErrorCode &status) {
1173 if (U_FAILURE(status)) {
1174 return *this;
1175 }
51004dcb
A
1176 if (inputList_ == NULL) {
1177 inputList_ = new UVector(status);
1178 if (inputList_ == NULL) {
1179 status = U_MEMORY_ALLOCATION_ERROR;
1180 return *this;
1181 }
1182 inputList_->setDeleter(alphaIndex_deleteRecord);
1183 }
1184 Record *r = new Record(name, data);
1185 if (r == NULL) {
1186 status = U_MEMORY_ALLOCATION_ERROR;
1187 return *this;
1188 }
1189 inputList_->addElement(r, status);
1190 clearBuckets();
4388f060
A
1191 //std::string ss;
1192 //std::string ss2;
1193 //std::cout << "added record: name = \"" << r->name_.toUTF8String(ss) << "\"" <<
1194 // " sortingName = \"" << r->sortingName_.toUTF8String(ss2) << "\"" << std::endl;
1195 return *this;
1196}
1197
1198
1199AlphabeticIndex &AlphabeticIndex::clearRecords(UErrorCode &status) {
51004dcb
A
1200 if (U_SUCCESS(status) && inputList_ != NULL && !inputList_->isEmpty()) {
1201 inputList_->removeAllElements();
1202 clearBuckets();
4388f060 1203 }
4388f060
A
1204 return *this;
1205}
1206
4388f060 1207int32_t AlphabeticIndex::getBucketIndex(const UnicodeString &name, UErrorCode &status) {
51004dcb 1208 initBuckets(status);
4388f060
A
1209 if (U_FAILURE(status)) {
1210 return 0;
1211 }
51004dcb 1212 return buckets_->getBucketIndex(name, *collatorPrimaryOnly_, status);
4388f060
A
1213}
1214
1215
1216int32_t AlphabeticIndex::getBucketIndex() const {
1217 return labelsIterIndex_;
1218}
1219
1220
1221UBool AlphabeticIndex::nextBucket(UErrorCode &status) {
1222 if (U_FAILURE(status)) {
1223 return FALSE;
1224 }
51004dcb 1225 if (buckets_ == NULL && currentBucket_ != NULL) {
4388f060
A
1226 status = U_ENUM_OUT_OF_SYNC_ERROR;
1227 return FALSE;
1228 }
51004dcb 1229 initBuckets(status);
4388f060
A
1230 if (U_FAILURE(status)) {
1231 return FALSE;
1232 }
1233 ++labelsIterIndex_;
51004dcb
A
1234 if (labelsIterIndex_ >= buckets_->getBucketCount()) {
1235 labelsIterIndex_ = buckets_->getBucketCount();
4388f060
A
1236 return FALSE;
1237 }
51004dcb 1238 currentBucket_ = getBucket(*buckets_->immutableVisibleList_, labelsIterIndex_);
4388f060
A
1239 resetRecordIterator();
1240 return TRUE;
1241}
1242
1243const UnicodeString &AlphabeticIndex::getBucketLabel() const {
1244 if (currentBucket_ != NULL) {
1245 return currentBucket_->label_;
1246 } else {
51004dcb 1247 return emptyString_;
4388f060
A
1248 }
1249}
1250
1251
1252UAlphabeticIndexLabelType AlphabeticIndex::getBucketLabelType() const {
1253 if (currentBucket_ != NULL) {
1254 return currentBucket_->labelType_;
1255 } else {
1256 return U_ALPHAINDEX_NORMAL;
1257 }
1258}
1259
1260
1261int32_t AlphabeticIndex::getBucketRecordCount() const {
51004dcb 1262 if (currentBucket_ != NULL && currentBucket_->records_ != NULL) {
4388f060
A
1263 return currentBucket_->records_->size();
1264 } else {
1265 return 0;
1266 }
1267}
1268
1269
1270AlphabeticIndex &AlphabeticIndex::resetBucketIterator(UErrorCode &status) {
1271 if (U_FAILURE(status)) {
1272 return *this;
1273 }
51004dcb 1274 internalResetBucketIterator();
4388f060
A
1275 return *this;
1276}
1277
1278
1279UBool AlphabeticIndex::nextRecord(UErrorCode &status) {
1280 if (U_FAILURE(status)) {
1281 return FALSE;
1282 }
1283 if (currentBucket_ == NULL) {
1284 // We are trying to iterate over the items in a bucket, but there is no
1285 // current bucket from the enumeration of buckets.
1286 status = U_INVALID_STATE_ERROR;
1287 return FALSE;
1288 }
51004dcb 1289 if (buckets_ == NULL) {
4388f060
A
1290 status = U_ENUM_OUT_OF_SYNC_ERROR;
1291 return FALSE;
1292 }
51004dcb
A
1293 if (currentBucket_->records_ == NULL) {
1294 return FALSE;
1295 }
4388f060
A
1296 ++itemsIterIndex_;
1297 if (itemsIterIndex_ >= currentBucket_->records_->size()) {
1298 itemsIterIndex_ = currentBucket_->records_->size();
1299 return FALSE;
1300 }
1301 return TRUE;
1302}
1303
1304
1305const UnicodeString &AlphabeticIndex::getRecordName() const {
51004dcb
A
1306 const UnicodeString *retStr = &emptyString_;
1307 if (currentBucket_ != NULL && currentBucket_->records_ != NULL &&
4388f060
A
1308 itemsIterIndex_ >= 0 &&
1309 itemsIterIndex_ < currentBucket_->records_->size()) {
1310 Record *item = static_cast<Record *>(currentBucket_->records_->elementAt(itemsIterIndex_));
1311 retStr = &item->name_;
1312 }
1313 return *retStr;
1314}
1315
1316const void *AlphabeticIndex::getRecordData() const {
1317 const void *retPtr = NULL;
51004dcb 1318 if (currentBucket_ != NULL && currentBucket_->records_ != NULL &&
4388f060
A
1319 itemsIterIndex_ >= 0 &&
1320 itemsIterIndex_ < currentBucket_->records_->size()) {
1321 Record *item = static_cast<Record *>(currentBucket_->records_->elementAt(itemsIterIndex_));
1322 retPtr = item->data_;
1323 }
1324 return retPtr;
1325}
1326
1327
1328AlphabeticIndex & AlphabeticIndex::resetRecordIterator() {
1329 itemsIterIndex_ = -1;
1330 return *this;
1331}
1332
1333
1334
1335AlphabeticIndex::Bucket::Bucket(const UnicodeString &label,
1336 const UnicodeString &lowerBoundary,
51004dcb
A
1337 UAlphabeticIndexLabelType type)
1338 : label_(label), lowerBoundary_(lowerBoundary), labelType_(type),
1339 displayBucket_(NULL), displayIndex_(-1),
1340 records_(NULL) {
4388f060
A
1341}
1342
1343
1344AlphabeticIndex::Bucket::~Bucket() {
1345 delete records_;
1346}
1347
1348U_NAMESPACE_END
1349
1350#endif