]> git.saurik.com Git - apple/icu.git/blame_incremental - icuSources/i18n/alphaindex.cpp
ICU-491.11.1.tar.gz
[apple/icu.git] / icuSources / i18n / alphaindex.cpp
... / ...
CommitLineData
1/*
2*******************************************************************************
3* Copyright (C) 2009-2012, International Business Machines Corporation and *
4* others. All Rights Reserved. *
5*******************************************************************************
6*/
7
8/**
9 * \file
10 * \brief C API: AlphabeticIndex class
11 */
12
13#include "unicode/utypes.h"
14
15#if !UCONFIG_NO_COLLATION && !UCONFIG_NO_NORMALIZATION
16
17#include "unicode/alphaindex.h"
18#include "unicode/coll.h"
19#include "unicode/normalizer2.h"
20#include "unicode/strenum.h"
21#include "unicode/tblcoll.h"
22#include "unicode/ulocdata.h"
23#include "unicode/uniset.h"
24#include "unicode/uobject.h"
25#include "unicode/uscript.h"
26#include "unicode/usetiter.h"
27#include "unicode/ustring.h"
28#include "unicode/utf16.h"
29
30#include "cstring.h"
31#include "mutex.h"
32#include "uassert.h"
33#include "ucln_in.h"
34#include "uhash.h"
35#include "uvector.h"
36
37//#include <string>
38//#include <iostream>
39U_NAMESPACE_BEGIN
40
41UOBJECT_DEFINE_NO_RTTI_IMPLEMENTATION(AlphabeticIndex)
42
43// Forward Declarations
44static int32_t U_CALLCONV
45PreferenceComparator(const void *context, const void *left, const void *right);
46
47static int32_t U_CALLCONV
48sortCollateComparator(const void *context, const void *left, const void *right);
49
50static int32_t U_CALLCONV
51recordCompareFn(const void *context, const void *left, const void *right);
52
53// UVector<Bucket *> support function, delete a Bucket.
54static void U_CALLCONV
55alphaIndex_deleteBucket(void *obj) {
56 delete static_cast<AlphabeticIndex::Bucket *>(obj);
57}
58
59// UVector<Record *> support function, delete a Record.
60static void U_CALLCONV
61alphaIndex_deleteRecord(void *obj) {
62 delete static_cast<AlphabeticIndex::Record *>(obj);
63}
64
65
66
67static const Normalizer2 *nfkdNormalizer;
68
69//
70// Append the contents of a UnicodeSet to a UVector of UnicodeStrings.
71// Append everything - individual characters are handled as strings of length 1.
72// The destination vector owns the appended strings.
73
74static void appendUnicodeSetToUVector(UVector &dest, const UnicodeSet &source, UErrorCode &status) {
75 UnicodeSetIterator setIter(source);
76 while (setIter.next()) {
77 const UnicodeString &str = setIter.getString();
78 dest.addElement(str.clone(), status);
79 }
80}
81
82
83AlphabeticIndex::AlphabeticIndex(const Locale &locale, UErrorCode &status) {
84 init(status);
85 if (U_FAILURE(status)) {
86 return;
87 }
88 locale_ = locale;
89 langType_ = langTypeFromLocale(locale_);
90
91 collator_ = Collator::createInstance(locale, status);
92 if (collator_ != NULL) {
93 collatorPrimaryOnly_ = collator_->clone();
94 }
95 if (collatorPrimaryOnly_ != NULL) {
96 collatorPrimaryOnly_->setStrength(Collator::PRIMARY);
97 }
98 getIndexExemplars(*initialLabels_, locale, status);
99 indexBuildRequired_ = TRUE;
100 if ((collator_ == NULL || collatorPrimaryOnly_ == NULL) && U_SUCCESS(status)) {
101 status = U_MEMORY_ALLOCATION_ERROR;
102 }
103 firstScriptCharacters_ = firstStringsInScript(status);
104}
105
106
107AlphabeticIndex::~AlphabeticIndex() {
108 uhash_close(alreadyIn_);
109 delete bucketList_;
110 delete collator_;
111 delete collatorPrimaryOnly_;
112 delete firstScriptCharacters_;
113 delete labels_;
114 delete inputRecords_;
115 delete noDistinctSorting_;
116 delete notAlphabetic_;
117 delete initialLabels_;
118}
119
120
121AlphabeticIndex &AlphabeticIndex::addLabels(const UnicodeSet &additions, UErrorCode &status) {
122 if (U_FAILURE(status)) {
123 return *this;
124 }
125 initialLabels_->addAll(additions);
126 return *this;
127}
128
129
130AlphabeticIndex &AlphabeticIndex::addLabels(const Locale &locale, UErrorCode &status) {
131 if (U_FAILURE(status)) {
132 return *this;
133 }
134 UnicodeSet additions;
135 getIndexExemplars(additions, locale, status);
136 initialLabels_->addAll(additions);
137 return *this;
138}
139
140
141int32_t AlphabeticIndex::getBucketCount(UErrorCode &status) {
142 buildIndex(status);
143 if (U_FAILURE(status)) {
144 return 0;
145 }
146 return bucketList_->size();
147}
148
149
150int32_t AlphabeticIndex::getRecordCount(UErrorCode &status) {
151 if (U_FAILURE(status)) {
152 return 0;
153 }
154 return inputRecords_->size();
155}
156
157
158void AlphabeticIndex::buildIndex(UErrorCode &status) {
159 if (U_FAILURE(status)) {
160 return;
161 }
162 if (!indexBuildRequired_) {
163 return;
164 }
165
166 // Discard any already-built data.
167 // This is important when the user builds and uses an index, then subsequently modifies it,
168 // necessitating a rebuild.
169
170 bucketList_->removeAllElements();
171 labels_->removeAllElements();
172 uhash_removeAll(alreadyIn_);
173 noDistinctSorting_->clear();
174 notAlphabetic_->clear();
175
176 // first sort the incoming Labels, with a "best" ordering among items
177 // that are the same according to the collator
178
179 UVector preferenceSorting(status); // Vector of UnicodeStrings; owned by the vector.
180 preferenceSorting.setDeleter(uprv_deleteUObject);
181 appendUnicodeSetToUVector(preferenceSorting, *initialLabels_, status);
182 preferenceSorting.sortWithUComparator(PreferenceComparator, &status, status);
183
184 // We now make a set of Labels.
185 // Some of the input may, however, be redundant.
186 // That is, we might have c, ch, d, where "ch" sorts just like "c", "h"
187 // So we make a pass through, filtering out those cases.
188 // TODO: filtering these out would seem to be at odds with the eventual goal
189 // of being able to split buckets that contain too many items.
190
191 UnicodeSet labelSet;
192 for (int32_t psIndex=0; psIndex<preferenceSorting.size(); psIndex++) {
193 UnicodeString item = *static_cast<const UnicodeString *>(preferenceSorting.elementAt(psIndex));
194 // TODO: Since preferenceSorting was originally populated from the contents of a UnicodeSet,
195 // is it even possible for duplicates to show up in this check?
196 if (labelSet.contains(item)) {
197 UnicodeSetIterator itemAlreadyInIter(labelSet);
198 while (itemAlreadyInIter.next()) {
199 const UnicodeString &itemAlreadyIn = itemAlreadyInIter.getString();
200 if (collatorPrimaryOnly_->compare(item, itemAlreadyIn) == 0) {
201 UnicodeSet *targets = static_cast<UnicodeSet *>(uhash_get(alreadyIn_, &itemAlreadyIn));
202 if (targets == NULL) {
203 // alreadyIn.put(itemAlreadyIn, targets = new LinkedHashSet<String>());
204 targets = new UnicodeSet();
205 uhash_put(alreadyIn_, itemAlreadyIn.clone(), targets, &status);
206 }
207 targets->add(item);
208 break;
209 }
210 }
211 } else if (item.moveIndex32(0, 1) < item.length() && // Label contains more than one code point.
212 collatorPrimaryOnly_->compare(item, separated(item)) == 0) {
213 noDistinctSorting_->add(item);
214 } else if (!ALPHABETIC->containsSome(item)) {
215 notAlphabetic_->add(item);
216 } else {
217 labelSet.add(item);
218 }
219 }
220
221 // If we have no labels, hard-code a fallback default set of [A-Z]
222 // This case can occur with locales that don't have exemplar character data, including root.
223 // A no-labels situation will cause other problems; it needs to be avoided.
224 if (labelSet.isEmpty()) {
225 labelSet.add((UChar32)0x41, (UChar32)0x5A);
226 }
227
228 // Move the set of Labels from the set into a vector, and sort
229 // according to the collator.
230
231 appendUnicodeSetToUVector(*labels_, labelSet, status);
232 labels_->sortWithUComparator(sortCollateComparator, collatorPrimaryOnly_, status);
233
234 // if the result is still too large, cut down to maxLabelCount_ elements, by removing every nth element
235 // Implemented by copying the elements to be retained to a new UVector.
236
237 const int32_t size = labelSet.size() - 1;
238 if (size > maxLabelCount_) {
239 UVector *newLabels = new UVector(status);
240 newLabels->setDeleter(uprv_deleteUObject);
241 int32_t count = 0;
242 int32_t old = -1;
243 for (int32_t srcIndex=0; srcIndex<labels_->size(); srcIndex++) {
244 const UnicodeString *str = static_cast<const UnicodeString *>(labels_->elementAt(srcIndex));
245 ++count;
246 const int32_t bump = count * maxLabelCount_ / size;
247 if (bump == old) {
248 // it.remove();
249 } else {
250 newLabels->addElement(str->clone(), status);
251 old = bump;
252 }
253 }
254 delete labels_;
255 labels_ = newLabels;
256 }
257
258 // We now know the list of labels.
259 // Create a corresponding list of buckets, one per label.
260
261 buildBucketList(status); // Corresponds to Java BucketList constructor.
262
263 // Bin the Records into the Buckets.
264 bucketRecords(status);
265
266 indexBuildRequired_ = FALSE;
267 resetBucketIterator(status);
268}
269
270//
271// buildBucketList() Corresponds to the BucketList constructor in the Java version.
272
273void AlphabeticIndex::buildBucketList(UErrorCode &status) {
274 UnicodeString labelStr = getUnderflowLabel();
275 Bucket *b = new Bucket(labelStr, *EMPTY_STRING, U_ALPHAINDEX_UNDERFLOW, status);
276 bucketList_->addElement(b, status);
277
278 // Build up the list, adding underflow, additions, overflow
279 // insert infix labels as needed, using \uFFFF.
280 const UnicodeString *last = static_cast<UnicodeString *>(labels_->elementAt(0));
281 b = new Bucket(*last, *last, U_ALPHAINDEX_NORMAL, status);
282 bucketList_->addElement(b, status);
283
284 UnicodeSet lastSet;
285 UnicodeSet set;
286 AlphabeticIndex::getScriptSet(lastSet, *last, status);
287 lastSet.removeAll(*IGNORE_SCRIPTS);
288
289 for (int i = 1; i < labels_->size(); ++i) {
290 UnicodeString *current = static_cast<UnicodeString *>(labels_->elementAt(i));
291 getScriptSet(set, *current, status);
292 set.removeAll(*IGNORE_SCRIPTS);
293 if (lastSet.containsNone(set)) {
294 // check for adjacent
295 const UnicodeString &overflowComparisonString = getOverflowComparisonString(*last, status);
296 if (collatorPrimaryOnly_->compare(overflowComparisonString, *current) < 0) {
297 labelStr = getInflowLabel();
298 b = new Bucket(labelStr, overflowComparisonString, U_ALPHAINDEX_INFLOW, status);
299 bucketList_->addElement(b, status);
300 i++;
301 lastSet = set;
302 }
303 }
304 b = new Bucket(*current, *current, U_ALPHAINDEX_NORMAL, status);
305 bucketList_->addElement(b, status);
306 last = current;
307 lastSet = set;
308 }
309 const UnicodeString &limitString = getOverflowComparisonString(*last, status);
310 b = new Bucket(getOverflowLabel(), limitString, U_ALPHAINDEX_OVERFLOW, status);
311 bucketList_->addElement(b, status);
312 // final overflow bucket
313}
314
315
316//
317// Place all of the raw input records into the correct bucket.
318//
319// Begin by sorting the input records; this lets us bin them in a single pass.
320//
321// Note on storage management: The input records are owned by the
322// inputRecords_ vector, and will (eventually) be auto-deleted by it.
323// The Bucket objects have pointers to the Record objects, but do not own them.
324//
325void AlphabeticIndex::bucketRecords(UErrorCode &status) {
326 if (U_FAILURE(status)) {
327 return;
328 }
329
330 inputRecords_->sortWithUComparator(recordCompareFn, collator_, status);
331 U_ASSERT(bucketList_->size() > 0); // Should always have at least an overflow
332 // bucket, even if no user labels.
333 int32_t bucketIndex = 0;
334 Bucket *destBucket = static_cast<Bucket *>(bucketList_->elementAt(bucketIndex));
335 Bucket *nextBucket = NULL;
336 if (bucketIndex+1 < bucketList_->size()) {
337 nextBucket = static_cast<Bucket *>(bucketList_->elementAt(bucketIndex+1));
338 }
339 int32_t recordIndex = 0;
340 Record *r = static_cast<Record *>(inputRecords_->elementAt(recordIndex));
341 while (recordIndex < inputRecords_->size()) {
342 if (nextBucket == NULL ||
343 collatorPrimaryOnly_->compare(r->sortingName_, nextBucket->lowerBoundary_) < 0) {
344 // Record goes in current bucket. Advance to next record,
345 // stay on current bucket.
346 destBucket->records_->addElement(r, status);
347 ++recordIndex;
348 r = static_cast<Record *>(inputRecords_->elementAt(recordIndex));
349 } else {
350 // Advance to the next bucket, stay on current record.
351 bucketIndex++;
352 destBucket = nextBucket;
353 if (bucketIndex+1 < bucketList_->size()) {
354 nextBucket = static_cast<Bucket *>(bucketList_->elementAt(bucketIndex+1));
355 } else {
356 nextBucket = NULL;
357 }
358 U_ASSERT(destBucket != NULL);
359 }
360 }
361
362}
363
364
365void AlphabeticIndex::getIndexExemplars(UnicodeSet &dest, const Locale &locale, UErrorCode &status) {
366 if (U_FAILURE(status)) {
367 return;
368 }
369
370 LocalULocaleDataPointer uld(ulocdata_open(locale.getName(), &status));
371 UnicodeSet exemplars;
372 ulocdata_getExemplarSet(uld.getAlias(), exemplars.toUSet(), 0, ULOCDATA_ES_INDEX, &status);
373 if (U_SUCCESS(status)) {
374 dest.addAll(exemplars);
375 return;
376 }
377 status = U_ZERO_ERROR; // Clear out U_MISSING_RESOURCE_ERROR
378
379 // Locale data did not include explicit Index characters.
380 // Synthesize a set of them from the locale's standard exemplar characters.
381
382 ulocdata_getExemplarSet(uld.getAlias(), exemplars.toUSet(), 0, ULOCDATA_ES_STANDARD, &status);
383 if (U_FAILURE(status)) {
384 return;
385 }
386
387 // Upper-case any that aren't already so.
388 // (We only do this for synthesized index characters.)
389
390 UnicodeSetIterator it(exemplars);
391 UnicodeString upperC;
392 UnicodeSet lowersToRemove;
393 UnicodeSet uppersToAdd;
394 while (it.next()) {
395 const UnicodeString &exemplarC = it.getString();
396 upperC = exemplarC;
397 upperC.toUpper(locale);
398 if (exemplarC != upperC) {
399 lowersToRemove.add(exemplarC);
400 uppersToAdd.add(upperC);
401 }
402 }
403 exemplars.removeAll(lowersToRemove);
404 exemplars.addAll(uppersToAdd);
405
406 // get the exemplars, and handle special cases
407
408 // question: should we add auxiliary exemplars?
409 if (exemplars.containsSome(*CORE_LATIN)) {
410 exemplars.addAll(*CORE_LATIN);
411 }
412 if (exemplars.containsSome(*HANGUL)) {
413 // cut down to small list
414 UnicodeSet BLOCK_HANGUL_SYLLABLES(UNICODE_STRING_SIMPLE("[:block=hangul_syllables:]"), status);
415 exemplars.removeAll(BLOCK_HANGUL_SYLLABLES);
416 exemplars.addAll(*HANGUL);
417 }
418 if (exemplars.containsSome(*ETHIOPIC)) {
419 // cut down to small list
420 // make use of the fact that Ethiopic is allocated in 8's, where
421 // the base is 0 mod 8.
422 UnicodeSetIterator it(*ETHIOPIC);
423 while (it.next() && !it.isString()) {
424 if ((it.getCodepoint() & 0x7) != 0) {
425 exemplars.remove(it.getCodepoint());
426 }
427 }
428 }
429 dest.addAll(exemplars);
430}
431
432
433/*
434 * Return the string with interspersed CGJs. Input must have more than 2 codepoints.
435 */
436static const UChar32 CGJ = (UChar)0x034F;
437UnicodeString AlphabeticIndex::separated(const UnicodeString &item) {
438 UnicodeString result;
439 if (item.length() == 0) {
440 return result;
441 }
442 int32_t i = 0;
443 for (;;) {
444 UChar32 cp = item.char32At(i);
445 result.append(cp);
446 i = item.moveIndex32(i, 1);
447 if (i >= item.length()) {
448 break;
449 }
450 result.append(CGJ);
451 }
452 return result;
453}
454
455
456UBool AlphabeticIndex::operator==(const AlphabeticIndex& /* other */) const {
457 return FALSE;
458}
459
460
461UBool AlphabeticIndex::operator!=(const AlphabeticIndex& /* other */) const {
462 return FALSE;
463}
464
465
466const RuleBasedCollator &AlphabeticIndex::getCollator() const {
467 // There are no known non-RuleBasedCollator collators, and none ever expected.
468 // But, in case that changes, better a null pointer than a wrong type.
469 return *dynamic_cast<RuleBasedCollator *>(collator_);
470}
471
472
473const UnicodeString &AlphabeticIndex::getInflowLabel() const {
474 return inflowLabel_;
475}
476
477const UnicodeString &AlphabeticIndex::getOverflowLabel() const {
478 return overflowLabel_;
479}
480
481
482const UnicodeString &AlphabeticIndex::getUnderflowLabel() const {
483 return underflowLabel_;
484}
485
486
487AlphabeticIndex &AlphabeticIndex::setInflowLabel(const UnicodeString &label, UErrorCode &/*status*/) {
488 inflowLabel_ = label;
489 indexBuildRequired_ = TRUE;
490 return *this;
491}
492
493
494AlphabeticIndex &AlphabeticIndex::setOverflowLabel(const UnicodeString &label, UErrorCode &/*status*/) {
495 overflowLabel_ = label;
496 indexBuildRequired_ = TRUE;
497 return *this;
498}
499
500
501AlphabeticIndex &AlphabeticIndex::setUnderflowLabel(const UnicodeString &label, UErrorCode &/*status*/) {
502 underflowLabel_ = label;
503 indexBuildRequired_ = TRUE;
504 return *this;
505}
506
507
508int32_t AlphabeticIndex::getMaxLabelCount() const {
509 return maxLabelCount_;
510}
511
512
513AlphabeticIndex &AlphabeticIndex::setMaxLabelCount(int32_t maxLabelCount, UErrorCode &status) {
514 if (U_FAILURE(status)) {
515 return *this;
516 }
517 if (maxLabelCount <= 0) {
518 status = U_ILLEGAL_ARGUMENT_ERROR;
519 return *this;
520 }
521 maxLabelCount_ = maxLabelCount;
522 if (maxLabelCount < bucketList_->size()) {
523 indexBuildRequired_ = TRUE;
524 }
525 return *this;
526}
527
528
529const UnicodeString &AlphabeticIndex::getOverflowComparisonString(const UnicodeString &lowerLimit, UErrorCode &/*status*/) {
530 for (int32_t i=0; i<firstScriptCharacters_->size(); i++) {
531 const UnicodeString *s =
532 static_cast<const UnicodeString *>(firstScriptCharacters_->elementAt(i));
533 if (collator_->compare(*s, lowerLimit) > 0) {
534 return *s;
535 }
536 }
537 return *EMPTY_STRING;
538}
539
540UnicodeSet *AlphabeticIndex::getScriptSet(UnicodeSet &dest, const UnicodeString &codePoint, UErrorCode &status) {
541 if (U_FAILURE(status)) {
542 return &dest;
543 }
544 UChar32 cp = codePoint.char32At(0);
545 UScriptCode scriptCode = uscript_getScript(cp, &status);
546 dest.applyIntPropertyValue(UCHAR_SCRIPT, scriptCode, status);
547 return &dest;
548}
549
550//
551// init() - Common code for constructors.
552//
553
554void AlphabeticIndex::init(UErrorCode &status) {
555 // Initialize statics if needed.
556 AlphabeticIndex::staticInit(status);
557
558 // Put the object into a known state so that the destructor will function.
559
560 alreadyIn_ = NULL;
561 bucketList_ = NULL;
562 collator_ = NULL;
563 collatorPrimaryOnly_ = NULL;
564 currentBucket_ = NULL;
565 firstScriptCharacters_ = NULL;
566 initialLabels_ = NULL;
567 indexBuildRequired_ = TRUE;
568 inputRecords_ = NULL;
569 itemsIterIndex_ = 0;
570 labels_ = NULL;
571 labelsIterIndex_ = 0;
572 maxLabelCount_ = 99;
573 noDistinctSorting_ = NULL;
574 notAlphabetic_ = NULL;
575 recordCounter_ = 0;
576
577 if (U_FAILURE(status)) {
578 return;
579 }
580 alreadyIn_ = uhash_open(uhash_hashUnicodeString, // Key Hash,
581 uhash_compareUnicodeString, // key Comparator,
582 NULL, // value Comparator
583 &status);
584 uhash_setKeyDeleter(alreadyIn_, uprv_deleteUObject);
585 uhash_setValueDeleter(alreadyIn_, uprv_deleteUObject);
586
587 bucketList_ = new UVector(status);
588 bucketList_->setDeleter(alphaIndex_deleteBucket);
589 labels_ = new UVector(status);
590 labels_->setDeleter(uprv_deleteUObject);
591 labels_->setComparer(uhash_compareUnicodeString);
592 inputRecords_ = new UVector(status);
593 inputRecords_->setDeleter(alphaIndex_deleteRecord);
594
595 noDistinctSorting_ = new UnicodeSet();
596 notAlphabetic_ = new UnicodeSet();
597 initialLabels_ = new UnicodeSet();
598
599 inflowLabel_.remove();
600 inflowLabel_.append((UChar)0x2026); // Ellipsis
601 overflowLabel_ = inflowLabel_;
602 underflowLabel_ = inflowLabel_;
603
604 // TODO: check for memory allocation failures.
605}
606
607
608static UBool indexCharactersAreInitialized = FALSE;
609
610// Index Characters Clean up function. Delete statically allocated constant stuff.
611U_CDECL_BEGIN
612static UBool U_CALLCONV indexCharacters_cleanup(void) {
613 AlphabeticIndex::staticCleanup();
614 return TRUE;
615}
616U_CDECL_END
617
618void AlphabeticIndex::staticCleanup() {
619 delete ALPHABETIC;
620 ALPHABETIC = NULL;
621 delete HANGUL;
622 HANGUL = NULL;
623 delete ETHIOPIC;
624 ETHIOPIC = NULL;
625 delete CORE_LATIN;
626 CORE_LATIN = NULL;
627 delete IGNORE_SCRIPTS;
628 IGNORE_SCRIPTS = NULL;
629 delete TO_TRY;
630 TO_TRY = NULL;
631 delete UNIHAN;
632 UNIHAN = NULL;
633 delete EMPTY_STRING;
634 EMPTY_STRING = NULL;
635 nfkdNormalizer = NULL; // ref to a singleton. Do not delete.
636 indexCharactersAreInitialized = FALSE;
637}
638
639
640UnicodeSet *AlphabeticIndex::ALPHABETIC;
641UnicodeSet *AlphabeticIndex::HANGUL;
642UnicodeSet *AlphabeticIndex::ETHIOPIC;
643UnicodeSet *AlphabeticIndex::CORE_LATIN;
644UnicodeSet *AlphabeticIndex::IGNORE_SCRIPTS;
645UnicodeSet *AlphabeticIndex::TO_TRY;
646UnicodeSet *AlphabeticIndex::UNIHAN;
647const UnicodeString *AlphabeticIndex::EMPTY_STRING;
648
649//
650// staticInit() One-time initialization of constants.
651// Thread safe. Called from constructors.
652// Mutex overhead is not a concern. AlphabeticIndex constructors are
653// sufficiently heavy that the cost of the mutex check is not significant.
654
655void AlphabeticIndex::staticInit(UErrorCode &status) {
656 static UMTX IndexCharsInitMutex;
657
658 Mutex mutex(&IndexCharsInitMutex);
659 if (indexCharactersAreInitialized || U_FAILURE(status)) {
660 return;
661 }
662 UBool finishedInit = FALSE;
663
664 {
665 UnicodeString alphaString = UNICODE_STRING_SIMPLE("[[:alphabetic:]-[:mark:]]");
666 ALPHABETIC = new UnicodeSet(alphaString, status);
667 if (ALPHABETIC == NULL) {
668 goto err;
669 }
670
671 HANGUL = new UnicodeSet();
672 HANGUL->add(0xAC00).add(0xB098).add(0xB2E4).add(0xB77C).add(0xB9C8).add(0xBC14).add(0xC0AC).
673 add(0xC544).add(0xC790).add(0xCC28).add(0xCE74).add(0xD0C0).add(0xD30C).add(0xD558);
674 if (HANGUL== NULL) {
675 goto err;
676 }
677
678
679 UnicodeString EthiopicStr = UNICODE_STRING_SIMPLE("[[:Block=Ethiopic:]&[:Script=Ethiopic:]]");
680 ETHIOPIC = new UnicodeSet(EthiopicStr, status);
681 if (ETHIOPIC == NULL) {
682 goto err;
683 }
684
685 CORE_LATIN = new UnicodeSet((UChar32)0x61, (UChar32)0x7a); // ('a', 'z');
686 if (CORE_LATIN == NULL) {
687 goto err;
688 }
689
690 UnicodeString IgnoreStr= UNICODE_STRING_SIMPLE(
691 "[[:sc=Common:][:sc=inherited:][:script=Unknown:][:script=braille:]]");
692 IGNORE_SCRIPTS = new UnicodeSet(IgnoreStr, status);
693 IGNORE_SCRIPTS->freeze();
694 if (IGNORE_SCRIPTS == NULL) {
695 goto err;
696 }
697
698 UnicodeString nfcqcStr = UNICODE_STRING_SIMPLE("[:^nfcqc=no:]");
699 TO_TRY = new UnicodeSet(nfcqcStr, status);
700 if (TO_TRY == NULL) {
701 goto err;
702 }
703
704 UnicodeString unihanStr = UNICODE_STRING_SIMPLE("[:script=Hani:]");
705 UNIHAN = new UnicodeSet(unihanStr, status);
706 if (UNIHAN == NULL) {
707 goto err;
708 }
709
710 EMPTY_STRING = new UnicodeString();
711
712 nfkdNormalizer = Normalizer2::getNFKDInstance(status);
713 if (nfkdNormalizer == NULL) {
714 goto err;
715 }
716 }
717 finishedInit = TRUE;
718
719 err:
720 if (!finishedInit && U_SUCCESS(status)) {
721 status = U_MEMORY_ALLOCATION_ERROR;
722 }
723 if (U_FAILURE(status)) {
724 indexCharacters_cleanup();
725 return;
726 }
727 ucln_i18n_registerCleanup(UCLN_I18N_INDEX_CHARACTERS, indexCharacters_cleanup);
728 indexCharactersAreInitialized = TRUE;
729}
730
731
732//
733// Comparison function for UVector<UnicodeString *> sorting with a collator.
734//
735static int32_t U_CALLCONV
736sortCollateComparator(const void *context, const void *left, const void *right) {
737 const UElement *leftElement = static_cast<const UElement *>(left);
738 const UElement *rightElement = static_cast<const UElement *>(right);
739 const UnicodeString *leftString = static_cast<const UnicodeString *>(leftElement->pointer);
740 const UnicodeString *rightString = static_cast<const UnicodeString *>(rightElement->pointer);
741 const Collator *col = static_cast<const Collator *>(context);
742
743 if (leftString == rightString) {
744 // Catches case where both are NULL
745 return 0;
746 }
747 if (leftString == NULL) {
748 return 1;
749 };
750 if (rightString == NULL) {
751 return -1;
752 }
753 Collator::EComparisonResult r = col->compare(*leftString, *rightString);
754 return (int32_t) r;
755}
756
757//
758// Comparison function for UVector<Record *> sorting with a collator.
759//
760static int32_t U_CALLCONV
761recordCompareFn(const void *context, const void *left, const void *right) {
762 const UElement *leftElement = static_cast<const UElement *>(left);
763 const UElement *rightElement = static_cast<const UElement *>(right);
764 const AlphabeticIndex::Record *leftRec = static_cast<const AlphabeticIndex::Record *>(leftElement->pointer);
765 const AlphabeticIndex::Record *rightRec = static_cast<const AlphabeticIndex::Record *>(rightElement->pointer);
766 const Collator *col = static_cast<const Collator *>(context);
767
768 Collator::EComparisonResult r = col->compare(leftRec->sortingName_, rightRec->sortingName_);
769 if (r == Collator::EQUAL) {
770 if (leftRec->serialNumber_ < rightRec->serialNumber_) {
771 r = Collator::LESS;
772 } else if (leftRec->serialNumber_ > rightRec->serialNumber_) {
773 r = Collator::GREATER;
774 }
775 }
776 return (int32_t) r;
777}
778
779
780#if 0
781//
782// First characters in scripts.
783// Create a UVector whose contents are pointers to UnicodeStrings for the First Characters in each script.
784// The vector is sorted according to this index's collation.
785//
786// This code is too slow to use, so for now hard code the data.
787// Hard coded implementation is follows.
788//
789UVector *AlphabeticIndex::firstStringsInScript(Collator *ruleBasedCollator, UErrorCode &status) {
790
791 if (U_FAILURE(status)) {
792 return NULL;
793 }
794
795 UnicodeString results[USCRIPT_CODE_LIMIT];
796 UnicodeString LOWER_A = UNICODE_STRING_SIMPLE("a");
797
798 UnicodeSetIterator siter(*TO_TRY);
799 while (siter.next()) {
800 const UnicodeString &current = siter.getString();
801 Collator::EComparisonResult r = ruleBasedCollator->compare(current, LOWER_A);
802 if (r < 0) { // TODO fix; we only want "real" script characters, not
803 // symbols.
804 continue;
805 }
806
807 int script = uscript_getScript(current.char32At(0), &status);
808 if (results[script].length() == 0) {
809 results[script] = current;
810 }
811 else if (ruleBasedCollator->compare(current, results[script]) < 0) {
812 results[script] = current;
813 }
814 }
815
816 UnicodeSet extras;
817 UnicodeSet expansions;
818 RuleBasedCollator *rbc = dynamic_cast<RuleBasedCollator *>(ruleBasedCollator);
819 const UCollator *uRuleBasedCollator = rbc->getUCollator();
820 ucol_getContractionsAndExpansions(uRuleBasedCollator, extras.toUSet(), expansions.toUSet(), true, &status);
821 extras.addAll(expansions).removeAll(*TO_TRY);
822 if (extras.size() != 0) {
823 const Normalizer2 *normalizer = Normalizer2::getNFKCInstance(status);
824 UnicodeSetIterator extrasIter(extras);
825 while (extrasIter.next()) {
826 const UnicodeString &current = extrasIter.next();
827 if (!TO_TRY->containsAll(current))
828 continue;
829 if (!normalizer->isNormalized(current, status) ||
830 ruleBasedCollator->compare(current, LOWER_A) < 0) {
831 continue;
832 }
833 int script = uscript_getScript(current.char32At(0), &status);
834 if (results[script].length() == 0) {
835 results[script] = current;
836 } else if (ruleBasedCollator->compare(current, results[script]) < 0) {
837 results[script] = current;
838 }
839 }
840 }
841
842 UVector *dest = new UVector(status);
843 dest->setDeleter(uprv_deleteUObject);
844 for (uint32_t i = 0; i < sizeof(results) / sizeof(results[0]); ++i) {
845 if (results[i].length() > 0) {
846 dest->addElement(results[i].clone(), status);
847 }
848 }
849 dest->sortWithUComparator(sortCollateComparator, ruleBasedCollator, status);
850 return dest;
851}
852#endif
853
854
855//
856// First characters in scripts.
857// Create a UVector whose contents are pointers to UnicodeStrings for the First Characters in each script.
858// The vector is sorted according to this index's collation.
859//
860// It takes too much time to compute this from character properties, so hard code it for now.
861// Character constants copied from corresponding declaration in ICU4J.
862// See main/classes/collate/src/com/ibm/icu/text/AlphabeticIndex.java
863
864static UChar HACK_FIRST_CHARS_IN_SCRIPTS[] = { 0x61, 0, 0x03B1, 0,
865 0x2C81, 0, 0x0430, 0, 0x2C30, 0, 0x10D0, 0, 0x0561, 0, 0x05D0, 0, 0xD802, 0xDD00, 0, 0x0800, 0, 0x0621, 0, 0x0710, 0,
866 0x0780, 0, 0x07CA, 0, 0x2D30, 0, 0x1200, 0, 0x0950, 0, 0x0985, 0, 0x0A74, 0, 0x0AD0, 0, 0x0B05, 0, 0x0BD0, 0,
867 0x0C05, 0, 0x0C85, 0, 0x0D05, 0, 0x0D85, 0,
868 0xAAF2, 0, // Meetei Mayek
869 0xA800, 0, 0xA882, 0, 0xD804, 0xDC83, 0,
870 U16_LEAD(0x111C4), U16_TRAIL(0x111C4), 0, // Sharada
871 U16_LEAD(0x11680), U16_TRAIL(0x11680), 0, // Takri
872 0x1B83, 0,
873 0xD802, 0xDE00, 0, 0x0E01, 0,
874 0x0EDE, 0, // Lao
875 0xAA80, 0, 0x0F40, 0, 0x1C00, 0, 0xA840, 0, 0x1900, 0, 0x1700, 0, 0x1720, 0,
876 0x1740, 0, 0x1760, 0, 0x1A00, 0, 0xA930, 0, 0xA90A, 0, 0x1000, 0,
877 U16_LEAD(0x11103), U16_TRAIL(0x11103), 0, // Chakma
878 0x1780, 0, 0x1950, 0, 0x1980, 0, 0x1A20, 0,
879 0xAA00, 0, 0x1B05, 0, 0xA984, 0, 0x1880, 0, 0x1C5A, 0, 0x13A0, 0, 0x1401, 0, 0x1681, 0, 0x16A0, 0, 0xD803, 0xDC00, 0,
880 0xA500, 0, 0xA6A0, 0, 0x1100, 0, 0x3041, 0, 0x30A1, 0, 0x3105, 0, 0xA000, 0, 0xA4F8, 0,
881 U16_LEAD(0x16F00), U16_TRAIL(0x16F00), 0, // Miao
882 0xD800, 0xDE80, 0,
883 0xD800, 0xDEA0, 0, 0xD802, 0xDD20, 0, 0xD800, 0xDF00, 0, 0xD800, 0xDF30, 0, 0xD801, 0xDC28, 0, 0xD801, 0xDC50, 0,
884 0xD801, 0xDC80, 0,
885 U16_LEAD(0x110D0), U16_TRAIL(0x110D0), 0, // Sora Sompeng
886 0xD800, 0xDC00, 0, 0xD802, 0xDC00, 0, 0xD802, 0xDE60, 0, 0xD802, 0xDF00, 0, 0xD802, 0xDC40, 0,
887 0xD802, 0xDF40, 0, 0xD802, 0xDF60, 0, 0xD800, 0xDF80, 0, 0xD800, 0xDFA0, 0, 0xD808, 0xDC00, 0, 0xD80C, 0xDC00, 0,
888 U16_LEAD(0x109A0), U16_TRAIL(0x109A0), 0, // Meroitic Cursive
889 U16_LEAD(0x10980), U16_TRAIL(0x10980), 0, // Meroitic Hieroglyphs
890 0x4E00, 0 };
891
892UVector *AlphabeticIndex::firstStringsInScript(UErrorCode &status) {
893 if (U_FAILURE(status)) {
894 return NULL;
895 }
896 UVector *dest = new UVector(status);
897 if (dest == NULL) {
898 if (U_SUCCESS(status)) {
899 status = U_MEMORY_ALLOCATION_ERROR;
900 }
901 return NULL;
902 }
903 dest->setDeleter(uprv_deleteUObject);
904 const UChar *src = HACK_FIRST_CHARS_IN_SCRIPTS;
905 const UChar *limit = src + sizeof(HACK_FIRST_CHARS_IN_SCRIPTS) / sizeof(HACK_FIRST_CHARS_IN_SCRIPTS[0]);
906 do {
907 if (U_FAILURE(status)) {
908 return dest;
909 }
910 UnicodeString *str = new UnicodeString(src, -1);
911 if (str == NULL) {
912 status = U_MEMORY_ALLOCATION_ERROR;
913 } else {
914 dest->addElement(str, status);
915 src += str->length() + 1;
916 }
917 } while (src < limit);
918 dest->sortWithUComparator(sortCollateComparator, collator_, status);
919 return dest;
920}
921
922
923AlphabeticIndex::ELangType AlphabeticIndex::langTypeFromLocale(const Locale &loc) {
924 const char *lang = loc.getLanguage();
925 if (uprv_strcmp(lang, "zh") != 0) {
926 return kNormal;
927 }
928 const char *script = loc.getScript();
929 if (uprv_strcmp(script, "Hant") == 0) {
930 return kTraditional;
931 }
932 const char *country = loc.getCountry();
933 if (uprv_strcmp(country, "TW") == 0) {
934 return kTraditional;
935 }
936 return kSimplified;
937}
938
939
940//
941// Pinyin Hacks. Direct port from Java.
942//
943
944static const UChar32 probeCharInLong = 0x28EAD;
945
946
947static const UChar PINYIN_LOWER_BOUNDS_SHORT[] = { // "\u0101bcd\u0113fghjkl\u1E3F\u0144\u014Dpqrstwxyz"
948 0x0101, 0x62, 0x63, 0x64, 0x0113, 0x66, 0x67, 0x68, 0x6A, 0x6B, /*l*/0x6C, 0x1E3F, 0x0144, 0x014D,
949 /*p*/0x70, 0x71, 0x72, 0x73, 0x74, /*w*/0x77, 0x78, 0x79, 0x7A};
950
951
952// Pinyin lookup tables copied, pasted (and reformatted) from the ICU4J code.
953
954AlphabeticIndex::PinyinLookup AlphabeticIndex::HACK_PINYIN_LOOKUP_SHORT = {
955 {(UChar)0, (UChar)0, (UChar)0}, // A
956 {(UChar)0x516B, (UChar)0, (UChar)0}, // B
957 {(UChar)0x5693, (UChar)0, (UChar)0}, // C
958 {(UChar)0x5491, (UChar)0, (UChar)0}, // D
959 {(UChar)0x59B8, (UChar)0, (UChar)0}, // E
960 {(UChar)0x53D1, (UChar)0, (UChar)0}, // F
961 {(UChar)0x65EE, (UChar)0, (UChar)0}, // G
962 {(UChar)0x54C8, (UChar)0, (UChar)0}, // H
963 {(UChar)0x4E0C, (UChar)0, (UChar)0}, // J
964 {(UChar)0x5494, (UChar)0, (UChar)0}, // K
965 {(UChar)0x5783, (UChar)0, (UChar)0}, // L
966 {(UChar)0x5452, (UChar)0, (UChar)0}, // M
967 {(UChar)0x5514, (UChar)0, (UChar)0}, // N
968 {(UChar)0x5594, (UChar)0, (UChar)0}, // O
969 {(UChar)0x5991, (UChar)0, (UChar)0}, // P
970 {(UChar)0x4E03, (UChar)0, (UChar)0}, // Q
971 {(UChar)0x513F, (UChar)0, (UChar)0}, // R
972 {(UChar)0x4EE8, (UChar)0, (UChar)0}, // S
973 {(UChar)0x4ED6, (UChar)0, (UChar)0}, // T
974 {(UChar)0x7A75, (UChar)0, (UChar)0}, // W
975 {(UChar)0x5915, (UChar)0, (UChar)0}, // X
976 {(UChar)0x4E2B, (UChar)0, (UChar)0}, // Y
977 {(UChar)0x5E00, (UChar)0, (UChar)0}, // Z
978 {(UChar)0xFFFF, (UChar)0, (UChar)0}, // mark end of array
979 };
980
981static const UChar PINYIN_LOWER_BOUNDS_LONG[] = { // "\u0101bcd\u0113fghjkl\u1E3F\u0144\u014Dpqrstwxyz";
982 0x0101, 0x62, 0x63, 0x64, 0x0113, 0x66, 0x67, 0x68, 0x6A, 0x6B, /*l*/0x6C, 0x1E3F, 0x0144, 0x014D,
983 /*p*/0x70, 0x71, 0x72, 0x73, 0x74, /*w*/0x77, 0x78, 0x79, 0x7A};
984
985AlphabeticIndex::PinyinLookup AlphabeticIndex::HACK_PINYIN_LOOKUP_LONG = {
986 {(UChar)0, (UChar)0, (UChar)0}, // A
987 {(UChar)0x516B, (UChar)0, (UChar)0}, // b
988 {(UChar)0xD863, (UChar)0xDEAD, (UChar)0}, // c
989 {(UChar)0xD844, (UChar)0xDE51, (UChar)0}, // d
990 {(UChar)0x59B8, (UChar)0, (UChar)0}, // e
991 {(UChar)0x53D1, (UChar)0, (UChar)0}, // f
992 {(UChar)0xD844, (UChar)0xDE45, (UChar)0}, // g
993 {(UChar)0x54C8, (UChar)0, (UChar)0}, // h
994 {(UChar)0x4E0C, (UChar)0, (UChar)0}, // j
995 {(UChar)0x5494, (UChar)0, (UChar)0}, // k
996 {(UChar)0x3547, (UChar)0, (UChar)0}, // l
997 {(UChar)0x5452, (UChar)0, (UChar)0}, // m
998 {(UChar)0x5514, (UChar)0, (UChar)0}, // n
999 {(UChar)0x5594, (UChar)0, (UChar)0}, // o
1000 {(UChar)0xD84F, (UChar)0xDC7A, (UChar)0}, // p
1001 {(UChar)0x4E03, (UChar)0, (UChar)0}, // q
1002 {(UChar)0x513F, (UChar)0, (UChar)0}, // r
1003 {(UChar)0x4EE8, (UChar)0, (UChar)0}, // s
1004 {(UChar)0x4ED6, (UChar)0, (UChar)0}, // t
1005 {(UChar)0x7A75, (UChar)0, (UChar)0}, // w
1006 {(UChar)0x5915, (UChar)0, (UChar)0}, // x
1007 {(UChar)0x4E2B, (UChar)0, (UChar)0}, // y
1008 {(UChar)0x5E00, (UChar)0, (UChar)0}, // z
1009 {(UChar)0xFFFF, (UChar)0, (UChar)0}, // mark end of array
1010 };
1011
1012
1013//
1014// Probe the collation data, and decide which Pinyin tables should be used
1015//
1016// ICU can be built with a choice between two Chinese collations.
1017// The hack Pinyin tables to use depend on which one is in use.
1018// We can assume that any given copy of ICU will have only one of the collations available,
1019// and that there is no way, in a given process, to create two alphabetic indexes using
1020// different Chinese collations. Which means the probe can be done once
1021// and the results cached.
1022//
1023// This whole arrangement is temporary.
1024//
1025AlphabeticIndex::PinyinLookup *AlphabeticIndex::HACK_PINYIN_LOOKUP = NULL;
1026const UChar *AlphabeticIndex::PINYIN_LOWER_BOUNDS = NULL;
1027
1028void AlphabeticIndex::initPinyinBounds(const Collator *col, UErrorCode &status) {
1029 {
1030 Mutex m;
1031 if (PINYIN_LOWER_BOUNDS != NULL) {
1032 return;
1033 }
1034 }
1035 UnicodeSet *colSet = col->getTailoredSet(status);
1036 if (U_FAILURE(status) || colSet == NULL) {
1037 delete colSet;
1038 if (U_SUCCESS(status)) {
1039 status = U_MEMORY_ALLOCATION_ERROR;
1040 }
1041 return;
1042 }
1043 UBool useLongTables = colSet->contains(probeCharInLong);
1044 delete colSet;
1045 {
1046 Mutex m;
1047 if (useLongTables) {
1048 PINYIN_LOWER_BOUNDS = PINYIN_LOWER_BOUNDS_LONG;
1049 HACK_PINYIN_LOOKUP = &HACK_PINYIN_LOOKUP_LONG;
1050 } else {
1051 PINYIN_LOWER_BOUNDS = PINYIN_LOWER_BOUNDS_SHORT;
1052 HACK_PINYIN_LOOKUP = &HACK_PINYIN_LOOKUP_SHORT;
1053 }
1054 }
1055}
1056
1057// Pinyin Hack:
1058// Modify a Chinese name by prepending a Latin letter. The modified name is used
1059// when putting records (names) into buckets, to put the name under a Latin index heading.
1060
1061void AlphabeticIndex::hackName(UnicodeString &dest, const UnicodeString &name, const Collator *col) {
1062
1063 if (langType_ != kSimplified || !UNIHAN->contains(name.char32At(0))) {
1064 dest = name;
1065 return;
1066 }
1067
1068 UErrorCode status = U_ZERO_ERROR;
1069 initPinyinBounds(col, status);
1070 if (U_FAILURE(status)) {
1071 dest = name;
1072 return;
1073 }
1074 // TODO: use binary search
1075 int index;
1076 for (index=0; ; index++) {
1077 if ((*HACK_PINYIN_LOOKUP)[index][0] == (UChar)0xffff) {
1078 index--;
1079 break;
1080 }
1081 int32_t compareResult = col->compare(name, UnicodeString(TRUE, (*HACK_PINYIN_LOOKUP)[index], -1));
1082 if (compareResult < 0) {
1083 index--;
1084 }
1085 if (compareResult <= 0) {
1086 break;
1087 }
1088 }
1089 UChar c = PINYIN_LOWER_BOUNDS[index];
1090 dest.setTo(c);
1091 dest.append(name);
1092 return;
1093}
1094
1095
1096
1097/**
1098 * Comparator that returns "better" items first, where shorter NFKD is better, and otherwise NFKD binary order is
1099 * better, and otherwise binary order is better.
1100 *
1101 * For use with array sort or UVector.
1102 * @param context A UErrorCode pointer.
1103 * @param left A UElement pointer, which must refer to a UnicodeString *
1104 * @param right A UElement pointer, which must refer to a UnicodeString *
1105 */
1106
1107static int32_t U_CALLCONV
1108PreferenceComparator(const void *context, const void *left, const void *right) {
1109 const UElement *leftElement = static_cast<const UElement *>(left);
1110 const UElement *rightElement = static_cast<const UElement *>(right);
1111 const UnicodeString *s1 = static_cast<const UnicodeString *>(leftElement->pointer);
1112 const UnicodeString *s2 = static_cast<const UnicodeString *>(rightElement->pointer);
1113 UErrorCode &status = *(UErrorCode *)(context); // Cast off both static and const.
1114 if (s1 == s2) {
1115 return 0;
1116 }
1117
1118 UnicodeString n1 = nfkdNormalizer->normalize(*s1, status);
1119 UnicodeString n2 = nfkdNormalizer->normalize(*s2, status);
1120 int32_t result = n1.length() - n2.length();
1121 if (result != 0) {
1122 return result;
1123 }
1124
1125 result = n1.compareCodePointOrder(n2);
1126 if (result != 0) {
1127 return result;
1128 }
1129 return s1->compareCodePointOrder(*s2);
1130}
1131
1132
1133//
1134// Constructor & Destructor for AlphabeticIndex::Record
1135//
1136// Records are internal only, instances are not directly surfaced in the public API.
1137// This class is mostly struct-like, with all public fields.
1138
1139AlphabeticIndex::Record::Record(AlphabeticIndex *alphaIndex, const UnicodeString &name, const void *data):
1140 alphaIndex_(alphaIndex), name_(name), data_(data)
1141{
1142 UnicodeString prefixedName;
1143 alphaIndex->hackName(sortingName_, name_, alphaIndex->collatorPrimaryOnly_);
1144 serialNumber_ = ++alphaIndex->recordCounter_;
1145}
1146
1147AlphabeticIndex::Record::~Record() {
1148}
1149
1150
1151AlphabeticIndex & AlphabeticIndex::addRecord(const UnicodeString &name, const void *data, UErrorCode &status) {
1152 if (U_FAILURE(status)) {
1153 return *this;
1154 }
1155 Record *r = new Record(this, name, data);
1156 inputRecords_->addElement(r, status);
1157 indexBuildRequired_ = TRUE;
1158 //std::string ss;
1159 //std::string ss2;
1160 //std::cout << "added record: name = \"" << r->name_.toUTF8String(ss) << "\"" <<
1161 // " sortingName = \"" << r->sortingName_.toUTF8String(ss2) << "\"" << std::endl;
1162 return *this;
1163}
1164
1165
1166AlphabeticIndex &AlphabeticIndex::clearRecords(UErrorCode &status) {
1167 if (U_FAILURE(status)) {
1168 return *this;
1169 }
1170 inputRecords_->removeAllElements();
1171 indexBuildRequired_ = TRUE;
1172 return *this;
1173}
1174
1175
1176int32_t AlphabeticIndex::getBucketIndex(const UnicodeString &name, UErrorCode &status) {
1177 buildIndex(status);
1178 if (U_FAILURE(status)) {
1179 return 0;
1180 }
1181
1182 // For simplified Chinese prepend a prefix to the name.
1183 // For non-Chinese locales or non-Chinese names, the name is not modified.
1184
1185 UnicodeString prefixedName;
1186 hackName(prefixedName, name, collatorPrimaryOnly_);
1187
1188 // TODO: use a binary search.
1189 for (int32_t i = 0; i < bucketList_->size(); ++i) {
1190 Bucket *bucket = static_cast<Bucket *>(bucketList_->elementAt(i));
1191 Collator::EComparisonResult comp = collatorPrimaryOnly_->compare(prefixedName, bucket->lowerBoundary_);
1192 if (comp < 0) {
1193 return i - 1;
1194 }
1195 }
1196 // Loop runs until we find the bucket following the one that would hold prefixedName.
1197 // If the prefixedName belongs in the last bucket the loop will drop out the bottom rather
1198 // than returning from the middle.
1199
1200 return bucketList_->size() - 1;
1201}
1202
1203
1204int32_t AlphabeticIndex::getBucketIndex() const {
1205 return labelsIterIndex_;
1206}
1207
1208
1209UBool AlphabeticIndex::nextBucket(UErrorCode &status) {
1210 if (U_FAILURE(status)) {
1211 return FALSE;
1212 }
1213 if (indexBuildRequired_ && currentBucket_ != NULL) {
1214 status = U_ENUM_OUT_OF_SYNC_ERROR;
1215 return FALSE;
1216 }
1217 buildIndex(status);
1218 if (U_FAILURE(status)) {
1219 return FALSE;
1220 }
1221 ++labelsIterIndex_;
1222 if (labelsIterIndex_ >= bucketList_->size()) {
1223 labelsIterIndex_ = bucketList_->size();
1224 return FALSE;
1225 }
1226 currentBucket_ = static_cast<Bucket *>(bucketList_->elementAt(labelsIterIndex_));
1227 resetRecordIterator();
1228 return TRUE;
1229}
1230
1231const UnicodeString &AlphabeticIndex::getBucketLabel() const {
1232 if (currentBucket_ != NULL) {
1233 return currentBucket_->label_;
1234 } else {
1235 return *EMPTY_STRING;
1236 }
1237}
1238
1239
1240UAlphabeticIndexLabelType AlphabeticIndex::getBucketLabelType() const {
1241 if (currentBucket_ != NULL) {
1242 return currentBucket_->labelType_;
1243 } else {
1244 return U_ALPHAINDEX_NORMAL;
1245 }
1246}
1247
1248
1249int32_t AlphabeticIndex::getBucketRecordCount() const {
1250 if (currentBucket_ != NULL) {
1251 return currentBucket_->records_->size();
1252 } else {
1253 return 0;
1254 }
1255}
1256
1257
1258AlphabeticIndex &AlphabeticIndex::resetBucketIterator(UErrorCode &status) {
1259 if (U_FAILURE(status)) {
1260 return *this;
1261 }
1262 buildIndex(status);
1263 labelsIterIndex_ = -1;
1264 currentBucket_ = NULL;
1265 return *this;
1266}
1267
1268
1269UBool AlphabeticIndex::nextRecord(UErrorCode &status) {
1270 if (U_FAILURE(status)) {
1271 return FALSE;
1272 }
1273 if (currentBucket_ == NULL) {
1274 // We are trying to iterate over the items in a bucket, but there is no
1275 // current bucket from the enumeration of buckets.
1276 status = U_INVALID_STATE_ERROR;
1277 return FALSE;
1278 }
1279 if (indexBuildRequired_) {
1280 status = U_ENUM_OUT_OF_SYNC_ERROR;
1281 return FALSE;
1282 }
1283 ++itemsIterIndex_;
1284 if (itemsIterIndex_ >= currentBucket_->records_->size()) {
1285 itemsIterIndex_ = currentBucket_->records_->size();
1286 return FALSE;
1287 }
1288 return TRUE;
1289}
1290
1291
1292const UnicodeString &AlphabeticIndex::getRecordName() const {
1293 const UnicodeString *retStr = EMPTY_STRING;
1294 if (currentBucket_ != NULL &&
1295 itemsIterIndex_ >= 0 &&
1296 itemsIterIndex_ < currentBucket_->records_->size()) {
1297 Record *item = static_cast<Record *>(currentBucket_->records_->elementAt(itemsIterIndex_));
1298 retStr = &item->name_;
1299 }
1300 return *retStr;
1301}
1302
1303const void *AlphabeticIndex::getRecordData() const {
1304 const void *retPtr = NULL;
1305 if (currentBucket_ != NULL &&
1306 itemsIterIndex_ >= 0 &&
1307 itemsIterIndex_ < currentBucket_->records_->size()) {
1308 Record *item = static_cast<Record *>(currentBucket_->records_->elementAt(itemsIterIndex_));
1309 retPtr = item->data_;
1310 }
1311 return retPtr;
1312}
1313
1314
1315AlphabeticIndex & AlphabeticIndex::resetRecordIterator() {
1316 itemsIterIndex_ = -1;
1317 return *this;
1318}
1319
1320
1321
1322AlphabeticIndex::Bucket::Bucket(const UnicodeString &label,
1323 const UnicodeString &lowerBoundary,
1324 UAlphabeticIndexLabelType type,
1325 UErrorCode &status):
1326 label_(label), lowerBoundary_(lowerBoundary), labelType_(type), records_(NULL) {
1327 if (U_FAILURE(status)) {
1328 return;
1329 }
1330 records_ = new UVector(status);
1331 if (records_ == NULL && U_SUCCESS(status)) {
1332 status = U_MEMORY_ALLOCATION_ERROR;
1333 }
1334}
1335
1336
1337AlphabeticIndex::Bucket::~Bucket() {
1338 delete records_;
1339}
1340
1341U_NAMESPACE_END
1342
1343#endif