icuSources/i18n/alphaindex.cpp

   1 /*
   2 *******************************************************************************
   3 * Copyright (C) 2009-2012, International Business Machines Corporation and    *
   4 * others. All Rights Reserved.                                                *
   5 *******************************************************************************
   6 */
   7
   8 /**
   9  * \file
  10  * \brief C API: AlphabeticIndex class
  11  */
  12
  13 #include "unicode/utypes.h"
  14
  15 #if !UCONFIG_NO_COLLATION && !UCONFIG_NO_NORMALIZATION
  16
  17 #include "unicode/alphaindex.h"
  18 #include "unicode/coll.h"
  19 #include "unicode/normalizer2.h"
  20 #include "unicode/strenum.h"
  21 #include "unicode/tblcoll.h"
  22 #include "unicode/ulocdata.h"
  23 #include "unicode/uniset.h"
  24 #include "unicode/uobject.h"
  25 #include "unicode/uscript.h"
  26 #include "unicode/usetiter.h"
  27 #include "unicode/ustring.h"
  28 #include "unicode/utf16.h"
  29
  30 #include "cstring.h"
  31 #include "mutex.h"
  32 #include "uassert.h"
  33 #include "ucln_in.h"
  34 #include "uhash.h"
  35 #include "uvector.h"
  36
  37 //#include <string>
  38 //#include <iostream>
  39 U_NAMESPACE_BEGIN
  40
  41 UOBJECT_DEFINE_NO_RTTI_IMPLEMENTATION(AlphabeticIndex)
  42
  43 // Forward Declarations
  44 static int32_t U_CALLCONV
  45 PreferenceComparator(const void *context, const void *left, const void *right);
  46
  47 static int32_t U_CALLCONV
  48 sortCollateComparator(const void *context, const void *left, const void *right);
  49
  50 static int32_t U_CALLCONV
  51 recordCompareFn(const void *context, const void *left, const void *right);
  52
  53 //  UVector<Bucket *> support function, delete a Bucket.
  54 static void U_CALLCONV
  55 alphaIndex_deleteBucket(void *obj) {
  56     delete static_cast<AlphabeticIndex::Bucket *>(obj);
  57 }
  58
  59 //  UVector<Record *> support function, delete a Record.
  60 static void U_CALLCONV
  61 alphaIndex_deleteRecord(void *obj) {
  62     delete static_cast<AlphabeticIndex::Record *>(obj);
  63 }
  64
  65
  66
  67 static const Normalizer2 *nfkdNormalizer;
  68
  69 //
  70 //  Append the contents of a UnicodeSet to a UVector of UnicodeStrings.
  71 //  Append everything - individual characters are handled as strings of length 1.
  72 //  The destination vector owns the appended strings.
  73
  74 static void appendUnicodeSetToUVector(UVector &dest, const UnicodeSet &source, UErrorCode &status) {
  75     UnicodeSetIterator setIter(source);
  76     while (setIter.next()) {
  77         const UnicodeString &str = setIter.getString();
  78         dest.addElement(str.clone(), status);
  79     }
  80 }
  81
  82
  83 AlphabeticIndex::AlphabeticIndex(const Locale &locale, UErrorCode &status) {
  84     init(status);
  85     if (U_FAILURE(status)) {
  86         return;
  87     }
  88     locale_ = locale;
  89     langType_ = langTypeFromLocale(locale_);
  90
  91     collator_ = Collator::createInstance(locale, status);
  92     if (collator_ != NULL) {
  93         collatorPrimaryOnly_ = collator_->clone();
  94     }
  95     if (collatorPrimaryOnly_ != NULL) {
  96         collatorPrimaryOnly_->setStrength(Collator::PRIMARY);
  97     }
  98     getIndexExemplars(*initialLabels_, locale, status);
  99     indexBuildRequired_ = TRUE;
 100     if ((collator_ == NULL || collatorPrimaryOnly_ == NULL) && U_SUCCESS(status)) {
 101         status = U_MEMORY_ALLOCATION_ERROR;
 102     }
 103     firstScriptCharacters_ = firstStringsInScript(status);
 104 }
 105
 106
 107 AlphabeticIndex::~AlphabeticIndex() {
 108     uhash_close(alreadyIn_);
 109     delete bucketList_;
 110     delete collator_;
 111     delete collatorPrimaryOnly_;
 112     delete firstScriptCharacters_;
 113     delete labels_;
 114     delete inputRecords_;
 115     delete noDistinctSorting_;
 116     delete notAlphabetic_;
 117     delete initialLabels_;
 118 }
 119
 120
 121 AlphabeticIndex &AlphabeticIndex::addLabels(const UnicodeSet &additions, UErrorCode &status) {
 122     if (U_FAILURE(status)) {
 123         return *this;
 124     }
 125     initialLabels_->addAll(additions);
 126     return *this;
 127 }
 128
 129
 130 AlphabeticIndex &AlphabeticIndex::addLabels(const Locale &locale, UErrorCode &status) {
 131     if (U_FAILURE(status)) {
 132         return *this;
 133     }
 134     UnicodeSet additions;
 135     getIndexExemplars(additions, locale, status);
 136     initialLabels_->addAll(additions);
 137     return *this;
 138 }
 139
 140
 141 int32_t AlphabeticIndex::getBucketCount(UErrorCode &status) {
 142     buildIndex(status);
 143     if (U_FAILURE(status)) {
 144         return 0;
 145     }
 146     return bucketList_->size();
 147 }
 148
 149
 150 int32_t AlphabeticIndex::getRecordCount(UErrorCode &status) {
 151     if (U_FAILURE(status)) {
 152         return 0;
 153     }
 154     return inputRecords_->size();
 155 }
 156
 157
 158 void AlphabeticIndex::buildIndex(UErrorCode &status) {
 159     if (U_FAILURE(status)) {
 160         return;
 161     }
 162     if (!indexBuildRequired_) {
 163         return;
 164     }
 165
 166     // Discard any already-built data.
 167     // This is important when the user builds and uses an index, then subsequently modifies it,
 168     // necessitating a rebuild.
 169
 170     bucketList_->removeAllElements();
 171     labels_->removeAllElements();
 172     uhash_removeAll(alreadyIn_);
 173     noDistinctSorting_->clear();
 174     notAlphabetic_->clear();
 175
 176     // first sort the incoming Labels, with a "best" ordering among items
 177     // that are the same according to the collator
 178
 179     UVector preferenceSorting(status);   // Vector of UnicodeStrings; owned by the vector.
 180     preferenceSorting.setDeleter(uprv_deleteUObject);
 181     appendUnicodeSetToUVector(preferenceSorting, *initialLabels_, status);
 182     preferenceSorting.sortWithUComparator(PreferenceComparator, &status, status);
 183
 184     // We now make a set of Labels.
 185     // Some of the input may, however, be redundant.
 186     // That is, we might have c, ch, d, where "ch" sorts just like "c", "h"
 187     // So we make a pass through, filtering out those cases.
 188     // TODO: filtering these out would seem to be at odds with the eventual goal
 189     //       of being able to split buckets that contain too many items.
 190
 191     UnicodeSet labelSet;
 192     for (int32_t psIndex=0; psIndex<preferenceSorting.size(); psIndex++) {
 193         UnicodeString item = *static_cast<const UnicodeString *>(preferenceSorting.elementAt(psIndex));
 194         // TODO:  Since preferenceSorting was originally populated from the contents of a UnicodeSet,
 195         //        is it even possible for duplicates to show up in this check?
 196         if (labelSet.contains(item)) {
 197             UnicodeSetIterator itemAlreadyInIter(labelSet);
 198             while (itemAlreadyInIter.next()) {
 199                 const UnicodeString &itemAlreadyIn = itemAlreadyInIter.getString();
 200                 if (collatorPrimaryOnly_->compare(item, itemAlreadyIn) == 0) {
 201                     UnicodeSet *targets = static_cast<UnicodeSet *>(uhash_get(alreadyIn_, &itemAlreadyIn));
 202                     if (targets == NULL) {
 203                         // alreadyIn.put(itemAlreadyIn, targets = new LinkedHashSet<String>());
 204                         targets = new UnicodeSet();
 205                         uhash_put(alreadyIn_, itemAlreadyIn.clone(), targets, &status);
 206                     }
 207                     targets->add(item);
 208                     break;
 209                 }
 210             }
 211         } else if (item.moveIndex32(0, 1) < item.length() &&  // Label contains more than one code point.
 212                    collatorPrimaryOnly_->compare(item, separated(item)) == 0) {
 213             noDistinctSorting_->add(item);
 214         } else if (!ALPHABETIC->containsSome(item)) {
 215             notAlphabetic_->add(item);
 216         } else {
 217             labelSet.add(item);
 218         }
 219     }
 220
 221     // If we have no labels, hard-code a fallback default set of [A-Z]
 222     // This case can occur with locales that don't have exemplar character data, including root.
 223     // A no-labels situation will cause other problems; it needs to be avoided.
 224     if (labelSet.isEmpty()) {
 225         labelSet.add((UChar32)0x41, (UChar32)0x5A);
 226     }
 227
 228     // Move the set of Labels from the set into a vector, and sort
 229     // according to the collator.
 230
 231     appendUnicodeSetToUVector(*labels_, labelSet, status);
 232     labels_->sortWithUComparator(sortCollateComparator, collatorPrimaryOnly_, status);
 233
 234     // if the result is still too large, cut down to maxLabelCount_ elements, by removing every nth element
 235     //    Implemented by copying the elements to be retained to a new UVector.
 236
 237     const int32_t size = labelSet.size() - 1;
 238     if (size > maxLabelCount_) {
 239         UVector *newLabels = new UVector(status);
 240         newLabels->setDeleter(uprv_deleteUObject);
 241         int32_t count = 0;
 242         int32_t old = -1;
 243         for (int32_t srcIndex=0; srcIndex<labels_->size(); srcIndex++) {
 244             const UnicodeString *str = static_cast<const UnicodeString *>(labels_->elementAt(srcIndex));
 245             ++count;
 246             const int32_t bump = count * maxLabelCount_ / size;
 247             if (bump == old) {
 248                 // it.remove();
 249             } else {
 250                 newLabels->addElement(str->clone(), status);
 251                 old = bump;
 252             }
 253         }
 254         delete labels_;
 255         labels_ = newLabels;
 256     }
 257
 258     // We now know the list of labels.
 259     // Create a corresponding list of buckets, one per label.
 260
 261     buildBucketList(status);    // Corresponds to Java BucketList constructor.
 262
 263     // Bin the Records into the Buckets.
 264     bucketRecords(status);
 265
 266     indexBuildRequired_ = FALSE;
 267     resetBucketIterator(status);
 268 }
 269
 270 //
 271 //  buildBucketList()    Corresponds to the BucketList constructor in the Java version.
 272
 273 void AlphabeticIndex::buildBucketList(UErrorCode &status) {
 274     UnicodeString labelStr = getUnderflowLabel();
 275     Bucket *b = new Bucket(labelStr, *EMPTY_STRING, U_ALPHAINDEX_UNDERFLOW, status);
 276     bucketList_->addElement(b, status);
 277
 278     // Build up the list, adding underflow, additions, overflow
 279     // insert infix labels as needed, using \uFFFF.
 280     const UnicodeString *last = static_cast<UnicodeString *>(labels_->elementAt(0));
 281     b = new Bucket(*last, *last, U_ALPHAINDEX_NORMAL, status);
 282     bucketList_->addElement(b, status);
 283
 284     UnicodeSet lastSet;
 285     UnicodeSet set;
 286     AlphabeticIndex::getScriptSet(lastSet, *last, status);
 287     lastSet.removeAll(*IGNORE_SCRIPTS);
 288
 289     for (int i = 1; i < labels_->size(); ++i) {
 290         UnicodeString *current = static_cast<UnicodeString *>(labels_->elementAt(i));
 291         getScriptSet(set, *current, status);
 292         set.removeAll(*IGNORE_SCRIPTS);
 293         if (lastSet.containsNone(set)) {
 294             // check for adjacent
 295             const UnicodeString &overflowComparisonString = getOverflowComparisonString(*last, status);
 296             if (collatorPrimaryOnly_->compare(overflowComparisonString, *current) < 0) {
 297                 labelStr = getInflowLabel();
 298                 b = new Bucket(labelStr, overflowComparisonString, U_ALPHAINDEX_INFLOW, status);
 299                 bucketList_->addElement(b, status);
 300                 i++;
 301                 lastSet = set;
 302             }
 303         }
 304         b = new Bucket(*current, *current, U_ALPHAINDEX_NORMAL, status);
 305         bucketList_->addElement(b, status);
 306         last = current;
 307         lastSet = set;
 308     }
 309     const UnicodeString &limitString = getOverflowComparisonString(*last, status);
 310     b = new Bucket(getOverflowLabel(), limitString, U_ALPHAINDEX_OVERFLOW, status);
 311     bucketList_->addElement(b, status);
 312     // final overflow bucket
 313 }
 314
 315
 316 //
 317 //   Place all of the raw input records into the correct bucket.
 318 //
 319 //       Begin by sorting the input records; this lets us bin them in a single pass.
 320 //
 321 //       Note on storage management:  The input records are owned by the
 322 //       inputRecords_ vector, and will (eventually) be auto-deleted by it.
 323 //       The Bucket objects have pointers to the Record objects, but do not own them.
 324 //
 325 void AlphabeticIndex::bucketRecords(UErrorCode &status) {
 326     if (U_FAILURE(status)) {
 327         return;
 328     }
 329
 330     inputRecords_->sortWithUComparator(recordCompareFn, collator_, status);
 331     U_ASSERT(bucketList_->size() > 0);   // Should always have at least an overflow
 332                                          //   bucket, even if no user labels.
 333     int32_t bucketIndex = 0;
 334     Bucket *destBucket = static_cast<Bucket *>(bucketList_->elementAt(bucketIndex));
 335     Bucket *nextBucket = NULL;
 336     if (bucketIndex+1 < bucketList_->size()) {
 337         nextBucket = static_cast<Bucket *>(bucketList_->elementAt(bucketIndex+1));
 338     }
 339     int32_t recordIndex = 0;
 340     Record *r = static_cast<Record *>(inputRecords_->elementAt(recordIndex));
 341     while (recordIndex < inputRecords_->size()) {
 342         if (nextBucket == NULL ||
 343             collatorPrimaryOnly_->compare(r->sortingName_, nextBucket->lowerBoundary_) < 0) {
 344                 // Record goes in current bucket.  Advance to next record,
 345                 // stay on current bucket.
 346                 destBucket->records_->addElement(r, status);
 347                 ++recordIndex;
 348                 r = static_cast<Record *>(inputRecords_->elementAt(recordIndex));
 349         } else {
 350             // Advance to the next bucket, stay on current record.
 351             bucketIndex++;
 352             destBucket = nextBucket;
 353             if (bucketIndex+1 < bucketList_->size()) {
 354                 nextBucket = static_cast<Bucket *>(bucketList_->elementAt(bucketIndex+1));
 355             } else {
 356                 nextBucket = NULL;
 357             }
 358             U_ASSERT(destBucket != NULL);
 359         }
 360     }
 361
 362 }
 363
 364
 365 void AlphabeticIndex::getIndexExemplars(UnicodeSet  &dest, const Locale &locale, UErrorCode &status) {
 366     if (U_FAILURE(status)) {
 367         return;
 368     }
 369
 370     LocalULocaleDataPointer uld(ulocdata_open(locale.getName(), &status));
 371     UnicodeSet exemplars;
 372     ulocdata_getExemplarSet(uld.getAlias(), exemplars.toUSet(), 0, ULOCDATA_ES_INDEX, &status);
 373     if (U_SUCCESS(status)) {
 374         dest.addAll(exemplars);
 375         return;
 376     }
 377     status = U_ZERO_ERROR;  // Clear out U_MISSING_RESOURCE_ERROR
 378
 379     // Locale data did not include explicit Index characters.
 380     // Synthesize a set of them from the locale's standard exemplar characters.
 381
 382     ulocdata_getExemplarSet(uld.getAlias(), exemplars.toUSet(), 0, ULOCDATA_ES_STANDARD, &status);
 383     if (U_FAILURE(status)) {
 384         return;
 385     }
 386
 387     // Upper-case any that aren't already so.
 388     //   (We only do this for synthesized index characters.)
 389
 390     UnicodeSetIterator it(exemplars);
 391     UnicodeString upperC;
 392     UnicodeSet  lowersToRemove;
 393     UnicodeSet  uppersToAdd;
 394     while (it.next()) {
 395         const UnicodeString &exemplarC = it.getString();
 396         upperC = exemplarC;
 397         upperC.toUpper(locale);
 398         if (exemplarC != upperC) {
 399             lowersToRemove.add(exemplarC);
 400             uppersToAdd.add(upperC);
 401         }
 402     }
 403     exemplars.removeAll(lowersToRemove);
 404     exemplars.addAll(uppersToAdd);
 405
 406     // get the exemplars, and handle special cases
 407
 408     // question: should we add auxiliary exemplars?
 409     if (exemplars.containsSome(*CORE_LATIN)) {
 410         exemplars.addAll(*CORE_LATIN);
 411     }
 412     if (exemplars.containsSome(*HANGUL)) {
 413         // cut down to small list
 414         UnicodeSet BLOCK_HANGUL_SYLLABLES(UNICODE_STRING_SIMPLE("[:block=hangul_syllables:]"), status);
 415         exemplars.removeAll(BLOCK_HANGUL_SYLLABLES);
 416         exemplars.addAll(*HANGUL);
 417     }
 418     if (exemplars.containsSome(*ETHIOPIC)) {
 419         // cut down to small list
 420         // make use of the fact that Ethiopic is allocated in 8's, where
 421         // the base is 0 mod 8.
 422         UnicodeSetIterator  it(*ETHIOPIC);
 423         while (it.next() && !it.isString()) {
 424             if ((it.getCodepoint() & 0x7) != 0) {
 425                 exemplars.remove(it.getCodepoint());
 426             }
 427         }
 428     }
 429     dest.addAll(exemplars);
 430 }
 431
 432
 433 /*
 434  * Return the string with interspersed CGJs. Input must have more than 2 codepoints.
 435  */
 436 static const UChar32 CGJ = (UChar)0x034F;
 437 UnicodeString AlphabeticIndex::separated(const UnicodeString &item) {
 438     UnicodeString result;
 439     if (item.length() == 0) {
 440         return result;
 441     }
 442     int32_t i = 0;
 443     for (;;) {
 444         UChar32  cp = item.char32At(i);
 445         result.append(cp);
 446         i = item.moveIndex32(i, 1);
 447         if (i >= item.length()) {
 448             break;
 449         }
 450         result.append(CGJ);
 451     }
 452     return result;
 453 }
 454
 455
 456 UBool AlphabeticIndex::operator==(const AlphabeticIndex& /* other */) const {
 457     return FALSE;
 458 }
 459
 460
 461 UBool AlphabeticIndex::operator!=(const AlphabeticIndex& /* other */) const {
 462     return FALSE;
 463 }
 464
 465
 466 const RuleBasedCollator &AlphabeticIndex::getCollator() const {
 467     // There are no known non-RuleBasedCollator collators, and none ever expected.
 468     // But, in case that changes, better a null pointer than a wrong type.
 469     return *dynamic_cast<RuleBasedCollator *>(collator_);
 470 }
 471
 472
 473 const UnicodeString &AlphabeticIndex::getInflowLabel() const {
 474     return inflowLabel_;
 475 }
 476
 477 const UnicodeString &AlphabeticIndex::getOverflowLabel() const {
 478     return overflowLabel_;
 479 }
 480
 481
 482 const UnicodeString &AlphabeticIndex::getUnderflowLabel() const {
 483     return underflowLabel_;
 484 }
 485
 486
 487 AlphabeticIndex &AlphabeticIndex::setInflowLabel(const UnicodeString &label, UErrorCode &/*status*/) {
 488     inflowLabel_ = label;
 489     indexBuildRequired_ = TRUE;
 490     return *this;
 491 }
 492
 493
 494 AlphabeticIndex &AlphabeticIndex::setOverflowLabel(const UnicodeString &label, UErrorCode &/*status*/) {
 495     overflowLabel_ = label;
 496     indexBuildRequired_ = TRUE;
 497     return *this;
 498 }
 499
 500
 501 AlphabeticIndex &AlphabeticIndex::setUnderflowLabel(const UnicodeString &label, UErrorCode &/*status*/) {
 502     underflowLabel_ = label;
 503     indexBuildRequired_ = TRUE;
 504     return *this;
 505 }
 506
 507
 508 int32_t AlphabeticIndex::getMaxLabelCount() const {
 509     return maxLabelCount_;
 510 }
 511
 512
 513 AlphabeticIndex &AlphabeticIndex::setMaxLabelCount(int32_t maxLabelCount, UErrorCode &status) {
 514     if (U_FAILURE(status)) {
 515         return *this;
 516     }
 517     if (maxLabelCount <= 0) {
 518         status = U_ILLEGAL_ARGUMENT_ERROR;
 519         return *this;
 520     }
 521     maxLabelCount_ = maxLabelCount;
 522     if (maxLabelCount < bucketList_->size()) {
 523         indexBuildRequired_ = TRUE;
 524     }
 525     return *this;
 526 }
 527
 528
 529 const UnicodeString &AlphabeticIndex::getOverflowComparisonString(const UnicodeString &lowerLimit, UErrorCode &/*status*/) {
 530     for (int32_t i=0; i<firstScriptCharacters_->size(); i++) {
 531         const UnicodeString *s =
 532                 static_cast<const UnicodeString *>(firstScriptCharacters_->elementAt(i));
 533         if (collator_->compare(*s, lowerLimit) > 0) {
 534             return *s;
 535         }
 536     }
 537     return *EMPTY_STRING;
 538 }
 539
 540 UnicodeSet *AlphabeticIndex::getScriptSet(UnicodeSet &dest, const UnicodeString &codePoint, UErrorCode &status) {
 541     if (U_FAILURE(status)) {
 542         return &dest;
 543     }
 544     UChar32 cp = codePoint.char32At(0);
 545     UScriptCode scriptCode = uscript_getScript(cp, &status);
 546     dest.applyIntPropertyValue(UCHAR_SCRIPT, scriptCode, status);
 547     return &dest;
 548 }
 549
 550 //
 551 //  init() - Common code for constructors.
 552 //
 553
 554 void AlphabeticIndex::init(UErrorCode &status) {
 555     // Initialize statics if needed.
 556     AlphabeticIndex::staticInit(status);
 557
 558     // Put the object into a known state so that the destructor will function.
 559
 560     alreadyIn_             = NULL;
 561     bucketList_            = NULL;
 562     collator_              = NULL;
 563     collatorPrimaryOnly_   = NULL;
 564     currentBucket_         = NULL;
 565     firstScriptCharacters_ = NULL;
 566     initialLabels_         = NULL;
 567     indexBuildRequired_    = TRUE;
 568     inputRecords_          = NULL;
 569     itemsIterIndex_        = 0;
 570     labels_                = NULL;
 571     labelsIterIndex_       = 0;
 572     maxLabelCount_         = 99;
 573     noDistinctSorting_     = NULL;
 574     notAlphabetic_         = NULL;
 575     recordCounter_         = 0;
 576
 577     if (U_FAILURE(status)) {
 578         return;
 579     }
 580     alreadyIn_             = uhash_open(uhash_hashUnicodeString,    // Key Hash,
 581                                         uhash_compareUnicodeString, // key Comparator,
 582                                         NULL,                       // value Comparator
 583                                         &status);
 584     uhash_setKeyDeleter(alreadyIn_, uprv_deleteUObject);
 585     uhash_setValueDeleter(alreadyIn_, uprv_deleteUObject);
 586
 587     bucketList_            = new UVector(status);
 588     bucketList_->setDeleter(alphaIndex_deleteBucket);
 589     labels_                = new UVector(status);
 590     labels_->setDeleter(uprv_deleteUObject);
 591     labels_->setComparer(uhash_compareUnicodeString);
 592     inputRecords_          = new UVector(status);
 593     inputRecords_->setDeleter(alphaIndex_deleteRecord);
 594
 595     noDistinctSorting_     = new UnicodeSet();
 596     notAlphabetic_         = new UnicodeSet();
 597     initialLabels_         = new UnicodeSet();
 598
 599     inflowLabel_.remove();
 600     inflowLabel_.append((UChar)0x2026);    // Ellipsis
 601     overflowLabel_ = inflowLabel_;
 602     underflowLabel_ = inflowLabel_;
 603
 604     // TODO:  check for memory allocation failures.
 605 }
 606
 607
 608 static  UBool  indexCharactersAreInitialized = FALSE;
 609
 610 //  Index Characters Clean up function.  Delete statically allocated constant stuff.
 611 U_CDECL_BEGIN
 612 static UBool U_CALLCONV indexCharacters_cleanup(void) {
 613     AlphabeticIndex::staticCleanup();
 614     return TRUE;
 615 }
 616 U_CDECL_END
 617
 618 void AlphabeticIndex::staticCleanup() {
 619     delete ALPHABETIC;
 620     ALPHABETIC = NULL;
 621     delete HANGUL;
 622     HANGUL = NULL;
 623     delete ETHIOPIC;
 624     ETHIOPIC = NULL;
 625     delete CORE_LATIN;
 626     CORE_LATIN = NULL;
 627     delete IGNORE_SCRIPTS;
 628     IGNORE_SCRIPTS = NULL;
 629     delete TO_TRY;
 630     TO_TRY = NULL;
 631     delete UNIHAN;
 632     UNIHAN = NULL;
 633     delete EMPTY_STRING;
 634     EMPTY_STRING = NULL;
 635     nfkdNormalizer = NULL;  // ref to a singleton.  Do not delete.
 636     indexCharactersAreInitialized = FALSE;
 637 }
 638
 639
 640 UnicodeSet *AlphabeticIndex::ALPHABETIC;
 641 UnicodeSet *AlphabeticIndex::HANGUL;
 642 UnicodeSet *AlphabeticIndex::ETHIOPIC;
 643 UnicodeSet *AlphabeticIndex::CORE_LATIN;
 644 UnicodeSet *AlphabeticIndex::IGNORE_SCRIPTS;
 645 UnicodeSet *AlphabeticIndex::TO_TRY;
 646 UnicodeSet *AlphabeticIndex::UNIHAN;
 647 const UnicodeString *AlphabeticIndex::EMPTY_STRING;
 648
 649 //
 650 //  staticInit()    One-time initialization of constants.
 651 //                  Thread safe.  Called from constructors.
 652 //                  Mutex overhead is not a concern.  AlphabeticIndex constructors are
 653 //                  sufficiently heavy that the cost of the mutex check is not significant.
 654
 655 void AlphabeticIndex::staticInit(UErrorCode &status) {
 656     static UMTX IndexCharsInitMutex;
 657
 658     Mutex mutex(&IndexCharsInitMutex);
 659     if (indexCharactersAreInitialized || U_FAILURE(status)) {
 660         return;
 661     }
 662     UBool finishedInit = FALSE;
 663
 664     {
 665         UnicodeString alphaString = UNICODE_STRING_SIMPLE("[[:alphabetic:]-[:mark:]]");
 666         ALPHABETIC = new UnicodeSet(alphaString, status);
 667         if (ALPHABETIC == NULL) {
 668             goto err;
 669         }
 670
 671         HANGUL = new UnicodeSet();
 672         HANGUL->add(0xAC00).add(0xB098).add(0xB2E4).add(0xB77C).add(0xB9C8).add(0xBC14).add(0xC0AC).
 673                 add(0xC544).add(0xC790).add(0xCC28).add(0xCE74).add(0xD0C0).add(0xD30C).add(0xD558);
 674         if (HANGUL== NULL) {
 675             goto err;
 676         }
 677
 678
 679         UnicodeString EthiopicStr = UNICODE_STRING_SIMPLE("[[:Block=Ethiopic:]&[:Script=Ethiopic:]]");
 680         ETHIOPIC = new UnicodeSet(EthiopicStr, status);
 681         if (ETHIOPIC == NULL) {
 682             goto err;
 683         }
 684
 685         CORE_LATIN = new UnicodeSet((UChar32)0x61, (UChar32)0x7a);  // ('a', 'z');
 686         if (CORE_LATIN == NULL) {
 687             goto err;
 688         }
 689
 690         UnicodeString IgnoreStr= UNICODE_STRING_SIMPLE(
 691                 "[[:sc=Common:][:sc=inherited:][:script=Unknown:][:script=braille:]]");
 692         IGNORE_SCRIPTS = new UnicodeSet(IgnoreStr, status);
 693         IGNORE_SCRIPTS->freeze();
 694         if (IGNORE_SCRIPTS == NULL) {
 695             goto err;
 696         }
 697
 698         UnicodeString nfcqcStr = UNICODE_STRING_SIMPLE("[:^nfcqc=no:]");
 699         TO_TRY = new UnicodeSet(nfcqcStr, status);
 700         if (TO_TRY == NULL) {
 701             goto err;
 702         }
 703
 704         UnicodeString unihanStr = UNICODE_STRING_SIMPLE("[:script=Hani:]");
 705         UNIHAN = new UnicodeSet(unihanStr, status);
 706         if (UNIHAN == NULL) {
 707             goto err;
 708         }
 709
 710         EMPTY_STRING = new UnicodeString();
 711
 712         nfkdNormalizer = Normalizer2::getNFKDInstance(status);
 713         if (nfkdNormalizer == NULL) {
 714             goto err;
 715         }
 716     }
 717     finishedInit = TRUE;
 718
 719   err:
 720     if (!finishedInit && U_SUCCESS(status)) {
 721         status = U_MEMORY_ALLOCATION_ERROR;
 722     }
 723     if (U_FAILURE(status)) {
 724         indexCharacters_cleanup();
 725         return;
 726     }
 727     ucln_i18n_registerCleanup(UCLN_I18N_INDEX_CHARACTERS, indexCharacters_cleanup);
 728     indexCharactersAreInitialized = TRUE;
 729 }
 730
 731
 732 //
 733 //  Comparison function for UVector<UnicodeString *> sorting with a collator.
 734 //
 735 static int32_t U_CALLCONV
 736 sortCollateComparator(const void *context, const void *left, const void *right) {
 737     const UElement *leftElement = static_cast<const UElement *>(left);
 738     const UElement *rightElement = static_cast<const UElement *>(right);
 739     const UnicodeString *leftString  = static_cast<const UnicodeString *>(leftElement->pointer);
 740     const UnicodeString *rightString = static_cast<const UnicodeString *>(rightElement->pointer);
 741     const Collator *col = static_cast<const Collator *>(context);
 742
 743     if (leftString == rightString) {
 744         // Catches case where both are NULL
 745         return 0;
 746     }
 747     if (leftString == NULL) {
 748         return 1;
 749     };
 750     if (rightString == NULL) {
 751         return -1;
 752     }
 753     Collator::EComparisonResult r = col->compare(*leftString, *rightString);
 754     return (int32_t) r;
 755 }
 756
 757 //
 758 //  Comparison function for UVector<Record *> sorting with a collator.
 759 //
 760 static int32_t U_CALLCONV
 761 recordCompareFn(const void *context, const void *left, const void *right) {
 762     const UElement *leftElement = static_cast<const UElement *>(left);
 763     const UElement *rightElement = static_cast<const UElement *>(right);
 764     const AlphabeticIndex::Record *leftRec  = static_cast<const AlphabeticIndex::Record *>(leftElement->pointer);
 765     const AlphabeticIndex::Record *rightRec = static_cast<const AlphabeticIndex::Record *>(rightElement->pointer);
 766     const Collator *col = static_cast<const Collator *>(context);
 767
 768     Collator::EComparisonResult r = col->compare(leftRec->sortingName_, rightRec->sortingName_);
 769     if (r == Collator::EQUAL) {
 770         if (leftRec->serialNumber_ < rightRec->serialNumber_) {
 771             r = Collator::LESS;
 772         } else if (leftRec->serialNumber_ > rightRec->serialNumber_) {
 773             r = Collator::GREATER;
 774         }
 775     }
 776     return (int32_t) r;
 777 }
 778
 779
 780 #if 0
 781 //
 782 //  First characters in scripts.
 783 //  Create a UVector whose contents are pointers to UnicodeStrings for the First Characters in each script.
 784 //  The vector is sorted according to this index's collation.
 785 //
 786 //  This code is too slow to use, so for now hard code the data.
 787 //    Hard coded implementation is follows.
 788 //
 789 UVector *AlphabeticIndex::firstStringsInScript(Collator *ruleBasedCollator, UErrorCode &status) {
 790
 791     if (U_FAILURE(status)) {
 792         return NULL;
 793     }
 794
 795     UnicodeString results[USCRIPT_CODE_LIMIT];
 796     UnicodeString LOWER_A = UNICODE_STRING_SIMPLE("a");
 797
 798     UnicodeSetIterator siter(*TO_TRY);
 799     while (siter.next()) {
 800         const UnicodeString &current = siter.getString();
 801         Collator::EComparisonResult r = ruleBasedCollator->compare(current, LOWER_A);
 802         if (r < 0) {  // TODO fix; we only want "real" script characters, not
 803                       // symbols.
 804             continue;
 805         }
 806
 807         int script = uscript_getScript(current.char32At(0), &status);
 808         if (results[script].length() == 0) {
 809             results[script] = current;
 810         }
 811         else if (ruleBasedCollator->compare(current, results[script]) < 0) {
 812             results[script] = current;
 813         }
 814     }
 815
 816     UnicodeSet extras;
 817     UnicodeSet expansions;
 818     RuleBasedCollator *rbc = dynamic_cast<RuleBasedCollator *>(ruleBasedCollator);
 819     const UCollator *uRuleBasedCollator = rbc->getUCollator();
 820     ucol_getContractionsAndExpansions(uRuleBasedCollator, extras.toUSet(), expansions.toUSet(), true, &status);
 821     extras.addAll(expansions).removeAll(*TO_TRY);
 822     if (extras.size() != 0) {
 823         const Normalizer2 *normalizer = Normalizer2::getNFKCInstance(status);
 824         UnicodeSetIterator extrasIter(extras);
 825         while (extrasIter.next()) {
 826             const UnicodeString &current = extrasIter.next();
 827             if (!TO_TRY->containsAll(current))
 828                 continue;
 829             if (!normalizer->isNormalized(current, status) ||
 830                 ruleBasedCollator->compare(current, LOWER_A) < 0) {
 831                 continue;
 832             }
 833             int script = uscript_getScript(current.char32At(0), &status);
 834             if (results[script].length() == 0) {
 835                 results[script] = current;
 836             } else if (ruleBasedCollator->compare(current, results[script]) < 0) {
 837                 results[script] = current;
 838             }
 839         }
 840     }
 841
 842     UVector *dest = new UVector(status);
 843     dest->setDeleter(uprv_deleteUObject);
 844     for (uint32_t i = 0; i < sizeof(results) / sizeof(results[0]); ++i) {
 845         if (results[i].length() > 0) {
 846             dest->addElement(results[i].clone(), status);
 847         }
 848     }
 849     dest->sortWithUComparator(sortCollateComparator, ruleBasedCollator, status);
 850     return dest;
 851 }
 852 #endif
 853
 854
 855 //
 856 //  First characters in scripts.
 857 //  Create a UVector whose contents are pointers to UnicodeStrings for the First Characters in each script.
 858 //  The vector is sorted according to this index's collation.
 859 //
 860 //  It takes too much time to compute this from character properties, so hard code it for now.
 861 //  Character constants copied from corresponding declaration in ICU4J.
 862 //  See main/classes/collate/src/com/ibm/icu/text/AlphabeticIndex.java
 863
 864 static UChar HACK_FIRST_CHARS_IN_SCRIPTS[] =  { 0x61, 0, 0x03B1, 0,
 865             0x2C81, 0, 0x0430, 0, 0x2C30, 0, 0x10D0, 0, 0x0561, 0, 0x05D0, 0, 0xD802, 0xDD00, 0, 0x0800, 0, 0x0621, 0, 0x0710, 0,
 866             0x0780, 0, 0x07CA, 0, 0x2D30, 0, 0x1200, 0, 0x0950, 0, 0x0985, 0, 0x0A74, 0, 0x0AD0, 0, 0x0B05, 0, 0x0BD0, 0,
 867             0x0C05, 0, 0x0C85, 0, 0x0D05, 0, 0x0D85, 0,
 868             0xAAF2, 0,  // Meetei Mayek
 869             0xA800, 0, 0xA882, 0, 0xD804, 0xDC83, 0,
 870             U16_LEAD(0x111C4), U16_TRAIL(0x111C4), 0,  // Sharada
 871             U16_LEAD(0x11680), U16_TRAIL(0x11680), 0,  // Takri
 872             0x1B83, 0,
 873             0xD802, 0xDE00, 0, 0x0E01, 0,
 874             0x0EDE, 0,  // Lao
 875             0xAA80, 0, 0x0F40, 0, 0x1C00, 0, 0xA840, 0, 0x1900, 0, 0x1700, 0, 0x1720, 0,
 876             0x1740, 0, 0x1760, 0, 0x1A00, 0, 0xA930, 0, 0xA90A, 0, 0x1000, 0,
 877             U16_LEAD(0x11103), U16_TRAIL(0x11103), 0,  // Chakma
 878             0x1780, 0, 0x1950, 0, 0x1980, 0, 0x1A20, 0,
 879             0xAA00, 0, 0x1B05, 0, 0xA984, 0, 0x1880, 0, 0x1C5A, 0, 0x13A0, 0, 0x1401, 0, 0x1681, 0, 0x16A0, 0, 0xD803, 0xDC00, 0,
 880             0xA500, 0, 0xA6A0, 0, 0x1100, 0, 0x3041, 0, 0x30A1, 0, 0x3105, 0, 0xA000, 0, 0xA4F8, 0,
 881             U16_LEAD(0x16F00), U16_TRAIL(0x16F00), 0,  // Miao
 882             0xD800, 0xDE80, 0,
 883             0xD800, 0xDEA0, 0, 0xD802, 0xDD20, 0, 0xD800, 0xDF00, 0, 0xD800, 0xDF30, 0, 0xD801, 0xDC28, 0, 0xD801, 0xDC50, 0,
 884             0xD801, 0xDC80, 0,
 885             U16_LEAD(0x110D0), U16_TRAIL(0x110D0), 0,  // Sora Sompeng
 886             0xD800, 0xDC00, 0, 0xD802, 0xDC00, 0, 0xD802, 0xDE60, 0, 0xD802, 0xDF00, 0, 0xD802, 0xDC40, 0,
 887             0xD802, 0xDF40, 0, 0xD802, 0xDF60, 0, 0xD800, 0xDF80, 0, 0xD800, 0xDFA0, 0, 0xD808, 0xDC00, 0, 0xD80C, 0xDC00, 0,
 888             U16_LEAD(0x109A0), U16_TRAIL(0x109A0), 0,  // Meroitic Cursive
 889             U16_LEAD(0x10980), U16_TRAIL(0x10980), 0,  // Meroitic Hieroglyphs
 890             0x4E00, 0 };
 891
 892 UVector *AlphabeticIndex::firstStringsInScript(UErrorCode &status) {
 893     if (U_FAILURE(status)) {
 894         return NULL;
 895     }
 896     UVector *dest = new UVector(status);
 897     if (dest == NULL) {
 898         if (U_SUCCESS(status)) {
 899             status = U_MEMORY_ALLOCATION_ERROR;
 900         }
 901         return NULL;
 902     }
 903     dest->setDeleter(uprv_deleteUObject);
 904     const UChar *src  = HACK_FIRST_CHARS_IN_SCRIPTS;
 905     const UChar *limit = src + sizeof(HACK_FIRST_CHARS_IN_SCRIPTS) / sizeof(HACK_FIRST_CHARS_IN_SCRIPTS[0]);
 906     do {
 907         if (U_FAILURE(status)) {
 908             return dest;
 909         }
 910         UnicodeString *str = new UnicodeString(src, -1);
 911         if (str == NULL) {
 912             status = U_MEMORY_ALLOCATION_ERROR;
 913         } else {
 914             dest->addElement(str, status);
 915             src += str->length() + 1;
 916         }
 917     } while (src < limit);
 918     dest->sortWithUComparator(sortCollateComparator, collator_, status);
 919     return dest;
 920 }
 921
 922
 923 AlphabeticIndex::ELangType AlphabeticIndex::langTypeFromLocale(const Locale &loc) {
 924     const char *lang = loc.getLanguage();
 925     if (uprv_strcmp(lang, "zh") != 0) {
 926         return kNormal;
 927     }
 928     const char *script = loc.getScript();
 929     if (uprv_strcmp(script, "Hant") == 0) {
 930         return kTraditional;
 931     }
 932     const char *country = loc.getCountry();
 933     if (uprv_strcmp(country, "TW") == 0) {
 934         return kTraditional;
 935     }
 936     return kSimplified;
 937 }
 938
 939
 940 //
 941 // Pinyin Hacks.  Direct port from Java.
 942 //
 943
 944 static const UChar32  probeCharInLong = 0x28EAD;
 945
 946
 947 static const UChar PINYIN_LOWER_BOUNDS_SHORT[] = {      // "\u0101bcd\u0113fghjkl\u1E3F\u0144\u014Dpqrstwxyz"
 948             0x0101, 0x62, 0x63, 0x64, 0x0113, 0x66, 0x67, 0x68, 0x6A, 0x6B, /*l*/0x6C, 0x1E3F, 0x0144, 0x014D,
 949             /*p*/0x70, 0x71, 0x72, 0x73, 0x74, /*w*/0x77, 0x78, 0x79, 0x7A};
 950
 951
 952 // Pinyin lookup tables copied, pasted (and reformatted) from the ICU4J code.
 953
 954 AlphabeticIndex::PinyinLookup AlphabeticIndex::HACK_PINYIN_LOOKUP_SHORT = {
 955         {(UChar)0,      (UChar)0, (UChar)0}, // A
 956         {(UChar)0x516B, (UChar)0, (UChar)0}, // B
 957         {(UChar)0x5693, (UChar)0, (UChar)0}, // C
 958         {(UChar)0x5491, (UChar)0, (UChar)0}, // D
 959         {(UChar)0x59B8, (UChar)0, (UChar)0}, // E
 960         {(UChar)0x53D1, (UChar)0, (UChar)0}, // F
 961         {(UChar)0x65EE, (UChar)0, (UChar)0}, // G
 962         {(UChar)0x54C8, (UChar)0, (UChar)0}, // H
 963         {(UChar)0x4E0C, (UChar)0, (UChar)0}, // J
 964         {(UChar)0x5494, (UChar)0, (UChar)0}, // K
 965         {(UChar)0x5783, (UChar)0, (UChar)0}, // L
 966         {(UChar)0x5452, (UChar)0, (UChar)0}, // M
 967         {(UChar)0x5514, (UChar)0, (UChar)0}, // N
 968         {(UChar)0x5594, (UChar)0, (UChar)0}, // O
 969         {(UChar)0x5991, (UChar)0, (UChar)0}, // P
 970         {(UChar)0x4E03, (UChar)0, (UChar)0}, // Q
 971         {(UChar)0x513F, (UChar)0, (UChar)0}, // R
 972         {(UChar)0x4EE8, (UChar)0, (UChar)0}, // S
 973         {(UChar)0x4ED6, (UChar)0, (UChar)0}, // T
 974         {(UChar)0x7A75, (UChar)0, (UChar)0}, // W
 975         {(UChar)0x5915, (UChar)0, (UChar)0}, // X
 976         {(UChar)0x4E2B, (UChar)0, (UChar)0}, // Y
 977         {(UChar)0x5E00, (UChar)0, (UChar)0}, // Z
 978         {(UChar)0xFFFF, (UChar)0, (UChar)0}, // mark end of array
 979     };
 980
 981 static const UChar PINYIN_LOWER_BOUNDS_LONG[] = {   // "\u0101bcd\u0113fghjkl\u1E3F\u0144\u014Dpqrstwxyz";
 982             0x0101, 0x62, 0x63, 0x64, 0x0113, 0x66, 0x67, 0x68, 0x6A, 0x6B, /*l*/0x6C, 0x1E3F, 0x0144, 0x014D,
 983             /*p*/0x70, 0x71, 0x72, 0x73, 0x74, /*w*/0x77, 0x78, 0x79, 0x7A};
 984
 985 AlphabeticIndex::PinyinLookup AlphabeticIndex::HACK_PINYIN_LOOKUP_LONG = {
 986         {(UChar)0,      (UChar)0,      (UChar)0}, // A
 987         {(UChar)0x516B, (UChar)0,      (UChar)0}, // b
 988         {(UChar)0xD863, (UChar)0xDEAD, (UChar)0}, // c
 989         {(UChar)0xD844, (UChar)0xDE51, (UChar)0}, // d
 990         {(UChar)0x59B8, (UChar)0,      (UChar)0}, // e
 991         {(UChar)0x53D1, (UChar)0,      (UChar)0}, // f
 992         {(UChar)0xD844, (UChar)0xDE45, (UChar)0}, // g
 993         {(UChar)0x54C8, (UChar)0,      (UChar)0}, // h
 994         {(UChar)0x4E0C, (UChar)0,      (UChar)0}, // j
 995         {(UChar)0x5494, (UChar)0,      (UChar)0}, // k
 996         {(UChar)0x3547, (UChar)0,      (UChar)0}, // l
 997         {(UChar)0x5452, (UChar)0,      (UChar)0}, // m
 998         {(UChar)0x5514, (UChar)0,      (UChar)0}, // n
 999         {(UChar)0x5594, (UChar)0,      (UChar)0}, // o
1000         {(UChar)0xD84F, (UChar)0xDC7A, (UChar)0}, // p
1001         {(UChar)0x4E03, (UChar)0,      (UChar)0}, // q
1002         {(UChar)0x513F, (UChar)0,      (UChar)0}, // r
1003         {(UChar)0x4EE8, (UChar)0,      (UChar)0}, // s
1004         {(UChar)0x4ED6, (UChar)0,      (UChar)0}, // t
1005         {(UChar)0x7A75, (UChar)0,      (UChar)0}, // w
1006         {(UChar)0x5915, (UChar)0,      (UChar)0}, // x
1007         {(UChar)0x4E2B, (UChar)0,      (UChar)0}, // y
1008         {(UChar)0x5E00, (UChar)0,      (UChar)0}, // z
1009         {(UChar)0xFFFF, (UChar)0,      (UChar)0}, // mark end of array
1010     };
1011
1012
1013 //
1014 //  Probe the collation data, and decide which Pinyin tables should be used
1015 //
1016 //  ICU can be built with a choice between two Chinese collations.
1017 //  The hack Pinyin tables to use depend on which one is in use.
1018 //  We can assume that any given copy of ICU will have only one of the collations available,
1019 //  and that there is no way, in a given process, to create two alphabetic indexes using
1020 //  different Chinese collations.  Which means the probe can be done once
1021 //  and the results cached.
1022 //
1023 //  This whole arrangement is temporary.
1024 //
1025 AlphabeticIndex::PinyinLookup *AlphabeticIndex::HACK_PINYIN_LOOKUP  = NULL;
1026 const UChar  *AlphabeticIndex::PINYIN_LOWER_BOUNDS = NULL;
1027
1028 void AlphabeticIndex::initPinyinBounds(const Collator *col, UErrorCode &status) {
1029     {
1030         Mutex m;
1031         if (PINYIN_LOWER_BOUNDS != NULL) {
1032             return;
1033         }
1034     }
1035     UnicodeSet *colSet = col->getTailoredSet(status);
1036     if (U_FAILURE(status) || colSet == NULL) {
1037         delete colSet;
1038         if (U_SUCCESS(status))  {
1039             status = U_MEMORY_ALLOCATION_ERROR;
1040         }
1041         return;
1042     }
1043     UBool useLongTables = colSet->contains(probeCharInLong);
1044     delete colSet;
1045     {
1046         Mutex m;
1047         if (useLongTables) {
1048             PINYIN_LOWER_BOUNDS = PINYIN_LOWER_BOUNDS_LONG;
1049             HACK_PINYIN_LOOKUP  = &HACK_PINYIN_LOOKUP_LONG;
1050         } else {
1051             PINYIN_LOWER_BOUNDS = PINYIN_LOWER_BOUNDS_SHORT;
1052             HACK_PINYIN_LOOKUP  = &HACK_PINYIN_LOOKUP_SHORT;
1053         }
1054     }
1055 }
1056
1057 // Pinyin Hack:
1058 //    Modify a Chinese name by prepending a Latin letter.  The modified name is used
1059 //      when putting records (names) into buckets, to put the name under a Latin index heading.
1060
1061 void AlphabeticIndex::hackName(UnicodeString &dest, const UnicodeString &name, const Collator *col) {
1062
1063     if (langType_ != kSimplified || !UNIHAN->contains(name.char32At(0))) {
1064         dest = name;
1065         return;
1066     }
1067
1068     UErrorCode status = U_ZERO_ERROR;
1069     initPinyinBounds(col, status);
1070     if (U_FAILURE(status)) {
1071         dest = name;
1072         return;
1073     }
1074     // TODO:  use binary search
1075     int index;
1076     for (index=0; ; index++) {
1077         if ((*HACK_PINYIN_LOOKUP)[index][0] == (UChar)0xffff) {
1078             index--;
1079             break;
1080         }
1081         int32_t compareResult = col->compare(name, UnicodeString(TRUE, (*HACK_PINYIN_LOOKUP)[index], -1));
1082         if (compareResult < 0) {
1083             index--;
1084         }
1085         if (compareResult <= 0) {
1086             break;
1087         }
1088     }
1089     UChar c = PINYIN_LOWER_BOUNDS[index];
1090     dest.setTo(c);
1091     dest.append(name);
1092     return;
1093 }
1094
1095
1096
1097 /**
1098  * Comparator that returns "better" items first, where shorter NFKD is better, and otherwise NFKD binary order is
1099  * better, and otherwise binary order is better.
1100  *
1101  * For use with array sort or UVector.
1102  * @param context  A UErrorCode pointer.
1103  * @param left     A UElement pointer, which must refer to a UnicodeString *
1104  * @param right    A UElement pointer, which must refer to a UnicodeString *
1105  */
1106
1107 static int32_t U_CALLCONV
1108 PreferenceComparator(const void *context, const void *left, const void *right) {
1109     const UElement *leftElement  = static_cast<const UElement *>(left);
1110     const UElement *rightElement = static_cast<const UElement *>(right);
1111     const UnicodeString *s1  = static_cast<const UnicodeString *>(leftElement->pointer);
1112     const UnicodeString *s2  = static_cast<const UnicodeString *>(rightElement->pointer);
1113     UErrorCode &status       = *(UErrorCode *)(context);   // Cast off both static and const.
1114     if (s1 == s2) {
1115         return 0;
1116     }
1117
1118     UnicodeString n1 = nfkdNormalizer->normalize(*s1, status);
1119     UnicodeString n2 = nfkdNormalizer->normalize(*s2, status);
1120     int32_t result = n1.length() - n2.length();
1121     if (result != 0) {
1122         return result;
1123     }
1124
1125     result = n1.compareCodePointOrder(n2);
1126     if (result != 0) {
1127         return result;
1128     }
1129     return s1->compareCodePointOrder(*s2);
1130 }
1131
1132
1133 //
1134 //  Constructor & Destructor for AlphabeticIndex::Record
1135 //
1136 //     Records are internal only, instances are not directly surfaced in the public API.
1137 //     This class is mostly struct-like, with all public fields.
1138
1139 AlphabeticIndex::Record::Record(AlphabeticIndex *alphaIndex, const UnicodeString &name, const void *data):
1140     alphaIndex_(alphaIndex), name_(name), data_(data)
1141 {
1142     UnicodeString prefixedName;
1143     alphaIndex->hackName(sortingName_, name_, alphaIndex->collatorPrimaryOnly_);
1144     serialNumber_ = ++alphaIndex->recordCounter_;
1145 }
1146
1147 AlphabeticIndex::Record::~Record() {
1148 }
1149
1150
1151 AlphabeticIndex & AlphabeticIndex::addRecord(const UnicodeString &name, const void *data, UErrorCode &status) {
1152     if (U_FAILURE(status)) {
1153         return *this;
1154     }
1155     Record *r = new Record(this, name, data);
1156     inputRecords_->addElement(r, status);
1157     indexBuildRequired_ = TRUE;
1158     //std::string ss;
1159     //std::string ss2;
1160     //std::cout << "added record: name = \"" << r->name_.toUTF8String(ss) << "\"" <<
1161     //             "   sortingName = \"" << r->sortingName_.toUTF8String(ss2) << "\"" << std::endl;
1162     return *this;
1163 }
1164
1165
1166 AlphabeticIndex &AlphabeticIndex::clearRecords(UErrorCode &status) {
1167     if (U_FAILURE(status)) {
1168         return *this;
1169     }
1170     inputRecords_->removeAllElements();
1171     indexBuildRequired_ = TRUE;
1172     return *this;
1173 }
1174
1175
1176 int32_t AlphabeticIndex::getBucketIndex(const UnicodeString &name, UErrorCode &status) {
1177     buildIndex(status);
1178     if (U_FAILURE(status)) {
1179         return 0;
1180     }
1181
1182     // For simplified Chinese prepend a prefix to the name.
1183     //   For non-Chinese locales or non-Chinese names, the name is not modified.
1184
1185     UnicodeString prefixedName;
1186     hackName(prefixedName, name, collatorPrimaryOnly_);
1187
1188     // TODO:  use a binary search.
1189     for (int32_t i = 0; i < bucketList_->size(); ++i) {
1190         Bucket *bucket = static_cast<Bucket *>(bucketList_->elementAt(i));
1191         Collator::EComparisonResult comp = collatorPrimaryOnly_->compare(prefixedName, bucket->lowerBoundary_);
1192         if (comp < 0) {
1193             return i - 1;
1194         }
1195     }
1196     // Loop runs until we find the bucket following the one that would hold prefixedName.
1197     // If the prefixedName belongs in the last bucket the loop will drop out the bottom rather
1198     //  than returning from the middle.
1199
1200     return bucketList_->size() - 1;
1201 }
1202
1203
1204 int32_t AlphabeticIndex::getBucketIndex() const {
1205     return labelsIterIndex_;
1206 }
1207
1208
1209 UBool AlphabeticIndex::nextBucket(UErrorCode &status) {
1210     if (U_FAILURE(status)) {
1211         return FALSE;
1212     }
1213     if (indexBuildRequired_ && currentBucket_ != NULL) {
1214         status = U_ENUM_OUT_OF_SYNC_ERROR;
1215         return FALSE;
1216     }
1217     buildIndex(status);
1218     if (U_FAILURE(status)) {
1219         return FALSE;
1220     }
1221     ++labelsIterIndex_;
1222     if (labelsIterIndex_ >= bucketList_->size()) {
1223         labelsIterIndex_ = bucketList_->size();
1224         return FALSE;
1225     }
1226     currentBucket_ = static_cast<Bucket *>(bucketList_->elementAt(labelsIterIndex_));
1227     resetRecordIterator();
1228     return TRUE;
1229 }
1230
1231 const UnicodeString &AlphabeticIndex::getBucketLabel() const {
1232     if (currentBucket_ != NULL) {
1233         return currentBucket_->label_;
1234     } else {
1235         return *EMPTY_STRING;
1236     }
1237 }
1238
1239
1240 UAlphabeticIndexLabelType AlphabeticIndex::getBucketLabelType() const {
1241     if (currentBucket_ != NULL) {
1242         return currentBucket_->labelType_;
1243     } else {
1244         return U_ALPHAINDEX_NORMAL;
1245     }
1246 }
1247
1248
1249 int32_t AlphabeticIndex::getBucketRecordCount() const {
1250     if (currentBucket_ != NULL) {
1251         return currentBucket_->records_->size();
1252     } else {
1253         return 0;
1254     }
1255 }
1256
1257
1258 AlphabeticIndex &AlphabeticIndex::resetBucketIterator(UErrorCode &status) {
1259     if (U_FAILURE(status)) {
1260         return *this;
1261     }
1262     buildIndex(status);
1263     labelsIterIndex_ = -1;
1264     currentBucket_ = NULL;
1265     return *this;
1266 }
1267
1268
1269 UBool AlphabeticIndex::nextRecord(UErrorCode &status) {
1270     if (U_FAILURE(status)) {
1271         return FALSE;
1272     }
1273     if (currentBucket_ == NULL) {
1274         // We are trying to iterate over the items in a bucket, but there is no
1275         // current bucket from the enumeration of buckets.
1276         status = U_INVALID_STATE_ERROR;
1277         return FALSE;
1278     }
1279     if (indexBuildRequired_) {
1280         status = U_ENUM_OUT_OF_SYNC_ERROR;
1281         return FALSE;
1282     }
1283     ++itemsIterIndex_;
1284     if (itemsIterIndex_ >= currentBucket_->records_->size()) {
1285         itemsIterIndex_  = currentBucket_->records_->size();
1286         return FALSE;
1287     }
1288     return TRUE;
1289 }
1290
1291
1292 const UnicodeString &AlphabeticIndex::getRecordName() const {
1293     const UnicodeString *retStr = EMPTY_STRING;
1294     if (currentBucket_ != NULL &&
1295         itemsIterIndex_ >= 0 &&
1296         itemsIterIndex_ < currentBucket_->records_->size()) {
1297             Record *item = static_cast<Record *>(currentBucket_->records_->elementAt(itemsIterIndex_));
1298             retStr = &item->name_;
1299     }
1300     return *retStr;
1301 }
1302
1303 const void *AlphabeticIndex::getRecordData() const {
1304     const void *retPtr = NULL;
1305     if (currentBucket_ != NULL &&
1306         itemsIterIndex_ >= 0 &&
1307         itemsIterIndex_ < currentBucket_->records_->size()) {
1308             Record *item = static_cast<Record *>(currentBucket_->records_->elementAt(itemsIterIndex_));
1309             retPtr = item->data_;
1310     }
1311     return retPtr;
1312 }
1313
1314
1315 AlphabeticIndex & AlphabeticIndex::resetRecordIterator() {
1316     itemsIterIndex_ = -1;
1317     return *this;
1318 }
1319
1320
1321
1322 AlphabeticIndex::Bucket::Bucket(const UnicodeString &label,
1323                                 const UnicodeString &lowerBoundary,
1324                                 UAlphabeticIndexLabelType type,
1325                                 UErrorCode &status):
1326          label_(label), lowerBoundary_(lowerBoundary), labelType_(type), records_(NULL) {
1327     if (U_FAILURE(status)) {
1328         return;
1329     }
1330     records_ = new UVector(status);
1331     if (records_ == NULL && U_SUCCESS(status)) {
1332         status = U_MEMORY_ALLOCATION_ERROR;
1333     }
1334 }
1335
1336
1337 AlphabeticIndex::Bucket::~Bucket() {
1338     delete records_;
1339 }
1340
1341 U_NAMESPACE_END
1342
1343 #endif