/*
*******************************************************************************
-* Copyright (C) 2009-2012, International Business Machines Corporation and *
-* others. All Rights Reserved. *
+* Copyright (C) 2009-2013, International Business Machines Corporation and
+* others. All Rights Reserved.
*******************************************************************************
*/
-/**
- * \file
- * \brief C API: AlphabeticIndex class
- */
-
#include "unicode/utypes.h"
#if !UCONFIG_NO_COLLATION && !UCONFIG_NO_NORMALIZATION
#include "unicode/alphaindex.h"
+#include "unicode/coleitr.h"
#include "unicode/coll.h"
+#include "unicode/localpointer.h"
#include "unicode/normalizer2.h"
-#include "unicode/strenum.h"
#include "unicode/tblcoll.h"
#include "unicode/ulocdata.h"
#include "unicode/uniset.h"
#include "unicode/uobject.h"
-#include "unicode/uscript.h"
#include "unicode/usetiter.h"
-#include "unicode/ustring.h"
#include "unicode/utf16.h"
+#include "cmemory.h"
#include "cstring.h"
-#include "mutex.h"
#include "uassert.h"
-#include "ucln_in.h"
-#include "uhash.h"
#include "uvector.h"
//#include <string>
//#include <iostream>
+
+#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
+
U_NAMESPACE_BEGIN
-UOBJECT_DEFINE_NO_RTTI_IMPLEMENTATION(AlphabeticIndex)
+namespace {
-// Forward Declarations
-static int32_t U_CALLCONV
-PreferenceComparator(const void *context, const void *left, const void *right);
+/**
+ * Prefix string for Chinese index buckets.
+ * See http://unicode.org/repos/cldr/trunk/specs/ldml/tr35-collation.html#Collation_Indexes
+ */
+const UChar BASE[1] = { 0xFDD0 };
+const int32_t BASE_LENGTH = 1;
+
+UBool isOneLabelBetterThanOther(const Normalizer2 &nfkdNormalizer,
+ const UnicodeString &one, const UnicodeString &other);
+
+} // namespace
static int32_t U_CALLCONV
-sortCollateComparator(const void *context, const void *left, const void *right);
+collatorComparator(const void *context, const void *left, const void *right);
static int32_t U_CALLCONV
recordCompareFn(const void *context, const void *left, const void *right);
-// UVector<Bucket *> support function, delete a Bucket.
-static void U_CALLCONV
-alphaIndex_deleteBucket(void *obj) {
- delete static_cast<AlphabeticIndex::Bucket *>(obj);
-}
-
// UVector<Record *> support function, delete a Record.
static void U_CALLCONV
alphaIndex_deleteRecord(void *obj) {
delete static_cast<AlphabeticIndex::Record *>(obj);
}
+namespace {
+UnicodeString *ownedString(const UnicodeString &s, LocalPointer<UnicodeString> &owned,
+ UErrorCode &errorCode) {
+ if (U_FAILURE(errorCode)) { return NULL; }
+ if (owned.isValid()) {
+ return owned.orphan();
+ }
+ UnicodeString *p = new UnicodeString(s);
+ if (p == NULL) {
+ errorCode = U_MEMORY_ALLOCATION_ERROR;
+ }
+ return p;
+}
-static const Normalizer2 *nfkdNormalizer;
+inline UnicodeString *getString(const UVector &list, int32_t i) {
+ return static_cast<UnicodeString *>(list[i]);
+}
-//
-// Append the contents of a UnicodeSet to a UVector of UnicodeStrings.
-// Append everything - individual characters are handled as strings of length 1.
-// The destination vector owns the appended strings.
+inline AlphabeticIndex::Bucket *getBucket(const UVector &list, int32_t i) {
+ return static_cast<AlphabeticIndex::Bucket *>(list[i]);
+}
-static void appendUnicodeSetToUVector(UVector &dest, const UnicodeSet &source, UErrorCode &status) {
- UnicodeSetIterator setIter(source);
- while (setIter.next()) {
- const UnicodeString &str = setIter.getString();
- dest.addElement(str.clone(), status);
+inline AlphabeticIndex::Record *getRecord(const UVector &list, int32_t i) {
+ return static_cast<AlphabeticIndex::Record *>(list[i]);
+}
+
+/**
+ * Like Java Collections.binarySearch(List, String, Comparator).
+ *
+ * @return the index>=0 where the item was found,
+ * or the index<0 for inserting the string at ~index in sorted order
+ */
+int32_t binarySearch(const UVector &list, const UnicodeString &s, const Collator &coll) {
+ if (list.size() == 0) { return ~0; }
+ int32_t start = 0;
+ int32_t limit = list.size();
+ for (;;) {
+ int32_t i = (start + limit) / 2;
+ const UnicodeString *si = static_cast<UnicodeString *>(list.elementAt(i));
+ UErrorCode errorCode = U_ZERO_ERROR;
+ UCollationResult cmp = coll.compare(s, *si, errorCode);
+ if (cmp == UCOL_EQUAL) {
+ return i;
+ } else if (cmp < 0) {
+ if (i == start) {
+ return ~start; // insert s before *si
+ }
+ limit = i;
+ } else {
+ if (i == start) {
+ return ~(start + 1); // insert s after *si
+ }
+ start = i;
+ }
}
}
+} // namespace
+
+// The BucketList is not in the anonymous namespace because only Clang
+// seems to support its use in other classes from there.
+// However, we also don't need U_I18N_API because it is not used from outside the i18n library.
+class BucketList : public UObject {
+public:
+ BucketList(UVector *bucketList, UVector *publicBucketList)
+ : bucketList_(bucketList), immutableVisibleList_(publicBucketList) {
+ int32_t displayIndex = 0;
+ for (int32_t i = 0; i < publicBucketList->size(); ++i) {
+ getBucket(*publicBucketList, i)->displayIndex_ = displayIndex++;
+ }
+ }
-AlphabeticIndex::AlphabeticIndex(const Locale &locale, UErrorCode &status) {
- init(status);
- if (U_FAILURE(status)) {
- return;
+ // The virtual destructor must not be inline.
+ // See ticket #8454 for details.
+ virtual ~BucketList();
+
+ int32_t getBucketCount() const {
+ return immutableVisibleList_->size();
}
- locale_ = locale;
- langType_ = langTypeFromLocale(locale_);
- collator_ = Collator::createInstance(locale, status);
- if (collator_ != NULL) {
- collatorPrimaryOnly_ = collator_->clone();
+ int32_t getBucketIndex(const UnicodeString &name, const Collator &collatorPrimaryOnly,
+ UErrorCode &errorCode) {
+ // binary search
+ int32_t start = 0;
+ int32_t limit = bucketList_->size();
+ while ((start + 1) < limit) {
+ int32_t i = (start + limit) / 2;
+ const AlphabeticIndex::Bucket *bucket = getBucket(*bucketList_, i);
+ UCollationResult nameVsBucket =
+ collatorPrimaryOnly.compare(name, bucket->lowerBoundary_, errorCode);
+ if (nameVsBucket < 0) {
+ limit = i;
+ } else {
+ start = i;
+ }
+ }
+ const AlphabeticIndex::Bucket *bucket = getBucket(*bucketList_, start);
+ if (bucket->displayBucket_ != NULL) {
+ bucket = bucket->displayBucket_;
+ }
+ return bucket->displayIndex_;
}
- if (collatorPrimaryOnly_ != NULL) {
- collatorPrimaryOnly_->setStrength(Collator::PRIMARY);
+
+ /** All of the buckets, visible and invisible. */
+ UVector *bucketList_;
+ /** Just the visible buckets. */
+ UVector *immutableVisibleList_;
+};
+
+BucketList::~BucketList() {
+ delete bucketList_;
+ if (immutableVisibleList_ != bucketList_) {
+ delete immutableVisibleList_;
}
- getIndexExemplars(*initialLabels_, locale, status);
- indexBuildRequired_ = TRUE;
- if ((collator_ == NULL || collatorPrimaryOnly_ == NULL) && U_SUCCESS(status)) {
- status = U_MEMORY_ALLOCATION_ERROR;
+}
+
+AlphabeticIndex::ImmutableIndex::~ImmutableIndex() {
+ delete buckets_;
+ delete collatorPrimaryOnly_;
+}
+
+int32_t
+AlphabeticIndex::ImmutableIndex::getBucketCount() const {
+ return buckets_->getBucketCount();
+}
+
+int32_t
+AlphabeticIndex::ImmutableIndex::getBucketIndex(
+ const UnicodeString &name, UErrorCode &errorCode) const {
+ return buckets_->getBucketIndex(name, *collatorPrimaryOnly_, errorCode);
+}
+
+const AlphabeticIndex::Bucket *
+AlphabeticIndex::ImmutableIndex::getBucket(int32_t index) const {
+ if (0 <= index && index < buckets_->getBucketCount()) {
+ return icu::getBucket(*buckets_->immutableVisibleList_, index);
+ } else {
+ return NULL;
}
- firstScriptCharacters_ = firstStringsInScript(status);
}
+AlphabeticIndex::AlphabeticIndex(const Locale &locale, UErrorCode &status)
+ : inputList_(NULL),
+ labelsIterIndex_(-1), itemsIterIndex_(0), currentBucket_(NULL),
+ maxLabelCount_(99),
+ initialLabels_(NULL), firstCharsInScripts_(NULL),
+ collator_(NULL), collatorPrimaryOnly_(NULL),
+ buckets_(NULL) {
+ init(&locale, status);
+}
+
+
+AlphabeticIndex::AlphabeticIndex(RuleBasedCollator *collator, UErrorCode &status)
+ : inputList_(NULL),
+ labelsIterIndex_(-1), itemsIterIndex_(0), currentBucket_(NULL),
+ maxLabelCount_(99),
+ initialLabels_(NULL), firstCharsInScripts_(NULL),
+ collator_(collator), collatorPrimaryOnly_(NULL),
+ buckets_(NULL) {
+ init(NULL, status);
+}
+
+
AlphabeticIndex::~AlphabeticIndex() {
- uhash_close(alreadyIn_);
- delete bucketList_;
delete collator_;
delete collatorPrimaryOnly_;
- delete firstScriptCharacters_;
- delete labels_;
- delete inputRecords_;
- delete noDistinctSorting_;
- delete notAlphabetic_;
+ delete firstCharsInScripts_;
+ delete buckets_;
+ delete inputList_;
delete initialLabels_;
}
return *this;
}
initialLabels_->addAll(additions);
+ clearBuckets();
return *this;
}
AlphabeticIndex &AlphabeticIndex::addLabels(const Locale &locale, UErrorCode &status) {
- if (U_FAILURE(status)) {
- return *this;
- }
- UnicodeSet additions;
- getIndexExemplars(additions, locale, status);
- initialLabels_->addAll(additions);
+ addIndexExemplars(locale, status);
+ clearBuckets();
return *this;
}
+AlphabeticIndex::ImmutableIndex *AlphabeticIndex::buildImmutableIndex(UErrorCode &errorCode) {
+ if (U_FAILURE(errorCode)) { return NULL; }
+ // In C++, the ImmutableIndex must own its copy of the BucketList,
+ // even if it contains no records, for proper memory management.
+ // We could clone the buckets_ if they are not NULL,
+ // but that would be worth it only if this method is called multiple times,
+ // or called after using the old-style bucket iterator API.
+ LocalPointer<BucketList> immutableBucketList(createBucketList(errorCode));
+ LocalPointer<RuleBasedCollator> coll(
+ static_cast<RuleBasedCollator *>(collatorPrimaryOnly_->clone()));
+ if (immutableBucketList.isNull() || coll.isNull()) {
+ errorCode = U_MEMORY_ALLOCATION_ERROR;
+ return NULL;
+ }
+ ImmutableIndex *immIndex = new ImmutableIndex(immutableBucketList.getAlias(), coll.getAlias());
+ if (immIndex == NULL) {
+ errorCode = U_MEMORY_ALLOCATION_ERROR;
+ return NULL;
+ }
+ // The ImmutableIndex adopted its parameter objects.
+ immutableBucketList.orphan();
+ coll.orphan();
+ return immIndex;
+}
+
int32_t AlphabeticIndex::getBucketCount(UErrorCode &status) {
- buildIndex(status);
+ initBuckets(status);
if (U_FAILURE(status)) {
return 0;
}
- return bucketList_->size();
+ return buckets_->getBucketCount();
}
int32_t AlphabeticIndex::getRecordCount(UErrorCode &status) {
- if (U_FAILURE(status)) {
+ if (U_FAILURE(status) || inputList_ == NULL) {
return 0;
}
- return inputRecords_->size();
+ return inputList_->size();
}
-
-void AlphabeticIndex::buildIndex(UErrorCode &status) {
- if (U_FAILURE(status)) {
- return;
- }
- if (!indexBuildRequired_) {
- return;
- }
-
- // Discard any already-built data.
- // This is important when the user builds and uses an index, then subsequently modifies it,
- // necessitating a rebuild.
-
- bucketList_->removeAllElements();
- labels_->removeAllElements();
- uhash_removeAll(alreadyIn_);
- noDistinctSorting_->clear();
- notAlphabetic_->clear();
-
- // first sort the incoming Labels, with a "best" ordering among items
- // that are the same according to the collator
-
- UVector preferenceSorting(status); // Vector of UnicodeStrings; owned by the vector.
- preferenceSorting.setDeleter(uprv_deleteUObject);
- appendUnicodeSetToUVector(preferenceSorting, *initialLabels_, status);
- preferenceSorting.sortWithUComparator(PreferenceComparator, &status, status);
-
- // We now make a set of Labels.
- // Some of the input may, however, be redundant.
- // That is, we might have c, ch, d, where "ch" sorts just like "c", "h"
- // So we make a pass through, filtering out those cases.
- // TODO: filtering these out would seem to be at odds with the eventual goal
- // of being able to split buckets that contain too many items.
-
- UnicodeSet labelSet;
- for (int32_t psIndex=0; psIndex<preferenceSorting.size(); psIndex++) {
- UnicodeString item = *static_cast<const UnicodeString *>(preferenceSorting.elementAt(psIndex));
- // TODO: Since preferenceSorting was originally populated from the contents of a UnicodeSet,
- // is it even possible for duplicates to show up in this check?
- if (labelSet.contains(item)) {
- UnicodeSetIterator itemAlreadyInIter(labelSet);
- while (itemAlreadyInIter.next()) {
- const UnicodeString &itemAlreadyIn = itemAlreadyInIter.getString();
- if (collatorPrimaryOnly_->compare(item, itemAlreadyIn) == 0) {
- UnicodeSet *targets = static_cast<UnicodeSet *>(uhash_get(alreadyIn_, &itemAlreadyIn));
- if (targets == NULL) {
- // alreadyIn.put(itemAlreadyIn, targets = new LinkedHashSet<String>());
- targets = new UnicodeSet();
- uhash_put(alreadyIn_, itemAlreadyIn.clone(), targets, &status);
- }
- targets->add(item);
- break;
- }
+void AlphabeticIndex::initLabels(UVector &indexCharacters, UErrorCode &errorCode) const {
+ const Normalizer2 *nfkdNormalizer = Normalizer2::getNFKDInstance(errorCode);
+ if (U_FAILURE(errorCode)) { return; }
+
+ const UnicodeString &firstScriptBoundary = *getString(*firstCharsInScripts_, 0);
+ const UnicodeString &overflowBoundary =
+ *getString(*firstCharsInScripts_, firstCharsInScripts_->size() - 1);
+
+ // We make a sorted array of elements.
+ // Some of the input may be redundant.
+ // That is, we might have c, ch, d, where "ch" sorts just like "c", "h".
+ // We filter out those cases.
+ UnicodeSetIterator iter(*initialLabels_);
+ while (iter.next()) {
+ const UnicodeString *item = &iter.getString();
+ LocalPointer<UnicodeString> ownedItem;
+ UBool checkDistinct;
+ int32_t itemLength = item->length();
+ if (!item->hasMoreChar32Than(0, itemLength, 1)) {
+ checkDistinct = FALSE;
+ } else if(item->charAt(itemLength - 1) == 0x2a && // '*'
+ item->charAt(itemLength - 2) != 0x2a) {
+ // Use a label if it is marked with one trailing star,
+ // even if the label string sorts the same when all contractions are suppressed.
+ ownedItem.adoptInstead(new UnicodeString(*item, 0, itemLength - 1));
+ item = ownedItem.getAlias();
+ if (item == NULL) {
+ errorCode = U_MEMORY_ALLOCATION_ERROR;
+ return;
}
- } else if (item.moveIndex32(0, 1) < item.length() && // Label contains more than one code point.
- collatorPrimaryOnly_->compare(item, separated(item)) == 0) {
- noDistinctSorting_->add(item);
- } else if (!ALPHABETIC->containsSome(item)) {
- notAlphabetic_->add(item);
+ checkDistinct = FALSE;
} else {
- labelSet.add(item);
+ checkDistinct = TRUE;
+ }
+ if (collatorPrimaryOnly_->compare(*item, firstScriptBoundary, errorCode) < 0) {
+ // Ignore a primary-ignorable or non-alphabetic index character.
+ } else if (collatorPrimaryOnly_->compare(*item, overflowBoundary, errorCode) >= 0) {
+ // Ignore an index characters that will land in the overflow bucket.
+ } else if (checkDistinct &&
+ collatorPrimaryOnly_->compare(*item, separated(*item), errorCode) == 0) {
+ // Ignore a multi-code point index character that does not sort distinctly
+ // from the sequence of its separate characters.
+ } else {
+ int32_t insertionPoint = binarySearch(indexCharacters, *item, *collatorPrimaryOnly_);
+ if (insertionPoint < 0) {
+ indexCharacters.insertElementAt(
+ ownedString(*item, ownedItem, errorCode), ~insertionPoint, errorCode);
+ } else {
+ const UnicodeString &itemAlreadyIn = *getString(indexCharacters, insertionPoint);
+ if (isOneLabelBetterThanOther(*nfkdNormalizer, *item, itemAlreadyIn)) {
+ indexCharacters.setElementAt(
+ ownedString(*item, ownedItem, errorCode), insertionPoint);
+ }
+ }
}
}
+ if (U_FAILURE(errorCode)) { return; }
- // If we have no labels, hard-code a fallback default set of [A-Z]
- // This case can occur with locales that don't have exemplar character data, including root.
- // A no-labels situation will cause other problems; it needs to be avoided.
- if (labelSet.isEmpty()) {
- labelSet.add((UChar32)0x41, (UChar32)0x5A);
- }
-
- // Move the set of Labels from the set into a vector, and sort
- // according to the collator.
+ // if the result is still too large, cut down to maxCount elements, by removing every nth element
- appendUnicodeSetToUVector(*labels_, labelSet, status);
- labels_->sortWithUComparator(sortCollateComparator, collatorPrimaryOnly_, status);
-
- // if the result is still too large, cut down to maxLabelCount_ elements, by removing every nth element
- // Implemented by copying the elements to be retained to a new UVector.
-
- const int32_t size = labelSet.size() - 1;
+ int32_t size = indexCharacters.size() - 1;
if (size > maxLabelCount_) {
- UVector *newLabels = new UVector(status);
- newLabels->setDeleter(uprv_deleteUObject);
int32_t count = 0;
int32_t old = -1;
- for (int32_t srcIndex=0; srcIndex<labels_->size(); srcIndex++) {
- const UnicodeString *str = static_cast<const UnicodeString *>(labels_->elementAt(srcIndex));
+ for (int32_t i = 0; i < indexCharacters.size();) {
++count;
- const int32_t bump = count * maxLabelCount_ / size;
+ int32_t bump = count * maxLabelCount_ / size;
if (bump == old) {
- // it.remove();
+ indexCharacters.removeElementAt(i);
} else {
- newLabels->addElement(str->clone(), status);
old = bump;
+ ++i;
}
}
- delete labels_;
- labels_ = newLabels;
}
+}
- // We now know the list of labels.
- // Create a corresponding list of buckets, one per label.
-
- buildBucketList(status); // Corresponds to Java BucketList constructor.
-
- // Bin the Records into the Buckets.
- bucketRecords(status);
-
- indexBuildRequired_ = FALSE;
- resetBucketIterator(status);
+namespace {
+
+const UnicodeString &fixLabel(const UnicodeString ¤t, UnicodeString &temp) {
+ if (!current.startsWith(BASE, BASE_LENGTH)) {
+ return current;
+ }
+ UChar rest = current.charAt(BASE_LENGTH);
+ if (0x2800 < rest && rest <= 0x28FF) { // stroke count
+ int32_t count = rest-0x2800;
+ temp.setTo((UChar)(0x30 + count % 10));
+ if (count >= 10) {
+ count /= 10;
+ temp.insert(0, (UChar)(0x30 + count % 10));
+ if (count >= 10) {
+ count /= 10;
+ temp.insert(0, (UChar)(0x30 + count));
+ }
+ }
+ return temp.append((UChar)0x5283);
+ }
+ return temp.setTo(current, BASE_LENGTH);
}
-//
-// buildBucketList() Corresponds to the BucketList constructor in the Java version.
-
-void AlphabeticIndex::buildBucketList(UErrorCode &status) {
- UnicodeString labelStr = getUnderflowLabel();
- Bucket *b = new Bucket(labelStr, *EMPTY_STRING, U_ALPHAINDEX_UNDERFLOW, status);
- bucketList_->addElement(b, status);
-
- // Build up the list, adding underflow, additions, overflow
- // insert infix labels as needed, using \uFFFF.
- const UnicodeString *last = static_cast<UnicodeString *>(labels_->elementAt(0));
- b = new Bucket(*last, *last, U_ALPHAINDEX_NORMAL, status);
- bucketList_->addElement(b, status);
-
- UnicodeSet lastSet;
- UnicodeSet set;
- AlphabeticIndex::getScriptSet(lastSet, *last, status);
- lastSet.removeAll(*IGNORE_SCRIPTS);
-
- for (int i = 1; i < labels_->size(); ++i) {
- UnicodeString *current = static_cast<UnicodeString *>(labels_->elementAt(i));
- getScriptSet(set, *current, status);
- set.removeAll(*IGNORE_SCRIPTS);
- if (lastSet.containsNone(set)) {
- // check for adjacent
- const UnicodeString &overflowComparisonString = getOverflowComparisonString(*last, status);
- if (collatorPrimaryOnly_->compare(overflowComparisonString, *current) < 0) {
- labelStr = getInflowLabel();
- b = new Bucket(labelStr, overflowComparisonString, U_ALPHAINDEX_INFLOW, status);
- bucketList_->addElement(b, status);
- i++;
- lastSet = set;
+UBool hasMultiplePrimaryWeights(
+ CollationElementIterator &cei, int32_t variableTop,
+ const UnicodeString &s, UErrorCode &errorCode) {
+ cei.setText(s, errorCode);
+ UBool seenPrimary = FALSE;
+ for (;;) {
+ int32_t ce32 = cei.next(errorCode);
+ if (ce32 == CollationElementIterator::NULLORDER) {
+ break;
+ }
+ int32_t p = CollationElementIterator::primaryOrder(ce32);
+ if (p > variableTop && (ce32 & 0xc0) != 0xc0) {
+ // not primary ignorable, and not a continuation CE
+ if (seenPrimary) {
+ return TRUE;
}
+ seenPrimary = TRUE;
}
- b = new Bucket(*current, *current, U_ALPHAINDEX_NORMAL, status);
- bucketList_->addElement(b, status);
- last = current;
- lastSet = set;
- }
- const UnicodeString &limitString = getOverflowComparisonString(*last, status);
- b = new Bucket(getOverflowLabel(), limitString, U_ALPHAINDEX_OVERFLOW, status);
- bucketList_->addElement(b, status);
- // final overflow bucket
+ }
+ return FALSE;
}
+} // namespace
-//
-// Place all of the raw input records into the correct bucket.
-//
-// Begin by sorting the input records; this lets us bin them in a single pass.
-//
-// Note on storage management: The input records are owned by the
-// inputRecords_ vector, and will (eventually) be auto-deleted by it.
-// The Bucket objects have pointers to the Record objects, but do not own them.
-//
-void AlphabeticIndex::bucketRecords(UErrorCode &status) {
- if (U_FAILURE(status)) {
+BucketList *AlphabeticIndex::createBucketList(UErrorCode &errorCode) const {
+ // Initialize indexCharacters.
+ UVector indexCharacters(errorCode);
+ indexCharacters.setDeleter(uprv_deleteUObject);
+ initLabels(indexCharacters, errorCode);
+ if (U_FAILURE(errorCode)) { return NULL; }
+
+ // Variables for hasMultiplePrimaryWeights().
+ LocalPointer<CollationElementIterator> cei(
+ collatorPrimaryOnly_->createCollationElementIterator(emptyString_));
+ if (cei.isNull()) {
+ errorCode = U_MEMORY_ALLOCATION_ERROR;
+ return NULL;
+ }
+ int32_t variableTop;
+ if (collatorPrimaryOnly_->getAttribute(UCOL_ALTERNATE_HANDLING, errorCode) == UCOL_SHIFTED) {
+ variableTop = CollationElementIterator::primaryOrder(
+ (int32_t)collatorPrimaryOnly_->getVariableTop(errorCode));
+ } else {
+ variableTop = 0;
+ }
+ UBool hasInvisibleBuckets = FALSE;
+
+ // Helper arrays for Chinese Pinyin collation.
+ Bucket *asciiBuckets[26] = {
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL
+ };
+ Bucket *pinyinBuckets[26] = {
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL
+ };
+ UBool hasPinyin = FALSE;
+
+ LocalPointer<UVector> bucketList(new UVector(errorCode));
+ if (bucketList.isNull()) {
+ errorCode = U_MEMORY_ALLOCATION_ERROR;
+ return NULL;
+ }
+ bucketList->setDeleter(uprv_deleteUObject);
+
+ // underflow bucket
+ Bucket *bucket = new Bucket(getUnderflowLabel(), emptyString_, U_ALPHAINDEX_UNDERFLOW);
+ if (bucket == NULL) {
+ errorCode = U_MEMORY_ALLOCATION_ERROR;
+ return NULL;
+ }
+ bucketList->addElement(bucket, errorCode);
+ if (U_FAILURE(errorCode)) { return NULL; }
+
+ UnicodeString temp;
+
+ // fix up the list, adding underflow, additions, overflow
+ // Insert inflow labels as needed.
+ int32_t scriptIndex = -1;
+ const UnicodeString *scriptUpperBoundary = &emptyString_;
+ for (int32_t i = 0; i < indexCharacters.size(); ++i) {
+ UnicodeString ¤t = *getString(indexCharacters, i);
+ if (collatorPrimaryOnly_->compare(current, *scriptUpperBoundary, errorCode) >= 0) {
+ // We crossed the script boundary into a new script.
+ const UnicodeString &inflowBoundary = *scriptUpperBoundary;
+ UBool skippedScript = FALSE;
+ for (;;) {
+ scriptUpperBoundary = getString(*firstCharsInScripts_, ++scriptIndex);
+ if (collatorPrimaryOnly_->compare(current, *scriptUpperBoundary, errorCode) < 0) {
+ break;
+ }
+ skippedScript = TRUE;
+ }
+ if (skippedScript && bucketList->size() > 1) {
+ // We are skipping one or more scripts,
+ // and we are not just getting out of the underflow label.
+ bucket = new Bucket(getInflowLabel(), inflowBoundary, U_ALPHAINDEX_INFLOW);
+ if (bucket == NULL) {
+ errorCode = U_MEMORY_ALLOCATION_ERROR;
+ return NULL;
+ }
+ bucketList->addElement(bucket, errorCode);
+ }
+ }
+ // Add a bucket with the current label.
+ bucket = new Bucket(fixLabel(current, temp), current, U_ALPHAINDEX_NORMAL);
+ if (bucket == NULL) {
+ errorCode = U_MEMORY_ALLOCATION_ERROR;
+ return NULL;
+ }
+ bucketList->addElement(bucket, errorCode);
+ // Remember ASCII and Pinyin buckets for Pinyin redirects.
+ UChar c;
+ if (current.length() == 1 && 0x41 <= (c = current.charAt(0)) && c <= 0x5A) { // A-Z
+ asciiBuckets[c - 0x41] = bucket;
+ } else if (current.length() == BASE_LENGTH + 1 && current.startsWith(BASE, BASE_LENGTH) &&
+ 0x41 <= (c = current.charAt(BASE_LENGTH)) && c <= 0x5A) {
+ pinyinBuckets[c - 0x41] = bucket;
+ hasPinyin = TRUE;
+ }
+ // Check for multiple primary weights.
+ if (!current.startsWith(BASE, BASE_LENGTH) &&
+ hasMultiplePrimaryWeights(*cei, variableTop, current, errorCode) &&
+ current.charAt(current.length() - 1) != 0xFFFF /* !current.endsWith("\uffff") */) {
+ // "AE-ligature" or "Sch" etc.
+ for (int32_t i = bucketList->size() - 2;; --i) {
+ Bucket *singleBucket = getBucket(*bucketList, i);
+ if (singleBucket->labelType_ != U_ALPHAINDEX_NORMAL) {
+ // There is no single-character bucket since the last
+ // underflow or inflow label.
+ break;
+ }
+ if (singleBucket->displayBucket_ == NULL &&
+ !hasMultiplePrimaryWeights(
+ *cei, variableTop, singleBucket->lowerBoundary_, errorCode)) {
+ // Add an invisible bucket that redirects strings greater than the expansion
+ // to the previous single-character bucket.
+ // For example, after ... Q R S Sch we add Sch\uFFFF->S
+ // and after ... Q R S Sch Sch\uFFFF St we add St\uFFFF->S.
+ bucket = new Bucket(emptyString_,
+ UnicodeString(current).append((UChar)0xFFFF),
+ U_ALPHAINDEX_NORMAL);
+ if (bucket == NULL) {
+ errorCode = U_MEMORY_ALLOCATION_ERROR;
+ return NULL;
+ }
+ bucket->displayBucket_ = singleBucket;
+ bucketList->addElement(bucket, errorCode);
+ hasInvisibleBuckets = TRUE;
+ break;
+ }
+ }
+ }
+ }
+ if (U_FAILURE(errorCode)) { return NULL; }
+ if (bucketList->size() == 1) {
+ // No real labels, show only the underflow label.
+ BucketList *bl = new BucketList(bucketList.getAlias(), bucketList.getAlias());
+ if (bl == NULL) {
+ errorCode = U_MEMORY_ALLOCATION_ERROR;
+ return NULL;
+ }
+ bucketList.orphan();
+ return bl;
+ }
+ // overflow bucket
+ bucket = new Bucket(getOverflowLabel(), *scriptUpperBoundary, U_ALPHAINDEX_OVERFLOW);
+ if (bucket == NULL) {
+ errorCode = U_MEMORY_ALLOCATION_ERROR;
+ return NULL;
+ }
+ bucketList->addElement(bucket, errorCode); // final
+
+ if (hasPinyin) {
+ // Redirect Pinyin buckets.
+ Bucket *asciiBucket = NULL;
+ for (int32_t i = 0; i < 26; ++i) {
+ if (asciiBuckets[i] != NULL) {
+ asciiBucket = asciiBuckets[i];
+ }
+ if (pinyinBuckets[i] != NULL && asciiBucket != NULL) {
+ pinyinBuckets[i]->displayBucket_ = asciiBucket;
+ hasInvisibleBuckets = TRUE;
+ }
+ }
+ }
+
+ if (U_FAILURE(errorCode)) { return NULL; }
+ if (!hasInvisibleBuckets) {
+ BucketList *bl = new BucketList(bucketList.getAlias(), bucketList.getAlias());
+ if (bl == NULL) {
+ errorCode = U_MEMORY_ALLOCATION_ERROR;
+ return NULL;
+ }
+ bucketList.orphan();
+ return bl;
+ }
+ // Merge inflow buckets that are visually adjacent.
+ // Iterate backwards: Merge inflow into overflow rather than the other way around.
+ int32_t i = bucketList->size() - 1;
+ Bucket *nextBucket = getBucket(*bucketList, i);
+ while (--i > 0) {
+ bucket = getBucket(*bucketList, i);
+ if (bucket->displayBucket_ != NULL) {
+ continue; // skip invisible buckets
+ }
+ if (bucket->labelType_ == U_ALPHAINDEX_INFLOW) {
+ if (nextBucket->labelType_ != U_ALPHAINDEX_NORMAL) {
+ bucket->displayBucket_ = nextBucket;
+ continue;
+ }
+ }
+ nextBucket = bucket;
+ }
+
+ LocalPointer<UVector> publicBucketList(new UVector(errorCode));
+ if (bucketList.isNull()) {
+ errorCode = U_MEMORY_ALLOCATION_ERROR;
+ return NULL;
+ }
+ // Do not call publicBucketList->setDeleter():
+ // This vector shares its objects with the bucketList.
+ for (int32_t i = 0; i < bucketList->size(); ++i) {
+ bucket = getBucket(*bucketList, i);
+ if (bucket->displayBucket_ == NULL) {
+ publicBucketList->addElement(bucket, errorCode);
+ }
+ }
+ if (U_FAILURE(errorCode)) { return NULL; }
+ BucketList *bl = new BucketList(bucketList.getAlias(), publicBucketList.getAlias());
+ if (bl == NULL) {
+ errorCode = U_MEMORY_ALLOCATION_ERROR;
+ return NULL;
+ }
+ bucketList.orphan();
+ publicBucketList.orphan();
+ return bl;
+}
+
+/**
+ * Creates an index, and buckets and sorts the list of records into the index.
+ */
+void AlphabeticIndex::initBuckets(UErrorCode &errorCode) {
+ if (U_FAILURE(errorCode) || buckets_ != NULL) {
+ return;
+ }
+ buckets_ = createBucketList(errorCode);
+ if (U_FAILURE(errorCode) || inputList_ == NULL || inputList_->isEmpty()) {
return;
}
- inputRecords_->sortWithUComparator(recordCompareFn, collator_, status);
- U_ASSERT(bucketList_->size() > 0); // Should always have at least an overflow
- // bucket, even if no user labels.
- int32_t bucketIndex = 0;
- Bucket *destBucket = static_cast<Bucket *>(bucketList_->elementAt(bucketIndex));
- Bucket *nextBucket = NULL;
- if (bucketIndex+1 < bucketList_->size()) {
- nextBucket = static_cast<Bucket *>(bucketList_->elementAt(bucketIndex+1));
- }
- int32_t recordIndex = 0;
- Record *r = static_cast<Record *>(inputRecords_->elementAt(recordIndex));
- while (recordIndex < inputRecords_->size()) {
- if (nextBucket == NULL ||
- collatorPrimaryOnly_->compare(r->sortingName_, nextBucket->lowerBoundary_) < 0) {
- // Record goes in current bucket. Advance to next record,
- // stay on current bucket.
- destBucket->records_->addElement(r, status);
- ++recordIndex;
- r = static_cast<Record *>(inputRecords_->elementAt(recordIndex));
- } else {
- // Advance to the next bucket, stay on current record.
- bucketIndex++;
- destBucket = nextBucket;
- if (bucketIndex+1 < bucketList_->size()) {
- nextBucket = static_cast<Bucket *>(bucketList_->elementAt(bucketIndex+1));
+ // Sort the records by name.
+ // Stable sort preserves input order of collation duplicates.
+ inputList_->sortWithUComparator(recordCompareFn, collator_, errorCode);
+
+ // Now, we traverse all of the input, which is now sorted.
+ // If the item doesn't go in the current bucket, we find the next bucket that contains it.
+ // This makes the process order n*log(n), since we just sort the list and then do a linear process.
+ // However, if the user adds an item at a time and then gets the buckets, this isn't efficient, so
+ // we need to improve it for that case.
+
+ Bucket *currentBucket = getBucket(*buckets_->bucketList_, 0);
+ int32_t bucketIndex = 1;
+ Bucket *nextBucket;
+ const UnicodeString *upperBoundary;
+ if (bucketIndex < buckets_->bucketList_->size()) {
+ nextBucket = getBucket(*buckets_->bucketList_, bucketIndex++);
+ upperBoundary = &nextBucket->lowerBoundary_;
+ } else {
+ nextBucket = NULL;
+ upperBoundary = NULL;
+ }
+ for (int32_t i = 0; i < inputList_->size(); ++i) {
+ Record *r = getRecord(*inputList_, i);
+ // if the current bucket isn't the right one, find the one that is
+ // We have a special flag for the last bucket so that we don't look any further
+ while (upperBoundary != NULL &&
+ collatorPrimaryOnly_->compare(r->name_, *upperBoundary, errorCode) >= 0) {
+ currentBucket = nextBucket;
+ // now reset the boundary that we compare against
+ if (bucketIndex < buckets_->bucketList_->size()) {
+ nextBucket = getBucket(*buckets_->bucketList_, bucketIndex++);
+ upperBoundary = &nextBucket->lowerBoundary_;
} else {
- nextBucket = NULL;
+ upperBoundary = NULL;
}
- U_ASSERT(destBucket != NULL);
}
+ // now put the record into the bucket.
+ Bucket *bucket = currentBucket;
+ if (bucket->displayBucket_ != NULL) {
+ bucket = bucket->displayBucket_;
+ }
+ if (bucket->records_ == NULL) {
+ bucket->records_ = new UVector(errorCode);
+ if (bucket->records_ == NULL) {
+ errorCode = U_MEMORY_ALLOCATION_ERROR;
+ return;
+ }
+ }
+ bucket->records_->addElement(r, errorCode);
}
+}
+void AlphabeticIndex::clearBuckets() {
+ if (buckets_ != NULL) {
+ delete buckets_;
+ buckets_ = NULL;
+ internalResetBucketIterator();
+ }
}
+void AlphabeticIndex::internalResetBucketIterator() {
+ labelsIterIndex_ = -1;
+ currentBucket_ = NULL;
+}
-void AlphabeticIndex::getIndexExemplars(UnicodeSet &dest, const Locale &locale, UErrorCode &status) {
+
+void AlphabeticIndex::addIndexExemplars(const Locale &locale, UErrorCode &status) {
+ if (U_FAILURE(status)) { return; }
+ // Chinese index characters, which are specific to each of the several Chinese tailorings,
+ // take precedence over the single locale data exemplar set per language.
+ const char *language = locale.getLanguage();
+ if (uprv_strcmp(language, "zh") == 0 || uprv_strcmp(language, "ja") == 0 ||
+ uprv_strcmp(language, "ko") == 0) {
+ // TODO: This should be done regardless of the language, but it's expensive.
+ // We should add a Collator function (can be @internal)
+ // to enumerate just the contractions that start with a given code point or string.
+ if (addChineseIndexCharacters(status) || U_FAILURE(status)) {
+ return;
+ }
+ }
+
+ LocalULocaleDataPointer uld(ulocdata_open(locale.getName(), &status));
if (U_FAILURE(status)) {
return;
}
- LocalULocaleDataPointer uld(ulocdata_open(locale.getName(), &status));
UnicodeSet exemplars;
ulocdata_getExemplarSet(uld.getAlias(), exemplars.toUSet(), 0, ULOCDATA_ES_INDEX, &status);
if (U_SUCCESS(status)) {
- dest.addAll(exemplars);
+ initialLabels_->addAll(exemplars);
return;
}
status = U_ZERO_ERROR; // Clear out U_MISSING_RESOURCE_ERROR
- // Locale data did not include explicit Index characters.
+ // The locale data did not include explicit Index characters.
// Synthesize a set of them from the locale's standard exemplar characters.
-
ulocdata_getExemplarSet(uld.getAlias(), exemplars.toUSet(), 0, ULOCDATA_ES_STANDARD, &status);
if (U_FAILURE(status)) {
return;
}
+ // question: should we add auxiliary exemplars?
+ if (exemplars.containsSome(0x61, 0x7A) /* a-z */ || exemplars.size() == 0) {
+ exemplars.add(0x61, 0x7A);
+ }
+ if (exemplars.containsSome(0xAC00, 0xD7A3)) { // Hangul syllables
+ // cut down to small list
+ exemplars.remove(0xAC00, 0xD7A3).
+ add(0xAC00).add(0xB098).add(0xB2E4).add(0xB77C).
+ add(0xB9C8).add(0xBC14).add(0xC0AC).add(0xC544).
+ add(0xC790).add(0xCC28).add(0xCE74).add(0xD0C0).
+ add(0xD30C).add(0xD558);
+ }
+ if (exemplars.containsSome(0x1200, 0x137F)) { // Ethiopic block
+ // cut down to small list
+ // make use of the fact that Ethiopic is allocated in 8's, where
+ // the base is 0 mod 8.
+ UnicodeSet ethiopic(
+ UNICODE_STRING_SIMPLE("[[:Block=Ethiopic:]&[:Script=Ethiopic:]]"), status);
+ UnicodeSetIterator it(ethiopic);
+ while (it.next() && !it.isString()) {
+ if ((it.getCodepoint() & 0x7) != 0) {
+ exemplars.remove(it.getCodepoint());
+ }
+ }
+ }
+
// Upper-case any that aren't already so.
// (We only do this for synthesized index characters.)
-
UnicodeSetIterator it(exemplars);
UnicodeString upperC;
- UnicodeSet lowersToRemove;
- UnicodeSet uppersToAdd;
while (it.next()) {
const UnicodeString &exemplarC = it.getString();
upperC = exemplarC;
upperC.toUpper(locale);
- if (exemplarC != upperC) {
- lowersToRemove.add(exemplarC);
- uppersToAdd.add(upperC);
- }
+ initialLabels_->add(upperC);
}
- exemplars.removeAll(lowersToRemove);
- exemplars.addAll(uppersToAdd);
-
- // get the exemplars, and handle special cases
+}
- // question: should we add auxiliary exemplars?
- if (exemplars.containsSome(*CORE_LATIN)) {
- exemplars.addAll(*CORE_LATIN);
- }
- if (exemplars.containsSome(*HANGUL)) {
- // cut down to small list
- UnicodeSet BLOCK_HANGUL_SYLLABLES(UNICODE_STRING_SIMPLE("[:block=hangul_syllables:]"), status);
- exemplars.removeAll(BLOCK_HANGUL_SYLLABLES);
- exemplars.addAll(*HANGUL);
+UBool AlphabeticIndex::addChineseIndexCharacters(UErrorCode &errorCode) {
+ UnicodeSet contractions;
+ ucol_getContractionsAndExpansions(collatorPrimaryOnly_->getUCollator(),
+ contractions.toUSet(), NULL, FALSE, &errorCode);
+ if (U_FAILURE(errorCode)) { return FALSE; }
+ UnicodeString firstHanBoundary;
+ UBool hasPinyin = FALSE;
+ UnicodeSetIterator iter(contractions);
+ while (iter.next()) {
+ const UnicodeString &s = iter.getString();
+ if (s.startsWith(BASE, BASE_LENGTH)) {
+ initialLabels_->add(s);
+ if (firstHanBoundary.isEmpty() ||
+ collatorPrimaryOnly_->compare(s, firstHanBoundary, errorCode) < 0) {
+ firstHanBoundary = s;
+ }
+ UChar c = s.charAt(s.length() - 1);
+ if (0x41 <= c && c <= 0x5A) { // A-Z
+ hasPinyin = TRUE;
+ }
+ }
}
- if (exemplars.containsSome(*ETHIOPIC)) {
- // cut down to small list
- // make use of the fact that Ethiopic is allocated in 8's, where
- // the base is 0 mod 8.
- UnicodeSetIterator it(*ETHIOPIC);
- while (it.next() && !it.isString()) {
- if ((it.getCodepoint() & 0x7) != 0) {
- exemplars.remove(it.getCodepoint());
+ if (hasPinyin) {
+ initialLabels_->add(0x41, 0x5A); // A-Z
+ }
+ if (!firstHanBoundary.isEmpty()) {
+ // The hardcoded list of script boundaries includes U+4E00
+ // which is tailored to not be the first primary
+ // in all Chinese tailorings except "unihan".
+ // Replace U+4E00 with the first boundary string from the tailoring.
+ // TODO: This becomes obsolete when the root collator gets
+ // reliable script-first-primary mappings.
+ int32_t hanIndex = binarySearch(
+ *firstCharsInScripts_, UnicodeString((UChar)0x4E00), *collatorPrimaryOnly_);
+ if (hanIndex >= 0) {
+ UnicodeString *fh = new UnicodeString(firstHanBoundary);
+ if (fh == NULL) {
+ errorCode = U_MEMORY_ALLOCATION_ERROR;
+ return FALSE;
}
+ firstCharsInScripts_->setElementAt(fh, hanIndex);
}
+ return TRUE;
+ } else {
+ return FALSE;
}
- dest.addAll(exemplars);
}
/*
* Return the string with interspersed CGJs. Input must have more than 2 codepoints.
*/
-static const UChar32 CGJ = (UChar)0x034F;
+static const UChar CGJ = 0x034F;
UnicodeString AlphabeticIndex::separated(const UnicodeString &item) {
UnicodeString result;
if (item.length() == 0) {
AlphabeticIndex &AlphabeticIndex::setInflowLabel(const UnicodeString &label, UErrorCode &/*status*/) {
inflowLabel_ = label;
- indexBuildRequired_ = TRUE;
+ clearBuckets();
return *this;
}
AlphabeticIndex &AlphabeticIndex::setOverflowLabel(const UnicodeString &label, UErrorCode &/*status*/) {
overflowLabel_ = label;
- indexBuildRequired_ = TRUE;
+ clearBuckets();
return *this;
}
AlphabeticIndex &AlphabeticIndex::setUnderflowLabel(const UnicodeString &label, UErrorCode &/*status*/) {
underflowLabel_ = label;
- indexBuildRequired_ = TRUE;
+ clearBuckets();
return *this;
}
return *this;
}
maxLabelCount_ = maxLabelCount;
- if (maxLabelCount < bucketList_->size()) {
- indexBuildRequired_ = TRUE;
- }
+ clearBuckets();
return *this;
}
-const UnicodeString &AlphabeticIndex::getOverflowComparisonString(const UnicodeString &lowerLimit, UErrorCode &/*status*/) {
- for (int32_t i=0; i<firstScriptCharacters_->size(); i++) {
- const UnicodeString *s =
- static_cast<const UnicodeString *>(firstScriptCharacters_->elementAt(i));
- if (collator_->compare(*s, lowerLimit) > 0) {
- return *s;
- }
- }
- return *EMPTY_STRING;
-}
-
-UnicodeSet *AlphabeticIndex::getScriptSet(UnicodeSet &dest, const UnicodeString &codePoint, UErrorCode &status) {
- if (U_FAILURE(status)) {
- return &dest;
- }
- UChar32 cp = codePoint.char32At(0);
- UScriptCode scriptCode = uscript_getScript(cp, &status);
- dest.applyIntPropertyValue(UCHAR_SCRIPT, scriptCode, status);
- return &dest;
-}
-
//
// init() - Common code for constructors.
//
-void AlphabeticIndex::init(UErrorCode &status) {
- // Initialize statics if needed.
- AlphabeticIndex::staticInit(status);
-
- // Put the object into a known state so that the destructor will function.
-
- alreadyIn_ = NULL;
- bucketList_ = NULL;
- collator_ = NULL;
- collatorPrimaryOnly_ = NULL;
- currentBucket_ = NULL;
- firstScriptCharacters_ = NULL;
- initialLabels_ = NULL;
- indexBuildRequired_ = TRUE;
- inputRecords_ = NULL;
- itemsIterIndex_ = 0;
- labels_ = NULL;
- labelsIterIndex_ = 0;
- maxLabelCount_ = 99;
- noDistinctSorting_ = NULL;
- notAlphabetic_ = NULL;
- recordCounter_ = 0;
-
- if (U_FAILURE(status)) {
+void AlphabeticIndex::init(const Locale *locale, UErrorCode &status) {
+ if (U_FAILURE(status)) { return; }
+ if (locale == NULL && collator_ == NULL) {
+ status = U_ILLEGAL_ARGUMENT_ERROR;
return;
}
- alreadyIn_ = uhash_open(uhash_hashUnicodeString, // Key Hash,
- uhash_compareUnicodeString, // key Comparator,
- NULL, // value Comparator
- &status);
- uhash_setKeyDeleter(alreadyIn_, uprv_deleteUObject);
- uhash_setValueDeleter(alreadyIn_, uprv_deleteUObject);
-
- bucketList_ = new UVector(status);
- bucketList_->setDeleter(alphaIndex_deleteBucket);
- labels_ = new UVector(status);
- labels_->setDeleter(uprv_deleteUObject);
- labels_->setComparer(uhash_compareUnicodeString);
- inputRecords_ = new UVector(status);
- inputRecords_->setDeleter(alphaIndex_deleteRecord);
-
- noDistinctSorting_ = new UnicodeSet();
- notAlphabetic_ = new UnicodeSet();
+
initialLabels_ = new UnicodeSet();
+ if (initialLabels_ == NULL) {
+ status = U_MEMORY_ALLOCATION_ERROR;
+ return;
+ }
- inflowLabel_.remove();
- inflowLabel_.append((UChar)0x2026); // Ellipsis
+ inflowLabel_.setTo((UChar)0x2026); // Ellipsis
overflowLabel_ = inflowLabel_;
underflowLabel_ = inflowLabel_;
- // TODO: check for memory allocation failures.
-}
-
-
-static UBool indexCharactersAreInitialized = FALSE;
-
-// Index Characters Clean up function. Delete statically allocated constant stuff.
-U_CDECL_BEGIN
-static UBool U_CALLCONV indexCharacters_cleanup(void) {
- AlphabeticIndex::staticCleanup();
- return TRUE;
-}
-U_CDECL_END
-
-void AlphabeticIndex::staticCleanup() {
- delete ALPHABETIC;
- ALPHABETIC = NULL;
- delete HANGUL;
- HANGUL = NULL;
- delete ETHIOPIC;
- ETHIOPIC = NULL;
- delete CORE_LATIN;
- CORE_LATIN = NULL;
- delete IGNORE_SCRIPTS;
- IGNORE_SCRIPTS = NULL;
- delete TO_TRY;
- TO_TRY = NULL;
- delete UNIHAN;
- UNIHAN = NULL;
- delete EMPTY_STRING;
- EMPTY_STRING = NULL;
- nfkdNormalizer = NULL; // ref to a singleton. Do not delete.
- indexCharactersAreInitialized = FALSE;
-}
-
-
-UnicodeSet *AlphabeticIndex::ALPHABETIC;
-UnicodeSet *AlphabeticIndex::HANGUL;
-UnicodeSet *AlphabeticIndex::ETHIOPIC;
-UnicodeSet *AlphabeticIndex::CORE_LATIN;
-UnicodeSet *AlphabeticIndex::IGNORE_SCRIPTS;
-UnicodeSet *AlphabeticIndex::TO_TRY;
-UnicodeSet *AlphabeticIndex::UNIHAN;
-const UnicodeString *AlphabeticIndex::EMPTY_STRING;
-
-//
-// staticInit() One-time initialization of constants.
-// Thread safe. Called from constructors.
-// Mutex overhead is not a concern. AlphabeticIndex constructors are
-// sufficiently heavy that the cost of the mutex check is not significant.
-
-void AlphabeticIndex::staticInit(UErrorCode &status) {
- static UMTX IndexCharsInitMutex;
-
- Mutex mutex(&IndexCharsInitMutex);
- if (indexCharactersAreInitialized || U_FAILURE(status)) {
+ if (collator_ == NULL) {
+ collator_ = static_cast<RuleBasedCollator *>(Collator::createInstance(*locale, status));
+ if (U_FAILURE(status)) { return; }
+ if (collator_ == NULL) {
+ status = U_MEMORY_ALLOCATION_ERROR;
+ return;
+ }
+ }
+ collatorPrimaryOnly_ = static_cast<RuleBasedCollator *>(collator_->clone());
+ if (collatorPrimaryOnly_ == NULL) {
+ status = U_MEMORY_ALLOCATION_ERROR;
return;
}
- UBool finishedInit = FALSE;
-
- {
- UnicodeString alphaString = UNICODE_STRING_SIMPLE("[[:alphabetic:]-[:mark:]]");
- ALPHABETIC = new UnicodeSet(alphaString, status);
- if (ALPHABETIC == NULL) {
- goto err;
- }
-
- HANGUL = new UnicodeSet();
- HANGUL->add(0xAC00).add(0xB098).add(0xB2E4).add(0xB77C).add(0xB9C8).add(0xBC14).add(0xC0AC).
- add(0xC544).add(0xC790).add(0xCC28).add(0xCE74).add(0xD0C0).add(0xD30C).add(0xD558);
- if (HANGUL== NULL) {
- goto err;
- }
-
-
- UnicodeString EthiopicStr = UNICODE_STRING_SIMPLE("[[:Block=Ethiopic:]&[:Script=Ethiopic:]]");
- ETHIOPIC = new UnicodeSet(EthiopicStr, status);
- if (ETHIOPIC == NULL) {
- goto err;
+ collatorPrimaryOnly_->setAttribute(UCOL_STRENGTH, UCOL_PRIMARY, status);
+ firstCharsInScripts_ = firstStringsInScript(status);
+ if (U_FAILURE(status)) { return; }
+ firstCharsInScripts_->sortWithUComparator(collatorComparator, collatorPrimaryOnly_, status);
+ UnicodeString _4E00((UChar)0x4E00);
+ UnicodeString _1100((UChar)0x1100);
+ UnicodeString _1112((UChar)0x1112);
+ if (collatorPrimaryOnly_->compare(_4E00, _1112, status) <= 0 &&
+ collatorPrimaryOnly_->compare(_1100, _4E00, status) <= 0) {
+ // The standard Korean tailoring sorts Hanja (Han characters)
+ // as secondary differences from Hangul syllables.
+ // This makes U+4E00 not useful as a Han-script boundary.
+ // TODO: This becomes obsolete when the root collator gets
+ // reliable script-first-primary mappings.
+ int32_t hanIndex = binarySearch(
+ *firstCharsInScripts_, _4E00, *collatorPrimaryOnly_);
+ if (hanIndex >= 0) {
+ firstCharsInScripts_->removeElementAt(hanIndex);
}
-
- CORE_LATIN = new UnicodeSet((UChar32)0x61, (UChar32)0x7a); // ('a', 'z');
- if (CORE_LATIN == NULL) {
- goto err;
- }
-
- UnicodeString IgnoreStr= UNICODE_STRING_SIMPLE(
- "[[:sc=Common:][:sc=inherited:][:script=Unknown:][:script=braille:]]");
- IGNORE_SCRIPTS = new UnicodeSet(IgnoreStr, status);
- IGNORE_SCRIPTS->freeze();
- if (IGNORE_SCRIPTS == NULL) {
- goto err;
- }
-
- UnicodeString nfcqcStr = UNICODE_STRING_SIMPLE("[:^nfcqc=no:]");
- TO_TRY = new UnicodeSet(nfcqcStr, status);
- if (TO_TRY == NULL) {
- goto err;
- }
-
- UnicodeString unihanStr = UNICODE_STRING_SIMPLE("[:script=Hani:]");
- UNIHAN = new UnicodeSet(unihanStr, status);
- if (UNIHAN == NULL) {
- goto err;
+ }
+ // Guard against a degenerate collator where
+ // some script boundary strings are primary ignorable.
+ for (;;) {
+ if (U_FAILURE(status)) { return; }
+ if (firstCharsInScripts_->isEmpty()) {
+ // AlphabeticIndex requires some non-ignorable script boundary strings.
+ status = U_ILLEGAL_ARGUMENT_ERROR;
+ return;
}
-
- EMPTY_STRING = new UnicodeString();
-
- nfkdNormalizer = Normalizer2::getNFKDInstance(status);
- if (nfkdNormalizer == NULL) {
- goto err;
+ if (collatorPrimaryOnly_->compare(
+ *static_cast<UnicodeString *>(firstCharsInScripts_->elementAt(0)),
+ emptyString_, status) == UCOL_EQUAL) {
+ firstCharsInScripts_->removeElementAt(0);
+ } else {
+ break;
}
}
- finishedInit = TRUE;
- err:
- if (!finishedInit && U_SUCCESS(status)) {
- status = U_MEMORY_ALLOCATION_ERROR;
+ if (locale != NULL) {
+ addIndexExemplars(*locale, status);
}
- if (U_FAILURE(status)) {
- indexCharacters_cleanup();
- return;
- }
- ucln_i18n_registerCleanup(UCLN_I18N_INDEX_CHARACTERS, indexCharacters_cleanup);
- indexCharactersAreInitialized = TRUE;
}
// Comparison function for UVector<UnicodeString *> sorting with a collator.
//
static int32_t U_CALLCONV
-sortCollateComparator(const void *context, const void *left, const void *right) {
+collatorComparator(const void *context, const void *left, const void *right) {
const UElement *leftElement = static_cast<const UElement *>(left);
const UElement *rightElement = static_cast<const UElement *>(right);
const UnicodeString *leftString = static_cast<const UnicodeString *>(leftElement->pointer);
const UnicodeString *rightString = static_cast<const UnicodeString *>(rightElement->pointer);
- const Collator *col = static_cast<const Collator *>(context);
if (leftString == rightString) {
// Catches case where both are NULL
if (rightString == NULL) {
return -1;
}
- Collator::EComparisonResult r = col->compare(*leftString, *rightString);
- return (int32_t) r;
+ const Collator *col = static_cast<const Collator *>(context);
+ UErrorCode errorCode = U_ZERO_ERROR;
+ return col->compare(*leftString, *rightString, errorCode);
}
//
const AlphabeticIndex::Record *leftRec = static_cast<const AlphabeticIndex::Record *>(leftElement->pointer);
const AlphabeticIndex::Record *rightRec = static_cast<const AlphabeticIndex::Record *>(rightElement->pointer);
const Collator *col = static_cast<const Collator *>(context);
-
- Collator::EComparisonResult r = col->compare(leftRec->sortingName_, rightRec->sortingName_);
- if (r == Collator::EQUAL) {
- if (leftRec->serialNumber_ < rightRec->serialNumber_) {
- r = Collator::LESS;
- } else if (leftRec->serialNumber_ > rightRec->serialNumber_) {
- r = Collator::GREATER;
- }
- }
- return (int32_t) r;
+ UErrorCode errorCode = U_ZERO_ERROR;
+ return col->compare(leftRec->name_, rightRec->name_, errorCode);
}
-#if 0
-//
-// First characters in scripts.
-// Create a UVector whose contents are pointers to UnicodeStrings for the First Characters in each script.
-// The vector is sorted according to this index's collation.
-//
-// This code is too slow to use, so for now hard code the data.
-// Hard coded implementation is follows.
-//
-UVector *AlphabeticIndex::firstStringsInScript(Collator *ruleBasedCollator, UErrorCode &status) {
-
- if (U_FAILURE(status)) {
- return NULL;
- }
-
- UnicodeString results[USCRIPT_CODE_LIMIT];
- UnicodeString LOWER_A = UNICODE_STRING_SIMPLE("a");
-
- UnicodeSetIterator siter(*TO_TRY);
- while (siter.next()) {
- const UnicodeString ¤t = siter.getString();
- Collator::EComparisonResult r = ruleBasedCollator->compare(current, LOWER_A);
- if (r < 0) { // TODO fix; we only want "real" script characters, not
- // symbols.
- continue;
- }
-
- int script = uscript_getScript(current.char32At(0), &status);
- if (results[script].length() == 0) {
- results[script] = current;
- }
- else if (ruleBasedCollator->compare(current, results[script]) < 0) {
- results[script] = current;
- }
- }
-
- UnicodeSet extras;
- UnicodeSet expansions;
- RuleBasedCollator *rbc = dynamic_cast<RuleBasedCollator *>(ruleBasedCollator);
- const UCollator *uRuleBasedCollator = rbc->getUCollator();
- ucol_getContractionsAndExpansions(uRuleBasedCollator, extras.toUSet(), expansions.toUSet(), true, &status);
- extras.addAll(expansions).removeAll(*TO_TRY);
- if (extras.size() != 0) {
- const Normalizer2 *normalizer = Normalizer2::getNFKCInstance(status);
- UnicodeSetIterator extrasIter(extras);
- while (extrasIter.next()) {
- const UnicodeString ¤t = extrasIter.next();
- if (!TO_TRY->containsAll(current))
- continue;
- if (!normalizer->isNormalized(current, status) ||
- ruleBasedCollator->compare(current, LOWER_A) < 0) {
- continue;
- }
- int script = uscript_getScript(current.char32At(0), &status);
- if (results[script].length() == 0) {
- results[script] = current;
- } else if (ruleBasedCollator->compare(current, results[script]) < 0) {
- results[script] = current;
- }
- }
- }
-
- UVector *dest = new UVector(status);
- dest->setDeleter(uprv_deleteUObject);
- for (uint32_t i = 0; i < sizeof(results) / sizeof(results[0]); ++i) {
- if (results[i].length() > 0) {
- dest->addElement(results[i].clone(), status);
- }
- }
- dest->sortWithUComparator(sortCollateComparator, ruleBasedCollator, status);
- return dest;
-}
-#endif
-
-
-//
-// First characters in scripts.
-// Create a UVector whose contents are pointers to UnicodeStrings for the First Characters in each script.
-// The vector is sorted according to this index's collation.
-//
-// It takes too much time to compute this from character properties, so hard code it for now.
-// Character constants copied from corresponding declaration in ICU4J.
-// See main/classes/collate/src/com/ibm/icu/text/AlphabeticIndex.java
-
-static UChar HACK_FIRST_CHARS_IN_SCRIPTS[] = { 0x61, 0, 0x03B1, 0,
- 0x2C81, 0, 0x0430, 0, 0x2C30, 0, 0x10D0, 0, 0x0561, 0, 0x05D0, 0, 0xD802, 0xDD00, 0, 0x0800, 0, 0x0621, 0, 0x0710, 0,
- 0x0780, 0, 0x07CA, 0, 0x2D30, 0, 0x1200, 0, 0x0950, 0, 0x0985, 0, 0x0A74, 0, 0x0AD0, 0, 0x0B05, 0, 0x0BD0, 0,
- 0x0C05, 0, 0x0C85, 0, 0x0D05, 0, 0x0D85, 0,
- 0xAAF2, 0, // Meetei Mayek
- 0xA800, 0, 0xA882, 0, 0xD804, 0xDC83, 0,
- U16_LEAD(0x111C4), U16_TRAIL(0x111C4), 0, // Sharada
- U16_LEAD(0x11680), U16_TRAIL(0x11680), 0, // Takri
- 0x1B83, 0,
- 0xD802, 0xDE00, 0, 0x0E01, 0,
- 0x0EDE, 0, // Lao
- 0xAA80, 0, 0x0F40, 0, 0x1C00, 0, 0xA840, 0, 0x1900, 0, 0x1700, 0, 0x1720, 0,
- 0x1740, 0, 0x1760, 0, 0x1A00, 0, 0xA930, 0, 0xA90A, 0, 0x1000, 0,
- U16_LEAD(0x11103), U16_TRAIL(0x11103), 0, // Chakma
- 0x1780, 0, 0x1950, 0, 0x1980, 0, 0x1A20, 0,
- 0xAA00, 0, 0x1B05, 0, 0xA984, 0, 0x1880, 0, 0x1C5A, 0, 0x13A0, 0, 0x1401, 0, 0x1681, 0, 0x16A0, 0, 0xD803, 0xDC00, 0,
- 0xA500, 0, 0xA6A0, 0, 0x1100, 0, 0x3041, 0, 0x30A1, 0, 0x3105, 0, 0xA000, 0, 0xA4F8, 0,
- U16_LEAD(0x16F00), U16_TRAIL(0x16F00), 0, // Miao
- 0xD800, 0xDE80, 0,
- 0xD800, 0xDEA0, 0, 0xD802, 0xDD20, 0, 0xD800, 0xDF00, 0, 0xD800, 0xDF30, 0, 0xD801, 0xDC28, 0, 0xD801, 0xDC50, 0,
- 0xD801, 0xDC80, 0,
- U16_LEAD(0x110D0), U16_TRAIL(0x110D0), 0, // Sora Sompeng
- 0xD800, 0xDC00, 0, 0xD802, 0xDC00, 0, 0xD802, 0xDE60, 0, 0xD802, 0xDF00, 0, 0xD802, 0xDC40, 0,
- 0xD802, 0xDF40, 0, 0xD802, 0xDF60, 0, 0xD800, 0xDF80, 0, 0xD800, 0xDFA0, 0, 0xD808, 0xDC00, 0, 0xD80C, 0xDC00, 0,
- U16_LEAD(0x109A0), U16_TRAIL(0x109A0), 0, // Meroitic Cursive
- U16_LEAD(0x10980), U16_TRAIL(0x10980), 0, // Meroitic Hieroglyphs
- 0x4E00, 0 };
+/**
+ * This list contains one character per script that has the
+ * lowest primary weight for that script in the root collator.
+ * This list will be copied and sorted to account for script reordering.
+ *
+ * <p>TODO: This is fragile. If the first character of a script is tailored
+ * so that it does not map to the script's lowest primary weight any more,
+ * then the buckets will be off.
+ * There are hacks in the code to handle the known CJK tailorings of U+4E00.
+ *
+ * <p>We use "A" not "a" because the en_US_POSIX tailoring sorts A primary-before a.
+ *
+ * Keep this in sync with HACK_FIRST_CHARS_IN_SCRIPTS in
+ * ICU4J main/classes/collate/src/com/ibm/icu/text/AlphabeticIndex.java
+ */
+static const UChar HACK_FIRST_CHARS_IN_SCRIPTS[] = {
+ 0x41, 0, 0x03B1, 0,
+ 0x2C81, 0, 0x0430, 0, 0x2C30, 0, 0x10D0, 0, 0x0561, 0, 0x05D0, 0, 0xD802, 0xDD00, 0, 0x0800, 0, 0x0621, 0, 0x0710, 0,
+ 0x0780, 0, 0x07CA, 0, 0x2D30, 0, 0x1200, 0, 0x0950, 0, 0x0985, 0, 0x0A74, 0, 0x0AD0, 0, 0x0B05, 0, 0x0BD0, 0,
+ 0x0C05, 0, 0x0C85, 0, 0x0D05, 0, 0x0D85, 0,
+ 0xAAF2, 0, // Meetei Mayek
+ 0xA800, 0, 0xA882, 0, 0xD804, 0xDC83, 0,
+ U16_LEAD(0x111C4), U16_TRAIL(0x111C4), 0, // Sharada
+ U16_LEAD(0x11680), U16_TRAIL(0x11680), 0, // Takri
+ 0x1B83, 0,
+ 0xD802, 0xDE00, 0, 0x0E01, 0,
+ 0x0EDE, 0, // Lao
+ 0xAA80, 0, 0x0F40, 0, 0x1C00, 0, 0xA840, 0, 0x1900, 0, 0x1700, 0, 0x1720, 0,
+ 0x1740, 0, 0x1760, 0, 0x1A00, 0, 0xA930, 0, 0xA90A, 0, 0x1000, 0,
+ U16_LEAD(0x11103), U16_TRAIL(0x11103), 0, // Chakma
+ 0x1780, 0, 0x1950, 0, 0x1980, 0, 0x1A20, 0,
+ 0xAA00, 0, 0x1B05, 0, 0xA984, 0, 0x1880, 0, 0x1C5A, 0, 0x13A0, 0, 0x1401, 0, 0x1681, 0, 0x16A0, 0, 0xD803, 0xDC00, 0,
+ 0xA500, 0, 0xA6A0, 0, 0x1100, 0, 0x3041, 0, 0x30A1, 0, 0x3105, 0, 0xA000, 0, 0xA4F8, 0,
+ U16_LEAD(0x16F00), U16_TRAIL(0x16F00), 0, // Miao
+ 0xD800, 0xDE80, 0,
+ 0xD800, 0xDEA0, 0, 0xD802, 0xDD20, 0, 0xD800, 0xDF00, 0, 0xD800, 0xDF30, 0, 0xD801, 0xDC28, 0, 0xD801, 0xDC50, 0,
+ 0xD801, 0xDC80, 0,
+ U16_LEAD(0x110D0), U16_TRAIL(0x110D0), 0, // Sora Sompeng
+ 0xD800, 0xDC00, 0, 0xD802, 0xDC00, 0, 0xD802, 0xDE60, 0, 0xD802, 0xDF00, 0, 0xD802, 0xDC40, 0,
+ 0xD802, 0xDF40, 0, 0xD802, 0xDF60, 0, 0xD800, 0xDF80, 0, 0xD800, 0xDFA0, 0, 0xD808, 0xDC00, 0, 0xD80C, 0xDC00, 0,
+ U16_LEAD(0x109A0), U16_TRAIL(0x109A0), 0, // Meroitic Cursive
+ U16_LEAD(0x10980), U16_TRAIL(0x10980), 0, // Meroitic Hieroglyphs
+ 0x4E00, 0,
+ // TODO: The overflow bucket's lowerBoundary string should be the
+ // first item after the last reordering group in the collator's script order.
+ // This should normally be the first Unicode code point
+ // that is unassigned (U+0378 in Unicode 6.3) and untailored.
+ // However, at least up to ICU 51 the Hani reordering group includes
+ // unassigned code points,
+ // and there is no stable string for the start of the trailing-weights range.
+ // The only known string that sorts "high" is U+FFFF.
+ // When ICU separates Hani vs. unassigned reordering groups, we need to fix this,
+ // and fix relevant test code.
+ // Ideally, FractionalUCA.txt will have a "script first primary"
+ // for unassigned code points.
+ 0xFFFF, 0
+};
UVector *AlphabeticIndex::firstStringsInScript(UErrorCode &status) {
if (U_FAILURE(status)) {
}
UVector *dest = new UVector(status);
if (dest == NULL) {
- if (U_SUCCESS(status)) {
- status = U_MEMORY_ALLOCATION_ERROR;
- }
+ status = U_MEMORY_ALLOCATION_ERROR;
return NULL;
}
dest->setDeleter(uprv_deleteUObject);
const UChar *src = HACK_FIRST_CHARS_IN_SCRIPTS;
- const UChar *limit = src + sizeof(HACK_FIRST_CHARS_IN_SCRIPTS) / sizeof(HACK_FIRST_CHARS_IN_SCRIPTS[0]);
+ const UChar *limit = src + LENGTHOF(HACK_FIRST_CHARS_IN_SCRIPTS);
do {
if (U_FAILURE(status)) {
return dest;
UnicodeString *str = new UnicodeString(src, -1);
if (str == NULL) {
status = U_MEMORY_ALLOCATION_ERROR;
- } else {
- dest->addElement(str, status);
- src += str->length() + 1;
+ return dest;
}
+ dest->addElement(str, status);
+ src += str->length() + 1;
} while (src < limit);
- dest->sortWithUComparator(sortCollateComparator, collator_, status);
return dest;
}
-AlphabeticIndex::ELangType AlphabeticIndex::langTypeFromLocale(const Locale &loc) {
- const char *lang = loc.getLanguage();
- if (uprv_strcmp(lang, "zh") != 0) {
- return kNormal;
- }
- const char *script = loc.getScript();
- if (uprv_strcmp(script, "Hant") == 0) {
- return kTraditional;
- }
- const char *country = loc.getCountry();
- if (uprv_strcmp(country, "TW") == 0) {
- return kTraditional;
- }
- return kSimplified;
-}
-
-
-//
-// Pinyin Hacks. Direct port from Java.
-//
-
-static const UChar32 probeCharInLong = 0x28EAD;
-
-
-static const UChar PINYIN_LOWER_BOUNDS_SHORT[] = { // "\u0101bcd\u0113fghjkl\u1E3F\u0144\u014Dpqrstwxyz"
- 0x0101, 0x62, 0x63, 0x64, 0x0113, 0x66, 0x67, 0x68, 0x6A, 0x6B, /*l*/0x6C, 0x1E3F, 0x0144, 0x014D,
- /*p*/0x70, 0x71, 0x72, 0x73, 0x74, /*w*/0x77, 0x78, 0x79, 0x7A};
-
-
-// Pinyin lookup tables copied, pasted (and reformatted) from the ICU4J code.
-
-AlphabeticIndex::PinyinLookup AlphabeticIndex::HACK_PINYIN_LOOKUP_SHORT = {
- {(UChar)0, (UChar)0, (UChar)0}, // A
- {(UChar)0x516B, (UChar)0, (UChar)0}, // B
- {(UChar)0x5693, (UChar)0, (UChar)0}, // C
- {(UChar)0x5491, (UChar)0, (UChar)0}, // D
- {(UChar)0x59B8, (UChar)0, (UChar)0}, // E
- {(UChar)0x53D1, (UChar)0, (UChar)0}, // F
- {(UChar)0x65EE, (UChar)0, (UChar)0}, // G
- {(UChar)0x54C8, (UChar)0, (UChar)0}, // H
- {(UChar)0x4E0C, (UChar)0, (UChar)0}, // J
- {(UChar)0x5494, (UChar)0, (UChar)0}, // K
- {(UChar)0x5783, (UChar)0, (UChar)0}, // L
- {(UChar)0x5452, (UChar)0, (UChar)0}, // M
- {(UChar)0x5514, (UChar)0, (UChar)0}, // N
- {(UChar)0x5594, (UChar)0, (UChar)0}, // O
- {(UChar)0x5991, (UChar)0, (UChar)0}, // P
- {(UChar)0x4E03, (UChar)0, (UChar)0}, // Q
- {(UChar)0x513F, (UChar)0, (UChar)0}, // R
- {(UChar)0x4EE8, (UChar)0, (UChar)0}, // S
- {(UChar)0x4ED6, (UChar)0, (UChar)0}, // T
- {(UChar)0x7A75, (UChar)0, (UChar)0}, // W
- {(UChar)0x5915, (UChar)0, (UChar)0}, // X
- {(UChar)0x4E2B, (UChar)0, (UChar)0}, // Y
- {(UChar)0x5E00, (UChar)0, (UChar)0}, // Z
- {(UChar)0xFFFF, (UChar)0, (UChar)0}, // mark end of array
- };
-
-static const UChar PINYIN_LOWER_BOUNDS_LONG[] = { // "\u0101bcd\u0113fghjkl\u1E3F\u0144\u014Dpqrstwxyz";
- 0x0101, 0x62, 0x63, 0x64, 0x0113, 0x66, 0x67, 0x68, 0x6A, 0x6B, /*l*/0x6C, 0x1E3F, 0x0144, 0x014D,
- /*p*/0x70, 0x71, 0x72, 0x73, 0x74, /*w*/0x77, 0x78, 0x79, 0x7A};
-
-AlphabeticIndex::PinyinLookup AlphabeticIndex::HACK_PINYIN_LOOKUP_LONG = {
- {(UChar)0, (UChar)0, (UChar)0}, // A
- {(UChar)0x516B, (UChar)0, (UChar)0}, // b
- {(UChar)0xD863, (UChar)0xDEAD, (UChar)0}, // c
- {(UChar)0xD844, (UChar)0xDE51, (UChar)0}, // d
- {(UChar)0x59B8, (UChar)0, (UChar)0}, // e
- {(UChar)0x53D1, (UChar)0, (UChar)0}, // f
- {(UChar)0xD844, (UChar)0xDE45, (UChar)0}, // g
- {(UChar)0x54C8, (UChar)0, (UChar)0}, // h
- {(UChar)0x4E0C, (UChar)0, (UChar)0}, // j
- {(UChar)0x5494, (UChar)0, (UChar)0}, // k
- {(UChar)0x3547, (UChar)0, (UChar)0}, // l
- {(UChar)0x5452, (UChar)0, (UChar)0}, // m
- {(UChar)0x5514, (UChar)0, (UChar)0}, // n
- {(UChar)0x5594, (UChar)0, (UChar)0}, // o
- {(UChar)0xD84F, (UChar)0xDC7A, (UChar)0}, // p
- {(UChar)0x4E03, (UChar)0, (UChar)0}, // q
- {(UChar)0x513F, (UChar)0, (UChar)0}, // r
- {(UChar)0x4EE8, (UChar)0, (UChar)0}, // s
- {(UChar)0x4ED6, (UChar)0, (UChar)0}, // t
- {(UChar)0x7A75, (UChar)0, (UChar)0}, // w
- {(UChar)0x5915, (UChar)0, (UChar)0}, // x
- {(UChar)0x4E2B, (UChar)0, (UChar)0}, // y
- {(UChar)0x5E00, (UChar)0, (UChar)0}, // z
- {(UChar)0xFFFF, (UChar)0, (UChar)0}, // mark end of array
- };
-
-
-//
-// Probe the collation data, and decide which Pinyin tables should be used
-//
-// ICU can be built with a choice between two Chinese collations.
-// The hack Pinyin tables to use depend on which one is in use.
-// We can assume that any given copy of ICU will have only one of the collations available,
-// and that there is no way, in a given process, to create two alphabetic indexes using
-// different Chinese collations. Which means the probe can be done once
-// and the results cached.
-//
-// This whole arrangement is temporary.
-//
-AlphabeticIndex::PinyinLookup *AlphabeticIndex::HACK_PINYIN_LOOKUP = NULL;
-const UChar *AlphabeticIndex::PINYIN_LOWER_BOUNDS = NULL;
-
-void AlphabeticIndex::initPinyinBounds(const Collator *col, UErrorCode &status) {
- {
- Mutex m;
- if (PINYIN_LOWER_BOUNDS != NULL) {
- return;
- }
- }
- UnicodeSet *colSet = col->getTailoredSet(status);
- if (U_FAILURE(status) || colSet == NULL) {
- delete colSet;
- if (U_SUCCESS(status)) {
- status = U_MEMORY_ALLOCATION_ERROR;
- }
- return;
- }
- UBool useLongTables = colSet->contains(probeCharInLong);
- delete colSet;
- {
- Mutex m;
- if (useLongTables) {
- PINYIN_LOWER_BOUNDS = PINYIN_LOWER_BOUNDS_LONG;
- HACK_PINYIN_LOOKUP = &HACK_PINYIN_LOOKUP_LONG;
- } else {
- PINYIN_LOWER_BOUNDS = PINYIN_LOWER_BOUNDS_SHORT;
- HACK_PINYIN_LOOKUP = &HACK_PINYIN_LOOKUP_SHORT;
- }
- }
-}
-
-// Pinyin Hack:
-// Modify a Chinese name by prepending a Latin letter. The modified name is used
-// when putting records (names) into buckets, to put the name under a Latin index heading.
-
-void AlphabeticIndex::hackName(UnicodeString &dest, const UnicodeString &name, const Collator *col) {
-
- if (langType_ != kSimplified || !UNIHAN->contains(name.char32At(0))) {
- dest = name;
- return;
- }
-
- UErrorCode status = U_ZERO_ERROR;
- initPinyinBounds(col, status);
- if (U_FAILURE(status)) {
- dest = name;
- return;
- }
- // TODO: use binary search
- int index;
- for (index=0; ; index++) {
- if ((*HACK_PINYIN_LOOKUP)[index][0] == (UChar)0xffff) {
- index--;
- break;
- }
- int32_t compareResult = col->compare(name, UnicodeString(TRUE, (*HACK_PINYIN_LOOKUP)[index], -1));
- if (compareResult < 0) {
- index--;
- }
- if (compareResult <= 0) {
- break;
- }
- }
- UChar c = PINYIN_LOWER_BOUNDS[index];
- dest.setTo(c);
- dest.append(name);
- return;
-}
-
-
+namespace {
/**
- * Comparator that returns "better" items first, where shorter NFKD is better, and otherwise NFKD binary order is
- * better, and otherwise binary order is better.
- *
- * For use with array sort or UVector.
- * @param context A UErrorCode pointer.
- * @param left A UElement pointer, which must refer to a UnicodeString *
- * @param right A UElement pointer, which must refer to a UnicodeString *
+ * Returns true if one index character string is "better" than the other.
+ * Shorter NFKD is better, and otherwise NFKD-binary-less-than is
+ * better, and otherwise binary-less-than is better.
*/
-
-static int32_t U_CALLCONV
-PreferenceComparator(const void *context, const void *left, const void *right) {
- const UElement *leftElement = static_cast<const UElement *>(left);
- const UElement *rightElement = static_cast<const UElement *>(right);
- const UnicodeString *s1 = static_cast<const UnicodeString *>(leftElement->pointer);
- const UnicodeString *s2 = static_cast<const UnicodeString *>(rightElement->pointer);
- UErrorCode &status = *(UErrorCode *)(context); // Cast off both static and const.
- if (s1 == s2) {
- return 0;
- }
-
- UnicodeString n1 = nfkdNormalizer->normalize(*s1, status);
- UnicodeString n2 = nfkdNormalizer->normalize(*s2, status);
- int32_t result = n1.length() - n2.length();
+UBool isOneLabelBetterThanOther(const Normalizer2 &nfkdNormalizer,
+ const UnicodeString &one, const UnicodeString &other) {
+ // This is called with primary-equal strings, but never with one.equals(other).
+ UErrorCode status = U_ZERO_ERROR;
+ UnicodeString n1 = nfkdNormalizer.normalize(one, status);
+ UnicodeString n2 = nfkdNormalizer.normalize(other, status);
+ if (U_FAILURE(status)) { return FALSE; }
+ int32_t result = n1.countChar32() - n2.countChar32();
if (result != 0) {
- return result;
+ return result < 0;
}
-
result = n1.compareCodePointOrder(n2);
if (result != 0) {
- return result;
+ return result < 0;
}
- return s1->compareCodePointOrder(*s2);
+ return one.compareCodePointOrder(other) < 0;
}
+} // namespace
//
// Constructor & Destructor for AlphabeticIndex::Record
// Records are internal only, instances are not directly surfaced in the public API.
// This class is mostly struct-like, with all public fields.
-AlphabeticIndex::Record::Record(AlphabeticIndex *alphaIndex, const UnicodeString &name, const void *data):
- alphaIndex_(alphaIndex), name_(name), data_(data)
-{
- UnicodeString prefixedName;
- alphaIndex->hackName(sortingName_, name_, alphaIndex->collatorPrimaryOnly_);
- serialNumber_ = ++alphaIndex->recordCounter_;
-}
-
+AlphabeticIndex::Record::Record(const UnicodeString &name, const void *data)
+ : name_(name), data_(data) {}
+
AlphabeticIndex::Record::~Record() {
}
if (U_FAILURE(status)) {
return *this;
}
- Record *r = new Record(this, name, data);
- inputRecords_->addElement(r, status);
- indexBuildRequired_ = TRUE;
+ if (inputList_ == NULL) {
+ inputList_ = new UVector(status);
+ if (inputList_ == NULL) {
+ status = U_MEMORY_ALLOCATION_ERROR;
+ return *this;
+ }
+ inputList_->setDeleter(alphaIndex_deleteRecord);
+ }
+ Record *r = new Record(name, data);
+ if (r == NULL) {
+ status = U_MEMORY_ALLOCATION_ERROR;
+ return *this;
+ }
+ inputList_->addElement(r, status);
+ clearBuckets();
//std::string ss;
//std::string ss2;
//std::cout << "added record: name = \"" << r->name_.toUTF8String(ss) << "\"" <<
AlphabeticIndex &AlphabeticIndex::clearRecords(UErrorCode &status) {
- if (U_FAILURE(status)) {
- return *this;
+ if (U_SUCCESS(status) && inputList_ != NULL && !inputList_->isEmpty()) {
+ inputList_->removeAllElements();
+ clearBuckets();
}
- inputRecords_->removeAllElements();
- indexBuildRequired_ = TRUE;
return *this;
}
-
int32_t AlphabeticIndex::getBucketIndex(const UnicodeString &name, UErrorCode &status) {
- buildIndex(status);
+ initBuckets(status);
if (U_FAILURE(status)) {
return 0;
}
-
- // For simplified Chinese prepend a prefix to the name.
- // For non-Chinese locales or non-Chinese names, the name is not modified.
-
- UnicodeString prefixedName;
- hackName(prefixedName, name, collatorPrimaryOnly_);
-
- // TODO: use a binary search.
- for (int32_t i = 0; i < bucketList_->size(); ++i) {
- Bucket *bucket = static_cast<Bucket *>(bucketList_->elementAt(i));
- Collator::EComparisonResult comp = collatorPrimaryOnly_->compare(prefixedName, bucket->lowerBoundary_);
- if (comp < 0) {
- return i - 1;
- }
- }
- // Loop runs until we find the bucket following the one that would hold prefixedName.
- // If the prefixedName belongs in the last bucket the loop will drop out the bottom rather
- // than returning from the middle.
-
- return bucketList_->size() - 1;
+ return buckets_->getBucketIndex(name, *collatorPrimaryOnly_, status);
}
if (U_FAILURE(status)) {
return FALSE;
}
- if (indexBuildRequired_ && currentBucket_ != NULL) {
+ if (buckets_ == NULL && currentBucket_ != NULL) {
status = U_ENUM_OUT_OF_SYNC_ERROR;
return FALSE;
}
- buildIndex(status);
+ initBuckets(status);
if (U_FAILURE(status)) {
return FALSE;
}
++labelsIterIndex_;
- if (labelsIterIndex_ >= bucketList_->size()) {
- labelsIterIndex_ = bucketList_->size();
+ if (labelsIterIndex_ >= buckets_->getBucketCount()) {
+ labelsIterIndex_ = buckets_->getBucketCount();
return FALSE;
}
- currentBucket_ = static_cast<Bucket *>(bucketList_->elementAt(labelsIterIndex_));
+ currentBucket_ = getBucket(*buckets_->immutableVisibleList_, labelsIterIndex_);
resetRecordIterator();
return TRUE;
}
if (currentBucket_ != NULL) {
return currentBucket_->label_;
} else {
- return *EMPTY_STRING;
+ return emptyString_;
}
}
int32_t AlphabeticIndex::getBucketRecordCount() const {
- if (currentBucket_ != NULL) {
+ if (currentBucket_ != NULL && currentBucket_->records_ != NULL) {
return currentBucket_->records_->size();
} else {
return 0;
if (U_FAILURE(status)) {
return *this;
}
- buildIndex(status);
- labelsIterIndex_ = -1;
- currentBucket_ = NULL;
+ internalResetBucketIterator();
return *this;
}
status = U_INVALID_STATE_ERROR;
return FALSE;
}
- if (indexBuildRequired_) {
+ if (buckets_ == NULL) {
status = U_ENUM_OUT_OF_SYNC_ERROR;
return FALSE;
}
+ if (currentBucket_->records_ == NULL) {
+ return FALSE;
+ }
++itemsIterIndex_;
if (itemsIterIndex_ >= currentBucket_->records_->size()) {
itemsIterIndex_ = currentBucket_->records_->size();
const UnicodeString &AlphabeticIndex::getRecordName() const {
- const UnicodeString *retStr = EMPTY_STRING;
- if (currentBucket_ != NULL &&
+ const UnicodeString *retStr = &emptyString_;
+ if (currentBucket_ != NULL && currentBucket_->records_ != NULL &&
itemsIterIndex_ >= 0 &&
itemsIterIndex_ < currentBucket_->records_->size()) {
Record *item = static_cast<Record *>(currentBucket_->records_->elementAt(itemsIterIndex_));
const void *AlphabeticIndex::getRecordData() const {
const void *retPtr = NULL;
- if (currentBucket_ != NULL &&
+ if (currentBucket_ != NULL && currentBucket_->records_ != NULL &&
itemsIterIndex_ >= 0 &&
itemsIterIndex_ < currentBucket_->records_->size()) {
Record *item = static_cast<Record *>(currentBucket_->records_->elementAt(itemsIterIndex_));
AlphabeticIndex::Bucket::Bucket(const UnicodeString &label,
const UnicodeString &lowerBoundary,
- UAlphabeticIndexLabelType type,
- UErrorCode &status):
- label_(label), lowerBoundary_(lowerBoundary), labelType_(type), records_(NULL) {
- if (U_FAILURE(status)) {
- return;
- }
- records_ = new UVector(status);
- if (records_ == NULL && U_SUCCESS(status)) {
- status = U_MEMORY_ALLOCATION_ERROR;
- }
+ UAlphabeticIndexLabelType type)
+ : label_(label), lowerBoundary_(lowerBoundary), labelType_(type),
+ displayBucket_(NULL), displayIndex_(-1),
+ records_(NULL) {
}