git.saurik.com Git - apple/icu.git/blame_incremental

... / ...

Commit	Line	Data
	1	/*
	2	*******************************************************************************
	3	* Copyright (C) 2009-2012, International Business Machines Corporation and *
	4	* others. All Rights Reserved. *
	5	*******************************************************************************
	6	*/
	7
	8	/**
	9	* \file
	10	* \brief C API: AlphabeticIndex class
	11	*/
	12
	13	#include "unicode/utypes.h"
	14
	15	#if !UCONFIG_NO_COLLATION && !UCONFIG_NO_NORMALIZATION
	16
	17	#include "unicode/alphaindex.h"
	18	#include "unicode/coll.h"
	19	#include "unicode/normalizer2.h"
	20	#include "unicode/strenum.h"
	21	#include "unicode/tblcoll.h"
	22	#include "unicode/ulocdata.h"
	23	#include "unicode/uniset.h"
	24	#include "unicode/uobject.h"
	25	#include "unicode/uscript.h"
	26	#include "unicode/usetiter.h"
	27	#include "unicode/ustring.h"
	28	#include "unicode/utf16.h"
	29
	30	#include "cstring.h"
	31	#include "mutex.h"
	32	#include "uassert.h"
	33	#include "ucln_in.h"
	34	#include "uhash.h"
	35	#include "uvector.h"
	36
	37	//#include <string>
	38	//#include <iostream>
	39	U_NAMESPACE_BEGIN
	40
	41	UOBJECT_DEFINE_NO_RTTI_IMPLEMENTATION(AlphabeticIndex)
	42
	43	// Forward Declarations
	44	static int32_t U_CALLCONV
	45	PreferenceComparator(const void context, const void left, const void *right);
	46
	47	static int32_t U_CALLCONV
	48	sortCollateComparator(const void context, const void left, const void *right);
	49
	50	static int32_t U_CALLCONV
	51	recordCompareFn(const void context, const void left, const void *right);
	52
	53	// UVector<Bucket *> support function, delete a Bucket.
	54	static void U_CALLCONV
	55	alphaIndex_deleteBucket(void *obj) {
	56	delete static_cast<AlphabeticIndex::Bucket *>(obj);
	57	}
	58
	59	// UVector<Record *> support function, delete a Record.
	60	static void U_CALLCONV
	61	alphaIndex_deleteRecord(void *obj) {
	62	delete static_cast<AlphabeticIndex::Record *>(obj);
	63	}
	64
	65
	66
	67	static const Normalizer2 *nfkdNormalizer;
	68
	69	//
	70	// Append the contents of a UnicodeSet to a UVector of UnicodeStrings.
	71	// Append everything - individual characters are handled as strings of length 1.
	72	// The destination vector owns the appended strings.
	73
	74	static void appendUnicodeSetToUVector(UVector &dest, const UnicodeSet &source, UErrorCode &status) {
	75	UnicodeSetIterator setIter(source);
	76	while (setIter.next()) {
	77	const UnicodeString &str = setIter.getString();
	78	dest.addElement(str.clone(), status);
	79	}
	80	}
	81
	82
	83	AlphabeticIndex::AlphabeticIndex(const Locale &locale, UErrorCode &status) {
	84	init(status);
	85	if (U_FAILURE(status)) {
	86	return;
	87	}
	88	locale_ = locale;
	89	langType_ = langTypeFromLocale(locale_);
	90
	91	collator_ = Collator::createInstance(locale, status);
	92	if (collator_ != NULL) {
	93	collatorPrimaryOnly_ = collator_->clone();
	94	}
	95	if (collatorPrimaryOnly_ != NULL) {
	96	collatorPrimaryOnly_->setStrength(Collator::PRIMARY);
	97	}
	98	getIndexExemplars(*initialLabels_, locale, status);
	99	indexBuildRequired_ = TRUE;
	100	if ((collator_ == NULL \|\| collatorPrimaryOnly_ == NULL) && U_SUCCESS(status)) {
	101	status = U_MEMORY_ALLOCATION_ERROR;
	102	}
	103	firstScriptCharacters_ = firstStringsInScript(status);
	104	}
	105
	106
	107	AlphabeticIndex::~AlphabeticIndex() {
	108	uhash_close(alreadyIn_);
	109	delete bucketList_;
	110	delete collator_;
	111	delete collatorPrimaryOnly_;
	112	delete firstScriptCharacters_;
	113	delete labels_;
	114	delete inputRecords_;
	115	delete noDistinctSorting_;
	116	delete notAlphabetic_;
	117	delete initialLabels_;
	118	}
	119
	120
	121	AlphabeticIndex &AlphabeticIndex::addLabels(const UnicodeSet &additions, UErrorCode &status) {
	122	if (U_FAILURE(status)) {
	123	return *this;
	124	}
	125	initialLabels_->addAll(additions);
	126	return *this;
	127	}
	128
	129
	130	AlphabeticIndex &AlphabeticIndex::addLabels(const Locale &locale, UErrorCode &status) {
	131	if (U_FAILURE(status)) {
	132	return *this;
	133	}
	134	UnicodeSet additions;
	135	getIndexExemplars(additions, locale, status);
	136	initialLabels_->addAll(additions);
	137	return *this;
	138	}
	139
	140
	141	int32_t AlphabeticIndex::getBucketCount(UErrorCode &status) {
	142	buildIndex(status);
	143	if (U_FAILURE(status)) {
	144	return 0;
	145	}
	146	return bucketList_->size();
	147	}
	148
	149
	150	int32_t AlphabeticIndex::getRecordCount(UErrorCode &status) {
	151	if (U_FAILURE(status)) {
	152	return 0;
	153	}
	154	return inputRecords_->size();
	155	}
	156
	157
	158	void AlphabeticIndex::buildIndex(UErrorCode &status) {
	159	if (U_FAILURE(status)) {
	160	return;
	161	}
	162	if (!indexBuildRequired_) {
	163	return;
	164	}
	165
	166	// Discard any already-built data.
	167	// This is important when the user builds and uses an index, then subsequently modifies it,
	168	// necessitating a rebuild.
	169
	170	bucketList_->removeAllElements();
	171	labels_->removeAllElements();
	172	uhash_removeAll(alreadyIn_);
	173	noDistinctSorting_->clear();
	174	notAlphabetic_->clear();
	175
	176	// first sort the incoming Labels, with a "best" ordering among items
	177	// that are the same according to the collator
	178
	179	UVector preferenceSorting(status); // Vector of UnicodeStrings; owned by the vector.
	180	preferenceSorting.setDeleter(uprv_deleteUObject);
	181	appendUnicodeSetToUVector(preferenceSorting, *initialLabels_, status);
	182	preferenceSorting.sortWithUComparator(PreferenceComparator, &status, status);
	183
	184	// We now make a set of Labels.
	185	// Some of the input may, however, be redundant.
	186	// That is, we might have c, ch, d, where "ch" sorts just like "c", "h"
	187	// So we make a pass through, filtering out those cases.
	188	// TODO: filtering these out would seem to be at odds with the eventual goal
	189	// of being able to split buckets that contain too many items.
	190
	191	UnicodeSet labelSet;
	192	for (int32_t psIndex=0; psIndex<preferenceSorting.size(); psIndex++) {
	193	UnicodeString item = static_cast<const UnicodeString >(preferenceSorting.elementAt(psIndex));
	194	// TODO: Since preferenceSorting was originally populated from the contents of a UnicodeSet,
	195	// is it even possible for duplicates to show up in this check?
	196	if (labelSet.contains(item)) {
	197	UnicodeSetIterator itemAlreadyInIter(labelSet);
	198	while (itemAlreadyInIter.next()) {
	199	const UnicodeString &itemAlreadyIn = itemAlreadyInIter.getString();
	200	if (collatorPrimaryOnly_->compare(item, itemAlreadyIn) == 0) {
	201	UnicodeSet targets = static_cast<UnicodeSet >(uhash_get(alreadyIn_, &itemAlreadyIn));
	202	if (targets == NULL) {
	203	// alreadyIn.put(itemAlreadyIn, targets = new LinkedHashSet<String>());
	204	targets = new UnicodeSet();
	205	uhash_put(alreadyIn_, itemAlreadyIn.clone(), targets, &status);
	206	}
	207	targets->add(item);
	208	break;
	209	}
	210	}
	211	} else if (item.moveIndex32(0, 1) < item.length() && // Label contains more than one code point.
	212	collatorPrimaryOnly_->compare(item, separated(item)) == 0) {
	213	noDistinctSorting_->add(item);
	214	} else if (!ALPHABETIC->containsSome(item)) {
	215	notAlphabetic_->add(item);
	216	} else {
	217	labelSet.add(item);
	218	}
	219	}
	220
	221	// If we have no labels, hard-code a fallback default set of [A-Z]
	222	// This case can occur with locales that don't have exemplar character data, including root.
	223	// A no-labels situation will cause other problems; it needs to be avoided.
	224	if (labelSet.isEmpty()) {
	225	labelSet.add((UChar32)0x41, (UChar32)0x5A);
	226	}
	227
	228	// Move the set of Labels from the set into a vector, and sort
	229	// according to the collator.
	230
	231	appendUnicodeSetToUVector(*labels_, labelSet, status);
	232	labels_->sortWithUComparator(sortCollateComparator, collatorPrimaryOnly_, status);
	233
	234	// if the result is still too large, cut down to maxLabelCount_ elements, by removing every nth element
	235	// Implemented by copying the elements to be retained to a new UVector.
	236
	237	const int32_t size = labelSet.size() - 1;
	238	if (size > maxLabelCount_) {
	239	UVector *newLabels = new UVector(status);
	240	newLabels->setDeleter(uprv_deleteUObject);
	241	int32_t count = 0;
	242	int32_t old = -1;
	243	for (int32_t srcIndex=0; srcIndex<labels_->size(); srcIndex++) {
	244	const UnicodeString str = static_cast<const UnicodeString >(labels_->elementAt(srcIndex));
	245	++count;
	246	const int32_t bump = count * maxLabelCount_ / size;
	247	if (bump == old) {
	248	// it.remove();
	249	} else {
	250	newLabels->addElement(str->clone(), status);
	251	old = bump;
	252	}
	253	}
	254	delete labels_;
	255	labels_ = newLabels;
	256	}
	257
	258	// We now know the list of labels.
	259	// Create a corresponding list of buckets, one per label.
	260
	261	buildBucketList(status); // Corresponds to Java BucketList constructor.
	262
	263	// Bin the Records into the Buckets.
	264	bucketRecords(status);
	265
	266	indexBuildRequired_ = FALSE;
	267	resetBucketIterator(status);
	268	}
	269
	270	//
	271	// buildBucketList() Corresponds to the BucketList constructor in the Java version.
	272
	273	void AlphabeticIndex::buildBucketList(UErrorCode &status) {
	274	UnicodeString labelStr = getUnderflowLabel();
	275	Bucket b = new Bucket(labelStr, EMPTY_STRING, U_ALPHAINDEX_UNDERFLOW, status);
	276	bucketList_->addElement(b, status);
	277
	278	// Build up the list, adding underflow, additions, overflow
	279	// insert infix labels as needed, using \uFFFF.
	280	const UnicodeString last = static_cast<UnicodeString >(labels_->elementAt(0));
	281	b = new Bucket(last, last, U_ALPHAINDEX_NORMAL, status);
	282	bucketList_->addElement(b, status);
	283
	284	UnicodeSet lastSet;
	285	UnicodeSet set;
	286	AlphabeticIndex::getScriptSet(lastSet, *last, status);
	287	lastSet.removeAll(*IGNORE_SCRIPTS);
	288
	289	for (int i = 1; i < labels_->size(); ++i) {
	290	UnicodeString current = static_cast<UnicodeString >(labels_->elementAt(i));
	291	getScriptSet(set, *current, status);
	292	set.removeAll(*IGNORE_SCRIPTS);
	293	if (lastSet.containsNone(set)) {
	294	// check for adjacent
	295	const UnicodeString &overflowComparisonString = getOverflowComparisonString(*last, status);
	296	if (collatorPrimaryOnly_->compare(overflowComparisonString, *current) < 0) {
	297	labelStr = getInflowLabel();
	298	b = new Bucket(labelStr, overflowComparisonString, U_ALPHAINDEX_INFLOW, status);
	299	bucketList_->addElement(b, status);
	300	i++;
	301	lastSet = set;
	302	}
	303	}
	304	b = new Bucket(current, current, U_ALPHAINDEX_NORMAL, status);
	305	bucketList_->addElement(b, status);
	306	last = current;
	307	lastSet = set;
	308	}
	309	const UnicodeString &limitString = getOverflowComparisonString(*last, status);
	310	b = new Bucket(getOverflowLabel(), limitString, U_ALPHAINDEX_OVERFLOW, status);
	311	bucketList_->addElement(b, status);
	312	// final overflow bucket
	313	}
	314
	315
	316	//
	317	// Place all of the raw input records into the correct bucket.
	318	//
	319	// Begin by sorting the input records; this lets us bin them in a single pass.
	320	//
	321	// Note on storage management: The input records are owned by the
	322	// inputRecords_ vector, and will (eventually) be auto-deleted by it.
	323	// The Bucket objects have pointers to the Record objects, but do not own them.
	324	//
	325	void AlphabeticIndex::bucketRecords(UErrorCode &status) {
	326	if (U_FAILURE(status)) {
	327	return;
	328	}
	329
	330	inputRecords_->sortWithUComparator(recordCompareFn, collator_, status);
	331	U_ASSERT(bucketList_->size() > 0); // Should always have at least an overflow
	332	// bucket, even if no user labels.
	333	int32_t bucketIndex = 0;
	334	Bucket destBucket = static_cast<Bucket >(bucketList_->elementAt(bucketIndex));
	335	Bucket *nextBucket = NULL;
	336	if (bucketIndex+1 < bucketList_->size()) {
	337	nextBucket = static_cast<Bucket *>(bucketList_->elementAt(bucketIndex+1));
	338	}
	339	int32_t recordIndex = 0;
	340	Record r = static_cast<Record >(inputRecords_->elementAt(recordIndex));
	341	while (recordIndex < inputRecords_->size()) {
	342	if (nextBucket == NULL \|\|
	343	collatorPrimaryOnly_->compare(r->sortingName_, nextBucket->lowerBoundary_) < 0) {
	344	// Record goes in current bucket. Advance to next record,
	345	// stay on current bucket.
	346	destBucket->records_->addElement(r, status);
	347	++recordIndex;
	348	r = static_cast<Record *>(inputRecords_->elementAt(recordIndex));
	349	} else {
	350	// Advance to the next bucket, stay on current record.
	351	bucketIndex++;
	352	destBucket = nextBucket;
	353	if (bucketIndex+1 < bucketList_->size()) {
	354	nextBucket = static_cast<Bucket *>(bucketList_->elementAt(bucketIndex+1));
	355	} else {
	356	nextBucket = NULL;
	357	}
	358	U_ASSERT(destBucket != NULL);
	359	}
	360	}
	361
	362	}
	363
	364
	365	void AlphabeticIndex::getIndexExemplars(UnicodeSet &dest, const Locale &locale, UErrorCode &status) {
	366	if (U_FAILURE(status)) {
	367	return;
	368	}
	369
	370	LocalULocaleDataPointer uld(ulocdata_open(locale.getName(), &status));
	371	UnicodeSet exemplars;
	372	ulocdata_getExemplarSet(uld.getAlias(), exemplars.toUSet(), 0, ULOCDATA_ES_INDEX, &status);
	373	if (U_SUCCESS(status)) {
	374	dest.addAll(exemplars);
	375	return;
	376	}
	377	status = U_ZERO_ERROR; // Clear out U_MISSING_RESOURCE_ERROR
	378
	379	// Locale data did not include explicit Index characters.
	380	// Synthesize a set of them from the locale's standard exemplar characters.
	381
	382	ulocdata_getExemplarSet(uld.getAlias(), exemplars.toUSet(), 0, ULOCDATA_ES_STANDARD, &status);
	383	if (U_FAILURE(status)) {
	384	return;
	385	}
	386
	387	// Upper-case any that aren't already so.
	388	// (We only do this for synthesized index characters.)
	389
	390	UnicodeSetIterator it(exemplars);
	391	UnicodeString upperC;
	392	UnicodeSet lowersToRemove;
	393	UnicodeSet uppersToAdd;
	394	while (it.next()) {
	395	const UnicodeString &exemplarC = it.getString();
	396	upperC = exemplarC;
	397	upperC.toUpper(locale);
	398	if (exemplarC != upperC) {
	399	lowersToRemove.add(exemplarC);
	400	uppersToAdd.add(upperC);
	401	}
	402	}
	403	exemplars.removeAll(lowersToRemove);
	404	exemplars.addAll(uppersToAdd);
	405
	406	// get the exemplars, and handle special cases
	407
	408	// question: should we add auxiliary exemplars?
	409	if (exemplars.containsSome(*CORE_LATIN)) {
	410	exemplars.addAll(*CORE_LATIN);
	411	}
	412	if (exemplars.containsSome(*HANGUL)) {
	413	// cut down to small list
	414	UnicodeSet BLOCK_HANGUL_SYLLABLES(UNICODE_STRING_SIMPLE("[:block=hangul_syllables:]"), status);
	415	exemplars.removeAll(BLOCK_HANGUL_SYLLABLES);
	416	exemplars.addAll(*HANGUL);
	417	}
	418	if (exemplars.containsSome(*ETHIOPIC)) {
	419	// cut down to small list
	420	// make use of the fact that Ethiopic is allocated in 8's, where
	421	// the base is 0 mod 8.
	422	UnicodeSetIterator it(*ETHIOPIC);
	423	while (it.next() && !it.isString()) {
	424	if ((it.getCodepoint() & 0x7) != 0) {
	425	exemplars.remove(it.getCodepoint());
	426	}
	427	}
	428	}
	429	dest.addAll(exemplars);
	430	}
	431
	432
	433	/*
	434	* Return the string with interspersed CGJs. Input must have more than 2 codepoints.
	435	*/
	436	static const UChar32 CGJ = (UChar)0x034F;
	437	UnicodeString AlphabeticIndex::separated(const UnicodeString &item) {
	438	UnicodeString result;
	439	if (item.length() == 0) {
	440	return result;
	441	}
	442	int32_t i = 0;
	443	for (;;) {
	444	UChar32 cp = item.char32At(i);
	445	result.append(cp);
	446	i = item.moveIndex32(i, 1);
	447	if (i >= item.length()) {
	448	break;
	449	}
	450	result.append(CGJ);
	451	}
	452	return result;
	453	}
	454
	455
	456	UBool AlphabeticIndex::operator==(const AlphabeticIndex& /* other */) const {
	457	return FALSE;
	458	}
	459
	460
	461	UBool AlphabeticIndex::operator!=(const AlphabeticIndex& /* other */) const {
	462	return FALSE;
	463	}
	464
	465
	466	const RuleBasedCollator &AlphabeticIndex::getCollator() const {
	467	// There are no known non-RuleBasedCollator collators, and none ever expected.
	468	// But, in case that changes, better a null pointer than a wrong type.
	469	return dynamic_cast<RuleBasedCollator >(collator_);
	470	}
	471
	472
	473	const UnicodeString &AlphabeticIndex::getInflowLabel() const {
	474	return inflowLabel_;
	475	}
	476
	477	const UnicodeString &AlphabeticIndex::getOverflowLabel() const {
	478	return overflowLabel_;
	479	}
	480
	481
	482	const UnicodeString &AlphabeticIndex::getUnderflowLabel() const {
	483	return underflowLabel_;
	484	}
	485
	486
	487	AlphabeticIndex &AlphabeticIndex::setInflowLabel(const UnicodeString &label, UErrorCode &/status/) {
	488	inflowLabel_ = label;
	489	indexBuildRequired_ = TRUE;
	490	return *this;
	491	}
	492
	493
	494	AlphabeticIndex &AlphabeticIndex::setOverflowLabel(const UnicodeString &label, UErrorCode &/status/) {
	495	overflowLabel_ = label;
	496	indexBuildRequired_ = TRUE;
	497	return *this;
	498	}
	499
	500
	501	AlphabeticIndex &AlphabeticIndex::setUnderflowLabel(const UnicodeString &label, UErrorCode &/status/) {
	502	underflowLabel_ = label;
	503	indexBuildRequired_ = TRUE;
	504	return *this;
	505	}
	506
	507
	508	int32_t AlphabeticIndex::getMaxLabelCount() const {
	509	return maxLabelCount_;
	510	}
	511
	512
	513	AlphabeticIndex &AlphabeticIndex::setMaxLabelCount(int32_t maxLabelCount, UErrorCode &status) {
	514	if (U_FAILURE(status)) {
	515	return *this;
	516	}
	517	if (maxLabelCount <= 0) {
	518	status = U_ILLEGAL_ARGUMENT_ERROR;
	519	return *this;
	520	}
	521	maxLabelCount_ = maxLabelCount;
	522	if (maxLabelCount < bucketList_->size()) {
	523	indexBuildRequired_ = TRUE;
	524	}
	525	return *this;
	526	}
	527
	528
	529	const UnicodeString &AlphabeticIndex::getOverflowComparisonString(const UnicodeString &lowerLimit, UErrorCode &/status/) {
	530	for (int32_t i=0; i<firstScriptCharacters_->size(); i++) {
	531	const UnicodeString *s =
	532	static_cast<const UnicodeString *>(firstScriptCharacters_->elementAt(i));
	533	if (collator_->compare(*s, lowerLimit) > 0) {
	534	return *s;
	535	}
	536	}
	537	return *EMPTY_STRING;
	538	}
	539
	540	UnicodeSet *AlphabeticIndex::getScriptSet(UnicodeSet &dest, const UnicodeString &codePoint, UErrorCode &status) {
	541	if (U_FAILURE(status)) {
	542	return &dest;
	543	}
	544	UChar32 cp = codePoint.char32At(0);
	545	UScriptCode scriptCode = uscript_getScript(cp, &status);
	546	dest.applyIntPropertyValue(UCHAR_SCRIPT, scriptCode, status);
	547	return &dest;
	548	}
	549
	550	//
	551	// init() - Common code for constructors.
	552	//
	553
	554	void AlphabeticIndex::init(UErrorCode &status) {
	555	// Initialize statics if needed.
	556	AlphabeticIndex::staticInit(status);
	557
	558	// Put the object into a known state so that the destructor will function.
	559
	560	alreadyIn_ = NULL;
	561	bucketList_ = NULL;
	562	collator_ = NULL;
	563	collatorPrimaryOnly_ = NULL;
	564	currentBucket_ = NULL;
	565	firstScriptCharacters_ = NULL;
	566	initialLabels_ = NULL;
	567	indexBuildRequired_ = TRUE;
	568	inputRecords_ = NULL;
	569	itemsIterIndex_ = 0;
	570	labels_ = NULL;
	571	labelsIterIndex_ = 0;
	572	maxLabelCount_ = 99;
	573	noDistinctSorting_ = NULL;
	574	notAlphabetic_ = NULL;
	575	recordCounter_ = 0;
	576
	577	if (U_FAILURE(status)) {
	578	return;
	579	}
	580	alreadyIn_ = uhash_open(uhash_hashUnicodeString, // Key Hash,
	581	uhash_compareUnicodeString, // key Comparator,
	582	NULL, // value Comparator
	583	&status);
	584	uhash_setKeyDeleter(alreadyIn_, uprv_deleteUObject);
	585	uhash_setValueDeleter(alreadyIn_, uprv_deleteUObject);
	586
	587	bucketList_ = new UVector(status);
	588	bucketList_->setDeleter(alphaIndex_deleteBucket);
	589	labels_ = new UVector(status);
	590	labels_->setDeleter(uprv_deleteUObject);
	591	labels_->setComparer(uhash_compareUnicodeString);
	592	inputRecords_ = new UVector(status);
	593	inputRecords_->setDeleter(alphaIndex_deleteRecord);
	594
	595	noDistinctSorting_ = new UnicodeSet();
	596	notAlphabetic_ = new UnicodeSet();
	597	initialLabels_ = new UnicodeSet();
	598
	599	inflowLabel_.remove();
	600	inflowLabel_.append((UChar)0x2026); // Ellipsis
	601	overflowLabel_ = inflowLabel_;
	602	underflowLabel_ = inflowLabel_;
	603
	604	// TODO: check for memory allocation failures.
	605	}
	606
	607
	608	static UBool indexCharactersAreInitialized = FALSE;
	609
	610	// Index Characters Clean up function. Delete statically allocated constant stuff.
	611	U_CDECL_BEGIN
	612	static UBool U_CALLCONV indexCharacters_cleanup(void) {
	613	AlphabeticIndex::staticCleanup();
	614	return TRUE;
	615	}
	616	U_CDECL_END
	617
	618	void AlphabeticIndex::staticCleanup() {
	619	delete ALPHABETIC;
	620	ALPHABETIC = NULL;
	621	delete HANGUL;
	622	HANGUL = NULL;
	623	delete ETHIOPIC;
	624	ETHIOPIC = NULL;
	625	delete CORE_LATIN;
	626	CORE_LATIN = NULL;
	627	delete IGNORE_SCRIPTS;
	628	IGNORE_SCRIPTS = NULL;
	629	delete TO_TRY;
	630	TO_TRY = NULL;
	631	delete UNIHAN;
	632	UNIHAN = NULL;
	633	delete EMPTY_STRING;
	634	EMPTY_STRING = NULL;
	635	nfkdNormalizer = NULL; // ref to a singleton. Do not delete.
	636	indexCharactersAreInitialized = FALSE;
	637	}
	638
	639
	640	UnicodeSet *AlphabeticIndex::ALPHABETIC;
	641	UnicodeSet *AlphabeticIndex::HANGUL;
	642	UnicodeSet *AlphabeticIndex::ETHIOPIC;
	643	UnicodeSet *AlphabeticIndex::CORE_LATIN;
	644	UnicodeSet *AlphabeticIndex::IGNORE_SCRIPTS;
	645	UnicodeSet *AlphabeticIndex::TO_TRY;
	646	UnicodeSet *AlphabeticIndex::UNIHAN;
	647	const UnicodeString *AlphabeticIndex::EMPTY_STRING;
	648
	649	//
	650	// staticInit() One-time initialization of constants.
	651	// Thread safe. Called from constructors.
	652	// Mutex overhead is not a concern. AlphabeticIndex constructors are
	653	// sufficiently heavy that the cost of the mutex check is not significant.
	654
	655	void AlphabeticIndex::staticInit(UErrorCode &status) {
	656	static UMTX IndexCharsInitMutex;
	657
	658	Mutex mutex(&IndexCharsInitMutex);
	659	if (indexCharactersAreInitialized \|\| U_FAILURE(status)) {
	660	return;
	661	}
	662	UBool finishedInit = FALSE;
	663
	664	{
	665	UnicodeString alphaString = UNICODE_STRING_SIMPLE("[[:alphabetic:]-[:mark:]]");
	666	ALPHABETIC = new UnicodeSet(alphaString, status);
	667	if (ALPHABETIC == NULL) {
	668	goto err;
	669	}
	670
	671	HANGUL = new UnicodeSet();
	672	HANGUL->add(0xAC00).add(0xB098).add(0xB2E4).add(0xB77C).add(0xB9C8).add(0xBC14).add(0xC0AC).
	673	add(0xC544).add(0xC790).add(0xCC28).add(0xCE74).add(0xD0C0).add(0xD30C).add(0xD558);
	674	if (HANGUL== NULL) {
	675	goto err;
	676	}
	677
	678
	679	UnicodeString EthiopicStr = UNICODE_STRING_SIMPLE("[[:Block=Ethiopic:]&[:Script=Ethiopic:]]");
	680	ETHIOPIC = new UnicodeSet(EthiopicStr, status);
	681	if (ETHIOPIC == NULL) {
	682	goto err;
	683	}
	684
	685	CORE_LATIN = new UnicodeSet((UChar32)0x61, (UChar32)0x7a); // ('a', 'z');
	686	if (CORE_LATIN == NULL) {
	687	goto err;
	688	}
	689
	690	UnicodeString IgnoreStr= UNICODE_STRING_SIMPLE(
	691	"[[:sc=Common:][:sc=inherited:][:script=Unknown:][:script=braille:]]");
	692	IGNORE_SCRIPTS = new UnicodeSet(IgnoreStr, status);
	693	IGNORE_SCRIPTS->freeze();
	694	if (IGNORE_SCRIPTS == NULL) {
	695	goto err;
	696	}
	697
	698	UnicodeString nfcqcStr = UNICODE_STRING_SIMPLE("[:^nfcqc=no:]");
	699	TO_TRY = new UnicodeSet(nfcqcStr, status);
	700	if (TO_TRY == NULL) {
	701	goto err;
	702	}
	703
	704	UnicodeString unihanStr = UNICODE_STRING_SIMPLE("[:script=Hani:]");
	705	UNIHAN = new UnicodeSet(unihanStr, status);
	706	if (UNIHAN == NULL) {
	707	goto err;
	708	}
	709
	710	EMPTY_STRING = new UnicodeString();
	711
	712	nfkdNormalizer = Normalizer2::getNFKDInstance(status);
	713	if (nfkdNormalizer == NULL) {
	714	goto err;
	715	}
	716	}
	717	finishedInit = TRUE;
	718
	719	err:
	720	if (!finishedInit && U_SUCCESS(status)) {
	721	status = U_MEMORY_ALLOCATION_ERROR;
	722	}
	723	if (U_FAILURE(status)) {
	724	indexCharacters_cleanup();
	725	return;
	726	}
	727	ucln_i18n_registerCleanup(UCLN_I18N_INDEX_CHARACTERS, indexCharacters_cleanup);
	728	indexCharactersAreInitialized = TRUE;
	729	}
	730
	731
	732	//
	733	// Comparison function for UVector<UnicodeString *> sorting with a collator.
	734	//
	735	static int32_t U_CALLCONV
	736	sortCollateComparator(const void context, const void left, const void *right) {
	737	const UElement leftElement = static_cast<const UElement >(left);
	738	const UElement rightElement = static_cast<const UElement >(right);
	739	const UnicodeString leftString = static_cast<const UnicodeString >(leftElement->pointer);
	740	const UnicodeString rightString = static_cast<const UnicodeString >(rightElement->pointer);
	741	const Collator col = static_cast<const Collator >(context);
	742
	743	if (leftString == rightString) {
	744	// Catches case where both are NULL
	745	return 0;
	746	}
	747	if (leftString == NULL) {
	748	return 1;
	749	};
	750	if (rightString == NULL) {
	751	return -1;
	752	}
	753	Collator::EComparisonResult r = col->compare(leftString, rightString);
	754	return (int32_t) r;
	755	}
	756
	757	//
	758	// Comparison function for UVector<Record *> sorting with a collator.
	759	//
	760	static int32_t U_CALLCONV
	761	recordCompareFn(const void context, const void left, const void *right) {
	762	const UElement leftElement = static_cast<const UElement >(left);
	763	const UElement rightElement = static_cast<const UElement >(right);
	764	const AlphabeticIndex::Record leftRec = static_cast<const AlphabeticIndex::Record >(leftElement->pointer);
	765	const AlphabeticIndex::Record rightRec = static_cast<const AlphabeticIndex::Record >(rightElement->pointer);
	766	const Collator col = static_cast<const Collator >(context);
	767
	768	Collator::EComparisonResult r = col->compare(leftRec->sortingName_, rightRec->sortingName_);
	769	if (r == Collator::EQUAL) {
	770	if (leftRec->serialNumber_ < rightRec->serialNumber_) {
	771	r = Collator::LESS;
	772	} else if (leftRec->serialNumber_ > rightRec->serialNumber_) {
	773	r = Collator::GREATER;
	774	}
	775	}
	776	return (int32_t) r;
	777	}
	778
	779
	#if 0
	//
	//  First characters in scripts.
	//  Create a UVector whose contents are pointers to UnicodeStrings for the First Characters in each script.
	//  The vector is sorted according to this index's collation.
	//
	//  This code is too slow to use, so for now hard code the data.
	//    The hard coded implementation follows below.
	//
	//  NOTE(review): this block is compiled out and has not been built; it is
	//  kept for reference only.
	//
	UVector *AlphabeticIndex::firstStringsInScript(Collator *ruleBasedCollator, UErrorCode &status) {

	    if (U_FAILURE(status)) {
	        return NULL;
	    }

	    UnicodeString results[USCRIPT_CODE_LIMIT];
	    UnicodeString LOWER_A = UNICODE_STRING_SIMPLE("a");

	    UnicodeSetIterator siter(*TO_TRY);
	    while (siter.next()) {
	        const UnicodeString &current = siter.getString();
	        Collator::EComparisonResult r = ruleBasedCollator->compare(current, LOWER_A);
	        if (r < 0) {  // TODO fix; we only want "real" script characters, not
	                      // symbols.
	            continue;
	        }

	        int script = uscript_getScript(current.char32At(0), &status);
	        if (results[script].length() == 0) {
	            results[script] = current;
	        }
	        else if (ruleBasedCollator->compare(current, results[script]) < 0) {
	            results[script] = current;
	        }
	    }

	    UnicodeSet extras;
	    UnicodeSet expansions;
	    RuleBasedCollator *rbc = dynamic_cast<RuleBasedCollator *>(ruleBasedCollator);
	    const UCollator *uRuleBasedCollator = rbc->getUCollator();
	    ucol_getContractionsAndExpansions(uRuleBasedCollator, extras.toUSet(), expansions.toUSet(), true, &status);
	    extras.addAll(expansions).removeAll(*TO_TRY);
	    if (extras.size() != 0) {
	        const Normalizer2 *normalizer = Normalizer2::getNFKCInstance(status);
	        UnicodeSetIterator extrasIter(extras);
	        while (extrasIter.next()) {
	            // The loop condition already advanced the iterator; fetch the
	            // current string instead of calling next() a second time.
	            const UnicodeString &current = extrasIter.getString();
	            if (!TO_TRY->containsAll(current))
	                continue;
	            if (!normalizer->isNormalized(current, status) ||
	                ruleBasedCollator->compare(current, LOWER_A) < 0) {
	                continue;
	            }
	            int script = uscript_getScript(current.char32At(0), &status);
	            if (results[script].length() == 0) {
	                results[script] = current;
	            } else if (ruleBasedCollator->compare(current, results[script]) < 0) {
	                results[script] = current;
	            }
	        }
	    }

	    UVector *dest = new UVector(status);
	    dest->setDeleter(uprv_deleteUObject);
	    for (uint32_t i = 0; i < sizeof(results) / sizeof(results[0]); ++i) {
	        if (results[i].length() > 0) {
	            dest->addElement(results[i].clone(), status);
	        }
	    }
	    dest->sortWithUComparator(sortCollateComparator, ruleBasedCollator, status);
	    return dest;
	}
	#endif
	853
	854
	855	//
	856	// First characters in scripts.
	857	// Create a UVector whose contents are pointers to UnicodeStrings for the First Characters in each script.
	858	// The vector is sorted according to this index's collation.
	859	//
	860	// It takes too much time to compute this from character properties, so hard code it for now.
	861	// Character constants copied from corresponding declaration in ICU4J.
	862	// See main/classes/collate/src/com/ibm/icu/text/AlphabeticIndex.java
	863
	864	static UChar HACK_FIRST_CHARS_IN_SCRIPTS[] = { 0x61, 0, 0x03B1, 0,
	865	0x2C81, 0, 0x0430, 0, 0x2C30, 0, 0x10D0, 0, 0x0561, 0, 0x05D0, 0, 0xD802, 0xDD00, 0, 0x0800, 0, 0x0621, 0, 0x0710, 0,
	866	0x0780, 0, 0x07CA, 0, 0x2D30, 0, 0x1200, 0, 0x0950, 0, 0x0985, 0, 0x0A74, 0, 0x0AD0, 0, 0x0B05, 0, 0x0BD0, 0,
	867	0x0C05, 0, 0x0C85, 0, 0x0D05, 0, 0x0D85, 0,
	868	0xAAF2, 0, // Meetei Mayek
	869	0xA800, 0, 0xA882, 0, 0xD804, 0xDC83, 0,
	870	U16_LEAD(0x111C4), U16_TRAIL(0x111C4), 0, // Sharada
	871	U16_LEAD(0x11680), U16_TRAIL(0x11680), 0, // Takri
	872	0x1B83, 0,
	873	0xD802, 0xDE00, 0, 0x0E01, 0,
	874	0x0EDE, 0, // Lao
	875	0xAA80, 0, 0x0F40, 0, 0x1C00, 0, 0xA840, 0, 0x1900, 0, 0x1700, 0, 0x1720, 0,
	876	0x1740, 0, 0x1760, 0, 0x1A00, 0, 0xA930, 0, 0xA90A, 0, 0x1000, 0,
	877	U16_LEAD(0x11103), U16_TRAIL(0x11103), 0, // Chakma
	878	0x1780, 0, 0x1950, 0, 0x1980, 0, 0x1A20, 0,
	879	0xAA00, 0, 0x1B05, 0, 0xA984, 0, 0x1880, 0, 0x1C5A, 0, 0x13A0, 0, 0x1401, 0, 0x1681, 0, 0x16A0, 0, 0xD803, 0xDC00, 0,
	880	0xA500, 0, 0xA6A0, 0, 0x1100, 0, 0x3041, 0, 0x30A1, 0, 0x3105, 0, 0xA000, 0, 0xA4F8, 0,
	881	U16_LEAD(0x16F00), U16_TRAIL(0x16F00), 0, // Miao
	882	0xD800, 0xDE80, 0,
	883	0xD800, 0xDEA0, 0, 0xD802, 0xDD20, 0, 0xD800, 0xDF00, 0, 0xD800, 0xDF30, 0, 0xD801, 0xDC28, 0, 0xD801, 0xDC50, 0,
	884	0xD801, 0xDC80, 0,
	885	U16_LEAD(0x110D0), U16_TRAIL(0x110D0), 0, // Sora Sompeng
	886	0xD800, 0xDC00, 0, 0xD802, 0xDC00, 0, 0xD802, 0xDE60, 0, 0xD802, 0xDF00, 0, 0xD802, 0xDC40, 0,
	887	0xD802, 0xDF40, 0, 0xD802, 0xDF60, 0, 0xD800, 0xDF80, 0, 0xD800, 0xDFA0, 0, 0xD808, 0xDC00, 0, 0xD80C, 0xDC00, 0,
	888	U16_LEAD(0x109A0), U16_TRAIL(0x109A0), 0, // Meroitic Cursive
	889	U16_LEAD(0x10980), U16_TRAIL(0x10980), 0, // Meroitic Hieroglyphs
	890	0x4E00, 0 };
	891
	892	UVector *AlphabeticIndex::firstStringsInScript(UErrorCode &status) {
	893	if (U_FAILURE(status)) {
	894	return NULL;
	895	}
	896	UVector *dest = new UVector(status);
	897	if (dest == NULL) {
	898	if (U_SUCCESS(status)) {
	899	status = U_MEMORY_ALLOCATION_ERROR;
	900	}
	901	return NULL;
	902	}
	903	dest->setDeleter(uprv_deleteUObject);
	904	const UChar *src = HACK_FIRST_CHARS_IN_SCRIPTS;
	905	const UChar *limit = src + sizeof(HACK_FIRST_CHARS_IN_SCRIPTS) / sizeof(HACK_FIRST_CHARS_IN_SCRIPTS[0]);
	906	do {
	907	if (U_FAILURE(status)) {
	908	return dest;
	909	}
	910	UnicodeString *str = new UnicodeString(src, -1);
	911	if (str == NULL) {
	912	status = U_MEMORY_ALLOCATION_ERROR;
	913	} else {
	914	dest->addElement(str, status);
	915	src += str->length() + 1;
	916	}
	917	} while (src < limit);
	918	dest->sortWithUComparator(sortCollateComparator, collator_, status);
	919	return dest;
	920	}
	921
	922
	923	AlphabeticIndex::ELangType AlphabeticIndex::langTypeFromLocale(const Locale &loc) {
	924	const char *lang = loc.getLanguage();
	925	if (uprv_strcmp(lang, "zh") != 0) {
	926	return kNormal;
	927	}
	928	const char *script = loc.getScript();
	929	if (uprv_strcmp(script, "Hant") == 0) {
	930	return kTraditional;
	931	}
	932	const char *country = loc.getCountry();
	933	if (uprv_strcmp(country, "TW") == 0) {
	934	return kTraditional;
	935	}
	936	return kSimplified;
	937	}
	938
	939
	940	//
	941	// Pinyin Hacks. Direct port from Java.
	942	//
	943
	944	static const UChar32 probeCharInLong = 0x28EAD;
	945
	946
	947	static const UChar PINYIN_LOWER_BOUNDS_SHORT[] = { // "\u0101bcd\u0113fghjkl\u1E3F\u0144\u014Dpqrstwxyz"
	948	0x0101, 0x62, 0x63, 0x64, 0x0113, 0x66, 0x67, 0x68, 0x6A, 0x6B, /l/0x6C, 0x1E3F, 0x0144, 0x014D,
	949	/p/0x70, 0x71, 0x72, 0x73, 0x74, /w/0x77, 0x78, 0x79, 0x7A};
	950
	951
	952	// Pinyin lookup tables copied, pasted (and reformatted) from the ICU4J code.
	953
	954	AlphabeticIndex::PinyinLookup AlphabeticIndex::HACK_PINYIN_LOOKUP_SHORT = {
	955	{(UChar)0, (UChar)0, (UChar)0}, // A
	956	{(UChar)0x516B, (UChar)0, (UChar)0}, // B
	957	{(UChar)0x5693, (UChar)0, (UChar)0}, // C
	958	{(UChar)0x5491, (UChar)0, (UChar)0}, // D
	959	{(UChar)0x59B8, (UChar)0, (UChar)0}, // E
	960	{(UChar)0x53D1, (UChar)0, (UChar)0}, // F
	961	{(UChar)0x65EE, (UChar)0, (UChar)0}, // G
	962	{(UChar)0x54C8, (UChar)0, (UChar)0}, // H
	963	{(UChar)0x4E0C, (UChar)0, (UChar)0}, // J
	964	{(UChar)0x5494, (UChar)0, (UChar)0}, // K
	965	{(UChar)0x5783, (UChar)0, (UChar)0}, // L
	966	{(UChar)0x5452, (UChar)0, (UChar)0}, // M
	967	{(UChar)0x5514, (UChar)0, (UChar)0}, // N
	968	{(UChar)0x5594, (UChar)0, (UChar)0}, // O
	969	{(UChar)0x5991, (UChar)0, (UChar)0}, // P
	970	{(UChar)0x4E03, (UChar)0, (UChar)0}, // Q
	971	{(UChar)0x513F, (UChar)0, (UChar)0}, // R
	972	{(UChar)0x4EE8, (UChar)0, (UChar)0}, // S
	973	{(UChar)0x4ED6, (UChar)0, (UChar)0}, // T
	974	{(UChar)0x7A75, (UChar)0, (UChar)0}, // W
	975	{(UChar)0x5915, (UChar)0, (UChar)0}, // X
	976	{(UChar)0x4E2B, (UChar)0, (UChar)0}, // Y
	977	{(UChar)0x5E00, (UChar)0, (UChar)0}, // Z
	978	{(UChar)0xFFFF, (UChar)0, (UChar)0}, // mark end of array
	979	};
	980
	981	static const UChar PINYIN_LOWER_BOUNDS_LONG[] = { // "\u0101bcd\u0113fghjkl\u1E3F\u0144\u014Dpqrstwxyz";
	982	0x0101, 0x62, 0x63, 0x64, 0x0113, 0x66, 0x67, 0x68, 0x6A, 0x6B, /l/0x6C, 0x1E3F, 0x0144, 0x014D,
	983	/p/0x70, 0x71, 0x72, 0x73, 0x74, /w/0x77, 0x78, 0x79, 0x7A};
	984
	985	AlphabeticIndex::PinyinLookup AlphabeticIndex::HACK_PINYIN_LOOKUP_LONG = {
	986	{(UChar)0, (UChar)0, (UChar)0}, // A
	987	{(UChar)0x516B, (UChar)0, (UChar)0}, // b
	988	{(UChar)0xD863, (UChar)0xDEAD, (UChar)0}, // c
	989	{(UChar)0xD844, (UChar)0xDE51, (UChar)0}, // d
	990	{(UChar)0x59B8, (UChar)0, (UChar)0}, // e
	991	{(UChar)0x53D1, (UChar)0, (UChar)0}, // f
	992	{(UChar)0xD844, (UChar)0xDE45, (UChar)0}, // g
	993	{(UChar)0x54C8, (UChar)0, (UChar)0}, // h
	994	{(UChar)0x4E0C, (UChar)0, (UChar)0}, // j
	995	{(UChar)0x5494, (UChar)0, (UChar)0}, // k
	996	{(UChar)0x3547, (UChar)0, (UChar)0}, // l
	997	{(UChar)0x5452, (UChar)0, (UChar)0}, // m
	998	{(UChar)0x5514, (UChar)0, (UChar)0}, // n
	999	{(UChar)0x5594, (UChar)0, (UChar)0}, // o
	1000	{(UChar)0xD84F, (UChar)0xDC7A, (UChar)0}, // p
	1001	{(UChar)0x4E03, (UChar)0, (UChar)0}, // q
	1002	{(UChar)0x513F, (UChar)0, (UChar)0}, // r
	1003	{(UChar)0x4EE8, (UChar)0, (UChar)0}, // s
	1004	{(UChar)0x4ED6, (UChar)0, (UChar)0}, // t
	1005	{(UChar)0x7A75, (UChar)0, (UChar)0}, // w
	1006	{(UChar)0x5915, (UChar)0, (UChar)0}, // x
	1007	{(UChar)0x4E2B, (UChar)0, (UChar)0}, // y
	1008	{(UChar)0x5E00, (UChar)0, (UChar)0}, // z
	1009	{(UChar)0xFFFF, (UChar)0, (UChar)0}, // mark end of array
	1010	};
	1011
	1012
	1013	//
	1014	// Probe the collation data, and decide which Pinyin tables should be used
	1015	//
	1016	// ICU can be built with a choice between two Chinese collations.
	1017	// The hack Pinyin tables to use depend on which one is in use.
	1018	// We can assume that any given copy of ICU will have only one of the collations available,
	1019	// and that there is no way, in a given process, to create two alphabetic indexes using
	1020	// different Chinese collations. Which means the probe can be done once
	1021	// and the results cached.
	1022	//
	1023	// This whole arrangement is temporary.
	1024	//
	1025	AlphabeticIndex::PinyinLookup *AlphabeticIndex::HACK_PINYIN_LOOKUP = NULL;
	1026	const UChar *AlphabeticIndex::PINYIN_LOWER_BOUNDS = NULL;
	1027
	1028	void AlphabeticIndex::initPinyinBounds(const Collator *col, UErrorCode &status) {
	1029	{
	1030	Mutex m;
	1031	if (PINYIN_LOWER_BOUNDS != NULL) {
	1032	return;
	1033	}
	1034	}
	1035	UnicodeSet *colSet = col->getTailoredSet(status);
	1036	if (U_FAILURE(status) \|\| colSet == NULL) {
	1037	delete colSet;
	1038	if (U_SUCCESS(status)) {
	1039	status = U_MEMORY_ALLOCATION_ERROR;
	1040	}
	1041	return;
	1042	}
	1043	UBool useLongTables = colSet->contains(probeCharInLong);
	1044	delete colSet;
	1045	{
	1046	Mutex m;
	1047	if (useLongTables) {
	1048	PINYIN_LOWER_BOUNDS = PINYIN_LOWER_BOUNDS_LONG;
	1049	HACK_PINYIN_LOOKUP = &HACK_PINYIN_LOOKUP_LONG;
	1050	} else {
	1051	PINYIN_LOWER_BOUNDS = PINYIN_LOWER_BOUNDS_SHORT;
	1052	HACK_PINYIN_LOOKUP = &HACK_PINYIN_LOOKUP_SHORT;
	1053	}
	1054	}
	1055	}
	1056
	1057	// Pinyin Hack:
	1058	// Modify a Chinese name by prepending a Latin letter. The modified name is used
	1059	// when putting records (names) into buckets, to put the name under a Latin index heading.
	1060
	1061	void AlphabeticIndex::hackName(UnicodeString &dest, const UnicodeString &name, const Collator *col) {
	1062
	1063	if (langType_ != kSimplified \|\| !UNIHAN->contains(name.char32At(0))) {
	1064	dest = name;
	1065	return;
	1066	}
	1067
	1068	UErrorCode status = U_ZERO_ERROR;
	1069	initPinyinBounds(col, status);
	1070	if (U_FAILURE(status)) {
	1071	dest = name;
	1072	return;
	1073	}
	1074	// TODO: use binary search
	1075	int index;
	1076	for (index=0; ; index++) {
	1077	if ((*HACK_PINYIN_LOOKUP)[index][0] == (UChar)0xffff) {
	1078	index--;
	1079	break;
	1080	}
	1081	int32_t compareResult = col->compare(name, UnicodeString(TRUE, (*HACK_PINYIN_LOOKUP)[index], -1));
	1082	if (compareResult < 0) {
	1083	index--;
	1084	}
	1085	if (compareResult <= 0) {
	1086	break;
	1087	}
	1088	}
	1089	UChar c = PINYIN_LOWER_BOUNDS[index];
	1090	dest.setTo(c);
	1091	dest.append(name);
	1092	return;
	1093	}
	1094
	1095
	1096
	1097	/**
	1098	* Comparator that returns "better" items first, where shorter NFKD is better, and otherwise NFKD binary order is
	1099	* better, and otherwise binary order is better.
	1100	*
	1101	* For use with array sort or UVector.
	1102	* @param context A UErrorCode pointer.
	1103	* @param left A UElement pointer, which must refer to a UnicodeString *
	1104	* @param right A UElement pointer, which must refer to a UnicodeString *
	1105	*/
	1106
	1107	static int32_t U_CALLCONV
	1108	PreferenceComparator(const void context, const void left, const void *right) {
	1109	const UElement leftElement = static_cast<const UElement >(left);
	1110	const UElement rightElement = static_cast<const UElement >(right);
	1111	const UnicodeString s1 = static_cast<const UnicodeString >(leftElement->pointer);
	1112	const UnicodeString s2 = static_cast<const UnicodeString >(rightElement->pointer);
	1113	UErrorCode &status = (UErrorCode )(context); // Cast off both static and const.
	1114	if (s1 == s2) {
	1115	return 0;
	1116	}
	1117
	1118	UnicodeString n1 = nfkdNormalizer->normalize(*s1, status);
	1119	UnicodeString n2 = nfkdNormalizer->normalize(*s2, status);
	1120	int32_t result = n1.length() - n2.length();
	1121	if (result != 0) {
	1122	return result;
	1123	}
	1124
	1125	result = n1.compareCodePointOrder(n2);
	1126	if (result != 0) {
	1127	return result;
	1128	}
	1129	return s1->compareCodePointOrder(*s2);
	1130	}
	1131
	1132
	1133	//
	1134	// Constructor & Destructor for AlphabeticIndex::Record
	1135	//
	1136	// Records are internal only, instances are not directly surfaced in the public API.
	1137	// This class is mostly struct-like, with all public fields.
	1138
	1139	AlphabeticIndex::Record::Record(AlphabeticIndex alphaIndex, const UnicodeString &name, const void data):
	1140	alphaIndex_(alphaIndex), name_(name), data_(data)
	1141	{
	1142	UnicodeString prefixedName;
	1143	alphaIndex->hackName(sortingName_, name_, alphaIndex->collatorPrimaryOnly_);
	1144	serialNumber_ = ++alphaIndex->recordCounter_;
	1145	}
	1146
	1147	AlphabeticIndex::Record::~Record() {
	1148	}
	1149
	1150
	1151	AlphabeticIndex & AlphabeticIndex::addRecord(const UnicodeString &name, const void *data, UErrorCode &status) {
	1152	if (U_FAILURE(status)) {
	1153	return *this;
	1154	}
	1155	Record *r = new Record(this, name, data);
	1156	inputRecords_->addElement(r, status);
	1157	indexBuildRequired_ = TRUE;
	1158	//std::string ss;
	1159	//std::string ss2;
	1160	//std::cout << "added record: name = \"" << r->name_.toUTF8String(ss) << "\"" <<
	1161	// " sortingName = \"" << r->sortingName_.toUTF8String(ss2) << "\"" << std::endl;
	1162	return *this;
	1163	}
	1164
	1165
	1166	AlphabeticIndex &AlphabeticIndex::clearRecords(UErrorCode &status) {
	1167	if (U_FAILURE(status)) {
	1168	return *this;
	1169	}
	1170	inputRecords_->removeAllElements();
	1171	indexBuildRequired_ = TRUE;
	1172	return *this;
	1173	}
	1174
	1175
	1176	int32_t AlphabeticIndex::getBucketIndex(const UnicodeString &name, UErrorCode &status) {
	1177	buildIndex(status);
	1178	if (U_FAILURE(status)) {
	1179	return 0;
	1180	}
	1181
	1182	// For simplified Chinese prepend a prefix to the name.
	1183	// For non-Chinese locales or non-Chinese names, the name is not modified.
	1184
	1185	UnicodeString prefixedName;
	1186	hackName(prefixedName, name, collatorPrimaryOnly_);
	1187
	1188	// TODO: use a binary search.
	1189	for (int32_t i = 0; i < bucketList_->size(); ++i) {
	1190	Bucket bucket = static_cast<Bucket >(bucketList_->elementAt(i));
	1191	Collator::EComparisonResult comp = collatorPrimaryOnly_->compare(prefixedName, bucket->lowerBoundary_);
	1192	if (comp < 0) {
	1193	return i - 1;
	1194	}
	1195	}
	1196	// Loop runs until we find the bucket following the one that would hold prefixedName.
	1197	// If the prefixedName belongs in the last bucket the loop will drop out the bottom rather
	1198	// than returning from the middle.
	1199
	1200	return bucketList_->size() - 1;
	1201	}
	1202
	1203
	1204	int32_t AlphabeticIndex::getBucketIndex() const {
	1205	return labelsIterIndex_;
	1206	}
	1207
	1208
	1209	UBool AlphabeticIndex::nextBucket(UErrorCode &status) {
	1210	if (U_FAILURE(status)) {
	1211	return FALSE;
	1212	}
	1213	if (indexBuildRequired_ && currentBucket_ != NULL) {
	1214	status = U_ENUM_OUT_OF_SYNC_ERROR;
	1215	return FALSE;
	1216	}
	1217	buildIndex(status);
	1218	if (U_FAILURE(status)) {
	1219	return FALSE;
	1220	}
	1221	++labelsIterIndex_;
	1222	if (labelsIterIndex_ >= bucketList_->size()) {
	1223	labelsIterIndex_ = bucketList_->size();
	1224	return FALSE;
	1225	}
	1226	currentBucket_ = static_cast<Bucket *>(bucketList_->elementAt(labelsIterIndex_));
	1227	resetRecordIterator();
	1228	return TRUE;
	1229	}
	1230
	1231	const UnicodeString &AlphabeticIndex::getBucketLabel() const {
	1232	if (currentBucket_ != NULL) {
	1233	return currentBucket_->label_;
	1234	} else {
	1235	return *EMPTY_STRING;
	1236	}
	1237	}
	1238
	1239
	1240	UAlphabeticIndexLabelType AlphabeticIndex::getBucketLabelType() const {
	1241	if (currentBucket_ != NULL) {
	1242	return currentBucket_->labelType_;
	1243	} else {
	1244	return U_ALPHAINDEX_NORMAL;
	1245	}
	1246	}
	1247
	1248
	1249	int32_t AlphabeticIndex::getBucketRecordCount() const {
	1250	if (currentBucket_ != NULL) {
	1251	return currentBucket_->records_->size();
	1252	} else {
	1253	return 0;
	1254	}
	1255	}
	1256
	1257
	1258	AlphabeticIndex &AlphabeticIndex::resetBucketIterator(UErrorCode &status) {
	1259	if (U_FAILURE(status)) {
	1260	return *this;
	1261	}
	1262	buildIndex(status);
	1263	labelsIterIndex_ = -1;
	1264	currentBucket_ = NULL;
	1265	return *this;
	1266	}
	1267
	1268
	1269	UBool AlphabeticIndex::nextRecord(UErrorCode &status) {
	1270	if (U_FAILURE(status)) {
	1271	return FALSE;
	1272	}
	1273	if (currentBucket_ == NULL) {
	1274	// We are trying to iterate over the items in a bucket, but there is no
	1275	// current bucket from the enumeration of buckets.
	1276	status = U_INVALID_STATE_ERROR;
	1277	return FALSE;
	1278	}
	1279	if (indexBuildRequired_) {
	1280	status = U_ENUM_OUT_OF_SYNC_ERROR;
	1281	return FALSE;
	1282	}
	1283	++itemsIterIndex_;
	1284	if (itemsIterIndex_ >= currentBucket_->records_->size()) {
	1285	itemsIterIndex_ = currentBucket_->records_->size();
	1286	return FALSE;
	1287	}
	1288	return TRUE;
	1289	}
	1290
	1291
	1292	const UnicodeString &AlphabeticIndex::getRecordName() const {
	1293	const UnicodeString *retStr = EMPTY_STRING;
	1294	if (currentBucket_ != NULL &&
	1295	itemsIterIndex_ >= 0 &&
	1296	itemsIterIndex_ < currentBucket_->records_->size()) {
	1297	Record item = static_cast<Record >(currentBucket_->records_->elementAt(itemsIterIndex_));
	1298	retStr = &item->name_;
	1299	}
	1300	return *retStr;
	1301	}
	1302
	1303	const void *AlphabeticIndex::getRecordData() const {
	1304	const void *retPtr = NULL;
	1305	if (currentBucket_ != NULL &&
	1306	itemsIterIndex_ >= 0 &&
	1307	itemsIterIndex_ < currentBucket_->records_->size()) {
	1308	Record item = static_cast<Record >(currentBucket_->records_->elementAt(itemsIterIndex_));
	1309	retPtr = item->data_;
	1310	}
	1311	return retPtr;
	1312	}
	1313
	1314
	1315	AlphabeticIndex & AlphabeticIndex::resetRecordIterator() {
	1316	itemsIterIndex_ = -1;
	1317	return *this;
	1318	}
	1319
	1320
	1321
	1322	AlphabeticIndex::Bucket::Bucket(const UnicodeString &label,
	1323	const UnicodeString &lowerBoundary,
	1324	UAlphabeticIndexLabelType type,
	1325	UErrorCode &status):
	1326	label_(label), lowerBoundary_(lowerBoundary), labelType_(type), records_(NULL) {
	1327	if (U_FAILURE(status)) {
	1328	return;
	1329	}
	1330	records_ = new UVector(status);
	1331	if (records_ == NULL && U_SUCCESS(status)) {
	1332	status = U_MEMORY_ALLOCATION_ERROR;
	1333	}
	1334	}
	1335
	1336
	1337	AlphabeticIndex::Bucket::~Bucket() {
	1338	delete records_;
	1339	}
	1340
	1341	U_NAMESPACE_END
	1342
	1343	#endif