]>
Commit | Line | Data |
---|---|---|
1 | /* | |
2 | ******************************************************************************* | |
3 | * Copyright (C) 2009-2012, International Business Machines Corporation and * | |
4 | * others. All Rights Reserved. * | |
5 | ******************************************************************************* | |
6 | */ | |
7 | ||
8 | /** | |
9 | * \file | |
10 | * \brief C API: AlphabeticIndex class | |
11 | */ | |
12 | ||
13 | #include "unicode/utypes.h" | |
14 | ||
15 | #if !UCONFIG_NO_COLLATION && !UCONFIG_NO_NORMALIZATION | |
16 | ||
17 | #include "unicode/alphaindex.h" | |
18 | #include "unicode/coll.h" | |
19 | #include "unicode/normalizer2.h" | |
20 | #include "unicode/strenum.h" | |
21 | #include "unicode/tblcoll.h" | |
22 | #include "unicode/ulocdata.h" | |
23 | #include "unicode/uniset.h" | |
24 | #include "unicode/uobject.h" | |
25 | #include "unicode/uscript.h" | |
26 | #include "unicode/usetiter.h" | |
27 | #include "unicode/ustring.h" | |
28 | #include "unicode/utf16.h" | |
29 | ||
30 | #include "cstring.h" | |
31 | #include "mutex.h" | |
32 | #include "uassert.h" | |
33 | #include "ucln_in.h" | |
34 | #include "uhash.h" | |
35 | #include "uvector.h" | |
36 | ||
37 | //#include <string> | |
38 | //#include <iostream> | |
39 | U_NAMESPACE_BEGIN | |
40 | ||
41 | UOBJECT_DEFINE_NO_RTTI_IMPLEMENTATION(AlphabeticIndex) | |
42 | ||
43 | // Forward Declarations | |
44 | static int32_t U_CALLCONV | |
45 | PreferenceComparator(const void *context, const void *left, const void *right); | |
46 | ||
47 | static int32_t U_CALLCONV | |
48 | sortCollateComparator(const void *context, const void *left, const void *right); | |
49 | ||
50 | static int32_t U_CALLCONV | |
51 | recordCompareFn(const void *context, const void *left, const void *right); | |
52 | ||
53 | // UVector<Bucket *> support function, delete a Bucket. | |
54 | static void U_CALLCONV | |
55 | alphaIndex_deleteBucket(void *obj) { | |
56 | delete static_cast<AlphabeticIndex::Bucket *>(obj); | |
57 | } | |
58 | ||
59 | // UVector<Record *> support function, delete a Record. | |
60 | static void U_CALLCONV | |
61 | alphaIndex_deleteRecord(void *obj) { | |
62 | delete static_cast<AlphabeticIndex::Record *>(obj); | |
63 | } | |
64 | ||
65 | ||
66 | ||
67 | static const Normalizer2 *nfkdNormalizer; | |
68 | ||
69 | // | |
70 | // Append the contents of a UnicodeSet to a UVector of UnicodeStrings. | |
71 | // Append everything - individual characters are handled as strings of length 1. | |
72 | // The destination vector owns the appended strings. | |
73 | ||
74 | static void appendUnicodeSetToUVector(UVector &dest, const UnicodeSet &source, UErrorCode &status) { | |
75 | UnicodeSetIterator setIter(source); | |
76 | while (setIter.next()) { | |
77 | const UnicodeString &str = setIter.getString(); | |
78 | dest.addElement(str.clone(), status); | |
79 | } | |
80 | } | |
81 | ||
82 | ||
83 | AlphabeticIndex::AlphabeticIndex(const Locale &locale, UErrorCode &status) { | |
84 | init(status); | |
85 | if (U_FAILURE(status)) { | |
86 | return; | |
87 | } | |
88 | locale_ = locale; | |
89 | langType_ = langTypeFromLocale(locale_); | |
90 | ||
91 | collator_ = Collator::createInstance(locale, status); | |
92 | if (collator_ != NULL) { | |
93 | collatorPrimaryOnly_ = collator_->clone(); | |
94 | } | |
95 | if (collatorPrimaryOnly_ != NULL) { | |
96 | collatorPrimaryOnly_->setStrength(Collator::PRIMARY); | |
97 | } | |
98 | getIndexExemplars(*initialLabels_, locale, status); | |
99 | indexBuildRequired_ = TRUE; | |
100 | if ((collator_ == NULL || collatorPrimaryOnly_ == NULL) && U_SUCCESS(status)) { | |
101 | status = U_MEMORY_ALLOCATION_ERROR; | |
102 | } | |
103 | firstScriptCharacters_ = firstStringsInScript(status); | |
104 | } | |
105 | ||
106 | ||
107 | AlphabeticIndex::~AlphabeticIndex() { | |
108 | uhash_close(alreadyIn_); | |
109 | delete bucketList_; | |
110 | delete collator_; | |
111 | delete collatorPrimaryOnly_; | |
112 | delete firstScriptCharacters_; | |
113 | delete labels_; | |
114 | delete inputRecords_; | |
115 | delete noDistinctSorting_; | |
116 | delete notAlphabetic_; | |
117 | delete initialLabels_; | |
118 | } | |
119 | ||
120 | ||
121 | AlphabeticIndex &AlphabeticIndex::addLabels(const UnicodeSet &additions, UErrorCode &status) { | |
122 | if (U_FAILURE(status)) { | |
123 | return *this; | |
124 | } | |
125 | initialLabels_->addAll(additions); | |
126 | return *this; | |
127 | } | |
128 | ||
129 | ||
130 | AlphabeticIndex &AlphabeticIndex::addLabels(const Locale &locale, UErrorCode &status) { | |
131 | if (U_FAILURE(status)) { | |
132 | return *this; | |
133 | } | |
134 | UnicodeSet additions; | |
135 | getIndexExemplars(additions, locale, status); | |
136 | initialLabels_->addAll(additions); | |
137 | return *this; | |
138 | } | |
139 | ||
140 | ||
141 | int32_t AlphabeticIndex::getBucketCount(UErrorCode &status) { | |
142 | buildIndex(status); | |
143 | if (U_FAILURE(status)) { | |
144 | return 0; | |
145 | } | |
146 | return bucketList_->size(); | |
147 | } | |
148 | ||
149 | ||
150 | int32_t AlphabeticIndex::getRecordCount(UErrorCode &status) { | |
151 | if (U_FAILURE(status)) { | |
152 | return 0; | |
153 | } | |
154 | return inputRecords_->size(); | |
155 | } | |
156 | ||
157 | ||
158 | void AlphabeticIndex::buildIndex(UErrorCode &status) { | |
159 | if (U_FAILURE(status)) { | |
160 | return; | |
161 | } | |
162 | if (!indexBuildRequired_) { | |
163 | return; | |
164 | } | |
165 | ||
166 | // Discard any already-built data. | |
167 | // This is important when the user builds and uses an index, then subsequently modifies it, | |
168 | // necessitating a rebuild. | |
169 | ||
170 | bucketList_->removeAllElements(); | |
171 | labels_->removeAllElements(); | |
172 | uhash_removeAll(alreadyIn_); | |
173 | noDistinctSorting_->clear(); | |
174 | notAlphabetic_->clear(); | |
175 | ||
176 | // first sort the incoming Labels, with a "best" ordering among items | |
177 | // that are the same according to the collator | |
178 | ||
179 | UVector preferenceSorting(status); // Vector of UnicodeStrings; owned by the vector. | |
180 | preferenceSorting.setDeleter(uprv_deleteUObject); | |
181 | appendUnicodeSetToUVector(preferenceSorting, *initialLabels_, status); | |
182 | preferenceSorting.sortWithUComparator(PreferenceComparator, &status, status); | |
183 | ||
184 | // We now make a set of Labels. | |
185 | // Some of the input may, however, be redundant. | |
186 | // That is, we might have c, ch, d, where "ch" sorts just like "c", "h" | |
187 | // So we make a pass through, filtering out those cases. | |
188 | // TODO: filtering these out would seem to be at odds with the eventual goal | |
189 | // of being able to split buckets that contain too many items. | |
190 | ||
191 | UnicodeSet labelSet; | |
192 | for (int32_t psIndex=0; psIndex<preferenceSorting.size(); psIndex++) { | |
193 | UnicodeString item = *static_cast<const UnicodeString *>(preferenceSorting.elementAt(psIndex)); | |
194 | // TODO: Since preferenceSorting was originally populated from the contents of a UnicodeSet, | |
195 | // is it even possible for duplicates to show up in this check? | |
196 | if (labelSet.contains(item)) { | |
197 | UnicodeSetIterator itemAlreadyInIter(labelSet); | |
198 | while (itemAlreadyInIter.next()) { | |
199 | const UnicodeString &itemAlreadyIn = itemAlreadyInIter.getString(); | |
200 | if (collatorPrimaryOnly_->compare(item, itemAlreadyIn) == 0) { | |
201 | UnicodeSet *targets = static_cast<UnicodeSet *>(uhash_get(alreadyIn_, &itemAlreadyIn)); | |
202 | if (targets == NULL) { | |
203 | // alreadyIn.put(itemAlreadyIn, targets = new LinkedHashSet<String>()); | |
204 | targets = new UnicodeSet(); | |
205 | uhash_put(alreadyIn_, itemAlreadyIn.clone(), targets, &status); | |
206 | } | |
207 | targets->add(item); | |
208 | break; | |
209 | } | |
210 | } | |
211 | } else if (item.moveIndex32(0, 1) < item.length() && // Label contains more than one code point. | |
212 | collatorPrimaryOnly_->compare(item, separated(item)) == 0) { | |
213 | noDistinctSorting_->add(item); | |
214 | } else if (!ALPHABETIC->containsSome(item)) { | |
215 | notAlphabetic_->add(item); | |
216 | } else { | |
217 | labelSet.add(item); | |
218 | } | |
219 | } | |
220 | ||
221 | // If we have no labels, hard-code a fallback default set of [A-Z] | |
222 | // This case can occur with locales that don't have exemplar character data, including root. | |
223 | // A no-labels situation will cause other problems; it needs to be avoided. | |
224 | if (labelSet.isEmpty()) { | |
225 | labelSet.add((UChar32)0x41, (UChar32)0x5A); | |
226 | } | |
227 | ||
228 | // Move the set of Labels from the set into a vector, and sort | |
229 | // according to the collator. | |
230 | ||
231 | appendUnicodeSetToUVector(*labels_, labelSet, status); | |
232 | labels_->sortWithUComparator(sortCollateComparator, collatorPrimaryOnly_, status); | |
233 | ||
234 | // if the result is still too large, cut down to maxLabelCount_ elements, by removing every nth element | |
235 | // Implemented by copying the elements to be retained to a new UVector. | |
236 | ||
237 | const int32_t size = labelSet.size() - 1; | |
238 | if (size > maxLabelCount_) { | |
239 | UVector *newLabels = new UVector(status); | |
240 | newLabels->setDeleter(uprv_deleteUObject); | |
241 | int32_t count = 0; | |
242 | int32_t old = -1; | |
243 | for (int32_t srcIndex=0; srcIndex<labels_->size(); srcIndex++) { | |
244 | const UnicodeString *str = static_cast<const UnicodeString *>(labels_->elementAt(srcIndex)); | |
245 | ++count; | |
246 | const int32_t bump = count * maxLabelCount_ / size; | |
247 | if (bump == old) { | |
248 | // it.remove(); | |
249 | } else { | |
250 | newLabels->addElement(str->clone(), status); | |
251 | old = bump; | |
252 | } | |
253 | } | |
254 | delete labels_; | |
255 | labels_ = newLabels; | |
256 | } | |
257 | ||
258 | // We now know the list of labels. | |
259 | // Create a corresponding list of buckets, one per label. | |
260 | ||
261 | buildBucketList(status); // Corresponds to Java BucketList constructor. | |
262 | ||
263 | // Bin the Records into the Buckets. | |
264 | bucketRecords(status); | |
265 | ||
266 | indexBuildRequired_ = FALSE; | |
267 | resetBucketIterator(status); | |
268 | } | |
269 | ||
270 | // | |
271 | // buildBucketList() Corresponds to the BucketList constructor in the Java version. | |
272 | ||
273 | void AlphabeticIndex::buildBucketList(UErrorCode &status) { | |
274 | UnicodeString labelStr = getUnderflowLabel(); | |
275 | Bucket *b = new Bucket(labelStr, *EMPTY_STRING, U_ALPHAINDEX_UNDERFLOW, status); | |
276 | bucketList_->addElement(b, status); | |
277 | ||
278 | // Build up the list, adding underflow, additions, overflow | |
279 | // insert infix labels as needed, using \uFFFF. | |
280 | const UnicodeString *last = static_cast<UnicodeString *>(labels_->elementAt(0)); | |
281 | b = new Bucket(*last, *last, U_ALPHAINDEX_NORMAL, status); | |
282 | bucketList_->addElement(b, status); | |
283 | ||
284 | UnicodeSet lastSet; | |
285 | UnicodeSet set; | |
286 | AlphabeticIndex::getScriptSet(lastSet, *last, status); | |
287 | lastSet.removeAll(*IGNORE_SCRIPTS); | |
288 | ||
289 | for (int i = 1; i < labels_->size(); ++i) { | |
290 | UnicodeString *current = static_cast<UnicodeString *>(labels_->elementAt(i)); | |
291 | getScriptSet(set, *current, status); | |
292 | set.removeAll(*IGNORE_SCRIPTS); | |
293 | if (lastSet.containsNone(set)) { | |
294 | // check for adjacent | |
295 | const UnicodeString &overflowComparisonString = getOverflowComparisonString(*last, status); | |
296 | if (collatorPrimaryOnly_->compare(overflowComparisonString, *current) < 0) { | |
297 | labelStr = getInflowLabel(); | |
298 | b = new Bucket(labelStr, overflowComparisonString, U_ALPHAINDEX_INFLOW, status); | |
299 | bucketList_->addElement(b, status); | |
300 | i++; | |
301 | lastSet = set; | |
302 | } | |
303 | } | |
304 | b = new Bucket(*current, *current, U_ALPHAINDEX_NORMAL, status); | |
305 | bucketList_->addElement(b, status); | |
306 | last = current; | |
307 | lastSet = set; | |
308 | } | |
309 | const UnicodeString &limitString = getOverflowComparisonString(*last, status); | |
310 | b = new Bucket(getOverflowLabel(), limitString, U_ALPHAINDEX_OVERFLOW, status); | |
311 | bucketList_->addElement(b, status); | |
312 | // final overflow bucket | |
313 | } | |
314 | ||
315 | ||
316 | // | |
317 | // Place all of the raw input records into the correct bucket. | |
318 | // | |
319 | // Begin by sorting the input records; this lets us bin them in a single pass. | |
320 | // | |
321 | // Note on storage management: The input records are owned by the | |
322 | // inputRecords_ vector, and will (eventually) be auto-deleted by it. | |
323 | // The Bucket objects have pointers to the Record objects, but do not own them. | |
324 | // | |
325 | void AlphabeticIndex::bucketRecords(UErrorCode &status) { | |
326 | if (U_FAILURE(status)) { | |
327 | return; | |
328 | } | |
329 | ||
330 | inputRecords_->sortWithUComparator(recordCompareFn, collator_, status); | |
331 | U_ASSERT(bucketList_->size() > 0); // Should always have at least an overflow | |
332 | // bucket, even if no user labels. | |
333 | int32_t bucketIndex = 0; | |
334 | Bucket *destBucket = static_cast<Bucket *>(bucketList_->elementAt(bucketIndex)); | |
335 | Bucket *nextBucket = NULL; | |
336 | if (bucketIndex+1 < bucketList_->size()) { | |
337 | nextBucket = static_cast<Bucket *>(bucketList_->elementAt(bucketIndex+1)); | |
338 | } | |
339 | int32_t recordIndex = 0; | |
340 | Record *r = static_cast<Record *>(inputRecords_->elementAt(recordIndex)); | |
341 | while (recordIndex < inputRecords_->size()) { | |
342 | if (nextBucket == NULL || | |
343 | collatorPrimaryOnly_->compare(r->sortingName_, nextBucket->lowerBoundary_) < 0) { | |
344 | // Record goes in current bucket. Advance to next record, | |
345 | // stay on current bucket. | |
346 | destBucket->records_->addElement(r, status); | |
347 | ++recordIndex; | |
348 | r = static_cast<Record *>(inputRecords_->elementAt(recordIndex)); | |
349 | } else { | |
350 | // Advance to the next bucket, stay on current record. | |
351 | bucketIndex++; | |
352 | destBucket = nextBucket; | |
353 | if (bucketIndex+1 < bucketList_->size()) { | |
354 | nextBucket = static_cast<Bucket *>(bucketList_->elementAt(bucketIndex+1)); | |
355 | } else { | |
356 | nextBucket = NULL; | |
357 | } | |
358 | U_ASSERT(destBucket != NULL); | |
359 | } | |
360 | } | |
361 | ||
362 | } | |
363 | ||
364 | ||
365 | void AlphabeticIndex::getIndexExemplars(UnicodeSet &dest, const Locale &locale, UErrorCode &status) { | |
366 | if (U_FAILURE(status)) { | |
367 | return; | |
368 | } | |
369 | ||
370 | LocalULocaleDataPointer uld(ulocdata_open(locale.getName(), &status)); | |
371 | UnicodeSet exemplars; | |
372 | ulocdata_getExemplarSet(uld.getAlias(), exemplars.toUSet(), 0, ULOCDATA_ES_INDEX, &status); | |
373 | if (U_SUCCESS(status)) { | |
374 | dest.addAll(exemplars); | |
375 | return; | |
376 | } | |
377 | status = U_ZERO_ERROR; // Clear out U_MISSING_RESOURCE_ERROR | |
378 | ||
379 | // Locale data did not include explicit Index characters. | |
380 | // Synthesize a set of them from the locale's standard exemplar characters. | |
381 | ||
382 | ulocdata_getExemplarSet(uld.getAlias(), exemplars.toUSet(), 0, ULOCDATA_ES_STANDARD, &status); | |
383 | if (U_FAILURE(status)) { | |
384 | return; | |
385 | } | |
386 | ||
387 | // Upper-case any that aren't already so. | |
388 | // (We only do this for synthesized index characters.) | |
389 | ||
390 | UnicodeSetIterator it(exemplars); | |
391 | UnicodeString upperC; | |
392 | UnicodeSet lowersToRemove; | |
393 | UnicodeSet uppersToAdd; | |
394 | while (it.next()) { | |
395 | const UnicodeString &exemplarC = it.getString(); | |
396 | upperC = exemplarC; | |
397 | upperC.toUpper(locale); | |
398 | if (exemplarC != upperC) { | |
399 | lowersToRemove.add(exemplarC); | |
400 | uppersToAdd.add(upperC); | |
401 | } | |
402 | } | |
403 | exemplars.removeAll(lowersToRemove); | |
404 | exemplars.addAll(uppersToAdd); | |
405 | ||
406 | // get the exemplars, and handle special cases | |
407 | ||
408 | // question: should we add auxiliary exemplars? | |
409 | if (exemplars.containsSome(*CORE_LATIN)) { | |
410 | exemplars.addAll(*CORE_LATIN); | |
411 | } | |
412 | if (exemplars.containsSome(*HANGUL)) { | |
413 | // cut down to small list | |
414 | UnicodeSet BLOCK_HANGUL_SYLLABLES(UNICODE_STRING_SIMPLE("[:block=hangul_syllables:]"), status); | |
415 | exemplars.removeAll(BLOCK_HANGUL_SYLLABLES); | |
416 | exemplars.addAll(*HANGUL); | |
417 | } | |
418 | if (exemplars.containsSome(*ETHIOPIC)) { | |
419 | // cut down to small list | |
420 | // make use of the fact that Ethiopic is allocated in 8's, where | |
421 | // the base is 0 mod 8. | |
422 | UnicodeSetIterator it(*ETHIOPIC); | |
423 | while (it.next() && !it.isString()) { | |
424 | if ((it.getCodepoint() & 0x7) != 0) { | |
425 | exemplars.remove(it.getCodepoint()); | |
426 | } | |
427 | } | |
428 | } | |
429 | dest.addAll(exemplars); | |
430 | } | |
431 | ||
432 | ||
433 | /* | |
434 | * Return the string with interspersed CGJs. Input must have more than 2 codepoints. | |
435 | */ | |
436 | static const UChar32 CGJ = (UChar)0x034F; | |
437 | UnicodeString AlphabeticIndex::separated(const UnicodeString &item) { | |
438 | UnicodeString result; | |
439 | if (item.length() == 0) { | |
440 | return result; | |
441 | } | |
442 | int32_t i = 0; | |
443 | for (;;) { | |
444 | UChar32 cp = item.char32At(i); | |
445 | result.append(cp); | |
446 | i = item.moveIndex32(i, 1); | |
447 | if (i >= item.length()) { | |
448 | break; | |
449 | } | |
450 | result.append(CGJ); | |
451 | } | |
452 | return result; | |
453 | } | |
454 | ||
455 | ||
456 | UBool AlphabeticIndex::operator==(const AlphabeticIndex& /* other */) const { | |
457 | return FALSE; | |
458 | } | |
459 | ||
460 | ||
461 | UBool AlphabeticIndex::operator!=(const AlphabeticIndex& /* other */) const { | |
462 | return FALSE; | |
463 | } | |
464 | ||
465 | ||
466 | const RuleBasedCollator &AlphabeticIndex::getCollator() const { | |
467 | // There are no known non-RuleBasedCollator collators, and none ever expected. | |
468 | // But, in case that changes, better a null pointer than a wrong type. | |
469 | return *dynamic_cast<RuleBasedCollator *>(collator_); | |
470 | } | |
471 | ||
472 | ||
473 | const UnicodeString &AlphabeticIndex::getInflowLabel() const { | |
474 | return inflowLabel_; | |
475 | } | |
476 | ||
477 | const UnicodeString &AlphabeticIndex::getOverflowLabel() const { | |
478 | return overflowLabel_; | |
479 | } | |
480 | ||
481 | ||
482 | const UnicodeString &AlphabeticIndex::getUnderflowLabel() const { | |
483 | return underflowLabel_; | |
484 | } | |
485 | ||
486 | ||
487 | AlphabeticIndex &AlphabeticIndex::setInflowLabel(const UnicodeString &label, UErrorCode &/*status*/) { | |
488 | inflowLabel_ = label; | |
489 | indexBuildRequired_ = TRUE; | |
490 | return *this; | |
491 | } | |
492 | ||
493 | ||
494 | AlphabeticIndex &AlphabeticIndex::setOverflowLabel(const UnicodeString &label, UErrorCode &/*status*/) { | |
495 | overflowLabel_ = label; | |
496 | indexBuildRequired_ = TRUE; | |
497 | return *this; | |
498 | } | |
499 | ||
500 | ||
501 | AlphabeticIndex &AlphabeticIndex::setUnderflowLabel(const UnicodeString &label, UErrorCode &/*status*/) { | |
502 | underflowLabel_ = label; | |
503 | indexBuildRequired_ = TRUE; | |
504 | return *this; | |
505 | } | |
506 | ||
507 | ||
508 | int32_t AlphabeticIndex::getMaxLabelCount() const { | |
509 | return maxLabelCount_; | |
510 | } | |
511 | ||
512 | ||
513 | AlphabeticIndex &AlphabeticIndex::setMaxLabelCount(int32_t maxLabelCount, UErrorCode &status) { | |
514 | if (U_FAILURE(status)) { | |
515 | return *this; | |
516 | } | |
517 | if (maxLabelCount <= 0) { | |
518 | status = U_ILLEGAL_ARGUMENT_ERROR; | |
519 | return *this; | |
520 | } | |
521 | maxLabelCount_ = maxLabelCount; | |
522 | if (maxLabelCount < bucketList_->size()) { | |
523 | indexBuildRequired_ = TRUE; | |
524 | } | |
525 | return *this; | |
526 | } | |
527 | ||
528 | ||
529 | const UnicodeString &AlphabeticIndex::getOverflowComparisonString(const UnicodeString &lowerLimit, UErrorCode &/*status*/) { | |
530 | for (int32_t i=0; i<firstScriptCharacters_->size(); i++) { | |
531 | const UnicodeString *s = | |
532 | static_cast<const UnicodeString *>(firstScriptCharacters_->elementAt(i)); | |
533 | if (collator_->compare(*s, lowerLimit) > 0) { | |
534 | return *s; | |
535 | } | |
536 | } | |
537 | return *EMPTY_STRING; | |
538 | } | |
539 | ||
540 | UnicodeSet *AlphabeticIndex::getScriptSet(UnicodeSet &dest, const UnicodeString &codePoint, UErrorCode &status) { | |
541 | if (U_FAILURE(status)) { | |
542 | return &dest; | |
543 | } | |
544 | UChar32 cp = codePoint.char32At(0); | |
545 | UScriptCode scriptCode = uscript_getScript(cp, &status); | |
546 | dest.applyIntPropertyValue(UCHAR_SCRIPT, scriptCode, status); | |
547 | return &dest; | |
548 | } | |
549 | ||
550 | // | |
551 | // init() - Common code for constructors. | |
552 | // | |
553 | ||
554 | void AlphabeticIndex::init(UErrorCode &status) { | |
555 | // Initialize statics if needed. | |
556 | AlphabeticIndex::staticInit(status); | |
557 | ||
558 | // Put the object into a known state so that the destructor will function. | |
559 | ||
560 | alreadyIn_ = NULL; | |
561 | bucketList_ = NULL; | |
562 | collator_ = NULL; | |
563 | collatorPrimaryOnly_ = NULL; | |
564 | currentBucket_ = NULL; | |
565 | firstScriptCharacters_ = NULL; | |
566 | initialLabels_ = NULL; | |
567 | indexBuildRequired_ = TRUE; | |
568 | inputRecords_ = NULL; | |
569 | itemsIterIndex_ = 0; | |
570 | labels_ = NULL; | |
571 | labelsIterIndex_ = 0; | |
572 | maxLabelCount_ = 99; | |
573 | noDistinctSorting_ = NULL; | |
574 | notAlphabetic_ = NULL; | |
575 | recordCounter_ = 0; | |
576 | ||
577 | if (U_FAILURE(status)) { | |
578 | return; | |
579 | } | |
580 | alreadyIn_ = uhash_open(uhash_hashUnicodeString, // Key Hash, | |
581 | uhash_compareUnicodeString, // key Comparator, | |
582 | NULL, // value Comparator | |
583 | &status); | |
584 | uhash_setKeyDeleter(alreadyIn_, uprv_deleteUObject); | |
585 | uhash_setValueDeleter(alreadyIn_, uprv_deleteUObject); | |
586 | ||
587 | bucketList_ = new UVector(status); | |
588 | bucketList_->setDeleter(alphaIndex_deleteBucket); | |
589 | labels_ = new UVector(status); | |
590 | labels_->setDeleter(uprv_deleteUObject); | |
591 | labels_->setComparer(uhash_compareUnicodeString); | |
592 | inputRecords_ = new UVector(status); | |
593 | inputRecords_->setDeleter(alphaIndex_deleteRecord); | |
594 | ||
595 | noDistinctSorting_ = new UnicodeSet(); | |
596 | notAlphabetic_ = new UnicodeSet(); | |
597 | initialLabels_ = new UnicodeSet(); | |
598 | ||
599 | inflowLabel_.remove(); | |
600 | inflowLabel_.append((UChar)0x2026); // Ellipsis | |
601 | overflowLabel_ = inflowLabel_; | |
602 | underflowLabel_ = inflowLabel_; | |
603 | ||
604 | // TODO: check for memory allocation failures. | |
605 | } | |
606 | ||
607 | ||
608 | static UBool indexCharactersAreInitialized = FALSE; | |
609 | ||
610 | // Index Characters Clean up function. Delete statically allocated constant stuff. | |
611 | U_CDECL_BEGIN | |
612 | static UBool U_CALLCONV indexCharacters_cleanup(void) { | |
613 | AlphabeticIndex::staticCleanup(); | |
614 | return TRUE; | |
615 | } | |
616 | U_CDECL_END | |
617 | ||
618 | void AlphabeticIndex::staticCleanup() { | |
619 | delete ALPHABETIC; | |
620 | ALPHABETIC = NULL; | |
621 | delete HANGUL; | |
622 | HANGUL = NULL; | |
623 | delete ETHIOPIC; | |
624 | ETHIOPIC = NULL; | |
625 | delete CORE_LATIN; | |
626 | CORE_LATIN = NULL; | |
627 | delete IGNORE_SCRIPTS; | |
628 | IGNORE_SCRIPTS = NULL; | |
629 | delete TO_TRY; | |
630 | TO_TRY = NULL; | |
631 | delete UNIHAN; | |
632 | UNIHAN = NULL; | |
633 | delete EMPTY_STRING; | |
634 | EMPTY_STRING = NULL; | |
635 | nfkdNormalizer = NULL; // ref to a singleton. Do not delete. | |
636 | indexCharactersAreInitialized = FALSE; | |
637 | } | |
638 | ||
639 | ||
640 | UnicodeSet *AlphabeticIndex::ALPHABETIC; | |
641 | UnicodeSet *AlphabeticIndex::HANGUL; | |
642 | UnicodeSet *AlphabeticIndex::ETHIOPIC; | |
643 | UnicodeSet *AlphabeticIndex::CORE_LATIN; | |
644 | UnicodeSet *AlphabeticIndex::IGNORE_SCRIPTS; | |
645 | UnicodeSet *AlphabeticIndex::TO_TRY; | |
646 | UnicodeSet *AlphabeticIndex::UNIHAN; | |
647 | const UnicodeString *AlphabeticIndex::EMPTY_STRING; | |
648 | ||
649 | // | |
650 | // staticInit() One-time initialization of constants. | |
651 | // Thread safe. Called from constructors. | |
652 | // Mutex overhead is not a concern. AlphabeticIndex constructors are | |
653 | // sufficiently heavy that the cost of the mutex check is not significant. | |
654 | ||
655 | void AlphabeticIndex::staticInit(UErrorCode &status) { | |
656 | static UMTX IndexCharsInitMutex; | |
657 | ||
658 | Mutex mutex(&IndexCharsInitMutex); | |
659 | if (indexCharactersAreInitialized || U_FAILURE(status)) { | |
660 | return; | |
661 | } | |
662 | UBool finishedInit = FALSE; | |
663 | ||
664 | { | |
665 | UnicodeString alphaString = UNICODE_STRING_SIMPLE("[[:alphabetic:]-[:mark:]]"); | |
666 | ALPHABETIC = new UnicodeSet(alphaString, status); | |
667 | if (ALPHABETIC == NULL) { | |
668 | goto err; | |
669 | } | |
670 | ||
671 | HANGUL = new UnicodeSet(); | |
672 | HANGUL->add(0xAC00).add(0xB098).add(0xB2E4).add(0xB77C).add(0xB9C8).add(0xBC14).add(0xC0AC). | |
673 | add(0xC544).add(0xC790).add(0xCC28).add(0xCE74).add(0xD0C0).add(0xD30C).add(0xD558); | |
674 | if (HANGUL== NULL) { | |
675 | goto err; | |
676 | } | |
677 | ||
678 | ||
679 | UnicodeString EthiopicStr = UNICODE_STRING_SIMPLE("[[:Block=Ethiopic:]&[:Script=Ethiopic:]]"); | |
680 | ETHIOPIC = new UnicodeSet(EthiopicStr, status); | |
681 | if (ETHIOPIC == NULL) { | |
682 | goto err; | |
683 | } | |
684 | ||
685 | CORE_LATIN = new UnicodeSet((UChar32)0x61, (UChar32)0x7a); // ('a', 'z'); | |
686 | if (CORE_LATIN == NULL) { | |
687 | goto err; | |
688 | } | |
689 | ||
690 | UnicodeString IgnoreStr= UNICODE_STRING_SIMPLE( | |
691 | "[[:sc=Common:][:sc=inherited:][:script=Unknown:][:script=braille:]]"); | |
692 | IGNORE_SCRIPTS = new UnicodeSet(IgnoreStr, status); | |
693 | IGNORE_SCRIPTS->freeze(); | |
694 | if (IGNORE_SCRIPTS == NULL) { | |
695 | goto err; | |
696 | } | |
697 | ||
698 | UnicodeString nfcqcStr = UNICODE_STRING_SIMPLE("[:^nfcqc=no:]"); | |
699 | TO_TRY = new UnicodeSet(nfcqcStr, status); | |
700 | if (TO_TRY == NULL) { | |
701 | goto err; | |
702 | } | |
703 | ||
704 | UnicodeString unihanStr = UNICODE_STRING_SIMPLE("[:script=Hani:]"); | |
705 | UNIHAN = new UnicodeSet(unihanStr, status); | |
706 | if (UNIHAN == NULL) { | |
707 | goto err; | |
708 | } | |
709 | ||
710 | EMPTY_STRING = new UnicodeString(); | |
711 | ||
712 | nfkdNormalizer = Normalizer2::getNFKDInstance(status); | |
713 | if (nfkdNormalizer == NULL) { | |
714 | goto err; | |
715 | } | |
716 | } | |
717 | finishedInit = TRUE; | |
718 | ||
719 | err: | |
720 | if (!finishedInit && U_SUCCESS(status)) { | |
721 | status = U_MEMORY_ALLOCATION_ERROR; | |
722 | } | |
723 | if (U_FAILURE(status)) { | |
724 | indexCharacters_cleanup(); | |
725 | return; | |
726 | } | |
727 | ucln_i18n_registerCleanup(UCLN_I18N_INDEX_CHARACTERS, indexCharacters_cleanup); | |
728 | indexCharactersAreInitialized = TRUE; | |
729 | } | |
730 | ||
731 | ||
732 | // | |
733 | // Comparison function for UVector<UnicodeString *> sorting with a collator. | |
734 | // | |
735 | static int32_t U_CALLCONV | |
736 | sortCollateComparator(const void *context, const void *left, const void *right) { | |
737 | const UElement *leftElement = static_cast<const UElement *>(left); | |
738 | const UElement *rightElement = static_cast<const UElement *>(right); | |
739 | const UnicodeString *leftString = static_cast<const UnicodeString *>(leftElement->pointer); | |
740 | const UnicodeString *rightString = static_cast<const UnicodeString *>(rightElement->pointer); | |
741 | const Collator *col = static_cast<const Collator *>(context); | |
742 | ||
743 | if (leftString == rightString) { | |
744 | // Catches case where both are NULL | |
745 | return 0; | |
746 | } | |
747 | if (leftString == NULL) { | |
748 | return 1; | |
749 | }; | |
750 | if (rightString == NULL) { | |
751 | return -1; | |
752 | } | |
753 | Collator::EComparisonResult r = col->compare(*leftString, *rightString); | |
754 | return (int32_t) r; | |
755 | } | |
756 | ||
757 | // | |
758 | // Comparison function for UVector<Record *> sorting with a collator. | |
759 | // | |
760 | static int32_t U_CALLCONV | |
761 | recordCompareFn(const void *context, const void *left, const void *right) { | |
762 | const UElement *leftElement = static_cast<const UElement *>(left); | |
763 | const UElement *rightElement = static_cast<const UElement *>(right); | |
764 | const AlphabeticIndex::Record *leftRec = static_cast<const AlphabeticIndex::Record *>(leftElement->pointer); | |
765 | const AlphabeticIndex::Record *rightRec = static_cast<const AlphabeticIndex::Record *>(rightElement->pointer); | |
766 | const Collator *col = static_cast<const Collator *>(context); | |
767 | ||
768 | Collator::EComparisonResult r = col->compare(leftRec->sortingName_, rightRec->sortingName_); | |
769 | if (r == Collator::EQUAL) { | |
770 | if (leftRec->serialNumber_ < rightRec->serialNumber_) { | |
771 | r = Collator::LESS; | |
772 | } else if (leftRec->serialNumber_ > rightRec->serialNumber_) { | |
773 | r = Collator::GREATER; | |
774 | } | |
775 | } | |
776 | return (int32_t) r; | |
777 | } | |
778 | ||
779 | ||
780 | #if 0 | |
781 | // | |
782 | // First characters in scripts. | |
783 | // Create a UVector whose contents are pointers to UnicodeStrings for the First Characters in each script. | |
784 | // The vector is sorted according to this index's collation. | |
785 | // | |
786 | // This code is too slow to use, so for now hard code the data. | |
787 | // Hard coded implementation is follows. | |
788 | // | |
789 | UVector *AlphabeticIndex::firstStringsInScript(Collator *ruleBasedCollator, UErrorCode &status) { | |
790 | ||
791 | if (U_FAILURE(status)) { | |
792 | return NULL; | |
793 | } | |
794 | ||
795 | UnicodeString results[USCRIPT_CODE_LIMIT]; | |
796 | UnicodeString LOWER_A = UNICODE_STRING_SIMPLE("a"); | |
797 | ||
798 | UnicodeSetIterator siter(*TO_TRY); | |
799 | while (siter.next()) { | |
800 | const UnicodeString ¤t = siter.getString(); | |
801 | Collator::EComparisonResult r = ruleBasedCollator->compare(current, LOWER_A); | |
802 | if (r < 0) { // TODO fix; we only want "real" script characters, not | |
803 | // symbols. | |
804 | continue; | |
805 | } | |
806 | ||
807 | int script = uscript_getScript(current.char32At(0), &status); | |
808 | if (results[script].length() == 0) { | |
809 | results[script] = current; | |
810 | } | |
811 | else if (ruleBasedCollator->compare(current, results[script]) < 0) { | |
812 | results[script] = current; | |
813 | } | |
814 | } | |
815 | ||
816 | UnicodeSet extras; | |
817 | UnicodeSet expansions; | |
818 | RuleBasedCollator *rbc = dynamic_cast<RuleBasedCollator *>(ruleBasedCollator); | |
819 | const UCollator *uRuleBasedCollator = rbc->getUCollator(); | |
820 | ucol_getContractionsAndExpansions(uRuleBasedCollator, extras.toUSet(), expansions.toUSet(), true, &status); | |
821 | extras.addAll(expansions).removeAll(*TO_TRY); | |
822 | if (extras.size() != 0) { | |
823 | const Normalizer2 *normalizer = Normalizer2::getNFKCInstance(status); | |
824 | UnicodeSetIterator extrasIter(extras); | |
825 | while (extrasIter.next()) { | |
826 | const UnicodeString ¤t = extrasIter.next(); | |
827 | if (!TO_TRY->containsAll(current)) | |
828 | continue; | |
829 | if (!normalizer->isNormalized(current, status) || | |
830 | ruleBasedCollator->compare(current, LOWER_A) < 0) { | |
831 | continue; | |
832 | } | |
833 | int script = uscript_getScript(current.char32At(0), &status); | |
834 | if (results[script].length() == 0) { | |
835 | results[script] = current; | |
836 | } else if (ruleBasedCollator->compare(current, results[script]) < 0) { | |
837 | results[script] = current; | |
838 | } | |
839 | } | |
840 | } | |
841 | ||
842 | UVector *dest = new UVector(status); | |
843 | dest->setDeleter(uprv_deleteUObject); | |
844 | for (uint32_t i = 0; i < sizeof(results) / sizeof(results[0]); ++i) { | |
845 | if (results[i].length() > 0) { | |
846 | dest->addElement(results[i].clone(), status); | |
847 | } | |
848 | } | |
849 | dest->sortWithUComparator(sortCollateComparator, ruleBasedCollator, status); | |
850 | return dest; | |
851 | } | |
852 | #endif | |
853 | ||
854 | ||
855 | // | |
856 | // First characters in scripts. | |
857 | // Create a UVector whose contents are pointers to UnicodeStrings for the First Characters in each script. | |
858 | // The vector is sorted according to this index's collation. | |
859 | // | |
860 | // It takes too much time to compute this from character properties, so hard code it for now. | |
861 | // Character constants copied from corresponding declaration in ICU4J. | |
862 | // See main/classes/collate/src/com/ibm/icu/text/AlphabeticIndex.java | |
863 | ||
864 | static UChar HACK_FIRST_CHARS_IN_SCRIPTS[] = { 0x61, 0, 0x03B1, 0, | |
865 | 0x2C81, 0, 0x0430, 0, 0x2C30, 0, 0x10D0, 0, 0x0561, 0, 0x05D0, 0, 0xD802, 0xDD00, 0, 0x0800, 0, 0x0621, 0, 0x0710, 0, | |
866 | 0x0780, 0, 0x07CA, 0, 0x2D30, 0, 0x1200, 0, 0x0950, 0, 0x0985, 0, 0x0A74, 0, 0x0AD0, 0, 0x0B05, 0, 0x0BD0, 0, | |
867 | 0x0C05, 0, 0x0C85, 0, 0x0D05, 0, 0x0D85, 0, | |
868 | 0xAAF2, 0, // Meetei Mayek | |
869 | 0xA800, 0, 0xA882, 0, 0xD804, 0xDC83, 0, | |
870 | U16_LEAD(0x111C4), U16_TRAIL(0x111C4), 0, // Sharada | |
871 | U16_LEAD(0x11680), U16_TRAIL(0x11680), 0, // Takri | |
872 | 0x1B83, 0, | |
873 | 0xD802, 0xDE00, 0, 0x0E01, 0, | |
874 | 0x0EDE, 0, // Lao | |
875 | 0xAA80, 0, 0x0F40, 0, 0x1C00, 0, 0xA840, 0, 0x1900, 0, 0x1700, 0, 0x1720, 0, | |
876 | 0x1740, 0, 0x1760, 0, 0x1A00, 0, 0xA930, 0, 0xA90A, 0, 0x1000, 0, | |
877 | U16_LEAD(0x11103), U16_TRAIL(0x11103), 0, // Chakma | |
878 | 0x1780, 0, 0x1950, 0, 0x1980, 0, 0x1A20, 0, | |
879 | 0xAA00, 0, 0x1B05, 0, 0xA984, 0, 0x1880, 0, 0x1C5A, 0, 0x13A0, 0, 0x1401, 0, 0x1681, 0, 0x16A0, 0, 0xD803, 0xDC00, 0, | |
880 | 0xA500, 0, 0xA6A0, 0, 0x1100, 0, 0x3041, 0, 0x30A1, 0, 0x3105, 0, 0xA000, 0, 0xA4F8, 0, | |
881 | U16_LEAD(0x16F00), U16_TRAIL(0x16F00), 0, // Miao | |
882 | 0xD800, 0xDE80, 0, | |
883 | 0xD800, 0xDEA0, 0, 0xD802, 0xDD20, 0, 0xD800, 0xDF00, 0, 0xD800, 0xDF30, 0, 0xD801, 0xDC28, 0, 0xD801, 0xDC50, 0, | |
884 | 0xD801, 0xDC80, 0, | |
885 | U16_LEAD(0x110D0), U16_TRAIL(0x110D0), 0, // Sora Sompeng | |
886 | 0xD800, 0xDC00, 0, 0xD802, 0xDC00, 0, 0xD802, 0xDE60, 0, 0xD802, 0xDF00, 0, 0xD802, 0xDC40, 0, | |
887 | 0xD802, 0xDF40, 0, 0xD802, 0xDF60, 0, 0xD800, 0xDF80, 0, 0xD800, 0xDFA0, 0, 0xD808, 0xDC00, 0, 0xD80C, 0xDC00, 0, | |
888 | U16_LEAD(0x109A0), U16_TRAIL(0x109A0), 0, // Meroitic Cursive | |
889 | U16_LEAD(0x10980), U16_TRAIL(0x10980), 0, // Meroitic Hieroglyphs | |
890 | 0x4E00, 0 }; | |
891 | ||
892 | UVector *AlphabeticIndex::firstStringsInScript(UErrorCode &status) { | |
893 | if (U_FAILURE(status)) { | |
894 | return NULL; | |
895 | } | |
896 | UVector *dest = new UVector(status); | |
897 | if (dest == NULL) { | |
898 | if (U_SUCCESS(status)) { | |
899 | status = U_MEMORY_ALLOCATION_ERROR; | |
900 | } | |
901 | return NULL; | |
902 | } | |
903 | dest->setDeleter(uprv_deleteUObject); | |
904 | const UChar *src = HACK_FIRST_CHARS_IN_SCRIPTS; | |
905 | const UChar *limit = src + sizeof(HACK_FIRST_CHARS_IN_SCRIPTS) / sizeof(HACK_FIRST_CHARS_IN_SCRIPTS[0]); | |
906 | do { | |
907 | if (U_FAILURE(status)) { | |
908 | return dest; | |
909 | } | |
910 | UnicodeString *str = new UnicodeString(src, -1); | |
911 | if (str == NULL) { | |
912 | status = U_MEMORY_ALLOCATION_ERROR; | |
913 | } else { | |
914 | dest->addElement(str, status); | |
915 | src += str->length() + 1; | |
916 | } | |
917 | } while (src < limit); | |
918 | dest->sortWithUComparator(sortCollateComparator, collator_, status); | |
919 | return dest; | |
920 | } | |
921 | ||
922 | ||
923 | AlphabeticIndex::ELangType AlphabeticIndex::langTypeFromLocale(const Locale &loc) { | |
924 | const char *lang = loc.getLanguage(); | |
925 | if (uprv_strcmp(lang, "zh") != 0) { | |
926 | return kNormal; | |
927 | } | |
928 | const char *script = loc.getScript(); | |
929 | if (uprv_strcmp(script, "Hant") == 0) { | |
930 | return kTraditional; | |
931 | } | |
932 | const char *country = loc.getCountry(); | |
933 | if (uprv_strcmp(country, "TW") == 0) { | |
934 | return kTraditional; | |
935 | } | |
936 | return kSimplified; | |
937 | } | |
938 | ||
939 | ||
940 | // | |
941 | // Pinyin Hacks. Direct port from Java. | |
942 | // | |
943 | ||
944 | static const UChar32 probeCharInLong = 0x28EAD; | |
945 | ||
946 | ||
947 | static const UChar PINYIN_LOWER_BOUNDS_SHORT[] = { // "\u0101bcd\u0113fghjkl\u1E3F\u0144\u014Dpqrstwxyz" | |
948 | 0x0101, 0x62, 0x63, 0x64, 0x0113, 0x66, 0x67, 0x68, 0x6A, 0x6B, /*l*/0x6C, 0x1E3F, 0x0144, 0x014D, | |
949 | /*p*/0x70, 0x71, 0x72, 0x73, 0x74, /*w*/0x77, 0x78, 0x79, 0x7A}; | |
950 | ||
951 | ||
952 | // Pinyin lookup tables copied, pasted (and reformatted) from the ICU4J code. | |
953 | ||
954 | AlphabeticIndex::PinyinLookup AlphabeticIndex::HACK_PINYIN_LOOKUP_SHORT = { | |
955 | {(UChar)0, (UChar)0, (UChar)0}, // A | |
956 | {(UChar)0x516B, (UChar)0, (UChar)0}, // B | |
957 | {(UChar)0x5693, (UChar)0, (UChar)0}, // C | |
958 | {(UChar)0x5491, (UChar)0, (UChar)0}, // D | |
959 | {(UChar)0x59B8, (UChar)0, (UChar)0}, // E | |
960 | {(UChar)0x53D1, (UChar)0, (UChar)0}, // F | |
961 | {(UChar)0x65EE, (UChar)0, (UChar)0}, // G | |
962 | {(UChar)0x54C8, (UChar)0, (UChar)0}, // H | |
963 | {(UChar)0x4E0C, (UChar)0, (UChar)0}, // J | |
964 | {(UChar)0x5494, (UChar)0, (UChar)0}, // K | |
965 | {(UChar)0x5783, (UChar)0, (UChar)0}, // L | |
966 | {(UChar)0x5452, (UChar)0, (UChar)0}, // M | |
967 | {(UChar)0x5514, (UChar)0, (UChar)0}, // N | |
968 | {(UChar)0x5594, (UChar)0, (UChar)0}, // O | |
969 | {(UChar)0x5991, (UChar)0, (UChar)0}, // P | |
970 | {(UChar)0x4E03, (UChar)0, (UChar)0}, // Q | |
971 | {(UChar)0x513F, (UChar)0, (UChar)0}, // R | |
972 | {(UChar)0x4EE8, (UChar)0, (UChar)0}, // S | |
973 | {(UChar)0x4ED6, (UChar)0, (UChar)0}, // T | |
974 | {(UChar)0x7A75, (UChar)0, (UChar)0}, // W | |
975 | {(UChar)0x5915, (UChar)0, (UChar)0}, // X | |
976 | {(UChar)0x4E2B, (UChar)0, (UChar)0}, // Y | |
977 | {(UChar)0x5E00, (UChar)0, (UChar)0}, // Z | |
978 | {(UChar)0xFFFF, (UChar)0, (UChar)0}, // mark end of array | |
979 | }; | |
980 | ||
981 | static const UChar PINYIN_LOWER_BOUNDS_LONG[] = { // "\u0101bcd\u0113fghjkl\u1E3F\u0144\u014Dpqrstwxyz"; | |
982 | 0x0101, 0x62, 0x63, 0x64, 0x0113, 0x66, 0x67, 0x68, 0x6A, 0x6B, /*l*/0x6C, 0x1E3F, 0x0144, 0x014D, | |
983 | /*p*/0x70, 0x71, 0x72, 0x73, 0x74, /*w*/0x77, 0x78, 0x79, 0x7A}; | |
984 | ||
985 | AlphabeticIndex::PinyinLookup AlphabeticIndex::HACK_PINYIN_LOOKUP_LONG = { | |
986 | {(UChar)0, (UChar)0, (UChar)0}, // A | |
987 | {(UChar)0x516B, (UChar)0, (UChar)0}, // b | |
988 | {(UChar)0xD863, (UChar)0xDEAD, (UChar)0}, // c | |
989 | {(UChar)0xD844, (UChar)0xDE51, (UChar)0}, // d | |
990 | {(UChar)0x59B8, (UChar)0, (UChar)0}, // e | |
991 | {(UChar)0x53D1, (UChar)0, (UChar)0}, // f | |
992 | {(UChar)0xD844, (UChar)0xDE45, (UChar)0}, // g | |
993 | {(UChar)0x54C8, (UChar)0, (UChar)0}, // h | |
994 | {(UChar)0x4E0C, (UChar)0, (UChar)0}, // j | |
995 | {(UChar)0x5494, (UChar)0, (UChar)0}, // k | |
996 | {(UChar)0x3547, (UChar)0, (UChar)0}, // l | |
997 | {(UChar)0x5452, (UChar)0, (UChar)0}, // m | |
998 | {(UChar)0x5514, (UChar)0, (UChar)0}, // n | |
999 | {(UChar)0x5594, (UChar)0, (UChar)0}, // o | |
1000 | {(UChar)0xD84F, (UChar)0xDC7A, (UChar)0}, // p | |
1001 | {(UChar)0x4E03, (UChar)0, (UChar)0}, // q | |
1002 | {(UChar)0x513F, (UChar)0, (UChar)0}, // r | |
1003 | {(UChar)0x4EE8, (UChar)0, (UChar)0}, // s | |
1004 | {(UChar)0x4ED6, (UChar)0, (UChar)0}, // t | |
1005 | {(UChar)0x7A75, (UChar)0, (UChar)0}, // w | |
1006 | {(UChar)0x5915, (UChar)0, (UChar)0}, // x | |
1007 | {(UChar)0x4E2B, (UChar)0, (UChar)0}, // y | |
1008 | {(UChar)0x5E00, (UChar)0, (UChar)0}, // z | |
1009 | {(UChar)0xFFFF, (UChar)0, (UChar)0}, // mark end of array | |
1010 | }; | |
1011 | ||
1012 | ||
1013 | // | |
1014 | // Probe the collation data, and decide which Pinyin tables should be used | |
1015 | // | |
1016 | // ICU can be built with a choice between two Chinese collations. | |
1017 | // The hack Pinyin tables to use depend on which one is in use. | |
1018 | // We can assume that any given copy of ICU will have only one of the collations available, | |
1019 | // and that there is no way, in a given process, to create two alphabetic indexes using | |
1020 | // different Chinese collations. Which means the probe can be done once | |
1021 | // and the results cached. | |
1022 | // | |
1023 | // This whole arrangement is temporary. | |
1024 | // | |
1025 | AlphabeticIndex::PinyinLookup *AlphabeticIndex::HACK_PINYIN_LOOKUP = NULL; | |
1026 | const UChar *AlphabeticIndex::PINYIN_LOWER_BOUNDS = NULL; | |
1027 | ||
1028 | void AlphabeticIndex::initPinyinBounds(const Collator *col, UErrorCode &status) { | |
1029 | { | |
1030 | Mutex m; | |
1031 | if (PINYIN_LOWER_BOUNDS != NULL) { | |
1032 | return; | |
1033 | } | |
1034 | } | |
1035 | UnicodeSet *colSet = col->getTailoredSet(status); | |
1036 | if (U_FAILURE(status) || colSet == NULL) { | |
1037 | delete colSet; | |
1038 | if (U_SUCCESS(status)) { | |
1039 | status = U_MEMORY_ALLOCATION_ERROR; | |
1040 | } | |
1041 | return; | |
1042 | } | |
1043 | UBool useLongTables = colSet->contains(probeCharInLong); | |
1044 | delete colSet; | |
1045 | { | |
1046 | Mutex m; | |
1047 | if (useLongTables) { | |
1048 | PINYIN_LOWER_BOUNDS = PINYIN_LOWER_BOUNDS_LONG; | |
1049 | HACK_PINYIN_LOOKUP = &HACK_PINYIN_LOOKUP_LONG; | |
1050 | } else { | |
1051 | PINYIN_LOWER_BOUNDS = PINYIN_LOWER_BOUNDS_SHORT; | |
1052 | HACK_PINYIN_LOOKUP = &HACK_PINYIN_LOOKUP_SHORT; | |
1053 | } | |
1054 | } | |
1055 | } | |
1056 | ||
1057 | // Pinyin Hack: | |
1058 | // Modify a Chinese name by prepending a Latin letter. The modified name is used | |
1059 | // when putting records (names) into buckets, to put the name under a Latin index heading. | |
1060 | ||
1061 | void AlphabeticIndex::hackName(UnicodeString &dest, const UnicodeString &name, const Collator *col) { | |
1062 | ||
1063 | if (langType_ != kSimplified || !UNIHAN->contains(name.char32At(0))) { | |
1064 | dest = name; | |
1065 | return; | |
1066 | } | |
1067 | ||
1068 | UErrorCode status = U_ZERO_ERROR; | |
1069 | initPinyinBounds(col, status); | |
1070 | if (U_FAILURE(status)) { | |
1071 | dest = name; | |
1072 | return; | |
1073 | } | |
1074 | // TODO: use binary search | |
1075 | int index; | |
1076 | for (index=0; ; index++) { | |
1077 | if ((*HACK_PINYIN_LOOKUP)[index][0] == (UChar)0xffff) { | |
1078 | index--; | |
1079 | break; | |
1080 | } | |
1081 | int32_t compareResult = col->compare(name, UnicodeString(TRUE, (*HACK_PINYIN_LOOKUP)[index], -1)); | |
1082 | if (compareResult < 0) { | |
1083 | index--; | |
1084 | } | |
1085 | if (compareResult <= 0) { | |
1086 | break; | |
1087 | } | |
1088 | } | |
1089 | UChar c = PINYIN_LOWER_BOUNDS[index]; | |
1090 | dest.setTo(c); | |
1091 | dest.append(name); | |
1092 | return; | |
1093 | } | |
1094 | ||
1095 | ||
1096 | ||
1097 | /** | |
1098 | * Comparator that returns "better" items first, where shorter NFKD is better, and otherwise NFKD binary order is | |
1099 | * better, and otherwise binary order is better. | |
1100 | * | |
1101 | * For use with array sort or UVector. | |
1102 | * @param context A UErrorCode pointer. | |
1103 | * @param left A UElement pointer, which must refer to a UnicodeString * | |
1104 | * @param right A UElement pointer, which must refer to a UnicodeString * | |
1105 | */ | |
1106 | ||
1107 | static int32_t U_CALLCONV | |
1108 | PreferenceComparator(const void *context, const void *left, const void *right) { | |
1109 | const UElement *leftElement = static_cast<const UElement *>(left); | |
1110 | const UElement *rightElement = static_cast<const UElement *>(right); | |
1111 | const UnicodeString *s1 = static_cast<const UnicodeString *>(leftElement->pointer); | |
1112 | const UnicodeString *s2 = static_cast<const UnicodeString *>(rightElement->pointer); | |
1113 | UErrorCode &status = *(UErrorCode *)(context); // Cast off both static and const. | |
1114 | if (s1 == s2) { | |
1115 | return 0; | |
1116 | } | |
1117 | ||
1118 | UnicodeString n1 = nfkdNormalizer->normalize(*s1, status); | |
1119 | UnicodeString n2 = nfkdNormalizer->normalize(*s2, status); | |
1120 | int32_t result = n1.length() - n2.length(); | |
1121 | if (result != 0) { | |
1122 | return result; | |
1123 | } | |
1124 | ||
1125 | result = n1.compareCodePointOrder(n2); | |
1126 | if (result != 0) { | |
1127 | return result; | |
1128 | } | |
1129 | return s1->compareCodePointOrder(*s2); | |
1130 | } | |
1131 | ||
1132 | ||
1133 | // | |
1134 | // Constructor & Destructor for AlphabeticIndex::Record | |
1135 | // | |
1136 | // Records are internal only, instances are not directly surfaced in the public API. | |
1137 | // This class is mostly struct-like, with all public fields. | |
1138 | ||
1139 | AlphabeticIndex::Record::Record(AlphabeticIndex *alphaIndex, const UnicodeString &name, const void *data): | |
1140 | alphaIndex_(alphaIndex), name_(name), data_(data) | |
1141 | { | |
1142 | UnicodeString prefixedName; | |
1143 | alphaIndex->hackName(sortingName_, name_, alphaIndex->collatorPrimaryOnly_); | |
1144 | serialNumber_ = ++alphaIndex->recordCounter_; | |
1145 | } | |
1146 | ||
1147 | AlphabeticIndex::Record::~Record() { | |
1148 | } | |
1149 | ||
1150 | ||
1151 | AlphabeticIndex & AlphabeticIndex::addRecord(const UnicodeString &name, const void *data, UErrorCode &status) { | |
1152 | if (U_FAILURE(status)) { | |
1153 | return *this; | |
1154 | } | |
1155 | Record *r = new Record(this, name, data); | |
1156 | inputRecords_->addElement(r, status); | |
1157 | indexBuildRequired_ = TRUE; | |
1158 | //std::string ss; | |
1159 | //std::string ss2; | |
1160 | //std::cout << "added record: name = \"" << r->name_.toUTF8String(ss) << "\"" << | |
1161 | // " sortingName = \"" << r->sortingName_.toUTF8String(ss2) << "\"" << std::endl; | |
1162 | return *this; | |
1163 | } | |
1164 | ||
1165 | ||
1166 | AlphabeticIndex &AlphabeticIndex::clearRecords(UErrorCode &status) { | |
1167 | if (U_FAILURE(status)) { | |
1168 | return *this; | |
1169 | } | |
1170 | inputRecords_->removeAllElements(); | |
1171 | indexBuildRequired_ = TRUE; | |
1172 | return *this; | |
1173 | } | |
1174 | ||
1175 | ||
1176 | int32_t AlphabeticIndex::getBucketIndex(const UnicodeString &name, UErrorCode &status) { | |
1177 | buildIndex(status); | |
1178 | if (U_FAILURE(status)) { | |
1179 | return 0; | |
1180 | } | |
1181 | ||
1182 | // For simplified Chinese prepend a prefix to the name. | |
1183 | // For non-Chinese locales or non-Chinese names, the name is not modified. | |
1184 | ||
1185 | UnicodeString prefixedName; | |
1186 | hackName(prefixedName, name, collatorPrimaryOnly_); | |
1187 | ||
1188 | // TODO: use a binary search. | |
1189 | for (int32_t i = 0; i < bucketList_->size(); ++i) { | |
1190 | Bucket *bucket = static_cast<Bucket *>(bucketList_->elementAt(i)); | |
1191 | Collator::EComparisonResult comp = collatorPrimaryOnly_->compare(prefixedName, bucket->lowerBoundary_); | |
1192 | if (comp < 0) { | |
1193 | return i - 1; | |
1194 | } | |
1195 | } | |
1196 | // Loop runs until we find the bucket following the one that would hold prefixedName. | |
1197 | // If the prefixedName belongs in the last bucket the loop will drop out the bottom rather | |
1198 | // than returning from the middle. | |
1199 | ||
1200 | return bucketList_->size() - 1; | |
1201 | } | |
1202 | ||
1203 | ||
1204 | int32_t AlphabeticIndex::getBucketIndex() const { | |
1205 | return labelsIterIndex_; | |
1206 | } | |
1207 | ||
1208 | ||
1209 | UBool AlphabeticIndex::nextBucket(UErrorCode &status) { | |
1210 | if (U_FAILURE(status)) { | |
1211 | return FALSE; | |
1212 | } | |
1213 | if (indexBuildRequired_ && currentBucket_ != NULL) { | |
1214 | status = U_ENUM_OUT_OF_SYNC_ERROR; | |
1215 | return FALSE; | |
1216 | } | |
1217 | buildIndex(status); | |
1218 | if (U_FAILURE(status)) { | |
1219 | return FALSE; | |
1220 | } | |
1221 | ++labelsIterIndex_; | |
1222 | if (labelsIterIndex_ >= bucketList_->size()) { | |
1223 | labelsIterIndex_ = bucketList_->size(); | |
1224 | return FALSE; | |
1225 | } | |
1226 | currentBucket_ = static_cast<Bucket *>(bucketList_->elementAt(labelsIterIndex_)); | |
1227 | resetRecordIterator(); | |
1228 | return TRUE; | |
1229 | } | |
1230 | ||
1231 | const UnicodeString &AlphabeticIndex::getBucketLabel() const { | |
1232 | if (currentBucket_ != NULL) { | |
1233 | return currentBucket_->label_; | |
1234 | } else { | |
1235 | return *EMPTY_STRING; | |
1236 | } | |
1237 | } | |
1238 | ||
1239 | ||
1240 | UAlphabeticIndexLabelType AlphabeticIndex::getBucketLabelType() const { | |
1241 | if (currentBucket_ != NULL) { | |
1242 | return currentBucket_->labelType_; | |
1243 | } else { | |
1244 | return U_ALPHAINDEX_NORMAL; | |
1245 | } | |
1246 | } | |
1247 | ||
1248 | ||
1249 | int32_t AlphabeticIndex::getBucketRecordCount() const { | |
1250 | if (currentBucket_ != NULL) { | |
1251 | return currentBucket_->records_->size(); | |
1252 | } else { | |
1253 | return 0; | |
1254 | } | |
1255 | } | |
1256 | ||
1257 | ||
1258 | AlphabeticIndex &AlphabeticIndex::resetBucketIterator(UErrorCode &status) { | |
1259 | if (U_FAILURE(status)) { | |
1260 | return *this; | |
1261 | } | |
1262 | buildIndex(status); | |
1263 | labelsIterIndex_ = -1; | |
1264 | currentBucket_ = NULL; | |
1265 | return *this; | |
1266 | } | |
1267 | ||
1268 | ||
1269 | UBool AlphabeticIndex::nextRecord(UErrorCode &status) { | |
1270 | if (U_FAILURE(status)) { | |
1271 | return FALSE; | |
1272 | } | |
1273 | if (currentBucket_ == NULL) { | |
1274 | // We are trying to iterate over the items in a bucket, but there is no | |
1275 | // current bucket from the enumeration of buckets. | |
1276 | status = U_INVALID_STATE_ERROR; | |
1277 | return FALSE; | |
1278 | } | |
1279 | if (indexBuildRequired_) { | |
1280 | status = U_ENUM_OUT_OF_SYNC_ERROR; | |
1281 | return FALSE; | |
1282 | } | |
1283 | ++itemsIterIndex_; | |
1284 | if (itemsIterIndex_ >= currentBucket_->records_->size()) { | |
1285 | itemsIterIndex_ = currentBucket_->records_->size(); | |
1286 | return FALSE; | |
1287 | } | |
1288 | return TRUE; | |
1289 | } | |
1290 | ||
1291 | ||
1292 | const UnicodeString &AlphabeticIndex::getRecordName() const { | |
1293 | const UnicodeString *retStr = EMPTY_STRING; | |
1294 | if (currentBucket_ != NULL && | |
1295 | itemsIterIndex_ >= 0 && | |
1296 | itemsIterIndex_ < currentBucket_->records_->size()) { | |
1297 | Record *item = static_cast<Record *>(currentBucket_->records_->elementAt(itemsIterIndex_)); | |
1298 | retStr = &item->name_; | |
1299 | } | |
1300 | return *retStr; | |
1301 | } | |
1302 | ||
1303 | const void *AlphabeticIndex::getRecordData() const { | |
1304 | const void *retPtr = NULL; | |
1305 | if (currentBucket_ != NULL && | |
1306 | itemsIterIndex_ >= 0 && | |
1307 | itemsIterIndex_ < currentBucket_->records_->size()) { | |
1308 | Record *item = static_cast<Record *>(currentBucket_->records_->elementAt(itemsIterIndex_)); | |
1309 | retPtr = item->data_; | |
1310 | } | |
1311 | return retPtr; | |
1312 | } | |
1313 | ||
1314 | ||
1315 | AlphabeticIndex & AlphabeticIndex::resetRecordIterator() { | |
1316 | itemsIterIndex_ = -1; | |
1317 | return *this; | |
1318 | } | |
1319 | ||
1320 | ||
1321 | ||
1322 | AlphabeticIndex::Bucket::Bucket(const UnicodeString &label, | |
1323 | const UnicodeString &lowerBoundary, | |
1324 | UAlphabeticIndexLabelType type, | |
1325 | UErrorCode &status): | |
1326 | label_(label), lowerBoundary_(lowerBoundary), labelType_(type), records_(NULL) { | |
1327 | if (U_FAILURE(status)) { | |
1328 | return; | |
1329 | } | |
1330 | records_ = new UVector(status); | |
1331 | if (records_ == NULL && U_SUCCESS(status)) { | |
1332 | status = U_MEMORY_ALLOCATION_ERROR; | |
1333 | } | |
1334 | } | |
1335 | ||
1336 | ||
1337 | AlphabeticIndex::Bucket::~Bucket() { | |
1338 | delete records_; | |
1339 | } | |
1340 | ||
1341 | U_NAMESPACE_END | |
1342 | ||
1343 | #endif |