]> git.saurik.com Git - apple/icu.git/blob - icuSources/i18n/colldata.cpp
ICU-461.17.tar.gz
[apple/icu.git] / icuSources / i18n / colldata.cpp
1 /*
2 ******************************************************************************
3 * Copyright (C) 1996-2009, International Business Machines *
4 * Corporation and others. All Rights Reserved. *
5 ******************************************************************************
6 */
7
8 #include "unicode/utypes.h"
9
10 #if !UCONFIG_NO_COLLATION
11
12 #include "unicode/unistr.h"
13 #include "unicode/putil.h"
14 #include "unicode/usearch.h"
15
16 #include "cmemory.h"
17 #include "unicode/coll.h"
18 #include "unicode/tblcoll.h"
19 #include "unicode/coleitr.h"
20 #include "unicode/ucoleitr.h"
21
22 #include "unicode/regex.h" // TODO: make conditional on regexp being built.
23
24 #include "unicode/uniset.h"
25 #include "unicode/uset.h"
26 #include "unicode/ustring.h"
27 #include "hash.h"
28 #include "uhash.h"
29 #include "ucln_in.h"
30 #include "ucol_imp.h"
31 #include "umutex.h"
32
33 #include "unicode/colldata.h"
34
35 U_NAMESPACE_BEGIN
36
// Number of elements in a true array (must not be used on a pointer).
// The argument is parenthesized so expression arguments expand correctly.
#define ARRAY_SIZE(array) (sizeof(array)/sizeof((array)[0]))

// Allocates uninitialized storage for `count` objects of `type` through
// uprv_malloc; yields NULL on failure. No constructors are run. The whole
// expansion is parenthesized so it can be used safely inside larger expressions.
#define NEW_ARRAY(type, count) ((type *) uprv_malloc((count) * sizeof(type)))

// Releases storage obtained from NEW_ARRAY. No destructors are run.
#define DELETE_ARRAY(array) uprv_free((void *) (array))

// Raw byte copy of `count` elements from src to dst (element size taken from src).
#define ARRAY_COPY(dst, src, count) uprv_memcpy((void *) (dst), (void *) (src), (count) * sizeof (src)[0])
41
// Expands the UOBJECT_DEFINE_RTTI_IMPLEMENTATION macro (defined outside this file) for CEList.
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CEList)

#ifdef INSTRUMENT_CELIST
// Instrumentation counters, compiled in only when INSTRUMENT_CELIST is defined.
// _active is incremented by the constructor and decremented by the destructor.
// _histogram[0] is bumped once per construction; add() bumps one slot each
// time the CE buffer has to grow.
int32_t CEList::_active = 0;
int32_t CEList::_histogram[10] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
#endif
48
49 CEList::CEList(UCollator *coll, const UnicodeString &string, UErrorCode &status)
50 : ces(NULL), listMax(CELIST_BUFFER_SIZE), listSize(0)
51 {
52 UCollationElements *elems = ucol_openElements(coll, string.getBuffer(), string.length(), &status);
53 UCollationStrength strength = ucol_getStrength(coll);
54 UBool toShift = ucol_getAttribute(coll, UCOL_ALTERNATE_HANDLING, &status) == UCOL_SHIFTED;
55 uint32_t variableTop = ucol_getVariableTop(coll, &status);
56 uint32_t strengthMask = 0;
57 int32_t order;
58
59 if (U_FAILURE(status)) {
60 return;
61 }
62
63 // **** only set flag if string has Han(gul) ****
64 ucol_forceHanImplicit(elems, &status);
65
66 switch (strength)
67 {
68 default:
69 strengthMask |= UCOL_TERTIARYORDERMASK;
70 /* fall through */
71
72 case UCOL_SECONDARY:
73 strengthMask |= UCOL_SECONDARYORDERMASK;
74 /* fall through */
75
76 case UCOL_PRIMARY:
77 strengthMask |= UCOL_PRIMARYORDERMASK;
78 }
79
80 #ifdef INSTRUMENT_CELIST
81 _active += 1;
82 _histogram[0] += 1;
83 #endif
84
85 ces = ceBuffer;
86
87 while ((order = ucol_next(elems, &status)) != UCOL_NULLORDER) {
88 UBool cont = isContinuation(order);
89
90 order &= strengthMask;
91
92 if (toShift && variableTop > (uint32_t)order && (order & UCOL_PRIMARYORDERMASK) != 0) {
93 if (strength >= UCOL_QUATERNARY) {
94 order &= UCOL_PRIMARYORDERMASK;
95 } else {
96 order = UCOL_IGNORABLE;
97 }
98 }
99
100 if (order == UCOL_IGNORABLE) {
101 continue;
102 }
103
104 if (cont) {
105 order |= UCOL_CONTINUATION_MARKER;
106 }
107
108 add(order, status);
109 }
110
111 ucol_closeElements(elems);
112 }
113
114 CEList::~CEList()
115 {
116 #ifdef INSTRUMENT_CELIST
117 _active -= 1;
118 #endif
119
120 if (ces != ceBuffer) {
121 DELETE_ARRAY(ces);
122 }
123 }
124
125 void CEList::add(uint32_t ce, UErrorCode &status)
126 {
127 if (U_FAILURE(status)) {
128 return;
129 }
130
131 if (listSize >= listMax) {
132 int32_t newMax = listMax + CELIST_BUFFER_SIZE;
133
134 #ifdef INSTRUMENT_CELIST
135 _histogram[listSize / CELIST_BUFFER_SIZE] += 1;
136 #endif
137
138 uint32_t *newCEs = NEW_ARRAY(uint32_t, newMax);
139
140 if (newCEs == NULL) {
141 status = U_MEMORY_ALLOCATION_ERROR;
142 return;
143 }
144
145 uprv_memcpy(newCEs, ces, listSize * sizeof(uint32_t));
146
147 if (ces != ceBuffer) {
148 DELETE_ARRAY(ces);
149 }
150
151 ces = newCEs;
152 listMax = newMax;
153 }
154
155 ces[listSize++] = ce;
156 }
157
158 uint32_t CEList::get(int32_t index) const
159 {
160 if (index >= 0 && index < listSize) {
161 return ces[index];
162 }
163
164 return UCOL_NULLORDER;
165 }
166
167 uint32_t &CEList::operator[](int32_t index) const
168 {
169 return ces[index];
170 }
171
172 UBool CEList::matchesAt(int32_t offset, const CEList *other) const
173 {
174 if (other == NULL || listSize - offset < other->size()) {
175 return FALSE;
176 }
177
178 for (int32_t i = offset, j = 0; j < other->size(); i += 1, j += 1) {
179 if (ces[i] != (*other)[j]) {
180 return FALSE;
181 }
182 }
183
184 return TRUE;
185 }
186
187 int32_t CEList::size() const
188 {
189 return listSize;
190 }
191
// Expands the UOBJECT_DEFINE_RTTI_IMPLEMENTATION macro (defined outside this file) for StringList.
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(StringList)

#ifdef INSTRUMENT_STRING_LIST
// Instrumentation counters, compiled in only when INSTRUMENT_STRING_LIST is defined.
// _lists counts successful constructions, _strings counts add() calls, and
// _histogram records list growth (slot 100 collects everything past 100 growths).
int32_t StringList::_lists = 0;
int32_t StringList::_strings = 0;
int32_t StringList::_histogram[101] = {0};
#endif
199
200 StringList::StringList(UErrorCode &status)
201 : strings(NULL), listMax(STRING_LIST_BUFFER_SIZE), listSize(0)
202 {
203 if (U_FAILURE(status)) {
204 return;
205 }
206
207 strings = new UnicodeString [listMax];
208
209 if (strings == NULL) {
210 status = U_MEMORY_ALLOCATION_ERROR;
211 return;
212 }
213
214 #ifdef INSTRUMENT_STRING_LIST
215 _lists += 1;
216 _histogram[0] += 1;
217 #endif
218 }
219
220 StringList::~StringList()
221 {
222 delete[] strings;
223 }
224
225 void StringList::add(const UnicodeString *string, UErrorCode &status)
226 {
227 if (U_FAILURE(status)) {
228 return;
229 }
230
231 #ifdef INSTRUMENT_STRING_LIST
232 _strings += 1;
233 #endif
234
235 if (listSize >= listMax) {
236 int32_t newMax = listMax + STRING_LIST_BUFFER_SIZE;
237
238 UnicodeString *newStrings = new UnicodeString[newMax];
239
240 uprv_memcpy(newStrings, strings, listSize * sizeof(UnicodeString));
241
242 #ifdef INSTRUMENT_STRING_LIST
243 int32_t _h = listSize / STRING_LIST_BUFFER_SIZE;
244
245 if (_h > 100) {
246 _h = 100;
247 }
248
249 _histogram[_h] += 1;
250 #endif
251
252 delete[] strings;
253 strings = newStrings;
254 listMax = newMax;
255 }
256
257 // The ctor initialized all the strings in
258 // the array to empty strings, so this
259 // is the same as copying the source string.
260 strings[listSize++].append(*string);
261 }
262
263 void StringList::add(const UChar *chars, int32_t count, UErrorCode &status)
264 {
265 const UnicodeString string(chars, count);
266
267 add(&string, status);
268 }
269
270 const UnicodeString *StringList::get(int32_t index) const
271 {
272 if (index >= 0 && index < listSize) {
273 return &strings[index];
274 }
275
276 return NULL;
277 }
278
279 int32_t StringList::size() const
280 {
281 return listSize;
282 }
283
284
// Value deleter installed on the CEToStringsMap hash table; defined below.
U_CFUNC void deleteStringList(void *obj);

/**
 * Maps a collation element (used as an integer hash key) to the StringList
 * of strings registered under that CE. The lists are deleted through the
 * table's value deleter (deleteStringList).
 */
class CEToStringsMap : public UMemory
{
public:

    CEToStringsMap(UErrorCode &status);
    ~CEToStringsMap();

    // Appends a copy of `string` to the list for `ce`, creating the list on first use.
    void put(uint32_t ce, UnicodeString *string, UErrorCode &status);
    // Returns the list stored for `ce`, or NULL if there is none.
    StringList *getStringList(uint32_t ce) const;

private:

    // Stores `stringList` under `ce` in the underlying table.
    void putStringList(uint32_t ce, StringList *stringList, UErrorCode &status);
    UHashtable *map;
};
302
303 CEToStringsMap::CEToStringsMap(UErrorCode &status)
304 : map(NULL)
305 {
306 if (U_FAILURE(status)) {
307 return;
308 }
309
310 map = uhash_open(uhash_hashLong, uhash_compareLong,
311 uhash_compareCaselessUnicodeString,
312 &status);
313
314 if (U_FAILURE(status)) {
315 return;
316 }
317
318 uhash_setValueDeleter(map, deleteStringList);
319 }
320
321 CEToStringsMap::~CEToStringsMap()
322 {
323 uhash_close(map);
324 }
325
326 void CEToStringsMap::put(uint32_t ce, UnicodeString *string, UErrorCode &status)
327 {
328 StringList *strings = getStringList(ce);
329
330 if (strings == NULL) {
331 strings = new StringList(status);
332
333 if (strings == NULL || U_FAILURE(status)) {
334 status = U_MEMORY_ALLOCATION_ERROR;
335 return;
336 }
337
338 putStringList(ce, strings, status);
339 }
340
341 strings->add(string, status);
342 }
343
344 StringList *CEToStringsMap::getStringList(uint32_t ce) const
345 {
346 return (StringList *) uhash_iget(map, ce);
347 }
348
349 void CEToStringsMap::putStringList(uint32_t ce, StringList *stringList, UErrorCode &status)
350 {
351 uhash_iput(map, ce, (void *) stringList, &status);
352 }
353
354 U_CFUNC void deleteStringList(void *obj)
355 {
356 StringList *strings = (StringList *) obj;
357
358 delete strings;
359 }
360
// Value and key deleters installed on the StringToCEsMap hash table; defined below.
U_CFUNC void deleteCEList(void *obj);
U_CFUNC void deleteUnicodeStringKey(void *obj);

/**
 * Maps a UnicodeString key to the CEList generated for it. Both the key and
 * the value are deleted through the table's deleters. Within this file the
 * map is only instantiated when CACHE_CELISTS is defined.
 */
class StringToCEsMap : public UMemory
{
public:
    StringToCEsMap(UErrorCode &status);
    ~StringToCEsMap();

    // Stores `ces` under `string`.
    void put(const UnicodeString *string, const CEList *ces, UErrorCode &status);
    // Returns the list stored for `string`, or NULL if there is none.
    const CEList *get(const UnicodeString *string);
    // NOTE(review): declared here, but no definition is visible in this file.
    void free(const CEList *list);

private:


    UHashtable *map;
};
379
380 StringToCEsMap::StringToCEsMap(UErrorCode &status)
381 : map(NULL)
382 {
383 if (U_FAILURE(status)) {
384 return;
385 }
386
387 map = uhash_open(uhash_hashUnicodeString,
388 uhash_compareUnicodeString,
389 uhash_compareLong,
390 &status);
391
392 if (U_FAILURE(status)) {
393 return;
394 }
395
396 uhash_setValueDeleter(map, deleteCEList);
397 uhash_setKeyDeleter(map, deleteUnicodeStringKey);
398 }
399
400 StringToCEsMap::~StringToCEsMap()
401 {
402 uhash_close(map);
403 }
404
405 void StringToCEsMap::put(const UnicodeString *string, const CEList *ces, UErrorCode &status)
406 {
407 uhash_put(map, (void *) string, (void *) ces, &status);
408 }
409
410 const CEList *StringToCEsMap::get(const UnicodeString *string)
411 {
412 return (const CEList *) uhash_get(map, string);
413 }
414
415 U_CFUNC void deleteCEList(void *obj)
416 {
417 CEList *list = (CEList *) obj;
418
419 delete list;
420 }
421
422 U_CFUNC void deleteUnicodeStringKey(void *obj)
423 {
424 UnicodeString *key = (UnicodeString *) obj;
425
426 delete key;
427 }
428
/**
 * One slot of the CollData cache: a CollData object plus the number of
 * outstanding references to it. The entry deletes its CollData when it is
 * itself destroyed.
 */
class CollDataCacheEntry : public UMemory
{
public:
    CollDataCacheEntry(CollData *theData);
    ~CollDataCacheEntry();

    CollData *data;     // deleted by the destructor
    int32_t refCount;   // starts at 1; adjusted by CollDataCache::get() / unref()
};
438
439 CollDataCacheEntry::CollDataCacheEntry(CollData *theData)
440 : data(theData), refCount(1)
441 {
442 // nothing else to do
443 }
444
445 CollDataCacheEntry::~CollDataCacheEntry()
446 {
447 // check refCount?
448 delete data;
449 }
450
/**
 * Cache of CollData objects keyed by a character string derived from the
 * collator (see getKey). Table accesses are made while holding `lock`.
 */
class CollDataCache : public UMemory
{
public:
    CollDataCache(UErrorCode &status);
    ~CollDataCache();

    // Returns the cached CollData for `collator`, building and caching one if needed.
    CollData *get(UCollator *collator, UErrorCode &status);
    // Drops one reference from the entry that holds `collData`.
    void unref(CollData *collData);

    // Removes every entry whose reference count is zero or less.
    void flush();

private:
    // Builds the cache key for `collator`. The third parameter is named
    // keyBufferLength in the definition.
    static char *getKey(UCollator *collator, char *keyBuffer, int32_t *charBufferLength);
    // Frees a key that getKey had to allocate.
    static void deleteKey(char *key);

    UMTX lock;
    UHashtable *cache;
};
469
470 U_CFUNC void deleteChars(void * /*obj*/)
471 {
472 // char *chars = (char *) obj;
473 // All the key strings are owned by the
474 // CollData objects and don't need to
475 // be freed here.
476 //DELETE_ARRAY(chars);
477 }
478
479 U_CFUNC void deleteCollDataCacheEntry(void *obj)
480 {
481 CollDataCacheEntry *entry = (CollDataCacheEntry *) obj;
482
483 delete entry;
484 }
485
486 CollDataCache::CollDataCache(UErrorCode &status)
487 : lock(0), cache(NULL)
488 {
489 if (U_FAILURE(status)) {
490 return;
491 }
492
493 cache = uhash_open(uhash_hashChars, uhash_compareChars, uhash_compareLong, &status);
494
495 if (U_FAILURE(status)) {
496 return;
497 }
498
499 uhash_setValueDeleter(cache, deleteCollDataCacheEntry);
500 uhash_setKeyDeleter(cache, deleteChars);
501 }
502
503 CollDataCache::~CollDataCache()
504 {
505 umtx_lock(&lock);
506 uhash_close(cache);
507 cache = NULL;
508 umtx_unlock(&lock);
509
510 umtx_destroy(&lock);
511 }
512
513 CollData *CollDataCache::get(UCollator *collator, UErrorCode &status)
514 {
515 char keyBuffer[KEY_BUFFER_SIZE];
516 int32_t keyLength = KEY_BUFFER_SIZE;
517 char *key = getKey(collator, keyBuffer, &keyLength);
518 CollData *result = NULL, *newData = NULL;
519 CollDataCacheEntry *entry = NULL, *newEntry = NULL;
520
521 umtx_lock(&lock);
522 entry = (CollDataCacheEntry *) uhash_get(cache, key);
523
524 if (entry == NULL) {
525 umtx_unlock(&lock);
526
527 newData = new CollData(collator, key, keyLength, status);
528 newEntry = new CollDataCacheEntry(newData);
529
530 if (U_FAILURE(status) || newData == NULL || newEntry == NULL) {
531 status = U_MEMORY_ALLOCATION_ERROR;
532 return NULL;
533 }
534
535 umtx_lock(&lock);
536 entry = (CollDataCacheEntry *) uhash_get(cache, key);
537
538 if (entry == NULL) {
539 uhash_put(cache, newData->key, newEntry, &status);
540 umtx_unlock(&lock);
541
542 if (U_FAILURE(status)) {
543 delete newEntry;
544 delete newData;
545
546 return NULL;
547 }
548
549 return newData;
550 }
551 }
552
553 result = entry->data;
554 entry->refCount += 1;
555 umtx_unlock(&lock);
556
557 if (key != keyBuffer) {
558 deleteKey(key);
559 }
560
561 if (newEntry != NULL) {
562 delete newEntry;
563 delete newData;
564 }
565
566 return result;
567 }
568
569 void CollDataCache::unref(CollData *collData)
570 {
571 CollDataCacheEntry *entry = NULL;
572
573 umtx_lock(&lock);
574 entry = (CollDataCacheEntry *) uhash_get(cache, collData->key);
575
576 if (entry != NULL) {
577 entry->refCount -= 1;
578 }
579 umtx_unlock(&lock);
580 }
581
582 char *CollDataCache::getKey(UCollator *collator, char *keyBuffer, int32_t *keyBufferLength)
583 {
584 UErrorCode status = U_ZERO_ERROR;
585 int32_t len = ucol_getShortDefinitionString(collator, NULL, keyBuffer, *keyBufferLength, &status);
586
587 if (len >= *keyBufferLength) {
588 *keyBufferLength = (len + 2) & ~1; // round to even length, leaving room for terminating null
589 keyBuffer = NEW_ARRAY(char, *keyBufferLength);
590 status = U_ZERO_ERROR;
591
592 len = ucol_getShortDefinitionString(collator, NULL, keyBuffer, *keyBufferLength, &status);
593 }
594
595 keyBuffer[len] = '\0';
596
597 return keyBuffer;
598 }
599
600 void CollDataCache::flush()
601 {
602 const UHashElement *element;
603 int32_t pos = -1;
604
605 umtx_lock(&lock);
606 while ((element = uhash_nextElement(cache, &pos)) != NULL) {
607 CollDataCacheEntry *entry = (CollDataCacheEntry *) element->value.pointer;
608
609 if (entry->refCount <= 0) {
610 uhash_removeElement(cache, element);
611 }
612 }
613 umtx_unlock(&lock);
614 }
615
616 void CollDataCache::deleteKey(char *key)
617 {
618 DELETE_ARRAY(key);
619 }
620
621 U_CDECL_BEGIN
622 static UBool coll_data_cleanup(void) {
623 CollData::freeCollDataCache();
624 return TRUE;
625 }
626 U_CDECL_END
627
628 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CollData)
629
630 CollData::CollData()
631 {
632 // nothing
633 }
634
635 #define CLONE_COLLATOR
636
637 //#define CACHE_CELISTS
638 CollData::CollData(UCollator *collator, char *cacheKey, int32_t cacheKeyLength, UErrorCode &status)
639 : coll(NULL), charsToCEList(NULL), ceToCharsStartingWith(NULL), key(NULL)
640 {
641 // [:c:] == [[:cn:][:cc:][:co:][:cf:][:cs:]]
642 // i.e. other, control, private use, format, surrogate
643 U_STRING_DECL(test_pattern, "[[:assigned:]-[:c:]]", 20);
644 U_STRING_INIT(test_pattern, "[[:assigned:]-[:c:]]", 20);
645 USet *charsToTest = uset_openPattern(test_pattern, 20, &status);
646
647 // Han ext. A, Han, Jamo, Hangul, Han Ext. B
648 // i.e. all the characers we handle implicitly
649 U_STRING_DECL(remove_pattern, "[[\\u3400-\\u9FFF][\\u1100-\\u11F9][\\uAC00-\\uD7AF][\\U00020000-\\U0002A6DF]]", 70);
650 U_STRING_INIT(remove_pattern, "[[\\u3400-\\u9FFF][\\u1100-\\u11F9][\\uAC00-\\uD7AF][\\U00020000-\\U0002A6DF]]", 70);
651 USet *charsToRemove = uset_openPattern(remove_pattern, 70, &status);
652
653 if (U_FAILURE(status)) {
654 return;
655 }
656
657 USet *expansions = uset_openEmpty();
658 USet *contractions = uset_openEmpty();
659 int32_t itemCount;
660
661 #ifdef CACHE_CELISTS
662 charsToCEList = new StringToCEsMap(status);
663
664 if (U_FAILURE(status)) {
665 goto bail;
666 }
667 #else
668 charsToCEList = NULL;
669 #endif
670
671 ceToCharsStartingWith = new CEToStringsMap(status);
672
673 if (U_FAILURE(status)) {
674 goto bail;
675 }
676
677 if (cacheKeyLength > KEY_BUFFER_SIZE) {
678 key = NEW_ARRAY(char, cacheKeyLength);
679
680 if (key == NULL) {
681 status = U_MEMORY_ALLOCATION_ERROR;
682 goto bail;
683 }
684 } else {
685 key = keyBuffer;
686 }
687
688 ARRAY_COPY(key, cacheKey, cacheKeyLength);
689
690 #ifdef CLONE_COLLATOR
691 coll = ucol_safeClone(collator, NULL, NULL, &status);
692
693 if (U_FAILURE(status)) {
694 goto bail;
695 }
696 #else
697 coll = collator;
698 #endif
699
700 ucol_getContractionsAndExpansions(coll, contractions, expansions, FALSE, &status);
701
702 uset_addAll(charsToTest, contractions);
703 uset_addAll(charsToTest, expansions);
704 uset_removeAll(charsToTest, charsToRemove);
705
706 itemCount = uset_getItemCount(charsToTest);
707 for(int32_t item = 0; item < itemCount; item += 1) {
708 UChar32 start = 0, end = 0;
709 UChar buffer[16];
710 int32_t len = uset_getItem(charsToTest, item, &start, &end,
711 buffer, 16, &status);
712
713 if (len == 0) {
714 for (UChar32 ch = start; ch <= end; ch += 1) {
715 UnicodeString *st = new UnicodeString(ch);
716
717 if (st == NULL) {
718 status = U_MEMORY_ALLOCATION_ERROR;
719 break;
720 }
721
722 CEList *ceList = new CEList(coll, *st, status);
723
724 ceToCharsStartingWith->put(ceList->get(0), st, status);
725
726 #ifdef CACHE_CELISTS
727 charsToCEList->put(st, ceList, status);
728 #else
729 delete ceList;
730 delete st;
731 #endif
732 }
733 } else if (len > 0) {
734 UnicodeString *st = new UnicodeString(buffer, len);
735
736 if (st == NULL) {
737 status = U_MEMORY_ALLOCATION_ERROR;
738 break;
739 }
740
741 CEList *ceList = new CEList(coll, *st, status);
742
743 ceToCharsStartingWith->put(ceList->get(0), st, status);
744
745 #ifdef CACHE_CELISTS
746 charsToCEList->put(st, ceList, status);
747 #else
748 delete ceList;
749 delete st;
750 #endif
751 } else {
752 // shouldn't happen...
753 }
754
755 if (U_FAILURE(status)) {
756 break;
757 }
758 }
759
760 bail:
761 uset_close(contractions);
762 uset_close(expansions);
763 uset_close(charsToRemove);
764 uset_close(charsToTest);
765
766 if (U_FAILURE(status)) {
767 return;
768 }
769
770 UChar32 hanRanges[] = {UCOL_FIRST_HAN, UCOL_LAST_HAN, UCOL_FIRST_HAN_COMPAT, UCOL_LAST_HAN_COMPAT, UCOL_FIRST_HAN_A, UCOL_LAST_HAN_A,
771 UCOL_FIRST_HAN_B, UCOL_LAST_HAN_B};
772 UChar jamoRanges[] = {UCOL_FIRST_L_JAMO, UCOL_FIRST_V_JAMO, UCOL_FIRST_T_JAMO, UCOL_LAST_T_JAMO};
773 UnicodeString hanString = UnicodeString::fromUTF32(hanRanges, ARRAY_SIZE(hanRanges));
774 UnicodeString jamoString(FALSE, jamoRanges, ARRAY_SIZE(jamoRanges));
775 CEList hanList(coll, hanString, status);
776 CEList jamoList(coll, jamoString, status);
777 int32_t j = 0;
778
779 if (U_FAILURE(status)) {
780 return;
781 }
782
783 for (int32_t c = 0; c < jamoList.size(); c += 1) {
784 uint32_t jce = jamoList[c];
785
786 if (! isContinuation(jce)) {
787 jamoLimits[j++] = jce;
788 }
789 }
790
791 jamoLimits[3] += (1 << UCOL_PRIMARYORDERSHIFT);
792
793 minHan = 0xFFFFFFFF;
794 maxHan = 0;
795
796 for(int32_t h = 0; h < hanList.size(); h += 2) {
797 uint32_t han = (uint32_t) hanList[h];
798
799 if (han < minHan) {
800 minHan = han;
801 }
802
803 if (han > maxHan) {
804 maxHan = han;
805 }
806 }
807
808 maxHan += (1 << UCOL_PRIMARYORDERSHIFT);
809 }
810
811 CollData::~CollData()
812 {
813 #ifdef CLONE_COLLATOR
814 ucol_close(coll);
815 #endif
816
817 if (key != keyBuffer) {
818 DELETE_ARRAY(key);
819 }
820
821 delete ceToCharsStartingWith;
822
823 #ifdef CACHE_CELISTS
824 delete charsToCEList;
825 #endif
826 }
827
828 UCollator *CollData::getCollator() const
829 {
830 return coll;
831 }
832
833 const StringList *CollData::getStringList(int32_t ce) const
834 {
835 return ceToCharsStartingWith->getStringList(ce);
836 }
837
838 const CEList *CollData::getCEList(const UnicodeString *string) const
839 {
840 #ifdef CACHE_CELISTS
841 return charsToCEList->get(string);
842 #else
843 UErrorCode status = U_ZERO_ERROR;
844 const CEList *list = new CEList(coll, *string, status);
845
846 if (U_FAILURE(status)) {
847 delete list;
848 list = NULL;
849 }
850
851 return list;
852 #endif
853 }
854
855 void CollData::freeCEList(const CEList *list)
856 {
857 #ifndef CACHE_CELISTS
858 delete list;
859 #endif
860 }
861
/**
 * Computes the smallest number of UChars that can produce the CEs of
 * `ceList` from `offset` to the end, memoizing results in `history`.
 *
 * For the CE at `offset` every string registered under that CE is tried; a
 * string whose full CE list matches at `offset` contributes its length plus
 * the (recursively computed) minimum for the CEs that follow it. If no
 * registered string applies, CEs in the Han or Jamo ranges are handled
 * manually.
 *
 * @param ceList  CE sequence being measured
 * @param offset  index of the first CE to account for
 * @param history one slot per CE; a value >= 0 is a previously computed
 *                result for that offset, a negative value means "not yet
 *                computed"
 * @return the minimum length in UChars, or -1 if the CEs from `offset` on
 *         cannot be accounted for (such dead ends are not stored in `history`)
 */
int32_t CollData::minLengthInChars(const CEList *ceList, int32_t offset, int32_t *history) const
{
    // find out shortest string for the longest sequence of ces.
    // this can probably be folded with the minLengthCache...

    // Reuse a result computed earlier for this offset.
    if (history[offset] >= 0) {
        return history[offset];
    }

    uint32_t ce = ceList->get(offset);
    int32_t maxOffset = ceList->size();
    int32_t shortestLength = INT32_MAX;   // INT32_MAX == "nothing found yet"
    const StringList *strings = ceToCharsStartingWith->getStringList(ce);

    if (strings != NULL) {
        int32_t stringCount = strings->size();

        for (int32_t s = 0; s < stringCount; s += 1) {
            const UnicodeString *string = strings->get(s);
#ifdef CACHE_CELISTS
            const CEList *ceList2 = charsToCEList->get(string);
#else
            UErrorCode status = U_ZERO_ERROR;
            const CEList *ceList2 = new CEList(coll, *string, status);

            if (U_FAILURE(status)) {
                delete ceList2;
                ceList2 = NULL;
            }
#endif

            // matchesAt() returns FALSE for a NULL list, so a failed
            // ceList2 is simply skipped.
            if (ceList->matchesAt(offset, ceList2)) {
                int32_t clength = ceList2->size();
                int32_t slength = string->length();
                int32_t roffset = offset + clength;   // first CE after this match
                int32_t rlength = 0;

                if (roffset < maxOffset) {
                    rlength = minLengthInChars(ceList, roffset, history);

                    if (rlength <= 0) {
                        // delete before continue to avoid memory leak.
#ifndef CACHE_CELISTS
                        delete ceList2;
#endif
                        // ignore any dead ends
                        continue;
                    }
                }

                if (shortestLength > slength + rlength) {
                    shortestLength = slength + rlength;
                }
            }

#ifndef CACHE_CELISTS
            delete ceList2;
#endif
        }
    }

    if (shortestLength == INT32_MAX) {
        // No matching strings at this offset. See if
        // the CE is in a range we can handle manually.
        if (ce >= minHan && ce < maxHan) {
            // all han have implicit orders which
            // generate two CEs.
            int32_t roffset = offset + 2;
            int32_t rlength = 0;

            //history[roffset++] = -1;
            //history[roffset++] = 1;

            if (roffset < maxOffset) {
                rlength = minLengthInChars(ceList, roffset, history);
            }

            if (rlength < 0) {
                return -1;
            }

            // One Han character accounts for the two CEs skipped above.
            shortestLength = 1 + rlength;
            goto have_shortest;
        } else if (ce >= jamoLimits[0] && ce < jamoLimits[3]) {
            int32_t roffset = offset;
            int32_t rlength = 0;

            // **** this loop may not handle archaic Hangul correctly ****
            // Advance roffset over CEs that fall, in order, into the
            // successive jamoLimits ranges.
            for (int32_t j = 0; roffset < maxOffset && j < 4; j += 1, roffset += 1) {
                uint32_t jce = ceList->get(roffset);

                // Some Jamo have 24-bit primary order; skip the
                // 2nd CE. This should always be OK because if
                // we're still in the loop all we've seen are
                // a series of Jamo in LVT order.
                if (isContinuation(jce)) {
                    continue;
                }

                if (j >= 3 || jce < jamoLimits[j] || jce >= jamoLimits[j + 1]) {
                    break;
                }
            }

            if (roffset == offset) {
                // we started with a non-L Jamo...
                // just say it comes from a single character
                roffset += 1;

                // See if the single Jamo has a 24-bit order.
                if (roffset < maxOffset && isContinuation(ceList->get(roffset))) {
                    roffset += 1;
                }
            }

            if (roffset < maxOffset) {
                rlength = minLengthInChars(ceList, roffset, history);
            }

            if (rlength < 0) {
                return -1;
            }

            // The consumed Jamo CEs are counted as one character.
            shortestLength = 1 + rlength;
            goto have_shortest;
        }

        // Can't handle it manually either. Just move on.
        return -1;
    }

have_shortest:
    // Memoize the result for this offset.
    history[offset] = shortestLength;

    return shortestLength;
}
998
999 int32_t CollData::minLengthInChars(const CEList *ceList, int32_t offset) const
1000 {
1001 int32_t clength = ceList->size();
1002 int32_t *history = NEW_ARRAY(int32_t, clength);
1003
1004 for (int32_t i = 0; i < clength; i += 1) {
1005 history[i] = -1;
1006 }
1007
1008 int32_t minLength = minLengthInChars(ceList, offset, history);
1009
1010 DELETE_ARRAY(history);
1011
1012 return minLength;
1013 }
1014
1015 CollData *CollData::open(UCollator *collator, UErrorCode &status)
1016 {
1017 if (U_FAILURE(status)) {
1018 return NULL;
1019 }
1020
1021 CollDataCache *cache = getCollDataCache();
1022
1023 return cache->get(collator, status);
1024 }
1025
1026 void CollData::close(CollData *collData)
1027 {
1028 CollDataCache *cache = getCollDataCache();
1029
1030 cache->unref(collData);
1031 }
1032
1033 CollDataCache *CollData::collDataCache = NULL;
1034
1035 CollDataCache *CollData::getCollDataCache()
1036 {
1037 UErrorCode status = U_ZERO_ERROR;
1038 CollDataCache *cache = NULL;
1039
1040 UMTX_CHECK(NULL, collDataCache, cache);
1041
1042 if (cache == NULL) {
1043 cache = new CollDataCache(status);
1044
1045 if (U_FAILURE(status)) {
1046 delete cache;
1047 return NULL;
1048 }
1049
1050 umtx_lock(NULL);
1051 if (collDataCache == NULL) {
1052 collDataCache = cache;
1053
1054 ucln_i18n_registerCleanup(UCLN_I18N_COLL_DATA, coll_data_cleanup);
1055 }
1056 umtx_unlock(NULL);
1057
1058 if (collDataCache != cache) {
1059 delete cache;
1060 }
1061 }
1062
1063 return collDataCache;
1064 }
1065
1066 void CollData::freeCollDataCache()
1067 {
1068 CollDataCache *cache = NULL;
1069
1070 UMTX_CHECK(NULL, collDataCache, cache);
1071
1072 if (cache != NULL) {
1073 umtx_lock(NULL);
1074 if (collDataCache != NULL) {
1075 collDataCache = NULL;
1076 } else {
1077 cache = NULL;
1078 }
1079 umtx_unlock(NULL);
1080
1081 delete cache;
1082 }
1083 }
1084
1085 void CollData::flushCollDataCache()
1086 {
1087 CollDataCache *cache = NULL;
1088
1089 UMTX_CHECK(NULL, collDataCache, cache);
1090
1091 // **** this will fail if the another ****
1092 // **** thread deletes the cache here ****
1093 if (cache != NULL) {
1094 cache->flush();
1095 }
1096 }
1097
1098 U_NAMESPACE_END
1099
1100 #endif // #if !UCONFIG_NO_COLLATION