]> git.saurik.com Git - apple/icu.git/blob - icuSources/i18n/colldata.cpp
ICU-491.11.2.tar.gz
[apple/icu.git] / icuSources / i18n / colldata.cpp
1 /*
2 ******************************************************************************
3 * Copyright (C) 1996-2011, International Business Machines *
4 * Corporation and others. All Rights Reserved. *
5 ******************************************************************************
6 */
7
8 #include "unicode/utypes.h"
9
10 #if !UCONFIG_NO_COLLATION
11
12 #include "unicode/unistr.h"
13 #include "unicode/putil.h"
14 #include "unicode/usearch.h"
15
16 #include "cmemory.h"
17 #include "unicode/coll.h"
18 #include "unicode/tblcoll.h"
19 #include "unicode/coleitr.h"
20 #include "unicode/ucoleitr.h"
21
22 #include "unicode/regex.h" // TODO: make conditional on regexp being built.
23
24 #include "unicode/uniset.h"
25 #include "unicode/uset.h"
26 #include "unicode/ustring.h"
27 #include "hash.h"
28 #include "uhash.h"
29 #include "ucln_in.h"
30 #include "ucol_imp.h"
31 #include "umutex.h"
32 #include "uassert.h"
33
34 #include "unicode/colldata.h"
35
36 U_NAMESPACE_BEGIN
37
// Number of elements in a statically sized array.
#define ARRAY_SIZE(array) (sizeof(array) / sizeof((array)[0]))

// Raw storage for `count` objects of `type`, obtained from uprv_malloc.
// No constructors are run; the result may be NULL.
#define NEW_ARRAY(type, count) (type *) uprv_malloc((count) * sizeof(type))

// Releases storage obtained through NEW_ARRAY.
#define DELETE_ARRAY(array) uprv_free((void *) (array))

// Copies `count` elements from src to dst; the element size is taken from src.
#define ARRAY_COPY(dst, src, count) uprv_memcpy((void *) (dst), (void *) (src), (count) * sizeof (src)[0])
42
43 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CEList)
44
45 #ifdef INSTRUMENT_CELIST
46 int32_t CEList::_active = 0;
47 int32_t CEList::_histogram[10] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
48 #endif
49
50 CEList::CEList(UCollator *coll, const UnicodeString &string, UErrorCode &status)
51 : ces(NULL), listMax(CELIST_BUFFER_SIZE), listSize(0)
52 {
53 UCollationElements *elems = ucol_openElements(coll, string.getBuffer(), string.length(), &status);
54 UCollationStrength strength = ucol_getStrength(coll);
55 UBool toShift = ucol_getAttribute(coll, UCOL_ALTERNATE_HANDLING, &status) == UCOL_SHIFTED;
56 uint32_t variableTop = ucol_getVariableTop(coll, &status);
57 uint32_t strengthMask = 0;
58 int32_t order;
59
60 if (U_FAILURE(status)) {
61 return;
62 }
63
64 // **** only set flag if string has Han(gul) ****
65 ucol_forceHanImplicit(elems, &status);
66
67 switch (strength)
68 {
69 default:
70 strengthMask |= UCOL_TERTIARYORDERMASK;
71 /* fall through */
72
73 case UCOL_SECONDARY:
74 strengthMask |= UCOL_SECONDARYORDERMASK;
75 /* fall through */
76
77 case UCOL_PRIMARY:
78 strengthMask |= UCOL_PRIMARYORDERMASK;
79 }
80
81 #ifdef INSTRUMENT_CELIST
82 _active += 1;
83 _histogram[0] += 1;
84 #endif
85
86 ces = ceBuffer;
87
88 while ((order = ucol_next(elems, &status)) != UCOL_NULLORDER) {
89 UBool cont = isContinuation(order);
90
91 order &= strengthMask;
92
93 if (toShift && variableTop > (uint32_t)order && (order & UCOL_PRIMARYORDERMASK) != 0) {
94 if (strength >= UCOL_QUATERNARY) {
95 order &= UCOL_PRIMARYORDERMASK;
96 } else {
97 order = UCOL_IGNORABLE;
98 }
99 }
100
101 if (order == UCOL_IGNORABLE) {
102 continue;
103 }
104
105 if (cont) {
106 order |= UCOL_CONTINUATION_MARKER;
107 }
108
109 add(order, status);
110 }
111
112 ucol_closeElements(elems);
113 }
114
115 CEList::~CEList()
116 {
117 #ifdef INSTRUMENT_CELIST
118 _active -= 1;
119 #endif
120
121 if (ces != ceBuffer) {
122 DELETE_ARRAY(ces);
123 }
124 }
125
126 void CEList::add(uint32_t ce, UErrorCode &status)
127 {
128 if (U_FAILURE(status)) {
129 return;
130 }
131
132 if (listSize >= listMax) {
133 int32_t newMax = listMax + CELIST_BUFFER_SIZE;
134
135 #ifdef INSTRUMENT_CELIST
136 _histogram[listSize / CELIST_BUFFER_SIZE] += 1;
137 #endif
138
139 uint32_t *newCEs = NEW_ARRAY(uint32_t, newMax);
140
141 if (newCEs == NULL) {
142 status = U_MEMORY_ALLOCATION_ERROR;
143 return;
144 }
145
146 uprv_memcpy(newCEs, ces, listSize * sizeof(uint32_t));
147
148 if (ces != ceBuffer) {
149 DELETE_ARRAY(ces);
150 }
151
152 ces = newCEs;
153 listMax = newMax;
154 }
155
156 ces[listSize++] = ce;
157 }
158
159 uint32_t CEList::get(int32_t index) const
160 {
161 if (index >= 0 && index < listSize) {
162 return ces[index];
163 }
164
165 return UCOL_NULLORDER;
166 }
167
168 uint32_t &CEList::operator[](int32_t index) const
169 {
170 return ces[index];
171 }
172
173 UBool CEList::matchesAt(int32_t offset, const CEList *other) const
174 {
175 if (other == NULL || listSize - offset < other->size()) {
176 return FALSE;
177 }
178
179 for (int32_t i = offset, j = 0; j < other->size(); i += 1, j += 1) {
180 if (ces[i] != (*other)[j]) {
181 return FALSE;
182 }
183 }
184
185 return TRUE;
186 }
187
188 int32_t CEList::size() const
189 {
190 return listSize;
191 }
192
193 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(StringList)
194
195 #ifdef INSTRUMENT_STRING_LIST
196 int32_t StringList::_lists = 0;
197 int32_t StringList::_strings = 0;
198 int32_t StringList::_histogram[101] = {0};
199 #endif
200
201 StringList::StringList(UErrorCode &status)
202 : strings(NULL), listMax(STRING_LIST_BUFFER_SIZE), listSize(0)
203 {
204 if (U_FAILURE(status)) {
205 return;
206 }
207
208 strings = new UnicodeString [listMax];
209
210 if (strings == NULL) {
211 status = U_MEMORY_ALLOCATION_ERROR;
212 return;
213 }
214
215 #ifdef INSTRUMENT_STRING_LIST
216 _lists += 1;
217 _histogram[0] += 1;
218 #endif
219 }
220
221 StringList::~StringList()
222 {
223 delete[] strings;
224 }
225
226 void StringList::add(const UnicodeString *string, UErrorCode &status)
227 {
228 if (U_FAILURE(status)) {
229 return;
230 }
231
232 #ifdef INSTRUMENT_STRING_LIST
233 _strings += 1;
234 #endif
235
236 if (listSize >= listMax) {
237 int32_t newMax = listMax + STRING_LIST_BUFFER_SIZE;
238 UnicodeString *newStrings = new UnicodeString[newMax];
239 if (newStrings == NULL) {
240 status = U_MEMORY_ALLOCATION_ERROR;
241 return;
242 }
243 for (int32_t i=0; i<listSize; ++i) {
244 newStrings[i] = strings[i];
245 }
246
247 #ifdef INSTRUMENT_STRING_LIST
248 int32_t _h = listSize / STRING_LIST_BUFFER_SIZE;
249
250 if (_h > 100) {
251 _h = 100;
252 }
253
254 _histogram[_h] += 1;
255 #endif
256
257 delete[] strings;
258 strings = newStrings;
259 listMax = newMax;
260 }
261
262 // The ctor initialized all the strings in
263 // the array to empty strings, so this
264 // is the same as copying the source string.
265 strings[listSize++].append(*string);
266 }
267
268 void StringList::add(const UChar *chars, int32_t count, UErrorCode &status)
269 {
270 const UnicodeString string(chars, count);
271
272 add(&string, status);
273 }
274
275 const UnicodeString *StringList::get(int32_t index) const
276 {
277 if (index >= 0 && index < listSize) {
278 return &strings[index];
279 }
280
281 return NULL;
282 }
283
284 int32_t StringList::size() const
285 {
286 return listSize;
287 }
288
289
290 U_CFUNC void deleteStringList(void *obj);
291
292 class CEToStringsMap : public UMemory
293 {
294 public:
295
296 CEToStringsMap(UErrorCode &status);
297 ~CEToStringsMap();
298
299 void put(uint32_t ce, UnicodeString *string, UErrorCode &status);
300 StringList *getStringList(uint32_t ce) const;
301
302 private:
303
304 void putStringList(uint32_t ce, StringList *stringList, UErrorCode &status);
305 UHashtable *map;
306 };
307
308 CEToStringsMap::CEToStringsMap(UErrorCode &status)
309 : map(NULL)
310 {
311 if (U_FAILURE(status)) {
312 return;
313 }
314
315 map = uhash_open(uhash_hashLong, uhash_compareLong,
316 uhash_compareCaselessUnicodeString,
317 &status);
318
319 if (U_FAILURE(status)) {
320 return;
321 }
322
323 uhash_setValueDeleter(map, deleteStringList);
324 }
325
326 CEToStringsMap::~CEToStringsMap()
327 {
328 uhash_close(map);
329 }
330
331 void CEToStringsMap::put(uint32_t ce, UnicodeString *string, UErrorCode &status)
332 {
333 StringList *strings = getStringList(ce);
334
335 if (strings == NULL) {
336 strings = new StringList(status);
337
338 if (strings == NULL || U_FAILURE(status)) {
339 status = U_MEMORY_ALLOCATION_ERROR;
340 return;
341 }
342
343 putStringList(ce, strings, status);
344 }
345
346 strings->add(string, status);
347 }
348
349 StringList *CEToStringsMap::getStringList(uint32_t ce) const
350 {
351 return (StringList *) uhash_iget(map, ce);
352 }
353
354 void CEToStringsMap::putStringList(uint32_t ce, StringList *stringList, UErrorCode &status)
355 {
356 uhash_iput(map, ce, (void *) stringList, &status);
357 }
358
359 U_CFUNC void deleteStringList(void *obj)
360 {
361 StringList *strings = (StringList *) obj;
362
363 delete strings;
364 }
365
366 U_CFUNC void deleteCEList(void *obj);
367 U_CFUNC void deleteUnicodeStringKey(void *obj);
368
369 class StringToCEsMap : public UMemory
370 {
371 public:
372 StringToCEsMap(UErrorCode &status);
373 ~StringToCEsMap();
374
375 void put(const UnicodeString *string, const CEList *ces, UErrorCode &status);
376 const CEList *get(const UnicodeString *string);
377 void free(const CEList *list);
378
379 private:
380
381
382 UHashtable *map;
383 };
384
385 StringToCEsMap::StringToCEsMap(UErrorCode &status)
386 : map(NULL)
387 {
388 if (U_FAILURE(status)) {
389 return;
390 }
391
392 map = uhash_open(uhash_hashUnicodeString,
393 uhash_compareUnicodeString,
394 uhash_compareLong,
395 &status);
396
397 if (U_FAILURE(status)) {
398 return;
399 }
400
401 uhash_setValueDeleter(map, deleteCEList);
402 uhash_setKeyDeleter(map, deleteUnicodeStringKey);
403 }
404
405 StringToCEsMap::~StringToCEsMap()
406 {
407 uhash_close(map);
408 }
409
410 void StringToCEsMap::put(const UnicodeString *string, const CEList *ces, UErrorCode &status)
411 {
412 uhash_put(map, (void *) string, (void *) ces, &status);
413 }
414
415 const CEList *StringToCEsMap::get(const UnicodeString *string)
416 {
417 return (const CEList *) uhash_get(map, string);
418 }
419
420 U_CFUNC void deleteCEList(void *obj)
421 {
422 CEList *list = (CEList *) obj;
423
424 delete list;
425 }
426
427 U_CFUNC void deleteUnicodeStringKey(void *obj)
428 {
429 UnicodeString *key = (UnicodeString *) obj;
430
431 delete key;
432 }
433
434 class CollDataCacheEntry : public UMemory
435 {
436 public:
437 CollDataCacheEntry(CollData *theData);
438 ~CollDataCacheEntry();
439
440 CollData *data;
441 int32_t refCount;
442 };
443
444 CollDataCacheEntry::CollDataCacheEntry(CollData *theData)
445 : data(theData), refCount(1)
446 {
447 // nothing else to do
448 }
449
450 CollDataCacheEntry::~CollDataCacheEntry()
451 {
452 // check refCount?
453 delete data;
454 }
455
456 class CollDataCache : public UMemory
457 {
458 public:
459 CollDataCache(UErrorCode &status);
460 ~CollDataCache();
461
462 CollData *get(UCollator *collator, UErrorCode &status);
463 void unref(CollData *collData);
464
465 void flush();
466
467 private:
468 static char *getKey(UCollator *collator, char *keyBuffer, int32_t *charBufferLength);
469 static void deleteKey(char *key);
470
471 UHashtable *cache;
472 };
473 static UMTX lock;
474
475 U_CFUNC void deleteChars(void * /*obj*/)
476 {
477 // char *chars = (char *) obj;
478 // All the key strings are owned by the
479 // CollData objects and don't need to
480 // be freed here.
481 //DELETE_ARRAY(chars);
482 }
483
484 U_CFUNC void deleteCollDataCacheEntry(void *obj)
485 {
486 CollDataCacheEntry *entry = (CollDataCacheEntry *) obj;
487
488 delete entry;
489 }
490
491 CollDataCache::CollDataCache(UErrorCode &status)
492 : cache(NULL)
493 {
494 if (U_FAILURE(status)) {
495 return;
496 }
497
498 cache = uhash_open(uhash_hashChars, uhash_compareChars, uhash_compareLong, &status);
499
500 if (U_FAILURE(status)) {
501 return;
502 }
503
504 uhash_setValueDeleter(cache, deleteCollDataCacheEntry);
505 uhash_setKeyDeleter(cache, deleteChars);
506 }
507
508 CollDataCache::~CollDataCache()
509 {
510 umtx_lock(&lock);
511 uhash_close(cache);
512 cache = NULL;
513 umtx_unlock(&lock);
514 }
515
516 CollData *CollDataCache::get(UCollator *collator, UErrorCode &status)
517 {
518 char keyBuffer[KEY_BUFFER_SIZE];
519 int32_t keyLength = KEY_BUFFER_SIZE;
520 char *key = getKey(collator, keyBuffer, &keyLength);
521 CollData *result = NULL, *newData = NULL;
522 CollDataCacheEntry *entry = NULL, *newEntry = NULL;
523
524 umtx_lock(&lock);
525 entry = (CollDataCacheEntry *) uhash_get(cache, key);
526
527 if (entry == NULL) {
528 umtx_unlock(&lock);
529
530 newData = new CollData(collator, key, keyLength, status);
531 newEntry = new CollDataCacheEntry(newData);
532
533 if (U_FAILURE(status) || newData == NULL || newEntry == NULL) {
534 status = U_MEMORY_ALLOCATION_ERROR;
535 return NULL;
536 }
537
538 umtx_lock(&lock);
539 entry = (CollDataCacheEntry *) uhash_get(cache, key);
540
541 if (entry == NULL) {
542 uhash_put(cache, newData->key, newEntry, &status);
543 umtx_unlock(&lock);
544
545 if (U_FAILURE(status)) {
546 delete newEntry;
547 delete newData;
548
549 return NULL;
550 }
551
552 return newData;
553 }
554 }
555
556 result = entry->data;
557 entry->refCount += 1;
558 umtx_unlock(&lock);
559
560 if (key != keyBuffer) {
561 deleteKey(key);
562 }
563
564 if (newEntry != NULL) {
565 delete newEntry;
566 delete newData;
567 }
568
569 return result;
570 }
571
572 void CollDataCache::unref(CollData *collData)
573 {
574 CollDataCacheEntry *entry = NULL;
575
576 umtx_lock(&lock);
577 entry = (CollDataCacheEntry *) uhash_get(cache, collData->key);
578
579 if (entry != NULL) {
580 entry->refCount -= 1;
581 }
582 umtx_unlock(&lock);
583 }
584
585 char *CollDataCache::getKey(UCollator *collator, char *keyBuffer, int32_t *keyBufferLength)
586 {
587 UErrorCode status = U_ZERO_ERROR;
588 int32_t len = ucol_getShortDefinitionString(collator, NULL, keyBuffer, *keyBufferLength, &status);
589
590 if (len >= *keyBufferLength) {
591 *keyBufferLength = (len + 2) & ~1; // round to even length, leaving room for terminating null
592 keyBuffer = NEW_ARRAY(char, *keyBufferLength);
593 status = U_ZERO_ERROR;
594
595 len = ucol_getShortDefinitionString(collator, NULL, keyBuffer, *keyBufferLength, &status);
596 }
597
598 keyBuffer[len] = '\0';
599
600 return keyBuffer;
601 }
602
603 void CollDataCache::flush()
604 {
605 const UHashElement *element;
606 int32_t pos = -1;
607
608 umtx_lock(&lock);
609 while ((element = uhash_nextElement(cache, &pos)) != NULL) {
610 CollDataCacheEntry *entry = (CollDataCacheEntry *) element->value.pointer;
611
612 if (entry->refCount <= 0) {
613 uhash_removeElement(cache, element);
614 }
615 }
616 umtx_unlock(&lock);
617 }
618
619 void CollDataCache::deleteKey(char *key)
620 {
621 DELETE_ARRAY(key);
622 }
623
624 U_CDECL_BEGIN
625 static UBool coll_data_cleanup(void) {
626 CollData::freeCollDataCache();
627 return TRUE;
628 }
629 U_CDECL_END
630
631 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CollData)
632
633 CollData::CollData()
634 {
635 // nothing
636 }
637
638 #define CLONE_COLLATOR
639
640 //#define CACHE_CELISTS
641 CollData::CollData(UCollator *collator, char *cacheKey, int32_t cacheKeyLength, UErrorCode &status)
642 : coll(NULL), charsToCEList(NULL), ceToCharsStartingWith(NULL), key(NULL)
643 {
644 // [:c:] == [[:cn:][:cc:][:co:][:cf:][:cs:]]
645 // i.e. other, control, private use, format, surrogate
646 U_STRING_DECL(test_pattern, "[[:assigned:]-[:c:]]", 20);
647 U_STRING_INIT(test_pattern, "[[:assigned:]-[:c:]]", 20);
648 USet *charsToTest = uset_openPattern(test_pattern, 20, &status);
649
650 // Han ext. A, Han, Jamo, Hangul, Han Ext. B
651 // i.e. all the characers we handle implicitly
652 U_STRING_DECL(remove_pattern, "[[\\u3400-\\u9FFF][\\u1100-\\u11F9][\\uAC00-\\uD7AF][\\U00020000-\\U0002A6DF]]", 70);
653 U_STRING_INIT(remove_pattern, "[[\\u3400-\\u9FFF][\\u1100-\\u11F9][\\uAC00-\\uD7AF][\\U00020000-\\U0002A6DF]]", 70);
654 USet *charsToRemove = uset_openPattern(remove_pattern, 70, &status);
655
656 if (U_FAILURE(status)) {
657 return;
658 }
659
660 USet *expansions = uset_openEmpty();
661 USet *contractions = uset_openEmpty();
662 int32_t itemCount;
663
664 #ifdef CACHE_CELISTS
665 charsToCEList = new StringToCEsMap(status);
666
667 if (U_FAILURE(status)) {
668 goto bail;
669 }
670 #else
671 charsToCEList = NULL;
672 #endif
673
674 ceToCharsStartingWith = new CEToStringsMap(status);
675
676 if (U_FAILURE(status)) {
677 goto bail;
678 }
679
680 if (cacheKeyLength > KEY_BUFFER_SIZE) {
681 key = NEW_ARRAY(char, cacheKeyLength);
682
683 if (key == NULL) {
684 status = U_MEMORY_ALLOCATION_ERROR;
685 goto bail;
686 }
687 } else {
688 key = keyBuffer;
689 }
690
691 ARRAY_COPY(key, cacheKey, cacheKeyLength);
692
693 #ifdef CLONE_COLLATOR
694 coll = ucol_safeClone(collator, NULL, NULL, &status);
695
696 if (U_FAILURE(status)) {
697 goto bail;
698 }
699 #else
700 coll = collator;
701 #endif
702
703 ucol_getContractionsAndExpansions(coll, contractions, expansions, FALSE, &status);
704
705 uset_addAll(charsToTest, contractions);
706 uset_addAll(charsToTest, expansions);
707 uset_removeAll(charsToTest, charsToRemove);
708
709 itemCount = uset_getItemCount(charsToTest);
710 for(int32_t item = 0; item < itemCount; item += 1) {
711 UChar32 start = 0, end = 0;
712 UChar buffer[16];
713 int32_t len = uset_getItem(charsToTest, item, &start, &end,
714 buffer, 16, &status);
715
716 if (len == 0) {
717 for (UChar32 ch = start; ch <= end; ch += 1) {
718 UnicodeString *st = new UnicodeString(ch);
719
720 if (st == NULL) {
721 status = U_MEMORY_ALLOCATION_ERROR;
722 break;
723 }
724
725 CEList *ceList = new CEList(coll, *st, status);
726
727 ceToCharsStartingWith->put(ceList->get(0), st, status);
728
729 #ifdef CACHE_CELISTS
730 charsToCEList->put(st, ceList, status);
731 #else
732 delete ceList;
733 delete st;
734 #endif
735 }
736 } else if (len > 0) {
737 UnicodeString *st = new UnicodeString(buffer, len);
738
739 if (st == NULL) {
740 status = U_MEMORY_ALLOCATION_ERROR;
741 break;
742 }
743
744 CEList *ceList = new CEList(coll, *st, status);
745
746 ceToCharsStartingWith->put(ceList->get(0), st, status);
747
748 #ifdef CACHE_CELISTS
749 charsToCEList->put(st, ceList, status);
750 #else
751 delete ceList;
752 delete st;
753 #endif
754 } else {
755 // shouldn't happen...
756 }
757
758 if (U_FAILURE(status)) {
759 break;
760 }
761 }
762
763 bail:
764 uset_close(contractions);
765 uset_close(expansions);
766 uset_close(charsToRemove);
767 uset_close(charsToTest);
768
769 if (U_FAILURE(status)) {
770 return;
771 }
772
773 UChar32 hanRanges[] = {UCOL_FIRST_HAN, UCOL_LAST_HAN, UCOL_FIRST_HAN_COMPAT, UCOL_LAST_HAN_COMPAT, UCOL_FIRST_HAN_A, UCOL_LAST_HAN_A,
774 UCOL_FIRST_HAN_B, UCOL_LAST_HAN_B};
775 UChar jamoRanges[] = {UCOL_FIRST_L_JAMO, UCOL_FIRST_V_JAMO, UCOL_FIRST_T_JAMO, UCOL_LAST_T_JAMO};
776 UnicodeString hanString = UnicodeString::fromUTF32(hanRanges, ARRAY_SIZE(hanRanges));
777 UnicodeString jamoString(FALSE, jamoRanges, ARRAY_SIZE(jamoRanges));
778 CEList hanList(coll, hanString, status);
779 CEList jamoList(coll, jamoString, status);
780 int32_t j = 0;
781
782 if (U_FAILURE(status)) {
783 return;
784 }
785
786 for (int32_t c = 0; c < jamoList.size(); c += 1) {
787 uint32_t jce = jamoList[c];
788
789 if (! isContinuation(jce)) {
790 jamoLimits[j++] = jce;
791 }
792 }
793
794 jamoLimits[3] += (1 << UCOL_PRIMARYORDERSHIFT);
795
796 minHan = 0xFFFFFFFF;
797 maxHan = 0;
798
799 for(int32_t h = 0; h < hanList.size(); h += 2) {
800 uint32_t han = (uint32_t) hanList[h];
801
802 if (han < minHan) {
803 minHan = han;
804 }
805
806 if (han > maxHan) {
807 maxHan = han;
808 }
809 }
810
811 maxHan += (1 << UCOL_PRIMARYORDERSHIFT);
812 }
813
814 CollData::~CollData()
815 {
816 #ifdef CLONE_COLLATOR
817 ucol_close(coll);
818 #endif
819
820 if (key != keyBuffer) {
821 DELETE_ARRAY(key);
822 }
823
824 delete ceToCharsStartingWith;
825
826 #ifdef CACHE_CELISTS
827 delete charsToCEList;
828 #endif
829 }
830
831 UCollator *CollData::getCollator() const
832 {
833 return coll;
834 }
835
836 const StringList *CollData::getStringList(int32_t ce) const
837 {
838 return ceToCharsStartingWith->getStringList(ce);
839 }
840
841 const CEList *CollData::getCEList(const UnicodeString *string) const
842 {
843 #ifdef CACHE_CELISTS
844 return charsToCEList->get(string);
845 #else
846 UErrorCode status = U_ZERO_ERROR;
847 const CEList *list = new CEList(coll, *string, status);
848
849 if (U_FAILURE(status)) {
850 delete list;
851 list = NULL;
852 }
853
854 return list;
855 #endif
856 }
857
858 void CollData::freeCEList(const CEList *list)
859 {
860 #ifndef CACHE_CELISTS
861 delete list;
862 #endif
863 }
864
865 int32_t CollData::minLengthInChars(const CEList *ceList, int32_t offset, int32_t *history) const
866 {
867 // find out shortest string for the longest sequence of ces.
868 // this can probably be folded with the minLengthCache...
869
870 if (history[offset] >= 0) {
871 return history[offset];
872 }
873
874 uint32_t ce = ceList->get(offset);
875 int32_t maxOffset = ceList->size();
876 int32_t shortestLength = INT32_MAX;
877 const StringList *strings = ceToCharsStartingWith->getStringList(ce);
878
879 if (strings != NULL) {
880 int32_t stringCount = strings->size();
881
882 for (int32_t s = 0; s < stringCount; s += 1) {
883 const UnicodeString *string = strings->get(s);
884 #ifdef CACHE_CELISTS
885 const CEList *ceList2 = charsToCEList->get(string);
886 #else
887 UErrorCode status = U_ZERO_ERROR;
888 const CEList *ceList2 = new CEList(coll, *string, status);
889
890 if (U_FAILURE(status)) {
891 delete ceList2;
892 ceList2 = NULL;
893 }
894 #endif
895
896 if (ceList->matchesAt(offset, ceList2)) {
897 U_ASSERT(ceList2 != NULL);
898 int32_t clength = ceList2->size();
899 int32_t slength = string->length();
900 int32_t roffset = offset + clength;
901 int32_t rlength = 0;
902
903 if (roffset < maxOffset) {
904 rlength = minLengthInChars(ceList, roffset, history);
905
906 if (rlength <= 0) {
907 // delete before continue to avoid memory leak.
908 #ifndef CACHE_CELISTS
909 delete ceList2;
910 #endif
911 // ignore any dead ends
912 continue;
913 }
914 }
915
916 if (shortestLength > slength + rlength) {
917 shortestLength = slength + rlength;
918 }
919 }
920
921 #ifndef CACHE_CELISTS
922 delete ceList2;
923 #endif
924 }
925 }
926
927 if (shortestLength == INT32_MAX) {
928 // No matching strings at this offset. See if
929 // the CE is in a range we can handle manually.
930 if (ce >= minHan && ce < maxHan) {
931 // all han have implicit orders which
932 // generate two CEs.
933 int32_t roffset = offset + 2;
934 int32_t rlength = 0;
935
936 //history[roffset++] = -1;
937 //history[roffset++] = 1;
938
939 if (roffset < maxOffset) {
940 rlength = minLengthInChars(ceList, roffset, history);
941 }
942
943 if (rlength < 0) {
944 return -1;
945 }
946
947 shortestLength = 1 + rlength;
948 goto have_shortest;
949 } else if (ce >= jamoLimits[0] && ce < jamoLimits[3]) {
950 int32_t roffset = offset;
951 int32_t rlength = 0;
952
953 // **** this loop may not handle archaic Hangul correctly ****
954 for (int32_t j = 0; roffset < maxOffset && j < 4; j += 1, roffset += 1) {
955 uint32_t jce = ceList->get(roffset);
956
957 // Some Jamo have 24-bit primary order; skip the
958 // 2nd CE. This should always be OK because if
959 // we're still in the loop all we've seen are
960 // a series of Jamo in LVT order.
961 if (isContinuation(jce)) {
962 continue;
963 }
964
965 if (j >= 3 || jce < jamoLimits[j] || jce >= jamoLimits[j + 1]) {
966 break;
967 }
968 }
969
970 if (roffset == offset) {
971 // we started with a non-L Jamo...
972 // just say it comes from a single character
973 roffset += 1;
974
975 // See if the single Jamo has a 24-bit order.
976 if (roffset < maxOffset && isContinuation(ceList->get(roffset))) {
977 roffset += 1;
978 }
979 }
980
981 if (roffset < maxOffset) {
982 rlength = minLengthInChars(ceList, roffset, history);
983 }
984
985 if (rlength < 0) {
986 return -1;
987 }
988
989 shortestLength = 1 + rlength;
990 goto have_shortest;
991 }
992
993 // Can't handle it manually either. Just move on.
994 return -1;
995 }
996
997 have_shortest:
998 history[offset] = shortestLength;
999
1000 return shortestLength;
1001 }
1002
1003 int32_t CollData::minLengthInChars(const CEList *ceList, int32_t offset) const
1004 {
1005 int32_t clength = ceList->size();
1006 int32_t *history = NEW_ARRAY(int32_t, clength);
1007
1008 for (int32_t i = 0; i < clength; i += 1) {
1009 history[i] = -1;
1010 }
1011
1012 int32_t minLength = minLengthInChars(ceList, offset, history);
1013
1014 DELETE_ARRAY(history);
1015
1016 return minLength;
1017 }
1018
1019 CollData *CollData::open(UCollator *collator, UErrorCode &status)
1020 {
1021 if (U_FAILURE(status)) {
1022 return NULL;
1023 }
1024
1025 CollDataCache *cache = getCollDataCache();
1026
1027 return cache->get(collator, status);
1028 }
1029
1030 void CollData::close(CollData *collData)
1031 {
1032 CollDataCache *cache = getCollDataCache();
1033
1034 cache->unref(collData);
1035 }
1036
1037 CollDataCache *CollData::collDataCache = NULL;
1038
1039 CollDataCache *CollData::getCollDataCache()
1040 {
1041 UErrorCode status = U_ZERO_ERROR;
1042 CollDataCache *cache = NULL;
1043
1044 UMTX_CHECK(NULL, collDataCache, cache);
1045
1046 if (cache == NULL) {
1047 cache = new CollDataCache(status);
1048
1049 if (U_FAILURE(status)) {
1050 delete cache;
1051 return NULL;
1052 }
1053
1054 umtx_lock(NULL);
1055 if (collDataCache == NULL) {
1056 collDataCache = cache;
1057
1058 ucln_i18n_registerCleanup(UCLN_I18N_COLL_DATA, coll_data_cleanup);
1059 }
1060 umtx_unlock(NULL);
1061
1062 if (collDataCache != cache) {
1063 delete cache;
1064 }
1065 }
1066
1067 return collDataCache;
1068 }
1069
1070 void CollData::freeCollDataCache()
1071 {
1072 CollDataCache *cache = NULL;
1073
1074 UMTX_CHECK(NULL, collDataCache, cache);
1075
1076 if (cache != NULL) {
1077 umtx_lock(NULL);
1078 if (collDataCache != NULL) {
1079 collDataCache = NULL;
1080 } else {
1081 cache = NULL;
1082 }
1083 umtx_unlock(NULL);
1084
1085 delete cache;
1086 }
1087 }
1088
1089 void CollData::flushCollDataCache()
1090 {
1091 CollDataCache *cache = NULL;
1092
1093 UMTX_CHECK(NULL, collDataCache, cache);
1094
1095 // **** this will fail if the another ****
1096 // **** thread deletes the cache here ****
1097 if (cache != NULL) {
1098 cache->flush();
1099 }
1100 }
1101
1102 U_NAMESPACE_END
1103
1104 #endif // #if !UCONFIG_NO_COLLATION