]> git.saurik.com Git - apple/icu.git/blob - icuSources/test/intltest/colldata.cpp
ICU-511.31.tar.gz
[apple/icu.git] / icuSources / test / intltest / colldata.cpp
1 /*
2 ******************************************************************************
3 * Copyright (C) 1996-2012, International Business Machines *
4 * Corporation and others. All Rights Reserved. *
5 ******************************************************************************
6 */
7
8 #include "unicode/utypes.h"
9
10 #if !UCONFIG_NO_COLLATION
11
12 #include "unicode/unistr.h"
13 #include "unicode/usearch.h"
14
15 #include "cmemory.h"
16 #include "unicode/coll.h"
17 #include "unicode/tblcoll.h"
18 #include "unicode/coleitr.h"
19 #include "unicode/ucoleitr.h"
20
21 #include "unicode/regex.h" // TODO: make conditional on regexp being built.
22
23 #include "unicode/uniset.h"
24 #include "unicode/uset.h"
25 #include "unicode/ustring.h"
26 #include "hash.h"
27 #include "uhash.h"
28 #include "ucol_imp.h"
29 #include "uassert.h"
30
31 #include "colldata.h"
32
33 #define ARRAY_SIZE(array) (sizeof(array)/sizeof(array[0]))
34 #define NEW_ARRAY(type, count) (type *) uprv_malloc((count) * sizeof(type))
35 #define DELETE_ARRAY(array) uprv_free((void *) (array))
36 #define ARRAY_COPY(dst, src, count) uprv_memcpy((void *) (dst), (void *) (src), (count) * sizeof (src)[0])
37
38 CEList::CEList(UCollator *coll, const UnicodeString &string, UErrorCode &status)
39 : ces(NULL), listMax(CELIST_BUFFER_SIZE), listSize(0)
40 {
41 UCollationElements *elems = ucol_openElements(coll, string.getBuffer(), string.length(), &status);
42 UCollationStrength strength = ucol_getStrength(coll);
43 UBool toShift = ucol_getAttribute(coll, UCOL_ALTERNATE_HANDLING, &status) == UCOL_SHIFTED;
44 uint32_t variableTop = ucol_getVariableTop(coll, &status);
45 uint32_t strengthMask = 0;
46 int32_t order;
47
48 if (U_FAILURE(status)) {
49 return;
50 }
51
52 // **** only set flag if string has Han(gul) ****
53 ucol_forceHanImplicit(elems, &status);
54
55 switch (strength)
56 {
57 default:
58 strengthMask |= UCOL_TERTIARYORDERMASK;
59 /* fall through */
60
61 case UCOL_SECONDARY:
62 strengthMask |= UCOL_SECONDARYORDERMASK;
63 /* fall through */
64
65 case UCOL_PRIMARY:
66 strengthMask |= UCOL_PRIMARYORDERMASK;
67 }
68
69 ces = ceBuffer;
70
71 while ((order = ucol_next(elems, &status)) != UCOL_NULLORDER) {
72 UBool cont = isContinuation(order);
73
74 order &= strengthMask;
75
76 if (toShift && variableTop > (uint32_t)order && (order & UCOL_PRIMARYORDERMASK) != 0) {
77 if (strength >= UCOL_QUATERNARY) {
78 order &= UCOL_PRIMARYORDERMASK;
79 } else {
80 order = UCOL_IGNORABLE;
81 }
82 }
83
84 if (order == UCOL_IGNORABLE) {
85 continue;
86 }
87
88 if (cont) {
89 order |= UCOL_CONTINUATION_MARKER;
90 }
91
92 add(order, status);
93 }
94
95 ucol_closeElements(elems);
96 }
97
98 CEList::~CEList()
99 {
100 if (ces != ceBuffer) {
101 DELETE_ARRAY(ces);
102 }
103 }
104
105 void CEList::add(uint32_t ce, UErrorCode &status)
106 {
107 if (U_FAILURE(status)) {
108 return;
109 }
110
111 if (listSize >= listMax) {
112 int32_t newMax = listMax + CELIST_BUFFER_SIZE;
113 uint32_t *newCEs = NEW_ARRAY(uint32_t, newMax);
114
115 if (newCEs == NULL) {
116 status = U_MEMORY_ALLOCATION_ERROR;
117 return;
118 }
119
120 uprv_memcpy(newCEs, ces, listSize * sizeof(uint32_t));
121
122 if (ces != ceBuffer) {
123 DELETE_ARRAY(ces);
124 }
125
126 ces = newCEs;
127 listMax = newMax;
128 }
129
130 ces[listSize++] = ce;
131 }
132
133 uint32_t CEList::get(int32_t index) const
134 {
135 if (index >= 0 && index < listSize) {
136 return ces[index];
137 }
138
139 return (uint32_t)UCOL_NULLORDER;
140 }
141
142 uint32_t &CEList::operator[](int32_t index) const
143 {
144 return ces[index];
145 }
146
147 UBool CEList::matchesAt(int32_t offset, const CEList *other) const
148 {
149 if (other == NULL || listSize - offset < other->size()) {
150 return FALSE;
151 }
152
153 for (int32_t i = offset, j = 0; j < other->size(); i += 1, j += 1) {
154 if (ces[i] != (*other)[j]) {
155 return FALSE;
156 }
157 }
158
159 return TRUE;
160 }
161
162 int32_t CEList::size() const
163 {
164 return listSize;
165 }
166
167 StringList::StringList(UErrorCode &status)
168 : strings(NULL), listMax(STRING_LIST_BUFFER_SIZE), listSize(0)
169 {
170 if (U_FAILURE(status)) {
171 return;
172 }
173
174 strings = new UnicodeString [listMax];
175
176 if (strings == NULL) {
177 status = U_MEMORY_ALLOCATION_ERROR;
178 return;
179 }
180 }
181
182 StringList::~StringList()
183 {
184 delete[] strings;
185 }
186
187 void StringList::add(const UnicodeString *string, UErrorCode &status)
188 {
189 if (U_FAILURE(status)) {
190 return;
191 }
192 if (listSize >= listMax) {
193 int32_t newMax = listMax + STRING_LIST_BUFFER_SIZE;
194 UnicodeString *newStrings = new UnicodeString[newMax];
195 if (newStrings == NULL) {
196 status = U_MEMORY_ALLOCATION_ERROR;
197 return;
198 }
199 for (int32_t i=0; i<listSize; ++i) {
200 newStrings[i] = strings[i];
201 }
202 delete[] strings;
203 strings = newStrings;
204 listMax = newMax;
205 }
206
207 // The ctor initialized all the strings in
208 // the array to empty strings, so this
209 // is the same as copying the source string.
210 strings[listSize++].append(*string);
211 }
212
213 void StringList::add(const UChar *chars, int32_t count, UErrorCode &status)
214 {
215 const UnicodeString string(chars, count);
216
217 add(&string, status);
218 }
219
220 const UnicodeString *StringList::get(int32_t index) const
221 {
222 if (index >= 0 && index < listSize) {
223 return &strings[index];
224 }
225
226 return NULL;
227 }
228
229 int32_t StringList::size() const
230 {
231 return listSize;
232 }
233
234
235 U_CDECL_BEGIN
236 static void U_CALLCONV
237 deleteStringList(void *obj)
238 {
239 StringList *strings = (StringList *) obj;
240
241 delete strings;
242 }
243 U_CDECL_END
244
245 class CEToStringsMap
246 {
247 public:
248 CEToStringsMap(UErrorCode &status);
249 ~CEToStringsMap();
250
251 void put(uint32_t ce, UnicodeString *string, UErrorCode &status);
252 StringList *getStringList(uint32_t ce) const;
253
254 private:
255 void putStringList(uint32_t ce, StringList *stringList, UErrorCode &status);
256 UHashtable *map;
257 };
258
259 CEToStringsMap::CEToStringsMap(UErrorCode &status)
260 : map(NULL)
261 {
262 if (U_FAILURE(status)) {
263 return;
264 }
265
266 map = uhash_open(uhash_hashLong, uhash_compareLong,
267 uhash_compareCaselessUnicodeString,
268 &status);
269
270 if (U_FAILURE(status)) {
271 return;
272 }
273
274 uhash_setValueDeleter(map, deleteStringList);
275 }
276
277 CEToStringsMap::~CEToStringsMap()
278 {
279 uhash_close(map);
280 }
281
282 void CEToStringsMap::put(uint32_t ce, UnicodeString *string, UErrorCode &status)
283 {
284 StringList *strings = getStringList(ce);
285
286 if (strings == NULL) {
287 strings = new StringList(status);
288
289 if (strings == NULL || U_FAILURE(status)) {
290 status = U_MEMORY_ALLOCATION_ERROR;
291 return;
292 }
293
294 putStringList(ce, strings, status);
295 }
296
297 strings->add(string, status);
298 }
299
300 StringList *CEToStringsMap::getStringList(uint32_t ce) const
301 {
302 return (StringList *) uhash_iget(map, ce);
303 }
304
305 void CEToStringsMap::putStringList(uint32_t ce, StringList *stringList, UErrorCode &status)
306 {
307 uhash_iput(map, ce, (void *) stringList, &status);
308 }
309
310 #define CLONE_COLLATOR
311
312 CollData::CollData(UCollator *collator, UErrorCode &status)
313 : coll(NULL), ceToCharsStartingWith(NULL)
314 {
315 // [:c:] == [[:cn:][:cc:][:co:][:cf:][:cs:]]
316 // i.e. other, control, private use, format, surrogate
317 U_STRING_DECL(test_pattern, "[[:assigned:]-[:c:]]", 20);
318 U_STRING_INIT(test_pattern, "[[:assigned:]-[:c:]]", 20);
319 USet *charsToTest = uset_openPattern(test_pattern, 20, &status);
320
321 // Han ext. A, Han, Jamo, Hangul, Han Ext. B
322 // i.e. all the characers we handle implicitly
323 U_STRING_DECL(remove_pattern, "[[\\u3400-\\u9FFF][\\u1100-\\u11F9][\\uAC00-\\uD7AF][\\U00020000-\\U0002A6DF]]", 70);
324 U_STRING_INIT(remove_pattern, "[[\\u3400-\\u9FFF][\\u1100-\\u11F9][\\uAC00-\\uD7AF][\\U00020000-\\U0002A6DF]]", 70);
325 USet *charsToRemove = uset_openPattern(remove_pattern, 70, &status);
326
327 if (U_FAILURE(status)) {
328 return;
329 }
330
331 USet *expansions = uset_openEmpty();
332 USet *contractions = uset_openEmpty();
333 int32_t itemCount;
334
335 ceToCharsStartingWith = new CEToStringsMap(status);
336
337 if (U_FAILURE(status)) {
338 goto bail;
339 }
340
341 #ifdef CLONE_COLLATOR
342 coll = ucol_safeClone(collator, NULL, NULL, &status);
343
344 if (U_FAILURE(status)) {
345 goto bail;
346 }
347 #else
348 coll = collator;
349 #endif
350
351 ucol_getContractionsAndExpansions(coll, contractions, expansions, FALSE, &status);
352
353 uset_addAll(charsToTest, contractions);
354 uset_addAll(charsToTest, expansions);
355 uset_removeAll(charsToTest, charsToRemove);
356
357 itemCount = uset_getItemCount(charsToTest);
358 for(int32_t item = 0; item < itemCount; item += 1) {
359 UChar32 start = 0, end = 0;
360 UChar buffer[16];
361 int32_t len = uset_getItem(charsToTest, item, &start, &end,
362 buffer, 16, &status);
363
364 if (len == 0) {
365 for (UChar32 ch = start; ch <= end; ch += 1) {
366 UnicodeString *st = new UnicodeString(ch);
367
368 if (st == NULL) {
369 status = U_MEMORY_ALLOCATION_ERROR;
370 break;
371 }
372
373 CEList *ceList = new CEList(coll, *st, status);
374
375 ceToCharsStartingWith->put(ceList->get(0), st, status);
376
377 delete ceList;
378 delete st;
379 }
380 } else if (len > 0) {
381 UnicodeString *st = new UnicodeString(buffer, len);
382
383 if (st == NULL) {
384 status = U_MEMORY_ALLOCATION_ERROR;
385 break;
386 }
387
388 CEList *ceList = new CEList(coll, *st, status);
389
390 ceToCharsStartingWith->put(ceList->get(0), st, status);
391
392 delete ceList;
393 delete st;
394 } else {
395 // shouldn't happen...
396 }
397
398 if (U_FAILURE(status)) {
399 break;
400 }
401 }
402
403 bail:
404 uset_close(contractions);
405 uset_close(expansions);
406 uset_close(charsToRemove);
407 uset_close(charsToTest);
408
409 if (U_FAILURE(status)) {
410 return;
411 }
412
413 UChar32 hanRanges[] = {UCOL_FIRST_HAN, UCOL_LAST_HAN, UCOL_FIRST_HAN_COMPAT, UCOL_LAST_HAN_COMPAT, UCOL_FIRST_HAN_A, UCOL_LAST_HAN_A,
414 UCOL_FIRST_HAN_B, UCOL_LAST_HAN_B};
415 UChar jamoRanges[] = {UCOL_FIRST_L_JAMO, UCOL_FIRST_V_JAMO, UCOL_FIRST_T_JAMO, UCOL_LAST_T_JAMO};
416 UnicodeString hanString = UnicodeString::fromUTF32(hanRanges, ARRAY_SIZE(hanRanges));
417 UnicodeString jamoString(FALSE, jamoRanges, ARRAY_SIZE(jamoRanges));
418 CEList hanList(coll, hanString, status);
419 CEList jamoList(coll, jamoString, status);
420 int32_t j = 0;
421
422 if (U_FAILURE(status)) {
423 return;
424 }
425
426 for (int32_t c = 0; c < jamoList.size(); c += 1) {
427 uint32_t jce = jamoList[c];
428
429 if (! isContinuation(jce)) {
430 jamoLimits[j++] = jce;
431 }
432 }
433
434 jamoLimits[3] += (1 << UCOL_PRIMARYORDERSHIFT);
435
436 minHan = 0xFFFFFFFF;
437 maxHan = 0;
438
439 for(int32_t h = 0; h < hanList.size(); h += 2) {
440 uint32_t han = (uint32_t) hanList[h];
441
442 if (han < minHan) {
443 minHan = han;
444 }
445
446 if (han > maxHan) {
447 maxHan = han;
448 }
449 }
450
451 maxHan += (1 << UCOL_PRIMARYORDERSHIFT);
452 }
453
454 CollData::~CollData()
455 {
456 #ifdef CLONE_COLLATOR
457 ucol_close(coll);
458 #endif
459
460 delete ceToCharsStartingWith;
461 }
462
463 UCollator *CollData::getCollator() const
464 {
465 return coll;
466 }
467
468 const StringList *CollData::getStringList(int32_t ce) const
469 {
470 return ceToCharsStartingWith->getStringList(ce);
471 }
472
473 const CEList *CollData::getCEList(const UnicodeString *string) const
474 {
475 UErrorCode status = U_ZERO_ERROR;
476 const CEList *list = new CEList(coll, *string, status);
477
478 if (U_FAILURE(status)) {
479 delete list;
480 list = NULL;
481 }
482
483 return list;
484 }
485
486 void CollData::freeCEList(const CEList *list)
487 {
488 delete list;
489 }
490
491 int32_t CollData::minLengthInChars(const CEList *ceList, int32_t offset, int32_t *history) const
492 {
493 // find out shortest string for the longest sequence of ces.
494 // this can probably be folded with the minLengthCache...
495
496 if (history[offset] >= 0) {
497 return history[offset];
498 }
499
500 uint32_t ce = ceList->get(offset);
501 int32_t maxOffset = ceList->size();
502 int32_t shortestLength = INT32_MAX;
503 const StringList *strings = ceToCharsStartingWith->getStringList(ce);
504
505 if (strings != NULL) {
506 int32_t stringCount = strings->size();
507
508 for (int32_t s = 0; s < stringCount; s += 1) {
509 const UnicodeString *string = strings->get(s);
510 UErrorCode status = U_ZERO_ERROR;
511 const CEList *ceList2 = new CEList(coll, *string, status);
512
513 if (U_FAILURE(status)) {
514 delete ceList2;
515 ceList2 = NULL;
516 }
517
518 if (ceList->matchesAt(offset, ceList2)) {
519 U_ASSERT(ceList2 != NULL);
520 int32_t clength = ceList2->size();
521 int32_t slength = string->length();
522 int32_t roffset = offset + clength;
523 int32_t rlength = 0;
524
525 if (roffset < maxOffset) {
526 rlength = minLengthInChars(ceList, roffset, history);
527
528 if (rlength <= 0) {
529 // delete before continue to avoid memory leak.
530 delete ceList2;
531
532 // ignore any dead ends
533 continue;
534 }
535 }
536
537 if (shortestLength > slength + rlength) {
538 shortestLength = slength + rlength;
539 }
540 }
541
542 delete ceList2;
543 }
544 }
545
546 if (shortestLength == INT32_MAX) {
547 // No matching strings at this offset. See if
548 // the CE is in a range we can handle manually.
549 if (ce >= minHan && ce < maxHan) {
550 // all han have implicit orders which
551 // generate two CEs.
552 int32_t roffset = offset + 2;
553 int32_t rlength = 0;
554
555 //history[roffset++] = -1;
556 //history[roffset++] = 1;
557
558 if (roffset < maxOffset) {
559 rlength = minLengthInChars(ceList, roffset, history);
560 }
561
562 if (rlength < 0) {
563 return -1;
564 }
565
566 shortestLength = 1 + rlength;
567 goto have_shortest;
568 } else if (ce >= jamoLimits[0] && ce < jamoLimits[3]) {
569 int32_t roffset = offset;
570 int32_t rlength = 0;
571
572 // **** this loop may not handle archaic Hangul correctly ****
573 for (int32_t j = 0; roffset < maxOffset && j < 4; j += 1, roffset += 1) {
574 uint32_t jce = ceList->get(roffset);
575
576 // Some Jamo have 24-bit primary order; skip the
577 // 2nd CE. This should always be OK because if
578 // we're still in the loop all we've seen are
579 // a series of Jamo in LVT order.
580 if (isContinuation(jce)) {
581 continue;
582 }
583
584 if (j >= 3 || jce < jamoLimits[j] || jce >= jamoLimits[j + 1]) {
585 break;
586 }
587 }
588
589 if (roffset == offset) {
590 // we started with a non-L Jamo...
591 // just say it comes from a single character
592 roffset += 1;
593
594 // See if the single Jamo has a 24-bit order.
595 if (roffset < maxOffset && isContinuation(ceList->get(roffset))) {
596 roffset += 1;
597 }
598 }
599
600 if (roffset < maxOffset) {
601 rlength = minLengthInChars(ceList, roffset, history);
602 }
603
604 if (rlength < 0) {
605 return -1;
606 }
607
608 shortestLength = 1 + rlength;
609 goto have_shortest;
610 }
611
612 // Can't handle it manually either. Just move on.
613 return -1;
614 }
615
616 have_shortest:
617 history[offset] = shortestLength;
618
619 return shortestLength;
620 }
621
622 int32_t CollData::minLengthInChars(const CEList *ceList, int32_t offset) const
623 {
624 int32_t clength = ceList->size();
625 int32_t *history = NEW_ARRAY(int32_t, clength);
626
627 for (int32_t i = 0; i < clength; i += 1) {
628 history[i] = -1;
629 }
630
631 int32_t minLength = minLengthInChars(ceList, offset, history);
632
633 DELETE_ARRAY(history);
634
635 return minLength;
636 }
637
638 #endif // #if !UCONFIG_NO_COLLATION