]> git.saurik.com Git - apple/icu.git/blob - icuSources/test/intltest/colldata.cpp
79d93654926d40027b1ff811e11e870962390e31
[apple/icu.git] / icuSources / test / intltest / colldata.cpp
1 /*
2 ******************************************************************************
3 * Copyright (C) 1996-2016, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 ******************************************************************************
6 */
7
8 #include "unicode/utypes.h"
9
10 #if !UCONFIG_NO_COLLATION
11
12 #include "unicode/unistr.h"
13 #include "unicode/usearch.h"
14
15 #include "cmemory.h"
16 #include "unicode/coll.h"
17 #include "unicode/tblcoll.h"
18 #include "unicode/coleitr.h"
19 #include "unicode/ucoleitr.h"
20
21 #include "unicode/regex.h" // TODO: make conditional on regexp being built.
22
23 #include "unicode/uniset.h"
24 #include "unicode/uset.h"
25 #include "unicode/usetiter.h"
26 #include "unicode/ustring.h"
27 #include "hash.h"
28 #include "normalizer2impl.h"
29 #include "uhash.h"
30 #include "usrchimp.h"
31 #include "uassert.h"
32
33 #include "colldata.h"
34
35 #define NEW_ARRAY(type, count) (type *) uprv_malloc((count) * sizeof(type))
36 #define DELETE_ARRAY(array) uprv_free((void *) (array))
37 #define ARRAY_COPY(dst, src, count) uprv_memcpy((void *) (dst), (void *) (src), (count) * sizeof (src)[0])
38
39 CEList::CEList(UCollator *coll, const UnicodeString &string, UErrorCode &status)
40 : ces(NULL), listMax(CELIST_BUFFER_SIZE), listSize(0)
41 {
42 UCollationElements *elems = ucol_openElements(coll, string.getBuffer(), string.length(), &status);
43 UCollationStrength strength = ucol_getStrength(coll);
44 UBool toShift = ucol_getAttribute(coll, UCOL_ALTERNATE_HANDLING, &status) == UCOL_SHIFTED;
45 uint32_t variableTop = ucol_getVariableTop(coll, &status);
46 uint32_t strengthMask = 0;
47 int32_t order;
48
49 if (U_FAILURE(status)) {
50 return;
51 }
52
53 // **** only set flag if string has Han(gul) ****
54 // ucol_forceHanImplicit(elems, &status); -- removed for ticket #10476
55
56 switch (strength)
57 {
58 default:
59 strengthMask |= UCOL_TERTIARYORDERMASK;
60 U_FALLTHROUGH;
61 case UCOL_SECONDARY:
62 strengthMask |= UCOL_SECONDARYORDERMASK;
63 U_FALLTHROUGH;
64 case UCOL_PRIMARY:
65 strengthMask |= UCOL_PRIMARYORDERMASK;
66 }
67
68 ces = ceBuffer;
69
70 while ((order = ucol_next(elems, &status)) != UCOL_NULLORDER) {
71 UBool cont = isContinuation(order);
72
73 order &= strengthMask;
74
75 if (toShift && variableTop > (uint32_t)order && (order & UCOL_PRIMARYORDERMASK) != 0) {
76 if (strength >= UCOL_QUATERNARY) {
77 order &= UCOL_PRIMARYORDERMASK;
78 } else {
79 order = UCOL_IGNORABLE;
80 }
81 }
82
83 if (order == UCOL_IGNORABLE) {
84 continue;
85 }
86
87 if (cont) {
88 order |= UCOL_CONTINUATION_MARKER;
89 }
90
91 add(order, status);
92 }
93
94 ucol_closeElements(elems);
95 }
96
97 CEList::~CEList()
98 {
99 if (ces != ceBuffer) {
100 DELETE_ARRAY(ces);
101 }
102 }
103
104 void CEList::add(uint32_t ce, UErrorCode &status)
105 {
106 if (U_FAILURE(status)) {
107 return;
108 }
109
110 if (listSize >= listMax) {
111 int32_t newMax = listMax + CELIST_BUFFER_SIZE;
112 uint32_t *newCEs = NEW_ARRAY(uint32_t, newMax);
113
114 if (newCEs == NULL) {
115 status = U_MEMORY_ALLOCATION_ERROR;
116 return;
117 }
118
119 uprv_memcpy(newCEs, ces, listSize * sizeof(uint32_t));
120
121 if (ces != ceBuffer) {
122 DELETE_ARRAY(ces);
123 }
124
125 ces = newCEs;
126 listMax = newMax;
127 }
128
129 ces[listSize++] = ce;
130 }
131
132 uint32_t CEList::get(int32_t index) const
133 {
134 if (index >= 0 && index < listSize) {
135 return ces[index];
136 }
137
138 return (uint32_t)UCOL_NULLORDER;
139 }
140
141 uint32_t &CEList::operator[](int32_t index) const
142 {
143 return ces[index];
144 }
145
146 UBool CEList::matchesAt(int32_t offset, const CEList *other) const
147 {
148 if (other == NULL || listSize - offset < other->size()) {
149 return FALSE;
150 }
151
152 for (int32_t i = offset, j = 0; j < other->size(); i += 1, j += 1) {
153 if (ces[i] != (*other)[j]) {
154 return FALSE;
155 }
156 }
157
158 return TRUE;
159 }
160
161 int32_t CEList::size() const
162 {
163 return listSize;
164 }
165
166 StringList::StringList(UErrorCode &status)
167 : strings(NULL), listMax(STRING_LIST_BUFFER_SIZE), listSize(0)
168 {
169 if (U_FAILURE(status)) {
170 return;
171 }
172
173 strings = new UnicodeString [listMax];
174
175 if (strings == NULL) {
176 status = U_MEMORY_ALLOCATION_ERROR;
177 return;
178 }
179 }
180
181 StringList::~StringList()
182 {
183 delete[] strings;
184 }
185
186 void StringList::add(const UnicodeString *string, UErrorCode &status)
187 {
188 if (U_FAILURE(status)) {
189 return;
190 }
191 if (listSize >= listMax) {
192 int32_t newMax = listMax + STRING_LIST_BUFFER_SIZE;
193 UnicodeString *newStrings = new UnicodeString[newMax];
194 if (newStrings == NULL) {
195 status = U_MEMORY_ALLOCATION_ERROR;
196 return;
197 }
198 for (int32_t i=0; i<listSize; ++i) {
199 newStrings[i] = strings[i];
200 }
201 delete[] strings;
202 strings = newStrings;
203 listMax = newMax;
204 }
205
206 // The ctor initialized all the strings in
207 // the array to empty strings, so this
208 // is the same as copying the source string.
209 strings[listSize++].append(*string);
210 }
211
212 void StringList::add(const UChar *chars, int32_t count, UErrorCode &status)
213 {
214 const UnicodeString string(chars, count);
215
216 add(&string, status);
217 }
218
219 const UnicodeString *StringList::get(int32_t index) const
220 {
221 if (index >= 0 && index < listSize) {
222 return &strings[index];
223 }
224
225 return NULL;
226 }
227
228 int32_t StringList::size() const
229 {
230 return listSize;
231 }
232
233
234 U_CDECL_BEGIN
235 static void U_CALLCONV
236 deleteStringList(void *obj)
237 {
238 StringList *strings = (StringList *) obj;
239
240 delete strings;
241 }
242 U_CDECL_END
243
244 class CEToStringsMap
245 {
246 public:
247 CEToStringsMap(UErrorCode &status);
248 ~CEToStringsMap();
249
250 void put(uint32_t ce, UnicodeString *string, UErrorCode &status);
251 StringList *getStringList(uint32_t ce) const;
252
253 private:
254 void putStringList(uint32_t ce, StringList *stringList, UErrorCode &status);
255 UHashtable *map;
256 };
257
258 CEToStringsMap::CEToStringsMap(UErrorCode &status)
259 : map(NULL)
260 {
261 if (U_FAILURE(status)) {
262 return;
263 }
264
265 map = uhash_open(uhash_hashLong, uhash_compareLong,
266 uhash_compareCaselessUnicodeString,
267 &status);
268
269 if (U_FAILURE(status)) {
270 return;
271 }
272
273 uhash_setValueDeleter(map, deleteStringList);
274 }
275
276 CEToStringsMap::~CEToStringsMap()
277 {
278 uhash_close(map);
279 }
280
281 void CEToStringsMap::put(uint32_t ce, UnicodeString *string, UErrorCode &status)
282 {
283 StringList *strings = getStringList(ce);
284
285 if (strings == NULL) {
286 strings = new StringList(status);
287
288 if (strings == NULL || U_FAILURE(status)) {
289 status = U_MEMORY_ALLOCATION_ERROR;
290 return;
291 }
292
293 putStringList(ce, strings, status);
294 }
295
296 strings->add(string, status);
297 }
298
299 StringList *CEToStringsMap::getStringList(uint32_t ce) const
300 {
301 return (StringList *) uhash_iget(map, ce);
302 }
303
304 void CEToStringsMap::putStringList(uint32_t ce, StringList *stringList, UErrorCode &status)
305 {
306 uhash_iput(map, ce, (void *) stringList, &status);
307 }
308
309 #define CLONE_COLLATOR
310
311 CollData::CollData(UCollator *collator, UErrorCode &status)
312 : coll(NULL), ceToCharsStartingWith(NULL)
313 {
314 // [:c:] == [[:cn:][:cc:][:co:][:cf:][:cs:]]
315 // i.e. other, control, private use, format, surrogate
316 U_STRING_DECL(test_pattern, "[[:assigned:]-[:c:]]", 20);
317 U_STRING_INIT(test_pattern, "[[:assigned:]-[:c:]]", 20);
318 USet *charsToTest = uset_openPattern(test_pattern, 20, &status);
319
320 // Han ext. A, Han, Jamo, Hangul, Han Ext. B
321 // i.e. all the characers we handle implicitly
322 U_STRING_DECL(remove_pattern, "[[\\u3400-\\u9FFF][\\u1100-\\u11F9][\\uAC00-\\uD7AF][\\U00020000-\\U0002A6DF]]", 70);
323 U_STRING_INIT(remove_pattern, "[[\\u3400-\\u9FFF][\\u1100-\\u11F9][\\uAC00-\\uD7AF][\\U00020000-\\U0002A6DF]]", 70);
324 USet *charsToRemove = uset_openPattern(remove_pattern, 70, &status);
325
326 if (U_FAILURE(status)) {
327 return;
328 }
329
330 USet *expansions = uset_openEmpty();
331 USet *contractions = uset_openEmpty();
332 int32_t itemCount;
333
334 ceToCharsStartingWith = new CEToStringsMap(status);
335
336 if (U_FAILURE(status)) {
337 goto bail;
338 }
339
340 #ifdef CLONE_COLLATOR
341 coll = ucol_safeClone(collator, NULL, NULL, &status);
342
343 if (U_FAILURE(status)) {
344 goto bail;
345 }
346 #else
347 coll = collator;
348 #endif
349
350 ucol_getContractionsAndExpansions(coll, contractions, expansions, FALSE, &status);
351
352 uset_addAll(charsToTest, contractions);
353 uset_addAll(charsToTest, expansions);
354 uset_removeAll(charsToTest, charsToRemove);
355
356 itemCount = uset_getItemCount(charsToTest);
357 for(int32_t item = 0; item < itemCount; item += 1) {
358 UChar32 start = 0, end = 0;
359 UChar buffer[16];
360 int32_t len = uset_getItem(charsToTest, item, &start, &end,
361 buffer, 16, &status);
362
363 if (len == 0) {
364 for (UChar32 ch = start; ch <= end; ch += 1) {
365 UnicodeString *st = new UnicodeString(ch);
366
367 if (st == NULL) {
368 status = U_MEMORY_ALLOCATION_ERROR;
369 break;
370 }
371
372 CEList *ceList = new CEList(coll, *st, status);
373
374 ceToCharsStartingWith->put(ceList->get(0), st, status);
375
376 delete ceList;
377 delete st;
378 }
379 } else if (len > 0) {
380 UnicodeString *st = new UnicodeString(buffer, len);
381
382 if (st == NULL) {
383 status = U_MEMORY_ALLOCATION_ERROR;
384 break;
385 }
386
387 CEList *ceList = new CEList(coll, *st, status);
388
389 ceToCharsStartingWith->put(ceList->get(0), st, status);
390
391 delete ceList;
392 delete st;
393 } else {
394 // shouldn't happen...
395 }
396
397 if (U_FAILURE(status)) {
398 break;
399 }
400 }
401
402 bail:
403 uset_close(contractions);
404 uset_close(expansions);
405 uset_close(charsToRemove);
406 uset_close(charsToTest);
407
408 if (U_FAILURE(status)) {
409 return;
410 }
411
412 UnicodeSet hanRanges(UNICODE_STRING_SIMPLE("[:Unified_Ideograph:]"), status);
413 if (U_FAILURE(status)) {
414 return;
415 }
416 UnicodeSetIterator hanIter(hanRanges);
417 UnicodeString hanString;
418 while(hanIter.nextRange()) {
419 hanString.append(hanIter.getCodepoint());
420 hanString.append(hanIter.getCodepointEnd());
421 }
422 // TODO: Why U+11FF? The old code had an outdated UCOL_LAST_T_JAMO=0x11F9,
423 // but as of Unicode 6.3 the 11xx block is filled,
424 // and there are also more Jamo T at U+D7CB..U+D7FB.
425 // Maybe use [:HST=T:] and look for the end of the last range?
426 // Maybe use script boundary mappings instead of this code??
427 UChar jamoRanges[] = {Hangul::JAMO_L_BASE, Hangul::JAMO_V_BASE, Hangul::JAMO_T_BASE + 1, 0x11FF};
428 UnicodeString jamoString(FALSE, jamoRanges, UPRV_LENGTHOF(jamoRanges));
429 CEList hanList(coll, hanString, status);
430 CEList jamoList(coll, jamoString, status);
431 int32_t j = 0;
432
433 if (U_FAILURE(status)) {
434 return;
435 }
436
437 for (int32_t c = 0; c < jamoList.size(); c += 1) {
438 uint32_t jce = jamoList[c];
439
440 if (! isContinuation(jce)) {
441 jamoLimits[j++] = jce;
442 }
443 }
444
445 jamoLimits[3] += (1 << UCOL_PRIMARYORDERSHIFT);
446
447 minHan = 0xFFFFFFFF;
448 maxHan = 0;
449
450 for(int32_t h = 0; h < hanList.size(); h += 2) {
451 uint32_t han = (uint32_t) hanList[h];
452
453 if (han < minHan) {
454 minHan = han;
455 }
456
457 if (han > maxHan) {
458 maxHan = han;
459 }
460 }
461
462 maxHan += (1 << UCOL_PRIMARYORDERSHIFT);
463 }
464
465 CollData::~CollData()
466 {
467 #ifdef CLONE_COLLATOR
468 ucol_close(coll);
469 #endif
470
471 delete ceToCharsStartingWith;
472 }
473
474 UCollator *CollData::getCollator() const
475 {
476 return coll;
477 }
478
479 const StringList *CollData::getStringList(int32_t ce) const
480 {
481 return ceToCharsStartingWith->getStringList(ce);
482 }
483
484 const CEList *CollData::getCEList(const UnicodeString *string) const
485 {
486 UErrorCode status = U_ZERO_ERROR;
487 const CEList *list = new CEList(coll, *string, status);
488
489 if (U_FAILURE(status)) {
490 delete list;
491 list = NULL;
492 }
493
494 return list;
495 }
496
497 void CollData::freeCEList(const CEList *list)
498 {
499 delete list;
500 }
501
502 int32_t CollData::minLengthInChars(const CEList *ceList, int32_t offset, int32_t *history) const
503 {
504 // find out shortest string for the longest sequence of ces.
505 // this can probably be folded with the minLengthCache...
506
507 if (history[offset] >= 0) {
508 return history[offset];
509 }
510
511 uint32_t ce = ceList->get(offset);
512 int32_t maxOffset = ceList->size();
513 int32_t shortestLength = INT32_MAX;
514 const StringList *strings = ceToCharsStartingWith->getStringList(ce);
515
516 if (strings != NULL) {
517 int32_t stringCount = strings->size();
518
519 for (int32_t s = 0; s < stringCount; s += 1) {
520 const UnicodeString *string = strings->get(s);
521 UErrorCode status = U_ZERO_ERROR;
522 const CEList *ceList2 = new CEList(coll, *string, status);
523
524 if (U_FAILURE(status)) {
525 delete ceList2;
526 ceList2 = NULL;
527 }
528
529 if (ceList->matchesAt(offset, ceList2)) {
530 U_ASSERT(ceList2 != NULL);
531 int32_t clength = ceList2->size();
532 int32_t slength = string->length();
533 int32_t roffset = offset + clength;
534 int32_t rlength = 0;
535
536 if (roffset < maxOffset) {
537 rlength = minLengthInChars(ceList, roffset, history);
538
539 if (rlength <= 0) {
540 // delete before continue to avoid memory leak.
541 delete ceList2;
542
543 // ignore any dead ends
544 continue;
545 }
546 }
547
548 if (shortestLength > slength + rlength) {
549 shortestLength = slength + rlength;
550 }
551 }
552
553 delete ceList2;
554 }
555 }
556
557 if (shortestLength == INT32_MAX) {
558 // No matching strings at this offset. See if
559 // the CE is in a range we can handle manually.
560 if (ce >= minHan && ce < maxHan) {
561 // all han have implicit orders which
562 // generate two CEs.
563 int32_t roffset = offset + 2;
564 int32_t rlength = 0;
565
566 //history[roffset++] = -1;
567 //history[roffset++] = 1;
568
569 if (roffset < maxOffset) {
570 rlength = minLengthInChars(ceList, roffset, history);
571 }
572
573 if (rlength < 0) {
574 return -1;
575 }
576
577 shortestLength = 1 + rlength;
578 goto have_shortest;
579 } else if (ce >= jamoLimits[0] && ce < jamoLimits[3]) {
580 int32_t roffset = offset;
581 int32_t rlength = 0;
582
583 // **** this loop may not handle archaic Hangul correctly ****
584 for (int32_t j = 0; roffset < maxOffset && j < 4; j += 1, roffset += 1) {
585 uint32_t jce = ceList->get(roffset);
586
587 // Some Jamo have 24-bit primary order; skip the
588 // 2nd CE. This should always be OK because if
589 // we're still in the loop all we've seen are
590 // a series of Jamo in LVT order.
591 if (isContinuation(jce)) {
592 continue;
593 }
594
595 if (j >= 3 || jce < jamoLimits[j] || jce >= jamoLimits[j + 1]) {
596 break;
597 }
598 }
599
600 if (roffset == offset) {
601 // we started with a non-L Jamo...
602 // just say it comes from a single character
603 roffset += 1;
604
605 // See if the single Jamo has a 24-bit order.
606 if (roffset < maxOffset && isContinuation(ceList->get(roffset))) {
607 roffset += 1;
608 }
609 }
610
611 if (roffset < maxOffset) {
612 rlength = minLengthInChars(ceList, roffset, history);
613 }
614
615 if (rlength < 0) {
616 return -1;
617 }
618
619 shortestLength = 1 + rlength;
620 goto have_shortest;
621 }
622
623 // Can't handle it manually either. Just move on.
624 return -1;
625 }
626
627 have_shortest:
628 history[offset] = shortestLength;
629
630 return shortestLength;
631 }
632
633 int32_t CollData::minLengthInChars(const CEList *ceList, int32_t offset) const
634 {
635 int32_t clength = ceList->size();
636 int32_t *history = NEW_ARRAY(int32_t, clength);
637
638 for (int32_t i = 0; i < clength; i += 1) {
639 history[i] = -1;
640 }
641
642 int32_t minLength = minLengthInChars(ceList, offset, history);
643
644 DELETE_ARRAY(history);
645
646 return minLength;
647 }
648
649 #endif // #if !UCONFIG_NO_COLLATION