]> git.saurik.com Git - apple/icu.git/blame_incremental - icuSources/test/intltest/colldata.cpp
ICU-57166.0.1.tar.gz
[apple/icu.git] / icuSources / test / intltest / colldata.cpp
... / ...
CommitLineData
1/*
2 ******************************************************************************
3 * Copyright (C) 1996-2016, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 ******************************************************************************
6 */
7
8#include "unicode/utypes.h"
9
10#if !UCONFIG_NO_COLLATION
11
12#include "unicode/unistr.h"
13#include "unicode/usearch.h"
14
15#include "cmemory.h"
16#include "unicode/coll.h"
17#include "unicode/tblcoll.h"
18#include "unicode/coleitr.h"
19#include "unicode/ucoleitr.h"
20
21#include "unicode/regex.h" // TODO: make conditional on regexp being built.
22
23#include "unicode/uniset.h"
24#include "unicode/uset.h"
25#include "unicode/usetiter.h"
26#include "unicode/ustring.h"
27#include "hash.h"
28#include "normalizer2impl.h"
29#include "uhash.h"
30#include "usrchimp.h"
31#include "uassert.h"
32
33#include "colldata.h"
34
35#define NEW_ARRAY(type, count) (type *) uprv_malloc((size_t)(count) * sizeof(type))
36#define DELETE_ARRAY(array) uprv_free((void *) (array))
37#define ARRAY_COPY(dst, src, count) uprv_memcpy((void *) (dst), (void *) (src), (size_t)(count) * sizeof (src)[0])
38
39CEList::CEList(UCollator *coll, const UnicodeString &string, UErrorCode &status)
40 : ces(NULL), listMax(CELIST_BUFFER_SIZE), listSize(0)
41{
42 UCollationElements *elems = ucol_openElements(coll, string.getBuffer(), string.length(), &status);
43 UCollationStrength strength = ucol_getStrength(coll);
44 UBool toShift = ucol_getAttribute(coll, UCOL_ALTERNATE_HANDLING, &status) == UCOL_SHIFTED;
45 uint32_t variableTop = ucol_getVariableTop(coll, &status);
46 uint32_t strengthMask = 0;
47 int32_t order;
48
49 if (U_FAILURE(status)) {
50 return;
51 }
52
53 // **** only set flag if string has Han(gul) ****
54 // ucol_forceHanImplicit(elems, &status); -- removed for ticket #10476
55
56 switch (strength)
57 {
58 default:
59 strengthMask |= UCOL_TERTIARYORDERMASK;
60 U_FALLTHROUGH;
61 case UCOL_SECONDARY:
62 strengthMask |= UCOL_SECONDARYORDERMASK;
63 U_FALLTHROUGH;
64 case UCOL_PRIMARY:
65 strengthMask |= UCOL_PRIMARYORDERMASK;
66 }
67
68 ces = ceBuffer;
69
70 while ((order = ucol_next(elems, &status)) != UCOL_NULLORDER) {
71 UBool cont = isContinuation(order);
72
73 order &= strengthMask;
74
75 if (toShift && variableTop > (uint32_t)order && (order & UCOL_PRIMARYORDERMASK) != 0) {
76 if (strength >= UCOL_QUATERNARY) {
77 order &= UCOL_PRIMARYORDERMASK;
78 } else {
79 order = UCOL_IGNORABLE;
80 }
81 }
82
83 if (order == UCOL_IGNORABLE) {
84 continue;
85 }
86
87 if (cont) {
88 order |= UCOL_CONTINUATION_MARKER;
89 }
90
91 add(order, status);
92 }
93
94 ucol_closeElements(elems);
95}
96
97CEList::~CEList()
98{
99 if (ces != ceBuffer) {
100 DELETE_ARRAY(ces);
101 }
102}
103
104void CEList::add(uint32_t ce, UErrorCode &status)
105{
106 if (U_FAILURE(status)) {
107 return;
108 }
109
110 if (listSize >= listMax) {
111 int32_t newMax = listMax + CELIST_BUFFER_SIZE;
112 uint32_t *newCEs = NEW_ARRAY(uint32_t, newMax);
113
114 if (newCEs == NULL) {
115 status = U_MEMORY_ALLOCATION_ERROR;
116 return;
117 }
118
119 uprv_memcpy(newCEs, ces, listSize * sizeof(uint32_t));
120
121 if (ces != ceBuffer) {
122 DELETE_ARRAY(ces);
123 }
124
125 ces = newCEs;
126 listMax = newMax;
127 }
128
129 ces[listSize++] = ce;
130}
131
132uint32_t CEList::get(int32_t index) const
133{
134 if (index >= 0 && index < listSize) {
135 return ces[index];
136 }
137
138 return (uint32_t)UCOL_NULLORDER;
139}
140
141uint32_t &CEList::operator[](int32_t index) const
142{
143 return ces[index];
144}
145
146UBool CEList::matchesAt(int32_t offset, const CEList *other) const
147{
148 if (other == NULL || listSize - offset < other->size()) {
149 return FALSE;
150 }
151
152 for (int32_t i = offset, j = 0; j < other->size(); i += 1, j += 1) {
153 if (ces[i] != (*other)[j]) {
154 return FALSE;
155 }
156 }
157
158 return TRUE;
159}
160
161int32_t CEList::size() const
162{
163 return listSize;
164}
165
166StringList::StringList(UErrorCode &status)
167 : strings(NULL), listMax(STRING_LIST_BUFFER_SIZE), listSize(0)
168{
169 if (U_FAILURE(status)) {
170 return;
171 }
172
173 strings = new UnicodeString [listMax];
174
175 if (strings == NULL) {
176 status = U_MEMORY_ALLOCATION_ERROR;
177 return;
178 }
179}
180
181StringList::~StringList()
182{
183 delete[] strings;
184}
185
186void StringList::add(const UnicodeString *string, UErrorCode &status)
187{
188 if (U_FAILURE(status)) {
189 return;
190 }
191 if (listSize >= listMax) {
192 int32_t newMax = listMax + STRING_LIST_BUFFER_SIZE;
193 UnicodeString *newStrings = new UnicodeString[newMax];
194 if (newStrings == NULL) {
195 status = U_MEMORY_ALLOCATION_ERROR;
196 return;
197 }
198 for (int32_t i=0; i<listSize; ++i) {
199 newStrings[i] = strings[i];
200 }
201 delete[] strings;
202 strings = newStrings;
203 listMax = newMax;
204 }
205
206 // The ctor initialized all the strings in
207 // the array to empty strings, so this
208 // is the same as copying the source string.
209 strings[listSize++].append(*string);
210}
211
212void StringList::add(const UChar *chars, int32_t count, UErrorCode &status)
213{
214 const UnicodeString string(chars, count);
215
216 add(&string, status);
217}
218
219const UnicodeString *StringList::get(int32_t index) const
220{
221 if (index >= 0 && index < listSize) {
222 return &strings[index];
223 }
224
225 return NULL;
226}
227
228int32_t StringList::size() const
229{
230 return listSize;
231}
232
233
234U_CDECL_BEGIN
235static void U_CALLCONV
236deleteStringList(void *obj)
237{
238 StringList *strings = (StringList *) obj;
239
240 delete strings;
241}
242U_CDECL_END
243
244class CEToStringsMap
245{
246public:
247 CEToStringsMap(UErrorCode &status);
248 ~CEToStringsMap();
249
250 void put(uint32_t ce, UnicodeString *string, UErrorCode &status);
251 StringList *getStringList(uint32_t ce) const;
252
253private:
254 void putStringList(uint32_t ce, StringList *stringList, UErrorCode &status);
255 UHashtable *map;
256};
257
258CEToStringsMap::CEToStringsMap(UErrorCode &status)
259 : map(NULL)
260{
261 if (U_FAILURE(status)) {
262 return;
263 }
264
265 map = uhash_open(uhash_hashLong, uhash_compareLong,
266 uhash_compareCaselessUnicodeString,
267 &status);
268
269 if (U_FAILURE(status)) {
270 return;
271 }
272
273 uhash_setValueDeleter(map, deleteStringList);
274}
275
276CEToStringsMap::~CEToStringsMap()
277{
278 uhash_close(map);
279}
280
281void CEToStringsMap::put(uint32_t ce, UnicodeString *string, UErrorCode &status)
282{
283 StringList *strings = getStringList(ce);
284
285 if (strings == NULL) {
286 strings = new StringList(status);
287
288 if (strings == NULL || U_FAILURE(status)) {
289 status = U_MEMORY_ALLOCATION_ERROR;
290 return;
291 }
292
293 putStringList(ce, strings, status);
294 }
295
296 strings->add(string, status);
297}
298
299StringList *CEToStringsMap::getStringList(uint32_t ce) const
300{
301 return (StringList *) uhash_iget(map, ce);
302}
303
304void CEToStringsMap::putStringList(uint32_t ce, StringList *stringList, UErrorCode &status)
305{
306 uhash_iput(map, ce, (void *) stringList, &status);
307}
308
309#define CLONE_COLLATOR
310
311CollData::CollData(UCollator *collator, UErrorCode &status)
312 : coll(NULL), ceToCharsStartingWith(NULL)
313{
314 // [:c:] == [[:cn:][:cc:][:co:][:cf:][:cs:]]
315 // i.e. other, control, private use, format, surrogate
316 U_STRING_DECL(test_pattern, "[[:assigned:]-[:c:]]", 20);
317 U_STRING_INIT(test_pattern, "[[:assigned:]-[:c:]]", 20);
318 USet *charsToTest = uset_openPattern(test_pattern, 20, &status);
319
320 // Han ext. A, Han, Jamo, Hangul, Han Ext. B
321 // i.e. all the characers we handle implicitly
322 U_STRING_DECL(remove_pattern, "[[\\u3400-\\u9FFF][\\u1100-\\u11F9][\\uAC00-\\uD7AF][\\U00020000-\\U0002A6DF]]", 70);
323 U_STRING_INIT(remove_pattern, "[[\\u3400-\\u9FFF][\\u1100-\\u11F9][\\uAC00-\\uD7AF][\\U00020000-\\U0002A6DF]]", 70);
324 USet *charsToRemove = uset_openPattern(remove_pattern, 70, &status);
325
326 if (U_FAILURE(status)) {
327 return;
328 }
329
330 USet *expansions = uset_openEmpty();
331 USet *contractions = uset_openEmpty();
332 int32_t itemCount;
333
334 ceToCharsStartingWith = new CEToStringsMap(status);
335
336 if (U_FAILURE(status)) {
337 goto bail;
338 }
339
340#ifdef CLONE_COLLATOR
341 coll = ucol_safeClone(collator, NULL, NULL, &status);
342
343 if (U_FAILURE(status)) {
344 goto bail;
345 }
346#else
347 coll = collator;
348#endif
349
350 ucol_getContractionsAndExpansions(coll, contractions, expansions, FALSE, &status);
351
352 uset_addAll(charsToTest, contractions);
353 uset_addAll(charsToTest, expansions);
354 uset_removeAll(charsToTest, charsToRemove);
355
356 itemCount = uset_getItemCount(charsToTest);
357 for(int32_t item = 0; item < itemCount; item += 1) {
358 UChar32 start = 0, end = 0;
359 UChar buffer[16];
360 int32_t len = uset_getItem(charsToTest, item, &start, &end,
361 buffer, 16, &status);
362
363 if (len == 0) {
364 for (UChar32 ch = start; ch <= end; ch += 1) {
365 UnicodeString *st = new UnicodeString(ch);
366
367 if (st == NULL) {
368 status = U_MEMORY_ALLOCATION_ERROR;
369 break;
370 }
371
372 CEList *ceList = new CEList(coll, *st, status);
373
374 ceToCharsStartingWith->put(ceList->get(0), st, status);
375
376 delete ceList;
377 delete st;
378 }
379 } else if (len > 0) {
380 UnicodeString *st = new UnicodeString(buffer, len);
381
382 if (st == NULL) {
383 status = U_MEMORY_ALLOCATION_ERROR;
384 break;
385 }
386
387 CEList *ceList = new CEList(coll, *st, status);
388
389 ceToCharsStartingWith->put(ceList->get(0), st, status);
390
391 delete ceList;
392 delete st;
393 } else {
394 // shouldn't happen...
395 }
396
397 if (U_FAILURE(status)) {
398 break;
399 }
400 }
401
402bail:
403 uset_close(contractions);
404 uset_close(expansions);
405 uset_close(charsToRemove);
406 uset_close(charsToTest);
407
408 if (U_FAILURE(status)) {
409 return;
410 }
411
412 UnicodeSet hanRanges(UNICODE_STRING_SIMPLE("[:Unified_Ideograph:]"), status);
413 if (U_FAILURE(status)) {
414 return;
415 }
416 UnicodeSetIterator hanIter(hanRanges);
417 UnicodeString hanString;
418 while(hanIter.nextRange()) {
419 hanString.append(hanIter.getCodepoint());
420 hanString.append(hanIter.getCodepointEnd());
421 }
422 // TODO: Why U+11FF? The old code had an outdated UCOL_LAST_T_JAMO=0x11F9,
423 // but as of Unicode 6.3 the 11xx block is filled,
424 // and there are also more Jamo T at U+D7CB..U+D7FB.
425 // Maybe use [:HST=T:] and look for the end of the last range?
426 // Maybe use script boundary mappings instead of this code??
427 UChar jamoRanges[] = {Hangul::JAMO_L_BASE, Hangul::JAMO_V_BASE, Hangul::JAMO_T_BASE + 1, 0x11FF};
428 UnicodeString jamoString(FALSE, jamoRanges, UPRV_LENGTHOF(jamoRanges));
429 CEList hanList(coll, hanString, status);
430 CEList jamoList(coll, jamoString, status);
431 int32_t j = 0;
432
433 if (U_FAILURE(status)) {
434 return;
435 }
436
437 for (int32_t c = 0; c < jamoList.size(); c += 1) {
438 uint32_t jce = jamoList[c];
439
440 if (! isContinuation(jce)) {
441 jamoLimits[j++] = jce;
442 }
443 }
444
445 jamoLimits[3] += (1 << UCOL_PRIMARYORDERSHIFT);
446
447 minHan = 0xFFFFFFFF;
448 maxHan = 0;
449
450 for(int32_t h = 0; h < hanList.size(); h += 2) {
451 uint32_t han = (uint32_t) hanList[h];
452
453 if (han < minHan) {
454 minHan = han;
455 }
456
457 if (han > maxHan) {
458 maxHan = han;
459 }
460 }
461
462 maxHan += (1 << UCOL_PRIMARYORDERSHIFT);
463}
464
465CollData::~CollData()
466{
467#ifdef CLONE_COLLATOR
468 ucol_close(coll);
469#endif
470
471 delete ceToCharsStartingWith;
472}
473
474UCollator *CollData::getCollator() const
475{
476 return coll;
477}
478
479const StringList *CollData::getStringList(int32_t ce) const
480{
481 return ceToCharsStartingWith->getStringList(ce);
482}
483
484const CEList *CollData::getCEList(const UnicodeString *string) const
485{
486 UErrorCode status = U_ZERO_ERROR;
487 const CEList *list = new CEList(coll, *string, status);
488
489 if (U_FAILURE(status)) {
490 delete list;
491 list = NULL;
492 }
493
494 return list;
495}
496
497void CollData::freeCEList(const CEList *list)
498{
499 delete list;
500}
501
502int32_t CollData::minLengthInChars(const CEList *ceList, int32_t offset, int32_t *history) const
503{
504 // find out shortest string for the longest sequence of ces.
505 // this can probably be folded with the minLengthCache...
506
507 if (history[offset] >= 0) {
508 return history[offset];
509 }
510
511 uint32_t ce = ceList->get(offset);
512 int32_t maxOffset = ceList->size();
513 int32_t shortestLength = INT32_MAX;
514 const StringList *strings = ceToCharsStartingWith->getStringList(ce);
515
516 if (strings != NULL) {
517 int32_t stringCount = strings->size();
518
519 for (int32_t s = 0; s < stringCount; s += 1) {
520 const UnicodeString *string = strings->get(s);
521 UErrorCode status = U_ZERO_ERROR;
522 const CEList *ceList2 = new CEList(coll, *string, status);
523
524 if (U_FAILURE(status)) {
525 delete ceList2;
526 ceList2 = NULL;
527 }
528
529 if (ceList->matchesAt(offset, ceList2)) {
530 U_ASSERT(ceList2 != NULL);
531 int32_t clength = ceList2->size();
532 int32_t slength = string->length();
533 int32_t roffset = offset + clength;
534 int32_t rlength = 0;
535
536 if (roffset < maxOffset) {
537 rlength = minLengthInChars(ceList, roffset, history);
538
539 if (rlength <= 0) {
540 // delete before continue to avoid memory leak.
541 delete ceList2;
542
543 // ignore any dead ends
544 continue;
545 }
546 }
547
548 if (shortestLength > slength + rlength) {
549 shortestLength = slength + rlength;
550 }
551 }
552
553 delete ceList2;
554 }
555 }
556
557 if (shortestLength == INT32_MAX) {
558 // No matching strings at this offset. See if
559 // the CE is in a range we can handle manually.
560 if (ce >= minHan && ce < maxHan) {
561 // all han have implicit orders which
562 // generate two CEs.
563 int32_t roffset = offset + 2;
564 int32_t rlength = 0;
565
566 //history[roffset++] = -1;
567 //history[roffset++] = 1;
568
569 if (roffset < maxOffset) {
570 rlength = minLengthInChars(ceList, roffset, history);
571 }
572
573 if (rlength < 0) {
574 return -1;
575 }
576
577 shortestLength = 1 + rlength;
578 goto have_shortest;
579 } else if (ce >= jamoLimits[0] && ce < jamoLimits[3]) {
580 int32_t roffset = offset;
581 int32_t rlength = 0;
582
583 // **** this loop may not handle archaic Hangul correctly ****
584 for (int32_t j = 0; roffset < maxOffset && j < 4; j += 1, roffset += 1) {
585 uint32_t jce = ceList->get(roffset);
586
587 // Some Jamo have 24-bit primary order; skip the
588 // 2nd CE. This should always be OK because if
589 // we're still in the loop all we've seen are
590 // a series of Jamo in LVT order.
591 if (isContinuation(jce)) {
592 continue;
593 }
594
595 if (j >= 3 || jce < jamoLimits[j] || jce >= jamoLimits[j + 1]) {
596 break;
597 }
598 }
599
600 if (roffset == offset) {
601 // we started with a non-L Jamo...
602 // just say it comes from a single character
603 roffset += 1;
604
605 // See if the single Jamo has a 24-bit order.
606 if (roffset < maxOffset && isContinuation(ceList->get(roffset))) {
607 roffset += 1;
608 }
609 }
610
611 if (roffset < maxOffset) {
612 rlength = minLengthInChars(ceList, roffset, history);
613 }
614
615 if (rlength < 0) {
616 return -1;
617 }
618
619 shortestLength = 1 + rlength;
620 goto have_shortest;
621 }
622
623 // Can't handle it manually either. Just move on.
624 return -1;
625 }
626
627have_shortest:
628 history[offset] = shortestLength;
629
630 return shortestLength;
631}
632
633int32_t CollData::minLengthInChars(const CEList *ceList, int32_t offset) const
634{
635 int32_t clength = ceList->size();
636 int32_t *history = NEW_ARRAY(int32_t, clength);
637
638 for (int32_t i = 0; i < clength; i += 1) {
639 history[i] = -1;
640 }
641
642 int32_t minLength = minLengthInChars(ceList, offset, history);
643
644 DELETE_ARRAY(history);
645
646 return minLength;
647}
648
649#endif // #if !UCONFIG_NO_COLLATION