]> git.saurik.com Git - apple/icu.git/blame - icuSources/test/intltest/colldata.cpp
ICU-62135.0.1.tar.gz
[apple/icu.git] / icuSources / test / intltest / colldata.cpp
CommitLineData
f3c0d7a5
A
1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
729e4ab9
A
3/*
4 ******************************************************************************
2ca993e8 5 * Copyright (C) 1996-2016, International Business Machines
57a6839d 6 * Corporation and others. All Rights Reserved.
729e4ab9
A
7 ******************************************************************************
8 */
9
10#include "unicode/utypes.h"
11
12#if !UCONFIG_NO_COLLATION
13
14#include "unicode/unistr.h"
729e4ab9
A
15#include "unicode/usearch.h"
16
17#include "cmemory.h"
18#include "unicode/coll.h"
19#include "unicode/tblcoll.h"
20#include "unicode/coleitr.h"
21#include "unicode/ucoleitr.h"
22
23#include "unicode/regex.h" // TODO: make conditional on regexp being built.
24
25#include "unicode/uniset.h"
26#include "unicode/uset.h"
57a6839d 27#include "unicode/usetiter.h"
729e4ab9
A
28#include "unicode/ustring.h"
29#include "hash.h"
57a6839d 30#include "normalizer2impl.h"
729e4ab9 31#include "uhash.h"
57a6839d 32#include "usrchimp.h"
4388f060 33#include "uassert.h"
729e4ab9 34
51004dcb 35#include "colldata.h"
729e4ab9 36
a62d09fc 37#define NEW_ARRAY(type, count) (type *) uprv_malloc((size_t)(count) * sizeof(type))
729e4ab9 38#define DELETE_ARRAY(array) uprv_free((void *) (array))
a62d09fc 39#define ARRAY_COPY(dst, src, count) uprv_memcpy((void *) (dst), (void *) (src), (size_t)(count) * sizeof (src)[0])
729e4ab9 40
729e4ab9
A
41CEList::CEList(UCollator *coll, const UnicodeString &string, UErrorCode &status)
42 : ces(NULL), listMax(CELIST_BUFFER_SIZE), listSize(0)
43{
44 UCollationElements *elems = ucol_openElements(coll, string.getBuffer(), string.length(), &status);
45 UCollationStrength strength = ucol_getStrength(coll);
46 UBool toShift = ucol_getAttribute(coll, UCOL_ALTERNATE_HANDLING, &status) == UCOL_SHIFTED;
47 uint32_t variableTop = ucol_getVariableTop(coll, &status);
48 uint32_t strengthMask = 0;
49 int32_t order;
50
51 if (U_FAILURE(status)) {
52 return;
53 }
54
55 // **** only set flag if string has Han(gul) ****
57a6839d 56 // ucol_forceHanImplicit(elems, &status); -- removed for ticket #10476
729e4ab9
A
57
58 switch (strength)
59 {
60 default:
61 strengthMask |= UCOL_TERTIARYORDERMASK;
2ca993e8 62 U_FALLTHROUGH;
729e4ab9
A
63 case UCOL_SECONDARY:
64 strengthMask |= UCOL_SECONDARYORDERMASK;
2ca993e8 65 U_FALLTHROUGH;
729e4ab9
A
66 case UCOL_PRIMARY:
67 strengthMask |= UCOL_PRIMARYORDERMASK;
68 }
69
729e4ab9
A
70 ces = ceBuffer;
71
72 while ((order = ucol_next(elems, &status)) != UCOL_NULLORDER) {
73 UBool cont = isContinuation(order);
74
75 order &= strengthMask;
76
77 if (toShift && variableTop > (uint32_t)order && (order & UCOL_PRIMARYORDERMASK) != 0) {
78 if (strength >= UCOL_QUATERNARY) {
79 order &= UCOL_PRIMARYORDERMASK;
80 } else {
81 order = UCOL_IGNORABLE;
82 }
83 }
84
85 if (order == UCOL_IGNORABLE) {
86 continue;
87 }
88
89 if (cont) {
90 order |= UCOL_CONTINUATION_MARKER;
91 }
92
93 add(order, status);
94 }
95
96 ucol_closeElements(elems);
97}
98
99CEList::~CEList()
100{
729e4ab9
A
101 if (ces != ceBuffer) {
102 DELETE_ARRAY(ces);
103 }
104}
105
106void CEList::add(uint32_t ce, UErrorCode &status)
107{
108 if (U_FAILURE(status)) {
109 return;
110 }
111
112 if (listSize >= listMax) {
113 int32_t newMax = listMax + CELIST_BUFFER_SIZE;
729e4ab9
A
114 uint32_t *newCEs = NEW_ARRAY(uint32_t, newMax);
115
116 if (newCEs == NULL) {
117 status = U_MEMORY_ALLOCATION_ERROR;
118 return;
119 }
120
121 uprv_memcpy(newCEs, ces, listSize * sizeof(uint32_t));
122
123 if (ces != ceBuffer) {
124 DELETE_ARRAY(ces);
125 }
126
127 ces = newCEs;
128 listMax = newMax;
129 }
130
131 ces[listSize++] = ce;
132}
133
134uint32_t CEList::get(int32_t index) const
135{
136 if (index >= 0 && index < listSize) {
137 return ces[index];
138 }
139
51004dcb 140 return (uint32_t)UCOL_NULLORDER;
729e4ab9
A
141}
142
143uint32_t &CEList::operator[](int32_t index) const
144{
145 return ces[index];
146}
147
148UBool CEList::matchesAt(int32_t offset, const CEList *other) const
149{
150 if (other == NULL || listSize - offset < other->size()) {
151 return FALSE;
152 }
153
154 for (int32_t i = offset, j = 0; j < other->size(); i += 1, j += 1) {
155 if (ces[i] != (*other)[j]) {
156 return FALSE;
157 }
158 }
159
160 return TRUE;
161}
162
163int32_t CEList::size() const
164{
165 return listSize;
166}
167
729e4ab9
A
168StringList::StringList(UErrorCode &status)
169 : strings(NULL), listMax(STRING_LIST_BUFFER_SIZE), listSize(0)
170{
171 if (U_FAILURE(status)) {
172 return;
173 }
174
175 strings = new UnicodeString [listMax];
176
177 if (strings == NULL) {
178 status = U_MEMORY_ALLOCATION_ERROR;
179 return;
180 }
729e4ab9
A
181}
182
183StringList::~StringList()
184{
185 delete[] strings;
186}
187
188void StringList::add(const UnicodeString *string, UErrorCode &status)
189{
190 if (U_FAILURE(status)) {
191 return;
192 }
729e4ab9
A
193 if (listSize >= listMax) {
194 int32_t newMax = listMax + STRING_LIST_BUFFER_SIZE;
729e4ab9 195 UnicodeString *newStrings = new UnicodeString[newMax];
4388f060
A
196 if (newStrings == NULL) {
197 status = U_MEMORY_ALLOCATION_ERROR;
198 return;
199 }
200 for (int32_t i=0; i<listSize; ++i) {
201 newStrings[i] = strings[i];
202 }
729e4ab9
A
203 delete[] strings;
204 strings = newStrings;
205 listMax = newMax;
206 }
207
208 // The ctor initialized all the strings in
209 // the array to empty strings, so this
210 // is the same as copying the source string.
211 strings[listSize++].append(*string);
212}
213
214void StringList::add(const UChar *chars, int32_t count, UErrorCode &status)
215{
216 const UnicodeString string(chars, count);
217
218 add(&string, status);
219}
220
221const UnicodeString *StringList::get(int32_t index) const
222{
223 if (index >= 0 && index < listSize) {
224 return &strings[index];
225 }
226
227 return NULL;
228}
229
230int32_t StringList::size() const
231{
232 return listSize;
233}
234
235
51004dcb
A
236U_CDECL_BEGIN
237static void U_CALLCONV
238deleteStringList(void *obj)
239{
240 StringList *strings = (StringList *) obj;
241
242 delete strings;
243}
244U_CDECL_END
729e4ab9 245
51004dcb 246class CEToStringsMap
729e4ab9
A
247{
248public:
729e4ab9
A
249 CEToStringsMap(UErrorCode &status);
250 ~CEToStringsMap();
251
252 void put(uint32_t ce, UnicodeString *string, UErrorCode &status);
253 StringList *getStringList(uint32_t ce) const;
254
255private:
729e4ab9
A
256 void putStringList(uint32_t ce, StringList *stringList, UErrorCode &status);
257 UHashtable *map;
258};
259
260CEToStringsMap::CEToStringsMap(UErrorCode &status)
261 : map(NULL)
262{
263 if (U_FAILURE(status)) {
264 return;
265 }
266
267 map = uhash_open(uhash_hashLong, uhash_compareLong,
268 uhash_compareCaselessUnicodeString,
269 &status);
270
271 if (U_FAILURE(status)) {
272 return;
273 }
274
275 uhash_setValueDeleter(map, deleteStringList);
276}
277
278CEToStringsMap::~CEToStringsMap()
279{
280 uhash_close(map);
281}
282
283void CEToStringsMap::put(uint32_t ce, UnicodeString *string, UErrorCode &status)
284{
285 StringList *strings = getStringList(ce);
286
287 if (strings == NULL) {
288 strings = new StringList(status);
289
290 if (strings == NULL || U_FAILURE(status)) {
291 status = U_MEMORY_ALLOCATION_ERROR;
292 return;
293 }
294
295 putStringList(ce, strings, status);
296 }
297
298 strings->add(string, status);
299}
300
301StringList *CEToStringsMap::getStringList(uint32_t ce) const
302{
303 return (StringList *) uhash_iget(map, ce);
304}
305
306void CEToStringsMap::putStringList(uint32_t ce, StringList *stringList, UErrorCode &status)
307{
308 uhash_iput(map, ce, (void *) stringList, &status);
309}
310
729e4ab9
A
311#define CLONE_COLLATOR
312
51004dcb
A
313CollData::CollData(UCollator *collator, UErrorCode &status)
314 : coll(NULL), ceToCharsStartingWith(NULL)
729e4ab9
A
315{
316 // [:c:] == [[:cn:][:cc:][:co:][:cf:][:cs:]]
317 // i.e. other, control, private use, format, surrogate
318 U_STRING_DECL(test_pattern, "[[:assigned:]-[:c:]]", 20);
319 U_STRING_INIT(test_pattern, "[[:assigned:]-[:c:]]", 20);
320 USet *charsToTest = uset_openPattern(test_pattern, 20, &status);
321
322 // Han ext. A, Han, Jamo, Hangul, Han Ext. B
323 // i.e. all the characers we handle implicitly
324 U_STRING_DECL(remove_pattern, "[[\\u3400-\\u9FFF][\\u1100-\\u11F9][\\uAC00-\\uD7AF][\\U00020000-\\U0002A6DF]]", 70);
325 U_STRING_INIT(remove_pattern, "[[\\u3400-\\u9FFF][\\u1100-\\u11F9][\\uAC00-\\uD7AF][\\U00020000-\\U0002A6DF]]", 70);
326 USet *charsToRemove = uset_openPattern(remove_pattern, 70, &status);
327
328 if (U_FAILURE(status)) {
329 return;
330 }
331
332 USet *expansions = uset_openEmpty();
333 USet *contractions = uset_openEmpty();
334 int32_t itemCount;
335
729e4ab9
A
336 ceToCharsStartingWith = new CEToStringsMap(status);
337
338 if (U_FAILURE(status)) {
339 goto bail;
340 }
341
729e4ab9
A
342#ifdef CLONE_COLLATOR
343 coll = ucol_safeClone(collator, NULL, NULL, &status);
344
345 if (U_FAILURE(status)) {
346 goto bail;
347 }
348#else
349 coll = collator;
350#endif
351
352 ucol_getContractionsAndExpansions(coll, contractions, expansions, FALSE, &status);
353
354 uset_addAll(charsToTest, contractions);
355 uset_addAll(charsToTest, expansions);
356 uset_removeAll(charsToTest, charsToRemove);
357
358 itemCount = uset_getItemCount(charsToTest);
359 for(int32_t item = 0; item < itemCount; item += 1) {
360 UChar32 start = 0, end = 0;
361 UChar buffer[16];
362 int32_t len = uset_getItem(charsToTest, item, &start, &end,
363 buffer, 16, &status);
364
365 if (len == 0) {
366 for (UChar32 ch = start; ch <= end; ch += 1) {
367 UnicodeString *st = new UnicodeString(ch);
368
369 if (st == NULL) {
370 status = U_MEMORY_ALLOCATION_ERROR;
371 break;
372 }
373
374 CEList *ceList = new CEList(coll, *st, status);
375
376 ceToCharsStartingWith->put(ceList->get(0), st, status);
377
729e4ab9
A
378 delete ceList;
379 delete st;
729e4ab9
A
380 }
381 } else if (len > 0) {
382 UnicodeString *st = new UnicodeString(buffer, len);
383
384 if (st == NULL) {
385 status = U_MEMORY_ALLOCATION_ERROR;
386 break;
387 }
388
389 CEList *ceList = new CEList(coll, *st, status);
390
391 ceToCharsStartingWith->put(ceList->get(0), st, status);
392
729e4ab9
A
393 delete ceList;
394 delete st;
729e4ab9
A
395 } else {
396 // shouldn't happen...
397 }
398
399 if (U_FAILURE(status)) {
400 break;
401 }
402 }
403
404bail:
405 uset_close(contractions);
406 uset_close(expansions);
407 uset_close(charsToRemove);
408 uset_close(charsToTest);
409
410 if (U_FAILURE(status)) {
411 return;
412 }
413
57a6839d
A
414 UnicodeSet hanRanges(UNICODE_STRING_SIMPLE("[:Unified_Ideograph:]"), status);
415 if (U_FAILURE(status)) {
416 return;
417 }
418 UnicodeSetIterator hanIter(hanRanges);
419 UnicodeString hanString;
420 while(hanIter.nextRange()) {
421 hanString.append(hanIter.getCodepoint());
422 hanString.append(hanIter.getCodepointEnd());
423 }
424 // TODO: Why U+11FF? The old code had an outdated UCOL_LAST_T_JAMO=0x11F9,
425 // but as of Unicode 6.3 the 11xx block is filled,
426 // and there are also more Jamo T at U+D7CB..U+D7FB.
427 // Maybe use [:HST=T:] and look for the end of the last range?
428 // Maybe use script boundary mappings instead of this code??
429 UChar jamoRanges[] = {Hangul::JAMO_L_BASE, Hangul::JAMO_V_BASE, Hangul::JAMO_T_BASE + 1, 0x11FF};
2ca993e8 430 UnicodeString jamoString(FALSE, jamoRanges, UPRV_LENGTHOF(jamoRanges));
729e4ab9
A
431 CEList hanList(coll, hanString, status);
432 CEList jamoList(coll, jamoString, status);
433 int32_t j = 0;
434
435 if (U_FAILURE(status)) {
436 return;
437 }
438
439 for (int32_t c = 0; c < jamoList.size(); c += 1) {
440 uint32_t jce = jamoList[c];
441
442 if (! isContinuation(jce)) {
443 jamoLimits[j++] = jce;
444 }
445 }
446
447 jamoLimits[3] += (1 << UCOL_PRIMARYORDERSHIFT);
448
449 minHan = 0xFFFFFFFF;
450 maxHan = 0;
451
452 for(int32_t h = 0; h < hanList.size(); h += 2) {
453 uint32_t han = (uint32_t) hanList[h];
454
455 if (han < minHan) {
456 minHan = han;
457 }
458
459 if (han > maxHan) {
460 maxHan = han;
461 }
462 }
463
464 maxHan += (1 << UCOL_PRIMARYORDERSHIFT);
465}
466
467CollData::~CollData()
468{
469#ifdef CLONE_COLLATOR
470 ucol_close(coll);
471#endif
472
729e4ab9 473 delete ceToCharsStartingWith;
729e4ab9
A
474}
475
476UCollator *CollData::getCollator() const
477{
478 return coll;
479}
480
481const StringList *CollData::getStringList(int32_t ce) const
482{
483 return ceToCharsStartingWith->getStringList(ce);
484}
485
486const CEList *CollData::getCEList(const UnicodeString *string) const
487{
729e4ab9
A
488 UErrorCode status = U_ZERO_ERROR;
489 const CEList *list = new CEList(coll, *string, status);
490
491 if (U_FAILURE(status)) {
492 delete list;
493 list = NULL;
494 }
495
496 return list;
729e4ab9
A
497}
498
499void CollData::freeCEList(const CEList *list)
500{
729e4ab9 501 delete list;
729e4ab9
A
502}
503
504int32_t CollData::minLengthInChars(const CEList *ceList, int32_t offset, int32_t *history) const
505{
506 // find out shortest string for the longest sequence of ces.
507 // this can probably be folded with the minLengthCache...
508
509 if (history[offset] >= 0) {
510 return history[offset];
511 }
512
513 uint32_t ce = ceList->get(offset);
514 int32_t maxOffset = ceList->size();
515 int32_t shortestLength = INT32_MAX;
516 const StringList *strings = ceToCharsStartingWith->getStringList(ce);
517
518 if (strings != NULL) {
519 int32_t stringCount = strings->size();
520
521 for (int32_t s = 0; s < stringCount; s += 1) {
522 const UnicodeString *string = strings->get(s);
729e4ab9
A
523 UErrorCode status = U_ZERO_ERROR;
524 const CEList *ceList2 = new CEList(coll, *string, status);
525
526 if (U_FAILURE(status)) {
527 delete ceList2;
528 ceList2 = NULL;
529 }
729e4ab9
A
530
531 if (ceList->matchesAt(offset, ceList2)) {
4388f060 532 U_ASSERT(ceList2 != NULL);
729e4ab9
A
533 int32_t clength = ceList2->size();
534 int32_t slength = string->length();
535 int32_t roffset = offset + clength;
536 int32_t rlength = 0;
537
538 if (roffset < maxOffset) {
539 rlength = minLengthInChars(ceList, roffset, history);
540
541 if (rlength <= 0) {
542 // delete before continue to avoid memory leak.
729e4ab9 543 delete ceList2;
51004dcb 544
729e4ab9
A
545 // ignore any dead ends
546 continue;
547 }
548 }
549
550 if (shortestLength > slength + rlength) {
551 shortestLength = slength + rlength;
552 }
553 }
554
729e4ab9 555 delete ceList2;
729e4ab9
A
556 }
557 }
558
559 if (shortestLength == INT32_MAX) {
560 // No matching strings at this offset. See if
561 // the CE is in a range we can handle manually.
562 if (ce >= minHan && ce < maxHan) {
563 // all han have implicit orders which
564 // generate two CEs.
565 int32_t roffset = offset + 2;
566 int32_t rlength = 0;
567
568 //history[roffset++] = -1;
569 //history[roffset++] = 1;
570
571 if (roffset < maxOffset) {
572 rlength = minLengthInChars(ceList, roffset, history);
573 }
574
575 if (rlength < 0) {
576 return -1;
577 }
578
579 shortestLength = 1 + rlength;
580 goto have_shortest;
581 } else if (ce >= jamoLimits[0] && ce < jamoLimits[3]) {
582 int32_t roffset = offset;
583 int32_t rlength = 0;
584
585 // **** this loop may not handle archaic Hangul correctly ****
586 for (int32_t j = 0; roffset < maxOffset && j < 4; j += 1, roffset += 1) {
587 uint32_t jce = ceList->get(roffset);
588
589 // Some Jamo have 24-bit primary order; skip the
590 // 2nd CE. This should always be OK because if
591 // we're still in the loop all we've seen are
592 // a series of Jamo in LVT order.
593 if (isContinuation(jce)) {
594 continue;
595 }
596
597 if (j >= 3 || jce < jamoLimits[j] || jce >= jamoLimits[j + 1]) {
598 break;
599 }
600 }
601
602 if (roffset == offset) {
603 // we started with a non-L Jamo...
604 // just say it comes from a single character
605 roffset += 1;
606
607 // See if the single Jamo has a 24-bit order.
608 if (roffset < maxOffset && isContinuation(ceList->get(roffset))) {
609 roffset += 1;
610 }
611 }
612
613 if (roffset < maxOffset) {
614 rlength = minLengthInChars(ceList, roffset, history);
615 }
616
617 if (rlength < 0) {
618 return -1;
619 }
620
621 shortestLength = 1 + rlength;
622 goto have_shortest;
623 }
624
625 // Can't handle it manually either. Just move on.
626 return -1;
627 }
628
629have_shortest:
630 history[offset] = shortestLength;
631
632 return shortestLength;
633}
634
635int32_t CollData::minLengthInChars(const CEList *ceList, int32_t offset) const
636{
637 int32_t clength = ceList->size();
638 int32_t *history = NEW_ARRAY(int32_t, clength);
639
640 for (int32_t i = 0; i < clength; i += 1) {
641 history[i] = -1;
642 }
643
644 int32_t minLength = minLengthInChars(ceList, offset, history);
645
646 DELETE_ARRAY(history);
647
648 return minLength;
649}
650
729e4ab9 651#endif // #if !UCONFIG_NO_COLLATION