]>
Commit | Line | Data |
---|---|---|
1 | /* | |
2 | ****************************************************************************** | |
3 | * Copyright (C) 1996-2016, International Business Machines | |
4 | * Corporation and others. All Rights Reserved. | |
5 | ****************************************************************************** | |
6 | */ | |
7 | ||
8 | #include "unicode/utypes.h" | |
9 | ||
10 | #if !UCONFIG_NO_COLLATION | |
11 | ||
12 | #include "unicode/unistr.h" | |
13 | #include "unicode/usearch.h" | |
14 | ||
15 | #include "cmemory.h" | |
16 | #include "unicode/coll.h" | |
17 | #include "unicode/tblcoll.h" | |
18 | #include "unicode/coleitr.h" | |
19 | #include "unicode/ucoleitr.h" | |
20 | ||
21 | #include "unicode/regex.h" // TODO: make conditional on regexp being built. | |
22 | ||
23 | #include "unicode/uniset.h" | |
24 | #include "unicode/uset.h" | |
25 | #include "unicode/usetiter.h" | |
26 | #include "unicode/ustring.h" | |
27 | #include "hash.h" | |
28 | #include "normalizer2impl.h" | |
29 | #include "uhash.h" | |
30 | #include "usrchimp.h" | |
31 | #include "uassert.h" | |
32 | ||
33 | #include "colldata.h" | |
34 | ||
35 | #define NEW_ARRAY(type, count) (type *) uprv_malloc((size_t)(count) * sizeof(type)) | |
36 | #define DELETE_ARRAY(array) uprv_free((void *) (array)) | |
37 | #define ARRAY_COPY(dst, src, count) uprv_memcpy((void *) (dst), (void *) (src), (size_t)(count) * sizeof (src)[0]) | |
38 | ||
39 | CEList::CEList(UCollator *coll, const UnicodeString &string, UErrorCode &status) | |
40 | : ces(NULL), listMax(CELIST_BUFFER_SIZE), listSize(0) | |
41 | { | |
42 | UCollationElements *elems = ucol_openElements(coll, string.getBuffer(), string.length(), &status); | |
43 | UCollationStrength strength = ucol_getStrength(coll); | |
44 | UBool toShift = ucol_getAttribute(coll, UCOL_ALTERNATE_HANDLING, &status) == UCOL_SHIFTED; | |
45 | uint32_t variableTop = ucol_getVariableTop(coll, &status); | |
46 | uint32_t strengthMask = 0; | |
47 | int32_t order; | |
48 | ||
49 | if (U_FAILURE(status)) { | |
50 | return; | |
51 | } | |
52 | ||
53 | // **** only set flag if string has Han(gul) **** | |
54 | // ucol_forceHanImplicit(elems, &status); -- removed for ticket #10476 | |
55 | ||
56 | switch (strength) | |
57 | { | |
58 | default: | |
59 | strengthMask |= UCOL_TERTIARYORDERMASK; | |
60 | U_FALLTHROUGH; | |
61 | case UCOL_SECONDARY: | |
62 | strengthMask |= UCOL_SECONDARYORDERMASK; | |
63 | U_FALLTHROUGH; | |
64 | case UCOL_PRIMARY: | |
65 | strengthMask |= UCOL_PRIMARYORDERMASK; | |
66 | } | |
67 | ||
68 | ces = ceBuffer; | |
69 | ||
70 | while ((order = ucol_next(elems, &status)) != UCOL_NULLORDER) { | |
71 | UBool cont = isContinuation(order); | |
72 | ||
73 | order &= strengthMask; | |
74 | ||
75 | if (toShift && variableTop > (uint32_t)order && (order & UCOL_PRIMARYORDERMASK) != 0) { | |
76 | if (strength >= UCOL_QUATERNARY) { | |
77 | order &= UCOL_PRIMARYORDERMASK; | |
78 | } else { | |
79 | order = UCOL_IGNORABLE; | |
80 | } | |
81 | } | |
82 | ||
83 | if (order == UCOL_IGNORABLE) { | |
84 | continue; | |
85 | } | |
86 | ||
87 | if (cont) { | |
88 | order |= UCOL_CONTINUATION_MARKER; | |
89 | } | |
90 | ||
91 | add(order, status); | |
92 | } | |
93 | ||
94 | ucol_closeElements(elems); | |
95 | } | |
96 | ||
97 | CEList::~CEList() | |
98 | { | |
99 | if (ces != ceBuffer) { | |
100 | DELETE_ARRAY(ces); | |
101 | } | |
102 | } | |
103 | ||
104 | void CEList::add(uint32_t ce, UErrorCode &status) | |
105 | { | |
106 | if (U_FAILURE(status)) { | |
107 | return; | |
108 | } | |
109 | ||
110 | if (listSize >= listMax) { | |
111 | int32_t newMax = listMax + CELIST_BUFFER_SIZE; | |
112 | uint32_t *newCEs = NEW_ARRAY(uint32_t, newMax); | |
113 | ||
114 | if (newCEs == NULL) { | |
115 | status = U_MEMORY_ALLOCATION_ERROR; | |
116 | return; | |
117 | } | |
118 | ||
119 | uprv_memcpy(newCEs, ces, listSize * sizeof(uint32_t)); | |
120 | ||
121 | if (ces != ceBuffer) { | |
122 | DELETE_ARRAY(ces); | |
123 | } | |
124 | ||
125 | ces = newCEs; | |
126 | listMax = newMax; | |
127 | } | |
128 | ||
129 | ces[listSize++] = ce; | |
130 | } | |
131 | ||
132 | uint32_t CEList::get(int32_t index) const | |
133 | { | |
134 | if (index >= 0 && index < listSize) { | |
135 | return ces[index]; | |
136 | } | |
137 | ||
138 | return (uint32_t)UCOL_NULLORDER; | |
139 | } | |
140 | ||
141 | uint32_t &CEList::operator[](int32_t index) const | |
142 | { | |
143 | return ces[index]; | |
144 | } | |
145 | ||
146 | UBool CEList::matchesAt(int32_t offset, const CEList *other) const | |
147 | { | |
148 | if (other == NULL || listSize - offset < other->size()) { | |
149 | return FALSE; | |
150 | } | |
151 | ||
152 | for (int32_t i = offset, j = 0; j < other->size(); i += 1, j += 1) { | |
153 | if (ces[i] != (*other)[j]) { | |
154 | return FALSE; | |
155 | } | |
156 | } | |
157 | ||
158 | return TRUE; | |
159 | } | |
160 | ||
161 | int32_t CEList::size() const | |
162 | { | |
163 | return listSize; | |
164 | } | |
165 | ||
166 | StringList::StringList(UErrorCode &status) | |
167 | : strings(NULL), listMax(STRING_LIST_BUFFER_SIZE), listSize(0) | |
168 | { | |
169 | if (U_FAILURE(status)) { | |
170 | return; | |
171 | } | |
172 | ||
173 | strings = new UnicodeString [listMax]; | |
174 | ||
175 | if (strings == NULL) { | |
176 | status = U_MEMORY_ALLOCATION_ERROR; | |
177 | return; | |
178 | } | |
179 | } | |
180 | ||
181 | StringList::~StringList() | |
182 | { | |
183 | delete[] strings; | |
184 | } | |
185 | ||
186 | void StringList::add(const UnicodeString *string, UErrorCode &status) | |
187 | { | |
188 | if (U_FAILURE(status)) { | |
189 | return; | |
190 | } | |
191 | if (listSize >= listMax) { | |
192 | int32_t newMax = listMax + STRING_LIST_BUFFER_SIZE; | |
193 | UnicodeString *newStrings = new UnicodeString[newMax]; | |
194 | if (newStrings == NULL) { | |
195 | status = U_MEMORY_ALLOCATION_ERROR; | |
196 | return; | |
197 | } | |
198 | for (int32_t i=0; i<listSize; ++i) { | |
199 | newStrings[i] = strings[i]; | |
200 | } | |
201 | delete[] strings; | |
202 | strings = newStrings; | |
203 | listMax = newMax; | |
204 | } | |
205 | ||
206 | // The ctor initialized all the strings in | |
207 | // the array to empty strings, so this | |
208 | // is the same as copying the source string. | |
209 | strings[listSize++].append(*string); | |
210 | } | |
211 | ||
212 | void StringList::add(const UChar *chars, int32_t count, UErrorCode &status) | |
213 | { | |
214 | const UnicodeString string(chars, count); | |
215 | ||
216 | add(&string, status); | |
217 | } | |
218 | ||
219 | const UnicodeString *StringList::get(int32_t index) const | |
220 | { | |
221 | if (index >= 0 && index < listSize) { | |
222 | return &strings[index]; | |
223 | } | |
224 | ||
225 | return NULL; | |
226 | } | |
227 | ||
228 | int32_t StringList::size() const | |
229 | { | |
230 | return listSize; | |
231 | } | |
232 | ||
233 | ||
234 | U_CDECL_BEGIN | |
235 | static void U_CALLCONV | |
236 | deleteStringList(void *obj) | |
237 | { | |
238 | StringList *strings = (StringList *) obj; | |
239 | ||
240 | delete strings; | |
241 | } | |
242 | U_CDECL_END | |
243 | ||
244 | class CEToStringsMap | |
245 | { | |
246 | public: | |
247 | CEToStringsMap(UErrorCode &status); | |
248 | ~CEToStringsMap(); | |
249 | ||
250 | void put(uint32_t ce, UnicodeString *string, UErrorCode &status); | |
251 | StringList *getStringList(uint32_t ce) const; | |
252 | ||
253 | private: | |
254 | void putStringList(uint32_t ce, StringList *stringList, UErrorCode &status); | |
255 | UHashtable *map; | |
256 | }; | |
257 | ||
258 | CEToStringsMap::CEToStringsMap(UErrorCode &status) | |
259 | : map(NULL) | |
260 | { | |
261 | if (U_FAILURE(status)) { | |
262 | return; | |
263 | } | |
264 | ||
265 | map = uhash_open(uhash_hashLong, uhash_compareLong, | |
266 | uhash_compareCaselessUnicodeString, | |
267 | &status); | |
268 | ||
269 | if (U_FAILURE(status)) { | |
270 | return; | |
271 | } | |
272 | ||
273 | uhash_setValueDeleter(map, deleteStringList); | |
274 | } | |
275 | ||
276 | CEToStringsMap::~CEToStringsMap() | |
277 | { | |
278 | uhash_close(map); | |
279 | } | |
280 | ||
281 | void CEToStringsMap::put(uint32_t ce, UnicodeString *string, UErrorCode &status) | |
282 | { | |
283 | StringList *strings = getStringList(ce); | |
284 | ||
285 | if (strings == NULL) { | |
286 | strings = new StringList(status); | |
287 | ||
288 | if (strings == NULL || U_FAILURE(status)) { | |
289 | status = U_MEMORY_ALLOCATION_ERROR; | |
290 | return; | |
291 | } | |
292 | ||
293 | putStringList(ce, strings, status); | |
294 | } | |
295 | ||
296 | strings->add(string, status); | |
297 | } | |
298 | ||
299 | StringList *CEToStringsMap::getStringList(uint32_t ce) const | |
300 | { | |
301 | return (StringList *) uhash_iget(map, ce); | |
302 | } | |
303 | ||
304 | void CEToStringsMap::putStringList(uint32_t ce, StringList *stringList, UErrorCode &status) | |
305 | { | |
306 | uhash_iput(map, ce, (void *) stringList, &status); | |
307 | } | |
308 | ||
309 | #define CLONE_COLLATOR | |
310 | ||
311 | CollData::CollData(UCollator *collator, UErrorCode &status) | |
312 | : coll(NULL), ceToCharsStartingWith(NULL) | |
313 | { | |
314 | // [:c:] == [[:cn:][:cc:][:co:][:cf:][:cs:]] | |
315 | // i.e. other, control, private use, format, surrogate | |
316 | U_STRING_DECL(test_pattern, "[[:assigned:]-[:c:]]", 20); | |
317 | U_STRING_INIT(test_pattern, "[[:assigned:]-[:c:]]", 20); | |
318 | USet *charsToTest = uset_openPattern(test_pattern, 20, &status); | |
319 | ||
320 | // Han ext. A, Han, Jamo, Hangul, Han Ext. B | |
321 | // i.e. all the characers we handle implicitly | |
322 | U_STRING_DECL(remove_pattern, "[[\\u3400-\\u9FFF][\\u1100-\\u11F9][\\uAC00-\\uD7AF][\\U00020000-\\U0002A6DF]]", 70); | |
323 | U_STRING_INIT(remove_pattern, "[[\\u3400-\\u9FFF][\\u1100-\\u11F9][\\uAC00-\\uD7AF][\\U00020000-\\U0002A6DF]]", 70); | |
324 | USet *charsToRemove = uset_openPattern(remove_pattern, 70, &status); | |
325 | ||
326 | if (U_FAILURE(status)) { | |
327 | return; | |
328 | } | |
329 | ||
330 | USet *expansions = uset_openEmpty(); | |
331 | USet *contractions = uset_openEmpty(); | |
332 | int32_t itemCount; | |
333 | ||
334 | ceToCharsStartingWith = new CEToStringsMap(status); | |
335 | ||
336 | if (U_FAILURE(status)) { | |
337 | goto bail; | |
338 | } | |
339 | ||
340 | #ifdef CLONE_COLLATOR | |
341 | coll = ucol_safeClone(collator, NULL, NULL, &status); | |
342 | ||
343 | if (U_FAILURE(status)) { | |
344 | goto bail; | |
345 | } | |
346 | #else | |
347 | coll = collator; | |
348 | #endif | |
349 | ||
350 | ucol_getContractionsAndExpansions(coll, contractions, expansions, FALSE, &status); | |
351 | ||
352 | uset_addAll(charsToTest, contractions); | |
353 | uset_addAll(charsToTest, expansions); | |
354 | uset_removeAll(charsToTest, charsToRemove); | |
355 | ||
356 | itemCount = uset_getItemCount(charsToTest); | |
357 | for(int32_t item = 0; item < itemCount; item += 1) { | |
358 | UChar32 start = 0, end = 0; | |
359 | UChar buffer[16]; | |
360 | int32_t len = uset_getItem(charsToTest, item, &start, &end, | |
361 | buffer, 16, &status); | |
362 | ||
363 | if (len == 0) { | |
364 | for (UChar32 ch = start; ch <= end; ch += 1) { | |
365 | UnicodeString *st = new UnicodeString(ch); | |
366 | ||
367 | if (st == NULL) { | |
368 | status = U_MEMORY_ALLOCATION_ERROR; | |
369 | break; | |
370 | } | |
371 | ||
372 | CEList *ceList = new CEList(coll, *st, status); | |
373 | ||
374 | ceToCharsStartingWith->put(ceList->get(0), st, status); | |
375 | ||
376 | delete ceList; | |
377 | delete st; | |
378 | } | |
379 | } else if (len > 0) { | |
380 | UnicodeString *st = new UnicodeString(buffer, len); | |
381 | ||
382 | if (st == NULL) { | |
383 | status = U_MEMORY_ALLOCATION_ERROR; | |
384 | break; | |
385 | } | |
386 | ||
387 | CEList *ceList = new CEList(coll, *st, status); | |
388 | ||
389 | ceToCharsStartingWith->put(ceList->get(0), st, status); | |
390 | ||
391 | delete ceList; | |
392 | delete st; | |
393 | } else { | |
394 | // shouldn't happen... | |
395 | } | |
396 | ||
397 | if (U_FAILURE(status)) { | |
398 | break; | |
399 | } | |
400 | } | |
401 | ||
402 | bail: | |
403 | uset_close(contractions); | |
404 | uset_close(expansions); | |
405 | uset_close(charsToRemove); | |
406 | uset_close(charsToTest); | |
407 | ||
408 | if (U_FAILURE(status)) { | |
409 | return; | |
410 | } | |
411 | ||
412 | UnicodeSet hanRanges(UNICODE_STRING_SIMPLE("[:Unified_Ideograph:]"), status); | |
413 | if (U_FAILURE(status)) { | |
414 | return; | |
415 | } | |
416 | UnicodeSetIterator hanIter(hanRanges); | |
417 | UnicodeString hanString; | |
418 | while(hanIter.nextRange()) { | |
419 | hanString.append(hanIter.getCodepoint()); | |
420 | hanString.append(hanIter.getCodepointEnd()); | |
421 | } | |
422 | // TODO: Why U+11FF? The old code had an outdated UCOL_LAST_T_JAMO=0x11F9, | |
423 | // but as of Unicode 6.3 the 11xx block is filled, | |
424 | // and there are also more Jamo T at U+D7CB..U+D7FB. | |
425 | // Maybe use [:HST=T:] and look for the end of the last range? | |
426 | // Maybe use script boundary mappings instead of this code?? | |
427 | UChar jamoRanges[] = {Hangul::JAMO_L_BASE, Hangul::JAMO_V_BASE, Hangul::JAMO_T_BASE + 1, 0x11FF}; | |
428 | UnicodeString jamoString(FALSE, jamoRanges, UPRV_LENGTHOF(jamoRanges)); | |
429 | CEList hanList(coll, hanString, status); | |
430 | CEList jamoList(coll, jamoString, status); | |
431 | int32_t j = 0; | |
432 | ||
433 | if (U_FAILURE(status)) { | |
434 | return; | |
435 | } | |
436 | ||
437 | for (int32_t c = 0; c < jamoList.size(); c += 1) { | |
438 | uint32_t jce = jamoList[c]; | |
439 | ||
440 | if (! isContinuation(jce)) { | |
441 | jamoLimits[j++] = jce; | |
442 | } | |
443 | } | |
444 | ||
445 | jamoLimits[3] += (1 << UCOL_PRIMARYORDERSHIFT); | |
446 | ||
447 | minHan = 0xFFFFFFFF; | |
448 | maxHan = 0; | |
449 | ||
450 | for(int32_t h = 0; h < hanList.size(); h += 2) { | |
451 | uint32_t han = (uint32_t) hanList[h]; | |
452 | ||
453 | if (han < minHan) { | |
454 | minHan = han; | |
455 | } | |
456 | ||
457 | if (han > maxHan) { | |
458 | maxHan = han; | |
459 | } | |
460 | } | |
461 | ||
462 | maxHan += (1 << UCOL_PRIMARYORDERSHIFT); | |
463 | } | |
464 | ||
465 | CollData::~CollData() | |
466 | { | |
467 | #ifdef CLONE_COLLATOR | |
468 | ucol_close(coll); | |
469 | #endif | |
470 | ||
471 | delete ceToCharsStartingWith; | |
472 | } | |
473 | ||
474 | UCollator *CollData::getCollator() const | |
475 | { | |
476 | return coll; | |
477 | } | |
478 | ||
479 | const StringList *CollData::getStringList(int32_t ce) const | |
480 | { | |
481 | return ceToCharsStartingWith->getStringList(ce); | |
482 | } | |
483 | ||
484 | const CEList *CollData::getCEList(const UnicodeString *string) const | |
485 | { | |
486 | UErrorCode status = U_ZERO_ERROR; | |
487 | const CEList *list = new CEList(coll, *string, status); | |
488 | ||
489 | if (U_FAILURE(status)) { | |
490 | delete list; | |
491 | list = NULL; | |
492 | } | |
493 | ||
494 | return list; | |
495 | } | |
496 | ||
497 | void CollData::freeCEList(const CEList *list) | |
498 | { | |
499 | delete list; | |
500 | } | |
501 | ||
502 | int32_t CollData::minLengthInChars(const CEList *ceList, int32_t offset, int32_t *history) const | |
503 | { | |
504 | // find out shortest string for the longest sequence of ces. | |
505 | // this can probably be folded with the minLengthCache... | |
506 | ||
507 | if (history[offset] >= 0) { | |
508 | return history[offset]; | |
509 | } | |
510 | ||
511 | uint32_t ce = ceList->get(offset); | |
512 | int32_t maxOffset = ceList->size(); | |
513 | int32_t shortestLength = INT32_MAX; | |
514 | const StringList *strings = ceToCharsStartingWith->getStringList(ce); | |
515 | ||
516 | if (strings != NULL) { | |
517 | int32_t stringCount = strings->size(); | |
518 | ||
519 | for (int32_t s = 0; s < stringCount; s += 1) { | |
520 | const UnicodeString *string = strings->get(s); | |
521 | UErrorCode status = U_ZERO_ERROR; | |
522 | const CEList *ceList2 = new CEList(coll, *string, status); | |
523 | ||
524 | if (U_FAILURE(status)) { | |
525 | delete ceList2; | |
526 | ceList2 = NULL; | |
527 | } | |
528 | ||
529 | if (ceList->matchesAt(offset, ceList2)) { | |
530 | U_ASSERT(ceList2 != NULL); | |
531 | int32_t clength = ceList2->size(); | |
532 | int32_t slength = string->length(); | |
533 | int32_t roffset = offset + clength; | |
534 | int32_t rlength = 0; | |
535 | ||
536 | if (roffset < maxOffset) { | |
537 | rlength = minLengthInChars(ceList, roffset, history); | |
538 | ||
539 | if (rlength <= 0) { | |
540 | // delete before continue to avoid memory leak. | |
541 | delete ceList2; | |
542 | ||
543 | // ignore any dead ends | |
544 | continue; | |
545 | } | |
546 | } | |
547 | ||
548 | if (shortestLength > slength + rlength) { | |
549 | shortestLength = slength + rlength; | |
550 | } | |
551 | } | |
552 | ||
553 | delete ceList2; | |
554 | } | |
555 | } | |
556 | ||
557 | if (shortestLength == INT32_MAX) { | |
558 | // No matching strings at this offset. See if | |
559 | // the CE is in a range we can handle manually. | |
560 | if (ce >= minHan && ce < maxHan) { | |
561 | // all han have implicit orders which | |
562 | // generate two CEs. | |
563 | int32_t roffset = offset + 2; | |
564 | int32_t rlength = 0; | |
565 | ||
566 | //history[roffset++] = -1; | |
567 | //history[roffset++] = 1; | |
568 | ||
569 | if (roffset < maxOffset) { | |
570 | rlength = minLengthInChars(ceList, roffset, history); | |
571 | } | |
572 | ||
573 | if (rlength < 0) { | |
574 | return -1; | |
575 | } | |
576 | ||
577 | shortestLength = 1 + rlength; | |
578 | goto have_shortest; | |
579 | } else if (ce >= jamoLimits[0] && ce < jamoLimits[3]) { | |
580 | int32_t roffset = offset; | |
581 | int32_t rlength = 0; | |
582 | ||
583 | // **** this loop may not handle archaic Hangul correctly **** | |
584 | for (int32_t j = 0; roffset < maxOffset && j < 4; j += 1, roffset += 1) { | |
585 | uint32_t jce = ceList->get(roffset); | |
586 | ||
587 | // Some Jamo have 24-bit primary order; skip the | |
588 | // 2nd CE. This should always be OK because if | |
589 | // we're still in the loop all we've seen are | |
590 | // a series of Jamo in LVT order. | |
591 | if (isContinuation(jce)) { | |
592 | continue; | |
593 | } | |
594 | ||
595 | if (j >= 3 || jce < jamoLimits[j] || jce >= jamoLimits[j + 1]) { | |
596 | break; | |
597 | } | |
598 | } | |
599 | ||
600 | if (roffset == offset) { | |
601 | // we started with a non-L Jamo... | |
602 | // just say it comes from a single character | |
603 | roffset += 1; | |
604 | ||
605 | // See if the single Jamo has a 24-bit order. | |
606 | if (roffset < maxOffset && isContinuation(ceList->get(roffset))) { | |
607 | roffset += 1; | |
608 | } | |
609 | } | |
610 | ||
611 | if (roffset < maxOffset) { | |
612 | rlength = minLengthInChars(ceList, roffset, history); | |
613 | } | |
614 | ||
615 | if (rlength < 0) { | |
616 | return -1; | |
617 | } | |
618 | ||
619 | shortestLength = 1 + rlength; | |
620 | goto have_shortest; | |
621 | } | |
622 | ||
623 | // Can't handle it manually either. Just move on. | |
624 | return -1; | |
625 | } | |
626 | ||
627 | have_shortest: | |
628 | history[offset] = shortestLength; | |
629 | ||
630 | return shortestLength; | |
631 | } | |
632 | ||
633 | int32_t CollData::minLengthInChars(const CEList *ceList, int32_t offset) const | |
634 | { | |
635 | int32_t clength = ceList->size(); | |
636 | int32_t *history = NEW_ARRAY(int32_t, clength); | |
637 | ||
638 | for (int32_t i = 0; i < clength; i += 1) { | |
639 | history[i] = -1; | |
640 | } | |
641 | ||
642 | int32_t minLength = minLengthInChars(ceList, offset, history); | |
643 | ||
644 | DELETE_ARRAY(history); | |
645 | ||
646 | return minLength; | |
647 | } | |
648 | ||
649 | #endif // #if !UCONFIG_NO_COLLATION |