]>
Commit | Line | Data |
---|---|---|
1 | /* | |
2 | ****************************************************************************** | |
3 | * Copyright (C) 1996-2012, International Business Machines * | |
4 | * Corporation and others. All Rights Reserved. * | |
5 | ****************************************************************************** | |
6 | */ | |
7 | ||
8 | #include "unicode/utypes.h" | |
9 | ||
10 | #if !UCONFIG_NO_COLLATION | |
11 | ||
12 | #include "unicode/unistr.h" | |
13 | #include "unicode/usearch.h" | |
14 | ||
15 | #include "cmemory.h" | |
16 | #include "unicode/coll.h" | |
17 | #include "unicode/tblcoll.h" | |
18 | #include "unicode/coleitr.h" | |
19 | #include "unicode/ucoleitr.h" | |
20 | ||
21 | #include "unicode/regex.h" // TODO: make conditional on regexp being built. | |
22 | ||
23 | #include "unicode/uniset.h" | |
24 | #include "unicode/uset.h" | |
25 | #include "unicode/ustring.h" | |
26 | #include "hash.h" | |
27 | #include "uhash.h" | |
28 | #include "ucol_imp.h" | |
29 | #include "uassert.h" | |
30 | ||
31 | #include "colldata.h" | |
32 | ||
33 | #define ARRAY_SIZE(array) (sizeof(array)/sizeof(array[0])) | |
34 | #define NEW_ARRAY(type, count) (type *) uprv_malloc((count) * sizeof(type)) | |
35 | #define DELETE_ARRAY(array) uprv_free((void *) (array)) | |
36 | #define ARRAY_COPY(dst, src, count) uprv_memcpy((void *) (dst), (void *) (src), (count) * sizeof (src)[0]) | |
37 | ||
38 | CEList::CEList(UCollator *coll, const UnicodeString &string, UErrorCode &status) | |
39 | : ces(NULL), listMax(CELIST_BUFFER_SIZE), listSize(0) | |
40 | { | |
41 | UCollationElements *elems = ucol_openElements(coll, string.getBuffer(), string.length(), &status); | |
42 | UCollationStrength strength = ucol_getStrength(coll); | |
43 | UBool toShift = ucol_getAttribute(coll, UCOL_ALTERNATE_HANDLING, &status) == UCOL_SHIFTED; | |
44 | uint32_t variableTop = ucol_getVariableTop(coll, &status); | |
45 | uint32_t strengthMask = 0; | |
46 | int32_t order; | |
47 | ||
48 | if (U_FAILURE(status)) { | |
49 | return; | |
50 | } | |
51 | ||
52 | // **** only set flag if string has Han(gul) **** | |
53 | ucol_forceHanImplicit(elems, &status); | |
54 | ||
55 | switch (strength) | |
56 | { | |
57 | default: | |
58 | strengthMask |= UCOL_TERTIARYORDERMASK; | |
59 | /* fall through */ | |
60 | ||
61 | case UCOL_SECONDARY: | |
62 | strengthMask |= UCOL_SECONDARYORDERMASK; | |
63 | /* fall through */ | |
64 | ||
65 | case UCOL_PRIMARY: | |
66 | strengthMask |= UCOL_PRIMARYORDERMASK; | |
67 | } | |
68 | ||
69 | ces = ceBuffer; | |
70 | ||
71 | while ((order = ucol_next(elems, &status)) != UCOL_NULLORDER) { | |
72 | UBool cont = isContinuation(order); | |
73 | ||
74 | order &= strengthMask; | |
75 | ||
76 | if (toShift && variableTop > (uint32_t)order && (order & UCOL_PRIMARYORDERMASK) != 0) { | |
77 | if (strength >= UCOL_QUATERNARY) { | |
78 | order &= UCOL_PRIMARYORDERMASK; | |
79 | } else { | |
80 | order = UCOL_IGNORABLE; | |
81 | } | |
82 | } | |
83 | ||
84 | if (order == UCOL_IGNORABLE) { | |
85 | continue; | |
86 | } | |
87 | ||
88 | if (cont) { | |
89 | order |= UCOL_CONTINUATION_MARKER; | |
90 | } | |
91 | ||
92 | add(order, status); | |
93 | } | |
94 | ||
95 | ucol_closeElements(elems); | |
96 | } | |
97 | ||
98 | CEList::~CEList() | |
99 | { | |
100 | if (ces != ceBuffer) { | |
101 | DELETE_ARRAY(ces); | |
102 | } | |
103 | } | |
104 | ||
105 | void CEList::add(uint32_t ce, UErrorCode &status) | |
106 | { | |
107 | if (U_FAILURE(status)) { | |
108 | return; | |
109 | } | |
110 | ||
111 | if (listSize >= listMax) { | |
112 | int32_t newMax = listMax + CELIST_BUFFER_SIZE; | |
113 | uint32_t *newCEs = NEW_ARRAY(uint32_t, newMax); | |
114 | ||
115 | if (newCEs == NULL) { | |
116 | status = U_MEMORY_ALLOCATION_ERROR; | |
117 | return; | |
118 | } | |
119 | ||
120 | uprv_memcpy(newCEs, ces, listSize * sizeof(uint32_t)); | |
121 | ||
122 | if (ces != ceBuffer) { | |
123 | DELETE_ARRAY(ces); | |
124 | } | |
125 | ||
126 | ces = newCEs; | |
127 | listMax = newMax; | |
128 | } | |
129 | ||
130 | ces[listSize++] = ce; | |
131 | } | |
132 | ||
133 | uint32_t CEList::get(int32_t index) const | |
134 | { | |
135 | if (index >= 0 && index < listSize) { | |
136 | return ces[index]; | |
137 | } | |
138 | ||
139 | return (uint32_t)UCOL_NULLORDER; | |
140 | } | |
141 | ||
142 | uint32_t &CEList::operator[](int32_t index) const | |
143 | { | |
144 | return ces[index]; | |
145 | } | |
146 | ||
147 | UBool CEList::matchesAt(int32_t offset, const CEList *other) const | |
148 | { | |
149 | if (other == NULL || listSize - offset < other->size()) { | |
150 | return FALSE; | |
151 | } | |
152 | ||
153 | for (int32_t i = offset, j = 0; j < other->size(); i += 1, j += 1) { | |
154 | if (ces[i] != (*other)[j]) { | |
155 | return FALSE; | |
156 | } | |
157 | } | |
158 | ||
159 | return TRUE; | |
160 | } | |
161 | ||
162 | int32_t CEList::size() const | |
163 | { | |
164 | return listSize; | |
165 | } | |
166 | ||
167 | StringList::StringList(UErrorCode &status) | |
168 | : strings(NULL), listMax(STRING_LIST_BUFFER_SIZE), listSize(0) | |
169 | { | |
170 | if (U_FAILURE(status)) { | |
171 | return; | |
172 | } | |
173 | ||
174 | strings = new UnicodeString [listMax]; | |
175 | ||
176 | if (strings == NULL) { | |
177 | status = U_MEMORY_ALLOCATION_ERROR; | |
178 | return; | |
179 | } | |
180 | } | |
181 | ||
182 | StringList::~StringList() | |
183 | { | |
184 | delete[] strings; | |
185 | } | |
186 | ||
187 | void StringList::add(const UnicodeString *string, UErrorCode &status) | |
188 | { | |
189 | if (U_FAILURE(status)) { | |
190 | return; | |
191 | } | |
192 | if (listSize >= listMax) { | |
193 | int32_t newMax = listMax + STRING_LIST_BUFFER_SIZE; | |
194 | UnicodeString *newStrings = new UnicodeString[newMax]; | |
195 | if (newStrings == NULL) { | |
196 | status = U_MEMORY_ALLOCATION_ERROR; | |
197 | return; | |
198 | } | |
199 | for (int32_t i=0; i<listSize; ++i) { | |
200 | newStrings[i] = strings[i]; | |
201 | } | |
202 | delete[] strings; | |
203 | strings = newStrings; | |
204 | listMax = newMax; | |
205 | } | |
206 | ||
207 | // The ctor initialized all the strings in | |
208 | // the array to empty strings, so this | |
209 | // is the same as copying the source string. | |
210 | strings[listSize++].append(*string); | |
211 | } | |
212 | ||
213 | void StringList::add(const UChar *chars, int32_t count, UErrorCode &status) | |
214 | { | |
215 | const UnicodeString string(chars, count); | |
216 | ||
217 | add(&string, status); | |
218 | } | |
219 | ||
220 | const UnicodeString *StringList::get(int32_t index) const | |
221 | { | |
222 | if (index >= 0 && index < listSize) { | |
223 | return &strings[index]; | |
224 | } | |
225 | ||
226 | return NULL; | |
227 | } | |
228 | ||
229 | int32_t StringList::size() const | |
230 | { | |
231 | return listSize; | |
232 | } | |
233 | ||
234 | ||
235 | U_CDECL_BEGIN | |
236 | static void U_CALLCONV | |
237 | deleteStringList(void *obj) | |
238 | { | |
239 | StringList *strings = (StringList *) obj; | |
240 | ||
241 | delete strings; | |
242 | } | |
243 | U_CDECL_END | |
244 | ||
245 | class CEToStringsMap | |
246 | { | |
247 | public: | |
248 | CEToStringsMap(UErrorCode &status); | |
249 | ~CEToStringsMap(); | |
250 | ||
251 | void put(uint32_t ce, UnicodeString *string, UErrorCode &status); | |
252 | StringList *getStringList(uint32_t ce) const; | |
253 | ||
254 | private: | |
255 | void putStringList(uint32_t ce, StringList *stringList, UErrorCode &status); | |
256 | UHashtable *map; | |
257 | }; | |
258 | ||
259 | CEToStringsMap::CEToStringsMap(UErrorCode &status) | |
260 | : map(NULL) | |
261 | { | |
262 | if (U_FAILURE(status)) { | |
263 | return; | |
264 | } | |
265 | ||
266 | map = uhash_open(uhash_hashLong, uhash_compareLong, | |
267 | uhash_compareCaselessUnicodeString, | |
268 | &status); | |
269 | ||
270 | if (U_FAILURE(status)) { | |
271 | return; | |
272 | } | |
273 | ||
274 | uhash_setValueDeleter(map, deleteStringList); | |
275 | } | |
276 | ||
277 | CEToStringsMap::~CEToStringsMap() | |
278 | { | |
279 | uhash_close(map); | |
280 | } | |
281 | ||
282 | void CEToStringsMap::put(uint32_t ce, UnicodeString *string, UErrorCode &status) | |
283 | { | |
284 | StringList *strings = getStringList(ce); | |
285 | ||
286 | if (strings == NULL) { | |
287 | strings = new StringList(status); | |
288 | ||
289 | if (strings == NULL || U_FAILURE(status)) { | |
290 | status = U_MEMORY_ALLOCATION_ERROR; | |
291 | return; | |
292 | } | |
293 | ||
294 | putStringList(ce, strings, status); | |
295 | } | |
296 | ||
297 | strings->add(string, status); | |
298 | } | |
299 | ||
300 | StringList *CEToStringsMap::getStringList(uint32_t ce) const | |
301 | { | |
302 | return (StringList *) uhash_iget(map, ce); | |
303 | } | |
304 | ||
305 | void CEToStringsMap::putStringList(uint32_t ce, StringList *stringList, UErrorCode &status) | |
306 | { | |
307 | uhash_iput(map, ce, (void *) stringList, &status); | |
308 | } | |
309 | ||
310 | #define CLONE_COLLATOR | |
311 | ||
312 | CollData::CollData(UCollator *collator, UErrorCode &status) | |
313 | : coll(NULL), ceToCharsStartingWith(NULL) | |
314 | { | |
315 | // [:c:] == [[:cn:][:cc:][:co:][:cf:][:cs:]] | |
316 | // i.e. other, control, private use, format, surrogate | |
317 | U_STRING_DECL(test_pattern, "[[:assigned:]-[:c:]]", 20); | |
318 | U_STRING_INIT(test_pattern, "[[:assigned:]-[:c:]]", 20); | |
319 | USet *charsToTest = uset_openPattern(test_pattern, 20, &status); | |
320 | ||
321 | // Han ext. A, Han, Jamo, Hangul, Han Ext. B | |
322 | // i.e. all the characers we handle implicitly | |
323 | U_STRING_DECL(remove_pattern, "[[\\u3400-\\u9FFF][\\u1100-\\u11F9][\\uAC00-\\uD7AF][\\U00020000-\\U0002A6DF]]", 70); | |
324 | U_STRING_INIT(remove_pattern, "[[\\u3400-\\u9FFF][\\u1100-\\u11F9][\\uAC00-\\uD7AF][\\U00020000-\\U0002A6DF]]", 70); | |
325 | USet *charsToRemove = uset_openPattern(remove_pattern, 70, &status); | |
326 | ||
327 | if (U_FAILURE(status)) { | |
328 | return; | |
329 | } | |
330 | ||
331 | USet *expansions = uset_openEmpty(); | |
332 | USet *contractions = uset_openEmpty(); | |
333 | int32_t itemCount; | |
334 | ||
335 | ceToCharsStartingWith = new CEToStringsMap(status); | |
336 | ||
337 | if (U_FAILURE(status)) { | |
338 | goto bail; | |
339 | } | |
340 | ||
341 | #ifdef CLONE_COLLATOR | |
342 | coll = ucol_safeClone(collator, NULL, NULL, &status); | |
343 | ||
344 | if (U_FAILURE(status)) { | |
345 | goto bail; | |
346 | } | |
347 | #else | |
348 | coll = collator; | |
349 | #endif | |
350 | ||
351 | ucol_getContractionsAndExpansions(coll, contractions, expansions, FALSE, &status); | |
352 | ||
353 | uset_addAll(charsToTest, contractions); | |
354 | uset_addAll(charsToTest, expansions); | |
355 | uset_removeAll(charsToTest, charsToRemove); | |
356 | ||
357 | itemCount = uset_getItemCount(charsToTest); | |
358 | for(int32_t item = 0; item < itemCount; item += 1) { | |
359 | UChar32 start = 0, end = 0; | |
360 | UChar buffer[16]; | |
361 | int32_t len = uset_getItem(charsToTest, item, &start, &end, | |
362 | buffer, 16, &status); | |
363 | ||
364 | if (len == 0) { | |
365 | for (UChar32 ch = start; ch <= end; ch += 1) { | |
366 | UnicodeString *st = new UnicodeString(ch); | |
367 | ||
368 | if (st == NULL) { | |
369 | status = U_MEMORY_ALLOCATION_ERROR; | |
370 | break; | |
371 | } | |
372 | ||
373 | CEList *ceList = new CEList(coll, *st, status); | |
374 | ||
375 | ceToCharsStartingWith->put(ceList->get(0), st, status); | |
376 | ||
377 | delete ceList; | |
378 | delete st; | |
379 | } | |
380 | } else if (len > 0) { | |
381 | UnicodeString *st = new UnicodeString(buffer, len); | |
382 | ||
383 | if (st == NULL) { | |
384 | status = U_MEMORY_ALLOCATION_ERROR; | |
385 | break; | |
386 | } | |
387 | ||
388 | CEList *ceList = new CEList(coll, *st, status); | |
389 | ||
390 | ceToCharsStartingWith->put(ceList->get(0), st, status); | |
391 | ||
392 | delete ceList; | |
393 | delete st; | |
394 | } else { | |
395 | // shouldn't happen... | |
396 | } | |
397 | ||
398 | if (U_FAILURE(status)) { | |
399 | break; | |
400 | } | |
401 | } | |
402 | ||
403 | bail: | |
404 | uset_close(contractions); | |
405 | uset_close(expansions); | |
406 | uset_close(charsToRemove); | |
407 | uset_close(charsToTest); | |
408 | ||
409 | if (U_FAILURE(status)) { | |
410 | return; | |
411 | } | |
412 | ||
413 | UChar32 hanRanges[] = {UCOL_FIRST_HAN, UCOL_LAST_HAN, UCOL_FIRST_HAN_COMPAT, UCOL_LAST_HAN_COMPAT, UCOL_FIRST_HAN_A, UCOL_LAST_HAN_A, | |
414 | UCOL_FIRST_HAN_B, UCOL_LAST_HAN_B}; | |
415 | UChar jamoRanges[] = {UCOL_FIRST_L_JAMO, UCOL_FIRST_V_JAMO, UCOL_FIRST_T_JAMO, UCOL_LAST_T_JAMO}; | |
416 | UnicodeString hanString = UnicodeString::fromUTF32(hanRanges, ARRAY_SIZE(hanRanges)); | |
417 | UnicodeString jamoString(FALSE, jamoRanges, ARRAY_SIZE(jamoRanges)); | |
418 | CEList hanList(coll, hanString, status); | |
419 | CEList jamoList(coll, jamoString, status); | |
420 | int32_t j = 0; | |
421 | ||
422 | if (U_FAILURE(status)) { | |
423 | return; | |
424 | } | |
425 | ||
426 | for (int32_t c = 0; c < jamoList.size(); c += 1) { | |
427 | uint32_t jce = jamoList[c]; | |
428 | ||
429 | if (! isContinuation(jce)) { | |
430 | jamoLimits[j++] = jce; | |
431 | } | |
432 | } | |
433 | ||
434 | jamoLimits[3] += (1 << UCOL_PRIMARYORDERSHIFT); | |
435 | ||
436 | minHan = 0xFFFFFFFF; | |
437 | maxHan = 0; | |
438 | ||
439 | for(int32_t h = 0; h < hanList.size(); h += 2) { | |
440 | uint32_t han = (uint32_t) hanList[h]; | |
441 | ||
442 | if (han < minHan) { | |
443 | minHan = han; | |
444 | } | |
445 | ||
446 | if (han > maxHan) { | |
447 | maxHan = han; | |
448 | } | |
449 | } | |
450 | ||
451 | maxHan += (1 << UCOL_PRIMARYORDERSHIFT); | |
452 | } | |
453 | ||
454 | CollData::~CollData() | |
455 | { | |
456 | #ifdef CLONE_COLLATOR | |
457 | ucol_close(coll); | |
458 | #endif | |
459 | ||
460 | delete ceToCharsStartingWith; | |
461 | } | |
462 | ||
463 | UCollator *CollData::getCollator() const | |
464 | { | |
465 | return coll; | |
466 | } | |
467 | ||
468 | const StringList *CollData::getStringList(int32_t ce) const | |
469 | { | |
470 | return ceToCharsStartingWith->getStringList(ce); | |
471 | } | |
472 | ||
473 | const CEList *CollData::getCEList(const UnicodeString *string) const | |
474 | { | |
475 | UErrorCode status = U_ZERO_ERROR; | |
476 | const CEList *list = new CEList(coll, *string, status); | |
477 | ||
478 | if (U_FAILURE(status)) { | |
479 | delete list; | |
480 | list = NULL; | |
481 | } | |
482 | ||
483 | return list; | |
484 | } | |
485 | ||
486 | void CollData::freeCEList(const CEList *list) | |
487 | { | |
488 | delete list; | |
489 | } | |
490 | ||
491 | int32_t CollData::minLengthInChars(const CEList *ceList, int32_t offset, int32_t *history) const | |
492 | { | |
493 | // find out shortest string for the longest sequence of ces. | |
494 | // this can probably be folded with the minLengthCache... | |
495 | ||
496 | if (history[offset] >= 0) { | |
497 | return history[offset]; | |
498 | } | |
499 | ||
500 | uint32_t ce = ceList->get(offset); | |
501 | int32_t maxOffset = ceList->size(); | |
502 | int32_t shortestLength = INT32_MAX; | |
503 | const StringList *strings = ceToCharsStartingWith->getStringList(ce); | |
504 | ||
505 | if (strings != NULL) { | |
506 | int32_t stringCount = strings->size(); | |
507 | ||
508 | for (int32_t s = 0; s < stringCount; s += 1) { | |
509 | const UnicodeString *string = strings->get(s); | |
510 | UErrorCode status = U_ZERO_ERROR; | |
511 | const CEList *ceList2 = new CEList(coll, *string, status); | |
512 | ||
513 | if (U_FAILURE(status)) { | |
514 | delete ceList2; | |
515 | ceList2 = NULL; | |
516 | } | |
517 | ||
518 | if (ceList->matchesAt(offset, ceList2)) { | |
519 | U_ASSERT(ceList2 != NULL); | |
520 | int32_t clength = ceList2->size(); | |
521 | int32_t slength = string->length(); | |
522 | int32_t roffset = offset + clength; | |
523 | int32_t rlength = 0; | |
524 | ||
525 | if (roffset < maxOffset) { | |
526 | rlength = minLengthInChars(ceList, roffset, history); | |
527 | ||
528 | if (rlength <= 0) { | |
529 | // delete before continue to avoid memory leak. | |
530 | delete ceList2; | |
531 | ||
532 | // ignore any dead ends | |
533 | continue; | |
534 | } | |
535 | } | |
536 | ||
537 | if (shortestLength > slength + rlength) { | |
538 | shortestLength = slength + rlength; | |
539 | } | |
540 | } | |
541 | ||
542 | delete ceList2; | |
543 | } | |
544 | } | |
545 | ||
546 | if (shortestLength == INT32_MAX) { | |
547 | // No matching strings at this offset. See if | |
548 | // the CE is in a range we can handle manually. | |
549 | if (ce >= minHan && ce < maxHan) { | |
550 | // all han have implicit orders which | |
551 | // generate two CEs. | |
552 | int32_t roffset = offset + 2; | |
553 | int32_t rlength = 0; | |
554 | ||
555 | //history[roffset++] = -1; | |
556 | //history[roffset++] = 1; | |
557 | ||
558 | if (roffset < maxOffset) { | |
559 | rlength = minLengthInChars(ceList, roffset, history); | |
560 | } | |
561 | ||
562 | if (rlength < 0) { | |
563 | return -1; | |
564 | } | |
565 | ||
566 | shortestLength = 1 + rlength; | |
567 | goto have_shortest; | |
568 | } else if (ce >= jamoLimits[0] && ce < jamoLimits[3]) { | |
569 | int32_t roffset = offset; | |
570 | int32_t rlength = 0; | |
571 | ||
572 | // **** this loop may not handle archaic Hangul correctly **** | |
573 | for (int32_t j = 0; roffset < maxOffset && j < 4; j += 1, roffset += 1) { | |
574 | uint32_t jce = ceList->get(roffset); | |
575 | ||
576 | // Some Jamo have 24-bit primary order; skip the | |
577 | // 2nd CE. This should always be OK because if | |
578 | // we're still in the loop all we've seen are | |
579 | // a series of Jamo in LVT order. | |
580 | if (isContinuation(jce)) { | |
581 | continue; | |
582 | } | |
583 | ||
584 | if (j >= 3 || jce < jamoLimits[j] || jce >= jamoLimits[j + 1]) { | |
585 | break; | |
586 | } | |
587 | } | |
588 | ||
589 | if (roffset == offset) { | |
590 | // we started with a non-L Jamo... | |
591 | // just say it comes from a single character | |
592 | roffset += 1; | |
593 | ||
594 | // See if the single Jamo has a 24-bit order. | |
595 | if (roffset < maxOffset && isContinuation(ceList->get(roffset))) { | |
596 | roffset += 1; | |
597 | } | |
598 | } | |
599 | ||
600 | if (roffset < maxOffset) { | |
601 | rlength = minLengthInChars(ceList, roffset, history); | |
602 | } | |
603 | ||
604 | if (rlength < 0) { | |
605 | return -1; | |
606 | } | |
607 | ||
608 | shortestLength = 1 + rlength; | |
609 | goto have_shortest; | |
610 | } | |
611 | ||
612 | // Can't handle it manually either. Just move on. | |
613 | return -1; | |
614 | } | |
615 | ||
616 | have_shortest: | |
617 | history[offset] = shortestLength; | |
618 | ||
619 | return shortestLength; | |
620 | } | |
621 | ||
622 | int32_t CollData::minLengthInChars(const CEList *ceList, int32_t offset) const | |
623 | { | |
624 | int32_t clength = ceList->size(); | |
625 | int32_t *history = NEW_ARRAY(int32_t, clength); | |
626 | ||
627 | for (int32_t i = 0; i < clength; i += 1) { | |
628 | history[i] = -1; | |
629 | } | |
630 | ||
631 | int32_t minLength = minLengthInChars(ceList, offset, history); | |
632 | ||
633 | DELETE_ARRAY(history); | |
634 | ||
635 | return minLength; | |
636 | } | |
637 | ||
638 | #endif // #if !UCONFIG_NO_COLLATION |