1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 ******************************************************************************
5 * Copyright (C) 1996-2016, International Business Machines
6 * Corporation and others. All Rights Reserved.
7 ******************************************************************************
10 #include "unicode/utypes.h"
12 #if !UCONFIG_NO_COLLATION
14 #include "unicode/unistr.h"
15 #include "unicode/usearch.h"
18 #include "unicode/coll.h"
19 #include "unicode/tblcoll.h"
20 #include "unicode/coleitr.h"
21 #include "unicode/ucoleitr.h"
23 #include "unicode/regex.h" // TODO: make conditional on regexp being built.
25 #include "unicode/uniset.h"
26 #include "unicode/uset.h"
27 #include "unicode/usetiter.h"
28 #include "unicode/ustring.h"
30 #include "normalizer2impl.h"
37 #define NEW_ARRAY(type, count) (type *) uprv_malloc((size_t)(count) * sizeof(type))
38 #define DELETE_ARRAY(array) uprv_free((void *) (array))
39 #define ARRAY_COPY(dst, src, count) uprv_memcpy((void *) (dst), (void *) (src), (size_t)(count) * sizeof (src)[0])
41 CEList::CEList(UCollator
*coll
, const UnicodeString
&string
, UErrorCode
&status
)
42 : ces(NULL
), listMax(CELIST_BUFFER_SIZE
), listSize(0)
44 UCollationElements
*elems
= ucol_openElements(coll
, string
.getBuffer(), string
.length(), &status
);
45 UCollationStrength strength
= ucol_getStrength(coll
);
46 UBool toShift
= ucol_getAttribute(coll
, UCOL_ALTERNATE_HANDLING
, &status
) == UCOL_SHIFTED
;
47 uint32_t variableTop
= ucol_getVariableTop(coll
, &status
);
48 uint32_t strengthMask
= 0;
51 if (U_FAILURE(status
)) {
55 // **** only set flag if string has Han(gul) ****
56 // ucol_forceHanImplicit(elems, &status); -- removed for ticket #10476
61 strengthMask
|= UCOL_TERTIARYORDERMASK
;
64 strengthMask
|= UCOL_SECONDARYORDERMASK
;
67 strengthMask
|= UCOL_PRIMARYORDERMASK
;
72 while ((order
= ucol_next(elems
, &status
)) != UCOL_NULLORDER
) {
73 UBool cont
= isContinuation(order
);
75 order
&= strengthMask
;
77 if (toShift
&& variableTop
> (uint32_t)order
&& (order
& UCOL_PRIMARYORDERMASK
) != 0) {
78 if (strength
>= UCOL_QUATERNARY
) {
79 order
&= UCOL_PRIMARYORDERMASK
;
81 order
= UCOL_IGNORABLE
;
85 if (order
== UCOL_IGNORABLE
) {
90 order
|= UCOL_CONTINUATION_MARKER
;
96 ucol_closeElements(elems
);
101 if (ces
!= ceBuffer
) {
106 void CEList::add(uint32_t ce
, UErrorCode
&status
)
108 if (U_FAILURE(status
)) {
112 if (listSize
>= listMax
) {
113 int32_t newMax
= listMax
+ CELIST_BUFFER_SIZE
;
114 uint32_t *newCEs
= NEW_ARRAY(uint32_t, newMax
);
116 if (newCEs
== NULL
) {
117 status
= U_MEMORY_ALLOCATION_ERROR
;
121 uprv_memcpy(newCEs
, ces
, listSize
* sizeof(uint32_t));
123 if (ces
!= ceBuffer
) {
131 ces
[listSize
++] = ce
;
134 uint32_t CEList::get(int32_t index
) const
136 if (index
>= 0 && index
< listSize
) {
140 return (uint32_t)UCOL_NULLORDER
;
143 uint32_t &CEList::operator[](int32_t index
) const
148 UBool
CEList::matchesAt(int32_t offset
, const CEList
*other
) const
150 if (other
== NULL
|| listSize
- offset
< other
->size()) {
154 for (int32_t i
= offset
, j
= 0; j
< other
->size(); i
+= 1, j
+= 1) {
155 if (ces
[i
] != (*other
)[j
]) {
163 int32_t CEList::size() const
168 StringList::StringList(UErrorCode
&status
)
169 : strings(NULL
), listMax(STRING_LIST_BUFFER_SIZE
), listSize(0)
171 if (U_FAILURE(status
)) {
175 strings
= new UnicodeString
[listMax
];
177 if (strings
== NULL
) {
178 status
= U_MEMORY_ALLOCATION_ERROR
;
183 StringList::~StringList()
188 void StringList::add(const UnicodeString
*string
, UErrorCode
&status
)
190 if (U_FAILURE(status
)) {
193 if (listSize
>= listMax
) {
194 int32_t newMax
= listMax
+ STRING_LIST_BUFFER_SIZE
;
195 UnicodeString
*newStrings
= new UnicodeString
[newMax
];
196 if (newStrings
== NULL
) {
197 status
= U_MEMORY_ALLOCATION_ERROR
;
200 for (int32_t i
=0; i
<listSize
; ++i
) {
201 newStrings
[i
] = strings
[i
];
204 strings
= newStrings
;
208 // The ctor initialized all the strings in
209 // the array to empty strings, so this
210 // is the same as copying the source string.
211 strings
[listSize
++].append(*string
);
214 void StringList::add(const UChar
*chars
, int32_t count
, UErrorCode
&status
)
216 const UnicodeString
string(chars
, count
);
218 add(&string
, status
);
221 const UnicodeString
*StringList::get(int32_t index
) const
223 if (index
>= 0 && index
< listSize
) {
224 return &strings
[index
];
230 int32_t StringList::size() const
237 static void U_CALLCONV
238 deleteStringList(void *obj
)
240 StringList
*strings
= (StringList
*) obj
;
249 CEToStringsMap(UErrorCode
&status
);
252 void put(uint32_t ce
, UnicodeString
*string
, UErrorCode
&status
);
253 StringList
*getStringList(uint32_t ce
) const;
256 void putStringList(uint32_t ce
, StringList
*stringList
, UErrorCode
&status
);
260 CEToStringsMap::CEToStringsMap(UErrorCode
&status
)
263 if (U_FAILURE(status
)) {
267 map
= uhash_open(uhash_hashLong
, uhash_compareLong
,
268 uhash_compareCaselessUnicodeString
,
271 if (U_FAILURE(status
)) {
275 uhash_setValueDeleter(map
, deleteStringList
);
278 CEToStringsMap::~CEToStringsMap()
283 void CEToStringsMap::put(uint32_t ce
, UnicodeString
*string
, UErrorCode
&status
)
285 StringList
*strings
= getStringList(ce
);
287 if (strings
== NULL
) {
288 strings
= new StringList(status
);
290 if (strings
== NULL
|| U_FAILURE(status
)) {
291 status
= U_MEMORY_ALLOCATION_ERROR
;
295 putStringList(ce
, strings
, status
);
298 strings
->add(string
, status
);
301 StringList
*CEToStringsMap::getStringList(uint32_t ce
) const
303 return (StringList
*) uhash_iget(map
, ce
);
306 void CEToStringsMap::putStringList(uint32_t ce
, StringList
*stringList
, UErrorCode
&status
)
308 uhash_iput(map
, ce
, (void *) stringList
, &status
);
311 #define CLONE_COLLATOR
313 CollData::CollData(UCollator
*collator
, UErrorCode
&status
)
314 : coll(NULL
), ceToCharsStartingWith(NULL
)
316 // [:c:] == [[:cn:][:cc:][:co:][:cf:][:cs:]]
317 // i.e. other, control, private use, format, surrogate
318 U_STRING_DECL(test_pattern
, "[[:assigned:]-[:c:]]", 20);
319 U_STRING_INIT(test_pattern
, "[[:assigned:]-[:c:]]", 20);
320 USet
*charsToTest
= uset_openPattern(test_pattern
, 20, &status
);
322 // Han ext. A, Han, Jamo, Hangul, Han Ext. B
323 // i.e. all the characers we handle implicitly
324 U_STRING_DECL(remove_pattern
, "[[\\u3400-\\u9FFF][\\u1100-\\u11F9][\\uAC00-\\uD7AF][\\U00020000-\\U0002A6DF]]", 70);
325 U_STRING_INIT(remove_pattern
, "[[\\u3400-\\u9FFF][\\u1100-\\u11F9][\\uAC00-\\uD7AF][\\U00020000-\\U0002A6DF]]", 70);
326 USet
*charsToRemove
= uset_openPattern(remove_pattern
, 70, &status
);
328 if (U_FAILURE(status
)) {
332 USet
*expansions
= uset_openEmpty();
333 USet
*contractions
= uset_openEmpty();
336 ceToCharsStartingWith
= new CEToStringsMap(status
);
338 if (U_FAILURE(status
)) {
342 #ifdef CLONE_COLLATOR
343 coll
= ucol_safeClone(collator
, NULL
, NULL
, &status
);
345 if (U_FAILURE(status
)) {
352 ucol_getContractionsAndExpansions(coll
, contractions
, expansions
, FALSE
, &status
);
354 uset_addAll(charsToTest
, contractions
);
355 uset_addAll(charsToTest
, expansions
);
356 uset_removeAll(charsToTest
, charsToRemove
);
358 itemCount
= uset_getItemCount(charsToTest
);
359 for(int32_t item
= 0; item
< itemCount
; item
+= 1) {
360 UChar32 start
= 0, end
= 0;
362 int32_t len
= uset_getItem(charsToTest
, item
, &start
, &end
,
363 buffer
, 16, &status
);
366 for (UChar32 ch
= start
; ch
<= end
; ch
+= 1) {
367 UnicodeString
*st
= new UnicodeString(ch
);
370 status
= U_MEMORY_ALLOCATION_ERROR
;
374 CEList
*ceList
= new CEList(coll
, *st
, status
);
376 ceToCharsStartingWith
->put(ceList
->get(0), st
, status
);
381 } else if (len
> 0) {
382 UnicodeString
*st
= new UnicodeString(buffer
, len
);
385 status
= U_MEMORY_ALLOCATION_ERROR
;
389 CEList
*ceList
= new CEList(coll
, *st
, status
);
391 ceToCharsStartingWith
->put(ceList
->get(0), st
, status
);
396 // shouldn't happen...
399 if (U_FAILURE(status
)) {
405 uset_close(contractions
);
406 uset_close(expansions
);
407 uset_close(charsToRemove
);
408 uset_close(charsToTest
);
410 if (U_FAILURE(status
)) {
414 UnicodeSet
hanRanges(UNICODE_STRING_SIMPLE("[:Unified_Ideograph:]"), status
);
415 if (U_FAILURE(status
)) {
418 UnicodeSetIterator
hanIter(hanRanges
);
419 UnicodeString hanString
;
420 while(hanIter
.nextRange()) {
421 hanString
.append(hanIter
.getCodepoint());
422 hanString
.append(hanIter
.getCodepointEnd());
424 // TODO: Why U+11FF? The old code had an outdated UCOL_LAST_T_JAMO=0x11F9,
425 // but as of Unicode 6.3 the 11xx block is filled,
426 // and there are also more Jamo T at U+D7CB..U+D7FB.
427 // Maybe use [:HST=T:] and look for the end of the last range?
428 // Maybe use script boundary mappings instead of this code??
429 UChar jamoRanges
[] = {Hangul::JAMO_L_BASE
, Hangul::JAMO_V_BASE
, Hangul::JAMO_T_BASE
+ 1, 0x11FF};
430 UnicodeString
jamoString(FALSE
, jamoRanges
, UPRV_LENGTHOF(jamoRanges
));
431 CEList
hanList(coll
, hanString
, status
);
432 CEList
jamoList(coll
, jamoString
, status
);
435 if (U_FAILURE(status
)) {
439 for (int32_t c
= 0; c
< jamoList
.size(); c
+= 1) {
440 uint32_t jce
= jamoList
[c
];
442 if (! isContinuation(jce
)) {
443 jamoLimits
[j
++] = jce
;
447 jamoLimits
[3] += (1 << UCOL_PRIMARYORDERSHIFT
);
452 for(int32_t h
= 0; h
< hanList
.size(); h
+= 2) {
453 uint32_t han
= (uint32_t) hanList
[h
];
464 maxHan
+= (1 << UCOL_PRIMARYORDERSHIFT
);
467 CollData::~CollData()
469 #ifdef CLONE_COLLATOR
473 delete ceToCharsStartingWith
;
476 UCollator
*CollData::getCollator() const
481 const StringList
*CollData::getStringList(int32_t ce
) const
483 return ceToCharsStartingWith
->getStringList(ce
);
486 const CEList
*CollData::getCEList(const UnicodeString
*string
) const
488 UErrorCode status
= U_ZERO_ERROR
;
489 const CEList
*list
= new CEList(coll
, *string
, status
);
491 if (U_FAILURE(status
)) {
499 void CollData::freeCEList(const CEList
*list
)
504 int32_t CollData::minLengthInChars(const CEList
*ceList
, int32_t offset
, int32_t *history
) const
506 // find out shortest string for the longest sequence of ces.
507 // this can probably be folded with the minLengthCache...
509 if (history
[offset
] >= 0) {
510 return history
[offset
];
513 uint32_t ce
= ceList
->get(offset
);
514 int32_t maxOffset
= ceList
->size();
515 int32_t shortestLength
= INT32_MAX
;
516 const StringList
*strings
= ceToCharsStartingWith
->getStringList(ce
);
518 if (strings
!= NULL
) {
519 int32_t stringCount
= strings
->size();
521 for (int32_t s
= 0; s
< stringCount
; s
+= 1) {
522 const UnicodeString
*string
= strings
->get(s
);
523 UErrorCode status
= U_ZERO_ERROR
;
524 const CEList
*ceList2
= new CEList(coll
, *string
, status
);
526 if (U_FAILURE(status
)) {
531 if (ceList
->matchesAt(offset
, ceList2
)) {
532 U_ASSERT(ceList2
!= NULL
);
533 int32_t clength
= ceList2
->size();
534 int32_t slength
= string
->length();
535 int32_t roffset
= offset
+ clength
;
538 if (roffset
< maxOffset
) {
539 rlength
= minLengthInChars(ceList
, roffset
, history
);
542 // delete before continue to avoid memory leak.
545 // ignore any dead ends
550 if (shortestLength
> slength
+ rlength
) {
551 shortestLength
= slength
+ rlength
;
559 if (shortestLength
== INT32_MAX
) {
560 // No matching strings at this offset. See if
561 // the CE is in a range we can handle manually.
562 if (ce
>= minHan
&& ce
< maxHan
) {
563 // all han have implicit orders which
565 int32_t roffset
= offset
+ 2;
568 //history[roffset++] = -1;
569 //history[roffset++] = 1;
571 if (roffset
< maxOffset
) {
572 rlength
= minLengthInChars(ceList
, roffset
, history
);
579 shortestLength
= 1 + rlength
;
581 } else if (ce
>= jamoLimits
[0] && ce
< jamoLimits
[3]) {
582 int32_t roffset
= offset
;
585 // **** this loop may not handle archaic Hangul correctly ****
586 for (int32_t j
= 0; roffset
< maxOffset
&& j
< 4; j
+= 1, roffset
+= 1) {
587 uint32_t jce
= ceList
->get(roffset
);
589 // Some Jamo have 24-bit primary order; skip the
590 // 2nd CE. This should always be OK because if
591 // we're still in the loop all we've seen are
592 // a series of Jamo in LVT order.
593 if (isContinuation(jce
)) {
597 if (j
>= 3 || jce
< jamoLimits
[j
] || jce
>= jamoLimits
[j
+ 1]) {
602 if (roffset
== offset
) {
603 // we started with a non-L Jamo...
604 // just say it comes from a single character
607 // See if the single Jamo has a 24-bit order.
608 if (roffset
< maxOffset
&& isContinuation(ceList
->get(roffset
))) {
613 if (roffset
< maxOffset
) {
614 rlength
= minLengthInChars(ceList
, roffset
, history
);
621 shortestLength
= 1 + rlength
;
625 // Can't handle it manually either. Just move on.
630 history
[offset
] = shortestLength
;
632 return shortestLength
;
635 int32_t CollData::minLengthInChars(const CEList
*ceList
, int32_t offset
) const
637 int32_t clength
= ceList
->size();
638 int32_t *history
= NEW_ARRAY(int32_t, clength
);
640 for (int32_t i
= 0; i
< clength
; i
+= 1) {
644 int32_t minLength
= minLengthInChars(ceList
, offset
, history
);
646 DELETE_ARRAY(history
);
651 #endif // #if !UCONFIG_NO_COLLATION