2 ******************************************************************************
3 * Copyright (C) 1996-2016, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 ******************************************************************************
8 #include "unicode/utypes.h"
10 #if !UCONFIG_NO_COLLATION
12 #include "unicode/unistr.h"
13 #include "unicode/usearch.h"
16 #include "unicode/coll.h"
17 #include "unicode/tblcoll.h"
18 #include "unicode/coleitr.h"
19 #include "unicode/ucoleitr.h"
21 #include "unicode/regex.h" // TODO: make conditional on regexp being built.
23 #include "unicode/uniset.h"
24 #include "unicode/uset.h"
25 #include "unicode/usetiter.h"
26 #include "unicode/ustring.h"
28 #include "normalizer2impl.h"
35 #define NEW_ARRAY(type, count) (type *) uprv_malloc((count) * sizeof(type))
36 #define DELETE_ARRAY(array) uprv_free((void *) (array))
37 #define ARRAY_COPY(dst, src, count) uprv_memcpy((void *) (dst), (void *) (src), (count) * sizeof (src)[0])
39 CEList::CEList(UCollator
*coll
, const UnicodeString
&string
, UErrorCode
&status
)
40 : ces(NULL
), listMax(CELIST_BUFFER_SIZE
), listSize(0)
42 UCollationElements
*elems
= ucol_openElements(coll
, string
.getBuffer(), string
.length(), &status
);
43 UCollationStrength strength
= ucol_getStrength(coll
);
44 UBool toShift
= ucol_getAttribute(coll
, UCOL_ALTERNATE_HANDLING
, &status
) == UCOL_SHIFTED
;
45 uint32_t variableTop
= ucol_getVariableTop(coll
, &status
);
46 uint32_t strengthMask
= 0;
49 if (U_FAILURE(status
)) {
53 // **** only set flag if string has Han(gul) ****
54 // ucol_forceHanImplicit(elems, &status); -- removed for ticket #10476
59 strengthMask
|= UCOL_TERTIARYORDERMASK
;
62 strengthMask
|= UCOL_SECONDARYORDERMASK
;
65 strengthMask
|= UCOL_PRIMARYORDERMASK
;
70 while ((order
= ucol_next(elems
, &status
)) != UCOL_NULLORDER
) {
71 UBool cont
= isContinuation(order
);
73 order
&= strengthMask
;
75 if (toShift
&& variableTop
> (uint32_t)order
&& (order
& UCOL_PRIMARYORDERMASK
) != 0) {
76 if (strength
>= UCOL_QUATERNARY
) {
77 order
&= UCOL_PRIMARYORDERMASK
;
79 order
= UCOL_IGNORABLE
;
83 if (order
== UCOL_IGNORABLE
) {
88 order
|= UCOL_CONTINUATION_MARKER
;
94 ucol_closeElements(elems
);
99 if (ces
!= ceBuffer
) {
104 void CEList::add(uint32_t ce
, UErrorCode
&status
)
106 if (U_FAILURE(status
)) {
110 if (listSize
>= listMax
) {
111 int32_t newMax
= listMax
+ CELIST_BUFFER_SIZE
;
112 uint32_t *newCEs
= NEW_ARRAY(uint32_t, newMax
);
114 if (newCEs
== NULL
) {
115 status
= U_MEMORY_ALLOCATION_ERROR
;
119 uprv_memcpy(newCEs
, ces
, listSize
* sizeof(uint32_t));
121 if (ces
!= ceBuffer
) {
129 ces
[listSize
++] = ce
;
132 uint32_t CEList::get(int32_t index
) const
134 if (index
>= 0 && index
< listSize
) {
138 return (uint32_t)UCOL_NULLORDER
;
141 uint32_t &CEList::operator[](int32_t index
) const
146 UBool
CEList::matchesAt(int32_t offset
, const CEList
*other
) const
148 if (other
== NULL
|| listSize
- offset
< other
->size()) {
152 for (int32_t i
= offset
, j
= 0; j
< other
->size(); i
+= 1, j
+= 1) {
153 if (ces
[i
] != (*other
)[j
]) {
161 int32_t CEList::size() const
166 StringList::StringList(UErrorCode
&status
)
167 : strings(NULL
), listMax(STRING_LIST_BUFFER_SIZE
), listSize(0)
169 if (U_FAILURE(status
)) {
173 strings
= new UnicodeString
[listMax
];
175 if (strings
== NULL
) {
176 status
= U_MEMORY_ALLOCATION_ERROR
;
181 StringList::~StringList()
186 void StringList::add(const UnicodeString
*string
, UErrorCode
&status
)
188 if (U_FAILURE(status
)) {
191 if (listSize
>= listMax
) {
192 int32_t newMax
= listMax
+ STRING_LIST_BUFFER_SIZE
;
193 UnicodeString
*newStrings
= new UnicodeString
[newMax
];
194 if (newStrings
== NULL
) {
195 status
= U_MEMORY_ALLOCATION_ERROR
;
198 for (int32_t i
=0; i
<listSize
; ++i
) {
199 newStrings
[i
] = strings
[i
];
202 strings
= newStrings
;
206 // The ctor initialized all the strings in
207 // the array to empty strings, so this
208 // is the same as copying the source string.
209 strings
[listSize
++].append(*string
);
212 void StringList::add(const UChar
*chars
, int32_t count
, UErrorCode
&status
)
214 const UnicodeString
string(chars
, count
);
216 add(&string
, status
);
219 const UnicodeString
*StringList::get(int32_t index
) const
221 if (index
>= 0 && index
< listSize
) {
222 return &strings
[index
];
228 int32_t StringList::size() const
235 static void U_CALLCONV
236 deleteStringList(void *obj
)
238 StringList
*strings
= (StringList
*) obj
;
247 CEToStringsMap(UErrorCode
&status
);
250 void put(uint32_t ce
, UnicodeString
*string
, UErrorCode
&status
);
251 StringList
*getStringList(uint32_t ce
) const;
254 void putStringList(uint32_t ce
, StringList
*stringList
, UErrorCode
&status
);
258 CEToStringsMap::CEToStringsMap(UErrorCode
&status
)
261 if (U_FAILURE(status
)) {
265 map
= uhash_open(uhash_hashLong
, uhash_compareLong
,
266 uhash_compareCaselessUnicodeString
,
269 if (U_FAILURE(status
)) {
273 uhash_setValueDeleter(map
, deleteStringList
);
276 CEToStringsMap::~CEToStringsMap()
281 void CEToStringsMap::put(uint32_t ce
, UnicodeString
*string
, UErrorCode
&status
)
283 StringList
*strings
= getStringList(ce
);
285 if (strings
== NULL
) {
286 strings
= new StringList(status
);
288 if (strings
== NULL
|| U_FAILURE(status
)) {
289 status
= U_MEMORY_ALLOCATION_ERROR
;
293 putStringList(ce
, strings
, status
);
296 strings
->add(string
, status
);
299 StringList
*CEToStringsMap::getStringList(uint32_t ce
) const
301 return (StringList
*) uhash_iget(map
, ce
);
304 void CEToStringsMap::putStringList(uint32_t ce
, StringList
*stringList
, UErrorCode
&status
)
306 uhash_iput(map
, ce
, (void *) stringList
, &status
);
309 #define CLONE_COLLATOR
311 CollData::CollData(UCollator
*collator
, UErrorCode
&status
)
312 : coll(NULL
), ceToCharsStartingWith(NULL
)
314 // [:c:] == [[:cn:][:cc:][:co:][:cf:][:cs:]]
315 // i.e. other, control, private use, format, surrogate
316 U_STRING_DECL(test_pattern
, "[[:assigned:]-[:c:]]", 20);
317 U_STRING_INIT(test_pattern
, "[[:assigned:]-[:c:]]", 20);
318 USet
*charsToTest
= uset_openPattern(test_pattern
, 20, &status
);
320 // Han ext. A, Han, Jamo, Hangul, Han Ext. B
321 // i.e. all the characers we handle implicitly
322 U_STRING_DECL(remove_pattern
, "[[\\u3400-\\u9FFF][\\u1100-\\u11F9][\\uAC00-\\uD7AF][\\U00020000-\\U0002A6DF]]", 70);
323 U_STRING_INIT(remove_pattern
, "[[\\u3400-\\u9FFF][\\u1100-\\u11F9][\\uAC00-\\uD7AF][\\U00020000-\\U0002A6DF]]", 70);
324 USet
*charsToRemove
= uset_openPattern(remove_pattern
, 70, &status
);
326 if (U_FAILURE(status
)) {
330 USet
*expansions
= uset_openEmpty();
331 USet
*contractions
= uset_openEmpty();
334 ceToCharsStartingWith
= new CEToStringsMap(status
);
336 if (U_FAILURE(status
)) {
340 #ifdef CLONE_COLLATOR
341 coll
= ucol_safeClone(collator
, NULL
, NULL
, &status
);
343 if (U_FAILURE(status
)) {
350 ucol_getContractionsAndExpansions(coll
, contractions
, expansions
, FALSE
, &status
);
352 uset_addAll(charsToTest
, contractions
);
353 uset_addAll(charsToTest
, expansions
);
354 uset_removeAll(charsToTest
, charsToRemove
);
356 itemCount
= uset_getItemCount(charsToTest
);
357 for(int32_t item
= 0; item
< itemCount
; item
+= 1) {
358 UChar32 start
= 0, end
= 0;
360 int32_t len
= uset_getItem(charsToTest
, item
, &start
, &end
,
361 buffer
, 16, &status
);
364 for (UChar32 ch
= start
; ch
<= end
; ch
+= 1) {
365 UnicodeString
*st
= new UnicodeString(ch
);
368 status
= U_MEMORY_ALLOCATION_ERROR
;
372 CEList
*ceList
= new CEList(coll
, *st
, status
);
374 ceToCharsStartingWith
->put(ceList
->get(0), st
, status
);
379 } else if (len
> 0) {
380 UnicodeString
*st
= new UnicodeString(buffer
, len
);
383 status
= U_MEMORY_ALLOCATION_ERROR
;
387 CEList
*ceList
= new CEList(coll
, *st
, status
);
389 ceToCharsStartingWith
->put(ceList
->get(0), st
, status
);
394 // shouldn't happen...
397 if (U_FAILURE(status
)) {
403 uset_close(contractions
);
404 uset_close(expansions
);
405 uset_close(charsToRemove
);
406 uset_close(charsToTest
);
408 if (U_FAILURE(status
)) {
412 UnicodeSet
hanRanges(UNICODE_STRING_SIMPLE("[:Unified_Ideograph:]"), status
);
413 if (U_FAILURE(status
)) {
416 UnicodeSetIterator
hanIter(hanRanges
);
417 UnicodeString hanString
;
418 while(hanIter
.nextRange()) {
419 hanString
.append(hanIter
.getCodepoint());
420 hanString
.append(hanIter
.getCodepointEnd());
422 // TODO: Why U+11FF? The old code had an outdated UCOL_LAST_T_JAMO=0x11F9,
423 // but as of Unicode 6.3 the 11xx block is filled,
424 // and there are also more Jamo T at U+D7CB..U+D7FB.
425 // Maybe use [:HST=T:] and look for the end of the last range?
426 // Maybe use script boundary mappings instead of this code??
427 UChar jamoRanges
[] = {Hangul::JAMO_L_BASE
, Hangul::JAMO_V_BASE
, Hangul::JAMO_T_BASE
+ 1, 0x11FF};
428 UnicodeString
jamoString(FALSE
, jamoRanges
, UPRV_LENGTHOF(jamoRanges
));
429 CEList
hanList(coll
, hanString
, status
);
430 CEList
jamoList(coll
, jamoString
, status
);
433 if (U_FAILURE(status
)) {
437 for (int32_t c
= 0; c
< jamoList
.size(); c
+= 1) {
438 uint32_t jce
= jamoList
[c
];
440 if (! isContinuation(jce
)) {
441 jamoLimits
[j
++] = jce
;
445 jamoLimits
[3] += (1 << UCOL_PRIMARYORDERSHIFT
);
450 for(int32_t h
= 0; h
< hanList
.size(); h
+= 2) {
451 uint32_t han
= (uint32_t) hanList
[h
];
462 maxHan
+= (1 << UCOL_PRIMARYORDERSHIFT
);
465 CollData::~CollData()
467 #ifdef CLONE_COLLATOR
471 delete ceToCharsStartingWith
;
474 UCollator
*CollData::getCollator() const
479 const StringList
*CollData::getStringList(int32_t ce
) const
481 return ceToCharsStartingWith
->getStringList(ce
);
484 const CEList
*CollData::getCEList(const UnicodeString
*string
) const
486 UErrorCode status
= U_ZERO_ERROR
;
487 const CEList
*list
= new CEList(coll
, *string
, status
);
489 if (U_FAILURE(status
)) {
497 void CollData::freeCEList(const CEList
*list
)
502 int32_t CollData::minLengthInChars(const CEList
*ceList
, int32_t offset
, int32_t *history
) const
504 // find out shortest string for the longest sequence of ces.
505 // this can probably be folded with the minLengthCache...
507 if (history
[offset
] >= 0) {
508 return history
[offset
];
511 uint32_t ce
= ceList
->get(offset
);
512 int32_t maxOffset
= ceList
->size();
513 int32_t shortestLength
= INT32_MAX
;
514 const StringList
*strings
= ceToCharsStartingWith
->getStringList(ce
);
516 if (strings
!= NULL
) {
517 int32_t stringCount
= strings
->size();
519 for (int32_t s
= 0; s
< stringCount
; s
+= 1) {
520 const UnicodeString
*string
= strings
->get(s
);
521 UErrorCode status
= U_ZERO_ERROR
;
522 const CEList
*ceList2
= new CEList(coll
, *string
, status
);
524 if (U_FAILURE(status
)) {
529 if (ceList
->matchesAt(offset
, ceList2
)) {
530 U_ASSERT(ceList2
!= NULL
);
531 int32_t clength
= ceList2
->size();
532 int32_t slength
= string
->length();
533 int32_t roffset
= offset
+ clength
;
536 if (roffset
< maxOffset
) {
537 rlength
= minLengthInChars(ceList
, roffset
, history
);
540 // delete before continue to avoid memory leak.
543 // ignore any dead ends
548 if (shortestLength
> slength
+ rlength
) {
549 shortestLength
= slength
+ rlength
;
557 if (shortestLength
== INT32_MAX
) {
558 // No matching strings at this offset. See if
559 // the CE is in a range we can handle manually.
560 if (ce
>= minHan
&& ce
< maxHan
) {
561 // all han have implicit orders which
563 int32_t roffset
= offset
+ 2;
566 //history[roffset++] = -1;
567 //history[roffset++] = 1;
569 if (roffset
< maxOffset
) {
570 rlength
= minLengthInChars(ceList
, roffset
, history
);
577 shortestLength
= 1 + rlength
;
579 } else if (ce
>= jamoLimits
[0] && ce
< jamoLimits
[3]) {
580 int32_t roffset
= offset
;
583 // **** this loop may not handle archaic Hangul correctly ****
584 for (int32_t j
= 0; roffset
< maxOffset
&& j
< 4; j
+= 1, roffset
+= 1) {
585 uint32_t jce
= ceList
->get(roffset
);
587 // Some Jamo have 24-bit primary order; skip the
588 // 2nd CE. This should always be OK because if
589 // we're still in the loop all we've seen are
590 // a series of Jamo in LVT order.
591 if (isContinuation(jce
)) {
595 if (j
>= 3 || jce
< jamoLimits
[j
] || jce
>= jamoLimits
[j
+ 1]) {
600 if (roffset
== offset
) {
601 // we started with a non-L Jamo...
602 // just say it comes from a single character
605 // See if the single Jamo has a 24-bit order.
606 if (roffset
< maxOffset
&& isContinuation(ceList
->get(roffset
))) {
611 if (roffset
< maxOffset
) {
612 rlength
= minLengthInChars(ceList
, roffset
, history
);
619 shortestLength
= 1 + rlength
;
623 // Can't handle it manually either. Just move on.
628 history
[offset
] = shortestLength
;
630 return shortestLength
;
633 int32_t CollData::minLengthInChars(const CEList
*ceList
, int32_t offset
) const
635 int32_t clength
= ceList
->size();
636 int32_t *history
= NEW_ARRAY(int32_t, clength
);
638 for (int32_t i
= 0; i
< clength
; i
+= 1) {
642 int32_t minLength
= minLengthInChars(ceList
, offset
, history
);
644 DELETE_ARRAY(history
);
649 #endif // #if !UCONFIG_NO_COLLATION