2 ******************************************************************************
3 * Copyright (C) 1996-2014, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 ******************************************************************************
8 #include "unicode/utypes.h"
10 #if !UCONFIG_NO_COLLATION
12 #include "unicode/unistr.h"
13 #include "unicode/usearch.h"
16 #include "unicode/coll.h"
17 #include "unicode/tblcoll.h"
18 #include "unicode/coleitr.h"
19 #include "unicode/ucoleitr.h"
21 #include "unicode/regex.h" // TODO: make conditional on regexp being built.
23 #include "unicode/uniset.h"
24 #include "unicode/uset.h"
25 #include "unicode/usetiter.h"
26 #include "unicode/ustring.h"
28 #include "normalizer2impl.h"
35 #define ARRAY_SIZE(array) (sizeof(array)/sizeof(array[0]))
36 #define NEW_ARRAY(type, count) (type *) uprv_malloc((count) * sizeof(type))
37 #define DELETE_ARRAY(array) uprv_free((void *) (array))
38 #define ARRAY_COPY(dst, src, count) uprv_memcpy((void *) (dst), (void *) (src), (count) * sizeof (src)[0])
40 CEList::CEList(UCollator
*coll
, const UnicodeString
&string
, UErrorCode
&status
)
41 : ces(NULL
), listMax(CELIST_BUFFER_SIZE
), listSize(0)
43 UCollationElements
*elems
= ucol_openElements(coll
, string
.getBuffer(), string
.length(), &status
);
44 UCollationStrength strength
= ucol_getStrength(coll
);
45 UBool toShift
= ucol_getAttribute(coll
, UCOL_ALTERNATE_HANDLING
, &status
) == UCOL_SHIFTED
;
46 uint32_t variableTop
= ucol_getVariableTop(coll
, &status
);
47 uint32_t strengthMask
= 0;
50 if (U_FAILURE(status
)) {
54 // **** only set flag if string has Han(gul) ****
55 // ucol_forceHanImplicit(elems, &status); -- removed for ticket #10476
60 strengthMask
|= UCOL_TERTIARYORDERMASK
;
64 strengthMask
|= UCOL_SECONDARYORDERMASK
;
68 strengthMask
|= UCOL_PRIMARYORDERMASK
;
73 while ((order
= ucol_next(elems
, &status
)) != UCOL_NULLORDER
) {
74 UBool cont
= isContinuation(order
);
76 order
&= strengthMask
;
78 if (toShift
&& variableTop
> (uint32_t)order
&& (order
& UCOL_PRIMARYORDERMASK
) != 0) {
79 if (strength
>= UCOL_QUATERNARY
) {
80 order
&= UCOL_PRIMARYORDERMASK
;
82 order
= UCOL_IGNORABLE
;
86 if (order
== UCOL_IGNORABLE
) {
91 order
|= UCOL_CONTINUATION_MARKER
;
97 ucol_closeElements(elems
);
102 if (ces
!= ceBuffer
) {
107 void CEList::add(uint32_t ce
, UErrorCode
&status
)
109 if (U_FAILURE(status
)) {
113 if (listSize
>= listMax
) {
114 int32_t newMax
= listMax
+ CELIST_BUFFER_SIZE
;
115 uint32_t *newCEs
= NEW_ARRAY(uint32_t, newMax
);
117 if (newCEs
== NULL
) {
118 status
= U_MEMORY_ALLOCATION_ERROR
;
122 uprv_memcpy(newCEs
, ces
, listSize
* sizeof(uint32_t));
124 if (ces
!= ceBuffer
) {
132 ces
[listSize
++] = ce
;
135 uint32_t CEList::get(int32_t index
) const
137 if (index
>= 0 && index
< listSize
) {
141 return (uint32_t)UCOL_NULLORDER
;
144 uint32_t &CEList::operator[](int32_t index
) const
149 UBool
CEList::matchesAt(int32_t offset
, const CEList
*other
) const
151 if (other
== NULL
|| listSize
- offset
< other
->size()) {
155 for (int32_t i
= offset
, j
= 0; j
< other
->size(); i
+= 1, j
+= 1) {
156 if (ces
[i
] != (*other
)[j
]) {
164 int32_t CEList::size() const
169 StringList::StringList(UErrorCode
&status
)
170 : strings(NULL
), listMax(STRING_LIST_BUFFER_SIZE
), listSize(0)
172 if (U_FAILURE(status
)) {
176 strings
= new UnicodeString
[listMax
];
178 if (strings
== NULL
) {
179 status
= U_MEMORY_ALLOCATION_ERROR
;
184 StringList::~StringList()
189 void StringList::add(const UnicodeString
*string
, UErrorCode
&status
)
191 if (U_FAILURE(status
)) {
194 if (listSize
>= listMax
) {
195 int32_t newMax
= listMax
+ STRING_LIST_BUFFER_SIZE
;
196 UnicodeString
*newStrings
= new UnicodeString
[newMax
];
197 if (newStrings
== NULL
) {
198 status
= U_MEMORY_ALLOCATION_ERROR
;
201 for (int32_t i
=0; i
<listSize
; ++i
) {
202 newStrings
[i
] = strings
[i
];
205 strings
= newStrings
;
209 // The ctor initialized all the strings in
210 // the array to empty strings, so this
211 // is the same as copying the source string.
212 strings
[listSize
++].append(*string
);
215 void StringList::add(const UChar
*chars
, int32_t count
, UErrorCode
&status
)
217 const UnicodeString
string(chars
, count
);
219 add(&string
, status
);
222 const UnicodeString
*StringList::get(int32_t index
) const
224 if (index
>= 0 && index
< listSize
) {
225 return &strings
[index
];
231 int32_t StringList::size() const
238 static void U_CALLCONV
239 deleteStringList(void *obj
)
241 StringList
*strings
= (StringList
*) obj
;
250 CEToStringsMap(UErrorCode
&status
);
253 void put(uint32_t ce
, UnicodeString
*string
, UErrorCode
&status
);
254 StringList
*getStringList(uint32_t ce
) const;
257 void putStringList(uint32_t ce
, StringList
*stringList
, UErrorCode
&status
);
261 CEToStringsMap::CEToStringsMap(UErrorCode
&status
)
264 if (U_FAILURE(status
)) {
268 map
= uhash_open(uhash_hashLong
, uhash_compareLong
,
269 uhash_compareCaselessUnicodeString
,
272 if (U_FAILURE(status
)) {
276 uhash_setValueDeleter(map
, deleteStringList
);
279 CEToStringsMap::~CEToStringsMap()
284 void CEToStringsMap::put(uint32_t ce
, UnicodeString
*string
, UErrorCode
&status
)
286 StringList
*strings
= getStringList(ce
);
288 if (strings
== NULL
) {
289 strings
= new StringList(status
);
291 if (strings
== NULL
|| U_FAILURE(status
)) {
292 status
= U_MEMORY_ALLOCATION_ERROR
;
296 putStringList(ce
, strings
, status
);
299 strings
->add(string
, status
);
302 StringList
*CEToStringsMap::getStringList(uint32_t ce
) const
304 return (StringList
*) uhash_iget(map
, ce
);
307 void CEToStringsMap::putStringList(uint32_t ce
, StringList
*stringList
, UErrorCode
&status
)
309 uhash_iput(map
, ce
, (void *) stringList
, &status
);
312 #define CLONE_COLLATOR
314 CollData::CollData(UCollator
*collator
, UErrorCode
&status
)
315 : coll(NULL
), ceToCharsStartingWith(NULL
)
317 // [:c:] == [[:cn:][:cc:][:co:][:cf:][:cs:]]
318 // i.e. other, control, private use, format, surrogate
319 U_STRING_DECL(test_pattern
, "[[:assigned:]-[:c:]]", 20);
320 U_STRING_INIT(test_pattern
, "[[:assigned:]-[:c:]]", 20);
321 USet
*charsToTest
= uset_openPattern(test_pattern
, 20, &status
);
323 // Han ext. A, Han, Jamo, Hangul, Han Ext. B
324 // i.e. all the characers we handle implicitly
325 U_STRING_DECL(remove_pattern
, "[[\\u3400-\\u9FFF][\\u1100-\\u11F9][\\uAC00-\\uD7AF][\\U00020000-\\U0002A6DF]]", 70);
326 U_STRING_INIT(remove_pattern
, "[[\\u3400-\\u9FFF][\\u1100-\\u11F9][\\uAC00-\\uD7AF][\\U00020000-\\U0002A6DF]]", 70);
327 USet
*charsToRemove
= uset_openPattern(remove_pattern
, 70, &status
);
329 if (U_FAILURE(status
)) {
333 USet
*expansions
= uset_openEmpty();
334 USet
*contractions
= uset_openEmpty();
337 ceToCharsStartingWith
= new CEToStringsMap(status
);
339 if (U_FAILURE(status
)) {
343 #ifdef CLONE_COLLATOR
344 coll
= ucol_safeClone(collator
, NULL
, NULL
, &status
);
346 if (U_FAILURE(status
)) {
353 ucol_getContractionsAndExpansions(coll
, contractions
, expansions
, FALSE
, &status
);
355 uset_addAll(charsToTest
, contractions
);
356 uset_addAll(charsToTest
, expansions
);
357 uset_removeAll(charsToTest
, charsToRemove
);
359 itemCount
= uset_getItemCount(charsToTest
);
360 for(int32_t item
= 0; item
< itemCount
; item
+= 1) {
361 UChar32 start
= 0, end
= 0;
363 int32_t len
= uset_getItem(charsToTest
, item
, &start
, &end
,
364 buffer
, 16, &status
);
367 for (UChar32 ch
= start
; ch
<= end
; ch
+= 1) {
368 UnicodeString
*st
= new UnicodeString(ch
);
371 status
= U_MEMORY_ALLOCATION_ERROR
;
375 CEList
*ceList
= new CEList(coll
, *st
, status
);
377 ceToCharsStartingWith
->put(ceList
->get(0), st
, status
);
382 } else if (len
> 0) {
383 UnicodeString
*st
= new UnicodeString(buffer
, len
);
386 status
= U_MEMORY_ALLOCATION_ERROR
;
390 CEList
*ceList
= new CEList(coll
, *st
, status
);
392 ceToCharsStartingWith
->put(ceList
->get(0), st
, status
);
397 // shouldn't happen...
400 if (U_FAILURE(status
)) {
406 uset_close(contractions
);
407 uset_close(expansions
);
408 uset_close(charsToRemove
);
409 uset_close(charsToTest
);
411 if (U_FAILURE(status
)) {
415 UnicodeSet
hanRanges(UNICODE_STRING_SIMPLE("[:Unified_Ideograph:]"), status
);
416 if (U_FAILURE(status
)) {
419 UnicodeSetIterator
hanIter(hanRanges
);
420 UnicodeString hanString
;
421 while(hanIter
.nextRange()) {
422 hanString
.append(hanIter
.getCodepoint());
423 hanString
.append(hanIter
.getCodepointEnd());
425 // TODO: Why U+11FF? The old code had an outdated UCOL_LAST_T_JAMO=0x11F9,
426 // but as of Unicode 6.3 the 11xx block is filled,
427 // and there are also more Jamo T at U+D7CB..U+D7FB.
428 // Maybe use [:HST=T:] and look for the end of the last range?
429 // Maybe use script boundary mappings instead of this code??
430 UChar jamoRanges
[] = {Hangul::JAMO_L_BASE
, Hangul::JAMO_V_BASE
, Hangul::JAMO_T_BASE
+ 1, 0x11FF};
431 UnicodeString
jamoString(FALSE
, jamoRanges
, ARRAY_SIZE(jamoRanges
));
432 CEList
hanList(coll
, hanString
, status
);
433 CEList
jamoList(coll
, jamoString
, status
);
436 if (U_FAILURE(status
)) {
440 for (int32_t c
= 0; c
< jamoList
.size(); c
+= 1) {
441 uint32_t jce
= jamoList
[c
];
443 if (! isContinuation(jce
)) {
444 jamoLimits
[j
++] = jce
;
448 jamoLimits
[3] += (1 << UCOL_PRIMARYORDERSHIFT
);
453 for(int32_t h
= 0; h
< hanList
.size(); h
+= 2) {
454 uint32_t han
= (uint32_t) hanList
[h
];
465 maxHan
+= (1 << UCOL_PRIMARYORDERSHIFT
);
468 CollData::~CollData()
470 #ifdef CLONE_COLLATOR
474 delete ceToCharsStartingWith
;
477 UCollator
*CollData::getCollator() const
482 const StringList
*CollData::getStringList(int32_t ce
) const
484 return ceToCharsStartingWith
->getStringList(ce
);
487 const CEList
*CollData::getCEList(const UnicodeString
*string
) const
489 UErrorCode status
= U_ZERO_ERROR
;
490 const CEList
*list
= new CEList(coll
, *string
, status
);
492 if (U_FAILURE(status
)) {
500 void CollData::freeCEList(const CEList
*list
)
505 int32_t CollData::minLengthInChars(const CEList
*ceList
, int32_t offset
, int32_t *history
) const
507 // find out shortest string for the longest sequence of ces.
508 // this can probably be folded with the minLengthCache...
510 if (history
[offset
] >= 0) {
511 return history
[offset
];
514 uint32_t ce
= ceList
->get(offset
);
515 int32_t maxOffset
= ceList
->size();
516 int32_t shortestLength
= INT32_MAX
;
517 const StringList
*strings
= ceToCharsStartingWith
->getStringList(ce
);
519 if (strings
!= NULL
) {
520 int32_t stringCount
= strings
->size();
522 for (int32_t s
= 0; s
< stringCount
; s
+= 1) {
523 const UnicodeString
*string
= strings
->get(s
);
524 UErrorCode status
= U_ZERO_ERROR
;
525 const CEList
*ceList2
= new CEList(coll
, *string
, status
);
527 if (U_FAILURE(status
)) {
532 if (ceList
->matchesAt(offset
, ceList2
)) {
533 U_ASSERT(ceList2
!= NULL
);
534 int32_t clength
= ceList2
->size();
535 int32_t slength
= string
->length();
536 int32_t roffset
= offset
+ clength
;
539 if (roffset
< maxOffset
) {
540 rlength
= minLengthInChars(ceList
, roffset
, history
);
543 // delete before continue to avoid memory leak.
546 // ignore any dead ends
551 if (shortestLength
> slength
+ rlength
) {
552 shortestLength
= slength
+ rlength
;
560 if (shortestLength
== INT32_MAX
) {
561 // No matching strings at this offset. See if
562 // the CE is in a range we can handle manually.
563 if (ce
>= minHan
&& ce
< maxHan
) {
564 // all han have implicit orders which
566 int32_t roffset
= offset
+ 2;
569 //history[roffset++] = -1;
570 //history[roffset++] = 1;
572 if (roffset
< maxOffset
) {
573 rlength
= minLengthInChars(ceList
, roffset
, history
);
580 shortestLength
= 1 + rlength
;
582 } else if (ce
>= jamoLimits
[0] && ce
< jamoLimits
[3]) {
583 int32_t roffset
= offset
;
586 // **** this loop may not handle archaic Hangul correctly ****
587 for (int32_t j
= 0; roffset
< maxOffset
&& j
< 4; j
+= 1, roffset
+= 1) {
588 uint32_t jce
= ceList
->get(roffset
);
590 // Some Jamo have 24-bit primary order; skip the
591 // 2nd CE. This should always be OK because if
592 // we're still in the loop all we've seen are
593 // a series of Jamo in LVT order.
594 if (isContinuation(jce
)) {
598 if (j
>= 3 || jce
< jamoLimits
[j
] || jce
>= jamoLimits
[j
+ 1]) {
603 if (roffset
== offset
) {
604 // we started with a non-L Jamo...
605 // just say it comes from a single character
608 // See if the single Jamo has a 24-bit order.
609 if (roffset
< maxOffset
&& isContinuation(ceList
->get(roffset
))) {
614 if (roffset
< maxOffset
) {
615 rlength
= minLengthInChars(ceList
, roffset
, history
);
622 shortestLength
= 1 + rlength
;
626 // Can't handle it manually either. Just move on.
631 history
[offset
] = shortestLength
;
633 return shortestLength
;
636 int32_t CollData::minLengthInChars(const CEList
*ceList
, int32_t offset
) const
638 int32_t clength
= ceList
->size();
639 int32_t *history
= NEW_ARRAY(int32_t, clength
);
641 for (int32_t i
= 0; i
< clength
; i
+= 1) {
645 int32_t minLength
= minLengthInChars(ceList
, offset
, history
);
647 DELETE_ARRAY(history
);
652 #endif // #if !UCONFIG_NO_COLLATION