2 ******************************************************************************
3 * Copyright (C) 1996-2012, International Business Machines *
4 * Corporation and others. All Rights Reserved. *
5 ******************************************************************************
8 #include "unicode/utypes.h"
10 #if !UCONFIG_NO_COLLATION
12 #include "unicode/unistr.h"
13 #include "unicode/usearch.h"
16 #include "unicode/coll.h"
17 #include "unicode/tblcoll.h"
18 #include "unicode/coleitr.h"
19 #include "unicode/ucoleitr.h"
21 #include "unicode/regex.h" // TODO: make conditional on regexp being built.
23 #include "unicode/uniset.h"
24 #include "unicode/uset.h"
25 #include "unicode/ustring.h"
33 #define ARRAY_SIZE(array) (sizeof(array)/sizeof(array[0]))
34 #define NEW_ARRAY(type, count) (type *) uprv_malloc((count) * sizeof(type))
35 #define DELETE_ARRAY(array) uprv_free((void *) (array))
36 #define ARRAY_COPY(dst, src, count) uprv_memcpy((void *) (dst), (void *) (src), (count) * sizeof (src)[0])
38 CEList::CEList(UCollator
*coll
, const UnicodeString
&string
, UErrorCode
&status
)
39 : ces(NULL
), listMax(CELIST_BUFFER_SIZE
), listSize(0)
41 UCollationElements
*elems
= ucol_openElements(coll
, string
.getBuffer(), string
.length(), &status
);
42 UCollationStrength strength
= ucol_getStrength(coll
);
43 UBool toShift
= ucol_getAttribute(coll
, UCOL_ALTERNATE_HANDLING
, &status
) == UCOL_SHIFTED
;
44 uint32_t variableTop
= ucol_getVariableTop(coll
, &status
);
45 uint32_t strengthMask
= 0;
48 if (U_FAILURE(status
)) {
52 // **** only set flag if string has Han(gul) ****
53 ucol_forceHanImplicit(elems
, &status
);
58 strengthMask
|= UCOL_TERTIARYORDERMASK
;
62 strengthMask
|= UCOL_SECONDARYORDERMASK
;
66 strengthMask
|= UCOL_PRIMARYORDERMASK
;
71 while ((order
= ucol_next(elems
, &status
)) != UCOL_NULLORDER
) {
72 UBool cont
= isContinuation(order
);
74 order
&= strengthMask
;
76 if (toShift
&& variableTop
> (uint32_t)order
&& (order
& UCOL_PRIMARYORDERMASK
) != 0) {
77 if (strength
>= UCOL_QUATERNARY
) {
78 order
&= UCOL_PRIMARYORDERMASK
;
80 order
= UCOL_IGNORABLE
;
84 if (order
== UCOL_IGNORABLE
) {
89 order
|= UCOL_CONTINUATION_MARKER
;
95 ucol_closeElements(elems
);
100 if (ces
!= ceBuffer
) {
105 void CEList::add(uint32_t ce
, UErrorCode
&status
)
107 if (U_FAILURE(status
)) {
111 if (listSize
>= listMax
) {
112 int32_t newMax
= listMax
+ CELIST_BUFFER_SIZE
;
113 uint32_t *newCEs
= NEW_ARRAY(uint32_t, newMax
);
115 if (newCEs
== NULL
) {
116 status
= U_MEMORY_ALLOCATION_ERROR
;
120 uprv_memcpy(newCEs
, ces
, listSize
* sizeof(uint32_t));
122 if (ces
!= ceBuffer
) {
130 ces
[listSize
++] = ce
;
133 uint32_t CEList::get(int32_t index
) const
135 if (index
>= 0 && index
< listSize
) {
139 return (uint32_t)UCOL_NULLORDER
;
142 uint32_t &CEList::operator[](int32_t index
) const
147 UBool
CEList::matchesAt(int32_t offset
, const CEList
*other
) const
149 if (other
== NULL
|| listSize
- offset
< other
->size()) {
153 for (int32_t i
= offset
, j
= 0; j
< other
->size(); i
+= 1, j
+= 1) {
154 if (ces
[i
] != (*other
)[j
]) {
162 int32_t CEList::size() const
167 StringList::StringList(UErrorCode
&status
)
168 : strings(NULL
), listMax(STRING_LIST_BUFFER_SIZE
), listSize(0)
170 if (U_FAILURE(status
)) {
174 strings
= new UnicodeString
[listMax
];
176 if (strings
== NULL
) {
177 status
= U_MEMORY_ALLOCATION_ERROR
;
182 StringList::~StringList()
187 void StringList::add(const UnicodeString
*string
, UErrorCode
&status
)
189 if (U_FAILURE(status
)) {
192 if (listSize
>= listMax
) {
193 int32_t newMax
= listMax
+ STRING_LIST_BUFFER_SIZE
;
194 UnicodeString
*newStrings
= new UnicodeString
[newMax
];
195 if (newStrings
== NULL
) {
196 status
= U_MEMORY_ALLOCATION_ERROR
;
199 for (int32_t i
=0; i
<listSize
; ++i
) {
200 newStrings
[i
] = strings
[i
];
203 strings
= newStrings
;
207 // The ctor initialized all the strings in
208 // the array to empty strings, so this
209 // is the same as copying the source string.
210 strings
[listSize
++].append(*string
);
213 void StringList::add(const UChar
*chars
, int32_t count
, UErrorCode
&status
)
215 const UnicodeString
string(chars
, count
);
217 add(&string
, status
);
220 const UnicodeString
*StringList::get(int32_t index
) const
222 if (index
>= 0 && index
< listSize
) {
223 return &strings
[index
];
229 int32_t StringList::size() const
236 static void U_CALLCONV
237 deleteStringList(void *obj
)
239 StringList
*strings
= (StringList
*) obj
;
248 CEToStringsMap(UErrorCode
&status
);
251 void put(uint32_t ce
, UnicodeString
*string
, UErrorCode
&status
);
252 StringList
*getStringList(uint32_t ce
) const;
255 void putStringList(uint32_t ce
, StringList
*stringList
, UErrorCode
&status
);
259 CEToStringsMap::CEToStringsMap(UErrorCode
&status
)
262 if (U_FAILURE(status
)) {
266 map
= uhash_open(uhash_hashLong
, uhash_compareLong
,
267 uhash_compareCaselessUnicodeString
,
270 if (U_FAILURE(status
)) {
274 uhash_setValueDeleter(map
, deleteStringList
);
277 CEToStringsMap::~CEToStringsMap()
282 void CEToStringsMap::put(uint32_t ce
, UnicodeString
*string
, UErrorCode
&status
)
284 StringList
*strings
= getStringList(ce
);
286 if (strings
== NULL
) {
287 strings
= new StringList(status
);
289 if (strings
== NULL
|| U_FAILURE(status
)) {
290 status
= U_MEMORY_ALLOCATION_ERROR
;
294 putStringList(ce
, strings
, status
);
297 strings
->add(string
, status
);
300 StringList
*CEToStringsMap::getStringList(uint32_t ce
) const
302 return (StringList
*) uhash_iget(map
, ce
);
305 void CEToStringsMap::putStringList(uint32_t ce
, StringList
*stringList
, UErrorCode
&status
)
307 uhash_iput(map
, ce
, (void *) stringList
, &status
);
310 #define CLONE_COLLATOR
312 CollData::CollData(UCollator
*collator
, UErrorCode
&status
)
313 : coll(NULL
), ceToCharsStartingWith(NULL
)
315 // [:c:] == [[:cn:][:cc:][:co:][:cf:][:cs:]]
316 // i.e. other, control, private use, format, surrogate
317 U_STRING_DECL(test_pattern
, "[[:assigned:]-[:c:]]", 20);
318 U_STRING_INIT(test_pattern
, "[[:assigned:]-[:c:]]", 20);
319 USet
*charsToTest
= uset_openPattern(test_pattern
, 20, &status
);
321 // Han ext. A, Han, Jamo, Hangul, Han Ext. B
322 // i.e. all the characers we handle implicitly
323 U_STRING_DECL(remove_pattern
, "[[\\u3400-\\u9FFF][\\u1100-\\u11F9][\\uAC00-\\uD7AF][\\U00020000-\\U0002A6DF]]", 70);
324 U_STRING_INIT(remove_pattern
, "[[\\u3400-\\u9FFF][\\u1100-\\u11F9][\\uAC00-\\uD7AF][\\U00020000-\\U0002A6DF]]", 70);
325 USet
*charsToRemove
= uset_openPattern(remove_pattern
, 70, &status
);
327 if (U_FAILURE(status
)) {
331 USet
*expansions
= uset_openEmpty();
332 USet
*contractions
= uset_openEmpty();
335 ceToCharsStartingWith
= new CEToStringsMap(status
);
337 if (U_FAILURE(status
)) {
341 #ifdef CLONE_COLLATOR
342 coll
= ucol_safeClone(collator
, NULL
, NULL
, &status
);
344 if (U_FAILURE(status
)) {
351 ucol_getContractionsAndExpansions(coll
, contractions
, expansions
, FALSE
, &status
);
353 uset_addAll(charsToTest
, contractions
);
354 uset_addAll(charsToTest
, expansions
);
355 uset_removeAll(charsToTest
, charsToRemove
);
357 itemCount
= uset_getItemCount(charsToTest
);
358 for(int32_t item
= 0; item
< itemCount
; item
+= 1) {
359 UChar32 start
= 0, end
= 0;
361 int32_t len
= uset_getItem(charsToTest
, item
, &start
, &end
,
362 buffer
, 16, &status
);
365 for (UChar32 ch
= start
; ch
<= end
; ch
+= 1) {
366 UnicodeString
*st
= new UnicodeString(ch
);
369 status
= U_MEMORY_ALLOCATION_ERROR
;
373 CEList
*ceList
= new CEList(coll
, *st
, status
);
375 ceToCharsStartingWith
->put(ceList
->get(0), st
, status
);
380 } else if (len
> 0) {
381 UnicodeString
*st
= new UnicodeString(buffer
, len
);
384 status
= U_MEMORY_ALLOCATION_ERROR
;
388 CEList
*ceList
= new CEList(coll
, *st
, status
);
390 ceToCharsStartingWith
->put(ceList
->get(0), st
, status
);
395 // shouldn't happen...
398 if (U_FAILURE(status
)) {
404 uset_close(contractions
);
405 uset_close(expansions
);
406 uset_close(charsToRemove
);
407 uset_close(charsToTest
);
409 if (U_FAILURE(status
)) {
413 UChar32 hanRanges
[] = {UCOL_FIRST_HAN
, UCOL_LAST_HAN
, UCOL_FIRST_HAN_COMPAT
, UCOL_LAST_HAN_COMPAT
, UCOL_FIRST_HAN_A
, UCOL_LAST_HAN_A
,
414 UCOL_FIRST_HAN_B
, UCOL_LAST_HAN_B
};
415 UChar jamoRanges
[] = {UCOL_FIRST_L_JAMO
, UCOL_FIRST_V_JAMO
, UCOL_FIRST_T_JAMO
, UCOL_LAST_T_JAMO
};
416 UnicodeString hanString
= UnicodeString::fromUTF32(hanRanges
, ARRAY_SIZE(hanRanges
));
417 UnicodeString
jamoString(FALSE
, jamoRanges
, ARRAY_SIZE(jamoRanges
));
418 CEList
hanList(coll
, hanString
, status
);
419 CEList
jamoList(coll
, jamoString
, status
);
422 if (U_FAILURE(status
)) {
426 for (int32_t c
= 0; c
< jamoList
.size(); c
+= 1) {
427 uint32_t jce
= jamoList
[c
];
429 if (! isContinuation(jce
)) {
430 jamoLimits
[j
++] = jce
;
434 jamoLimits
[3] += (1 << UCOL_PRIMARYORDERSHIFT
);
439 for(int32_t h
= 0; h
< hanList
.size(); h
+= 2) {
440 uint32_t han
= (uint32_t) hanList
[h
];
451 maxHan
+= (1 << UCOL_PRIMARYORDERSHIFT
);
454 CollData::~CollData()
456 #ifdef CLONE_COLLATOR
460 delete ceToCharsStartingWith
;
463 UCollator
*CollData::getCollator() const
468 const StringList
*CollData::getStringList(int32_t ce
) const
470 return ceToCharsStartingWith
->getStringList(ce
);
473 const CEList
*CollData::getCEList(const UnicodeString
*string
) const
475 UErrorCode status
= U_ZERO_ERROR
;
476 const CEList
*list
= new CEList(coll
, *string
, status
);
478 if (U_FAILURE(status
)) {
486 void CollData::freeCEList(const CEList
*list
)
491 int32_t CollData::minLengthInChars(const CEList
*ceList
, int32_t offset
, int32_t *history
) const
493 // find out shortest string for the longest sequence of ces.
494 // this can probably be folded with the minLengthCache...
496 if (history
[offset
] >= 0) {
497 return history
[offset
];
500 uint32_t ce
= ceList
->get(offset
);
501 int32_t maxOffset
= ceList
->size();
502 int32_t shortestLength
= INT32_MAX
;
503 const StringList
*strings
= ceToCharsStartingWith
->getStringList(ce
);
505 if (strings
!= NULL
) {
506 int32_t stringCount
= strings
->size();
508 for (int32_t s
= 0; s
< stringCount
; s
+= 1) {
509 const UnicodeString
*string
= strings
->get(s
);
510 UErrorCode status
= U_ZERO_ERROR
;
511 const CEList
*ceList2
= new CEList(coll
, *string
, status
);
513 if (U_FAILURE(status
)) {
518 if (ceList
->matchesAt(offset
, ceList2
)) {
519 U_ASSERT(ceList2
!= NULL
);
520 int32_t clength
= ceList2
->size();
521 int32_t slength
= string
->length();
522 int32_t roffset
= offset
+ clength
;
525 if (roffset
< maxOffset
) {
526 rlength
= minLengthInChars(ceList
, roffset
, history
);
529 // delete before continue to avoid memory leak.
532 // ignore any dead ends
537 if (shortestLength
> slength
+ rlength
) {
538 shortestLength
= slength
+ rlength
;
546 if (shortestLength
== INT32_MAX
) {
547 // No matching strings at this offset. See if
548 // the CE is in a range we can handle manually.
549 if (ce
>= minHan
&& ce
< maxHan
) {
550 // all han have implicit orders which
552 int32_t roffset
= offset
+ 2;
555 //history[roffset++] = -1;
556 //history[roffset++] = 1;
558 if (roffset
< maxOffset
) {
559 rlength
= minLengthInChars(ceList
, roffset
, history
);
566 shortestLength
= 1 + rlength
;
568 } else if (ce
>= jamoLimits
[0] && ce
< jamoLimits
[3]) {
569 int32_t roffset
= offset
;
572 // **** this loop may not handle archaic Hangul correctly ****
573 for (int32_t j
= 0; roffset
< maxOffset
&& j
< 4; j
+= 1, roffset
+= 1) {
574 uint32_t jce
= ceList
->get(roffset
);
576 // Some Jamo have 24-bit primary order; skip the
577 // 2nd CE. This should always be OK because if
578 // we're still in the loop all we've seen are
579 // a series of Jamo in LVT order.
580 if (isContinuation(jce
)) {
584 if (j
>= 3 || jce
< jamoLimits
[j
] || jce
>= jamoLimits
[j
+ 1]) {
589 if (roffset
== offset
) {
590 // we started with a non-L Jamo...
591 // just say it comes from a single character
594 // See if the single Jamo has a 24-bit order.
595 if (roffset
< maxOffset
&& isContinuation(ceList
->get(roffset
))) {
600 if (roffset
< maxOffset
) {
601 rlength
= minLengthInChars(ceList
, roffset
, history
);
608 shortestLength
= 1 + rlength
;
612 // Can't handle it manually either. Just move on.
617 history
[offset
] = shortestLength
;
619 return shortestLength
;
622 int32_t CollData::minLengthInChars(const CEList
*ceList
, int32_t offset
) const
624 int32_t clength
= ceList
->size();
625 int32_t *history
= NEW_ARRAY(int32_t, clength
);
627 for (int32_t i
= 0; i
< clength
; i
+= 1) {
631 int32_t minLength
= minLengthInChars(ceList
, offset
, history
);
633 DELETE_ARRAY(history
);
638 #endif // #if !UCONFIG_NO_COLLATION