2 *******************************************************************************
4 * Copyright (C) 2001-2011, International Business Machines
5 * Corporation and others. All Rights Reserved.
7 *******************************************************************************
8 * file name: ustrcase.cpp
10 * tab size: 8 (not used)
13 * created on: 2002feb20
14 * created by: Markus W. Scherer
16 * Implementation file for string casing C API functions.
17 * Uses functions from uchar.c for basic functionality that requires access
18 * to the Unicode Character Database (uprops.dat).
21 #include "unicode/utypes.h"
22 #include "unicode/brkiter.h"
23 #include "unicode/ustring.h"
24 #include "unicode/ucasemap.h"
25 #include "unicode/ubrk.h"
26 #include "unicode/utf.h"
27 #include "unicode/utf16.h"
32 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
36 /* string casing ------------------------------------------------------------ */
38 /* Appends a full case mapping result, see UCASE_MAX_STRING_LENGTH. */
40 appendResult(UChar
*dest
, int32_t destIndex
, int32_t destCapacity
,
41 int32_t result
, const UChar
*s
) {
45 /* decode the result */
47 /* (not) original code point */
50 } else if(result
<=UCASE_MAX_STRING_LENGTH
) {
58 if(destIndex
<destCapacity
) {
59 /* append the result */
63 U16_APPEND(dest
, destIndex
, destCapacity
, c
, isError
);
65 /* overflow, nothing written */
66 destIndex
+=U16_LENGTH(c
);
70 if((destIndex
+length
)<=destCapacity
) {
72 dest
[destIndex
++]=*s
++;
83 destIndex
+=U16_LENGTH(c
);
91 static UChar32 U_CALLCONV
92 utf16_caseContextIterator(void *context
, int8_t dir
) {
93 UCaseContext
*csc
=(UCaseContext
*)context
;
97 /* reset for backward iteration */
98 csc
->index
=csc
->cpStart
;
101 /* reset for forward iteration */
102 csc
->index
=csc
->cpLimit
;
105 /* continue current iteration direction */
110 if(csc
->start
<csc
->index
) {
111 U16_PREV((const UChar
*)csc
->p
, csc
->start
, csc
->index
, c
);
115 if(csc
->index
<csc
->limit
) {
116 U16_NEXT((const UChar
*)csc
->p
, csc
->index
, csc
->limit
, c
);
124 * Case-maps [srcStart..srcLimit[ but takes
125 * context [0..srcLength[ into account.
128 _caseMap(const UCaseMap
*csm
, UCaseMapFull
*map
,
129 UChar
*dest
, int32_t destCapacity
,
130 const UChar
*src
, UCaseContext
*csc
,
131 int32_t srcStart
, int32_t srcLimit
,
132 UErrorCode
*pErrorCode
) {
135 int32_t srcIndex
, destIndex
;
138 locCache
=csm
->locCache
;
140 /* case mapping loop */
143 while(srcIndex
<srcLimit
) {
144 csc
->cpStart
=srcIndex
;
145 U16_NEXT(src
, srcIndex
, srcLimit
, c
);
146 csc
->cpLimit
=srcIndex
;
147 c
=map(csm
->csp
, c
, utf16_caseContextIterator
, csc
, &s
, csm
->locale
, &locCache
);
148 if((destIndex
<destCapacity
) && (c
<0 ? (c2
=~c
)<=0xffff : UCASE_MAX_STRING_LENGTH
<c
&& (c2
=c
)<=0xffff)) {
149 /* fast path version of appendResult() for BMP results */
150 dest
[destIndex
++]=(UChar
)c2
;
152 destIndex
=appendResult(dest
, destIndex
, destCapacity
, c
, s
);
156 if(destIndex
>destCapacity
) {
157 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
162 #if !UCONFIG_NO_BREAK_ITERATION
164 U_CFUNC
int32_t U_CALLCONV
165 ustrcase_internalToTitle(const UCaseMap
*csm
,
166 UChar
*dest
, int32_t destCapacity
,
167 const UChar
*src
, int32_t srcLength
,
168 UErrorCode
*pErrorCode
) {
171 int32_t prev
, titleStart
, titleLimit
, idx
, destIndex
, length
;
174 if(U_FAILURE(*pErrorCode
)) {
178 // Use the C++ abstract base class to minimize dependencies.
179 // TODO: Change UCaseMap.iter to store a BreakIterator directly.
180 BreakIterator
*bi
=reinterpret_cast<BreakIterator
*>(csm
->iter
);
182 /* set up local variables */
183 int32_t locCache
=csm
->locCache
;
184 UCaseContext csc
=UCASECONTEXT_INITIALIZER
;
191 /* titlecasing loop */
192 while(prev
<srcLength
) {
193 /* find next index where to titlecase */
200 if(idx
==UBRK_DONE
|| idx
>srcLength
) {
205 * Unicode 4 & 5 section 3.13 Default Case Operations:
207 * R3 toTitlecase(X): Find the word boundaries based on Unicode Standard Annex
208 * #29, "Text Boundaries." Between each pair of word boundaries, find the first
209 * cased character F. If F exists, map F to default_title(F); then map each
210 * subsequent character C to default_lower(C).
212 * In this implementation, segment [prev..index[ into 3 parts:
213 * a) uncased characters (copy as-is) [prev..titleStart[
214 * b) first case letter (titlecase) [titleStart..titleLimit[
215 * c) subsequent characters (lowercase) [titleLimit..index[
218 /* find and copy uncased characters [prev..titleStart[ */
219 titleStart
=titleLimit
=prev
;
220 U16_NEXT(src
, titleLimit
, idx
, c
);
221 if((csm
->options
&U_TITLECASE_NO_BREAK_ADJUSTMENT
)==0 && UCASE_NONE
==ucase_getType(csm
->csp
, c
)) {
222 /* Adjust the titlecasing index (titleStart) to the next cased character. */
224 titleStart
=titleLimit
;
225 if(titleLimit
==idx
) {
227 * only uncased characters in [prev..index[
228 * stop with titleStart==titleLimit==index
232 U16_NEXT(src
, titleLimit
, idx
, c
);
233 if(UCASE_NONE
!=ucase_getType(csm
->csp
, c
)) {
234 break; /* cased letter at [titleStart..titleLimit[ */
237 length
=titleStart
-prev
;
239 if((destIndex
+length
)<=destCapacity
) {
240 uprv_memcpy(dest
+destIndex
, src
+prev
, length
*U_SIZEOF_UCHAR
);
246 if(titleStart
<titleLimit
) {
247 /* titlecase c which is from [titleStart..titleLimit[ */
248 csc
.cpStart
=titleStart
;
249 csc
.cpLimit
=titleLimit
;
250 c
=ucase_toFullTitle(csm
->csp
, c
, utf16_caseContextIterator
, &csc
, &s
, csm
->locale
, &locCache
);
251 destIndex
=appendResult(dest
, destIndex
, destCapacity
, c
, s
);
253 /* Special case Dutch IJ titlecasing */
254 if ( titleStart
+1 < idx
&&
255 ucase_getCaseLocale(csm
->locale
,&locCache
) == UCASE_LOC_DUTCH
&&
256 ( src
[titleStart
] == (UChar32
) 0x0049 || src
[titleStart
] == (UChar32
) 0x0069 ) &&
257 ( src
[titleStart
+1] == (UChar32
) 0x004A || src
[titleStart
+1] == (UChar32
) 0x006A )) {
259 destIndex
=appendResult(dest
, destIndex
, destCapacity
, c
, s
);
263 /* lowercase [titleLimit..index[ */
265 if((csm
->options
&U_TITLECASE_NO_LOWERCASE
)==0) {
266 /* Normal operation: Lowercase the rest of the word. */
269 csm
, ucase_toFullLower
,
270 dest
+destIndex
, destCapacity
-destIndex
,
275 /* Optionally just copy the rest of the word unchanged. */
276 length
=idx
-titleLimit
;
277 if((destIndex
+length
)<=destCapacity
) {
278 uprv_memcpy(dest
+destIndex
, src
+titleLimit
, length
*U_SIZEOF_UCHAR
);
289 if(destIndex
>destCapacity
) {
290 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
295 #endif // !UCONFIG_NO_BREAK_ITERATION
297 /* functions available in the common library (for unistr_case.cpp) */
299 U_CFUNC
int32_t U_CALLCONV
300 ustrcase_internalToLower(const UCaseMap
*csm
,
301 UChar
*dest
, int32_t destCapacity
,
302 const UChar
*src
, int32_t srcLength
,
303 UErrorCode
*pErrorCode
) {
304 UCaseContext csc
=UCASECONTEXT_INITIALIZER
;
308 csm
, ucase_toFullLower
,
310 src
, &csc
, 0, srcLength
,
314 U_CFUNC
int32_t U_CALLCONV
315 ustrcase_internalToUpper(const UCaseMap
*csm
,
316 UChar
*dest
, int32_t destCapacity
,
317 const UChar
*src
, int32_t srcLength
,
318 UErrorCode
*pErrorCode
) {
319 UCaseContext csc
=UCASECONTEXT_INITIALIZER
;
323 csm
, ucase_toFullUpper
,
325 src
, &csc
, 0, srcLength
,
330 ustr_foldCase(const UCaseProps
*csp
,
331 UChar
*dest
, int32_t destCapacity
,
332 const UChar
*src
, int32_t srcLength
,
334 UErrorCode
*pErrorCode
) {
335 int32_t srcIndex
, destIndex
;
340 /* case mapping loop */
341 srcIndex
=destIndex
=0;
342 while(srcIndex
<srcLength
) {
343 U16_NEXT(src
, srcIndex
, srcLength
, c
);
344 c
=ucase_toFullFolding(csp
, c
, &s
, options
);
345 if((destIndex
<destCapacity
) && (c
<0 ? (c2
=~c
)<=0xffff : UCASE_MAX_STRING_LENGTH
<c
&& (c2
=c
)<=0xffff)) {
346 /* fast path version of appendResult() for BMP results */
347 dest
[destIndex
++]=(UChar
)c2
;
349 destIndex
=appendResult(dest
, destIndex
, destCapacity
, c
, s
);
353 if(destIndex
>destCapacity
) {
354 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
359 U_CFUNC
int32_t U_CALLCONV
360 ustrcase_internalFold(const UCaseMap
*csm
,
361 UChar
*dest
, int32_t destCapacity
,
362 const UChar
*src
, int32_t srcLength
,
363 UErrorCode
*pErrorCode
) {
364 return ustr_foldCase(csm
->csp
, dest
, destCapacity
, src
, srcLength
, csm
->options
, pErrorCode
);
368 ustrcase_map(const UCaseMap
*csm
,
369 UChar
*dest
, int32_t destCapacity
,
370 const UChar
*src
, int32_t srcLength
,
371 UStringCaseMapper
*stringCaseMapper
,
372 UErrorCode
*pErrorCode
) {
378 /* check argument values */
379 if(U_FAILURE(*pErrorCode
)) {
382 if( destCapacity
<0 ||
383 (dest
==NULL
&& destCapacity
>0) ||
387 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
391 /* get the string length */
393 srcLength
=u_strlen(src
);
396 /* check for overlapping source and destination */
398 ((src
>=dest
&& src
<(dest
+destCapacity
)) ||
399 (dest
>=src
&& dest
<(src
+srcLength
)))
401 /* overlap: provide a temporary destination buffer and later copy the result */
402 if(destCapacity
<=LENGTHOF(buffer
)) {
403 /* the stack buffer is large enough */
406 /* allocate a buffer */
407 temp
=(UChar
*)uprv_malloc(destCapacity
*U_SIZEOF_UCHAR
);
409 *pErrorCode
=U_MEMORY_ALLOCATION_ERROR
;
417 destLength
=stringCaseMapper(csm
, temp
, destCapacity
, src
, srcLength
, pErrorCode
);
419 /* copy the result string to the destination buffer */
421 int32_t copyLength
= destLength
<=destCapacity
? destLength
: destCapacity
;
423 uprv_memmove(dest
, temp
, copyLength
*U_SIZEOF_UCHAR
);
431 return u_terminateUChars(dest
, destCapacity
, destLength
, pErrorCode
);
434 /* public API functions */
436 U_CAPI
int32_t U_EXPORT2
437 u_strFoldCase(UChar
*dest
, int32_t destCapacity
,
438 const UChar
*src
, int32_t srcLength
,
440 UErrorCode
*pErrorCode
) {
441 UCaseMap csm
=UCASEMAP_INITIALIZER
;
442 csm
.csp
=ucase_getSingleton();
448 ustrcase_internalFold
, pErrorCode
);
451 /* case-insensitive string comparisons -------------------------------------- */
454 * This function is a copy of unorm_cmpEquivFold() minus the parts for
455 * canonical equivalence.
456 * Keep the functions in sync, and see there for how this works.
457 * The duplication is for modularization:
458 * It makes caseless (but not canonical caseless) matches independent of
459 * the normalization code.
462 /* stack element for previous-level source/decomposition pointers */
463 struct CmpEquivLevel
{
464 const UChar
*start
, *s
, *limit
;
466 typedef struct CmpEquivLevel CmpEquivLevel
;
468 /* internal function */
470 u_strcmpFold(const UChar
*s1
, int32_t length1
,
471 const UChar
*s2
, int32_t length2
,
473 UErrorCode
*pErrorCode
) {
474 const UCaseProps
*csp
;
476 /* current-level start/limit - s1/s2 as current */
477 const UChar
*start1
, *start2
, *limit1
, *limit2
;
479 /* case folding variables */
483 /* stacks of previous-level start/current/limit */
484 CmpEquivLevel stack1
[2], stack2
[2];
486 /* case folding buffers, only use current-level start/limit */
487 UChar fold1
[UCASE_MAX_STRING_LENGTH
+1], fold2
[UCASE_MAX_STRING_LENGTH
+1];
489 /* track which is the current level per string */
490 int32_t level1
, level2
;
492 /* current code units, and code points for lookups */
493 UChar32 c1
, c2
, cp1
, cp2
;
495 /* no argument error checking because this itself is not an API */
498 * assume that at least the option U_COMPARE_IGNORE_CASE is set
499 * otherwise this function would have to behave exactly as uprv_strCompare()
501 csp
=ucase_getSingleton();
502 if(U_FAILURE(*pErrorCode
)) {
524 /* comparison loop */
527 * here a code unit value of -1 means "get another code unit"
528 * below it will mean "this source is finished"
532 /* get next code unit from string 1, post-increment */
534 if(s1
==limit1
|| ((c1
=*s1
)==0 && (limit1
==NULL
|| (options
&_STRNCMP_STYLE
)))) {
544 /* reached end of level buffer, pop one level */
547 start1
=stack1
[level1
].start
; /*Not uninitialized*/
548 } while(start1
==NULL
);
549 s1
=stack1
[level1
].s
; /*Not uninitialized*/
550 limit1
=stack1
[level1
].limit
; /*Not uninitialized*/
555 /* get next code unit from string 2, post-increment */
557 if(s2
==limit2
|| ((c2
=*s2
)==0 && (limit2
==NULL
|| (options
&_STRNCMP_STYLE
)))) {
567 /* reached end of level buffer, pop one level */
570 start2
=stack2
[level2
].start
; /*Not uninitialized*/
571 } while(start2
==NULL
);
572 s2
=stack2
[level2
].s
; /*Not uninitialized*/
573 limit2
=stack2
[level2
].limit
; /*Not uninitialized*/
579 * either variable c1, c2 is -1 only if the corresponding string is finished
583 return 0; /* c1==c2==-1 indicating end of strings */
585 c1
=c2
=-1; /* make us fetch new code units */
588 return -1; /* string 1 ends before string 2 */
590 return 1; /* string 2 ends before string 1 */
592 /* c1!=c2 && c1>=0 && c2>=0 */
594 /* get complete code points for c1, c2 for lookups if either is a surrogate */
596 if(U_IS_SURROGATE(c1
)) {
599 if(U_IS_SURROGATE_LEAD(c1
)) {
600 if(s1
!=limit1
&& U16_IS_TRAIL(c
=*s1
)) {
601 /* advance ++s1; only below if cp1 decomposes/case-folds */
602 cp1
=U16_GET_SUPPLEMENTARY(c1
, c
);
604 } else /* isTrail(c1) */ {
605 if(start1
<=(s1
-2) && U16_IS_LEAD(c
=*(s1
-2))) {
606 cp1
=U16_GET_SUPPLEMENTARY(c
, c1
);
612 if(U_IS_SURROGATE(c2
)) {
615 if(U_IS_SURROGATE_LEAD(c2
)) {
616 if(s2
!=limit2
&& U16_IS_TRAIL(c
=*s2
)) {
617 /* advance ++s2; only below if cp2 decomposes/case-folds */
618 cp2
=U16_GET_SUPPLEMENTARY(c2
, c
);
620 } else /* isTrail(c2) */ {
621 if(start2
<=(s2
-2) && U16_IS_LEAD(c
=*(s2
-2))) {
622 cp2
=U16_GET_SUPPLEMENTARY(c
, c2
);
628 * go down one level for each string
629 * continue with the main loop as soon as there is a real change
633 (length
=ucase_toFullFolding(csp
, (UChar32
)cp1
, &p
, options
))>=0
635 /* cp1 case-folds to the code point "length" or to p[length] */
636 if(U_IS_SURROGATE(c1
)) {
637 if(U_IS_SURROGATE_LEAD(c1
)) {
638 /* advance beyond source surrogate pair if it case-folds */
640 } else /* isTrail(c1) */ {
642 * we got a supplementary code point when hitting its trail surrogate,
643 * therefore the lead surrogate must have been the same as in the other string;
644 * compare this decomposition with the lead surrogate in the other string
645 * remember that this simulates bulk text replacement:
646 * the decomposition would replace the entire code point
653 /* push current level pointers */
654 stack1
[0].start
=start1
;
656 stack1
[0].limit
=limit1
;
659 /* copy the folding result to fold1[] */
660 if(length
<=UCASE_MAX_STRING_LENGTH
) {
661 u_memcpy(fold1
, p
, length
);
664 U16_APPEND_UNSAFE(fold1
, i
, length
);
668 /* set next level pointers to case folding */
672 /* get ready to read from decomposition, continue with loop */
678 (length
=ucase_toFullFolding(csp
, (UChar32
)cp2
, &p
, options
))>=0
680 /* cp2 case-folds to the code point "length" or to p[length] */
681 if(U_IS_SURROGATE(c2
)) {
682 if(U_IS_SURROGATE_LEAD(c2
)) {
683 /* advance beyond source surrogate pair if it case-folds */
685 } else /* isTrail(c2) */ {
687 * we got a supplementary code point when hitting its trail surrogate,
688 * therefore the lead surrogate must have been the same as in the other string;
689 * compare this decomposition with the lead surrogate in the other string
690 * remember that this simulates bulk text replacement:
691 * the decomposition would replace the entire code point
698 /* push current level pointers */
699 stack2
[0].start
=start2
;
701 stack2
[0].limit
=limit2
;
704 /* copy the folding result to fold2[] */
705 if(length
<=UCASE_MAX_STRING_LENGTH
) {
706 u_memcpy(fold2
, p
, length
);
709 U16_APPEND_UNSAFE(fold2
, i
, length
);
713 /* set next level pointers to case folding */
717 /* get ready to read from decomposition, continue with loop */
723 * no decomposition/case folding, max level for both sides:
724 * return difference result
726 * code point order comparison must not just return cp1-cp2
727 * because when single surrogates are present then the surrogate pairs
728 * that formed cp1 and cp2 may be from different string indexes
730 * example: { d800 d800 dc01 } vs. { d800 dc00 }, compare at second code units
731 * c1=d800 cp1=10001 c2=dc00 cp2=10000
732 * cp1-cp2>0 but c1-c2<0 and in fact in UTF-32 it is { d800 10001 } < { 10000 }
734 * therefore, use same fix-up as in ustring.c/uprv_strCompare()
735 * except: uprv_strCompare() fetches c=*s while this functions fetches c=*s++
736 * so we have slightly different pointer/start/limit comparisons here
739 if(c1
>=0xd800 && c2
>=0xd800 && (options
&U_COMPARE_CODE_POINT_ORDER
)) {
740 /* subtract 0x2800 from BMP code points to make them smaller than supplementary ones */
742 (c1
<=0xdbff && s1
!=limit1
&& U16_IS_TRAIL(*s1
)) ||
743 (U16_IS_TRAIL(c1
) && start1
!=(s1
-1) && U16_IS_LEAD(*(s1
-2)))
745 /* part of a surrogate pair, leave >=d800 */
747 /* BMP code point - may be surrogate code point - make <d800 */
752 (c2
<=0xdbff && s2
!=limit2
&& U16_IS_TRAIL(*s2
)) ||
753 (U16_IS_TRAIL(c2
) && start2
!=(s2
-1) && U16_IS_LEAD(*(s2
-2)))
755 /* part of a surrogate pair, leave >=d800 */
757 /* BMP code point - may be surrogate code point - make <d800 */
766 /* public API functions */
768 U_CAPI
int32_t U_EXPORT2
769 u_strCaseCompare(const UChar
*s1
, int32_t length1
,
770 const UChar
*s2
, int32_t length2
,
772 UErrorCode
*pErrorCode
) {
773 /* argument checking */
774 if(pErrorCode
==0 || U_FAILURE(*pErrorCode
)) {
777 if(s1
==NULL
|| length1
<-1 || s2
==NULL
|| length2
<-1) {
778 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
781 return u_strcmpFold(s1
, length1
, s2
, length2
,
782 options
|U_COMPARE_IGNORE_CASE
,
786 U_CAPI
int32_t U_EXPORT2
787 u_strcasecmp(const UChar
*s1
, const UChar
*s2
, uint32_t options
) {
788 UErrorCode errorCode
=U_ZERO_ERROR
;
789 return u_strcmpFold(s1
, -1, s2
, -1,
790 options
|U_COMPARE_IGNORE_CASE
,
794 U_CAPI
int32_t U_EXPORT2
795 u_memcasecmp(const UChar
*s1
, const UChar
*s2
, int32_t length
, uint32_t options
) {
796 UErrorCode errorCode
=U_ZERO_ERROR
;
797 return u_strcmpFold(s1
, length
, s2
, length
,
798 options
|U_COMPARE_IGNORE_CASE
,
802 U_CAPI
int32_t U_EXPORT2
803 u_strncasecmp(const UChar
*s1
, const UChar
*s2
, int32_t n
, uint32_t options
) {
804 UErrorCode errorCode
=U_ZERO_ERROR
;
805 return u_strcmpFold(s1
, n
, s2
, n
,
806 options
|(U_COMPARE_IGNORE_CASE
|_STRNCMP_STYLE
),