2 *******************************************************************************
4 * Copyright (C) 2001-2006, International Business Machines
5 * Corporation and others. All Rights Reserved.
7 *******************************************************************************
8 * file name: ustrcase.c
10 * tab size: 8 (not used)
13 * created on: 2002feb20
14 * created by: Markus W. Scherer
16 * Implementation file for string casing C API functions.
17 * Uses functions from uchar.c for basic functionality that requires access
18 * to the Unicode Character Database (uprops.dat).
21 #include "unicode/utypes.h"
22 #include "unicode/uloc.h"
23 #include "unicode/ustring.h"
24 #include "unicode/ubrk.h"
30 /* string casing ------------------------------------------------------------ */
32 /* append a full case mapping result, see UCASE_MAX_STRING_LENGTH */
33 static U_INLINE
int32_t
34 appendResult(UChar
*dest
, int32_t destIndex
, int32_t destCapacity
,
35 int32_t result
, const UChar
*s
) {
39 /* decode the result */
41 /* (not) original code point */
44 } else if(result
<=UCASE_MAX_STRING_LENGTH
) {
52 if(destIndex
<destCapacity
) {
53 /* append the result */
57 U16_APPEND(dest
, destIndex
, destCapacity
, c
, isError
);
59 /* overflow, nothing written */
60 destIndex
+=U16_LENGTH(c
);
64 if((destIndex
+length
)<=destCapacity
) {
66 dest
[destIndex
++]=*s
++;
77 destIndex
+=U16_LENGTH(c
);
85 static UChar32 U_CALLCONV
86 utf16_caseContextIterator(void *context
, int8_t dir
) {
87 UCaseContext
*csc
=(UCaseContext
*)context
;
91 /* reset for backward iteration */
92 csc
->index
=csc
->cpStart
;
95 /* reset for forward iteration */
96 csc
->index
=csc
->cpLimit
;
99 /* continue current iteration direction */
104 if(csc
->start
<csc
->index
) {
105 U16_PREV((const UChar
*)csc
->p
, csc
->start
, csc
->index
, c
);
109 if(csc
->index
<csc
->limit
) {
110 U16_NEXT((const UChar
*)csc
->p
, csc
->index
, csc
->limit
, c
);
117 typedef int32_t U_CALLCONV
118 UCaseMapFull(const UCaseProps
*csp
, UChar32 c
,
119 UCaseContextIterator
*iter
, void *context
,
120 const UChar
**pString
,
121 const char *locale
, int32_t *locCache
);
124 * Case-maps [srcStart..srcLimit[ but takes
125 * context [0..srcLength[ into account.
128 _caseMap(const UCaseProps
*csp
, UCaseMapFull
*map
,
129 UChar
*dest
, int32_t destCapacity
,
130 const UChar
*src
, UCaseContext
*csc
,
131 int32_t srcStart
, int32_t srcLimit
,
132 const char *locale
, int32_t *locCache
,
133 UErrorCode
*pErrorCode
) {
136 int32_t srcIndex
, destIndex
;
138 /* case mapping loop */
141 while(srcIndex
<srcLimit
) {
142 csc
->cpStart
=srcIndex
;
143 U16_NEXT(src
, srcIndex
, srcLimit
, c
);
144 csc
->cpLimit
=srcIndex
;
145 c
=map(csp
, c
, utf16_caseContextIterator
, csc
, &s
, locale
, locCache
);
146 if((destIndex
<destCapacity
) && (c
<0 ? (c2
=~c
)<=0xffff : UCASE_MAX_STRING_LENGTH
<c
&& (c2
=c
)<=0xffff)) {
147 /* fast path version of appendResult() for BMP results */
148 dest
[destIndex
++]=(UChar
)c2
;
150 destIndex
=appendResult(dest
, destIndex
, destCapacity
, c
, s
);
154 if(destIndex
>destCapacity
) {
155 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
160 #if !UCONFIG_NO_BREAK_ITERATION
163 * Internal titlecasing function.
165 * Must get titleIter!=NULL.
168 _toTitle(const UCaseProps
*csp
,
169 UChar
*dest
, int32_t destCapacity
,
170 const UChar
*src
, UCaseContext
*csc
,
172 UBreakIterator
*titleIter
,
173 const char *locale
, int32_t *locCache
,
174 UErrorCode
*pErrorCode
) {
177 int32_t prev
, titleStart
, titleLimit
, index
, destIndex
, length
;
180 /* set up local variables */
185 /* titlecasing loop */
186 while(prev
<srcLength
) {
187 /* find next index where to titlecase */
190 index
=ubrk_first(titleIter
);
192 index
=ubrk_next(titleIter
);
194 if(index
==UBRK_DONE
|| index
>srcLength
) {
199 * Unicode 4 & 5 section 3.13 Default Case Operations:
201 * R3 toTitlecase(X): Find the word boundaries based on Unicode Standard Annex
202 * #29, "Text Boundaries." Between each pair of word boundaries, find the first
203 * cased character F. If F exists, map F to default_title(F); then map each
204 * subsequent character C to default_lower(C).
206 * In this implementation, segment [prev..index[ into 3 parts:
207 * a) uncased characters (copy as-is) [prev..titleStart[
208 * b) first cased letter (titlecase) [titleStart..titleLimit[
209 * c) subsequent characters (lowercase) [titleLimit..index[
212 /* find and copy uncased characters [prev..titleStart[ */
213 titleStart
=titleLimit
=prev
;
215 U16_NEXT(src
, titleLimit
, srcLength
, c
);
216 if(UCASE_NONE
!=ucase_getType(csp
, c
)) {
217 break; /* cased letter at [titleStart..titleLimit[ */
219 titleStart
=titleLimit
;
220 if(titleLimit
==index
) {
222 * only uncased characters in [prev..index[
223 * stop with titleStart==titleLimit==index
228 length
=titleStart
-prev
;
230 if((destIndex
+length
)<=destCapacity
) {
231 uprv_memcpy(dest
+destIndex
, src
+prev
, length
*U_SIZEOF_UCHAR
);
236 if(titleStart
<titleLimit
) {
237 /* titlecase c which is from [titleStart..titleLimit[ */
238 csc
->cpStart
=titleStart
;
239 csc
->cpLimit
=titleLimit
;
240 c
=ucase_toFullTitle(csp
, c
, utf16_caseContextIterator
, csc
, &s
, locale
, locCache
);
241 destIndex
=appendResult(dest
, destIndex
, destCapacity
, c
, s
);
243 /* lowercase [titleLimit..index[ */
244 if(titleLimit
<index
) {
247 csp
, ucase_toFullLower
,
248 dest
+destIndex
, destCapacity
-destIndex
,
260 if(destIndex
>destCapacity
) {
261 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
267 ustr_toTitle(const UCaseProps
*csp
,
268 UChar
*dest
, int32_t destCapacity
,
269 const UChar
*src
, int32_t srcLength
,
270 UBreakIterator
*titleIter
,
272 UErrorCode
*pErrorCode
) {
273 UCaseContext csc
={ NULL
};
282 src
, &csc
, srcLength
,
283 titleIter
, locale
, &locCache
, pErrorCode
);
288 /* functions available in the common library (for unistr_case.cpp) */
291 ustr_toLower(const UCaseProps
*csp
,
292 UChar
*dest
, int32_t destCapacity
,
293 const UChar
*src
, int32_t srcLength
,
295 UErrorCode
*pErrorCode
) {
296 UCaseContext csc
={ NULL
};
303 return _caseMap(csp
, ucase_toFullLower
,
305 src
, &csc
, 0, srcLength
,
306 locale
, &locCache
, pErrorCode
);
310 ustr_toUpper(const UCaseProps
*csp
,
311 UChar
*dest
, int32_t destCapacity
,
312 const UChar
*src
, int32_t srcLength
,
314 UErrorCode
*pErrorCode
) {
315 UCaseContext csc
={ NULL
};
322 return _caseMap(csp
, ucase_toFullUpper
,
324 src
, &csc
, 0, srcLength
,
325 locale
, &locCache
, pErrorCode
);
329 ustr_foldCase(const UCaseProps
*csp
,
330 UChar
*dest
, int32_t destCapacity
,
331 const UChar
*src
, int32_t srcLength
,
333 UErrorCode
*pErrorCode
) {
334 int32_t srcIndex
, destIndex
;
339 /* case mapping loop */
340 srcIndex
=destIndex
=0;
341 while(srcIndex
<srcLength
) {
342 U16_NEXT(src
, srcIndex
, srcLength
, c
);
343 c
=ucase_toFullFolding(csp
, c
, &s
, options
);
344 if((destIndex
<destCapacity
) && (c
<0 ? (c2
=~c
)<=0xffff : UCASE_MAX_STRING_LENGTH
<c
&& (c2
=c
)<=0xffff)) {
345 /* fast path version of appendResult() for BMP results */
346 dest
[destIndex
++]=(UChar
)c2
;
348 destIndex
=appendResult(dest
, destIndex
, destCapacity
, c
, s
);
352 if(destIndex
>destCapacity
) {
353 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
359 * Implement argument checking and buffer handling
360 * for string case mapping as a common function.
369 /* common internal function for public API functions */
372 caseMap(UChar
*dest
, int32_t destCapacity
,
373 const UChar
*src
, int32_t srcLength
,
374 UBreakIterator
*titleIter
,
378 UErrorCode
*pErrorCode
) {
382 const UCaseProps
*csp
;
387 /* check argument values */
388 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)) {
391 if( destCapacity
<0 ||
392 (dest
==NULL
&& destCapacity
>0) ||
396 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
400 csp
=ucase_getSingleton(pErrorCode
);
401 if(U_FAILURE(*pErrorCode
)) {
405 /* get the string length */
407 srcLength
=u_strlen(src
);
410 /* check for overlapping source and destination */
412 ((src
>=dest
&& src
<(dest
+destCapacity
)) ||
413 (dest
>=src
&& dest
<(src
+srcLength
)))
415 /* overlap: provide a temporary destination buffer and later copy the result */
416 if(destCapacity
<=(sizeof(buffer
)/U_SIZEOF_UCHAR
)) {
417 /* the stack buffer is large enough */
420 /* allocate a buffer */
421 temp
=(UChar
*)uprv_malloc(destCapacity
*U_SIZEOF_UCHAR
);
423 *pErrorCode
=U_MEMORY_ALLOCATION_ERROR
;
434 if(toWhichCase
==FOLD_CASE
) {
435 destLength
=ustr_foldCase(csp
, temp
, destCapacity
, src
, srcLength
,
436 options
, pErrorCode
);
438 UCaseContext csc
={ NULL
};
445 /* the internal functions require locale!=NULL */
447 locale
=uloc_getDefault();
450 if(toWhichCase
==TO_LOWER
) {
451 destLength
=_caseMap(csp
, ucase_toFullLower
,
455 locale
, &locCache
, pErrorCode
);
456 } else if(toWhichCase
==TO_UPPER
) {
457 destLength
=_caseMap(csp
, ucase_toFullUpper
,
461 locale
, &locCache
, pErrorCode
);
462 } else /* if(toWhichCase==TO_TITLE) */ {
463 #if UCONFIG_NO_BREAK_ITERATION
464 *pErrorCode
=U_UNSUPPORTED_ERROR
;
466 if(titleIter
==NULL
) {
467 titleIter
=ubrk_open(UBRK_WORD
, locale
,
470 ownTitleIter
=(UBool
)U_SUCCESS(*pErrorCode
);
472 if(U_SUCCESS(*pErrorCode
)) {
473 destLength
=_toTitle(csp
, temp
, destCapacity
,
474 src
, &csc
, srcLength
,
475 titleIter
, locale
, &locCache
, pErrorCode
);
481 /* copy the result string to the destination buffer */
483 int32_t copyLength
= destLength
<=destCapacity
? destLength
: destCapacity
;
485 uprv_memmove(dest
, temp
, copyLength
*U_SIZEOF_UCHAR
);
493 #if !UCONFIG_NO_BREAK_ITERATION
495 ubrk_close(titleIter
);
499 return u_terminateUChars(dest
, destCapacity
, destLength
, pErrorCode
);
502 /* public API functions */
504 U_CAPI
int32_t U_EXPORT2
505 u_strToLower(UChar
*dest
, int32_t destCapacity
,
506 const UChar
*src
, int32_t srcLength
,
508 UErrorCode
*pErrorCode
) {
509 return caseMap(dest
, destCapacity
,
512 TO_LOWER
, pErrorCode
);
515 U_CAPI
int32_t U_EXPORT2
516 u_strToUpper(UChar
*dest
, int32_t destCapacity
,
517 const UChar
*src
, int32_t srcLength
,
519 UErrorCode
*pErrorCode
) {
520 return caseMap(dest
, destCapacity
,
523 TO_UPPER
, pErrorCode
);
526 #if !UCONFIG_NO_BREAK_ITERATION
528 U_CAPI
int32_t U_EXPORT2
529 u_strToTitle(UChar
*dest
, int32_t destCapacity
,
530 const UChar
*src
, int32_t srcLength
,
531 UBreakIterator
*titleIter
,
533 UErrorCode
*pErrorCode
) {
534 return caseMap(dest
, destCapacity
,
536 titleIter
, locale
, 0,
537 TO_TITLE
, pErrorCode
);
542 U_CAPI
int32_t U_EXPORT2
543 u_strFoldCase(UChar
*dest
, int32_t destCapacity
,
544 const UChar
*src
, int32_t srcLength
,
546 UErrorCode
*pErrorCode
) {
547 return caseMap(dest
, destCapacity
,
550 FOLD_CASE
, pErrorCode
);
553 /* case-insensitive string comparisons -------------------------------------- */
556 * This function is a copy of unorm_cmpEquivFold() minus the parts for
557 * canonical equivalence.
558 * Keep the functions in sync, and see there for how this works.
559 * The duplication is for modularization:
560 * It makes caseless (but not canonical caseless) matches independent of
561 * the normalization code.
564 /* stack element for previous-level source/decomposition pointers */
565 struct CmpEquivLevel
{
566 const UChar
*start
, *s
, *limit
;
568 typedef struct CmpEquivLevel CmpEquivLevel
;
570 /* internal function */
572 u_strcmpFold(const UChar
*s1
, int32_t length1
,
573 const UChar
*s2
, int32_t length2
,
575 UErrorCode
*pErrorCode
) {
576 const UCaseProps
*csp
;
578 /* current-level start/limit - s1/s2 as current */
579 const UChar
*start1
, *start2
, *limit1
, *limit2
;
581 /* case folding variables */
585 /* stacks of previous-level start/current/limit */
586 CmpEquivLevel stack1
[2], stack2
[2];
588 /* case folding buffers, only use current-level start/limit */
589 UChar fold1
[UCASE_MAX_STRING_LENGTH
+1], fold2
[UCASE_MAX_STRING_LENGTH
+1];
591 /* track which is the current level per string */
592 int32_t level1
, level2
;
594 /* current code units, and code points for lookups */
595 UChar32 c1
, c2
, cp1
, cp2
;
597 /* no argument error checking because this itself is not an API */
600 * assume that at least the option U_COMPARE_IGNORE_CASE is set
601 * otherwise this function would have to behave exactly as uprv_strCompare()
603 csp
=ucase_getSingleton(pErrorCode
);
604 if(U_FAILURE(*pErrorCode
)) {
626 /* comparison loop */
629 * here a code unit value of -1 means "get another code unit"
630 * below it will mean "this source is finished"
634 /* get next code unit from string 1, post-increment */
636 if(s1
==limit1
|| ((c1
=*s1
)==0 && (limit1
==NULL
|| (options
&_STRNCMP_STYLE
)))) {
646 /* reached end of level buffer, pop one level */
649 start1
=stack1
[level1
].start
;
650 } while(start1
==NULL
);
652 limit1
=stack1
[level1
].limit
;
657 /* get next code unit from string 2, post-increment */
659 if(s2
==limit2
|| ((c2
=*s2
)==0 && (limit2
==NULL
|| (options
&_STRNCMP_STYLE
)))) {
669 /* reached end of level buffer, pop one level */
672 start2
=stack2
[level2
].start
;
673 } while(start2
==NULL
);
675 limit2
=stack2
[level2
].limit
;
681 * either variable c1, c2 is -1 only if the corresponding string is finished
685 return 0; /* c1==c2==-1 indicating end of strings */
687 c1
=c2
=-1; /* make us fetch new code units */
690 return -1; /* string 1 ends before string 2 */
692 return 1; /* string 2 ends before string 1 */
694 /* c1!=c2 && c1>=0 && c2>=0 */
696 /* get complete code points for c1, c2 for lookups if either is a surrogate */
698 if(U_IS_SURROGATE(c1
)) {
701 if(U_IS_SURROGATE_LEAD(c1
)) {
702 if(s1
!=limit1
&& U16_IS_TRAIL(c
=*s1
)) {
703 /* advance ++s1; only below if cp1 decomposes/case-folds */
704 cp1
=U16_GET_SUPPLEMENTARY(c1
, c
);
706 } else /* isTrail(c1) */ {
707 if(start1
<=(s1
-2) && U16_IS_LEAD(c
=*(s1
-2))) {
708 cp1
=U16_GET_SUPPLEMENTARY(c
, c1
);
714 if(U_IS_SURROGATE(c2
)) {
717 if(U_IS_SURROGATE_LEAD(c2
)) {
718 if(s2
!=limit2
&& U16_IS_TRAIL(c
=*s2
)) {
719 /* advance ++s2; only below if cp2 decomposes/case-folds */
720 cp2
=U16_GET_SUPPLEMENTARY(c2
, c
);
722 } else /* isTrail(c2) */ {
723 if(start2
<=(s2
-2) && U16_IS_LEAD(c
=*(s2
-2))) {
724 cp2
=U16_GET_SUPPLEMENTARY(c
, c2
);
730 * go down one level for each string
731 * continue with the main loop as soon as there is a real change
735 (length
=ucase_toFullFolding(csp
, (UChar32
)cp1
, &p
, options
))>=0
737 /* cp1 case-folds to the code point "length" or to p[length] */
738 if(U_IS_SURROGATE(c1
)) {
739 if(U_IS_SURROGATE_LEAD(c1
)) {
740 /* advance beyond source surrogate pair if it case-folds */
742 } else /* isTrail(c1) */ {
744 * we got a supplementary code point when hitting its trail surrogate,
745 * therefore the lead surrogate must have been the same as in the other string;
746 * compare this decomposition with the lead surrogate in the other string
747 * remember that this simulates bulk text replacement:
748 * the decomposition would replace the entire code point
755 /* push current level pointers */
756 stack1
[0].start
=start1
;
758 stack1
[0].limit
=limit1
;
761 /* copy the folding result to fold1[] */
762 if(length
<=UCASE_MAX_STRING_LENGTH
) {
763 u_memcpy(fold1
, p
, length
);
766 U16_APPEND_UNSAFE(fold1
, i
, length
);
770 /* set next level pointers to case folding */
774 /* get ready to read from decomposition, continue with loop */
780 (length
=ucase_toFullFolding(csp
, (UChar32
)cp2
, &p
, options
))>=0
782 /* cp2 case-folds to the code point "length" or to p[length] */
783 if(U_IS_SURROGATE(c2
)) {
784 if(U_IS_SURROGATE_LEAD(c2
)) {
785 /* advance beyond source surrogate pair if it case-folds */
787 } else /* isTrail(c2) */ {
789 * we got a supplementary code point when hitting its trail surrogate,
790 * therefore the lead surrogate must have been the same as in the other string;
791 * compare this decomposition with the lead surrogate in the other string
792 * remember that this simulates bulk text replacement:
793 * the decomposition would replace the entire code point
800 /* push current level pointers */
801 stack2
[0].start
=start2
;
803 stack2
[0].limit
=limit2
;
806 /* copy the folding result to fold2[] */
807 if(length
<=UCASE_MAX_STRING_LENGTH
) {
808 u_memcpy(fold2
, p
, length
);
811 U16_APPEND_UNSAFE(fold2
, i
, length
);
815 /* set next level pointers to case folding */
819 /* get ready to read from decomposition, continue with loop */
825 * no decomposition/case folding, max level for both sides:
826 * return difference result
828 * code point order comparison must not just return cp1-cp2
829 * because when single surrogates are present then the surrogate pairs
830 * that formed cp1 and cp2 may be from different string indexes
832 * example: { d800 d800 dc01 } vs. { d800 dc00 }, compare at second code units
833 * c1=d800 cp1=10001 c2=dc00 cp2=10000
834 * cp1-cp2>0 but c1-c2<0 and in fact in UTF-32 it is { d800 10001 } < { 10000 }
836 * therefore, use same fix-up as in ustring.c/uprv_strCompare()
837 * except: uprv_strCompare() fetches c=*s while this function fetches c=*s++
838 * so we have slightly different pointer/start/limit comparisons here
841 if(c1
>=0xd800 && c2
>=0xd800 && (options
&U_COMPARE_CODE_POINT_ORDER
)) {
842 /* subtract 0x2800 from BMP code points to make them smaller than supplementary ones */
844 (c1
<=0xdbff && s1
!=limit1
&& U16_IS_TRAIL(*s1
)) ||
845 (U16_IS_TRAIL(c1
) && start1
!=(s1
-1) && U16_IS_LEAD(*(s1
-2)))
847 /* part of a surrogate pair, leave >=d800 */
849 /* BMP code point - may be surrogate code point - make <d800 */
854 (c2
<=0xdbff && s2
!=limit2
&& U16_IS_TRAIL(*s2
)) ||
855 (U16_IS_TRAIL(c2
) && start2
!=(s2
-1) && U16_IS_LEAD(*(s2
-2)))
857 /* part of a surrogate pair, leave >=d800 */
859 /* BMP code point - may be surrogate code point - make <d800 */
868 /* public API functions */
870 U_CAPI
int32_t U_EXPORT2
871 u_strCaseCompare(const UChar
*s1
, int32_t length1
,
872 const UChar
*s2
, int32_t length2
,
874 UErrorCode
*pErrorCode
) {
875 /* argument checking */
876 if(pErrorCode
==0 || U_FAILURE(*pErrorCode
)) {
879 if(s1
==NULL
|| length1
<-1 || s2
==NULL
|| length2
<-1) {
880 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
883 return u_strcmpFold(s1
, length1
, s2
, length2
,
884 options
|U_COMPARE_IGNORE_CASE
,
888 U_CAPI
int32_t U_EXPORT2
889 u_strcasecmp(const UChar
*s1
, const UChar
*s2
, uint32_t options
) {
890 UErrorCode errorCode
=U_ZERO_ERROR
;
891 return u_strcmpFold(s1
, -1, s2
, -1,
892 options
|U_COMPARE_IGNORE_CASE
,
896 U_CAPI
int32_t U_EXPORT2
897 u_memcasecmp(const UChar
*s1
, const UChar
*s2
, int32_t length
, uint32_t options
) {
898 UErrorCode errorCode
=U_ZERO_ERROR
;
899 return u_strcmpFold(s1
, length
, s2
, length
,
900 options
|U_COMPARE_IGNORE_CASE
,
904 U_CAPI
int32_t U_EXPORT2
905 u_strncasecmp(const UChar
*s1
, const UChar
*s2
, int32_t n
, uint32_t options
) {
906 UErrorCode errorCode
=U_ZERO_ERROR
;
907 return u_strcmpFold(s1
, n
, s2
, n
,
908 options
|(U_COMPARE_IGNORE_CASE
|_STRNCMP_STYLE
),