2 *******************************************************************************
4 * Copyright (C) 2001-2004, International Business Machines
5 * Corporation and others. All Rights Reserved.
7 *******************************************************************************
8 * file name: ustrcase.c
10 * tab size: 8 (not used)
13 * created on: 2002feb20
14 * created by: Markus W. Scherer
16 * Implementation file for string casing C API functions.
17 * Uses functions from uchar.c for basic functionality that requires access
18 * to the Unicode Character Database (uprops.dat).
21 #include "unicode/utypes.h"
22 #include "unicode/uloc.h"
23 #include "unicode/ustring.h"
24 #include "unicode/ubrk.h"
30 /* string casing ------------------------------------------------------------ */
32 /* append a full case mapping result, see UCASE_MAX_STRING_LENGTH */
/*
 * appendResult(): append one full case mapping result to dest at destIndex.
 *
 * dest/destCapacity  destination buffer and its capacity in UChars
 * destIndex          current write position; callers assign the return value
 *                    back to their own destIndex
 * result, s          the value returned by a full case mapping call and the
 *                    string pointer that call filled in (callers in this file
 *                    pass both straight through)
 *
 * The visible branches write only while the output fits into destCapacity.
 * When it does not fit, destIndex is still advanced by the needed length, so
 * the index handed back can exceed destCapacity; callers in this file test
 * destIndex>destCapacity to report U_BUFFER_OVERFLOW_ERROR.
 * NOTE(review): several original lines (declarations, some branch heads and
 * the return statement) are absent from this listing - confirm against the
 * complete file before relying on details not shown here.
 */
33 static U_INLINE
int32_t
34 appendResult(UChar
*dest
, int32_t destIndex
, int32_t destCapacity
,
35 int32_t result
, const UChar
*s
) {
39 /* decode the result */
41 /* (not) original code point */
44 } else if(result
<=UCASE_MAX_STRING_LENGTH
) {
52 if(destIndex
<destCapacity
) {
53 /* append the result */
57 U16_APPEND(dest
, destIndex
, destCapacity
, c
, isError
);
59 /* overflow, nothing written */
60 destIndex
+=U16_LENGTH(c
);
/* code units are copied from s only when destIndex+length still fits */
64 if((destIndex
+length
)<=destCapacity
) {
66 dest
[destIndex
++]=*s
++;
/* nothing is written here: destIndex only grows by the UTF-16 length of c */
77 destIndex
+=U16_LENGTH(c
);
/*
 * utf16_caseContextIterator(): context callback handed to the full case
 * mapping functions so they can look at code points around the current one.
 *
 * context  points to a UCaseContext (cast below)
 * dir      selects the action; per the comments below this is either
 *          "reset for backward iteration", "reset for forward iteration" or
 *          "continue current iteration direction"
 *          (presumably negative / positive / zero - the tests on dir are not
 *          visible in this listing, TODO confirm)
 *
 * A backward reset starts at csc->cpStart, a forward reset at csc->cpLimit,
 * i.e. iteration moves away from the code point currently being mapped.
 * Backward steps read with U16_PREV while csc->start<csc->index; forward
 * steps read with U16_NEXT while csc->index<csc->limit.
 */
85 static UChar32 U_CALLCONV
86 utf16_caseContextIterator(void *context
, int8_t dir
) {
87 UCaseContext
*csc
=(UCaseContext
*)context
;
91 /* reset for backward iteration */
92 csc
->index
=csc
->cpStart
;
95 /* reset for forward iteration */
96 csc
->index
=csc
->cpLimit
;
99 /* continue current iteration direction */
/* backward: there is still text between the context start and the index */
104 if(csc
->start
<csc
->index
) {
105 U16_PREV((const UChar
*)csc
->p
, csc
->start
, csc
->index
, c
);
/* forward: there is still text between the index and the context limit */
109 if(csc
->index
<csc
->limit
) {
110 U16_NEXT((const UChar
*)csc
->p
, csc
->index
, csc
->limit
, c
);
/*
 * UCaseMapFull: function type of the full case mapping functions that
 * _caseMap() receives as its map argument (ucase_toFullLower and
 * ucase_toFullUpper are passed in this file).
 * The int32_t return value and the string stored through pString are handed
 * unchanged to appendResult() as its result and s arguments.
 */
117 typedef int32_t U_CALLCONV
118 UCaseMapFull(const UCaseProps
*csp
, UChar32 c
,
119 UCaseContextIterator
*iter
, void *context
,
120 const UChar
**pString
,
121 const char *locale
, int32_t *locCache
);
124 * Case-maps [srcStart..srcLimit[ with the given mapping function but takes
125 * context [0..srcLength[ (as described by csc) into account.
/*
 * _caseMap(): run the mapping function map over src[srcStart..srcLimit[ one
 * code point at a time and append every result to dest.
 *
 * csp                 case properties passed through to map
 * map                 full case mapping function (see UCaseMapFull)
 * dest, destCapacity  output buffer and its capacity in UChars
 * src                 source text
 * csc                 context object; cpStart/cpLimit are updated for each
 *                     code point before map is called
 * srcStart, srcLimit  range of src that is mapped
 * locale, locCache    passed through to map
 * pErrorCode          set to U_BUFFER_OVERFLOW_ERROR when the accumulated
 *                     destIndex is greater than destCapacity
 *
 * Callers in this file use the return value as the destination length; the
 * return statement itself is not visible in this listing.
 */
128 _caseMap(UCaseProps
*csp
, UCaseMapFull
*map
,
129 UChar
*dest
, int32_t destCapacity
,
130 const UChar
*src
, UCaseContext
*csc
,
131 int32_t srcStart
, int32_t srcLimit
,
132 const char *locale
, int32_t *locCache
,
133 UErrorCode
*pErrorCode
) {
136 int32_t srcIndex
, destIndex
;
138 /* case mapping loop */
141 while(srcIndex
<srcLimit
) {
/* record where the current code point starts and ends for the context iterator */
142 csc
->cpStart
=srcIndex
;
143 U16_NEXT(src
, srcIndex
, srcLimit
, c
);
144 csc
->cpLimit
=srcIndex
;
/* c is reused: first the code point read, then the mapping result */
145 c
=map(csp
, c
, utf16_caseContextIterator
, csc
, &s
, locale
, locCache
);
146 destIndex
=appendResult(dest
, destIndex
, destCapacity
, c
, s
);
/* destIndex keeps counting past the capacity, so overflow is detected afterwards */
149 if(destIndex
>destCapacity
) {
150 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
155 #if !UCONFIG_NO_BREAK_ITERATION
158 * Internal titlecasing function.
160 * Must get titleIter!=NULL.
/*
 * _toTitle(): titlecase src into dest, using titleIter to find the positions
 * that get titlecased.
 *
 * Visible flow of the loop (runs while prev<srcLength):
 *   1. get the next break index from ubrk_first()/ubrk_next();
 *   2. an index of UBRK_DONE or beyond srcLength gets special handling
 *      (the handling itself is not visible in this listing);
 *   3. lowercase [prev..index[ with ucase_toFullLower, writing at
 *      dest+destIndex with the remaining capacity;
 *   4. test index>=srcLength (the branch body is not visible here);
 *   5. otherwise read the code point at index, map it with
 *      ucase_toFullTitle() and append the result.
 * *pErrorCode is set to U_BUFFER_OVERFLOW_ERROR when destIndex ends up
 * greater than destCapacity.
 * NOTE(review): srcLength is used below but its parameter line is missing
 * from this listing; caseMap() passes it between csc and titleIter.
 */
163 _toTitle(UCaseProps
*csp
,
164 UChar
*dest
, int32_t destCapacity
,
165 const UChar
*src
, UCaseContext
*csc
,
167 UBreakIterator
*titleIter
,
168 const char *locale
, int32_t *locCache
,
169 UErrorCode
*pErrorCode
) {
172 int32_t prev
, index
, destIndex
;
175 /* set up local variables */
180 /* titlecasing loop */
181 while(prev
<srcLength
) {
182 /* find next index where to titlecase */
185 index
=ubrk_first(titleIter
);
187 index
=ubrk_next(titleIter
);
189 if(index
==UBRK_DONE
|| index
>srcLength
) {
193 /* lowercase [prev..index[ */
197 csp
, ucase_toFullLower
,
/* write behind what has been produced so far, with the capacity that is left */
198 dest
+destIndex
, destCapacity
-destIndex
,
205 if(index
>=srcLength
) {
209 /* titlecase the character at the found index */
211 U16_NEXT(src
, index
, srcLength
, c
);
213 c
=ucase_toFullTitle(csp
, c
, utf16_caseContextIterator
, csc
, &s
, locale
, locCache
);
214 destIndex
=appendResult(dest
, destIndex
, destCapacity
, c
, s
);
219 if(destIndex
>destCapacity
) {
220 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
/*
 * ustr_toTitle(): titlecasing entry point that takes an already obtained
 * UCaseProps and a break iterator.
 * It creates a local UCaseContext and forwards
 * (src, &csc, srcLength, titleIter, locale, &locCache, pErrorCode);
 * the argument list matches _toTitle(), whose call line is not visible in
 * this listing.
 */
226 ustr_toTitle(UCaseProps
*csp
,
227 UChar
*dest
, int32_t destCapacity
,
228 const UChar
*src
, int32_t srcLength
,
229 UBreakIterator
*titleIter
,
231 UErrorCode
*pErrorCode
) {
232 UCaseContext csc
={ NULL
};
241 src
, &csc
, srcLength
,
242 titleIter
, locale
, &locCache
, pErrorCode
);
247 /* functions available in the common library (for unistr_case.cpp) */
/*
 * ustr_toLower(): lowercase src[0..srcLength[ into dest.
 * Creates a local UCaseContext and returns the result of _caseMap() called
 * with ucase_toFullLower as the mapping function and the range 0..srcLength.
 * Errors, including U_BUFFER_OVERFLOW_ERROR, are reported through pErrorCode
 * by _caseMap().
 */
250 ustr_toLower(UCaseProps
*csp
,
251 UChar
*dest
, int32_t destCapacity
,
252 const UChar
*src
, int32_t srcLength
,
254 UErrorCode
*pErrorCode
) {
255 UCaseContext csc
={ NULL
};
262 return _caseMap(csp
, ucase_toFullLower
,
264 src
, &csc
, 0, srcLength
,
265 locale
, &locCache
, pErrorCode
);
/*
 * ustr_toUpper(): uppercase src[0..srcLength[ into dest.
 * Creates a local UCaseContext and returns the result of _caseMap() called
 * with ucase_toFullUpper as the mapping function and the range 0..srcLength.
 * Errors, including U_BUFFER_OVERFLOW_ERROR, are reported through pErrorCode
 * by _caseMap().
 */
269 ustr_toUpper(UCaseProps
*csp
,
270 UChar
*dest
, int32_t destCapacity
,
271 const UChar
*src
, int32_t srcLength
,
273 UErrorCode
*pErrorCode
) {
274 UCaseContext csc
={ NULL
};
281 return _caseMap(csp
, ucase_toFullUpper
,
283 src
, &csc
, 0, srcLength
,
284 locale
, &locCache
, pErrorCode
);
/*
 * ustr_foldCase(): case-fold src[0..srcLength[ into dest.
 * Each code point is read with U16_NEXT, folded with
 * ucase_toFullFolding(csp, c, &s, options) and appended via appendResult().
 * Unlike _caseMap(), no context iterator is passed to the folding call.
 * *pErrorCode is set to U_BUFFER_OVERFLOW_ERROR when destIndex ends up
 * greater than destCapacity.
 * NOTE(review): options is used below but its parameter line is missing from
 * this listing; caseMap() passes it between srcLength and pErrorCode.
 */
288 ustr_foldCase(UCaseProps
*csp
,
289 UChar
*dest
, int32_t destCapacity
,
290 const UChar
*src
, int32_t srcLength
,
292 UErrorCode
*pErrorCode
) {
293 int32_t srcIndex
, destIndex
;
298 /* case mapping loop */
299 srcIndex
=destIndex
=0;
300 while(srcIndex
<srcLength
) {
301 U16_NEXT(src
, srcIndex
, srcLength
, c
);
/* c is reused: first the code point read, then the folding result */
302 c
=ucase_toFullFolding(csp
, c
, &s
, options
);
303 destIndex
=appendResult(dest
, destIndex
, destCapacity
, c
, s
);
306 if(destIndex
>destCapacity
) {
307 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
313 * Implement argument checking and buffer handling
314 * for string case mapping as a common function.
323 /* common internal function for public API functions */
/*
 * caseMap(): shared implementation behind u_strToLower(), u_strToUpper(),
 * u_strToTitle() and u_strFoldCase().
 *
 * Visible steps:
 *   1. argument checks: a NULL or already failing pErrorCode, and bad
 *      dest/destCapacity combinations (U_ILLEGAL_ARGUMENT_ERROR);
 *   2. fetch the case properties with ucase_getSingleton();
 *   3. compute srcLength with u_strlen() (the condition is not visible in
 *      this listing - presumably for NUL-terminated input, TODO confirm);
 *   4. if source and destination overlap, map into a temporary buffer: the
 *      stack buffer when destCapacity fits, otherwise uprv_malloc()
 *      (U_MEMORY_ALLOCATION_ERROR on failure);
 *   5. dispatch on toWhichCase: FOLD_CASE -> ustr_foldCase(); TO_LOWER and
 *      TO_UPPER -> _caseMap(); otherwise titlecasing via _toTitle(), opening
 *      a UBRK_WORD break iterator when titleIter==NULL, or
 *      U_UNSUPPORTED_ERROR when UCONFIG_NO_BREAK_ITERATION is set;
 *   6. copy the result from the temporary buffer to dest;
 *   7. close the break iterator (the ownership test is not visible here) and
 *      return u_terminateUChars(dest, destCapacity, destLength, pErrorCode).
 *
 * NOTE(review): locale, options and toWhichCase are used below but their
 * parameter lines are missing from this listing.
 */
326 caseMap(UChar
*dest
, int32_t destCapacity
,
327 const UChar
*src
, int32_t srcLength
,
328 UBreakIterator
*titleIter
,
332 UErrorCode
*pErrorCode
) {
341 /* check argument values */
342 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)) {
345 if( destCapacity
<0 ||
346 (dest
==NULL
&& destCapacity
>0) ||
350 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
354 csp
=ucase_getSingleton(pErrorCode
);
355 if(U_FAILURE(*pErrorCode
)) {
359 /* get the string length */
361 srcLength
=u_strlen(src
);
364 /* check for overlapping source and destination */
/* overlap means: src starts inside dest's buffer, or dest starts inside src */
366 ((src
>=dest
&& src
<(dest
+destCapacity
)) ||
367 (dest
>=src
&& dest
<(src
+srcLength
)))
369 /* overlap: provide a temporary destination buffer and later copy the result */
370 if(destCapacity
<=(sizeof(buffer
)/U_SIZEOF_UCHAR
)) {
371 /* the stack buffer is large enough */
374 /* allocate a buffer */
/*
 * NOTE(review): the byte count is the product of the int32_t destCapacity and
 * U_SIZEOF_UCHAR - confirm it cannot overflow for very large capacities.
 */
375 temp
=(UChar
*)uprv_malloc(destCapacity
*U_SIZEOF_UCHAR
);
377 *pErrorCode
=U_MEMORY_ALLOCATION_ERROR
;
388 if(toWhichCase
==FOLD_CASE
) {
389 destLength
=ustr_foldCase(csp
, temp
, destCapacity
, src
, srcLength
,
390 options
, pErrorCode
);
392 UCaseContext csc
={ NULL
};
399 /* the internal functions require locale!=NULL */
401 locale
=uloc_getDefault();
404 if(toWhichCase
==TO_LOWER
) {
405 destLength
=_caseMap(csp
, ucase_toFullLower
,
409 locale
, &locCache
, pErrorCode
);
410 } else if(toWhichCase
==TO_UPPER
) {
411 destLength
=_caseMap(csp
, ucase_toFullUpper
,
415 locale
, &locCache
, pErrorCode
);
416 } else /* if(toWhichCase==TO_TITLE) */ {
417 #if UCONFIG_NO_BREAK_ITERATION
418 *pErrorCode
=U_UNSUPPORTED_ERROR
;
420 if(titleIter
==NULL
) {
421 titleIter
=ubrk_open(UBRK_WORD
, locale
,
/* remember whether this function opened the iterator itself */
424 ownTitleIter
=(UBool
)U_SUCCESS(*pErrorCode
);
426 if(U_SUCCESS(*pErrorCode
)) {
427 destLength
=_toTitle(csp
, temp
, destCapacity
,
428 src
, &csc
, srcLength
,
429 titleIter
, locale
, &locCache
, pErrorCode
);
435 /* copy the result string to the destination buffer */
/* destLength may exceed the capacity after an overflow, so clamp the copy */
437 int32_t copyLength
= destLength
<=destCapacity
? destLength
: destCapacity
;
439 uprv_memmove(dest
, temp
, copyLength
*U_SIZEOF_UCHAR
);
447 #if !UCONFIG_NO_BREAK_ITERATION
449 ubrk_close(titleIter
);
453 return u_terminateUChars(dest
, destCapacity
, destLength
, pErrorCode
);
456 /* public API functions */
/*
 * u_strToLower(): public API, lowercases src into dest.
 * Thin wrapper: returns caseMap() called with TO_LOWER; all argument checking
 * and buffer handling happens in caseMap().
 */
458 U_CAPI
int32_t U_EXPORT2
459 u_strToLower(UChar
*dest
, int32_t destCapacity
,
460 const UChar
*src
, int32_t srcLength
,
462 UErrorCode
*pErrorCode
) {
463 return caseMap(dest
, destCapacity
,
466 TO_LOWER
, pErrorCode
);
/*
 * u_strToUpper(): public API, uppercases src into dest.
 * Thin wrapper: returns caseMap() called with TO_UPPER; all argument checking
 * and buffer handling happens in caseMap().
 */
469 U_CAPI
int32_t U_EXPORT2
470 u_strToUpper(UChar
*dest
, int32_t destCapacity
,
471 const UChar
*src
, int32_t srcLength
,
473 UErrorCode
*pErrorCode
) {
474 return caseMap(dest
, destCapacity
,
477 TO_UPPER
, pErrorCode
);
480 #if !UCONFIG_NO_BREAK_ITERATION
/*
 * u_strToTitle(): public API, titlecases src into dest.
 * Thin wrapper: returns caseMap() called with the caller's titleIter and
 * locale, 0 in the position following locale, and TO_TITLE. caseMap() opens
 * its own word break iterator when titleIter is NULL.
 * Guarded by the #if !UCONFIG_NO_BREAK_ITERATION line that precedes it.
 */
482 U_CAPI
int32_t U_EXPORT2
483 u_strToTitle(UChar
*dest
, int32_t destCapacity
,
484 const UChar
*src
, int32_t srcLength
,
485 UBreakIterator
*titleIter
,
487 UErrorCode
*pErrorCode
) {
488 return caseMap(dest
, destCapacity
,
490 titleIter
, locale
, 0,
491 TO_TITLE
, pErrorCode
);
/*
 * u_strFoldCase(): public API, case-folds src into dest.
 * Thin wrapper: returns caseMap() called with FOLD_CASE; all argument
 * checking and buffer handling happens in caseMap().
 */
496 U_CAPI
int32_t U_EXPORT2
497 u_strFoldCase(UChar
*dest
, int32_t destCapacity
,
498 const UChar
*src
, int32_t srcLength
,
500 UErrorCode
*pErrorCode
) {
501 return caseMap(dest
, destCapacity
,
504 FOLD_CASE
, pErrorCode
);
507 /* case-insensitive string comparisons -------------------------------------- */
510 * This function is a copy of unorm_cmpEquivFold() minus the parts for
511 * canonical equivalence.
512 * Keep the functions in sync, and see there for how this works.
513 * The duplication is for modularization:
514 * It makes caseless (but not canonical caseless) matches independent of
515 * the normalization code.
518 /* stack element for previous-level source/decomposition pointers */
/*
 * CmpEquivLevel: one saved set of pointers, used by u_strcmpFold() in its
 * stack1[]/stack2[] arrays while it reads from a case folding buffer.
 *   start  start of the saved level
 *   s      current read position in the saved level
 *   limit  limit of the saved level
 */
519 struct CmpEquivLevel
{
520 const UChar
*start
, *s
, *limit
;
522 typedef struct CmpEquivLevel CmpEquivLevel
;
524 /* internal function */
/*
 * u_strcmpFold(): internal case-insensitive comparison of s1 and s2.
 * It does no argument checking of its own, as the comment below states.
 *
 * Visible flow (some original lines are absent from this listing):
 *   - obtain the case properties with ucase_getSingleton(); stop on failure;
 *   - main loop: fetch the next code unit of each string into c1/c2; a value
 *     of -1 first means "get another code unit" and later "this source is
 *     finished"; when a level buffer is used up, pop back to the pointers
 *     saved in stack1/stack2;
 *   - return 0 when both strings end together, -1 when string 1 ends first,
 *     1 when string 2 ends first; c1 and c2 are reset to -1 to fetch new units;
 *   - for differing units, assemble full code points cp1/cp2 from surrogate
 *     pairs, case-fold them with ucase_toFullFolding(), push the current
 *     pointers, copy the folding into fold1/fold2 and continue from there;
 *   - when neither side folds, apply the surrogate fix-up if
 *     U_COMPARE_CODE_POINT_ORDER is set and return the difference result.
 *
 * NOTE(review): options is used below but its parameter line is missing from
 * this listing; the public wrappers pass it between length2 and the error
 * code.
 */
526 u_strcmpFold(const UChar
*s1
, int32_t length1
,
527 const UChar
*s2
, int32_t length2
,
529 UErrorCode
*pErrorCode
) {
532 /* current-level start/limit - s1/s2 as current */
533 const UChar
*start1
, *start2
, *limit1
, *limit2
;
535 /* case folding variables */
539 /* stacks of previous-level start/current/limit */
540 CmpEquivLevel stack1
[2], stack2
[2];
542 /* case folding buffers, only use current-level start/limit */
543 UChar fold1
[UCASE_MAX_STRING_LENGTH
+1], fold2
[UCASE_MAX_STRING_LENGTH
+1];
545 /* track which is the current level per string */
546 int32_t level1
, level2
;
548 /* current code units, and code points for lookups */
549 UChar32 c1
, c2
, cp1
, cp2
;
551 /* no argument error checking because this itself is not an API */
554 * assume that at least the option U_COMPARE_IGNORE_CASE is set
555 * otherwise this function would have to behave exactly as uprv_strCompare()
557 csp
=ucase_getSingleton(pErrorCode
);
558 if(U_FAILURE(*pErrorCode
)) {
580 /* comparison loop */
583 * here a code unit value of -1 means "get another code unit"
584 * below it will mean "this source is finished"
588 /* get next code unit from string 1, post-increment */
/*
 * end of the current level: either the limit is reached, or a NUL is read
 * while there is no limit or _STRNCMP_STYLE is set
 */
590 if(s1
==limit1
|| ((c1
=*s1
)==0 && (limit1
==NULL
|| (options
&_STRNCMP_STYLE
)))) {
600 /* reached end of level buffer, pop one level */
603 start1
=stack1
[level1
].start
;
604 } while(start1
==NULL
);
606 limit1
=stack1
[level1
].limit
;
611 /* get next code unit from string 2, post-increment */
613 if(s2
==limit2
|| ((c2
=*s2
)==0 && (limit2
==NULL
|| (options
&_STRNCMP_STYLE
)))) {
623 /* reached end of level buffer, pop one level */
626 start2
=stack2
[level2
].start
;
627 } while(start2
==NULL
);
629 limit2
=stack2
[level2
].limit
;
635 * either variable c1, c2 is -1 only if the corresponding string is finished
639 return 0; /* c1==c2==-1 indicating end of strings */
641 c1
=c2
=-1; /* make us fetch new code units */
644 return -1; /* string 1 ends before string 2 */
646 return 1; /* string 2 ends before string 1 */
648 /* c1!=c2 && c1>=0 && c2>=0 */
650 /* get complete code points for c1, c2 for lookups if either is a surrogate */
652 if(U_IS_SURROGATE(c1
)) {
655 if(U_IS_SURROGATE_LEAD(c1
)) {
656 if(s1
!=limit1
&& U16_IS_TRAIL(c
=*s1
)) {
657 /* advance ++s1; only below if cp1 decomposes/case-folds */
658 cp1
=U16_GET_SUPPLEMENTARY(c1
, c
);
660 } else /* isTrail(c1) */ {
/*
 * s1 was post-incremented when c1 was fetched, so s1-2 addresses the unit in
 * front of c1; the comparison with start1 keeps the read inside the level
 */
661 if(start1
<=(s1
-2) && U16_IS_LEAD(c
=*(s1
-2))) {
662 cp1
=U16_GET_SUPPLEMENTARY(c
, c1
);
668 if(U_IS_SURROGATE(c2
)) {
671 if(U_IS_SURROGATE_LEAD(c2
)) {
672 if(s2
!=limit2
&& U16_IS_TRAIL(c
=*s2
)) {
673 /* advance ++s2; only below if cp2 decomposes/case-folds */
674 cp2
=U16_GET_SUPPLEMENTARY(c2
, c
);
676 } else /* isTrail(c2) */ {
677 if(start2
<=(s2
-2) && U16_IS_LEAD(c
=*(s2
-2))) {
678 cp2
=U16_GET_SUPPLEMENTARY(c
, c2
);
684 * go down one level for each string
685 * continue with the main loop as soon as there is a real change
689 (length
=ucase_toFullFolding(csp
, (UChar32
)cp1
, &p
, options
))>=0
691 /* cp1 case-folds to the code point "length" or to p[length] */
692 if(U_IS_SURROGATE(c1
)) {
693 if(U_IS_SURROGATE_LEAD(c1
)) {
694 /* advance beyond source surrogate pair if it case-folds */
696 } else /* isTrail(c1) */ {
698 * we got a supplementary code point when hitting its trail surrogate,
699 * therefore the lead surrogate must have been the same as in the other string;
700 * compare this decomposition with the lead surrogate in the other string
701 * remember that this simulates bulk text replacement:
702 * the decomposition would replace the entire code point
709 /* push current level pointers */
710 stack1
[0].start
=start1
;
712 stack1
[0].limit
=limit1
;
715 /* copy the folding result to fold1[] */
716 if(length
<=UCASE_MAX_STRING_LENGTH
) {
717 u_memcpy(fold1
, p
, length
);
720 U16_APPEND_UNSAFE(fold1
, i
, length
);
724 /* set next level pointers to case folding */
728 /* get ready to read from decomposition, continue with loop */
734 (length
=ucase_toFullFolding(csp
, (UChar32
)cp2
, &p
, options
))>=0
736 /* cp2 case-folds to the code point "length" or to p[length] */
737 if(U_IS_SURROGATE(c2
)) {
738 if(U_IS_SURROGATE_LEAD(c2
)) {
739 /* advance beyond source surrogate pair if it case-folds */
741 } else /* isTrail(c2) */ {
743 * we got a supplementary code point when hitting its trail surrogate,
744 * therefore the lead surrogate must have been the same as in the other string;
745 * compare this decomposition with the lead surrogate in the other string
746 * remember that this simulates bulk text replacement:
747 * the decomposition would replace the entire code point
754 /* push current level pointers */
755 stack2
[0].start
=start2
;
757 stack2
[0].limit
=limit2
;
760 /* copy the folding result to fold2[] */
761 if(length
<=UCASE_MAX_STRING_LENGTH
) {
762 u_memcpy(fold2
, p
, length
);
765 U16_APPEND_UNSAFE(fold2
, i
, length
);
769 /* set next level pointers to case folding */
773 /* get ready to read from decomposition, continue with loop */
779 * no decomposition/case folding, max level for both sides:
780 * return difference result
782 * code point order comparison must not just return cp1-cp2
783 * because when single surrogates are present then the surrogate pairs
784 * that formed cp1 and cp2 may be from different string indexes
786 * example: { d800 d800 dc01 } vs. { d800 dc00 }, compare at second code units
787 * c1=d800 cp1=10001 c2=dc00 cp2=10000
788 * cp1-cp2>0 but c1-c2<0 and in fact in UTF-32 it is { d800 10001 } < { 10000 }
790 * therefore, use same fix-up as in ustring.c/uprv_strCompare()
791 * except: uprv_strCompare() fetches c=*s while this functions fetches c=*s++
792 * so we have slightly different pointer/start/limit comparisons here
/* the fix-up is needed only when both units are >=0xd800 and code point order is requested */
795 if(c1
>=0xd800 && c2
>=0xd800 && (options
&U_COMPARE_CODE_POINT_ORDER
)) {
796 /* subtract 0x2800 from BMP code points to make them smaller than supplementary ones */
798 (c1
<=0xdbff && s1
!=limit1
&& U16_IS_TRAIL(*s1
)) ||
799 (U16_IS_TRAIL(c1
) && start1
!=(s1
-1) && U16_IS_LEAD(*(s1
-2)))
801 /* part of a surrogate pair, leave >=d800 */
803 /* BMP code point - may be surrogate code point - make <d800 */
808 (c2
<=0xdbff && s2
!=limit2
&& U16_IS_TRAIL(*s2
)) ||
809 (U16_IS_TRAIL(c2
) && start2
!=(s2
-1) && U16_IS_LEAD(*(s2
-2)))
811 /* part of a surrogate pair, leave >=d800 */
813 /* BMP code point - may be surrogate code point - make <d800 */
822 /* public API functions */
/*
 * u_strCaseCompare(): public API, case-insensitive comparison of two strings
 * given with explicit lengths.
 * Checks its arguments first: a NULL or already failing pErrorCode is
 * handled up front, and a NULL string or a length below -1 sets
 * U_ILLEGAL_ARGUMENT_ERROR. Otherwise it returns the result of
 * u_strcmpFold() with U_COMPARE_IGNORE_CASE added to the caller's options.
 */
824 U_CAPI
int32_t U_EXPORT2
825 u_strCaseCompare(const UChar
*s1
, int32_t length1
,
826 const UChar
*s2
, int32_t length2
,
828 UErrorCode
*pErrorCode
) {
829 /* argument checking */
830 if(pErrorCode
==0 || U_FAILURE(*pErrorCode
)) {
833 if(s1
==NULL
|| length1
<-1 || s2
==NULL
|| length2
<-1) {
834 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
837 return u_strcmpFold(s1
, length1
, s2
, length2
,
838 options
|U_COMPARE_IGNORE_CASE
,
/*
 * u_strcasecmp(): public API, case-insensitive comparison of two strings.
 * Passes -1 as both lengths to u_strcmpFold() and adds U_COMPARE_IGNORE_CASE
 * to options. A local UErrorCode is used because this function has no error
 * parameter; s1 and s2 are not checked here.
 */
842 U_CAPI
int32_t U_EXPORT2
843 u_strcasecmp(const UChar
*s1
, const UChar
*s2
, uint32_t options
) {
844 UErrorCode errorCode
=U_ZERO_ERROR
;
845 return u_strcmpFold(s1
, -1, s2
, -1,
846 options
|U_COMPARE_IGNORE_CASE
,
/*
 * u_memcasecmp(): public API, case-insensitive comparison of two buffers
 * that share one length.
 * Passes length for both strings to u_strcmpFold() and adds
 * U_COMPARE_IGNORE_CASE to options. A local UErrorCode is used because this
 * function has no error parameter; the arguments are not checked here.
 */
850 U_CAPI
int32_t U_EXPORT2
851 u_memcasecmp(const UChar
*s1
, const UChar
*s2
, int32_t length
, uint32_t options
) {
852 UErrorCode errorCode
=U_ZERO_ERROR
;
853 return u_strcmpFold(s1
, length
, s2
, length
,
854 options
|U_COMPARE_IGNORE_CASE
,
/*
 * u_strncasecmp(): public API, case-insensitive comparison limited to n
 * code units per string.
 * Passes n as both lengths to u_strcmpFold() and adds U_COMPARE_IGNORE_CASE
 * and _STRNCMP_STYLE to options; with _STRNCMP_STYLE set, u_strcmpFold()
 * also treats a NUL code unit as the end of a string even when a limit is
 * given. A local UErrorCode is used because this function has no error
 * parameter.
 */
858 U_CAPI
int32_t U_EXPORT2
859 u_strncasecmp(const UChar
*s1
, const UChar
*s2
, int32_t n
, uint32_t options
) {
860 UErrorCode errorCode
=U_ZERO_ERROR
;
861 return u_strcmpFold(s1
, n
, s2
, n
,
862 options
|(U_COMPARE_IGNORE_CASE
|_STRNCMP_STYLE
),