1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 *******************************************************************************
6 * Copyright (C) 2001-2015, International Business Machines
7 * Corporation and others. All Rights Reserved.
9 *******************************************************************************
10 * file name: ustrcase.cpp
12 * tab size: 8 (not used)
15 * created on: 2002feb20
16 * created by: Markus W. Scherer
18 * Implementation file for string casing C API functions.
19 * Uses functions from uchar.c for basic functionality that requires access
20 * to the Unicode Character Database (uprops.dat).
23 #include "unicode/utypes.h"
24 #include "unicode/brkiter.h"
25 #include "unicode/casemap.h"
26 #include "unicode/edits.h"
27 #include "unicode/ustring.h"
28 #include "unicode/ucasemap.h"
29 #include "unicode/ubrk.h"
30 #include "unicode/utf.h"
31 #include "unicode/utf16.h"
34 #include "ucasemap_imp.h"
42 int32_t checkOverflowAndEditsError(int32_t destIndex
, int32_t destCapacity
,
43 Edits
*edits
, UErrorCode
&errorCode
) {
44 if (U_SUCCESS(errorCode
)) {
45 if (destIndex
> destCapacity
) {
46 errorCode
= U_BUFFER_OVERFLOW_ERROR
;
47 } else if (edits
!= NULL
) {
48 edits
->copyErrorTo(errorCode
);
60 /* string casing ------------------------------------------------------------ */
62 /* Appends a full case mapping result, see UCASE_MAX_STRING_LENGTH. */
64 appendResult(UChar
*dest
, int32_t destIndex
, int32_t destCapacity
,
65 int32_t result
, const UChar
*s
,
66 int32_t cpLength
, uint32_t options
, icu::Edits
*edits
) {
70 /* decode the result */
72 /* (not) original code point */
74 edits
->addUnchanged(cpLength
);
75 if(options
& UCASEMAP_OMIT_UNCHANGED_TEXT
) {
80 if(destIndex
<destCapacity
&& c
<=0xffff) { // BMP slightly-fastpath
81 dest
[destIndex
++]=(UChar
)c
;
86 if(result
<=UCASE_MAX_STRING_LENGTH
) {
89 } else if(destIndex
<destCapacity
&& result
<=0xffff) { // BMP slightly-fastpath
90 dest
[destIndex
++]=(UChar
)result
;
92 edits
->addReplace(cpLength
, 1);
100 edits
->addReplace(cpLength
, length
);
103 if(length
>(INT32_MAX
-destIndex
)) {
104 return -1; // integer overflow
107 if(destIndex
<destCapacity
) {
108 /* append the result */
112 U16_APPEND(dest
, destIndex
, destCapacity
, c
, isError
);
114 /* overflow, nothing written */
119 if((destIndex
+length
)<=destCapacity
) {
121 dest
[destIndex
++]=*s
++;
136 static inline int32_t
137 appendUChar(UChar
*dest
, int32_t destIndex
, int32_t destCapacity
, UChar c
) {
138 if(destIndex
<destCapacity
) {
140 } else if(destIndex
==INT32_MAX
) {
141 return -1; // integer overflow
146 static inline int32_t
147 appendUnchanged(UChar
*dest
, int32_t destIndex
, int32_t destCapacity
,
148 const UChar
*s
, int32_t length
, uint32_t options
, icu::Edits
*edits
) {
151 edits
->addUnchanged(length
);
152 if(options
& UCASEMAP_OMIT_UNCHANGED_TEXT
) {
156 if(length
>(INT32_MAX
-destIndex
)) {
157 return -1; // integer overflow
159 if((destIndex
+length
)<=destCapacity
) {
160 u_memcpy(dest
+destIndex
, s
, length
);
167 static UChar32 U_CALLCONV
168 utf16_caseContextIterator(void *context
, int8_t dir
) {
169 UCaseContext
*csc
=(UCaseContext
*)context
;
173 /* reset for backward iteration */
174 csc
->index
=csc
->cpStart
;
177 /* reset for forward iteration */
178 csc
->index
=csc
->cpLimit
;
181 /* continue current iteration direction */
186 if(csc
->start
<csc
->index
) {
187 U16_PREV((const UChar
*)csc
->p
, csc
->start
, csc
->index
, c
);
191 if(csc
->index
<csc
->limit
) {
192 U16_NEXT((const UChar
*)csc
->p
, csc
->index
, csc
->limit
, c
);
200 * Case-maps [srcStart..srcLimit[ but takes
201 * context [0..srcLength[ into account.
204 _caseMap(int32_t caseLocale
, uint32_t options
, UCaseMapFull
*map
,
205 UChar
*dest
, int32_t destCapacity
,
206 const UChar
*src
, UCaseContext
*csc
,
207 int32_t srcStart
, int32_t srcLimit
,
209 UErrorCode
&errorCode
) {
210 /* case mapping loop */
211 int32_t srcIndex
=srcStart
;
213 while(srcIndex
<srcLimit
) {
215 csc
->cpStart
=cpStart
=srcIndex
;
217 U16_NEXT(src
, srcIndex
, srcLimit
, c
);
218 csc
->cpLimit
=srcIndex
;
220 c
=map(c
, utf16_caseContextIterator
, csc
, &s
, caseLocale
);
221 destIndex
= appendResult(dest
, destIndex
, destCapacity
, c
, s
,
222 srcIndex
- cpStart
, options
, edits
);
224 errorCode
= U_INDEX_OUTOFBOUNDS_ERROR
;
232 #if !UCONFIG_NO_BREAK_ITERATION
234 U_CFUNC
int32_t U_CALLCONV
235 ustrcase_internalToTitle(int32_t caseLocale
, uint32_t options
, BreakIterator
*iter
,
236 UChar
*dest
, int32_t destCapacity
,
237 const UChar
*src
, int32_t srcLength
,
239 UErrorCode
&errorCode
) {
240 if(U_FAILURE(errorCode
)) {
244 /* set up local variables */
245 UCaseContext csc
=UCASECONTEXT_INITIALIZER
;
250 UBool isFirstIndex
=TRUE
;
252 /* titlecasing loop */
253 while(prev
<srcLength
) {
254 /* find next index where to titlecase */
262 if(index
==UBRK_DONE
|| index
>srcLength
) {
267 * Unicode 4 & 5 section 3.13 Default Case Operations:
269 * R3 toTitlecase(X): Find the word boundaries based on Unicode Standard Annex
270 * #29, "Text Boundaries." Between each pair of word boundaries, find the first
271 * cased character F. If F exists, map F to default_title(F); then map each
272 * subsequent character C to default_lower(C).
274 * In this implementation, segment [prev..index[ into 3 parts:
275 * a) uncased characters (copy as-is) [prev..titleStart[
276 * b) first cased letter (titlecase) [titleStart..titleLimit[
277 * c) subsequent characters (lowercase) [titleLimit..index[
280 /* find and copy uncased characters [prev..titleStart[ */
281 int32_t titleStart
=prev
;
282 int32_t titleLimit
=prev
;
284 U16_NEXT(src
, titleLimit
, index
, c
);
285 if((options
&U_TITLECASE_NO_BREAK_ADJUSTMENT
)==0 && UCASE_NONE
==ucase_getType(c
)) {
286 /* Adjust the titlecasing index (titleStart) to the next cased character. */
288 titleStart
=titleLimit
;
289 if(titleLimit
==index
) {
291 * only uncased characters in [prev..index[
292 * stop with titleStart==titleLimit==index
296 U16_NEXT(src
, titleLimit
, index
, c
);
297 if(UCASE_NONE
!=ucase_getType(c
)) {
298 break; /* cased letter at [titleStart..titleLimit[ */
301 destIndex
=appendUnchanged(dest
, destIndex
, destCapacity
,
302 src
+prev
, titleStart
-prev
, options
, edits
);
304 errorCode
=U_INDEX_OUTOFBOUNDS_ERROR
;
309 if(titleStart
<titleLimit
) {
310 /* titlecase c which is from [titleStart..titleLimit[ */
311 csc
.cpStart
=titleStart
;
312 csc
.cpLimit
=titleLimit
;
314 c
=ucase_toFullTitle(c
, utf16_caseContextIterator
, &csc
, &s
, caseLocale
);
315 destIndex
=appendResult(dest
, destIndex
, destCapacity
, c
, s
,
316 titleLimit
-titleStart
, options
, edits
);
318 errorCode
=U_INDEX_OUTOFBOUNDS_ERROR
;
322 /* Special case Dutch IJ titlecasing */
323 if (titleStart
+1 < index
&&
324 caseLocale
== UCASE_LOC_DUTCH
&&
325 (src
[titleStart
] == 0x0049 || src
[titleStart
] == 0x0069)) {
326 if (src
[titleStart
+1] == 0x006A) {
327 destIndex
=appendUChar(dest
, destIndex
, destCapacity
, 0x004A);
329 errorCode
=U_INDEX_OUTOFBOUNDS_ERROR
;
333 edits
->addReplace(1, 1);
336 } else if (src
[titleStart
+1] == 0x004A) {
337 // Keep the capital J from getting lowercased.
338 destIndex
=appendUnchanged(dest
, destIndex
, destCapacity
,
339 src
+titleStart
+1, 1, options
, edits
);
341 errorCode
=U_INDEX_OUTOFBOUNDS_ERROR
;
348 /* lowercase [titleLimit..index[ */
349 if(titleLimit
<index
) {
350 if((options
&U_TITLECASE_NO_LOWERCASE
)==0) {
351 /* Normal operation: Lowercase the rest of the word. */
354 caseLocale
, options
, ucase_toFullLower
,
355 dest
+destIndex
, destCapacity
-destIndex
,
359 if(errorCode
==U_BUFFER_OVERFLOW_ERROR
) {
360 errorCode
=U_ZERO_ERROR
;
362 if(U_FAILURE(errorCode
)) {
366 /* Optionally just copy the rest of the word unchanged. */
367 destIndex
=appendUnchanged(dest
, destIndex
, destCapacity
,
368 src
+titleLimit
, index
-titleLimit
, options
, edits
);
370 errorCode
=U_INDEX_OUTOFBOUNDS_ERROR
;
381 return checkOverflowAndEditsError(destIndex
, destCapacity
, edits
, errorCode
);
384 #endif // !UCONFIG_NO_BREAK_ITERATION
387 namespace GreekUpper
{
389 // Data generated by prototype code, see
390 // http://site.icu-project.org/design/case/greek-upper
391 // TODO: Move this data into ucase.icu.
392 static const uint16_t data0370
[] = {
416 0x0391 | HAS_VOWEL
| HAS_ACCENT
,
418 0x0395 | HAS_VOWEL
| HAS_ACCENT
,
419 0x0397 | HAS_VOWEL
| HAS_ACCENT
,
420 0x0399 | HAS_VOWEL
| HAS_ACCENT
,
422 0x039F | HAS_VOWEL
| HAS_ACCENT
,
424 0x03A5 | HAS_VOWEL
| HAS_ACCENT
,
425 0x03A9 | HAS_VOWEL
| HAS_ACCENT
,
426 0x0399 | HAS_VOWEL
| HAS_ACCENT
| HAS_DIALYTIKA
,
452 0x0399 | HAS_VOWEL
| HAS_DIALYTIKA
,
453 0x03A5 | HAS_VOWEL
| HAS_DIALYTIKA
,
454 0x0391 | HAS_VOWEL
| HAS_ACCENT
,
455 0x0395 | HAS_VOWEL
| HAS_ACCENT
,
456 0x0397 | HAS_VOWEL
| HAS_ACCENT
,
457 0x0399 | HAS_VOWEL
| HAS_ACCENT
,
458 0x03A5 | HAS_VOWEL
| HAS_ACCENT
| HAS_DIALYTIKA
,
484 0x0399 | HAS_VOWEL
| HAS_DIALYTIKA
,
485 0x03A5 | HAS_VOWEL
| HAS_DIALYTIKA
,
486 0x039F | HAS_VOWEL
| HAS_ACCENT
,
487 0x03A5 | HAS_VOWEL
| HAS_ACCENT
,
488 0x03A9 | HAS_VOWEL
| HAS_ACCENT
,
494 0x03D2 | HAS_DIALYTIKA
,
540 static const uint16_t data1F00
[] = {
544 0x0391 | HAS_VOWEL
| HAS_ACCENT
,
545 0x0391 | HAS_VOWEL
| HAS_ACCENT
,
546 0x0391 | HAS_VOWEL
| HAS_ACCENT
,
547 0x0391 | HAS_VOWEL
| HAS_ACCENT
,
548 0x0391 | HAS_VOWEL
| HAS_ACCENT
,
549 0x0391 | HAS_VOWEL
| HAS_ACCENT
,
552 0x0391 | HAS_VOWEL
| HAS_ACCENT
,
553 0x0391 | HAS_VOWEL
| HAS_ACCENT
,
554 0x0391 | HAS_VOWEL
| HAS_ACCENT
,
555 0x0391 | HAS_VOWEL
| HAS_ACCENT
,
556 0x0391 | HAS_VOWEL
| HAS_ACCENT
,
557 0x0391 | HAS_VOWEL
| HAS_ACCENT
,
560 0x0395 | HAS_VOWEL
| HAS_ACCENT
,
561 0x0395 | HAS_VOWEL
| HAS_ACCENT
,
562 0x0395 | HAS_VOWEL
| HAS_ACCENT
,
563 0x0395 | HAS_VOWEL
| HAS_ACCENT
,
568 0x0395 | HAS_VOWEL
| HAS_ACCENT
,
569 0x0395 | HAS_VOWEL
| HAS_ACCENT
,
570 0x0395 | HAS_VOWEL
| HAS_ACCENT
,
571 0x0395 | HAS_VOWEL
| HAS_ACCENT
,
576 0x0397 | HAS_VOWEL
| HAS_ACCENT
,
577 0x0397 | HAS_VOWEL
| HAS_ACCENT
,
578 0x0397 | HAS_VOWEL
| HAS_ACCENT
,
579 0x0397 | HAS_VOWEL
| HAS_ACCENT
,
580 0x0397 | HAS_VOWEL
| HAS_ACCENT
,
581 0x0397 | HAS_VOWEL
| HAS_ACCENT
,
584 0x0397 | HAS_VOWEL
| HAS_ACCENT
,
585 0x0397 | HAS_VOWEL
| HAS_ACCENT
,
586 0x0397 | HAS_VOWEL
| HAS_ACCENT
,
587 0x0397 | HAS_VOWEL
| HAS_ACCENT
,
588 0x0397 | HAS_VOWEL
| HAS_ACCENT
,
589 0x0397 | HAS_VOWEL
| HAS_ACCENT
,
592 0x0399 | HAS_VOWEL
| HAS_ACCENT
,
593 0x0399 | HAS_VOWEL
| HAS_ACCENT
,
594 0x0399 | HAS_VOWEL
| HAS_ACCENT
,
595 0x0399 | HAS_VOWEL
| HAS_ACCENT
,
596 0x0399 | HAS_VOWEL
| HAS_ACCENT
,
597 0x0399 | HAS_VOWEL
| HAS_ACCENT
,
600 0x0399 | HAS_VOWEL
| HAS_ACCENT
,
601 0x0399 | HAS_VOWEL
| HAS_ACCENT
,
602 0x0399 | HAS_VOWEL
| HAS_ACCENT
,
603 0x0399 | HAS_VOWEL
| HAS_ACCENT
,
604 0x0399 | HAS_VOWEL
| HAS_ACCENT
,
605 0x0399 | HAS_VOWEL
| HAS_ACCENT
,
608 0x039F | HAS_VOWEL
| HAS_ACCENT
,
609 0x039F | HAS_VOWEL
| HAS_ACCENT
,
610 0x039F | HAS_VOWEL
| HAS_ACCENT
,
611 0x039F | HAS_VOWEL
| HAS_ACCENT
,
616 0x039F | HAS_VOWEL
| HAS_ACCENT
,
617 0x039F | HAS_VOWEL
| HAS_ACCENT
,
618 0x039F | HAS_VOWEL
| HAS_ACCENT
,
619 0x039F | HAS_VOWEL
| HAS_ACCENT
,
624 0x03A5 | HAS_VOWEL
| HAS_ACCENT
,
625 0x03A5 | HAS_VOWEL
| HAS_ACCENT
,
626 0x03A5 | HAS_VOWEL
| HAS_ACCENT
,
627 0x03A5 | HAS_VOWEL
| HAS_ACCENT
,
628 0x03A5 | HAS_VOWEL
| HAS_ACCENT
,
629 0x03A5 | HAS_VOWEL
| HAS_ACCENT
,
633 0x03A5 | HAS_VOWEL
| HAS_ACCENT
,
635 0x03A5 | HAS_VOWEL
| HAS_ACCENT
,
637 0x03A5 | HAS_VOWEL
| HAS_ACCENT
,
640 0x03A9 | HAS_VOWEL
| HAS_ACCENT
,
641 0x03A9 | HAS_VOWEL
| HAS_ACCENT
,
642 0x03A9 | HAS_VOWEL
| HAS_ACCENT
,
643 0x03A9 | HAS_VOWEL
| HAS_ACCENT
,
644 0x03A9 | HAS_VOWEL
| HAS_ACCENT
,
645 0x03A9 | HAS_VOWEL
| HAS_ACCENT
,
648 0x03A9 | HAS_VOWEL
| HAS_ACCENT
,
649 0x03A9 | HAS_VOWEL
| HAS_ACCENT
,
650 0x03A9 | HAS_VOWEL
| HAS_ACCENT
,
651 0x03A9 | HAS_VOWEL
| HAS_ACCENT
,
652 0x03A9 | HAS_VOWEL
| HAS_ACCENT
,
653 0x03A9 | HAS_VOWEL
| HAS_ACCENT
,
654 0x0391 | HAS_VOWEL
| HAS_ACCENT
,
655 0x0391 | HAS_VOWEL
| HAS_ACCENT
,
656 0x0395 | HAS_VOWEL
| HAS_ACCENT
,
657 0x0395 | HAS_VOWEL
| HAS_ACCENT
,
658 0x0397 | HAS_VOWEL
| HAS_ACCENT
,
659 0x0397 | HAS_VOWEL
| HAS_ACCENT
,
660 0x0399 | HAS_VOWEL
| HAS_ACCENT
,
661 0x0399 | HAS_VOWEL
| HAS_ACCENT
,
662 0x039F | HAS_VOWEL
| HAS_ACCENT
,
663 0x039F | HAS_VOWEL
| HAS_ACCENT
,
664 0x03A5 | HAS_VOWEL
| HAS_ACCENT
,
665 0x03A5 | HAS_VOWEL
| HAS_ACCENT
,
666 0x03A9 | HAS_VOWEL
| HAS_ACCENT
,
667 0x03A9 | HAS_VOWEL
| HAS_ACCENT
,
670 0x0391 | HAS_VOWEL
| HAS_YPOGEGRAMMENI
,
671 0x0391 | HAS_VOWEL
| HAS_YPOGEGRAMMENI
,
672 0x0391 | HAS_VOWEL
| HAS_YPOGEGRAMMENI
| HAS_ACCENT
,
673 0x0391 | HAS_VOWEL
| HAS_YPOGEGRAMMENI
| HAS_ACCENT
,
674 0x0391 | HAS_VOWEL
| HAS_YPOGEGRAMMENI
| HAS_ACCENT
,
675 0x0391 | HAS_VOWEL
| HAS_YPOGEGRAMMENI
| HAS_ACCENT
,
676 0x0391 | HAS_VOWEL
| HAS_YPOGEGRAMMENI
| HAS_ACCENT
,
677 0x0391 | HAS_VOWEL
| HAS_YPOGEGRAMMENI
| HAS_ACCENT
,
678 0x0391 | HAS_VOWEL
| HAS_YPOGEGRAMMENI
,
679 0x0391 | HAS_VOWEL
| HAS_YPOGEGRAMMENI
,
680 0x0391 | HAS_VOWEL
| HAS_YPOGEGRAMMENI
| HAS_ACCENT
,
681 0x0391 | HAS_VOWEL
| HAS_YPOGEGRAMMENI
| HAS_ACCENT
,
682 0x0391 | HAS_VOWEL
| HAS_YPOGEGRAMMENI
| HAS_ACCENT
,
683 0x0391 | HAS_VOWEL
| HAS_YPOGEGRAMMENI
| HAS_ACCENT
,
684 0x0391 | HAS_VOWEL
| HAS_YPOGEGRAMMENI
| HAS_ACCENT
,
685 0x0391 | HAS_VOWEL
| HAS_YPOGEGRAMMENI
| HAS_ACCENT
,
686 0x0397 | HAS_VOWEL
| HAS_YPOGEGRAMMENI
,
687 0x0397 | HAS_VOWEL
| HAS_YPOGEGRAMMENI
,
688 0x0397 | HAS_VOWEL
| HAS_YPOGEGRAMMENI
| HAS_ACCENT
,
689 0x0397 | HAS_VOWEL
| HAS_YPOGEGRAMMENI
| HAS_ACCENT
,
690 0x0397 | HAS_VOWEL
| HAS_YPOGEGRAMMENI
| HAS_ACCENT
,
691 0x0397 | HAS_VOWEL
| HAS_YPOGEGRAMMENI
| HAS_ACCENT
,
692 0x0397 | HAS_VOWEL
| HAS_YPOGEGRAMMENI
| HAS_ACCENT
,
693 0x0397 | HAS_VOWEL
| HAS_YPOGEGRAMMENI
| HAS_ACCENT
,
694 0x0397 | HAS_VOWEL
| HAS_YPOGEGRAMMENI
,
695 0x0397 | HAS_VOWEL
| HAS_YPOGEGRAMMENI
,
696 0x0397 | HAS_VOWEL
| HAS_YPOGEGRAMMENI
| HAS_ACCENT
,
697 0x0397 | HAS_VOWEL
| HAS_YPOGEGRAMMENI
| HAS_ACCENT
,
698 0x0397 | HAS_VOWEL
| HAS_YPOGEGRAMMENI
| HAS_ACCENT
,
699 0x0397 | HAS_VOWEL
| HAS_YPOGEGRAMMENI
| HAS_ACCENT
,
700 0x0397 | HAS_VOWEL
| HAS_YPOGEGRAMMENI
| HAS_ACCENT
,
701 0x0397 | HAS_VOWEL
| HAS_YPOGEGRAMMENI
| HAS_ACCENT
,
702 0x03A9 | HAS_VOWEL
| HAS_YPOGEGRAMMENI
,
703 0x03A9 | HAS_VOWEL
| HAS_YPOGEGRAMMENI
,
704 0x03A9 | HAS_VOWEL
| HAS_YPOGEGRAMMENI
| HAS_ACCENT
,
705 0x03A9 | HAS_VOWEL
| HAS_YPOGEGRAMMENI
| HAS_ACCENT
,
706 0x03A9 | HAS_VOWEL
| HAS_YPOGEGRAMMENI
| HAS_ACCENT
,
707 0x03A9 | HAS_VOWEL
| HAS_YPOGEGRAMMENI
| HAS_ACCENT
,
708 0x03A9 | HAS_VOWEL
| HAS_YPOGEGRAMMENI
| HAS_ACCENT
,
709 0x03A9 | HAS_VOWEL
| HAS_YPOGEGRAMMENI
| HAS_ACCENT
,
710 0x03A9 | HAS_VOWEL
| HAS_YPOGEGRAMMENI
,
711 0x03A9 | HAS_VOWEL
| HAS_YPOGEGRAMMENI
,
712 0x03A9 | HAS_VOWEL
| HAS_YPOGEGRAMMENI
| HAS_ACCENT
,
713 0x03A9 | HAS_VOWEL
| HAS_YPOGEGRAMMENI
| HAS_ACCENT
,
714 0x03A9 | HAS_VOWEL
| HAS_YPOGEGRAMMENI
| HAS_ACCENT
,
715 0x03A9 | HAS_VOWEL
| HAS_YPOGEGRAMMENI
| HAS_ACCENT
,
716 0x03A9 | HAS_VOWEL
| HAS_YPOGEGRAMMENI
| HAS_ACCENT
,
717 0x03A9 | HAS_VOWEL
| HAS_YPOGEGRAMMENI
| HAS_ACCENT
,
720 0x0391 | HAS_VOWEL
| HAS_YPOGEGRAMMENI
| HAS_ACCENT
,
721 0x0391 | HAS_VOWEL
| HAS_YPOGEGRAMMENI
,
722 0x0391 | HAS_VOWEL
| HAS_YPOGEGRAMMENI
| HAS_ACCENT
,
724 0x0391 | HAS_VOWEL
| HAS_ACCENT
,
725 0x0391 | HAS_VOWEL
| HAS_YPOGEGRAMMENI
| HAS_ACCENT
,
728 0x0391 | HAS_VOWEL
| HAS_ACCENT
,
729 0x0391 | HAS_VOWEL
| HAS_ACCENT
,
730 0x0391 | HAS_VOWEL
| HAS_YPOGEGRAMMENI
,
736 0x0397 | HAS_VOWEL
| HAS_YPOGEGRAMMENI
| HAS_ACCENT
,
737 0x0397 | HAS_VOWEL
| HAS_YPOGEGRAMMENI
,
738 0x0397 | HAS_VOWEL
| HAS_YPOGEGRAMMENI
| HAS_ACCENT
,
740 0x0397 | HAS_VOWEL
| HAS_ACCENT
,
741 0x0397 | HAS_VOWEL
| HAS_YPOGEGRAMMENI
| HAS_ACCENT
,
742 0x0395 | HAS_VOWEL
| HAS_ACCENT
,
743 0x0395 | HAS_VOWEL
| HAS_ACCENT
,
744 0x0397 | HAS_VOWEL
| HAS_ACCENT
,
745 0x0397 | HAS_VOWEL
| HAS_ACCENT
,
746 0x0397 | HAS_VOWEL
| HAS_YPOGEGRAMMENI
,
752 0x0399 | HAS_VOWEL
| HAS_ACCENT
| HAS_DIALYTIKA
,
753 0x0399 | HAS_VOWEL
| HAS_ACCENT
| HAS_DIALYTIKA
,
756 0x0399 | HAS_VOWEL
| HAS_ACCENT
,
757 0x0399 | HAS_VOWEL
| HAS_ACCENT
| HAS_DIALYTIKA
,
760 0x0399 | HAS_VOWEL
| HAS_ACCENT
,
761 0x0399 | HAS_VOWEL
| HAS_ACCENT
,
768 0x03A5 | HAS_VOWEL
| HAS_ACCENT
| HAS_DIALYTIKA
,
769 0x03A5 | HAS_VOWEL
| HAS_ACCENT
| HAS_DIALYTIKA
,
772 0x03A5 | HAS_VOWEL
| HAS_ACCENT
,
773 0x03A5 | HAS_VOWEL
| HAS_ACCENT
| HAS_DIALYTIKA
,
776 0x03A5 | HAS_VOWEL
| HAS_ACCENT
,
777 0x03A5 | HAS_VOWEL
| HAS_ACCENT
,
784 0x03A9 | HAS_VOWEL
| HAS_YPOGEGRAMMENI
| HAS_ACCENT
,
785 0x03A9 | HAS_VOWEL
| HAS_YPOGEGRAMMENI
,
786 0x03A9 | HAS_VOWEL
| HAS_YPOGEGRAMMENI
| HAS_ACCENT
,
788 0x03A9 | HAS_VOWEL
| HAS_ACCENT
,
789 0x03A9 | HAS_VOWEL
| HAS_YPOGEGRAMMENI
| HAS_ACCENT
,
790 0x039F | HAS_VOWEL
| HAS_ACCENT
,
791 0x039F | HAS_VOWEL
| HAS_ACCENT
,
792 0x03A9 | HAS_VOWEL
| HAS_ACCENT
,
793 0x03A9 | HAS_VOWEL
| HAS_ACCENT
,
794 0x03A9 | HAS_VOWEL
| HAS_YPOGEGRAMMENI
,
// Single uint16_t letter-data value: 0x03A9 combined with the HAS_VOWEL flag.
// NOTE(review): judging by the name and the `c == 0x2126` branch in getLetterData(),
// this is presumably the entry returned for U+2126 — confirm, since that branch's
// return statement is not visible in this listing.
801 static const uint16_t data2126
= 0x03A9 | HAS_VOWEL
;
803 uint32_t getLetterData(UChar32 c
) {
804 if (c
< 0x370 || 0x2126 < c
|| (0x3ff < c
&& c
< 0x1f00)) {
806 } else if (c
<= 0x3ff) {
807 return data0370
[c
- 0x370];
808 } else if (c
<= 0x1fff) {
809 return data1F00
[c
- 0x1f00];
810 } else if (c
== 0x2126) {
817 uint32_t getDiacriticData(UChar32 c
) {
819 case 0x0300: // varia
820 case 0x0301: // tonos = oxia
821 case 0x0342: // perispomeni
822 case 0x0302: // circumflex can look like perispomeni
823 case 0x0303: // tilde can look like perispomeni
824 case 0x0311: // inverted breve can look like perispomeni
826 case 0x0308: // dialytika = diaeresis
827 return HAS_COMBINING_DIALYTIKA
;
828 case 0x0344: // dialytika tonos
829 return HAS_COMBINING_DIALYTIKA
| HAS_ACCENT
;
830 case 0x0345: // ypogegrammeni = iota subscript
831 return HAS_YPOGEGRAMMENI
;
832 case 0x0304: // macron
833 case 0x0306: // breve
834 case 0x0313: // comma above
835 case 0x0314: // reversed comma above
836 case 0x0343: // koronis
837 return HAS_OTHER_GREEK_DIACRITIC
;
843 UBool
isFollowedByCasedLetter(const UChar
*s
, int32_t i
, int32_t length
) {
846 U16_NEXT(s
, i
, length
, c
);
847 int32_t type
= ucase_getTypeOrIgnorable(c
);
848 if ((type
& UCASE_IGNORABLE
) != 0) {
849 // Case-ignorable, continue with the loop.
850 } else if (type
!= UCASE_NONE
) {
851 return TRUE
; // Followed by cased letter.
853 return FALSE
; // Uncased and not case-ignorable.
856 return FALSE
; // Not followed by cased letter.
860 * Greek string uppercasing with a state machine.
861 * Probably simpler than a stateless function that has to figure out complex context-before
862 * for each character.
863 * TODO: Try to re-consolidate one way or another with the non-Greek function.
865 int32_t toUpper(uint32_t options
,
866 UChar
*dest
, int32_t destCapacity
,
867 const UChar
*src
, int32_t srcLength
,
869 UErrorCode
&errorCode
) {
872 for (int32_t i
= 0; i
< srcLength
;) {
873 int32_t nextIndex
= i
;
875 U16_NEXT(src
, nextIndex
, srcLength
, c
);
876 uint32_t nextState
= 0;
877 int32_t type
= ucase_getTypeOrIgnorable(c
);
878 if ((type
& UCASE_IGNORABLE
) != 0) {
879 // c is case-ignorable
880 nextState
|= (state
& AFTER_CASED
);
881 } else if (type
!= UCASE_NONE
) {
883 nextState
|= AFTER_CASED
;
885 uint32_t data
= getLetterData(c
);
887 uint32_t upper
= data
& UPPER_MASK
;
888 // Add a dialytika to this iota or ypsilon vowel
889 // if we removed a tonos from the previous vowel,
890 // and that previous vowel did not also have (or gain) a dialytika.
891 // Adding one only to the final vowel in a longer sequence
892 // (which does not occur in normal writing) would require lookahead.
893 // Set the same flag as for preserving an existing dialytika.
894 if ((data
& HAS_VOWEL
) != 0 && (state
& AFTER_VOWEL_WITH_ACCENT
) != 0 &&
895 (upper
== 0x399 || upper
== 0x3A5)) {
896 data
|= HAS_DIALYTIKA
;
898 int32_t numYpogegrammeni
= 0; // Map each one to a trailing, spacing, capital iota.
899 if ((data
& HAS_YPOGEGRAMMENI
) != 0) {
900 numYpogegrammeni
= 1;
902 // Skip combining diacritics after this Greek letter.
903 while (nextIndex
< srcLength
) {
904 uint32_t diacriticData
= getDiacriticData(src
[nextIndex
]);
905 if (diacriticData
!= 0) {
906 data
|= diacriticData
;
907 if ((diacriticData
& HAS_YPOGEGRAMMENI
) != 0) {
912 break; // not a Greek diacritic
915 if ((data
& HAS_VOWEL_AND_ACCENT_AND_DIALYTIKA
) == HAS_VOWEL_AND_ACCENT
) {
916 nextState
|= AFTER_VOWEL_WITH_ACCENT
;
918 // Map according to Greek rules.
919 UBool addTonos
= FALSE
;
920 if (upper
== 0x397 &&
921 (data
& HAS_ACCENT
) != 0 &&
922 numYpogegrammeni
== 0 &&
923 (state
& AFTER_CASED
) == 0 &&
924 !isFollowedByCasedLetter(src
, nextIndex
, srcLength
)) {
925 // Keep disjunctive "or" with (only) a tonos.
926 // We use the same "word boundary" conditions as for the Final_Sigma test.
927 if (i
== nextIndex
) {
928 upper
= 0x389; // Preserve the precomposed form.
932 } else if ((data
& HAS_DIALYTIKA
) != 0) {
933 // Preserve a vowel with dialytika in precomposed form if it exists.
934 if (upper
== 0x399) {
936 data
&= ~HAS_EITHER_DIALYTIKA
;
937 } else if (upper
== 0x3A5) {
939 data
&= ~HAS_EITHER_DIALYTIKA
;
945 // Find out first whether we are changing the text.
946 change
= src
[i
] != upper
|| numYpogegrammeni
> 0;
948 if ((data
& HAS_EITHER_DIALYTIKA
) != 0) {
949 change
|= i2
>= nextIndex
|| src
[i2
] != 0x308;
953 change
|= i2
>= nextIndex
|| src
[i2
] != 0x301;
956 int32_t oldLength
= nextIndex
- i
;
957 int32_t newLength
= (i2
- i
) + numYpogegrammeni
;
958 change
|= oldLength
!= newLength
;
961 edits
->addReplace(oldLength
, newLength
);
965 edits
->addUnchanged(oldLength
);
967 // Write unchanged text?
968 change
= (options
& UCASEMAP_OMIT_UNCHANGED_TEXT
) == 0;
973 destIndex
=appendUChar(dest
, destIndex
, destCapacity
, (UChar
)upper
);
974 if (destIndex
>= 0 && (data
& HAS_EITHER_DIALYTIKA
) != 0) {
975 destIndex
=appendUChar(dest
, destIndex
, destCapacity
, 0x308); // restore or add a dialytika
977 if (destIndex
>= 0 && addTonos
) {
978 destIndex
=appendUChar(dest
, destIndex
, destCapacity
, 0x301);
980 while (destIndex
>= 0 && numYpogegrammeni
> 0) {
981 destIndex
=appendUChar(dest
, destIndex
, destCapacity
, 0x399);
985 errorCode
=U_INDEX_OUTOFBOUNDS_ERROR
;
991 c
=ucase_toFullUpper(c
, NULL
, NULL
, &s
, UCASE_LOC_GREEK
);
992 destIndex
= appendResult(dest
, destIndex
, destCapacity
, c
, s
,
993 nextIndex
- i
, options
, edits
);
995 errorCode
= U_INDEX_OUTOFBOUNDS_ERROR
;
1006 } // namespace GreekUpper
1009 /* functions available in the common library (for unistr_case.cpp) */
1011 U_CFUNC
int32_t U_CALLCONV
1012 ustrcase_internalToLower(int32_t caseLocale
, uint32_t options
, UCASEMAP_BREAK_ITERATOR_UNUSED
1013 UChar
*dest
, int32_t destCapacity
,
1014 const UChar
*src
, int32_t srcLength
,
1016 UErrorCode
&errorCode
) {
1017 UCaseContext csc
=UCASECONTEXT_INITIALIZER
;
1019 csc
.limit
=srcLength
;
1020 int32_t destIndex
= _caseMap(
1021 caseLocale
, options
, ucase_toFullLower
,
1023 src
, &csc
, 0, srcLength
,
1025 return checkOverflowAndEditsError(destIndex
, destCapacity
, edits
, errorCode
);
1028 U_CFUNC
int32_t U_CALLCONV
1029 ustrcase_internalToUpper(int32_t caseLocale
, uint32_t options
, UCASEMAP_BREAK_ITERATOR_UNUSED
1030 UChar
*dest
, int32_t destCapacity
,
1031 const UChar
*src
, int32_t srcLength
,
1033 UErrorCode
&errorCode
) {
1035 if (caseLocale
== UCASE_LOC_GREEK
) {
1036 destIndex
= GreekUpper::toUpper(options
, dest
, destCapacity
,
1037 src
, srcLength
, edits
, errorCode
);
1039 UCaseContext csc
=UCASECONTEXT_INITIALIZER
;
1041 csc
.limit
=srcLength
;
1042 destIndex
= _caseMap(
1043 caseLocale
, options
, ucase_toFullUpper
,
1045 src
, &csc
, 0, srcLength
,
1048 return checkOverflowAndEditsError(destIndex
, destCapacity
, edits
, errorCode
);
1051 U_CFUNC
int32_t U_CALLCONV
1052 ustrcase_internalFold(int32_t /* caseLocale */, uint32_t options
, UCASEMAP_BREAK_ITERATOR_UNUSED
1053 UChar
*dest
, int32_t destCapacity
,
1054 const UChar
*src
, int32_t srcLength
,
1056 UErrorCode
&errorCode
) {
1057 /* case mapping loop */
1058 int32_t srcIndex
= 0;
1059 int32_t destIndex
= 0;
1060 while (srcIndex
< srcLength
) {
1061 int32_t cpStart
= srcIndex
;
1063 U16_NEXT(src
, srcIndex
, srcLength
, c
);
1065 c
= ucase_toFullFolding(c
, &s
, options
);
1066 destIndex
= appendResult(dest
, destIndex
, destCapacity
, c
, s
,
1067 srcIndex
- cpStart
, options
, edits
);
1068 if (destIndex
< 0) {
1069 errorCode
= U_INDEX_OUTOFBOUNDS_ERROR
;
1074 return checkOverflowAndEditsError(destIndex
, destCapacity
, edits
, errorCode
);
1078 ustrcase_map(int32_t caseLocale
, uint32_t options
, UCASEMAP_BREAK_ITERATOR_PARAM
1079 UChar
*dest
, int32_t destCapacity
,
1080 const UChar
*src
, int32_t srcLength
,
1081 UStringCaseMapper
*stringCaseMapper
,
1083 UErrorCode
&errorCode
) {
1086 /* check argument values */
1087 if(U_FAILURE(errorCode
)) {
1090 if( destCapacity
<0 ||
1091 (dest
==NULL
&& destCapacity
>0) ||
1095 errorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
1099 /* get the string length */
1101 srcLength
=u_strlen(src
);
1104 /* check for overlapping source and destination */
1106 ((src
>=dest
&& src
<(dest
+destCapacity
)) ||
1107 (dest
>=src
&& dest
<(src
+srcLength
)))
1109 errorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
1116 destLength
=stringCaseMapper(caseLocale
, options
, UCASEMAP_BREAK_ITERATOR
1117 dest
, destCapacity
, src
, srcLength
, edits
, errorCode
);
1118 return u_terminateUChars(dest
, destCapacity
, destLength
, &errorCode
);
1122 ustrcase_mapWithOverlap(int32_t caseLocale
, uint32_t options
, UCASEMAP_BREAK_ITERATOR_PARAM
1123 UChar
*dest
, int32_t destCapacity
,
1124 const UChar
*src
, int32_t srcLength
,
1125 UStringCaseMapper
*stringCaseMapper
,
1126 UErrorCode
&errorCode
) {
1132 /* check argument values */
1133 if(U_FAILURE(errorCode
)) {
1136 if( destCapacity
<0 ||
1137 (dest
==NULL
&& destCapacity
>0) ||
1141 errorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
1145 /* get the string length */
1147 srcLength
=u_strlen(src
);
1150 /* check for overlapping source and destination */
1152 ((src
>=dest
&& src
<(dest
+destCapacity
)) ||
1153 (dest
>=src
&& dest
<(src
+srcLength
)))
1155 /* overlap: provide a temporary destination buffer and later copy the result */
1156 if(destCapacity
<=UPRV_LENGTHOF(buffer
)) {
1157 /* the stack buffer is large enough */
1160 /* allocate a buffer */
1161 temp
=(UChar
*)uprv_malloc(destCapacity
*U_SIZEOF_UCHAR
);
1163 errorCode
=U_MEMORY_ALLOCATION_ERROR
;
1171 destLength
=stringCaseMapper(caseLocale
, options
, UCASEMAP_BREAK_ITERATOR
1172 temp
, destCapacity
, src
, srcLength
, NULL
, errorCode
);
1174 /* copy the result string to the destination buffer */
1175 if (U_SUCCESS(errorCode
) && 0 < destLength
&& destLength
<= destCapacity
) {
1176 u_memmove(dest
, temp
, destLength
);
1183 return u_terminateUChars(dest
, destCapacity
, destLength
, &errorCode
);
1186 /* public API functions */
1188 U_CAPI
int32_t U_EXPORT2
1189 u_strFoldCase(UChar
*dest
, int32_t destCapacity
,
1190 const UChar
*src
, int32_t srcLength
,
1192 UErrorCode
*pErrorCode
) {
1193 return ustrcase_mapWithOverlap(
1194 UCASE_LOC_ROOT
, options
, UCASEMAP_BREAK_ITERATOR_NULL
1197 ustrcase_internalFold
, *pErrorCode
);
1202 int32_t CaseMap::fold(
1204 const UChar
*src
, int32_t srcLength
,
1205 UChar
*dest
, int32_t destCapacity
, Edits
*edits
,
1206 UErrorCode
&errorCode
) {
1207 return ustrcase_map(
1208 UCASE_LOC_ROOT
, options
, UCASEMAP_BREAK_ITERATOR_NULL
1211 ustrcase_internalFold
, edits
, errorCode
);
1216 /* case-insensitive string comparisons -------------------------------------- */
1219 * This function is a copy of unorm_cmpEquivFold() minus the parts for
1220 * canonical equivalence.
1221 * Keep the functions in sync, and see there for how this works.
1222 * The duplication is for modularization:
1223 * It makes caseless (but not canonical caseless) matches independent of
1224 * the normalization code.
1227 /* stack element for previous-level source/decomposition pointers */
1228 struct CmpEquivLevel
{
1229 const UChar
*start
, *s
, *limit
;
// C-style alias so the struct can be referred to as plain `CmpEquivLevel`
// (e.g. for the stack1/stack2 arrays declared in _cmpFold).
1231 typedef struct CmpEquivLevel CmpEquivLevel
;
1234 * Internal implementation code comparing two strings with case folding.
1235 * This function is called from u_strcmpFold() and u_caseInsensitivePrefixMatch().
1237 * @param s1 input string 1
1238 * @param length1 length of string 1, or -1 (NUL-terminated)
1239 * @param s2 input string 2
1240 * @param length2 length of string 2, or -1 (NUL-terminated)
1241 * @param options compare options
1242 * @param matchLen1 (output) length of partial prefix match in s1
1243 * @param matchLen2 (output) length of partial prefix match in s2
1244 * @param pErrorCode receives error status
1245 * @return The result of comparison
1247 static int32_t _cmpFold(
1248 const UChar
*s1
, int32_t length1
,
1249 const UChar
*s2
, int32_t length2
,
1251 int32_t *matchLen1
, int32_t *matchLen2
,
1252 UErrorCode
*pErrorCode
) {
1255 /* current-level start/limit - s1/s2 as current */
1256 const UChar
*start1
, *start2
, *limit1
, *limit2
;
1258 /* points to the original start address */
1259 const UChar
*org1
, *org2
;
1261 /* points to the end of match + 1 */
1262 const UChar
*m1
, *m2
;
1264 /* case folding variables */
1268 /* stacks of previous-level start/current/limit */
1269 CmpEquivLevel stack1
[2], stack2
[2];
1271 /* case folding buffers, only use current-level start/limit */
1272 UChar fold1
[UCASE_MAX_STRING_LENGTH
+1], fold2
[UCASE_MAX_STRING_LENGTH
+1];
1274 /* track which is the current level per string */
1275 int32_t level1
, level2
;
1277 /* current code units, and code points for lookups */
1278 UChar32 c1
, c2
, cp1
, cp2
;
1280 /* no argument error checking because this itself is not an API */
1283 * assume that at least the option U_COMPARE_IGNORE_CASE is set;
1284 * otherwise this function would have to behave exactly like uprv_strCompare()
1286 if(U_FAILURE(*pErrorCode
)) {
1292 U_ASSERT(matchLen2
!=NULL
);
1314 /* comparison loop */
1317 * here a code unit value of -1 means "get another code unit"
1318 * below it will mean "this source is finished"
1322 /* get next code unit from string 1, post-increment */
1324 if(s1
==limit1
|| ((c1
=*s1
)==0 && (limit1
==NULL
|| (options
&_STRNCMP_STYLE
)))) {
1334 /* reached end of level buffer, pop one level */
1337 start1
=stack1
[level1
].start
; /*Not uninitialized*/
1338 } while(start1
==NULL
);
1339 s1
=stack1
[level1
].s
; /*Not uninitialized*/
1340 limit1
=stack1
[level1
].limit
; /*Not uninitialized*/
1345 /* get next code unit from string 2, post-increment */
1347 if(s2
==limit2
|| ((c2
=*s2
)==0 && (limit2
==NULL
|| (options
&_STRNCMP_STYLE
)))) {
1357 /* reached end of level buffer, pop one level */
1360 start2
=stack2
[level2
].start
; /*Not uninitialized*/
1361 } while(start2
==NULL
);
1362 s2
=stack2
[level2
].s
; /*Not uninitialized*/
1363 limit2
=stack2
[level2
].limit
; /*Not uninitialized*/
1369 * either variable c1, c2 is -1 only if the corresponding string is finished
1372 const UChar
*next1
, *next2
;
1375 cmpRes
=0; /* c1==c2==-1 indicating end of strings */
1380 * Note: Move the match positions in both strings at the same time
1381 * only when corresponding code point(s) in the original strings
1382 * are fully consumed. For example, when comparing s1="Fust" and
1383 * s2="Fu\u00dfball", s2[2] is folded into "ss", and s1[2] matches
1384 * the first code point in the case-folded data. But the second "s"
1385 * has no matching code point in s1, so this implementation returns
1386 * 2 as the prefix match length ("Fu").
1391 } else if(s1
==limit1
) {
1392 /* Note: This implementation only uses a single stack level.
1393 * If this code needs to be changed to use multiple stack
1394 * levels, the code above should check whether the current
1395 * code is at the end of all stacks.
1397 U_ASSERT(level1
==1);
1399 /* is s1 at the end of the current stack? */
1406 } else if(s2
==limit2
) {
1407 U_ASSERT(level2
==1);
1409 /* is s2 at the end of the current stack? */
1417 c1
=c2
=-1; /* make us fetch new code units */
1420 cmpRes
=-1; /* string 1 ends before string 2 */
1423 cmpRes
=1; /* string 2 ends before string 1 */
1426 /* c1!=c2 && c1>=0 && c2>=0 */
1428 /* get complete code points for c1, c2 for lookups if either is a surrogate */
1430 if(U_IS_SURROGATE(c1
)) {
1433 if(U_IS_SURROGATE_LEAD(c1
)) {
1434 if(s1
!=limit1
&& U16_IS_TRAIL(c
=*s1
)) {
1435 /* advance ++s1; only below if cp1 decomposes/case-folds */
1436 cp1
=U16_GET_SUPPLEMENTARY(c1
, c
);
1438 } else /* isTrail(c1) */ {
1439 if(start1
<=(s1
-2) && U16_IS_LEAD(c
=*(s1
-2))) {
1440 cp1
=U16_GET_SUPPLEMENTARY(c
, c1
);
1446 if(U_IS_SURROGATE(c2
)) {
1449 if(U_IS_SURROGATE_LEAD(c2
)) {
1450 if(s2
!=limit2
&& U16_IS_TRAIL(c
=*s2
)) {
1451 /* advance ++s2; only below if cp2 decomposes/case-folds */
1452 cp2
=U16_GET_SUPPLEMENTARY(c2
, c
);
1454 } else /* isTrail(c2) */ {
1455 if(start2
<=(s2
-2) && U16_IS_LEAD(c
=*(s2
-2))) {
1456 cp2
=U16_GET_SUPPLEMENTARY(c
, c2
);
1462 * go down one level for each string
1463 * continue with the main loop as soon as there is a real change
1467 (length
=ucase_toFullFolding((UChar32
)cp1
, &p
, options
))>=0
1469 /* cp1 case-folds to the code point "length" or to p[length] */
1470 if(U_IS_SURROGATE(c1
)) {
1471 if(U_IS_SURROGATE_LEAD(c1
)) {
1472 /* advance beyond source surrogate pair if it case-folds */
1474 } else /* isTrail(c1) */ {
1476 * we got a supplementary code point when hitting its trail surrogate,
1477 * therefore the lead surrogate must have been the same as in the other string;
1478 * compare this decomposition with the lead surrogate in the other string
1479 * remember that this simulates bulk text replacement:
1480 * the decomposition would replace the entire code point
1488 /* push current level pointers */
1489 stack1
[0].start
=start1
;
1491 stack1
[0].limit
=limit1
;
1494 /* copy the folding result to fold1[] */
1495 if(length
<=UCASE_MAX_STRING_LENGTH
) {
1496 u_memcpy(fold1
, p
, length
);
1499 U16_APPEND_UNSAFE(fold1
, i
, length
);
1503 /* set next level pointers to case folding */
1505 limit1
=fold1
+length
;
1507 /* get ready to read from decomposition, continue with loop */
1513 (length
=ucase_toFullFolding((UChar32
)cp2
, &p
, options
))>=0
1515 /* cp2 case-folds to the code point "length" or to p[length] */
1516 if(U_IS_SURROGATE(c2
)) {
1517 if(U_IS_SURROGATE_LEAD(c2
)) {
1518 /* advance beyond source surrogate pair if it case-folds */
1520 } else /* isTrail(c2) */ {
1522 * we got a supplementary code point when hitting its trail surrogate,
1523 * therefore the lead surrogate must have been the same as in the other string;
1524 * compare this decomposition with the lead surrogate in the other string
1525 * remember that this simulates bulk text replacement:
1526 * the decomposition would replace the entire code point
1534 /* push current level pointers */
1535 stack2
[0].start
=start2
;
1537 stack2
[0].limit
=limit2
;
1540 /* copy the folding result to fold2[] */
1541 if(length
<=UCASE_MAX_STRING_LENGTH
) {
1542 u_memcpy(fold2
, p
, length
);
1545 U16_APPEND_UNSAFE(fold2
, i
, length
);
1549 /* set next level pointers to case folding */
1551 limit2
=fold2
+length
;
1553 /* get ready to read from decomposition, continue with loop */
1559 * no decomposition/case folding, max level for both sides:
1560 * return difference result
1562 * code point order comparison must not just return cp1-cp2
1563 * because when single surrogates are present then the surrogate pairs
1564 * that formed cp1 and cp2 may be from different string indexes
1566 * example: { d800 d800 dc01 } vs. { d800 dc00 }, compare at second code units
1567 * c1=d800 cp1=10001 c2=dc00 cp2=10000
1568 * cp1-cp2>0 but c1-c2<0 and in fact in UTF-32 it is { d800 10001 } < { 10000 }
1570 * therefore, use same fix-up as in ustring.c/uprv_strCompare()
1571 * except: uprv_strCompare() fetches c=*s while this function fetches c=*s++
1572 * so we have slightly different pointer/start/limit comparisons here
1575 if(c1
>=0xd800 && c2
>=0xd800 && (options
&U_COMPARE_CODE_POINT_ORDER
)) {
1576 /* subtract 0x2800 from BMP code points to make them smaller than supplementary ones */
1578 (c1
<=0xdbff && s1
!=limit1
&& U16_IS_TRAIL(*s1
)) ||
1579 (U16_IS_TRAIL(c1
) && start1
!=(s1
-1) && U16_IS_LEAD(*(s1
-2)))
1581 /* part of a surrogate pair, leave >=d800 */
1583 /* BMP code point - may be surrogate code point - make <d800 */
1588 (c2
<=0xdbff && s2
!=limit2
&& U16_IS_TRAIL(*s2
)) ||
1589 (U16_IS_TRAIL(c2
) && start2
!=(s2
-1) && U16_IS_LEAD(*(s2
-2)))
1591 /* part of a surrogate pair, leave >=d800 */
1593 /* BMP code point - may be surrogate code point - make <d800 */
1609 /* internal function */
1611 u_strcmpFold(const UChar
*s1
, int32_t length1
,
1612 const UChar
*s2
, int32_t length2
,
1614 UErrorCode
*pErrorCode
) {
1615 return _cmpFold(s1
, length1
, s2
, length2
, options
, NULL
, NULL
, pErrorCode
);
1618 /* public API functions */
1620 U_CAPI
int32_t U_EXPORT2
1621 u_strCaseCompare(const UChar
*s1
, int32_t length1
,
1622 const UChar
*s2
, int32_t length2
,
1624 UErrorCode
*pErrorCode
) {
1625 /* argument checking */
1626 if(pErrorCode
==0 || U_FAILURE(*pErrorCode
)) {
1629 if(s1
==NULL
|| length1
<-1 || s2
==NULL
|| length2
<-1) {
1630 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
1633 return u_strcmpFold(s1
, length1
, s2
, length2
,
1634 options
|U_COMPARE_IGNORE_CASE
,
1638 U_CAPI
int32_t U_EXPORT2
1639 u_strcasecmp(const UChar
*s1
, const UChar
*s2
, uint32_t options
) {
1640 UErrorCode errorCode
=U_ZERO_ERROR
;
1641 return u_strcmpFold(s1
, -1, s2
, -1,
1642 options
|U_COMPARE_IGNORE_CASE
,
1646 U_CAPI
int32_t U_EXPORT2
1647 u_memcasecmp(const UChar
*s1
, const UChar
*s2
, int32_t length
, uint32_t options
) {
1648 UErrorCode errorCode
=U_ZERO_ERROR
;
1649 return u_strcmpFold(s1
, length
, s2
, length
,
1650 options
|U_COMPARE_IGNORE_CASE
,
1654 U_CAPI
int32_t U_EXPORT2
1655 u_strncasecmp(const UChar
*s1
, const UChar
*s2
, int32_t n
, uint32_t options
) {
1656 UErrorCode errorCode
=U_ZERO_ERROR
;
1657 return u_strcmpFold(s1
, n
, s2
, n
,
1658 options
|(U_COMPARE_IGNORE_CASE
|_STRNCMP_STYLE
),
1662 /* internal API - detect length of shared prefix */
1664 u_caseInsensitivePrefixMatch(const UChar
*s1
, int32_t length1
,
1665 const UChar
*s2
, int32_t length2
,
1667 int32_t *matchLen1
, int32_t *matchLen2
,
1668 UErrorCode
*pErrorCode
) {
1669 _cmpFold(s1
, length1
, s2
, length2
, options
,
1670 matchLen1
, matchLen2
, pErrorCode
);