1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 *******************************************************************************
6 * Copyright (C) 2001-2015, International Business Machines
7 * Corporation and others. All Rights Reserved.
9 *******************************************************************************
10 * file name: ustrcase.cpp
12 * tab size: 8 (not used)
15 * created on: 2002feb20
16 * created by: Markus W. Scherer
18 * Implementation file for string casing C API functions.
19 * Uses functions from uchar.c for basic functionality that requires access
20 * to the Unicode Character Database (uprops.dat).
23 #include "unicode/utypes.h"
24 #include "unicode/brkiter.h"
25 #include "unicode/casemap.h"
26 #include "unicode/edits.h"
27 #include "unicode/stringoptions.h"
28 #include "unicode/ustring.h"
29 #include "unicode/ucasemap.h"
30 #include "unicode/ubrk.h"
31 #include "unicode/utf.h"
32 #include "unicode/utf16.h"
35 #include "ucasemap_imp.h"
// Final status check used by the case-mapping entry points in this file
// (they return its result).
// While errorCode still reports success:
//   - destIndex > destCapacity sets errorCode to U_BUFFER_OVERFLOW_ERROR;
//   - otherwise, if an Edits object was supplied, edits->copyErrorTo(errorCode)
//     is called so an error recorded by the Edits object reaches the caller.
// An errorCode that already reports failure is left untouched.
// NOTE(review): the closing braces and the return statement are not visible
// in this excerpt — confirm the returned value against the full file.
43 int32_t checkOverflowAndEditsError(int32_t destIndex
, int32_t destCapacity
,
44 Edits
*edits
, UErrorCode
&errorCode
) {
45 if (U_SUCCESS(errorCode
)) {
46 if (destIndex
> destCapacity
) {
47 errorCode
= U_BUFFER_OVERFLOW_ERROR
;
48 } else if (edits
!= NULL
) {
49 edits
->copyErrorTo(errorCode
);
55 /* Appends a full case mapping result, see UCASE_MAX_STRING_LENGTH. */
// Appends the outcome of one full case mapping to dest and records it in edits.
//   result   - encoded mapping result (decoded at the top of the function)
//   s        - string data, copied one code unit at a time on the string path
//   cpLength - number of source code units the mapped code point occupied
// Behavior visible in this excerpt:
//   - an unchanged code point is recorded with edits->addUnchanged(cpLength),
//     and the U_OMIT_UNCHANGED_TEXT option is consulted on that path;
//   - a BMP value (<= 0xffff) that still fits below destCapacity is stored
//     directly into dest[destIndex++];
//   - a changed single code unit is recorded with edits->addReplace(cpLength, 1),
//     other changes with edits->addReplace(cpLength, length);
//   - returns -1 when length > INT32_MAX - destIndex, i.e. when
//     destIndex + length would overflow int32_t;
//   - a code point is written with U16_APPEND; string units are copied only
//     when destIndex + length <= destCapacity.
// NOTE(review): the return type, local declarations, several branches and the
// final return are missing from this excerpt — confirm against the full file.
57 appendResult(UChar
*dest
, int32_t destIndex
, int32_t destCapacity
,
58 int32_t result
, const UChar
*s
,
59 int32_t cpLength
, uint32_t options
, icu::Edits
*edits
) {
63 /* decode the result */
65 /* (not) original code point */
67 edits
->addUnchanged(cpLength
);
69 if(options
& U_OMIT_UNCHANGED_TEXT
) {
73 if(destIndex
<destCapacity
&& c
<=0xffff) { // BMP slightly-fastpath
74 dest
[destIndex
++]=(UChar
)c
;
79 if(result
<=UCASE_MAX_STRING_LENGTH
) {
82 } else if(destIndex
<destCapacity
&& result
<=0xffff) { // BMP slightly-fastpath
83 dest
[destIndex
++]=(UChar
)result
;
85 edits
->addReplace(cpLength
, 1);
93 edits
->addReplace(cpLength
, length
);
// Guard the destIndex + length arithmetic below against int32_t overflow.
96 if(length
>(INT32_MAX
-destIndex
)) {
97 return -1; // integer overflow
100 if(destIndex
<destCapacity
) {
101 /* append the result */
105 U16_APPEND(dest
, destIndex
, destCapacity
, c
, isError
);
107 /* overflow, nothing written */
112 if((destIndex
+length
)<=destCapacity
) {
114 dest
[destIndex
++]=*s
++;
// Appends a single UTF-16 code unit c at position destIndex of dest.
// Two cases are distinguished: destIndex < destCapacity (room available), and
// the no-room case where destIndex == INT32_MAX, for which -1 is returned
// because the index cannot be advanced without int32_t overflow.
// NOTE(review): the store into dest and the normal return value are not
// visible in this excerpt — confirm against the full file.
130 appendUChar(UChar
*dest
, int32_t destIndex
, int32_t destCapacity
, UChar c
) {
131 if(destIndex
<destCapacity
) {
133 } else if(destIndex
==INT32_MAX
) {
134 return -1; // integer overflow
// Handles a run of `length` unchanged source code units starting at s.
//   - records the run with edits->addUnchanged(length);
//   - consults the U_OMIT_UNCHANGED_TEXT option;
//   - returns -1 when length > INT32_MAX - destIndex (destIndex + length would
//     overflow int32_t);
//   - copies the run with u_memcpy only when destIndex + length <= destCapacity;
//   - returns destIndex + length, which may exceed destCapacity so that the
//     caller can detect the overflow.
// NOTE(review): the null check around edits and the body of the
// U_OMIT_UNCHANGED_TEXT branch are not visible in this excerpt.
140 appendNonEmptyUnchanged(UChar
*dest
, int32_t destIndex
, int32_t destCapacity
,
141 const UChar
*s
, int32_t length
, uint32_t options
, icu::Edits
*edits
) {
143 edits
->addUnchanged(length
);
145 if(options
& U_OMIT_UNCHANGED_TEXT
) {
148 if(length
>(INT32_MAX
-destIndex
)) {
149 return -1; // integer overflow
151 if((destIndex
+length
)<=destCapacity
) {
152 u_memcpy(dest
+destIndex
, s
, length
);
154 return destIndex
+ length
;
// Wrapper that forwards all of its arguments, unchanged and in the same
// order, to appendNonEmptyUnchanged() and returns that function's result.
// NOTE(review): the sibling's name suggests this wrapper first filters out
// empty runs, but that check is not visible in this excerpt — confirm.
158 appendUnchanged(UChar
*dest
, int32_t destIndex
, int32_t destCapacity
,
159 const UChar
*s
, int32_t length
, uint32_t options
, icu::Edits
*edits
) {
163 return appendNonEmptyUnchanged(dest
, destIndex
, destCapacity
, s
, length
, options
, edits
);
// Context iterator callback over UTF-16 text; it is passed to
// ucase_toFullLower/Upper/Title elsewhere in this file.
//   context - opaque pointer, cast here to UCaseContext*
//   dir     - selects the action; per the comments below the function either
//             resets for backward iteration (index = cpStart), resets for
//             forward iteration (index = cpLimit), or continues in the
//             current direction
// Backward steps read the previous code point with U16_PREV while
// csc->start < csc->index; forward steps read the next one with U16_NEXT
// while csc->index < csc->limit.
// NOTE(review): the tests on dir, the stored direction and the return values
// are not visible in this excerpt.
167 utf16_caseContextIterator(void *context
, int8_t dir
) {
168 UCaseContext
*csc
=(UCaseContext
*)context
;
172 /* reset for backward iteration */
173 csc
->index
=csc
->cpStart
;
176 /* reset for forward iteration */
177 csc
->index
=csc
->cpLimit
;
180 /* continue current iteration direction */
185 if(csc
->start
<csc
->index
) {
186 U16_PREV((const UChar
*)csc
->p
, csc
->start
, csc
->index
, c
);
190 if(csc
->index
<csc
->limit
) {
191 U16_NEXT((const UChar
*)csc
->p
, csc
->index
, csc
->limit
, c
);
199 * caseLocale >= 0: Lowercases [srcStart..srcLimit[ but takes context [0..srcLength[ into account.
200 * caseLocale < 0: Case-folds [srcStart..srcLimit[.
// Lowercases (caseLocale >= 0) or case-folds (caseLocale < 0) the source
// range [srcStart..srcLimit[ into dest, recording changes in edits.
//
// Structure visible in this excerpt:
//   1. A Latin lookup table is chosen: LatinCase::TO_LOWER_NORMAL when the
//      condition below holds (root locale, or a locale other than
//      Turkish/Lithuanian, or default folding options); TO_LOWER_TR_LT is the
//      other table assigned.
//   2. Fast path: for code units below LatinCase::LONG_S the table supplies the
//      delta; LatinCase::EXC leaves the fast path and a delta of 0 continues
//      with the next unit. Units >= 0xd800 also leave the fast path. Other
//      units are looked up in the trie and leave the fast path when they have
//      an exception entry.
//      A changed unit is emitted by first flushing the pending unchanged run
//      [prev..srcIndex-1[ via appendUnchanged(), then appendUChar(), and is
//      recorded with edits->addReplace(1, 1).
//   3. Slow path: a lead/trail surrogate pair is combined with
//      U16_GET_SUPPLEMENTARY; for caseLocale >= 0 the context (cpStart/cpLimit)
//      is stored in csc and ucase_toFullLower() is called; ucase_toFullFolding()
//      is the other mapping call. The pending unchanged run is flushed, then
//      the result is written with appendResult().
//   4. After the loop the remaining unchanged run [prev..srcIndex[ is flushed.
// U_INDEX_OUTOFBOUNDS_ERROR is assigned to errorCode at each point where an
// append helper result is checked (the helpers return -1 on int32_t overflow).
// NOTE(review): the outer loop, several declarations (lead, delta, trail, c,
// s), the srcIndex increments and all return statements are missing from this
// excerpt — confirm against the full file.
202 int32_t toLower(int32_t caseLocale
, uint32_t options
,
203 UChar
*dest
, int32_t destCapacity
,
204 const UChar
*src
, UCaseContext
*csc
, int32_t srcStart
, int32_t srcLimit
,
205 icu::Edits
*edits
, UErrorCode
&errorCode
) {
206 const int8_t *latinToLower
;
207 if (caseLocale
== UCASE_LOC_ROOT
||
209 !(caseLocale
== UCASE_LOC_TURKISH
|| caseLocale
== UCASE_LOC_LITHUANIAN
) :
210 (options
& _FOLD_CASE_OPTIONS_MASK
) == U_FOLD_CASE_DEFAULT
)) {
211 latinToLower
= LatinCase::TO_LOWER_NORMAL
;
213 latinToLower
= LatinCase::TO_LOWER_TR_LT
;
215 const UTrie2
*trie
= ucase_getTrie();
216 int32_t destIndex
= 0;
217 int32_t prev
= srcStart
;
218 int32_t srcIndex
= srcStart
;
220 // fast path for simple cases
222 while (srcIndex
< srcLimit
) {
223 lead
= src
[srcIndex
];
225 if (lead
< LatinCase::LONG_S
) {
226 int8_t d
= latinToLower
[lead
];
227 if (d
== LatinCase::EXC
) { break; }
229 if (d
== 0) { continue; }
231 } else if (lead
>= 0xd800) {
232 break; // surrogate or higher
234 uint16_t props
= UTRIE2_GET16_FROM_U16_SINGLE_LEAD(trie
, lead
);
235 if (UCASE_HAS_EXCEPTION(props
)) { break; }
237 if (!UCASE_IS_UPPER_OR_TITLE(props
) || (delta
= UCASE_GET_DELTA(props
)) == 0) {
// Flush the unchanged run that precedes the changed code unit.
242 destIndex
= appendUnchanged(dest
, destIndex
, destCapacity
,
243 src
+ prev
, srcIndex
- 1 - prev
, options
, edits
);
244 if (destIndex
>= 0) {
245 destIndex
= appendUChar(dest
, destIndex
, destCapacity
, lead
);
246 if (edits
!= nullptr) {
247 edits
->addReplace(1, 1);
251 errorCode
= U_INDEX_OUTOFBOUNDS_ERROR
;
256 if (srcIndex
>= srcLimit
) {
260 int32_t cpStart
= srcIndex
++;
263 if (U16_IS_LEAD(lead
) && srcIndex
< srcLimit
&& U16_IS_TRAIL(trail
= src
[srcIndex
])) {
264 c
= U16_GET_SUPPLEMENTARY(lead
, trail
);
270 if (caseLocale
>= 0) {
271 csc
->cpStart
= cpStart
;
272 csc
->cpLimit
= srcIndex
;
273 c
= ucase_toFullLower(c
, utf16_caseContextIterator
, csc
, &s
, caseLocale
);
275 c
= ucase_toFullFolding(c
, &s
, options
);
278 destIndex
= appendUnchanged(dest
, destIndex
, destCapacity
,
279 src
+ prev
, cpStart
- prev
, options
, edits
);
280 if (destIndex
>= 0) {
281 destIndex
= appendResult(dest
, destIndex
, destCapacity
, c
, s
,
282 srcIndex
- cpStart
, options
, edits
);
285 errorCode
= U_INDEX_OUTOFBOUNDS_ERROR
;
// Flush whatever unchanged text remains after the last mapped code point.
291 destIndex
= appendUnchanged(dest
, destIndex
, destCapacity
,
292 src
+ prev
, srcIndex
- prev
, options
, edits
);
294 errorCode
= U_INDEX_OUTOFBOUNDS_ERROR
;
// Uppercases src[0..srcLength[ into dest for the given caseLocale, recording
// changes in edits. (Greek is handled by GreekUpper::toUpper instead; see
// ustrcase_internalToUpper.)
//
// Structure visible in this excerpt:
//   1. LatinCase::TO_UPPER_TR is selected for UCASE_LOC_TURKISH;
//      LatinCase::TO_UPPER_NORMAL is the other table assigned.
//   2. Fast path: code units below LatinCase::LONG_S use the table
//      (LatinCase::EXC leaves the fast path, delta 0 continues); units
//      >= 0xd800 leave the fast path; other units use the trie and leave the
//      fast path on an exception entry. Only units whose type is UCASE_LOWER
//      with a non-zero delta are changed. A changed unit is emitted after
//      flushing the pending unchanged run, and recorded with
//      edits->addReplace(1, 1).
//   3. Slow path: csc->cpStart / csc->cpLimit delimit the current code point
//      (surrogate pairs are combined with U16_GET_SUPPLEMENTARY), then
//      ucase_toFullUpper() is called and its result written by appendResult().
//   4. After the loop the remaining unchanged run is flushed.
// U_INDEX_OUTOFBOUNDS_ERROR is assigned to errorCode at each point where an
// append helper result is checked (the helpers return -1 on int32_t overflow).
// NOTE(review): the outer loop, declarations (prev, lead, delta, cpStart, c,
// s, trail) and all return statements are missing from this excerpt.
300 int32_t toUpper(int32_t caseLocale
, uint32_t options
,
301 UChar
*dest
, int32_t destCapacity
,
302 const UChar
*src
, UCaseContext
*csc
, int32_t srcLength
,
303 icu::Edits
*edits
, UErrorCode
&errorCode
) {
304 const int8_t *latinToUpper
;
305 if (caseLocale
== UCASE_LOC_TURKISH
) {
306 latinToUpper
= LatinCase::TO_UPPER_TR
;
308 latinToUpper
= LatinCase::TO_UPPER_NORMAL
;
310 const UTrie2
*trie
= ucase_getTrie();
311 int32_t destIndex
= 0;
313 int32_t srcIndex
= 0;
315 // fast path for simple cases
317 while (srcIndex
< srcLength
) {
318 lead
= src
[srcIndex
];
320 if (lead
< LatinCase::LONG_S
) {
321 int8_t d
= latinToUpper
[lead
];
322 if (d
== LatinCase::EXC
) { break; }
324 if (d
== 0) { continue; }
326 } else if (lead
>= 0xd800) {
327 break; // surrogate or higher
329 uint16_t props
= UTRIE2_GET16_FROM_U16_SINGLE_LEAD(trie
, lead
);
330 if (UCASE_HAS_EXCEPTION(props
)) { break; }
332 if (UCASE_GET_TYPE(props
) != UCASE_LOWER
|| (delta
= UCASE_GET_DELTA(props
)) == 0) {
// Flush the unchanged run that precedes the changed code unit.
337 destIndex
= appendUnchanged(dest
, destIndex
, destCapacity
,
338 src
+ prev
, srcIndex
- 1 - prev
, options
, edits
);
339 if (destIndex
>= 0) {
340 destIndex
= appendUChar(dest
, destIndex
, destCapacity
, lead
);
341 if (edits
!= nullptr) {
342 edits
->addReplace(1, 1);
346 errorCode
= U_INDEX_OUTOFBOUNDS_ERROR
;
351 if (srcIndex
>= srcLength
) {
356 csc
->cpStart
= cpStart
= srcIndex
++;
359 if (U16_IS_LEAD(lead
) && srcIndex
< srcLength
&& U16_IS_TRAIL(trail
= src
[srcIndex
])) {
360 c
= U16_GET_SUPPLEMENTARY(lead
, trail
);
365 csc
->cpLimit
= srcIndex
;
367 c
= ucase_toFullUpper(c
, utf16_caseContextIterator
, csc
, &s
, caseLocale
);
369 destIndex
= appendUnchanged(dest
, destIndex
, destCapacity
,
370 src
+ prev
, cpStart
- prev
, options
, edits
);
371 if (destIndex
>= 0) {
372 destIndex
= appendResult(dest
, destIndex
, destCapacity
, c
, s
,
373 srcIndex
- cpStart
, options
, edits
);
376 errorCode
= U_INDEX_OUTOFBOUNDS_ERROR
;
// Flush whatever unchanged text remains after the last mapped code point.
382 destIndex
= appendUnchanged(dest
, destIndex
, destCapacity
,
383 src
+ prev
, srcIndex
- prev
, options
, edits
);
385 errorCode
= U_INDEX_OUTOFBOUNDS_ERROR
;
397 #if !UCONFIG_NO_BREAK_ITERATION
// Titlecases src[0..srcLength[ into dest, using the BreakIterator `iter` to
// find the segments to titlecase. Only compiled when break iteration is
// enabled.
//
// Steps visible in this excerpt:
//   - the options are validated with ustrcase_checkTitleAdjustmentOptions();
//   - the loop runs while prev < srcLength; a boundary index of UBRK_DONE or
//     beyond srcLength is special-cased;
//   - each segment [prev..index[ is split into skipped characters, the first
//     letter to titlecase, and the following characters (see the in-code
//     description below);
//   - unless U_TITLECASE_NO_BREAK_ADJUSTMENT is set, the titlecasing position
//     is moved forward to the next cased character (U_TITLECASE_ADJUST_TO_CASED)
//     or the next character accepted by ustrcase_isLNS();
//   - the first letter is mapped with ucase_toFullTitle() and written with
//     appendResult();
//   - Dutch special case: after an initial I/i (U+0049/U+0069) a following
//     'j' (U+006A) is written as 'J' (U+004A) and recorded as a 1:1
//     replacement, and a following 'J' is copied unchanged so it is not
//     lowercased;
//   - the rest of the segment is lowercased unless U_TITLECASE_NO_LOWERCASE is
//     set, in which case it is copied unchanged; a U_BUFFER_OVERFLOW_ERROR
//     left in errorCode by the lowercasing step is reset to U_ZERO_ERROR so
//     processing can continue;
//   - the result is returned through checkOverflowAndEditsError().
// U_INDEX_OUTOFBOUNDS_ERROR is assigned to errorCode at each point where an
// append helper result is checked (the helpers return -1 on int32_t overflow).
// NOTE(review): the edits parameter line, local declarations, the break
// iterator calls and several branch bodies are missing from this excerpt.
399 U_CFUNC
int32_t U_CALLCONV
400 ustrcase_internalToTitle(int32_t caseLocale
, uint32_t options
, BreakIterator
*iter
,
401 UChar
*dest
, int32_t destCapacity
,
402 const UChar
*src
, int32_t srcLength
,
404 UErrorCode
&errorCode
) {
405 if (!ustrcase_checkTitleAdjustmentOptions(options
, errorCode
)) {
409 /* set up local variables */
410 UCaseContext csc
=UCASECONTEXT_INITIALIZER
;
415 UBool isFirstIndex
=TRUE
;
417 /* titlecasing loop */
418 while(prev
<srcLength
) {
419 /* find next index where to titlecase */
427 if(index
==UBRK_DONE
|| index
>srcLength
) {
432 * Segment [prev..index[ into 3 parts:
433 * a) skipped characters (copy as-is) [prev..titleStart[
434 * b) first letter (titlecase) [titleStart..titleLimit[
435 * c) subsequent characters (lowercase) [titleLimit..index[
438 // Find and copy skipped characters [prev..titleStart[
439 int32_t titleStart
=prev
;
440 int32_t titleLimit
=prev
;
442 U16_NEXT(src
, titleLimit
, index
, c
);
443 if ((options
&U_TITLECASE_NO_BREAK_ADJUSTMENT
)==0) {
444 // Adjust the titlecasing index to the next cased character,
445 // or to the next letter/number/symbol/private use.
446 // Stop with titleStart<titleLimit<=index
447 // if there is a character to be titlecased,
448 // or else stop with titleStart==titleLimit==index.
449 UBool toCased
= (options
&U_TITLECASE_ADJUST_TO_CASED
) != 0;
450 while (toCased
? UCASE_NONE
==ucase_getType(c
) : !ustrcase_isLNS(c
)) {
451 titleStart
=titleLimit
;
452 if(titleLimit
==index
) {
455 U16_NEXT(src
, titleLimit
, index
, c
);
457 if (prev
< titleStart
) {
458 destIndex
=appendUnchanged(dest
, destIndex
, destCapacity
,
459 src
+prev
, titleStart
-prev
, options
, edits
);
461 errorCode
=U_INDEX_OUTOFBOUNDS_ERROR
;
467 if(titleStart
<titleLimit
) {
468 /* titlecase c which is from [titleStart..titleLimit[ */
469 csc
.cpStart
=titleStart
;
470 csc
.cpLimit
=titleLimit
;
472 c
=ucase_toFullTitle(c
, utf16_caseContextIterator
, &csc
, &s
, caseLocale
);
473 destIndex
=appendResult(dest
, destIndex
, destCapacity
, c
, s
,
474 titleLimit
-titleStart
, options
, edits
);
476 errorCode
=U_INDEX_OUTOFBOUNDS_ERROR
;
480 /* Special case Dutch IJ titlecasing */
481 if (titleStart
+1 < index
&&
482 caseLocale
== UCASE_LOC_DUTCH
&&
483 (src
[titleStart
] == 0x0049 || src
[titleStart
] == 0x0069)) {
484 if (src
[titleStart
+1] == 0x006A) {
485 destIndex
=appendUChar(dest
, destIndex
, destCapacity
, 0x004A);
487 errorCode
=U_INDEX_OUTOFBOUNDS_ERROR
;
491 edits
->addReplace(1, 1);
494 } else if (src
[titleStart
+1] == 0x004A) {
495 // Keep the capital J from getting lowercased.
496 destIndex
=appendUnchanged(dest
, destIndex
, destCapacity
,
497 src
+titleStart
+1, 1, options
, edits
);
499 errorCode
=U_INDEX_OUTOFBOUNDS_ERROR
;
506 /* lowercase [titleLimit..index[ */
507 if(titleLimit
<index
) {
508 if((options
&U_TITLECASE_NO_LOWERCASE
)==0) {
509 /* Normal operation: Lowercase the rest of the word. */
513 dest
+destIndex
, destCapacity
-destIndex
,
514 src
, &csc
, titleLimit
, index
,
516 if(errorCode
==U_BUFFER_OVERFLOW_ERROR
) {
517 errorCode
=U_ZERO_ERROR
;
519 if(U_FAILURE(errorCode
)) {
523 /* Optionally just copy the rest of the word unchanged. */
524 destIndex
=appendUnchanged(dest
, destIndex
, destCapacity
,
525 src
+titleLimit
, index
-titleLimit
, options
, edits
);
527 errorCode
=U_INDEX_OUTOFBOUNDS_ERROR
;
538 return checkOverflowAndEditsError(destIndex
, destCapacity
, edits
, errorCode
);
541 #endif // !UCONFIG_NO_BREAK_ITERATION
544 namespace GreekUpper
{
546 // Data generated by prototype code, see
547 // http://site.icu-project.org/design/case/greek-upper
548 // TODO: Move this data into ucase.icu.
// Per-code-point Greek letter data; getLetterData() returns
// data0370[c - 0x370] for code points up to U+03FF.
// Each visible entry ORs a base value with HAS_* property flags; callers
// extract the base value with UPPER_MASK and test the flags
// (HAS_VOWEL, HAS_ACCENT, HAS_DIALYTIKA, ...).
// NOTE(review): only the flagged entries are visible in this excerpt; the
// remaining entries of the table are missing.
549 static const uint16_t data0370
[] = {
573 0x0391 | HAS_VOWEL
| HAS_ACCENT
,
575 0x0395 | HAS_VOWEL
| HAS_ACCENT
,
576 0x0397 | HAS_VOWEL
| HAS_ACCENT
,
577 0x0399 | HAS_VOWEL
| HAS_ACCENT
,
579 0x039F | HAS_VOWEL
| HAS_ACCENT
,
581 0x03A5 | HAS_VOWEL
| HAS_ACCENT
,
582 0x03A9 | HAS_VOWEL
| HAS_ACCENT
,
583 0x0399 | HAS_VOWEL
| HAS_ACCENT
| HAS_DIALYTIKA
,
609 0x0399 | HAS_VOWEL
| HAS_DIALYTIKA
,
610 0x03A5 | HAS_VOWEL
| HAS_DIALYTIKA
,
611 0x0391 | HAS_VOWEL
| HAS_ACCENT
,
612 0x0395 | HAS_VOWEL
| HAS_ACCENT
,
613 0x0397 | HAS_VOWEL
| HAS_ACCENT
,
614 0x0399 | HAS_VOWEL
| HAS_ACCENT
,
615 0x03A5 | HAS_VOWEL
| HAS_ACCENT
| HAS_DIALYTIKA
,
641 0x0399 | HAS_VOWEL
| HAS_DIALYTIKA
,
642 0x03A5 | HAS_VOWEL
| HAS_DIALYTIKA
,
643 0x039F | HAS_VOWEL
| HAS_ACCENT
,
644 0x03A5 | HAS_VOWEL
| HAS_ACCENT
,
645 0x03A9 | HAS_VOWEL
| HAS_ACCENT
,
651 0x03D2 | HAS_DIALYTIKA
,
// Per-code-point Greek letter data; getLetterData() returns
// data1F00[c - 0x1f00] for code points from U+1F00 through U+1FFF.
// Each visible entry ORs a base value with HAS_* property flags
// (HAS_VOWEL, HAS_ACCENT, HAS_YPOGEGRAMMENI, HAS_DIALYTIKA); callers extract
// the base value with UPPER_MASK.
// NOTE(review): only the flagged entries are visible in this excerpt; the
// remaining entries of the table are missing.
697 static const uint16_t data1F00
[] = {
701 0x0391 | HAS_VOWEL
| HAS_ACCENT
,
702 0x0391 | HAS_VOWEL
| HAS_ACCENT
,
703 0x0391 | HAS_VOWEL
| HAS_ACCENT
,
704 0x0391 | HAS_VOWEL
| HAS_ACCENT
,
705 0x0391 | HAS_VOWEL
| HAS_ACCENT
,
706 0x0391 | HAS_VOWEL
| HAS_ACCENT
,
709 0x0391 | HAS_VOWEL
| HAS_ACCENT
,
710 0x0391 | HAS_VOWEL
| HAS_ACCENT
,
711 0x0391 | HAS_VOWEL
| HAS_ACCENT
,
712 0x0391 | HAS_VOWEL
| HAS_ACCENT
,
713 0x0391 | HAS_VOWEL
| HAS_ACCENT
,
714 0x0391 | HAS_VOWEL
| HAS_ACCENT
,
717 0x0395 | HAS_VOWEL
| HAS_ACCENT
,
718 0x0395 | HAS_VOWEL
| HAS_ACCENT
,
719 0x0395 | HAS_VOWEL
| HAS_ACCENT
,
720 0x0395 | HAS_VOWEL
| HAS_ACCENT
,
725 0x0395 | HAS_VOWEL
| HAS_ACCENT
,
726 0x0395 | HAS_VOWEL
| HAS_ACCENT
,
727 0x0395 | HAS_VOWEL
| HAS_ACCENT
,
728 0x0395 | HAS_VOWEL
| HAS_ACCENT
,
733 0x0397 | HAS_VOWEL
| HAS_ACCENT
,
734 0x0397 | HAS_VOWEL
| HAS_ACCENT
,
735 0x0397 | HAS_VOWEL
| HAS_ACCENT
,
736 0x0397 | HAS_VOWEL
| HAS_ACCENT
,
737 0x0397 | HAS_VOWEL
| HAS_ACCENT
,
738 0x0397 | HAS_VOWEL
| HAS_ACCENT
,
741 0x0397 | HAS_VOWEL
| HAS_ACCENT
,
742 0x0397 | HAS_VOWEL
| HAS_ACCENT
,
743 0x0397 | HAS_VOWEL
| HAS_ACCENT
,
744 0x0397 | HAS_VOWEL
| HAS_ACCENT
,
745 0x0397 | HAS_VOWEL
| HAS_ACCENT
,
746 0x0397 | HAS_VOWEL
| HAS_ACCENT
,
749 0x0399 | HAS_VOWEL
| HAS_ACCENT
,
750 0x0399 | HAS_VOWEL
| HAS_ACCENT
,
751 0x0399 | HAS_VOWEL
| HAS_ACCENT
,
752 0x0399 | HAS_VOWEL
| HAS_ACCENT
,
753 0x0399 | HAS_VOWEL
| HAS_ACCENT
,
754 0x0399 | HAS_VOWEL
| HAS_ACCENT
,
757 0x0399 | HAS_VOWEL
| HAS_ACCENT
,
758 0x0399 | HAS_VOWEL
| HAS_ACCENT
,
759 0x0399 | HAS_VOWEL
| HAS_ACCENT
,
760 0x0399 | HAS_VOWEL
| HAS_ACCENT
,
761 0x0399 | HAS_VOWEL
| HAS_ACCENT
,
762 0x0399 | HAS_VOWEL
| HAS_ACCENT
,
765 0x039F | HAS_VOWEL
| HAS_ACCENT
,
766 0x039F | HAS_VOWEL
| HAS_ACCENT
,
767 0x039F | HAS_VOWEL
| HAS_ACCENT
,
768 0x039F | HAS_VOWEL
| HAS_ACCENT
,
773 0x039F | HAS_VOWEL
| HAS_ACCENT
,
774 0x039F | HAS_VOWEL
| HAS_ACCENT
,
775 0x039F | HAS_VOWEL
| HAS_ACCENT
,
776 0x039F | HAS_VOWEL
| HAS_ACCENT
,
781 0x03A5 | HAS_VOWEL
| HAS_ACCENT
,
782 0x03A5 | HAS_VOWEL
| HAS_ACCENT
,
783 0x03A5 | HAS_VOWEL
| HAS_ACCENT
,
784 0x03A5 | HAS_VOWEL
| HAS_ACCENT
,
785 0x03A5 | HAS_VOWEL
| HAS_ACCENT
,
786 0x03A5 | HAS_VOWEL
| HAS_ACCENT
,
790 0x03A5 | HAS_VOWEL
| HAS_ACCENT
,
792 0x03A5 | HAS_VOWEL
| HAS_ACCENT
,
794 0x03A5 | HAS_VOWEL
| HAS_ACCENT
,
797 0x03A9 | HAS_VOWEL
| HAS_ACCENT
,
798 0x03A9 | HAS_VOWEL
| HAS_ACCENT
,
799 0x03A9 | HAS_VOWEL
| HAS_ACCENT
,
800 0x03A9 | HAS_VOWEL
| HAS_ACCENT
,
801 0x03A9 | HAS_VOWEL
| HAS_ACCENT
,
802 0x03A9 | HAS_VOWEL
| HAS_ACCENT
,
805 0x03A9 | HAS_VOWEL
| HAS_ACCENT
,
806 0x03A9 | HAS_VOWEL
| HAS_ACCENT
,
807 0x03A9 | HAS_VOWEL
| HAS_ACCENT
,
808 0x03A9 | HAS_VOWEL
| HAS_ACCENT
,
809 0x03A9 | HAS_VOWEL
| HAS_ACCENT
,
810 0x03A9 | HAS_VOWEL
| HAS_ACCENT
,
811 0x0391 | HAS_VOWEL
| HAS_ACCENT
,
812 0x0391 | HAS_VOWEL
| HAS_ACCENT
,
813 0x0395 | HAS_VOWEL
| HAS_ACCENT
,
814 0x0395 | HAS_VOWEL
| HAS_ACCENT
,
815 0x0397 | HAS_VOWEL
| HAS_ACCENT
,
816 0x0397 | HAS_VOWEL
| HAS_ACCENT
,
817 0x0399 | HAS_VOWEL
| HAS_ACCENT
,
818 0x0399 | HAS_VOWEL
| HAS_ACCENT
,
819 0x039F | HAS_VOWEL
| HAS_ACCENT
,
820 0x039F | HAS_VOWEL
| HAS_ACCENT
,
821 0x03A5 | HAS_VOWEL
| HAS_ACCENT
,
822 0x03A5 | HAS_VOWEL
| HAS_ACCENT
,
823 0x03A9 | HAS_VOWEL
| HAS_ACCENT
,
824 0x03A9 | HAS_VOWEL
| HAS_ACCENT
,
827 0x0391 | HAS_VOWEL
| HAS_YPOGEGRAMMENI
,
828 0x0391 | HAS_VOWEL
| HAS_YPOGEGRAMMENI
,
829 0x0391 | HAS_VOWEL
| HAS_YPOGEGRAMMENI
| HAS_ACCENT
,
830 0x0391 | HAS_VOWEL
| HAS_YPOGEGRAMMENI
| HAS_ACCENT
,
831 0x0391 | HAS_VOWEL
| HAS_YPOGEGRAMMENI
| HAS_ACCENT
,
832 0x0391 | HAS_VOWEL
| HAS_YPOGEGRAMMENI
| HAS_ACCENT
,
833 0x0391 | HAS_VOWEL
| HAS_YPOGEGRAMMENI
| HAS_ACCENT
,
834 0x0391 | HAS_VOWEL
| HAS_YPOGEGRAMMENI
| HAS_ACCENT
,
835 0x0391 | HAS_VOWEL
| HAS_YPOGEGRAMMENI
,
836 0x0391 | HAS_VOWEL
| HAS_YPOGEGRAMMENI
,
837 0x0391 | HAS_VOWEL
| HAS_YPOGEGRAMMENI
| HAS_ACCENT
,
838 0x0391 | HAS_VOWEL
| HAS_YPOGEGRAMMENI
| HAS_ACCENT
,
839 0x0391 | HAS_VOWEL
| HAS_YPOGEGRAMMENI
| HAS_ACCENT
,
840 0x0391 | HAS_VOWEL
| HAS_YPOGEGRAMMENI
| HAS_ACCENT
,
841 0x0391 | HAS_VOWEL
| HAS_YPOGEGRAMMENI
| HAS_ACCENT
,
842 0x0391 | HAS_VOWEL
| HAS_YPOGEGRAMMENI
| HAS_ACCENT
,
843 0x0397 | HAS_VOWEL
| HAS_YPOGEGRAMMENI
,
844 0x0397 | HAS_VOWEL
| HAS_YPOGEGRAMMENI
,
845 0x0397 | HAS_VOWEL
| HAS_YPOGEGRAMMENI
| HAS_ACCENT
,
846 0x0397 | HAS_VOWEL
| HAS_YPOGEGRAMMENI
| HAS_ACCENT
,
847 0x0397 | HAS_VOWEL
| HAS_YPOGEGRAMMENI
| HAS_ACCENT
,
848 0x0397 | HAS_VOWEL
| HAS_YPOGEGRAMMENI
| HAS_ACCENT
,
849 0x0397 | HAS_VOWEL
| HAS_YPOGEGRAMMENI
| HAS_ACCENT
,
850 0x0397 | HAS_VOWEL
| HAS_YPOGEGRAMMENI
| HAS_ACCENT
,
851 0x0397 | HAS_VOWEL
| HAS_YPOGEGRAMMENI
,
852 0x0397 | HAS_VOWEL
| HAS_YPOGEGRAMMENI
,
853 0x0397 | HAS_VOWEL
| HAS_YPOGEGRAMMENI
| HAS_ACCENT
,
854 0x0397 | HAS_VOWEL
| HAS_YPOGEGRAMMENI
| HAS_ACCENT
,
855 0x0397 | HAS_VOWEL
| HAS_YPOGEGRAMMENI
| HAS_ACCENT
,
856 0x0397 | HAS_VOWEL
| HAS_YPOGEGRAMMENI
| HAS_ACCENT
,
857 0x0397 | HAS_VOWEL
| HAS_YPOGEGRAMMENI
| HAS_ACCENT
,
858 0x0397 | HAS_VOWEL
| HAS_YPOGEGRAMMENI
| HAS_ACCENT
,
859 0x03A9 | HAS_VOWEL
| HAS_YPOGEGRAMMENI
,
860 0x03A9 | HAS_VOWEL
| HAS_YPOGEGRAMMENI
,
861 0x03A9 | HAS_VOWEL
| HAS_YPOGEGRAMMENI
| HAS_ACCENT
,
862 0x03A9 | HAS_VOWEL
| HAS_YPOGEGRAMMENI
| HAS_ACCENT
,
863 0x03A9 | HAS_VOWEL
| HAS_YPOGEGRAMMENI
| HAS_ACCENT
,
864 0x03A9 | HAS_VOWEL
| HAS_YPOGEGRAMMENI
| HAS_ACCENT
,
865 0x03A9 | HAS_VOWEL
| HAS_YPOGEGRAMMENI
| HAS_ACCENT
,
866 0x03A9 | HAS_VOWEL
| HAS_YPOGEGRAMMENI
| HAS_ACCENT
,
867 0x03A9 | HAS_VOWEL
| HAS_YPOGEGRAMMENI
,
868 0x03A9 | HAS_VOWEL
| HAS_YPOGEGRAMMENI
,
869 0x03A9 | HAS_VOWEL
| HAS_YPOGEGRAMMENI
| HAS_ACCENT
,
870 0x03A9 | HAS_VOWEL
| HAS_YPOGEGRAMMENI
| HAS_ACCENT
,
871 0x03A9 | HAS_VOWEL
| HAS_YPOGEGRAMMENI
| HAS_ACCENT
,
872 0x03A9 | HAS_VOWEL
| HAS_YPOGEGRAMMENI
| HAS_ACCENT
,
873 0x03A9 | HAS_VOWEL
| HAS_YPOGEGRAMMENI
| HAS_ACCENT
,
874 0x03A9 | HAS_VOWEL
| HAS_YPOGEGRAMMENI
| HAS_ACCENT
,
877 0x0391 | HAS_VOWEL
| HAS_YPOGEGRAMMENI
| HAS_ACCENT
,
878 0x0391 | HAS_VOWEL
| HAS_YPOGEGRAMMENI
,
879 0x0391 | HAS_VOWEL
| HAS_YPOGEGRAMMENI
| HAS_ACCENT
,
881 0x0391 | HAS_VOWEL
| HAS_ACCENT
,
882 0x0391 | HAS_VOWEL
| HAS_YPOGEGRAMMENI
| HAS_ACCENT
,
885 0x0391 | HAS_VOWEL
| HAS_ACCENT
,
886 0x0391 | HAS_VOWEL
| HAS_ACCENT
,
887 0x0391 | HAS_VOWEL
| HAS_YPOGEGRAMMENI
,
893 0x0397 | HAS_VOWEL
| HAS_YPOGEGRAMMENI
| HAS_ACCENT
,
894 0x0397 | HAS_VOWEL
| HAS_YPOGEGRAMMENI
,
895 0x0397 | HAS_VOWEL
| HAS_YPOGEGRAMMENI
| HAS_ACCENT
,
897 0x0397 | HAS_VOWEL
| HAS_ACCENT
,
898 0x0397 | HAS_VOWEL
| HAS_YPOGEGRAMMENI
| HAS_ACCENT
,
899 0x0395 | HAS_VOWEL
| HAS_ACCENT
,
900 0x0395 | HAS_VOWEL
| HAS_ACCENT
,
901 0x0397 | HAS_VOWEL
| HAS_ACCENT
,
902 0x0397 | HAS_VOWEL
| HAS_ACCENT
,
903 0x0397 | HAS_VOWEL
| HAS_YPOGEGRAMMENI
,
909 0x0399 | HAS_VOWEL
| HAS_ACCENT
| HAS_DIALYTIKA
,
910 0x0399 | HAS_VOWEL
| HAS_ACCENT
| HAS_DIALYTIKA
,
913 0x0399 | HAS_VOWEL
| HAS_ACCENT
,
914 0x0399 | HAS_VOWEL
| HAS_ACCENT
| HAS_DIALYTIKA
,
917 0x0399 | HAS_VOWEL
| HAS_ACCENT
,
918 0x0399 | HAS_VOWEL
| HAS_ACCENT
,
925 0x03A5 | HAS_VOWEL
| HAS_ACCENT
| HAS_DIALYTIKA
,
926 0x03A5 | HAS_VOWEL
| HAS_ACCENT
| HAS_DIALYTIKA
,
929 0x03A5 | HAS_VOWEL
| HAS_ACCENT
,
930 0x03A5 | HAS_VOWEL
| HAS_ACCENT
| HAS_DIALYTIKA
,
933 0x03A5 | HAS_VOWEL
| HAS_ACCENT
,
934 0x03A5 | HAS_VOWEL
| HAS_ACCENT
,
941 0x03A9 | HAS_VOWEL
| HAS_YPOGEGRAMMENI
| HAS_ACCENT
,
942 0x03A9 | HAS_VOWEL
| HAS_YPOGEGRAMMENI
,
943 0x03A9 | HAS_VOWEL
| HAS_YPOGEGRAMMENI
| HAS_ACCENT
,
945 0x03A9 | HAS_VOWEL
| HAS_ACCENT
,
946 0x03A9 | HAS_VOWEL
| HAS_YPOGEGRAMMENI
| HAS_ACCENT
,
947 0x039F | HAS_VOWEL
| HAS_ACCENT
,
948 0x039F | HAS_VOWEL
| HAS_ACCENT
,
949 0x03A9 | HAS_VOWEL
| HAS_ACCENT
,
950 0x03A9 | HAS_VOWEL
| HAS_ACCENT
,
951 0x03A9 | HAS_VOWEL
| HAS_YPOGEGRAMMENI
,
// Letter data for the single code point U+2126, which getLetterData() tests
// for separately: base value 0x03A9 combined with the HAS_VOWEL flag.
958 static const uint16_t data2126
= 0x03A9 | HAS_VOWEL
;
// Looks up the Greek letter data word for code point c.
//   - The first test singles out every c outside the covered ranges:
//     below U+0370, above U+2126, or strictly between U+03FF and U+1F00.
//   - c <= U+03FF (hence within U+0370..U+03FF) -> data0370[c - 0x370]
//   - c <= U+1FFF (hence within U+1F00..U+1FFF) -> data1F00[c - 0x1f00]
//   - c == U+2126 is handled as its own case.
// NOTE(review): the return values of the first and last branches, and of the
// fall-through for U+2000..U+2125, are not visible in this excerpt.
960 uint32_t getLetterData(UChar32 c
) {
961 if (c
< 0x370 || 0x2126 < c
|| (0x3ff < c
&& c
< 0x1f00)) {
963 } else if (c
<= 0x3ff) {
964 return data0370
[c
- 0x370];
965 } else if (c
<= 0x1fff) {
966 return data1F00
[c
- 0x1f00];
967 } else if (c
== 0x2126) {
// Classifies a combining mark c that may follow a Greek letter and returns
// the matching flag bits:
//   - U+0308                     -> HAS_COMBINING_DIALYTIKA
//   - U+0344                     -> HAS_COMBINING_DIALYTIKA | HAS_ACCENT
//   - U+0345                     -> HAS_YPOGEGRAMMENI
//   - U+0304, U+0306, U+0313,
//     U+0314, U+0343             -> HAS_OTHER_GREEK_DIACRITIC
//   - U+0300, U+0301, U+0342, U+0302, U+0303, U+0311 form one group of
//     accent-like marks.
// The caller treats a return value of 0 as "not a Greek diacritic".
// NOTE(review): the switch header, the return for the accent group and the
// default case are not visible in this excerpt.
974 uint32_t getDiacriticData(UChar32 c
) {
976 case 0x0300: // varia
977 case 0x0301: // tonos = oxia
978 case 0x0342: // perispomeni
979 case 0x0302: // circumflex can look like perispomeni
980 case 0x0303: // tilde can look like perispomeni
981 case 0x0311: // inverted breve can look like perispomeni
983 case 0x0308: // dialytika = diaeresis
984 return HAS_COMBINING_DIALYTIKA
;
985 case 0x0344: // dialytika tonos
986 return HAS_COMBINING_DIALYTIKA
| HAS_ACCENT
;
987 case 0x0345: // ypogegrammeni = iota subscript
988 return HAS_YPOGEGRAMMENI
;
989 case 0x0304: // macron
990 case 0x0306: // breve
991 case 0x0313: // comma above
992 case 0x0314: // reversed comma above
993 case 0x0343: // koronis
994 return HAS_OTHER_GREEK_DIACRITIC
;
// Reports whether the text s[i..length[ continues with a cased letter.
// Code points are read one at a time with U16_NEXT:
//   - case-ignorable code points are skipped;
//   - the first non-ignorable code point decides: TRUE if it is cased
//     (type != UCASE_NONE), FALSE otherwise;
//   - FALSE if the end of the text is reached first.
1000 UBool
isFollowedByCasedLetter(const UChar
*s
, int32_t i
, int32_t length
) {
1001 while (i
< length
) {
1003 U16_NEXT(s
, i
, length
, c
);
1004 int32_t type
= ucase_getTypeOrIgnorable(c
);
1005 if ((type
& UCASE_IGNORABLE
) != 0) {
1006 // Case-ignorable, continue with the loop.
1007 } else if (type
!= UCASE_NONE
) {
1008 return TRUE
; // Followed by cased letter.
1010 return FALSE
; // Uncased and not case-ignorable.
1013 return FALSE
; // Not followed by cased letter.
1017 * Greek string uppercasing with a state machine.
1018 * Probably simpler than a stateless function that has to figure out complex context-before
1019 * for each character.
1020 * TODO: Try to re-consolidate one way or another with the non-Greek function.
// Greek-specific uppercasing of src[0..srcLength[ into dest.
//
// For each code point (read with U16_NEXT) the loop:
//   - derives nextState: a case-ignorable code point carries AFTER_CASED over
//     from the current state, a cased one sets AFTER_CASED;
//   - fetches the letter data with getLetterData() and extracts the base
//     uppercase value with UPPER_MASK;
//   - sets HAS_DIALYTIKA on an iota/upsilon (0x399 / 0x3A5) that follows a
//     vowel whose accent was removed (AFTER_VOWEL_WITH_ACCENT);
//   - consumes following combining marks while getDiacriticData() returns a
//     non-zero value, merging their flags into data;
//   - sets AFTER_VOWEL_WITH_ACCENT in nextState for a vowel with an accent
//     but without dialytika;
//   - special-cases eta (0x397) with an accent that stands alone as a word
//     (not after a cased letter and not followed by one);
//   - when dialytika is present, switches upper to a precomposed form for
//     iota/upsilon and clears the dialytika flags;
//   - decides whether the text changes (always treated as changed when there
//     is no Edits object and unchanged text is not omitted), records
//     addReplace(oldLength, newLength) or addUnchanged(oldLength), and
//     writes the base letter, an optional U+0308, an optional U+0301 and
//     one U+0399 per ypogegrammeni with appendUChar();
//   - code points handled outside that Greek-letter path are mapped with
//     ucase_toFullUpper(..., UCASE_LOC_GREEK) and written with appendResult().
// U_INDEX_OUTOFBOUNDS_ERROR is assigned to errorCode at each point where an
// append helper result is checked (the helpers return -1 on int32_t overflow).
// NOTE(review): the edits parameter line, the declarations of state, c, s,
// change and i2, several assignments (precomposed code points, addTonos) and
// all return statements are missing from this excerpt.
1022 int32_t toUpper(uint32_t options
,
1023 UChar
*dest
, int32_t destCapacity
,
1024 const UChar
*src
, int32_t srcLength
,
1026 UErrorCode
&errorCode
) {
1027 int32_t destIndex
=0;
1029 for (int32_t i
= 0; i
< srcLength
;) {
1030 int32_t nextIndex
= i
;
1032 U16_NEXT(src
, nextIndex
, srcLength
, c
);
1033 uint32_t nextState
= 0;
1034 int32_t type
= ucase_getTypeOrIgnorable(c
);
1035 if ((type
& UCASE_IGNORABLE
) != 0) {
1036 // c is case-ignorable
1037 nextState
|= (state
& AFTER_CASED
);
1038 } else if (type
!= UCASE_NONE
) {
1040 nextState
|= AFTER_CASED
;
1042 uint32_t data
= getLetterData(c
);
1044 uint32_t upper
= data
& UPPER_MASK
;
1045 // Add a dialytika to this iota or ypsilon vowel
1046 // if we removed a tonos from the previous vowel,
1047 // and that previous vowel did not also have (or gain) a dialytika.
1048 // Adding one only to the final vowel in a longer sequence
1049 // (which does not occur in normal writing) would require lookahead.
1050 // Set the same flag as for preserving an existing dialytika.
1051 if ((data
& HAS_VOWEL
) != 0 && (state
& AFTER_VOWEL_WITH_ACCENT
) != 0 &&
1052 (upper
== 0x399 || upper
== 0x3A5)) {
1053 data
|= HAS_DIALYTIKA
;
1055 int32_t numYpogegrammeni
= 0; // Map each one to a trailing, spacing, capital iota.
1056 if ((data
& HAS_YPOGEGRAMMENI
) != 0) {
1057 numYpogegrammeni
= 1;
1059 // Skip combining diacritics after this Greek letter.
1060 while (nextIndex
< srcLength
) {
1061 uint32_t diacriticData
= getDiacriticData(src
[nextIndex
]);
1062 if (diacriticData
!= 0) {
1063 data
|= diacriticData
;
1064 if ((diacriticData
& HAS_YPOGEGRAMMENI
) != 0) {
1069 break; // not a Greek diacritic
1072 if ((data
& HAS_VOWEL_AND_ACCENT_AND_DIALYTIKA
) == HAS_VOWEL_AND_ACCENT
) {
1073 nextState
|= AFTER_VOWEL_WITH_ACCENT
;
1075 // Map according to Greek rules.
1076 UBool addTonos
= FALSE
;
1077 if (upper
== 0x397 &&
1078 (data
& HAS_ACCENT
) != 0 &&
1079 numYpogegrammeni
== 0 &&
1080 (state
& AFTER_CASED
) == 0 &&
1081 !isFollowedByCasedLetter(src
, nextIndex
, srcLength
)) {
1082 // Keep disjunctive "or" with (only) a tonos.
1083 // We use the same "word boundary" conditions as for the Final_Sigma test.
1084 if (i
== nextIndex
) {
1085 upper
= 0x389; // Preserve the precomposed form.
1089 } else if ((data
& HAS_DIALYTIKA
) != 0) {
1090 // Preserve a vowel with dialytika in precomposed form if it exists.
1091 if (upper
== 0x399) {
1093 data
&= ~HAS_EITHER_DIALYTIKA
;
1094 } else if (upper
== 0x3A5) {
1096 data
&= ~HAS_EITHER_DIALYTIKA
;
1101 if (edits
== nullptr && (options
& U_OMIT_UNCHANGED_TEXT
) == 0) {
1102 change
= TRUE
; // common, simple usage
1104 // Find out first whether we are changing the text.
1105 change
= src
[i
] != upper
|| numYpogegrammeni
> 0;
1107 if ((data
& HAS_EITHER_DIALYTIKA
) != 0) {
1108 change
|= i2
>= nextIndex
|| src
[i2
] != 0x308;
1112 change
|= i2
>= nextIndex
|| src
[i2
] != 0x301;
1115 int32_t oldLength
= nextIndex
- i
;
1116 int32_t newLength
= (i2
- i
) + numYpogegrammeni
;
1117 change
|= oldLength
!= newLength
;
1119 if (edits
!= NULL
) {
1120 edits
->addReplace(oldLength
, newLength
);
1123 if (edits
!= NULL
) {
1124 edits
->addUnchanged(oldLength
);
1126 // Write unchanged text?
1127 change
= (options
& U_OMIT_UNCHANGED_TEXT
) == 0;
1132 destIndex
=appendUChar(dest
, destIndex
, destCapacity
, (UChar
)upper
);
1133 if (destIndex
>= 0 && (data
& HAS_EITHER_DIALYTIKA
) != 0) {
1134 destIndex
=appendUChar(dest
, destIndex
, destCapacity
, 0x308); // restore or add a dialytika
1136 if (destIndex
>= 0 && addTonos
) {
1137 destIndex
=appendUChar(dest
, destIndex
, destCapacity
, 0x301);
1139 while (destIndex
>= 0 && numYpogegrammeni
> 0) {
1140 destIndex
=appendUChar(dest
, destIndex
, destCapacity
, 0x399);
1144 errorCode
=U_INDEX_OUTOFBOUNDS_ERROR
;
1150 c
=ucase_toFullUpper(c
, NULL
, NULL
, &s
, UCASE_LOC_GREEK
);
1151 destIndex
= appendResult(dest
, destIndex
, destCapacity
, c
, s
,
1152 nextIndex
- i
, options
, edits
);
1153 if (destIndex
< 0) {
1154 errorCode
= U_INDEX_OUTOFBOUNDS_ERROR
;
1165 } // namespace GreekUpper
1168 /* functions available in the common library (for unistr_case.cpp) */
// Lowercasing entry point; its own comment says it is used by unistr_case.cpp.
// Sets up a UCaseContext whose limit is srcLength, lowercases the whole
// source (range 0..srcLength) with toLower(), and returns the result of
// checkOverflowAndEditsError(), which turns a too-small destination into
// U_BUFFER_OVERFLOW_ERROR and propagates any Edits error.
// NOTE(review): the edits parameter line, the csc.p assignment and part of
// the toLower() argument list are missing from this excerpt.
1170 U_CFUNC
int32_t U_CALLCONV
1171 ustrcase_internalToLower(int32_t caseLocale
, uint32_t options
, UCASEMAP_BREAK_ITERATOR_UNUSED
1172 UChar
*dest
, int32_t destCapacity
,
1173 const UChar
*src
, int32_t srcLength
,
1175 UErrorCode
&errorCode
) {
1176 UCaseContext csc
=UCASECONTEXT_INITIALIZER
;
1178 csc
.limit
=srcLength
;
1179 int32_t destIndex
= toLower(
1180 caseLocale
, options
,
1182 src
, &csc
, 0, srcLength
,
1184 return checkOverflowAndEditsError(destIndex
, destCapacity
, edits
, errorCode
);
// Uppercasing entry point.
// For UCASE_LOC_GREEK the Greek-specific GreekUpper::toUpper() is used;
// the other path sets up a UCaseContext whose limit is srcLength and calls
// the generic toUpper(). Either way the result is returned through
// checkOverflowAndEditsError(), which turns a too-small destination into
// U_BUFFER_OVERFLOW_ERROR and propagates any Edits error.
// NOTE(review): the edits parameter line, the destIndex declaration, the
// else keyword and part of the toUpper() argument list are missing from
// this excerpt.
1187 U_CFUNC
int32_t U_CALLCONV
1188 ustrcase_internalToUpper(int32_t caseLocale
, uint32_t options
, UCASEMAP_BREAK_ITERATOR_UNUSED
1189 UChar
*dest
, int32_t destCapacity
,
1190 const UChar
*src
, int32_t srcLength
,
1192 UErrorCode
&errorCode
) {
1194 if (caseLocale
== UCASE_LOC_GREEK
) {
1195 destIndex
= GreekUpper::toUpper(options
, dest
, destCapacity
,
1196 src
, srcLength
, edits
, errorCode
);
1198 UCaseContext csc
=UCASECONTEXT_INITIALIZER
;
1200 csc
.limit
=srcLength
;
1201 destIndex
= toUpper(
1202 caseLocale
, options
,
1204 src
, &csc
, srcLength
,
1207 return checkOverflowAndEditsError(destIndex
, destCapacity
, edits
, errorCode
);
// Case-folding entry point.
// The caseLocale parameter is unnamed (commented out) and therefore unused.
// Folding is delegated to toLower() over the whole source (range
// 0..srcLength) with a null UCaseContext; the result is returned through
// checkOverflowAndEditsError().
// NOTE(review): the edits parameter line and the first arguments passed to
// toLower() are missing from this excerpt; per the comment before toLower(),
// a negative caseLocale selects case folding — confirm the value passed.
1210 U_CFUNC
int32_t U_CALLCONV
1211 ustrcase_internalFold(int32_t /* caseLocale */, uint32_t options
, UCASEMAP_BREAK_ITERATOR_UNUSED
1212 UChar
*dest
, int32_t destCapacity
,
1213 const UChar
*src
, int32_t srcLength
,
1215 UErrorCode
&errorCode
) {
1216 int32_t destIndex
= toLower(
1219 src
, nullptr, 0, srcLength
,
1221 return checkOverflowAndEditsError(destIndex
, destCapacity
, edits
, errorCode
);
// Common driver for the string case-mapping functions: validates the
// arguments, then runs `stringCaseMapper` and NUL-terminates the output.
//
// Steps visible in this excerpt:
//   - a failure already present in errorCode is tested first;
//   - a negative destCapacity, or a NULL dest together with a positive
//     destCapacity, is rejected with U_ILLEGAL_ARGUMENT_ERROR;
//   - the source length is computed with u_strlen(src) when needed;
//   - overlapping source and destination ranges are rejected with
//     U_ILLEGAL_ARGUMENT_ERROR;
//   - a supplied Edits object is handled specially unless U_EDITS_NO_RESET
//     is set;
//   - the mapper is called with the Edits object, and the result is passed
//     to u_terminateUChars().
// NOTE(review): the return type, the edits parameter line, parts of the
// argument checks, the early returns and the body of the Edits branch are
// missing from this excerpt.
1225 ustrcase_map(int32_t caseLocale
, uint32_t options
, UCASEMAP_BREAK_ITERATOR_PARAM
1226 UChar
*dest
, int32_t destCapacity
,
1227 const UChar
*src
, int32_t srcLength
,
1228 UStringCaseMapper
*stringCaseMapper
,
1230 UErrorCode
&errorCode
) {
1233 /* check argument values */
1234 if(U_FAILURE(errorCode
)) {
1237 if( destCapacity
<0 ||
1238 (dest
==NULL
&& destCapacity
>0) ||
1242 errorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
1246 /* get the string length */
1248 srcLength
=u_strlen(src
);
1251 /* check for overlapping source and destination */
1253 ((src
>=dest
&& src
<(dest
+destCapacity
)) ||
1254 (dest
>=src
&& dest
<(src
+srcLength
)))
1256 errorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
1260 if (edits
!= nullptr && (options
& U_EDITS_NO_RESET
) == 0) {
1263 destLength
=stringCaseMapper(caseLocale
, options
, UCASEMAP_BREAK_ITERATOR
1264 dest
, destCapacity
, src
, srcLength
, edits
, errorCode
);
1265 return u_terminateUChars(dest
, destCapacity
, destLength
, &errorCode
);
// Variant of ustrcase_map() that tolerates overlapping source and
// destination buffers instead of rejecting them.
//
// Steps visible in this excerpt:
//   - a failure already present in errorCode is tested first;
//   - a negative destCapacity, or a NULL dest together with a positive
//     destCapacity, is rejected with U_ILLEGAL_ARGUMENT_ERROR;
//   - the source length is computed with u_strlen(src) when needed;
//   - when the ranges overlap, output goes to a temporary buffer: the local
//     stack buffer if destCapacity fits in it, otherwise one obtained from
//     uprv_malloc(destCapacity*U_SIZEOF_UCHAR), with
//     U_MEMORY_ALLOCATION_ERROR as the allocation-failure code;
//   - the mapper is called with a NULL Edits pointer;
//   - on success with 0 < destLength <= destCapacity the result is copied
//     into dest with u_memmove();
//   - the result is passed to u_terminateUChars().
// NOTE(review): the return type, local declarations (buffer, temp,
// destLength), the non-overlap branch, the release of the heap buffer and
// the early returns are missing from this excerpt — in particular confirm
// that the allocated buffer is freed on every path.
1269 ustrcase_mapWithOverlap(int32_t caseLocale
, uint32_t options
, UCASEMAP_BREAK_ITERATOR_PARAM
1270 UChar
*dest
, int32_t destCapacity
,
1271 const UChar
*src
, int32_t srcLength
,
1272 UStringCaseMapper
*stringCaseMapper
,
1273 UErrorCode
&errorCode
) {
1279 /* check argument values */
1280 if(U_FAILURE(errorCode
)) {
1283 if( destCapacity
<0 ||
1284 (dest
==NULL
&& destCapacity
>0) ||
1288 errorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
1292 /* get the string length */
1294 srcLength
=u_strlen(src
);
1297 /* check for overlapping source and destination */
1299 ((src
>=dest
&& src
<(dest
+destCapacity
)) ||
1300 (dest
>=src
&& dest
<(src
+srcLength
)))
1302 /* overlap: provide a temporary destination buffer and later copy the result */
1303 if(destCapacity
<=UPRV_LENGTHOF(buffer
)) {
1304 /* the stack buffer is large enough */
1307 /* allocate a buffer */
1308 temp
=(UChar
*)uprv_malloc(destCapacity
*U_SIZEOF_UCHAR
);
1310 errorCode
=U_MEMORY_ALLOCATION_ERROR
;
1318 destLength
=stringCaseMapper(caseLocale
, options
, UCASEMAP_BREAK_ITERATOR
1319 temp
, destCapacity
, src
, srcLength
, NULL
, errorCode
);
1321 /* copy the result string to the destination buffer */
1322 if (U_SUCCESS(errorCode
) && 0 < destLength
&& destLength
<= destCapacity
) {
1323 u_memmove(dest
, temp
, destLength
);
1330 return u_terminateUChars(dest
, destCapacity
, destLength
, &errorCode
);
1333 /* public API functions */
// Public C API: case-folds src into dest.
// Thin wrapper that forwards to ustrcase_mapWithOverlap() with the root case
// locale (UCASE_LOC_ROOT), no break iterator, and ustrcase_internalFold as
// the mapper; the status is reported through *pErrorCode.
// NOTE(review): the options parameter line and the buffer arguments of the
// forwarded call are missing from this excerpt.
1335 U_CAPI
int32_t U_EXPORT2
1336 u_strFoldCase(UChar
*dest
, int32_t destCapacity
,
1337 const UChar
*src
, int32_t srcLength
,
1339 UErrorCode
*pErrorCode
) {
1340 return ustrcase_mapWithOverlap(
1341 UCASE_LOC_ROOT
, options
, UCASEMAP_BREAK_ITERATOR_NULL
1344 ustrcase_internalFold
, *pErrorCode
);
// C++ API: case-folds src into dest and optionally records the changes in
// edits.
// Thin wrapper that forwards to ustrcase_map() with the root case locale
// (UCASE_LOC_ROOT), no break iterator, and ustrcase_internalFold as the
// mapper. ustrcase_map() is the variant that rejects overlapping buffers.
// NOTE(review): the options parameter line and the buffer arguments of the
// forwarded call are missing from this excerpt.
1349 int32_t CaseMap::fold(
1351 const UChar
*src
, int32_t srcLength
,
1352 UChar
*dest
, int32_t destCapacity
, Edits
*edits
,
1353 UErrorCode
&errorCode
) {
1354 return ustrcase_map(
1355 UCASE_LOC_ROOT
, options
, UCASEMAP_BREAK_ITERATOR_NULL
1358 ustrcase_internalFold
, edits
, errorCode
);
1363 /* case-insensitive string comparisons -------------------------------------- */
1366 * This function is a copy of unorm_cmpEquivFold() minus the parts for
1367 * canonical equivalence.
1368 * Keep the functions in sync, and see there for how this works.
1369 * The duplication is for modularization:
1370 * It makes caseless (but not canonical caseless) matches independent of
1371 * the normalization code.
1374 /* stack element for previous-level source/decomposition pointers */
// One saved level of string position for the case-insensitive comparison:
// three pointers into a UChar buffer (start, current position s, limit).
// _cmpFold() keeps two-element stacks of these (stack1, stack2), one per
// input string.
// The typedef lets the type be named without the `struct` keyword.
1375 struct CmpEquivLevel
{
1376 const UChar
*start
, *s
, *limit
;
1378 typedef struct CmpEquivLevel CmpEquivLevel
;
1381 * Internal implementation code comparing string with case fold.
1382 * This function is called from u_strcmpFold() and u_caseInsensitivePrefixMatch().
1384 * @param s1 input string 1
1385 * @param length1 length of string 1, or -1 (NUL-terminated)
1386 * @param s2 input string 2
1387 * @param length2 length of string 2, or -1 (NUL-terminated)
1388 * @param options compare options
1389 * @param matchLen1 (output) length of partial prefix match in s1
1390 * @param matchLen2 (output) length of partial prefix match in s2
1391 * @param pErrorCode receives error status
1392 * @return The result of comparison
/*
 * Overview: walks both strings one code unit at a time. When the two
 * current units are equal they are discarded and new ones are fetched.
 * When they differ, the code point at that position is looked up with
 * ucase_toFullFolding(); if it has a folding, the current start/limit
 * pointers are saved in stack1[]/stack2[] and reading continues from the
 * folding result copied into fold1[]/fold2[]. The outcome is tracked in
 * cmpRes.
 * matchLen1/matchLen2 are optional outputs: u_strcmpFold() passes NULL for
 * both, and the function contains a U_ASSERT that matchLen2 is non-NULL.
 */
1394 static int32_t _cmpFold(
1395 const UChar
*s1
, int32_t length1
,
1396 const UChar
*s2
, int32_t length2
,
1398 int32_t *matchLen1
, int32_t *matchLen2
,
1399 UErrorCode
*pErrorCode
) {
1402 /* current-level start/limit - s1/s2 as current */
1403 const UChar
*start1
, *start2
, *limit1
, *limit2
;
1405 /* points to the original start address */
1406 const UChar
*org1
, *org2
;
1408 /* points to the end of match + 1 */
1409 const UChar
*m1
, *m2
;
1411 /* case folding variables */
1415 /* stacks of previous-level start/current/limit */
1416 CmpEquivLevel stack1
[2], stack2
[2];
1418 /* case folding buffers, only use current-level start/limit */
1419 UChar fold1
[UCASE_MAX_STRING_LENGTH
+1], fold2
[UCASE_MAX_STRING_LENGTH
+1];
1421 /* track which is the current level per string */
1422 int32_t level1
, level2
;
1424 /* current code units, and code points for lookups */
1425 UChar32 c1
, c2
, cp1
, cp2
;
1427 /* no argument error checking because this itself is not an API */
1430 * assume that at least the option U_COMPARE_IGNORE_CASE is set
1431 * otherwise this function would have to behave exactly as uprv_strCompare()
1433 if(U_FAILURE(*pErrorCode
)) {
1439 U_ASSERT(matchLen2
!=NULL
);
1461 /* comparison loop */
1464 * here a code unit value of -1 means "get another code unit"
1465 * below it will mean "this source is finished"
1469 /* get next code unit from string 1, post-increment */
/*
 * string 1 is exhausted at the current level when s1 reaches limit1, or when
 * a zero code unit is read and either no limit is set (limit1==NULL) or
 * _STRNCMP_STYLE is requested
 */
1471 if(s1
==limit1
|| ((c1
=*s1
)==0 && (limit1
==NULL
|| (options
&_STRNCMP_STYLE
)))) {
1481 /* reached end of level buffer, pop one level */
1484 start1
=stack1
[level1
].start
; /*Not uninitialized*/
1485 } while(start1
==NULL
);
1486 s1
=stack1
[level1
].s
; /*Not uninitialized*/
1487 limit1
=stack1
[level1
].limit
; /*Not uninitialized*/
1492 /* get next code unit from string 2, post-increment */
/* same end-of-level test as for string 1, applied to s2/limit2 */
1494 if(s2
==limit2
|| ((c2
=*s2
)==0 && (limit2
==NULL
|| (options
&_STRNCMP_STYLE
)))) {
1504 /* reached end of level buffer, pop one level */
1507 start2
=stack2
[level2
].start
; /*Not uninitialized*/
1508 } while(start2
==NULL
);
1509 s2
=stack2
[level2
].s
; /*Not uninitialized*/
1510 limit2
=stack2
[level2
].limit
; /*Not uninitialized*/
1516 * either variable c1, c2 is -1 only if the corresponding string is finished
1519 const UChar
*next1
, *next2
;
1522 cmpRes
=0; /* c1==c2==-1 indicating end of strings */
1527 * Note: Move the match positions in both strings at the same time
1528 * only when corresponding code point(s) in the original strings
1529 * are fully consumed. For example, when comparing s1="Fust" and
1530 * s2="Fu\u00dfball", s2[2] is folded into "ss", and s1[2] matches
1531 * the first code point in the case-folded data. But the second "s"
1532 * has no matching code point in s1, so this implementation returns
1533 * 2 as the prefix match length ("Fu").
1538 } else if(s1
==limit1
) {
1539 /* Note: This implementation only use a single level of stack.
1540 * If this code needs to be changed to use multiple levels
1541 * of stacks, the code above should check if the current
1542 * code is at the end of all stacks.
1544 U_ASSERT(level1
==1);
1546 /* is s1 at the end of the current stack? */
1553 } else if(s2
==limit2
) {
1554 U_ASSERT(level2
==1);
1556 /* is s2 at the end of the current stack? */
1564 c1
=c2
=-1; /* make us fetch new code units */
1567 cmpRes
=-1; /* string 1 ends before string 2 */
1570 cmpRes
=1; /* string 2 ends before string 1 */
1573 /* c1!=c2 && c1>=0 && c2>=0 */
1575 /* get complete code points for c1, c2 for lookups if either is a surrogate */
1577 if(U_IS_SURROGATE(c1
)) {
1580 if(U_IS_SURROGATE_LEAD(c1
)) {
1581 if(s1
!=limit1
&& U16_IS_TRAIL(c
=*s1
)) {
1582 /* advance ++s1; only below if cp1 decomposes/case-folds */
1583 cp1
=U16_GET_SUPPLEMENTARY(c1
, c
);
1585 } else /* isTrail(c1) */ {
1586 if(start1
<=(s1
-2) && U16_IS_LEAD(c
=*(s1
-2))) {
1587 cp1
=U16_GET_SUPPLEMENTARY(c
, c1
);
1593 if(U_IS_SURROGATE(c2
)) {
1596 if(U_IS_SURROGATE_LEAD(c2
)) {
1597 if(s2
!=limit2
&& U16_IS_TRAIL(c
=*s2
)) {
1598 /* advance ++s2; only below if cp2 decomposes/case-folds */
1599 cp2
=U16_GET_SUPPLEMENTARY(c2
, c
);
1601 } else /* isTrail(c2) */ {
1602 if(start2
<=(s2
-2) && U16_IS_LEAD(c
=*(s2
-2))) {
1603 cp2
=U16_GET_SUPPLEMENTARY(c
, c2
);
1609 * go down one level for each string
1610 * continue with the main loop as soon as there is a real change
1614 (length
=ucase_toFullFolding((UChar32
)cp1
, &p
, options
))>=0
1616 /* cp1 case-folds to the code point "length" or to p[length] */
1617 if(U_IS_SURROGATE(c1
)) {
1618 if(U_IS_SURROGATE_LEAD(c1
)) {
1619 /* advance beyond source surrogate pair if it case-folds */
1621 } else /* isTrail(c1) */ {
1623 * we got a supplementary code point when hitting its trail surrogate,
1624 * therefore the lead surrogate must have been the same as in the other string;
1625 * compare this decomposition with the lead surrogate in the other string
1626 * remember that this simulates bulk text replacement:
1627 * the decomposition would replace the entire code point
1635 /* push current level pointers */
1636 stack1
[0].start
=start1
;
1638 stack1
[0].limit
=limit1
;
1641 /* copy the folding result to fold1[] */
/*
 * a value up to UCASE_MAX_STRING_LENGTH is the length of the string at p and
 * is copied as-is; a larger value is itself the folded code point and is
 * encoded into fold1[] with U16_APPEND_UNSAFE
 */
1642 if(length
<=UCASE_MAX_STRING_LENGTH
) {
1643 u_memcpy(fold1
, p
, length
);
1646 U16_APPEND_UNSAFE(fold1
, i
, length
);
1650 /* set next level pointers to case folding */
1652 limit1
=fold1
+length
;
1654 /* get ready to read from decomposition, continue with loop */
1660 (length
=ucase_toFullFolding((UChar32
)cp2
, &p
, options
))>=0
1662 /* cp2 case-folds to the code point "length" or to p[length] */
1663 if(U_IS_SURROGATE(c2
)) {
1664 if(U_IS_SURROGATE_LEAD(c2
)) {
1665 /* advance beyond source surrogate pair if it case-folds */
1667 } else /* isTrail(c2) */ {
1669 * we got a supplementary code point when hitting its trail surrogate,
1670 * therefore the lead surrogate must have been the same as in the other string;
1671 * compare this decomposition with the lead surrogate in the other string
1672 * remember that this simulates bulk text replacement:
1673 * the decomposition would replace the entire code point
1681 /* push current level pointers */
1682 stack2
[0].start
=start2
;
1684 stack2
[0].limit
=limit2
;
1687 /* copy the folding result to fold2[] */
/* same string-versus-single-code-point distinction as for fold1[] */
1688 if(length
<=UCASE_MAX_STRING_LENGTH
) {
1689 u_memcpy(fold2
, p
, length
);
1692 U16_APPEND_UNSAFE(fold2
, i
, length
);
1696 /* set next level pointers to case folding */
1698 limit2
=fold2
+length
;
1700 /* get ready to read from decomposition, continue with loop */
1706 * no decomposition/case folding, max level for both sides:
1707 * return difference result
1709 * code point order comparison must not just return cp1-cp2
1710 * because when single surrogates are present then the surrogate pairs
1711 * that formed cp1 and cp2 may be from different string indexes
1713 * example: { d800 d800 dc01 } vs. { d800 dc00 }, compare at second code units
1714 * c1=d800 cp1=10001 c2=dc00 cp2=10000
1715 * cp1-cp2>0 but c1-c2<0 and in fact in UTF-32 it is { d800 10001 } < { 10000 }
1717 * therefore, use same fix-up as in ustring.c/uprv_strCompare()
1718 * except: uprv_strCompare() fetches c=*s while this functions fetches c=*s++
1719 * so we have slightly different pointer/start/limit comparisons here
/*
 * the fix-up is attempted only when both units are >=0xd800 and the caller
 * asked for U_COMPARE_CODE_POINT_ORDER
 */
1722 if(c1
>=0xd800 && c2
>=0xd800 && (options
&U_COMPARE_CODE_POINT_ORDER
)) {
1723 /* subtract 0x2800 from BMP code points to make them smaller than supplementary ones */
1725 (c1
<=0xdbff && s1
!=limit1
&& U16_IS_TRAIL(*s1
)) ||
1726 (U16_IS_TRAIL(c1
) && start1
!=(s1
-1) && U16_IS_LEAD(*(s1
-2)))
1728 /* part of a surrogate pair, leave >=d800 */
1730 /* BMP code point - may be surrogate code point - make <d800 */
1735 (c2
<=0xdbff && s2
!=limit2
&& U16_IS_TRAIL(*s2
)) ||
1736 (U16_IS_TRAIL(c2
) && start2
!=(s2
-1) && U16_IS_LEAD(*(s2
-2)))
1738 /* part of a surrogate pair, leave >=d800 */
1740 /* BMP code point - may be surrogate code point - make <d800 */
1756 /* internal function */
/*
 * Comparison-only wrapper around _cmpFold(): forwards both strings, their
 * lengths, options and pErrorCode unchanged, and passes NULL for both
 * match-length outputs so that only the comparison result is returned.
 * No argument checking is done here; the public callers below are
 * responsible for it.
 */
1758 u_strcmpFold(const UChar
*s1
, int32_t length1
,
1759 const UChar
*s2
, int32_t length2
,
1761 UErrorCode
*pErrorCode
) {
1762 return _cmpFold(s1
, length1
, s2
, length2
, options
, NULL
, NULL
, pErrorCode
);
1765 /* public API functions */
/*
 * Case-insensitive comparison with explicit lengths and error reporting.
 * Checks pErrorCode first (null pointer or an already-failed status), then
 * validates the strings, and finally compares through u_strcmpFold() with
 * U_COMPARE_IGNORE_CASE OR-ed into the options.
 */
1767 U_CAPI
int32_t U_EXPORT2
1768 u_strCaseCompare(const UChar
*s1
, int32_t length1
,
1769 const UChar
*s2
, int32_t length2
,
1771 UErrorCode
*pErrorCode
) {
1772 /* argument checking */
1773 if(pErrorCode
==0 || U_FAILURE(*pErrorCode
)) {
/*
 * a NULL string pointer or a length below -1 is rejected with
 * U_ILLEGAL_ARGUMENT_ERROR; -1 itself is allowed as a length
 */
1776 if(s1
==NULL
|| length1
<-1 || s2
==NULL
|| length2
<-1) {
1777 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
1780 return u_strcmpFold(s1
, length1
, s2
, length2
,
1781 options
|U_COMPARE_IGNORE_CASE
,
/*
 * Case-insensitive comparison of two strings given without lengths.
 * Calls u_strcmpFold() with -1 as the length of each string and with
 * U_COMPARE_IGNORE_CASE OR-ed into options.
 * The signature has no error parameter, so a local status initialised to
 * U_ZERO_ERROR is used and cannot be reported to the caller.
 */
1785 U_CAPI
int32_t U_EXPORT2
1786 u_strcasecmp(const UChar
*s1
, const UChar
*s2
, uint32_t options
) {
1787 UErrorCode errorCode
=U_ZERO_ERROR
;
1788 return u_strcmpFold(s1
, -1, s2
, -1,
1789 options
|U_COMPARE_IGNORE_CASE
,
/*
 * Case-insensitive comparison of two buffers of the same given length.
 * Calls u_strcmpFold() with the one length value applied to both s1 and s2
 * and with U_COMPARE_IGNORE_CASE OR-ed into options.
 * _STRNCMP_STYLE is not added here, unlike in u_strncasecmp().
 * The signature has no error parameter, so a local status initialised to
 * U_ZERO_ERROR is used and cannot be reported to the caller.
 */
1793 U_CAPI
int32_t U_EXPORT2
1794 u_memcasecmp(const UChar
*s1
, const UChar
*s2
, int32_t length
, uint32_t options
) {
1795 UErrorCode errorCode
=U_ZERO_ERROR
;
1796 return u_strcmpFold(s1
, length
, s2
, length
,
1797 options
|U_COMPARE_IGNORE_CASE
,
/*
 * Case-insensitive comparison limited to n code units per string.
 * Calls u_strcmpFold() with n as the length of both strings and with
 * U_COMPARE_IGNORE_CASE and _STRNCMP_STYLE OR-ed into options.
 * In _cmpFold(), _STRNCMP_STYLE makes a zero code unit end a string even
 * though a length limit is set.
 * The signature has no error parameter, so a local status initialised to
 * U_ZERO_ERROR is used and cannot be reported to the caller.
 */
1801 U_CAPI
int32_t U_EXPORT2
1802 u_strncasecmp(const UChar
*s1
, const UChar
*s2
, int32_t n
, uint32_t options
) {
1803 UErrorCode errorCode
=U_ZERO_ERROR
;
1804 return u_strcmpFold(s1
, n
, s2
, n
,
1805 options
|(U_COMPARE_IGNORE_CASE
|_STRNCMP_STYLE
),
1809 /* internal API - detect length of shared prefix */
/*
 * Runs _cmpFold() for its matchLen1/matchLen2 outputs; the comparison
 * result itself is not used by this function.
 * options is forwarded as given: unlike the comparison functions above,
 * U_COMPARE_IGNORE_CASE is not OR-ed in here.
 * No argument checking is performed before the call.
 */
1811 u_caseInsensitivePrefixMatch(const UChar
*s1
, int32_t length1
,
1812 const UChar
*s2
, int32_t length2
,
1814 int32_t *matchLen1
, int32_t *matchLen2
,
1815 UErrorCode
*pErrorCode
) {
1816 _cmpFold(s1
, length1
, s2
, length2
, options
,
1817 matchLen1
, matchLen2
, pErrorCode
);