1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 *******************************************************************************
6 * Copyright (C) 2005-2016, International Business Machines
7 * Corporation and others. All Rights Reserved.
9 *******************************************************************************
10 * file name: ucasemap.cpp
12 * tab size: 8 (not used)
15 * created on: 2005may06
16 * created by: Markus W. Scherer
18 * Case mapping service object and functions using it.
21 #include "unicode/utypes.h"
22 #include "unicode/brkiter.h"
23 #include "unicode/bytestream.h"
24 #include "unicode/casemap.h"
25 #include "unicode/edits.h"
26 #include "unicode/stringoptions.h"
27 #include "unicode/stringpiece.h"
28 #include "unicode/ubrk.h"
29 #include "unicode/uloc.h"
30 #include "unicode/ustring.h"
31 #include "unicode/ucasemap.h"
32 #if !UCONFIG_NO_BREAK_ITERATION
33 #include "unicode/utext.h"
35 #include "unicode/utf.h"
36 #include "unicode/utf8.h"
37 #include "unicode/utf16.h"
38 #include "bytesinkutil.h"
43 #include "ucasemap_imp.h"
48 /* UCaseMap service object -------------------------------------------------- */
// Builds a case-map service object: caseLocale starts out as UCASE_LOC_UNKNOWN,
// options is copied from opts, and ucasemap_setLocale() is then called with
// localeID so that any locale problem is reported through pErrorCode.
// NOTE(review): the member initializer guarded by the #if below is not visible
// in this listing.
50 UCaseMap::UCaseMap(const char *localeID
, uint32_t opts
, UErrorCode
*pErrorCode
) :
51 #if !UCONFIG_NO_BREAK_ITERATION
54 caseLocale(UCASE_LOC_UNKNOWN
), options(opts
) {
55 ucasemap_setLocale(this, localeID
, pErrorCode
);
58 UCaseMap::~UCaseMap() {
59 #if !UCONFIG_NO_BREAK_ITERATION
// C API factory: heap-allocates a UCaseMap for the given locale and options.
// *pErrorCode is tested on entry; the object is created with new, and
// *pErrorCode is set to U_MEMORY_ALLOCATION_ERROR on the allocation-failure
// path. A failure reported by the constructor is handled in a separate branch.
// NOTE(review): the return statements and the cleanup inside the last branch
// are not visible in this listing.
64 U_CAPI UCaseMap
* U_EXPORT2
65 ucasemap_open(const char *locale
, uint32_t options
, UErrorCode
*pErrorCode
) {
66 if(U_FAILURE(*pErrorCode
)) {
// The constructor itself calls ucasemap_setLocale(), so pErrorCode may be
// set by it.
69 UCaseMap
*csm
= new UCaseMap(locale
, options
, pErrorCode
);
71 *pErrorCode
= U_MEMORY_ALLOCATION_ERROR
;
73 } else if (U_FAILURE(*pErrorCode
)) {
81 ucasemap_close(UCaseMap
*csm
) {
85 U_CAPI
const char * U_EXPORT2
86 ucasemap_getLocale(const UCaseMap
*csm
) {
90 U_CAPI
uint32_t U_EXPORT2
91 ucasemap_getOptions(const UCaseMap
*csm
) {
// Sets the locale of csm and derives csm->caseLocale from it.
//   csm        - case map to update
//   locale     - locale ID; an empty string selects UCASE_LOC_ROOT
//   pErrorCode - in/out ICU error code, tested on entry
// NOTE(review): several lines (returns, an else keyword, closing braces) are
// not visible in this listing; the notes below describe only what is shown.
96 ucasemap_setLocale(UCaseMap
*csm
, const char *locale
, UErrorCode
*pErrorCode
) {
97 if(U_FAILURE(*pErrorCode
)) {
// Non-null but empty locale string: use the root case locale.
100 if (locale
!= NULL
&& *locale
== 0) {
102 csm
->caseLocale
= UCASE_LOC_ROOT
;
// Canonicalize the locale name into the fixed-size csm->locale buffer.
106 int32_t length
=uloc_getName(locale
, csm
->locale
, (int32_t)sizeof(csm
->locale
), pErrorCode
);
// If the full name overflowed or exactly filled the buffer, clear the error
// and retry with the language code only.
107 if(*pErrorCode
==U_BUFFER_OVERFLOW_ERROR
|| length
==sizeof(csm
->locale
)) {
108 *pErrorCode
=U_ZERO_ERROR
;
109 /* we only really need the language code for case mappings */
110 length
=uloc_getLanguage(locale
, csm
->locale
, (int32_t)sizeof(csm
->locale
), pErrorCode
);
// A result that exactly fills the buffer is reported as an overflow.
112 if(length
==sizeof(csm
->locale
)) {
113 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
// On success, caseLocale is set to UCASE_LOC_UNKNOWN and then immediately
// replaced by the value looked up from the stored locale name.
115 if(U_SUCCESS(*pErrorCode
)) {
116 csm
->caseLocale
=UCASE_LOC_UNKNOWN
;
117 csm
->caseLocale
= ucase_getCaseLocale(csm
->locale
);
// NOTE(review): presumably the failure branch (its else line is not visible):
// fall back to the root case locale.
120 csm
->caseLocale
= UCASE_LOC_ROOT
;
// Stores a new options bit set in csm->options.
// *pErrorCode is tested on entry; the visible code never writes to it.
// NOTE(review): the statement inside the failure check is not visible in this
// listing; presumably an early return.
124 U_CAPI
void U_EXPORT2
125 ucasemap_setOptions(UCaseMap
*csm
, uint32_t options
, UErrorCode
*pErrorCode
) {
126 if(U_FAILURE(*pErrorCode
)) {
129 csm
->options
=options
;
132 /* UTF-8 string case mappings ----------------------------------------------- */
134 /* TODO(markus): Move to a new, separate utf8case.cpp file. */
138 /* append a full case mapping result, see UCASE_MAX_STRING_LENGTH */
// Writes the outcome of a full case mapping for one code point to sink and
// records it in edits.
//   cpLength  - UTF-8 length of the source code point
//   result    - value returned by a ucase_toFull...() function
//   s         - UTF-16 string for the string-result case
//   errorCode - must already indicate success (asserted)
// Three forms of result are handled by the visible code:
//   - unchanged: recorded with addUnchanged(); the code point ~result is
//     written unless U_OMIT_UNCHANGED_TEXT is set;
//   - result <= UCASE_MAX_STRING_LENGTH: result is the UTF-16 length of s;
//   - otherwise: result is a single replacement code point.
// NOTE(review): the return type, the branch conditions for the unchanged case
// and the return statements are not visible in this listing.
140 appendResult(int32_t cpLength
, int32_t result
, const UChar
*s
,
141 ByteSink
&sink
, uint32_t options
, icu::Edits
*edits
, UErrorCode
&errorCode
) {
142 U_ASSERT(U_SUCCESS(errorCode
));
144 /* decode the result */
146 /* (not) original code point */
148 edits
->addUnchanged(cpLength
);
150 if((options
& U_OMIT_UNCHANGED_TEXT
) == 0) {
151 ByteSinkUtil::appendCodePoint(cpLength
, ~result
, sink
);
154 if(result
<=UCASE_MAX_STRING_LENGTH
) {
155 // string: "result" is the UTF-16 length
156 return ByteSinkUtil::appendChange(cpLength
, s
, result
, sink
, edits
, errorCode
);
// Single code point replacement.
158 ByteSinkUtil::appendCodePoint(cpLength
, result
, sink
, edits
);
164 // See unicode/utf8.h U8_APPEND_UNSAFE().
165 inline uint8_t getTwoByteLead(UChar32 c
) { return (uint8_t)((c
>> 6) | 0xc0); }
166 inline uint8_t getTwoByteTrail(UChar32 c
) { return (uint8_t)((c
& 0x3f) | 0x80); }
// Context iterator callback handed to the ucase_toFull...() functions for
// UTF-8 text.
//   context - really a UCaseContext (cast below)
//   dir     - iteration direction selector
// Per the existing comments, the index is reset to cpStart for backward
// iteration or to cpLimit for forward iteration, or the current direction is
// continued. A code point is then read with U8_PREV while start < index, or
// with U8_NEXT while index < limit.
// NOTE(review): the tests on dir, the declaration of c and the return
// statements are not visible in this listing.
169 utf8_caseContextIterator(void *context
, int8_t dir
) {
170 UCaseContext
*csc
=(UCaseContext
*)context
;
174 /* reset for backward iteration */
175 csc
->index
=csc
->cpStart
;
178 /* reset for forward iteration */
179 csc
->index
=csc
->cpLimit
;
182 /* continue current iteration direction */
// Backward step: decode the code point that ends at index.
187 if(csc
->start
<csc
->index
) {
188 U8_PREV((const uint8_t *)csc
->p
, csc
->start
, csc
->index
, c
);
// Forward step: decode the code point that starts at index.
192 if(csc
->index
<csc
->limit
) {
193 U8_NEXT((const uint8_t *)csc
->p
, csc
->index
, csc
->limit
, c
);
201 * caseLocale >= 0: Lowercases [srcStart..srcLimit[ but takes context [0..srcLength[ into account.
202 * caseLocale < 0: Case-folds [srcStart..srcLimit[.
// Lowercases or case-folds the UTF-8 range [srcStart..srcLimit[ of src into
// sink, recording changes in edits.
//   caseLocale - when >= 0 the full lowercase mapping with context is used;
//                otherwise the full case folding is used
//   options    - option bits (folding options, U_OMIT_UNCHANGED_TEXT, ...)
//   csc        - case context used by the context iterator
//   errorCode  - in/out ICU error code
// Unchanged text is accumulated from prev and flushed with appendUnchanged()
// just before each change and once more at the end.
// NOTE(review): loop headers, break and continue targets, several local
// declarations and closing braces are not visible in this listing.
204 void toLower(int32_t caseLocale
, uint32_t options
,
205 const uint8_t *src
, UCaseContext
*csc
, int32_t srcStart
, int32_t srcLimit
,
206 icu::ByteSink
&sink
, icu::Edits
*edits
, UErrorCode
&errorCode
) {
// Choose the Latin delta table: the normal one, or the Turkish/Lithuanian one.
207 const int8_t *latinToLower
;
208 if (caseLocale
== UCASE_LOC_ROOT
||
210 !(caseLocale
== UCASE_LOC_TURKISH
|| caseLocale
== UCASE_LOC_LITHUANIAN
) :
211 (options
& _FOLD_CASE_OPTIONS_MASK
) == U_FOLD_CASE_DEFAULT
)) {
212 latinToLower
= LatinCase::TO_LOWER_NORMAL
;
214 latinToLower
= LatinCase::TO_LOWER_TR_LT
;
216 const UTrie2
*trie
= ucase_getTrie();
217 int32_t prev
= srcStart
;
218 int32_t srcIndex
= srcStart
;
220 // fast path for simple cases
224 if (U_FAILURE(errorCode
) || srcIndex
>= srcLimit
) {
228 uint8_t lead
= src
[srcIndex
++];
// Single-byte case: the table gives a delta, or EXC for exceptional handling.
230 int8_t d
= latinToLower
[lead
];
231 if (d
== LatinCase::EXC
) {
232 cpStart
= srcIndex
- 1;
236 if (d
== 0) { continue; }
237 ByteSinkUtil::appendUnchanged(src
+ prev
, srcIndex
- 1 - prev
,
238 sink
, options
, edits
, errorCode
);
239 char ascii
= (char)(lead
+ d
);
240 sink
.Append(&ascii
, 1);
241 if (edits
!= nullptr) {
242 edits
->addReplace(1, 1);
246 } else if (lead
< 0xe3) {
// Two-byte sequence with lead 0xc2..0xc5: assemble c and use the same table.
248 if (0xc2 <= lead
&& lead
<= 0xc5 && srcIndex
< srcLimit
&&
249 (t
= src
[srcIndex
] - 0x80) <= 0x3f) {
252 c
= ((lead
- 0xc0) << 6) | t
;
253 int8_t d
= latinToLower
[c
];
254 if (d
== LatinCase::EXC
) {
255 cpStart
= srcIndex
- 2;
258 if (d
== 0) { continue; }
259 ByteSinkUtil::appendUnchanged(src
+ prev
, srcIndex
- 2 - prev
,
260 sink
, options
, edits
, errorCode
);
261 ByteSinkUtil::appendTwoBytes(c
+ d
, sink
);
262 if (edits
!= nullptr) {
263 edits
->addReplace(2, 2);
268 } else if ((lead
<= 0xe9 || lead
== 0xeb || lead
== 0xec) &&
269 (srcIndex
+ 2) <= srcLimit
&&
270 U8_IS_TRAIL(src
[srcIndex
]) && U8_IS_TRAIL(src
[srcIndex
+ 1])) {
271 // most of CJK: no case mappings
// General case: step back to the lead byte and decode the whole code point.
275 cpStart
= --srcIndex
;
276 U8_NEXT(src
, srcIndex
, srcLimit
, c
);
// Trie lookup: code points with exception data leave the fast path; others
// are mapped by a simple delta when they are uppercase or titlecase.
281 uint16_t props
= UTRIE2_GET16(trie
, c
);
282 if (UCASE_HAS_EXCEPTION(props
)) { break; }
284 if (!UCASE_IS_UPPER_OR_TITLE(props
) || (delta
= UCASE_GET_DELTA(props
)) == 0) {
287 ByteSinkUtil::appendUnchanged(src
+ prev
, cpStart
- prev
,
288 sink
, options
, edits
, errorCode
);
289 ByteSinkUtil::appendCodePoint(srcIndex
- cpStart
, c
+ delta
, sink
, edits
);
// Slow path: full lowercase mapping with context, or full case folding.
297 if (caseLocale
>= 0) {
298 csc
->cpStart
= cpStart
;
299 csc
->cpLimit
= srcIndex
;
300 c
= ucase_toFullLower(c
, utf8_caseContextIterator
, csc
, &s
, caseLocale
);
302 c
= ucase_toFullFolding(c
, &s
, options
);
305 ByteSinkUtil::appendUnchanged(src
+ prev
, cpStart
- prev
,
306 sink
, options
, edits
, errorCode
);
307 appendResult(srcIndex
- cpStart
, c
, s
, sink
, options
, edits
, errorCode
);
// Flush the trailing unchanged text.
311 ByteSinkUtil::appendUnchanged(src
+ prev
, srcIndex
- prev
,
312 sink
, options
, edits
, errorCode
);
// Uppercases the UTF-8 text src[0..srcLength[ into sink, recording changes in
// edits.
//   caseLocale - selects the Latin table (Turkish or normal) and is passed to
//                the full uppercase mapping
//   options    - option bits such as U_OMIT_UNCHANGED_TEXT
//   csc        - case context used by the context iterator
//   errorCode  - in/out ICU error code
// Unchanged text is accumulated from prev and flushed with appendUnchanged()
// just before each change and once more at the end.
// NOTE(review): loop headers, break and continue targets, several local
// declarations (including prev) and closing braces are not visible in this
// listing.
315 void toUpper(int32_t caseLocale
, uint32_t options
,
316 const uint8_t *src
, UCaseContext
*csc
, int32_t srcLength
,
317 icu::ByteSink
&sink
, icu::Edits
*edits
, UErrorCode
&errorCode
) {
// Choose the Latin delta table: Turkish gets its own table.
318 const int8_t *latinToUpper
;
319 if (caseLocale
== UCASE_LOC_TURKISH
) {
320 latinToUpper
= LatinCase::TO_UPPER_TR
;
322 latinToUpper
= LatinCase::TO_UPPER_NORMAL
;
324 const UTrie2
*trie
= ucase_getTrie();
326 int32_t srcIndex
= 0;
328 // fast path for simple cases
332 if (U_FAILURE(errorCode
) || srcIndex
>= srcLength
) {
336 uint8_t lead
= src
[srcIndex
++];
// Single-byte case: the table gives a delta, or EXC for exceptional handling.
338 int8_t d
= latinToUpper
[lead
];
339 if (d
== LatinCase::EXC
) {
340 cpStart
= srcIndex
- 1;
344 if (d
== 0) { continue; }
345 ByteSinkUtil::appendUnchanged(src
+ prev
, srcIndex
- 1 - prev
,
346 sink
, options
, edits
, errorCode
);
347 char ascii
= (char)(lead
+ d
);
348 sink
.Append(&ascii
, 1);
349 if (edits
!= nullptr) {
350 edits
->addReplace(1, 1);
354 } else if (lead
< 0xe3) {
// Two-byte sequence with lead 0xc2..0xc5: assemble c and use the same table.
356 if (0xc2 <= lead
&& lead
<= 0xc5 && srcIndex
< srcLength
&&
357 (t
= src
[srcIndex
] - 0x80) <= 0x3f) {
360 c
= ((lead
- 0xc0) << 6) | t
;
361 int8_t d
= latinToUpper
[c
];
362 if (d
== LatinCase::EXC
) {
363 cpStart
= srcIndex
- 2;
366 if (d
== 0) { continue; }
367 ByteSinkUtil::appendUnchanged(src
+ prev
, srcIndex
- 2 - prev
,
368 sink
, options
, edits
, errorCode
);
369 ByteSinkUtil::appendTwoBytes(c
+ d
, sink
);
370 if (edits
!= nullptr) {
371 edits
->addReplace(2, 2);
376 } else if ((lead
<= 0xe9 || lead
== 0xeb || lead
== 0xec) &&
377 (srcIndex
+ 2) <= srcLength
&&
378 U8_IS_TRAIL(src
[srcIndex
]) && U8_IS_TRAIL(src
[srcIndex
+ 1])) {
379 // most of CJK: no case mappings
// General case: step back to the lead byte and decode the whole code point.
383 cpStart
= --srcIndex
;
384 U8_NEXT(src
, srcIndex
, srcLength
, c
);
// Trie lookup: code points with exception data leave the fast path; others
// are mapped by a simple delta when they are lowercase.
389 uint16_t props
= UTRIE2_GET16(trie
, c
);
390 if (UCASE_HAS_EXCEPTION(props
)) { break; }
392 if (UCASE_GET_TYPE(props
) != UCASE_LOWER
|| (delta
= UCASE_GET_DELTA(props
)) == 0) {
395 ByteSinkUtil::appendUnchanged(src
+ prev
, cpStart
- prev
,
396 sink
, options
, edits
, errorCode
);
397 ByteSinkUtil::appendCodePoint(srcIndex
- cpStart
, c
+ delta
, sink
, edits
);
// Slow path: full uppercase mapping with context.
404 csc
->cpStart
= cpStart
;
405 csc
->cpLimit
= srcIndex
;
407 c
= ucase_toFullUpper(c
, utf8_caseContextIterator
, csc
, &s
, caseLocale
);
409 ByteSinkUtil::appendUnchanged(src
+ prev
, cpStart
- prev
,
410 sink
, options
, edits
, errorCode
);
411 appendResult(srcIndex
- cpStart
, c
, s
, sink
, options
, edits
, errorCode
);
// Flush the trailing unchanged text.
415 ByteSinkUtil::appendUnchanged(src
+ prev
, srcIndex
- prev
,
416 sink
, options
, edits
, errorCode
);
421 #if !UCONFIG_NO_BREAK_ITERATION
// Titlecases the UTF-8 text src[0..srcLength[ into sink, recording changes in
// edits.
//   caseLocale - case locale; UCASE_LOC_DUTCH enables the IJ special case
//   options    - titlecasing option bits, validated first by
//                ustrcase_checkTitleAdjustmentOptions()
//   iter       - break iterator (NOTE(review): its use is on lines not visible
//                in this listing; presumably it supplies each segment end)
//   errorCode  - in/out ICU error code
// Each segment [prev..index[ is handled in three parts, as described in the
// existing comment inside the loop: skipped characters are copied, the first
// letter is titlecased, and the rest is lowercased or copied unchanged.
// NOTE(review): several declarations, returns, else keywords and closing
// braces are not visible in this listing.
423 U_CFUNC
void U_CALLCONV
424 ucasemap_internalUTF8ToTitle(
425 int32_t caseLocale
, uint32_t options
, BreakIterator
*iter
,
426 const uint8_t *src
, int32_t srcLength
,
427 ByteSink
&sink
, icu::Edits
*edits
,
428 UErrorCode
&errorCode
) {
429 if (!ustrcase_checkTitleAdjustmentOptions(options
, errorCode
)) {
433 /* set up local variables */
434 UCaseContext csc
=UCASECONTEXT_INITIALIZER
;
438 UBool isFirstIndex
=TRUE
;
440 /* titlecasing loop */
441 while(prev
<srcLength
) {
442 /* find next index where to titlecase */
// An index of UBRK_DONE or one past the end of the text is special-cased here.
450 if(index
==UBRK_DONE
|| index
>srcLength
) {
455 * Segment [prev..index[ into 3 parts:
456 * a) skipped characters (copy as-is) [prev..titleStart[
457 * b) first letter (titlecase) [titleStart..titleLimit[
458 * c) subsequent characters (lowercase) [titleLimit..index[
461 /* find and copy skipped characters [prev..titleStart[ */
462 int32_t titleStart
=prev
;
463 int32_t titleLimit
=prev
;
465 U8_NEXT(src
, titleLimit
, index
, c
);
466 if ((options
&U_TITLECASE_NO_BREAK_ADJUSTMENT
)==0) {
467 // Adjust the titlecasing index to the next cased character,
468 // or to the next letter/number/symbol/private use.
469 // Stop with titleStart<titleLimit<=index
470 // if there is a character to be titlecased,
471 // or else stop with titleStart==titleLimit==index.
472 UBool toCased
= (options
&U_TITLECASE_ADJUST_TO_CASED
) != 0;
473 while (toCased
? UCASE_NONE
==ucase_getType(c
) : !ustrcase_isLNS(c
)) {
474 titleStart
=titleLimit
;
475 if(titleLimit
==index
) {
478 U8_NEXT(src
, titleLimit
, index
, c
);
// Copy the skipped prefix, if any, as unchanged text.
480 if (prev
< titleStart
) {
481 if (!ByteSinkUtil::appendUnchanged(src
+prev
, titleStart
-prev
,
482 sink
, options
, edits
, errorCode
)) {
488 if(titleStart
<titleLimit
) {
489 /* titlecase c which is from [titleStart..titleLimit[ */
491 csc
.cpStart
=titleStart
;
492 csc
.cpLimit
=titleLimit
;
494 c
=ucase_toFullTitle(c
, utf8_caseContextIterator
, &csc
, &s
, caseLocale
);
495 if (!appendResult(titleLimit
-titleStart
, c
, s
, sink
, options
, edits
, errorCode
)) {
500 if (!ByteSinkUtil::appendUnchanged(src
+titleStart
, titleLimit
-titleStart
,
501 sink
, options
, edits
, errorCode
)) {
506 /* Special case Dutch IJ titlecasing */
// The title character is I or i (0x49 or 0x69). A following j (0x6A) is
// written as J (0x4A); a following J (0x4A) is copied unchanged.
507 if (titleStart
+1 < index
&&
508 caseLocale
== UCASE_LOC_DUTCH
&&
509 (src
[titleStart
] == 0x0049 || src
[titleStart
] == 0x0069)) {
510 if (src
[titleStart
+1] == 0x006A) {
511 ByteSinkUtil::appendCodePoint(1, 0x004A, sink
, edits
);
513 } else if (src
[titleStart
+1] == 0x004A) {
514 // Keep the capital J from getting lowercased.
515 if (!ByteSinkUtil::appendUnchanged(src
+titleStart
+1, 1,
516 sink
, options
, edits
, errorCode
)) {
523 /* lowercase [titleLimit..index[ */
524 if(titleLimit
<index
) {
525 if((options
&U_TITLECASE_NO_LOWERCASE
)==0) {
526 /* Normal operation: Lowercase the rest of the word. */
527 toLower(caseLocale
, options
,
528 src
, &csc
, titleLimit
, index
,
529 sink
, edits
, errorCode
);
530 if(U_FAILURE(errorCode
)) {
534 /* Optionally just copy the rest of the word unchanged. */
535 if (!ByteSinkUtil::appendUnchanged(src
+titleLimit
, index
-titleLimit
,
536 sink
, options
, edits
, errorCode
)) {
// Greek-specific uppercasing for UTF-8 text. Contains a lookahead helper and
// the uppercasing routine itself.
// NOTE(review): several declarations, else keywords, loop headers and closing
// braces are not visible in this listing; the notes below describe only what
// is shown.
551 namespace GreekUpper
{
// Lookahead helper starting at index i of s. Case-ignorable code points are
// skipped, TRUE is returned at the first cased letter, and FALSE is returned
// for an uncased code point that is not case-ignorable or when nothing cased
// follows.
553 UBool
isFollowedByCasedLetter(const uint8_t *s
, int32_t i
, int32_t length
) {
556 U8_NEXT(s
, i
, length
, c
);
557 int32_t type
= ucase_getTypeOrIgnorable(c
);
558 if ((type
& UCASE_IGNORABLE
) != 0) {
559 // Case-ignorable, continue with the loop.
560 } else if (type
!= UCASE_NONE
) {
561 return TRUE
; // Followed by cased letter.
563 return FALSE
; // Uncased and not case-ignorable.
566 return FALSE
; // Not followed by cased letter.
// Uppercases src[0..srcLength[ with Greek rules into sink, recording changes
// in edits. The loop carries state flags (AFTER_CASED and
// AFTER_VOWEL_WITH_ACCENT) from one code point to the next. Code points that
// the visible Greek branch does not handle go through ucase_toFullUpper()
// with UCASE_LOC_GREEK.
569 // Keep this consistent with the UTF-16 version in ustrcase.cpp and the Java version in CaseMap.java.
570 void toUpper(uint32_t options
,
571 const uint8_t *src
, int32_t srcLength
,
572 ByteSink
&sink
, Edits
*edits
,
573 UErrorCode
&errorCode
) {
575 for (int32_t i
= 0; i
< srcLength
;) {
576 int32_t nextIndex
= i
;
578 U8_NEXT(src
, nextIndex
, srcLength
, c
);
// Compute the state for the next iteration from the case type of c.
579 uint32_t nextState
= 0;
580 int32_t type
= ucase_getTypeOrIgnorable(c
);
581 if ((type
& UCASE_IGNORABLE
) != 0) {
582 // c is case-ignorable
583 nextState
|= (state
& AFTER_CASED
);
584 } else if (type
!= UCASE_NONE
) {
586 nextState
|= AFTER_CASED
;
588 uint32_t data
= getLetterData(c
);
590 uint32_t upper
= data
& UPPER_MASK
;
591 // Add a dialytika to this iota or ypsilon vowel
592 // if we removed a tonos from the previous vowel,
593 // and that previous vowel did not also have (or gain) a dialytika.
594 // Adding one only to the final vowel in a longer sequence
595 // (which does not occur in normal writing) would require lookahead.
596 // Set the same flag as for preserving an existing dialytika.
597 if ((data
& HAS_VOWEL
) != 0 && (state
& AFTER_VOWEL_WITH_ACCENT
) != 0 &&
598 (upper
== 0x399 || upper
== 0x3A5)) {
599 data
|= HAS_DIALYTIKA
;
601 int32_t numYpogegrammeni
= 0; // Map each one to a trailing, spacing, capital iota.
602 if ((data
& HAS_YPOGEGRAMMENI
) != 0) {
603 numYpogegrammeni
= 1;
605 // Skip combining diacritics after this Greek letter.
606 int32_t nextNextIndex
= nextIndex
;
607 while (nextIndex
< srcLength
) {
609 U8_NEXT(src
, nextNextIndex
, srcLength
, c2
);
610 uint32_t diacriticData
= getDiacriticData(c2
);
611 if (diacriticData
!= 0) {
612 data
|= diacriticData
;
613 if ((diacriticData
& HAS_YPOGEGRAMMENI
) != 0) {
616 nextIndex
= nextNextIndex
;
618 break; // not a Greek diacritic
621 if ((data
& HAS_VOWEL_AND_ACCENT_AND_DIALYTIKA
) == HAS_VOWEL_AND_ACCENT
) {
622 nextState
|= AFTER_VOWEL_WITH_ACCENT
;
624 // Map according to Greek rules.
625 UBool addTonos
= FALSE
;
626 if (upper
== 0x397 &&
627 (data
& HAS_ACCENT
) != 0 &&
628 numYpogegrammeni
== 0 &&
629 (state
& AFTER_CASED
) == 0 &&
630 !isFollowedByCasedLetter(src
, nextIndex
, srcLength
)) {
631 // Keep disjunctive "or" with (only) a tonos.
632 // We use the same "word boundary" conditions as for the Final_Sigma test.
633 if (i
== nextIndex
) {
634 upper
= 0x389; // Preserve the precomposed form.
638 } else if ((data
& HAS_DIALYTIKA
) != 0) {
639 // Preserve a vowel with dialytika in precomposed form if it exists.
640 if (upper
== 0x399) {
642 data
&= ~HAS_EITHER_DIALYTIKA
;
643 } else if (upper
== 0x3A5) {
645 data
&= ~HAS_EITHER_DIALYTIKA
;
// Decide whether the output differs from the input. Without edits and without
// U_OMIT_UNCHANGED_TEXT the text is simply always written.
650 if (edits
== nullptr && (options
& U_OMIT_UNCHANGED_TEXT
) == 0) {
651 change
= TRUE
; // common, simple usage
653 // Find out first whether we are changing the text.
654 U_ASSERT(0x370 <= upper
&& upper
<= 0x3ff); // 2-byte UTF-8, main Greek block
655 change
= (i
+ 2) > nextIndex
||
656 src
[i
] != getTwoByteLead(upper
) || src
[i
+ 1] != getTwoByteTrail(upper
) ||
657 numYpogegrammeni
> 0;
659 if ((data
& HAS_EITHER_DIALYTIKA
) != 0) {
660 change
|= (i2
+ 2) > nextIndex
||
661 src
[i2
] != (uint8_t)u8
"\u0308"[0] ||
662 src
[i2
+ 1] != (uint8_t)u8
"\u0308"[1];
666 change
|= (i2
+ 2) > nextIndex
||
667 src
[i2
] != (uint8_t)u8
"\u0301"[0] ||
668 src
[i2
+ 1] != (uint8_t)u8
"\u0301"[1];
671 int32_t oldLength
= nextIndex
- i
;
672 int32_t newLength
= (i2
- i
) + numYpogegrammeni
* 2; // 2 bytes per U+0399
673 change
|= oldLength
!= newLength
;
676 edits
->addReplace(oldLength
, newLength
);
680 edits
->addUnchanged(oldLength
);
682 // Write unchanged text?
683 change
= (options
& U_OMIT_UNCHANGED_TEXT
) == 0;
// Output: the uppercase letter, then the optional combining marks, then one
// capital iota for each ypogegrammeni.
688 ByteSinkUtil::appendTwoBytes(upper
, sink
);
689 if ((data
& HAS_EITHER_DIALYTIKA
) != 0) {
690 sink
.Append(reinterpret_cast<const char*>(u8
"\u0308"), 2); // restore or add a dialytika
693 sink
.Append(reinterpret_cast<const char*>(u8
"\u0301"), 2);
695 while (numYpogegrammeni
> 0) {
696 sink
.Append(reinterpret_cast<const char*>(u8
"\u0399"), 2);
// Fallback: full uppercase mapping without a context iterator.
702 c
=ucase_toFullUpper(c
, NULL
, NULL
, &s
, UCASE_LOC_GREEK
);
703 if (!appendResult(nextIndex
- i
, c
, s
, sink
, options
, edits
, errorCode
)) {
708 if (!ByteSinkUtil::appendUnchanged(src
+i
, nextIndex
-i
,
709 sink
, options
, edits
, errorCode
)) {
718 } // namespace GreekUpper
// Lowercasing mapper with the common mapper signature used in this file.
// It creates a fresh UCaseContext and passes src, that context, the range
// 0..srcLength, sink, edits and errorCode on to the worker.
// NOTE(review): the lines that fill in the context and name the callee are not
// visible in this listing; presumably the callee is toLower().
721 static void U_CALLCONV
722 ucasemap_internalUTF8ToLower(int32_t caseLocale
, uint32_t options
, UCASEMAP_BREAK_ITERATOR_UNUSED
723 const uint8_t *src
, int32_t srcLength
,
724 icu::ByteSink
&sink
, icu::Edits
*edits
,
725 UErrorCode
&errorCode
) {
726 UCaseContext csc
=UCASECONTEXT_INITIALIZER
;
731 src
, &csc
, 0, srcLength
,
732 sink
, edits
, errorCode
);
// Uppercasing mapper with the common mapper signature used in this file.
// The Greek case locale is routed to GreekUpper::toUpper(). The other visible
// path creates a fresh UCaseContext and passes src, that context, srcLength,
// sink, edits and errorCode on to the worker.
// NOTE(review): the else keyword, the lines that fill in the context and the
// callee name are not visible in this listing; presumably the callee is
// toUpper().
735 static void U_CALLCONV
736 ucasemap_internalUTF8ToUpper(int32_t caseLocale
, uint32_t options
, UCASEMAP_BREAK_ITERATOR_UNUSED
737 const uint8_t *src
, int32_t srcLength
,
738 icu::ByteSink
&sink
, icu::Edits
*edits
,
739 UErrorCode
&errorCode
) {
740 if (caseLocale
== UCASE_LOC_GREEK
) {
741 GreekUpper::toUpper(options
, src
, srcLength
, sink
, edits
, errorCode
);
743 UCaseContext csc
=UCASECONTEXT_INITIALIZER
;
748 src
, &csc
, srcLength
,
749 sink
, edits
, errorCode
);
// Case-folding mapper with the common mapper signature used in this file.
// The caseLocale parameter is unnamed and therefore unused. No case context
// is supplied (nullptr), and the range 0..srcLength of src is processed.
// NOTE(review): the callee name and its first arguments are not visible in
// this listing; presumably toLower() with a negative case locale.
753 static void U_CALLCONV
754 ucasemap_internalUTF8Fold(int32_t /* caseLocale */, uint32_t options
, UCASEMAP_BREAK_ITERATOR_UNUSED
755 const uint8_t *src
, int32_t srcLength
,
756 icu::ByteSink
&sink
, icu::Edits
*edits
,
757 UErrorCode
&errorCode
) {
760 src
, nullptr, 0, srcLength
,
761 sink
, edits
, errorCode
);
// Shared driver for the ByteSink-based UTF-8 case mappings.
//   caseLocale, options - passed through to stringCaseMapper
//   src, srcLength      - input; a srcLength of -1 means NUL-terminated
//   stringCaseMapper    - mapper function that does the actual work
//   sink                - receives the output
//   edits               - optional change log; may be nullptr
//   errorCode           - in/out ICU error code
// A null src with nonzero length, or a length below -1, yields
// U_ILLEGAL_ARGUMENT_ERROR.
// NOTE(review): the return type, the early returns and the statement inside
// the U_EDITS_NO_RESET branch are not visible in this listing.
765 ucasemap_mapUTF8(int32_t caseLocale
, uint32_t options
, UCASEMAP_BREAK_ITERATOR_PARAM
766 const char *src
, int32_t srcLength
,
767 UTF8CaseMapper
*stringCaseMapper
,
768 icu::ByteSink
&sink
, icu::Edits
*edits
,
769 UErrorCode
&errorCode
) {
770 /* check argument values */
771 if (U_FAILURE(errorCode
)) {
774 if ((src
== nullptr && srcLength
!= 0) || srcLength
< -1) {
775 errorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
779 // Get the string length.
780 if (srcLength
== -1) {
781 srcLength
= (int32_t)uprv_strlen((const char *)src
);
// A non-null edits object gets special handling unless U_EDITS_NO_RESET is set.
784 if (edits
!= nullptr && (options
& U_EDITS_NO_RESET
) == 0) {
787 stringCaseMapper(caseLocale
, options
, UCASEMAP_BREAK_ITERATOR
788 (const uint8_t *)src
, srcLength
, sink
, edits
, errorCode
);
// After a successful mapping, an error recorded in edits is copied out.
790 if (U_SUCCESS(errorCode
)) {
791 if (edits
!= nullptr) {
792 edits
->copyErrorTo(errorCode
);
// Shared driver for the UTF-8 case mappings that write into a caller buffer.
//   caseLocale, options - passed through to stringCaseMapper
//   dest, destCapacity  - output buffer; dest may be NULL only if capacity is 0
//   src, srcLength      - input buffer
//   stringCaseMapper    - mapper function that does the actual work
//   errorCode           - in/out ICU error code
// Invalid arguments and overlapping src and dest ranges yield
// U_ILLEGAL_ARGUMENT_ERROR. Output goes through a CheckedArrayByteSink, and an
// overflow is reported as U_BUFFER_OVERFLOW_ERROR. The return value comes from
// u_terminateChars() called with the number of bytes appended.
// NOTE(review): the return type, the edits parameter line, the early returns
// and some condition lines are not visible in this listing.
798 ucasemap_mapUTF8(int32_t caseLocale
, uint32_t options
, UCASEMAP_BREAK_ITERATOR_PARAM
799 char *dest
, int32_t destCapacity
,
800 const char *src
, int32_t srcLength
,
801 UTF8CaseMapper
*stringCaseMapper
,
803 UErrorCode
&errorCode
) {
804 /* check argument values */
805 if(U_FAILURE(errorCode
)) {
808 if( destCapacity
<0 ||
809 (dest
==NULL
&& destCapacity
>0) ||
810 (src
==NULL
&& srcLength
!=0) || srcLength
<-1
812 errorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
816 /* get the string length */
818 srcLength
=(int32_t)uprv_strlen((const char *)src
);
821 /* check for overlapping source and destination */
823 ((src
>=dest
&& src
<(dest
+destCapacity
)) ||
824 (dest
>=src
&& dest
<(src
+srcLength
)))
826 errorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
// The sink tracks overflow instead of writing past destCapacity.
830 CheckedArrayByteSink
sink(dest
, destCapacity
);
831 if (edits
!= nullptr && (options
& U_EDITS_NO_RESET
) == 0) {
834 stringCaseMapper(caseLocale
, options
, UCASEMAP_BREAK_ITERATOR
835 (const uint8_t *)src
, srcLength
, sink
, edits
, errorCode
);
// Overflow takes precedence over an error recorded in edits.
837 if (U_SUCCESS(errorCode
)) {
838 if (sink
.Overflowed()) {
839 errorCode
= U_BUFFER_OVERFLOW_ERROR
;
840 } else if (edits
!= nullptr) {
841 edits
->copyErrorTo(errorCode
);
844 return u_terminateChars(dest
, destCapacity
, sink
.NumberOfBytesAppended(), &errorCode
);
847 /* public API functions */
// C API: lowercases UTF-8 text using the case locale and options stored in
// csm. Delegates to ucasemap_mapUTF8() with ucasemap_internalUTF8ToLower as
// the mapper and NULL in the position where the CaseMap overloads pass edits.
// NOTE(review): the argument lines for the buffers are not visible in this
// listing.
849 U_CAPI
int32_t U_EXPORT2
850 ucasemap_utf8ToLower(const UCaseMap
*csm
,
851 char *dest
, int32_t destCapacity
,
852 const char *src
, int32_t srcLength
,
853 UErrorCode
*pErrorCode
) {
854 return ucasemap_mapUTF8(
855 csm
->caseLocale
, csm
->options
, UCASEMAP_BREAK_ITERATOR_NULL
858 ucasemap_internalUTF8ToLower
, NULL
, *pErrorCode
);
// C API: uppercases UTF-8 text using the case locale and options stored in
// csm. Delegates to ucasemap_mapUTF8() with ucasemap_internalUTF8ToUpper as
// the mapper and NULL in the position where the CaseMap overloads pass edits.
// NOTE(review): the argument lines for the buffers are not visible in this
// listing.
861 U_CAPI
int32_t U_EXPORT2
862 ucasemap_utf8ToUpper(const UCaseMap
*csm
,
863 char *dest
, int32_t destCapacity
,
864 const char *src
, int32_t srcLength
,
865 UErrorCode
*pErrorCode
) {
866 return ucasemap_mapUTF8(
867 csm
->caseLocale
, csm
->options
, UCASEMAP_BREAK_ITERATOR_NULL
870 ucasemap_internalUTF8ToUpper
, NULL
, *pErrorCode
);
// C API: case-folds UTF-8 text using the options stored in csm. The case
// locale passed on is always UCASE_LOC_ROOT rather than the one stored in csm.
// Delegates to ucasemap_mapUTF8() with ucasemap_internalUTF8Fold as the mapper
// and NULL in the position where the CaseMap overloads pass edits.
// NOTE(review): the argument lines for the buffers are not visible in this
// listing.
873 U_CAPI
int32_t U_EXPORT2
874 ucasemap_utf8FoldCase(const UCaseMap
*csm
,
875 char *dest
, int32_t destCapacity
,
876 const char *src
, int32_t srcLength
,
877 UErrorCode
*pErrorCode
) {
878 return ucasemap_mapUTF8(
879 UCASE_LOC_ROOT
, csm
->options
, UCASEMAP_BREAK_ITERATOR_NULL
882 ucasemap_internalUTF8Fold
, NULL
, *pErrorCode
);
// Lowercases the UTF-8 text in src into sink. The locale ID is converted to a
// case locale with ustrcase_getCaseLocale(), and ucasemap_internalUTF8ToLower
// is supplied as the mapper together with sink, edits and errorCode.
// NOTE(review): the line naming the callee is not visible in this listing;
// presumably ucasemap_mapUTF8().
887 void CaseMap::utf8ToLower(
888 const char *locale
, uint32_t options
,
889 StringPiece src
, ByteSink
&sink
, Edits
*edits
,
890 UErrorCode
&errorCode
) {
892 ustrcase_getCaseLocale(locale
), options
, UCASEMAP_BREAK_ITERATOR_NULL
893 src
.data(), src
.length(),
894 ucasemap_internalUTF8ToLower
, sink
, edits
, errorCode
);
// Uppercases the UTF-8 text in src into sink. The locale ID is converted to a
// case locale with ustrcase_getCaseLocale(), and ucasemap_internalUTF8ToUpper
// is supplied as the mapper together with sink, edits and errorCode.
// NOTE(review): the line naming the callee is not visible in this listing;
// presumably ucasemap_mapUTF8().
897 void CaseMap::utf8ToUpper(
898 const char *locale
, uint32_t options
,
899 StringPiece src
, ByteSink
&sink
, Edits
*edits
,
900 UErrorCode
&errorCode
) {
902 ustrcase_getCaseLocale(locale
), options
, UCASEMAP_BREAK_ITERATOR_NULL
903 src
.data(), src
.length(),
904 ucasemap_internalUTF8ToUpper
, sink
, edits
, errorCode
);
// Case-folds the UTF-8 text in src into sink. UCASE_LOC_ROOT is passed as the
// case locale, and ucasemap_internalUTF8Fold is supplied as the mapper
// together with sink, edits and errorCode.
// NOTE(review): the options parameter line and the line naming the callee are
// not visible in this listing; presumably the callee is ucasemap_mapUTF8().
907 void CaseMap::utf8Fold(
909 StringPiece src
, ByteSink
&sink
, Edits
*edits
,
910 UErrorCode
&errorCode
) {
912 UCASE_LOC_ROOT
, options
, UCASEMAP_BREAK_ITERATOR_NULL
913 src
.data(), src
.length(),
914 ucasemap_internalUTF8Fold
, sink
, edits
, errorCode
);
// Lowercases UTF-8 text from src into the caller buffer dest. The locale ID is
// converted with ustrcase_getCaseLocale() and the work is delegated to
// ucasemap_mapUTF8() with ucasemap_internalUTF8ToLower as the mapper. The
// value returned by ucasemap_mapUTF8() is returned unchanged.
// NOTE(review): the argument lines for the buffers are not visible in this
// listing.
917 int32_t CaseMap::utf8ToLower(
918 const char *locale
, uint32_t options
,
919 const char *src
, int32_t srcLength
,
920 char *dest
, int32_t destCapacity
, Edits
*edits
,
921 UErrorCode
&errorCode
) {
922 return ucasemap_mapUTF8(
923 ustrcase_getCaseLocale(locale
), options
, UCASEMAP_BREAK_ITERATOR_NULL
926 ucasemap_internalUTF8ToLower
, edits
, errorCode
);
// Uppercases UTF-8 text from src into the caller buffer dest. The locale ID is
// converted with ustrcase_getCaseLocale() and the work is delegated to
// ucasemap_mapUTF8() with ucasemap_internalUTF8ToUpper as the mapper. The
// value returned by ucasemap_mapUTF8() is returned unchanged.
// NOTE(review): the argument lines for the buffers are not visible in this
// listing.
929 int32_t CaseMap::utf8ToUpper(
930 const char *locale
, uint32_t options
,
931 const char *src
, int32_t srcLength
,
932 char *dest
, int32_t destCapacity
, Edits
*edits
,
933 UErrorCode
&errorCode
) {
934 return ucasemap_mapUTF8(
935 ustrcase_getCaseLocale(locale
), options
, UCASEMAP_BREAK_ITERATOR_NULL
938 ucasemap_internalUTF8ToUpper
, edits
, errorCode
);
// Case-folds UTF-8 text from src into the caller buffer dest. UCASE_LOC_ROOT
// is passed as the case locale and the work is delegated to ucasemap_mapUTF8()
// with ucasemap_internalUTF8Fold as the mapper. The value returned by
// ucasemap_mapUTF8() is returned unchanged.
// NOTE(review): the options parameter line and the argument lines for the
// buffers are not visible in this listing.
941 int32_t CaseMap::utf8Fold(
943 const char *src
, int32_t srcLength
,
944 char *dest
, int32_t destCapacity
, Edits
*edits
,
945 UErrorCode
&errorCode
) {
946 return ucasemap_mapUTF8(
947 UCASE_LOC_ROOT
, options
, UCASEMAP_BREAK_ITERATOR_NULL
950 ucasemap_internalUTF8Fold
, edits
, errorCode
);