]> git.saurik.com Git - apple/icu.git/blob - icuSources/common/ucasemap.cpp
ICU-59131.0.1.tar.gz
[apple/icu.git] / icuSources / common / ucasemap.cpp
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 *
6 * Copyright (C) 2005-2016, International Business Machines
7 * Corporation and others. All Rights Reserved.
8 *
9 *******************************************************************************
10 * file name: ucasemap.cpp
11 * encoding: UTF-8
12 * tab size: 8 (not used)
13 * indentation:4
14 *
15 * created on: 2005may06
16 * created by: Markus W. Scherer
17 *
18 * Case mapping service object and functions using it.
19 */
20
21 #include "unicode/utypes.h"
22 #include "unicode/brkiter.h"
23 #include "unicode/casemap.h"
24 #include "unicode/edits.h"
25 #include "unicode/ubrk.h"
26 #include "unicode/uloc.h"
27 #include "unicode/ustring.h"
28 #include "unicode/ucasemap.h"
29 #if !UCONFIG_NO_BREAK_ITERATION
30 #include "unicode/utext.h"
31 #endif
32 #include "unicode/utf.h"
33 #include "unicode/utf8.h"
34 #include "unicode/utf16.h"
35 #include "cmemory.h"
36 #include "cstring.h"
37 #include "uassert.h"
38 #include "ucase.h"
39 #include "ucasemap_imp.h"
40 #include "ustr_imp.h"
41
42 U_NAMESPACE_BEGIN
43
44 namespace {
45
46 // TODO: share with UTF-16? inline in ucasemap_imp.h?
47 int32_t checkOverflowAndEditsError(int32_t destIndex, int32_t destCapacity,
48 Edits *edits, UErrorCode &errorCode) {
49 if (U_SUCCESS(errorCode)) {
50 if (destIndex > destCapacity) {
51 errorCode = U_BUFFER_OVERFLOW_ERROR;
52 } else if (edits != NULL) {
53 edits->copyErrorTo(errorCode);
54 }
55 }
56 return destIndex;
57 }
58
59 } // namespace
60
61 U_NAMESPACE_END
62
63 U_NAMESPACE_USE
64
65 /* UCaseMap service object -------------------------------------------------- */
66
67 UCaseMap::UCaseMap(const char *localeID, uint32_t opts, UErrorCode *pErrorCode) :
68 #if !UCONFIG_NO_BREAK_ITERATION
69 iter(NULL),
70 #endif
71 caseLocale(UCASE_LOC_UNKNOWN), options(opts) {
72 ucasemap_setLocale(this, localeID, pErrorCode);
73 }
74
75 UCaseMap::~UCaseMap() {
76 #if !UCONFIG_NO_BREAK_ITERATION
77 delete iter;
78 #endif
79 }
80
81 U_CAPI UCaseMap * U_EXPORT2
82 ucasemap_open(const char *locale, uint32_t options, UErrorCode *pErrorCode) {
83 if(U_FAILURE(*pErrorCode)) {
84 return NULL;
85 }
86 UCaseMap *csm = new UCaseMap(locale, options, pErrorCode);
87 if(csm==NULL) {
88 *pErrorCode = U_MEMORY_ALLOCATION_ERROR;
89 return NULL;
90 } else if (U_FAILURE(*pErrorCode)) {
91 delete csm;
92 return NULL;
93 }
94 return csm;
95 }
96
97 U_CAPI void U_EXPORT2
98 ucasemap_close(UCaseMap *csm) {
99 delete csm;
100 }
101
102 U_CAPI const char * U_EXPORT2
103 ucasemap_getLocale(const UCaseMap *csm) {
104 return csm->locale;
105 }
106
107 U_CAPI uint32_t U_EXPORT2
108 ucasemap_getOptions(const UCaseMap *csm) {
109 return csm->options;
110 }
111
112 U_CAPI void U_EXPORT2
113 ucasemap_setLocale(UCaseMap *csm, const char *locale, UErrorCode *pErrorCode) {
114 if(U_FAILURE(*pErrorCode)) {
115 return;
116 }
117 if (locale != NULL && *locale == 0) {
118 csm->locale[0] = 0;
119 csm->caseLocale = UCASE_LOC_ROOT;
120 return;
121 }
122
123 int32_t length=uloc_getName(locale, csm->locale, (int32_t)sizeof(csm->locale), pErrorCode);
124 if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR || length==sizeof(csm->locale)) {
125 *pErrorCode=U_ZERO_ERROR;
126 /* we only really need the language code for case mappings */
127 length=uloc_getLanguage(locale, csm->locale, (int32_t)sizeof(csm->locale), pErrorCode);
128 }
129 if(length==sizeof(csm->locale)) {
130 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
131 }
132 if(U_SUCCESS(*pErrorCode)) {
133 csm->caseLocale=UCASE_LOC_UNKNOWN;
134 csm->caseLocale = ucase_getCaseLocale(csm->locale);
135 } else {
136 csm->locale[0]=0;
137 csm->caseLocale = UCASE_LOC_ROOT;
138 }
139 }
140
141 U_CAPI void U_EXPORT2
142 ucasemap_setOptions(UCaseMap *csm, uint32_t options, UErrorCode *pErrorCode) {
143 if(U_FAILURE(*pErrorCode)) {
144 return;
145 }
146 csm->options=options;
147 }
148
149 /* UTF-8 string case mappings ----------------------------------------------- */
150
151 /* TODO(markus): Move to a new, separate utf8case.cpp file. */
152
153 /* append a full case mapping result, see UCASE_MAX_STRING_LENGTH */
154 static inline int32_t
155 appendResult(uint8_t *dest, int32_t destIndex, int32_t destCapacity,
156 int32_t result, const UChar *s,
157 int32_t cpLength, uint32_t options, icu::Edits *edits) {
158 UChar32 c;
159 int32_t length;
160 UErrorCode errorCode;
161
162 /* decode the result */
163 if(result<0) {
164 /* (not) original code point */
165 if(edits!=NULL) {
166 edits->addUnchanged(cpLength);
167 if(options & UCASEMAP_OMIT_UNCHANGED_TEXT) {
168 return destIndex;
169 }
170 }
171 c=~result;
172 if(destIndex<destCapacity && c<=0x7f) { // ASCII slightly-fastpath
173 dest[destIndex++]=(uint8_t)c;
174 return destIndex;
175 }
176 length=cpLength;
177 } else {
178 if(result<=UCASE_MAX_STRING_LENGTH) {
179 // string: "result" is the UTF-16 length
180 errorCode=U_ZERO_ERROR;
181 if(destIndex<destCapacity) {
182 u_strToUTF8((char *)(dest+destIndex), destCapacity-destIndex, &length,
183 s, result, &errorCode);
184 } else {
185 u_strToUTF8(NULL, 0, &length, s, result, &errorCode);
186 }
187 if(U_FAILURE(errorCode) && errorCode != U_BUFFER_OVERFLOW_ERROR) {
188 return -1;
189 }
190 if(length>(INT32_MAX-destIndex)) {
191 return -1; // integer overflow
192 }
193 if(edits!=NULL) {
194 edits->addReplace(cpLength, length);
195 }
196 // We might have an overflow, but we know the actual length.
197 return destIndex+length;
198 } else if(destIndex<destCapacity && result<=0x7f) { // ASCII slightly-fastpath
199 dest[destIndex++]=(uint8_t)result;
200 if(edits!=NULL) {
201 edits->addReplace(cpLength, 1);
202 }
203 return destIndex;
204 } else {
205 c=result;
206 length=U8_LENGTH(c);
207 if(edits!=NULL) {
208 edits->addReplace(cpLength, length);
209 }
210 }
211 }
212 // c>=0 single code point
213 if(length>(INT32_MAX-destIndex)) {
214 return -1; // integer overflow
215 }
216
217 if(destIndex<destCapacity) {
218 /* append the result */
219 UBool isError=FALSE;
220 U8_APPEND(dest, destIndex, destCapacity, c, isError);
221 if(isError) {
222 /* overflow, nothing written */
223 destIndex+=length;
224 }
225 } else {
226 /* preflight */
227 destIndex+=length;
228 }
229 return destIndex;
230 }
231
/*
 * Appends the single byte c at dest[destIndex] if destIndex is below
 * destCapacity; otherwise nothing is written (preflighting).
 * Returns destIndex+1, or -1 if nothing could be written and the index
 * is already INT32_MAX.
 */
static inline int32_t
appendASCII(uint8_t *dest, int32_t destIndex, int32_t destCapacity, uint8_t c) {
    if (destIndex >= destCapacity) {
        if (destIndex == INT32_MAX) {
            return -1;  // integer overflow
        }
        return destIndex + 1;
    }
    dest[destIndex] = c;
    return destIndex + 1;
}
241
242 // See unicode/utf8.h U8_APPEND_UNSAFE().
243 static inline uint8_t getTwoByteLead(UChar32 c) { return (uint8_t)((c >> 6) | 0xc0); }
244 static inline uint8_t getTwoByteTrail(UChar32 c) { return (uint8_t)((c & 0x3f) | 0x80); }
245
246 static inline int32_t
247 appendTwoBytes(uint8_t *dest, int32_t destIndex, int32_t destCapacity, UChar32 c) {
248 U_ASSERT(0x370 <= c && c <= 0x3ff); // 2-byte UTF-8, main Greek block
249 if(2>(INT32_MAX-destIndex)) {
250 return -1; // integer overflow
251 }
252 int32_t limit=destIndex+2;
253 if(limit<=destCapacity) {
254 dest+=destIndex;
255 dest[0]=getTwoByteLead(c);
256 dest[1]=getTwoByteTrail(c);
257 }
258 return limit;
259 }
260
/*
 * Appends the first two bytes of s.
 * The bytes are written only if both fit below destCapacity.
 * Returns destIndex+2, or -1 if that would overflow int32_t.
 */
static inline int32_t
appendTwoBytes(uint8_t *dest, int32_t destIndex, int32_t destCapacity, const char *s) {
    if ((INT32_MAX - destIndex) < 2) {
        return -1;  // integer overflow
    }
    const int32_t end = destIndex + 2;
    if (end <= destCapacity) {
        dest[destIndex] = (uint8_t)s[0];
        dest[destIndex + 1] = (uint8_t)s[1];
    }
    return end;
}
274
275 static inline int32_t
276 appendUnchanged(uint8_t *dest, int32_t destIndex, int32_t destCapacity,
277 const uint8_t *s, int32_t length, uint32_t options, icu::Edits *edits) {
278 if(length>0) {
279 if(edits!=NULL) {
280 edits->addUnchanged(length);
281 if(options & UCASEMAP_OMIT_UNCHANGED_TEXT) {
282 return destIndex;
283 }
284 }
285 if(length>(INT32_MAX-destIndex)) {
286 return -1; // integer overflow
287 }
288 if((destIndex+length)<=destCapacity) {
289 uprv_memcpy(dest+destIndex, s, length);
290 }
291 destIndex+=length;
292 }
293 return destIndex;
294 }
295
296 static UChar32 U_CALLCONV
297 utf8_caseContextIterator(void *context, int8_t dir) {
298 UCaseContext *csc=(UCaseContext *)context;
299 UChar32 c;
300
301 if(dir<0) {
302 /* reset for backward iteration */
303 csc->index=csc->cpStart;
304 csc->dir=dir;
305 } else if(dir>0) {
306 /* reset for forward iteration */
307 csc->index=csc->cpLimit;
308 csc->dir=dir;
309 } else {
310 /* continue current iteration direction */
311 dir=csc->dir;
312 }
313
314 if(dir<0) {
315 if(csc->start<csc->index) {
316 U8_PREV((const uint8_t *)csc->p, csc->start, csc->index, c);
317 return c;
318 }
319 } else {
320 if(csc->index<csc->limit) {
321 U8_NEXT((const uint8_t *)csc->p, csc->index, csc->limit, c);
322 return c;
323 }
324 }
325 return U_SENTINEL;
326 }
327
328 /*
329 * Case-maps [srcStart..srcLimit[ but takes
330 * context [0..srcLength[ into account.
331 */
332 static int32_t
333 _caseMap(int32_t caseLocale, uint32_t options, UCaseMapFull *map,
334 uint8_t *dest, int32_t destCapacity,
335 const uint8_t *src, UCaseContext *csc,
336 int32_t srcStart, int32_t srcLimit,
337 icu::Edits *edits,
338 UErrorCode &errorCode) {
339 /* case mapping loop */
340 int32_t srcIndex=srcStart;
341 int32_t destIndex=0;
342 while(srcIndex<srcLimit) {
343 int32_t cpStart;
344 csc->cpStart=cpStart=srcIndex;
345 UChar32 c;
346 U8_NEXT(src, srcIndex, srcLimit, c);
347 csc->cpLimit=srcIndex;
348 if(c<0) {
349 // Malformed UTF-8.
350 destIndex=appendUnchanged(dest, destIndex, destCapacity,
351 src+cpStart, srcIndex-cpStart, options, edits);
352 if(destIndex<0) {
353 errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
354 return 0;
355 }
356 continue;
357 }
358 const UChar *s;
359 c=map(c, utf8_caseContextIterator, csc, &s, caseLocale);
360 destIndex = appendResult(dest, destIndex, destCapacity, c, s,
361 srcIndex - cpStart, options, edits);
362 if (destIndex < 0) {
363 errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
364 return 0;
365 }
366 }
367
368 return destIndex;
369 }
370
371 #if !UCONFIG_NO_BREAK_ITERATION
372
373 U_CFUNC int32_t U_CALLCONV
374 ucasemap_internalUTF8ToTitle(
375 int32_t caseLocale, uint32_t options, BreakIterator *iter,
376 uint8_t *dest, int32_t destCapacity,
377 const uint8_t *src, int32_t srcLength,
378 icu::Edits *edits,
379 UErrorCode &errorCode) {
380 if(U_FAILURE(errorCode)) {
381 return 0;
382 }
383
384 /* set up local variables */
385 UCaseContext csc=UCASECONTEXT_INITIALIZER;
386 csc.p=(void *)src;
387 csc.limit=srcLength;
388 int32_t destIndex=0;
389 int32_t prev=0;
390 UBool isFirstIndex=TRUE;
391
392 /* titlecasing loop */
393 while(prev<srcLength) {
394 /* find next index where to titlecase */
395 int32_t index;
396 if(isFirstIndex) {
397 isFirstIndex=FALSE;
398 index=iter->first();
399 } else {
400 index=iter->next();
401 }
402 if(index==UBRK_DONE || index>srcLength) {
403 index=srcLength;
404 }
405
406 /*
407 * Unicode 4 & 5 section 3.13 Default Case Operations:
408 *
409 * R3 toTitlecase(X): Find the word boundaries based on Unicode Standard Annex
410 * #29, "Text Boundaries." Between each pair of word boundaries, find the first
411 * cased character F. If F exists, map F to default_title(F); then map each
412 * subsequent character C to default_lower(C).
413 *
414 * In this implementation, segment [prev..index[ into 3 parts:
415 * a) uncased characters (copy as-is) [prev..titleStart[
416 * b) first case letter (titlecase) [titleStart..titleLimit[
417 * c) subsequent characters (lowercase) [titleLimit..index[
418 */
419 if(prev<index) {
420 /* find and copy uncased characters [prev..titleStart[ */
421 int32_t titleStart=prev;
422 int32_t titleLimit=prev;
423 UChar32 c;
424 U8_NEXT(src, titleLimit, index, c);
425 if((options&U_TITLECASE_NO_BREAK_ADJUSTMENT)==0 && UCASE_NONE==ucase_getType(c)) {
426 /* Adjust the titlecasing index (titleStart) to the next cased character. */
427 for(;;) {
428 titleStart=titleLimit;
429 if(titleLimit==index) {
430 /*
431 * only uncased characters in [prev..index[
432 * stop with titleStart==titleLimit==index
433 */
434 break;
435 }
436 U8_NEXT(src, titleLimit, index, c);
437 if(UCASE_NONE!=ucase_getType(c)) {
438 break; /* cased letter at [titleStart..titleLimit[ */
439 }
440 }
441 destIndex=appendUnchanged(dest, destIndex, destCapacity,
442 src+prev, titleStart-prev, options, edits);
443 if(destIndex<0) {
444 errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
445 return 0;
446 }
447 }
448
449 if(titleStart<titleLimit) {
450 /* titlecase c which is from [titleStart..titleLimit[ */
451 if(c>=0) {
452 csc.cpStart=titleStart;
453 csc.cpLimit=titleLimit;
454 const UChar *s;
455 c=ucase_toFullTitle(c, utf8_caseContextIterator, &csc, &s, caseLocale);
456 destIndex=appendResult(dest, destIndex, destCapacity, c, s,
457 titleLimit-titleStart, options, edits);
458 } else {
459 // Malformed UTF-8.
460 destIndex=appendUnchanged(dest, destIndex, destCapacity,
461 src+titleStart, titleLimit-titleStart, options, edits);
462 }
463 if(destIndex<0) {
464 errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
465 return 0;
466 }
467
468 /* Special case Dutch IJ titlecasing */
469 if (titleStart+1 < index &&
470 caseLocale == UCASE_LOC_DUTCH &&
471 (src[titleStart] == 0x0049 || src[titleStart] == 0x0069)) {
472 if (src[titleStart+1] == 0x006A) {
473 destIndex=appendASCII(dest, destIndex, destCapacity, 0x004A);
474 if(destIndex<0) {
475 errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
476 return 0;
477 }
478 if(edits!=NULL) {
479 edits->addReplace(1, 1);
480 }
481 titleLimit++;
482 } else if (src[titleStart+1] == 0x004A) {
483 // Keep the capital J from getting lowercased.
484 destIndex=appendUnchanged(dest, destIndex, destCapacity,
485 src+titleStart+1, 1, options, edits);
486 if(destIndex<0) {
487 errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
488 return 0;
489 }
490 titleLimit++;
491 }
492 }
493
494 /* lowercase [titleLimit..index[ */
495 if(titleLimit<index) {
496 if((options&U_TITLECASE_NO_LOWERCASE)==0) {
497 /* Normal operation: Lowercase the rest of the word. */
498 destIndex+=
499 _caseMap(
500 caseLocale, options, ucase_toFullLower,
501 dest+destIndex, destCapacity-destIndex,
502 src, &csc,
503 titleLimit, index,
504 edits, errorCode);
505 if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
506 errorCode=U_ZERO_ERROR;
507 }
508 if(U_FAILURE(errorCode)) {
509 return destIndex;
510 }
511 } else {
512 /* Optionally just copy the rest of the word unchanged. */
513 destIndex=appendUnchanged(dest, destIndex, destCapacity,
514 src+titleLimit, index-titleLimit, options, edits);
515 if(destIndex<0) {
516 errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
517 return 0;
518 }
519 }
520 }
521 }
522 }
523
524 prev=index;
525 }
526
527 return checkOverflowAndEditsError(destIndex, destCapacity, edits, errorCode);
528 }
529
530 #endif
531
532 U_NAMESPACE_BEGIN
533 namespace GreekUpper {
534
535 UBool isFollowedByCasedLetter(const uint8_t *s, int32_t i, int32_t length) {
536 while (i < length) {
537 UChar32 c;
538 U8_NEXT(s, i, length, c);
539 int32_t type = ucase_getTypeOrIgnorable(c);
540 if ((type & UCASE_IGNORABLE) != 0) {
541 // Case-ignorable, continue with the loop.
542 } else if (type != UCASE_NONE) {
543 return TRUE; // Followed by cased letter.
544 } else {
545 return FALSE; // Uncased and not case-ignorable.
546 }
547 }
548 return FALSE; // Not followed by cased letter.
549 }
550
551 // Keep this consistent with the UTF-16 version in ustrcase.cpp and the Java version in CaseMap.java.
552 int32_t toUpper(uint32_t options,
553 uint8_t *dest, int32_t destCapacity,
554 const uint8_t *src, int32_t srcLength,
555 Edits *edits,
556 UErrorCode &errorCode) {
557 int32_t destIndex=0;
558 uint32_t state = 0;
559 for (int32_t i = 0; i < srcLength;) {
560 int32_t nextIndex = i;
561 UChar32 c;
562 U8_NEXT(src, nextIndex, srcLength, c);
563 uint32_t nextState = 0;
564 int32_t type = ucase_getTypeOrIgnorable(c);
565 if ((type & UCASE_IGNORABLE) != 0) {
566 // c is case-ignorable
567 nextState |= (state & AFTER_CASED);
568 } else if (type != UCASE_NONE) {
569 // c is cased
570 nextState |= AFTER_CASED;
571 }
572 uint32_t data = getLetterData(c);
573 if (data > 0) {
574 uint32_t upper = data & UPPER_MASK;
575 // Add a dialytika to this iota or ypsilon vowel
576 // if we removed a tonos from the previous vowel,
577 // and that previous vowel did not also have (or gain) a dialytika.
578 // Adding one only to the final vowel in a longer sequence
579 // (which does not occur in normal writing) would require lookahead.
580 // Set the same flag as for preserving an existing dialytika.
581 if ((data & HAS_VOWEL) != 0 && (state & AFTER_VOWEL_WITH_ACCENT) != 0 &&
582 (upper == 0x399 || upper == 0x3A5)) {
583 data |= HAS_DIALYTIKA;
584 }
585 int32_t numYpogegrammeni = 0; // Map each one to a trailing, spacing, capital iota.
586 if ((data & HAS_YPOGEGRAMMENI) != 0) {
587 numYpogegrammeni = 1;
588 }
589 // Skip combining diacritics after this Greek letter.
590 int32_t nextNextIndex = nextIndex;
591 while (nextIndex < srcLength) {
592 UChar32 c2;
593 U8_NEXT(src, nextNextIndex, srcLength, c2);
594 uint32_t diacriticData = getDiacriticData(c2);
595 if (diacriticData != 0) {
596 data |= diacriticData;
597 if ((diacriticData & HAS_YPOGEGRAMMENI) != 0) {
598 ++numYpogegrammeni;
599 }
600 nextIndex = nextNextIndex;
601 } else {
602 break; // not a Greek diacritic
603 }
604 }
605 if ((data & HAS_VOWEL_AND_ACCENT_AND_DIALYTIKA) == HAS_VOWEL_AND_ACCENT) {
606 nextState |= AFTER_VOWEL_WITH_ACCENT;
607 }
608 // Map according to Greek rules.
609 UBool addTonos = FALSE;
610 if (upper == 0x397 &&
611 (data & HAS_ACCENT) != 0 &&
612 numYpogegrammeni == 0 &&
613 (state & AFTER_CASED) == 0 &&
614 !isFollowedByCasedLetter(src, nextIndex, srcLength)) {
615 // Keep disjunctive "or" with (only) a tonos.
616 // We use the same "word boundary" conditions as for the Final_Sigma test.
617 if (i == nextIndex) {
618 upper = 0x389; // Preserve the precomposed form.
619 } else {
620 addTonos = TRUE;
621 }
622 } else if ((data & HAS_DIALYTIKA) != 0) {
623 // Preserve a vowel with dialytika in precomposed form if it exists.
624 if (upper == 0x399) {
625 upper = 0x3AA;
626 data &= ~HAS_EITHER_DIALYTIKA;
627 } else if (upper == 0x3A5) {
628 upper = 0x3AB;
629 data &= ~HAS_EITHER_DIALYTIKA;
630 }
631 }
632
633 UBool change = TRUE;
634 if (edits != NULL) {
635 // Find out first whether we are changing the text.
636 U_ASSERT(0x370 <= upper && upper <= 0x3ff); // 2-byte UTF-8, main Greek block
637 change = (i + 2) > nextIndex ||
638 src[i] != getTwoByteLead(upper) || src[i + 1] != getTwoByteTrail(upper) ||
639 numYpogegrammeni > 0;
640 int32_t i2 = i + 2;
641 if ((data & HAS_EITHER_DIALYTIKA) != 0) {
642 change |= (i2 + 2) > nextIndex ||
643 src[i2] != (uint8_t)u8"\u0308"[0] ||
644 src[i2 + 1] != (uint8_t)u8"\u0308"[1];
645 i2 += 2;
646 }
647 if (addTonos) {
648 change |= (i2 + 2) > nextIndex ||
649 src[i2] != (uint8_t)u8"\u0301"[0] ||
650 src[i2 + 1] != (uint8_t)u8"\u0301"[1];
651 i2 += 2;
652 }
653 int32_t oldLength = nextIndex - i;
654 int32_t newLength = (i2 - i) + numYpogegrammeni * 2; // 2 bytes per U+0399
655 change |= oldLength != newLength;
656 if (change) {
657 if (edits != NULL) {
658 edits->addReplace(oldLength, newLength);
659 }
660 } else {
661 if (edits != NULL) {
662 edits->addUnchanged(oldLength);
663 }
664 // Write unchanged text?
665 change = (options & UCASEMAP_OMIT_UNCHANGED_TEXT) == 0;
666 }
667 }
668
669 if (change) {
670 destIndex=appendTwoBytes(dest, destIndex, destCapacity, upper);
671 if (destIndex >= 0 && (data & HAS_EITHER_DIALYTIKA) != 0) {
672 destIndex=appendTwoBytes(dest, destIndex, destCapacity, u8"\u0308"); // restore or add a dialytika
673 }
674 if (destIndex >= 0 && addTonos) {
675 destIndex=appendTwoBytes(dest, destIndex, destCapacity, u8"\u0301");
676 }
677 while (destIndex >= 0 && numYpogegrammeni > 0) {
678 destIndex=appendTwoBytes(dest, destIndex, destCapacity, u8"\u0399");
679 --numYpogegrammeni;
680 }
681 if(destIndex<0) {
682 errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
683 return 0;
684 }
685 }
686 } else if(c>=0) {
687 const UChar *s;
688 c=ucase_toFullUpper(c, NULL, NULL, &s, UCASE_LOC_GREEK);
689 destIndex = appendResult(dest, destIndex, destCapacity, c, s,
690 nextIndex - i, options, edits);
691 if (destIndex < 0) {
692 errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
693 return 0;
694 }
695 } else {
696 // Malformed UTF-8.
697 destIndex=appendUnchanged(dest, destIndex, destCapacity,
698 src+i, nextIndex-i, options, edits);
699 if(destIndex<0) {
700 errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
701 return 0;
702 }
703 }
704 i = nextIndex;
705 state = nextState;
706 }
707
708 return destIndex;
709 }
710
711 } // namespace GreekUpper
712 U_NAMESPACE_END
713
714 static int32_t U_CALLCONV
715 ucasemap_internalUTF8ToLower(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_UNUSED
716 uint8_t *dest, int32_t destCapacity,
717 const uint8_t *src, int32_t srcLength,
718 icu::Edits *edits,
719 UErrorCode &errorCode) {
720 UCaseContext csc=UCASECONTEXT_INITIALIZER;
721 csc.p=(void *)src;
722 csc.limit=srcLength;
723 int32_t destIndex = _caseMap(
724 caseLocale, options, ucase_toFullLower,
725 dest, destCapacity,
726 src, &csc, 0, srcLength,
727 edits, errorCode);
728 return checkOverflowAndEditsError(destIndex, destCapacity, edits, errorCode);
729 }
730
731 static int32_t U_CALLCONV
732 ucasemap_internalUTF8ToUpper(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_UNUSED
733 uint8_t *dest, int32_t destCapacity,
734 const uint8_t *src, int32_t srcLength,
735 icu::Edits *edits,
736 UErrorCode &errorCode) {
737 int32_t destIndex;
738 if (caseLocale == UCASE_LOC_GREEK) {
739 destIndex = GreekUpper::toUpper(options, dest, destCapacity,
740 src, srcLength, edits, errorCode);
741 } else {
742 UCaseContext csc=UCASECONTEXT_INITIALIZER;
743 csc.p=(void *)src;
744 csc.limit=srcLength;
745 destIndex = _caseMap(
746 caseLocale, options, ucase_toFullUpper,
747 dest, destCapacity,
748 src, &csc, 0, srcLength,
749 edits, errorCode);
750 }
751 return checkOverflowAndEditsError(destIndex, destCapacity, edits, errorCode);
752 }
753
754 static int32_t U_CALLCONV
755 ucasemap_internalUTF8Fold(int32_t /* caseLocale */, uint32_t options, UCASEMAP_BREAK_ITERATOR_UNUSED
756 uint8_t *dest, int32_t destCapacity,
757 const uint8_t *src, int32_t srcLength,
758 icu::Edits *edits,
759 UErrorCode &errorCode) {
760 /* case mapping loop */
761 int32_t srcIndex = 0;
762 int32_t destIndex = 0;
763 while (srcIndex < srcLength) {
764 int32_t cpStart = srcIndex;
765 UChar32 c;
766 U8_NEXT(src, srcIndex, srcLength, c);
767 if(c<0) {
768 // Malformed UTF-8.
769 destIndex=appendUnchanged(dest, destIndex, destCapacity,
770 src+cpStart, srcIndex-cpStart, options, edits);
771 if(destIndex<0) {
772 errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
773 return 0;
774 }
775 continue;
776 }
777 const UChar *s;
778 c = ucase_toFullFolding(c, &s, options);
779 destIndex = appendResult(dest, destIndex, destCapacity, c, s,
780 srcIndex - cpStart, options, edits);
781 if (destIndex < 0) {
782 errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
783 return 0;
784 }
785 }
786
787 return checkOverflowAndEditsError(destIndex, destCapacity, edits, errorCode);
788 }
789
790 U_CFUNC int32_t
791 ucasemap_mapUTF8(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM
792 uint8_t *dest, int32_t destCapacity,
793 const uint8_t *src, int32_t srcLength,
794 UTF8CaseMapper *stringCaseMapper,
795 icu::Edits *edits,
796 UErrorCode &errorCode) {
797 int32_t destLength;
798
799 /* check argument values */
800 if(U_FAILURE(errorCode)) {
801 return 0;
802 }
803 if( destCapacity<0 ||
804 (dest==NULL && destCapacity>0) ||
805 src==NULL ||
806 srcLength<-1
807 ) {
808 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
809 return 0;
810 }
811
812 /* get the string length */
813 if(srcLength==-1) {
814 srcLength=(int32_t)uprv_strlen((const char *)src);
815 }
816
817 /* check for overlapping source and destination */
818 if( dest!=NULL &&
819 ((src>=dest && src<(dest+destCapacity)) ||
820 (dest>=src && dest<(src+srcLength)))
821 ) {
822 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
823 return 0;
824 }
825
826 if(edits!=NULL) {
827 edits->reset();
828 }
829 destLength=stringCaseMapper(caseLocale, options, UCASEMAP_BREAK_ITERATOR
830 dest, destCapacity, src, srcLength, edits, errorCode);
831 return u_terminateChars((char *)dest, destCapacity, destLength, &errorCode);
832 }
833
834 /* public API functions */
835
836 U_CAPI int32_t U_EXPORT2
837 ucasemap_utf8ToLower(const UCaseMap *csm,
838 char *dest, int32_t destCapacity,
839 const char *src, int32_t srcLength,
840 UErrorCode *pErrorCode) {
841 return ucasemap_mapUTF8(
842 csm->caseLocale, csm->options, UCASEMAP_BREAK_ITERATOR_NULL
843 (uint8_t *)dest, destCapacity,
844 (const uint8_t *)src, srcLength,
845 ucasemap_internalUTF8ToLower, NULL, *pErrorCode);
846 }
847
848 U_CAPI int32_t U_EXPORT2
849 ucasemap_utf8ToUpper(const UCaseMap *csm,
850 char *dest, int32_t destCapacity,
851 const char *src, int32_t srcLength,
852 UErrorCode *pErrorCode) {
853 return ucasemap_mapUTF8(
854 csm->caseLocale, csm->options, UCASEMAP_BREAK_ITERATOR_NULL
855 (uint8_t *)dest, destCapacity,
856 (const uint8_t *)src, srcLength,
857 ucasemap_internalUTF8ToUpper, NULL, *pErrorCode);
858 }
859
860 U_CAPI int32_t U_EXPORT2
861 ucasemap_utf8FoldCase(const UCaseMap *csm,
862 char *dest, int32_t destCapacity,
863 const char *src, int32_t srcLength,
864 UErrorCode *pErrorCode) {
865 return ucasemap_mapUTF8(
866 UCASE_LOC_ROOT, csm->options, UCASEMAP_BREAK_ITERATOR_NULL
867 (uint8_t *)dest, destCapacity,
868 (const uint8_t *)src, srcLength,
869 ucasemap_internalUTF8Fold, NULL, *pErrorCode);
870 }
871
872 U_NAMESPACE_BEGIN
873
874 int32_t CaseMap::utf8ToLower(
875 const char *locale, uint32_t options,
876 const char *src, int32_t srcLength,
877 char *dest, int32_t destCapacity, Edits *edits,
878 UErrorCode &errorCode) {
879 return ucasemap_mapUTF8(
880 ustrcase_getCaseLocale(locale), options, UCASEMAP_BREAK_ITERATOR_NULL
881 (uint8_t *)dest, destCapacity,
882 (const uint8_t *)src, srcLength,
883 ucasemap_internalUTF8ToLower, edits, errorCode);
884 }
885
886 int32_t CaseMap::utf8ToUpper(
887 const char *locale, uint32_t options,
888 const char *src, int32_t srcLength,
889 char *dest, int32_t destCapacity, Edits *edits,
890 UErrorCode &errorCode) {
891 return ucasemap_mapUTF8(
892 ustrcase_getCaseLocale(locale), options, UCASEMAP_BREAK_ITERATOR_NULL
893 (uint8_t *)dest, destCapacity,
894 (const uint8_t *)src, srcLength,
895 ucasemap_internalUTF8ToUpper, edits, errorCode);
896 }
897
898 int32_t CaseMap::utf8Fold(
899 uint32_t options,
900 const char *src, int32_t srcLength,
901 char *dest, int32_t destCapacity, Edits *edits,
902 UErrorCode &errorCode) {
903 return ucasemap_mapUTF8(
904 UCASE_LOC_ROOT, options, UCASEMAP_BREAK_ITERATOR_NULL
905 (uint8_t *)dest, destCapacity,
906 (const uint8_t *)src, srcLength,
907 ucasemap_internalUTF8Fold, edits, errorCode);
908 }
909
910 U_NAMESPACE_END