]> git.saurik.com Git - apple/icu.git/blame - icuSources/common/ucasemap.cpp
ICU-66108.tar.gz
[apple/icu.git] / icuSources / common / ucasemap.cpp
CommitLineData
f3c0d7a5
A
1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
73c04bcf
A
3/*
4*******************************************************************************
5*
2ca993e8 6* Copyright (C) 2005-2016, International Business Machines
73c04bcf
A
7* Corporation and others. All Rights Reserved.
8*
9*******************************************************************************
4388f060 10* file name: ucasemap.cpp
f3c0d7a5 11* encoding: UTF-8
73c04bcf
A
12* tab size: 8 (not used)
13* indentation:4
14*
15* created on: 2005may06
16* created by: Markus W. Scherer
17*
18* Case mapping service object and functions using it.
19*/
20
21#include "unicode/utypes.h"
4388f060 22#include "unicode/brkiter.h"
0f5d89e8 23#include "unicode/bytestream.h"
f3c0d7a5
A
24#include "unicode/casemap.h"
25#include "unicode/edits.h"
0f5d89e8
A
26#include "unicode/stringoptions.h"
27#include "unicode/stringpiece.h"
4388f060 28#include "unicode/ubrk.h"
73c04bcf
A
29#include "unicode/uloc.h"
30#include "unicode/ustring.h"
31#include "unicode/ucasemap.h"
46f4442e 32#if !UCONFIG_NO_BREAK_ITERATION
46f4442e
A
33#include "unicode/utext.h"
34#endif
4388f060
A
35#include "unicode/utf.h"
36#include "unicode/utf8.h"
37#include "unicode/utf16.h"
0f5d89e8 38#include "bytesinkutil.h"
73c04bcf
A
39#include "cmemory.h"
40#include "cstring.h"
f3c0d7a5 41#include "uassert.h"
73c04bcf 42#include "ucase.h"
f3c0d7a5 43#include "ucasemap_imp.h"
73c04bcf
A
44#include "ustr_imp.h"
45
4388f060
A
46U_NAMESPACE_USE
47
73c04bcf
A
48/* UCaseMap service object -------------------------------------------------- */
49
f3c0d7a5
A
50UCaseMap::UCaseMap(const char *localeID, uint32_t opts, UErrorCode *pErrorCode) :
51#if !UCONFIG_NO_BREAK_ITERATION
52 iter(NULL),
53#endif
54 caseLocale(UCASE_LOC_UNKNOWN), options(opts) {
55 ucasemap_setLocale(this, localeID, pErrorCode);
56}
57
58UCaseMap::~UCaseMap() {
59#if !UCONFIG_NO_BREAK_ITERATION
60 delete iter;
61#endif
62}
63
46f4442e 64U_CAPI UCaseMap * U_EXPORT2
73c04bcf 65ucasemap_open(const char *locale, uint32_t options, UErrorCode *pErrorCode) {
73c04bcf
A
66 if(U_FAILURE(*pErrorCode)) {
67 return NULL;
68 }
f3c0d7a5 69 UCaseMap *csm = new UCaseMap(locale, options, pErrorCode);
73c04bcf 70 if(csm==NULL) {
f3c0d7a5 71 *pErrorCode = U_MEMORY_ALLOCATION_ERROR;
73c04bcf 72 return NULL;
f3c0d7a5
A
73 } else if (U_FAILURE(*pErrorCode)) {
74 delete csm;
73c04bcf
A
75 return NULL;
76 }
73c04bcf
A
77 return csm;
78}
79
46f4442e 80U_CAPI void U_EXPORT2
73c04bcf 81ucasemap_close(UCaseMap *csm) {
f3c0d7a5 82 delete csm;
73c04bcf
A
83}
84
46f4442e 85U_CAPI const char * U_EXPORT2
73c04bcf
A
86ucasemap_getLocale(const UCaseMap *csm) {
87 return csm->locale;
88}
89
46f4442e 90U_CAPI uint32_t U_EXPORT2
73c04bcf
A
91ucasemap_getOptions(const UCaseMap *csm) {
92 return csm->options;
93}
94
46f4442e 95U_CAPI void U_EXPORT2
73c04bcf 96ucasemap_setLocale(UCaseMap *csm, const char *locale, UErrorCode *pErrorCode) {
73c04bcf
A
97 if(U_FAILURE(*pErrorCode)) {
98 return;
99 }
f3c0d7a5
A
100 if (locale != NULL && *locale == 0) {
101 csm->locale[0] = 0;
102 csm->caseLocale = UCASE_LOC_ROOT;
103 return;
104 }
73c04bcf 105
f3c0d7a5 106 int32_t length=uloc_getName(locale, csm->locale, (int32_t)sizeof(csm->locale), pErrorCode);
73c04bcf
A
107 if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR || length==sizeof(csm->locale)) {
108 *pErrorCode=U_ZERO_ERROR;
109 /* we only really need the language code for case mappings */
110 length=uloc_getLanguage(locale, csm->locale, (int32_t)sizeof(csm->locale), pErrorCode);
111 }
112 if(length==sizeof(csm->locale)) {
113 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
114 }
73c04bcf 115 if(U_SUCCESS(*pErrorCode)) {
f3c0d7a5
A
116 csm->caseLocale=UCASE_LOC_UNKNOWN;
117 csm->caseLocale = ucase_getCaseLocale(csm->locale);
73c04bcf
A
118 } else {
119 csm->locale[0]=0;
f3c0d7a5 120 csm->caseLocale = UCASE_LOC_ROOT;
73c04bcf
A
121 }
122}
123
46f4442e 124U_CAPI void U_EXPORT2
f3c0d7a5
A
125ucasemap_setOptions(UCaseMap *csm, uint32_t options, UErrorCode *pErrorCode) {
126 if(U_FAILURE(*pErrorCode)) {
127 return;
128 }
73c04bcf
A
129 csm->options=options;
130}
131
132/* UTF-8 string case mappings ----------------------------------------------- */
133
f3c0d7a5 134/* TODO(markus): Move to a new, separate utf8case.cpp file. */
46f4442e 135
0f5d89e8
A
136namespace {
137
73c04bcf 138/* append a full case mapping result, see UCASE_MAX_STRING_LENGTH */
0f5d89e8
A
139inline UBool
140appendResult(int32_t cpLength, int32_t result, const UChar *s,
141 ByteSink &sink, uint32_t options, icu::Edits *edits, UErrorCode &errorCode) {
142 U_ASSERT(U_SUCCESS(errorCode));
73c04bcf
A
143
144 /* decode the result */
145 if(result<0) {
146 /* (not) original code point */
f3c0d7a5
A
147 if(edits!=NULL) {
148 edits->addUnchanged(cpLength);
f3c0d7a5 149 }
0f5d89e8
A
150 if((options & U_OMIT_UNCHANGED_TEXT) == 0) {
151 ByteSinkUtil::appendCodePoint(cpLength, ~result, sink);
f3c0d7a5 152 }
73c04bcf 153 } else {
f3c0d7a5
A
154 if(result<=UCASE_MAX_STRING_LENGTH) {
155 // string: "result" is the UTF-16 length
0f5d89e8 156 return ByteSinkUtil::appendChange(cpLength, s, result, sink, edits, errorCode);
f3c0d7a5 157 } else {
0f5d89e8 158 ByteSinkUtil::appendCodePoint(cpLength, result, sink, edits);
73c04bcf 159 }
73c04bcf 160 }
0f5d89e8 161 return TRUE;
f3c0d7a5
A
162}
163
164// See unicode/utf8.h U8_APPEND_UNSAFE().
0f5d89e8
A
165inline uint8_t getTwoByteLead(UChar32 c) { return (uint8_t)((c >> 6) | 0xc0); }
166inline uint8_t getTwoByteTrail(UChar32 c) { return (uint8_t)((c & 0x3f) | 0x80); }
a62d09fc 167
0f5d89e8 168UChar32 U_CALLCONV
73c04bcf
A
169utf8_caseContextIterator(void *context, int8_t dir) {
170 UCaseContext *csc=(UCaseContext *)context;
171 UChar32 c;
172
173 if(dir<0) {
174 /* reset for backward iteration */
175 csc->index=csc->cpStart;
176 csc->dir=dir;
177 } else if(dir>0) {
178 /* reset for forward iteration */
179 csc->index=csc->cpLimit;
180 csc->dir=dir;
181 } else {
182 /* continue current iteration direction */
183 dir=csc->dir;
184 }
185
186 if(dir<0) {
187 if(csc->start<csc->index) {
188 U8_PREV((const uint8_t *)csc->p, csc->start, csc->index, c);
189 return c;
190 }
191 } else {
192 if(csc->index<csc->limit) {
193 U8_NEXT((const uint8_t *)csc->p, csc->index, csc->limit, c);
194 return c;
195 }
196 }
197 return U_SENTINEL;
198}
199
0f5d89e8
A
200/**
201 * caseLocale >= 0: Lowercases [srcStart..srcLimit[ but takes context [0..srcLength[ into account.
202 * caseLocale < 0: Case-folds [srcStart..srcLimit[.
 *
 * Output is written to sink; when edits is not null the changes are recorded there.
 * Text that needs no change is flushed lazily: prev marks the start of the pending
 * unchanged run, which is passed to ByteSinkUtil::appendUnchanged() before each
 * changed code point and once more at the end.
 * csc is only dereferenced when caseLocale >= 0.
73c04bcf 203 */
0f5d89e8
A
204void toLower(int32_t caseLocale, uint32_t options,
205 const uint8_t *src, UCaseContext *csc, int32_t srcStart, int32_t srcLimit,
206 icu::ByteSink &sink, icu::Edits *edits, UErrorCode &errorCode) {
// Select the Latin fast-path table. The TR/LT table is used for Turkish or
// Lithuanian lowercasing, and for case folding with non-default fold options.
207 const int8_t *latinToLower;
208 if (caseLocale == UCASE_LOC_ROOT ||
209 (caseLocale >= 0 ?
210 !(caseLocale == UCASE_LOC_TURKISH || caseLocale == UCASE_LOC_LITHUANIAN) :
211 (options & _FOLD_CASE_OPTIONS_MASK) == U_FOLD_CASE_DEFAULT)) {
212 latinToLower = LatinCase::TO_LOWER_NORMAL;
213 } else {
214 latinToLower = LatinCase::TO_LOWER_TR_LT;
215 }
216 const UTrie2 *trie = ucase_getTrie();
217 int32_t prev = srcStart;
218 int32_t srcIndex = srcStart;
219 for (;;) {
220 // fast path for simple cases
f3c0d7a5 221 int32_t cpStart;
f3c0d7a5 222 UChar32 c;
0f5d89e8
A
223 for (;;) {
// Leave the fast path with c < 0 at the end of the range or once an error is set.
224 if (U_FAILURE(errorCode) || srcIndex >= srcLimit) {
225 c = U_SENTINEL;
226 break;
227 }
228 uint8_t lead = src[srcIndex++];
229 if (lead <= 0x7f) {
// ASCII: d is the delta to add, 0 for "no change", or EXC to defer to the slow path.
230 int8_t d = latinToLower[lead];
231 if (d == LatinCase::EXC) {
232 cpStart = srcIndex - 1;
233 c = lead;
234 break;
235 }
236 if (d == 0) { continue; }
237 ByteSinkUtil::appendUnchanged(src + prev, srcIndex - 1 - prev,
238 sink, options, edits, errorCode);
239 char ascii = (char)(lead + d);
240 sink.Append(&ascii, 1);
241 if (edits != nullptr) {
242 edits->addReplace(1, 1);
243 }
244 prev = srcIndex;
245 continue;
246 } else if (lead < 0xe3) {
247 uint8_t t;
// Two-byte sequence with lead C2..C5 and a valid trail byte: same table lookup as ASCII.
// Other leads below E3 fall through to the generic U8_NEXT handling below.
248 if (0xc2 <= lead && lead <= 0xc5 && srcIndex < srcLimit &&
249 (t = src[srcIndex] - 0x80) <= 0x3f) {
250 // U+0080..U+017F
251 ++srcIndex;
252 c = ((lead - 0xc0) << 6) | t;
253 int8_t d = latinToLower[c];
254 if (d == LatinCase::EXC) {
255 cpStart = srcIndex - 2;
256 break;
257 }
258 if (d == 0) { continue; }
259 ByteSinkUtil::appendUnchanged(src + prev, srcIndex - 2 - prev,
260 sink, options, edits, errorCode);
261 ByteSinkUtil::appendTwoBytes(c + d, sink);
262 if (edits != nullptr) {
263 edits->addReplace(2, 2);
264 }
265 prev = srcIndex;
266 continue;
267 }
268 } else if ((lead <= 0xe9 || lead == 0xeb || lead == 0xec) &&
269 (srcIndex + 2) <= srcLimit &&
270 U8_IS_TRAIL(src[srcIndex]) && U8_IS_TRAIL(src[srcIndex + 1])) {
271 // most of CJK: no case mappings
272 srcIndex += 2;
273 continue;
a62d09fc 274 }
0f5d89e8
A
// Generic case: step back to the lead byte, decode the code point and look up its properties.
275 cpStart = --srcIndex;
276 U8_NEXT(src, srcIndex, srcLimit, c);
277 if (c < 0) {
278 // ill-formed UTF-8
279 continue;
280 }
281 uint16_t props = UTRIE2_GET16(trie, c);
// Code points with exception data are handled by the slow path below.
282 if (UCASE_HAS_EXCEPTION(props)) { break; }
283 int32_t delta;
284 if (!UCASE_IS_UPPER_OR_TITLE(props) || (delta = UCASE_GET_DELTA(props)) == 0) {
285 continue;
286 }
287 ByteSinkUtil::appendUnchanged(src + prev, cpStart - prev,
288 sink, options, edits, errorCode);
289 ByteSinkUtil::appendCodePoint(srcIndex - cpStart, c + delta, sink, edits);
290 prev = srcIndex;
291 }
// c < 0 here only comes from the end-of-range/error check above (ill-formed input continues the loop).
292 if (c < 0) {
293 break;
f3c0d7a5 294 }
0f5d89e8 295 // slow path
f3c0d7a5 296 const UChar *s;
0f5d89e8
A
297 if (caseLocale >= 0) {
// Tell the context iterator where the current code point sits in the text.
298 csc->cpStart = cpStart;
299 csc->cpLimit = srcIndex;
300 c = ucase_toFullLower(c, utf8_caseContextIterator, csc, &s, caseLocale);
301 } else {
302 c = ucase_toFullFolding(c, &s, options);
303 }
// Only a result >= 0 is treated as a change; otherwise the code point stays in the pending unchanged run.
304 if (c >= 0) {
305 ByteSinkUtil::appendUnchanged(src + prev, cpStart - prev,
306 sink, options, edits, errorCode);
307 appendResult(srcIndex - cpStart, c, s, sink, options, edits, errorCode);
308 prev = srcIndex;
46f4442e
A
309 }
310 }
0f5d89e8
A
// Flush the trailing unchanged run.
311 ByteSinkUtil::appendUnchanged(src + prev, srcIndex - prev,
312 sink, options, edits, errorCode);
313}
46f4442e 314
0f5d89e8
A
// Uppercases src[0..srcLength[ into sink, recording changes in edits when it is not null.
// Text that needs no change is flushed lazily: prev marks the start of the pending
// unchanged run, which is passed to ByteSinkUtil::appendUnchanged() before each changed
// code point and once more at the end.
// csc is filled in with the current code point boundaries for the context iterator.
315void toUpper(int32_t caseLocale, uint32_t options,
316 const uint8_t *src, UCaseContext *csc, int32_t srcLength,
317 icu::ByteSink &sink, icu::Edits *edits, UErrorCode &errorCode) {
// Select the Latin fast-path table; Turkish has its own table.
318 const int8_t *latinToUpper;
319 if (caseLocale == UCASE_LOC_TURKISH) {
320 latinToUpper = LatinCase::TO_UPPER_TR;
321 } else {
322 latinToUpper = LatinCase::TO_UPPER_NORMAL;
323 }
324 const UTrie2 *trie = ucase_getTrie();
325 int32_t prev = 0;
326 int32_t srcIndex = 0;
327 for (;;) {
328 // fast path for simple cases
329 int32_t cpStart;
330 UChar32 c;
331 for (;;) {
// Leave the fast path with c < 0 at the end of the text or once an error is set.
332 if (U_FAILURE(errorCode) || srcIndex >= srcLength) {
333 c = U_SENTINEL;
334 break;
335 }
336 uint8_t lead = src[srcIndex++];
337 if (lead <= 0x7f) {
// ASCII: d is the delta to add, 0 for "no change", or EXC to defer to the slow path.
338 int8_t d = latinToUpper[lead];
339 if (d == LatinCase::EXC) {
340 cpStart = srcIndex - 1;
341 c = lead;
342 break;
343 }
344 if (d == 0) { continue; }
345 ByteSinkUtil::appendUnchanged(src + prev, srcIndex - 1 - prev,
346 sink, options, edits, errorCode);
347 char ascii = (char)(lead + d);
348 sink.Append(&ascii, 1);
349 if (edits != nullptr) {
350 edits->addReplace(1, 1);
351 }
352 prev = srcIndex;
353 continue;
354 } else if (lead < 0xe3) {
355 uint8_t t;
// Two-byte sequence with lead C2..C5 and a valid trail byte: same table lookup as ASCII.
// Other leads below E3 fall through to the generic U8_NEXT handling below.
356 if (0xc2 <= lead && lead <= 0xc5 && srcIndex < srcLength &&
357 (t = src[srcIndex] - 0x80) <= 0x3f) {
358 // U+0080..U+017F
359 ++srcIndex;
360 c = ((lead - 0xc0) << 6) | t;
361 int8_t d = latinToUpper[c];
362 if (d == LatinCase::EXC) {
363 cpStart = srcIndex - 2;
364 break;
365 }
366 if (d == 0) { continue; }
367 ByteSinkUtil::appendUnchanged(src + prev, srcIndex - 2 - prev,
368 sink, options, edits, errorCode);
369 ByteSinkUtil::appendTwoBytes(c + d, sink);
370 if (edits != nullptr) {
371 edits->addReplace(2, 2);
372 }
373 prev = srcIndex;
374 continue;
375 }
376 } else if ((lead <= 0xe9 || lead == 0xeb || lead == 0xec) &&
377 (srcIndex + 2) <= srcLength &&
378 U8_IS_TRAIL(src[srcIndex]) && U8_IS_TRAIL(src[srcIndex + 1])) {
379 // most of CJK: no case mappings
380 srcIndex += 2;
381 continue;
382 }
// Generic case: step back to the lead byte, decode the code point and look up its properties.
383 cpStart = --srcIndex;
384 U8_NEXT(src, srcIndex, srcLength, c);
385 if (c < 0) {
386 // ill-formed UTF-8
387 continue;
388 }
389 uint16_t props = UTRIE2_GET16(trie, c);
// Code points with exception data are handled by the slow path below.
390 if (UCASE_HAS_EXCEPTION(props)) { break; }
391 int32_t delta;
392 if (UCASE_GET_TYPE(props) != UCASE_LOWER || (delta = UCASE_GET_DELTA(props)) == 0) {
393 continue;
394 }
395 ByteSinkUtil::appendUnchanged(src + prev, cpStart - prev,
396 sink, options, edits, errorCode);
397 ByteSinkUtil::appendCodePoint(srcIndex - cpStart, c + delta, sink, edits);
398 prev = srcIndex;
399 }
// c < 0 here only comes from the end-of-text/error check above (ill-formed input continues the loop).
400 if (c < 0) {
401 break;
402 }
403 // slow path
404 csc->cpStart = cpStart;
405 csc->cpLimit = srcIndex;
406 const UChar *s;
407 c = ucase_toFullUpper(c, utf8_caseContextIterator, csc, &s, caseLocale);
// Only a result >= 0 is treated as a change; otherwise the code point stays in the pending unchanged run.
408 if (c >= 0) {
409 ByteSinkUtil::appendUnchanged(src + prev, cpStart - prev,
410 sink, options, edits, errorCode);
411 appendResult(srcIndex - cpStart, c, s, sink, options, edits, errorCode);
412 prev = srcIndex;
413 }
414 }
// Flush the trailing unchanged run.
415 ByteSinkUtil::appendUnchanged(src + prev, srcIndex - prev,
416 sink, options, edits, errorCode);
46f4442e
A
417}
418
0f5d89e8
A
419} // namespace
420
46f4442e
A
421#if !UCONFIG_NO_BREAK_ITERATION
422
// Titlecases UTF-8 text src[0..srcLength[ into sink.
// Segment boundaries come from iter (first()/next()); within each segment the first
// eligible character is titlecased and the remainder is lowercased or copied, depending
// on options. Changes are recorded in edits when it is not null.
// Returns early when the options are rejected by ustrcase_checkTitleAdjustmentOptions(),
// when an append helper returns false, or when errorCode indicates a failure after lowercasing.
0f5d89e8 423U_CFUNC void U_CALLCONV
f3c0d7a5
A
424ucasemap_internalUTF8ToTitle(
425 int32_t caseLocale, uint32_t options, BreakIterator *iter,
f3c0d7a5 426 const uint8_t *src, int32_t srcLength,
0f5d89e8 427 ByteSink &sink, icu::Edits *edits,
f3c0d7a5 428 UErrorCode &errorCode) {
0f5d89e8
A
429 if (!ustrcase_checkTitleAdjustmentOptions(options, errorCode)) {
430 return;
46f4442e 431 }
4388f060 432
46f4442e 433 /* set up local variables */
4388f060
A
434 UCaseContext csc=UCASECONTEXT_INITIALIZER;
435 csc.p=(void *)src;
436 csc.limit=srcLength;
f3c0d7a5
A
437 int32_t prev=0;
438 UBool isFirstIndex=TRUE;
46f4442e
A
439
440 /* titlecasing loop */
441 while(prev<srcLength) {
442 /* find next index where to titlecase */
f3c0d7a5 443 int32_t index;
46f4442e
A
444 if(isFirstIndex) {
445 isFirstIndex=FALSE;
f3c0d7a5 446 index=iter->first();
46f4442e 447 } else {
f3c0d7a5 448 index=iter->next();
46f4442e 449 }
f3c0d7a5
A
// Clamp the boundary: "done" or anything past the end means "up to the end of the text".
450 if(index==UBRK_DONE || index>srcLength) {
451 index=srcLength;
46f4442e
A
452 }
453
454 /*
0f5d89e8
A
455 * Segment [prev..index[ into 3 parts:
456 * a) skipped characters (copy as-is) [prev..titleStart[
457 * b) first letter (titlecase) [titleStart..titleLimit[
46f4442e
A
458 * c) subsequent characters (lowercase) [titleLimit..index[
459 */
f3c0d7a5 460 if(prev<index) {
0f5d89e8 461 /* find and copy skipped characters [prev..titleStart[ */
f3c0d7a5
A
462 int32_t titleStart=prev;
463 int32_t titleLimit=prev;
464 UChar32 c;
// Decode the first code point of the segment; titleLimit now points just after it.
465 U8_NEXT(src, titleLimit, index, c);
0f5d89e8
A
466 if ((options&U_TITLECASE_NO_BREAK_ADJUSTMENT)==0) {
467 // Adjust the titlecasing index to the next cased character,
468 // or to the next letter/number/symbol/private use.
469 // Stop with titleStart<titleLimit<=index
470 // if there is a character to be titlecased,
471 // or else stop with titleStart==titleLimit==index.
472 UBool toCased = (options&U_TITLECASE_ADJUST_TO_CASED) != 0;
473 while (toCased ? UCASE_NONE==ucase_getType(c) : !ustrcase_isLNS(c)) {
46f4442e 474 titleStart=titleLimit;
f3c0d7a5 475 if(titleLimit==index) {
46f4442e
A
476 break;
477 }
f3c0d7a5 478 U8_NEXT(src, titleLimit, index, c);
46f4442e 479 }
0f5d89e8
A
480 if (prev < titleStart) {
481 if (!ByteSinkUtil::appendUnchanged(src+prev, titleStart-prev,
482 sink, options, edits, errorCode)) {
483 return;
484 }
46f4442e
A
485 }
486 }
487
488 if(titleStart<titleLimit) {
489 /* titlecase c which is from [titleStart..titleLimit[ */
f3c0d7a5
A
490 if(c>=0) {
491 csc.cpStart=titleStart;
492 csc.cpLimit=titleLimit;
493 const UChar *s;
494 c=ucase_toFullTitle(c, utf8_caseContextIterator, &csc, &s, caseLocale);
0f5d89e8
A
495 if (!appendResult(titleLimit-titleStart, c, s, sink, options, edits, errorCode)) {
496 return;
497 }
f3c0d7a5
A
498 } else {
499 // Malformed UTF-8.
0f5d89e8
A
500 if (!ByteSinkUtil::appendUnchanged(src+titleStart, titleLimit-titleStart,
501 sink, options, edits, errorCode)) {
502 return;
503 }
a62d09fc 504 }
46f4442e 505
46f4442e 506 /* Special case Dutch IJ titlecasing */
f3c0d7a5
A
// For Dutch, when the titlecased byte was 'I' or 'i': a following 'j' is written as 'J'
// and a following 'J' is copied as-is. Either way titleLimit moves past it so that the
// lowercasing step below does not touch it.
507 if (titleStart+1 < index &&
508 caseLocale == UCASE_LOC_DUTCH &&
509 (src[titleStart] == 0x0049 || src[titleStart] == 0x0069)) {
510 if (src[titleStart+1] == 0x006A) {
0f5d89e8 511 ByteSinkUtil::appendCodePoint(1, 0x004A, sink, edits);
f3c0d7a5
A
512 titleLimit++;
513 } else if (src[titleStart+1] == 0x004A) {
514 // Keep the capital J from getting lowercased.
0f5d89e8
A
515 if (!ByteSinkUtil::appendUnchanged(src+titleStart+1, 1,
516 sink, options, edits, errorCode)) {
517 return;
f3c0d7a5
A
518 }
519 titleLimit++;
520 }
46f4442e 521 }
f3c0d7a5 522
46f4442e 523 /* lowercase [titleLimit..index[ */
f3c0d7a5
A
524 if(titleLimit<index) {
525 if((options&U_TITLECASE_NO_LOWERCASE)==0) {
46f4442e 526 /* Normal operation: Lowercase the rest of the word. */
0f5d89e8
A
527 toLower(caseLocale, options,
528 src, &csc, titleLimit, index,
529 sink, edits, errorCode);
f3c0d7a5 530 if(U_FAILURE(errorCode)) {
0f5d89e8 531 return;
a62d09fc 532 }
46f4442e
A
533 } else {
534 /* Optionally just copy the rest of the word unchanged. */
0f5d89e8
A
535 if (!ByteSinkUtil::appendUnchanged(src+titleLimit, index-titleLimit,
536 sink, options, edits, errorCode)) {
537 return;
a62d09fc 538 }
46f4442e
A
539 }
540 }
541 }
542 }
543
// Continue with the next segment.
f3c0d7a5 544 prev=index;
46f4442e 545 }
f3c0d7a5
A
546}
547
548#endif
549
550U_NAMESPACE_BEGIN
551namespace GreekUpper {
552
553UBool isFollowedByCasedLetter(const uint8_t *s, int32_t i, int32_t length) {
554 while (i < length) {
555 UChar32 c;
556 U8_NEXT(s, i, length, c);
557 int32_t type = ucase_getTypeOrIgnorable(c);
558 if ((type & UCASE_IGNORABLE) != 0) {
559 // Case-ignorable, continue with the loop.
560 } else if (type != UCASE_NONE) {
561 return TRUE; // Followed by cased letter.
562 } else {
563 return FALSE; // Uncased and not case-ignorable.
564 }
46f4442e 565 }
f3c0d7a5
A
566 return FALSE; // Not followed by cased letter.
567}
568
569// Keep this consistent with the UTF-16 version in ustrcase.cpp and the Java version in CaseMap.java.
// Greek-specific uppercasing of UTF-8 text src[0..srcLength[ into sink.
// Code points for which getLetterData() returns data are mapped with the Greek rules below,
// together with the Greek combining diacritics that follow them; all other well-formed
// code points go through ucase_toFullUpper(), and malformed bytes are passed through
// ByteSinkUtil::appendUnchanged().
// state/nextState carry the AFTER_CASED and AFTER_VOWEL_WITH_ACCENT flags from one
// code point to the next. Changes are recorded in edits when it is not null.
0f5d89e8
A
570void toUpper(uint32_t options,
571 const uint8_t *src, int32_t srcLength,
572 ByteSink &sink, Edits *edits,
573 UErrorCode &errorCode) {
f3c0d7a5
A
574 uint32_t state = 0;
575 for (int32_t i = 0; i < srcLength;) {
576 int32_t nextIndex = i;
577 UChar32 c;
578 U8_NEXT(src, nextIndex, srcLength, c);
579 uint32_t nextState = 0;
580 int32_t type = ucase_getTypeOrIgnorable(c);
581 if ((type & UCASE_IGNORABLE) != 0) {
582 // c is case-ignorable
583 nextState |= (state & AFTER_CASED);
584 } else if (type != UCASE_NONE) {
585 // c is cased
586 nextState |= AFTER_CASED;
587 }
588 uint32_t data = getLetterData(c);
589 if (data > 0) {
590 uint32_t upper = data & UPPER_MASK;
591 // Add a dialytika to this iota or ypsilon vowel
592 // if we removed a tonos from the previous vowel,
593 // and that previous vowel did not also have (or gain) a dialytika.
594 // Adding one only to the final vowel in a longer sequence
595 // (which does not occur in normal writing) would require lookahead.
596 // Set the same flag as for preserving an existing dialytika.
597 if ((data & HAS_VOWEL) != 0 && (state & AFTER_VOWEL_WITH_ACCENT) != 0 &&
598 (upper == 0x399 || upper == 0x3A5)) {
599 data |= HAS_DIALYTIKA;
600 }
601 int32_t numYpogegrammeni = 0; // Map each one to a trailing, spacing, capital iota.
602 if ((data & HAS_YPOGEGRAMMENI) != 0) {
603 numYpogegrammeni = 1;
604 }
605 // Skip combining diacritics after this Greek letter.
// nextNextIndex looks one code point ahead; nextIndex only advances past it
// when that code point is a Greek diacritic.
606 int32_t nextNextIndex = nextIndex;
607 while (nextIndex < srcLength) {
608 UChar32 c2;
609 U8_NEXT(src, nextNextIndex, srcLength, c2);
610 uint32_t diacriticData = getDiacriticData(c2);
611 if (diacriticData != 0) {
612 data |= diacriticData;
613 if ((diacriticData & HAS_YPOGEGRAMMENI) != 0) {
614 ++numYpogegrammeni;
615 }
616 nextIndex = nextNextIndex;
617 } else {
618 break; // not a Greek diacritic
619 }
620 }
621 if ((data & HAS_VOWEL_AND_ACCENT_AND_DIALYTIKA) == HAS_VOWEL_AND_ACCENT) {
622 nextState |= AFTER_VOWEL_WITH_ACCENT;
623 }
624 // Map according to Greek rules.
625 UBool addTonos = FALSE;
626 if (upper == 0x397 &&
627 (data & HAS_ACCENT) != 0 &&
628 numYpogegrammeni == 0 &&
629 (state & AFTER_CASED) == 0 &&
630 !isFollowedByCasedLetter(src, nextIndex, srcLength)) {
631 // Keep disjunctive "or" with (only) a tonos.
632 // We use the same "word boundary" conditions as for the Final_Sigma test.
// NOTE(review): nextIndex was already advanced past c by U8_NEXT at the top of the
// loop, so i == nextIndex looks like it can never be true here — confirm the intent.
633 if (i == nextIndex) {
634 upper = 0x389; // Preserve the precomposed form.
635 } else {
636 addTonos = TRUE;
637 }
638 } else if ((data & HAS_DIALYTIKA) != 0) {
639 // Preserve a vowel with dialytika in precomposed form if it exists.
640 if (upper == 0x399) {
641 upper = 0x3AA;
642 data &= ~HAS_EITHER_DIALYTIKA;
643 } else if (upper == 0x3A5) {
644 upper = 0x3AB;
645 data &= ~HAS_EITHER_DIALYTIKA;
646 }
647 }
648
0f5d89e8
A
// From here on "change" decides whether the mapped sequence is written to the sink.
649 UBool change;
650 if (edits == nullptr && (options & U_OMIT_UNCHANGED_TEXT) == 0) {
651 change = TRUE; // common, simple usage
652 } else {
f3c0d7a5
A
653 // Find out first whether we are changing the text.
654 U_ASSERT(0x370 <= upper && upper <= 0x3ff); // 2-byte UTF-8, main Greek block
// Compare the source bytes against the sequence that would be written:
// the uppercase letter, then an optional dialytika, then an optional tonos.
655 change = (i + 2) > nextIndex ||
656 src[i] != getTwoByteLead(upper) || src[i + 1] != getTwoByteTrail(upper) ||
657 numYpogegrammeni > 0;
658 int32_t i2 = i + 2;
659 if ((data & HAS_EITHER_DIALYTIKA) != 0) {
660 change |= (i2 + 2) > nextIndex ||
661 src[i2] != (uint8_t)u8"\u0308"[0] ||
662 src[i2 + 1] != (uint8_t)u8"\u0308"[1];
663 i2 += 2;
664 }
665 if (addTonos) {
666 change |= (i2 + 2) > nextIndex ||
667 src[i2] != (uint8_t)u8"\u0301"[0] ||
668 src[i2 + 1] != (uint8_t)u8"\u0301"[1];
669 i2 += 2;
670 }
671 int32_t oldLength = nextIndex - i;
672 int32_t newLength = (i2 - i) + numYpogegrammeni * 2; // 2 bytes per U+0399
673 change |= oldLength != newLength;
674 if (change) {
675 if (edits != NULL) {
676 edits->addReplace(oldLength, newLength);
677 }
678 } else {
679 if (edits != NULL) {
680 edits->addUnchanged(oldLength);
681 }
682 // Write unchanged text?
0f5d89e8 683 change = (options & U_OMIT_UNCHANGED_TEXT) == 0;
f3c0d7a5
A
684 }
685 }
686
687 if (change) {
0f5d89e8
A
688 ByteSinkUtil::appendTwoBytes(upper, sink);
689 if ((data & HAS_EITHER_DIALYTIKA) != 0) {
340931cb 690 sink.Append(reinterpret_cast<const char*>(u8"\u0308"), 2); // restore or add a dialytika
f3c0d7a5 691 }
0f5d89e8 692 if (addTonos) {
340931cb 693 sink.Append(reinterpret_cast<const char*>(u8"\u0301"), 2);
f3c0d7a5 694 }
0f5d89e8 695 while (numYpogegrammeni > 0) {
340931cb 696 sink.Append(reinterpret_cast<const char*>(u8"\u0399"), 2);
f3c0d7a5
A
697 --numYpogegrammeni;
698 }
f3c0d7a5
A
699 }
700 } else if(c>=0) {
// No Greek letter data: use the regular full uppercase mapping, without a context iterator.
701 const UChar *s;
702 c=ucase_toFullUpper(c, NULL, NULL, &s, UCASE_LOC_GREEK);
0f5d89e8
A
703 if (!appendResult(nextIndex - i, c, s, sink, options, edits, errorCode)) {
704 return;
f3c0d7a5
A
705 }
706 } else {
707 // Malformed UTF-8.
0f5d89e8
A
708 if (!ByteSinkUtil::appendUnchanged(src+i, nextIndex-i,
709 sink, options, edits, errorCode)) {
710 return;
f3c0d7a5
A
711 }
712 }
713 i = nextIndex;
714 state = nextState;
715 }
46f4442e
A
716}
717
f3c0d7a5
A
718} // namespace GreekUpper
719U_NAMESPACE_END
46f4442e 720
0f5d89e8 721static void U_CALLCONV
f3c0d7a5 722ucasemap_internalUTF8ToLower(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_UNUSED
4388f060 723 const uint8_t *src, int32_t srcLength,
0f5d89e8 724 icu::ByteSink &sink, icu::Edits *edits,
f3c0d7a5 725 UErrorCode &errorCode) {
4388f060
A
726 UCaseContext csc=UCASECONTEXT_INITIALIZER;
727 csc.p=(void *)src;
728 csc.limit=srcLength;
0f5d89e8
A
729 toLower(
730 caseLocale, options,
4388f060 731 src, &csc, 0, srcLength,
0f5d89e8 732 sink, edits, errorCode);
4388f060
A
733}
734
0f5d89e8 735static void U_CALLCONV
f3c0d7a5 736ucasemap_internalUTF8ToUpper(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_UNUSED
4388f060 737 const uint8_t *src, int32_t srcLength,
0f5d89e8 738 icu::ByteSink &sink, icu::Edits *edits,
f3c0d7a5 739 UErrorCode &errorCode) {
f3c0d7a5 740 if (caseLocale == UCASE_LOC_GREEK) {
0f5d89e8 741 GreekUpper::toUpper(options, src, srcLength, sink, edits, errorCode);
f3c0d7a5
A
742 } else {
743 UCaseContext csc=UCASECONTEXT_INITIALIZER;
744 csc.p=(void *)src;
745 csc.limit=srcLength;
0f5d89e8
A
746 toUpper(
747 caseLocale, options,
748 src, &csc, srcLength,
749 sink, edits, errorCode);
f3c0d7a5 750 }
4388f060
A
751}
752
0f5d89e8 753static void U_CALLCONV
f3c0d7a5 754ucasemap_internalUTF8Fold(int32_t /* caseLocale */, uint32_t options, UCASEMAP_BREAK_ITERATOR_UNUSED
f3c0d7a5 755 const uint8_t *src, int32_t srcLength,
0f5d89e8 756 icu::ByteSink &sink, icu::Edits *edits,
f3c0d7a5 757 UErrorCode &errorCode) {
0f5d89e8
A
758 toLower(
759 -1, options,
760 src, nullptr, 0, srcLength,
761 sink, edits, errorCode);
762}
763
764void
765ucasemap_mapUTF8(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM
766 const char *src, int32_t srcLength,
767 UTF8CaseMapper *stringCaseMapper,
768 icu::ByteSink &sink, icu::Edits *edits,
769 UErrorCode &errorCode) {
770 /* check argument values */
771 if (U_FAILURE(errorCode)) {
772 return;
773 }
774 if ((src == nullptr && srcLength != 0) || srcLength < -1) {
775 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
776 return;
777 }
778
779 // Get the string length.
780 if (srcLength == -1) {
781 srcLength = (int32_t)uprv_strlen((const char *)src);
73c04bcf
A
782 }
783
0f5d89e8
A
784 if (edits != nullptr && (options & U_EDITS_NO_RESET) == 0) {
785 edits->reset();
786 }
787 stringCaseMapper(caseLocale, options, UCASEMAP_BREAK_ITERATOR
788 (const uint8_t *)src, srcLength, sink, edits, errorCode);
789 sink.Flush();
790 if (U_SUCCESS(errorCode)) {
791 if (edits != nullptr) {
792 edits->copyErrorTo(errorCode);
793 }
794 }
4388f060 795}
73c04bcf 796
0f5d89e8 797int32_t
f3c0d7a5 798ucasemap_mapUTF8(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM
0f5d89e8
A
799 char *dest, int32_t destCapacity,
800 const char *src, int32_t srcLength,
4388f060 801 UTF8CaseMapper *stringCaseMapper,
f3c0d7a5
A
802 icu::Edits *edits,
803 UErrorCode &errorCode) {
73c04bcf 804 /* check argument values */
f3c0d7a5 805 if(U_FAILURE(errorCode)) {
73c04bcf
A
806 return 0;
807 }
808 if( destCapacity<0 ||
809 (dest==NULL && destCapacity>0) ||
0f5d89e8 810 (src==NULL && srcLength!=0) || srcLength<-1
73c04bcf 811 ) {
f3c0d7a5 812 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
73c04bcf
A
813 return 0;
814 }
815
816 /* get the string length */
817 if(srcLength==-1) {
729e4ab9 818 srcLength=(int32_t)uprv_strlen((const char *)src);
73c04bcf
A
819 }
820
821 /* check for overlapping source and destination */
822 if( dest!=NULL &&
823 ((src>=dest && src<(dest+destCapacity)) ||
824 (dest>=src && dest<(src+srcLength)))
825 ) {
f3c0d7a5 826 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
73c04bcf
A
827 return 0;
828 }
829
0f5d89e8
A
830 CheckedArrayByteSink sink(dest, destCapacity);
831 if (edits != nullptr && (options & U_EDITS_NO_RESET) == 0) {
f3c0d7a5
A
832 edits->reset();
833 }
0f5d89e8
A
834 stringCaseMapper(caseLocale, options, UCASEMAP_BREAK_ITERATOR
835 (const uint8_t *)src, srcLength, sink, edits, errorCode);
836 sink.Flush();
837 if (U_SUCCESS(errorCode)) {
838 if (sink.Overflowed()) {
839 errorCode = U_BUFFER_OVERFLOW_ERROR;
840 } else if (edits != nullptr) {
841 edits->copyErrorTo(errorCode);
842 }
843 }
844 return u_terminateChars(dest, destCapacity, sink.NumberOfBytesAppended(), &errorCode);
73c04bcf
A
845}
846
847/* public API functions */
848
46f4442e 849U_CAPI int32_t U_EXPORT2
73c04bcf
A
850ucasemap_utf8ToLower(const UCaseMap *csm,
851 char *dest, int32_t destCapacity,
852 const char *src, int32_t srcLength,
853 UErrorCode *pErrorCode) {
f3c0d7a5
A
854 return ucasemap_mapUTF8(
855 csm->caseLocale, csm->options, UCASEMAP_BREAK_ITERATOR_NULL
0f5d89e8
A
856 dest, destCapacity,
857 src, srcLength,
f3c0d7a5 858 ucasemap_internalUTF8ToLower, NULL, *pErrorCode);
73c04bcf
A
859}
860
46f4442e 861U_CAPI int32_t U_EXPORT2
73c04bcf
A
862ucasemap_utf8ToUpper(const UCaseMap *csm,
863 char *dest, int32_t destCapacity,
864 const char *src, int32_t srcLength,
865 UErrorCode *pErrorCode) {
f3c0d7a5
A
866 return ucasemap_mapUTF8(
867 csm->caseLocale, csm->options, UCASEMAP_BREAK_ITERATOR_NULL
0f5d89e8
A
868 dest, destCapacity,
869 src, srcLength,
f3c0d7a5 870 ucasemap_internalUTF8ToUpper, NULL, *pErrorCode);
46f4442e
A
871}
872
46f4442e
A
873U_CAPI int32_t U_EXPORT2
874ucasemap_utf8FoldCase(const UCaseMap *csm,
875 char *dest, int32_t destCapacity,
876 const char *src, int32_t srcLength,
877 UErrorCode *pErrorCode) {
f3c0d7a5
A
878 return ucasemap_mapUTF8(
879 UCASE_LOC_ROOT, csm->options, UCASEMAP_BREAK_ITERATOR_NULL
0f5d89e8
A
880 dest, destCapacity,
881 src, srcLength,
f3c0d7a5
A
882 ucasemap_internalUTF8Fold, NULL, *pErrorCode);
883}
884
885U_NAMESPACE_BEGIN
886
0f5d89e8
A
887void CaseMap::utf8ToLower(
888 const char *locale, uint32_t options,
889 StringPiece src, ByteSink &sink, Edits *edits,
890 UErrorCode &errorCode) {
891 ucasemap_mapUTF8(
892 ustrcase_getCaseLocale(locale), options, UCASEMAP_BREAK_ITERATOR_NULL
893 src.data(), src.length(),
894 ucasemap_internalUTF8ToLower, sink, edits, errorCode);
895}
896
897void CaseMap::utf8ToUpper(
898 const char *locale, uint32_t options,
899 StringPiece src, ByteSink &sink, Edits *edits,
900 UErrorCode &errorCode) {
901 ucasemap_mapUTF8(
902 ustrcase_getCaseLocale(locale), options, UCASEMAP_BREAK_ITERATOR_NULL
903 src.data(), src.length(),
904 ucasemap_internalUTF8ToUpper, sink, edits, errorCode);
905}
906
907void CaseMap::utf8Fold(
908 uint32_t options,
909 StringPiece src, ByteSink &sink, Edits *edits,
910 UErrorCode &errorCode) {
911 ucasemap_mapUTF8(
912 UCASE_LOC_ROOT, options, UCASEMAP_BREAK_ITERATOR_NULL
913 src.data(), src.length(),
914 ucasemap_internalUTF8Fold, sink, edits, errorCode);
915}
916
f3c0d7a5
A
917int32_t CaseMap::utf8ToLower(
918 const char *locale, uint32_t options,
919 const char *src, int32_t srcLength,
920 char *dest, int32_t destCapacity, Edits *edits,
921 UErrorCode &errorCode) {
922 return ucasemap_mapUTF8(
923 ustrcase_getCaseLocale(locale), options, UCASEMAP_BREAK_ITERATOR_NULL
0f5d89e8
A
924 dest, destCapacity,
925 src, srcLength,
f3c0d7a5
A
926 ucasemap_internalUTF8ToLower, edits, errorCode);
927}
928
929int32_t CaseMap::utf8ToUpper(
930 const char *locale, uint32_t options,
931 const char *src, int32_t srcLength,
932 char *dest, int32_t destCapacity, Edits *edits,
933 UErrorCode &errorCode) {
934 return ucasemap_mapUTF8(
935 ustrcase_getCaseLocale(locale), options, UCASEMAP_BREAK_ITERATOR_NULL
0f5d89e8
A
936 dest, destCapacity,
937 src, srcLength,
f3c0d7a5 938 ucasemap_internalUTF8ToUpper, edits, errorCode);
46f4442e 939}
f3c0d7a5
A
940
941int32_t CaseMap::utf8Fold(
942 uint32_t options,
943 const char *src, int32_t srcLength,
944 char *dest, int32_t destCapacity, Edits *edits,
945 UErrorCode &errorCode) {
946 return ucasemap_mapUTF8(
947 UCASE_LOC_ROOT, options, UCASEMAP_BREAK_ITERATOR_NULL
0f5d89e8
A
948 dest, destCapacity,
949 src, srcLength,
f3c0d7a5
A
950 ucasemap_internalUTF8Fold, edits, errorCode);
951}
952
953U_NAMESPACE_END