// Source: git.saurik.com Git - apple/icu.git/blob - icuSources/common/ucasemap.cpp
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
*
*
*******************************************************************************
*   file name:  ucasemap.cpp
*   encoding:   UTF-8
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 2005may06
*   created by: Markus W. Scherer
*
*   Case mapping service object and functions using it.
*/

21 #include "unicode/utypes.h"

22 #include "unicode/brkiter.h"

23 #include "unicode/bytestream.h"

24 #include "unicode/casemap.h"

25 #include "unicode/edits.h"

26 #include "unicode/stringoptions.h"

27 #include "unicode/stringpiece.h"

28 #include "unicode/ubrk.h"

29 #include "unicode/uloc.h"

30 #include "unicode/ustring.h"

31 #include "unicode/ucasemap.h"

32 #if !UCONFIG_NO_BREAK_ITERATION

33 #include "unicode/utext.h"

34 #endif

35 #include "unicode/utf.h"

36 #include "unicode/utf8.h"

37 #include "unicode/utf16.h"

38 #include "bytesinkutil.h"

39 #include "cmemory.h"

40 #include "cstring.h"

41 #include "uassert.h"

42 #include "ucase.h"

43 #include "ucasemap_imp.h"

44 #include "ustr_imp.h"

46 U_NAMESPACE_USE

48 /* UCaseMap service object -------------------------------------------------- */

 UCaseMap::UCaseMap(const char *localeID, uint32_t opts, UErrorCode *pErrorCode) :

51 #if !UCONFIG_NO_BREAK_ITERATION

52 iter(NULL),

53 #endif

         caseLocale(UCASE_LOC_UNKNOWN), options(opts) {

     ucasemap_setLocale(this, localeID, pErrorCode);

56 }

58 UCaseMap::~UCaseMap() {

59 #if !UCONFIG_NO_BREAK_ITERATION

60 delete iter;

61 #endif

62 }

64 U_CAPI UCaseMap * U_EXPORT2

 ucasemap_open(const char *locale, uint32_t options, UErrorCode *pErrorCode) {

     if(U_FAILURE(*pErrorCode)) {

67 return NULL;

68 }

     UCaseMap *csm = new UCaseMap(locale, options, pErrorCode);

70 if(csm==NULL) {

71 *pErrorCode = U_MEMORY_ALLOCATION_ERROR;

72 return NULL;

     } else if (U_FAILURE(*pErrorCode)) {

74 delete csm;

75 return NULL;

76 }

77 return csm;

78 }

80 U_CAPI void U_EXPORT2

81 ucasemap_close(UCaseMap *csm) {

82 delete csm;

83 }

85 U_CAPI const char * U_EXPORT2

 ucasemap_getLocale(const UCaseMap *csm) {

87 return csm->locale;

88 }

90 U_CAPI uint32_t U_EXPORT2

 ucasemap_getOptions(const UCaseMap *csm) {

92 return csm->options;

93 }

95 U_CAPI void U_EXPORT2

 ucasemap_setLocale(UCaseMap *csm, const char *locale, UErrorCode *pErrorCode) {

     if(U_FAILURE(*pErrorCode)) {

98 return;

99 }

     if (locale != NULL && *locale == 0) {

         csm->locale[0] = 0;

102 csm->caseLocale = UCASE_LOC_ROOT;

103 return;

104 }

105

     int32_t length=uloc_getName(locale, csm->locale, (int32_t)sizeof(csm->locale), pErrorCode);

     if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR || length==sizeof(csm->locale)) {

108 *pErrorCode=U_ZERO_ERROR;

109 /* we only really need the language code for case mappings */

         length=uloc_getLanguage(locale, csm->locale, (int32_t)sizeof(csm->locale), pErrorCode);

111 }

     if(length==sizeof(csm->locale)) {

113 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;

114 }

     if(U_SUCCESS(*pErrorCode)) {

116 csm->caseLocale=UCASE_LOC_UNKNOWN;

         csm->caseLocale = ucase_getCaseLocale(csm->locale);

118 } else {

         csm->locale[0]=0;

120 csm->caseLocale = UCASE_LOC_ROOT;

121 }

122 }

123

124 U_CAPI void U_EXPORT2

 ucasemap_setOptions(UCaseMap *csm, uint32_t options, UErrorCode *pErrorCode) {

     if(U_FAILURE(*pErrorCode)) {

127 return;

128 }

129 csm->options=options;

130 }

131

/* UTF-8 string case mappings ----------------------------------------------- */

/* TODO(markus): Move to a new, separate utf8case.cpp file. */

135

136 namespace {

137

138 /* append a full case mapping result, see UCASE_MAX_STRING_LENGTH */

139 inline UBool

 appendResult(int32_t cpLength, int32_t result, const UChar *s,

              ByteSink &sink, uint32_t options, icu::Edits *edits, UErrorCode &errorCode) {

     U_ASSERT(U_SUCCESS(errorCode));

143

144 /* decode the result */

     if(result<0) {

146 /* (not) original code point */

147 if(edits!=NULL) {

148 edits->addUnchanged(cpLength);

149 }

         if((options & U_OMIT_UNCHANGED_TEXT) == 0) {

             ByteSinkUtil::appendCodePoint(cpLength, ~result, sink);

152 }

153 } else {

154 if(result<=UCASE_MAX_STRING_LENGTH) {

155 // string: "result" is the UTF-16 length

             return ByteSinkUtil::appendChange(cpLength, s, result, sink, edits, errorCode);

157 } else {

             ByteSinkUtil::appendCodePoint(cpLength, result, sink, edits);

159 }

160 }

161 return TRUE;

162 }

163

// See unicode/utf8.h U8_APPEND_UNSAFE().

// First byte of a two-byte UTF-8 sequence for c: 0xC0 combined with the bits above the low six.
// No range check is done here; the caller in this file asserts 0x370..0x3ff before using it.
inline uint8_t getTwoByteLead(UChar32 c) {
    return static_cast<uint8_t>(0xc0 | (c >> 6));
}

// Second byte of a two-byte UTF-8 sequence for c: 0x80 combined with the low six bits.
inline uint8_t getTwoByteTrail(UChar32 c) {
    return static_cast<uint8_t>(0x80 | (c & 0x3f));
}

167

168 UChar32 U_CALLCONV

 utf8_caseContextIterator(void *context, int8_t dir) {

170 UCaseContext *csc=(UCaseContext *)context;

171 UChar32 c;

172

     if(dir<0) {

174 /* reset for backward iteration */

175 csc->index=csc->cpStart;

176 csc->dir=dir;

     } else if(dir>0) {

178 /* reset for forward iteration */

179 csc->index=csc->cpLimit;

180 csc->dir=dir;

181 } else {

182 /* continue current iteration direction */

183 dir=csc->dir;

184 }

185

     if(dir<0) {

         if(csc->start<csc->index) {

             U8_PREV((const uint8_t *)csc->p, csc->start, csc->index, c);

189 return c;

190 }

191 } else {

         if(csc->index<csc->limit) {

             U8_NEXT((const uint8_t *)csc->p, csc->index, csc->limit, c);

194 return c;

195 }

196 }

197 return U_SENTINEL;

198 }

199

200 /**

201 * caseLocale >= 0: Lowercases [srcStart..srcLimit[ but takes context [0..srcLength[ into account.

202 * caseLocale < 0: Case-folds [srcStart..srcLimit[.

203 */

 void toLower(int32_t caseLocale, uint32_t options,

              const uint8_t *src, UCaseContext *csc, int32_t srcStart, int32_t srcLimit,

              icu::ByteSink &sink, icu::Edits *edits, UErrorCode &errorCode) {

207 const int8_t *latinToLower;

208 if (caseLocale == UCASE_LOC_ROOT ||

209 (caseLocale >= 0 ?

                 !(caseLocale == UCASE_LOC_TURKISH || caseLocale == UCASE_LOC_LITHUANIAN) :

211 (options & _FOLD_CASE_OPTIONS_MASK) == U_FOLD_CASE_DEFAULT)) {

212 latinToLower = LatinCase::TO_LOWER_NORMAL;

213 } else {

214 latinToLower = LatinCase::TO_LOWER_TR_LT;

215 }

     const UTrie2 *trie = ucase_getTrie();

217 int32_t prev = srcStart;

218 int32_t srcIndex = srcStart;

219 for (;;) {

220 // fast path for simple cases

221 int32_t cpStart;

222 UChar32 c;

223 for (;;) {

             if (U_FAILURE(errorCode) || srcIndex >= srcLimit) {

225 c = U_SENTINEL;

226 break;

227 }

228 uint8_t lead = src[srcIndex++];

             if (lead <= 0x7f) {

230 int8_t d = latinToLower[lead];

                 if (d == LatinCase::EXC) {

232 cpStart = srcIndex - 1;

233 c = lead;

234 break;

235 }

                 if (d == 0) { continue; }

                 ByteSinkUtil::appendUnchanged(src + prev, srcIndex - 1 - prev,

238 sink, options, edits, errorCode);

                 char ascii = (char)(lead + d);

                 sink.Append(&ascii, 1);

                 if (edits != nullptr) {

                     edits->addReplace(1, 1);

243 }

244 prev = srcIndex;

245 continue;

             } else if (lead < 0xe3) {

247 uint8_t t;

                 if (0xc2 <= lead && lead <= 0xc5 && srcIndex < srcLimit &&

                         (t = src[srcIndex] - 0x80) <= 0x3f) {

250 // U+0080..U+017F

251 ++srcIndex;

                     c = ((lead - 0xc0) << 6) | t;

253 int8_t d = latinToLower[c];

                     if (d == LatinCase::EXC) {

255 cpStart = srcIndex - 2;

256 break;

257 }

                     if (d == 0) { continue; }

                     ByteSinkUtil::appendUnchanged(src + prev, srcIndex - 2 - prev,

260 sink, options, edits, errorCode);

                     ByteSinkUtil::appendTwoBytes(c + d, sink);

                     if (edits != nullptr) {

                         edits->addReplace(2, 2);

264 }

265 prev = srcIndex;

266 continue;

267 }

             } else if ((lead <= 0xe9 || lead == 0xeb || lead == 0xec) &&

                     (srcIndex + 2) <= srcLimit &&

                     U8_IS_TRAIL(src[srcIndex]) && U8_IS_TRAIL(src[srcIndex + 1])) {

271 // most of CJK: no case mappings

272 srcIndex += 2;

273 continue;

274 }

275 cpStart = --srcIndex;

             U8_NEXT(src, srcIndex, srcLimit, c);

             if (c < 0) {

278 // ill-formed UTF-8

279 continue;

280 }

             uint16_t props = UTRIE2_GET16(trie, c);

             if (UCASE_HAS_EXCEPTION(props)) { break; }

283 int32_t delta;

             if (!UCASE_IS_UPPER_OR_TITLE(props) || (delta = UCASE_GET_DELTA(props)) == 0) {

285 continue;

286 }

             ByteSinkUtil::appendUnchanged(src + prev, cpStart - prev,

288 sink, options, edits, errorCode);

             ByteSinkUtil::appendCodePoint(srcIndex - cpStart, c + delta, sink, edits);

290 prev = srcIndex;

291 }

         if (c < 0) {

293 break;

294 }

295 // slow path

296 const UChar *s;

         if (caseLocale >= 0) {

298 csc->cpStart = cpStart;

299 csc->cpLimit = srcIndex;

             c = ucase_toFullLower(c, utf8_caseContextIterator, csc, &s, caseLocale);

301 } else {

             c = ucase_toFullFolding(c, &s, options);

303 }

         if (c >= 0) {

             ByteSinkUtil::appendUnchanged(src + prev, cpStart - prev,

306 sink, options, edits, errorCode);

             appendResult(srcIndex - cpStart, c, s, sink, options, edits, errorCode);

308 prev = srcIndex;

309 }

310 }

     ByteSinkUtil::appendUnchanged(src + prev, srcIndex - prev,

312 sink, options, edits, errorCode);

313 }

314

 void toUpper(int32_t caseLocale, uint32_t options,

              const uint8_t *src, UCaseContext *csc, int32_t srcLength,

              icu::ByteSink &sink, icu::Edits *edits, UErrorCode &errorCode) {

318 const int8_t *latinToUpper;

319 if (caseLocale == UCASE_LOC_TURKISH) {

320 latinToUpper = LatinCase::TO_UPPER_TR;

321 } else {

322 latinToUpper = LatinCase::TO_UPPER_NORMAL;

323 }

     const UTrie2 *trie = ucase_getTrie();

325 int32_t prev = 0;

326 int32_t srcIndex = 0;

327 for (;;) {

328 // fast path for simple cases

329 int32_t cpStart;

330 UChar32 c;

331 for (;;) {

             if (U_FAILURE(errorCode) || srcIndex >= srcLength) {

333 c = U_SENTINEL;

334 break;

335 }

336 uint8_t lead = src[srcIndex++];

             if (lead <= 0x7f) {

338 int8_t d = latinToUpper[lead];

                 if (d == LatinCase::EXC) {

340 cpStart = srcIndex - 1;

341 c = lead;

342 break;

343 }

                 if (d == 0) { continue; }

                 ByteSinkUtil::appendUnchanged(src + prev, srcIndex - 1 - prev,

346 sink, options, edits, errorCode);

                 char ascii = (char)(lead + d);

                 sink.Append(&ascii, 1);

                 if (edits != nullptr) {

                     edits->addReplace(1, 1);

351 }

352 prev = srcIndex;

353 continue;

             } else if (lead < 0xe3) {

355 uint8_t t;

                 if (0xc2 <= lead && lead <= 0xc5 && srcIndex < srcLength &&

                         (t = src[srcIndex] - 0x80) <= 0x3f) {

358 // U+0080..U+017F

359 ++srcIndex;

                     c = ((lead - 0xc0) << 6) | t;

361 int8_t d = latinToUpper[c];

                     if (d == LatinCase::EXC) {

363 cpStart = srcIndex - 2;

364 break;

365 }

                     if (d == 0) { continue; }

                     ByteSinkUtil::appendUnchanged(src + prev, srcIndex - 2 - prev,

368 sink, options, edits, errorCode);

                     ByteSinkUtil::appendTwoBytes(c + d, sink);

                     if (edits != nullptr) {

                         edits->addReplace(2, 2);

372 }

373 prev = srcIndex;

374 continue;

375 }

             } else if ((lead <= 0xe9 || lead == 0xeb || lead == 0xec) &&

                     (srcIndex + 2) <= srcLength &&

                     U8_IS_TRAIL(src[srcIndex]) && U8_IS_TRAIL(src[srcIndex + 1])) {

379 // most of CJK: no case mappings

380 srcIndex += 2;

381 continue;

382 }

383 cpStart = --srcIndex;

             U8_NEXT(src, srcIndex, srcLength, c);

             if (c < 0) {

386 // ill-formed UTF-8

387 continue;

388 }

             uint16_t props = UTRIE2_GET16(trie, c);

             if (UCASE_HAS_EXCEPTION(props)) { break; }

391 int32_t delta;

             if (UCASE_GET_TYPE(props) != UCASE_LOWER || (delta = UCASE_GET_DELTA(props)) == 0) {

393 continue;

394 }

             ByteSinkUtil::appendUnchanged(src + prev, cpStart - prev,

396 sink, options, edits, errorCode);

             ByteSinkUtil::appendCodePoint(srcIndex - cpStart, c + delta, sink, edits);

398 prev = srcIndex;

399 }

         if (c < 0) {

401 break;

402 }

403 // slow path

404 csc->cpStart = cpStart;

405 csc->cpLimit = srcIndex;

406 const UChar *s;

         c = ucase_toFullUpper(c, utf8_caseContextIterator, csc, &s, caseLocale);

         if (c >= 0) {

             ByteSinkUtil::appendUnchanged(src + prev, cpStart - prev,

410 sink, options, edits, errorCode);

             appendResult(srcIndex - cpStart, c, s, sink, options, edits, errorCode);

412 prev = srcIndex;

413 }

414 }

     ByteSinkUtil::appendUnchanged(src + prev, srcIndex - prev,

416 sink, options, edits, errorCode);

417 }

418

419 } // namespace

420

421 #if !UCONFIG_NO_BREAK_ITERATION

422

423 U_CFUNC void U_CALLCONV

424 ucasemap_internalUTF8ToTitle(

         int32_t caseLocale, uint32_t options, BreakIterator *iter,

         const uint8_t *src, int32_t srcLength,

         ByteSink &sink, icu::Edits *edits,

428 UErrorCode &errorCode) {

     if (!ustrcase_checkTitleAdjustmentOptions(options, errorCode)) {

430 return;

431 }

432

433 /* set up local variables */

434 UCaseContext csc=UCASECONTEXT_INITIALIZER;

     csc.p=(void *)src;

436 csc.limit=srcLength;

437 int32_t prev=0;

438 UBool isFirstIndex=TRUE;

439

440 /* titlecasing loop */

441 while(prev<srcLength) {

442 /* find next index where to titlecase */

443 int32_t index;

444 if(isFirstIndex) {

445 isFirstIndex=FALSE;

446 index=iter->first();

447 } else {

448 index=iter->next();

449 }

         if(index==UBRK_DONE || index>srcLength) {

451 index=srcLength;

452 }

453

454 /*

455 * Segment [prev..index[ into 3 parts:

456 * a) skipped characters (copy as-is) [prev..titleStart[

457 * b) first letter (titlecase) [titleStart..titleLimit[

458 * c) subsequent characters (lowercase) [titleLimit..index[

459 */

460 if(prev<index) {

461 /* find and copy skipped characters [prev..titleStart[ */

462 int32_t titleStart=prev;

463 int32_t titleLimit=prev;

464 UChar32 c;

             U8_NEXT(src, titleLimit, index, c);

             if ((options&U_TITLECASE_NO_BREAK_ADJUSTMENT)==0) {

467 // Adjust the titlecasing index to the next cased character,

468 // or to the next letter/number/symbol/private use.

469 // Stop with titleStart<titleLimit<=index

470 // if there is a character to be titlecased,

471 // or else stop with titleStart==titleLimit==index.

                 UBool toCased = (options&U_TITLECASE_ADJUST_TO_CASED) != 0;

                 while (toCased ? UCASE_NONE==ucase_getType(c) : !ustrcase_isLNS(c)) {

474 titleStart=titleLimit;

475 if(titleLimit==index) {

476 break;

477 }

                     U8_NEXT(src, titleLimit, index, c);

479 }

480 if (prev < titleStart) {

                     if (!ByteSinkUtil::appendUnchanged(src+prev, titleStart-prev,

482 sink, options, edits, errorCode)) {

483 return;

484 }

485 }

486 }

487

488 if(titleStart<titleLimit) {

489 /* titlecase c which is from [titleStart..titleLimit[ */

                 if(c>=0) {

491 csc.cpStart=titleStart;

492 csc.cpLimit=titleLimit;

493 const UChar *s;

                     c=ucase_toFullTitle(c, utf8_caseContextIterator, &csc, &s, caseLocale);

                     if (!appendResult(titleLimit-titleStart, c, s, sink, options, edits, errorCode)) {

496 return;

497 }

498 } else {

499 // Malformed UTF-8.

                     if (!ByteSinkUtil::appendUnchanged(src+titleStart, titleLimit-titleStart,

501 sink, options, edits, errorCode)) {

502 return;

503 }

504 }

505

506 /* Special case Dutch IJ titlecasing */

                 if (titleStart+1 < index &&

508 caseLocale == UCASE_LOC_DUTCH &&

                         (src[titleStart] == 0x0049 || src[titleStart] == 0x0069)) {

                     if (src[titleStart+1] == 0x006A) {

                         ByteSinkUtil::appendCodePoint(1, 0x004A, sink, edits);

512 titleLimit++;

                     } else if (src[titleStart+1] == 0x004A) {

514 // Keep the capital J from getting lowercased.

                         if (!ByteSinkUtil::appendUnchanged(src+titleStart+1, 1,

516 sink, options, edits, errorCode)) {

517 return;

518 }

519 titleLimit++;

520 }

521 }

522

523 /* lowercase [titleLimit..index[ */

524 if(titleLimit<index) {

                     if((options&U_TITLECASE_NO_LOWERCASE)==0) {

526 /* Normal operation: Lowercase the rest of the word. */

527 toLower(caseLocale, options,

528 src, &csc, titleLimit, index,

529 sink, edits, errorCode);

                         if(U_FAILURE(errorCode)) {

531 return;

532 }

533 } else {

534 /* Optionally just copy the rest of the word unchanged. */

                         if (!ByteSinkUtil::appendUnchanged(src+titleLimit, index-titleLimit,

536 sink, options, edits, errorCode)) {

537 return;

538 }

539 }

540 }

541 }

542 }

543

544 prev=index;

545 }

546 }

547

548 #endif

549

550 U_NAMESPACE_BEGIN

551 namespace GreekUpper {

552

 UBool isFollowedByCasedLetter(const uint8_t *s, int32_t i, int32_t length) {

554 while (i < length) {

555 UChar32 c;

         U8_NEXT(s, i, length, c);

         int32_t type = ucase_getTypeOrIgnorable(c);

         if ((type & UCASE_IGNORABLE) != 0) {

559 // Case-ignorable, continue with the loop.

         } else if (type != UCASE_NONE) {

561 return TRUE; // Followed by cased letter.

562 } else {

563 return FALSE; // Uncased and not case-ignorable.

564 }

565 }

566 return FALSE; // Not followed by cased letter.

567 }

568

569 // Keep this consistent with the UTF-16 version in ustrcase.cpp and the Java version in CaseMap.java.

 void toUpper(uint32_t options,

              const uint8_t *src, int32_t srcLength,

572 ByteSink &sink, Edits *edits,

573 UErrorCode &errorCode) {

574 uint32_t state = 0;

     for (int32_t i = 0; i < srcLength;) {

576 int32_t nextIndex = i;

577 UChar32 c;

         U8_NEXT(src, nextIndex, srcLength, c);

579 uint32_t nextState = 0;

         int32_t type = ucase_getTypeOrIgnorable(c);

         if ((type & UCASE_IGNORABLE) != 0) {

582 // c is case-ignorable

583 nextState |= (state & AFTER_CASED);

         } else if (type != UCASE_NONE) {

585 // c is cased

586 nextState |= AFTER_CASED;

587 }

         uint32_t data = getLetterData(c);

         if (data > 0) {

590 uint32_t upper = data & UPPER_MASK;

591 // Add a dialytika to this iota or ypsilon vowel

592 // if we removed a tonos from the previous vowel,

593 // and that previous vowel did not also have (or gain) a dialytika.

594 // Adding one only to the final vowel in a longer sequence

595 // (which does not occur in normal writing) would require lookahead.

596 // Set the same flag as for preserving an existing dialytika.

             if ((data & HAS_VOWEL) != 0 && (state & AFTER_VOWEL_WITH_ACCENT) != 0 &&

                     (upper == 0x399 || upper == 0x3A5)) {

599 data |= HAS_DIALYTIKA;

600 }

             int32_t numYpogegrammeni = 0;  // Map each one to a trailing, spacing, capital iota.

             if ((data & HAS_YPOGEGRAMMENI) != 0) {

603 numYpogegrammeni = 1;

604 }

605 // Skip combining diacritics after this Greek letter.

606 int32_t nextNextIndex = nextIndex;

607 while (nextIndex < srcLength) {

608 UChar32 c2;

                 U8_NEXT(src, nextNextIndex, srcLength, c2);

                 uint32_t diacriticData = getDiacriticData(c2);

                 if (diacriticData != 0) {

612 data |= diacriticData;

                     if ((diacriticData & HAS_YPOGEGRAMMENI) != 0) {

614 ++numYpogegrammeni;

615 }

616 nextIndex = nextNextIndex;

617 } else {

618 break; // not a Greek diacritic

619 }

620 }

             if ((data & HAS_VOWEL_AND_ACCENT_AND_DIALYTIKA) == HAS_VOWEL_AND_ACCENT) {

622 nextState |= AFTER_VOWEL_WITH_ACCENT;

623 }

624 // Map according to Greek rules.

625 UBool addTonos = FALSE;

             if (upper == 0x397 &&

                     (data & HAS_ACCENT) != 0 &&

628 numYpogegrammeni == 0 &&

                     (state & AFTER_CASED) == 0 &&

                     !isFollowedByCasedLetter(src, nextIndex, srcLength)) {

631 // Keep disjunctive "or" with (only) a tonos.

632 // We use the same "word boundary" conditions as for the Final_Sigma test.

633 if (i == nextIndex) {

634 upper = 0x389; // Preserve the precomposed form.

635 } else {

636 addTonos = TRUE;

637 }

             } else if ((data & HAS_DIALYTIKA) != 0) {

639 // Preserve a vowel with dialytika in precomposed form if it exists.

                 if (upper == 0x399) {

641 upper = 0x3AA;

642 data &= ~HAS_EITHER_DIALYTIKA;

                 } else if (upper == 0x3A5) {

644 upper = 0x3AB;

645 data &= ~HAS_EITHER_DIALYTIKA;

646 }

647 }

648

649 UBool change;

             if (edits == nullptr && (options & U_OMIT_UNCHANGED_TEXT) == 0) {

651 change = TRUE; // common, simple usage

652 } else {

653 // Find out first whether we are changing the text.

                 U_ASSERT(0x370 <= upper && upper <= 0x3ff);  // 2-byte UTF-8, main Greek block

                 change = (i + 2) > nextIndex ||

                         src[i] != getTwoByteLead(upper) || src[i + 1] != getTwoByteTrail(upper) ||

657 numYpogegrammeni > 0;

                 int32_t i2 = i + 2;

                 if ((data & HAS_EITHER_DIALYTIKA) != 0) {

                     change |= (i2 + 2) > nextIndex ||

                             src[i2] != (uint8_t)u8"\u0308"[0] ||

                             src[i2 + 1] != (uint8_t)u8"\u0308"[1];

663 i2 += 2;

664 }

665 if (addTonos) {

                     change |= (i2 + 2) > nextIndex ||

                             src[i2] != (uint8_t)u8"\u0301"[0] ||

                             src[i2 + 1] != (uint8_t)u8"\u0301"[1];

669 i2 += 2;

670 }

671 int32_t oldLength = nextIndex - i;

                 int32_t newLength = (i2 - i) + numYpogegrammeni * 2;  // 2 bytes per U+0399

673 change |= oldLength != newLength;

674 if (change) {

675 if (edits != NULL) {

                         edits->addReplace(oldLength, newLength);

677 }

678 } else {

679 if (edits != NULL) {

680 edits->addUnchanged(oldLength);

681 }

682 // Write unchanged text?

                     change = (options & U_OMIT_UNCHANGED_TEXT) == 0;

684 }

685 }

686

687 if (change) {

                 ByteSinkUtil::appendTwoBytes(upper, sink);

                 if ((data & HAS_EITHER_DIALYTIKA) != 0) {

                     sink.Append(reinterpret_cast<const char*>(u8"\u0308"), 2);  // restore or add a dialytika

691 }

692 if (addTonos) {

                     sink.Append(reinterpret_cast<const char*>(u8"\u0301"), 2);

694 }

                 while (numYpogegrammeni > 0) {

                     sink.Append(reinterpret_cast<const char*>(u8"\u0399"), 2);

697 --numYpogegrammeni;

698 }

699 }

         } else if(c>=0) {

701 const UChar *s;

             c=ucase_toFullUpper(c, NULL, NULL, &s, UCASE_LOC_GREEK);

             if (!appendResult(nextIndex - i, c, s, sink, options, edits, errorCode)) {

704 return;

705 }

706 } else {

707 // Malformed UTF-8.

             if (!ByteSinkUtil::appendUnchanged(src+i, nextIndex-i,

709 sink, options, edits, errorCode)) {

710 return;

711 }

712 }

713 i = nextIndex;

714 state = nextState;

715 }

716 }

717

718 } // namespace GreekUpper

719 U_NAMESPACE_END

720

721 static void U_CALLCONV

 ucasemap_internalUTF8ToLower(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_UNUSED

                              const uint8_t *src, int32_t srcLength,

                              icu::ByteSink &sink, icu::Edits *edits,

725 UErrorCode &errorCode) {

726 UCaseContext csc=UCASECONTEXT_INITIALIZER;

     csc.p=(void *)src;

728 csc.limit=srcLength;

729 toLower(

730 caseLocale, options,

         src, &csc, 0, srcLength,

732 sink, edits, errorCode);

733 }

734

735 static void U_CALLCONV

 ucasemap_internalUTF8ToUpper(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_UNUSED

                              const uint8_t *src, int32_t srcLength,

                              icu::ByteSink &sink, icu::Edits *edits,

739 UErrorCode &errorCode) {

740 if (caseLocale == UCASE_LOC_GREEK) {

         GreekUpper::toUpper(options, src, srcLength, sink, edits, errorCode);

742 } else {

743 UCaseContext csc=UCASECONTEXT_INITIALIZER;

         csc.p=(void *)src;

745 csc.limit=srcLength;

746 toUpper(

747 caseLocale, options,

748 src, &csc, srcLength,

749 sink, edits, errorCode);

750 }

751 }

752

753 static void U_CALLCONV

 ucasemap_internalUTF8Fold(int32_t /* caseLocale */, uint32_t options, UCASEMAP_BREAK_ITERATOR_UNUSED

                           const uint8_t *src, int32_t srcLength,

                           icu::ByteSink &sink, icu::Edits *edits,

757 UErrorCode &errorCode) {

758 toLower(

759 -1, options,

         src, nullptr, 0, srcLength,

761 sink, edits, errorCode);

762 }

763

764 void

 ucasemap_mapUTF8(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM

                  const char *src, int32_t srcLength,

767 UTF8CaseMapper *stringCaseMapper,

                  icu::ByteSink &sink, icu::Edits *edits,

769 UErrorCode &errorCode) {

770 /* check argument values */

     if (U_FAILURE(errorCode)) {

772 return;

773 }

     if ((src == nullptr && srcLength != 0) || srcLength < -1) {

775 errorCode = U_ILLEGAL_ARGUMENT_ERROR;

776 return;

777 }

778

779 // Get the string length.

     if (srcLength == -1) {

         srcLength = (int32_t)uprv_strlen((const char *)src);

782 }

783

     if (edits != nullptr && (options & U_EDITS_NO_RESET) == 0) {

785 edits->reset();

786 }

787 stringCaseMapper(caseLocale, options, UCASEMAP_BREAK_ITERATOR

                      (const uint8_t *)src, srcLength, sink, edits, errorCode);

789 sink.Flush();

     if (U_SUCCESS(errorCode)) {

         if (edits != nullptr) {

792 edits->copyErrorTo(errorCode);

793 }

794 }

795 }

796

797 int32_t

 ucasemap_mapUTF8(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM

                  char *dest, int32_t destCapacity,

                  const char *src, int32_t srcLength,

801 UTF8CaseMapper *stringCaseMapper,

802 icu::Edits *edits,

803 UErrorCode &errorCode) {

804 /* check argument values */

     if(U_FAILURE(errorCode)) {

806 return 0;

807 }

     if( destCapacity<0 ||

         (dest==NULL && destCapacity>0) ||

         (src==NULL && srcLength!=0) || srcLength<-1

811 ) {

812 errorCode=U_ILLEGAL_ARGUMENT_ERROR;

813 return 0;

814 }

815

816 /* get the string length */

     if(srcLength==-1) {

         srcLength=(int32_t)uprv_strlen((const char *)src);

819 }

820

821 /* check for overlapping source and destination */

822 if( dest!=NULL &&

         ((src>=dest && src<(dest+destCapacity)) ||

          (dest>=src && dest<(src+srcLength)))

825 ) {

826 errorCode=U_ILLEGAL_ARGUMENT_ERROR;

827 return 0;

828 }

829

830 CheckedArrayByteSink sink(dest, destCapacity);

     if (edits != nullptr && (options & U_EDITS_NO_RESET) == 0) {

832 edits->reset();

833 }

834 stringCaseMapper(caseLocale, options, UCASEMAP_BREAK_ITERATOR

                      (const uint8_t *)src, srcLength, sink, edits, errorCode);

836 sink.Flush();

     if (U_SUCCESS(errorCode)) {

         if (sink.Overflowed()) {

839 errorCode = U_BUFFER_OVERFLOW_ERROR;

         } else if (edits != nullptr) {

841 edits->copyErrorTo(errorCode);

842 }

843 }

     return u_terminateChars(dest, destCapacity, sink.NumberOfBytesAppended(), &errorCode);

845 }

846

/* public API functions */

848

849 U_CAPI int32_t U_EXPORT2

 ucasemap_utf8ToLower(const UCaseMap *csm,

                      char *dest, int32_t destCapacity,

                      const char *src, int32_t srcLength,

853 UErrorCode *pErrorCode) {

854 return ucasemap_mapUTF8(

855 csm->caseLocale, csm->options, UCASEMAP_BREAK_ITERATOR_NULL

856 dest, destCapacity,

857 src, srcLength,

858 ucasemap_internalUTF8ToLower, NULL, *pErrorCode);

859 }

860

861 U_CAPI int32_t U_EXPORT2

 ucasemap_utf8ToUpper(const UCaseMap *csm,

                      char *dest, int32_t destCapacity,

                      const char *src, int32_t srcLength,

865 UErrorCode *pErrorCode) {

866 return ucasemap_mapUTF8(

867 csm->caseLocale, csm->options, UCASEMAP_BREAK_ITERATOR_NULL

868 dest, destCapacity,

869 src, srcLength,

870 ucasemap_internalUTF8ToUpper, NULL, *pErrorCode);

871 }

872

873 U_CAPI int32_t U_EXPORT2

 ucasemap_utf8FoldCase(const UCaseMap *csm,

                       char *dest, int32_t destCapacity,

                       const char *src, int32_t srcLength,

877 UErrorCode *pErrorCode) {

878 return ucasemap_mapUTF8(

879 UCASE_LOC_ROOT, csm->options, UCASEMAP_BREAK_ITERATOR_NULL

880 dest, destCapacity,

881 src, srcLength,

882 ucasemap_internalUTF8Fold, NULL, *pErrorCode);

883 }

884

885 U_NAMESPACE_BEGIN

886

 void CaseMap::utf8ToLower(

         const char *locale, uint32_t options,

         StringPiece src, ByteSink &sink, Edits *edits,

890 UErrorCode &errorCode) {

891 ucasemap_mapUTF8(

892 ustrcase_getCaseLocale(locale), options, UCASEMAP_BREAK_ITERATOR_NULL

         src.data(), src.length(),

894 ucasemap_internalUTF8ToLower, sink, edits, errorCode);

895 }

896

 void CaseMap::utf8ToUpper(

         const char *locale, uint32_t options,

         StringPiece src, ByteSink &sink, Edits *edits,

900 UErrorCode &errorCode) {

901 ucasemap_mapUTF8(

902 ustrcase_getCaseLocale(locale), options, UCASEMAP_BREAK_ITERATOR_NULL

         src.data(), src.length(),

904 ucasemap_internalUTF8ToUpper, sink, edits, errorCode);

905 }

906

 void CaseMap::utf8Fold(

908 uint32_t options,

         StringPiece src, ByteSink &sink, Edits *edits,

910 UErrorCode &errorCode) {

911 ucasemap_mapUTF8(

912 UCASE_LOC_ROOT, options, UCASEMAP_BREAK_ITERATOR_NULL

         src.data(), src.length(),

914 ucasemap_internalUTF8Fold, sink, edits, errorCode);

915 }

916

 int32_t CaseMap::utf8ToLower(

         const char *locale, uint32_t options,

         const char *src, int32_t srcLength,

         char *dest, int32_t destCapacity, Edits *edits,

921 UErrorCode &errorCode) {

922 return ucasemap_mapUTF8(

923 ustrcase_getCaseLocale(locale), options, UCASEMAP_BREAK_ITERATOR_NULL

924 dest, destCapacity,

925 src, srcLength,

926 ucasemap_internalUTF8ToLower, edits, errorCode);

927 }

928

 int32_t CaseMap::utf8ToUpper(

         const char *locale, uint32_t options,

         const char *src, int32_t srcLength,

         char *dest, int32_t destCapacity, Edits *edits,

933 UErrorCode &errorCode) {

934 return ucasemap_mapUTF8(

935 ustrcase_getCaseLocale(locale), options, UCASEMAP_BREAK_ITERATOR_NULL

936 dest, destCapacity,

937 src, srcLength,

938 ucasemap_internalUTF8ToUpper, edits, errorCode);

939 }

940

 int32_t CaseMap::utf8Fold(

942 uint32_t options,

         const char *src, int32_t srcLength,

         char *dest, int32_t destCapacity, Edits *edits,

945 UErrorCode &errorCode) {

946 return ucasemap_mapUTF8(

947 UCASE_LOC_ROOT, options, UCASEMAP_BREAK_ITERATOR_NULL

948 dest, destCapacity,

949 src, srcLength,

950 ucasemap_internalUTF8Fold, edits, errorCode);

951 }

952

953 U_NAMESPACE_END