icuSources/common/ustrcase.cpp

   1 // © 2016 and later: Unicode, Inc. and others.
   2 // License & terms of use: http://www.unicode.org/copyright.html
   3 /*
   4 *******************************************************************************
   5 *
   6 *   Copyright (C) 2001-2015, International Business Machines
   7 *   Corporation and others.  All Rights Reserved.
   8 *
   9 *******************************************************************************
  10 *   file name:  ustrcase.cpp
  11 *   encoding:   UTF-8
  12 *   tab size:   8 (not used)
  13 *   indentation:4
  14 *
  15 *   created on: 2002feb20
  16 *   created by: Markus W. Scherer
  17 *
  18 *   Implementation file for string casing C API functions.
  19 *   Uses functions from uchar.c for basic functionality that requires access
  20 *   to the Unicode Character Database (uprops.dat).
  21 */
  22
  23 #include "unicode/utypes.h"
  24 #include "unicode/brkiter.h"
  25 #include "unicode/casemap.h"
  26 #include "unicode/edits.h"
  27 #include "unicode/ustring.h"
  28 #include "unicode/ucasemap.h"
  29 #include "unicode/ubrk.h"
  30 #include "unicode/utf.h"
  31 #include "unicode/utf16.h"
  32 #include "cmemory.h"
  33 #include "ucase.h"
  34 #include "ucasemap_imp.h"
  35 #include "ustr_imp.h"
  36 #include "uassert.h"
  37
  38 U_NAMESPACE_BEGIN
  39
  40 namespace {
  41
  42 int32_t checkOverflowAndEditsError(int32_t destIndex, int32_t destCapacity,
  43                                    Edits *edits, UErrorCode &errorCode) {
  44     if (U_SUCCESS(errorCode)) {
  45         if (destIndex > destCapacity) {
  46             errorCode = U_BUFFER_OVERFLOW_ERROR;
  47         } else if (edits != NULL) {
  48             edits->copyErrorTo(errorCode);
  49         }
  50     }
  51     return destIndex;
  52 }
  53
  54 }  // namespace
  55
  56 U_NAMESPACE_END
  57
  58 U_NAMESPACE_USE
  59
  60 /* string casing ------------------------------------------------------------ */
  61
  62 /* Appends a full case mapping result, see UCASE_MAX_STRING_LENGTH. */
  63 static inline int32_t
  64 appendResult(UChar *dest, int32_t destIndex, int32_t destCapacity,
  65              int32_t result, const UChar *s,
  66              int32_t cpLength, uint32_t options, icu::Edits *edits) {
  67     UChar32 c;
  68     int32_t length;
  69
  70     /* decode the result */
  71     if(result<0) {
  72         /* (not) original code point */
  73         if(edits!=NULL) {
  74             edits->addUnchanged(cpLength);
  75             if(options & UCASEMAP_OMIT_UNCHANGED_TEXT) {
  76                 return destIndex;
  77             }
  78         }
  79         c=~result;
  80         if(destIndex<destCapacity && c<=0xffff) {  // BMP slightly-fastpath
  81             dest[destIndex++]=(UChar)c;
  82             return destIndex;
  83         }
  84         length=cpLength;
  85     } else {
  86         if(result<=UCASE_MAX_STRING_LENGTH) {
  87             c=U_SENTINEL;
  88             length=result;
  89         } else if(destIndex<destCapacity && result<=0xffff) {  // BMP slightly-fastpath
  90             dest[destIndex++]=(UChar)result;
  91             if(edits!=NULL) {
  92                 edits->addReplace(cpLength, 1);
  93             }
  94             return destIndex;
  95         } else {
  96             c=result;
  97             length=U16_LENGTH(c);
  98         }
  99         if(edits!=NULL) {
 100             edits->addReplace(cpLength, length);
 101         }
 102     }
 103     if(length>(INT32_MAX-destIndex)) {
 104         return -1;  // integer overflow
 105     }
 106
 107     if(destIndex<destCapacity) {
 108         /* append the result */
 109         if(c>=0) {
 110             /* code point */
 111             UBool isError=FALSE;
 112             U16_APPEND(dest, destIndex, destCapacity, c, isError);
 113             if(isError) {
 114                 /* overflow, nothing written */
 115                 destIndex+=length;
 116             }
 117         } else {
 118             /* string */
 119             if((destIndex+length)<=destCapacity) {
 120                 while(length>0) {
 121                     dest[destIndex++]=*s++;
 122                     --length;
 123                 }
 124             } else {
 125                 /* overflow */
 126                 destIndex+=length;
 127             }
 128         }
 129     } else {
 130         /* preflight */
 131         destIndex+=length;
 132     }
 133     return destIndex;
 134 }
 135
 136 static inline int32_t
 137 appendUChar(UChar *dest, int32_t destIndex, int32_t destCapacity, UChar c) {
 138     if(destIndex<destCapacity) {
 139         dest[destIndex]=c;
 140     } else if(destIndex==INT32_MAX) {
 141         return -1;  // integer overflow
 142     }
 143     return destIndex+1;
 144 }
 145
 146 static inline int32_t
 147 appendUnchanged(UChar *dest, int32_t destIndex, int32_t destCapacity,
 148                 const UChar *s, int32_t length, uint32_t options, icu::Edits *edits) {
 149     if(length>0) {
 150         if(edits!=NULL) {
 151             edits->addUnchanged(length);
 152             if(options & UCASEMAP_OMIT_UNCHANGED_TEXT) {
 153                 return destIndex;
 154             }
 155         }
 156         if(length>(INT32_MAX-destIndex)) {
 157             return -1;  // integer overflow
 158         }
 159         if((destIndex+length)<=destCapacity) {
 160             u_memcpy(dest+destIndex, s, length);
 161         }
 162         destIndex+=length;
 163     }
 164     return destIndex;
 165 }
 166
 167 static UChar32 U_CALLCONV
 168 utf16_caseContextIterator(void *context, int8_t dir) {
 169     UCaseContext *csc=(UCaseContext *)context;
 170     UChar32 c;
 171
 172     if(dir<0) {
 173         /* reset for backward iteration */
 174         csc->index=csc->cpStart;
 175         csc->dir=dir;
 176     } else if(dir>0) {
 177         /* reset for forward iteration */
 178         csc->index=csc->cpLimit;
 179         csc->dir=dir;
 180     } else {
 181         /* continue current iteration direction */
 182         dir=csc->dir;
 183     }
 184
 185     if(dir<0) {
 186         if(csc->start<csc->index) {
 187             U16_PREV((const UChar *)csc->p, csc->start, csc->index, c);
 188             return c;
 189         }
 190     } else {
 191         if(csc->index<csc->limit) {
 192             U16_NEXT((const UChar *)csc->p, csc->index, csc->limit, c);
 193             return c;
 194         }
 195     }
 196     return U_SENTINEL;
 197 }
 198
 199 /*
 200  * Case-maps [srcStart..srcLimit[ but takes
 201  * context [0..srcLength[ into account.
 202  */
 203 static int32_t
 204 _caseMap(int32_t caseLocale, uint32_t options, UCaseMapFull *map,
 205          UChar *dest, int32_t destCapacity,
 206          const UChar *src, UCaseContext *csc,
 207          int32_t srcStart, int32_t srcLimit,
 208          icu::Edits *edits,
 209          UErrorCode &errorCode) {
 210     /* case mapping loop */
 211     int32_t srcIndex=srcStart;
 212     int32_t destIndex=0;
 213     while(srcIndex<srcLimit) {
 214         int32_t cpStart;
 215         csc->cpStart=cpStart=srcIndex;
 216         UChar32 c;
 217         U16_NEXT(src, srcIndex, srcLimit, c);
 218         csc->cpLimit=srcIndex;
 219         const UChar *s;
 220         c=map(c, utf16_caseContextIterator, csc, &s, caseLocale);
 221         destIndex = appendResult(dest, destIndex, destCapacity, c, s,
 222                                  srcIndex - cpStart, options, edits);
 223         if (destIndex < 0) {
 224             errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
 225             return 0;
 226         }
 227     }
 228
 229     return destIndex;
 230 }
 231
 232 #if !UCONFIG_NO_BREAK_ITERATION
 233
 234 U_CFUNC int32_t U_CALLCONV
 235 ustrcase_internalToTitle(int32_t caseLocale, uint32_t options, BreakIterator *iter,
 236                          UChar *dest, int32_t destCapacity,
 237                          const UChar *src, int32_t srcLength,
 238                          icu::Edits *edits,
 239                          UErrorCode &errorCode) {
 240     if(U_FAILURE(errorCode)) {
 241         return 0;
 242     }
 243
 244     /* set up local variables */
 245     UCaseContext csc=UCASECONTEXT_INITIALIZER;
 246     csc.p=(void *)src;
 247     csc.limit=srcLength;
 248     int32_t destIndex=0;
 249     int32_t prev=0;
 250     UBool isFirstIndex=TRUE;
 251
 252     /* titlecasing loop */
 253     while(prev<srcLength) {
 254         /* find next index where to titlecase */
 255         int32_t index;
 256         if(isFirstIndex) {
 257             isFirstIndex=FALSE;
 258             index=iter->first();
 259         } else {
 260             index=iter->next();
 261         }
 262         if(index==UBRK_DONE || index>srcLength) {
 263             index=srcLength;
 264         }
 265
 266         /*
 267          * Unicode 4 & 5 section 3.13 Default Case Operations:
 268          *
 269          * R3  toTitlecase(X): Find the word boundaries based on Unicode Standard Annex
 270          * #29, "Text Boundaries." Between each pair of word boundaries, find the first
 271          * cased character F. If F exists, map F to default_title(F); then map each
 272          * subsequent character C to default_lower(C).
 273          *
 274          * In this implementation, segment [prev..index[ into 3 parts:
 275          * a) uncased characters (copy as-is) [prev..titleStart[
 276          * b) first case letter (titlecase)         [titleStart..titleLimit[
 277          * c) subsequent characters (lowercase)                 [titleLimit..index[
 278          */
 279         if(prev<index) {
 280             /* find and copy uncased characters [prev..titleStart[ */
 281             int32_t titleStart=prev;
 282             int32_t titleLimit=prev;
 283             UChar32 c;
 284             U16_NEXT(src, titleLimit, index, c);
 285             if((options&U_TITLECASE_NO_BREAK_ADJUSTMENT)==0 && UCASE_NONE==ucase_getType(c)) {
 286                 /* Adjust the titlecasing index (titleStart) to the next cased character. */
 287                 for(;;) {
 288                     titleStart=titleLimit;
 289                     if(titleLimit==index) {
 290                         /*
 291                          * only uncased characters in [prev..index[
 292                          * stop with titleStart==titleLimit==index
 293                          */
 294                         break;
 295                     }
 296                     U16_NEXT(src, titleLimit, index, c);
 297                     if(UCASE_NONE!=ucase_getType(c)) {
 298                         break; /* cased letter at [titleStart..titleLimit[ */
 299                     }
 300                 }
 301                 destIndex=appendUnchanged(dest, destIndex, destCapacity,
 302                                           src+prev, titleStart-prev, options, edits);
 303                 if(destIndex<0) {
 304                     errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
 305                     return 0;
 306                 }
 307             }
 308
 309             if(titleStart<titleLimit) {
 310                 /* titlecase c which is from [titleStart..titleLimit[ */
 311                 csc.cpStart=titleStart;
 312                 csc.cpLimit=titleLimit;
 313                 const UChar *s;
 314                 c=ucase_toFullTitle(c, utf16_caseContextIterator, &csc, &s, caseLocale);
 315                 destIndex=appendResult(dest, destIndex, destCapacity, c, s,
 316                                        titleLimit-titleStart, options, edits);
 317                 if(destIndex<0) {
 318                     errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
 319                     return 0;
 320                 }
 321
 322                 /* Special case Dutch IJ titlecasing */
 323                 if (titleStart+1 < index &&
 324                         caseLocale == UCASE_LOC_DUTCH &&
 325                         (src[titleStart] == 0x0049 || src[titleStart] == 0x0069)) {
 326                     if (src[titleStart+1] == 0x006A) {
 327                         destIndex=appendUChar(dest, destIndex, destCapacity, 0x004A);
 328                         if(destIndex<0) {
 329                             errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
 330                             return 0;
 331                         }
 332                         if(edits!=NULL) {
 333                             edits->addReplace(1, 1);
 334                         }
 335                         titleLimit++;
 336                     } else if (src[titleStart+1] == 0x004A) {
 337                         // Keep the capital J from getting lowercased.
 338                         destIndex=appendUnchanged(dest, destIndex, destCapacity,
 339                                                   src+titleStart+1, 1, options, edits);
 340                         if(destIndex<0) {
 341                             errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
 342                             return 0;
 343                         }
 344                         titleLimit++;
 345                     }
 346                 }
 347
 348                 /* lowercase [titleLimit..index[ */
 349                 if(titleLimit<index) {
 350                     if((options&U_TITLECASE_NO_LOWERCASE)==0) {
 351                         /* Normal operation: Lowercase the rest of the word. */
 352                         destIndex+=
 353                             _caseMap(
 354                                 caseLocale, options, ucase_toFullLower,
 355                                 dest+destIndex, destCapacity-destIndex,
 356                                 src, &csc,
 357                                 titleLimit, index,
 358                                 edits, errorCode);
 359                         if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
 360                             errorCode=U_ZERO_ERROR;
 361                         }
 362                         if(U_FAILURE(errorCode)) {
 363                             return destIndex;
 364                         }
 365                     } else {
 366                         /* Optionally just copy the rest of the word unchanged. */
 367                         destIndex=appendUnchanged(dest, destIndex, destCapacity,
 368                                                   src+titleLimit, index-titleLimit, options, edits);
 369                         if(destIndex<0) {
 370                             errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
 371                             return 0;
 372                         }
 373                     }
 374                 }
 375             }
 376         }
 377
 378         prev=index;
 379     }
 380
 381     return checkOverflowAndEditsError(destIndex, destCapacity, edits, errorCode);
 382 }
 383
 384 #endif  // !UCONFIG_NO_BREAK_ITERATION
 385
 386 U_NAMESPACE_BEGIN
 387 namespace GreekUpper {
 388
 389 // Data generated by prototype code, see
 390 // http://site.icu-project.org/design/case/greek-upper
 391 // TODO: Move this data into ucase.icu.
 392 static const uint16_t data0370[] = {
 393     // U+0370..03FF
 394     0x0370,
 395     0x0370,
 396     0x0372,
 397     0x0372,
 398     0,
 399     0,
 400     0x0376,
 401     0x0376,
 402     0,
 403     0,
 404     0x037A,
 405     0x03FD,
 406     0x03FE,
 407     0x03FF,
 408     0,
 409     0x037F,
 410     0,
 411     0,
 412     0,
 413     0,
 414     0,
 415     0,
 416     0x0391 | HAS_VOWEL | HAS_ACCENT,
 417     0,
 418     0x0395 | HAS_VOWEL | HAS_ACCENT,
 419     0x0397 | HAS_VOWEL | HAS_ACCENT,
 420     0x0399 | HAS_VOWEL | HAS_ACCENT,
 421     0,
 422     0x039F | HAS_VOWEL | HAS_ACCENT,
 423     0,
 424     0x03A5 | HAS_VOWEL | HAS_ACCENT,
 425     0x03A9 | HAS_VOWEL | HAS_ACCENT,
 426     0x0399 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,
 427     0x0391 | HAS_VOWEL,
 428     0x0392,
 429     0x0393,
 430     0x0394,
 431     0x0395 | HAS_VOWEL,
 432     0x0396,
 433     0x0397 | HAS_VOWEL,
 434     0x0398,
 435     0x0399 | HAS_VOWEL,
 436     0x039A,
 437     0x039B,
 438     0x039C,
 439     0x039D,
 440     0x039E,
 441     0x039F | HAS_VOWEL,
 442     0x03A0,
 443     0x03A1,
 444     0,
 445     0x03A3,
 446     0x03A4,
 447     0x03A5 | HAS_VOWEL,
 448     0x03A6,
 449     0x03A7,
 450     0x03A8,
 451     0x03A9 | HAS_VOWEL,
 452     0x0399 | HAS_VOWEL | HAS_DIALYTIKA,
 453     0x03A5 | HAS_VOWEL | HAS_DIALYTIKA,
 454     0x0391 | HAS_VOWEL | HAS_ACCENT,
 455     0x0395 | HAS_VOWEL | HAS_ACCENT,
 456     0x0397 | HAS_VOWEL | HAS_ACCENT,
 457     0x0399 | HAS_VOWEL | HAS_ACCENT,
 458     0x03A5 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,
 459     0x0391 | HAS_VOWEL,
 460     0x0392,
 461     0x0393,
 462     0x0394,
 463     0x0395 | HAS_VOWEL,
 464     0x0396,
 465     0x0397 | HAS_VOWEL,
 466     0x0398,
 467     0x0399 | HAS_VOWEL,
 468     0x039A,
 469     0x039B,
 470     0x039C,
 471     0x039D,
 472     0x039E,
 473     0x039F | HAS_VOWEL,
 474     0x03A0,
 475     0x03A1,
 476     0x03A3,
 477     0x03A3,
 478     0x03A4,
 479     0x03A5 | HAS_VOWEL,
 480     0x03A6,
 481     0x03A7,
 482     0x03A8,
 483     0x03A9 | HAS_VOWEL,
 484     0x0399 | HAS_VOWEL | HAS_DIALYTIKA,
 485     0x03A5 | HAS_VOWEL | HAS_DIALYTIKA,
 486     0x039F | HAS_VOWEL | HAS_ACCENT,
 487     0x03A5 | HAS_VOWEL | HAS_ACCENT,
 488     0x03A9 | HAS_VOWEL | HAS_ACCENT,
 489     0x03CF,
 490     0x0392,
 491     0x0398,
 492     0x03D2,
 493     0x03D2 | HAS_ACCENT,
 494     0x03D2 | HAS_DIALYTIKA,
 495     0x03A6,
 496     0x03A0,
 497     0x03CF,
 498     0x03D8,
 499     0x03D8,
 500     0x03DA,
 501     0x03DA,
 502     0x03DC,
 503     0x03DC,
 504     0x03DE,
 505     0x03DE,
 506     0x03E0,
 507     0x03E0,
 508     0,
 509     0,
 510     0,
 511     0,
 512     0,
 513     0,
 514     0,
 515     0,
 516     0,
 517     0,
 518     0,
 519     0,
 520     0,
 521     0,
 522     0x039A,
 523     0x03A1,
 524     0x03F9,
 525     0x037F,
 526     0x03F4,
 527     0x0395 | HAS_VOWEL,
 528     0,
 529     0x03F7,
 530     0x03F7,
 531     0x03F9,
 532     0x03FA,
 533     0x03FA,
 534     0x03FC,
 535     0x03FD,
 536     0x03FE,
 537     0x03FF,
 538 };
 539
 540 static const uint16_t data1F00[] = {
 541     // U+1F00..1FFF
 542     0x0391 | HAS_VOWEL,
 543     0x0391 | HAS_VOWEL,
 544     0x0391 | HAS_VOWEL | HAS_ACCENT,
 545     0x0391 | HAS_VOWEL | HAS_ACCENT,
 546     0x0391 | HAS_VOWEL | HAS_ACCENT,
 547     0x0391 | HAS_VOWEL | HAS_ACCENT,
 548     0x0391 | HAS_VOWEL | HAS_ACCENT,
 549     0x0391 | HAS_VOWEL | HAS_ACCENT,
 550     0x0391 | HAS_VOWEL,
 551     0x0391 | HAS_VOWEL,
 552     0x0391 | HAS_VOWEL | HAS_ACCENT,
 553     0x0391 | HAS_VOWEL | HAS_ACCENT,
 554     0x0391 | HAS_VOWEL | HAS_ACCENT,
 555     0x0391 | HAS_VOWEL | HAS_ACCENT,
 556     0x0391 | HAS_VOWEL | HAS_ACCENT,
 557     0x0391 | HAS_VOWEL | HAS_ACCENT,
 558     0x0395 | HAS_VOWEL,
 559     0x0395 | HAS_VOWEL,
 560     0x0395 | HAS_VOWEL | HAS_ACCENT,
 561     0x0395 | HAS_VOWEL | HAS_ACCENT,
 562     0x0395 | HAS_VOWEL | HAS_ACCENT,
 563     0x0395 | HAS_VOWEL | HAS_ACCENT,
 564     0,
 565     0,
 566     0x0395 | HAS_VOWEL,
 567     0x0395 | HAS_VOWEL,
 568     0x0395 | HAS_VOWEL | HAS_ACCENT,
 569     0x0395 | HAS_VOWEL | HAS_ACCENT,
 570     0x0395 | HAS_VOWEL | HAS_ACCENT,
 571     0x0395 | HAS_VOWEL | HAS_ACCENT,
 572     0,
 573     0,
 574     0x0397 | HAS_VOWEL,
 575     0x0397 | HAS_VOWEL,
 576     0x0397 | HAS_VOWEL | HAS_ACCENT,
 577     0x0397 | HAS_VOWEL | HAS_ACCENT,
 578     0x0397 | HAS_VOWEL | HAS_ACCENT,
 579     0x0397 | HAS_VOWEL | HAS_ACCENT,
 580     0x0397 | HAS_VOWEL | HAS_ACCENT,
 581     0x0397 | HAS_VOWEL | HAS_ACCENT,
 582     0x0397 | HAS_VOWEL,
 583     0x0397 | HAS_VOWEL,
 584     0x0397 | HAS_VOWEL | HAS_ACCENT,
 585     0x0397 | HAS_VOWEL | HAS_ACCENT,
 586     0x0397 | HAS_VOWEL | HAS_ACCENT,
 587     0x0397 | HAS_VOWEL | HAS_ACCENT,
 588     0x0397 | HAS_VOWEL | HAS_ACCENT,
 589     0x0397 | HAS_VOWEL | HAS_ACCENT,
 590     0x0399 | HAS_VOWEL,
 591     0x0399 | HAS_VOWEL,
 592     0x0399 | HAS_VOWEL | HAS_ACCENT,
 593     0x0399 | HAS_VOWEL | HAS_ACCENT,
 594     0x0399 | HAS_VOWEL | HAS_ACCENT,
 595     0x0399 | HAS_VOWEL | HAS_ACCENT,
 596     0x0399 | HAS_VOWEL | HAS_ACCENT,
 597     0x0399 | HAS_VOWEL | HAS_ACCENT,
 598     0x0399 | HAS_VOWEL,
 599     0x0399 | HAS_VOWEL,
 600     0x0399 | HAS_VOWEL | HAS_ACCENT,
 601     0x0399 | HAS_VOWEL | HAS_ACCENT,
 602     0x0399 | HAS_VOWEL | HAS_ACCENT,
 603     0x0399 | HAS_VOWEL | HAS_ACCENT,
 604     0x0399 | HAS_VOWEL | HAS_ACCENT,
 605     0x0399 | HAS_VOWEL | HAS_ACCENT,
 606     0x039F | HAS_VOWEL,
 607     0x039F | HAS_VOWEL,
 608     0x039F | HAS_VOWEL | HAS_ACCENT,
 609     0x039F | HAS_VOWEL | HAS_ACCENT,
 610     0x039F | HAS_VOWEL | HAS_ACCENT,
 611     0x039F | HAS_VOWEL | HAS_ACCENT,
 612     0,
 613     0,
 614     0x039F | HAS_VOWEL,
 615     0x039F | HAS_VOWEL,
 616     0x039F | HAS_VOWEL | HAS_ACCENT,
 617     0x039F | HAS_VOWEL | HAS_ACCENT,
 618     0x039F | HAS_VOWEL | HAS_ACCENT,
 619     0x039F | HAS_VOWEL | HAS_ACCENT,
 620     0,
 621     0,
 622     0x03A5 | HAS_VOWEL,
 623     0x03A5 | HAS_VOWEL,
 624     0x03A5 | HAS_VOWEL | HAS_ACCENT,
 625     0x03A5 | HAS_VOWEL | HAS_ACCENT,
 626     0x03A5 | HAS_VOWEL | HAS_ACCENT,
 627     0x03A5 | HAS_VOWEL | HAS_ACCENT,
 628     0x03A5 | HAS_VOWEL | HAS_ACCENT,
 629     0x03A5 | HAS_VOWEL | HAS_ACCENT,
 630     0,
 631     0x03A5 | HAS_VOWEL,
 632     0,
 633     0x03A5 | HAS_VOWEL | HAS_ACCENT,
 634     0,
 635     0x03A5 | HAS_VOWEL | HAS_ACCENT,
 636     0,
 637     0x03A5 | HAS_VOWEL | HAS_ACCENT,
 638     0x03A9 | HAS_VOWEL,
 639     0x03A9 | HAS_VOWEL,
 640     0x03A9 | HAS_VOWEL | HAS_ACCENT,
 641     0x03A9 | HAS_VOWEL | HAS_ACCENT,
 642     0x03A9 | HAS_VOWEL | HAS_ACCENT,
 643     0x03A9 | HAS_VOWEL | HAS_ACCENT,
 644     0x03A9 | HAS_VOWEL | HAS_ACCENT,
 645     0x03A9 | HAS_VOWEL | HAS_ACCENT,
 646     0x03A9 | HAS_VOWEL,
 647     0x03A9 | HAS_VOWEL,
 648     0x03A9 | HAS_VOWEL | HAS_ACCENT,
 649     0x03A9 | HAS_VOWEL | HAS_ACCENT,
 650     0x03A9 | HAS_VOWEL | HAS_ACCENT,
 651     0x03A9 | HAS_VOWEL | HAS_ACCENT,
 652     0x03A9 | HAS_VOWEL | HAS_ACCENT,
 653     0x03A9 | HAS_VOWEL | HAS_ACCENT,
 654     0x0391 | HAS_VOWEL | HAS_ACCENT,
 655     0x0391 | HAS_VOWEL | HAS_ACCENT,
 656     0x0395 | HAS_VOWEL | HAS_ACCENT,
 657     0x0395 | HAS_VOWEL | HAS_ACCENT,
 658     0x0397 | HAS_VOWEL | HAS_ACCENT,
 659     0x0397 | HAS_VOWEL | HAS_ACCENT,
 660     0x0399 | HAS_VOWEL | HAS_ACCENT,
 661     0x0399 | HAS_VOWEL | HAS_ACCENT,
 662     0x039F | HAS_VOWEL | HAS_ACCENT,
 663     0x039F | HAS_VOWEL | HAS_ACCENT,
 664     0x03A5 | HAS_VOWEL | HAS_ACCENT,
 665     0x03A5 | HAS_VOWEL | HAS_ACCENT,
 666     0x03A9 | HAS_VOWEL | HAS_ACCENT,
 667     0x03A9 | HAS_VOWEL | HAS_ACCENT,
 668     0,
 669     0,
 670     0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
 671     0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
 672     0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
 673     0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
 674     0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
 675     0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
 676     0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
 677     0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
 678     0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
 679     0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
 680     0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
 681     0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
 682     0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
 683     0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
 684     0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
 685     0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
 686     0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
 687     0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
 688     0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
 689     0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
 690     0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
 691     0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
 692     0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
 693     0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
 694     0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
 695     0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
 696     0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
 697     0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
 698     0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
 699     0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
 700     0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
 701     0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
 702     0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
 703     0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
 704     0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
 705     0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
 706     0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
 707     0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
 708     0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
 709     0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
 710     0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
 711     0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
 712     0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
 713     0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
 714     0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
 715     0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
 716     0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
 717     0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
 718     0x0391 | HAS_VOWEL,
 719     0x0391 | HAS_VOWEL,
 720     0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
 721     0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
 722     0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
 723     0,
 724     0x0391 | HAS_VOWEL | HAS_ACCENT,
 725     0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
 726     0x0391 | HAS_VOWEL,
 727     0x0391 | HAS_VOWEL,
 728     0x0391 | HAS_VOWEL | HAS_ACCENT,
 729     0x0391 | HAS_VOWEL | HAS_ACCENT,
 730     0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
 731     0,
 732     0x0399 | HAS_VOWEL,
 733     0,
 734     0,
 735     0,
 736     0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
 737     0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
 738     0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
 739     0,
 740     0x0397 | HAS_VOWEL | HAS_ACCENT,
 741     0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
 742     0x0395 | HAS_VOWEL | HAS_ACCENT,
 743     0x0395 | HAS_VOWEL | HAS_ACCENT,
 744     0x0397 | HAS_VOWEL | HAS_ACCENT,
 745     0x0397 | HAS_VOWEL | HAS_ACCENT,
 746     0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
 747     0,
 748     0,
 749     0,
 750     0x0399 | HAS_VOWEL,
 751     0x0399 | HAS_VOWEL,
 752     0x0399 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,
 753     0x0399 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,
 754     0,
 755     0,
 756     0x0399 | HAS_VOWEL | HAS_ACCENT,
 757     0x0399 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,
 758     0x0399 | HAS_VOWEL,
 759     0x0399 | HAS_VOWEL,
 760     0x0399 | HAS_VOWEL | HAS_ACCENT,
 761     0x0399 | HAS_VOWEL | HAS_ACCENT,
 762     0,
 763     0,
 764     0,
 765     0,
 766     0x03A5 | HAS_VOWEL,
 767     0x03A5 | HAS_VOWEL,
 768     0x03A5 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,
 769     0x03A5 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,
 770     0x03A1,
 771     0x03A1,
 772     0x03A5 | HAS_VOWEL | HAS_ACCENT,
 773     0x03A5 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,
 774     0x03A5 | HAS_VOWEL,
 775     0x03A5 | HAS_VOWEL,
 776     0x03A5 | HAS_VOWEL | HAS_ACCENT,
 777     0x03A5 | HAS_VOWEL | HAS_ACCENT,
 778     0x03A1,
 779     0,
 780     0,
 781     0,
 782     0,
 783     0,
 784     0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
 785     0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
 786     0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
 787     0,
 788     0x03A9 | HAS_VOWEL | HAS_ACCENT,
 789     0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
 790     0x039F | HAS_VOWEL | HAS_ACCENT,
 791     0x039F | HAS_VOWEL | HAS_ACCENT,
 792     0x03A9 | HAS_VOWEL | HAS_ACCENT,
 793     0x03A9 | HAS_VOWEL | HAS_ACCENT,
 794     0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
 795     0,
 796     0,
 797     0,
 798 };
 799
 800 // U+2126 Ohm sign
 801 static const uint16_t data2126 = 0x03A9 | HAS_VOWEL;
 802
 803 uint32_t getLetterData(UChar32 c) {
 804     if (c < 0x370 || 0x2126 < c || (0x3ff < c && c < 0x1f00)) {
 805         return 0;
 806     } else if (c <= 0x3ff) {
 807         return data0370[c - 0x370];
 808     } else if (c <= 0x1fff) {
 809         return data1F00[c - 0x1f00];
 810     } else if (c == 0x2126) {
 811         return data2126;
 812     } else {
 813         return 0;
 814     }
 815 }
 816
 817 uint32_t getDiacriticData(UChar32 c) {
 818     switch (c) {
 819     case 0x0300:  // varia
 820     case 0x0301:  // tonos = oxia
 821     case 0x0342:  // perispomeni
 822     case 0x0302:  // circumflex can look like perispomeni
 823     case 0x0303:  // tilde can look like perispomeni
 824     case 0x0311:  // inverted breve can look like perispomeni
 825         return HAS_ACCENT;
 826     case 0x0308:  // dialytika = diaeresis
 827         return HAS_COMBINING_DIALYTIKA;
 828     case 0x0344:  // dialytika tonos
 829         return HAS_COMBINING_DIALYTIKA | HAS_ACCENT;
 830     case 0x0345:  // ypogegrammeni = iota subscript
 831         return HAS_YPOGEGRAMMENI;
 832     case 0x0304:  // macron
 833     case 0x0306:  // breve
 834     case 0x0313:  // comma above
 835     case 0x0314:  // reversed comma above
 836     case 0x0343:  // koronis
 837         return HAS_OTHER_GREEK_DIACRITIC;
 838     default:
 839         return 0;
 840     }
 841 }
 842
 843 UBool isFollowedByCasedLetter(const UChar *s, int32_t i, int32_t length) {
 844     while (i < length) {
 845         UChar32 c;
 846         U16_NEXT(s, i, length, c);
 847         int32_t type = ucase_getTypeOrIgnorable(c);
 848         if ((type & UCASE_IGNORABLE) != 0) {
 849             // Case-ignorable, continue with the loop.
 850         } else if (type != UCASE_NONE) {
 851             return TRUE;  // Followed by cased letter.
 852         } else {
 853             return FALSE;  // Uncased and not case-ignorable.
 854         }
 855     }
 856     return FALSE;  // Not followed by cased letter.
 857 }
 858
 859 /**
 860  * Greek string uppercasing with a state machine.
 861  * Probably simpler than a stateless function that has to figure out complex context-before
 862  * for each character.
 863  * TODO: Try to re-consolidate one way or another with the non-Greek function.
 864  */
 865 int32_t toUpper(uint32_t options,
 866                 UChar *dest, int32_t destCapacity,
 867                 const UChar *src, int32_t srcLength,
 868                 Edits *edits,
 869                 UErrorCode &errorCode) {
 870     int32_t destIndex=0;
 871     uint32_t state = 0;
 872     for (int32_t i = 0; i < srcLength;) {
 873         int32_t nextIndex = i;
 874         UChar32 c;
 875         U16_NEXT(src, nextIndex, srcLength, c);
 876         uint32_t nextState = 0;
 877         int32_t type = ucase_getTypeOrIgnorable(c);
 878         if ((type & UCASE_IGNORABLE) != 0) {
 879             // c is case-ignorable
 880             nextState |= (state & AFTER_CASED);
 881         } else if (type != UCASE_NONE) {
 882             // c is cased
 883             nextState |= AFTER_CASED;
 884         }
 885         uint32_t data = getLetterData(c);
 886         if (data > 0) {
 887             uint32_t upper = data & UPPER_MASK;
 888             // Add a dialytika to this iota or ypsilon vowel
 889             // if we removed a tonos from the previous vowel,
 890             // and that previous vowel did not also have (or gain) a dialytika.
 891             // Adding one only to the final vowel in a longer sequence
 892             // (which does not occur in normal writing) would require lookahead.
 893             // Set the same flag as for preserving an existing dialytika.
 894             if ((data & HAS_VOWEL) != 0 && (state & AFTER_VOWEL_WITH_ACCENT) != 0 &&
 895                     (upper == 0x399 || upper == 0x3A5)) {
 896                 data |= HAS_DIALYTIKA;
 897             }
 898             int32_t numYpogegrammeni = 0;  // Map each one to a trailing, spacing, capital iota.
 899             if ((data & HAS_YPOGEGRAMMENI) != 0) {
 900                 numYpogegrammeni = 1;
 901             }
 902             // Skip combining diacritics after this Greek letter.
 903             while (nextIndex < srcLength) {
 904                 uint32_t diacriticData = getDiacriticData(src[nextIndex]);
 905                 if (diacriticData != 0) {
 906                     data |= diacriticData;
 907                     if ((diacriticData & HAS_YPOGEGRAMMENI) != 0) {
 908                         ++numYpogegrammeni;
 909                     }
 910                     ++nextIndex;
 911                 } else {
 912                     break;  // not a Greek diacritic
 913                 }
 914             }
 915             if ((data & HAS_VOWEL_AND_ACCENT_AND_DIALYTIKA) == HAS_VOWEL_AND_ACCENT) {
 916                 nextState |= AFTER_VOWEL_WITH_ACCENT;
 917             }
 918             // Map according to Greek rules.
 919             UBool addTonos = FALSE;
 920             if (upper == 0x397 &&
 921                     (data & HAS_ACCENT) != 0 &&
 922                     numYpogegrammeni == 0 &&
 923                     (state & AFTER_CASED) == 0 &&
 924                     !isFollowedByCasedLetter(src, nextIndex, srcLength)) {
 925                 // Keep disjunctive "or" with (only) a tonos.
 926                 // We use the same "word boundary" conditions as for the Final_Sigma test.
 927                 if (i == nextIndex) {
 928                     upper = 0x389;  // Preserve the precomposed form.
 929                 } else {
 930                     addTonos = TRUE;
 931                 }
 932             } else if ((data & HAS_DIALYTIKA) != 0) {
 933                 // Preserve a vowel with dialytika in precomposed form if it exists.
 934                 if (upper == 0x399) {
 935                     upper = 0x3AA;
 936                     data &= ~HAS_EITHER_DIALYTIKA;
 937                 } else if (upper == 0x3A5) {
 938                     upper = 0x3AB;
 939                     data &= ~HAS_EITHER_DIALYTIKA;
 940                 }
 941             }
 942
 943             UBool change = TRUE;
 944             if (edits != NULL) {
 945                 // Find out first whether we are changing the text.
 946                 change = src[i] != upper || numYpogegrammeni > 0;
 947                 int32_t i2 = i + 1;
 948                 if ((data & HAS_EITHER_DIALYTIKA) != 0) {
 949                     change |= i2 >= nextIndex || src[i2] != 0x308;
 950                     ++i2;
 951                 }
 952                 if (addTonos) {
 953                     change |= i2 >= nextIndex || src[i2] != 0x301;
 954                     ++i2;
 955                 }
 956                 int32_t oldLength = nextIndex - i;
 957                 int32_t newLength = (i2 - i) + numYpogegrammeni;
 958                 change |= oldLength != newLength;
 959                 if (change) {
 960                     if (edits != NULL) {
 961                         edits->addReplace(oldLength, newLength);
 962                     }
 963                 } else {
 964                     if (edits != NULL) {
 965                         edits->addUnchanged(oldLength);
 966                     }
 967                     // Write unchanged text?
 968                     change = (options & UCASEMAP_OMIT_UNCHANGED_TEXT) == 0;
 969                 }
 970             }
 971
 972             if (change) {
 973                 destIndex=appendUChar(dest, destIndex, destCapacity, (UChar)upper);
 974                 if (destIndex >= 0 && (data & HAS_EITHER_DIALYTIKA) != 0) {
 975                     destIndex=appendUChar(dest, destIndex, destCapacity, 0x308);  // restore or add a dialytika
 976                 }
 977                 if (destIndex >= 0 && addTonos) {
 978                     destIndex=appendUChar(dest, destIndex, destCapacity, 0x301);
 979                 }
 980                 while (destIndex >= 0 && numYpogegrammeni > 0) {
 981                     destIndex=appendUChar(dest, destIndex, destCapacity, 0x399);
 982                     --numYpogegrammeni;
 983                 }
 984                 if(destIndex<0) {
 985                     errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
 986                     return 0;
 987                 }
 988             }
 989         } else {
 990             const UChar *s;
 991             c=ucase_toFullUpper(c, NULL, NULL, &s, UCASE_LOC_GREEK);
 992             destIndex = appendResult(dest, destIndex, destCapacity, c, s,
 993                                      nextIndex - i, options, edits);
 994             if (destIndex < 0) {
 995                 errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
 996                 return 0;
 997             }
 998         }
 999         i = nextIndex;
1000         state = nextState;
1001     }
1002
1003     return destIndex;
1004 }
1005
1006 }  // namespace GreekUpper
1007 U_NAMESPACE_END
1008
1009 /* functions available in the common library (for unistr_case.cpp) */
1010
1011 U_CFUNC int32_t U_CALLCONV
1012 ustrcase_internalToLower(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_UNUSED
1013                          UChar *dest, int32_t destCapacity,
1014                          const UChar *src, int32_t srcLength,
1015                          icu::Edits *edits,
1016                          UErrorCode &errorCode) {
1017     UCaseContext csc=UCASECONTEXT_INITIALIZER;
1018     csc.p=(void *)src;
1019     csc.limit=srcLength;
1020     int32_t destIndex = _caseMap(
1021         caseLocale, options, ucase_toFullLower,
1022         dest, destCapacity,
1023         src, &csc, 0, srcLength,
1024         edits, errorCode);
1025     return checkOverflowAndEditsError(destIndex, destCapacity, edits, errorCode);
1026 }
1027
1028 U_CFUNC int32_t U_CALLCONV
1029 ustrcase_internalToUpper(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_UNUSED
1030                          UChar *dest, int32_t destCapacity,
1031                          const UChar *src, int32_t srcLength,
1032                          icu::Edits *edits,
1033                          UErrorCode &errorCode) {
1034     int32_t destIndex;
1035     if (caseLocale == UCASE_LOC_GREEK) {
1036         destIndex = GreekUpper::toUpper(options, dest, destCapacity,
1037                                         src, srcLength, edits, errorCode);
1038     } else {
1039         UCaseContext csc=UCASECONTEXT_INITIALIZER;
1040         csc.p=(void *)src;
1041         csc.limit=srcLength;
1042         destIndex = _caseMap(
1043             caseLocale, options, ucase_toFullUpper,
1044             dest, destCapacity,
1045             src, &csc, 0, srcLength,
1046             edits, errorCode);
1047     }
1048     return checkOverflowAndEditsError(destIndex, destCapacity, edits, errorCode);
1049 }
1050
1051 U_CFUNC int32_t U_CALLCONV
1052 ustrcase_internalFold(int32_t /* caseLocale */, uint32_t options, UCASEMAP_BREAK_ITERATOR_UNUSED
1053                       UChar *dest, int32_t destCapacity,
1054                       const UChar *src, int32_t srcLength,
1055                       icu::Edits *edits,
1056                       UErrorCode &errorCode) {
1057     /* case mapping loop */
1058     int32_t srcIndex = 0;
1059     int32_t destIndex = 0;
1060     while (srcIndex < srcLength) {
1061         int32_t cpStart = srcIndex;
1062         UChar32 c;
1063         U16_NEXT(src, srcIndex, srcLength, c);
1064         const UChar *s;
1065         c = ucase_toFullFolding(c, &s, options);
1066         destIndex = appendResult(dest, destIndex, destCapacity, c, s,
1067                                  srcIndex - cpStart, options, edits);
1068         if (destIndex < 0) {
1069             errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
1070             return 0;
1071         }
1072     }
1073
1074     return checkOverflowAndEditsError(destIndex, destCapacity, edits, errorCode);
1075 }
1076
1077 U_CFUNC int32_t
1078 ustrcase_map(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM
1079              UChar *dest, int32_t destCapacity,
1080              const UChar *src, int32_t srcLength,
1081              UStringCaseMapper *stringCaseMapper,
1082              icu::Edits *edits,
1083              UErrorCode &errorCode) {
1084     int32_t destLength;
1085
1086     /* check argument values */
1087     if(U_FAILURE(errorCode)) {
1088         return 0;
1089     }
1090     if( destCapacity<0 ||
1091         (dest==NULL && destCapacity>0) ||
1092         src==NULL ||
1093         srcLength<-1
1094     ) {
1095         errorCode=U_ILLEGAL_ARGUMENT_ERROR;
1096         return 0;
1097     }
1098
1099     /* get the string length */
1100     if(srcLength==-1) {
1101         srcLength=u_strlen(src);
1102     }
1103
1104     /* check for overlapping source and destination */
1105     if( dest!=NULL &&
1106         ((src>=dest && src<(dest+destCapacity)) ||
1107          (dest>=src && dest<(src+srcLength)))
1108     ) {
1109         errorCode=U_ILLEGAL_ARGUMENT_ERROR;
1110         return 0;
1111     }
1112
1113     if(edits!=NULL) {
1114         edits->reset();
1115     }
1116     destLength=stringCaseMapper(caseLocale, options, UCASEMAP_BREAK_ITERATOR
1117                                 dest, destCapacity, src, srcLength, edits, errorCode);
1118     return u_terminateUChars(dest, destCapacity, destLength, &errorCode);
1119 }
1120
1121 U_CFUNC int32_t
1122 ustrcase_mapWithOverlap(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM
1123                         UChar *dest, int32_t destCapacity,
1124                         const UChar *src, int32_t srcLength,
1125                         UStringCaseMapper *stringCaseMapper,
1126                         UErrorCode &errorCode) {
1127     UChar buffer[300];
1128     UChar *temp;
1129
1130     int32_t destLength;
1131
1132     /* check argument values */
1133     if(U_FAILURE(errorCode)) {
1134         return 0;
1135     }
1136     if( destCapacity<0 ||
1137         (dest==NULL && destCapacity>0) ||
1138         src==NULL ||
1139         srcLength<-1
1140     ) {
1141         errorCode=U_ILLEGAL_ARGUMENT_ERROR;
1142         return 0;
1143     }
1144
1145     /* get the string length */
1146     if(srcLength==-1) {
1147         srcLength=u_strlen(src);
1148     }
1149
1150     /* check for overlapping source and destination */
1151     if( dest!=NULL &&
1152         ((src>=dest && src<(dest+destCapacity)) ||
1153          (dest>=src && dest<(src+srcLength)))
1154     ) {
1155         /* overlap: provide a temporary destination buffer and later copy the result */
1156         if(destCapacity<=UPRV_LENGTHOF(buffer)) {
1157             /* the stack buffer is large enough */
1158             temp=buffer;
1159         } else {
1160             /* allocate a buffer */
1161             temp=(UChar *)uprv_malloc(destCapacity*U_SIZEOF_UCHAR);
1162             if(temp==NULL) {
1163                 errorCode=U_MEMORY_ALLOCATION_ERROR;
1164                 return 0;
1165             }
1166         }
1167     } else {
1168         temp=dest;
1169     }
1170
1171     destLength=stringCaseMapper(caseLocale, options, UCASEMAP_BREAK_ITERATOR
1172                                 temp, destCapacity, src, srcLength, NULL, errorCode);
1173     if(temp!=dest) {
1174         /* copy the result string to the destination buffer */
1175         if (U_SUCCESS(errorCode) && 0 < destLength && destLength <= destCapacity) {
1176             u_memmove(dest, temp, destLength);
1177         }
1178         if(temp!=buffer) {
1179             uprv_free(temp);
1180         }
1181     }
1182
1183     return u_terminateUChars(dest, destCapacity, destLength, &errorCode);
1184 }
1185
1186 /* public API functions */
1187
1188 U_CAPI int32_t U_EXPORT2
1189 u_strFoldCase(UChar *dest, int32_t destCapacity,
1190               const UChar *src, int32_t srcLength,
1191               uint32_t options,
1192               UErrorCode *pErrorCode) {
1193     return ustrcase_mapWithOverlap(
1194         UCASE_LOC_ROOT, options, UCASEMAP_BREAK_ITERATOR_NULL
1195         dest, destCapacity,
1196         src, srcLength,
1197         ustrcase_internalFold, *pErrorCode);
1198 }
1199
1200 U_NAMESPACE_BEGIN
1201
1202 int32_t CaseMap::fold(
1203         uint32_t options,
1204         const UChar *src, int32_t srcLength,
1205         UChar *dest, int32_t destCapacity, Edits *edits,
1206         UErrorCode &errorCode) {
1207     return ustrcase_map(
1208         UCASE_LOC_ROOT, options, UCASEMAP_BREAK_ITERATOR_NULL
1209         dest, destCapacity,
1210         src, srcLength,
1211         ustrcase_internalFold, edits, errorCode);
1212 }
1213
1214 U_NAMESPACE_END
1215
1216 /* case-insensitive string comparisons -------------------------------------- */
1217
1218 /*
1219  * This function is a copy of unorm_cmpEquivFold() minus the parts for
1220  * canonical equivalence.
1221  * Keep the functions in sync, and see there for how this works.
1222  * The duplication is for modularization:
1223  * It makes caseless (but not canonical caseless) matches independent of
1224  * the normalization code.
1225  */
1226
1227 /* stack element for previous-level source/decomposition pointers */
1228 struct CmpEquivLevel {
1229     const UChar *start, *s, *limit;
1230 };
1231 typedef struct CmpEquivLevel CmpEquivLevel;
1232
1233 /**
1234  * Internal implementation: compares two UTF-16 strings with case folding applied on the fly.
1235  * This function is called from u_strcmpFold() and u_caseInsensitivePrefixMatch().
      *
      * Code units are compared one by one. Only when two code units differ is the
      * code point around each of them case-folded with ucase_toFullFolding(); a folding
      * result is copied into fold1[]/fold2[] and read from there ("level 1") before
      * reading resumes in the original string ("level 0").
1236  *
1237  * @param s1            input string 1
1238  * @param length1       length of string 1, or -1 (NUL-terminated)
1239  * @param s2            input string 2
1240  * @param length2       length of string 2, or -1 (NUL-terminated)
1241  * @param options       compare options; with _STRNCMP_STYLE a NUL also ends a string that has an explicit length
1242  * @param matchLen1     (output) length of partial prefix match in s1; if NULL, neither output is written
1243  * @param matchLen2     (output) length of partial prefix match in s2; must not be NULL if matchLen1 is not NULL
1244  * @param pErrorCode    error status, must not be NULL; only read here: on failure the function returns 0 at once
1245  * @return 0 if equal, -1 if string 1 ends first, 1 if string 2 ends first, else the difference c1-c2 at the first mismatch
1246  */
1247 static int32_t _cmpFold(
1248             const UChar *s1, int32_t length1,
1249             const UChar *s2, int32_t length2,
1250             uint32_t options,
1251             int32_t *matchLen1, int32_t *matchLen2,
1252             UErrorCode *pErrorCode) {
1253     int32_t cmpRes = 0;
1254 
1255     /* current-level start/limit - s1/s2 as current */
1256     const UChar *start1, *start2, *limit1, *limit2;
1257 
1258     /* points to the original start address */
1259     const UChar *org1, *org2;
1260 
1261     /* points to the end of match + 1 */
1262     const UChar *m1, *m2;
1263 
1264     /* case folding variables */
1265     const UChar *p;
1266     int32_t length;
1267 
1268     /* stacks of previous-level start/current/limit */
1269     CmpEquivLevel stack1[2], stack2[2];
1270 
1271     /* case folding buffers, only use current-level start/limit */
1272     UChar fold1[UCASE_MAX_STRING_LENGTH+1], fold2[UCASE_MAX_STRING_LENGTH+1];
1273 
1274     /* track which is the current level per string */
1275     int32_t level1, level2;
1276 
1277     /* current code units, and code points for lookups */
1278     UChar32 c1, c2, cp1, cp2;
1279 
1280     /* no argument error checking because this itself is not an API */
1281 
1282     /*
1283      * assume that at least the option U_COMPARE_IGNORE_CASE is set
1284      * otherwise this function would have to behave exactly as uprv_strCompare()
1285      */
1286     if(U_FAILURE(*pErrorCode)) {
1287         return 0;
1288     }
1289 
1290     /* initialize */
1291     if(matchLen1) {
1292         U_ASSERT(matchLen2 !=NULL);
1293         *matchLen1=0;
1294         *matchLen2=0;
1295     }
1296 
         /* a NULL limit means "read until a NUL code unit" */
1297     start1=m1=org1=s1;
1298     if(length1==-1) {
1299         limit1=NULL;
1300     } else {
1301         limit1=s1+length1;
1302     }
1303 
1304     start2=m2=org2=s2;
1305     if(length2==-1) {
1306         limit2=NULL;
1307     } else {
1308         limit2=s2+length2;
1309     }
1310 
1311     level1=level2=0;
1312     c1=c2=-1;
1313 
1314     /* comparison loop */
1315     for(;;) {
1316         /*
1317          * here a code unit value of -1 means "get another code unit"
1318          * below it will mean "this source is finished"
1319          */
1320 
1321         if(c1<0) {
1322             /* get next code unit from string 1, post-increment */
1323             for(;;) {
1324                 if(s1==limit1 || ((c1=*s1)==0 && (limit1==NULL || (options&_STRNCMP_STYLE)))) {
1325                     if(level1==0) {
                             /* end of the original string: string 1 is finished */
1326                         c1=-1;
1327                         break;
1328                     }
1329                 } else {
1330                     ++s1;
1331                     break;
1332                 }
1333 
1334                 /* reached end of level buffer, pop one level */
1335                 do {
1336                     --level1;
1337                     start1=stack1[level1].start;    /*Not uninitialized*/
1338                 } while(start1==NULL);
1339                 s1=stack1[level1].s;                /*Not uninitialized*/
1340                 limit1=stack1[level1].limit;        /*Not uninitialized*/
1341             }
1342         }
1343 
1344         if(c2<0) {
1345             /* get next code unit from string 2, post-increment */
1346             for(;;) {
1347                 if(s2==limit2 || ((c2=*s2)==0 && (limit2==NULL || (options&_STRNCMP_STYLE)))) {
1348                     if(level2==0) {
                             /* end of the original string: string 2 is finished */
1349                         c2=-1;
1350                         break;
1351                     }
1352                 } else {
1353                     ++s2;
1354                     break;
1355                 }
1356 
1357                 /* reached end of level buffer, pop one level */
1358                 do {
1359                     --level2;
1360                     start2=stack2[level2].start;    /*Not uninitialized*/
1361                 } while(start2==NULL);
1362                 s2=stack2[level2].s;                /*Not uninitialized*/
1363                 limit2=stack2[level2].limit;        /*Not uninitialized*/
1364             }
1365         }
1366 
1367         /*
1368          * compare c1 and c2
1369          * either variable c1, c2 is -1 only if the corresponding string is finished
1370          */
1371         if(c1==c2) {
1372             const UChar *next1, *next2;
1373 
1374             if(c1<0) {
1375                 cmpRes=0;   /* c1==c2==-1 indicating end of strings */
1376                 break;
1377             }
1378 
1379             /*
1380              * Note: Move the match positions in both strings at the same time
1381              *      only when corresponding code point(s) in the original strings
1382              *      are fully consumed. For example, when comparing s1="Fust" and
1383              *      s2="Fu\u00dfball", s2[2] is folded into "ss", and s1[2] matches
1384              *      the first code point in the case-folded data. But the second "s"
1385              *      has no matching code point in s1, so this implementation returns
1386              *      2 as the prefix match length ("Fu").
1387              */
1388             next1=next2=NULL;
1389             if(level1==0) {
1390                 next1=s1;
1391             } else if(s1==limit1) {
1392                 /* Note: This implementation only uses a single level of stack.
1393                  *      If this code needs to be changed to use multiple levels
1394                  *      of stacks, the code above should check if the current
1395                  *      code is at the end of all stacks.
1396                  */
1397                 U_ASSERT(level1==1);
1398 
1399                 /* is s1 at the end of the current stack? */
1400                 next1=stack1[0].s;
1401             }
1402 
                 /* the match positions advance only if both sides are at a boundary */
1403             if (next1!=NULL) {
1404                 if(level2==0) {
1405                     next2=s2;
1406                 } else if(s2==limit2) {
1407                     U_ASSERT(level2==1);
1408 
1409                     /* is s2 at the end of the current stack? */
1410                     next2=stack2[0].s;
1411                 }
1412                 if(next2!=NULL) {
1413                     m1=next1;
1414                     m2=next2;
1415                 }
1416             }
1417             c1=c2=-1;       /* make us fetch new code units */
1418             continue;
1419         } else if(c1<0) {
1420             cmpRes=-1;      /* string 1 ends before string 2 */
1421             break;
1422         } else if(c2<0) {
1423             cmpRes=1;       /* string 2 ends before string 1 */
1424             break;
1425         }
1426         /* c1!=c2 && c1>=0 && c2>=0 */
1427 
1428         /* get complete code points for c1, c2 for lookups if either is a surrogate */
1429         cp1=c1;
1430         if(U_IS_SURROGATE(c1)) {
1431             UChar c;
1432 
1433             if(U_IS_SURROGATE_LEAD(c1)) {
1434                 if(s1!=limit1 && U16_IS_TRAIL(c=*s1)) {
1435                     /* advance ++s1; only below if cp1 decomposes/case-folds */
1436                     cp1=U16_GET_SUPPLEMENTARY(c1, c);
1437                 }
1438             } else /* isTrail(c1) */ {
                     /* s1 was post-incremented, so the unit before c1 is at s1-2 */
1439                 if(start1<=(s1-2) && U16_IS_LEAD(c=*(s1-2))) {
1440                     cp1=U16_GET_SUPPLEMENTARY(c, c1);
1441                 }
1442             }
1443         }
1444 
1445         cp2=c2;
1446         if(U_IS_SURROGATE(c2)) {
1447             UChar c;
1448 
1449             if(U_IS_SURROGATE_LEAD(c2)) {
1450                 if(s2!=limit2 && U16_IS_TRAIL(c=*s2)) {
1451                     /* advance ++s2; only below if cp2 decomposes/case-folds */
1452                     cp2=U16_GET_SUPPLEMENTARY(c2, c);
1453                 }
1454             } else /* isTrail(c2) */ {
                     /* s2 was post-incremented, so the unit before c2 is at s2-2 */
1455                 if(start2<=(s2-2) && U16_IS_LEAD(c=*(s2-2))) {
1456                     cp2=U16_GET_SUPPLEMENTARY(c, c2);
1457                 }
1458             }
1459         }
1460 
1461         /*
1462          * go down one level for each string
1463          * continue with the main loop as soon as there is a real change
1464          */
1465 
             /* only text from the original string (level 0) is folded; a negative result means "no folding" */
1466         if( level1==0 &&
1467             (length=ucase_toFullFolding((UChar32)cp1, &p, options))>=0
1468         ) {
1469             /* cp1 case-folds to the code point "length" or to p[length] */
1470             if(U_IS_SURROGATE(c1)) {
1471                 if(U_IS_SURROGATE_LEAD(c1)) {
1472                     /* advance beyond source surrogate pair if it case-folds */
1473                     ++s1;
1474                 } else /* isTrail(c1) */ {
1475                     /*
1476                      * we got a supplementary code point when hitting its trail surrogate,
1477                      * therefore the lead surrogate must have been the same as in the other string;
1478                      * compare this decomposition with the lead surrogate in the other string
1479                      * remember that this simulates bulk text replacement:
1480                      * the decomposition would replace the entire code point
1481                      */
1482                     --s2;
1483                     --m2;
1484                     c2=*(s2-1);
1485                 }
1486             }
1487 
1488             /* push current level pointers */
1489             stack1[0].start=start1;
1490             stack1[0].s=s1;
1491             stack1[0].limit=limit1;
1492             ++level1;
1493 
1494             /* copy the folding result to fold1[] */
1495             if(length<=UCASE_MAX_STRING_LENGTH) {
                     /* string result: p points to "length" code units */
1496                 u_memcpy(fold1, p, length);
1497             } else {
                     /* single code point result: "length" is the code point, write it as UTF-16 */
1498                 int32_t i=0;
1499                 U16_APPEND_UNSAFE(fold1, i, length);
1500                 length=i;
1501             }
1502 
1503             /* set next level pointers to case folding */
1504             start1=s1=fold1;
1505             limit1=fold1+length;
1506 
1507             /* get ready to read from decomposition, continue with loop */
1508             c1=-1;
1509             continue;
1510         }
1511 
1512         if( level2==0 &&
1513             (length=ucase_toFullFolding((UChar32)cp2, &p, options))>=0
1514         ) {
1515             /* cp2 case-folds to the code point "length" or to p[length] */
1516             if(U_IS_SURROGATE(c2)) {
1517                 if(U_IS_SURROGATE_LEAD(c2)) {
1518                     /* advance beyond source surrogate pair if it case-folds */
1519                     ++s2;
1520                 } else /* isTrail(c2) */ {
1521                     /*
1522                      * we got a supplementary code point when hitting its trail surrogate,
1523                      * therefore the lead surrogate must have been the same as in the other string;
1524                      * compare this decomposition with the lead surrogate in the other string
1525                      * remember that this simulates bulk text replacement:
1526                      * the decomposition would replace the entire code point
1527                      */
1528                     --s1;
1529                     --m2;   /* NOTE(review): m2 is adjusted here although s1 is the pointer backed up - confirm that m2 (not m1) is intended */
1530                     c1=*(s1-1);
1531                 }
1532             }
1533 
1534             /* push current level pointers */
1535             stack2[0].start=start2;
1536             stack2[0].s=s2;
1537             stack2[0].limit=limit2;
1538             ++level2;
1539 
1540             /* copy the folding result to fold2[] */
1541             if(length<=UCASE_MAX_STRING_LENGTH) {
                     /* string result: p points to "length" code units */
1542                 u_memcpy(fold2, p, length);
1543             } else {
                     /* single code point result: "length" is the code point, write it as UTF-16 */
1544                 int32_t i=0;
1545                 U16_APPEND_UNSAFE(fold2, i, length);
1546                 length=i;
1547             }
1548 
1549             /* set next level pointers to case folding */
1550             start2=s2=fold2;
1551             limit2=fold2+length;
1552 
1553             /* get ready to read from decomposition, continue with loop */
1554             c2=-1;
1555             continue;
1556         }
1557 
1558         /*
1559          * no decomposition/case folding, max level for both sides:
1560          * return difference result
1561          *
1562          * code point order comparison must not just return cp1-cp2
1563          * because when single surrogates are present then the surrogate pairs
1564          * that formed cp1 and cp2 may be from different string indexes
1565          *
1566          * example: { d800 d800 dc01 } vs. { d800 dc00 }, compare at second code units
1567          * c1=d800 cp1=10001 c2=dc00 cp2=10000
1568          * cp1-cp2>0 but c1-c2<0 and in fact in UTF-32 it is { d800 10001 } < { 10000 }
1569          *
1570          * therefore, use same fix-up as in ustring.c/uprv_strCompare()
1571          * except: uprv_strCompare() fetches c=*s while this function fetches c=*s++
1572          * so we have slightly different pointer/start/limit comparisons here
1573          */
1574 
1575         if(c1>=0xd800 && c2>=0xd800 && (options&U_COMPARE_CODE_POINT_ORDER)) {
1576             /* subtract 0x2800 from BMP code points to make them smaller than supplementary ones */
1577             if(
1578                 (c1<=0xdbff && s1!=limit1 && U16_IS_TRAIL(*s1)) ||
1579                 (U16_IS_TRAIL(c1) && start1!=(s1-1) && U16_IS_LEAD(*(s1-2)))
1580             ) {
1581                 /* part of a surrogate pair, leave >=d800 */
1582             } else {
1583                 /* BMP code point - may be surrogate code point - make <d800 */
1584                 c1-=0x2800;
1585             }
1586 
1587             if(
1588                 (c2<=0xdbff && s2!=limit2 && U16_IS_TRAIL(*s2)) ||
1589                 (U16_IS_TRAIL(c2) && start2!=(s2-1) && U16_IS_LEAD(*(s2-2)))
1590             ) {
1591                 /* part of a surrogate pair, leave >=d800 */
1592             } else {
1593                 /* BMP code point - may be surrogate code point - make <d800 */
1594                 c2-=0x2800;
1595             }
1596         }
1597 
1598         cmpRes=c1-c2;
1599         break;
1600     }
1601 
         /* report how many code units of each original string were matched */
1602     if(matchLen1) {
1603         *matchLen1=m1-org1;
1604         *matchLen2=m2-org2;
1605     }
1606     return cmpRes;
1607 }
1608
1609 /* internal function */
1610 U_CFUNC int32_t
1611 u_strcmpFold(const UChar *s1, int32_t length1,
1612              const UChar *s2, int32_t length2,
1613              uint32_t options,
1614              UErrorCode *pErrorCode) {
1615     return _cmpFold(s1, length1, s2, length2, options, NULL, NULL, pErrorCode);
1616 }
1617
1618 /* public API functions */
1619
1620 U_CAPI int32_t U_EXPORT2
1621 u_strCaseCompare(const UChar *s1, int32_t length1,
1622                  const UChar *s2, int32_t length2,
1623                  uint32_t options,
1624                  UErrorCode *pErrorCode) {
1625     /* argument checking */
1626     if(pErrorCode==0 || U_FAILURE(*pErrorCode)) {
1627         return 0;
1628     }
1629     if(s1==NULL || length1<-1 || s2==NULL || length2<-1) {
1630         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1631         return 0;
1632     }
1633     return u_strcmpFold(s1, length1, s2, length2,
1634                         options|U_COMPARE_IGNORE_CASE,
1635                         pErrorCode);
1636 }
1637
1638 U_CAPI int32_t U_EXPORT2
1639 u_strcasecmp(const UChar *s1, const UChar *s2, uint32_t options) {
1640     UErrorCode errorCode=U_ZERO_ERROR;
1641     return u_strcmpFold(s1, -1, s2, -1,
1642                         options|U_COMPARE_IGNORE_CASE,
1643                         &errorCode);
1644 }
1645
1646 U_CAPI int32_t U_EXPORT2
1647 u_memcasecmp(const UChar *s1, const UChar *s2, int32_t length, uint32_t options) {
1648     UErrorCode errorCode=U_ZERO_ERROR;
1649     return u_strcmpFold(s1, length, s2, length,
1650                         options|U_COMPARE_IGNORE_CASE,
1651                         &errorCode);
1652 }
1653
1654 U_CAPI int32_t U_EXPORT2
1655 u_strncasecmp(const UChar *s1, const UChar *s2, int32_t n, uint32_t options) {
1656     UErrorCode errorCode=U_ZERO_ERROR;
1657     return u_strcmpFold(s1, n, s2, n,
1658                         options|(U_COMPARE_IGNORE_CASE|_STRNCMP_STYLE),
1659                         &errorCode);
1660 }
1661
1662 /* internal API - detect length of shared prefix */
1663 U_CAPI void
1664 u_caseInsensitivePrefixMatch(const UChar *s1, int32_t length1,
1665                              const UChar *s2, int32_t length2,
1666                              uint32_t options,
1667                              int32_t *matchLen1, int32_t *matchLen2,
1668                              UErrorCode *pErrorCode) {
1669     _cmpFold(s1, length1, s2, length2, options,
1670         matchLen1, matchLen2, pErrorCode);
1671 }