icuSources/common/normalizer2impl.cpp

   1 // © 2016 and later: Unicode, Inc. and others.
   2 // License & terms of use: http://www.unicode.org/copyright.html
   3 /*
   4 *******************************************************************************
   5 *
   6 *   Copyright (C) 2009-2014, International Business Machines
   7 *   Corporation and others.  All Rights Reserved.
   8 *
   9 *******************************************************************************
  10 *   file name:  normalizer2impl.cpp
  11 *   encoding:   UTF-8
  12 *   tab size:   8 (not used)
  13 *   indentation:4
  14 *
  15 *   created on: 2009nov22
  16 *   created by: Markus W. Scherer
  17 */
  18
  19 #include "unicode/utypes.h"
  20
  21 #if !UCONFIG_NO_NORMALIZATION
  22
  23 #include "unicode/bytestream.h"
  24 #include "unicode/edits.h"
  25 #include "unicode/normalizer2.h"
  26 #include "unicode/stringoptions.h"
  27 #include "unicode/udata.h"
  28 #include "unicode/ustring.h"
  29 #include "unicode/utf16.h"
  30 #include "unicode/utf8.h"
  31 #include "bytesinkutil.h"
  32 #include "cmemory.h"
  33 #include "mutex.h"
  34 #include "normalizer2impl.h"
  35 #include "putilimp.h"
  36 #include "uassert.h"
  37 #include "uset_imp.h"
  38 #include "utrie2.h"
  39 #include "uvector.h"
  40
  41 U_NAMESPACE_BEGIN
  42
  43 namespace {
  44
  45 /**
  46  * UTF-8 lead byte for minNoMaybeCP.
  47  * Can be lower than the actual lead byte for c.
  48  * Typically U+0300 for NFC/NFD, U+00A0 for NFKC/NFKD, U+0041 for NFKC_Casefold.
  49  */
  50 inline uint8_t leadByteForCP(UChar32 c) {
  51     if (c <= 0x7f) {
  52         return (uint8_t)c;
  53     } else if (c <= 0x7ff) {
  54         return (uint8_t)(0xc0+(c>>6));
  55     } else {
  56         // Should not occur because ccc(U+0300)!=0.
  57         return 0xe0;
  58     }
  59 }
  60
  61 /**
  62  * Returns the code point from one single well-formed UTF-8 byte sequence
  63  * between cpStart and cpLimit.
  64  *
  65  * UTrie2 UTF-8 macros do not assemble whole code points (for efficiency).
  66  * When we do need the code point, we call this function.
  67  * We should not need it for normalization-inert data (norm16==0).
  68  * Illegal sequences yield the error value norm16==0 just like real normalization-inert code points.
  69  */
  70 UChar32 codePointFromValidUTF8(const uint8_t *cpStart, const uint8_t *cpLimit) {
  71     // Similar to U8_NEXT_UNSAFE(s, i, c).
  72     U_ASSERT(cpStart < cpLimit);
  73     uint8_t c = *cpStart;
  74     switch(cpLimit-cpStart) {
  75     case 1:
  76         return c;
  77     case 2:
  78         return ((c&0x1f)<<6) | (cpStart[1]&0x3f);
  79     case 3:
  80         // no need for (c&0xf) because the upper bits are truncated after <<12 in the cast to (UChar)
  81         return (UChar)((c<<12) | ((cpStart[1]&0x3f)<<6) | (cpStart[2]&0x3f));
  82     case 4:
  83         return ((c&7)<<18) | ((cpStart[1]&0x3f)<<12) | ((cpStart[2]&0x3f)<<6) | (cpStart[3]&0x3f);
  84     default:
  85         U_ASSERT(FALSE);  // Should not occur.
  86         return U_SENTINEL;
  87     }
  88 }
  89
  90 /**
  91  * Returns the last code point in [start, p[ if it is valid and in U+1000..U+D7FF.
  92  * Otherwise returns a negative value.
  93  */
  94 UChar32 previousHangulOrJamo(const uint8_t *start, const uint8_t *p) {
  95     if ((p - start) >= 3) {
  96         p -= 3;
  97         uint8_t l = *p;
  98         uint8_t t1, t2;
  99         if (0xe1 <= l && l <= 0xed &&
 100                 (t1 = (uint8_t)(p[1] - 0x80)) <= 0x3f &&
 101                 (t2 = (uint8_t)(p[2] - 0x80)) <= 0x3f &&
 102                 (l < 0xed || t1 <= 0x1f)) {
 103             return ((l & 0xf) << 12) | (t1 << 6) | t2;
 104         }
 105     }
 106     return U_SENTINEL;
 107 }
 108
 109 /**
 110  * Returns the offset from the Jamo T base if [src, limit[ starts with a single Jamo T code point.
 111  * Otherwise returns a negative value.
 112  */
 113 int32_t getJamoTMinusBase(const uint8_t *src, const uint8_t *limit) {
 114     // Jamo T: E1 86 A8..E1 87 82
 115     if ((limit - src) >= 3 && *src == 0xe1) {
 116         if (src[1] == 0x86) {
 117             uint8_t t = src[2];
 118             // The first Jamo T is U+11A8 but JAMO_T_BASE is 11A7.
 119             // Offset 0 does not correspond to any conjoining Jamo.
 120             if (0xa8 <= t && t <= 0xbf) {
 121                 return t - 0xa7;
 122             }
 123         } else if (src[1] == 0x87) {
 124             uint8_t t = src[2];
 125             if ((int8_t)t <= (int8_t)0x82) {
 126                 return t - (0xa7 - 0x40);
 127             }
 128         }
 129     }
 130     return -1;
 131 }
 132
 133 void
 134 appendCodePointDelta(const uint8_t *cpStart, const uint8_t *cpLimit, int32_t delta,
 135                      ByteSink &sink, Edits *edits) {
 136     char buffer[U8_MAX_LENGTH];
 137     int32_t length;
 138     int32_t cpLength = (int32_t)(cpLimit - cpStart);
 139     if (cpLength == 1) {
 140         // The builder makes ASCII map to ASCII.
 141         buffer[0] = (uint8_t)(*cpStart + delta);
 142         length = 1;
 143     } else {
 144         int32_t trail = *(cpLimit-1) + delta;
 145         if (0x80 <= trail && trail <= 0xbf) {
 146             // The delta only changes the last trail byte.
 147             --cpLimit;
 148             length = 0;
 149             do { buffer[length++] = *cpStart++; } while (cpStart < cpLimit);
 150             buffer[length++] = (uint8_t)trail;
 151         } else {
 152             // Decode the code point, add the delta, re-encode.
 153             UChar32 c = codePointFromValidUTF8(cpStart, cpLimit) + delta;
 154             length = 0;
 155             U8_APPEND_UNSAFE(buffer, length, c);
 156         }
 157     }
 158     if (edits != nullptr) {
 159         edits->addReplace(cpLength, length);
 160     }
 161     sink.Append(buffer, length);
 162 }
 163
 164 }  // namespace
 165
 166 // ReorderingBuffer -------------------------------------------------------- ***
 167
 168 ReorderingBuffer::ReorderingBuffer(const Normalizer2Impl &ni, UnicodeString &dest,
 169                                    UErrorCode &errorCode) :
 170         impl(ni), str(dest),
 171         start(str.getBuffer(8)), reorderStart(start), limit(start),
 172         remainingCapacity(str.getCapacity()), lastCC(0) {
 173     if (start == nullptr && U_SUCCESS(errorCode)) {
 174         // getBuffer() already did str.setToBogus()
 175         errorCode = U_MEMORY_ALLOCATION_ERROR;
 176     }
 177 }
 178
 179 UBool ReorderingBuffer::init(int32_t destCapacity, UErrorCode &errorCode) {
 180     int32_t length=str.length();
 181     start=str.getBuffer(destCapacity);
 182     if(start==NULL) {
 183         // getBuffer() already did str.setToBogus()
 184         errorCode=U_MEMORY_ALLOCATION_ERROR;
 185         return FALSE;
 186     }
 187     limit=start+length;
 188     remainingCapacity=str.getCapacity()-length;
 189     reorderStart=start;
 190     if(start==limit) {
 191         lastCC=0;
 192     } else {
 193         setIterator();
 194         lastCC=previousCC();
 195         // Set reorderStart after the last code point with cc<=1 if there is one.
 196         if(lastCC>1) {
 197             while(previousCC()>1) {}
 198         }
 199         reorderStart=codePointLimit;
 200     }
 201     return TRUE;
 202 }
 203
 204 UBool ReorderingBuffer::equals(const UChar *otherStart, const UChar *otherLimit) const {
 205     int32_t length=(int32_t)(limit-start);
 206     return
 207         length==(int32_t)(otherLimit-otherStart) &&
 208         0==u_memcmp(start, otherStart, length);
 209 }
 210
 211 UBool ReorderingBuffer::equals(const uint8_t *otherStart, const uint8_t *otherLimit) const {
 212     U_ASSERT((otherLimit - otherStart) <= INT32_MAX);  // ensured by caller
 213     int32_t length = (int32_t)(limit - start);
 214     int32_t otherLength = (int32_t)(otherLimit - otherStart);
 215     // For equal strings, UTF-8 is at least as long as UTF-16, and at most three times as long.
 216     if (otherLength < length || (otherLength / 3) > length) {
 217         return FALSE;
 218     }
 219     // Compare valid strings from between normalization boundaries.
 220     // (Invalid sequences are normalization-inert.)
 221     for (int32_t i = 0, j = 0;;) {
 222         if (i >= length) {
 223             return j >= otherLength;
 224         } else if (j >= otherLength) {
 225             return FALSE;
 226         }
 227         // Not at the end of either string yet.
 228         UChar32 c, other;
 229         U16_NEXT_UNSAFE(start, i, c);
 230         U8_NEXT_UNSAFE(otherStart, j, other);
 231         if (c != other) {
 232             return FALSE;
 233         }
 234     }
 235 }
 236
 237 UBool ReorderingBuffer::appendSupplementary(UChar32 c, uint8_t cc, UErrorCode &errorCode) {
 238     if(remainingCapacity<2 && !resize(2, errorCode)) {
 239         return FALSE;
 240     }
 241     if(lastCC<=cc || cc==0) {
 242         limit[0]=U16_LEAD(c);
 243         limit[1]=U16_TRAIL(c);
 244         limit+=2;
 245         lastCC=cc;
 246         if(cc<=1) {
 247             reorderStart=limit;
 248         }
 249     } else {
 250         insert(c, cc);
 251     }
 252     remainingCapacity-=2;
 253     return TRUE;
 254 }
 255
 256 UBool ReorderingBuffer::append(const UChar *s, int32_t length,
 257                                uint8_t leadCC, uint8_t trailCC,
 258                                UErrorCode &errorCode) {
 259     if(length==0) {
 260         return TRUE;
 261     }
 262     if(remainingCapacity<length && !resize(length, errorCode)) {
 263         return FALSE;
 264     }
 265     remainingCapacity-=length;
 266     if(lastCC<=leadCC || leadCC==0) {
 267         if(trailCC<=1) {
 268             reorderStart=limit+length;
 269         } else if(leadCC<=1) {
 270             reorderStart=limit+1;  // Ok if not a code point boundary.
 271         }
 272         const UChar *sLimit=s+length;
 273         do { *limit++=*s++; } while(s!=sLimit);
 274         lastCC=trailCC;
 275     } else {
 276         int32_t i=0;
 277         UChar32 c;
 278         U16_NEXT(s, i, length, c);
 279         insert(c, leadCC);  // insert first code point
 280         while(i<length) {
 281             U16_NEXT(s, i, length, c);
 282             if(i<length) {
 283                 // s must be in NFD, otherwise we need to use getCC().
 284                 leadCC=Normalizer2Impl::getCCFromYesOrMaybe(impl.getNorm16(c));
 285             } else {
 286                 leadCC=trailCC;
 287             }
 288             append(c, leadCC, errorCode);
 289         }
 290     }
 291     return TRUE;
 292 }
 293
 294 UBool ReorderingBuffer::appendZeroCC(UChar32 c, UErrorCode &errorCode) {
 295     int32_t cpLength=U16_LENGTH(c);
 296     if(remainingCapacity<cpLength && !resize(cpLength, errorCode)) {
 297         return FALSE;
 298     }
 299     remainingCapacity-=cpLength;
 300     if(cpLength==1) {
 301         *limit++=(UChar)c;
 302     } else {
 303         limit[0]=U16_LEAD(c);
 304         limit[1]=U16_TRAIL(c);
 305         limit+=2;
 306     }
 307     lastCC=0;
 308     reorderStart=limit;
 309     return TRUE;
 310 }
 311
 312 UBool ReorderingBuffer::appendZeroCC(const UChar *s, const UChar *sLimit, UErrorCode &errorCode) {
 313     if(s==sLimit) {
 314         return TRUE;
 315     }
 316     int32_t length=(int32_t)(sLimit-s);
 317     if(remainingCapacity<length && !resize(length, errorCode)) {
 318         return FALSE;
 319     }
 320     u_memcpy(limit, s, length);
 321     limit+=length;
 322     remainingCapacity-=length;
 323     lastCC=0;
 324     reorderStart=limit;
 325     return TRUE;
 326 }
 327
 328 void ReorderingBuffer::remove() {
 329     reorderStart=limit=start;
 330     remainingCapacity=str.getCapacity();
 331     lastCC=0;
 332 }
 333
 334 void ReorderingBuffer::removeSuffix(int32_t suffixLength) {
 335     if(suffixLength<(limit-start)) {
 336         limit-=suffixLength;
 337         remainingCapacity+=suffixLength;
 338     } else {
 339         limit=start;
 340         remainingCapacity=str.getCapacity();
 341     }
 342     lastCC=0;
 343     reorderStart=limit;
 344 }
 345
 346 UBool ReorderingBuffer::resize(int32_t appendLength, UErrorCode &errorCode) {
 347     int32_t reorderStartIndex=(int32_t)(reorderStart-start);
 348     int32_t length=(int32_t)(limit-start);
 349     str.releaseBuffer(length);
 350     int32_t newCapacity=length+appendLength;
 351     int32_t doubleCapacity=2*str.getCapacity();
 352     if(newCapacity<doubleCapacity) {
 353         newCapacity=doubleCapacity;
 354     }
 355     if(newCapacity<256) {
 356         newCapacity=256;
 357     }
 358     start=str.getBuffer(newCapacity);
 359     if(start==NULL) {
 360         // getBuffer() already did str.setToBogus()
 361         errorCode=U_MEMORY_ALLOCATION_ERROR;
 362         return FALSE;
 363     }
 364     reorderStart=start+reorderStartIndex;
 365     limit=start+length;
 366     remainingCapacity=str.getCapacity()-length;
 367     return TRUE;
 368 }
 369
 370 void ReorderingBuffer::skipPrevious() {
 371     codePointLimit=codePointStart;
 372     UChar c=*--codePointStart;
 373     if(U16_IS_TRAIL(c) && start<codePointStart && U16_IS_LEAD(*(codePointStart-1))) {
 374         --codePointStart;
 375     }
 376 }
 377
 378 uint8_t ReorderingBuffer::previousCC() {
 379     codePointLimit=codePointStart;
 380     if(reorderStart>=codePointStart) {
 381         return 0;
 382     }
 383     UChar32 c=*--codePointStart;
 384     UChar c2;
 385     if(U16_IS_TRAIL(c) && start<codePointStart && U16_IS_LEAD(c2=*(codePointStart-1))) {
 386         --codePointStart;
 387         c=U16_GET_SUPPLEMENTARY(c2, c);
 388     }
 389     return impl.getCCFromYesOrMaybeCP(c);
 390 }
 391
 392 // Inserts c somewhere before the last character.
 393 // Requires 0<cc<lastCC which implies reorderStart<limit.
 394 void ReorderingBuffer::insert(UChar32 c, uint8_t cc) {
 395     for(setIterator(), skipPrevious(); previousCC()>cc;) {}
 396     // insert c at codePointLimit, after the character with prevCC<=cc
 397     UChar *q=limit;
 398     UChar *r=limit+=U16_LENGTH(c);
 399     do {
 400         *--r=*--q;
 401     } while(codePointLimit!=q);
 402     writeCodePoint(q, c);
 403     if(cc<=1) {
 404         reorderStart=r;
 405     }
 406 }
 407
 408 // Normalizer2Impl --------------------------------------------------------- ***
 409
 410 struct CanonIterData : public UMemory {
 411     CanonIterData(UErrorCode &errorCode);
 412     ~CanonIterData();
 413     void addToStartSet(UChar32 origin, UChar32 decompLead, UErrorCode &errorCode);
 414     UTrie2 *trie;
 415     UVector canonStartSets;  // contains UnicodeSet *
 416 };
 417
 418 Normalizer2Impl::~Normalizer2Impl() {
 419     delete fCanonIterData;
 420 }
 421
 422 void
 423 Normalizer2Impl::init(const int32_t *inIndexes, const UTrie2 *inTrie,
 424                       const uint16_t *inExtraData, const uint8_t *inSmallFCD) {
 425     minDecompNoCP=inIndexes[IX_MIN_DECOMP_NO_CP];
 426     minCompNoMaybeCP=inIndexes[IX_MIN_COMP_NO_MAYBE_CP];
 427     minLcccCP=inIndexes[IX_MIN_LCCC_CP];
 428
 429     minYesNo=inIndexes[IX_MIN_YES_NO];
 430     minYesNoMappingsOnly=inIndexes[IX_MIN_YES_NO_MAPPINGS_ONLY];
 431     minNoNo=inIndexes[IX_MIN_NO_NO];
 432     minNoNoCompBoundaryBefore=inIndexes[IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE];
 433     minNoNoCompNoMaybeCC=inIndexes[IX_MIN_NO_NO_COMP_NO_MAYBE_CC];
 434     minNoNoEmpty=inIndexes[IX_MIN_NO_NO_EMPTY];
 435     limitNoNo=inIndexes[IX_LIMIT_NO_NO];
 436     minMaybeYes=inIndexes[IX_MIN_MAYBE_YES];
 437     U_ASSERT((minMaybeYes&7)==0);  // 8-aligned for noNoDelta bit fields
 438     centerNoNoDelta=(minMaybeYes>>DELTA_SHIFT)-MAX_DELTA-1;
 439
 440     normTrie=inTrie;
 441
 442     maybeYesCompositions=inExtraData;
 443     extraData=maybeYesCompositions+((MIN_NORMAL_MAYBE_YES-minMaybeYes)>>OFFSET_SHIFT);
 444
 445     smallFCD=inSmallFCD;
 446 }
 447
 448 class LcccContext {
 449 public:
 450     LcccContext(const Normalizer2Impl &ni, UnicodeSet &s) : impl(ni), set(s) {}
 451
 452     void handleRange(UChar32 start, UChar32 end, uint16_t norm16) {
 453         if (norm16 > Normalizer2Impl::MIN_NORMAL_MAYBE_YES &&
 454                 norm16 != Normalizer2Impl::JAMO_VT) {
 455             set.add(start, end);
 456         } else if (impl.minNoNoCompNoMaybeCC <= norm16 && norm16 < impl.limitNoNo) {
 457             uint16_t fcd16=impl.getFCD16(start);
 458             if(fcd16>0xff) { set.add(start, end); }
 459         }
 460     }
 461
 462 private:
 463     const Normalizer2Impl &impl;
 464     UnicodeSet &set;
 465 };
 466
 467 namespace {
 468
 469 struct PropertyStartsContext {
 470     PropertyStartsContext(const Normalizer2Impl &ni, const USetAdder *adder)
 471             : impl(ni), sa(adder) {}
 472
 473     const Normalizer2Impl &impl;
 474     const USetAdder *sa;
 475 };
 476
 477 }  // namespace
 478
 479 U_CDECL_BEGIN
 480
 481 static UBool U_CALLCONV
 482 enumLcccRange(const void *context, UChar32 start, UChar32 end, uint32_t value) {
 483     ((LcccContext *)context)->handleRange(start, end, (uint16_t)value);
 484     return TRUE;
 485 }
 486
 487 static UBool U_CALLCONV
 488 enumNorm16PropertyStartsRange(const void *context, UChar32 start, UChar32 end, uint32_t value) {
 489     /* add the start code point to the USet */
 490     const PropertyStartsContext *ctx=(const PropertyStartsContext *)context;
 491     const USetAdder *sa=ctx->sa;
 492     sa->add(sa->set, start);
 493     if (start != end && ctx->impl.isAlgorithmicNoNo((uint16_t)value) &&
 494             (value & Normalizer2Impl::DELTA_TCCC_MASK) > Normalizer2Impl::DELTA_TCCC_1) {
 495         // Range of code points with same-norm16-value algorithmic decompositions.
 496         // They might have different non-zero FCD16 values.
 497         uint16_t prevFCD16=ctx->impl.getFCD16(start);
 498         while(++start<=end) {
 499             uint16_t fcd16=ctx->impl.getFCD16(start);
 500             if(fcd16!=prevFCD16) {
 501                 sa->add(sa->set, start);
 502                 prevFCD16=fcd16;
 503             }
 504         }
 505     }
 506     return TRUE;
 507 }
 508
 509 static UBool U_CALLCONV
 510 enumPropertyStartsRange(const void *context, UChar32 start, UChar32 /*end*/, uint32_t /*value*/) {
 511     /* add the start code point to the USet */
 512     const USetAdder *sa=(const USetAdder *)context;
 513     sa->add(sa->set, start);
 514     return TRUE;
 515 }
 516
 517 static uint32_t U_CALLCONV
 518 segmentStarterMapper(const void * /*context*/, uint32_t value) {
 519     return value&CANON_NOT_SEGMENT_STARTER;
 520 }
 521
 522 U_CDECL_END
 523
 524 void
 525 Normalizer2Impl::addLcccChars(UnicodeSet &set) const {
 526     LcccContext context(*this, set);
 527     utrie2_enum(normTrie, NULL, enumLcccRange, &context);
 528 }
 529
 530 void
 531 Normalizer2Impl::addPropertyStarts(const USetAdder *sa, UErrorCode & /*errorCode*/) const {
 532     /* add the start code point of each same-value range of each trie */
 533     PropertyStartsContext context(*this, sa);
 534     utrie2_enum(normTrie, NULL, enumNorm16PropertyStartsRange, &context);
 535
 536     /* add Hangul LV syllables and LV+1 because of skippables */
 537     for(UChar c=Hangul::HANGUL_BASE; c<Hangul::HANGUL_LIMIT; c+=Hangul::JAMO_T_COUNT) {
 538         sa->add(sa->set, c);
 539         sa->add(sa->set, c+1);
 540     }
 541     sa->add(sa->set, Hangul::HANGUL_LIMIT); /* add Hangul+1 to continue with other properties */
 542 }
 543
 544 void
 545 Normalizer2Impl::addCanonIterPropertyStarts(const USetAdder *sa, UErrorCode &errorCode) const {
 546     /* add the start code point of each same-value range of the canonical iterator data trie */
 547     if(ensureCanonIterData(errorCode)) {
 548         // currently only used for the SEGMENT_STARTER property
 549         utrie2_enum(fCanonIterData->trie, segmentStarterMapper, enumPropertyStartsRange, sa);
 550     }
 551 }
 552
 553 const UChar *
 554 Normalizer2Impl::copyLowPrefixFromNulTerminated(const UChar *src,
 555                                                 UChar32 minNeedDataCP,
 556                                                 ReorderingBuffer *buffer,
 557                                                 UErrorCode &errorCode) const {
 558     // Make some effort to support NUL-terminated strings reasonably.
 559     // Take the part of the fast quick check loop that does not look up
 560     // data and check the first part of the string.
 561     // After this prefix, determine the string length to simplify the rest
 562     // of the code.
 563     const UChar *prevSrc=src;
 564     UChar c;
 565     while((c=*src++)<minNeedDataCP && c!=0) {}
 566     // Back out the last character for full processing.
 567     // Copy this prefix.
 568     if(--src!=prevSrc) {
 569         if(buffer!=NULL) {
 570             buffer->appendZeroCC(prevSrc, src, errorCode);
 571         }
 572     }
 573     return src;
 574 }
 575
 576 UnicodeString &
 577 Normalizer2Impl::decompose(const UnicodeString &src, UnicodeString &dest,
 578                            UErrorCode &errorCode) const {
 579     if(U_FAILURE(errorCode)) {
 580         dest.setToBogus();
 581         return dest;
 582     }
 583     const UChar *sArray=src.getBuffer();
 584     if(&dest==&src || sArray==NULL) {
 585         errorCode=U_ILLEGAL_ARGUMENT_ERROR;
 586         dest.setToBogus();
 587         return dest;
 588     }
 589     decompose(sArray, sArray+src.length(), dest, src.length(), errorCode);
 590     return dest;
 591 }
 592
 593 void
 594 Normalizer2Impl::decompose(const UChar *src, const UChar *limit,
 595                            UnicodeString &dest,
 596                            int32_t destLengthEstimate,
 597                            UErrorCode &errorCode) const {
 598     if(destLengthEstimate<0 && limit!=NULL) {
 599         destLengthEstimate=(int32_t)(limit-src);
 600     }
 601     dest.remove();
 602     ReorderingBuffer buffer(*this, dest);
 603     if(buffer.init(destLengthEstimate, errorCode)) {
 604         decompose(src, limit, &buffer, errorCode);
 605     }
 606 }
 607
 608 // Dual functionality:
 609 // buffer!=NULL: normalize
 610 // buffer==NULL: isNormalized/spanQuickCheckYes
 611 const UChar *
 612 Normalizer2Impl::decompose(const UChar *src, const UChar *limit,
 613                            ReorderingBuffer *buffer,
 614                            UErrorCode &errorCode) const {
 615     UChar32 minNoCP=minDecompNoCP;
 616     if(limit==NULL) {
 617         src=copyLowPrefixFromNulTerminated(src, minNoCP, buffer, errorCode);
 618         if(U_FAILURE(errorCode)) {
 619             return src;
 620         }
 621         limit=u_strchr(src, 0);
 622     }
 623
 624     const UChar *prevSrc;
 625     UChar32 c=0;
 626     uint16_t norm16=0;
 627
 628     // only for quick check
 629     const UChar *prevBoundary=src;
 630     uint8_t prevCC=0;
 631
 632     for(;;) {
 633         // count code units below the minimum or with irrelevant data for the quick check
 634         for(prevSrc=src; src!=limit;) {
 635             if( (c=*src)<minNoCP ||
 636                 isMostDecompYesAndZeroCC(norm16=UTRIE2_GET16_FROM_U16_SINGLE_LEAD(normTrie, c))
 637             ) {
 638                 ++src;
 639             } else if(!U16_IS_SURROGATE(c)) {
 640                 break;
 641             } else {
 642                 UChar c2;
 643                 if(U16_IS_SURROGATE_LEAD(c)) {
 644                     if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) {
 645                         c=U16_GET_SUPPLEMENTARY(c, c2);
 646                     }
 647                 } else /* trail surrogate */ {
 648                     if(prevSrc<src && U16_IS_LEAD(c2=*(src-1))) {
 649                         --src;
 650                         c=U16_GET_SUPPLEMENTARY(c2, c);
 651                     }
 652                 }
 653                 if(isMostDecompYesAndZeroCC(norm16=getNorm16(c))) {
 654                     src+=U16_LENGTH(c);
 655                 } else {
 656                     break;
 657                 }
 658             }
 659         }
 660         // copy these code units all at once
 661         if(src!=prevSrc) {
 662             if(buffer!=NULL) {
 663                 if(!buffer->appendZeroCC(prevSrc, src, errorCode)) {
 664                     break;
 665                 }
 666             } else {
 667                 prevCC=0;
 668                 prevBoundary=src;
 669             }
 670         }
 671         if(src==limit) {
 672             break;
 673         }
 674
 675         // Check one above-minimum, relevant code point.
 676         src+=U16_LENGTH(c);
 677         if(buffer!=NULL) {
 678             if(!decompose(c, norm16, *buffer, errorCode)) {
 679                 break;
 680             }
 681         } else {
 682             if(isDecompYes(norm16)) {
 683                 uint8_t cc=getCCFromYesOrMaybe(norm16);
 684                 if(prevCC<=cc || cc==0) {
 685                     prevCC=cc;
 686                     if(cc<=1) {
 687                         prevBoundary=src;
 688                     }
 689                     continue;
 690                 }
 691             }
 692             return prevBoundary;  // "no" or cc out of order
 693         }
 694     }
 695     return src;
 696 }
 697
 698 // Decompose a short piece of text which is likely to contain characters that
 699 // fail the quick check loop and/or where the quick check loop's overhead
 700 // is unlikely to be amortized.
 701 // Called by the compose() and makeFCD() implementations.
 702 const UChar *
 703 Normalizer2Impl::decomposeShort(const UChar *src, const UChar *limit,
 704                                 UBool stopAtCompBoundary, UBool onlyContiguous,
 705                                 ReorderingBuffer &buffer, UErrorCode &errorCode) const {
 706     if (U_FAILURE(errorCode)) {
 707         return nullptr;
 708     }
 709     while(src<limit) {
 710         if (stopAtCompBoundary && *src < minCompNoMaybeCP) {
 711             return src;
 712         }
 713         const UChar *prevSrc = src;
 714         UChar32 c;
 715         uint16_t norm16;
 716         UTRIE2_U16_NEXT16(normTrie, src, limit, c, norm16);
 717         if (stopAtCompBoundary && norm16HasCompBoundaryBefore(norm16)) {
 718             return prevSrc;
 719         }
 720         if(!decompose(c, norm16, buffer, errorCode)) {
 721             return nullptr;
 722         }
 723         if (stopAtCompBoundary && norm16HasCompBoundaryAfter(norm16, onlyContiguous)) {
 724             return src;
 725         }
 726     }
 727     return src;
 728 }
 729
 730 UBool Normalizer2Impl::decompose(UChar32 c, uint16_t norm16,
 731                                  ReorderingBuffer &buffer,
 732                                  UErrorCode &errorCode) const {
 733     // get the decomposition and the lead and trail cc's
 734     if (norm16 >= limitNoNo) {
 735         if (isMaybeOrNonZeroCC(norm16)) {
 736             return buffer.append(c, getCCFromYesOrMaybe(norm16), errorCode);
 737         }
 738         // Maps to an isCompYesAndZeroCC.
 739         c=mapAlgorithmic(c, norm16);
 740         norm16=getNorm16(c);
 741     }
 742     if (norm16 < minYesNo) {
 743         // c does not decompose
 744         return buffer.append(c, 0, errorCode);
 745     } else if(isHangulLV(norm16) || isHangulLVT(norm16)) {
 746         // Hangul syllable: decompose algorithmically
 747         UChar jamos[3];
 748         return buffer.appendZeroCC(jamos, jamos+Hangul::decompose(c, jamos), errorCode);
 749     }
 750     // c decomposes, get everything from the variable-length extra data
 751     const uint16_t *mapping=getMapping(norm16);
 752     uint16_t firstUnit=*mapping;
 753     int32_t length=firstUnit&MAPPING_LENGTH_MASK;
 754     uint8_t leadCC, trailCC;
 755     trailCC=(uint8_t)(firstUnit>>8);
 756     if(firstUnit&MAPPING_HAS_CCC_LCCC_WORD) {
 757         leadCC=(uint8_t)(*(mapping-1)>>8);
 758     } else {
 759         leadCC=0;
 760     }
 761     return buffer.append((const UChar *)mapping+1, length, leadCC, trailCC, errorCode);
 762 }
 763
 764 const uint8_t *
 765 Normalizer2Impl::decomposeShort(const uint8_t *src, const uint8_t *limit,
 766                                 UBool stopAtCompBoundary, UBool onlyContiguous,
 767                                 ReorderingBuffer &buffer, UErrorCode &errorCode) const {
 768     if (U_FAILURE(errorCode)) {
 769         return nullptr;
 770     }
 771     while (src < limit) {
 772         const uint8_t *prevSrc = src;
 773         uint16_t norm16;
 774         UTRIE2_U8_NEXT16(normTrie, src, limit, norm16);
 775         // Get the decomposition and the lead and trail cc's.
 776         UChar32 c = U_SENTINEL;
 777         if (norm16 >= limitNoNo) {
 778             if (isMaybeOrNonZeroCC(norm16)) {
 779                 // No boundaries around this character.
 780                 c = codePointFromValidUTF8(prevSrc, src);
 781                 if (!buffer.append(c, getCCFromYesOrMaybe(norm16), errorCode)) {
 782                     return nullptr;
 783                 }
 784                 continue;
 785             }
 786             // Maps to an isCompYesAndZeroCC.
 787             if (stopAtCompBoundary) {
 788                 return prevSrc;
 789             }
 790             c = codePointFromValidUTF8(prevSrc, src);
 791             c = mapAlgorithmic(c, norm16);
 792             norm16 = getNorm16(c);
 793         } else if (stopAtCompBoundary && norm16 < minNoNoCompNoMaybeCC) {
 794             return prevSrc;
 795         }
 796         // norm16!=INERT guarantees that [prevSrc, src[ is valid UTF-8.
 797         // We do not see invalid UTF-8 here because
 798         // its norm16==INERT is normalization-inert,
 799         // so it gets copied unchanged in the fast path,
 800         // and we stop the slow path where invalid UTF-8 begins.
 801         U_ASSERT(norm16 != INERT);
 802         if (norm16 < minYesNo) {
 803             if (c < 0) {
 804                 c = codePointFromValidUTF8(prevSrc, src);
 805             }
 806             // does not decompose
 807             if (!buffer.append(c, 0, errorCode)) {
 808                 return nullptr;
 809             }
 810         } else if (isHangulLV(norm16) || isHangulLVT(norm16)) {
 811             // Hangul syllable: decompose algorithmically
 812             if (c < 0) {
 813                 c = codePointFromValidUTF8(prevSrc, src);
 814             }
 815             char16_t jamos[3];
 816             if (!buffer.appendZeroCC(jamos, jamos+Hangul::decompose(c, jamos), errorCode)) {
 817                 return nullptr;
 818             }
 819         } else {
 820             // The character decomposes, get everything from the variable-length extra data.
 821             const uint16_t *mapping = getMapping(norm16);
 822             uint16_t firstUnit = *mapping;
 823             int32_t length = firstUnit & MAPPING_LENGTH_MASK;
 824             uint8_t trailCC = (uint8_t)(firstUnit >> 8);
 825             uint8_t leadCC;
 826             if (firstUnit & MAPPING_HAS_CCC_LCCC_WORD) {
 827                 leadCC = (uint8_t)(*(mapping-1) >> 8);
 828             } else {
 829                 leadCC = 0;
 830             }
 831             if (!buffer.append((const char16_t *)mapping+1, length, leadCC, trailCC, errorCode)) {
 832                 return nullptr;
 833             }
 834         }
 835         if (stopAtCompBoundary && norm16HasCompBoundaryAfter(norm16, onlyContiguous)) {
 836             return src;
 837         }
 838     }
 839     return src;
 840 }
 841
 842 const UChar *
 843 Normalizer2Impl::getDecomposition(UChar32 c, UChar buffer[4], int32_t &length) const {
 844     uint16_t norm16;
 845     if(c<minDecompNoCP || isMaybeOrNonZeroCC(norm16=getNorm16(c))) {
 846         // c does not decompose
 847         return nullptr;
 848     }
 849     const UChar *decomp = nullptr;
 850     if(isDecompNoAlgorithmic(norm16)) {
 851         // Maps to an isCompYesAndZeroCC.
 852         c=mapAlgorithmic(c, norm16);
 853         decomp=buffer;
 854         length=0;
 855         U16_APPEND_UNSAFE(buffer, length, c);
 856         // The mapping might decompose further.
 857         norm16 = getNorm16(c);
 858     }
 859     if (norm16 < minYesNo) {
 860         return decomp;
 861     } else if(isHangulLV(norm16) || isHangulLVT(norm16)) {
 862         // Hangul syllable: decompose algorithmically
 863         length=Hangul::decompose(c, buffer);
 864         return buffer;
 865     }
 866     // c decomposes, get everything from the variable-length extra data
 867     const uint16_t *mapping=getMapping(norm16);
 868     length=*mapping&MAPPING_LENGTH_MASK;
 869     return (const UChar *)mapping+1;
 870 }
 871
 872 // The capacity of the buffer must be 30=MAPPING_LENGTH_MASK-1
 873 // so that a raw mapping fits that consists of one unit ("rm0")
 874 // plus all but the first two code units of the normal mapping.
 875 // The maximum length of a normal mapping is 31=MAPPING_LENGTH_MASK.
 876 const UChar *
 877 Normalizer2Impl::getRawDecomposition(UChar32 c, UChar buffer[30], int32_t &length) const {
 878     uint16_t norm16;
 879     if(c<minDecompNoCP || isDecompYes(norm16=getNorm16(c))) {
 880         // c does not decompose
 881         return NULL;
 882     } else if(isHangulLV(norm16) || isHangulLVT(norm16)) {
 883         // Hangul syllable: decompose algorithmically
 884         Hangul::getRawDecomposition(c, buffer);
 885         length=2;
 886         return buffer;
 887     } else if(isDecompNoAlgorithmic(norm16)) {
 888         c=mapAlgorithmic(c, norm16);
 889         length=0;
 890         U16_APPEND_UNSAFE(buffer, length, c);
 891         return buffer;
 892     }
 893     // c decomposes, get everything from the variable-length extra data
 894     const uint16_t *mapping=getMapping(norm16);
 895     uint16_t firstUnit=*mapping;
 896     int32_t mLength=firstUnit&MAPPING_LENGTH_MASK;  // length of normal mapping
 897     if(firstUnit&MAPPING_HAS_RAW_MAPPING) {
 898         // Read the raw mapping from before the firstUnit and before the optional ccc/lccc word.
 899         // Bit 7=MAPPING_HAS_CCC_LCCC_WORD
 900         const uint16_t *rawMapping=mapping-((firstUnit>>7)&1)-1;
 901         uint16_t rm0=*rawMapping;
 902         if(rm0<=MAPPING_LENGTH_MASK) {
 903             length=rm0;
 904             return (const UChar *)rawMapping-rm0;
 905         } else {
 906             // Copy the normal mapping and replace its first two code units with rm0.
 907             buffer[0]=(UChar)rm0;
 908             u_memcpy(buffer+1, (const UChar *)mapping+1+2, mLength-2);
 909             length=mLength-1;
 910             return buffer;
 911         }
 912     } else {
 913         length=mLength;
 914         return (const UChar *)mapping+1;
 915     }
 916 }
 917
 918 void Normalizer2Impl::decomposeAndAppend(const UChar *src, const UChar *limit,
 919                                          UBool doDecompose,
 920                                          UnicodeString &safeMiddle,
 921                                          ReorderingBuffer &buffer,
 922                                          UErrorCode &errorCode) const {
 923     buffer.copyReorderableSuffixTo(safeMiddle);
 924     if(doDecompose) {
 925         decompose(src, limit, &buffer, errorCode);
 926         return;
 927     }
 928     // Just merge the strings at the boundary.
 929     ForwardUTrie2StringIterator iter(normTrie, src, limit);
 930     uint8_t firstCC, prevCC, cc;
 931     firstCC=prevCC=cc=getCC(iter.next16());
 932     while(cc!=0) {
 933         prevCC=cc;
 934         cc=getCC(iter.next16());
 935     };
 936     if(limit==NULL) {  // appendZeroCC() needs limit!=NULL
 937         limit=u_strchr(iter.codePointStart, 0);
 938     }
 939
 940     if (buffer.append(src, (int32_t)(iter.codePointStart-src), firstCC, prevCC, errorCode)) {
 941         buffer.appendZeroCC(iter.codePointStart, limit, errorCode);
 942     }
 943 }
 944
 945 UBool Normalizer2Impl::hasDecompBoundaryBefore(UChar32 c) const {
 946     return c < minLcccCP || (c <= 0xffff && !singleLeadMightHaveNonZeroFCD16(c)) ||
 947         norm16HasDecompBoundaryBefore(getNorm16(c));
 948 }
 949
 950 UBool Normalizer2Impl::norm16HasDecompBoundaryBefore(uint16_t norm16) const {
 951     if (norm16 < minNoNoCompNoMaybeCC) {
 952         return TRUE;
 953     }
 954     if (norm16 >= limitNoNo) {
 955         return norm16 <= MIN_NORMAL_MAYBE_YES || norm16 == JAMO_VT;
 956     }
 957     // c decomposes, get everything from the variable-length extra data
 958     const uint16_t *mapping=getMapping(norm16);
 959     uint16_t firstUnit=*mapping;
 960     // TRUE if leadCC==0 (hasFCDBoundaryBefore())
 961     return (firstUnit&MAPPING_HAS_CCC_LCCC_WORD)==0 || (*(mapping-1)&0xff00)==0;
 962 }
 963
 964 UBool Normalizer2Impl::hasDecompBoundaryAfter(UChar32 c) const {
 965     if (c < minDecompNoCP) {
 966         return TRUE;
 967     }
 968     if (c <= 0xffff && !singleLeadMightHaveNonZeroFCD16(c)) {
 969         return TRUE;
 970     }
 971     return norm16HasDecompBoundaryAfter(getNorm16(c));
 972 }
 973
 974 UBool Normalizer2Impl::norm16HasDecompBoundaryAfter(uint16_t norm16) const {
 975     if(norm16 <= minYesNo || isHangulLVT(norm16)) {
 976         return TRUE;
 977     }
 978     if (norm16 >= limitNoNo) {
 979         if (isMaybeOrNonZeroCC(norm16)) {
 980             return norm16 <= MIN_NORMAL_MAYBE_YES || norm16 == JAMO_VT;
 981         }
 982         // Maps to an isCompYesAndZeroCC.
 983         return (norm16 & DELTA_TCCC_MASK) <= DELTA_TCCC_1;
 984     }
 985     // c decomposes, get everything from the variable-length extra data
 986     const uint16_t *mapping=getMapping(norm16);
 987     uint16_t firstUnit=*mapping;
 988     // decomp after-boundary: same as hasFCDBoundaryAfter(),
 989     // fcd16<=1 || trailCC==0
 990     if(firstUnit>0x1ff) {
 991         return FALSE;  // trailCC>1
 992     }
 993     if(firstUnit<=0xff) {
 994         return TRUE;  // trailCC==0
 995     }
 996     // if(trailCC==1) test leadCC==0, same as checking for before-boundary
 997     // TRUE if leadCC==0 (hasFCDBoundaryBefore())
 998     return (firstUnit&MAPPING_HAS_CCC_LCCC_WORD)==0 || (*(mapping-1)&0xff00)==0;
 999 }
1000
1001 /*
1002  * Finds the recomposition result for
1003  * a forward-combining "lead" character,
1004  * specified with a pointer to its compositions list,
1005  * and a backward-combining "trail" character.
1006  *
1007  * If the lead and trail characters combine, then this function returns
1008  * the following "compositeAndFwd" value:
1009  * Bits 21..1  composite character
1010  * Bit      0  set if the composite is a forward-combining starter
1011  * otherwise it returns -1.
1012  *
1013  * The compositions list has (trail, compositeAndFwd) pair entries,
1014  * encoded as either pairs or triples of 16-bit units.
1015  * The last entry has the high bit of its first unit set.
1016  *
1017  * The list is sorted by ascending trail characters (there are no duplicates).
1018  * A linear search is used.
1019  *
1020  * See normalizer2impl.h for a more detailed description
1021  * of the compositions list format.
1022  */
1023 int32_t Normalizer2Impl::combine(const uint16_t *list, UChar32 trail) {
1024     uint16_t key1, firstUnit;
1025     if(trail<COMP_1_TRAIL_LIMIT) {
1026         // trail character is 0..33FF
1027         // result entry may have 2 or 3 units
1028         key1=(uint16_t)(trail<<1);
1029         while(key1>(firstUnit=*list)) {
1030             list+=2+(firstUnit&COMP_1_TRIPLE);
1031         }
1032         if(key1==(firstUnit&COMP_1_TRAIL_MASK)) {
1033             if(firstUnit&COMP_1_TRIPLE) {
1034                 return ((int32_t)list[1]<<16)|list[2];
1035             } else {
1036                 return list[1];
1037             }
1038         }
1039     } else {
1040         // trail character is 3400..10FFFF
1041         // result entry has 3 units
1042         key1=(uint16_t)(COMP_1_TRAIL_LIMIT+
1043                         (((trail>>COMP_1_TRAIL_SHIFT))&
1044                           ~COMP_1_TRIPLE));
1045         uint16_t key2=(uint16_t)(trail<<COMP_2_TRAIL_SHIFT);
1046         uint16_t secondUnit;
1047         for(;;) {
1048             if(key1>(firstUnit=*list)) {
1049                 list+=2+(firstUnit&COMP_1_TRIPLE);
1050             } else if(key1==(firstUnit&COMP_1_TRAIL_MASK)) {
1051                 if(key2>(secondUnit=list[1])) {
1052                     if(firstUnit&COMP_1_LAST_TUPLE) {
1053                         break;
1054                     } else {
1055                         list+=3;
1056                     }
1057                 } else if(key2==(secondUnit&COMP_2_TRAIL_MASK)) {
1058                     return ((int32_t)(secondUnit&~COMP_2_TRAIL_MASK)<<16)|list[2];
1059                 } else {
1060                     break;
1061                 }
1062             } else {
1063                 break;
1064             }
1065         }
1066     }
1067     return -1;
1068 }
1069
1070 /**
1071   * @param list some character's compositions list
1072   * @param set recursively receives the composites from these compositions
1073   */
1074 void Normalizer2Impl::addComposites(const uint16_t *list, UnicodeSet &set) const {
1075     uint16_t firstUnit;
1076     int32_t compositeAndFwd;
1077     do {
1078         firstUnit=*list;
1079         if((firstUnit&COMP_1_TRIPLE)==0) {
1080             compositeAndFwd=list[1];
1081             list+=2;
1082         } else {
1083             compositeAndFwd=(((int32_t)list[1]&~COMP_2_TRAIL_MASK)<<16)|list[2];
1084             list+=3;
1085         }
1086         UChar32 composite=compositeAndFwd>>1;
1087         if((compositeAndFwd&1)!=0) {
1088             addComposites(getCompositionsListForComposite(getNorm16(composite)), set);
1089         }
1090         set.add(composite);
1091     } while((firstUnit&COMP_1_LAST_TUPLE)==0);
1092 }
1093
1094 /*
1095  * Recomposes the buffer text starting at recomposeStartIndex
1096  * (which is in NFD - decomposed and canonically ordered),
1097  * and truncates the buffer contents.
1098  *
1099  * Note that recomposition never lengthens the text:
1100  * Any character consists of either one or two code units;
1101  * a composition may contain at most one more code unit than the original starter,
1102  * while the combining mark that is removed has at least one code unit.
1103  */
1104 void Normalizer2Impl::recompose(ReorderingBuffer &buffer, int32_t recomposeStartIndex,
1105                                 UBool onlyContiguous) const {
1106     UChar *p=buffer.getStart()+recomposeStartIndex;
1107     UChar *limit=buffer.getLimit();
1108     if(p==limit) {
1109         return;
1110     }
1111
1112     UChar *starter, *pRemove, *q, *r;
1113     const uint16_t *compositionsList;
1114     UChar32 c, compositeAndFwd;
1115     uint16_t norm16;
1116     uint8_t cc, prevCC;
1117     UBool starterIsSupplementary;
1118
1119     // Some of the following variables are not used until we have a forward-combining starter
1120     // and are only initialized now to avoid compiler warnings.
1121     compositionsList=NULL;  // used as indicator for whether we have a forward-combining starter
1122     starter=NULL;
1123     starterIsSupplementary=FALSE;
1124     prevCC=0;
1125
1126     for(;;) {
1127         UTRIE2_U16_NEXT16(normTrie, p, limit, c, norm16);
1128         cc=getCCFromYesOrMaybe(norm16);
1129         if( // this character combines backward and
1130             isMaybe(norm16) &&
1131             // we have seen a starter that combines forward and
1132             compositionsList!=NULL &&
1133             // the backward-combining character is not blocked
1134             (prevCC<cc || prevCC==0)
1135         ) {
1136             if(isJamoVT(norm16)) {
1137                 // c is a Jamo V/T, see if we can compose it with the previous character.
1138                 if(c<Hangul::JAMO_T_BASE) {
1139                     // c is a Jamo Vowel, compose with previous Jamo L and following Jamo T.
1140                     UChar prev=(UChar)(*starter-Hangul::JAMO_L_BASE);
1141                     if(prev<Hangul::JAMO_L_COUNT) {
1142                         pRemove=p-1;
1143                         UChar syllable=(UChar)
1144                             (Hangul::HANGUL_BASE+
1145                              (prev*Hangul::JAMO_V_COUNT+(c-Hangul::JAMO_V_BASE))*
1146                              Hangul::JAMO_T_COUNT);
1147                         UChar t;
1148                         if(p!=limit && (t=(UChar)(*p-Hangul::JAMO_T_BASE))<Hangul::JAMO_T_COUNT) {
1149                             ++p;
1150                             syllable+=t;  // The next character was a Jamo T.
1151                         }
1152                         *starter=syllable;
1153                         // remove the Jamo V/T
1154                         q=pRemove;
1155                         r=p;
1156                         while(r<limit) {
1157                             *q++=*r++;
1158                         }
1159                         limit=q;
1160                         p=pRemove;
1161                     }
1162                 }
1163                 /*
1164                  * No "else" for Jamo T:
1165                  * Since the input is in NFD, there are no Hangul LV syllables that
1166                  * a Jamo T could combine with.
1167                  * All Jamo Ts are combined above when handling Jamo Vs.
1168                  */
1169                 if(p==limit) {
1170                     break;
1171                 }
1172                 compositionsList=NULL;
1173                 continue;
1174             } else if((compositeAndFwd=combine(compositionsList, c))>=0) {
1175                 // The starter and the combining mark (c) do combine.
1176                 UChar32 composite=compositeAndFwd>>1;
1177
1178                 // Replace the starter with the composite, remove the combining mark.
1179                 pRemove=p-U16_LENGTH(c);  // pRemove & p: start & limit of the combining mark
1180                 if(starterIsSupplementary) {
1181                     if(U_IS_SUPPLEMENTARY(composite)) {
1182                         // both are supplementary
1183                         starter[0]=U16_LEAD(composite);
1184                         starter[1]=U16_TRAIL(composite);
1185                     } else {
1186                         *starter=(UChar)composite;
1187                         // The composite is shorter than the starter,
1188                         // move the intermediate characters forward one.
1189                         starterIsSupplementary=FALSE;
1190                         q=starter+1;
1191                         r=q+1;
1192                         while(r<pRemove) {
1193                             *q++=*r++;
1194                         }
1195                         --pRemove;
1196                     }
1197                 } else if(U_IS_SUPPLEMENTARY(composite)) {
1198                     // The composite is longer than the starter,
1199                     // move the intermediate characters back one.
1200                     starterIsSupplementary=TRUE;
1201                     ++starter;  // temporarily increment for the loop boundary
1202                     q=pRemove;
1203                     r=++pRemove;
1204                     while(starter<q) {
1205                         *--r=*--q;
1206                     }
1207                     *starter=U16_TRAIL(composite);
1208                     *--starter=U16_LEAD(composite);  // undo the temporary increment
1209                 } else {
1210                     // both are on the BMP
1211                     *starter=(UChar)composite;
1212                 }
1213
1214                 /* remove the combining mark by moving the following text over it */
1215                 if(pRemove<p) {
1216                     q=pRemove;
1217                     r=p;
1218                     while(r<limit) {
1219                         *q++=*r++;
1220                     }
1221                     limit=q;
1222                     p=pRemove;
1223                 }
1224                 // Keep prevCC because we removed the combining mark.
1225
1226                 if(p==limit) {
1227                     break;
1228                 }
1229                 // Is the composite a starter that combines forward?
1230                 if(compositeAndFwd&1) {
1231                     compositionsList=
1232                         getCompositionsListForComposite(getNorm16(composite));
1233                 } else {
1234                     compositionsList=NULL;
1235                 }
1236
1237                 // We combined; continue with looking for compositions.
1238                 continue;
1239             }
1240         }
1241
1242         // no combination this time
1243         prevCC=cc;
1244         if(p==limit) {
1245             break;
1246         }
1247
1248         // If c did not combine, then check if it is a starter.
1249         if(cc==0) {
1250             // Found a new starter.
1251             if((compositionsList=getCompositionsListForDecompYes(norm16))!=NULL) {
1252                 // It may combine with something, prepare for it.
1253                 if(U_IS_BMP(c)) {
1254                     starterIsSupplementary=FALSE;
1255                     starter=p-1;
1256                 } else {
1257                     starterIsSupplementary=TRUE;
1258                     starter=p-2;
1259                 }
1260             }
1261         } else if(onlyContiguous) {
1262             // FCC: no discontiguous compositions; any intervening character blocks.
1263             compositionsList=NULL;
1264         }
1265     }
1266     buffer.setReorderingLimit(limit);
1267 }
1268
1269 UChar32
1270 Normalizer2Impl::composePair(UChar32 a, UChar32 b) const {
1271     uint16_t norm16=getNorm16(a);  // maps an out-of-range 'a' to inert norm16=0
1272     const uint16_t *list;
1273     if(isInert(norm16)) {
1274         return U_SENTINEL;
1275     } else if(norm16<minYesNoMappingsOnly) {
1276         // a combines forward.
1277         if(isJamoL(norm16)) {
1278             b-=Hangul::JAMO_V_BASE;
1279             if(0<=b && b<Hangul::JAMO_V_COUNT) {
1280                 return
1281                     (Hangul::HANGUL_BASE+
1282                      ((a-Hangul::JAMO_L_BASE)*Hangul::JAMO_V_COUNT+b)*
1283                      Hangul::JAMO_T_COUNT);
1284             } else {
1285                 return U_SENTINEL;
1286             }
1287         } else if(isHangulLV(norm16)) {
1288             b-=Hangul::JAMO_T_BASE;
1289             if(0<b && b<Hangul::JAMO_T_COUNT) {  // not b==0!
1290                 return a+b;
1291             } else {
1292                 return U_SENTINEL;
1293             }
1294         } else {
1295             // 'a' has a compositions list in extraData
1296             list=getMapping(norm16);
1297             if(norm16>minYesNo) {  // composite 'a' has both mapping & compositions list
1298                 list+=  // mapping pointer
1299                     1+  // +1 to skip the first unit with the mapping length
1300                     (*list&MAPPING_LENGTH_MASK);  // + mapping length
1301             }
1302         }
1303     } else if(norm16<minMaybeYes || MIN_NORMAL_MAYBE_YES<=norm16) {
1304         return U_SENTINEL;
1305     } else {
1306         list=getCompositionsListForMaybe(norm16);
1307     }
1308     if(b<0 || 0x10ffff<b) {  // combine(list, b) requires a valid code point b
1309         return U_SENTINEL;
1310     }
1311 #if U_SIGNED_RIGHT_SHIFT_IS_ARITHMETIC
1312     return combine(list, b)>>1;
1313 #else
1314     int32_t compositeAndFwd=combine(list, b);
1315     return compositeAndFwd>=0 ? compositeAndFwd>>1 : U_SENTINEL;
1316 #endif
1317 }
1318
1319 // Very similar to composeQuickCheck(): Make the same changes in both places if relevant.
1320 // doCompose: normalize
1321 // !doCompose: isNormalized (buffer must be empty and initialized)
1322 UBool
1323 Normalizer2Impl::compose(const UChar *src, const UChar *limit,
1324                          UBool onlyContiguous,
1325                          UBool doCompose,
1326                          ReorderingBuffer &buffer,
1327                          UErrorCode &errorCode) const {
1328     const UChar *prevBoundary=src;
1329     UChar32 minNoMaybeCP=minCompNoMaybeCP;
1330     if(limit==NULL) {
1331         src=copyLowPrefixFromNulTerminated(src, minNoMaybeCP,
1332                                            doCompose ? &buffer : NULL,
1333                                            errorCode);
1334         if(U_FAILURE(errorCode)) {
1335             return FALSE;
1336         }
1337         limit=u_strchr(src, 0);
1338         if (prevBoundary != src) {
1339             if (hasCompBoundaryAfter(*(src-1), onlyContiguous)) {
1340                 prevBoundary = src;
1341             } else {
1342                 buffer.removeSuffix(1);
1343                 prevBoundary = --src;
1344             }
1345         }
1346     }
1347
1348     for (;;) {
1349         // Fast path: Scan over a sequence of characters below the minimum "no or maybe" code point,
1350         // or with (compYes && ccc==0) properties.
1351         const UChar *prevSrc;
1352         UChar32 c = 0;
1353         uint16_t norm16 = 0;
1354         for (;;) {
1355             if (src == limit) {
1356                 if (prevBoundary != limit && doCompose) {
1357                     buffer.appendZeroCC(prevBoundary, limit, errorCode);
1358                 }
1359                 return TRUE;
1360             }
1361             if( (c=*src)<minNoMaybeCP ||
1362                 isCompYesAndZeroCC(norm16=UTRIE2_GET16_FROM_U16_SINGLE_LEAD(normTrie, c))
1363             ) {
1364                 ++src;
1365             } else {
1366                 prevSrc = src++;
1367                 if(!U16_IS_SURROGATE(c)) {
1368                     break;
1369                 } else {
1370                     UChar c2;
1371                     if(U16_IS_SURROGATE_LEAD(c)) {
1372                         if(src!=limit && U16_IS_TRAIL(c2=*src)) {
1373                             ++src;
1374                             c=U16_GET_SUPPLEMENTARY(c, c2);
1375                         }
1376                     } else /* trail surrogate */ {
1377                         if(prevBoundary<prevSrc && U16_IS_LEAD(c2=*(prevSrc-1))) {
1378                             --prevSrc;
1379                             c=U16_GET_SUPPLEMENTARY(c2, c);
1380                         }
1381                     }
1382                     if(!isCompYesAndZeroCC(norm16=getNorm16(c))) {
1383                         break;
1384                     }
1385                 }
1386             }
1387         }
1388         // isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.
1389         // The current character is either a "noNo" (has a mapping)
1390         // or a "maybeYes" (combines backward)
1391         // or a "yesYes" with ccc!=0.
1392         // It is not a Hangul syllable or Jamo L because those have "yes" properties.
1393
1394         // Medium-fast path: Handle cases that do not require full decomposition and recomposition.
1395         if (!isMaybeOrNonZeroCC(norm16)) {  // minNoNo <= norm16 < minMaybeYes
1396             if (!doCompose) {
1397                 return FALSE;
1398             }
1399             // Fast path for mapping a character that is immediately surrounded by boundaries.
1400             // In this case, we need not decompose around the current character.
1401             if (isDecompNoAlgorithmic(norm16)) {
1402                 // Maps to a single isCompYesAndZeroCC character
1403                 // which also implies hasCompBoundaryBefore.
1404                 if (norm16HasCompBoundaryAfter(norm16, onlyContiguous) ||
1405                         hasCompBoundaryBefore(src, limit)) {
1406                     if (prevBoundary != prevSrc && !buffer.appendZeroCC(prevBoundary, prevSrc, errorCode)) {
1407                         break;
1408                     }
1409                     if(!buffer.append(mapAlgorithmic(c, norm16), 0, errorCode)) {
1410                         break;
1411                     }
1412                     prevBoundary = src;
1413                     continue;
1414                 }
1415             } else if (norm16 < minNoNoCompBoundaryBefore) {
1416                 // The mapping is comp-normalized which also implies hasCompBoundaryBefore.
1417                 if (norm16HasCompBoundaryAfter(norm16, onlyContiguous) ||
1418                         hasCompBoundaryBefore(src, limit)) {
1419                     if (prevBoundary != prevSrc && !buffer.appendZeroCC(prevBoundary, prevSrc, errorCode)) {
1420                         break;
1421                     }
1422                     const UChar *mapping = reinterpret_cast<const UChar *>(getMapping(norm16));
1423                     int32_t length = *mapping++ & MAPPING_LENGTH_MASK;
1424                     if(!buffer.appendZeroCC(mapping, mapping + length, errorCode)) {
1425                         break;
1426                     }
1427                     prevBoundary = src;
1428                     continue;
1429                 }
1430             } else if (norm16 >= minNoNoEmpty) {
1431                 // The current character maps to nothing.
1432                 // Simply omit it from the output if there is a boundary before _or_ after it.
1433                 // The character itself implies no boundaries.
1434                 if (hasCompBoundaryBefore(src, limit) ||
1435                         hasCompBoundaryAfter(prevBoundary, prevSrc, onlyContiguous)) {
1436                     if (prevBoundary != prevSrc && !buffer.appendZeroCC(prevBoundary, prevSrc, errorCode)) {
1437                         break;
1438                     }
1439                     prevBoundary = src;
1440                     continue;
1441                 }
1442             }
1443             // Other "noNo" type, or need to examine more text around this character:
1444             // Fall through to the slow path.
1445         } else if (isJamoVT(norm16) && prevBoundary != prevSrc) {
1446             UChar prev=*(prevSrc-1);
1447             if(c<Hangul::JAMO_T_BASE) {
1448                 // The current character is a Jamo Vowel,
1449                 // compose with previous Jamo L and following Jamo T.
1450                 UChar l = (UChar)(prev-Hangul::JAMO_L_BASE);
1451                 if(l<Hangul::JAMO_L_COUNT) {
1452                     if (!doCompose) {
1453                         return FALSE;
1454                     }
1455                     int32_t t;
1456                     if (src != limit &&
1457                             0 < (t = ((int32_t)*src - Hangul::JAMO_T_BASE)) &&
1458                             t < Hangul::JAMO_T_COUNT) {
1459                         // The next character is a Jamo T.
1460                         ++src;
1461                     } else if (hasCompBoundaryBefore(src, limit)) {
1462                         // No Jamo T follows, not even via decomposition.
1463                         t = 0;
1464                     } else {
1465                         t = -1;
1466                     }
1467                     if (t >= 0) {
1468                         UChar32 syllable = Hangul::HANGUL_BASE +
1469                             (l*Hangul::JAMO_V_COUNT + (c-Hangul::JAMO_V_BASE)) *
1470                             Hangul::JAMO_T_COUNT + t;
1471                         --prevSrc;  // Replace the Jamo L as well.
1472                         if (prevBoundary != prevSrc && !buffer.appendZeroCC(prevBoundary, prevSrc, errorCode)) {
1473                             break;
1474                         }
1475                         if(!buffer.appendBMP((UChar)syllable, 0, errorCode)) {
1476                             break;
1477                         }
1478                         prevBoundary = src;
1479                         continue;
1480                     }
1481                     // If we see L+V+x where x!=T then we drop to the slow path,
1482                     // decompose and recompose.
1483                     // This is to deal with NFKC finding normal L and V but a
1484                     // compatibility variant of a T.
1485                     // We need to either fully compose that combination here
1486                     // (which would complicate the code and may not work with strange custom data)
1487                     // or use the slow path.
1488                 }
1489             } else if (Hangul::isHangulLV(prev)) {
1490                 // The current character is a Jamo Trailing consonant,
1491                 // compose with previous Hangul LV that does not contain a Jamo T.
1492                 if (!doCompose) {
1493                     return FALSE;
1494                 }
1495                 UChar32 syllable = prev + c - Hangul::JAMO_T_BASE;
1496                 --prevSrc;  // Replace the Hangul LV as well.
1497                 if (prevBoundary != prevSrc && !buffer.appendZeroCC(prevBoundary, prevSrc, errorCode)) {
1498                     break;
1499                 }
1500                 if(!buffer.appendBMP((UChar)syllable, 0, errorCode)) {
1501                     break;
1502                 }
1503                 prevBoundary = src;
1504                 continue;
1505             }
1506             // No matching context, or may need to decompose surrounding text first:
1507             // Fall through to the slow path.
1508         } else if (norm16 > JAMO_VT) {  // norm16 >= MIN_YES_YES_WITH_CC
1509             // One or more combining marks that do not combine-back:
1510             // Check for canonical order, copy unchanged if ok and
1511             // if followed by a character with a boundary-before.
1512             uint8_t cc = getCCFromNormalYesOrMaybe(norm16);  // cc!=0
1513             if (onlyContiguous /* FCC */ && getPreviousTrailCC(prevBoundary, prevSrc) > cc) {
1514                 // Fails FCD test, need to decompose and contiguously recompose.
1515                 if (!doCompose) {
1516                     return FALSE;
1517                 }
1518             } else {
1519                 // If !onlyContiguous (not FCC), then we ignore the tccc of
1520                 // the previous character which passed the quick check "yes && ccc==0" test.
1521                 const UChar *nextSrc;
1522                 uint16_t n16;
1523                 for (;;) {
1524                     if (src == limit) {
1525                         if (doCompose) {
1526                             buffer.appendZeroCC(prevBoundary, limit, errorCode);
1527                         }
1528                         return TRUE;
1529                     }
1530                     uint8_t prevCC = cc;
1531                     nextSrc = src;
1532                     UTRIE2_U16_NEXT16(normTrie, nextSrc, limit, c, n16);
1533                     if (n16 >= MIN_YES_YES_WITH_CC) {
1534                         cc = getCCFromNormalYesOrMaybe(n16);
1535                         if (prevCC > cc) {
1536                             if (!doCompose) {
1537                                 return FALSE;
1538                             }
1539                             break;
1540                         }
1541                     } else {
1542                         break;
1543                     }
1544                     src = nextSrc;
1545                 }
1546                 // src is after the last in-order combining mark.
1547                 // If there is a boundary here, then we continue with no change.
1548                 if (norm16HasCompBoundaryBefore(n16)) {
1549                     if (isCompYesAndZeroCC(n16)) {
1550                         src = nextSrc;
1551                     }
1552                     continue;
1553                 }
1554                 // Use the slow path. There is no boundary in [prevSrc, src[.
1555             }
1556         }
1557
1558         // Slow path: Find the nearest boundaries around the current character,
1559         // decompose and recompose.
1560         if (prevBoundary != prevSrc && !norm16HasCompBoundaryBefore(norm16)) {
1561             const UChar *p = prevSrc;
1562             UTRIE2_U16_PREV16(normTrie, prevBoundary, p, c, norm16);
1563             if (!norm16HasCompBoundaryAfter(norm16, onlyContiguous)) {
1564                 prevSrc = p;
1565             }
1566         }
1567         if (doCompose && prevBoundary != prevSrc && !buffer.appendZeroCC(prevBoundary, prevSrc, errorCode)) {
1568             break;
1569         }
1570         int32_t recomposeStartIndex=buffer.length();
1571         // We know there is not a boundary here.
1572         decomposeShort(prevSrc, src, FALSE /* !stopAtCompBoundary */, onlyContiguous,
1573                        buffer, errorCode);
1574         // Decompose until the next boundary.
1575         src = decomposeShort(src, limit, TRUE /* stopAtCompBoundary */, onlyContiguous,
1576                              buffer, errorCode);
1577         if (U_FAILURE(errorCode)) {
1578             break;
1579         }
1580         if ((src - prevSrc) > INT32_MAX) {  // guard before buffer.equals()
1581             errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
1582             return TRUE;
1583         }
1584         recompose(buffer, recomposeStartIndex, onlyContiguous);
1585         if(!doCompose) {
1586             if(!buffer.equals(prevSrc, src)) {
1587                 return FALSE;
1588             }
1589             buffer.remove();
1590         }
1591         prevBoundary=src;
1592     }
1593     return TRUE;
1594 }
1595
1596 // Very similar to compose(): Make the same changes in both places if relevant.
1597 // pQCResult==NULL: spanQuickCheckYes
1598 // pQCResult!=NULL: quickCheck (*pQCResult must be UNORM_YES)
     //
     // Scans [src, limit[ without producing output and reports how far the text
     // passes the composition quick check.
     // limit==NULL: src is NUL-terminated; the limit is then found via u_strchr().
     // onlyContiguous: TRUE for FCC; in that mode the trail cc of the preceding character
     //     is compared with the cc of a following combining mark.
     // Returns limit if the end of the input is reached; otherwise returns prevBoundary,
     // the last boundary tracked before the character that stopped the scan.
     // With pQCResult!=NULL, *pQCResult is set to UNORM_MAYBE when a character with
     // norm16<MIN_YES_YES_WITH_CC is seen in the combining-mark loop (scanning continues),
     // and to UNORM_NO when the scan stops early.
     // With pQCResult==NULL, the scan stops at the first such "maybe" character.
1599 const UChar *
1600 Normalizer2Impl::composeQuickCheck(const UChar *src, const UChar *limit,
1601                                    UBool onlyContiguous,
1602                                    UNormalizationCheckResult *pQCResult) const {
1603     const UChar *prevBoundary=src;
1604     UChar32 minNoMaybeCP=minCompNoMaybeCP;
1605     if(limit==NULL) {
         // errorCode is a local that is never inspected here;
         // a NULL buffer is passed to the prefix helper.
1606         UErrorCode errorCode=U_ZERO_ERROR;
1607         src=copyLowPrefixFromNulTerminated(src, minNoMaybeCP, NULL, errorCode);
1608         limit=u_strchr(src, 0);
         // If the helper moved src forward, resume at src when the last skipped unit
         // has a composition boundary after it, otherwise one unit earlier.
1609         if (prevBoundary != src) {
1610             if (hasCompBoundaryAfter(*(src-1), onlyContiguous)) {
1611                 prevBoundary = src;
1612             } else {
1613                 prevBoundary = --src;
1614             }
1615         }
1616     }
1617
1618     for(;;) {
1619         // Fast path: Scan over a sequence of characters below the minimum "no or maybe" code point,
1620         // or with (compYes && ccc==0) properties.
1621         const UChar *prevSrc;
1622         UChar32 c = 0;
1623         uint16_t norm16 = 0;
1624         for (;;) {
1625             if(src==limit) {
1626                 return src;
1627             }
1628             if( (c=*src)<minNoMaybeCP ||
1629                 isCompYesAndZeroCC(norm16=UTRIE2_GET16_FROM_U16_SINGLE_LEAD(normTrie, c))
1630             ) {
1631                 ++src;
1632             } else {
                 // prevSrc marks the start of the character that is examined below.
1633                 prevSrc = src++;
1634                 if(!U16_IS_SURROGATE(c)) {
1635                     break;
1636                 } else {
                     // Assemble a supplementary code point from the surrogate pair, if there
                     // is one, and look up its norm16 again for the whole code point.
1637                     UChar c2;
1638                     if(U16_IS_SURROGATE_LEAD(c)) {
1639                         if(src!=limit && U16_IS_TRAIL(c2=*src)) {
1640                             ++src;
1641                             c=U16_GET_SUPPLEMENTARY(c, c2);
1642                         }
1643                     } else /* trail surrogate */ {
1644                         if(prevBoundary<prevSrc && U16_IS_LEAD(c2=*(prevSrc-1))) {
1645                             --prevSrc;
1646                             c=U16_GET_SUPPLEMENTARY(c2, c);
1647                         }
1648                     }
1649                     if(!isCompYesAndZeroCC(norm16=getNorm16(c))) {
1650                         break;
1651                     }
1652                 }
1653             }
1654         }
1655         // isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.
1656         // The current character is either a "noNo" (has a mapping)
1657         // or a "maybeYes" (combines backward)
1658         // or a "yesYes" with ccc!=0.
1659         // It is not a Hangul syllable or Jamo L because those have "yes" properties.
1660
         // prevNorm16 stays INERT unless the character before prevSrc has no boundary
         // after it; in that case its norm16 is kept for the FCC trail-cc check below.
1661         uint16_t prevNorm16 = INERT;
1662         if (prevBoundary != prevSrc) {
1663             if (norm16HasCompBoundaryBefore(norm16)) {
1664                 prevBoundary = prevSrc;
1665             } else {
1666                 const UChar *p = prevSrc;
1667                 uint16_t n16;
1668                 UTRIE2_U16_PREV16(normTrie, prevBoundary, p, c, n16);
1669                 if (norm16HasCompBoundaryAfter(n16, onlyContiguous)) {
1670                     prevBoundary = prevSrc;
1671                 } else {
1672                     prevBoundary = p;
1673                     prevNorm16 = n16;
1674                 }
1675             }
1676         }
1677
1678         if(isMaybeOrNonZeroCC(norm16)) {
1679             uint8_t cc=getCCFromYesOrMaybe(norm16);
1680             if (onlyContiguous /* FCC */ && cc != 0 &&
1681                     getTrailCCFromCompYesAndZeroCC(prevNorm16) > cc) {
1682                 // The [prevBoundary..prevSrc[ character
1683                 // passed the quick check "yes && ccc==0" test
1684                 // but is out of canonical order with the current combining mark.
1685             } else {
1686                 // If !onlyContiguous (not FCC), then we ignore the tccc of
1687                 // the previous character which passed the quick check "yes && ccc==0" test.
1688                 const UChar *nextSrc;
1689                 for (;;) {
1690                     if (norm16 < MIN_YES_YES_WITH_CC) {
1691                         if (pQCResult != nullptr) {
1692                             *pQCResult = UNORM_MAYBE;
1693                         } else {
1694                             return prevBoundary;
1695                         }
1696                     }
1697                     if (src == limit) {
1698                         return src;
1699                     }
1700                     uint8_t prevCC = cc;
1701                     nextSrc = src;
1702                     UTRIE2_U16_NEXT16(normTrie, nextSrc, limit, c, norm16);
1703                     if (isMaybeOrNonZeroCC(norm16)) {
1704                         cc = getCCFromYesOrMaybe(norm16);
                         // Stop on a mark that is out of order:
                         // its cc is nonzero and lower than the previous cc.
1705                         if (!(prevCC <= cc || cc == 0)) {
1706                             break;
1707                         }
1708                     } else {
1709                         break;
1710                     }
1711                     src = nextSrc;
1712                 }
1713                 // src is after the last in-order combining mark.
                 // norm16 now belongs to the character at [src, nextSrc[.
                 // If that one passes the "yes && ccc==0" test, step over it
                 // and go back to the fast path.
1714                 if (isCompYesAndZeroCC(norm16)) {
1715                     prevBoundary = src;
1716                     src = nextSrc;
1717                     continue;
1718                 }
1719             }
1720         }
         // Everything else stops the scan with a "no" result.
1721         if(pQCResult!=NULL) {
1722             *pQCResult=UNORM_NO;
1723         }
1724         return prevBoundary;
1725     }
1726 }
1727
1728 void Normalizer2Impl::composeAndAppend(const UChar *src, const UChar *limit,
1729                                        UBool doCompose,
1730                                        UBool onlyContiguous,
1731                                        UnicodeString &safeMiddle,
1732                                        ReorderingBuffer &buffer,
1733                                        UErrorCode &errorCode) const {
1734     if(!buffer.isEmpty()) {
1735         const UChar *firstStarterInSrc=findNextCompBoundary(src, limit, onlyContiguous);
1736         if(src!=firstStarterInSrc) {
1737             const UChar *lastStarterInDest=findPreviousCompBoundary(buffer.getStart(),
1738                                                                     buffer.getLimit(), onlyContiguous);
1739             int32_t destSuffixLength=(int32_t)(buffer.getLimit()-lastStarterInDest);
1740             UnicodeString middle(lastStarterInDest, destSuffixLength);
1741             buffer.removeSuffix(destSuffixLength);
1742             safeMiddle=middle;
1743             middle.append(src, (int32_t)(firstStarterInSrc-src));
1744             const UChar *middleStart=middle.getBuffer();
1745             compose(middleStart, middleStart+middle.length(), onlyContiguous,
1746                     TRUE, buffer, errorCode);
1747             if(U_FAILURE(errorCode)) {
1748                 return;
1749             }
1750             src=firstStarterInSrc;
1751         }
1752     }
1753     if(doCompose) {
1754         compose(src, limit, onlyContiguous, TRUE, buffer, errorCode);
1755     } else {
1756         if(limit==NULL) {  // appendZeroCC() needs limit!=NULL
1757             limit=u_strchr(src, 0);
1758         }
1759         buffer.appendZeroCC(src, limit, errorCode);
1760     }
1761 }
1762
1763 UBool
1764 Normalizer2Impl::composeUTF8(uint32_t options, UBool onlyContiguous,
1765                              const uint8_t *src, const uint8_t *limit,
1766                              ByteSink *sink, Edits *edits, UErrorCode &errorCode) const {
         // UTF-8 composition over [src, limit[.
         // sink!=nullptr: writes the composed text to *sink. Unchanged stretches go through
         //     ByteSinkUtil::appendUnchanged() (which receives options and edits), changed
         //     pieces through appendChange()/appendCodePoint()/appendCodePointDelta().
         // sink==nullptr: only tests the input; returns FALSE as soon as a change is needed.
         // edits may be nullptr (it is checked before the direct addReplace() call below).
         // onlyContiguous: TRUE for FCC.
         // Returns TRUE in all other cases, including when the loop is left via break
         // after a failed append or a failing errorCode, so errorCode has to be
         // checked separately.
         // limit must not be nullptr.
1767     U_ASSERT(limit != nullptr);
         // Backing string for the ReorderingBuffer that the slow path constructs
         // in each iteration.
1768     UnicodeString s16;
         // Bytes below this lead byte are skipped in the fast path without a trie lookup.
1769     uint8_t minNoMaybeLead = leadByteForCP(minCompNoMaybeCP);
         // Start of the input that has not been written to the sink yet.
1770     const uint8_t *prevBoundary = src;
1771
1772     for (;;) {
1773         // Fast path: Scan over a sequence of characters below the minimum "no or maybe" code point,
1774         // or with (compYes && ccc==0) properties.
1775         const uint8_t *prevSrc;
1776         uint16_t norm16 = 0;
1777         for (;;) {
1778             if (src == limit) {
1779                 if (prevBoundary != limit && sink != nullptr) {
1780                     ByteSinkUtil::appendUnchanged(prevBoundary, limit,
1781                                                   *sink, options, edits, errorCode);
1782                 }
1783                 return TRUE;
1784             }
1785             if (*src < minNoMaybeLead) {
1786                 ++src;
1787             } else {
1788                 prevSrc = src;
1789                 UTRIE2_U8_NEXT16(normTrie, src, limit, norm16);
1790                 if (!isCompYesAndZeroCC(norm16)) {
1791                     break;
1792                 }
1793             }
1794         }
1795         // isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.
1796         // The current character is either a "noNo" (has a mapping)
1797         // or a "maybeYes" (combines backward)
1798         // or a "yesYes" with ccc!=0.
1799         // It is not a Hangul syllable or Jamo L because those have "yes" properties.
1800
1801         // Medium-fast path: Handle cases that do not require full decomposition and recomposition.
1802         if (!isMaybeOrNonZeroCC(norm16)) {  // minNoNo <= norm16 < minMaybeYes
1803             if (sink == nullptr) {
1804                 return FALSE;
1805             }
1806             // Fast path for mapping a character that is immediately surrounded by boundaries.
1807             // In this case, we need not decompose around the current character.
1808             if (isDecompNoAlgorithmic(norm16)) {
1809                 // Maps to a single isCompYesAndZeroCC character
1810                 // which also implies hasCompBoundaryBefore.
1811                 if (norm16HasCompBoundaryAfter(norm16, onlyContiguous) ||
1812                         hasCompBoundaryBefore(src, limit)) {
1813                     if (prevBoundary != prevSrc &&
1814                             !ByteSinkUtil::appendUnchanged(prevBoundary, prevSrc,
1815                                                            *sink, options, edits, errorCode)) {
1816                         break;
1817                     }
1818                     appendCodePointDelta(prevSrc, src, getAlgorithmicDelta(norm16), *sink, edits);
1819                     prevBoundary = src;
1820                     continue;
1821                 }
1822             } else if (norm16 < minNoNoCompBoundaryBefore) {
1823                 // The mapping is comp-normalized which also implies hasCompBoundaryBefore.
1824                 if (norm16HasCompBoundaryAfter(norm16, onlyContiguous) ||
1825                         hasCompBoundaryBefore(src, limit)) {
1826                     if (prevBoundary != prevSrc &&
1827                             !ByteSinkUtil::appendUnchanged(prevBoundary, prevSrc,
1828                                                            *sink, options, edits, errorCode)) {
1829                         break;
1830                     }
                     // The first mapping unit carries the length in its low bits;
                     // the mapped UTF-16 text follows it.
1831                     const uint16_t *mapping = getMapping(norm16);
1832                     int32_t length = *mapping++ & MAPPING_LENGTH_MASK;
1833                     if (!ByteSinkUtil::appendChange(prevSrc, src, (const UChar *)mapping, length,
1834                                                     *sink, edits, errorCode)) {
1835                         break;
1836                     }
1837                     prevBoundary = src;
1838                     continue;
1839                 }
1840             } else if (norm16 >= minNoNoEmpty) {
1841                 // The current character maps to nothing.
1842                 // Simply omit it from the output if there is a boundary before _or_ after it.
1843                 // The character itself implies no boundaries.
1844                 if (hasCompBoundaryBefore(src, limit) ||
1845                         hasCompBoundaryAfter(prevBoundary, prevSrc, onlyContiguous)) {
1846                     if (prevBoundary != prevSrc &&
1847                             !ByteSinkUtil::appendUnchanged(prevBoundary, prevSrc,
1848                                                            *sink, options, edits, errorCode)) {
1849                         break;
1850                     }
                     // Nothing is written to the sink; only the deletion is recorded.
1851                     if (edits != nullptr) {
1852                         edits->addReplace((int32_t)(src - prevSrc), 0);
1853                     }
1854                     prevBoundary = src;
1855                     continue;
1856                 }
1857             }
1858             // Other "noNo" type, or need to examine more text around this character:
1859             // Fall through to the slow path.
1860         } else if (isJamoVT(norm16)) {
1861             // Jamo L: E1 84 80..92
1862             // Jamo V: E1 85 A1..B5
1863             // Jamo T: E1 86 A8..E1 87 82
1864             U_ASSERT((src - prevSrc) == 3 && *prevSrc == 0xe1);
1865             UChar32 prev = previousHangulOrJamo(prevBoundary, prevSrc);
1866             if (prevSrc[1] == 0x85) {
1867                 // The current character is a Jamo Vowel,
1868                 // compose with previous Jamo L and following Jamo T.
1869                 UChar32 l = prev - Hangul::JAMO_L_BASE;
1870                 if ((uint32_t)l < Hangul::JAMO_L_COUNT) {
1871                     if (sink == nullptr) {
1872                         return FALSE;
1873                     }
1874                     int32_t t = getJamoTMinusBase(src, limit);
1875                     if (t >= 0) {
1876                         // The next character is a Jamo T.
1877                         src += 3;
1878                     } else if (hasCompBoundaryBefore(src, limit)) {
1879                         // No Jamo T follows, not even via decomposition.
1880                         t = 0;
1881                     }
1882                     if (t >= 0) {
1883                         UChar32 syllable = Hangul::HANGUL_BASE +
1884                             (l*Hangul::JAMO_V_COUNT + (prevSrc[2]-0xa1)) *
1885                             Hangul::JAMO_T_COUNT + t;
1886                         prevSrc -= 3;  // Replace the Jamo L as well.
1887                         if (prevBoundary != prevSrc &&
1888                                 !ByteSinkUtil::appendUnchanged(prevBoundary, prevSrc,
1889                                                                *sink, options, edits, errorCode)) {
1890                             break;
1891                         }
1892                         ByteSinkUtil::appendCodePoint(prevSrc, src, syllable, *sink, edits);
1893                         prevBoundary = src;
1894                         continue;
1895                     }
1896                     // If we see L+V+x where x!=T then we drop to the slow path,
1897                     // decompose and recompose.
1898                     // This is to deal with NFKC finding normal L and V but a
1899                     // compatibility variant of a T.
1900                     // We need to either fully compose that combination here
1901                     // (which would complicate the code and may not work with strange custom data)
1902                     // or use the slow path.
1903                 }
1904             } else if (Hangul::isHangulLV(prev)) {
1905                 // The current character is a Jamo Trailing consonant,
1906                 // compose with previous Hangul LV that does not contain a Jamo T.
1907                 if (sink == nullptr) {
1908                     return FALSE;
1909                 }
1910                 UChar32 syllable = prev + getJamoTMinusBase(prevSrc, src);
1911                 prevSrc -= 3;  // Replace the Hangul LV as well.
1912                 if (prevBoundary != prevSrc &&
1913                         !ByteSinkUtil::appendUnchanged(prevBoundary, prevSrc,
1914                                                        *sink, options, edits, errorCode)) {
1915                     break;
1916                 }
1917                 ByteSinkUtil::appendCodePoint(prevSrc, src, syllable, *sink, edits);
1918                 prevBoundary = src;
1919                 continue;
1920             }
1921             // No matching context, or may need to decompose surrounding text first:
1922             // Fall through to the slow path.
1923         } else if (norm16 > JAMO_VT) {  // norm16 >= MIN_YES_YES_WITH_CC
1924             // One or more combining marks that do not combine-back:
1925             // Check for canonical order, copy unchanged if ok and
1926             // if followed by a character with a boundary-before.
1927             uint8_t cc = getCCFromNormalYesOrMaybe(norm16);  // cc!=0
1928             if (onlyContiguous /* FCC */ && getPreviousTrailCC(prevBoundary, prevSrc) > cc) {
1929                 // Fails FCD test, need to decompose and contiguously recompose.
1930                 if (sink == nullptr) {
1931                     return FALSE;
1932                 }
1933             } else {
1934                 // If !onlyContiguous (not FCC), then we ignore the tccc of
1935                 // the previous character which passed the quick check "yes && ccc==0" test.
1936                 const uint8_t *nextSrc;
1937                 uint16_t n16;
1938                 for (;;) {
1939                     if (src == limit) {
1940                         if (sink != nullptr) {
1941                             ByteSinkUtil::appendUnchanged(prevBoundary, limit,
1942                                                           *sink, options, edits, errorCode);
1943                         }
1944                         return TRUE;
1945                     }
1946                     uint8_t prevCC = cc;
1947                     nextSrc = src;
1948                     UTRIE2_U8_NEXT16(normTrie, nextSrc, limit, n16);
1949                     if (n16 >= MIN_YES_YES_WITH_CC) {
1950                         cc = getCCFromNormalYesOrMaybe(n16);
1951                         if (prevCC > cc) {
1952                             if (sink == nullptr) {
1953                                 return FALSE;
1954                             }
1955                             break;
1956                         }
1957                     } else {
1958                         break;
1959                     }
1960                     src = nextSrc;
1961                 }
1962                 // src is after the last in-order combining mark.
1963                 // If there is a boundary here, then we continue with no change.
1964                 if (norm16HasCompBoundaryBefore(n16)) {
1965                     if (isCompYesAndZeroCC(n16)) {
1966                         src = nextSrc;
1967                     }
1968                     continue;
1969                 }
1970                 // Use the slow path. There is no boundary in [prevSrc, src[.
1971             }
1972         }
1973
1974         // Slow path: Find the nearest boundaries around the current character,
1975         // decompose and recompose.
1976         if (prevBoundary != prevSrc && !norm16HasCompBoundaryBefore(norm16)) {
             // Pull the preceding character into the range to be recomposed
             // unless it has a boundary after it. norm16 is overwritten here.
1977             const uint8_t *p = prevSrc;
1978             UTRIE2_U8_PREV16(normTrie, prevBoundary, p, norm16);
1979             if (!norm16HasCompBoundaryAfter(norm16, onlyContiguous)) {
1980                 prevSrc = p;
1981             }
1982         }
1983         ReorderingBuffer buffer(*this, s16, errorCode);
1984         if (U_FAILURE(errorCode)) {
1985             break;
1986         }
1987         // We know there is not a boundary here.
1988         decomposeShort(prevSrc, src, FALSE /* !stopAtCompBoundary */, onlyContiguous,
1989                        buffer, errorCode);
1990         // Decompose until the next boundary.
1991         src = decomposeShort(src, limit, TRUE /* stopAtCompBoundary */, onlyContiguous,
1992                              buffer, errorCode);
1993         if (U_FAILURE(errorCode)) {
1994             break;
1995         }
         // The error is reported through errorCode; the return value is still TRUE.
1996         if ((src - prevSrc) > INT32_MAX) {  // guard before buffer.equals()
1997             errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
1998             return TRUE;
1999         }
2000         recompose(buffer, 0, onlyContiguous);
         // If the recomposed text equals the source bytes, nothing is written here and
         // prevBoundary is not moved, so this stretch is emitted later as unchanged text.
2001         if (!buffer.equals(prevSrc, src)) {
2002             if (sink == nullptr) {
2003                 return FALSE;
2004             }
2005             if (prevBoundary != prevSrc &&
2006                     !ByteSinkUtil::appendUnchanged(prevBoundary, prevSrc,
2007                                                    *sink, options, edits, errorCode)) {
2008                 break;
2009             }
2010             if (!ByteSinkUtil::appendChange(prevSrc, src, buffer.getStart(), buffer.length(),
2011                                             *sink, edits, errorCode)) {
2012                 break;
2013             }
2014             prevBoundary = src;
2015         }
2016     }
2017     return TRUE;
2018 }
2019
2020 UBool Normalizer2Impl::hasCompBoundaryBefore(const UChar *src, const UChar *limit) const {
2021     if (src == limit || *src < minCompNoMaybeCP) {
2022         return TRUE;
2023     }
2024     UChar32 c;
2025     uint16_t norm16;
2026     UTRIE2_U16_NEXT16(normTrie, src, limit, c, norm16);
2027     return norm16HasCompBoundaryBefore(norm16);
2028 }
2029
2030 UBool Normalizer2Impl::hasCompBoundaryBefore(const uint8_t *src, const uint8_t *limit) const {
2031     if (src == limit) {
2032         return TRUE;
2033     }
2034     uint16_t norm16;
2035     UTRIE2_U8_NEXT16(normTrie, src, limit, norm16);
2036     return norm16HasCompBoundaryBefore(norm16);
2037 }
2038
2039 UBool Normalizer2Impl::hasCompBoundaryAfter(const UChar *start, const UChar *p,
2040                                             UBool onlyContiguous) const {
2041     if (start == p) {
2042         return TRUE;
2043     }
2044     UChar32 c;
2045     uint16_t norm16;
2046     UTRIE2_U16_PREV16(normTrie, start, p, c, norm16);
2047     return norm16HasCompBoundaryAfter(norm16, onlyContiguous);
2048 }
2049
2050 UBool Normalizer2Impl::hasCompBoundaryAfter(const uint8_t *start, const uint8_t *p,
2051                                             UBool onlyContiguous) const {
2052     if (start == p) {
2053         return TRUE;
2054     }
2055     uint16_t norm16;
2056     UTRIE2_U8_PREV16(normTrie, start, p, norm16);
2057     return norm16HasCompBoundaryAfter(norm16, onlyContiguous);
2058 }
2059
2060 const UChar *Normalizer2Impl::findPreviousCompBoundary(const UChar *start, const UChar *p,
2061                                                        UBool onlyContiguous) const {
2062     BackwardUTrie2StringIterator iter(normTrie, start, p);
2063     for(;;) {
2064         uint16_t norm16=iter.previous16();
2065         if (norm16HasCompBoundaryAfter(norm16, onlyContiguous)) {
2066             return iter.codePointLimit;
2067         }
2068         if (hasCompBoundaryBefore(iter.codePoint, norm16)) {
2069             return iter.codePointStart;
2070         }
2071     }
2072 }
2073
2074 const UChar *Normalizer2Impl::findNextCompBoundary(const UChar *p, const UChar *limit,
2075                                                    UBool onlyContiguous) const {
2076     ForwardUTrie2StringIterator iter(normTrie, p, limit);
2077     for(;;) {
2078         uint16_t norm16=iter.next16();
2079         if (hasCompBoundaryBefore(iter.codePoint, norm16)) {
2080             return iter.codePointStart;
2081         }
2082         if (norm16HasCompBoundaryAfter(norm16, onlyContiguous)) {
2083             return iter.codePointLimit;
2084         }
2085     }
2086 }
2087
2088 uint8_t Normalizer2Impl::getPreviousTrailCC(const UChar *start, const UChar *p) const {
2089     if (start == p) {
2090         return 0;
2091     }
2092     int32_t i = (int32_t)(p - start);
2093     UChar32 c;
2094     U16_PREV(start, 0, i, c);
2095     return (uint8_t)getFCD16(c);
2096 }
2097
2098 uint8_t Normalizer2Impl::getPreviousTrailCC(const uint8_t *start, const uint8_t *p) const {
2099     if (start == p) {
2100         return 0;
2101     }
2102     int32_t i = (int32_t)(p - start);
2103     UChar32 c;
2104     U8_PREV(start, 0, i, c);
2105     return (uint8_t)getFCD16(c);
2106 }
2107
2108 // Note: normalizer2impl.cpp r30982 (2011-nov-27)
2109 // still had getFCDTrie() which built and cached an FCD trie.
2110 // That provided faster access to FCD data than getFCD16FromNormData()
2111 // but required synchronization and consumed some 10kB of heap memory
2112 // in any process that uses FCD (e.g., via collation).
2113 // minDecompNoCP etc. and smallFCD[] are intended to help with any loss of performance,
2114 // at least for ASCII & CJK.
2115
2116 // Gets the FCD value from the regular normalization data.
2117 uint16_t Normalizer2Impl::getFCD16FromNormData(UChar32 c) const {
2118     uint16_t norm16=getNorm16(c);
2119     if (norm16 >= limitNoNo) {
2120         if(norm16>=MIN_NORMAL_MAYBE_YES) {
2121             // combining mark
2122             norm16=getCCFromNormalYesOrMaybe(norm16);
2123             return norm16|(norm16<<8);
2124         } else if(norm16>=minMaybeYes) {
2125             return 0;
2126         } else {  // isDecompNoAlgorithmic(norm16)
2127             uint16_t deltaTrailCC = norm16 & DELTA_TCCC_MASK;
2128             if (deltaTrailCC <= DELTA_TCCC_1) {
2129                 return deltaTrailCC >> OFFSET_SHIFT;
2130             }
2131             // Maps to an isCompYesAndZeroCC.
2132             c=mapAlgorithmic(c, norm16);
2133             norm16=getNorm16(c);
2134         }
2135     }
2136     if(norm16<=minYesNo || isHangulLVT(norm16)) {
2137         // no decomposition or Hangul syllable, all zeros
2138         return 0;
2139     }
2140     // c decomposes, get everything from the variable-length extra data
2141     const uint16_t *mapping=getMapping(norm16);
2142     uint16_t firstUnit=*mapping;
2143     norm16=firstUnit>>8;  // tccc
2144     if(firstUnit&MAPPING_HAS_CCC_LCCC_WORD) {
2145         norm16|=*(mapping-1)&0xff00;  // lccc
2146     }
2147     return norm16;
2148 }
2149
2150 // Dual functionality:
2151 // buffer!=NULL: normalize
2152 // buffer==NULL: isNormalized/quickCheck/spanQuickCheckYes
2153 const UChar *
2154 Normalizer2Impl::makeFCD(const UChar *src, const UChar *limit,
2155                          ReorderingBuffer *buffer,
2156                          UErrorCode &errorCode) const {
2157     // Tracks the last FCD-safe boundary, before lccc=0 or after properly-ordered tccc<=1.
2158     // Similar to the prevBoundary in the compose() implementation.
2159     const UChar *prevBoundary=src;
2160     int32_t prevFCD16=0;
2161     if(limit==NULL) {
2162         src=copyLowPrefixFromNulTerminated(src, minLcccCP, buffer, errorCode);
2163         if(U_FAILURE(errorCode)) {
2164             return src;
2165         }
2166         if(prevBoundary<src) {
2167             prevBoundary=src;
2168             // We know that the previous character's lccc==0.
2169             // Fetching the fcd16 value was deferred for this below-U+0300 code point.
2170             prevFCD16=getFCD16(*(src-1));
2171             if(prevFCD16>1) {
2172                 --prevBoundary;
2173             }
2174         }
2175         limit=u_strchr(src, 0);
2176     }
2177
2178     // Note: In this function we use buffer->appendZeroCC() because we track
2179     // the lead and trail combining classes here, rather than leaving it to
2180     // the ReorderingBuffer.
2181     // The exception is the call to decomposeShort() which uses the buffer
2182     // in the normal way.
2183
2184     const UChar *prevSrc;
2185     UChar32 c=0;
2186     uint16_t fcd16=0;
2187
2188     for(;;) {
2189         // count code units with lccc==0
2190         for(prevSrc=src; src!=limit;) {
2191             if((c=*src)<minLcccCP) {
2192                 prevFCD16=~c;
2193                 ++src;
2194             } else if(!singleLeadMightHaveNonZeroFCD16(c)) {
2195                 prevFCD16=0;
2196                 ++src;
2197             } else {
2198                 if(U16_IS_SURROGATE(c)) {
2199                     UChar c2;
2200                     if(U16_IS_SURROGATE_LEAD(c)) {
2201                         if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) {
2202                             c=U16_GET_SUPPLEMENTARY(c, c2);
2203                         }
2204                     } else /* trail surrogate */ {
2205                         if(prevSrc<src && U16_IS_LEAD(c2=*(src-1))) {
2206                             --src;
2207                             c=U16_GET_SUPPLEMENTARY(c2, c);
2208                         }
2209                     }
2210                 }
2211                 if((fcd16=getFCD16FromNormData(c))<=0xff) {
2212                     prevFCD16=fcd16;
2213                     src+=U16_LENGTH(c);
2214                 } else {
2215                     break;
2216                 }
2217             }
2218         }
2219         // copy these code units all at once
2220         if(src!=prevSrc) {
2221             if(buffer!=NULL && !buffer->appendZeroCC(prevSrc, src, errorCode)) {
2222                 break;
2223             }
2224             if(src==limit) {
2225                 break;
2226             }
2227             prevBoundary=src;
2228             // We know that the previous character's lccc==0.
2229             if(prevFCD16<0) {
2230                 // Fetching the fcd16 value was deferred for this below-minLcccCP code point.
2231                 UChar32 prev=~prevFCD16;
2232                 if(prev<minDecompNoCP) {
2233                     prevFCD16=0;
2234                 } else {
2235                     prevFCD16=getFCD16FromNormData(prev);
2236                     if(prevFCD16>1) {
2237                         --prevBoundary;
2238                     }
2239                 }
2240             } else {
2241                 const UChar *p=src-1;
2242                 if(U16_IS_TRAIL(*p) && prevSrc<p && U16_IS_LEAD(*(p-1))) {
2243                     --p;
2244                     // Need to fetch the previous character's FCD value because
2245                     // prevFCD16 was just for the trail surrogate code point.
2246                     prevFCD16=getFCD16FromNormData(U16_GET_SUPPLEMENTARY(p[0], p[1]));
2247                     // Still known to have lccc==0 because its lead surrogate unit had lccc==0.
2248                 }
2249                 if(prevFCD16>1) {
2250                     prevBoundary=p;
2251                 }
2252             }
2253             // The start of the current character (c).
2254             prevSrc=src;
2255         } else if(src==limit) {
2256             break;
2257         }
2258
2259         src+=U16_LENGTH(c);
2260         // The current character (c) at [prevSrc..src[ has a non-zero lead combining class.
2261         // Check for proper order, and decompose locally if necessary.
2262         if((prevFCD16&0xff)<=(fcd16>>8)) {
2263             // proper order: prev tccc <= current lccc
2264             if((fcd16&0xff)<=1) {
2265                 prevBoundary=src;
2266             }
2267             if(buffer!=NULL && !buffer->appendZeroCC(c, errorCode)) {
2268                 break;
2269             }
2270             prevFCD16=fcd16;
2271             continue;
2272         } else if(buffer==NULL) {
2273             return prevBoundary;  // quick check "no"
2274         } else {
2275             /*
2276              * Back out the part of the source that we copied or appended
2277              * already but is now going to be decomposed.
2278              * prevSrc is set to after what was copied/appended.
2279              */
2280             buffer->removeSuffix((int32_t)(prevSrc-prevBoundary));
2281             /*
2282              * Find the part of the source that needs to be decomposed,
2283              * up to the next safe boundary.
2284              */
2285             src=findNextFCDBoundary(src, limit);
2286             /*
2287              * The source text does not fulfill the conditions for FCD.
2288              * Decompose and reorder a limited piece of the text.
2289              */
2290             decomposeShort(prevBoundary, src, FALSE, FALSE, *buffer, errorCode);
2291             if (U_FAILURE(errorCode)) {
2292                 break;
2293             }
2294             prevBoundary=src;
2295             prevFCD16=0;
2296         }
2297     }
2298     return src;
2299 }
2300
2301 void Normalizer2Impl::makeFCDAndAppend(const UChar *src, const UChar *limit,
2302                                        UBool doMakeFCD,
2303                                        UnicodeString &safeMiddle,
2304                                        ReorderingBuffer &buffer,
2305                                        UErrorCode &errorCode) const {
2306     if(!buffer.isEmpty()) {
2307         const UChar *firstBoundaryInSrc=findNextFCDBoundary(src, limit);
2308         if(src!=firstBoundaryInSrc) {
2309             const UChar *lastBoundaryInDest=findPreviousFCDBoundary(buffer.getStart(),
2310                                                                     buffer.getLimit());
2311             int32_t destSuffixLength=(int32_t)(buffer.getLimit()-lastBoundaryInDest);
2312             UnicodeString middle(lastBoundaryInDest, destSuffixLength);
2313             buffer.removeSuffix(destSuffixLength);
2314             safeMiddle=middle;
2315             middle.append(src, (int32_t)(firstBoundaryInSrc-src));
2316             const UChar *middleStart=middle.getBuffer();
2317             makeFCD(middleStart, middleStart+middle.length(), &buffer, errorCode);
2318             if(U_FAILURE(errorCode)) {
2319                 return;
2320             }
2321             src=firstBoundaryInSrc;
2322         }
2323     }
2324     if(doMakeFCD) {
2325         makeFCD(src, limit, &buffer, errorCode);
2326     } else {
2327         if(limit==NULL) {  // appendZeroCC() needs limit!=NULL
2328             limit=u_strchr(src, 0);
2329         }
2330         buffer.appendZeroCC(src, limit, errorCode);
2331     }
2332 }
2333
2334 const UChar *Normalizer2Impl::findPreviousFCDBoundary(const UChar *start, const UChar *p) const {
2335     while(start<p) {
2336         const UChar *codePointLimit = p;
2337         UChar32 c;
2338         uint16_t norm16;
2339         UTRIE2_U16_PREV16(normTrie, start, p, c, norm16);
2340         if (c < minDecompNoCP || norm16HasDecompBoundaryAfter(norm16)) {
2341             return codePointLimit;
2342         }
2343         if (norm16HasDecompBoundaryBefore(norm16)) {
2344             return p;
2345         }
2346     }
2347     return p;
2348 }
2349
2350 const UChar *Normalizer2Impl::findNextFCDBoundary(const UChar *p, const UChar *limit) const {
2351     while(p<limit) {
2352         const UChar *codePointStart=p;
2353         UChar32 c;
2354         uint16_t norm16;
2355         UTRIE2_U16_NEXT16(normTrie, p, limit, c, norm16);
2356         if (c < minLcccCP || norm16HasDecompBoundaryBefore(norm16)) {
2357             return codePointStart;
2358         }
2359         if (norm16HasDecompBoundaryAfter(norm16)) {
2360             return p;
2361         }
2362     }
2363     return p;
2364 }
2365
2366 // CanonicalIterator data -------------------------------------------------- ***
2367
2368 CanonIterData::CanonIterData(UErrorCode &errorCode) :
2369         trie(utrie2_open(0, 0, &errorCode)),
2370         canonStartSets(uprv_deleteUObject, NULL, errorCode) {}
2371
2372 CanonIterData::~CanonIterData() {
2373     utrie2_close(trie);
2374 }
2375
2376 void CanonIterData::addToStartSet(UChar32 origin, UChar32 decompLead, UErrorCode &errorCode) {
2377     uint32_t canonValue=utrie2_get32(trie, decompLead);
2378     if((canonValue&(CANON_HAS_SET|CANON_VALUE_MASK))==0 && origin!=0) {
2379         // origin is the first character whose decomposition starts with
2380         // the character for which we are setting the value.
2381         utrie2_set32(trie, decompLead, canonValue|origin, &errorCode);
2382     } else {
2383         // origin is not the first character, or it is U+0000.
2384         UnicodeSet *set;
2385         if((canonValue&CANON_HAS_SET)==0) {
2386             set=new UnicodeSet;
2387             if(set==NULL) {
2388                 errorCode=U_MEMORY_ALLOCATION_ERROR;
2389                 return;
2390             }
2391             UChar32 firstOrigin=(UChar32)(canonValue&CANON_VALUE_MASK);
2392             canonValue=(canonValue&~CANON_VALUE_MASK)|CANON_HAS_SET|(uint32_t)canonStartSets.size();
2393             utrie2_set32(trie, decompLead, canonValue, &errorCode);
2394             canonStartSets.addElement(set, errorCode);
2395             if(firstOrigin!=0) {
2396                 set->add(firstOrigin);
2397             }
2398         } else {
2399             set=(UnicodeSet *)canonStartSets[(int32_t)(canonValue&CANON_VALUE_MASK)];
2400         }
2401         set->add(origin);
2402     }
2403 }
2404
2405 // C++ class for friend access to private Normalizer2Impl members.
2406 class InitCanonIterData {
2407 public:
2408     static void doInit(Normalizer2Impl *impl, UErrorCode &errorCode);
2409     static void handleRange(Normalizer2Impl *impl, UChar32 start, UChar32 end, uint16_t value, UErrorCode &errorCode);
2410 };
2411
2412 U_CDECL_BEGIN
2413
2414 // UInitOnce instantiation function for CanonIterData
2415 static void U_CALLCONV
2416 initCanonIterData(Normalizer2Impl *impl, UErrorCode &errorCode) {
2417     InitCanonIterData::doInit(impl, errorCode);
2418 }
2419
2420 // Call Normalizer2Impl::makeCanonIterDataFromNorm16() for a range of same-norm16 characters.
2421 //     context: the Normalizer2Impl
2422 static UBool U_CALLCONV
2423 enumCIDRangeHandler(const void *context, UChar32 start, UChar32 end, uint32_t value) {
2424     UErrorCode errorCode = U_ZERO_ERROR;
2425     if (value != Normalizer2Impl::INERT) {
2426         Normalizer2Impl *impl = (Normalizer2Impl *)context;
2427         InitCanonIterData::handleRange(impl, start, end, (uint16_t)value, errorCode);
2428     }
2429     return U_SUCCESS(errorCode);
2430 }
2431
2432 U_CDECL_END
2433
2434 void InitCanonIterData::doInit(Normalizer2Impl *impl, UErrorCode &errorCode) {
2435     U_ASSERT(impl->fCanonIterData == NULL);
2436     impl->fCanonIterData = new CanonIterData(errorCode);
2437     if (impl->fCanonIterData == NULL) {
2438         errorCode=U_MEMORY_ALLOCATION_ERROR;
2439     }
2440     if (U_SUCCESS(errorCode)) {
2441         utrie2_enum(impl->normTrie, NULL, enumCIDRangeHandler, impl);
2442         utrie2_freeze(impl->fCanonIterData->trie, UTRIE2_32_VALUE_BITS, &errorCode);
2443     }
2444     if (U_FAILURE(errorCode)) {
2445         delete impl->fCanonIterData;
2446         impl->fCanonIterData = NULL;
2447     }
2448 }
2449
2450 void InitCanonIterData::handleRange(
2451         Normalizer2Impl *impl, UChar32 start, UChar32 end, uint16_t value, UErrorCode &errorCode) {
2452     impl->makeCanonIterDataFromNorm16(start, end, value, *impl->fCanonIterData, errorCode);
2453 }
2454
2455 void Normalizer2Impl::makeCanonIterDataFromNorm16(UChar32 start, UChar32 end, const uint16_t norm16,
2456                                                   CanonIterData &newData,
2457                                                   UErrorCode &errorCode) const {
2458     if(isInert(norm16) || (minYesNo<=norm16 && norm16<minNoNo)) {
2459         // Inert, or 2-way mapping (including Hangul syllable).
2460         // We do not write a canonStartSet for any yesNo character.
2461         // Composites from 2-way mappings are added at runtime from the
2462         // starter's compositions list, and the other characters in
2463         // 2-way mappings get CANON_NOT_SEGMENT_STARTER set because they are
2464         // "maybe" characters.
2465         return;
2466     }
2467     for(UChar32 c=start; c<=end; ++c) {
2468         uint32_t oldValue=utrie2_get32(newData.trie, c);
2469         uint32_t newValue=oldValue;
2470         if(isMaybeOrNonZeroCC(norm16)) {
2471             // not a segment starter if it occurs in a decomposition or has cc!=0
2472             newValue|=CANON_NOT_SEGMENT_STARTER;
2473             if(norm16<MIN_NORMAL_MAYBE_YES) {
2474                 newValue|=CANON_HAS_COMPOSITIONS;
2475             }
2476         } else if(norm16<minYesNo) {
2477             newValue|=CANON_HAS_COMPOSITIONS;
2478         } else {
2479             // c has a one-way decomposition
2480             UChar32 c2=c;
2481             // Do not modify the whole-range norm16 value.
2482             uint16_t norm16_2=norm16;
2483             if (isDecompNoAlgorithmic(norm16_2)) {
2484                 // Maps to an isCompYesAndZeroCC.
2485                 c2 = mapAlgorithmic(c2, norm16_2);
2486                 norm16_2 = getNorm16(c2);
2487                 // No compatibility mappings for the CanonicalIterator.
2488                 U_ASSERT(!(isHangulLV(norm16_2) || isHangulLVT(norm16_2)));
2489             }
2490             if (norm16_2 > minYesNo) {
2491                 // c decomposes, get everything from the variable-length extra data
2492                 const uint16_t *mapping=getMapping(norm16_2);
2493                 uint16_t firstUnit=*mapping;
2494                 int32_t length=firstUnit&MAPPING_LENGTH_MASK;
2495                 if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) {
2496                     if(c==c2 && (*(mapping-1)&0xff)!=0) {
2497                         newValue|=CANON_NOT_SEGMENT_STARTER;  // original c has cc!=0
2498                     }
2499                 }
2500                 // Skip empty mappings (no characters in the decomposition).
2501                 if(length!=0) {
2502                     ++mapping;  // skip over the firstUnit
2503                     // add c to first code point's start set
2504                     int32_t i=0;
2505                     U16_NEXT_UNSAFE(mapping, i, c2);
2506                     newData.addToStartSet(c, c2, errorCode);
2507                     // Set CANON_NOT_SEGMENT_STARTER for each remaining code point of a
2508                     // one-way mapping. A 2-way mapping is possible here after
2509                     // intermediate algorithmic mapping.
2510                     if(norm16_2>=minNoNo) {
2511                         while(i<length) {
2512                             U16_NEXT_UNSAFE(mapping, i, c2);
2513                             uint32_t c2Value=utrie2_get32(newData.trie, c2);
2514                             if((c2Value&CANON_NOT_SEGMENT_STARTER)==0) {
2515                                 utrie2_set32(newData.trie, c2, c2Value|CANON_NOT_SEGMENT_STARTER,
2516                                              &errorCode);
2517                             }
2518                         }
2519                     }
2520                 }
2521             } else {
2522                 // c decomposed to c2 algorithmically; c has cc==0
2523                 newData.addToStartSet(c, c2, errorCode);
2524             }
2525         }
2526         if(newValue!=oldValue) {
2527             utrie2_set32(newData.trie, c, newValue, &errorCode);
2528         }
2529     }
2530 }
2531
2532 UBool Normalizer2Impl::ensureCanonIterData(UErrorCode &errorCode) const {
2533     // Logically const: Synchronized instantiation.
2534     Normalizer2Impl *me=const_cast<Normalizer2Impl *>(this);
2535     umtx_initOnce(me->fCanonIterDataInitOnce, &initCanonIterData, me, errorCode);
2536     return U_SUCCESS(errorCode);
2537 }
2538
2539 int32_t Normalizer2Impl::getCanonValue(UChar32 c) const {
2540     return (int32_t)utrie2_get32(fCanonIterData->trie, c);
2541 }
2542
2543 const UnicodeSet &Normalizer2Impl::getCanonStartSet(int32_t n) const {
2544     return *(const UnicodeSet *)fCanonIterData->canonStartSets[n];
2545 }
2546
2547 UBool Normalizer2Impl::isCanonSegmentStarter(UChar32 c) const {
2548     return getCanonValue(c)>=0;
2549 }
2550
2551 UBool Normalizer2Impl::getCanonStartSet(UChar32 c, UnicodeSet &set) const {
2552     int32_t canonValue=getCanonValue(c)&~CANON_NOT_SEGMENT_STARTER;
2553     if(canonValue==0) {
2554         return FALSE;
2555     }
2556     set.clear();
2557     int32_t value=canonValue&CANON_VALUE_MASK;
2558     if((canonValue&CANON_HAS_SET)!=0) {
2559         set.addAll(getCanonStartSet(value));
2560     } else if(value!=0) {
2561         set.add(value);
2562     }
2563     if((canonValue&CANON_HAS_COMPOSITIONS)!=0) {
2564         uint16_t norm16=getNorm16(c);
2565         if(norm16==JAMO_L) {
2566             UChar32 syllable=
2567                 (UChar32)(Hangul::HANGUL_BASE+(c-Hangul::JAMO_L_BASE)*Hangul::JAMO_VT_COUNT);
2568             set.add(syllable, syllable+Hangul::JAMO_VT_COUNT-1);
2569         } else {
2570             addComposites(getCompositionsList(norm16), set);
2571         }
2572     }
2573     return TRUE;
2574 }
2575
2576 U_NAMESPACE_END
2577
2578 // Normalizer2 data swapping ----------------------------------------------- ***
2579
2580 U_NAMESPACE_USE
2581
2582 U_CAPI int32_t U_EXPORT2
2583 unorm2_swap(const UDataSwapper *ds,
2584             const void *inData, int32_t length, void *outData,
2585             UErrorCode *pErrorCode) {
2586     const UDataInfo *pInfo;
2587     int32_t headerSize;
2588
2589     const uint8_t *inBytes;
2590     uint8_t *outBytes;
2591
2592     const int32_t *inIndexes;
2593     int32_t indexes[Normalizer2Impl::IX_TOTAL_SIZE+1];
2594
2595     int32_t i, offset, nextOffset, size;
2596
2597     /* udata_swapDataHeader checks the arguments */
2598     headerSize=udata_swapDataHeader(ds, inData, length, outData, pErrorCode);
2599     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
2600         return 0;
2601     }
2602
2603     /* check data format and format version */
2604     pInfo=(const UDataInfo *)((const char *)inData+4);
2605     uint8_t formatVersion0=pInfo->formatVersion[0];
2606     if(!(
2607         pInfo->dataFormat[0]==0x4e &&   /* dataFormat="Nrm2" */
2608         pInfo->dataFormat[1]==0x72 &&
2609         pInfo->dataFormat[2]==0x6d &&
2610         pInfo->dataFormat[3]==0x32 &&
2611         (1<=formatVersion0 && formatVersion0<=3)
2612     )) {
2613         udata_printError(ds, "unorm2_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as Normalizer2 data\n",
2614                          pInfo->dataFormat[0], pInfo->dataFormat[1],
2615                          pInfo->dataFormat[2], pInfo->dataFormat[3],
2616                          pInfo->formatVersion[0]);
2617         *pErrorCode=U_UNSUPPORTED_ERROR;
2618         return 0;
2619     }
2620
2621     inBytes=(const uint8_t *)inData+headerSize;
2622     outBytes=(uint8_t *)outData+headerSize;
2623
2624     inIndexes=(const int32_t *)inBytes;
2625     int32_t minIndexesLength;
2626     if(formatVersion0==1) {
2627         minIndexesLength=Normalizer2Impl::IX_MIN_MAYBE_YES+1;
2628     } else if(formatVersion0==2) {
2629         minIndexesLength=Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY+1;
2630     } else {
2631         minIndexesLength=Normalizer2Impl::IX_MIN_LCCC_CP+1;
2632     }
2633
2634     if(length>=0) {
2635         length-=headerSize;
2636         if(length<minIndexesLength*4) {
2637             udata_printError(ds, "unorm2_swap(): too few bytes (%d after header) for Normalizer2 data\n",
2638                              length);
2639             *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
2640             return 0;
2641         }
2642     }
2643
2644     /* read the first few indexes */
2645     for(i=0; i<UPRV_LENGTHOF(indexes); ++i) {
2646         indexes[i]=udata_readInt32(ds, inIndexes[i]);
2647     }
2648
2649     /* get the total length of the data */
2650     size=indexes[Normalizer2Impl::IX_TOTAL_SIZE];
2651
2652     if(length>=0) {
2653         if(length<size) {
2654             udata_printError(ds, "unorm2_swap(): too few bytes (%d after header) for all of Normalizer2 data\n",
2655                              length);
2656             *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
2657             return 0;
2658         }
2659
2660         /* copy the data for inaccessible bytes */
2661         if(inBytes!=outBytes) {
2662             uprv_memcpy(outBytes, inBytes, size);
2663         }
2664
2665         offset=0;
2666
2667         /* swap the int32_t indexes[] */
2668         nextOffset=indexes[Normalizer2Impl::IX_NORM_TRIE_OFFSET];
2669         ds->swapArray32(ds, inBytes, nextOffset-offset, outBytes, pErrorCode);
2670         offset=nextOffset;
2671
2672         /* swap the UTrie2 */
2673         nextOffset=indexes[Normalizer2Impl::IX_EXTRA_DATA_OFFSET];
2674         utrie2_swap(ds, inBytes+offset, nextOffset-offset, outBytes+offset, pErrorCode);
2675         offset=nextOffset;
2676
2677         /* swap the uint16_t extraData[] */
2678         nextOffset=indexes[Normalizer2Impl::IX_SMALL_FCD_OFFSET];
2679         ds->swapArray16(ds, inBytes+offset, nextOffset-offset, outBytes+offset, pErrorCode);
2680         offset=nextOffset;
2681
2682         /* no need to swap the uint8_t smallFCD[] (new in formatVersion 2) */
2683         nextOffset=indexes[Normalizer2Impl::IX_SMALL_FCD_OFFSET+1];
2684         offset=nextOffset;
2685
2686         U_ASSERT(offset==size);
2687     }
2688
2689     return headerSize+size;
2690 }
2691
2692 #endif  // !UCONFIG_NO_NORMALIZATION