icuSources/common/normalizer2impl.cpp

   1 // © 2016 and later: Unicode, Inc. and others.
   2 // License & terms of use: http://www.unicode.org/copyright.html
   3 /*
   4 *******************************************************************************
   5 *
   6 *   Copyright (C) 2009-2014, International Business Machines
   7 *   Corporation and others.  All Rights Reserved.
   8 *
   9 *******************************************************************************
  10 *   file name:  normalizer2impl.cpp
  11 *   encoding:   UTF-8
  12 *   tab size:   8 (not used)
  13 *   indentation:4
  14 *
  15 *   created on: 2009nov22
  16 *   created by: Markus W. Scherer
  17 */
  18
  19 // #define UCPTRIE_DEBUG
  20
  21 #include "unicode/utypes.h"
  22
  23 #if !UCONFIG_NO_NORMALIZATION
  24
  25 #include "unicode/bytestream.h"
  26 #include "unicode/edits.h"
  27 #include "unicode/normalizer2.h"
  28 #include "unicode/stringoptions.h"
  29 #include "unicode/ucptrie.h"
  30 #include "unicode/udata.h"
  31 #include "unicode/umutablecptrie.h"
  32 #include "unicode/ustring.h"
  33 #include "unicode/utf16.h"
  34 #include "unicode/utf8.h"
  35 #include "bytesinkutil.h"
  36 #include "cmemory.h"
  37 #include "mutex.h"
  38 #include "normalizer2impl.h"
  39 #include "putilimp.h"
  40 #include "uassert.h"
  41 #include "ucptrie_impl.h"
  42 #include "uset_imp.h"
  43 #include "uvector.h"
  44
  45 U_NAMESPACE_BEGIN
  46
  47 namespace {
  48
  49 /**
  50  * UTF-8 lead byte for minNoMaybeCP.
  51  * Can be lower than the actual lead byte for c.
  52  * Typically U+0300 for NFC/NFD, U+00A0 for NFKC/NFKD, U+0041 for NFKC_Casefold.
  53  */
  54 inline uint8_t leadByteForCP(UChar32 c) {
  55     if (c <= 0x7f) {
  56         return (uint8_t)c;
  57     } else if (c <= 0x7ff) {
  58         return (uint8_t)(0xc0+(c>>6));
  59     } else {
  60         // Should not occur because ccc(U+0300)!=0.
  61         return 0xe0;
  62     }
  63 }
  64
  65 /**
  66  * Returns the code point from one single well-formed UTF-8 byte sequence
  67  * between cpStart and cpLimit.
  68  *
  69  * Trie UTF-8 macros do not assemble whole code points (for efficiency).
  70  * When we do need the code point, we call this function.
  71  * We should not need it for normalization-inert data (norm16==0).
  72  * Illegal sequences yield the error value norm16==0 just like real normalization-inert code points.
  73  */
  74 UChar32 codePointFromValidUTF8(const uint8_t *cpStart, const uint8_t *cpLimit) {
  75     // Similar to U8_NEXT_UNSAFE(s, i, c).
  76     U_ASSERT(cpStart < cpLimit);
  77     uint8_t c = *cpStart;
  78     switch(cpLimit-cpStart) {
  79     case 1:
  80         return c;
  81     case 2:
  82         return ((c&0x1f)<<6) | (cpStart[1]&0x3f);
  83     case 3:
  84         // no need for (c&0xf) because the upper bits are truncated after <<12 in the cast to (UChar)
  85         return (UChar)((c<<12) | ((cpStart[1]&0x3f)<<6) | (cpStart[2]&0x3f));
  86     case 4:
  87         return ((c&7)<<18) | ((cpStart[1]&0x3f)<<12) | ((cpStart[2]&0x3f)<<6) | (cpStart[3]&0x3f);
  88     default:
  89         UPRV_UNREACHABLE;  // Should not occur.
  90     }
  91 }
  92
  93 /**
  94  * Returns the last code point in [start, p[ if it is valid and in U+1000..U+D7FF.
  95  * Otherwise returns a negative value.
  96  */
  97 UChar32 previousHangulOrJamo(const uint8_t *start, const uint8_t *p) {
  98     if ((p - start) >= 3) {
  99         p -= 3;
 100         uint8_t l = *p;
 101         uint8_t t1, t2;
 102         if (0xe1 <= l && l <= 0xed &&
 103                 (t1 = (uint8_t)(p[1] - 0x80)) <= 0x3f &&
 104                 (t2 = (uint8_t)(p[2] - 0x80)) <= 0x3f &&
 105                 (l < 0xed || t1 <= 0x1f)) {
 106             return ((l & 0xf) << 12) | (t1 << 6) | t2;
 107         }
 108     }
 109     return U_SENTINEL;
 110 }
 111
 112 /**
 113  * Returns the offset from the Jamo T base if [src, limit[ starts with a single Jamo T code point.
 114  * Otherwise returns a negative value.
 115  */
 116 int32_t getJamoTMinusBase(const uint8_t *src, const uint8_t *limit) {
 117     // Jamo T: E1 86 A8..E1 87 82
 118     if ((limit - src) >= 3 && *src == 0xe1) {
 119         if (src[1] == 0x86) {
 120             uint8_t t = src[2];
 121             // The first Jamo T is U+11A8 but JAMO_T_BASE is 11A7.
 122             // Offset 0 does not correspond to any conjoining Jamo.
 123             if (0xa8 <= t && t <= 0xbf) {
 124                 return t - 0xa7;
 125             }
 126         } else if (src[1] == 0x87) {
 127             uint8_t t = src[2];
 128             if ((int8_t)t <= (int8_t)0x82u) {
 129                 return t - (0xa7 - 0x40);
 130             }
 131         }
 132     }
 133     return -1;
 134 }
 135
 136 void
 137 appendCodePointDelta(const uint8_t *cpStart, const uint8_t *cpLimit, int32_t delta,
 138                      ByteSink &sink, Edits *edits) {
 139     char buffer[U8_MAX_LENGTH];
 140     int32_t length;
 141     int32_t cpLength = (int32_t)(cpLimit - cpStart);
 142     if (cpLength == 1) {
 143         // The builder makes ASCII map to ASCII.
 144         buffer[0] = (uint8_t)(*cpStart + delta);
 145         length = 1;
 146     } else {
 147         int32_t trail = *(cpLimit-1) + delta;
 148         if (0x80 <= trail && trail <= 0xbf) {
 149             // The delta only changes the last trail byte.
 150             --cpLimit;
 151             length = 0;
 152             do { buffer[length++] = *cpStart++; } while (cpStart < cpLimit);
 153             buffer[length++] = (uint8_t)trail;
 154         } else {
 155             // Decode the code point, add the delta, re-encode.
 156             UChar32 c = codePointFromValidUTF8(cpStart, cpLimit) + delta;
 157             length = 0;
 158             U8_APPEND_UNSAFE(buffer, length, c);
 159         }
 160     }
 161     if (edits != nullptr) {
 162         edits->addReplace(cpLength, length);
 163     }
 164     sink.Append(buffer, length);
 165 }
 166
 167 }  // namespace
 168
 169 // ReorderingBuffer -------------------------------------------------------- ***
 170
 171 ReorderingBuffer::ReorderingBuffer(const Normalizer2Impl &ni, UnicodeString &dest,
 172                                    UErrorCode &errorCode) :
 173         impl(ni), str(dest),
 174         start(str.getBuffer(8)), reorderStart(start), limit(start),
 175         remainingCapacity(str.getCapacity()), lastCC(0) {
 176     if (start == nullptr && U_SUCCESS(errorCode)) {
 177         // getBuffer() already did str.setToBogus()
 178         errorCode = U_MEMORY_ALLOCATION_ERROR;
 179     }
 180 }
 181
 182 UBool ReorderingBuffer::init(int32_t destCapacity, UErrorCode &errorCode) {
 183     int32_t length=str.length();
 184     start=str.getBuffer(destCapacity);
 185     if(start==NULL) {
 186         // getBuffer() already did str.setToBogus()
 187         errorCode=U_MEMORY_ALLOCATION_ERROR;
 188         return FALSE;
 189     }
 190     limit=start+length;
 191     remainingCapacity=str.getCapacity()-length;
 192     reorderStart=start;
 193     if(start==limit) {
 194         lastCC=0;
 195     } else {
 196         setIterator();
 197         lastCC=previousCC();
 198         // Set reorderStart after the last code point with cc<=1 if there is one.
 199         if(lastCC>1) {
 200             while(previousCC()>1) {}
 201         }
 202         reorderStart=codePointLimit;
 203     }
 204     return TRUE;
 205 }
 206
 207 UBool ReorderingBuffer::equals(const UChar *otherStart, const UChar *otherLimit) const {
 208     int32_t length=(int32_t)(limit-start);
 209     return
 210         length==(int32_t)(otherLimit-otherStart) &&
 211         0==u_memcmp(start, otherStart, length);
 212 }
 213
 214 UBool ReorderingBuffer::equals(const uint8_t *otherStart, const uint8_t *otherLimit) const {
 215     U_ASSERT((otherLimit - otherStart) <= INT32_MAX);  // ensured by caller
 216     int32_t length = (int32_t)(limit - start);
 217     int32_t otherLength = (int32_t)(otherLimit - otherStart);
 218     // For equal strings, UTF-8 is at least as long as UTF-16, and at most three times as long.
 219     if (otherLength < length || (otherLength / 3) > length) {
 220         return FALSE;
 221     }
 222     // Compare valid strings from between normalization boundaries.
 223     // (Invalid sequences are normalization-inert.)
 224     for (int32_t i = 0, j = 0;;) {
 225         if (i >= length) {
 226             return j >= otherLength;
 227         } else if (j >= otherLength) {
 228             return FALSE;
 229         }
 230         // Not at the end of either string yet.
 231         UChar32 c, other;
 232         U16_NEXT_UNSAFE(start, i, c);
 233         U8_NEXT_UNSAFE(otherStart, j, other);
 234         if (c != other) {
 235             return FALSE;
 236         }
 237     }
 238 }
 239
 240 UBool ReorderingBuffer::appendSupplementary(UChar32 c, uint8_t cc, UErrorCode &errorCode) {
 241     if(remainingCapacity<2 && !resize(2, errorCode)) {
 242         return FALSE;
 243     }
 244     if(lastCC<=cc || cc==0) {
 245         limit[0]=U16_LEAD(c);
 246         limit[1]=U16_TRAIL(c);
 247         limit+=2;
 248         lastCC=cc;
 249         if(cc<=1) {
 250             reorderStart=limit;
 251         }
 252     } else {
 253         insert(c, cc);
 254     }
 255     remainingCapacity-=2;
 256     return TRUE;
 257 }
 258
 259 UBool ReorderingBuffer::append(const UChar *s, int32_t length, UBool isNFD,
 260                                uint8_t leadCC, uint8_t trailCC,
 261                                UErrorCode &errorCode) {
 262     if(length==0) {
 263         return TRUE;
 264     }
 265     if(remainingCapacity<length && !resize(length, errorCode)) {
 266         return FALSE;
 267     }
 268     remainingCapacity-=length;
 269     if(lastCC<=leadCC || leadCC==0) {
 270         if(trailCC<=1) {
 271             reorderStart=limit+length;
 272         } else if(leadCC<=1) {
 273             reorderStart=limit+1;  // Ok if not a code point boundary.
 274         }
 275         const UChar *sLimit=s+length;
 276         do { *limit++=*s++; } while(s!=sLimit);
 277         lastCC=trailCC;
 278     } else {
 279         int32_t i=0;
 280         UChar32 c;
 281         U16_NEXT(s, i, length, c);
 282         insert(c, leadCC);  // insert first code point
 283         while(i<length) {
 284             U16_NEXT(s, i, length, c);
 285             if(i<length) {
 286                 if (isNFD) {
 287                     leadCC = Normalizer2Impl::getCCFromYesOrMaybe(impl.getRawNorm16(c));
 288                 } else {
 289                     leadCC = impl.getCC(impl.getNorm16(c));
 290                 }
 291             } else {
 292                 leadCC=trailCC;
 293             }
 294             append(c, leadCC, errorCode);
 295         }
 296     }
 297     return TRUE;
 298 }
 299
 300 UBool ReorderingBuffer::appendZeroCC(UChar32 c, UErrorCode &errorCode) {
 301     int32_t cpLength=U16_LENGTH(c);
 302     if(remainingCapacity<cpLength && !resize(cpLength, errorCode)) {
 303         return FALSE;
 304     }
 305     remainingCapacity-=cpLength;
 306     if(cpLength==1) {
 307         *limit++=(UChar)c;
 308     } else {
 309         limit[0]=U16_LEAD(c);
 310         limit[1]=U16_TRAIL(c);
 311         limit+=2;
 312     }
 313     lastCC=0;
 314     reorderStart=limit;
 315     return TRUE;
 316 }
 317
 318 UBool ReorderingBuffer::appendZeroCC(const UChar *s, const UChar *sLimit, UErrorCode &errorCode) {
 319     if(s==sLimit) {
 320         return TRUE;
 321     }
 322     int32_t length=(int32_t)(sLimit-s);
 323     if(remainingCapacity<length && !resize(length, errorCode)) {
 324         return FALSE;
 325     }
 326     u_memcpy(limit, s, length);
 327     limit+=length;
 328     remainingCapacity-=length;
 329     lastCC=0;
 330     reorderStart=limit;
 331     return TRUE;
 332 }
 333
 334 void ReorderingBuffer::remove() {
 335     reorderStart=limit=start;
 336     remainingCapacity=str.getCapacity();
 337     lastCC=0;
 338 }
 339
 340 void ReorderingBuffer::removeSuffix(int32_t suffixLength) {
 341     if(suffixLength<(limit-start)) {
 342         limit-=suffixLength;
 343         remainingCapacity+=suffixLength;
 344     } else {
 345         limit=start;
 346         remainingCapacity=str.getCapacity();
 347     }
 348     lastCC=0;
 349     reorderStart=limit;
 350 }
 351
 352 UBool ReorderingBuffer::resize(int32_t appendLength, UErrorCode &errorCode) {
 353     int32_t reorderStartIndex=(int32_t)(reorderStart-start);
 354     int32_t length=(int32_t)(limit-start);
 355     str.releaseBuffer(length);
 356     int32_t newCapacity=length+appendLength;
 357     int32_t doubleCapacity=2*str.getCapacity();
 358     if(newCapacity<doubleCapacity) {
 359         newCapacity=doubleCapacity;
 360     }
 361     if(newCapacity<256) {
 362         newCapacity=256;
 363     }
 364     start=str.getBuffer(newCapacity);
 365     if(start==NULL) {
 366         // getBuffer() already did str.setToBogus()
 367         errorCode=U_MEMORY_ALLOCATION_ERROR;
 368         return FALSE;
 369     }
 370     reorderStart=start+reorderStartIndex;
 371     limit=start+length;
 372     remainingCapacity=str.getCapacity()-length;
 373     return TRUE;
 374 }
 375
 376 void ReorderingBuffer::skipPrevious() {
 377     codePointLimit=codePointStart;
 378     UChar c=*--codePointStart;
 379     if(U16_IS_TRAIL(c) && start<codePointStart && U16_IS_LEAD(*(codePointStart-1))) {
 380         --codePointStart;
 381     }
 382 }
 383
 384 uint8_t ReorderingBuffer::previousCC() {
 385     codePointLimit=codePointStart;
 386     if(reorderStart>=codePointStart) {
 387         return 0;
 388     }
 389     UChar32 c=*--codePointStart;
 390     UChar c2;
 391     if(U16_IS_TRAIL(c) && start<codePointStart && U16_IS_LEAD(c2=*(codePointStart-1))) {
 392         --codePointStart;
 393         c=U16_GET_SUPPLEMENTARY(c2, c);
 394     }
 395     return impl.getCCFromYesOrMaybeCP(c);
 396 }
 397
 398 // Inserts c somewhere before the last character.
 399 // Requires 0<cc<lastCC which implies reorderStart<limit.
 400 void ReorderingBuffer::insert(UChar32 c, uint8_t cc) {
 401     for(setIterator(), skipPrevious(); previousCC()>cc;) {}
 402     // insert c at codePointLimit, after the character with prevCC<=cc
 403     UChar *q=limit;
 404     UChar *r=limit+=U16_LENGTH(c);
 405     do {
 406         *--r=*--q;
 407     } while(codePointLimit!=q);
 408     writeCodePoint(q, c);
 409     if(cc<=1) {
 410         reorderStart=r;
 411     }
 412 }
 413
 414 // Normalizer2Impl --------------------------------------------------------- ***
 415
 416 struct CanonIterData : public UMemory {
 417     CanonIterData(UErrorCode &errorCode);
 418     ~CanonIterData();
 419     void addToStartSet(UChar32 origin, UChar32 decompLead, UErrorCode &errorCode);
 420     UMutableCPTrie *mutableTrie;
 421     UCPTrie *trie;
 422     UVector canonStartSets;  // contains UnicodeSet *
 423 };
 424
 425 Normalizer2Impl::~Normalizer2Impl() {
 426     delete fCanonIterData;
 427 }
 428
 429 void
 430 Normalizer2Impl::init(const int32_t *inIndexes, const UCPTrie *inTrie,
 431                       const uint16_t *inExtraData, const uint8_t *inSmallFCD) {
 432     minDecompNoCP = static_cast<UChar>(inIndexes[IX_MIN_DECOMP_NO_CP]);
 433     minCompNoMaybeCP = static_cast<UChar>(inIndexes[IX_MIN_COMP_NO_MAYBE_CP]);
 434     minLcccCP = static_cast<UChar>(inIndexes[IX_MIN_LCCC_CP]);
 435
 436     minYesNo = static_cast<uint16_t>(inIndexes[IX_MIN_YES_NO]);
 437     minYesNoMappingsOnly = static_cast<uint16_t>(inIndexes[IX_MIN_YES_NO_MAPPINGS_ONLY]);
 438     minNoNo = static_cast<uint16_t>(inIndexes[IX_MIN_NO_NO]);
 439     minNoNoCompBoundaryBefore = static_cast<uint16_t>(inIndexes[IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE]);
 440     minNoNoCompNoMaybeCC = static_cast<uint16_t>(inIndexes[IX_MIN_NO_NO_COMP_NO_MAYBE_CC]);
 441     minNoNoEmpty = static_cast<uint16_t>(inIndexes[IX_MIN_NO_NO_EMPTY]);
 442     limitNoNo = static_cast<uint16_t>(inIndexes[IX_LIMIT_NO_NO]);
 443     minMaybeYes = static_cast<uint16_t>(inIndexes[IX_MIN_MAYBE_YES]);
 444     U_ASSERT((minMaybeYes & 7) == 0);  // 8-aligned for noNoDelta bit fields
 445     centerNoNoDelta = (minMaybeYes >> DELTA_SHIFT) - MAX_DELTA - 1;
 446
 447     normTrie=inTrie;
 448
 449     maybeYesCompositions=inExtraData;
 450     extraData=maybeYesCompositions+((MIN_NORMAL_MAYBE_YES-minMaybeYes)>>OFFSET_SHIFT);
 451
 452     smallFCD=inSmallFCD;
 453 }
 454
 455 U_CDECL_BEGIN
 456
 457 static uint32_t U_CALLCONV
 458 segmentStarterMapper(const void * /*context*/, uint32_t value) {
 459     return value&CANON_NOT_SEGMENT_STARTER;
 460 }
 461
 462 U_CDECL_END
 463
 464 void
 465 Normalizer2Impl::addLcccChars(UnicodeSet &set) const {
 466     UChar32 start = 0, end;
 467     uint32_t norm16;
 468     while ((end = ucptrie_getRange(normTrie, start, UCPMAP_RANGE_FIXED_LEAD_SURROGATES, INERT,
 469                                    nullptr, nullptr, &norm16)) >= 0) {
 470         if (norm16 > Normalizer2Impl::MIN_NORMAL_MAYBE_YES &&
 471                 norm16 != Normalizer2Impl::JAMO_VT) {
 472             set.add(start, end);
 473         } else if (minNoNoCompNoMaybeCC <= norm16 && norm16 < limitNoNo) {
 474             uint16_t fcd16 = getFCD16(start);
 475             if (fcd16 > 0xff) { set.add(start, end); }
 476         }
 477         start = end + 1;
 478     }
 479 }
 480
 481 void
 482 Normalizer2Impl::addPropertyStarts(const USetAdder *sa, UErrorCode & /*errorCode*/) const {
 483     // Add the start code point of each same-value range of the trie.
 484     UChar32 start = 0, end;
 485     uint32_t value;
 486     while ((end = ucptrie_getRange(normTrie, start, UCPMAP_RANGE_FIXED_LEAD_SURROGATES, INERT,
 487                                    nullptr, nullptr, &value)) >= 0) {
 488         sa->add(sa->set, start);
 489         if (start != end && isAlgorithmicNoNo((uint16_t)value) &&
 490                 (value & Normalizer2Impl::DELTA_TCCC_MASK) > Normalizer2Impl::DELTA_TCCC_1) {
 491             // Range of code points with same-norm16-value algorithmic decompositions.
 492             // They might have different non-zero FCD16 values.
 493             uint16_t prevFCD16 = getFCD16(start);
 494             while (++start <= end) {
 495                 uint16_t fcd16 = getFCD16(start);
 496                 if (fcd16 != prevFCD16) {
 497                     sa->add(sa->set, start);
 498                     prevFCD16 = fcd16;
 499                 }
 500             }
 501         }
 502         start = end + 1;
 503     }
 504
 505     /* add Hangul LV syllables and LV+1 because of skippables */
 506     for(UChar c=Hangul::HANGUL_BASE; c<Hangul::HANGUL_LIMIT; c+=Hangul::JAMO_T_COUNT) {
 507         sa->add(sa->set, c);
 508         sa->add(sa->set, c+1);
 509     }
 510     sa->add(sa->set, Hangul::HANGUL_LIMIT); /* add Hangul+1 to continue with other properties */
 511 }
 512
 513 void
 514 Normalizer2Impl::addCanonIterPropertyStarts(const USetAdder *sa, UErrorCode &errorCode) const {
 515     // Add the start code point of each same-value range of the canonical iterator data trie.
 516     if (!ensureCanonIterData(errorCode)) { return; }
 517     // Currently only used for the SEGMENT_STARTER property.
 518     UChar32 start = 0, end;
 519     uint32_t value;
 520     while ((end = ucptrie_getRange(fCanonIterData->trie, start, UCPMAP_RANGE_NORMAL, 0,
 521                                    segmentStarterMapper, nullptr, &value)) >= 0) {
 522         sa->add(sa->set, start);
 523         start = end + 1;
 524     }
 525 }
 526
 527 const UChar *
 528 Normalizer2Impl::copyLowPrefixFromNulTerminated(const UChar *src,
 529                                                 UChar32 minNeedDataCP,
 530                                                 ReorderingBuffer *buffer,
 531                                                 UErrorCode &errorCode) const {
 532     // Make some effort to support NUL-terminated strings reasonably.
 533     // Take the part of the fast quick check loop that does not look up
 534     // data and check the first part of the string.
 535     // After this prefix, determine the string length to simplify the rest
 536     // of the code.
 537     const UChar *prevSrc=src;
 538     UChar c;
 539     while((c=*src++)<minNeedDataCP && c!=0) {}
 540     // Back out the last character for full processing.
 541     // Copy this prefix.
 542     if(--src!=prevSrc) {
 543         if(buffer!=NULL) {
 544             buffer->appendZeroCC(prevSrc, src, errorCode);
 545         }
 546     }
 547     return src;
 548 }
 549
 550 UnicodeString &
 551 Normalizer2Impl::decompose(const UnicodeString &src, UnicodeString &dest,
 552                            UErrorCode &errorCode) const {
 553     if(U_FAILURE(errorCode)) {
 554         dest.setToBogus();
 555         return dest;
 556     }
 557     const UChar *sArray=src.getBuffer();
 558     if(&dest==&src || sArray==NULL) {
 559         errorCode=U_ILLEGAL_ARGUMENT_ERROR;
 560         dest.setToBogus();
 561         return dest;
 562     }
 563     decompose(sArray, sArray+src.length(), dest, src.length(), errorCode);
 564     return dest;
 565 }
 566
 567 void
 568 Normalizer2Impl::decompose(const UChar *src, const UChar *limit,
 569                            UnicodeString &dest,
 570                            int32_t destLengthEstimate,
 571                            UErrorCode &errorCode) const {
 572     if(destLengthEstimate<0 && limit!=NULL) {
 573         destLengthEstimate=(int32_t)(limit-src);
 574     }
 575     dest.remove();
 576     ReorderingBuffer buffer(*this, dest);
 577     if(buffer.init(destLengthEstimate, errorCode)) {
 578         decompose(src, limit, &buffer, errorCode);
 579     }
 580 }
 581
 582 // Dual functionality:
 583 // buffer!=NULL: normalize
 584 // buffer==NULL: isNormalized/spanQuickCheckYes
 585 const UChar *
 586 Normalizer2Impl::decompose(const UChar *src, const UChar *limit,
 587                            ReorderingBuffer *buffer,
 588                            UErrorCode &errorCode) const {
 589     UChar32 minNoCP=minDecompNoCP;
 590     if(limit==NULL) {
 591         src=copyLowPrefixFromNulTerminated(src, minNoCP, buffer, errorCode);
 592         if(U_FAILURE(errorCode)) {
 593             return src;
 594         }
 595         limit=u_strchr(src, 0);
 596     }
 597
 598     const UChar *prevSrc;
 599     UChar32 c=0;
 600     uint16_t norm16=0;
 601
 602     // only for quick check
 603     const UChar *prevBoundary=src;
 604     uint8_t prevCC=0;
 605
 606     for(;;) {
 607         // count code units below the minimum or with irrelevant data for the quick check
 608         for(prevSrc=src; src!=limit;) {
 609             if( (c=*src)<minNoCP ||
 610                 isMostDecompYesAndZeroCC(norm16=UCPTRIE_FAST_BMP_GET(normTrie, UCPTRIE_16, c))
 611             ) {
 612                 ++src;
 613             } else if(!U16_IS_LEAD(c)) {
 614                 break;
 615             } else {
 616                 UChar c2;
 617                 if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) {
 618                     c=U16_GET_SUPPLEMENTARY(c, c2);
 619                     norm16=UCPTRIE_FAST_SUPP_GET(normTrie, UCPTRIE_16, c);
 620                     if(isMostDecompYesAndZeroCC(norm16)) {
 621                         src+=2;
 622                     } else {
 623                         break;
 624                     }
 625                 } else {
 626                     ++src;  // unpaired lead surrogate: inert
 627                 }
 628             }
 629         }
 630         // copy these code units all at once
 631         if(src!=prevSrc) {
 632             if(buffer!=NULL) {
 633                 if(!buffer->appendZeroCC(prevSrc, src, errorCode)) {
 634                     break;
 635                 }
 636             } else {
 637                 prevCC=0;
 638                 prevBoundary=src;
 639             }
 640         }
 641         if(src==limit) {
 642             break;
 643         }
 644
 645         // Check one above-minimum, relevant code point.
 646         src+=U16_LENGTH(c);
 647         if(buffer!=NULL) {
 648             if(!decompose(c, norm16, *buffer, errorCode)) {
 649                 break;
 650             }
 651         } else {
 652             if(isDecompYes(norm16)) {
 653                 uint8_t cc=getCCFromYesOrMaybe(norm16);
 654                 if(prevCC<=cc || cc==0) {
 655                     prevCC=cc;
 656                     if(cc<=1) {
 657                         prevBoundary=src;
 658                     }
 659                     continue;
 660                 }
 661             }
 662             return prevBoundary;  // "no" or cc out of order
 663         }
 664     }
 665     return src;
 666 }
 667
 668 // Decompose a short piece of text which is likely to contain characters that
 669 // fail the quick check loop and/or where the quick check loop's overhead
 670 // is unlikely to be amortized.
 671 // Called by the compose() and makeFCD() implementations.
 672 const UChar *
 673 Normalizer2Impl::decomposeShort(const UChar *src, const UChar *limit,
 674                                 UBool stopAtCompBoundary, UBool onlyContiguous,
 675                                 ReorderingBuffer &buffer, UErrorCode &errorCode) const {
 676     if (U_FAILURE(errorCode)) {
 677         return nullptr;
 678     }
 679     while(src<limit) {
 680         if (stopAtCompBoundary && *src < minCompNoMaybeCP) {
 681             return src;
 682         }
 683         const UChar *prevSrc = src;
 684         UChar32 c;
 685         uint16_t norm16;
 686         UCPTRIE_FAST_U16_NEXT(normTrie, UCPTRIE_16, src, limit, c, norm16);
 687         if (stopAtCompBoundary && norm16HasCompBoundaryBefore(norm16)) {
 688             return prevSrc;
 689         }
 690         if(!decompose(c, norm16, buffer, errorCode)) {
 691             return nullptr;
 692         }
 693         if (stopAtCompBoundary && norm16HasCompBoundaryAfter(norm16, onlyContiguous)) {
 694             return src;
 695         }
 696     }
 697     return src;
 698 }
 699
 700 UBool Normalizer2Impl::decompose(UChar32 c, uint16_t norm16,
 701                                  ReorderingBuffer &buffer,
 702                                  UErrorCode &errorCode) const {
 703     // get the decomposition and the lead and trail cc's
 704     if (norm16 >= limitNoNo) {
 705         if (isMaybeOrNonZeroCC(norm16)) {
 706             return buffer.append(c, getCCFromYesOrMaybe(norm16), errorCode);
 707         }
 708         // Maps to an isCompYesAndZeroCC.
 709         c=mapAlgorithmic(c, norm16);
 710         norm16=getRawNorm16(c);
 711     }
 712     if (norm16 < minYesNo) {
 713         // c does not decompose
 714         return buffer.append(c, 0, errorCode);
 715     } else if(isHangulLV(norm16) || isHangulLVT(norm16)) {
 716         // Hangul syllable: decompose algorithmically
 717         UChar jamos[3];
 718         return buffer.appendZeroCC(jamos, jamos+Hangul::decompose(c, jamos), errorCode);
 719     }
 720     // c decomposes, get everything from the variable-length extra data
 721     const uint16_t *mapping=getMapping(norm16);
 722     uint16_t firstUnit=*mapping;
 723     int32_t length=firstUnit&MAPPING_LENGTH_MASK;
 724     uint8_t leadCC, trailCC;
 725     trailCC=(uint8_t)(firstUnit>>8);
 726     if(firstUnit&MAPPING_HAS_CCC_LCCC_WORD) {
 727         leadCC=(uint8_t)(*(mapping-1)>>8);
 728     } else {
 729         leadCC=0;
 730     }
 731     return buffer.append((const UChar *)mapping+1, length, TRUE, leadCC, trailCC, errorCode);
 732 }
 733
 734 const uint8_t *
 735 Normalizer2Impl::decomposeShort(const uint8_t *src, const uint8_t *limit,
 736                                 UBool stopAtCompBoundary, UBool onlyContiguous,
 737                                 ReorderingBuffer &buffer, UErrorCode &errorCode) const {
 738     if (U_FAILURE(errorCode)) {
 739         return nullptr;
 740     }
 741     while (src < limit) {
 742         const uint8_t *prevSrc = src;
 743         uint16_t norm16;
 744         UCPTRIE_FAST_U8_NEXT(normTrie, UCPTRIE_16, src, limit, norm16);
 745         // Get the decomposition and the lead and trail cc's.
 746         UChar32 c = U_SENTINEL;
 747         if (norm16 >= limitNoNo) {
 748             if (isMaybeOrNonZeroCC(norm16)) {
 749                 // No boundaries around this character.
 750                 c = codePointFromValidUTF8(prevSrc, src);
 751                 if (!buffer.append(c, getCCFromYesOrMaybe(norm16), errorCode)) {
 752                     return nullptr;
 753                 }
 754                 continue;
 755             }
 756             // Maps to an isCompYesAndZeroCC.
 757             if (stopAtCompBoundary) {
 758                 return prevSrc;
 759             }
 760             c = codePointFromValidUTF8(prevSrc, src);
 761             c = mapAlgorithmic(c, norm16);
 762             norm16 = getRawNorm16(c);
 763         } else if (stopAtCompBoundary && norm16 < minNoNoCompNoMaybeCC) {
 764             return prevSrc;
 765         }
 766         // norm16!=INERT guarantees that [prevSrc, src[ is valid UTF-8.
 767         // We do not see invalid UTF-8 here because
 768         // its norm16==INERT is normalization-inert,
 769         // so it gets copied unchanged in the fast path,
 770         // and we stop the slow path where invalid UTF-8 begins.
 771         U_ASSERT(norm16 != INERT);
 772         if (norm16 < minYesNo) {
 773             if (c < 0) {
 774                 c = codePointFromValidUTF8(prevSrc, src);
 775             }
 776             // does not decompose
 777             if (!buffer.append(c, 0, errorCode)) {
 778                 return nullptr;
 779             }
 780         } else if (isHangulLV(norm16) || isHangulLVT(norm16)) {
 781             // Hangul syllable: decompose algorithmically
 782             if (c < 0) {
 783                 c = codePointFromValidUTF8(prevSrc, src);
 784             }
 785             char16_t jamos[3];
 786             if (!buffer.appendZeroCC(jamos, jamos+Hangul::decompose(c, jamos), errorCode)) {
 787                 return nullptr;
 788             }
 789         } else {
 790             // The character decomposes, get everything from the variable-length extra data.
 791             const uint16_t *mapping = getMapping(norm16);
 792             uint16_t firstUnit = *mapping;
 793             int32_t length = firstUnit & MAPPING_LENGTH_MASK;
 794             uint8_t trailCC = (uint8_t)(firstUnit >> 8);
 795             uint8_t leadCC;
 796             if (firstUnit & MAPPING_HAS_CCC_LCCC_WORD) {
 797                 leadCC = (uint8_t)(*(mapping-1) >> 8);
 798             } else {
 799                 leadCC = 0;
 800             }
 801             if (!buffer.append((const char16_t *)mapping+1, length, TRUE, leadCC, trailCC, errorCode)) {
 802                 return nullptr;
 803             }
 804         }
 805         if (stopAtCompBoundary && norm16HasCompBoundaryAfter(norm16, onlyContiguous)) {
 806             return src;
 807         }
 808     }
 809     return src;
 810 }
 811
 812 const UChar *
 813 Normalizer2Impl::getDecomposition(UChar32 c, UChar buffer[4], int32_t &length) const {
 814     uint16_t norm16;
 815     if(c<minDecompNoCP || isMaybeOrNonZeroCC(norm16=getNorm16(c))) {
 816         // c does not decompose
 817         return nullptr;
 818     }
 819     const UChar *decomp = nullptr;
 820     if(isDecompNoAlgorithmic(norm16)) {
 821         // Maps to an isCompYesAndZeroCC.
 822         c=mapAlgorithmic(c, norm16);
 823         decomp=buffer;
 824         length=0;
 825         U16_APPEND_UNSAFE(buffer, length, c);
 826         // The mapping might decompose further.
 827         norm16 = getRawNorm16(c);
 828     }
 829     if (norm16 < minYesNo) {
 830         return decomp;
 831     } else if(isHangulLV(norm16) || isHangulLVT(norm16)) {
 832         // Hangul syllable: decompose algorithmically
 833         length=Hangul::decompose(c, buffer);
 834         return buffer;
 835     }
 836     // c decomposes, get everything from the variable-length extra data
 837     const uint16_t *mapping=getMapping(norm16);
 838     length=*mapping&MAPPING_LENGTH_MASK;
 839     return (const UChar *)mapping+1;
 840 }
 841
 842 // The capacity of the buffer must be 30=MAPPING_LENGTH_MASK-1
 843 // so that a raw mapping fits that consists of one unit ("rm0")
 844 // plus all but the first two code units of the normal mapping.
 845 // The maximum length of a normal mapping is 31=MAPPING_LENGTH_MASK.
 846 const UChar *
 847 Normalizer2Impl::getRawDecomposition(UChar32 c, UChar buffer[30], int32_t &length) const {
 848     uint16_t norm16;
 849     if(c<minDecompNoCP || isDecompYes(norm16=getNorm16(c))) {
 850         // c does not decompose
 851         return NULL;
 852     } else if(isHangulLV(norm16) || isHangulLVT(norm16)) {
 853         // Hangul syllable: decompose algorithmically
 854         Hangul::getRawDecomposition(c, buffer);
 855         length=2;
 856         return buffer;
 857     } else if(isDecompNoAlgorithmic(norm16)) {
 858         c=mapAlgorithmic(c, norm16);
 859         length=0;
 860         U16_APPEND_UNSAFE(buffer, length, c);
 861         return buffer;
 862     }
 863     // c decomposes, get everything from the variable-length extra data
 864     const uint16_t *mapping=getMapping(norm16);
 865     uint16_t firstUnit=*mapping;
 866     int32_t mLength=firstUnit&MAPPING_LENGTH_MASK;  // length of normal mapping
 867     if(firstUnit&MAPPING_HAS_RAW_MAPPING) {
 868         // Read the raw mapping from before the firstUnit and before the optional ccc/lccc word.
 869         // Bit 7=MAPPING_HAS_CCC_LCCC_WORD
 870         const uint16_t *rawMapping=mapping-((firstUnit>>7)&1)-1;
 871         uint16_t rm0=*rawMapping;
 872         if(rm0<=MAPPING_LENGTH_MASK) {
 873             length=rm0;
 874             return (const UChar *)rawMapping-rm0;
 875         } else {
 876             // Copy the normal mapping and replace its first two code units with rm0.
 877             buffer[0]=(UChar)rm0;
 878             u_memcpy(buffer+1, (const UChar *)mapping+1+2, mLength-2);
 879             length=mLength-1;
 880             return buffer;
 881         }
 882     } else {
 883         length=mLength;
 884         return (const UChar *)mapping+1;
 885     }
 886 }
 887
 888 void Normalizer2Impl::decomposeAndAppend(const UChar *src, const UChar *limit,
 889                                          UBool doDecompose,
 890                                          UnicodeString &safeMiddle,
 891                                          ReorderingBuffer &buffer,
 892                                          UErrorCode &errorCode) const {
 893     buffer.copyReorderableSuffixTo(safeMiddle);
 894     if(doDecompose) {
 895         decompose(src, limit, &buffer, errorCode);
 896         return;
 897     }
 898     // Just merge the strings at the boundary.
 899     bool isFirst = true;
 900     uint8_t firstCC = 0, prevCC = 0, cc;
 901     const UChar *p = src;
 902     while (p != limit) {
 903         const UChar *codePointStart = p;
 904         UChar32 c;
 905         uint16_t norm16;
 906         UCPTRIE_FAST_U16_NEXT(normTrie, UCPTRIE_16, p, limit, c, norm16);
 907         if ((cc = getCC(norm16)) == 0) {
 908             p = codePointStart;
 909             break;
 910         }
 911         if (isFirst) {
 912             firstCC = cc;
 913             isFirst = false;
 914         }
 915         prevCC = cc;
 916     }
 917     if(limit==NULL) {  // appendZeroCC() needs limit!=NULL
 918         limit=u_strchr(p, 0);
 919     }
 920
 921     if (buffer.append(src, (int32_t)(p - src), FALSE, firstCC, prevCC, errorCode)) {
 922         buffer.appendZeroCC(p, limit, errorCode);
 923     }
 924 }
 925
 926 UBool Normalizer2Impl::hasDecompBoundaryBefore(UChar32 c) const {
 927     return c < minLcccCP || (c <= 0xffff && !singleLeadMightHaveNonZeroFCD16(c)) ||
 928         norm16HasDecompBoundaryBefore(getNorm16(c));
 929 }
 930
 931 UBool Normalizer2Impl::norm16HasDecompBoundaryBefore(uint16_t norm16) const {
 932     if (norm16 < minNoNoCompNoMaybeCC) {
 933         return TRUE;
 934     }
 935     if (norm16 >= limitNoNo) {
 936         return norm16 <= MIN_NORMAL_MAYBE_YES || norm16 == JAMO_VT;
 937     }
 938     // c decomposes, get everything from the variable-length extra data
 939     const uint16_t *mapping=getMapping(norm16);
 940     uint16_t firstUnit=*mapping;
 941     // TRUE if leadCC==0 (hasFCDBoundaryBefore())
 942     return (firstUnit&MAPPING_HAS_CCC_LCCC_WORD)==0 || (*(mapping-1)&0xff00)==0;
 943 }
 944
 945 UBool Normalizer2Impl::hasDecompBoundaryAfter(UChar32 c) const {
 946     if (c < minDecompNoCP) {
 947         return TRUE;
 948     }
 949     if (c <= 0xffff && !singleLeadMightHaveNonZeroFCD16(c)) {
 950         return TRUE;
 951     }
 952     return norm16HasDecompBoundaryAfter(getNorm16(c));
 953 }
 954
 955 UBool Normalizer2Impl::norm16HasDecompBoundaryAfter(uint16_t norm16) const {
 956     if(norm16 <= minYesNo || isHangulLVT(norm16)) {
 957         return TRUE;
 958     }
 959     if (norm16 >= limitNoNo) {
 960         if (isMaybeOrNonZeroCC(norm16)) {
 961             return norm16 <= MIN_NORMAL_MAYBE_YES || norm16 == JAMO_VT;
 962         }
 963         // Maps to an isCompYesAndZeroCC.
 964         return (norm16 & DELTA_TCCC_MASK) <= DELTA_TCCC_1;
 965     }
 966     // c decomposes, get everything from the variable-length extra data
 967     const uint16_t *mapping=getMapping(norm16);
 968     uint16_t firstUnit=*mapping;
 969     // decomp after-boundary: same as hasFCDBoundaryAfter(),
 970     // fcd16<=1 || trailCC==0
 971     if(firstUnit>0x1ff) {
 972         return FALSE;  // trailCC>1
 973     }
 974     if(firstUnit<=0xff) {
 975         return TRUE;  // trailCC==0
 976     }
 977     // if(trailCC==1) test leadCC==0, same as checking for before-boundary
 978     // TRUE if leadCC==0 (hasFCDBoundaryBefore())
 979     return (firstUnit&MAPPING_HAS_CCC_LCCC_WORD)==0 || (*(mapping-1)&0xff00)==0;
 980 }
 981
 982 /*
 983  * Finds the recomposition result for
 984  * a forward-combining "lead" character,
 985  * specified with a pointer to its compositions list,
 986  * and a backward-combining "trail" character.
 987  *
 988  * If the lead and trail characters combine, then this function returns
 989  * the following "compositeAndFwd" value:
 990  * Bits 21..1  composite character
 991  * Bit      0  set if the composite is a forward-combining starter
 992  * otherwise it returns -1.
 993  *
 994  * The compositions list has (trail, compositeAndFwd) pair entries,
 995  * encoded as either pairs or triples of 16-bit units.
 996  * The last entry has the high bit of its first unit set.
 997  *
 998  * The list is sorted by ascending trail characters (there are no duplicates).
 999  * A linear search is used.
1000  *
1001  * See normalizer2impl.h for a more detailed description
1002  * of the compositions list format.
1003  */
1004 int32_t Normalizer2Impl::combine(const uint16_t *list, UChar32 trail) {
1005     uint16_t key1, firstUnit;
1006     if(trail<COMP_1_TRAIL_LIMIT) {
1007         // trail character is 0..33FF
1008         // result entry may have 2 or 3 units
1009         key1=(uint16_t)(trail<<1);
1010         while(key1>(firstUnit=*list)) {
1011             list+=2+(firstUnit&COMP_1_TRIPLE);
1012         }
1013         if(key1==(firstUnit&COMP_1_TRAIL_MASK)) {
1014             if(firstUnit&COMP_1_TRIPLE) {
1015                 return ((int32_t)list[1]<<16)|list[2];
1016             } else {
1017                 return list[1];
1018             }
1019         }
1020     } else {
1021         // trail character is 3400..10FFFF
1022         // result entry has 3 units
1023         key1=(uint16_t)(COMP_1_TRAIL_LIMIT+
1024                         (((trail>>COMP_1_TRAIL_SHIFT))&
1025                           ~COMP_1_TRIPLE));
1026         uint16_t key2=(uint16_t)(trail<<COMP_2_TRAIL_SHIFT);
1027         uint16_t secondUnit;
1028         for(;;) {
1029             if(key1>(firstUnit=*list)) {
1030                 list+=2+(firstUnit&COMP_1_TRIPLE);
1031             } else if(key1==(firstUnit&COMP_1_TRAIL_MASK)) {
1032                 if(key2>(secondUnit=list[1])) {
1033                     if(firstUnit&COMP_1_LAST_TUPLE) {
1034                         break;
1035                     } else {
1036                         list+=3;
1037                     }
1038                 } else if(key2==(secondUnit&COMP_2_TRAIL_MASK)) {
1039                     return ((int32_t)(secondUnit&~COMP_2_TRAIL_MASK)<<16)|list[2];
1040                 } else {
1041                     break;
1042                 }
1043             } else {
1044                 break;
1045             }
1046         }
1047     }
1048     return -1;
1049 }
1050
1051 /**
1052   * @param list some character's compositions list
1053   * @param set recursively receives the composites from these compositions
1054   */
1055 void Normalizer2Impl::addComposites(const uint16_t *list, UnicodeSet &set) const {
1056     uint16_t firstUnit;
1057     int32_t compositeAndFwd;
1058     do {
1059         firstUnit=*list;
1060         if((firstUnit&COMP_1_TRIPLE)==0) {
1061             compositeAndFwd=list[1];
1062             list+=2;
1063         } else {
1064             compositeAndFwd=(((int32_t)list[1]&~COMP_2_TRAIL_MASK)<<16)|list[2];
1065             list+=3;
1066         }
1067         UChar32 composite=compositeAndFwd>>1;
1068         if((compositeAndFwd&1)!=0) {
1069             addComposites(getCompositionsListForComposite(getRawNorm16(composite)), set);
1070         }
1071         set.add(composite);
1072     } while((firstUnit&COMP_1_LAST_TUPLE)==0);
1073 }
1074
1075 /*
1076  * Recomposes the buffer text starting at recomposeStartIndex
1077  * (which is in NFD - decomposed and canonically ordered),
1078  * and truncates the buffer contents.
1079  *
1080  * Note that recomposition never lengthens the text:
1081  * Any character consists of either one or two code units;
1082  * a composition may contain at most one more code unit than the original starter,
1083  * while the combining mark that is removed has at least one code unit.
      *
      * The text between buffer.getStart()+recomposeStartIndex and buffer.getLimit()
      * is rewritten in place; the resulting end of text is handed to
      * buffer.setReorderingLimit() before returning.
      * Nothing is done (not even setReorderingLimit()) if that range is empty.
      *
      * Parameters:
      *   buffer               holds the text to recompose; modified in place
      *   recomposeStartIndex  offset from buffer.getStart() where recomposition begins
      *   onlyContiguous       if TRUE, a character with cc!=0 that did not combine
      *                        makes the loop forget the current starter
      *                        ("FCC" in the comments below)
1084  */
1085 void Normalizer2Impl::recompose(ReorderingBuffer &buffer, int32_t recomposeStartIndex,
1086                                 UBool onlyContiguous) const {
1087     UChar *p=buffer.getStart()+recomposeStartIndex;
1088     UChar *limit=buffer.getLimit();
1089     if(p==limit) {
1090         return;
1091     }
1092
          // starter: start of the most recent forward-combining starter in the buffer.
          // pRemove: start of the code units to delete after a composition.
          // q, r:    write and read cursors for the in-place copy loops.
1093     UChar *starter, *pRemove, *q, *r;
1094     const uint16_t *compositionsList;
1095     UChar32 c, compositeAndFwd;
1096     uint16_t norm16;
1097     uint8_t cc, prevCC;
1098     UBool starterIsSupplementary;
1099
1100     // Some of the following variables are not used until we have a forward-combining starter
1101     // and are only initialized now to avoid compiler warnings.
1102     compositionsList=NULL;  // used as indicator for whether we have a forward-combining starter
1103     starter=NULL;
1104     starterIsSupplementary=FALSE;
1105     prevCC=0;
1106
          // Each pass reads one code point c (advancing p past it) together with its norm16 value.
          // The loop is left via break as soon as p reaches limit.
1107     for(;;) {
1108         UCPTRIE_FAST_U16_NEXT(normTrie, UCPTRIE_16, p, limit, c, norm16);
1109         cc=getCCFromYesOrMaybe(norm16);
1110         if( // this character combines backward and
1111             isMaybe(norm16) &&
1112             // we have seen a starter that combines forward and
1113             compositionsList!=NULL &&
1114             // the backward-combining character is not blocked
1115             (prevCC<cc || prevCC==0)
1116         ) {
1117             if(isJamoVT(norm16)) {
1118                 // c is a Jamo V/T, see if we can compose it with the previous character.
1119                 if(c<Hangul::JAMO_T_BASE) {
1120                     // c is a Jamo Vowel, compose with previous Jamo L and following Jamo T.
                          // prev is the starter's offset from JAMO_L_BASE.
1121                     UChar prev=(UChar)(*starter-Hangul::JAMO_L_BASE);
1122                     if(prev<Hangul::JAMO_L_COUNT) {
1123                         pRemove=p-1;
1124                         UChar syllable=(UChar)
1125                             (Hangul::HANGUL_BASE+
1126                              (prev*Hangul::JAMO_V_COUNT+(c-Hangul::JAMO_V_BASE))*
1127                              Hangul::JAMO_T_COUNT);
1128                         UChar t;
1129                         if(p!=limit && (t=(UChar)(*p-Hangul::JAMO_T_BASE))<Hangul::JAMO_T_COUNT) {
1130                             ++p;
1131                             syllable+=t;  // The next character was a Jamo T.
1132                         }
1133                         *starter=syllable;
1134                         // remove the Jamo V/T
                              // Shift the text from p up to limit down onto pRemove,
                              // then continue reading at the shifted text.
1135                         q=pRemove;
1136                         r=p;
1137                         while(r<limit) {
1138                             *q++=*r++;
1139                         }
1140                         limit=q;
1141                         p=pRemove;
1142                     }
1143                 }
1144                 /*
1145                  * No "else" for Jamo T:
1146                  * Since the input is in NFD, there are no Hangul LV syllables that
1147                  * a Jamo T could combine with.
1148                  * All Jamo Ts are combined above when handling Jamo Vs.
1149                  */
1150                 if(p==limit) {
1151                     break;
1152                 }
                      // Forget the current starter; a new one is only picked up
                      // by the cc==0 branch at the bottom of the loop.
1153                 compositionsList=NULL;
1154                 continue;
1155             } else if((compositeAndFwd=combine(compositionsList, c))>=0) {
1156                 // The starter and the combining mark (c) do combine.
1157                 UChar32 composite=compositeAndFwd>>1;
1158
1159                 // Replace the starter with the composite, remove the combining mark.
1160                 pRemove=p-U16_LENGTH(c);  // pRemove & p: start & limit of the combining mark
1161                 if(starterIsSupplementary) {
1162                     if(U_IS_SUPPLEMENTARY(composite)) {
1163                         // both are supplementary
1164                         starter[0]=U16_LEAD(composite);
1165                         starter[1]=U16_TRAIL(composite);
1166                     } else {
1167                         *starter=(UChar)composite;
1168                         // The composite is shorter than the starter,
1169                         // move the intermediate characters forward one.
1170                         starterIsSupplementary=FALSE;
1171                         q=starter+1;
1172                         r=q+1;
1173                         while(r<pRemove) {
1174                             *q++=*r++;
1175                         }
                              // The gap to close now starts one unit earlier.
1176                         --pRemove;
1177                     }
1178                 } else if(U_IS_SUPPLEMENTARY(composite)) {
1179                     // The composite is longer than the starter,
1180                     // move the intermediate characters back one.
1181                     starterIsSupplementary=TRUE;
1182                     ++starter;  // temporarily increment for the loop boundary
1183                     q=pRemove;
1184                     r=++pRemove;
                          // Copy backward so that the overlapping ranges do not clobber each other.
1185                     while(starter<q) {
1186                         *--r=*--q;
1187                     }
1188                     *starter=U16_TRAIL(composite);
1189                     *--starter=U16_LEAD(composite);  // undo the temporary increment
1190                 } else {
1191                     // both are on the BMP
1192                     *starter=(UChar)composite;
1193                 }
1194
1195                 /* remove the combining mark by moving the following text over it */
                      // pRemove==p is possible here: the composite grew by one unit
                      // and the removed mark was one unit long, so nothing is left to close.
1196                 if(pRemove<p) {
1197                     q=pRemove;
1198                     r=p;
1199                     while(r<limit) {
1200                         *q++=*r++;
1201                     }
1202                     limit=q;
1203                     p=pRemove;
1204                 }
1205                 // Keep prevCC because we removed the combining mark.
1206
1207                 if(p==limit) {
1208                     break;
1209                 }
1210                 // Is the composite a starter that combines forward?
1211                 if(compositeAndFwd&1) {
1212                     compositionsList=
1213                         getCompositionsListForComposite(getRawNorm16(composite));
1214                 } else {
1215                     compositionsList=NULL;
1216                 }
1217
1218                 // We combined; continue with looking for compositions.
1219                 continue;
1220             }
1221         }
1222
1223         // no combination this time
1224         prevCC=cc;
1225         if(p==limit) {
1226             break;
1227         }
1228
1229         // If c did not combine, then check if it is a starter.
1230         if(cc==0) {
1231             // Found a new starter.
1232             if((compositionsList=getCompositionsListForDecompYes(norm16))!=NULL) {
1233                 // It may combine with something, prepare for it.
                      // p is already past c: step back one code unit for a BMP
                      // character and two for a supplementary one.
1234                 if(U_IS_BMP(c)) {
1235                     starterIsSupplementary=FALSE;
1236                     starter=p-1;
1237                 } else {
1238                     starterIsSupplementary=TRUE;
1239                     starter=p-2;
1240                 }
1241             }
1242         } else if(onlyContiguous) {
1243             // FCC: no discontiguous compositions; any intervening character blocks.
1244             compositionsList=NULL;
1245         }
1246     }
          // Publish the (possibly shortened) end of text to the buffer.
1247     buffer.setReorderingLimit(limit);
1248 }
1249
1250 UChar32
1251 Normalizer2Impl::composePair(UChar32 a, UChar32 b) const {
1252     uint16_t norm16=getNorm16(a);  // maps an out-of-range 'a' to inert norm16
1253     const uint16_t *list;
1254     if(isInert(norm16)) {
1255         return U_SENTINEL;
1256     } else if(norm16<minYesNoMappingsOnly) {
1257         // a combines forward.
1258         if(isJamoL(norm16)) {
1259             b-=Hangul::JAMO_V_BASE;
1260             if(0<=b && b<Hangul::JAMO_V_COUNT) {
1261                 return
1262                     (Hangul::HANGUL_BASE+
1263                      ((a-Hangul::JAMO_L_BASE)*Hangul::JAMO_V_COUNT+b)*
1264                      Hangul::JAMO_T_COUNT);
1265             } else {
1266                 return U_SENTINEL;
1267             }
1268         } else if(isHangulLV(norm16)) {
1269             b-=Hangul::JAMO_T_BASE;
1270             if(0<b && b<Hangul::JAMO_T_COUNT) {  // not b==0!
1271                 return a+b;
1272             } else {
1273                 return U_SENTINEL;
1274             }
1275         } else {
1276             // 'a' has a compositions list in extraData
1277             list=getMapping(norm16);
1278             if(norm16>minYesNo) {  // composite 'a' has both mapping & compositions list
1279                 list+=  // mapping pointer
1280                     1+  // +1 to skip the first unit with the mapping length
1281                     (*list&MAPPING_LENGTH_MASK);  // + mapping length
1282             }
1283         }
1284     } else if(norm16<minMaybeYes || MIN_NORMAL_MAYBE_YES<=norm16) {
1285         return U_SENTINEL;
1286     } else {
1287         list=getCompositionsListForMaybe(norm16);
1288     }
1289     if(b<0 || 0x10ffff<b) {  // combine(list, b) requires a valid code point b
1290         return U_SENTINEL;
1291     }
1292 #if U_SIGNED_RIGHT_SHIFT_IS_ARITHMETIC
1293     return combine(list, b)>>1;
1294 #else
1295     int32_t compositeAndFwd=combine(list, b);
1296     return compositeAndFwd>=0 ? compositeAndFwd>>1 : U_SENTINEL;
1297 #endif
1298 }
1299
1300 // Very similar to composeQuickCheck(): Make the same changes in both places if relevant.
1301 // doCompose: normalize
1302 // !doCompose: isNormalized (buffer must be empty and initialized)
     //
     // Parameters as used below:
     //   src, limit      input text [src, limit); limit==NULL means src is NUL-terminated
     //   onlyContiguous  passed to the boundary tests, decomposeShort() and recompose();
     //                   the inline comments call the TRUE case "FCC"
     //   doCompose       TRUE: append the composed text to buffer;
     //                   FALSE: only test the input, using buffer as scratch space
     //   buffer          output / scratch ReorderingBuffer
     //   errorCode       tested after the low-prefix copy and after decomposeShort();
     //                   set to U_INDEX_OUTOFBOUNDS_ERROR here if a span [prevSrc, src)
     //                   is longer than INT32_MAX
     //
     // Returns FALSE if errorCode is a failure after the low-prefix copy, or if !doCompose
     // and the input turns out not to be normalized.
     // Returns TRUE in all other cases. That includes every exit from the main loop
     // via break (a failed buffer append, or a failing errorCode after decomposeShort()),
     // so the return value alone does not signal those failures.
1303 UBool
1304 Normalizer2Impl::compose(const UChar *src, const UChar *limit,
1305                          UBool onlyContiguous,
1306                          UBool doCompose,
1307                          ReorderingBuffer &buffer,
1308                          UErrorCode &errorCode) const {
          // prevBoundary: start of the input that has not been written to buffer yet.
1309     const UChar *prevBoundary=src;
1310     UChar32 minNoMaybeCP=minCompNoMaybeCP;
1311     if(limit==NULL) {
1312         src=copyLowPrefixFromNulTerminated(src, minNoMaybeCP,
1313                                            doCompose ? &buffer : NULL,
1314                                            errorCode);
1315         if(U_FAILURE(errorCode)) {
1316             return FALSE;
1317         }
1318         limit=u_strchr(src, 0);
1319         if (prevBoundary != src) {
                  // A prefix was skipped. If there is no boundary after its last code unit,
                  // back up one unit so that it is processed together with what follows.
1320             if (hasCompBoundaryAfter(*(src-1), onlyContiguous)) {
1321                 prevBoundary = src;
1322             } else {
1323                 buffer.removeSuffix(1);
1324                 prevBoundary = --src;
1325             }
1326         }
1327     }
1328
1329     for (;;) {
1330         // Fast path: Scan over a sequence of characters below the minimum "no or maybe" code point,
1331         // or with (compYes && ccc==0) properties.
1332         const UChar *prevSrc;
1333         UChar32 c = 0;
1334         uint16_t norm16 = 0;
1335         for (;;) {
1336             if (src == limit) {
1337                 if (prevBoundary != limit && doCompose) {
1338                     buffer.appendZeroCC(prevBoundary, limit, errorCode);
1339                 }
1340                 return TRUE;
1341             }
1342             if( (c=*src)<minNoMaybeCP ||
1343                 isCompYesAndZeroCC(norm16=UCPTRIE_FAST_BMP_GET(normTrie, UCPTRIE_16, c))
1344             ) {
1345                 ++src;
1346             } else {
                      // prevSrc marks the start of the character that stopped the fast path.
1347                 prevSrc = src++;
1348                 if(!U16_IS_LEAD(c)) {
1349                     break;
1350                 } else {
                          // A lead surrogate that is not followed by a trail surrogate,
                          // or a supplementary code point that is compYes with ccc==0,
                          // does not break out: the scan simply continues.
1351                     UChar c2;
1352                     if(src!=limit && U16_IS_TRAIL(c2=*src)) {
1353                         ++src;
1354                         c=U16_GET_SUPPLEMENTARY(c, c2);
1355                         norm16=UCPTRIE_FAST_SUPP_GET(normTrie, UCPTRIE_16, c);
1356                         if(!isCompYesAndZeroCC(norm16)) {
1357                             break;
1358                         }
1359                     }
1360                 }
1361             }
1362         }
1363         // isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.
1364         // The current character is either a "noNo" (has a mapping)
1365         // or a "maybeYes" (combines backward)
1366         // or a "yesYes" with ccc!=0.
1367         // It is not a Hangul syllable or Jamo L because those have "yes" properties.
1368
1369         // Medium-fast path: Handle cases that do not require full decomposition and recomposition.
1370         if (!isMaybeOrNonZeroCC(norm16)) {  // minNoNo <= norm16 < minMaybeYes
1371             if (!doCompose) {
1372                 return FALSE;
1373             }
1374             // Fast path for mapping a character that is immediately surrounded by boundaries.
1375             // In this case, we need not decompose around the current character.
1376             if (isDecompNoAlgorithmic(norm16)) {
1377                 // Maps to a single isCompYesAndZeroCC character
1378                 // which also implies hasCompBoundaryBefore.
1379                 if (norm16HasCompBoundaryAfter(norm16, onlyContiguous) ||
1380                         hasCompBoundaryBefore(src, limit)) {
1381                     if (prevBoundary != prevSrc && !buffer.appendZeroCC(prevBoundary, prevSrc, errorCode)) {
1382                         break;
1383                     }
1384                     if(!buffer.append(mapAlgorithmic(c, norm16), 0, errorCode)) {
1385                         break;
1386                     }
1387                     prevBoundary = src;
1388                     continue;
1389                 }
1390             } else if (norm16 < minNoNoCompBoundaryBefore) {
1391                 // The mapping is comp-normalized which also implies hasCompBoundaryBefore.
1392                 if (norm16HasCompBoundaryAfter(norm16, onlyContiguous) ||
1393                         hasCompBoundaryBefore(src, limit)) {
1394                     if (prevBoundary != prevSrc && !buffer.appendZeroCC(prevBoundary, prevSrc, errorCode)) {
1395                         break;
1396                     }
                          // The first unit of the mapping data holds the length; the text follows it.
1397                     const UChar *mapping = reinterpret_cast<const UChar *>(getMapping(norm16));
1398                     int32_t length = *mapping++ & MAPPING_LENGTH_MASK;
1399                     if(!buffer.appendZeroCC(mapping, mapping + length, errorCode)) {
1400                         break;
1401                     }
1402                     prevBoundary = src;
1403                     continue;
1404                 }
1405             } else if (norm16 >= minNoNoEmpty) {
1406                 // The current character maps to nothing.
1407                 // Simply omit it from the output if there is a boundary before _or_ after it.
1408                 // The character itself implies no boundaries.
1409                 if (hasCompBoundaryBefore(src, limit) ||
1410                         hasCompBoundaryAfter(prevBoundary, prevSrc, onlyContiguous)) {
1411                     if (prevBoundary != prevSrc && !buffer.appendZeroCC(prevBoundary, prevSrc, errorCode)) {
1412                         break;
1413                     }
1414                     prevBoundary = src;
1415                     continue;
1416                 }
1417             }
1418             // Other "noNo" type, or need to examine more text around this character:
1419             // Fall through to the slow path.
1420         } else if (isJamoVT(norm16) && prevBoundary != prevSrc) {
                  // prevBoundary != prevSrc guarantees that there is a code unit
                  // before prevSrc inside the not-yet-written input.
1421             UChar prev=*(prevSrc-1);
1422             if(c<Hangul::JAMO_T_BASE) {
1423                 // The current character is a Jamo Vowel,
1424                 // compose with previous Jamo L and following Jamo T.
1425                 UChar l = (UChar)(prev-Hangul::JAMO_L_BASE);
1426                 if(l<Hangul::JAMO_L_COUNT) {
1427                     if (!doCompose) {
1428                         return FALSE;
1429                     }
                          // t: offset of a following Jamo T, 0 for "no Jamo T",
                          // or -1 for "cannot decide here".
1430                     int32_t t;
1431                     if (src != limit &&
1432                             0 < (t = ((int32_t)*src - Hangul::JAMO_T_BASE)) &&
1433                             t < Hangul::JAMO_T_COUNT) {
1434                         // The next character is a Jamo T.
1435                         ++src;
1436                     } else if (hasCompBoundaryBefore(src, limit)) {
1437                         // No Jamo T follows, not even via decomposition.
1438                         t = 0;
1439                     } else {
1440                         t = -1;
1441                     }
1442                     if (t >= 0) {
1443                         UChar32 syllable = Hangul::HANGUL_BASE +
1444                             (l*Hangul::JAMO_V_COUNT + (c-Hangul::JAMO_V_BASE)) *
1445                             Hangul::JAMO_T_COUNT + t;
1446                         --prevSrc;  // Replace the Jamo L as well.
1447                         if (prevBoundary != prevSrc && !buffer.appendZeroCC(prevBoundary, prevSrc, errorCode)) {
1448                             break;
1449                         }
1450                         if(!buffer.appendBMP((UChar)syllable, 0, errorCode)) {
1451                             break;
1452                         }
1453                         prevBoundary = src;
1454                         continue;
1455                     }
1456                     // If we see L+V+x where x!=T then we drop to the slow path,
1457                     // decompose and recompose.
1458                     // This is to deal with NFKC finding normal L and V but a
1459                     // compatibility variant of a T.
1460                     // We need to either fully compose that combination here
1461                     // (which would complicate the code and may not work with strange custom data)
1462                     // or use the slow path.
1463                 }
1464             } else if (Hangul::isHangulLV(prev)) {
1465                 // The current character is a Jamo Trailing consonant,
1466                 // compose with previous Hangul LV that does not contain a Jamo T.
1467                 if (!doCompose) {
1468                     return FALSE;
1469                 }
1470                 UChar32 syllable = prev + c - Hangul::JAMO_T_BASE;
1471                 --prevSrc;  // Replace the Hangul LV as well.
1472                 if (prevBoundary != prevSrc && !buffer.appendZeroCC(prevBoundary, prevSrc, errorCode)) {
1473                     break;
1474                 }
1475                 if(!buffer.appendBMP((UChar)syllable, 0, errorCode)) {
1476                     break;
1477                 }
1478                 prevBoundary = src;
1479                 continue;
1480             }
1481             // No matching context, or may need to decompose surrounding text first:
1482             // Fall through to the slow path.
1483         } else if (norm16 > JAMO_VT) {  // norm16 >= MIN_YES_YES_WITH_CC
1484             // One or more combining marks that do not combine-back:
1485             // Check for canonical order, copy unchanged if ok and
1486             // if followed by a character with a boundary-before.
1487             uint8_t cc = getCCFromNormalYesOrMaybe(norm16);  // cc!=0
1488             if (onlyContiguous /* FCC */ && getPreviousTrailCC(prevBoundary, prevSrc) > cc) {
1489                 // Fails FCD test, need to decompose and contiguously recompose.
1490                 if (!doCompose) {
1491                     return FALSE;
1492                 }
1493             } else {
1494                 // If !onlyContiguous (not FCC), then we ignore the tccc of
1495                 // the previous character which passed the quick check "yes && ccc==0" test.
1496                 const UChar *nextSrc;
1497                 uint16_t n16;
                      // Look ahead one code point at a time via nextSrc;
                      // src only advances over marks that are in canonical order.
1498                 for (;;) {
1499                     if (src == limit) {
1500                         if (doCompose) {
1501                             buffer.appendZeroCC(prevBoundary, limit, errorCode);
1502                         }
1503                         return TRUE;
1504                     }
1505                     uint8_t prevCC = cc;
1506                     nextSrc = src;
1507                     UCPTRIE_FAST_U16_NEXT(normTrie, UCPTRIE_16, nextSrc, limit, c, n16);
1508                     if (n16 >= MIN_YES_YES_WITH_CC) {
1509                         cc = getCCFromNormalYesOrMaybe(n16);
1510                         if (prevCC > cc) {
1511                             if (!doCompose) {
1512                                 return FALSE;
1513                             }
1514                             break;
1515                         }
1516                     } else {
1517                         break;
1518                     }
1519                     src = nextSrc;
1520                 }
1521                 // src is after the last in-order combining mark.
1522                 // If there is a boundary here, then we continue with no change.
1523                 if (norm16HasCompBoundaryBefore(n16)) {
                          // The look-ahead character can be consumed right away
                          // if it needs no further processing.
1524                     if (isCompYesAndZeroCC(n16)) {
1525                         src = nextSrc;
1526                     }
1527                     continue;
1528                 }
1529                 // Use the slow path. There is no boundary in [prevSrc, src[.
1530             }
1531         }
1532
1533         // Slow path: Find the nearest boundaries around the current character,
1534         // decompose and recompose.
1535         if (prevBoundary != prevSrc && !norm16HasCompBoundaryBefore(norm16)) {
                  // Step back over the previous code point; include it in the span
                  // to be decomposed unless there is a boundary after it.
1536             const UChar *p = prevSrc;
1537             UCPTRIE_FAST_U16_PREV(normTrie, UCPTRIE_16, prevBoundary, p, c, norm16);
1538             if (!norm16HasCompBoundaryAfter(norm16, onlyContiguous)) {
1539                 prevSrc = p;
1540             }
1541         }
1542         if (doCompose && prevBoundary != prevSrc && !buffer.appendZeroCC(prevBoundary, prevSrc, errorCode)) {
1543             break;
1544         }
1545         int32_t recomposeStartIndex=buffer.length();
1546         // We know there is not a boundary here.
1547         decomposeShort(prevSrc, src, FALSE /* !stopAtCompBoundary */, onlyContiguous,
1548                        buffer, errorCode);
1549         // Decompose until the next boundary.
1550         src = decomposeShort(src, limit, TRUE /* stopAtCompBoundary */, onlyContiguous,
1551                              buffer, errorCode);
1552         if (U_FAILURE(errorCode)) {
1553             break;
1554         }
1555         if ((src - prevSrc) > INT32_MAX) {  // guard before buffer.equals()
1556             errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
1557             return TRUE;
1558         }
1559         recompose(buffer, recomposeStartIndex, onlyContiguous);
1560         if(!doCompose) {
                  // isNormalized mode: the recomposed text must equal the input span;
                  // afterwards the scratch buffer is emptied for the next span.
1561             if(!buffer.equals(prevSrc, src)) {
1562                 return FALSE;
1563             }
1564             buffer.remove();
1565         }
1566         prevBoundary=src;
1567     }
          // Reached only via break from the main loop (failed append or failing errorCode).
1568     return TRUE;
1569 }
1570
1571 // Very similar to compose(): Make the same changes in both places if relevant.
1572 // pQCResult==NULL: spanQuickCheckYes
1573 // pQCResult!=NULL: quickCheck (*pQCResult must be UNORM_YES)
     //
     // Scans [src, limit[ without writing any output.
     // limit==NULL: src is NUL-terminated and the limit is found with u_strchr().
     // Return value:
     // - src (==limit) when the scan reaches the end of the text;
     // - otherwise prevBoundary, the last composition boundary that was recorded
     //   before the character that stopped the scan.
     // Effect on *pQCResult (only written when pQCResult!=NULL):
     // - UNORM_MAYBE when the combining-mark loop sees norm16 < MIN_YES_YES_WITH_CC;
     //   the scan then goes on, and a later UNORM_NO overwrites it;
     // - UNORM_NO immediately before the final "return prevBoundary".
     // With pQCResult==NULL the first such "maybe" character ends the scan at prevBoundary.
1574 const UChar *
1575 Normalizer2Impl::composeQuickCheck(const UChar *src, const UChar *limit,
1576                                    UBool onlyContiguous,
1577                                    UNormalizationCheckResult *pQCResult) const {
1578     const UChar *prevBoundary=src;
1579     UChar32 minNoMaybeCP=minCompNoMaybeCP;
1580     if(limit==NULL) {
             // NUL-terminated input. This function has no UErrorCode parameter, so a local
             // error code is handed to the helper; it is not inspected afterwards.
1581         UErrorCode errorCode=U_ZERO_ERROR;
1582         src=copyLowPrefixFromNulTerminated(src, minNoMaybeCP, NULL, errorCode);
1583         limit=u_strchr(src, 0);
             // If the helper moved src forward: keep the boundary at src when the code unit
             // just before src has a boundary after it; otherwise step src back by one unit
             // so that the main loop examines that unit again together with what follows.
1584         if (prevBoundary != src) {
1585             if (hasCompBoundaryAfter(*(src-1), onlyContiguous)) {
1586                 prevBoundary = src;
1587             } else {
1588                 prevBoundary = --src;
1589             }
1590         }
1591     }
1592
1593     for(;;) {
1594         // Fast path: Scan over a sequence of characters below the minimum "no or maybe" code point,
1595         // or with (compYes && ccc==0) properties.
1596         const UChar *prevSrc;
1597         UChar32 c = 0;
1598         uint16_t norm16 = 0;
1599         for (;;) {
1600             if(src==limit) {
1601                 return src;
1602             }
1603             if( (c=*src)<minNoMaybeCP ||
1604                 isCompYesAndZeroCC(norm16=UCPTRIE_FAST_BMP_GET(normTrie, UCPTRIE_16, c))
1605             ) {
1606                 ++src;
1607             } else {
                     // The code unit failed the test with its BMP trie value.
                     // prevSrc marks its start. A lead surrogate followed by a trail surrogate
                     // is re-tested with the supplementary code point's trie value;
                     // an unpaired lead surrogate does not leave the fast path.
1608                 prevSrc = src++;
1609                 if(!U16_IS_LEAD(c)) {
1610                     break;
1611                 } else {
1612                     UChar c2;
1613                     if(src!=limit && U16_IS_TRAIL(c2=*src)) {
1614                         ++src;
1615                         c=U16_GET_SUPPLEMENTARY(c, c2);
1616                         norm16=UCPTRIE_FAST_SUPP_GET(normTrie, UCPTRIE_16, c);
1617                         if(!isCompYesAndZeroCC(norm16)) {
1618                             break;
1619                         }
1620                     }
1621                 }
1622             }
1623         }
1624         // isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.
1625         // The current character is either a "noNo" (has a mapping)
1626         // or a "maybeYes" (combines backward)
1627         // or a "yesYes" with ccc!=0.
1628         // It is not a Hangul syllable or Jamo L because those have "yes" properties.
1629
             // Update prevBoundary for the current character [prevSrc, src[.
             // If the character has no boundary before it, look at the code point that ends
             // at prevSrc: when that one has no boundary after it either, prevBoundary moves
             // to its start and its norm16 is kept in prevNorm16 for the FCC test below.
             // prevNorm16 stays INERT in all other cases.
1630         uint16_t prevNorm16 = INERT;
1631         if (prevBoundary != prevSrc) {
1632             if (norm16HasCompBoundaryBefore(norm16)) {
1633                 prevBoundary = prevSrc;
1634             } else {
1635                 const UChar *p = prevSrc;
1636                 uint16_t n16;
1637                 UCPTRIE_FAST_U16_PREV(normTrie, UCPTRIE_16, prevBoundary, p, c, n16);
1638                 if (norm16HasCompBoundaryAfter(n16, onlyContiguous)) {
1639                     prevBoundary = prevSrc;
1640                 } else {
1641                     prevBoundary = p;
1642                     prevNorm16 = n16;
1643                 }
1644             }
1645         }
1646
1647         if(isMaybeOrNonZeroCC(norm16)) {
1648             uint8_t cc=getCCFromYesOrMaybe(norm16);
1649             if (onlyContiguous /* FCC */ && cc != 0 &&
1650                     getTrailCCFromCompYesAndZeroCC(prevNorm16) > cc) {
1651                 // The [prevBoundary..prevSrc[ character
1652                 // passed the quick check "yes && ccc==0" test
1653                 // but is out of canonical order with the current combining mark.
                     // Nothing to do in this branch: control falls through to the "no" result below.
1654             } else {
1655                 // If !onlyContiguous (not FCC), then we ignore the tccc of
1656                 // the previous character which passed the quick check "yes && ccc==0" test.
1657                 const UChar *nextSrc;
                     // Walk over following "maybe" characters and combining marks while their
                     // combining classes do not decrease (cc==0 is always accepted).
                     // On break, norm16 belongs to the code point at [src, nextSrc[ that ended the run.
1658                 for (;;) {
1659                     if (norm16 < MIN_YES_YES_WITH_CC) {
1660                         if (pQCResult != nullptr) {
1661                             *pQCResult = UNORM_MAYBE;
1662                         } else {
1663                             return prevBoundary;
1664                         }
1665                     }
1666                     if (src == limit) {
1667                         return src;
1668                     }
1669                     uint8_t prevCC = cc;
1670                     nextSrc = src;
1671                     UCPTRIE_FAST_U16_NEXT(normTrie, UCPTRIE_16, nextSrc, limit, c, norm16);
1672                     if (isMaybeOrNonZeroCC(norm16)) {
1673                         cc = getCCFromYesOrMaybe(norm16);
1674                         if (!(prevCC <= cc || cc == 0)) {
1675                             break;
1676                         }
1677                     } else {
1678                         break;
1679                     }
1680                     src = nextSrc;
1681                 }
1682                 // src is after the last in-order combining mark.
                     // If the character that ended the run is (compYes && ccc==0), record the
                     // boundary before it, skip it, and resume the fast path.
1683                 if (isCompYesAndZeroCC(norm16)) {
1684                     prevBoundary = src;
1685                     src = nextSrc;
1686                     continue;
1687                 }
1688             }
1689         }
             // Reached by every case that did not return or "continue" above:
             // report UNORM_NO (when a result is requested) and return the last boundary.
1690         if(pQCResult!=NULL) {
1691             *pQCResult=UNORM_NO;
1692         }
1693         return prevBoundary;
1694     }
1695 }
1696
1697 void Normalizer2Impl::composeAndAppend(const UChar *src, const UChar *limit,
1698                                        UBool doCompose,
1699                                        UBool onlyContiguous,
1700                                        UnicodeString &safeMiddle,
1701                                        ReorderingBuffer &buffer,
1702                                        UErrorCode &errorCode) const {
1703     if(!buffer.isEmpty()) {
1704         const UChar *firstStarterInSrc=findNextCompBoundary(src, limit, onlyContiguous);
1705         if(src!=firstStarterInSrc) {
1706             const UChar *lastStarterInDest=findPreviousCompBoundary(buffer.getStart(),
1707                                                                     buffer.getLimit(), onlyContiguous);
1708             int32_t destSuffixLength=(int32_t)(buffer.getLimit()-lastStarterInDest);
1709             UnicodeString middle(lastStarterInDest, destSuffixLength);
1710             buffer.removeSuffix(destSuffixLength);
1711             safeMiddle=middle;
1712             middle.append(src, (int32_t)(firstStarterInSrc-src));
1713             const UChar *middleStart=middle.getBuffer();
1714             compose(middleStart, middleStart+middle.length(), onlyContiguous,
1715                     TRUE, buffer, errorCode);
1716             if(U_FAILURE(errorCode)) {
1717                 return;
1718             }
1719             src=firstStarterInSrc;
1720         }
1721     }
1722     if(doCompose) {
1723         compose(src, limit, onlyContiguous, TRUE, buffer, errorCode);
1724     } else {
1725         if(limit==NULL) {  // appendZeroCC() needs limit!=NULL
1726             limit=u_strchr(src, 0);
1727         }
1728         buffer.appendZeroCC(src, limit, errorCode);
1729     }
1730 }
1731
1732 UBool
1733 Normalizer2Impl::composeUTF8(uint32_t options, UBool onlyContiguous,
1734                              const uint8_t *src, const uint8_t *limit,
1735                              ByteSink *sink, Edits *edits, UErrorCode &errorCode) const {
         // Composes the UTF-8 text [src, limit[; limit must not be null (asserted below).
         // sink!=nullptr: unchanged and changed pieces are written to *sink,
         //   and changes are recorded in *edits where edits is non-null.
         // sink==nullptr: test only; returns FALSE as soon as a change is known to be needed.
         // Returns TRUE in every other case, including when the main loop is left via break
         // after a failure, so the return value alone does not show whether an error occurred.
1736     U_ASSERT(limit != nullptr);
         // s16 is declared once here and passed to the ReorderingBuffer of each slow-path iteration.
1737     UnicodeString s16;
         // Bytes below this lead byte are skipped in the fast path without a trie lookup.
1738     uint8_t minNoMaybeLead = leadByteForCP(minCompNoMaybeCP);
         // Start of the input that has not been written to the sink yet.
1739     const uint8_t *prevBoundary = src;
1740
1741     for (;;) {
1742         // Fast path: Scan over a sequence of characters below the minimum "no or maybe" code point,
1743         // or with (compYes && ccc==0) properties.
1744         const uint8_t *prevSrc;
1745         uint16_t norm16 = 0;
1746         for (;;) {
1747             if (src == limit) {
                     // End of input: flush whatever has not been written yet.
1748                 if (prevBoundary != limit && sink != nullptr) {
1749                     ByteSinkUtil::appendUnchanged(prevBoundary, limit,
1750                                                   *sink, options, edits, errorCode);
1751                 }
1752                 return TRUE;
1753             }
1754             if (*src < minNoMaybeLead) {
1755                 ++src;
1756             } else {
1757                 prevSrc = src;
1758                 UCPTRIE_FAST_U8_NEXT(normTrie, UCPTRIE_16, src, limit, norm16);
1759                 if (!isCompYesAndZeroCC(norm16)) {
1760                     break;
1761                 }
1762             }
1763         }
1764         // isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.
1765         // The current character is either a "noNo" (has a mapping)
1766         // or a "maybeYes" (combines backward)
1767         // or a "yesYes" with ccc!=0.
1768         // It is not a Hangul syllable or Jamo L because those have "yes" properties.
1769
1770         // Medium-fast path: Handle cases that do not require full decomposition and recomposition.
1771         if (!isMaybeOrNonZeroCC(norm16)) {  // minNoNo <= norm16 < minMaybeYes
1772             if (sink == nullptr) {
1773                 return FALSE;
1774             }
1775             // Fast path for mapping a character that is immediately surrounded by boundaries.
1776             // In this case, we need not decompose around the current character.
1777             if (isDecompNoAlgorithmic(norm16)) {
1778                 // Maps to a single isCompYesAndZeroCC character
1779                 // which also implies hasCompBoundaryBefore.
1780                 if (norm16HasCompBoundaryAfter(norm16, onlyContiguous) ||
1781                         hasCompBoundaryBefore(src, limit)) {
1782                     if (prevBoundary != prevSrc &&
1783                             !ByteSinkUtil::appendUnchanged(prevBoundary, prevSrc,
1784                                                            *sink, options, edits, errorCode)) {
1785                         break;
1786                     }
1787                     appendCodePointDelta(prevSrc, src, getAlgorithmicDelta(norm16), *sink, edits);
1788                     prevBoundary = src;
1789                     continue;
1790                 }
1791             } else if (norm16 < minNoNoCompBoundaryBefore) {
1792                 // The mapping is comp-normalized which also implies hasCompBoundaryBefore.
1793                 if (norm16HasCompBoundaryAfter(norm16, onlyContiguous) ||
1794                         hasCompBoundaryBefore(src, limit)) {
1795                     if (prevBoundary != prevSrc &&
1796                             !ByteSinkUtil::appendUnchanged(prevBoundary, prevSrc,
1797                                                            *sink, options, edits, errorCode)) {
1798                         break;
1799                     }
                         // The first unit of the mapping data carries the mapping length in its
                         // low bits; the mapped UTF-16 text follows it.
1800                     const uint16_t *mapping = getMapping(norm16);
1801                     int32_t length = *mapping++ & MAPPING_LENGTH_MASK;
1802                     if (!ByteSinkUtil::appendChange(prevSrc, src, (const UChar *)mapping, length,
1803                                                     *sink, edits, errorCode)) {
1804                         break;
1805                     }
1806                     prevBoundary = src;
1807                     continue;
1808                 }
1809             } else if (norm16 >= minNoNoEmpty) {
1810                 // The current character maps to nothing.
1811                 // Simply omit it from the output if there is a boundary before _or_ after it.
1812                 // The character itself implies no boundaries.
1813                 if (hasCompBoundaryBefore(src, limit) ||
1814                         hasCompBoundaryAfter(prevBoundary, prevSrc, onlyContiguous)) {
1815                     if (prevBoundary != prevSrc &&
1816                             !ByteSinkUtil::appendUnchanged(prevBoundary, prevSrc,
1817                                                            *sink, options, edits, errorCode)) {
1818                         break;
1819                     }
                         // Nothing is written to the sink; only the deletion is recorded.
1820                     if (edits != nullptr) {
1821                         edits->addReplace((int32_t)(src - prevSrc), 0);
1822                     }
1823                     prevBoundary = src;
1824                     continue;
1825                 }
1826             }
1827             // Other "noNo" type, or need to examine more text around this character:
1828             // Fall through to the slow path.
1829         } else if (isJamoVT(norm16)) {
1830             // Jamo L: E1 84 80..92
1831             // Jamo V: E1 85 A1..B5
1832             // Jamo T: E1 86 A8..E1 87 82
1833             U_ASSERT((src - prevSrc) == 3 && *prevSrc == 0xe1);
1834             UChar32 prev = previousHangulOrJamo(prevBoundary, prevSrc);
                 // A second byte of 0x85 identifies a Jamo V (see the byte ranges above).
1835             if (prevSrc[1] == 0x85) {
1836                 // The current character is a Jamo Vowel,
1837                 // compose with previous Jamo L and following Jamo T.
1838                 UChar32 l = prev - Hangul::JAMO_L_BASE;
1839                 if ((uint32_t)l < Hangul::JAMO_L_COUNT) {
1840                     if (sink == nullptr) {
1841                         return FALSE;
1842                     }
1843                     int32_t t = getJamoTMinusBase(src, limit);
1844                     if (t >= 0) {
1845                         // The next character is a Jamo T.
1846                         src += 3;
1847                     } else if (hasCompBoundaryBefore(src, limit)) {
1848                         // No Jamo T follows, not even via decomposition.
1849                         t = 0;
1850                     }
1851                     if (t >= 0) {
1852                         UChar32 syllable = Hangul::HANGUL_BASE +
1853                             (l*Hangul::JAMO_V_COUNT + (prevSrc[2]-0xa1)) *
1854                             Hangul::JAMO_T_COUNT + t;
1855                         prevSrc -= 3;  // Replace the Jamo L as well.
1856                         if (prevBoundary != prevSrc &&
1857                                 !ByteSinkUtil::appendUnchanged(prevBoundary, prevSrc,
1858                                                                *sink, options, edits, errorCode)) {
1859                             break;
1860                         }
1861                         ByteSinkUtil::appendCodePoint(prevSrc, src, syllable, *sink, edits);
1862                         prevBoundary = src;
1863                         continue;
1864                     }
1865                     // If we see L+V+x where x!=T then we drop to the slow path,
1866                     // decompose and recompose.
1867                     // This is to deal with NFKC finding normal L and V but a
1868                     // compatibility variant of a T.
1869                     // We need to either fully compose that combination here
1870                     // (which would complicate the code and may not work with strange custom data)
1871                     // or use the slow path.
1872                 }
1873             } else if (Hangul::isHangulLV(prev)) {
1874                 // The current character is a Jamo Trailing consonant,
1875                 // compose with previous Hangul LV that does not contain a Jamo T.
1876                 if (sink == nullptr) {
1877                     return FALSE;
1878                 }
1879                 UChar32 syllable = prev + getJamoTMinusBase(prevSrc, src);
1880                 prevSrc -= 3;  // Replace the Hangul LV as well.
1881                 if (prevBoundary != prevSrc &&
1882                         !ByteSinkUtil::appendUnchanged(prevBoundary, prevSrc,
1883                                                        *sink, options, edits, errorCode)) {
1884                     break;
1885                 }
1886                 ByteSinkUtil::appendCodePoint(prevSrc, src, syllable, *sink, edits);
1887                 prevBoundary = src;
1888                 continue;
1889             }
1890             // No matching context, or may need to decompose surrounding text first:
1891             // Fall through to the slow path.
1892         } else if (norm16 > JAMO_VT) {  // norm16 >= MIN_YES_YES_WITH_CC
1893             // One or more combining marks that do not combine-back:
1894             // Check for canonical order, copy unchanged if ok and
1895             // if followed by a character with a boundary-before.
1896             uint8_t cc = getCCFromNormalYesOrMaybe(norm16);  // cc!=0
1897             if (onlyContiguous /* FCC */ && getPreviousTrailCC(prevBoundary, prevSrc) > cc) {
1898                 // Fails FCD test, need to decompose and contiguously recompose.
1899                 if (sink == nullptr) {
1900                     return FALSE;
1901                 }
1902             } else {
1903                 // If !onlyContiguous (not FCC), then we ignore the tccc of
1904                 // the previous character which passed the quick check "yes && ccc==0" test.
1905                 const uint8_t *nextSrc;
1906                 uint16_t n16;
                     // Walk over following combining marks while their combining classes do not
                     // decrease. On break, n16 belongs to the character at [src, nextSrc[.
1907                 for (;;) {
1908                     if (src == limit) {
1909                         if (sink != nullptr) {
1910                             ByteSinkUtil::appendUnchanged(prevBoundary, limit,
1911                                                           *sink, options, edits, errorCode);
1912                         }
1913                         return TRUE;
1914                     }
1915                     uint8_t prevCC = cc;
1916                     nextSrc = src;
1917                     UCPTRIE_FAST_U8_NEXT(normTrie, UCPTRIE_16, nextSrc, limit, n16);
1918                     if (n16 >= MIN_YES_YES_WITH_CC) {
1919                         cc = getCCFromNormalYesOrMaybe(n16);
1920                         if (prevCC > cc) {
1921                             if (sink == nullptr) {
1922                                 return FALSE;
1923                             }
1924                             break;
1925                         }
1926                     } else {
1927                         break;
1928                     }
1929                     src = nextSrc;
1930                 }
1931                 // src is after the last in-order combining mark.
1932                 // If there is a boundary here, then we continue with no change.
1933                 if (norm16HasCompBoundaryBefore(n16)) {
1934                     if (isCompYesAndZeroCC(n16)) {
1935                         src = nextSrc;
1936                     }
1937                     continue;
1938                 }
1939                 // Use the slow path. There is no boundary in [prevSrc, src[.
1940             }
1941         }
1942
1943         // Slow path: Find the nearest boundaries around the current character,
1944         // decompose and recompose.
1945         if (prevBoundary != prevSrc && !norm16HasCompBoundaryBefore(norm16)) {
                 // Pull the preceding code point into the piece to be recomposed
                 // unless it has a boundary after it. norm16 is overwritten here.
1946             const uint8_t *p = prevSrc;
1947             UCPTRIE_FAST_U8_PREV(normTrie, UCPTRIE_16, prevBoundary, p, norm16);
1948             if (!norm16HasCompBoundaryAfter(norm16, onlyContiguous)) {
1949                 prevSrc = p;
1950             }
1951         }
1952         ReorderingBuffer buffer(*this, s16, errorCode);
1953         if (U_FAILURE(errorCode)) {
1954             break;
1955         }
1956         // We know there is not a boundary here.
1957         decomposeShort(prevSrc, src, FALSE /* !stopAtCompBoundary */, onlyContiguous,
1958                        buffer, errorCode);
1959         // Decompose until the next boundary.
1960         src = decomposeShort(src, limit, TRUE /* stopAtCompBoundary */, onlyContiguous,
1961                              buffer, errorCode);
1962         if (U_FAILURE(errorCode)) {
1963             break;
1964         }
1965         if ((src - prevSrc) > INT32_MAX) {  // guard before buffer.equals()
1966             errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
1967             return TRUE;
1968         }
1969         recompose(buffer, 0, onlyContiguous);
             // Output is produced only when recomposition changed the text. Otherwise
             // prevBoundary is left alone, so these bytes are written later as part of
             // an unchanged run.
1970         if (!buffer.equals(prevSrc, src)) {
1971             if (sink == nullptr) {
1972                 return FALSE;
1973             }
1974             if (prevBoundary != prevSrc &&
1975                     !ByteSinkUtil::appendUnchanged(prevBoundary, prevSrc,
1976                                                    *sink, options, edits, errorCode)) {
1977                 break;
1978             }
1979             if (!ByteSinkUtil::appendChange(prevSrc, src, buffer.getStart(), buffer.length(),
1980                                             *sink, edits, errorCode)) {
1981                 break;
1982             }
1983             prevBoundary = src;
1984         }
1985     }
         // Reached only through the break statements above
         // (a ByteSinkUtil call returned false, or errorCode indicates a failure).
1986     return TRUE;
1987 }
1988
1989 UBool Normalizer2Impl::hasCompBoundaryBefore(const UChar *src, const UChar *limit) const {
1990     if (src == limit || *src < minCompNoMaybeCP) {
1991         return TRUE;
1992     }
1993     UChar32 c;
1994     uint16_t norm16;
1995     UCPTRIE_FAST_U16_NEXT(normTrie, UCPTRIE_16, src, limit, c, norm16);
1996     return norm16HasCompBoundaryBefore(norm16);
1997 }
1998
1999 UBool Normalizer2Impl::hasCompBoundaryBefore(const uint8_t *src, const uint8_t *limit) const {
2000     if (src == limit) {
2001         return TRUE;
2002     }
2003     uint16_t norm16;
2004     UCPTRIE_FAST_U8_NEXT(normTrie, UCPTRIE_16, src, limit, norm16);
2005     return norm16HasCompBoundaryBefore(norm16);
2006 }
2007
2008 UBool Normalizer2Impl::hasCompBoundaryAfter(const UChar *start, const UChar *p,
2009                                             UBool onlyContiguous) const {
2010     if (start == p) {
2011         return TRUE;
2012     }
2013     UChar32 c;
2014     uint16_t norm16;
2015     UCPTRIE_FAST_U16_PREV(normTrie, UCPTRIE_16, start, p, c, norm16);
2016     return norm16HasCompBoundaryAfter(norm16, onlyContiguous);
2017 }
2018
2019 UBool Normalizer2Impl::hasCompBoundaryAfter(const uint8_t *start, const uint8_t *p,
2020                                             UBool onlyContiguous) const {
2021     if (start == p) {
2022         return TRUE;
2023     }
2024     uint16_t norm16;
2025     UCPTRIE_FAST_U8_PREV(normTrie, UCPTRIE_16, start, p, norm16);
2026     return norm16HasCompBoundaryAfter(norm16, onlyContiguous);
2027 }
2028
2029 const UChar *Normalizer2Impl::findPreviousCompBoundary(const UChar *start, const UChar *p,
2030                                                        UBool onlyContiguous) const {
2031     while (p != start) {
2032         const UChar *codePointLimit = p;
2033         UChar32 c;
2034         uint16_t norm16;
2035         UCPTRIE_FAST_U16_PREV(normTrie, UCPTRIE_16, start, p, c, norm16);
2036         if (norm16HasCompBoundaryAfter(norm16, onlyContiguous)) {
2037             return codePointLimit;
2038         }
2039         if (hasCompBoundaryBefore(c, norm16)) {
2040             return p;
2041         }
2042     }
2043     return p;
2044 }
2045
2046 const UChar *Normalizer2Impl::findNextCompBoundary(const UChar *p, const UChar *limit,
2047                                                    UBool onlyContiguous) const {
2048     while (p != limit) {
2049         const UChar *codePointStart = p;
2050         UChar32 c;
2051         uint16_t norm16;
2052         UCPTRIE_FAST_U16_NEXT(normTrie, UCPTRIE_16, p, limit, c, norm16);
2053         if (hasCompBoundaryBefore(c, norm16)) {
2054             return codePointStart;
2055         }
2056         if (norm16HasCompBoundaryAfter(norm16, onlyContiguous)) {
2057             return p;
2058         }
2059     }
2060     return p;
2061 }
2062
2063 uint8_t Normalizer2Impl::getPreviousTrailCC(const UChar *start, const UChar *p) const {
2064     if (start == p) {
2065         return 0;
2066     }
2067     int32_t i = (int32_t)(p - start);
2068     UChar32 c;
2069     U16_PREV(start, 0, i, c);
2070     return (uint8_t)getFCD16(c);
2071 }
2072
2073 uint8_t Normalizer2Impl::getPreviousTrailCC(const uint8_t *start, const uint8_t *p) const {
2074     if (start == p) {
2075         return 0;
2076     }
2077     int32_t i = (int32_t)(p - start);
2078     UChar32 c;
2079     U8_PREV(start, 0, i, c);
2080     return (uint8_t)getFCD16(c);
2081 }
2082
2083 // Note: normalizer2impl.cpp r30982 (2011-nov-27)
2084 // still had getFCDTrie() which built and cached an FCD trie.
2085 // That provided faster access to FCD data than getFCD16FromNormData()
2086 // but required synchronization and consumed some 10kB of heap memory
2087 // in any process that uses FCD (e.g., via collation).
2088 // minDecompNoCP etc. and smallFCD[] are intended to help with any loss of performance,
2089 // at least for ASCII & CJK.
2090
2091 // Ticket 20907 - The optimizer in MSVC/Visual Studio versions below 16.4 has trouble with this
2092 // function on Windows ARM64. As a work-around, we disable optimizations for this function.
2093 // This work-around could/should be removed once the following versions of Visual Studio are no
2094 // longer supported: All versions of VS2017, and versions of VS2019 below 16.4.
2095 #if (defined(_MSC_VER) && (defined(_M_ARM64)) && (_MSC_VER < 1924))
2096 #pragma optimize( "", off )
2097 #endif
2098 // Gets the FCD value from the regular normalization data.
2099 uint16_t Normalizer2Impl::getFCD16FromNormData(UChar32 c) const {
2100     uint16_t norm16=getNorm16(c);
2101     if (norm16 >= limitNoNo) {
2102         if(norm16>=MIN_NORMAL_MAYBE_YES) {
2103             // combining mark
2104             norm16=getCCFromNormalYesOrMaybe(norm16);
2105             return norm16|(norm16<<8);
2106         } else if(norm16>=minMaybeYes) {
2107             return 0;
2108         } else {  // isDecompNoAlgorithmic(norm16)
2109             uint16_t deltaTrailCC = norm16 & DELTA_TCCC_MASK;
2110             if (deltaTrailCC <= DELTA_TCCC_1) {
2111                 return deltaTrailCC >> OFFSET_SHIFT;
2112             }
2113             // Maps to an isCompYesAndZeroCC.
2114             c=mapAlgorithmic(c, norm16);
2115             norm16=getRawNorm16(c);
2116         }
2117     }
2118     if(norm16<=minYesNo || isHangulLVT(norm16)) {
2119         // no decomposition or Hangul syllable, all zeros
2120         return 0;
2121     }
2122     // c decomposes, get everything from the variable-length extra data
2123     const uint16_t *mapping=getMapping(norm16);
2124     uint16_t firstUnit=*mapping;
2125     norm16=firstUnit>>8;  // tccc
2126     if(firstUnit&MAPPING_HAS_CCC_LCCC_WORD) {
2127         norm16|=*(mapping-1)&0xff00;  // lccc
2128     }
2129     return norm16;
2130 }
2131 #if (defined(_MSC_VER) && (defined(_M_ARM64)) && (_MSC_VER < 1924))
2132 #pragma optimize( "", on )
2133 #endif
2134
2135 // Dual functionality:
2136 // buffer!=NULL: normalize
2137 // buffer==NULL: isNormalized/quickCheck/spanQuickCheckYes
2138 const UChar *
2139 Normalizer2Impl::makeFCD(const UChar *src, const UChar *limit,
2140                          ReorderingBuffer *buffer,
2141                          UErrorCode &errorCode) const {
2142     // Tracks the last FCD-safe boundary, before lccc=0 or after properly-ordered tccc<=1.
2143     // Similar to the prevBoundary in the compose() implementation.
2144     const UChar *prevBoundary=src;
2145     int32_t prevFCD16=0;
2146     if(limit==NULL) {
2147         src=copyLowPrefixFromNulTerminated(src, minLcccCP, buffer, errorCode);
2148         if(U_FAILURE(errorCode)) {
2149             return src;
2150         }
2151         if(prevBoundary<src) {
2152             prevBoundary=src;
2153             // We know that the previous character's lccc==0.
2154             // Fetching the fcd16 value was deferred for this below-U+0300 code point.
2155             prevFCD16=getFCD16(*(src-1));
2156             if(prevFCD16>1) {
2157                 --prevBoundary;
2158             }
2159         }
2160         limit=u_strchr(src, 0);
2161     }
2162
2163     // Note: In this function we use buffer->appendZeroCC() because we track
2164     // the lead and trail combining classes here, rather than leaving it to
2165     // the ReorderingBuffer.
2166     // The exception is the call to decomposeShort() which uses the buffer
2167     // in the normal way.
2168
2169     const UChar *prevSrc;
2170     UChar32 c=0;
2171     uint16_t fcd16=0;
2172
2173     for(;;) {
2174         // count code units with lccc==0
2175         for(prevSrc=src; src!=limit;) {
2176             if((c=*src)<minLcccCP) {
2177                 prevFCD16=~c;
2178                 ++src;
2179             } else if(!singleLeadMightHaveNonZeroFCD16(c)) {
2180                 prevFCD16=0;
2181                 ++src;
2182             } else {
2183                 if(U16_IS_LEAD(c)) {
2184                     UChar c2;
2185                     if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) {
2186                         c=U16_GET_SUPPLEMENTARY(c, c2);
2187                     }
2188                 }
2189                 if((fcd16=getFCD16FromNormData(c))<=0xff) {
2190                     prevFCD16=fcd16;
2191                     src+=U16_LENGTH(c);
2192                 } else {
2193                     break;
2194                 }
2195             }
2196         }
2197         // copy these code units all at once
2198         if(src!=prevSrc) {
2199             if(buffer!=NULL && !buffer->appendZeroCC(prevSrc, src, errorCode)) {
2200                 break;
2201             }
2202             if(src==limit) {
2203                 break;
2204             }
2205             prevBoundary=src;
2206             // We know that the previous character's lccc==0.
2207             if(prevFCD16<0) {
2208                 // Fetching the fcd16 value was deferred for this below-minLcccCP code point.
2209                 UChar32 prev=~prevFCD16;
2210                 if(prev<minDecompNoCP) {
2211                     prevFCD16=0;
2212                 } else {
2213                     prevFCD16=getFCD16FromNormData(prev);
2214                     if(prevFCD16>1) {
2215                         --prevBoundary;
2216                     }
2217                 }
2218             } else {
2219                 const UChar *p=src-1;
2220                 if(U16_IS_TRAIL(*p) && prevSrc<p && U16_IS_LEAD(*(p-1))) {
2221                     --p;
2222                     // Need to fetch the previous character's FCD value because
2223                     // prevFCD16 was just for the trail surrogate code point.
2224                     prevFCD16=getFCD16FromNormData(U16_GET_SUPPLEMENTARY(p[0], p[1]));
2225                     // Still known to have lccc==0 because its lead surrogate unit had lccc==0.
2226                 }
2227                 if(prevFCD16>1) {
2228                     prevBoundary=p;
2229                 }
2230             }
2231             // The start of the current character (c).
2232             prevSrc=src;
2233         } else if(src==limit) {
2234             break;
2235         }
2236
2237         src+=U16_LENGTH(c);
2238         // The current character (c) at [prevSrc..src[ has a non-zero lead combining class.
2239         // Check for proper order, and decompose locally if necessary.
2240         if((prevFCD16&0xff)<=(fcd16>>8)) {
2241             // proper order: prev tccc <= current lccc
2242             if((fcd16&0xff)<=1) {
2243                 prevBoundary=src;
2244             }
2245             if(buffer!=NULL && !buffer->appendZeroCC(c, errorCode)) {
2246                 break;
2247             }
2248             prevFCD16=fcd16;
2249             continue;
2250         } else if(buffer==NULL) {
2251             return prevBoundary;  // quick check "no"
2252         } else {
2253             /*
2254              * Back out the part of the source that we copied or appended
2255              * already but is now going to be decomposed.
2256              * prevSrc is set to after what was copied/appended.
2257              */
2258             buffer->removeSuffix((int32_t)(prevSrc-prevBoundary));
2259             /*
2260              * Find the part of the source that needs to be decomposed,
2261              * up to the next safe boundary.
2262              */
2263             src=findNextFCDBoundary(src, limit);
2264             /*
2265              * The source text does not fulfill the conditions for FCD.
2266              * Decompose and reorder a limited piece of the text.
2267              */
2268             decomposeShort(prevBoundary, src, FALSE, FALSE, *buffer, errorCode);
2269             if (U_FAILURE(errorCode)) {
2270                 break;
2271             }
2272             prevBoundary=src;
2273             prevFCD16=0;
2274         }
2275     }
2276     return src;
2277 }
2278
2279 void Normalizer2Impl::makeFCDAndAppend(const UChar *src, const UChar *limit,
2280                                        UBool doMakeFCD,
2281                                        UnicodeString &safeMiddle,
2282                                        ReorderingBuffer &buffer,
2283                                        UErrorCode &errorCode) const {
2284     if(!buffer.isEmpty()) {
2285         const UChar *firstBoundaryInSrc=findNextFCDBoundary(src, limit);
2286         if(src!=firstBoundaryInSrc) {
2287             const UChar *lastBoundaryInDest=findPreviousFCDBoundary(buffer.getStart(),
2288                                                                     buffer.getLimit());
2289             int32_t destSuffixLength=(int32_t)(buffer.getLimit()-lastBoundaryInDest);
2290             UnicodeString middle(lastBoundaryInDest, destSuffixLength);
2291             buffer.removeSuffix(destSuffixLength);
2292             safeMiddle=middle;
2293             middle.append(src, (int32_t)(firstBoundaryInSrc-src));
2294             const UChar *middleStart=middle.getBuffer();
2295             makeFCD(middleStart, middleStart+middle.length(), &buffer, errorCode);
2296             if(U_FAILURE(errorCode)) {
2297                 return;
2298             }
2299             src=firstBoundaryInSrc;
2300         }
2301     }
2302     if(doMakeFCD) {
2303         makeFCD(src, limit, &buffer, errorCode);
2304     } else {
2305         if(limit==NULL) {  // appendZeroCC() needs limit!=NULL
2306             limit=u_strchr(src, 0);
2307         }
2308         buffer.appendZeroCC(src, limit, errorCode);
2309     }
2310 }
2311
2312 const UChar *Normalizer2Impl::findPreviousFCDBoundary(const UChar *start, const UChar *p) const {
2313     while(start<p) {
2314         const UChar *codePointLimit = p;
2315         UChar32 c;
2316         uint16_t norm16;
2317         UCPTRIE_FAST_U16_PREV(normTrie, UCPTRIE_16, start, p, c, norm16);
2318         if (c < minDecompNoCP || norm16HasDecompBoundaryAfter(norm16)) {
2319             return codePointLimit;
2320         }
2321         if (norm16HasDecompBoundaryBefore(norm16)) {
2322             return p;
2323         }
2324     }
2325     return p;
2326 }
2327
2328 const UChar *Normalizer2Impl::findNextFCDBoundary(const UChar *p, const UChar *limit) const {
2329     while(p<limit) {
2330         const UChar *codePointStart=p;
2331         UChar32 c;
2332         uint16_t norm16;
2333         UCPTRIE_FAST_U16_NEXT(normTrie, UCPTRIE_16, p, limit, c, norm16);
2334         if (c < minLcccCP || norm16HasDecompBoundaryBefore(norm16)) {
2335             return codePointStart;
2336         }
2337         if (norm16HasDecompBoundaryAfter(norm16)) {
2338             return p;
2339         }
2340     }
2341     return p;
2342 }
2343
2344 // CanonicalIterator data -------------------------------------------------- ***
2345
2346 CanonIterData::CanonIterData(UErrorCode &errorCode) :
2347         mutableTrie(umutablecptrie_open(0, 0, &errorCode)), trie(nullptr),
2348         canonStartSets(uprv_deleteUObject, NULL, errorCode) {}
2349
2350 CanonIterData::~CanonIterData() {
2351     umutablecptrie_close(mutableTrie);
2352     ucptrie_close(trie);
2353 }
2354
2355 void CanonIterData::addToStartSet(UChar32 origin, UChar32 decompLead, UErrorCode &errorCode) {
2356     uint32_t canonValue = umutablecptrie_get(mutableTrie, decompLead);
2357     if((canonValue&(CANON_HAS_SET|CANON_VALUE_MASK))==0 && origin!=0) {
2358         // origin is the first character whose decomposition starts with
2359         // the character for which we are setting the value.
2360         umutablecptrie_set(mutableTrie, decompLead, canonValue|origin, &errorCode);
2361     } else {
2362         // origin is not the first character, or it is U+0000.
2363         UnicodeSet *set;
2364         if((canonValue&CANON_HAS_SET)==0) {
2365             set=new UnicodeSet;
2366             if(set==NULL) {
2367                 errorCode=U_MEMORY_ALLOCATION_ERROR;
2368                 return;
2369             }
2370             UChar32 firstOrigin=(UChar32)(canonValue&CANON_VALUE_MASK);
2371             canonValue=(canonValue&~CANON_VALUE_MASK)|CANON_HAS_SET|(uint32_t)canonStartSets.size();
2372             umutablecptrie_set(mutableTrie, decompLead, canonValue, &errorCode);
2373             canonStartSets.addElement(set, errorCode);
2374             if(firstOrigin!=0) {
2375                 set->add(firstOrigin);
2376             }
2377         } else {
2378             set=(UnicodeSet *)canonStartSets[(int32_t)(canonValue&CANON_VALUE_MASK)];
2379         }
2380         set->add(origin);
2381     }
2382 }
2383
2384 // C++ class for friend access to private Normalizer2Impl members.
2385 class InitCanonIterData {
2386 public:
2387     static void doInit(Normalizer2Impl *impl, UErrorCode &errorCode);
2388 };
2389
2390 U_CDECL_BEGIN
2391
2392 // UInitOnce instantiation function for CanonIterData
2393 static void U_CALLCONV
2394 initCanonIterData(Normalizer2Impl *impl, UErrorCode &errorCode) {
2395     InitCanonIterData::doInit(impl, errorCode);
2396 }
2397
2398 U_CDECL_END
2399
2400 void InitCanonIterData::doInit(Normalizer2Impl *impl, UErrorCode &errorCode) {
2401     U_ASSERT(impl->fCanonIterData == NULL);
2402     impl->fCanonIterData = new CanonIterData(errorCode);
2403     if (impl->fCanonIterData == NULL) {
2404         errorCode=U_MEMORY_ALLOCATION_ERROR;
2405     }
2406     if (U_SUCCESS(errorCode)) {
2407         UChar32 start = 0, end;
2408         uint32_t value;
2409         while ((end = ucptrie_getRange(impl->normTrie, start,
2410                                        UCPMAP_RANGE_FIXED_LEAD_SURROGATES, Normalizer2Impl::INERT,
2411                                        nullptr, nullptr, &value)) >= 0) {
2412             // Call Normalizer2Impl::makeCanonIterDataFromNorm16() for a range of same-norm16 characters.
2413             if (value != Normalizer2Impl::INERT) {
2414                 impl->makeCanonIterDataFromNorm16(start, end, value, *impl->fCanonIterData, errorCode);
2415             }
2416             start = end + 1;
2417         }
2418 #ifdef UCPTRIE_DEBUG
2419         umutablecptrie_setName(impl->fCanonIterData->mutableTrie, "CanonIterData");
2420 #endif
2421         impl->fCanonIterData->trie = umutablecptrie_buildImmutable(
2422             impl->fCanonIterData->mutableTrie, UCPTRIE_TYPE_SMALL, UCPTRIE_VALUE_BITS_32, &errorCode);
2423         umutablecptrie_close(impl->fCanonIterData->mutableTrie);
2424         impl->fCanonIterData->mutableTrie = nullptr;
2425     }
2426     if (U_FAILURE(errorCode)) {
2427         delete impl->fCanonIterData;
2428         impl->fCanonIterData = NULL;
2429     }
2430 }
2431
2432 void Normalizer2Impl::makeCanonIterDataFromNorm16(UChar32 start, UChar32 end, const uint16_t norm16,
2433                                                   CanonIterData &newData,
2434                                                   UErrorCode &errorCode) const {
2435     if(isInert(norm16) || (minYesNo<=norm16 && norm16<minNoNo)) {
2436         // Inert, or 2-way mapping (including Hangul syllable).
2437         // We do not write a canonStartSet for any yesNo character.
2438         // Composites from 2-way mappings are added at runtime from the
2439         // starter's compositions list, and the other characters in
2440         // 2-way mappings get CANON_NOT_SEGMENT_STARTER set because they are
2441         // "maybe" characters.
2442         return;
2443     }
2444     for(UChar32 c=start; c<=end; ++c) {
2445         uint32_t oldValue = umutablecptrie_get(newData.mutableTrie, c);
2446         uint32_t newValue=oldValue;
2447         if(isMaybeOrNonZeroCC(norm16)) {
2448             // not a segment starter if it occurs in a decomposition or has cc!=0
2449             newValue|=CANON_NOT_SEGMENT_STARTER;
2450             if(norm16<MIN_NORMAL_MAYBE_YES) {
2451                 newValue|=CANON_HAS_COMPOSITIONS;
2452             }
2453         } else if(norm16<minYesNo) {
2454             newValue|=CANON_HAS_COMPOSITIONS;
2455         } else {
2456             // c has a one-way decomposition
2457             UChar32 c2=c;
2458             // Do not modify the whole-range norm16 value.
2459             uint16_t norm16_2=norm16;
2460             if (isDecompNoAlgorithmic(norm16_2)) {
2461                 // Maps to an isCompYesAndZeroCC.
2462                 c2 = mapAlgorithmic(c2, norm16_2);
2463                 norm16_2 = getRawNorm16(c2);
2464                 // No compatibility mappings for the CanonicalIterator.
2465                 U_ASSERT(!(isHangulLV(norm16_2) || isHangulLVT(norm16_2)));
2466             }
2467             if (norm16_2 > minYesNo) {
2468                 // c decomposes, get everything from the variable-length extra data
2469                 const uint16_t *mapping=getMapping(norm16_2);
2470                 uint16_t firstUnit=*mapping;
2471                 int32_t length=firstUnit&MAPPING_LENGTH_MASK;
2472                 if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) {
2473                     if(c==c2 && (*(mapping-1)&0xff)!=0) {
2474                         newValue|=CANON_NOT_SEGMENT_STARTER;  // original c has cc!=0
2475                     }
2476                 }
2477                 // Skip empty mappings (no characters in the decomposition).
2478                 if(length!=0) {
2479                     ++mapping;  // skip over the firstUnit
2480                     // add c to first code point's start set
2481                     int32_t i=0;
2482                     U16_NEXT_UNSAFE(mapping, i, c2);
2483                     newData.addToStartSet(c, c2, errorCode);
2484                     // Set CANON_NOT_SEGMENT_STARTER for each remaining code point of a
2485                     // one-way mapping. A 2-way mapping is possible here after
2486                     // intermediate algorithmic mapping.
2487                     if(norm16_2>=minNoNo) {
2488                         while(i<length) {
2489                             U16_NEXT_UNSAFE(mapping, i, c2);
2490                             uint32_t c2Value = umutablecptrie_get(newData.mutableTrie, c2);
2491                             if((c2Value&CANON_NOT_SEGMENT_STARTER)==0) {
2492                                 umutablecptrie_set(newData.mutableTrie, c2,
2493                                                    c2Value|CANON_NOT_SEGMENT_STARTER, &errorCode);
2494                             }
2495                         }
2496                     }
2497                 }
2498             } else {
2499                 // c decomposed to c2 algorithmically; c has cc==0
2500                 newData.addToStartSet(c, c2, errorCode);
2501             }
2502         }
2503         if(newValue!=oldValue) {
2504             umutablecptrie_set(newData.mutableTrie, c, newValue, &errorCode);
2505         }
2506     }
2507 }
2508
2509 UBool Normalizer2Impl::ensureCanonIterData(UErrorCode &errorCode) const {
2510     // Logically const: Synchronized instantiation.
2511     Normalizer2Impl *me=const_cast<Normalizer2Impl *>(this);
2512     umtx_initOnce(me->fCanonIterDataInitOnce, &initCanonIterData, me, errorCode);
2513     return U_SUCCESS(errorCode);
2514 }
2515
2516 int32_t Normalizer2Impl::getCanonValue(UChar32 c) const {
2517     return (int32_t)ucptrie_get(fCanonIterData->trie, c);
2518 }
2519
2520 const UnicodeSet &Normalizer2Impl::getCanonStartSet(int32_t n) const {
2521     return *(const UnicodeSet *)fCanonIterData->canonStartSets[n];
2522 }
2523
2524 UBool Normalizer2Impl::isCanonSegmentStarter(UChar32 c) const {
2525     return getCanonValue(c)>=0;
2526 }
2527
2528 UBool Normalizer2Impl::getCanonStartSet(UChar32 c, UnicodeSet &set) const {
2529     int32_t canonValue=getCanonValue(c)&~CANON_NOT_SEGMENT_STARTER;
2530     if(canonValue==0) {
2531         return FALSE;
2532     }
2533     set.clear();
2534     int32_t value=canonValue&CANON_VALUE_MASK;
2535     if((canonValue&CANON_HAS_SET)!=0) {
2536         set.addAll(getCanonStartSet(value));
2537     } else if(value!=0) {
2538         set.add(value);
2539     }
2540     if((canonValue&CANON_HAS_COMPOSITIONS)!=0) {
2541         uint16_t norm16=getRawNorm16(c);
2542         if(norm16==JAMO_L) {
2543             UChar32 syllable=
2544                 (UChar32)(Hangul::HANGUL_BASE+(c-Hangul::JAMO_L_BASE)*Hangul::JAMO_VT_COUNT);
2545             set.add(syllable, syllable+Hangul::JAMO_VT_COUNT-1);
2546         } else {
2547             addComposites(getCompositionsList(norm16), set);
2548         }
2549     }
2550     return TRUE;
2551 }
2552
2553 U_NAMESPACE_END
2554
2555 // Normalizer2 data swapping ----------------------------------------------- ***
2556
2557 U_NAMESPACE_USE
2558
2559 U_CAPI int32_t U_EXPORT2
2560 unorm2_swap(const UDataSwapper *ds,
2561             const void *inData, int32_t length, void *outData,
2562             UErrorCode *pErrorCode) {
2563     const UDataInfo *pInfo;
2564     int32_t headerSize;
2565
2566     const uint8_t *inBytes;
2567     uint8_t *outBytes;
2568
2569     const int32_t *inIndexes;
2570     int32_t indexes[Normalizer2Impl::IX_TOTAL_SIZE+1];
2571
2572     int32_t i, offset, nextOffset, size;
2573
2574     /* udata_swapDataHeader checks the arguments */
2575     headerSize=udata_swapDataHeader(ds, inData, length, outData, pErrorCode);
2576     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
2577         return 0;
2578     }
2579
2580     /* check data format and format version */
2581     pInfo=(const UDataInfo *)((const char *)inData+4);
2582     uint8_t formatVersion0=pInfo->formatVersion[0];
2583     if(!(
2584         pInfo->dataFormat[0]==0x4e &&   /* dataFormat="Nrm2" */
2585         pInfo->dataFormat[1]==0x72 &&
2586         pInfo->dataFormat[2]==0x6d &&
2587         pInfo->dataFormat[3]==0x32 &&
2588         (1<=formatVersion0 && formatVersion0<=4)
2589     )) {
2590         udata_printError(ds, "unorm2_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as Normalizer2 data\n",
2591                          pInfo->dataFormat[0], pInfo->dataFormat[1],
2592                          pInfo->dataFormat[2], pInfo->dataFormat[3],
2593                          pInfo->formatVersion[0]);
2594         *pErrorCode=U_UNSUPPORTED_ERROR;
2595         return 0;
2596     }
2597
2598     inBytes=(const uint8_t *)inData+headerSize;
2599     outBytes=(uint8_t *)outData+headerSize;
2600
2601     inIndexes=(const int32_t *)inBytes;
2602     int32_t minIndexesLength;
2603     if(formatVersion0==1) {
2604         minIndexesLength=Normalizer2Impl::IX_MIN_MAYBE_YES+1;
2605     } else if(formatVersion0==2) {
2606         minIndexesLength=Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY+1;
2607     } else {
2608         minIndexesLength=Normalizer2Impl::IX_MIN_LCCC_CP+1;
2609     }
2610
2611     if(length>=0) {
2612         length-=headerSize;
2613         if(length<minIndexesLength*4) {
2614             udata_printError(ds, "unorm2_swap(): too few bytes (%d after header) for Normalizer2 data\n",
2615                              length);
2616             *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
2617             return 0;
2618         }
2619     }
2620
2621     /* read the first few indexes */
2622     for(i=0; i<UPRV_LENGTHOF(indexes); ++i) {
2623         indexes[i]=udata_readInt32(ds, inIndexes[i]);
2624     }
2625
2626     /* get the total length of the data */
2627     size=indexes[Normalizer2Impl::IX_TOTAL_SIZE];
2628
2629     if(length>=0) {
2630         if(length<size) {
2631             udata_printError(ds, "unorm2_swap(): too few bytes (%d after header) for all of Normalizer2 data\n",
2632                              length);
2633             *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
2634             return 0;
2635         }
2636
2637         /* copy the data for inaccessible bytes */
2638         if(inBytes!=outBytes) {
2639             uprv_memcpy(outBytes, inBytes, size);
2640         }
2641
2642         offset=0;
2643
2644         /* swap the int32_t indexes[] */
2645         nextOffset=indexes[Normalizer2Impl::IX_NORM_TRIE_OFFSET];
2646         ds->swapArray32(ds, inBytes, nextOffset-offset, outBytes, pErrorCode);
2647         offset=nextOffset;
2648
2649         /* swap the trie */
2650         nextOffset=indexes[Normalizer2Impl::IX_EXTRA_DATA_OFFSET];
2651         utrie_swapAnyVersion(ds, inBytes+offset, nextOffset-offset, outBytes+offset, pErrorCode);
2652         offset=nextOffset;
2653
2654         /* swap the uint16_t extraData[] */
2655         nextOffset=indexes[Normalizer2Impl::IX_SMALL_FCD_OFFSET];
2656         ds->swapArray16(ds, inBytes+offset, nextOffset-offset, outBytes+offset, pErrorCode);
2657         offset=nextOffset;
2658
2659         /* no need to swap the uint8_t smallFCD[] (new in formatVersion 2) */
2660         nextOffset=indexes[Normalizer2Impl::IX_SMALL_FCD_OFFSET+1];
2661         offset=nextOffset;
2662
2663         U_ASSERT(offset==size);
2664     }
2665
2666     return headerSize+size;
2667 }
2668
2669 #endif  // !UCONFIG_NO_NORMALIZATION