icuSources/common/normalizer2impl.cpp

   1 /*
   2 *******************************************************************************
   3 *
   4 *   Copyright (C) 2009-2014, International Business Machines
   5 *   Corporation and others.  All Rights Reserved.
   6 *
   7 *******************************************************************************
   8 *   file name:  normalizer2impl.cpp
   9 *   encoding:   US-ASCII
  10 *   tab size:   8 (not used)
  11 *   indentation:4
  12 *
  13 *   created on: 2009nov22
  14 *   created by: Markus W. Scherer
  15 */
  16
  17 #include "unicode/utypes.h"
  18
  19 #if !UCONFIG_NO_NORMALIZATION
  20
  21 #include "unicode/normalizer2.h"
  22 #include "unicode/udata.h"
  23 #include "unicode/ustring.h"
  24 #include "unicode/utf16.h"
  25 #include "cmemory.h"
  26 #include "mutex.h"
  27 #include "normalizer2impl.h"
  28 #include "putilimp.h"
  29 #include "uassert.h"
  30 #include "uset_imp.h"
  31 #include "utrie2.h"
  32 #include "uvector.h"
  33
  34 U_NAMESPACE_BEGIN
  35
  36 // ReorderingBuffer -------------------------------------------------------- ***
  37
  38 UBool ReorderingBuffer::init(int32_t destCapacity, UErrorCode &errorCode) {
  39     int32_t length=str.length();
  40     start=str.getBuffer(destCapacity);
  41     if(start==NULL) {
  42         // getBuffer() already did str.setToBogus()
  43         errorCode=U_MEMORY_ALLOCATION_ERROR;
  44         return FALSE;
  45     }
  46     limit=start+length;
  47     remainingCapacity=str.getCapacity()-length;
  48     reorderStart=start;
  49     if(start==limit) {
  50         lastCC=0;
  51     } else {
  52         setIterator();
  53         lastCC=previousCC();
  54         // Set reorderStart after the last code point with cc<=1 if there is one.
  55         if(lastCC>1) {
  56             while(previousCC()>1) {}
  57         }
  58         reorderStart=codePointLimit;
  59     }
  60     return TRUE;
  61 }
  62
  63 UBool ReorderingBuffer::equals(const UChar *otherStart, const UChar *otherLimit) const {
  64     int32_t length=(int32_t)(limit-start);
  65     return
  66         length==(int32_t)(otherLimit-otherStart) &&
  67         0==u_memcmp(start, otherStart, length);
  68 }
  69
  70 UBool ReorderingBuffer::appendSupplementary(UChar32 c, uint8_t cc, UErrorCode &errorCode) {
  71     if(remainingCapacity<2 && !resize(2, errorCode)) {
  72         return FALSE;
  73     }
  74     if(lastCC<=cc || cc==0) {
  75         limit[0]=U16_LEAD(c);
  76         limit[1]=U16_TRAIL(c);
  77         limit+=2;
  78         lastCC=cc;
  79         if(cc<=1) {
  80             reorderStart=limit;
  81         }
  82     } else {
  83         insert(c, cc);
  84     }
  85     remainingCapacity-=2;
  86     return TRUE;
  87 }
  88
  89 UBool ReorderingBuffer::append(const UChar *s, int32_t length,
  90                                uint8_t leadCC, uint8_t trailCC,
  91                                UErrorCode &errorCode) {
  92     if(length==0) {
  93         return TRUE;
  94     }
  95     if(remainingCapacity<length && !resize(length, errorCode)) {
  96         return FALSE;
  97     }
  98     remainingCapacity-=length;
  99     if(lastCC<=leadCC || leadCC==0) {
 100         if(trailCC<=1) {
 101             reorderStart=limit+length;
 102         } else if(leadCC<=1) {
 103             reorderStart=limit+1;  // Ok if not a code point boundary.
 104         }
 105         const UChar *sLimit=s+length;
 106         do { *limit++=*s++; } while(s!=sLimit);
 107         lastCC=trailCC;
 108     } else {
 109         int32_t i=0;
 110         UChar32 c;
 111         U16_NEXT(s, i, length, c);
 112         insert(c, leadCC);  // insert first code point
 113         while(i<length) {
 114             U16_NEXT(s, i, length, c);
 115             if(i<length) {
 116                 // s must be in NFD, otherwise we need to use getCC().
 117                 leadCC=Normalizer2Impl::getCCFromYesOrMaybe(impl.getNorm16(c));
 118             } else {
 119                 leadCC=trailCC;
 120             }
 121             append(c, leadCC, errorCode);
 122         }
 123     }
 124     return TRUE;
 125 }
 126
 127 UBool ReorderingBuffer::appendZeroCC(UChar32 c, UErrorCode &errorCode) {
 128     int32_t cpLength=U16_LENGTH(c);
 129     if(remainingCapacity<cpLength && !resize(cpLength, errorCode)) {
 130         return FALSE;
 131     }
 132     remainingCapacity-=cpLength;
 133     if(cpLength==1) {
 134         *limit++=(UChar)c;
 135     } else {
 136         limit[0]=U16_LEAD(c);
 137         limit[1]=U16_TRAIL(c);
 138         limit+=2;
 139     }
 140     lastCC=0;
 141     reorderStart=limit;
 142     return TRUE;
 143 }
 144
 145 UBool ReorderingBuffer::appendZeroCC(const UChar *s, const UChar *sLimit, UErrorCode &errorCode) {
 146     if(s==sLimit) {
 147         return TRUE;
 148     }
 149     int32_t length=(int32_t)(sLimit-s);
 150     if(remainingCapacity<length && !resize(length, errorCode)) {
 151         return FALSE;
 152     }
 153     u_memcpy(limit, s, length);
 154     limit+=length;
 155     remainingCapacity-=length;
 156     lastCC=0;
 157     reorderStart=limit;
 158     return TRUE;
 159 }
 160
 161 void ReorderingBuffer::remove() {
 162     reorderStart=limit=start;
 163     remainingCapacity=str.getCapacity();
 164     lastCC=0;
 165 }
 166
 167 void ReorderingBuffer::removeSuffix(int32_t suffixLength) {
 168     if(suffixLength<(limit-start)) {
 169         limit-=suffixLength;
 170         remainingCapacity+=suffixLength;
 171     } else {
 172         limit=start;
 173         remainingCapacity=str.getCapacity();
 174     }
 175     lastCC=0;
 176     reorderStart=limit;
 177 }
 178
 179 UBool ReorderingBuffer::resize(int32_t appendLength, UErrorCode &errorCode) {
 180     int32_t reorderStartIndex=(int32_t)(reorderStart-start);
 181     int32_t length=(int32_t)(limit-start);
 182     str.releaseBuffer(length);
 183     int32_t newCapacity=length+appendLength;
 184     int32_t doubleCapacity=2*str.getCapacity();
 185     if(newCapacity<doubleCapacity) {
 186         newCapacity=doubleCapacity;
 187     }
 188     if(newCapacity<256) {
 189         newCapacity=256;
 190     }
 191     start=str.getBuffer(newCapacity);
 192     if(start==NULL) {
 193         // getBuffer() already did str.setToBogus()
 194         errorCode=U_MEMORY_ALLOCATION_ERROR;
 195         return FALSE;
 196     }
 197     reorderStart=start+reorderStartIndex;
 198     limit=start+length;
 199     remainingCapacity=str.getCapacity()-length;
 200     return TRUE;
 201 }
 202
 203 void ReorderingBuffer::skipPrevious() {
 204     codePointLimit=codePointStart;
 205     UChar c=*--codePointStart;
 206     if(U16_IS_TRAIL(c) && start<codePointStart && U16_IS_LEAD(*(codePointStart-1))) {
 207         --codePointStart;
 208     }
 209 }
 210
 211 uint8_t ReorderingBuffer::previousCC() {
 212     codePointLimit=codePointStart;
 213     if(reorderStart>=codePointStart) {
 214         return 0;
 215     }
 216     UChar32 c=*--codePointStart;
 217     if(c<Normalizer2Impl::MIN_CCC_LCCC_CP) {
 218         return 0;
 219     }
 220
 221     UChar c2;
 222     if(U16_IS_TRAIL(c) && start<codePointStart && U16_IS_LEAD(c2=*(codePointStart-1))) {
 223         --codePointStart;
 224         c=U16_GET_SUPPLEMENTARY(c2, c);
 225     }
 226     return Normalizer2Impl::getCCFromYesOrMaybe(impl.getNorm16(c));
 227 }
 228
 229 // Inserts c somewhere before the last character.
 230 // Requires 0<cc<lastCC which implies reorderStart<limit.
 231 void ReorderingBuffer::insert(UChar32 c, uint8_t cc) {
 232     for(setIterator(), skipPrevious(); previousCC()>cc;) {}
 233     // insert c at codePointLimit, after the character with prevCC<=cc
 234     UChar *q=limit;
 235     UChar *r=limit+=U16_LENGTH(c);
 236     do {
 237         *--r=*--q;
 238     } while(codePointLimit!=q);
 239     writeCodePoint(q, c);
 240     if(cc<=1) {
 241         reorderStart=r;
 242     }
 243 }
 244
 245 // Normalizer2Impl --------------------------------------------------------- ***
 246
 247 struct CanonIterData : public UMemory {
 248     CanonIterData(UErrorCode &errorCode);
 249     ~CanonIterData();
 250     void addToStartSet(UChar32 origin, UChar32 decompLead, UErrorCode &errorCode);
 251     UTrie2 *trie;
 252     UVector canonStartSets;  // contains UnicodeSet *
 253 };
 254
 255 Normalizer2Impl::~Normalizer2Impl() {
 256     delete fCanonIterData;
 257 }
 258
 259 void
 260 Normalizer2Impl::init(const int32_t *inIndexes, const UTrie2 *inTrie,
 261                       const uint16_t *inExtraData, const uint8_t *inSmallFCD) {
 262     minDecompNoCP=inIndexes[IX_MIN_DECOMP_NO_CP];
 263     minCompNoMaybeCP=inIndexes[IX_MIN_COMP_NO_MAYBE_CP];
 264
 265     minYesNo=inIndexes[IX_MIN_YES_NO];
 266     minYesNoMappingsOnly=inIndexes[IX_MIN_YES_NO_MAPPINGS_ONLY];
 267     minNoNo=inIndexes[IX_MIN_NO_NO];
 268     limitNoNo=inIndexes[IX_LIMIT_NO_NO];
 269     minMaybeYes=inIndexes[IX_MIN_MAYBE_YES];
 270
 271     normTrie=inTrie;
 272
 273     maybeYesCompositions=inExtraData;
 274     extraData=maybeYesCompositions+(MIN_NORMAL_MAYBE_YES-minMaybeYes);
 275
 276     smallFCD=inSmallFCD;
 277
 278     // Build tccc180[].
 279     // gennorm2 enforces lccc=0 for c<MIN_CCC_LCCC_CP=U+0300.
 280     uint8_t bits=0;
 281     for(UChar c=0; c<0x180; bits>>=1) {
 282         if((c&0xff)==0) {
 283             bits=smallFCD[c>>8];  // one byte per 0x100 code points
 284         }
 285         if(bits&1) {
 286             for(int i=0; i<0x20; ++i, ++c) {
 287                 tccc180[c]=(uint8_t)getFCD16FromNormData(c);
 288             }
 289         } else {
 290             uprv_memset(tccc180+c, 0, 0x20);
 291             c+=0x20;
 292         }
 293     }
 294 }
 295
 296 uint8_t Normalizer2Impl::getTrailCCFromCompYesAndZeroCC(const UChar *cpStart, const UChar *cpLimit) const {
 297     UChar32 c;
 298     if(cpStart==(cpLimit-1)) {
 299         c=*cpStart;
 300     } else {
 301         c=U16_GET_SUPPLEMENTARY(cpStart[0], cpStart[1]);
 302     }
 303     uint16_t prevNorm16=getNorm16(c);
 304     if(prevNorm16<=minYesNo) {
 305         return 0;  // yesYes and Hangul LV/LVT have ccc=tccc=0
 306     } else {
 307         return (uint8_t)(*getMapping(prevNorm16)>>8);  // tccc from yesNo
 308     }
 309 }
 310
 311 namespace {
 312
 313 class LcccContext {
 314 public:
 315     LcccContext(const Normalizer2Impl &ni, UnicodeSet &s) : impl(ni), set(s) {}
 316
 317     void handleRange(UChar32 start, UChar32 end, uint16_t norm16) {
 318         if(impl.isAlgorithmicNoNo(norm16)) {
 319             // Range of code points with same-norm16-value algorithmic decompositions.
 320             // They might have different non-zero FCD16 values.
 321             do {
 322                 uint16_t fcd16=impl.getFCD16(start);
 323                 if(fcd16>0xff) { set.add(start); }
 324             } while(++start<=end);
 325         } else {
 326             uint16_t fcd16=impl.getFCD16(start);
 327             if(fcd16>0xff) { set.add(start, end); }
 328         }
 329     }
 330
 331 private:
 332     const Normalizer2Impl &impl;
 333     UnicodeSet &set;
 334 };
 335
 336 struct PropertyStartsContext {
 337     PropertyStartsContext(const Normalizer2Impl &ni, const USetAdder *adder)
 338             : impl(ni), sa(adder) {}
 339
 340     const Normalizer2Impl &impl;
 341     const USetAdder *sa;
 342 };
 343
 344 }  // namespace
 345
 346 U_CDECL_BEGIN
 347
 348 static UBool U_CALLCONV
 349 enumLcccRange(const void *context, UChar32 start, UChar32 end, uint32_t value) {
 350     ((LcccContext *)context)->handleRange(start, end, (uint16_t)value);
 351     return TRUE;
 352 }
 353
 354 static UBool U_CALLCONV
 355 enumNorm16PropertyStartsRange(const void *context, UChar32 start, UChar32 end, uint32_t value) {
 356     /* add the start code point to the USet */
 357     const PropertyStartsContext *ctx=(const PropertyStartsContext *)context;
 358     const USetAdder *sa=ctx->sa;
 359     sa->add(sa->set, start);
 360     if(start!=end && ctx->impl.isAlgorithmicNoNo((uint16_t)value)) {
 361         // Range of code points with same-norm16-value algorithmic decompositions.
 362         // They might have different non-zero FCD16 values.
 363         uint16_t prevFCD16=ctx->impl.getFCD16(start);
 364         while(++start<=end) {
 365             uint16_t fcd16=ctx->impl.getFCD16(start);
 366             if(fcd16!=prevFCD16) {
 367                 sa->add(sa->set, start);
 368                 prevFCD16=fcd16;
 369             }
 370         }
 371     }
 372     return TRUE;
 373 }
 374
 375 static UBool U_CALLCONV
 376 enumPropertyStartsRange(const void *context, UChar32 start, UChar32 /*end*/, uint32_t /*value*/) {
 377     /* add the start code point to the USet */
 378     const USetAdder *sa=(const USetAdder *)context;
 379     sa->add(sa->set, start);
 380     return TRUE;
 381 }
 382
 383 static uint32_t U_CALLCONV
 384 segmentStarterMapper(const void * /*context*/, uint32_t value) {
 385     return value&CANON_NOT_SEGMENT_STARTER;
 386 }
 387
 388 U_CDECL_END
 389
 390 void
 391 Normalizer2Impl::addLcccChars(UnicodeSet &set) const {
 392     /* add the start code point of each same-value range of each trie */
 393     LcccContext context(*this, set);
 394     utrie2_enum(normTrie, NULL, enumLcccRange, &context);
 395 }
 396
 397 void
 398 Normalizer2Impl::addPropertyStarts(const USetAdder *sa, UErrorCode & /*errorCode*/) const {
 399     /* add the start code point of each same-value range of each trie */
 400     PropertyStartsContext context(*this, sa);
 401     utrie2_enum(normTrie, NULL, enumNorm16PropertyStartsRange, &context);
 402
 403     /* add Hangul LV syllables and LV+1 because of skippables */
 404     for(UChar c=Hangul::HANGUL_BASE; c<Hangul::HANGUL_LIMIT; c+=Hangul::JAMO_T_COUNT) {
 405         sa->add(sa->set, c);
 406         sa->add(sa->set, c+1);
 407     }
 408     sa->add(sa->set, Hangul::HANGUL_LIMIT); /* add Hangul+1 to continue with other properties */
 409 }
 410
 411 void
 412 Normalizer2Impl::addCanonIterPropertyStarts(const USetAdder *sa, UErrorCode &errorCode) const {
 413     /* add the start code point of each same-value range of the canonical iterator data trie */
 414     if(ensureCanonIterData(errorCode)) {
 415         // currently only used for the SEGMENT_STARTER property
 416         utrie2_enum(fCanonIterData->trie, segmentStarterMapper, enumPropertyStartsRange, sa);
 417     }
 418 }
 419
 420 const UChar *
 421 Normalizer2Impl::copyLowPrefixFromNulTerminated(const UChar *src,
 422                                                 UChar32 minNeedDataCP,
 423                                                 ReorderingBuffer *buffer,
 424                                                 UErrorCode &errorCode) const {
 425     // Make some effort to support NUL-terminated strings reasonably.
 426     // Take the part of the fast quick check loop that does not look up
 427     // data and check the first part of the string.
 428     // After this prefix, determine the string length to simplify the rest
 429     // of the code.
 430     const UChar *prevSrc=src;
 431     UChar c;
 432     while((c=*src++)<minNeedDataCP && c!=0) {}
 433     // Back out the last character for full processing.
 434     // Copy this prefix.
 435     if(--src!=prevSrc) {
 436         if(buffer!=NULL) {
 437             buffer->appendZeroCC(prevSrc, src, errorCode);
 438         }
 439     }
 440     return src;
 441 }
 442
 443 UnicodeString &
 444 Normalizer2Impl::decompose(const UnicodeString &src, UnicodeString &dest,
 445                            UErrorCode &errorCode) const {
 446     if(U_FAILURE(errorCode)) {
 447         dest.setToBogus();
 448         return dest;
 449     }
 450     const UChar *sArray=src.getBuffer();
 451     if(&dest==&src || sArray==NULL) {
 452         errorCode=U_ILLEGAL_ARGUMENT_ERROR;
 453         dest.setToBogus();
 454         return dest;
 455     }
 456     decompose(sArray, sArray+src.length(), dest, src.length(), errorCode);
 457     return dest;
 458 }
 459
 460 void
 461 Normalizer2Impl::decompose(const UChar *src, const UChar *limit,
 462                            UnicodeString &dest,
 463                            int32_t destLengthEstimate,
 464                            UErrorCode &errorCode) const {
 465     if(destLengthEstimate<0 && limit!=NULL) {
 466         destLengthEstimate=(int32_t)(limit-src);
 467     }
 468     dest.remove();
 469     ReorderingBuffer buffer(*this, dest);
 470     if(buffer.init(destLengthEstimate, errorCode)) {
 471         decompose(src, limit, &buffer, errorCode);
 472     }
 473 }
 474
 475 // Dual functionality:
 476 // buffer!=NULL: normalize
 477 // buffer==NULL: isNormalized/spanQuickCheckYes
 478 const UChar *
 479 Normalizer2Impl::decompose(const UChar *src, const UChar *limit,
 480                            ReorderingBuffer *buffer,
 481                            UErrorCode &errorCode) const {
 482     UChar32 minNoCP=minDecompNoCP;
 483     if(limit==NULL) {
 484         src=copyLowPrefixFromNulTerminated(src, minNoCP, buffer, errorCode);
 485         if(U_FAILURE(errorCode)) {
 486             return src;
 487         }
 488         limit=u_strchr(src, 0);
 489     }
 490
 491     const UChar *prevSrc;
 492     UChar32 c=0;
 493     uint16_t norm16=0;
 494
 495     // only for quick check
 496     const UChar *prevBoundary=src;
 497     uint8_t prevCC=0;
 498
 499     for(;;) {
 500         // count code units below the minimum or with irrelevant data for the quick check
 501         for(prevSrc=src; src!=limit;) {
 502             if( (c=*src)<minNoCP ||
 503                 isMostDecompYesAndZeroCC(norm16=UTRIE2_GET16_FROM_U16_SINGLE_LEAD(normTrie, c))
 504             ) {
 505                 ++src;
 506             } else if(!U16_IS_SURROGATE(c)) {
 507                 break;
 508             } else {
 509                 UChar c2;
 510                 if(U16_IS_SURROGATE_LEAD(c)) {
 511                     if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) {
 512                         c=U16_GET_SUPPLEMENTARY(c, c2);
 513                     }
 514                 } else /* trail surrogate */ {
 515                     if(prevSrc<src && U16_IS_LEAD(c2=*(src-1))) {
 516                         --src;
 517                         c=U16_GET_SUPPLEMENTARY(c2, c);
 518                     }
 519                 }
 520                 if(isMostDecompYesAndZeroCC(norm16=getNorm16(c))) {
 521                     src+=U16_LENGTH(c);
 522                 } else {
 523                     break;
 524                 }
 525             }
 526         }
 527         // copy these code units all at once
 528         if(src!=prevSrc) {
 529             if(buffer!=NULL) {
 530                 if(!buffer->appendZeroCC(prevSrc, src, errorCode)) {
 531                     break;
 532                 }
 533             } else {
 534                 prevCC=0;
 535                 prevBoundary=src;
 536             }
 537         }
 538         if(src==limit) {
 539             break;
 540         }
 541
 542         // Check one above-minimum, relevant code point.
 543         src+=U16_LENGTH(c);
 544         if(buffer!=NULL) {
 545             if(!decompose(c, norm16, *buffer, errorCode)) {
 546                 break;
 547             }
 548         } else {
 549             if(isDecompYes(norm16)) {
 550                 uint8_t cc=getCCFromYesOrMaybe(norm16);
 551                 if(prevCC<=cc || cc==0) {
 552                     prevCC=cc;
 553                     if(cc<=1) {
 554                         prevBoundary=src;
 555                     }
 556                     continue;
 557                 }
 558             }
 559             return prevBoundary;  // "no" or cc out of order
 560         }
 561     }
 562     return src;
 563 }
 564
 565 // Decompose a short piece of text which is likely to contain characters that
 566 // fail the quick check loop and/or where the quick check loop's overhead
 567 // is unlikely to be amortized.
 568 // Called by the compose() and makeFCD() implementations.
 569 UBool Normalizer2Impl::decomposeShort(const UChar *src, const UChar *limit,
 570                                       ReorderingBuffer &buffer,
 571                                       UErrorCode &errorCode) const {
 572     while(src<limit) {
 573         UChar32 c;
 574         uint16_t norm16;
 575         UTRIE2_U16_NEXT16(normTrie, src, limit, c, norm16);
 576         if(!decompose(c, norm16, buffer, errorCode)) {
 577             return FALSE;
 578         }
 579     }
 580     return TRUE;
 581 }
 582
 583 UBool Normalizer2Impl::decompose(UChar32 c, uint16_t norm16,
 584                                  ReorderingBuffer &buffer,
 585                                  UErrorCode &errorCode) const {
 586     // Only loops for 1:1 algorithmic mappings.
 587     for(;;) {
 588         // get the decomposition and the lead and trail cc's
 589         if(isDecompYes(norm16)) {
 590             // c does not decompose
 591             return buffer.append(c, getCCFromYesOrMaybe(norm16), errorCode);
 592         } else if(isHangul(norm16)) {
 593             // Hangul syllable: decompose algorithmically
 594             UChar jamos[3];
 595             return buffer.appendZeroCC(jamos, jamos+Hangul::decompose(c, jamos), errorCode);
 596         } else if(isDecompNoAlgorithmic(norm16)) {
 597             c=mapAlgorithmic(c, norm16);
 598             norm16=getNorm16(c);
 599         } else {
 600             // c decomposes, get everything from the variable-length extra data
 601             const uint16_t *mapping=getMapping(norm16);
 602             uint16_t firstUnit=*mapping;
 603             int32_t length=firstUnit&MAPPING_LENGTH_MASK;
 604             uint8_t leadCC, trailCC;
 605             trailCC=(uint8_t)(firstUnit>>8);
 606             if(firstUnit&MAPPING_HAS_CCC_LCCC_WORD) {
 607                 leadCC=(uint8_t)(*(mapping-1)>>8);
 608             } else {
 609                 leadCC=0;
 610             }
 611             return buffer.append((const UChar *)mapping+1, length, leadCC, trailCC, errorCode);
 612         }
 613     }
 614 }
 615
 616 const UChar *
 617 Normalizer2Impl::getDecomposition(UChar32 c, UChar buffer[4], int32_t &length) const {
 618     const UChar *decomp=NULL;
 619     uint16_t norm16;
 620     for(;;) {
 621         if(c<minDecompNoCP || isDecompYes(norm16=getNorm16(c))) {
 622             // c does not decompose
 623             return decomp;
 624         } else if(isHangul(norm16)) {
 625             // Hangul syllable: decompose algorithmically
 626             length=Hangul::decompose(c, buffer);
 627             return buffer;
 628         } else if(isDecompNoAlgorithmic(norm16)) {
 629             c=mapAlgorithmic(c, norm16);
 630             decomp=buffer;
 631             length=0;
 632             U16_APPEND_UNSAFE(buffer, length, c);
 633         } else {
 634             // c decomposes, get everything from the variable-length extra data
 635             const uint16_t *mapping=getMapping(norm16);
 636             length=*mapping&MAPPING_LENGTH_MASK;
 637             return (const UChar *)mapping+1;
 638         }
 639     }
 640 }
 641
 642 // The capacity of the buffer must be 30=MAPPING_LENGTH_MASK-1
 643 // so that a raw mapping fits that consists of one unit ("rm0")
 644 // plus all but the first two code units of the normal mapping.
 645 // The maximum length of a normal mapping is 31=MAPPING_LENGTH_MASK.
 646 const UChar *
 647 Normalizer2Impl::getRawDecomposition(UChar32 c, UChar buffer[30], int32_t &length) const {
 648     // We do not loop in this method because an algorithmic mapping itself
 649     // becomes a final result rather than having to be decomposed recursively.
 650     uint16_t norm16;
 651     if(c<minDecompNoCP || isDecompYes(norm16=getNorm16(c))) {
 652         // c does not decompose
 653         return NULL;
 654     } else if(isHangul(norm16)) {
 655         // Hangul syllable: decompose algorithmically
 656         Hangul::getRawDecomposition(c, buffer);
 657         length=2;
 658         return buffer;
 659     } else if(isDecompNoAlgorithmic(norm16)) {
 660         c=mapAlgorithmic(c, norm16);
 661         length=0;
 662         U16_APPEND_UNSAFE(buffer, length, c);
 663         return buffer;
 664     } else {
 665         // c decomposes, get everything from the variable-length extra data
 666         const uint16_t *mapping=getMapping(norm16);
 667         uint16_t firstUnit=*mapping;
 668         int32_t mLength=firstUnit&MAPPING_LENGTH_MASK;  // length of normal mapping
 669         if(firstUnit&MAPPING_HAS_RAW_MAPPING) {
 670             // Read the raw mapping from before the firstUnit and before the optional ccc/lccc word.
 671             // Bit 7=MAPPING_HAS_CCC_LCCC_WORD
 672             const uint16_t *rawMapping=mapping-((firstUnit>>7)&1)-1;
 673             uint16_t rm0=*rawMapping;
 674             if(rm0<=MAPPING_LENGTH_MASK) {
 675                 length=rm0;
 676                 return (const UChar *)rawMapping-rm0;
 677             } else {
 678                 // Copy the normal mapping and replace its first two code units with rm0.
 679                 buffer[0]=(UChar)rm0;
 680                 u_memcpy(buffer+1, (const UChar *)mapping+1+2, mLength-2);
 681                 length=mLength-1;
 682                 return buffer;
 683             }
 684         } else {
 685             length=mLength;
 686             return (const UChar *)mapping+1;
 687         }
 688     }
 689 }
 690
 691 void Normalizer2Impl::decomposeAndAppend(const UChar *src, const UChar *limit,
 692                                          UBool doDecompose,
 693                                          UnicodeString &safeMiddle,
 694                                          ReorderingBuffer &buffer,
 695                                          UErrorCode &errorCode) const {
 696     buffer.copyReorderableSuffixTo(safeMiddle);
 697     if(doDecompose) {
 698         decompose(src, limit, &buffer, errorCode);
 699         return;
 700     }
 701     // Just merge the strings at the boundary.
 702     ForwardUTrie2StringIterator iter(normTrie, src, limit);
 703     uint8_t firstCC, prevCC, cc;
 704     firstCC=prevCC=cc=getCC(iter.next16());
 705     while(cc!=0) {
 706         prevCC=cc;
 707         cc=getCC(iter.next16());
 708     };
 709     if(limit==NULL) {  // appendZeroCC() needs limit!=NULL
 710         limit=u_strchr(iter.codePointStart, 0);
 711     }
 712
 713     if (buffer.append(src, (int32_t)(iter.codePointStart-src), firstCC, prevCC, errorCode)) {
 714         buffer.appendZeroCC(iter.codePointStart, limit, errorCode);
 715     }
 716 }
 717
 718 // Note: hasDecompBoundary() could be implemented as aliases to
 719 // hasFCDBoundaryBefore() and hasFCDBoundaryAfter()
 720 // at the cost of building the FCD trie for a decomposition normalizer.
 721 UBool Normalizer2Impl::hasDecompBoundary(UChar32 c, UBool before) const {
 722     for(;;) {
 723         if(c<minDecompNoCP) {
 724             return TRUE;
 725         }
 726         uint16_t norm16=getNorm16(c);
 727         if(isHangul(norm16) || isDecompYesAndZeroCC(norm16)) {
 728             return TRUE;
 729         } else if(norm16>MIN_NORMAL_MAYBE_YES) {
 730             return FALSE;  // ccc!=0
 731         } else if(isDecompNoAlgorithmic(norm16)) {
 732             c=mapAlgorithmic(c, norm16);
 733         } else {
 734             // c decomposes, get everything from the variable-length extra data
 735             const uint16_t *mapping=getMapping(norm16);
 736             uint16_t firstUnit=*mapping;
 737             if((firstUnit&MAPPING_LENGTH_MASK)==0) {
 738                 return FALSE;
 739             }
 740             if(!before) {
 741                 // decomp after-boundary: same as hasFCDBoundaryAfter(),
 742                 // fcd16<=1 || trailCC==0
 743                 if(firstUnit>0x1ff) {
 744                     return FALSE;  // trailCC>1
 745                 }
 746                 if(firstUnit<=0xff) {
 747                     return TRUE;  // trailCC==0
 748                 }
 749                 // if(trailCC==1) test leadCC==0, same as checking for before-boundary
 750             }
 751             // TRUE if leadCC==0 (hasFCDBoundaryBefore())
 752             return (firstUnit&MAPPING_HAS_CCC_LCCC_WORD)==0 || (*(mapping-1)&0xff00)==0;
 753         }
 754     }
 755 }
 756
 757 /*
 758  * Finds the recomposition result for
 759  * a forward-combining "lead" character,
 760  * specified with a pointer to its compositions list,
 761  * and a backward-combining "trail" character.
 762  *
 763  * If the lead and trail characters combine, then this function returns
 764  * the following "compositeAndFwd" value:
 765  * Bits 21..1  composite character
 766  * Bit      0  set if the composite is a forward-combining starter
 767  * otherwise it returns -1.
 768  *
 769  * The compositions list has (trail, compositeAndFwd) pair entries,
 770  * encoded as either pairs or triples of 16-bit units.
 771  * The last entry has the high bit of its first unit set.
 772  *
 773  * The list is sorted by ascending trail characters (there are no duplicates).
 774  * A linear search is used.
 775  *
 776  * See normalizer2impl.h for a more detailed description
 777  * of the compositions list format.
 778  */
 779 int32_t Normalizer2Impl::combine(const uint16_t *list, UChar32 trail) {
 780     uint16_t key1, firstUnit;
 781     if(trail<COMP_1_TRAIL_LIMIT) {
 782         // trail character is 0..33FF
 783         // result entry may have 2 or 3 units
 784         key1=(uint16_t)(trail<<1);
 785         while(key1>(firstUnit=*list)) {
 786             list+=2+(firstUnit&COMP_1_TRIPLE);
 787         }
 788         if(key1==(firstUnit&COMP_1_TRAIL_MASK)) {
 789             if(firstUnit&COMP_1_TRIPLE) {
 790                 return ((int32_t)list[1]<<16)|list[2];
 791             } else {
 792                 return list[1];
 793             }
 794         }
 795     } else {
 796         // trail character is 3400..10FFFF
 797         // result entry has 3 units
 798         key1=(uint16_t)(COMP_1_TRAIL_LIMIT+
 799                         (((trail>>COMP_1_TRAIL_SHIFT))&
 800                           ~COMP_1_TRIPLE));
 801         uint16_t key2=(uint16_t)(trail<<COMP_2_TRAIL_SHIFT);
 802         uint16_t secondUnit;
 803         for(;;) {
 804             if(key1>(firstUnit=*list)) {
 805                 list+=2+(firstUnit&COMP_1_TRIPLE);
 806             } else if(key1==(firstUnit&COMP_1_TRAIL_MASK)) {
 807                 if(key2>(secondUnit=list[1])) {
 808                     if(firstUnit&COMP_1_LAST_TUPLE) {
 809                         break;
 810                     } else {
 811                         list+=3;
 812                     }
 813                 } else if(key2==(secondUnit&COMP_2_TRAIL_MASK)) {
 814                     return ((int32_t)(secondUnit&~COMP_2_TRAIL_MASK)<<16)|list[2];
 815                 } else {
 816                     break;
 817                 }
 818             } else {
 819                 break;
 820             }
 821         }
 822     }
 823     return -1;
 824 }
 825
 826 /**
 827   * @param list some character's compositions list
 828   * @param set recursively receives the composites from these compositions
 829   */
 830 void Normalizer2Impl::addComposites(const uint16_t *list, UnicodeSet &set) const {
 831     uint16_t firstUnit;
 832     int32_t compositeAndFwd;
 833     do {
 834         firstUnit=*list;
 835         if((firstUnit&COMP_1_TRIPLE)==0) {
 836             compositeAndFwd=list[1];
 837             list+=2;
 838         } else {
 839             compositeAndFwd=(((int32_t)list[1]&~COMP_2_TRAIL_MASK)<<16)|list[2];
 840             list+=3;
 841         }
 842         UChar32 composite=compositeAndFwd>>1;
 843         if((compositeAndFwd&1)!=0) {
 844             addComposites(getCompositionsListForComposite(getNorm16(composite)), set);
 845         }
 846         set.add(composite);
 847     } while((firstUnit&COMP_1_LAST_TUPLE)==0);
 848 }
 849
 850 /*
 851  * Recomposes the buffer text starting at recomposeStartIndex
 852  * (which is in NFD - decomposed and canonically ordered),
 853  * and truncates the buffer contents.
 854  *
 855  * Note that recomposition never lengthens the text:
 856  * Any character consists of either one or two code units;
 857  * a composition may contain at most one more code unit than the original starter,
 858  * while the combining mark that is removed has at least one code unit.
 859  */
 860 void Normalizer2Impl::recompose(ReorderingBuffer &buffer, int32_t recomposeStartIndex,
 861                                 UBool onlyContiguous) const {
 862     UChar *p=buffer.getStart()+recomposeStartIndex;
 863     UChar *limit=buffer.getLimit();
 864     if(p==limit) {
 865         return;
 866     }
 867
 868     UChar *starter, *pRemove, *q, *r;
 869     const uint16_t *compositionsList;
 870     UChar32 c, compositeAndFwd;
 871     uint16_t norm16;
 872     uint8_t cc, prevCC;
 873     UBool starterIsSupplementary;
 874
 875     // Some of the following variables are not used until we have a forward-combining starter
 876     // and are only initialized now to avoid compiler warnings.
 877     compositionsList=NULL;  // used as indicator for whether we have a forward-combining starter
 878     starter=NULL;
 879     starterIsSupplementary=FALSE;
 880     prevCC=0;
 881
 882     for(;;) {
 883         UTRIE2_U16_NEXT16(normTrie, p, limit, c, norm16);
 884         cc=getCCFromYesOrMaybe(norm16);
 885         if( // this character combines backward and
 886             isMaybe(norm16) &&
 887             // we have seen a starter that combines forward and
 888             compositionsList!=NULL &&
 889             // the backward-combining character is not blocked
 890             (prevCC<cc || prevCC==0)
 891         ) {
 892             if(isJamoVT(norm16)) {
 893                 // c is a Jamo V/T, see if we can compose it with the previous character.
 894                 if(c<Hangul::JAMO_T_BASE) {
 895                     // c is a Jamo Vowel, compose with previous Jamo L and following Jamo T.
 896                     UChar prev=(UChar)(*starter-Hangul::JAMO_L_BASE);
 897                     if(prev<Hangul::JAMO_L_COUNT) {
 898                         pRemove=p-1;
 899                         UChar syllable=(UChar)
 900                             (Hangul::HANGUL_BASE+
 901                              (prev*Hangul::JAMO_V_COUNT+(c-Hangul::JAMO_V_BASE))*
 902                              Hangul::JAMO_T_COUNT);
 903                         UChar t;
 904                         if(p!=limit && (t=(UChar)(*p-Hangul::JAMO_T_BASE))<Hangul::JAMO_T_COUNT) {
 905                             ++p;
 906                             syllable+=t;  // The next character was a Jamo T.
 907                         }
 908                         *starter=syllable;
 909                         // remove the Jamo V/T
 910                         q=pRemove;
 911                         r=p;
 912                         while(r<limit) {
 913                             *q++=*r++;
 914                         }
 915                         limit=q;
 916                         p=pRemove;
 917                     }
 918                 }
 919                 /*
 920                  * No "else" for Jamo T:
 921                  * Since the input is in NFD, there are no Hangul LV syllables that
 922                  * a Jamo T could combine with.
 923                  * All Jamo Ts are combined above when handling Jamo Vs.
 924                  */
 925                 if(p==limit) {
 926                     break;
 927                 }
 928                 compositionsList=NULL;
 929                 continue;
 930             } else if((compositeAndFwd=combine(compositionsList, c))>=0) {
 931                 // The starter and the combining mark (c) do combine.
 932                 UChar32 composite=compositeAndFwd>>1;
 933
 934                 // Replace the starter with the composite, remove the combining mark.
 935                 pRemove=p-U16_LENGTH(c);  // pRemove & p: start & limit of the combining mark
 936                 if(starterIsSupplementary) {
 937                     if(U_IS_SUPPLEMENTARY(composite)) {
 938                         // both are supplementary
 939                         starter[0]=U16_LEAD(composite);
 940                         starter[1]=U16_TRAIL(composite);
 941                     } else {
 942                         *starter=(UChar)composite;
 943                         // The composite is shorter than the starter,
 944                         // move the intermediate characters forward one.
 945                         starterIsSupplementary=FALSE;
 946                         q=starter+1;
 947                         r=q+1;
 948                         while(r<pRemove) {
 949                             *q++=*r++;
 950                         }
 951                         --pRemove;
 952                     }
 953                 } else if(U_IS_SUPPLEMENTARY(composite)) {
 954                     // The composite is longer than the starter,
 955                     // move the intermediate characters back one.
 956                     starterIsSupplementary=TRUE;
 957                     ++starter;  // temporarily increment for the loop boundary
 958                     q=pRemove;
 959                     r=++pRemove;
 960                     while(starter<q) {
 961                         *--r=*--q;
 962                     }
 963                     *starter=U16_TRAIL(composite);
 964                     *--starter=U16_LEAD(composite);  // undo the temporary increment
 965                 } else {
 966                     // both are on the BMP
 967                     *starter=(UChar)composite;
 968                 }
 969
 970                 /* remove the combining mark by moving the following text over it */
 971                 if(pRemove<p) {
 972                     q=pRemove;
 973                     r=p;
 974                     while(r<limit) {
 975                         *q++=*r++;
 976                     }
 977                     limit=q;
 978                     p=pRemove;
 979                 }
 980                 // Keep prevCC because we removed the combining mark.
 981
 982                 if(p==limit) {
 983                     break;
 984                 }
 985                 // Is the composite a starter that combines forward?
 986                 if(compositeAndFwd&1) {
 987                     compositionsList=
 988                         getCompositionsListForComposite(getNorm16(composite));
 989                 } else {
 990                     compositionsList=NULL;
 991                 }
 992
 993                 // We combined; continue with looking for compositions.
 994                 continue;
 995             }
 996         }
 997
 998         // no combination this time
 999         prevCC=cc;
1000         if(p==limit) {
1001             break;
1002         }
1003
1004         // If c did not combine, then check if it is a starter.
1005         if(cc==0) {
1006             // Found a new starter.
1007             if((compositionsList=getCompositionsListForDecompYes(norm16))!=NULL) {
1008                 // It may combine with something, prepare for it.
1009                 if(U_IS_BMP(c)) {
1010                     starterIsSupplementary=FALSE;
1011                     starter=p-1;
1012                 } else {
1013                     starterIsSupplementary=TRUE;
1014                     starter=p-2;
1015                 }
1016             }
1017         } else if(onlyContiguous) {
1018             // FCC: no discontiguous compositions; any intervening character blocks.
1019             compositionsList=NULL;
1020         }
1021     }
1022     buffer.setReorderingLimit(limit);
1023 }
1024
1025 UChar32
1026 Normalizer2Impl::composePair(UChar32 a, UChar32 b) const {
1027     uint16_t norm16=getNorm16(a);  // maps an out-of-range 'a' to inert norm16=0
1028     const uint16_t *list;
1029     if(isInert(norm16)) {
1030         return U_SENTINEL;
1031     } else if(norm16<minYesNoMappingsOnly) {
1032         if(isJamoL(norm16)) {
1033             b-=Hangul::JAMO_V_BASE;
1034             if(0<=b && b<Hangul::JAMO_V_COUNT) {
1035                 return
1036                     (Hangul::HANGUL_BASE+
1037                      ((a-Hangul::JAMO_L_BASE)*Hangul::JAMO_V_COUNT+b)*
1038                      Hangul::JAMO_T_COUNT);
1039             } else {
1040                 return U_SENTINEL;
1041             }
1042         } else if(isHangul(norm16)) {
1043             b-=Hangul::JAMO_T_BASE;
1044             if(Hangul::isHangulWithoutJamoT(a) && 0<b && b<Hangul::JAMO_T_COUNT) {  // not b==0!
1045                 return a+b;
1046             } else {
1047                 return U_SENTINEL;
1048             }
1049         } else {
1050             // 'a' has a compositions list in extraData
1051             list=extraData+norm16;
1052             if(norm16>minYesNo) {  // composite 'a' has both mapping & compositions list
1053                 list+=  // mapping pointer
1054                     1+  // +1 to skip the first unit with the mapping lenth
1055                     (*list&MAPPING_LENGTH_MASK);  // + mapping length
1056             }
1057         }
1058     } else if(norm16<minMaybeYes || MIN_NORMAL_MAYBE_YES<=norm16) {
1059         return U_SENTINEL;
1060     } else {
1061         list=maybeYesCompositions+norm16-minMaybeYes;
1062     }
1063     if(b<0 || 0x10ffff<b) {  // combine(list, b) requires a valid code point b
1064         return U_SENTINEL;
1065     }
1066 #if U_SIGNED_RIGHT_SHIFT_IS_ARITHMETIC
1067     return combine(list, b)>>1;
1068 #else
1069     int32_t compositeAndFwd=combine(list, b);
1070     return compositeAndFwd>=0 ? compositeAndFwd>>1 : U_SENTINEL;
1071 #endif
1072 }
1073
1074 // Very similar to composeQuickCheck(): Make the same changes in both places if relevant.
1075 // doCompose: normalize
1076 // !doCompose: isNormalized (buffer must be empty and initialized)
1077 UBool
1078 Normalizer2Impl::compose(const UChar *src, const UChar *limit,
1079                          UBool onlyContiguous,
1080                          UBool doCompose,
1081                          ReorderingBuffer &buffer,
1082                          UErrorCode &errorCode) const {
1083     /*
1084      * prevBoundary points to the last character before the current one
1085      * that has a composition boundary before it with ccc==0 and quick check "yes".
1086      * Keeping track of prevBoundary saves us looking for a composition boundary
1087      * when we find a "no" or "maybe".
1088      *
1089      * When we back out from prevSrc back to prevBoundary,
1090      * then we also remove those same characters (which had been simply copied
1091      * or canonically-order-inserted) from the ReorderingBuffer.
1092      * Therefore, at all times, the [prevBoundary..prevSrc[ source units
1093      * must correspond 1:1 to destination units at the end of the destination buffer.
1094      */
1095     const UChar *prevBoundary=src;
1096     UChar32 minNoMaybeCP=minCompNoMaybeCP;
1097     if(limit==NULL) {
1098         src=copyLowPrefixFromNulTerminated(src, minNoMaybeCP,
1099                                            doCompose ? &buffer : NULL,
1100                                            errorCode);
1101         if(U_FAILURE(errorCode)) {
1102             return FALSE;
1103         }
1104         if(prevBoundary<src) {
1105             // Set prevBoundary to the last character in the prefix.
1106             prevBoundary=src-1;
1107         }
1108         limit=u_strchr(src, 0);
1109     }
1110
1111     const UChar *prevSrc;
1112     UChar32 c=0;
1113     uint16_t norm16=0;
1114
1115     // only for isNormalized
1116     uint8_t prevCC=0;
1117
1118     for(;;) {
1119         // count code units below the minimum or with irrelevant data for the quick check
1120         for(prevSrc=src; src!=limit;) {
1121             if( (c=*src)<minNoMaybeCP ||
1122                 isCompYesAndZeroCC(norm16=UTRIE2_GET16_FROM_U16_SINGLE_LEAD(normTrie, c))
1123             ) {
1124                 ++src;
1125             } else if(!U16_IS_SURROGATE(c)) {
1126                 break;
1127             } else {
1128                 UChar c2;
1129                 if(U16_IS_SURROGATE_LEAD(c)) {
1130                     if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) {
1131                         c=U16_GET_SUPPLEMENTARY(c, c2);
1132                     }
1133                 } else /* trail surrogate */ {
1134                     if(prevSrc<src && U16_IS_LEAD(c2=*(src-1))) {
1135                         --src;
1136                         c=U16_GET_SUPPLEMENTARY(c2, c);
1137                     }
1138                 }
1139                 if(isCompYesAndZeroCC(norm16=getNorm16(c))) {
1140                     src+=U16_LENGTH(c);
1141                 } else {
1142                     break;
1143                 }
1144             }
1145         }
1146         // copy these code units all at once
1147         if(src!=prevSrc) {
1148             if(doCompose) {
1149                 if(!buffer.appendZeroCC(prevSrc, src, errorCode)) {
1150                     break;
1151                 }
1152             } else {
1153                 prevCC=0;
1154             }
1155             if(src==limit) {
1156                 break;
1157             }
1158             // Set prevBoundary to the last character in the quick check loop.
1159             prevBoundary=src-1;
1160             if( U16_IS_TRAIL(*prevBoundary) && prevSrc<prevBoundary &&
1161                 U16_IS_LEAD(*(prevBoundary-1))
1162             ) {
1163                 --prevBoundary;
1164             }
1165             // The start of the current character (c).
1166             prevSrc=src;
1167         } else if(src==limit) {
1168             break;
1169         }
1170
1171         src+=U16_LENGTH(c);
1172         /*
1173          * isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.
1174          * c is either a "noNo" (has a mapping) or a "maybeYes" (combines backward)
1175          * or has ccc!=0.
1176          * Check for Jamo V/T, then for regular characters.
1177          * c is not a Hangul syllable or Jamo L because those have "yes" properties.
1178          */
1179         if(isJamoVT(norm16) && prevBoundary!=prevSrc) {
1180             UChar prev=*(prevSrc-1);
1181             UBool needToDecompose=FALSE;
1182             if(c<Hangul::JAMO_T_BASE) {
1183                 // c is a Jamo Vowel, compose with previous Jamo L and following Jamo T.
1184                 prev=(UChar)(prev-Hangul::JAMO_L_BASE);
1185                 if(prev<Hangul::JAMO_L_COUNT) {
1186                     if(!doCompose) {
1187                         return FALSE;
1188                     }
1189                     UChar syllable=(UChar)
1190                         (Hangul::HANGUL_BASE+
1191                          (prev*Hangul::JAMO_V_COUNT+(c-Hangul::JAMO_V_BASE))*
1192                          Hangul::JAMO_T_COUNT);
1193                     UChar t;
1194                     if(src!=limit && (t=(UChar)(*src-Hangul::JAMO_T_BASE))<Hangul::JAMO_T_COUNT) {
1195                         ++src;
1196                         syllable+=t;  // The next character was a Jamo T.
1197                         prevBoundary=src;
1198                         buffer.setLastChar(syllable);
1199                         continue;
1200                     }
1201                     // If we see L+V+x where x!=T then we drop to the slow path,
1202                     // decompose and recompose.
1203                     // This is to deal with NFKC finding normal L and V but a
1204                     // compatibility variant of a T. We need to either fully compose that
1205                     // combination here (which would complicate the code and may not work
1206                     // with strange custom data) or use the slow path -- or else our replacing
1207                     // two input characters (L+V) with one output character (LV syllable)
1208                     // would violate the invariant that [prevBoundary..prevSrc[ has the same
1209                     // length as what we appended to the buffer since prevBoundary.
1210                     needToDecompose=TRUE;
1211                 }
1212             } else if(Hangul::isHangulWithoutJamoT(prev)) {
1213                 // c is a Jamo Trailing consonant,
1214                 // compose with previous Hangul LV that does not contain a Jamo T.
1215                 if(!doCompose) {
1216                     return FALSE;
1217                 }
1218                 buffer.setLastChar((UChar)(prev+c-Hangul::JAMO_T_BASE));
1219                 prevBoundary=src;
1220                 continue;
1221             }
1222             if(!needToDecompose) {
1223                 // The Jamo V/T did not compose into a Hangul syllable.
1224                 if(doCompose) {
1225                     if(!buffer.appendBMP((UChar)c, 0, errorCode)) {
1226                         break;
1227                     }
1228                 } else {
1229                     prevCC=0;
1230                 }
1231                 continue;
1232             }
1233         }
1234         /*
1235          * Source buffer pointers:
1236          *
1237          *  all done      quick check   current char  not yet
1238          *                "yes" but     (c)           processed
1239          *                may combine
1240          *                forward
1241          * [-------------[-------------[-------------[-------------[
1242          * |             |             |             |             |
1243          * orig. src     prevBoundary  prevSrc       src           limit
1244          *
1245          *
1246          * Destination buffer pointers inside the ReorderingBuffer:
1247          *
1248          *  all done      might take    not filled yet
1249          *                characters for
1250          *                reordering
1251          * [-------------[-------------[-------------[
1252          * |             |             |             |
1253          * start         reorderStart  limit         |
1254          *                             +remainingCap.+
1255          */
1256         if(norm16>=MIN_YES_YES_WITH_CC) {
1257             uint8_t cc=(uint8_t)norm16;  // cc!=0
1258             if( onlyContiguous &&  // FCC
1259                 (doCompose ? buffer.getLastCC() : prevCC)==0 &&
1260                 prevBoundary<prevSrc &&
1261                 // buffer.getLastCC()==0 && prevBoundary<prevSrc tell us that
1262                 // [prevBoundary..prevSrc[ (which is exactly one character under these conditions)
1263                 // passed the quick check "yes && ccc==0" test.
1264                 // Check whether the last character was a "yesYes" or a "yesNo".
1265                 // If a "yesNo", then we get its trailing ccc from its
1266                 // mapping and check for canonical order.
1267                 // All other cases are ok.
1268                 getTrailCCFromCompYesAndZeroCC(prevBoundary, prevSrc)>cc
1269             ) {
1270                 // Fails FCD test, need to decompose and contiguously recompose.
1271                 if(!doCompose) {
1272                     return FALSE;
1273                 }
1274             } else if(doCompose) {
1275                 if(!buffer.append(c, cc, errorCode)) {
1276                     break;
1277                 }
1278                 continue;
1279             } else if(prevCC<=cc) {
1280                 prevCC=cc;
1281                 continue;
1282             } else {
1283                 return FALSE;
1284             }
1285         } else if(!doCompose && !isMaybeOrNonZeroCC(norm16)) {
1286             return FALSE;
1287         }
1288
1289         /*
1290          * Find appropriate boundaries around this character,
1291          * decompose the source text from between the boundaries,
1292          * and recompose it.
1293          *
1294          * We may need to remove the last few characters from the ReorderingBuffer
1295          * to account for source text that was copied or appended
1296          * but needs to take part in the recomposition.
1297          */
1298
1299         /*
1300          * Find the last composition boundary in [prevBoundary..src[.
1301          * It is either the decomposition of the current character (at prevSrc),
1302          * or prevBoundary.
1303          */
1304         if(hasCompBoundaryBefore(c, norm16)) {
1305             prevBoundary=prevSrc;
1306         } else if(doCompose) {
1307             buffer.removeSuffix((int32_t)(prevSrc-prevBoundary));
1308         }
1309
1310         // Find the next composition boundary in [src..limit[ -
1311         // modifies src to point to the next starter.
1312         src=(UChar *)findNextCompBoundary(src, limit);
1313
1314         // Decompose [prevBoundary..src[ into the buffer and then recompose that part of it.
1315         int32_t recomposeStartIndex=buffer.length();
1316         if(!decomposeShort(prevBoundary, src, buffer, errorCode)) {
1317             break;
1318         }
1319         recompose(buffer, recomposeStartIndex, onlyContiguous);
1320         if(!doCompose) {
1321             if(!buffer.equals(prevBoundary, src)) {
1322                 return FALSE;
1323             }
1324             buffer.remove();
1325             prevCC=0;
1326         }
1327
1328         // Move to the next starter. We never need to look back before this point again.
1329         prevBoundary=src;
1330     }
1331     return TRUE;
1332 }
1333
1334 // Very similar to compose(): Make the same changes in both places if relevant.
1335 // pQCResult==NULL: spanQuickCheckYes
1336 // pQCResult!=NULL: quickCheck (*pQCResult must be UNORM_YES)
1337 const UChar *
1338 Normalizer2Impl::composeQuickCheck(const UChar *src, const UChar *limit,
1339                                    UBool onlyContiguous,
1340                                    UNormalizationCheckResult *pQCResult) const {
1341     /*
1342      * prevBoundary points to the last character before the current one
1343      * that has a composition boundary before it with ccc==0 and quick check "yes".
1344      */
1345     const UChar *prevBoundary=src;
1346     UChar32 minNoMaybeCP=minCompNoMaybeCP;
1347     if(limit==NULL) {
1348         UErrorCode errorCode=U_ZERO_ERROR;
1349         src=copyLowPrefixFromNulTerminated(src, minNoMaybeCP, NULL, errorCode);
1350         if(prevBoundary<src) {
1351             // Set prevBoundary to the last character in the prefix.
1352             prevBoundary=src-1;
1353         }
1354         limit=u_strchr(src, 0);
1355     }
1356
1357     const UChar *prevSrc;
1358     UChar32 c=0;
1359     uint16_t norm16=0;
1360     uint8_t prevCC=0;
1361
1362     for(;;) {
1363         // count code units below the minimum or with irrelevant data for the quick check
1364         for(prevSrc=src;;) {
1365             if(src==limit) {
1366                 return src;
1367             }
1368             if( (c=*src)<minNoMaybeCP ||
1369                 isCompYesAndZeroCC(norm16=UTRIE2_GET16_FROM_U16_SINGLE_LEAD(normTrie, c))
1370             ) {
1371                 ++src;
1372             } else if(!U16_IS_SURROGATE(c)) {
1373                 break;
1374             } else {
1375                 UChar c2;
1376                 if(U16_IS_SURROGATE_LEAD(c)) {
1377                     if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) {
1378                         c=U16_GET_SUPPLEMENTARY(c, c2);
1379                     }
1380                 } else /* trail surrogate */ {
1381                     if(prevSrc<src && U16_IS_LEAD(c2=*(src-1))) {
1382                         --src;
1383                         c=U16_GET_SUPPLEMENTARY(c2, c);
1384                     }
1385                 }
1386                 if(isCompYesAndZeroCC(norm16=getNorm16(c))) {
1387                     src+=U16_LENGTH(c);
1388                 } else {
1389                     break;
1390                 }
1391             }
1392         }
1393         if(src!=prevSrc) {
1394             // Set prevBoundary to the last character in the quick check loop.
1395             prevBoundary=src-1;
1396             if( U16_IS_TRAIL(*prevBoundary) && prevSrc<prevBoundary &&
1397                 U16_IS_LEAD(*(prevBoundary-1))
1398             ) {
1399                 --prevBoundary;
1400             }
1401             prevCC=0;
1402             // The start of the current character (c).
1403             prevSrc=src;
1404         }
1405
1406         src+=U16_LENGTH(c);
1407         /*
1408          * isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.
1409          * c is either a "noNo" (has a mapping) or a "maybeYes" (combines backward)
1410          * or has ccc!=0.
1411          */
1412         if(isMaybeOrNonZeroCC(norm16)) {
1413             uint8_t cc=getCCFromYesOrMaybe(norm16);
1414             if( onlyContiguous &&  // FCC
1415                 cc!=0 &&
1416                 prevCC==0 &&
1417                 prevBoundary<prevSrc &&
1418                 // prevCC==0 && prevBoundary<prevSrc tell us that
1419                 // [prevBoundary..prevSrc[ (which is exactly one character under these conditions)
1420                 // passed the quick check "yes && ccc==0" test.
1421                 // Check whether the last character was a "yesYes" or a "yesNo".
1422                 // If a "yesNo", then we get its trailing ccc from its
1423                 // mapping and check for canonical order.
1424                 // All other cases are ok.
1425                 getTrailCCFromCompYesAndZeroCC(prevBoundary, prevSrc)>cc
1426             ) {
1427                 // Fails FCD test.
1428             } else if(prevCC<=cc || cc==0) {
1429                 prevCC=cc;
1430                 if(norm16<MIN_YES_YES_WITH_CC) {
1431                     if(pQCResult!=NULL) {
1432                         *pQCResult=UNORM_MAYBE;
1433                     } else {
1434                         return prevBoundary;
1435                     }
1436                 }
1437                 continue;
1438             }
1439         }
1440         if(pQCResult!=NULL) {
1441             *pQCResult=UNORM_NO;
1442         }
1443         return prevBoundary;
1444     }
1445 }
1446
1447 void Normalizer2Impl::composeAndAppend(const UChar *src, const UChar *limit,
1448                                        UBool doCompose,
1449                                        UBool onlyContiguous,
1450                                        UnicodeString &safeMiddle,
1451                                        ReorderingBuffer &buffer,
1452                                        UErrorCode &errorCode) const {
1453     if(!buffer.isEmpty()) {
1454         const UChar *firstStarterInSrc=findNextCompBoundary(src, limit);
1455         if(src!=firstStarterInSrc) {
1456             const UChar *lastStarterInDest=findPreviousCompBoundary(buffer.getStart(),
1457                                                                     buffer.getLimit());
1458             int32_t destSuffixLength=(int32_t)(buffer.getLimit()-lastStarterInDest);
1459             UnicodeString middle(lastStarterInDest, destSuffixLength);
1460             buffer.removeSuffix(destSuffixLength);
1461             safeMiddle=middle;
1462             middle.append(src, (int32_t)(firstStarterInSrc-src));
1463             const UChar *middleStart=middle.getBuffer();
1464             compose(middleStart, middleStart+middle.length(), onlyContiguous,
1465                     TRUE, buffer, errorCode);
1466             if(U_FAILURE(errorCode)) {
1467                 return;
1468             }
1469             src=firstStarterInSrc;
1470         }
1471     }
1472     if(doCompose) {
1473         compose(src, limit, onlyContiguous, TRUE, buffer, errorCode);
1474     } else {
1475         if(limit==NULL) {  // appendZeroCC() needs limit!=NULL
1476             limit=u_strchr(src, 0);
1477         }
1478         buffer.appendZeroCC(src, limit, errorCode);
1479     }
1480 }
1481
1482 /**
1483  * Does c have a composition boundary before it?
1484  * True if its decomposition begins with a character that has
1485  * ccc=0 && NFC_QC=Yes (isCompYesAndZeroCC()).
1486  * As a shortcut, this is true if c itself has ccc=0 && NFC_QC=Yes
1487  * (isCompYesAndZeroCC()) so we need not decompose.
1488  */
1489 UBool Normalizer2Impl::hasCompBoundaryBefore(UChar32 c, uint16_t norm16) const {
1490     for(;;) {
1491         if(isCompYesAndZeroCC(norm16)) {
1492             return TRUE;
1493         } else if(isMaybeOrNonZeroCC(norm16)) {
1494             return FALSE;
1495         } else if(isDecompNoAlgorithmic(norm16)) {
1496             c=mapAlgorithmic(c, norm16);
1497             norm16=getNorm16(c);
1498         } else {
1499             // c decomposes, get everything from the variable-length extra data
1500             const uint16_t *mapping=getMapping(norm16);
1501             uint16_t firstUnit=*mapping;
1502             if((firstUnit&MAPPING_LENGTH_MASK)==0) {
1503                 return FALSE;
1504             }
1505             if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD) && (*(mapping-1)&0xff00)) {
1506                 return FALSE;  // non-zero leadCC
1507             }
1508             int32_t i=1;  // skip over the firstUnit
1509             UChar32 c;
1510             U16_NEXT_UNSAFE(mapping, i, c);
1511             return isCompYesAndZeroCC(getNorm16(c));
1512         }
1513     }
1514 }
1515
1516 UBool Normalizer2Impl::hasCompBoundaryAfter(UChar32 c, UBool onlyContiguous, UBool testInert) const {
1517     for(;;) {
1518         uint16_t norm16=getNorm16(c);
1519         if(isInert(norm16)) {
1520             return TRUE;
1521         } else if(norm16<=minYesNo) {
1522             // Hangul: norm16==minYesNo
1523             // Hangul LVT has a boundary after it.
1524             // Hangul LV and non-inert yesYes characters combine forward.
1525             return isHangul(norm16) && !Hangul::isHangulWithoutJamoT((UChar)c);
1526         } else if(norm16>= (testInert ? minNoNo : minMaybeYes)) {
1527             return FALSE;
1528         } else if(isDecompNoAlgorithmic(norm16)) {
1529             c=mapAlgorithmic(c, norm16);
1530         } else {
1531             // c decomposes, get everything from the variable-length extra data.
1532             // If testInert, then c must be a yesNo character which has lccc=0,
1533             // otherwise it could be a noNo.
1534             const uint16_t *mapping=getMapping(norm16);
1535             uint16_t firstUnit=*mapping;
1536             // TRUE if
1537             //   not MAPPING_NO_COMP_BOUNDARY_AFTER
1538             //     (which is set if
1539             //       c is not deleted, and
1540             //       it and its decomposition do not combine forward, and it has a starter)
1541             //   and if FCC then trailCC<=1
1542             return
1543                 (firstUnit&MAPPING_NO_COMP_BOUNDARY_AFTER)==0 &&
1544                 (!onlyContiguous || firstUnit<=0x1ff);
1545         }
1546     }
1547 }
1548
1549 const UChar *Normalizer2Impl::findPreviousCompBoundary(const UChar *start, const UChar *p) const {
1550     BackwardUTrie2StringIterator iter(normTrie, start, p);
1551     uint16_t norm16;
1552     do {
1553         norm16=iter.previous16();
1554     } while(!hasCompBoundaryBefore(iter.codePoint, norm16));
1555     // We could also test hasCompBoundaryAfter() and return iter.codePointLimit,
1556     // but that's probably not worth the extra cost.
1557     return iter.codePointStart;
1558 }
1559
1560 const UChar *Normalizer2Impl::findNextCompBoundary(const UChar *p, const UChar *limit) const {
1561     ForwardUTrie2StringIterator iter(normTrie, p, limit);
1562     uint16_t norm16;
1563     do {
1564         norm16=iter.next16();
1565     } while(!hasCompBoundaryBefore(iter.codePoint, norm16));
1566     return iter.codePointStart;
1567 }
1568
1569 // Note: normalizer2impl.cpp r30982 (2011-nov-27)
1570 // still had getFCDTrie() which built and cached an FCD trie.
1571 // That provided faster access to FCD data than getFCD16FromNormData()
1572 // but required synchronization and consumed some 10kB of heap memory
1573 // in any process that uses FCD (e.g., via collation).
1574 // tccc180[] and smallFCD[] are intended to help with any loss of performance,
1575 // at least for Latin & CJK.
1576
1577 // Gets the FCD value from the regular normalization data.
1578 uint16_t Normalizer2Impl::getFCD16FromNormData(UChar32 c) const {
1579     // Only loops for 1:1 algorithmic mappings.
1580     for(;;) {
1581         uint16_t norm16=getNorm16(c);
1582         if(norm16<=minYesNo) {
1583             // no decomposition or Hangul syllable, all zeros
1584             return 0;
1585         } else if(norm16>=MIN_NORMAL_MAYBE_YES) {
1586             // combining mark
1587             norm16&=0xff;
1588             return norm16|(norm16<<8);
1589         } else if(norm16>=minMaybeYes) {
1590             return 0;
1591         } else if(isDecompNoAlgorithmic(norm16)) {
1592             c=mapAlgorithmic(c, norm16);
1593         } else {
1594             // c decomposes, get everything from the variable-length extra data
1595             const uint16_t *mapping=getMapping(norm16);
1596             uint16_t firstUnit=*mapping;
1597             if((firstUnit&MAPPING_LENGTH_MASK)==0) {
1598                 // A character that is deleted (maps to an empty string) must
1599                 // get the worst-case lccc and tccc values because arbitrary
1600                 // characters on both sides will become adjacent.
1601                 return 0x1ff;
1602             } else {
1603                 norm16=firstUnit>>8;  // tccc
1604                 if(firstUnit&MAPPING_HAS_CCC_LCCC_WORD) {
1605                     norm16|=*(mapping-1)&0xff00;  // lccc
1606                 }
1607                 return norm16;
1608             }
1609         }
1610     }
1611 }
1612
1613 // Dual functionality:
1614 // buffer!=NULL: normalize
1615 // buffer==NULL: isNormalized/quickCheck/spanQuickCheckYes
1616 const UChar *
1617 Normalizer2Impl::makeFCD(const UChar *src, const UChar *limit,
1618                          ReorderingBuffer *buffer,
1619                          UErrorCode &errorCode) const {
1620     // Tracks the last FCD-safe boundary, before lccc=0 or after properly-ordered tccc<=1.
1621     // Similar to the prevBoundary in the compose() implementation.
1622     const UChar *prevBoundary=src;
1623     int32_t prevFCD16=0;
1624     if(limit==NULL) {
1625         src=copyLowPrefixFromNulTerminated(src, MIN_CCC_LCCC_CP, buffer, errorCode);
1626         if(U_FAILURE(errorCode)) {
1627             return src;
1628         }
1629         if(prevBoundary<src) {
1630             prevBoundary=src;
1631             // We know that the previous character's lccc==0.
1632             // Fetching the fcd16 value was deferred for this below-U+0300 code point.
1633             prevFCD16=getFCD16(*(src-1));
1634             if(prevFCD16>1) {
1635                 --prevBoundary;
1636             }
1637         }
1638         limit=u_strchr(src, 0);
1639     }
1640
1641     // Note: In this function we use buffer->appendZeroCC() because we track
1642     // the lead and trail combining classes here, rather than leaving it to
1643     // the ReorderingBuffer.
1644     // The exception is the call to decomposeShort() which uses the buffer
1645     // in the normal way.
1646
1647     const UChar *prevSrc;
1648     UChar32 c=0;
1649     uint16_t fcd16=0;
1650
1651     for(;;) {
1652         // count code units with lccc==0
1653         for(prevSrc=src; src!=limit;) {
1654             if((c=*src)<MIN_CCC_LCCC_CP) {
1655                 prevFCD16=~c;
1656                 ++src;
1657             } else if(!singleLeadMightHaveNonZeroFCD16(c)) {
1658                 prevFCD16=0;
1659                 ++src;
1660             } else {
1661                 if(U16_IS_SURROGATE(c)) {
1662                     UChar c2;
1663                     if(U16_IS_SURROGATE_LEAD(c)) {
1664                         if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) {
1665                             c=U16_GET_SUPPLEMENTARY(c, c2);
1666                         }
1667                     } else /* trail surrogate */ {
1668                         if(prevSrc<src && U16_IS_LEAD(c2=*(src-1))) {
1669                             --src;
1670                             c=U16_GET_SUPPLEMENTARY(c2, c);
1671                         }
1672                     }
1673                 }
1674                 if((fcd16=getFCD16FromNormData(c))<=0xff) {
1675                     prevFCD16=fcd16;
1676                     src+=U16_LENGTH(c);
1677                 } else {
1678                     break;
1679                 }
1680             }
1681         }
1682         // copy these code units all at once
1683         if(src!=prevSrc) {
1684             if(buffer!=NULL && !buffer->appendZeroCC(prevSrc, src, errorCode)) {
1685                 break;
1686             }
1687             if(src==limit) {
1688                 break;
1689             }
1690             prevBoundary=src;
1691             // We know that the previous character's lccc==0.
1692             if(prevFCD16<0) {
1693                 // Fetching the fcd16 value was deferred for this below-U+0300 code point.
1694                 UChar32 prev=~prevFCD16;
1695                 prevFCD16= prev<0x180 ? tccc180[prev] : getFCD16FromNormData(prev);
1696                 if(prevFCD16>1) {
1697                     --prevBoundary;
1698                 }
1699             } else {
1700                 const UChar *p=src-1;
1701                 if(U16_IS_TRAIL(*p) && prevSrc<p && U16_IS_LEAD(*(p-1))) {
1702                     --p;
1703                     // Need to fetch the previous character's FCD value because
1704                     // prevFCD16 was just for the trail surrogate code point.
1705                     prevFCD16=getFCD16FromNormData(U16_GET_SUPPLEMENTARY(p[0], p[1]));
1706                     // Still known to have lccc==0 because its lead surrogate unit had lccc==0.
1707                 }
1708                 if(prevFCD16>1) {
1709                     prevBoundary=p;
1710                 }
1711             }
1712             // The start of the current character (c).
1713             prevSrc=src;
1714         } else if(src==limit) {
1715             break;
1716         }
1717
1718         src+=U16_LENGTH(c);
1719         // The current character (c) at [prevSrc..src[ has a non-zero lead combining class.
1720         // Check for proper order, and decompose locally if necessary.
1721         if((prevFCD16&0xff)<=(fcd16>>8)) {
1722             // proper order: prev tccc <= current lccc
1723             if((fcd16&0xff)<=1) {
1724                 prevBoundary=src;
1725             }
1726             if(buffer!=NULL && !buffer->appendZeroCC(c, errorCode)) {
1727                 break;
1728             }
1729             prevFCD16=fcd16;
1730             continue;
1731         } else if(buffer==NULL) {
1732             return prevBoundary;  // quick check "no"
1733         } else {
1734             /*
1735              * Back out the part of the source that we copied or appended
1736              * already but is now going to be decomposed.
1737              * prevSrc is set to after what was copied/appended.
1738              */
1739             buffer->removeSuffix((int32_t)(prevSrc-prevBoundary));
1740             /*
1741              * Find the part of the source that needs to be decomposed,
1742              * up to the next safe boundary.
1743              */
1744             src=findNextFCDBoundary(src, limit);
1745             /*
1746              * The source text does not fulfill the conditions for FCD.
1747              * Decompose and reorder a limited piece of the text.
1748              */
1749             if(!decomposeShort(prevBoundary, src, *buffer, errorCode)) {
1750                 break;
1751             }
1752             prevBoundary=src;
1753             prevFCD16=0;
1754         }
1755     }
1756     return src;
1757 }
1758
1759 void Normalizer2Impl::makeFCDAndAppend(const UChar *src, const UChar *limit,
1760                                        UBool doMakeFCD,
1761                                        UnicodeString &safeMiddle,
1762                                        ReorderingBuffer &buffer,
1763                                        UErrorCode &errorCode) const {
1764     if(!buffer.isEmpty()) {
1765         const UChar *firstBoundaryInSrc=findNextFCDBoundary(src, limit);
1766         if(src!=firstBoundaryInSrc) {
1767             const UChar *lastBoundaryInDest=findPreviousFCDBoundary(buffer.getStart(),
1768                                                                     buffer.getLimit());
1769             int32_t destSuffixLength=(int32_t)(buffer.getLimit()-lastBoundaryInDest);
1770             UnicodeString middle(lastBoundaryInDest, destSuffixLength);
1771             buffer.removeSuffix(destSuffixLength);
1772             safeMiddle=middle;
1773             middle.append(src, (int32_t)(firstBoundaryInSrc-src));
1774             const UChar *middleStart=middle.getBuffer();
1775             makeFCD(middleStart, middleStart+middle.length(), &buffer, errorCode);
1776             if(U_FAILURE(errorCode)) {
1777                 return;
1778             }
1779             src=firstBoundaryInSrc;
1780         }
1781     }
1782     if(doMakeFCD) {
1783         makeFCD(src, limit, &buffer, errorCode);
1784     } else {
1785         if(limit==NULL) {  // appendZeroCC() needs limit!=NULL
1786             limit=u_strchr(src, 0);
1787         }
1788         buffer.appendZeroCC(src, limit, errorCode);
1789     }
1790 }
1791
1792 const UChar *Normalizer2Impl::findPreviousFCDBoundary(const UChar *start, const UChar *p) const {
1793     while(start<p && previousFCD16(start, p)>0xff) {}
1794     return p;
1795 }
1796
1797 const UChar *Normalizer2Impl::findNextFCDBoundary(const UChar *p, const UChar *limit) const {
1798     while(p<limit) {
1799         const UChar *codePointStart=p;
1800         if(nextFCD16(p, limit)<=0xff) {
1801             return codePointStart;
1802         }
1803     }
1804     return p;
1805 }
1806
1807 // CanonicalIterator data -------------------------------------------------- ***
1808
1809 CanonIterData::CanonIterData(UErrorCode &errorCode) :
1810         trie(utrie2_open(0, 0, &errorCode)),
1811         canonStartSets(uprv_deleteUObject, NULL, errorCode) {}
1812
1813 CanonIterData::~CanonIterData() {
1814     utrie2_close(trie);
1815 }
1816
1817 void CanonIterData::addToStartSet(UChar32 origin, UChar32 decompLead, UErrorCode &errorCode) {
1818     uint32_t canonValue=utrie2_get32(trie, decompLead);
1819     if((canonValue&(CANON_HAS_SET|CANON_VALUE_MASK))==0 && origin!=0) {
1820         // origin is the first character whose decomposition starts with
1821         // the character for which we are setting the value.
1822         utrie2_set32(trie, decompLead, canonValue|origin, &errorCode);
1823     } else {
1824         // origin is not the first character, or it is U+0000.
1825         UnicodeSet *set;
1826         if((canonValue&CANON_HAS_SET)==0) {
1827             set=new UnicodeSet;
1828             if(set==NULL) {
1829                 errorCode=U_MEMORY_ALLOCATION_ERROR;
1830                 return;
1831             }
1832             UChar32 firstOrigin=(UChar32)(canonValue&CANON_VALUE_MASK);
1833             canonValue=(canonValue&~CANON_VALUE_MASK)|CANON_HAS_SET|(uint32_t)canonStartSets.size();
1834             utrie2_set32(trie, decompLead, canonValue, &errorCode);
1835             canonStartSets.addElement(set, errorCode);
1836             if(firstOrigin!=0) {
1837                 set->add(firstOrigin);
1838             }
1839         } else {
1840             set=(UnicodeSet *)canonStartSets[(int32_t)(canonValue&CANON_VALUE_MASK)];
1841         }
1842         set->add(origin);
1843     }
1844 }
1845
1846 U_CDECL_BEGIN
1847
1848 // Call Normalizer2Impl::makeCanonIterDataFromNorm16() for a range of same-norm16 characters.
1849 //     context: the Normalizer2Impl
1850 static UBool U_CALLCONV
1851 enumCIDRangeHandler(const void *context, UChar32 start, UChar32 end, uint32_t value) {
1852     UErrorCode errorCode = U_ZERO_ERROR;
1853     if (value != 0) {
1854         Normalizer2Impl *impl = (Normalizer2Impl *)context;
1855         impl->makeCanonIterDataFromNorm16(
1856             start, end, (uint16_t)value, *impl->fCanonIterData, errorCode);
1857     }
1858     return U_SUCCESS(errorCode);
1859 }
1860
1861
1862
1863 // UInitOnce instantiation function for CanonIterData
1864
1865 static void U_CALLCONV
1866 initCanonIterData(Normalizer2Impl *impl, UErrorCode &errorCode) {
1867     U_ASSERT(impl->fCanonIterData == NULL);
1868     impl->fCanonIterData = new CanonIterData(errorCode);
1869     if (impl->fCanonIterData == NULL) {
1870         errorCode=U_MEMORY_ALLOCATION_ERROR;
1871     }
1872     if (U_SUCCESS(errorCode)) {
1873         utrie2_enum(impl->getNormTrie(), NULL, enumCIDRangeHandler, impl);
1874         utrie2_freeze(impl->fCanonIterData->trie, UTRIE2_32_VALUE_BITS, &errorCode);
1875     }
1876     if (U_FAILURE(errorCode)) {
1877         delete impl->fCanonIterData;
1878         impl->fCanonIterData = NULL;
1879     }
1880 }
1881
1882 U_CDECL_END
1883
1884 void Normalizer2Impl::makeCanonIterDataFromNorm16(UChar32 start, UChar32 end, uint16_t norm16,
1885                                                   CanonIterData &newData,
1886                                                   UErrorCode &errorCode) const {
1887     if(norm16==0 || (minYesNo<=norm16 && norm16<minNoNo)) {
1888         // Inert, or 2-way mapping (including Hangul syllable).
1889         // We do not write a canonStartSet for any yesNo character.
1890         // Composites from 2-way mappings are added at runtime from the
1891         // starter's compositions list, and the other characters in
1892         // 2-way mappings get CANON_NOT_SEGMENT_STARTER set because they are
1893         // "maybe" characters.
1894         return;
1895     }
1896     for(UChar32 c=start; c<=end; ++c) {
1897         uint32_t oldValue=utrie2_get32(newData.trie, c);
1898         uint32_t newValue=oldValue;
1899         if(norm16>=minMaybeYes) {
1900             // not a segment starter if it occurs in a decomposition or has cc!=0
1901             newValue|=CANON_NOT_SEGMENT_STARTER;
1902             if(norm16<MIN_NORMAL_MAYBE_YES) {
1903                 newValue|=CANON_HAS_COMPOSITIONS;
1904             }
1905         } else if(norm16<minYesNo) {
1906             newValue|=CANON_HAS_COMPOSITIONS;
1907         } else {
1908             // c has a one-way decomposition
1909             UChar32 c2=c;
1910             uint16_t norm16_2=norm16;
1911             while(limitNoNo<=norm16_2 && norm16_2<minMaybeYes) {
1912                 c2=mapAlgorithmic(c2, norm16_2);
1913                 norm16_2=getNorm16(c2);
1914             }
1915             if(minYesNo<=norm16_2 && norm16_2<limitNoNo) {
1916                 // c decomposes, get everything from the variable-length extra data
1917                 const uint16_t *mapping=getMapping(norm16_2);
1918                 uint16_t firstUnit=*mapping;
1919                 int32_t length=firstUnit&MAPPING_LENGTH_MASK;
1920                 if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) {
1921                     if(c==c2 && (*(mapping-1)&0xff)!=0) {
1922                         newValue|=CANON_NOT_SEGMENT_STARTER;  // original c has cc!=0
1923                     }
1924                 }
1925                 // Skip empty mappings (no characters in the decomposition).
1926                 if(length!=0) {
1927                     ++mapping;  // skip over the firstUnit
1928                     // add c to first code point's start set
1929                     int32_t i=0;
1930                     U16_NEXT_UNSAFE(mapping, i, c2);
1931                     newData.addToStartSet(c, c2, errorCode);
1932                     // Set CANON_NOT_SEGMENT_STARTER for each remaining code point of a
1933                     // one-way mapping. A 2-way mapping is possible here after
1934                     // intermediate algorithmic mapping.
1935                     if(norm16_2>=minNoNo) {
1936                         while(i<length) {
1937                             U16_NEXT_UNSAFE(mapping, i, c2);
1938                             uint32_t c2Value=utrie2_get32(newData.trie, c2);
1939                             if((c2Value&CANON_NOT_SEGMENT_STARTER)==0) {
1940                                 utrie2_set32(newData.trie, c2, c2Value|CANON_NOT_SEGMENT_STARTER,
1941                                              &errorCode);
1942                             }
1943                         }
1944                     }
1945                 }
1946             } else {
1947                 // c decomposed to c2 algorithmically; c has cc==0
1948                 newData.addToStartSet(c, c2, errorCode);
1949             }
1950         }
1951         if(newValue!=oldValue) {
1952             utrie2_set32(newData.trie, c, newValue, &errorCode);
1953         }
1954     }
1955 }
1956
1957 UBool Normalizer2Impl::ensureCanonIterData(UErrorCode &errorCode) const {
1958     // Logically const: Synchronized instantiation.
1959     Normalizer2Impl *me=const_cast<Normalizer2Impl *>(this);
1960     umtx_initOnce(me->fCanonIterDataInitOnce, &initCanonIterData, me, errorCode);
1961     return U_SUCCESS(errorCode);
1962 }
1963
1964 int32_t Normalizer2Impl::getCanonValue(UChar32 c) const {
1965     return (int32_t)utrie2_get32(fCanonIterData->trie, c);
1966 }
1967
1968 const UnicodeSet &Normalizer2Impl::getCanonStartSet(int32_t n) const {
1969     return *(const UnicodeSet *)fCanonIterData->canonStartSets[n];
1970 }
1971
1972 UBool Normalizer2Impl::isCanonSegmentStarter(UChar32 c) const {
1973     return getCanonValue(c)>=0;
1974 }
1975
1976 UBool Normalizer2Impl::getCanonStartSet(UChar32 c, UnicodeSet &set) const {
1977     int32_t canonValue=getCanonValue(c)&~CANON_NOT_SEGMENT_STARTER;
1978     if(canonValue==0) {
1979         return FALSE;
1980     }
1981     set.clear();
1982     int32_t value=canonValue&CANON_VALUE_MASK;
1983     if((canonValue&CANON_HAS_SET)!=0) {
1984         set.addAll(getCanonStartSet(value));
1985     } else if(value!=0) {
1986         set.add(value);
1987     }
1988     if((canonValue&CANON_HAS_COMPOSITIONS)!=0) {
1989         uint16_t norm16=getNorm16(c);
1990         if(norm16==JAMO_L) {
1991             UChar32 syllable=
1992                 (UChar32)(Hangul::HANGUL_BASE+(c-Hangul::JAMO_L_BASE)*Hangul::JAMO_VT_COUNT);
1993             set.add(syllable, syllable+Hangul::JAMO_VT_COUNT-1);
1994         } else {
1995             addComposites(getCompositionsList(norm16), set);
1996         }
1997     }
1998     return TRUE;
1999 }
2000
2001 U_NAMESPACE_END
2002
2003 // Normalizer2 data swapping ----------------------------------------------- ***
2004
2005 U_NAMESPACE_USE
2006
2007 U_CAPI int32_t U_EXPORT2
2008 unorm2_swap(const UDataSwapper *ds,
2009             const void *inData, int32_t length, void *outData,
2010             UErrorCode *pErrorCode) {
2011     const UDataInfo *pInfo;
2012     int32_t headerSize;
2013
2014     const uint8_t *inBytes;
2015     uint8_t *outBytes;
2016
2017     const int32_t *inIndexes;
2018     int32_t indexes[Normalizer2Impl::IX_MIN_MAYBE_YES+1];
2019
2020     int32_t i, offset, nextOffset, size;
2021
2022     /* udata_swapDataHeader checks the arguments */
2023     headerSize=udata_swapDataHeader(ds, inData, length, outData, pErrorCode);
2024     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
2025         return 0;
2026     }
2027
2028     /* check data format and format version */
2029     pInfo=(const UDataInfo *)((const char *)inData+4);
2030     if(!(
2031         pInfo->dataFormat[0]==0x4e &&   /* dataFormat="Nrm2" */
2032         pInfo->dataFormat[1]==0x72 &&
2033         pInfo->dataFormat[2]==0x6d &&
2034         pInfo->dataFormat[3]==0x32 &&
2035         (pInfo->formatVersion[0]==1 || pInfo->formatVersion[0]==2)
2036     )) {
2037         udata_printError(ds, "unorm2_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as Normalizer2 data\n",
2038                          pInfo->dataFormat[0], pInfo->dataFormat[1],
2039                          pInfo->dataFormat[2], pInfo->dataFormat[3],
2040                          pInfo->formatVersion[0]);
2041         *pErrorCode=U_UNSUPPORTED_ERROR;
2042         return 0;
2043     }
2044
2045     inBytes=(const uint8_t *)inData+headerSize;
2046     outBytes=(uint8_t *)outData+headerSize;
2047
2048     inIndexes=(const int32_t *)inBytes;
2049
2050     if(length>=0) {
2051         length-=headerSize;
2052         if(length<(int32_t)sizeof(indexes)) {
2053             udata_printError(ds, "unorm2_swap(): too few bytes (%d after header) for Normalizer2 data\n",
2054                              length);
2055             *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
2056             return 0;
2057         }
2058     }
2059
2060     /* read the first few indexes */
2061     for(i=0; i<=Normalizer2Impl::IX_MIN_MAYBE_YES; ++i) {
2062         indexes[i]=udata_readInt32(ds, inIndexes[i]);
2063     }
2064
2065     /* get the total length of the data */
2066     size=indexes[Normalizer2Impl::IX_TOTAL_SIZE];
2067
2068     if(length>=0) {
2069         if(length<size) {
2070             udata_printError(ds, "unorm2_swap(): too few bytes (%d after header) for all of Normalizer2 data\n",
2071                              length);
2072             *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
2073             return 0;
2074         }
2075
2076         /* copy the data for inaccessible bytes */
2077         if(inBytes!=outBytes) {
2078             uprv_memcpy(outBytes, inBytes, size);
2079         }
2080
2081         offset=0;
2082
2083         /* swap the int32_t indexes[] */
2084         nextOffset=indexes[Normalizer2Impl::IX_NORM_TRIE_OFFSET];
2085         ds->swapArray32(ds, inBytes, nextOffset-offset, outBytes, pErrorCode);
2086         offset=nextOffset;
2087
2088         /* swap the UTrie2 */
2089         nextOffset=indexes[Normalizer2Impl::IX_EXTRA_DATA_OFFSET];
2090         utrie2_swap(ds, inBytes+offset, nextOffset-offset, outBytes+offset, pErrorCode);
2091         offset=nextOffset;
2092
2093         /* swap the uint16_t extraData[] */
2094         nextOffset=indexes[Normalizer2Impl::IX_SMALL_FCD_OFFSET];
2095         ds->swapArray16(ds, inBytes+offset, nextOffset-offset, outBytes+offset, pErrorCode);
2096         offset=nextOffset;
2097
2098         /* no need to swap the uint8_t smallFCD[] (new in formatVersion 2) */
2099         nextOffset=indexes[Normalizer2Impl::IX_SMALL_FCD_OFFSET+1];
2100         offset=nextOffset;
2101
2102         U_ASSERT(offset==size);
2103     }
2104
2105     return headerSize+size;
2106 }
2107
2108 #endif  // !UCONFIG_NO_NORMALIZATION