icuSources/common/normalizer2impl.cpp

   1 /*
   2 *******************************************************************************
   3 *
   4 *   Copyright (C) 2009-2010, International Business Machines
   5 *   Corporation and others.  All Rights Reserved.
   6 *
   7 *******************************************************************************
   8 *   file name:  normalizer2impl.cpp
   9 *   encoding:   US-ASCII
  10 *   tab size:   8 (not used)
  11 *   indentation:4
  12 *
  13 *   created on: 2009nov22
  14 *   created by: Markus W. Scherer
  15 */
  16
  17 #include "unicode/utypes.h"
  18
  19 #if !UCONFIG_NO_NORMALIZATION
  20
  21 #include "unicode/normalizer2.h"
  22 #include "unicode/udata.h"
  23 #include "unicode/ustring.h"
  24 #include "cmemory.h"
  25 #include "mutex.h"
  26 #include "normalizer2impl.h"
  27 #include "uassert.h"
  28 #include "uhash.h"
  29 #include "uset_imp.h"
  30 #include "utrie2.h"
  31 #include "uvector.h"
  32
  33 U_NAMESPACE_BEGIN
  34
  35 // ReorderingBuffer -------------------------------------------------------- ***
  36
  37 UBool ReorderingBuffer::init(int32_t destCapacity, UErrorCode &errorCode) {
  38     int32_t length=str.length();
  39     start=str.getBuffer(destCapacity);
  40     if(start==NULL) {
  41         // getBuffer() already did str.setToBogus()
  42         errorCode=U_MEMORY_ALLOCATION_ERROR;
  43         return FALSE;
  44     }
  45     limit=start+length;
  46     remainingCapacity=str.getCapacity()-length;
  47     reorderStart=start;
  48     if(start==limit) {
  49         lastCC=0;
  50     } else {
  51         setIterator();
  52         lastCC=previousCC();
  53         // Set reorderStart after the last code point with cc<=1 if there is one.
  54         if(lastCC>1) {
  55             while(previousCC()>1) {}
  56         }
  57         reorderStart=codePointLimit;
  58     }
  59     return TRUE;
  60 }
  61
  62 UBool ReorderingBuffer::equals(const UChar *otherStart, const UChar *otherLimit) const {
  63     int32_t length=(int32_t)(limit-start);
  64     return
  65         length==(int32_t)(otherLimit-otherStart) &&
  66         0==u_memcmp(start, otherStart, length);
  67 }
  68
  69 UBool ReorderingBuffer::appendSupplementary(UChar32 c, uint8_t cc, UErrorCode &errorCode) {
  70     if(remainingCapacity<2 && !resize(2, errorCode)) {
  71         return FALSE;
  72     }
  73     if(lastCC<=cc || cc==0) {
  74         limit[0]=U16_LEAD(c);
  75         limit[1]=U16_TRAIL(c);
  76         limit+=2;
  77         lastCC=cc;
  78         if(cc<=1) {
  79             reorderStart=limit;
  80         }
  81     } else {
  82         insert(c, cc);
  83     }
  84     remainingCapacity-=2;
  85     return TRUE;
  86 }
  87
  88 UBool ReorderingBuffer::append(const UChar *s, int32_t length,
  89                                uint8_t leadCC, uint8_t trailCC,
  90                                UErrorCode &errorCode) {
  91     if(length==0) {
  92         return TRUE;
  93     }
  94     if(remainingCapacity<length && !resize(length, errorCode)) {
  95         return FALSE;
  96     }
  97     remainingCapacity-=length;
  98     if(lastCC<=leadCC || leadCC==0) {
  99         if(trailCC<=1) {
 100             reorderStart=limit+length;
 101         } else if(leadCC<=1) {
 102             reorderStart=limit+1;  // Ok if not a code point boundary.
 103         }
 104         const UChar *sLimit=s+length;
 105         do { *limit++=*s++; } while(s!=sLimit);
 106         lastCC=trailCC;
 107     } else {
 108         int32_t i=0;
 109         UChar32 c;
 110         U16_NEXT(s, i, length, c);
 111         insert(c, leadCC);  // insert first code point
 112         while(i<length) {
 113             U16_NEXT(s, i, length, c);
 114             if(i<length) {
 115                 // s must be in NFD, otherwise we need to use getCC().
 116                 leadCC=Normalizer2Impl::getCCFromYesOrMaybe(impl.getNorm16(c));
 117             } else {
 118                 leadCC=trailCC;
 119             }
 120             append(c, leadCC, errorCode);
 121         }
 122     }
 123     return TRUE;
 124 }
 125
 126 UBool ReorderingBuffer::appendZeroCC(UChar32 c, UErrorCode &errorCode) {
 127     int32_t cpLength=U16_LENGTH(c);
 128     if(remainingCapacity<cpLength && !resize(cpLength, errorCode)) {
 129         return FALSE;
 130     }
 131     remainingCapacity-=cpLength;
 132     if(cpLength==1) {
 133         *limit++=(UChar)c;
 134     } else {
 135         limit[0]=U16_LEAD(c);
 136         limit[1]=U16_TRAIL(c);
 137         limit+=2;
 138     }
 139     lastCC=0;
 140     reorderStart=limit;
 141     return TRUE;
 142 }
 143
 144 UBool ReorderingBuffer::appendZeroCC(const UChar *s, const UChar *sLimit, UErrorCode &errorCode) {
 145     if(s==sLimit) {
 146         return TRUE;
 147     }
 148     int32_t length=(int32_t)(sLimit-s);
 149     if(remainingCapacity<length && !resize(length, errorCode)) {
 150         return FALSE;
 151     }
 152     u_memcpy(limit, s, length);
 153     limit+=length;
 154     remainingCapacity-=length;
 155     lastCC=0;
 156     reorderStart=limit;
 157     return TRUE;
 158 }
 159
 160 void ReorderingBuffer::remove() {
 161     reorderStart=limit=start;
 162     remainingCapacity=str.getCapacity();
 163     lastCC=0;
 164 }
 165
 166 void ReorderingBuffer::removeSuffix(int32_t suffixLength) {
 167     if(suffixLength<(limit-start)) {
 168         limit-=suffixLength;
 169         remainingCapacity+=suffixLength;
 170     } else {
 171         limit=start;
 172         remainingCapacity=str.getCapacity();
 173     }
 174     lastCC=0;
 175     reorderStart=limit;
 176 }
 177
 178 UBool ReorderingBuffer::resize(int32_t appendLength, UErrorCode &errorCode) {
 179     int32_t reorderStartIndex=(int32_t)(reorderStart-start);
 180     int32_t length=(int32_t)(limit-start);
 181     str.releaseBuffer(length);
 182     int32_t newCapacity=length+appendLength;
 183     int32_t doubleCapacity=2*str.getCapacity();
 184     if(newCapacity<doubleCapacity) {
 185         newCapacity=doubleCapacity;
 186     }
 187     if(newCapacity<256) {
 188         newCapacity=256;
 189     }
 190     start=str.getBuffer(newCapacity);
 191     if(start==NULL) {
 192         // getBuffer() already did str.setToBogus()
 193         errorCode=U_MEMORY_ALLOCATION_ERROR;
 194         return FALSE;
 195     }
 196     reorderStart=start+reorderStartIndex;
 197     limit=start+length;
 198     remainingCapacity=str.getCapacity()-length;
 199     return TRUE;
 200 }
 201
 202 void ReorderingBuffer::skipPrevious() {
 203     codePointLimit=codePointStart;
 204     UChar c=*--codePointStart;
 205     if(U16_IS_TRAIL(c) && start<codePointStart && U16_IS_LEAD(*(codePointStart-1))) {
 206         --codePointStart;
 207     }
 208 }
 209
 210 uint8_t ReorderingBuffer::previousCC() {
 211     codePointLimit=codePointStart;
 212     if(reorderStart>=codePointStart) {
 213         return 0;
 214     }
 215     UChar32 c=*--codePointStart;
 216     if(c<Normalizer2Impl::MIN_CCC_LCCC_CP) {
 217         return 0;
 218     }
 219
 220     UChar c2;
 221     if(U16_IS_TRAIL(c) && start<codePointStart && U16_IS_LEAD(c2=*(codePointStart-1))) {
 222         --codePointStart;
 223         c=U16_GET_SUPPLEMENTARY(c2, c);
 224     }
 225     return Normalizer2Impl::getCCFromYesOrMaybe(impl.getNorm16(c));
 226 }
 227
 228 // Inserts c somewhere before the last character.
 229 // Requires 0<cc<lastCC which implies reorderStart<limit.
 230 void ReorderingBuffer::insert(UChar32 c, uint8_t cc) {
 231     for(setIterator(), skipPrevious(); previousCC()>cc;) {}
 232     // insert c at codePointLimit, after the character with prevCC<=cc
 233     UChar *q=limit;
 234     UChar *r=limit+=U16_LENGTH(c);
 235     do {
 236         *--r=*--q;
 237     } while(codePointLimit!=q);
 238     writeCodePoint(q, c);
 239     if(cc<=1) {
 240         reorderStart=r;
 241     }
 242 }
 243
 244 // Normalizer2Impl --------------------------------------------------------- ***
 245
      // Holder for the canonical-iterator data that Normalizer2Impl keeps in
      // canonIterDataSingleton. The constructor, destructor and addToStartSet()
      // are only declared here; their definitions are not in this part of the file.
  246 struct CanonIterData : public UMemory {
  247     CanonIterData(UErrorCode &errorCode);
  248     ~CanonIterData();
  249     void addToStartSet(UChar32 origin, UChar32 decompLead, UErrorCode &errorCode);
      // Trie that addCanonIterPropertyStarts() enumerates (with values masked
      // by CANON_NOT_SEGMENT_STARTER).
  250     UTrie2 *trie;
  251     UVector canonStartSets;  // contains UnicodeSet *
  252 };
 253
 254 Normalizer2Impl::~Normalizer2Impl() {
 255     udata_close(memory);
 256     utrie2_close(normTrie);
 257     UTrie2Singleton(fcdTrieSingleton).deleteInstance();
 258     delete (CanonIterData *)canonIterDataSingleton.fInstance;
 259 }
 260
 261 UBool U_CALLCONV
 262 Normalizer2Impl::isAcceptable(void *context,
 263                               const char * /* type */, const char * /*name*/,
 264                               const UDataInfo *pInfo) {
 265     if(
 266         pInfo->size>=20 &&
 267         pInfo->isBigEndian==U_IS_BIG_ENDIAN &&
 268         pInfo->charsetFamily==U_CHARSET_FAMILY &&
 269         pInfo->dataFormat[0]==0x4e &&    /* dataFormat="Nrm2" */
 270         pInfo->dataFormat[1]==0x72 &&
 271         pInfo->dataFormat[2]==0x6d &&
 272         pInfo->dataFormat[3]==0x32 &&
 273         pInfo->formatVersion[0]==1
 274     ) {
 275         Normalizer2Impl *me=(Normalizer2Impl *)context;
 276         uprv_memcpy(me->dataVersion, pInfo->dataVersion, 4);
 277         return TRUE;
 278     } else {
 279         return FALSE;
 280     }
 281 }
 282
 283 void
 284 Normalizer2Impl::load(const char *packageName, const char *name, UErrorCode &errorCode) {
 285     if(U_FAILURE(errorCode)) {
 286         return;
 287     }
 288     memory=udata_openChoice(packageName, "nrm", name, isAcceptable, this, &errorCode);
 289     if(U_FAILURE(errorCode)) {
 290         return;
 291     }
 292     const uint8_t *inBytes=(const uint8_t *)udata_getMemory(memory);
 293     const int32_t *inIndexes=(const int32_t *)inBytes;
 294     int32_t indexesLength=inIndexes[IX_NORM_TRIE_OFFSET]/4;
 295     if(indexesLength<=IX_MIN_MAYBE_YES) {
 296         errorCode=U_INVALID_FORMAT_ERROR;  // Not enough indexes.
 297         return;
 298     }
 299
 300     minDecompNoCP=inIndexes[IX_MIN_DECOMP_NO_CP];
 301     minCompNoMaybeCP=inIndexes[IX_MIN_COMP_NO_MAYBE_CP];
 302
 303     minYesNo=inIndexes[IX_MIN_YES_NO];
 304     minNoNo=inIndexes[IX_MIN_NO_NO];
 305     limitNoNo=inIndexes[IX_LIMIT_NO_NO];
 306     minMaybeYes=inIndexes[IX_MIN_MAYBE_YES];
 307
 308     int32_t offset=inIndexes[IX_NORM_TRIE_OFFSET];
 309     int32_t nextOffset=inIndexes[IX_EXTRA_DATA_OFFSET];
 310     normTrie=utrie2_openFromSerialized(UTRIE2_16_VALUE_BITS,
 311                                        inBytes+offset, nextOffset-offset, NULL,
 312                                        &errorCode);
 313     if(U_FAILURE(errorCode)) {
 314         return;
 315     }
 316
 317     offset=nextOffset;
 318     maybeYesCompositions=(const uint16_t *)(inBytes+offset);
 319     extraData=maybeYesCompositions+(MIN_NORMAL_MAYBE_YES-minMaybeYes);
 320 }
 321
 322 uint8_t Normalizer2Impl::getTrailCCFromCompYesAndZeroCC(const UChar *cpStart, const UChar *cpLimit) const {
 323     UChar32 c;
 324     if(cpStart==(cpLimit-1)) {
 325         c=*cpStart;
 326     } else {
 327         c=U16_GET_SUPPLEMENTARY(cpStart[0], cpStart[1]);
 328     }
 329     uint16_t prevNorm16=getNorm16(c);
 330     if(prevNorm16<=minYesNo) {
 331         return 0;  // yesYes and Hangul LV/LVT have ccc=tccc=0
 332     } else {
 333         return (uint8_t)(*getMapping(prevNorm16)>>8);  // tccc from yesNo
 334     }
 335 }
 336
 337 U_CDECL_BEGIN
 338
 339 static UBool U_CALLCONV
 340 enumPropertyStartsRange(const void *context, UChar32 start, UChar32 /*end*/, uint32_t /*value*/) {
 341     /* add the start code point to the USet */
 342     const USetAdder *sa=(const USetAdder *)context;
 343     sa->add(sa->set, start);
 344     return TRUE;
 345 }
 346
 347 static uint32_t U_CALLCONV
 348 segmentStarterMapper(const void * /*context*/, uint32_t value) {
 349     return value&CANON_NOT_SEGMENT_STARTER;
 350 }
 351
 352 U_CDECL_END
 353
 354 void
 355 Normalizer2Impl::addPropertyStarts(const USetAdder *sa, UErrorCode & /*errorCode*/) const {
 356     /* add the start code point of each same-value range of each trie */
 357     utrie2_enum(normTrie, NULL, enumPropertyStartsRange, sa);
 358
 359     /* add Hangul LV syllables and LV+1 because of skippables */
 360     for(UChar c=Hangul::HANGUL_BASE; c<Hangul::HANGUL_LIMIT; c+=Hangul::JAMO_T_COUNT) {
 361         sa->add(sa->set, c);
 362         sa->add(sa->set, c+1);
 363     }
 364     sa->add(sa->set, Hangul::HANGUL_LIMIT); /* add Hangul+1 to continue with other properties */
 365 }
 366
 367 void
 368 Normalizer2Impl::addCanonIterPropertyStarts(const USetAdder *sa, UErrorCode &errorCode) const {
 369     /* add the start code point of each same-value range of the canonical iterator data trie */
 370     if(ensureCanonIterData(errorCode)) {
 371         // currently only used for the SEGMENT_STARTER property
 372         utrie2_enum(((CanonIterData *)canonIterDataSingleton.fInstance)->trie,
 373                     segmentStarterMapper, enumPropertyStartsRange, sa);
 374     }
 375 }
 376
 377 const UChar *
 378 Normalizer2Impl::copyLowPrefixFromNulTerminated(const UChar *src,
 379                                                 UChar32 minNeedDataCP,
 380                                                 ReorderingBuffer *buffer,
 381                                                 UErrorCode &errorCode) const {
 382     // Make some effort to support NUL-terminated strings reasonably.
 383     // Take the part of the fast quick check loop that does not look up
 384     // data and check the first part of the string.
 385     // After this prefix, determine the string length to simplify the rest
 386     // of the code.
 387     const UChar *prevSrc=src;
 388     UChar c;
 389     while((c=*src++)<minNeedDataCP && c!=0) {}
 390     // Back out the last character for full processing.
 391     // Copy this prefix.
 392     if(--src!=prevSrc) {
 393         if(buffer!=NULL) {
 394             buffer->appendZeroCC(prevSrc, src, errorCode);
 395         }
 396     }
 397     return src;
 398 }
 399
  400 // Dual functionality:
  401 // buffer!=NULL: normalize
  402 // buffer==NULL: isNormalized/spanQuickCheckYes
      //
      // src/limit delimit the input; limit==NULL means src is NUL-terminated.
      // With a buffer, the text is appended to *buffer piece by piece and the
      // return value is the input position where processing stopped (limit,
      // unless a buffer operation failed). Without a buffer nothing is written
      // and the return value is either limit or prevBoundary, the last boundary
      // seen before a code point that decomposes or is out of combining-class order.
  403 const UChar *
  404 Normalizer2Impl::decompose(const UChar *src, const UChar *limit,
  405                            ReorderingBuffer *buffer,
  406                            UErrorCode &errorCode) const {
  407     UChar32 minNoCP=minDecompNoCP;
  408     if(limit==NULL) {
          // NUL-terminated input: handle the low prefix first, then find the
          // real limit so that the main loop can compare pointers.
  409         src=copyLowPrefixFromNulTerminated(src, minNoCP, buffer, errorCode);
  410         if(U_FAILURE(errorCode)) {
  411             return src;
  412         }
  413         limit=u_strchr(src, 0);
  414     }
  415
  416     const UChar *prevSrc;
  417     UChar32 c=0;
  418     uint16_t norm16=0;
  419
  420     // only for quick check
  421     const UChar *prevBoundary=src;
  422     uint8_t prevCC=0;
  423
  424     for(;;) {
  425         // count code units below the minimum or with irrelevant data for the quick check
  426         for(prevSrc=src; src!=limit;) {
  427             if( (c=*src)<minNoCP ||
  428                 isMostDecompYesAndZeroCC(norm16=UTRIE2_GET16_FROM_U16_SINGLE_LEAD(normTrie, c))
  429             ) {
  430                 ++src;
  431             } else if(!U16_IS_SURROGATE(c)) {
                  // A non-surrogate code unit with relevant data: stop the fast run here.
  432                 break;
  433             } else {
                  // Surrogate code unit: try to assemble the full supplementary
                  // code point and look up its data again.
  434                 UChar c2;
  435                 if(U16_IS_SURROGATE_LEAD(c)) {
  436                     if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) {
  437                         c=U16_GET_SUPPLEMENTARY(c, c2);
  438                     }
  439                 } else /* trail surrogate */ {
                      // Only look back within the current run (prevSrc<src); if the
                      // preceding unit is a lead surrogate, back up onto it so that
                      // the pair is handled as one code point.
  440                     if(prevSrc<src && U16_IS_LEAD(c2=*(src-1))) {
  441                         --src;
  442                         c=U16_GET_SUPPLEMENTARY(c2, c);
  443                     }
  444                 }
  445                 if(isMostDecompYesAndZeroCC(norm16=getNorm16(c))) {
  446                     src+=U16_LENGTH(c);
  447                 } else {
  448                     break;
  449                 }
  450             }
  451         }
  452         // copy these code units all at once
  453         if(src!=prevSrc) {
  454             if(buffer!=NULL) {
  455                 if(!buffer->appendZeroCC(prevSrc, src, errorCode)) {
  456                     break;
  457                 }
  458             } else {
                  // Quick check: the run just skipped ends at a boundary.
  459                 prevCC=0;
  460                 prevBoundary=src;
  461             }
  462         }
  463         if(src==limit) {
  464             break;
  465         }
  466
  467         // Check one above-minimum, relevant code point.
          // c and norm16 still hold the code point that ended the run above.
  468         src+=U16_LENGTH(c);
  469         if(buffer!=NULL) {
  470             if(!decompose(c, norm16, *buffer, errorCode)) {
  471                 break;
  472             }
  473         } else {
  474             if(isDecompYes(norm16)) {
  475                 uint8_t cc=getCCFromYesOrMaybe(norm16);
                  // In order if cc is a starter (0) or not lower than the previous cc.
  476                 if(prevCC<=cc || cc==0) {
  477                     prevCC=cc;
  478                     if(cc<=1) {
  479                         prevBoundary=src;
  480                     }
  481                     continue;
  482                 }
  483             }
  484             return prevBoundary;  // "no" or cc out of order
  485         }
  486     }
  487     return src;
  488 }
 489
 490 // Decompose a short piece of text which is likely to contain characters that
 491 // fail the quick check loop and/or where the quick check loop's overhead
 492 // is unlikely to be amortized.
 493 // Called by the compose() and makeFCD() implementations.
 494 UBool Normalizer2Impl::decomposeShort(const UChar *src, const UChar *limit,
 495                                       ReorderingBuffer &buffer,
 496                                       UErrorCode &errorCode) const {
 497     while(src<limit) {
 498         UChar32 c;
 499         uint16_t norm16;
 500         UTRIE2_U16_NEXT16(normTrie, src, limit, c, norm16);
 501         if(!decompose(c, norm16, buffer, errorCode)) {
 502             return FALSE;
 503         }
 504     }
 505     return TRUE;
 506 }
 507
 508 UBool Normalizer2Impl::decompose(UChar32 c, uint16_t norm16,
 509                                  ReorderingBuffer &buffer,
 510                                  UErrorCode &errorCode) const {
 511     // Only loops for 1:1 algorithmic mappings.
 512     for(;;) {
 513         // get the decomposition and the lead and trail cc's
 514         if(isDecompYes(norm16)) {
 515             // c does not decompose
 516             return buffer.append(c, getCCFromYesOrMaybe(norm16), errorCode);
 517         } else if(isHangul(norm16)) {
 518             // Hangul syllable: decompose algorithmically
 519             UChar jamos[3];
 520             return buffer.appendZeroCC(jamos, jamos+Hangul::decompose(c, jamos), errorCode);
 521         } else if(isDecompNoAlgorithmic(norm16)) {
 522             c=mapAlgorithmic(c, norm16);
 523             norm16=getNorm16(c);
 524         } else {
 525             // c decomposes, get everything from the variable-length extra data
 526             const uint16_t *mapping=getMapping(norm16);
 527             uint16_t firstUnit=*mapping++;
 528             int32_t length=firstUnit&MAPPING_LENGTH_MASK;
 529             uint8_t leadCC, trailCC;
 530             trailCC=(uint8_t)(firstUnit>>8);
 531             if(firstUnit&MAPPING_HAS_CCC_LCCC_WORD) {
 532                 leadCC=(uint8_t)(*mapping++>>8);
 533             } else {
 534                 leadCC=0;
 535             }
 536             return buffer.append((const UChar *)mapping, length, leadCC, trailCC, errorCode);
 537         }
 538     }
 539 }
 540
 541 const UChar *
 542 Normalizer2Impl::getDecomposition(UChar32 c, UChar buffer[4], int32_t &length) const {
 543     const UChar *decomp=NULL;
 544     uint16_t norm16;
 545     for(;;) {
 546         if(c<minDecompNoCP || isDecompYes(norm16=getNorm16(c))) {
 547             // c does not decompose
 548             return decomp;
 549         } else if(isHangul(norm16)) {
 550             // Hangul syllable: decompose algorithmically
 551             length=Hangul::decompose(c, buffer);
 552             return buffer;
 553         } else if(isDecompNoAlgorithmic(norm16)) {
 554             c=mapAlgorithmic(c, norm16);
 555             decomp=buffer;
 556             length=0;
 557             U16_APPEND_UNSAFE(buffer, length, c);
 558         } else {
 559             // c decomposes, get everything from the variable-length extra data
 560             const uint16_t *mapping=getMapping(norm16);
 561             uint16_t firstUnit=*mapping++;
 562             length=firstUnit&MAPPING_LENGTH_MASK;
 563             if(firstUnit&MAPPING_HAS_CCC_LCCC_WORD) {
 564                 ++mapping;
 565             }
 566             return (const UChar *)mapping;
 567         }
 568     }
 569 }
 570
 571 void Normalizer2Impl::decomposeAndAppend(const UChar *src, const UChar *limit,
 572                                          UBool doDecompose,
 573                                          ReorderingBuffer &buffer,
 574                                          UErrorCode &errorCode) const {
 575     if(doDecompose) {
 576         decompose(src, limit, &buffer, errorCode);
 577         return;
 578     }
 579     // Just merge the strings at the boundary.
 580     ForwardUTrie2StringIterator iter(normTrie, src, limit);
 581     uint8_t firstCC, prevCC, cc;
 582     firstCC=prevCC=cc=getCC(iter.next16());
 583     while(cc!=0) {
 584         prevCC=cc;
 585         cc=getCC(iter.next16());
 586     };
 587     buffer.append(src, (int32_t)(iter.codePointStart-src), firstCC, prevCC, errorCode) &&
 588         buffer.appendZeroCC(iter.codePointStart, limit, errorCode);
 589 }
 590
 591 // Note: hasDecompBoundary() could be implemented as aliases to
 592 // hasFCDBoundaryBefore() and hasFCDBoundaryAfter()
 593 // at the cost of building the FCD trie for a decomposition normalizer.
 594 UBool Normalizer2Impl::hasDecompBoundary(UChar32 c, UBool before) const {
 595     for(;;) {
 596         if(c<minDecompNoCP) {
 597             return TRUE;
 598         }
 599         uint16_t norm16=getNorm16(c);
 600         if(isHangul(norm16) || isDecompYesAndZeroCC(norm16)) {
 601             return TRUE;
 602         } else if(norm16>MIN_NORMAL_MAYBE_YES) {
 603             return FALSE;  // ccc!=0
 604         } else if(isDecompNoAlgorithmic(norm16)) {
 605             c=mapAlgorithmic(c, norm16);
 606         } else {
 607             // c decomposes, get everything from the variable-length extra data
 608             const uint16_t *mapping=getMapping(norm16);
 609             uint16_t firstUnit=*mapping++;
 610             if((firstUnit&MAPPING_LENGTH_MASK)==0) {
 611                 return FALSE;
 612             }
 613             if(!before) {
 614                 // decomp after-boundary: same as hasFCDBoundaryAfter(),
 615                 // fcd16<=1 || trailCC==0
 616                 if(firstUnit>0x1ff) {
 617                     return FALSE;  // trailCC>1
 618                 }
 619                 if(firstUnit<=0xff) {
 620                     return TRUE;  // trailCC==0
 621                 }
 622                 // if(trailCC==1) test leadCC==0, same as checking for before-boundary
 623             }
 624             // TRUE if leadCC==0 (hasFCDBoundaryBefore())
 625             return (firstUnit&MAPPING_HAS_CCC_LCCC_WORD)==0 || (*mapping&0xff00)==0;
 626         }
 627     }
 628 }
 629
  630 /*
  631  * Finds the recomposition result for
  632  * a forward-combining "lead" character,
  633  * specified with a pointer to its compositions list,
  634  * and a backward-combining "trail" character.
  635  *
  636  * If the lead and trail characters combine, then this function returns
  637  * the following "compositeAndFwd" value:
  638  * Bits 21..1  composite character
  639  * Bit      0  set if the composite is a forward-combining starter
  640  * otherwise it returns -1.
  641  *
  642  * The compositions list has (trail, compositeAndFwd) pair entries,
  643  * encoded as either pairs or triples of 16-bit units.
  644  * The last entry has the high bit of its first unit set.
  645  *
  646  * The list is sorted by ascending trail characters (there are no duplicates).
  647  * A linear search is used.
  648  *
  649  * See normalizer2impl.h for a more detailed description
  650  * of the compositions list format.
  651  */
  652 int32_t Normalizer2Impl::combine(const uint16_t *list, UChar32 trail) {
  653     uint16_t key1, firstUnit;
  654     if(trail<COMP_1_TRAIL_LIMIT) {
  655         // trail character is 0..33FF
  656         // result entry may have 2 or 3 units
          // The search key is the trail character shifted left by one bit.
  657         key1=(uint16_t)(trail<<1);
          // Skip entries whose first unit is smaller than the key; each entry
          // is 2 units plus (firstUnit&COMP_1_TRIPLE) more.
          // NOTE(review): the loop has no explicit end test; presumably the
          // high bit set on the last entry's first unit makes it compare
          // greater than any key1 built here -- confirm against normalizer2impl.h.
  658         while(key1>(firstUnit=*list)) {
  659             list+=2+(firstUnit&COMP_1_TRIPLE);
  660         }
  661         if(key1==(firstUnit&COMP_1_TRAIL_MASK)) {
  662             if(firstUnit&COMP_1_TRIPLE) {
                  // Triple: compositeAndFwd is spread over the 2nd and 3rd units.
  663                 return ((int32_t)list[1]<<16)|list[2];
  664             } else {
  665                 return list[1];
  666             }
  667         }
  668     } else {
  669         // trail character is 3400..10FFFF
  670         // result entry has 3 units
          // The trail character is split into two keys: key1 is matched against
          // the first unit, key2 against the upper bits of the second unit.
  671         key1=(uint16_t)(COMP_1_TRAIL_LIMIT+
  672                         (((trail>>COMP_1_TRAIL_SHIFT))&
  673                           ~COMP_1_TRIPLE));
  674         uint16_t key2=(uint16_t)(trail<<COMP_2_TRAIL_SHIFT);
  675         uint16_t secondUnit;
  676         for(;;) {
  677             if(key1>(firstUnit=*list)) {
                  // Entry sorts before the one we are looking for: skip it.
  678                 list+=2+(firstUnit&COMP_1_TRIPLE);
  679             } else if(key1==(firstUnit&COMP_1_TRAIL_MASK)) {
  680                 if(key2>(secondUnit=list[1])) {
                      // Same key1 but a smaller second key: move on to the next
                      // triple unless this was the last entry of the list.
  681                     if(firstUnit&COMP_1_LAST_TUPLE) {
  682                         break;
  683                     } else {
  684                         list+=3;
  685                     }
  686                 } else if(key2==(secondUnit&COMP_2_TRAIL_MASK)) {
                      // Match: the remaining bits of the second unit are the
                      // high part of compositeAndFwd, the third unit the low part.
  687                     return ((int32_t)(secondUnit&~COMP_2_TRAIL_MASK)<<16)|list[2];
  688                 } else {
  689                     break;
  690                 }
  691             } else {
                  // Went past where the entry would be: no composition.
  692                 break;
  693             }
  694         }
  695     }
  696     return -1;
  697 }
 698
 699 /**
 700   * @param list some character's compositions list
 701   * @param set recursively receives the composites from these compositions
 702   */
 703 void Normalizer2Impl::addComposites(const uint16_t *list, UnicodeSet &set) const {
 704     uint16_t firstUnit;
 705     int32_t compositeAndFwd;
 706     do {
 707         firstUnit=*list;
 708         if((firstUnit&COMP_1_TRIPLE)==0) {
 709             compositeAndFwd=list[1];
 710             list+=2;
 711         } else {
 712             compositeAndFwd=(((int32_t)list[1]&~COMP_2_TRAIL_MASK)<<16)|list[2];
 713             list+=3;
 714         }
 715         UChar32 composite=compositeAndFwd>>1;
 716         if((compositeAndFwd&1)!=0) {
 717             addComposites(getCompositionsListForComposite(getNorm16(composite)), set);
 718         }
 719         set.add(composite);
 720     } while((firstUnit&COMP_1_LAST_TUPLE)==0);
 721 }
 722
 723 /*
 724  * Recomposes the buffer text starting at recomposeStartIndex
 725  * (which is in NFD - decomposed and canonically ordered),
 726  * and truncates the buffer contents.
 727  *
 728  * Note that recomposition never lengthens the text:
 729  * Any character consists of either one or two code units;
 730  * a composition may contain at most one more code unit than the original starter,
 731  * while the combining mark that is removed has at least one code unit.
 732  */
 733 void Normalizer2Impl::recompose(ReorderingBuffer &buffer, int32_t recomposeStartIndex,
 734                                 UBool onlyContiguous) const {
 735     UChar *p=buffer.getStart()+recomposeStartIndex;
 736     UChar *limit=buffer.getLimit();
 737     if(p==limit) {
 738         return;
 739     }
 740
 741     UChar *starter, *pRemove, *q, *r;
 742     const uint16_t *compositionsList;
 743     UChar32 c, compositeAndFwd;
 744     uint16_t norm16;
 745     uint8_t cc, prevCC;
 746     UBool starterIsSupplementary;
 747
 748     // Some of the following variables are not used until we have a forward-combining starter
 749     // and are only initialized now to avoid compiler warnings.
 750     compositionsList=NULL;  // used as indicator for whether we have a forward-combining starter
 751     starter=NULL;
 752     starterIsSupplementary=FALSE;
 753     prevCC=0;
 754
 755     for(;;) {
 756         UTRIE2_U16_NEXT16(normTrie, p, limit, c, norm16);
 757         cc=getCCFromYesOrMaybe(norm16);
 758         if( // this character combines backward and
 759             isMaybe(norm16) &&
 760             // we have seen a starter that combines forward and
 761             compositionsList!=NULL &&
 762             // the backward-combining character is not blocked
 763             (prevCC<cc || prevCC==0)
 764         ) {
 765             if(isJamoVT(norm16)) {
 766                 // c is a Jamo V/T, see if we can compose it with the previous character.
 767                 if(c<Hangul::JAMO_T_BASE) {
 768                     // c is a Jamo Vowel, compose with previous Jamo L and following Jamo T.
 769                     UChar prev=(UChar)(*starter-Hangul::JAMO_L_BASE);
 770                     if(prev<Hangul::JAMO_L_COUNT) {
 771                         pRemove=p-1;
 772                         UChar syllable=(UChar)
 773                             (Hangul::HANGUL_BASE+
 774                              (prev*Hangul::JAMO_V_COUNT+(c-Hangul::JAMO_V_BASE))*
 775                              Hangul::JAMO_T_COUNT);
 776                         UChar t;
 777                         if(p!=limit && (t=(UChar)(*p-Hangul::JAMO_T_BASE))<Hangul::JAMO_T_COUNT) {
 778                             ++p;
 779                             syllable+=t;  // The next character was a Jamo T.
 780                         }
 781                         *starter=syllable;
 782                         // remove the Jamo V/T
 783                         q=pRemove;
 784                         r=p;
 785                         while(r<limit) {
 786                             *q++=*r++;
 787                         }
 788                         limit=q;
 789                         p=pRemove;
 790                     }
 791                 }
 792                 /*
 793                  * No "else" for Jamo T:
 794                  * Since the input is in NFD, there are no Hangul LV syllables that
 795                  * a Jamo T could combine with.
 796                  * All Jamo Ts are combined above when handling Jamo Vs.
 797                  */
 798                 if(p==limit) {
 799                     break;
 800                 }
 801                 compositionsList=NULL;
 802                 continue;
 803             } else if((compositeAndFwd=combine(compositionsList, c))>=0) {
 804                 // The starter and the combining mark (c) do combine.
 805                 UChar32 composite=compositeAndFwd>>1;
 806
 807                 // Replace the starter with the composite, remove the combining mark.
 808                 pRemove=p-U16_LENGTH(c);  // pRemove & p: start & limit of the combining mark
 809                 if(starterIsSupplementary) {
 810                     if(U_IS_SUPPLEMENTARY(composite)) {
 811                         // both are supplementary
 812                         starter[0]=U16_LEAD(composite);
 813                         starter[1]=U16_TRAIL(composite);
 814                     } else {
 815                         *starter=(UChar)composite;
 816                         // The composite is shorter than the starter,
 817                         // move the intermediate characters forward one.
 818                         starterIsSupplementary=FALSE;
 819                         q=starter+1;
 820                         r=q+1;
 821                         while(r<pRemove) {
 822                             *q++=*r++;
 823                         }
 824                         --pRemove;
 825                     }
 826                 } else if(U_IS_SUPPLEMENTARY(composite)) {
 827                     // The composite is longer than the starter,
 828                     // move the intermediate characters back one.
 829                     starterIsSupplementary=TRUE;
 830                     ++starter;  // temporarily increment for the loop boundary
 831                     q=pRemove;
 832                     r=++pRemove;
 833                     while(starter<q) {
 834                         *--r=*--q;
 835                     }
 836                     *starter=U16_TRAIL(composite);
 837                     *--starter=U16_LEAD(composite);  // undo the temporary increment
 838                 } else {
 839                     // both are on the BMP
 840                     *starter=(UChar)composite;
 841                 }
 842
 843                 /* remove the combining mark by moving the following text over it */
 844                 if(pRemove<p) {
 845                     q=pRemove;
 846                     r=p;
 847                     while(r<limit) {
 848                         *q++=*r++;
 849                     }
 850                     limit=q;
 851                     p=pRemove;
 852                 }
 853                 // Keep prevCC because we removed the combining mark.
 854
 855                 if(p==limit) {
 856                     break;
 857                 }
 858                 // Is the composite a starter that combines forward?
 859                 if(compositeAndFwd&1) {
 860                     compositionsList=
 861                         getCompositionsListForComposite(getNorm16(composite));
 862                 } else {
 863                     compositionsList=NULL;
 864                 }
 865
 866                 // We combined; continue with looking for compositions.
 867                 continue;
 868             }
 869         }
 870
 871         // no combination this time
 872         prevCC=cc;
 873         if(p==limit) {
 874             break;
 875         }
 876
 877         // If c did not combine, then check if it is a starter.
 878         if(cc==0) {
 879             // Found a new starter.
 880             if((compositionsList=getCompositionsListForDecompYes(norm16))!=NULL) {
 881                 // It may combine with something, prepare for it.
 882                 if(U_IS_BMP(c)) {
 883                     starterIsSupplementary=FALSE;
 884                     starter=p-1;
 885                 } else {
 886                     starterIsSupplementary=TRUE;
 887                     starter=p-2;
 888                 }
 889             }
 890         } else if(onlyContiguous) {
 891             // FCC: no discontiguous compositions; any intervening character blocks.
 892             compositionsList=NULL;
 893         }
 894     }
 895     buffer.setReorderingLimit(limit);
 896 }
 897
 898 // Very similar to composeQuickCheck(): Make the same changes in both places if relevant.
 899 // doCompose: normalize
 900 // !doCompose: isNormalized (buffer must be empty and initialized)
     //
     // src, limit: the input text; limit==NULL means that src is NUL-terminated.
     // onlyContiguous: TRUE for FCC (no discontiguous compositions).
     // buffer: receives the composed text (doCompose); in isNormalized mode it is only
     //         used for the decompose-and-recompose slow path of one segment at a time.
     // Returns FALSE as soon as isNormalized mode finds a difference, and also when
     // errorCode indicates a failure after copying the low prefix of a NUL-terminated string.
     // Returns TRUE otherwise -- including when the main loop is left via break because
     // a buffer operation failed, so callers need to check errorCode as well
     // (presumably set by the failing ReorderingBuffer function; confirm there).
 901 UBool
 902 Normalizer2Impl::compose(const UChar *src, const UChar *limit,
 903                          UBool onlyContiguous,
 904                          UBool doCompose,
 905                          ReorderingBuffer &buffer,
 906                          UErrorCode &errorCode) const {
 907     /*
 908      * prevBoundary points to the last character before the current one
 909      * that has a composition boundary before it with ccc==0 and quick check "yes".
 910      * Keeping track of prevBoundary saves us looking for a composition boundary
 911      * when we find a "no" or "maybe".
 912      *
 913      * When we back out from prevSrc back to prevBoundary,
 914      * then we also remove those same characters (which had been simply copied
 915      * or canonically-order-inserted) from the ReorderingBuffer.
 916      * Therefore, at all times, the [prevBoundary..prevSrc[ source units
 917      * must correspond 1:1 to destination units at the end of the destination buffer.
 918      */
 919     const UChar *prevBoundary=src;
 920     UChar32 minNoMaybeCP=minCompNoMaybeCP;
 921     if(limit==NULL) {
         // NUL-terminated input: handle the prefix of code units below minNoMaybeCP first
         // (the buffer is only passed along when actually composing), then find the real limit.
 922         src=copyLowPrefixFromNulTerminated(src, minNoMaybeCP,
 923                                            doCompose ? &buffer : NULL,
 924                                            errorCode);
 925         if(U_FAILURE(errorCode)) {
 926             return FALSE;
 927         }
 928         if(prevBoundary<src) {
 929             // Set prevBoundary to the last character in the prefix.
 930             prevBoundary=src-1;
 931         }
 932         limit=u_strchr(src, 0);
 933     }
 934
 935     const UChar *prevSrc;
 936     UChar32 c=0;
 937     uint16_t norm16=0;
 938
 939     // only for isNormalized
 940     uint8_t prevCC=0;
 941
 942     for(;;) {
 943         // count code units below the minimum or with irrelevant data for the quick check
 944         for(prevSrc=src; src!=limit;) {
 945             if( (c=*src)<minNoMaybeCP ||
 946                 isCompYesAndZeroCC(norm16=UTRIE2_GET16_FROM_U16_SINGLE_LEAD(normTrie, c))
 947             ) {
 948                 ++src;
 949             } else if(!U16_IS_SURROGATE(c)) {
 950                 break;
 951             } else {
                 // Surrogate code unit: assemble the supplementary code point if the
                 // matching surrogate is adjacent, and test the data for that code point.
 952                 UChar c2;
 953                 if(U16_IS_SURROGATE_LEAD(c)) {
 954                     if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) {
 955                         c=U16_GET_SUPPLEMENTARY(c, c2);
 956                     }
 957                 } else /* trail surrogate */ {
 958                     if(prevSrc<src && U16_IS_LEAD(c2=*(src-1))) {
 959                         --src;
 960                         c=U16_GET_SUPPLEMENTARY(c2, c);
 961                     }
 962                 }
 963                 if(isCompYesAndZeroCC(norm16=getNorm16(c))) {
 964                     src+=U16_LENGTH(c);
 965                 } else {
 966                     break;
 967                 }
 968             }
 969         }
 970         // copy these code units all at once
 971         if(src!=prevSrc) {
 972             if(doCompose) {
 973                 if(!buffer.appendZeroCC(prevSrc, src, errorCode)) {
 974                     break;
 975                 }
 976             } else {
 977                 prevCC=0;
 978             }
 979             if(src==limit) {
 980                 break;
 981             }
 982             // Set prevBoundary to the last character in the quick check loop.
 983             prevBoundary=src-1;
 984             if( U16_IS_TRAIL(*prevBoundary) && prevSrc<prevBoundary &&
 985                 U16_IS_LEAD(*(prevBoundary-1))
 986             ) {
 987                 --prevBoundary;
 988             }
 989             // The start of the current character (c).
 990             prevSrc=src;
 991         } else if(src==limit) {
 992             break;
 993         }
 994
         // Step over c, the character that ended the quick check loop.
 995         src+=U16_LENGTH(c);
 996         /*
 997          * isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.
 998          * c is either a "noNo" (has a mapping) or a "maybeYes" (combines backward)
 999          * or has ccc!=0.
1000          * Check for Jamo V/T, then for regular characters.
1001          * c is not a Hangul syllable or Jamo L because those have "yes" properties.
1002          */
1003         if(isJamoVT(norm16) && prevBoundary!=prevSrc) {
1004             UChar prev=*(prevSrc-1);
1005             UBool needToDecompose=FALSE;
1006             if(c<Hangul::JAMO_T_BASE) {
1007                 // c is a Jamo Vowel, compose with previous Jamo L and following Jamo T.
1008                 prev=(UChar)(prev-Hangul::JAMO_L_BASE);
1009                 if(prev<Hangul::JAMO_L_COUNT) {
1010                     if(!doCompose) {
1011                         return FALSE;
1012                     }
1013                     UChar syllable=(UChar)
1014                         (Hangul::HANGUL_BASE+
1015                          (prev*Hangul::JAMO_V_COUNT+(c-Hangul::JAMO_V_BASE))*
1016                          Hangul::JAMO_T_COUNT);
1017                     UChar t;
1018                     if(src!=limit && (t=(UChar)(*src-Hangul::JAMO_T_BASE))<Hangul::JAMO_T_COUNT) {
1019                         ++src;
1020                         syllable+=t;  // The next character was a Jamo T.
1021                         prevBoundary=src;
1022                         buffer.setLastChar(syllable);
1023                         continue;
1024                     }
1025                     // If we see L+V+x where x!=T then we drop to the slow path,
1026                     // decompose and recompose.
1027                     // This is to deal with NFKC finding normal L and V but a
1028                     // compatibility variant of a T. We need to either fully compose that
1029                     // combination here (which would complicate the code and may not work
1030                     // with strange custom data) or use the slow path -- or else our replacing
1031                     // two input characters (L+V) with one output character (LV syllable)
1032                     // would violate the invariant that [prevBoundary..prevSrc[ has the same
1033                     // length as what we appended to the buffer since prevBoundary.
1034                     needToDecompose=TRUE;
1035                 }
1036             } else if(Hangul::isHangulWithoutJamoT(prev)) {
1037                 // c is a Jamo Trailing consonant,
1038                 // compose with previous Hangul LV that does not contain a Jamo T.
1039                 if(!doCompose) {
1040                     return FALSE;
1041                 }
1042                 buffer.setLastChar((UChar)(prev+c-Hangul::JAMO_T_BASE));
1043                 prevBoundary=src;
1044                 continue;
1045             }
1046             if(!needToDecompose) {
1047                 // The Jamo V/T did not compose into a Hangul syllable.
1048                 if(doCompose) {
1049                     if(!buffer.appendBMP((UChar)c, 0, errorCode)) {
1050                         break;
1051                     }
1052                 } else {
1053                     prevCC=0;
1054                 }
1055                 continue;
1056             }
1057         }
1058         /*
1059          * Source buffer pointers:
1060          *
1061          *  all done      quick check   current char  not yet
1062          *                "yes" but     (c)           processed
1063          *                may combine
1064          *                forward
1065          * [-------------[-------------[-------------[-------------[
1066          * |             |             |             |             |
1067          * orig. src     prevBoundary  prevSrc       src           limit
1068          *
1069          *
1070          * Destination buffer pointers inside the ReorderingBuffer:
1071          *
1072          *  all done      might take    not filled yet
1073          *                characters for
1074          *                reordering
1075          * [-------------[-------------[-------------[
1076          * |             |             |             |
1077          * start         reorderStart  limit         |
1078          *                             +remainingCap.+
1079          */
1080         if(norm16>=MIN_YES_YES_WITH_CC) {
1081             uint8_t cc=(uint8_t)norm16;  // cc!=0
1082             if( onlyContiguous &&  // FCC
1083                 (doCompose ? buffer.getLastCC() : prevCC)==0 &&
1084                 prevBoundary<prevSrc &&
1085                 // buffer.getLastCC()==0 && prevBoundary<prevSrc tell us that
1086                 // [prevBoundary..prevSrc[ (which is exactly one character under these conditions)
1087                 // passed the quick check "yes && ccc==0" test.
1088                 // Check whether the last character was a "yesYes" or a "yesNo".
1089                 // If a "yesNo", then we get its trailing ccc from its
1090                 // mapping and check for canonical order.
1091                 // All other cases are ok.
1092                 getTrailCCFromCompYesAndZeroCC(prevBoundary, prevSrc)>cc
1093             ) {
1094                 // Fails FCD test, need to decompose and contiguously recompose.
1095                 if(!doCompose) {
1096                     return FALSE;
1097                 }
1098             } else if(doCompose) {
1099                 if(!buffer.append(c, cc, errorCode)) {
1100                     break;
1101                 }
1102                 continue;
1103             } else if(prevCC<=cc) {
1104                 prevCC=cc;
1105                 continue;
1106             } else {
1107                 return FALSE;
1108             }
1109         } else if(!doCompose && !isMaybeOrNonZeroCC(norm16)) {
1110             return FALSE;
1111         }
1112
1113         /*
1114          * Find appropriate boundaries around this character,
1115          * decompose the source text from between the boundaries,
1116          * and recompose it.
1117          *
1118          * We may need to remove the last few characters from the ReorderingBuffer
1119          * to account for source text that was copied or appended
1120          * but needs to take part in the recomposition.
1121          */
1122
1123         /*
1124          * Find the last composition boundary in [prevBoundary..src[.
1125          * It is either the decomposition of the current character (at prevSrc),
1126          * or prevBoundary.
1127          */
1128         if(hasCompBoundaryBefore(c, norm16)) {
1129             prevBoundary=prevSrc;
1130         } else if(doCompose) {
1131             buffer.removeSuffix((int32_t)(prevSrc-prevBoundary));
1132         }
1133
1134         // Find the next composition boundary in [src..limit[ -
1135         // modifies src to point to the next starter.
1136         src=(UChar *)findNextCompBoundary(src, limit);
1137
1138         // Decompose [prevBoundary..src[ into the buffer and then recompose that part of it.
1139         int32_t recomposeStartIndex=buffer.length();
1140         if(!decomposeShort(prevBoundary, src, buffer, errorCode)) {
1141             break;
1142         }
1143         recompose(buffer, recomposeStartIndex, onlyContiguous);
1144         if(!doCompose) {
             // isNormalized: the segment is normalized only if recomposing it reproduces
             // the source text; afterwards the scratch text is discarded again
             // (assuming that remove() empties the buffer -- confirm in ReorderingBuffer).
1145             if(!buffer.equals(prevBoundary, src)) {
1146                 return FALSE;
1147             }
1148             buffer.remove();
1149             prevCC=0;
1150         }
1151
1152         // Move to the next starter. We never need to look back before this point again.
1153         prevBoundary=src;
1154     }
1155     return TRUE;
1156 }
1157
1158 // Very similar to compose(): Make the same changes in both places if relevant.
1159 // pQCResult==NULL: spanQuickCheckYes
1160 // pQCResult!=NULL: quickCheck (*pQCResult must be UNORM_YES)
     //
     // src, limit: the input text; limit==NULL means that src is NUL-terminated.
     // onlyContiguous: TRUE for FCC.
     // Returns the end of the text if no character stopped the scan.
     // A "maybe" character stops the scan only for spanQuickCheckYes (prevBoundary is returned);
     // for quickCheck it sets *pQCResult=UNORM_MAYBE and the scan continues.
     // A "no" character or a canonical ordering problem sets *pQCResult=UNORM_NO (if not NULL)
     // and returns prevBoundary.
1161 const UChar *
1162 Normalizer2Impl::composeQuickCheck(const UChar *src, const UChar *limit,
1163                                    UBool onlyContiguous,
1164                                    UNormalizationCheckResult *pQCResult) const {
1165     /*
1166      * prevBoundary points to the last character before the current one
1167      * that has a composition boundary before it with ccc==0 and quick check "yes".
1168      */
1169     const UChar *prevBoundary=src;
1170     UChar32 minNoMaybeCP=minCompNoMaybeCP;
1171     if(limit==NULL) {
         // No buffer is passed here, and the local errorCode is not examined afterwards.
1172         UErrorCode errorCode=U_ZERO_ERROR;
1173         src=copyLowPrefixFromNulTerminated(src, minNoMaybeCP, NULL, errorCode);
1174         if(prevBoundary<src) {
1175             // Set prevBoundary to the last character in the prefix.
1176             prevBoundary=src-1;
1177         }
1178         limit=u_strchr(src, 0);
1179     }
1180
1181     const UChar *prevSrc;
1182     UChar32 c=0;
1183     uint16_t norm16=0;
1184     uint8_t prevCC=0;
1185
1186     for(;;) {
1187         // count code units below the minimum or with irrelevant data for the quick check
1188         for(prevSrc=src;;) {
1189             if(src==limit) {
1190                 return src;
1191             }
1192             if( (c=*src)<minNoMaybeCP ||
1193                 isCompYesAndZeroCC(norm16=UTRIE2_GET16_FROM_U16_SINGLE_LEAD(normTrie, c))
1194             ) {
1195                 ++src;
1196             } else if(!U16_IS_SURROGATE(c)) {
1197                 break;
1198             } else {
                 // Surrogate code unit: assemble the supplementary code point if the
                 // matching surrogate is adjacent, and test the data for that code point.
1199                 UChar c2;
1200                 if(U16_IS_SURROGATE_LEAD(c)) {
1201                     if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) {
1202                         c=U16_GET_SUPPLEMENTARY(c, c2);
1203                     }
1204                 } else /* trail surrogate */ {
1205                     if(prevSrc<src && U16_IS_LEAD(c2=*(src-1))) {
1206                         --src;
1207                         c=U16_GET_SUPPLEMENTARY(c2, c);
1208                     }
1209                 }
1210                 if(isCompYesAndZeroCC(norm16=getNorm16(c))) {
1211                     src+=U16_LENGTH(c);
1212                 } else {
1213                     break;
1214                 }
1215             }
1216         }
1217         if(src!=prevSrc) {
1218             // Set prevBoundary to the last character in the quick check loop.
1219             prevBoundary=src-1;
1220             if( U16_IS_TRAIL(*prevBoundary) && prevSrc<prevBoundary &&
1221                 U16_IS_LEAD(*(prevBoundary-1))
1222             ) {
1223                 --prevBoundary;
1224             }
1225             prevCC=0;
1226             // The start of the current character (c).
1227             prevSrc=src;
1228         }
1229
         // Step over c, the character that ended the quick check loop.
1230         src+=U16_LENGTH(c);
1231         /*
1232          * isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.
1233          * c is either a "noNo" (has a mapping) or a "maybeYes" (combines backward)
1234          * or has ccc!=0.
1235          */
1236         if(isMaybeOrNonZeroCC(norm16)) {
1237             uint8_t cc=getCCFromYesOrMaybe(norm16);
1238             if( onlyContiguous &&  // FCC
1239                 cc!=0 &&
1240                 prevCC==0 &&
1241                 prevBoundary<prevSrc &&
1242                 // prevCC==0 && prevBoundary<prevSrc tell us that
1243                 // [prevBoundary..prevSrc[ (which is exactly one character under these conditions)
1244                 // passed the quick check "yes && ccc==0" test.
1245                 // Check whether the last character was a "yesYes" or a "yesNo".
1246                 // If a "yesNo", then we get its trailing ccc from its
1247                 // mapping and check for canonical order.
1248                 // All other cases are ok.
1249                 getTrailCCFromCompYesAndZeroCC(prevBoundary, prevSrc)>cc
1250             ) {
1251                 // Fails FCD test.
                 // Nothing to do in this branch: control falls through to the "no" result below.
1252             } else if(prevCC<=cc || cc==0) {
1253                 prevCC=cc;
1254                 if(norm16<MIN_YES_YES_WITH_CC) {
                     // c is a "maybe": record it for quickCheck, or stop for spanQuickCheckYes.
1255                     if(pQCResult!=NULL) {
1256                         *pQCResult=UNORM_MAYBE;
1257                     } else {
1258                         return prevBoundary;
1259                     }
1260                 }
1261                 continue;
1262             }
1263         }
         // "no": c has a mapping, or the combining classes are out of order.
1264         if(pQCResult!=NULL) {
1265             *pQCResult=UNORM_NO;
1266         }
1267         return prevBoundary;
1268     }
1269 }
1270
1271 void Normalizer2Impl::composeAndAppend(const UChar *src, const UChar *limit,
1272                                        UBool doCompose,
1273                                        UBool onlyContiguous,
1274                                        ReorderingBuffer &buffer,
1275                                        UErrorCode &errorCode) const {
         // Appends the text [src, limit[ to the text that is already in the buffer.
         // If the buffer is not empty and src does not start at a composition boundary,
         // then the end of the buffer (from its last boundary) and the start of src
         // (up to its first boundary) are taken out, joined into "middle" and composed
         // together, because characters across the seam may interact.
         // The rest of src is then composed (doCompose) or appended unchanged (!doCompose).
         // limit==NULL means that src is NUL-terminated, as in compose().
         // Failures are reported via errorCode.
1276     if(!buffer.isEmpty()) {
1277         const UChar *firstStarterInSrc=findNextCompBoundary(src, limit);
1278         if(src!=firstStarterInSrc) {
1279             const UChar *lastStarterInDest=findPreviousCompBoundary(buffer.getStart(),
1280                                                                     buffer.getLimit());
1281             UnicodeString middle(lastStarterInDest,
1282                                  (int32_t)(buffer.getLimit()-lastStarterInDest));
1283             buffer.removeSuffix((int32_t)(buffer.getLimit()-lastStarterInDest));
1284             middle.append(src, (int32_t)(firstStarterInSrc-src));
1285             const UChar *middleStart=middle.getBuffer();
1286             compose(middleStart, middleStart+middle.length(), onlyContiguous,
1287                     TRUE, buffer, errorCode);
1288             if(U_FAILURE(errorCode)) {
1289                 return;
1290             }
1291             src=firstStarterInSrc;
1292         }
1293     }
1294     if(doCompose) {
1295         compose(src, limit, onlyContiguous, TRUE, buffer, errorCode);
1296     } else {
             // compose() resolves a NULL limit itself, but appendZeroCC() is handed an
             // explicit range everywhere else in this file, so find the terminating NUL here
             // (assumes that appendZeroCC() needs limit!=NULL -- confirm in ReorderingBuffer).
1297         buffer.appendZeroCC(src, limit!=NULL ? limit : u_strchr(src, 0), errorCode);
1298     }
1299 }
1300
1301 /**
1302  * Tests whether there is a composition boundary before c.
1303  * There is one if the decomposition of c starts with a character
1304  * that has ccc=0 && NFC_QC=Yes (isCompYesAndZeroCC()).
1305  * If c itself has ccc=0 && NFC_QC=Yes (isCompYesAndZeroCC())
1306  * then this is known without looking at a decomposition.
1307  */
1308 UBool Normalizer2Impl::hasCompBoundaryBefore(UChar32 c, uint16_t norm16) const {
1309     while(!isCompYesAndZeroCC(norm16)) {
1310         if(isMaybeOrNonZeroCC(norm16)) {
1311             return FALSE;
1312         }
1313         if(isDecompNoAlgorithmic(norm16)) {
1314             c=mapAlgorithmic(c, norm16);
1315             norm16=getNorm16(c);
1316             continue;
1317         }
1318         // c decomposes, the mapping is stored in the variable-length extra data
1319         const uint16_t *extra=getMapping(norm16);
1320         const uint16_t header=*extra++;
1321         if((header&MAPPING_LENGTH_MASK)==0) {
1322             return FALSE;
1323         }
1324         if((header&MAPPING_HAS_CCC_LCCC_WORD)!=0 && (*extra++&0xff00)!=0) {
1325             return FALSE;  // non-zero leadCC
1326         }
1327         int32_t offset=0;
1328         UChar32 first;
1329         U16_NEXT_UNSAFE(extra, offset, first);
1330         return isCompYesAndZeroCC(getNorm16(first));
1331     }
1332     return TRUE;
1333 }
1334
1335 UBool Normalizer2Impl::hasCompBoundaryAfter(UChar32 c, UBool onlyContiguous, UBool testInert) const {
1336     for(;;) {
1337         const uint16_t data=getNorm16(c);
1338         if(isInert(data)) {
1339             return TRUE;
1340         }
1341         if(data<=minYesNo) {
1342             // Hangul LVT (==minYesNo) has a boundary after it.
1343             // Hangul LV and non-inert yesYes characters combine forward.
1344             return isHangul(data) && !Hangul::isHangulWithoutJamoT((UChar)c);
1345         }
1346         if(data>=(testInert ? minNoNo : minMaybeYes)) {
1347             return FALSE;
1348         }
1349         if(isDecompNoAlgorithmic(data)) {
1350             c=mapAlgorithmic(c, data);  // 1:1 mapping: test the mapping target next
1351             continue;
1352         }
1353         // c decomposes, look at the first unit of its variable-length extra data.
1354         // If testInert, then c must be a yesNo character (lccc=0), otherwise it could be a noNo.
1355         const uint16_t header=*getMapping(data);
1356         if((header&MAPPING_LENGTH_MASK)==0) {
1357             return FALSE;  // c is deleted
1358         }
1359         if((header&(MAPPING_PLUS_COMPOSITION_LIST|MAPPING_NO_COMP_BOUNDARY_AFTER))!=0) {
1360             return FALSE;  // c or its decomposition combines forward, or there is no starter
1361         }
1362         return !onlyContiguous || header<=0x1ff;  // if FCC then trailCC<=1
1363     }
1364 }
1365
1366 const UChar *Normalizer2Impl::findPreviousCompBoundary(const UChar *start, const UChar *p) const {
1367     BackwardUTrie2StringIterator backward(normTrie, start, p);
1368     // Step back until the code point just read has a composition boundary before it.
1369     // We could also test hasCompBoundaryAfter() and return codePointLimit,
1370     // but that's probably not worth the extra cost.
1371     for(uint16_t norm16=backward.previous16();
1372             !hasCompBoundaryBefore(backward.codePoint, norm16);
1373             norm16=backward.previous16()) {}
1374     return backward.codePointStart;
1375 }
1376
1377 const UChar *Normalizer2Impl::findNextCompBoundary(const UChar *p, const UChar *limit) const {
1378     ForwardUTrie2StringIterator forward(normTrie, p, limit);
1379     // Advance until the code point just read has a composition boundary before it.
1380     for(uint16_t norm16=forward.next16();
1381             !hasCompBoundaryBefore(forward.codePoint, norm16);
1382             norm16=forward.next16()) {}
1383     return forward.codePointStart;
1384 }
1385
1386 class FCDTrieSingleton : public UTrie2Singleton {
         // Helper that lets UTrie2Singleton build the FCD trie of a Normalizer2Impl:
         // getInstance() passes createInstance() and this object to
         // UTrie2Singleton::getInstance(); createInstance() fills newFCDTrie
         // via rangeHandler().
1387 public:
1388     FCDTrieSingleton(SimpleSingleton &s, Normalizer2Impl &ni, UErrorCode &ec) :
1389         UTrie2Singleton(s), impl(ni), newFCDTrie(NULL), errorCode(ec) {}
1390     UTrie2 *getInstance(UErrorCode &errorCode) {
1391         return UTrie2Singleton::getInstance(createInstance, this, errorCode);
1392     }
         // Instantiation function; context must point to the FCDTrieSingleton.
         // Returns the new trie, or NULL in case of a failure.
1393     static void *createInstance(const void *context, UErrorCode &errorCode);
         // Handles one [start..end] range of code points with the same norm trie value:
         // Ranges with value 0 are skipped, all others get their FCD values set in newFCDTrie.
         // Returns FALSE once the errorCode member indicates a failure.
1394     UBool rangeHandler(UChar32 start, UChar32 end, uint32_t value) {
1395         if(value!=0) {
1396             impl.setFCD16FromNorm16(start, end, (uint16_t)value, newFCDTrie, errorCode);
1397         }
1398         return U_SUCCESS(errorCode);
1399     }
1400
1401     Normalizer2Impl &impl;
         // The trie under construction; NULL until createInstance() opens it.
1402     UTrie2 *newFCDTrie;
1403     UErrorCode &errorCode;
1404 };
1405
1406 U_CDECL_BEGIN
1407
1408 // Sets the FCD value for a range of same-norm16 characters: forwards to the FCDTrieSingleton in context.
1409 static UBool U_CALLCONV enumRangeHandler(const void *context, UChar32 start, UChar32 end, uint32_t value) {
1410     FCDTrieSingleton *singleton=(FCDTrieSingleton *)context;
1411     return singleton->rangeHandler(start, end, value);
1412 }
1413
1414 // ORs the FCD value of a range of supplementary characters into the uint32_t
1415 // that context points to; used to collect the values for a lead surrogate code unit.
1416 static UBool U_CALLCONV enumRangeOrValue(const void *context, UChar32 /*start*/, UChar32 /*end*/, uint32_t value) {
1417     uint32_t *accumulator=(uint32_t *)context;
1418     *accumulator|=value;
1419     return TRUE;
1420 }
1421
1422 U_CDECL_END
1423
1424 void *FCDTrieSingleton::createInstance(const void *context, UErrorCode &errorCode) {
         // Builds and freezes the FCD trie for the FCDTrieSingleton that context points to.
         // Returns the trie, or NULL (after closing the partial trie) in case of a failure.
1425     FCDTrieSingleton *self=(FCDTrieSingleton *)context;
1426     self->newFCDTrie=utrie2_open(0, 0, &errorCode);
1427     if(U_FAILURE(errorCode)) {
1428         utrie2_close(self->newFCDTrie);
1429         return NULL;
1430     }
1431     utrie2_enum(self->impl.getNormTrie(), NULL, enumRangeHandler, self);
1432     for(UChar leadUnit=0xd800; leadUnit<0xdc00; ++leadUnit) {
1433         uint32_t combined=utrie2_get32(self->newFCDTrie, leadUnit);
1434         utrie2_enumForLeadSurrogate(self->newFCDTrie, leadUnit, NULL, enumRangeOrValue, &combined);
1435         if(combined==0) {
1436             continue;
1437         }
             // Set a "bad" value for makeFCD() to break the quick check loop
             // and look up the value for the supplementary code point.
             // The ORed-together value's tccc is already the worst case.
1438         // If there is any lccc, then set the worst-case lccc of 1.
1439         if(combined>0xff) {
1440             combined=0x100|(combined&0xff);
1441         }
1442         utrie2_set32ForLeadSurrogateCodeUnit(self->newFCDTrie, leadUnit, combined, &errorCode);
1443     }
1444     utrie2_freeze(self->newFCDTrie, UTRIE2_16_VALUE_BITS, &errorCode);
1445     if(U_FAILURE(errorCode)) {
1446         utrie2_close(self->newFCDTrie);
1447         return NULL;
1448     }
1449     return self->newFCDTrie;
1450 }
1451
1452 void Normalizer2Impl::setFCD16FromNorm16(UChar32 start, UChar32 end, uint16_t norm16,
1453                                          UTrie2 *newFCDTrie, UErrorCode &errorCode) const {
         // Derives the FCD value (lccc in bits 15..8, tccc in bits 7..0) from norm16 and
         // stores it in newFCDTrie for all code points in [start..end].
         // norm16 is the norm trie value that the characters of the range have in common.
         // Failures of the trie functions are reported via errorCode.
         //
         // A character with a 1:1 algorithmic mapping gets the FCD value of its mapping target.
         // The value must be stored for the original character, so the range is never
         // moved to the mapping target; only the looked-up character (mapped) changes.
1454     // mapped is the character whose data is examined: it follows 1:1 algorithmic
1455     // mappings, while [start..end] keeps naming the characters that get the value.
1456     UChar32 mapped=start;
1457     for(;;) {
1458         if(norm16>=MIN_NORMAL_MAYBE_YES) {
1459             norm16&=0xff;
1460             norm16|=norm16<<8;
1461         } else if(norm16<=minYesNo || minMaybeYes<=norm16) {
1462             // no decomposition or Hangul syllable, all zeros:
1463             // nothing to store because createInstance() opens the trie with initial value 0
1464             return;
1465         } else if(limitNoNo<=norm16) {
1466             if(start!=end) {
1467                 // The same delta leads from different original characters to
1468                 // different mapping targets: handle each character on its own.
1469                 do {
1470                     setFCD16FromNorm16(start, start, norm16, newFCDTrie, errorCode);
1471                 } while(++start<=end);
1472                 return;
1473             }
1474             // 1:1 algorithmic mapping: continue with the data of the mapping target.
1475             mapped+=norm16-(minMaybeYes-MAX_DELTA-1);
1476             norm16=getNorm16(mapped);
1477             continue;
1478         } else {
1479             // c decomposes, get everything from the variable-length extra data
1480             const uint16_t *mapping=getMapping(norm16);
1481             uint16_t firstUnit=*mapping;
1482             if((firstUnit&MAPPING_LENGTH_MASK)==0) {
1483                 // A character that is deleted (maps to an empty string) must
1484                 // get the worst-case lccc and tccc values because arbitrary
1485                 // characters on both sides will become adjacent.
1486                 norm16=0x1ff;
1487             } else if(firstUnit&MAPPING_HAS_CCC_LCCC_WORD) {
1488                 norm16=(mapping[1]&0xff00)|(firstUnit>>8);  // lccc | tccc
1489             } else {
1490                 norm16=firstUnit>>8;  // lccc==0, only tccc
1491             }
1492         }
1493         utrie2_setRange32(newFCDTrie, start, end, norm16, TRUE, &errorCode);
1494         return;
1495     }
1496 }
1497
1498 const UTrie2 *Normalizer2Impl::getFCDTrie(UErrorCode &errorCode) const {
1499     Normalizer2Impl &self=*const_cast<Normalizer2Impl *>(this);  // logically const: synchronized instantiation
1500     FCDTrieSingleton singleton(self.fcdTrieSingleton, self, errorCode);
1501     return singleton.getInstance(errorCode);
1502 }
1503
1504 // Dual functionality:
1505 // buffer!=NULL: normalize
1506 // buffer==NULL: isNormalized/quickCheck/spanQuickCheckYes
     //
     // Scans the text [src, limit[ and checks that each character's lead combining
     // class (lccc, upper byte of its fcd16 value) is not lower than the trail
     // combining class (tccc, lower byte) of the character before it.
     //
     // Parameters:
     //   src, limit  The input text. limit==NULL means that src is NUL-terminated;
     //               the real limit is then found with u_strchr().
     //   buffer      Destination for the output text, or NULL for check-only mode.
     //   errorCode   Passed to the helpers; when a helper reports failure,
     //               the function stops and returns the current position.
     // Returns:
     //   In check-only mode, the last FCD-safe boundary (prevBoundary) as soon as
     //   an out-of-order pair is seen. Otherwise the position reached in the
     //   input (limit after a complete pass).
1507 const UChar *
1508 Normalizer2Impl::makeFCD(const UChar *src, const UChar *limit,
1509                          ReorderingBuffer *buffer,
1510                          UErrorCode &errorCode) const {
1511     // Tracks the last FCD-safe boundary, before lccc=0 or after properly-ordered tccc<=1.
1512     // Similar to the prevBoundary in the compose() implementation.
1513     const UChar *prevBoundary=src;
         // fcd16 of the previous character, or (when negative) the bitwise complement
         // of a code unit below MIN_CCC_LCCC_CP whose lookup has been deferred.
1514     int32_t prevFCD16=0;
1515     if(limit==NULL) {
             // NUL-terminated input: let the helper handle the leading code units
             // below MIN_CCC_LCCC_CP, then continue from where it stopped.
1516         src=copyLowPrefixFromNulTerminated(src, MIN_CCC_LCCC_CP, buffer, errorCode);
1517         if(U_FAILURE(errorCode)) {
1518             return src;
1519         }
1520         if(prevBoundary<src) {
1521             prevBoundary=src;
1522             // We know that the previous character's lccc==0.
1523             // Fetching the fcd16 value was deferred for this below-U+0300 code point.
1524             prevFCD16=getFCD16FromSingleLead(*(src-1));
1525             if(prevFCD16>1) {
                     // The previous character has tccc>1, so the boundary is before it.
1526                 --prevBoundary;
1527             }
1528         }
             // From here on the text is handled like input with an explicit limit.
1529         limit=u_strchr(src, 0);
1530     }
1531
1532     // Note: In this function we use buffer->appendZeroCC() because we track
1533     // the lead and trail combining classes here, rather than leaving it to
1534     // the ReorderingBuffer.
1535     // The exception is the call to decomposeShort() which uses the buffer
1536     // in the normal way.
1537
1538     const UTrie2 *trie=fcdTrie();
1539
         // prevSrc: start of the run being scanned, later the start of the current character.
         // c, fcd16: the current character and its FCD value.
1540     const UChar *prevSrc;
1541     UChar32 c=0;
1542     uint16_t fcd16=0;
1543
1544     for(;;) {
1545         // count code units with lccc==0
1546         for(prevSrc=src; src!=limit;) {
1547             if((c=*src)<MIN_CCC_LCCC_CP) {
                     // Defer the lookup: remember the code unit as a negative value.
                     // It is decoded with ~prevFCD16 after this loop if it is needed.
1548                 prevFCD16=~c;
1549                 ++src;
1550             } else if((fcd16=UTRIE2_GET16_FROM_U16_SINGLE_LEAD(trie, c))<=0xff) {
                     // fcd16<=0xff: the lccc byte is 0.
1551                 prevFCD16=fcd16;
1552                 ++src;
1553             } else if(!U16_IS_SURROGATE(c)) {
                     // BMP character with lccc!=0: leave the fast loop.
1554                 break;
1555             } else {
                     // Surrogate code unit: combine it with an adjacent matching
                     // surrogate, if there is one, and look up the whole code point.
1556                 UChar c2;
1557                 if(U16_IS_SURROGATE_LEAD(c)) {
1558                     if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) {
1559                         c=U16_GET_SUPPLEMENTARY(c, c2);
1560                     }
1561                 } else /* trail surrogate */ {
1562                     if(prevSrc<src && U16_IS_LEAD(c2=*(src-1))) {
                             // Move src back to the start of the code point.
1563                         --src;
1564                         c=U16_GET_SUPPLEMENTARY(c2, c);
1565                     }
1566                 }
1567                 if((fcd16=getFCD16(c))<=0xff) {
1568                     prevFCD16=fcd16;
1569                     src+=U16_LENGTH(c);
1570                 } else {
1571                     break;
1572                 }
1573             }
1574         }
1575         // copy these code units all at once
1576         if(src!=prevSrc) {
1577             if(buffer!=NULL && !buffer->appendZeroCC(prevSrc, src, errorCode)) {
1578                 break;
1579             }
1580             if(src==limit) {
1581                 break;
1582             }
1583             prevBoundary=src;
1584             // We know that the previous character's lccc==0.
1585             if(prevFCD16<0) {
1586                 // Fetching the fcd16 value was deferred for this below-U+0300 code point.
1587                 prevFCD16=getFCD16FromSingleLead((UChar)~prevFCD16);
1588                 if(prevFCD16>1) {
1589                     --prevBoundary;
1590                 }
1591             } else {
1592                 const UChar *p=src-1;
1593                 if(U16_IS_TRAIL(*p) && prevSrc<p && U16_IS_LEAD(*(p-1))) {
1594                     --p;
1595                     // Need to fetch the previous character's FCD value because
1596                     // prevFCD16 was just for the trail surrogate code point.
1597                     prevFCD16=getFCD16FromSurrogatePair(p[0], p[1]);
1598                     // Still known to have lccc==0 because its lead surrogate unit had lccc==0.
1599                 }
1600                 if(prevFCD16>1) {
                         // tccc>1: the boundary is before the previous character.
1601                     prevBoundary=p;
1602                 }
1603             }
1604             // The start of the current character (c).
1605             prevSrc=src;
1606         } else if(src==limit) {
1607             break;
1608         }
1609
             // Step over the current character; prevSrc still points to its start.
1610         src+=U16_LENGTH(c);
1611         // The current character (c) at [prevSrc..src[ has a non-zero lead combining class.
1612         // Check for proper order, and decompose locally if necessary.
1613         if((prevFCD16&0xff)<=(fcd16>>8)) {
1614             // proper order: prev tccc <= current lccc
1615             if((fcd16&0xff)<=1) {
1616                 prevBoundary=src;
1617             }
1618             if(buffer!=NULL && !buffer->appendZeroCC(c, errorCode)) {
1619                 break;
1620             }
1621             prevFCD16=fcd16;
1622             continue;
1623         } else if(buffer==NULL) {
1624             return prevBoundary;  // quick check "no"
1625         } else {
1626             /*
1627              * Back out the part of the source that we copied or appended
1628              * already but is now going to be decomposed.
1629              * prevSrc is set to after what was copied/appended.
1630              */
1631             buffer->removeSuffix((int32_t)(prevSrc-prevBoundary));
1632             /*
1633              * Find the part of the source that needs to be decomposed,
1634              * up to the next safe boundary.
1635              */
1636             src=findNextFCDBoundary(src, limit);
1637             /*
1638              * The source text does not fulfill the conditions for FCD.
1639              * Decompose and reorder a limited piece of the text.
1640              */
1641             if(!decomposeShort(prevBoundary, src, *buffer, errorCode)) {
1642                 break;
1643             }
                 // Continue after the decomposed piece as if at the start of text.
1644             prevBoundary=src;
1645             prevFCD16=0;
1646         }
1647     }
         // The loop ended at the limit, or early because a buffer operation failed.
1648     return src;
1649 }
1650
1651 void Normalizer2Impl::makeFCDAndAppend(const UChar *src, const UChar *limit,
1652                                        UBool doMakeFCD,
1653                                        ReorderingBuffer &buffer,
1654                                        UErrorCode &errorCode) const {
1655     if(!buffer.isEmpty()) {
1656         const UChar *firstBoundaryInSrc=findNextFCDBoundary(src, limit);
1657         if(src!=firstBoundaryInSrc) {
1658             const UChar *lastBoundaryInDest=findPreviousFCDBoundary(buffer.getStart(),
1659                                                                     buffer.getLimit());
1660             UnicodeString middle(lastBoundaryInDest,
1661                                  (int32_t)(buffer.getLimit()-lastBoundaryInDest));
1662             buffer.removeSuffix((int32_t)(buffer.getLimit()-lastBoundaryInDest));
1663             middle.append(src, (int32_t)(firstBoundaryInSrc-src));
1664             const UChar *middleStart=middle.getBuffer();
1665             makeFCD(middleStart, middleStart+middle.length(), &buffer, errorCode);
1666             if(U_FAILURE(errorCode)) {
1667                 return;
1668             }
1669             src=firstBoundaryInSrc;
1670         }
1671     }
1672     if(doMakeFCD) {
1673         makeFCD(src, limit, &buffer, errorCode);
1674     } else {
1675         buffer.appendZeroCC(src, limit, errorCode);
1676     }
1677 }
1678
1679 const UChar *Normalizer2Impl::findPreviousFCDBoundary(const UChar *start, const UChar *p) const {
1680     BackwardUTrie2StringIterator iter(fcdTrie(), start, p);
1681     uint16_t fcd16;
1682     do {
1683         fcd16=iter.previous16();
1684     } while(fcd16>0xff);
1685     return iter.codePointStart;
1686 }
1687
1688 const UChar *Normalizer2Impl::findNextFCDBoundary(const UChar *p, const UChar *limit) const {
1689     ForwardUTrie2StringIterator iter(fcdTrie(), p, limit);
1690     uint16_t fcd16;
1691     do {
1692         fcd16=iter.next16();
1693     } while(fcd16>0xff);
1694     return iter.codePointStart;
1695 }
1696
1697 // CanonicalIterator data -------------------------------------------------- ***
1698
1699 CanonIterData::CanonIterData(UErrorCode &errorCode) :
1700         trie(utrie2_open(0, 0, &errorCode)),
1701         canonStartSets(uhash_deleteUObject, NULL, errorCode) {}
1702
1703 CanonIterData::~CanonIterData() {
1704     utrie2_close(trie);
1705 }
1706
1707 void CanonIterData::addToStartSet(UChar32 origin, UChar32 decompLead, UErrorCode &errorCode) {
1708     uint32_t canonValue=utrie2_get32(trie, decompLead);
1709     if((canonValue&(CANON_HAS_SET|CANON_VALUE_MASK))==0 && origin!=0) {
1710         // origin is the first character whose decomposition starts with
1711         // the character for which we are setting the value.
1712         utrie2_set32(trie, decompLead, canonValue|origin, &errorCode);
1713     } else {
1714         // origin is not the first character, or it is U+0000.
1715         UnicodeSet *set;
1716         if((canonValue&CANON_HAS_SET)==0) {
1717             set=new UnicodeSet;
1718             if(set==NULL) {
1719                 errorCode=U_MEMORY_ALLOCATION_ERROR;
1720                 return;
1721             }
1722             UChar32 firstOrigin=(UChar32)(canonValue&CANON_VALUE_MASK);
1723             canonValue=(canonValue&~CANON_VALUE_MASK)|CANON_HAS_SET|(uint32_t)canonStartSets.size();
1724             utrie2_set32(trie, decompLead, canonValue, &errorCode);
1725             canonStartSets.addElement(set, errorCode);
1726             if(firstOrigin!=0) {
1727                 set->add(firstOrigin);
1728             }
1729         } else {
1730             set=(UnicodeSet *)canonStartSets[(int32_t)(canonValue&CANON_VALUE_MASK)];
1731         }
1732         set->add(origin);
1733     }
1734 }
1735
1736 class CanonIterDataSingleton {
1737 public:
1738     CanonIterDataSingleton(SimpleSingleton &s, Normalizer2Impl &ni, UErrorCode &ec) :
1739         singleton(s), impl(ni), errorCode(ec) {}
1740     CanonIterData *getInstance(UErrorCode &errorCode) {
1741         void *duplicate;
1742         CanonIterData *instance=
1743             (CanonIterData *)singleton.getInstance(createInstance, this, duplicate, errorCode);
1744         delete (CanonIterData *)duplicate;
1745         return instance;
1746     }
1747     static void *createInstance(const void *context, UErrorCode &errorCode);
1748     UBool rangeHandler(UChar32 start, UChar32 end, uint32_t value) {
1749         if(value!=0) {
1750             impl.makeCanonIterDataFromNorm16(start, end, (uint16_t)value, *newData, errorCode);
1751         }
1752         return U_SUCCESS(errorCode);
1753     }
1754
1755 private:
1756     SimpleSingleton &singleton;
1757     Normalizer2Impl &impl;
1758     CanonIterData *newData;
1759     UErrorCode &errorCode;
1760 };
1761
1762 U_CDECL_BEGIN
1763
1764 // Call Normalizer2Impl::makeCanonIterDataFromNorm16() for a range of same-norm16 characters.
1765 static UBool U_CALLCONV
1766 enumCIDRangeHandler(const void *context, UChar32 start, UChar32 end, uint32_t value) {
1767     return ((CanonIterDataSingleton *)context)->rangeHandler(start, end, value);
1768 }
1769
1770 U_CDECL_END
1771
1772 void *CanonIterDataSingleton::createInstance(const void *context, UErrorCode &errorCode) {
1773     CanonIterDataSingleton *me=(CanonIterDataSingleton *)context;
1774     me->newData=new CanonIterData(errorCode);
1775     if(me->newData==NULL) {
1776         errorCode=U_MEMORY_ALLOCATION_ERROR;
1777         return NULL;
1778     }
1779     if(U_SUCCESS(errorCode)) {
1780         utrie2_enum(me->impl.getNormTrie(), NULL, enumCIDRangeHandler, me);
1781         utrie2_freeze(me->newData->trie, UTRIE2_32_VALUE_BITS, &errorCode);
1782         if(U_SUCCESS(errorCode)) {
1783             return me->newData;
1784         }
1785     }
1786     delete me->newData;
1787     return NULL;
1788 }
1789
1790 void Normalizer2Impl::makeCanonIterDataFromNorm16(UChar32 start, UChar32 end, uint16_t norm16,
1791                                                   CanonIterData &newData,
1792                                                   UErrorCode &errorCode) const {
1793     if(norm16==0 || (minYesNo<=norm16 && norm16<minNoNo)) {
1794         // Inert, or 2-way mapping (including Hangul syllable).
1795         // We do not write a canonStartSet for any yesNo character.
1796         // Composites from 2-way mappings are added at runtime from the
1797         // starter's compositions list, and the other characters in
1798         // 2-way mappings get CANON_NOT_SEGMENT_STARTER set because they are
1799         // "maybe" characters.
1800         return;
1801     }
1802     for(UChar32 c=start; c<=end; ++c) {
1803         uint32_t oldValue=utrie2_get32(newData.trie, c);
1804         uint32_t newValue=oldValue;
1805         if(norm16>=minMaybeYes) {
1806             // not a segment starter if it occurs in a decomposition or has cc!=0
1807             newValue|=CANON_NOT_SEGMENT_STARTER;
1808             if(norm16<MIN_NORMAL_MAYBE_YES) {
1809                 newValue|=CANON_HAS_COMPOSITIONS;
1810             }
1811         } else if(norm16<minYesNo) {
1812             newValue|=CANON_HAS_COMPOSITIONS;
1813         } else {
1814             // c has a one-way decomposition
1815             UChar32 c2=c;
1816             uint16_t norm16_2=norm16;
1817             while(limitNoNo<=norm16_2 && norm16_2<minMaybeYes) {
1818                 c2=mapAlgorithmic(c2, norm16_2);
1819                 norm16_2=getNorm16(c2);
1820             }
1821             if(minYesNo<=norm16_2 && norm16_2<limitNoNo) {
1822                 // c decomposes, get everything from the variable-length extra data
1823                 const uint16_t *mapping=getMapping(norm16_2);
1824                 uint16_t firstUnit=*mapping++;
1825                 int32_t length=firstUnit&MAPPING_LENGTH_MASK;
1826                 if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) {
1827                     if(c==c2 && (*mapping&0xff)!=0) {
1828                         newValue|=CANON_NOT_SEGMENT_STARTER;  // original c has cc!=0
1829                     }
1830                     ++mapping;
1831                 }
1832                 // Skip empty mappings (no characters in the decomposition).
1833                 if(length!=0) {
1834                     // add c to first code point's start set
1835                     int32_t i=0;
1836                     U16_NEXT_UNSAFE(mapping, i, c2);
1837                     newData.addToStartSet(c, c2, errorCode);
1838                     // Set CANON_NOT_SEGMENT_STARTER for each remaining code point of a
1839                     // one-way mapping. A 2-way mapping is possible here after
1840                     // intermediate algorithmic mapping.
1841                     if(norm16_2>=minNoNo) {
1842                         while(i<length) {
1843                             U16_NEXT_UNSAFE(mapping, i, c2);
1844                             uint32_t c2Value=utrie2_get32(newData.trie, c2);
1845                             if((c2Value&CANON_NOT_SEGMENT_STARTER)==0) {
1846                                 utrie2_set32(newData.trie, c2, c2Value|CANON_NOT_SEGMENT_STARTER,
1847                                              &errorCode);
1848                             }
1849                         }
1850                     }
1851                 }
1852             } else {
1853                 // c decomposed to c2 algorithmically; c has cc==0
1854                 newData.addToStartSet(c, c2, errorCode);
1855             }
1856         }
1857         if(newValue!=oldValue) {
1858             utrie2_set32(newData.trie, c, newValue, &errorCode);
1859         }
1860     }
1861 }
1862
1863 UBool Normalizer2Impl::ensureCanonIterData(UErrorCode &errorCode) const {
1864     // Logically const: Synchronized instantiation.
1865     Normalizer2Impl *me=const_cast<Normalizer2Impl *>(this);
1866     CanonIterDataSingleton(me->canonIterDataSingleton, *me, errorCode).getInstance(errorCode);
1867     return U_SUCCESS(errorCode);
1868 }
1869
1870 int32_t Normalizer2Impl::getCanonValue(UChar32 c) const {
1871     return (int32_t)utrie2_get32(((CanonIterData *)canonIterDataSingleton.fInstance)->trie, c);
1872 }
1873
1874 const UnicodeSet &Normalizer2Impl::getCanonStartSet(int32_t n) const {
1875     return *(const UnicodeSet *)(
1876         ((CanonIterData *)canonIterDataSingleton.fInstance)->canonStartSets[n]);
1877 }
1878
1879 UBool Normalizer2Impl::isCanonSegmentStarter(UChar32 c) const {
1880     return getCanonValue(c)>=0;
1881 }
1882
1883 UBool Normalizer2Impl::getCanonStartSet(UChar32 c, UnicodeSet &set) const {
1884     int32_t canonValue=getCanonValue(c)&~CANON_NOT_SEGMENT_STARTER;
1885     if(canonValue==0) {
1886         return FALSE;
1887     }
1888     set.clear();
1889     int32_t value=canonValue&CANON_VALUE_MASK;
1890     if((canonValue&CANON_HAS_SET)!=0) {
1891         set.addAll(getCanonStartSet(value));
1892     } else if(value!=0) {
1893         set.add(value);
1894     }
1895     if((canonValue&CANON_HAS_COMPOSITIONS)!=0) {
1896         uint16_t norm16=getNorm16(c);
1897         if(norm16==JAMO_L) {
1898             UChar32 syllable=
1899                 (UChar32)(Hangul::HANGUL_BASE+(c-Hangul::JAMO_L_BASE)*Hangul::JAMO_VT_COUNT);
1900             set.add(syllable, syllable+Hangul::JAMO_VT_COUNT-1);
1901         } else {
1902             addComposites(getCompositionsList(norm16), set);
1903         }
1904     }
1905     return TRUE;
1906 }
1907
1908 U_NAMESPACE_END
1909
1910 // Normalizer2 data swapping ----------------------------------------------- ***
1911
1912 U_NAMESPACE_USE
1913
1914 U_CAPI int32_t U_EXPORT2
1915 unorm2_swap(const UDataSwapper *ds,
1916             const void *inData, int32_t length, void *outData,
1917             UErrorCode *pErrorCode) {
1918     const UDataInfo *pInfo;
1919     int32_t headerSize;
1920
1921     const uint8_t *inBytes;
1922     uint8_t *outBytes;
1923
1924     const int32_t *inIndexes;
1925     int32_t indexes[Normalizer2Impl::IX_MIN_MAYBE_YES+1];
1926
1927     int32_t i, offset, nextOffset, size;
1928
1929     /* udata_swapDataHeader checks the arguments */
1930     headerSize=udata_swapDataHeader(ds, inData, length, outData, pErrorCode);
1931     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
1932         return 0;
1933     }
1934
1935     /* check data format and format version */
1936     pInfo=(const UDataInfo *)((const char *)inData+4);
1937     if(!(
1938         pInfo->dataFormat[0]==0x4e &&   /* dataFormat="Nrm2" */
1939         pInfo->dataFormat[1]==0x72 &&
1940         pInfo->dataFormat[2]==0x6d &&
1941         pInfo->dataFormat[3]==0x32 &&
1942         pInfo->formatVersion[0]==1
1943     )) {
1944         udata_printError(ds, "unorm2_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as Normalizer2 data\n",
1945                          pInfo->dataFormat[0], pInfo->dataFormat[1],
1946                          pInfo->dataFormat[2], pInfo->dataFormat[3],
1947                          pInfo->formatVersion[0]);
1948         *pErrorCode=U_UNSUPPORTED_ERROR;
1949         return 0;
1950     }
1951
1952     inBytes=(const uint8_t *)inData+headerSize;
1953     outBytes=(uint8_t *)outData+headerSize;
1954
1955     inIndexes=(const int32_t *)inBytes;
1956
1957     if(length>=0) {
1958         length-=headerSize;
1959         if(length<(int32_t)sizeof(indexes)) {
1960             udata_printError(ds, "unorm2_swap(): too few bytes (%d after header) for Normalizer2 data\n",
1961                              length);
1962             *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
1963             return 0;
1964         }
1965     }
1966
1967     /* read the first few indexes */
1968     for(i=0; i<=Normalizer2Impl::IX_MIN_MAYBE_YES; ++i) {
1969         indexes[i]=udata_readInt32(ds, inIndexes[i]);
1970     }
1971
1972     /* get the total length of the data */
1973     size=indexes[Normalizer2Impl::IX_TOTAL_SIZE];
1974
1975     if(length>=0) {
1976         if(length<size) {
1977             udata_printError(ds, "unorm2_swap(): too few bytes (%d after header) for all of Normalizer2 data\n",
1978                              length);
1979             *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
1980             return 0;
1981         }
1982
1983         /* copy the data for inaccessible bytes */
1984         if(inBytes!=outBytes) {
1985             uprv_memcpy(outBytes, inBytes, size);
1986         }
1987
1988         offset=0;
1989
1990         /* swap the int32_t indexes[] */
1991         nextOffset=indexes[Normalizer2Impl::IX_NORM_TRIE_OFFSET];
1992         ds->swapArray32(ds, inBytes, nextOffset-offset, outBytes, pErrorCode);
1993         offset=nextOffset;
1994
1995         /* swap the UTrie2 */
1996         nextOffset=indexes[Normalizer2Impl::IX_EXTRA_DATA_OFFSET];
1997         utrie2_swap(ds, inBytes+offset, nextOffset-offset, outBytes+offset, pErrorCode);
1998         offset=nextOffset;
1999
2000         /* swap the uint16_t extraData[] */
2001         nextOffset=indexes[Normalizer2Impl::IX_EXTRA_DATA_OFFSET+1];
2002         ds->swapArray16(ds, inBytes+offset, nextOffset-offset, outBytes+offset, pErrorCode);
2003         offset=nextOffset;
2004
2005         U_ASSERT(offset==size);
2006     }
2007
2008     return headerSize+size;
2009 }
2010
2011 #endif  // !UCONFIG_NO_NORMALIZATION