icuSources/common/unorm.cpp

   1 /*
   2 ******************************************************************************
   3 * Copyright (c) 1996-2007, International Business Machines
   4 * Corporation and others. All Rights Reserved.
   5 ******************************************************************************
   6 * File unorm.cpp
   7 *
   8 * Created by: Vladimir Weinstein 12052000
   9 *
  10 * Modification history :
  11 *
  12 * Date        Name        Description
  13 * 02/01/01    synwee      Added normalization quickcheck enum and method.
  14 * 02/12/01    synwee      Commented out quickcheck util api has been approved
  15 *                         Added private method for doing FCD checks
  16 * 02/23/01    synwee      Modified quickcheck and checkFCE to run through
  17 *                         string for codepoints < 0x300 for the normalization
  18 *                         mode NFC.
  19 * 05/25/01+   Markus Scherer total rewrite, implement all normalization here
  20 *                         instead of just wrappers around normlzr.cpp,
  21 *                         load unorm.dat, support Unicode 3.1 with
  22 *                         supplementary code points, etc.
  23 */
  24
  25 #include "unicode/utypes.h"
  26
  27 #if !UCONFIG_NO_NORMALIZATION
  28
  29 #include "unicode/udata.h"
  30 #include "unicode/uchar.h"
  31 #include "unicode/ustring.h"
  32 #include "unicode/uiter.h"
  33 #include "unicode/uniset.h"
  34 #include "unicode/usetiter.h"
  35 #include "unicode/unorm.h"
  36 #include "ucln_cmn.h"
  37 #include "unormimp.h"
  38 #include "ucase.h"
  39 #include "cmemory.h"
  40 #include "umutex.h"
  41 #include "utrie.h"
  42 #include "unicode/uset.h"
  43 #include "udataswp.h"
  44 #include "putilimp.h"
  45
  46 /*
  47  * Status of tailored normalization
  48  *
  49  * This was done initially for investigation on Unicode public review issue 7
  50  * (http://www.unicode.org/review/). See Jitterbug 2481.
  51  * While the UTC at meeting #94 (2003mar) did not take up the issue, this is
  52  * a permanent feature in ICU 2.6 in support of IDNA which requires true
  53  * Unicode 3.2 normalization.
  54  * (NormalizationCorrections are rolled into IDNA mapping tables.)
  55  *
  56  * Tailored normalization as implemented here allows to "normalize less"
  57  * than full Unicode normalization would.
  58  * Based internally on a UnicodeSet of code points that are
  59  * "excluded from normalization", the normalization functions leave those
  60  * code points alone ("inert"). This means that tailored normalization
  61  * still transforms text into a canonically equivalent form.
  62  * It does not add decompositions to code points that do not have any or
  63  * change decomposition results.
  64  *
  65  * Any function that searches for a safe boundary has not been touched,
  66  * which means that these functions will be over-pessimistic when
  67  * exclusions are applied.
  68  * This should not matter because subsequent checks and normalizations
  69  * do apply the exclusions; only a little more of the text may be processed
  70  * than necessary under exclusions.
  71  *
  72  * Normalization exclusions have the following effect on excluded code points c:
  73  * - c is not decomposed
  74  * - c is not a composition target
  75  * - c does not combine forward or backward for composition
  76  *   except that this is not implemented for Jamo
  77  * - c is treated as having a combining class of 0
  78  */
  79 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
  80
  81 U_NAMESPACE_USE
  82
  83 /*
  84  * This new implementation of the normalization code loads its data from
  85  * unorm.dat, which is generated with the gennorm tool.
  86  * The format of that file is described in unormimp.h .
  87  */
  88
  89 /* -------------------------------------------------------------------------- */
  90
  91 enum {
  92     _STACK_BUFFER_CAPACITY=100
  93 };
  94
  95 /*
  96  * Constants for the bit fields in the options bit set parameter.
  97  * These need not be public.
  98  * A user only needs to know the currently assigned values.
  99  * The number and positions of reserved bits per field can remain private
 100  * and may change in future implementations.
 101  */
 102 enum {
 103     _NORM_OPTIONS_NX_MASK=0x1f,
 104     _NORM_OPTIONS_UNICODE_MASK=0x60,
 105     _NORM_OPTIONS_SETS_MASK=0x7f,
 106
 107     _NORM_OPTIONS_UNICODE_SHIFT=5,
 108
 109     /*
 110      * The following options are used only in some composition functions.
 111      * They use bits 12 and up to preserve lower bits for the available options
 112      * space in unorm_compare() -
 113      * see documentation for UNORM_COMPARE_NORM_OPTIONS_SHIFT.
 114      */
 115
 116     /** Options bit 12, for compatibility vs. canonical decomposition. */
 117     _NORM_OPTIONS_COMPAT=0x1000,
 118     /** Options bit 13, no discontiguous composition (FCC vs. NFC). */
 119     _NORM_OPTIONS_COMPOSE_CONTIGUOUS=0x2000
 120 };
 121
 122 U_CDECL_BEGIN
 123 static inline UBool
 124 isHangulWithoutJamoT(UChar c) {
 125     c-=HANGUL_BASE;
 126     return c<HANGUL_COUNT && c%JAMO_T_COUNT==0;
 127 }
 128
 129 /* norm32 helpers */
 130
 131 /* is this a norm32 with a regular index? */
 132 static inline UBool
 133 isNorm32Regular(uint32_t norm32) {
 134     return norm32<_NORM_MIN_SPECIAL;
 135 }
 136
 137 /* is this a norm32 with a special index for a lead surrogate? */
 138 static inline UBool
 139 isNorm32LeadSurrogate(uint32_t norm32) {
 140     return _NORM_MIN_SPECIAL<=norm32 && norm32<_NORM_SURROGATES_TOP;
 141 }
 142
 143 /* is this a norm32 with a special index for a Hangul syllable or a Jamo? */
 144 static inline UBool
 145 isNorm32HangulOrJamo(uint32_t norm32) {
 146     return norm32>=_NORM_MIN_HANGUL;
 147 }
 148
 149 /*
 150  * Given isNorm32HangulOrJamo(),
 151  * is this a Hangul syllable or a Jamo?
 152  */
 153 /*static inline UBool
 154 isHangulJamoNorm32HangulOrJamoL(uint32_t norm32) {
 155     return norm32<_NORM_MIN_JAMO_V;
 156 }*/
 157
 158 /*
 159  * Given norm32 for Jamo V or T,
 160  * is this a Jamo V?
 161  */
 162 static inline UBool
 163 isJamoVTNorm32JamoV(uint32_t norm32) {
 164     return norm32<_NORM_JAMO_V_TOP;
 165 }
 166
 167 /* load unorm.dat ----------------------------------------------------------- */
 168
 169 /* normTrie: 32-bit trie result may contain a special extraData index with the folding offset */
 170 static int32_t U_CALLCONV
 171 getFoldingNormOffset(uint32_t norm32) {
 172     if(isNorm32LeadSurrogate(norm32)) {
 173         return
 174             UTRIE_BMP_INDEX_LENGTH+
 175                 (((int32_t)norm32>>(_NORM_EXTRA_SHIFT-UTRIE_SURROGATE_BLOCK_BITS))&
 176                  (0x3ff<<UTRIE_SURROGATE_BLOCK_BITS));
 177     } else {
 178         return 0;
 179     }
 180 }
 181
 182 /* auxTrie: the folding offset is in bits 9..0 of the 16-bit trie result */
 183 static int32_t U_CALLCONV
 184 getFoldingAuxOffset(uint32_t data) {
 185     return (int32_t)(data&_NORM_AUX_FNC_MASK)<<UTRIE_SURROGATE_BLOCK_BITS;
 186 }
 187 U_CDECL_END
 188
 189 #define UNORM_HARDCODE_DATA 1
 190
 191 #if UNORM_HARDCODE_DATA
 192
 193 /* unorm_props_data.c is machine-generated by gennorm --csource */
 194 #include "unorm_props_data.c"
 195
 196 static const UBool formatVersion_2_2=TRUE;
 197
 198 #else
 199
 200 #define DATA_NAME "unorm"
 201 #define DATA_TYPE "icu"
 202
 203 static UDataMemory *normData=NULL;
 204 static UErrorCode dataErrorCode=U_ZERO_ERROR;
 205 static int8_t haveNormData=0;
 206
 207 static int32_t indexes[_NORM_INDEX_TOP]={ 0 };
 208 static UTrie normTrie={ 0,0,0,0,0,0,0 }, fcdTrie={ 0,0,0,0,0,0,0 }, auxTrie={ 0,0,0,0,0,0,0 };
 209
 210 /*
 211  * pointers into the memory-mapped unorm.icu
 212  */
 213 static const uint16_t *extraData=NULL,
 214                       *combiningTable=NULL,
 215                       *canonStartSets=NULL;
 216
 217 static uint8_t formatVersion[4]={ 0, 0, 0, 0 };
 218 static UBool formatVersion_2_1=FALSE, formatVersion_2_2=FALSE;
 219
 220 /* the Unicode version of the normalization data */
 221 static UVersionInfo dataVersion={ 0, 0, 0, 0 };
 222
 223 #endif
 224
 225 /* cache UnicodeSets for each combination of exclusion flags */
 226 static UnicodeSet *nxCache[_NORM_OPTIONS_SETS_MASK+1]={ NULL };
 227
 228 U_CDECL_BEGIN
 229
 230 static UBool U_CALLCONV
 231 unorm_cleanup(void) {
 232     int32_t i;
 233
 234 #if !UNORM_HARDCODE_DATA
 235     if(normData!=NULL) {
 236         udata_close(normData);
 237         normData=NULL;
 238     }
 239     dataErrorCode=U_ZERO_ERROR;
 240     haveNormData=0;
 241 #endif
 242
 243     for(i=0; i<(int32_t)LENGTHOF(nxCache); ++i) {
 244         if (nxCache[i]) {
 245             delete nxCache[i];
 246             nxCache[i] = 0;
 247         }
 248     }
 249
 250     return TRUE;
 251 }
 252
 253 #if !UNORM_HARDCODE_DATA
 254
 255 static UBool U_CALLCONV
 256 isAcceptable(void * /* context */,
 257              const char * /* type */, const char * /* name */,
 258              const UDataInfo *pInfo) {
 259     if(
 260         pInfo->size>=20 &&
 261         pInfo->isBigEndian==U_IS_BIG_ENDIAN &&
 262         pInfo->charsetFamily==U_CHARSET_FAMILY &&
 263         pInfo->dataFormat[0]==0x4e &&   /* dataFormat="Norm" */
 264         pInfo->dataFormat[1]==0x6f &&
 265         pInfo->dataFormat[2]==0x72 &&
 266         pInfo->dataFormat[3]==0x6d &&
 267         pInfo->formatVersion[0]==2 &&
 268         pInfo->formatVersion[2]==UTRIE_SHIFT &&
 269         pInfo->formatVersion[3]==UTRIE_INDEX_SHIFT
 270     ) {
 271         uprv_memcpy(formatVersion, pInfo->formatVersion, 4);
 272         uprv_memcpy(dataVersion, pInfo->dataVersion, 4);
 273         return TRUE;
 274     } else {
 275         return FALSE;
 276     }
 277 }
 278
 279 #endif
 280
 281 static UBool U_CALLCONV
 282 _enumPropertyStartsRange(const void *context, UChar32 start, UChar32 /*limit*/, uint32_t /*value*/) {
 283     /* add the start code point to the USet */
 284     const USetAdder *sa=(const USetAdder *)context;
 285     sa->add(sa->set, start);
 286     return TRUE;
 287 }
 288
 289 U_CDECL_END
 290
 291 #if !UNORM_HARDCODE_DATA
 292
 293 static int8_t
 294 loadNormData(UErrorCode &errorCode) {
 295     /* load Unicode normalization data from file */
 296
 297     /*
 298      * This lazy intialization with double-checked locking (without mutex protection for
 299      * haveNormData==0) is transiently unsafe under certain circumstances.
 300      * Check the readme and use u_init() if necessary.
 301      *
 302      * While u_init() initializes the main normalization data via this functions,
 303      * it does not do so for exclusion sets (which are fully mutexed).
 304      * This is because
 305      * - there can be many exclusion sets
 306      * - they are rarely used
 307      * - they are not usually used in execution paths that are
 308      *   as performance-sensitive as others
 309      *   (e.g., IDNA takes more time than unorm_quickCheck() anyway)
 310      */
 311     if(haveNormData==0) {
 312         UTrie _normTrie={ 0,0,0,0,0,0,0 }, _fcdTrie={ 0,0,0,0,0,0,0 }, _auxTrie={ 0,0,0,0,0,0,0 };
 313         UDataMemory *data;
 314
 315         const int32_t *p=NULL;
 316         const uint8_t *pb;
 317
 318         if(&errorCode==NULL || U_FAILURE(errorCode)) {
 319             return 0;
 320         }
 321
 322         /* open the data outside the mutex block */
 323         data=udata_openChoice(NULL, DATA_TYPE, DATA_NAME, isAcceptable, NULL, &errorCode);
 324         dataErrorCode=errorCode;
 325         if(U_FAILURE(errorCode)) {
 326             return haveNormData=-1;
 327         }
 328
 329         p=(const int32_t *)udata_getMemory(data);
 330         pb=(const uint8_t *)(p+_NORM_INDEX_TOP);
 331         utrie_unserialize(&_normTrie, pb, p[_NORM_INDEX_TRIE_SIZE], &errorCode);
 332         _normTrie.getFoldingOffset=getFoldingNormOffset;
 333
 334         pb+=p[_NORM_INDEX_TRIE_SIZE]+p[_NORM_INDEX_UCHAR_COUNT]*2+p[_NORM_INDEX_COMBINE_DATA_COUNT]*2;
 335         if(p[_NORM_INDEX_FCD_TRIE_SIZE]!=0) {
 336             utrie_unserialize(&_fcdTrie, pb, p[_NORM_INDEX_FCD_TRIE_SIZE], &errorCode);
 337         }
 338         pb+=p[_NORM_INDEX_FCD_TRIE_SIZE];
 339
 340         if(p[_NORM_INDEX_AUX_TRIE_SIZE]!=0) {
 341             utrie_unserialize(&_auxTrie, pb, p[_NORM_INDEX_AUX_TRIE_SIZE], &errorCode);
 342             _auxTrie.getFoldingOffset=getFoldingAuxOffset;
 343         }
 344
 345         if(U_FAILURE(errorCode)) {
 346             dataErrorCode=errorCode;
 347             udata_close(data);
 348             return haveNormData=-1;
 349         }
 350
 351         /* in the mutex block, set the data for this process */
 352         umtx_lock(NULL);
 353         if(normData==NULL) {
 354             normData=data;
 355             data=NULL;
 356
 357             uprv_memcpy(&indexes, p, sizeof(indexes));
 358             uprv_memcpy(&normTrie, &_normTrie, sizeof(UTrie));
 359             uprv_memcpy(&fcdTrie, &_fcdTrie, sizeof(UTrie));
 360             uprv_memcpy(&auxTrie, &_auxTrie, sizeof(UTrie));
 361         } else {
 362             p=(const int32_t *)udata_getMemory(normData);
 363         }
 364
 365         /* initialize some variables */
 366         extraData=(uint16_t *)((uint8_t *)(p+_NORM_INDEX_TOP)+indexes[_NORM_INDEX_TRIE_SIZE]);
 367         combiningTable=extraData+indexes[_NORM_INDEX_UCHAR_COUNT];
 368         formatVersion_2_1=formatVersion[0]>2 || (formatVersion[0]==2 && formatVersion[1]>=1);
 369         formatVersion_2_2=formatVersion[0]>2 || (formatVersion[0]==2 && formatVersion[1]>=2);
 370         if(formatVersion_2_1) {
 371             canonStartSets=combiningTable+
 372                 indexes[_NORM_INDEX_COMBINE_DATA_COUNT]+
 373                 (indexes[_NORM_INDEX_FCD_TRIE_SIZE]+indexes[_NORM_INDEX_AUX_TRIE_SIZE])/2;
 374         }
 375         haveNormData=1;
 376         ucln_common_registerCleanup(UCLN_COMMON_UNORM, unorm_cleanup);
 377         umtx_unlock(NULL);
 378
 379         /* if a different thread set it first, then close the extra data */
 380         if(data!=NULL) {
 381             udata_close(data); /* NULL if it was set correctly */
 382         }
 383     }
 384
 385     return haveNormData;
 386 }
 387
 388 #endif
 389
 390 static inline UBool
 391 _haveData(UErrorCode &errorCode) {
 392 #if UNORM_HARDCODE_DATA
 393     return U_SUCCESS(errorCode);
 394 #else
 395     if(U_FAILURE(errorCode)) {
 396         return FALSE;
 397     } else if(haveNormData>0) {
 398         return TRUE;
 399     } else if(haveNormData<0) {
 400         errorCode=dataErrorCode;
 401         return FALSE;
 402     } else /* haveNormData==0 */ {
 403         return (UBool)(loadNormData(errorCode)>0);
 404     }
 405 #endif
 406 }
 407
 408 U_CAPI UBool U_EXPORT2
 409 unorm_haveData(UErrorCode *pErrorCode) {
 410     return _haveData(*pErrorCode);
 411 }
 412
 413 U_CAPI const uint16_t * U_EXPORT2
 414 unorm_getFCDTrie(UErrorCode *pErrorCode) {
 415     if(_haveData(*pErrorCode)) {
 416         return fcdTrie.index;
 417     } else {
 418         return NULL;
 419     }
 420 }
 421
 422 /* data access primitives --------------------------------------------------- */
 423
 424 static inline uint32_t
 425 _getNorm32(UChar c) {
 426     return UTRIE_GET32_FROM_LEAD(&normTrie, c);
 427 }
 428
 429 static inline uint32_t
 430 _getNorm32FromSurrogatePair(uint32_t norm32, UChar c2) {
 431     /*
 432      * the surrogate index in norm32 stores only the number of the surrogate index block
 433      * see gennorm/store.c/getFoldedNormValue()
 434      */
 435     norm32=
 436         UTRIE_BMP_INDEX_LENGTH+
 437             ((norm32>>(_NORM_EXTRA_SHIFT-UTRIE_SURROGATE_BLOCK_BITS))&
 438              (0x3ff<<UTRIE_SURROGATE_BLOCK_BITS));
 439     return UTRIE_GET32_FROM_OFFSET_TRAIL(&normTrie, norm32, c2);
 440 }
 441
 442 /*
 443  * get a norm32 from text with complete code points
 444  * (like from decompositions)
 445  */
 446 static inline uint32_t
 447 _getNorm32(const UChar *p, uint32_t mask) {
 448     uint32_t norm32=_getNorm32(*p);
 449     if((norm32&mask) && isNorm32LeadSurrogate(norm32)) {
 450         /* *p is a lead surrogate, get the real norm32 */
 451         norm32=_getNorm32FromSurrogatePair(norm32, *(p+1));
 452     }
 453     return norm32;
 454 }
 455
 456 static inline uint16_t
 457 _getFCD16(UChar c) {
 458     return UTRIE_GET16_FROM_LEAD(&fcdTrie, c);
 459 }
 460
 461 static inline uint16_t
 462 _getFCD16FromSurrogatePair(uint16_t fcd16, UChar c2) {
 463     /* the surrogate index in fcd16 is an absolute offset over the start of stage 1 */
 464     return UTRIE_GET16_FROM_OFFSET_TRAIL(&fcdTrie, fcd16, c2);
 465 }
 466
 467 static inline const uint16_t *
 468 _getExtraData(uint32_t norm32) {
 469     return extraData+(norm32>>_NORM_EXTRA_SHIFT);
 470 }
 471
 472 #if 0
 473 /*
 474  * It is possible to get the FCD data from the main trie if unorm.icu
 475  * was built without the FCD trie, although it is slower.
 476  * This is not implemented because it is hard to test, and because it seems
 477  * unusual to want to use FCD and not build the data file for it.
 478  *
 479  * Untested sample code:
 480  */
 481 static inline uint16_t
 482 _getFCD16FromNormData(UChar32 c) {
 483     uint32_t norm32, fcd;
 484
 485     norm32=_getNorm32(c);
 486     if((norm32&_NORM_QC_NFD) && isNorm32Regular(norm32)) {
 487         /* get the lead/trail cc from the decomposition data */
 488         const uint16_t *nfd=_getExtraData(norm32);
 489         if(*nfd&_NORM_DECOMP_FLAG_LENGTH_HAS_CC) {
 490             fcd=nfd[1];
 491         }
 492     } else {
 493         fcd=norm32&_NORM_CC_MASK;
 494         if(fcd!=0) {
 495             /* use the code point cc value for both lead and trail cc's */
 496             fcd|=fcd>>_NORM_CC_SHIFT; /* assume that the cc is in bits 15..8 */
 497         }
 498     }
 499
 500     return (uint16_t)fcd;
 501 }
 502 #endif
 503
 504 /* normalization exclusion sets --------------------------------------------- */
 505
 506 /*
 507  * Normalization exclusion UnicodeSets are used for tailored normalization;
 508  * see the comment near the beginning of this file.
 509  *
 510  * By specifying one or several sets of code points,
 511  * those code points become inert for normalization.
 512  */
 513
 514 static const UnicodeSet *
 515 internalGetNXHangul(UErrorCode &errorCode) {
 516     /* internal function, does not check for incoming U_FAILURE */
 517     UBool isCached;
 518
 519     UMTX_CHECK(NULL, (UBool)(nxCache[UNORM_NX_HANGUL]!=NULL), isCached);
 520
 521     if(!isCached) {
 522         UnicodeSet *set=new UnicodeSet(0xac00, 0xd7a3);
 523         if(set==NULL) {
 524             errorCode=U_MEMORY_ALLOCATION_ERROR;
 525             return NULL;
 526         }
 527         // Compact the set for caching.
 528         set->compact();
 529
 530         umtx_lock(NULL);
 531         if(nxCache[UNORM_NX_HANGUL]==NULL) {
 532             nxCache[UNORM_NX_HANGUL]=set;
 533             set=NULL;
 534             ucln_common_registerCleanup(UCLN_COMMON_UNORM, unorm_cleanup);
 535         }
 536         umtx_unlock(NULL);
 537
 538         delete set;
 539     }
 540
 541     return nxCache[UNORM_NX_HANGUL];
 542 }
 543
 544 /* unorm.cpp 1.116 had and used
 545 static const UnicodeSet *
 546 internalGetNXFromPattern(int32_t options, const char *pattern, UErrorCode &errorCode) {
 547     ...
 548 }
 549 */
 550
 551 /* get and set an exclusion set from a serialized UnicodeSet */
 552 static const UnicodeSet *
 553 internalGetSerializedNX(int32_t options, int32_t nxIndex, UErrorCode &errorCode) {
 554     /* internal function, does not check for incoming U_FAILURE */
 555     UBool isCached;
 556
 557     UMTX_CHECK(NULL, (UBool)(nxCache[options]!=NULL), isCached);
 558
 559     if( !isCached &&
 560         canonStartSets!=NULL &&
 561         canonStartSets[nxIndex]!=0 && canonStartSets[nxIndex+1]>canonStartSets[nxIndex]
 562     ) {
 563         USerializedSet sset;
 564         UnicodeSet *set;
 565         UChar32 start, end;
 566         int32_t i;
 567
 568         if( !uset_getSerializedSet(
 569                     &sset,
 570                     canonStartSets+canonStartSets[nxIndex],
 571                     canonStartSets[nxIndex+1]-canonStartSets[nxIndex])
 572         ) {
 573             errorCode=U_INVALID_FORMAT_ERROR;
 574             return NULL;
 575         }
 576
 577         /* turn the serialized set into a UnicodeSet */
 578         set=new UnicodeSet();
 579         if(set==NULL) {
 580             errorCode=U_MEMORY_ALLOCATION_ERROR;
 581             return NULL;
 582         }
 583         for(i=0; uset_getSerializedRange(&sset, i, &start, &end); ++i) {
 584             set->add(start, end);
 585         }
 586         // Compact the set for caching.
 587         set->compact();
 588
 589         umtx_lock(NULL);
 590         if(nxCache[options]==NULL) {
 591             nxCache[options]=set;
 592             set=NULL;
 593             ucln_common_registerCleanup(UCLN_COMMON_UNORM, unorm_cleanup);
 594         }
 595         umtx_unlock(NULL);
 596
 597         delete set;
 598     }
 599
 600     return nxCache[options];
 601 }
 602
 603 static const UnicodeSet *
 604 internalGetNXCJKCompat(UErrorCode &errorCode) {
 605     /* build a set from [[:Ideographic:]&[:NFD_QC=No:]]=[CJK Ideographs]&[has canonical decomposition] */
 606     return internalGetSerializedNX(
 607                 UNORM_NX_CJK_COMPAT,
 608                 _NORM_SET_INDEX_NX_CJK_COMPAT_OFFSET,
 609                 errorCode);
 610 }
 611
 612 static const UnicodeSet *
 613 internalGetNXUnicode(uint32_t options, UErrorCode &errorCode) {
 614     /* internal function, does not check for incoming U_FAILURE */
 615     int32_t nxIndex;
 616
 617     options&=_NORM_OPTIONS_UNICODE_MASK;
 618     switch(options) {
 619     case 0:
 620         return NULL;
 621     case UNORM_UNICODE_3_2:
 622         /* [:^Age=3.2:] */
 623         nxIndex=_NORM_SET_INDEX_NX_UNICODE32_OFFSET;
 624         break;
 625     default:
 626         errorCode=U_ILLEGAL_ARGUMENT_ERROR;
 627         return NULL;
 628     }
 629
 630     /* build a set with all code points that were not designated by the specified Unicode version */
 631     return internalGetSerializedNX(options, nxIndex, errorCode);
 632 }
 633
 634 /* Get a decomposition exclusion set. The data must be loaded. */
 635 static const UnicodeSet *
 636 internalGetNX(int32_t options, UErrorCode &errorCode) {
 637     options&=_NORM_OPTIONS_SETS_MASK;
 638
 639     UBool isCached;
 640
 641     UMTX_CHECK(NULL, (UBool)(nxCache[options]!=NULL), isCached);
 642
 643     if(!isCached) {
 644         /* return basic sets */
 645         if(options==UNORM_NX_HANGUL) {
 646             return internalGetNXHangul(errorCode);
 647         }
 648         if(options==UNORM_NX_CJK_COMPAT) {
 649             return internalGetNXCJKCompat(errorCode);
 650         }
 651         if((options&_NORM_OPTIONS_UNICODE_MASK)!=0 && (options&_NORM_OPTIONS_NX_MASK)==0) {
 652             return internalGetNXUnicode(options, errorCode);
 653         }
 654
 655         /* build a set from multiple subsets */
 656         UnicodeSet *set;
 657         const UnicodeSet *other;
 658
 659         set=new UnicodeSet();
 660         if(set==NULL) {
 661             errorCode=U_MEMORY_ALLOCATION_ERROR;
 662             return NULL;
 663         }
 664
 665         if((options&UNORM_NX_HANGUL)!=0 && NULL!=(other=internalGetNXHangul(errorCode))) {
 666             set->addAll(*other);
 667         }
 668         if((options&UNORM_NX_CJK_COMPAT)!=0 && NULL!=(other=internalGetNXCJKCompat(errorCode))) {
 669             set->addAll(*other);
 670         }
 671         if((options&_NORM_OPTIONS_UNICODE_MASK)!=0 && NULL!=(other=internalGetNXUnicode(options, errorCode))) {
 672             set->addAll(*other);
 673         }
 674
 675         if(U_FAILURE(errorCode)) {
 676             delete set;
 677             return NULL;
 678         }
 679         // Compact the set for caching.
 680         set->compact();
 681
 682         umtx_lock(NULL);
 683         if(nxCache[options]==NULL) {
 684             nxCache[options]=set;
 685             set=NULL;
 686             ucln_common_registerCleanup(UCLN_COMMON_UNORM, unorm_cleanup);
 687         }
 688         umtx_unlock(NULL);
 689
 690         delete set;
 691     }
 692
 693     return nxCache[options];
 694 }
 695
 696 static inline const UnicodeSet *
 697 getNX(int32_t options, UErrorCode &errorCode) {
 698     if(U_FAILURE(errorCode) || (options&=_NORM_OPTIONS_SETS_MASK)==0) {
 699         /* incoming failure, or no decomposition exclusions requested */
 700         return NULL;
 701     } else {
 702         return internalGetNX(options, errorCode);
 703     }
 704 }
 705
 706 U_CFUNC const UnicodeSet *
 707 unorm_getNX(int32_t options, UErrorCode *pErrorCode) {
 708     return getNX(options, *pErrorCode);
 709 }
 710
 711 static inline UBool
 712 nx_contains(const UnicodeSet *nx, UChar32 c) {
 713     return nx!=NULL && nx->contains(c);
 714 }
 715
 716 static inline UBool
 717 nx_contains(const UnicodeSet *nx, UChar c, UChar c2) {
 718     return nx!=NULL && nx->contains(c2==0 ? c : U16_GET_SUPPLEMENTARY(c, c2));
 719 }
 720
 721 /* other normalization primitives ------------------------------------------- */
 722
 723 /* get the canonical or compatibility decomposition for one character */
 724 static inline const UChar *
 725 _decompose(uint32_t norm32, uint32_t qcMask, int32_t &length,
 726            uint8_t &cc, uint8_t &trailCC) {
 727     const UChar *p=(const UChar *)_getExtraData(norm32);
 728     length=*p++;
 729
 730     if((norm32&qcMask&_NORM_QC_NFKD)!=0 && length>=0x100) {
 731         /* use compatibility decomposition, skip canonical data */
 732         p+=((length>>7)&1)+(length&_NORM_DECOMP_LENGTH_MASK);
 733         length>>=8;
 734     }
 735
 736     if(length&_NORM_DECOMP_FLAG_LENGTH_HAS_CC) {
 737         /* get the lead and trail cc's */
 738         UChar bothCCs=*p++;
 739         cc=(uint8_t)(bothCCs>>8);
 740         trailCC=(uint8_t)bothCCs;
 741     } else {
 742         /* lead and trail cc's are both 0 */
 743         cc=trailCC=0;
 744     }
 745
 746     length&=_NORM_DECOMP_LENGTH_MASK;
 747     return p;
 748 }
 749
 750 /* get the canonical decomposition for one character */
 751 static inline const UChar *
 752 _decompose(uint32_t norm32, int32_t &length,
 753            uint8_t &cc, uint8_t &trailCC) {
 754     const UChar *p=(const UChar *)_getExtraData(norm32);
 755     length=*p++;
 756
 757     if(length&_NORM_DECOMP_FLAG_LENGTH_HAS_CC) {
 758         /* get the lead and trail cc's */
 759         UChar bothCCs=*p++;
 760         cc=(uint8_t)(bothCCs>>8);
 761         trailCC=(uint8_t)bothCCs;
 762     } else {
 763         /* lead and trail cc's are both 0 */
 764         cc=trailCC=0;
 765     }
 766
 767     length&=_NORM_DECOMP_LENGTH_MASK;
 768     return p;
 769 }
 770
 771 /**
 772  * Get the canonical decomposition for one code point.
 773  * @param c code point
 774  * @param buffer out-only buffer for algorithmic decompositions of Hangul
 775  * @param length out-only, takes the length of the decomposition, if any
 776  * @return pointer to decomposition, or 0 if none
 777  * @internal
 778  */
 779 U_CFUNC const UChar *
 780 unorm_getCanonicalDecomposition(UChar32 c, UChar buffer[4], int32_t *pLength) {
 781     uint32_t norm32;
 782
 783     if(c<indexes[_NORM_INDEX_MIN_NFD_NO_MAYBE]) {
 784         /* trivial case */
 785         return NULL;
 786     }
 787
 788     UTRIE_GET32(&normTrie, c, norm32);
 789     if(norm32&_NORM_QC_NFD) {
 790         if(isNorm32HangulOrJamo(norm32)) {
 791             /* Hangul syllable: decompose algorithmically */
 792             UChar c2;
 793
 794             c-=HANGUL_BASE;
 795
 796             c2=(UChar)(c%JAMO_T_COUNT);
 797             c/=JAMO_T_COUNT;
 798             if(c2>0) {
 799                 buffer[2]=(UChar)(JAMO_T_BASE+c2);
 800                 *pLength=3;
 801             } else {
 802                 *pLength=2;
 803             }
 804
 805             buffer[1]=(UChar)(JAMO_V_BASE+c%JAMO_V_COUNT);
 806             buffer[0]=(UChar)(JAMO_L_BASE+c/JAMO_V_COUNT);
 807             return buffer;
 808         } else {
 809             /* normal decomposition */
 810             uint8_t cc, trailCC;
 811             return _decompose(norm32, *pLength, cc, trailCC);
 812         }
 813     } else {
 814         return 0;
 815     }
 816 }
 817
 818 /*
 819  * get the combining class of (c, c2)=*p++
 820  * before: p<limit  after: p<=limit
 821  * if only one code unit is used, then c2==0
 822  */
 823 static inline uint8_t
 824 _getNextCC(const UChar *&p, const UChar *limit, UChar &c, UChar &c2) {
 825     uint32_t norm32;
 826
 827     c=*p++;
 828     norm32=_getNorm32(c);
 829     if((norm32&_NORM_CC_MASK)==0) {
 830         c2=0;
 831         return 0;
 832     } else {
 833         if(!isNorm32LeadSurrogate(norm32)) {
 834             c2=0;
 835         } else {
 836             /* c is a lead surrogate, get the real norm32 */
 837             if(p!=limit && UTF_IS_SECOND_SURROGATE(c2=*p)) {
 838                 ++p;
 839                 norm32=_getNorm32FromSurrogatePair(norm32, c2);
 840             } else {
 841                 c2=0;
 842                 return 0;
 843             }
 844         }
 845
 846         return (uint8_t)(norm32>>_NORM_CC_SHIFT);
 847     }
 848 }
 849
 850 /*
 851  * read backwards and get norm32
 852  * return 0 if the character is <minC
 853  * if c2!=0 then (c2, c) is a surrogate pair (reversed - c2 is first surrogate but read second!)
 854  */
 855 static inline uint32_t
 856 _getPrevNorm32(const UChar *start, const UChar *&src,
 857                uint32_t minC, uint32_t mask,
 858                UChar &c, UChar &c2) {
 859     uint32_t norm32;
 860
 861     c=*--src;
 862     c2=0;
 863
 864     /* check for a surrogate before getting norm32 to see if we need to predecrement further */
 865     if(c<minC) {
 866         return 0;
 867     } else if(!UTF_IS_SURROGATE(c)) {
 868         return _getNorm32(c);
 869     } else if(UTF_IS_SURROGATE_FIRST(c)) {
 870         /* unpaired first surrogate */
 871         return 0;
 872     } else if(src!=start && UTF_IS_FIRST_SURROGATE(c2=*(src-1))) {
 873         --src;
 874         norm32=_getNorm32(c2);
 875
 876         if((norm32&mask)==0) {
 877             /* all surrogate pairs with this lead surrogate have only irrelevant data */
 878             return 0;
 879         } else {
 880             /* norm32 must be a surrogate special */
 881             return _getNorm32FromSurrogatePair(norm32, c);
 882         }
 883     } else {
 884         /* unpaired second surrogate */
 885         c2=0;
 886         return 0;
 887     }
 888 }
 889
 890 /*
 891  * get the combining class of (c, c2)=*--p
 892  * before: start<p  after: start<=p
 893  */
 894 static inline uint8_t
 895 _getPrevCC(const UChar *start, const UChar *&p) {
 896     UChar c, c2;
 897
 898     return (uint8_t)(_getPrevNorm32(start, p, _NORM_MIN_WITH_LEAD_CC, _NORM_CC_MASK, c, c2)>>_NORM_CC_SHIFT);
 899 }
 900
 901 /*
 902  * is this a safe boundary character for NF*D?
 903  * (lead cc==0)
 904  */
 905 static inline UBool
 906 _isNFDSafe(uint32_t norm32, uint32_t ccOrQCMask, uint32_t decompQCMask) {
 907     if((norm32&ccOrQCMask)==0) {
 908         return TRUE; /* cc==0 and no decomposition: this is NF*D safe */
 909     }
 910
 911     /* inspect its decomposition - maybe a Hangul but not a surrogate here */
 912     if(isNorm32Regular(norm32) && (norm32&decompQCMask)!=0) {
 913         int32_t length;
 914         uint8_t cc, trailCC;
 915
 916         /* decomposes, get everything from the variable-length extra data */
 917         _decompose(norm32, decompQCMask, length, cc, trailCC);
 918         return cc==0;
 919     } else {
 920         /* no decomposition (or Hangul), test the cc directly */
 921         return (norm32&_NORM_CC_MASK)==0;
 922     }
 923 }
 924
 925 /*
 926  * is this (or does its decomposition begin with) a "true starter"?
 927  * (cc==0 and NF*C_YES)
 928  */
 929 static inline UBool
 930 _isTrueStarter(uint32_t norm32, uint32_t ccOrQCMask, uint32_t decompQCMask) {
 931     if((norm32&ccOrQCMask)==0) {
 932         return TRUE; /* this is a true starter (could be Hangul or Jamo L) */
 933     }
 934
 935     /* inspect its decomposition - not a Hangul or a surrogate here */
 936     if((norm32&decompQCMask)!=0) {
 937         const UChar *p;
 938         int32_t length;
 939         uint8_t cc, trailCC;
 940
 941         /* decomposes, get everything from the variable-length extra data */
 942         p=_decompose(norm32, decompQCMask, length, cc, trailCC);
 943         if(cc==0) {
 944             uint32_t qcMask=ccOrQCMask&_NORM_QC_MASK;
 945
 946             /* does it begin with NFC_YES? */
 947             if((_getNorm32(p, qcMask)&qcMask)==0) {
 948                 /* yes, the decomposition begins with a true starter */
 949                 return TRUE;
 950             }
 951         }
 952     }
 953     return FALSE;
 954 }
 955
 956 /* uchar.h */
 957 U_CAPI uint8_t U_EXPORT2
 958 u_getCombiningClass(UChar32 c) {
 959 #if !UNORM_HARDCODE_DATA
 960     UErrorCode errorCode=U_ZERO_ERROR;
 961     if(_haveData(errorCode)) {
 962 #endif
 963         uint32_t norm32;
 964
 965         UTRIE_GET32(&normTrie, c, norm32);
 966         return (uint8_t)(norm32>>_NORM_CC_SHIFT);
 967 #if !UNORM_HARDCODE_DATA
 968     } else {
 969         return 0;
 970     }
 971 #endif
 972 }
 973
 974 U_CFUNC UBool U_EXPORT2
 975 unorm_internalIsFullCompositionExclusion(UChar32 c) {
 976 #if UNORM_HARDCODE_DATA
 977     if(auxTrie.index!=NULL) {
 978 #else
 979     UErrorCode errorCode=U_ZERO_ERROR;
 980     if(_haveData(errorCode) && auxTrie.index!=NULL) {
 981 #endif
 982         uint16_t aux;
 983
 984         UTRIE_GET16(&auxTrie, c, aux);
 985         return (UBool)((aux&_NORM_AUX_COMP_EX_MASK)!=0);
 986     } else {
 987         return FALSE;
 988     }
 989 }
 990
 991 U_CFUNC UBool U_EXPORT2
 992 unorm_isCanonSafeStart(UChar32 c) {
 993 #if UNORM_HARDCODE_DATA
 994     if(auxTrie.index!=NULL) {
 995 #else
 996     UErrorCode errorCode=U_ZERO_ERROR;
 997     if(_haveData(errorCode) && auxTrie.index!=NULL) {
 998 #endif
 999         uint16_t aux;
1000
1001         UTRIE_GET16(&auxTrie, c, aux);
1002         return (UBool)((aux&_NORM_AUX_UNSAFE_MASK)==0);
1003     } else {
1004         return FALSE;
1005     }
1006 }
1007
1008 U_CAPI void U_EXPORT2
1009 unorm_getUnicodeVersion(UVersionInfo *versionInfo, UErrorCode *pErrorCode){
1010     if(unorm_haveData(pErrorCode)){
1011         uprv_memcpy(*versionInfo, dataVersion, 4);
1012     }
1013 }
1014
1015
1016 U_CAPI UBool U_EXPORT2
1017 unorm_getCanonStartSet(UChar32 c, USerializedSet *fillSet) {
1018 #if !UNORM_HARDCODE_DATA
1019     UErrorCode errorCode=U_ZERO_ERROR;
1020 #endif
1021     if( fillSet!=NULL && (uint32_t)c<=0x10ffff &&
1022 #if !UNORM_HARDCODE_DATA
1023         _haveData(errorCode) &&
1024 #endif
1025         canonStartSets!=NULL
1026     ) {
1027         const uint16_t *table;
1028         int32_t i, start, limit;
1029
1030         /*
1031          * binary search for c
1032          *
1033          * There are two search tables,
1034          * one for BMP code points and one for supplementary ones.
1035          * See unormimp.h for details.
1036          */
1037         if(c<=0xffff) {
1038             table=canonStartSets+canonStartSets[_NORM_SET_INDEX_CANON_SETS_LENGTH];
1039             start=0;
1040             limit=canonStartSets[_NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH];
1041
1042             /* each entry is a pair { c, result } */
1043             while(start<limit-2) {
1044                 i=(uint16_t)(((start+limit)/4)*2); /* (start+limit)/2 and address pairs */
1045                 if(c<table[i]) {
1046                     limit=i;
1047                 } else {
1048                     start=i;
1049                 }
1050             }
1051
1052             /* found? */
1053             if(c==table[start]) {
1054                 i=table[start+1];
1055                 if((i&_NORM_CANON_SET_BMP_MASK)==_NORM_CANON_SET_BMP_IS_INDEX) {
1056                     /* result 01xxxxxx xxxxxx contains index x to a USerializedSet */
1057                     i&=(_NORM_MAX_CANON_SETS-1);
1058                     return uset_getSerializedSet(fillSet,
1059                                             canonStartSets+i,
1060                                             canonStartSets[_NORM_SET_INDEX_CANON_SETS_LENGTH]-i);
1061                 } else {
1062                     /* other result values are BMP code points for single-code point sets */
1063                     uset_setSerializedToOne(fillSet, (UChar32)i);
1064                     return TRUE;
1065                 }
1066             }
1067         } else {
1068             uint16_t high, low, h;
1069
1070             table=canonStartSets+canonStartSets[_NORM_SET_INDEX_CANON_SETS_LENGTH]+
1071                                  canonStartSets[_NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH];
1072             start=0;
1073             limit=canonStartSets[_NORM_SET_INDEX_CANON_SUPP_TABLE_LENGTH];
1074
1075             high=(uint16_t)(c>>16);
1076             low=(uint16_t)c;
1077
1078             /* each entry is a triplet { high(c), low(c), result } */
1079             while(start<limit-3) {
1080                 i=(uint16_t)(((start+limit)/6)*3); /* (start+limit)/2 and address triplets */
1081                 h=table[i]&0x1f; /* high word */
1082                 if(high<h || (high==h && low<table[i+1])) {
1083                     limit=i;
1084                 } else {
1085                     start=i;
1086                 }
1087             }
1088
1089             /* found? */
1090             h=table[start];
1091             if(high==(h&0x1f) && low==table[start+1]) {
1092                 i=table[start+2];
1093                 if((h&0x8000)==0) {
1094                     /* the result is an index to a USerializedSet */
1095                     return uset_getSerializedSet(fillSet,
1096                                             canonStartSets+i,
1097                                             canonStartSets[_NORM_SET_INDEX_CANON_SETS_LENGTH]-i);
1098                 } else {
1099                     /*
1100                      * single-code point set {x} in
1101                      * triplet { 100xxxxx 000hhhhh  llllllll llllllll  xxxxxxxx xxxxxxxx }
1102                      */
1103                     i|=((int32_t)h&0x1f00)<<8; /* add high bits from high(c) */
1104                     uset_setSerializedToOne(fillSet, (UChar32)i);
1105                     return TRUE;
1106                 }
1107             }
1108         }
1109     }
1110
1111     return FALSE; /* not found */
1112 }
1113
1114 U_CAPI int32_t U_EXPORT2
1115 u_getFC_NFKC_Closure(UChar32 c, UChar *dest, int32_t destCapacity, UErrorCode *pErrorCode) {
1116     uint16_t aux;
1117
1118     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
1119         return 0;
1120     }
1121     if(destCapacity<0 || (dest==NULL && destCapacity>0)) {
1122         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1123         return 0;
1124     }
1125     if(!_haveData(*pErrorCode) || auxTrie.index==NULL) {
1126         return 0;
1127     }
1128
1129     UTRIE_GET16(&auxTrie, c, aux);
1130     aux&=_NORM_AUX_FNC_MASK;
1131     if(aux!=0) {
1132         const UChar *s;
1133         int32_t length;
1134
1135         s=(const UChar *)(extraData+aux);
1136         if(*s<0xff00) {
1137             /* s points to the single-unit string */
1138             length=1;
1139         } else {
1140             length=*s&0xff;
1141             ++s;
1142         }
1143         if(0<length && length<=destCapacity) {
1144             uprv_memcpy(dest, s, length*U_SIZEOF_UCHAR);
1145         }
1146         return u_terminateUChars(dest, destCapacity, length, pErrorCode);
1147     } else {
1148         return u_terminateUChars(dest, destCapacity, 0, pErrorCode);
1149     }
1150 }
1151
1152 /* Is c an NF<mode>-skippable code point? See unormimp.h. */
1153 U_CAPI UBool U_EXPORT2
1154 unorm_isNFSkippable(UChar32 c, UNormalizationMode mode) {
1155     uint32_t norm32, mask;
1156     uint16_t aux, fcd;
1157
1158 #if !UNORM_HARDCODE_DATA
1159     UErrorCode errorCode=U_ZERO_ERROR;
1160     if(!_haveData(errorCode)) {
1161         return FALSE;
1162     }
1163 #endif
1164
1165     /* handle trivial cases; set the comparison mask for the normal ones */
1166     switch(mode) {
1167     case UNORM_NONE:
1168         return TRUE;
1169     case UNORM_NFD:
1170         mask=_NORM_CC_MASK|_NORM_QC_NFD;
1171         break;
1172     case UNORM_NFKD:
1173         mask=_NORM_CC_MASK|_NORM_QC_NFKD;
1174         break;
1175     case UNORM_NFC:
1176     /* case UNORM_FCC: */
1177         mask=_NORM_CC_MASK|_NORM_COMBINES_ANY|(_NORM_QC_NFC&_NORM_QC_ANY_NO);
1178         break;
1179     case UNORM_NFKC:
1180         mask=_NORM_CC_MASK|_NORM_COMBINES_ANY|(_NORM_QC_NFKC&_NORM_QC_ANY_NO);
1181         break;
1182     case UNORM_FCD:
1183         /* FCD: skippable if lead cc==0 and trail cc<=1 */
1184         if(fcdTrie.index!=NULL) {
1185             UTRIE_GET16(&fcdTrie, c, fcd);
1186             return fcd<=1;
1187         } else {
1188             return FALSE;
1189         }
1190     default:
1191         return FALSE;
1192     }
1193
1194     /* check conditions (a)..(e), see unormimp.h */
1195     UTRIE_GET32(&normTrie, c, norm32);
1196     if((norm32&mask)!=0) {
1197         return FALSE; /* fails (a)..(e), not skippable */
1198     }
1199
1200     if(mode<UNORM_NFC) {
1201         return TRUE; /* NF*D, passed (a)..(c), is skippable */
1202     }
1203
1204     /* NF*C/FCC, passed (a)..(e) */
1205     if((norm32&_NORM_QC_NFD)==0) {
1206         return TRUE; /* no canonical decomposition, is skippable */
1207     }
1208
1209     /* check Hangul syllables algorithmically */
1210     if(isNorm32HangulOrJamo(norm32)) {
1211         /* Jamo passed (a)..(e) above, must be Hangul */
1212         return !isHangulWithoutJamoT((UChar)c); /* LVT are skippable, LV are not */
1213     }
1214
1215     /* if(mode<=UNORM_NFKC) { -- enable when implementing FCC */
1216     /* NF*C, test (f) flag */
1217     if(!formatVersion_2_2 || auxTrie.index==NULL) {
1218         return FALSE; /* no (f) data, say not skippable to be safe */
1219     }
1220
1221     UTRIE_GET16(&auxTrie, c, aux);
1222     return (aux&_NORM_AUX_NFC_SKIP_F_MASK)==0; /* TRUE=skippable if the (f) flag is not set */
1223
1224     /* } else { FCC, test fcd<=1 instead of the above } */
1225 }
1226
1227 U_CAPI void U_EXPORT2
1228 unorm_addPropertyStarts(const USetAdder *sa, UErrorCode *pErrorCode) {
1229     UChar c;
1230
1231     if(!_haveData(*pErrorCode)) {
1232         return;
1233     }
1234
1235     /* add the start code point of each same-value range of each trie */
1236     utrie_enum(&normTrie, NULL, _enumPropertyStartsRange, sa);
1237     if(fcdTrie.index!=NULL) {
1238         utrie_enum(&fcdTrie, NULL, _enumPropertyStartsRange, sa);
1239     }
1240     if(auxTrie.index!=NULL) {
1241         utrie_enum(&auxTrie, NULL, _enumPropertyStartsRange, sa);
1242     }
1243
1244     /* add Hangul LV syllables and LV+1 because of skippables */
1245     for(c=HANGUL_BASE; c<HANGUL_BASE+HANGUL_COUNT; c+=JAMO_T_COUNT) {
1246         sa->add(sa->set, c);
1247         sa->add(sa->set, c+1);
1248     }
1249     sa->add(sa->set, HANGUL_BASE+HANGUL_COUNT); /* add Hangul+1 to continue with other properties */
1250 }
1251
1252 U_CFUNC UNormalizationCheckResult U_EXPORT2
1253 unorm_getQuickCheck(UChar32 c, UNormalizationMode mode) {
1254     static const uint32_t qcMask[UNORM_MODE_COUNT]={
1255         0, 0, _NORM_QC_NFD, _NORM_QC_NFKD, _NORM_QC_NFC, _NORM_QC_NFKC
1256     };
1257
1258     uint32_t norm32;
1259
1260 #if !UNORM_HARDCODE_DATA
1261     UErrorCode errorCode=U_ZERO_ERROR;
1262     if(!_haveData(errorCode)) {
1263         return UNORM_YES;
1264     }
1265 #endif
1266
1267     UTRIE_GET32(&normTrie, c, norm32);
1268     norm32&=qcMask[mode];
1269
1270     if(norm32==0) {
1271         return UNORM_YES;
1272     } else if(norm32&_NORM_QC_ANY_NO) {
1273         return UNORM_NO;
1274     } else /* _NORM_QC_ANY_MAYBE */ {
1275         return UNORM_MAYBE;
1276     }
1277 }
1278
1279 U_CFUNC uint16_t U_EXPORT2
1280 unorm_getFCD16FromCodePoint(UChar32 c) {
1281     uint16_t fcd;
1282 #if !UNORM_HARDCODE_DATA
1283     UErrorCode errorCode;
1284     errorCode=U_ZERO_ERROR;
1285 #endif
1286
1287     if(
1288 #if !UNORM_HARDCODE_DATA
1289         !_haveData(errorCode) ||
1290 #endif
1291         fcdTrie.index==NULL
1292     ) {
1293         return 0;
1294     }
1295
1296     UTRIE_GET16(&fcdTrie, c, fcd);
1297     return fcd;
1298 }
1299
1300 /* reorder UTF-16 in-place -------------------------------------------------- */
1301
1302 /*
1303  * simpler, single-character version of _mergeOrdered() -
1304  * bubble-insert one single code point into the preceding string
1305  * which is already canonically ordered
1306  * (c, c2) may or may not yet have been inserted at [current..p[
1307  *
1308  * it must be p=current+lengthof(c, c2) i.e. p=current+(c2==0 ? 1 : 2)
1309  *
1310  * before: [start..current[ is already ordered, and
1311  *         [current..p[     may or may not hold (c, c2) but
1312  *                          must be exactly the same length as (c, c2)
1313  * after: [start..p[ is ordered
1314  *
1315  * returns the trailing combining class
1316  */
1317 static uint8_t
1318 _insertOrdered(const UChar *start, UChar *current, UChar *p,
1319                UChar c, UChar c2, uint8_t cc) {
1320     const UChar *pBack, *pPreBack;
1321     UChar *r;
1322     uint8_t prevCC, trailCC=cc;
1323
1324     if(start<current && cc!=0) {
1325         /* search for the insertion point where cc>=prevCC */
1326         pPreBack=pBack=current;
1327         prevCC=_getPrevCC(start, pPreBack);
1328         if(cc<prevCC) {
1329             /* this will be the last code point, so keep its cc */
1330             trailCC=prevCC;
1331             pBack=pPreBack;
1332             while(start<pPreBack) {
1333                 prevCC=_getPrevCC(start, pPreBack);
1334                 if(cc>=prevCC) {
1335                     break;
1336                 }
1337                 pBack=pPreBack;
1338             }
1339
1340             /*
1341              * this is where we are right now with all these pointers:
1342              * [start..pPreBack[ 0..? code points that we can ignore
1343              * [pPreBack..pBack[ 0..1 code points with prevCC<=cc
1344              * [pBack..current[  0..n code points with >cc, move up to insert (c, c2)
1345              * [current..p[         1 code point (c, c2) with cc
1346              */
1347
1348             /* move the code units in between up */
1349             r=p;
1350             do {
1351                 *--r=*--current;
1352             } while(pBack!=current);
1353         }
1354     }
1355
1356     /* insert (c, c2) */
1357     *current=c;
1358     if(c2!=0) {
1359         *(current+1)=c2;
1360     }
1361
1362     /* we know the cc of the last code point */
1363     return trailCC;
1364 }
1365
1366 /*
1367  * merge two UTF-16 string parts together
1368  * to canonically order (order by combining classes) their concatenation
1369  *
1370  * the two strings may already be adjacent, so that the merging is done in-place
1371  * if the two strings are not adjacent, then the buffer holding the first one
1372  * must be large enough
1373  * the second string may or may not be ordered in itself
1374  *
1375  * before: [start..current[ is already ordered, and
1376  *         [next..limit[    may be ordered in itself, but
1377  *                          is not in relation to [start..current[
1378  * after: [start..current+(limit-next)[ is ordered
1379  *
1380  * the algorithm is a simple bubble-sort that takes the characters from *next++
1381  * and inserts them in correct combining class order into the preceding part
1382  * of the string
1383  *
1384  * since this function is called much less often than the single-code point
1385  * _insertOrdered(), it just uses that for easier maintenance
1386  * (see file version from before 2001aug31 for a more optimized version)
1387  *
1388  * returns the trailing combining class
1389  */
1390 static uint8_t
1391 _mergeOrdered(UChar *start, UChar *current,
1392               const UChar *next, const UChar *limit, UBool isOrdered=TRUE) {
1393     UChar *r;
1394     UChar c, c2;
1395     uint8_t cc, trailCC=0;
1396     UBool adjacent;
1397
1398     adjacent= current==next;
1399
1400     if(start!=current || !isOrdered) {
1401         while(next<limit) {
1402             cc=_getNextCC(next, limit, c, c2);
1403             if(cc==0) {
1404                 /* does not bubble back */
1405                 trailCC=0;
1406                 if(adjacent) {
1407                     current=(UChar *)next;
1408                 } else {
1409                     *current++=c;
1410                     if(c2!=0) {
1411                         *current++=c2;
1412                     }
1413                 }
1414                 if(isOrdered) {
1415                     break;
1416                 } else {
1417                     start=current;
1418                 }
1419             } else {
1420                 r=current+(c2==0 ? 1 : 2);
1421                 trailCC=_insertOrdered(start, current, r, c, c2, cc);
1422                 current=r;
1423             }
1424         }
1425     }
1426
1427     if(next==limit) {
1428         /* we know the cc of the last code point */
1429         return trailCC;
1430     } else {
1431         if(!adjacent) {
1432             /* copy the second string part */
1433             do {
1434                 *current++=*next++;
1435             } while(next!=limit);
1436             limit=current;
1437         }
1438         return _getPrevCC(start, limit);
1439     }
1440 }
1441
1442 /* find the last true starter in [start..src[ and return the pointer to it */
1443 static const UChar *
1444 _findPreviousStarter(const UChar *start, const UChar *src,
1445                      uint32_t ccOrQCMask, uint32_t decompQCMask, UChar minNoMaybe) {
1446     uint32_t norm32;
1447     UChar c, c2;
1448
1449     while(start<src) {
1450         norm32=_getPrevNorm32(start, src, minNoMaybe, ccOrQCMask|decompQCMask, c, c2);
1451         if(_isTrueStarter(norm32, ccOrQCMask, decompQCMask)) {
1452             break;
1453         }
1454     }
1455     return src;
1456 }
1457
1458 /* find the first true starter in [src..limit[ and return the pointer to it */
1459 static const UChar *
1460 _findNextStarter(const UChar *src, const UChar *limit,
1461                  uint32_t qcMask, uint32_t decompQCMask, UChar minNoMaybe) {
1462     const UChar *p;
1463     uint32_t norm32, ccOrQCMask;
1464     int32_t length;
1465     UChar c, c2;
1466     uint8_t cc, trailCC;
1467
1468     ccOrQCMask=_NORM_CC_MASK|qcMask;
1469
1470     for(;;) {
1471         if(src==limit) {
1472             break; /* end of string */
1473         }
1474         c=*src;
1475         if(c<minNoMaybe) {
1476             break; /* catches NUL terminater, too */
1477         }
1478
1479         norm32=_getNorm32(c);
1480         if((norm32&ccOrQCMask)==0) {
1481             break; /* true starter */
1482         }
1483
1484         if(isNorm32LeadSurrogate(norm32)) {
1485             /* c is a lead surrogate, get the real norm32 */
1486             if((src+1)==limit || !UTF_IS_SECOND_SURROGATE(c2=*(src+1))) {
1487                 break; /* unmatched first surrogate: counts as a true starter */
1488             }
1489             norm32=_getNorm32FromSurrogatePair(norm32, c2);
1490
1491             if((norm32&ccOrQCMask)==0) {
1492                 break; /* true starter */
1493             }
1494         } else {
1495             c2=0;
1496         }
1497
1498         /* (c, c2) is not a true starter but its decomposition may be */
1499         if(norm32&decompQCMask) {
1500             /* (c, c2) decomposes, get everything from the variable-length extra data */
1501             p=_decompose(norm32, decompQCMask, length, cc, trailCC);
1502
1503             /* get the first character's norm32 to check if it is a true starter */
1504             if(cc==0 && (_getNorm32(p, qcMask)&qcMask)==0) {
1505                 break; /* true starter */
1506             }
1507         }
1508
1509         src+= c2==0 ? 1 : 2; /* not a true starter, continue */
1510     }
1511
1512     return src;
1513 }
1514
1515 /* make NFD & NFKD ---------------------------------------------------------- */
1516
1517 U_CAPI int32_t U_EXPORT2
1518 unorm_getDecomposition(UChar32 c, UBool compat,
1519                        UChar *dest, int32_t destCapacity) {
1520 #if !UNORM_HARDCODE_DATA
1521     UErrorCode errorCode=U_ZERO_ERROR;
1522 #endif
1523     if( (uint32_t)c<=0x10ffff &&
1524 #if !UNORM_HARDCODE_DATA
1525         _haveData(errorCode) &&
1526 #endif
1527         ((dest!=NULL && destCapacity>0) || destCapacity==0)
1528     ) {
1529         uint32_t norm32, qcMask;
1530         UChar32 minNoMaybe;
1531         int32_t length;
1532
1533         /* initialize */
1534         if(!compat) {
1535             minNoMaybe=(UChar32)indexes[_NORM_INDEX_MIN_NFD_NO_MAYBE];
1536             qcMask=_NORM_QC_NFD;
1537         } else {
1538             minNoMaybe=(UChar32)indexes[_NORM_INDEX_MIN_NFKD_NO_MAYBE];
1539             qcMask=_NORM_QC_NFKD;
1540         }
1541
1542         if(c<minNoMaybe) {
1543             /* trivial case */
1544             if(destCapacity>0) {
1545                 dest[0]=(UChar)c;
1546             }
1547             return -1;
1548         }
1549
1550         /* data lookup */
1551         UTRIE_GET32(&normTrie, c, norm32);
1552         if((norm32&qcMask)==0) {
1553             /* simple case: no decomposition */
1554             if(c<=0xffff) {
1555                 if(destCapacity>0) {
1556                     dest[0]=(UChar)c;
1557                 }
1558                 return -1;
1559             } else {
1560                 if(destCapacity>=2) {
1561                     dest[0]=UTF16_LEAD(c);
1562                     dest[1]=UTF16_TRAIL(c);
1563                 }
1564                 return -2;
1565             }
1566         } else if(isNorm32HangulOrJamo(norm32)) {
1567             /* Hangul syllable: decompose algorithmically */
1568             UChar c2;
1569
1570             c-=HANGUL_BASE;
1571
1572             c2=(UChar)(c%JAMO_T_COUNT);
1573             c/=JAMO_T_COUNT;
1574             if(c2>0) {
1575                 if(destCapacity>=3) {
1576                     dest[2]=(UChar)(JAMO_T_BASE+c2);
1577                 }
1578                 length=3;
1579             } else {
1580                 length=2;
1581             }
1582
1583             if(destCapacity>=2) {
1584                 dest[1]=(UChar)(JAMO_V_BASE+c%JAMO_V_COUNT);
1585                 dest[0]=(UChar)(JAMO_L_BASE+c/JAMO_V_COUNT);
1586             }
1587             return length;
1588         } else {
1589             /* c decomposes, get everything from the variable-length extra data */
1590             const UChar *p, *limit;
1591             uint8_t cc, trailCC;
1592
1593             p=_decompose(norm32, qcMask, length, cc, trailCC);
1594             if(length<=destCapacity) {
1595                 limit=p+length;
1596                 do {
1597                     *dest++=*p++;
1598                 } while(p<limit);
1599             }
1600             return length;
1601         }
1602     } else {
1603         return 0;
1604     }
1605 }
1606
1607 static int32_t
1608 _decompose(UChar *dest, int32_t destCapacity,
1609            const UChar *src, int32_t srcLength,
1610            UBool compat, const UnicodeSet *nx,
1611            uint8_t &outTrailCC) {
1612     UChar buffer[3];
1613     const UChar *limit, *prevSrc, *p;
1614     uint32_t norm32, ccOrQCMask, qcMask;
1615     int32_t destIndex, reorderStartIndex, length;
1616     UChar c, c2, minNoMaybe;
1617     uint8_t cc, prevCC, trailCC;
1618
1619     if(!compat) {
1620         minNoMaybe=(UChar)indexes[_NORM_INDEX_MIN_NFD_NO_MAYBE];
1621         qcMask=_NORM_QC_NFD;
1622     } else {
1623         minNoMaybe=(UChar)indexes[_NORM_INDEX_MIN_NFKD_NO_MAYBE];
1624         qcMask=_NORM_QC_NFKD;
1625     }
1626
1627     /* initialize */
1628     ccOrQCMask=_NORM_CC_MASK|qcMask;
1629     destIndex=reorderStartIndex=0;
1630     prevCC=0;
1631
1632     /* avoid compiler warnings */
1633     norm32=0;
1634     c=0;
1635     cc=0;
1636     trailCC=0;
1637
1638     if(srcLength>=0) {
1639         /* string with length */
1640         limit=src+srcLength;
1641     } else /* srcLength==-1 */ {
1642         /* zero-terminated string */
1643         limit=NULL;
1644     }
1645
1646     U_ALIGN_CODE(16);
1647
1648     for(;;) {
1649         /* count code units below the minimum or with irrelevant data for the quick check */
1650         prevSrc=src;
1651         if(limit==NULL) {
1652             while((c=*src)<minNoMaybe ? c!=0 : ((norm32=_getNorm32(c))&ccOrQCMask)==0) {
1653                 prevCC=0;
1654                 ++src;
1655             }
1656         } else {
1657             while(src!=limit && ((c=*src)<minNoMaybe || ((norm32=_getNorm32(c))&ccOrQCMask)==0)) {
1658                 prevCC=0;
1659                 ++src;
1660             }
1661         }
1662
1663         /* copy these code units all at once */
1664         if(src!=prevSrc) {
1665             length=(int32_t)(src-prevSrc);
1666             if((destIndex+length)<=destCapacity) {
1667                 uprv_memcpy(dest+destIndex, prevSrc, length*U_SIZEOF_UCHAR);
1668             }
1669             destIndex+=length;
1670             reorderStartIndex=destIndex;
1671         }
1672
1673         /* end of source reached? */
1674         if(limit==NULL ? c==0 : src==limit) {
1675             break;
1676         }
1677
1678         /* c already contains *src and norm32 is set for it, increment src */
1679         ++src;
1680
1681         /* check one above-minimum, relevant code unit */
1682         /*
1683          * generally, set p and length to the decomposition string
1684          * in simple cases, p==NULL and (c, c2) will hold the length code units to append
1685          * in all cases, set cc to the lead and trailCC to the trail combining class
1686          *
1687          * the following merge-sort of the current character into the preceding,
1688          * canonically ordered result text will use the optimized _insertOrdered()
1689          * if there is only one single code point to process;
1690          * this is indicated with p==NULL, and (c, c2) is the character to insert
1691          * ((c, 0) for a BMP character and (lead surrogate, trail surrogate)
1692          * for a supplementary character)
1693          * otherwise, p[length] is merged in with _mergeOrdered()
1694          */
1695         if(isNorm32HangulOrJamo(norm32)) {
1696             if(nx_contains(nx, c)) {
1697                 c2=0;
1698                 p=NULL;
1699                 length=1;
1700             } else {
1701                 /* Hangul syllable: decompose algorithmically */
1702                 p=buffer;
1703                 cc=trailCC=0;
1704
1705                 c-=HANGUL_BASE;
1706
1707                 c2=(UChar)(c%JAMO_T_COUNT);
1708                 c/=JAMO_T_COUNT;
1709                 if(c2>0) {
1710                     buffer[2]=(UChar)(JAMO_T_BASE+c2);
1711                     length=3;
1712                 } else {
1713                     length=2;
1714                 }
1715
1716                 buffer[1]=(UChar)(JAMO_V_BASE+c%JAMO_V_COUNT);
1717                 buffer[0]=(UChar)(JAMO_L_BASE+c/JAMO_V_COUNT);
1718             }
1719         } else {
1720             if(isNorm32Regular(norm32)) {
1721                 c2=0;
1722                 length=1;
1723             } else {
1724                 /* c is a lead surrogate, get the real norm32 */
1725                 if(src!=limit && UTF_IS_SECOND_SURROGATE(c2=*src)) {
1726                     ++src;
1727                     length=2;
1728                     norm32=_getNorm32FromSurrogatePair(norm32, c2);
1729                 } else {
1730                     c2=0;
1731                     length=1;
1732                     norm32=0;
1733                 }
1734             }
1735
1736             /* get the decomposition and the lead and trail cc's */
1737             if(nx_contains(nx, c, c2)) {
1738                 /* excluded: norm32==0 */
1739                 cc=trailCC=0;
1740                 p=NULL;
1741             } else if((norm32&qcMask)==0) {
1742                 /* c does not decompose */
1743                 cc=trailCC=(uint8_t)(norm32>>_NORM_CC_SHIFT);
1744                 p=NULL;
1745             } else {
1746                 /* c decomposes, get everything from the variable-length extra data */
1747                 p=_decompose(norm32, qcMask, length, cc, trailCC);
1748                 if(length==1) {
1749                     /* fastpath a single code unit from decomposition */
1750                     c=*p;
1751                     c2=0;
1752                     p=NULL;
1753                 }
1754             }
1755         }
1756
1757         /* append the decomposition to the destination buffer, assume length>0 */
1758         if((destIndex+length)<=destCapacity) {
1759             UChar *reorderSplit=dest+destIndex;
1760             if(p==NULL) {
1761                 /* fastpath: single code point */
1762                 if(cc!=0 && cc<prevCC) {
1763                     /* (c, c2) is out of order with respect to the preceding text */
1764                     destIndex+=length;
1765                     trailCC=_insertOrdered(dest+reorderStartIndex, reorderSplit, dest+destIndex, c, c2, cc);
1766                 } else {
1767                     /* just append (c, c2) */
1768                     dest[destIndex++]=c;
1769                     if(c2!=0) {
1770                         dest[destIndex++]=c2;
1771                     }
1772                 }
1773             } else {
1774                 /* general: multiple code points (ordered by themselves) from decomposition */
1775                 if(cc!=0 && cc<prevCC) {
1776                     /* the decomposition is out of order with respect to the preceding text */
1777                     destIndex+=length;
1778                     trailCC=_mergeOrdered(dest+reorderStartIndex, reorderSplit, p, p+length);
1779                 } else {
1780                     /* just append the decomposition */
1781                     do {
1782                         dest[destIndex++]=*p++;
1783                     } while(--length>0);
1784                 }
1785             }
1786         } else {
1787             /* buffer overflow */
1788             /* keep incrementing the destIndex for preflighting */
1789             destIndex+=length;
1790         }
1791
1792         prevCC=trailCC;
1793         if(prevCC==0) {
1794             reorderStartIndex=destIndex;
1795         }
1796     }
1797
1798     outTrailCC=prevCC;
1799     return destIndex;
1800 }
1801
1802 U_CAPI int32_t U_EXPORT2
1803 unorm_decompose(UChar *dest, int32_t destCapacity,
1804                 const UChar *src, int32_t srcLength,
1805                 UBool compat, int32_t options,
1806                 UErrorCode *pErrorCode) {
1807     const UnicodeSet *nx;
1808     int32_t destIndex;
1809     uint8_t trailCC;
1810
1811     if(!_haveData(*pErrorCode)) {
1812         return 0;
1813     }
1814
1815     nx=getNX(options, *pErrorCode);
1816     if(U_FAILURE(*pErrorCode)) {
1817         return 0;
1818     }
1819
1820     destIndex=_decompose(dest, destCapacity,
1821                          src, srcLength,
1822                          compat, nx,
1823                          trailCC);
1824
1825     return u_terminateUChars(dest, destCapacity, destIndex, pErrorCode);
1826 }
1827
1828 /* make NFC & NFKC ---------------------------------------------------------- */
1829
1830 /* get the composition properties of the next character */
1831 static inline uint32_t
1832 _getNextCombining(UChar *&p, const UChar *limit,
1833                   UChar &c, UChar &c2,
1834                   uint16_t &combiningIndex, uint8_t &cc,
1835                   const UnicodeSet *nx) {
1836     uint32_t norm32, combineFlags;
1837
1838     /* get properties */
1839     c=*p++;
1840     norm32=_getNorm32(c);
1841
1842     /* preset output values for most characters */
1843     c2=0;
1844     combiningIndex=0;
1845     cc=0;
1846
1847     if((norm32&(_NORM_CC_MASK|_NORM_COMBINES_ANY))==0) {
1848         return 0;
1849     } else {
1850         if(isNorm32Regular(norm32)) {
1851             /* set cc etc. below */
1852         } else if(isNorm32HangulOrJamo(norm32)) {
1853             /* a compatibility decomposition contained Jamos */
1854             combiningIndex=(uint16_t)(0xfff0|(norm32>>_NORM_EXTRA_SHIFT));
1855             return norm32&_NORM_COMBINES_ANY;
1856         } else {
1857             /* c is a lead surrogate, get the real norm32 */
1858             if(p!=limit && UTF_IS_SECOND_SURROGATE(c2=*p)) {
1859                 ++p;
1860                 norm32=_getNorm32FromSurrogatePair(norm32, c2);
1861             } else {
1862                 c2=0;
1863                 return 0;
1864             }
1865         }
1866
1867         if(nx_contains(nx, c, c2)) {
1868             return 0; /* excluded: norm32==0 */
1869         }
1870
1871         cc=(uint8_t)(norm32>>_NORM_CC_SHIFT);
1872
1873         combineFlags=norm32&_NORM_COMBINES_ANY;
1874         if(combineFlags!=0) {
1875             combiningIndex=*(_getExtraData(norm32)-1);
1876         }
1877         return combineFlags;
1878     }
1879 }
1880
1881 /*
1882  * given a composition-result starter (c, c2) - which means its cc==0,
1883  * it combines forward, it has extra data, its norm32!=0,
1884  * it is not a Hangul or Jamo,
1885  * get just its combineFwdIndex
1886  *
1887  * norm32(c) is special if and only if c2!=0
1888  */
1889 static inline uint16_t
1890 _getCombiningIndexFromStarter(UChar c, UChar c2) {
1891     uint32_t norm32;
1892
1893     norm32=_getNorm32(c);
1894     if(c2!=0) {
1895         norm32=_getNorm32FromSurrogatePair(norm32, c2);
1896     }
1897     return *(_getExtraData(norm32)-1);
1898 }
1899
1900 /*
1901  * Find the recomposition result for
1902  * a forward-combining character
1903  * (specified with a pointer to its part of the combiningTable[])
1904  * and a backward-combining character
1905  * (specified with its combineBackIndex).
1906  *
1907  * If these two characters combine, then set (value, value2)
1908  * with the code unit(s) of the composition character.
1909  *
1910  * Return value:
1911  * 0    do not combine
1912  * 1    combine
1913  * >1   combine, and the composition is a forward-combining starter
1914  *
1915  * See unormimp.h for a description of the composition table format.
1916  */
1917 static inline uint16_t
1918 _combine(const uint16_t *table, uint16_t combineBackIndex,
1919          uint16_t &value, uint16_t &value2) {
1920     uint16_t key;
1921
1922     /* search in the starter's composition table */
1923     for(;;) {
1924         key=*table++;
1925         if(key>=combineBackIndex) {
1926             break;
1927         }
1928         table+= *table&0x8000 ? 2 : 1;
1929     }
1930
1931     /* mask off bit 15, the last-entry-in-the-list flag */
1932     if((key&0x7fff)==combineBackIndex) {
1933         /* found! combine! */
1934         value=*table;
1935
1936         /* is the composition a starter that combines forward? */
1937         key=(uint16_t)((value&0x2000)+1);
1938
1939         /* get the composition result code point from the variable-length result value */
1940         if(value&0x8000) {
1941             if(value&0x4000) {
1942                 /* surrogate pair composition result */
1943                 value=(uint16_t)((value&0x3ff)|0xd800);
1944                 value2=*(table+1);
1945             } else {
1946                 /* BMP composition result U+2000..U+ffff */
1947                 value=*(table+1);
1948                 value2=0;
1949             }
1950         } else {
1951             /* BMP composition result U+0000..U+1fff */
1952             value&=0x1fff;
1953             value2=0;
1954         }
1955
1956         return key;
1957     } else {
1958         /* not found */
1959         return 0;
1960     }
1961 }
1962
1963 static inline UBool
1964 _composeHangul(UChar prev, UChar c, uint32_t norm32, const UChar *&src, const UChar *limit,
1965                UBool compat, UChar *dest, const UnicodeSet *nx) {
1966     if(isJamoVTNorm32JamoV(norm32)) {
1967         /* c is a Jamo V, compose with previous Jamo L and following Jamo T */
1968         prev=(UChar)(prev-JAMO_L_BASE);
1969         if(prev<JAMO_L_COUNT) {
1970             c=(UChar)(HANGUL_BASE+(prev*JAMO_V_COUNT+(c-JAMO_V_BASE))*JAMO_T_COUNT);
1971
1972             /* check if the next character is a Jamo T (normal or compatibility) */
1973             if(src!=limit) {
1974                 UChar next, t;
1975
1976                 next=*src;
1977                 if((t=(UChar)(next-JAMO_T_BASE))<JAMO_T_COUNT) {
1978                     /* normal Jamo T */
1979                     ++src;
1980                     c+=t;
1981                 } else if(compat) {
1982                     /* if NFKC, then check for compatibility Jamo T (BMP only) */
1983                     norm32=_getNorm32(next);
1984                     if(isNorm32Regular(norm32) && (norm32&_NORM_QC_NFKD)) {
1985                         const UChar *p;
1986                         int32_t length;
1987                         uint8_t cc, trailCC;
1988
1989                         p=_decompose(norm32, _NORM_QC_NFKD, length, cc, trailCC);
1990                         if(length==1 && (t=(UChar)(*p-JAMO_T_BASE))<JAMO_T_COUNT) {
1991                             /* compatibility Jamo T */
1992                             ++src;
1993                             c+=t;
1994                         }
1995                     }
1996                 }
1997             }
1998             if(nx_contains(nx, c)) {
1999                 if(!isHangulWithoutJamoT(c)) {
2000                     --src; /* undo ++src from reading the Jamo T */
2001                 }
2002                 return FALSE;
2003             }
2004             if(dest!=0) {
2005                 *dest=c;
2006             }
2007             return TRUE;
2008         }
2009     } else if(isHangulWithoutJamoT(prev)) {
2010         /* c is a Jamo T, compose with previous Hangul LV that does not contain a Jamo T */
2011         c=(UChar)(prev+(c-JAMO_T_BASE));
2012         if(nx_contains(nx, c)) {
2013             return FALSE;
2014         }
2015         if(dest!=0) {
2016             *dest=c;
2017         }
2018         return TRUE;
2019     }
2020     return FALSE;
2021 }
2022
2023 /*
2024  * recompose the characters in [p..limit[
2025  * (which is in NFD - decomposed and canonically ordered),
2026  * adjust limit, and return the trailing cc
2027  *
2028  * since for NFKC we may get Jamos in decompositions, we need to
2029  * recompose those too
2030  *
2031  * note that recomposition never lengthens the text:
2032  * any character consists of either one or two code units;
2033  * a composition may contain at most one more code unit than the original starter,
2034  * while the combining mark that is removed has at least one code unit
2035  */
2036 static uint8_t
2037 _recompose(UChar *p, UChar *&limit, int32_t options, const UnicodeSet *nx) {
2038     UChar *starter, *pRemove, *q, *r;
2039     uint32_t combineFlags;
2040     UChar c, c2;
2041     uint16_t combineFwdIndex, combineBackIndex;
2042     uint16_t result, value, value2;
2043     uint8_t cc, prevCC;
2044     UBool starterIsSupplementary;
2045
2046     starter=NULL;                   /* no starter */
2047     combineFwdIndex=0;              /* will not be used until starter!=NULL - avoid compiler warnings */
2048     combineBackIndex=0;             /* will always be set if combineFlags!=0 - avoid compiler warnings */
2049     value=value2=0;                 /* always set by _combine() before used - avoid compiler warnings */
2050     starterIsSupplementary=FALSE;   /* will not be used until starter!=NULL - avoid compiler warnings */
2051     prevCC=0;
2052
2053     for(;;) {
2054         combineFlags=_getNextCombining(p, limit, c, c2, combineBackIndex, cc, nx);
2055         if((combineFlags&_NORM_COMBINES_BACK) && starter!=NULL) {
2056             if(combineBackIndex&0x8000) {
2057                 /* c is a Jamo V/T, see if we can compose it with the previous character */
2058                 /* for the PRI #29 fix, check that there is no intervening combining mark */
2059                 if((options&UNORM_BEFORE_PRI_29) || prevCC==0) {
2060                     pRemove=NULL; /* NULL while no Hangul composition */
2061                     combineFlags=0;
2062                     c2=*starter;
2063                     if(combineBackIndex==0xfff2) {
2064                         /* Jamo V, compose with previous Jamo L and following Jamo T */
2065                         c2=(UChar)(c2-JAMO_L_BASE);
2066                         if(c2<JAMO_L_COUNT) {
2067                             pRemove=p-1;
2068                             c=(UChar)(HANGUL_BASE+(c2*JAMO_V_COUNT+(c-JAMO_V_BASE))*JAMO_T_COUNT);
2069                             if(p!=limit && (c2=(UChar)(*p-JAMO_T_BASE))<JAMO_T_COUNT) {
2070                                 ++p;
2071                                 c+=c2;
2072                             } else {
2073                                 /* the result is an LV syllable, which is a starter (unlike LVT) */
2074                                 combineFlags=_NORM_COMBINES_FWD;
2075                             }
2076                             if(!nx_contains(nx, c)) {
2077                                 *starter=c;
2078                             } else {
2079                                 /* excluded */
2080                                 if(!isHangulWithoutJamoT(c)) {
2081                                     --p; /* undo the ++p from reading the Jamo T */
2082                                 }
2083                                 /* c is modified but not used any more -- c=*(p-1); -- re-read the Jamo V/T */
2084                                 pRemove=NULL;
2085                             }
2086                         }
2087
2088                     /*
2089                      * Normally, the following can not occur:
2090                      * Since the input is in NFD, there are no Hangul LV syllables that
2091                      * a Jamo T could combine with.
2092                      * All Jamo Ts are combined above when handling Jamo Vs.
2093                      *
2094                      * However, before the PRI #29 fix, this can occur due to
2095                      * an intervening combining mark between the Hangul LV and the Jamo T.
2096                      */
2097                     } else {
2098                         /* Jamo T, compose with previous Hangul that does not have a Jamo T */
2099                         if(isHangulWithoutJamoT(c2)) {
2100                             c2+=(UChar)(c-JAMO_T_BASE);
2101                             if(!nx_contains(nx, c2)) {
2102                                 pRemove=p-1;
2103                                 *starter=c2;
2104                             }
2105                         }
2106                     }
2107
2108                     if(pRemove!=NULL) {
2109                         /* remove the Jamo(s) */
2110                         q=pRemove;
2111                         r=p;
2112                         while(r<limit) {
2113                             *q++=*r++;
2114                         }
2115                         p=pRemove;
2116                         limit=q;
2117                     }
2118
2119                     c2=0; /* c2 held *starter temporarily */
2120
2121                     if(combineFlags!=0) {
2122                         /*
2123                          * not starter=NULL because the composition is a Hangul LV syllable
2124                          * and might combine once more (but only before the PRI #29 fix)
2125                          */
2126
2127                         /* done? */
2128                         if(p==limit) {
2129                             return prevCC;
2130                         }
2131
2132                         /* the composition is a Hangul LV syllable which is a starter that combines forward */
2133                         combineFwdIndex=0xfff0;
2134
2135                         /* we combined; continue with looking for compositions */
2136                         continue;
2137                     }
2138                 }
2139
2140                 /*
2141                  * now: cc==0 and the combining index does not include "forward" ->
2142                  * the rest of the loop body will reset starter to NULL;
2143                  * technically, a composed Hangul syllable is a starter, but it
2144                  * does not combine forward now that we have consumed all eligible Jamos;
2145                  * for Jamo V/T, combineFlags does not contain _NORM_COMBINES_FWD
2146                  */
2147
2148             } else if(
2149                 /* the starter is not a Hangul LV or Jamo V/T and */
2150                 !(combineFwdIndex&0x8000) &&
2151                 /* the combining mark is not blocked and */
2152                 ((options&UNORM_BEFORE_PRI_29) ?
2153                     (prevCC!=cc || prevCC==0) :
2154                     (prevCC<cc || prevCC==0)) &&
2155                 /* the starter and the combining mark (c, c2) do combine and */
2156                 0!=(result=_combine(combiningTable+combineFwdIndex, combineBackIndex, value, value2)) &&
2157                 /* the composition result is not excluded */
2158                 !nx_contains(nx, value, value2)
2159             ) {
2160                 /* replace the starter with the composition, remove the combining mark */
2161                 pRemove= c2==0 ? p-1 : p-2; /* pointer to the combining mark */
2162
2163                 /* replace the starter with the composition */
2164                 *starter=(UChar)value;
2165                 if(starterIsSupplementary) {
2166                     if(value2!=0) {
2167                         /* both are supplementary */
2168                         *(starter+1)=(UChar)value2;
2169                     } else {
2170                         /* the composition is shorter than the starter, move the intermediate characters forward one */
2171                         starterIsSupplementary=FALSE;
2172                         q=starter+1;
2173                         r=q+1;
2174                         while(r<pRemove) {
2175                             *q++=*r++;
2176                         }
2177                         --pRemove;
2178                     }
2179                 } else if(value2!=0) {
2180                     /* the composition is longer than the starter, move the intermediate characters back one */
2181                     starterIsSupplementary=TRUE;
2182                     ++starter; /* temporarily increment for the loop boundary */
2183                     q=pRemove;
2184                     r=++pRemove;
2185                     while(starter<q) {
2186                         *--r=*--q;
2187                     }
2188                     *starter=(UChar)value2;
2189                     --starter; /* undo the temporary increment */
2190                 /* } else { both are on the BMP, nothing more to do */
2191                 }
2192
2193                 /* remove the combining mark by moving the following text over it */
2194                 if(pRemove<p) {
2195                     q=pRemove;
2196                     r=p;
2197                     while(r<limit) {
2198                         *q++=*r++;
2199                     }
2200                     p=pRemove;
2201                     limit=q;
2202                 }
2203
2204                 /* keep prevCC because we removed the combining mark */
2205
2206                 /* done? */
2207                 if(p==limit) {
2208                     return prevCC;
2209                 }
2210
2211                 /* is the composition a starter that combines forward? */
2212                 if(result>1) {
2213                     combineFwdIndex=_getCombiningIndexFromStarter((UChar)value, (UChar)value2);
2214                 } else {
2215                     starter=NULL;
2216                 }
2217
2218                 /* we combined; continue with looking for compositions */
2219                 continue;
2220             }
2221         }
2222
2223         /* no combination this time */
2224         prevCC=cc;
2225         if(p==limit) {
2226             return prevCC;
2227         }
2228
2229         /* if (c, c2) did not combine, then check if it is a starter */
2230         if(cc==0) {
2231             /* found a new starter; combineFlags==0 if (c, c2) is excluded */
2232             if(combineFlags&_NORM_COMBINES_FWD) {
2233                 /* it may combine with something, prepare for it */
2234                 if(c2==0) {
2235                     starterIsSupplementary=FALSE;
2236                     starter=p-1;
2237                 } else {
2238                     starterIsSupplementary=TRUE;
2239                     starter=p-2;
2240                 }
2241                 combineFwdIndex=combineBackIndex;
2242             } else {
2243                 /* it will not combine with anything */
2244                 starter=NULL;
2245             }
2246         } else if(options&_NORM_OPTIONS_COMPOSE_CONTIGUOUS) {
2247             /* FCC: no discontiguous compositions; any intervening character blocks */
2248             starter=NULL;
2249         }
2250     }
2251 }
2252
2253 /* decompose and recompose [prevStarter..src[ */
2254 static const UChar *
2255 _composePart(UChar *stackBuffer, UChar *&buffer, int32_t &bufferCapacity, int32_t &length,
2256              const UChar *prevStarter, const UChar *src,
2257              uint8_t &prevCC,
2258              int32_t options, const UnicodeSet *nx,
2259              UErrorCode *pErrorCode) {
2260     UChar *recomposeLimit;
2261     uint8_t trailCC;
2262     UBool compat;
2263
2264     compat=(UBool)((options&_NORM_OPTIONS_COMPAT)!=0);
2265
2266     /* decompose [prevStarter..src[ */
2267     length=_decompose(buffer, bufferCapacity,
2268                       prevStarter, (int32_t)(src-prevStarter),
2269                       compat, nx,
2270                       trailCC);
2271     if(length>bufferCapacity) {
2272         if(!u_growBufferFromStatic(stackBuffer, &buffer, &bufferCapacity, 2*length, 0)) {
2273             *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
2274             return NULL;
2275         }
2276         length=_decompose(buffer, bufferCapacity,
2277                           prevStarter, (int32_t)(src-prevStarter),
2278                           compat, nx,
2279                           trailCC);
2280     }
2281
2282     /* recompose the decomposition */
2283     recomposeLimit=buffer+length;
2284     if(length>=2) {
2285         prevCC=_recompose(buffer, recomposeLimit, options, nx);
2286     }
2287
2288     /* return with a pointer to the recomposition and its length */
2289     length=(int32_t)(recomposeLimit-buffer);
2290     return buffer;
2291 }
2292
2293 static int32_t
2294 _compose(UChar *dest, int32_t destCapacity,
2295          const UChar *src, int32_t srcLength,
2296          int32_t options, const UnicodeSet *nx,
2297          UErrorCode *pErrorCode) {
2298     UChar stackBuffer[_STACK_BUFFER_CAPACITY];
2299     UChar *buffer;
2300     int32_t bufferCapacity;
2301
2302     const UChar *limit, *prevSrc, *prevStarter;
2303     uint32_t norm32, ccOrQCMask, qcMask;
2304     int32_t destIndex, reorderStartIndex, length;
2305     UChar c, c2, minNoMaybe;
2306     uint8_t cc, prevCC;
2307
2308     if(options&_NORM_OPTIONS_COMPAT) {
2309         minNoMaybe=(UChar)indexes[_NORM_INDEX_MIN_NFKC_NO_MAYBE];
2310         qcMask=_NORM_QC_NFKC;
2311     } else {
2312         minNoMaybe=(UChar)indexes[_NORM_INDEX_MIN_NFC_NO_MAYBE];
2313         qcMask=_NORM_QC_NFC;
2314     }
2315
2316     /* initialize */
2317     buffer=stackBuffer;
2318     bufferCapacity=_STACK_BUFFER_CAPACITY;
2319
2320     /*
2321      * prevStarter points to the last character before the current one
2322      * that is a "true" starter with cc==0 and quick check "yes".
2323      *
2324      * prevStarter will be used instead of looking for a true starter
2325      * while incrementally decomposing [prevStarter..prevSrc[
2326      * in _composePart(). Having a good prevStarter allows to just decompose
2327      * the entire [prevStarter..prevSrc[.
2328      *
2329      * When _composePart() backs out from prevSrc back to prevStarter,
2330      * then it also backs out destIndex by the same amount.
2331      * Therefore, at all times, the (prevSrc-prevStarter) source units
2332      * must correspond 1:1 to destination units counted with destIndex,
2333      * except for reordering.
2334      * This is true for the qc "yes" characters copied in the fast loop,
2335      * and for pure reordering.
2336      * prevStarter must be set forward to src when this is not true:
2337      * In _composePart() and after composing a Hangul syllable.
2338      *
2339      * This mechanism relies on the assumption that the decomposition of a true starter
2340      * also begins with a true starter. gennorm/store.c checks for this.
2341      */
2342     prevStarter=src;
2343
2344     ccOrQCMask=_NORM_CC_MASK|qcMask;
2345     destIndex=reorderStartIndex=0;
2346     prevCC=0;
2347
2348     /* avoid compiler warnings */
2349     norm32=0;
2350     c=0;
2351
2352     if(srcLength>=0) {
2353         /* string with length */
2354         limit=src+srcLength;
2355     } else /* srcLength==-1 */ {
2356         /* zero-terminated string */
2357         limit=NULL;
2358     }
2359
2360     U_ALIGN_CODE(16);
2361
2362     for(;;) {
2363         /* count code units below the minimum or with irrelevant data for the quick check */
2364         prevSrc=src;
2365         if(limit==NULL) {
2366             while((c=*src)<minNoMaybe ? c!=0 : ((norm32=_getNorm32(c))&ccOrQCMask)==0) {
2367                 prevCC=0;
2368                 ++src;
2369             }
2370         } else {
2371             while(src!=limit && ((c=*src)<minNoMaybe || ((norm32=_getNorm32(c))&ccOrQCMask)==0)) {
2372                 prevCC=0;
2373                 ++src;
2374             }
2375         }
2376
2377         /* copy these code units all at once */
2378         if(src!=prevSrc) {
2379             length=(int32_t)(src-prevSrc);
2380             if((destIndex+length)<=destCapacity) {
2381                 uprv_memcpy(dest+destIndex, prevSrc, length*U_SIZEOF_UCHAR);
2382             }
2383             destIndex+=length;
2384             reorderStartIndex=destIndex;
2385
2386             /* set prevStarter to the last character in the quick check loop */
2387             prevStarter=src-1;
2388             if(UTF_IS_SECOND_SURROGATE(*prevStarter) && prevSrc<prevStarter && UTF_IS_FIRST_SURROGATE(*(prevStarter-1))) {
2389                 --prevStarter;
2390             }
2391
2392             prevSrc=src;
2393         }
2394
2395         /* end of source reached? */
2396         if(limit==NULL ? c==0 : src==limit) {
2397             break;
2398         }
2399
2400         /* c already contains *src and norm32 is set for it, increment src */
2401         ++src;
2402
2403         /*
2404          * source buffer pointers:
2405          *
2406          *  all done      quick check   current char  not yet
2407          *                "yes" but     (c, c2)       processed
2408          *                may combine
2409          *                forward
2410          * [-------------[-------------[-------------[-------------[
2411          * |             |             |             |             |
2412          * start         prevStarter   prevSrc       src           limit
2413          *
2414          *
2415          * destination buffer pointers and indexes:
2416          *
2417          *  all done      might take    not filled yet
2418          *                characters for
2419          *                reordering
2420          * [-------------[-------------[-------------[
2421          * |             |             |             |
2422          * dest      reorderStartIndex destIndex     destCapacity
2423          */
2424
2425         /* check one above-minimum, relevant code unit */
2426         /*
2427          * norm32 is for c=*(src-1), and the quick check flag is "no" or "maybe", and/or cc!=0
2428          * check for Jamo V/T, then for surrogates and regular characters
2429          * c is not a Hangul syllable or Jamo L because
2430          * they are not marked with no/maybe for NFC & NFKC (and their cc==0)
2431          */
2432         if(isNorm32HangulOrJamo(norm32)) {
2433             /*
2434              * c is a Jamo V/T:
2435              * try to compose with the previous character, Jamo V also with a following Jamo T,
2436              * and set values here right now in case we just continue with the main loop
2437              */
2438             prevCC=cc=0;
2439             reorderStartIndex=destIndex;
2440
2441             if(
2442                 destIndex>0 &&
2443                 _composeHangul(
2444                     *(prevSrc-1), c, norm32, src, limit, (UBool)((options&_NORM_OPTIONS_COMPAT)!=0),
2445                     destIndex<=destCapacity ? dest+(destIndex-1) : 0,
2446                     nx)
2447             ) {
2448                 prevStarter=src;
2449                 continue;
2450             }
2451
2452             /* the Jamo V/T did not compose into a Hangul syllable, just append to dest */
2453             c2=0;
2454             length=1;
2455             prevStarter=prevSrc;
2456         } else {
2457             if(isNorm32Regular(norm32)) {
2458                 c2=0;
2459                 length=1;
2460             } else {
2461                 /* c is a lead surrogate, get the real norm32 */
2462                 if(src!=limit && UTF_IS_SECOND_SURROGATE(c2=*src)) {
2463                     ++src;
2464                     length=2;
2465                     norm32=_getNorm32FromSurrogatePair(norm32, c2);
2466                 } else {
2467                     /* c is an unpaired lead surrogate, nothing to do */
2468                     c2=0;
2469                     length=1;
2470                     norm32=0;
2471                 }
2472             }
2473
2474             /* we are looking at the character (c, c2) at [prevSrc..src[ */
2475             if(nx_contains(nx, c, c2)) {
2476                 /* excluded: norm32==0 */
2477                 cc=0;
2478             } else if((norm32&qcMask)==0) {
2479                 cc=(uint8_t)(norm32>>_NORM_CC_SHIFT);
2480             } else {
2481                 const UChar *p;
2482                 uint32_t decompQCMask;
2483
2484                 /*
2485                  * find appropriate boundaries around this character,
2486                  * decompose the source text from between the boundaries,
2487                  * and recompose it
2488                  *
2489                  * this puts the intermediate text into the side buffer because
2490                  * it might be longer than the recomposition end result,
2491                  * or the destination buffer may be too short or missing
2492                  *
2493                  * note that destIndex may be adjusted backwards to account
2494                  * for source text that passed the quick check but needed to
2495                  * take part in the recomposition
2496                  */
2497                 decompQCMask=(qcMask<<2)&0xf; /* decomposition quick check mask */
2498
2499                 /*
2500                  * find the last true starter in [prevStarter..src[
2501                  * it is either the decomposition of the current character (at prevSrc),
2502                  * or prevStarter
2503                  */
2504                 if(_isTrueStarter(norm32, ccOrQCMask, decompQCMask)) {
2505                     prevStarter=prevSrc;
2506                 } else {
2507                     /* adjust destIndex: back out what had been copied with qc "yes" */
2508                     destIndex-=(int32_t)(prevSrc-prevStarter);
2509                 }
2510
2511                 /* find the next true starter in [src..limit[ - modifies src to point to the next starter */
2512                 src=_findNextStarter(src, limit, qcMask, decompQCMask, minNoMaybe);
2513
2514                 /* compose [prevStarter..src[ */
2515                 p=_composePart(stackBuffer, buffer, bufferCapacity,
2516                                length,          /* output */
2517                                prevStarter, src,
2518                                prevCC,          /* output */
2519                                options, nx,
2520                                pErrorCode);
2521
2522                 if(p==NULL) {
2523                     destIndex=0;   /* an error occurred (out of memory) */
2524                     break;
2525                 }
2526
2527                 /* append the recomposed buffer contents to the destination buffer */
2528                 if((destIndex+length)<=destCapacity) {
2529                     while(length>0) {
2530                         dest[destIndex++]=*p++;
2531                         --length;
2532                     }
2533                 } else {
2534                     /* buffer overflow */
2535                     /* keep incrementing the destIndex for preflighting */
2536                     destIndex+=length;
2537                 }
2538
2539                 /* set the next starter */
2540                 prevStarter=src;
2541
2542                 continue;
2543             }
2544         }
2545
2546         /* append the single code point (c, c2) to the destination buffer */
2547         if((destIndex+length)<=destCapacity) {
2548             if(cc!=0 && cc<prevCC) {
2549                 /* (c, c2) is out of order with respect to the preceding text */
2550                 UChar *reorderSplit=dest+destIndex;
2551                 destIndex+=length;
2552                 prevCC=_insertOrdered(dest+reorderStartIndex, reorderSplit, dest+destIndex, c, c2, cc);
2553             } else {
2554                 /* just append (c, c2) */
2555                 dest[destIndex++]=c;
2556                 if(c2!=0) {
2557                     dest[destIndex++]=c2;
2558                 }
2559                 prevCC=cc;
2560             }
2561         } else {
2562             /* buffer overflow */
2563             /* keep incrementing the destIndex for preflighting */
2564             destIndex+=length;
2565             prevCC=cc;
2566         }
2567     }
2568
2569     /* cleanup */
2570     if(buffer!=stackBuffer) {
2571         uprv_free(buffer);
2572     }
2573
2574     return destIndex;
2575 }
2576
2577 U_CAPI int32_t U_EXPORT2
2578 unorm_compose(UChar *dest, int32_t destCapacity,
2579               const UChar *src, int32_t srcLength,
2580               UBool compat, int32_t options,
2581               UErrorCode *pErrorCode) {
2582     const UnicodeSet *nx;
2583     int32_t destIndex;
2584
2585     if(!_haveData(*pErrorCode)) {
2586         return 0;
2587     }
2588
2589     nx=getNX(options, *pErrorCode);
2590     if(U_FAILURE(*pErrorCode)) {
2591         return 0;
2592     }
2593
2594     /* reset options bits that should only be set here or inside _compose() */
2595     options&=~(_NORM_OPTIONS_SETS_MASK|_NORM_OPTIONS_COMPAT|_NORM_OPTIONS_COMPOSE_CONTIGUOUS);
2596
2597     if(compat) {
2598         options|=_NORM_OPTIONS_COMPAT;
2599     }
2600
2601     destIndex=_compose(dest, destCapacity,
2602                        src, srcLength,
2603                        options, nx,
2604                        pErrorCode);
2605
2606     return u_terminateUChars(dest, destCapacity, destIndex, pErrorCode);
2607 }
2608
2609 /* make FCD ----------------------------------------------------------------- */
2610
2611 static const UChar *
2612 _findSafeFCD(const UChar *src, const UChar *limit, uint16_t fcd16) {
2613     UChar c, c2;
2614
2615     /*
2616      * find the first position in [src..limit[ after some cc==0 according to FCD data
2617      *
2618      * at the beginning of the loop, we have fcd16 from before src
2619      *
2620      * stop at positions:
2621      * - after trail cc==0
2622      * - at the end of the source
2623      * - before lead cc==0
2624      */
2625     for(;;) {
2626         /* stop if trail cc==0 for the previous character */
2627         if((fcd16&0xff)==0) {
2628             break;
2629         }
2630
2631         /* get c=*src - stop at end of string */
2632         if(src==limit) {
2633             break;
2634         }
2635         c=*src;
2636
2637         /* stop if lead cc==0 for this character */
2638         if(c<_NORM_MIN_WITH_LEAD_CC || (fcd16=_getFCD16(c))==0) {
2639             break; /* catches terminating NUL, too */
2640         }
2641
2642         if(!UTF_IS_FIRST_SURROGATE(c)) {
2643             if(fcd16<=0xff) {
2644                 break;
2645             }
2646             ++src;
2647         } else if((src+1)!=limit && (c2=*(src+1), UTF_IS_SECOND_SURROGATE(c2))) {
2648             /* c is a lead surrogate, get the real fcd16 */
2649             fcd16=_getFCD16FromSurrogatePair(fcd16, c2);
2650             if(fcd16<=0xff) {
2651                 break;
2652             }
2653             src+=2;
2654         } else {
2655             /* c is an unpaired first surrogate, lead cc==0 */
2656             break;
2657         }
2658     }
2659
2660     return src;
2661 }
2662
2663 static uint8_t
2664 _decomposeFCD(const UChar *src, const UChar *decompLimit,
2665               UChar *dest, int32_t &destIndex, int32_t destCapacity,
2666               const UnicodeSet *nx) {
2667     const UChar *p;
2668     uint32_t norm32;
2669     int32_t reorderStartIndex, length;
2670     UChar c, c2;
2671     uint8_t cc, prevCC, trailCC;
2672
2673     /*
2674      * canonically decompose [src..decompLimit[
2675      *
2676      * all characters in this range have some non-zero cc,
2677      * directly or in decomposition,
2678      * so that we do not need to check in the following for quick-check limits etc.
2679      *
2680      * there _are_ _no_ Hangul syllables or Jamos in here because they are FCD-safe (cc==0)!
2681      *
2682      * we also do not need to check for c==0 because we have an established decompLimit
2683      */
2684     reorderStartIndex=destIndex;
2685     prevCC=0;
2686
2687     while(src<decompLimit) {
2688         c=*src++;
2689         norm32=_getNorm32(c);
2690         if(isNorm32Regular(norm32)) {
2691             c2=0;
2692             length=1;
2693         } else {
2694             /*
2695              * reminder: this function is called with [src..decompLimit[
2696              * not containing any Hangul/Jamo characters,
2697              * therefore the only specials are lead surrogates
2698              */
2699             /* c is a lead surrogate, get the real norm32 */
2700             if(src!=decompLimit && UTF_IS_SECOND_SURROGATE(c2=*src)) {
2701                 ++src;
2702                 length=2;
2703                 norm32=_getNorm32FromSurrogatePair(norm32, c2);
2704             } else {
2705                 c2=0;
2706                 length=1;
2707                 norm32=0;
2708             }
2709         }
2710
2711         /* get the decomposition and the lead and trail cc's */
2712         if(nx_contains(nx, c, c2)) {
2713             /* excluded: norm32==0 */
2714             cc=trailCC=0;
2715             p=NULL;
2716         } else if((norm32&_NORM_QC_NFD)==0) {
2717             /* c does not decompose */
2718             cc=trailCC=(uint8_t)(norm32>>_NORM_CC_SHIFT);
2719             p=NULL;
2720         } else {
2721             /* c decomposes, get everything from the variable-length extra data */
2722             p=_decompose(norm32, length, cc, trailCC);
2723             if(length==1) {
2724                 /* fastpath a single code unit from decomposition */
2725                 c=*p;
2726                 c2=0;
2727                 p=NULL;
2728             }
2729         }
2730
2731         /* append the decomposition to the destination buffer, assume length>0 */
2732         if((destIndex+length)<=destCapacity) {
2733             UChar *reorderSplit=dest+destIndex;
2734             if(p==NULL) {
2735                 /* fastpath: single code point */
2736                 if(cc!=0 && cc<prevCC) {
2737                     /* (c, c2) is out of order with respect to the preceding text */
2738                     destIndex+=length;
2739                     trailCC=_insertOrdered(dest+reorderStartIndex, reorderSplit, dest+destIndex, c, c2, cc);
2740                 } else {
2741                     /* just append (c, c2) */
2742                     dest[destIndex++]=c;
2743                     if(c2!=0) {
2744                         dest[destIndex++]=c2;
2745                     }
2746                 }
2747             } else {
2748                 /* general: multiple code points (ordered by themselves) from decomposition */
2749                 if(cc!=0 && cc<prevCC) {
2750                     /* the decomposition is out of order with respect to the preceding text */
2751                     destIndex+=length;
2752                     trailCC=_mergeOrdered(dest+reorderStartIndex, reorderSplit, p, p+length);
2753                 } else {
2754                     /* just append the decomposition */
2755                     do {
2756                         dest[destIndex++]=*p++;
2757                     } while(--length>0);
2758                 }
2759             }
2760         } else {
2761             /* buffer overflow */
2762             /* keep incrementing the destIndex for preflighting */
2763             destIndex+=length;
2764         }
2765
2766         prevCC=trailCC;
2767         if(prevCC==0) {
2768             reorderStartIndex=destIndex;
2769         }
2770     }
2771
2772     return prevCC;
2773 }
2774
2775 static int32_t
2776 unorm_makeFCD(UChar *dest, int32_t destCapacity,
2777               const UChar *src, int32_t srcLength,
2778               const UnicodeSet *nx,
2779               UErrorCode *pErrorCode) {
2780     const UChar *limit, *prevSrc, *decompStart;
2781     int32_t destIndex, length;
2782     UChar c, c2;
2783     uint16_t fcd16;
2784     int16_t prevCC, cc;
2785
2786     if(!_haveData(*pErrorCode)) {
2787         return 0;
2788     }
2789
2790     /* initialize */
2791     decompStart=src;
2792     destIndex=0;
2793     prevCC=0;
2794
2795     /* avoid compiler warnings */
2796     c=0;
2797     fcd16=0;
2798
2799     if(srcLength>=0) {
2800         /* string with length */
2801         limit=src+srcLength;
2802     } else /* srcLength==-1 */ {
2803         /* zero-terminated string */
2804         limit=NULL;
2805     }
2806
2807     U_ALIGN_CODE(16);
2808
2809     for(;;) {
2810         /* skip a run of code units below the minimum or with irrelevant data for the FCD check */
2811         prevSrc=src;
2812         if(limit==NULL) {
2813             for(;;) {
2814                 c=*src;
2815                 if(c<_NORM_MIN_WITH_LEAD_CC) {
2816                     if(c==0) {
2817                         break;
2818                     }
2819                     prevCC=(int16_t)-c;
2820                 } else if((fcd16=_getFCD16(c))==0) {
2821                     prevCC=0;
2822                 } else {
2823                     break;
2824                 }
2825                 ++src;
2826             }
2827         } else {
2828             for(;;) {
2829                 if(src==limit) {
2830                     break;
2831                 } else if((c=*src)<_NORM_MIN_WITH_LEAD_CC) {
2832                     prevCC=(int16_t)-c;
2833                 } else if((fcd16=_getFCD16(c))==0) {
2834                     prevCC=0;
2835                 } else {
2836                     break;
2837                 }
2838                 ++src;
2839             }
2840         }
2841
2842         /*
2843          * prevCC has values from the following ranges:
2844          * 0..0xff - the previous trail combining class
2845          * <0      - the negative value of the previous code unit;
2846          *           that code unit was <_NORM_MIN_WITH_LEAD_CC and its _getFCD16()
2847          *           was deferred so that average text is checked faster
2848          */
2849
2850         /* copy these code units all at once */
2851         if(src!=prevSrc) {
2852             length=(int32_t)(src-prevSrc);
2853             if((destIndex+length)<=destCapacity) {
2854                 uprv_memcpy(dest+destIndex, prevSrc, length*U_SIZEOF_UCHAR);
2855             }
2856             destIndex+=length;
2857             prevSrc=src;
2858
2859             /* prevCC<0 is only possible from the above loop, i.e., only if prevSrc<src */
2860             if(prevCC<0) {
2861                 /* the previous character was <_NORM_MIN_WITH_LEAD_CC, we need to get its trail cc */
2862                 if(!nx_contains(nx, (UChar32)-prevCC)) {
2863                     prevCC=(int16_t)(_getFCD16((UChar)-prevCC)&0xff);
2864                 } else {
2865                     prevCC=0; /* excluded: fcd16==0 */
2866                 }
2867
2868                 /*
2869                  * set a pointer to this below-U+0300 character;
2870                  * if prevCC==0 then it will moved to after this character below
2871                  */
2872                 decompStart=prevSrc-1;
2873             }
2874         }
2875         /*
2876          * now:
2877          * prevSrc==src - used later to adjust destIndex before decomposition
2878          * prevCC>=0
2879          */
2880
2881         /* end of source reached? */
2882         if(limit==NULL ? c==0 : src==limit) {
2883             break;
2884         }
2885
2886         /* set a pointer to after the last source position where prevCC==0 */
2887         if(prevCC==0) {
2888             decompStart=prevSrc;
2889         }
2890
2891         /* c already contains *src and fcd16 is set for it, increment src */
2892         ++src;
2893
2894         /* check one above-minimum, relevant code unit */
2895         if(UTF_IS_FIRST_SURROGATE(c)) {
2896             /* c is a lead surrogate, get the real fcd16 */
2897             if(src!=limit && UTF_IS_SECOND_SURROGATE(c2=*src)) {
2898                 ++src;
2899                 fcd16=_getFCD16FromSurrogatePair(fcd16, c2);
2900             } else {
2901                 c2=0;
2902                 fcd16=0;
2903             }
2904         } else {
2905             c2=0;
2906         }
2907
2908         /* we are looking at the character (c, c2) at [prevSrc..src[ */
2909         if(nx_contains(nx, c, c2)) {
2910             fcd16=0; /* excluded: fcd16==0 */
2911         }
2912
2913         /* check the combining order, get the lead cc */
2914         cc=(int16_t)(fcd16>>8);
2915         if(cc==0 || cc>=prevCC) {
2916             /* the order is ok */
2917             if(cc==0) {
2918                 decompStart=prevSrc;
2919             }
2920             prevCC=(int16_t)(fcd16&0xff);
2921
2922             /* just append (c, c2) */
2923             length= c2==0 ? 1 : 2;
2924             if((destIndex+length)<=destCapacity) {
2925                 dest[destIndex++]=c;
2926                 if(c2!=0) {
2927                     dest[destIndex++]=c2;
2928                 }
2929             } else {
2930                 destIndex+=length;
2931             }
2932         } else {
2933             /*
2934              * back out the part of the source that we copied already but
2935              * is now going to be decomposed;
2936              * prevSrc is set to after what was copied
2937              */
2938             destIndex-=(int32_t)(prevSrc-decompStart);
2939
2940             /*
2941              * find the part of the source that needs to be decomposed;
2942              * to be safe and simple, decompose to before the next character with lead cc==0
2943              */
2944             src=_findSafeFCD(src, limit, fcd16);
2945
2946             /*
2947              * the source text does not fulfill the conditions for FCD;
2948              * decompose and reorder a limited piece of the text
2949              */
2950             prevCC=_decomposeFCD(decompStart, src,
2951                                  dest, destIndex, destCapacity,
2952                                  nx);
2953             decompStart=src;
2954         }
2955     }
2956
2957     return u_terminateUChars(dest, destCapacity, destIndex, pErrorCode);
2958 }
2959
2960 /* quick check functions ---------------------------------------------------- */
2961
2962 static UBool
2963 unorm_checkFCD(const UChar *src, int32_t srcLength, const UnicodeSet *nx) {
2964     const UChar *limit;
2965     UChar c, c2;
2966     uint16_t fcd16;
2967     int16_t prevCC, cc;
2968
2969     /* initialize */
2970     prevCC=0;
2971
2972     if(srcLength>=0) {
2973         /* string with length */
2974         limit=src+srcLength;
2975     } else /* srcLength==-1 */ {
2976         /* zero-terminated string */
2977         limit=NULL;
2978     }
2979
2980     U_ALIGN_CODE(16);
2981
2982     for(;;) {
2983         /* skip a run of code units below the minimum or with irrelevant data for the FCD check */
2984         if(limit==NULL) {
2985             for(;;) {
2986                 c=*src++;
2987                 if(c<_NORM_MIN_WITH_LEAD_CC) {
2988                     if(c==0) {
2989                         return TRUE;
2990                     }
2991                     /*
2992                      * delay _getFCD16(c) for any character <_NORM_MIN_WITH_LEAD_CC
2993                      * because chances are good that the next one will have
2994                      * a leading cc of 0;
2995                      * _getFCD16(-prevCC) is later called when necessary -
2996                      * -c fits into int16_t because it is <_NORM_MIN_WITH_LEAD_CC==0x300
2997                      */
2998                     prevCC=(int16_t)-c;
2999                 } else if((fcd16=_getFCD16(c))==0) {
3000                     prevCC=0;
3001                 } else {
3002                     break;
3003                 }
3004             }
3005         } else {
3006             for(;;) {
3007                 if(src==limit) {
3008                     return TRUE;
3009                 } else if((c=*src++)<_NORM_MIN_WITH_LEAD_CC) {
3010                     prevCC=(int16_t)-c;
3011                 } else if((fcd16=_getFCD16(c))==0) {
3012                     prevCC=0;
3013                 } else {
3014                     break;
3015                 }
3016             }
3017         }
3018
3019         /* check one above-minimum, relevant code unit */
3020         if(UTF_IS_FIRST_SURROGATE(c)) {
3021             /* c is a lead surrogate, get the real fcd16 */
3022             if(src!=limit && UTF_IS_SECOND_SURROGATE(c2=*src)) {
3023                 ++src;
3024                 fcd16=_getFCD16FromSurrogatePair(fcd16, c2);
3025             } else {
3026                 c2=0;
3027                 fcd16=0;
3028             }
3029         } else {
3030             c2=0;
3031         }
3032
3033         if(nx_contains(nx, c, c2)) {
3034             prevCC=0; /* excluded: fcd16==0 */
3035             continue;
3036         }
3037
3038         /*
3039          * prevCC has values from the following ranges:
3040          * 0..0xff - the previous trail combining class
3041          * <0      - the negative value of the previous code unit;
3042          *           that code unit was <_NORM_MIN_WITH_LEAD_CC and its _getFCD16()
3043          *           was deferred so that average text is checked faster
3044          */
3045
3046         /* check the combining order */
3047         cc=(int16_t)(fcd16>>8);
3048         if(cc!=0) {
3049             if(prevCC<0) {
3050                 /* the previous character was <_NORM_MIN_WITH_LEAD_CC, we need to get its trail cc */
3051                 if(!nx_contains(nx, (UChar32)-prevCC)) {
3052                     prevCC=(int16_t)(_getFCD16((UChar)-prevCC)&0xff);
3053                 } else {
3054                     prevCC=0; /* excluded: fcd16==0 */
3055                 }
3056             }
3057
3058             if(cc<prevCC) {
3059                 return FALSE;
3060             }
3061         }
3062         prevCC=(int16_t)(fcd16&0xff);
3063     }
3064 }
3065
3066 static UNormalizationCheckResult
3067 _quickCheck(const UChar *src,
3068             int32_t srcLength,
3069             UNormalizationMode mode,
3070             UBool allowMaybe,
3071             const UnicodeSet *nx,
3072             UErrorCode *pErrorCode) {
3073     UChar stackBuffer[_STACK_BUFFER_CAPACITY];
3074     UChar *buffer;
3075     int32_t bufferCapacity;
3076
3077     const UChar *start, *limit;
3078     uint32_t norm32, qcNorm32, ccOrQCMask, qcMask;
3079     int32_t options;
3080     UChar c, c2, minNoMaybe;
3081     uint8_t cc, prevCC;
3082     UNormalizationCheckResult result;
3083
3084     /* check arguments */
3085     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
3086         return UNORM_MAYBE;
3087     }
3088
3089     if(src==NULL || srcLength<-1) {
3090         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
3091         return UNORM_MAYBE;
3092     }
3093
3094     if(!_haveData(*pErrorCode)) {
3095         return UNORM_MAYBE;
3096     }
3097
3098     /* check for a valid mode and set the quick check minimum and mask */
3099     switch(mode) {
3100     case UNORM_NFC:
3101         minNoMaybe=(UChar)indexes[_NORM_INDEX_MIN_NFC_NO_MAYBE];
3102         qcMask=_NORM_QC_NFC;
3103         options=0;
3104         break;
3105     case UNORM_NFKC:
3106         minNoMaybe=(UChar)indexes[_NORM_INDEX_MIN_NFKC_NO_MAYBE];
3107         qcMask=_NORM_QC_NFKC;
3108         options=_NORM_OPTIONS_COMPAT;
3109         break;
3110     case UNORM_NFD:
3111         minNoMaybe=(UChar)indexes[_NORM_INDEX_MIN_NFD_NO_MAYBE];
3112         qcMask=_NORM_QC_NFD;
3113         options=0;
3114         break;
3115     case UNORM_NFKD:
3116         minNoMaybe=(UChar)indexes[_NORM_INDEX_MIN_NFKD_NO_MAYBE];
3117         qcMask=_NORM_QC_NFKD;
3118         options=_NORM_OPTIONS_COMPAT;
3119         break;
3120     case UNORM_FCD:
3121         if(fcdTrie.index==NULL) {
3122             *pErrorCode=U_UNSUPPORTED_ERROR;
3123             return UNORM_MAYBE;
3124         }
3125         return unorm_checkFCD(src, srcLength, nx) ? UNORM_YES : UNORM_NO;
3126     default:
3127         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
3128         return UNORM_MAYBE;
3129     }
3130
3131     /* initialize */
3132     buffer=stackBuffer;
3133     bufferCapacity=_STACK_BUFFER_CAPACITY;
3134
3135     ccOrQCMask=_NORM_CC_MASK|qcMask;
3136     result=UNORM_YES;
3137     prevCC=0;
3138
3139     start=src;
3140     if(srcLength>=0) {
3141         /* string with length */
3142         limit=src+srcLength;
3143     } else /* srcLength==-1 */ {
3144         /* zero-terminated string */
3145         limit=NULL;
3146     }
3147
3148     U_ALIGN_CODE(16);
3149
3150     for(;;) {
3151         /* skip a run of code units below the minimum or with irrelevant data for the quick check */
3152         if(limit==NULL) {
3153             for(;;) {
3154                 c=*src++;
3155                 if(c<minNoMaybe) {
3156                     if(c==0) {
3157                         goto endloop; /* break out of outer loop */
3158                     }
3159                 } else if(((norm32=_getNorm32(c))&ccOrQCMask)!=0) {
3160                     break;
3161                 }
3162                 prevCC=0;
3163             }
3164         } else {
3165             for(;;) {
3166                 if(src==limit) {
3167                     goto endloop; /* break out of outer loop */
3168                 } else if((c=*src++)>=minNoMaybe && ((norm32=_getNorm32(c))&ccOrQCMask)!=0) {
3169                     break;
3170                 }
3171                 prevCC=0;
3172             }
3173         }
3174
3175         /* check one above-minimum, relevant code unit */
3176         if(isNorm32LeadSurrogate(norm32)) {
3177             /* c is a lead surrogate, get the real norm32 */
3178             if(src!=limit && UTF_IS_SECOND_SURROGATE(c2=*src)) {
3179                 ++src;
3180                 norm32=_getNorm32FromSurrogatePair(norm32, c2);
3181             } else {
3182                 c2=0;
3183                 norm32=0;
3184             }
3185         } else {
3186             c2=0;
3187         }
3188
3189         if(nx_contains(nx, c, c2)) {
3190             /* excluded: norm32==0 */
3191             norm32=0;
3192         }
3193
3194         /* check the combining order */
3195         cc=(uint8_t)(norm32>>_NORM_CC_SHIFT);
3196         if(cc!=0 && cc<prevCC) {
3197             result=UNORM_NO;
3198             break;
3199         }
3200         prevCC=cc;
3201
3202         /* check for "no" or "maybe" quick check flags */
3203         qcNorm32=norm32&qcMask;
3204         if(qcNorm32&_NORM_QC_ANY_NO) {
3205             result=UNORM_NO;
3206             break;
3207         } else if(qcNorm32!=0) {
3208             /* "maybe" can only occur for NFC and NFKC */
3209             if(allowMaybe) {
3210                 result=UNORM_MAYBE;
3211             } else {
3212                 /* normalize a section around here to see if it is really normalized or not */
3213                 const UChar *prevStarter;
3214                 uint32_t decompQCMask;
3215                 int32_t length;
3216
3217                 decompQCMask=(qcMask<<2)&0xf; /* decomposition quick check mask */
3218
3219                 /* find the previous starter */
3220                 prevStarter=src-1; /* set prevStarter to the beginning of the current character */
3221                 if(UTF_IS_TRAIL(*prevStarter)) {
3222                     --prevStarter; /* safe because unpaired surrogates do not result in "maybe" */
3223                 }
3224                 prevStarter=_findPreviousStarter(start, prevStarter, ccOrQCMask, decompQCMask, minNoMaybe);
3225
3226                 /* find the next true starter in [src..limit[ - modifies src to point to the next starter */
3227                 src=_findNextStarter(src, limit, qcMask, decompQCMask, minNoMaybe);
3228
3229                 /* decompose and recompose [prevStarter..src[ */
3230                 _composePart(stackBuffer, buffer, bufferCapacity,
3231                              length,
3232                              prevStarter,
3233                              src,
3234                              prevCC,
3235                              options, nx, pErrorCode);
3236                 if(U_FAILURE(*pErrorCode)) {
3237                     result=UNORM_MAYBE; /* error (out of memory) */
3238                     break;
3239                 }
3240
3241                 /* compare the normalized version with the original */
3242                 if(0!=uprv_strCompare(prevStarter, (int32_t)(src-prevStarter), buffer, length, FALSE, FALSE)) {
3243                     result=UNORM_NO; /* normalization differs */
3244                     break;
3245                 }
3246
3247                 /* continue after the next starter */
3248             }
3249         }
3250     }
3251 endloop:
3252
3253     if(buffer!=stackBuffer) {
3254         uprv_free(buffer);
3255     }
3256
3257     return result;
3258 }
3259
3260 U_CAPI UNormalizationCheckResult U_EXPORT2
3261 unorm_quickCheck(const UChar *src,
3262                  int32_t srcLength,
3263                  UNormalizationMode mode,
3264                  UErrorCode *pErrorCode) {
3265     return _quickCheck(src, srcLength, mode, TRUE, NULL, pErrorCode);
3266 }
3267
3268 U_CAPI UNormalizationCheckResult U_EXPORT2
3269 unorm_quickCheckWithOptions(const UChar *src, int32_t srcLength,
3270                             UNormalizationMode mode, int32_t options,
3271                             UErrorCode *pErrorCode) {
3272     return _quickCheck(src, srcLength, mode, TRUE, getNX(options, *pErrorCode), pErrorCode);
3273 }
3274
3275 U_CFUNC UNormalizationCheckResult
3276 unorm_internalQuickCheck(const UChar *src,
3277                          int32_t srcLength,
3278                          UNormalizationMode mode,
3279                          UBool allowMaybe,
3280                          const UnicodeSet *nx,
3281                          UErrorCode *pErrorCode) {
3282     return _quickCheck(src, srcLength, mode, allowMaybe, nx, pErrorCode);
3283 }
3284
3285 U_CAPI UBool U_EXPORT2
3286 unorm_isNormalized(const UChar *src, int32_t srcLength,
3287                    UNormalizationMode mode,
3288                    UErrorCode *pErrorCode) {
3289     return (UBool)(UNORM_YES==_quickCheck(src, srcLength, mode, FALSE, NULL, pErrorCode));
3290 }
3291
3292 U_CAPI UBool U_EXPORT2
3293 unorm_isNormalizedWithOptions(const UChar *src, int32_t srcLength,
3294                               UNormalizationMode mode, int32_t options,
3295                               UErrorCode *pErrorCode) {
3296     return (UBool)(UNORM_YES==_quickCheck(src, srcLength, mode, FALSE, getNX(options, *pErrorCode), pErrorCode));
3297 }
3298
3299 /* normalize() API ---------------------------------------------------------- */
3300
3301 /**
3302  * Internal API for normalizing.
3303  * Does not check for bad input.
3304  * Requires _haveData() to be true.
3305  * @internal
3306  */
3307 U_CFUNC int32_t
3308 unorm_internalNormalizeWithNX(UChar *dest, int32_t destCapacity,
3309                               const UChar *src, int32_t srcLength,
3310                               UNormalizationMode mode, int32_t options, const UnicodeSet *nx,
3311                               UErrorCode *pErrorCode) {
3312     int32_t destLength;
3313     uint8_t trailCC;
3314
3315     switch(mode) {
3316     case UNORM_NFD:
3317         destLength=_decompose(dest, destCapacity,
3318                               src, srcLength,
3319                               FALSE, nx, trailCC);
3320         break;
3321     case UNORM_NFKD:
3322         destLength=_decompose(dest, destCapacity,
3323                               src, srcLength,
3324                               TRUE, nx, trailCC);
3325         break;
3326     case UNORM_NFC:
3327         destLength=_compose(dest, destCapacity,
3328                             src, srcLength,
3329                             options, nx, pErrorCode);
3330         break;
3331     case UNORM_NFKC:
3332         destLength=_compose(dest, destCapacity,
3333                             src, srcLength,
3334                             options|_NORM_OPTIONS_COMPAT, nx, pErrorCode);
3335         break;
3336     case UNORM_FCD:
3337         if(fcdTrie.index==NULL) {
3338             *pErrorCode=U_UNSUPPORTED_ERROR;
3339             return 0;
3340         }
3341         return unorm_makeFCD(dest, destCapacity,
3342                              src, srcLength,
3343                              nx,
3344                              pErrorCode);
3345 #if 0
3346     case UNORM_FCC:
3347         destLength=_compose(dest, destCapacity,
3348                             src, srcLength,
3349                             options|_NORM_OPTIONS_COMPOSE_CONTIGUOUS, nx, pErrorCode);
3350         break;
3351 #endif
3352     case UNORM_NONE:
3353         /* just copy the string */
3354         if(srcLength==-1) {
3355             srcLength=u_strlen(src);
3356         }
3357         if(srcLength>0 && srcLength<=destCapacity) {
3358             uprv_memcpy(dest, src, srcLength*U_SIZEOF_UCHAR);
3359         }
3360         destLength=srcLength;
3361         break;
3362     default:
3363         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
3364         return 0;
3365     }
3366
3367     return u_terminateUChars(dest, destCapacity, destLength, pErrorCode);
3368 }
3369
3370 /**
3371  * Internal API for normalizing.
3372  * Does not check for bad input.
3373  * @internal
3374  */
3375 U_CAPI int32_t U_EXPORT2
3376 unorm_internalNormalize(UChar *dest, int32_t destCapacity,
3377                         const UChar *src, int32_t srcLength,
3378                         UNormalizationMode mode, int32_t options,
3379                         UErrorCode *pErrorCode) {
3380     const UnicodeSet *nx;
3381
3382     if(!_haveData(*pErrorCode)) {
3383         return 0;
3384     }
3385
3386     nx=getNX(options, *pErrorCode);
3387     if(U_FAILURE(*pErrorCode)) {
3388         return 0;
3389     }
3390
3391     /* reset options bits that should only be set inside unorm_internalNormalizeWithNX() */
3392     options&=~(_NORM_OPTIONS_SETS_MASK|_NORM_OPTIONS_COMPAT|_NORM_OPTIONS_COMPOSE_CONTIGUOUS);
3393
3394     return unorm_internalNormalizeWithNX(dest, destCapacity,
3395                                          src, srcLength,
3396                                          mode, options, nx,
3397                                          pErrorCode);
3398 }
3399
3400 /** Public API for normalizing. */
3401 U_CAPI int32_t U_EXPORT2
3402 unorm_normalize(const UChar *src, int32_t srcLength,
3403                 UNormalizationMode mode, int32_t options,
3404                 UChar *dest, int32_t destCapacity,
3405                 UErrorCode *pErrorCode) {
3406     /* check argument values */
3407     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
3408         return 0;
3409     }
3410
3411     if( destCapacity<0 || (dest==NULL && destCapacity>0) ||
3412         src==NULL || srcLength<-1
3413     ) {
3414         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
3415         return 0;
3416     }
3417
3418     /* check for overlapping src and destination */
3419     if( dest!=NULL &&
3420         ((src>=dest && src<(dest+destCapacity)) ||
3421          (srcLength>0 && dest>=src && dest<(src+srcLength)))
3422     ) {
3423         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
3424         return 0;
3425     }
3426
3427     return unorm_internalNormalize(dest, destCapacity,
3428                                    src, srcLength,
3429                                    mode, options,
3430                                    pErrorCode);
3431 }
3432
3433
3434 /* iteration functions ------------------------------------------------------ */
3435
3436 /*
3437  * These iteration functions are the core implementations of the
3438  * Normalizer class iteration API.
3439  * They read from a UCharIterator into their own buffer
3440  * and normalize into the Normalizer iteration buffer.
3441  * Normalizer itself then iterates over its buffer until that needs to be
3442  * filled again.
3443  */
3444
3445 /*
3446  * ### TODO:
3447  * Now that UCharIterator.next/previous return (int32_t)-1 not (UChar)0xffff
3448  * if iteration bounds are reached,
3449  * try to not call hasNext/hasPrevious and instead check for >=0.
3450  */
3451
3452 /* backward iteration ------------------------------------------------------- */
3453
3454 /*
3455  * read backwards and get norm32
3456  * return 0 if the character is <minC
3457  * if c2!=0 then (c2, c) is a surrogate pair (reversed - c2 is first surrogate but read second!)
3458  */
3459 static inline uint32_t
3460 _getPrevNorm32(UCharIterator &src, uint32_t minC, uint32_t mask, UChar &c, UChar &c2) {
3461     uint32_t norm32;
3462
3463     /* need src.hasPrevious() */
3464     c=(UChar)src.previous(&src);
3465     c2=0;
3466
3467     /* check for a surrogate before getting norm32 to see if we need to predecrement further */
3468     if(c<minC) {
3469         return 0;
3470     } else if(!UTF_IS_SURROGATE(c)) {
3471         return _getNorm32(c);
3472     } else if(UTF_IS_SURROGATE_FIRST(c) || !src.hasPrevious(&src)) {
3473         /* unpaired surrogate */
3474         return 0;
3475     } else if(UTF_IS_FIRST_SURROGATE(c2=(UChar)src.previous(&src))) {
3476         norm32=_getNorm32(c2);
3477         if((norm32&mask)==0) {
3478             /* all surrogate pairs with this lead surrogate have irrelevant data */
3479             return 0;
3480         } else {
3481             /* norm32 must be a surrogate special */
3482             return _getNorm32FromSurrogatePair(norm32, c);
3483         }
3484     } else {
3485         /* unpaired second surrogate, undo the c2=src.previous() movement */
3486         src.move(&src, 1, UITER_CURRENT);
3487         c2=0;
3488         return 0;
3489     }
3490 }
3491
3492 /*
3493  * read backwards and check if the character is a previous-iteration boundary
3494  * if c2!=0 then (c2, c) is a surrogate pair (reversed - c2 is first surrogate but read second!)
3495  */
3496 typedef UBool
3497 IsPrevBoundaryFn(UCharIterator &src, uint32_t minC, uint32_t mask, UChar &c, UChar &c2);
3498
3499 /*
3500  * for NF*D:
3501  * read backwards and check if the lead combining class is 0
3502  * if c2!=0 then (c2, c) is a surrogate pair (reversed - c2 is first surrogate but read second!)
3503  */
3504 static UBool
3505 _isPrevNFDSafe(UCharIterator &src, uint32_t minC, uint32_t ccOrQCMask, UChar &c, UChar &c2) {
3506     return _isNFDSafe(_getPrevNorm32(src, minC, ccOrQCMask, c, c2), ccOrQCMask, ccOrQCMask&_NORM_QC_MASK);
3507 }
3508
3509 /*
3510  * read backwards and check if the character is (or its decomposition begins with)
3511  * a "true starter" (cc==0 and NF*C_YES)
3512  * if c2!=0 then (c2, c) is a surrogate pair (reversed - c2 is first surrogate but read second!)
3513  */
3514 static UBool
3515 _isPrevTrueStarter(UCharIterator &src, uint32_t minC, uint32_t ccOrQCMask, UChar &c, UChar &c2) {
3516     uint32_t norm32, decompQCMask;
3517
3518     decompQCMask=(ccOrQCMask<<2)&0xf; /* decomposition quick check mask */
3519     norm32=_getPrevNorm32(src, minC, ccOrQCMask|decompQCMask, c, c2);
3520     return _isTrueStarter(norm32, ccOrQCMask, decompQCMask);
3521 }
3522
3523 static int32_t
3524 _findPreviousIterationBoundary(UCharIterator &src,
3525                                IsPrevBoundaryFn *isPrevBoundary, uint32_t minC, uint32_t mask,
3526                                UChar *&buffer, int32_t &bufferCapacity,
3527                                int32_t &startIndex,
3528                                UErrorCode *pErrorCode) {
3529     UChar *stackBuffer;
3530     UChar c, c2;
3531     UBool isBoundary;
3532
3533     /* initialize */
3534     stackBuffer=buffer;
3535     startIndex=bufferCapacity; /* fill the buffer from the end backwards */
3536
3537     while(src.hasPrevious(&src)) {
3538         isBoundary=isPrevBoundary(src, minC, mask, c, c2);
3539
3540         /* always write this character to the front of the buffer */
3541         /* make sure there is enough space in the buffer */
3542         if(startIndex < (c2==0 ? 1 : 2)) {
3543             int32_t bufferLength=bufferCapacity;
3544
3545             if(!u_growBufferFromStatic(stackBuffer, &buffer, &bufferCapacity, 2*bufferCapacity, bufferLength)) {
3546                 *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
3547                 src.move(&src, 0, UITER_START);
3548                 return 0;
3549             }
3550
3551             /* move the current buffer contents up */
3552             uprv_memmove(buffer+(bufferCapacity-bufferLength), buffer, bufferLength*U_SIZEOF_UCHAR);
3553             startIndex+=bufferCapacity-bufferLength;
3554         }
3555
3556         buffer[--startIndex]=c;
3557         if(c2!=0) {
3558             buffer[--startIndex]=c2;
3559         }
3560
3561         /* stop if this just-copied character is a boundary */
3562         if(isBoundary) {
3563             break;
3564         }
3565     }
3566
3567     /* return the length of the buffer contents */
3568     return bufferCapacity-startIndex;
3569 }
3570
3571 U_CAPI int32_t U_EXPORT2
3572 unorm_previous(UCharIterator *src,
3573                UChar *dest, int32_t destCapacity,
3574                UNormalizationMode mode, int32_t options,
3575                UBool doNormalize, UBool *pNeededToNormalize,
3576                UErrorCode *pErrorCode) {
3577     UChar stackBuffer[100];
3578     UChar *buffer=NULL;
3579     IsPrevBoundaryFn *isPreviousBoundary=NULL;
3580     uint32_t mask=0;
3581     int32_t startIndex=0, bufferLength=0, bufferCapacity=0, destLength=0;
3582     int32_t c=0, c2=0;
3583     UChar minC=0;
3584
3585     /* check argument values */
3586     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
3587         return 0;
3588     }
3589
3590     if( destCapacity<0 || (dest==NULL && destCapacity>0) ||
3591         src==NULL
3592     ) {
3593         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
3594         return 0;
3595     }
3596
3597     if(!_haveData(*pErrorCode)) {
3598         return 0;
3599     }
3600
3601     if(pNeededToNormalize!=NULL) {
3602         *pNeededToNormalize=FALSE;
3603     }
3604
3605     switch(mode) {
3606     case UNORM_FCD:
3607         if(fcdTrie.index==NULL) {
3608             *pErrorCode=U_UNSUPPORTED_ERROR;
3609             return 0;
3610         }
3611         /* fall through to NFD */
3612     case UNORM_NFD:
3613         isPreviousBoundary=_isPrevNFDSafe;
3614         minC=_NORM_MIN_WITH_LEAD_CC;
3615         mask=_NORM_CC_MASK|_NORM_QC_NFD;
3616         break;
3617     case UNORM_NFKD:
3618         isPreviousBoundary=_isPrevNFDSafe;
3619         minC=_NORM_MIN_WITH_LEAD_CC;
3620         mask=_NORM_CC_MASK|_NORM_QC_NFKD;
3621         break;
3622     case UNORM_NFC:
3623         isPreviousBoundary=_isPrevTrueStarter;
3624         minC=(UChar)indexes[_NORM_INDEX_MIN_NFC_NO_MAYBE];
3625         mask=_NORM_CC_MASK|_NORM_QC_NFC;
3626         break;
3627     case UNORM_NFKC:
3628         isPreviousBoundary=_isPrevTrueStarter;
3629         minC=(UChar)indexes[_NORM_INDEX_MIN_NFKC_NO_MAYBE];
3630         mask=_NORM_CC_MASK|_NORM_QC_NFKC;
3631         break;
3632     case UNORM_NONE:
3633         destLength=0;
3634         if((c=src->previous(src))>=0) {
3635             destLength=1;
3636             if(UTF_IS_TRAIL(c) && (c2=src->previous(src))>=0) {
3637                 if(UTF_IS_LEAD(c2)) {
3638                     if(destCapacity>=2) {
3639                         dest[1]=(UChar)c; /* trail surrogate */
3640                         destLength=2;
3641                     }
3642                     c=c2; /* lead surrogate to be written below */
3643                 } else {
3644                     src->move(src, 1, UITER_CURRENT);
3645                 }
3646             }
3647
3648             if(destCapacity>0) {
3649                 dest[0]=(UChar)c;
3650             }
3651         }
3652         return u_terminateUChars(dest, destCapacity, destLength, pErrorCode);
3653     default:
3654         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
3655         return 0;
3656     }
3657
3658     buffer=stackBuffer;
3659     bufferCapacity=(int32_t)(sizeof(stackBuffer)/U_SIZEOF_UCHAR);
3660     bufferLength=_findPreviousIterationBoundary(*src,
3661                                                 isPreviousBoundary, minC, mask,
3662                                                 buffer, bufferCapacity,
3663                                                 startIndex,
3664                                                 pErrorCode);
3665     if(bufferLength>0) {
3666         if(doNormalize) {
3667             destLength=unorm_internalNormalize(dest, destCapacity,
3668                                                buffer+startIndex, bufferLength,
3669                                                mode, options,
3670                                                pErrorCode);
3671             if(pNeededToNormalize!=0 && U_SUCCESS(*pErrorCode)) {
3672                 *pNeededToNormalize=
3673                     (UBool)(destLength!=bufferLength ||
3674                             0!=uprv_memcmp(dest, buffer+startIndex, destLength*U_SIZEOF_UCHAR));
3675             }
3676         } else {
3677             /* just copy the source characters */
3678             if(destCapacity>0) {
3679                 uprv_memcpy(dest, buffer+startIndex, uprv_min(bufferLength, destCapacity)*U_SIZEOF_UCHAR);
3680             }
3681             destLength=u_terminateUChars(dest, destCapacity, bufferLength, pErrorCode);
3682         }
3683     } else {
3684         destLength=u_terminateUChars(dest, destCapacity, 0, pErrorCode);
3685     }
3686
3687     /* cleanup */
3688     if(buffer!=stackBuffer) {
3689         uprv_free(buffer);
3690     }
3691
3692     return destLength;
3693 }
3694
3695 /* forward iteration -------------------------------------------------------- */
3696
3697 /*
3698  * read forward and get norm32
3699  * return 0 if the character is <minC
3700  * if c2!=0 then (c2, c) is a surrogate pair
3701  * always reads complete characters
3702  */
3703 static inline uint32_t
3704 _getNextNorm32(UCharIterator &src, uint32_t minC, uint32_t mask, UChar &c, UChar &c2) {
3705     uint32_t norm32;
3706
3707     /* need src.hasNext() to be true */
3708     c=(UChar)src.next(&src);
3709     c2=0;
3710
3711     if(c<minC) {
3712         return 0;
3713     }
3714
3715     norm32=_getNorm32(c);
3716     if(UTF_IS_FIRST_SURROGATE(c)) {
3717         if(src.hasNext(&src) && UTF_IS_SECOND_SURROGATE(c2=(UChar)src.current(&src))) {
3718             src.move(&src, 1, UITER_CURRENT); /* skip the c2 surrogate */
3719             if((norm32&mask)==0) {
3720                 /* irrelevant data */
3721                 return 0;
3722             } else {
3723                 /* norm32 must be a surrogate special */
3724                 return _getNorm32FromSurrogatePair(norm32, c2);
3725             }
3726         } else {
3727             /* unmatched surrogate */
3728             c2=0;
3729             return 0;
3730         }
3731     }
3732     return norm32;
3733 }
3734
3735 /*
3736  * read forward and check if the character is a next-iteration boundary
3737  * if c2!=0 then (c, c2) is a surrogate pair
3738  */
3739 typedef UBool
3740 IsNextBoundaryFn(UCharIterator &src, uint32_t minC, uint32_t mask, UChar &c, UChar &c2);
3741
3742 /*
3743  * for NF*D:
3744  * read forward and check if the lead combining class is 0
3745  * if c2!=0 then (c, c2) is a surrogate pair
3746  */
3747 static UBool
3748 _isNextNFDSafe(UCharIterator &src, uint32_t minC, uint32_t ccOrQCMask, UChar &c, UChar &c2) {
3749     return _isNFDSafe(_getNextNorm32(src, minC, ccOrQCMask, c, c2), ccOrQCMask, ccOrQCMask&_NORM_QC_MASK);
3750 }
3751
3752 /*
3753  * for NF*C:
3754  * read forward and check if the character is (or its decomposition begins with)
3755  * a "true starter" (cc==0 and NF*C_YES)
3756  * if c2!=0 then (c, c2) is a surrogate pair
3757  */
3758 static UBool
3759 _isNextTrueStarter(UCharIterator &src, uint32_t minC, uint32_t ccOrQCMask, UChar &c, UChar &c2) {
3760     uint32_t norm32, decompQCMask;
3761
3762     decompQCMask=(ccOrQCMask<<2)&0xf; /* decomposition quick check mask */
3763     norm32=_getNextNorm32(src, minC, ccOrQCMask|decompQCMask, c, c2);
3764     return _isTrueStarter(norm32, ccOrQCMask, decompQCMask);
3765 }
3766
3767 static int32_t
3768 _findNextIterationBoundary(UCharIterator &src,
3769                            IsNextBoundaryFn *isNextBoundary, uint32_t minC, uint32_t mask,
3770                            UChar *&buffer, int32_t &bufferCapacity,
3771                            UErrorCode *pErrorCode) {
3772     UChar *stackBuffer;
3773     int32_t bufferIndex;
3774     UChar c, c2;
3775
3776     if(!src.hasNext(&src)) {
3777         return 0;
3778     }
3779
3780     /* initialize */
3781     stackBuffer=buffer;
3782
3783     /* get one character and ignore its properties */
3784     buffer[0]=c=(UChar)src.next(&src);
3785     bufferIndex=1;
3786     if(UTF_IS_FIRST_SURROGATE(c) && src.hasNext(&src)) {
3787         if(UTF_IS_SECOND_SURROGATE(c2=(UChar)src.next(&src))) {
3788             buffer[bufferIndex++]=c2;
3789         } else {
3790             src.move(&src, -1, UITER_CURRENT); /* back out the non-trail-surrogate */
3791         }
3792     }
3793
3794     /* get all following characters until we see a boundary */
3795     /* checking hasNext() instead of c!=DONE on the off-chance that U+ffff is part of the string */
3796     while(src.hasNext(&src)) {
3797         if(isNextBoundary(src, minC, mask, c, c2)) {
3798             /* back out the latest movement to stop at the boundary */
3799             src.move(&src, c2==0 ? -1 : -2, UITER_CURRENT);
3800             break;
3801         } else {
3802             if(bufferIndex+(c2==0 ? 1 : 2)<=bufferCapacity ||
3803                 /* attempt to grow the buffer */
3804                 u_growBufferFromStatic(stackBuffer, &buffer, &bufferCapacity,
3805                                        2*bufferCapacity,
3806                                        bufferIndex)
3807             ) {
3808                 buffer[bufferIndex++]=c;
3809                 if(c2!=0) {
3810                     buffer[bufferIndex++]=c2;
3811                 }
3812             } else {
3813                 *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
3814                 src.move(&src, 0, UITER_LIMIT);
3815                 return 0;
3816             }
3817         }
3818     }
3819
3820     /* return the length of the buffer contents */
3821     return bufferIndex;
3822 }
3823
3824 U_CAPI int32_t U_EXPORT2
3825 unorm_next(UCharIterator *src,
3826            UChar *dest, int32_t destCapacity,
3827            UNormalizationMode mode, int32_t options,
3828            UBool doNormalize, UBool *pNeededToNormalize,
3829            UErrorCode *pErrorCode) {
3830     UChar stackBuffer[100];
3831     UChar *buffer;
3832     IsNextBoundaryFn *isNextBoundary;
3833     uint32_t mask;
3834     int32_t bufferLength, bufferCapacity, destLength;
3835     int32_t c, c2;
3836     UChar minC;
3837
3838     /* check argument values */
3839     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
3840         return 0;
3841     }
3842
3843     if( destCapacity<0 || (dest==NULL && destCapacity>0) ||
3844         src==NULL
3845     ) {
3846         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
3847         return 0;
3848     }
3849
3850     if(!_haveData(*pErrorCode)) {
3851         return 0;
3852     }
3853
3854     if(pNeededToNormalize!=NULL) {
3855         *pNeededToNormalize=FALSE;
3856     }
3857
3858     switch(mode) {
3859     case UNORM_FCD:
3860         if(fcdTrie.index==NULL) {
3861             *pErrorCode=U_UNSUPPORTED_ERROR;
3862             return 0;
3863         }
3864         /* fall through to NFD */
3865     case UNORM_NFD:
3866         isNextBoundary=_isNextNFDSafe;
3867         minC=_NORM_MIN_WITH_LEAD_CC;
3868         mask=_NORM_CC_MASK|_NORM_QC_NFD;
3869         break;
3870     case UNORM_NFKD:
3871         isNextBoundary=_isNextNFDSafe;
3872         minC=_NORM_MIN_WITH_LEAD_CC;
3873         mask=_NORM_CC_MASK|_NORM_QC_NFKD;
3874         break;
3875     case UNORM_NFC:
3876         isNextBoundary=_isNextTrueStarter;
3877         minC=(UChar)indexes[_NORM_INDEX_MIN_NFC_NO_MAYBE];
3878         mask=_NORM_CC_MASK|_NORM_QC_NFC;
3879         break;
3880     case UNORM_NFKC:
3881         isNextBoundary=_isNextTrueStarter;
3882         minC=(UChar)indexes[_NORM_INDEX_MIN_NFKC_NO_MAYBE];
3883         mask=_NORM_CC_MASK|_NORM_QC_NFKC;
3884         break;
3885     case UNORM_NONE:
3886         destLength=0;
3887         if((c=src->next(src))>=0) {
3888             destLength=1;
3889             if(UTF_IS_LEAD(c) && (c2=src->next(src))>=0) {
3890                 if(UTF_IS_TRAIL(c2)) {
3891                     if(destCapacity>=2) {
3892                         dest[1]=(UChar)c2; /* trail surrogate */
3893                         destLength=2;
3894                     }
3895                     /* lead surrogate to be written below */
3896                 } else {
3897                     src->move(src, -1, UITER_CURRENT);
3898                 }
3899             }
3900
3901             if(destCapacity>0) {
3902                 dest[0]=(UChar)c;
3903             }
3904         }
3905         return u_terminateUChars(dest, destCapacity, destLength, pErrorCode);
3906     default:
3907         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
3908         return 0;
3909     }
3910
3911     buffer=stackBuffer;
3912     bufferCapacity=(int32_t)(sizeof(stackBuffer)/U_SIZEOF_UCHAR);
3913     bufferLength=_findNextIterationBoundary(*src,
3914                                             isNextBoundary, minC, mask,
3915                                             buffer, bufferCapacity,
3916                                             pErrorCode);
3917     if(bufferLength>0) {
3918         if(doNormalize) {
3919             destLength=unorm_internalNormalize(dest, destCapacity,
3920                                                buffer, bufferLength,
3921                                                mode, options,
3922                                                pErrorCode);
3923             if(pNeededToNormalize!=0 && U_SUCCESS(*pErrorCode)) {
3924                 *pNeededToNormalize=
3925                     (UBool)(destLength!=bufferLength ||
3926                             0!=uprv_memcmp(dest, buffer, destLength*U_SIZEOF_UCHAR));
3927             }
3928         } else {
3929             /* just copy the source characters */
3930             if(destCapacity>0) {
3931                 uprv_memcpy(dest, buffer, uprv_min(bufferLength, destCapacity)*U_SIZEOF_UCHAR);
3932             }
3933             destLength=u_terminateUChars(dest, destCapacity, bufferLength, pErrorCode);
3934         }
3935     } else {
3936         destLength=u_terminateUChars(dest, destCapacity, 0, pErrorCode);
3937     }
3938
3939     /* cleanup */
3940     if(buffer!=stackBuffer) {
3941         uprv_free(buffer);
3942     }
3943
3944     return destLength;
3945 }
3946
3947 /*
3948  * ### TODO: check if NF*D and FCD iteration finds optimal boundaries
3949  * and if not, how hard it would be to improve it.
3950  * For example, see _findSafeFCD().
3951  */
3952
3953 /* Concatenation of normalized strings -------------------------------------- */
3954
3955 U_CAPI int32_t U_EXPORT2
3956 unorm_concatenate(const UChar *left, int32_t leftLength,
3957                   const UChar *right, int32_t rightLength,
3958                   UChar *dest, int32_t destCapacity,
3959                   UNormalizationMode mode, int32_t options,
3960                   UErrorCode *pErrorCode) {
3961     UChar stackBuffer[100];
3962     UChar *buffer;
3963     int32_t bufferLength, bufferCapacity;
3964
3965     UCharIterator iter;
3966     int32_t leftBoundary, rightBoundary, destLength;
3967
3968     /* check argument values */
3969     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
3970         return 0;
3971     }
3972
3973     if( destCapacity<0 || (dest==NULL && destCapacity>0) ||
3974         left==NULL || leftLength<-1 ||
3975         right==NULL || rightLength<-1
3976     ) {
3977         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
3978         return 0;
3979     }
3980
3981     /* check for overlapping right and destination */
3982     if( dest!=NULL &&
3983         ((right>=dest && right<(dest+destCapacity)) ||
3984          (rightLength>0 && dest>=right && dest<(right+rightLength)))
3985     ) {
3986         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
3987         return 0;
3988     }
3989
3990     /* allow left==dest */
3991
3992     /* set up intermediate buffer */
3993     buffer=stackBuffer;
3994     bufferCapacity=(int32_t)(sizeof(stackBuffer)/U_SIZEOF_UCHAR);
3995
3996     /*
3997      * Input: left[0..leftLength[ + right[0..rightLength[
3998      *
3999      * Find normalization-safe boundaries leftBoundary and rightBoundary
4000      * and copy the end parts together:
4001      * buffer=left[leftBoundary..leftLength[ + right[0..rightBoundary[
4002      *
4003      * dest=left[0..leftBoundary[ +
4004      *      normalize(buffer) +
4005      *      right[rightBoundary..rightLength[
4006      */
4007
4008     /*
4009      * find a normalization boundary at the end of the left string
4010      * and copy the end part into the buffer
4011      */
4012     uiter_setString(&iter, left, leftLength);
4013     iter.index=leftLength=iter.length; /* end of left string */
4014
4015     bufferLength=unorm_previous(&iter, buffer, bufferCapacity,
4016                                 mode, options,
4017                                 FALSE, NULL,
4018                                 pErrorCode);
4019     leftBoundary=iter.index;
4020     if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
4021         *pErrorCode=U_ZERO_ERROR;
4022         if(!u_growBufferFromStatic(stackBuffer, &buffer, &bufferCapacity, 2*bufferLength, 0)) {
4023             *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
4024             /* dont need to cleanup here since
4025              * u_growBufferFromStatic frees buffer if(buffer!=stackBuffer)
4026              */
4027             return 0;
4028         }
4029
4030         /* just copy from the left string: we know the boundary already */
4031         uprv_memcpy(buffer, left+leftBoundary, bufferLength*U_SIZEOF_UCHAR);
4032     }
4033
4034     /*
4035      * find a normalization boundary at the beginning of the right string
4036      * and concatenate the beginning part to the buffer
4037      */
4038     uiter_setString(&iter, right, rightLength);
4039     rightLength=iter.length; /* in case it was -1 */
4040
4041     rightBoundary=unorm_next(&iter, buffer+bufferLength, bufferCapacity-bufferLength,
4042                              mode, options,
4043                              FALSE, NULL,
4044                              pErrorCode);
4045     if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
4046         *pErrorCode=U_ZERO_ERROR;
4047         if(!u_growBufferFromStatic(stackBuffer, &buffer, &bufferCapacity, bufferLength+rightBoundary, 0)) {
4048             *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
4049             /* dont need to cleanup here since
4050              * u_growBufferFromStatic frees buffer if(buffer!=stackBuffer)
4051              */
4052             return 0;
4053         }
4054
4055         /* just copy from the right string: we know the boundary already */
4056         uprv_memcpy(buffer+bufferLength, right, rightBoundary*U_SIZEOF_UCHAR);
4057     }
4058
4059     bufferLength+=rightBoundary;
4060
4061     /* copy left[0..leftBoundary[ to dest */
4062     if(left!=dest && leftBoundary>0 && destCapacity>0) {
4063         uprv_memcpy(dest, left, uprv_min(leftBoundary, destCapacity)*U_SIZEOF_UCHAR);
4064     }
4065     destLength=leftBoundary;
4066
4067     /* concatenate the normalization of the buffer to dest */
4068     if(destCapacity>destLength) {
4069         destLength+=unorm_internalNormalize(dest+destLength, destCapacity-destLength,
4070                                             buffer, bufferLength,
4071                                             mode, options,
4072                                             pErrorCode);
4073     } else {
4074         destLength+=unorm_internalNormalize(NULL, 0,
4075                                             buffer, bufferLength,
4076                                             mode, options,
4077                                             pErrorCode);
4078     }
4079     /*
4080      * only errorCode that is expected is a U_BUFFER_OVERFLOW_ERROR
4081      * so we dont check for the error code here..just let it pass through
4082      */
4083     /* concatenate right[rightBoundary..rightLength[ to dest */
4084     right+=rightBoundary;
4085     rightLength-=rightBoundary;
4086     if(rightLength>0 && destCapacity>destLength) {
4087         uprv_memcpy(dest+destLength, right, uprv_min(rightLength, destCapacity-destLength)*U_SIZEOF_UCHAR);
4088     }
4089     destLength+=rightLength;
4090
4091     /* cleanup */
4092     if(buffer!=stackBuffer) {
4093         uprv_free(buffer);
4094     }
4095
4096     return u_terminateUChars(dest, destCapacity, destLength, pErrorCode);
4097 }
4098
4099 #endif /* #if !UCONFIG_NO_NORMALIZATION */