icuSources/common/unorm.cpp

   1 /*
   2 ******************************************************************************
   3 * Copyright (c) 1996-2006, International Business Machines
   4 * Corporation and others. All Rights Reserved.
   5 ******************************************************************************
   6 * File unorm.cpp
   7 *
   8 * Created by: Vladimir Weinstein 12052000
   9 *
  10 * Modification history :
  11 *
  12 * Date        Name        Description
  13 * 02/01/01    synwee      Added normalization quickcheck enum and method.
  14 * 02/12/01    synwee      Commented out quickcheck until the API has been approved
  15 *                         Added private method for doing FCD checks
  16 * 02/23/01    synwee      Modified quickcheck and checkFCE to run through
  17 *                         string for codepoints < 0x300 for the normalization
  18 *                         mode NFC.
  19 * 05/25/01+   Markus Scherer total rewrite, implement all normalization here
  20 *                         instead of just wrappers around normlzr.cpp,
  21 *                         load unorm.dat, support Unicode 3.1 with
  22 *                         supplementary code points, etc.
  23 */
  24
  25 #include "unicode/utypes.h"
  26
  27 #if !UCONFIG_NO_NORMALIZATION
  28
  29 #include "unicode/udata.h"
  30 #include "unicode/uchar.h"
  31 #include "unicode/ustring.h"
  32 #include "unicode/uiter.h"
  33 #include "unicode/uniset.h"
  34 #include "unicode/usetiter.h"
  35 #include "unicode/unorm.h"
  36 #include "ucln_cmn.h"
  37 #include "unormimp.h"
  38 #include "ucase.h"
  39 #include "cmemory.h"
  40 #include "umutex.h"
  41 #include "utrie.h"
  42 #include "unicode/uset.h"
  43 #include "udataswp.h"
  44 #include "putilimp.h"
  45
  46 /*
  47  * Status of tailored normalization
  48  *
  49  * This was done initially for investigation on Unicode public review issue 7
  50  * (http://www.unicode.org/review/). See Jitterbug 2481.
  51  * While the UTC at meeting #94 (2003mar) did not take up the issue, this is
  52  * a permanent feature in ICU 2.6 in support of IDNA which requires true
  53  * Unicode 3.2 normalization.
  54  * (NormalizationCorrections are rolled into IDNA mapping tables.)
  55  *
  56  * Tailored normalization as implemented here makes it possible to
  57  * "normalize less" than full Unicode normalization would.
  58  * Based internally on a UnicodeSet of code points that are
  59  * "excluded from normalization", the normalization functions leave those
  60  * code points alone ("inert"). This means that tailored normalization
  61  * still transforms text into a canonically equivalent form.
  62  * It does not add decompositions to code points that do not have any or
  63  * change decomposition results.
  64  *
  65  * Any function that searches for a safe boundary has not been touched,
  66  * which means that these functions will be over-pessimistic when
  67  * exclusions are applied.
  68  * This should not matter because subsequent checks and normalizations
  69  * do apply the exclusions; only a little more of the text may be processed
  70  * than necessary under exclusions.
  71  *
  72  * Normalization exclusions have the following effect on excluded code points c:
  73  * - c is not decomposed
  74  * - c is not a composition target
  75  * - c does not combine forward or backward for composition
  76  *   except that this is not implemented for Jamo
  77  * - c is treated as having a combining class of 0
  78  */
  /*
   * Number of elements of a true array (not a pointer or array parameter), as an int32_t.
   * The whole expansion is parenthesized so that it behaves as a single operand
   * in any surrounding expression (for example after sizeof).
   */
  #define LENGTHOF(array) ((int32_t)(sizeof(array)/sizeof((array)[0])))
  80
  81 /*
  82  * This new implementation of the normalization code loads its data from
  83  * unorm.dat, which is generated with the gennorm tool.
  84  * The format of that file is described in unormimp.h .
  85  */
  86
  87 /* -------------------------------------------------------------------------- */
  88
  89 enum {
         /* NOTE(review): presumably the element capacity of stack buffers used elsewhere in this file; no use is visible in this part - confirm */
  90     _STACK_BUFFER_CAPACITY=100
  91 };
  92
  93 /*
  94  * Constants for the bit fields in the options bit set parameter.
  95  * These need not be public.
  96  * A user only needs to know the currently assigned values.
  97  * The number and positions of reserved bits per field can remain private
  98  * and may change in future implementations.
  99  */
 100 enum {
         /* bits 4..0: exclusion-set (NX) flags */
 101     _NORM_OPTIONS_NX_MASK=0x1f,
         /* bits 6..5: Unicode version selector, see internalGetNXUnicode() */
 102     _NORM_OPTIONS_UNICODE_MASK=0x60,
         /* bits 6..0: both fields above together; nxCache[] is indexed by options masked with this value */
 103     _NORM_OPTIONS_SETS_MASK=0x7f,
 104
         /* position of the lowest bit of _NORM_OPTIONS_UNICODE_MASK */
 105     _NORM_OPTIONS_UNICODE_SHIFT=5,
 106
 107     /*
 108      * The following options are used only in some composition functions.
 109      * They use bits 12 and up to preserve lower bits for the available options
 110      * space in unorm_compare() -
 111      * see documentation for UNORM_COMPARE_NORM_OPTIONS_SHIFT.
 112      */
 113
 114     /** Options bit 12, for compatibility vs. canonical decomposition. */
 115     _NORM_OPTIONS_COMPAT=0x1000,
 116     /** Options bit 13, no discontiguous composition (FCC vs. NFC). */
 117     _NORM_OPTIONS_COMPOSE_CONTIGUOUS=0x2000
 118 };
 119
 120 U_CDECL_BEGIN
 121 static inline UBool
 122 isHangulWithoutJamoT(UChar c) {
 123     c-=HANGUL_BASE;
 124     return c<HANGUL_COUNT && c%JAMO_T_COUNT==0;
 125 }
 126
 127 /* norm32 helpers */
 128
 129 /* is this a norm32 with a regular index? */
 130 static inline UBool
 131 isNorm32Regular(uint32_t norm32) {
 132     return norm32<_NORM_MIN_SPECIAL;
 133 }
 134
 135 /* is this a norm32 with a special index for a lead surrogate? */
 136 static inline UBool
 137 isNorm32LeadSurrogate(uint32_t norm32) {
 138     return _NORM_MIN_SPECIAL<=norm32 && norm32<_NORM_SURROGATES_TOP;
 139 }
 140
 141 /* is this a norm32 with a special index for a Hangul syllable or a Jamo? */
 142 static inline UBool
 143 isNorm32HangulOrJamo(uint32_t norm32) {
 144     return norm32>=_NORM_MIN_HANGUL;
 145 }
 146
 147 /*
 148  * Given isNorm32HangulOrJamo(),
 149  * is this a Hangul syllable or a Jamo?
 150  */
 151 /*static inline UBool
 152 isHangulJamoNorm32HangulOrJamoL(uint32_t norm32) {
 153     return norm32<_NORM_MIN_JAMO_V;
 154 }*/
 155
 156 /*
 157  * Given norm32 for Jamo V or T,
 158  * is this a Jamo V?
 159  */
 160 static inline UBool
 161 isJamoVTNorm32JamoV(uint32_t norm32) {
 162     return norm32<_NORM_JAMO_V_TOP;
 163 }
 164
 165 /* load unorm.dat ----------------------------------------------------------- */
 166
 167 /* normTrie: 32-bit trie result may contain a special extraData index with the folding offset */
 168 static int32_t U_CALLCONV
 169 getFoldingNormOffset(uint32_t norm32) {
 170     if(isNorm32LeadSurrogate(norm32)) {
 171         return
 172             UTRIE_BMP_INDEX_LENGTH+
 173                 (((int32_t)norm32>>(_NORM_EXTRA_SHIFT-UTRIE_SURROGATE_BLOCK_BITS))&
 174                  (0x3ff<<UTRIE_SURROGATE_BLOCK_BITS));
 175     } else {
 176         return 0;
 177     }
 178 }
 179
 180 /* auxTrie: the folding offset is in bits 9..0 of the 16-bit trie result */
 181 static int32_t U_CALLCONV
 182 getFoldingAuxOffset(uint32_t data) {
 183     return (int32_t)(data&_NORM_AUX_FNC_MASK)<<UTRIE_SURROGATE_BLOCK_BITS;
 184 }
 185 U_CDECL_END
 186
 187 #define UNORM_HARDCODE_DATA 1
     /*
      * 1: use the data compiled in from unorm_props_data.c
      * 0: load the data file at run time with loadNormData()
      */
 188
 189 #if UNORM_HARDCODE_DATA
 190
 191 /* unorm_props_data.c is machine-generated by gennorm --csource */
     /* NOTE(review): presumably this file defines normTrie, fcdTrie, auxTrie, indexes, extraData, combiningTable and canonStartSets for this configuration - confirm */
 192 #include "unorm_props_data.c"
 193
     /* with hardcoded data this is a compile-time constant */
 194 static const UBool formatVersion_2_2=TRUE;
 195
 196 #else
 197
     /* name and type passed to udata_openChoice() in loadNormData() */
 198 #define DATA_NAME "unorm"
 199 #define DATA_TYPE "icu"
 200
     /* the opened data file; closed and reset by unorm_cleanup() */
 201 static UDataMemory *normData=NULL;
     /* error code of the load attempt; _haveData() reports it again once loading has failed */
 202 static UErrorCode dataErrorCode=U_ZERO_ERROR;
     /* load state: 0=not loaded yet, 1=data available, -1=loading failed */
 203 static int8_t haveNormData=0;
 204
     /* copies of the file's index array and of the unserialized tries, filled by loadNormData() */
 205 static int32_t indexes[_NORM_INDEX_TOP]={ 0 };
 206 static UTrie normTrie={ 0,0,0,0,0,0,0 }, fcdTrie={ 0,0,0,0,0,0,0 }, auxTrie={ 0,0,0,0,0,0,0 };
 207
 208 /*
 209  * pointers into the memory-mapped unorm.icu
 210  */
 211 static const uint16_t *extraData=NULL,
 212                       *combiningTable=NULL,
 213                       *canonStartSets=NULL;
 214
     /* format version copied from the data header by isAcceptable(); the two flags are derived from it in loadNormData() */
 215 static uint8_t formatVersion[4]={ 0, 0, 0, 0 };
 216 static UBool formatVersion_2_1=FALSE, formatVersion_2_2=FALSE;
 217
 218 /* the Unicode version of the normalization data */
 219 static UVersionInfo dataVersion={ 0, 0, 0, 0 };
 220
 221 #endif
 222
 223 /* cache UnicodeSets for each combination of exclusion flags */
     /* indexed by options masked with _NORM_OPTIONS_SETS_MASK; slots are filled on first use and deleted by unorm_cleanup() */
 224 static UnicodeSet *nxCache[_NORM_OPTIONS_SETS_MASK+1]={ NULL };
 225
 226 U_CDECL_BEGIN
 227
 228 static UBool U_CALLCONV
 229 unorm_cleanup(void) {
 230     int32_t i;
 231
 232 #if !UNORM_HARDCODE_DATA
 233     if(normData!=NULL) {
 234         udata_close(normData);
 235         normData=NULL;
 236     }
 237     dataErrorCode=U_ZERO_ERROR;
 238     haveNormData=0;
 239 #endif
 240
 241     for(i=0; i<(int32_t)LENGTHOF(nxCache); ++i) {
 242         if (nxCache[i]) {
 243             delete nxCache[i];
 244             nxCache[i] = 0;
 245         }
 246     }
 247
 248     return TRUE;
 249 }
 250
 251 #if !UNORM_HARDCODE_DATA
 252
 253 static UBool U_CALLCONV
 254 isAcceptable(void * /* context */,
 255              const char * /* type */, const char * /* name */,
 256              const UDataInfo *pInfo) {
 257     if(
 258         pInfo->size>=20 &&
 259         pInfo->isBigEndian==U_IS_BIG_ENDIAN &&
 260         pInfo->charsetFamily==U_CHARSET_FAMILY &&
 261         pInfo->dataFormat[0]==0x4e &&   /* dataFormat="Norm" */
 262         pInfo->dataFormat[1]==0x6f &&
 263         pInfo->dataFormat[2]==0x72 &&
 264         pInfo->dataFormat[3]==0x6d &&
 265         pInfo->formatVersion[0]==2 &&
 266         pInfo->formatVersion[2]==UTRIE_SHIFT &&
 267         pInfo->formatVersion[3]==UTRIE_INDEX_SHIFT
 268     ) {
 269         uprv_memcpy(formatVersion, pInfo->formatVersion, 4);
 270         uprv_memcpy(dataVersion, pInfo->dataVersion, 4);
 271         return TRUE;
 272     } else {
 273         return FALSE;
 274     }
 275 }
 276
 277 #endif
 278
 279 static UBool U_CALLCONV
 280 _enumPropertyStartsRange(const void *context, UChar32 start, UChar32 /*limit*/, uint32_t /*value*/) {
 281     /* add the start code point to the USet */
 282     const USetAdder *sa=(const USetAdder *)context;
 283     sa->add(sa->set, start);
 284     return TRUE;
 285 }
 286
 287 U_CDECL_END
 288
 289 #if !UNORM_HARDCODE_DATA
 290
 291 static int8_t
 292 loadNormData(UErrorCode &errorCode) {
 293     /* load Unicode normalization data from file */
 294
 295     /*
 296      * This lazy intialization with double-checked locking (without mutex protection for
 297      * haveNormData==0) is transiently unsafe under certain circumstances.
 298      * Check the readme and use u_init() if necessary.
 299      *
 300      * While u_init() initializes the main normalization data via this functions,
 301      * it does not do so for exclusion sets (which are fully mutexed).
 302      * This is because
 303      * - there can be many exclusion sets
 304      * - they are rarely used
 305      * - they are not usually used in execution paths that are
 306      *   as performance-sensitive as others
 307      *   (e.g., IDNA takes more time than unorm_quickCheck() anyway)
 308      */
 309     if(haveNormData==0) {
 310         UTrie _normTrie={ 0,0,0,0,0,0,0 }, _fcdTrie={ 0,0,0,0,0,0,0 }, _auxTrie={ 0,0,0,0,0,0,0 };
 311         UDataMemory *data;
 312
 313         const int32_t *p=NULL;
 314         const uint8_t *pb;
 315
 316         if(&errorCode==NULL || U_FAILURE(errorCode)) {
 317             return 0;
 318         }
 319
 320         /* open the data outside the mutex block */
 321         data=udata_openChoice(NULL, DATA_TYPE, DATA_NAME, isAcceptable, NULL, &errorCode);
 322         dataErrorCode=errorCode;
 323         if(U_FAILURE(errorCode)) {
 324             return haveNormData=-1;
 325         }
 326
 327         p=(const int32_t *)udata_getMemory(data);
 328         pb=(const uint8_t *)(p+_NORM_INDEX_TOP);
 329         utrie_unserialize(&_normTrie, pb, p[_NORM_INDEX_TRIE_SIZE], &errorCode);
 330         _normTrie.getFoldingOffset=getFoldingNormOffset;
 331
 332         pb+=p[_NORM_INDEX_TRIE_SIZE]+p[_NORM_INDEX_UCHAR_COUNT]*2+p[_NORM_INDEX_COMBINE_DATA_COUNT]*2;
 333         if(p[_NORM_INDEX_FCD_TRIE_SIZE]!=0) {
 334             utrie_unserialize(&_fcdTrie, pb, p[_NORM_INDEX_FCD_TRIE_SIZE], &errorCode);
 335         }
 336         pb+=p[_NORM_INDEX_FCD_TRIE_SIZE];
 337
 338         if(p[_NORM_INDEX_AUX_TRIE_SIZE]!=0) {
 339             utrie_unserialize(&_auxTrie, pb, p[_NORM_INDEX_AUX_TRIE_SIZE], &errorCode);
 340             _auxTrie.getFoldingOffset=getFoldingAuxOffset;
 341         }
 342
 343         if(U_FAILURE(errorCode)) {
 344             dataErrorCode=errorCode;
 345             udata_close(data);
 346             return haveNormData=-1;
 347         }
 348
 349         /* in the mutex block, set the data for this process */
 350         umtx_lock(NULL);
 351         if(normData==NULL) {
 352             normData=data;
 353             data=NULL;
 354
 355             uprv_memcpy(&indexes, p, sizeof(indexes));
 356             uprv_memcpy(&normTrie, &_normTrie, sizeof(UTrie));
 357             uprv_memcpy(&fcdTrie, &_fcdTrie, sizeof(UTrie));
 358             uprv_memcpy(&auxTrie, &_auxTrie, sizeof(UTrie));
 359         } else {
 360             p=(const int32_t *)udata_getMemory(normData);
 361         }
 362
 363         /* initialize some variables */
 364         extraData=(uint16_t *)((uint8_t *)(p+_NORM_INDEX_TOP)+indexes[_NORM_INDEX_TRIE_SIZE]);
 365         combiningTable=extraData+indexes[_NORM_INDEX_UCHAR_COUNT];
 366         formatVersion_2_1=formatVersion[0]>2 || (formatVersion[0]==2 && formatVersion[1]>=1);
 367         formatVersion_2_2=formatVersion[0]>2 || (formatVersion[0]==2 && formatVersion[1]>=2);
 368         if(formatVersion_2_1) {
 369             canonStartSets=combiningTable+
 370                 indexes[_NORM_INDEX_COMBINE_DATA_COUNT]+
 371                 (indexes[_NORM_INDEX_FCD_TRIE_SIZE]+indexes[_NORM_INDEX_AUX_TRIE_SIZE])/2;
 372         }
 373         haveNormData=1;
 374         ucln_common_registerCleanup(UCLN_COMMON_UNORM, unorm_cleanup);
 375         umtx_unlock(NULL);
 376
 377         /* if a different thread set it first, then close the extra data */
 378         if(data!=NULL) {
 379             udata_close(data); /* NULL if it was set correctly */
 380         }
 381     }
 382
 383     return haveNormData;
 384 }
 385
 386 #endif
 387
 388 static inline UBool
 389 _haveData(UErrorCode &errorCode) {
 390 #if UNORM_HARDCODE_DATA
 391     return U_SUCCESS(errorCode);
 392 #else
 393     if(U_FAILURE(errorCode)) {
 394         return FALSE;
 395     } else if(haveNormData>0) {
 396         return TRUE;
 397     } else if(haveNormData<0) {
 398         errorCode=dataErrorCode;
 399         return FALSE;
 400     } else /* haveNormData==0 */ {
 401         return (UBool)(loadNormData(errorCode)>0);
 402     }
 403 #endif
 404 }
 405
 406 U_CAPI UBool U_EXPORT2
 407 unorm_haveData(UErrorCode *pErrorCode) {
 408     return _haveData(*pErrorCode);
 409 }
 410
 411 U_CAPI const uint16_t * U_EXPORT2
 412 unorm_getFCDTrie(UErrorCode *pErrorCode) {
 413     if(_haveData(*pErrorCode)) {
 414         return fcdTrie.index;
 415     } else {
 416         return NULL;
 417     }
 418 }
 419
 420 /* data access primitives --------------------------------------------------- */
 421
 422 static inline uint32_t
 423 _getNorm32(UChar c) {
 424     return UTRIE_GET32_FROM_LEAD(&normTrie, c);
 425 }
 426
 427 static inline uint32_t
 428 _getNorm32FromSurrogatePair(uint32_t norm32, UChar c2) {
 429     /*
 430      * the surrogate index in norm32 stores only the number of the surrogate index block
 431      * see gennorm/store.c/getFoldedNormValue()
 432      */
 433     norm32=
 434         UTRIE_BMP_INDEX_LENGTH+
 435             ((norm32>>(_NORM_EXTRA_SHIFT-UTRIE_SURROGATE_BLOCK_BITS))&
 436              (0x3ff<<UTRIE_SURROGATE_BLOCK_BITS));
 437     return UTRIE_GET32_FROM_OFFSET_TRAIL(&normTrie, norm32, c2);
 438 }
 439
 440 /*
 441  * get a norm32 from text with complete code points
 442  * (like from decompositions)
 443  */
 444 static inline uint32_t
 445 _getNorm32(const UChar *p, uint32_t mask) {
 446     uint32_t norm32=_getNorm32(*p);
 447     if((norm32&mask) && isNorm32LeadSurrogate(norm32)) {
 448         /* *p is a lead surrogate, get the real norm32 */
 449         norm32=_getNorm32FromSurrogatePair(norm32, *(p+1));
 450     }
 451     return norm32;
 452 }
 453
 454 static inline uint16_t
 455 _getFCD16(UChar c) {
 456     return UTRIE_GET16_FROM_LEAD(&fcdTrie, c);
 457 }
 458
 459 static inline uint16_t
 460 _getFCD16FromSurrogatePair(uint16_t fcd16, UChar c2) {
 461     /* the surrogate index in fcd16 is an absolute offset over the start of stage 1 */
 462     return UTRIE_GET16_FROM_OFFSET_TRAIL(&fcdTrie, fcd16, c2);
 463 }
 464
 465 static inline const uint16_t *
 466 _getExtraData(uint32_t norm32) {
 467     return extraData+(norm32>>_NORM_EXTRA_SHIFT);
 468 }
 469
 470 #if 0
 471 /*
 472  * It is possible to get the FCD data from the main trie if unorm.icu
 473  * was built without the FCD trie, although it is slower.
 474  * This is not implemented because it is hard to test, and because it seems
 475  * unusual to want to use FCD and not build the data file for it.
 476  *
 477  * Untested sample code:
 478  */
     /* This block is disabled by the enclosing #if 0 and is never compiled. */
 479 static inline uint16_t
 480 _getFCD16FromNormData(UChar32 c) {
 481     uint32_t norm32, fcd;
 482
         /* NOTE(review): c is a UChar32, but the one-argument _getNorm32() in this file takes a UChar */
 483     norm32=_getNorm32(c);
 484     if((norm32&_NORM_QC_NFD) && isNorm32Regular(norm32)) {
 485         /* get the lead/trail cc from the decomposition data */
 486         const uint16_t *nfd=_getExtraData(norm32);
             /* NOTE(review): fcd is not assigned when this flag is clear, yet it is returned below */
 487         if(*nfd&_NORM_DECOMP_FLAG_LENGTH_HAS_CC) {
 488             fcd=nfd[1];
 489         }
 490     } else {
 491         fcd=norm32&_NORM_CC_MASK;
 492         if(fcd!=0) {
 493             /* use the code point cc value for both lead and trail cc's */
 494             fcd|=fcd>>_NORM_CC_SHIFT; /* assume that the cc is in bits 15..8 */
 495         }
 496     }
 497
 498     return (uint16_t)fcd;
 499 }
 500 #endif
 501
 502 /* normalization exclusion sets --------------------------------------------- */
 503
 504 /*
 505  * Normalization exclusion UnicodeSets are used for tailored normalization;
 506  * see the comment near the beginning of this file.
 507  *
 508  * By specifying one or several sets of code points,
 509  * those code points become inert for normalization.
 510  */
 511
 512 static const UnicodeSet *
 513 internalGetNXHangul(UErrorCode &errorCode) {
 514     /* internal function, does not check for incoming U_FAILURE */
 515     UBool isCached;
 516
 517     UMTX_CHECK(NULL, (UBool)(nxCache[UNORM_NX_HANGUL]!=NULL), isCached);
 518
 519     if(!isCached) {
 520         UnicodeSet *set=new UnicodeSet(0xac00, 0xd7a3);
 521         if(set==NULL) {
 522             errorCode=U_MEMORY_ALLOCATION_ERROR;
 523             return NULL;
 524         }
 525
 526         umtx_lock(NULL);
 527         if(nxCache[UNORM_NX_HANGUL]==NULL) {
 528             nxCache[UNORM_NX_HANGUL]=set;
 529             set=NULL;
 530             ucln_common_registerCleanup(UCLN_COMMON_UNORM, unorm_cleanup);
 531         }
 532         umtx_unlock(NULL);
 533
 534         delete set;
 535     }
 536
 537     return nxCache[UNORM_NX_HANGUL];
 538 }
 539
 540 /* unorm.cpp 1.116 had and used
 541 static const UnicodeSet *
 542 internalGetNXFromPattern(int32_t options, const char *pattern, UErrorCode &errorCode) {
 543     ...
 544 }
 545 */
 546
 547 /* get and set an exclusion set from a serialized UnicodeSet */
     /*
      * options: index of the nxCache[] slot that is read and, if empty, filled
      * nxIndex: index into canonStartSets[]; the values at nxIndex and nxIndex+1 are
      *          used as start and limit offsets (relative to canonStartSets) of the
      *          serialized set
      * Returns nxCache[options]. That is still NULL if the slot was empty and there
      * is no serialized data for nxIndex. Returns NULL with errorCode set to
      * U_INVALID_FORMAT_ERROR or U_MEMORY_ALLOCATION_ERROR on failure.
      */
 548 static const UnicodeSet *
 549 internalGetSerializedNX(int32_t options, int32_t nxIndex, UErrorCode &errorCode) {
 550     /* internal function, does not check for incoming U_FAILURE */
 551     UBool isCached;
 552
 553     UMTX_CHECK(NULL, (UBool)(nxCache[options]!=NULL), isCached);
 554
         /* build the set only if it is not cached and the data has a non-empty range for nxIndex */
 555     if( !isCached &&
 556         canonStartSets!=NULL &&
 557         canonStartSets[nxIndex]!=0 && canonStartSets[nxIndex+1]>canonStartSets[nxIndex]
 558     ) {
 559         USerializedSet sset;
 560         UnicodeSet *set;
 561         UChar32 start, end;
 562         int32_t i;
 563
 564         if( !uset_getSerializedSet(
 565                     &sset,
 566                     canonStartSets+canonStartSets[nxIndex],
 567                     canonStartSets[nxIndex+1]-canonStartSets[nxIndex])
 568         ) {
 569             errorCode=U_INVALID_FORMAT_ERROR;
 570             return NULL;
 571         }
 572
 573         /* turn the serialized set into a UnicodeSet */
 574         set=new UnicodeSet();
 575         if(set==NULL) {
 576             errorCode=U_MEMORY_ALLOCATION_ERROR;
 577             return NULL;
 578         }
 579         for(i=0; uset_getSerializedRange(&sset, i, &start, &end); ++i) {
 580             set->add(start, end);
 581         }
 582
             /* install the new set unless the slot was filled while it was being built */
 583         umtx_lock(NULL);
 584         if(nxCache[options]==NULL) {
 585             nxCache[options]=set;
 586             set=NULL;
 587             ucln_common_registerCleanup(UCLN_COMMON_UNORM, unorm_cleanup);
 588         }
 589         umtx_unlock(NULL);
 590
             /* set is NULL here if it was installed; otherwise the unused copy is discarded */
 591         delete set;
 592     }
 593
 594     return nxCache[options];
 595 }
 596
 597 static const UnicodeSet *
 598 internalGetNXCJKCompat(UErrorCode &errorCode) {
 599     /* build a set from [[:Ideographic:]&[:NFD_QC=No:]]=[CJK Ideographs]&[has canonical decomposition] */
 600     return internalGetSerializedNX(
 601                 UNORM_NX_CJK_COMPAT,
 602                 _NORM_SET_INDEX_NX_CJK_COMPAT_OFFSET,
 603                 errorCode);
 604 }
 605
 606 static const UnicodeSet *
 607 internalGetNXUnicode(uint32_t options, UErrorCode &errorCode) {
 608     /* internal function, does not check for incoming U_FAILURE */
 609     int32_t nxIndex;
 610
 611     options&=_NORM_OPTIONS_UNICODE_MASK;
 612     switch(options) {
 613     case 0:
 614         return NULL;
 615     case UNORM_UNICODE_3_2:
 616         /* [:^Age=3.2:] */
 617         nxIndex=_NORM_SET_INDEX_NX_UNICODE32_OFFSET;
 618         break;
 619     default:
 620         errorCode=U_ILLEGAL_ARGUMENT_ERROR;
 621         return NULL;
 622     }
 623
 624     /* build a set with all code points that were not designated by the specified Unicode version */
 625     return internalGetSerializedNX(options, nxIndex, errorCode);
 626 }
 627
 628 /* Get a decomposition exclusion set. The data must be loaded. */
     /*
      * options: only the _NORM_OPTIONS_SETS_MASK bits are used; they select the
      *          nxCache[] slot and the subsets to combine
      * Returns the cached set for this combination, building it on first use.
      * Returns NULL on failure (errorCode is set), or when a single basic set
      * is requested and its getter returns NULL.
      */
 629 static const UnicodeSet *
 630 internalGetNX(int32_t options, UErrorCode &errorCode) {
 631     options&=_NORM_OPTIONS_SETS_MASK;
 632
 633     UBool isCached;
 634
 635     UMTX_CHECK(NULL, (UBool)(nxCache[options]!=NULL), isCached);
 636
 637     if(!isCached) {
 638         /* return basic sets */
             /* each of these getters caches its own result in nxCache[] */
 639         if(options==UNORM_NX_HANGUL) {
 640             return internalGetNXHangul(errorCode);
 641         }
 642         if(options==UNORM_NX_CJK_COMPAT) {
 643             return internalGetNXCJKCompat(errorCode);
 644         }
 645         if((options&_NORM_OPTIONS_UNICODE_MASK)!=0 && (options&_NORM_OPTIONS_NX_MASK)==0) {
 646             return internalGetNXUnicode(options, errorCode);
 647         }
 648
 649         /* build a set from multiple subsets */
 650         UnicodeSet *set;
 651         const UnicodeSet *other;
 652
 653         set=new UnicodeSet();
 654         if(set==NULL) {
 655             errorCode=U_MEMORY_ALLOCATION_ERROR;
 656             return NULL;
 657         }
 658
             /* add every requested subset; a getter that returns NULL contributes nothing */
 659         if((options&UNORM_NX_HANGUL)!=0 && NULL!=(other=internalGetNXHangul(errorCode))) {
 660             set->addAll(*other);
 661         }
 662         if((options&UNORM_NX_CJK_COMPAT)!=0 && NULL!=(other=internalGetNXCJKCompat(errorCode))) {
 663             set->addAll(*other);
 664         }
 665         if((options&_NORM_OPTIONS_UNICODE_MASK)!=0 && NULL!=(other=internalGetNXUnicode(options, errorCode))) {
 666             set->addAll(*other);
 667         }
 668
             /* a failure reported by any getter above discards the partial union */
 669         if(U_FAILURE(errorCode)) {
 670             delete set;
 671             return NULL;
 672         }
 673
             /* install the new set unless the slot was filled while it was being built */
 674         umtx_lock(NULL);
 675         if(nxCache[options]==NULL) {
 676             nxCache[options]=set;
 677             set=NULL;
 678             ucln_common_registerCleanup(UCLN_COMMON_UNORM, unorm_cleanup);
 679         }
 680         umtx_unlock(NULL);
 681
             /* set is NULL here if it was installed; otherwise the unused copy is discarded */
 682         delete set;
 683     }
 684
 685     return nxCache[options];
 686 }
 687
 688 static inline const UnicodeSet *
 689 getNX(int32_t options, UErrorCode &errorCode) {
 690     if(U_FAILURE(errorCode) || (options&=_NORM_OPTIONS_SETS_MASK)==0) {
 691         /* incoming failure, or no decomposition exclusions requested */
 692         return NULL;
 693     } else {
 694         return internalGetNX(options, errorCode);
 695     }
 696 }
 697
 698 U_CFUNC const UnicodeSet *
 699 unorm_getNX(int32_t options, UErrorCode *pErrorCode) {
 700     return getNX(options, *pErrorCode);
 701 }
 702
 703 static inline UBool
 704 nx_contains(const UnicodeSet *nx, UChar32 c) {
 705     return nx!=NULL && nx->contains(c);
 706 }
 707
 708 static inline UBool
 709 nx_contains(const UnicodeSet *nx, UChar c, UChar c2) {
 710     return nx!=NULL && nx->contains(c2==0 ? c : U16_GET_SUPPLEMENTARY(c, c2));
 711 }
 712
 713 /* other normalization primitives ------------------------------------------- */
 714
 715 /* get the canonical or compatibility decomposition for one character */
     /*
      * norm32:  trie value whose extra-data index locates the decomposition data
      * qcMask:  the compatibility data is used only if _NORM_QC_NFKD is set in
      *          both norm32 and qcMask
      * length:  out, number of UChars in the returned decomposition
      * cc, trailCC: out, lead and trail combining classes of the decomposition
      * Returns a pointer to the first UChar of the decomposition.
      */
 716 static inline const UChar *
 717 _decompose(uint32_t norm32, uint32_t qcMask, int32_t &length,
 718            uint8_t &cc, uint8_t &trailCC) {
 719     const UChar *p=(const UChar *)_getExtraData(norm32);
         /* the first unit is the length word; flags and length are separated below */
 720     length=*p++;
 721
 722     if((norm32&qcMask&_NORM_QC_NFKD)!=0 && length>=0x100) {
 723         /* use compatibility decomposition, skip canonical data */
             /* skip one unit if bit 7 of the length word is set, plus the canonical UChars */
             /* NOTE(review): bit 7 is presumably _NORM_DECOMP_FLAG_LENGTH_HAS_CC, i.e. the canonical cc unit - confirm */
 724         p+=((length>>7)&1)+(length&_NORM_DECOMP_LENGTH_MASK);
             /* from here on, the upper byte of the length word is used instead */
 725         length>>=8;
 726     }
 727
 728     if(length&_NORM_DECOMP_FLAG_LENGTH_HAS_CC) {
 729         /* get the lead and trail cc's */
 730         UChar bothCCs=*p++;
 731         cc=(uint8_t)(bothCCs>>8);
 732         trailCC=(uint8_t)bothCCs;
 733     } else {
 734         /* lead and trail cc's are both 0 */
 735         cc=trailCC=0;
 736     }
 737
         /* strip the flag bits so that only the length remains */
 738     length&=_NORM_DECOMP_LENGTH_MASK;
 739     return p;
 740 }
 741
 742 /* get the canonical decomposition for one character */
 743 static inline const UChar *
 744 _decompose(uint32_t norm32, int32_t &length,
 745            uint8_t &cc, uint8_t &trailCC) {
 746     const UChar *p=(const UChar *)_getExtraData(norm32);
 747     length=*p++;
 748
 749     if(length&_NORM_DECOMP_FLAG_LENGTH_HAS_CC) {
 750         /* get the lead and trail cc's */
 751         UChar bothCCs=*p++;
 752         cc=(uint8_t)(bothCCs>>8);
 753         trailCC=(uint8_t)bothCCs;
 754     } else {
 755         /* lead and trail cc's are both 0 */
 756         cc=trailCC=0;
 757     }
 758
 759     length&=_NORM_DECOMP_LENGTH_MASK;
 760     return p;
 761 }
 762
 763 /**
 764  * Get the canonical decomposition for one code point.
 765  * @param c code point
 766  * @param buffer out-only buffer for algorithmic decompositions of Hangul
 767  * @param length out-only, takes the length of the decomposition, if any
 768  * @return pointer to decomposition, or 0 if none
 769  * @internal
 770  */
 771 U_CFUNC const UChar *
 772 unorm_getCanonicalDecomposition(UChar32 c, UChar buffer[4], int32_t *pLength) {
 773     uint32_t norm32;
 774
 775     if(c<indexes[_NORM_INDEX_MIN_NFD_NO_MAYBE]) {
 776         /* trivial case */
 777         return NULL;
 778     }
 779
 780     UTRIE_GET32(&normTrie, c, norm32);
 781     if(norm32&_NORM_QC_NFD) {
 782         if(isNorm32HangulOrJamo(norm32)) {
 783             /* Hangul syllable: decompose algorithmically */
 784             UChar c2;
 785
 786             c-=HANGUL_BASE;
 787
 788             c2=(UChar)(c%JAMO_T_COUNT);
 789             c/=JAMO_T_COUNT;
 790             if(c2>0) {
 791                 buffer[2]=(UChar)(JAMO_T_BASE+c2);
 792                 *pLength=3;
 793             } else {
 794                 *pLength=2;
 795             }
 796
 797             buffer[1]=(UChar)(JAMO_V_BASE+c%JAMO_V_COUNT);
 798             buffer[0]=(UChar)(JAMO_L_BASE+c/JAMO_V_COUNT);
 799             return buffer;
 800         } else {
 801             /* normal decomposition */
 802             uint8_t cc, trailCC;
 803             return _decompose(norm32, *pLength, cc, trailCC);
 804         }
 805     } else {
 806         return 0;
 807     }
 808 }
 809
 810 /*
 811  * get the combining class of (c, c2)=*p++
 812  * before: p<limit  after: p<=limit
 813  * if only one code unit is used, then c2==0
 814  */
 815 static inline uint8_t
 816 _getNextCC(const UChar *&p, const UChar *limit, UChar &c, UChar &c2) {
 817     uint32_t norm32;
 818
 819     c=*p++;
 820     norm32=_getNorm32(c);
 821     if((norm32&_NORM_CC_MASK)==0) {
 822         c2=0;
 823         return 0;
 824     } else {
 825         if(!isNorm32LeadSurrogate(norm32)) {
 826             c2=0;
 827         } else {
 828             /* c is a lead surrogate, get the real norm32 */
 829             if(p!=limit && UTF_IS_SECOND_SURROGATE(c2=*p)) {
 830                 ++p;
 831                 norm32=_getNorm32FromSurrogatePair(norm32, c2);
 832             } else {
 833                 c2=0;
 834                 return 0;
 835             }
 836         }
 837
 838         return (uint8_t)(norm32>>_NORM_CC_SHIFT);
 839     }
 840 }
 841
 842 /*
 843  * read backwards and get norm32
 844  * return 0 if the character is <minC
 845  * if c2!=0 then (c2, c) is a surrogate pair (reversed - c2 is first surrogate but read second!)
 846  */
     /*
      * start: beginning of the text; src is never moved before it
      * src:   in/out, decremented by one code unit, or by two for a surrogate pair
      * minC:  code units below this value yield 0 without a trie lookup
      * mask:  bits of the lead surrogate's norm32 that must be set for the pair
      *        to be looked up; otherwise 0 is returned for the pair
      * c, c2: out, the code unit(s) that were read
      * Unpaired surrogates yield 0.
      */
 847 static inline uint32_t
 848 _getPrevNorm32(const UChar *start, const UChar *&src,
 849                uint32_t minC, uint32_t mask,
 850                UChar &c, UChar &c2) {
 851     uint32_t norm32;
 852
 853     c=*--src;
 854     c2=0;
 855
 856     /* check for a surrogate before getting norm32 to see if we need to predecrement further */
 857     if(c<minC) {
 858         return 0;
 859     } else if(!UTF_IS_SURROGATE(c)) {
 860         return _getNorm32(c);
 861     } else if(UTF_IS_SURROGATE_FIRST(c)) {
 862         /* unpaired first surrogate */
 863         return 0;
 864     } else if(src!=start && UTF_IS_FIRST_SURROGATE(c2=*(src-1))) {
             /* c is a second surrogate preceded by a first surrogate: consume that one too */
 865         --src;
 866         norm32=_getNorm32(c2);
 867
 868         if((norm32&mask)==0) {
 869             /* all surrogate pairs with this lead surrogate have only irrelevant data */
 870             return 0;
 871         } else {
 872             /* norm32 must be a surrogate special */
 873             return _getNorm32FromSurrogatePair(norm32, c);
 874         }
 875     } else {
 876         /* unpaired second surrogate */
             /* c2 may have been assigned in the condition above, so it is reset */
 877         c2=0;
 878         return 0;
 879     }
 880 }
 881
 882 /*
 883  * get the combining class of (c, c2)=*--p
 884  * before: start<p  after: start<=p
 885  */
 886 static inline uint8_t
 887 _getPrevCC(const UChar *start, const UChar *&p) {
 888     UChar c, c2;
 889
 890     return (uint8_t)(_getPrevNorm32(start, p, _NORM_MIN_WITH_LEAD_CC, _NORM_CC_MASK, c, c2)>>_NORM_CC_SHIFT);
 891 }
 892
 893 /*
 894  * is this a safe boundary character for NF*D?
 895  * (lead cc==0)
      *
      * norm32        the norm32 value of the character to test
      * ccOrQCMask    bits to test first; if none of them is set in norm32
      *               then the character is safe without further inspection
      * decompQCMask  bit(s) that are set in norm32 if the character decomposes
      *
      * returns TRUE if none of the ccOrQCMask bits are set, or
      *         if the lead combining class reported by _decompose() is 0, or
      *         (no regular decomposition) if the cc bits of norm32 itself are 0
 896  */
 897 static inline UBool
 898 _isNFDSafe(uint32_t norm32, uint32_t ccOrQCMask, uint32_t decompQCMask) {
 899     if((norm32&ccOrQCMask)==0) {
 900         return TRUE; /* cc==0 and no decomposition: this is NF*D safe */
 901     }
 902
 903     /* inspect its decomposition - maybe a Hangul but not a surrogate here */
 904     if(isNorm32Regular(norm32) && (norm32&decompQCMask)!=0) {
 905         int32_t length;
 906         uint8_t cc, trailCC;
 907
 908         /* decomposes, get everything from the variable-length extra data */
         /* only the lead cc is needed; the returned pointer, length and trailCC are ignored */
 909         _decompose(norm32, decompQCMask, length, cc, trailCC);
 910         return cc==0;
 911     } else {
 912         /* no decomposition (or Hangul), test the cc directly */
 913         return (norm32&_NORM_CC_MASK)==0;
 914     }
 915 }
 916
 917 /*
 918  * is this (or does its decomposition begin with) a "true starter"?
 919  * (cc==0 and NF*C_YES)
      *
      * norm32        the norm32 value of the character to test
      * ccOrQCMask    combining class bits plus quick check bits; if none of them
      *               is set in norm32 then the character itself is a true starter
      * decompQCMask  bit(s) that are set in norm32 if the character decomposes
      *
      * returns TRUE if the character is a true starter, or if it decomposes,
      *         the decomposition has a lead cc of 0, and the first character of
      *         the decomposition has none of the quick check bits set
      * returns FALSE otherwise
 920  */
 921 static inline UBool
 922 _isTrueStarter(uint32_t norm32, uint32_t ccOrQCMask, uint32_t decompQCMask) {
 923     if((norm32&ccOrQCMask)==0) {
 924         return TRUE; /* this is a true starter (could be Hangul or Jamo L) */
 925     }
 926
 927     /* inspect its decomposition - not a Hangul or a surrogate here */
 928     if((norm32&decompQCMask)!=0) {
 929         const UChar *p;
 930         int32_t length;
 931         uint8_t cc, trailCC;
 932
 933         /* decomposes, get everything from the variable-length extra data */
 934         p=_decompose(norm32, decompQCMask, length, cc, trailCC);
 935         if(cc==0) {
             /* keep only the quick check part of the mask for testing the first decomposition character */
 936             uint32_t qcMask=ccOrQCMask&_NORM_QC_MASK;
 937
 938             /* does it begin with NFC_YES? */
 939             if((_getNorm32(p, qcMask)&qcMask)==0) {
 940                 /* yes, the decomposition begins with a true starter */
 941                 return TRUE;
 942             }
 943         }
 944     }
 945     return FALSE;
 946 }
 947
 948 /* uchar.h */
     /*
      * Returns the combining class of c: the normTrie value for c,
      * shifted down by _NORM_CC_SHIFT and truncated to 8 bits.
      * If the data is not hardcoded (UNORM_HARDCODE_DATA==0) and _haveData()
      * fails, then 0 is returned; the error code is local and not reported.
      */
 949 U_CAPI uint8_t U_EXPORT2
 950 u_getCombiningClass(UChar32 c) {
 951 #if !UNORM_HARDCODE_DATA
 952     UErrorCode errorCode=U_ZERO_ERROR;
 953     if(_haveData(errorCode)) {
 954 #endif
 955         uint32_t norm32;
 956
 957         UTRIE_GET32(&normTrie, c, norm32);
 958         return (uint8_t)(norm32>>_NORM_CC_SHIFT);
 959 #if !UNORM_HARDCODE_DATA
 960     } else {
 961         return 0;
 962     }
 963 #endif
 964 }
 965
 966 U_CAPI UBool U_EXPORT2
 967 unorm_internalIsFullCompositionExclusion(UChar32 c) {
         /*
          * Returns TRUE if the auxTrie value for c has the _NORM_AUX_COMP_EX_MASK bit set.
          * Returns FALSE if auxTrie has no index or, when the data is not hardcoded,
          * if _haveData() fails; the error code is local and not reported.
          */
 968 #if UNORM_HARDCODE_DATA
 969     if(auxTrie.index!=NULL) {
 970 #else
 971     UErrorCode errorCode=U_ZERO_ERROR;
 972     if(_haveData(errorCode) && auxTrie.index!=NULL) {
 973 #endif
 974         uint16_t aux;
 975
 976         UTRIE_GET16(&auxTrie, c, aux);
 977         return (UBool)((aux&_NORM_AUX_COMP_EX_MASK)!=0);
 978     } else {
 979         return FALSE;
 980     }
 981 }
 982
 983 U_CAPI UBool U_EXPORT2
 984 unorm_isCanonSafeStart(UChar32 c) {
         /*
          * Returns TRUE if the auxTrie value for c does not have the _NORM_AUX_UNSAFE_MASK bit set.
          * Returns FALSE (i.e., "not safe") if the bit is set, if auxTrie has no index or,
          * when the data is not hardcoded, if _haveData() fails;
          * the error code is local and not reported.
          */
 985 #if UNORM_HARDCODE_DATA
 986     if(auxTrie.index!=NULL) {
 987 #else
 988     UErrorCode errorCode=U_ZERO_ERROR;
 989     if(_haveData(errorCode) && auxTrie.index!=NULL) {
 990 #endif
 991         uint16_t aux;
 992
 993         UTRIE_GET16(&auxTrie, c, aux);
 994         return (UBool)((aux&_NORM_AUX_UNSAFE_MASK)==0);
 995     } else {
 996         return FALSE;
 997     }
 998 }
 999
1000 U_CAPI void U_EXPORT2
1001 unorm_getUnicodeVersion(UVersionInfo *versionInfo, UErrorCode *pErrorCode){
         /*
          * Copies the 4 bytes of dataVersion into *versionInfo.
          * If unorm_haveData() returns FALSE then *versionInfo is left untouched.
          */
1002     if(unorm_haveData(pErrorCode)){
1003         uprv_memcpy(*versionInfo, dataVersion, 4);
1004     }
1005 }
1006
1007
1008 U_CAPI UBool U_EXPORT2
1009 unorm_getCanonStartSet(UChar32 c, USerializedSet *fillSet) {
         /*
          * Looks up c in the canonStartSets search tables and sets fillSet
          * according to the entry that was found.
          *
          * Returns TRUE if fillSet was set to a single-code point set, or
          * the result of uset_getSerializedSet() if the entry is an index to a serialized set.
          * Returns FALSE if fillSet is NULL, if c is not in 0..0x10ffff,
          * if the data is not available, or if there is no entry for c.
          */
1010 #if !UNORM_HARDCODE_DATA
1011     UErrorCode errorCode=U_ZERO_ERROR;
1012 #endif
         /* the unsigned comparison also rejects negative c */
1013     if( fillSet!=NULL && (uint32_t)c<=0x10ffff &&
1014 #if !UNORM_HARDCODE_DATA
1015         _haveData(errorCode) &&
1016 #endif
1017         canonStartSets!=NULL
1018     ) {
1019         const uint16_t *table;
1020         int32_t i, start, limit;
1021
1022         /*
1023          * binary search for c
1024          *
1025          * There are two search tables,
1026          * one for BMP code points and one for supplementary ones.
1027          * See unormimp.h for details.
1028          */
1029         if(c<=0xffff) {
                 /* the BMP table follows the serialized sets */
1030             table=canonStartSets+canonStartSets[_NORM_SET_INDEX_CANON_SETS_LENGTH];
1031             start=0;
1032             limit=canonStartSets[_NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH];
1033
1034             /* each entry is a pair { c, result } */
                 /* narrow [start..limit[ until it spans at most one pair */
1035             while(start<limit-2) {
1036                 i=(uint16_t)(((start+limit)/4)*2); /* (start+limit)/2 and address pairs */
1037                 if(c<table[i]) {
1038                     limit=i;
1039                 } else {
1040                     start=i;
1041                 }
1042             }
1043
1044             /* found? */
1045             if(c==table[start]) {
1046                 i=table[start+1];
1047                 if((i&_NORM_CANON_SET_BMP_MASK)==_NORM_CANON_SET_BMP_IS_INDEX) {
1048                     /* result 01xxxxxx xxxxxx contains index x to a USerializedSet */
1049                     i&=(_NORM_MAX_CANON_SETS-1);
1050                     return uset_getSerializedSet(fillSet,
1051                                             canonStartSets+i,
1052                                             canonStartSets[_NORM_SET_INDEX_CANON_SETS_LENGTH]-i);
1053                 } else {
1054                     /* other result values are BMP code points for single-code point sets */
1055                     uset_setSerializedToOne(fillSet, (UChar32)i);
1056                     return TRUE;
1057                 }
1058             }
1059         } else {
1060             uint16_t high, low, h;
1061
                 /* the supplementary table follows the serialized sets and the BMP table */
1062             table=canonStartSets+canonStartSets[_NORM_SET_INDEX_CANON_SETS_LENGTH]+
1063                                  canonStartSets[_NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH];
1064             start=0;
1065             limit=canonStartSets[_NORM_SET_INDEX_CANON_SUPP_TABLE_LENGTH];
1066
                 /* split c into its upper bits and its lower 16 bits for comparing with the table entries */
1067             high=(uint16_t)(c>>16);
1068             low=(uint16_t)c;
1069
1070             /* each entry is a triplet { high(c), low(c), result } */
                 /* narrow [start..limit[ until it spans at most one triplet */
1071             while(start<limit-3) {
1072                 i=(uint16_t)(((start+limit)/6)*3); /* (start+limit)/2 and address triplets */
1073                 h=table[i]&0x1f; /* high word */
1074                 if(high<h || (high==h && low<table[i+1])) {
1075                     limit=i;
1076                 } else {
1077                     start=i;
1078                 }
1079             }
1080
1081             /* found? */
                 /* keep all of the first unit in h: its upper bits carry flags and result bits, see below */
1082             h=table[start];
1083             if(high==(h&0x1f) && low==table[start+1]) {
1084                 i=table[start+2];
1085                 if((h&0x8000)==0) {
1086                     /* the result is an index to a USerializedSet */
1087                     return uset_getSerializedSet(fillSet,
1088                                             canonStartSets+i,
1089                                             canonStartSets[_NORM_SET_INDEX_CANON_SETS_LENGTH]-i);
1090                 } else {
1091                     /*
1092                      * single-code point set {x} in
1093                      * triplet { 100xxxxx 000hhhhh  llllllll llllllll  xxxxxxxx xxxxxxxx }
1094                      */
1095                     i|=((int32_t)h&0x1f00)<<8; /* add high bits from high(c) */
1096                     uset_setSerializedToOne(fillSet, (UChar32)i);
1097                     return TRUE;
1098                 }
1099             }
1100         }
1101     }
1102
1103     return FALSE; /* not found */
1104 }
1105
1106 U_CAPI int32_t U_EXPORT2
1107 u_getFC_NFKC_Closure(UChar32 c, UChar *dest, int32_t destCapacity, UErrorCode *pErrorCode) {
         /*
          * Writes the string that the auxTrie/extraData store for c under
          * _NORM_AUX_FNC_MASK to dest and returns its length.
          *
          * The string is copied only if it fits into destCapacity;
          * the return value and the final *pErrorCode come from u_terminateUChars().
          * Returns 0 without writing if pErrorCode is NULL or already indicates a failure,
          * if the arguments are invalid (sets U_ILLEGAL_ARGUMENT_ERROR),
          * or if the data or the auxTrie is not available.
          */
1108     uint16_t aux;
1109
1110     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
1111         return 0;
1112     }
1113     if(destCapacity<0 || (dest==NULL && destCapacity>0)) {
1114         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1115         return 0;
1116     }
1117     if(!_haveData(*pErrorCode) || auxTrie.index==NULL) {
1118         return 0;
1119     }
1120
1121     UTRIE_GET16(&auxTrie, c, aux);
1122     aux&=_NORM_AUX_FNC_MASK;
1123     if(aux!=0) {
1124         const UChar *s;
1125         int32_t length;
1126
             /* the masked aux value is used as an offset into extraData */
1127         s=(const UChar *)(extraData+aux);
1128         if(*s<0xff00) {
1129             /* s points to the single-unit string */
1130             length=1;
1131         } else {
                 /* the first unit is 0xffxx: its low byte is the length, the string follows it */
1132             length=*s&0xff;
1133             ++s;
1134         }
1135         if(0<length && length<=destCapacity) {
1136             uprv_memcpy(dest, s, length*U_SIZEOF_UCHAR);
1137         }
1138         return u_terminateUChars(dest, destCapacity, length, pErrorCode);
1139     } else {
1140         return u_terminateUChars(dest, destCapacity, 0, pErrorCode);
1141     }
1142 }
1143
1144 /* Is c an NF<mode>-skippable code point? See unormimp.h. */
     /*
      * Returns TRUE for UNORM_NONE, FALSE for modes that are not handled in the switch,
      * and FALSE if the data (or, for some tests, the fcdTrie/auxTrie) is not available.
      * For the NF* modes, a mode-specific mask is applied to the norm32 of c;
      * NF*C modes need additional tests for characters with a canonical decomposition.
      */
1145 U_CAPI UBool U_EXPORT2
1146 unorm_isNFSkippable(UChar32 c, UNormalizationMode mode) {
1147     uint32_t norm32, mask;
1148     uint16_t aux, fcd;
1149
1150 #if !UNORM_HARDCODE_DATA
1151     UErrorCode errorCode=U_ZERO_ERROR;
1152     if(!_haveData(errorCode)) {
1153         return FALSE;
1154     }
1155 #endif
1156
1157     /* handle trivial cases; set the comparison mask for the normal ones */
1158     switch(mode) {
1159     case UNORM_NONE:
1160         return TRUE;
1161     case UNORM_NFD:
1162         mask=_NORM_CC_MASK|_NORM_QC_NFD;
1163         break;
1164     case UNORM_NFKD:
1165         mask=_NORM_CC_MASK|_NORM_QC_NFKD;
1166         break;
1167     case UNORM_NFC:
1168     /* case UNORM_FCC: */
1169         mask=_NORM_CC_MASK|_NORM_COMBINES_ANY|(_NORM_QC_NFC&_NORM_QC_ANY_NO);
1170         break;
1171     case UNORM_NFKC:
1172         mask=_NORM_CC_MASK|_NORM_COMBINES_ANY|(_NORM_QC_NFKC&_NORM_QC_ANY_NO);
1173         break;
1174     case UNORM_FCD:
1175         /* FCD: skippable if lead cc==0 and trail cc<=1 */
1176         if(fcdTrie.index!=NULL) {
1177             UTRIE_GET16(&fcdTrie, c, fcd);
1178             return fcd<=1;
1179         } else {
1180             return FALSE;
1181         }
1182     default:
1183         return FALSE;
1184     }
1185
1186     /* check conditions (a)..(e), see unormimp.h */
1187     UTRIE_GET32(&normTrie, c, norm32);
1188     if((norm32&mask)!=0) {
1189         return FALSE; /* fails (a)..(e), not skippable */
1190     }
1191
         /* only NFD and NFKD remain below UNORM_NFC at this point; UNORM_NONE returned above */
1192     if(mode<UNORM_NFC) {
1193         return TRUE; /* NF*D, passed (a)..(c), is skippable */
1194     }
1195
1196     /* NF*C/FCC, passed (a)..(e) */
1197     if((norm32&_NORM_QC_NFD)==0) {
1198         return TRUE; /* no canonical decomposition, is skippable */
1199     }
1200
1201     /* check Hangul syllables algorithmically */
1202     if(isNorm32HangulOrJamo(norm32)) {
1203         /* Jamo passed (a)..(e) above, must be Hangul */
1204         return !isHangulWithoutJamoT((UChar)c); /* LVT are skippable, LV are not */
1205     }
1206
1207     /* if(mode<=UNORM_NFKC) { -- enable when implementing FCC */
1208     /* NF*C, test (f) flag */
1209     if(!formatVersion_2_2 || auxTrie.index==NULL) {
1210         return FALSE; /* no (f) data, say not skippable to be safe */
1211     }
1212
1213     UTRIE_GET16(&auxTrie, c, aux);
1214     return (aux&_NORM_AUX_NFC_SKIP_F_MASK)==0; /* TRUE=skippable if the (f) flag is not set */
1215
1216     /* } else { FCC, test fcd<=1 instead of the above } */
1217 }
1218
1219 U_CAPI void U_EXPORT2
1220 unorm_addPropertyStarts(const USetAdder *sa, UErrorCode *pErrorCode) {
         /*
          * Adds range start code points to the set behind sa:
          * the starts enumerated from normTrie and, if present, fcdTrie and auxTrie,
          * plus the boundaries that are needed for Hangul syllables.
          * Does nothing if _haveData() fails; pErrorCode is dereferenced without a NULL check.
          */
1221     UChar c;
1222
1223     if(!_haveData(*pErrorCode)) {
1224         return;
1225     }
1226
1227     /* add the start code point of each same-value range of each trie */
1228     utrie_enum(&normTrie, NULL, _enumPropertyStartsRange, sa);
1229     if(fcdTrie.index!=NULL) {
1230         utrie_enum(&fcdTrie, NULL, _enumPropertyStartsRange, sa);
1231     }
1232     if(auxTrie.index!=NULL) {
1233         utrie_enum(&auxTrie, NULL, _enumPropertyStartsRange, sa);
1234     }
1235
1236     /* add Hangul LV syllables and LV+1 because of skippables */
         /* stepping by JAMO_T_COUNT visits one syllable per block of JAMO_T_COUNT code points */
1237     for(c=HANGUL_BASE; c<HANGUL_BASE+HANGUL_COUNT; c+=JAMO_T_COUNT) {
1238         sa->add(sa->set, c);
1239         sa->add(sa->set, c+1);
1240     }
1241     sa->add(sa->set, HANGUL_BASE+HANGUL_COUNT); /* add Hangul+1 to continue with other properties */
1242 }
1243
1244 U_CAPI UNormalizationCheckResult U_EXPORT2
1245 unorm_getQuickCheck(UChar32 c, UNormalizationMode mode) {
         /*
          * Returns the quick check value of c for the given mode:
          * UNORM_YES if none of the mode's quick check bits are set in the norm32 of c,
          * UNORM_NO if one of the _NORM_QC_ANY_NO bits is set, otherwise UNORM_MAYBE.
          *
          * Modes with a 0 mask in the table below always yield UNORM_YES.
          * UNORM_YES is also returned if the data is not available, and
          * for a mode value outside of the table.
          */
1246     static const uint32_t qcMask[UNORM_MODE_COUNT]={
1247         0, 0, _NORM_QC_NFD, _NORM_QC_NFKD, _NORM_QC_NFC, _NORM_QC_NFKC
1248     };
1249
1250     uint32_t norm32;
1251
         /*
          * do not index qcMask[] with an out-of-range mode;
          * the unsigned comparison also catches negative values
          */
         if((uint32_t)mode>=(uint32_t)UNORM_MODE_COUNT) {
             return UNORM_YES;
         }

1252 #if !UNORM_HARDCODE_DATA
1253     UErrorCode errorCode=U_ZERO_ERROR;
1254     if(!_haveData(errorCode)) {
1255         return UNORM_YES;
1256     }
1257 #endif
1258
1259     UTRIE_GET32(&normTrie, c, norm32);
1260     norm32&=qcMask[mode];
1261
1262     if(norm32==0) {
1263         return UNORM_YES;
1264     } else if(norm32&_NORM_QC_ANY_NO) {
1265         return UNORM_NO;
1266     } else /* _NORM_QC_ANY_MAYBE */ {
1267         return UNORM_MAYBE;
1268     }
1269 }
1270
1271 U_CAPI uint16_t U_EXPORT2
1272 unorm_getFCD16FromCodePoint(UChar32 c) {
         /*
          * Returns the 16-bit fcdTrie value for c.
          * Returns 0 if fcdTrie has no index or, when the data is not hardcoded,
          * if _haveData() fails; the error code is local and not reported.
          */
1273     UErrorCode errorCode;
1274     uint16_t fcd;
1275
1276     errorCode=U_ZERO_ERROR;
1277     if(
1278 #if !UNORM_HARDCODE_DATA
1279         !_haveData(errorCode) ||
1280 #endif
1281         fcdTrie.index==NULL
1282     ) {
1283         return 0;
1284     }
1285
1286     UTRIE_GET16(&fcdTrie, c, fcd);
1287     return fcd;
1288 }
1289
1290 /* reorder UTF-16 in-place -------------------------------------------------- */
1291
1292 /*
1293  * simpler, single-character version of _mergeOrdered() -
1294  * bubble-insert one single code point into the preceding string
1295  * which is already canonically ordered
1296  * (c, c2) may or may not yet have been inserted at [current..p[
1297  *
1298  * it must be p=current+lengthof(c, c2) i.e. p=current+(c2==0 ? 1 : 2)
1299  *
1300  * before: [start..current[ is already ordered, and
1301  *         [current..p[     may or may not hold (c, c2) but
1302  *                          must be exactly the same length as (c, c2)
1303  * after: [start..p[ is ordered
1304  *
      * start    beginning of the already ordered text; the search does not go below it
      * current  end of the ordered text, where (c, c2) goes if nothing needs to move
      * p        limit of the slot for (c, c2), see above
      * c, c2    the code point to insert; c2==0 if it is a single code unit
      * cc       the combining class of (c, c2); nothing is moved if cc==0
      *
1305  * returns the trailing combining class
      * (cc itself, or the cc of the last preceding code point if (c, c2) was moved before it)
1306  */
1307 static uint8_t
1308 _insertOrdered(const UChar *start, UChar *current, UChar *p,
1309                UChar c, UChar c2, uint8_t cc) {
1310     const UChar *pBack, *pPreBack;
1311     UChar *r;
1312     uint8_t prevCC, trailCC=cc;
1313
1314     if(start<current && cc!=0) {
1315         /* search for the insertion point where cc>=prevCC */
1316         pPreBack=pBack=current;
1317         prevCC=_getPrevCC(start, pPreBack);
1318         if(cc<prevCC) {
1319             /* this will be the last code point, so keep its cc */
1320             trailCC=prevCC;
1321             pBack=pPreBack;
1322             while(start<pPreBack) {
1323                 prevCC=_getPrevCC(start, pPreBack);
1324                 if(cc>=prevCC) {
1325                     break;
1326                 }
1327                 pBack=pPreBack;
1328             }
1329
1330             /*
1331              * this is where we are right now with all these pointers:
1332              * [start..pPreBack[ 0..? code points that we can ignore
1333              * [pPreBack..pBack[ 0..1 code points with prevCC<=cc
1334              * [pBack..current[  0..n code points with >cc, move up to insert (c, c2)
1335              * [current..p[         1 code point (c, c2) with cc
1336              */
1337
1338             /* move the code units in between up */
                 /* copy backwards from the end; afterwards current==pBack is the insertion point */
1339             r=p;
1340             do {
1341                 *--r=*--current;
1342             } while(pBack!=current);
1343         }
1344     }
1345
1346     /* insert (c, c2) */
1347     *current=c;
1348     if(c2!=0) {
1349         *(current+1)=c2;
1350     }
1351
1352     /* we know the cc of the last code point */
1353     return trailCC;
1354 }
1355
1356 /*
1357  * merge two UTF-16 string parts together
1358  * to canonically order (order by combining classes) their concatenation
1359  *
1360  * the two strings may already be adjacent, so that the merging is done in-place
1361  * if the two strings are not adjacent, then the buffer holding the first one
1362  * must be large enough
1363  * the second string may or may not be ordered in itself
1364  *
1365  * before: [start..current[ is already ordered, and
1366  *         [next..limit[    may be ordered in itself, but
1367  *                          is not in relation to [start..current[
1368  * after: [start..current+(limit-next)[ is ordered
1369  *
1370  * the algorithm is a simple bubble-sort that takes the characters from *next++
1371  * and inserts them in correct combining class order into the preceding part
1372  * of the string
1373  *
1374  * since this function is called much less often than the single-code point
1375  * _insertOrdered(), it just uses that for easier maintenance
1376  * (see file version from before 2001aug31 for a more optimized version)
1377  *
      * isOrdered: if TRUE (default), the insertion loop stops at the first code point
      *            of the second part with cc==0 and the rest is taken over as is;
      *            if FALSE, the loop continues to the end of the second part and
      *            restarts the ordered part after each code point with cc==0
      *
1378  * returns the trailing combining class
1379  */
1380 static uint8_t
1381 _mergeOrdered(UChar *start, UChar *current,
1382               const UChar *next, const UChar *limit, UBool isOrdered=TRUE) {
1383     UChar *r;
1384     UChar c, c2;
1385     uint8_t cc, trailCC=0;
1386     UBool adjacent;
1387
         /* adjacent: the second part directly follows the first one, so unmoved text need not be copied */
1388     adjacent= current==next;
1389
         /* with an empty first part and an ordered second part, there is nothing to insert */
1390     if(start!=current || !isOrdered) {
1391         while(next<limit) {
1392             cc=_getNextCC(next, limit, c, c2);
1393             if(cc==0) {
1394                 /* does not bubble back */
1395                 trailCC=0;
1396                 if(adjacent) {
1397                     current=(UChar *)next;
1398                 } else {
1399                     *current++=c;
1400                     if(c2!=0) {
1401                         *current++=c2;
1402                     }
1403                 }
1404                 if(isOrdered) {
1405                     break;
1406                 } else {
1407                     start=current;
1408                 }
1409             } else {
1410                 r=current+(c2==0 ? 1 : 2);
1411                 trailCC=_insertOrdered(start, current, r, c, c2, cc);
1412                 current=r;
1413             }
1414         }
1415     }
1416
1417     if(next==limit) {
1418         /* we know the cc of the last code point */
1419         return trailCC;
1420     } else {
1421         if(!adjacent) {
1422             /* copy the second string part */
1423             do {
1424                 *current++=*next++;
1425             } while(next!=limit);
1426             limit=current;
1427         }
             /* the trailing cc is the cc of the last code point before limit */
1428         return _getPrevCC(start, limit);
1429     }
1430 }
1431
1432 /* find the last true starter in [start..src[ and return the pointer to it */
     /*
      * moves src back one code point at a time with _getPrevNorm32() until
      * _isTrueStarter() returns TRUE for the code point that was just read;
      * returns start if no true starter is found
      */
1433 static const UChar *
1434 _findPreviousStarter(const UChar *start, const UChar *src,
1435                      uint32_t ccOrQCMask, uint32_t decompQCMask, UChar minNoMaybe) {
1436     uint32_t norm32;
1437     UChar c, c2;
1438
1439     while(start<src) {
1440         norm32=_getPrevNorm32(start, src, minNoMaybe, ccOrQCMask|decompQCMask, c, c2);
1441         if(_isTrueStarter(norm32, ccOrQCMask, decompQCMask)) {
1442             break;
1443         }
1444     }
1445     return src;
1446 }
1447
1448 /* find the first true starter in [src..limit[ and return the pointer to it */
     /*
      * returns limit if the end of the string is reached first;
      * code units below minNoMaybe (including a NUL) and unpaired lead surrogates
      * count as true starters
      *
      * qcMask        quick check bits to test; combined with _NORM_CC_MASK for the first test
      * decompQCMask  bit(s) that are set in a norm32 if the character decomposes
      */
1449 static const UChar *
1450 _findNextStarter(const UChar *src, const UChar *limit,
1451                  uint32_t qcMask, uint32_t decompQCMask, UChar minNoMaybe) {
1452     const UChar *p;
1453     uint32_t norm32, ccOrQCMask;
1454     int32_t length;
1455     UChar c, c2;
1456     uint8_t cc, trailCC;
1457
1458     ccOrQCMask=_NORM_CC_MASK|qcMask;
1459
1460     for(;;) {
1461         if(src==limit) {
1462             break; /* end of string */
1463         }
1464         c=*src;
1465         if(c<minNoMaybe) {
1466             break; /* catches NUL terminator, too */
1467         }
1468
1469         norm32=_getNorm32(c);
1470         if((norm32&ccOrQCMask)==0) {
1471             break; /* true starter */
1472         }
1473
1474         if(isNorm32LeadSurrogate(norm32)) {
1475             /* c is a lead surrogate, get the real norm32 */
1476             if((src+1)==limit || !UTF_IS_SECOND_SURROGATE(c2=*(src+1))) {
1477                 break; /* unmatched first surrogate: counts as a true starter */
1478             }
1479             norm32=_getNorm32FromSurrogatePair(norm32, c2);
1480
1481             if((norm32&ccOrQCMask)==0) {
1482                 break; /* true starter */
1483             }
1484         } else {
                 /* c2==0 marks a single code unit for the src increment below */
1485             c2=0;
1486         }
1487
1488         /* (c, c2) is not a true starter but its decomposition may be */
1489         if(norm32&decompQCMask) {
1490             /* (c, c2) decomposes, get everything from the variable-length extra data */
1491             p=_decompose(norm32, decompQCMask, length, cc, trailCC);
1492
1493             /* get the first character's norm32 to check if it is a true starter */
1494             if(cc==0 && (_getNorm32(p, qcMask)&qcMask)==0) {
1495                 break; /* true starter */
1496             }
1497         }
1498
1499         src+= c2==0 ? 1 : 2; /* not a true starter, continue */
1500     }
1501
1502     return src;
1503 }
1504
1505 /* make NFD & NFKD ---------------------------------------------------------- */
1506
1507 U_CAPI int32_t U_EXPORT2
1508 unorm_getDecomposition(UChar32 c, UBool compat,
1509                        UChar *dest, int32_t destCapacity) {
         /*
          * Writes the decomposition of the single code point c to dest.
          *
          * compat selects the NFKD data (compat!=0) or the NFD data (compat==0).
          *
          * Return value:
          *   >0   c decomposes; the value is the length of the decomposition
          *   -1   c does not decompose and is a single code unit
          *   -2   c does not decompose and is a surrogate pair
          *    0   c is outside 0..0x10ffff, the data is not available, or
          *        dest/destCapacity are inconsistent (NULL dest with capacity>0, or capacity<0)
          *
          * dest is written only as far as destCapacity allows;
          * the output is not NUL-terminated by this function.
          */
1510 #if !UNORM_HARDCODE_DATA
1511     UErrorCode errorCode=U_ZERO_ERROR;
1512 #endif
1513     if( (uint32_t)c<=0x10ffff &&
1514 #if !UNORM_HARDCODE_DATA
1515         _haveData(errorCode) &&
1516 #endif
1517         ((dest!=NULL && destCapacity>0) || destCapacity==0)
1518     ) {
1519         uint32_t norm32, qcMask;
1520         UChar32 minNoMaybe;
1521         int32_t length;
1522
1523         /* initialize */
1524         if(!compat) {
1525             minNoMaybe=(UChar32)indexes[_NORM_INDEX_MIN_NFD_NO_MAYBE];
1526             qcMask=_NORM_QC_NFD;
1527         } else {
1528             minNoMaybe=(UChar32)indexes[_NORM_INDEX_MIN_NFKD_NO_MAYBE];
1529             qcMask=_NORM_QC_NFKD;
1530         }
1531
1532         if(c<minNoMaybe) {
1533             /* trivial case */
1534             if(destCapacity>0) {
1535                 dest[0]=(UChar)c;
1536             }
1537             return -1;
1538         }
1539
1540         /* data lookup */
1541         UTRIE_GET32(&normTrie, c, norm32);
1542         if((norm32&qcMask)==0) {
1543             /* simple case: no decomposition */
1544             if(c<=0xffff) {
1545                 if(destCapacity>0) {
1546                     dest[0]=(UChar)c;
1547                 }
1548                 return -1;
1549             } else {
1550                 if(destCapacity>=2) {
1551                     dest[0]=UTF16_LEAD(c);
1552                     dest[1]=UTF16_TRAIL(c);
1553                 }
1554                 return -2;
1555             }
1556         } else if(isNorm32HangulOrJamo(norm32)) {
1557             /* Hangul syllable: decompose algorithmically */
1558             UChar c2;
1559
1560             c-=HANGUL_BASE;
1561
                 /* c2: Jamo T index (0 if there is none); afterwards c is the combined L/V index */
1562             c2=(UChar)(c%JAMO_T_COUNT);
1563             c/=JAMO_T_COUNT;
1564             if(c2>0) {
1565                 if(destCapacity>=3) {
1566                     dest[2]=(UChar)(JAMO_T_BASE+c2);
1567                 }
1568                 length=3;
1569             } else {
1570                 length=2;
1571             }
1572
1573             if(destCapacity>=2) {
1574                 dest[1]=(UChar)(JAMO_V_BASE+c%JAMO_V_COUNT);
1575                 dest[0]=(UChar)(JAMO_L_BASE+c/JAMO_V_COUNT);
1576             }
1577             return length;
1578         } else {
1579             /* c decomposes, get everything from the variable-length extra data */
1580             const UChar *p, *limit;
1581             uint8_t cc, trailCC;
1582
1583             p=_decompose(norm32, qcMask, length, cc, trailCC);
1584             if(length<=destCapacity) {
                     /* the do-while copies at least one unit, i.e., it relies on length>0 */
1585                 limit=p+length;
1586                 do {
1587                     *dest++=*p++;
1588                 } while(p<limit);
1589             }
1590             return length;
1591         }
1592     } else {
1593         return 0;
1594     }
1595 }
1596
1597 static int32_t
1598 _decompose(UChar *dest, int32_t destCapacity,
1599            const UChar *src, int32_t srcLength,
1600            UBool compat, const UnicodeSet *nx,
1601            uint8_t &outTrailCC) {
         /*
          * Decomposes the string src into dest and canonically orders the result.
          *
          * dest, destCapacity  output buffer; nothing is written beyond destCapacity,
          *                     but the length keeps being counted (preflighting)
          * src, srcLength      input; srcLength<0 means that src is NUL-terminated
          * compat              selects the NFKD data (compat!=0) or the NFD data (compat==0)
          * nx                  characters for which nx_contains() is true are excluded:
          *                     they are copied as is and treated as having cc==0
          * outTrailCC          receives the combining class of the last code point of the result
          *
          * returns the length of the complete result, which may exceed destCapacity;
          * the output is not NUL-terminated by this function
          */
1602     UChar buffer[3];
1603     const UChar *limit, *prevSrc, *p;
1604     uint32_t norm32, ccOrQCMask, qcMask;
1605     int32_t destIndex, reorderStartIndex, length;
1606     UChar c, c2, minNoMaybe;
1607     uint8_t cc, prevCC, trailCC;
1608
1609     if(!compat) {
1610         minNoMaybe=(UChar)indexes[_NORM_INDEX_MIN_NFD_NO_MAYBE];
1611         qcMask=_NORM_QC_NFD;
1612     } else {
1613         minNoMaybe=(UChar)indexes[_NORM_INDEX_MIN_NFKD_NO_MAYBE];
1614         qcMask=_NORM_QC_NFKD;
1615     }
1616
1617     /* initialize */
1618     ccOrQCMask=_NORM_CC_MASK|qcMask;
1619     destIndex=reorderStartIndex=0;
1620     prevCC=0;
1621
1622     /* avoid compiler warnings */
1623     norm32=0;
1624     c=0;
1625     cc=0;
1626     trailCC=0;
1627
1628     if(srcLength>=0) {
1629         /* string with length */
1630         limit=src+srcLength;
1631     } else /* srcLength==-1 */ {
1632         /* zero-terminated string */
1633         limit=NULL;
1634     }
1635
1636     U_ALIGN_CODE(16);
1637
1638     for(;;) {
1639         /* count code units below the minimum or with irrelevant data for the quick check */
1640         prevSrc=src;
1641         if(limit==NULL) {
1642             while((c=*src)<minNoMaybe ? c!=0 : ((norm32=_getNorm32(c))&ccOrQCMask)==0) {
1643                 prevCC=0;
1644                 ++src;
1645             }
1646         } else {
1647             while(src!=limit && ((c=*src)<minNoMaybe || ((norm32=_getNorm32(c))&ccOrQCMask)==0)) {
1648                 prevCC=0;
1649                 ++src;
1650             }
1651         }
1652
1653         /* copy these code units all at once */
1654         if(src!=prevSrc) {
1655             length=(int32_t)(src-prevSrc);
1656             if((destIndex+length)<=destCapacity) {
1657                 uprv_memcpy(dest+destIndex, prevSrc, length*U_SIZEOF_UCHAR);
1658             }
1659             destIndex+=length;
                 /* all of these have cc==0: nothing after them is reordered to before this point */
1660             reorderStartIndex=destIndex;
1661         }
1662
1663         /* end of source reached? */
1664         if(limit==NULL ? c==0 : src==limit) {
1665             break;
1666         }
1667
1668         /* c already contains *src and norm32 is set for it, increment src */
1669         ++src;
1670
1671         /* check one above-minimum, relevant code unit */
1672         /*
1673          * generally, set p and length to the decomposition string
1674          * in simple cases, p==NULL and (c, c2) will hold the length code units to append
1675          * in all cases, set cc to the lead and trailCC to the trail combining class
1676          *
1677          * the following merge-sort of the current character into the preceding,
1678          * canonically ordered result text will use the optimized _insertOrdered()
1679          * if there is only one single code point to process;
1680          * this is indicated with p==NULL, and (c, c2) is the character to insert
1681          * ((c, 0) for a BMP character and (lead surrogate, trail surrogate)
1682          * for a supplementary character)
1683          * otherwise, p[length] is merged in with _mergeOrdered()
1684          */
1685         if(isNorm32HangulOrJamo(norm32)) {
1686             if(nx_contains(nx, c)) {
                     /*
                      * excluded: copy the syllable as is and treat it like a character with cc==0,
                      * same as the excluded case in the non-Hangul branch below;
                      * without this reset, cc and trailCC would still hold the values from the
                      * previous character, which could move the syllable in front of
                      * preceding combining marks or keep the reordering segment open across it
                      */
                     cc=trailCC=0;
1687                 c2=0;
1688                 p=NULL;
1689                 length=1;
1690             } else {
1691                 /* Hangul syllable: decompose algorithmically */
1692                 p=buffer;
1693                 cc=trailCC=0;
1694
1695                 c-=HANGUL_BASE;
1696
1697                 c2=(UChar)(c%JAMO_T_COUNT);
1698                 c/=JAMO_T_COUNT;
1699                 if(c2>0) {
1700                     buffer[2]=(UChar)(JAMO_T_BASE+c2);
1701                     length=3;
1702                 } else {
1703                     length=2;
1704                 }
1705
1706                 buffer[1]=(UChar)(JAMO_V_BASE+c%JAMO_V_COUNT);
1707                 buffer[0]=(UChar)(JAMO_L_BASE+c/JAMO_V_COUNT);
1708             }
1709         } else {
1710             if(isNorm32Regular(norm32)) {
1711                 c2=0;
1712                 length=1;
1713             } else {
1714                 /* c is a lead surrogate, get the real norm32 */
1715                 if(src!=limit && UTF_IS_SECOND_SURROGATE(c2=*src)) {
1716                     ++src;
1717                     length=2;
1718                     norm32=_getNorm32FromSurrogatePair(norm32, c2);
1719                 } else {
                         /* unpaired lead surrogate: copy it as is, with no properties */
1720                     c2=0;
1721                     length=1;
1722                     norm32=0;
1723                 }
1724             }
1725
1726             /* get the decomposition and the lead and trail cc's */
1727             if(nx_contains(nx, c, c2)) {
1728                 /* excluded: norm32==0 */
1729                 cc=trailCC=0;
1730                 p=NULL;
1731             } else if((norm32&qcMask)==0) {
1732                 /* c does not decompose */
1733                 cc=trailCC=(uint8_t)(norm32>>_NORM_CC_SHIFT);
1734                 p=NULL;
1735             } else {
1736                 /* c decomposes, get everything from the variable-length extra data */
1737                 p=_decompose(norm32, qcMask, length, cc, trailCC);
1738                 if(length==1) {
1739                     /* fastpath a single code unit from decomposition */
1740                     c=*p;
1741                     c2=0;
1742                     p=NULL;
1743                 }
1744             }
1745         }
1746
1747         /* append the decomposition to the destination buffer, assume length>0 */
1748         if((destIndex+length)<=destCapacity) {
1749             UChar *reorderSplit=dest+destIndex;
1750             if(p==NULL) {
1751                 /* fastpath: single code point */
1752                 if(cc!=0 && cc<prevCC) {
1753                     /* (c, c2) is out of order with respect to the preceding text */
1754                     destIndex+=length;
1755                     trailCC=_insertOrdered(dest+reorderStartIndex, reorderSplit, dest+destIndex, c, c2, cc);
1756                 } else {
1757                     /* just append (c, c2) */
1758                     dest[destIndex++]=c;
1759                     if(c2!=0) {
1760                         dest[destIndex++]=c2;
1761                     }
1762                 }
1763             } else {
1764                 /* general: multiple code points (ordered by themselves) from decomposition */
1765                 if(cc!=0 && cc<prevCC) {
1766                     /* the decomposition is out of order with respect to the preceding text */
1767                     destIndex+=length;
1768                     trailCC=_mergeOrdered(dest+reorderStartIndex, reorderSplit, p, p+length);
1769                 } else {
1770                     /* just append the decomposition */
1771                     do {
1772                         dest[destIndex++]=*p++;
1773                     } while(--length>0);
1774                 }
1775             }
1776         } else {
1777             /* buffer overflow */
1778             /* keep incrementing the destIndex for preflighting */
1779             destIndex+=length;
1780         }
1781
1782         prevCC=trailCC;
1783         if(prevCC==0) {
                 /* the text ends with cc==0: later characters are not reordered to before this point */
1784             reorderStartIndex=destIndex;
1785         }
1786     }
1787
1788     outTrailCC=prevCC;
1789     return destIndex;
1790 }
1791
1792 U_CAPI int32_t U_EXPORT2
1793 unorm_decompose(UChar *dest, int32_t destCapacity,
1794                 const UChar *src, int32_t srcLength,
1795                 UBool compat, int32_t options,
1796                 UErrorCode *pErrorCode) {
         /*
          * Wrapper around the static _decompose() for strings:
          * checks for the data, gets the exclusion set for the options,
          * decomposes, and NUL-terminates the result with u_terminateUChars().
          *
          * Returns 0 if the data is not available or if getNX() sets a failure code;
          * otherwise returns the result of u_terminateUChars() for the decomposition length.
          * pErrorCode is dereferenced without a NULL check, and
          * the buffer arguments are passed on without validation here.
          */
1797     const UnicodeSet *nx;
1798     int32_t destIndex;
1799     uint8_t trailCC; /* required by _decompose(), not used here */
1800
1801     if(!_haveData(*pErrorCode)) {
1802         return 0;
1803     }
1804
1805     nx=getNX(options, *pErrorCode);
1806     if(U_FAILURE(*pErrorCode)) {
1807         return 0;
1808     }
1809
1810     destIndex=_decompose(dest, destCapacity,
1811                          src, srcLength,
1812                          compat, nx,
1813                          trailCC);
1814
1815     return u_terminateUChars(dest, destCapacity, destIndex, pErrorCode);
1816 }
1817
1818 /* make NFC & NFKC ---------------------------------------------------------- */
1819
1820 /* get the composition properties of the next character */
1821 static inline uint32_t
1822 _getNextCombining(UChar *&p, const UChar *limit,
1823                   UChar &c, UChar &c2,
1824                   uint16_t &combiningIndex, uint8_t &cc,
1825                   const UnicodeSet *nx) {
1826     uint32_t norm32, combineFlags;
1827
1828     /* get properties */
1829     c=*p++;
1830     norm32=_getNorm32(c);
1831
1832     /* preset output values for most characters */
1833     c2=0;
1834     combiningIndex=0;
1835     cc=0;
1836
1837     if((norm32&(_NORM_CC_MASK|_NORM_COMBINES_ANY))==0) {
1838         return 0;
1839     } else {
1840         if(isNorm32Regular(norm32)) {
1841             /* set cc etc. below */
1842         } else if(isNorm32HangulOrJamo(norm32)) {
1843             /* a compatibility decomposition contained Jamos */
1844             combiningIndex=(uint16_t)(0xfff0|(norm32>>_NORM_EXTRA_SHIFT));
1845             return norm32&_NORM_COMBINES_ANY;
1846         } else {
1847             /* c is a lead surrogate, get the real norm32 */
1848             if(p!=limit && UTF_IS_SECOND_SURROGATE(c2=*p)) {
1849                 ++p;
1850                 norm32=_getNorm32FromSurrogatePair(norm32, c2);
1851             } else {
1852                 c2=0;
1853                 return 0;
1854             }
1855         }
1856
1857         if(nx_contains(nx, c, c2)) {
1858             return 0; /* excluded: norm32==0 */
1859         }
1860
1861         cc=(uint8_t)(norm32>>_NORM_CC_SHIFT);
1862
1863         combineFlags=norm32&_NORM_COMBINES_ANY;
1864         if(combineFlags!=0) {
1865             combiningIndex=*(_getExtraData(norm32)-1);
1866         }
1867         return combineFlags;
1868     }
1869 }
1870
1871 /*
1872  * given a composition-result starter (c, c2) - which means its cc==0,
1873  * it combines forward, it has extra data, its norm32!=0,
1874  * it is not a Hangul or Jamo,
1875  * get just its combineFwdIndex
1876  *
1877  * norm32(c) is special if and only if c2!=0
1878  */
1879 static inline uint16_t
1880 _getCombiningIndexFromStarter(UChar c, UChar c2) {
1881     uint32_t norm32;
1882
1883     norm32=_getNorm32(c);
1884     if(c2!=0) {
1885         norm32=_getNorm32FromSurrogatePair(norm32, c2);
1886     }
1887     return *(_getExtraData(norm32)-1);
1888 }
1889
/*
 * Look up the recomposition result for
 * a forward-combining character
 * (given as a pointer to its part of the combiningTable[])
 * and a backward-combining character
 * (given as its combineBackIndex).
 *
 * The list at table is a sequence of (key, result) entries with ascending keys;
 * bit 15 of a key marks the last entry, and bit 15 of the first result unit
 * says that the result occupies two units instead of one.
 *
 * If the two characters combine, (value, value2) receive the code unit(s)
 * of the composition; value2 is 0 for a BMP result.
 * If they do not combine, value and value2 are left untouched.
 *
 * Return value:
 * 0    do not combine
 * 1    combine
 * >1   combine, and the composition is a forward-combining starter
 *
 * See unormimp.h for a description of the composition table format.
 */
static inline uint16_t
_combine(const uint16_t *table, uint16_t combineBackIndex,
         uint16_t &value, uint16_t &value2) {
    /* advance through the starter's list until a key at or beyond the wanted index */
    uint16_t key=*table++;
    while(key<combineBackIndex) {
        /* step over this entry's one- or two-unit result */
        table+=(*table&0x8000)!=0 ? 2 : 1;
        key=*table++;
    }

    /* bit 15 of the key is only the last-entry-in-the-list flag */
    if((key&0x7fff)!=combineBackIndex) {
        return 0; /* not found */
    }

    /* found: table now points at the first result unit */
    const uint16_t first=*table;

    /* bit 13 set: the composition is a starter that combines forward */
    const uint16_t outcome=(uint16_t)((first&0x2000)+1);

    /* decode the variable-length result */
    if((first&0x8000)==0) {
        /* BMP composition result U+0000..U+1fff, stored inline */
        value=(uint16_t)(first&0x1fff);
        value2=0;
    } else if((first&0x4000)==0) {
        /* BMP composition result U+2000..U+ffff, stored in the second unit */
        value=*(table+1);
        value2=0;
    } else {
        /* surrogate pair composition result */
        value=(uint16_t)((first&0x3ff)|0xd800);
        value2=*(table+1);
    }

    return outcome;
}
1952
1953 static inline UBool
1954 _composeHangul(UChar prev, UChar c, uint32_t norm32, const UChar *&src, const UChar *limit,
1955                UBool compat, UChar *dest, const UnicodeSet *nx) {
1956     if(isJamoVTNorm32JamoV(norm32)) {
1957         /* c is a Jamo V, compose with previous Jamo L and following Jamo T */
1958         prev=(UChar)(prev-JAMO_L_BASE);
1959         if(prev<JAMO_L_COUNT) {
1960             c=(UChar)(HANGUL_BASE+(prev*JAMO_V_COUNT+(c-JAMO_V_BASE))*JAMO_T_COUNT);
1961
1962             /* check if the next character is a Jamo T (normal or compatibility) */
1963             if(src!=limit) {
1964                 UChar next, t;
1965
1966                 next=*src;
1967                 if((t=(UChar)(next-JAMO_T_BASE))<JAMO_T_COUNT) {
1968                     /* normal Jamo T */
1969                     ++src;
1970                     c+=t;
1971                 } else if(compat) {
1972                     /* if NFKC, then check for compatibility Jamo T (BMP only) */
1973                     norm32=_getNorm32(next);
1974                     if(isNorm32Regular(norm32) && (norm32&_NORM_QC_NFKD)) {
1975                         const UChar *p;
1976                         int32_t length;
1977                         uint8_t cc, trailCC;
1978
1979                         p=_decompose(norm32, _NORM_QC_NFKD, length, cc, trailCC);
1980                         if(length==1 && (t=(UChar)(*p-JAMO_T_BASE))<JAMO_T_COUNT) {
1981                             /* compatibility Jamo T */
1982                             ++src;
1983                             c+=t;
1984                         }
1985                     }
1986                 }
1987             }
1988             if(nx_contains(nx, c)) {
1989                 if(!isHangulWithoutJamoT(c)) {
1990                     --src; /* undo ++src from reading the Jamo T */
1991                 }
1992                 return FALSE;
1993             }
1994             if(dest!=0) {
1995                 *dest=c;
1996             }
1997             return TRUE;
1998         }
1999     } else if(isHangulWithoutJamoT(prev)) {
2000         /* c is a Jamo T, compose with previous Hangul LV that does not contain a Jamo T */
2001         c=(UChar)(prev+(c-JAMO_T_BASE));
2002         if(nx_contains(nx, c)) {
2003             return FALSE;
2004         }
2005         if(dest!=0) {
2006             *dest=c;
2007         }
2008         return TRUE;
2009     }
2010     return FALSE;
2011 }
2012
2013 /*
2014  * recompose the characters in [p..limit[
2015  * (which is in NFD - decomposed and canonically ordered),
2016  * adjust limit, and return the trailing cc
2017  *
2018  * since for NFKC we may get Jamos in decompositions, we need to
2019  * recompose those too
2020  *
2021  * note that recomposition never lengthens the text:
2022  * any character consists of either one or two code units;
2023  * a composition may contain at most one more code unit than the original starter,
2024  * while the combining mark that is removed has at least one code unit
2025  */
2026 static uint8_t
2027 _recompose(UChar *p, UChar *&limit, int32_t options, const UnicodeSet *nx) {
2028     UChar *starter, *pRemove, *q, *r;
2029     uint32_t combineFlags;
2030     UChar c, c2;
2031     uint16_t combineFwdIndex, combineBackIndex;
2032     uint16_t result, value, value2;
2033     uint8_t cc, prevCC;
2034     UBool starterIsSupplementary;
2035
2036     starter=NULL;                   /* no starter */
2037     combineFwdIndex=0;              /* will not be used until starter!=NULL - avoid compiler warnings */
2038     combineBackIndex=0;             /* will always be set if combineFlags!=0 - avoid compiler warnings */
2039     value=value2=0;                 /* always set by _combine() before used - avoid compiler warnings */
2040     starterIsSupplementary=FALSE;   /* will not be used until starter!=NULL - avoid compiler warnings */
2041     prevCC=0;
2042
2043     for(;;) {
2044         combineFlags=_getNextCombining(p, limit, c, c2, combineBackIndex, cc, nx);
2045         if((combineFlags&_NORM_COMBINES_BACK) && starter!=NULL) {
2046             if(combineBackIndex&0x8000) {
2047                 /* c is a Jamo V/T, see if we can compose it with the previous character */
2048                 /* for the PRI #29 fix, check that there is no intervening combining mark */
2049                 if((options&UNORM_BEFORE_PRI_29) || prevCC==0) {
2050                     pRemove=NULL; /* NULL while no Hangul composition */
2051                     combineFlags=0;
2052                     c2=*starter;
2053                     if(combineBackIndex==0xfff2) {
2054                         /* Jamo V, compose with previous Jamo L and following Jamo T */
2055                         c2=(UChar)(c2-JAMO_L_BASE);
2056                         if(c2<JAMO_L_COUNT) {
2057                             pRemove=p-1;
2058                             c=(UChar)(HANGUL_BASE+(c2*JAMO_V_COUNT+(c-JAMO_V_BASE))*JAMO_T_COUNT);
2059                             if(p!=limit && (c2=(UChar)(*p-JAMO_T_BASE))<JAMO_T_COUNT) {
2060                                 ++p;
2061                                 c+=c2;
2062                             } else {
2063                                 /* the result is an LV syllable, which is a starter (unlike LVT) */
2064                                 combineFlags=_NORM_COMBINES_FWD;
2065                             }
2066                             if(!nx_contains(nx, c)) {
2067                                 *starter=c;
2068                             } else {
2069                                 /* excluded */
2070                                 if(!isHangulWithoutJamoT(c)) {
2071                                     --p; /* undo the ++p from reading the Jamo T */
2072                                 }
2073                                 /* c is modified but not used any more -- c=*(p-1); -- re-read the Jamo V/T */
2074                                 pRemove=NULL;
2075                             }
2076                         }
2077
2078                     /*
2079                      * Normally, the following can not occur:
2080                      * Since the input is in NFD, there are no Hangul LV syllables that
2081                      * a Jamo T could combine with.
2082                      * All Jamo Ts are combined above when handling Jamo Vs.
2083                      *
2084                      * However, before the PRI #29 fix, this can occur due to
2085                      * an intervening combining mark between the Hangul LV and the Jamo T.
2086                      */
2087                     } else {
2088                         /* Jamo T, compose with previous Hangul that does not have a Jamo T */
2089                         if(isHangulWithoutJamoT(c2)) {
2090                             c2+=(UChar)(c-JAMO_T_BASE);
2091                             if(!nx_contains(nx, c2)) {
2092                                 pRemove=p-1;
2093                                 *starter=c2;
2094                             }
2095                         }
2096                     }
2097
2098                     if(pRemove!=NULL) {
2099                         /* remove the Jamo(s) */
2100                         q=pRemove;
2101                         r=p;
2102                         while(r<limit) {
2103                             *q++=*r++;
2104                         }
2105                         p=pRemove;
2106                         limit=q;
2107                     }
2108
2109                     c2=0; /* c2 held *starter temporarily */
2110
2111                     if(combineFlags!=0) {
2112                         /*
2113                          * not starter=NULL because the composition is a Hangul LV syllable
2114                          * and might combine once more (but only before the PRI #29 fix)
2115                          */
2116
2117                         /* done? */
2118                         if(p==limit) {
2119                             return prevCC;
2120                         }
2121
2122                         /* the composition is a Hangul LV syllable which is a starter that combines forward */
2123                         combineFwdIndex=0xfff0;
2124
2125                         /* we combined; continue with looking for compositions */
2126                         continue;
2127                     }
2128                 }
2129
2130                 /*
2131                  * now: cc==0 and the combining index does not include "forward" ->
2132                  * the rest of the loop body will reset starter to NULL;
2133                  * technically, a composed Hangul syllable is a starter, but it
2134                  * does not combine forward now that we have consumed all eligible Jamos;
2135                  * for Jamo V/T, combineFlags does not contain _NORM_COMBINES_FWD
2136                  */
2137
2138             } else if(
2139                 /* the starter is not a Hangul LV or Jamo V/T and */
2140                 !(combineFwdIndex&0x8000) &&
2141                 /* the combining mark is not blocked and */
2142                 ((options&UNORM_BEFORE_PRI_29) ?
2143                     (prevCC!=cc || prevCC==0) :
2144                     (prevCC<cc || prevCC==0)) &&
2145                 /* the starter and the combining mark (c, c2) do combine and */
2146                 0!=(result=_combine(combiningTable+combineFwdIndex, combineBackIndex, value, value2)) &&
2147                 /* the composition result is not excluded */
2148                 !nx_contains(nx, value, value2)
2149             ) {
2150                 /* replace the starter with the composition, remove the combining mark */
2151                 pRemove= c2==0 ? p-1 : p-2; /* pointer to the combining mark */
2152
2153                 /* replace the starter with the composition */
2154                 *starter=(UChar)value;
2155                 if(starterIsSupplementary) {
2156                     if(value2!=0) {
2157                         /* both are supplementary */
2158                         *(starter+1)=(UChar)value2;
2159                     } else {
2160                         /* the composition is shorter than the starter, move the intermediate characters forward one */
2161                         starterIsSupplementary=FALSE;
2162                         q=starter+1;
2163                         r=q+1;
2164                         while(r<pRemove) {
2165                             *q++=*r++;
2166                         }
2167                         --pRemove;
2168                     }
2169                 } else if(value2!=0) {
2170                     /* the composition is longer than the starter, move the intermediate characters back one */
2171                     starterIsSupplementary=TRUE;
2172                     ++starter; /* temporarily increment for the loop boundary */
2173                     q=pRemove;
2174                     r=++pRemove;
2175                     while(starter<q) {
2176                         *--r=*--q;
2177                     }
2178                     *starter=(UChar)value2;
2179                     --starter; /* undo the temporary increment */
2180                 /* } else { both are on the BMP, nothing more to do */
2181                 }
2182
2183                 /* remove the combining mark by moving the following text over it */
2184                 if(pRemove<p) {
2185                     q=pRemove;
2186                     r=p;
2187                     while(r<limit) {
2188                         *q++=*r++;
2189                     }
2190                     p=pRemove;
2191                     limit=q;
2192                 }
2193
2194                 /* keep prevCC because we removed the combining mark */
2195
2196                 /* done? */
2197                 if(p==limit) {
2198                     return prevCC;
2199                 }
2200
2201                 /* is the composition a starter that combines forward? */
2202                 if(result>1) {
2203                     combineFwdIndex=_getCombiningIndexFromStarter((UChar)value, (UChar)value2);
2204                 } else {
2205                     starter=NULL;
2206                 }
2207
2208                 /* we combined; continue with looking for compositions */
2209                 continue;
2210             }
2211         }
2212
2213         /* no combination this time */
2214         prevCC=cc;
2215         if(p==limit) {
2216             return prevCC;
2217         }
2218
2219         /* if (c, c2) did not combine, then check if it is a starter */
2220         if(cc==0) {
2221             /* found a new starter; combineFlags==0 if (c, c2) is excluded */
2222             if(combineFlags&_NORM_COMBINES_FWD) {
2223                 /* it may combine with something, prepare for it */
2224                 if(c2==0) {
2225                     starterIsSupplementary=FALSE;
2226                     starter=p-1;
2227                 } else {
2228                     starterIsSupplementary=TRUE;
2229                     starter=p-2;
2230                 }
2231                 combineFwdIndex=combineBackIndex;
2232             } else {
2233                 /* it will not combine with anything */
2234                 starter=NULL;
2235             }
2236         } else if(options&_NORM_OPTIONS_COMPOSE_CONTIGUOUS) {
2237             /* FCC: no discontiguous compositions; any intervening character blocks */
2238             starter=NULL;
2239         }
2240     }
2241 }
2242
2243 /* decompose and recompose [prevStarter..src[ */
2244 static const UChar *
2245 _composePart(UChar *stackBuffer, UChar *&buffer, int32_t &bufferCapacity, int32_t &length,
2246              const UChar *prevStarter, const UChar *src,
2247              uint8_t &prevCC,
2248              int32_t options, const UnicodeSet *nx,
2249              UErrorCode *pErrorCode) {
2250     UChar *recomposeLimit;
2251     uint8_t trailCC;
2252     UBool compat;
2253
2254     compat=(UBool)((options&_NORM_OPTIONS_COMPAT)!=0);
2255
2256     /* decompose [prevStarter..src[ */
2257     length=_decompose(buffer, bufferCapacity,
2258                       prevStarter, (int32_t)(src-prevStarter),
2259                       compat, nx,
2260                       trailCC);
2261     if(length>bufferCapacity) {
2262         if(!u_growBufferFromStatic(stackBuffer, &buffer, &bufferCapacity, 2*length, 0)) {
2263             *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
2264             return NULL;
2265         }
2266         length=_decompose(buffer, bufferCapacity,
2267                           prevStarter, (int32_t)(src-prevStarter),
2268                           compat, nx,
2269                           trailCC);
2270     }
2271
2272     /* recompose the decomposition */
2273     recomposeLimit=buffer+length;
2274     if(length>=2) {
2275         prevCC=_recompose(buffer, recomposeLimit, options, nx);
2276     }
2277
2278     /* return with a pointer to the recomposition and its length */
2279     length=(int32_t)(recomposeLimit-buffer);
2280     return buffer;
2281 }
2282
2283 static int32_t
2284 _compose(UChar *dest, int32_t destCapacity,
2285          const UChar *src, int32_t srcLength,
2286          int32_t options, const UnicodeSet *nx,
2287          UErrorCode *pErrorCode) {
2288     UChar stackBuffer[_STACK_BUFFER_CAPACITY];
2289     UChar *buffer;
2290     int32_t bufferCapacity;
2291
2292     const UChar *limit, *prevSrc, *prevStarter;
2293     uint32_t norm32, ccOrQCMask, qcMask;
2294     int32_t destIndex, reorderStartIndex, length;
2295     UChar c, c2, minNoMaybe;
2296     uint8_t cc, prevCC;
2297
2298     if(options&_NORM_OPTIONS_COMPAT) {
2299         minNoMaybe=(UChar)indexes[_NORM_INDEX_MIN_NFKC_NO_MAYBE];
2300         qcMask=_NORM_QC_NFKC;
2301     } else {
2302         minNoMaybe=(UChar)indexes[_NORM_INDEX_MIN_NFC_NO_MAYBE];
2303         qcMask=_NORM_QC_NFC;
2304     }
2305
2306     /* initialize */
2307     buffer=stackBuffer;
2308     bufferCapacity=_STACK_BUFFER_CAPACITY;
2309
2310     /*
2311      * prevStarter points to the last character before the current one
2312      * that is a "true" starter with cc==0 and quick check "yes".
2313      *
2314      * prevStarter will be used instead of looking for a true starter
2315      * while incrementally decomposing [prevStarter..prevSrc[
2316      * in _composePart(). Having a good prevStarter allows to just decompose
2317      * the entire [prevStarter..prevSrc[.
2318      *
2319      * When _composePart() backs out from prevSrc back to prevStarter,
2320      * then it also backs out destIndex by the same amount.
2321      * Therefore, at all times, the (prevSrc-prevStarter) source units
2322      * must correspond 1:1 to destination units counted with destIndex,
2323      * except for reordering.
2324      * This is true for the qc "yes" characters copied in the fast loop,
2325      * and for pure reordering.
2326      * prevStarter must be set forward to src when this is not true:
2327      * In _composePart() and after composing a Hangul syllable.
2328      *
2329      * This mechanism relies on the assumption that the decomposition of a true starter
2330      * also begins with a true starter. gennorm/store.c checks for this.
2331      */
2332     prevStarter=src;
2333
2334     ccOrQCMask=_NORM_CC_MASK|qcMask;
2335     destIndex=reorderStartIndex=0;
2336     prevCC=0;
2337
2338     /* avoid compiler warnings */
2339     norm32=0;
2340     c=0;
2341
2342     if(srcLength>=0) {
2343         /* string with length */
2344         limit=src+srcLength;
2345     } else /* srcLength==-1 */ {
2346         /* zero-terminated string */
2347         limit=NULL;
2348     }
2349
2350     U_ALIGN_CODE(16);
2351
2352     for(;;) {
2353         /* count code units below the minimum or with irrelevant data for the quick check */
2354         prevSrc=src;
2355         if(limit==NULL) {
2356             while((c=*src)<minNoMaybe ? c!=0 : ((norm32=_getNorm32(c))&ccOrQCMask)==0) {
2357                 prevCC=0;
2358                 ++src;
2359             }
2360         } else {
2361             while(src!=limit && ((c=*src)<minNoMaybe || ((norm32=_getNorm32(c))&ccOrQCMask)==0)) {
2362                 prevCC=0;
2363                 ++src;
2364             }
2365         }
2366
2367         /* copy these code units all at once */
2368         if(src!=prevSrc) {
2369             length=(int32_t)(src-prevSrc);
2370             if((destIndex+length)<=destCapacity) {
2371                 uprv_memcpy(dest+destIndex, prevSrc, length*U_SIZEOF_UCHAR);
2372             }
2373             destIndex+=length;
2374             reorderStartIndex=destIndex;
2375
2376             /* set prevStarter to the last character in the quick check loop */
2377             prevStarter=src-1;
2378             if(UTF_IS_SECOND_SURROGATE(*prevStarter) && prevSrc<prevStarter && UTF_IS_FIRST_SURROGATE(*(prevStarter-1))) {
2379                 --prevStarter;
2380             }
2381
2382             prevSrc=src;
2383         }
2384
2385         /* end of source reached? */
2386         if(limit==NULL ? c==0 : src==limit) {
2387             break;
2388         }
2389
2390         /* c already contains *src and norm32 is set for it, increment src */
2391         ++src;
2392
2393         /*
2394          * source buffer pointers:
2395          *
2396          *  all done      quick check   current char  not yet
2397          *                "yes" but     (c, c2)       processed
2398          *                may combine
2399          *                forward
2400          * [-------------[-------------[-------------[-------------[
2401          * |             |             |             |             |
2402          * start         prevStarter   prevSrc       src           limit
2403          *
2404          *
2405          * destination buffer pointers and indexes:
2406          *
2407          *  all done      might take    not filled yet
2408          *                characters for
2409          *                reordering
2410          * [-------------[-------------[-------------[
2411          * |             |             |             |
2412          * dest      reorderStartIndex destIndex     destCapacity
2413          */
2414
2415         /* check one above-minimum, relevant code unit */
2416         /*
2417          * norm32 is for c=*(src-1), and the quick check flag is "no" or "maybe", and/or cc!=0
2418          * check for Jamo V/T, then for surrogates and regular characters
2419          * c is not a Hangul syllable or Jamo L because
2420          * they are not marked with no/maybe for NFC & NFKC (and their cc==0)
2421          */
2422         if(isNorm32HangulOrJamo(norm32)) {
2423             /*
2424              * c is a Jamo V/T:
2425              * try to compose with the previous character, Jamo V also with a following Jamo T,
2426              * and set values here right now in case we just continue with the main loop
2427              */
2428             prevCC=cc=0;
2429             reorderStartIndex=destIndex;
2430
2431             if(
2432                 destIndex>0 &&
2433                 _composeHangul(
2434                     *(prevSrc-1), c, norm32, src, limit, (UBool)((options&_NORM_OPTIONS_COMPAT)!=0),
2435                     destIndex<=destCapacity ? dest+(destIndex-1) : 0,
2436                     nx)
2437             ) {
2438                 prevStarter=src;
2439                 continue;
2440             }
2441
2442             /* the Jamo V/T did not compose into a Hangul syllable, just append to dest */
2443             c2=0;
2444             length=1;
2445             prevStarter=prevSrc;
2446         } else {
2447             if(isNorm32Regular(norm32)) {
2448                 c2=0;
2449                 length=1;
2450             } else {
2451                 /* c is a lead surrogate, get the real norm32 */
2452                 if(src!=limit && UTF_IS_SECOND_SURROGATE(c2=*src)) {
2453                     ++src;
2454                     length=2;
2455                     norm32=_getNorm32FromSurrogatePair(norm32, c2);
2456                 } else {
2457                     /* c is an unpaired lead surrogate, nothing to do */
2458                     c2=0;
2459                     length=1;
2460                     norm32=0;
2461                 }
2462             }
2463
2464             /* we are looking at the character (c, c2) at [prevSrc..src[ */
2465             if(nx_contains(nx, c, c2)) {
2466                 /* excluded: norm32==0 */
2467                 cc=0;
2468             } else if((norm32&qcMask)==0) {
2469                 cc=(uint8_t)(norm32>>_NORM_CC_SHIFT);
2470             } else {
2471                 const UChar *p;
2472                 uint32_t decompQCMask;
2473
2474                 /*
2475                  * find appropriate boundaries around this character,
2476                  * decompose the source text from between the boundaries,
2477                  * and recompose it
2478                  *
2479                  * this puts the intermediate text into the side buffer because
2480                  * it might be longer than the recomposition end result,
2481                  * or the destination buffer may be too short or missing
2482                  *
2483                  * note that destIndex may be adjusted backwards to account
2484                  * for source text that passed the quick check but needed to
2485                  * take part in the recomposition
2486                  */
2487                 decompQCMask=(qcMask<<2)&0xf; /* decomposition quick check mask */
2488
2489                 /*
2490                  * find the last true starter in [prevStarter..src[
2491                  * it is either the decomposition of the current character (at prevSrc),
2492                  * or prevStarter
2493                  */
2494                 if(_isTrueStarter(norm32, ccOrQCMask, decompQCMask)) {
2495                     prevStarter=prevSrc;
2496                 } else {
2497                     /* adjust destIndex: back out what had been copied with qc "yes" */
2498                     destIndex-=(int32_t)(prevSrc-prevStarter);
2499                 }
2500
2501                 /* find the next true starter in [src..limit[ - modifies src to point to the next starter */
2502                 src=_findNextStarter(src, limit, qcMask, decompQCMask, minNoMaybe);
2503
2504                 /* compose [prevStarter..src[ */
2505                 p=_composePart(stackBuffer, buffer, bufferCapacity,
2506                                length,          /* output */
2507                                prevStarter, src,
2508                                prevCC,          /* output */
2509                                options, nx,
2510                                pErrorCode);
2511
2512                 if(p==NULL) {
2513                     destIndex=0;   /* an error occurred (out of memory) */
2514                     break;
2515                 }
2516
2517                 /* append the recomposed buffer contents to the destination buffer */
2518                 if((destIndex+length)<=destCapacity) {
2519                     while(length>0) {
2520                         dest[destIndex++]=*p++;
2521                         --length;
2522                     }
2523                 } else {
2524                     /* buffer overflow */
2525                     /* keep incrementing the destIndex for preflighting */
2526                     destIndex+=length;
2527                 }
2528
2529                 /* set the next starter */
2530                 prevStarter=src;
2531
2532                 continue;
2533             }
2534         }
2535
2536         /* append the single code point (c, c2) to the destination buffer */
2537         if((destIndex+length)<=destCapacity) {
2538             if(cc!=0 && cc<prevCC) {
2539                 /* (c, c2) is out of order with respect to the preceding text */
2540                 UChar *reorderSplit=dest+destIndex;
2541                 destIndex+=length;
2542                 prevCC=_insertOrdered(dest+reorderStartIndex, reorderSplit, dest+destIndex, c, c2, cc);
2543             } else {
2544                 /* just append (c, c2) */
2545                 dest[destIndex++]=c;
2546                 if(c2!=0) {
2547                     dest[destIndex++]=c2;
2548                 }
2549                 prevCC=cc;
2550             }
2551         } else {
2552             /* buffer overflow */
2553             /* keep incrementing the destIndex for preflighting */
2554             destIndex+=length;
2555             prevCC=cc;
2556         }
2557     }
2558
2559     /* cleanup */
2560     if(buffer!=stackBuffer) {
2561         uprv_free(buffer);
2562     }
2563
2564     return destIndex;
2565 }
2566
2567 U_CAPI int32_t U_EXPORT2
2568 unorm_compose(UChar *dest, int32_t destCapacity,
2569               const UChar *src, int32_t srcLength,
2570               UBool compat, int32_t options,
2571               UErrorCode *pErrorCode) {
2572     /*
2573      * Composes src into dest via _compose() and returns what u_terminateUChars()
2574      * reports for the composed length.
2575      * compat!=0 adds _NORM_OPTIONS_COMPAT; options is also handed to getNX() for the exclusion set.
2576      * Returns 0 early when _haveData() is false or getNX() leaves a failure in *pErrorCode.
2577      */
2578     const int32_t internalBits=
2579         _NORM_OPTIONS_SETS_MASK|_NORM_OPTIONS_COMPAT|_NORM_OPTIONS_COMPOSE_CONTIGUOUS;
2580     const UnicodeSet *exclusions;
2581     int32_t composedLength;
2582
2583     if(!_haveData(*pErrorCode)) {
2584         return 0;
2585     }
2586
2587     /* look up the exclusion set before the _NORM_OPTIONS_SETS_MASK bits are cleared below */
2588     exclusions=getNX(options, *pErrorCode);
2589     if(U_FAILURE(*pErrorCode)) {
2590         return 0;
2591     }
2592
2593     /* callers may not preset the internal bits; only the compat flag is re-added here */
2594     options=(options&~internalBits)|(compat ? _NORM_OPTIONS_COMPAT : 0);
2595     composedLength=_compose(dest, destCapacity, src, srcLength, options, exclusions, pErrorCode);
2596     return u_terminateUChars(dest, destCapacity, composedLength, pErrorCode);
2597 }
2598
2599 /* make FCD ----------------------------------------------------------------- */
2600
2601 static const UChar *
2602 _findSafeFCD(const UChar *src, const UChar *limit, uint16_t fcd16) {
         /*
          * Returns the position in [src..limit[ up to which unorm_makeFCD() has to
          * decompose after it found text that is not in FCD order.
          *
          * src   - first code unit after the character whose FCD value is fcd16
          * limit - end of the text, or NULL as passed by unorm_makeFCD() for
          *         NUL-terminated text (the NUL then stops the loop as a code unit
          *         below _NORM_MIN_WITH_LEAD_CC)
          * fcd16 - FCD value of the preceding character: the low byte is tested as
          *         its trail cc, a value above 0xff means a non-zero lead cc
          */
2603     UChar c, c2;
2604
2605     /*
2606      * find the first position in [src..limit[ after some cc==0 according to FCD data
2607      *
2608      * at the beginning of the loop, we have fcd16 from before src
2609      *
2610      * stop at positions:
2611      * - after trail cc==0
2612      * - at the end of the source
2613      * - before lead cc==0
2614      */
2615     for(;;) {
2616         /* stop if trail cc==0 for the previous character */
2617         if((fcd16&0xff)==0) {
2618             break;
2619         }
2620
2621         /* get c=*src - stop at end of string */
2622         if(src==limit) {
2623             break;
2624         }
2625         c=*src;
2626
2627         /* stop if lead cc==0 for this character */
2628         if(c<_NORM_MIN_WITH_LEAD_CC || (fcd16=_getFCD16(c))==0) {
2629             break; /* catches terminating NUL, too */
2630         }
2631
2632         if(!UTF_IS_FIRST_SURROGATE(c)) {
                 /* fcd16 is non-zero here; a zero high byte still means lead cc==0: stop before c */
2633             if(fcd16<=0xff) {
2634                 break;
2635             }
                 /* c has a lead cc; step over it, fcd16 now describes c for the next trail cc test */
2636             ++src;
2637         } else if((src+1)!=limit && (c2=*(src+1), UTF_IS_SECOND_SURROGATE(c2))) {
2638             /* c is a lead surrogate, get the real fcd16 */
2639             fcd16=_getFCD16FromSurrogatePair(fcd16, c2);
                 /* same lead cc test as above, now for the supplementary code point (c, c2) */
2640             if(fcd16<=0xff) {
2641                 break;
2642             }
                 /* step over both code units of the pair */
2643             src+=2;
2644         } else {
2645             /* c is an unpaired first surrogate, lead cc==0 */
2646             break;
2647         }
2648     }
2649
2650     return src;
2651 }
2652
2653 static uint8_t
2654 _decomposeFCD(const UChar *src, const UChar *decompLimit,
2655               UChar *dest, int32_t &destIndex, int32_t destCapacity,
2656               const UnicodeSet *nx) {
         /*
          * Canonically decomposes [src..decompLimit[ and appends the result to dest
          * at destIndex, keeping the appended text in canonical order.
          *
          * destIndex is passed by reference and is advanced by the decomposed
          * length even when the text does not fit into destCapacity
          * (nothing is written in that case).
          * Characters contained in nx are handled as cc==0 and are not decomposed.
          * Returns the trail cc of the last character processed (0 for an empty range).
          */
2657     const UChar *p;
2658     uint32_t norm32;
2659     int32_t reorderStartIndex, length;
2660     UChar c, c2;
2661     uint8_t cc, prevCC, trailCC;
2662
2663     /*
2664      * canonically decompose [src..decompLimit[
2665      *
2666      * all characters in this range have some non-zero cc,
2667      * directly or in decomposition,
2668      * so that we do not need to check in the following for quick-check limits etc.
2669      *
2670      * there _are_ _no_ Hangul syllables or Jamos in here because they are FCD-safe (cc==0)!
2671      *
2672      * we also do not need to check for c==0 because we have an established decompLimit
2673      */
         /* reordering never reaches back before reorderStartIndex */
2674     reorderStartIndex=destIndex;
2675     prevCC=0;
2676
2677     while(src<decompLimit) {
2678         c=*src++;
2679         norm32=_getNorm32(c);
2680         if(isNorm32Regular(norm32)) {
2681             c2=0;
2682             length=1;
2683         } else {
2684             /*
2685              * reminder: this function is called with [src..decompLimit[
2686              * not containing any Hangul/Jamo characters,
2687              * therefore the only specials are lead surrogates
2688              */
2689             /* c is a lead surrogate, get the real norm32 */
2690             if(src!=decompLimit && UTF_IS_SECOND_SURROGATE(c2=*src)) {
2691                 ++src;
2692                 length=2;
2693                 norm32=_getNorm32FromSurrogatePair(norm32, c2);
2694             } else {
                     /* unpaired lead surrogate: copied as a single unit with no normalization data */
2695                 c2=0;
2696                 length=1;
2697                 norm32=0;
2698             }
2699         }
2700
2701         /* get the decomposition and the lead and trail cc's */
             /* p==NULL below means "append the code point (c, c2) itself" */
2702         if(nx_contains(nx, c, c2)) {
2703             /* excluded: norm32==0 */
2704             cc=trailCC=0;
2705             p=NULL;
2706         } else if((norm32&_NORM_QC_NFD)==0) {
2707             /* c does not decompose */
2708             cc=trailCC=(uint8_t)(norm32>>_NORM_CC_SHIFT);
2709             p=NULL;
2710         } else {
2711             /* c decomposes, get everything from the variable-length extra data */
                 /*
                  * presumably length, cc and trailCC are filled in by reference:
                  * they are read below without any other assignment on this path
                  */
2712             p=_decompose(norm32, length, cc, trailCC);
2713             if(length==1) {
2714                 /* fastpath a single code unit from decomposition */
2715                 c=*p;
2716                 c2=0;
2717                 p=NULL;
2718             }
2719         }
2720
2721         /* append the decomposition to the destination buffer, assume length>0 */
2722         if((destIndex+length)<=destCapacity) {
                 /* reorderSplit marks the end of the text written so far, before this piece is added */
2723             UChar *reorderSplit=dest+destIndex;
2724             if(p==NULL) {
2725                 /* fastpath: single code point */
2726                 if(cc!=0 && cc<prevCC) {
2727                     /* (c, c2) is out of order with respect to the preceding text */
2728                     destIndex+=length;
                         /* the value returned by _insertOrdered() becomes the trail cc used for the next character */
2729                     trailCC=_insertOrdered(dest+reorderStartIndex, reorderSplit, dest+destIndex, c, c2, cc);
2730                 } else {
2731                     /* just append (c, c2) */
2732                     dest[destIndex++]=c;
2733                     if(c2!=0) {
2734                         dest[destIndex++]=c2;
2735                     }
2736                 }
2737             } else {
2738                 /* general: multiple code points (ordered by themselves) from decomposition */
2739                 if(cc!=0 && cc<prevCC) {
2740                     /* the decomposition is out of order with respect to the preceding text */
2741                     destIndex+=length;
2742                     trailCC=_mergeOrdered(dest+reorderStartIndex, reorderSplit, p, p+length);
2743                 } else {
2744                     /* just append the decomposition */
2745                     do {
2746                         dest[destIndex++]=*p++;
2747                     } while(--length>0);
2748                 }
2749             }
2750         } else {
2751             /* buffer overflow */
2752             /* keep incrementing the destIndex for preflighting */
2753             destIndex+=length;
2754         }
2755
2756         prevCC=trailCC;
             /* after a trail cc of 0, later characters are never moved in front of this position */
2757         if(prevCC==0) {
2758             reorderStartIndex=destIndex;
2759         }
2760     }
2761
2762     return prevCC;
2763 }
2764
2765 static int32_t
2766 unorm_makeFCD(UChar *dest, int32_t destCapacity,
2767               const UChar *src, int32_t srcLength,
2768               const UnicodeSet *nx,
2769               UErrorCode *pErrorCode) {
         /*
          * Writes an FCD form of src to dest: text is copied unchanged as long as
          * each character's lead cc is 0 or not below the previous trail cc;
          * where that order is violated, the piece from decompStart up to the
          * position returned by _findSafeFCD() is decomposed and reordered by
          * _decomposeFCD() instead.
          *
          * srcLength>=0 is a code unit count; otherwise (-1) src is NUL-terminated
          * and limit stays NULL.
          * Writes are skipped whenever they would pass destCapacity, but destIndex
          * keeps counting; the total goes through u_terminateUChars().
          * Characters contained in nx are treated as fcd16==0.
          * Returns 0 without touching dest when _haveData() is false.
          */
2770     const UChar *limit, *prevSrc, *decompStart;
2771     int32_t destIndex, length;
2772     UChar c, c2;
2773     uint16_t fcd16;
2774     int16_t prevCC, cc;
2775
2776     if(!_haveData(*pErrorCode)) {
2777         return 0;
2778     }
2779
2780     /* initialize */
2781     decompStart=src;
2782     destIndex=0;
2783     prevCC=0;
2784
2785     /* avoid compiler warnings */
2786     c=0;
2787     fcd16=0;
2788
2789     if(srcLength>=0) {
2790         /* string with length */
2791         limit=src+srcLength;
2792     } else /* srcLength==-1 */ {
2793         /* zero-terminated string */
2794         limit=NULL;
2795     }
2796
2797     U_ALIGN_CODE(16);
2798
2799     for(;;) {
2800         /* skip a run of code units below the minimum or with irrelevant data for the FCD check */
2801         prevSrc=src;
2802         if(limit==NULL) {
2803             for(;;) {
2804                 c=*src;
2805                 if(c<_NORM_MIN_WITH_LEAD_CC) {
2806                     if(c==0) {
2807                         break;
2808                     }
                         /* remember the code unit itself as a negative value, see the prevCC ranges below */
2809                     prevCC=(int16_t)-c;
2810                 } else if((fcd16=_getFCD16(c))==0) {
2811                     prevCC=0;
2812                 } else {
2813                     break;
2814                 }
2815                 ++src;
2816             }
2817         } else {
                 /* same scan as above, ended by limit instead of a NUL */
2818             for(;;) {
2819                 if(src==limit) {
2820                     break;
2821                 } else if((c=*src)<_NORM_MIN_WITH_LEAD_CC) {
2822                     prevCC=(int16_t)-c;
2823                 } else if((fcd16=_getFCD16(c))==0) {
2824                     prevCC=0;
2825                 } else {
2826                     break;
2827                 }
2828                 ++src;
2829             }
2830         }
2831
2832         /*
2833          * prevCC has values from the following ranges:
2834          * 0..0xff - the previous trail combining class
2835          * <0      - the negative value of the previous code unit;
2836          *           that code unit was <_NORM_MIN_WITH_LEAD_CC and its _getFCD16()
2837          *           was deferred so that average text is checked faster
2838          */
2839
2840         /* copy these code units all at once */
2841         if(src!=prevSrc) {
2842             length=(int32_t)(src-prevSrc);
2843             if((destIndex+length)<=destCapacity) {
2844                 uprv_memcpy(dest+destIndex, prevSrc, length*U_SIZEOF_UCHAR);
2845             }
                 /* counted even if nothing was copied, for preflighting */
2846             destIndex+=length;
2847             prevSrc=src;
2848
2849             /* prevCC<0 is only possible from the above loop, i.e., only if prevSrc<src */
2850             if(prevCC<0) {
2851                 /* the previous character was <_NORM_MIN_WITH_LEAD_CC, we need to get its trail cc */
2852                 if(!nx_contains(nx, (UChar32)-prevCC)) {
2853                     prevCC=(int16_t)(_getFCD16((UChar)-prevCC)&0xff);
2854                 } else {
2855                     prevCC=0; /* excluded: fcd16==0 */
2856                 }
2857
2858                 /*
2859                  * set a pointer to this below-U+0300 character;
2860                  * if prevCC==0 then it will be moved to after this character below
2861                  */
2862                 decompStart=prevSrc-1;
2863             }
2864         }
2865         /*
2866          * now:
2867          * prevSrc==src - used later to adjust destIndex before decomposition
2868          * prevCC>=0
2869          */
2870
2871         /* end of source reached? */
2872         if(limit==NULL ? c==0 : src==limit) {
2873             break;
2874         }
2875
2876         /* set a pointer to after the last source position where prevCC==0 */
2877         if(prevCC==0) {
2878             decompStart=prevSrc;
2879         }
2880
2881         /* c already contains *src and fcd16 is set for it, increment src */
2882         ++src;
2883
2884         /* check one above-minimum, relevant code unit */
2885         if(UTF_IS_FIRST_SURROGATE(c)) {
2886             /* c is a lead surrogate, get the real fcd16 */
2887             if(src!=limit && UTF_IS_SECOND_SURROGATE(c2=*src)) {
2888                 ++src;
2889                 fcd16=_getFCD16FromSurrogatePair(fcd16, c2);
2890             } else {
                     /* unpaired lead surrogate: handled as a character with fcd16==0 */
2891                 c2=0;
2892                 fcd16=0;
2893             }
2894         } else {
2895             c2=0;
2896         }
2897
2898         /* we are looking at the character (c, c2) at [prevSrc..src[ */
2899         if(nx_contains(nx, c, c2)) {
2900             fcd16=0; /* excluded: fcd16==0 */
2901         }
2902
2903         /* check the combining order, get the lead cc */
2904         cc=(int16_t)(fcd16>>8);
2905         if(cc==0 || cc>=prevCC) {
2906             /* the order is ok */
2907             if(cc==0) {
2908                 decompStart=prevSrc;
2909             }
                 /* the low byte of fcd16 is the trail cc for the next comparison */
2910             prevCC=(int16_t)(fcd16&0xff);
2911
2912             /* just append (c, c2) */
2913             length= c2==0 ? 1 : 2;
2914             if((destIndex+length)<=destCapacity) {
2915                 dest[destIndex++]=c;
2916                 if(c2!=0) {
2917                     dest[destIndex++]=c2;
2918                 }
2919             } else {
                     /* does not fit: count only, for preflighting */
2920                 destIndex+=length;
2921             }
2922         } else {
2923             /*
2924              * back out the part of the source that we copied already but
2925              * is now going to be decomposed;
2926              * prevSrc is set to after what was copied
2927              */
2928             destIndex-=(int32_t)(prevSrc-decompStart);
2929
2930             /*
2931              * find the part of the source that needs to be decomposed;
2932              * to be safe and simple, decompose to before the next character with lead cc==0
2933              */
2934             src=_findSafeFCD(src, limit, fcd16);
2935
2936             /*
2937              * the source text does not fulfill the conditions for FCD;
2938              * decompose and reorder a limited piece of the text
2939              */
                 /* destIndex is updated by _decomposeFCD() through its reference parameter */
2940             prevCC=_decomposeFCD(decompStart, src,
2941                                  dest, destIndex, destCapacity,
2942                                  nx);
2943             decompStart=src;
2944         }
2945     }
2946
2947     return u_terminateUChars(dest, destCapacity, destIndex, pErrorCode);
2948 }
2949
2950 /* quick check functions ---------------------------------------------------- */
2951
2952 static UBool
2953 unorm_checkFCD(const UChar *src, int32_t srcLength, const UnicodeSet *nx) {
         /*
          * Returns TRUE if every character in src has a lead cc of 0 or one that is
          * not below the trail cc of the character before it (per _getFCD16() data),
          * FALSE at the first character that violates this.
          * Characters contained in nx are treated as having fcd16==0.
          * srcLength>=0 is a code unit count; otherwise src is read up to its NUL.
          */
2954     const UChar *limit;
2955     UChar c, c2;
2956     uint16_t fcd16;
2957     int16_t prevCC, cc;
2958
2959     /* initialize */
2960     prevCC=0;
2961
2962     if(srcLength>=0) {
2963         /* string with length */
2964         limit=src+srcLength;
2965     } else /* srcLength==-1 */ {
2966         /* zero-terminated string */
2967         limit=NULL;
2968     }
2969
2970     U_ALIGN_CODE(16);
2971
2972     for(;;) {
2973         /* skip a run of code units below the minimum or with irrelevant data for the FCD check */
2974         if(limit==NULL) {
2975             for(;;) {
2976                 c=*src++;
2977                 if(c<_NORM_MIN_WITH_LEAD_CC) {
2978                     if(c==0) {
                             /* end of a NUL-terminated string without any violation */
2979                         return TRUE;
2980                     }
2981                     /*
2982                      * delay _getFCD16(c) for any character <_NORM_MIN_WITH_LEAD_CC
2983                      * because chances are good that the next one will have
2984                      * a leading cc of 0;
2985                      * _getFCD16(-prevCC) is later called when necessary -
2986                      * -c fits into int16_t because it is <_NORM_MIN_WITH_LEAD_CC==0x300
2987                      */
2988                     prevCC=(int16_t)-c;
2989                 } else if((fcd16=_getFCD16(c))==0) {
2990                     prevCC=0;
2991                 } else {
2992                     break;
2993                 }
2994             }
2995         } else {
                 /* same scan as above, ended by limit instead of a NUL */
2996             for(;;) {
2997                 if(src==limit) {
2998                     return TRUE;
2999                 } else if((c=*src++)<_NORM_MIN_WITH_LEAD_CC) {
3000                     prevCC=(int16_t)-c;
3001                 } else if((fcd16=_getFCD16(c))==0) {
3002                     prevCC=0;
3003                 } else {
3004                     break;
3005                 }
3006             }
3007         }
3008
3009         /* check one above-minimum, relevant code unit */
3010         if(UTF_IS_FIRST_SURROGATE(c)) {
3011             /* c is a lead surrogate, get the real fcd16 */
3012             if(src!=limit && UTF_IS_SECOND_SURROGATE(c2=*src)) {
3013                 ++src;
3014                 fcd16=_getFCD16FromSurrogatePair(fcd16, c2);
3015             } else {
                     /* unpaired lead surrogate: handled as a character with fcd16==0 */
3016                 c2=0;
3017                 fcd16=0;
3018             }
3019         } else {
3020             c2=0;
3021         }
3022
3023         if(nx_contains(nx, c, c2)) {
3024             prevCC=0; /* excluded: fcd16==0 */
3025             continue;
3026         }
3027
3028         /*
3029          * prevCC has values from the following ranges:
3030          * 0..0xff - the previous trail combining class
3031          * <0      - the negative value of the previous code unit;
3032          *           that code unit was <_NORM_MIN_WITH_LEAD_CC and its _getFCD16()
3033          *           was deferred so that average text is checked faster
3034          */
3035
3036         /* check the combining order */
3037         cc=(int16_t)(fcd16>>8);
             /* a lead cc of 0 can never be out of order, so prevCC is only resolved when cc!=0 */
3038         if(cc!=0) {
3039             if(prevCC<0) {
3040                 /* the previous character was <_NORM_MIN_WITH_LEAD_CC, we need to get its trail cc */
3041                 if(!nx_contains(nx, (UChar32)-prevCC)) {
3042                     prevCC=(int16_t)(_getFCD16((UChar)-prevCC)&0xff);
3043                 } else {
3044                     prevCC=0; /* excluded: fcd16==0 */
3045                 }
3046             }
3047
3048             if(cc<prevCC) {
3049                 return FALSE;
3050             }
3051         }
             /* the low byte of fcd16 is the trail cc for the next comparison */
3052         prevCC=(int16_t)(fcd16&0xff);
3053     }
3054 }
3055
3056 static UNormalizationCheckResult
3057 _quickCheck(const UChar *src,
3058             int32_t srcLength,
3059             UNormalizationMode mode,
3060             UBool allowMaybe,
3061             const UnicodeSet *nx,
3062             UErrorCode *pErrorCode) {
         /*
          * Common implementation of the quick check and isNormalized functions.
          *
          * mode       - NFC/NFKC/NFD/NFKD select the quick check mask and minimum;
          *              UNORM_FCD is delegated to unorm_checkFCD(); anything else
          *              sets U_ILLEGAL_ARGUMENT_ERROR
          * allowMaybe - if TRUE, a character with only a "maybe" flag makes the
          *              result UNORM_MAYBE and scanning continues; if FALSE, the
          *              text around it is normalized with _composePart() and
          *              compared with the original to decide
          * nx         - characters in this set are treated as norm32==0
          *
          * Returns UNORM_YES, UNORM_NO or UNORM_MAYBE;
          * UNORM_MAYBE is also returned on every error path.
          */
3063     UChar stackBuffer[_STACK_BUFFER_CAPACITY];
3064     UChar *buffer;
3065     int32_t bufferCapacity;
3066
3067     const UChar *start, *limit;
3068     uint32_t norm32, qcNorm32, ccOrQCMask, qcMask;
3069     int32_t options;
3070     UChar c, c2, minNoMaybe;
3071     uint8_t cc, prevCC;
3072     UNormalizationCheckResult result;
3073
3074     /* check arguments */
3075     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
3076         return UNORM_MAYBE;
3077     }
3078
3079     if(src==NULL || srcLength<-1) {
3080         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
3081         return UNORM_MAYBE;
3082     }
3083
3084     if(!_haveData(*pErrorCode)) {
3085         return UNORM_MAYBE;
3086     }
3087
3088     /* check for a valid mode and set the quick check minimum and mask */
3089     switch(mode) {
3090     case UNORM_NFC:
3091         minNoMaybe=(UChar)indexes[_NORM_INDEX_MIN_NFC_NO_MAYBE];
3092         qcMask=_NORM_QC_NFC;
3093         options=0;
3094         break;
3095     case UNORM_NFKC:
3096         minNoMaybe=(UChar)indexes[_NORM_INDEX_MIN_NFKC_NO_MAYBE];
3097         qcMask=_NORM_QC_NFKC;
3098         options=_NORM_OPTIONS_COMPAT;
3099         break;
3100     case UNORM_NFD:
3101         minNoMaybe=(UChar)indexes[_NORM_INDEX_MIN_NFD_NO_MAYBE];
3102         qcMask=_NORM_QC_NFD;
3103         options=0;
3104         break;
3105     case UNORM_NFKD:
3106         minNoMaybe=(UChar)indexes[_NORM_INDEX_MIN_NFKD_NO_MAYBE];
3107         qcMask=_NORM_QC_NFKD;
3108         options=_NORM_OPTIONS_COMPAT;
3109         break;
3110     case UNORM_FCD:
             /* FCD has its own check function and never yields "maybe" */
3111         if(fcdTrie.index==NULL) {
3112             *pErrorCode=U_UNSUPPORTED_ERROR;
3113             return UNORM_MAYBE;
3114         }
3115         return unorm_checkFCD(src, srcLength, nx) ? UNORM_YES : UNORM_NO;
3116     default:
3117         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
3118         return UNORM_MAYBE;
3119     }
3120
3121     /* initialize */
3122     buffer=stackBuffer;
3123     bufferCapacity=_STACK_BUFFER_CAPACITY;
3124
3125     ccOrQCMask=_NORM_CC_MASK|qcMask;
3126     result=UNORM_YES;
3127     prevCC=0;
3128
3129     start=src;
3130     if(srcLength>=0) {
3131         /* string with length */
3132         limit=src+srcLength;
3133     } else /* srcLength==-1 */ {
3134         /* zero-terminated string */
3135         limit=NULL;
3136     }
3137
3138     U_ALIGN_CODE(16);
3139
3140     for(;;) {
3141         /* skip a run of code units below the minimum or with irrelevant data for the quick check */
3142         if(limit==NULL) {
3143             for(;;) {
3144                 c=*src++;
3145                 if(c<minNoMaybe) {
3146                     if(c==0) {
3147                         goto endloop; /* break out of outer loop */
3148                     }
3149                 } else if(((norm32=_getNorm32(c))&ccOrQCMask)!=0) {
3150                     break;
3151                 }
                     /* a skipped code unit has no relevant cc bits, so it resets the order check */
3152                 prevCC=0;
3153             }
3154         } else {
3155             for(;;) {
3156                 if(src==limit) {
3157                     goto endloop; /* break out of outer loop */
3158                 } else if((c=*src++)>=minNoMaybe && ((norm32=_getNorm32(c))&ccOrQCMask)!=0) {
3159                     break;
3160                 }
3161                 prevCC=0;
3162             }
3163         }
3164
3165         /* check one above-minimum, relevant code unit */
3166         if(isNorm32LeadSurrogate(norm32)) {
3167             /* c is a lead surrogate, get the real norm32 */
3168             if(src!=limit && UTF_IS_SECOND_SURROGATE(c2=*src)) {
3169                 ++src;
3170                 norm32=_getNorm32FromSurrogatePair(norm32, c2);
3171             } else {
                     /* unpaired lead surrogate: no normalization data */
3172                 c2=0;
3173                 norm32=0;
3174             }
3175         } else {
3176             c2=0;
3177         }
3178
3179         if(nx_contains(nx, c, c2)) {
3180             /* excluded: norm32==0 */
3181             norm32=0;
3182         }
3183
3184         /* check the combining order */
3185         cc=(uint8_t)(norm32>>_NORM_CC_SHIFT);
3186         if(cc!=0 && cc<prevCC) {
3187             result=UNORM_NO;
3188             break;
3189         }
3190         prevCC=cc;
3191
3192         /* check for "no" or "maybe" quick check flags */
3193         qcNorm32=norm32&qcMask;
3194         if(qcNorm32&_NORM_QC_ANY_NO) {
3195             result=UNORM_NO;
3196             break;
3197         } else if(qcNorm32!=0) {
3198             /* "maybe" can only occur for NFC and NFKC */
3199             if(allowMaybe) {
                     /* keep scanning: a later "no" still overrides this result */
3200                 result=UNORM_MAYBE;
3201             } else {
3202                 /* normalize a section around here to see if it is really normalized or not */
3203                 const UChar *prevStarter;
3204                 uint32_t decompQCMask;
3205                 int32_t length;
3206
3207                 decompQCMask=(qcMask<<2)&0xf; /* decomposition quick check mask */
3208
3209                 /* find the previous starter */
3210                 prevStarter=src-1; /* set prevStarter to the beginning of the current character */
3211                 if(UTF_IS_TRAIL(*prevStarter)) {
3212                     --prevStarter; /* safe because unpaired surrogates do not result in "maybe" */
3213                 }
3214                 prevStarter=_findPreviousStarter(start, prevStarter, ccOrQCMask, decompQCMask, minNoMaybe);
3215
3216                 /* find the next true starter in [src..limit[ - modifies src to point to the next starter */
3217                 src=_findNextStarter(src, limit, qcMask, decompQCMask, minNoMaybe);
3218
3219                 /* decompose and recompose [prevStarter..src[ */
                     /*
                      * presumably _composePart() sets length and may replace buffer with a
                      * heap allocation: length is read below without another assignment,
                      * and the cleanup at endloop frees buffer when it is not stackBuffer
                      */
3220                 _composePart(stackBuffer, buffer, bufferCapacity,
3221                              length,
3222                              prevStarter,
3223                              src,
3224                              prevCC,
3225                              options, nx, pErrorCode);
3226                 if(U_FAILURE(*pErrorCode)) {
3227                     result=UNORM_MAYBE; /* error (out of memory) */
3228                     break;
3229                 }
3230
3231                 /* compare the normalized version with the original */
3232                 if(0!=uprv_strCompare(prevStarter, (int32_t)(src-prevStarter), buffer, length, FALSE, FALSE)) {
3233                     result=UNORM_NO; /* normalization differs */
3234                     break;
3235                 }
3236
3237                 /* continue after the next starter */
3238             }
3239         }
3240     }
3241 endloop:
3242
         /* reached from the goto at the end of the text and from every break out of the outer loop */
3243     if(buffer!=stackBuffer) {
3244         uprv_free(buffer);
3245     }
3246
3247     return result;
3248 }
3249
3250 U_CAPI UNormalizationCheckResult U_EXPORT2
3251 unorm_quickCheck(const UChar *src, int32_t srcLength,
3252                  UNormalizationMode mode, UErrorCode *pErrorCode) {
3253     /* public quick check: "maybe" results are allowed and no exclusion set is applied */
3254     const UnicodeSet *const noExclusions=NULL;
3255     return _quickCheck(src, srcLength, mode, TRUE, noExclusions, pErrorCode);
3256 }
3257
3258 U_CAPI UNormalizationCheckResult U_EXPORT2
3259 unorm_quickCheckWithOptions(const UChar *src, int32_t srcLength,
3260                             UNormalizationMode mode, int32_t options,
3261                             UErrorCode *pErrorCode) {
         /*
          * Quick check with the exclusion set that getNX() returns for options;
          * "maybe" results are allowed (allowMaybe is TRUE).
          *
          * pErrorCode may be NULL: it is not dereferenced for getNX() in that case.
          * NULL is passed as the set instead, and _quickCheck() rejects the NULL
          * error code itself by returning UNORM_MAYBE.
          */
3262     return _quickCheck(src, srcLength, mode, TRUE, pErrorCode!=NULL ? getNX(options, *pErrorCode) : NULL, pErrorCode);
3263 }
3264
3265 U_CFUNC UNormalizationCheckResult
3266 unorm_internalQuickCheck(const UChar *src, int32_t srcLength,
3267                          UNormalizationMode mode, UBool allowMaybe,
3268                          const UnicodeSet *nx,
3269                          UErrorCode *pErrorCode) {
3270     /* library-internal entry point: forwards every argument unchanged to _quickCheck() */
3271     const UNormalizationCheckResult outcome=_quickCheck(src, srcLength, mode, allowMaybe, nx, pErrorCode);
3272     return outcome;
3273 }
3274
3275 U_CAPI UBool U_EXPORT2
3276 unorm_isNormalized(const UChar *src, int32_t srcLength,
3277                    UNormalizationMode mode, UErrorCode *pErrorCode) {
3278     /* allowMaybe is FALSE and no exclusion set is used; only UNORM_YES counts as normalized */
3279     return (UBool)(_quickCheck(src, srcLength, mode, FALSE, NULL, pErrorCode)==UNORM_YES);
3280 }
3281
3282 U_CAPI UBool U_EXPORT2
3283 unorm_isNormalizedWithOptions(const UChar *src, int32_t srcLength,
3284                               UNormalizationMode mode, int32_t options,
3285                               UErrorCode *pErrorCode) {
         /*
          * Returns TRUE only if _quickCheck() answers UNORM_YES for src, using the
          * exclusion set that getNX() returns for options and allowMaybe==FALSE.
          *
          * pErrorCode may be NULL: it is not dereferenced for getNX() in that case.
          * NULL is passed as the set instead, _quickCheck() rejects the NULL error
          * code with UNORM_MAYBE, and this function therefore returns FALSE.
          */
3286     return (UBool)(UNORM_YES==_quickCheck(src, srcLength, mode, FALSE, pErrorCode!=NULL ? getNX(options, *pErrorCode) : NULL, pErrorCode));
3287 }
3288
3289 /* normalize() API ---------------------------------------------------------- */
3290
3291 /**
3292  * Internal normalization worker: dispatches on mode to _decompose(), _compose(),
3293  * unorm_makeFCD() or a plain copy, using an exclusion set that the caller already looked up.
3294  * Performs no argument checking and requires _haveData() to be true.
3295  * @internal
3296  */
3297 U_CFUNC int32_t
3298 unorm_internalNormalizeWithNX(UChar *dest, int32_t destCapacity,
3299                               const UChar *src, int32_t srcLength,
3300                               UNormalizationMode mode, int32_t options, const UnicodeSet *nx,
3301                               UErrorCode *pErrorCode) {
3302     int32_t resultLength;
3303     uint8_t lastCC; /* handed to _decompose(); its value is not read afterwards */
3304
3305     switch(mode) {
3306     case UNORM_NFD:
3307     case UNORM_NFKD:
3308         /* both decomposition modes share one call; only NFKD passes compat as TRUE */
3309         resultLength=_decompose(dest, destCapacity,
3310                                 src, srcLength,
3311                                 (UBool)(mode==UNORM_NFKD), nx, lastCC);
3312         break;
3313     case UNORM_NFC:
3314     case UNORM_NFKC:
3315         /* both composition modes share one call; NFKC adds the compat option bit first */
3316         if(mode==UNORM_NFKC) {
3317             options|=_NORM_OPTIONS_COMPAT;
3318         }
3319         resultLength=_compose(dest, destCapacity,
3320                               src, srcLength,
3321                               options, nx, pErrorCode);
3322         break;
3323     case UNORM_FCD:
3324         /* FCD needs the FCD trie; the result of unorm_makeFCD() is returned as is */
3325         if(fcdTrie.index==NULL) {
3326             *pErrorCode=U_UNSUPPORTED_ERROR;
3327             return 0;
3328         }
3329         return unorm_makeFCD(dest, destCapacity,
3330                              src, srcLength,
3331                              nx,
3332                              pErrorCode);
3333 #if 0
3334     case UNORM_FCC:
3335         resultLength=_compose(dest, destCapacity,
3336                               src, srcLength,
3337                               options|_NORM_OPTIONS_COMPOSE_CONTIGUOUS, nx, pErrorCode);
3338         break;
3339 #endif
3340     case UNORM_NONE:
3341         /* no normalization requested: copy the text when it fits, report its length either way */
3342         if(srcLength==-1) {
3343             srcLength=u_strlen(src);
3344         }
3345         if(0<srcLength && srcLength<=destCapacity) {
3346             uprv_memcpy(dest, src, srcLength*U_SIZEOF_UCHAR);
3347         }
3348         resultLength=srcLength;
3349         break;
3350     default:
3351         /* mode is none of the values handled above */
3352         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
3353         return 0;
3354     }
3355
3356     /* shared exit for the modes that did not return above */
3357     return u_terminateUChars(dest, destCapacity, resultLength, pErrorCode);
3358 }
3359
3360 /**
3361  * Internal normalization entry point without argument validation:
3362  * fetches the exclusion set for options and delegates to unorm_internalNormalizeWithNX().
3363  * @internal
3364  */
3365 U_CAPI int32_t U_EXPORT2
3366 unorm_internalNormalize(UChar *dest, int32_t destCapacity,
3367                         const UChar *src, int32_t srcLength,
3368                         UNormalizationMode mode, int32_t options,
3369                         UErrorCode *pErrorCode) {
3370     const int32_t reservedBits=
3371         _NORM_OPTIONS_SETS_MASK|_NORM_OPTIONS_COMPAT|_NORM_OPTIONS_COMPOSE_CONTIGUOUS;
3372     const UnicodeSet *exclusions;
3373
3374     if(_haveData(*pErrorCode)) {
3375         /* getNX() sees the caller's options before any bits are stripped */
3376         exclusions=getNX(options, *pErrorCode);
3377         if(!U_FAILURE(*pErrorCode)) {
3378             /* the reserved bits may only be set inside unorm_internalNormalizeWithNX() */
3379             return unorm_internalNormalizeWithNX(dest, destCapacity,
3380                                                  src, srcLength,
3381                                                  mode, options&~reservedBits, exclusions,
3382                                                  pErrorCode);
3383         }
3384     }
3385
3386     /* _haveData() was false, or getNX() left a failure in *pErrorCode */
3387     return 0;
3388 }
3389
3390 /** Public normalization API: validates the arguments, then calls unorm_internalNormalize(). */
3391 U_CAPI int32_t U_EXPORT2
3392 unorm_normalize(const UChar *src, int32_t srcLength,
3393                 UNormalizationMode mode, int32_t options,
3394                 UChar *dest, int32_t destCapacity,
3395                 UErrorCode *pErrorCode) {
3396     UBool badArgs;
3397
3398     /* nothing to do without an error code or after an earlier failure */
3399     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
3400         return 0;
3401     }
3402
3403     /* a source is required, lengths must be sane, and a NULL dest is only allowed with capacity 0 */
3404     badArgs=(UBool)(src==NULL || srcLength<-1 || destCapacity<0 || (dest==NULL && destCapacity>0));
3405
3406     /* src must not start inside dest's buffer; dest must not start inside a counted src */
3407     if(!badArgs && dest!=NULL) {
3408         if(src>=dest && src<(dest+destCapacity)) {
3409             badArgs=TRUE;
3410         } else if(srcLength>0 && dest>=src && dest<(src+srcLength)) {
3411             badArgs=TRUE;
3412         }
3413     }
3414
3415     if(badArgs) {
3416         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
3417         return 0;
3418     }
3419
3420     return unorm_internalNormalize(dest, destCapacity, src, srcLength, mode, options, pErrorCode);
3421 }
3422
3423
3424 /* iteration functions ------------------------------------------------------ */
3425
3426 /*
3427  * These iteration functions are the core implementations of the
3428  * Normalizer class iteration API.
3429  * They read from a UCharIterator into their own buffer
3430  * and normalize into the Normalizer iteration buffer.
3431  * Normalizer itself then iterates over its buffer until that needs to be
3432  * filled again.
3433  */
3434
3435 /*
3436  * ### TODO:
3437  * Now that UCharIterator.next/previous return (int32_t)-1 not (UChar)0xffff
3438  * if iteration bounds are reached,
3439  * try to not call hasNext/hasPrevious and instead check for >=0.
3440  */
3441
3442 /* backward iteration ------------------------------------------------------- */
3443
3444 /*
3445  * read backwards and get norm32
3446  * return 0 if the character is <minC
3447  * if c2!=0 then (c2, c) is a surrogate pair (reversed - c2 is first surrogate but read second!)
      *
      * 0 is also returned for unpaired surrogates and for surrogate pairs whose
      * lead surrogate has no norm32 bits in mask.
      * src.previous() is called without a check (see the note in the body);
      * hasPrevious() is only consulted before reading a possible lead surrogate.
      * On return the iterator is positioned before the code point reported in (c, c2).
3448  */
3449 static inline uint32_t
3450 _getPrevNorm32(UCharIterator &src, uint32_t minC, uint32_t mask, UChar &c, UChar &c2) {
3451     uint32_t norm32;
3452
3453     /* need src.hasPrevious() */
3454     c=(UChar)src.previous(&src);
3455     c2=0;
3456
3457     /* check for a surrogate before getting norm32 to see if we need to predecrement further */
3458     if(c<minC) {
3459         return 0;
3460     } else if(!UTF_IS_SURROGATE(c)) {
3461         return _getNorm32(c);
3462     } else if(UTF_IS_SURROGATE_FIRST(c) || !src.hasPrevious(&src)) {
3463         /* unpaired surrogate */
3464         return 0;
3465     } else if(UTF_IS_FIRST_SURROGATE(c2=(UChar)src.previous(&src))) {
             /* c is a trail surrogate and c2, read second, is its lead surrogate */
3466         norm32=_getNorm32(c2);
3467         if((norm32&mask)==0) {
3468             /* all surrogate pairs with this lead surrogate have irrelevant data */
3469             return 0;
3470         } else {
3471             /* norm32 must be a surrogate special */
3472             return _getNorm32FromSurrogatePair(norm32, c);
3473         }
3474     } else {
3475         /* unpaired second surrogate, undo the c2=src.previous() movement */
3476         src.move(&src, 1, UITER_CURRENT);
3477         c2=0;
3478         return 0;
3479     }
3480 }
3481
3482 /*
3483  * read backwards and check if the character is a previous-iteration boundary
3484  * if c2!=0 then (c2, c) is a surrogate pair (reversed - c2 is first surrogate but read second!)
3485  */
3486 typedef UBool
3487 IsPrevBoundaryFn(UCharIterator &src, uint32_t minC, uint32_t mask, UChar &c, UChar &c2);
3488
3489 /*
3490  * for NF*D:
3491  * read backwards and check if the lead combining class is 0
3492  * if c2!=0 then (c2, c) is a surrogate pair (reversed - c2 is first surrogate but read second!)
3493  */
3494 static UBool
3495 _isPrevNFDSafe(UCharIterator &src, uint32_t minC, uint32_t ccOrQCMask, UChar &c, UChar &c2) {
3496     return _isNFDSafe(_getPrevNorm32(src, minC, ccOrQCMask, c, c2), ccOrQCMask, ccOrQCMask&_NORM_QC_MASK);
3497 }
3498
3499 /*
3500  * read backwards and check if the character is (or its decomposition begins with)
3501  * a "true starter" (cc==0 and NF*C_YES)
3502  * if c2!=0 then (c2, c) is a surrogate pair (reversed - c2 is first surrogate but read second!)
3503  */
3504 static UBool
3505 _isPrevTrueStarter(UCharIterator &src, uint32_t minC, uint32_t ccOrQCMask, UChar &c, UChar &c2) {
3506     uint32_t norm32, decompQCMask;
3507
3508     decompQCMask=(ccOrQCMask<<2)&0xf; /* decomposition quick check mask */
3509     norm32=_getPrevNorm32(src, minC, ccOrQCMask|decompQCMask, c, c2);
3510     return _isTrueStarter(norm32, ccOrQCMask, decompQCMask);
3511 }
3512
3513 static int32_t
3514 _findPreviousIterationBoundary(UCharIterator &src,
3515                                IsPrevBoundaryFn *isPrevBoundary, uint32_t minC, uint32_t mask,
3516                                UChar *&buffer, int32_t &bufferCapacity,
3517                                int32_t &startIndex,
3518                                UErrorCode *pErrorCode) {
3519     UChar *stackBuffer;
3520     UChar c, c2;
3521     UBool isBoundary;
3522
3523     /* initialize */
3524     stackBuffer=buffer;
3525     startIndex=bufferCapacity; /* fill the buffer from the end backwards */
3526
3527     while(src.hasPrevious(&src)) {
3528         isBoundary=isPrevBoundary(src, minC, mask, c, c2);
3529
3530         /* always write this character to the front of the buffer */
3531         /* make sure there is enough space in the buffer */
3532         if(startIndex < (c2==0 ? 1 : 2)) {
3533             int32_t bufferLength=bufferCapacity;
3534
3535             if(!u_growBufferFromStatic(stackBuffer, &buffer, &bufferCapacity, 2*bufferCapacity, bufferLength)) {
3536                 *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
3537                 src.move(&src, 0, UITER_START);
3538                 return 0;
3539             }
3540
3541             /* move the current buffer contents up */
3542             uprv_memmove(buffer+(bufferCapacity-bufferLength), buffer, bufferLength*U_SIZEOF_UCHAR);
3543             startIndex+=bufferCapacity-bufferLength;
3544         }
3545
3546         buffer[--startIndex]=c;
3547         if(c2!=0) {
3548             buffer[--startIndex]=c2;
3549         }
3550
3551         /* stop if this just-copied character is a boundary */
3552         if(isBoundary) {
3553             break;
3554         }
3555     }
3556
3557     /* return the length of the buffer contents */
3558     return bufferCapacity-startIndex;
3559 }
3560
3561 U_CAPI int32_t U_EXPORT2
3562 unorm_previous(UCharIterator *src,
3563                UChar *dest, int32_t destCapacity,
3564                UNormalizationMode mode, int32_t options,
3565                UBool doNormalize, UBool *pNeededToNormalize,
3566                UErrorCode *pErrorCode) {
3567     UChar stackBuffer[100];
3568     UChar *buffer=NULL;
3569     IsPrevBoundaryFn *isPreviousBoundary=NULL;
3570     uint32_t mask=0;
3571     int32_t startIndex=0, bufferLength=0, bufferCapacity=0, destLength=0;
3572     int32_t c=0, c2=0;
3573     UChar minC=0;
3574
3575     /* check argument values */
3576     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
3577         return 0;
3578     }
3579
3580     if( destCapacity<0 || (dest==NULL && destCapacity>0) ||
3581         src==NULL
3582     ) {
3583         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
3584         return 0;
3585     }
3586
3587     if(!_haveData(*pErrorCode)) {
3588         return 0;
3589     }
3590
3591     if(pNeededToNormalize!=NULL) {
3592         *pNeededToNormalize=FALSE;
3593     }
3594
3595     switch(mode) {
3596     case UNORM_FCD:
3597         if(fcdTrie.index==NULL) {
3598             *pErrorCode=U_UNSUPPORTED_ERROR;
3599             return 0;
3600         }
3601         /* fall through to NFD */
3602     case UNORM_NFD:
3603         isPreviousBoundary=_isPrevNFDSafe;
3604         minC=_NORM_MIN_WITH_LEAD_CC;
3605         mask=_NORM_CC_MASK|_NORM_QC_NFD;
3606         break;
3607     case UNORM_NFKD:
3608         isPreviousBoundary=_isPrevNFDSafe;
3609         minC=_NORM_MIN_WITH_LEAD_CC;
3610         mask=_NORM_CC_MASK|_NORM_QC_NFKD;
3611         break;
3612     case UNORM_NFC:
3613         isPreviousBoundary=_isPrevTrueStarter;
3614         minC=(UChar)indexes[_NORM_INDEX_MIN_NFC_NO_MAYBE];
3615         mask=_NORM_CC_MASK|_NORM_QC_NFC;
3616         break;
3617     case UNORM_NFKC:
3618         isPreviousBoundary=_isPrevTrueStarter;
3619         minC=(UChar)indexes[_NORM_INDEX_MIN_NFKC_NO_MAYBE];
3620         mask=_NORM_CC_MASK|_NORM_QC_NFKC;
3621         break;
3622     case UNORM_NONE:
3623         destLength=0;
3624         if((c=src->previous(src))>=0) {
3625             destLength=1;
3626             if(UTF_IS_TRAIL(c) && (c2=src->previous(src))>=0) {
3627                 if(UTF_IS_LEAD(c2)) {
3628                     if(destCapacity>=2) {
3629                         dest[1]=(UChar)c; /* trail surrogate */
3630                         destLength=2;
3631                     }
3632                     c=c2; /* lead surrogate to be written below */
3633                 } else {
3634                     src->move(src, 1, UITER_CURRENT);
3635                 }
3636             }
3637
3638             if(destCapacity>0) {
3639                 dest[0]=(UChar)c;
3640             }
3641         }
3642         return u_terminateUChars(dest, destCapacity, destLength, pErrorCode);
3643     default:
3644         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
3645         return 0;
3646     }
3647
3648     buffer=stackBuffer;
3649     bufferCapacity=(int32_t)(sizeof(stackBuffer)/U_SIZEOF_UCHAR);
3650     bufferLength=_findPreviousIterationBoundary(*src,
3651                                                 isPreviousBoundary, minC, mask,
3652                                                 buffer, bufferCapacity,
3653                                                 startIndex,
3654                                                 pErrorCode);
3655     if(bufferLength>0) {
3656         if(doNormalize) {
3657             destLength=unorm_internalNormalize(dest, destCapacity,
3658                                                buffer+startIndex, bufferLength,
3659                                                mode, options,
3660                                                pErrorCode);
3661             if(pNeededToNormalize!=0 && U_SUCCESS(*pErrorCode)) {
3662                 *pNeededToNormalize=
3663                     (UBool)(destLength!=bufferLength ||
3664                             0!=uprv_memcmp(dest, buffer+startIndex, destLength*U_SIZEOF_UCHAR));
3665             }
3666         } else {
3667             /* just copy the source characters */
3668             if(destCapacity>0) {
3669                 uprv_memcpy(dest, buffer+startIndex, uprv_min(bufferLength, destCapacity)*U_SIZEOF_UCHAR);
3670             }
3671             destLength=u_terminateUChars(dest, destCapacity, bufferLength, pErrorCode);
3672         }
3673     } else {
3674         destLength=u_terminateUChars(dest, destCapacity, 0, pErrorCode);
3675     }
3676
3677     /* cleanup */
3678     if(buffer!=stackBuffer) {
3679         uprv_free(buffer);
3680     }
3681
3682     return destLength;
3683 }
3684
3685 /* forward iteration -------------------------------------------------------- */
3686
3687 /*
3688  * read forward and get norm32
3689  * return 0 if the character is <minC
3690  * if c2!=0 then (c2, c) is a surrogate pair
3691  * always reads complete characters
3692  */
3693 static inline uint32_t
3694 _getNextNorm32(UCharIterator &src, uint32_t minC, uint32_t mask, UChar &c, UChar &c2) {
3695     uint32_t norm32;
3696
3697     /* need src.hasNext() to be true */
3698     c=(UChar)src.next(&src);
3699     c2=0;
3700
3701     if(c<minC) {
3702         return 0;
3703     }
3704
3705     norm32=_getNorm32(c);
3706     if(UTF_IS_FIRST_SURROGATE(c)) {
3707         if(src.hasNext(&src) && UTF_IS_SECOND_SURROGATE(c2=(UChar)src.current(&src))) {
3708             src.move(&src, 1, UITER_CURRENT); /* skip the c2 surrogate */
3709             if((norm32&mask)==0) {
3710                 /* irrelevant data */
3711                 return 0;
3712             } else {
3713                 /* norm32 must be a surrogate special */
3714                 return _getNorm32FromSurrogatePair(norm32, c2);
3715             }
3716         } else {
3717             /* unmatched surrogate */
3718             c2=0;
3719             return 0;
3720         }
3721     }
3722     return norm32;
3723 }
3724
3725 /*
3726  * read forward and check if the character is a next-iteration boundary
3727  * if c2!=0 then (c, c2) is a surrogate pair
3728  */
3729 typedef UBool
3730 IsNextBoundaryFn(UCharIterator &src, uint32_t minC, uint32_t mask, UChar &c, UChar &c2);
3731
3732 /*
3733  * for NF*D:
3734  * read forward and check if the lead combining class is 0
3735  * if c2!=0 then (c, c2) is a surrogate pair
3736  */
3737 static UBool
3738 _isNextNFDSafe(UCharIterator &src, uint32_t minC, uint32_t ccOrQCMask, UChar &c, UChar &c2) {
3739     return _isNFDSafe(_getNextNorm32(src, minC, ccOrQCMask, c, c2), ccOrQCMask, ccOrQCMask&_NORM_QC_MASK);
3740 }
3741
3742 /*
3743  * for NF*C:
3744  * read forward and check if the character is (or its decomposition begins with)
3745  * a "true starter" (cc==0 and NF*C_YES)
3746  * if c2!=0 then (c, c2) is a surrogate pair
3747  */
3748 static UBool
3749 _isNextTrueStarter(UCharIterator &src, uint32_t minC, uint32_t ccOrQCMask, UChar &c, UChar &c2) {
3750     uint32_t norm32, decompQCMask;
3751
3752     decompQCMask=(ccOrQCMask<<2)&0xf; /* decomposition quick check mask */
3753     norm32=_getNextNorm32(src, minC, ccOrQCMask|decompQCMask, c, c2);
3754     return _isTrueStarter(norm32, ccOrQCMask, decompQCMask);
3755 }
3756
3757 static int32_t
3758 _findNextIterationBoundary(UCharIterator &src,
3759                            IsNextBoundaryFn *isNextBoundary, uint32_t minC, uint32_t mask,
3760                            UChar *&buffer, int32_t &bufferCapacity,
3761                            UErrorCode *pErrorCode) {
3762     UChar *stackBuffer;
3763     int32_t bufferIndex;
3764     UChar c, c2;
3765
3766     if(!src.hasNext(&src)) {
3767         return 0;
3768     }
3769
3770     /* initialize */
3771     stackBuffer=buffer;
3772
3773     /* get one character and ignore its properties */
3774     buffer[0]=c=(UChar)src.next(&src);
3775     bufferIndex=1;
3776     if(UTF_IS_FIRST_SURROGATE(c) && src.hasNext(&src)) {
3777         if(UTF_IS_SECOND_SURROGATE(c2=(UChar)src.next(&src))) {
3778             buffer[bufferIndex++]=c2;
3779         } else {
3780             src.move(&src, -1, UITER_CURRENT); /* back out the non-trail-surrogate */
3781         }
3782     }
3783
3784     /* get all following characters until we see a boundary */
3785     /* checking hasNext() instead of c!=DONE on the off-chance that U+ffff is part of the string */
3786     while(src.hasNext(&src)) {
3787         if(isNextBoundary(src, minC, mask, c, c2)) {
3788             /* back out the latest movement to stop at the boundary */
3789             src.move(&src, c2==0 ? -1 : -2, UITER_CURRENT);
3790             break;
3791         } else {
3792             if(bufferIndex+(c2==0 ? 1 : 2)<=bufferCapacity ||
3793                 /* attempt to grow the buffer */
3794                 u_growBufferFromStatic(stackBuffer, &buffer, &bufferCapacity,
3795                                        2*bufferCapacity,
3796                                        bufferIndex)
3797             ) {
3798                 buffer[bufferIndex++]=c;
3799                 if(c2!=0) {
3800                     buffer[bufferIndex++]=c2;
3801                 }
3802             } else {
3803                 *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
3804                 src.move(&src, 0, UITER_LIMIT);
3805                 return 0;
3806             }
3807         }
3808     }
3809
3810     /* return the length of the buffer contents */
3811     return bufferIndex;
3812 }
3813
3814 U_CAPI int32_t U_EXPORT2
3815 unorm_next(UCharIterator *src,
3816            UChar *dest, int32_t destCapacity,
3817            UNormalizationMode mode, int32_t options,
3818            UBool doNormalize, UBool *pNeededToNormalize,
3819            UErrorCode *pErrorCode) {
3820     UChar stackBuffer[100];
3821     UChar *buffer;
3822     IsNextBoundaryFn *isNextBoundary;
3823     uint32_t mask;
3824     int32_t bufferLength, bufferCapacity, destLength;
3825     int32_t c, c2;
3826     UChar minC;
3827
3828     /* check argument values */
3829     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
3830         return 0;
3831     }
3832
3833     if( destCapacity<0 || (dest==NULL && destCapacity>0) ||
3834         src==NULL
3835     ) {
3836         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
3837         return 0;
3838     }
3839
3840     if(!_haveData(*pErrorCode)) {
3841         return 0;
3842     }
3843
3844     if(pNeededToNormalize!=NULL) {
3845         *pNeededToNormalize=FALSE;
3846     }
3847
3848     switch(mode) {
3849     case UNORM_FCD:
3850         if(fcdTrie.index==NULL) {
3851             *pErrorCode=U_UNSUPPORTED_ERROR;
3852             return 0;
3853         }
3854         /* fall through to NFD */
3855     case UNORM_NFD:
3856         isNextBoundary=_isNextNFDSafe;
3857         minC=_NORM_MIN_WITH_LEAD_CC;
3858         mask=_NORM_CC_MASK|_NORM_QC_NFD;
3859         break;
3860     case UNORM_NFKD:
3861         isNextBoundary=_isNextNFDSafe;
3862         minC=_NORM_MIN_WITH_LEAD_CC;
3863         mask=_NORM_CC_MASK|_NORM_QC_NFKD;
3864         break;
3865     case UNORM_NFC:
3866         isNextBoundary=_isNextTrueStarter;
3867         minC=(UChar)indexes[_NORM_INDEX_MIN_NFC_NO_MAYBE];
3868         mask=_NORM_CC_MASK|_NORM_QC_NFC;
3869         break;
3870     case UNORM_NFKC:
3871         isNextBoundary=_isNextTrueStarter;
3872         minC=(UChar)indexes[_NORM_INDEX_MIN_NFKC_NO_MAYBE];
3873         mask=_NORM_CC_MASK|_NORM_QC_NFKC;
3874         break;
3875     case UNORM_NONE:
3876         destLength=0;
3877         if((c=src->next(src))>=0) {
3878             destLength=1;
3879             if(UTF_IS_LEAD(c) && (c2=src->next(src))>=0) {
3880                 if(UTF_IS_TRAIL(c2)) {
3881                     if(destCapacity>=2) {
3882                         dest[1]=(UChar)c2; /* trail surrogate */
3883                         destLength=2;
3884                     }
3885                     /* lead surrogate to be written below */
3886                 } else {
3887                     src->move(src, -1, UITER_CURRENT);
3888                 }
3889             }
3890
3891             if(destCapacity>0) {
3892                 dest[0]=(UChar)c;
3893             }
3894         }
3895         return u_terminateUChars(dest, destCapacity, destLength, pErrorCode);
3896     default:
3897         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
3898         return 0;
3899     }
3900
3901     buffer=stackBuffer;
3902     bufferCapacity=(int32_t)(sizeof(stackBuffer)/U_SIZEOF_UCHAR);
3903     bufferLength=_findNextIterationBoundary(*src,
3904                                             isNextBoundary, minC, mask,
3905                                             buffer, bufferCapacity,
3906                                             pErrorCode);
3907     if(bufferLength>0) {
3908         if(doNormalize) {
3909             destLength=unorm_internalNormalize(dest, destCapacity,
3910                                                buffer, bufferLength,
3911                                                mode, options,
3912                                                pErrorCode);
3913             if(pNeededToNormalize!=0 && U_SUCCESS(*pErrorCode)) {
3914                 *pNeededToNormalize=
3915                     (UBool)(destLength!=bufferLength ||
3916                             0!=uprv_memcmp(dest, buffer, destLength*U_SIZEOF_UCHAR));
3917             }
3918         } else {
3919             /* just copy the source characters */
3920             if(destCapacity>0) {
3921                 uprv_memcpy(dest, buffer, uprv_min(bufferLength, destCapacity)*U_SIZEOF_UCHAR);
3922             }
3923             destLength=u_terminateUChars(dest, destCapacity, bufferLength, pErrorCode);
3924         }
3925     } else {
3926         destLength=u_terminateUChars(dest, destCapacity, 0, pErrorCode);
3927     }
3928
3929     /* cleanup */
3930     if(buffer!=stackBuffer) {
3931         uprv_free(buffer);
3932     }
3933
3934     return destLength;
3935 }
3936
3937 /*
3938  * ### TODO: check if NF*D and FCD iteration finds optimal boundaries
3939  * and if not, how hard it would be to improve it.
3940  * For example, see _findSafeFCD().
3941  */
3942
3943 /* Concatenation of normalized strings -------------------------------------- */
3944
3945 U_CAPI int32_t U_EXPORT2
3946 unorm_concatenate(const UChar *left, int32_t leftLength,
3947                   const UChar *right, int32_t rightLength,
3948                   UChar *dest, int32_t destCapacity,
3949                   UNormalizationMode mode, int32_t options,
3950                   UErrorCode *pErrorCode) {
3951     UChar stackBuffer[100];
3952     UChar *buffer;
3953     int32_t bufferLength, bufferCapacity;
3954
3955     UCharIterator iter;
3956     int32_t leftBoundary, rightBoundary, destLength;
3957
3958     /* check argument values */
3959     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
3960         return 0;
3961     }
3962
3963     if( destCapacity<0 || (dest==NULL && destCapacity>0) ||
3964         left==NULL || leftLength<-1 ||
3965         right==NULL || rightLength<-1
3966     ) {
3967         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
3968         return 0;
3969     }
3970
3971     /* check for overlapping right and destination */
3972     if( dest!=NULL &&
3973         ((right>=dest && right<(dest+destCapacity)) ||
3974          (rightLength>0 && dest>=right && dest<(right+rightLength)))
3975     ) {
3976         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
3977         return 0;
3978     }
3979
3980     /* allow left==dest */
3981
3982     /* set up intermediate buffer */
3983     buffer=stackBuffer;
3984     bufferCapacity=(int32_t)(sizeof(stackBuffer)/U_SIZEOF_UCHAR);
3985
3986     /*
3987      * Input: left[0..leftLength[ + right[0..rightLength[
3988      *
3989      * Find normalization-safe boundaries leftBoundary and rightBoundary
3990      * and copy the end parts together:
3991      * buffer=left[leftBoundary..leftLength[ + right[0..rightBoundary[
3992      *
3993      * dest=left[0..leftBoundary[ +
3994      *      normalize(buffer) +
3995      *      right[rightBoundary..rightLength[
3996      */
3997
3998     /*
3999      * find a normalization boundary at the end of the left string
4000      * and copy the end part into the buffer
4001      */
4002     uiter_setString(&iter, left, leftLength);
4003     iter.index=leftLength=iter.length; /* end of left string */
4004
4005     bufferLength=unorm_previous(&iter, buffer, bufferCapacity,
4006                                 mode, options,
4007                                 FALSE, NULL,
4008                                 pErrorCode);
4009     leftBoundary=iter.index;
4010     if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
4011         *pErrorCode=U_ZERO_ERROR;
4012         if(!u_growBufferFromStatic(stackBuffer, &buffer, &bufferCapacity, 2*bufferLength, 0)) {
4013             *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
4014             /* dont need to cleanup here since
4015              * u_growBufferFromStatic frees buffer if(buffer!=stackBuffer)
4016              */
4017             return 0;
4018         }
4019
4020         /* just copy from the left string: we know the boundary already */
4021         uprv_memcpy(buffer, left+leftBoundary, bufferLength*U_SIZEOF_UCHAR);
4022     }
4023
4024     /*
4025      * find a normalization boundary at the beginning of the right string
4026      * and concatenate the beginning part to the buffer
4027      */
4028     uiter_setString(&iter, right, rightLength);
4029     rightLength=iter.length; /* in case it was -1 */
4030
4031     rightBoundary=unorm_next(&iter, buffer+bufferLength, bufferCapacity-bufferLength,
4032                              mode, options,
4033                              FALSE, NULL,
4034                              pErrorCode);
4035     if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
4036         *pErrorCode=U_ZERO_ERROR;
4037         if(!u_growBufferFromStatic(stackBuffer, &buffer, &bufferCapacity, bufferLength+rightBoundary, 0)) {
4038             *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
4039             /* dont need to cleanup here since
4040              * u_growBufferFromStatic frees buffer if(buffer!=stackBuffer)
4041              */
4042             return 0;
4043         }
4044
4045         /* just copy from the right string: we know the boundary already */
4046         uprv_memcpy(buffer+bufferLength, right, rightBoundary*U_SIZEOF_UCHAR);
4047     }
4048
4049     bufferLength+=rightBoundary;
4050
4051     /* copy left[0..leftBoundary[ to dest */
4052     if(left!=dest && leftBoundary>0 && destCapacity>0) {
4053         uprv_memcpy(dest, left, uprv_min(leftBoundary, destCapacity)*U_SIZEOF_UCHAR);
4054     }
4055     destLength=leftBoundary;
4056
4057     /* concatenate the normalization of the buffer to dest */
4058     if(destCapacity>destLength) {
4059         destLength+=unorm_internalNormalize(dest+destLength, destCapacity-destLength,
4060                                             buffer, bufferLength,
4061                                             mode, options,
4062                                             pErrorCode);
4063     } else {
4064         destLength+=unorm_internalNormalize(NULL, 0,
4065                                             buffer, bufferLength,
4066                                             mode, options,
4067                                             pErrorCode);
4068     }
4069     /*
4070      * only errorCode that is expected is a U_BUFFER_OVERFLOW_ERROR
4071      * so we dont check for the error code here..just let it pass through
4072      */
4073     /* concatenate right[rightBoundary..rightLength[ to dest */
4074     right+=rightBoundary;
4075     rightLength-=rightBoundary;
4076     if(rightLength>0 && destCapacity>destLength) {
4077         uprv_memcpy(dest+destLength, right, uprv_min(rightLength, destCapacity-destLength)*U_SIZEOF_UCHAR);
4078     }
4079     destLength+=rightLength;
4080
4081     /* cleanup */
4082     if(buffer!=stackBuffer) {
4083         uprv_free(buffer);
4084     }
4085
4086     return u_terminateUChars(dest, destCapacity, destLength, pErrorCode);
4087 }
4088
4089 #endif /* #if !UCONFIG_NO_NORMALIZATION */