icuSources/common/unorm.cpp

   1 /*
   2 ******************************************************************************
   3 * Copyright (c) 1996-2004, International Business Machines
   4 * Corporation and others. All Rights Reserved.
   5 ******************************************************************************
   6 * File unorm.cpp
   7 *
   8 * Created by: Vladimir Weinstein 12052000
   9 *
  10 * Modification history :
  11 *
  12 * Date        Name        Description
  13 * 02/01/01    synwee      Added normalization quickcheck enum and method.
   14 * 02/12/01    synwee      Commented out quickcheck until api has been approved
  15 *                         Added private method for doing FCD checks
  16 * 02/23/01    synwee      Modified quickcheck and checkFCE to run through
  17 *                         string for codepoints < 0x300 for the normalization
  18 *                         mode NFC.
  19 * 05/25/01+   Markus Scherer total rewrite, implement all normalization here
  20 *                         instead of just wrappers around normlzr.cpp,
  21 *                         load unorm.dat, support Unicode 3.1 with
  22 *                         supplementary code points, etc.
  23 */
  24
  25 #include "unicode/utypes.h"
  26
  27 #if !UCONFIG_NO_NORMALIZATION
  28
  29 #include "unicode/udata.h"
  30 #include "unicode/uchar.h"
  31 #include "unicode/ustring.h"
  32 #include "unicode/uiter.h"
  33 #include "unicode/uniset.h"
  34 #include "unicode/usetiter.h"
  35 #include "unicode/unorm.h"
  36 #include "ucln_cmn.h"
  37 #include "unormimp.h"
  38 #include "ucase.h"
  39 #include "cmemory.h"
  40 #include "umutex.h"
  41 #include "utrie.h"
  42 #include "unicode/uset.h"
  43 #include "udataswp.h"
  44 #include "putilimp.h"
  45
  46 /*
  47  * Status of tailored normalization
  48  *
  49  * This was done initially for investigation on Unicode public review issue 7
  50  * (http://www.unicode.org/review/). See Jitterbug 2481.
  51  * While the UTC at meeting #94 (2003mar) did not take up the issue, this is
  52  * a permanent feature in ICU 2.6 in support of IDNA which requires true
  53  * Unicode 3.2 normalization.
  54  * (NormalizationCorrections are rolled into IDNA mapping tables.)
  55  *
   56  * Tailored normalization as implemented here allows callers to "normalize less"
   57  * than full Unicode normalization would.
  58  * Based internally on a UnicodeSet of code points that are
  59  * "excluded from normalization", the normalization functions leave those
  60  * code points alone ("inert"). This means that tailored normalization
  61  * still transforms text into a canonically equivalent form.
  62  * It does not add decompositions to code points that do not have any or
  63  * change decomposition results.
  64  *
  65  * Any function that searches for a safe boundary has not been touched,
  66  * which means that these functions will be over-pessimistic when
  67  * exclusions are applied.
  68  * This should not matter because subsequent checks and normalizations
  69  * do apply the exclusions; only a little more of the text may be processed
  70  * than necessary under exclusions.
  71  *
  72  * Normalization exclusions have the following effect on excluded code points c:
  73  * - c is not decomposed
  74  * - c is not a composition target
  75  * - c does not combine forward or backward for composition
  76  *   except that this is not implemented for Jamo
  77  * - c is treated as having a combining class of 0
  78  */
   /*
    * Number of elements of a true array (not a pointer), cast to int32_t.
    * The argument only appears inside sizeof, so it is not evaluated at runtime.
    */
   79 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
  80
  81 /*
  82  * This new implementation of the normalization code loads its data from
  83  * unorm.dat, which is generated with the gennorm tool.
  84  * The format of that file is described in unormimp.h .
  85  */
  86
  87 /* -------------------------------------------------------------------------- */
  88
   89 enum {
          /* NOTE(review): presumably the UChar capacity of on-stack work buffers; the users are outside this part of the file - confirm */
   90     _STACK_BUFFER_CAPACITY=100
   91 };
  92
  93 /*
  94  * Constants for the bit fields in the options bit set parameter.
  95  * These need not be public.
  96  * A user only needs to know the currently assigned values.
  97  * The number and positions of reserved bits per field can remain private
  98  * and may change in future implementations.
  99  */
  100 enum {
          /* bits 0..4: internalGetNX() tests these to see whether any set other than a Unicode-version set is requested */
  101     _NORM_OPTIONS_NX_MASK=0x1f,
          /* bits 5..6: selects a Unicode version set, see internalGetNXUnicode() */
  102     _NORM_OPTIONS_UNICODE_MASK=0x60,
          /* union of the two masks above; the masked options value is the index into nxCache[] */
  103     _NORM_OPTIONS_SETS_MASK=0x7f,
  104
          /* position of the lowest bit of _NORM_OPTIONS_UNICODE_MASK */
  105     _NORM_OPTIONS_UNICODE_SHIFT=5,
  106
  107     /*
  108      * The following options are used only in some composition functions.
  109      * They use bits 12 and up to preserve lower bits for the available options
  110      * space in unorm_compare() -
  111      * see documentation for UNORM_COMPARE_NORM_OPTIONS_SHIFT.
  112      */
  113
  114     /** Options bit 12, for compatibility vs. canonical decomposition. */
  115     _NORM_OPTIONS_COMPAT=0x1000,
  116     /** Options bit 13, no discontiguous composition (FCC vs. NFC). */
  117     _NORM_OPTIONS_COMPOSE_CONTIGUOUS=0x2000
  118 };
 119
 120 static inline UBool
 121 isHangulWithoutJamoT(UChar c) {
 122     c-=HANGUL_BASE;
 123     return c<HANGUL_COUNT && c%JAMO_T_COUNT==0;
 124 }
 125
 126 /* norm32 helpers */
 127
 128 /* is this a norm32 with a regular index? */
 129 static inline UBool
 130 isNorm32Regular(uint32_t norm32) {
 131     return norm32<_NORM_MIN_SPECIAL;
 132 }
 133
 134 /* is this a norm32 with a special index for a lead surrogate? */
 135 static inline UBool
 136 isNorm32LeadSurrogate(uint32_t norm32) {
 137     return _NORM_MIN_SPECIAL<=norm32 && norm32<_NORM_SURROGATES_TOP;
 138 }
 139
 140 /* is this a norm32 with a special index for a Hangul syllable or a Jamo? */
 141 static inline UBool
 142 isNorm32HangulOrJamo(uint32_t norm32) {
 143     return norm32>=_NORM_MIN_HANGUL;
 144 }
 145
 146 /*
 147  * Given isNorm32HangulOrJamo(),
 148  * is this a Hangul syllable or a Jamo?
 149  */
 150 static inline UBool
 151 isHangulJamoNorm32HangulOrJamoL(uint32_t norm32) {
 152     return norm32<_NORM_MIN_JAMO_V;
 153 }
 154
 155 /*
 156  * Given norm32 for Jamo V or T,
 157  * is this a Jamo V?
 158  */
 159 static inline UBool
 160 isJamoVTNorm32JamoV(uint32_t norm32) {
 161     return norm32<_NORM_JAMO_V_TOP;
 162 }
 163
 164 /* load unorm.dat ----------------------------------------------------------- */
 165
  /* name and type of the data item that loadNormData() passes to udata_openChoice() */
  166 #define DATA_NAME "unorm"
  167 #define DATA_TYPE "icu"
  168
      /* the opened data item; set once in loadNormData(), closed in unorm_cleanup() */
  169 static UDataMemory *normData=NULL;
      /* error code remembered from the load attempt; _haveData() copies it to the caller */
  170 static UErrorCode dataErrorCode=U_ZERO_ERROR;
      /* 0: no load attempted yet, 1: data loaded, -1: the load failed */
  171 static int8_t haveNormData=0;
  172
      /* copy of the first _NORM_INDEX_TOP int32_t values of the data, made in loadNormData() */
  173 static int32_t indexes[_NORM_INDEX_TOP]={ 0 };
      /* tries unserialized from the data in loadNormData() */
  174 static UTrie normTrie={ 0,0,0,0,0,0,0 }, fcdTrie={ 0,0,0,0,0,0,0 }, auxTrie={ 0,0,0,0,0,0,0 };
  175
  176 /*
  177  * pointers into the memory-mapped unorm.icu
  178  */
  179 static const uint16_t *extraData=NULL,
  180                       *combiningTable=NULL,
  181                       *canonStartSets=NULL;
  182
      /* format version copied from the data header in isAcceptable(); the two flags are derived from it in loadNormData() */
  183 static uint8_t formatVersion[4]={ 0, 0, 0, 0 };
  184 static UBool formatVersion_2_1=FALSE, formatVersion_2_2=FALSE;
  185
  186 /* the Unicode version of the normalization data */
  187 static UVersionInfo dataVersion={ 0, 0, 0, 0 };
  188
  189 /* cache UnicodeSets for each combination of exclusion flags */
  190 static UnicodeSet *nxCache[_NORM_OPTIONS_SETS_MASK+1]={ NULL };
 191
 192 U_CDECL_BEGIN
 193
 194 static UBool U_CALLCONV
 195 unorm_cleanup() {
 196     int32_t i;
 197
 198     if(normData!=NULL) {
 199         udata_close(normData);
 200         normData=NULL;
 201     }
 202     dataErrorCode=U_ZERO_ERROR;
 203     haveNormData=0;
 204
 205     for(i=0; i<(int32_t)LENGTHOF(nxCache); ++i) {
 206         delete nxCache[i];
 207     }
 208     uprv_memset(nxCache, 0, sizeof(nxCache));
 209
 210     return TRUE;
 211 }
 212
 213 /* normTrie: 32-bit trie result may contain a special extraData index with the folding offset */
 214 static int32_t U_CALLCONV
 215 getFoldingNormOffset(uint32_t norm32) {
 216     if(isNorm32LeadSurrogate(norm32)) {
 217         return
 218             UTRIE_BMP_INDEX_LENGTH+
 219                 (((int32_t)norm32>>(_NORM_EXTRA_SHIFT-UTRIE_SURROGATE_BLOCK_BITS))&
 220                  (0x3ff<<UTRIE_SURROGATE_BLOCK_BITS));
 221     } else {
 222         return 0;
 223     }
 224 }
 225
 226 /* fcdTrie: the folding offset is the lead FCD value itself */
 227 static int32_t U_CALLCONV
 228 getFoldingFCDOffset(uint32_t data) {
 229     return (int32_t)data;
 230 }
 231
 232 /* auxTrie: the folding offset is in bits 9..0 of the 16-bit trie result */
 233 static int32_t U_CALLCONV
 234 getFoldingAuxOffset(uint32_t data) {
 235     return (int32_t)(data&_NORM_AUX_FNC_MASK)<<UTRIE_SURROGATE_BLOCK_BITS;
 236 }
 237
 238 static UBool U_CALLCONV
 239 isAcceptable(void * /* context */,
 240              const char * /* type */, const char * /* name */,
 241              const UDataInfo *pInfo) {
 242     if(
 243         pInfo->size>=20 &&
 244         pInfo->isBigEndian==U_IS_BIG_ENDIAN &&
 245         pInfo->charsetFamily==U_CHARSET_FAMILY &&
 246         pInfo->dataFormat[0]==0x4e &&   /* dataFormat="Norm" */
 247         pInfo->dataFormat[1]==0x6f &&
 248         pInfo->dataFormat[2]==0x72 &&
 249         pInfo->dataFormat[3]==0x6d &&
 250         pInfo->formatVersion[0]==2 &&
 251         pInfo->formatVersion[2]==UTRIE_SHIFT &&
 252         pInfo->formatVersion[3]==UTRIE_INDEX_SHIFT
 253     ) {
 254         uprv_memcpy(formatVersion, pInfo->formatVersion, 4);
 255         uprv_memcpy(dataVersion, pInfo->dataVersion, 4);
 256         return TRUE;
 257     } else {
 258         return FALSE;
 259     }
 260 }
 261
 262 static UBool U_CALLCONV
 263 _enumPropertyStartsRange(const void *context, UChar32 start, UChar32 /*limit*/, uint32_t /*value*/) {
 264     /* add the start code point to the USet */
 265     USetAdder *sa=(USetAdder *)context;
 266     sa->add(sa->set, start);
 267     return TRUE;
 268 }
 269
 270 U_CDECL_END
 271
  /*
   * Load the normalization data on first use and publish it in the file-level
   * statics.
   *
   * The data is opened with udata_openChoice() and its tries are unserialized
   * into locals outside the mutex. Inside the mutex they are copied into the
   * statics unless normData has already been set.
   *
   * @param errorCode in/out error code; nothing is loaded if it already
   *                  indicates a failure
   * @return haveNormData: 1 once the data is available, -1 if loading failed
   *         (the error is kept in dataErrorCode), 0 if the function returned
   *         early because of the incoming errorCode
   */
  272 static int8_t
  273 loadNormData(UErrorCode &errorCode) {
  274     /* load Unicode normalization data from file */
  275
  276     /*
  277      * This lazy initialization with double-checked locking (without mutex protection for
  278      * haveNormData==0) is transiently unsafe under certain circumstances.
  279      * Check the readme and use u_init() if necessary.
  280      *
  281      * While u_init() initializes the main normalization data via this function,
  282      * it does not do so for exclusion sets (which are fully mutexed).
  283      * This is because
  284      * - there can be many exclusion sets
  285      * - they are rarely used
  286      * - they are not usually used in execution paths that are
  287      *   as performance-sensitive as others
  288      *   (e.g., IDNA takes more time than unorm_quickCheck() anyway)
  289      */
  290     if(haveNormData==0) {
  291         UTrie _normTrie={ 0,0,0,0,0,0,0 }, _fcdTrie={ 0,0,0,0,0,0,0 }, _auxTrie={ 0,0,0,0,0,0,0 };
  292         UDataMemory *data;
  293         const int32_t *p=NULL;
  294         const uint8_t *pb;
  295
              /*
               * NOTE(review): a reference cannot be null in a well-formed program;
               * presumably this guards callers that dereferenced a NULL UErrorCode pointer - confirm
               */
  296         if(&errorCode==NULL || U_FAILURE(errorCode)) {
  297             return 0;
  298         }
  299
  300         /* open the data outside the mutex block */
  301         data=udata_openChoice(NULL, DATA_TYPE, DATA_NAME, isAcceptable, NULL, &errorCode);
  302         dataErrorCode=errorCode;
  303         if(U_FAILURE(errorCode)) {
  304             return haveNormData=-1;
  305         }
  306
              /* the data starts with _NORM_INDEX_TOP int32_t indexes, followed by the serialized normTrie */
  307         p=(const int32_t *)udata_getMemory(data);
  308         pb=(const uint8_t *)(p+_NORM_INDEX_TOP);
  309         utrie_unserialize(&_normTrie, pb, p[_NORM_INDEX_TRIE_SIZE], &errorCode);
  310         _normTrie.getFoldingOffset=getFoldingNormOffset;
  311
              /* skip the normTrie, the extra data and the combining table (16-bit units, hence *2) to reach the fcdTrie */
  312         pb+=p[_NORM_INDEX_TRIE_SIZE]+p[_NORM_INDEX_UCHAR_COUNT]*2+p[_NORM_INDEX_COMBINE_DATA_COUNT]*2;
  313         utrie_unserialize(&_fcdTrie, pb, p[_NORM_INDEX_FCD_TRIE_SIZE], &errorCode);
  314         _fcdTrie.getFoldingOffset=getFoldingFCDOffset;
  315
              /*
               * NOTE(review): this guard tests the FCD trie size, while the block
               * unserializes the aux trie with _NORM_INDEX_AUX_TRIE_SIZE -
               * confirm whether testing the aux trie size was intended
               */
  316         if(p[_NORM_INDEX_FCD_TRIE_SIZE]!=0) {
  317             pb+=p[_NORM_INDEX_FCD_TRIE_SIZE];
  318             utrie_unserialize(&_auxTrie, pb, p[_NORM_INDEX_AUX_TRIE_SIZE], &errorCode);
  319             _auxTrie.getFoldingOffset=getFoldingAuxOffset;
  320         }
  321
              /* one check covers all utrie_unserialize() calls above */
  322         if(U_FAILURE(errorCode)) {
  323             dataErrorCode=errorCode;
  324             udata_close(data);
  325             return haveNormData=-1;
  326         }
  327
  328         /* in the mutex block, set the data for this process */
  329         umtx_lock(NULL);
  330         if(normData==NULL) {
  331             normData=data;
  332             data=NULL;
  333
  334             uprv_memcpy(&indexes, p, sizeof(indexes));
  335             uprv_memcpy(&normTrie, &_normTrie, sizeof(UTrie));
  336             uprv_memcpy(&fcdTrie, &_fcdTrie, sizeof(UTrie));
  337             uprv_memcpy(&auxTrie, &_auxTrie, sizeof(UTrie));
  338         } else {
                  /* normData was already set: derive the pointers below from that copy */
  339             p=(const int32_t *)udata_getMemory(normData);
  340         }
  341
  342         /* initialize some variables */
  343         extraData=(uint16_t *)((uint8_t *)(p+_NORM_INDEX_TOP)+indexes[_NORM_INDEX_TRIE_SIZE]);
  344         combiningTable=extraData+indexes[_NORM_INDEX_UCHAR_COUNT];
  345         formatVersion_2_1=formatVersion[0]>2 || (formatVersion[0]==2 && formatVersion[1]>=1);
  346         formatVersion_2_2=formatVersion[0]>2 || (formatVersion[0]==2 && formatVersion[1]>=2);
  347         if(formatVersion_2_1) {
                  /* canonStartSets follows the combining table and both tries; the trie sizes are in bytes, hence /2 */
  348             canonStartSets=combiningTable+
  349                 indexes[_NORM_INDEX_COMBINE_DATA_COUNT]+
  350                 (indexes[_NORM_INDEX_FCD_TRIE_SIZE]+indexes[_NORM_INDEX_AUX_TRIE_SIZE])/2;
  351         }
  352         haveNormData=1;
  353         ucln_common_registerCleanup(UCLN_COMMON_UNORM, unorm_cleanup);
  354         umtx_unlock(NULL);
  355
  356         /* if a different thread set it first, then close the extra data */
  357         if(data!=NULL) {
  358             udata_close(data); /* NULL if it was set correctly */
  359         }
  360     }
  361
  362     return haveNormData;
  363 }
 364
 365 static inline UBool
 366 _haveData(UErrorCode &errorCode) {
 367     if(haveNormData!=0) {
 368         errorCode=dataErrorCode;
 369         return (UBool)(haveNormData>0);
 370     } else {
 371         return (UBool)(loadNormData(errorCode)>0);
 372     }
 373 }
 374
 375 U_CAPI UBool U_EXPORT2
 376 unorm_haveData(UErrorCode *pErrorCode) {
 377     return _haveData(*pErrorCode);
 378 }
 379
 380 U_CAPI const uint16_t * U_EXPORT2
 381 unorm_getFCDTrie(UErrorCode *pErrorCode) {
 382     if(_haveData(*pErrorCode)) {
 383         return fcdTrie.index;
 384     } else {
 385         return NULL;
 386     }
 387 }
 388
 389 /* data access primitives --------------------------------------------------- */
 390
 391 static inline uint32_t
 392 _getNorm32(UChar c) {
 393     return UTRIE_GET32_FROM_LEAD(&normTrie, c);
 394 }
 395
 396 static inline uint32_t
 397 _getNorm32FromSurrogatePair(uint32_t norm32, UChar c2) {
 398     /*
 399      * the surrogate index in norm32 stores only the number of the surrogate index block
 400      * see gennorm/store.c/getFoldedNormValue()
 401      */
 402     norm32=
 403         UTRIE_BMP_INDEX_LENGTH+
 404             ((norm32>>(_NORM_EXTRA_SHIFT-UTRIE_SURROGATE_BLOCK_BITS))&
 405              (0x3ff<<UTRIE_SURROGATE_BLOCK_BITS));
 406     return UTRIE_GET32_FROM_OFFSET_TRAIL(&normTrie, norm32, c2);
 407 }
 408
 409 /*
 410  * get a norm32 from text with complete code points
 411  * (like from decompositions)
 412  */
 413 static inline uint32_t
 414 _getNorm32(const UChar *p, uint32_t mask) {
 415     uint32_t norm32=_getNorm32(*p);
 416     if((norm32&mask) && isNorm32LeadSurrogate(norm32)) {
 417         /* *p is a lead surrogate, get the real norm32 */
 418         norm32=_getNorm32FromSurrogatePair(norm32, *(p+1));
 419     }
 420     return norm32;
 421 }
 422
 423 static inline uint16_t
 424 _getFCD16(UChar c) {
 425     return UTRIE_GET16_FROM_LEAD(&fcdTrie, c);
 426 }
 427
 428 static inline uint16_t
 429 _getFCD16FromSurrogatePair(uint16_t fcd16, UChar c2) {
 430     /* the surrogate index in fcd16 is an absolute offset over the start of stage 1 */
 431     return UTRIE_GET16_FROM_OFFSET_TRAIL(&fcdTrie, fcd16, c2);
 432 }
 433
 434 static inline const uint16_t *
 435 _getExtraData(uint32_t norm32) {
 436     return extraData+(norm32>>_NORM_EXTRA_SHIFT);
 437 }
 438
 439 /* normalization exclusion sets --------------------------------------------- */
 440
 441 /*
 442  * Normalization exclusion UnicodeSets are used for tailored normalization;
 443  * see the comment near the beginning of this file.
 444  *
 445  * By specifying one or several sets of code points,
 446  * those code points become inert for normalization.
 447  */
 448
 449 static const UnicodeSet *
 450 internalGetNXHangul(UErrorCode &errorCode) {
 451     /* internal function, does not check for incoming U_FAILURE */
 452     UBool isCached;
 453
 454     UMTX_CHECK(NULL, (UBool)(nxCache[UNORM_NX_HANGUL]!=NULL), isCached);
 455
 456     if(!isCached) {
 457         UnicodeSet *set=new UnicodeSet(0xac00, 0xd7a3);
 458         if(set==NULL) {
 459             errorCode=U_MEMORY_ALLOCATION_ERROR;
 460             return NULL;
 461         }
 462
 463         umtx_lock(NULL);
 464         if(nxCache[UNORM_NX_HANGUL]==NULL) {
 465             nxCache[UNORM_NX_HANGUL]=set;
 466             set=NULL;
 467         }
 468         umtx_unlock(NULL);
 469
 470         delete set;
 471     }
 472
 473     return nxCache[UNORM_NX_HANGUL];
 474 }
 475
 476 /* unorm.cpp 1.116 had and used
 477 static const UnicodeSet *
 478 internalGetNXFromPattern(int32_t options, const char *pattern, UErrorCode &errorCode) {
 479     ...
 480 }
 481 */
 482
 483 /* get and set an exclusion set from a serialized UnicodeSet */
 484 static const UnicodeSet *
 485 internalGetSerializedNX(int32_t options, int32_t nxIndex, UErrorCode &errorCode) {
 486     /* internal function, does not check for incoming U_FAILURE */
 487     UBool isCached;
 488
 489     UMTX_CHECK(NULL, (UBool)(nxCache[options]!=NULL), isCached);
 490
 491     if( !isCached &&
 492         canonStartSets!=NULL &&
 493         canonStartSets[nxIndex]!=0 && canonStartSets[nxIndex+1]>canonStartSets[nxIndex]
 494     ) {
 495         USerializedSet sset;
 496         UnicodeSet *set;
 497         UChar32 start, end;
 498         int32_t i;
 499
 500         if( !uset_getSerializedSet(
 501                     &sset,
 502                     canonStartSets+canonStartSets[nxIndex],
 503                     canonStartSets[nxIndex+1]-canonStartSets[nxIndex])
 504         ) {
 505             errorCode=U_INVALID_FORMAT_ERROR;
 506             return NULL;
 507         }
 508
 509         /* turn the serialized set into a UnicodeSet */
 510         set=new UnicodeSet();
 511         if(set==NULL) {
 512             errorCode=U_MEMORY_ALLOCATION_ERROR;
 513             return NULL;
 514         }
 515         for(i=0; uset_getSerializedRange(&sset, i, &start, &end); ++i) {
 516             set->add(start, end);
 517         }
 518
 519         umtx_lock(NULL);
 520         if(nxCache[options]==NULL) {
 521             nxCache[options]=set;
 522             set=NULL;
 523         }
 524         umtx_unlock(NULL);
 525
 526         delete set;
 527     }
 528
 529     return nxCache[options];
 530 }
 531
 532 static const UnicodeSet *
 533 internalGetNXCJKCompat(UErrorCode &errorCode) {
 534     /* build a set from [[:Ideographic:]&[:NFD_QC=No:]]=[CJK Ideographs]&[has canonical decomposition] */
 535     return internalGetSerializedNX(
 536                 UNORM_NX_CJK_COMPAT,
 537                 _NORM_SET_INDEX_NX_CJK_COMPAT_OFFSET,
 538                 errorCode);
 539 }
 540
 541 static const UnicodeSet *
 542 internalGetNXUnicode(uint32_t options, UErrorCode &errorCode) {
 543     /* internal function, does not check for incoming U_FAILURE */
 544     int32_t nxIndex;
 545
 546     options&=_NORM_OPTIONS_UNICODE_MASK;
 547     switch(options) {
 548     case 0:
 549         return NULL;
 550     case UNORM_UNICODE_3_2:
 551         /* [:^Age=3.2:] */
 552         nxIndex=_NORM_SET_INDEX_NX_UNICODE32_OFFSET;
 553         break;
 554     default:
 555         errorCode=U_ILLEGAL_ARGUMENT_ERROR;
 556         return NULL;
 557     }
 558
 559     /* build a set with all code points that were not designated by the specified Unicode version */
 560     return internalGetSerializedNX(options, nxIndex, errorCode);
 561 }
 562
  563 /*
       * Get a decomposition exclusion set. The data must be loaded.
       *
       * options is reduced to its _NORM_OPTIONS_SETS_MASK bits, which index nxCache[].
       * A single basic set is delegated to its own getter. A combination is built
       * as the union of the requested basic sets and then cached.
       * Does not check for an incoming failure in errorCode (getNX() does).
       * Returns NULL on failure; may also return NULL when a basic set has no data.
       */
  564 static const UnicodeSet *
  565 internalGetNX(int32_t options, UErrorCode &errorCode) {
  566     options&=_NORM_OPTIONS_SETS_MASK;
  567
  568     UBool isCached;
  569
  570     UMTX_CHECK(NULL, (UBool)(nxCache[options]!=NULL), isCached);
  571
  572     if(!isCached) {
  573         /* return basic sets */
  574         if(options==UNORM_NX_HANGUL) {
  575             return internalGetNXHangul(errorCode);
  576         }
  577         if(options==UNORM_NX_CJK_COMPAT) {
  578             return internalGetNXCJKCompat(errorCode);
  579         }
              /* only a Unicode version is selected, no other exclusion flag */
  580         if((options&_NORM_OPTIONS_UNICODE_MASK)!=0 && (options&_NORM_OPTIONS_NX_MASK)==0) {
  581             return internalGetNXUnicode(options, errorCode);
  582         }
  583
  584         /* build a set from multiple subsets */
  585         UnicodeSet *set;
  586         const UnicodeSet *other;
  587
  588         set=new UnicodeSet();
  589         if(set==NULL) {
  590             errorCode=U_MEMORY_ALLOCATION_ERROR;
  591             return NULL;
  592         }
  593
              /* each getter may set errorCode; it is checked once after all three */
  594         if((options&UNORM_NX_HANGUL)!=0 && NULL!=(other=internalGetNXHangul(errorCode))) {
  595             set->addAll(*other);
  596         }
  597         if((options&UNORM_NX_CJK_COMPAT)!=0 && NULL!=(other=internalGetNXCJKCompat(errorCode))) {
  598             set->addAll(*other);
  599         }
  600         if((options&_NORM_OPTIONS_UNICODE_MASK)!=0 && NULL!=(other=internalGetNXUnicode(options, errorCode))) {
  601             set->addAll(*other);
  602         }
  603
  604         if(U_FAILURE(errorCode)) {
  605             delete set;
  606             return NULL;
  607         }
  608
              /* publish the new set unless the slot was filled in the meantime */
  609         umtx_lock(NULL);
  610         if(nxCache[options]==NULL) {
  611             nxCache[options]=set;
  612             set=NULL;
  613         }
  614         umtx_unlock(NULL);
  615
              /* set is non-NULL here only if the slot was already filled */
  616         delete set;
  617     }
  618
  619     return nxCache[options];
  620 }
 621
 622 static inline const UnicodeSet *
 623 getNX(int32_t options, UErrorCode &errorCode) {
 624     if(U_FAILURE(errorCode) || (options&=_NORM_OPTIONS_SETS_MASK)==0) {
 625         /* incoming failure, or no decomposition exclusions requested */
 626         return NULL;
 627     } else {
 628         return internalGetNX(options, errorCode);
 629     }
 630 }
 631
 632 U_CFUNC const UnicodeSet *
 633 unorm_getNX(int32_t options, UErrorCode *pErrorCode) {
 634     return getNX(options, *pErrorCode);
 635 }
 636
 637 static inline UBool
 638 nx_contains(const UnicodeSet *nx, UChar32 c) {
 639     return nx!=NULL && nx->contains(c);
 640 }
 641
 642 static inline UBool
 643 nx_contains(const UnicodeSet *nx, UChar c, UChar c2) {
 644     return nx!=NULL && nx->contains(c2==0 ? c : U16_GET_SUPPLEMENTARY(c, c2));
 645 }
 646
 647 /* other normalization primitives ------------------------------------------- */
 648
 649 /* get the canonical or compatibility decomposition for one character */
 650 static inline const UChar *
 651 _decompose(uint32_t norm32, uint32_t qcMask, int32_t &length,
 652            uint8_t &cc, uint8_t &trailCC) {
 653     const UChar *p=(const UChar *)_getExtraData(norm32);
 654     length=*p++;
 655
 656     if((norm32&qcMask&_NORM_QC_NFKD)!=0 && length>=0x100) {
 657         /* use compatibility decomposition, skip canonical data */
 658         p+=((length>>7)&1)+(length&_NORM_DECOMP_LENGTH_MASK);
 659         length>>=8;
 660     }
 661
 662     if(length&_NORM_DECOMP_FLAG_LENGTH_HAS_CC) {
 663         /* get the lead and trail cc's */
 664         UChar bothCCs=*p++;
 665         cc=(uint8_t)(bothCCs>>8);
 666         trailCC=(uint8_t)bothCCs;
 667     } else {
 668         /* lead and trail cc's are both 0 */
 669         cc=trailCC=0;
 670     }
 671
 672     length&=_NORM_DECOMP_LENGTH_MASK;
 673     return p;
 674 }
 675
 676 /* get the canonical decomposition for one character */
 677 static inline const UChar *
 678 _decompose(uint32_t norm32, int32_t &length,
 679            uint8_t &cc, uint8_t &trailCC) {
 680     const UChar *p=(const UChar *)_getExtraData(norm32);
 681     length=*p++;
 682
 683     if(length&_NORM_DECOMP_FLAG_LENGTH_HAS_CC) {
 684         /* get the lead and trail cc's */
 685         UChar bothCCs=*p++;
 686         cc=(uint8_t)(bothCCs>>8);
 687         trailCC=(uint8_t)bothCCs;
 688     } else {
 689         /* lead and trail cc's are both 0 */
 690         cc=trailCC=0;
 691     }
 692
 693     length&=_NORM_DECOMP_LENGTH_MASK;
 694     return p;
 695 }
 696
 697 /**
 698  * Get the canonical decomposition for one code point.
 699  * @param c code point
 700  * @param buffer out-only buffer for algorithmic decompositions of Hangul
 701  * @param length out-only, takes the length of the decomposition, if any
 702  * @return pointer to decomposition, or 0 if none
 703  * @internal
 704  */
 705 U_CFUNC const UChar *
 706 unorm_getCanonicalDecomposition(UChar32 c, UChar buffer[4], int32_t *pLength) {
 707     uint32_t norm32;
 708
 709     if(c<indexes[_NORM_INDEX_MIN_NFD_NO_MAYBE]) {
 710         /* trivial case */
 711         return NULL;
 712     }
 713
 714     UTRIE_GET32(&normTrie, c, norm32);
 715     if(norm32&_NORM_QC_NFD) {
 716         if(isNorm32HangulOrJamo(norm32)) {
 717             /* Hangul syllable: decompose algorithmically */
 718             UChar c2;
 719
 720             c-=HANGUL_BASE;
 721
 722             c2=(UChar)(c%JAMO_T_COUNT);
 723             c/=JAMO_T_COUNT;
 724             if(c2>0) {
 725                 buffer[2]=(UChar)(JAMO_T_BASE+c2);
 726                 *pLength=3;
 727             } else {
 728                 *pLength=2;
 729             }
 730
 731             buffer[1]=(UChar)(JAMO_V_BASE+c%JAMO_V_COUNT);
 732             buffer[0]=(UChar)(JAMO_L_BASE+c/JAMO_V_COUNT);
 733             return buffer;
 734         } else {
 735             /* normal decomposition */
 736             uint8_t cc, trailCC;
 737             return _decompose(norm32, *pLength, cc, trailCC);
 738         }
 739     } else {
 740         return 0;
 741     }
 742 }
 743
 744 /*
 745  * get the combining class of (c, c2)=*p++
 746  * before: p<limit  after: p<=limit
 747  * if only one code unit is used, then c2==0
 748  */
 749 static inline uint8_t
 750 _getNextCC(const UChar *&p, const UChar *limit, UChar &c, UChar &c2) {
 751     uint32_t norm32;
 752
 753     c=*p++;
 754     norm32=_getNorm32(c);
 755     if((norm32&_NORM_CC_MASK)==0) {
 756         c2=0;
 757         return 0;
 758     } else {
 759         if(!isNorm32LeadSurrogate(norm32)) {
 760             c2=0;
 761         } else {
 762             /* c is a lead surrogate, get the real norm32 */
 763             if(p!=limit && UTF_IS_SECOND_SURROGATE(c2=*p)) {
 764                 ++p;
 765                 norm32=_getNorm32FromSurrogatePair(norm32, c2);
 766             } else {
 767                 c2=0;
 768                 return 0;
 769             }
 770         }
 771
 772         return (uint8_t)(norm32>>_NORM_CC_SHIFT);
 773     }
 774 }
 775
  776 /*
  777  * read backwards and get norm32
  778  * return 0 if the character is <minC
  779  * if c2!=0 then (c2, c) is a surrogate pair (reversed - c2 is first surrogate but read second!)
       *
       * start  beginning of the text; src is never moved before it
       * src    in/out: decremented past the code unit(s) that were read
       * minC   code units below this value yield 0 without a trie lookup
       * mask   norm32 bits of interest; if the lead surrogate's norm32 has none
       *        of them, 0 is returned for the whole pair
       * c, c2  out: the code unit(s) read, as described above
       *
       * Unpaired surrogates yield 0.
  780  */
  781 static inline uint32_t
  782 _getPrevNorm32(const UChar *start, const UChar *&src,
  783                uint32_t minC, uint32_t mask,
  784                UChar &c, UChar &c2) {
  785     uint32_t norm32;
  786
  787     c=*--src;
  788     c2=0;
  789
  790     /* check for a surrogate before getting norm32 to see if we need to predecrement further */
  791     if(c<minC) {
  792         return 0;
  793     } else if(!UTF_IS_SURROGATE(c)) {
  794         return _getNorm32(c);
  795     } else if(UTF_IS_SURROGATE_FIRST(c)) {
  796         /* unpaired first surrogate */
  797         return 0;
          /* c is a second surrogate; c2 is assigned inside the condition and reset below if it is not a first surrogate */
  798     } else if(src!=start && UTF_IS_FIRST_SURROGATE(c2=*(src-1))) {
  799         --src;
  800         norm32=_getNorm32(c2);
  801
  802         if((norm32&mask)==0) {
  803             /* all surrogate pairs with this lead surrogate have only irrelevant data */
  804             return 0;
  805         } else {
  806             /* norm32 must be a surrogate special */
  807             return _getNorm32FromSurrogatePair(norm32, c);
  808         }
  809     } else {
  810         /* unpaired second surrogate */
  811         c2=0;
  812         return 0;
  813     }
  814 }
 815
 816 /*
 817  * get the combining class of (c, c2)=*--p
 818  * before: start<p  after: start<=p
 819  */
 820 static inline uint8_t
 821 _getPrevCC(const UChar *start, const UChar *&p) {
 822     UChar c, c2;
 823
 824     return (uint8_t)(_getPrevNorm32(start, p, _NORM_MIN_WITH_LEAD_CC, _NORM_CC_MASK, c, c2)>>_NORM_CC_SHIFT);
 825 }
 826
 827 /*
 828  * is this a safe boundary character for NF*D?
 829  * (lead cc==0)
 830  */
 831 static inline UBool
 832 _isNFDSafe(uint32_t norm32, uint32_t ccOrQCMask, uint32_t decompQCMask) {
 833     if((norm32&ccOrQCMask)==0) {
 834         return TRUE; /* cc==0 and no decomposition: this is NF*D safe */
 835     }
 836
 837     /* inspect its decomposition - maybe a Hangul but not a surrogate here */
 838     if(isNorm32Regular(norm32) && (norm32&decompQCMask)!=0) {
 839         int32_t length;
 840         uint8_t cc, trailCC;
 841
 842         /* decomposes, get everything from the variable-length extra data */
 843         _decompose(norm32, decompQCMask, length, cc, trailCC);
 844         return cc==0;
 845     } else {
 846         /* no decomposition (or Hangul), test the cc directly */
 847         return (norm32&_NORM_CC_MASK)==0;
 848     }
 849 }
 850
 851 /*
 852  * is this (or does its decomposition begin with) a "true starter"?
 853  * (cc==0 and NF*C_YES)
 854  */
 855 static inline UBool
 856 _isTrueStarter(uint32_t norm32, uint32_t ccOrQCMask, uint32_t decompQCMask) {
 857     if((norm32&ccOrQCMask)==0) {
 858         return TRUE; /* this is a true starter (could be Hangul or Jamo L) */
 859     }
 860
 861     /* inspect its decomposition - not a Hangul or a surrogate here */
 862     if((norm32&decompQCMask)!=0) {
 863         const UChar *p;
 864         int32_t length;
 865         uint8_t cc, trailCC;
 866
 867         /* decomposes, get everything from the variable-length extra data */
 868         p=_decompose(norm32, decompQCMask, length, cc, trailCC);
 869         if(cc==0) {
 870             uint32_t qcMask=ccOrQCMask&_NORM_QC_MASK;
 871
 872             /* does it begin with NFC_YES? */
 873             if((_getNorm32(p, qcMask)&qcMask)==0) {
 874                 /* yes, the decomposition begins with a true starter */
 875                 return TRUE;
 876             }
 877         }
 878     }
 879     return FALSE;
 880 }
 881
 882 /* uchar.h */
 883 U_CAPI uint8_t U_EXPORT2
 884 u_getCombiningClass(UChar32 c) {
 885     UErrorCode errorCode=U_ZERO_ERROR;
 886     if(_haveData(errorCode)) {
 887         uint32_t norm32;
 888
 889         UTRIE_GET32(&normTrie, c, norm32);
 890         return (uint8_t)(norm32>>_NORM_CC_SHIFT);
 891     } else {
 892         return 0;
 893     }
 894 }
 895
 896 U_CAPI UBool U_EXPORT2
 897 unorm_internalIsFullCompositionExclusion(UChar32 c) {
 898     UErrorCode errorCode=U_ZERO_ERROR;
 899     if(_haveData(errorCode) && formatVersion_2_1) {
 900         uint16_t aux;
 901
 902         UTRIE_GET16(&auxTrie, c, aux);
 903         return (UBool)((aux&_NORM_AUX_COMP_EX_MASK)!=0);
 904     } else {
 905         return FALSE;
 906     }
 907 }
 908
 909 U_CAPI UBool U_EXPORT2
 910 unorm_isCanonSafeStart(UChar32 c) {
 911     UErrorCode errorCode=U_ZERO_ERROR;
 912     if(_haveData(errorCode) && formatVersion_2_1) {
 913         uint16_t aux;
 914
 915         UTRIE_GET16(&auxTrie, c, aux);
 916         return (UBool)((aux&_NORM_AUX_UNSAFE_MASK)==0);
 917     } else {
 918         return FALSE;
 919     }
 920 }
 921
 922 U_CAPI void U_EXPORT2
 923 unorm_getUnicodeVersion(UVersionInfo *versionInfo, UErrorCode *pErrorCode){
 924     if(unorm_haveData(pErrorCode)){
 925         uprv_memcpy(*versionInfo, dataVersion, 4);
 926     }
 927 }
 928
 929
 930 U_CAPI UBool U_EXPORT2
 931 unorm_getCanonStartSet(UChar32 c, USerializedSet *fillSet) {
 932     UErrorCode errorCode=U_ZERO_ERROR;
 933     if( fillSet!=NULL && (uint32_t)c<=0x10ffff &&
 934         _haveData(errorCode) && canonStartSets!=NULL
 935     ) {
 936         const uint16_t *table;
 937         int32_t i, start, limit;
 938
 939         /*
 940          * binary search for c
 941          *
 942          * There are two search tables,
 943          * one for BMP code points and one for supplementary ones.
 944          * See unormimp.h for details.
 945          */
 946         if(c<=0xffff) {
 947             table=canonStartSets+canonStartSets[_NORM_SET_INDEX_CANON_SETS_LENGTH];
 948             start=0;
 949             limit=canonStartSets[_NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH];
 950
 951             /* each entry is a pair { c, result } */
 952             while(start<limit-2) {
 953                 i=(uint16_t)(((start+limit)/4)*2); /* (start+limit)/2 and address pairs */
 954                 if(c<table[i]) {
 955                     limit=i;
 956                 } else {
 957                     start=i;
 958                 }
 959             }
 960
 961             /* found? */
 962             if(c==table[start]) {
 963                 i=table[start+1];
 964                 if((i&_NORM_CANON_SET_BMP_MASK)==_NORM_CANON_SET_BMP_IS_INDEX) {
 965                     /* result 01xxxxxx xxxxxx contains index x to a USerializedSet */
 966                     i&=(_NORM_MAX_CANON_SETS-1);
 967                     return uset_getSerializedSet(fillSet,
 968                                             canonStartSets+i,
 969                                             canonStartSets[_NORM_SET_INDEX_CANON_SETS_LENGTH]-i);
 970                 } else {
 971                     /* other result values are BMP code points for single-code point sets */
 972                     uset_setSerializedToOne(fillSet, (UChar32)i);
 973                     return TRUE;
 974                 }
 975             }
 976         } else {
 977             uint16_t high, low, h;
 978
 979             table=canonStartSets+canonStartSets[_NORM_SET_INDEX_CANON_SETS_LENGTH]+
 980                                  canonStartSets[_NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH];
 981             start=0;
 982             limit=canonStartSets[_NORM_SET_INDEX_CANON_SUPP_TABLE_LENGTH];
 983
 984             high=(uint16_t)(c>>16);
 985             low=(uint16_t)c;
 986
 987             /* each entry is a triplet { high(c), low(c), result } */
 988             while(start<limit-3) {
 989                 i=(uint16_t)(((start+limit)/6)*3); /* (start+limit)/2 and address triplets */
 990                 h=table[i]&0x1f; /* high word */
 991                 if(high<h || (high==h && low<table[i+1])) {
 992                     limit=i;
 993                 } else {
 994                     start=i;
 995                 }
 996             }
 997
 998             /* found? */
 999             h=table[start];
1000             if(high==(h&0x1f) && low==table[start+1]) {
1001                 i=table[start+2];
1002                 if((h&0x8000)==0) {
1003                     /* the result is an index to a USerializedSet */
1004                     return uset_getSerializedSet(fillSet,
1005                                             canonStartSets+i,
1006                                             canonStartSets[_NORM_SET_INDEX_CANON_SETS_LENGTH]-i);
1007                 } else {
1008                     /*
1009                      * single-code point set {x} in
1010                      * triplet { 100xxxxx 000hhhhh  llllllll llllllll  xxxxxxxx xxxxxxxx }
1011                      */
1012                     i|=((int32_t)h&0x1f00)<<8; /* add high bits from high(c) */
1013                     uset_setSerializedToOne(fillSet, (UChar32)i);
1014                     return TRUE;
1015                 }
1016             }
1017         }
1018     }
1019
1020     return FALSE; /* not found */
1021 }
1022
1023 U_CAPI int32_t U_EXPORT2
1024 u_getFC_NFKC_Closure(UChar32 c, UChar *dest, int32_t destCapacity, UErrorCode *pErrorCode) {
1025     uint16_t aux;
1026
1027     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
1028         return 0;
1029     }
1030     if(destCapacity<0 || (dest==NULL && destCapacity>0)) {
1031         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1032         return 0;
1033     }
1034     if(!_haveData(*pErrorCode) || !formatVersion_2_1) {
1035         return 0;
1036     }
1037
1038     UTRIE_GET16(&auxTrie, c, aux);
1039     aux&=_NORM_AUX_FNC_MASK;
1040     if(aux!=0) {
1041         const UChar *s;
1042         int32_t length;
1043
1044         s=(const UChar *)(extraData+aux);
1045         if(*s<0xff00) {
1046             /* s points to the single-unit string */
1047             length=1;
1048         } else {
1049             length=*s&0xff;
1050             ++s;
1051         }
1052         if(0<length && length<=destCapacity) {
1053             uprv_memcpy(dest, s, length*U_SIZEOF_UCHAR);
1054         }
1055         return u_terminateUChars(dest, destCapacity, length, pErrorCode);
1056     } else {
1057         return u_terminateUChars(dest, destCapacity, 0, pErrorCode);
1058     }
1059 }
1060
1061 /* Is c an NF<mode>-skippable code point? See unormimp.h. */
1062 U_CAPI UBool U_EXPORT2
1063 unorm_isNFSkippable(UChar32 c, UNormalizationMode mode) {
1064     UErrorCode errorCode;
1065     uint32_t norm32, mask;
1066     uint16_t aux, fcd;
1067
1068     errorCode=U_ZERO_ERROR;
1069     if(!_haveData(errorCode)) {
1070         return FALSE;
1071     }
1072
1073     /* handle trivial cases; set the comparison mask for the normal ones */
1074     switch(mode) {
1075     case UNORM_NONE:
1076         return TRUE;
1077     case UNORM_NFD:
1078         mask=_NORM_CC_MASK|_NORM_QC_NFD;
1079         break;
1080     case UNORM_NFKD:
1081         mask=_NORM_CC_MASK|_NORM_QC_NFKD;
1082         break;
1083     case UNORM_NFC:
1084     /* case UNORM_FCC: */
1085         mask=_NORM_CC_MASK|_NORM_COMBINES_ANY|(_NORM_QC_NFC&_NORM_QC_ANY_NO);
1086         break;
1087     case UNORM_NFKC:
1088         mask=_NORM_CC_MASK|_NORM_COMBINES_ANY|(_NORM_QC_NFKC&_NORM_QC_ANY_NO);
1089         break;
1090     case UNORM_FCD:
1091         /* FCD: skippable if lead cc==0 and trail cc<=1 */
1092         UTRIE_GET16(&fcdTrie, c, fcd);
1093         return fcd<=1;
1094     default:
1095         return FALSE;
1096     }
1097
1098     /* check conditions (a)..(e), see unormimp.h */
1099     UTRIE_GET32(&normTrie, c, norm32);
1100     if((norm32&mask)!=0) {
1101         return FALSE; /* fails (a)..(e), not skippable */
1102     }
1103
1104     if(mode<UNORM_NFC) {
1105         return TRUE; /* NF*D, passed (a)..(c), is skippable */
1106     }
1107
1108     /* NF*C/FCC, passed (a)..(e) */
1109     if((norm32&_NORM_QC_NFD)==0) {
1110         return TRUE; /* no canonical decomposition, is skippable */
1111     }
1112
1113     /* check Hangul syllables algorithmically */
1114     if(isNorm32HangulOrJamo(norm32)) {
1115         /* Jamo passed (a)..(e) above, must be Hangul */
1116         return !isHangulWithoutJamoT((UChar)c); /* LVT are skippable, LV are not */
1117     }
1118
1119     /* if(mode<=UNORM_NFKC) { -- enable when implementing FCC */
1120     /* NF*C, test (f) flag */
1121     if(!formatVersion_2_2) {
1122         return FALSE; /* no (f) data, say not skippable to be safe */
1123     }
1124
1125     UTRIE_GET16(&auxTrie, c, aux);
1126     return (aux&_NORM_AUX_NFC_SKIP_F_MASK)==0; /* TRUE=skippable if the (f) flag is not set */
1127
1128     /* } else { FCC, test fcd<=1 instead of the above } */
1129 }
1130
1131 U_CAPI void U_EXPORT2
1132 unorm_addPropertyStarts(USetAdder *sa, UErrorCode *pErrorCode) {
1133     UChar c;
1134
1135     if(U_FAILURE(*pErrorCode) || !_haveData(*pErrorCode)) {
1136         return;
1137     }
1138
1139     /* add the start code point of each same-value range of each trie */
1140     utrie_enum(&normTrie, NULL, _enumPropertyStartsRange, sa);
1141     utrie_enum(&fcdTrie, NULL, _enumPropertyStartsRange, sa);
1142     if(formatVersion_2_1) {
1143         utrie_enum(&auxTrie, NULL, _enumPropertyStartsRange, sa);
1144     }
1145
1146     /* add Hangul LV syllables and LV+1 because of skippables */
1147     for(c=HANGUL_BASE; c<HANGUL_BASE+HANGUL_COUNT; c+=JAMO_T_COUNT) {
1148         sa->add(sa->set, c);
1149         sa->add(sa->set, c+1);
1150     }
1151     sa->add(sa->set, HANGUL_BASE+HANGUL_COUNT); /* add Hangul+1 to continue with other properties */
1152 }
1153
1154 U_CAPI UNormalizationCheckResult U_EXPORT2
1155 unorm_getQuickCheck(UChar32 c, UNormalizationMode mode) {
1156     static const uint32_t qcMask[UNORM_MODE_COUNT]={
1157         0, 0, _NORM_QC_NFD, _NORM_QC_NFKD, _NORM_QC_NFC, _NORM_QC_NFKC
1158     };
1159
1160     UErrorCode errorCode;
1161     uint32_t norm32;
1162
1163     errorCode=U_ZERO_ERROR;
1164     if(!_haveData(errorCode)) {
1165         return UNORM_YES;
1166     }
1167
1168     UTRIE_GET32(&normTrie, c, norm32);
1169     norm32&=qcMask[mode];
1170
1171     if(norm32==0) {
1172         return UNORM_YES;
1173     } else if(norm32&_NORM_QC_ANY_NO) {
1174         return UNORM_NO;
1175     } else /* _NORM_QC_ANY_MAYBE */ {
1176         return UNORM_MAYBE;
1177     }
1178 }
1179
1180 U_CAPI uint16_t U_EXPORT2
1181 unorm_getFCD16FromCodePoint(UChar32 c) {
1182     UErrorCode errorCode;
1183     uint16_t fcd;
1184
1185     errorCode=U_ZERO_ERROR;
1186     if(!_haveData(errorCode)) {
1187         return 0;
1188     }
1189
1190     UTRIE_GET16(&fcdTrie, c, fcd);
1191     return fcd;
1192 }
1193
1194 /* reorder UTF-16 in-place -------------------------------------------------- */
1195
1196 /*
1197  * simpler, single-character version of _mergeOrdered() -
1198  * bubble-insert one single code point into the preceding string
1199  * which is already canonically ordered
1200  * (c, c2) may or may not yet have been inserted at [current..p[
1201  *
1202  * it must be p=current+lengthof(c, c2) i.e. p=current+(c2==0 ? 1 : 2)
1203  *
1204  * before: [start..current[ is already ordered, and
1205  *         [current..p[     may or may not hold (c, c2) but
1206  *                          must be exactly the same length as (c, c2)
1207  * after: [start..p[ is ordered
1208  *
1209  * returns the trailing combining class
1210  */
1211 static uint8_t
1212 _insertOrdered(const UChar *start, UChar *current, UChar *p,
1213                UChar c, UChar c2, uint8_t cc) {
1214     const UChar *pBack, *pPreBack;
1215     UChar *r;
1216     uint8_t prevCC, trailCC=cc;
1217
1218     if(start<current && cc!=0) {
1219         /* search for the insertion point where cc>=prevCC */
1220         pPreBack=pBack=current;
1221         prevCC=_getPrevCC(start, pPreBack);
1222         if(cc<prevCC) {
1223             /* this will be the last code point, so keep its cc */
1224             trailCC=prevCC;
1225             pBack=pPreBack;
1226             while(start<pPreBack) {
1227                 prevCC=_getPrevCC(start, pPreBack);
1228                 if(cc>=prevCC) {
1229                     break;
1230                 }
1231                 pBack=pPreBack;
1232             }
1233
1234             /*
1235              * this is where we are right now with all these pointers:
1236              * [start..pPreBack[ 0..? code points that we can ignore
1237              * [pPreBack..pBack[ 0..1 code points with prevCC<=cc
1238              * [pBack..current[  0..n code points with >cc, move up to insert (c, c2)
1239              * [current..p[         1 code point (c, c2) with cc
1240              */
1241
1242             /* move the code units in between up */
1243             r=p;
1244             do {
1245                 *--r=*--current;
1246             } while(pBack!=current);
1247         }
1248     }
1249
1250     /* insert (c, c2) */
1251     *current=c;
1252     if(c2!=0) {
1253         *(current+1)=c2;
1254     }
1255
1256     /* we know the cc of the last code point */
1257     return trailCC;
1258 }
1259
1260 /*
1261  * merge two UTF-16 string parts together
1262  * to canonically order (order by combining classes) their concatenation
1263  *
1264  * the two strings may already be adjacent, so that the merging is done in-place
1265  * if the two strings are not adjacent, then the buffer holding the first one
1266  * must be large enough
1267  * the second string may or may not be ordered in itself
1268  *
1269  * before: [start..current[ is already ordered, and
1270  *         [next..limit[    may be ordered in itself, but
1271  *                          is not in relation to [start..current[
1272  * after: [start..current+(limit-next)[ is ordered
1273  *
1274  * the algorithm is a simple bubble-sort that takes the characters from *next++
1275  * and inserts them in correct combining class order into the preceding part
1276  * of the string
1277  *
1278  * since this function is called much less often than the single-code point
1279  * _insertOrdered(), it just uses that for easier maintenance
1280  * (see file version from before 2001aug31 for a more optimized version)
1281  *
1282  * returns the trailing combining class
1283  */
1284 static uint8_t
1285 _mergeOrdered(UChar *start, UChar *current,
1286               const UChar *next, const UChar *limit, UBool isOrdered=TRUE) {
1287     UChar *r;
1288     UChar c, c2;
1289     uint8_t cc, trailCC=0;
1290     UBool adjacent;
1291
1292     adjacent= current==next;
1293
1294     if(start!=current || !isOrdered) {
1295         while(next<limit) {
1296             cc=_getNextCC(next, limit, c, c2);
1297             if(cc==0) {
1298                 /* does not bubble back */
1299                 trailCC=0;
1300                 if(adjacent) {
1301                     current=(UChar *)next;
1302                 } else {
1303                     *current++=c;
1304                     if(c2!=0) {
1305                         *current++=c2;
1306                     }
1307                 }
1308                 if(isOrdered) {
1309                     break;
1310                 } else {
1311                     start=current;
1312                 }
1313             } else {
1314                 r=current+(c2==0 ? 1 : 2);
1315                 trailCC=_insertOrdered(start, current, r, c, c2, cc);
1316                 current=r;
1317             }
1318         }
1319     }
1320
1321     if(next==limit) {
1322         /* we know the cc of the last code point */
1323         return trailCC;
1324     } else {
1325         if(!adjacent) {
1326             /* copy the second string part */
1327             do {
1328                 *current++=*next++;
1329             } while(next!=limit);
1330             limit=current;
1331         }
1332         return _getPrevCC(start, limit);
1333     }
1334 }
1335
1336 /* find the last true starter in [start..src[ and return the pointer to it */
1337 static const UChar *
1338 _findPreviousStarter(const UChar *start, const UChar *src,
1339                      uint32_t ccOrQCMask, uint32_t decompQCMask, UChar minNoMaybe) {
1340     uint32_t norm32;
1341     UChar c, c2;
1342
1343     while(start<src) {
1344         norm32=_getPrevNorm32(start, src, minNoMaybe, ccOrQCMask|decompQCMask, c, c2);
1345         if(_isTrueStarter(norm32, ccOrQCMask, decompQCMask)) {
1346             break;
1347         }
1348     }
1349     return src;
1350 }
1351
1352 /* find the first true starter in [src..limit[ and return the pointer to it */
1353 static const UChar *
1354 _findNextStarter(const UChar *src, const UChar *limit,
1355                  uint32_t qcMask, uint32_t decompQCMask, UChar minNoMaybe) {
1356     const UChar *p;
1357     uint32_t norm32, ccOrQCMask;
1358     int32_t length;
1359     UChar c, c2;
1360     uint8_t cc, trailCC;
1361
1362     ccOrQCMask=_NORM_CC_MASK|qcMask;
1363
1364     for(;;) {
1365         if(src==limit) {
1366             break; /* end of string */
1367         }
1368         c=*src;
1369         if(c<minNoMaybe) {
1370             break; /* catches NUL terminater, too */
1371         }
1372
1373         norm32=_getNorm32(c);
1374         if((norm32&ccOrQCMask)==0) {
1375             break; /* true starter */
1376         }
1377
1378         if(isNorm32LeadSurrogate(norm32)) {
1379             /* c is a lead surrogate, get the real norm32 */
1380             if((src+1)==limit || !UTF_IS_SECOND_SURROGATE(c2=*(src+1))) {
1381                 break; /* unmatched first surrogate: counts as a true starter */
1382             }
1383             norm32=_getNorm32FromSurrogatePair(norm32, c2);
1384
1385             if((norm32&ccOrQCMask)==0) {
1386                 break; /* true starter */
1387             }
1388         } else {
1389             c2=0;
1390         }
1391
1392         /* (c, c2) is not a true starter but its decomposition may be */
1393         if(norm32&decompQCMask) {
1394             /* (c, c2) decomposes, get everything from the variable-length extra data */
1395             p=_decompose(norm32, decompQCMask, length, cc, trailCC);
1396
1397             /* get the first character's norm32 to check if it is a true starter */
1398             if(cc==0 && (_getNorm32(p, qcMask)&qcMask)==0) {
1399                 break; /* true starter */
1400             }
1401         }
1402
1403         src+= c2==0 ? 1 : 2; /* not a true starter, continue */
1404     }
1405
1406     return src;
1407 }
1408
1409 /* make NFD & NFKD ---------------------------------------------------------- */
1410
1411 U_CAPI int32_t U_EXPORT2
1412 unorm_getDecomposition(UChar32 c, UBool compat,
1413                        UChar *dest, int32_t destCapacity) {
1414     UErrorCode errorCode=U_ZERO_ERROR;
1415     if( (uint32_t)c<=0x10ffff &&
1416         _haveData(errorCode) &&
1417         ((dest!=NULL && destCapacity>0) || destCapacity==0)
1418     ) {
1419         uint32_t norm32, qcMask;
1420         UChar32 minNoMaybe;
1421         int32_t length;
1422
1423         /* initialize */
1424         if(!compat) {
1425             minNoMaybe=(UChar32)indexes[_NORM_INDEX_MIN_NFD_NO_MAYBE];
1426             qcMask=_NORM_QC_NFD;
1427         } else {
1428             minNoMaybe=(UChar32)indexes[_NORM_INDEX_MIN_NFKD_NO_MAYBE];
1429             qcMask=_NORM_QC_NFKD;
1430         }
1431
1432         if(c<minNoMaybe) {
1433             /* trivial case */
1434             if(destCapacity>0) {
1435                 dest[0]=(UChar)c;
1436             }
1437             return -1;
1438         }
1439
1440         /* data lookup */
1441         UTRIE_GET32(&normTrie, c, norm32);
1442         if((norm32&qcMask)==0) {
1443             /* simple case: no decomposition */
1444             if(c<=0xffff) {
1445                 if(destCapacity>0) {
1446                     dest[0]=(UChar)c;
1447                 }
1448                 return -1;
1449             } else {
1450                 if(destCapacity>=2) {
1451                     dest[0]=UTF16_LEAD(c);
1452                     dest[1]=UTF16_TRAIL(c);
1453                 }
1454                 return -2;
1455             }
1456         } else if(isNorm32HangulOrJamo(norm32)) {
1457             /* Hangul syllable: decompose algorithmically */
1458             UChar c2;
1459
1460             c-=HANGUL_BASE;
1461
1462             c2=(UChar)(c%JAMO_T_COUNT);
1463             c/=JAMO_T_COUNT;
1464             if(c2>0) {
1465                 if(destCapacity>=3) {
1466                     dest[2]=(UChar)(JAMO_T_BASE+c2);
1467                 }
1468                 length=3;
1469             } else {
1470                 length=2;
1471             }
1472
1473             if(destCapacity>=2) {
1474                 dest[1]=(UChar)(JAMO_V_BASE+c%JAMO_V_COUNT);
1475                 dest[0]=(UChar)(JAMO_L_BASE+c/JAMO_V_COUNT);
1476             }
1477             return length;
1478         } else {
1479             /* c decomposes, get everything from the variable-length extra data */
1480             const UChar *p, *limit;
1481             uint8_t cc, trailCC;
1482
1483             p=_decompose(norm32, qcMask, length, cc, trailCC);
1484             if(length<=destCapacity) {
1485                 limit=p+length;
1486                 do {
1487                     *dest++=*p++;
1488                 } while(p<limit);
1489             }
1490             return length;
1491         }
1492     } else {
1493         return 0;
1494     }
1495 }
1496
1497 static int32_t
1498 _decompose(UChar *dest, int32_t destCapacity,
1499            const UChar *src, int32_t srcLength,
1500            UBool compat, const UnicodeSet *nx,
1501            uint8_t &outTrailCC) {
1502     UChar buffer[3];
1503     const UChar *limit, *prevSrc, *p;
1504     uint32_t norm32, ccOrQCMask, qcMask;
1505     int32_t destIndex, reorderStartIndex, length;
1506     UChar c, c2, minNoMaybe;
1507     uint8_t cc, prevCC, trailCC;
1508
1509     if(!compat) {
1510         minNoMaybe=(UChar)indexes[_NORM_INDEX_MIN_NFD_NO_MAYBE];
1511         qcMask=_NORM_QC_NFD;
1512     } else {
1513         minNoMaybe=(UChar)indexes[_NORM_INDEX_MIN_NFKD_NO_MAYBE];
1514         qcMask=_NORM_QC_NFKD;
1515     }
1516
1517     /* initialize */
1518     ccOrQCMask=_NORM_CC_MASK|qcMask;
1519     destIndex=reorderStartIndex=0;
1520     prevCC=0;
1521
1522     /* avoid compiler warnings */
1523     norm32=0;
1524     c=0;
1525
1526     if(srcLength>=0) {
1527         /* string with length */
1528         limit=src+srcLength;
1529     } else /* srcLength==-1 */ {
1530         /* zero-terminated string */
1531         limit=NULL;
1532     }
1533
1534     U_ALIGN_CODE(16);
1535
1536     for(;;) {
1537         /* count code units below the minimum or with irrelevant data for the quick check */
1538         prevSrc=src;
1539         if(limit==NULL) {
1540             while((c=*src)<minNoMaybe ? c!=0 : ((norm32=_getNorm32(c))&ccOrQCMask)==0) {
1541                 prevCC=0;
1542                 ++src;
1543             }
1544         } else {
1545             while(src!=limit && ((c=*src)<minNoMaybe || ((norm32=_getNorm32(c))&ccOrQCMask)==0)) {
1546                 prevCC=0;
1547                 ++src;
1548             }
1549         }
1550
1551         /* copy these code units all at once */
1552         if(src!=prevSrc) {
1553             length=(int32_t)(src-prevSrc);
1554             if((destIndex+length)<=destCapacity) {
1555                 uprv_memcpy(dest+destIndex, prevSrc, length*U_SIZEOF_UCHAR);
1556             }
1557             destIndex+=length;
1558             reorderStartIndex=destIndex;
1559         }
1560
1561         /* end of source reached? */
1562         if(limit==NULL ? c==0 : src==limit) {
1563             break;
1564         }
1565
1566         /* c already contains *src and norm32 is set for it, increment src */
1567         ++src;
1568
1569         /* check one above-minimum, relevant code unit */
1570         /*
1571          * generally, set p and length to the decomposition string
1572          * in simple cases, p==NULL and (c, c2) will hold the length code units to append
1573          * in all cases, set cc to the lead and trailCC to the trail combining class
1574          *
1575          * the following merge-sort of the current character into the preceding,
1576          * canonically ordered result text will use the optimized _insertOrdered()
1577          * if there is only one single code point to process;
1578          * this is indicated with p==NULL, and (c, c2) is the character to insert
1579          * ((c, 0) for a BMP character and (lead surrogate, trail surrogate)
1580          * for a supplementary character)
1581          * otherwise, p[length] is merged in with _mergeOrdered()
1582          */
1583         if(isNorm32HangulOrJamo(norm32)) {
1584             if(nx_contains(nx, c)) {
1585                 c2=0;
1586                 p=NULL;
1587                 length=1;
1588             } else {
1589                 /* Hangul syllable: decompose algorithmically */
1590                 p=buffer;
1591                 cc=trailCC=0;
1592
1593                 c-=HANGUL_BASE;
1594
1595                 c2=(UChar)(c%JAMO_T_COUNT);
1596                 c/=JAMO_T_COUNT;
1597                 if(c2>0) {
1598                     buffer[2]=(UChar)(JAMO_T_BASE+c2);
1599                     length=3;
1600                 } else {
1601                     length=2;
1602                 }
1603
1604                 buffer[1]=(UChar)(JAMO_V_BASE+c%JAMO_V_COUNT);
1605                 buffer[0]=(UChar)(JAMO_L_BASE+c/JAMO_V_COUNT);
1606             }
1607         } else {
1608             if(isNorm32Regular(norm32)) {
1609                 c2=0;
1610                 length=1;
1611             } else {
1612                 /* c is a lead surrogate, get the real norm32 */
1613                 if(src!=limit && UTF_IS_SECOND_SURROGATE(c2=*src)) {
1614                     ++src;
1615                     length=2;
1616                     norm32=_getNorm32FromSurrogatePair(norm32, c2);
1617                 } else {
1618                     c2=0;
1619                     length=1;
1620                     norm32=0;
1621                 }
1622             }
1623
1624             /* get the decomposition and the lead and trail cc's */
1625             if(nx_contains(nx, c, c2)) {
1626                 /* excluded: norm32==0 */
1627                 cc=trailCC=0;
1628                 p=NULL;
1629             } else if((norm32&qcMask)==0) {
1630                 /* c does not decompose */
1631                 cc=trailCC=(uint8_t)(norm32>>_NORM_CC_SHIFT);
1632                 p=NULL;
1633             } else {
1634                 /* c decomposes, get everything from the variable-length extra data */
1635                 p=_decompose(norm32, qcMask, length, cc, trailCC);
1636                 if(length==1) {
1637                     /* fastpath a single code unit from decomposition */
1638                     c=*p;
1639                     c2=0;
1640                     p=NULL;
1641                 }
1642             }
1643         }
1644
1645         /* append the decomposition to the destination buffer, assume length>0 */
1646         if((destIndex+length)<=destCapacity) {
1647             UChar *reorderSplit=dest+destIndex;
1648             if(p==NULL) {
1649                 /* fastpath: single code point */
1650                 if(cc!=0 && cc<prevCC) {
1651                     /* (c, c2) is out of order with respect to the preceding text */
1652                     destIndex+=length;
1653                     trailCC=_insertOrdered(dest+reorderStartIndex, reorderSplit, dest+destIndex, c, c2, cc);
1654                 } else {
1655                     /* just append (c, c2) */
1656                     dest[destIndex++]=c;
1657                     if(c2!=0) {
1658                         dest[destIndex++]=c2;
1659                     }
1660                 }
1661             } else {
1662                 /* general: multiple code points (ordered by themselves) from decomposition */
1663                 if(cc!=0 && cc<prevCC) {
1664                     /* the decomposition is out of order with respect to the preceding text */
1665                     destIndex+=length;
1666                     trailCC=_mergeOrdered(dest+reorderStartIndex, reorderSplit, p, p+length);
1667                 } else {
1668                     /* just append the decomposition */
1669                     do {
1670                         dest[destIndex++]=*p++;
1671                     } while(--length>0);
1672                 }
1673             }
1674         } else {
1675             /* buffer overflow */
1676             /* keep incrementing the destIndex for preflighting */
1677             destIndex+=length;
1678         }
1679
1680         prevCC=trailCC;
1681         if(prevCC==0) {
1682             reorderStartIndex=destIndex;
1683         }
1684     }
1685
1686     outTrailCC=prevCC;
1687     return destIndex;
1688 }
1689
1690 U_CAPI int32_t U_EXPORT2
1691 unorm_decompose(UChar *dest, int32_t destCapacity,
1692                 const UChar *src, int32_t srcLength,
1693                 UBool compat, int32_t options,
1694                 UErrorCode *pErrorCode) {
1695     const UnicodeSet *nx;
1696     int32_t destIndex;
1697     uint8_t trailCC;
1698
1699     if(!_haveData(*pErrorCode)) {
1700         return 0;
1701     }
1702
1703     nx=getNX(options, *pErrorCode);
1704     if(U_FAILURE(*pErrorCode)) {
1705         return 0;
1706     }
1707
1708     destIndex=_decompose(dest, destCapacity,
1709                          src, srcLength,
1710                          compat, nx,
1711                          trailCC);
1712
1713     return u_terminateUChars(dest, destCapacity, destIndex, pErrorCode);
1714 }
1715
1716 /* make NFC & NFKC ---------------------------------------------------------- */
1717
1718 /* get the composition properties of the next character */
1719 static inline uint32_t
1720 _getNextCombining(UChar *&p, const UChar *limit,
1721                   UChar &c, UChar &c2,
1722                   uint16_t &combiningIndex, uint8_t &cc,
1723                   const UnicodeSet *nx) {
1724     uint32_t norm32, combineFlags;
1725
1726     /* get properties */
1727     c=*p++;
1728     norm32=_getNorm32(c);
1729
1730     /* preset output values for most characters */
1731     c2=0;
1732     combiningIndex=0;
1733     cc=0;
1734
1735     if((norm32&(_NORM_CC_MASK|_NORM_COMBINES_ANY))==0) {
1736         return 0;
1737     } else {
1738         if(isNorm32Regular(norm32)) {
1739             /* set cc etc. below */
1740         } else if(isNorm32HangulOrJamo(norm32)) {
1741             /* a compatibility decomposition contained Jamos */
1742             combiningIndex=(uint16_t)(0xfff0|(norm32>>_NORM_EXTRA_SHIFT));
1743             return norm32&_NORM_COMBINES_ANY;
1744         } else {
1745             /* c is a lead surrogate, get the real norm32 */
1746             if(p!=limit && UTF_IS_SECOND_SURROGATE(c2=*p)) {
1747                 ++p;
1748                 norm32=_getNorm32FromSurrogatePair(norm32, c2);
1749             } else {
1750                 c2=0;
1751                 return 0;
1752             }
1753         }
1754
1755         if(nx_contains(nx, c, c2)) {
1756             return 0; /* excluded: norm32==0 */
1757         }
1758
1759         cc=(uint8_t)(norm32>>_NORM_CC_SHIFT);
1760
1761         combineFlags=norm32&_NORM_COMBINES_ANY;
1762         if(combineFlags!=0) {
1763             combiningIndex=*(_getExtraData(norm32)-1);
1764         }
1765         return combineFlags;
1766     }
1767 }
1768
1769 /*
1770  * given a composition-result starter (c, c2) - which means its cc==0,
1771  * it combines forward, it has extra data, its norm32!=0,
1772  * it is not a Hangul or Jamo,
1773  * get just its combineFwdIndex
1774  *
1775  * norm32(c) is special if and only if c2!=0
1776  */
1777 static inline uint16_t
1778 _getCombiningIndexFromStarter(UChar c, UChar c2) {
1779     uint32_t norm32;
1780
1781     norm32=_getNorm32(c);
1782     if(c2!=0) {
1783         norm32=_getNorm32FromSurrogatePair(norm32, c2);
1784     }
1785     return *(_getExtraData(norm32)-1);
1786 }
1787
/*
 * Look up the recomposition result for
 * a forward-combining character
 * (given as a pointer to its part of the combiningTable[])
 * and a backward-combining character
 * (given as its combineBackIndex).
 *
 * If the two characters combine, (value, value2) receive
 * the code unit(s) of the composition character;
 * otherwise they are left untouched.
 *
 * Return value:
 * 0    do not combine
 * 1    combine
 * >1   combine, and the composition is a forward-combining starter
 *
 * See unormimp.h for a description of the composition table format.
 */
static inline uint16_t
_combine(const uint16_t *table, uint16_t combineBackIndex,
         uint16_t &value, uint16_t &value2) {
    uint16_t entryKey, packed, status;

    /* walk the starter's composition table up to the first key>=combineBackIndex */
    entryKey=*table++;
    while(entryKey<combineBackIndex) {
        /* skip this entry's result: two units if its bit 15 is set, else one */
        table+=(*table&0x8000)!=0 ? 2 : 1;
        entryKey=*table++;
    }

    /* ignore bit 15, the last-entry-in-the-list flag, for the comparison */
    if((entryKey&0x7fff)!=combineBackIndex) {
        return 0; /* not found */
    }

    /* found! combine! */
    packed=*table;

    /* is the composition a starter that combines forward? */
    status=(uint16_t)((packed&0x2000)+1);

    /* decode the composition result from the variable-length result value */
    if((packed&0x8000)==0) {
        /* BMP composition result U+0000..U+1fff */
        value=(uint16_t)(packed&0x1fff);
        value2=0;
    } else if((packed&0x4000)==0) {
        /* BMP composition result U+2000..U+ffff */
        value=table[1];
        value2=0;
    } else {
        /* surrogate pair composition result */
        value=(uint16_t)((packed&0x3ff)|0xd800);
        value2=table[1];
    }

    return status;
}
1850
1851 static inline UBool
1852 _composeHangul(UChar prev, UChar c, uint32_t norm32, const UChar *&src, const UChar *limit,
1853                UBool compat, UChar *dest, const UnicodeSet *nx) {
1854     if(isJamoVTNorm32JamoV(norm32)) {
1855         /* c is a Jamo V, compose with previous Jamo L and following Jamo T */
1856         prev=(UChar)(prev-JAMO_L_BASE);
1857         if(prev<JAMO_L_COUNT) {
1858             c=(UChar)(HANGUL_BASE+(prev*JAMO_V_COUNT+(c-JAMO_V_BASE))*JAMO_T_COUNT);
1859
1860             /* check if the next character is a Jamo T (normal or compatibility) */
1861             if(src!=limit) {
1862                 UChar next, t;
1863
1864                 next=*src;
1865                 if((t=(UChar)(next-JAMO_T_BASE))<JAMO_T_COUNT) {
1866                     /* normal Jamo T */
1867                     ++src;
1868                     c+=t;
1869                 } else if(compat) {
1870                     /* if NFKC, then check for compatibility Jamo T (BMP only) */
1871                     norm32=_getNorm32(next);
1872                     if(isNorm32Regular(norm32) && (norm32&_NORM_QC_NFKD)) {
1873                         const UChar *p;
1874                         int32_t length;
1875                         uint8_t cc, trailCC;
1876
1877                         p=_decompose(norm32, _NORM_QC_NFKD, length, cc, trailCC);
1878                         if(length==1 && (t=(UChar)(*p-JAMO_T_BASE))<JAMO_T_COUNT) {
1879                             /* compatibility Jamo T */
1880                             ++src;
1881                             c+=t;
1882                         }
1883                     }
1884                 }
1885             }
1886             if(nx_contains(nx, c)) {
1887                 if(!isHangulWithoutJamoT(c)) {
1888                     --src; /* undo ++src from reading the Jamo T */
1889                 }
1890                 return FALSE;
1891             }
1892             if(dest!=0) {
1893                 *dest=c;
1894             }
1895             return TRUE;
1896         }
1897     } else if(isHangulWithoutJamoT(prev)) {
1898         /* c is a Jamo T, compose with previous Hangul LV that does not contain a Jamo T */
1899         c=(UChar)(prev+(c-JAMO_T_BASE));
1900         if(nx_contains(nx, c)) {
1901             return FALSE;
1902         }
1903         if(dest!=0) {
1904             *dest=c;
1905         }
1906         return TRUE;
1907     }
1908     return FALSE;
1909 }
1910
1911 /*
1912  * recompose the characters in [p..limit[
1913  * (which is in NFD - decomposed and canonically ordered),
1914  * adjust limit, and return the trailing cc
1915  *
1916  * since for NFKC we may get Jamos in decompositions, we need to
1917  * recompose those too
1918  *
1919  * note that recomposition never lengthens the text:
1920  * any character consists of either one or two code units;
1921  * a composition may contain at most one more code unit than the original starter,
1922  * while the combining mark that is removed has at least one code unit
      *
      * p        start of the text; the text is rewritten in place
      * limit    in/out: end of the text; moved back every time a combining
      *          mark or Jamo is removed
      * options  tested here for UNORM_BEFORE_PRI_29 and
      *          _NORM_OPTIONS_COMPOSE_CONTIGUOUS
      * nx       exclusion set handed to _getNextCombining() and nx_contains()
      *
      * returns prevCC, the cc recorded for the most recent character
      * that did not combine (0 if there was none)
1923  */
1924 static uint8_t
1925 _recompose(UChar *p, UChar *&limit, int32_t options, const UnicodeSet *nx) {
1926     UChar *starter, *pRemove, *q, *r;
1927     uint32_t combineFlags;
1928     UChar c, c2;
1929     uint16_t combineFwdIndex, combineBackIndex;
1930     uint16_t result, value, value2;
1931     uint8_t cc, prevCC;
1932     UBool starterIsSupplementary;
1933
1934     starter=NULL;                   /* no starter */
1935     combineFwdIndex=0;              /* will not be used until starter!=NULL - avoid compiler warnings */
1936     combineBackIndex=0;             /* will always be set if combineFlags!=0 - avoid compiler warnings */
1937     value=value2=0;                 /* always set by _combine() before used - avoid compiler warnings */
1938     starterIsSupplementary=FALSE;   /* will not be used until starter!=NULL - avoid compiler warnings */
1939     prevCC=0;
1940
1941     for(;;) {
             /* fetch the next character into (c, c2); p ends up just past it (see the p-1/p-2 uses below) */
1942         combineFlags=_getNextCombining(p, limit, c, c2, combineBackIndex, cc, nx);
1943         if((combineFlags&_NORM_COMBINES_BACK) && starter!=NULL) {
                 /* an index with bit 15 set (0xfff0|n) is what _getNextCombining() stores for Hangul/Jamo */
1944             if(combineBackIndex&0x8000) {
1945                 /* c is a Jamo V/T, see if we can compose it with the previous character */
1946                 /* for the PRI #29 fix, check that there is no intervening combining mark */
1947                 if((options&UNORM_BEFORE_PRI_29) || prevCC==0) {
1948                     pRemove=NULL; /* NULL while no Hangul composition */
1949                     combineFlags=0;
1950                     c2=*starter;
1951                     if(combineBackIndex==0xfff2) {
1952                         /* Jamo V, compose with previous Jamo L and following Jamo T */
1953                         c2=(UChar)(c2-JAMO_L_BASE);
1954                         if(c2<JAMO_L_COUNT) {
1955                             pRemove=p-1;
1956                             c=(UChar)(HANGUL_BASE+(c2*JAMO_V_COUNT+(c-JAMO_V_BASE))*JAMO_T_COUNT);
1957                             if(p!=limit && (c2=(UChar)(*p-JAMO_T_BASE))<JAMO_T_COUNT) {
1958                                 ++p;
1959                                 c+=c2;
1960                             } else {
1961                                 /* the result is an LV syllable, which is a starter (unlike LVT) */
1962                                 combineFlags=_NORM_COMBINES_FWD;
1963                             }
1964                             if(!nx_contains(nx, c)) {
1965                                 *starter=c;
1966                             } else {
1967                                 /* excluded */
1968                                 if(!isHangulWithoutJamoT(c)) {
1969                                     --p; /* undo the ++p from reading the Jamo T */
1970                                 }
1971                                 /* c is modified but not used any more -- c=*(p-1); -- re-read the Jamo V/T */
1972                                 pRemove=NULL;
1973                             }
1974                         }
1975
1976                     /*
1977                      * Normally, the following can not occur:
1978                      * Since the input is in NFD, there are no Hangul LV syllables that
1979                      * a Jamo T could combine with.
1980                      * All Jamo Ts are combined above when handling Jamo Vs.
1981                      *
1982                      * However, before the PRI #29 fix, this can occur due to
1983                      * an intervening combining mark between the Hangul LV and the Jamo T.
1984                      */
1985                     } else {
1986                         /* Jamo T, compose with previous Hangul that does not have a Jamo T */
1987                         if(isHangulWithoutJamoT(c2)) {
1988                             c2+=(UChar)(c-JAMO_T_BASE);
1989                             if(!nx_contains(nx, c2)) {
1990                                 pRemove=p-1;
1991                                 *starter=c2;
1992                             }
1993                         }
1994                     }
1995
1996                     if(pRemove!=NULL) {
1997                         /* remove the Jamo(s) */
                             /* close the gap [pRemove..p[ by shifting the tail left, then resume at pRemove */
1998                         q=pRemove;
1999                         r=p;
2000                         while(r<limit) {
2001                             *q++=*r++;
2002                         }
2003                         p=pRemove;
2004                         limit=q;
2005                     }
2006
2007                     c2=0; /* c2 held *starter temporarily */
2008
2009                     if(combineFlags!=0) {
2010                         /*
2011                          * not starter=NULL because the composition is a Hangul LV syllable
2012                          * and might combine once more (but only before the PRI #29 fix)
2013                          */
2014
2015                         /* done? */
2016                         if(p==limit) {
2017                             return prevCC;
2018                         }
2019
2020                         /* the composition is a Hangul LV syllable which is a starter that combines forward */
2021                         combineFwdIndex=0xfff0;
2022
2023                         /* we combined; continue with looking for compositions */
2024                         continue;
2025                     }
2026                 }
2027
2028                 /*
2029                  * now: cc==0 and the combining index does not include "forward" ->
2030                  * the rest of the loop body will reset starter to NULL;
2031                  * technically, a composed Hangul syllable is a starter, but it
2032                  * does not combine forward now that we have consumed all eligible Jamos;
2033                  * for Jamo V/T, combineFlags does not contain _NORM_COMBINES_FWD
2034                  */
2035
2036             } else if(
2037                 /* the starter is not a Hangul LV or Jamo V/T and */
2038                 !(combineFwdIndex&0x8000) &&
2039                 /* the combining mark is not blocked and */
2040                 ((options&UNORM_BEFORE_PRI_29) ?
2041                     (prevCC!=cc || prevCC==0) :
2042                     (prevCC<cc || prevCC==0)) &&
2043                 /* the starter and the combining mark (c, c2) do combine and */
2044                 0!=(result=_combine(combiningTable+combineFwdIndex, combineBackIndex, value, value2)) &&
2045                 /* the composition result is not excluded */
2046                 !nx_contains(nx, value, value2)
2047             ) {
2048                 /* replace the starter with the composition, remove the combining mark */
2049                 pRemove= c2==0 ? p-1 : p-2; /* pointer to the combining mark */
2050
2051                 /* replace the starter with the composition */
2052                 *starter=(UChar)value;
2053                 if(starterIsSupplementary) {
2054                     if(value2!=0) {
2055                         /* both are supplementary */
2056                         *(starter+1)=(UChar)value2;
2057                     } else {
2058                         /* the composition is shorter than the starter, move the intermediate characters forward one */
2059                         starterIsSupplementary=FALSE;
2060                         q=starter+1;
2061                         r=q+1;
2062                         while(r<pRemove) {
2063                             *q++=*r++;
2064                         }
2065                         --pRemove;
2066                     }
2067                 } else if(value2!=0) {
2068                     /* the composition is longer than the starter, move the intermediate characters back one */
2069                     starterIsSupplementary=TRUE;
2070                     ++starter; /* temporarily increment for the loop boundary */
2071                     q=pRemove;
2072                     r=++pRemove;
2073                     while(starter<q) {
2074                         *--r=*--q;
2075                     }
2076                     *starter=(UChar)value2;
2077                     --starter; /* undo the temporary increment */
2078                 /* } else { both are on the BMP, nothing more to do */
2079                 }
2080
2081                 /* remove the combining mark by moving the following text over it */
                     /* pRemove==p happens when the longer composition already used up the mark's single unit */
2082                 if(pRemove<p) {
2083                     q=pRemove;
2084                     r=p;
2085                     while(r<limit) {
2086                         *q++=*r++;
2087                     }
2088                     p=pRemove;
2089                     limit=q;
2090                 }
2091
2092                 /* keep prevCC because we removed the combining mark */
2093
2094                 /* done? */
2095                 if(p==limit) {
2096                     return prevCC;
2097                 }
2098
2099                 /* is the composition a starter that combines forward? */
2100                 if(result>1) {
2101                     combineFwdIndex=_getCombiningIndexFromStarter((UChar)value, (UChar)value2);
2102                 } else {
2103                     starter=NULL;
2104                 }
2105
2106                 /* we combined; continue with looking for compositions */
2107                 continue;
2108             }
2109         }
2110
2111         /* no combination this time */
2112         prevCC=cc;
2113         if(p==limit) {
2114             return prevCC;
2115         }
2116
2117         /* if (c, c2) did not combine, then check if it is a starter */
2118         if(cc==0) {
2119             /* found a new starter; combineFlags==0 if (c, c2) is excluded */
2120             if(combineFlags&_NORM_COMBINES_FWD) {
2121                 /* it may combine with something, prepare for it */
2122                 if(c2==0) {
2123                     starterIsSupplementary=FALSE;
2124                     starter=p-1;
2125                 } else {
2126                     starterIsSupplementary=TRUE;
2127                     starter=p-2;
2128                 }
                     /* _getNextCombining() has a single index output; for a new starter it is kept as the forward index */
2129                 combineFwdIndex=combineBackIndex;
2130             } else {
2131                 /* it will not combine with anything */
2132                 starter=NULL;
2133             }
2134         } else if(options&_NORM_OPTIONS_COMPOSE_CONTIGUOUS) {
2135             /* FCC: no discontiguous compositions; any intervening character blocks */
2136             starter=NULL;
2137         }
2138     }
2139 }
2140
2141 /* decompose and recompose [prevStarter..src[ */
2142 static const UChar *
2143 _composePart(UChar *stackBuffer, UChar *&buffer, int32_t &bufferCapacity, int32_t &length,
2144              const UChar *prevStarter, const UChar *src,
2145              uint8_t &prevCC,
2146              int32_t options, const UnicodeSet *nx,
2147              UErrorCode *pErrorCode) {
2148     UChar *recomposeLimit;
2149     uint8_t trailCC;
2150     UBool compat;
2151
2152     compat=(UBool)((options&_NORM_OPTIONS_COMPAT)!=0);
2153
2154     /* decompose [prevStarter..src[ */
2155     length=_decompose(buffer, bufferCapacity,
2156                       prevStarter, src-prevStarter,
2157                       compat, nx,
2158                       trailCC);
2159     if(length>bufferCapacity) {
2160         if(!u_growBufferFromStatic(stackBuffer, &buffer, &bufferCapacity, 2*length, 0)) {
2161             *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
2162             return NULL;
2163         }
2164         length=_decompose(buffer, bufferCapacity,
2165                           prevStarter, src-prevStarter,
2166                           compat, nx,
2167                           trailCC);
2168     }
2169
2170     /* recompose the decomposition */
2171     recomposeLimit=buffer+length;
2172     if(length>=2) {
2173         prevCC=_recompose(buffer, recomposeLimit, options, nx);
2174     }
2175
2176     /* return with a pointer to the recomposition and its length */
2177     length=recomposeLimit-buffer;
2178     return buffer;
2179 }
2180
/*
 * Compose src into dest.
 *
 * src/srcLength   the source text; it is treated as NUL-terminated
 *                 when srcLength<0
 * dest/destCapacity  the output buffer; pieces that would not fit are not
 *                 written, but destIndex keeps counting (preflighting)
 * options         _NORM_OPTIONS_COMPAT selects the NFKC instead of the NFC
 *                 quick check data; options are also forwarded to _composePart()
 * nx              exclusion set used with nx_contains()
 * pErrorCode      handed to _composePart(), which sets it when its side
 *                 buffer cannot grow
 *
 * Returns the number of UChars that the complete result needs, or 0 when
 * _composePart() failed. This function does not write a terminating NUL.
 */
2181 static int32_t
2182 _compose(UChar *dest, int32_t destCapacity,
2183          const UChar *src, int32_t srcLength,
2184          int32_t options, const UnicodeSet *nx,
2185          UErrorCode *pErrorCode) {
2186     UChar stackBuffer[_STACK_BUFFER_CAPACITY];
2187     UChar *buffer;
2188     int32_t bufferCapacity;
2189
2190     const UChar *limit, *prevSrc, *prevStarter;
2191     uint32_t norm32, ccOrQCMask, qcMask;
2192     int32_t destIndex, reorderStartIndex, length;
2193     UChar c, c2, minNoMaybe;
2194     uint8_t cc, prevCC;
2195
2196     if(options&_NORM_OPTIONS_COMPAT) {
2197         minNoMaybe=(UChar)indexes[_NORM_INDEX_MIN_NFKC_NO_MAYBE];
2198         qcMask=_NORM_QC_NFKC;
2199     } else {
2200         minNoMaybe=(UChar)indexes[_NORM_INDEX_MIN_NFC_NO_MAYBE];
2201         qcMask=_NORM_QC_NFC;
2202     }
2203
2204     /* initialize */
2205     buffer=stackBuffer;
2206     bufferCapacity=_STACK_BUFFER_CAPACITY;
2207
2208     /*
2209      * prevStarter points to the last character before the current one
2210      * that is a "true" starter with cc==0 and quick check "yes".
2211      *
2212      * prevStarter will be used instead of looking for a true starter
2213      * while incrementally decomposing [prevStarter..prevSrc[
2214      * in _composePart(). Having a good prevStarter allows to just decompose
2215      * the entire [prevStarter..prevSrc[.
2216      *
2217      * When _composePart() backs out from prevSrc back to prevStarter,
2218      * then it also backs out destIndex by the same amount.
2219      * Therefore, at all times, the (prevSrc-prevStarter) source units
2220      * must correspond 1:1 to destination units counted with destIndex,
2221      * except for reordering.
2222      * This is true for the qc "yes" characters copied in the fast loop,
2223      * and for pure reordering.
2224      * prevStarter must be set forward to src when this is not true:
2225      * In _composePart() and after composing a Hangul syllable.
2226      *
2227      * This mechanism relies on the assumption that the decomposition of a true starter
2228      * also begins with a true starter. gennorm/store.c checks for this.
2229      */
2230     prevStarter=src;
2231
2232     ccOrQCMask=_NORM_CC_MASK|qcMask;
2233     destIndex=reorderStartIndex=0;
2234     prevCC=0;
2235
2236     /* avoid compiler warnings */
2237     norm32=0;
2238     c=0;
2239
2240     if(srcLength>=0) {
2241         /* string with length */
2242         limit=src+srcLength;
2243     } else /* srcLength==-1 */ {
2244         /* zero-terminated string */
2245         limit=NULL;
2246     }
2247
2248     U_ALIGN_CODE(16);
2249
2250     for(;;) {
2251         /* count code units below the minimum or with irrelevant data for the quick check */
2252         prevSrc=src;
2253         if(limit==NULL) {
                 /* NUL-terminated: the c!=0 test stops the scan at the terminator */
2254             while((c=*src)<minNoMaybe ? c!=0 : ((norm32=_getNorm32(c))&ccOrQCMask)==0) {
2255                 prevCC=0;
2256                 ++src;
2257             }
2258         } else {
2259             while(src!=limit && ((c=*src)<minNoMaybe || ((norm32=_getNorm32(c))&ccOrQCMask)==0)) {
2260                 prevCC=0;
2261                 ++src;
2262             }
2263         }
2264
2265         /* copy these code units all at once */
2266         if(src!=prevSrc) {
2267             length=(int32_t)(src-prevSrc);
2268             if((destIndex+length)<=destCapacity) {
2269                 uprv_memcpy(dest+destIndex, prevSrc, length*U_SIZEOF_UCHAR);
2270             }
2271             destIndex+=length;
2272             reorderStartIndex=destIndex;
2273
2274             /* set prevStarter to the last character in the quick check loop */
2275             prevStarter=src-1;
2276             if(UTF_IS_SECOND_SURROGATE(*prevStarter) && prevSrc<prevStarter && UTF_IS_FIRST_SURROGATE(*(prevStarter-1))) {
2277                 --prevStarter;
2278             }
2279
2280             prevSrc=src;
2281         }
2282
2283         /* end of source reached? */
2284         if(limit==NULL ? c==0 : src==limit) {
2285             break;
2286         }
2287
2288         /* c already contains *src and norm32 is set for it, increment src */
2289         ++src;
2290
2291         /*
2292          * source buffer pointers:
2293          *
2294          *  all done      quick check   current char  not yet
2295          *                "yes" but     (c, c2)       processed
2296          *                may combine
2297          *                forward
2298          * [-------------[-------------[-------------[-------------[
2299          * |             |             |             |             |
2300          * start         prevStarter   prevSrc       src           limit
2301          *
2302          *
2303          * destination buffer pointers and indexes:
2304          *
2305          *  all done      might take    not filled yet
2306          *                characters for
2307          *                reordering
2308          * [-------------[-------------[-------------[
2309          * |             |             |             |
2310          * dest      reorderStartIndex destIndex     destCapacity
2311          */
2312
2313         /* check one above-minimum, relevant code unit */
2314         /*
2315          * norm32 is for c=*(src-1), and the quick check flag is "no" or "maybe", and/or cc!=0
2316          * check for Jamo V/T, then for surrogates and regular characters
2317          * c is not a Hangul syllable or Jamo L because
2318          * they are not marked with no/maybe for NFC & NFKC (and their cc==0)
2319          */
2320         if(isNorm32HangulOrJamo(norm32)) {
2321             /*
2322              * c is a Jamo V/T:
2323              * try to compose with the previous character, Jamo V also with a following Jamo T,
2324              * and set values here right now in case we just continue with the main loop
2325              */
2326             prevCC=cc=0;
2327             reorderStartIndex=destIndex;
2328
                 /*
                  * the previous character is read from the source (*(prevSrc-1));
                  * the syllable overwrites dest[destIndex-1], or is not stored at all
                  * (dest argument 0) once destIndex has run past destCapacity
                  */
2329             if(
2330                 destIndex>0 &&
2331                 _composeHangul(
2332                     *(prevSrc-1), c, norm32, src, limit, (UBool)((options&_NORM_OPTIONS_COMPAT)!=0),
2333                     destIndex<=destCapacity ? dest+(destIndex-1) : 0,
2334                     nx)
2335             ) {
2336                 prevStarter=src;
2337                 continue;
2338             }
2339
2340             /* the Jamo V/T did not compose into a Hangul syllable, just append to dest */
2341             c2=0;
2342             length=1;
2343             prevStarter=prevSrc;
2344         } else {
2345             if(isNorm32Regular(norm32)) {
2346                 c2=0;
2347                 length=1;
2348             } else {
2349                 /* c is a lead surrogate, get the real norm32 */
2350                 if(src!=limit && UTF_IS_SECOND_SURROGATE(c2=*src)) {
2351                     ++src;
2352                     length=2;
2353                     norm32=_getNorm32FromSurrogatePair(norm32, c2);
2354                 } else {
2355                     /* c is an unpaired lead surrogate, nothing to do */
2356                     c2=0;
2357                     length=1;
2358                     norm32=0;
2359                 }
2360             }
2361
2362             /* we are looking at the character (c, c2) at [prevSrc..src[ */
2363             if(nx_contains(nx, c, c2)) {
2364                 /* excluded: norm32==0 */
2365                 cc=0;
2366             } else if((norm32&qcMask)==0) {
2367                 cc=(uint8_t)(norm32>>_NORM_CC_SHIFT);
2368             } else {
2369                 const UChar *p;
2370                 uint32_t decompQCMask;
2371
2372                 /*
2373                  * find appropriate boundaries around this character,
2374                  * decompose the source text from between the boundaries,
2375                  * and recompose it
2376                  *
2377                  * this puts the intermediate text into the side buffer because
2378                  * it might be longer than the recomposition end result,
2379                  * or the destination buffer may be too short or missing
2380                  *
2381                  * note that destIndex may be adjusted backwards to account
2382                  * for source text that passed the quick check but needed to
2383                  * take part in the recomposition
2384                  */
2385                 decompQCMask=(qcMask<<2)&0xf; /* decomposition quick check mask */
2386
2387                 /*
2388                  * find the last true starter in [prevStarter..src[
2389                  * it is either the decomposition of the current character (at prevSrc),
2390                  * or prevStarter
2391                  */
2392                 if(_isTrueStarter(norm32, ccOrQCMask, decompQCMask)) {
2393                     prevStarter=prevSrc;
2394                 } else {
2395                     /* adjust destIndex: back out what had been copied with qc "yes" */
2396                     destIndex-=(int32_t)(prevSrc-prevStarter);
2397                 }
2398
2399                 /* find the next true starter in [src..limit[ - modifies src to point to the next starter */
2400                 src=_findNextStarter(src, limit, qcMask, decompQCMask, minNoMaybe);
2401
2402                 /* compose [prevStarter..src[ */
2403                 p=_composePart(stackBuffer, buffer, bufferCapacity,
2404                                length,          /* output */
2405                                prevStarter, src,
2406                                prevCC,          /* output */
2407                                options, nx,
2408                                pErrorCode);
2409
2410                 if(p==NULL) {
2411                     destIndex=0;   /* an error occurred (out of memory) */
2412                     break;
2413                 }
2414
2415                 /* append the recomposed buffer contents to the destination buffer */
2416                 if((destIndex+length)<=destCapacity) {
2417                     while(length>0) {
2418                         dest[destIndex++]=*p++;
2419                         --length;
2420                     }
2421                 } else {
2422                     /* buffer overflow */
2423                     /* keep incrementing the destIndex for preflighting */
2424                     destIndex+=length;
2425                 }
2426
2427                 /* set the next starter */
2428                 prevStarter=src;
2429
2430                 continue;
2431             }
2432         }
2433
2434         /* append the single code point (c, c2) to the destination buffer */
2435         if((destIndex+length)<=destCapacity) {
2436             if(cc!=0 && cc<prevCC) {
2437                 /* (c, c2) is out of order with respect to the preceding text */
2438                 UChar *reorderSplit=dest+destIndex;
2439                 destIndex+=length;
2440                 prevCC=_insertOrdered(dest+reorderStartIndex, reorderSplit, dest+destIndex, c, c2, cc);
2441             } else {
2442                 /* just append (c, c2) */
2443                 dest[destIndex++]=c;
2444                 if(c2!=0) {
2445                     dest[destIndex++]=c2;
2446                 }
2447                 prevCC=cc;
2448             }
2449         } else {
2450             /* buffer overflow */
2451             /* keep incrementing the destIndex for preflighting */
2452             destIndex+=length;
2453             prevCC=cc;
2454         }
2455     }
2456
2457     /* cleanup */
         /* buffer differs from stackBuffer only after _composePart() grew it */
2458     if(buffer!=stackBuffer) {
2459         uprv_free(buffer);
2460     }
2461
2462     return destIndex;
2463 }
2464
2465 U_CAPI int32_t U_EXPORT2
2466 unorm_compose(UChar *dest, int32_t destCapacity,
2467               const UChar *src, int32_t srcLength,
2468               UBool compat, int32_t options,
2469               UErrorCode *pErrorCode) {
2470     const UnicodeSet *nx;
2471     int32_t destIndex;
2472
2473     if(!_haveData(*pErrorCode)) {
2474         return 0;
2475     }
2476
2477     nx=getNX(options, *pErrorCode);
2478     if(U_FAILURE(*pErrorCode)) {
2479         return 0;
2480     }
2481
2482     /* reset options bits that should only be set here or inside _compose() */
2483     options&=~(_NORM_OPTIONS_SETS_MASK|_NORM_OPTIONS_COMPAT|_NORM_OPTIONS_COMPOSE_CONTIGUOUS);
2484
2485     if(compat) {
2486         options|=_NORM_OPTIONS_COMPAT;
2487     }
2488
2489     destIndex=_compose(dest, destCapacity,
2490                        src, srcLength,
2491                        options, nx,
2492                        pErrorCode);
2493
2494     return u_terminateUChars(dest, destCapacity, destIndex, pErrorCode);
2495 }
2496
2497 /* make FCD ----------------------------------------------------------------- */
2498
2499 static const UChar *
2500 _findSafeFCD(const UChar *src, const UChar *limit, uint16_t fcd16) {
2501     UChar c, c2;
2502
2503     /*
2504      * find the first position in [src..limit[ after some cc==0 according to FCD data
2505      *
2506      * at the beginning of the loop, we have fcd16 from before src
2507      *
2508      * stop at positions:
2509      * - after trail cc==0
2510      * - at the end of the source
2511      * - before lead cc==0
2512      */
2513     for(;;) {
2514         /* stop if trail cc==0 for the previous character */
2515         if((fcd16&0xff)==0) {
2516             break;
2517         }
2518
2519         /* get c=*src - stop at end of string */
2520         if(src==limit) {
2521             break;
2522         }
2523         c=*src;
2524
2525         /* stop if lead cc==0 for this character */
2526         if(c<_NORM_MIN_WITH_LEAD_CC || (fcd16=_getFCD16(c))==0) {
2527             break; /* catches terminating NUL, too */
2528         }
2529
2530         if(!UTF_IS_FIRST_SURROGATE(c)) {
2531             if(fcd16<=0xff) {
2532                 break;
2533             }
2534             ++src;
2535         } else if((src+1)!=limit && (c2=*(src+1), UTF_IS_SECOND_SURROGATE(c2))) {
2536             /* c is a lead surrogate, get the real fcd16 */
2537             fcd16=_getFCD16FromSurrogatePair(fcd16, c2);
2538             if(fcd16<=0xff) {
2539                 break;
2540             }
2541             src+=2;
2542         } else {
2543             /* c is an unpaired first surrogate, lead cc==0 */
2544             break;
2545         }
2546     }
2547
2548     return src;
2549 }
2550
/*
 * Canonically decompose [src..decompLimit[ into dest, starting at destIndex,
 * and keep the written text in canonical order.
 *
 * destIndex is passed by reference. It is advanced by the length of every
 * piece of output, even when that piece does not fit into destCapacity
 * (in which case nothing is written), so that the caller can preflight.
 *
 * Characters for which nx_contains() is true are copied unchanged and
 * treated as having cc==0.
 *
 * Returns the trail combining class that was current after the last
 * character of the range (0 if the range is empty).
 */
2551 static uint8_t
2552 _decomposeFCD(const UChar *src, const UChar *decompLimit,
2553               UChar *dest, int32_t &destIndex, int32_t destCapacity,
2554               const UnicodeSet *nx) {
2555     const UChar *p;
2556     uint32_t norm32;
2557     int32_t reorderStartIndex, length;
2558     UChar c, c2;
2559     uint8_t cc, prevCC, trailCC;
2560
2561     /*
2562      * canonically decompose [src..decompLimit[
2563      *
2564      * all characters in this range have some non-zero cc,
2565      * directly or in decomposition,
2566      * so that we do not need to check in the following for quick-check limits etc.
2567      *
2568      * there _are_ _no_ Hangul syllables or Jamos in here because they are FCD-safe (cc==0)!
2569      *
2570      * we also do not need to check for c==0 because we have an established decompLimit
2571      */
2572     reorderStartIndex=destIndex;
2573     prevCC=0;
2574
2575     while(src<decompLimit) {
2576         c=*src++;
2577         norm32=_getNorm32(c);
2578         if(isNorm32Regular(norm32)) {
2579             c2=0;
2580             length=1;
2581         } else {
2582             /*
2583              * reminder: this function is called with [src..decompLimit[
2584              * not containing any Hangul/Jamo characters,
2585              * therefore the only specials are lead surrogates
2586              */
2587             /* c is a lead surrogate, get the real norm32 */
2588             if(src!=decompLimit && UTF_IS_SECOND_SURROGATE(c2=*src)) {
2589                 ++src;
2590                 length=2;
2591                 norm32=_getNorm32FromSurrogatePair(norm32, c2);
2592             } else {
                 /* unpaired lead surrogate: handle it as a single code unit without data */
2593                 c2=0;
2594                 length=1;
2595                 norm32=0;
2596             }
2597         }
2598
2599         /* get the decomposition and the lead and trail cc's */
2600         if(nx_contains(nx, c, c2)) {
2601             /* excluded: norm32==0 */
2602             cc=trailCC=0;
2603             p=NULL;
2604         } else if((norm32&_NORM_QC_NFD)==0) {
2605             /* c does not decompose */
2606             cc=trailCC=(uint8_t)(norm32>>_NORM_CC_SHIFT);
2607             p=NULL;
2608         } else {
2609             /* c decomposes, get everything from the variable-length extra data */
             /* length, cc and trailCC are passed here and used afterwards as set by this call */
2610             p=_decompose(norm32, length, cc, trailCC);
2611             if(length==1) {
2612                 /* fastpath a single code unit from decomposition */
2613                 c=*p;
2614                 c2=0;
2615                 p=NULL;
2616             }
2617         }
2618
         /*
          * from here on:
          * p==NULL - write the single code point (c, c2)
          * p!=NULL - write the length code units at p
          */
2619         /* append the decomposition to the destination buffer, assume length>0 */
2620         if((destIndex+length)<=destCapacity) {
             /* reorderSplit marks where the new text begins, i.e., the end of the text written so far */
2621             UChar *reorderSplit=dest+destIndex;
2622             if(p==NULL) {
2623                 /* fastpath: single code point */
2624                 if(cc!=0 && cc<prevCC) {
2625                     /* (c, c2) is out of order with respect to the preceding text */
2626                     destIndex+=length;
                     /* the return value of _insertOrdered() becomes the new trail cc */
2627                     trailCC=_insertOrdered(dest+reorderStartIndex, reorderSplit, dest+destIndex, c, c2, cc);
2628                 } else {
2629                     /* just append (c, c2) */
2630                     dest[destIndex++]=c;
2631                     if(c2!=0) {
2632                         dest[destIndex++]=c2;
2633                     }
2634                 }
2635             } else {
2636                 /* general: multiple code points (ordered by themselves) from decomposition */
2637                 if(cc!=0 && cc<prevCC) {
2638                     /* the decomposition is out of order with respect to the preceding text */
2639                     destIndex+=length;
                     /* the return value of _mergeOrdered() becomes the new trail cc */
2640                     trailCC=_mergeOrdered(dest+reorderStartIndex, reorderSplit, p, p+length);
2641                 } else {
2642                     /* just append the decomposition */
2643                     do {
2644                         dest[destIndex++]=*p++;
2645                     } while(--length>0);
2646                 }
2647             }
2648         } else {
2649             /* buffer overflow */
2650             /* keep incrementing the destIndex for preflighting */
2651             destIndex+=length;
2652         }
2653
         /* after a trail cc of 0, later reordering starts no earlier than the current end of the output */
2654         prevCC=trailCC;
2655         if(prevCC==0) {
2656             reorderStartIndex=destIndex;
2657         }
2658     }
2659
2660     return prevCC;
2661 }
2662
/*
 * Copy src to dest; wherever the text does not fulfill the FCD conditions
 * (a lead cc that is non-zero and lower than the previous trail cc),
 * decompose and reorder a limited piece of the text via _decomposeFCD().
 *
 * srcLength>=0: string with length; otherwise the string is zero-terminated.
 * Characters for which nx_contains() is true are treated as having fcd16==0.
 *
 * destIndex keeps counting when the output does not fit into destCapacity;
 * in that case nothing is written for the piece that does not fit.
 *
 * Returns 0 if _haveData() fails,
 * otherwise the result of u_terminateUChars() for the accumulated length.
 */
2663 static int32_t
2664 unorm_makeFCD(UChar *dest, int32_t destCapacity,
2665               const UChar *src, int32_t srcLength,
2666               const UnicodeSet *nx,
2667               UErrorCode *pErrorCode) {
2668     const UChar *limit, *prevSrc, *decompStart;
2669     int32_t destIndex, length;
2670     UChar c, c2;
2671     uint16_t fcd16;
2672     int16_t prevCC, cc;
2673
2674     if(!_haveData(*pErrorCode)) {
2675         return 0;
2676     }
2677
2678     /* initialize */
2679     decompStart=src;
2680     destIndex=0;
2681     prevCC=0;
2682
2683     /* avoid compiler warnings */
2684     c=0;
2685     fcd16=0;
2686
2687     if(srcLength>=0) {
2688         /* string with length */
2689         limit=src+srcLength;
2690     } else /* srcLength==-1 */ {
2691         /* zero-terminated string */
2692         limit=NULL;
2693     }
2694
2695     U_ALIGN_CODE(16);
2696
2697     for(;;) {
2698         /* skip a run of code units below the minimum or with irrelevant data for the FCD check */
         /*
          * unlike in unorm_checkFCD(), src is not moved past the code unit that ends the run:
          * on exit, src points to that code unit (or to the end of the text)
          */
2699         prevSrc=src;
2700         if(limit==NULL) {
2701             for(;;) {
2702                 c=*src;
2703                 if(c<_NORM_MIN_WITH_LEAD_CC) {
2704                     if(c==0) {
2705                         break;
2706                     }
2707                     prevCC=(int16_t)-c;
2708                 } else if((fcd16=_getFCD16(c))==0) {
2709                     prevCC=0;
2710                 } else {
2711                     break;
2712                 }
2713                 ++src;
2714             }
2715         } else {
2716             for(;;) {
2717                 if(src==limit) {
2718                     break;
2719                 } else if((c=*src)<_NORM_MIN_WITH_LEAD_CC) {
2720                     prevCC=(int16_t)-c;
2721                 } else if((fcd16=_getFCD16(c))==0) {
2722                     prevCC=0;
2723                 } else {
2724                     break;
2725                 }
2726                 ++src;
2727             }
2728         }
2729
2730         /*
2731          * prevCC has values from the following ranges:
2732          * 0..0xff - the previous trail combining class
2733          * <0      - the negative value of the previous code unit;
2734          *           that code unit was <_NORM_MIN_WITH_LEAD_CC and its _getFCD16()
2735          *           was deferred so that average text is checked faster
2736          */
2737
2738         /* copy these code units all at once */
2739         if(src!=prevSrc) {
2740             length=(int32_t)(src-prevSrc);
2741             if((destIndex+length)<=destCapacity) {
2742                 uprv_memcpy(dest+destIndex, prevSrc, length*U_SIZEOF_UCHAR);
2743             }
             /* count the run even if it was not copied, for preflighting */
2744             destIndex+=length;
2745             prevSrc=src;
2746
2747             /* prevCC<0 is only possible from the above loop, i.e., only if prevSrc<src */
2748             if(prevCC<0) {
2749                 /* the previous character was <_NORM_MIN_WITH_LEAD_CC, we need to get its trail cc */
2750                 if(!nx_contains(nx, (UChar32)-prevCC)) {
2751                     prevCC=(int16_t)(_getFCD16((UChar)-prevCC)&0xff);
2752                 } else {
2753                     prevCC=0; /* excluded: fcd16==0 */
2754                 }
2755
2756                 /*
2757                  * set a pointer to this below-U+0300 character;
2758                  * if prevCC==0 then it will be moved to after this character below
2759                  */
2760                 decompStart=prevSrc-1;
2761             }
2762         }
2763         /*
2764          * now:
2765          * prevSrc==src - used later to adjust destIndex before decomposition
2766          * prevCC>=0
2767          */
2768
2769         /* end of source reached? */
         /* for zero-terminated text, c still holds the code unit at src from the loop above */
2770         if(limit==NULL ? c==0 : src==limit) {
2771             break;
2772         }
2773
2774         /* set a pointer to after the last source position where prevCC==0 */
2775         if(prevCC==0) {
2776             decompStart=prevSrc;
2777         }
2778
2779         /* c already contains *src and fcd16 is set for it, increment src */
2780         ++src;
2781
2782         /* check one above-minimum, relevant code unit */
2783         if(UTF_IS_FIRST_SURROGATE(c)) {
2784             /* c is a lead surrogate, get the real fcd16 */
2785             if(src!=limit && UTF_IS_SECOND_SURROGATE(c2=*src)) {
2786                 ++src;
2787                 fcd16=_getFCD16FromSurrogatePair(fcd16, c2);
2788             } else {
                 /* unpaired lead surrogate: treat it as a single code unit with fcd16==0 */
2789                 c2=0;
2790                 fcd16=0;
2791             }
2792         } else {
2793             c2=0;
2794         }
2795
2796         /* we are looking at the character (c, c2) at [prevSrc..src[ */
2797         if(nx_contains(nx, c, c2)) {
2798             fcd16=0; /* excluded: fcd16==0 */
2799         }
2800
2801         /* check the combining order, get the lead cc */
2802         cc=(int16_t)(fcd16>>8);
2803         if(cc==0 || cc>=prevCC) {
2804             /* the order is ok */
2805             if(cc==0) {
2806                 decompStart=prevSrc;
2807             }
2808             prevCC=(int16_t)(fcd16&0xff);
2809
2810             /* just append (c, c2) */
2811             length= c2==0 ? 1 : 2;
2812             if((destIndex+length)<=destCapacity) {
2813                 dest[destIndex++]=c;
2814                 if(c2!=0) {
2815                     dest[destIndex++]=c2;
2816                 }
2817             } else {
                 /* overflow: only count, for preflighting */
2818                 destIndex+=length;
2819             }
2820         } else {
2821             /*
2822              * back out the part of the source that we copied already but
2823              * is now going to be decomposed;
2824              * prevSrc is set to after what was copied
2825              */
2826             destIndex-=(int32_t)(prevSrc-decompStart);
2827
2828             /*
2829              * find the part of the source that needs to be decomposed;
2830              * to be safe and simple, decompose to before the next character with lead cc==0
2831              */
2832             src=_findSafeFCD(src, limit, fcd16);
2833
2834             /*
2835              * the source text does not fulfill the conditions for FCD;
2836              * decompose and reorder a limited piece of the text
2837              */
             /* destIndex is passed to _decomposeFCD() and continues from whatever value it has afterwards */
2838             prevCC=_decomposeFCD(decompStart, src,
2839                                  dest, destIndex, destCapacity,
2840                                  nx);
2841             decompStart=src;
2842         }
2843     }
2844
2845     return u_terminateUChars(dest, destCapacity, destIndex, pErrorCode);
2846 }
2847
2848 /* quick check functions ---------------------------------------------------- */
2849
/*
 * Check whether the text fulfills the FCD conditions.
 *
 * Returns FALSE as soon as a character is found whose lead combining class
 * is non-zero and lower than the trail combining class of the character
 * before it; returns TRUE when the end of the text is reached.
 *
 * srcLength>=0: string with length; otherwise the string is zero-terminated.
 * Characters for which nx_contains() is true are treated as having fcd16==0.
 *
 * This function does not call _haveData() itself;
 * its caller in this section, _quickCheck(), does so before calling it.
 */
2850 static UBool
2851 unorm_checkFCD(const UChar *src, int32_t srcLength, const UnicodeSet *nx) {
2852     const UChar *limit;
2853     UChar c, c2;
2854     uint16_t fcd16;
2855     int16_t prevCC, cc;
2856
2857     /* initialize */
2858     prevCC=0;
2859
2860     if(srcLength>=0) {
2861         /* string with length */
2862         limit=src+srcLength;
2863     } else /* srcLength==-1 */ {
2864         /* zero-terminated string */
2865         limit=NULL;
2866     }
2867
2868     U_ALIGN_CODE(16);
2869
2870     for(;;) {
2871         /* skip a run of code units below the minimum or with irrelevant data for the FCD check */
         /* on exit from either inner loop, c has fcd16!=0 and src points to after c */
2872         if(limit==NULL) {
2873             for(;;) {
2874                 c=*src++;
2875                 if(c<_NORM_MIN_WITH_LEAD_CC) {
2876                     if(c==0) {
2877                         return TRUE;
2878                     }
2879                     /*
2880                      * delay _getFCD16(c) for any character <_NORM_MIN_WITH_LEAD_CC
2881                      * because chances are good that the next one will have
2882                      * a leading cc of 0;
2883                      * _getFCD16(-prevCC) is later called when necessary -
2884                      * -c fits into int16_t because it is <_NORM_MIN_WITH_LEAD_CC==0x300
2885                      */
2886                     prevCC=(int16_t)-c;
2887                 } else if((fcd16=_getFCD16(c))==0) {
2888                     prevCC=0;
2889                 } else {
2890                     break;
2891                 }
2892             }
2893         } else {
2894             for(;;) {
2895                 if(src==limit) {
2896                     return TRUE;
2897                 } else if((c=*src++)<_NORM_MIN_WITH_LEAD_CC) {
                     /* deferred lookup, same as in the zero-terminated loop above */
2898                     prevCC=(int16_t)-c;
2899                 } else if((fcd16=_getFCD16(c))==0) {
2900                     prevCC=0;
2901                 } else {
2902                     break;
2903                 }
2904             }
2905         }
2906
2907         /* check one above-minimum, relevant code unit */
2908         if(UTF_IS_FIRST_SURROGATE(c)) {
2909             /* c is a lead surrogate, get the real fcd16 */
2910             if(src!=limit && UTF_IS_SECOND_SURROGATE(c2=*src)) {
2911                 ++src;
2912                 fcd16=_getFCD16FromSurrogatePair(fcd16, c2);
2913             } else {
                 /* unpaired lead surrogate: fcd16==0 */
2914                 c2=0;
2915                 fcd16=0;
2916             }
2917         } else {
2918             c2=0;
2919         }
2920
2921         if(nx_contains(nx, c, c2)) {
2922             prevCC=0; /* excluded: fcd16==0 */
2923             continue;
2924         }
2925
2926         /*
2927          * prevCC has values from the following ranges:
2928          * 0..0xff - the previous trail combining class
2929          * <0      - the negative value of the previous code unit;
2930          *           that code unit was <_NORM_MIN_WITH_LEAD_CC and its _getFCD16()
2931          *           was deferred so that average text is checked faster
2932          */
2933
2934         /* check the combining order */
2935         cc=(int16_t)(fcd16>>8);
2936         if(cc!=0) {
             /* only a non-zero lead cc can be out of order, so only now resolve a deferred prevCC */
2937             if(prevCC<0) {
2938                 /* the previous character was <_NORM_MIN_WITH_LEAD_CC, we need to get its trail cc */
2939                 if(!nx_contains(nx, (UChar32)-prevCC)) {
2940                     prevCC=(int16_t)(_getFCD16((UChar)-prevCC)&0xff);
2941                 } else {
2942                     prevCC=0; /* excluded: fcd16==0 */
2943                 }
2944             }
2945
2946             if(cc<prevCC) {
2947                 return FALSE;
2948             }
2949         }
         /* remember the trail cc (low byte of fcd16) for the next character */
2950         prevCC=(int16_t)(fcd16&0xff);
2951     }
2952 }
2953
/*
 * Common implementation of the quick check and isNormalized functions.
 *
 * src, srcLength - the text; srcLength==-1 means zero-terminated
 * mode           - UNORM_NFC, UNORM_NFKC, UNORM_NFD, UNORM_NFKD or UNORM_FCD;
 *                  any other value sets U_ILLEGAL_ARGUMENT_ERROR
 * allowMaybe     - TRUE: a "maybe" quick check flag sets the result to UNORM_MAYBE
 *                  and the scan continues;
 *                  FALSE: the text around such a character is normalized
 *                  (via _composePart()) and compared with the original
 *                  to decide between "yes" and "no"
 * nx             - characters for which nx_contains() is true are treated as norm32==0
 * pErrorCode     - in/out error code
 *
 * Returns UNORM_MAYBE without checking the text if pErrorCode is NULL or
 * already indicates a failure, if the arguments are illegal, or if _haveData() fails.
 * UNORM_FCD is delegated to unorm_checkFCD().
 * If _composePart() reports a failure, the result is UNORM_MAYBE as well.
 */
2954 static UNormalizationCheckResult
2955 _quickCheck(const UChar *src,
2956             int32_t srcLength,
2957             UNormalizationMode mode,
2958             UBool allowMaybe,
2959             const UnicodeSet *nx,
2960             UErrorCode *pErrorCode) {
2961     UChar stackBuffer[_STACK_BUFFER_CAPACITY];
2962     UChar *buffer;
2963     int32_t bufferCapacity;
2964
2965     const UChar *start, *limit;
2966     uint32_t norm32, qcNorm32, ccOrQCMask, qcMask;
2967     int32_t options;
2968     UChar c, c2, minNoMaybe;
2969     uint8_t cc, prevCC;
2970     UNormalizationCheckResult result;
2971
2972     /* check arguments */
2973     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
2974         return UNORM_MAYBE;
2975     }
2976
2977     if(src==NULL || srcLength<-1) {
2978         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
2979         return UNORM_MAYBE;
2980     }
2981
2982     if(!_haveData(*pErrorCode)) {
2983         return UNORM_MAYBE;
2984     }
2985
2986     /* check for a valid mode and set the quick check minimum and mask */
2987     switch(mode) {
2988     case UNORM_NFC:
2989         minNoMaybe=(UChar)indexes[_NORM_INDEX_MIN_NFC_NO_MAYBE];
2990         qcMask=_NORM_QC_NFC;
2991         options=0;
2992         break;
2993     case UNORM_NFKC:
2994         minNoMaybe=(UChar)indexes[_NORM_INDEX_MIN_NFKC_NO_MAYBE];
2995         qcMask=_NORM_QC_NFKC;
2996         options=_NORM_OPTIONS_COMPAT;
2997         break;
2998     case UNORM_NFD:
2999         minNoMaybe=(UChar)indexes[_NORM_INDEX_MIN_NFD_NO_MAYBE];
3000         qcMask=_NORM_QC_NFD;
3001         options=0;
3002         break;
3003     case UNORM_NFKD:
3004         minNoMaybe=(UChar)indexes[_NORM_INDEX_MIN_NFKD_NO_MAYBE];
3005         qcMask=_NORM_QC_NFKD;
3006         options=_NORM_OPTIONS_COMPAT;
3007         break;
3008     case UNORM_FCD:
         /* FCD has its own check function which never returns "maybe" */
3009         return unorm_checkFCD(src, srcLength, nx) ? UNORM_YES : UNORM_NO;
3010     default:
3011         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
3012         return UNORM_MAYBE;
3013     }
3014
3015     /* initialize */
     /* buffer starts out as the stack buffer; it is freed at endloop only if it no longer points there */
3016     buffer=stackBuffer;
3017     bufferCapacity=_STACK_BUFFER_CAPACITY;
3018
3019     ccOrQCMask=_NORM_CC_MASK|qcMask;
3020     result=UNORM_YES;
3021     prevCC=0;
3022
3023     start=src;
3024     if(srcLength>=0) {
3025         /* string with length */
3026         limit=src+srcLength;
3027     } else /* srcLength==-1 */ {
3028         /* zero-terminated string */
3029         limit=NULL;
3030     }
3031
3032     U_ALIGN_CODE(16);
3033
3034     for(;;) {
3035         /* skip a run of code units below the minimum or with irrelevant data for the quick check */
         /* on exit from either inner loop, norm32 is set for c and src points to after c */
3036         if(limit==NULL) {
3037             for(;;) {
3038                 c=*src++;
3039                 if(c<minNoMaybe) {
3040                     if(c==0) {
3041                         goto endloop; /* break out of outer loop */
3042                     }
3043                 } else if(((norm32=_getNorm32(c))&ccOrQCMask)!=0) {
3044                     break;
3045                 }
3046                 prevCC=0;
3047             }
3048         } else {
3049             for(;;) {
3050                 if(src==limit) {
3051                     goto endloop; /* break out of outer loop */
3052                 } else if((c=*src++)>=minNoMaybe && ((norm32=_getNorm32(c))&ccOrQCMask)!=0) {
3053                     break;
3054                 }
3055                 prevCC=0;
3056             }
3057         }
3058
3059         /* check one above-minimum, relevant code unit */
3060         if(isNorm32LeadSurrogate(norm32)) {
3061             /* c is a lead surrogate, get the real norm32 */
3062             if(src!=limit && UTF_IS_SECOND_SURROGATE(c2=*src)) {
3063                 ++src;
3064                 norm32=_getNorm32FromSurrogatePair(norm32, c2);
3065             } else {
                 /* unpaired lead surrogate: norm32==0 */
3066                 c2=0;
3067                 norm32=0;
3068             }
3069         } else {
3070             c2=0;
3071         }
3072
3073         if(nx_contains(nx, c, c2)) {
3074             /* excluded: norm32==0 */
3075             norm32=0;
3076         }
3077
3078         /* check the combining order */
3079         cc=(uint8_t)(norm32>>_NORM_CC_SHIFT);
3080         if(cc!=0 && cc<prevCC) {
3081             result=UNORM_NO;
3082             break;
3083         }
3084         prevCC=cc;
3085
3086         /* check for "no" or "maybe" quick check flags */
3087         qcNorm32=norm32&qcMask;
3088         if(qcNorm32&_NORM_QC_ANY_NO) {
3089             result=UNORM_NO;
3090             break;
3091         } else if(qcNorm32!=0) {
3092             /* "maybe" can only occur for NFC and NFKC */
3093             if(allowMaybe) {
                 /* remember the "maybe" and keep scanning: a later "no" still overrides it */
3094                 result=UNORM_MAYBE;
3095             } else {
3096                 /* normalize a section around here to see if it is really normalized or not */
3097                 const UChar *prevStarter;
3098                 uint32_t decompQCMask;
3099                 int32_t length;
3100
3101                 decompQCMask=(qcMask<<2)&0xf; /* decomposition quick check mask */
3102
3103                 /* find the previous starter */
3104                 prevStarter=src-1; /* set prevStarter to the beginning of the current character */
3105                 if(UTF_IS_TRAIL(*prevStarter)) {
3106                     --prevStarter; /* safe because unpaired surrogates do not result in "maybe" */
3107                 }
3108                 prevStarter=_findPreviousStarter(start, prevStarter, ccOrQCMask, decompQCMask, minNoMaybe);
3109
3110                 /* find the next true starter in [src..limit[ - modifies src to point to the next starter */
3111                 src=_findNextStarter(src, limit, qcMask, decompQCMask, minNoMaybe);
3112
3113                 /* decompose and recompose [prevStarter..src[ */
                 /* buffer, bufferCapacity, length and prevCC are passed here and used afterwards as left by this call */
3114                 _composePart(stackBuffer, buffer, bufferCapacity,
3115                              length,
3116                              prevStarter,
3117                              src,
3118                              prevCC,
3119                              options, nx, pErrorCode);
3120                 if(U_FAILURE(*pErrorCode)) {
3121                     result=UNORM_MAYBE; /* error (out of memory) */
3122                     break;
3123                 }
3124
3125                 /* compare the normalized version with the original */
3126                 if(0!=uprv_strCompare(prevStarter, (int32_t)(src-prevStarter), buffer, length, FALSE, FALSE)) {
3127                     result=UNORM_NO; /* normalization differs */
3128                     break;
3129                 }
3130
3131                 /* continue after the next starter */
3132             }
3133         }
3134     }
3135 endloop:
3136
     /* reached both via goto endloop (end of text) and via break (result decided or error) */
3137     if(buffer!=stackBuffer) {
3138         uprv_free(buffer);
3139     }
3140
3141     return result;
3142 }
3143
3144 U_CAPI UNormalizationCheckResult U_EXPORT2
3145 unorm_quickCheck(const UChar *src,
3146                  int32_t srcLength,
3147                  UNormalizationMode mode,
3148                  UErrorCode *pErrorCode) {
3149     return _quickCheck(src, srcLength, mode, TRUE, NULL, pErrorCode);
3150 }
3151
3152 U_CAPI UNormalizationCheckResult U_EXPORT2
3153 unorm_quickCheckWithOptions(const UChar *src, int32_t srcLength,
3154                             UNormalizationMode mode, int32_t options,
3155                             UErrorCode *pErrorCode) {
3156     return _quickCheck(src, srcLength, mode, TRUE, getNX(options, *pErrorCode), pErrorCode);
3157 }
3158
3159 U_CFUNC UNormalizationCheckResult
3160 unorm_internalQuickCheck(const UChar *src,
3161                          int32_t srcLength,
3162                          UNormalizationMode mode,
3163                          UBool allowMaybe,
3164                          const UnicodeSet *nx,
3165                          UErrorCode *pErrorCode) {
3166     return _quickCheck(src, srcLength, mode, allowMaybe, nx, pErrorCode);
3167 }
3168
3169 U_CAPI UBool U_EXPORT2
3170 unorm_isNormalized(const UChar *src, int32_t srcLength,
3171                    UNormalizationMode mode,
3172                    UErrorCode *pErrorCode) {
3173     return (UBool)(UNORM_YES==_quickCheck(src, srcLength, mode, FALSE, NULL, pErrorCode));
3174 }
3175
3176 U_CAPI UBool U_EXPORT2
3177 unorm_isNormalizedWithOptions(const UChar *src, int32_t srcLength,
3178                               UNormalizationMode mode, int32_t options,
3179                               UErrorCode *pErrorCode) {
3180     return (UBool)(UNORM_YES==_quickCheck(src, srcLength, mode, FALSE, getNX(options, *pErrorCode), pErrorCode));
3181 }
3182
3183 /* normalize() API ---------------------------------------------------------- */
3184
3185 /**
3186  * Internal API for normalizing.
3187  * Does not check for bad input.
3188  * Requires _haveData() to be true.
3189  * @internal
3190  */
3191 U_CFUNC int32_t
3192 unorm_internalNormalizeWithNX(UChar *dest, int32_t destCapacity,
3193                               const UChar *src, int32_t srcLength,
3194                               UNormalizationMode mode, int32_t options, const UnicodeSet *nx,
3195                               UErrorCode *pErrorCode) {
3196     int32_t destLength;
3197     uint8_t trailCC;
3198
3199     switch(mode) {
3200     case UNORM_NFD:
3201         destLength=_decompose(dest, destCapacity,
3202                               src, srcLength,
3203                               FALSE, nx, trailCC);
3204         break;
3205     case UNORM_NFKD:
3206         destLength=_decompose(dest, destCapacity,
3207                               src, srcLength,
3208                               TRUE, nx, trailCC);
3209         break;
3210     case UNORM_NFC:
3211         destLength=_compose(dest, destCapacity,
3212                             src, srcLength,
3213                             options, nx, pErrorCode);
3214         break;
3215     case UNORM_NFKC:
3216         destLength=_compose(dest, destCapacity,
3217                             src, srcLength,
3218                             options|_NORM_OPTIONS_COMPAT, nx, pErrorCode);
3219         break;
3220     case UNORM_FCD:
3221         return unorm_makeFCD(dest, destCapacity,
3222                              src, srcLength,
3223                              nx,
3224                              pErrorCode);
3225 #if 0
3226     case UNORM_FCC:
3227         destLength=_compose(dest, destCapacity,
3228                             src, srcLength,
3229                             options|_NORM_OPTIONS_COMPOSE_CONTIGUOUS, nx, pErrorCode);
3230         break;
3231 #endif
3232     case UNORM_NONE:
3233         /* just copy the string */
3234         if(srcLength==-1) {
3235             srcLength=u_strlen(src);
3236         }
3237         if(srcLength>0 && srcLength<=destCapacity) {
3238             uprv_memcpy(dest, src, srcLength*U_SIZEOF_UCHAR);
3239         }
3240         destLength=srcLength;
3241         break;
3242     default:
3243         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
3244         return 0;
3245     }
3246
3247     return u_terminateUChars(dest, destCapacity, destLength, pErrorCode);
3248 }
3249
3250 /**
3251  * Internal API for normalizing.
3252  * Does not check for bad input.
3253  * @internal
3254  */
3255 U_CAPI int32_t U_EXPORT2
3256 unorm_internalNormalize(UChar *dest, int32_t destCapacity,
3257                         const UChar *src, int32_t srcLength,
3258                         UNormalizationMode mode, int32_t options,
3259                         UErrorCode *pErrorCode) {
3260     const UnicodeSet *nx;
3261
3262     if(!_haveData(*pErrorCode)) {
3263         return 0;
3264     }
3265
3266     nx=getNX(options, *pErrorCode);
3267     if(U_FAILURE(*pErrorCode)) {
3268         return 0;
3269     }
3270
3271     /* reset options bits that should only be set inside unorm_internalNormalizeWithNX() */
3272     options&=~(_NORM_OPTIONS_SETS_MASK|_NORM_OPTIONS_COMPAT|_NORM_OPTIONS_COMPOSE_CONTIGUOUS);
3273
3274     return unorm_internalNormalizeWithNX(dest, destCapacity,
3275                                          src, srcLength,
3276                                          mode, options, nx,
3277                                          pErrorCode);
3278 }
3279
3280 /** Public API for normalizing. */
3281 U_CAPI int32_t U_EXPORT2
3282 unorm_normalize(const UChar *src, int32_t srcLength,
3283                 UNormalizationMode mode, int32_t options,
3284                 UChar *dest, int32_t destCapacity,
3285                 UErrorCode *pErrorCode) {
3286     /* check argument values */
3287     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
3288         return 0;
3289     }
3290
3291     if( destCapacity<0 || (dest==NULL && destCapacity>0) ||
3292         src==NULL || srcLength<-1
3293     ) {
3294         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
3295         return 0;
3296     }
3297
3298     /* check for overlapping src and destination */
3299     if( dest!=NULL &&
3300         ((src>=dest && src<(dest+destCapacity)) ||
3301          (srcLength>0 && dest>=src && dest<(src+srcLength)))
3302     ) {
3303         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
3304         return 0;
3305     }
3306
3307     return unorm_internalNormalize(dest, destCapacity,
3308                                    src, srcLength,
3309                                    mode, options,
3310                                    pErrorCode);
3311 }
3312
3313
3314 /* iteration functions ------------------------------------------------------ */
3315
3316 /*
3317  * These iteration functions are the core implementations of the
3318  * Normalizer class iteration API.
3319  * They read from a UCharIterator into their own buffer
3320  * and normalize into the Normalizer iteration buffer.
3321  * Normalizer itself then iterates over its buffer until that needs to be
3322  * filled again.
3323  */
3324
3325 /*
3326  * ### TODO:
3327  * Now that UCharIterator.next/previous return (int32_t)-1 not (UChar)0xffff
3328  * if iteration bounds are reached,
3329  * try to not call hasNext/hasPrevious and instead check for >=0.
3330  */
3331
3332 /* backward iteration ------------------------------------------------------- */
3333
3334 /*
3335  * read backwards and get norm32
3336  * return 0 if the character is <minC
3337  * if c2!=0 then (c2, c) is a surrogate pair (reversed - c2 is first surrogate but read second!)
3338  */
3339 static inline uint32_t
3340 _getPrevNorm32(UCharIterator &src, uint32_t minC, uint32_t mask, UChar &c, UChar &c2) {
3341     uint32_t norm32;
3342
3343     /* need src.hasPrevious() */
3344     c=(UChar)src.previous(&src);
3345     c2=0;
3346
3347     /* check for a surrogate before getting norm32 to see if we need to predecrement further */
3348     if(c<minC) {
3349         return 0;
3350     } else if(!UTF_IS_SURROGATE(c)) {
3351         return _getNorm32(c);
3352     } else if(UTF_IS_SURROGATE_FIRST(c) || !src.hasPrevious(&src)) {
3353         /* unpaired surrogate */
3354         return 0;
3355     } else if(UTF_IS_FIRST_SURROGATE(c2=(UChar)src.previous(&src))) {
3356         norm32=_getNorm32(c2);
3357         if((norm32&mask)==0) {
3358             /* all surrogate pairs with this lead surrogate have irrelevant data */
3359             return 0;
3360         } else {
3361             /* norm32 must be a surrogate special */
3362             return _getNorm32FromSurrogatePair(norm32, c);
3363         }
3364     } else {
3365         /* unpaired second surrogate, undo the c2=src.previous() movement */
3366         src.move(&src, 1, UITER_CURRENT);
3367         c2=0;
3368         return 0;
3369     }
3370 }
3371
3372 /*
3373  * read backwards and check if the character is a previous-iteration boundary
3374  * if c2!=0 then (c2, c) is a surrogate pair (reversed - c2 is first surrogate but read second!)
 *
 * This is the signature shared by _isPrevNFDSafe() and _isPrevTrueStarter();
 * _findPreviousIterationBoundary() takes a pointer to such a function,
 * passes minC and mask through to it, and stops after the first character
 * for which it returns true.
3375  */
3376 typedef UBool
3377 IsPrevBoundaryFn(UCharIterator &src, uint32_t minC, uint32_t mask, UChar &c, UChar &c2);
3378
3379 /*
3380  * for NF*D:
3381  * read backwards and check if the lead combining class is 0
3382  * if c2!=0 then (c2, c) is a surrogate pair (reversed - c2 is first surrogate but read second!)
3383  */
3384 static UBool
3385 _isPrevNFDSafe(UCharIterator &src, uint32_t minC, uint32_t ccOrQCMask, UChar &c, UChar &c2) {
3386     return _isNFDSafe(_getPrevNorm32(src, minC, ccOrQCMask, c, c2), ccOrQCMask, ccOrQCMask&_NORM_QC_MASK);
3387 }
3388
3389 /*
3390  * read backwards and check if the character is (or its decomposition begins with)
3391  * a "true starter" (cc==0 and NF*C_YES)
3392  * if c2!=0 then (c2, c) is a surrogate pair (reversed - c2 is first surrogate but read second!)
3393  */
3394 static UBool
3395 _isPrevTrueStarter(UCharIterator &src, uint32_t minC, uint32_t ccOrQCMask, UChar &c, UChar &c2) {
3396     uint32_t norm32, decompQCMask;
3397
3398     decompQCMask=(ccOrQCMask<<2)&0xf; /* decomposition quick check mask */
3399     norm32=_getPrevNorm32(src, minC, ccOrQCMask|decompQCMask, c, c2);
3400     return _isTrueStarter(norm32, ccOrQCMask, decompQCMask);
3401 }
3402
3403 static int32_t
3404 _findPreviousIterationBoundary(UCharIterator &src,
3405                                IsPrevBoundaryFn *isPrevBoundary, uint32_t minC, uint32_t mask,
3406                                UChar *&buffer, int32_t &bufferCapacity,
3407                                int32_t &startIndex,
3408                                UErrorCode *pErrorCode) {
3409     UChar *stackBuffer;
3410     UChar c, c2;
3411     UBool isBoundary;
3412     /* Collects code units backwards from src into the END of buffer until isPrevBoundary() returns TRUE (that character is still stored) or src has no previous character. On return the text starts at buffer[startIndex]; buffer and bufferCapacity are updated if the buffer had to grow. */
3413     /* initialize: keep the caller's original buffer pointer, it is handed to u_growBufferFromStatic() below */
3414     stackBuffer=buffer;
3415     startIndex=bufferCapacity; /* fill the buffer from the end backwards */
3416
3417     while(src.hasPrevious(&src)) {
3418         isBoundary=isPrevBoundary(src, minC, mask, c, c2); /* reads backwards and sets c (and c2 for a pair) */
3419
3420         /* always write this character to the front of the buffer */
3421         /* make sure there is enough space in the buffer: 1 UChar, or 2 for a surrogate pair */
3422         if(startIndex < (c2==0 ? 1 : 2)) {
3423             int32_t bufferLength=bufferCapacity; /* old capacity; passed as the length argument and used as the amount to move below */
3424
3425             if(!u_growBufferFromStatic(stackBuffer, &buffer, &bufferCapacity, 2*bufferCapacity, bufferLength)) {
3426                 *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
3427                 src.move(&src, 0, UITER_START); /* on allocation failure the iterator is moved to the start */
3428                 return 0;
3429             }
3430
3431             /* move the current buffer contents up to the end of the grown buffer and shift startIndex by the same amount */
3432             uprv_memmove(buffer+(bufferCapacity-bufferLength), buffer, bufferLength*U_SIZEOF_UCHAR);
3433             startIndex+=bufferCapacity-bufferLength;
3434         }
3435
3436         buffer[--startIndex]=c; /* c was read first, so it is stored behind c2 */
3437         if(c2!=0) {
3438             buffer[--startIndex]=c2;
3439         }
3440
3441         /* stop if this just-copied character is a boundary */
3442         if(isBoundary) {
3443             break;
3444         }
3445     }
3446
3447     /* return the length of the buffer contents, which occupy buffer[startIndex..bufferCapacity[ */
3448     return bufferCapacity-startIndex;
3449 }
3450
3451 U_CAPI int32_t U_EXPORT2
3452 unorm_previous(UCharIterator *src,
3453                UChar *dest, int32_t destCapacity,
3454                UNormalizationMode mode, int32_t options,
3455                UBool doNormalize, UBool *pNeededToNormalize,
3456                UErrorCode *pErrorCode) {
3457     UChar stackBuffer[100];
3458     UChar *buffer=NULL;
3459     IsPrevBoundaryFn *isPreviousBoundary=NULL;
3460     uint32_t mask=0;
3461     int32_t startIndex=0, bufferLength=0, bufferCapacity=0, destLength=0;
3462     int32_t c=0, c2=0;
3463     UChar minC=0;
3464     /* Reads backwards from src to the previous iteration boundary for mode (one code point for UNORM_NONE), writes that text to dest - normalized if doNormalize - and returns its length in UChars; *pNeededToNormalize, if not NULL, reports whether normalizing changed the text. */
3465     /* check argument values */
3466     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
3467         return 0;
3468     }
3469
3470     if( destCapacity<0 || (dest==NULL && destCapacity>0) ||
3471         src==NULL
3472     ) {
3473         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
3474         return 0;
3475     }
3476
3477     if(!_haveData(*pErrorCode)) {
3478         return 0;
3479     }
3480
3481     if(pNeededToNormalize!=NULL) {
3482         *pNeededToNormalize=FALSE;
3483     }
3484     /* select the boundary test, the minimum code unit passed to it as minC, and the norm32 bits it inspects */
3485     switch(mode) {
3486     case UNORM_NFD:
3487     case UNORM_FCD:
3488         isPreviousBoundary=_isPrevNFDSafe;
3489         minC=_NORM_MIN_WITH_LEAD_CC;
3490         mask=_NORM_CC_MASK|_NORM_QC_NFD;
3491         break;
3492     case UNORM_NFKD:
3493         isPreviousBoundary=_isPrevNFDSafe;
3494         minC=_NORM_MIN_WITH_LEAD_CC;
3495         mask=_NORM_CC_MASK|_NORM_QC_NFKD;
3496         break;
3497     case UNORM_NFC:
3498         isPreviousBoundary=_isPrevTrueStarter;
3499         minC=(UChar)indexes[_NORM_INDEX_MIN_NFC_NO_MAYBE];
3500         mask=_NORM_CC_MASK|_NORM_QC_NFC;
3501         break;
3502     case UNORM_NFKC:
3503         isPreviousBoundary=_isPrevTrueStarter;
3504         minC=(UChar)indexes[_NORM_INDEX_MIN_NFKC_NO_MAYBE];
3505         mask=_NORM_CC_MASK|_NORM_QC_NFKC;
3506         break;
3507     case UNORM_NONE:
3508         destLength=0; /* no normalization: step back over exactly one code point */
3509         if((c=src->previous(src))>=0) {
3510             destLength=1;
3511             if(UTF_IS_TRAIL(c) && (c2=src->previous(src))>=0) {
3512                 if(UTF_IS_LEAD(c2)) {
3513                     destLength=2; /* always the full pair length, even if it does not fit, so that the overflow is reported */
3514                     if(destCapacity>=2) {
3515                         dest[1]=(UChar)c; /* trail surrogate */
3516                     }
3517                     c=c2; /* lead surrogate to be written below */
3518                 } else {
3519                     src->move(src, 1, UITER_CURRENT); /* not a pair: undo the second previous() */
3520                 }
3521             }
3522
3523             if(destCapacity>0) {
3524                 dest[0]=(UChar)c;
3525             }
3526         }
3527         return u_terminateUChars(dest, destCapacity, destLength, pErrorCode);
3528     default:
3529         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
3530         return 0;
3531     }
3532     /* collect the source text back to the boundary; it ends up at buffer+startIndex, and buffer may be reallocated */
3533     buffer=stackBuffer;
3534     bufferCapacity=(int32_t)(sizeof(stackBuffer)/U_SIZEOF_UCHAR);
3535     bufferLength=_findPreviousIterationBoundary(*src,
3536                                                 isPreviousBoundary, minC, mask,
3537                                                 buffer, bufferCapacity,
3538                                                 startIndex,
3539                                                 pErrorCode);
3540     if(bufferLength>0) {
3541         if(doNormalize) {
3542             destLength=unorm_internalNormalize(dest, destCapacity,
3543                                                buffer+startIndex, bufferLength,
3544                                                mode, options,
3545                                                pErrorCode);
3546             if(pNeededToNormalize!=NULL && U_SUCCESS(*pErrorCode)) {
3547                 *pNeededToNormalize= /* TRUE if the normalized text differs from the source text in length or content */
3548                     (UBool)(destLength!=bufferLength ||
3549                             0!=uprv_memcmp(dest, buffer+startIndex, destLength*U_SIZEOF_UCHAR));
3550             }
3551         } else {
3552             /* just copy the source characters, as many as fit; the full length is still returned */
3553             if(destCapacity>0) {
3554                 uprv_memcpy(dest, buffer+startIndex, uprv_min(bufferLength, destCapacity)*U_SIZEOF_UCHAR);
3555             }
3556             destLength=u_terminateUChars(dest, destCapacity, bufferLength, pErrorCode);
3557         }
3558     } else {
3559         destLength=u_terminateUChars(dest, destCapacity, 0, pErrorCode);
3560     }
3561
3562     /* cleanup: release the buffer only if it is no longer the stack buffer */
3563     if(buffer!=stackBuffer) {
3564         uprv_free(buffer);
3565     }
3566
3567     return destLength;
3568 }
3569
3570 /* forward iteration -------------------------------------------------------- */
3571
3572 /*
3573  * read forward one character from src, store its code unit(s) in c and c2, and get its norm32
3574  * return 0 if the character is <minC, if it is an unmatched first surrogate, or if a surrogate pair's lead norm32 has no bits in mask
3575  * if c2!=0 then (c, c2) is a surrogate pair (c is the first surrogate, c2 the second)
3576  * always reads complete characters
3577  */
3578 static inline uint32_t
3579 _getNextNorm32(UCharIterator &src, uint32_t minC, uint32_t mask, UChar &c, UChar &c2) {
3580     uint32_t norm32;
3581
3582     /* need src.hasNext() to be true */
3583     c=(UChar)src.next(&src);
3584     c2=0;
3585
3586     if(c<minC) {
3587         return 0;
3588     }
3589
3590     norm32=_getNorm32(c);
3591     if(UTF_IS_FIRST_SURROGATE(c)) {
3592         if(src.hasNext(&src) && UTF_IS_SECOND_SURROGATE(c2=(UChar)src.current(&src))) { /* peek with current(); c2 is assigned inside the test */
3593             src.move(&src, 1, UITER_CURRENT); /* skip the c2 surrogate */
3594             if((norm32&mask)==0) {
3595                 /* irrelevant data */
3596                 return 0;
3597             } else {
3598                 /* norm32 must be a surrogate special */
3599                 return _getNorm32FromSurrogatePair(norm32, c2);
3600             }
3601         } else {
3602             /* unmatched surrogate: c2 may hold the peeked non-surrogate, so reset it */
3603             c2=0;
3604             return 0;
3605         }
3606     }
3607     return norm32;
3608 }
3609
3610 /*
3611  * Callback type for forward iteration, implemented in this file by _isNextNFDSafe() and _isNextTrueStarter():
3612  * read one character forward from src and return TRUE if it is a next-iteration boundary; c receives the first code unit read, and if c2!=0 then (c, c2) is a surrogate pair
3613  */
3614 typedef UBool
3615 IsNextBoundaryFn(UCharIterator &src, uint32_t minC, uint32_t mask, UChar &c, UChar &c2);
3616
3617 /*
3618  * NF*D forward boundary test: fetch the norm32 of the character at the iterator position
3619  * and let _isNFDSafe() decide; on return, c2!=0 means (c, c2) is a surrogate pair.
3620  */
3621 static UBool
3622 _isNextNFDSafe(UCharIterator &src, uint32_t minC, uint32_t ccOrQCMask, UChar &c, UChar &c2) {
3623     const uint32_t nextNorm32=_getNextNorm32(src, minC, ccOrQCMask, c, c2);
3624     return _isNFDSafe(nextNorm32, ccOrQCMask, ccOrQCMask&_NORM_QC_MASK);
3625 }
3626
3627 /*
3628  * NF*C forward boundary test:
3629  * read the next character and ask _isTrueStarter() whether it is, or its decomposition
3630  * begins with, a "true starter" (cc==0 and NF*C_YES).
3631  * On return, c2!=0 means (c, c2) is a surrogate pair.
3632  */
3633 static UBool
3634 _isNextTrueStarter(UCharIterator &src, uint32_t minC, uint32_t ccOrQCMask, UChar &c, UChar &c2) {
3635     /* decomposition quick check mask: the caller's mask shifted up by two bits, limited to the low four bits */
3636     const uint32_t decompBits=(ccOrQCMask<<2)&0xf;
3637     const uint32_t nextNorm32=_getNextNorm32(src, minC, ccOrQCMask|decompBits, c, c2);
3638
3639     return _isTrueStarter(nextNorm32, ccOrQCMask, decompBits);
3640 }
3641
3642 static int32_t
3643 _findNextIterationBoundary(UCharIterator &src,
3644                            IsNextBoundaryFn *isNextBoundary, uint32_t minC, uint32_t mask,
3645                            UChar *&buffer, int32_t &bufferCapacity,
3646                            UErrorCode *pErrorCode) {
3647     UChar *stackBuffer;
3648     int32_t bufferIndex;
3649     UChar c, c2;
3650     /* Copies one character from src into buffer unconditionally, then further characters until isNextBoundary() returns TRUE (that character is left unread) or src is exhausted; returns the number of UChars stored from buffer[0]. buffer and bufferCapacity are updated if the buffer had to grow. */
3651     if(!src.hasNext(&src)) {
3652         return 0;
3653     }
3654
3655     /* initialize: keep the caller's original buffer pointer, it is handed to u_growBufferFromStatic() below */
3656     stackBuffer=buffer;
3657
3658     /* get one character and ignore its properties; no capacity check here - assumes bufferCapacity>=2 (unorm_next() passes 100) */
3659     buffer[0]=c=(UChar)src.next(&src);
3660     bufferIndex=1;
3661     if(UTF_IS_FIRST_SURROGATE(c) && src.hasNext(&src)) {
3662         if(UTF_IS_SECOND_SURROGATE(c2=(UChar)src.next(&src))) {
3663             buffer[bufferIndex++]=c2;
3664         } else {
3665             src.move(&src, -1, UITER_CURRENT); /* back out the non-trail-surrogate */
3666         }
3667     }
3668
3669     /* get all following characters until we see a boundary */
3670     /* checking hasNext() instead of c!=DONE on the off-chance that U+ffff is part of the string */
3671     while(src.hasNext(&src)) {
3672         if(isNextBoundary(src, minC, mask, c, c2)) {
3673             /* back out the latest movement to stop at the boundary: one code unit, or two for a surrogate pair */
3674             src.move(&src, c2==0 ? -1 : -2, UITER_CURRENT);
3675             break;
3676         } else {
3677             if(bufferIndex+(c2==0 ? 1 : 2)<=bufferCapacity ||
3678                 /* attempt to grow the buffer (only evaluated if the character does not fit), passing bufferIndex as the current length */
3679                 u_growBufferFromStatic(stackBuffer, &buffer, &bufferCapacity,
3680                                        2*bufferCapacity,
3681                                        bufferIndex)
3682             ) {
3683                 buffer[bufferIndex++]=c;
3684                 if(c2!=0) {
3685                     buffer[bufferIndex++]=c2;
3686                 }
3687             } else {
3688                 *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
3689                 src.move(&src, 0, UITER_LIMIT); /* on allocation failure the iterator is moved to the limit */
3690                 return 0;
3691             }
3692         }
3693     }
3694
3695     /* return the length of the buffer contents */
3696     return bufferIndex;
3697 }
3698
3699 U_CAPI int32_t U_EXPORT2
3700 unorm_next(UCharIterator *src,
3701            UChar *dest, int32_t destCapacity,
3702            UNormalizationMode mode, int32_t options,
3703            UBool doNormalize, UBool *pNeededToNormalize,
3704            UErrorCode *pErrorCode) {
3705     UChar stackBuffer[100];
3706     UChar *buffer=NULL;
3707     IsNextBoundaryFn *isNextBoundary=NULL;
3708     uint32_t mask=0;
3709     int32_t bufferLength=0, bufferCapacity=0, destLength=0;
3710     int32_t c=0, c2=0;
3711     UChar minC=0;
3712     /* Reads forward from src to the next iteration boundary for mode (one code point for UNORM_NONE), writes that text to dest - normalized if doNormalize - and returns its length in UChars; *pNeededToNormalize, if not NULL, reports whether normalizing changed the text. */
3713     /* check argument values */
3714     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
3715         return 0;
3716     }
3717
3718     if( destCapacity<0 || (dest==NULL && destCapacity>0) ||
3719         src==NULL
3720     ) {
3721         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
3722         return 0;
3723     }
3724
3725     if(!_haveData(*pErrorCode)) {
3726         return 0;
3727     }
3728
3729     if(pNeededToNormalize!=NULL) {
3730         *pNeededToNormalize=FALSE;
3731     }
3732     /* select the boundary test, the minimum code unit passed to it as minC, and the norm32 bits it inspects */
3733     switch(mode) {
3734     case UNORM_NFD:
3735     case UNORM_FCD:
3736         isNextBoundary=_isNextNFDSafe;
3737         minC=_NORM_MIN_WITH_LEAD_CC;
3738         mask=_NORM_CC_MASK|_NORM_QC_NFD;
3739         break;
3740     case UNORM_NFKD:
3741         isNextBoundary=_isNextNFDSafe;
3742         minC=_NORM_MIN_WITH_LEAD_CC;
3743         mask=_NORM_CC_MASK|_NORM_QC_NFKD;
3744         break;
3745     case UNORM_NFC:
3746         isNextBoundary=_isNextTrueStarter;
3747         minC=(UChar)indexes[_NORM_INDEX_MIN_NFC_NO_MAYBE];
3748         mask=_NORM_CC_MASK|_NORM_QC_NFC;
3749         break;
3750     case UNORM_NFKC:
3751         isNextBoundary=_isNextTrueStarter;
3752         minC=(UChar)indexes[_NORM_INDEX_MIN_NFKC_NO_MAYBE];
3753         mask=_NORM_CC_MASK|_NORM_QC_NFKC;
3754         break;
3755     case UNORM_NONE:
3756         destLength=0; /* no normalization: step forward over exactly one code point */
3757         if((c=src->next(src))>=0) {
3758             destLength=1;
3759             if(UTF_IS_LEAD(c) && (c2=src->next(src))>=0) {
3760                 if(UTF_IS_TRAIL(c2)) {
3761                     destLength=2; /* always the full pair length, even if it does not fit, so that the overflow is reported */
3762                     if(destCapacity>=2) {
3763                         dest[1]=(UChar)c2; /* trail surrogate */
3764                     }
3765                     /* lead surrogate to be written below */
3766                 } else {
3767                     src->move(src, -1, UITER_CURRENT); /* not a pair: undo the second next() */
3768                 }
3769             }
3770
3771             if(destCapacity>0) {
3772                 dest[0]=(UChar)c;
3773             }
3774         }
3775         return u_terminateUChars(dest, destCapacity, destLength, pErrorCode);
3776     default:
3777         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
3778         return 0;
3779     }
3780     /* collect the source text up to the boundary into buffer, which may be reallocated */
3781     buffer=stackBuffer;
3782     bufferCapacity=(int32_t)(sizeof(stackBuffer)/U_SIZEOF_UCHAR);
3783     bufferLength=_findNextIterationBoundary(*src,
3784                                             isNextBoundary, minC, mask,
3785                                             buffer, bufferCapacity,
3786                                             pErrorCode);
3787     if(bufferLength>0) {
3788         if(doNormalize) {
3789             destLength=unorm_internalNormalize(dest, destCapacity,
3790                                                buffer, bufferLength,
3791                                                mode, options,
3792                                                pErrorCode);
3793             if(pNeededToNormalize!=NULL && U_SUCCESS(*pErrorCode)) {
3794                 *pNeededToNormalize= /* TRUE if the normalized text differs from the source text in length or content */
3795                     (UBool)(destLength!=bufferLength ||
3796                             0!=uprv_memcmp(dest, buffer, destLength*U_SIZEOF_UCHAR));
3797             }
3798         } else {
3799             /* just copy the source characters, as many as fit; the full length is still returned */
3800             if(destCapacity>0) {
3801                 uprv_memcpy(dest, buffer, uprv_min(bufferLength, destCapacity)*U_SIZEOF_UCHAR);
3802             }
3803             destLength=u_terminateUChars(dest, destCapacity, bufferLength, pErrorCode);
3804         }
3805     } else {
3806         destLength=u_terminateUChars(dest, destCapacity, 0, pErrorCode);
3807     }
3808
3809     /* cleanup: release the buffer only if it is no longer the stack buffer */
3810     if(buffer!=stackBuffer) {
3811         uprv_free(buffer);
3812     }
3813
3814     return destLength;
3815 }
3816
3817 /*
3818  * ### TODO: check if NF*D and FCD iteration finds optimal boundaries
3819  * and if not, how hard it would be to improve it.
3820  * For example, see _findSafeFCD().
3821  */
3822
3823 /* Concatenation of normalized strings -------------------------------------- */
3824
3825 U_CAPI int32_t U_EXPORT2
3826 unorm_concatenate(const UChar *left, int32_t leftLength,
3827                   const UChar *right, int32_t rightLength,
3828                   UChar *dest, int32_t destCapacity,
3829                   UNormalizationMode mode, int32_t options,
3830                   UErrorCode *pErrorCode) {
3831     UChar stackBuffer[100];
3832     UChar *buffer;
3833     int32_t bufferLength, bufferCapacity;
3834
3835     UCharIterator iter;
3836     int32_t leftBoundary, rightBoundary, destLength;
3837     /* Writes left+right to dest, re-normalizing only the text between the last boundary in left and the first boundary in right; returns the total length in UChars. A length of -1 is accepted for left and right; dest may be the same pointer as left but must not overlap right. */
3838     /* check argument values */
3839     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
3840         return 0;
3841     }
3842
3843     if( destCapacity<0 || (dest==NULL && destCapacity>0) ||
3844         left==NULL || leftLength<-1 ||
3845         right==NULL || rightLength<-1
3846     ) {
3847         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
3848         return 0;
3849     }
3850
3851     /* check for overlapping right and destination */
3852     if( dest!=NULL &&
3853         ((right>=dest && right<(dest+destCapacity)) ||
3854          (rightLength>0 && dest>=right && dest<(right+rightLength)))
3855     ) {
3856         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
3857         return 0;
3858     }
3859
3860     /* allow left==dest */
3861
3862     /* set up intermediate buffer */
3863     buffer=stackBuffer;
3864     bufferCapacity=(int32_t)(sizeof(stackBuffer)/U_SIZEOF_UCHAR);
3865
3866     /*
3867      * Input: left[0..leftLength[ + right[0..rightLength[
3868      *
3869      * Find normalization-safe boundaries leftBoundary and rightBoundary
3870      * and copy the end parts together:
3871      * buffer=left[leftBoundary..leftLength[ + right[0..rightBoundary[
3872      *
3873      * dest=left[0..leftBoundary[ +
3874      *      normalize(buffer) +
3875      *      right[rightBoundary..rightLength[
3876      */
3877
3878     /*
3879      * find a normalization boundary at the end of the left string
3880      * and copy the end part into the buffer
3881      */
3882     uiter_setString(&iter, left, leftLength);
3883     iter.index=leftLength=iter.length; /* end of left string */
3884
3885     bufferLength=unorm_previous(&iter, buffer, bufferCapacity,
3886                                 mode, options,
3887                                 FALSE, NULL,
3888                                 pErrorCode);
3889     leftBoundary=iter.index; /* the iterator has moved to the boundary even if the copy overflowed */
3890     if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
3891         *pErrorCode=U_ZERO_ERROR;
3892         if(!u_growBufferFromStatic(stackBuffer, &buffer, &bufferCapacity, 2*bufferLength, 0)) {
3893             *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
3894             /* don't need to cleanup here since
3895              * u_growBufferFromStatic frees buffer if(buffer!=stackBuffer)
3896              */
3897             return 0;
3898         }
3899
3900         /* just copy from the left string: we know the boundary already */
3901         uprv_memcpy(buffer, left+leftBoundary, bufferLength*U_SIZEOF_UCHAR);
3902     }
3903
3904     /*
3905      * find a normalization boundary at the beginning of the right string
3906      * and concatenate the beginning part to the buffer
3907      */
3908     uiter_setString(&iter, right, rightLength);
3909     rightLength=iter.length; /* in case it was -1 */
3910
3911     rightBoundary=unorm_next(&iter, buffer+bufferLength, bufferCapacity-bufferLength,
3912                              mode, options,
3913                              FALSE, NULL,
3914                              pErrorCode);
3915     if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
3916         *pErrorCode=U_ZERO_ERROR; /* grow, keeping the bufferLength UChars of the left part that are already in the buffer */
3917         if(!u_growBufferFromStatic(stackBuffer, &buffer, &bufferCapacity, bufferLength+rightBoundary, bufferLength)) {
3918             *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
3919             /* don't need to cleanup here since
3920              * u_growBufferFromStatic frees buffer if(buffer!=stackBuffer)
3921              */
3922             return 0;
3923         }
3924
3925         /* just copy from the right string: we know the boundary already */
3926         uprv_memcpy(buffer+bufferLength, right, rightBoundary*U_SIZEOF_UCHAR);
3927     }
3928
3929     bufferLength+=rightBoundary;
3930
3931     /* copy left[0..leftBoundary[ to dest (already in place if left==dest) */
3932     if(left!=dest && leftBoundary>0 && destCapacity>0) {
3933         uprv_memcpy(dest, left, uprv_min(leftBoundary, destCapacity)*U_SIZEOF_UCHAR);
3934     }
3935     destLength=leftBoundary;
3936
3937     /* concatenate the normalization of the buffer to dest, or only count its length if dest is already full */
3938     if(destCapacity>destLength) {
3939         destLength+=unorm_internalNormalize(dest+destLength, destCapacity-destLength,
3940                                             buffer, bufferLength,
3941                                             mode, options,
3942                                             pErrorCode);
3943     } else {
3944         destLength+=unorm_internalNormalize(NULL, 0,
3945                                             buffer, bufferLength,
3946                                             mode, options,
3947                                             pErrorCode);
3948     }
3949     /*
3950      * only errorCode that is expected is a U_BUFFER_OVERFLOW_ERROR
3951      * so we don't check for the error code here..just let it pass through
3952      */
3953     /* concatenate right[rightBoundary..rightLength[ to dest */
3954     right+=rightBoundary;
3955     rightLength-=rightBoundary;
3956     if(rightLength>0 && destCapacity>destLength) {
3957         uprv_memcpy(dest+destLength, right, uprv_min(rightLength, destCapacity-destLength)*U_SIZEOF_UCHAR);
3958     }
3959     destLength+=rightLength;
3960
3961     /* cleanup: release the buffer only if it is no longer the stack buffer */
3962     if(buffer!=stackBuffer) {
3963         uprv_free(buffer);
3964     }
3965
3966     return u_terminateUChars(dest, destCapacity, destLength, pErrorCode);
3967 }
3968
3969 /* data swapping ------------------------------------------------------------ */
3970
3971 U_CAPI int32_t U_EXPORT2
3972 unorm_swap(const UDataSwapper *ds,
3973            const void *inData, int32_t length, void *outData,
3974            UErrorCode *pErrorCode) {
3975     const UDataInfo *pInfo;
3976     int32_t headerSize;
3977     /* Swaps normalization data (dataFormat "Norm", format version 2) from inData to outData using ds and returns headerSize plus the data size computed from the indexes; with length<0 the swapping code below is skipped and only that total is computed. */
3978     const uint8_t *inBytes;
3979     uint8_t *outBytes;
3980
3981     const int32_t *inIndexes;
3982     int32_t indexes[32];
3983
3984     int32_t i, offset, count, size;
3985
3986     /* udata_swapDataHeader checks the arguments */
3987     headerSize=udata_swapDataHeader(ds, inData, length, outData, pErrorCode);
3988     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
3989         return 0;
3990     }
3991
3992     /* check data format and format version */
3993     pInfo=(const UDataInfo *)((const char *)inData+4);
3994     if(!(
3995         pInfo->dataFormat[0]==0x4e &&   /* dataFormat="Norm" */
3996         pInfo->dataFormat[1]==0x6f &&
3997         pInfo->dataFormat[2]==0x72 &&
3998         pInfo->dataFormat[3]==0x6d &&
3999         pInfo->formatVersion[0]==2
4000     )) {
4001         udata_printError(ds, "unorm_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as unorm.icu\n",
4002                          pInfo->dataFormat[0], pInfo->dataFormat[1],
4003                          pInfo->dataFormat[2], pInfo->dataFormat[3],
4004                          pInfo->formatVersion[0]);
4005         *pErrorCode=U_UNSUPPORTED_ERROR;
4006         return 0;
4007     }
4008     /* the data proper starts right after the header, at the same offset in input and output */
4009     inBytes=(const uint8_t *)inData+headerSize;
4010     outBytes=(uint8_t *)outData+headerSize;
4011
4012     inIndexes=(const int32_t *)inBytes;
4013
4014     if(length>=0) {
4015         length-=headerSize; /* from here on, length counts only the bytes after the header */
4016         if(length<32*4) {
4017             udata_printError(ds, "unorm_swap(): too few bytes (%d after header) for unorm.icu\n",
4018                              length);
4019             *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
4020             return 0;
4021         }
4022     }
4023
4024     /* read the first 32 indexes (ICU 2.8/format version 2.2: _NORM_INDEX_TOP==32, might grow) */
4025     for(i=0; i<32; ++i) {
4026         indexes[i]=udata_readInt32(ds, inIndexes[i]);
4027     }
4028
4029     /* calculate the total length of the data in bytes; the *_COUNT indexes are multiplied by 2 for 16-bit units */
4030     size=
4031         32*4+ /* size of indexes[] */
4032         indexes[_NORM_INDEX_TRIE_SIZE]+
4033         indexes[_NORM_INDEX_UCHAR_COUNT]*2+
4034         indexes[_NORM_INDEX_COMBINE_DATA_COUNT]*2+
4035         indexes[_NORM_INDEX_FCD_TRIE_SIZE]+
4036         indexes[_NORM_INDEX_AUX_TRIE_SIZE]+
4037         indexes[_NORM_INDEX_CANON_SET_COUNT]*2;
4038
4039     if(length>=0) {
4040         if(length<size) {
4041             udata_printError(ds, "unorm_swap(): too few bytes (%d after header) for all of unorm.icu\n",
4042                              length);
4043             *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
4044             return 0;
4045         }
4046
4047         /* copy the data for inaccessible bytes */
4048         if(inBytes!=outBytes) {
4049             uprv_memcpy(outBytes, inBytes, size);
4050         }
4051
4052         offset=0;
4053
4054         /* swap the indexes[] */
4055         count=32*4;
4056         ds->swapArray32(ds, inBytes, count, outBytes, pErrorCode);
4057         offset+=count;
4058
4059         /* swap the main UTrie */
4060         count=indexes[_NORM_INDEX_TRIE_SIZE];
4061         utrie_swap(ds, inBytes+offset, count, outBytes+offset, pErrorCode);
4062         offset+=count;
4063
4064         /* swap the uint16_t extraData[] and the uint16_t combiningTable[] */
4065         count=(indexes[_NORM_INDEX_UCHAR_COUNT]+indexes[_NORM_INDEX_COMBINE_DATA_COUNT])*2;
4066         ds->swapArray16(ds, inBytes+offset, count, outBytes+offset, pErrorCode);
4067         offset+=count;
4068
4069         /* swap the FCD UTrie, if present */
4070         count=indexes[_NORM_INDEX_FCD_TRIE_SIZE];
4071         if(count!=0) {
4072             utrie_swap(ds, inBytes+offset, count, outBytes+offset, pErrorCode);
4073             offset+=count;
4074         }
4075
4076         /* swap the aux UTrie, if present */
4077         count=indexes[_NORM_INDEX_AUX_TRIE_SIZE];
4078         if(count!=0) {
4079             utrie_swap(ds, inBytes+offset, count, outBytes+offset, pErrorCode);
4080             offset+=count;
4081         }
4082
4083         /* swap the uint16_t canonical-set data counted by _NORM_INDEX_CANON_SET_COUNT (combiningTable[] was already swapped above) */
4084         count=indexes[_NORM_INDEX_CANON_SET_COUNT]*2;
4085         ds->swapArray16(ds, inBytes+offset, count, outBytes+offset, pErrorCode);
4086         offset+=count;
4087     }
4088
4089     return headerSize+size;
4090 }
4091
4092 #endif /* #if !UCONFIG_NO_NORMALIZATION */