+ /* no combination this time */
+ prevCC=cc;
+ if(p==limit) {
+ return prevCC;
+ }
+
+ /* if (c, c2) did not combine, then check if it is a starter */
+ if(cc==0) {
+ /* found a new starter; combineFlags==0 if (c, c2) is excluded */
+ if(combineFlags&_NORM_COMBINES_FWD) {
+ /* it may combine with something, prepare for it */
+ if(c2==0) {
+ starterIsSupplementary=FALSE;
+ starter=p-1;
+ } else {
+ starterIsSupplementary=TRUE;
+ starter=p-2;
+ }
+ combineFwdIndex=combineBackIndex;
+ } else {
+ /* it will not combine with anything */
+ starter=NULL;
+ }
+ } else if(options&_NORM_OPTIONS_COMPOSE_CONTIGUOUS) {
+ /* FCC: no discontiguous compositions; any intervening character blocks */
+ starter=NULL;
+ }
+ }
+}
+
+/*
+ * Decompose and then recompose the source range [prevStarter..src[ into the side buffer.
+ *
+ * stackBuffer     the caller's stack array; handed to u_growBufferFromStatic()
+ *                 when the side buffer has to grow
+ * buffer          in/out: side buffer that receives the text
+ * bufferCapacity  in/out: capacity of buffer, in UChars
+ * length          out: number of UChars of recomposed text in buffer
+ * prevStarter     start of the source range
+ * src             limit (exclusive) of the source range
+ * prevCC          out: receives the value returned by _recompose(); it is left
+ *                 untouched when the decomposition has fewer than two code units
+ * options         _NORM_OPTIONS_COMPAT selects compatibility decomposition;
+ *                 the whole value is forwarded to _recompose()
+ * nx              exclusion set, forwarded to _decompose() and _recompose()
+ * pErrorCode      set to U_MEMORY_ALLOCATION_ERROR when the buffer cannot be grown
+ *
+ * Returns buffer, or NULL if growing the side buffer failed.
+ */
+static const UChar *
+_composePart(UChar *stackBuffer, UChar *&buffer, int32_t &bufferCapacity, int32_t &length,
+             const UChar *prevStarter, const UChar *src,
+             uint8_t &prevCC,
+             int32_t options, const UnicodeSet *nx,
+             UErrorCode *pErrorCode) {
+    UChar *recomposeLimit;
+    uint8_t trailCC;
+    UBool compat;
+
+    compat=(UBool)((options&_NORM_OPTIONS_COMPAT)!=0);
+
+    /*
+     * decompose [prevStarter..src[
+     * the pointer difference is narrowed explicitly, as _compose() does for its own lengths
+     */
+    length=_decompose(buffer, bufferCapacity,
+                      prevStarter, (int32_t)(src-prevStarter),
+                      compat, nx,
+                      trailCC);
+    if(length>bufferCapacity) {
+        /* the first pass only reported the required length: grow the buffer and redo it */
+        if(!u_growBufferFromStatic(stackBuffer, &buffer, &bufferCapacity, 2*length, 0)) {
+            *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
+            return NULL;
+        }
+        length=_decompose(buffer, bufferCapacity,
+                          prevStarter, (int32_t)(src-prevStarter),
+                          compat, nx,
+                          trailCC);
+    }
+
+    /* recompose the decomposition; fewer than two code units cannot combine */
+    recomposeLimit=buffer+length;
+    if(length>=2) {
+        /* NOTE(review): _recompose() presumably moves recomposeLimit (by reference) - confirm */
+        prevCC=_recompose(buffer, recomposeLimit, options, nx);
+    }
+
+    /* return with a pointer to the recomposition and its length */
+    length=(int32_t)(recomposeLimit-buffer);
+    return buffer;
+}
+/*
+ * Compose src[0..srcLength[ (or a NUL-terminated src when srcLength<0) into dest.
+ *
+ * Runs of code units that pass the quick check are copied straight through;
+ * Jamo V/T are handed to _composeHangul(); characters that only carry a
+ * non-zero combining class are appended or inserted in order; everything else
+ * is decomposed and recomposed between true starters via _composePart().
+ *
+ * dest, destCapacity  output buffer; a piece that does not fit is not written,
+ *                     but destIndex keeps counting (preflighting)
+ * src, srcLength      input text; srcLength<0 means NUL-terminated
+ * options             _NORM_OPTIONS_COMPAT selects the NFKC quick-check mask and
+ *                     minimum instead of the NFC ones; the value is also
+ *                     forwarded to _composePart()
+ * nx                  exclusion set consulted through nx_contains()
+ * pErrorCode          forwarded to _composePart(), which reports allocation failure
+ *
+ * Returns the number of UChars written or needed; 0 after _composePart() fails.
+ * The output is not NUL-terminated here.
+ */
+static int32_t
+_compose(UChar *dest, int32_t destCapacity,
+ const UChar *src, int32_t srcLength,
+ int32_t options, const UnicodeSet *nx,
+ UErrorCode *pErrorCode) {
+ UChar stackBuffer[_STACK_BUFFER_CAPACITY]; /* initial side buffer for _composePart() */
+ UChar *buffer;
+ int32_t bufferCapacity;
+
+ const UChar *limit, *prevSrc, *prevStarter;
+ uint32_t norm32, ccOrQCMask, qcMask;
+ int32_t destIndex, reorderStartIndex, length;
+ UChar c, c2, minNoMaybe;
+ uint8_t cc, prevCC;
+
+ if(options&_NORM_OPTIONS_COMPAT) {
+ minNoMaybe=(UChar)indexes[_NORM_INDEX_MIN_NFKC_NO_MAYBE];
+ qcMask=_NORM_QC_NFKC;
+ } else {
+ minNoMaybe=(UChar)indexes[_NORM_INDEX_MIN_NFC_NO_MAYBE];
+ qcMask=_NORM_QC_NFC;
+ }
+
+ /* initialize */
+ buffer=stackBuffer;
+ bufferCapacity=_STACK_BUFFER_CAPACITY;
+
+ /*
+ * prevStarter points to the last character before the current one
+ * that is a "true" starter with cc==0 and quick check "yes".
+ *
+ * prevStarter will be used instead of looking for a true starter
+ * while incrementally decomposing [prevStarter..prevSrc[
+ * in _composePart(). Having a good prevStarter allows to just decompose
+ * the entire [prevStarter..prevSrc[.
+ *
+ * When _composePart() backs out from prevSrc back to prevStarter,
+ * then it also backs out destIndex by the same amount.
+ * Therefore, at all times, the (prevSrc-prevStarter) source units
+ * must correspond 1:1 to destination units counted with destIndex,
+ * except for reordering.
+ * This is true for the qc "yes" characters copied in the fast loop,
+ * and for pure reordering.
+ * prevStarter must be set forward to src when this is not true:
+ * In _composePart() and after composing a Hangul syllable.
+ *
+ * This mechanism relies on the assumption that the decomposition of a true starter
+ * also begins with a true starter. gennorm/store.c checks for this.
+ */
+ prevStarter=src;
+
+ ccOrQCMask=_NORM_CC_MASK|qcMask;
+ destIndex=reorderStartIndex=0;
+ prevCC=0;
+
+ /* avoid compiler warnings */
+ norm32=0;
+ c=0;
+
+ if(srcLength>=0) {
+ /* string with length */
+ limit=src+srcLength;
+ } else /* srcLength==-1 */ {
+ /* zero-terminated string; limit==NULL marks this mode for the rest of the function */
+ limit=NULL;
+ }
+
+ U_ALIGN_CODE(16);
+
+ for(;;) {
+ /* count code units below the minimum or with irrelevant data for the quick check */
+ prevSrc=src;
+ if(limit==NULL) {
+ while((c=*src)<minNoMaybe ? c!=0 : ((norm32=_getNorm32(c))&ccOrQCMask)==0) {
+ prevCC=0;
+ ++src;
+ }
+ } else {
+ while(src!=limit && ((c=*src)<minNoMaybe || ((norm32=_getNorm32(c))&ccOrQCMask)==0)) {
+ prevCC=0;
+ ++src;
+ }
+ }
+
+ /* copy these code units all at once */
+ if(src!=prevSrc) {
+ length=(int32_t)(src-prevSrc);
+ if((destIndex+length)<=destCapacity) {
+ uprv_memcpy(dest+destIndex, prevSrc, length*U_SIZEOF_UCHAR);
+ }
+ destIndex+=length;
+ reorderStartIndex=destIndex;
+
+ /*
+ * set prevStarter to the last character in the quick check loop;
+ * if that code unit is the trail half of a surrogate pair inside the copied run,
+ * step back to its lead half
+ */
+ prevStarter=src-1;
+ if(UTF_IS_SECOND_SURROGATE(*prevStarter) && prevSrc<prevStarter && UTF_IS_FIRST_SURROGATE(*(prevStarter-1))) {
+ --prevStarter;
+ }
+
+ prevSrc=src;
+ }
+
+ /* end of source reached? */
+ if(limit==NULL ? c==0 : src==limit) {
+ break;
+ }
+
+ /* c already contains *src and norm32 is set for it, increment src */
+ ++src;
+
+ /*
+ * source buffer pointers:
+ *
+ *  all done      quick check   current char  not yet
+ *                "yes" but     (c, c2)       processed
+ *                may combine
+ *                forward
+ * [-------------[-------------[-------------[-------------[
+ * |             |             |             |             |
+ * start         prevStarter   prevSrc       src           limit
+ *
+ *
+ * destination buffer pointers and indexes:
+ *
+ *  all done      might take    not filled yet
+ *                characters for
+ *                reordering
+ * [-------------[-------------[-------------[
+ * |             |             |             |
+ * dest      reorderStartIndex destIndex     destCapacity
+ */
+
+ /* check one above-minimum, relevant code unit */
+ /*
+ * norm32 is for c=*(src-1), and the quick check flag is "no" or "maybe", and/or cc!=0
+ * check for Jamo V/T, then for surrogates and regular characters
+ * c is not a Hangul syllable or Jamo L because
+ * they are not marked with no/maybe for NFC & NFKC (and their cc==0)
+ */
+ if(isNorm32HangulOrJamo(norm32)) {
+ /*
+ * c is a Jamo V/T:
+ * try to compose with the previous character, Jamo V also with a following Jamo T,
+ * and set values here right now in case we just continue with the main loop
+ */
+ prevCC=cc=0;
+ reorderStartIndex=destIndex;
+
+ if(
+ destIndex>0 &&
+ _composeHangul(
+ *(prevSrc-1), c, norm32, src, limit, (UBool)((options&_NORM_OPTIONS_COMPAT)!=0),
+ destIndex<=destCapacity ? dest+(destIndex-1) : 0, /* null pointer once dest has overflowed */
+ nx)
+ ) {
+ prevStarter=src;
+ continue;
+ }
+
+ /* the Jamo V/T did not compose into a Hangul syllable, just append to dest */
+ c2=0;
+ length=1;
+ prevStarter=prevSrc;
+ } else {
+ if(isNorm32Regular(norm32)) {
+ c2=0;
+ length=1;
+ } else {
+ /* c is a lead surrogate, get the real norm32 */
+ if(src!=limit && UTF_IS_SECOND_SURROGATE(c2=*src)) {
+ ++src;
+ length=2;
+ norm32=_getNorm32FromSurrogatePair(norm32, c2);
+ } else {
+ /* c is an unpaired lead surrogate, nothing to do */
+ c2=0;
+ length=1;
+ norm32=0;
+ }
+ }
+
+ /* we are looking at the character (c, c2) at [prevSrc..src[ */
+ if(nx_contains(nx, c, c2)) {
+ /* excluded: norm32==0 */
+ cc=0;
+ } else if((norm32&qcMask)==0) {
+ /* quick check "yes": only the combining class matters, handled by the append code below */
+ cc=(uint8_t)(norm32>>_NORM_CC_SHIFT);
+ } else {
+ const UChar *p;
+ uint32_t decompQCMask;
+
+ /*
+ * find appropriate boundaries around this character,
+ * decompose the source text from between the boundaries,
+ * and recompose it
+ *
+ * this puts the intermediate text into the side buffer because
+ * it might be longer than the recomposition end result,
+ * or the destination buffer may be too short or missing
+ *
+ * note that destIndex may be adjusted backwards to account
+ * for source text that passed the quick check but needed to
+ * take part in the recomposition
+ */
+ decompQCMask=(qcMask<<2)&0xf; /* decomposition quick check mask */
+
+ /*
+ * find the last true starter in [prevStarter..src[
+ * it is either the decomposition of the current character (at prevSrc),
+ * or prevStarter
+ */
+ if(_isTrueStarter(norm32, ccOrQCMask, decompQCMask)) {
+ prevStarter=prevSrc;
+ } else {
+ /* adjust destIndex: back out what had been copied with qc "yes" */
+ destIndex-=(int32_t)(prevSrc-prevStarter);
+ }
+
+ /* find the next true starter in [src..limit[ - modifies src to point to the next starter */
+ src=_findNextStarter(src, limit, qcMask, decompQCMask, minNoMaybe);
+
+ /* compose [prevStarter..src[ */
+ p=_composePart(stackBuffer, buffer, bufferCapacity,
+ length, /* output */
+ prevStarter, src,
+ prevCC, /* output */
+ options, nx,
+ pErrorCode);
+
+ if(p==NULL) {
+ destIndex=0; /* an error occurred (out of memory) */
+ break;
+ }
+
+ /* append the recomposed buffer contents to the destination buffer */
+ if((destIndex+length)<=destCapacity) {
+ while(length>0) {
+ dest[destIndex++]=*p++;
+ --length;
+ }
+ } else {
+ /* buffer overflow */
+ /* keep incrementing the destIndex for preflighting */
+ destIndex+=length;
+ }
+
+ /* set the next starter */
+ prevStarter=src;
+
+ continue;
+ }
+ }
+
+ /* append the single code point (c, c2) to the destination buffer */
+ if((destIndex+length)<=destCapacity) {
+ if(cc!=0 && cc<prevCC) {
+ /* (c, c2) is out of order with respect to the preceding text */
+ UChar *reorderSplit=dest+destIndex;
+ destIndex+=length;
+ prevCC=_insertOrdered(dest+reorderStartIndex, reorderSplit, dest+destIndex, c, c2, cc);
+ } else {
+ /* just append (c, c2) */
+ dest[destIndex++]=c;
+ if(c2!=0) {
+ dest[destIndex++]=c2;
+ }
+ prevCC=cc;
+ }
+ } else {
+ /* buffer overflow */
+ /* keep incrementing the destIndex for preflighting */
+ destIndex+=length;
+ prevCC=cc;
+ }
+ }
+
+ /* cleanup: free the side buffer if it no longer points at stackBuffer */
+ if(buffer!=stackBuffer) {
+ uprv_free(buffer);
+ }
+
+ return destIndex;
+}
+
+/*
+ * Entry point for composition: checks that the data is available, resolves the
+ * exclusion set from the caller's options, and runs _compose().
+ *
+ * compat      non-zero requests compatibility composition (_NORM_OPTIONS_COMPAT)
+ * options     caller options; the bits that are reserved for internal use are
+ *             cleared before _compose() sees them
+ * pErrorCode  in/out error code
+ *
+ * Returns 0 if the data is unavailable or getNX() fails; otherwise the value
+ * returned by u_terminateUChars() for the length computed by _compose().
+ */
+U_CAPI int32_t U_EXPORT2
+unorm_compose(UChar *dest, int32_t destCapacity,
+              const UChar *src, int32_t srcLength,
+              UBool compat, int32_t options,
+              UErrorCode *pErrorCode) {
+    if(!_haveData(*pErrorCode)) {
+        return 0;
+    }
+
+    /* resolve the exclusion set while the caller's option bits are still intact */
+    const UnicodeSet *exclusions=getNX(options, *pErrorCode);
+    if(U_FAILURE(*pErrorCode)) {
+        return 0;
+    }
+
+    /* drop the bits that may only be set here or inside _compose(), then apply compat */
+    const int32_t internalBits=
+        _NORM_OPTIONS_SETS_MASK|_NORM_OPTIONS_COMPAT|_NORM_OPTIONS_COMPOSE_CONTIGUOUS;
+    options=(options&~internalBits)|(compat ? _NORM_OPTIONS_COMPAT : 0);
+
+    const int32_t composedLength=_compose(dest, destCapacity,
+                                          src, srcLength,
+                                          options, exclusions,
+                                          pErrorCode);
+
+    return u_terminateUChars(dest, destCapacity, composedLength, pErrorCode);
+}
+
+/* make FCD ----------------------------------------------------------------- */
+
+/*
+ * Find the first position in [src..limit[ that follows some cc==0 according to
+ * the FCD data. On entry fcd16 is the FCD value of the character before src.
+ *
+ * The scan stops
+ * - after a character whose trail cc==0,
+ * - at the end of the source,
+ * - before a character whose lead cc==0.
+ *
+ * Returns the position at which the scan stopped.
+ */
+static const UChar *
+_findSafeFCD(const UChar *src, const UChar *limit, uint16_t fcd16) {
+    /* continue only while the previous character has trail cc!=0 and text remains */
+    while((fcd16&0xff)!=0 && src!=limit) {
+        const UChar lead=*src;
+        int32_t units;
+
+        /* lead cc==0 for this character: stop (catches a terminating NUL, too) */
+        if(lead<_NORM_MIN_WITH_LEAD_CC) {
+            break;
+        }
+        fcd16=_getFCD16(lead);
+        if(fcd16==0) {
+            break;
+        }
+
+        if(UTF_IS_FIRST_SURROGATE(lead)) {
+            UChar trail;
+
+            /* an unpaired first surrogate has lead cc==0 */
+            if((src+1)==limit) {
+                break;
+            }
+            trail=*(src+1);
+            if(!UTF_IS_SECOND_SURROGATE(trail)) {
+                break;
+            }
+
+            /* lead is a lead surrogate, get the real fcd16 for the pair */
+            fcd16=_getFCD16FromSurrogatePair(fcd16, trail);
+            units=2;
+        } else {
+            units=1;
+        }
+
+        /* nothing in the upper byte (the lead cc): stop before this character */
+        if(fcd16<=0xff) {
+            break;
+        }
+        src+=units;
+    }
+
+    return src;
+}
+
+static uint8_t
+_decomposeFCD(const UChar *src, const UChar *decompLimit,
+ UChar *dest, int32_t &destIndex, int32_t destCapacity,
+ const UnicodeSet *nx) {
+ const UChar *p;
+ uint32_t norm32;
+ int32_t reorderStartIndex, length;
+ UChar c, c2;
+ uint8_t cc, prevCC, trailCC;
+
+ /*
+ * canonically decompose [src..decompLimit[
+ *
+ * all characters in this range have some non-zero cc,
+ * directly or in decomposition,
+ * so that we do not need to check in the following for quick-check limits etc.
+ *
+ * there _are_ _no_ Hangul syllables or Jamos in here because they are FCD-safe (cc==0)!
+ *
+ * we also do not need to check for c==0 because we have an established decompLimit
+ */
+ reorderStartIndex=destIndex;
+ prevCC=0;
+
+ while(src<decompLimit) {
+ c=*src++;
+ norm32=_getNorm32(c);
+ if(isNorm32Regular(norm32)) {
+ c2=0;
+ length=1;
+ } else {
+ /*
+ * reminder: this function is called with [src..decompLimit[
+ * not containing any Hangul/Jamo characters,
+ * therefore the only specials are lead surrogates
+ */
+ /* c is a lead surrogate, get the real norm32 */
+ if(src!=decompLimit && UTF_IS_SECOND_SURROGATE(c2=*src)) {
+ ++src;
+ length=2;
+ norm32=_getNorm32FromSurrogatePair(norm32, c2);
+ } else {
+ c2=0;
+ length=1;
+ norm32=0;
+ }
+ }
+
+ /* get the decomposition and the lead and trail cc's */
+ if(nx_contains(nx, c, c2)) {
+ /* excluded: norm32==0 */
+ cc=trailCC=0;
+ p=NULL;
+ } else if((norm32&_NORM_QC_NFD)==0) {
+ /* c does not decompose */
+ cc=trailCC=(uint8_t)(norm32>>_NORM_CC_SHIFT);
+ p=NULL;
+ } else {
+ /* c decomposes, get everything from the variable-length extra data */
+ p=_decompose(norm32, length, cc, trailCC);
+ if(length==1) {
+ /* fastpath a single code unit from decomposition */
+ c=*p;
+ c2=0;
+ p=NULL;
+ }
+ }
+
+ /* append the decomposition to the destination buffer, assume length>0 */
+ if((destIndex+length)<=destCapacity) {
+ UChar *reorderSplit=dest+destIndex;
+ if(p==NULL) {
+ /* fastpath: single code point */
+ if(cc!=0 && cc<prevCC) {
+ /* (c, c2) is out of order with respect to the preceding text */
+ destIndex+=length;
+ trailCC=_insertOrdered(dest+reorderStartIndex, reorderSplit, dest+destIndex, c, c2, cc);