icuSources/tools/gennorm/store.c

   1 /*
   2 *******************************************************************************
   3 *
   4 *   Copyright (C) 1999-2004, International Business Machines
   5 *   Corporation and others.  All Rights Reserved.
   6 *
   7 *******************************************************************************
   8 *   file name:  store.c
   9 *   encoding:   US-ASCII
  10 *   tab size:   8 (not used)
  11 *   indentation:4
  12 *
  13 *   created on: 2001may25
  14 *   created by: Markus W. Scherer
  15 *
  16 *   Store Unicode normalization data in a memory-mappable file.
  17 */
  18
  19 #include <stdio.h>
  20 #include <stdlib.h>
  21 #include "unicode/utypes.h"
  22 #include "unicode/uchar.h"
  23 #include "unicode/ustring.h"
  24 #include "cmemory.h"
  25 #include "cstring.h"
  26 #include "filestrm.h"
  27 #include "unicode/udata.h"
  28 #include "utrie.h"
  29 #include "unicode/uset.h"
  30 #include "toolutil.h"
  31 #include "unewdata.h"
  32 #include "unormimp.h"
  33 #include "gennorm.h"
  34 #ifdef WIN32
  35 #   pragma warning(disable: 4100)
  36 #endif
  37
  38 #define DO_DEBUG_OUT 0
  39
  40 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
  41
  42 /*
  43  * The new implementation of the normalization code loads its data from
  44  * unorm.icu, which is generated with this gennorm tool.
  45  * The format of that file is described in unormimp.h .
  46  */
  47
  48 /* file data ---------------------------------------------------------------- */
  49
  50 #if UCONFIG_NO_NORMALIZATION
  51
  52 /* dummy UDataInfo cf. udata.h */
  53 static UDataInfo dataInfo = {
  54     sizeof(UDataInfo),
  55     0,
  56
  57     U_IS_BIG_ENDIAN,
  58     U_CHARSET_FAMILY,
  59     U_SIZEOF_UCHAR,
  60     0,
  61
  62     { 0, 0, 0, 0 },                 /* dummy dataFormat */
  63     { 0, 0, 0, 0 },                 /* dummy formatVersion */
  64     { 0, 0, 0, 0 }                  /* dummy dataVersion */
  65 };
  66
  67 #else
  68
  69 /* UDataInfo cf. udata.h */
  70 static UDataInfo dataInfo={
  71     sizeof(UDataInfo),
  72     0,
  73
  74     U_IS_BIG_ENDIAN,
  75     U_CHARSET_FAMILY,
  76     U_SIZEOF_UCHAR,
  77     0,
  78
  79     { 0x4e, 0x6f, 0x72, 0x6d },   /* dataFormat="Norm" */
  80     { 2, 3, UTRIE_SHIFT, UTRIE_INDEX_SHIFT },   /* formatVersion */
  81     { 3, 2, 0, 0 }                /* dataVersion (Unicode version) */
  82 };
  83
  84 extern void
  85 setUnicodeVersion(const char *v) {
  86     UVersionInfo version;
  87     u_versionFromString(version, v);
  88     uprv_memcpy(dataInfo.dataVersion, version, 4);
  89 }
  90
  91 static int32_t indexes[_NORM_INDEX_TOP]={ 0 };
  92
  93 /* builder data ------------------------------------------------------------- */
  94
  95 typedef void EnumTrieFn(void *context, uint32_t code, Norm *norm);
  96
  97 static UNewTrie
  98     *normTrie,
  99     *norm32Trie,
 100     *fcdTrie,
 101     *auxTrie;
 102
 103 static UToolMemory *normMem, *utf32Mem, *extraMem, *combiningTriplesMem;
 104
 105 static Norm *norms;
 106
 107 /*
 108  * set a flag for each code point that was seen in decompositions -
 109  * avoid to decompose ones that have not been used before
 110  */
 111 static uint32_t haveSeenFlags[256];
 112
 113 /* set of characters with NFD_QC=No (i.e., those with canonical decompositions) */
 114 static USet *nfdQCNoSet;
 115
 116 /* see addCombiningCP() for details */
 117 static uint32_t combiningCPs[2000];
 118
 119 /*
 120  * after processCombining() this contains for each code point in combiningCPs[]
 121  * the runtime combining index
 122  */
 123 static uint16_t combiningIndexes[2000];
 124
 125 /* section limits for combiningCPs[], see addCombiningCP() */
 126 static uint16_t combineFwdTop=0, combineBothTop=0, combineBackTop=0;
 127
 128 /**
 129  * Structure for a triple of code points, stored in combiningTriplesMem.
 130  * The lead and trail code points combine into the the combined one,
 131  * i.e., there is a canonical decomposition of combined-> <lead, trail>.
 132  *
 133  * Before processCombining() is called, leadIndex and trailIndex are 0.
 134  * After processCombining(), they contain the indexes of the lead and trail
 135  * code point in the combiningCPs[] array.
 136  * They are then sorted by leadIndex, then trailIndex.
 137  * They are not sorted by code points.
 138  */
 139 typedef struct CombiningTriple {
 140     uint16_t leadIndex, trailIndex;
 141     uint32_t lead, trail, combined;
 142 } CombiningTriple;
 143
 144 /* 15b in the combining index -> <=0x8000 uint16_t values in the combining table */
 145 static uint16_t combiningTable[0x8000];
 146 static uint16_t combiningTableTop=0;
 147
 148 #define _NORM_MAX_SET_SEARCH_TABLE_LENGTH 0x4000
 149 static uint16_t canonStartSets[_NORM_MAX_CANON_SETS+2*_NORM_MAX_SET_SEARCH_TABLE_LENGTH
 150                                +10000]; /* +10000 for exclusion sets */
 151 static int32_t canonStartSetsTop=_NORM_SET_INDEX_TOP;
 152 static int32_t canonSetsCount=0;
 153
 154 extern void
 155 init() {
 156     uint16_t *p16;
 157
 158     normTrie = (UNewTrie *)uprv_malloc(sizeof(UNewTrie));
 159     uprv_memset(normTrie, 0, sizeof(UNewTrie));
 160     norm32Trie = (UNewTrie *)uprv_malloc(sizeof(UNewTrie));
 161     uprv_memset(norm32Trie, 0, sizeof(UNewTrie));
 162     fcdTrie = (UNewTrie *)uprv_malloc(sizeof(UNewTrie));
 163     uprv_memset(fcdTrie, 0, sizeof(UNewTrie));
 164     auxTrie = (UNewTrie *)uprv_malloc(sizeof(UNewTrie));
 165     uprv_memset(auxTrie, 0, sizeof(UNewTrie));
 166
 167     /* initialize the two tries */
 168     if(NULL==utrie_open(normTrie, NULL, 30000, 0, 0, FALSE)) {
 169         fprintf(stderr, "error: failed to initialize tries\n");
 170         exit(U_MEMORY_ALLOCATION_ERROR);
 171     }
 172
 173     /* allocate Norm structures and reset the first one */
 174     normMem=utm_open("gennorm normalization structs", 20000, 20000, sizeof(Norm));
 175     norms=utm_alloc(normMem);
 176
 177     /* allocate UTF-32 string memory */
 178     utf32Mem=utm_open("gennorm UTF-32 strings", 30000, 30000, 4);
 179
 180     /* reset all "have seen" flags */
 181     uprv_memset(haveSeenFlags, 0, sizeof(haveSeenFlags));
 182
 183     /* open an empty set */
 184     nfdQCNoSet=uset_open(1, 0);
 185
 186     /* allocate extra data memory for UTF-16 decomposition strings and other values */
 187     extraMem=utm_open("gennorm extra 16-bit memory", _NORM_EXTRA_INDEX_TOP, _NORM_EXTRA_INDEX_TOP, 2);
 188     /* initialize the extraMem counter for the top of FNC strings */
 189     p16=(uint16_t *)utm_alloc(extraMem);
 190     *p16=1;
 191
 192     /* allocate temporary memory for combining triples */
 193     combiningTriplesMem=utm_open("gennorm combining triples", 0x4000, 0x4000, sizeof(CombiningTriple));
 194
 195     /* set the minimum code points for no/maybe quick check values to the end of the BMP */
 196     indexes[_NORM_INDEX_MIN_NFC_NO_MAYBE]=0xffff;
 197     indexes[_NORM_INDEX_MIN_NFKC_NO_MAYBE]=0xffff;
 198     indexes[_NORM_INDEX_MIN_NFD_NO_MAYBE]=0xffff;
 199     indexes[_NORM_INDEX_MIN_NFKD_NO_MAYBE]=0xffff;
 200
 201     /* preset the indexes portion of canonStartSets */
 202     uprv_memset(canonStartSets, 0, _NORM_SET_INDEX_TOP*2);
 203 }
 204
 205 /*
 206  * get or create a Norm unit;
 207  * get or create the intermediate trie entries for it as well
 208  */
 209 static Norm *
 210 createNorm(uint32_t code) {
 211     Norm *p;
 212     uint32_t i;
 213
 214     i=utrie_get32(normTrie, (UChar32)code, NULL);
 215     if(i!=0) {
 216         p=norms+i;
 217     } else {
 218         /* allocate Norm */
 219         p=(Norm *)utm_alloc(normMem);
 220         if(!utrie_set32(normTrie, (UChar32)code, (uint32_t)(p-norms))) {
 221             fprintf(stderr, "error: too many normalization entries\n");
 222             exit(U_BUFFER_OVERFLOW_ERROR);
 223         }
 224     }
 225     return p;
 226 }
 227
 228 /* get an existing Norm unit */
 229 static Norm *
 230 getNorm(uint32_t code) {
 231     uint32_t i;
 232
 233     i=utrie_get32(normTrie, (UChar32)code, NULL);
 234     if(i==0) {
 235         return NULL;
 236     }
 237     return norms+i;
 238 }
 239
 240 /* get the canonical combining class of a character */
 241 static uint8_t
 242 getCCFromCP(uint32_t code) {
 243     Norm *norm=getNorm(code);
 244     if(norm==NULL) {
 245         return 0;
 246     } else {
 247         return norm->udataCC;
 248     }
 249 }
 250
 251 /*
 252  * enumerate all code points with their Norm structs and call a function for each
 253  * return the number of code points with data
 254  */
 255 static uint32_t
 256 enumTrie(EnumTrieFn *fn, void *context) {
 257     uint32_t count, i;
 258     UChar32 code;
 259     UBool isInBlockZero;
 260
 261     count=0;
 262     for(code=0; code<=0x10ffff;) {
 263         i=utrie_get32(normTrie, code, &isInBlockZero);
 264         if(isInBlockZero) {
 265             code+=UTRIE_DATA_BLOCK_LENGTH;
 266         } else {
 267             if(i!=0) {
 268                 fn(context, (uint32_t)code, norms+i);
 269                 ++count;
 270             }
 271             ++code;
 272         }
 273     }
 274     return count;
 275 }
 276
 277 static void
 278 setHaveSeenString(const uint32_t *s, int32_t length) {
 279     uint32_t c;
 280
 281     while(length>0) {
 282         c=*s++;
 283         haveSeenFlags[(c>>5)&0xff]|=(1<<(c&0x1f));
 284         --length;
 285     }
 286 }
 287
 288 #define HAVE_SEEN(c) (haveSeenFlags[((c)>>5)&0xff]&(1<<((c)&0x1f)))
 289
 290 /* handle combining data ---------------------------------------------------- */
 291
 292 /*
 293  * Insert an entry into combiningCPs[] for the new code point code with its flags.
 294  * The flags indicate if code combines forward, backward, or both.
 295  *
 296  * combiningCPs[] contains three sections:
 297  * 1. code points that combine forward
 298  * 2. code points that combine forward and backward
 299  * 3. code points that combine backward
 300  *
 301  * Search for code in the entire array.
 302  * If it is found and already is in the right section (old flags==new flags)
 303  * then we are done.
 304  * If it is found but the flags are different, then remove it,
 305  * union the old and new flags, and reinsert it into its correct section.
 306  * If it is not found, then just insert it.
 307  *
 308  * Within each section, the code points are not sorted.
 309  */
 310 static void
 311 addCombiningCP(uint32_t code, uint8_t flags) {
 312     uint32_t newEntry;
 313     uint16_t i;
 314
 315     newEntry=code|((uint32_t)flags<<24);
 316
 317     /* search for this code point */
 318     for(i=0; i<combineBackTop; ++i) {
 319         if(code==(combiningCPs[i]&0xffffff)) {
 320             /* found it */
 321             if(newEntry==combiningCPs[i]) {
 322                 return; /* no change */
 323             }
 324
 325             /* combine the flags, remove the old entry from the old place, and insert the new one */
 326             newEntry|=combiningCPs[i];
 327             if(i!=--combineBackTop) {
 328                 uprv_memmove(combiningCPs+i, combiningCPs+i+1, (combineBackTop-i)*4);
 329             }
 330             if(i<combineBothTop) {
 331                 --combineBothTop;
 332             }
 333             if(i<combineFwdTop) {
 334                 --combineFwdTop;
 335             }
 336             break;
 337         }
 338     }
 339
 340     /* not found or modified, insert it */
 341     if(combineBackTop>=sizeof(combiningCPs)/4) {
 342         fprintf(stderr, "error: gennorm combining code points - trying to use more than %ld units\n",
 343                 (long)(sizeof(combiningCPs)/4));
 344         exit(U_MEMORY_ALLOCATION_ERROR);
 345     }
 346
 347     /* set i to the insertion point */
 348     flags=(uint8_t)(newEntry>>24);
 349     if(flags==1) {
 350         i=combineFwdTop++;
 351         ++combineBothTop;
 352     } else if(flags==3) {
 353         i=combineBothTop++;
 354     } else /* flags==2 */ {
 355         i=combineBackTop;
 356     }
 357
 358     /* move the following code points up one and insert newEntry at i */
 359     if(i<combineBackTop) {
 360         uprv_memmove(combiningCPs+i+1, combiningCPs+i, (combineBackTop-i)*4);
 361     }
 362     combiningCPs[i]=newEntry;
 363
 364     /* finally increment the total counter */
 365     ++combineBackTop;
 366 }
 367
 368 /**
 369  * Find the index in combiningCPs[] where code point code is stored.
 370  * @param code code point to look for
 371  * @param isLead is code a forward combining code point?
 372  * @return index in combiningCPs[] where code is stored
 373  */
 374 static uint16_t
 375 findCombiningCP(uint32_t code, UBool isLead) {
 376     uint16_t i, limit;
 377
 378     if(isLead) {
 379         i=0;
 380         limit=combineBothTop;
 381     } else {
 382         i=combineFwdTop;
 383         limit=combineBackTop;
 384     }
 385
 386     /* search for this code point */
 387     for(; i<limit; ++i) {
 388         if(code==(combiningCPs[i]&0xffffff)) {
 389             /* found it */
 390             return i;
 391         }
 392     }
 393
 394     /* not found */
 395     return 0xffff;
 396 }
 397
 398 static void
 399 addCombiningTriple(uint32_t lead, uint32_t trail, uint32_t combined) {
 400     CombiningTriple *triple;
 401
 402     /*
 403      * set combiningFlags for the two code points
 404      * do this after decomposition so that getNorm() above returns NULL
 405      * if we do not have actual sub-decomposition data for the initial NFD here
 406      */
 407     createNorm(lead)->combiningFlags|=1;    /* combines forward */
 408     createNorm(trail)->combiningFlags|=2;    /* combines backward */
 409
 410     addCombiningCP(lead, 1);
 411     addCombiningCP(trail, 2);
 412
 413     triple=(CombiningTriple *)utm_alloc(combiningTriplesMem);
 414     triple->lead=lead;
 415     triple->trail=trail;
 416     triple->combined=combined;
 417 }
 418
 419 static int
 420 compareTriples(const void *l, const void *r) {
 421     int diff;
 422     diff=(int)((CombiningTriple *)l)->leadIndex-
 423          (int)((CombiningTriple *)r)->leadIndex;
 424     if(diff==0) {
 425         diff=(int)((CombiningTriple *)l)->trailIndex-
 426              (int)((CombiningTriple *)r)->trailIndex;
 427     }
 428     return diff;
 429 }
 430
 431 static void
 432 processCombining() {
 433     CombiningTriple *triples;
 434     uint16_t *p;
 435     uint32_t combined;
 436     uint16_t i, j, count, tableTop, finalIndex, combinesFwd;
 437
 438     triples=utm_getStart(combiningTriplesMem);
 439
 440     /* add lead and trail indexes to the triples for sorting */
 441     count=(uint16_t)utm_countItems(combiningTriplesMem);
 442     for(i=0; i<count; ++i) {
 443         /* findCombiningCP() must always find the code point */
 444         triples[i].leadIndex=findCombiningCP(triples[i].lead, TRUE);
 445         triples[i].trailIndex=findCombiningCP(triples[i].trail, FALSE);
 446     }
 447
 448     /* sort them by leadIndex, trailIndex */
 449     qsort(triples, count, sizeof(CombiningTriple), compareTriples);
 450
 451     /* calculate final combining indexes and store them in the Norm entries */
 452     tableTop=0;
 453     j=0; /* triples counter */
 454
 455     /* first, combining indexes of fwd/both characters are indexes into the combiningTable */
 456     for(i=0; i<combineBothTop; ++i) {
 457         /* start a new table */
 458
 459         /* assign combining index */
 460         createNorm(combiningCPs[i]&0xffffff)->combiningIndex=combiningIndexes[i]=tableTop;
 461
 462         /* calculate the length of the combining data for this lead code point in the combiningTable */
 463         while(j<count && i==triples[j].leadIndex) {
 464             /* count 2 to 3 16-bit units per composition entry (back-index, code point) */
 465             combined=triples[j++].combined;
 466             if(combined<=0x1fff) {
 467                 tableTop+=2;
 468             } else {
 469                 tableTop+=3;
 470             }
 471         }
 472     }
 473
 474     /* second, combining indexes of back-only characters are simply incremented from here to be unique */
 475     finalIndex=tableTop;
 476     for(; i<combineBackTop; ++i) {
 477         createNorm(combiningCPs[i]&0xffffff)->combiningIndex=combiningIndexes[i]=finalIndex++;
 478     }
 479
 480     /* it must be finalIndex<=0x8000 because bit 15 is used in combiningTable as an end-for-this-lead marker */
 481     if(finalIndex>0x8000) {
 482         fprintf(stderr, "error: gennorm combining table - trying to use %u units, more than the %ld units available\n",
 483                 tableTop, (long)(sizeof(combiningTable)/4));
 484         exit(U_MEMORY_ALLOCATION_ERROR);
 485     }
 486
 487     combiningTableTop=tableTop;
 488
 489     /* store the combining data in the combiningTable, with the final indexes from above */
 490     p=combiningTable;
 491     j=0; /* triples counter */
 492
 493     /*
 494      * this is essentially the same loop as above, but
 495      * it writes the table data instead of calculating and setting the final indexes;
 496      * it is necessary to have two passes so that all the final indexes are known before
 497      * they are written into the table
 498      */
 499     for(i=0; i<combineBothTop; ++i) {
 500         /* start a new table */
 501
 502         combined=0; /* avoid compiler warning */
 503
 504         /* store the combining data for this lead code point in the combiningTable */
 505         while(j<count && i==triples[j].leadIndex) {
 506             finalIndex=combiningIndexes[triples[j].trailIndex];
 507             combined=triples[j++].combined;
 508
 509             /* is combined a starter? (i.e., cc==0 && combines forward) */
 510             combinesFwd=(uint16_t)((getNorm(combined)->combiningFlags&1)<<13);
 511
 512             *p++=finalIndex;
 513             if(combined<=0x1fff) {
 514                 *p++=(uint16_t)(combinesFwd|combined);
 515             } else if(combined<=0xffff) {
 516                 *p++=(uint16_t)(0x8000|combinesFwd);
 517                 *p++=(uint16_t)combined;
 518             } else {
 519                 *p++=(uint16_t)(0xc000|combinesFwd|((combined-0x10000)>>10));
 520                 *p++=(uint16_t)(0xdc00|(combined&0x3ff));
 521             }
 522         }
 523
 524         /* set a marker on the last final trail index in this lead's table */
 525         if(combined<=0x1fff) {
 526             *(p-2)|=0x8000;
 527         } else {
 528             *(p-3)|=0x8000;
 529         }
 530     }
 531
 532     /* post condition: tableTop==(p-combiningTable) */
 533 }
 534
 535 /* processing incoming normalization data ----------------------------------- */
 536
 537 /*
 538  * Decompose Hangul syllables algorithmically and fill a pseudo-Norm struct.
 539  * c must be a Hangul syllable code point.
 540  */
 541 static void
 542 getHangulDecomposition(uint32_t c, Norm *pHangulNorm, uint32_t hangulBuffer[3]) {
 543     /* Hangul syllable: decompose algorithmically */
 544     uint32_t c2;
 545     uint8_t length;
 546
 547     uprv_memset(pHangulNorm, 0, sizeof(Norm));
 548
 549     c-=HANGUL_BASE;
 550
 551     c2=c%JAMO_T_COUNT;
 552     c/=JAMO_T_COUNT;
 553     if(c2>0) {
 554         hangulBuffer[2]=JAMO_T_BASE+c2;
 555         length=3;
 556     } else {
 557         hangulBuffer[2]=0;
 558         length=2;
 559     }
 560
 561     hangulBuffer[1]=JAMO_V_BASE+c%JAMO_V_COUNT;
 562     hangulBuffer[0]=JAMO_L_BASE+c/JAMO_V_COUNT;
 563
 564     pHangulNorm->nfd=pHangulNorm->nfkd=hangulBuffer;
 565     pHangulNorm->lenNFD=pHangulNorm->lenNFKD=length;
 566 }
 567
 568 /*
 569  * decompose the one decomposition further, may generate two decompositions
 570  * apply all previous characters' decompositions to this one
 571  */
 572 static void
 573 decompStoreNewNF(uint32_t code, Norm *norm) {
 574     uint32_t nfd[40], nfkd[40], hangulBuffer[3];
 575     Norm hangulNorm;
 576
 577     uint32_t *s32;
 578     Norm *p;
 579     uint32_t c;
 580     int32_t i, length;
 581     uint8_t lenNFD=0, lenNFKD=0;
 582     UBool changedNFD=FALSE, changedNFKD=FALSE;
 583
 584     if((length=norm->lenNFD)!=0) {
 585         /* always allocate the original string */
 586         changedNFD=TRUE;
 587         s32=norm->nfd;
 588     } else if((length=norm->lenNFKD)!=0) {
 589         /* always allocate the original string */
 590         changedNFKD=TRUE;
 591         s32=norm->nfkd;
 592     } else {
 593         /* no decomposition here, nothing to do */
 594         return;
 595     }
 596
 597     /* decompose each code point */
 598     for(i=0; i<length; ++i) {
 599         c=s32[i];
 600         p=getNorm(c);
 601         if(p==NULL) {
 602             if(HANGUL_BASE<=c && c<(HANGUL_BASE+HANGUL_COUNT)) {
 603                 getHangulDecomposition(c, &hangulNorm, hangulBuffer);
 604                 p=&hangulNorm;
 605             } else {
 606                 /* no data, no decomposition */
 607                 nfd[lenNFD++]=c;
 608                 nfkd[lenNFKD++]=c;
 609                 continue;
 610             }
 611         }
 612
 613         /* canonically decompose c */
 614         if(changedNFD) {
 615             if(p->lenNFD!=0) {
 616                 uprv_memcpy(nfd+lenNFD, p->nfd, p->lenNFD*4);
 617                 lenNFD+=p->lenNFD;
 618             } else {
 619                 nfd[lenNFD++]=c;
 620             }
 621         }
 622
 623         /* compatibility-decompose c */
 624         if(p->lenNFKD!=0) {
 625             uprv_memcpy(nfkd+lenNFKD, p->nfkd, p->lenNFKD*4);
 626             lenNFKD+=p->lenNFKD;
 627             changedNFKD=TRUE;
 628         } else if(p->lenNFD!=0) {
 629             uprv_memcpy(nfkd+lenNFKD, p->nfd, p->lenNFD*4);
 630             lenNFKD+=p->lenNFD;
 631             changedNFKD=TRUE;
 632         } else {
 633             nfkd[lenNFKD++]=c;
 634         }
 635     }
 636
 637     /* assume that norm->lenNFD==1 or ==2 */
 638     if(norm->lenNFD==2 && !(norm->combiningFlags&0x80)) {
 639         addCombiningTriple(s32[0], s32[1], code);
 640     }
 641
 642     if(changedNFD) {
 643         if(lenNFD!=0) {
 644             s32=utm_allocN(utf32Mem, lenNFD);
 645             uprv_memcpy(s32, nfd, lenNFD*4);
 646         } else {
 647             s32=NULL;
 648         }
 649         norm->lenNFD=lenNFD;
 650         norm->nfd=s32;
 651         setHaveSeenString(nfd, lenNFD);
 652     }
 653     if(changedNFKD) {
 654         if(lenNFKD!=0) {
 655             s32=utm_allocN(utf32Mem, lenNFKD);
 656             uprv_memcpy(s32, nfkd, lenNFKD*4);
 657         } else {
 658             s32=NULL;
 659         }
 660         norm->lenNFKD=lenNFKD;
 661         norm->nfkd=s32;
 662         setHaveSeenString(nfkd, lenNFKD);
 663     }
 664 }
 665
 666 typedef struct DecompSingle {
 667     uint32_t c;
 668     Norm *norm;
 669 } DecompSingle;
 670
 671 /*
 672  * apply this one character's decompositions (there is at least one!) to
 673  * all previous characters' decompositions to decompose them further
 674  */
 675 static void
 676 decompWithSingleFn(void *context, uint32_t code, Norm *norm) {
 677     uint32_t nfd[40], nfkd[40];
 678     uint32_t *s32;
 679     DecompSingle *me=(DecompSingle *)context;
 680     uint32_t c, myC;
 681     int32_t i, length;
 682     uint8_t lenNFD=0, lenNFKD=0, myLenNFD, myLenNFKD;
 683     UBool changedNFD=FALSE, changedNFKD=FALSE;
 684
 685     /* get the new character's data */
 686     myC=me->c;
 687     myLenNFD=me->norm->lenNFD;
 688     myLenNFKD=me->norm->lenNFKD;
 689     /* assume that myC has at least one decomposition */
 690
 691     if((length=norm->lenNFD)!=0 && myLenNFD!=0) {
 692         /* apply NFD(myC) to norm->nfd */
 693         s32=norm->nfd;
 694         for(i=0; i<length; ++i) {
 695             c=s32[i];
 696             if(c==myC) {
 697                 uprv_memcpy(nfd+lenNFD, me->norm->nfd, myLenNFD*4);
 698                 lenNFD+=myLenNFD;
 699                 changedNFD=TRUE;
 700             } else {
 701                 nfd[lenNFD++]=c;
 702             }
 703         }
 704     }
 705
 706     if((length=norm->lenNFKD)!=0) {
 707         /* apply NFD(myC) and NFKD(myC) to norm->nfkd */
 708         s32=norm->nfkd;
 709         for(i=0; i<length; ++i) {
 710             c=s32[i];
 711             if(c==myC) {
 712                 if(myLenNFKD!=0) {
 713                     uprv_memcpy(nfkd+lenNFKD, me->norm->nfkd, myLenNFKD*4);
 714                     lenNFKD+=myLenNFKD;
 715                 } else /* assume myLenNFD!=0 */ {
 716                     uprv_memcpy(nfkd+lenNFKD, me->norm->nfd, myLenNFD*4);
 717                     lenNFKD+=myLenNFD;
 718                 }
 719                 changedNFKD=TRUE;
 720             } else {
 721                 nfkd[lenNFKD++]=c;
 722             }
 723         }
 724     } else if((length=norm->lenNFD)!=0 && myLenNFKD!=0) {
 725         /* apply NFKD(myC) to norm->nfd, forming a new norm->nfkd */
 726         s32=norm->nfd;
 727         for(i=0; i<length; ++i) {
 728             c=s32[i];
 729             if(c==myC) {
 730                 uprv_memcpy(nfkd+lenNFKD, me->norm->nfkd, myLenNFKD*4);
 731                 lenNFKD+=myLenNFKD;
 732                 changedNFKD=TRUE;
 733             } else {
 734                 nfkd[lenNFKD++]=c;
 735             }
 736         }
 737     }
 738
 739     /* set the new decompositions, forget the old ones */
 740     if(changedNFD) {
 741         if(lenNFD!=0) {
 742             if(lenNFD>norm->lenNFD) {
 743                 s32=utm_allocN(utf32Mem, lenNFD);
 744             } else {
 745                 s32=norm->nfd;
 746             }
 747             uprv_memcpy(s32, nfd, lenNFD*4);
 748         } else {
 749             s32=NULL;
 750         }
 751         norm->lenNFD=lenNFD;
 752         norm->nfd=s32;
 753     }
 754     if(changedNFKD) {
 755         if(lenNFKD!=0) {
 756             if(lenNFKD>norm->lenNFKD) {
 757                 s32=utm_allocN(utf32Mem, lenNFKD);
 758             } else {
 759                 s32=norm->nfkd;
 760             }
 761             uprv_memcpy(s32, nfkd, lenNFKD*4);
 762         } else {
 763             s32=NULL;
 764         }
 765         norm->lenNFKD=lenNFKD;
 766         norm->nfkd=s32;
 767     }
 768 }
 769
 770 /*
 771  * process the data for one code point listed in UnicodeData;
 772  * UnicodeData itself never maps a code point to both NFD and NFKD
 773  */
 774 extern void
 775 storeNorm(uint32_t code, Norm *norm) {
 776     DecompSingle decompSingle;
 777     Norm *p;
 778
 779     /* copy existing derived normalization properties */
 780     p=createNorm(code);
 781     norm->qcFlags=p->qcFlags;
 782     norm->combiningFlags=p->combiningFlags;
 783     norm->fncIndex=p->fncIndex;
 784
 785     /* process the decomposition if if there is at one here */
 786     if((norm->lenNFD|norm->lenNFKD)!=0) {
 787         /* decompose this one decomposition further, may generate two decompositions */
 788         decompStoreNewNF(code, norm);
 789
 790         /* has this code point been used in previous decompositions? */
 791         if(HAVE_SEEN(code)) {
 792             /* use this decomposition to decompose other decompositions further */
 793             decompSingle.c=code;
 794             decompSingle.norm=norm;
 795             enumTrie(decompWithSingleFn, &decompSingle);
 796         }
 797     }
 798
 799     /* store the data */
 800     uprv_memcpy(p, norm, sizeof(Norm));
 801 }
 802
 803 extern void
 804 setQCFlags(uint32_t code, uint8_t qcFlags) {
 805     createNorm(code)->qcFlags|=qcFlags;
 806
 807     /* adjust the minimum code point for quick check no/maybe */
 808     if(code<0xffff) {
 809         if((qcFlags&_NORM_QC_NFC) && (uint16_t)code<indexes[_NORM_INDEX_MIN_NFC_NO_MAYBE]) {
 810             indexes[_NORM_INDEX_MIN_NFC_NO_MAYBE]=(uint16_t)code;
 811         }
 812         if((qcFlags&_NORM_QC_NFKC) && (uint16_t)code<indexes[_NORM_INDEX_MIN_NFKC_NO_MAYBE]) {
 813             indexes[_NORM_INDEX_MIN_NFKC_NO_MAYBE]=(uint16_t)code;
 814         }
 815         if((qcFlags&_NORM_QC_NFD) && (uint16_t)code<indexes[_NORM_INDEX_MIN_NFD_NO_MAYBE]) {
 816             indexes[_NORM_INDEX_MIN_NFD_NO_MAYBE]=(uint16_t)code;
 817         }
 818         if((qcFlags&_NORM_QC_NFKD) && (uint16_t)code<indexes[_NORM_INDEX_MIN_NFKD_NO_MAYBE]) {
 819             indexes[_NORM_INDEX_MIN_NFKD_NO_MAYBE]=(uint16_t)code;
 820         }
 821     }
 822
 823     if(qcFlags&_NORM_QC_NFD) {
 824         uset_add(nfdQCNoSet, (UChar32)code);
 825     }
 826 }
 827
 828 extern void
 829 setCompositionExclusion(uint32_t code) {
 830     createNorm(code)->combiningFlags|=0x80;
 831 }
 832
 833 static void
 834 setHangulJamoSpecials() {
 835     Norm *norm;
 836     uint32_t c, hangul;
 837
 838     /*
 839      * Hangul syllables are algorithmically decomposed into Jamos,
 840      * and Jamos are algorithmically composed into Hangul syllables.
 841      * The quick check flags are parsed, except for Hangul.
 842      */
 843
 844     /* set Jamo L specials */
 845     hangul=0xac00;
 846     for(c=0x1100; c<=0x1112; ++c) {
 847         norm=createNorm(c);
 848         norm->specialTag=_NORM_EXTRA_INDEX_TOP+_NORM_EXTRA_JAMO_L;
 849         norm->combiningFlags=1;
 850
 851         /* for each Jamo L create a set with its associated Hangul block */
 852         norm->canonStart=uset_open(hangul, hangul+21*28-1);
 853         hangul+=21*28;
 854     }
 855
 856     /* set Jamo V specials */
 857     for(c=0x1161; c<=0x1175; ++c) {
 858         norm=createNorm(c);
 859         norm->specialTag=_NORM_EXTRA_INDEX_TOP+_NORM_EXTRA_JAMO_V;
 860         norm->combiningFlags=2;
 861         norm->unsafeStart=TRUE;
 862     }
 863
 864     /* set Jamo T specials */
 865     for(c=0x11a8; c<=0x11c2; ++c) {
 866         norm=createNorm(c);
 867         norm->specialTag=_NORM_EXTRA_INDEX_TOP+_NORM_EXTRA_JAMO_T;
 868         norm->combiningFlags=2;
 869         norm->unsafeStart=TRUE;
 870     }
 871
 872     /* set Hangul specials, precompacted */
 873     norm=(Norm *)utm_alloc(normMem);
 874     norm->specialTag=_NORM_EXTRA_INDEX_TOP+_NORM_EXTRA_HANGUL;
 875     norm->qcFlags=_NORM_QC_NFD|_NORM_QC_NFKD;
 876
 877     if(!utrie_setRange32(normTrie, 0xac00, 0xd7a4, (uint32_t)(norm-norms), TRUE)) {
 878         fprintf(stderr, "error: too many normalization entries (setting Hangul)\n");
 879         exit(U_BUFFER_OVERFLOW_ERROR);
 880     }
 881 }
 882
 883 /*
 884  * set FC-NFKC-Closure string
 885  * s contains the closure string; s[0]==length, s[1..length] is the actual string
 886  * may modify s[0]
 887  */
 888 U_CFUNC void
 889 setFNC(uint32_t c, UChar *s) {
 890     uint16_t *p;
 891     int32_t length, i, count;
 892     UChar first;
 893
 894     count=utm_countItems(extraMem);
 895     length=s[0];
 896     first=s[1];
 897
 898     /* try to overlay single-unit strings with existing ones */
 899     if(length==1 && first<0xff00) {
 900         p=utm_getStart(extraMem);
 901         for(i=1; i<count; ++i) {
 902             if(first==p[i]) {
 903                 break;
 904             }
 905         }
 906     } else {
 907         i=count;
 908     }
 909
 910     /* append the new string if it cannot be overlayed with an old one */
 911     if(i==count) {
 912         if(count>_NORM_AUX_MAX_FNC) {
 913             fprintf(stderr, "gennorm error: too many FNC strings\n");
 914             exit(U_INDEX_OUTOFBOUNDS_ERROR);
 915         }
 916
 917         /* prepend 0xffxx with xx==length */
 918         s[0]=(uint16_t)(0xff00+length);
 919         ++length;
 920         p=(uint16_t *)utm_allocN(extraMem, length);
 921         uprv_memcpy(p, s, length*2);
 922
 923         /* update the top index in extraMem[0] */
 924         count+=length;
 925         ((uint16_t *)utm_getStart(extraMem))[0]=(uint16_t)count;
 926     }
 927
 928     /* store the index to the string */
 929     createNorm(c)->fncIndex=i;
 930 }
 931
 932 /* build runtime structures ------------------------------------------------- */
 933
 934 /* canonically reorder a UTF-32 string; return { leadCC, trailCC } */
 935 static uint16_t
 936 reorderString(uint32_t *s, int32_t length) {
 937     uint8_t ccs[40];
 938     uint32_t c;
 939     int32_t i, j;
 940     uint8_t cc, prevCC;
 941
 942     if(length<=0) {
 943         return 0;
 944     }
 945
 946     for(i=0; i<length; ++i) {
 947         /* get the i-th code point and its combining class */
 948         c=s[i];
 949         cc=getCCFromCP(c);
 950         if(cc!=0 && i!=0) {
 951             /* it is a combining mark, see if it needs to be moved back */
 952             j=i;
 953             do {
 954                 prevCC=ccs[j-1];
 955                 if(prevCC<=cc) {
 956                     break;  /* found the right place */
 957                 }
 958                 /* move the previous code point here and go back */
 959                 s[j]=s[j-1];
 960                 ccs[j]=prevCC;
 961             } while(--j!=0);
 962             s[j]=c;
 963             ccs[j]=cc;
 964         } else {
 965             /* just store the combining class */
 966             ccs[i]=cc;
 967         }
 968     }
 969
 970     return (uint16_t)(((uint16_t)ccs[0]<<8)|ccs[length-1]);
 971 }
 972
 973 static UBool combineAndQC[64]={ 0 };
 974
 975 /*
 976  * canonically reorder the up to two decompositions
 977  * and store the leading and trailing combining classes accordingly
 978  *
 979  * also process canonical decompositions for canonical closure
 980  */
 981 static void
 982 postParseFn(void *context, uint32_t code, Norm *norm) {
 983     int32_t length;
 984
 985     /* canonically order the NFD */
 986     length=norm->lenNFD;
 987     if(length>0) {
 988         norm->canonBothCCs=reorderString(norm->nfd, length);
 989     }
 990
 991     /* canonically reorder the NFKD */
 992     length=norm->lenNFKD;
 993     if(length>0) {
 994         norm->compatBothCCs=reorderString(norm->nfkd, length);
 995     }
 996
 997     /* verify that code has a decomposition if and only if the quick check flags say "no" on NF(K)D */
 998     if((norm->lenNFD!=0) != ((norm->qcFlags&_NORM_QC_NFD)!=0)) {
 999         fprintf(stderr, "gennorm warning: U+%04lx has NFD[%d] but quick check 0x%02x\n", (long)code, norm->lenNFD, norm->qcFlags);
1000     }
1001     if(((norm->lenNFD|norm->lenNFKD)!=0) != ((norm->qcFlags&(_NORM_QC_NFD|_NORM_QC_NFKD))!=0)) {
1002         fprintf(stderr, "gennorm warning: U+%04lx has NFD[%d] NFKD[%d] but quick check 0x%02x\n", (long)code, norm->lenNFD, norm->lenNFKD, norm->qcFlags);
1003     }
1004
1005     /* see which combinations of combiningFlags and qcFlags are used for NFC/NFKC */
1006     combineAndQC[(norm->qcFlags&0x33)|((norm->combiningFlags&3)<<2)]=1;
1007
1008     if(norm->combiningFlags&1) {
1009         if(norm->udataCC!=0) {
1010             /* illegal - data-derivable composition exclusion */
1011             fprintf(stderr, "gennorm warning: U+%04lx combines forward but udataCC==%u\n", (long)code, norm->udataCC);
1012         }
1013     }
1014     if(norm->combiningFlags&2) {
1015         if((norm->qcFlags&0x11)==0) {
1016             fprintf(stderr, "gennorm warning: U+%04lx combines backward but qcNF?C==0\n", (long)code);
1017         }
1018 #if 0
1019         /* occurs sometimes, this one is ok (therefore #if 0) - still here for documentation */
1020         if(norm->udataCC==0) {
1021             printf("U+%04lx combines backward but udataCC==0\n", (long)code);
1022         }
1023 #endif
1024     }
1025     if((norm->combiningFlags&3)==3 && beVerbose) {
1026         printf("U+%04lx combines both ways\n", (long)code);
1027     }
1028
1029     /*
1030      * process canonical decompositions for canonical closure
1031      *
1032      * in each canonical decomposition:
1033      *   add the current character (code) to the set of canonical starters of its norm->nfd[0]
1034      *   set the "unsafe starter" flag for each norm->nfd[1..]
1035      */
1036     length=norm->lenNFD;
1037     if(length>0) {
1038         Norm *otherNorm;
1039         UChar32 c;
1040         int32_t i;
1041
1042         /* nfd[0].canonStart.add(code) */
1043         c=norm->nfd[0];
1044         otherNorm=createNorm(c);
1045         if(otherNorm->canonStart==NULL) {
1046             otherNorm->canonStart=uset_open(code, code);
1047             if(otherNorm->canonStart==NULL) {
1048                 fprintf(stderr, "gennorm error: out of memory in uset_open()\n");
1049                 exit(U_MEMORY_ALLOCATION_ERROR);
1050             }
1051         } else {
1052             uset_add(otherNorm->canonStart, code);
1053             if(!uset_contains(otherNorm->canonStart, code)) {
1054                 fprintf(stderr, "gennorm error: uset_add(setOf(U+%4x), U+%4x)\n", (int)c, (int)code);
1055                 exit(U_INTERNAL_PROGRAM_ERROR);
1056             }
1057         }
1058
1059         /* for(i=1..length-1) nfd[i].unsafeStart=TRUE */
1060         for(i=1; i<length; ++i) {
1061             createNorm(norm->nfd[i])->unsafeStart=TRUE;
1062         }
1063     }
1064 }
1065
1066 static uint32_t
1067 make32BitNorm(Norm *norm) {
1068     UChar extra[100];
1069     const Norm *other;
1070     uint32_t word;
1071     int32_t i, length, beforeZero=0, count, start;
1072
1073     /*
1074      * Check for assumptions:
1075      *
1076      * Test that if a "true starter" (cc==0 && NF*C_YES) decomposes,
1077      * then the decomposition also begins with a true starter.
1078      */
1079     if(norm->udataCC==0) {
1080         /* this is a starter */
1081         if((norm->qcFlags&_NORM_QC_NFC)==0 && norm->lenNFD>0) {
1082             /* a "true" NFC starter with a canonical decomposition */
1083             if( norm->canonBothCCs>=0x100 || /* lead cc!=0 or */
1084                 ((other=getNorm(norm->nfd[0]))!=NULL && (other->qcFlags&_NORM_QC_NFC)!=0) /* nfd[0] not NFC_YES */
1085             ) {
1086                 fprintf(stderr,
1087                     "error: true NFC starter canonical decomposition[%u] does not begin\n"
1088                     "    with a true NFC starter: U+%04lx U+%04lx%s\n",
1089                     norm->lenNFD, (long)norm->nfd[0], (long)norm->nfd[1],
1090                     norm->lenNFD<=2 ? "" : " ...");
1091                 exit(U_INVALID_TABLE_FILE);
1092             }
1093         }
1094
1095         if((norm->qcFlags&_NORM_QC_NFKC)==0) {
1096             if(norm->lenNFKD>0) {
1097                 /* a "true" NFKC starter with a compatibility decomposition */
1098                 if( norm->compatBothCCs>=0x100 || /* lead cc!=0 or */
1099                     ((other=getNorm(norm->nfkd[0]))!=NULL && (other->qcFlags&_NORM_QC_NFKC)!=0) /* nfkd[0] not NFC_YES */
1100                 ) {
1101                     fprintf(stderr,
1102                         "error: true NFKC starter compatibility decomposition[%u] does not begin\n"
1103                         "    with a true NFKC starter: U+%04lx U+%04lx%s\n",
1104                         norm->lenNFKD, (long)norm->nfkd[0], (long)norm->nfkd[1],                        norm->lenNFKD<=2 ? "" : " ...");
1105                     exit(U_INVALID_TABLE_FILE);
1106                 }
1107             } else if(norm->lenNFD>0) {
1108                 /* a "true" NFKC starter with only a canonical decomposition */
1109                 if( norm->canonBothCCs>=0x100 || /* lead cc!=0 or */
1110                     ((other=getNorm(norm->nfd[0]))!=NULL && (other->qcFlags&_NORM_QC_NFKC)!=0) /* nfd[0] not NFC_YES */
1111                 ) {
1112                     fprintf(stderr,
1113                         "error: true NFKC starter canonical decomposition[%u] does not begin\n"
1114                         "    with a true NFKC starter: U+%04lx U+%04lx%s\n",
1115                         norm->lenNFD, (long)norm->nfd[0], (long)norm->nfd[1],
1116                         norm->lenNFD<=2 ? "" : " ...");
1117                     exit(U_INVALID_TABLE_FILE);
1118                 }
1119             }
1120         }
1121     }
1122
1123     /* reset the 32-bit word and set the quick check flags */
1124     word=norm->qcFlags;
1125
1126     /* set the UnicodeData combining class */
1127     word|=(uint32_t)norm->udataCC<<_NORM_CC_SHIFT;
1128
1129     /* set the combining flag and index */
1130     if(norm->combiningFlags&3) {
1131         word|=(uint32_t)(norm->combiningFlags&3)<<6;
1132     }
1133
1134     /* set the combining index value into the extra data */
1135     if(norm->combiningIndex!=0) {
1136         extra[0]=norm->combiningIndex;
1137         beforeZero=1;
1138     }
1139
1140     count=beforeZero;
1141
1142     /* write the decompositions */
1143     if((norm->lenNFD|norm->lenNFKD)!=0) {
1144         extra[count++]=0; /* set the pieces when available, into extra[beforeZero] */
1145
1146         length=norm->lenNFD;
1147         if(length>0) {
1148             if(norm->canonBothCCs!=0) {
1149                 extra[beforeZero]|=0x80;
1150                 extra[count++]=norm->canonBothCCs;
1151             }
1152             start=count;
1153             for(i=0; i<length; ++i) {
1154                 UTF_APPEND_CHAR_UNSAFE(extra, count, norm->nfd[i]);
1155             }
1156             extra[beforeZero]|=(UChar)(count-start); /* set the decomp length as the number of UTF-16 code units */
1157         }
1158
1159         length=norm->lenNFKD;
1160         if(length>0) {
1161             if(norm->compatBothCCs!=0) {
1162                 extra[beforeZero]|=0x8000;
1163                 extra[count++]=norm->compatBothCCs;
1164             }
1165             start=count;
1166             for(i=0; i<length; ++i) {
1167                 UTF_APPEND_CHAR_UNSAFE(extra, count, norm->nfkd[i]);
1168             }
1169             extra[beforeZero]|=(UChar)((count-start)<<8); /* set the decomp length as the number of UTF-16 code units */
1170         }
1171     }
1172
1173     /* allocate and copy the extra data */
1174     if(count!=0) {
1175         UChar *p;
1176
1177         if(norm->specialTag!=0) {
1178             fprintf(stderr, "error: gennorm - illegal to have both extra data and a special tag (0x%x)\n", norm->specialTag);
1179             exit(U_ILLEGAL_ARGUMENT_ERROR);
1180         }
1181
1182         p=(UChar *)utm_allocN(extraMem, count);
1183         uprv_memcpy(p, extra, count*2);
1184
1185         /* set the extra index, offset by beforeZero */
1186         word|=(uint32_t)(beforeZero+(p-(UChar *)utm_getStart(extraMem)))<<_NORM_EXTRA_SHIFT;
1187     } else if(norm->specialTag!=0) {
1188         /* set a special tag instead of an extra index */
1189         word|=(uint32_t)norm->specialTag<<_NORM_EXTRA_SHIFT;
1190     }
1191
1192     return word;
1193 }
1194
1195 /* turn all Norm structs into corresponding 32-bit norm values */
1196 static void
1197 makeAll32() {
1198     uint32_t *pNormData;
1199     uint32_t n;
1200     int32_t i, normLength, count;
1201
1202     count=(int32_t)utm_countItems(normMem);
1203     for(i=0; i<count; ++i) {
1204         norms[i].value32=make32BitNorm(norms+i);
1205     }
1206
1207     pNormData=utrie_getData(norm32Trie, &normLength);
1208
1209     count=0;
1210     for(i=0; i<normLength; ++i) {
1211         n=pNormData[i];
1212         if(0!=(pNormData[i]=norms[n].value32)) {
1213             ++count;
1214         }
1215     }
1216 }
1217
1218 /*
1219  * extract all Norm.canonBothCCs into the FCD table
1220  * set 32-bit values to use the common fold and compact functions
1221  */
1222 static void
1223 makeFCD() {
1224     uint32_t *pFCDData;
1225     uint32_t n;
1226     int32_t i, count, fcdLength;
1227     uint16_t bothCCs;
1228
1229     count=utm_countItems(normMem);
1230     for(i=0; i<count; ++i) {
1231         bothCCs=norms[i].canonBothCCs;
1232         if(bothCCs==0) {
1233             /* if there are no decomposition cc's then use the udataCC twice */
1234             bothCCs=norms[i].udataCC;
1235             bothCCs|=bothCCs<<8;
1236         }
1237         norms[i].value32=bothCCs;
1238     }
1239
1240     pFCDData=utrie_getData(fcdTrie, &fcdLength);
1241
1242     for(i=0; i<fcdLength; ++i) {
1243         n=pFCDData[i];
1244         pFCDData[i]=norms[n].value32;
1245     }
1246 }
1247
1248 /**
1249  * If the given set contains exactly one character, then return it.
1250  * Otherwise return -1.
1251  */
1252 static int32_t
1253 usetContainsOne(const USet* set) {
1254     if (uset_size(set) == 1) { /* ### faster to count ranges and check only range?! */
1255         UChar32 start, end;
1256         UErrorCode ec = U_ZERO_ERROR;
1257         int32_t len = uset_getItem(set, 0, &start, &end, NULL, 0, &ec);
1258         if (len == 0) return start;
1259     }
1260     return -1;
1261 }
1262
1263 static void
1264 makeCanonSetFn(void *context, uint32_t code, Norm *norm) {
1265     if(norm->canonStart!=NULL && !uset_isEmpty(norm->canonStart)) {
1266         uint16_t *table;
1267         int32_t c, tableLength;
1268         UErrorCode errorCode=U_ZERO_ERROR;
1269
1270         /* does the set contain exactly one code point? */
1271         c=usetContainsOne(norm->canonStart); /* ### why? */
1272
1273         /* add an entry to the BMP or supplementary search table */
1274         if(code<=0xffff) {
1275             table=canonStartSets+_NORM_MAX_CANON_SETS;
1276             tableLength=canonStartSets[_NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH];
1277
1278             table[tableLength++]=(uint16_t)code;
1279
1280             if(c>=0 && c<=0xffff && (c&_NORM_CANON_SET_BMP_MASK)!=_NORM_CANON_SET_BMP_IS_INDEX) {
1281                 /* single-code point BMP result for BMP code point */
1282                 table[tableLength++]=(uint16_t)c;
1283             } else {
1284                 table[tableLength++]=(uint16_t)(_NORM_CANON_SET_BMP_IS_INDEX|canonStartSetsTop);
1285                 c=-1;
1286             }
1287             canonStartSets[_NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH]=(uint16_t)tableLength;
1288         } else {
1289             table=canonStartSets+_NORM_MAX_CANON_SETS+_NORM_MAX_SET_SEARCH_TABLE_LENGTH;
1290             tableLength=canonStartSets[_NORM_SET_INDEX_CANON_SUPP_TABLE_LENGTH];
1291
1292             table[tableLength++]=(uint16_t)(code>>16);
1293             table[tableLength++]=(uint16_t)code;
1294
1295             if(c>=0) {
1296                 /* single-code point result for supplementary code point */
1297                 table[tableLength-2]|=(uint16_t)(0x8000|((c>>8)&0x1f00)); /* ### how does this work again? */
1298                 table[tableLength++]=(uint16_t)c;
1299             } else {
1300                 table[tableLength++]=(uint16_t)canonStartSetsTop;
1301             }
1302             canonStartSets[_NORM_SET_INDEX_CANON_SUPP_TABLE_LENGTH]=(uint16_t)tableLength;
1303         }
1304
1305         if(c<0) {
1306             /* write a USerializedSet */
1307             ++canonSetsCount;
1308             canonStartSetsTop+=
1309                     uset_serialize(norm->canonStart,
1310                             canonStartSets+canonStartSetsTop,
1311                             _NORM_MAX_CANON_SETS-canonStartSetsTop,
1312                             &errorCode);
1313         }
1314         canonStartSets[_NORM_SET_INDEX_CANON_SETS_LENGTH]=(uint16_t)canonStartSetsTop;
1315
1316         if(U_FAILURE(errorCode)) {
1317             fprintf(stderr, "gennorm error: uset_serialize()->%s (canonStartSetsTop=%d)\n", u_errorName(errorCode), (int)canonStartSetsTop);
1318             exit(errorCode);
1319         }
1320         if(tableLength>_NORM_MAX_SET_SEARCH_TABLE_LENGTH) {
1321             fprintf(stderr, "gennorm error: search table for canonical starter sets too long\n");
1322             exit(U_INDEX_OUTOFBOUNDS_ERROR);
1323         }
1324     }
1325 }
1326
1327 /* for getSkippableFlags ---------------------------------------------------- */
1328
1329 /* combine the lead and trail code points; return <0 if they do not combine */
1330 static int32_t
1331 combine(uint32_t lead, uint32_t trail) {
1332     CombiningTriple *triples;
1333     uint32_t i, count;
1334
1335     /* search for all triples with c as lead code point */
1336     triples=utm_getStart(combiningTriplesMem);
1337     count=utm_countItems(combiningTriplesMem);
1338
1339     /* triples are not sorted by code point but for each lead CP there is one contiguous block */
1340     for(i=0; i<count && lead!=triples[i].lead; ++i) {}
1341
1342     /* check each triple for this code point */
1343     for(; i<count && lead==triples[i].lead; ++i) {
1344         if(trail==triples[i].trail) {
1345             return (int32_t)triples[i].combined;
1346         }
1347     }
1348
1349     return -1;
1350 }
1351
1352 /*
1353  * Starting from the canonical decomposition s[0..length[ of a single code point,
1354  * is the code point c consumed in an NFC/FCC recomposition?
1355  *
1356  * No need to handle discontiguous composition because that would not consume some
1357  * intermediate character, so would not compose back to the original character.
1358  * See comments in canChangeWithFollowing().
1359  *
1360  * No need to compose beyond where c canonically orders because if it is consumed
1361  * then the result differs from the original anyway.
1362  *
1363  * Possible optimization:
1364  * - Verify that there are no cases of the same combining mark stacking twice.
1365  * - return FALSE right away if c inserts after a copy of itself
1366  *   without attempting to recompose; will happen because each mark in
1367  *   the decomposition will be enumerated and passed in as c.
1368  *   More complicated and fragile though than it is already.
1369  *
1370  * markus 2002nov04
1371  */
1372 static UBool
1373 doesComposeConsume(const uint32_t *s, int32_t length, uint32_t c, uint8_t cc) {
1374     int32_t starter, i;
1375
1376     /* ignore trailing characters where cc<prevCC */
1377     while(length>1 && cc<getCCFromCP(s[length-1])) {
1378         --length;
1379     }
1380
1381     /* start consuming/combining from the beginning */
1382     starter=(int32_t)s[0];
1383     for(i=1; i<length; ++i) {
1384         starter=combine((uint32_t)starter, s[i]);
1385         if(starter<0) {
1386             fprintf(stderr, "error: unable to consume normal decomposition in doesComposeConsume(<%04x, %04x, ...>[%d], U+%04x, %u)\n",
1387                 (int)s[0], (int)s[1], (int)length, (int)c, cc);
1388             exit(U_INTERNAL_PROGRAM_ERROR);
1389         }
1390     }
1391
1392     /* try to combine/consume c, return TRUE if it is consumed */
1393     return combine((uint32_t)starter, c)>=0;
1394 }
1395
1396 /* does the starter s[0] combine forward with another char that is below trailCC? */
1397 static UBool
1398 canChangeWithFollowing(const uint32_t *s, int32_t length, uint8_t trailCC) {
1399     if(trailCC<=1) {
1400         /* no character will combine ahead of the trailing char of the decomposition */
1401         return FALSE;
1402     }
1403
1404     /*
1405      * We are only checking skippable condition (f).
1406      * Therefore, the original character does not have quick check flag NFC_NO (c),
1407      * i.e., the decomposition recomposes completely back into the original code point.
1408      * So s[0] must be a true starter with cc==0 and
1409      * combining with following code points.
1410      *
1411      * Similarly, length==1 is not possible because that would be a singleton
1412      * decomposition which is marked with NFC_NO and does not pass (c).
1413      *
1414      * Only a character with cc<trailCC can change the composition.
1415      * Reason: A char with cc>=trailCC would order after decomposition s[],
1416      * composition would consume all of the decomposition, and here we know that
1417      * the original char passed check d), i.e., it does not combine forward,
1418      * therefore does not combine with anything after the decomposition is consumed.
1419      *
1420      * Now see if there is a character that
1421      * 1. combines backward
1422      * 2. has cc<trailCC
1423      * 3. is consumed in recomposition
1424      *
1425      * length==2 is simple:
1426      *
1427      * Characters that fulfill these conditions are exactly the ones that combine directly
1428      * with the starter c==s[0] because there is no intervening character after
1429      * reordering.
1430      * We can just enumerate all chars with which c combines (they all pass 1. and 3.)
1431      * and see if one has cc<trailCC (passes 2.).
1432      *
1433      * length>2 is a little harder:
1434      *
1435      * Since we will get different starters during recomposition, we need to
1436      * enumerate each backward-combining character (1.)
1437      * with cc<trailCC (2.) and
1438      * see if it gets consumed in recomposition. (3.)
1439      * No need to enumerate both-ways combining characters because they must have cc==0.
1440      */
1441     if(length==2) {
1442         /* enumerate all chars that combine with this one and check their cc */
1443         CombiningTriple *triples;
1444         uint32_t c, i, count;
1445         uint8_t cc;
1446
1447         /* search for all triples with c as lead code point */
1448         triples=utm_getStart(combiningTriplesMem);
1449         count=utm_countItems(combiningTriplesMem);
1450         c=s[0];
1451
1452         /* triples are not sorted by code point but for each lead CP there is one contiguous block */
1453         for(i=0; i<count && c!=triples[i].lead; ++i) {}
1454
1455         /* check each triple for this code point */
1456         for(; i<count && c==triples[i].lead; ++i) {
1457             cc=getCCFromCP(triples[i].trail);
1458             if(cc>0 && cc<trailCC) {
1459                 /* this trail code point combines with c and has cc<trailCC */
1460                 return TRUE;
1461             }
1462         }
1463     } else {
1464         /* enumerate all chars that combine backward */
1465         uint32_t c2;
1466         uint16_t i;
1467         uint8_t cc;
1468
1469         for(i=combineBothTop; i<combineBackTop; ++i) {
1470             c2=combiningCPs[i]&0xffffff;
1471             cc=getCCFromCP(c2);
1472             /* pass in length-1 because we already know that c2 will insert before the last character with trailCC */
1473             if(cc>0 && cc<trailCC && doesComposeConsume(s, length-1, c2, cc)) {
1474                 return TRUE;
1475             }
1476         }
1477     }
1478
1479     /* this decomposition is not modified by any appended character */
1480     return FALSE;
1481 }
1482
1483 /* see unormimp.h for details on NF*C Skippable flags */
1484 static uint32_t
1485 getSkippableFlags(const Norm *norm) {
1486     /* ignore NF*D skippable properties because they are covered by norm32, test at runtime */
1487
1488     /* ignore Hangul, test those at runtime (LV Hangul are not skippable) */
1489     if(norm->specialTag==_NORM_EXTRA_INDEX_TOP+_NORM_EXTRA_HANGUL) {
1490         return 0;
1491     }
1492
1493     /* ### check other data generation functions whether they should & do ignore Hangul/Jamo specials */
1494
1495     /*
1496      * Note:
1497      * This function returns a non-zero flag only if (a)..(e) indicate skippable but (f) does not.
1498      *
1499      * This means that (a)..(e) must always be derived from the runtime norm32 value,
1500      * and (f) be checked from the auxTrie if the character is skippable per (a)..(e),
1501      * the form is NF*C and there is a canonical decomposition (NFD_NO).
1502      *
1503      * (a) unassigned code points get "not skippable"==false because they
1504      * don't have a Norm struct so they won't get here
1505      */
1506
1507     /* (b) not skippable if cc!=0 */
1508     if(norm->udataCC!=0) {
1509         return 0; /* non-zero flag for (f) only */
1510     }
1511
1512     /*
1513      * not NFC_Skippable if
1514      * (c) quick check flag == NO  or
1515      * (d) combines forward  or
1516      * (e) combines back or
1517      * (f) can change if another character is added
1518      *
1519      * for (f):
1520      * For NF*C: Get corresponding decomposition, get its last starter (cc==0),
1521      *           check its composition list,
1522      *           see if any of the second code points in the list
1523      *           has cc less than the trailCC of the decomposition.
1524      *
1525      * For FCC: Test at runtime if the decomposition has a trailCC>1
1526      *          -> there are characters with cc==1, they would order before the trail char
1527      *          and prevent contiguous combination with the trail char.
1528      */
1529     if( (norm->qcFlags&(_NORM_QC_NFC&_NORM_QC_ANY_NO))!=0 ||
1530         (norm->combiningFlags&3)!=0) {
1531         return 0; /* non-zero flag for (f) only */
1532     }
1533     if(norm->lenNFD!=0 && canChangeWithFollowing(norm->nfd, norm->lenNFD, (uint8_t)norm->canonBothCCs)) {
1534         return _NORM_AUX_NFC_SKIP_F_MASK;
1535     }
1536
1537     return 0; /* skippable */
1538 }
1539
1540 static void
1541 makeAux() {
1542     Norm *norm;
1543     uint32_t *pData;
1544     int32_t i, length;
1545
1546     pData=utrie_getData(auxTrie, &length);
1547
1548     for(i=0; i<length; ++i) {
1549         norm=norms+pData[i];
1550         /*
1551          * 16-bit auxiliary normalization properties
1552          * see unormimp.h
1553          */
1554         pData[i]=
1555             ((uint32_t)(norm->combiningFlags&0x80)<<(_NORM_AUX_COMP_EX_SHIFT-7))|
1556             (uint32_t)norm->fncIndex;
1557
1558         if(norm->unsafeStart || norm->udataCC!=0) {
1559             pData[i]|=_NORM_AUX_UNSAFE_MASK;
1560         }
1561
1562         pData[i]|=getSkippableFlags(norm);
1563     }
1564 }
1565
1566 /* folding value for normalization: just store the offset (16 bits) if there is any non-0 entry */
1567 static uint32_t U_CALLCONV
1568 getFoldedNormValue(UNewTrie *trie, UChar32 start, int32_t offset) {
1569     uint32_t value, leadNorm32=0;
1570     UChar32 limit;
1571     UBool inBlockZero;
1572
1573     limit=start+0x400;
1574     while(start<limit) {
1575         value=utrie_get32(trie, start, &inBlockZero);
1576         if(inBlockZero) {
1577             start+=UTRIE_DATA_BLOCK_LENGTH;
1578         } else {
1579             if(value!=0) {
1580                 leadNorm32|=value;
1581             }
1582             ++start;
1583         }
1584     }
1585
1586     /* turn multi-bit fields into the worst-case value */
1587     if(leadNorm32&_NORM_CC_MASK) {
1588         leadNorm32|=_NORM_CC_MASK;
1589     }
1590
1591     /* clean up unnecessarily ored bit fields */
1592     leadNorm32&=~((uint32_t)0xffffffff<<_NORM_EXTRA_SHIFT);
1593
1594     if(leadNorm32==0) {
1595         /* nothing to do (only composition exclusions?) */
1596         return 0;
1597     }
1598
1599     /* add the extra surrogate index, offset by the BMP top, for the new stage 1 location */
1600     leadNorm32|=(
1601         (uint32_t)_NORM_EXTRA_INDEX_TOP+
1602         (uint32_t)((offset-UTRIE_BMP_INDEX_LENGTH)>>UTRIE_SURROGATE_BLOCK_BITS)
1603     )<<_NORM_EXTRA_SHIFT;
1604
1605     return leadNorm32;
1606 }
1607
1608 /* folding value for FCD: just store the offset (16 bits) if there is any non-0 entry */
1609 static uint32_t U_CALLCONV
1610 getFoldedFCDValue(UNewTrie *trie, UChar32 start, int32_t offset) {
1611     uint32_t value;
1612     UChar32 limit;
1613     UBool inBlockZero;
1614
1615     limit=start+0x400;
1616     while(start<limit) {
1617         value=utrie_get32(trie, start, &inBlockZero);
1618         if(inBlockZero) {
1619             start+=UTRIE_DATA_BLOCK_LENGTH;
1620         } else if(value!=0) {
1621             return (uint32_t)offset;
1622         } else {
1623             ++start;
1624         }
1625     }
1626     return 0;
1627 }
1628
1629 /*
1630  * folding value for auxiliary data:
1631  * store the non-zero offset in bits 9..0 (FNC bits)
1632  * if there is any non-0 entry;
1633  * "or" [verb!] together data bits 15..10 of all of the 1024 supplementary code points
1634  */
1635 static uint32_t U_CALLCONV
1636 getFoldedAuxValue(UNewTrie *trie, UChar32 start, int32_t offset) {
1637     uint32_t value, oredValues;
1638     UChar32 limit;
1639     UBool inBlockZero;
1640
1641     oredValues=0;
1642     limit=start+0x400;
1643     while(start<limit) {
1644         value=utrie_get32(trie, start, &inBlockZero);
1645         if(inBlockZero) {
1646             start+=UTRIE_DATA_BLOCK_LENGTH;
1647         } else {
1648             oredValues|=value;
1649             ++start;
1650         }
1651     }
1652
1653     if(oredValues!=0) {
1654         /* move the 10 significant offset bits into bits 9..0 */
1655         offset>>=UTRIE_SURROGATE_BLOCK_BITS;
1656         if(offset>_NORM_AUX_FNC_MASK) {
1657             fprintf(stderr, "gennorm error: folding offset too large (auxTrie)\n");
1658             exit(U_INDEX_OUTOFBOUNDS_ERROR);
1659         }
1660         return (uint32_t)offset|(oredValues&~_NORM_AUX_FNC_MASK);
1661     } else {
1662         return 0;
1663     }
1664 }
1665
1666 extern void
1667 processData() {
1668 #if 0
1669     uint16_t i;
1670 #endif
1671
1672     processCombining();
1673
1674     /* canonically reorder decompositions and assign combining classes for decompositions */
1675     enumTrie(postParseFn, NULL);
1676
1677 #if 0
1678     for(i=1; i<64; ++i) {
1679         if(combineAndQC[i]) {
1680             printf("combiningFlags==0x%02x  qcFlags(NF?C)==0x%02x\n", (i&0xc)>>2, i&0x33);
1681         }
1682     }
1683 #endif
1684
1685     /* add hangul/jamo specials */
1686     setHangulJamoSpecials();
1687
1688     /* store search tables and USerializedSets for canonical starters (after Hangul/Jamo specials!) */
1689     enumTrie(makeCanonSetFn, NULL);
1690
1691     /* clone the normalization builder trie to make the final data tries */
1692     if( NULL==utrie_clone(norm32Trie, normTrie, NULL, 0) ||
1693         NULL==utrie_clone(fcdTrie, normTrie, NULL, 0) ||
1694         NULL==utrie_clone(auxTrie, normTrie, NULL, 0)
1695     ) {
1696         fprintf(stderr, "error: unable to clone the normalization trie\n");
1697         exit(U_MEMORY_ALLOCATION_ERROR);
1698     }
1699
1700     /* --- finalize data for quick checks & normalization --- */
1701
1702     /* turn the Norm structs (stage2, norms) into 32-bit data words */
1703     makeAll32();
1704
1705     /* --- finalize data for FCD checks --- */
1706
1707     /* FCD data: take Norm.canonBothCCs and store them in the FCD table */
1708     makeFCD();
1709
1710     /* --- finalize auxiliary normalization data --- */
1711     makeAux();
1712
1713     if(beVerbose) {
1714 #if 0
1715         printf("number of stage 2 entries: %ld\n", stage2Mem->index);
1716         printf("size of stage 1 (BMP) & 2 (uncompacted) + extra data: %ld bytes\n", _NORM_STAGE_1_BMP_COUNT*2+stage2Mem->index*4+extraMem->index*2);
1717 #endif
1718         printf("combining CPs tops: fwd %u  both %u  back %u\n", combineFwdTop, combineBothTop, combineBackTop);
1719         printf("combining table count: %u\n", combiningTableTop);
1720     }
1721 }
1722
1723 #endif /* #if !UCONFIG_NO_NORMALIZATION */
1724
1725 extern void
1726 generateData(const char *dataDir) {
1727     static uint8_t normTrieBlock[100000], fcdTrieBlock[100000], auxTrieBlock[100000];
1728
1729     UNewDataMemory *pData;
1730     UErrorCode errorCode=U_ZERO_ERROR;
1731     int32_t size, dataLength;
1732
1733 #if UCONFIG_NO_NORMALIZATION
1734
1735     size=0;
1736
1737 #else
1738
1739     U_STRING_DECL(nxCJKCompatPattern, "[:Ideographic:]", 15);
1740     U_STRING_DECL(nxUnicode32Pattern, "[:^Age=3.2:]", 12);
1741     USet *set;
1742     int32_t normTrieSize, fcdTrieSize, auxTrieSize;
1743
1744     normTrieSize=utrie_serialize(norm32Trie, normTrieBlock, sizeof(normTrieBlock), getFoldedNormValue, FALSE, &errorCode);
1745     if(U_FAILURE(errorCode)) {
1746         fprintf(stderr, "error: utrie_serialize(normalization properties) failed, %s\n", u_errorName(errorCode));
1747         exit(errorCode);
1748     }
1749
1750     fcdTrieSize=utrie_serialize(fcdTrie, fcdTrieBlock, sizeof(fcdTrieBlock), getFoldedFCDValue, TRUE, &errorCode);
1751     if(U_FAILURE(errorCode)) {
1752         fprintf(stderr, "error: utrie_serialize(FCD data) failed, %s\n", u_errorName(errorCode));
1753         exit(errorCode);
1754     }
1755
1756     auxTrieSize=utrie_serialize(auxTrie, auxTrieBlock, sizeof(auxTrieBlock), getFoldedAuxValue, TRUE, &errorCode);
1757     if(U_FAILURE(errorCode)) {
1758         fprintf(stderr, "error: utrie_serialize(auxiliary data) failed, %s\n", u_errorName(errorCode));
1759         exit(errorCode);
1760     }
1761
1762     /* move the parts of canonStartSets[] together into a contiguous block */
1763     if(canonStartSetsTop<_NORM_MAX_CANON_SETS) {
1764         uprv_memmove(canonStartSets+canonStartSetsTop,
1765                      canonStartSets+_NORM_MAX_CANON_SETS,
1766                      canonStartSets[_NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH]*2);
1767     }
1768     canonStartSetsTop+=canonStartSets[_NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH];
1769
1770     if(canonStartSetsTop<(_NORM_MAX_CANON_SETS+_NORM_MAX_SET_SEARCH_TABLE_LENGTH)) {
1771         uprv_memmove(canonStartSets+canonStartSetsTop,
1772                      canonStartSets+_NORM_MAX_CANON_SETS+_NORM_MAX_SET_SEARCH_TABLE_LENGTH,
1773                      canonStartSets[_NORM_SET_INDEX_CANON_SUPP_TABLE_LENGTH]*2);
1774     }
1775     canonStartSetsTop+=canonStartSets[_NORM_SET_INDEX_CANON_SUPP_TABLE_LENGTH];
1776
1777     /* create the normalization exclusion sets */
1778     /*
1779      * nxCJKCompatPattern should be [[:Ideographic:]&[:NFD_QC=No:]]
1780      * but we cannot use NFD_QC from the pattern because that would require
1781      * unorm.icu which we are just going to generate.
1782      * Therefore we have manually collected nfdQCNoSet and intersect Ideographic
1783      * with that.
1784      */
1785     U_STRING_INIT(nxCJKCompatPattern, "[:Ideographic:]", 15);
1786     U_STRING_INIT(nxUnicode32Pattern, "[:^Age=3.2:]", 12);
1787
1788     canonStartSets[_NORM_SET_INDEX_NX_CJK_COMPAT_OFFSET]=canonStartSetsTop;
1789     set=uset_openPattern(nxCJKCompatPattern, -1, &errorCode);
1790     if(U_FAILURE(errorCode)) {
1791         fprintf(stderr, "error: uset_openPattern([:Ideographic:]&[:NFD_QC=No:]) failed, %s\n", u_errorName(errorCode));
1792         exit(errorCode);
1793     }
1794     uset_retainAll(set, nfdQCNoSet);
1795     canonStartSetsTop+=uset_serialize(set, canonStartSets+canonStartSetsTop, LENGTHOF(canonStartSets)-canonStartSetsTop, &errorCode);
1796     if(U_FAILURE(errorCode)) {
1797         fprintf(stderr, "error: uset_serialize([:Ideographic:]&[:NFD_QC=No:]) failed, %s\n", u_errorName(errorCode));
1798         exit(errorCode);
1799     }
1800     uset_close(set);
1801
1802     canonStartSets[_NORM_SET_INDEX_NX_UNICODE32_OFFSET]=canonStartSetsTop;
1803     set=uset_openPattern(nxUnicode32Pattern, -1, &errorCode);
1804     if(U_FAILURE(errorCode)) {
1805         fprintf(stderr, "error: uset_openPattern([:^Age=3.2:]) failed, %s\n", u_errorName(errorCode));
1806         exit(errorCode);
1807     }
1808     canonStartSetsTop+=uset_serialize(set, canonStartSets+canonStartSetsTop, LENGTHOF(canonStartSets)-canonStartSetsTop, &errorCode);
1809     if(U_FAILURE(errorCode)) {
1810         fprintf(stderr, "error: uset_serialize([:^Age=3.2:]) failed, %s\n", u_errorName(errorCode));
1811         exit(errorCode);
1812     }
1813     uset_close(set);
1814
1815     canonStartSets[_NORM_SET_INDEX_NX_RESERVED_OFFSET]=canonStartSetsTop;
1816
1817     /* make sure that the FCD trie is 4-aligned */
1818     if((utm_countItems(extraMem)+combiningTableTop)&1) {
1819         combiningTable[combiningTableTop++]=0x1234; /* add one 16-bit word for an even number */
1820     }
1821
1822     /* pad canonStartSets to 4-alignment, too */
1823     if(canonStartSetsTop&1) {
1824         canonStartSets[canonStartSetsTop++]=0x1235;
1825     }
1826
1827     size=
1828         _NORM_INDEX_TOP*4+
1829         normTrieSize+
1830         utm_countItems(extraMem)*2+
1831         combiningTableTop*2+
1832         fcdTrieSize+
1833         auxTrieSize+
1834         canonStartSetsTop*2;
1835
1836     if(beVerbose) {
1837         printf("size of normalization trie              %5u bytes\n", (int)normTrieSize);
1838         printf("size of 16-bit extra memory             %5u UChars/uint16_t\n", (int)utm_countItems(extraMem));
1839         printf("  of that: FC_NFKC_Closure size         %5u UChars/uint16_t\n", ((uint16_t *)utm_getStart(extraMem))[0]);
1840         printf("size of combining table                 %5u uint16_t\n", combiningTableTop);
1841         printf("size of FCD trie                        %5u bytes\n", (int)fcdTrieSize);
1842         printf("size of auxiliary trie                  %5u bytes\n", (int)auxTrieSize);
1843         printf("size of canonStartSets[]                %5u uint16_t\n", (int)canonStartSetsTop);
1844         printf("  number of indexes                     %5u uint16_t\n", _NORM_SET_INDEX_TOP);
1845         printf("  size of sets                          %5u uint16_t\n", canonStartSets[_NORM_SET_INDEX_CANON_SETS_LENGTH]-_NORM_SET_INDEX_TOP);
1846         printf("  number of sets                        %5d\n", (int)canonSetsCount);
1847         printf("  size of BMP search table              %5u uint16_t\n", canonStartSets[_NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH]);
1848         printf("  size of supplementary search table    %5u uint16_t\n", canonStartSets[_NORM_SET_INDEX_CANON_SUPP_TABLE_LENGTH]);
1849         printf("  length of exclusion sets              %5u uint16_t\n", canonStartSets[_NORM_SET_INDEX_NX_RESERVED_OFFSET]-canonStartSets[_NORM_SET_INDEX_NX_CJK_COMPAT_OFFSET]);
1850         printf("size of " U_ICUDATA_NAME "_" DATA_NAME "." DATA_TYPE " contents: %ld bytes\n", (long)size);
1851     }
1852
1853     indexes[_NORM_INDEX_TRIE_SIZE]=normTrieSize;
1854     indexes[_NORM_INDEX_UCHAR_COUNT]=(uint16_t)utm_countItems(extraMem);
1855
1856     indexes[_NORM_INDEX_COMBINE_DATA_COUNT]=combiningTableTop;
1857     indexes[_NORM_INDEX_COMBINE_FWD_COUNT]=combineFwdTop;
1858     indexes[_NORM_INDEX_COMBINE_BOTH_COUNT]=(uint16_t)(combineBothTop-combineFwdTop);
1859     indexes[_NORM_INDEX_COMBINE_BACK_COUNT]=(uint16_t)(combineBackTop-combineBothTop);
1860
1861     /* the quick check minimum code points are already set */
1862
1863     indexes[_NORM_INDEX_FCD_TRIE_SIZE]=fcdTrieSize;
1864     indexes[_NORM_INDEX_AUX_TRIE_SIZE]=auxTrieSize;
1865     indexes[_NORM_INDEX_CANON_SET_COUNT]=canonStartSetsTop;
1866
1867 #endif
1868
1869     /* write the data */
1870     pData=udata_create(dataDir, DATA_TYPE, DATA_NAME, &dataInfo,
1871                        haveCopyright ? U_COPYRIGHT_STRING : NULL, &errorCode);
1872     if(U_FAILURE(errorCode)) {
1873         fprintf(stderr, "gennorm: unable to create the output file, error %d\n", errorCode);
1874         exit(errorCode);
1875     }
1876
1877 #if !UCONFIG_NO_NORMALIZATION
1878
1879     udata_writeBlock(pData, indexes, sizeof(indexes));
1880     udata_writeBlock(pData, normTrieBlock, normTrieSize);
1881     udata_writeBlock(pData, utm_getStart(extraMem), utm_countItems(extraMem)*2);
1882     udata_writeBlock(pData, combiningTable, combiningTableTop*2);
1883     udata_writeBlock(pData, fcdTrieBlock, fcdTrieSize);
1884     udata_writeBlock(pData, auxTrieBlock, auxTrieSize);
1885     udata_writeBlock(pData, canonStartSets, canonStartSetsTop*2);
1886
1887 #endif
1888
1889     /* finish up */
1890     dataLength=udata_finish(pData, &errorCode);
1891     if(U_FAILURE(errorCode)) {
1892         fprintf(stderr, "gennorm: error %d writing the output file\n", errorCode);
1893         exit(errorCode);
1894     }
1895
1896     if(dataLength!=size) {
1897         fprintf(stderr, "gennorm error: data length %ld != calculated size %ld\n",
1898             (long)dataLength, (long)size);
1899         exit(U_INTERNAL_PROGRAM_ERROR);
1900     }
1901 }
1902
1903 #if !UCONFIG_NO_NORMALIZATION
1904
1905 extern void
1906 cleanUpData(void) {
1907     int32_t i, count;
1908
1909     count=utm_countItems(normMem);
1910     for(i=0; i<count; ++i) {
1911         uset_close(norms[i].canonStart);
1912     }
1913
1914     utm_close(normMem);
1915     utm_close(utf32Mem);
1916     utm_close(extraMem);
1917     utm_close(combiningTriplesMem);
1918     utrie_close(normTrie);
1919     utrie_close(norm32Trie);
1920     utrie_close(fcdTrie);
1921     utrie_close(auxTrie);
1922
1923     uset_close(nfdQCNoSet);
1924
1925     uprv_free(normTrie);
1926     uprv_free(norm32Trie);
1927     uprv_free(fcdTrie);
1928     uprv_free(auxTrie);
1929 }
1930
1931 #endif /* #if !UCONFIG_NO_NORMALIZATION */
1932
1933 /*
1934  * Hey, Emacs, please set the following:
1935  *
1936  * Local Variables:
1937  * indent-tabs-mode: nil
1938  * End:
1939  *
1940  */