icuSources/tools/gennorm/store.c

   1 /*
   2 *******************************************************************************
   3 *
   4 *   Copyright (C) 1999-2008, International Business Machines
   5 *   Corporation and others.  All Rights Reserved.
   6 *
   7 *******************************************************************************
   8 *   file name:  store.c
   9 *   encoding:   US-ASCII
  10 *   tab size:   8 (not used)
  11 *   indentation:4
  12 *
  13 *   created on: 2001may25
  14 *   created by: Markus W. Scherer
  15 *
  16 *   Store Unicode normalization data in a memory-mappable file.
  17 */
  18
  19 #include <stdio.h>
  20 #include <stdlib.h>
  21 #include "unicode/utypes.h"
  22 #include "unicode/uchar.h"
  23 #include "unicode/ustring.h"
  24 #include "cmemory.h"
  25 #include "cstring.h"
  26 #include "filestrm.h"
  27 #include "unicode/udata.h"
  28 #include "utrie.h"
  29 #include "unicode/uset.h"
  30 #include "toolutil.h"
  31 #include "unewdata.h"
  32 #include "writesrc.h"
  33 #include "unormimp.h"
  34 #include "gennorm.h"
  35
  36 #define DO_DEBUG_OUT 0
  37
  38 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
  39
  40 /*
  41  * The new implementation of the normalization code loads its data from
  42  * unorm.icu, which is generated with this gennorm tool.
  43  * The format of that file is described in unormimp.h .
  44  */
  45
  46 /* file data ---------------------------------------------------------------- */
  47
  48 #if UCONFIG_NO_NORMALIZATION
  49
  50 /* dummy UDataInfo cf. udata.h */
  51 static UDataInfo dataInfo = {
  52     sizeof(UDataInfo),
  53     0,
  54
  55     U_IS_BIG_ENDIAN,
  56     U_CHARSET_FAMILY,
  57     U_SIZEOF_UCHAR,
  58     0,
  59
  60     { 0, 0, 0, 0 },                 /* dummy dataFormat */
  61     { 0, 0, 0, 0 },                 /* dummy formatVersion */
  62     { 0, 0, 0, 0 }                  /* dummy dataVersion */
  63 };
  64
  65 #else
  66
  67 /* UDataInfo cf. udata.h */
  68 static UDataInfo dataInfo={
  69     sizeof(UDataInfo),
  70     0,
  71
  72     U_IS_BIG_ENDIAN,
  73     U_CHARSET_FAMILY,
  74     U_SIZEOF_UCHAR,
  75     0,
  76
  77     { 0x4e, 0x6f, 0x72, 0x6d },   /* dataFormat="Norm" */
  78     { 2, 3, UTRIE_SHIFT, UTRIE_INDEX_SHIFT },   /* formatVersion */
  79     { 3, 2, 0, 0 }                /* dataVersion (Unicode version) */
  80 };
  81
  82 extern void
  83 setUnicodeVersion(const char *v) {
  84     UVersionInfo version;
  85     u_versionFromString(version, v);
  86     uprv_memcpy(dataInfo.dataVersion, version, 4);
  87 }
  88
  89 static int32_t indexes[_NORM_INDEX_TOP]={ 0 };
  90
  91 /* builder data ------------------------------------------------------------- */
  92
  93 /* modularization flags, see gennorm.h (default to "store everything") */
  94 uint32_t gStoreFlags=0xffffffff;
  95
  96 typedef void EnumTrieFn(void *context, uint32_t code, Norm *norm);
  97
  98 static UNewTrie
  99     *normTrie,
 100     *norm32Trie,
 101     *fcdTrie,
 102     *auxTrie;
 103
 104 static UToolMemory *normMem, *utf32Mem, *extraMem, *combiningTriplesMem;
 105
 106 static Norm *norms;
 107
 108 /*
 109  * set a flag for each code point that was seen in decompositions -
 110  * avoid to decompose ones that have not been used before
 111  */
 112 static uint32_t haveSeenFlags[256];
 113
 114 /* set of characters with NFD_QC=No (i.e., those with canonical decompositions) */
 115 static USet *nfdQCNoSet;
 116
 117 /* see addCombiningCP() for details */
 118 static uint32_t combiningCPs[2000];
 119
 120 /*
 121  * after processCombining() this contains for each code point in combiningCPs[]
 122  * the runtime combining index
 123  */
 124 static uint16_t combiningIndexes[2000];
 125
 126 /* section limits for combiningCPs[], see addCombiningCP() */
 127 static uint16_t combineFwdTop=0, combineBothTop=0, combineBackTop=0;
 128
 129 /**
 130  * Structure for a triple of code points, stored in combiningTriplesMem.
 131  * The lead and trail code points combine into the the combined one,
 132  * i.e., there is a canonical decomposition of combined-> <lead, trail>.
 133  *
 134  * Before processCombining() is called, leadIndex and trailIndex are 0.
 135  * After processCombining(), they contain the indexes of the lead and trail
 136  * code point in the combiningCPs[] array.
 137  * They are then sorted by leadIndex, then trailIndex.
 138  * They are not sorted by code points.
 139  */
 140 typedef struct CombiningTriple {
 141     uint16_t leadIndex, trailIndex;
 142     uint32_t lead, trail, combined;
 143 } CombiningTriple;
 144
 145 /* 15b in the combining index -> <=0x8000 uint16_t values in the combining table */
 146 static uint16_t combiningTable[0x8000];
 147 static uint16_t combiningTableTop=0;
 148
 149 #define _NORM_MAX_SET_SEARCH_TABLE_LENGTH 0x4000
 150 static uint16_t canonStartSets[_NORM_MAX_CANON_SETS+2*_NORM_MAX_SET_SEARCH_TABLE_LENGTH
 151                                +10000]; /* +10000 for exclusion sets */
 152 static int32_t canonStartSetsTop=_NORM_SET_INDEX_TOP;
 153 static int32_t canonSetsCount=0;
 154
 155 /* allocate and initialize a Norm unit */
 156 static Norm *
 157 allocNorm() {
 158     /* allocate Norm */
 159     Norm *p=(Norm *)utm_alloc(normMem);
 160     /*
 161      * The combiningIndex must not be initialized to 0 because 0 is the
 162      * combiningIndex of the first forward-combining character.
 163      */
 164     p->combiningIndex=0xffff;
 165     return p;
 166 }
 167
 168 extern void
 169 init() {
 170     uint16_t *p16;
 171
 172     normTrie = (UNewTrie *)uprv_malloc(sizeof(UNewTrie));
 173     uprv_memset(normTrie, 0, sizeof(UNewTrie));
 174     norm32Trie = (UNewTrie *)uprv_malloc(sizeof(UNewTrie));
 175     uprv_memset(norm32Trie, 0, sizeof(UNewTrie));
 176     fcdTrie = (UNewTrie *)uprv_malloc(sizeof(UNewTrie));
 177     uprv_memset(fcdTrie, 0, sizeof(UNewTrie));
 178     auxTrie = (UNewTrie *)uprv_malloc(sizeof(UNewTrie));
 179     uprv_memset(auxTrie, 0, sizeof(UNewTrie));
 180
 181     /* initialize the two tries */
 182     if(NULL==utrie_open(normTrie, NULL, 30000, 0, 0, FALSE)) {
 183         fprintf(stderr, "error: failed to initialize tries\n");
 184         exit(U_MEMORY_ALLOCATION_ERROR);
 185     }
 186
 187     /* allocate Norm structures and reset the first one */
 188     normMem=utm_open("gennorm normalization structs", 20000, 20000, sizeof(Norm));
 189     norms=allocNorm();
 190
 191     /* allocate UTF-32 string memory */
 192     utf32Mem=utm_open("gennorm UTF-32 strings", 30000, 30000, 4);
 193
 194     /* reset all "have seen" flags */
 195     uprv_memset(haveSeenFlags, 0, sizeof(haveSeenFlags));
 196
 197     /* open an empty set */
 198     nfdQCNoSet=uset_open(1, 0);
 199
 200     /* allocate extra data memory for UTF-16 decomposition strings and other values */
 201     extraMem=utm_open("gennorm extra 16-bit memory", _NORM_EXTRA_INDEX_TOP, _NORM_EXTRA_INDEX_TOP, 2);
 202     /* initialize the extraMem counter for the top of FNC strings */
 203     p16=(uint16_t *)utm_alloc(extraMem);
 204     *p16=1;
 205
 206     /* allocate temporary memory for combining triples */
 207     combiningTriplesMem=utm_open("gennorm combining triples", 0x4000, 0x4000, sizeof(CombiningTriple));
 208
 209     /* set the minimum code points for no/maybe quick check values to the end of the BMP */
 210     indexes[_NORM_INDEX_MIN_NFC_NO_MAYBE]=0xffff;
 211     indexes[_NORM_INDEX_MIN_NFKC_NO_MAYBE]=0xffff;
 212     indexes[_NORM_INDEX_MIN_NFD_NO_MAYBE]=0xffff;
 213     indexes[_NORM_INDEX_MIN_NFKD_NO_MAYBE]=0xffff;
 214
 215     /* preset the indexes portion of canonStartSets */
 216     uprv_memset(canonStartSets, 0, _NORM_SET_INDEX_TOP*2);
 217 }
 218
 219 /*
 220  * get or create a Norm unit;
 221  * get or create the intermediate trie entries for it as well
 222  */
 223 static Norm *
 224 createNorm(uint32_t code) {
 225     Norm *p;
 226     uint32_t i;
 227
 228     i=utrie_get32(normTrie, (UChar32)code, NULL);
 229     if(i!=0) {
 230         p=norms+i;
 231     } else {
 232         /* allocate Norm */
 233         p=allocNorm();
 234         if(!utrie_set32(normTrie, (UChar32)code, (uint32_t)(p-norms))) {
 235             fprintf(stderr, "error: too many normalization entries\n");
 236             exit(U_BUFFER_OVERFLOW_ERROR);
 237         }
 238     }
 239     return p;
 240 }
 241
 242 /* get an existing Norm unit */
 243 static Norm *
 244 getNorm(uint32_t code) {
 245     uint32_t i;
 246
 247     i=utrie_get32(normTrie, (UChar32)code, NULL);
 248     if(i==0) {
 249         return NULL;
 250     }
 251     return norms+i;
 252 }
 253
 254 /* get the canonical combining class of a character */
 255 static uint8_t
 256 getCCFromCP(uint32_t code) {
 257     Norm *norm=getNorm(code);
 258     if(norm==NULL) {
 259         return 0;
 260     } else {
 261         return norm->udataCC;
 262     }
 263 }
 264
 265 /*
 266  * enumerate all code points with their Norm structs and call a function for each
 267  * return the number of code points with data
 268  */
 269 static uint32_t
 270 enumTrie(EnumTrieFn *fn, void *context) {
 271     uint32_t count, i;
 272     UChar32 code;
 273     UBool isInBlockZero;
 274
 275     count=0;
 276     for(code=0; code<=0x10ffff;) {
 277         i=utrie_get32(normTrie, code, &isInBlockZero);
 278         if(isInBlockZero) {
 279             code+=UTRIE_DATA_BLOCK_LENGTH;
 280         } else {
 281             if(i!=0) {
 282                 fn(context, (uint32_t)code, norms+i);
 283                 ++count;
 284             }
 285             ++code;
 286         }
 287     }
 288     return count;
 289 }
 290
 291 static void
 292 setHaveSeenString(const uint32_t *s, int32_t length) {
 293     uint32_t c;
 294
 295     while(length>0) {
 296         c=*s++;
 297         haveSeenFlags[(c>>5)&0xff]|=(1<<(c&0x1f));
 298         --length;
 299     }
 300 }
 301
 302 #define HAVE_SEEN(c) (haveSeenFlags[((c)>>5)&0xff]&(1<<((c)&0x1f)))
 303
 304 /* handle combining data ---------------------------------------------------- */
 305
 306 /*
 307  * Insert an entry into combiningCPs[] for the new code point code with its flags.
 308  * The flags indicate if code combines forward, backward, or both.
 309  *
 310  * combiningCPs[] contains three sections:
 311  * 1. code points that combine forward
 312  * 2. code points that combine forward and backward
 313  * 3. code points that combine backward
 314  *
 315  * Search for code in the entire array.
 316  * If it is found and already is in the right section (old flags==new flags)
 317  * then we are done.
 318  * If it is found but the flags are different, then remove it,
 319  * union the old and new flags, and reinsert it into its correct section.
 320  * If it is not found, then just insert it.
 321  *
 322  * Within each section, the code points are not sorted.
 323  */
 324 static void
 325 addCombiningCP(uint32_t code, uint8_t flags) {
 326     uint32_t newEntry;
 327     uint16_t i;
 328
 329     newEntry=code|((uint32_t)flags<<24);
 330
 331     /* search for this code point */
 332     for(i=0; i<combineBackTop; ++i) {
 333         if(code==(combiningCPs[i]&0xffffff)) {
 334             /* found it */
 335             if(newEntry==combiningCPs[i]) {
 336                 return; /* no change */
 337             }
 338
 339             /* combine the flags, remove the old entry from the old place, and insert the new one */
 340             newEntry|=combiningCPs[i];
 341             if(i!=--combineBackTop) {
 342                 uprv_memmove(combiningCPs+i, combiningCPs+i+1, (combineBackTop-i)*4);
 343             }
 344             if(i<combineBothTop) {
 345                 --combineBothTop;
 346             }
 347             if(i<combineFwdTop) {
 348                 --combineFwdTop;
 349             }
 350             break;
 351         }
 352     }
 353
 354     /* not found or modified, insert it */
 355     if(combineBackTop>=sizeof(combiningCPs)/4) {
 356         fprintf(stderr, "error: gennorm combining code points - trying to use more than %ld units\n",
 357                 (long)(sizeof(combiningCPs)/4));
 358         exit(U_MEMORY_ALLOCATION_ERROR);
 359     }
 360
 361     /* set i to the insertion point */
 362     flags=(uint8_t)(newEntry>>24);
 363     if(flags==1) {
 364         i=combineFwdTop++;
 365         ++combineBothTop;
 366     } else if(flags==3) {
 367         i=combineBothTop++;
 368     } else /* flags==2 */ {
 369         i=combineBackTop;
 370     }
 371
 372     /* move the following code points up one and insert newEntry at i */
 373     if(i<combineBackTop) {
 374         uprv_memmove(combiningCPs+i+1, combiningCPs+i, (combineBackTop-i)*4);
 375     }
 376     combiningCPs[i]=newEntry;
 377
 378     /* finally increment the total counter */
 379     ++combineBackTop;
 380 }
 381
 382 /**
 383  * Find the index in combiningCPs[] where code point code is stored.
 384  * @param code code point to look for
 385  * @param isLead is code a forward combining code point?
 386  * @return index in combiningCPs[] where code is stored
 387  */
 388 static uint16_t
 389 findCombiningCP(uint32_t code, UBool isLead) {
 390     uint16_t i, limit;
 391
 392     if(isLead) {
 393         i=0;
 394         limit=combineBothTop;
 395     } else {
 396         i=combineFwdTop;
 397         limit=combineBackTop;
 398     }
 399
 400     /* search for this code point */
 401     for(; i<limit; ++i) {
 402         if(code==(combiningCPs[i]&0xffffff)) {
 403             /* found it */
 404             return i;
 405         }
 406     }
 407
 408     /* not found */
 409     return 0xffff;
 410 }
 411
 412 static void
 413 addCombiningTriple(uint32_t lead, uint32_t trail, uint32_t combined) {
 414     CombiningTriple *triple;
 415
 416     if(DO_NOT_STORE(UGENNORM_STORE_COMPOSITION)) {
 417         return;
 418     }
 419
 420     /*
 421      * set combiningFlags for the two code points
 422      * do this after decomposition so that getNorm() above returns NULL
 423      * if we do not have actual sub-decomposition data for the initial NFD here
 424      */
 425     createNorm(lead)->combiningFlags|=1;    /* combines forward */
 426     createNorm(trail)->combiningFlags|=2;    /* combines backward */
 427
 428     addCombiningCP(lead, 1);
 429     addCombiningCP(trail, 2);
 430
 431     triple=(CombiningTriple *)utm_alloc(combiningTriplesMem);
 432     triple->lead=lead;
 433     triple->trail=trail;
 434     triple->combined=combined;
 435 }
 436
 437 static int
 438 compareTriples(const void *l, const void *r) {
 439     int diff;
 440     diff=(int)((CombiningTriple *)l)->leadIndex-
 441          (int)((CombiningTriple *)r)->leadIndex;
 442     if(diff==0) {
 443         diff=(int)((CombiningTriple *)l)->trailIndex-
 444              (int)((CombiningTriple *)r)->trailIndex;
 445     }
 446     return diff;
 447 }
 448
 449 static void
 450 processCombining() {
 451     CombiningTriple *triples;
 452     uint16_t *p;
 453     uint32_t combined;
 454     uint16_t i, j, count, tableTop, finalIndex, combinesFwd;
 455
 456     triples=utm_getStart(combiningTriplesMem);
 457
 458     /* add lead and trail indexes to the triples for sorting */
 459     count=(uint16_t)utm_countItems(combiningTriplesMem);
 460     for(i=0; i<count; ++i) {
 461         /* findCombiningCP() must always find the code point */
 462         triples[i].leadIndex=findCombiningCP(triples[i].lead, TRUE);
 463         triples[i].trailIndex=findCombiningCP(triples[i].trail, FALSE);
 464     }
 465
 466     /* sort them by leadIndex, trailIndex */
 467     qsort(triples, count, sizeof(CombiningTriple), compareTriples);
 468
 469     /* calculate final combining indexes and store them in the Norm entries */
 470     tableTop=0;
 471     j=0; /* triples counter */
 472
 473     /* first, combining indexes of fwd/both characters are indexes into the combiningTable */
 474     for(i=0; i<combineBothTop; ++i) {
 475         /* start a new table */
 476
 477         /* assign combining index */
 478         createNorm(combiningCPs[i]&0xffffff)->combiningIndex=combiningIndexes[i]=tableTop;
 479
 480         /* calculate the length of the combining data for this lead code point in the combiningTable */
 481         while(j<count && i==triples[j].leadIndex) {
 482             /* count 2 to 3 16-bit units per composition entry (back-index, code point) */
 483             combined=triples[j++].combined;
 484             if(combined<=0x1fff) {
 485                 tableTop+=2;
 486             } else {
 487                 tableTop+=3;
 488             }
 489         }
 490     }
 491
 492     /* second, combining indexes of back-only characters are simply incremented from here to be unique */
 493     finalIndex=tableTop;
 494     for(; i<combineBackTop; ++i) {
 495         createNorm(combiningCPs[i]&0xffffff)->combiningIndex=combiningIndexes[i]=finalIndex++;
 496     }
 497
 498     /* it must be finalIndex<=0x8000 because bit 15 is used in combiningTable as an end-for-this-lead marker */
 499     if(finalIndex>0x8000) {
 500         fprintf(stderr, "error: gennorm combining table - trying to use %u units, more than the %ld units available\n",
 501                 tableTop, (long)(sizeof(combiningTable)/4));
 502         exit(U_MEMORY_ALLOCATION_ERROR);
 503     }
 504
 505     combiningTableTop=tableTop;
 506
 507     /* store the combining data in the combiningTable, with the final indexes from above */
 508     p=combiningTable;
 509     j=0; /* triples counter */
 510
 511     /*
 512      * this is essentially the same loop as above, but
 513      * it writes the table data instead of calculating and setting the final indexes;
 514      * it is necessary to have two passes so that all the final indexes are known before
 515      * they are written into the table
 516      */
 517     for(i=0; i<combineBothTop; ++i) {
 518         /* start a new table */
 519
 520         combined=0; /* avoid compiler warning */
 521
 522         /* store the combining data for this lead code point in the combiningTable */
 523         while(j<count && i==triples[j].leadIndex) {
 524             Norm *normPtr;
 525             finalIndex=combiningIndexes[triples[j].trailIndex];
 526             combined=triples[j++].combined;
 527             normPtr = getNorm(combined);
 528
 529             if (normPtr == NULL) {
 530                 fprintf(stderr, "error: processCombining did not get expected result. combined=%d\n", combined);
 531                 exit(U_INTERNAL_PROGRAM_ERROR);
 532             }
 533
 534             /* is combined a starter? (i.e., cc==0 && combines forward) */
 535             combinesFwd=(uint16_t)((normPtr->combiningFlags&1)<<13);
 536
 537             *p++=finalIndex;
 538             if(combined<=0x1fff) {
 539                 *p++=(uint16_t)(combinesFwd|combined);
 540             } else if(combined<=0xffff) {
 541                 *p++=(uint16_t)(0x8000|combinesFwd);
 542                 *p++=(uint16_t)combined;
 543             } else {
 544                 *p++=(uint16_t)(0xc000|combinesFwd|((combined-0x10000)>>10));
 545                 *p++=(uint16_t)(0xdc00|(combined&0x3ff));
 546             }
 547         }
 548
 549         /* set a marker on the last final trail index in this lead's table */
 550         if(combined<=0x1fff) {
 551             *(p-2)|=0x8000;
 552         } else {
 553             *(p-3)|=0x8000;
 554         }
 555     }
 556
 557     /* post condition: tableTop==(p-combiningTable) */
 558 }
 559
 560 /* processing incoming normalization data ----------------------------------- */
 561
 562 /*
 563  * Decompose Hangul syllables algorithmically and fill a pseudo-Norm struct.
 564  * c must be a Hangul syllable code point.
 565  */
 566 static void
 567 getHangulDecomposition(uint32_t c, Norm *pHangulNorm, uint32_t hangulBuffer[3]) {
 568     /* Hangul syllable: decompose algorithmically */
 569     uint32_t c2;
 570     uint8_t length;
 571
 572     uprv_memset(pHangulNorm, 0, sizeof(Norm));
 573
 574     c-=HANGUL_BASE;
 575
 576     c2=c%JAMO_T_COUNT;
 577     c/=JAMO_T_COUNT;
 578     if(c2>0) {
 579         hangulBuffer[2]=JAMO_T_BASE+c2;
 580         length=3;
 581     } else {
 582         hangulBuffer[2]=0;
 583         length=2;
 584     }
 585
 586     hangulBuffer[1]=JAMO_V_BASE+c%JAMO_V_COUNT;
 587     hangulBuffer[0]=JAMO_L_BASE+c/JAMO_V_COUNT;
 588
 589     pHangulNorm->nfd=hangulBuffer;
 590     pHangulNorm->lenNFD=length;
 591     if(DO_STORE(UGENNORM_STORE_COMPAT)) {
 592         pHangulNorm->nfkd=hangulBuffer;
 593         pHangulNorm->lenNFKD=length;
 594     }
 595 }
 596
 597 /*
 598  * decompose the one decomposition further, may generate two decompositions
 599  * apply all previous characters' decompositions to this one
 600  */
 601 static void
 602 decompStoreNewNF(uint32_t code, Norm *norm) {
 603     uint32_t nfd[40], nfkd[40], hangulBuffer[3];
 604     Norm hangulNorm;
 605
 606     uint32_t *s32;
 607     Norm *p;
 608     uint32_t c;
 609     int32_t i, length;
 610     uint8_t lenNFD=0, lenNFKD=0;
 611     UBool changedNFD=FALSE, changedNFKD=FALSE;
 612
 613     if((length=norm->lenNFD)!=0) {
 614         /* always allocate the original string */
 615         changedNFD=TRUE;
 616         s32=norm->nfd;
 617     } else if((length=norm->lenNFKD)!=0) {
 618         /* always allocate the original string */
 619         changedNFKD=TRUE;
 620         s32=norm->nfkd;
 621     } else {
 622         /* no decomposition here, nothing to do */
 623         return;
 624     }
 625
 626     /* decompose each code point */
 627     for(i=0; i<length; ++i) {
 628         c=s32[i];
 629         p=getNorm(c);
 630         if(p==NULL) {
 631             if(HANGUL_BASE<=c && c<(HANGUL_BASE+HANGUL_COUNT)) {
 632                 getHangulDecomposition(c, &hangulNorm, hangulBuffer);
 633                 p=&hangulNorm;
 634             } else {
 635                 /* no data, no decomposition */
 636                 nfd[lenNFD++]=c;
 637                 nfkd[lenNFKD++]=c;
 638                 continue;
 639             }
 640         }
 641
 642         /* canonically decompose c */
 643         if(changedNFD) {
 644             if(p->lenNFD!=0) {
 645                 uprv_memcpy(nfd+lenNFD, p->nfd, p->lenNFD*4);
 646                 lenNFD+=p->lenNFD;
 647             } else {
 648                 nfd[lenNFD++]=c;
 649             }
 650         }
 651
 652         /* compatibility-decompose c */
 653         if(p->lenNFKD!=0) {
 654             uprv_memcpy(nfkd+lenNFKD, p->nfkd, p->lenNFKD*4);
 655             lenNFKD+=p->lenNFKD;
 656             changedNFKD=TRUE;
 657         } else if(p->lenNFD!=0) {
 658             uprv_memcpy(nfkd+lenNFKD, p->nfd, p->lenNFD*4);
 659             lenNFKD+=p->lenNFD;
 660             /*
 661              * not  changedNFKD=TRUE;
 662              * so that we do not store a new nfkd if there was no nfkd string before
 663              * and we only see canonical decompositions
 664              */
 665         } else {
 666             nfkd[lenNFKD++]=c;
 667         }
 668     }
 669
 670     /* assume that norm->lenNFD==1 or ==2 */
 671     if(norm->lenNFD==2 && !(norm->combiningFlags&0x80)) {
 672         addCombiningTriple(s32[0], s32[1], code);
 673     }
 674
 675     if(changedNFD) {
 676         if(lenNFD!=0) {
 677             s32=utm_allocN(utf32Mem, lenNFD);
 678             uprv_memcpy(s32, nfd, lenNFD*4);
 679         } else {
 680             s32=NULL;
 681         }
 682         norm->lenNFD=lenNFD;
 683         norm->nfd=s32;
 684         setHaveSeenString(nfd, lenNFD);
 685     }
 686     if(changedNFKD) {
 687         if(lenNFKD!=0) {
 688             s32=utm_allocN(utf32Mem, lenNFKD);
 689             uprv_memcpy(s32, nfkd, lenNFKD*4);
 690         } else {
 691             s32=NULL;
 692         }
 693         norm->lenNFKD=lenNFKD;
 694         norm->nfkd=s32;
 695         setHaveSeenString(nfkd, lenNFKD);
 696     }
 697 }
 698
 699 typedef struct DecompSingle {
 700     uint32_t c;
 701     Norm *norm;
 702 } DecompSingle;
 703
 704 /*
 705  * apply this one character's decompositions (there is at least one!) to
 706  * all previous characters' decompositions to decompose them further
 707  */
 708 static void
 709 decompWithSingleFn(void *context, uint32_t code, Norm *norm) {
 710     uint32_t nfd[40], nfkd[40];
 711     uint32_t *s32;
 712     DecompSingle *me=(DecompSingle *)context;
 713     uint32_t c, myC;
 714     int32_t i, length;
 715     uint8_t lenNFD=0, lenNFKD=0, myLenNFD, myLenNFKD;
 716     UBool changedNFD=FALSE, changedNFKD=FALSE;
 717
 718     /* get the new character's data */
 719     myC=me->c;
 720     myLenNFD=me->norm->lenNFD;
 721     myLenNFKD=me->norm->lenNFKD;
 722     /* assume that myC has at least one decomposition */
 723
 724     if((length=norm->lenNFD)!=0 && myLenNFD!=0) {
 725         /* apply NFD(myC) to norm->nfd */
 726         s32=norm->nfd;
 727         for(i=0; i<length; ++i) {
 728             c=s32[i];
 729             if(c==myC) {
 730                 uprv_memcpy(nfd+lenNFD, me->norm->nfd, myLenNFD*4);
 731                 lenNFD+=myLenNFD;
 732                 changedNFD=TRUE;
 733             } else {
 734                 nfd[lenNFD++]=c;
 735             }
 736         }
 737     }
 738
 739     if((length=norm->lenNFKD)!=0) {
 740         /* apply NFD(myC) and NFKD(myC) to norm->nfkd */
 741         s32=norm->nfkd;
 742         for(i=0; i<length; ++i) {
 743             c=s32[i];
 744             if(c==myC) {
 745                 if(myLenNFKD!=0) {
 746                     uprv_memcpy(nfkd+lenNFKD, me->norm->nfkd, myLenNFKD*4);
 747                     lenNFKD+=myLenNFKD;
 748                 } else /* assume myLenNFD!=0 */ {
 749                     uprv_memcpy(nfkd+lenNFKD, me->norm->nfd, myLenNFD*4);
 750                     lenNFKD+=myLenNFD;
 751                 }
 752                 changedNFKD=TRUE;
 753             } else {
 754                 nfkd[lenNFKD++]=c;
 755             }
 756         }
 757     } else if((length=norm->lenNFD)!=0 && myLenNFKD!=0) {
 758         /* apply NFKD(myC) to norm->nfd, forming a new norm->nfkd */
 759         s32=norm->nfd;
 760         for(i=0; i<length; ++i) {
 761             c=s32[i];
 762             if(c==myC) {
 763                 uprv_memcpy(nfkd+lenNFKD, me->norm->nfkd, myLenNFKD*4);
 764                 lenNFKD+=myLenNFKD;
 765                 changedNFKD=TRUE;
 766             } else {
 767                 nfkd[lenNFKD++]=c;
 768             }
 769         }
 770     }
 771
 772     /* set the new decompositions, forget the old ones */
 773     if(changedNFD) {
 774         if(lenNFD!=0) {
 775             if(lenNFD>norm->lenNFD) {
 776                 s32=utm_allocN(utf32Mem, lenNFD);
 777             } else {
 778                 s32=norm->nfd;
 779             }
 780             uprv_memcpy(s32, nfd, lenNFD*4);
 781         } else {
 782             s32=NULL;
 783         }
 784         norm->lenNFD=lenNFD;
 785         norm->nfd=s32;
 786     }
 787     if(changedNFKD) {
 788         if(lenNFKD!=0) {
 789             if(lenNFKD>norm->lenNFKD) {
 790                 s32=utm_allocN(utf32Mem, lenNFKD);
 791             } else {
 792                 s32=norm->nfkd;
 793             }
 794             uprv_memcpy(s32, nfkd, lenNFKD*4);
 795         } else {
 796             s32=NULL;
 797         }
 798         norm->lenNFKD=lenNFKD;
 799         norm->nfkd=s32;
 800     }
 801 }
 802
 803 /*
 804  * process the data for one code point listed in UnicodeData;
 805  * UnicodeData itself never maps a code point to both NFD and NFKD
 806  */
 807 extern void
 808 storeNorm(uint32_t code, Norm *norm) {
 809     DecompSingle decompSingle;
 810     Norm *p;
 811
 812     if(DO_NOT_STORE(UGENNORM_STORE_COMPAT)) {
 813         /* ignore compatibility decomposition */
 814         norm->lenNFKD=0;
 815     }
 816
 817     /* copy existing derived normalization properties */
 818     p=createNorm(code);
 819     norm->qcFlags=p->qcFlags;
 820     norm->combiningFlags=p->combiningFlags;
 821     norm->fncIndex=p->fncIndex;
 822
 823     /* process the decomposition if there is one here */
 824     if((norm->lenNFD|norm->lenNFKD)!=0) {
 825         /* decompose this one decomposition further, may generate two decompositions */
 826         decompStoreNewNF(code, norm);
 827
 828         /* has this code point been used in previous decompositions? */
 829         if(HAVE_SEEN(code)) {
 830             /* use this decomposition to decompose other decompositions further */
 831             decompSingle.c=code;
 832             decompSingle.norm=norm;
 833             enumTrie(decompWithSingleFn, &decompSingle);
 834         }
 835     }
 836
 837     /* store the data */
 838     uprv_memcpy(p, norm, sizeof(Norm));
 839 }
 840
 841 extern void
 842 setQCFlags(uint32_t code, uint8_t qcFlags) {
 843     if(DO_NOT_STORE(UGENNORM_STORE_COMPAT)) {
 844         /* ignore compatibility decomposition: unset the KC/KD flags */
 845         qcFlags&=~(_NORM_QC_NFKC|_NORM_QC_NFKD);
 846
 847         /* set the KC/KD flags to the same values as the C/D flags */
 848         qcFlags|=qcFlags<<1;
 849     }
 850     if(DO_NOT_STORE(UGENNORM_STORE_COMPOSITION)) {
 851         /* ignore composition data: unset the C/KC flags */
 852         qcFlags&=~(_NORM_QC_NFC|_NORM_QC_NFKC);
 853
 854         /* set the C/KC flags to the same values as the D/KD flags */
 855         qcFlags|=qcFlags>>2;
 856     }
 857
 858     createNorm(code)->qcFlags|=qcFlags;
 859
 860     /* adjust the minimum code point for quick check no/maybe */
 861     if(code<0xffff) {
 862         if((qcFlags&_NORM_QC_NFC) && (uint16_t)code<indexes[_NORM_INDEX_MIN_NFC_NO_MAYBE]) {
 863             indexes[_NORM_INDEX_MIN_NFC_NO_MAYBE]=(uint16_t)code;
 864         }
 865         if((qcFlags&_NORM_QC_NFKC) && (uint16_t)code<indexes[_NORM_INDEX_MIN_NFKC_NO_MAYBE]) {
 866             indexes[_NORM_INDEX_MIN_NFKC_NO_MAYBE]=(uint16_t)code;
 867         }
 868         if((qcFlags&_NORM_QC_NFD) && (uint16_t)code<indexes[_NORM_INDEX_MIN_NFD_NO_MAYBE]) {
 869             indexes[_NORM_INDEX_MIN_NFD_NO_MAYBE]=(uint16_t)code;
 870         }
 871         if((qcFlags&_NORM_QC_NFKD) && (uint16_t)code<indexes[_NORM_INDEX_MIN_NFKD_NO_MAYBE]) {
 872             indexes[_NORM_INDEX_MIN_NFKD_NO_MAYBE]=(uint16_t)code;
 873         }
 874     }
 875
 876     if(qcFlags&_NORM_QC_NFD) {
 877         uset_add(nfdQCNoSet, (UChar32)code);
 878     }
 879 }
 880
 881 extern void
 882 setCompositionExclusion(uint32_t code) {
 883     if(DO_STORE(UGENNORM_STORE_COMPOSITION)) {
 884         createNorm(code)->combiningFlags|=0x80;
 885     }
 886 }
 887
 888 static void
 889 setHangulJamoSpecials() {
 890     Norm *norm;
 891     uint32_t c, hangul;
 892
 893     /*
 894      * Hangul syllables are algorithmically decomposed into Jamos,
 895      * and Jamos are algorithmically composed into Hangul syllables.
 896      * The quick check flags are parsed, except for Hangul.
 897      */
 898
 899     /* set Jamo L specials */
 900     hangul=0xac00;
 901     for(c=0x1100; c<=0x1112; ++c) {
 902         norm=createNorm(c);
 903         norm->specialTag=_NORM_EXTRA_INDEX_TOP+_NORM_EXTRA_JAMO_L;
 904         if(DO_STORE(UGENNORM_STORE_COMPOSITION)) {
 905             norm->combiningFlags=1;
 906         }
 907
 908         /* for each Jamo L create a set with its associated Hangul block */
 909         norm->canonStart=uset_open(hangul, hangul+21*28-1);
 910         hangul+=21*28;
 911     }
 912
 913     /* set Jamo V specials */
 914     for(c=0x1161; c<=0x1175; ++c) {
 915         norm=createNorm(c);
 916         norm->specialTag=_NORM_EXTRA_INDEX_TOP+_NORM_EXTRA_JAMO_V;
 917         if(DO_STORE(UGENNORM_STORE_COMPOSITION)) {
 918             norm->combiningFlags=2;
 919         }
 920         norm->unsafeStart=TRUE;
 921     }
 922
 923     /* set Jamo T specials */
 924     for(c=0x11a8; c<=0x11c2; ++c) {
 925         norm=createNorm(c);
 926         norm->specialTag=_NORM_EXTRA_INDEX_TOP+_NORM_EXTRA_JAMO_T;
 927         if(DO_STORE(UGENNORM_STORE_COMPOSITION)) {
 928             norm->combiningFlags=2;
 929         }
 930         norm->unsafeStart=TRUE;
 931     }
 932
 933     /* set Hangul specials, precompacted */
 934     norm=allocNorm();
 935     norm->specialTag=_NORM_EXTRA_INDEX_TOP+_NORM_EXTRA_HANGUL;
 936     if(DO_STORE(UGENNORM_STORE_COMPAT)) {
 937         norm->qcFlags=_NORM_QC_NFD|_NORM_QC_NFKD;
 938     } else {
 939         norm->qcFlags=_NORM_QC_NFD;
 940     }
 941
 942     if(!utrie_setRange32(normTrie, 0xac00, 0xd7a4, (uint32_t)(norm-norms), TRUE)) {
 943         fprintf(stderr, "error: too many normalization entries (setting Hangul)\n");
 944         exit(U_BUFFER_OVERFLOW_ERROR);
 945     }
 946 }
 947
 948 /*
 949  * set FC-NFKC-Closure string
 950  * s contains the closure string; s[0]==length, s[1..length] is the actual string
 951  * may modify s[0]
 952  */
 953 U_CFUNC void
 954 setFNC(uint32_t c, UChar *s) {
 955     uint16_t *p;
 956     int32_t length, i, count;
 957     UChar first;
 958
 959     if( DO_NOT_STORE(UGENNORM_STORE_COMPAT) ||
 960         DO_NOT_STORE(UGENNORM_STORE_COMPOSITION) ||
 961         DO_NOT_STORE(UGENNORM_STORE_AUX)
 962     ) {
 963         return;
 964     }
 965
 966     count=utm_countItems(extraMem);
 967     length=s[0];
 968     first=s[1];
 969
 970     /* try to overlay single-unit strings with existing ones */
 971     if(length==1 && first<0xff00) {
 972         p=utm_getStart(extraMem);
 973         for(i=1; i<count; ++i) {
 974             if(first==p[i]) {
 975                 break;
 976             }
 977         }
 978     } else {
 979         i=count;
 980     }
 981
 982     /* append the new string if it cannot be overlayed with an old one */
 983     if(i==count) {
 984         if(count>_NORM_AUX_MAX_FNC) {
 985             fprintf(stderr, "gennorm error: too many FNC strings\n");
 986             exit(U_INDEX_OUTOFBOUNDS_ERROR);
 987         }
 988
 989         /* prepend 0xffxx with xx==length */
 990         s[0]=(uint16_t)(0xff00+length);
 991         ++length;
 992         p=(uint16_t *)utm_allocN(extraMem, length);
 993         uprv_memcpy(p, s, length*2);
 994
 995         /* update the top index in extraMem[0] */
 996         count+=length;
 997         ((uint16_t *)utm_getStart(extraMem))[0]=(uint16_t)count;
 998     }
 999
1000     /* store the index to the string */
1001     createNorm(c)->fncIndex=i;
1002 }
1003
1004 /* build runtime structures ------------------------------------------------- */
1005
/*
 * Canonically reorder the UTF-32 string s[0..length[ in place.
 *
 * This is a stable insertion sort by combining class: a character with cc!=0
 * is moved back before all immediately preceding characters with a higher cc;
 * characters with cc==0 stay where they are and stop the move.
 *
 * Returns (leadCC<<8)|trailCC of the reordered string, or 0 for length<=0.
 *
 * NOTE(review): the local cc buffer has 40 entries and length is not checked
 * against it here; callers presumably never pass more - confirm.
 */
static uint16_t
reorderString(uint32_t *s, int32_t length) {
    uint8_t ccs[40];
    int32_t pos, dest;

    if(length<=0) {
        return 0;
    }

    for(pos=0; pos<length; ++pos) {
        uint32_t cp=s[pos];
        uint8_t cc=getCCFromCP(cp);

        /* find the insertion point; only combining marks (cc!=0) ever move */
        dest=pos;
        if(cc!=0) {
            while(dest>0 && ccs[dest-1]>cc) {
                /* shift the higher-cc character up by one */
                s[dest]=s[dest-1];
                ccs[dest]=ccs[dest-1];
                --dest;
            }
        }
        s[dest]=cp;
        ccs[dest]=cc;
    }

    return (uint16_t)(((uint16_t)ccs[0]<<8)|ccs[length-1]);
}
1044
1045 #if 0
1046 static UBool combineAndQC[64]={ 0 };
1047 #endif
1048
1049 /*
1050  * canonically reorder the up to two decompositions
1051  * and store the leading and trailing combining classes accordingly
1052  *
1053  * also process canonical decompositions for canonical closure
1054  */
1055 static void
1056 postParseFn(void *context, uint32_t code, Norm *norm) {
1057     int32_t length;
1058
1059     /* canonically order the NFD */
1060     length=norm->lenNFD;
1061     if(length>0) {
1062         norm->canonBothCCs=reorderString(norm->nfd, length);
1063     }
1064
1065     /* canonically reorder the NFKD */
1066     length=norm->lenNFKD;
1067     if(length>0) {
1068         norm->compatBothCCs=reorderString(norm->nfkd, length);
1069     }
1070
1071     /* verify that code has a decomposition if and only if the quick check flags say "no" on NF(K)D */
1072     if((norm->lenNFD!=0) != ((norm->qcFlags&_NORM_QC_NFD)!=0)) {
1073         fprintf(stderr, "gennorm warning: U+%04lx has NFD[%d] but quick check 0x%02x\n", (long)code, norm->lenNFD, norm->qcFlags);
1074     }
1075     if(((norm->lenNFD|norm->lenNFKD)!=0) != ((norm->qcFlags&(_NORM_QC_NFD|_NORM_QC_NFKD))!=0)) {
1076         fprintf(stderr, "gennorm warning: U+%04lx has NFD[%d] NFKD[%d] but quick check 0x%02x\n", (long)code, norm->lenNFD, norm->lenNFKD, norm->qcFlags);
1077     }
1078
1079     /* see which combinations of combiningFlags and qcFlags are used for NFC/NFKC */
1080 #if 0
1081     combineAndQC[(norm->qcFlags&0x33)|((norm->combiningFlags&3)<<2)]=1;
1082 #endif
1083
1084     if(norm->combiningFlags&1) {
1085         if(norm->udataCC!=0) {
1086             /* illegal - data-derivable composition exclusion */
1087             fprintf(stderr, "gennorm warning: U+%04lx combines forward but udataCC==%u\n", (long)code, norm->udataCC);
1088         }
1089     }
1090     if(norm->combiningFlags&2) {
1091         if((norm->qcFlags&0x11)==0) {
1092             fprintf(stderr, "gennorm warning: U+%04lx combines backward but qcNF?C==0\n", (long)code);
1093         }
1094 #if 0
1095         /* occurs sometimes, this one is ok (therefore #if 0) - still here for documentation */
1096         if(norm->udataCC==0) {
1097             printf("U+%04lx combines backward but udataCC==0\n", (long)code);
1098         }
1099 #endif
1100     }
1101     if((norm->combiningFlags&3)==3 && beVerbose) {
1102         printf("U+%04lx combines both ways\n", (long)code);
1103     }
1104
1105     /*
1106      * process canonical decompositions for canonical closure
1107      *
1108      * in each canonical decomposition:
1109      *   add the current character (code) to the set of canonical starters of its norm->nfd[0]
1110      *   set the "unsafe starter" flag for each norm->nfd[1..]
1111      */
1112     length=norm->lenNFD;
1113     if(length>0) {
1114         Norm *otherNorm;
1115         UChar32 c;
1116         int32_t i;
1117
1118         /* nfd[0].canonStart.add(code) */
1119         c=norm->nfd[0];
1120         otherNorm=createNorm(c);
1121         if(otherNorm->canonStart==NULL) {
1122             otherNorm->canonStart=uset_open(code, code);
1123             if(otherNorm->canonStart==NULL) {
1124                 fprintf(stderr, "gennorm error: out of memory in uset_open()\n");
1125                 exit(U_MEMORY_ALLOCATION_ERROR);
1126             }
1127         } else {
1128             uset_add(otherNorm->canonStart, code);
1129             if(!uset_contains(otherNorm->canonStart, code)) {
1130                 fprintf(stderr, "gennorm error: uset_add(setOf(U+%4x), U+%4x)\n", (int)c, (int)code);
1131                 exit(U_INTERNAL_PROGRAM_ERROR);
1132             }
1133         }
1134
1135         /* for(i=1..length-1) nfd[i].unsafeStart=TRUE */
1136         for(i=1; i<length; ++i) {
1137             createNorm(norm->nfd[i])->unsafeStart=TRUE;
1138         }
1139     }
1140 }
1141
1142 static uint32_t
1143 make32BitNorm(Norm *norm) {
1144     UChar extra[100];
1145     const Norm *other;
1146     uint32_t word;
1147     int32_t i, length, beforeZero=0, count, start;
1148
1149     /*
1150      * Check for assumptions:
1151      *
1152      * Test that if a "true starter" (cc==0 && NF*C_YES) decomposes,
1153      * then the decomposition also begins with a true starter.
1154      */
1155     if(norm->udataCC==0) {
1156         /* this is a starter */
1157         if((norm->qcFlags&_NORM_QC_NFC)==0 && norm->lenNFD>0) {
1158             /* a "true" NFC starter with a canonical decomposition */
1159             if( norm->canonBothCCs>=0x100 || /* lead cc!=0 or */
1160                 ((other=getNorm(norm->nfd[0]))!=NULL && (other->qcFlags&_NORM_QC_NFC)!=0) /* nfd[0] not NFC_YES */
1161             ) {
1162                 fprintf(stderr,
1163                     "error: true NFC starter canonical decomposition[%u] does not begin\n"
1164                     "    with a true NFC starter: U+%04lx U+%04lx%s\n",
1165                     norm->lenNFD, (long)norm->nfd[0], (long)norm->nfd[1],
1166                     norm->lenNFD<=2 ? "" : " ...");
1167                 exit(U_INVALID_TABLE_FILE);
1168             }
1169         }
1170
1171         if((norm->qcFlags&_NORM_QC_NFKC)==0) {
1172             if(norm->lenNFKD>0) {
1173                 /* a "true" NFKC starter with a compatibility decomposition */
1174                 if( norm->compatBothCCs>=0x100 || /* lead cc!=0 or */
1175                     ((other=getNorm(norm->nfkd[0]))!=NULL && (other->qcFlags&_NORM_QC_NFKC)!=0) /* nfkd[0] not NFKC_YES */
1176                 ) {
1177                     fprintf(stderr,
1178                         "error: true NFKC starter compatibility decomposition[%u] does not begin\n"
1179                         "    with a true NFKC starter: U+%04lx U+%04lx%s\n",
1180                         norm->lenNFKD, (long)norm->nfkd[0], (long)norm->nfkd[1],
1181                         norm->lenNFKD<=2 ? "" : " ...");
1182                     exit(U_INVALID_TABLE_FILE);
1183                 }
1184             } else if(norm->lenNFD>0) {
1185                 /* a "true" NFKC starter with only a canonical decomposition */
1186                 if( norm->canonBothCCs>=0x100 || /* lead cc!=0 or */
1187                     ((other=getNorm(norm->nfd[0]))!=NULL && (other->qcFlags&_NORM_QC_NFKC)!=0) /* nfd[0] not NFKC_YES */
1188                 ) {
1189                     fprintf(stderr,
1190                         "error: true NFKC starter canonical decomposition[%u] does not begin\n"
1191                         "    with a true NFKC starter: U+%04lx U+%04lx%s\n",
1192                         norm->lenNFD, (long)norm->nfd[0], (long)norm->nfd[1],
1193                         norm->lenNFD<=2 ? "" : " ...");
1194                     exit(U_INVALID_TABLE_FILE);
1195                 }
1196             }
1197         }
1198     }
1199
1200     /* reset the 32-bit word and set the quick check flags */
1201     word=norm->qcFlags;
1202
1203     /* set the UnicodeData combining class */
1204     word|=(uint32_t)norm->udataCC<<_NORM_CC_SHIFT;
1205
1206     /* set the combining flag and index */
1207     if(norm->combiningFlags&3) {
1208         word|=(uint32_t)(norm->combiningFlags&3)<<6;
1209     }
1210
1211     /* set the combining index value into the extra data */
1212     /* 0xffff: no combining index; 0..0x7fff: combining index */
1213     if(norm->combiningIndex!=0xffff) {
1214         extra[0]=norm->combiningIndex;
1215         beforeZero=1;
1216     }
1217
1218     count=beforeZero;
1219
1220     /* write the decompositions */
1221     if((norm->lenNFD|norm->lenNFKD)!=0) {
1222         extra[count++]=0; /* set the pieces when available, into extra[beforeZero] */
1223
1224         length=norm->lenNFD;
1225         if(length>0) {
1226             if(norm->canonBothCCs!=0) {
1227                 extra[beforeZero]|=0x80;
1228                 extra[count++]=norm->canonBothCCs;
1229             }
1230             start=count;
1231             for(i=0; i<length; ++i) {
1232                 UTF_APPEND_CHAR_UNSAFE(extra, count, norm->nfd[i]);
1233             }
1234             extra[beforeZero]|=(UChar)(count-start); /* set the decomp length as the number of UTF-16 code units */
1235         }
1236
1237         length=norm->lenNFKD;
1238         if(length>0) {
1239             if(norm->compatBothCCs!=0) {
1240                 extra[beforeZero]|=0x8000;
1241                 extra[count++]=norm->compatBothCCs;
1242             }
1243             start=count;
1244             for(i=0; i<length; ++i) {
1245                 UTF_APPEND_CHAR_UNSAFE(extra, count, norm->nfkd[i]);
1246             }
1247             extra[beforeZero]|=(UChar)((count-start)<<8); /* set the decomp length as the number of UTF-16 code units */
1248         }
1249     }
1250
1251     /* allocate and copy the extra data */
1252     if(count!=0) {
1253         UChar *p;
1254
1255         if(norm->specialTag!=0) {
1256             fprintf(stderr, "error: gennorm - illegal to have both extra data and a special tag (0x%x)\n", norm->specialTag);
1257             exit(U_ILLEGAL_ARGUMENT_ERROR);
1258         }
1259
1260         p=(UChar *)utm_allocN(extraMem, count);
1261         uprv_memcpy(p, extra, count*2);
1262
1263         /* set the extra index, offset by beforeZero */
1264         word|=(uint32_t)(beforeZero+(p-(UChar *)utm_getStart(extraMem)))<<_NORM_EXTRA_SHIFT;
1265     } else if(norm->specialTag!=0) {
1266         /* set a special tag instead of an extra index */
1267         word|=(uint32_t)norm->specialTag<<_NORM_EXTRA_SHIFT;
1268     }
1269
1270     return word;
1271 }
1272
1273 /* turn all Norm structs into corresponding 32-bit norm values */
1274 static void
1275 makeAll32() {
1276     uint32_t *pNormData;
1277     uint32_t n;
1278     int32_t i, normLength, count;
1279
1280     count=(int32_t)utm_countItems(normMem);
1281     for(i=0; i<count; ++i) {
1282         norms[i].value32=make32BitNorm(norms+i);
1283     }
1284
1285     pNormData=utrie_getData(norm32Trie, &normLength);
1286
1287     count=0; /* count is now just used for debugging */
1288     for(i=0; i<normLength; ++i) {
1289         n=pNormData[i];
1290         if(0!=(pNormData[i]=norms[n].value32)) {
1291             ++count;
1292         }
1293     }
1294 }
1295
1296 /*
1297  * extract all Norm.canonBothCCs into the FCD table
1298  * set 32-bit values to use the common fold and compact functions
1299  */
1300 static void
1301 makeFCD() {
1302     uint32_t *pFCDData;
1303     uint32_t n;
1304     int32_t i, count, fcdLength;
1305     uint16_t bothCCs;
1306
1307     count=utm_countItems(normMem);
1308     for(i=0; i<count; ++i) {
1309         bothCCs=norms[i].canonBothCCs;
1310         if(bothCCs==0) {
1311             /* if there are no decomposition cc's then use the udataCC twice */
1312             bothCCs=norms[i].udataCC;
1313             bothCCs|=bothCCs<<8;
1314         }
1315         norms[i].value32=bothCCs;
1316     }
1317
1318     pFCDData=utrie_getData(fcdTrie, &fcdLength);
1319
1320     for(i=0; i<fcdLength; ++i) {
1321         n=pFCDData[i];
1322         pFCDData[i]=norms[n].value32;
1323     }
1324 }
1325
1326 /**
1327  * If the given set contains exactly one character, then return it.
1328  * Otherwise return -1.
1329  */
1330 static int32_t
1331 usetContainsOne(const USet* set) {
1332     if(uset_getItemCount(set)==1) {
1333         /* there is a single item (a single range) */
1334         UChar32 start, end;
1335         UErrorCode ec=U_ZERO_ERROR;
1336         int32_t len=uset_getItem(set, 0, &start, &end, NULL, 0, &ec);
1337         if (len==0 && start==end) { /* a range (len==0) with a single code point */
1338             return start;
1339         }
1340     }
1341     return -1;
1342 }
1343
1344 static void
1345 makeCanonSetFn(void *context, uint32_t code, Norm *norm) {
1346     if(norm->canonStart!=NULL && !uset_isEmpty(norm->canonStart)) {
1347         uint16_t *table;
1348         int32_t c, tableLength;
1349         UErrorCode errorCode=U_ZERO_ERROR;
1350
1351         /* does the set contain exactly one code point? */
1352         c=usetContainsOne(norm->canonStart);
1353
1354         /* add an entry to the BMP or supplementary search table */
1355         if(code<=0xffff) {
1356             table=canonStartSets+_NORM_MAX_CANON_SETS;
1357             tableLength=canonStartSets[_NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH];
1358
1359             table[tableLength++]=(uint16_t)code;
1360
1361             if(c>=0 && c<=0xffff && (c&_NORM_CANON_SET_BMP_MASK)!=_NORM_CANON_SET_BMP_IS_INDEX) {
1362                 /* single-code point BMP result for BMP code point */
1363                 table[tableLength++]=(uint16_t)c;
1364             } else {
1365                 table[tableLength++]=(uint16_t)(_NORM_CANON_SET_BMP_IS_INDEX|canonStartSetsTop);
1366                 c=-1;
1367             }
1368             canonStartSets[_NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH]=(uint16_t)tableLength;
1369         } else {
1370             table=canonStartSets+_NORM_MAX_CANON_SETS+_NORM_MAX_SET_SEARCH_TABLE_LENGTH;
1371             tableLength=canonStartSets[_NORM_SET_INDEX_CANON_SUPP_TABLE_LENGTH];
1372
1373             table[tableLength++]=(uint16_t)(code>>16);
1374             table[tableLength++]=(uint16_t)code;
1375
1376             if(c>=0) {
1377                 /* single-code point result for supplementary code point */
1378                 table[tableLength-2]|=(uint16_t)(0x8000|((c>>8)&0x1f00));
1379                 table[tableLength++]=(uint16_t)c;
1380             } else {
1381                 table[tableLength++]=(uint16_t)canonStartSetsTop;
1382             }
1383             canonStartSets[_NORM_SET_INDEX_CANON_SUPP_TABLE_LENGTH]=(uint16_t)tableLength;
1384         }
1385
1386         if(c<0) {
1387             /* write a USerializedSet */
1388             ++canonSetsCount;
1389             canonStartSetsTop+=
1390                     uset_serialize(norm->canonStart,
1391                             canonStartSets+canonStartSetsTop,
1392                             _NORM_MAX_CANON_SETS-canonStartSetsTop,
1393                             &errorCode);
1394         }
1395         canonStartSets[_NORM_SET_INDEX_CANON_SETS_LENGTH]=(uint16_t)canonStartSetsTop;
1396
1397         if(U_FAILURE(errorCode)) {
1398             fprintf(stderr, "gennorm error: uset_serialize()->%s (canonStartSetsTop=%d)\n", u_errorName(errorCode), (int)canonStartSetsTop);
1399             exit(errorCode);
1400         }
1401         if(tableLength>_NORM_MAX_SET_SEARCH_TABLE_LENGTH) {
1402             fprintf(stderr, "gennorm error: search table for canonical starter sets too long\n");
1403             exit(U_INDEX_OUTOFBOUNDS_ERROR);
1404         }
1405     }
1406 }
1407
1408 /* for getSkippableFlags ---------------------------------------------------- */
1409
1410 /* combine the lead and trail code points; return <0 if they do not combine */
1411 static int32_t
1412 combine(uint32_t lead, uint32_t trail) {
1413     CombiningTriple *triples;
1414     uint32_t i, count;
1415
1416     /* search for all triples with c as lead code point */
1417     triples=utm_getStart(combiningTriplesMem);
1418     count=utm_countItems(combiningTriplesMem);
1419
1420     /* triples are not sorted by code point but for each lead CP there is one contiguous block */
1421     for(i=0; i<count && lead!=triples[i].lead; ++i) {}
1422
1423     /* check each triple for this code point */
1424     for(; i<count && lead==triples[i].lead; ++i) {
1425         if(trail==triples[i].trail) {
1426             return (int32_t)triples[i].combined;
1427         }
1428     }
1429
1430     return -1;
1431 }
1432
1433 /*
1434  * Starting from the canonical decomposition s[0..length[ of a single code point,
1435  * is the code point c consumed in an NFC/FCC recomposition?
1436  *
1437  * No need to handle discontiguous composition because that would not consume some
1438  * intermediate character, so would not compose back to the original character.
1439  * See comments in canChangeWithFollowing().
1440  *
1441  * No need to compose beyond where c canonically orders because if it is consumed
1442  * then the result differs from the original anyway.
1443  *
1444  * Possible optimization:
1445  * - Verify that there are no cases of the same combining mark stacking twice.
1446  * - return FALSE right away if c inserts after a copy of itself
1447  *   without attempting to recompose; will happen because each mark in
1448  *   the decomposition will be enumerated and passed in as c.
1449  *   More complicated and fragile though than it is already.
1450  *
1451  * markus 2002nov04
1452  */
1453 static UBool
1454 doesComposeConsume(const uint32_t *s, int32_t length, uint32_t c, uint8_t cc) {
1455     int32_t starter, i;
1456
1457     /* ignore trailing characters where cc<prevCC */
1458     while(length>1 && cc<getCCFromCP(s[length-1])) {
1459         --length;
1460     }
1461
1462     /* start consuming/combining from the beginning */
1463     starter=(int32_t)s[0];
1464     for(i=1; i<length; ++i) {
1465         starter=combine((uint32_t)starter, s[i]);
1466         if(starter<0) {
1467             fprintf(stderr, "error: unable to consume normal decomposition in doesComposeConsume(<%04x, %04x, ...>[%d], U+%04x, %u)\n",
1468                 (int)s[0], (int)s[1], (int)length, (int)c, cc);
1469             exit(U_INTERNAL_PROGRAM_ERROR);
1470         }
1471     }
1472
1473     /* try to combine/consume c, return TRUE if it is consumed */
1474     return combine((uint32_t)starter, c)>=0;
1475 }
1476
1477 /* does the starter s[0] combine forward with another char that is below trailCC? */
1478 static UBool
1479 canChangeWithFollowing(const uint32_t *s, int32_t length, uint8_t trailCC) {
1480     if(trailCC<=1) {
1481         /* no character will combine ahead of the trailing char of the decomposition */
1482         return FALSE;
1483     }
1484
1485     /*
1486      * We are only checking skippable condition (f).
1487      * Therefore, the original character does not have quick check flag NFC_NO (c),
1488      * i.e., the decomposition recomposes completely back into the original code point.
1489      * So s[0] must be a true starter with cc==0 and
1490      * combining with following code points.
1491      *
1492      * Similarly, length==1 is not possible because that would be a singleton
1493      * decomposition which is marked with NFC_NO and does not pass (c).
1494      *
1495      * Only a character with cc<trailCC can change the composition.
1496      * Reason: A char with cc>=trailCC would order after decomposition s[],
1497      * composition would consume all of the decomposition, and here we know that
1498      * the original char passed check d), i.e., it does not combine forward,
1499      * therefore does not combine with anything after the decomposition is consumed.
1500      *
1501      * Now see if there is a character that
1502      * 1. combines backward
1503      * 2. has cc<trailCC
1504      * 3. is consumed in recomposition
1505      *
1506      * length==2 is simple:
1507      *
1508      * Characters that fulfill these conditions are exactly the ones that combine directly
1509      * with the starter c==s[0] because there is no intervening character after
1510      * reordering.
1511      * We can just enumerate all chars with which c combines (they all pass 1. and 3.)
1512      * and see if one has cc<trailCC (passes 2.).
1513      *
1514      * length>2 is a little harder:
1515      *
1516      * Since we will get different starters during recomposition, we need to
1517      * enumerate each backward-combining character (1.)
1518      * with cc<trailCC (2.) and
1519      * see if it gets consumed in recomposition. (3.)
1520      * No need to enumerate both-ways combining characters because they must have cc==0.
1521      */
1522     if(length==2) {
1523         /* enumerate all chars that combine with this one and check their cc */
1524         CombiningTriple *triples;
1525         uint32_t c, i, count;
1526         uint8_t cc;
1527
1528         /* search for all triples with c as lead code point */
1529         triples=utm_getStart(combiningTriplesMem);
1530         count=utm_countItems(combiningTriplesMem);
1531         c=s[0];
1532
1533         /* triples are not sorted by code point but for each lead CP there is one contiguous block */
1534         for(i=0; i<count && c!=triples[i].lead; ++i) {}
1535
1536         /* check each triple for this code point */
1537         for(; i<count && c==triples[i].lead; ++i) {
1538             cc=getCCFromCP(triples[i].trail);
1539             if(cc>0 && cc<trailCC) {
1540                 /* this trail code point combines with c and has cc<trailCC */
1541                 return TRUE;
1542             }
1543         }
1544     } else {
1545         /* enumerate all chars that combine backward */
1546         uint32_t c2;
1547         uint16_t i;
1548         uint8_t cc;
1549
1550         for(i=combineBothTop; i<combineBackTop; ++i) {
1551             c2=combiningCPs[i]&0xffffff;
1552             cc=getCCFromCP(c2);
1553             /* pass in length-1 because we already know that c2 will insert before the last character with trailCC */
1554             if(cc>0 && cc<trailCC && doesComposeConsume(s, length-1, c2, cc)) {
1555                 return TRUE;
1556             }
1557         }
1558     }
1559
1560     /* this decomposition is not modified by any appended character */
1561     return FALSE;
1562 }
1563
1564 /* see unormimp.h for details on NF*C Skippable flags */
1565 static uint32_t
1566 getSkippableFlags(const Norm *norm) {
1567     /* ignore NF*D skippable properties because they are covered by norm32, test at runtime */
1568
1569     /* ignore Hangul, test those at runtime (LV Hangul are not skippable) */
1570     if(norm->specialTag==_NORM_EXTRA_INDEX_TOP+_NORM_EXTRA_HANGUL) {
1571         return 0;
1572     }
1573
1574     /* ### TODO check other data generation functions whether they should & do ignore Hangul/Jamo specials */
1575
1576     /*
1577      * Note:
1578      * This function returns a non-zero flag only if (a)..(e) indicate skippable but (f) does not.
1579      *
1580      * This means that (a)..(e) must always be derived from the runtime norm32 value,
1581      * and (f) be checked from the auxTrie if the character is skippable per (a)..(e),
1582      * the form is NF*C and there is a canonical decomposition (NFD_NO).
1583      *
1584      * (a) unassigned code points get "not skippable"==false because they
1585      * don't have a Norm struct so they won't get here
1586      */
1587
1588     /* (b) not skippable if cc!=0 */
1589     if(norm->udataCC!=0) {
1590         return 0; /* non-zero flag for (f) only */
1591     }
1592
1593     /*
1594      * not NFC_Skippable if
1595      * (c) quick check flag == NO  or
1596      * (d) combines forward  or
1597      * (e) combines back or
1598      * (f) can change if another character is added
1599      *
1600      * for (f):
1601      * For NF*C: Get corresponding decomposition, get its last starter (cc==0),
1602      *           check its composition list,
1603      *           see if any of the second code points in the list
1604      *           has cc less than the trailCC of the decomposition.
1605      *
1606      * For FCC: Test at runtime if the decomposition has a trailCC>1
1607      *          -> there are characters with cc==1, they would order before the trail char
1608      *          and prevent contiguous combination with the trail char.
1609      */
1610     if( (norm->qcFlags&(_NORM_QC_NFC&_NORM_QC_ANY_NO))!=0 ||
1611         (norm->combiningFlags&3)!=0) {
1612         return 0; /* non-zero flag for (f) only */
1613     }
1614     if(norm->lenNFD!=0 && canChangeWithFollowing(norm->nfd, norm->lenNFD, (uint8_t)norm->canonBothCCs)) {
1615         return _NORM_AUX_NFC_SKIP_F_MASK;
1616     }
1617
1618     return 0; /* skippable */
1619 }
1620
1621 static void
1622 makeAux() {
1623     Norm *norm;
1624     uint32_t *pData;
1625     int32_t i, length;
1626
1627     pData=utrie_getData(auxTrie, &length);
1628
1629     for(i=0; i<length; ++i) {
1630         norm=norms+pData[i];
1631         /*
1632          * 16-bit auxiliary normalization properties
1633          * see unormimp.h
1634          */
1635         pData[i]=
1636             ((uint32_t)(norm->combiningFlags&0x80)<<(_NORM_AUX_COMP_EX_SHIFT-7))|
1637             (uint32_t)norm->fncIndex;
1638
1639         if(norm->unsafeStart || norm->udataCC!=0) {
1640             pData[i]|=_NORM_AUX_UNSAFE_MASK;
1641         }
1642
1643         pData[i]|=getSkippableFlags(norm);
1644     }
1645 }
1646
1647 /* folding value for normalization: just store the offset (16 bits) if there is any non-0 entry */
1648 static uint32_t U_CALLCONV
1649 getFoldedNormValue(UNewTrie *trie, UChar32 start, int32_t offset) {
1650     uint32_t value, leadNorm32=0;
1651     UChar32 limit;
1652     UBool inBlockZero;
1653
1654     limit=start+0x400;
1655     while(start<limit) {
1656         value=utrie_get32(trie, start, &inBlockZero);
1657         if(inBlockZero) {
1658             start+=UTRIE_DATA_BLOCK_LENGTH;
1659         } else {
1660             if(value!=0) {
1661                 leadNorm32|=value;
1662             }
1663             ++start;
1664         }
1665     }
1666
1667     /* turn multi-bit fields into the worst-case value */
1668     if(leadNorm32&_NORM_CC_MASK) {
1669         leadNorm32|=_NORM_CC_MASK;
1670     }
1671
1672     /* clean up unnecessarily ored bit fields */
1673     leadNorm32&=~((uint32_t)0xffffffff<<_NORM_EXTRA_SHIFT);
1674
1675     if(leadNorm32==0) {
1676         /* nothing to do (only composition exclusions?) */
1677         return 0;
1678     }
1679
1680     /* add the extra surrogate index, offset by the BMP top, for the new stage 1 location */
1681     leadNorm32|=(
1682         (uint32_t)_NORM_EXTRA_INDEX_TOP+
1683         (uint32_t)((offset-UTRIE_BMP_INDEX_LENGTH)>>UTRIE_SURROGATE_BLOCK_BITS)
1684     )<<_NORM_EXTRA_SHIFT;
1685
1686     return leadNorm32;
1687 }
1688
1689 /* folding value for FCD: use default function (just store the offset (16 bits) if there is any non-0 entry) */
1690
1691 /*
1692  * folding value for auxiliary data:
1693  * store the non-zero offset in bits 9..0 (FNC bits)
1694  * if there is any non-0 entry;
1695  * "or" [verb!] together data bits 15..10 of all of the 1024 supplementary code points
1696  */
1697 static uint32_t U_CALLCONV
1698 getFoldedAuxValue(UNewTrie *trie, UChar32 start, int32_t offset) {
1699     uint32_t value, oredValues;
1700     UChar32 limit;
1701     UBool inBlockZero;
1702
1703     oredValues=0;
1704     limit=start+0x400;
1705     while(start<limit) {
1706         value=utrie_get32(trie, start, &inBlockZero);
1707         if(inBlockZero) {
1708             start+=UTRIE_DATA_BLOCK_LENGTH;
1709         } else {
1710             oredValues|=value;
1711             ++start;
1712         }
1713     }
1714
1715     if(oredValues!=0) {
1716         /* move the 10 significant offset bits into bits 9..0 */
1717         offset>>=UTRIE_SURROGATE_BLOCK_BITS;
1718         if(offset>_NORM_AUX_FNC_MASK) {
1719             fprintf(stderr, "gennorm error: folding offset too large (auxTrie)\n");
1720             exit(U_INDEX_OUTOFBOUNDS_ERROR);
1721         }
1722         return (uint32_t)offset|(oredValues&~_NORM_AUX_FNC_MASK);
1723     } else {
1724         return 0;
1725     }
1726 }
1727
1728 extern void
1729 processData() {
1730 #if 0
1731     uint16_t i;
1732 #endif
1733
1734     processCombining();
1735
1736     /* canonically reorder decompositions and assign combining classes for decompositions */
1737     enumTrie(postParseFn, NULL);
1738
1739 #if 0
1740     for(i=1; i<64; ++i) {
1741         if(combineAndQC[i]) {
1742             printf("combiningFlags==0x%02x  qcFlags(NF?C)==0x%02x\n", (i&0xc)>>2, i&0x33);
1743         }
1744     }
1745 #endif
1746
1747     /* add hangul/jamo specials */
1748     setHangulJamoSpecials();
1749
1750     /* set this value; will be updated as makeCanonSetFn() adds sets (if there are any, see gStoreFlags) */
1751     canonStartSets[_NORM_SET_INDEX_CANON_SETS_LENGTH]=(uint16_t)canonStartSetsTop;
1752
1753     /* store search tables and USerializedSets for canonical starters (after Hangul/Jamo specials!) */
1754     if(DO_STORE(UGENNORM_STORE_AUX) && DO_STORE(UGENNORM_STORE_COMPOSITION)) {
1755         enumTrie(makeCanonSetFn, NULL);
1756     }
1757
1758     /* clone the normalization builder trie to make the final data tries */
1759     if( NULL==utrie_clone(norm32Trie, normTrie, NULL, 0) ||
1760         NULL==utrie_clone(fcdTrie, normTrie, NULL, 0) ||
1761         NULL==utrie_clone(auxTrie, normTrie, NULL, 0)
1762     ) {
1763         fprintf(stderr, "error: unable to clone the normalization trie\n");
1764         exit(U_MEMORY_ALLOCATION_ERROR);
1765     }
1766
1767     /* --- finalize data for quick checks & normalization --- */
1768
1769     /* turn the Norm structs (stage2, norms) into 32-bit data words */
1770     makeAll32();
1771
1772     /* --- finalize data for FCD checks --- */
1773
1774     /* FCD data: take Norm.canonBothCCs and store them in the FCD table */
1775     makeFCD();
1776
1777     /* --- finalize auxiliary normalization data --- */
1778     makeAux();
1779
1780     if(beVerbose) {
1781 #if 0
1782         printf("number of stage 2 entries: %ld\n", stage2Mem->index);
1783         printf("size of stage 1 (BMP) & 2 (uncompacted) + extra data: %ld bytes\n", _NORM_STAGE_1_BMP_COUNT*2+stage2Mem->index*4+extraMem->index*2);
1784 #endif
1785         printf("combining CPs tops: fwd %u  both %u  back %u\n", combineFwdTop, combineBothTop, combineBackTop);
1786         printf("combining table count: %u\n", combiningTableTop);
1787     }
1788 }
1789
1790 #endif /* #if !UCONFIG_NO_NORMALIZATION */
1791
1792 extern void
1793 generateData(const char *dataDir, UBool csource) {
1794     static uint8_t normTrieBlock[100000], fcdTrieBlock[100000], auxTrieBlock[100000];
1795
1796     UNewDataMemory *pData;
1797     UErrorCode errorCode=U_ZERO_ERROR;
1798     int32_t size, dataLength;
1799
1800 #if UCONFIG_NO_NORMALIZATION
1801
1802     size=0;
1803
1804 #else
1805
1806     U_STRING_DECL(nxCJKCompatPattern, "[:Ideographic:]", 15);
1807     U_STRING_DECL(nxUnicode32Pattern, "[:^Age=3.2:]", 12);
1808     USet *set;
1809     int32_t normTrieSize, fcdTrieSize, auxTrieSize;
1810
1811     normTrieSize=utrie_serialize(norm32Trie, normTrieBlock, sizeof(normTrieBlock), getFoldedNormValue, FALSE, &errorCode);
1812     if(U_FAILURE(errorCode)) {
1813         fprintf(stderr, "error: utrie_serialize(normalization properties) failed, %s\n", u_errorName(errorCode));
1814         exit(errorCode);
1815     }
1816
1817     if(DO_STORE(UGENNORM_STORE_FCD)) {
1818         fcdTrieSize=utrie_serialize(fcdTrie, fcdTrieBlock, sizeof(fcdTrieBlock), NULL, TRUE, &errorCode);
1819         if(U_FAILURE(errorCode)) {
1820             fprintf(stderr, "error: utrie_serialize(FCD data) failed, %s\n", u_errorName(errorCode));
1821             exit(errorCode);
1822         }
1823     } else {
1824         fcdTrieSize=0;
1825     }
1826
1827     if(DO_STORE(UGENNORM_STORE_AUX)) {
1828         auxTrieSize=utrie_serialize(auxTrie, auxTrieBlock, sizeof(auxTrieBlock), getFoldedAuxValue, TRUE, &errorCode);
1829         if(U_FAILURE(errorCode)) {
1830             fprintf(stderr, "error: utrie_serialize(auxiliary data) failed, %s\n", u_errorName(errorCode));
1831             exit(errorCode);
1832         }
1833     } else {
1834         auxTrieSize=0;
1835     }
1836
1837     /* move the parts of canonStartSets[] together into a contiguous block */
1838     if( canonStartSetsTop<_NORM_MAX_CANON_SETS &&
1839         canonStartSets[_NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH]!=0
1840     ) {
1841         uprv_memmove(canonStartSets+canonStartSetsTop,
1842                      canonStartSets+_NORM_MAX_CANON_SETS,
1843                      canonStartSets[_NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH]*2);
1844     }
1845     canonStartSetsTop+=canonStartSets[_NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH];
1846
1847     if( canonStartSetsTop<(_NORM_MAX_CANON_SETS+_NORM_MAX_SET_SEARCH_TABLE_LENGTH) &&
1848         canonStartSets[_NORM_SET_INDEX_CANON_SUPP_TABLE_LENGTH]!=0
1849     ) {
1850         uprv_memmove(canonStartSets+canonStartSetsTop,
1851                      canonStartSets+_NORM_MAX_CANON_SETS+_NORM_MAX_SET_SEARCH_TABLE_LENGTH,
1852                      canonStartSets[_NORM_SET_INDEX_CANON_SUPP_TABLE_LENGTH]*2);
1853     }
1854     canonStartSetsTop+=canonStartSets[_NORM_SET_INDEX_CANON_SUPP_TABLE_LENGTH];
1855
1856     /* create the normalization exclusion sets */
1857     /*
1858      * nxCJKCompatPattern should be [[:Ideographic:]&[:NFD_QC=No:]]
1859      * but we cannot use NFD_QC from the pattern because that would require
1860      * unorm.icu which we are just going to generate.
1861      * Therefore we have manually collected nfdQCNoSet and intersect Ideographic
1862      * with that.
1863      */
1864     U_STRING_INIT(nxCJKCompatPattern, "[:Ideographic:]", 15);
1865     U_STRING_INIT(nxUnicode32Pattern, "[:^Age=3.2:]", 12);
1866
1867     canonStartSets[_NORM_SET_INDEX_NX_CJK_COMPAT_OFFSET]=canonStartSetsTop;
1868     set=uset_openPattern(nxCJKCompatPattern, -1, &errorCode);
1869     if(U_FAILURE(errorCode)) {
1870         fprintf(stderr, "error: uset_openPattern([:Ideographic:]&[:NFD_QC=No:]) failed, %s\n", u_errorName(errorCode));
1871         exit(errorCode);
1872     }
1873     uset_retainAll(set, nfdQCNoSet);
1874     if(DO_NOT_STORE(UGENNORM_STORE_EXCLUSIONS)) {
1875         uset_clear(set);
1876     }
1877     canonStartSetsTop+=uset_serialize(set, canonStartSets+canonStartSetsTop, LENGTHOF(canonStartSets)-canonStartSetsTop, &errorCode);
1878     if(U_FAILURE(errorCode)) {
1879         fprintf(stderr, "error: uset_serialize([:Ideographic:]&[:NFD_QC=No:]) failed, %s\n", u_errorName(errorCode));
1880         exit(errorCode);
1881     }
1882     uset_close(set);
1883
1884     canonStartSets[_NORM_SET_INDEX_NX_UNICODE32_OFFSET]=canonStartSetsTop;
1885     set=uset_openPattern(nxUnicode32Pattern, -1, &errorCode);
1886     if(U_FAILURE(errorCode)) {
1887         fprintf(stderr, "error: uset_openPattern([:^Age=3.2:]) failed, %s\n", u_errorName(errorCode));
1888         exit(errorCode);
1889     }
1890     if(DO_NOT_STORE(UGENNORM_STORE_EXCLUSIONS)) {
1891         uset_clear(set);
1892     }
1893     canonStartSetsTop+=uset_serialize(set, canonStartSets+canonStartSetsTop, LENGTHOF(canonStartSets)-canonStartSetsTop, &errorCode);
1894     if(U_FAILURE(errorCode)) {
1895         fprintf(stderr, "error: uset_serialize([:^Age=3.2:]) failed, %s\n", u_errorName(errorCode));
1896         exit(errorCode);
1897     }
1898     uset_close(set);
1899
1900     canonStartSets[_NORM_SET_INDEX_NX_RESERVED_OFFSET]=canonStartSetsTop;
1901
1902     /* make sure that the FCD trie is 4-aligned */
1903     if((utm_countItems(extraMem)+combiningTableTop)&1) {
1904         combiningTable[combiningTableTop++]=0x1234; /* add one 16-bit word for an even number */
1905     }
1906
1907     /* pad canonStartSets to 4-alignment, too */
1908     if(canonStartSetsTop&1) {
1909         canonStartSets[canonStartSetsTop++]=0x1235;
1910     }
1911
1912     size=
1913         _NORM_INDEX_TOP*4+
1914         normTrieSize+
1915         utm_countItems(extraMem)*2+
1916         combiningTableTop*2+
1917         fcdTrieSize+
1918         auxTrieSize+
1919         canonStartSetsTop*2;
1920
1921     if(beVerbose) {
1922         printf("size of normalization trie              %5u bytes\n", (int)normTrieSize);
1923         printf("size of 16-bit extra memory             %5u UChars/uint16_t\n", (int)utm_countItems(extraMem));
1924         printf("  of that: FC_NFKC_Closure size         %5u UChars/uint16_t\n", ((uint16_t *)utm_getStart(extraMem))[0]);
1925         printf("size of combining table                 %5u uint16_t\n", combiningTableTop);
1926         printf("size of FCD trie                        %5u bytes\n", (int)fcdTrieSize);
1927         printf("size of auxiliary trie                  %5u bytes\n", (int)auxTrieSize);
1928         printf("size of canonStartSets[]                %5u uint16_t\n", (int)canonStartSetsTop);
1929         printf("  number of indexes                     %5u uint16_t\n", _NORM_SET_INDEX_TOP);
1930         printf("  size of sets                          %5u uint16_t\n", canonStartSets[_NORM_SET_INDEX_CANON_SETS_LENGTH]-_NORM_SET_INDEX_TOP);
1931         printf("  number of sets                        %5d\n", (int)canonSetsCount);
1932         printf("  size of BMP search table              %5u uint16_t\n", canonStartSets[_NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH]);
1933         printf("  size of supplementary search table    %5u uint16_t\n", canonStartSets[_NORM_SET_INDEX_CANON_SUPP_TABLE_LENGTH]);
1934         printf("  length of exclusion sets              %5u uint16_t\n", canonStartSets[_NORM_SET_INDEX_NX_RESERVED_OFFSET]-canonStartSets[_NORM_SET_INDEX_NX_CJK_COMPAT_OFFSET]);
1935         printf("size of " U_ICUDATA_NAME "_" DATA_NAME "." DATA_TYPE " contents: %ld bytes\n", (long)size);
1936     }
1937
1938     indexes[_NORM_INDEX_TRIE_SIZE]=normTrieSize;
1939     indexes[_NORM_INDEX_UCHAR_COUNT]=(uint16_t)utm_countItems(extraMem);
1940
1941     indexes[_NORM_INDEX_COMBINE_DATA_COUNT]=combiningTableTop;
1942     indexes[_NORM_INDEX_COMBINE_FWD_COUNT]=combineFwdTop;
1943     indexes[_NORM_INDEX_COMBINE_BOTH_COUNT]=(uint16_t)(combineBothTop-combineFwdTop);
1944     indexes[_NORM_INDEX_COMBINE_BACK_COUNT]=(uint16_t)(combineBackTop-combineBothTop);
1945
1946     /* the quick check minimum code points are already set */
1947
1948     indexes[_NORM_INDEX_FCD_TRIE_SIZE]=fcdTrieSize;
1949     indexes[_NORM_INDEX_AUX_TRIE_SIZE]=auxTrieSize;
1950     indexes[_NORM_INDEX_CANON_SET_COUNT]=canonStartSetsTop;
1951
1952 #endif
1953
1954     if(csource) {
1955 #if UCONFIG_NO_NORMALIZATION
1956     /* no csource for dummy mode..? */
1957     fprintf(stderr, "gennorm error: UCONFIG_NO_NORMALIZATION is on in csource mode.\n");
1958     exit(1);
1959 #else
1960         /* write .c file for hardcoded data */
1961         UTrie normTrie2={ NULL }, fcdTrie2={ NULL }, auxTrie2={ NULL };
1962         FILE *f;
1963
1964         utrie_unserialize(&normTrie2, normTrieBlock, normTrieSize, &errorCode);
1965         if(fcdTrieSize>0) {
1966             utrie_unserialize(&fcdTrie2, fcdTrieBlock, fcdTrieSize, &errorCode);
1967         }
1968         if(auxTrieSize>0) {
1969             utrie_unserialize(&auxTrie2, auxTrieBlock, auxTrieSize, &errorCode);
1970         }
1971         if(U_FAILURE(errorCode)) {
1972             fprintf(
1973                 stderr,
1974                 "gennorm error: failed to utrie_unserialize() one of the tries - %s\n",
1975                 u_errorName(errorCode));
1976             exit(errorCode);
1977         }
1978
1979         f=usrc_create(dataDir, "unorm_props_data.c");
1980         if(f!=NULL) {
1981             usrc_writeArray(f,
1982                 "static const UVersionInfo formatVersion={ ",
1983                 dataInfo.formatVersion, 8, 4,
1984                 " };\n\n");
1985             usrc_writeArray(f,
1986                 "static const UVersionInfo dataVersion={ ",
1987                 dataInfo.dataVersion, 8, 4,
1988                 " };\n\n");
1989             usrc_writeArray(f,
1990                 "static const int32_t indexes[_NORM_INDEX_TOP]={\n",
1991                 indexes, 32, _NORM_INDEX_TOP,
1992                 "\n};\n\n");
1993             usrc_writeUTrieArrays(f,
1994                 "static const uint16_t normTrie_index[%ld]={\n",
1995                 "static const uint32_t normTrie_data32[%ld]={\n",
1996                 &normTrie2,
1997                 "\n};\n\n");
1998             usrc_writeUTrieStruct(f,
1999                 "static const UTrie normTrie={\n",
2000                 &normTrie2, "normTrie_index", "normTrie_data32", "getFoldingNormOffset",
2001                 "};\n\n");
2002             usrc_writeArray(f,
2003                 "static const uint16_t extraData[%ld]={\n",
2004                 utm_getStart(extraMem), 16, utm_countItems(extraMem),
2005                 "\n};\n\n");
2006             usrc_writeArray(f,
2007                 "static const uint16_t combiningTable[%ld]={\n",
2008                 combiningTable, 16, combiningTableTop,
2009                 "\n};\n\n");
2010             if(fcdTrieSize>0) {
2011                 usrc_writeUTrieArrays(f,
2012                     "static const uint16_t fcdTrie_index[%ld]={\n", NULL,
2013                     &fcdTrie2,
2014                     "\n};\n\n");
2015                 usrc_writeUTrieStruct(f,
2016                     "static const UTrie fcdTrie={\n",
2017                     &fcdTrie2, "fcdTrie_index", NULL, NULL,
2018                     "};\n\n");
2019             } else {
2020                 fputs( "static const UTrie fcdTrie={ NULL };\n\n", f);
2021             }
2022             if(auxTrieSize>0) {
2023                 usrc_writeUTrieArrays(f,
2024                     "static const uint16_t auxTrie_index[%ld]={\n", NULL,
2025                     &auxTrie2,
2026                     "\n};\n\n");
2027                 usrc_writeUTrieStruct(f,
2028                     "static const UTrie auxTrie={\n",
2029                     &auxTrie2, "auxTrie_index", NULL, "getFoldingAuxOffset",
2030                     "};\n\n");
2031             } else {
2032                 fputs( "static const UTrie auxTrie={ NULL };\n\n", f);
2033             }
2034             usrc_writeArray(f,
2035                 "static const uint16_t canonStartSets[%ld]={\n",
2036                 canonStartSets, 16, canonStartSetsTop,
2037                 "\n};\n\n");
2038             fclose(f);
2039         }
2040 #endif
2041     } else {
2042         /* write the data */
2043         pData=udata_create(dataDir, DATA_TYPE, DATA_NAME, &dataInfo,
2044                         haveCopyright ? U_COPYRIGHT_STRING : NULL, &errorCode);
2045         if(U_FAILURE(errorCode)) {
2046             fprintf(stderr, "gennorm: unable to create the output file, error %d\n", errorCode);
2047             exit(errorCode);
2048         }
2049
2050 #if !UCONFIG_NO_NORMALIZATION
2051
2052         udata_writeBlock(pData, indexes, sizeof(indexes));
2053         udata_writeBlock(pData, normTrieBlock, normTrieSize);
2054         udata_writeBlock(pData, utm_getStart(extraMem), utm_countItems(extraMem)*2);
2055         udata_writeBlock(pData, combiningTable, combiningTableTop*2);
2056         udata_writeBlock(pData, fcdTrieBlock, fcdTrieSize);
2057         udata_writeBlock(pData, auxTrieBlock, auxTrieSize);
2058         udata_writeBlock(pData, canonStartSets, canonStartSetsTop*2);
2059
2060 #endif
2061
2062         /* finish up */
2063         dataLength=udata_finish(pData, &errorCode);
2064         if(U_FAILURE(errorCode)) {
2065             fprintf(stderr, "gennorm: error %d writing the output file\n", errorCode);
2066             exit(errorCode);
2067         }
2068
2069         if(dataLength!=size) {
2070             fprintf(stderr, "gennorm error: data length %ld != calculated size %ld\n",
2071                 (long)dataLength, (long)size);
2072             exit(U_INTERNAL_PROGRAM_ERROR);
2073         }
2074     }
2075 }
2076
2077 #if !UCONFIG_NO_NORMALIZATION
2078
2079 extern void
2080 cleanUpData(void) {
2081     int32_t i, count;
2082
2083     count=utm_countItems(normMem);
2084     for(i=0; i<count; ++i) {
2085         uset_close(norms[i].canonStart);
2086     }
2087
2088     utm_close(normMem);
2089     utm_close(utf32Mem);
2090     utm_close(extraMem);
2091     utm_close(combiningTriplesMem);
2092     utrie_close(normTrie);
2093     utrie_close(norm32Trie);
2094     utrie_close(fcdTrie);
2095     utrie_close(auxTrie);
2096
2097     uset_close(nfdQCNoSet);
2098
2099     uprv_free(normTrie);
2100     uprv_free(norm32Trie);
2101     uprv_free(fcdTrie);
2102     uprv_free(auxTrie);
2103 }
2104
2105 #endif /* #if !UCONFIG_NO_NORMALIZATION */
2106
2107 /*
2108  * Hey, Emacs, please set the following:
2109  *
2110  * Local Variables:
2111  * indent-tabs-mode: nil
2112  * End:
2113  *
2114  */