]> git.saurik.com Git - apple/icu.git/blame - icuSources/tools/gennorm/store.c
ICU-6.2.14.tar.gz
[apple/icu.git] / icuSources / tools / gennorm / store.c
CommitLineData
b75a7d8f
A
1/*
2*******************************************************************************
3*
374ca955 4* Copyright (C) 1999-2004, International Business Machines
b75a7d8f
A
5* Corporation and others. All Rights Reserved.
6*
7*******************************************************************************
8* file name: store.c
9* encoding: US-ASCII
10* tab size: 8 (not used)
11* indentation:4
12*
13* created on: 2001may25
14* created by: Markus W. Scherer
15*
16* Store Unicode normalization data in a memory-mappable file.
17*/
18
19#include <stdio.h>
20#include <stdlib.h>
21#include "unicode/utypes.h"
22#include "unicode/uchar.h"
374ca955 23#include "unicode/ustring.h"
b75a7d8f
A
24#include "cmemory.h"
25#include "cstring.h"
26#include "filestrm.h"
27#include "unicode/udata.h"
28#include "utrie.h"
29#include "unicode/uset.h"
374ca955 30#include "toolutil.h"
b75a7d8f
A
31#include "unewdata.h"
32#include "unormimp.h"
33#include "gennorm.h"
34#ifdef WIN32
35# pragma warning(disable: 4100)
36#endif
37
38#define DO_DEBUG_OUT 0
39
374ca955
A
40#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
41
b75a7d8f
A
42/*
43 * The new implementation of the normalization code loads its data from
44 * unorm.icu, which is generated with this gennorm tool.
45 * The format of that file is described in unormimp.h .
46 */
47
48/* file data ---------------------------------------------------------------- */
49
50#if UCONFIG_NO_NORMALIZATION
51
52/* dummy UDataInfo cf. udata.h */
53static UDataInfo dataInfo = {
54 sizeof(UDataInfo),
55 0,
56
57 U_IS_BIG_ENDIAN,
58 U_CHARSET_FAMILY,
59 U_SIZEOF_UCHAR,
60 0,
61
62 { 0, 0, 0, 0 }, /* dummy dataFormat */
63 { 0, 0, 0, 0 }, /* dummy formatVersion */
64 { 0, 0, 0, 0 } /* dummy dataVersion */
65};
66
67#else
68
69/* UDataInfo cf. udata.h */
70static UDataInfo dataInfo={
71 sizeof(UDataInfo),
72 0,
73
74 U_IS_BIG_ENDIAN,
75 U_CHARSET_FAMILY,
76 U_SIZEOF_UCHAR,
77 0,
78
79 { 0x4e, 0x6f, 0x72, 0x6d }, /* dataFormat="Norm" */
374ca955 80 { 2, 3, UTRIE_SHIFT, UTRIE_INDEX_SHIFT }, /* formatVersion */
b75a7d8f
A
81 { 3, 2, 0, 0 } /* dataVersion (Unicode version) */
82};
83
84extern void
85setUnicodeVersion(const char *v) {
86 UVersionInfo version;
87 u_versionFromString(version, v);
88 uprv_memcpy(dataInfo.dataVersion, version, 4);
89}
90
91static int32_t indexes[_NORM_INDEX_TOP]={ 0 };
92
b75a7d8f
A
93/* builder data ------------------------------------------------------------- */
94
95typedef void EnumTrieFn(void *context, uint32_t code, Norm *norm);
96
97static UNewTrie
374ca955
A
98 *normTrie,
99 *norm32Trie,
100 *fcdTrie,
101 *auxTrie;
b75a7d8f
A
102
103static UToolMemory *normMem, *utf32Mem, *extraMem, *combiningTriplesMem;
104
105static Norm *norms;
106
107/*
108 * set a flag for each code point that was seen in decompositions -
109 * avoid to decompose ones that have not been used before
110 */
111static uint32_t haveSeenFlags[256];
112
374ca955
A
113/* set of characters with NFD_QC=No (i.e., those with canonical decompositions) */
114static USet *nfdQCNoSet;
115
b75a7d8f
A
116/* see addCombiningCP() for details */
117static uint32_t combiningCPs[2000];
118
119/*
120 * after processCombining() this contains for each code point in combiningCPs[]
121 * the runtime combining index
122 */
123static uint16_t combiningIndexes[2000];
124
125/* section limits for combiningCPs[], see addCombiningCP() */
126static uint16_t combineFwdTop=0, combineBothTop=0, combineBackTop=0;
127
128/**
129 * Structure for a triple of code points, stored in combiningTriplesMem.
130 * The lead and trail code points combine into the combined one,
131 * i.e., there is a canonical decomposition of combined-> <lead, trail>.
132 *
133 * Before processCombining() is called, leadIndex and trailIndex are 0.
134 * After processCombining(), they contain the indexes of the lead and trail
135 * code point in the combiningCPs[] array.
136 * They are then sorted by leadIndex, then trailIndex.
137 * They are not sorted by code points.
138 */
typedef struct CombiningTriple {
    /* positions of lead and trail in combiningCPs[], filled in by processCombining() */
    uint16_t leadIndex;
    uint16_t trailIndex;

    /* the code points themselves: lead+trail compose into combined */
    uint32_t lead;
    uint32_t trail;
    uint32_t combined;
} CombiningTriple;
143
144/* 15b in the combining index -> <=0x8000 uint16_t values in the combining table */
145static uint16_t combiningTable[0x8000];
146static uint16_t combiningTableTop=0;
147
148#define _NORM_MAX_SET_SEARCH_TABLE_LENGTH 0x4000
374ca955
A
149static uint16_t canonStartSets[_NORM_MAX_CANON_SETS+2*_NORM_MAX_SET_SEARCH_TABLE_LENGTH
150 +10000]; /* +10000 for exclusion sets */
b75a7d8f
A
151static int32_t canonStartSetsTop=_NORM_SET_INDEX_TOP;
152static int32_t canonSetsCount=0;
153
154extern void
155init() {
156 uint16_t *p16;
157
374ca955
A
158 normTrie = (UNewTrie *)uprv_malloc(sizeof(UNewTrie));
159 uprv_memset(normTrie, 0, sizeof(UNewTrie));
160 norm32Trie = (UNewTrie *)uprv_malloc(sizeof(UNewTrie));
161 uprv_memset(norm32Trie, 0, sizeof(UNewTrie));
162 fcdTrie = (UNewTrie *)uprv_malloc(sizeof(UNewTrie));
163 uprv_memset(fcdTrie, 0, sizeof(UNewTrie));
164 auxTrie = (UNewTrie *)uprv_malloc(sizeof(UNewTrie));
165 uprv_memset(auxTrie, 0, sizeof(UNewTrie));
166
b75a7d8f 167 /* initialize the two tries */
374ca955 168 if(NULL==utrie_open(normTrie, NULL, 30000, 0, 0, FALSE)) {
b75a7d8f
A
169 fprintf(stderr, "error: failed to initialize tries\n");
170 exit(U_MEMORY_ALLOCATION_ERROR);
171 }
172
173 /* allocate Norm structures and reset the first one */
374ca955 174 normMem=utm_open("gennorm normalization structs", 20000, 20000, sizeof(Norm));
b75a7d8f
A
175 norms=utm_alloc(normMem);
176
177 /* allocate UTF-32 string memory */
374ca955 178 utf32Mem=utm_open("gennorm UTF-32 strings", 30000, 30000, 4);
b75a7d8f
A
179
180 /* reset all "have seen" flags */
181 uprv_memset(haveSeenFlags, 0, sizeof(haveSeenFlags));
182
374ca955
A
183 /* open an empty set */
184 nfdQCNoSet=uset_open(1, 0);
185
b75a7d8f 186 /* allocate extra data memory for UTF-16 decomposition strings and other values */
374ca955 187 extraMem=utm_open("gennorm extra 16-bit memory", _NORM_EXTRA_INDEX_TOP, _NORM_EXTRA_INDEX_TOP, 2);
b75a7d8f
A
188 /* initialize the extraMem counter for the top of FNC strings */
189 p16=(uint16_t *)utm_alloc(extraMem);
190 *p16=1;
191
192 /* allocate temporary memory for combining triples */
374ca955 193 combiningTriplesMem=utm_open("gennorm combining triples", 0x4000, 0x4000, sizeof(CombiningTriple));
b75a7d8f
A
194
195 /* set the minimum code points for no/maybe quick check values to the end of the BMP */
196 indexes[_NORM_INDEX_MIN_NFC_NO_MAYBE]=0xffff;
197 indexes[_NORM_INDEX_MIN_NFKC_NO_MAYBE]=0xffff;
198 indexes[_NORM_INDEX_MIN_NFD_NO_MAYBE]=0xffff;
199 indexes[_NORM_INDEX_MIN_NFKD_NO_MAYBE]=0xffff;
200
201 /* preset the indexes portion of canonStartSets */
202 uprv_memset(canonStartSets, 0, _NORM_SET_INDEX_TOP*2);
203}
204
205/*
206 * get or create a Norm unit;
207 * get or create the intermediate trie entries for it as well
208 */
209static Norm *
210createNorm(uint32_t code) {
211 Norm *p;
212 uint32_t i;
213
374ca955 214 i=utrie_get32(normTrie, (UChar32)code, NULL);
b75a7d8f
A
215 if(i!=0) {
216 p=norms+i;
217 } else {
218 /* allocate Norm */
219 p=(Norm *)utm_alloc(normMem);
374ca955 220 if(!utrie_set32(normTrie, (UChar32)code, (uint32_t)(p-norms))) {
b75a7d8f
A
221 fprintf(stderr, "error: too many normalization entries\n");
222 exit(U_BUFFER_OVERFLOW_ERROR);
223 }
224 }
225 return p;
226}
227
228/* get an existing Norm unit */
229static Norm *
230getNorm(uint32_t code) {
231 uint32_t i;
232
374ca955 233 i=utrie_get32(normTrie, (UChar32)code, NULL);
b75a7d8f
A
234 if(i==0) {
235 return NULL;
236 }
237 return norms+i;
238}
239
240/* get the canonical combining class of a character */
241static uint8_t
242getCCFromCP(uint32_t code) {
243 Norm *norm=getNorm(code);
244 if(norm==NULL) {
245 return 0;
246 } else {
247 return norm->udataCC;
248 }
249}
250
251/*
252 * enumerate all code points with their Norm structs and call a function for each
253 * return the number of code points with data
254 */
255static uint32_t
256enumTrie(EnumTrieFn *fn, void *context) {
257 uint32_t count, i;
258 UChar32 code;
259 UBool isInBlockZero;
260
261 count=0;
262 for(code=0; code<=0x10ffff;) {
374ca955 263 i=utrie_get32(normTrie, code, &isInBlockZero);
b75a7d8f
A
264 if(isInBlockZero) {
265 code+=UTRIE_DATA_BLOCK_LENGTH;
266 } else {
267 if(i!=0) {
268 fn(context, (uint32_t)code, norms+i);
269 ++count;
270 }
271 ++code;
272 }
273 }
274 return count;
275}
276
277static void
278setHaveSeenString(const uint32_t *s, int32_t length) {
279 uint32_t c;
280
281 while(length>0) {
282 c=*s++;
283 haveSeenFlags[(c>>5)&0xff]|=(1<<(c&0x1f));
284 --length;
285 }
286}
287
288#define HAVE_SEEN(c) (haveSeenFlags[((c)>>5)&0xff]&(1<<((c)&0x1f)))
289
290/* handle combining data ---------------------------------------------------- */
291
292/*
293 * Insert an entry into combiningCPs[] for the new code point code with its flags.
294 * The flags indicate if code combines forward, backward, or both.
295 *
296 * combiningCPs[] contains three sections:
297 * 1. code points that combine forward
298 * 2. code points that combine forward and backward
299 * 3. code points that combine backward
300 *
301 * Search for code in the entire array.
302 * If it is found and already is in the right section (old flags==new flags)
303 * then we are done.
304 * If it is found but the flags are different, then remove it,
305 * union the old and new flags, and reinsert it into its correct section.
306 * If it is not found, then just insert it.
307 *
308 * Within each section, the code points are not sorted.
309 */
310static void
311addCombiningCP(uint32_t code, uint8_t flags) {
312 uint32_t newEntry;
313 uint16_t i;
314
315 newEntry=code|((uint32_t)flags<<24);
316
317 /* search for this code point */
318 for(i=0; i<combineBackTop; ++i) {
319 if(code==(combiningCPs[i]&0xffffff)) {
320 /* found it */
321 if(newEntry==combiningCPs[i]) {
322 return; /* no change */
323 }
324
325 /* combine the flags, remove the old entry from the old place, and insert the new one */
326 newEntry|=combiningCPs[i];
327 if(i!=--combineBackTop) {
328 uprv_memmove(combiningCPs+i, combiningCPs+i+1, (combineBackTop-i)*4);
329 }
330 if(i<combineBothTop) {
331 --combineBothTop;
332 }
333 if(i<combineFwdTop) {
334 --combineFwdTop;
335 }
336 break;
337 }
338 }
339
340 /* not found or modified, insert it */
341 if(combineBackTop>=sizeof(combiningCPs)/4) {
342 fprintf(stderr, "error: gennorm combining code points - trying to use more than %ld units\n",
343 (long)(sizeof(combiningCPs)/4));
344 exit(U_MEMORY_ALLOCATION_ERROR);
345 }
346
347 /* set i to the insertion point */
348 flags=(uint8_t)(newEntry>>24);
349 if(flags==1) {
350 i=combineFwdTop++;
351 ++combineBothTop;
352 } else if(flags==3) {
353 i=combineBothTop++;
354 } else /* flags==2 */ {
355 i=combineBackTop;
356 }
357
358 /* move the following code points up one and insert newEntry at i */
359 if(i<combineBackTop) {
360 uprv_memmove(combiningCPs+i+1, combiningCPs+i, (combineBackTop-i)*4);
361 }
362 combiningCPs[i]=newEntry;
363
364 /* finally increment the total counter */
365 ++combineBackTop;
366}
367
368/**
369 * Find the index in combiningCPs[] where code point code is stored.
370 * @param code code point to look for
371 * @param isLead is code a forward combining code point?
372 * @return index in combiningCPs[] where code is stored
373 */
374static uint16_t
375findCombiningCP(uint32_t code, UBool isLead) {
376 uint16_t i, limit;
377
378 if(isLead) {
379 i=0;
380 limit=combineBothTop;
381 } else {
382 i=combineFwdTop;
383 limit=combineBackTop;
384 }
385
386 /* search for this code point */
387 for(; i<limit; ++i) {
388 if(code==(combiningCPs[i]&0xffffff)) {
389 /* found it */
390 return i;
391 }
392 }
393
394 /* not found */
395 return 0xffff;
396}
397
398static void
399addCombiningTriple(uint32_t lead, uint32_t trail, uint32_t combined) {
400 CombiningTriple *triple;
401
402 /*
403 * set combiningFlags for the two code points
404 * do this after decomposition so that getNorm() above returns NULL
405 * if we do not have actual sub-decomposition data for the initial NFD here
406 */
407 createNorm(lead)->combiningFlags|=1; /* combines forward */
408 createNorm(trail)->combiningFlags|=2; /* combines backward */
409
410 addCombiningCP(lead, 1);
411 addCombiningCP(trail, 2);
412
413 triple=(CombiningTriple *)utm_alloc(combiningTriplesMem);
414 triple->lead=lead;
415 triple->trail=trail;
416 triple->combined=combined;
417}
418
419static int
420compareTriples(const void *l, const void *r) {
421 int diff;
422 diff=(int)((CombiningTriple *)l)->leadIndex-
423 (int)((CombiningTriple *)r)->leadIndex;
424 if(diff==0) {
425 diff=(int)((CombiningTriple *)l)->trailIndex-
426 (int)((CombiningTriple *)r)->trailIndex;
427 }
428 return diff;
429}
430
431static void
432processCombining() {
433 CombiningTriple *triples;
434 uint16_t *p;
435 uint32_t combined;
436 uint16_t i, j, count, tableTop, finalIndex, combinesFwd;
437
438 triples=utm_getStart(combiningTriplesMem);
439
440 /* add lead and trail indexes to the triples for sorting */
374ca955 441 count=(uint16_t)utm_countItems(combiningTriplesMem);
b75a7d8f
A
442 for(i=0; i<count; ++i) {
443 /* findCombiningCP() must always find the code point */
444 triples[i].leadIndex=findCombiningCP(triples[i].lead, TRUE);
445 triples[i].trailIndex=findCombiningCP(triples[i].trail, FALSE);
446 }
447
448 /* sort them by leadIndex, trailIndex */
449 qsort(triples, count, sizeof(CombiningTriple), compareTriples);
450
451 /* calculate final combining indexes and store them in the Norm entries */
452 tableTop=0;
453 j=0; /* triples counter */
454
455 /* first, combining indexes of fwd/both characters are indexes into the combiningTable */
456 for(i=0; i<combineBothTop; ++i) {
457 /* start a new table */
458
459 /* assign combining index */
460 createNorm(combiningCPs[i]&0xffffff)->combiningIndex=combiningIndexes[i]=tableTop;
461
462 /* calculate the length of the combining data for this lead code point in the combiningTable */
463 while(j<count && i==triples[j].leadIndex) {
464 /* count 2 to 3 16-bit units per composition entry (back-index, code point) */
465 combined=triples[j++].combined;
466 if(combined<=0x1fff) {
467 tableTop+=2;
468 } else {
469 tableTop+=3;
470 }
471 }
472 }
473
474 /* second, combining indexes of back-only characters are simply incremented from here to be unique */
475 finalIndex=tableTop;
476 for(; i<combineBackTop; ++i) {
477 createNorm(combiningCPs[i]&0xffffff)->combiningIndex=combiningIndexes[i]=finalIndex++;
478 }
479
480 /* it must be finalIndex<=0x8000 because bit 15 is used in combiningTable as an end-for-this-lead marker */
481 if(finalIndex>0x8000) {
482 fprintf(stderr, "error: gennorm combining table - trying to use %u units, more than the %ld units available\n",
483 tableTop, (long)(sizeof(combiningTable)/4));
484 exit(U_MEMORY_ALLOCATION_ERROR);
485 }
486
487 combiningTableTop=tableTop;
488
489 /* store the combining data in the combiningTable, with the final indexes from above */
490 p=combiningTable;
491 j=0; /* triples counter */
492
493 /*
494 * this is essentially the same loop as above, but
495 * it writes the table data instead of calculating and setting the final indexes;
496 * it is necessary to have two passes so that all the final indexes are known before
497 * they are written into the table
498 */
499 for(i=0; i<combineBothTop; ++i) {
500 /* start a new table */
501
502 combined=0; /* avoid compiler warning */
503
504 /* store the combining data for this lead code point in the combiningTable */
505 while(j<count && i==triples[j].leadIndex) {
506 finalIndex=combiningIndexes[triples[j].trailIndex];
507 combined=triples[j++].combined;
508
509 /* is combined a starter? (i.e., cc==0 && combines forward) */
510 combinesFwd=(uint16_t)((getNorm(combined)->combiningFlags&1)<<13);
511
512 *p++=finalIndex;
513 if(combined<=0x1fff) {
514 *p++=(uint16_t)(combinesFwd|combined);
515 } else if(combined<=0xffff) {
516 *p++=(uint16_t)(0x8000|combinesFwd);
517 *p++=(uint16_t)combined;
518 } else {
519 *p++=(uint16_t)(0xc000|combinesFwd|((combined-0x10000)>>10));
520 *p++=(uint16_t)(0xdc00|(combined&0x3ff));
521 }
522 }
523
524 /* set a marker on the last final trail index in this lead's table */
525 if(combined<=0x1fff) {
526 *(p-2)|=0x8000;
527 } else {
528 *(p-3)|=0x8000;
529 }
530 }
531
532 /* post condition: tableTop==(p-combiningTable) */
533}
534
535/* processing incoming normalization data ----------------------------------- */
536
537/*
538 * Decompose Hangul syllables algorithmically and fill a pseudo-Norm struct.
539 * c must be a Hangul syllable code point.
540 */
541static void
542getHangulDecomposition(uint32_t c, Norm *pHangulNorm, uint32_t hangulBuffer[3]) {
543 /* Hangul syllable: decompose algorithmically */
544 uint32_t c2;
545 uint8_t length;
546
547 uprv_memset(pHangulNorm, 0, sizeof(Norm));
548
549 c-=HANGUL_BASE;
550
551 c2=c%JAMO_T_COUNT;
552 c/=JAMO_T_COUNT;
553 if(c2>0) {
554 hangulBuffer[2]=JAMO_T_BASE+c2;
555 length=3;
556 } else {
557 hangulBuffer[2]=0;
558 length=2;
559 }
560
561 hangulBuffer[1]=JAMO_V_BASE+c%JAMO_V_COUNT;
562 hangulBuffer[0]=JAMO_L_BASE+c/JAMO_V_COUNT;
563
564 pHangulNorm->nfd=pHangulNorm->nfkd=hangulBuffer;
565 pHangulNorm->lenNFD=pHangulNorm->lenNFKD=length;
566}
567
568/*
569 * decompose the one decomposition further, may generate two decompositions
570 * apply all previous characters' decompositions to this one
571 */
572static void
573decompStoreNewNF(uint32_t code, Norm *norm) {
574 uint32_t nfd[40], nfkd[40], hangulBuffer[3];
575 Norm hangulNorm;
576
577 uint32_t *s32;
578 Norm *p;
579 uint32_t c;
580 int32_t i, length;
581 uint8_t lenNFD=0, lenNFKD=0;
582 UBool changedNFD=FALSE, changedNFKD=FALSE;
583
584 if((length=norm->lenNFD)!=0) {
585 /* always allocate the original string */
586 changedNFD=TRUE;
587 s32=norm->nfd;
588 } else if((length=norm->lenNFKD)!=0) {
589 /* always allocate the original string */
590 changedNFKD=TRUE;
591 s32=norm->nfkd;
592 } else {
593 /* no decomposition here, nothing to do */
594 return;
595 }
596
597 /* decompose each code point */
598 for(i=0; i<length; ++i) {
599 c=s32[i];
600 p=getNorm(c);
601 if(p==NULL) {
602 if(HANGUL_BASE<=c && c<(HANGUL_BASE+HANGUL_COUNT)) {
603 getHangulDecomposition(c, &hangulNorm, hangulBuffer);
604 p=&hangulNorm;
605 } else {
606 /* no data, no decomposition */
607 nfd[lenNFD++]=c;
608 nfkd[lenNFKD++]=c;
609 continue;
610 }
611 }
612
613 /* canonically decompose c */
614 if(changedNFD) {
615 if(p->lenNFD!=0) {
616 uprv_memcpy(nfd+lenNFD, p->nfd, p->lenNFD*4);
617 lenNFD+=p->lenNFD;
618 } else {
619 nfd[lenNFD++]=c;
620 }
621 }
622
623 /* compatibility-decompose c */
624 if(p->lenNFKD!=0) {
625 uprv_memcpy(nfkd+lenNFKD, p->nfkd, p->lenNFKD*4);
626 lenNFKD+=p->lenNFKD;
627 changedNFKD=TRUE;
628 } else if(p->lenNFD!=0) {
629 uprv_memcpy(nfkd+lenNFKD, p->nfd, p->lenNFD*4);
630 lenNFKD+=p->lenNFD;
631 changedNFKD=TRUE;
632 } else {
633 nfkd[lenNFKD++]=c;
634 }
635 }
636
637 /* assume that norm->lenNFD==1 or ==2 */
638 if(norm->lenNFD==2 && !(norm->combiningFlags&0x80)) {
639 addCombiningTriple(s32[0], s32[1], code);
640 }
641
642 if(changedNFD) {
643 if(lenNFD!=0) {
644 s32=utm_allocN(utf32Mem, lenNFD);
645 uprv_memcpy(s32, nfd, lenNFD*4);
646 } else {
647 s32=NULL;
648 }
649 norm->lenNFD=lenNFD;
650 norm->nfd=s32;
651 setHaveSeenString(nfd, lenNFD);
652 }
653 if(changedNFKD) {
654 if(lenNFKD!=0) {
655 s32=utm_allocN(utf32Mem, lenNFKD);
656 uprv_memcpy(s32, nfkd, lenNFKD*4);
657 } else {
658 s32=NULL;
659 }
660 norm->lenNFKD=lenNFKD;
661 norm->nfkd=s32;
662 setHaveSeenString(nfkd, lenNFKD);
663 }
664}
665
666typedef struct DecompSingle {
667 uint32_t c;
668 Norm *norm;
669} DecompSingle;
670
671/*
672 * apply this one character's decompositions (there is at least one!) to
673 * all previous characters' decompositions to decompose them further
674 */
675static void
676decompWithSingleFn(void *context, uint32_t code, Norm *norm) {
677 uint32_t nfd[40], nfkd[40];
678 uint32_t *s32;
679 DecompSingle *me=(DecompSingle *)context;
680 uint32_t c, myC;
681 int32_t i, length;
682 uint8_t lenNFD=0, lenNFKD=0, myLenNFD, myLenNFKD;
683 UBool changedNFD=FALSE, changedNFKD=FALSE;
684
685 /* get the new character's data */
686 myC=me->c;
687 myLenNFD=me->norm->lenNFD;
688 myLenNFKD=me->norm->lenNFKD;
689 /* assume that myC has at least one decomposition */
690
691 if((length=norm->lenNFD)!=0 && myLenNFD!=0) {
692 /* apply NFD(myC) to norm->nfd */
693 s32=norm->nfd;
694 for(i=0; i<length; ++i) {
695 c=s32[i];
696 if(c==myC) {
697 uprv_memcpy(nfd+lenNFD, me->norm->nfd, myLenNFD*4);
698 lenNFD+=myLenNFD;
699 changedNFD=TRUE;
700 } else {
701 nfd[lenNFD++]=c;
702 }
703 }
704 }
705
706 if((length=norm->lenNFKD)!=0) {
707 /* apply NFD(myC) and NFKD(myC) to norm->nfkd */
708 s32=norm->nfkd;
709 for(i=0; i<length; ++i) {
710 c=s32[i];
711 if(c==myC) {
712 if(myLenNFKD!=0) {
713 uprv_memcpy(nfkd+lenNFKD, me->norm->nfkd, myLenNFKD*4);
714 lenNFKD+=myLenNFKD;
715 } else /* assume myLenNFD!=0 */ {
716 uprv_memcpy(nfkd+lenNFKD, me->norm->nfd, myLenNFD*4);
717 lenNFKD+=myLenNFD;
718 }
719 changedNFKD=TRUE;
720 } else {
721 nfkd[lenNFKD++]=c;
722 }
723 }
724 } else if((length=norm->lenNFD)!=0 && myLenNFKD!=0) {
725 /* apply NFKD(myC) to norm->nfd, forming a new norm->nfkd */
726 s32=norm->nfd;
727 for(i=0; i<length; ++i) {
728 c=s32[i];
729 if(c==myC) {
730 uprv_memcpy(nfkd+lenNFKD, me->norm->nfkd, myLenNFKD*4);
731 lenNFKD+=myLenNFKD;
732 changedNFKD=TRUE;
733 } else {
734 nfkd[lenNFKD++]=c;
735 }
736 }
737 }
738
739 /* set the new decompositions, forget the old ones */
740 if(changedNFD) {
741 if(lenNFD!=0) {
742 if(lenNFD>norm->lenNFD) {
743 s32=utm_allocN(utf32Mem, lenNFD);
744 } else {
745 s32=norm->nfd;
746 }
747 uprv_memcpy(s32, nfd, lenNFD*4);
748 } else {
749 s32=NULL;
750 }
751 norm->lenNFD=lenNFD;
752 norm->nfd=s32;
753 }
754 if(changedNFKD) {
755 if(lenNFKD!=0) {
756 if(lenNFKD>norm->lenNFKD) {
757 s32=utm_allocN(utf32Mem, lenNFKD);
758 } else {
759 s32=norm->nfkd;
760 }
761 uprv_memcpy(s32, nfkd, lenNFKD*4);
762 } else {
763 s32=NULL;
764 }
765 norm->lenNFKD=lenNFKD;
766 norm->nfkd=s32;
767 }
768}
769
770/*
771 * process the data for one code point listed in UnicodeData;
772 * UnicodeData itself never maps a code point to both NFD and NFKD
773 */
774extern void
775storeNorm(uint32_t code, Norm *norm) {
776 DecompSingle decompSingle;
777 Norm *p;
778
779 /* copy existing derived normalization properties */
780 p=createNorm(code);
781 norm->qcFlags=p->qcFlags;
782 norm->combiningFlags=p->combiningFlags;
783 norm->fncIndex=p->fncIndex;
784
785 /* process the decomposition if if there is at one here */
786 if((norm->lenNFD|norm->lenNFKD)!=0) {
787 /* decompose this one decomposition further, may generate two decompositions */
788 decompStoreNewNF(code, norm);
789
790 /* has this code point been used in previous decompositions? */
791 if(HAVE_SEEN(code)) {
792 /* use this decomposition to decompose other decompositions further */
793 decompSingle.c=code;
794 decompSingle.norm=norm;
795 enumTrie(decompWithSingleFn, &decompSingle);
796 }
797 }
798
799 /* store the data */
800 uprv_memcpy(p, norm, sizeof(Norm));
801}
802
803extern void
804setQCFlags(uint32_t code, uint8_t qcFlags) {
805 createNorm(code)->qcFlags|=qcFlags;
806
807 /* adjust the minimum code point for quick check no/maybe */
808 if(code<0xffff) {
809 if((qcFlags&_NORM_QC_NFC) && (uint16_t)code<indexes[_NORM_INDEX_MIN_NFC_NO_MAYBE]) {
810 indexes[_NORM_INDEX_MIN_NFC_NO_MAYBE]=(uint16_t)code;
811 }
812 if((qcFlags&_NORM_QC_NFKC) && (uint16_t)code<indexes[_NORM_INDEX_MIN_NFKC_NO_MAYBE]) {
813 indexes[_NORM_INDEX_MIN_NFKC_NO_MAYBE]=(uint16_t)code;
814 }
815 if((qcFlags&_NORM_QC_NFD) && (uint16_t)code<indexes[_NORM_INDEX_MIN_NFD_NO_MAYBE]) {
816 indexes[_NORM_INDEX_MIN_NFD_NO_MAYBE]=(uint16_t)code;
817 }
818 if((qcFlags&_NORM_QC_NFKD) && (uint16_t)code<indexes[_NORM_INDEX_MIN_NFKD_NO_MAYBE]) {
819 indexes[_NORM_INDEX_MIN_NFKD_NO_MAYBE]=(uint16_t)code;
820 }
821 }
374ca955
A
822
823 if(qcFlags&_NORM_QC_NFD) {
824 uset_add(nfdQCNoSet, (UChar32)code);
825 }
b75a7d8f
A
826}
827
828extern void
829setCompositionExclusion(uint32_t code) {
830 createNorm(code)->combiningFlags|=0x80;
831}
832
833static void
834setHangulJamoSpecials() {
835 Norm *norm;
836 uint32_t c, hangul;
837
838 /*
839 * Hangul syllables are algorithmically decomposed into Jamos,
840 * and Jamos are algorithmically composed into Hangul syllables.
841 * The quick check flags are parsed, except for Hangul.
842 */
843
844 /* set Jamo L specials */
845 hangul=0xac00;
846 for(c=0x1100; c<=0x1112; ++c) {
847 norm=createNorm(c);
848 norm->specialTag=_NORM_EXTRA_INDEX_TOP+_NORM_EXTRA_JAMO_L;
849 norm->combiningFlags=1;
850
851 /* for each Jamo L create a set with its associated Hangul block */
852 norm->canonStart=uset_open(hangul, hangul+21*28-1);
853 hangul+=21*28;
854 }
855
856 /* set Jamo V specials */
857 for(c=0x1161; c<=0x1175; ++c) {
858 norm=createNorm(c);
859 norm->specialTag=_NORM_EXTRA_INDEX_TOP+_NORM_EXTRA_JAMO_V;
860 norm->combiningFlags=2;
861 norm->unsafeStart=TRUE;
862 }
863
864 /* set Jamo T specials */
865 for(c=0x11a8; c<=0x11c2; ++c) {
866 norm=createNorm(c);
867 norm->specialTag=_NORM_EXTRA_INDEX_TOP+_NORM_EXTRA_JAMO_T;
868 norm->combiningFlags=2;
869 norm->unsafeStart=TRUE;
870 }
871
872 /* set Hangul specials, precompacted */
873 norm=(Norm *)utm_alloc(normMem);
874 norm->specialTag=_NORM_EXTRA_INDEX_TOP+_NORM_EXTRA_HANGUL;
875 norm->qcFlags=_NORM_QC_NFD|_NORM_QC_NFKD;
876
374ca955 877 if(!utrie_setRange32(normTrie, 0xac00, 0xd7a4, (uint32_t)(norm-norms), TRUE)) {
b75a7d8f
A
878 fprintf(stderr, "error: too many normalization entries (setting Hangul)\n");
879 exit(U_BUFFER_OVERFLOW_ERROR);
880 }
881}
882
883/*
884 * set FC-NFKC-Closure string
885 * s contains the closure string; s[0]==length, s[1..length] is the actual string
886 * may modify s[0]
887 */
888U_CFUNC void
889setFNC(uint32_t c, UChar *s) {
890 uint16_t *p;
891 int32_t length, i, count;
892 UChar first;
893
894 count=utm_countItems(extraMem);
895 length=s[0];
896 first=s[1];
897
898 /* try to overlay single-unit strings with existing ones */
899 if(length==1 && first<0xff00) {
900 p=utm_getStart(extraMem);
901 for(i=1; i<count; ++i) {
902 if(first==p[i]) {
903 break;
904 }
905 }
906 } else {
907 i=count;
908 }
909
910 /* append the new string if it cannot be overlayed with an old one */
911 if(i==count) {
912 if(count>_NORM_AUX_MAX_FNC) {
913 fprintf(stderr, "gennorm error: too many FNC strings\n");
914 exit(U_INDEX_OUTOFBOUNDS_ERROR);
915 }
916
917 /* prepend 0xffxx with xx==length */
918 s[0]=(uint16_t)(0xff00+length);
919 ++length;
920 p=(uint16_t *)utm_allocN(extraMem, length);
921 uprv_memcpy(p, s, length*2);
922
923 /* update the top index in extraMem[0] */
924 count+=length;
925 ((uint16_t *)utm_getStart(extraMem))[0]=(uint16_t)count;
926 }
927
928 /* store the index to the string */
929 createNorm(c)->fncIndex=i;
930}
931
932/* build runtime structures ------------------------------------------------- */
933
934/* canonically reorder a UTF-32 string; return { leadCC, trailCC } */
935static uint16_t
936reorderString(uint32_t *s, int32_t length) {
937 uint8_t ccs[40];
938 uint32_t c;
939 int32_t i, j;
940 uint8_t cc, prevCC;
941
942 if(length<=0) {
943 return 0;
944 }
945
946 for(i=0; i<length; ++i) {
947 /* get the i-th code point and its combining class */
948 c=s[i];
949 cc=getCCFromCP(c);
950 if(cc!=0 && i!=0) {
951 /* it is a combining mark, see if it needs to be moved back */
952 j=i;
953 do {
954 prevCC=ccs[j-1];
955 if(prevCC<=cc) {
956 break; /* found the right place */
957 }
958 /* move the previous code point here and go back */
959 s[j]=s[j-1];
960 ccs[j]=prevCC;
961 } while(--j!=0);
962 s[j]=c;
963 ccs[j]=cc;
964 } else {
965 /* just store the combining class */
966 ccs[i]=cc;
967 }
968 }
969
970 return (uint16_t)(((uint16_t)ccs[0]<<8)|ccs[length-1]);
971}
972
973static UBool combineAndQC[64]={ 0 };
974
975/*
976 * canonically reorder the up to two decompositions
977 * and store the leading and trailing combining classes accordingly
978 *
979 * also process canonical decompositions for canonical closure
980 */
981static void
982postParseFn(void *context, uint32_t code, Norm *norm) {
983 int32_t length;
984
985 /* canonically order the NFD */
986 length=norm->lenNFD;
987 if(length>0) {
988 norm->canonBothCCs=reorderString(norm->nfd, length);
989 }
990
991 /* canonically reorder the NFKD */
992 length=norm->lenNFKD;
993 if(length>0) {
994 norm->compatBothCCs=reorderString(norm->nfkd, length);
995 }
996
997 /* verify that code has a decomposition if and only if the quick check flags say "no" on NF(K)D */
998 if((norm->lenNFD!=0) != ((norm->qcFlags&_NORM_QC_NFD)!=0)) {
999 fprintf(stderr, "gennorm warning: U+%04lx has NFD[%d] but quick check 0x%02x\n", (long)code, norm->lenNFD, norm->qcFlags);
1000 }
1001 if(((norm->lenNFD|norm->lenNFKD)!=0) != ((norm->qcFlags&(_NORM_QC_NFD|_NORM_QC_NFKD))!=0)) {
1002 fprintf(stderr, "gennorm warning: U+%04lx has NFD[%d] NFKD[%d] but quick check 0x%02x\n", (long)code, norm->lenNFD, norm->lenNFKD, norm->qcFlags);
1003 }
1004
1005 /* see which combinations of combiningFlags and qcFlags are used for NFC/NFKC */
1006 combineAndQC[(norm->qcFlags&0x33)|((norm->combiningFlags&3)<<2)]=1;
1007
1008 if(norm->combiningFlags&1) {
1009 if(norm->udataCC!=0) {
1010 /* illegal - data-derivable composition exclusion */
1011 fprintf(stderr, "gennorm warning: U+%04lx combines forward but udataCC==%u\n", (long)code, norm->udataCC);
1012 }
1013 }
1014 if(norm->combiningFlags&2) {
1015 if((norm->qcFlags&0x11)==0) {
1016 fprintf(stderr, "gennorm warning: U+%04lx combines backward but qcNF?C==0\n", (long)code);
1017 }
1018#if 0
1019 /* occurs sometimes, this one is ok (therefore #if 0) - still here for documentation */
1020 if(norm->udataCC==0) {
1021 printf("U+%04lx combines backward but udataCC==0\n", (long)code);
1022 }
1023#endif
1024 }
1025 if((norm->combiningFlags&3)==3 && beVerbose) {
1026 printf("U+%04lx combines both ways\n", (long)code);
1027 }
1028
1029 /*
1030 * process canonical decompositions for canonical closure
1031 *
1032 * in each canonical decomposition:
1033 * add the current character (code) to the set of canonical starters of its norm->nfd[0]
1034 * set the "unsafe starter" flag for each norm->nfd[1..]
1035 */
1036 length=norm->lenNFD;
1037 if(length>0) {
1038 Norm *otherNorm;
1039 UChar32 c;
1040 int32_t i;
1041
1042 /* nfd[0].canonStart.add(code) */
1043 c=norm->nfd[0];
1044 otherNorm=createNorm(c);
1045 if(otherNorm->canonStart==NULL) {
1046 otherNorm->canonStart=uset_open(code, code);
1047 if(otherNorm->canonStart==NULL) {
1048 fprintf(stderr, "gennorm error: out of memory in uset_open()\n");
1049 exit(U_MEMORY_ALLOCATION_ERROR);
1050 }
1051 } else {
1052 uset_add(otherNorm->canonStart, code);
1053 if(!uset_contains(otherNorm->canonStart, code)) {
374ca955 1054 fprintf(stderr, "gennorm error: uset_add(setOf(U+%4x), U+%4x)\n", (int)c, (int)code);
b75a7d8f
A
1055 exit(U_INTERNAL_PROGRAM_ERROR);
1056 }
1057 }
1058
1059 /* for(i=1..length-1) nfd[i].unsafeStart=TRUE */
1060 for(i=1; i<length; ++i) {
1061 createNorm(norm->nfd[i])->unsafeStart=TRUE;
1062 }
1063 }
1064}
1065
1066static uint32_t
1067make32BitNorm(Norm *norm) {
1068 UChar extra[100];
1069 const Norm *other;
1070 uint32_t word;
1071 int32_t i, length, beforeZero=0, count, start;
1072
1073 /*
1074 * Check for assumptions:
1075 *
1076 * Test that if a "true starter" (cc==0 && NF*C_YES) decomposes,
1077 * then the decomposition also begins with a true starter.
1078 */
1079 if(norm->udataCC==0) {
1080 /* this is a starter */
1081 if((norm->qcFlags&_NORM_QC_NFC)==0 && norm->lenNFD>0) {
1082 /* a "true" NFC starter with a canonical decomposition */
1083 if( norm->canonBothCCs>=0x100 || /* lead cc!=0 or */
1084 ((other=getNorm(norm->nfd[0]))!=NULL && (other->qcFlags&_NORM_QC_NFC)!=0) /* nfd[0] not NFC_YES */
1085 ) {
1086 fprintf(stderr,
1087 "error: true NFC starter canonical decomposition[%u] does not begin\n"
1088 " with a true NFC starter: U+%04lx U+%04lx%s\n",
1089 norm->lenNFD, (long)norm->nfd[0], (long)norm->nfd[1],
1090 norm->lenNFD<=2 ? "" : " ...");
1091 exit(U_INVALID_TABLE_FILE);
1092 }
1093 }
1094
1095 if((norm->qcFlags&_NORM_QC_NFKC)==0) {
1096 if(norm->lenNFKD>0) {
1097 /* a "true" NFKC starter with a compatibility decomposition */
1098 if( norm->compatBothCCs>=0x100 || /* lead cc!=0 or */
1099 ((other=getNorm(norm->nfkd[0]))!=NULL && (other->qcFlags&_NORM_QC_NFKC)!=0) /* nfkd[0] not NFC_YES */
1100 ) {
1101 fprintf(stderr,
1102 "error: true NFKC starter compatibility decomposition[%u] does not begin\n"
1103 " with a true NFKC starter: U+%04lx U+%04lx%s\n",
1104 norm->lenNFKD, (long)norm->nfkd[0], (long)norm->nfkd[1], norm->lenNFKD<=2 ? "" : " ...");
1105 exit(U_INVALID_TABLE_FILE);
1106 }
1107 } else if(norm->lenNFD>0) {
1108 /* a "true" NFKC starter with only a canonical decomposition */
1109 if( norm->canonBothCCs>=0x100 || /* lead cc!=0 or */
1110 ((other=getNorm(norm->nfd[0]))!=NULL && (other->qcFlags&_NORM_QC_NFKC)!=0) /* nfd[0] not NFC_YES */
1111 ) {
1112 fprintf(stderr,
1113 "error: true NFKC starter canonical decomposition[%u] does not begin\n"
1114 " with a true NFKC starter: U+%04lx U+%04lx%s\n",
1115 norm->lenNFD, (long)norm->nfd[0], (long)norm->nfd[1],
1116 norm->lenNFD<=2 ? "" : " ...");
1117 exit(U_INVALID_TABLE_FILE);
1118 }
1119 }
1120 }
1121 }
1122
1123 /* reset the 32-bit word and set the quick check flags */
1124 word=norm->qcFlags;
1125
1126 /* set the UnicodeData combining class */
1127 word|=(uint32_t)norm->udataCC<<_NORM_CC_SHIFT;
1128
1129 /* set the combining flag and index */
1130 if(norm->combiningFlags&3) {
1131 word|=(uint32_t)(norm->combiningFlags&3)<<6;
1132 }
1133
1134 /* set the combining index value into the extra data */
1135 if(norm->combiningIndex!=0) {
1136 extra[0]=norm->combiningIndex;
1137 beforeZero=1;
1138 }
1139
1140 count=beforeZero;
1141
1142 /* write the decompositions */
1143 if((norm->lenNFD|norm->lenNFKD)!=0) {
1144 extra[count++]=0; /* set the pieces when available, into extra[beforeZero] */
1145
1146 length=norm->lenNFD;
1147 if(length>0) {
1148 if(norm->canonBothCCs!=0) {
1149 extra[beforeZero]|=0x80;
1150 extra[count++]=norm->canonBothCCs;
1151 }
1152 start=count;
1153 for(i=0; i<length; ++i) {
1154 UTF_APPEND_CHAR_UNSAFE(extra, count, norm->nfd[i]);
1155 }
1156 extra[beforeZero]|=(UChar)(count-start); /* set the decomp length as the number of UTF-16 code units */
1157 }
1158
1159 length=norm->lenNFKD;
1160 if(length>0) {
1161 if(norm->compatBothCCs!=0) {
1162 extra[beforeZero]|=0x8000;
1163 extra[count++]=norm->compatBothCCs;
1164 }
1165 start=count;
1166 for(i=0; i<length; ++i) {
1167 UTF_APPEND_CHAR_UNSAFE(extra, count, norm->nfkd[i]);
1168 }
1169 extra[beforeZero]|=(UChar)((count-start)<<8); /* set the decomp length as the number of UTF-16 code units */
1170 }
1171 }
1172
1173 /* allocate and copy the extra data */
1174 if(count!=0) {
1175 UChar *p;
1176
1177 if(norm->specialTag!=0) {
1178 fprintf(stderr, "error: gennorm - illegal to have both extra data and a special tag (0x%x)\n", norm->specialTag);
1179 exit(U_ILLEGAL_ARGUMENT_ERROR);
1180 }
1181
1182 p=(UChar *)utm_allocN(extraMem, count);
1183 uprv_memcpy(p, extra, count*2);
1184
1185 /* set the extra index, offset by beforeZero */
1186 word|=(uint32_t)(beforeZero+(p-(UChar *)utm_getStart(extraMem)))<<_NORM_EXTRA_SHIFT;
1187 } else if(norm->specialTag!=0) {
1188 /* set a special tag instead of an extra index */
1189 word|=(uint32_t)norm->specialTag<<_NORM_EXTRA_SHIFT;
1190 }
1191
1192 return word;
1193}
1194
1195/* turn all Norm structs into corresponding 32-bit norm values */
1196static void
1197makeAll32() {
1198 uint32_t *pNormData;
1199 uint32_t n;
1200 int32_t i, normLength, count;
1201
374ca955 1202 count=(int32_t)utm_countItems(normMem);
b75a7d8f
A
1203 for(i=0; i<count; ++i) {
1204 norms[i].value32=make32BitNorm(norms+i);
1205 }
1206
374ca955 1207 pNormData=utrie_getData(norm32Trie, &normLength);
b75a7d8f
A
1208
1209 count=0;
1210 for(i=0; i<normLength; ++i) {
1211 n=pNormData[i];
1212 if(0!=(pNormData[i]=norms[n].value32)) {
1213 ++count;
1214 }
1215 }
1216}
1217
1218/*
1219 * extract all Norm.canonBothCCs into the FCD table
1220 * set 32-bit values to use the common fold and compact functions
1221 */
1222static void
1223makeFCD() {
1224 uint32_t *pFCDData;
1225 uint32_t n;
1226 int32_t i, count, fcdLength;
1227 uint16_t bothCCs;
1228
374ca955 1229 count=utm_countItems(normMem);
b75a7d8f
A
1230 for(i=0; i<count; ++i) {
1231 bothCCs=norms[i].canonBothCCs;
1232 if(bothCCs==0) {
1233 /* if there are no decomposition cc's then use the udataCC twice */
1234 bothCCs=norms[i].udataCC;
1235 bothCCs|=bothCCs<<8;
1236 }
1237 norms[i].value32=bothCCs;
1238 }
1239
374ca955 1240 pFCDData=utrie_getData(fcdTrie, &fcdLength);
b75a7d8f
A
1241
1242 for(i=0; i<fcdLength; ++i) {
1243 n=pFCDData[i];
1244 pFCDData[i]=norms[n].value32;
1245 }
1246}
1247
1248/**
1249 * If the given set contains exactly one character, then return it.
1250 * Otherwise return -1.
1251 */
1252static int32_t
1253usetContainsOne(const USet* set) {
1254 if (uset_size(set) == 1) { /* ### faster to count ranges and check only range?! */
1255 UChar32 start, end;
1256 UErrorCode ec = U_ZERO_ERROR;
1257 int32_t len = uset_getItem(set, 0, &start, &end, NULL, 0, &ec);
1258 if (len == 0) return start;
1259 }
1260 return -1;
1261}
1262
1263static void
1264makeCanonSetFn(void *context, uint32_t code, Norm *norm) {
1265 if(norm->canonStart!=NULL && !uset_isEmpty(norm->canonStart)) {
1266 uint16_t *table;
1267 int32_t c, tableLength;
1268 UErrorCode errorCode=U_ZERO_ERROR;
1269
1270 /* does the set contain exactly one code point? */
1271 c=usetContainsOne(norm->canonStart); /* ### why? */
1272
1273 /* add an entry to the BMP or supplementary search table */
1274 if(code<=0xffff) {
1275 table=canonStartSets+_NORM_MAX_CANON_SETS;
1276 tableLength=canonStartSets[_NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH];
1277
1278 table[tableLength++]=(uint16_t)code;
1279
1280 if(c>=0 && c<=0xffff && (c&_NORM_CANON_SET_BMP_MASK)!=_NORM_CANON_SET_BMP_IS_INDEX) {
1281 /* single-code point BMP result for BMP code point */
1282 table[tableLength++]=(uint16_t)c;
1283 } else {
1284 table[tableLength++]=(uint16_t)(_NORM_CANON_SET_BMP_IS_INDEX|canonStartSetsTop);
1285 c=-1;
1286 }
1287 canonStartSets[_NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH]=(uint16_t)tableLength;
1288 } else {
1289 table=canonStartSets+_NORM_MAX_CANON_SETS+_NORM_MAX_SET_SEARCH_TABLE_LENGTH;
1290 tableLength=canonStartSets[_NORM_SET_INDEX_CANON_SUPP_TABLE_LENGTH];
1291
1292 table[tableLength++]=(uint16_t)(code>>16);
1293 table[tableLength++]=(uint16_t)code;
1294
1295 if(c>=0) {
1296 /* single-code point result for supplementary code point */
1297 table[tableLength-2]|=(uint16_t)(0x8000|((c>>8)&0x1f00)); /* ### how does this work again? */
1298 table[tableLength++]=(uint16_t)c;
1299 } else {
1300 table[tableLength++]=(uint16_t)canonStartSetsTop;
1301 }
1302 canonStartSets[_NORM_SET_INDEX_CANON_SUPP_TABLE_LENGTH]=(uint16_t)tableLength;
1303 }
1304
1305 if(c<0) {
1306 /* write a USerializedSet */
1307 ++canonSetsCount;
1308 canonStartSetsTop+=
1309 uset_serialize(norm->canonStart,
1310 canonStartSets+canonStartSetsTop,
1311 _NORM_MAX_CANON_SETS-canonStartSetsTop,
1312 &errorCode);
1313 }
1314 canonStartSets[_NORM_SET_INDEX_CANON_SETS_LENGTH]=(uint16_t)canonStartSetsTop;
1315
1316 if(U_FAILURE(errorCode)) {
374ca955 1317 fprintf(stderr, "gennorm error: uset_serialize()->%s (canonStartSetsTop=%d)\n", u_errorName(errorCode), (int)canonStartSetsTop);
b75a7d8f
A
1318 exit(errorCode);
1319 }
1320 if(tableLength>_NORM_MAX_SET_SEARCH_TABLE_LENGTH) {
1321 fprintf(stderr, "gennorm error: search table for canonical starter sets too long\n");
1322 exit(U_INDEX_OUTOFBOUNDS_ERROR);
1323 }
1324 }
1325}
1326
1327/* for getSkippableFlags ---------------------------------------------------- */
1328
1329/* combine the lead and trail code points; return <0 if they do not combine */
1330static int32_t
1331combine(uint32_t lead, uint32_t trail) {
1332 CombiningTriple *triples;
1333 uint32_t i, count;
1334
1335 /* search for all triples with c as lead code point */
1336 triples=utm_getStart(combiningTriplesMem);
374ca955 1337 count=utm_countItems(combiningTriplesMem);
b75a7d8f
A
1338
1339 /* triples are not sorted by code point but for each lead CP there is one contiguous block */
1340 for(i=0; i<count && lead!=triples[i].lead; ++i) {}
1341
1342 /* check each triple for this code point */
1343 for(; i<count && lead==triples[i].lead; ++i) {
1344 if(trail==triples[i].trail) {
1345 return (int32_t)triples[i].combined;
1346 }
1347 }
1348
1349 return -1;
1350}
1351
1352/*
1353 * Starting from the canonical decomposition s[0..length[ of a single code point,
1354 * is the code point c consumed in an NFC/FCC recomposition?
1355 *
1356 * No need to handle discontiguous composition because that would not consume some
1357 * intermediate character, so would not compose back to the original character.
1358 * See comments in canChangeWithFollowing().
1359 *
1360 * No need to compose beyond where c canonically orders because if it is consumed
1361 * then the result differs from the original anyway.
1362 *
1363 * Possible optimization:
1364 * - Verify that there are no cases of the same combining mark stacking twice.
1365 * - return FALSE right away if c inserts after a copy of itself
1366 * without attempting to recompose; will happen because each mark in
1367 * the decomposition will be enumerated and passed in as c.
1368 * More complicated and fragile though than it is already.
1369 *
1370 * markus 2002nov04
1371 */
1372static UBool
1373doesComposeConsume(const uint32_t *s, int32_t length, uint32_t c, uint8_t cc) {
1374 int32_t starter, i;
1375
1376 /* ignore trailing characters where cc<prevCC */
1377 while(length>1 && cc<getCCFromCP(s[length-1])) {
1378 --length;
1379 }
1380
1381 /* start consuming/combining from the beginning */
1382 starter=(int32_t)s[0];
1383 for(i=1; i<length; ++i) {
1384 starter=combine((uint32_t)starter, s[i]);
1385 if(starter<0) {
374ca955
A
1386 fprintf(stderr, "error: unable to consume normal decomposition in doesComposeConsume(<%04x, %04x, ...>[%d], U+%04x, %u)\n",
1387 (int)s[0], (int)s[1], (int)length, (int)c, cc);
b75a7d8f
A
1388 exit(U_INTERNAL_PROGRAM_ERROR);
1389 }
1390 }
1391
1392 /* try to combine/consume c, return TRUE if it is consumed */
1393 return combine((uint32_t)starter, c)>=0;
1394}
1395
1396/* does the starter s[0] combine forward with another char that is below trailCC? */
1397static UBool
1398canChangeWithFollowing(const uint32_t *s, int32_t length, uint8_t trailCC) {
1399 if(trailCC<=1) {
1400 /* no character will combine ahead of the trailing char of the decomposition */
1401 return FALSE;
1402 }
1403
1404 /*
1405 * We are only checking skippable condition (f).
1406 * Therefore, the original character does not have quick check flag NFC_NO (c),
1407 * i.e., the decomposition recomposes completely back into the original code point.
1408 * So s[0] must be a true starter with cc==0 and
1409 * combining with following code points.
1410 *
1411 * Similarly, length==1 is not possible because that would be a singleton
1412 * decomposition which is marked with NFC_NO and does not pass (c).
1413 *
1414 * Only a character with cc<trailCC can change the composition.
1415 * Reason: A char with cc>=trailCC would order after decomposition s[],
1416 * composition would consume all of the decomposition, and here we know that
1417 * the original char passed check d), i.e., it does not combine forward,
1418 * therefore does not combine with anything after the decomposition is consumed.
1419 *
1420 * Now see if there is a character that
1421 * 1. combines backward
1422 * 2. has cc<trailCC
1423 * 3. is consumed in recomposition
1424 *
1425 * length==2 is simple:
1426 *
1427 * Characters that fulfill these conditions are exactly the ones that combine directly
1428 * with the starter c==s[0] because there is no intervening character after
1429 * reordering.
1430 * We can just enumerate all chars with which c combines (they all pass 1. and 3.)
1431 * and see if one has cc<trailCC (passes 2.).
1432 *
1433 * length>2 is a little harder:
1434 *
1435 * Since we will get different starters during recomposition, we need to
1436 * enumerate each backward-combining character (1.)
1437 * with cc<trailCC (2.) and
1438 * see if it gets consumed in recomposition. (3.)
1439 * No need to enumerate both-ways combining characters because they must have cc==0.
1440 */
1441 if(length==2) {
1442 /* enumerate all chars that combine with this one and check their cc */
1443 CombiningTriple *triples;
1444 uint32_t c, i, count;
1445 uint8_t cc;
1446
1447 /* search for all triples with c as lead code point */
1448 triples=utm_getStart(combiningTriplesMem);
374ca955 1449 count=utm_countItems(combiningTriplesMem);
b75a7d8f
A
1450 c=s[0];
1451
1452 /* triples are not sorted by code point but for each lead CP there is one contiguous block */
1453 for(i=0; i<count && c!=triples[i].lead; ++i) {}
1454
1455 /* check each triple for this code point */
1456 for(; i<count && c==triples[i].lead; ++i) {
1457 cc=getCCFromCP(triples[i].trail);
1458 if(cc>0 && cc<trailCC) {
1459 /* this trail code point combines with c and has cc<trailCC */
1460 return TRUE;
1461 }
1462 }
1463 } else {
1464 /* enumerate all chars that combine backward */
1465 uint32_t c2;
1466 uint16_t i;
1467 uint8_t cc;
1468
1469 for(i=combineBothTop; i<combineBackTop; ++i) {
1470 c2=combiningCPs[i]&0xffffff;
1471 cc=getCCFromCP(c2);
1472 /* pass in length-1 because we already know that c2 will insert before the last character with trailCC */
1473 if(cc>0 && cc<trailCC && doesComposeConsume(s, length-1, c2, cc)) {
1474 return TRUE;
1475 }
1476 }
1477 }
1478
1479 /* this decomposition is not modified by any appended character */
1480 return FALSE;
1481}
1482
1483/* see unormimp.h for details on NF*C Skippable flags */
1484static uint32_t
1485getSkippableFlags(const Norm *norm) {
1486 /* ignore NF*D skippable properties because they are covered by norm32, test at runtime */
1487
1488 /* ignore Hangul, test those at runtime (LV Hangul are not skippable) */
1489 if(norm->specialTag==_NORM_EXTRA_INDEX_TOP+_NORM_EXTRA_HANGUL) {
1490 return 0;
1491 }
1492
1493 /* ### check other data generation functions whether they should & do ignore Hangul/Jamo specials */
1494
1495 /*
1496 * Note:
1497 * This function returns a non-zero flag only if (a)..(e) indicate skippable but (f) does not.
1498 *
1499 * This means that (a)..(e) must always be derived from the runtime norm32 value,
1500 * and (f) be checked from the auxTrie if the character is skippable per (a)..(e),
1501 * the form is NF*C and there is a canonical decomposition (NFD_NO).
1502 *
1503 * (a) unassigned code points get "not skippable"==false because they
1504 * don't have a Norm struct so they won't get here
1505 */
1506
1507 /* (b) not skippable if cc!=0 */
1508 if(norm->udataCC!=0) {
1509 return 0; /* non-zero flag for (f) only */
1510 }
1511
1512 /*
1513 * not NFC_Skippable if
1514 * (c) quick check flag == NO or
1515 * (d) combines forward or
1516 * (e) combines back or
1517 * (f) can change if another character is added
1518 *
1519 * for (f):
1520 * For NF*C: Get corresponding decomposition, get its last starter (cc==0),
1521 * check its composition list,
1522 * see if any of the second code points in the list
1523 * has cc less than the trailCC of the decomposition.
1524 *
1525 * For FCC: Test at runtime if the decomposition has a trailCC>1
1526 * -> there are characters with cc==1, they would order before the trail char
1527 * and prevent contiguous combination with the trail char.
1528 */
1529 if( (norm->qcFlags&(_NORM_QC_NFC&_NORM_QC_ANY_NO))!=0 ||
1530 (norm->combiningFlags&3)!=0) {
1531 return 0; /* non-zero flag for (f) only */
1532 }
1533 if(norm->lenNFD!=0 && canChangeWithFollowing(norm->nfd, norm->lenNFD, (uint8_t)norm->canonBothCCs)) {
1534 return _NORM_AUX_NFC_SKIP_F_MASK;
1535 }
1536
1537 return 0; /* skippable */
1538}
1539
1540static void
1541makeAux() {
1542 Norm *norm;
1543 uint32_t *pData;
1544 int32_t i, length;
1545
374ca955 1546 pData=utrie_getData(auxTrie, &length);
b75a7d8f
A
1547
1548 for(i=0; i<length; ++i) {
1549 norm=norms+pData[i];
1550 /*
1551 * 16-bit auxiliary normalization properties
1552 * see unormimp.h
1553 */
1554 pData[i]=
1555 ((uint32_t)(norm->combiningFlags&0x80)<<(_NORM_AUX_COMP_EX_SHIFT-7))|
1556 (uint32_t)norm->fncIndex;
1557
1558 if(norm->unsafeStart || norm->udataCC!=0) {
1559 pData[i]|=_NORM_AUX_UNSAFE_MASK;
1560 }
1561
1562 pData[i]|=getSkippableFlags(norm);
1563 }
1564}
1565
1566/* folding value for normalization: just store the offset (16 bits) if there is any non-0 entry */
1567static uint32_t U_CALLCONV
1568getFoldedNormValue(UNewTrie *trie, UChar32 start, int32_t offset) {
1569 uint32_t value, leadNorm32=0;
1570 UChar32 limit;
1571 UBool inBlockZero;
1572
1573 limit=start+0x400;
1574 while(start<limit) {
1575 value=utrie_get32(trie, start, &inBlockZero);
1576 if(inBlockZero) {
1577 start+=UTRIE_DATA_BLOCK_LENGTH;
1578 } else {
1579 if(value!=0) {
1580 leadNorm32|=value;
1581 }
1582 ++start;
1583 }
1584 }
1585
1586 /* turn multi-bit fields into the worst-case value */
1587 if(leadNorm32&_NORM_CC_MASK) {
1588 leadNorm32|=_NORM_CC_MASK;
1589 }
1590
1591 /* clean up unnecessarily ored bit fields */
1592 leadNorm32&=~((uint32_t)0xffffffff<<_NORM_EXTRA_SHIFT);
1593
1594 if(leadNorm32==0) {
1595 /* nothing to do (only composition exclusions?) */
1596 return 0;
1597 }
1598
1599 /* add the extra surrogate index, offset by the BMP top, for the new stage 1 location */
1600 leadNorm32|=(
1601 (uint32_t)_NORM_EXTRA_INDEX_TOP+
1602 (uint32_t)((offset-UTRIE_BMP_INDEX_LENGTH)>>UTRIE_SURROGATE_BLOCK_BITS)
1603 )<<_NORM_EXTRA_SHIFT;
1604
1605 return leadNorm32;
1606}
1607
1608/* folding value for FCD: just store the offset (16 bits) if there is any non-0 entry */
1609static uint32_t U_CALLCONV
1610getFoldedFCDValue(UNewTrie *trie, UChar32 start, int32_t offset) {
1611 uint32_t value;
1612 UChar32 limit;
1613 UBool inBlockZero;
1614
1615 limit=start+0x400;
1616 while(start<limit) {
1617 value=utrie_get32(trie, start, &inBlockZero);
1618 if(inBlockZero) {
1619 start+=UTRIE_DATA_BLOCK_LENGTH;
1620 } else if(value!=0) {
1621 return (uint32_t)offset;
1622 } else {
1623 ++start;
1624 }
1625 }
1626 return 0;
1627}
1628
1629/*
1630 * folding value for auxiliary data:
1631 * store the non-zero offset in bits 9..0 (FNC bits)
1632 * if there is any non-0 entry;
1633 * "or" [verb!] together data bits 15..10 of all of the 1024 supplementary code points
1634 */
1635static uint32_t U_CALLCONV
1636getFoldedAuxValue(UNewTrie *trie, UChar32 start, int32_t offset) {
1637 uint32_t value, oredValues;
1638 UChar32 limit;
1639 UBool inBlockZero;
1640
1641 oredValues=0;
1642 limit=start+0x400;
1643 while(start<limit) {
1644 value=utrie_get32(trie, start, &inBlockZero);
1645 if(inBlockZero) {
1646 start+=UTRIE_DATA_BLOCK_LENGTH;
1647 } else {
1648 oredValues|=value;
1649 ++start;
1650 }
1651 }
1652
1653 if(oredValues!=0) {
1654 /* move the 10 significant offset bits into bits 9..0 */
1655 offset>>=UTRIE_SURROGATE_BLOCK_BITS;
1656 if(offset>_NORM_AUX_FNC_MASK) {
1657 fprintf(stderr, "gennorm error: folding offset too large (auxTrie)\n");
1658 exit(U_INDEX_OUTOFBOUNDS_ERROR);
1659 }
1660 return (uint32_t)offset|(oredValues&~_NORM_AUX_FNC_MASK);
1661 } else {
1662 return 0;
1663 }
1664}
1665
1666extern void
1667processData() {
1668#if 0
1669 uint16_t i;
1670#endif
1671
1672 processCombining();
1673
1674 /* canonically reorder decompositions and assign combining classes for decompositions */
1675 enumTrie(postParseFn, NULL);
1676
1677#if 0
1678 for(i=1; i<64; ++i) {
1679 if(combineAndQC[i]) {
1680 printf("combiningFlags==0x%02x qcFlags(NF?C)==0x%02x\n", (i&0xc)>>2, i&0x33);
1681 }
1682 }
1683#endif
1684
1685 /* add hangul/jamo specials */
1686 setHangulJamoSpecials();
1687
1688 /* store search tables and USerializedSets for canonical starters (after Hangul/Jamo specials!) */
1689 enumTrie(makeCanonSetFn, NULL);
1690
1691 /* clone the normalization builder trie to make the final data tries */
374ca955
A
1692 if( NULL==utrie_clone(norm32Trie, normTrie, NULL, 0) ||
1693 NULL==utrie_clone(fcdTrie, normTrie, NULL, 0) ||
1694 NULL==utrie_clone(auxTrie, normTrie, NULL, 0)
b75a7d8f
A
1695 ) {
1696 fprintf(stderr, "error: unable to clone the normalization trie\n");
1697 exit(U_MEMORY_ALLOCATION_ERROR);
1698 }
1699
1700 /* --- finalize data for quick checks & normalization --- */
1701
1702 /* turn the Norm structs (stage2, norms) into 32-bit data words */
1703 makeAll32();
1704
1705 /* --- finalize data for FCD checks --- */
1706
1707 /* FCD data: take Norm.canonBothCCs and store them in the FCD table */
1708 makeFCD();
1709
1710 /* --- finalize auxiliary normalization data --- */
1711 makeAux();
1712
1713 if(beVerbose) {
1714#if 0
1715 printf("number of stage 2 entries: %ld\n", stage2Mem->index);
1716 printf("size of stage 1 (BMP) & 2 (uncompacted) + extra data: %ld bytes\n", _NORM_STAGE_1_BMP_COUNT*2+stage2Mem->index*4+extraMem->index*2);
1717#endif
1718 printf("combining CPs tops: fwd %u both %u back %u\n", combineFwdTop, combineBothTop, combineBackTop);
1719 printf("combining table count: %u\n", combiningTableTop);
1720 }
1721}
1722
1723#endif /* #if !UCONFIG_NO_NORMALIZATION */
1724
1725extern void
1726generateData(const char *dataDir) {
1727 static uint8_t normTrieBlock[100000], fcdTrieBlock[100000], auxTrieBlock[100000];
1728
1729 UNewDataMemory *pData;
1730 UErrorCode errorCode=U_ZERO_ERROR;
1731 int32_t size, dataLength;
1732
1733#if UCONFIG_NO_NORMALIZATION
1734
1735 size=0;
1736
1737#else
1738
374ca955
A
1739 U_STRING_DECL(nxCJKCompatPattern, "[:Ideographic:]", 15);
1740 U_STRING_DECL(nxUnicode32Pattern, "[:^Age=3.2:]", 12);
1741 USet *set;
b75a7d8f
A
1742 int32_t normTrieSize, fcdTrieSize, auxTrieSize;
1743
374ca955 1744 normTrieSize=utrie_serialize(norm32Trie, normTrieBlock, sizeof(normTrieBlock), getFoldedNormValue, FALSE, &errorCode);
b75a7d8f
A
1745 if(U_FAILURE(errorCode)) {
1746 fprintf(stderr, "error: utrie_serialize(normalization properties) failed, %s\n", u_errorName(errorCode));
1747 exit(errorCode);
1748 }
1749
374ca955 1750 fcdTrieSize=utrie_serialize(fcdTrie, fcdTrieBlock, sizeof(fcdTrieBlock), getFoldedFCDValue, TRUE, &errorCode);
b75a7d8f
A
1751 if(U_FAILURE(errorCode)) {
1752 fprintf(stderr, "error: utrie_serialize(FCD data) failed, %s\n", u_errorName(errorCode));
1753 exit(errorCode);
1754 }
1755
374ca955 1756 auxTrieSize=utrie_serialize(auxTrie, auxTrieBlock, sizeof(auxTrieBlock), getFoldedAuxValue, TRUE, &errorCode);
b75a7d8f
A
1757 if(U_FAILURE(errorCode)) {
1758 fprintf(stderr, "error: utrie_serialize(auxiliary data) failed, %s\n", u_errorName(errorCode));
1759 exit(errorCode);
1760 }
1761
1762 /* move the parts of canonStartSets[] together into a contiguous block */
1763 if(canonStartSetsTop<_NORM_MAX_CANON_SETS) {
1764 uprv_memmove(canonStartSets+canonStartSetsTop,
1765 canonStartSets+_NORM_MAX_CANON_SETS,
1766 canonStartSets[_NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH]*2);
1767 }
1768 canonStartSetsTop+=canonStartSets[_NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH];
1769
1770 if(canonStartSetsTop<(_NORM_MAX_CANON_SETS+_NORM_MAX_SET_SEARCH_TABLE_LENGTH)) {
1771 uprv_memmove(canonStartSets+canonStartSetsTop,
1772 canonStartSets+_NORM_MAX_CANON_SETS+_NORM_MAX_SET_SEARCH_TABLE_LENGTH,
1773 canonStartSets[_NORM_SET_INDEX_CANON_SUPP_TABLE_LENGTH]*2);
1774 }
1775 canonStartSetsTop+=canonStartSets[_NORM_SET_INDEX_CANON_SUPP_TABLE_LENGTH];
1776
374ca955
A
1777 /* create the normalization exclusion sets */
1778 /*
1779 * nxCJKCompatPattern should be [[:Ideographic:]&[:NFD_QC=No:]]
1780 * but we cannot use NFD_QC from the pattern because that would require
1781 * unorm.icu which we are just going to generate.
1782 * Therefore we have manually collected nfdQCNoSet and intersect Ideographic
1783 * with that.
1784 */
1785 U_STRING_INIT(nxCJKCompatPattern, "[:Ideographic:]", 15);
1786 U_STRING_INIT(nxUnicode32Pattern, "[:^Age=3.2:]", 12);
1787
1788 canonStartSets[_NORM_SET_INDEX_NX_CJK_COMPAT_OFFSET]=canonStartSetsTop;
1789 set=uset_openPattern(nxCJKCompatPattern, -1, &errorCode);
1790 if(U_FAILURE(errorCode)) {
1791 fprintf(stderr, "error: uset_openPattern([:Ideographic:]&[:NFD_QC=No:]) failed, %s\n", u_errorName(errorCode));
1792 exit(errorCode);
1793 }
1794 uset_retainAll(set, nfdQCNoSet);
1795 canonStartSetsTop+=uset_serialize(set, canonStartSets+canonStartSetsTop, LENGTHOF(canonStartSets)-canonStartSetsTop, &errorCode);
1796 if(U_FAILURE(errorCode)) {
1797 fprintf(stderr, "error: uset_serialize([:Ideographic:]&[:NFD_QC=No:]) failed, %s\n", u_errorName(errorCode));
1798 exit(errorCode);
1799 }
1800 uset_close(set);
1801
1802 canonStartSets[_NORM_SET_INDEX_NX_UNICODE32_OFFSET]=canonStartSetsTop;
1803 set=uset_openPattern(nxUnicode32Pattern, -1, &errorCode);
1804 if(U_FAILURE(errorCode)) {
1805 fprintf(stderr, "error: uset_openPattern([:^Age=3.2:]) failed, %s\n", u_errorName(errorCode));
1806 exit(errorCode);
1807 }
1808 canonStartSetsTop+=uset_serialize(set, canonStartSets+canonStartSetsTop, LENGTHOF(canonStartSets)-canonStartSetsTop, &errorCode);
1809 if(U_FAILURE(errorCode)) {
1810 fprintf(stderr, "error: uset_serialize([:^Age=3.2:]) failed, %s\n", u_errorName(errorCode));
1811 exit(errorCode);
1812 }
1813 uset_close(set);
1814
1815 canonStartSets[_NORM_SET_INDEX_NX_RESERVED_OFFSET]=canonStartSetsTop;
1816
b75a7d8f 1817 /* make sure that the FCD trie is 4-aligned */
374ca955 1818 if((utm_countItems(extraMem)+combiningTableTop)&1) {
b75a7d8f
A
1819 combiningTable[combiningTableTop++]=0x1234; /* add one 16-bit word for an even number */
1820 }
1821
1822 /* pad canonStartSets to 4-alignment, too */
1823 if(canonStartSetsTop&1) {
1824 canonStartSets[canonStartSetsTop++]=0x1235;
1825 }
1826
1827 size=
1828 _NORM_INDEX_TOP*4+
1829 normTrieSize+
374ca955 1830 utm_countItems(extraMem)*2+
b75a7d8f
A
1831 combiningTableTop*2+
1832 fcdTrieSize+
1833 auxTrieSize+
1834 canonStartSetsTop*2;
1835
1836 if(beVerbose) {
374ca955
A
1837 printf("size of normalization trie %5u bytes\n", (int)normTrieSize);
1838 printf("size of 16-bit extra memory %5u UChars/uint16_t\n", (int)utm_countItems(extraMem));
b75a7d8f
A
1839 printf(" of that: FC_NFKC_Closure size %5u UChars/uint16_t\n", ((uint16_t *)utm_getStart(extraMem))[0]);
1840 printf("size of combining table %5u uint16_t\n", combiningTableTop);
374ca955
A
1841 printf("size of FCD trie %5u bytes\n", (int)fcdTrieSize);
1842 printf("size of auxiliary trie %5u bytes\n", (int)auxTrieSize);
1843 printf("size of canonStartSets[] %5u uint16_t\n", (int)canonStartSetsTop);
b75a7d8f
A
1844 printf(" number of indexes %5u uint16_t\n", _NORM_SET_INDEX_TOP);
1845 printf(" size of sets %5u uint16_t\n", canonStartSets[_NORM_SET_INDEX_CANON_SETS_LENGTH]-_NORM_SET_INDEX_TOP);
374ca955 1846 printf(" number of sets %5d\n", (int)canonSetsCount);
b75a7d8f
A
1847 printf(" size of BMP search table %5u uint16_t\n", canonStartSets[_NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH]);
1848 printf(" size of supplementary search table %5u uint16_t\n", canonStartSets[_NORM_SET_INDEX_CANON_SUPP_TABLE_LENGTH]);
374ca955 1849 printf(" length of exclusion sets %5u uint16_t\n", canonStartSets[_NORM_SET_INDEX_NX_RESERVED_OFFSET]-canonStartSets[_NORM_SET_INDEX_NX_CJK_COMPAT_OFFSET]);
b75a7d8f
A
1850 printf("size of " U_ICUDATA_NAME "_" DATA_NAME "." DATA_TYPE " contents: %ld bytes\n", (long)size);
1851 }
1852
1853 indexes[_NORM_INDEX_TRIE_SIZE]=normTrieSize;
374ca955 1854 indexes[_NORM_INDEX_UCHAR_COUNT]=(uint16_t)utm_countItems(extraMem);
b75a7d8f
A
1855
1856 indexes[_NORM_INDEX_COMBINE_DATA_COUNT]=combiningTableTop;
1857 indexes[_NORM_INDEX_COMBINE_FWD_COUNT]=combineFwdTop;
1858 indexes[_NORM_INDEX_COMBINE_BOTH_COUNT]=(uint16_t)(combineBothTop-combineFwdTop);
1859 indexes[_NORM_INDEX_COMBINE_BACK_COUNT]=(uint16_t)(combineBackTop-combineBothTop);
1860
1861 /* the quick check minimum code points are already set */
1862
1863 indexes[_NORM_INDEX_FCD_TRIE_SIZE]=fcdTrieSize;
1864 indexes[_NORM_INDEX_AUX_TRIE_SIZE]=auxTrieSize;
1865 indexes[_NORM_INDEX_CANON_SET_COUNT]=canonStartSetsTop;
1866
1867#endif
1868
1869 /* write the data */
374ca955 1870 pData=udata_create(dataDir, DATA_TYPE, DATA_NAME, &dataInfo,
b75a7d8f
A
1871 haveCopyright ? U_COPYRIGHT_STRING : NULL, &errorCode);
1872 if(U_FAILURE(errorCode)) {
1873 fprintf(stderr, "gennorm: unable to create the output file, error %d\n", errorCode);
1874 exit(errorCode);
1875 }
1876
1877#if !UCONFIG_NO_NORMALIZATION
1878
1879 udata_writeBlock(pData, indexes, sizeof(indexes));
1880 udata_writeBlock(pData, normTrieBlock, normTrieSize);
374ca955 1881 udata_writeBlock(pData, utm_getStart(extraMem), utm_countItems(extraMem)*2);
b75a7d8f
A
1882 udata_writeBlock(pData, combiningTable, combiningTableTop*2);
1883 udata_writeBlock(pData, fcdTrieBlock, fcdTrieSize);
1884 udata_writeBlock(pData, auxTrieBlock, auxTrieSize);
1885 udata_writeBlock(pData, canonStartSets, canonStartSetsTop*2);
1886
1887#endif
1888
1889 /* finish up */
1890 dataLength=udata_finish(pData, &errorCode);
1891 if(U_FAILURE(errorCode)) {
1892 fprintf(stderr, "gennorm: error %d writing the output file\n", errorCode);
1893 exit(errorCode);
1894 }
1895
1896 if(dataLength!=size) {
1897 fprintf(stderr, "gennorm error: data length %ld != calculated size %ld\n",
1898 (long)dataLength, (long)size);
1899 exit(U_INTERNAL_PROGRAM_ERROR);
1900 }
1901}
1902
1903#if !UCONFIG_NO_NORMALIZATION
1904
1905extern void
1906cleanUpData(void) {
1907 int32_t i, count;
1908
374ca955 1909 count=utm_countItems(normMem);
b75a7d8f
A
1910 for(i=0; i<count; ++i) {
1911 uset_close(norms[i].canonStart);
1912 }
1913
1914 utm_close(normMem);
1915 utm_close(utf32Mem);
1916 utm_close(extraMem);
1917 utm_close(combiningTriplesMem);
374ca955
A
1918 utrie_close(normTrie);
1919 utrie_close(norm32Trie);
1920 utrie_close(fcdTrie);
1921 utrie_close(auxTrie);
1922
1923 uset_close(nfdQCNoSet);
1924
1925 uprv_free(normTrie);
1926 uprv_free(norm32Trie);
1927 uprv_free(fcdTrie);
1928 uprv_free(auxTrie);
b75a7d8f
A
1929}
1930
1931#endif /* #if !UCONFIG_NO_NORMALIZATION */
1932
1933/*
1934 * Hey, Emacs, please set the following:
1935 *
1936 * Local Variables:
1937 * indent-tabs-mode: nil
1938 * End:
1939 *
1940 */