git.saurik.com Git - apple/icu.git/blame_incremental - icuSources/tools/gennorm/store.c

... / ...

Commit	Line	Data
	1	/*
	2	*******************************************************************************
	3	*
	4	* Copyright (C) 1999-2006, International Business Machines
	5	* Corporation and others. All Rights Reserved.
	6	*
	7	*******************************************************************************
	8	* file name: store.c
	9	* encoding: US-ASCII
	10	* tab size: 8 (not used)
	11	* indentation:4
	12	*
	13	* created on: 2001may25
	14	* created by: Markus W. Scherer
	15	*
	16	* Store Unicode normalization data in a memory-mappable file.
	17	*/
	18
	19	#include <stdio.h>
	20	#include <stdlib.h>
	21	#include "unicode/utypes.h"
	22	#include "unicode/uchar.h"
	23	#include "unicode/ustring.h"
	24	#include "cmemory.h"
	25	#include "cstring.h"
	26	#include "filestrm.h"
	27	#include "unicode/udata.h"
	28	#include "utrie.h"
	29	#include "unicode/uset.h"
	30	#include "toolutil.h"
	31	#include "unewdata.h"
	32	#include "writesrc.h"
	33	#include "unormimp.h"
	34	#include "gennorm.h"
	35
	36	#define DO_DEBUG_OUT 0
	37
	38	#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
	39
	40	/*
	41	* The new implementation of the normalization code loads its data from
	42	* unorm.icu, which is generated with this gennorm tool.
	43	* The format of that file is described in unormimp.h .
	44	*/
	45
	46	/* file data ---------------------------------------------------------------- */
	47
	48	#if UCONFIG_NO_NORMALIZATION
	49
	50	/* dummy UDataInfo cf. udata.h */
	51	static UDataInfo dataInfo = {
	52	sizeof(UDataInfo),
	53	0,
	54
	55	U_IS_BIG_ENDIAN,
	56	U_CHARSET_FAMILY,
	57	U_SIZEOF_UCHAR,
	58	0,
	59
	60	{ 0, 0, 0, 0 }, /* dummy dataFormat */
	61	{ 0, 0, 0, 0 }, /* dummy formatVersion */
	62	{ 0, 0, 0, 0 } /* dummy dataVersion */
	63	};
	64
	65	#else
	66
	67	/* UDataInfo cf. udata.h */
	68	static UDataInfo dataInfo={
	69	sizeof(UDataInfo),
	70	0,
	71
	72	U_IS_BIG_ENDIAN,
	73	U_CHARSET_FAMILY,
	74	U_SIZEOF_UCHAR,
	75	0,
	76
	77	{ 0x4e, 0x6f, 0x72, 0x6d }, /* dataFormat="Norm" */
	78	{ 2, 3, UTRIE_SHIFT, UTRIE_INDEX_SHIFT }, /* formatVersion */
	79	{ 3, 2, 0, 0 } /* dataVersion (Unicode version) */
	80	};
	81
	82	extern void
	83	setUnicodeVersion(const char *v) {
	84	UVersionInfo version;
	85	u_versionFromString(version, v);
	86	uprv_memcpy(dataInfo.dataVersion, version, 4);
	87	}
	88
	89	static int32_t indexes[_NORM_INDEX_TOP]={ 0 };
	90
	91	/* builder data ------------------------------------------------------------- */
	92
	93	/* modularization flags, see gennorm.h (default to "store everything") */
	94	uint32_t gStoreFlags=0xffffffff;
	95
	96	typedef void EnumTrieFn(void context, uint32_t code, Norm norm);
	97
	98	static UNewTrie
	99	*normTrie,
	100	*norm32Trie,
	101	*fcdTrie,
	102	*auxTrie;
	103
	104	static UToolMemory normMem, utf32Mem, extraMem, combiningTriplesMem;
	105
	106	static Norm *norms;
	107
	108	/*
	109	* set a flag for each code point that was seen in decompositions -
	110	* avoid to decompose ones that have not been used before
	111	*/
	112	static uint32_t haveSeenFlags[256];
	113
	114	/* set of characters with NFD_QC=No (i.e., those with canonical decompositions) */
	115	static USet *nfdQCNoSet;
	116
	117	/* see addCombiningCP() for details */
	118	static uint32_t combiningCPs[2000];
	119
	120	/*
	121	* after processCombining() this contains for each code point in combiningCPs[]
	122	* the runtime combining index
	123	*/
	124	static uint16_t combiningIndexes[2000];
	125
	126	/* section limits for combiningCPs[], see addCombiningCP() */
	127	static uint16_t combineFwdTop=0, combineBothTop=0, combineBackTop=0;
	128
	129	/**
	130	* Structure for a triple of code points, stored in combiningTriplesMem.
	131	* The lead and trail code points combine into the the combined one,
	132	* i.e., there is a canonical decomposition of combined-> <lead, trail>.
	133	*
	134	* Before processCombining() is called, leadIndex and trailIndex are 0.
	135	* After processCombining(), they contain the indexes of the lead and trail
	136	* code point in the combiningCPs[] array.
	137	* They are then sorted by leadIndex, then trailIndex.
	138	* They are not sorted by code points.
	139	*/
	140	typedef struct CombiningTriple {
	141	uint16_t leadIndex, trailIndex;
	142	uint32_t lead, trail, combined;
	143	} CombiningTriple;
	144
	145	/* 15b in the combining index -> <=0x8000 uint16_t values in the combining table */
	146	static uint16_t combiningTable[0x8000];
	147	static uint16_t combiningTableTop=0;
	148
	149	#define _NORM_MAX_SET_SEARCH_TABLE_LENGTH 0x4000
	150	static uint16_t canonStartSets[_NORM_MAX_CANON_SETS+2*_NORM_MAX_SET_SEARCH_TABLE_LENGTH
	151	+10000]; /* +10000 for exclusion sets */
	152	static int32_t canonStartSetsTop=_NORM_SET_INDEX_TOP;
	153	static int32_t canonSetsCount=0;
	154
	155	/* allocate and initialize a Norm unit */
	156	static Norm *
	157	allocNorm() {
	158	/* allocate Norm */
	159	Norm p=(Norm )utm_alloc(normMem);
	160	/*
	161	* The combiningIndex must not be initialized to 0 because 0 is the
	162	* combiningIndex of the first forward-combining character.
	163	*/
	164	p->combiningIndex=0xffff;
	165	return p;
	166	}
	167
	168	extern void
	169	init() {
	170	uint16_t *p16;
	171
	172	normTrie = (UNewTrie *)uprv_malloc(sizeof(UNewTrie));
	173	uprv_memset(normTrie, 0, sizeof(UNewTrie));
	174	norm32Trie = (UNewTrie *)uprv_malloc(sizeof(UNewTrie));
	175	uprv_memset(norm32Trie, 0, sizeof(UNewTrie));
	176	fcdTrie = (UNewTrie *)uprv_malloc(sizeof(UNewTrie));
	177	uprv_memset(fcdTrie, 0, sizeof(UNewTrie));
	178	auxTrie = (UNewTrie *)uprv_malloc(sizeof(UNewTrie));
	179	uprv_memset(auxTrie, 0, sizeof(UNewTrie));
	180
	181	/* initialize the two tries */
	182	if(NULL==utrie_open(normTrie, NULL, 30000, 0, 0, FALSE)) {
	183	fprintf(stderr, "error: failed to initialize tries\n");
	184	exit(U_MEMORY_ALLOCATION_ERROR);
	185	}
	186
	187	/* allocate Norm structures and reset the first one */
	188	normMem=utm_open("gennorm normalization structs", 20000, 20000, sizeof(Norm));
	189	norms=allocNorm();
	190
	191	/* allocate UTF-32 string memory */
	192	utf32Mem=utm_open("gennorm UTF-32 strings", 30000, 30000, 4);
	193
	194	/* reset all "have seen" flags */
	195	uprv_memset(haveSeenFlags, 0, sizeof(haveSeenFlags));
	196
	197	/* open an empty set */
	198	nfdQCNoSet=uset_open(1, 0);
	199
	200	/* allocate extra data memory for UTF-16 decomposition strings and other values */
	201	extraMem=utm_open("gennorm extra 16-bit memory", _NORM_EXTRA_INDEX_TOP, _NORM_EXTRA_INDEX_TOP, 2);
	202	/* initialize the extraMem counter for the top of FNC strings */
	203	p16=(uint16_t *)utm_alloc(extraMem);
	204	*p16=1;
	205
	206	/* allocate temporary memory for combining triples */
	207	combiningTriplesMem=utm_open("gennorm combining triples", 0x4000, 0x4000, sizeof(CombiningTriple));
	208
	209	/* set the minimum code points for no/maybe quick check values to the end of the BMP */
	210	indexes[_NORM_INDEX_MIN_NFC_NO_MAYBE]=0xffff;
	211	indexes[_NORM_INDEX_MIN_NFKC_NO_MAYBE]=0xffff;
	212	indexes[_NORM_INDEX_MIN_NFD_NO_MAYBE]=0xffff;
	213	indexes[_NORM_INDEX_MIN_NFKD_NO_MAYBE]=0xffff;
	214
	215	/* preset the indexes portion of canonStartSets */
	216	uprv_memset(canonStartSets, 0, _NORM_SET_INDEX_TOP*2);
	217	}
	218
	219	/*
	220	* get or create a Norm unit;
	221	* get or create the intermediate trie entries for it as well
	222	*/
	223	static Norm *
	224	createNorm(uint32_t code) {
	225	Norm *p;
	226	uint32_t i;
	227
	228	i=utrie_get32(normTrie, (UChar32)code, NULL);
	229	if(i!=0) {
	230	p=norms+i;
	231	} else {
	232	/* allocate Norm */
	233	p=allocNorm();
	234	if(!utrie_set32(normTrie, (UChar32)code, (uint32_t)(p-norms))) {
	235	fprintf(stderr, "error: too many normalization entries\n");
	236	exit(U_BUFFER_OVERFLOW_ERROR);
	237	}
	238	}
	239	return p;
	240	}
	241
	242	/* get an existing Norm unit */
	243	static Norm *
	244	getNorm(uint32_t code) {
	245	uint32_t i;
	246
	247	i=utrie_get32(normTrie, (UChar32)code, NULL);
	248	if(i==0) {
	249	return NULL;
	250	}
	251	return norms+i;
	252	}
	253
	254	/* get the canonical combining class of a character */
	255	static uint8_t
	256	getCCFromCP(uint32_t code) {
	257	Norm *norm=getNorm(code);
	258	if(norm==NULL) {
	259	return 0;
	260	} else {
	261	return norm->udataCC;
	262	}
	263	}
	264
	265	/*
	266	* enumerate all code points with their Norm structs and call a function for each
	267	* return the number of code points with data
	268	*/
	269	static uint32_t
	270	enumTrie(EnumTrieFn fn, void context) {
	271	uint32_t count, i;
	272	UChar32 code;
	273	UBool isInBlockZero;
	274
	275	count=0;
	276	for(code=0; code<=0x10ffff;) {
	277	i=utrie_get32(normTrie, code, &isInBlockZero);
	278	if(isInBlockZero) {
	279	code+=UTRIE_DATA_BLOCK_LENGTH;
	280	} else {
	281	if(i!=0) {
	282	fn(context, (uint32_t)code, norms+i);
	283	++count;
	284	}
	285	++code;
	286	}
	287	}
	288	return count;
	289	}
	290
	291	static void
	292	setHaveSeenString(const uint32_t *s, int32_t length) {
	293	uint32_t c;
	294
	295	while(length>0) {
	296	c=*s++;
	297	haveSeenFlags[(c>>5)&0xff]\|=(1<<(c&0x1f));
	298	--length;
	299	}
	300	}
	301
	302	#define HAVE_SEEN(c) (haveSeenFlags[((c)>>5)&0xff]&(1<<((c)&0x1f)))
	303
	304	/* handle combining data ---------------------------------------------------- */
	305
	306	/*
	307	* Insert an entry into combiningCPs[] for the new code point code with its flags.
	308	* The flags indicate if code combines forward, backward, or both.
	309	*
	310	* combiningCPs[] contains three sections:
	311	* 1. code points that combine forward
	312	* 2. code points that combine forward and backward
	313	* 3. code points that combine backward
	314	*
	315	* Search for code in the entire array.
	316	* If it is found and already is in the right section (old flags==new flags)
	317	* then we are done.
	318	* If it is found but the flags are different, then remove it,
	319	* union the old and new flags, and reinsert it into its correct section.
	320	* If it is not found, then just insert it.
	321	*
	322	* Within each section, the code points are not sorted.
	323	*/
	324	static void
	325	addCombiningCP(uint32_t code, uint8_t flags) {
	326	uint32_t newEntry;
	327	uint16_t i;
	328
	329	newEntry=code\|((uint32_t)flags<<24);
	330
	331	/* search for this code point */
	332	for(i=0; i<combineBackTop; ++i) {
	333	if(code==(combiningCPs[i]&0xffffff)) {
	334	/* found it */
	335	if(newEntry==combiningCPs[i]) {
	336	return; /* no change */
	337	}
	338
	339	/* combine the flags, remove the old entry from the old place, and insert the new one */
	340	newEntry\|=combiningCPs[i];
	341	if(i!=--combineBackTop) {
	342	uprv_memmove(combiningCPs+i, combiningCPs+i+1, (combineBackTop-i)*4);
	343	}
	344	if(i<combineBothTop) {
	345	--combineBothTop;
	346	}
	347	if(i<combineFwdTop) {
	348	--combineFwdTop;
	349	}
	350	break;
	351	}
	352	}
	353
	354	/* not found or modified, insert it */
	355	if(combineBackTop>=sizeof(combiningCPs)/4) {
	356	fprintf(stderr, "error: gennorm combining code points - trying to use more than %ld units\n",
	357	(long)(sizeof(combiningCPs)/4));
	358	exit(U_MEMORY_ALLOCATION_ERROR);
	359	}
	360
	361	/* set i to the insertion point */
	362	flags=(uint8_t)(newEntry>>24);
	363	if(flags==1) {
	364	i=combineFwdTop++;
	365	++combineBothTop;
	366	} else if(flags==3) {
	367	i=combineBothTop++;
	368	} else /* flags==2 */ {
	369	i=combineBackTop;
	370	}
	371
	372	/* move the following code points up one and insert newEntry at i */
	373	if(i<combineBackTop) {
	374	uprv_memmove(combiningCPs+i+1, combiningCPs+i, (combineBackTop-i)*4);
	375	}
	376	combiningCPs[i]=newEntry;
	377
	378	/* finally increment the total counter */
	379	++combineBackTop;
	380	}
	381
	382	/**
	383	* Find the index in combiningCPs[] where code point code is stored.
	384	* @param code code point to look for
	385	* @param isLead is code a forward combining code point?
	386	* @return index in combiningCPs[] where code is stored
	387	*/
	388	static uint16_t
	389	findCombiningCP(uint32_t code, UBool isLead) {
	390	uint16_t i, limit;
	391
	392	if(isLead) {
	393	i=0;
	394	limit=combineBothTop;
	395	} else {
	396	i=combineFwdTop;
	397	limit=combineBackTop;
	398	}
	399
	400	/* search for this code point */
	401	for(; i<limit; ++i) {
	402	if(code==(combiningCPs[i]&0xffffff)) {
	403	/* found it */
	404	return i;
	405	}
	406	}
	407
	408	/* not found */
	409	return 0xffff;
	410	}
	411
	412	static void
	413	addCombiningTriple(uint32_t lead, uint32_t trail, uint32_t combined) {
	414	CombiningTriple *triple;
	415
	416	if(DO_NOT_STORE(UGENNORM_STORE_COMPOSITION)) {
	417	return;
	418	}
	419
	420	/*
	421	* set combiningFlags for the two code points
	422	* do this after decomposition so that getNorm() above returns NULL
	423	* if we do not have actual sub-decomposition data for the initial NFD here
	424	*/
	425	createNorm(lead)->combiningFlags\|=1; /* combines forward */
	426	createNorm(trail)->combiningFlags\|=2; /* combines backward */
	427
	428	addCombiningCP(lead, 1);
	429	addCombiningCP(trail, 2);
	430
	431	triple=(CombiningTriple *)utm_alloc(combiningTriplesMem);
	432	triple->lead=lead;
	433	triple->trail=trail;
	434	triple->combined=combined;
	435	}
	436
	437	static int
	438	compareTriples(const void l, const void r) {
	439	int diff;
	440	diff=(int)((CombiningTriple *)l)->leadIndex-
	441	(int)((CombiningTriple *)r)->leadIndex;
	442	if(diff==0) {
	443	diff=(int)((CombiningTriple *)l)->trailIndex-
	444	(int)((CombiningTriple *)r)->trailIndex;
	445	}
	446	return diff;
	447	}
	448
	449	static void
	450	processCombining() {
	451	CombiningTriple *triples;
	452	uint16_t *p;
	453	uint32_t combined;
	454	uint16_t i, j, count, tableTop, finalIndex, combinesFwd;
	455
	456	triples=utm_getStart(combiningTriplesMem);
	457
	458	/* add lead and trail indexes to the triples for sorting */
	459	count=(uint16_t)utm_countItems(combiningTriplesMem);
	460	for(i=0; i<count; ++i) {
	461	/* findCombiningCP() must always find the code point */
	462	triples[i].leadIndex=findCombiningCP(triples[i].lead, TRUE);
	463	triples[i].trailIndex=findCombiningCP(triples[i].trail, FALSE);
	464	}
	465
	466	/* sort them by leadIndex, trailIndex */
	467	qsort(triples, count, sizeof(CombiningTriple), compareTriples);
	468
	469	/* calculate final combining indexes and store them in the Norm entries */
	470	tableTop=0;
	471	j=0; /* triples counter */
	472
	473	/* first, combining indexes of fwd/both characters are indexes into the combiningTable */
	474	for(i=0; i<combineBothTop; ++i) {
	475	/* start a new table */
	476
	477	/* assign combining index */
	478	createNorm(combiningCPs[i]&0xffffff)->combiningIndex=combiningIndexes[i]=tableTop;
	479
	480	/* calculate the length of the combining data for this lead code point in the combiningTable */
	481	while(j<count && i==triples[j].leadIndex) {
	482	/* count 2 to 3 16-bit units per composition entry (back-index, code point) */
	483	combined=triples[j++].combined;
	484	if(combined<=0x1fff) {
	485	tableTop+=2;
	486	} else {
	487	tableTop+=3;
	488	}
	489	}
	490	}
	491
	492	/* second, combining indexes of back-only characters are simply incremented from here to be unique */
	493	finalIndex=tableTop;
	494	for(; i<combineBackTop; ++i) {
	495	createNorm(combiningCPs[i]&0xffffff)->combiningIndex=combiningIndexes[i]=finalIndex++;
	496	}
	497
	498	/* it must be finalIndex<=0x8000 because bit 15 is used in combiningTable as an end-for-this-lead marker */
	499	if(finalIndex>0x8000) {
	500	fprintf(stderr, "error: gennorm combining table - trying to use %u units, more than the %ld units available\n",
	501	tableTop, (long)(sizeof(combiningTable)/4));
	502	exit(U_MEMORY_ALLOCATION_ERROR);
	503	}
	504
	505	combiningTableTop=tableTop;
	506
	507	/* store the combining data in the combiningTable, with the final indexes from above */
	508	p=combiningTable;
	509	j=0; /* triples counter */
	510
	511	/*
	512	* this is essentially the same loop as above, but
	513	* it writes the table data instead of calculating and setting the final indexes;
	514	* it is necessary to have two passes so that all the final indexes are known before
	515	* they are written into the table
	516	*/
	517	for(i=0; i<combineBothTop; ++i) {
	518	/* start a new table */
	519
	520	combined=0; /* avoid compiler warning */
	521
	522	/* store the combining data for this lead code point in the combiningTable */
	523	while(j<count && i==triples[j].leadIndex) {
	524	finalIndex=combiningIndexes[triples[j].trailIndex];
	525	combined=triples[j++].combined;
	526
	527	/* is combined a starter? (i.e., cc==0 && combines forward) */
	528	combinesFwd=(uint16_t)((getNorm(combined)->combiningFlags&1)<<13);
	529
	530	*p++=finalIndex;
	531	if(combined<=0x1fff) {
	532	*p++=(uint16_t)(combinesFwd\|combined);
	533	} else if(combined<=0xffff) {
	534	*p++=(uint16_t)(0x8000\|combinesFwd);
	535	*p++=(uint16_t)combined;
	536	} else {
	537	*p++=(uint16_t)(0xc000\|combinesFwd\|((combined-0x10000)>>10));
	538	*p++=(uint16_t)(0xdc00\|(combined&0x3ff));
	539	}
	540	}
	541
	542	/* set a marker on the last final trail index in this lead's table */
	543	if(combined<=0x1fff) {
	544	*(p-2)\|=0x8000;
	545	} else {
	546	*(p-3)\|=0x8000;
	547	}
	548	}
	549
	550	/* post condition: tableTop==(p-combiningTable) */
	551	}
	552
	553	/* processing incoming normalization data ----------------------------------- */
	554
	555	/*
	556	* Decompose Hangul syllables algorithmically and fill a pseudo-Norm struct.
	557	* c must be a Hangul syllable code point.
	558	*/
	559	static void
	560	getHangulDecomposition(uint32_t c, Norm *pHangulNorm, uint32_t hangulBuffer[3]) {
	561	/* Hangul syllable: decompose algorithmically */
	562	uint32_t c2;
	563	uint8_t length;
	564
	565	uprv_memset(pHangulNorm, 0, sizeof(Norm));
	566
	567	c-=HANGUL_BASE;
	568
	569	c2=c%JAMO_T_COUNT;
	570	c/=JAMO_T_COUNT;
	571	if(c2>0) {
	572	hangulBuffer[2]=JAMO_T_BASE+c2;
	573	length=3;
	574	} else {
	575	hangulBuffer[2]=0;
	576	length=2;
	577	}
	578
	579	hangulBuffer[1]=JAMO_V_BASE+c%JAMO_V_COUNT;
	580	hangulBuffer[0]=JAMO_L_BASE+c/JAMO_V_COUNT;
	581
	582	pHangulNorm->nfd=hangulBuffer;
	583	pHangulNorm->lenNFD=length;
	584	if(DO_STORE(UGENNORM_STORE_COMPAT)) {
	585	pHangulNorm->nfkd=hangulBuffer;
	586	pHangulNorm->lenNFKD=length;
	587	}
	588	}
	589
	590	/*
	591	* decompose the one decomposition further, may generate two decompositions
	592	* apply all previous characters' decompositions to this one
	593	*/
	594	static void
	595	decompStoreNewNF(uint32_t code, Norm *norm) {
	596	uint32_t nfd[40], nfkd[40], hangulBuffer[3];
	597	Norm hangulNorm;
	598
	599	uint32_t *s32;
	600	Norm *p;
	601	uint32_t c;
	602	int32_t i, length;
	603	uint8_t lenNFD=0, lenNFKD=0;
	604	UBool changedNFD=FALSE, changedNFKD=FALSE;
	605
	606	if((length=norm->lenNFD)!=0) {
	607	/* always allocate the original string */
	608	changedNFD=TRUE;
	609	s32=norm->nfd;
	610	} else if((length=norm->lenNFKD)!=0) {
	611	/* always allocate the original string */
	612	changedNFKD=TRUE;
	613	s32=norm->nfkd;
	614	} else {
	615	/* no decomposition here, nothing to do */
	616	return;
	617	}
	618
	619	/* decompose each code point */
	620	for(i=0; i<length; ++i) {
	621	c=s32[i];
	622	p=getNorm(c);
	623	if(p==NULL) {
	624	if(HANGUL_BASE<=c && c<(HANGUL_BASE+HANGUL_COUNT)) {
	625	getHangulDecomposition(c, &hangulNorm, hangulBuffer);
	626	p=&hangulNorm;
	627	} else {
	628	/* no data, no decomposition */
	629	nfd[lenNFD++]=c;
	630	nfkd[lenNFKD++]=c;
	631	continue;
	632	}
	633	}
	634
	635	/* canonically decompose c */
	636	if(changedNFD) {
	637	if(p->lenNFD!=0) {
	638	uprv_memcpy(nfd+lenNFD, p->nfd, p->lenNFD*4);
	639	lenNFD+=p->lenNFD;
	640	} else {
	641	nfd[lenNFD++]=c;
	642	}
	643	}
	644
	645	/* compatibility-decompose c */
	646	if(p->lenNFKD!=0) {
	647	uprv_memcpy(nfkd+lenNFKD, p->nfkd, p->lenNFKD*4);
	648	lenNFKD+=p->lenNFKD;
	649	changedNFKD=TRUE;
	650	} else if(p->lenNFD!=0) {
	651	uprv_memcpy(nfkd+lenNFKD, p->nfd, p->lenNFD*4);
	652	lenNFKD+=p->lenNFD;
	653	/*
	654	* not changedNFKD=TRUE;
	655	* so that we do not store a new nfkd if there was no nfkd string before
	656	* and we only see canonical decompositions
	657	*/
	658	} else {
	659	nfkd[lenNFKD++]=c;
	660	}
	661	}
	662
	663	/* assume that norm->lenNFD==1 or ==2 */
	664	if(norm->lenNFD==2 && !(norm->combiningFlags&0x80)) {
	665	addCombiningTriple(s32[0], s32[1], code);
	666	}
	667
	668	if(changedNFD) {
	669	if(lenNFD!=0) {
	670	s32=utm_allocN(utf32Mem, lenNFD);
	671	uprv_memcpy(s32, nfd, lenNFD*4);
	672	} else {
	673	s32=NULL;
	674	}
	675	norm->lenNFD=lenNFD;
	676	norm->nfd=s32;
	677	setHaveSeenString(nfd, lenNFD);
	678	}
	679	if(changedNFKD) {
	680	if(lenNFKD!=0) {
	681	s32=utm_allocN(utf32Mem, lenNFKD);
	682	uprv_memcpy(s32, nfkd, lenNFKD*4);
	683	} else {
	684	s32=NULL;
	685	}
	686	norm->lenNFKD=lenNFKD;
	687	norm->nfkd=s32;
	688	setHaveSeenString(nfkd, lenNFKD);
	689	}
	690	}
	691
	692	typedef struct DecompSingle {
	693	uint32_t c;
	694	Norm *norm;
	695	} DecompSingle;
	696
	697	/*
	698	* apply this one character's decompositions (there is at least one!) to
	699	* all previous characters' decompositions to decompose them further
	700	*/
	701	static void
	702	decompWithSingleFn(void context, uint32_t code, Norm norm) {
	703	uint32_t nfd[40], nfkd[40];
	704	uint32_t *s32;
	705	DecompSingle me=(DecompSingle )context;
	706	uint32_t c, myC;
	707	int32_t i, length;
	708	uint8_t lenNFD=0, lenNFKD=0, myLenNFD, myLenNFKD;
	709	UBool changedNFD=FALSE, changedNFKD=FALSE;
	710
	711	/* get the new character's data */
	712	myC=me->c;
	713	myLenNFD=me->norm->lenNFD;
	714	myLenNFKD=me->norm->lenNFKD;
	715	/* assume that myC has at least one decomposition */
	716
	717	if((length=norm->lenNFD)!=0 && myLenNFD!=0) {
	718	/* apply NFD(myC) to norm->nfd */
	719	s32=norm->nfd;
	720	for(i=0; i<length; ++i) {
	721	c=s32[i];
	722	if(c==myC) {
	723	uprv_memcpy(nfd+lenNFD, me->norm->nfd, myLenNFD*4);
	724	lenNFD+=myLenNFD;
	725	changedNFD=TRUE;
	726	} else {
	727	nfd[lenNFD++]=c;
	728	}
	729	}
	730	}
	731
	732	if((length=norm->lenNFKD)!=0) {
	733	/* apply NFD(myC) and NFKD(myC) to norm->nfkd */
	734	s32=norm->nfkd;
	735	for(i=0; i<length; ++i) {
	736	c=s32[i];
	737	if(c==myC) {
	738	if(myLenNFKD!=0) {
	739	uprv_memcpy(nfkd+lenNFKD, me->norm->nfkd, myLenNFKD*4);
	740	lenNFKD+=myLenNFKD;
	741	} else /* assume myLenNFD!=0 */ {
	742	uprv_memcpy(nfkd+lenNFKD, me->norm->nfd, myLenNFD*4);
	743	lenNFKD+=myLenNFD;
	744	}
	745	changedNFKD=TRUE;
	746	} else {
	747	nfkd[lenNFKD++]=c;
	748	}
	749	}
	750	} else if((length=norm->lenNFD)!=0 && myLenNFKD!=0) {
	751	/* apply NFKD(myC) to norm->nfd, forming a new norm->nfkd */
	752	s32=norm->nfd;
	753	for(i=0; i<length; ++i) {
	754	c=s32[i];
	755	if(c==myC) {
	756	uprv_memcpy(nfkd+lenNFKD, me->norm->nfkd, myLenNFKD*4);
	757	lenNFKD+=myLenNFKD;
	758	changedNFKD=TRUE;
	759	} else {
	760	nfkd[lenNFKD++]=c;
	761	}
	762	}
	763	}
	764
	765	/* set the new decompositions, forget the old ones */
	766	if(changedNFD) {
	767	if(lenNFD!=0) {
	768	if(lenNFD>norm->lenNFD) {
	769	s32=utm_allocN(utf32Mem, lenNFD);
	770	} else {
	771	s32=norm->nfd;
	772	}
	773	uprv_memcpy(s32, nfd, lenNFD*4);
	774	} else {
	775	s32=NULL;
	776	}
	777	norm->lenNFD=lenNFD;
	778	norm->nfd=s32;
	779	}
	780	if(changedNFKD) {
	781	if(lenNFKD!=0) {
	782	if(lenNFKD>norm->lenNFKD) {
	783	s32=utm_allocN(utf32Mem, lenNFKD);
	784	} else {
	785	s32=norm->nfkd;
	786	}
	787	uprv_memcpy(s32, nfkd, lenNFKD*4);
	788	} else {
	789	s32=NULL;
	790	}
	791	norm->lenNFKD=lenNFKD;
	792	norm->nfkd=s32;
	793	}
	794	}
	795
	796	/*
	797	* process the data for one code point listed in UnicodeData;
	798	* UnicodeData itself never maps a code point to both NFD and NFKD
	799	*/
	800	extern void
	801	storeNorm(uint32_t code, Norm *norm) {
	802	DecompSingle decompSingle;
	803	Norm *p;
	804
	805	if(DO_NOT_STORE(UGENNORM_STORE_COMPAT)) {
	806	/* ignore compatibility decomposition */
	807	norm->lenNFKD=0;
	808	}
	809
	810	/* copy existing derived normalization properties */
	811	p=createNorm(code);
	812	norm->qcFlags=p->qcFlags;
	813	norm->combiningFlags=p->combiningFlags;
	814	norm->fncIndex=p->fncIndex;
	815
	816	/* process the decomposition if there is one here */
	817	if((norm->lenNFD\|norm->lenNFKD)!=0) {
	818	/* decompose this one decomposition further, may generate two decompositions */
	819	decompStoreNewNF(code, norm);
	820
	821	/* has this code point been used in previous decompositions? */
	822	if(HAVE_SEEN(code)) {
	823	/* use this decomposition to decompose other decompositions further */
	824	decompSingle.c=code;
	825	decompSingle.norm=norm;
	826	enumTrie(decompWithSingleFn, &decompSingle);
	827	}
	828	}
	829
	830	/* store the data */
	831	uprv_memcpy(p, norm, sizeof(Norm));
	832	}
	833
	834	extern void
	835	setQCFlags(uint32_t code, uint8_t qcFlags) {
	836	if(DO_NOT_STORE(UGENNORM_STORE_COMPAT)) {
	837	/* ignore compatibility decomposition: unset the KC/KD flags */
	838	qcFlags&=~(_NORM_QC_NFKC\|_NORM_QC_NFKD);
	839
	840	/* set the KC/KD flags to the same values as the C/D flags */
	841	qcFlags\|=qcFlags<<1;
	842	}
	843	if(DO_NOT_STORE(UGENNORM_STORE_COMPOSITION)) {
	844	/* ignore composition data: unset the C/KC flags */
	845	qcFlags&=~(_NORM_QC_NFC\|_NORM_QC_NFKC);
	846
	847	/* set the C/KC flags to the same values as the D/KD flags */
	848	qcFlags\|=qcFlags>>2;
	849	}
	850
	851	createNorm(code)->qcFlags\|=qcFlags;
	852
	853	/* adjust the minimum code point for quick check no/maybe */
	854	if(code<0xffff) {
	855	if((qcFlags&_NORM_QC_NFC) && (uint16_t)code<indexes[_NORM_INDEX_MIN_NFC_NO_MAYBE]) {
	856	indexes[_NORM_INDEX_MIN_NFC_NO_MAYBE]=(uint16_t)code;
	857	}
	858	if((qcFlags&_NORM_QC_NFKC) && (uint16_t)code<indexes[_NORM_INDEX_MIN_NFKC_NO_MAYBE]) {
	859	indexes[_NORM_INDEX_MIN_NFKC_NO_MAYBE]=(uint16_t)code;
	860	}
	861	if((qcFlags&_NORM_QC_NFD) && (uint16_t)code<indexes[_NORM_INDEX_MIN_NFD_NO_MAYBE]) {
	862	indexes[_NORM_INDEX_MIN_NFD_NO_MAYBE]=(uint16_t)code;
	863	}
	864	if((qcFlags&_NORM_QC_NFKD) && (uint16_t)code<indexes[_NORM_INDEX_MIN_NFKD_NO_MAYBE]) {
	865	indexes[_NORM_INDEX_MIN_NFKD_NO_MAYBE]=(uint16_t)code;
	866	}
	867	}
	868
	869	if(qcFlags&_NORM_QC_NFD) {
	870	uset_add(nfdQCNoSet, (UChar32)code);
	871	}
	872	}
	873
	874	extern void
	875	setCompositionExclusion(uint32_t code) {
	876	if(DO_STORE(UGENNORM_STORE_COMPOSITION)) {
	877	createNorm(code)->combiningFlags\|=0x80;
	878	}
	879	}
	880
	881	static void
	882	setHangulJamoSpecials() {
	883	Norm *norm;
	884	uint32_t c, hangul;
	885
	886	/*
	887	* Hangul syllables are algorithmically decomposed into Jamos,
	888	* and Jamos are algorithmically composed into Hangul syllables.
	889	* The quick check flags are parsed, except for Hangul.
	890	*/
	891
	892	/* set Jamo L specials */
	893	hangul=0xac00;
	894	for(c=0x1100; c<=0x1112; ++c) {
	895	norm=createNorm(c);
	896	norm->specialTag=_NORM_EXTRA_INDEX_TOP+_NORM_EXTRA_JAMO_L;
	897	if(DO_STORE(UGENNORM_STORE_COMPOSITION)) {
	898	norm->combiningFlags=1;
	899	}
	900
	901	/* for each Jamo L create a set with its associated Hangul block */
	902	norm->canonStart=uset_open(hangul, hangul+21*28-1);
	903	hangul+=21*28;
	904	}
	905
	906	/* set Jamo V specials */
	907	for(c=0x1161; c<=0x1175; ++c) {
	908	norm=createNorm(c);
	909	norm->specialTag=_NORM_EXTRA_INDEX_TOP+_NORM_EXTRA_JAMO_V;
	910	if(DO_STORE(UGENNORM_STORE_COMPOSITION)) {
	911	norm->combiningFlags=2;
	912	}
	913	norm->unsafeStart=TRUE;
	914	}
	915
	916	/* set Jamo T specials */
	917	for(c=0x11a8; c<=0x11c2; ++c) {
	918	norm=createNorm(c);
	919	norm->specialTag=_NORM_EXTRA_INDEX_TOP+_NORM_EXTRA_JAMO_T;
	920	if(DO_STORE(UGENNORM_STORE_COMPOSITION)) {
	921	norm->combiningFlags=2;
	922	}
	923	norm->unsafeStart=TRUE;
	924	}
	925
	926	/* set Hangul specials, precompacted */
	927	norm=allocNorm();
	928	norm->specialTag=_NORM_EXTRA_INDEX_TOP+_NORM_EXTRA_HANGUL;
	929	if(DO_STORE(UGENNORM_STORE_COMPAT)) {
	930	norm->qcFlags=_NORM_QC_NFD\|_NORM_QC_NFKD;
	931	} else {
	932	norm->qcFlags=_NORM_QC_NFD;
	933	}
	934
	935	if(!utrie_setRange32(normTrie, 0xac00, 0xd7a4, (uint32_t)(norm-norms), TRUE)) {
	936	fprintf(stderr, "error: too many normalization entries (setting Hangul)\n");
	937	exit(U_BUFFER_OVERFLOW_ERROR);
	938	}
	939	}
	940
	941	/*
	942	* set FC-NFKC-Closure string
	943	* s contains the closure string; s[0]==length, s[1..length] is the actual string
	944	* may modify s[0]
	945	*/
	946	U_CFUNC void
	947	setFNC(uint32_t c, UChar *s) {
	948	uint16_t *p;
	949	int32_t length, i, count;
	950	UChar first;
	951
	952	if( DO_NOT_STORE(UGENNORM_STORE_COMPAT) \|\|
	953	DO_NOT_STORE(UGENNORM_STORE_COMPOSITION) \|\|
	954	DO_NOT_STORE(UGENNORM_STORE_AUX)
	955	) {
	956	return;
	957	}
	958
	959	count=utm_countItems(extraMem);
	960	length=s[0];
	961	first=s[1];
	962
	963	/* try to overlay single-unit strings with existing ones */
	964	if(length==1 && first<0xff00) {
	965	p=utm_getStart(extraMem);
	966	for(i=1; i<count; ++i) {
	967	if(first==p[i]) {
	968	break;
	969	}
	970	}
	971	} else {
	972	i=count;
	973	}
	974
	975	/* append the new string if it cannot be overlayed with an old one */
	976	if(i==count) {
	977	if(count>_NORM_AUX_MAX_FNC) {
	978	fprintf(stderr, "gennorm error: too many FNC strings\n");
	979	exit(U_INDEX_OUTOFBOUNDS_ERROR);
	980	}
	981
	982	/* prepend 0xffxx with xx==length */
	983	s[0]=(uint16_t)(0xff00+length);
	984	++length;
	985	p=(uint16_t *)utm_allocN(extraMem, length);
	986	uprv_memcpy(p, s, length*2);
	987
	988	/* update the top index in extraMem[0] */
	989	count+=length;
	990	((uint16_t *)utm_getStart(extraMem))[0]=(uint16_t)count;
	991	}
	992
	993	/* store the index to the string */
	994	createNorm(c)->fncIndex=i;
	995	}
	996
	997	/* build runtime structures ------------------------------------------------- */
	998
	999	/* canonically reorder a UTF-32 string; return { leadCC, trailCC } */
	1000	static uint16_t
	1001	reorderString(uint32_t *s, int32_t length) {
	1002	uint8_t ccs[40];
	1003	uint32_t c;
	1004	int32_t i, j;
	1005	uint8_t cc, prevCC;
	1006
	1007	if(length<=0) {
	1008	return 0;
	1009	}
	1010
	1011	for(i=0; i<length; ++i) {
	1012	/* get the i-th code point and its combining class */
	1013	c=s[i];
	1014	cc=getCCFromCP(c);
	1015	if(cc!=0 && i!=0) {
	1016	/* it is a combining mark, see if it needs to be moved back */
	1017	j=i;
	1018	do {
	1019	prevCC=ccs[j-1];
	1020	if(prevCC<=cc) {
	1021	break; /* found the right place */
	1022	}
	1023	/* move the previous code point here and go back */
	1024	s[j]=s[j-1];
	1025	ccs[j]=prevCC;
	1026	} while(--j!=0);
	1027	s[j]=c;
	1028	ccs[j]=cc;
	1029	} else {
	1030	/* just store the combining class */
	1031	ccs[i]=cc;
	1032	}
	1033	}
	1034
	1035	return (uint16_t)(((uint16_t)ccs[0]<<8)\|ccs[length-1]);
	1036	}
	1037
	1038	#if 0
	1039	static UBool combineAndQC[64]={ 0 };
	1040	#endif
	1041
	1042	/*
	1043	* canonically reorder the up to two decompositions
	1044	* and store the leading and trailing combining classes accordingly
	1045	*
	1046	* also process canonical decompositions for canonical closure
	1047	*/
	1048	static void
	1049	postParseFn(void context, uint32_t code, Norm norm) {
	1050	int32_t length;
	1051
	1052	/* canonically order the NFD */
	1053	length=norm->lenNFD;
	1054	if(length>0) {
	1055	norm->canonBothCCs=reorderString(norm->nfd, length);
	1056	}
	1057
	1058	/* canonically reorder the NFKD */
	1059	length=norm->lenNFKD;
	1060	if(length>0) {
	1061	norm->compatBothCCs=reorderString(norm->nfkd, length);
	1062	}
	1063
	1064	/* verify that code has a decomposition if and only if the quick check flags say "no" on NF(K)D */
	1065	if((norm->lenNFD!=0) != ((norm->qcFlags&_NORM_QC_NFD)!=0)) {
	1066	fprintf(stderr, "gennorm warning: U+%04lx has NFD[%d] but quick check 0x%02x\n", (long)code, norm->lenNFD, norm->qcFlags);
	1067	}
	1068	if(((norm->lenNFD\|norm->lenNFKD)!=0) != ((norm->qcFlags&(_NORM_QC_NFD\|_NORM_QC_NFKD))!=0)) {
	1069	fprintf(stderr, "gennorm warning: U+%04lx has NFD[%d] NFKD[%d] but quick check 0x%02x\n", (long)code, norm->lenNFD, norm->lenNFKD, norm->qcFlags);
	1070	}
	1071
	1072	/* see which combinations of combiningFlags and qcFlags are used for NFC/NFKC */
	1073	#if 0
	1074	combineAndQC[(norm->qcFlags&0x33)\|((norm->combiningFlags&3)<<2)]=1;
	1075	#endif
	1076
	1077	if(norm->combiningFlags&1) {
	1078	if(norm->udataCC!=0) {
	1079	/* illegal - data-derivable composition exclusion */
	1080	fprintf(stderr, "gennorm warning: U+%04lx combines forward but udataCC==%u\n", (long)code, norm->udataCC);
	1081	}
	1082	}
	1083	if(norm->combiningFlags&2) {
	1084	if((norm->qcFlags&0x11)==0) {
	1085	fprintf(stderr, "gennorm warning: U+%04lx combines backward but qcNF?C==0\n", (long)code);
	1086	}
	1087	#if 0
	1088	/* occurs sometimes, this one is ok (therefore #if 0) - still here for documentation */
	1089	if(norm->udataCC==0) {
	1090	printf("U+%04lx combines backward but udataCC==0\n", (long)code);
	1091	}
	1092	#endif
	1093	}
	1094	if((norm->combiningFlags&3)==3 && beVerbose) {
	1095	printf("U+%04lx combines both ways\n", (long)code);
	1096	}
	1097
	1098	/*
	1099	* process canonical decompositions for canonical closure
	1100	*
	1101	* in each canonical decomposition:
	1102	* add the current character (code) to the set of canonical starters of its norm->nfd[0]
	1103	* set the "unsafe starter" flag for each norm->nfd[1..]
	1104	*/
	1105	length=norm->lenNFD;
	1106	if(length>0) {
	1107	Norm *otherNorm;
	1108	UChar32 c;
	1109	int32_t i;
	1110
	1111	/* nfd[0].canonStart.add(code) */
	1112	c=norm->nfd[0];
	1113	otherNorm=createNorm(c);
	1114	if(otherNorm->canonStart==NULL) {
	1115	otherNorm->canonStart=uset_open(code, code);
	1116	if(otherNorm->canonStart==NULL) {
	1117	fprintf(stderr, "gennorm error: out of memory in uset_open()\n");
	1118	exit(U_MEMORY_ALLOCATION_ERROR);
	1119	}
	1120	} else {
	1121	uset_add(otherNorm->canonStart, code);
	1122	if(!uset_contains(otherNorm->canonStart, code)) {
	1123	fprintf(stderr, "gennorm error: uset_add(setOf(U+%4x), U+%4x)\n", (int)c, (int)code);
	1124	exit(U_INTERNAL_PROGRAM_ERROR);
	1125	}
	1126	}
	1127
	1128	/* for(i=1..length-1) nfd[i].unsafeStart=TRUE */
	1129	for(i=1; i<length; ++i) {
	1130	createNorm(norm->nfd[i])->unsafeStart=TRUE;
	1131	}
	1132	}
	1133	}
	1134
	1135	static uint32_t
	1136	make32BitNorm(Norm *norm) {
	1137	UChar extra[100];
	1138	const Norm *other;
	1139	uint32_t word;
	1140	int32_t i, length, beforeZero=0, count, start;
	1141
	1142	/*
	1143	* Check for assumptions:
	1144	*
	1145	* Test that if a "true starter" (cc==0 && NF*C_YES) decomposes,
	1146	* then the decomposition also begins with a true starter.
	1147	*/
	1148	if(norm->udataCC==0) {
	1149	/* this is a starter */
	1150	if((norm->qcFlags&_NORM_QC_NFC)==0 && norm->lenNFD>0) {
	1151	/* a "true" NFC starter with a canonical decomposition */
	1152	if( norm->canonBothCCs>=0x100 \|\| /* lead cc!=0 or */
	1153	((other=getNorm(norm->nfd[0]))!=NULL && (other->qcFlags&_NORM_QC_NFC)!=0) /* nfd[0] not NFC_YES */
	1154	) {
	1155	fprintf(stderr,
	1156	"error: true NFC starter canonical decomposition[%u] does not begin\n"
	1157	" with a true NFC starter: U+%04lx U+%04lx%s\n",
	1158	norm->lenNFD, (long)norm->nfd[0], (long)norm->nfd[1],
	1159	norm->lenNFD<=2 ? "" : " ...");
	1160	exit(U_INVALID_TABLE_FILE);
	1161	}
	1162	}
	1163
	1164	if((norm->qcFlags&_NORM_QC_NFKC)==0) {
	1165	if(norm->lenNFKD>0) {
	1166	/* a "true" NFKC starter with a compatibility decomposition */
	1167	if( norm->compatBothCCs>=0x100 \|\| /* lead cc!=0 or */
	1168	((other=getNorm(norm->nfkd[0]))!=NULL && (other->qcFlags&_NORM_QC_NFKC)!=0) /* nfkd[0] not NFKC_YES */
	1169	) {
	1170	fprintf(stderr,
	1171	"error: true NFKC starter compatibility decomposition[%u] does not begin\n"
	1172	" with a true NFKC starter: U+%04lx U+%04lx%s\n",
	1173	norm->lenNFKD, (long)norm->nfkd[0], (long)norm->nfkd[1],
	1174	norm->lenNFKD<=2 ? "" : " ...");
	1175	exit(U_INVALID_TABLE_FILE);
	1176	}
	1177	} else if(norm->lenNFD>0) {
	1178	/* a "true" NFKC starter with only a canonical decomposition */
	1179	if( norm->canonBothCCs>=0x100 \|\| /* lead cc!=0 or */
	1180	((other=getNorm(norm->nfd[0]))!=NULL && (other->qcFlags&_NORM_QC_NFKC)!=0) /* nfd[0] not NFKC_YES */
	1181	) {
	1182	fprintf(stderr,
	1183	"error: true NFKC starter canonical decomposition[%u] does not begin\n"
	1184	" with a true NFKC starter: U+%04lx U+%04lx%s\n",
	1185	norm->lenNFD, (long)norm->nfd[0], (long)norm->nfd[1],
	1186	norm->lenNFD<=2 ? "" : " ...");
	1187	exit(U_INVALID_TABLE_FILE);
	1188	}
	1189	}
	1190	}
	1191	}
	1192
	1193	/* reset the 32-bit word and set the quick check flags */
	1194	word=norm->qcFlags;
	1195
	1196	/* set the UnicodeData combining class */
	1197	word\|=(uint32_t)norm->udataCC<<_NORM_CC_SHIFT;
	1198
	1199	/* set the combining flag and index */
	1200	if(norm->combiningFlags&3) {
	1201	word\|=(uint32_t)(norm->combiningFlags&3)<<6;
	1202	}
	1203
	1204	/* set the combining index value into the extra data */
	1205	/* 0xffff: no combining index; 0..0x7fff: combining index */
	1206	if(norm->combiningIndex!=0xffff) {
	1207	extra[0]=norm->combiningIndex;
	1208	beforeZero=1;
	1209	}
	1210
	1211	count=beforeZero;
	1212
	1213	/* write the decompositions */
	1214	if((norm->lenNFD\|norm->lenNFKD)!=0) {
	1215	extra[count++]=0; /* set the pieces when available, into extra[beforeZero] */
	1216
	1217	length=norm->lenNFD;
	1218	if(length>0) {
	1219	if(norm->canonBothCCs!=0) {
	1220	extra[beforeZero]\|=0x80;
	1221	extra[count++]=norm->canonBothCCs;
	1222	}
	1223	start=count;
	1224	for(i=0; i<length; ++i) {
	1225	UTF_APPEND_CHAR_UNSAFE(extra, count, norm->nfd[i]);
	1226	}
	1227	extra[beforeZero]\|=(UChar)(count-start); /* set the decomp length as the number of UTF-16 code units */
	1228	}
	1229
	1230	length=norm->lenNFKD;
	1231	if(length>0) {
	1232	if(norm->compatBothCCs!=0) {
	1233	extra[beforeZero]\|=0x8000;
	1234	extra[count++]=norm->compatBothCCs;
	1235	}
	1236	start=count;
	1237	for(i=0; i<length; ++i) {
	1238	UTF_APPEND_CHAR_UNSAFE(extra, count, norm->nfkd[i]);
	1239	}
	1240	extra[beforeZero]\|=(UChar)((count-start)<<8); /* set the decomp length as the number of UTF-16 code units */
	1241	}
	1242	}
	1243
	1244	/* allocate and copy the extra data */
	1245	if(count!=0) {
	1246	UChar *p;
	1247
	1248	if(norm->specialTag!=0) {
	1249	fprintf(stderr, "error: gennorm - illegal to have both extra data and a special tag (0x%x)\n", norm->specialTag);
	1250	exit(U_ILLEGAL_ARGUMENT_ERROR);
	1251	}
	1252
	1253	p=(UChar *)utm_allocN(extraMem, count);
	1254	uprv_memcpy(p, extra, count*2);
	1255
	1256	/* set the extra index, offset by beforeZero */
	1257	word\|=(uint32_t)(beforeZero+(p-(UChar *)utm_getStart(extraMem)))<<_NORM_EXTRA_SHIFT;
	1258	} else if(norm->specialTag!=0) {
	1259	/* set a special tag instead of an extra index */
	1260	word\|=(uint32_t)norm->specialTag<<_NORM_EXTRA_SHIFT;
	1261	}
	1262
	1263	return word;
	1264	}
	1265
	1266	/* turn all Norm structs into corresponding 32-bit norm values */
	1267	static void
	1268	makeAll32() {
	1269	uint32_t *pNormData;
	1270	uint32_t n;
	1271	int32_t i, normLength, count;
	1272
	1273	count=(int32_t)utm_countItems(normMem);
	1274	for(i=0; i<count; ++i) {
	1275	norms[i].value32=make32BitNorm(norms+i);
	1276	}
	1277
	1278	pNormData=utrie_getData(norm32Trie, &normLength);
	1279
	1280	count=0; /* count is now just used for debugging */
	1281	for(i=0; i<normLength; ++i) {
	1282	n=pNormData[i];
	1283	if(0!=(pNormData[i]=norms[n].value32)) {
	1284	++count;
	1285	}
	1286	}
	1287	}
	1288
	1289	/*
	1290	* extract all Norm.canonBothCCs into the FCD table
	1291	* set 32-bit values to use the common fold and compact functions
	1292	*/
	1293	static void
	1294	makeFCD() {
	1295	uint32_t *pFCDData;
	1296	uint32_t n;
	1297	int32_t i, count, fcdLength;
	1298	uint16_t bothCCs;
	1299
	1300	count=utm_countItems(normMem);
	1301	for(i=0; i<count; ++i) {
	1302	bothCCs=norms[i].canonBothCCs;
	1303	if(bothCCs==0) {
	1304	/* if there are no decomposition cc's then use the udataCC twice */
	1305	bothCCs=norms[i].udataCC;
	1306	bothCCs\|=bothCCs<<8;
	1307	}
	1308	norms[i].value32=bothCCs;
	1309	}
	1310
	1311	pFCDData=utrie_getData(fcdTrie, &fcdLength);
	1312
	1313	for(i=0; i<fcdLength; ++i) {
	1314	n=pFCDData[i];
	1315	pFCDData[i]=norms[n].value32;
	1316	}
	1317	}
	1318
	1319	/**
	1320	* If the given set contains exactly one character, then return it.
	1321	* Otherwise return -1.
	1322	*/
	1323	static int32_t
	1324	usetContainsOne(const USet* set) {
	1325	if(uset_getItemCount(set)==1) {
	1326	/* there is a single item (a single range) */
	1327	UChar32 start, end;
	1328	UErrorCode ec=U_ZERO_ERROR;
	1329	int32_t len=uset_getItem(set, 0, &start, &end, NULL, 0, &ec);
	1330	if (len==0 && start==end) { /* a range (len==0) with a single code point */
	1331	return start;
	1332	}
	1333	}
	1334	return -1;
	1335	}
	1336
	1337	static void
	1338	makeCanonSetFn(void context, uint32_t code, Norm norm) {
	1339	if(norm->canonStart!=NULL && !uset_isEmpty(norm->canonStart)) {
	1340	uint16_t *table;
	1341	int32_t c, tableLength;
	1342	UErrorCode errorCode=U_ZERO_ERROR;
	1343
	1344	/* does the set contain exactly one code point? */
	1345	c=usetContainsOne(norm->canonStart);
	1346
	1347	/* add an entry to the BMP or supplementary search table */
	1348	if(code<=0xffff) {
	1349	table=canonStartSets+_NORM_MAX_CANON_SETS;
	1350	tableLength=canonStartSets[_NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH];
	1351
	1352	table[tableLength++]=(uint16_t)code;
	1353
	1354	if(c>=0 && c<=0xffff && (c&_NORM_CANON_SET_BMP_MASK)!=_NORM_CANON_SET_BMP_IS_INDEX) {
	1355	/* single-code point BMP result for BMP code point */
	1356	table[tableLength++]=(uint16_t)c;
	1357	} else {
	1358	table[tableLength++]=(uint16_t)(_NORM_CANON_SET_BMP_IS_INDEX\|canonStartSetsTop);
	1359	c=-1;
	1360	}
	1361	canonStartSets[_NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH]=(uint16_t)tableLength;
	1362	} else {
	1363	table=canonStartSets+_NORM_MAX_CANON_SETS+_NORM_MAX_SET_SEARCH_TABLE_LENGTH;
	1364	tableLength=canonStartSets[_NORM_SET_INDEX_CANON_SUPP_TABLE_LENGTH];
	1365
	1366	table[tableLength++]=(uint16_t)(code>>16);
	1367	table[tableLength++]=(uint16_t)code;
	1368
	1369	if(c>=0) {
	1370	/* single-code point result for supplementary code point */
	1371	table[tableLength-2]\|=(uint16_t)(0x8000\|((c>>8)&0x1f00));
	1372	table[tableLength++]=(uint16_t)c;
	1373	} else {
	1374	table[tableLength++]=(uint16_t)canonStartSetsTop;
	1375	}
	1376	canonStartSets[_NORM_SET_INDEX_CANON_SUPP_TABLE_LENGTH]=(uint16_t)tableLength;
	1377	}
	1378
	1379	if(c<0) {
	1380	/* write a USerializedSet */
	1381	++canonSetsCount;
	1382	canonStartSetsTop+=
	1383	uset_serialize(norm->canonStart,
	1384	canonStartSets+canonStartSetsTop,
	1385	_NORM_MAX_CANON_SETS-canonStartSetsTop,
	1386	&errorCode);
	1387	}
	1388	canonStartSets[_NORM_SET_INDEX_CANON_SETS_LENGTH]=(uint16_t)canonStartSetsTop;
	1389
	1390	if(U_FAILURE(errorCode)) {
	1391	fprintf(stderr, "gennorm error: uset_serialize()->%s (canonStartSetsTop=%d)\n", u_errorName(errorCode), (int)canonStartSetsTop);
	1392	exit(errorCode);
	1393	}
	1394	if(tableLength>_NORM_MAX_SET_SEARCH_TABLE_LENGTH) {
	1395	fprintf(stderr, "gennorm error: search table for canonical starter sets too long\n");
	1396	exit(U_INDEX_OUTOFBOUNDS_ERROR);
	1397	}
	1398	}
	1399	}
	1400
	1401	/* for getSkippableFlags ---------------------------------------------------- */
	1402
	1403	/* combine the lead and trail code points; return <0 if they do not combine */
	1404	static int32_t
	1405	combine(uint32_t lead, uint32_t trail) {
	1406	CombiningTriple *triples;
	1407	uint32_t i, count;
	1408
	1409	/* search for all triples with c as lead code point */
	1410	triples=utm_getStart(combiningTriplesMem);
	1411	count=utm_countItems(combiningTriplesMem);
	1412
	1413	/* triples are not sorted by code point but for each lead CP there is one contiguous block */
	1414	for(i=0; i<count && lead!=triples[i].lead; ++i) {}
	1415
	1416	/* check each triple for this code point */
	1417	for(; i<count && lead==triples[i].lead; ++i) {
	1418	if(trail==triples[i].trail) {
	1419	return (int32_t)triples[i].combined;
	1420	}
	1421	}
	1422
	1423	return -1;
	1424	}
	1425
	1426	/*
	1427	* Starting from the canonical decomposition s[0..length[ of a single code point,
	1428	* is the code point c consumed in an NFC/FCC recomposition?
	1429	*
	1430	* No need to handle discontiguous composition because that would not consume some
	1431	* intermediate character, so would not compose back to the original character.
	1432	* See comments in canChangeWithFollowing().
	1433	*
	1434	* No need to compose beyond where c canonically orders because if it is consumed
	1435	* then the result differs from the original anyway.
	1436	*
	1437	* Possible optimization:
	1438	* - Verify that there are no cases of the same combining mark stacking twice.
	1439	* - return FALSE right away if c inserts after a copy of itself
	1440	* without attempting to recompose; will happen because each mark in
	1441	* the decomposition will be enumerated and passed in as c.
	1442	* More complicated and fragile though than it is already.
	1443	*
	1444	* markus 2002nov04
	1445	*/
	1446	static UBool
	1447	doesComposeConsume(const uint32_t *s, int32_t length, uint32_t c, uint8_t cc) {
	1448	int32_t starter, i;
	1449
	1450	/* ignore trailing characters where cc<prevCC */
	1451	while(length>1 && cc<getCCFromCP(s[length-1])) {
	1452	--length;
	1453	}
	1454
	1455	/* start consuming/combining from the beginning */
	1456	starter=(int32_t)s[0];
	1457	for(i=1; i<length; ++i) {
	1458	starter=combine((uint32_t)starter, s[i]);
	1459	if(starter<0) {
	1460	fprintf(stderr, "error: unable to consume normal decomposition in doesComposeConsume(<%04x, %04x, ...>[%d], U+%04x, %u)\n",
	1461	(int)s[0], (int)s[1], (int)length, (int)c, cc);
	1462	exit(U_INTERNAL_PROGRAM_ERROR);
	1463	}
	1464	}
	1465
	1466	/* try to combine/consume c, return TRUE if it is consumed */
	1467	return combine((uint32_t)starter, c)>=0;
	1468	}
	1469
	1470	/* does the starter s[0] combine forward with another char that is below trailCC? */
	1471	static UBool
	1472	canChangeWithFollowing(const uint32_t *s, int32_t length, uint8_t trailCC) {
	1473	if(trailCC<=1) {
	1474	/* no character will combine ahead of the trailing char of the decomposition */
	1475	return FALSE;
	1476	}
	1477
	1478	/*
	1479	* We are only checking skippable condition (f).
	1480	* Therefore, the original character does not have quick check flag NFC_NO (c),
	1481	* i.e., the decomposition recomposes completely back into the original code point.
	1482	* So s[0] must be a true starter with cc==0 and
	1483	* combining with following code points.
	1484	*
	1485	* Similarly, length==1 is not possible because that would be a singleton
	1486	* decomposition which is marked with NFC_NO and does not pass (c).
	1487	*
	1488	* Only a character with cc<trailCC can change the composition.
	1489	* Reason: A char with cc>=trailCC would order after decomposition s[],
	1490	* composition would consume all of the decomposition, and here we know that
	1491	* the original char passed check d), i.e., it does not combine forward,
	1492	* therefore does not combine with anything after the decomposition is consumed.
	1493	*
	1494	* Now see if there is a character that
	1495	* 1. combines backward
	1496	* 2. has cc<trailCC
	1497	* 3. is consumed in recomposition
	1498	*
	1499	* length==2 is simple:
	1500	*
	1501	* Characters that fulfill these conditions are exactly the ones that combine directly
	1502	* with the starter c==s[0] because there is no intervening character after
	1503	* reordering.
	1504	* We can just enumerate all chars with which c combines (they all pass 1. and 3.)
	1505	* and see if one has cc<trailCC (passes 2.).
	1506	*
	1507	* length>2 is a little harder:
	1508	*
	1509	* Since we will get different starters during recomposition, we need to
	1510	* enumerate each backward-combining character (1.)
	1511	* with cc<trailCC (2.) and
	1512	* see if it gets consumed in recomposition. (3.)
	1513	* No need to enumerate both-ways combining characters because they must have cc==0.
	1514	*/
	1515	if(length==2) {
	1516	/* enumerate all chars that combine with this one and check their cc */
	1517	CombiningTriple *triples;
	1518	uint32_t c, i, count;
	1519	uint8_t cc;
	1520
	1521	/* search for all triples with c as lead code point */
	1522	triples=utm_getStart(combiningTriplesMem);
	1523	count=utm_countItems(combiningTriplesMem);
	1524	c=s[0];
	1525
	1526	/* triples are not sorted by code point but for each lead CP there is one contiguous block */
	1527	for(i=0; i<count && c!=triples[i].lead; ++i) {}
	1528
	1529	/* check each triple for this code point */
	1530	for(; i<count && c==triples[i].lead; ++i) {
	1531	cc=getCCFromCP(triples[i].trail);
	1532	if(cc>0 && cc<trailCC) {
	1533	/* this trail code point combines with c and has cc<trailCC */
	1534	return TRUE;
	1535	}
	1536	}
	1537	} else {
	1538	/* enumerate all chars that combine backward */
	1539	uint32_t c2;
	1540	uint16_t i;
	1541	uint8_t cc;
	1542
	1543	for(i=combineBothTop; i<combineBackTop; ++i) {
	1544	c2=combiningCPs[i]&0xffffff;
	1545	cc=getCCFromCP(c2);
	1546	/* pass in length-1 because we already know that c2 will insert before the last character with trailCC */
	1547	if(cc>0 && cc<trailCC && doesComposeConsume(s, length-1, c2, cc)) {
	1548	return TRUE;
	1549	}
	1550	}
	1551	}
	1552
	1553	/* this decomposition is not modified by any appended character */
	1554	return FALSE;
	1555	}
	1556
	1557	/* see unormimp.h for details on NFC Skippable flags /
	1558	static uint32_t
	1559	getSkippableFlags(const Norm *norm) {
	1560	/* ignore NFD skippable properties because they are covered by norm32, test at runtime /
	1561
	1562	/* ignore Hangul, test those at runtime (LV Hangul are not skippable) */
	1563	if(norm->specialTag==_NORM_EXTRA_INDEX_TOP+_NORM_EXTRA_HANGUL) {
	1564	return 0;
	1565	}
	1566
	1567	/* ### TODO check other data generation functions whether they should & do ignore Hangul/Jamo specials */
	1568
	1569	/*
	1570	* Note:
	1571	* This function returns a non-zero flag only if (a)..(e) indicate skippable but (f) does not.
	1572	*
	1573	* This means that (a)..(e) must always be derived from the runtime norm32 value,
	1574	* and (f) be checked from the auxTrie if the character is skippable per (a)..(e),
	1575	* the form is NF*C and there is a canonical decomposition (NFD_NO).
	1576	*
	1577	* (a) unassigned code points get "not skippable"==false because they
	1578	* don't have a Norm struct so they won't get here
	1579	*/
	1580
	1581	/* (b) not skippable if cc!=0 */
	1582	if(norm->udataCC!=0) {
	1583	return 0; /* non-zero flag for (f) only */
	1584	}
	1585
	1586	/*
	1587	* not NFC_Skippable if
	1588	* (c) quick check flag == NO or
	1589	* (d) combines forward or
	1590	* (e) combines back or
	1591	* (f) can change if another character is added
	1592	*
	1593	* for (f):
	1594	* For NF*C: Get corresponding decomposition, get its last starter (cc==0),
	1595	* check its composition list,
	1596	* see if any of the second code points in the list
	1597	* has cc less than the trailCC of the decomposition.
	1598	*
	1599	* For FCC: Test at runtime if the decomposition has a trailCC>1
	1600	* -> there are characters with cc==1, they would order before the trail char
	1601	* and prevent contiguous combination with the trail char.
	1602	*/
	1603	if( (norm->qcFlags&(_NORM_QC_NFC&_NORM_QC_ANY_NO))!=0 \|\|
	1604	(norm->combiningFlags&3)!=0) {
	1605	return 0; /* non-zero flag for (f) only */
	1606	}
	1607	if(norm->lenNFD!=0 && canChangeWithFollowing(norm->nfd, norm->lenNFD, (uint8_t)norm->canonBothCCs)) {
	1608	return _NORM_AUX_NFC_SKIP_F_MASK;
	1609	}
	1610
	1611	return 0; /* skippable */
	1612	}
	1613
	1614	static void
	1615	makeAux() {
	1616	Norm *norm;
	1617	uint32_t *pData;
	1618	int32_t i, length;
	1619
	1620	pData=utrie_getData(auxTrie, &length);
	1621
	1622	for(i=0; i<length; ++i) {
	1623	norm=norms+pData[i];
	1624	/*
	1625	* 16-bit auxiliary normalization properties
	1626	* see unormimp.h
	1627	*/
	1628	pData[i]=
	1629	((uint32_t)(norm->combiningFlags&0x80)<<(_NORM_AUX_COMP_EX_SHIFT-7))\|
	1630	(uint32_t)norm->fncIndex;
	1631
	1632	if(norm->unsafeStart \|\| norm->udataCC!=0) {
	1633	pData[i]\|=_NORM_AUX_UNSAFE_MASK;
	1634	}
	1635
	1636	pData[i]\|=getSkippableFlags(norm);
	1637	}
	1638	}
	1639
	1640	/* folding value for normalization: just store the offset (16 bits) if there is any non-0 entry */
	1641	static uint32_t U_CALLCONV
	1642	getFoldedNormValue(UNewTrie *trie, UChar32 start, int32_t offset) {
	1643	uint32_t value, leadNorm32=0;
	1644	UChar32 limit;
	1645	UBool inBlockZero;
	1646
	1647	limit=start+0x400;
	1648	while(start<limit) {
	1649	value=utrie_get32(trie, start, &inBlockZero);
	1650	if(inBlockZero) {
	1651	start+=UTRIE_DATA_BLOCK_LENGTH;
	1652	} else {
	1653	if(value!=0) {
	1654	leadNorm32\|=value;
	1655	}
	1656	++start;
	1657	}
	1658	}
	1659
	1660	/* turn multi-bit fields into the worst-case value */
	1661	if(leadNorm32&_NORM_CC_MASK) {
	1662	leadNorm32\|=_NORM_CC_MASK;
	1663	}
	1664
	1665	/* clean up unnecessarily ored bit fields */
	1666	leadNorm32&=~((uint32_t)0xffffffff<<_NORM_EXTRA_SHIFT);
	1667
	1668	if(leadNorm32==0) {
	1669	/* nothing to do (only composition exclusions?) */
	1670	return 0;
	1671	}
	1672
	1673	/* add the extra surrogate index, offset by the BMP top, for the new stage 1 location */
	1674	leadNorm32\|=(
	1675	(uint32_t)_NORM_EXTRA_INDEX_TOP+
	1676	(uint32_t)((offset-UTRIE_BMP_INDEX_LENGTH)>>UTRIE_SURROGATE_BLOCK_BITS)
	1677	)<<_NORM_EXTRA_SHIFT;
	1678
	1679	return leadNorm32;
	1680	}
	1681
	1682	/* folding value for FCD: use default function (just store the offset (16 bits) if there is any non-0 entry) */
	1683
	1684	/*
	1685	* folding value for auxiliary data:
	1686	* store the non-zero offset in bits 9..0 (FNC bits)
	1687	* if there is any non-0 entry;
	1688	* "or" [verb!] together data bits 15..10 of all of the 1024 supplementary code points
	1689	*/
	1690	static uint32_t U_CALLCONV
	1691	getFoldedAuxValue(UNewTrie *trie, UChar32 start, int32_t offset) {
	1692	uint32_t value, oredValues;
	1693	UChar32 limit;
	1694	UBool inBlockZero;
	1695
	1696	oredValues=0;
	1697	limit=start+0x400;
	1698	while(start<limit) {
	1699	value=utrie_get32(trie, start, &inBlockZero);
	1700	if(inBlockZero) {
	1701	start+=UTRIE_DATA_BLOCK_LENGTH;
	1702	} else {
	1703	oredValues\|=value;
	1704	++start;
	1705	}
	1706	}
	1707
	1708	if(oredValues!=0) {
	1709	/* move the 10 significant offset bits into bits 9..0 */
	1710	offset>>=UTRIE_SURROGATE_BLOCK_BITS;
	1711	if(offset>_NORM_AUX_FNC_MASK) {
	1712	fprintf(stderr, "gennorm error: folding offset too large (auxTrie)\n");
	1713	exit(U_INDEX_OUTOFBOUNDS_ERROR);
	1714	}
	1715	return (uint32_t)offset\|(oredValues&~_NORM_AUX_FNC_MASK);
	1716	} else {
	1717	return 0;
	1718	}
	1719	}
	1720
	1721	extern void
	1722	processData() {
	1723	#if 0
	1724	uint16_t i;
	1725	#endif
	1726
	1727	processCombining();
	1728
	1729	/* canonically reorder decompositions and assign combining classes for decompositions */
	1730	enumTrie(postParseFn, NULL);
	1731
	1732	#if 0
	1733	for(i=1; i<64; ++i) {
	1734	if(combineAndQC[i]) {
	1735	printf("combiningFlags==0x%02x qcFlags(NF?C)==0x%02x\n", (i&0xc)>>2, i&0x33);
	1736	}
	1737	}
	1738	#endif
	1739
	1740	/* add hangul/jamo specials */
	1741	setHangulJamoSpecials();
	1742
	1743	/* set this value; will be updated as makeCanonSetFn() adds sets (if there are any, see gStoreFlags) */
	1744	canonStartSets[_NORM_SET_INDEX_CANON_SETS_LENGTH]=(uint16_t)canonStartSetsTop;
	1745
	1746	/* store search tables and USerializedSets for canonical starters (after Hangul/Jamo specials!) */
	1747	if(DO_STORE(UGENNORM_STORE_AUX) && DO_STORE(UGENNORM_STORE_COMPOSITION)) {
	1748	enumTrie(makeCanonSetFn, NULL);
	1749	}
	1750
	1751	/* clone the normalization builder trie to make the final data tries */
	1752	if( NULL==utrie_clone(norm32Trie, normTrie, NULL, 0) \|\|
	1753	NULL==utrie_clone(fcdTrie, normTrie, NULL, 0) \|\|
	1754	NULL==utrie_clone(auxTrie, normTrie, NULL, 0)
	1755	) {
	1756	fprintf(stderr, "error: unable to clone the normalization trie\n");
	1757	exit(U_MEMORY_ALLOCATION_ERROR);
	1758	}
	1759
	1760	/* --- finalize data for quick checks & normalization --- */
	1761
	1762	/* turn the Norm structs (stage2, norms) into 32-bit data words */
	1763	makeAll32();
	1764
	1765	/* --- finalize data for FCD checks --- */
	1766
	1767	/* FCD data: take Norm.canonBothCCs and store them in the FCD table */
	1768	makeFCD();
	1769
	1770	/* --- finalize auxiliary normalization data --- */
	1771	makeAux();
	1772
	1773	if(beVerbose) {
	1774	#if 0
	1775	printf("number of stage 2 entries: %ld\n", stage2Mem->index);
	1776	printf("size of stage 1 (BMP) & 2 (uncompacted) + extra data: %ld bytes\n", _NORM_STAGE_1_BMP_COUNT2+stage2Mem->index4+extraMem->index*2);
	1777	#endif
	1778	printf("combining CPs tops: fwd %u both %u back %u\n", combineFwdTop, combineBothTop, combineBackTop);
	1779	printf("combining table count: %u\n", combiningTableTop);
	1780	}
	1781	}
	1782
	1783	#endif /* #if !UCONFIG_NO_NORMALIZATION */
	1784
	1785	extern void
	1786	generateData(const char *dataDir, UBool csource) {
	1787	static uint8_t normTrieBlock[100000], fcdTrieBlock[100000], auxTrieBlock[100000];
	1788
	1789	UNewDataMemory *pData;
	1790	UErrorCode errorCode=U_ZERO_ERROR;
	1791	int32_t size, dataLength;
	1792
	1793	#if UCONFIG_NO_NORMALIZATION
	1794
	1795	size=0;
	1796
	1797	#else
	1798
	1799	U_STRING_DECL(nxCJKCompatPattern, "[:Ideographic:]", 15);
	1800	U_STRING_DECL(nxUnicode32Pattern, "[:^Age=3.2:]", 12);
	1801	USet *set;
	1802	int32_t normTrieSize, fcdTrieSize, auxTrieSize;
	1803
	1804	normTrieSize=utrie_serialize(norm32Trie, normTrieBlock, sizeof(normTrieBlock), getFoldedNormValue, FALSE, &errorCode);
	1805	if(U_FAILURE(errorCode)) {
	1806	fprintf(stderr, "error: utrie_serialize(normalization properties) failed, %s\n", u_errorName(errorCode));
	1807	exit(errorCode);
	1808	}
	1809
	1810	if(DO_STORE(UGENNORM_STORE_FCD)) {
	1811	fcdTrieSize=utrie_serialize(fcdTrie, fcdTrieBlock, sizeof(fcdTrieBlock), NULL, TRUE, &errorCode);
	1812	if(U_FAILURE(errorCode)) {
	1813	fprintf(stderr, "error: utrie_serialize(FCD data) failed, %s\n", u_errorName(errorCode));
	1814	exit(errorCode);
	1815	}
	1816	} else {
	1817	fcdTrieSize=0;
	1818	}
	1819
	1820	if(DO_STORE(UGENNORM_STORE_AUX)) {
	1821	auxTrieSize=utrie_serialize(auxTrie, auxTrieBlock, sizeof(auxTrieBlock), getFoldedAuxValue, TRUE, &errorCode);
	1822	if(U_FAILURE(errorCode)) {
	1823	fprintf(stderr, "error: utrie_serialize(auxiliary data) failed, %s\n", u_errorName(errorCode));
	1824	exit(errorCode);
	1825	}
	1826	} else {
	1827	auxTrieSize=0;
	1828	}
	1829
	1830	/* move the parts of canonStartSets[] together into a contiguous block */
	1831	if( canonStartSetsTop<_NORM_MAX_CANON_SETS &&
	1832	canonStartSets[_NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH]!=0
	1833	) {
	1834	uprv_memmove(canonStartSets+canonStartSetsTop,
	1835	canonStartSets+_NORM_MAX_CANON_SETS,
	1836	canonStartSets[_NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH]*2);
	1837	}
	1838	canonStartSetsTop+=canonStartSets[_NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH];
	1839
	1840	if( canonStartSetsTop<(_NORM_MAX_CANON_SETS+_NORM_MAX_SET_SEARCH_TABLE_LENGTH) &&
	1841	canonStartSets[_NORM_SET_INDEX_CANON_SUPP_TABLE_LENGTH]!=0
	1842	) {
	1843	uprv_memmove(canonStartSets+canonStartSetsTop,
	1844	canonStartSets+_NORM_MAX_CANON_SETS+_NORM_MAX_SET_SEARCH_TABLE_LENGTH,
	1845	canonStartSets[_NORM_SET_INDEX_CANON_SUPP_TABLE_LENGTH]*2);
	1846	}
	1847	canonStartSetsTop+=canonStartSets[_NORM_SET_INDEX_CANON_SUPP_TABLE_LENGTH];
	1848
	1849	/* create the normalization exclusion sets */
	1850	/*
	1851	* nxCJKCompatPattern should be [[:Ideographic:]&[:NFD_QC=No:]]
	1852	* but we cannot use NFD_QC from the pattern because that would require
	1853	* unorm.icu which we are just going to generate.
	1854	* Therefore we have manually collected nfdQCNoSet and intersect Ideographic
	1855	* with that.
	1856	*/
	1857	U_STRING_INIT(nxCJKCompatPattern, "[:Ideographic:]", 15);
	1858	U_STRING_INIT(nxUnicode32Pattern, "[:^Age=3.2:]", 12);
	1859
	1860	canonStartSets[_NORM_SET_INDEX_NX_CJK_COMPAT_OFFSET]=canonStartSetsTop;
	1861	set=uset_openPattern(nxCJKCompatPattern, -1, &errorCode);
	1862	if(U_FAILURE(errorCode)) {
	1863	fprintf(stderr, "error: uset_openPattern([:Ideographic:]&[:NFD_QC=No:]) failed, %s\n", u_errorName(errorCode));
	1864	exit(errorCode);
	1865	}
	1866	uset_retainAll(set, nfdQCNoSet);
	1867	if(DO_NOT_STORE(UGENNORM_STORE_EXCLUSIONS)) {
	1868	uset_clear(set);
	1869	}
	1870	canonStartSetsTop+=uset_serialize(set, canonStartSets+canonStartSetsTop, LENGTHOF(canonStartSets)-canonStartSetsTop, &errorCode);
	1871	if(U_FAILURE(errorCode)) {
	1872	fprintf(stderr, "error: uset_serialize([:Ideographic:]&[:NFD_QC=No:]) failed, %s\n", u_errorName(errorCode));
	1873	exit(errorCode);
	1874	}
	1875	uset_close(set);
	1876
	1877	canonStartSets[_NORM_SET_INDEX_NX_UNICODE32_OFFSET]=canonStartSetsTop;
	1878	set=uset_openPattern(nxUnicode32Pattern, -1, &errorCode);
	1879	if(U_FAILURE(errorCode)) {
	1880	fprintf(stderr, "error: uset_openPattern([:^Age=3.2:]) failed, %s\n", u_errorName(errorCode));
	1881	exit(errorCode);
	1882	}
	1883	if(DO_NOT_STORE(UGENNORM_STORE_EXCLUSIONS)) {
	1884	uset_clear(set);
	1885	}
	1886	canonStartSetsTop+=uset_serialize(set, canonStartSets+canonStartSetsTop, LENGTHOF(canonStartSets)-canonStartSetsTop, &errorCode);
	1887	if(U_FAILURE(errorCode)) {
	1888	fprintf(stderr, "error: uset_serialize([:^Age=3.2:]) failed, %s\n", u_errorName(errorCode));
	1889	exit(errorCode);
	1890	}
	1891	uset_close(set);
	1892
	1893	canonStartSets[_NORM_SET_INDEX_NX_RESERVED_OFFSET]=canonStartSetsTop;
	1894
	1895	/* make sure that the FCD trie is 4-aligned */
	1896	if((utm_countItems(extraMem)+combiningTableTop)&1) {
	1897	combiningTable[combiningTableTop++]=0x1234; /* add one 16-bit word for an even number */
	1898	}
	1899
	1900	/* pad canonStartSets to 4-alignment, too */
	1901	if(canonStartSetsTop&1) {
	1902	canonStartSets[canonStartSetsTop++]=0x1235;
	1903	}
	1904
	1905	size=
	1906	_NORM_INDEX_TOP*4+
	1907	normTrieSize+
	1908	utm_countItems(extraMem)*2+
	1909	combiningTableTop*2+
	1910	fcdTrieSize+
	1911	auxTrieSize+
	1912	canonStartSetsTop*2;
	1913
	1914	if(beVerbose) {
	1915	printf("size of normalization trie %5u bytes\n", (int)normTrieSize);
	1916	printf("size of 16-bit extra memory %5u UChars/uint16_t\n", (int)utm_countItems(extraMem));
	1917	printf(" of that: FC_NFKC_Closure size %5u UChars/uint16_t\n", ((uint16_t *)utm_getStart(extraMem))[0]);
	1918	printf("size of combining table %5u uint16_t\n", combiningTableTop);
	1919	printf("size of FCD trie %5u bytes\n", (int)fcdTrieSize);
	1920	printf("size of auxiliary trie %5u bytes\n", (int)auxTrieSize);
	1921	printf("size of canonStartSets[] %5u uint16_t\n", (int)canonStartSetsTop);
	1922	printf(" number of indexes %5u uint16_t\n", _NORM_SET_INDEX_TOP);
	1923	printf(" size of sets %5u uint16_t\n", canonStartSets[_NORM_SET_INDEX_CANON_SETS_LENGTH]-_NORM_SET_INDEX_TOP);
	1924	printf(" number of sets %5d\n", (int)canonSetsCount);
	1925	printf(" size of BMP search table %5u uint16_t\n", canonStartSets[_NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH]);
	1926	printf(" size of supplementary search table %5u uint16_t\n", canonStartSets[_NORM_SET_INDEX_CANON_SUPP_TABLE_LENGTH]);
	1927	printf(" length of exclusion sets %5u uint16_t\n", canonStartSets[_NORM_SET_INDEX_NX_RESERVED_OFFSET]-canonStartSets[_NORM_SET_INDEX_NX_CJK_COMPAT_OFFSET]);
	1928	printf("size of " U_ICUDATA_NAME "_" DATA_NAME "." DATA_TYPE " contents: %ld bytes\n", (long)size);
	1929	}
	1930
	1931	indexes[_NORM_INDEX_TRIE_SIZE]=normTrieSize;
	1932	indexes[_NORM_INDEX_UCHAR_COUNT]=(uint16_t)utm_countItems(extraMem);
	1933
	1934	indexes[_NORM_INDEX_COMBINE_DATA_COUNT]=combiningTableTop;
	1935	indexes[_NORM_INDEX_COMBINE_FWD_COUNT]=combineFwdTop;
	1936	indexes[_NORM_INDEX_COMBINE_BOTH_COUNT]=(uint16_t)(combineBothTop-combineFwdTop);
	1937	indexes[_NORM_INDEX_COMBINE_BACK_COUNT]=(uint16_t)(combineBackTop-combineBothTop);
	1938
	1939	/* the quick check minimum code points are already set */
	1940
	1941	indexes[_NORM_INDEX_FCD_TRIE_SIZE]=fcdTrieSize;
	1942	indexes[_NORM_INDEX_AUX_TRIE_SIZE]=auxTrieSize;
	1943	indexes[_NORM_INDEX_CANON_SET_COUNT]=canonStartSetsTop;
	1944
	1945	#endif
	1946
	1947	if(csource) {
	1948	#if UCONFIG_NO_NORMALIZATION
	1949	/* no csource for dummy mode..? */
	1950	fprintf(stderr, "gennorm error: UCONFIG_NO_NORMALIZATION is on in csource mode.\n");
	1951	exit(1);
	1952	#else
	1953	/* write .c file for hardcoded data */
	1954	UTrie normTrie2={ NULL }, fcdTrie2={ NULL }, auxTrie2={ NULL };
	1955	FILE *f;
	1956
	1957	utrie_unserialize(&normTrie2, normTrieBlock, normTrieSize, &errorCode);
	1958	if(fcdTrieSize>0) {
	1959	utrie_unserialize(&fcdTrie2, fcdTrieBlock, fcdTrieSize, &errorCode);
	1960	}
	1961	if(auxTrieSize>0) {
	1962	utrie_unserialize(&auxTrie2, auxTrieBlock, auxTrieSize, &errorCode);
	1963	}
	1964	if(U_FAILURE(errorCode)) {
	1965	fprintf(
	1966	stderr,
	1967	"gennorm error: failed to utrie_unserialize() one of the tries - %s\n",
	1968	u_errorName(errorCode));
	1969	exit(errorCode);
	1970	}
	1971
	1972	f=usrc_create(dataDir, "unorm_props_data.c");
	1973	if(f!=NULL) {
	1974	usrc_writeArray(f,
	1975	"static const UVersionInfo formatVersion={ ",
	1976	dataInfo.formatVersion, 8, 4,
	1977	" };\n\n");
	1978	usrc_writeArray(f,
	1979	"static const UVersionInfo dataVersion={ ",
	1980	dataInfo.dataVersion, 8, 4,
	1981	" };\n\n");
	1982	usrc_writeArray(f,
	1983	"static const int32_t indexes[_NORM_INDEX_TOP]={\n",
	1984	indexes, 32, _NORM_INDEX_TOP,
	1985	"\n};\n\n");
	1986	usrc_writeUTrieArrays(f,
	1987	"static const uint16_t normTrie_index[%ld]={\n",
	1988	"static const uint32_t normTrie_data32[%ld]={\n",
	1989	&normTrie2,
	1990	"\n};\n\n");
	1991	usrc_writeUTrieStruct(f,
	1992	"static const UTrie normTrie={\n",
	1993	&normTrie2, "normTrie_index", "normTrie_data32", "getFoldingNormOffset",
	1994	"};\n\n");
	1995	usrc_writeArray(f,
	1996	"static const uint16_t extraData[%ld]={\n",
	1997	utm_getStart(extraMem), 16, utm_countItems(extraMem),
	1998	"\n};\n\n");
	1999	usrc_writeArray(f,
	2000	"static const uint16_t combiningTable[%ld]={\n",
	2001	combiningTable, 16, combiningTableTop,
	2002	"\n};\n\n");
	2003	if(fcdTrieSize>0) {
	2004	usrc_writeUTrieArrays(f,
	2005	"static const uint16_t fcdTrie_index[%ld]={\n", NULL,
	2006	&fcdTrie2,
	2007	"\n};\n\n");
	2008	usrc_writeUTrieStruct(f,
	2009	"static const UTrie fcdTrie={\n",
	2010	&fcdTrie2, "fcdTrie_index", NULL, NULL,
	2011	"};\n\n");
	2012	} else {
	2013	fputs( "static const UTrie fcdTrie={ NULL };\n\n", f);
	2014	}
	2015	if(auxTrieSize>0) {
	2016	usrc_writeUTrieArrays(f,
	2017	"static const uint16_t auxTrie_index[%ld]={\n", NULL,
	2018	&auxTrie2,
	2019	"\n};\n\n");
	2020	usrc_writeUTrieStruct(f,
	2021	"static const UTrie auxTrie={\n",
	2022	&auxTrie2, "auxTrie_index", NULL, "getFoldingAuxOffset",
	2023	"};\n\n");
	2024	} else {
	2025	fputs( "static const UTrie auxTrie={ NULL };\n\n", f);
	2026	}
	2027	usrc_writeArray(f,
	2028	"static const uint16_t canonStartSets[%ld]={\n",
	2029	canonStartSets, 16, canonStartSetsTop,
	2030	"\n};\n\n");
	2031	fclose(f);
	2032	}
	2033	#endif
	2034	} else {
	2035	/* write the data */
	2036	pData=udata_create(dataDir, DATA_TYPE, DATA_NAME, &dataInfo,
	2037	haveCopyright ? U_COPYRIGHT_STRING : NULL, &errorCode);
	2038	if(U_FAILURE(errorCode)) {
	2039	fprintf(stderr, "gennorm: unable to create the output file, error %d\n", errorCode);
	2040	exit(errorCode);
	2041	}
	2042
	2043	#if !UCONFIG_NO_NORMALIZATION
	2044
	2045	udata_writeBlock(pData, indexes, sizeof(indexes));
	2046	udata_writeBlock(pData, normTrieBlock, normTrieSize);
	2047	udata_writeBlock(pData, utm_getStart(extraMem), utm_countItems(extraMem)*2);
	2048	udata_writeBlock(pData, combiningTable, combiningTableTop*2);
	2049	udata_writeBlock(pData, fcdTrieBlock, fcdTrieSize);
	2050	udata_writeBlock(pData, auxTrieBlock, auxTrieSize);
	2051	udata_writeBlock(pData, canonStartSets, canonStartSetsTop*2);
	2052
	2053	#endif
	2054
	2055	/* finish up */
	2056	dataLength=udata_finish(pData, &errorCode);
	2057	if(U_FAILURE(errorCode)) {
	2058	fprintf(stderr, "gennorm: error %d writing the output file\n", errorCode);
	2059	exit(errorCode);
	2060	}
	2061
	2062	if(dataLength!=size) {
	2063	fprintf(stderr, "gennorm error: data length %ld != calculated size %ld\n",
	2064	(long)dataLength, (long)size);
	2065	exit(U_INTERNAL_PROGRAM_ERROR);
	2066	}
	2067	}
	2068	}
	2069
	2070	#if !UCONFIG_NO_NORMALIZATION
	2071
	2072	extern void
	2073	cleanUpData(void) {
	2074	int32_t i, count;
	2075
	2076	count=utm_countItems(normMem);
	2077	for(i=0; i<count; ++i) {
	2078	uset_close(norms[i].canonStart);
	2079	}
	2080
	2081	utm_close(normMem);
	2082	utm_close(utf32Mem);
	2083	utm_close(extraMem);
	2084	utm_close(combiningTriplesMem);
	2085	utrie_close(normTrie);
	2086	utrie_close(norm32Trie);
	2087	utrie_close(fcdTrie);
	2088	utrie_close(auxTrie);
	2089
	2090	uset_close(nfdQCNoSet);
	2091
	2092	uprv_free(normTrie);
	2093	uprv_free(norm32Trie);
	2094	uprv_free(fcdTrie);
	2095	uprv_free(auxTrie);
	2096	}
	2097
	2098	#endif /* #if !UCONFIG_NO_NORMALIZATION */
	2099
	2100	/*
	2101	* Hey, Emacs, please set the following:
	2102	*
	2103	* Local Variables:
	2104	* indent-tabs-mode: nil
	2105	* End:
	2106	*
	2107	*/