git.saurik.com Git - apple/icu.git/blame_incremental - icuSources/tools/gennames/gennames.c

... / ...

Commit	Line	Data
	1	/*
	2	*******************************************************************************
	3	*
	4	* Copyright (C) 1999-2001, International Business Machines
	5	* Corporation and others. All Rights Reserved.
	6	*
	7	*******************************************************************************
	8	* file name: gennames.c
	9	* encoding: US-ASCII
	10	* tab size: 8 (not used)
	11	* indentation:4
	12	*
	13	* created on: 1999sep30
	14	* created by: Markus W. Scherer
	15	*
	16	* This program reads the Unicode character database text file,
	17	* parses it, and extracts the character code,
	18	* the "modern" character name, and optionally the
	19	* Unicode 1.0 character name, and (starting with ICU 2.2) the ISO 10646 comment.
	20	* It then tokenizes and compresses the names and builds
	21	* compact binary tables for random-access lookup
	22	* in a u_charName() API function.
	23	*
	24	* unames.icu file format (after UDataInfo header etc. - see udata.c)
	25	* (all data is static const)
	26	*
	27	* UDataInfo fields:
	28	* dataFormat "unam"
	29	* formatVersion 1.0
	30	* dataVersion = Unicode version from -u or --unicode command line option, defaults to 3.2
	31	*
	32	* -- data-based names
	33	* uint32_t tokenStringOffset,
	34	* groupsOffset,
	35	* groupStringOffset,
	36	* algNamesOffset;
	37	*
	38	* uint16_t tokenCount;
	39	* uint16_t tokenTable[tokenCount];
	40	*
	41	* char tokenStrings[]; -- padded to even count
	42	*
	43	* -- strings (groupStrings) are tokenized as follows:
	44	* for each character c
	45	* if(c>=tokenCount) write that character c directly
	46	* else
	47	* token=tokenTable[c];
	48	* if(token==0xfffe) -- lead byte of double-byte token
	49	* token=tokenTable[c<<8|next character];
	50	* if(token==-1)
	51	* write c directly
	52	* else
	53	* tokenString=tokenStrings+token; (tokenStrings=start of names data + tokenStringOffset;)
	54	* append zero-terminated tokenString;
	55	*
	56	* Different strings for a code point - normal name, 1.0 name, and ISO comment -
	57	* are separated by ';'.
	58	*
	59	* uint16_t groupCount;
	60	* struct {
	61	* uint16_t groupMSB; -- for a group of 32 character names stored, this is code point>>5
	62	* uint16_t offsetHigh; -- group strings are at start of names data + groupStringsOffset + this 32 bit-offset
	63	* uint16_t offsetLow;
	64	* } groupTable[groupCount];
	65	*
	66	* char groupStrings[]; -- padded to 4-count
	67	*
	68	* -- The actual, tokenized group strings are not zero-terminated because
	69	* that would take up too much space.
	70	* Instead, they are preceded by their length, written in a variable-length sequence:
	71	* For each of the 32 group strings, one or two nibbles are stored for its length.
	72	* Nibbles (4-bit values, half-bytes) are read MSB first.
	73	* A nibble with a value of 0..11 directly indicates the length of the name string.
	74	* A nibble n with a value of 12..15 is a lead nibble and forms a value with the following nibble m
	75	* by (((n-12)<<4)|m)+12, reaching values of 12..75.
	76	* These lengths are sequentially for each tokenized string, not for the de-tokenized result.
	77	* For the de-tokenizing, see token description above; the strings immediately follow the
	78	* 32 lengths.
	79	*
	80	* -- algorithmic names
	81	*
	82	* typedef struct AlgorithmicRange {
	83	* uint32_t rangeStart, rangeEnd;
	84	* uint8_t algorithmType, algorithmVariant;
	85	* uint16_t rangeSize;
	86	* } AlgorithmicRange;
	87	*
	88	* uint32_t algRangesCount; -- number of data blocks for ranges of
	89	* algorithmic names (Unicode 3.0.0: 3, hardcoded in gennames)
	90	*
	91	* struct {
	92	* AlgorithmicRange algRange;
	93	* uint8_t algRangeData[]; -- padded to 4-count except in last range
	94	* } algRanges[algRangesCount];
	95	* -- not a real array because each part has a different size
	96	* of algRange.rangeSize (including AlgorithmicRange)
	97	*
	98	* -- algorithmic range types:
	99	*
	100	* 0 Names are formed from a string prefix that is stored in
	101	* the algRangeData (zero-terminated), followed by the Unicode code point
	102	* of the character in hexadecimal digits;
	103	* algRange.algorithmVariant digits are written
	104	*
	105	* 1 Names are formed by calculating modulo-factors of the code point value as follows:
	106	* algRange.algorithmVariant is the count of modulo factors
	107	* algRangeData contains
	108	* uint16_t factors[algRange.algorithmVariant];
	109	* char strings[];
	110	* the first zero-terminated string is written as the prefix; then:
	111	*
	112	* The rangeStart is subtracted; with the difference, here "code":
	113	* for(i=algRange.algorithmVariant-1 to 0 step -1)
	114	* index[i]=code%factor[i];
	115	* code/=factor[i];
	116	*
	117	* The strings after the prefix are short pieces that are then appended to the result
	118	* according to index[0..algRange.algorithmVariant-1].
	119	*/
	120
	121	#include <stdio.h>
	122	#include <stdlib.h>
	123	#include "unicode/utypes.h"
	124	#include "unicode/putil.h"
	125	#include "cmemory.h"
	126	#include "cstring.h"
	127	#include "unicode/udata.h"
	128	#include "unewdata.h"
	129	#include "uoptions.h"
	130	#include "uparse.h"
	131
	132	#define STRING_STORE_SIZE 1000000
	133	#define GROUP_STORE_SIZE 5000
	134
	135	#define GROUP_SHIFT 5
	136	#define LINES_PER_GROUP (1UL<<GROUP_SHIFT)
	137	#define GROUP_MASK (LINES_PER_GROUP-1)
	138
	139	#define MAX_LINE_COUNT 50000
	140	#define MAX_WORD_COUNT 20000
	141	#define MAX_GROUP_COUNT 5000
	142
	143	#define DATA_NAME "unames"
	144	#define DATA_TYPE "icu"
	145	#define VERSION_STRING "unam"
	146	#define NAME_SEPARATOR_CHAR ';'
	147
	148	static const UVersionInfo
	149	unicode_3_0={ 3, 0, 0, 0 },
	150	unicode_3_1={ 3, 1, 0, 0 };
	151
	152	/* UDataInfo cf. udata.h */
	153	static UDataInfo dataInfo={
	154	sizeof(UDataInfo),
	155	0,
	156
	157	U_IS_BIG_ENDIAN,
	158	U_CHARSET_FAMILY,
	159	sizeof(UChar),
	160	0,
	161
	162	{0x75, 0x6e, 0x61, 0x6d}, /* dataFormat="unam" */
	163	{1, 0, 0, 0}, /* formatVersion */
	164	{3, 0, 0, 0} /* dataVersion */
	165	};
	166
	167	static UBool beVerbose=FALSE, beQuiet=FALSE, haveCopyright=TRUE;
	168
	169	static uint8_t stringStore[STRING_STORE_SIZE],
	170	groupStore[GROUP_STORE_SIZE],
	171	lineLengths[LINES_PER_GROUP];
	172
	173	static uint32_t lineTop=0, wordBottom=STRING_STORE_SIZE, lineLengthsTop;
	174
	175	typedef struct {
	176	uint32_t code;
	177	int16_t length;
	178	uint8_t *s;
	179	} Line;
	180
	181	typedef struct {
	182	int32_t weight; /* -(cost for token) + (number of occurences) * (length-1) */
	183	int16_t count;
	184	int16_t length;
	185	uint8_t *s;
	186	} Word;
	187
	188	static Line lines[MAX_LINE_COUNT];
	189	static Word words[MAX_WORD_COUNT];
	190
	191	static uint32_t lineCount=0, wordCount=0;
	192
	193	static int16_t leadByteCount;
	194
	195	#define LEADBYTE_LIMIT 16
	196
	197	static int16_t tokens[LEADBYTE_LIMIT*256];
	198	static uint32_t tokenCount;
	199
	200	/* prototypes --------------------------------------------------------------- */
	201
	202	static void
	203	init(void);
	204
	205	static void
	206	parseDB(const char *filename, UBool store10Names);
	207
	208	static void
	209	parseName(char *name, int16_t length);
	210
	211	static int16_t
	212	skipNoise(char *line, int16_t start, int16_t limit);
	213
	214	static int16_t
	215	getWord(char *line, int16_t start, int16_t limit);
	216
	217	static void
	218	compress(void);
	219
	220	static void
	221	compressLines(void);
	222
	223	static int16_t
	224	compressLine(uint8_t s, int16_t length, int16_t pGroupTop);
	225
	226	static int
	227	compareWords(const void word1, const void word2);
	228
	229	static void
	230	generateData(const char *dataDir);
	231
	232	static uint32_t
	233	generateAlgorithmicData(UNewDataMemory *pData);
	234
	235	static int16_t
	236	findToken(uint8_t *s, int16_t length);
	237
	238	static Word *
	239	findWord(char *s, int16_t length);
	240
	241	static Word *
	242	addWord(char *s, int16_t length);
	243
	244	static void
	245	countWord(Word *word);
	246
	247	static void
	248	addLine(uint32_t code, char *names[], int16_t lengths[], int16_t count);
	249
	250	static void
	251	addGroup(uint32_t groupMSB, uint8_t *strings, int16_t length);
	252
	253	static uint32_t
	254	addToken(uint8_t *s, int16_t length);
	255
	256	static void
	257	appendLineLength(int16_t length);
	258
	259	static void
	260	appendLineLengthNibble(uint8_t nibble);
	261
	262	static uint8_t *
	263	allocLine(int32_t length);
	264
	265	static uint8_t *
	266	allocWord(uint32_t length);
	267
	268	/* -------------------------------------------------------------------------- */
	269
	270	static UOption options[]={
	271	UOPTION_HELP_H,
	272	UOPTION_HELP_QUESTION_MARK,
	273	UOPTION_VERBOSE,
	274	UOPTION_QUIET,
	275	UOPTION_COPYRIGHT,
	276	UOPTION_DESTDIR,
	277	{ "unicode", NULL, NULL, NULL, 'u', UOPT_REQUIRES_ARG, 0 },
	278	{ "unicode1-names", NULL, NULL, NULL, '1', UOPT_NO_ARG, 0 }
	279	};
	280
	281	extern int
	282	main(int argc, char* argv[]) {
	283	UVersionInfo version;
	284	UBool store10Names=FALSE;
	285
	286	U_MAIN_INIT_ARGS(argc, argv);
	287
	288	/* preset then read command line options */
	289	options[5].value=u_getDataDirectory();
	290	options[6].value="3.2";
	291	argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), options);
	292
	293	/* error handling, printing usage message */
	294	if(argc<0) {
	295	fprintf(stderr,
	296	"error in command line argument \"%s\"\n",
	297	argv[-argc]);
	298	} else if(argc<2) {
	299	argc=-1;
	300	}
	301	if(argc<0 \|\| options[0].doesOccur \|\| options[1].doesOccur) {
	302	/*
	303	* Broken into chucks because the C89 standard says the minimum
	304	* required supported string length is 509 bytes.
	305	*/
	306	fprintf(stderr,
	307	"Usage: %s [-1[+\|-]] [-v[+\|-]] [-c[+\|-]] filename\n"
	308	"\n"
	309	"Read the UnicodeData.txt file and \n"
	310	"create a binary file " U_ICUDATA_NAME "_" DATA_NAME "." DATA_TYPE " with the character names\n"
	311	"\n"
	312	"\tfilename absolute path/filename for the Unicode database text file\n"
	313	"\t\t(default: standard input)\n"
	314	"\n",
	315	argv[0]);
	316	fprintf(stderr,
	317	"Options:\n"
	318	"\t-h or -? or --help this usage text\n"
	319	"\t-v or --verbose verbose output\n"
	320	"\t-q or --quiet no output\n"
	321	"\t-c or --copyright include a copyright notice\n"
	322	"\t-d or --destdir destination directory, followed by the path\n"
	323	"\t-u or --unicode Unicode version, followed by the version like 3.0.0\n"
	324	"\t-1 or --unicode1-names store Unicode 1.0 character names\n");
	325	return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR;
	326	}
	327
	328	/* get the options values */
	329	beVerbose=options[2].doesOccur;
	330	beQuiet=options[3].doesOccur;
	331	haveCopyright=options[4].doesOccur;
	332	store10Names=options[7].doesOccur;
	333
	334	/* set the Unicode version */
	335	u_versionFromString(version, options[6].value);
	336	uprv_memcpy(dataInfo.dataVersion, version, 4);
	337
	338	init();
	339	parseDB(argc>=2 ? argv[1] : "-", store10Names);
	340	compress();
	341	generateData(options[5].value);
	342
	343	return 0;
	344	}
	345
	346	static void
	347	init() {
	348	int i;
	349
	350	for(i=0; i<256; ++i) {
	351	tokens[i]=0;
	352	}
	353	}
	354
	355	/* parsing ------------------------------------------------------------------ */
	356
	357	static void U_CALLCONV
	358	lineFn(void *context,
	359	char *fields[][2], int32_t fieldCount,
	360	UErrorCode *pErrorCode) {
	361	char *names[3];
	362	int16_t lengths[3];
	363	static uint32_t prevCode=0;
	364	uint32_t code=0;
	365
	366	if(U_FAILURE(*pErrorCode)) {
	367	return;
	368	}
	369	/* get the character code */
	370	code=uprv_strtoul(fields[0][0], NULL, 16);
	371
	372	/* get the character name */
	373	names[0]=fields[1][0];
	374	if(fields[1][0][0]!='<') {
	375	lengths[0]=(int16_t)(fields[1][1]-names[0]);
	376	} else {
	377	/* do not store pseudo-names in <> brackets */
	378	lengths[0]=0;
	379	}
	380
	381	/* store 1.0 names */
	382	/* get the second character name, the one from Unicode 1.0 */
	383	/* do not store pseudo-names in <> brackets */
	384	names[1]=fields[10][0];
	385	if((UBool )context && fields[10][0][0]!='<') {
	386	lengths[1]=(int16_t)(fields[10][1]-names[1]);
	387	} else {
	388	lengths[1]=0;
	389	}
	390
	391	/* get the ISO 10646 comment */
	392	names[2]=fields[11][0];
	393	lengths[2]=(int16_t)(fields[11][1]-names[2]);
	394
	395	if(lengths[0]+lengths[1]+lengths[2]==0) {
	396	return;
	397	}
	398
	399	/* check for non-character code points */
	400	if(!UTF_IS_UNICODE_CHAR(code)) {
	401	fprintf(stderr, "gennames: error - properties for non-character code point U+%04lx\n",
	402	(unsigned long)code);
	403	*pErrorCode=U_PARSE_ERROR;
	404	exit(U_PARSE_ERROR);
	405	}
	406
	407	/* check that the code points (code) are in ascending order */
	408	if(code<=prevCode && code>0) {
	409	fprintf(stderr, "gennames: error - UnicodeData entries out of order, U+%04lx after U+%04lx\n",
	410	(unsigned long)code, (unsigned long)prevCode);
	411	*pErrorCode=U_PARSE_ERROR;
	412	exit(U_PARSE_ERROR);
	413	}
	414	prevCode=code;
	415
	416	parseName(names[0], lengths[0]);
	417	parseName(names[1], lengths[1]);
	418	parseName(names[2], lengths[2]);
	419
	420	/*
	421	* set the count argument to
	422	* 1: only store regular names
	423	* 2: store regular and 1.0 names
	424	* 3: store names and ISO 10646 comment
	425	*/
	426	addLine(code, names, lengths, 3);
	427	}
	428
	429	static void
	430	parseDB(const char *filename, UBool store10Names) {
	431	char *fields[15][2];
	432	UErrorCode errorCode=U_ZERO_ERROR;
	433
	434	u_parseDelimitedFile(filename, ';', fields, 15, lineFn, &store10Names, &errorCode);
	435	if(U_FAILURE(errorCode)) {
	436	fprintf(stderr, "gennames parse error: %s\n", u_errorName(errorCode));
	437	exit(errorCode);
	438	}
	439
	440	if(!beQuiet) {
	441	printf("size of all names in the database: %lu\n",
	442	(unsigned long)lineTop);
	443	printf("number of named Unicode characters: %lu\n",
	444	(unsigned long)lineCount);
	445	printf("number of words in the dictionary from these names: %lu\n",
	446	(unsigned long)wordCount);
	447	}
	448	}
	449
	450	static void
	451	parseName(char *name, int16_t length) {
	452	int16_t start=0, limit, wordLength/, prevStart=-1/;
	453	Word *word;
	454
	455	while(start<length) {
	456	/* skip any "noise" characters */
	457	limit=skipNoise(name, start, length);
	458	if(start<limit) {
	459	/prevStart=-1;/
	460	start=limit;
	461	}
	462	if(start==length) {
	463	break;
	464	}
	465
	466	/* get a word and add it if it is longer than 1 */
	467	limit=getWord(name, start, length);
	468	wordLength=(int16_t)(limit-start);
	469	if(wordLength>1) {
	470	word=findWord(name+start, wordLength);
	471	if(word==NULL) {
	472	word=addWord(name+start, wordLength);
	473	}
	474	countWord(word);
	475	}
	476
	477	#if 0
	478	/*
	479	* if there was a word before this
	480	* (with no noise in between), then add the pair of words, too
	481	*/
	482	if(prevStart!=-1) {
	483	wordLength=limit-prevStart;
	484	word=findWord(name+prevStart, wordLength);
	485	if(word==NULL) {
	486	word=addWord(name+prevStart, wordLength);
	487	}
	488	countWord(word);
	489	}
	490	#endif
	491
	492	/prevStart=start;/
	493	start=limit;
	494	}
	495	}
	496
	497	static UBool U_INLINE
	498	isWordChar(char c) {
	499	return ('A'<=c && c<='I') \|\| /* EBCDIC-safe check for letters */
	500	('J'<=c && c<='R') \|\|
	501	('S'<=c && c<='Z') \|\|
	502
	503	('a'<=c && c<='i') \|\| /* lowercase letters for ISO comments */
	504	('j'<=c && c<='r') \|\|
	505	('s'<=c && c<='z') \|\|
	506
	507	('0'<=c && c<='9');
	508	}
	509
	/*
	 * Starting at line[start], skip characters that are not word characters.
	 * Returns the index of the first word character, or limit if there is none.
	 */
	static int16_t
	skipNoise(char *line, int16_t start, int16_t limit) {
	    for(; start<limit; ++start) {
	        if(isWordChar(line[start])) {
	            break;
	        }
	    }
	    return start;
	}
	519
	/*
	 * Starting at line[start], skip over a run of word characters and at most
	 * one directly following space or dash.
	 * Returns the index after the word, never more than limit.
	 */
	static int16_t
	getWord(char *line, int16_t start, int16_t limit) {
	    char c=0; /* initialize to avoid a compiler warning although the code was safe */

	    /* a unicode character name word consists of A-Z0-9 */
	    while(start<limit && isWordChar(line[start])) {
	        ++start;
	    }

	    /* include a following space or dash */
	    if(start<limit && ((c=line[start])==' ' || c=='-')) {
	        ++start;
	    }

	    return start;
	}
	536
	537	/* compressing -------------------------------------------------------------- */
	538
	539	static void
	540	compress() {
	541	uint32_t i, letterCount;
	542	int16_t wordNumber;
	543
	544	/* sort the words in reverse order by weight */
	545	qsort(words, wordCount, sizeof(Word), compareWords);
	546
	547	/* remove the words that do not save anything */
	548	while(wordCount>0 && words[wordCount-1].weight<1) {
	549	--wordCount;
	550	}
	551
	552	/* count the letters in the token range */
	553	letterCount=0;
	554	for(i=LEADBYTE_LIMIT; i<256; ++i) {
	555	if(tokens[i]==-1) {
	556	++letterCount;
	557	}
	558	}
	559	if(!beQuiet) {
	560	printf("number of letters used in the names: %d\n", letterCount);
	561	}
	562
	563	/* do we need double-byte tokens? */
	564	if(wordCount+letterCount<=256) {
	565	/* no, single-byte tokens are enough */
	566	leadByteCount=0;
	567	for(i=0, wordNumber=0; wordNumber<(int16_t)wordCount; ++i) {
	568	if(tokens[i]!=-1) {
	569	tokens[i]=wordNumber;
	570	if(beVerbose) {
	571	printf("tokens[0x%03x]: word%8ld \"%.*s\"\n",
	572	i, (long)words[wordNumber].weight,
	573	words[wordNumber].length, words[wordNumber].s);
	574	}
	575	++wordNumber;
	576	}
	577	}
	578	tokenCount=i;
	579	} else {
	580	/*
	581	* The tokens that need two token bytes
	582	* get their weight reduced by their count
	583	* because they save less.
	584	*/
	585	tokenCount=256-letterCount;
	586	for(i=tokenCount; i<wordCount; ++i) {
	587	words[i].weight-=words[i].count;
	588	}
	589
	590	/* sort these words in reverse order by weight */
	591	qsort(words+tokenCount, wordCount-tokenCount, sizeof(Word), compareWords);
	592
	593	/* remove the words that do not save anything */
	594	while(wordCount>0 && words[wordCount-1].weight<1) {
	595	--wordCount;
	596	}
	597
	598	/* how many tokens and lead bytes do we have now? */
	599	tokenCount=wordCount+letterCount+(LEADBYTE_LIMIT-1);
	600	/*
	601	* adjust upwards to take into account that
	602	* double-byte tokens must not
	603	* use NAME_SEPARATOR_CHAR as a second byte
	604	*/
	605	tokenCount+=(tokenCount-256+254)/255;
	606
	607	leadByteCount=(int16_t)(tokenCount>>8);
	608	if(leadByteCount<LEADBYTE_LIMIT) {
	609	/* adjust for the real number of lead bytes */
	610	tokenCount-=(LEADBYTE_LIMIT-1)-leadByteCount;
	611	} else {
	612	/* limit the number of lead bytes */
	613	leadByteCount=LEADBYTE_LIMIT-1;
	614	tokenCount=LEADBYTE_LIMIT*256;
	615	wordCount=tokenCount-letterCount-(LEADBYTE_LIMIT-1);
	616	/* adjust again to skip double-byte tokens with ';' */
	617	wordCount-=(tokenCount-256+254)/255;
	618	}
	619
	620	/* set token 0 to word 0 */
	621	tokens[0]=0;
	622	if(beVerbose) {
	623	printf("tokens[0x000]: word%8ld \"%.*s\"\n",
	624	(long)words[0].weight,
	625	words[0].length, words[0].s);
	626	}
	627	wordNumber=1;
	628
	629	/* set the lead byte tokens */
	630	for(i=1; (int16_t)i<=leadByteCount; ++i) {
	631	tokens[i]=-2;
	632	}
	633
	634	/* set the tokens */
	635	for(; i<256; ++i) {
	636	/* if store10Names then the parser set tokens[NAME_SEPARATOR_CHAR]=-1 */
	637	if(tokens[i]!=-1) {
	638	tokens[i]=wordNumber;
	639	if(beVerbose) {
	640	printf("tokens[0x%03x]: word%8ld \"%.*s\"\n",
	641	i, (long)words[wordNumber].weight,
	642	words[wordNumber].length, words[wordNumber].s);
	643	}
	644	++wordNumber;
	645	}
	646	}
	647
	648	/* continue above 255 where there are no letters */
	649	for(; (uint32_t)wordNumber<wordCount; ++i) {
	650	if((i&0xff)==NAME_SEPARATOR_CHAR) {
	651	tokens[i]=-1; /* do not use NAME_SEPARATOR_CHAR as a second token byte */
	652	} else {
	653	tokens[i]=wordNumber;
	654	if(beVerbose) {
	655	printf("tokens[0x%03x]: word%8ld \"%.*s\"\n",
	656	i, (long)words[wordNumber].weight,
	657	words[wordNumber].length, words[wordNumber].s);
	658	}
	659	++wordNumber;
	660	}
	661	}
	662	tokenCount=i; /* should be already tokenCount={i or i+1} */
	663	}
	664
	665	if(!beQuiet) {
	666	printf("number of lead bytes: %d\n", leadByteCount);
	667	printf("number of single-byte tokens: %lu\n",
	668	(unsigned long)256-letterCount-leadByteCount);
	669	printf("number of tokens: %lu\n", (unsigned long)tokenCount);
	670	}
	671
	672	compressLines();
	673	}
	674
	675	static void
	676	compressLines() {
	677	Line *line=NULL;
	678	uint32_t i=0, inLine, outLine=0xffffffff /* (uint32_t)(-1) */,
	679	groupMSB=0xffff, lineCount2;
	680	int16_t groupTop=0;
	681
	682	/* store the groups like lines, reusing the lines' memory */
	683	lineTop=0;
	684	lineCount2=lineCount;
	685	lineCount=0;
	686
	687	/* loop over all lines */
	688	while(i<lineCount2) {
	689	line=lines+i++;
	690	inLine=line->code;
	691
	692	/* segment the lines to groups of 32 */
	693	if(inLine>>GROUP_SHIFT!=groupMSB) {
	694	/* finish the current group with empty lines */
	695	while((++outLine&GROUP_MASK)!=0) {
	696	appendLineLength(0);
	697	}
	698
	699	/* store the group like a line */
	700	if(groupTop>0) {
	701	if(groupTop>GROUP_STORE_SIZE) {
	702	fprintf(stderr, "gennames: group store overflow\n");
	703	exit(U_BUFFER_OVERFLOW_ERROR);
	704	}
	705	addGroup(groupMSB, groupStore, groupTop);
	706	if(lineTop>(uint32_t)(line->s-stringStore)) {
	707	fprintf(stderr, "gennames: group store runs into string store\n");
	708	exit(U_INTERNAL_PROGRAM_ERROR);
	709	}
	710	}
	711
	712	/* start the new group */
	713	lineLengthsTop=0;
	714	groupTop=0;
	715	groupMSB=inLine>>GROUP_SHIFT;
	716	outLine=(inLine&~GROUP_MASK)-1;
	717	}
	718
	719	/* write empty lines between the previous line in the group and this one */
	720	while(++outLine<inLine) {
	721	appendLineLength(0);
	722	}
	723
	724	/* write characters and tokens for this line */
	725	appendLineLength(compressLine(line->s, line->length, &groupTop));
	726	}
	727
	728	/* finish and store the last group */
	729	if(line && groupMSB!=0xffff) {
	730	/* finish the current group with empty lines */
	731	while((++outLine&GROUP_MASK)!=0) {
	732	appendLineLength(0);
	733	}
	734
	735	/* store the group like a line */
	736	if(groupTop>0) {
	737	if(groupTop>GROUP_STORE_SIZE) {
	738	fprintf(stderr, "gennames: group store overflow\n");
	739	exit(U_BUFFER_OVERFLOW_ERROR);
	740	}
	741	addGroup(groupMSB, groupStore, groupTop);
	742	if(lineTop>(uint32_t)(line->s-stringStore)) {
	743	fprintf(stderr, "gennames: group store runs into string store\n");
	744	exit(U_INTERNAL_PROGRAM_ERROR);
	745	}
	746	}
	747	}
	748
	749	if(!beQuiet) {
	750	printf("number of groups: %lu\n", (unsigned long)lineCount);
	751	}
	752	}
	753
	754	static int16_t
	755	compressLine(uint8_t s, int16_t length, int16_t pGroupTop) {
	756	int16_t start, limit, token, groupTop=*pGroupTop;
	757
	758	start=0;
	759	do {
	760	/* write any "noise" characters */
	761	limit=skipNoise((char *)s, start, length);
	762	while(start<limit) {
	763	groupStore[groupTop++]=s[start++];
	764	}
	765
	766	if(start==length) {
	767	break;
	768	}
	769
	770	/* write a word, as token or directly */
	771	limit=getWord((char *)s, start, length);
	772	if(limit-start==1) {
	773	groupStore[groupTop++]=s[start++];
	774	} else {
	775	token=findToken(s+start, (int16_t)(limit-start));
	776	if(token!=-1) {
	777	if(token>0xff) {
	778	groupStore[groupTop++]=(uint8_t)(token>>8);
	779	}
	780	groupStore[groupTop++]=(uint8_t)token;
	781	start=limit;
	782	} else {
	783	while(start<limit) {
	784	groupStore[groupTop++]=s[start++];
	785	}
	786	}
	787	}
	788	} while(start<length);
	789
	790	length=(int16_t)(groupTop-*pGroupTop);
	791	*pGroupTop=groupTop;
	792	return length;
	793	}
	794
	795	static int
	796	compareWords(const void word1, const void word2) {
	797	/* reverse sort by word weight */
	798	return ((Word )word2)->weight-((Word )word1)->weight;
	799	}
	800
	801	/* generate output data ----------------------------------------------------- */
	802
	803	static void
	804	generateData(const char *dataDir) {
	805	UNewDataMemory *pData;
	806	UErrorCode errorCode=U_ZERO_ERROR;
	807	uint16_t groupWords[3];
	808	uint32_t i, groupTop=lineTop, offset, size,
	809	tokenStringOffset, groupsOffset, groupStringOffset, algNamesOffset;
	810	long dataLength;
	811	int16_t token;
	812
	813	pData=udata_create(dataDir, DATA_TYPE,U_ICUDATA_NAME "_" DATA_NAME, &dataInfo,
	814	haveCopyright ? U_COPYRIGHT_STRING : NULL, &errorCode);
	815	if(U_FAILURE(errorCode)) {
	816	fprintf(stderr, "gennames: unable to create data memory, error %d\n", errorCode);
	817	exit(errorCode);
	818	}
	819
	820	/* first, see how much space we need, and prepare the token strings */
	821	for(i=0; i<tokenCount; ++i) {
	822	token=tokens[i];
	823	if(token!=-1 && token!=-2) {
	824	tokens[i]=(int16_t)(addToken(words[token].s, words[token].length)-groupTop);
	825	}
	826	}
	827
	828	/*
	829	* Calculate the total size in bytes of the data including:
	830	* - the offset to the token strings, uint32_t (4)
	831	* - the offset to the group table, uint32_t (4)
	832	* - the offset to the group strings, uint32_t (4)
	833	* - the offset to the algorithmic names, uint32_t (4)
	834	*
	835	* - the number of tokens, uint16_t (2)
	836	* - the token table, uint16_t[tokenCount] (2*tokenCount)
	837	*
	838	* - the token strings, each zero-terminated (tokenSize=(lineTop-groupTop)), 2-padded
	839	*
	840	* - the number of groups, uint16_t (2)
	841	* - the group table, { uint16_t groupMSB, uint16_t offsetHigh, uint16_t offsetLow }[6*groupCount]
	842	*
	843	* - the group strings (groupTop), 2-padded
	844	*
	845	* - the size of the data for the algorithmic names
	846	*/
	847	tokenStringOffset=4+4+4+4+2+2*tokenCount;
	848	groupsOffset=(tokenStringOffset+(lineTop-groupTop+1))&~1;
	849	groupStringOffset=groupsOffset+2+6*lineCount;
	850	algNamesOffset=(groupStringOffset+groupTop+3)&~3;
	851
	852	offset=generateAlgorithmicData(NULL);
	853	size=algNamesOffset+offset;
	854
	855	if(!beQuiet) {
	856	printf("size of the Unicode Names data:\n"
	857	"total data length %lu, token strings %lu, compressed strings %lu, algorithmic names %lu\n",
	858	(unsigned long)size, (unsigned long)(lineTop-groupTop),
	859	(unsigned long)groupTop, (unsigned long)offset);
	860	}
	861
	862	/* write the data to the file */
	863	/* offsets */
	864	udata_write32(pData, tokenStringOffset);
	865	udata_write32(pData, groupsOffset);
	866	udata_write32(pData, groupStringOffset);
	867	udata_write32(pData, algNamesOffset);
	868
	869	/* token table */
	870	udata_write16(pData, (uint16_t)tokenCount);
	871	udata_writeBlock(pData, tokens, 2*tokenCount);
	872
	873	/* token strings */
	874	udata_writeBlock(pData, stringStore+groupTop, lineTop-groupTop);
	875	if((lineTop-groupTop)&1) {
	876	/* 2-padding */
	877	udata_writePadding(pData, 1);
	878	}
	879
	880	/* group table */
	881	udata_write16(pData, (uint16_t)lineCount);
	882	for(i=0; i<lineCount; ++i) {
	883	/* groupMSB */
	884	groupWords[0]=(uint16_t)lines[i].code;
	885
	886	/* offset */
	887	offset = (uint32_t)(lines[i].s - stringStore);
	888	groupWords[1]=(uint16_t)(offset>>16);
	889	groupWords[2]=(uint16_t)(offset);
	890	udata_writeBlock(pData, groupWords, 6);
	891	}
	892
	893	/* group strings */
	894	udata_writeBlock(pData, stringStore, groupTop);
	895
	896	/* 4-align the algorithmic names data */
	897	udata_writePadding(pData, algNamesOffset-(groupStringOffset+groupTop));
	898
	899	generateAlgorithmicData(pData);
	900
	901	/* finish up */
	902	dataLength=udata_finish(pData, &errorCode);
	903	if(U_FAILURE(errorCode)) {
	904	fprintf(stderr, "gennames: error %d writing the output file\n", errorCode);
	905	exit(errorCode);
	906	}
	907
	908	if(dataLength!=(long)size) {
	909	fprintf(stderr, "gennames: data length %ld != calculated size %lu\n",
	910	dataLength, (unsigned long)size);
	911	exit(U_INTERNAL_PROGRAM_ERROR);
	912	}
	913	}
	914
	/*
	 * Header of one range of algorithmic names; written to the data file as it is.
	 * The structure for algorithmic names needs to be 4-aligned.
	 */
	typedef struct AlgorithmicRange {
	    uint32_t rangeStart;
	    uint32_t rangeEnd;
	    uint8_t algorithmType;
	    uint8_t algorithmVariant;
	    uint16_t rangeSize;
	} AlgorithmicRange;
	921
	922	static uint32_t
	923	generateAlgorithmicData(UNewDataMemory *pData) {
	924	static char prefix[] = "CJK UNIFIED IDEOGRAPH-";
	925	# define PREFIX_LENGTH 23
	926	# define PREFIX_LENGTH_4 24
	927	uint32_t countAlgRanges;
	928
	929	static AlgorithmicRange cjkExtA={
	930	0x3400, 0x4db5,
	931	0, 4,
	932	sizeof(AlgorithmicRange)+PREFIX_LENGTH_4
	933	};
	934	static AlgorithmicRange cjk={
	935	0x4e00, 0x9fa5,
	936	0, 4,
	937	sizeof(AlgorithmicRange)+PREFIX_LENGTH_4
	938	};
	939	static AlgorithmicRange cjkExtB={
	940	0x20000, 0x2a6d6,
	941	0, 5,
	942	sizeof(AlgorithmicRange)+PREFIX_LENGTH_4
	943	};
	944
	945	static char jamo[]=
	946	"HANGUL SYLLABLE \0"
	947
	948	"G\0GG\0N\0D\0DD\0R\0M\0B\0BB\0"
	949	"S\0SS\0\0J\0JJ\0C\0K\0T\0P\0H\0"
	950
	951	"A\0AE\0YA\0YAE\0EO\0E\0YEO\0YE\0O\0"
	952	"WA\0WAE\0OE\0YO\0U\0WEO\0WE\0WI\0"
	953	"YU\0EU\0YI\0I\0"
	954
	955	"\0G\0GG\0GS\0N\0NJ\0NH\0D\0L\0LG\0LM\0"
	956	"LB\0LS\0LT\0LP\0LH\0M\0B\0BS\0"
	957	"S\0SS\0NG\0J\0C\0K\0T\0P\0H"
	958	;
	959
	960	static AlgorithmicRange hangul={
	961	0xac00, 0xd7a3,
	962	1, 3,
	963	sizeof(AlgorithmicRange)+6+sizeof(jamo)
	964	};
	965
	966	/* modulo factors, maximum 8 */
	967	/* 3 factors: 19, 21, 28, most-to-least-significant */
	968	static uint16_t hangulFactors[3]={
	969	19, 21, 28
	970	};
	971
	972	uint32_t size;
	973
	974	size=0;
	975
	976	/* number of ranges of algorithmic names */
	977	if(uprv_memcmp(dataInfo.dataVersion, unicode_3_1, sizeof(UVersionInfo))>=0) {
	978	/* Unicode 3.1 and up has 4 ranges including CJK Extension B */
	979	countAlgRanges=4;
	980	} else if(uprv_memcmp(dataInfo.dataVersion, unicode_3_0, sizeof(UVersionInfo))>=0) {
	981	/* Unicode 3.0 has 3 ranges including CJK Extension A */
	982	countAlgRanges=3;
	983	} else {
	984	/* Unicode 2.0 has 2 ranges including Hangul and CJK Unihan */
	985	countAlgRanges=2;
	986	}
	987
	988	if(pData!=NULL) {
	989	udata_write32(pData, countAlgRanges);
	990	} else {
	991	size+=4;
	992	}
	993
	994	/*
	995	* each range:
	996	* uint32_t rangeStart
	997	* uint32_t rangeEnd
	998	* uint8_t algorithmType
	999	* uint8_t algorithmVariant
	1000	* uint16_t size of range data
	1001	* uint8_t[size] data
	1002	*/
	1003
	1004	/* range 0: cjk extension a */
	1005	if(countAlgRanges>=3) {
	1006	if(pData!=NULL) {
	1007	udata_writeBlock(pData, &cjkExtA, sizeof(AlgorithmicRange));
	1008	udata_writeString(pData, prefix, PREFIX_LENGTH);
	1009	if(PREFIX_LENGTH<PREFIX_LENGTH_4) {
	1010	udata_writePadding(pData, PREFIX_LENGTH_4-PREFIX_LENGTH);
	1011	}
	1012	} else {
	1013	size+=sizeof(AlgorithmicRange)+PREFIX_LENGTH_4;
	1014	}
	1015	}
	1016
	1017	/* range 1: cjk */
	1018	if(pData!=NULL) {
	1019	udata_writeBlock(pData, &cjk, sizeof(AlgorithmicRange));
	1020	udata_writeString(pData, prefix, PREFIX_LENGTH);
	1021	if(PREFIX_LENGTH<PREFIX_LENGTH_4) {
	1022	udata_writePadding(pData, PREFIX_LENGTH_4-PREFIX_LENGTH);
	1023	}
	1024	} else {
	1025	size+=sizeof(AlgorithmicRange)+PREFIX_LENGTH_4;
	1026	}
	1027
	1028	/* range 2: hangul syllables */
	1029	if(pData!=NULL) {
	1030	udata_writeBlock(pData, &hangul, sizeof(AlgorithmicRange));
	1031	udata_writeBlock(pData, hangulFactors, 6);
	1032	udata_writeString(pData, jamo, sizeof(jamo));
	1033	} else {
	1034	size+=sizeof(AlgorithmicRange)+6+sizeof(jamo);
	1035	}
	1036
	1037	/* range 3: cjk extension b */
	1038	if(countAlgRanges>=4) {
	1039	if(pData!=NULL) {
	1040	udata_writeBlock(pData, &cjkExtB, sizeof(AlgorithmicRange));
	1041	udata_writeString(pData, prefix, PREFIX_LENGTH);
	1042	if(PREFIX_LENGTH<PREFIX_LENGTH_4) {
	1043	udata_writePadding(pData, PREFIX_LENGTH_4-PREFIX_LENGTH);
	1044	}
	1045	} else {
	1046	size+=sizeof(AlgorithmicRange)+PREFIX_LENGTH_4;
	1047	}
	1048	}
	1049
	1050	return size;
	1051	}
	1052
	1053	/* helpers ------------------------------------------------------------------ */
	1054
	1055	static int16_t
	1056	findToken(uint8_t *s, int16_t length) {
	1057	int16_t i, token;
	1058
	1059	for(i=0; i<(int16_t)tokenCount; ++i) {
	1060	token=tokens[i];
	1061	if(token!=-1 && length==words[token].length && 0==uprv_memcmp(s, words[token].s, length)) {
	1062	return i;
	1063	}
	1064	}
	1065
	1066	return -1;
	1067	}
	1068
	1069	static Word *
	1070	findWord(char *s, int16_t length) {
	1071	uint32_t i;
	1072
	1073	for(i=0; i<wordCount; ++i) {
	1074	if(length==words[i].length && 0==uprv_memcmp(s, words[i].s, length)) {
	1075	return words+i;
	1076	}
	1077	}
	1078
	1079	return NULL;
	1080	}
	1081
	1082	static Word *
	1083	addWord(char *s, int16_t length) {
	1084	uint8_t *stringStart;
	1085	Word *word;
	1086
	1087	if(wordCount==MAX_WORD_COUNT) {
	1088	fprintf(stderr, "gennames: too many words\n");
	1089	exit(U_BUFFER_OVERFLOW_ERROR);
	1090	}
	1091
	1092	stringStart=allocWord(length);
	1093	uprv_memcpy(stringStart, s, length);
	1094
	1095	word=words+wordCount;
	1096
	1097	/*
	1098	* Initialize the weight with the costs for this token:
	1099	* a zero-terminated string and a 16-bit offset.
	1100	*/
	1101	word->weight=-(length+1+2);
	1102	word->count=0;
	1103	word->length=length;
	1104	word->s=stringStart;
	1105
	1106	++wordCount;
	1107
	1108	return word;
	1109	}
	1110
	1111	static void
	1112	countWord(Word *word) {
	1113	/* add to the weight the savings: the length of the word minus 1 byte for the token */
	1114	word->weight+=word->length-1;
	1115	++word->count;
	1116	}
	1117
	1118	static void
	1119	addLine(uint32_t code, char *names[], int16_t lengths[], int16_t count) {
	1120	uint8_t *stringStart;
	1121	Line *line;
	1122	int16_t i, length;
	1123
	1124	if(lineCount==MAX_LINE_COUNT) {
	1125	fprintf(stderr, "gennames: too many lines\n");
	1126	exit(U_BUFFER_OVERFLOW_ERROR);
	1127	}
	1128
	1129	/* find the last non-empty name */
	1130	while(count>0 && lengths[count-1]==0) {
	1131	--count;
	1132	}
	1133	if(count==0) {
	1134	return; /* should not occur: caller should not have called */
	1135	}
	1136
	1137	/* there will be (count-1) separator characters */
	1138	i=count;
	1139	length=count-1;
	1140
	1141	/* add lengths of strings */
	1142	while(i>0) {
	1143	length+=lengths[--i];
	1144	}
	1145
	1146	/* allocate line memory */
	1147	stringStart=allocLine(length);
	1148
	1149	/* copy all strings into the line memory */
	1150	length=0; /* number of chars copied so far */
	1151	for(i=0; i<count; ++i) {
	1152	if(i>0) {
	1153	stringStart[length++]=NAME_SEPARATOR_CHAR;
	1154	}
	1155	if(lengths[i]>0) {
	1156	uprv_memcpy(stringStart+length, names[i], lengths[i]);
	1157	length+=lengths[i];
	1158	}
	1159	}
	1160
	1161	line=lines+lineCount;
	1162
	1163	line->code=code;
	1164	line->length=length;
	1165	line->s=stringStart;
	1166
	1167	++lineCount;
	1168
	1169	/* prevent a character value that is actually in a name from becoming a token */
	1170	while(length>0) {
	1171	tokens[stringStart[--length]]=-1;
	1172	}
	1173	}
	1174
	1175	static void
	1176	addGroup(uint32_t groupMSB, uint8_t *strings, int16_t length) {
	1177	uint8_t *stringStart;
	1178	Line *line;
	1179
	1180	if(lineCount==MAX_LINE_COUNT) {
	1181	fprintf(stderr, "gennames: too many groups\n");
	1182	exit(U_BUFFER_OVERFLOW_ERROR);
	1183	}
	1184
	1185	/* store the line lengths first, then the strings */
	1186	lineLengthsTop=(lineLengthsTop+1)/2;
	1187	stringStart=allocLine(lineLengthsTop+length);
	1188	uprv_memcpy(stringStart, lineLengths, lineLengthsTop);
	1189	uprv_memcpy(stringStart+lineLengthsTop, strings, length);
	1190
	1191	line=lines+lineCount;
	1192
	1193	line->code=groupMSB;
	1194	line->length=length;
	1195	line->s=stringStart;
	1196
	1197	++lineCount;
	1198	}
	1199
	1200	static uint32_t
	1201	addToken(uint8_t *s, int16_t length) {
	1202	uint8_t *stringStart;
	1203
	1204	stringStart=allocLine(length+1);
	1205	uprv_memcpy(stringStart, s, length);
	1206	stringStart[length]=0;
	1207
	1208	return (uint32_t)(stringStart - stringStore);
	1209	}
	1210
	1211	static void
	1212	appendLineLength(int16_t length) {
	1213	if(length>=76) {
	1214	fprintf(stderr, "gennames: compressed line too long\n");
	1215	exit(U_BUFFER_OVERFLOW_ERROR);
	1216	}
	1217	if(length>=12) {
	1218	length-=12;
	1219	appendLineLengthNibble((uint8_t)((length>>4)\|12));
	1220	}
	1221	appendLineLengthNibble((uint8_t)length);
	1222	}
	1223
	1224	static void
	1225	appendLineLengthNibble(uint8_t nibble) {
	1226	if((lineLengthsTop&1)==0) {
	1227	lineLengths[lineLengthsTop/2]=(uint8_t)(nibble<<4);
	1228	} else {
	1229	lineLengths[lineLengthsTop/2]\|=nibble&0xf;
	1230	}
	1231	++lineLengthsTop;
	1232	}
	1233
	1234	static uint8_t *
	1235	allocLine(int32_t length) {
	1236	uint32_t top=lineTop+length;
	1237	uint8_t *p;
	1238
	1239	if(top>wordBottom) {
	1240	fprintf(stderr, "gennames: out of memory\n");
	1241	exit(U_MEMORY_ALLOCATION_ERROR);
	1242	}
	1243	p=stringStore+lineTop;
	1244	lineTop=top;
	1245	return p;
	1246	}
	1247
	1248	static uint8_t *
	1249	allocWord(uint32_t length) {
	1250	uint32_t bottom=wordBottom-length;
	1251
	1252	if(lineTop>bottom) {
	1253	fprintf(stderr, "gennames: out of memory\n");
	1254	exit(U_MEMORY_ALLOCATION_ERROR);
	1255	}
	1256	wordBottom=bottom;
	1257	return stringStore+bottom;
	1258	}
	1259
	1260	/*
	1261	* Hey, Emacs, please set the following:
	1262	*
	1263	* Local Variables:
	1264	* indent-tabs-mode: nil
	1265	* End:
	1266	*
	1267	*/