]> git.saurik.com Git - apple/icu.git/blame_incremental - icuSources/tools/gennames/gennames.c
ICU-3.13.tar.gz
[apple/icu.git] / icuSources / tools / gennames / gennames.c
... / ...
CommitLineData
1/*
2*******************************************************************************
3*
4* Copyright (C) 1999-2001, International Business Machines
5* Corporation and others. All Rights Reserved.
6*
7*******************************************************************************
8* file name: gennames.c
9* encoding: US-ASCII
10* tab size: 8 (not used)
11* indentation:4
12*
13* created on: 1999sep30
14* created by: Markus W. Scherer
15*
16* This program reads the Unicode character database text file,
17* parses it, and extracts the character code,
18* the "modern" character name, and optionally the
19* Unicode 1.0 character name, and (starting with ICU 2.2) the ISO 10646 comment.
20* It then tokenizes and compresses the names and builds
21* compact binary tables for random-access lookup
22* in a u_charName() API function.
23*
24* unames.icu file format (after UDataInfo header etc. - see udata.c)
25* (all data is static const)
26*
27* UDataInfo fields:
28* dataFormat "unam"
29* formatVersion 1.0
30* dataVersion = Unicode version from -u or --unicode command line option, defaults to 3.2 (preset in main())
31*
32* -- data-based names
33* uint32_t tokenStringOffset,
34* groupsOffset,
35* groupStringOffset,
36* algNamesOffset;
37*
38* uint16_t tokenCount;
39* uint16_t tokenTable[tokenCount];
40*
41* char tokenStrings[]; -- padded to even count
42*
43* -- strings (groupStrings) are tokenized as follows:
44* for each character c
45* if(c>=tokenCount) write that character c directly
46* else
47* token=tokenTable[c];
48* if(token==0xfffe) -- lead byte of double-byte token
49* token=tokenTable[c<<8|next character];
50* if(token==-1)
51* write c directly
52* else
53* tokenString=tokenStrings+token; (tokenStrings=start of names data + tokenStringOffset;)
54* append zero-terminated tokenString;
55*
56* Different strings for a code point - normal name, 1.0 name, and ISO comment -
57* are separated by ';'.
58*
59* uint16_t groupCount;
60* struct {
61* uint16_t groupMSB; -- for a group of 32 character names stored, this is code point>>5
62* uint16_t offsetHigh; -- group strings are at start of names data + groupStringOffset + this 32-bit offset
63* uint16_t offsetLow;
64* } groupTable[groupCount];
65*
66* char groupStrings[]; -- padded to 4-count
67*
68* -- The actual, tokenized group strings are not zero-terminated because
69* that would take up too much space.
70* Instead, they are preceded by their length, written in a variable-length sequence:
71* For each of the 32 group strings, one or two nibbles are stored for its length.
72* Nibbles (4-bit values, half-bytes) are read MSB first.
73* A nibble with a value of 0..11 directly indicates the length of the name string.
74* A nibble n with a value of 12..15 is a lead nibble and forms a value with the following nibble m
75* by (((n-12)<<4)|m)+12, reaching values of 12..75.
76* These lengths are sequentially for each tokenized string, not for the de-tokenized result.
77* For the de-tokenizing, see token description above; the strings immediately follow the
78* 32 lengths.
79*
80* -- algorithmic names
81*
82* typedef struct AlgorithmicRange {
83* uint32_t rangeStart, rangeEnd;
84* uint8_t algorithmType, algorithmVariant;
85* uint16_t rangeSize;
86* } AlgorithmicRange;
87*
88* uint32_t algRangesCount; -- number of data blocks for ranges of
89* algorithmic names (Unicode 3.0.0: 3, hardcoded in gennames)
90*
91* struct {
92* AlgorithmicRange algRange;
93* uint8_t algRangeData[]; -- padded to 4-count except in last range
94* } algRanges[algRangesCount];
95* -- not a real array because each part has a different size
96* of algRange.rangeSize (including AlgorithmicRange)
97*
98* -- algorithmic range types:
99*
100* 0 Names are formed from a string prefix that is stored in
101* the algRangeData (zero-terminated), followed by the Unicode code point
102* of the character in hexadecimal digits;
103* algRange.algorithmVariant digits are written
104*
105* 1 Names are formed by calculating modulo-factors of the code point value as follows:
106* algRange.algorithmVariant is the count of modulo factors
107* algRangeData contains
108* uint16_t factors[algRange.algorithmVariant];
109* char strings[];
110* the first zero-terminated string is written as the prefix; then:
111*
112* The rangeStart is subtracted; with the difference, here "code":
113* for(i=algRange.algorithmVariant-1 to 0 step -1)
114* index[i]=code%factor[i];
115* code/=factor[i];
116*
117* The strings after the prefix are short pieces that are then appended to the result
118* according to index[0..algRange.algorithmVariant-1].
119*/
120
121#include <stdio.h>
122#include <stdlib.h>
123#include "unicode/utypes.h"
124#include "unicode/putil.h"
125#include "cmemory.h"
126#include "cstring.h"
127#include "unicode/udata.h"
128#include "unewdata.h"
129#include "uoptions.h"
130#include "uparse.h"
131
/* byte capacity of stringStore, which lines fill from the bottom and words from the top */
#define STRING_STORE_SIZE 1000000
/* byte capacity of groupStore, the scratch buffer for one compressed group */
#define GROUP_STORE_SIZE 5000

/* a group covers 1<<GROUP_SHIFT consecutive code points */
#define GROUP_SHIFT 5
#define LINES_PER_GROUP (1UL<<GROUP_SHIFT)
#define GROUP_MASK (LINES_PER_GROUP-1)

/* capacities of the lines[] and words[] tables */
#define MAX_LINE_COUNT 50000
#define MAX_WORD_COUNT 20000
#define MAX_GROUP_COUNT 5000

/* name and type of the generated data file */
#define DATA_NAME "unames"
#define DATA_TYPE "icu"
#define VERSION_STRING "unam"
/* separates the different names stored for one code point */
#define NAME_SEPARATOR_CHAR ';'
147
148static const UVersionInfo
149unicode_3_0={ 3, 0, 0, 0 },
150unicode_3_1={ 3, 1, 0, 0 };
151
152/* UDataInfo cf. udata.h */
153static UDataInfo dataInfo={
154 sizeof(UDataInfo),
155 0,
156
157 U_IS_BIG_ENDIAN,
158 U_CHARSET_FAMILY,
159 sizeof(UChar),
160 0,
161
162 {0x75, 0x6e, 0x61, 0x6d}, /* dataFormat="unam" */
163 {1, 0, 0, 0}, /* formatVersion */
164 {3, 0, 0, 0} /* dataVersion */
165};
166
167static UBool beVerbose=FALSE, beQuiet=FALSE, haveCopyright=TRUE;
168
169static uint8_t stringStore[STRING_STORE_SIZE],
170 groupStore[GROUP_STORE_SIZE],
171 lineLengths[LINES_PER_GROUP];
172
173static uint32_t lineTop=0, wordBottom=STRING_STORE_SIZE, lineLengthsTop;
174
/*
 * One stored line: the concatenated names of a code point, or, after
 * compressLines(), one group (then code holds the group MSB).
 */
typedef struct {
    uint32_t code;      /* code point, or group MSB for a group */
    int16_t length;     /* number of bytes at s */
    uint8_t *s;         /* points into stringStore, not zero-terminated */
} Line;
180
/* One dictionary word that is a candidate for replacement by a token. */
typedef struct {
    int32_t weight;     /* -(cost for token) + (number of occurrences) * (length-1) */
    int16_t count;      /* number of occurrences counted by countWord() */
    int16_t length;     /* number of bytes at s */
    uint8_t *s;         /* points into stringStore, not zero-terminated */
} Word;
187
188static Line lines[MAX_LINE_COUNT];
189static Word words[MAX_WORD_COUNT];
190
191static uint32_t lineCount=0, wordCount=0;
192
193static int16_t leadByteCount;
194
195#define LEADBYTE_LIMIT 16
196
197static int16_t tokens[LEADBYTE_LIMIT*256];
198static uint32_t tokenCount;
199
200/* prototypes --------------------------------------------------------------- */
201
202static void
203init(void);
204
205static void
206parseDB(const char *filename, UBool store10Names);
207
208static void
209parseName(char *name, int16_t length);
210
211static int16_t
212skipNoise(char *line, int16_t start, int16_t limit);
213
214static int16_t
215getWord(char *line, int16_t start, int16_t limit);
216
217static void
218compress(void);
219
220static void
221compressLines(void);
222
223static int16_t
224compressLine(uint8_t *s, int16_t length, int16_t *pGroupTop);
225
226static int
227compareWords(const void *word1, const void *word2);
228
229static void
230generateData(const char *dataDir);
231
232static uint32_t
233generateAlgorithmicData(UNewDataMemory *pData);
234
235static int16_t
236findToken(uint8_t *s, int16_t length);
237
238static Word *
239findWord(char *s, int16_t length);
240
241static Word *
242addWord(char *s, int16_t length);
243
244static void
245countWord(Word *word);
246
247static void
248addLine(uint32_t code, char *names[], int16_t lengths[], int16_t count);
249
250static void
251addGroup(uint32_t groupMSB, uint8_t *strings, int16_t length);
252
253static uint32_t
254addToken(uint8_t *s, int16_t length);
255
256static void
257appendLineLength(int16_t length);
258
259static void
260appendLineLengthNibble(uint8_t nibble);
261
262static uint8_t *
263allocLine(int32_t length);
264
265static uint8_t *
266allocWord(uint32_t length);
267
268/* -------------------------------------------------------------------------- */
269
270static UOption options[]={
271 UOPTION_HELP_H,
272 UOPTION_HELP_QUESTION_MARK,
273 UOPTION_VERBOSE,
274 UOPTION_QUIET,
275 UOPTION_COPYRIGHT,
276 UOPTION_DESTDIR,
277 { "unicode", NULL, NULL, NULL, 'u', UOPT_REQUIRES_ARG, 0 },
278 { "unicode1-names", NULL, NULL, NULL, '1', UOPT_NO_ARG, 0 }
279};
280
281extern int
282main(int argc, char* argv[]) {
283 UVersionInfo version;
284 UBool store10Names=FALSE;
285
286 U_MAIN_INIT_ARGS(argc, argv);
287
288 /* preset then read command line options */
289 options[5].value=u_getDataDirectory();
290 options[6].value="3.2";
291 argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), options);
292
293 /* error handling, printing usage message */
294 if(argc<0) {
295 fprintf(stderr,
296 "error in command line argument \"%s\"\n",
297 argv[-argc]);
298 } else if(argc<2) {
299 argc=-1;
300 }
301 if(argc<0 || options[0].doesOccur || options[1].doesOccur) {
302 /*
303 * Broken into chucks because the C89 standard says the minimum
304 * required supported string length is 509 bytes.
305 */
306 fprintf(stderr,
307 "Usage: %s [-1[+|-]] [-v[+|-]] [-c[+|-]] filename\n"
308 "\n"
309 "Read the UnicodeData.txt file and \n"
310 "create a binary file " U_ICUDATA_NAME "_" DATA_NAME "." DATA_TYPE " with the character names\n"
311 "\n"
312 "\tfilename absolute path/filename for the Unicode database text file\n"
313 "\t\t(default: standard input)\n"
314 "\n",
315 argv[0]);
316 fprintf(stderr,
317 "Options:\n"
318 "\t-h or -? or --help this usage text\n"
319 "\t-v or --verbose verbose output\n"
320 "\t-q or --quiet no output\n"
321 "\t-c or --copyright include a copyright notice\n"
322 "\t-d or --destdir destination directory, followed by the path\n"
323 "\t-u or --unicode Unicode version, followed by the version like 3.0.0\n"
324 "\t-1 or --unicode1-names store Unicode 1.0 character names\n");
325 return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR;
326 }
327
328 /* get the options values */
329 beVerbose=options[2].doesOccur;
330 beQuiet=options[3].doesOccur;
331 haveCopyright=options[4].doesOccur;
332 store10Names=options[7].doesOccur;
333
334 /* set the Unicode version */
335 u_versionFromString(version, options[6].value);
336 uprv_memcpy(dataInfo.dataVersion, version, 4);
337
338 init();
339 parseDB(argc>=2 ? argv[1] : "-", store10Names);
340 compress();
341 generateData(options[5].value);
342
343 return 0;
344}
345
346static void
347init() {
348 int i;
349
350 for(i=0; i<256; ++i) {
351 tokens[i]=0;
352 }
353}
354
355/* parsing ------------------------------------------------------------------ */
356
357static void U_CALLCONV
358lineFn(void *context,
359 char *fields[][2], int32_t fieldCount,
360 UErrorCode *pErrorCode) {
361 char *names[3];
362 int16_t lengths[3];
363 static uint32_t prevCode=0;
364 uint32_t code=0;
365
366 if(U_FAILURE(*pErrorCode)) {
367 return;
368 }
369 /* get the character code */
370 code=uprv_strtoul(fields[0][0], NULL, 16);
371
372 /* get the character name */
373 names[0]=fields[1][0];
374 if(fields[1][0][0]!='<') {
375 lengths[0]=(int16_t)(fields[1][1]-names[0]);
376 } else {
377 /* do not store pseudo-names in <> brackets */
378 lengths[0]=0;
379 }
380
381 /* store 1.0 names */
382 /* get the second character name, the one from Unicode 1.0 */
383 /* do not store pseudo-names in <> brackets */
384 names[1]=fields[10][0];
385 if(*(UBool *)context && fields[10][0][0]!='<') {
386 lengths[1]=(int16_t)(fields[10][1]-names[1]);
387 } else {
388 lengths[1]=0;
389 }
390
391 /* get the ISO 10646 comment */
392 names[2]=fields[11][0];
393 lengths[2]=(int16_t)(fields[11][1]-names[2]);
394
395 if(lengths[0]+lengths[1]+lengths[2]==0) {
396 return;
397 }
398
399 /* check for non-character code points */
400 if(!UTF_IS_UNICODE_CHAR(code)) {
401 fprintf(stderr, "gennames: error - properties for non-character code point U+%04lx\n",
402 (unsigned long)code);
403 *pErrorCode=U_PARSE_ERROR;
404 exit(U_PARSE_ERROR);
405 }
406
407 /* check that the code points (code) are in ascending order */
408 if(code<=prevCode && code>0) {
409 fprintf(stderr, "gennames: error - UnicodeData entries out of order, U+%04lx after U+%04lx\n",
410 (unsigned long)code, (unsigned long)prevCode);
411 *pErrorCode=U_PARSE_ERROR;
412 exit(U_PARSE_ERROR);
413 }
414 prevCode=code;
415
416 parseName(names[0], lengths[0]);
417 parseName(names[1], lengths[1]);
418 parseName(names[2], lengths[2]);
419
420 /*
421 * set the count argument to
422 * 1: only store regular names
423 * 2: store regular and 1.0 names
424 * 3: store names and ISO 10646 comment
425 */
426 addLine(code, names, lengths, 3);
427}
428
429static void
430parseDB(const char *filename, UBool store10Names) {
431 char *fields[15][2];
432 UErrorCode errorCode=U_ZERO_ERROR;
433
434 u_parseDelimitedFile(filename, ';', fields, 15, lineFn, &store10Names, &errorCode);
435 if(U_FAILURE(errorCode)) {
436 fprintf(stderr, "gennames parse error: %s\n", u_errorName(errorCode));
437 exit(errorCode);
438 }
439
440 if(!beQuiet) {
441 printf("size of all names in the database: %lu\n",
442 (unsigned long)lineTop);
443 printf("number of named Unicode characters: %lu\n",
444 (unsigned long)lineCount);
445 printf("number of words in the dictionary from these names: %lu\n",
446 (unsigned long)wordCount);
447 }
448}
449
450static void
451parseName(char *name, int16_t length) {
452 int16_t start=0, limit, wordLength/*, prevStart=-1*/;
453 Word *word;
454
455 while(start<length) {
456 /* skip any "noise" characters */
457 limit=skipNoise(name, start, length);
458 if(start<limit) {
459 /*prevStart=-1;*/
460 start=limit;
461 }
462 if(start==length) {
463 break;
464 }
465
466 /* get a word and add it if it is longer than 1 */
467 limit=getWord(name, start, length);
468 wordLength=(int16_t)(limit-start);
469 if(wordLength>1) {
470 word=findWord(name+start, wordLength);
471 if(word==NULL) {
472 word=addWord(name+start, wordLength);
473 }
474 countWord(word);
475 }
476
477#if 0
478 /*
479 * if there was a word before this
480 * (with no noise in between), then add the pair of words, too
481 */
482 if(prevStart!=-1) {
483 wordLength=limit-prevStart;
484 word=findWord(name+prevStart, wordLength);
485 if(word==NULL) {
486 word=addWord(name+prevStart, wordLength);
487 }
488 countWord(word);
489 }
490#endif
491
492 /*prevStart=start;*/
493 start=limit;
494 }
495}
496
497static UBool U_INLINE
498isWordChar(char c) {
499 return ('A'<=c && c<='I') || /* EBCDIC-safe check for letters */
500 ('J'<=c && c<='R') ||
501 ('S'<=c && c<='Z') ||
502
503 ('a'<=c && c<='i') || /* lowercase letters for ISO comments */
504 ('j'<=c && c<='r') ||
505 ('s'<=c && c<='z') ||
506
507 ('0'<=c && c<='9');
508}
509
/*
 * Return the index of the first word character in line[start..limit[,
 * or limit if there is none.
 */
static int16_t
skipNoise(char *line, int16_t start, int16_t limit) {
    int16_t pos;

    for(pos=start; pos<limit; ++pos) {
        if(isWordChar(line[pos])) {
            break;
        }
    }
    return pos;
}
519
/*
 * Return the index just behind the word that starts at line[start]:
 * a run of word characters plus at most one following space or dash.
 */
static int16_t
getWord(char *line, int16_t start, int16_t limit) {
    int16_t pos=start;

    /* the run of letters and digits */
    while(pos<limit && isWordChar(line[pos])) {
        ++pos;
    }

    /* one trailing space or dash belongs to the word */
    if(pos<limit && (line[pos]==' ' || line[pos]=='-')) {
        ++pos;
    }

    return pos;
}
536
537/* compressing -------------------------------------------------------------- */
538
539static void
540compress() {
541 uint32_t i, letterCount;
542 int16_t wordNumber;
543
544 /* sort the words in reverse order by weight */
545 qsort(words, wordCount, sizeof(Word), compareWords);
546
547 /* remove the words that do not save anything */
548 while(wordCount>0 && words[wordCount-1].weight<1) {
549 --wordCount;
550 }
551
552 /* count the letters in the token range */
553 letterCount=0;
554 for(i=LEADBYTE_LIMIT; i<256; ++i) {
555 if(tokens[i]==-1) {
556 ++letterCount;
557 }
558 }
559 if(!beQuiet) {
560 printf("number of letters used in the names: %d\n", letterCount);
561 }
562
563 /* do we need double-byte tokens? */
564 if(wordCount+letterCount<=256) {
565 /* no, single-byte tokens are enough */
566 leadByteCount=0;
567 for(i=0, wordNumber=0; wordNumber<(int16_t)wordCount; ++i) {
568 if(tokens[i]!=-1) {
569 tokens[i]=wordNumber;
570 if(beVerbose) {
571 printf("tokens[0x%03x]: word%8ld \"%.*s\"\n",
572 i, (long)words[wordNumber].weight,
573 words[wordNumber].length, words[wordNumber].s);
574 }
575 ++wordNumber;
576 }
577 }
578 tokenCount=i;
579 } else {
580 /*
581 * The tokens that need two token bytes
582 * get their weight reduced by their count
583 * because they save less.
584 */
585 tokenCount=256-letterCount;
586 for(i=tokenCount; i<wordCount; ++i) {
587 words[i].weight-=words[i].count;
588 }
589
590 /* sort these words in reverse order by weight */
591 qsort(words+tokenCount, wordCount-tokenCount, sizeof(Word), compareWords);
592
593 /* remove the words that do not save anything */
594 while(wordCount>0 && words[wordCount-1].weight<1) {
595 --wordCount;
596 }
597
598 /* how many tokens and lead bytes do we have now? */
599 tokenCount=wordCount+letterCount+(LEADBYTE_LIMIT-1);
600 /*
601 * adjust upwards to take into account that
602 * double-byte tokens must not
603 * use NAME_SEPARATOR_CHAR as a second byte
604 */
605 tokenCount+=(tokenCount-256+254)/255;
606
607 leadByteCount=(int16_t)(tokenCount>>8);
608 if(leadByteCount<LEADBYTE_LIMIT) {
609 /* adjust for the real number of lead bytes */
610 tokenCount-=(LEADBYTE_LIMIT-1)-leadByteCount;
611 } else {
612 /* limit the number of lead bytes */
613 leadByteCount=LEADBYTE_LIMIT-1;
614 tokenCount=LEADBYTE_LIMIT*256;
615 wordCount=tokenCount-letterCount-(LEADBYTE_LIMIT-1);
616 /* adjust again to skip double-byte tokens with ';' */
617 wordCount-=(tokenCount-256+254)/255;
618 }
619
620 /* set token 0 to word 0 */
621 tokens[0]=0;
622 if(beVerbose) {
623 printf("tokens[0x000]: word%8ld \"%.*s\"\n",
624 (long)words[0].weight,
625 words[0].length, words[0].s);
626 }
627 wordNumber=1;
628
629 /* set the lead byte tokens */
630 for(i=1; (int16_t)i<=leadByteCount; ++i) {
631 tokens[i]=-2;
632 }
633
634 /* set the tokens */
635 for(; i<256; ++i) {
636 /* if store10Names then the parser set tokens[NAME_SEPARATOR_CHAR]=-1 */
637 if(tokens[i]!=-1) {
638 tokens[i]=wordNumber;
639 if(beVerbose) {
640 printf("tokens[0x%03x]: word%8ld \"%.*s\"\n",
641 i, (long)words[wordNumber].weight,
642 words[wordNumber].length, words[wordNumber].s);
643 }
644 ++wordNumber;
645 }
646 }
647
648 /* continue above 255 where there are no letters */
649 for(; (uint32_t)wordNumber<wordCount; ++i) {
650 if((i&0xff)==NAME_SEPARATOR_CHAR) {
651 tokens[i]=-1; /* do not use NAME_SEPARATOR_CHAR as a second token byte */
652 } else {
653 tokens[i]=wordNumber;
654 if(beVerbose) {
655 printf("tokens[0x%03x]: word%8ld \"%.*s\"\n",
656 i, (long)words[wordNumber].weight,
657 words[wordNumber].length, words[wordNumber].s);
658 }
659 ++wordNumber;
660 }
661 }
662 tokenCount=i; /* should be already tokenCount={i or i+1} */
663 }
664
665 if(!beQuiet) {
666 printf("number of lead bytes: %d\n", leadByteCount);
667 printf("number of single-byte tokens: %lu\n",
668 (unsigned long)256-letterCount-leadByteCount);
669 printf("number of tokens: %lu\n", (unsigned long)tokenCount);
670 }
671
672 compressLines();
673}
674
675static void
676compressLines() {
677 Line *line=NULL;
678 uint32_t i=0, inLine, outLine=0xffffffff /* (uint32_t)(-1) */,
679 groupMSB=0xffff, lineCount2;
680 int16_t groupTop=0;
681
682 /* store the groups like lines, reusing the lines' memory */
683 lineTop=0;
684 lineCount2=lineCount;
685 lineCount=0;
686
687 /* loop over all lines */
688 while(i<lineCount2) {
689 line=lines+i++;
690 inLine=line->code;
691
692 /* segment the lines to groups of 32 */
693 if(inLine>>GROUP_SHIFT!=groupMSB) {
694 /* finish the current group with empty lines */
695 while((++outLine&GROUP_MASK)!=0) {
696 appendLineLength(0);
697 }
698
699 /* store the group like a line */
700 if(groupTop>0) {
701 if(groupTop>GROUP_STORE_SIZE) {
702 fprintf(stderr, "gennames: group store overflow\n");
703 exit(U_BUFFER_OVERFLOW_ERROR);
704 }
705 addGroup(groupMSB, groupStore, groupTop);
706 if(lineTop>(uint32_t)(line->s-stringStore)) {
707 fprintf(stderr, "gennames: group store runs into string store\n");
708 exit(U_INTERNAL_PROGRAM_ERROR);
709 }
710 }
711
712 /* start the new group */
713 lineLengthsTop=0;
714 groupTop=0;
715 groupMSB=inLine>>GROUP_SHIFT;
716 outLine=(inLine&~GROUP_MASK)-1;
717 }
718
719 /* write empty lines between the previous line in the group and this one */
720 while(++outLine<inLine) {
721 appendLineLength(0);
722 }
723
724 /* write characters and tokens for this line */
725 appendLineLength(compressLine(line->s, line->length, &groupTop));
726 }
727
728 /* finish and store the last group */
729 if(line && groupMSB!=0xffff) {
730 /* finish the current group with empty lines */
731 while((++outLine&GROUP_MASK)!=0) {
732 appendLineLength(0);
733 }
734
735 /* store the group like a line */
736 if(groupTop>0) {
737 if(groupTop>GROUP_STORE_SIZE) {
738 fprintf(stderr, "gennames: group store overflow\n");
739 exit(U_BUFFER_OVERFLOW_ERROR);
740 }
741 addGroup(groupMSB, groupStore, groupTop);
742 if(lineTop>(uint32_t)(line->s-stringStore)) {
743 fprintf(stderr, "gennames: group store runs into string store\n");
744 exit(U_INTERNAL_PROGRAM_ERROR);
745 }
746 }
747 }
748
749 if(!beQuiet) {
750 printf("number of groups: %lu\n", (unsigned long)lineCount);
751 }
752}
753
754static int16_t
755compressLine(uint8_t *s, int16_t length, int16_t *pGroupTop) {
756 int16_t start, limit, token, groupTop=*pGroupTop;
757
758 start=0;
759 do {
760 /* write any "noise" characters */
761 limit=skipNoise((char *)s, start, length);
762 while(start<limit) {
763 groupStore[groupTop++]=s[start++];
764 }
765
766 if(start==length) {
767 break;
768 }
769
770 /* write a word, as token or directly */
771 limit=getWord((char *)s, start, length);
772 if(limit-start==1) {
773 groupStore[groupTop++]=s[start++];
774 } else {
775 token=findToken(s+start, (int16_t)(limit-start));
776 if(token!=-1) {
777 if(token>0xff) {
778 groupStore[groupTop++]=(uint8_t)(token>>8);
779 }
780 groupStore[groupTop++]=(uint8_t)token;
781 start=limit;
782 } else {
783 while(start<limit) {
784 groupStore[groupTop++]=s[start++];
785 }
786 }
787 }
788 } while(start<length);
789
790 length=(int16_t)(groupTop-*pGroupTop);
791 *pGroupTop=groupTop;
792 return length;
793}
794
795static int
796compareWords(const void *word1, const void *word2) {
797 /* reverse sort by word weight */
798 return ((Word *)word2)->weight-((Word *)word1)->weight;
799}
800
801/* generate output data ----------------------------------------------------- */
802
803static void
804generateData(const char *dataDir) {
805 UNewDataMemory *pData;
806 UErrorCode errorCode=U_ZERO_ERROR;
807 uint16_t groupWords[3];
808 uint32_t i, groupTop=lineTop, offset, size,
809 tokenStringOffset, groupsOffset, groupStringOffset, algNamesOffset;
810 long dataLength;
811 int16_t token;
812
813 pData=udata_create(dataDir, DATA_TYPE,U_ICUDATA_NAME "_" DATA_NAME, &dataInfo,
814 haveCopyright ? U_COPYRIGHT_STRING : NULL, &errorCode);
815 if(U_FAILURE(errorCode)) {
816 fprintf(stderr, "gennames: unable to create data memory, error %d\n", errorCode);
817 exit(errorCode);
818 }
819
820 /* first, see how much space we need, and prepare the token strings */
821 for(i=0; i<tokenCount; ++i) {
822 token=tokens[i];
823 if(token!=-1 && token!=-2) {
824 tokens[i]=(int16_t)(addToken(words[token].s, words[token].length)-groupTop);
825 }
826 }
827
828 /*
829 * Calculate the total size in bytes of the data including:
830 * - the offset to the token strings, uint32_t (4)
831 * - the offset to the group table, uint32_t (4)
832 * - the offset to the group strings, uint32_t (4)
833 * - the offset to the algorithmic names, uint32_t (4)
834 *
835 * - the number of tokens, uint16_t (2)
836 * - the token table, uint16_t[tokenCount] (2*tokenCount)
837 *
838 * - the token strings, each zero-terminated (tokenSize=(lineTop-groupTop)), 2-padded
839 *
840 * - the number of groups, uint16_t (2)
841 * - the group table, { uint16_t groupMSB, uint16_t offsetHigh, uint16_t offsetLow }[6*groupCount]
842 *
843 * - the group strings (groupTop), 2-padded
844 *
845 * - the size of the data for the algorithmic names
846 */
847 tokenStringOffset=4+4+4+4+2+2*tokenCount;
848 groupsOffset=(tokenStringOffset+(lineTop-groupTop+1))&~1;
849 groupStringOffset=groupsOffset+2+6*lineCount;
850 algNamesOffset=(groupStringOffset+groupTop+3)&~3;
851
852 offset=generateAlgorithmicData(NULL);
853 size=algNamesOffset+offset;
854
855 if(!beQuiet) {
856 printf("size of the Unicode Names data:\n"
857 "total data length %lu, token strings %lu, compressed strings %lu, algorithmic names %lu\n",
858 (unsigned long)size, (unsigned long)(lineTop-groupTop),
859 (unsigned long)groupTop, (unsigned long)offset);
860 }
861
862 /* write the data to the file */
863 /* offsets */
864 udata_write32(pData, tokenStringOffset);
865 udata_write32(pData, groupsOffset);
866 udata_write32(pData, groupStringOffset);
867 udata_write32(pData, algNamesOffset);
868
869 /* token table */
870 udata_write16(pData, (uint16_t)tokenCount);
871 udata_writeBlock(pData, tokens, 2*tokenCount);
872
873 /* token strings */
874 udata_writeBlock(pData, stringStore+groupTop, lineTop-groupTop);
875 if((lineTop-groupTop)&1) {
876 /* 2-padding */
877 udata_writePadding(pData, 1);
878 }
879
880 /* group table */
881 udata_write16(pData, (uint16_t)lineCount);
882 for(i=0; i<lineCount; ++i) {
883 /* groupMSB */
884 groupWords[0]=(uint16_t)lines[i].code;
885
886 /* offset */
887 offset = (uint32_t)(lines[i].s - stringStore);
888 groupWords[1]=(uint16_t)(offset>>16);
889 groupWords[2]=(uint16_t)(offset);
890 udata_writeBlock(pData, groupWords, 6);
891 }
892
893 /* group strings */
894 udata_writeBlock(pData, stringStore, groupTop);
895
896 /* 4-align the algorithmic names data */
897 udata_writePadding(pData, algNamesOffset-(groupStringOffset+groupTop));
898
899 generateAlgorithmicData(pData);
900
901 /* finish up */
902 dataLength=udata_finish(pData, &errorCode);
903 if(U_FAILURE(errorCode)) {
904 fprintf(stderr, "gennames: error %d writing the output file\n", errorCode);
905 exit(errorCode);
906 }
907
908 if(dataLength!=(long)size) {
909 fprintf(stderr, "gennames: data length %ld != calculated size %lu\n",
910dataLength, (unsigned long)size);
911 exit(U_INTERNAL_PROGRAM_ERROR);
912 }
913}
914
/*
 * Header of one range of algorithmic names as written to the data file.
 * The structure for algorithmic names needs to be 4-aligned.
 */
typedef struct AlgorithmicRange {
    uint32_t rangeStart;        /* first code point of the range */
    uint32_t rangeEnd;          /* last code point of the range */
    uint8_t algorithmType;      /* 0 or 1, see the file format description */
    uint8_t algorithmVariant;   /* number of hex digits (type 0) or factors (type 1) */
    uint16_t rangeSize;         /* size of this header plus the range data */
} AlgorithmicRange;
921
922static uint32_t
923generateAlgorithmicData(UNewDataMemory *pData) {
924 static char prefix[] = "CJK UNIFIED IDEOGRAPH-";
925# define PREFIX_LENGTH 23
926# define PREFIX_LENGTH_4 24
927 uint32_t countAlgRanges;
928
929 static AlgorithmicRange cjkExtA={
930 0x3400, 0x4db5,
931 0, 4,
932 sizeof(AlgorithmicRange)+PREFIX_LENGTH_4
933 };
934 static AlgorithmicRange cjk={
935 0x4e00, 0x9fa5,
936 0, 4,
937 sizeof(AlgorithmicRange)+PREFIX_LENGTH_4
938 };
939 static AlgorithmicRange cjkExtB={
940 0x20000, 0x2a6d6,
941 0, 5,
942 sizeof(AlgorithmicRange)+PREFIX_LENGTH_4
943 };
944
945 static char jamo[]=
946 "HANGUL SYLLABLE \0"
947
948 "G\0GG\0N\0D\0DD\0R\0M\0B\0BB\0"
949 "S\0SS\0\0J\0JJ\0C\0K\0T\0P\0H\0"
950
951 "A\0AE\0YA\0YAE\0EO\0E\0YEO\0YE\0O\0"
952 "WA\0WAE\0OE\0YO\0U\0WEO\0WE\0WI\0"
953 "YU\0EU\0YI\0I\0"
954
955 "\0G\0GG\0GS\0N\0NJ\0NH\0D\0L\0LG\0LM\0"
956 "LB\0LS\0LT\0LP\0LH\0M\0B\0BS\0"
957 "S\0SS\0NG\0J\0C\0K\0T\0P\0H"
958 ;
959
960 static AlgorithmicRange hangul={
961 0xac00, 0xd7a3,
962 1, 3,
963 sizeof(AlgorithmicRange)+6+sizeof(jamo)
964 };
965
966 /* modulo factors, maximum 8 */
967 /* 3 factors: 19, 21, 28, most-to-least-significant */
968 static uint16_t hangulFactors[3]={
969 19, 21, 28
970 };
971
972 uint32_t size;
973
974 size=0;
975
976 /* number of ranges of algorithmic names */
977 if(uprv_memcmp(dataInfo.dataVersion, unicode_3_1, sizeof(UVersionInfo))>=0) {
978 /* Unicode 3.1 and up has 4 ranges including CJK Extension B */
979 countAlgRanges=4;
980 } else if(uprv_memcmp(dataInfo.dataVersion, unicode_3_0, sizeof(UVersionInfo))>=0) {
981 /* Unicode 3.0 has 3 ranges including CJK Extension A */
982 countAlgRanges=3;
983 } else {
984 /* Unicode 2.0 has 2 ranges including Hangul and CJK Unihan */
985 countAlgRanges=2;
986 }
987
988 if(pData!=NULL) {
989 udata_write32(pData, countAlgRanges);
990 } else {
991 size+=4;
992 }
993
994 /*
995 * each range:
996 * uint32_t rangeStart
997 * uint32_t rangeEnd
998 * uint8_t algorithmType
999 * uint8_t algorithmVariant
1000 * uint16_t size of range data
1001 * uint8_t[size] data
1002 */
1003
1004 /* range 0: cjk extension a */
1005 if(countAlgRanges>=3) {
1006 if(pData!=NULL) {
1007 udata_writeBlock(pData, &cjkExtA, sizeof(AlgorithmicRange));
1008 udata_writeString(pData, prefix, PREFIX_LENGTH);
1009 if(PREFIX_LENGTH<PREFIX_LENGTH_4) {
1010 udata_writePadding(pData, PREFIX_LENGTH_4-PREFIX_LENGTH);
1011 }
1012 } else {
1013 size+=sizeof(AlgorithmicRange)+PREFIX_LENGTH_4;
1014 }
1015 }
1016
1017 /* range 1: cjk */
1018 if(pData!=NULL) {
1019 udata_writeBlock(pData, &cjk, sizeof(AlgorithmicRange));
1020 udata_writeString(pData, prefix, PREFIX_LENGTH);
1021 if(PREFIX_LENGTH<PREFIX_LENGTH_4) {
1022 udata_writePadding(pData, PREFIX_LENGTH_4-PREFIX_LENGTH);
1023 }
1024 } else {
1025 size+=sizeof(AlgorithmicRange)+PREFIX_LENGTH_4;
1026 }
1027
1028 /* range 2: hangul syllables */
1029 if(pData!=NULL) {
1030 udata_writeBlock(pData, &hangul, sizeof(AlgorithmicRange));
1031 udata_writeBlock(pData, hangulFactors, 6);
1032 udata_writeString(pData, jamo, sizeof(jamo));
1033 } else {
1034 size+=sizeof(AlgorithmicRange)+6+sizeof(jamo);
1035 }
1036
1037 /* range 3: cjk extension b */
1038 if(countAlgRanges>=4) {
1039 if(pData!=NULL) {
1040 udata_writeBlock(pData, &cjkExtB, sizeof(AlgorithmicRange));
1041 udata_writeString(pData, prefix, PREFIX_LENGTH);
1042 if(PREFIX_LENGTH<PREFIX_LENGTH_4) {
1043 udata_writePadding(pData, PREFIX_LENGTH_4-PREFIX_LENGTH);
1044 }
1045 } else {
1046 size+=sizeof(AlgorithmicRange)+PREFIX_LENGTH_4;
1047 }
1048 }
1049
1050 return size;
1051}
1052
1053/* helpers ------------------------------------------------------------------ */
1054
1055static int16_t
1056findToken(uint8_t *s, int16_t length) {
1057 int16_t i, token;
1058
1059 for(i=0; i<(int16_t)tokenCount; ++i) {
1060 token=tokens[i];
1061 if(token!=-1 && length==words[token].length && 0==uprv_memcmp(s, words[token].s, length)) {
1062 return i;
1063 }
1064 }
1065
1066 return -1;
1067}
1068
1069static Word *
1070findWord(char *s, int16_t length) {
1071 uint32_t i;
1072
1073 for(i=0; i<wordCount; ++i) {
1074 if(length==words[i].length && 0==uprv_memcmp(s, words[i].s, length)) {
1075 return words+i;
1076 }
1077 }
1078
1079 return NULL;
1080}
1081
1082static Word *
1083addWord(char *s, int16_t length) {
1084 uint8_t *stringStart;
1085 Word *word;
1086
1087 if(wordCount==MAX_WORD_COUNT) {
1088 fprintf(stderr, "gennames: too many words\n");
1089 exit(U_BUFFER_OVERFLOW_ERROR);
1090 }
1091
1092 stringStart=allocWord(length);
1093 uprv_memcpy(stringStart, s, length);
1094
1095 word=words+wordCount;
1096
1097 /*
1098 * Initialize the weight with the costs for this token:
1099 * a zero-terminated string and a 16-bit offset.
1100 */
1101 word->weight=-(length+1+2);
1102 word->count=0;
1103 word->length=length;
1104 word->s=stringStart;
1105
1106 ++wordCount;
1107
1108 return word;
1109}
1110
1111static void
1112countWord(Word *word) {
1113 /* add to the weight the savings: the length of the word minus 1 byte for the token */
1114 word->weight+=word->length-1;
1115 ++word->count;
1116}
1117
1118static void
1119addLine(uint32_t code, char *names[], int16_t lengths[], int16_t count) {
1120 uint8_t *stringStart;
1121 Line *line;
1122 int16_t i, length;
1123
1124 if(lineCount==MAX_LINE_COUNT) {
1125 fprintf(stderr, "gennames: too many lines\n");
1126 exit(U_BUFFER_OVERFLOW_ERROR);
1127 }
1128
1129 /* find the last non-empty name */
1130 while(count>0 && lengths[count-1]==0) {
1131 --count;
1132 }
1133 if(count==0) {
1134 return; /* should not occur: caller should not have called */
1135 }
1136
1137 /* there will be (count-1) separator characters */
1138 i=count;
1139 length=count-1;
1140
1141 /* add lengths of strings */
1142 while(i>0) {
1143 length+=lengths[--i];
1144 }
1145
1146 /* allocate line memory */
1147 stringStart=allocLine(length);
1148
1149 /* copy all strings into the line memory */
1150 length=0; /* number of chars copied so far */
1151 for(i=0; i<count; ++i) {
1152 if(i>0) {
1153 stringStart[length++]=NAME_SEPARATOR_CHAR;
1154 }
1155 if(lengths[i]>0) {
1156 uprv_memcpy(stringStart+length, names[i], lengths[i]);
1157 length+=lengths[i];
1158 }
1159 }
1160
1161 line=lines+lineCount;
1162
1163 line->code=code;
1164 line->length=length;
1165 line->s=stringStart;
1166
1167 ++lineCount;
1168
1169 /* prevent a character value that is actually in a name from becoming a token */
1170 while(length>0) {
1171 tokens[stringStart[--length]]=-1;
1172 }
1173}
1174
1175static void
1176addGroup(uint32_t groupMSB, uint8_t *strings, int16_t length) {
1177 uint8_t *stringStart;
1178 Line *line;
1179
1180 if(lineCount==MAX_LINE_COUNT) {
1181 fprintf(stderr, "gennames: too many groups\n");
1182 exit(U_BUFFER_OVERFLOW_ERROR);
1183 }
1184
1185 /* store the line lengths first, then the strings */
1186 lineLengthsTop=(lineLengthsTop+1)/2;
1187 stringStart=allocLine(lineLengthsTop+length);
1188 uprv_memcpy(stringStart, lineLengths, lineLengthsTop);
1189 uprv_memcpy(stringStart+lineLengthsTop, strings, length);
1190
1191 line=lines+lineCount;
1192
1193 line->code=groupMSB;
1194 line->length=length;
1195 line->s=stringStart;
1196
1197 ++lineCount;
1198}
1199
1200static uint32_t
1201addToken(uint8_t *s, int16_t length) {
1202 uint8_t *stringStart;
1203
1204 stringStart=allocLine(length+1);
1205 uprv_memcpy(stringStart, s, length);
1206 stringStart[length]=0;
1207
1208 return (uint32_t)(stringStart - stringStore);
1209}
1210
1211static void
1212appendLineLength(int16_t length) {
1213 if(length>=76) {
1214 fprintf(stderr, "gennames: compressed line too long\n");
1215 exit(U_BUFFER_OVERFLOW_ERROR);
1216 }
1217 if(length>=12) {
1218 length-=12;
1219 appendLineLengthNibble((uint8_t)((length>>4)|12));
1220 }
1221 appendLineLengthNibble((uint8_t)length);
1222}
1223
1224static void
1225appendLineLengthNibble(uint8_t nibble) {
1226 if((lineLengthsTop&1)==0) {
1227 lineLengths[lineLengthsTop/2]=(uint8_t)(nibble<<4);
1228 } else {
1229 lineLengths[lineLengthsTop/2]|=nibble&0xf;
1230 }
1231 ++lineLengthsTop;
1232}
1233
1234static uint8_t *
1235allocLine(int32_t length) {
1236 uint32_t top=lineTop+length;
1237 uint8_t *p;
1238
1239 if(top>wordBottom) {
1240 fprintf(stderr, "gennames: out of memory\n");
1241 exit(U_MEMORY_ALLOCATION_ERROR);
1242 }
1243 p=stringStore+lineTop;
1244 lineTop=top;
1245 return p;
1246}
1247
1248static uint8_t *
1249allocWord(uint32_t length) {
1250 uint32_t bottom=wordBottom-length;
1251
1252 if(lineTop>bottom) {
1253 fprintf(stderr, "gennames: out of memory\n");
1254 exit(U_MEMORY_ALLOCATION_ERROR);
1255 }
1256 wordBottom=bottom;
1257 return stringStore+bottom;
1258}
1259
1260/*
1261 * Hey, Emacs, please set the following:
1262 *
1263 * Local Variables:
1264 * indent-tabs-mode: nil
1265 * End:
1266 *
1267 */