]> git.saurik.com Git - apple/icu.git/blob - icuSources/tools/gennames/gennames.c
ICU-6.2.4.tar.gz
[apple/icu.git] / icuSources / tools / gennames / gennames.c
1 /*
2 *******************************************************************************
3 *
4 * Copyright (C) 1999-2004, International Business Machines
5 * Corporation and others. All Rights Reserved.
6 *
7 *******************************************************************************
8 * file name: gennames.c
9 * encoding: US-ASCII
10 * tab size: 8 (not used)
11 * indentation:4
12 *
13 * created on: 1999sep30
14 * created by: Markus W. Scherer
15 *
16 * This program reads the Unicode character database text file,
17 * parses it, and extracts the character code,
18 * the "modern" character name, and optionally the
19 * Unicode 1.0 character name, and (starting with ICU 2.2) the ISO 10646 comment.
20 * It then tokenizes and compresses the names and builds
21 * compact binary tables for random-access lookup
22 * in a u_charName() API function.
23 *
24 * unames.icu file format (after UDataInfo header etc. - see udata.c)
25 * (all data is static const)
26 *
27 * UDataInfo fields:
28 * dataFormat "unam"
29 * formatVersion 1.0
 * dataVersion = Unicode version from -u or --unicode command line option, defaults to 3.2
31 *
32 * -- data-based names
33 * uint32_t tokenStringOffset,
34 * groupsOffset,
35 * groupStringOffset,
36 * algNamesOffset;
37 *
38 * uint16_t tokenCount;
39 * uint16_t tokenTable[tokenCount];
40 *
41 * char tokenStrings[]; -- padded to even count
42 *
43 * -- strings (groupStrings) are tokenized as follows:
44 * for each character c
45 * if(c>=tokenCount) write that character c directly
46 * else
47 * token=tokenTable[c];
48 * if(token==0xfffe) -- lead byte of double-byte token
49 * token=tokenTable[c<<8|next character];
50 * if(token==-1)
51 * write c directly
52 * else
53 * tokenString=tokenStrings+token; (tokenStrings=start of names data + tokenStringOffset;)
54 * append zero-terminated tokenString;
55 *
56 * Different strings for a code point - normal name, 1.0 name, and ISO comment -
57 * are separated by ';'.
58 *
59 * uint16_t groupCount;
60 * struct {
61 * uint16_t groupMSB; -- for a group of 32 character names stored, this is code point>>5
 * uint16_t offsetHigh; -- group strings are at start of names data + groupStringOffset + this 32 bit-offset
63 * uint16_t offsetLow;
64 * } groupTable[groupCount];
65 *
66 * char groupStrings[]; -- padded to 4-count
67 *
68 * -- The actual, tokenized group strings are not zero-terminated because
69 * that would take up too much space.
 * Instead, they are preceded by their length, written in a variable-length sequence:
71 * For each of the 32 group strings, one or two nibbles are stored for its length.
72 * Nibbles (4-bit values, half-bytes) are read MSB first.
73 * A nibble with a value of 0..11 directly indicates the length of the name string.
74 * A nibble n with a value of 12..15 is a lead nibble and forms a value with the following nibble m
75 * by (((n-12)<<4)|m)+12, reaching values of 12..75.
76 * These lengths are sequentially for each tokenized string, not for the de-tokenized result.
77 * For the de-tokenizing, see token description above; the strings immediately follow the
78 * 32 lengths.
79 *
80 * -- algorithmic names
81 *
82 * typedef struct AlgorithmicRange {
83 * uint32_t rangeStart, rangeEnd;
84 * uint8_t algorithmType, algorithmVariant;
85 * uint16_t rangeSize;
86 * } AlgorithmicRange;
87 *
88 * uint32_t algRangesCount; -- number of data blocks for ranges of
89 * algorithmic names (Unicode 3.0.0: 3, hardcoded in gennames)
90 *
91 * struct {
92 * AlgorithmicRange algRange;
93 * uint8_t algRangeData[]; -- padded to 4-count except in last range
 * } algRanges[algRangesCount];
95 * -- not a real array because each part has a different size
96 * of algRange.rangeSize (including AlgorithmicRange)
97 *
98 * -- algorithmic range types:
99 *
100 * 0 Names are formed from a string prefix that is stored in
101 * the algRangeData (zero-terminated), followed by the Unicode code point
102 * of the character in hexadecimal digits;
103 * algRange.algorithmVariant digits are written
104 *
105 * 1 Names are formed by calculating modulo-factors of the code point value as follows:
106 * algRange.algorithmVariant is the count of modulo factors
107 * algRangeData contains
108 * uint16_t factors[algRange.algorithmVariant];
109 * char strings[];
110 * the first zero-terminated string is written as the prefix; then:
111 *
112 * The rangeStart is subtracted; with the difference, here "code":
113 * for(i=algRange.algorithmVariant-1 to 0 step -1)
114 * index[i]=code%factor[i];
115 * code/=factor[i];
116 *
117 * The strings after the prefix are short pieces that are then appended to the result
118 * according to index[0..algRange.algorithmVariant-1].
119 */
120
121 #include <stdio.h>
122 #include "unicode/utypes.h"
123 #include "unicode/putil.h"
124 #include "unicode/uclean.h"
125 #include "unicode/udata.h"
126 #include "cmemory.h"
127 #include "cstring.h"
128 #include "uarrsort.h"
129 #include "unewdata.h"
130 #include "uoptions.h"
131 #include "uparse.h"
132
/* sizes in bytes of the static byte stores stringStore[] and groupStore[] */
#define STRING_STORE_SIZE   1000000
#define GROUP_STORE_SIZE    5000

/* names are handled in groups of LINES_PER_GROUP (1<<GROUP_SHIFT = 32) code points */
#define GROUP_SHIFT         5
#define LINES_PER_GROUP     (1UL<<GROUP_SHIFT)
#define GROUP_MASK          (LINES_PER_GROUP-1)

/* capacities of the static tables */
#define MAX_LINE_COUNT      50000
#define MAX_WORD_COUNT      20000
#define MAX_GROUP_COUNT     5000

/* output file name and type, data format name, and the name separator byte */
#define DATA_NAME           "unames"
#define DATA_TYPE           "icu"
#define VERSION_STRING      "unam"
#define NAME_SEPARATOR_CHAR ';'
148
149 static const UVersionInfo
150 unicode_3_0={ 3, 0, 0, 0 },
151 unicode_3_1={ 3, 1, 0, 0 };
152
153 /* UDataInfo cf. udata.h */
154 static UDataInfo dataInfo={
155 sizeof(UDataInfo),
156 0,
157
158 U_IS_BIG_ENDIAN,
159 U_CHARSET_FAMILY,
160 sizeof(UChar),
161 0,
162
163 {0x75, 0x6e, 0x61, 0x6d}, /* dataFormat="unam" */
164 {1, 0, 0, 0}, /* formatVersion */
165 {3, 0, 0, 0} /* dataVersion */
166 };
167
168 static UBool beVerbose=FALSE, beQuiet=FALSE, haveCopyright=TRUE;
169
170 static uint8_t stringStore[STRING_STORE_SIZE],
171 groupStore[GROUP_STORE_SIZE],
172 lineLengths[LINES_PER_GROUP];
173
174 static uint32_t lineTop=0, wordBottom=STRING_STORE_SIZE, lineLengthsTop;
175
176 typedef struct {
177 uint32_t code;
178 int16_t length;
179 uint8_t *s;
180 } Line;
181
182 typedef struct {
183 int32_t weight; /* -(cost for token) + (number of occurences) * (length-1) */
184 int16_t count;
185 int16_t length;
186 uint8_t *s;
187 } Word;
188
189 static Line lines[MAX_LINE_COUNT];
190 static Word words[MAX_WORD_COUNT];
191
192 static uint32_t lineCount=0, wordCount=0;
193
194 static int16_t leadByteCount;
195
196 #define LEADBYTE_LIMIT 16
197
198 static int16_t tokens[LEADBYTE_LIMIT*256];
199 static uint32_t tokenCount;
200
201 /* prototypes --------------------------------------------------------------- */
202
203 static void
204 init(void);
205
206 static void
207 parseDB(const char *filename, UBool store10Names);
208
209 static void
210 parseName(char *name, int16_t length);
211
212 static int16_t
213 skipNoise(char *line, int16_t start, int16_t limit);
214
215 static int16_t
216 getWord(char *line, int16_t start, int16_t limit);
217
218 static void
219 compress(void);
220
221 static void
222 compressLines(void);
223
224 static int16_t
225 compressLine(uint8_t *s, int16_t length, int16_t *pGroupTop);
226
227 static int32_t
228 compareWords(const void *context, const void *word1, const void *word2);
229
230 static void
231 generateData(const char *dataDir);
232
233 static uint32_t
234 generateAlgorithmicData(UNewDataMemory *pData);
235
236 static int16_t
237 findToken(uint8_t *s, int16_t length);
238
239 static Word *
240 findWord(char *s, int16_t length);
241
242 static Word *
243 addWord(char *s, int16_t length);
244
245 static void
246 countWord(Word *word);
247
248 static void
249 addLine(uint32_t code, char *names[], int16_t lengths[], int16_t count);
250
251 static void
252 addGroup(uint32_t groupMSB, uint8_t *strings, int16_t length);
253
254 static uint32_t
255 addToken(uint8_t *s, int16_t length);
256
257 static void
258 appendLineLength(int16_t length);
259
260 static void
261 appendLineLengthNibble(uint8_t nibble);
262
263 static uint8_t *
264 allocLine(int32_t length);
265
266 static uint8_t *
267 allocWord(uint32_t length);
268
269 /* -------------------------------------------------------------------------- */
270
271 static UOption options[]={
272 UOPTION_HELP_H,
273 UOPTION_HELP_QUESTION_MARK,
274 UOPTION_VERBOSE,
275 UOPTION_QUIET,
276 UOPTION_COPYRIGHT,
277 UOPTION_DESTDIR,
278 { "unicode", NULL, NULL, NULL, 'u', UOPT_REQUIRES_ARG, 0 },
279 { "unicode1-names", NULL, NULL, NULL, '1', UOPT_NO_ARG, 0 }
280 };
281
282 extern int
283 main(int argc, char* argv[]) {
284 UVersionInfo version;
285 UBool store10Names=FALSE;
286 UErrorCode errorCode = U_ZERO_ERROR;
287
288 U_MAIN_INIT_ARGS(argc, argv);
289
290 /* Initialize ICU */
291 u_init(&errorCode);
292 if (U_FAILURE(errorCode) && errorCode != U_FILE_ACCESS_ERROR) {
293 /* Note: u_init() will try to open ICU property data.
294 * failures here are expected when building ICU from scratch.
295 * ignore them.
296 */
297 fprintf(stderr, "%s: can not initialize ICU. errorCode = %s\n",
298 argv[0], u_errorName(errorCode));
299 exit(1);
300 }
301
302 /* preset then read command line options */
303 options[5].value=u_getDataDirectory();
304 options[6].value="3.2";
305 argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), options);
306
307 /* error handling, printing usage message */
308 if(argc<0) {
309 fprintf(stderr,
310 "error in command line argument \"%s\"\n",
311 argv[-argc]);
312 } else if(argc<2) {
313 argc=-1;
314 }
315 if(argc<0 || options[0].doesOccur || options[1].doesOccur) {
316 /*
317 * Broken into chucks because the C89 standard says the minimum
318 * required supported string length is 509 bytes.
319 */
320 fprintf(stderr,
321 "Usage: %s [-1[+|-]] [-v[+|-]] [-c[+|-]] filename\n"
322 "\n"
323 "Read the UnicodeData.txt file and \n"
324 "create a binary file " DATA_NAME "." DATA_TYPE " with the character names\n"
325 "\n"
326 "\tfilename absolute path/filename for the Unicode database text file\n"
327 "\t\t(default: standard input)\n"
328 "\n",
329 argv[0]);
330 fprintf(stderr,
331 "Options:\n"
332 "\t-h or -? or --help this usage text\n"
333 "\t-v or --verbose verbose output\n"
334 "\t-q or --quiet no output\n"
335 "\t-c or --copyright include a copyright notice\n"
336 "\t-d or --destdir destination directory, followed by the path\n"
337 "\t-u or --unicode Unicode version, followed by the version like 3.0.0\n"
338 "\t-1 or --unicode1-names store Unicode 1.0 character names\n");
339 return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR;
340 }
341
342 /* get the options values */
343 beVerbose=options[2].doesOccur;
344 beQuiet=options[3].doesOccur;
345 haveCopyright=options[4].doesOccur;
346 store10Names=options[7].doesOccur;
347
348 /* set the Unicode version */
349 u_versionFromString(version, options[6].value);
350 uprv_memcpy(dataInfo.dataVersion, version, 4);
351
352 init();
353 parseDB(argc>=2 ? argv[1] : "-", store10Names);
354 compress();
355 generateData(options[5].value);
356
357 u_cleanup();
358 return 0;
359 }
360
361 static void
362 init() {
363 int i;
364
365 for(i=0; i<256; ++i) {
366 tokens[i]=0;
367 }
368 }
369
370 /* parsing ------------------------------------------------------------------ */
371
/*
 * Trim a name field.
 * On input *pStart is the start of the field and limit is its end.
 * Leading whitespace is skipped with u_skipWhitespace(), trailing spaces and
 * tabs are cut off; *pStart is moved to the first remaining character and
 * the remaining length is returned.
 */
static int16_t
getName(char **pStart, char *limit) {
    char *first=(char *)u_skipWhitespace(*pStart);
    char *end=limit;

    /* walk the end backwards over trailing blanks, but never past the start */
    for(; first<end; --end) {
        char last=end[-1];
        if(last!=' ' && last!='\t') {
            break;
        }
    }

    *pStart=first;
    return (int16_t)(end-first);
}
387
388 static void U_CALLCONV
389 lineFn(void *context,
390 char *fields[][2], int32_t fieldCount,
391 UErrorCode *pErrorCode) {
392 char *names[3];
393 int16_t lengths[3];
394 static uint32_t prevCode=0;
395 uint32_t code=0;
396
397 if(U_FAILURE(*pErrorCode)) {
398 return;
399 }
400 /* get the character code */
401 code=uprv_strtoul(fields[0][0], NULL, 16);
402
403 /* get the character name */
404 names[0]=fields[1][0];
405 lengths[0]=getName(names+0, fields[1][1]);
406 if(names[0][0]=='<') {
407 /* do not store pseudo-names in <> brackets */
408 lengths[0]=0;
409 }
410
411 /* store 1.0 names */
412 /* get the second character name, the one from Unicode 1.0 */
413 /* do not store pseudo-names in <> brackets */
414 names[1]=fields[10][0];
415 lengths[1]=getName(names+1, fields[10][1]);
416 if(*(UBool *)context && names[1][0]!='<') {
417 /* keep the name */
418 } else {
419 lengths[1]=0;
420 }
421
422 /* get the ISO 10646 comment */
423 names[2]=fields[11][0];
424 lengths[2]=getName(names+2, fields[11][1]);
425
426 if(lengths[0]+lengths[1]+lengths[2]==0) {
427 return;
428 }
429
430 /* check for non-character code points */
431 if(!UTF_IS_UNICODE_CHAR(code)) {
432 fprintf(stderr, "gennames: error - properties for non-character code point U+%04lx\n",
433 (unsigned long)code);
434 *pErrorCode=U_PARSE_ERROR;
435 exit(U_PARSE_ERROR);
436 }
437
438 /* check that the code points (code) are in ascending order */
439 if(code<=prevCode && code>0) {
440 fprintf(stderr, "gennames: error - UnicodeData entries out of order, U+%04lx after U+%04lx\n",
441 (unsigned long)code, (unsigned long)prevCode);
442 *pErrorCode=U_PARSE_ERROR;
443 exit(U_PARSE_ERROR);
444 }
445 prevCode=code;
446
447 parseName(names[0], lengths[0]);
448 parseName(names[1], lengths[1]);
449 parseName(names[2], lengths[2]);
450
451 /*
452 * set the count argument to
453 * 1: only store regular names
454 * 2: store regular and 1.0 names
455 * 3: store names and ISO 10646 comment
456 */
457 addLine(code, names, lengths, 3);
458 }
459
460 static void
461 parseDB(const char *filename, UBool store10Names) {
462 char *fields[15][2];
463 UErrorCode errorCode=U_ZERO_ERROR;
464
465 u_parseDelimitedFile(filename, ';', fields, 15, lineFn, &store10Names, &errorCode);
466 if(U_FAILURE(errorCode)) {
467 fprintf(stderr, "gennames parse error: %s\n", u_errorName(errorCode));
468 exit(errorCode);
469 }
470
471 if(!beQuiet) {
472 printf("size of all names in the database: %lu\n",
473 (unsigned long)lineTop);
474 printf("number of named Unicode characters: %lu\n",
475 (unsigned long)lineCount);
476 printf("number of words in the dictionary from these names: %lu\n",
477 (unsigned long)wordCount);
478 }
479 }
480
481 static void
482 parseName(char *name, int16_t length) {
483 int16_t start=0, limit, wordLength/*, prevStart=-1*/;
484 Word *word;
485
486 while(start<length) {
487 /* skip any "noise" characters */
488 limit=skipNoise(name, start, length);
489 if(start<limit) {
490 /*prevStart=-1;*/
491 start=limit;
492 }
493 if(start==length) {
494 break;
495 }
496
497 /* get a word and add it if it is longer than 1 */
498 limit=getWord(name, start, length);
499 wordLength=(int16_t)(limit-start);
500 if(wordLength>1) {
501 word=findWord(name+start, wordLength);
502 if(word==NULL) {
503 word=addWord(name+start, wordLength);
504 }
505 countWord(word);
506 }
507
508 #if 0
509 /*
510 * if there was a word before this
511 * (with no noise in between), then add the pair of words, too
512 */
513 if(prevStart!=-1) {
514 wordLength=limit-prevStart;
515 word=findWord(name+prevStart, wordLength);
516 if(word==NULL) {
517 word=addWord(name+prevStart, wordLength);
518 }
519 countWord(word);
520 }
521 #endif
522
523 /*prevStart=start;*/
524 start=limit;
525 }
526 }
527
528 static UBool U_INLINE
529 isWordChar(char c) {
530 return ('A'<=c && c<='I') || /* EBCDIC-safe check for letters */
531 ('J'<=c && c<='R') ||
532 ('S'<=c && c<='Z') ||
533
534 ('a'<=c && c<='i') || /* lowercase letters for ISO comments */
535 ('j'<=c && c<='r') ||
536 ('s'<=c && c<='z') ||
537
538 ('0'<=c && c<='9');
539 }
540
/*
 * Return the index of the first word character in line[start..limit[,
 * or limit if there is none.
 */
static int16_t
skipNoise(char *line, int16_t start, int16_t limit) {
    int16_t pos;

    for(pos=start; pos<limit; ++pos) {
        if(isWordChar(line[pos])) {
            break;
        }
    }
    return pos;
}
550
/*
 * Return the index just behind the word that begins at line[start]:
 * a run of word characters (letters and digits, see isWordChar()),
 * plus one directly following space or dash if there is one.
 * The result never exceeds limit.
 */
static int16_t
getWord(char *line, int16_t start, int16_t limit) {
    int16_t end=start;

    /* the run of word characters */
    while(end<limit && isWordChar(line[end])) {
        ++end;
    }

    /* a following space or dash belongs to the word */
    if(end<limit) {
        char next=line[end];
        if(next==' ' || next=='-') {
            ++end;
        }
    }

    return end;
}
567
568 /* compressing -------------------------------------------------------------- */
569
570 static void
571 compress() {
572 uint32_t i, letterCount;
573 int16_t wordNumber;
574 UErrorCode errorCode;
575
576 /* sort the words in reverse order by weight */
577 errorCode=U_ZERO_ERROR;
578 uprv_sortArray(words, wordCount, sizeof(Word),
579 compareWords, NULL, FALSE, &errorCode);
580
581 /* remove the words that do not save anything */
582 while(wordCount>0 && words[wordCount-1].weight<1) {
583 --wordCount;
584 }
585
586 /* count the letters in the token range */
587 letterCount=0;
588 for(i=LEADBYTE_LIMIT; i<256; ++i) {
589 if(tokens[i]==-1) {
590 ++letterCount;
591 }
592 }
593 if(!beQuiet) {
594 printf("number of letters used in the names: %d\n", (int)letterCount);
595 }
596
597 /* do we need double-byte tokens? */
598 if(wordCount+letterCount<=256) {
599 /* no, single-byte tokens are enough */
600 leadByteCount=0;
601 for(i=0, wordNumber=0; wordNumber<(int16_t)wordCount; ++i) {
602 if(tokens[i]!=-1) {
603 tokens[i]=wordNumber;
604 if(beVerbose) {
605 printf("tokens[0x%03x]: word%8ld \"%.*s\"\n",
606 (int)i, (long)words[wordNumber].weight,
607 words[wordNumber].length, words[wordNumber].s);
608 }
609 ++wordNumber;
610 }
611 }
612 tokenCount=i;
613 } else {
614 /*
615 * The tokens that need two token bytes
616 * get their weight reduced by their count
617 * because they save less.
618 */
619 tokenCount=256-letterCount;
620 for(i=tokenCount; i<wordCount; ++i) {
621 words[i].weight-=words[i].count;
622 }
623
624 /* sort these words in reverse order by weight */
625 errorCode=U_ZERO_ERROR;
626 uprv_sortArray(words+tokenCount, wordCount-tokenCount, sizeof(Word),
627 compareWords, NULL, FALSE, &errorCode);
628
629 /* remove the words that do not save anything */
630 while(wordCount>0 && words[wordCount-1].weight<1) {
631 --wordCount;
632 }
633
634 /* how many tokens and lead bytes do we have now? */
635 tokenCount=wordCount+letterCount+(LEADBYTE_LIMIT-1);
636 /*
637 * adjust upwards to take into account that
638 * double-byte tokens must not
639 * use NAME_SEPARATOR_CHAR as a second byte
640 */
641 tokenCount+=(tokenCount-256+254)/255;
642
643 leadByteCount=(int16_t)(tokenCount>>8);
644 if(leadByteCount<LEADBYTE_LIMIT) {
645 /* adjust for the real number of lead bytes */
646 tokenCount-=(LEADBYTE_LIMIT-1)-leadByteCount;
647 } else {
648 /* limit the number of lead bytes */
649 leadByteCount=LEADBYTE_LIMIT-1;
650 tokenCount=LEADBYTE_LIMIT*256;
651 wordCount=tokenCount-letterCount-(LEADBYTE_LIMIT-1);
652 /* adjust again to skip double-byte tokens with ';' */
653 wordCount-=(tokenCount-256+254)/255;
654 }
655
656 /* set token 0 to word 0 */
657 tokens[0]=0;
658 if(beVerbose) {
659 printf("tokens[0x000]: word%8ld \"%.*s\"\n",
660 (long)words[0].weight,
661 words[0].length, words[0].s);
662 }
663 wordNumber=1;
664
665 /* set the lead byte tokens */
666 for(i=1; (int16_t)i<=leadByteCount; ++i) {
667 tokens[i]=-2;
668 }
669
670 /* set the tokens */
671 for(; i<256; ++i) {
672 /* if store10Names then the parser set tokens[NAME_SEPARATOR_CHAR]=-1 */
673 if(tokens[i]!=-1) {
674 tokens[i]=wordNumber;
675 if(beVerbose) {
676 printf("tokens[0x%03x]: word%8ld \"%.*s\"\n",
677 (int)i, (long)words[wordNumber].weight,
678 words[wordNumber].length, words[wordNumber].s);
679 }
680 ++wordNumber;
681 }
682 }
683
684 /* continue above 255 where there are no letters */
685 for(; (uint32_t)wordNumber<wordCount; ++i) {
686 if((i&0xff)==NAME_SEPARATOR_CHAR) {
687 tokens[i]=-1; /* do not use NAME_SEPARATOR_CHAR as a second token byte */
688 } else {
689 tokens[i]=wordNumber;
690 if(beVerbose) {
691 printf("tokens[0x%03x]: word%8ld \"%.*s\"\n",
692 (int)i, (long)words[wordNumber].weight,
693 words[wordNumber].length, words[wordNumber].s);
694 }
695 ++wordNumber;
696 }
697 }
698 tokenCount=i; /* should be already tokenCount={i or i+1} */
699 }
700
701 if(!beQuiet) {
702 printf("number of lead bytes: %d\n", leadByteCount);
703 printf("number of single-byte tokens: %lu\n",
704 (unsigned long)256-letterCount-leadByteCount);
705 printf("number of tokens: %lu\n", (unsigned long)tokenCount);
706 }
707
708 compressLines();
709 }
710
711 static void
712 compressLines() {
713 Line *line=NULL;
714 uint32_t i=0, inLine, outLine=0xffffffff /* (uint32_t)(-1) */,
715 groupMSB=0xffff, lineCount2;
716 int16_t groupTop=0;
717
718 /* store the groups like lines, reusing the lines' memory */
719 lineTop=0;
720 lineCount2=lineCount;
721 lineCount=0;
722
723 /* loop over all lines */
724 while(i<lineCount2) {
725 line=lines+i++;
726 inLine=line->code;
727
728 /* segment the lines to groups of 32 */
729 if(inLine>>GROUP_SHIFT!=groupMSB) {
730 /* finish the current group with empty lines */
731 while((++outLine&GROUP_MASK)!=0) {
732 appendLineLength(0);
733 }
734
735 /* store the group like a line */
736 if(groupTop>0) {
737 if(groupTop>GROUP_STORE_SIZE) {
738 fprintf(stderr, "gennames: group store overflow\n");
739 exit(U_BUFFER_OVERFLOW_ERROR);
740 }
741 addGroup(groupMSB, groupStore, groupTop);
742 if(lineTop>(uint32_t)(line->s-stringStore)) {
743 fprintf(stderr, "gennames: group store runs into string store\n");
744 exit(U_INTERNAL_PROGRAM_ERROR);
745 }
746 }
747
748 /* start the new group */
749 lineLengthsTop=0;
750 groupTop=0;
751 groupMSB=inLine>>GROUP_SHIFT;
752 outLine=(inLine&~GROUP_MASK)-1;
753 }
754
755 /* write empty lines between the previous line in the group and this one */
756 while(++outLine<inLine) {
757 appendLineLength(0);
758 }
759
760 /* write characters and tokens for this line */
761 appendLineLength(compressLine(line->s, line->length, &groupTop));
762 }
763
764 /* finish and store the last group */
765 if(line && groupMSB!=0xffff) {
766 /* finish the current group with empty lines */
767 while((++outLine&GROUP_MASK)!=0) {
768 appendLineLength(0);
769 }
770
771 /* store the group like a line */
772 if(groupTop>0) {
773 if(groupTop>GROUP_STORE_SIZE) {
774 fprintf(stderr, "gennames: group store overflow\n");
775 exit(U_BUFFER_OVERFLOW_ERROR);
776 }
777 addGroup(groupMSB, groupStore, groupTop);
778 if(lineTop>(uint32_t)(line->s-stringStore)) {
779 fprintf(stderr, "gennames: group store runs into string store\n");
780 exit(U_INTERNAL_PROGRAM_ERROR);
781 }
782 }
783 }
784
785 if(!beQuiet) {
786 printf("number of groups: %lu\n", (unsigned long)lineCount);
787 }
788 }
789
790 static int16_t
791 compressLine(uint8_t *s, int16_t length, int16_t *pGroupTop) {
792 int16_t start, limit, token, groupTop=*pGroupTop;
793
794 start=0;
795 do {
796 /* write any "noise" characters */
797 limit=skipNoise((char *)s, start, length);
798 while(start<limit) {
799 groupStore[groupTop++]=s[start++];
800 }
801
802 if(start==length) {
803 break;
804 }
805
806 /* write a word, as token or directly */
807 limit=getWord((char *)s, start, length);
808 if(limit-start==1) {
809 groupStore[groupTop++]=s[start++];
810 } else {
811 token=findToken(s+start, (int16_t)(limit-start));
812 if(token!=-1) {
813 if(token>0xff) {
814 groupStore[groupTop++]=(uint8_t)(token>>8);
815 }
816 groupStore[groupTop++]=(uint8_t)token;
817 start=limit;
818 } else {
819 while(start<limit) {
820 groupStore[groupTop++]=s[start++];
821 }
822 }
823 }
824 } while(start<length);
825
826 length=(int16_t)(groupTop-*pGroupTop);
827 *pGroupTop=groupTop;
828 return length;
829 }
830
831 static int32_t
832 compareWords(const void *context, const void *word1, const void *word2) {
833 /* reverse sort by word weight */
834 return ((Word *)word2)->weight-((Word *)word1)->weight;
835 }
836
837 /* generate output data ----------------------------------------------------- */
838
839 static void
840 generateData(const char *dataDir) {
841 UNewDataMemory *pData;
842 UErrorCode errorCode=U_ZERO_ERROR;
843 uint16_t groupWords[3];
844 uint32_t i, groupTop=lineTop, offset, size,
845 tokenStringOffset, groupsOffset, groupStringOffset, algNamesOffset;
846 long dataLength;
847 int16_t token;
848
849 pData=udata_create(dataDir, DATA_TYPE,DATA_NAME, &dataInfo,
850 haveCopyright ? U_COPYRIGHT_STRING : NULL, &errorCode);
851 if(U_FAILURE(errorCode)) {
852 fprintf(stderr, "gennames: unable to create data memory, error %d\n", errorCode);
853 exit(errorCode);
854 }
855
856 /* first, see how much space we need, and prepare the token strings */
857 for(i=0; i<tokenCount; ++i) {
858 token=tokens[i];
859 if(token!=-1 && token!=-2) {
860 tokens[i]=(int16_t)(addToken(words[token].s, words[token].length)-groupTop);
861 }
862 }
863
864 /*
865 * Calculate the total size in bytes of the data including:
866 * - the offset to the token strings, uint32_t (4)
867 * - the offset to the group table, uint32_t (4)
868 * - the offset to the group strings, uint32_t (4)
869 * - the offset to the algorithmic names, uint32_t (4)
870 *
871 * - the number of tokens, uint16_t (2)
872 * - the token table, uint16_t[tokenCount] (2*tokenCount)
873 *
874 * - the token strings, each zero-terminated (tokenSize=(lineTop-groupTop)), 2-padded
875 *
876 * - the number of groups, uint16_t (2)
877 * - the group table, { uint16_t groupMSB, uint16_t offsetHigh, uint16_t offsetLow }[6*groupCount]
878 *
879 * - the group strings (groupTop), 2-padded
880 *
881 * - the size of the data for the algorithmic names
882 */
883 tokenStringOffset=4+4+4+4+2+2*tokenCount;
884 groupsOffset=(tokenStringOffset+(lineTop-groupTop+1))&~1;
885 groupStringOffset=groupsOffset+2+6*lineCount;
886 algNamesOffset=(groupStringOffset+groupTop+3)&~3;
887
888 offset=generateAlgorithmicData(NULL);
889 size=algNamesOffset+offset;
890
891 if(!beQuiet) {
892 printf("size of the Unicode Names data:\n"
893 "total data length %lu, token strings %lu, compressed strings %lu, algorithmic names %lu\n",
894 (unsigned long)size, (unsigned long)(lineTop-groupTop),
895 (unsigned long)groupTop, (unsigned long)offset);
896 }
897
898 /* write the data to the file */
899 /* offsets */
900 udata_write32(pData, tokenStringOffset);
901 udata_write32(pData, groupsOffset);
902 udata_write32(pData, groupStringOffset);
903 udata_write32(pData, algNamesOffset);
904
905 /* token table */
906 udata_write16(pData, (uint16_t)tokenCount);
907 udata_writeBlock(pData, tokens, 2*tokenCount);
908
909 /* token strings */
910 udata_writeBlock(pData, stringStore+groupTop, lineTop-groupTop);
911 if((lineTop-groupTop)&1) {
912 /* 2-padding */
913 udata_writePadding(pData, 1);
914 }
915
916 /* group table */
917 udata_write16(pData, (uint16_t)lineCount);
918 for(i=0; i<lineCount; ++i) {
919 /* groupMSB */
920 groupWords[0]=(uint16_t)lines[i].code;
921
922 /* offset */
923 offset = (uint32_t)(lines[i].s - stringStore);
924 groupWords[1]=(uint16_t)(offset>>16);
925 groupWords[2]=(uint16_t)(offset);
926 udata_writeBlock(pData, groupWords, 6);
927 }
928
929 /* group strings */
930 udata_writeBlock(pData, stringStore, groupTop);
931
932 /* 4-align the algorithmic names data */
933 udata_writePadding(pData, algNamesOffset-(groupStringOffset+groupTop));
934
935 generateAlgorithmicData(pData);
936
937 /* finish up */
938 dataLength=udata_finish(pData, &errorCode);
939 if(U_FAILURE(errorCode)) {
940 fprintf(stderr, "gennames: error %d writing the output file\n", errorCode);
941 exit(errorCode);
942 }
943
944 if(dataLength!=(long)size) {
945 fprintf(stderr, "gennames: data length %ld != calculated size %lu\n",
946 dataLength, (unsigned long)size);
947 exit(U_INTERNAL_PROGRAM_ERROR);
948 }
949 }
950
/*
 * Header of one range of algorithmic names as it is written to the data file.
 * The structure for algorithmic names needs to be 4-aligned.
 */
typedef struct AlgorithmicRange {
    uint32_t rangeStart;        /* first code point of the range */
    uint32_t rangeEnd;          /* last code point of the range */
    uint8_t algorithmType;      /* 0: prefix + hex digits, 1: modulo factors */
    uint8_t algorithmVariant;   /* type 0: number of digits, type 1: number of factors */
    uint16_t rangeSize;         /* size of this header plus the range data */
} AlgorithmicRange;
957
958 static uint32_t
959 generateAlgorithmicData(UNewDataMemory *pData) {
960 static char prefix[] = "CJK UNIFIED IDEOGRAPH-";
961 # define PREFIX_LENGTH 23
962 # define PREFIX_LENGTH_4 24
963 uint32_t countAlgRanges;
964
965 static AlgorithmicRange cjkExtA={
966 0x3400, 0x4db5,
967 0, 4,
968 sizeof(AlgorithmicRange)+PREFIX_LENGTH_4
969 };
970 static AlgorithmicRange cjk={
971 0x4e00, 0x9fa5,
972 0, 4,
973 sizeof(AlgorithmicRange)+PREFIX_LENGTH_4
974 };
975 static AlgorithmicRange cjkExtB={
976 0x20000, 0x2a6d6,
977 0, 5,
978 sizeof(AlgorithmicRange)+PREFIX_LENGTH_4
979 };
980
981 static char jamo[]=
982 "HANGUL SYLLABLE \0"
983
984 "G\0GG\0N\0D\0DD\0R\0M\0B\0BB\0"
985 "S\0SS\0\0J\0JJ\0C\0K\0T\0P\0H\0"
986
987 "A\0AE\0YA\0YAE\0EO\0E\0YEO\0YE\0O\0"
988 "WA\0WAE\0OE\0YO\0U\0WEO\0WE\0WI\0"
989 "YU\0EU\0YI\0I\0"
990
991 "\0G\0GG\0GS\0N\0NJ\0NH\0D\0L\0LG\0LM\0"
992 "LB\0LS\0LT\0LP\0LH\0M\0B\0BS\0"
993 "S\0SS\0NG\0J\0C\0K\0T\0P\0H"
994 ;
995
996 static AlgorithmicRange hangul={
997 0xac00, 0xd7a3,
998 1, 3,
999 sizeof(AlgorithmicRange)+6+sizeof(jamo)
1000 };
1001
1002 /* modulo factors, maximum 8 */
1003 /* 3 factors: 19, 21, 28, most-to-least-significant */
1004 static uint16_t hangulFactors[3]={
1005 19, 21, 28
1006 };
1007
1008 uint32_t size;
1009
1010 size=0;
1011
1012 /* number of ranges of algorithmic names */
1013 if(uprv_memcmp(dataInfo.dataVersion, unicode_3_1, sizeof(UVersionInfo))>=0) {
1014 /* Unicode 3.1 and up has 4 ranges including CJK Extension B */
1015 countAlgRanges=4;
1016 } else if(uprv_memcmp(dataInfo.dataVersion, unicode_3_0, sizeof(UVersionInfo))>=0) {
1017 /* Unicode 3.0 has 3 ranges including CJK Extension A */
1018 countAlgRanges=3;
1019 } else {
1020 /* Unicode 2.0 has 2 ranges including Hangul and CJK Unihan */
1021 countAlgRanges=2;
1022 }
1023
1024 if(pData!=NULL) {
1025 udata_write32(pData, countAlgRanges);
1026 } else {
1027 size+=4;
1028 }
1029
1030 /*
1031 * each range:
1032 * uint32_t rangeStart
1033 * uint32_t rangeEnd
1034 * uint8_t algorithmType
1035 * uint8_t algorithmVariant
1036 * uint16_t size of range data
1037 * uint8_t[size] data
1038 */
1039
1040 /* range 0: cjk extension a */
1041 if(countAlgRanges>=3) {
1042 if(pData!=NULL) {
1043 udata_writeBlock(pData, &cjkExtA, sizeof(AlgorithmicRange));
1044 udata_writeString(pData, prefix, PREFIX_LENGTH);
1045 if(PREFIX_LENGTH<PREFIX_LENGTH_4) {
1046 udata_writePadding(pData, PREFIX_LENGTH_4-PREFIX_LENGTH);
1047 }
1048 } else {
1049 size+=sizeof(AlgorithmicRange)+PREFIX_LENGTH_4;
1050 }
1051 }
1052
1053 /* range 1: cjk */
1054 if(pData!=NULL) {
1055 udata_writeBlock(pData, &cjk, sizeof(AlgorithmicRange));
1056 udata_writeString(pData, prefix, PREFIX_LENGTH);
1057 if(PREFIX_LENGTH<PREFIX_LENGTH_4) {
1058 udata_writePadding(pData, PREFIX_LENGTH_4-PREFIX_LENGTH);
1059 }
1060 } else {
1061 size+=sizeof(AlgorithmicRange)+PREFIX_LENGTH_4;
1062 }
1063
1064 /* range 2: hangul syllables */
1065 if(pData!=NULL) {
1066 udata_writeBlock(pData, &hangul, sizeof(AlgorithmicRange));
1067 udata_writeBlock(pData, hangulFactors, 6);
1068 udata_writeString(pData, jamo, sizeof(jamo));
1069 } else {
1070 size+=sizeof(AlgorithmicRange)+6+sizeof(jamo);
1071 }
1072
1073 /* range 3: cjk extension b */
1074 if(countAlgRanges>=4) {
1075 if(pData!=NULL) {
1076 udata_writeBlock(pData, &cjkExtB, sizeof(AlgorithmicRange));
1077 udata_writeString(pData, prefix, PREFIX_LENGTH);
1078 if(PREFIX_LENGTH<PREFIX_LENGTH_4) {
1079 udata_writePadding(pData, PREFIX_LENGTH_4-PREFIX_LENGTH);
1080 }
1081 } else {
1082 size+=sizeof(AlgorithmicRange)+PREFIX_LENGTH_4;
1083 }
1084 }
1085
1086 return size;
1087 }
1088
1089 /* helpers ------------------------------------------------------------------ */
1090
1091 static int16_t
1092 findToken(uint8_t *s, int16_t length) {
1093 int16_t i, token;
1094
1095 for(i=0; i<(int16_t)tokenCount; ++i) {
1096 token=tokens[i];
1097 if(token!=-1 && length==words[token].length && 0==uprv_memcmp(s, words[token].s, length)) {
1098 return i;
1099 }
1100 }
1101
1102 return -1;
1103 }
1104
1105 static Word *
1106 findWord(char *s, int16_t length) {
1107 uint32_t i;
1108
1109 for(i=0; i<wordCount; ++i) {
1110 if(length==words[i].length && 0==uprv_memcmp(s, words[i].s, length)) {
1111 return words+i;
1112 }
1113 }
1114
1115 return NULL;
1116 }
1117
1118 static Word *
1119 addWord(char *s, int16_t length) {
1120 uint8_t *stringStart;
1121 Word *word;
1122
1123 if(wordCount==MAX_WORD_COUNT) {
1124 fprintf(stderr, "gennames: too many words\n");
1125 exit(U_BUFFER_OVERFLOW_ERROR);
1126 }
1127
1128 stringStart=allocWord(length);
1129 uprv_memcpy(stringStart, s, length);
1130
1131 word=words+wordCount;
1132
1133 /*
1134 * Initialize the weight with the costs for this token:
1135 * a zero-terminated string and a 16-bit offset.
1136 */
1137 word->weight=-(length+1+2);
1138 word->count=0;
1139 word->length=length;
1140 word->s=stringStart;
1141
1142 ++wordCount;
1143
1144 return word;
1145 }
1146
1147 static void
1148 countWord(Word *word) {
1149 /* add to the weight the savings: the length of the word minus 1 byte for the token */
1150 word->weight+=word->length-1;
1151 ++word->count;
1152 }
1153
1154 static void
1155 addLine(uint32_t code, char *names[], int16_t lengths[], int16_t count) {
1156 uint8_t *stringStart;
1157 Line *line;
1158 int16_t i, length;
1159
1160 if(lineCount==MAX_LINE_COUNT) {
1161 fprintf(stderr, "gennames: too many lines\n");
1162 exit(U_BUFFER_OVERFLOW_ERROR);
1163 }
1164
1165 /* find the last non-empty name */
1166 while(count>0 && lengths[count-1]==0) {
1167 --count;
1168 }
1169 if(count==0) {
1170 return; /* should not occur: caller should not have called */
1171 }
1172
1173 /* there will be (count-1) separator characters */
1174 i=count;
1175 length=count-1;
1176
1177 /* add lengths of strings */
1178 while(i>0) {
1179 length+=lengths[--i];
1180 }
1181
1182 /* allocate line memory */
1183 stringStart=allocLine(length);
1184
1185 /* copy all strings into the line memory */
1186 length=0; /* number of chars copied so far */
1187 for(i=0; i<count; ++i) {
1188 if(i>0) {
1189 stringStart[length++]=NAME_SEPARATOR_CHAR;
1190 }
1191 if(lengths[i]>0) {
1192 uprv_memcpy(stringStart+length, names[i], lengths[i]);
1193 length+=lengths[i];
1194 }
1195 }
1196
1197 line=lines+lineCount;
1198
1199 line->code=code;
1200 line->length=length;
1201 line->s=stringStart;
1202
1203 ++lineCount;
1204
1205 /* prevent a character value that is actually in a name from becoming a token */
1206 while(length>0) {
1207 tokens[stringStart[--length]]=-1;
1208 }
1209 }
1210
1211 static void
1212 addGroup(uint32_t groupMSB, uint8_t *strings, int16_t length) {
1213 uint8_t *stringStart;
1214 Line *line;
1215
1216 if(lineCount==MAX_LINE_COUNT) {
1217 fprintf(stderr, "gennames: too many groups\n");
1218 exit(U_BUFFER_OVERFLOW_ERROR);
1219 }
1220
1221 /* store the line lengths first, then the strings */
1222 lineLengthsTop=(lineLengthsTop+1)/2;
1223 stringStart=allocLine(lineLengthsTop+length);
1224 uprv_memcpy(stringStart, lineLengths, lineLengthsTop);
1225 uprv_memcpy(stringStart+lineLengthsTop, strings, length);
1226
1227 line=lines+lineCount;
1228
1229 line->code=groupMSB;
1230 line->length=length;
1231 line->s=stringStart;
1232
1233 ++lineCount;
1234 }
1235
1236 static uint32_t
1237 addToken(uint8_t *s, int16_t length) {
1238 uint8_t *stringStart;
1239
1240 stringStart=allocLine(length+1);
1241 uprv_memcpy(stringStart, s, length);
1242 stringStart[length]=0;
1243
1244 return (uint32_t)(stringStart - stringStore);
1245 }
1246
1247 static void
1248 appendLineLength(int16_t length) {
1249 if(length>=76) {
1250 fprintf(stderr, "gennames: compressed line too long\n");
1251 exit(U_BUFFER_OVERFLOW_ERROR);
1252 }
1253 if(length>=12) {
1254 length-=12;
1255 appendLineLengthNibble((uint8_t)((length>>4)|12));
1256 }
1257 appendLineLengthNibble((uint8_t)length);
1258 }
1259
1260 static void
1261 appendLineLengthNibble(uint8_t nibble) {
1262 if((lineLengthsTop&1)==0) {
1263 lineLengths[lineLengthsTop/2]=(uint8_t)(nibble<<4);
1264 } else {
1265 lineLengths[lineLengthsTop/2]|=nibble&0xf;
1266 }
1267 ++lineLengthsTop;
1268 }
1269
1270 static uint8_t *
1271 allocLine(int32_t length) {
1272 uint32_t top=lineTop+length;
1273 uint8_t *p;
1274
1275 if(top>wordBottom) {
1276 fprintf(stderr, "gennames: out of memory\n");
1277 exit(U_MEMORY_ALLOCATION_ERROR);
1278 }
1279 p=stringStore+lineTop;
1280 lineTop=top;
1281 return p;
1282 }
1283
1284 static uint8_t *
1285 allocWord(uint32_t length) {
1286 uint32_t bottom=wordBottom-length;
1287
1288 if(lineTop>bottom) {
1289 fprintf(stderr, "gennames: out of memory\n");
1290 exit(U_MEMORY_ALLOCATION_ERROR);
1291 }
1292 wordBottom=bottom;
1293 return stringStore+bottom;
1294 }
1295
1296 /*
1297 * Hey, Emacs, please set the following:
1298 *
1299 * Local Variables:
1300 * indent-tabs-mode: nil
1301 * End:
1302 *
1303 */