]> git.saurik.com Git - apple/icu.git/blame - icuSources/tools/gennames/gennames.c
ICU-8.11.1.tar.gz
[apple/icu.git] / icuSources / tools / gennames / gennames.c
CommitLineData
b75a7d8f
A
1/*
2*******************************************************************************
3*
73c04bcf 4* Copyright (C) 1999-2005, International Business Machines
b75a7d8f
A
5* Corporation and others. All Rights Reserved.
6*
7*******************************************************************************
8* file name: gennames.c
9* encoding: US-ASCII
10* tab size: 8 (not used)
11* indentation:4
12*
13* created on: 1999sep30
14* created by: Markus W. Scherer
15*
16* This program reads the Unicode character database text file,
17* parses it, and extracts the character code,
18* the "modern" character name, and optionally the
19* Unicode 1.0 character name, and (starting with ICU 2.2) the ISO 10646 comment.
20* It then tokenizes and compresses the names and builds
21* compact binary tables for random-access lookup
22* in a u_charName() API function.
23*
24* unames.icu file format (after UDataInfo header etc. - see udata.c)
25* (all data is static const)
26*
27* UDataInfo fields:
28* dataFormat "unam"
29* formatVersion 1.0
30* dataVersion = Unicode version from -u or --unicode command line option, defaults to 3.0.0
31*
32* -- data-based names
33* uint32_t tokenStringOffset,
34* groupsOffset,
35* groupStringOffset,
36* algNamesOffset;
37*
38* uint16_t tokenCount;
39* uint16_t tokenTable[tokenCount];
40*
41* char tokenStrings[]; -- padded to even count
42*
43* -- strings (groupStrings) are tokenized as follows:
44* for each character c
45* if(c>=tokenCount) write that character c directly
46* else
47* token=tokenTable[c];
48* if(token==0xfffe) -- lead byte of double-byte token
49* token=tokenTable[c<<8|next character];
50* if(token==-1)
51* write c directly
52* else
53* tokenString=tokenStrings+token; (tokenStrings=start of names data + tokenStringOffset;)
54* append zero-terminated tokenString;
55*
56* Different strings for a code point - normal name, 1.0 name, and ISO comment -
57* are separated by ';'.
58*
59* uint16_t groupCount;
60* struct {
61* uint16_t groupMSB; -- for a group of 32 character names stored, this is code point>>5
62* uint16_t offsetHigh; -- group strings are at start of names data + groupStringsOffset + this 32 bit-offset
63* uint16_t offsetLow;
64* } groupTable[groupCount];
65*
66* char groupStrings[]; -- padded to 4-count
67*
68* -- The actual, tokenized group strings are not zero-terminated because
69* that would take up too much space.
70* Instead, they are preceeded by their length, written in a variable-length sequence:
71* For each of the 32 group strings, one or two nibbles are stored for its length.
72* Nibbles (4-bit values, half-bytes) are read MSB first.
73* A nibble with a value of 0..11 directly indicates the length of the name string.
74* A nibble n with a value of 12..15 is a lead nibble and forms a value with the following nibble m
75* by (((n-12)<<4)|m)+12, reaching values of 12..75.
76* These lengths are sequentially for each tokenized string, not for the de-tokenized result.
77* For the de-tokenizing, see token description above; the strings immediately follow the
78* 32 lengths.
79*
80* -- algorithmic names
81*
82* typedef struct AlgorithmicRange {
83* uint32_t rangeStart, rangeEnd;
84* uint8_t algorithmType, algorithmVariant;
85* uint16_t rangeSize;
86* } AlgorithmicRange;
87*
88* uint32_t algRangesCount; -- number of data blocks for ranges of
89* algorithmic names (Unicode 3.0.0: 3, hardcoded in gennames)
90*
91* struct {
92* AlgorithmicRange algRange;
93* uint8_t algRangeData[]; -- padded to 4-count except in last range
94* } algRanges[algNamesCount];
95* -- not a real array because each part has a different size
96* of algRange.rangeSize (including AlgorithmicRange)
97*
98* -- algorithmic range types:
99*
100* 0 Names are formed from a string prefix that is stored in
101* the algRangeData (zero-terminated), followed by the Unicode code point
102* of the character in hexadecimal digits;
103* algRange.algorithmVariant digits are written
104*
105* 1 Names are formed by calculating modulo-factors of the code point value as follows:
106* algRange.algorithmVariant is the count of modulo factors
107* algRangeData contains
108* uint16_t factors[algRange.algorithmVariant];
109* char strings[];
110* the first zero-terminated string is written as the prefix; then:
111*
112* The rangeStart is subtracted; with the difference, here "code":
113* for(i=algRange.algorithmVariant-1 to 0 step -1)
114* index[i]=code%factor[i];
115* code/=factor[i];
116*
117* The strings after the prefix are short pieces that are then appended to the result
118* according to index[0..algRange.algorithmVariant-1].
119*/
120
121#include <stdio.h>
b75a7d8f
A
122#include "unicode/utypes.h"
123#include "unicode/putil.h"
374ca955
A
124#include "unicode/uclean.h"
125#include "unicode/udata.h"
b75a7d8f
A
126#include "cmemory.h"
127#include "cstring.h"
374ca955 128#include "uarrsort.h"
b75a7d8f
A
129#include "unewdata.h"
130#include "uoptions.h"
131#include "uparse.h"
132
133#define STRING_STORE_SIZE 1000000
134#define GROUP_STORE_SIZE 5000
135
136#define GROUP_SHIFT 5
137#define LINES_PER_GROUP (1UL<<GROUP_SHIFT)
138#define GROUP_MASK (LINES_PER_GROUP-1)
139
140#define MAX_LINE_COUNT 50000
141#define MAX_WORD_COUNT 20000
142#define MAX_GROUP_COUNT 5000
143
144#define DATA_NAME "unames"
145#define DATA_TYPE "icu"
146#define VERSION_STRING "unam"
147#define NAME_SEPARATOR_CHAR ';'
148
73c04bcf
A
149/* Unicode versions --------------------------------------------------------- */
150
151enum {
152 UNI_1_0,
153 UNI_1_1,
154 UNI_2_0,
155 UNI_3_0,
156 UNI_3_1,
157 UNI_3_2,
158 UNI_4_0,
159 UNI_4_0_1,
160 UNI_4_1,
161 UNI_VER_COUNT
162};
163
b75a7d8f 164static const UVersionInfo
73c04bcf
A
165unicodeVersions[]={
166 { 1, 0, 0, 0 },
167 { 1, 1, 0, 0 },
168 { 2, 0, 0, 0 },
169 { 3, 0, 0, 0 },
170 { 3, 1, 0, 0 },
171 { 3, 2, 0, 0 },
172 { 4, 0, 0, 0 },
173 { 4, 0, 1, 0 },
174 { 4, 1, 0, 0 }
175};
176
177static int32_t ucdVersion=UNI_4_1;
178
179static int32_t
180findUnicodeVersion(const UVersionInfo version) {
181 int32_t i;
182
183 for(i=0; /* while(version>unicodeVersions[i]) {} */
184 i<UNI_VER_COUNT && uprv_memcmp(version, unicodeVersions[i], 4)>0;
185 ++i) {}
186 if(0<i && i<UNI_VER_COUNT && uprv_memcmp(version, unicodeVersions[i], 4)<0) {
187 --i; /* fix 4.0.2 to land before 4.1, for valid x>=ucdVersion comparisons */
188 }
189 return i; /* version>=unicodeVersions[i] && version<unicodeVersions[i+1]; possible: i==UNI_VER_COUNT */
190}
191
192/* generator data ----------------------------------------------------------- */
b75a7d8f
A
193
194/* UDataInfo cf. udata.h */
195static UDataInfo dataInfo={
196 sizeof(UDataInfo),
197 0,
198
199 U_IS_BIG_ENDIAN,
200 U_CHARSET_FAMILY,
201 sizeof(UChar),
202 0,
203
204 {0x75, 0x6e, 0x61, 0x6d}, /* dataFormat="unam" */
205 {1, 0, 0, 0}, /* formatVersion */
206 {3, 0, 0, 0} /* dataVersion */
207};
208
209static UBool beVerbose=FALSE, beQuiet=FALSE, haveCopyright=TRUE;
210
211static uint8_t stringStore[STRING_STORE_SIZE],
212 groupStore[GROUP_STORE_SIZE],
213 lineLengths[LINES_PER_GROUP];
214
215static uint32_t lineTop=0, wordBottom=STRING_STORE_SIZE, lineLengthsTop;
216
217typedef struct {
218 uint32_t code;
219 int16_t length;
220 uint8_t *s;
221} Line;
222
223typedef struct {
224 int32_t weight; /* -(cost for token) + (number of occurences) * (length-1) */
225 int16_t count;
226 int16_t length;
227 uint8_t *s;
228} Word;
229
230static Line lines[MAX_LINE_COUNT];
231static Word words[MAX_WORD_COUNT];
232
233static uint32_t lineCount=0, wordCount=0;
234
235static int16_t leadByteCount;
236
237#define LEADBYTE_LIMIT 16
238
239static int16_t tokens[LEADBYTE_LIMIT*256];
240static uint32_t tokenCount;
241
242/* prototypes --------------------------------------------------------------- */
243
244static void
245init(void);
246
247static void
248parseDB(const char *filename, UBool store10Names);
249
250static void
251parseName(char *name, int16_t length);
252
253static int16_t
254skipNoise(char *line, int16_t start, int16_t limit);
255
256static int16_t
257getWord(char *line, int16_t start, int16_t limit);
258
259static void
260compress(void);
261
262static void
263compressLines(void);
264
265static int16_t
266compressLine(uint8_t *s, int16_t length, int16_t *pGroupTop);
267
374ca955
A
268static int32_t
269compareWords(const void *context, const void *word1, const void *word2);
b75a7d8f
A
270
271static void
272generateData(const char *dataDir);
273
274static uint32_t
275generateAlgorithmicData(UNewDataMemory *pData);
276
277static int16_t
278findToken(uint8_t *s, int16_t length);
279
280static Word *
281findWord(char *s, int16_t length);
282
283static Word *
284addWord(char *s, int16_t length);
285
286static void
287countWord(Word *word);
288
289static void
290addLine(uint32_t code, char *names[], int16_t lengths[], int16_t count);
291
292static void
293addGroup(uint32_t groupMSB, uint8_t *strings, int16_t length);
294
295static uint32_t
296addToken(uint8_t *s, int16_t length);
297
298static void
299appendLineLength(int16_t length);
300
301static void
302appendLineLengthNibble(uint8_t nibble);
303
304static uint8_t *
305allocLine(int32_t length);
306
307static uint8_t *
308allocWord(uint32_t length);
309
310/* -------------------------------------------------------------------------- */
311
312static UOption options[]={
313 UOPTION_HELP_H,
314 UOPTION_HELP_QUESTION_MARK,
315 UOPTION_VERBOSE,
316 UOPTION_QUIET,
317 UOPTION_COPYRIGHT,
318 UOPTION_DESTDIR,
319 { "unicode", NULL, NULL, NULL, 'u', UOPT_REQUIRES_ARG, 0 },
320 { "unicode1-names", NULL, NULL, NULL, '1', UOPT_NO_ARG, 0 }
321};
322
323extern int
324main(int argc, char* argv[]) {
325 UVersionInfo version;
326 UBool store10Names=FALSE;
374ca955 327 UErrorCode errorCode = U_ZERO_ERROR;
b75a7d8f
A
328
329 U_MAIN_INIT_ARGS(argc, argv);
330
374ca955
A
331 /* Initialize ICU */
332 u_init(&errorCode);
333 if (U_FAILURE(errorCode) && errorCode != U_FILE_ACCESS_ERROR) {
334 /* Note: u_init() will try to open ICU property data.
335 * failures here are expected when building ICU from scratch.
336 * ignore them.
337 */
338 fprintf(stderr, "%s: can not initialize ICU. errorCode = %s\n",
339 argv[0], u_errorName(errorCode));
340 exit(1);
341 }
342
b75a7d8f
A
343 /* preset then read command line options */
344 options[5].value=u_getDataDirectory();
73c04bcf 345 options[6].value="4.1";
b75a7d8f
A
346 argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), options);
347
348 /* error handling, printing usage message */
349 if(argc<0) {
350 fprintf(stderr,
351 "error in command line argument \"%s\"\n",
352 argv[-argc]);
353 } else if(argc<2) {
354 argc=-1;
355 }
356 if(argc<0 || options[0].doesOccur || options[1].doesOccur) {
357 /*
358 * Broken into chucks because the C89 standard says the minimum
359 * required supported string length is 509 bytes.
360 */
361 fprintf(stderr,
362 "Usage: %s [-1[+|-]] [-v[+|-]] [-c[+|-]] filename\n"
363 "\n"
364 "Read the UnicodeData.txt file and \n"
374ca955 365 "create a binary file " DATA_NAME "." DATA_TYPE " with the character names\n"
b75a7d8f
A
366 "\n"
367 "\tfilename absolute path/filename for the Unicode database text file\n"
368 "\t\t(default: standard input)\n"
369 "\n",
370 argv[0]);
371 fprintf(stderr,
372 "Options:\n"
373 "\t-h or -? or --help this usage text\n"
374 "\t-v or --verbose verbose output\n"
375 "\t-q or --quiet no output\n"
376 "\t-c or --copyright include a copyright notice\n"
377 "\t-d or --destdir destination directory, followed by the path\n"
378 "\t-u or --unicode Unicode version, followed by the version like 3.0.0\n"
379 "\t-1 or --unicode1-names store Unicode 1.0 character names\n");
380 return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR;
381 }
382
383 /* get the options values */
384 beVerbose=options[2].doesOccur;
385 beQuiet=options[3].doesOccur;
386 haveCopyright=options[4].doesOccur;
387 store10Names=options[7].doesOccur;
388
389 /* set the Unicode version */
390 u_versionFromString(version, options[6].value);
391 uprv_memcpy(dataInfo.dataVersion, version, 4);
73c04bcf 392 ucdVersion=findUnicodeVersion(version);
b75a7d8f
A
393
394 init();
395 parseDB(argc>=2 ? argv[1] : "-", store10Names);
396 compress();
397 generateData(options[5].value);
398
374ca955 399 u_cleanup();
b75a7d8f
A
400 return 0;
401}
402
403static void
404init() {
405 int i;
406
407 for(i=0; i<256; ++i) {
408 tokens[i]=0;
409 }
410}
411
412/* parsing ------------------------------------------------------------------ */
413
374ca955
A
414/* get a name, strip leading and trailing whitespace */
415static int16_t
416getName(char **pStart, char *limit) {
417 /* strip leading whitespace */
418 char *start=(char *)u_skipWhitespace(*pStart);
419
420 /* strip trailing whitespace */
421 while(start<limit && (*(limit-1)==' ' || *(limit-1)=='\t')) {
422 --limit;
423 }
424
425 /* return results */
426 *pStart=start;
427 return (int16_t)(limit-start);
428}
429
b75a7d8f
A
430static void U_CALLCONV
431lineFn(void *context,
432 char *fields[][2], int32_t fieldCount,
433 UErrorCode *pErrorCode) {
434 char *names[3];
435 int16_t lengths[3];
436 static uint32_t prevCode=0;
437 uint32_t code=0;
438
439 if(U_FAILURE(*pErrorCode)) {
440 return;
441 }
442 /* get the character code */
443 code=uprv_strtoul(fields[0][0], NULL, 16);
444
445 /* get the character name */
446 names[0]=fields[1][0];
374ca955
A
447 lengths[0]=getName(names+0, fields[1][1]);
448 if(names[0][0]=='<') {
b75a7d8f
A
449 /* do not store pseudo-names in <> brackets */
450 lengths[0]=0;
451 }
452
453 /* store 1.0 names */
454 /* get the second character name, the one from Unicode 1.0 */
455 /* do not store pseudo-names in <> brackets */
456 names[1]=fields[10][0];
374ca955
A
457 lengths[1]=getName(names+1, fields[10][1]);
458 if(*(UBool *)context && names[1][0]!='<') {
459 /* keep the name */
b75a7d8f
A
460 } else {
461 lengths[1]=0;
462 }
463
464 /* get the ISO 10646 comment */
465 names[2]=fields[11][0];
374ca955 466 lengths[2]=getName(names+2, fields[11][1]);
b75a7d8f
A
467
468 if(lengths[0]+lengths[1]+lengths[2]==0) {
469 return;
470 }
471
472 /* check for non-character code points */
473 if(!UTF_IS_UNICODE_CHAR(code)) {
474 fprintf(stderr, "gennames: error - properties for non-character code point U+%04lx\n",
475 (unsigned long)code);
476 *pErrorCode=U_PARSE_ERROR;
477 exit(U_PARSE_ERROR);
478 }
479
480 /* check that the code points (code) are in ascending order */
481 if(code<=prevCode && code>0) {
482 fprintf(stderr, "gennames: error - UnicodeData entries out of order, U+%04lx after U+%04lx\n",
483 (unsigned long)code, (unsigned long)prevCode);
484 *pErrorCode=U_PARSE_ERROR;
485 exit(U_PARSE_ERROR);
486 }
487 prevCode=code;
488
489 parseName(names[0], lengths[0]);
490 parseName(names[1], lengths[1]);
491 parseName(names[2], lengths[2]);
492
493 /*
494 * set the count argument to
495 * 1: only store regular names
496 * 2: store regular and 1.0 names
497 * 3: store names and ISO 10646 comment
498 */
499 addLine(code, names, lengths, 3);
500}
501
502static void
503parseDB(const char *filename, UBool store10Names) {
504 char *fields[15][2];
505 UErrorCode errorCode=U_ZERO_ERROR;
506
507 u_parseDelimitedFile(filename, ';', fields, 15, lineFn, &store10Names, &errorCode);
508 if(U_FAILURE(errorCode)) {
509 fprintf(stderr, "gennames parse error: %s\n", u_errorName(errorCode));
510 exit(errorCode);
511 }
512
513 if(!beQuiet) {
514 printf("size of all names in the database: %lu\n",
515 (unsigned long)lineTop);
516 printf("number of named Unicode characters: %lu\n",
517 (unsigned long)lineCount);
518 printf("number of words in the dictionary from these names: %lu\n",
519 (unsigned long)wordCount);
520 }
521}
522
523static void
524parseName(char *name, int16_t length) {
525 int16_t start=0, limit, wordLength/*, prevStart=-1*/;
526 Word *word;
527
528 while(start<length) {
529 /* skip any "noise" characters */
530 limit=skipNoise(name, start, length);
531 if(start<limit) {
532 /*prevStart=-1;*/
533 start=limit;
534 }
535 if(start==length) {
536 break;
537 }
538
539 /* get a word and add it if it is longer than 1 */
540 limit=getWord(name, start, length);
541 wordLength=(int16_t)(limit-start);
542 if(wordLength>1) {
543 word=findWord(name+start, wordLength);
544 if(word==NULL) {
545 word=addWord(name+start, wordLength);
546 }
547 countWord(word);
548 }
549
550#if 0
551 /*
552 * if there was a word before this
553 * (with no noise in between), then add the pair of words, too
554 */
555 if(prevStart!=-1) {
556 wordLength=limit-prevStart;
557 word=findWord(name+prevStart, wordLength);
558 if(word==NULL) {
559 word=addWord(name+prevStart, wordLength);
560 }
561 countWord(word);
562 }
563#endif
564
565 /*prevStart=start;*/
566 start=limit;
567 }
568}
569
570static UBool U_INLINE
571isWordChar(char c) {
572 return ('A'<=c && c<='I') || /* EBCDIC-safe check for letters */
573 ('J'<=c && c<='R') ||
574 ('S'<=c && c<='Z') ||
575
576 ('a'<=c && c<='i') || /* lowercase letters for ISO comments */
577 ('j'<=c && c<='r') ||
578 ('s'<=c && c<='z') ||
579
580 ('0'<=c && c<='9');
581}
582
583static int16_t
584skipNoise(char *line, int16_t start, int16_t limit) {
585 /* skip anything that is not part of a word in this sense */
586 while(start<limit && !isWordChar(line[start])) {
587 ++start;
588 }
589
590 return start;
591}
592
593static int16_t
594getWord(char *line, int16_t start, int16_t limit) {
595 char c=0; /* initialize to avoid a compiler warning although the code was safe */
596
597 /* a unicode character name word consists of A-Z0-9 */
598 while(start<limit && isWordChar(line[start])) {
599 ++start;
600 }
601
602 /* include a following space or dash */
603 if(start<limit && ((c=line[start])==' ' || c=='-')) {
604 ++start;
605 }
606
607 return start;
608}
609
610/* compressing -------------------------------------------------------------- */
611
612static void
613compress() {
614 uint32_t i, letterCount;
615 int16_t wordNumber;
374ca955 616 UErrorCode errorCode;
b75a7d8f
A
617
618 /* sort the words in reverse order by weight */
374ca955
A
619 errorCode=U_ZERO_ERROR;
620 uprv_sortArray(words, wordCount, sizeof(Word),
621 compareWords, NULL, FALSE, &errorCode);
b75a7d8f
A
622
623 /* remove the words that do not save anything */
624 while(wordCount>0 && words[wordCount-1].weight<1) {
625 --wordCount;
626 }
627
628 /* count the letters in the token range */
629 letterCount=0;
630 for(i=LEADBYTE_LIMIT; i<256; ++i) {
631 if(tokens[i]==-1) {
632 ++letterCount;
633 }
634 }
635 if(!beQuiet) {
374ca955 636 printf("number of letters used in the names: %d\n", (int)letterCount);
b75a7d8f
A
637 }
638
639 /* do we need double-byte tokens? */
640 if(wordCount+letterCount<=256) {
641 /* no, single-byte tokens are enough */
642 leadByteCount=0;
643 for(i=0, wordNumber=0; wordNumber<(int16_t)wordCount; ++i) {
644 if(tokens[i]!=-1) {
645 tokens[i]=wordNumber;
646 if(beVerbose) {
647 printf("tokens[0x%03x]: word%8ld \"%.*s\"\n",
374ca955 648 (int)i, (long)words[wordNumber].weight,
b75a7d8f
A
649 words[wordNumber].length, words[wordNumber].s);
650 }
651 ++wordNumber;
652 }
653 }
654 tokenCount=i;
655 } else {
656 /*
657 * The tokens that need two token bytes
658 * get their weight reduced by their count
659 * because they save less.
660 */
661 tokenCount=256-letterCount;
662 for(i=tokenCount; i<wordCount; ++i) {
663 words[i].weight-=words[i].count;
664 }
665
666 /* sort these words in reverse order by weight */
374ca955
A
667 errorCode=U_ZERO_ERROR;
668 uprv_sortArray(words+tokenCount, wordCount-tokenCount, sizeof(Word),
669 compareWords, NULL, FALSE, &errorCode);
b75a7d8f
A
670
671 /* remove the words that do not save anything */
672 while(wordCount>0 && words[wordCount-1].weight<1) {
673 --wordCount;
674 }
675
676 /* how many tokens and lead bytes do we have now? */
677 tokenCount=wordCount+letterCount+(LEADBYTE_LIMIT-1);
678 /*
679 * adjust upwards to take into account that
680 * double-byte tokens must not
681 * use NAME_SEPARATOR_CHAR as a second byte
682 */
683 tokenCount+=(tokenCount-256+254)/255;
684
685 leadByteCount=(int16_t)(tokenCount>>8);
686 if(leadByteCount<LEADBYTE_LIMIT) {
687 /* adjust for the real number of lead bytes */
688 tokenCount-=(LEADBYTE_LIMIT-1)-leadByteCount;
689 } else {
690 /* limit the number of lead bytes */
691 leadByteCount=LEADBYTE_LIMIT-1;
692 tokenCount=LEADBYTE_LIMIT*256;
693 wordCount=tokenCount-letterCount-(LEADBYTE_LIMIT-1);
694 /* adjust again to skip double-byte tokens with ';' */
695 wordCount-=(tokenCount-256+254)/255;
696 }
697
698 /* set token 0 to word 0 */
699 tokens[0]=0;
700 if(beVerbose) {
701 printf("tokens[0x000]: word%8ld \"%.*s\"\n",
702 (long)words[0].weight,
703 words[0].length, words[0].s);
704 }
705 wordNumber=1;
706
707 /* set the lead byte tokens */
708 for(i=1; (int16_t)i<=leadByteCount; ++i) {
709 tokens[i]=-2;
710 }
711
712 /* set the tokens */
713 for(; i<256; ++i) {
714 /* if store10Names then the parser set tokens[NAME_SEPARATOR_CHAR]=-1 */
715 if(tokens[i]!=-1) {
716 tokens[i]=wordNumber;
717 if(beVerbose) {
718 printf("tokens[0x%03x]: word%8ld \"%.*s\"\n",
374ca955 719 (int)i, (long)words[wordNumber].weight,
b75a7d8f
A
720 words[wordNumber].length, words[wordNumber].s);
721 }
722 ++wordNumber;
723 }
724 }
725
726 /* continue above 255 where there are no letters */
727 for(; (uint32_t)wordNumber<wordCount; ++i) {
728 if((i&0xff)==NAME_SEPARATOR_CHAR) {
729 tokens[i]=-1; /* do not use NAME_SEPARATOR_CHAR as a second token byte */
730 } else {
731 tokens[i]=wordNumber;
732 if(beVerbose) {
733 printf("tokens[0x%03x]: word%8ld \"%.*s\"\n",
374ca955 734 (int)i, (long)words[wordNumber].weight,
b75a7d8f
A
735 words[wordNumber].length, words[wordNumber].s);
736 }
737 ++wordNumber;
738 }
739 }
740 tokenCount=i; /* should be already tokenCount={i or i+1} */
741 }
742
743 if(!beQuiet) {
744 printf("number of lead bytes: %d\n", leadByteCount);
745 printf("number of single-byte tokens: %lu\n",
746 (unsigned long)256-letterCount-leadByteCount);
747 printf("number of tokens: %lu\n", (unsigned long)tokenCount);
748 }
749
750 compressLines();
751}
752
753static void
754compressLines() {
755 Line *line=NULL;
756 uint32_t i=0, inLine, outLine=0xffffffff /* (uint32_t)(-1) */,
757 groupMSB=0xffff, lineCount2;
758 int16_t groupTop=0;
759
760 /* store the groups like lines, reusing the lines' memory */
761 lineTop=0;
762 lineCount2=lineCount;
763 lineCount=0;
764
765 /* loop over all lines */
766 while(i<lineCount2) {
767 line=lines+i++;
768 inLine=line->code;
769
770 /* segment the lines to groups of 32 */
771 if(inLine>>GROUP_SHIFT!=groupMSB) {
772 /* finish the current group with empty lines */
773 while((++outLine&GROUP_MASK)!=0) {
774 appendLineLength(0);
775 }
776
777 /* store the group like a line */
778 if(groupTop>0) {
779 if(groupTop>GROUP_STORE_SIZE) {
780 fprintf(stderr, "gennames: group store overflow\n");
781 exit(U_BUFFER_OVERFLOW_ERROR);
782 }
783 addGroup(groupMSB, groupStore, groupTop);
784 if(lineTop>(uint32_t)(line->s-stringStore)) {
785 fprintf(stderr, "gennames: group store runs into string store\n");
786 exit(U_INTERNAL_PROGRAM_ERROR);
787 }
788 }
789
790 /* start the new group */
791 lineLengthsTop=0;
792 groupTop=0;
793 groupMSB=inLine>>GROUP_SHIFT;
794 outLine=(inLine&~GROUP_MASK)-1;
795 }
796
797 /* write empty lines between the previous line in the group and this one */
798 while(++outLine<inLine) {
799 appendLineLength(0);
800 }
801
802 /* write characters and tokens for this line */
803 appendLineLength(compressLine(line->s, line->length, &groupTop));
804 }
805
806 /* finish and store the last group */
807 if(line && groupMSB!=0xffff) {
808 /* finish the current group with empty lines */
809 while((++outLine&GROUP_MASK)!=0) {
810 appendLineLength(0);
811 }
812
813 /* store the group like a line */
814 if(groupTop>0) {
815 if(groupTop>GROUP_STORE_SIZE) {
816 fprintf(stderr, "gennames: group store overflow\n");
817 exit(U_BUFFER_OVERFLOW_ERROR);
818 }
819 addGroup(groupMSB, groupStore, groupTop);
820 if(lineTop>(uint32_t)(line->s-stringStore)) {
821 fprintf(stderr, "gennames: group store runs into string store\n");
822 exit(U_INTERNAL_PROGRAM_ERROR);
823 }
824 }
825 }
826
827 if(!beQuiet) {
828 printf("number of groups: %lu\n", (unsigned long)lineCount);
829 }
830}
831
832static int16_t
833compressLine(uint8_t *s, int16_t length, int16_t *pGroupTop) {
834 int16_t start, limit, token, groupTop=*pGroupTop;
835
836 start=0;
837 do {
838 /* write any "noise" characters */
839 limit=skipNoise((char *)s, start, length);
840 while(start<limit) {
841 groupStore[groupTop++]=s[start++];
842 }
843
844 if(start==length) {
845 break;
846 }
847
848 /* write a word, as token or directly */
849 limit=getWord((char *)s, start, length);
850 if(limit-start==1) {
851 groupStore[groupTop++]=s[start++];
852 } else {
853 token=findToken(s+start, (int16_t)(limit-start));
854 if(token!=-1) {
855 if(token>0xff) {
856 groupStore[groupTop++]=(uint8_t)(token>>8);
857 }
858 groupStore[groupTop++]=(uint8_t)token;
859 start=limit;
860 } else {
861 while(start<limit) {
862 groupStore[groupTop++]=s[start++];
863 }
864 }
865 }
866 } while(start<length);
867
868 length=(int16_t)(groupTop-*pGroupTop);
869 *pGroupTop=groupTop;
870 return length;
871}
872
374ca955
A
873static int32_t
874compareWords(const void *context, const void *word1, const void *word2) {
b75a7d8f
A
875 /* reverse sort by word weight */
876 return ((Word *)word2)->weight-((Word *)word1)->weight;
877}
878
879/* generate output data ----------------------------------------------------- */
880
881static void
882generateData(const char *dataDir) {
883 UNewDataMemory *pData;
884 UErrorCode errorCode=U_ZERO_ERROR;
885 uint16_t groupWords[3];
886 uint32_t i, groupTop=lineTop, offset, size,
887 tokenStringOffset, groupsOffset, groupStringOffset, algNamesOffset;
888 long dataLength;
889 int16_t token;
890
374ca955 891 pData=udata_create(dataDir, DATA_TYPE,DATA_NAME, &dataInfo,
b75a7d8f
A
892 haveCopyright ? U_COPYRIGHT_STRING : NULL, &errorCode);
893 if(U_FAILURE(errorCode)) {
894 fprintf(stderr, "gennames: unable to create data memory, error %d\n", errorCode);
895 exit(errorCode);
896 }
897
898 /* first, see how much space we need, and prepare the token strings */
899 for(i=0; i<tokenCount; ++i) {
900 token=tokens[i];
901 if(token!=-1 && token!=-2) {
902 tokens[i]=(int16_t)(addToken(words[token].s, words[token].length)-groupTop);
903 }
904 }
905
73c04bcf
A
906 /*
907 * Required padding for data swapping:
908 * The token table undergoes a permutation during data swapping when the
909 * input and output charsets are different.
910 * The token table cannot grow during swapping, so we need to make sure that
911 * the table is long enough for successful in-place permutation.
912 *
913 * We simply round up tokenCount to the next multiple of 256 to account for
914 * all possible permutations.
915 *
916 * An optimization is possible if we only ever swap between ASCII and EBCDIC:
917 *
918 * If tokenCount>256, then a semicolon (NAME_SEPARATOR_CHAR) is used
919 * and will be swapped between ASCII and EBCDIC between
920 * positions 0x3b (ASCII semicolon) and 0x5e (EBCDIC semicolon).
921 * This should be the only -1 entry in tokens[256..511] on which the data
922 * swapper bases its trail byte permutation map (trailMap[]).
923 *
924 * It would be sufficient to increase tokenCount so that its lower 8 bits
925 * are at least 0x5e+1 to make room for swapping between the two semicolons.
926 * For values higher than 0x5e, the trail byte permutation map (trailMap[])
927 * should always be an identity map, where we do not need additional room.
928 */
929 i=tokenCount;
930 tokenCount=(tokenCount+0xff)&~0xff;
931 if(!beQuiet && i<tokenCount) {
932 printf("number of tokens[] padding entries for data swapping: %lu\n", (unsigned long)(tokenCount-i));
933 }
934 for(; i<tokenCount; ++i) {
935 if((i&0xff)==NAME_SEPARATOR_CHAR) {
936 tokens[i]=-1; /* do not use NAME_SEPARATOR_CHAR as a second token byte */
937 } else {
938 tokens[i]=0; /* unused token for padding */
939 }
940 }
941
b75a7d8f
A
942 /*
943 * Calculate the total size in bytes of the data including:
944 * - the offset to the token strings, uint32_t (4)
945 * - the offset to the group table, uint32_t (4)
946 * - the offset to the group strings, uint32_t (4)
947 * - the offset to the algorithmic names, uint32_t (4)
948 *
949 * - the number of tokens, uint16_t (2)
950 * - the token table, uint16_t[tokenCount] (2*tokenCount)
951 *
952 * - the token strings, each zero-terminated (tokenSize=(lineTop-groupTop)), 2-padded
953 *
954 * - the number of groups, uint16_t (2)
955 * - the group table, { uint16_t groupMSB, uint16_t offsetHigh, uint16_t offsetLow }[6*groupCount]
956 *
957 * - the group strings (groupTop), 2-padded
958 *
959 * - the size of the data for the algorithmic names
960 */
961 tokenStringOffset=4+4+4+4+2+2*tokenCount;
962 groupsOffset=(tokenStringOffset+(lineTop-groupTop+1))&~1;
963 groupStringOffset=groupsOffset+2+6*lineCount;
964 algNamesOffset=(groupStringOffset+groupTop+3)&~3;
965
966 offset=generateAlgorithmicData(NULL);
967 size=algNamesOffset+offset;
968
969 if(!beQuiet) {
970 printf("size of the Unicode Names data:\n"
971 "total data length %lu, token strings %lu, compressed strings %lu, algorithmic names %lu\n",
972 (unsigned long)size, (unsigned long)(lineTop-groupTop),
973 (unsigned long)groupTop, (unsigned long)offset);
974 }
975
976 /* write the data to the file */
977 /* offsets */
978 udata_write32(pData, tokenStringOffset);
979 udata_write32(pData, groupsOffset);
980 udata_write32(pData, groupStringOffset);
981 udata_write32(pData, algNamesOffset);
982
983 /* token table */
984 udata_write16(pData, (uint16_t)tokenCount);
985 udata_writeBlock(pData, tokens, 2*tokenCount);
986
987 /* token strings */
988 udata_writeBlock(pData, stringStore+groupTop, lineTop-groupTop);
989 if((lineTop-groupTop)&1) {
990 /* 2-padding */
991 udata_writePadding(pData, 1);
992 }
993
994 /* group table */
995 udata_write16(pData, (uint16_t)lineCount);
996 for(i=0; i<lineCount; ++i) {
997 /* groupMSB */
998 groupWords[0]=(uint16_t)lines[i].code;
999
1000 /* offset */
1001 offset = (uint32_t)(lines[i].s - stringStore);
1002 groupWords[1]=(uint16_t)(offset>>16);
1003 groupWords[2]=(uint16_t)(offset);
1004 udata_writeBlock(pData, groupWords, 6);
1005 }
1006
1007 /* group strings */
1008 udata_writeBlock(pData, stringStore, groupTop);
1009
1010 /* 4-align the algorithmic names data */
1011 udata_writePadding(pData, algNamesOffset-(groupStringOffset+groupTop));
1012
1013 generateAlgorithmicData(pData);
1014
1015 /* finish up */
1016 dataLength=udata_finish(pData, &errorCode);
1017 if(U_FAILURE(errorCode)) {
1018 fprintf(stderr, "gennames: error %d writing the output file\n", errorCode);
1019 exit(errorCode);
1020 }
1021
1022 if(dataLength!=(long)size) {
1023 fprintf(stderr, "gennames: data length %ld != calculated size %lu\n",
1024dataLength, (unsigned long)size);
1025 exit(U_INTERNAL_PROGRAM_ERROR);
1026 }
1027}
1028
1029/* the structure for algorithmic names needs to be 4-aligned */
1030typedef struct AlgorithmicRange {
1031 uint32_t rangeStart, rangeEnd;
1032 uint8_t algorithmType, algorithmVariant;
1033 uint16_t rangeSize;
1034} AlgorithmicRange;
1035
1036static uint32_t
1037generateAlgorithmicData(UNewDataMemory *pData) {
1038 static char prefix[] = "CJK UNIFIED IDEOGRAPH-";
1039# define PREFIX_LENGTH 23
1040# define PREFIX_LENGTH_4 24
1041 uint32_t countAlgRanges;
1042
1043 static AlgorithmicRange cjkExtA={
1044 0x3400, 0x4db5,
1045 0, 4,
1046 sizeof(AlgorithmicRange)+PREFIX_LENGTH_4
1047 };
1048 static AlgorithmicRange cjk={
1049 0x4e00, 0x9fa5,
1050 0, 4,
1051 sizeof(AlgorithmicRange)+PREFIX_LENGTH_4
1052 };
1053 static AlgorithmicRange cjkExtB={
1054 0x20000, 0x2a6d6,
1055 0, 5,
1056 sizeof(AlgorithmicRange)+PREFIX_LENGTH_4
1057 };
1058
1059 static char jamo[]=
1060 "HANGUL SYLLABLE \0"
1061
1062 "G\0GG\0N\0D\0DD\0R\0M\0B\0BB\0"
1063 "S\0SS\0\0J\0JJ\0C\0K\0T\0P\0H\0"
1064
1065 "A\0AE\0YA\0YAE\0EO\0E\0YEO\0YE\0O\0"
1066 "WA\0WAE\0OE\0YO\0U\0WEO\0WE\0WI\0"
1067 "YU\0EU\0YI\0I\0"
1068
1069 "\0G\0GG\0GS\0N\0NJ\0NH\0D\0L\0LG\0LM\0"
1070 "LB\0LS\0LT\0LP\0LH\0M\0B\0BS\0"
1071 "S\0SS\0NG\0J\0C\0K\0T\0P\0H"
1072 ;
1073
1074 static AlgorithmicRange hangul={
1075 0xac00, 0xd7a3,
1076 1, 3,
1077 sizeof(AlgorithmicRange)+6+sizeof(jamo)
1078 };
1079
1080 /* modulo factors, maximum 8 */
1081 /* 3 factors: 19, 21, 28, most-to-least-significant */
1082 static uint16_t hangulFactors[3]={
1083 19, 21, 28
1084 };
1085
1086 uint32_t size;
1087
1088 size=0;
1089
73c04bcf
A
1090 if(ucdVersion>=UNI_4_1) {
1091 /* Unicode 4.1 and up has a longer CJK Unihan range than before */
1092 cjk.rangeEnd=0x9FBB;
1093 }
1094
b75a7d8f 1095 /* number of ranges of algorithmic names */
73c04bcf 1096 if(ucdVersion>=UNI_3_1) {
b75a7d8f
A
1097 /* Unicode 3.1 and up has 4 ranges including CJK Extension B */
1098 countAlgRanges=4;
73c04bcf 1099 } else if(ucdVersion>=UNI_3_0) {
b75a7d8f
A
1100 /* Unicode 3.0 has 3 ranges including CJK Extension A */
1101 countAlgRanges=3;
1102 } else {
1103 /* Unicode 2.0 has 2 ranges including Hangul and CJK Unihan */
1104 countAlgRanges=2;
1105 }
1106
1107 if(pData!=NULL) {
1108 udata_write32(pData, countAlgRanges);
1109 } else {
1110 size+=4;
1111 }
1112
1113 /*
1114 * each range:
1115 * uint32_t rangeStart
1116 * uint32_t rangeEnd
1117 * uint8_t algorithmType
1118 * uint8_t algorithmVariant
1119 * uint16_t size of range data
1120 * uint8_t[size] data
1121 */
1122
1123 /* range 0: cjk extension a */
1124 if(countAlgRanges>=3) {
1125 if(pData!=NULL) {
1126 udata_writeBlock(pData, &cjkExtA, sizeof(AlgorithmicRange));
1127 udata_writeString(pData, prefix, PREFIX_LENGTH);
1128 if(PREFIX_LENGTH<PREFIX_LENGTH_4) {
1129 udata_writePadding(pData, PREFIX_LENGTH_4-PREFIX_LENGTH);
1130 }
1131 } else {
1132 size+=sizeof(AlgorithmicRange)+PREFIX_LENGTH_4;
1133 }
1134 }
1135
1136 /* range 1: cjk */
1137 if(pData!=NULL) {
1138 udata_writeBlock(pData, &cjk, sizeof(AlgorithmicRange));
1139 udata_writeString(pData, prefix, PREFIX_LENGTH);
1140 if(PREFIX_LENGTH<PREFIX_LENGTH_4) {
1141 udata_writePadding(pData, PREFIX_LENGTH_4-PREFIX_LENGTH);
1142 }
1143 } else {
1144 size+=sizeof(AlgorithmicRange)+PREFIX_LENGTH_4;
1145 }
1146
1147 /* range 2: hangul syllables */
1148 if(pData!=NULL) {
1149 udata_writeBlock(pData, &hangul, sizeof(AlgorithmicRange));
1150 udata_writeBlock(pData, hangulFactors, 6);
1151 udata_writeString(pData, jamo, sizeof(jamo));
1152 } else {
1153 size+=sizeof(AlgorithmicRange)+6+sizeof(jamo);
1154 }
1155
1156 /* range 3: cjk extension b */
1157 if(countAlgRanges>=4) {
1158 if(pData!=NULL) {
1159 udata_writeBlock(pData, &cjkExtB, sizeof(AlgorithmicRange));
1160 udata_writeString(pData, prefix, PREFIX_LENGTH);
1161 if(PREFIX_LENGTH<PREFIX_LENGTH_4) {
1162 udata_writePadding(pData, PREFIX_LENGTH_4-PREFIX_LENGTH);
1163 }
1164 } else {
1165 size+=sizeof(AlgorithmicRange)+PREFIX_LENGTH_4;
1166 }
1167 }
1168
1169 return size;
1170}
1171
1172/* helpers ------------------------------------------------------------------ */
1173
1174static int16_t
1175findToken(uint8_t *s, int16_t length) {
1176 int16_t i, token;
1177
1178 for(i=0; i<(int16_t)tokenCount; ++i) {
1179 token=tokens[i];
73c04bcf 1180 if(token>=0 && length==words[token].length && 0==uprv_memcmp(s, words[token].s, length)) {
b75a7d8f
A
1181 return i;
1182 }
1183 }
1184
1185 return -1;
1186}
1187
1188static Word *
1189findWord(char *s, int16_t length) {
1190 uint32_t i;
1191
1192 for(i=0; i<wordCount; ++i) {
1193 if(length==words[i].length && 0==uprv_memcmp(s, words[i].s, length)) {
1194 return words+i;
1195 }
1196 }
1197
1198 return NULL;
1199}
1200
1201static Word *
1202addWord(char *s, int16_t length) {
1203 uint8_t *stringStart;
1204 Word *word;
1205
1206 if(wordCount==MAX_WORD_COUNT) {
1207 fprintf(stderr, "gennames: too many words\n");
1208 exit(U_BUFFER_OVERFLOW_ERROR);
1209 }
1210
1211 stringStart=allocWord(length);
1212 uprv_memcpy(stringStart, s, length);
1213
1214 word=words+wordCount;
1215
1216 /*
1217 * Initialize the weight with the costs for this token:
1218 * a zero-terminated string and a 16-bit offset.
1219 */
1220 word->weight=-(length+1+2);
1221 word->count=0;
1222 word->length=length;
1223 word->s=stringStart;
1224
1225 ++wordCount;
1226
1227 return word;
1228}
1229
1230static void
1231countWord(Word *word) {
1232 /* add to the weight the savings: the length of the word minus 1 byte for the token */
1233 word->weight+=word->length-1;
1234 ++word->count;
1235}
1236
1237static void
1238addLine(uint32_t code, char *names[], int16_t lengths[], int16_t count) {
1239 uint8_t *stringStart;
1240 Line *line;
1241 int16_t i, length;
1242
1243 if(lineCount==MAX_LINE_COUNT) {
1244 fprintf(stderr, "gennames: too many lines\n");
1245 exit(U_BUFFER_OVERFLOW_ERROR);
1246 }
1247
1248 /* find the last non-empty name */
1249 while(count>0 && lengths[count-1]==0) {
1250 --count;
1251 }
1252 if(count==0) {
1253 return; /* should not occur: caller should not have called */
1254 }
1255
1256 /* there will be (count-1) separator characters */
1257 i=count;
1258 length=count-1;
1259
1260 /* add lengths of strings */
1261 while(i>0) {
1262 length+=lengths[--i];
1263 }
1264
1265 /* allocate line memory */
1266 stringStart=allocLine(length);
1267
1268 /* copy all strings into the line memory */
1269 length=0; /* number of chars copied so far */
1270 for(i=0; i<count; ++i) {
1271 if(i>0) {
1272 stringStart[length++]=NAME_SEPARATOR_CHAR;
1273 }
1274 if(lengths[i]>0) {
1275 uprv_memcpy(stringStart+length, names[i], lengths[i]);
1276 length+=lengths[i];
1277 }
1278 }
1279
1280 line=lines+lineCount;
1281
1282 line->code=code;
1283 line->length=length;
1284 line->s=stringStart;
1285
1286 ++lineCount;
1287
1288 /* prevent a character value that is actually in a name from becoming a token */
1289 while(length>0) {
1290 tokens[stringStart[--length]]=-1;
1291 }
1292}
1293
1294static void
1295addGroup(uint32_t groupMSB, uint8_t *strings, int16_t length) {
1296 uint8_t *stringStart;
1297 Line *line;
1298
1299 if(lineCount==MAX_LINE_COUNT) {
1300 fprintf(stderr, "gennames: too many groups\n");
1301 exit(U_BUFFER_OVERFLOW_ERROR);
1302 }
1303
1304 /* store the line lengths first, then the strings */
1305 lineLengthsTop=(lineLengthsTop+1)/2;
1306 stringStart=allocLine(lineLengthsTop+length);
1307 uprv_memcpy(stringStart, lineLengths, lineLengthsTop);
1308 uprv_memcpy(stringStart+lineLengthsTop, strings, length);
1309
1310 line=lines+lineCount;
1311
1312 line->code=groupMSB;
1313 line->length=length;
1314 line->s=stringStart;
1315
1316 ++lineCount;
1317}
1318
1319static uint32_t
1320addToken(uint8_t *s, int16_t length) {
1321 uint8_t *stringStart;
1322
1323 stringStart=allocLine(length+1);
1324 uprv_memcpy(stringStart, s, length);
1325 stringStart[length]=0;
1326
1327 return (uint32_t)(stringStart - stringStore);
1328}
1329
1330static void
1331appendLineLength(int16_t length) {
1332 if(length>=76) {
1333 fprintf(stderr, "gennames: compressed line too long\n");
1334 exit(U_BUFFER_OVERFLOW_ERROR);
1335 }
1336 if(length>=12) {
1337 length-=12;
1338 appendLineLengthNibble((uint8_t)((length>>4)|12));
1339 }
1340 appendLineLengthNibble((uint8_t)length);
1341}
1342
1343static void
1344appendLineLengthNibble(uint8_t nibble) {
1345 if((lineLengthsTop&1)==0) {
1346 lineLengths[lineLengthsTop/2]=(uint8_t)(nibble<<4);
1347 } else {
1348 lineLengths[lineLengthsTop/2]|=nibble&0xf;
1349 }
1350 ++lineLengthsTop;
1351}
1352
1353static uint8_t *
1354allocLine(int32_t length) {
1355 uint32_t top=lineTop+length;
1356 uint8_t *p;
1357
1358 if(top>wordBottom) {
1359 fprintf(stderr, "gennames: out of memory\n");
1360 exit(U_MEMORY_ALLOCATION_ERROR);
1361 }
1362 p=stringStore+lineTop;
1363 lineTop=top;
1364 return p;
1365}
1366
1367static uint8_t *
1368allocWord(uint32_t length) {
1369 uint32_t bottom=wordBottom-length;
1370
1371 if(lineTop>bottom) {
1372 fprintf(stderr, "gennames: out of memory\n");
1373 exit(U_MEMORY_ALLOCATION_ERROR);
1374 }
1375 wordBottom=bottom;
1376 return stringStore+bottom;
1377}
1378
1379/*
1380 * Hey, Emacs, please set the following:
1381 *
1382 * Local Variables:
1383 * indent-tabs-mode: nil
1384 * End:
1385 *
1386 */