]> git.saurik.com Git - apple/icu.git/blame - icuSources/tools/gennames/gennames.c
ICU-400.42.tar.gz
[apple/icu.git] / icuSources / tools / gennames / gennames.c
CommitLineData
b75a7d8f
A
1/*
2*******************************************************************************
3*
46f4442e 4* Copyright (C) 1999-2008, International Business Machines
b75a7d8f
A
5* Corporation and others. All Rights Reserved.
6*
7*******************************************************************************
8* file name: gennames.c
9* encoding: US-ASCII
10* tab size: 8 (not used)
11* indentation:4
12*
13* created on: 1999sep30
14* created by: Markus W. Scherer
15*
16* This program reads the Unicode character database text file,
17* parses it, and extracts the character code,
18* the "modern" character name, and optionally the
19* Unicode 1.0 character name, and (starting with ICU 2.2) the ISO 10646 comment.
20* It then tokenizes and compresses the names and builds
21* compact binary tables for random-access lookup
22* in a u_charName() API function.
23*
24* unames.icu file format (after UDataInfo header etc. - see udata.c)
25* (all data is static const)
26*
27* UDataInfo fields:
28* dataFormat "unam"
29* formatVersion 1.0
30* dataVersion = Unicode version from -u or --unicode command line option, defaults to 4.1
31*
32* -- data-based names
33* uint32_t tokenStringOffset,
34* groupsOffset,
35* groupStringOffset,
36* algNamesOffset;
37*
38* uint16_t tokenCount;
39* uint16_t tokenTable[tokenCount];
40*
41* char tokenStrings[]; -- padded to even count
42*
43* -- strings (groupStrings) are tokenized as follows:
44* for each character c
45* if(c>=tokenCount) write that character c directly
46* else
47* token=tokenTable[c];
48* if(token==0xfffe) -- lead byte of double-byte token
49* token=tokenTable[c<<8|next character];
50* if(token==-1)
51* write c directly
52* else
53* tokenString=tokenStrings+token; (tokenStrings=start of names data + tokenStringOffset;)
54* append zero-terminated tokenString;
55*
56* Different strings for a code point - normal name, 1.0 name, and ISO comment -
57* are separated by ';'.
58*
59* uint16_t groupCount;
60* struct {
61* uint16_t groupMSB; -- for a group of 32 character names stored, this is code point>>5
62* uint16_t offsetHigh; -- group strings are at start of names data + groupStringsOffset + this 32 bit-offset
63* uint16_t offsetLow;
64* } groupTable[groupCount];
65*
66* char groupStrings[]; -- padded to 4-count
67*
68* -- The actual, tokenized group strings are not zero-terminated because
69* that would take up too much space.
70* Instead, they are preceded by their length, written in a variable-length sequence:
71* For each of the 32 group strings, one or two nibbles are stored for its length.
72* Nibbles (4-bit values, half-bytes) are read MSB first.
73* A nibble with a value of 0..11 directly indicates the length of the name string.
74* A nibble n with a value of 12..15 is a lead nibble and forms a value with the following nibble m
75* by (((n-12)<<4)|m)+12, reaching values of 12..75.
76* These lengths are sequentially for each tokenized string, not for the de-tokenized result.
77* For the de-tokenizing, see token description above; the strings immediately follow the
78* 32 lengths.
79*
80* -- algorithmic names
81*
82* typedef struct AlgorithmicRange {
83* uint32_t rangeStart, rangeEnd;
84* uint8_t algorithmType, algorithmVariant;
85* uint16_t rangeSize;
86* } AlgorithmicRange;
87*
88* uint32_t algRangesCount; -- number of data blocks for ranges of
89* algorithmic names (Unicode 3.0.0: 3, hardcoded in gennames)
90*
91* struct {
92* AlgorithmicRange algRange;
93* uint8_t algRangeData[]; -- padded to 4-count except in last range
94* } algRanges[algRangesCount];
95* -- not a real array because each part has a different size
96* of algRange.rangeSize (including AlgorithmicRange)
97*
98* -- algorithmic range types:
99*
100* 0 Names are formed from a string prefix that is stored in
101* the algRangeData (zero-terminated), followed by the Unicode code point
102* of the character in hexadecimal digits;
103* algRange.algorithmVariant digits are written
104*
105* 1 Names are formed by calculating modulo-factors of the code point value as follows:
106* algRange.algorithmVariant is the count of modulo factors
107* algRangeData contains
108* uint16_t factors[algRange.algorithmVariant];
109* char strings[];
110* the first zero-terminated string is written as the prefix; then:
111*
112* The rangeStart is subtracted; with the difference, here "code":
113* for(i=algRange.algorithmVariant-1 to 0 step -1)
114* index[i]=code%factor[i];
115* code/=factor[i];
116*
117* The strings after the prefix are short pieces that are then appended to the result
118* according to index[0..algRange.algorithmVariant-1].
119*/
120
121#include <stdio.h>
b75a7d8f
A
122#include "unicode/utypes.h"
123#include "unicode/putil.h"
374ca955
A
124#include "unicode/uclean.h"
125#include "unicode/udata.h"
b75a7d8f
A
126#include "cmemory.h"
127#include "cstring.h"
374ca955 128#include "uarrsort.h"
b75a7d8f
A
129#include "unewdata.h"
130#include "uoptions.h"
131#include "uparse.h"
132
46f4442e
A
/* Element count of a real array (not a pointer), as an int32_t. */
133#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
134
b75a7d8f
A
/* Sizes in bytes of the static stringStore[] and groupStore[] buffers. */
135#define STRING_STORE_SIZE 1000000
136#define GROUP_STORE_SIZE 5000
137
/* Lines are grouped by code point: 32 lines per group, group number = code>>GROUP_SHIFT. */
138#define GROUP_SHIFT 5
139#define LINES_PER_GROUP (1UL<<GROUP_SHIFT)
140#define GROUP_MASK (LINES_PER_GROUP-1)
141
/* Capacities of lines[] and words[]. NOTE(review): MAX_GROUP_COUNT is not referenced in the visible part of this file. */
142#define MAX_LINE_COUNT 50000
143#define MAX_WORD_COUNT 20000
144#define MAX_GROUP_COUNT 5000
145
/* Output file name/type for the names data; NAME_SEPARATOR_CHAR separates the strings stored for one code point. */
146#define DATA_NAME "unames"
147#define DATA_TYPE "icu"
148#define VERSION_STRING "unam"
149#define NAME_SEPARATOR_CHAR ';'
150
46f4442e
A
/* Output file name used instead of DATA_NAME when regular names are not stored (see generateData()). */
151#define ISO_DATA_NAME "ucomment"
152
73c04bcf
A
153/* Unicode versions --------------------------------------------------------- */
154
/*
 * Indexes of the known Unicode versions, in ascending version order.
 * The enum constants must stay parallel to the rows of unicodeVersions[] below;
 * UNI_VER_COUNT is the number of rows.
 */
155enum {
156 UNI_1_0,
157 UNI_1_1,
158 UNI_2_0,
159 UNI_3_0,
160 UNI_3_1,
161 UNI_3_2,
162 UNI_4_0,
163 UNI_4_0_1,
164 UNI_4_1,
46f4442e
A
165 UNI_5_0,
166 UNI_5_1,
73c04bcf
A
167 UNI_VER_COUNT
168};
169
/* Version numbers for the enum constants above; searched by findUnicodeVersion(). */
b75a7d8f 170static const UVersionInfo
73c04bcf
A
171unicodeVersions[]={
172 { 1, 0, 0, 0 },
173 { 1, 1, 0, 0 },
174 { 2, 0, 0, 0 },
175 { 3, 0, 0, 0 },
176 { 3, 1, 0, 0 },
177 { 3, 2, 0, 0 },
178 { 4, 0, 0, 0 },
179 { 4, 0, 1, 0 },
46f4442e
A
180 { 4, 1, 0, 0 },
181 { 5, 0, 0, 0 },
182 { 5, 1, 0, 0 }
73c04bcf
A
183};
184
/* Index of the Unicode version being processed; main() replaces this initial value with the result of findUnicodeVersion(). */
46f4442e 185static int32_t ucdVersion=UNI_5_1;
73c04bcf
A
186
187static int32_t
188findUnicodeVersion(const UVersionInfo version) {
189 int32_t i;
190
191 for(i=0; /* while(version>unicodeVersions[i]) {} */
192 i<UNI_VER_COUNT && uprv_memcmp(version, unicodeVersions[i], 4)>0;
193 ++i) {}
194 if(0<i && i<UNI_VER_COUNT && uprv_memcmp(version, unicodeVersions[i], 4)<0) {
195 --i; /* fix 4.0.2 to land before 4.1, for valid x>=ucdVersion comparisons */
196 }
197 return i; /* version>=unicodeVersions[i] && version<unicodeVersions[i+1]; possible: i==UNI_VER_COUNT */
198}
199
200/* generator data ----------------------------------------------------------- */
b75a7d8f
A
201
202/* UDataInfo cf. udata.h */
/*
 * Header description for the generated data file, passed to udata_create() in generateData().
 * The dataVersion given here is only an initial value: main() copies the version
 * from the -u/--unicode option over it.
 */
203static UDataInfo dataInfo={
204 sizeof(UDataInfo),
205 0,
206
207 U_IS_BIG_ENDIAN,
208 U_CHARSET_FAMILY,
209 sizeof(UChar),
210 0,
211
212 {0x75, 0x6e, 0x61, 0x6d}, /* dataFormat="unam" */
213 {1, 0, 0, 0}, /* formatVersion */
214 {3, 0, 0, 0} /* dataVersion */
215};
216
/* Output control flags; all three are overwritten in main() from the command line options. */
217static UBool beVerbose=FALSE, beQuiet=FALSE, haveCopyright=TRUE;
218
46f4442e
A
/* Selects which of the per-character strings are stored; passed as the context to lineFn(). */
219typedef struct Options {
220 UBool storeNames; /* regular character names (UnicodeData.txt field 1) */
221 UBool store10Names; /* Unicode 1.0 names (field 10) */
222 UBool storeISOComments; /* ISO 10646 comments (field 11) */
223} Options;
224
b75a7d8f
A
/*
 * Static byte buffers:
 * - stringStore: generateData() writes stringStore[groupBottom..groupTop) as the group strings
 *   and stringStore[groupTop..lineTop) as the token strings
 * - groupStore: filled by compressLine() with the compressed bytes of the group being built
 * - lineLengths: one scratch area per group; presumably holds the length nibbles written via
 *   appendLineLength() -- TODO confirm, that function is not visible here
 */
225static uint8_t stringStore[STRING_STORE_SIZE],
226 groupStore[GROUP_STORE_SIZE],
227 lineLengths[LINES_PER_GROUP];
228
/*
 * Indexes into the buffers above. compressLines() sets groupBottom=lineTop and resets
 * lineLengthsTop to 0 at the start of each group.
 * NOTE(review): wordBottom starts at the end of stringStore, which suggests words are
 * allocated downwards from the top by allocWord() -- confirm against that function.
 */
46f4442e 229static uint32_t lineTop=0, groupBottom, wordBottom=STRING_STORE_SIZE, lineLengthsTop;
b75a7d8f
A
230
/*
 * One stored line: the code point, and the length and start of its string.
 * generateData() also reads group entries from lines[], where code is written out as the groupMSB.
 */
231typedef struct {
232 uint32_t code;
233 int16_t length;
234 uint8_t *s;
235} Line;
236
/* One dictionary word found in the names; words are sorted by weight in compress(). */
237typedef struct {
238 int32_t weight; /* -(cost for token) + (number of occurrences) * (length-1) */
239 int16_t count; /* subtracted from weight in compress() for words that need a double-byte token */
240 int16_t length; /* length of s; s is printed with "%.*s", so it is not relied on to be zero-terminated */
241 uint8_t *s;
242} Word;
243
/* All parsed lines and all dictionary words, with their current counts. */
244static Line lines[MAX_LINE_COUNT];
245static Word words[MAX_WORD_COUNT];
246
247static uint32_t lineCount=0, wordCount=0;
248
/* Number of byte values reserved as lead bytes of double-byte tokens; set in compress(). */
249static int16_t leadByteCount;
250
/* compress() uses at most LEADBYTE_LIMIT-1 lead bytes, so at most LEADBYTE_LIMIT*256 token slots. */
251#define LEADBYTE_LIMIT 16
252
/*
 * Token table, indexed by a single byte or by (lead byte<<8 | second byte).
 * As set in compress(): -1 = not a token (the byte is written directly), -2 = lead byte,
 * otherwise the index of a word in words[]. generateData() later replaces the word
 * indexes with token string offsets.
 */
253static int16_t tokens[LEADBYTE_LIMIT*256];
254static uint32_t tokenCount;
255
256/* prototypes --------------------------------------------------------------- */
257
258static void
259init(void);
260
261static void
46f4442e 262parseDB(const char *filename, Options *options);
b75a7d8f
A
263
264static void
265parseName(char *name, int16_t length);
266
267static int16_t
268skipNoise(char *line, int16_t start, int16_t limit);
269
270static int16_t
271getWord(char *line, int16_t start, int16_t limit);
272
273static void
274compress(void);
275
276static void
277compressLines(void);
278
279static int16_t
280compressLine(uint8_t *s, int16_t length, int16_t *pGroupTop);
281
374ca955
A
282static int32_t
283compareWords(const void *context, const void *word1, const void *word2);
b75a7d8f
A
284
285static void
46f4442e 286generateData(const char *dataDir, Options *options);
b75a7d8f
A
287
288static uint32_t
46f4442e 289generateAlgorithmicData(UNewDataMemory *pData, Options *options);
b75a7d8f
A
290
291static int16_t
292findToken(uint8_t *s, int16_t length);
293
294static Word *
295findWord(char *s, int16_t length);
296
297static Word *
298addWord(char *s, int16_t length);
299
300static void
301countWord(Word *word);
302
303static void
304addLine(uint32_t code, char *names[], int16_t lengths[], int16_t count);
305
306static void
307addGroup(uint32_t groupMSB, uint8_t *strings, int16_t length);
308
309static uint32_t
310addToken(uint8_t *s, int16_t length);
311
312static void
313appendLineLength(int16_t length);
314
315static void
316appendLineLengthNibble(uint8_t nibble);
317
318static uint8_t *
319allocLine(int32_t length);
320
321static uint8_t *
322allocWord(uint32_t length);
323
324/* -------------------------------------------------------------------------- */
325
46f4442e
A
/*
 * Indexes into options[] below. The order of these constants must match
 * the order of the rows in options[].
 */
326enum {
327 HELP_H,
328 HELP_QUESTION_MARK,
329 VERBOSE,
330 QUIET,
331 COPYRIGHT,
332 DESTDIR,
333 UNICODE,
334 UNICODE1_NAMES,
335 NO_ISO_COMMENTS,
336 ONLY_ISO_COMMENTS
337};
338
b75a7d8f
A
/*
 * Command line options, parsed by u_parseArgs() in main().
 * The last two entries share the short option character '\1'; the usage text
 * in main() lists them with long names only.
 */
339static UOption options[]={
340 UOPTION_HELP_H,
341 UOPTION_HELP_QUESTION_MARK,
342 UOPTION_VERBOSE,
343 UOPTION_QUIET,
344 UOPTION_COPYRIGHT,
345 UOPTION_DESTDIR,
346 { "unicode", NULL, NULL, NULL, 'u', UOPT_REQUIRES_ARG, 0 },
46f4442e
A
347 { "unicode1-names", NULL, NULL, NULL, '1', UOPT_NO_ARG, 0 },
348 { "no-iso-comments", NULL, NULL, NULL, '\1', UOPT_NO_ARG, 0 },
349 { "only-iso-comments", NULL, NULL, NULL, '\1', UOPT_NO_ARG, 0 }
b75a7d8f
A
350};
351
352extern int
353main(int argc, char* argv[]) {
354 UVersionInfo version;
46f4442e 355 Options moreOptions={ TRUE, FALSE, TRUE };
374ca955 356 UErrorCode errorCode = U_ZERO_ERROR;
b75a7d8f
A
357
358 U_MAIN_INIT_ARGS(argc, argv);
359
374ca955
A
360 /* Initialize ICU */
361 u_init(&errorCode);
362 if (U_FAILURE(errorCode) && errorCode != U_FILE_ACCESS_ERROR) {
363 /* Note: u_init() will try to open ICU property data.
364 * failures here are expected when building ICU from scratch.
365 * ignore them.
366 */
367 fprintf(stderr, "%s: can not initialize ICU. errorCode = %s\n",
368 argv[0], u_errorName(errorCode));
369 exit(1);
370 }
371
b75a7d8f 372 /* preset then read command line options */
46f4442e
A
373 options[DESTDIR].value=u_getDataDirectory();
374 options[UNICODE].value="4.1";
375 argc=u_parseArgs(argc, argv, LENGTHOF(options), options);
b75a7d8f
A
376
377 /* error handling, printing usage message */
378 if(argc<0) {
379 fprintf(stderr,
380 "error in command line argument \"%s\"\n",
381 argv[-argc]);
382 } else if(argc<2) {
383 argc=-1;
384 }
46f4442e 385 if(argc<0 || options[HELP_H].doesOccur || options[HELP_QUESTION_MARK].doesOccur) {
b75a7d8f
A
386 /*
387 * Broken into chucks because the C89 standard says the minimum
388 * required supported string length is 509 bytes.
389 */
390 fprintf(stderr,
391 "Usage: %s [-1[+|-]] [-v[+|-]] [-c[+|-]] filename\n"
392 "\n"
393 "Read the UnicodeData.txt file and \n"
374ca955 394 "create a binary file " DATA_NAME "." DATA_TYPE " with the character names\n"
b75a7d8f
A
395 "\n"
396 "\tfilename absolute path/filename for the Unicode database text file\n"
397 "\t\t(default: standard input)\n"
398 "\n",
399 argv[0]);
400 fprintf(stderr,
401 "Options:\n"
402 "\t-h or -? or --help this usage text\n"
403 "\t-v or --verbose verbose output\n"
404 "\t-q or --quiet no output\n"
405 "\t-c or --copyright include a copyright notice\n"
406 "\t-d or --destdir destination directory, followed by the path\n"
46f4442e
A
407 "\t-u or --unicode Unicode version, followed by the version like 3.0.0\n");
408 fprintf(stderr,
409 "\t-1 or --unicode1-names store Unicode 1.0 character names\n"
410 "\t --no-iso-comments do not store ISO comments\n"
411 "\t --only-iso-comments write ucomment.icu with only ISO comments\n");
b75a7d8f
A
412 return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR;
413 }
414
415 /* get the options values */
46f4442e
A
416 beVerbose=options[VERBOSE].doesOccur;
417 beQuiet=options[QUIET].doesOccur;
418 haveCopyright=options[COPYRIGHT].doesOccur;
419 moreOptions.store10Names=options[UNICODE1_NAMES].doesOccur;
420 moreOptions.storeISOComments=!options[NO_ISO_COMMENTS].doesOccur;
421 if(options[ONLY_ISO_COMMENTS].doesOccur) {
422 moreOptions.storeNames=moreOptions.store10Names=FALSE;
423 moreOptions.storeISOComments=TRUE;
424 }
b75a7d8f
A
425
426 /* set the Unicode version */
46f4442e 427 u_versionFromString(version, options[UNICODE].value);
b75a7d8f 428 uprv_memcpy(dataInfo.dataVersion, version, 4);
73c04bcf 429 ucdVersion=findUnicodeVersion(version);
b75a7d8f
A
430
431 init();
46f4442e 432 parseDB(argc>=2 ? argv[1] : "-", &moreOptions);
b75a7d8f 433 compress();
46f4442e 434 generateData(options[DESTDIR].value, &moreOptions);
b75a7d8f 435
374ca955 436 u_cleanup();
b75a7d8f
A
437 return 0;
438}
439
440static void
441init() {
442 int i;
443
444 for(i=0; i<256; ++i) {
445 tokens[i]=0;
446 }
447}
448
449/* parsing ------------------------------------------------------------------ */
450
374ca955
A
451/* get a name, strip leading and trailing whitespace */
452static int16_t
453getName(char **pStart, char *limit) {
454 /* strip leading whitespace */
455 char *start=(char *)u_skipWhitespace(*pStart);
456
457 /* strip trailing whitespace */
458 while(start<limit && (*(limit-1)==' ' || *(limit-1)=='\t')) {
459 --limit;
460 }
461
462 /* return results */
463 *pStart=start;
464 return (int16_t)(limit-start);
465}
466
b75a7d8f
A
467static void U_CALLCONV
468lineFn(void *context,
469 char *fields[][2], int32_t fieldCount,
470 UErrorCode *pErrorCode) {
46f4442e 471 Options *storeOptions=(Options *)context;
b75a7d8f 472 char *names[3];
46f4442e 473 int16_t lengths[3]={ 0, 0, 0 };
b75a7d8f
A
474 static uint32_t prevCode=0;
475 uint32_t code=0;
476
477 if(U_FAILURE(*pErrorCode)) {
478 return;
479 }
480 /* get the character code */
481 code=uprv_strtoul(fields[0][0], NULL, 16);
482
483 /* get the character name */
46f4442e
A
484 if(storeOptions->storeNames) {
485 names[0]=fields[1][0];
486 lengths[0]=getName(names+0, fields[1][1]);
487 if(names[0][0]=='<') {
488 /* do not store pseudo-names in <> brackets */
489 lengths[0]=0;
490 }
b75a7d8f
A
491 }
492
493 /* store 1.0 names */
494 /* get the second character name, the one from Unicode 1.0 */
46f4442e
A
495 if(storeOptions->store10Names) {
496 names[1]=fields[10][0];
497 lengths[1]=getName(names+1, fields[10][1]);
498 if(names[1][0]=='<') {
499 /* do not store pseudo-names in <> brackets */
500 lengths[1]=0;
501 }
b75a7d8f
A
502 }
503
504 /* get the ISO 10646 comment */
46f4442e
A
505 if(storeOptions->storeISOComments) {
506 names[2]=fields[11][0];
507 lengths[2]=getName(names+2, fields[11][1]);
508 }
b75a7d8f
A
509
510 if(lengths[0]+lengths[1]+lengths[2]==0) {
511 return;
512 }
513
514 /* check for non-character code points */
46f4442e 515 if(!U_IS_UNICODE_CHAR(code)) {
b75a7d8f
A
516 fprintf(stderr, "gennames: error - properties for non-character code point U+%04lx\n",
517 (unsigned long)code);
518 *pErrorCode=U_PARSE_ERROR;
519 exit(U_PARSE_ERROR);
520 }
521
522 /* check that the code points (code) are in ascending order */
523 if(code<=prevCode && code>0) {
524 fprintf(stderr, "gennames: error - UnicodeData entries out of order, U+%04lx after U+%04lx\n",
525 (unsigned long)code, (unsigned long)prevCode);
526 *pErrorCode=U_PARSE_ERROR;
527 exit(U_PARSE_ERROR);
528 }
529 prevCode=code;
530
531 parseName(names[0], lengths[0]);
532 parseName(names[1], lengths[1]);
533 parseName(names[2], lengths[2]);
534
535 /*
536 * set the count argument to
46f4442e 537 * 1: only store regular names, or only store ISO 10646 comments
b75a7d8f
A
538 * 2: store regular and 1.0 names
539 * 3: store names and ISO 10646 comment
46f4442e
A
540 *
541 * addLine() will ignore empty trailing names
b75a7d8f 542 */
46f4442e
A
543 if(storeOptions->storeNames) {
544 /* store names and comments as parsed according to storeOptions */
545 addLine(code, names, lengths, 3);
546 } else {
547 /* store only ISO 10646 comments */
548 addLine(code, names+2, lengths+2, 1);
549 }
b75a7d8f
A
550}
551
552static void
46f4442e 553parseDB(const char *filename, Options *storeOptions) {
b75a7d8f
A
554 char *fields[15][2];
555 UErrorCode errorCode=U_ZERO_ERROR;
556
46f4442e 557 u_parseDelimitedFile(filename, ';', fields, 15, lineFn, storeOptions, &errorCode);
b75a7d8f
A
558 if(U_FAILURE(errorCode)) {
559 fprintf(stderr, "gennames parse error: %s\n", u_errorName(errorCode));
560 exit(errorCode);
561 }
562
563 if(!beQuiet) {
564 printf("size of all names in the database: %lu\n",
565 (unsigned long)lineTop);
566 printf("number of named Unicode characters: %lu\n",
567 (unsigned long)lineCount);
568 printf("number of words in the dictionary from these names: %lu\n",
569 (unsigned long)wordCount);
570 }
571}
572
573static void
574parseName(char *name, int16_t length) {
575 int16_t start=0, limit, wordLength/*, prevStart=-1*/;
576 Word *word;
577
578 while(start<length) {
579 /* skip any "noise" characters */
580 limit=skipNoise(name, start, length);
581 if(start<limit) {
582 /*prevStart=-1;*/
583 start=limit;
584 }
585 if(start==length) {
586 break;
587 }
588
589 /* get a word and add it if it is longer than 1 */
590 limit=getWord(name, start, length);
591 wordLength=(int16_t)(limit-start);
592 if(wordLength>1) {
593 word=findWord(name+start, wordLength);
594 if(word==NULL) {
595 word=addWord(name+start, wordLength);
596 }
597 countWord(word);
598 }
599
600#if 0
601 /*
602 * if there was a word before this
603 * (with no noise in between), then add the pair of words, too
604 */
605 if(prevStart!=-1) {
606 wordLength=limit-prevStart;
607 word=findWord(name+prevStart, wordLength);
608 if(word==NULL) {
609 word=addWord(name+prevStart, wordLength);
610 }
611 countWord(word);
612 }
613#endif
614
615 /*prevStart=start;*/
616 start=limit;
617 }
618}
619
620static UBool U_INLINE
621isWordChar(char c) {
622 return ('A'<=c && c<='I') || /* EBCDIC-safe check for letters */
623 ('J'<=c && c<='R') ||
624 ('S'<=c && c<='Z') ||
625
626 ('a'<=c && c<='i') || /* lowercase letters for ISO comments */
627 ('j'<=c && c<='r') ||
628 ('s'<=c && c<='z') ||
629
630 ('0'<=c && c<='9');
631}
632
633static int16_t
634skipNoise(char *line, int16_t start, int16_t limit) {
635 /* skip anything that is not part of a word in this sense */
636 while(start<limit && !isWordChar(line[start])) {
637 ++start;
638 }
639
640 return start;
641}
642
643static int16_t
644getWord(char *line, int16_t start, int16_t limit) {
645 char c=0; /* initialize to avoid a compiler warning although the code was safe */
646
647 /* a unicode character name word consists of A-Z0-9 */
648 while(start<limit && isWordChar(line[start])) {
649 ++start;
650 }
651
652 /* include a following space or dash */
653 if(start<limit && ((c=line[start])==' ' || c=='-')) {
654 ++start;
655 }
656
657 return start;
658}
659
660/* compressing -------------------------------------------------------------- */
661
/*
 * Assign token numbers to the most valuable words, then compress all lines.
 *
 * Steps:
 * 1. Sort words[] by descending weight and drop words with weight<1.
 * 2. Count the byte values in LEADBYTE_LIMIT..255 that are marked -1 in tokens[]
 *    (characters that are written directly and cannot serve as tokens).
 * 3. If all words fit into the remaining single-byte values, assign single-byte tokens.
 *    Otherwise reduce the weight of the words that need two bytes, sort and trim
 *    them again, reserve lead bytes (marked -2), and assign single- and double-byte
 *    tokens, never using NAME_SEPARATOR_CHAR as a second byte.
 * 4. Call compressLines().
 *
 * NOTE(review): the UErrorCode set by uprv_sortArray() is never examined here.
 */
662static void
663compress() {
664 uint32_t i, letterCount;
665 int16_t wordNumber;
374ca955 666 UErrorCode errorCode;
b75a7d8f
A
667
668 /* sort the words in reverse order by weight */
374ca955
A
669 errorCode=U_ZERO_ERROR;
670 uprv_sortArray(words, wordCount, sizeof(Word),
671 compareWords, NULL, FALSE, &errorCode);
b75a7d8f
A
672
673 /* remove the words that do not save anything */
674 while(wordCount>0 && words[wordCount-1].weight<1) {
675 --wordCount;
676 }
677
678 /* count the letters in the token range */
679 letterCount=0;
680 for(i=LEADBYTE_LIMIT; i<256; ++i) {
681 if(tokens[i]==-1) {
682 ++letterCount;
683 }
684 }
685 if(!beQuiet) {
374ca955 686 printf("number of letters used in the names: %d\n", (int)letterCount);
b75a7d8f
A
687 }
688
689 /* do we need double-byte tokens? */
690 if(wordCount+letterCount<=256) {
691 /* no, single-byte tokens are enough */
692 leadByteCount=0;
693 for(i=0, wordNumber=0; wordNumber<(int16_t)wordCount; ++i) {
694 if(tokens[i]!=-1) {
695 tokens[i]=wordNumber;
696 if(beVerbose) {
697 printf("tokens[0x%03x]: word%8ld \"%.*s\"\n",
374ca955 698 (int)i, (long)words[wordNumber].weight,
b75a7d8f
A
699 words[wordNumber].length, words[wordNumber].s);
700 }
701 ++wordNumber;
702 }
703 }
704 tokenCount=i;
705 } else {
706 /*
707 * The tokens that need two token bytes
708 * get their weight reduced by their count
709 * because they save less.
710 */
711 tokenCount=256-letterCount;
712 for(i=tokenCount; i<wordCount; ++i) {
713 words[i].weight-=words[i].count;
714 }
715
716 /* sort these words in reverse order by weight */
374ca955
A
717 errorCode=U_ZERO_ERROR;
718 uprv_sortArray(words+tokenCount, wordCount-tokenCount, sizeof(Word),
719 compareWords, NULL, FALSE, &errorCode);
b75a7d8f
A
720
721 /* remove the words that do not save anything */
722 while(wordCount>0 && words[wordCount-1].weight<1) {
723 --wordCount;
724 }
725
726 /* how many tokens and lead bytes do we have now? */
727 tokenCount=wordCount+letterCount+(LEADBYTE_LIMIT-1);
728 /*
729 * adjust upwards to take into account that
730 * double-byte tokens must not
731 * use NAME_SEPARATOR_CHAR as a second byte
732 */
733 tokenCount+=(tokenCount-256+254)/255;
734
735 leadByteCount=(int16_t)(tokenCount>>8);
736 if(leadByteCount<LEADBYTE_LIMIT) {
737 /* adjust for the real number of lead bytes */
738 tokenCount-=(LEADBYTE_LIMIT-1)-leadByteCount;
739 } else {
740 /* limit the number of lead bytes */
741 leadByteCount=LEADBYTE_LIMIT-1;
742 tokenCount=LEADBYTE_LIMIT*256;
743 wordCount=tokenCount-letterCount-(LEADBYTE_LIMIT-1);
744 /* adjust again to skip double-byte tokens with ';' */
745 wordCount-=(tokenCount-256+254)/255;
746 }
747
748 /* set token 0 to word 0 */
749 tokens[0]=0;
750 if(beVerbose) {
751 printf("tokens[0x000]: word%8ld \"%.*s\"\n",
752 (long)words[0].weight,
753 words[0].length, words[0].s);
754 }
755 wordNumber=1;
756
757 /* set the lead byte tokens */
758 for(i=1; (int16_t)i<=leadByteCount; ++i) {
759 tokens[i]=-2;
760 }
761
762 /* set the tokens */
763 for(; i<256; ++i) {
764 /* if store10Names then the parser set tokens[NAME_SEPARATOR_CHAR]=-1 */
765 if(tokens[i]!=-1) {
766 tokens[i]=wordNumber;
767 if(beVerbose) {
768 printf("tokens[0x%03x]: word%8ld \"%.*s\"\n",
374ca955 769 (int)i, (long)words[wordNumber].weight,
b75a7d8f
A
770 words[wordNumber].length, words[wordNumber].s);
771 }
772 ++wordNumber;
773 }
774 }
775
776 /* continue above 255 where there are no letters */
777 for(; (uint32_t)wordNumber<wordCount; ++i) {
778 if((i&0xff)==NAME_SEPARATOR_CHAR) {
779 tokens[i]=-1; /* do not use NAME_SEPARATOR_CHAR as a second token byte */
780 } else {
781 tokens[i]=wordNumber;
782 if(beVerbose) {
783 printf("tokens[0x%03x]: word%8ld \"%.*s\"\n",
374ca955 784 (int)i, (long)words[wordNumber].weight,
b75a7d8f
A
785 words[wordNumber].length, words[wordNumber].s);
786 }
787 ++wordNumber;
788 }
789 }
790 tokenCount=i; /* should be already tokenCount={i or i+1} */
791 }
792
793 if(!beQuiet) {
794 printf("number of lead bytes: %d\n", leadByteCount);
795 printf("number of single-byte tokens: %lu\n",
796 (unsigned long)256-letterCount-leadByteCount);
797 printf("number of tokens: %lu\n", (unsigned long)tokenCount);
798 }
799
800 compressLines();
801}
802
803static void
804compressLines() {
805 Line *line=NULL;
806 uint32_t i=0, inLine, outLine=0xffffffff /* (uint32_t)(-1) */,
807 groupMSB=0xffff, lineCount2;
808 int16_t groupTop=0;
809
46f4442e
A
810 /* store the groups like lines, with compressed data after raw strings */
811 groupBottom=lineTop;
b75a7d8f
A
812 lineCount2=lineCount;
813 lineCount=0;
814
815 /* loop over all lines */
816 while(i<lineCount2) {
817 line=lines+i++;
818 inLine=line->code;
819
820 /* segment the lines to groups of 32 */
821 if(inLine>>GROUP_SHIFT!=groupMSB) {
822 /* finish the current group with empty lines */
823 while((++outLine&GROUP_MASK)!=0) {
824 appendLineLength(0);
825 }
826
827 /* store the group like a line */
828 if(groupTop>0) {
829 if(groupTop>GROUP_STORE_SIZE) {
830 fprintf(stderr, "gennames: group store overflow\n");
831 exit(U_BUFFER_OVERFLOW_ERROR);
832 }
833 addGroup(groupMSB, groupStore, groupTop);
b75a7d8f
A
834 }
835
836 /* start the new group */
837 lineLengthsTop=0;
838 groupTop=0;
839 groupMSB=inLine>>GROUP_SHIFT;
840 outLine=(inLine&~GROUP_MASK)-1;
841 }
842
843 /* write empty lines between the previous line in the group and this one */
844 while(++outLine<inLine) {
845 appendLineLength(0);
846 }
847
848 /* write characters and tokens for this line */
849 appendLineLength(compressLine(line->s, line->length, &groupTop));
850 }
851
852 /* finish and store the last group */
853 if(line && groupMSB!=0xffff) {
854 /* finish the current group with empty lines */
855 while((++outLine&GROUP_MASK)!=0) {
856 appendLineLength(0);
857 }
858
859 /* store the group like a line */
860 if(groupTop>0) {
861 if(groupTop>GROUP_STORE_SIZE) {
862 fprintf(stderr, "gennames: group store overflow\n");
863 exit(U_BUFFER_OVERFLOW_ERROR);
864 }
865 addGroup(groupMSB, groupStore, groupTop);
b75a7d8f
A
866 }
867 }
868
869 if(!beQuiet) {
870 printf("number of groups: %lu\n", (unsigned long)lineCount);
871 }
872}
873
874static int16_t
875compressLine(uint8_t *s, int16_t length, int16_t *pGroupTop) {
876 int16_t start, limit, token, groupTop=*pGroupTop;
877
878 start=0;
879 do {
880 /* write any "noise" characters */
881 limit=skipNoise((char *)s, start, length);
882 while(start<limit) {
883 groupStore[groupTop++]=s[start++];
884 }
885
886 if(start==length) {
887 break;
888 }
889
890 /* write a word, as token or directly */
891 limit=getWord((char *)s, start, length);
892 if(limit-start==1) {
893 groupStore[groupTop++]=s[start++];
894 } else {
895 token=findToken(s+start, (int16_t)(limit-start));
896 if(token!=-1) {
897 if(token>0xff) {
898 groupStore[groupTop++]=(uint8_t)(token>>8);
899 }
900 groupStore[groupTop++]=(uint8_t)token;
901 start=limit;
902 } else {
903 while(start<limit) {
904 groupStore[groupTop++]=s[start++];
905 }
906 }
907 }
908 } while(start<length);
909
910 length=(int16_t)(groupTop-*pGroupTop);
911 *pGroupTop=groupTop;
912 return length;
913}
914
374ca955
A
915static int32_t
916compareWords(const void *context, const void *word1, const void *word2) {
b75a7d8f
A
917 /* reverse sort by word weight */
918 return ((Word *)word2)->weight-((Word *)word1)->weight;
919}
920
921/* generate output data ----------------------------------------------------- */
922
/*
 * Write the binary data file into dataDir.
 * The file is named DATA_NAME if regular names are stored, otherwise ISO_DATA_NAME.
 *
 * The token strings are first added to stringStore[] (tokens[] entries change from
 * word indexes to string offsets), the token table is padded to a multiple of 256
 * entries for data swapping, and the section offsets are computed. Then the
 * offsets, token table, token strings, group table, group strings and algorithmic
 * names are written in that order.
 *
 * Exits with an error code if the file cannot be created or written, or if the
 * number of bytes written differs from the calculated size.
 */
923static void
46f4442e 924generateData(const char *dataDir, Options *storeOptions) {
b75a7d8f
A
925 UNewDataMemory *pData;
926 UErrorCode errorCode=U_ZERO_ERROR;
927 uint16_t groupWords[3];
928 uint32_t i, groupTop=lineTop, offset, size,
929 tokenStringOffset, groupsOffset, groupStringOffset, algNamesOffset;
930 long dataLength;
931 int16_t token;
932
46f4442e
A
933 pData=udata_create(dataDir,
934 DATA_TYPE, storeOptions->storeNames ? DATA_NAME : ISO_DATA_NAME,
935 &dataInfo,
b75a7d8f
A
936 haveCopyright ? U_COPYRIGHT_STRING : NULL, &errorCode);
937 if(U_FAILURE(errorCode)) {
938 fprintf(stderr, "gennames: unable to create data memory, error %d\n", errorCode);
939 exit(errorCode);
940 }
941
942 /* first, see how much space we need, and prepare the token strings */
943 for(i=0; i<tokenCount; ++i) {
944 token=tokens[i];
945 if(token!=-1 && token!=-2) {
946 tokens[i]=(int16_t)(addToken(words[token].s, words[token].length)-groupTop);
947 }
948 }
949
73c04bcf
A
950 /*
951 * Required padding for data swapping:
952 * The token table undergoes a permutation during data swapping when the
953 * input and output charsets are different.
954 * The token table cannot grow during swapping, so we need to make sure that
955 * the table is long enough for successful in-place permutation.
956 *
957 * We simply round up tokenCount to the next multiple of 256 to account for
958 * all possible permutations.
959 *
960 * An optimization is possible if we only ever swap between ASCII and EBCDIC:
961 *
962 * If tokenCount>256, then a semicolon (NAME_SEPARATOR_CHAR) is used
963 * and will be swapped between ASCII and EBCDIC between
964 * positions 0x3b (ASCII semicolon) and 0x5e (EBCDIC semicolon).
965 * This should be the only -1 entry in tokens[256..511] on which the data
966 * swapper bases its trail byte permutation map (trailMap[]).
967 *
968 * It would be sufficient to increase tokenCount so that its lower 8 bits
969 * are at least 0x5e+1 to make room for swapping between the two semicolons.
970 * For values higher than 0x5e, the trail byte permutation map (trailMap[])
971 * should always be an identity map, where we do not need additional room.
972 */
973 i=tokenCount;
974 tokenCount=(tokenCount+0xff)&~0xff;
975 if(!beQuiet && i<tokenCount) {
976 printf("number of tokens[] padding entries for data swapping: %lu\n", (unsigned long)(tokenCount-i));
977 }
978 for(; i<tokenCount; ++i) {
979 if((i&0xff)==NAME_SEPARATOR_CHAR) {
980 tokens[i]=-1; /* do not use NAME_SEPARATOR_CHAR as a second token byte */
981 } else {
982 tokens[i]=0; /* unused token for padding */
983 }
984 }
985
b75a7d8f
A
986 /*
987 * Calculate the total size in bytes of the data including:
988 * - the offset to the token strings, uint32_t (4)
989 * - the offset to the group table, uint32_t (4)
990 * - the offset to the group strings, uint32_t (4)
991 * - the offset to the algorithmic names, uint32_t (4)
992 *
993 * - the number of tokens, uint16_t (2)
994 * - the token table, uint16_t[tokenCount] (2*tokenCount)
995 *
996 * - the token strings, each zero-terminated (tokenSize=(lineTop-groupTop)), 2-padded
997 *
998 * - the number of groups, uint16_t (2)
999 * - the group table, { uint16_t groupMSB, uint16_t offsetHigh, uint16_t offsetLow }[6*groupCount]
1000 *
46f4442e 1001 * - the group strings (groupTop-groupBottom), 4-padded
b75a7d8f
A
1002 *
1003 * - the size of the data for the algorithmic names
1004 */
1005 tokenStringOffset=4+4+4+4+2+2*tokenCount;
46f4442e 1006 groupsOffset=(tokenStringOffset+(lineTop-groupTop)+1)&~1;
b75a7d8f 1007 groupStringOffset=groupsOffset+2+6*lineCount;
46f4442e 1008 algNamesOffset=(groupStringOffset+(groupTop-groupBottom)+3)&~3;
b75a7d8f 1009
/* a NULL pData makes this call only compute the size of the algorithmic names data */
46f4442e 1010 offset=generateAlgorithmicData(NULL, storeOptions);
b75a7d8f
A
1011 size=algNamesOffset+offset;
1012
1013 if(!beQuiet) {
1014 printf("size of the Unicode Names data:\n"
1015 "total data length %lu, token strings %lu, compressed strings %lu, algorithmic names %lu\n",
1016 (unsigned long)size, (unsigned long)(lineTop-groupTop),
46f4442e 1017 (unsigned long)(groupTop-groupBottom), (unsigned long)offset);
b75a7d8f
A
1018 }
1019
1020 /* write the data to the file */
1021 /* offsets */
1022 udata_write32(pData, tokenStringOffset);
1023 udata_write32(pData, groupsOffset);
1024 udata_write32(pData, groupStringOffset);
1025 udata_write32(pData, algNamesOffset);
1026
1027 /* token table */
1028 udata_write16(pData, (uint16_t)tokenCount);
1029 udata_writeBlock(pData, tokens, 2*tokenCount);
1030
1031 /* token strings */
1032 udata_writeBlock(pData, stringStore+groupTop, lineTop-groupTop);
1033 if((lineTop-groupTop)&1) {
1034 /* 2-padding */
1035 udata_writePadding(pData, 1);
1036 }
1037
1038 /* group table */
/* at this point lineCount is the number of groups and lines[] holds the group entries */
1039 udata_write16(pData, (uint16_t)lineCount);
1040 for(i=0; i<lineCount; ++i) {
1041 /* groupMSB */
1042 groupWords[0]=(uint16_t)lines[i].code;
1043
1044 /* offset */
46f4442e 1045 offset = (uint32_t)((lines[i].s - stringStore)-groupBottom);
b75a7d8f
A
1046 groupWords[1]=(uint16_t)(offset>>16);
1047 groupWords[2]=(uint16_t)(offset);
1048 udata_writeBlock(pData, groupWords, 6);
1049 }
1050
1051 /* group strings */
46f4442e 1052 udata_writeBlock(pData, stringStore+groupBottom, groupTop-groupBottom);
b75a7d8f
A
1053
1054 /* 4-align the algorithmic names data */
46f4442e 1055 udata_writePadding(pData, algNamesOffset-(groupStringOffset+(groupTop-groupBottom)));
b75a7d8f 1056
46f4442e 1057 generateAlgorithmicData(pData, storeOptions);
b75a7d8f
A
1058
1059 /* finish up */
1060 dataLength=udata_finish(pData, &errorCode);
1061 if(U_FAILURE(errorCode)) {
1062 fprintf(stderr, "gennames: error %d writing the output file\n", errorCode);
1063 exit(errorCode);
1064 }
1065
/* consistency check: the bytes actually written must match the size calculated above */
1066 if(dataLength!=(long)size) {
1067 fprintf(stderr, "gennames: data length %ld != calculated size %lu\n",
1068dataLength, (unsigned long)size);
1069 exit(U_INTERNAL_PROGRAM_ERROR);
1070 }
1071}
1072
/*
 * Header of one range of algorithmically named code points, written verbatim
 * to the data file in front of the range-specific data.
 * The structure for algorithmic names needs to be 4-aligned.
 */
typedef struct AlgorithmicRange {
    uint32_t rangeStart, rangeEnd;              /* first and last code point of the range */
    uint8_t algorithmType, algorithmVariant;    /* which algorithm, and its variant */
    uint16_t rangeSize;                         /* size in bytes of this header plus its range data */
} AlgorithmicRange;
1079
1080static uint32_t
46f4442e 1081generateAlgorithmicData(UNewDataMemory *pData, Options *storeOptions) {
b75a7d8f
A
1082 static char prefix[] = "CJK UNIFIED IDEOGRAPH-";
1083# define PREFIX_LENGTH 23
1084# define PREFIX_LENGTH_4 24
1085 uint32_t countAlgRanges;
1086
1087 static AlgorithmicRange cjkExtA={
1088 0x3400, 0x4db5,
1089 0, 4,
1090 sizeof(AlgorithmicRange)+PREFIX_LENGTH_4
1091 };
1092 static AlgorithmicRange cjk={
1093 0x4e00, 0x9fa5,
1094 0, 4,
1095 sizeof(AlgorithmicRange)+PREFIX_LENGTH_4
1096 };
1097 static AlgorithmicRange cjkExtB={
1098 0x20000, 0x2a6d6,
1099 0, 5,
1100 sizeof(AlgorithmicRange)+PREFIX_LENGTH_4
1101 };
1102
1103 static char jamo[]=
1104 "HANGUL SYLLABLE \0"
1105
1106 "G\0GG\0N\0D\0DD\0R\0M\0B\0BB\0"
1107 "S\0SS\0\0J\0JJ\0C\0K\0T\0P\0H\0"
1108
1109 "A\0AE\0YA\0YAE\0EO\0E\0YEO\0YE\0O\0"
1110 "WA\0WAE\0OE\0YO\0U\0WEO\0WE\0WI\0"
1111 "YU\0EU\0YI\0I\0"
1112
1113 "\0G\0GG\0GS\0N\0NJ\0NH\0D\0L\0LG\0LM\0"
1114 "LB\0LS\0LT\0LP\0LH\0M\0B\0BS\0"
1115 "S\0SS\0NG\0J\0C\0K\0T\0P\0H"
1116 ;
1117
1118 static AlgorithmicRange hangul={
1119 0xac00, 0xd7a3,
1120 1, 3,
1121 sizeof(AlgorithmicRange)+6+sizeof(jamo)
1122 };
1123
1124 /* modulo factors, maximum 8 */
1125 /* 3 factors: 19, 21, 28, most-to-least-significant */
1126 static uint16_t hangulFactors[3]={
1127 19, 21, 28
1128 };
1129
1130 uint32_t size;
1131
1132 size=0;
1133
46f4442e
A
1134 if(ucdVersion>=UNI_5_1) {
1135 /* Unicode 5.1 and up has a longer CJK Unihan range than before */
1136 cjk.rangeEnd=0x9FC3;
1137 } else if(ucdVersion>=UNI_4_1) {
73c04bcf
A
1138 /* Unicode 4.1 and up has a longer CJK Unihan range than before */
1139 cjk.rangeEnd=0x9FBB;
1140 }
1141
b75a7d8f 1142 /* number of ranges of algorithmic names */
46f4442e
A
1143 if(!storeOptions->storeNames) {
1144 countAlgRanges=0;
1145 } else if(ucdVersion>=UNI_3_1) {
b75a7d8f
A
1146 /* Unicode 3.1 and up has 4 ranges including CJK Extension B */
1147 countAlgRanges=4;
73c04bcf 1148 } else if(ucdVersion>=UNI_3_0) {
b75a7d8f
A
1149 /* Unicode 3.0 has 3 ranges including CJK Extension A */
1150 countAlgRanges=3;
1151 } else {
1152 /* Unicode 2.0 has 2 ranges including Hangul and CJK Unihan */
1153 countAlgRanges=2;
1154 }
1155
1156 if(pData!=NULL) {
1157 udata_write32(pData, countAlgRanges);
1158 } else {
1159 size+=4;
1160 }
46f4442e
A
1161 if(countAlgRanges==0) {
1162 return size;
1163 }
b75a7d8f
A
1164
1165 /*
1166 * each range:
1167 * uint32_t rangeStart
1168 * uint32_t rangeEnd
1169 * uint8_t algorithmType
1170 * uint8_t algorithmVariant
1171 * uint16_t size of range data
1172 * uint8_t[size] data
1173 */
1174
1175 /* range 0: cjk extension a */
1176 if(countAlgRanges>=3) {
1177 if(pData!=NULL) {
1178 udata_writeBlock(pData, &cjkExtA, sizeof(AlgorithmicRange));
1179 udata_writeString(pData, prefix, PREFIX_LENGTH);
1180 if(PREFIX_LENGTH<PREFIX_LENGTH_4) {
1181 udata_writePadding(pData, PREFIX_LENGTH_4-PREFIX_LENGTH);
1182 }
1183 } else {
1184 size+=sizeof(AlgorithmicRange)+PREFIX_LENGTH_4;
1185 }
1186 }
1187
1188 /* range 1: cjk */
1189 if(pData!=NULL) {
1190 udata_writeBlock(pData, &cjk, sizeof(AlgorithmicRange));
1191 udata_writeString(pData, prefix, PREFIX_LENGTH);
1192 if(PREFIX_LENGTH<PREFIX_LENGTH_4) {
1193 udata_writePadding(pData, PREFIX_LENGTH_4-PREFIX_LENGTH);
1194 }
1195 } else {
1196 size+=sizeof(AlgorithmicRange)+PREFIX_LENGTH_4;
1197 }
1198
1199 /* range 2: hangul syllables */
1200 if(pData!=NULL) {
1201 udata_writeBlock(pData, &hangul, sizeof(AlgorithmicRange));
1202 udata_writeBlock(pData, hangulFactors, 6);
1203 udata_writeString(pData, jamo, sizeof(jamo));
1204 } else {
1205 size+=sizeof(AlgorithmicRange)+6+sizeof(jamo);
1206 }
1207
1208 /* range 3: cjk extension b */
1209 if(countAlgRanges>=4) {
1210 if(pData!=NULL) {
1211 udata_writeBlock(pData, &cjkExtB, sizeof(AlgorithmicRange));
1212 udata_writeString(pData, prefix, PREFIX_LENGTH);
1213 if(PREFIX_LENGTH<PREFIX_LENGTH_4) {
1214 udata_writePadding(pData, PREFIX_LENGTH_4-PREFIX_LENGTH);
1215 }
1216 } else {
1217 size+=sizeof(AlgorithmicRange)+PREFIX_LENGTH_4;
1218 }
1219 }
1220
1221 return size;
1222}
1223
1224/* helpers ------------------------------------------------------------------ */
1225
1226static int16_t
1227findToken(uint8_t *s, int16_t length) {
1228 int16_t i, token;
1229
1230 for(i=0; i<(int16_t)tokenCount; ++i) {
1231 token=tokens[i];
73c04bcf 1232 if(token>=0 && length==words[token].length && 0==uprv_memcmp(s, words[token].s, length)) {
b75a7d8f
A
1233 return i;
1234 }
1235 }
1236
1237 return -1;
1238}
1239
1240static Word *
1241findWord(char *s, int16_t length) {
1242 uint32_t i;
1243
1244 for(i=0; i<wordCount; ++i) {
1245 if(length==words[i].length && 0==uprv_memcmp(s, words[i].s, length)) {
1246 return words+i;
1247 }
1248 }
1249
1250 return NULL;
1251}
1252
1253static Word *
1254addWord(char *s, int16_t length) {
1255 uint8_t *stringStart;
1256 Word *word;
1257
1258 if(wordCount==MAX_WORD_COUNT) {
1259 fprintf(stderr, "gennames: too many words\n");
1260 exit(U_BUFFER_OVERFLOW_ERROR);
1261 }
1262
1263 stringStart=allocWord(length);
1264 uprv_memcpy(stringStart, s, length);
1265
1266 word=words+wordCount;
1267
1268 /*
1269 * Initialize the weight with the costs for this token:
1270 * a zero-terminated string and a 16-bit offset.
1271 */
1272 word->weight=-(length+1+2);
1273 word->count=0;
1274 word->length=length;
1275 word->s=stringStart;
1276
1277 ++wordCount;
1278
1279 return word;
1280}
1281
1282static void
1283countWord(Word *word) {
1284 /* add to the weight the savings: the length of the word minus 1 byte for the token */
1285 word->weight+=word->length-1;
1286 ++word->count;
1287}
1288
1289static void
1290addLine(uint32_t code, char *names[], int16_t lengths[], int16_t count) {
1291 uint8_t *stringStart;
1292 Line *line;
1293 int16_t i, length;
1294
1295 if(lineCount==MAX_LINE_COUNT) {
1296 fprintf(stderr, "gennames: too many lines\n");
1297 exit(U_BUFFER_OVERFLOW_ERROR);
1298 }
1299
1300 /* find the last non-empty name */
1301 while(count>0 && lengths[count-1]==0) {
1302 --count;
1303 }
1304 if(count==0) {
1305 return; /* should not occur: caller should not have called */
1306 }
1307
1308 /* there will be (count-1) separator characters */
1309 i=count;
1310 length=count-1;
1311
1312 /* add lengths of strings */
1313 while(i>0) {
1314 length+=lengths[--i];
1315 }
1316
1317 /* allocate line memory */
1318 stringStart=allocLine(length);
1319
1320 /* copy all strings into the line memory */
1321 length=0; /* number of chars copied so far */
1322 for(i=0; i<count; ++i) {
1323 if(i>0) {
1324 stringStart[length++]=NAME_SEPARATOR_CHAR;
1325 }
1326 if(lengths[i]>0) {
1327 uprv_memcpy(stringStart+length, names[i], lengths[i]);
1328 length+=lengths[i];
1329 }
1330 }
1331
1332 line=lines+lineCount;
1333
1334 line->code=code;
1335 line->length=length;
1336 line->s=stringStart;
1337
1338 ++lineCount;
1339
1340 /* prevent a character value that is actually in a name from becoming a token */
1341 while(length>0) {
1342 tokens[stringStart[--length]]=-1;
1343 }
1344}
1345
1346static void
1347addGroup(uint32_t groupMSB, uint8_t *strings, int16_t length) {
1348 uint8_t *stringStart;
1349 Line *line;
1350
1351 if(lineCount==MAX_LINE_COUNT) {
1352 fprintf(stderr, "gennames: too many groups\n");
1353 exit(U_BUFFER_OVERFLOW_ERROR);
1354 }
1355
1356 /* store the line lengths first, then the strings */
1357 lineLengthsTop=(lineLengthsTop+1)/2;
1358 stringStart=allocLine(lineLengthsTop+length);
1359 uprv_memcpy(stringStart, lineLengths, lineLengthsTop);
1360 uprv_memcpy(stringStart+lineLengthsTop, strings, length);
1361
1362 line=lines+lineCount;
1363
1364 line->code=groupMSB;
1365 line->length=length;
1366 line->s=stringStart;
1367
1368 ++lineCount;
1369}
1370
1371static uint32_t
1372addToken(uint8_t *s, int16_t length) {
1373 uint8_t *stringStart;
1374
1375 stringStart=allocLine(length+1);
1376 uprv_memcpy(stringStart, s, length);
1377 stringStart[length]=0;
1378
1379 return (uint32_t)(stringStart - stringStore);
1380}
1381
1382static void
1383appendLineLength(int16_t length) {
1384 if(length>=76) {
1385 fprintf(stderr, "gennames: compressed line too long\n");
1386 exit(U_BUFFER_OVERFLOW_ERROR);
1387 }
1388 if(length>=12) {
1389 length-=12;
1390 appendLineLengthNibble((uint8_t)((length>>4)|12));
1391 }
1392 appendLineLengthNibble((uint8_t)length);
1393}
1394
1395static void
1396appendLineLengthNibble(uint8_t nibble) {
1397 if((lineLengthsTop&1)==0) {
1398 lineLengths[lineLengthsTop/2]=(uint8_t)(nibble<<4);
1399 } else {
1400 lineLengths[lineLengthsTop/2]|=nibble&0xf;
1401 }
1402 ++lineLengthsTop;
1403}
1404
1405static uint8_t *
1406allocLine(int32_t length) {
1407 uint32_t top=lineTop+length;
1408 uint8_t *p;
1409
1410 if(top>wordBottom) {
1411 fprintf(stderr, "gennames: out of memory\n");
1412 exit(U_MEMORY_ALLOCATION_ERROR);
1413 }
1414 p=stringStore+lineTop;
1415 lineTop=top;
1416 return p;
1417}
1418
1419static uint8_t *
1420allocWord(uint32_t length) {
1421 uint32_t bottom=wordBottom-length;
1422
1423 if(lineTop>bottom) {
1424 fprintf(stderr, "gennames: out of memory\n");
1425 exit(U_MEMORY_ALLOCATION_ERROR);
1426 }
1427 wordBottom=bottom;
1428 return stringStore+bottom;
1429}
1430
1431/*
1432 * Hey, Emacs, please set the following:
1433 *
1434 * Local Variables:
1435 * indent-tabs-mode: nil
1436 * End:
1437 *
1438 */