2 *******************************************************************************
4 * Copyright (C) 1999-2008, International Business Machines
5 * Corporation and others. All Rights Reserved.
7 *******************************************************************************
8 * file name: gennames.c
10 * tab size: 8 (not used)
13 * created on: 1999sep30
14 * created by: Markus W. Scherer
16 * This program reads the Unicode character database text file,
17 * parses it, and extracts the character code,
18 * the "modern" character name, and optionally the
19 * Unicode 1.0 character name, and (starting with ICU 2.2) the ISO 10646 comment.
20 * It then tokenizes and compresses the names and builds
21 * compact binary tables for random-access lookup
22 * in a u_charName() API function.
24 * unames.icu file format (after UDataInfo header etc. - see udata.c)
25 * (all data is static const)
30 * dataVersion = Unicode version from -u or --unicode command line option, defaults to 4.1 (preset in main())
33 * uint32_t tokenStringOffset,
38 * uint16_t tokenCount;
39 * uint16_t tokenTable[tokenCount];
41 * char tokenStrings[]; -- padded to even count
43 * -- strings (groupStrings) are tokenized as follows:
44 * for each character c
45 * if(c>=tokenCount) write that character c directly
47 * token=tokenTable[c];
48 * if(token==0xfffe) -- lead byte of double-byte token
49 * token=tokenTable[c<<8|next character];
53 * tokenString=tokenStrings+token; (tokenStrings=start of names data + tokenStringOffset;)
54 * append zero-terminated tokenString;
56 * Different strings for a code point - normal name, 1.0 name, and ISO comment -
57 * are separated by ';'.
59 * uint16_t groupCount;
61 * uint16_t groupMSB; -- for a group of 32 character names stored, this is code point>>5
62 * uint16_t offsetHigh; -- group strings are at start of names data + groupStringsOffset + this 32 bit-offset
64 * } groupTable[groupCount];
66 * char groupStrings[]; -- padded to 4-count
68 * -- The actual, tokenized group strings are not zero-terminated because
69 * that would take up too much space.
70 * Instead, they are preceded by their length, written in a variable-length sequence:
71 * For each of the 32 group strings, one or two nibbles are stored for its length.
72 * Nibbles (4-bit values, half-bytes) are read MSB first.
73 * A nibble with a value of 0..11 directly indicates the length of the name string.
74 * A nibble n with a value of 12..15 is a lead nibble and forms a value with the following nibble m
75 * by (((n-12)<<4)|m)+12, reaching values of 12..75.
76 * These lengths are stored sequentially, one per tokenized string; they are not the lengths of the de-tokenized results.
77 * For the de-tokenizing, see token description above; the strings immediately follow the
80 * -- algorithmic names
82 * typedef struct AlgorithmicRange {
83 * uint32_t rangeStart, rangeEnd;
84 * uint8_t algorithmType, algorithmVariant;
88 * uint32_t algRangesCount; -- number of data blocks for ranges of
89 * algorithmic names (Unicode 3.0.0: 3, hardcoded in gennames)
92 * AlgorithmicRange algRange;
93 * uint8_t algRangeData[]; -- padded to 4-count except in last range
94 * } algRanges[algRangesCount];
95 * -- not a real array because each part has a different size
96 * of algRange.rangeSize (including AlgorithmicRange)
98 * -- algorithmic range types:
100 * 0 Names are formed from a string prefix that is stored in
101 * the algRangeData (zero-terminated), followed by the Unicode code point
102 * of the character in hexadecimal digits;
103 * algRange.algorithmVariant digits are written
105 * 1 Names are formed by calculating modulo-factors of the code point value as follows:
106 * algRange.algorithmVariant is the count of modulo factors
107 * algRangeData contains
108 * uint16_t factors[algRange.algorithmVariant];
110 * the first zero-terminated string is written as the prefix; then:
112 * The rangeStart is subtracted; with the difference, here "code":
113 * for(i=algRange.algorithmVariant-1 to 0 step -1)
114 * index[i]=code%factor[i];
117 * The strings after the prefix are short pieces that are then appended to the result
118 * according to index[0..algRange.algorithmVariant-1].
122 #include "unicode/utypes.h"
123 #include "unicode/putil.h"
124 #include "unicode/uclean.h"
125 #include "unicode/udata.h"
128 #include "uarrsort.h"
129 #include "unewdata.h"
130 #include "uoptions.h"
133 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
135 #define STRING_STORE_SIZE 1000000
136 #define GROUP_STORE_SIZE 5000
138 #define GROUP_SHIFT 5
139 #define LINES_PER_GROUP (1UL<<GROUP_SHIFT)
140 #define GROUP_MASK (LINES_PER_GROUP-1)
142 #define MAX_LINE_COUNT 50000
143 #define MAX_WORD_COUNT 20000
144 #define MAX_GROUP_COUNT 5000
146 #define DATA_NAME "unames"
147 #define DATA_TYPE "icu"
148 #define VERSION_STRING "unam"
149 #define NAME_SEPARATOR_CHAR ';'
151 #define ISO_DATA_NAME "ucomment"
153 /* Unicode versions --------------------------------------------------------- */
170 static const UVersionInfo
185 static int32_t ucdVersion
=UNI_5_1
;
188 findUnicodeVersion(const UVersionInfo version
) {
191 for(i
=0; /* while(version>unicodeVersions[i]) {} */
192 i
<UNI_VER_COUNT
&& uprv_memcmp(version
, unicodeVersions
[i
], 4)>0;
194 if(0<i
&& i
<UNI_VER_COUNT
&& uprv_memcmp(version
, unicodeVersions
[i
], 4)<0) {
195 --i
; /* fix 4.0.2 to land before 4.1, for valid x>=ucdVersion comparisons */
197 return i
; /* version>=unicodeVersions[i] && version<unicodeVersions[i+1]; possible: i==UNI_VER_COUNT */
200 /* generator data ----------------------------------------------------------- */
202 /* UDataInfo cf. udata.h */
203 static UDataInfo dataInfo
={
212 {0x75, 0x6e, 0x61, 0x6d}, /* dataFormat="unam" */
213 {1, 0, 0, 0}, /* formatVersion */
214 {3, 0, 0, 0} /* dataVersion */
217 static UBool beVerbose
=FALSE
, beQuiet
=FALSE
, haveCopyright
=TRUE
;
219 typedef struct Options
{
222 UBool storeISOComments
;
225 static uint8_t stringStore
[STRING_STORE_SIZE
],
226 groupStore
[GROUP_STORE_SIZE
],
227 lineLengths
[LINES_PER_GROUP
];
229 static uint32_t lineTop
=0, groupBottom
, wordBottom
=STRING_STORE_SIZE
, lineLengthsTop
;
238 int32_t weight
; /* -(cost for token) + (number of occurences) * (length-1) */
244 static Line lines
[MAX_LINE_COUNT
];
245 static Word words
[MAX_WORD_COUNT
];
247 static uint32_t lineCount
=0, wordCount
=0;
249 static int16_t leadByteCount
;
251 #define LEADBYTE_LIMIT 16
253 static int16_t tokens
[LEADBYTE_LIMIT
*256];
254 static uint32_t tokenCount
;
256 /* prototypes --------------------------------------------------------------- */
262 parseDB(const char *filename
, Options
*options
);
265 parseName(char *name
, int16_t length
);
268 skipNoise(char *line
, int16_t start
, int16_t limit
);
271 getWord(char *line
, int16_t start
, int16_t limit
);
280 compressLine(uint8_t *s
, int16_t length
, int16_t *pGroupTop
);
283 compareWords(const void *context
, const void *word1
, const void *word2
);
286 generateData(const char *dataDir
, Options
*options
);
289 generateAlgorithmicData(UNewDataMemory
*pData
, Options
*options
);
292 findToken(uint8_t *s
, int16_t length
);
295 findWord(char *s
, int16_t length
);
298 addWord(char *s
, int16_t length
);
301 countWord(Word
*word
);
304 addLine(uint32_t code
, char *names
[], int16_t lengths
[], int16_t count
);
307 addGroup(uint32_t groupMSB
, uint8_t *strings
, int16_t length
);
310 addToken(uint8_t *s
, int16_t length
);
313 appendLineLength(int16_t length
);
316 appendLineLengthNibble(uint8_t nibble
);
319 allocLine(int32_t length
);
322 allocWord(uint32_t length
);
324 /* -------------------------------------------------------------------------- */
339 static UOption options
[]={
341 UOPTION_HELP_QUESTION_MARK
,
346 { "unicode", NULL
, NULL
, NULL
, 'u', UOPT_REQUIRES_ARG
, 0 },
347 { "unicode1-names", NULL
, NULL
, NULL
, '1', UOPT_NO_ARG
, 0 },
348 { "no-iso-comments", NULL
, NULL
, NULL
, '\1', UOPT_NO_ARG
, 0 },
349 { "only-iso-comments", NULL
, NULL
, NULL
, '\1', UOPT_NO_ARG
, 0 }
353 main(int argc
, char* argv
[]) {
354 UVersionInfo version
;
355 Options moreOptions
={ TRUE
, FALSE
, TRUE
};
356 UErrorCode errorCode
= U_ZERO_ERROR
;
358 U_MAIN_INIT_ARGS(argc
, argv
);
362 if (U_FAILURE(errorCode
) && errorCode
!= U_FILE_ACCESS_ERROR
) {
363 /* Note: u_init() will try to open ICU property data.
364 * failures here are expected when building ICU from scratch.
367 fprintf(stderr
, "%s: can not initialize ICU. errorCode = %s\n",
368 argv
[0], u_errorName(errorCode
));
372 /* preset then read command line options */
373 options
[DESTDIR
].value
=u_getDataDirectory();
374 options
[UNICODE
].value
="4.1";
375 argc
=u_parseArgs(argc
, argv
, LENGTHOF(options
), options
);
377 /* error handling, printing usage message */
380 "error in command line argument \"%s\"\n",
385 if(argc
<0 || options
[HELP_H
].doesOccur
|| options
[HELP_QUESTION_MARK
].doesOccur
) {
387 * Broken into chunks because the C89 standard says the minimum
388 * required supported string length is 509 bytes.
391 "Usage: %s [-1[+|-]] [-v[+|-]] [-c[+|-]] filename\n"
393 "Read the UnicodeData.txt file and \n"
394 "create a binary file " DATA_NAME
"." DATA_TYPE
" with the character names\n"
396 "\tfilename absolute path/filename for the Unicode database text file\n"
397 "\t\t(default: standard input)\n"
402 "\t-h or -? or --help this usage text\n"
403 "\t-v or --verbose verbose output\n"
404 "\t-q or --quiet no output\n"
405 "\t-c or --copyright include a copyright notice\n"
406 "\t-d or --destdir destination directory, followed by the path\n"
407 "\t-u or --unicode Unicode version, followed by the version like 3.0.0\n");
409 "\t-1 or --unicode1-names store Unicode 1.0 character names\n"
410 "\t --no-iso-comments do not store ISO comments\n"
411 "\t --only-iso-comments write ucomment.icu with only ISO comments\n");
412 return argc
<0 ? U_ILLEGAL_ARGUMENT_ERROR
: U_ZERO_ERROR
;
415 /* get the options values */
416 beVerbose
=options
[VERBOSE
].doesOccur
;
417 beQuiet
=options
[QUIET
].doesOccur
;
418 haveCopyright
=options
[COPYRIGHT
].doesOccur
;
419 moreOptions
.store10Names
=options
[UNICODE1_NAMES
].doesOccur
;
420 moreOptions
.storeISOComments
=!options
[NO_ISO_COMMENTS
].doesOccur
;
421 if(options
[ONLY_ISO_COMMENTS
].doesOccur
) {
422 moreOptions
.storeNames
=moreOptions
.store10Names
=FALSE
;
423 moreOptions
.storeISOComments
=TRUE
;
426 /* set the Unicode version */
427 u_versionFromString(version
, options
[UNICODE
].value
);
428 uprv_memcpy(dataInfo
.dataVersion
, version
, 4);
429 ucdVersion
=findUnicodeVersion(version
);
432 parseDB(argc
>=2 ? argv
[1] : "-", &moreOptions
);
434 generateData(options
[DESTDIR
].value
, &moreOptions
);
444 for(i
=0; i
<256; ++i
) {
449 /* parsing ------------------------------------------------------------------ */
451 /* get a name, strip leading and trailing whitespace */
453 getName(char **pStart
, char *limit
) {
454 /* strip leading whitespace */
455 char *start
=(char *)u_skipWhitespace(*pStart
);
457 /* strip trailing whitespace */
458 while(start
<limit
&& (*(limit
-1)==' ' || *(limit
-1)=='\t')) {
464 return (int16_t)(limit
-start
);
467 static void U_CALLCONV
468 lineFn(void *context
,
469 char *fields
[][2], int32_t fieldCount
,
470 UErrorCode
*pErrorCode
) {
471 Options
*storeOptions
=(Options
*)context
;
473 int16_t lengths
[3]={ 0, 0, 0 };
474 static uint32_t prevCode
=0;
477 if(U_FAILURE(*pErrorCode
)) {
480 /* get the character code */
481 code
=uprv_strtoul(fields
[0][0], NULL
, 16);
483 /* get the character name */
484 if(storeOptions
->storeNames
) {
485 names
[0]=fields
[1][0];
486 lengths
[0]=getName(names
+0, fields
[1][1]);
487 if(names
[0][0]=='<') {
488 /* do not store pseudo-names in <> brackets */
493 /* store 1.0 names */
494 /* get the second character name, the one from Unicode 1.0 */
495 if(storeOptions
->store10Names
) {
496 names
[1]=fields
[10][0];
497 lengths
[1]=getName(names
+1, fields
[10][1]);
498 if(names
[1][0]=='<') {
499 /* do not store pseudo-names in <> brackets */
504 /* get the ISO 10646 comment */
505 if(storeOptions
->storeISOComments
) {
506 names
[2]=fields
[11][0];
507 lengths
[2]=getName(names
+2, fields
[11][1]);
510 if(lengths
[0]+lengths
[1]+lengths
[2]==0) {
514 /* check for non-character code points */
515 if(!U_IS_UNICODE_CHAR(code
)) {
516 fprintf(stderr
, "gennames: error - properties for non-character code point U+%04lx\n",
517 (unsigned long)code
);
518 *pErrorCode
=U_PARSE_ERROR
;
522 /* check that the code points (code) are in ascending order */
523 if(code
<=prevCode
&& code
>0) {
524 fprintf(stderr
, "gennames: error - UnicodeData entries out of order, U+%04lx after U+%04lx\n",
525 (unsigned long)code
, (unsigned long)prevCode
);
526 *pErrorCode
=U_PARSE_ERROR
;
531 parseName(names
[0], lengths
[0]);
532 parseName(names
[1], lengths
[1]);
533 parseName(names
[2], lengths
[2]);
536 * set the count argument to
537 * 1: only store regular names, or only store ISO 10646 comments
538 * 2: store regular and 1.0 names
539 * 3: store names and ISO 10646 comment
541 * addLine() will ignore empty trailing names
543 if(storeOptions
->storeNames
) {
544 /* store names and comments as parsed according to storeOptions */
545 addLine(code
, names
, lengths
, 3);
547 /* store only ISO 10646 comments */
548 addLine(code
, names
+2, lengths
+2, 1);
553 parseDB(const char *filename
, Options
*storeOptions
) {
555 UErrorCode errorCode
=U_ZERO_ERROR
;
557 u_parseDelimitedFile(filename
, ';', fields
, 15, lineFn
, storeOptions
, &errorCode
);
558 if(U_FAILURE(errorCode
)) {
559 fprintf(stderr
, "gennames parse error: %s\n", u_errorName(errorCode
));
564 printf("size of all names in the database: %lu\n",
565 (unsigned long)lineTop
);
566 printf("number of named Unicode characters: %lu\n",
567 (unsigned long)lineCount
);
568 printf("number of words in the dictionary from these names: %lu\n",
569 (unsigned long)wordCount
);
574 parseName(char *name
, int16_t length
) {
575 int16_t start
=0, limit
, wordLength
/*, prevStart=-1*/;
578 while(start
<length
) {
579 /* skip any "noise" characters */
580 limit
=skipNoise(name
, start
, length
);
589 /* get a word and add it if it is longer than 1 */
590 limit
=getWord(name
, start
, length
);
591 wordLength
=(int16_t)(limit
-start
);
593 word
=findWord(name
+start
, wordLength
);
595 word
=addWord(name
+start
, wordLength
);
602 * if there was a word before this
603 * (with no noise in between), then add the pair of words, too
606 wordLength
=limit
-prevStart
;
607 word
=findWord(name
+prevStart
, wordLength
);
609 word
=addWord(name
+prevStart
, wordLength
);
620 static UBool U_INLINE
622 return ('A'<=c
&& c
<='I') || /* EBCDIC-safe check for letters */
623 ('J'<=c
&& c
<='R') ||
624 ('S'<=c
&& c
<='Z') ||
626 ('a'<=c
&& c
<='i') || /* lowercase letters for ISO comments */
627 ('j'<=c
&& c
<='r') ||
628 ('s'<=c
&& c
<='z') ||
634 skipNoise(char *line
, int16_t start
, int16_t limit
) {
635 /* skip anything that is not part of a word in this sense */
636 while(start
<limit
&& !isWordChar(line
[start
])) {
644 getWord(char *line
, int16_t start
, int16_t limit
) {
645 char c
=0; /* initialize to avoid a compiler warning although the code was safe */
647 /* a unicode character name word consists of A-Z0-9 */
648 while(start
<limit
&& isWordChar(line
[start
])) {
652 /* include a following space or dash */
653 if(start
<limit
&& ((c
=line
[start
])==' ' || c
=='-')) {
660 /* compressing -------------------------------------------------------------- */
664 uint32_t i
, letterCount
;
666 UErrorCode errorCode
;
668 /* sort the words in reverse order by weight */
669 errorCode
=U_ZERO_ERROR
;
670 uprv_sortArray(words
, wordCount
, sizeof(Word
),
671 compareWords
, NULL
, FALSE
, &errorCode
);
673 /* remove the words that do not save anything */
674 while(wordCount
>0 && words
[wordCount
-1].weight
<1) {
678 /* count the letters in the token range */
680 for(i
=LEADBYTE_LIMIT
; i
<256; ++i
) {
686 printf("number of letters used in the names: %d\n", (int)letterCount
);
689 /* do we need double-byte tokens? */
690 if(wordCount
+letterCount
<=256) {
691 /* no, single-byte tokens are enough */
693 for(i
=0, wordNumber
=0; wordNumber
<(int16_t)wordCount
; ++i
) {
695 tokens
[i
]=wordNumber
;
697 printf("tokens[0x%03x]: word%8ld \"%.*s\"\n",
698 (int)i
, (long)words
[wordNumber
].weight
,
699 words
[wordNumber
].length
, words
[wordNumber
].s
);
707 * The tokens that need two token bytes
708 * get their weight reduced by their count
709 * because they save less.
711 tokenCount
=256-letterCount
;
712 for(i
=tokenCount
; i
<wordCount
; ++i
) {
713 words
[i
].weight
-=words
[i
].count
;
716 /* sort these words in reverse order by weight */
717 errorCode
=U_ZERO_ERROR
;
718 uprv_sortArray(words
+tokenCount
, wordCount
-tokenCount
, sizeof(Word
),
719 compareWords
, NULL
, FALSE
, &errorCode
);
721 /* remove the words that do not save anything */
722 while(wordCount
>0 && words
[wordCount
-1].weight
<1) {
726 /* how many tokens and lead bytes do we have now? */
727 tokenCount
=wordCount
+letterCount
+(LEADBYTE_LIMIT
-1);
729 * adjust upwards to take into account that
730 * double-byte tokens must not
731 * use NAME_SEPARATOR_CHAR as a second byte
733 tokenCount
+=(tokenCount
-256+254)/255;
735 leadByteCount
=(int16_t)(tokenCount
>>8);
736 if(leadByteCount
<LEADBYTE_LIMIT
) {
737 /* adjust for the real number of lead bytes */
738 tokenCount
-=(LEADBYTE_LIMIT
-1)-leadByteCount
;
740 /* limit the number of lead bytes */
741 leadByteCount
=LEADBYTE_LIMIT
-1;
742 tokenCount
=LEADBYTE_LIMIT
*256;
743 wordCount
=tokenCount
-letterCount
-(LEADBYTE_LIMIT
-1);
744 /* adjust again to skip double-byte tokens with ';' */
745 wordCount
-=(tokenCount
-256+254)/255;
748 /* set token 0 to word 0 */
751 printf("tokens[0x000]: word%8ld \"%.*s\"\n",
752 (long)words
[0].weight
,
753 words
[0].length
, words
[0].s
);
757 /* set the lead byte tokens */
758 for(i
=1; (int16_t)i
<=leadByteCount
; ++i
) {
764 /* if store10Names then the parser set tokens[NAME_SEPARATOR_CHAR]=-1 */
766 tokens
[i
]=wordNumber
;
768 printf("tokens[0x%03x]: word%8ld \"%.*s\"\n",
769 (int)i
, (long)words
[wordNumber
].weight
,
770 words
[wordNumber
].length
, words
[wordNumber
].s
);
776 /* continue above 255 where there are no letters */
777 for(; (uint32_t)wordNumber
<wordCount
; ++i
) {
778 if((i
&0xff)==NAME_SEPARATOR_CHAR
) {
779 tokens
[i
]=-1; /* do not use NAME_SEPARATOR_CHAR as a second token byte */
781 tokens
[i
]=wordNumber
;
783 printf("tokens[0x%03x]: word%8ld \"%.*s\"\n",
784 (int)i
, (long)words
[wordNumber
].weight
,
785 words
[wordNumber
].length
, words
[wordNumber
].s
);
790 tokenCount
=i
; /* should be already tokenCount={i or i+1} */
794 printf("number of lead bytes: %d\n", leadByteCount
);
795 printf("number of single-byte tokens: %lu\n",
796 (unsigned long)256-letterCount
-leadByteCount
);
797 printf("number of tokens: %lu\n", (unsigned long)tokenCount
);
806 uint32_t i
=0, inLine
, outLine
=0xffffffff /* (uint32_t)(-1) */,
807 groupMSB
=0xffff, lineCount2
;
810 /* store the groups like lines, with compressed data after raw strings */
812 lineCount2
=lineCount
;
815 /* loop over all lines */
816 while(i
<lineCount2
) {
820 /* segment the lines to groups of 32 */
821 if(inLine
>>GROUP_SHIFT
!=groupMSB
) {
822 /* finish the current group with empty lines */
823 while((++outLine
&GROUP_MASK
)!=0) {
827 /* store the group like a line */
829 if(groupTop
>GROUP_STORE_SIZE
) {
830 fprintf(stderr
, "gennames: group store overflow\n");
831 exit(U_BUFFER_OVERFLOW_ERROR
);
833 addGroup(groupMSB
, groupStore
, groupTop
);
836 /* start the new group */
839 groupMSB
=inLine
>>GROUP_SHIFT
;
840 outLine
=(inLine
&~GROUP_MASK
)-1;
843 /* write empty lines between the previous line in the group and this one */
844 while(++outLine
<inLine
) {
848 /* write characters and tokens for this line */
849 appendLineLength(compressLine(line
->s
, line
->length
, &groupTop
));
852 /* finish and store the last group */
853 if(line
&& groupMSB
!=0xffff) {
854 /* finish the current group with empty lines */
855 while((++outLine
&GROUP_MASK
)!=0) {
859 /* store the group like a line */
861 if(groupTop
>GROUP_STORE_SIZE
) {
862 fprintf(stderr
, "gennames: group store overflow\n");
863 exit(U_BUFFER_OVERFLOW_ERROR
);
865 addGroup(groupMSB
, groupStore
, groupTop
);
870 printf("number of groups: %lu\n", (unsigned long)lineCount
);
875 compressLine(uint8_t *s
, int16_t length
, int16_t *pGroupTop
) {
876 int16_t start
, limit
, token
, groupTop
=*pGroupTop
;
880 /* write any "noise" characters */
881 limit
=skipNoise((char *)s
, start
, length
);
883 groupStore
[groupTop
++]=s
[start
++];
890 /* write a word, as token or directly */
891 limit
=getWord((char *)s
, start
, length
);
893 groupStore
[groupTop
++]=s
[start
++];
895 token
=findToken(s
+start
, (int16_t)(limit
-start
));
898 groupStore
[groupTop
++]=(uint8_t)(token
>>8);
900 groupStore
[groupTop
++]=(uint8_t)token
;
904 groupStore
[groupTop
++]=s
[start
++];
908 } while(start
<length
);
910 length
=(int16_t)(groupTop
-*pGroupTop
);
916 compareWords(const void *context
, const void *word1
, const void *word2
) {
917 /* reverse sort by word weight */
918 return ((Word
*)word2
)->weight
-((Word
*)word1
)->weight
;
921 /* generate output data ----------------------------------------------------- */
924 generateData(const char *dataDir
, Options
*storeOptions
) {
925 UNewDataMemory
*pData
;
926 UErrorCode errorCode
=U_ZERO_ERROR
;
927 uint16_t groupWords
[3];
928 uint32_t i
, groupTop
=lineTop
, offset
, size
,
929 tokenStringOffset
, groupsOffset
, groupStringOffset
, algNamesOffset
;
933 pData
=udata_create(dataDir
,
934 DATA_TYPE
, storeOptions
->storeNames
? DATA_NAME
: ISO_DATA_NAME
,
936 haveCopyright
? U_COPYRIGHT_STRING
: NULL
, &errorCode
);
937 if(U_FAILURE(errorCode
)) {
938 fprintf(stderr
, "gennames: unable to create data memory, error %d\n", errorCode
);
942 /* first, see how much space we need, and prepare the token strings */
943 for(i
=0; i
<tokenCount
; ++i
) {
945 if(token
!=-1 && token
!=-2) {
946 tokens
[i
]=(int16_t)(addToken(words
[token
].s
, words
[token
].length
)-groupTop
);
951 * Required padding for data swapping:
952 * The token table undergoes a permutation during data swapping when the
953 * input and output charsets are different.
954 * The token table cannot grow during swapping, so we need to make sure that
955 * the table is long enough for successful in-place permutation.
957 * We simply round up tokenCount to the next multiple of 256 to account for
958 * all possible permutations.
960 * An optimization is possible if we only ever swap between ASCII and EBCDIC:
962 * If tokenCount>256, then a semicolon (NAME_SEPARATOR_CHAR) is used
963 * and will be swapped between ASCII and EBCDIC between
964 * positions 0x3b (ASCII semicolon) and 0x5e (EBCDIC semicolon).
965 * This should be the only -1 entry in tokens[256..511] on which the data
966 * swapper bases its trail byte permutation map (trailMap[]).
968 * It would be sufficient to increase tokenCount so that its lower 8 bits
969 * are at least 0x5e+1 to make room for swapping between the two semicolons.
970 * For values higher than 0x5e, the trail byte permutation map (trailMap[])
971 * should always be an identity map, where we do not need additional room.
974 tokenCount
=(tokenCount
+0xff)&~0xff;
975 if(!beQuiet
&& i
<tokenCount
) {
976 printf("number of tokens[] padding entries for data swapping: %lu\n", (unsigned long)(tokenCount
-i
));
978 for(; i
<tokenCount
; ++i
) {
979 if((i
&0xff)==NAME_SEPARATOR_CHAR
) {
980 tokens
[i
]=-1; /* do not use NAME_SEPARATOR_CHAR as a second token byte */
982 tokens
[i
]=0; /* unused token for padding */
987 * Calculate the total size in bytes of the data including:
988 * - the offset to the token strings, uint32_t (4)
989 * - the offset to the group table, uint32_t (4)
990 * - the offset to the group strings, uint32_t (4)
991 * - the offset to the algorithmic names, uint32_t (4)
993 * - the number of tokens, uint16_t (2)
994 * - the token table, uint16_t[tokenCount] (2*tokenCount)
996 * - the token strings, each zero-terminated (tokenSize=(lineTop-groupTop)), 2-padded
998 * - the number of groups, uint16_t (2)
999 * - the group table, { uint16_t groupMSB, uint16_t offsetHigh, uint16_t offsetLow }[6*groupCount]
1001 * - the group strings (groupTop-groupBottom), 4-padded
1003 * - the size of the data for the algorithmic names
1005 tokenStringOffset
=4+4+4+4+2+2*tokenCount
;
1006 groupsOffset
=(tokenStringOffset
+(lineTop
-groupTop
)+1)&~1;
1007 groupStringOffset
=groupsOffset
+2+6*lineCount
;
1008 algNamesOffset
=(groupStringOffset
+(groupTop
-groupBottom
)+3)&~3;
1010 offset
=generateAlgorithmicData(NULL
, storeOptions
);
1011 size
=algNamesOffset
+offset
;
1014 printf("size of the Unicode Names data:\n"
1015 "total data length %lu, token strings %lu, compressed strings %lu, algorithmic names %lu\n",
1016 (unsigned long)size
, (unsigned long)(lineTop
-groupTop
),
1017 (unsigned long)(groupTop
-groupBottom
), (unsigned long)offset
);
1020 /* write the data to the file */
1022 udata_write32(pData
, tokenStringOffset
);
1023 udata_write32(pData
, groupsOffset
);
1024 udata_write32(pData
, groupStringOffset
);
1025 udata_write32(pData
, algNamesOffset
);
1028 udata_write16(pData
, (uint16_t)tokenCount
);
1029 udata_writeBlock(pData
, tokens
, 2*tokenCount
);
1032 udata_writeBlock(pData
, stringStore
+groupTop
, lineTop
-groupTop
);
1033 if((lineTop
-groupTop
)&1) {
1035 udata_writePadding(pData
, 1);
1039 udata_write16(pData
, (uint16_t)lineCount
);
1040 for(i
=0; i
<lineCount
; ++i
) {
1042 groupWords
[0]=(uint16_t)lines
[i
].code
;
1045 offset
= (uint32_t)((lines
[i
].s
- stringStore
)-groupBottom
);
1046 groupWords
[1]=(uint16_t)(offset
>>16);
1047 groupWords
[2]=(uint16_t)(offset
);
1048 udata_writeBlock(pData
, groupWords
, 6);
1052 udata_writeBlock(pData
, stringStore
+groupBottom
, groupTop
-groupBottom
);
1054 /* 4-align the algorithmic names data */
1055 udata_writePadding(pData
, algNamesOffset
-(groupStringOffset
+(groupTop
-groupBottom
)));
1057 generateAlgorithmicData(pData
, storeOptions
);
1060 dataLength
=udata_finish(pData
, &errorCode
);
1061 if(U_FAILURE(errorCode
)) {
1062 fprintf(stderr
, "gennames: error %d writing the output file\n", errorCode
);
1066 if(dataLength
!=(long)size
) {
1067 fprintf(stderr
, "gennames: data length %ld != calculated size %lu\n",
1068 dataLength
, (unsigned long)size
);
1069 exit(U_INTERNAL_PROGRAM_ERROR
);
1073 /* the structure for algorithmic names needs to be 4-aligned */
1074 typedef struct AlgorithmicRange
{
1075 uint32_t rangeStart
, rangeEnd
;
1076 uint8_t algorithmType
, algorithmVariant
;
1081 generateAlgorithmicData(UNewDataMemory
*pData
, Options
*storeOptions
) {
1082 static char prefix
[] = "CJK UNIFIED IDEOGRAPH-";
1083 # define PREFIX_LENGTH 23
1084 # define PREFIX_LENGTH_4 24
1085 uint32_t countAlgRanges
;
1087 static AlgorithmicRange cjkExtA
={
1090 sizeof(AlgorithmicRange
)+PREFIX_LENGTH_4
1092 static AlgorithmicRange cjk
={
1095 sizeof(AlgorithmicRange
)+PREFIX_LENGTH_4
1097 static AlgorithmicRange cjkExtB
={
1100 sizeof(AlgorithmicRange
)+PREFIX_LENGTH_4
1104 "HANGUL SYLLABLE \0"
1106 "G\0GG\0N\0D\0DD\0R\0M\0B\0BB\0"
1107 "S\0SS\0\0J\0JJ\0C\0K\0T\0P\0H\0"
1109 "A\0AE\0YA\0YAE\0EO\0E\0YEO\0YE\0O\0"
1110 "WA\0WAE\0OE\0YO\0U\0WEO\0WE\0WI\0"
1113 "\0G\0GG\0GS\0N\0NJ\0NH\0D\0L\0LG\0LM\0"
1114 "LB\0LS\0LT\0LP\0LH\0M\0B\0BS\0"
1115 "S\0SS\0NG\0J\0C\0K\0T\0P\0H"
1118 static AlgorithmicRange hangul
={
1121 sizeof(AlgorithmicRange
)+6+sizeof(jamo
)
1124 /* modulo factors, maximum 8 */
1125 /* 3 factors: 19, 21, 28, most-to-least-significant */
1126 static uint16_t hangulFactors
[3]={
1134 if(ucdVersion
>=UNI_5_1
) {
1135 /* Unicode 5.1 and up has a longer CJK Unihan range than before */
1136 cjk
.rangeEnd
=0x9FC3;
1137 } else if(ucdVersion
>=UNI_4_1
) {
1138 /* Unicode 4.1 and up has a longer CJK Unihan range than before */
1139 cjk
.rangeEnd
=0x9FBB;
1142 /* number of ranges of algorithmic names */
1143 if(!storeOptions
->storeNames
) {
1145 } else if(ucdVersion
>=UNI_3_1
) {
1146 /* Unicode 3.1 and up has 4 ranges including CJK Extension B */
1148 } else if(ucdVersion
>=UNI_3_0
) {
1149 /* Unicode 3.0 has 3 ranges including CJK Extension A */
1152 /* Unicode 2.0 has 2 ranges including Hangul and CJK Unihan */
1157 udata_write32(pData
, countAlgRanges
);
1161 if(countAlgRanges
==0) {
1167 * uint32_t rangeStart
1169 * uint8_t algorithmType
1170 * uint8_t algorithmVariant
1171 * uint16_t size of range data
1172 * uint8_t[size] data
1175 /* range 0: cjk extension a */
1176 if(countAlgRanges
>=3) {
1178 udata_writeBlock(pData
, &cjkExtA
, sizeof(AlgorithmicRange
));
1179 udata_writeString(pData
, prefix
, PREFIX_LENGTH
);
1180 if(PREFIX_LENGTH
<PREFIX_LENGTH_4
) {
1181 udata_writePadding(pData
, PREFIX_LENGTH_4
-PREFIX_LENGTH
);
1184 size
+=sizeof(AlgorithmicRange
)+PREFIX_LENGTH_4
;
1190 udata_writeBlock(pData
, &cjk
, sizeof(AlgorithmicRange
));
1191 udata_writeString(pData
, prefix
, PREFIX_LENGTH
);
1192 if(PREFIX_LENGTH
<PREFIX_LENGTH_4
) {
1193 udata_writePadding(pData
, PREFIX_LENGTH_4
-PREFIX_LENGTH
);
1196 size
+=sizeof(AlgorithmicRange
)+PREFIX_LENGTH_4
;
1199 /* range 2: hangul syllables */
1201 udata_writeBlock(pData
, &hangul
, sizeof(AlgorithmicRange
));
1202 udata_writeBlock(pData
, hangulFactors
, 6);
1203 udata_writeString(pData
, jamo
, sizeof(jamo
));
1205 size
+=sizeof(AlgorithmicRange
)+6+sizeof(jamo
);
1208 /* range 3: cjk extension b */
1209 if(countAlgRanges
>=4) {
1211 udata_writeBlock(pData
, &cjkExtB
, sizeof(AlgorithmicRange
));
1212 udata_writeString(pData
, prefix
, PREFIX_LENGTH
);
1213 if(PREFIX_LENGTH
<PREFIX_LENGTH_4
) {
1214 udata_writePadding(pData
, PREFIX_LENGTH_4
-PREFIX_LENGTH
);
1217 size
+=sizeof(AlgorithmicRange
)+PREFIX_LENGTH_4
;
1224 /* helpers ------------------------------------------------------------------ */
1227 findToken(uint8_t *s
, int16_t length
) {
1230 for(i
=0; i
<(int16_t)tokenCount
; ++i
) {
1232 if(token
>=0 && length
==words
[token
].length
&& 0==uprv_memcmp(s
, words
[token
].s
, length
)) {
1241 findWord(char *s
, int16_t length
) {
1244 for(i
=0; i
<wordCount
; ++i
) {
1245 if(length
==words
[i
].length
&& 0==uprv_memcmp(s
, words
[i
].s
, length
)) {
1254 addWord(char *s
, int16_t length
) {
1255 uint8_t *stringStart
;
1258 if(wordCount
==MAX_WORD_COUNT
) {
1259 fprintf(stderr
, "gennames: too many words\n");
1260 exit(U_BUFFER_OVERFLOW_ERROR
);
1263 stringStart
=allocWord(length
);
1264 uprv_memcpy(stringStart
, s
, length
);
1266 word
=words
+wordCount
;
1269 * Initialize the weight with the costs for this token:
1270 * a zero-terminated string and a 16-bit offset.
1272 word
->weight
=-(length
+1+2);
1274 word
->length
=length
;
1275 word
->s
=stringStart
;
1283 countWord(Word
*word
) {
1284 /* add to the weight the savings: the length of the word minus 1 byte for the token */
1285 word
->weight
+=word
->length
-1;
1290 addLine(uint32_t code
, char *names
[], int16_t lengths
[], int16_t count
) {
1291 uint8_t *stringStart
;
1295 if(lineCount
==MAX_LINE_COUNT
) {
1296 fprintf(stderr
, "gennames: too many lines\n");
1297 exit(U_BUFFER_OVERFLOW_ERROR
);
1300 /* find the last non-empty name */
1301 while(count
>0 && lengths
[count
-1]==0) {
1305 return; /* should not occur: caller should not have called */
1308 /* there will be (count-1) separator characters */
1312 /* add lengths of strings */
1314 length
+=lengths
[--i
];
1317 /* allocate line memory */
1318 stringStart
=allocLine(length
);
1320 /* copy all strings into the line memory */
1321 length
=0; /* number of chars copied so far */
1322 for(i
=0; i
<count
; ++i
) {
1324 stringStart
[length
++]=NAME_SEPARATOR_CHAR
;
1327 uprv_memcpy(stringStart
+length
, names
[i
], lengths
[i
]);
1332 line
=lines
+lineCount
;
1335 line
->length
=length
;
1336 line
->s
=stringStart
;
1340 /* prevent a character value that is actually in a name from becoming a token */
1342 tokens
[stringStart
[--length
]]=-1;
1347 addGroup(uint32_t groupMSB
, uint8_t *strings
, int16_t length
) {
1348 uint8_t *stringStart
;
1351 if(lineCount
==MAX_LINE_COUNT
) {
1352 fprintf(stderr
, "gennames: too many groups\n");
1353 exit(U_BUFFER_OVERFLOW_ERROR
);
1356 /* store the line lengths first, then the strings */
1357 lineLengthsTop
=(lineLengthsTop
+1)/2;
1358 stringStart
=allocLine(lineLengthsTop
+length
);
1359 uprv_memcpy(stringStart
, lineLengths
, lineLengthsTop
);
1360 uprv_memcpy(stringStart
+lineLengthsTop
, strings
, length
);
1362 line
=lines
+lineCount
;
1364 line
->code
=groupMSB
;
1365 line
->length
=length
;
1366 line
->s
=stringStart
;
1372 addToken(uint8_t *s
, int16_t length
) {
1373 uint8_t *stringStart
;
1375 stringStart
=allocLine(length
+1);
1376 uprv_memcpy(stringStart
, s
, length
);
1377 stringStart
[length
]=0;
1379 return (uint32_t)(stringStart
- stringStore
);
1383 appendLineLength(int16_t length
) {
1385 fprintf(stderr
, "gennames: compressed line too long\n");
1386 exit(U_BUFFER_OVERFLOW_ERROR
);
1390 appendLineLengthNibble((uint8_t)((length
>>4)|12));
1392 appendLineLengthNibble((uint8_t)length
);
1396 appendLineLengthNibble(uint8_t nibble
) {
1397 if((lineLengthsTop
&1)==0) {
1398 lineLengths
[lineLengthsTop
/2]=(uint8_t)(nibble
<<4);
1400 lineLengths
[lineLengthsTop
/2]|=nibble
&0xf;
1406 allocLine(int32_t length
) {
1407 uint32_t top
=lineTop
+length
;
1410 if(top
>wordBottom
) {
1411 fprintf(stderr
, "gennames: out of memory\n");
1412 exit(U_MEMORY_ALLOCATION_ERROR
);
1414 p
=stringStore
+lineTop
;
1420 allocWord(uint32_t length
) {
1421 uint32_t bottom
=wordBottom
-length
;
1423 if(lineTop
>bottom
) {
1424 fprintf(stderr
, "gennames: out of memory\n");
1425 exit(U_MEMORY_ALLOCATION_ERROR
);
1428 return stringStore
+bottom
;
1432 * Hey, Emacs, please set the following:
1435 * indent-tabs-mode: nil