2 *******************************************************************************
4 * Copyright (C) 1999-2005, International Business Machines
5 * Corporation and others. All Rights Reserved.
7 *******************************************************************************
8 * file name: genprops.c
10 * tab size: 8 (not used)
13 * created on: 1999dec08
14 * created by: Markus W. Scherer
16 * This program reads several of the Unicode character database text files,
17 * parses them, and extracts most of the properties for each character.
18 * It then writes a binary file containing the properties
19 * that is designed to be used directly for random-access to
20 * the properties of each Unicode character.
25 #include "unicode/utypes.h"
26 #include "unicode/uchar.h"
27 #include "unicode/putil.h"
28 #include "unicode/uclean.h"
41 #define LENGTHOF(array) (sizeof(array)/sizeof((array)[0]))
43 UBool beVerbose
=FALSE
, haveCopyright
=TRUE
;
45 /* prototypes --------------------------------------------------------------- */
48 parseDB(const char *filename
, UErrorCode
*pErrorCode
);
50 /* -------------------------------------------------------------------------- */
65 /* Keep these values in sync with the above enums */
66 static UOption options
[]={
68 UOPTION_HELP_QUESTION_MARK
,
73 UOPTION_DEF("unicode", 'u', UOPT_REQUIRES_ARG
),
75 UOPTION_DEF("csource", 'C', UOPT_NO_ARG
)
79 main(int argc
, char* argv
[]) {
81 const char *srcDir
=NULL
, *destDir
=NULL
, *suffix
=NULL
;
83 UErrorCode errorCode
=U_ZERO_ERROR
;
85 U_MAIN_INIT_ARGS(argc
, argv
);
87 /* preset then read command line options */
88 options
[DESTDIR
].value
=u_getDataDirectory();
89 options
[SOURCEDIR
].value
="";
90 options
[UNICODE_VERSION
].value
="";
91 options
[ICUDATADIR
].value
=u_getDataDirectory();
92 argc
=u_parseArgs(argc
, argv
, sizeof(options
)/sizeof(options
[0]), options
);
94 /* error handling, printing usage message */
97 "error in command line argument \"%s\"\n",
100 if(argc
<0 || options
[HELP_H
].doesOccur
|| options
[HELP_QUESTION_MARK
].doesOccur
) {
102 * Broken into chunks because the C89 standard says the minimum
103 * required supported string length is 509 bytes.
106 "Usage: %s [-options] [suffix]\n"
108 "read the UnicodeData.txt file and other Unicode properties files and\n"
109 "create a binary file " DATA_NAME
"." DATA_TYPE
" with the character properties\n"
114 "\t-h or -? or --help this usage text\n"
115 "\t-v or --verbose verbose output\n"
116 "\t-c or --copyright include a copyright notice\n"
117 "\t-u or --unicode Unicode version, followed by the version like 3.0.0\n"
118 "\t-C or --csource generate a .c source file rather than the .icu binary\n");
120 "\t-d or --destdir destination directory, followed by the path\n"
121 "\t-s or --sourcedir source directory, followed by the path\n"
122 "\t-i or --icudatadir directory for locating any needed intermediate data files,\n"
123 "\t followed by path, defaults to %s\n"
124 "\tsuffix suffix that is to be appended with a '-'\n"
125 "\t to the source file basenames before opening;\n"
126 "\t 'genprops new' will read UnicodeData-new.txt etc.\n",
127 u_getDataDirectory());
128 return argc
<0 ? U_ILLEGAL_ARGUMENT_ERROR
: U_ZERO_ERROR
;
131 /* get the options values */
132 beVerbose
=options
[VERBOSE
].doesOccur
;
133 haveCopyright
=options
[COPYRIGHT
].doesOccur
;
134 srcDir
=options
[SOURCEDIR
].value
;
135 destDir
=options
[DESTDIR
].value
;
143 if(options
[UNICODE_VERSION
].doesOccur
) {
144 setUnicodeVersion(options
[UNICODE_VERSION
].value
);
146 /* else use the default dataVersion in store.c */
148 if (options
[ICUDATADIR
].doesOccur
) {
149 u_setDataDirectory(options
[ICUDATADIR
].value
);
152 /* prepare the filename beginning with the source dir */
153 uprv_strcpy(filename
, srcDir
);
154 basename
=filename
+uprv_strlen(filename
);
155 if(basename
>filename
&& *(basename
-1)!=U_FILE_SEP_CHAR
) {
156 *basename
++=U_FILE_SEP_CHAR
;
162 /* process UnicodeData.txt */
163 writeUCDFilename(basename
, "UnicodeData", suffix
);
164 parseDB(filename
, &errorCode
);
166 /* process additional properties files */
168 generateAdditionalProperties(filename
, suffix
, &errorCode
);
170 /* process parsed data */
171 if(U_SUCCESS(errorCode
)) {
172 /* write the properties data file */
173 generateData(destDir
, options
[CSOURCE
].doesOccur
);
182 writeUCDFilename(char *basename
, const char *filename
, const char *suffix
) {
183 int32_t length
=(int32_t)uprv_strlen(filename
);
184 uprv_strcpy(basename
, filename
);
186 basename
[length
++]='-';
187 uprv_strcpy(basename
+length
, suffix
);
188 length
+=(int32_t)uprv_strlen(suffix
);
190 uprv_strcpy(basename
+length
, ".txt");
194 isToken(const char *token
, const char *s
) {
198 s
=u_skipWhitespace(s
);
205 z
=u_skipWhitespace(s
+j
);
206 if(*z
==';' || *z
==0) {
218 getTokenIndex(const char *const tokens
[], int32_t countTokens
, const char *s
) {
222 s
=u_skipWhitespace(s
);
223 for(i
=0; i
<countTokens
; ++i
) {
232 z
=u_skipWhitespace(s
+j
);
233 if(*z
==';' || *z
==0 || *z
=='#' || *z
=='\r' || *z
=='\n') {
245 /* parser for UnicodeData.txt ----------------------------------------------- */
247 /* general categories */
249 genCategoryNames
[U_CHAR_CATEGORY_COUNT
]={
251 "Lu", "Ll", "Lt", "Lm", "Lo", "Mn", "Me",
252 "Mc", "Nd", "Nl", "No",
254 "Cc", "Cf", "Co", "Cs",
255 "Pd", "Ps", "Pe", "Pc", "Po",
256 "Sm", "Sc", "Sk", "So",
261 decompositionTypeNames
[U_DT_COUNT
]={
283 uint32_t first
, last
, props
;
287 static int32_t unicodeAreaIndex
=0;
289 static void U_CALLCONV
290 unicodeDataLineFn(void *context
,
291 char *fields
[][2], int32_t fieldCount
,
292 UErrorCode
*pErrorCode
) {
295 static uint32_t prevCode
=0;
299 /* reset the properties */
300 uprv_memset(&p
, 0, sizeof(Props
));
302 /* get the character code, field 0 */
303 p
.code
=(uint32_t)uprv_strtoul(fields
[0][0], &end
, 16);
304 if(end
<=fields
[0][0] || end
!=fields
[0][1]) {
305 fprintf(stderr
, "genprops: syntax error in field 0 at %s\n", fields
[0][0]);
306 *pErrorCode
=U_PARSE_ERROR
;
310 /* get general category, field 2 */
311 i
=getTokenIndex(genCategoryNames
, U_CHAR_CATEGORY_COUNT
, fields
[2][0]);
313 p
.generalCategory
=(uint8_t)i
;
315 fprintf(stderr
, "genprops: unknown general category \"%s\" at code 0x%lx\n",
316 fields
[2][0], (unsigned long)p
.code
);
317 *pErrorCode
=U_PARSE_ERROR
;
321 /* get decomposition type, field 5 */
322 if(fields
[5][0]<fields
[5][1]) {
323 /* there is some decomposition */
324 if(*fields
[5][0]!='<') {
328 /* get compatibility type */
330 while(end
<fields
[5][1] && *end
!='>') {
334 i
=getTokenIndex(decompositionTypeNames
, U_DT_COUNT
, fields
[5][0]+1);
336 fprintf(stderr
, "genprops: unknown decomposition type \"%s\" at code 0x%lx\n",
337 fields
[5][0], (unsigned long)p
.code
);
338 *pErrorCode
=U_PARSE_ERROR
;
342 if(!upvec_setValue(pv
, p
.code
, p
.code
+1, 2, (uint32_t)i
, UPROPS_DT_MASK
, pErrorCode
)) {
343 fprintf(stderr
, "genprops error: unable to set decomposition type: %s\n", u_errorName(*pErrorCode
));
348 /* decimal digit value, field 6 */
349 if(fields
[6][0]<fields
[6][1]) {
350 value
=(uint32_t)uprv_strtoul(fields
[6][0], &end
, 10);
351 if(end
!=fields
[6][1] || value
>0x7fff) {
352 fprintf(stderr
, "genprops: syntax error in field 6 at code 0x%lx\n",
353 (unsigned long)p
.code
);
354 *pErrorCode
=U_PARSE_ERROR
;
357 p
.numericValue
=(int32_t)value
;
361 /* digit value, field 7 */
362 if(fields
[7][0]<fields
[7][1]) {
363 value
=(uint32_t)uprv_strtoul(fields
[7][0], &end
, 10);
364 if(end
!=fields
[7][1] || value
>0x7fff) {
365 fprintf(stderr
, "genprops: syntax error in field 7 at code 0x%lx\n",
366 (unsigned long)p
.code
);
367 *pErrorCode
=U_PARSE_ERROR
;
370 if(p
.numericType
==0) {
371 p
.numericValue
=(int32_t)value
;
373 } else if((int32_t)value
!=p
.numericValue
) {
374 fprintf(stderr
, "genprops error: numeric values in fields 6 & 7 different at code 0x%lx\n",
375 (unsigned long)p
.code
);
376 *pErrorCode
=U_PARSE_ERROR
;
381 /* numeric value, field 8 */
382 if(fields
[8][0]<fields
[8][1]) {
383 char *s
=fields
[8][0];
386 /* get a possible minus sign */
394 value
=(uint32_t)uprv_strtoul(s
, &end
, 10);
395 if(value
>0 && *end
=='/') {
396 /* field 8 may contain a fractional value, get the denominator */
397 if(p
.numericType
>0) {
398 fprintf(stderr
, "genprops error: numeric values in fields 6..8 different at code 0x%lx\n",
399 (unsigned long)p
.code
);
400 *pErrorCode
=U_PARSE_ERROR
;
404 p
.denominator
=(uint32_t)uprv_strtoul(end
+1, &end
, 10);
405 if(p
.denominator
==0) {
406 fprintf(stderr
, "genprops: denominator is 0 in field 8 at code 0x%lx\n",
407 (unsigned long)p
.code
);
408 *pErrorCode
=U_PARSE_ERROR
;
412 if(end
!=fields
[8][1] || value
>0x7fffffff) {
413 fprintf(stderr
, "genprops: syntax error in field 8 at code 0x%lx\n",
414 (unsigned long)p
.code
);
415 *pErrorCode
=U_PARSE_ERROR
;
419 if(p
.numericType
==0) {
421 p
.numericValue
=-(int32_t)value
;
423 p
.numericValue
=(int32_t)value
;
426 } else if((int32_t)value
!=p
.numericValue
) {
427 fprintf(stderr
, "genprops error: numeric values in fields 6..8 different at code 0x%lx\n",
428 (unsigned long)p
.code
);
429 *pErrorCode
=U_PARSE_ERROR
;
436 if(*fields
[1][0]=='<') {
437 /* first or last entry of a Unicode area */
438 size_t length
=fields
[1][1]-fields
[1][0];
441 /* name too short for an area name */
442 } else if(0==uprv_memcmp(", First>", fields
[1][1]-8, 8)) {
443 /* set the current area */
444 if(unicodeAreas
[unicodeAreaIndex
].first
==0xffffffff) {
446 unicodeAreas
[unicodeAreaIndex
].first
=p
.code
;
447 unicodeAreas
[unicodeAreaIndex
].props
=value
;
448 uprv_memcpy(unicodeAreas
[unicodeAreaIndex
].name
, fields
[1][0]+1, length
);
449 unicodeAreas
[unicodeAreaIndex
].name
[length
]=0;
451 /* error: a previous area is incomplete */
452 fprintf(stderr
, "genprops: error - area \"%s\" is incomplete\n", unicodeAreas
[unicodeAreaIndex
].name
);
453 *pErrorCode
=U_PARSE_ERROR
;
457 } else if(0==uprv_memcmp(", Last>", fields
[1][1]-7, 7)) {
458 /* check that the current area matches, and complete it with the last code point */
460 if( unicodeAreas
[unicodeAreaIndex
].props
==value
&&
461 0==uprv_memcmp(unicodeAreas
[unicodeAreaIndex
].name
, fields
[1][0]+1, length
) &&
462 unicodeAreas
[unicodeAreaIndex
].name
[length
]==0 &&
463 unicodeAreas
[unicodeAreaIndex
].first
<p
.code
465 unicodeAreas
[unicodeAreaIndex
].last
=p
.code
;
467 printf("Unicode area U+%04lx..U+%04lx \"%s\"\n",
468 (unsigned long)unicodeAreas
[unicodeAreaIndex
].first
,
469 (unsigned long)unicodeAreas
[unicodeAreaIndex
].last
,
470 unicodeAreas
[unicodeAreaIndex
].name
);
472 unicodeAreas
[++unicodeAreaIndex
].first
=0xffffffff;
474 /* error: different properties between first & last, different area name, first>=last */
475 fprintf(stderr
, "genprops: error - Last of area \"%s\" is incorrect\n", unicodeAreas
[unicodeAreaIndex
].name
);
476 *pErrorCode
=U_PARSE_ERROR
;
481 /* not an area name */
485 /* check for non-character code points */
486 if((p
.code
&0xfffe)==0xfffe || (uint32_t)(p
.code
-0xfdd0)<0x20) {
487 fprintf(stderr
, "genprops: error - properties for non-character code point U+%04lx\n",
488 (unsigned long)p
.code
);
489 *pErrorCode
=U_PARSE_ERROR
;
493 /* check that the code points (p.code) are in ascending order */
494 if(p
.code
<=prevCode
&& p
.code
>0) {
495 fprintf(stderr
, "genprops: error - UnicodeData entries out of order, U+%04lx after U+%04lx\n",
496 (unsigned long)p
.code
, (unsigned long)prevCode
);
497 *pErrorCode
=U_PARSE_ERROR
;
502 /* properties for a single code point */
503 addProps(p
.code
, value
);
506 /* set repeated properties for the areas */
511 UBool hasPlane15PUA
, hasPlane16PUA
;
512 UErrorCode errorCode
;
515 * UnicodeData.txt before 3.0.1 did not contain the PUAs on
517 * If that is the case, then we add them here, using the properties
521 hasPlane15PUA
=hasPlane16PUA
=FALSE
;
523 for(i
=0; i
<unicodeAreaIndex
; ++i
) {
524 repeatProps(unicodeAreas
[i
].first
,
525 unicodeAreas
[i
].last
,
526 unicodeAreas
[i
].props
);
527 if(unicodeAreas
[i
].first
==0xe000) {
528 puaProps
=unicodeAreas
[i
].props
;
529 } else if(unicodeAreas
[i
].first
==0xf0000) {
531 } else if(unicodeAreas
[i
].first
==0x100000) {
538 repeatProps(0xf0000, 0xffffd, puaProps
);
541 repeatProps(0x100000, 0x10fffd, puaProps
);
545 /* Hangul have canonical decompositions */
546 errorCode
=U_ZERO_ERROR
;
547 if(!upvec_setValue(pv
, 0xac00, 0xd7a4, 2, (uint32_t)U_DT_CANONICAL
, UPROPS_DT_MASK
, &errorCode
)) {
548 fprintf(stderr
, "genprops error: unable to set decomposition type: %s\n", u_errorName(errorCode
));
554 parseDB(const char *filename
, UErrorCode
*pErrorCode
) {
557 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)) {
561 /* while unicodeAreas[unicodeAreaIndex] is unused, set its first to a bogus value */
562 unicodeAreas
[0].first
=0xffffffff;
564 u_parseDelimitedFile(filename
, ';', fields
, 15, unicodeDataLineFn
, NULL
, pErrorCode
);
566 if(unicodeAreas
[unicodeAreaIndex
].first
!=0xffffffff) {
567 fprintf(stderr
, "genprops: error - the last area \"%s\" from U+%04lx is incomplete\n",
568 unicodeAreas
[unicodeAreaIndex
].name
,
569 (unsigned long)unicodeAreas
[unicodeAreaIndex
].first
);
570 *pErrorCode
=U_PARSE_ERROR
;
576 if(U_FAILURE(*pErrorCode
)) {
582 * Hey, Emacs, please set the following:
585 * indent-tabs-mode: nil