2 *******************************************************************************
4 * Copyright (C) 2004-2006, International Business Machines
5 * Corporation and others. All Rights Reserved.
7 *******************************************************************************
10 * tab size: 8 (not used)
13 * created on: 2004dec30
14 * created by: Markus W. Scherer
16 * This program reads several of the Unicode character database text files,
17 * parses them, and extracts the bidi/shaping properties for each character.
18 * It then writes a binary file containing the properties
19 * that is designed to be used directly for random-access to
20 * the properties of each Unicode character.
24 #include "unicode/utypes.h"
25 #include "unicode/uchar.h"
26 #include "unicode/putil.h"
27 #include "unicode/uclean.h"
35 #include "ubidi_props.h"
38 #define LENGTHOF(array) (sizeof(array)/sizeof((array)[0]))
40 /* data --------------------------------------------------------------------- */
44 UBool beVerbose
=FALSE
, haveCopyright
=TRUE
;
46 /* prototypes --------------------------------------------------------------- */
49 isToken(const char *token
, const char *s
);
52 parseBidiMirroring(const char *filename
, UErrorCode
*pErrorCode
);
55 parseDB(const char *filename
, UErrorCode
*pErrorCode
);
57 /* miscellaneous ------------------------------------------------------------ */
59 /* TODO: more common code, move functions to uparse.h|c */
62 trimTerminateField(char *s
, char *limit
) {
63 /* trim leading whitespace */
64 s
=(char *)u_skipWhitespace(s
);
66 /* trim trailing whitespace */
67 while(s
<limit
&& (*(limit
-1)==' ' || *(limit
-1)=='\t')) {
76 parseTwoFieldFile(char *filename
, char *basename
,
77 const char *ucdFile
, const char *suffix
,
79 UErrorCode
*pErrorCode
) {
82 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)) {
86 writeUCDFilename(basename
, ucdFile
, suffix
);
88 u_parseDelimitedFile(filename
, ';', fields
, 2, lineFn
, NULL
, pErrorCode
);
89 if(U_FAILURE(*pErrorCode
)) {
90 fprintf(stderr
, "error parsing %s.txt: %s\n", ucdFile
, u_errorName(*pErrorCode
));
94 static void U_CALLCONV
95 bidiClassLineFn(void *context
,
96 char *fields
[][2], int32_t fieldCount
,
97 UErrorCode
*pErrorCode
);
99 /* parse files with single enumerated properties ---------------------------- */
101 /* TODO: more common code, move functions to uparse.h|c */
104 const char *ucdFile
, *propName
;
106 int32_t vecWord
, vecShift
;
109 typedef struct SingleEnum SingleEnum
;
112 parseSingleEnumFile(char *filename
, char *basename
, const char *suffix
,
113 const SingleEnum
*sen
,
114 UErrorCode
*pErrorCode
);
116 static const SingleEnum jtSingleEnum
={
117 "DerivedJoiningType", "joining type",
119 0, UBIDI_JT_SHIFT
, UBIDI_JT_MASK
122 static const SingleEnum jgSingleEnum
={
123 "DerivedJoiningGroup", "joining group",
125 1, 0, 0xff /* column 1 bits 7..0 */
128 static void U_CALLCONV
129 singleEnumLineFn(void *context
,
130 char *fields
[][2], int32_t fieldCount
,
131 UErrorCode
*pErrorCode
) {
132 const SingleEnum
*sen
;
134 uint32_t start
, limit
, uv
;
137 sen
=(const SingleEnum
*)context
;
139 u_parseCodePointRange(fields
[0][0], &start
, &limit
, pErrorCode
);
140 if(U_FAILURE(*pErrorCode
)) {
141 fprintf(stderr
, "genbidi: syntax error in %s.txt field 0 at %s\n", sen
->ucdFile
, fields
[0][0]);
146 /* parse property alias */
147 s
=trimTerminateField(fields
[1][0], fields
[1][1]);
148 value
=u_getPropertyValueEnum(sen
->prop
, s
);
150 if(sen
->prop
==UCHAR_BLOCK
) {
151 if(isToken("Greek", s
)) {
152 value
=UBLOCK_GREEK
; /* Unicode 3.2 renames this to "Greek and Coptic" */
153 } else if(isToken("Combining Marks for Symbols", s
)) {
154 value
=UBLOCK_COMBINING_MARKS_FOR_SYMBOLS
; /* Unicode 3.2 renames this to "Combining Diacritical Marks for Symbols" */
155 } else if(isToken("Private Use", s
)) {
156 value
=UBLOCK_PRIVATE_USE
; /* Unicode 3.2 renames this to "Private Use Area" */
161 fprintf(stderr
, "genbidi error: unknown %s name in %s.txt field 1 at %s\n",
162 sen
->propName
, sen
->ucdFile
, s
);
166 uv
=(uint32_t)(value
<<sen
->vecShift
);
167 if((uv
&sen
->vecMask
)!=uv
) {
168 fprintf(stderr
, "genbidi error: %s value overflow (0x%x) at %s\n",
169 sen
->propName
, (int)uv
, s
);
170 exit(U_INTERNAL_PROGRAM_ERROR
);
173 if(!upvec_setValue(pv
, start
, limit
, sen
->vecWord
, uv
, sen
->vecMask
, pErrorCode
)) {
174 fprintf(stderr
, "genbidi error: unable to set %s code: %s\n",
175 sen
->propName
, u_errorName(*pErrorCode
));
181 parseSingleEnumFile(char *filename
, char *basename
, const char *suffix
,
182 const SingleEnum
*sen
,
183 UErrorCode
*pErrorCode
) {
186 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)) {
190 writeUCDFilename(basename
, sen
->ucdFile
, suffix
);
192 u_parseDelimitedFile(filename
, ';', fields
, 2, singleEnumLineFn
, (void *)sen
, pErrorCode
);
193 if(U_FAILURE(*pErrorCode
)) {
194 fprintf(stderr
, "error parsing %s.txt: %s\n", sen
->ucdFile
, u_errorName(*pErrorCode
));
198 /* parse files with multiple binary properties ------------------------------ */
200 /* TODO: more common code, move functions to uparse.h|c */
202 /* TODO: similar to genbidi/props2.c but not the same; same as in gencase/gencase.c */
205 const char *propName
;
207 uint32_t vecValue
, vecMask
;
209 typedef struct Binary Binary
;
213 const Binary
*binaries
;
214 int32_t binariesCount
;
216 typedef struct Binaries Binaries
;
220 { "Bidi_Control", 0, U_MASK(UBIDI_BIDI_CONTROL_SHIFT
), U_MASK(UBIDI_BIDI_CONTROL_SHIFT
) },
221 { "Join_Control", 0, U_MASK(UBIDI_JOIN_CONTROL_SHIFT
), U_MASK(UBIDI_JOIN_CONTROL_SHIFT
) }
224 static const Binaries
226 "PropList", propListNames
, LENGTHOF(propListNames
)
229 static void U_CALLCONV
230 binariesLineFn(void *context
,
231 char *fields
[][2], int32_t fieldCount
,
232 UErrorCode
*pErrorCode
) {
235 uint32_t start
, limit
;
238 bin
=(const Binaries
*)context
;
240 u_parseCodePointRange(fields
[0][0], &start
, &limit
, pErrorCode
);
241 if(U_FAILURE(*pErrorCode
)) {
242 fprintf(stderr
, "genbidi: syntax error in %s.txt field 0 at %s\n", bin
->ucdFile
, fields
[0][0]);
247 /* parse binary property name */
248 s
=(char *)u_skipWhitespace(fields
[1][0]);
250 if(i
==bin
->binariesCount
) {
251 /* ignore unrecognized properties */
254 if(isToken(bin
->binaries
[i
].propName
, s
)) {
259 if(bin
->binaries
[i
].vecMask
==0) {
260 fprintf(stderr
, "genbidi error: mask value %d==0 for %s %s\n",
261 (int)bin
->binaries
[i
].vecMask
, bin
->ucdFile
, bin
->binaries
[i
].propName
);
262 exit(U_INTERNAL_PROGRAM_ERROR
);
265 if(!upvec_setValue(pv
, start
, limit
, bin
->binaries
[i
].vecWord
, bin
->binaries
[i
].vecValue
, bin
->binaries
[i
].vecMask
, pErrorCode
)) {
266 fprintf(stderr
, "genbidi error: unable to set %s, code: %s\n",
267 bin
->binaries
[i
].propName
, u_errorName(*pErrorCode
));
273 parseBinariesFile(char *filename
, char *basename
, const char *suffix
,
275 UErrorCode
*pErrorCode
) {
278 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)) {
282 writeUCDFilename(basename
, bin
->ucdFile
, suffix
);
284 u_parseDelimitedFile(filename
, ';', fields
, 2, binariesLineFn
, (void *)bin
, pErrorCode
);
285 if(U_FAILURE(*pErrorCode
)) {
286 fprintf(stderr
, "error parsing %s.txt: %s\n", bin
->ucdFile
, u_errorName(*pErrorCode
));
290 /* -------------------------------------------------------------------------- */
304 /* Keep these values in sync with the above enums */
305 static UOption options
[]={
307 UOPTION_HELP_QUESTION_MARK
,
312 UOPTION_DEF("unicode", 'u', UOPT_REQUIRES_ARG
),
314 UOPTION_DEF("csource", 'C', UOPT_NO_ARG
)
318 main(int argc
, char* argv
[]) {
320 const char *srcDir
=NULL
, *destDir
=NULL
, *suffix
=NULL
;
322 UErrorCode errorCode
=U_ZERO_ERROR
;
324 U_MAIN_INIT_ARGS(argc
, argv
);
326 /* preset then read command line options */
327 options
[DESTDIR
].value
=u_getDataDirectory();
328 options
[SOURCEDIR
].value
="";
329 options
[UNICODE_VERSION
].value
="";
330 options
[ICUDATADIR
].value
=u_getDataDirectory();
331 argc
=u_parseArgs(argc
, argv
, sizeof(options
)/sizeof(options
[0]), options
);
333 /* error handling, printing usage message */
336 "error in command line argument \"%s\"\n",
339 if(argc
<0 || options
[HELP_H
].doesOccur
|| options
[HELP_QUESTION_MARK
].doesOccur
) {
341 * Broken into chunks because the C89 standard says the minimum
342 * required supported string length is 509 bytes.
345 "Usage: %s [-options] [suffix]\n"
347 "read the UnicodeData.txt file and other Unicode properties files and\n"
348 "create a binary file " UBIDI_DATA_NAME
"." UBIDI_DATA_TYPE
" with the bidi/shaping properties\n"
353 "\t-h or -? or --help this usage text\n"
354 "\t-v or --verbose verbose output\n"
355 "\t-c or --copyright include a copyright notice\n"
356 "\t-u or --unicode Unicode version, followed by the version like 3.0.0\n"
357 "\t-C or --csource generate a .c source file rather than the .icu binary\n");
359 "\t-d or --destdir destination directory, followed by the path\n"
360 "\t-s or --sourcedir source directory, followed by the path\n"
361 "\t-i or --icudatadir directory for locating any needed intermediate data files,\n"
362 "\t followed by path, defaults to %s\n"
363 "\tsuffix suffix that is to be appended with a '-'\n"
364 "\t to the source file basenames before opening;\n"
365 "\t 'genbidi new' will read UnicodeData-new.txt etc.\n",
366 u_getDataDirectory());
367 return argc
<0 ? U_ILLEGAL_ARGUMENT_ERROR
: U_ZERO_ERROR
;
370 /* get the options values */
371 beVerbose
=options
[VERBOSE
].doesOccur
;
372 haveCopyright
=options
[COPYRIGHT
].doesOccur
;
373 srcDir
=options
[SOURCEDIR
].value
;
374 destDir
=options
[DESTDIR
].value
;
382 if(options
[UNICODE_VERSION
].doesOccur
) {
383 setUnicodeVersion(options
[UNICODE_VERSION
].value
);
385 /* else use the default dataVersion in store.c */
387 if (options
[ICUDATADIR
].doesOccur
) {
388 u_setDataDirectory(options
[ICUDATADIR
].value
);
391 /* prepare the filename beginning with the source dir */
392 uprv_strcpy(filename
, srcDir
);
393 basename
=filename
+uprv_strlen(filename
);
394 if(basename
>filename
&& *(basename
-1)!=U_FILE_SEP_CHAR
) {
395 *basename
++=U_FILE_SEP_CHAR
;
399 pv
=upvec_open(2, 10000);
401 /* process BidiMirroring.txt */
402 writeUCDFilename(basename
, "BidiMirroring", suffix
);
403 parseBidiMirroring(filename
, &errorCode
);
405 /* process additional properties files */
408 parseBinariesFile(filename
, basename
, suffix
, &propListBinaries
, &errorCode
);
410 parseSingleEnumFile(filename
, basename
, suffix
, &jtSingleEnum
, &errorCode
);
412 parseSingleEnumFile(filename
, basename
, suffix
, &jgSingleEnum
, &errorCode
);
414 /* process UnicodeData.txt */
415 writeUCDFilename(basename
, "UnicodeData", suffix
);
416 parseDB(filename
, &errorCode
);
418 /* set proper bidi class for unassigned code points (Cn) */
419 parseTwoFieldFile(filename
, basename
, "DerivedBidiClass", suffix
, bidiClassLineFn
, &errorCode
);
421 /* process parsed data */
422 if(U_SUCCESS(errorCode
)) {
423 /* write the properties data file */
424 generateData(destDir
, options
[CSOURCE
].doesOccur
);
432 writeUCDFilename(char *basename
, const char *filename
, const char *suffix
) {
433 int32_t length
=(int32_t)uprv_strlen(filename
);
434 uprv_strcpy(basename
, filename
);
436 basename
[length
++]='-';
437 uprv_strcpy(basename
+length
, suffix
);
438 length
+=(int32_t)uprv_strlen(suffix
);
440 uprv_strcpy(basename
+length
, ".txt");
443 /* TODO: move to toolutil */
445 isToken(const char *token
, const char *s
) {
449 s
=u_skipWhitespace(s
);
456 z
=u_skipWhitespace(s
+j
);
457 if(*z
==';' || *z
==0) {
468 /* parser for BidiMirroring.txt --------------------------------------------- */
470 static void U_CALLCONV
471 mirrorLineFn(void *context
,
472 char *fields
[][2], int32_t fieldCount
,
473 UErrorCode
*pErrorCode
) {
477 src
=(UChar32
)uprv_strtoul(fields
[0][0], &end
, 16);
478 if(end
<=fields
[0][0] || end
!=fields
[0][1]) {
479 fprintf(stderr
, "genbidi: syntax error in BidiMirroring.txt field 0 at %s\n", fields
[0][0]);
480 *pErrorCode
=U_PARSE_ERROR
;
484 mirror
=(UChar32
)uprv_strtoul(fields
[1][0], &end
, 16);
485 if(end
<=fields
[1][0] || end
!=fields
[1][1]) {
486 fprintf(stderr
, "genbidi: syntax error in BidiMirroring.txt field 1 at %s\n", fields
[1][0]);
487 *pErrorCode
=U_PARSE_ERROR
;
491 addMirror(src
, mirror
);
495 parseBidiMirroring(const char *filename
, UErrorCode
*pErrorCode
) {
498 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)) {
502 u_parseDelimitedFile(filename
, ';', fields
, 2, mirrorLineFn
, NULL
, pErrorCode
);
505 /* parser for UnicodeData.txt ----------------------------------------------- */
507 static void U_CALLCONV
508 unicodeDataLineFn(void *context
,
509 char *fields
[][2], int32_t fieldCount
,
510 UErrorCode
*pErrorCode
) {
512 UErrorCode errorCode
;
515 errorCode
=U_ZERO_ERROR
;
517 /* get the character code, field 0 */
518 c
=(UChar32
)uprv_strtoul(fields
[0][0], &end
, 16);
519 if(end
<=fields
[0][0] || end
!=fields
[0][1]) {
520 fprintf(stderr
, "genbidi: syntax error in field 0 at %s\n", fields
[0][0]);
521 *pErrorCode
=U_PARSE_ERROR
;
525 /* get Mirrored flag, field 9 */
526 if(*fields
[9][0]=='Y') {
527 if(!upvec_setValue(pv
, c
, c
+1, 0, U_MASK(UBIDI_IS_MIRRORED_SHIFT
), U_MASK(UBIDI_IS_MIRRORED_SHIFT
), &errorCode
)) {
528 fprintf(stderr
, "genbidi error: unable to set 'is mirrored' for U+%04lx, code: %s\n",
529 (long)c
, u_errorName(errorCode
));
532 } else if(fields
[9][1]-fields
[9][0]!=1 || *fields
[9][0]!='N') {
533 fprintf(stderr
, "genbidi: syntax error in field 9 at U+%04lx\n",
535 *pErrorCode
=U_PARSE_ERROR
;
541 parseDB(const char *filename
, UErrorCode
*pErrorCode
) {
542 /* default Bidi classes for unassigned code points */
543 static const UChar32 defaultBidi
[][3]={ /* { start, end, class } */
544 /* R: U+0590..U+05FF, U+07C0..U+08FF, U+FB1D..U+FB4F, U+10800..U+10FFF */
545 { 0x0590, 0x05FF, U_RIGHT_TO_LEFT
},
546 { 0x07C0, 0x08FF, U_RIGHT_TO_LEFT
},
547 { 0xFB1D, 0xFB4F, U_RIGHT_TO_LEFT
},
548 { 0x10800, 0x10FFF, U_RIGHT_TO_LEFT
},
550 /* AL: U+0600..U+07BF, U+FB50..U+FDCF, U+FDF0..U+FDFF, U+FE70..U+FEFE */
551 { 0x0600, 0x07BF, U_RIGHT_TO_LEFT_ARABIC
},
552 { 0xFB50, 0xFDCF, U_RIGHT_TO_LEFT_ARABIC
},
553 { 0xFDF0, 0xFDFF, U_RIGHT_TO_LEFT_ARABIC
},
554 { 0xFE70, 0xFEFE, U_RIGHT_TO_LEFT_ARABIC
}
563 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)) {
568 * Set default Bidi classes for unassigned code points.
569 * See the documentation for Bidi_Class in UCD.html in the Unicode data.
570 * http://www.unicode.org/Public/
572 * Starting with Unicode 5.0, DerivedBidiClass.txt should (re)set
573 * the Bidi_Class values for all code points including unassigned ones
574 * and including L values for these.
575 * This code becomes unnecessary but harmless. Leave it for now in case
576 * someone uses genbidi on pre-Unicode 5.0 data.
578 for(i
=0; i
<LENGTHOF(defaultBidi
); ++i
) {
579 start
=defaultBidi
[i
][0];
580 end
=defaultBidi
[i
][1];
581 if(!upvec_setValue(pv
, start
, end
+1, 0, (uint32_t)defaultBidi
[i
][2], UBIDI_CLASS_MASK
, pErrorCode
)) {
582 fprintf(stderr
, "genbidi error: unable to set default bidi class for U+%04lx..U+%04lx, code: %s\n",
583 (long)start
, (long)end
, u_errorName(*pErrorCode
));
588 u_parseDelimitedFile(filename
, ';', fields
, 15, unicodeDataLineFn
, NULL
, pErrorCode
);
590 if(U_FAILURE(*pErrorCode
)) {
595 /* DerivedBidiClass.txt ----------------------------------------------------- */
597 static void U_CALLCONV
598 bidiClassLineFn(void *context
,
599 char *fields
[][2], int32_t fieldCount
,
600 UErrorCode
*pErrorCode
) {
602 uint32_t start
, limit
, value
;
604 /* get the code point range */
605 u_parseCodePointRange(fields
[0][0], &start
, &limit
, pErrorCode
);
606 if(U_FAILURE(*pErrorCode
)) {
607 fprintf(stderr
, "genbidi: syntax error in DerivedBidiClass.txt field 0 at %s\n", fields
[0][0]);
612 /* parse bidi class */
613 s
=trimTerminateField(fields
[1][0], fields
[1][1]);
614 value
=u_getPropertyValueEnum(UCHAR_BIDI_CLASS
, s
);
615 if((int32_t)value
<0) {
616 fprintf(stderr
, "genbidi error: unknown bidi class in DerivedBidiClass.txt field 1 at %s\n", s
);
620 if(!upvec_setValue(pv
, start
, limit
, 0, value
, UBIDI_CLASS_MASK
, pErrorCode
)) {
621 fprintf(stderr
, "genbidi error: unable to set derived bidi class for U+%04x..U+%04x - %s\n",
622 (int)start
, (int)limit
-1, u_errorName(*pErrorCode
));
628 * Hey, Emacs, please set the following:
631 * indent-tabs-mode: nil