]> git.saurik.com Git - apple/icu.git/blob - icuSources/tools/genbidi/genbidi.c
ICU-400.42.tar.gz
[apple/icu.git] / icuSources / tools / genbidi / genbidi.c
1 /*
2 *******************************************************************************
3 *
4 * Copyright (C) 2004-2006, International Business Machines
5 * Corporation and others. All Rights Reserved.
6 *
7 *******************************************************************************
8 * file name: genbidi.c
9 * encoding: US-ASCII
10 * tab size: 8 (not used)
11 * indentation:4
12 *
13 * created on: 2004dec30
14 * created by: Markus W. Scherer
15 *
16 * This program reads several of the Unicode character database text files,
17 * parses them, and extracts the bidi/shaping properties for each character.
18 * It then writes a binary file containing the properties
19 * that is designed to be used directly for random-access to
20 * the properties of each Unicode character.
21 */
22
23 #include <stdio.h>
24 #include "unicode/utypes.h"
25 #include "unicode/uchar.h"
26 #include "unicode/putil.h"
27 #include "unicode/uclean.h"
28 #include "cmemory.h"
29 #include "cstring.h"
30 #include "uarrsort.h"
31 #include "unewdata.h"
32 #include "uoptions.h"
33 #include "uparse.h"
34 #include "propsvec.h"
35 #include "ubidi_props.h"
36 #include "genbidi.h"
37
38 #define LENGTHOF(array) (sizeof(array)/sizeof((array)[0]))
39
40 /* data --------------------------------------------------------------------- */
41
42 uint32_t *pv;
43
44 UBool beVerbose=FALSE, haveCopyright=TRUE;
45
46 /* prototypes --------------------------------------------------------------- */
47
48 static UBool
49 isToken(const char *token, const char *s);
50
51 static void
52 parseBidiMirroring(const char *filename, UErrorCode *pErrorCode);
53
54 static void
55 parseDB(const char *filename, UErrorCode *pErrorCode);
56
57 /* miscellaneous ------------------------------------------------------------ */
58
59 /* TODO: more common code, move functions to uparse.h|c */
60
/*
 * Trim a field in place.
 * Leading white space is skipped with u_skipWhitespace(); trailing spaces and
 * tabs before limit are dropped and a NUL terminator is written at the new end.
 *
 * @param s     start of the field
 * @param limit one past the last character of the field; must be writable
 * @return pointer to the first character after the leading white space
 */
static char *
trimTerminateField(char *s, char *limit) {
    char *first=(char *)u_skipWhitespace(s);

    /* move limit backwards over trailing spaces and tabs */
    for(; first<limit; --limit) {
        char last=*(limit-1);
        if(last!=' ' && last!='\t') {
            break;
        }
    }
    *limit=0;

    return first;
}
74
75 static void
76 parseTwoFieldFile(char *filename, char *basename,
77 const char *ucdFile, const char *suffix,
78 UParseLineFn *lineFn,
79 UErrorCode *pErrorCode) {
80 char *fields[2][2];
81
82 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
83 return;
84 }
85
86 writeUCDFilename(basename, ucdFile, suffix);
87
88 u_parseDelimitedFile(filename, ';', fields, 2, lineFn, NULL, pErrorCode);
89 if(U_FAILURE(*pErrorCode)) {
90 fprintf(stderr, "error parsing %s.txt: %s\n", ucdFile, u_errorName(*pErrorCode));
91 }
92 }
93
94 static void U_CALLCONV
95 bidiClassLineFn(void *context,
96 char *fields[][2], int32_t fieldCount,
97 UErrorCode *pErrorCode);
98
99 /* parse files with single enumerated properties ---------------------------- */
100
101 /* TODO: more common code, move functions to uparse.h|c */
102
103 struct SingleEnum {
104 const char *ucdFile, *propName;
105 UProperty prop;
106 int32_t vecWord, vecShift;
107 uint32_t vecMask;
108 };
109 typedef struct SingleEnum SingleEnum;
110
111 static void
112 parseSingleEnumFile(char *filename, char *basename, const char *suffix,
113 const SingleEnum *sen,
114 UErrorCode *pErrorCode);
115
116 static const SingleEnum jtSingleEnum={
117 "DerivedJoiningType", "joining type",
118 UCHAR_JOINING_TYPE,
119 0, UBIDI_JT_SHIFT, UBIDI_JT_MASK
120 };
121
122 static const SingleEnum jgSingleEnum={
123 "DerivedJoiningGroup", "joining group",
124 UCHAR_JOINING_GROUP,
125 1, 0, 0xff /* column 1 bits 7..0 */
126 };
127
128 static void U_CALLCONV
129 singleEnumLineFn(void *context,
130 char *fields[][2], int32_t fieldCount,
131 UErrorCode *pErrorCode) {
132 const SingleEnum *sen;
133 char *s;
134 uint32_t start, limit, uv;
135 int32_t value;
136
137 sen=(const SingleEnum *)context;
138
139 u_parseCodePointRange(fields[0][0], &start, &limit, pErrorCode);
140 if(U_FAILURE(*pErrorCode)) {
141 fprintf(stderr, "genbidi: syntax error in %s.txt field 0 at %s\n", sen->ucdFile, fields[0][0]);
142 exit(*pErrorCode);
143 }
144 ++limit;
145
146 /* parse property alias */
147 s=trimTerminateField(fields[1][0], fields[1][1]);
148 value=u_getPropertyValueEnum(sen->prop, s);
149 if(value<0) {
150 if(sen->prop==UCHAR_BLOCK) {
151 if(isToken("Greek", s)) {
152 value=UBLOCK_GREEK; /* Unicode 3.2 renames this to "Greek and Coptic" */
153 } else if(isToken("Combining Marks for Symbols", s)) {
154 value=UBLOCK_COMBINING_MARKS_FOR_SYMBOLS; /* Unicode 3.2 renames this to "Combining Diacritical Marks for Symbols" */
155 } else if(isToken("Private Use", s)) {
156 value=UBLOCK_PRIVATE_USE; /* Unicode 3.2 renames this to "Private Use Area" */
157 }
158 }
159 }
160 if(value<0) {
161 fprintf(stderr, "genbidi error: unknown %s name in %s.txt field 1 at %s\n",
162 sen->propName, sen->ucdFile, s);
163 exit(U_PARSE_ERROR);
164 }
165
166 uv=(uint32_t)(value<<sen->vecShift);
167 if((uv&sen->vecMask)!=uv) {
168 fprintf(stderr, "genbidi error: %s value overflow (0x%x) at %s\n",
169 sen->propName, (int)uv, s);
170 exit(U_INTERNAL_PROGRAM_ERROR);
171 }
172
173 if(!upvec_setValue(pv, start, limit, sen->vecWord, uv, sen->vecMask, pErrorCode)) {
174 fprintf(stderr, "genbidi error: unable to set %s code: %s\n",
175 sen->propName, u_errorName(*pErrorCode));
176 exit(*pErrorCode);
177 }
178 }
179
180 static void
181 parseSingleEnumFile(char *filename, char *basename, const char *suffix,
182 const SingleEnum *sen,
183 UErrorCode *pErrorCode) {
184 char *fields[2][2];
185
186 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
187 return;
188 }
189
190 writeUCDFilename(basename, sen->ucdFile, suffix);
191
192 u_parseDelimitedFile(filename, ';', fields, 2, singleEnumLineFn, (void *)sen, pErrorCode);
193 if(U_FAILURE(*pErrorCode)) {
194 fprintf(stderr, "error parsing %s.txt: %s\n", sen->ucdFile, u_errorName(*pErrorCode));
195 }
196 }
197
198 /* parse files with multiple binary properties ------------------------------ */
199
200 /* TODO: more common code, move functions to uparse.h|c */
201
202 /* TODO: similar to genbidi/props2.c but not the same; same as in gencase/gencase.c */
203
/* one binary property: where and how it is stored in the properties vectors */
typedef struct Binary {
    const char *propName;   /* property name as it is matched with isToken() */
    int32_t vecWord;        /* column argument for upvec_setValue() */
    uint32_t vecValue;      /* value argument for upvec_setValue() */
    uint32_t vecMask;       /* mask argument for upvec_setValue(); must not be 0 */
} Binary;

/* the set of binary properties that are read from one UCD file */
typedef struct Binaries {
    const char *ucdFile;    /* UCD file base name */
    const Binary *binaries; /* array of property descriptions */
    int32_t binariesCount;  /* number of entries in binaries[] */
} Binaries;
217
218 static const Binary
219 propListNames[]={
220 { "Bidi_Control", 0, U_MASK(UBIDI_BIDI_CONTROL_SHIFT), U_MASK(UBIDI_BIDI_CONTROL_SHIFT) },
221 { "Join_Control", 0, U_MASK(UBIDI_JOIN_CONTROL_SHIFT), U_MASK(UBIDI_JOIN_CONTROL_SHIFT) }
222 };
223
224 static const Binaries
225 propListBinaries={
226 "PropList", propListNames, LENGTHOF(propListNames)
227 };
228
229 static void U_CALLCONV
230 binariesLineFn(void *context,
231 char *fields[][2], int32_t fieldCount,
232 UErrorCode *pErrorCode) {
233 const Binaries *bin;
234 char *s;
235 uint32_t start, limit;
236 int32_t i;
237
238 bin=(const Binaries *)context;
239
240 u_parseCodePointRange(fields[0][0], &start, &limit, pErrorCode);
241 if(U_FAILURE(*pErrorCode)) {
242 fprintf(stderr, "genbidi: syntax error in %s.txt field 0 at %s\n", bin->ucdFile, fields[0][0]);
243 exit(*pErrorCode);
244 }
245 ++limit;
246
247 /* parse binary property name */
248 s=(char *)u_skipWhitespace(fields[1][0]);
249 for(i=0;; ++i) {
250 if(i==bin->binariesCount) {
251 /* ignore unrecognized properties */
252 return;
253 }
254 if(isToken(bin->binaries[i].propName, s)) {
255 break;
256 }
257 }
258
259 if(bin->binaries[i].vecMask==0) {
260 fprintf(stderr, "genbidi error: mask value %d==0 for %s %s\n",
261 (int)bin->binaries[i].vecMask, bin->ucdFile, bin->binaries[i].propName);
262 exit(U_INTERNAL_PROGRAM_ERROR);
263 }
264
265 if(!upvec_setValue(pv, start, limit, bin->binaries[i].vecWord, bin->binaries[i].vecValue, bin->binaries[i].vecMask, pErrorCode)) {
266 fprintf(stderr, "genbidi error: unable to set %s, code: %s\n",
267 bin->binaries[i].propName, u_errorName(*pErrorCode));
268 exit(*pErrorCode);
269 }
270 }
271
272 static void
273 parseBinariesFile(char *filename, char *basename, const char *suffix,
274 const Binaries *bin,
275 UErrorCode *pErrorCode) {
276 char *fields[2][2];
277
278 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
279 return;
280 }
281
282 writeUCDFilename(basename, bin->ucdFile, suffix);
283
284 u_parseDelimitedFile(filename, ';', fields, 2, binariesLineFn, (void *)bin, pErrorCode);
285 if(U_FAILURE(*pErrorCode)) {
286 fprintf(stderr, "error parsing %s.txt: %s\n", bin->ucdFile, u_errorName(*pErrorCode));
287 }
288 }
289
290 /* -------------------------------------------------------------------------- */
291
292 enum {
293 HELP_H,
294 HELP_QUESTION_MARK,
295 VERBOSE,
296 COPYRIGHT,
297 DESTDIR,
298 SOURCEDIR,
299 UNICODE_VERSION,
300 ICUDATADIR,
301 CSOURCE
302 };
303
304 /* Keep these values in sync with the above enums */
305 static UOption options[]={
306 UOPTION_HELP_H,
307 UOPTION_HELP_QUESTION_MARK,
308 UOPTION_VERBOSE,
309 UOPTION_COPYRIGHT,
310 UOPTION_DESTDIR,
311 UOPTION_SOURCEDIR,
312 UOPTION_DEF("unicode", 'u', UOPT_REQUIRES_ARG),
313 UOPTION_ICUDATADIR,
314 UOPTION_DEF("csource", 'C', UOPT_NO_ARG)
315 };
316
317 extern int
318 main(int argc, char* argv[]) {
319 char filename[300];
320 const char *srcDir=NULL, *destDir=NULL, *suffix=NULL;
321 char *basename=NULL;
322 UErrorCode errorCode=U_ZERO_ERROR;
323
324 U_MAIN_INIT_ARGS(argc, argv);
325
326 /* preset then read command line options */
327 options[DESTDIR].value=u_getDataDirectory();
328 options[SOURCEDIR].value="";
329 options[UNICODE_VERSION].value="";
330 options[ICUDATADIR].value=u_getDataDirectory();
331 argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), options);
332
333 /* error handling, printing usage message */
334 if(argc<0) {
335 fprintf(stderr,
336 "error in command line argument \"%s\"\n",
337 argv[-argc]);
338 }
339 if(argc<0 || options[HELP_H].doesOccur || options[HELP_QUESTION_MARK].doesOccur) {
340 /*
341 * Broken into chucks because the C89 standard says the minimum
342 * required supported string length is 509 bytes.
343 */
344 fprintf(stderr,
345 "Usage: %s [-options] [suffix]\n"
346 "\n"
347 "read the UnicodeData.txt file and other Unicode properties files and\n"
348 "create a binary file " UBIDI_DATA_NAME "." UBIDI_DATA_TYPE " with the bidi/shaping properties\n"
349 "\n",
350 argv[0]);
351 fprintf(stderr,
352 "Options:\n"
353 "\t-h or -? or --help this usage text\n"
354 "\t-v or --verbose verbose output\n"
355 "\t-c or --copyright include a copyright notice\n"
356 "\t-u or --unicode Unicode version, followed by the version like 3.0.0\n"
357 "\t-C or --csource generate a .c source file rather than the .icu binary\n");
358 fprintf(stderr,
359 "\t-d or --destdir destination directory, followed by the path\n"
360 "\t-s or --sourcedir source directory, followed by the path\n"
361 "\t-i or --icudatadir directory for locating any needed intermediate data files,\n"
362 "\t followed by path, defaults to %s\n"
363 "\tsuffix suffix that is to be appended with a '-'\n"
364 "\t to the source file basenames before opening;\n"
365 "\t 'genbidi new' will read UnicodeData-new.txt etc.\n",
366 u_getDataDirectory());
367 return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR;
368 }
369
370 /* get the options values */
371 beVerbose=options[VERBOSE].doesOccur;
372 haveCopyright=options[COPYRIGHT].doesOccur;
373 srcDir=options[SOURCEDIR].value;
374 destDir=options[DESTDIR].value;
375
376 if(argc>=2) {
377 suffix=argv[1];
378 } else {
379 suffix=NULL;
380 }
381
382 if(options[UNICODE_VERSION].doesOccur) {
383 setUnicodeVersion(options[UNICODE_VERSION].value);
384 }
385 /* else use the default dataVersion in store.c */
386
387 if (options[ICUDATADIR].doesOccur) {
388 u_setDataDirectory(options[ICUDATADIR].value);
389 }
390
391 /* prepare the filename beginning with the source dir */
392 uprv_strcpy(filename, srcDir);
393 basename=filename+uprv_strlen(filename);
394 if(basename>filename && *(basename-1)!=U_FILE_SEP_CHAR) {
395 *basename++=U_FILE_SEP_CHAR;
396 }
397
398 /* initialize */
399 pv=upvec_open(2, 10000);
400
401 /* process BidiMirroring.txt */
402 writeUCDFilename(basename, "BidiMirroring", suffix);
403 parseBidiMirroring(filename, &errorCode);
404
405 /* process additional properties files */
406 *basename=0;
407
408 parseBinariesFile(filename, basename, suffix, &propListBinaries, &errorCode);
409
410 parseSingleEnumFile(filename, basename, suffix, &jtSingleEnum, &errorCode);
411
412 parseSingleEnumFile(filename, basename, suffix, &jgSingleEnum, &errorCode);
413
414 /* process UnicodeData.txt */
415 writeUCDFilename(basename, "UnicodeData", suffix);
416 parseDB(filename, &errorCode);
417
418 /* set proper bidi class for unassigned code points (Cn) */
419 parseTwoFieldFile(filename, basename, "DerivedBidiClass", suffix, bidiClassLineFn, &errorCode);
420
421 /* process parsed data */
422 if(U_SUCCESS(errorCode)) {
423 /* write the properties data file */
424 generateData(destDir, options[CSOURCE].doesOccur);
425 }
426
427 u_cleanup();
428 return errorCode;
429 }
430
431 U_CFUNC void
432 writeUCDFilename(char *basename, const char *filename, const char *suffix) {
433 int32_t length=(int32_t)uprv_strlen(filename);
434 uprv_strcpy(basename, filename);
435 if(suffix!=NULL) {
436 basename[length++]='-';
437 uprv_strcpy(basename+length, suffix);
438 length+=(int32_t)uprv_strlen(suffix);
439 }
440 uprv_strcpy(basename+length, ".txt");
441 }
442
443 /* TODO: move to toolutil */
444 static UBool
445 isToken(const char *token, const char *s) {
446 const char *z;
447 int32_t j;
448
449 s=u_skipWhitespace(s);
450 for(j=0;; ++j) {
451 if(token[j]!=0) {
452 if(s[j]!=token[j]) {
453 break;
454 }
455 } else {
456 z=u_skipWhitespace(s+j);
457 if(*z==';' || *z==0) {
458 return TRUE;
459 } else {
460 break;
461 }
462 }
463 }
464
465 return FALSE;
466 }
467
468 /* parser for BidiMirroring.txt --------------------------------------------- */
469
470 static void U_CALLCONV
471 mirrorLineFn(void *context,
472 char *fields[][2], int32_t fieldCount,
473 UErrorCode *pErrorCode) {
474 char *end;
475 UChar32 src, mirror;
476
477 src=(UChar32)uprv_strtoul(fields[0][0], &end, 16);
478 if(end<=fields[0][0] || end!=fields[0][1]) {
479 fprintf(stderr, "genbidi: syntax error in BidiMirroring.txt field 0 at %s\n", fields[0][0]);
480 *pErrorCode=U_PARSE_ERROR;
481 exit(U_PARSE_ERROR);
482 }
483
484 mirror=(UChar32)uprv_strtoul(fields[1][0], &end, 16);
485 if(end<=fields[1][0] || end!=fields[1][1]) {
486 fprintf(stderr, "genbidi: syntax error in BidiMirroring.txt field 1 at %s\n", fields[1][0]);
487 *pErrorCode=U_PARSE_ERROR;
488 exit(U_PARSE_ERROR);
489 }
490
491 addMirror(src, mirror);
492 }
493
494 static void
495 parseBidiMirroring(const char *filename, UErrorCode *pErrorCode) {
496 char *fields[2][2];
497
498 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
499 return;
500 }
501
502 u_parseDelimitedFile(filename, ';', fields, 2, mirrorLineFn, NULL, pErrorCode);
503 }
504
505 /* parser for UnicodeData.txt ----------------------------------------------- */
506
507 static void U_CALLCONV
508 unicodeDataLineFn(void *context,
509 char *fields[][2], int32_t fieldCount,
510 UErrorCode *pErrorCode) {
511 char *end;
512 UErrorCode errorCode;
513 UChar32 c;
514
515 errorCode=U_ZERO_ERROR;
516
517 /* get the character code, field 0 */
518 c=(UChar32)uprv_strtoul(fields[0][0], &end, 16);
519 if(end<=fields[0][0] || end!=fields[0][1]) {
520 fprintf(stderr, "genbidi: syntax error in field 0 at %s\n", fields[0][0]);
521 *pErrorCode=U_PARSE_ERROR;
522 exit(U_PARSE_ERROR);
523 }
524
525 /* get Mirrored flag, field 9 */
526 if(*fields[9][0]=='Y') {
527 if(!upvec_setValue(pv, c, c+1, 0, U_MASK(UBIDI_IS_MIRRORED_SHIFT), U_MASK(UBIDI_IS_MIRRORED_SHIFT), &errorCode)) {
528 fprintf(stderr, "genbidi error: unable to set 'is mirrored' for U+%04lx, code: %s\n",
529 (long)c, u_errorName(errorCode));
530 exit(errorCode);
531 }
532 } else if(fields[9][1]-fields[9][0]!=1 || *fields[9][0]!='N') {
533 fprintf(stderr, "genbidi: syntax error in field 9 at U+%04lx\n",
534 (long)c);
535 *pErrorCode=U_PARSE_ERROR;
536 exit(U_PARSE_ERROR);
537 }
538 }
539
540 static void
541 parseDB(const char *filename, UErrorCode *pErrorCode) {
542 /* default Bidi classes for unassigned code points */
543 static const UChar32 defaultBidi[][3]={ /* { start, end, class } */
544 /* R: U+0590..U+05FF, U+07C0..U+08FF, U+FB1D..U+FB4F, U+10800..U+10FFF */
545 { 0x0590, 0x05FF, U_RIGHT_TO_LEFT },
546 { 0x07C0, 0x08FF, U_RIGHT_TO_LEFT },
547 { 0xFB1D, 0xFB4F, U_RIGHT_TO_LEFT },
548 { 0x10800, 0x10FFF, U_RIGHT_TO_LEFT },
549
550 /* AL: U+0600..U+07BF, U+FB50..U+FDCF, U+FDF0..U+FDFF, U+FE70..U+FEFE */
551 { 0x0600, 0x07BF, U_RIGHT_TO_LEFT_ARABIC },
552 { 0xFB50, 0xFDCF, U_RIGHT_TO_LEFT_ARABIC },
553 { 0xFDF0, 0xFDFF, U_RIGHT_TO_LEFT_ARABIC },
554 { 0xFE70, 0xFEFE, U_RIGHT_TO_LEFT_ARABIC }
555
556 /* L otherwise */
557 };
558
559 char *fields[15][2];
560 UChar32 start, end;
561 int32_t i;
562
563 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
564 return;
565 }
566
567 /*
568 * Set default Bidi classes for unassigned code points.
569 * See the documentation for Bidi_Class in UCD.html in the Unicode data.
570 * http://www.unicode.org/Public/
571 *
572 * Starting with Unicode 5.0, DerivedBidiClass.txt should (re)set
573 * the Bidi_Class values for all code points including unassigned ones
574 * and including L values for these.
575 * This code becomes unnecesary but harmless. Leave it for now in case
576 * someone uses genbidi on pre-Unicode 5.0 data.
577 */
578 for(i=0; i<LENGTHOF(defaultBidi); ++i) {
579 start=defaultBidi[i][0];
580 end=defaultBidi[i][1];
581 if(!upvec_setValue(pv, start, end+1, 0, (uint32_t)defaultBidi[i][2], UBIDI_CLASS_MASK, pErrorCode)) {
582 fprintf(stderr, "genbidi error: unable to set default bidi class for U+%04lx..U+%04lx, code: %s\n",
583 (long)start, (long)end, u_errorName(*pErrorCode));
584 exit(*pErrorCode);
585 }
586 }
587
588 u_parseDelimitedFile(filename, ';', fields, 15, unicodeDataLineFn, NULL, pErrorCode);
589
590 if(U_FAILURE(*pErrorCode)) {
591 return;
592 }
593 }
594
595 /* DerivedBidiClass.txt ----------------------------------------------------- */
596
597 static void U_CALLCONV
598 bidiClassLineFn(void *context,
599 char *fields[][2], int32_t fieldCount,
600 UErrorCode *pErrorCode) {
601 char *s;
602 uint32_t start, limit, value;
603
604 /* get the code point range */
605 u_parseCodePointRange(fields[0][0], &start, &limit, pErrorCode);
606 if(U_FAILURE(*pErrorCode)) {
607 fprintf(stderr, "genbidi: syntax error in DerivedBidiClass.txt field 0 at %s\n", fields[0][0]);
608 exit(*pErrorCode);
609 }
610 ++limit;
611
612 /* parse bidi class */
613 s=trimTerminateField(fields[1][0], fields[1][1]);
614 value=u_getPropertyValueEnum(UCHAR_BIDI_CLASS, s);
615 if((int32_t)value<0) {
616 fprintf(stderr, "genbidi error: unknown bidi class in DerivedBidiClass.txt field 1 at %s\n", s);
617 exit(U_PARSE_ERROR);
618 }
619
620 if(!upvec_setValue(pv, start, limit, 0, value, UBIDI_CLASS_MASK, pErrorCode)) {
621 fprintf(stderr, "genbidi error: unable to set derived bidi class for U+%04x..U+%04x - %s\n",
622 (int)start, (int)limit-1, u_errorName(*pErrorCode));
623 exit(*pErrorCode);
624 }
625 }
626
627 /*
628 * Hey, Emacs, please set the following:
629 *
630 * Local Variables:
631 * indent-tabs-mode: nil
632 * End:
633 *
634 */