]> git.saurik.com Git - apple/icu.git/blame - icuSources/tools/genprops/genprops.c
ICU-400.42.tar.gz
[apple/icu.git] / icuSources / tools / genprops / genprops.c
CommitLineData
b75a7d8f
A
/*
*******************************************************************************
*
* Copyright (C) 1999-2005, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
* file name: genprops.c
* encoding: US-ASCII
* tab size: 8 (not used)
* indentation:4
*
* created on: 1999dec08
* created by: Markus W. Scherer
*
* This program reads several of the Unicode character database text files,
* parses them, and extracts most of the properties for each character.
* It then writes a binary file containing the properties
* that is designed to be used directly for random-access to
* the properties of each Unicode character.
*/
22
23#include <stdio.h>
24#include <stdlib.h>
25#include "unicode/utypes.h"
26#include "unicode/uchar.h"
b75a7d8f 27#include "unicode/putil.h"
374ca955 28#include "unicode/uclean.h"
b75a7d8f
A
29#include "cmemory.h"
30#include "cstring.h"
31#include "unewdata.h"
32#include "uoptions.h"
33#include "uparse.h"
34#include "uprops.h"
35#include "propsvec.h"
36
37U_CDECL_BEGIN
38#include "genprops.h"
39U_CDECL_END
40
41#define LENGTHOF(array) (sizeof(array)/sizeof((array)[0]))
42
43UBool beVerbose=FALSE, haveCopyright=TRUE;
44
b75a7d8f
A
45/* prototypes --------------------------------------------------------------- */
46
b75a7d8f
A
47static void
48parseDB(const char *filename, UErrorCode *pErrorCode);
49
50/* -------------------------------------------------------------------------- */
51
374ca955
A
/* indexes into options[]; the order must match the initializers of that array */
enum {
    HELP_H,                 /* -h */
    HELP_QUESTION_MARK,     /* -? */
    VERBOSE,                /* -v */
    COPYRIGHT,              /* -c */
    DESTDIR,                /* -d */
    SOURCEDIR,              /* -s */
    UNICODE_VERSION,        /* -u */
    ICUDATADIR,             /* -i */
    CSOURCE                 /* -C */
};
64
/* Keep the entries of options[] in the same order as the enum constants above. */
b75a7d8f
A
66static UOption options[]={
67 UOPTION_HELP_H,
68 UOPTION_HELP_QUESTION_MARK,
69 UOPTION_VERBOSE,
70 UOPTION_COPYRIGHT,
71 UOPTION_DESTDIR,
72 UOPTION_SOURCEDIR,
73c04bcf
A
73 UOPTION_DEF("unicode", 'u', UOPT_REQUIRES_ARG),
74 UOPTION_ICUDATADIR,
75 UOPTION_DEF("csource", 'C', UOPT_NO_ARG)
b75a7d8f
A
76};
77
78extern int
79main(int argc, char* argv[]) {
80 char filename[300];
81 const char *srcDir=NULL, *destDir=NULL, *suffix=NULL;
82 char *basename=NULL;
83 UErrorCode errorCode=U_ZERO_ERROR;
84
85 U_MAIN_INIT_ARGS(argc, argv);
86
87 /* preset then read command line options */
374ca955
A
88 options[DESTDIR].value=u_getDataDirectory();
89 options[SOURCEDIR].value="";
90 options[UNICODE_VERSION].value="";
91 options[ICUDATADIR].value=u_getDataDirectory();
b75a7d8f
A
92 argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), options);
93
94 /* error handling, printing usage message */
95 if(argc<0) {
96 fprintf(stderr,
97 "error in command line argument \"%s\"\n",
98 argv[-argc]);
99 }
374ca955 100 if(argc<0 || options[HELP_H].doesOccur || options[HELP_QUESTION_MARK].doesOccur) {
b75a7d8f
A
101 /*
102 * Broken into chucks because the C89 standard says the minimum
103 * required supported string length is 509 bytes.
104 */
105 fprintf(stderr,
106 "Usage: %s [-options] [suffix]\n"
107 "\n"
108 "read the UnicodeData.txt file and other Unicode properties files and\n"
109 "create a binary file " DATA_NAME "." DATA_TYPE " with the character properties\n"
110 "\n",
111 argv[0]);
112 fprintf(stderr,
113 "Options:\n"
114 "\t-h or -? or --help this usage text\n"
115 "\t-v or --verbose verbose output\n"
116 "\t-c or --copyright include a copyright notice\n"
73c04bcf
A
117 "\t-u or --unicode Unicode version, followed by the version like 3.0.0\n"
118 "\t-C or --csource generate a .c source file rather than the .icu binary\n");
b75a7d8f
A
119 fprintf(stderr,
120 "\t-d or --destdir destination directory, followed by the path\n"
121 "\t-s or --sourcedir source directory, followed by the path\n"
374ca955
A
122 "\t-i or --icudatadir directory for locating any needed intermediate data files,\n"
123 "\t followed by path, defaults to %s\n"
b75a7d8f
A
124 "\tsuffix suffix that is to be appended with a '-'\n"
125 "\t to the source file basenames before opening;\n"
374ca955
A
126 "\t 'genprops new' will read UnicodeData-new.txt etc.\n",
127 u_getDataDirectory());
b75a7d8f
A
128 return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR;
129 }
130
131 /* get the options values */
374ca955
A
132 beVerbose=options[VERBOSE].doesOccur;
133 haveCopyright=options[COPYRIGHT].doesOccur;
134 srcDir=options[SOURCEDIR].value;
135 destDir=options[DESTDIR].value;
b75a7d8f
A
136
137 if(argc>=2) {
138 suffix=argv[1];
139 } else {
140 suffix=NULL;
141 }
142
374ca955
A
143 if(options[UNICODE_VERSION].doesOccur) {
144 setUnicodeVersion(options[UNICODE_VERSION].value);
b75a7d8f
A
145 }
146 /* else use the default dataVersion in store.c */
147
374ca955
A
148 if (options[ICUDATADIR].doesOccur) {
149 u_setDataDirectory(options[ICUDATADIR].value);
150 }
151
b75a7d8f
A
152 /* prepare the filename beginning with the source dir */
153 uprv_strcpy(filename, srcDir);
154 basename=filename+uprv_strlen(filename);
155 if(basename>filename && *(basename-1)!=U_FILE_SEP_CHAR) {
156 *basename++=U_FILE_SEP_CHAR;
157 }
158
159 /* initialize */
160 initStore();
b75a7d8f
A
161
162 /* process UnicodeData.txt */
163 writeUCDFilename(basename, "UnicodeData", suffix);
164 parseDB(filename, &errorCode);
165
166 /* process additional properties files */
167 *basename=0;
168 generateAdditionalProperties(filename, suffix, &errorCode);
169
170 /* process parsed data */
171 if(U_SUCCESS(errorCode)) {
172 /* write the properties data file */
73c04bcf 173 generateData(destDir, options[CSOURCE].doesOccur);
b75a7d8f
A
174 }
175
73c04bcf 176 exitStore();
374ca955 177 u_cleanup();
b75a7d8f
A
178 return errorCode;
179}
180
181U_CFUNC void
182writeUCDFilename(char *basename, const char *filename, const char *suffix) {
374ca955 183 int32_t length=(int32_t)uprv_strlen(filename);
b75a7d8f
A
184 uprv_strcpy(basename, filename);
185 if(suffix!=NULL) {
186 basename[length++]='-';
187 uprv_strcpy(basename+length, suffix);
374ca955 188 length+=(int32_t)uprv_strlen(suffix);
b75a7d8f
A
189 }
190 uprv_strcpy(basename+length, ".txt");
191}
192
193U_CFUNC UBool
194isToken(const char *token, const char *s) {
195 const char *z;
196 int32_t j;
197
198 s=u_skipWhitespace(s);
199 for(j=0;; ++j) {
200 if(token[j]!=0) {
201 if(s[j]!=token[j]) {
202 break;
203 }
204 } else {
205 z=u_skipWhitespace(s+j);
206 if(*z==';' || *z==0) {
207 return TRUE;
208 } else {
209 break;
210 }
211 }
212 }
213
214 return FALSE;
215}
216
217U_CFUNC int32_t
218getTokenIndex(const char *const tokens[], int32_t countTokens, const char *s) {
219 const char *t, *z;
220 int32_t i, j;
221
222 s=u_skipWhitespace(s);
223 for(i=0; i<countTokens; ++i) {
224 t=tokens[i];
225 if(t!=NULL) {
226 for(j=0;; ++j) {
227 if(t[j]!=0) {
228 if(s[j]!=t[j]) {
229 break;
230 }
231 } else {
232 z=u_skipWhitespace(s+j);
233 if(*z==';' || *z==0 || *z=='#' || *z=='\r' || *z=='\n') {
234 return i;
235 } else {
236 break;
237 }
238 }
239 }
240 }
241 }
242 return -1;
243}
244
b75a7d8f
A
245/* parser for UnicodeData.txt ----------------------------------------------- */
246
247/* general categories */
248const char *const
249genCategoryNames[U_CHAR_CATEGORY_COUNT]={
250 "Cn",
251 "Lu", "Ll", "Lt", "Lm", "Lo", "Mn", "Me",
252 "Mc", "Nd", "Nl", "No",
253 "Zs", "Zl", "Zp",
254 "Cc", "Cf", "Co", "Cs",
255 "Pd", "Ps", "Pe", "Pc", "Po",
256 "Sm", "Sc", "Sk", "So",
257 "Pi", "Pf"
258};
259
b75a7d8f
A
260const char *const
261decompositionTypeNames[U_DT_COUNT]={
262 NULL,
263 NULL,
264 "compat",
265 "circle",
266 "final",
267 "font",
268 "fraction",
269 "initial",
270 "isolated",
271 "medial",
272 "narrow",
273 "noBreak",
274 "small",
275 "square",
276 "sub",
277 "super",
278 "vertical",
279 "wide"
280};
281
/*
 * Code point ranges that UnicodeData.txt lists only with a "<name, First>"
 * and a "<name, Last>" line; recorded by unicodeDataLineFn().
 */
static struct {
    uint32_t first;     /* first code point; 0xffffffff in the entry at unicodeAreaIndex means "not started" */
    uint32_t last;      /* last code point, set from the ", Last>" line */
    uint32_t props;     /* makeProps() value of the ", First>" line */
    char name[80];      /* NUL-terminated area name without '<' and ", First>" */
} unicodeAreas[32];

/* index of the entry that is filled next; equals the number of completed areas */
static int32_t unicodeAreaIndex=0;
b75a7d8f
A
288
289static void U_CALLCONV
290unicodeDataLineFn(void *context,
291 char *fields[][2], int32_t fieldCount,
292 UErrorCode *pErrorCode) {
293 Props p;
294 char *end;
295 static uint32_t prevCode=0;
296 uint32_t value;
297 int32_t i;
298
299 /* reset the properties */
300 uprv_memset(&p, 0, sizeof(Props));
301
302 /* get the character code, field 0 */
303 p.code=(uint32_t)uprv_strtoul(fields[0][0], &end, 16);
304 if(end<=fields[0][0] || end!=fields[0][1]) {
305 fprintf(stderr, "genprops: syntax error in field 0 at %s\n", fields[0][0]);
306 *pErrorCode=U_PARSE_ERROR;
307 exit(U_PARSE_ERROR);
308 }
309
310 /* get general category, field 2 */
311 i=getTokenIndex(genCategoryNames, U_CHAR_CATEGORY_COUNT, fields[2][0]);
312 if(i>=0) {
313 p.generalCategory=(uint8_t)i;
314 } else {
315 fprintf(stderr, "genprops: unknown general category \"%s\" at code 0x%lx\n",
316 fields[2][0], (unsigned long)p.code);
317 *pErrorCode=U_PARSE_ERROR;
318 exit(U_PARSE_ERROR);
319 }
320
b75a7d8f
A
321 /* get decomposition type, field 5 */
322 if(fields[5][0]<fields[5][1]) {
323 /* there is some decomposition */
324 if(*fields[5][0]!='<') {
325 /* canonical */
326 i=U_DT_CANONICAL;
327 } else {
328 /* get compatibility type */
329 end=fields[5][0]+1;
330 while(end<fields[5][1] && *end!='>') {
331 ++end;
332 }
333 *end='#';
334 i=getTokenIndex(decompositionTypeNames, U_DT_COUNT, fields[5][0]+1);
335 if(i<0) {
336 fprintf(stderr, "genprops: unknown decomposition type \"%s\" at code 0x%lx\n",
337 fields[5][0], (unsigned long)p.code);
338 *pErrorCode=U_PARSE_ERROR;
339 exit(U_PARSE_ERROR);
340 }
341 }
342 if(!upvec_setValue(pv, p.code, p.code+1, 2, (uint32_t)i, UPROPS_DT_MASK, pErrorCode)) {
343 fprintf(stderr, "genprops error: unable to set decomposition type: %s\n", u_errorName(*pErrorCode));
344 exit(*pErrorCode);
345 }
346 }
347
348 /* decimal digit value, field 6 */
349 if(fields[6][0]<fields[6][1]) {
350 value=(uint32_t)uprv_strtoul(fields[6][0], &end, 10);
351 if(end!=fields[6][1] || value>0x7fff) {
352 fprintf(stderr, "genprops: syntax error in field 6 at code 0x%lx\n",
353 (unsigned long)p.code);
354 *pErrorCode=U_PARSE_ERROR;
355 exit(U_PARSE_ERROR);
356 }
357 p.numericValue=(int32_t)value;
358 p.numericType=1;
359 }
360
361 /* digit value, field 7 */
362 if(fields[7][0]<fields[7][1]) {
363 value=(uint32_t)uprv_strtoul(fields[7][0], &end, 10);
364 if(end!=fields[7][1] || value>0x7fff) {
365 fprintf(stderr, "genprops: syntax error in field 7 at code 0x%lx\n",
366 (unsigned long)p.code);
367 *pErrorCode=U_PARSE_ERROR;
368 exit(U_PARSE_ERROR);
369 }
370 if(p.numericType==0) {
371 p.numericValue=(int32_t)value;
372 p.numericType=2;
373 } else if((int32_t)value!=p.numericValue) {
374 fprintf(stderr, "genprops error: numeric values in fields 6 & 7 different at code 0x%lx\n",
375 (unsigned long)p.code);
376 *pErrorCode=U_PARSE_ERROR;
377 exit(U_PARSE_ERROR);
378 }
379 }
380
381 /* numeric value, field 8 */
382 if(fields[8][0]<fields[8][1]) {
383 char *s=fields[8][0];
384 UBool isNegative;
385
386 /* get a possible minus sign */
387 if(*s=='-') {
388 isNegative=TRUE;
389 ++s;
390 } else {
391 isNegative=FALSE;
392 }
393
394 value=(uint32_t)uprv_strtoul(s, &end, 10);
395 if(value>0 && *end=='/') {
396 /* field 8 may contain a fractional value, get the denominator */
397 if(p.numericType>0) {
398 fprintf(stderr, "genprops error: numeric values in fields 6..8 different at code 0x%lx\n",
399 (unsigned long)p.code);
400 *pErrorCode=U_PARSE_ERROR;
401 exit(U_PARSE_ERROR);
402 }
403
404 p.denominator=(uint32_t)uprv_strtoul(end+1, &end, 10);
405 if(p.denominator==0) {
406 fprintf(stderr, "genprops: denominator is 0 in field 8 at code 0x%lx\n",
407 (unsigned long)p.code);
408 *pErrorCode=U_PARSE_ERROR;
409 exit(U_PARSE_ERROR);
410 }
411 }
412 if(end!=fields[8][1] || value>0x7fffffff) {
413 fprintf(stderr, "genprops: syntax error in field 8 at code 0x%lx\n",
414 (unsigned long)p.code);
415 *pErrorCode=U_PARSE_ERROR;
416 exit(U_PARSE_ERROR);
417 }
418
419 if(p.numericType==0) {
420 if(isNegative) {
421 p.numericValue=-(int32_t)value;
422 } else {
423 p.numericValue=(int32_t)value;
424 }
425 p.numericType=3;
426 } else if((int32_t)value!=p.numericValue) {
427 fprintf(stderr, "genprops error: numeric values in fields 6..8 different at code 0x%lx\n",
428 (unsigned long)p.code);
429 *pErrorCode=U_PARSE_ERROR;
430 exit(U_PARSE_ERROR);
431 }
432 }
433
b75a7d8f
A
434 value=makeProps(&p);
435
436 if(*fields[1][0]=='<') {
437 /* first or last entry of a Unicode area */
438 size_t length=fields[1][1]-fields[1][0];
439
440 if(length<9) {
441 /* name too short for an area name */
442 } else if(0==uprv_memcmp(", First>", fields[1][1]-8, 8)) {
443 /* set the current area */
444 if(unicodeAreas[unicodeAreaIndex].first==0xffffffff) {
445 length-=9;
446 unicodeAreas[unicodeAreaIndex].first=p.code;
447 unicodeAreas[unicodeAreaIndex].props=value;
448 uprv_memcpy(unicodeAreas[unicodeAreaIndex].name, fields[1][0]+1, length);
449 unicodeAreas[unicodeAreaIndex].name[length]=0;
450 } else {
451 /* error: a previous area is incomplete */
452 fprintf(stderr, "genprops: error - area \"%s\" is incomplete\n", unicodeAreas[unicodeAreaIndex].name);
453 *pErrorCode=U_PARSE_ERROR;
454 exit(U_PARSE_ERROR);
455 }
456 return;
457 } else if(0==uprv_memcmp(", Last>", fields[1][1]-7, 7)) {
458 /* check that the current area matches, and complete it with the last code point */
459 length-=8;
460 if( unicodeAreas[unicodeAreaIndex].props==value &&
461 0==uprv_memcmp(unicodeAreas[unicodeAreaIndex].name, fields[1][0]+1, length) &&
462 unicodeAreas[unicodeAreaIndex].name[length]==0 &&
463 unicodeAreas[unicodeAreaIndex].first<p.code
464 ) {
465 unicodeAreas[unicodeAreaIndex].last=p.code;
466 if(beVerbose) {
467 printf("Unicode area U+%04lx..U+%04lx \"%s\"\n",
468 (unsigned long)unicodeAreas[unicodeAreaIndex].first,
469 (unsigned long)unicodeAreas[unicodeAreaIndex].last,
470 unicodeAreas[unicodeAreaIndex].name);
471 }
472 unicodeAreas[++unicodeAreaIndex].first=0xffffffff;
473 } else {
474 /* error: different properties between first & last, different area name, first>=last */
475 fprintf(stderr, "genprops: error - Last of area \"%s\" is incorrect\n", unicodeAreas[unicodeAreaIndex].name);
476 *pErrorCode=U_PARSE_ERROR;
477 exit(U_PARSE_ERROR);
478 }
479 return;
480 } else {
481 /* not an area name */
482 }
483 }
484
485 /* check for non-character code points */
486 if((p.code&0xfffe)==0xfffe || (uint32_t)(p.code-0xfdd0)<0x20) {
487 fprintf(stderr, "genprops: error - properties for non-character code point U+%04lx\n",
488 (unsigned long)p.code);
489 *pErrorCode=U_PARSE_ERROR;
490 exit(U_PARSE_ERROR);
491 }
492
493 /* check that the code points (p.code) are in ascending order */
494 if(p.code<=prevCode && p.code>0) {
495 fprintf(stderr, "genprops: error - UnicodeData entries out of order, U+%04lx after U+%04lx\n",
496 (unsigned long)p.code, (unsigned long)prevCode);
497 *pErrorCode=U_PARSE_ERROR;
498 exit(U_PARSE_ERROR);
499 }
500 prevCode=p.code;
501
502 /* properties for a single code point */
503 addProps(p.code, value);
504}
505
506/* set repeated properties for the areas */
507static void
508repeatAreaProps() {
509 uint32_t puaProps;
510 int32_t i;
511 UBool hasPlane15PUA, hasPlane16PUA;
512 UErrorCode errorCode;
513
514 /*
515 * UnicodeData.txt before 3.0.1 did not contain the PUAs on
516 * planes 15 and 16.
517 * If that is the case, then we add them here, using the properties
518 * from the BMP PUA.
519 */
520 puaProps=0;
521 hasPlane15PUA=hasPlane16PUA=FALSE;
522
523 for(i=0; i<unicodeAreaIndex; ++i) {
524 repeatProps(unicodeAreas[i].first,
525 unicodeAreas[i].last,
526 unicodeAreas[i].props);
527 if(unicodeAreas[i].first==0xe000) {
528 puaProps=unicodeAreas[i].props;
529 } else if(unicodeAreas[i].first==0xf0000) {
530 hasPlane15PUA=TRUE;
531 } else if(unicodeAreas[i].first==0x100000) {
532 hasPlane16PUA=TRUE;
533 }
534 }
535
536 if(puaProps!=0) {
537 if(!hasPlane15PUA) {
538 repeatProps(0xf0000, 0xffffd, puaProps);
539 }
540 if(!hasPlane16PUA) {
541 repeatProps(0x100000, 0x10fffd, puaProps);
542 }
543 }
544
545 /* Hangul have canonical decompositions */
546 errorCode=U_ZERO_ERROR;
547 if(!upvec_setValue(pv, 0xac00, 0xd7a4, 2, (uint32_t)U_DT_CANONICAL, UPROPS_DT_MASK, &errorCode)) {
548 fprintf(stderr, "genprops error: unable to set decomposition type: %s\n", u_errorName(errorCode));
549 exit(errorCode);
550 }
551}
552
553static void
554parseDB(const char *filename, UErrorCode *pErrorCode) {
b75a7d8f 555 char *fields[15][2];
b75a7d8f
A
556
557 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
558 return;
559 }
560
b75a7d8f
A
561 /* while unicodeAreas[unicodeAreaIndex] is unused, set its first to a bogus value */
562 unicodeAreas[0].first=0xffffffff;
563
564 u_parseDelimitedFile(filename, ';', fields, 15, unicodeDataLineFn, NULL, pErrorCode);
565
566 if(unicodeAreas[unicodeAreaIndex].first!=0xffffffff) {
567 fprintf(stderr, "genprops: error - the last area \"%s\" from U+%04lx is incomplete\n",
568 unicodeAreas[unicodeAreaIndex].name,
569 (unsigned long)unicodeAreas[unicodeAreaIndex].first);
570 *pErrorCode=U_PARSE_ERROR;
571 exit(U_PARSE_ERROR);
572 }
573
574 repeatAreaProps();
575
b75a7d8f
A
576 if(U_FAILURE(*pErrorCode)) {
577 return;
578 }
b75a7d8f
A
579}
580
581/*
582 * Hey, Emacs, please set the following:
583 *
584 * Local Variables:
585 * indent-tabs-mode: nil
586 * End:
587 *
588 */