icuSources/tools/genprops/genprops.c

   1 /*
   2 *******************************************************************************
   3 *
   4 *   Copyright (C) 1999-2005, International Business Machines
   5 *   Corporation and others.  All Rights Reserved.
   6 *
   7 *******************************************************************************
   8 *   file name:  genprops.c
   9 *   encoding:   US-ASCII
  10 *   tab size:   8 (not used)
  11 *   indentation:4
  12 *
  13 *   created on: 1999dec08
  14 *   created by: Markus W. Scherer
  15 *
  16 *   This program reads several of the Unicode character database text files,
  17 *   parses them, and extracts most of the properties for each character.
  18 *   It then writes a binary file containing the properties
  19 *   that is designed to be used directly for random-access to
  20 *   the properties of each Unicode character.
  21 */
  22
  23 #include <stdio.h>
  24 #include <stdlib.h>
  25 #include "unicode/utypes.h"
  26 #include "unicode/uchar.h"
  27 #include "unicode/putil.h"
  28 #include "unicode/uclean.h"
  29 #include "cmemory.h"
  30 #include "cstring.h"
  31 #include "unewdata.h"
  32 #include "uoptions.h"
  33 #include "uparse.h"
  34 #include "uprops.h"
  35 #include "propsvec.h"
  36
  37 U_CDECL_BEGIN
  38 #include "genprops.h"
  39 U_CDECL_END
  40
  41 #define LENGTHOF(array) (sizeof(array)/sizeof((array)[0]))
  42
  43 UBool beVerbose=FALSE, haveCopyright=TRUE;
  44
  45 /* prototypes --------------------------------------------------------------- */
  46
  47 static void
  48 parseDB(const char *filename, UErrorCode *pErrorCode);
  49
  50 /* -------------------------------------------------------------------------- */
  51
  52 enum
  53 {
  54     HELP_H,
  55     HELP_QUESTION_MARK,
  56     VERBOSE,
  57     COPYRIGHT,
  58     DESTDIR,
  59     SOURCEDIR,
  60     UNICODE_VERSION,
  61     ICUDATADIR,
  62     CSOURCE
  63 };
  64
  65 /* Keep these values in sync with the above enums */
  66 static UOption options[]={
  67     UOPTION_HELP_H,
  68     UOPTION_HELP_QUESTION_MARK,
  69     UOPTION_VERBOSE,
  70     UOPTION_COPYRIGHT,
  71     UOPTION_DESTDIR,
  72     UOPTION_SOURCEDIR,
  73     UOPTION_DEF("unicode", 'u', UOPT_REQUIRES_ARG),
  74     UOPTION_ICUDATADIR,
  75     UOPTION_DEF("csource", 'C', UOPT_NO_ARG)
  76 };
  77
  78 extern int
  79 main(int argc, char* argv[]) {
  80     char filename[300];
  81     const char *srcDir=NULL, *destDir=NULL, *suffix=NULL;
  82     char *basename=NULL;
  83     UErrorCode errorCode=U_ZERO_ERROR;
  84
  85     U_MAIN_INIT_ARGS(argc, argv);
  86
  87     /* preset then read command line options */
  88     options[DESTDIR].value=u_getDataDirectory();
  89     options[SOURCEDIR].value="";
  90     options[UNICODE_VERSION].value="";
  91     options[ICUDATADIR].value=u_getDataDirectory();
  92     argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), options);
  93
  94     /* error handling, printing usage message */
  95     if(argc<0) {
  96         fprintf(stderr,
  97             "error in command line argument \"%s\"\n",
  98             argv[-argc]);
  99     }
 100     if(argc<0 || options[HELP_H].doesOccur || options[HELP_QUESTION_MARK].doesOccur) {
 101         /*
 102          * Broken into chucks because the C89 standard says the minimum
 103          * required supported string length is 509 bytes.
 104          */
 105         fprintf(stderr,
 106             "Usage: %s [-options] [suffix]\n"
 107             "\n"
 108             "read the UnicodeData.txt file and other Unicode properties files and\n"
 109             "create a binary file " DATA_NAME "." DATA_TYPE " with the character properties\n"
 110             "\n",
 111             argv[0]);
 112         fprintf(stderr,
 113             "Options:\n"
 114             "\t-h or -? or --help  this usage text\n"
 115             "\t-v or --verbose     verbose output\n"
 116             "\t-c or --copyright   include a copyright notice\n"
 117             "\t-u or --unicode     Unicode version, followed by the version like 3.0.0\n"
 118             "\t-C or --csource     generate a .c source file rather than the .icu binary\n");
 119         fprintf(stderr,
 120             "\t-d or --destdir     destination directory, followed by the path\n"
 121             "\t-s or --sourcedir   source directory, followed by the path\n"
 122             "\t-i or --icudatadir  directory for locating any needed intermediate data files,\n"
 123             "\t                    followed by path, defaults to %s\n"
 124             "\tsuffix              suffix that is to be appended with a '-'\n"
 125             "\t                    to the source file basenames before opening;\n"
 126             "\t                    'genprops new' will read UnicodeData-new.txt etc.\n",
 127             u_getDataDirectory());
 128         return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR;
 129     }
 130
 131     /* get the options values */
 132     beVerbose=options[VERBOSE].doesOccur;
 133     haveCopyright=options[COPYRIGHT].doesOccur;
 134     srcDir=options[SOURCEDIR].value;
 135     destDir=options[DESTDIR].value;
 136
 137     if(argc>=2) {
 138         suffix=argv[1];
 139     } else {
 140         suffix=NULL;
 141     }
 142
 143     if(options[UNICODE_VERSION].doesOccur) {
 144         setUnicodeVersion(options[UNICODE_VERSION].value);
 145     }
 146     /* else use the default dataVersion in store.c */
 147
 148     if (options[ICUDATADIR].doesOccur) {
 149         u_setDataDirectory(options[ICUDATADIR].value);
 150     }
 151
 152     /* prepare the filename beginning with the source dir */
 153     uprv_strcpy(filename, srcDir);
 154     basename=filename+uprv_strlen(filename);
 155     if(basename>filename && *(basename-1)!=U_FILE_SEP_CHAR) {
 156         *basename++=U_FILE_SEP_CHAR;
 157     }
 158
 159     /* initialize */
 160     initStore();
 161
 162     /* process UnicodeData.txt */
 163     writeUCDFilename(basename, "UnicodeData", suffix);
 164     parseDB(filename, &errorCode);
 165
 166     /* process additional properties files */
 167     *basename=0;
 168     generateAdditionalProperties(filename, suffix, &errorCode);
 169
 170     /* process parsed data */
 171     if(U_SUCCESS(errorCode)) {
 172         /* write the properties data file */
 173         generateData(destDir, options[CSOURCE].doesOccur);
 174     }
 175
 176     exitStore();
 177     u_cleanup();
 178     return errorCode;
 179 }
 180
 181 U_CFUNC void
 182 writeUCDFilename(char *basename, const char *filename, const char *suffix) {
 183     int32_t length=(int32_t)uprv_strlen(filename);
 184     uprv_strcpy(basename, filename);
 185     if(suffix!=NULL) {
 186         basename[length++]='-';
 187         uprv_strcpy(basename+length, suffix);
 188         length+=(int32_t)uprv_strlen(suffix);
 189     }
 190     uprv_strcpy(basename+length, ".txt");
 191 }
 192
 193 U_CFUNC UBool
 194 isToken(const char *token, const char *s) {
 195     const char *z;
 196     int32_t j;
 197
 198     s=u_skipWhitespace(s);
 199     for(j=0;; ++j) {
 200         if(token[j]!=0) {
 201             if(s[j]!=token[j]) {
 202                 break;
 203             }
 204         } else {
 205             z=u_skipWhitespace(s+j);
 206             if(*z==';' || *z==0) {
 207                 return TRUE;
 208             } else {
 209                 break;
 210             }
 211         }
 212     }
 213
 214     return FALSE;
 215 }
 216
 217 U_CFUNC int32_t
 218 getTokenIndex(const char *const tokens[], int32_t countTokens, const char *s) {
 219     const char *t, *z;
 220     int32_t i, j;
 221
 222     s=u_skipWhitespace(s);
 223     for(i=0; i<countTokens; ++i) {
 224         t=tokens[i];
 225         if(t!=NULL) {
 226             for(j=0;; ++j) {
 227                 if(t[j]!=0) {
 228                     if(s[j]!=t[j]) {
 229                         break;
 230                     }
 231                 } else {
 232                     z=u_skipWhitespace(s+j);
 233                     if(*z==';' || *z==0 || *z=='#' || *z=='\r' || *z=='\n') {
 234                         return i;
 235                     } else {
 236                         break;
 237                     }
 238                 }
 239             }
 240         }
 241     }
 242     return -1;
 243 }
 244
 245 /* parser for UnicodeData.txt ----------------------------------------------- */
 246
 247 /* general categories */
 248 const char *const
 249 genCategoryNames[U_CHAR_CATEGORY_COUNT]={
 250     "Cn",
 251     "Lu", "Ll", "Lt", "Lm", "Lo", "Mn", "Me",
 252     "Mc", "Nd", "Nl", "No",
 253     "Zs", "Zl", "Zp",
 254     "Cc", "Cf", "Co", "Cs",
 255     "Pd", "Ps", "Pe", "Pc", "Po",
 256     "Sm", "Sc", "Sk", "So",
 257     "Pi", "Pf"
 258 };
 259
 260 const char *const
 261 decompositionTypeNames[U_DT_COUNT]={
 262     NULL,
 263     NULL,
 264     "compat",
 265     "circle",
 266     "final",
 267     "font",
 268     "fraction",
 269     "initial",
 270     "isolated",
 271     "medial",
 272     "narrow",
 273     "noBreak",
 274     "small",
 275     "square",
 276     "sub",
 277     "super",
 278     "vertical",
 279     "wide"
 280 };
 281
 282 static struct {
 283     uint32_t first, last, props;
 284     char name[80];
 285 } unicodeAreas[32];
 286
 287 static int32_t unicodeAreaIndex=0;
 288
 289 static void U_CALLCONV
 290 unicodeDataLineFn(void *context,
 291                   char *fields[][2], int32_t fieldCount,
 292                   UErrorCode *pErrorCode) {
 293     Props p;
 294     char *end;
 295     static uint32_t prevCode=0;
 296     uint32_t value;
 297     int32_t i;
 298
 299     /* reset the properties */
 300     uprv_memset(&p, 0, sizeof(Props));
 301
 302     /* get the character code, field 0 */
 303     p.code=(uint32_t)uprv_strtoul(fields[0][0], &end, 16);
 304     if(end<=fields[0][0] || end!=fields[0][1]) {
 305         fprintf(stderr, "genprops: syntax error in field 0 at %s\n", fields[0][0]);
 306         *pErrorCode=U_PARSE_ERROR;
 307         exit(U_PARSE_ERROR);
 308     }
 309
 310     /* get general category, field 2 */
 311     i=getTokenIndex(genCategoryNames, U_CHAR_CATEGORY_COUNT, fields[2][0]);
 312     if(i>=0) {
 313         p.generalCategory=(uint8_t)i;
 314     } else {
 315         fprintf(stderr, "genprops: unknown general category \"%s\" at code 0x%lx\n",
 316             fields[2][0], (unsigned long)p.code);
 317         *pErrorCode=U_PARSE_ERROR;
 318         exit(U_PARSE_ERROR);
 319     }
 320
 321     /* get decomposition type, field 5 */
 322     if(fields[5][0]<fields[5][1]) {
 323         /* there is some decomposition */
 324         if(*fields[5][0]!='<') {
 325             /* canonical */
 326             i=U_DT_CANONICAL;
 327         } else {
 328             /* get compatibility type */
 329             end=fields[5][0]+1;
 330             while(end<fields[5][1] && *end!='>') {
 331                 ++end;
 332             }
 333             *end='#';
 334             i=getTokenIndex(decompositionTypeNames, U_DT_COUNT, fields[5][0]+1);
 335             if(i<0) {
 336                 fprintf(stderr, "genprops: unknown decomposition type \"%s\" at code 0x%lx\n",
 337                     fields[5][0], (unsigned long)p.code);
 338                 *pErrorCode=U_PARSE_ERROR;
 339                 exit(U_PARSE_ERROR);
 340             }
 341         }
 342         if(!upvec_setValue(pv, p.code, p.code+1, 2, (uint32_t)i, UPROPS_DT_MASK, pErrorCode)) {
 343             fprintf(stderr, "genprops error: unable to set decomposition type: %s\n", u_errorName(*pErrorCode));
 344             exit(*pErrorCode);
 345         }
 346     }
 347
 348     /* decimal digit value, field 6 */
 349     if(fields[6][0]<fields[6][1]) {
 350         value=(uint32_t)uprv_strtoul(fields[6][0], &end, 10);
 351         if(end!=fields[6][1] || value>0x7fff) {
 352             fprintf(stderr, "genprops: syntax error in field 6 at code 0x%lx\n",
 353                 (unsigned long)p.code);
 354             *pErrorCode=U_PARSE_ERROR;
 355             exit(U_PARSE_ERROR);
 356         }
 357         p.numericValue=(int32_t)value;
 358         p.numericType=1;
 359     }
 360
 361     /* digit value, field 7 */
 362     if(fields[7][0]<fields[7][1]) {
 363         value=(uint32_t)uprv_strtoul(fields[7][0], &end, 10);
 364         if(end!=fields[7][1] || value>0x7fff) {
 365             fprintf(stderr, "genprops: syntax error in field 7 at code 0x%lx\n",
 366                 (unsigned long)p.code);
 367             *pErrorCode=U_PARSE_ERROR;
 368             exit(U_PARSE_ERROR);
 369         }
 370         if(p.numericType==0) {
 371             p.numericValue=(int32_t)value;
 372             p.numericType=2;
 373         } else if((int32_t)value!=p.numericValue) {
 374             fprintf(stderr, "genprops error: numeric values in fields 6 & 7 different at code 0x%lx\n",
 375                 (unsigned long)p.code);
 376             *pErrorCode=U_PARSE_ERROR;
 377             exit(U_PARSE_ERROR);
 378         }
 379     }
 380
 381     /* numeric value, field 8 */
 382     if(fields[8][0]<fields[8][1]) {
 383         char *s=fields[8][0];
 384         UBool isNegative;
 385
 386         /* get a possible minus sign */
 387         if(*s=='-') {
 388             isNegative=TRUE;
 389             ++s;
 390         } else {
 391             isNegative=FALSE;
 392         }
 393
 394         value=(uint32_t)uprv_strtoul(s, &end, 10);
 395         if(value>0 && *end=='/') {
 396             /* field 8 may contain a fractional value, get the denominator */
 397             if(p.numericType>0) {
 398                 fprintf(stderr, "genprops error: numeric values in fields 6..8 different at code 0x%lx\n",
 399                     (unsigned long)p.code);
 400                 *pErrorCode=U_PARSE_ERROR;
 401                 exit(U_PARSE_ERROR);
 402             }
 403
 404             p.denominator=(uint32_t)uprv_strtoul(end+1, &end, 10);
 405             if(p.denominator==0) {
 406                 fprintf(stderr, "genprops: denominator is 0 in field 8 at code 0x%lx\n",
 407                     (unsigned long)p.code);
 408                 *pErrorCode=U_PARSE_ERROR;
 409                 exit(U_PARSE_ERROR);
 410             }
 411         }
 412         if(end!=fields[8][1] || value>0x7fffffff) {
 413             fprintf(stderr, "genprops: syntax error in field 8 at code 0x%lx\n",
 414                 (unsigned long)p.code);
 415             *pErrorCode=U_PARSE_ERROR;
 416             exit(U_PARSE_ERROR);
 417         }
 418
 419         if(p.numericType==0) {
 420             if(isNegative) {
 421                 p.numericValue=-(int32_t)value;
 422             } else {
 423                 p.numericValue=(int32_t)value;
 424             }
 425             p.numericType=3;
 426         } else if((int32_t)value!=p.numericValue) {
 427             fprintf(stderr, "genprops error: numeric values in fields 6..8 different at code 0x%lx\n",
 428                 (unsigned long)p.code);
 429             *pErrorCode=U_PARSE_ERROR;
 430             exit(U_PARSE_ERROR);
 431         }
 432     }
 433
 434     value=makeProps(&p);
 435
 436     if(*fields[1][0]=='<') {
 437         /* first or last entry of a Unicode area */
 438         size_t length=fields[1][1]-fields[1][0];
 439
 440         if(length<9) {
 441             /* name too short for an area name */
 442         } else if(0==uprv_memcmp(", First>", fields[1][1]-8, 8)) {
 443             /* set the current area */
 444             if(unicodeAreas[unicodeAreaIndex].first==0xffffffff) {
 445                 length-=9;
 446                 unicodeAreas[unicodeAreaIndex].first=p.code;
 447                 unicodeAreas[unicodeAreaIndex].props=value;
 448                 uprv_memcpy(unicodeAreas[unicodeAreaIndex].name, fields[1][0]+1, length);
 449                 unicodeAreas[unicodeAreaIndex].name[length]=0;
 450             } else {
 451                 /* error: a previous area is incomplete */
 452                 fprintf(stderr, "genprops: error - area \"%s\" is incomplete\n", unicodeAreas[unicodeAreaIndex].name);
 453                 *pErrorCode=U_PARSE_ERROR;
 454                 exit(U_PARSE_ERROR);
 455             }
 456             return;
 457         } else if(0==uprv_memcmp(", Last>", fields[1][1]-7, 7)) {
 458             /* check that the current area matches, and complete it with the last code point */
 459             length-=8;
 460             if( unicodeAreas[unicodeAreaIndex].props==value &&
 461                 0==uprv_memcmp(unicodeAreas[unicodeAreaIndex].name, fields[1][0]+1, length) &&
 462                 unicodeAreas[unicodeAreaIndex].name[length]==0 &&
 463                 unicodeAreas[unicodeAreaIndex].first<p.code
 464             ) {
 465                 unicodeAreas[unicodeAreaIndex].last=p.code;
 466                 if(beVerbose) {
 467                     printf("Unicode area U+%04lx..U+%04lx \"%s\"\n",
 468                         (unsigned long)unicodeAreas[unicodeAreaIndex].first,
 469                         (unsigned long)unicodeAreas[unicodeAreaIndex].last,
 470                         unicodeAreas[unicodeAreaIndex].name);
 471                 }
 472                 unicodeAreas[++unicodeAreaIndex].first=0xffffffff;
 473             } else {
 474                 /* error: different properties between first & last, different area name, first>=last */
 475                 fprintf(stderr, "genprops: error - Last of area \"%s\" is incorrect\n", unicodeAreas[unicodeAreaIndex].name);
 476                 *pErrorCode=U_PARSE_ERROR;
 477                 exit(U_PARSE_ERROR);
 478             }
 479             return;
 480         } else {
 481             /* not an area name */
 482         }
 483     }
 484
 485     /* check for non-character code points */
 486     if((p.code&0xfffe)==0xfffe || (uint32_t)(p.code-0xfdd0)<0x20) {
 487         fprintf(stderr, "genprops: error - properties for non-character code point U+%04lx\n",
 488                 (unsigned long)p.code);
 489         *pErrorCode=U_PARSE_ERROR;
 490         exit(U_PARSE_ERROR);
 491     }
 492
 493     /* check that the code points (p.code) are in ascending order */
 494     if(p.code<=prevCode && p.code>0) {
 495         fprintf(stderr, "genprops: error - UnicodeData entries out of order, U+%04lx after U+%04lx\n",
 496                 (unsigned long)p.code, (unsigned long)prevCode);
 497         *pErrorCode=U_PARSE_ERROR;
 498         exit(U_PARSE_ERROR);
 499     }
 500     prevCode=p.code;
 501
 502     /* properties for a single code point */
 503     addProps(p.code, value);
 504 }
 505
 506 /* set repeated properties for the areas */
 507 static void
 508 repeatAreaProps() {
 509     uint32_t puaProps;
 510     int32_t i;
 511     UBool hasPlane15PUA, hasPlane16PUA;
 512     UErrorCode errorCode;
 513
 514     /*
 515      * UnicodeData.txt before 3.0.1 did not contain the PUAs on
 516      * planes 15 and 16.
 517      * If that is the case, then we add them here, using the properties
 518      * from the BMP PUA.
 519      */
 520     puaProps=0;
 521     hasPlane15PUA=hasPlane16PUA=FALSE;
 522
 523     for(i=0; i<unicodeAreaIndex; ++i) {
 524         repeatProps(unicodeAreas[i].first,
 525                     unicodeAreas[i].last,
 526                     unicodeAreas[i].props);
 527         if(unicodeAreas[i].first==0xe000) {
 528             puaProps=unicodeAreas[i].props;
 529         } else if(unicodeAreas[i].first==0xf0000) {
 530             hasPlane15PUA=TRUE;
 531         } else if(unicodeAreas[i].first==0x100000) {
 532             hasPlane16PUA=TRUE;
 533         }
 534     }
 535
 536     if(puaProps!=0) {
 537         if(!hasPlane15PUA) {
 538             repeatProps(0xf0000, 0xffffd, puaProps);
 539         }
 540         if(!hasPlane16PUA) {
 541             repeatProps(0x100000, 0x10fffd, puaProps);
 542         }
 543     }
 544
 545     /* Hangul have canonical decompositions */
 546     errorCode=U_ZERO_ERROR;
 547     if(!upvec_setValue(pv, 0xac00, 0xd7a4, 2, (uint32_t)U_DT_CANONICAL, UPROPS_DT_MASK, &errorCode)) {
 548         fprintf(stderr, "genprops error: unable to set decomposition type: %s\n", u_errorName(errorCode));
 549         exit(errorCode);
 550     }
 551 }
 552
 553 static void
 554 parseDB(const char *filename, UErrorCode *pErrorCode) {
 555     char *fields[15][2];
 556
 557     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
 558         return;
 559     }
 560
 561     /* while unicodeAreas[unicodeAreaIndex] is unused, set its first to a bogus value */
 562     unicodeAreas[0].first=0xffffffff;
 563
 564     u_parseDelimitedFile(filename, ';', fields, 15, unicodeDataLineFn, NULL, pErrorCode);
 565
 566     if(unicodeAreas[unicodeAreaIndex].first!=0xffffffff) {
 567         fprintf(stderr, "genprops: error - the last area \"%s\" from U+%04lx is incomplete\n",
 568             unicodeAreas[unicodeAreaIndex].name,
 569             (unsigned long)unicodeAreas[unicodeAreaIndex].first);
 570         *pErrorCode=U_PARSE_ERROR;
 571         exit(U_PARSE_ERROR);
 572     }
 573
 574     repeatAreaProps();
 575
 576     if(U_FAILURE(*pErrorCode)) {
 577         return;
 578     }
 579 }
 580
 581 /*
 582  * Hey, Emacs, please set the following:
 583  *
 584  * Local Variables:
 585  * indent-tabs-mode: nil
 586  * End:
 587  *
 588  */