icuSources/tools/gennorm/gennorm.c

   1 /*
   2 *******************************************************************************
   3 *
   4 *   Copyright (C) 2001-2005, International Business Machines
   5 *   Corporation and others.  All Rights Reserved.
   6 *
   7 *******************************************************************************
   8 *   file name:  gennorm.c
   9 *   encoding:   US-ASCII
  10 *   tab size:   8 (not used)
  11 *   indentation:4
  12 *
  13 *   created on: 2001may25
  14 *   created by: Markus W. Scherer
  15 *
   16 *   This program reads Unicode character database text files
   17 *   (UnicodeData.txt and DerivedNormalizationProps.txt), parses them,
   18 *   and extracts the data for normalization. It then preprocesses the data
   19 *   and writes a binary file for efficient use in Unicode text normalization.
  20 */
  21
  22 #include <stdio.h>
  23 #include <stdlib.h>
  24 #include "unicode/utypes.h"
  25 #include "unicode/uchar.h"
  26 #include "unicode/ustring.h"
  27 #include "unicode/putil.h"
  28 #include "unicode/uclean.h"
  29 #include "unicode/udata.h"
  30 #include "unicode/uset.h"
  31 #include "cmemory.h"
  32 #include "cstring.h"
  33 #include "unewdata.h"
  34 #include "uoptions.h"
  35 #include "uparse.h"
  36 #include "unormimp.h"
  37
  38 U_CDECL_BEGIN
  39 #include "gennorm.h"
  40 U_CDECL_END
  41
   42 UBool beVerbose=FALSE, haveCopyright=TRUE; /* both are overwritten in main() from the -v/--verbose and -c/--copyright options */
  43
  44 /* prototypes --------------------------------------------------------------- */
  45
   46 static void
      /* parses the derived normalization properties file; with reportError==FALSE a U_FILE_ACCESS_ERROR is left to the caller */
   47 parseDerivedNormalizationProperties(const char *filename, UErrorCode *pErrorCode, UBool reportError);
   48
   49 static void
      /* parses UnicodeData.txt; prints a message and exits on any failure */
   50 parseDB(const char *filename, UErrorCode *pErrorCode);
  51
  52 /* -------------------------------------------------------------------------- */
  53
   54 enum {
      /*
       * Indexes into the options[] array:
       * the order of these constants must match the order of the options[] entries.
       */
   55     HELP_H,
   56     HELP_QUESTION_MARK,
   57     VERBOSE,
   58     COPYRIGHT,
   59     DESTDIR,
   60     SOURCEDIR,
   61     UNICODE_VERSION,
   62     ICUDATADIR,
   63     CSOURCE,
   64     STORE_FLAGS /* the -p/--prune option; its letters modify gStoreFlags in main() */
   65 };
  66
   67 static UOption options[]={
      /* command line options for u_parseArgs(); main() indexes this array with the enum constants defined above */
   68     UOPTION_HELP_H,
   69     UOPTION_HELP_QUESTION_MARK,
   70     UOPTION_VERBOSE,
   71     UOPTION_COPYRIGHT,
   72     UOPTION_DESTDIR,
   73     UOPTION_SOURCEDIR,
   74     UOPTION_DEF("unicode", 'u', UOPT_REQUIRES_ARG), /* UNICODE_VERSION */
   75     UOPTION_ICUDATADIR,
   76     UOPTION_DEF("csource", 'C', UOPT_NO_ARG), /* CSOURCE */
   77     UOPTION_DEF("prune", 'p', UOPT_REQUIRES_ARG) /* STORE_FLAGS */
   78 };
  79
  80 extern int
  81 main(int argc, char* argv[]) {
  82 #if !UCONFIG_NO_NORMALIZATION
  83     char filename[300];
  84 #endif
  85     const char *srcDir=NULL, *destDir=NULL, *suffix=NULL;
  86     char *basename=NULL;
  87     UErrorCode errorCode=U_ZERO_ERROR;
  88
  89     U_MAIN_INIT_ARGS(argc, argv);
  90
  91     /* preset then read command line options */
  92     options[4].value=u_getDataDirectory();
  93     options[5].value="";
  94     options[6].value="3.0.0";
  95     options[ICUDATADIR].value=u_getDataDirectory();
  96     argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), options);
  97
  98     /* error handling, printing usage message */
  99     if(argc<0) {
 100         fprintf(stderr,
 101             "error in command line argument \"%s\"\n",
 102             argv[-argc]);
 103     }
 104     if(argc<0 || options[0].doesOccur || options[1].doesOccur) {
 105         /*
 106          * Broken into chucks because the C89 standard says the minimum
 107          * required supported string length is 509 bytes.
 108          */
 109         fprintf(stderr,
 110             "Usage: %s [-options] [suffix]\n"
 111             "\n"
 112             "Read the UnicodeData.txt file and other Unicode properties files and\n"
 113             "create a binary file " U_ICUDATA_NAME "_" DATA_NAME "." DATA_TYPE " with the normalization data\n"
 114             "\n",
 115             argv[0]);
 116         fprintf(stderr,
 117             "Options:\n"
 118             "\t-h or -? or --help  this usage text\n"
 119             "\t-v or --verbose     verbose output\n"
 120             "\t-c or --copyright   include a copyright notice\n"
 121             "\t-u or --unicode     Unicode version, followed by the version like 3.0.0\n"
 122             "\t-C or --csource     generate a .c source file rather than the .icu binary\n");
 123         fprintf(stderr,
 124             "\t-p or --prune flags Prune for data modularization:\n"
 125             "\t                    Determine what data is to be stored.\n"
 126             "\t        0 (zero) stores minimal data (only for NFD)\n"
 127             "\t        lowercase letters turn off data, uppercase turn on (use with 0)\n");
 128         fprintf(stderr,
 129             "\t        k: compatibility decompositions (NFKC, NFKD)\n"
 130             "\t        c: composition data (NFC, NFKC)\n"
 131             "\t        f: FCD data (will be generated at load time)\n"
 132             "\t        a: auxiliary data (canonical closure etc.)\n"
 133             "\t        x: exclusion sets (Unicode 3.2-level normalization)\n");
 134         fprintf(stderr,
 135             "\t-d or --destdir     destination directory, followed by the path\n"
 136             "\t-s or --sourcedir   source directory, followed by the path\n"
 137             "\t-i or --icudatadir  directory for locating any needed intermediate data files,\n"
 138             "\t                    followed by path, defaults to <%s>\n"
 139             "\tsuffix              suffix that is to be appended with a '-'\n"
 140             "\t                    to the source file basenames before opening;\n"
 141             "\t                    'gennorm new' will read UnicodeData-new.txt etc.\n",
 142             u_getDataDirectory());
 143         return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR;
 144     }
 145
 146     /* get the options values */
 147     beVerbose=options[2].doesOccur;
 148     haveCopyright=options[3].doesOccur;
 149     srcDir=options[5].value;
 150     destDir=options[4].value;
 151
 152     if(argc>=2) {
 153         suffix=argv[1];
 154     } else {
 155         suffix=NULL;
 156     }
 157
 158 #if UCONFIG_NO_NORMALIZATION
 159
 160     fprintf(stderr,
 161         "gennorm writes a dummy " U_ICUDATA_NAME "_" DATA_NAME "." DATA_TYPE
 162         " because UCONFIG_NO_NORMALIZATION is set, \n"
 163         "see icu/source/common/unicode/uconfig.h\n");
 164     generateData(destDir, options[CSOURCE].doesOccur);
 165
 166 #else
 167
 168     setUnicodeVersion(options[6].value);
 169
 170     if (options[ICUDATADIR].doesOccur) {
 171         u_setDataDirectory(options[ICUDATADIR].value);
 172     }
 173
 174     if(options[STORE_FLAGS].doesOccur) {
 175         const char *s=options[STORE_FLAGS].value;
 176         char c;
 177
 178         while((c=*s++)!=0) {
 179             switch(c) {
 180             case '0':
 181                 gStoreFlags=0;  /* store minimal data (only for NFD) */
 182                 break;
 183
 184             /* lowercase letters: omit data */
 185             case 'k':
 186                 gStoreFlags&=~U_MASK(UGENNORM_STORE_COMPAT);
 187                 break;
 188             case 'c':
 189                 gStoreFlags&=~U_MASK(UGENNORM_STORE_COMPOSITION);
 190                 break;
 191             case 'f':
 192                 gStoreFlags&=~U_MASK(UGENNORM_STORE_FCD);
 193                 break;
 194             case 'a':
 195                 gStoreFlags&=~U_MASK(UGENNORM_STORE_AUX);
 196                 break;
 197             case 'x':
 198                 gStoreFlags&=~U_MASK(UGENNORM_STORE_EXCLUSIONS);
 199                 break;
 200
 201             /* uppercase letters: include data (use with 0) */
 202             case 'K':
 203                 gStoreFlags|=U_MASK(UGENNORM_STORE_COMPAT);
 204                 break;
 205             case 'C':
 206                 gStoreFlags|=U_MASK(UGENNORM_STORE_COMPOSITION);
 207                 break;
 208             case 'F':
 209                 gStoreFlags|=U_MASK(UGENNORM_STORE_FCD);
 210                 break;
 211             case 'A':
 212                 gStoreFlags|=U_MASK(UGENNORM_STORE_AUX);
 213                 break;
 214             case 'X':
 215                 gStoreFlags|=U_MASK(UGENNORM_STORE_EXCLUSIONS);
 216                 break;
 217
 218             default:
 219                 fprintf(stderr, "ignoring undefined prune flag '%c'\n", c);
 220                 break;
 221             }
 222         }
 223     }
 224
 225     /*
 226      * Verify that we can work with properties
 227      * but don't call u_init() because that needs unorm.icu which we are just
 228      * going to build here.
 229      */
 230     {
 231         U_STRING_DECL(ideo, "[:Ideographic:]", 15);
 232         USet *set;
 233
 234         U_STRING_INIT(ideo, "[:Ideographic:]", 15);
 235         set=uset_openPattern(ideo, -1, &errorCode);
 236         if(U_FAILURE(errorCode) || !uset_contains(set, 0xf900)) {
 237             fprintf(stderr, "gennorm is unable to work with properties (uprops.icu): %s\n", u_errorName(errorCode));
 238             exit(errorCode);
 239         }
 240         uset_close(set);
 241     }
 242
 243     /* prepare the filename beginning with the source dir */
 244     uprv_strcpy(filename, srcDir);
 245     basename=filename+uprv_strlen(filename);
 246     if(basename>filename && *(basename-1)!=U_FILE_SEP_CHAR) {
 247         *basename++=U_FILE_SEP_CHAR;
 248     }
 249
 250     /* initialize */
 251     init();
 252
 253     /* process DerivedNormalizationProps.txt (name changed for Unicode 3.2, to <=31 characters) */
 254     if(suffix==NULL) {
 255         uprv_strcpy(basename, "DerivedNormalizationProps.txt");
 256     } else {
 257         uprv_strcpy(basename, "DerivedNormalizationProps");
 258         basename[30]='-';
 259         uprv_strcpy(basename+31, suffix);
 260         uprv_strcat(basename+31, ".txt");
 261     }
 262     parseDerivedNormalizationProperties(filename, &errorCode, FALSE);
 263     if(U_FAILURE(errorCode)) {
 264         /* can be only U_FILE_ACCESS_ERROR - try filename from before Unicode 3.2 */
 265         if(suffix==NULL) {
 266             uprv_strcpy(basename, "DerivedNormalizationProperties.txt");
 267         } else {
 268             uprv_strcpy(basename, "DerivedNormalizationProperties");
 269             basename[30]='-';
 270             uprv_strcpy(basename+31, suffix);
 271             uprv_strcat(basename+31, ".txt");
 272         }
 273         parseDerivedNormalizationProperties(filename, &errorCode, TRUE);
 274     }
 275
 276     /* process UnicodeData.txt */
 277     if(suffix==NULL) {
 278         uprv_strcpy(basename, "UnicodeData.txt");
 279     } else {
 280         uprv_strcpy(basename, "UnicodeData");
 281         basename[11]='-';
 282         uprv_strcpy(basename+12, suffix);
 283         uprv_strcat(basename+12, ".txt");
 284     }
 285     parseDB(filename, &errorCode);
 286
 287     /* process parsed data */
 288     if(U_SUCCESS(errorCode)) {
 289         processData();
 290
 291         /* write the properties data file */
 292         generateData(destDir, options[CSOURCE].doesOccur);
 293
 294         cleanUpData();
 295     }
 296
 297 #endif
 298
 299     return errorCode;
 300 }
 301
 302 #if !UCONFIG_NO_NORMALIZATION
 303
 304 /* parser for DerivedNormalizationProperties.txt ---------------------------- */
 305
 306 static void U_CALLCONV
 307 derivedNormalizationPropertiesLineFn(void *context,
 308                                      char *fields[][2], int32_t fieldCount,
 309                                      UErrorCode *pErrorCode) {
 310     UChar string[32];
 311     char *s;
 312     uint32_t start, end;
 313     int32_t count;
 314     uint8_t qcFlags;
 315
 316     /* get code point range */
 317     count=u_parseCodePointRange(fields[0][0], &start, &end, pErrorCode);
 318     if(U_FAILURE(*pErrorCode)) {
 319         fprintf(stderr, "gennorm: error parsing DerivedNormalizationProperties.txt mapping at %s\n", fields[0][0]);
 320         exit(*pErrorCode);
 321     }
 322
 323     /* ignore hangul - handle explicitly */
 324     if(start==0xac00) {
 325         return;
 326     }
 327
 328     /* get property - ignore unrecognized ones */
 329     s=(char *)u_skipWhitespace(fields[1][0]);
 330     if(*s=='N' && s[1]=='F') {
 331         /* quick check flag */
 332         qcFlags=0x11;
 333         s+=2;
 334         if(*s=='K') {
 335             qcFlags<<=1;
 336             ++s;
 337         }
 338
 339         if(*s=='C' && s[1]=='_') {
 340             s+=2;
 341         } else if(*s=='D' && s[1]=='_') {
 342             qcFlags<<=2;
 343             s+=2;
 344         } else {
 345             return;
 346         }
 347
 348         if(0==uprv_strncmp(s, "NO", 2)) {
 349             qcFlags&=0xf;
 350         } else if(0==uprv_strncmp(s, "MAYBE", 5)) {
 351             qcFlags&=0x30;
 352         } else if(0==uprv_strncmp(s, "QC", 2) && *(s=(char *)u_skipWhitespace(s+2))==';') {
 353             /*
 354              * Unicode 4.0.1:
 355              * changes single field "NFD_NO" -> two fields "NFD_QC; N" etc.
 356              */
 357             /* start of the field */
 358             s=(char *)u_skipWhitespace(s+1);
 359             if(*s=='N') {
 360                 qcFlags&=0xf;
 361             } else if(*s=='M') {
 362                 qcFlags&=0x30;
 363             } else {
 364                 return; /* do nothing for "Yes" because it's the default value */
 365             }
 366         } else {
 367             return; /* do nothing for "Yes" because it's the default value */
 368         }
 369
 370         /* set this flag for all code points in this range */
 371         while(start<=end) {
 372             setQCFlags(start++, qcFlags);
 373         }
 374     } else if(0==uprv_memcmp(s, "Comp_Ex", 7) || 0==uprv_memcmp(s, "Full_Composition_Exclusion", 26)) {
 375         /* full composition exclusion */
 376         while(start<=end) {
 377             setCompositionExclusion(start++);
 378         }
 379     } else if(
 380         ((0==uprv_memcmp(s, "FNC", 3) && *(s=(char *)u_skipWhitespace(s+3))==';') ||
 381         (0==uprv_memcmp(s, "FC_NFKC", 7) && *(s=(char *)u_skipWhitespace(s+7))==';'))
 382
 383     ) {
 384         /* FC_NFKC_Closure, parse field 2 to get the string */
 385         char *t;
 386
 387         /* start of the field */
 388         s=(char *)u_skipWhitespace(s+1);
 389
 390         /* find the end of the field */
 391         for(t=s; *t!=';' && *t!='#' && *t!=0 && *t!='\n' && *t!='\r'; ++t) {}
 392         *t=0;
 393
 394         string[0]=(UChar)u_parseString(s, string+1, 31, NULL, pErrorCode);
 395         if(U_FAILURE(*pErrorCode)) {
 396             fprintf(stderr, "gennorm error: illegal FNC string at %s\n", fields[0][0]);
 397             exit(*pErrorCode);
 398         }
 399         while(start<=end) {
 400             setFNC(start++, string);
 401         }
 402     }
 403 }
 404
 405 static void
 406 parseDerivedNormalizationProperties(const char *filename, UErrorCode *pErrorCode, UBool reportError) {
 407     char *fields[2][2];
 408
 409     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
 410         return;
 411     }
 412
 413     u_parseDelimitedFile(filename, ';', fields, 2, derivedNormalizationPropertiesLineFn, NULL, pErrorCode);
 414     if(U_FAILURE(*pErrorCode) && (reportError || *pErrorCode!=U_FILE_ACCESS_ERROR)) {
 415         fprintf(stderr, "gennorm error: u_parseDelimitedFile(\"%s\") failed - %s\n", filename, u_errorName(*pErrorCode));
 416         exit(*pErrorCode);
 417     }
 418 }
 419
 420 /* parser for UnicodeData.txt ----------------------------------------------- */
 421
 422 static void U_CALLCONV
 423 unicodeDataLineFn(void *context,
 424                   char *fields[][2], int32_t fieldCount,
 425                   UErrorCode *pErrorCode) {
 426     uint32_t decomp[40];
 427     Norm norm;
 428     const char *s;
 429     char *end;
 430     uint32_t code, value;
 431     int32_t length;
 432     UBool isCompat, something=FALSE;
 433
 434     /* ignore First and Last entries for ranges */
 435     if( *fields[1][0]=='<' &&
 436         (length=(int32_t)(fields[1][1]-fields[1][0]))>=9 &&
 437         (0==uprv_memcmp(", First>", fields[1][1]-8, 8) || 0==uprv_memcmp(", Last>", fields[1][1]-7, 7))
 438     ) {
 439         return;
 440     }
 441
 442     /* reset the properties */
 443     uprv_memset(&norm, 0, sizeof(Norm));
 444
 445     /*
 446      * The combiningIndex must not be initialized to 0 because 0 is the
 447      * combiningIndex of the first forward-combining character.
 448      */
 449     norm.combiningIndex=0xffff;
 450
 451     /* get the character code, field 0 */
 452     code=(uint32_t)uprv_strtoul(fields[0][0], &end, 16);
 453     if(end<=fields[0][0] || end!=fields[0][1]) {
 454         fprintf(stderr, "gennorm: syntax error in field 0 at %s\n", fields[0][0]);
 455         *pErrorCode=U_PARSE_ERROR;
 456         exit(U_PARSE_ERROR);
 457     }
 458
 459     /* get canonical combining class, field 3 */
 460     value=(uint32_t)uprv_strtoul(fields[3][0], &end, 10);
 461     if(end<=fields[3][0] || end!=fields[3][1] || value>0xff) {
 462         fprintf(stderr, "gennorm: syntax error in field 3 at %s\n", fields[0][0]);
 463         *pErrorCode=U_PARSE_ERROR;
 464         exit(U_PARSE_ERROR);
 465     }
 466     if(value>0) {
 467         norm.udataCC=(uint8_t)value;
 468         something=TRUE;
 469     }
 470
 471     /* get the decomposition, field 5 */
 472     if(fields[5][0]<fields[5][1]) {
 473         if(*(s=fields[5][0])=='<') {
 474             ++s;
 475             isCompat=TRUE;
 476
 477             /* skip and ignore the compatibility type name */
 478             do {
 479                 if(s==fields[5][1]) {
 480                     /* missing '>' */
 481                     fprintf(stderr, "gennorm: syntax error in field 5 at %s\n", fields[0][0]);
 482                     *pErrorCode=U_PARSE_ERROR;
 483                     exit(U_PARSE_ERROR);
 484                 }
 485             } while(*s++!='>');
 486         } else {
 487             isCompat=FALSE;
 488         }
 489
 490         /* parse the decomposition string */
 491         length=u_parseCodePoints(s, decomp, sizeof(decomp)/4, pErrorCode);
 492         if(U_FAILURE(*pErrorCode)) {
 493             fprintf(stderr, "gennorm error parsing UnicodeData.txt decomposition of U+%04lx - %s\n",
 494                     (long)code, u_errorName(*pErrorCode));
 495             exit(*pErrorCode);
 496         }
 497
 498         /* store the string */
 499         if(length>0) {
 500             something=TRUE;
 501             if(isCompat) {
 502                 norm.lenNFKD=(uint8_t)length;
 503                 norm.nfkd=decomp;
 504             } else {
 505                 if(length>2) {
 506                     fprintf(stderr, "gennorm: error - length of NFD(U+%04lx) = %ld >2 in UnicodeData - illegal\n",
 507                             (long)code, (long)length);
 508                     *pErrorCode=U_PARSE_ERROR;
 509                     exit(U_PARSE_ERROR);
 510                 }
 511                 norm.lenNFD=(uint8_t)length;
 512                 norm.nfd=decomp;
 513             }
 514         }
 515     }
 516
 517     /* check for non-character code points */
 518     if((code&0xfffe)==0xfffe || (uint32_t)(code-0xfdd0)<0x20 || code>0x10ffff) {
 519         fprintf(stderr, "gennorm: error - properties for non-character code point U+%04lx\n",
 520                 (long)code);
 521         *pErrorCode=U_PARSE_ERROR;
 522         exit(U_PARSE_ERROR);
 523     }
 524
 525     if(something) {
 526         /* there are normalization values, so store them */
 527 #if 0
 528         if(beVerbose) {
 529             printf("store values for U+%04lx: cc=%d, lenNFD=%ld, lenNFKD=%ld\n",
 530                    (long)code, norm.udataCC, (long)norm.lenNFD, (long)norm.lenNFKD);
 531         }
 532 #endif
 533         storeNorm(code, &norm);
 534     }
 535 }
 536
 537 static void
 538 parseDB(const char *filename, UErrorCode *pErrorCode) {
 539     char *fields[15][2];
 540
 541     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
 542         return;
 543     }
 544
 545     u_parseDelimitedFile(filename, ';', fields, 15, unicodeDataLineFn, NULL, pErrorCode);
 546     if(U_FAILURE(*pErrorCode)) {
 547         fprintf(stderr, "gennorm error: u_parseDelimitedFile(\"%s\") failed - %s\n", filename, u_errorName(*pErrorCode));
 548         exit(*pErrorCode);
 549     }
 550 }
 551
 552 #endif /* #if !UCONFIG_NO_NORMALIZATION */
 553
 554 /*
 555  * Hey, Emacs, please set the following:
 556  *
 557  * Local Variables:
 558  * indent-tabs-mode: nil
 559  * End:
 560  *
 561  */