icuSources/tools/gennorm/gennorm.c

   1 /*
   2 *******************************************************************************
   3 *
   4 *   Copyright (C) 2001-2004, International Business Machines
   5 *   Corporation and others.  All Rights Reserved.
   6 *
   7 *******************************************************************************
   8 *   file name:  gennorm.c
   9 *   encoding:   US-ASCII
  10 *   tab size:   8 (not used)
  11 *   indentation:4
  12 *
  13 *   created on: 2001may25
  14 *   created by: Markus W. Scherer
  15 *
  16 *   This program reads the Unicode character database text file,
  17 *   parses it, and extracts the data for normalization.
  18 *   It then preprocesses it and writes a binary file for efficient use
  19 *   in various Unicode text normalization processes.
  20 */
  21
  22 #include <stdio.h>
  23 #include <stdlib.h>
  24 #include "unicode/utypes.h"
  25 #include "unicode/uchar.h"
  26 #include "unicode/ustring.h"
  27 #include "unicode/putil.h"
  28 #include "unicode/uclean.h"
  29 #include "unicode/udata.h"
  30 #include "unicode/uset.h"
  31 #include "cmemory.h"
  32 #include "cstring.h"
  33 #include "unewdata.h"
  34 #include "uoptions.h"
  35 #include "uparse.h"
  36 #include "unormimp.h"
  37
  38 U_CDECL_BEGIN
  39 #include "gennorm.h"
  40 U_CDECL_END
  41
  42 #ifdef WIN32
  43 #   pragma warning(disable: 4100)
  44 #endif
  45
  46 UBool beVerbose=FALSE, haveCopyright=TRUE;
  47
  48 /* prototypes --------------------------------------------------------------- */
  49
/*
 * Parse the derived normalization properties file with u_parseDelimitedFile().
 * Does nothing if pErrorCode is NULL or already indicates a failure.
 * On a parsing failure the program exits, except that U_FILE_ACCESS_ERROR
 * is left in *pErrorCode without exiting when reportError is FALSE.
 */
static void
parseDerivedNormalizationProperties(const char *filename, UErrorCode *pErrorCode, UBool reportError);

/*
 * Parse UnicodeData.txt with u_parseDelimitedFile().
 * Does nothing if pErrorCode is NULL or already indicates a failure.
 * Any parsing failure is reported on stderr and terminates the program.
 */
static void
parseDB(const char *filename, UErrorCode *pErrorCode);
  55
  56 /* -------------------------------------------------------------------------- */
  57
/*
 * Indexes into the options[] array.
 * The order of these constants must match the order of the
 * initializers of options[] because main() uses them as array indexes.
 */
enum {
    HELP_H,
    HELP_QUESTION_MARK,
    VERBOSE,
    COPYRIGHT,
    DESTDIR,
    SOURCEDIR,
    UNICODE_VERSION,
    ICUDATADIR
};
  68
/*
 * Command line options, passed to u_parseArgs() in main().
 * main() presets the values of the destdir, sourcedir, unicode and
 * icudatadir entries before parsing.
 */
static UOption options[]={
    UOPTION_HELP_H,
    UOPTION_HELP_QUESTION_MARK,
    UOPTION_VERBOSE,
    UOPTION_COPYRIGHT,
    UOPTION_DESTDIR,
    UOPTION_SOURCEDIR,
    /* -u or --unicode, requires the Unicode version as its argument */
    { "unicode", NULL, NULL, NULL, 'u', UOPT_REQUIRES_ARG, 0 },
    UOPTION_ICUDATADIR
};
  79
  80 extern int
  81 main(int argc, char* argv[]) {
  82 #if !UCONFIG_NO_NORMALIZATION
  83     char filename[300];
  84 #endif
  85     const char *srcDir=NULL, *destDir=NULL, *suffix=NULL;
  86     char *basename=NULL;
  87     UErrorCode errorCode=U_ZERO_ERROR;
  88
  89     U_MAIN_INIT_ARGS(argc, argv);
  90
  91     /* preset then read command line options */
  92     options[4].value=u_getDataDirectory();
  93     options[5].value="";
  94     options[6].value="3.0.0";
  95     options[ICUDATADIR].value=u_getDataDirectory();
  96     argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), options);
  97
  98     /* error handling, printing usage message */
  99     if(argc<0) {
 100         fprintf(stderr,
 101             "error in command line argument \"%s\"\n",
 102             argv[-argc]);
 103     }
 104     if(argc<0 || options[0].doesOccur || options[1].doesOccur) {
 105         /*
 106          * Broken into chucks because the C89 standard says the minimum
 107          * required supported string length is 509 bytes.
 108          */
 109         fprintf(stderr,
 110             "Usage: %s [-options] [suffix]\n"
 111             "\n"
 112             "Read the UnicodeData.txt file and other Unicode properties files and\n"
 113             "create a binary file " U_ICUDATA_NAME "_" DATA_NAME "." DATA_TYPE " with the normalization data\n"
 114             "\n",
 115             argv[0]);
 116         fprintf(stderr,
 117             "Options:\n"
 118             "\t-h or -? or --help  this usage text\n"
 119             "\t-v or --verbose     verbose output\n"
 120             "\t-c or --copyright   include a copyright notice\n"
 121             "\t-u or --unicode     Unicode version, followed by the version like 3.0.0\n");
 122         fprintf(stderr,
 123             "\t-d or --destdir     destination directory, followed by the path\n"
 124             "\t-s or --sourcedir   source directory, followed by the path\n"
 125             "\t-i or --icudatadir  directory for locating any needed intermediate data files,\n"
 126             "\t                    followed by path, defaults to <%s>\n"
 127             "\tsuffix              suffix that is to be appended with a '-'\n"
 128             "\t                    to the source file basenames before opening;\n"
 129             "\t                    'gennorm new' will read UnicodeData-new.txt etc.\n",
 130             u_getDataDirectory());
 131         return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR;
 132     }
 133
 134     /* get the options values */
 135     beVerbose=options[2].doesOccur;
 136     haveCopyright=options[3].doesOccur;
 137     srcDir=options[5].value;
 138     destDir=options[4].value;
 139
 140     if(argc>=2) {
 141         suffix=argv[1];
 142     } else {
 143         suffix=NULL;
 144     }
 145
 146 #if UCONFIG_NO_NORMALIZATION
 147
 148     fprintf(stderr,
 149         "gennorm writes a dummy " U_ICUDATA_NAME "_" DATA_NAME "." DATA_TYPE
 150         " because UCONFIG_NO_NORMALIZATION is set, \n"
 151         "see icu/source/common/unicode/uconfig.h\n");
 152     generateData(destDir);
 153
 154 #else
 155
 156     setUnicodeVersion(options[6].value);
 157
 158     if (options[ICUDATADIR].doesOccur) {
 159         u_setDataDirectory(options[ICUDATADIR].value);
 160     }
 161
 162     /*
 163      * Verify that we can work with properties
 164      * but don't call u_init() because that needs unorm.icu which we are just
 165      * going to build here.
 166      */
 167     {
 168         U_STRING_DECL(ideo, "[:Ideographic:]", 15);
 169         USet *set;
 170
 171         U_STRING_INIT(ideo, "[:Ideographic:]", 15);
 172         set=uset_openPattern(ideo, -1, &errorCode);
 173         if(U_FAILURE(errorCode) || !uset_contains(set, 0xf900)) {
 174             fprintf(stderr, "gennorm is unable to work with properties (uprops.icu): %s\n", u_errorName(errorCode));
 175             exit(errorCode);
 176         }
 177         uset_close(set);
 178     }
 179
 180     /* prepare the filename beginning with the source dir */
 181     uprv_strcpy(filename, srcDir);
 182     basename=filename+uprv_strlen(filename);
 183     if(basename>filename && *(basename-1)!=U_FILE_SEP_CHAR) {
 184         *basename++=U_FILE_SEP_CHAR;
 185     }
 186
 187     /* initialize */
 188     init();
 189
 190     /* process DerivedNormalizationProps.txt (name changed for Unicode 3.2, to <=31 characters) */
 191     if(suffix==NULL) {
 192         uprv_strcpy(basename, "DerivedNormalizationProps.txt");
 193     } else {
 194         uprv_strcpy(basename, "DerivedNormalizationProps");
 195         basename[30]='-';
 196         uprv_strcpy(basename+31, suffix);
 197         uprv_strcat(basename+31, ".txt");
 198     }
 199     parseDerivedNormalizationProperties(filename, &errorCode, FALSE);
 200     if(U_FAILURE(errorCode)) {
 201         /* can be only U_FILE_ACCESS_ERROR - try filename from before Unicode 3.2 */
 202         if(suffix==NULL) {
 203             uprv_strcpy(basename, "DerivedNormalizationProperties.txt");
 204         } else {
 205             uprv_strcpy(basename, "DerivedNormalizationProperties");
 206             basename[30]='-';
 207             uprv_strcpy(basename+31, suffix);
 208             uprv_strcat(basename+31, ".txt");
 209         }
 210         parseDerivedNormalizationProperties(filename, &errorCode, TRUE);
 211     }
 212
 213     /* process UnicodeData.txt */
 214     if(suffix==NULL) {
 215         uprv_strcpy(basename, "UnicodeData.txt");
 216     } else {
 217         uprv_strcpy(basename, "UnicodeData");
 218         basename[11]='-';
 219         uprv_strcpy(basename+12, suffix);
 220         uprv_strcat(basename+12, ".txt");
 221     }
 222     parseDB(filename, &errorCode);
 223
 224     /* process parsed data */
 225     if(U_SUCCESS(errorCode)) {
 226         processData();
 227
 228         /* write the properties data file */
 229         generateData(destDir);
 230
 231         cleanUpData();
 232     }
 233
 234 #endif
 235
 236     return errorCode;
 237 }
 238
 239 #if !UCONFIG_NO_NORMALIZATION
 240
 241 /* parser for DerivedNormalizationProperties.txt ---------------------------- */
 242
 243 static void U_CALLCONV
 244 derivedNormalizationPropertiesLineFn(void *context,
 245                                      char *fields[][2], int32_t fieldCount,
 246                                      UErrorCode *pErrorCode) {
 247     UChar string[32];
 248     char *s;
 249     uint32_t start, end;
 250     int32_t count;
 251     uint8_t qcFlags;
 252
 253     /* get code point range */
 254     count=u_parseCodePointRange(fields[0][0], &start, &end, pErrorCode);
 255     if(U_FAILURE(*pErrorCode)) {
 256         fprintf(stderr, "gennorm: error parsing DerivedNormalizationProperties.txt mapping at %s\n", fields[0][0]);
 257         exit(*pErrorCode);
 258     }
 259
 260     /* ignore hangul - handle explicitly */
 261     if(start==0xac00) {
 262         return;
 263     }
 264
 265     /* get property - ignore unrecognized ones */
 266     s=(char *)u_skipWhitespace(fields[1][0]);
 267     if(*s=='N' && s[1]=='F') {
 268         /* quick check flag */
 269         qcFlags=0x11;
 270         s+=2;
 271         if(*s=='K') {
 272             qcFlags<<=1;
 273             ++s;
 274         }
 275
 276         if(*s=='C' && s[1]=='_') {
 277             s+=2;
 278         } else if(*s=='D' && s[1]=='_') {
 279             qcFlags<<=2;
 280             s+=2;
 281         } else {
 282             return;
 283         }
 284
 285         if(0==uprv_strncmp(s, "NO", 2)) {
 286             qcFlags&=0xf;
 287         } else if(0==uprv_strncmp(s, "MAYBE", 5)) {
 288             qcFlags&=0x30;
 289         } else if(0==uprv_strncmp(s, "QC", 2) && *(s=(char *)u_skipWhitespace(s+2))==';') {
 290             /*
 291              * Unicode 4.0.1:
 292              * changes single field "NFD_NO" -> two fields "NFD_QC; N" etc.
 293              */
 294             /* start of the field */
 295             s=(char *)u_skipWhitespace(s+1);
 296             if(*s=='N') {
 297                 qcFlags&=0xf;
 298             } else if(*s=='M') {
 299                 qcFlags&=0x30;
 300             } else {
 301                 return; /* do nothing for "Yes" because it's the default value */
 302             }
 303         } else {
 304             return; /* do nothing for "Yes" because it's the default value */
 305         }
 306
 307         /* set this flag for all code points in this range */
 308         while(start<=end) {
 309             setQCFlags(start++, qcFlags);
 310         }
 311     } else if(0==uprv_memcmp(s, "Comp_Ex", 7) || 0==uprv_memcmp(s, "Full_Composition_Exclusion", 26)) {
 312         /* full composition exclusion */
 313         while(start<=end) {
 314             setCompositionExclusion(start++);
 315         }
 316     } else if(
 317         ((0==uprv_memcmp(s, "FNC", 3) && *(s=(char *)u_skipWhitespace(s+3))==';') ||
 318         (0==uprv_memcmp(s, "FC_NFKC", 7) && *(s=(char *)u_skipWhitespace(s+7))==';'))
 319
 320     ) {
 321         /* FC_NFKC_Closure, parse field 2 to get the string */
 322         char *t;
 323
 324         /* start of the field */
 325         s=(char *)u_skipWhitespace(s+1);
 326
 327         /* find the end of the field */
 328         for(t=s; *t!=';' && *t!='#' && *t!=0 && *t!='\n' && *t!='\r'; ++t) {}
 329         *t=0;
 330
 331         string[0]=(UChar)u_parseString(s, string+1, 31, NULL, pErrorCode);
 332         if(U_FAILURE(*pErrorCode)) {
 333             fprintf(stderr, "gennorm error: illegal FNC string at %s\n", fields[0][0]);
 334             exit(*pErrorCode);
 335         }
 336         while(start<=end) {
 337             setFNC(start++, string);
 338         }
 339     }
 340 }
 341
 342 static void
 343 parseDerivedNormalizationProperties(const char *filename, UErrorCode *pErrorCode, UBool reportError) {
 344     char *fields[2][2];
 345
 346     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
 347         return;
 348     }
 349
 350     u_parseDelimitedFile(filename, ';', fields, 2, derivedNormalizationPropertiesLineFn, NULL, pErrorCode);
 351     if(U_FAILURE(*pErrorCode) && (reportError || *pErrorCode!=U_FILE_ACCESS_ERROR)) {
 352         fprintf(stderr, "gennorm error: u_parseDelimitedFile(\"%s\") failed - %s\n", filename, u_errorName(*pErrorCode));
 353         exit(*pErrorCode);
 354     }
 355 }
 356
 357 /* parser for UnicodeData.txt ----------------------------------------------- */
 358
 359 static void U_CALLCONV
 360 unicodeDataLineFn(void *context,
 361                   char *fields[][2], int32_t fieldCount,
 362                   UErrorCode *pErrorCode) {
 363     uint32_t decomp[40];
 364     Norm norm;
 365     const char *s;
 366     char *end;
 367     uint32_t code, value;
 368     int32_t length;
 369     UBool isCompat, something=FALSE;
 370
 371     /* ignore First and Last entries for ranges */
 372     if( *fields[1][0]=='<' &&
 373         (length=(int32_t)(fields[1][1]-fields[1][0]))>=9 &&
 374         (0==uprv_memcmp(", First>", fields[1][1]-8, 8) || 0==uprv_memcmp(", Last>", fields[1][1]-7, 7))
 375     ) {
 376         return;
 377     }
 378
 379     /* reset the properties */
 380     uprv_memset(&norm, 0, sizeof(Norm));
 381
 382     /* get the character code, field 0 */
 383     code=(uint32_t)uprv_strtoul(fields[0][0], &end, 16);
 384     if(end<=fields[0][0] || end!=fields[0][1]) {
 385         fprintf(stderr, "gennorm: syntax error in field 0 at %s\n", fields[0][0]);
 386         *pErrorCode=U_PARSE_ERROR;
 387         exit(U_PARSE_ERROR);
 388     }
 389
 390     /* get canonical combining class, field 3 */
 391     value=(uint32_t)uprv_strtoul(fields[3][0], &end, 10);
 392     if(end<=fields[3][0] || end!=fields[3][1] || value>0xff) {
 393         fprintf(stderr, "gennorm: syntax error in field 3 at %s\n", fields[0][0]);
 394         *pErrorCode=U_PARSE_ERROR;
 395         exit(U_PARSE_ERROR);
 396     }
 397     if(value>0) {
 398         norm.udataCC=(uint8_t)value;
 399         something=TRUE;
 400     }
 401
 402     /* get the decomposition, field 5 */
 403     if(fields[5][0]<fields[5][1]) {
 404         if(*(s=fields[5][0])=='<') {
 405             ++s;
 406             isCompat=TRUE;
 407
 408             /* skip and ignore the compatibility type name */
 409             do {
 410                 if(s==fields[5][1]) {
 411                     /* missing '>' */
 412                     fprintf(stderr, "gennorm: syntax error in field 5 at %s\n", fields[0][0]);
 413                     *pErrorCode=U_PARSE_ERROR;
 414                     exit(U_PARSE_ERROR);
 415                 }
 416             } while(*s++!='>');
 417         } else {
 418             isCompat=FALSE;
 419         }
 420
 421         /* parse the decomposition string */
 422         length=u_parseCodePoints(s, decomp, sizeof(decomp)/4, pErrorCode);
 423         if(U_FAILURE(*pErrorCode)) {
 424             fprintf(stderr, "gennorm error parsing UnicodeData.txt decomposition of U+%04lx - %s\n",
 425                     (long)code, u_errorName(*pErrorCode));
 426             exit(*pErrorCode);
 427         }
 428
 429         /* store the string */
 430         if(length>0) {
 431             something=TRUE;
 432             if(isCompat) {
 433                 norm.lenNFKD=(uint8_t)length;
 434                 norm.nfkd=decomp;
 435             } else {
 436                 if(length>2) {
 437                     fprintf(stderr, "gennorm: error - length of NFD(U+%04lx) = %ld >2 in UnicodeData - illegal\n",
 438                             (long)code, (long)length);
 439                     *pErrorCode=U_PARSE_ERROR;
 440                     exit(U_PARSE_ERROR);
 441                 }
 442                 norm.lenNFD=(uint8_t)length;
 443                 norm.nfd=decomp;
 444             }
 445         }
 446     }
 447
 448     /* check for non-character code points */
 449     if((code&0xfffe)==0xfffe || (uint32_t)(code-0xfdd0)<0x20 || code>0x10ffff) {
 450         fprintf(stderr, "gennorm: error - properties for non-character code point U+%04lx\n",
 451                 (long)code);
 452         *pErrorCode=U_PARSE_ERROR;
 453         exit(U_PARSE_ERROR);
 454     }
 455
 456     if(something) {
 457         /* there are normalization values, so store them */
 458 #if 0
 459         if(beVerbose) {
 460             printf("store values for U+%04lx: cc=%d, lenNFD=%ld, lenNFKD=%ld\n",
 461                    (long)code, norm.udataCC, (long)norm.lenNFD, (long)norm.lenNFKD);
 462         }
 463 #endif
 464         storeNorm(code, &norm);
 465     }
 466 }
 467
 468 static void
 469 parseDB(const char *filename, UErrorCode *pErrorCode) {
 470     char *fields[15][2];
 471
 472     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
 473         return;
 474     }
 475
 476     u_parseDelimitedFile(filename, ';', fields, 15, unicodeDataLineFn, NULL, pErrorCode);
 477     if(U_FAILURE(*pErrorCode)) {
 478         fprintf(stderr, "gennorm error: u_parseDelimitedFile(\"%s\") failed - %s\n", filename, u_errorName(*pErrorCode));
 479         exit(*pErrorCode);
 480     }
 481 }
 482
 483 #endif /* #if !UCONFIG_NO_NORMALIZATION */
 484
 485 /*
 486  * Hey, Emacs, please set the following:
 487  *
 488  * Local Variables:
 489  * indent-tabs-mode: nil
 490  * End:
 491  *
 492  */