icuSources/tools/genprops/genprops.c

   1 /*
   2 *******************************************************************************
   3 *
   4 *   Copyright (C) 1999-2003, International Business Machines
   5 *   Corporation and others.  All Rights Reserved.
   6 *
   7 *******************************************************************************
   8 *   file name:  genprops.c
   9 *   encoding:   US-ASCII
  10 *   tab size:   8 (not used)
  11 *   indentation:4
  12 *
  13 *   created on: 1999dec08
  14 *   created by: Markus W. Scherer
  15 *
  16 *   This program reads several of the Unicode character database text files,
  17 *   parses them, and extracts most of the properties for each character.
  18 *   It then writes a binary file containing the properties
  19 *   that is designed to be used directly for random-access to
  20 *   the properties of each Unicode character.
  21 */
  22
  23 #include <stdio.h>
  24 #include <stdlib.h>
  25 #include "unicode/utypes.h"
  26 #include "unicode/uchar.h"
  27 #include "unicode/uset.h"
  28 #include "unicode/putil.h"
  29 #include "unicode/uclean.h"
  30 #include "cmemory.h"
  31 #include "cstring.h"
  32 #include "unewdata.h"
  33 #include "uoptions.h"
  34 #include "uparse.h"
  35 #include "uprops.h"
  36 #include "propsvec.h"
  37
  38 U_CDECL_BEGIN
  39 #include "genprops.h"
  40 U_CDECL_END
  41
  42 #define LENGTHOF(array) (sizeof(array)/sizeof((array)[0]))
  43
  44 UBool beVerbose=FALSE, haveCopyright=TRUE;
  45
  46 /*
  47  * Unicode set collecting the case-sensitive characters;
  48  * see uchar.h UCHAR_CASE_SENSITIVE.
  49  * Add code points from case mappings/foldings in
  50  * the root locale and with default options.
  51  */
  52 static USet *caseSensitive;
  53
  54 /* prototypes --------------------------------------------------------------- */
  55
  56 static void
  57 parseBidiMirroring(const char *filename, UErrorCode *pErrorCode);
  58
  59 static void
  60 parseSpecialCasing(const char *filename, UErrorCode *pErrorCode);
  61
  62 static void
  63 parseCaseFolding(const char *filename, UErrorCode *pErrorCode);
  64
  65 static void
  66 parseDB(const char *filename, UErrorCode *pErrorCode);
  67
  68 /* -------------------------------------------------------------------------- */
  69
  70
  71 enum
  72 {
  73     HELP_H,
  74     HELP_QUESTION_MARK,
  75     VERBOSE,
  76     COPYRIGHT,
  77     DESTDIR,
  78     SOURCEDIR,
  79     UNICODE_VERSION,
  80     ICUDATADIR
  81 };
  82
  83 /* Keep these values in sync with the above enums */
  84 static UOption options[]={
  85     UOPTION_HELP_H,
  86     UOPTION_HELP_QUESTION_MARK,
  87     UOPTION_VERBOSE,
  88     UOPTION_COPYRIGHT,
  89     UOPTION_DESTDIR,
  90     UOPTION_SOURCEDIR,
  91     { "unicode", NULL, NULL, NULL, 'u', UOPT_REQUIRES_ARG, 0 },
  92     UOPTION_ICUDATADIR
  93 };
  94
  95 extern int
  96 main(int argc, char* argv[]) {
  97     char filename[300];
  98     const char *srcDir=NULL, *destDir=NULL, *suffix=NULL;
  99     char *basename=NULL;
 100     UErrorCode errorCode=U_ZERO_ERROR;
 101
 102     U_MAIN_INIT_ARGS(argc, argv);
 103
 104     /* preset then read command line options */
 105     options[DESTDIR].value=u_getDataDirectory();
 106     options[SOURCEDIR].value="";
 107     options[UNICODE_VERSION].value="";
 108     options[ICUDATADIR].value=u_getDataDirectory();
 109     argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), options);
 110
 111     /* error handling, printing usage message */
 112     if(argc<0) {
 113         fprintf(stderr,
 114             "error in command line argument \"%s\"\n",
 115             argv[-argc]);
 116     }
 117     if(argc<0 || options[HELP_H].doesOccur || options[HELP_QUESTION_MARK].doesOccur) {
 118         /*
 119          * Broken into chucks because the C89 standard says the minimum
 120          * required supported string length is 509 bytes.
 121          */
 122         fprintf(stderr,
 123             "Usage: %s [-options] [suffix]\n"
 124             "\n"
 125             "read the UnicodeData.txt file and other Unicode properties files and\n"
 126             "create a binary file " DATA_NAME "." DATA_TYPE " with the character properties\n"
 127             "\n",
 128             argv[0]);
 129         fprintf(stderr,
 130             "Options:\n"
 131             "\t-h or -? or --help  this usage text\n"
 132             "\t-v or --verbose     verbose output\n"
 133             "\t-c or --copyright   include a copyright notice\n"
 134             "\t-u or --unicode     Unicode version, followed by the version like 3.0.0\n");
 135         fprintf(stderr,
 136             "\t-d or --destdir     destination directory, followed by the path\n"
 137             "\t-s or --sourcedir   source directory, followed by the path\n"
 138             "\t-i or --icudatadir  directory for locating any needed intermediate data files,\n"
 139             "\t                    followed by path, defaults to %s\n"
 140             "\tsuffix              suffix that is to be appended with a '-'\n"
 141             "\t                    to the source file basenames before opening;\n"
 142             "\t                    'genprops new' will read UnicodeData-new.txt etc.\n",
 143             u_getDataDirectory());
 144         return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR;
 145     }
 146
 147     /* get the options values */
 148     beVerbose=options[VERBOSE].doesOccur;
 149     haveCopyright=options[COPYRIGHT].doesOccur;
 150     srcDir=options[SOURCEDIR].value;
 151     destDir=options[DESTDIR].value;
 152
 153     if(argc>=2) {
 154         suffix=argv[1];
 155     } else {
 156         suffix=NULL;
 157     }
 158
 159     if(options[UNICODE_VERSION].doesOccur) {
 160         setUnicodeVersion(options[UNICODE_VERSION].value);
 161     }
 162     /* else use the default dataVersion in store.c */
 163
 164     if (options[ICUDATADIR].doesOccur) {
 165         u_setDataDirectory(options[ICUDATADIR].value);
 166     }
 167
 168     /* prepare the filename beginning with the source dir */
 169     uprv_strcpy(filename, srcDir);
 170     basename=filename+uprv_strlen(filename);
 171     if(basename>filename && *(basename-1)!=U_FILE_SEP_CHAR) {
 172         *basename++=U_FILE_SEP_CHAR;
 173     }
 174
 175     /* initialize */
 176     initStore();
 177     caseSensitive=uset_open(1, 0); /* empty set (start>end) */
 178
 179     /* process BidiMirroring.txt */
 180     writeUCDFilename(basename, "BidiMirroring", suffix);
 181     parseBidiMirroring(filename, &errorCode);
 182
 183     /* process SpecialCasing.txt */
 184     writeUCDFilename(basename, "SpecialCasing", suffix);
 185     parseSpecialCasing(filename, &errorCode);
 186
 187     /* process CaseFolding.txt */
 188     writeUCDFilename(basename, "CaseFolding", suffix);
 189     parseCaseFolding(filename, &errorCode);
 190
 191     /* process UnicodeData.txt */
 192     writeUCDFilename(basename, "UnicodeData", suffix);
 193     parseDB(filename, &errorCode);
 194
 195     /* process additional properties files */
 196     *basename=0;
 197     generateAdditionalProperties(filename, suffix, &errorCode);
 198
 199     /* process parsed data */
 200     if(U_SUCCESS(errorCode)) {
 201         /* write the properties data file */
 202         generateData(destDir);
 203     }
 204
 205     u_cleanup();
 206     return errorCode;
 207 }
 208
 209 U_CFUNC void
 210 writeUCDFilename(char *basename, const char *filename, const char *suffix) {
 211     int32_t length=(int32_t)uprv_strlen(filename);
 212     uprv_strcpy(basename, filename);
 213     if(suffix!=NULL) {
 214         basename[length++]='-';
 215         uprv_strcpy(basename+length, suffix);
 216         length+=(int32_t)uprv_strlen(suffix);
 217     }
 218     uprv_strcpy(basename+length, ".txt");
 219 }
 220
 221 U_CFUNC UBool
 222 isToken(const char *token, const char *s) {
 223     const char *z;
 224     int32_t j;
 225
 226     s=u_skipWhitespace(s);
 227     for(j=0;; ++j) {
 228         if(token[j]!=0) {
 229             if(s[j]!=token[j]) {
 230                 break;
 231             }
 232         } else {
 233             z=u_skipWhitespace(s+j);
 234             if(*z==';' || *z==0) {
 235                 return TRUE;
 236             } else {
 237                 break;
 238             }
 239         }
 240     }
 241
 242     return FALSE;
 243 }
 244
 245 U_CFUNC int32_t
 246 getTokenIndex(const char *const tokens[], int32_t countTokens, const char *s) {
 247     const char *t, *z;
 248     int32_t i, j;
 249
 250     s=u_skipWhitespace(s);
 251     for(i=0; i<countTokens; ++i) {
 252         t=tokens[i];
 253         if(t!=NULL) {
 254             for(j=0;; ++j) {
 255                 if(t[j]!=0) {
 256                     if(s[j]!=t[j]) {
 257                         break;
 258                     }
 259                 } else {
 260                     z=u_skipWhitespace(s+j);
 261                     if(*z==';' || *z==0 || *z=='#' || *z=='\r' || *z=='\n') {
 262                         return i;
 263                     } else {
 264                         break;
 265                     }
 266                 }
 267             }
 268         }
 269     }
 270     return -1;
 271 }
 272
 273 static void
 274 _set_addAll(USet *set, const UChar *s, int32_t length) {
 275     UChar32 c;
 276     int32_t i;
 277
 278     /* needs length>=0 */
 279     for(i=0; i<length; /* U16_NEXT advances i */) {
 280         U16_NEXT(s, i, length, c);
 281         uset_add(set, c);
 282     }
 283 }
 284
 285 /* parser for BidiMirroring.txt --------------------------------------------- */
 286
 287 #define MAX_MIRROR_COUNT 2000
 288
 289 static uint32_t mirrorMappings[MAX_MIRROR_COUNT][2];
 290 static int32_t mirrorCount=0;
 291
 292 static void U_CALLCONV
 293 mirrorLineFn(void *context,
 294              char *fields[][2], int32_t fieldCount,
 295              UErrorCode *pErrorCode) {
 296     char *end;
 297     static uint32_t prevCode=0;
 298
 299     mirrorMappings[mirrorCount][0]=(uint32_t)uprv_strtoul(fields[0][0], &end, 16);
 300     if(end<=fields[0][0] || end!=fields[0][1]) {
 301         fprintf(stderr, "genprops: syntax error in BidiMirroring.txt field 0 at %s\n", fields[0][0]);
 302         *pErrorCode=U_PARSE_ERROR;
 303         exit(U_PARSE_ERROR);
 304     }
 305
 306     mirrorMappings[mirrorCount][1]=(uint32_t)uprv_strtoul(fields[1][0], &end, 16);
 307     if(end<=fields[1][0] || end!=fields[1][1]) {
 308         fprintf(stderr, "genprops: syntax error in BidiMirroring.txt field 1 at %s\n", fields[1][0]);
 309         *pErrorCode=U_PARSE_ERROR;
 310         exit(U_PARSE_ERROR);
 311     }
 312
 313     /* check that the code points (mirrorMappings[mirrorCount][0]) are in ascending order */
 314     if(mirrorMappings[mirrorCount][0]<=prevCode && mirrorMappings[mirrorCount][0]>0) {
 315         fprintf(stderr, "genprops: error - BidiMirroring entries out of order, U+%04lx after U+%04lx\n",
 316                 (unsigned long)mirrorMappings[mirrorCount][0],
 317                 (unsigned long)prevCode);
 318         *pErrorCode=U_PARSE_ERROR;
 319         exit(U_PARSE_ERROR);
 320     }
 321     prevCode=mirrorMappings[mirrorCount][0];
 322
 323     if(++mirrorCount==MAX_MIRROR_COUNT) {
 324         fprintf(stderr, "genprops: too many mirror mappings\n");
 325         *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
 326         exit(U_INDEX_OUTOFBOUNDS_ERROR);
 327     }
 328 }
 329
 330 static void
 331 parseBidiMirroring(const char *filename, UErrorCode *pErrorCode) {
 332     char *fields[2][2];
 333
 334     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
 335         return;
 336     }
 337
 338     u_parseDelimitedFile(filename, ';', fields, 2, mirrorLineFn, NULL, pErrorCode);
 339 }
 340
 341 /* parser for SpecialCasing.txt --------------------------------------------- */
 342
 343 #define MAX_SPECIAL_CASING_COUNT 500
 344
 345 static SpecialCasing specialCasings[MAX_SPECIAL_CASING_COUNT];
 346 static int32_t specialCasingCount=0;
 347
 348 static void U_CALLCONV
 349 specialCasingLineFn(void *context,
 350                     char *fields[][2], int32_t fieldCount,
 351                     UErrorCode *pErrorCode) {
 352     char *end;
 353
 354     /* get code point */
 355     specialCasings[specialCasingCount].code=(uint32_t)uprv_strtoul(u_skipWhitespace(fields[0][0]), &end, 16);
 356     end=(char *)u_skipWhitespace(end);
 357     if(end<=fields[0][0] || end!=fields[0][1]) {
 358         fprintf(stderr, "genprops: syntax error in SpecialCasing.txt field 0 at %s\n", fields[0][0]);
 359         *pErrorCode=U_PARSE_ERROR;
 360         exit(U_PARSE_ERROR);
 361     }
 362
 363     /* is this a complex mapping? */
 364     if(*(end=(char *)u_skipWhitespace(fields[4][0]))!=0 && *end!=';' && *end!='#') {
 365         /* there is some condition text in the fifth field */
 366         specialCasings[specialCasingCount].isComplex=TRUE;
 367
 368         /* do not store any actual mappings for this */
 369         specialCasings[specialCasingCount].lowerCase[0]=0;
 370         specialCasings[specialCasingCount].upperCase[0]=0;
 371         specialCasings[specialCasingCount].titleCase[0]=0;
 372     } else {
 373         /* just set the "complex" flag and get the case mappings */
 374         specialCasings[specialCasingCount].isComplex=FALSE;
 375         specialCasings[specialCasingCount].lowerCase[0]=
 376             (UChar)u_parseString(fields[1][0], specialCasings[specialCasingCount].lowerCase+1, 31, NULL, pErrorCode);
 377         specialCasings[specialCasingCount].upperCase[0]=
 378             (UChar)u_parseString(fields[3][0], specialCasings[specialCasingCount].upperCase+1, 31, NULL, pErrorCode);
 379         specialCasings[specialCasingCount].titleCase[0]=
 380             (UChar)u_parseString(fields[2][0], specialCasings[specialCasingCount].titleCase+1, 31, NULL, pErrorCode);
 381         if(U_FAILURE(*pErrorCode)) {
 382             fprintf(stderr, "genprops: error parsing special casing at %s\n", fields[0][0]);
 383             exit(*pErrorCode);
 384         }
 385
 386         uset_add(caseSensitive, (UChar32)specialCasings[specialCasingCount].code);
 387         _set_addAll(caseSensitive, specialCasings[specialCasingCount].lowerCase+1, specialCasings[specialCasingCount].lowerCase[0]);
 388         _set_addAll(caseSensitive, specialCasings[specialCasingCount].upperCase+1, specialCasings[specialCasingCount].upperCase[0]);
 389         _set_addAll(caseSensitive, specialCasings[specialCasingCount].titleCase+1, specialCasings[specialCasingCount].titleCase[0]);
 390     }
 391
 392     if(++specialCasingCount==MAX_SPECIAL_CASING_COUNT) {
 393         fprintf(stderr, "genprops: too many special casing mappings\n");
 394         *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
 395         exit(U_INDEX_OUTOFBOUNDS_ERROR);
 396     }
 397 }
 398
 399 static int
 400 compareSpecialCasings(const void *left, const void *right) {
 401     return ((const SpecialCasing *)left)->code-((const SpecialCasing *)right)->code;
 402 }
 403
 404 static void
 405 parseSpecialCasing(const char *filename, UErrorCode *pErrorCode) {
 406     char *fields[5][2];
 407     int32_t i, j;
 408
 409     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
 410         return;
 411     }
 412
 413     u_parseDelimitedFile(filename, ';', fields, 5, specialCasingLineFn, NULL, pErrorCode);
 414
 415     /* sort the special casing entries by code point */
 416     if(specialCasingCount>0) {
 417         qsort(specialCasings, specialCasingCount, sizeof(SpecialCasing), compareSpecialCasings);
 418     }
 419
 420     /* replace multiple entries for any code point by one "complex" one */
 421     j=0;
 422     for(i=1; i<specialCasingCount; ++i) {
 423         if(specialCasings[i-1].code==specialCasings[i].code) {
 424             /* there is a duplicate code point */
 425             specialCasings[i-1].code=0x7fffffff;    /* remove this entry in the following qsort */
 426             specialCasings[i].isComplex=TRUE;       /* make the following one complex */
 427             specialCasings[i].lowerCase[0]=0;
 428             specialCasings[i].upperCase[0]=0;
 429             specialCasings[i].titleCase[0]=0;
 430             ++j;
 431         }
 432     }
 433
 434     /* if some entries just were removed, then re-sort */
 435     if(j>0) {
 436         qsort(specialCasings, specialCasingCount, sizeof(SpecialCasing), compareSpecialCasings);
 437         specialCasingCount-=j;
 438     }
 439
 440     /*
 441      * Add one complex mapping to caseSensitive that was filtered out above:
 442      * Greek final Sigma has a conditional mapping but not locale-sensitive,
 443      * and it is taken when lowercasing just U+03A3 alone.
 444      * 03A3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK CAPITAL LETTER SIGMA
 445      */
 446     uset_add(caseSensitive, 0x3c2);
 447 }
 448
 449 /* parser for CaseFolding.txt ----------------------------------------------- */
 450
 451 #define MAX_CASE_FOLDING_COUNT 2000
 452
 453 static CaseFolding caseFoldings[MAX_CASE_FOLDING_COUNT];
 454 static int32_t caseFoldingCount=0;
 455
 456 static void U_CALLCONV
 457 caseFoldingLineFn(void *context,
 458                   char *fields[][2], int32_t fieldCount,
 459                   UErrorCode *pErrorCode) {
 460     char *end;
 461     static uint32_t prevCode=0;
 462     int32_t count;
 463     char status;
 464
 465     /* get code point */
 466     caseFoldings[caseFoldingCount].code=(uint32_t)uprv_strtoul(u_skipWhitespace(fields[0][0]), &end, 16);
 467     end=(char *)u_skipWhitespace(end);
 468     if(end<=fields[0][0] || end!=fields[0][1]) {
 469         fprintf(stderr, "genprops: syntax error in CaseFolding.txt field 0 at %s\n", fields[0][0]);
 470         *pErrorCode=U_PARSE_ERROR;
 471         exit(U_PARSE_ERROR);
 472     }
 473
 474     /* get the status of this mapping */
 475     caseFoldings[caseFoldingCount].status=status=*u_skipWhitespace(fields[1][0]);
 476     if(status!='L' && status!='E' && status!='C' && status!='S' && status!='F' && status!='I' && status!='T') {
 477         fprintf(stderr, "genprops: unrecognized status field in CaseFolding.txt at %s\n", fields[0][0]);
 478         *pErrorCode=U_PARSE_ERROR;
 479         exit(U_PARSE_ERROR);
 480     }
 481
 482     /* ignore all case folding mappings that are the same as the UnicodeData.txt lowercase mappings */
 483     if(status=='L') {
 484         return;
 485     }
 486
 487     /* get the mapping */
 488     count=caseFoldings[caseFoldingCount].full[0]=
 489         (UChar)u_parseString(fields[2][0], caseFoldings[caseFoldingCount].full+1, 31, &caseFoldings[caseFoldingCount].simple, pErrorCode);
 490     if(U_FAILURE(*pErrorCode)) {
 491         fprintf(stderr, "genprops: error parsing CaseFolding.txt mapping at %s\n", fields[0][0]);
 492         exit(*pErrorCode);
 493     }
 494
 495     /* there is a simple mapping only if there is exactly one code point (count is in UChars) */
 496     if(count==0 || count>2 || (count==2 && UTF_IS_SINGLE(caseFoldings[caseFoldingCount].full[1]))) {
 497         caseFoldings[caseFoldingCount].simple=0;
 498     }
 499
 500     /* update the case-sensitive set */
 501     if(status!='T') {
 502         uset_add(caseSensitive, (UChar32)caseFoldings[caseFoldingCount].code);
 503         _set_addAll(caseSensitive, caseFoldings[caseFoldingCount].full+1, caseFoldings[caseFoldingCount].full[0]);
 504     }
 505
 506     /* check the status */
 507     if(status=='S') {
 508         /* check if there was a full mapping for this code point before */
 509         if( caseFoldingCount>0 &&
 510             caseFoldings[caseFoldingCount-1].code==caseFoldings[caseFoldingCount].code &&
 511             caseFoldings[caseFoldingCount-1].status=='F'
 512         ) {
 513             /* merge the two entries */
 514             caseFoldings[caseFoldingCount-1].simple=caseFoldings[caseFoldingCount].simple;
 515             return;
 516         }
 517     } else if(status=='F') {
 518         /* check if there was a simple mapping for this code point before */
 519         if( caseFoldingCount>0 &&
 520             caseFoldings[caseFoldingCount-1].code==caseFoldings[caseFoldingCount].code &&
 521             caseFoldings[caseFoldingCount-1].status=='S'
 522         ) {
 523             /* merge the two entries */
 524             uprv_memcpy(caseFoldings[caseFoldingCount-1].full, caseFoldings[caseFoldingCount].full, 32*U_SIZEOF_UCHAR);
 525             return;
 526         }
 527     } else if(status=='I' || status=='T') {
 528         /* check if there was a default mapping for this code point before (remove it) */
 529         while(caseFoldingCount>0 &&
 530               caseFoldings[caseFoldingCount-1].code==caseFoldings[caseFoldingCount].code
 531         ) {
 532             prevCode=0;
 533             --caseFoldingCount;
 534         }
 535         /* store only a marker for special handling for cases like dotless i */
 536         caseFoldings[caseFoldingCount].simple=0;
 537         caseFoldings[caseFoldingCount].full[0]=0;
 538     }
 539
 540     /* check that the code points (caseFoldings[caseFoldingCount].code) are in ascending order */
 541     if(caseFoldings[caseFoldingCount].code<=prevCode && caseFoldings[caseFoldingCount].code>0) {
 542         fprintf(stderr, "genprops: error - CaseFolding entries out of order, U+%04lx after U+%04lx\n",
 543                 (unsigned long)caseFoldings[caseFoldingCount].code,
 544                 (unsigned long)prevCode);
 545         *pErrorCode=U_PARSE_ERROR;
 546         exit(U_PARSE_ERROR);
 547     }
 548     prevCode=caseFoldings[caseFoldingCount].code;
 549
 550     if(++caseFoldingCount==MAX_CASE_FOLDING_COUNT) {
 551         fprintf(stderr, "genprops: too many case folding mappings\n");
 552         *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
 553         exit(U_INDEX_OUTOFBOUNDS_ERROR);
 554     }
 555 }
 556
 557 static void
 558 parseCaseFolding(const char *filename, UErrorCode *pErrorCode) {
 559     char *fields[3][2];
 560
 561     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
 562         return;
 563     }
 564
 565     u_parseDelimitedFile(filename, ';', fields, 3, caseFoldingLineFn, NULL, pErrorCode);
 566 }
 567
 568 /* parser for UnicodeData.txt ----------------------------------------------- */
 569
 570 /* general categories */
 571 const char *const
 572 genCategoryNames[U_CHAR_CATEGORY_COUNT]={
 573     "Cn",
 574     "Lu", "Ll", "Lt", "Lm", "Lo", "Mn", "Me",
 575     "Mc", "Nd", "Nl", "No",
 576     "Zs", "Zl", "Zp",
 577     "Cc", "Cf", "Co", "Cs",
 578     "Pd", "Ps", "Pe", "Pc", "Po",
 579     "Sm", "Sc", "Sk", "So",
 580     "Pi", "Pf"
 581 };
 582
 583 const char *const
 584 bidiNames[U_CHAR_DIRECTION_COUNT]={
 585     "L", "R", "EN", "ES", "ET", "AN", "CS", "B", "S",
 586     "WS", "ON", "LRE", "LRO", "AL", "RLE", "RLO", "PDF", "NSM", "BN"
 587 };
 588
 589 const char *const
 590 decompositionTypeNames[U_DT_COUNT]={
 591     NULL,
 592     NULL,
 593     "compat",
 594     "circle",
 595     "final",
 596     "font",
 597     "fraction",
 598     "initial",
 599     "isolated",
 600     "medial",
 601     "narrow",
 602     "noBreak",
 603     "small",
 604     "square",
 605     "sub",
 606     "super",
 607     "vertical",
 608     "wide"
 609 };
 610
 611 static struct {
 612     uint32_t first, last, props;
 613     char name[80];
 614 } unicodeAreas[32];
 615
 616 static int32_t unicodeAreaIndex=0, mirrorIndex=0, specialCasingIndex=0, caseFoldingIndex=0;
 617
 618 static void U_CALLCONV
 619 unicodeDataLineFn(void *context,
 620                   char *fields[][2], int32_t fieldCount,
 621                   UErrorCode *pErrorCode) {
 622     Props p;
 623     char *end;
 624     static uint32_t prevCode=0;
 625     uint32_t value;
 626     int32_t i;
 627
 628     /* reset the properties */
 629     uprv_memset(&p, 0, sizeof(Props));
 630
 631     /* get the character code, field 0 */
 632     p.code=(uint32_t)uprv_strtoul(fields[0][0], &end, 16);
 633     if(end<=fields[0][0] || end!=fields[0][1]) {
 634         fprintf(stderr, "genprops: syntax error in field 0 at %s\n", fields[0][0]);
 635         *pErrorCode=U_PARSE_ERROR;
 636         exit(U_PARSE_ERROR);
 637     }
 638
 639     /* get general category, field 2 */
 640     i=getTokenIndex(genCategoryNames, U_CHAR_CATEGORY_COUNT, fields[2][0]);
 641     if(i>=0) {
 642         p.generalCategory=(uint8_t)i;
 643     } else {
 644         fprintf(stderr, "genprops: unknown general category \"%s\" at code 0x%lx\n",
 645             fields[2][0], (unsigned long)p.code);
 646         *pErrorCode=U_PARSE_ERROR;
 647         exit(U_PARSE_ERROR);
 648     }
 649
 650     /* get BiDi category, field 4 */
 651     i=getTokenIndex(bidiNames, U_CHAR_DIRECTION_COUNT, fields[4][0]);
 652     if(i>=0) {
 653         p.bidi=(uint8_t)i;
 654     } else {
 655         fprintf(stderr, "genprops: unknown BiDi category \"%s\" at code 0x%lx\n",
 656             fields[4][0], (unsigned long)p.code);
 657         *pErrorCode=U_PARSE_ERROR;
 658         exit(U_PARSE_ERROR);
 659     }
 660
 661     /* get decomposition type, field 5 */
 662     if(fields[5][0]<fields[5][1]) {
 663         /* there is some decomposition */
 664         if(*fields[5][0]!='<') {
 665             /* canonical */
 666             i=U_DT_CANONICAL;
 667         } else {
 668             /* get compatibility type */
 669             end=fields[5][0]+1;
 670             while(end<fields[5][1] && *end!='>') {
 671                 ++end;
 672             }
 673             *end='#';
 674             i=getTokenIndex(decompositionTypeNames, U_DT_COUNT, fields[5][0]+1);
 675             if(i<0) {
 676                 fprintf(stderr, "genprops: unknown decomposition type \"%s\" at code 0x%lx\n",
 677                     fields[5][0], (unsigned long)p.code);
 678                 *pErrorCode=U_PARSE_ERROR;
 679                 exit(U_PARSE_ERROR);
 680             }
 681         }
 682         if(!upvec_setValue(pv, p.code, p.code+1, 2, (uint32_t)i, UPROPS_DT_MASK, pErrorCode)) {
 683             fprintf(stderr, "genprops error: unable to set decomposition type: %s\n", u_errorName(*pErrorCode));
 684             exit(*pErrorCode);
 685         }
 686     }
 687
 688     /* decimal digit value, field 6 */
 689     if(fields[6][0]<fields[6][1]) {
 690         value=(uint32_t)uprv_strtoul(fields[6][0], &end, 10);
 691         if(end!=fields[6][1] || value>0x7fff) {
 692             fprintf(stderr, "genprops: syntax error in field 6 at code 0x%lx\n",
 693                 (unsigned long)p.code);
 694             *pErrorCode=U_PARSE_ERROR;
 695             exit(U_PARSE_ERROR);
 696         }
 697         p.numericValue=(int32_t)value;
 698         p.numericType=1;
 699     }
 700
 701     /* digit value, field 7 */
 702     if(fields[7][0]<fields[7][1]) {
 703         value=(uint32_t)uprv_strtoul(fields[7][0], &end, 10);
 704         if(end!=fields[7][1] || value>0x7fff) {
 705             fprintf(stderr, "genprops: syntax error in field 7 at code 0x%lx\n",
 706                 (unsigned long)p.code);
 707             *pErrorCode=U_PARSE_ERROR;
 708             exit(U_PARSE_ERROR);
 709         }
 710         if(p.numericType==0) {
 711             p.numericValue=(int32_t)value;
 712             p.numericType=2;
 713         } else if((int32_t)value!=p.numericValue) {
 714             fprintf(stderr, "genprops error: numeric values in fields 6 & 7 different at code 0x%lx\n",
 715                 (unsigned long)p.code);
 716             *pErrorCode=U_PARSE_ERROR;
 717             exit(U_PARSE_ERROR);
 718         }
 719     }
 720
 721     /* numeric value, field 8 */
 722     if(fields[8][0]<fields[8][1]) {
 723         char *s=fields[8][0];
 724         UBool isNegative;
 725
 726         /* get a possible minus sign */
 727         if(*s=='-') {
 728             isNegative=TRUE;
 729             ++s;
 730         } else {
 731             isNegative=FALSE;
 732         }
 733
 734         value=(uint32_t)uprv_strtoul(s, &end, 10);
 735         if(value>0 && *end=='/') {
 736             /* field 8 may contain a fractional value, get the denominator */
 737             if(p.numericType>0) {
 738                 fprintf(stderr, "genprops error: numeric values in fields 6..8 different at code 0x%lx\n",
 739                     (unsigned long)p.code);
 740                 *pErrorCode=U_PARSE_ERROR;
 741                 exit(U_PARSE_ERROR);
 742             }
 743
 744             p.denominator=(uint32_t)uprv_strtoul(end+1, &end, 10);
 745             if(p.denominator==0) {
 746                 fprintf(stderr, "genprops: denominator is 0 in field 8 at code 0x%lx\n",
 747                     (unsigned long)p.code);
 748                 *pErrorCode=U_PARSE_ERROR;
 749                 exit(U_PARSE_ERROR);
 750             }
 751         }
 752         if(end!=fields[8][1] || value>0x7fffffff) {
 753             fprintf(stderr, "genprops: syntax error in field 8 at code 0x%lx\n",
 754                 (unsigned long)p.code);
 755             *pErrorCode=U_PARSE_ERROR;
 756             exit(U_PARSE_ERROR);
 757         }
 758
 759         if(p.numericType==0) {
 760             if(isNegative) {
 761                 p.numericValue=-(int32_t)value;
 762             } else {
 763                 p.numericValue=(int32_t)value;
 764             }
 765             p.numericType=3;
 766         } else if((int32_t)value!=p.numericValue) {
 767             fprintf(stderr, "genprops error: numeric values in fields 6..8 different at code 0x%lx\n",
 768                 (unsigned long)p.code);
 769             *pErrorCode=U_PARSE_ERROR;
 770             exit(U_PARSE_ERROR);
 771         }
 772     }
 773
 774     /* get Mirrored flag, field 9 */
 775     if(*fields[9][0]=='Y') {
 776         p.isMirrored=1;
 777     } else if(fields[9][1]-fields[9][0]!=1 || *fields[9][0]!='N') {
 778         fprintf(stderr, "genprops: syntax error in field 9 at code 0x%lx\n",
 779             (unsigned long)p.code);
 780         *pErrorCode=U_PARSE_ERROR;
 781         exit(U_PARSE_ERROR);
 782     }
 783
 784     /* get uppercase mapping, field 12 */
 785     value=(uint32_t)uprv_strtoul(fields[12][0], &end, 16);
 786     if(end!=fields[12][1]) {
 787         fprintf(stderr, "genprops: syntax error in field 12 at code 0x%lx\n",
 788             (unsigned long)p.code);
 789         *pErrorCode=U_PARSE_ERROR;
 790         exit(U_PARSE_ERROR);
 791     }
 792     if(value!=0 && value!=p.code) {
 793         p.upperCase=value;
 794         uset_add(caseSensitive, (UChar32)p.code);
 795         uset_add(caseSensitive, (UChar32)value);
 796     }
 797
 798     /* get lowercase value, field 13 */
 799     value=(uint32_t)uprv_strtoul(fields[13][0], &end, 16);
 800     if(end!=fields[13][1]) {
 801         fprintf(stderr, "genprops: syntax error in field 13 at code 0x%lx\n",
 802             (unsigned long)p.code);
 803         *pErrorCode=U_PARSE_ERROR;
 804         exit(U_PARSE_ERROR);
 805     }
 806     if(value!=0 && value!=p.code) {
 807         p.lowerCase=value;
 808         uset_add(caseSensitive, (UChar32)p.code);
 809         uset_add(caseSensitive, (UChar32)value);
 810     }
 811
 812     /* get titlecase value, field 14 */
 813     value=(uint32_t)uprv_strtoul(fields[14][0], &end, 16);
 814     if(end!=fields[14][1]) {
 815         fprintf(stderr, "genprops: syntax error in field 14 at code 0x%lx\n",
 816             (unsigned long)p.code);
 817         *pErrorCode=U_PARSE_ERROR;
 818         exit(U_PARSE_ERROR);
 819     }
 820     if(value!=0 && value!=p.code) {
 821         p.titleCase=value;
 822         uset_add(caseSensitive, (UChar32)p.code);
 823         uset_add(caseSensitive, (UChar32)value);
 824     }
 825
 826     /* set additional properties from previously parsed files */
 827     if(mirrorIndex<mirrorCount && p.code==mirrorMappings[mirrorIndex][0]) {
 828         p.mirrorMapping=mirrorMappings[mirrorIndex++][1];
 829     }
 830     if(specialCasingIndex<specialCasingCount && p.code==specialCasings[specialCasingIndex].code) {
 831         p.specialCasing=specialCasings+specialCasingIndex++;
 832     } else {
 833         p.specialCasing=NULL;
 834     }
 835     if(caseFoldingIndex<caseFoldingCount && p.code==caseFoldings[caseFoldingIndex].code) {
 836         p.caseFolding=caseFoldings+caseFoldingIndex++;
 837
 838         /* ignore "Common" mappings (simple==full) that map to the same code point as the regular lowercase mapping */
 839         if( p.caseFolding->status=='C' &&
 840             p.caseFolding->simple==p.lowerCase
 841         ) {
 842             p.caseFolding=NULL;
 843         }
 844     } else {
 845         p.caseFolding=NULL;
 846     }
 847
 848     value=makeProps(&p);
 849
 850     if(*fields[1][0]=='<') {
 851         /* first or last entry of a Unicode area */
 852         size_t length=fields[1][1]-fields[1][0];
 853
 854         if(length<9) {
 855             /* name too short for an area name */
 856         } else if(0==uprv_memcmp(", First>", fields[1][1]-8, 8)) {
 857             /* set the current area */
 858             if(unicodeAreas[unicodeAreaIndex].first==0xffffffff) {
 859                 length-=9;
 860                 unicodeAreas[unicodeAreaIndex].first=p.code;
 861                 unicodeAreas[unicodeAreaIndex].props=value;
 862                 uprv_memcpy(unicodeAreas[unicodeAreaIndex].name, fields[1][0]+1, length);
 863                 unicodeAreas[unicodeAreaIndex].name[length]=0;
 864             } else {
 865                 /* error: a previous area is incomplete */
 866                 fprintf(stderr, "genprops: error - area \"%s\" is incomplete\n", unicodeAreas[unicodeAreaIndex].name);
 867                 *pErrorCode=U_PARSE_ERROR;
 868                 exit(U_PARSE_ERROR);
 869             }
 870             return;
 871         } else if(0==uprv_memcmp(", Last>", fields[1][1]-7, 7)) {
 872             /* check that the current area matches, and complete it with the last code point */
 873             length-=8;
 874             if( unicodeAreas[unicodeAreaIndex].props==value &&
 875                 0==uprv_memcmp(unicodeAreas[unicodeAreaIndex].name, fields[1][0]+1, length) &&
 876                 unicodeAreas[unicodeAreaIndex].name[length]==0 &&
 877                 unicodeAreas[unicodeAreaIndex].first<p.code
 878             ) {
 879                 unicodeAreas[unicodeAreaIndex].last=p.code;
 880                 if(beVerbose) {
 881                     printf("Unicode area U+%04lx..U+%04lx \"%s\"\n",
 882                         (unsigned long)unicodeAreas[unicodeAreaIndex].first,
 883                         (unsigned long)unicodeAreas[unicodeAreaIndex].last,
 884                         unicodeAreas[unicodeAreaIndex].name);
 885                 }
 886                 unicodeAreas[++unicodeAreaIndex].first=0xffffffff;
 887             } else {
 888                 /* error: different properties between first & last, different area name, first>=last */
 889                 fprintf(stderr, "genprops: error - Last of area \"%s\" is incorrect\n", unicodeAreas[unicodeAreaIndex].name);
 890                 *pErrorCode=U_PARSE_ERROR;
 891                 exit(U_PARSE_ERROR);
 892             }
 893             return;
 894         } else {
 895             /* not an area name */
 896         }
 897     }
 898
 899     /* check for non-character code points */
 900     if((p.code&0xfffe)==0xfffe || (uint32_t)(p.code-0xfdd0)<0x20) {
 901         fprintf(stderr, "genprops: error - properties for non-character code point U+%04lx\n",
 902                 (unsigned long)p.code);
 903         *pErrorCode=U_PARSE_ERROR;
 904         exit(U_PARSE_ERROR);
 905     }
 906
 907     /* check that the code points (p.code) are in ascending order */
 908     if(p.code<=prevCode && p.code>0) {
 909         fprintf(stderr, "genprops: error - UnicodeData entries out of order, U+%04lx after U+%04lx\n",
 910                 (unsigned long)p.code, (unsigned long)prevCode);
 911         *pErrorCode=U_PARSE_ERROR;
 912         exit(U_PARSE_ERROR);
 913     }
 914     prevCode=p.code;
 915
 916     /* properties for a single code point */
 917     addProps(p.code, value);
 918 }
 919
 920 /* set repeated properties for the areas */
 921 static void
 922 repeatAreaProps() {
 923     uint32_t puaProps;
 924     int32_t i;
 925     UBool hasPlane15PUA, hasPlane16PUA;
 926     UErrorCode errorCode;
 927
 928     /*
 929      * UnicodeData.txt before 3.0.1 did not contain the PUAs on
 930      * planes 15 and 16.
 931      * If that is the case, then we add them here, using the properties
 932      * from the BMP PUA.
 933      */
 934     puaProps=0;
 935     hasPlane15PUA=hasPlane16PUA=FALSE;
 936
 937     for(i=0; i<unicodeAreaIndex; ++i) {
 938         repeatProps(unicodeAreas[i].first,
 939                     unicodeAreas[i].last,
 940                     unicodeAreas[i].props);
 941         if(unicodeAreas[i].first==0xe000) {
 942             puaProps=unicodeAreas[i].props;
 943         } else if(unicodeAreas[i].first==0xf0000) {
 944             hasPlane15PUA=TRUE;
 945         } else if(unicodeAreas[i].first==0x100000) {
 946             hasPlane16PUA=TRUE;
 947         }
 948     }
 949
 950     if(puaProps!=0) {
 951         if(!hasPlane15PUA) {
 952             repeatProps(0xf0000, 0xffffd, puaProps);
 953         }
 954         if(!hasPlane16PUA) {
 955             repeatProps(0x100000, 0x10fffd, puaProps);
 956         }
 957     }
 958
 959     /* Hangul have canonical decompositions */
 960     errorCode=U_ZERO_ERROR;
 961     if(!upvec_setValue(pv, 0xac00, 0xd7a4, 2, (uint32_t)U_DT_CANONICAL, UPROPS_DT_MASK, &errorCode)) {
 962         fprintf(stderr, "genprops error: unable to set decomposition type: %s\n", u_errorName(errorCode));
 963         exit(errorCode);
 964     }
 965 }
 966
 967 static void
 968 parseDB(const char *filename, UErrorCode *pErrorCode) {
 969     /* default Bidi classes for unassigned code points */
 970     static const uint32_t defaultBidi[][2]={ /* { limit, class } */
 971         { 0x0590, U_LEFT_TO_RIGHT },
 972         { 0x0600, U_RIGHT_TO_LEFT },
 973         { 0x07C0, U_RIGHT_TO_LEFT_ARABIC },
 974         { 0xFB1D, U_LEFT_TO_RIGHT },
 975         { 0xFB50, U_RIGHT_TO_LEFT },
 976         { 0xFE00, U_RIGHT_TO_LEFT_ARABIC },
 977         { 0xFE70, U_LEFT_TO_RIGHT },
 978         { 0xFF00, U_RIGHT_TO_LEFT_ARABIC },
 979         { 0x110000, U_LEFT_TO_RIGHT }
 980     };
 981
 982     char *fields[15][2];
 983     UChar32 start, end;
 984     uint32_t prev;
 985     int32_t i;
 986
 987     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
 988         return;
 989     }
 990
 991     /*
 992      * Set default Bidi classes for unassigned code points.
 993      * See table 3-7 "Bidirectional Character Types" in UAX #9.
 994      * http://www.unicode.org/reports/tr9/
 995      */
 996     prev=0;
 997     for(i=0; i<LENGTHOF(defaultBidi); ++i) {
 998         if(defaultBidi[i][1]!=0) {
 999             repeatProps(prev, defaultBidi[i][0]-1, defaultBidi[i][1]<<UPROPS_BIDI_SHIFT);
1000         }
1001         prev=defaultBidi[i][0];
1002     }
1003
1004     /* while unicodeAreas[unicodeAreaIndex] is unused, set its first to a bogus value */
1005     unicodeAreas[0].first=0xffffffff;
1006
1007     u_parseDelimitedFile(filename, ';', fields, 15, unicodeDataLineFn, NULL, pErrorCode);
1008
1009     if(unicodeAreas[unicodeAreaIndex].first!=0xffffffff) {
1010         fprintf(stderr, "genprops: error - the last area \"%s\" from U+%04lx is incomplete\n",
1011             unicodeAreas[unicodeAreaIndex].name,
1012             (unsigned long)unicodeAreas[unicodeAreaIndex].first);
1013         *pErrorCode=U_PARSE_ERROR;
1014         exit(U_PARSE_ERROR);
1015     }
1016
1017     repeatAreaProps();
1018
1019     /* are all sub-properties consumed? */
1020     if(mirrorIndex<mirrorCount) {
1021         fprintf(stderr, "genprops: error - some code points in BidiMirroring.txt are missing from UnicodeData.txt\n");
1022         *pErrorCode=U_PARSE_ERROR;
1023         exit(U_PARSE_ERROR);
1024     }
1025     if(specialCasingIndex<specialCasingCount) {
1026         fprintf(stderr, "genprops: error - some code points in SpecialCasing.txt are missing from UnicodeData.txt\n");
1027         *pErrorCode=U_PARSE_ERROR;
1028         exit(U_PARSE_ERROR);
1029     }
1030     if(caseFoldingIndex<caseFoldingCount) {
1031         fprintf(stderr, "genprops: error - some code points in CaseFolding.txt are missing from UnicodeData.txt\n");
1032         *pErrorCode=U_PARSE_ERROR;
1033         exit(U_PARSE_ERROR);
1034     }
1035
1036     if(U_FAILURE(*pErrorCode)) {
1037         return;
1038     }
1039
1040     for(i=0;
1041         0==uset_getItem(caseSensitive, i, &start, &end, NULL, 0, pErrorCode) && U_SUCCESS(*pErrorCode);
1042         ++i
1043     ) {
1044         addCaseSensitive(start, end);
1045     }
1046     if(*pErrorCode==U_INDEX_OUTOFBOUNDS_ERROR) {
1047         *pErrorCode=U_ZERO_ERROR;
1048     }
1049 }
1050
1051 /*
1052  * Hey, Emacs, please set the following:
1053  *
1054  * Local Variables:
1055  * indent-tabs-mode: nil
1056  * End:
1057  *
1058  */
1059