icuSources/tools/gencase/gencase.c

   1 /*
   2 *******************************************************************************
   3 *
   4 *   Copyright (C) 2004, International Business Machines
   5 *   Corporation and others.  All Rights Reserved.
   6 *
   7 *******************************************************************************
   8 *   file name:  gencase.c
   9 *   encoding:   US-ASCII
  10 *   tab size:   8 (not used)
  11 *   indentation:4
  12 *
  13 *   created on: 2004aug28
  14 *   created by: Markus W. Scherer
  15 *
  16 *   This program reads several of the Unicode character database text files,
*   parses them, and extracts the case mapping properties for each character.
  18 *   It then writes a binary file containing the properties
  19 *   that is designed to be used directly for random-access to
  20 *   the properties of each Unicode character.
  21 */
  22
  23 #include <stdio.h>
  24 #include "unicode/utypes.h"
  25 #include "unicode/uchar.h"
  26 #include "unicode/uset.h"
  27 #include "unicode/putil.h"
  28 #include "unicode/uclean.h"
  29 #include "cmemory.h"
  30 #include "cstring.h"
  31 #include "uarrsort.h"
  32 #include "unewdata.h"
  33 #include "uoptions.h"
  34 #include "uparse.h"
  35 #include "uprops.h"
  36 #include "propsvec.h"
  37 #include "gencase.h"
  38
  39 #define LENGTHOF(array) (sizeof(array)/sizeof((array)[0]))
  40
  41 /* data --------------------------------------------------------------------- */
  42
  43 uint32_t *pv;
  44
  45 UBool beVerbose=FALSE, haveCopyright=TRUE;
  46
  47 /*
  48  * Unicode set collecting the case-sensitive characters;
  49  * see uchar.h UCHAR_CASE_SENSITIVE.
  50  * Add code points from case mappings/foldings in
  51  * the root locale and with default options.
  52  */
  53 static USet *caseSensitive;
  54
  55 /* prototypes --------------------------------------------------------------- */
  56
  57 static void
  58 parseSpecialCasing(const char *filename, UErrorCode *pErrorCode);
  59
  60 static void
  61 parseCaseFolding(const char *filename, UErrorCode *pErrorCode);
  62
  63 static void
  64 parseDB(const char *filename, UErrorCode *pErrorCode);
  65
  66 /* parse files with multiple binary properties ------------------------------ */
  67
  68 /* TODO: more common code, move functions to uparse.h|c */
  69
  70 /* TODO: similar to genprops/props2.c but not the same */
  71
  72 struct Binary {
  73     const char *propName;
  74     int32_t vecWord;
  75     uint32_t vecValue, vecMask;
  76 };
  77 typedef struct Binary Binary;
  78
  79 struct Binaries {
  80     const char *ucdFile;
  81     const Binary *binaries;
  82     int32_t binariesCount;
  83 };
  84 typedef struct Binaries Binaries;
  85
  86 static const Binary
  87 propListNames[]={
  88     { "Soft_Dotted",                        0, UCASE_SOFT_DOTTED,   UCASE_DOT_MASK }
  89 };
  90
  91 static const Binaries
  92 propListBinaries={
  93     "PropList", propListNames, LENGTHOF(propListNames)
  94 };
  95
  96 static const Binary
  97 derCorePropsNames[]={
  98     { "Lowercase",                          0, UCASE_LOWER,         UCASE_TYPE_MASK },
  99     { "Uppercase",                          0, UCASE_UPPER,         UCASE_TYPE_MASK }
 100 };
 101
 102 static const Binaries
 103 derCorePropsBinaries={
 104     "DerivedCoreProperties", derCorePropsNames, LENGTHOF(derCorePropsNames)
 105 };
 106
 107 static void U_CALLCONV
 108 binariesLineFn(void *context,
 109                char *fields[][2], int32_t fieldCount,
 110                UErrorCode *pErrorCode) {
 111     const Binaries *bin;
 112     char *s;
 113     uint32_t start, limit;
 114     int32_t i;
 115
 116     bin=(const Binaries *)context;
 117
 118     u_parseCodePointRange(fields[0][0], &start, &limit, pErrorCode);
 119     if(U_FAILURE(*pErrorCode)) {
 120         fprintf(stderr, "gencase: syntax error in %s.txt field 0 at %s\n", bin->ucdFile, fields[0][0]);
 121         exit(*pErrorCode);
 122     }
 123     ++limit;
 124
 125     /* parse binary property name */
 126     s=(char *)u_skipWhitespace(fields[1][0]);
 127     for(i=0;; ++i) {
 128         if(i==bin->binariesCount) {
 129             /* ignore unrecognized properties */
 130             return;
 131         }
 132         if(isToken(bin->binaries[i].propName, s)) {
 133             break;
 134         }
 135     }
 136
 137     if(bin->binaries[i].vecMask==0) {
 138         fprintf(stderr, "gencase error: mask value %d==0 for %s %s\n",
 139                         (int)bin->binaries[i].vecMask, bin->ucdFile, bin->binaries[i].propName);
 140         exit(U_INTERNAL_PROGRAM_ERROR);
 141     }
 142
 143     if(!upvec_setValue(pv, start, limit, bin->binaries[i].vecWord, bin->binaries[i].vecValue, bin->binaries[i].vecMask, pErrorCode)) {
 144         fprintf(stderr, "gencase error: unable to set %s, code: %s\n",
 145                         bin->binaries[i].propName, u_errorName(*pErrorCode));
 146         exit(*pErrorCode);
 147     }
 148 }
 149
 150 static void
 151 parseBinariesFile(char *filename, char *basename, const char *suffix,
 152                   const Binaries *bin,
 153                   UErrorCode *pErrorCode) {
 154     char *fields[2][2];
 155
 156     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
 157         return;
 158     }
 159
 160     writeUCDFilename(basename, bin->ucdFile, suffix);
 161
 162     u_parseDelimitedFile(filename, ';', fields, 2, binariesLineFn, (void *)bin, pErrorCode);
 163     if(U_FAILURE(*pErrorCode)) {
 164         fprintf(stderr, "error parsing %s.txt: %s\n", bin->ucdFile, u_errorName(*pErrorCode));
 165     }
 166 }
 167
 168 /* -------------------------------------------------------------------------- */
 169
 170 enum
 171 {
 172     HELP_H,
 173     HELP_QUESTION_MARK,
 174     VERBOSE,
 175     COPYRIGHT,
 176     DESTDIR,
 177     SOURCEDIR,
 178     UNICODE_VERSION,
 179     ICUDATADIR
 180 };
 181
 182 /* Keep these values in sync with the above enums */
 183 static UOption options[]={
 184     UOPTION_HELP_H,
 185     UOPTION_HELP_QUESTION_MARK,
 186     UOPTION_VERBOSE,
 187     UOPTION_COPYRIGHT,
 188     UOPTION_DESTDIR,
 189     UOPTION_SOURCEDIR,
 190     { "unicode", NULL, NULL, NULL, 'u', UOPT_REQUIRES_ARG, 0 },
 191     UOPTION_ICUDATADIR
 192 };
 193
 194 extern int
 195 main(int argc, char* argv[]) {
 196     char filename[300];
 197     const char *srcDir=NULL, *destDir=NULL, *suffix=NULL;
 198     char *basename=NULL;
 199     UErrorCode errorCode=U_ZERO_ERROR;
 200
 201     U_MAIN_INIT_ARGS(argc, argv);
 202
 203     /* preset then read command line options */
 204     options[DESTDIR].value=u_getDataDirectory();
 205     options[SOURCEDIR].value="";
 206     options[UNICODE_VERSION].value="";
 207     options[ICUDATADIR].value=u_getDataDirectory();
 208     argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), options);
 209
 210     /* error handling, printing usage message */
 211     if(argc<0) {
 212         fprintf(stderr,
 213             "error in command line argument \"%s\"\n",
 214             argv[-argc]);
 215     }
 216     if(argc<0 || options[HELP_H].doesOccur || options[HELP_QUESTION_MARK].doesOccur) {
 217         /*
 218          * Broken into chucks because the C89 standard says the minimum
 219          * required supported string length is 509 bytes.
 220          */
 221         fprintf(stderr,
 222             "Usage: %s [-options] [suffix]\n"
 223             "\n"
 224             "read the UnicodeData.txt file and other Unicode properties files and\n"
 225             "create a binary file " UCASE_DATA_NAME "." UCASE_DATA_TYPE " with the case mapping properties\n"
 226             "\n",
 227             argv[0]);
 228         fprintf(stderr,
 229             "Options:\n"
 230             "\t-h or -? or --help  this usage text\n"
 231             "\t-v or --verbose     verbose output\n"
 232             "\t-c or --copyright   include a copyright notice\n"
 233             "\t-u or --unicode     Unicode version, followed by the version like 3.0.0\n");
 234         fprintf(stderr,
 235             "\t-d or --destdir     destination directory, followed by the path\n"
 236             "\t-s or --sourcedir   source directory, followed by the path\n"
 237             "\t-i or --icudatadir  directory for locating any needed intermediate data files,\n"
 238             "\t                    followed by path, defaults to %s\n"
 239             "\tsuffix              suffix that is to be appended with a '-'\n"
 240             "\t                    to the source file basenames before opening;\n"
 241             "\t                    'gencase new' will read UnicodeData-new.txt etc.\n",
 242             u_getDataDirectory());
 243         return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR;
 244     }
 245
 246     /* get the options values */
 247     beVerbose=options[VERBOSE].doesOccur;
 248     haveCopyright=options[COPYRIGHT].doesOccur;
 249     srcDir=options[SOURCEDIR].value;
 250     destDir=options[DESTDIR].value;
 251
 252     if(argc>=2) {
 253         suffix=argv[1];
 254     } else {
 255         suffix=NULL;
 256     }
 257
 258     if(options[UNICODE_VERSION].doesOccur) {
 259         setUnicodeVersion(options[UNICODE_VERSION].value);
 260     }
 261     /* else use the default dataVersion in store.c */
 262
 263     if (options[ICUDATADIR].doesOccur) {
 264         u_setDataDirectory(options[ICUDATADIR].value);
 265     }
 266
 267     /* prepare the filename beginning with the source dir */
 268     uprv_strcpy(filename, srcDir);
 269     basename=filename+uprv_strlen(filename);
 270     if(basename>filename && *(basename-1)!=U_FILE_SEP_CHAR) {
 271         *basename++=U_FILE_SEP_CHAR;
 272     }
 273
 274     /* initialize */
 275     pv=upvec_open(1, 10000);
 276     caseSensitive=uset_open(1, 0); /* empty set (start>end) */
 277
 278     /* process SpecialCasing.txt */
 279     writeUCDFilename(basename, "SpecialCasing", suffix);
 280     parseSpecialCasing(filename, &errorCode);
 281
 282     /* process CaseFolding.txt */
 283     writeUCDFilename(basename, "CaseFolding", suffix);
 284     parseCaseFolding(filename, &errorCode);
 285
 286     /* process additional properties files */
 287     *basename=0;
 288
 289     parseBinariesFile(filename, basename, suffix, &propListBinaries, &errorCode);
 290
 291     parseBinariesFile(filename, basename, suffix, &derCorePropsBinaries, &errorCode);
 292
 293     /* process UnicodeData.txt */
 294     writeUCDFilename(basename, "UnicodeData", suffix);
 295     parseDB(filename, &errorCode);
 296
 297     /* process parsed data */
 298     makeCaseClosure();
 299
 300     makeExceptions();
 301
 302     if(U_SUCCESS(errorCode)) {
 303         /* write the properties data file */
 304         generateData(destDir);
 305     }
 306
 307     u_cleanup();
 308     return errorCode;
 309 }
 310
 311 U_CFUNC void
 312 writeUCDFilename(char *basename, const char *filename, const char *suffix) {
 313     int32_t length=(int32_t)uprv_strlen(filename);
 314     uprv_strcpy(basename, filename);
 315     if(suffix!=NULL) {
 316         basename[length++]='-';
 317         uprv_strcpy(basename+length, suffix);
 318         length+=(int32_t)uprv_strlen(suffix);
 319     }
 320     uprv_strcpy(basename+length, ".txt");
 321 }
 322
 323 /* TODO: move to toolutil */
 324 U_CFUNC UBool
 325 isToken(const char *token, const char *s) {
 326     const char *z;
 327     int32_t j;
 328
 329     s=u_skipWhitespace(s);
 330     for(j=0;; ++j) {
 331         if(token[j]!=0) {
 332             if(s[j]!=token[j]) {
 333                 break;
 334             }
 335         } else {
 336             z=u_skipWhitespace(s+j);
 337             if(*z==';' || *z==0) {
 338                 return TRUE;
 339             } else {
 340                 break;
 341             }
 342         }
 343     }
 344
 345     return FALSE;
 346 }
 347
 348 static int32_t
 349 getTokenIndex(const char *const tokens[], int32_t countTokens, const char *s) {
 350     const char *t, *z;
 351     int32_t i, j;
 352
 353     s=u_skipWhitespace(s);
 354     for(i=0; i<countTokens; ++i) {
 355         t=tokens[i];
 356         if(t!=NULL) {
 357             for(j=0;; ++j) {
 358                 if(t[j]!=0) {
 359                     if(s[j]!=t[j]) {
 360                         break;
 361                     }
 362                 } else {
 363                     z=u_skipWhitespace(s+j);
 364                     if(*z==';' || *z==0 || *z=='#' || *z=='\r' || *z=='\n') {
 365                         return i;
 366                     } else {
 367                         break;
 368                     }
 369                 }
 370             }
 371         }
 372     }
 373     return -1;
 374 }
 375
 376 static void
 377 _set_addAll(USet *set, const UChar *s, int32_t length) {
 378     UChar32 c;
 379     int32_t i;
 380
 381     /* needs length>=0 */
 382     for(i=0; i<length; /* U16_NEXT advances i */) {
 383         U16_NEXT(s, i, length, c);
 384         uset_add(set, c);
 385     }
 386 }
 387
 388 /* parser for SpecialCasing.txt --------------------------------------------- */
 389
 390 #define MAX_SPECIAL_CASING_COUNT 500
 391
 392 static SpecialCasing specialCasings[MAX_SPECIAL_CASING_COUNT];
 393 static int32_t specialCasingCount=0;
 394
 395 static void U_CALLCONV
 396 specialCasingLineFn(void *context,
 397                     char *fields[][2], int32_t fieldCount,
 398                     UErrorCode *pErrorCode) {
 399     char *end;
 400
 401     /* get code point */
 402     specialCasings[specialCasingCount].code=(UChar32)uprv_strtoul(u_skipWhitespace(fields[0][0]), &end, 16);
 403     end=(char *)u_skipWhitespace(end);
 404     if(end<=fields[0][0] || end!=fields[0][1]) {
 405         fprintf(stderr, "gencase: syntax error in SpecialCasing.txt field 0 at %s\n", fields[0][0]);
 406         *pErrorCode=U_PARSE_ERROR;
 407         exit(U_PARSE_ERROR);
 408     }
 409
 410     /* is this a complex mapping? */
 411     if(*(end=(char *)u_skipWhitespace(fields[4][0]))!=0 && *end!=';' && *end!='#') {
 412         /* there is some condition text in the fifth field */
 413         specialCasings[specialCasingCount].isComplex=TRUE;
 414
 415         /* do not store any actual mappings for this */
 416         specialCasings[specialCasingCount].lowerCase[0]=0;
 417         specialCasings[specialCasingCount].upperCase[0]=0;
 418         specialCasings[specialCasingCount].titleCase[0]=0;
 419     } else {
 420         /* just set the "complex" flag and get the case mappings */
 421         specialCasings[specialCasingCount].isComplex=FALSE;
 422         specialCasings[specialCasingCount].lowerCase[0]=
 423             (UChar)u_parseString(fields[1][0], specialCasings[specialCasingCount].lowerCase+1, 31, NULL, pErrorCode);
 424         specialCasings[specialCasingCount].upperCase[0]=
 425             (UChar)u_parseString(fields[3][0], specialCasings[specialCasingCount].upperCase+1, 31, NULL, pErrorCode);
 426         specialCasings[specialCasingCount].titleCase[0]=
 427             (UChar)u_parseString(fields[2][0], specialCasings[specialCasingCount].titleCase+1, 31, NULL, pErrorCode);
 428         if(U_FAILURE(*pErrorCode)) {
 429             fprintf(stderr, "gencase: error parsing special casing at %s\n", fields[0][0]);
 430             exit(*pErrorCode);
 431         }
 432
 433         uset_add(caseSensitive, (UChar32)specialCasings[specialCasingCount].code);
 434         _set_addAll(caseSensitive, specialCasings[specialCasingCount].lowerCase+1, specialCasings[specialCasingCount].lowerCase[0]);
 435         _set_addAll(caseSensitive, specialCasings[specialCasingCount].upperCase+1, specialCasings[specialCasingCount].upperCase[0]);
 436         _set_addAll(caseSensitive, specialCasings[specialCasingCount].titleCase+1, specialCasings[specialCasingCount].titleCase[0]);
 437     }
 438
 439     if(++specialCasingCount==MAX_SPECIAL_CASING_COUNT) {
 440         fprintf(stderr, "gencase: too many special casing mappings\n");
 441         *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
 442         exit(U_INDEX_OUTOFBOUNDS_ERROR);
 443     }
 444 }
 445
 446 static int32_t U_CALLCONV
 447 compareSpecialCasings(const void *context, const void *left, const void *right) {
 448     return ((const SpecialCasing *)left)->code-((const SpecialCasing *)right)->code;
 449 }
 450
 451 static void
 452 parseSpecialCasing(const char *filename, UErrorCode *pErrorCode) {
 453     char *fields[5][2];
 454     int32_t i, j;
 455
 456     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
 457         return;
 458     }
 459
 460     u_parseDelimitedFile(filename, ';', fields, 5, specialCasingLineFn, NULL, pErrorCode);
 461
 462     /* sort the special casing entries by code point */
 463     if(specialCasingCount>0) {
 464         uprv_sortArray(specialCasings, specialCasingCount, sizeof(SpecialCasing),
 465                        compareSpecialCasings, NULL, FALSE, pErrorCode);
 466     }
 467     if(U_FAILURE(*pErrorCode)) {
 468         return;
 469     }
 470
 471     /* replace multiple entries for any code point by one "complex" one */
 472     j=0;
 473     for(i=1; i<specialCasingCount; ++i) {
 474         if(specialCasings[i-1].code==specialCasings[i].code) {
 475             /* there is a duplicate code point */
 476             specialCasings[i-1].code=0x7fffffff;    /* remove this entry in the following sorting */
 477             specialCasings[i].isComplex=TRUE;       /* make the following one complex */
 478             specialCasings[i].lowerCase[0]=0;
 479             specialCasings[i].upperCase[0]=0;
 480             specialCasings[i].titleCase[0]=0;
 481             ++j;
 482         }
 483     }
 484
 485     /* if some entries just were removed, then re-sort */
 486     if(j>0) {
 487         uprv_sortArray(specialCasings, specialCasingCount, sizeof(SpecialCasing),
 488                        compareSpecialCasings, NULL, FALSE, pErrorCode);
 489         specialCasingCount-=j;
 490     }
 491     if(U_FAILURE(*pErrorCode)) {
 492         return;
 493     }
 494
 495     /*
 496      * Add one complex mapping to caseSensitive that was filtered out above:
 497      * Greek final Sigma has a conditional mapping but not locale-sensitive,
 498      * and it is taken when lowercasing just U+03A3 alone.
 499      * 03A3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK CAPITAL LETTER SIGMA
 500      */
 501     uset_add(caseSensitive, 0x3c2);
 502 }
 503
 504 /* parser for CaseFolding.txt ----------------------------------------------- */
 505
 506 #define MAX_CASE_FOLDING_COUNT 2000
 507
 508 static CaseFolding caseFoldings[MAX_CASE_FOLDING_COUNT];
 509 static int32_t caseFoldingCount=0;
 510
 511 static void U_CALLCONV
 512 caseFoldingLineFn(void *context,
 513                   char *fields[][2], int32_t fieldCount,
 514                   UErrorCode *pErrorCode) {
 515     char *end;
 516     static UChar32 prevCode=0;
 517     int32_t count;
 518     char status;
 519
 520     /* get code point */
 521     caseFoldings[caseFoldingCount].code=(UChar32)uprv_strtoul(u_skipWhitespace(fields[0][0]), &end, 16);
 522     end=(char *)u_skipWhitespace(end);
 523     if(end<=fields[0][0] || end!=fields[0][1]) {
 524         fprintf(stderr, "gencase: syntax error in CaseFolding.txt field 0 at %s\n", fields[0][0]);
 525         *pErrorCode=U_PARSE_ERROR;
 526         exit(U_PARSE_ERROR);
 527     }
 528
 529     /* get the status of this mapping */
 530     caseFoldings[caseFoldingCount].status=status=*u_skipWhitespace(fields[1][0]);
 531     if(status!='L' && status!='E' && status!='C' && status!='S' && status!='F' && status!='I' && status!='T') {
 532         fprintf(stderr, "gencase: unrecognized status field in CaseFolding.txt at %s\n", fields[0][0]);
 533         *pErrorCode=U_PARSE_ERROR;
 534         exit(U_PARSE_ERROR);
 535     }
 536
 537     /* ignore all case folding mappings that are the same as the UnicodeData.txt lowercase mappings */
 538     if(status=='L') {
 539         return;
 540     }
 541
 542     /* get the mapping */
 543     count=caseFoldings[caseFoldingCount].full[0]=
 544         (UChar)u_parseString(fields[2][0], caseFoldings[caseFoldingCount].full+1, 31, (uint32_t *)&caseFoldings[caseFoldingCount].simple, pErrorCode);
 545     if(U_FAILURE(*pErrorCode)) {
 546         fprintf(stderr, "gencase: error parsing CaseFolding.txt mapping at %s\n", fields[0][0]);
 547         exit(*pErrorCode);
 548     }
 549
 550     /* there is a simple mapping only if there is exactly one code point (count is in UChars) */
 551     if(count==0 || count>2 || (count==2 && UTF_IS_SINGLE(caseFoldings[caseFoldingCount].full[1]))) {
 552         caseFoldings[caseFoldingCount].simple=0;
 553     }
 554
 555     /* update the case-sensitive set */
 556     if(status!='T') {
 557         uset_add(caseSensitive, (UChar32)caseFoldings[caseFoldingCount].code);
 558         _set_addAll(caseSensitive, caseFoldings[caseFoldingCount].full+1, caseFoldings[caseFoldingCount].full[0]);
 559     }
 560
 561     /* check the status */
 562     if(status=='S') {
 563         /* check if there was a full mapping for this code point before */
 564         if( caseFoldingCount>0 &&
 565             caseFoldings[caseFoldingCount-1].code==caseFoldings[caseFoldingCount].code &&
 566             caseFoldings[caseFoldingCount-1].status=='F'
 567         ) {
 568             /* merge the two entries */
 569             caseFoldings[caseFoldingCount-1].simple=caseFoldings[caseFoldingCount].simple;
 570             return;
 571         }
 572     } else if(status=='F') {
 573         /* check if there was a simple mapping for this code point before */
 574         if( caseFoldingCount>0 &&
 575             caseFoldings[caseFoldingCount-1].code==caseFoldings[caseFoldingCount].code &&
 576             caseFoldings[caseFoldingCount-1].status=='S'
 577         ) {
 578             /* merge the two entries */
 579             uprv_memcpy(caseFoldings[caseFoldingCount-1].full, caseFoldings[caseFoldingCount].full, 32*U_SIZEOF_UCHAR);
 580             return;
 581         }
 582     } else if(status=='I' || status=='T') {
 583         /* check if there was a default mapping for this code point before (remove it) */
 584         while(caseFoldingCount>0 &&
 585               caseFoldings[caseFoldingCount-1].code==caseFoldings[caseFoldingCount].code
 586         ) {
 587             prevCode=0;
 588             --caseFoldingCount;
 589         }
 590         /* store only a marker for special handling for cases like dotless i */
 591         caseFoldings[caseFoldingCount].simple=0;
 592         caseFoldings[caseFoldingCount].full[0]=0;
 593     }
 594
 595     /* check that the code points (caseFoldings[caseFoldingCount].code) are in ascending order */
 596     if(caseFoldings[caseFoldingCount].code<=prevCode && caseFoldings[caseFoldingCount].code>0) {
 597         fprintf(stderr, "gencase: error - CaseFolding entries out of order, U+%04lx after U+%04lx\n",
 598                 (unsigned long)caseFoldings[caseFoldingCount].code,
 599                 (unsigned long)prevCode);
 600         *pErrorCode=U_PARSE_ERROR;
 601         exit(U_PARSE_ERROR);
 602     }
 603     prevCode=caseFoldings[caseFoldingCount].code;
 604
 605     if(++caseFoldingCount==MAX_CASE_FOLDING_COUNT) {
 606         fprintf(stderr, "gencase: too many case folding mappings\n");
 607         *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
 608         exit(U_INDEX_OUTOFBOUNDS_ERROR);
 609     }
 610 }
 611
 612 static void
 613 parseCaseFolding(const char *filename, UErrorCode *pErrorCode) {
 614     char *fields[3][2];
 615
 616     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
 617         return;
 618     }
 619
 620     u_parseDelimitedFile(filename, ';', fields, 3, caseFoldingLineFn, NULL, pErrorCode);
 621 }
 622
 623 /* parser for UnicodeData.txt ----------------------------------------------- */
 624
 625 /* general categories */
 626 const char *const
 627 genCategoryNames[U_CHAR_CATEGORY_COUNT]={
 628     "Cn",
 629     "Lu", "Ll", "Lt", "Lm", "Lo", "Mn", "Me",
 630     "Mc", "Nd", "Nl", "No",
 631     "Zs", "Zl", "Zp",
 632     "Cc", "Cf", "Co", "Cs",
 633     "Pd", "Ps", "Pe", "Pc", "Po",
 634     "Sm", "Sc", "Sk", "So",
 635     "Pi", "Pf"
 636 };
 637
 638 static int32_t specialCasingIndex=0, caseFoldingIndex=0;
 639
 640 static void U_CALLCONV
 641 unicodeDataLineFn(void *context,
 642                   char *fields[][2], int32_t fieldCount,
 643                   UErrorCode *pErrorCode) {
 644     Props p;
 645     char *end;
 646     static UChar32 prevCode=0;
 647     UChar32 value;
 648     int32_t i;
 649
 650     /* reset the properties */
 651     uprv_memset(&p, 0, sizeof(Props));
 652
 653     /* get the character code, field 0 */
 654     p.code=(UChar32)uprv_strtoul(fields[0][0], &end, 16);
 655     if(end<=fields[0][0] || end!=fields[0][1]) {
 656         fprintf(stderr, "gencase: syntax error in field 0 at %s\n", fields[0][0]);
 657         *pErrorCode=U_PARSE_ERROR;
 658         exit(U_PARSE_ERROR);
 659     }
 660
 661     /* get general category, field 2 */
 662     i=getTokenIndex(genCategoryNames, U_CHAR_CATEGORY_COUNT, fields[2][0]);
 663     if(i>=0) {
 664         p.gc=(uint8_t)i;
 665     } else {
 666         fprintf(stderr, "gencase: unknown general category \"%s\" at code 0x%lx\n",
 667             fields[2][0], (unsigned long)p.code);
 668         *pErrorCode=U_PARSE_ERROR;
 669         exit(U_PARSE_ERROR);
 670     }
 671
 672     /* get canonical combining class, field 3 */
 673     value=(UChar32)uprv_strtoul(fields[3][0], &end, 10);
 674     if(end<=fields[3][0] || end!=fields[3][1] || value>0xff) {
 675         fprintf(stderr, "gencase: syntax error in field 3 at %s\n", fields[0][0]);
 676         *pErrorCode=U_PARSE_ERROR;
 677         exit(U_PARSE_ERROR);
 678     }
 679     p.cc=(uint8_t)value;
 680
 681     /* get uppercase mapping, field 12 */
 682     value=(UChar32)uprv_strtoul(fields[12][0], &end, 16);
 683     if(end!=fields[12][1]) {
 684         fprintf(stderr, "gencase: syntax error in field 12 at code 0x%lx\n",
 685             (unsigned long)p.code);
 686         *pErrorCode=U_PARSE_ERROR;
 687         exit(U_PARSE_ERROR);
 688     }
 689     if(value!=0 && value!=p.code) {
 690         p.upperCase=value;
 691         uset_add(caseSensitive, p.code);
 692         uset_add(caseSensitive, value);
 693     }
 694
 695     /* get lowercase value, field 13 */
 696     value=(UChar32)uprv_strtoul(fields[13][0], &end, 16);
 697     if(end!=fields[13][1]) {
 698         fprintf(stderr, "gencase: syntax error in field 13 at code 0x%lx\n",
 699             (unsigned long)p.code);
 700         *pErrorCode=U_PARSE_ERROR;
 701         exit(U_PARSE_ERROR);
 702     }
 703     if(value!=0 && value!=p.code) {
 704         p.lowerCase=value;
 705         uset_add(caseSensitive, p.code);
 706         uset_add(caseSensitive, value);
 707     }
 708
 709     /* get titlecase value, field 14 */
 710     value=(UChar32)uprv_strtoul(fields[14][0], &end, 16);
 711     if(end!=fields[14][1]) {
 712         fprintf(stderr, "gencase: syntax error in field 14 at code 0x%lx\n",
 713             (unsigned long)p.code);
 714         *pErrorCode=U_PARSE_ERROR;
 715         exit(U_PARSE_ERROR);
 716     }
 717     if(value!=0 && value!=p.code) {
 718         p.titleCase=value;
 719         uset_add(caseSensitive, p.code);
 720         uset_add(caseSensitive, value);
 721     }
 722
 723     /* set additional properties from previously parsed files */
 724     if(specialCasingIndex<specialCasingCount && p.code==specialCasings[specialCasingIndex].code) {
 725         p.specialCasing=specialCasings+specialCasingIndex++;
 726     } else {
 727         p.specialCasing=NULL;
 728     }
 729     if(caseFoldingIndex<caseFoldingCount && p.code==caseFoldings[caseFoldingIndex].code) {
 730         p.caseFolding=caseFoldings+caseFoldingIndex++;
 731
 732         /* ignore "Common" mappings (simple==full) that map to the same code point as the regular lowercase mapping */
 733         if( p.caseFolding->status=='C' &&
 734             p.caseFolding->simple==p.lowerCase
 735         ) {
 736             p.caseFolding=NULL;
 737         }
 738     } else {
 739         p.caseFolding=NULL;
 740     }
 741
 742     /* check for non-character code points */
 743     if((p.code&0xfffe)==0xfffe || (uint32_t)(p.code-0xfdd0)<0x20) {
 744         fprintf(stderr, "gencase: error - properties for non-character code point U+%04lx\n",
 745                 (unsigned long)p.code);
 746         *pErrorCode=U_PARSE_ERROR;
 747         exit(U_PARSE_ERROR);
 748     }
 749
 750     /* check that the code points (p.code) are in ascending order */
 751     if(p.code<=prevCode && p.code>0) {
 752         fprintf(stderr, "gencase: error - UnicodeData entries out of order, U+%04lx after U+%04lx\n",
 753                 (unsigned long)p.code, (unsigned long)prevCode);
 754         *pErrorCode=U_PARSE_ERROR;
 755         exit(U_PARSE_ERROR);
 756     }
 757
 758     /* properties for a single code point */
 759     setProps(&p);
 760
 761     prevCode=p.code;
 762 }
 763
 764 static void
 765 parseDB(const char *filename, UErrorCode *pErrorCode) {
 766     char *fields[15][2];
 767     UChar32 start, end;
 768     int32_t i;
 769
 770     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
 771         return;
 772     }
 773
 774     u_parseDelimitedFile(filename, ';', fields, 15, unicodeDataLineFn, NULL, pErrorCode);
 775
 776     /* are all sub-properties consumed? */
 777     if(specialCasingIndex<specialCasingCount) {
 778         fprintf(stderr, "gencase: error - some code points in SpecialCasing.txt are missing from UnicodeData.txt\n");
 779         *pErrorCode=U_PARSE_ERROR;
 780         exit(U_PARSE_ERROR);
 781     }
 782     if(caseFoldingIndex<caseFoldingCount) {
 783         fprintf(stderr, "gencase: error - some code points in CaseFolding.txt are missing from UnicodeData.txt\n");
 784         *pErrorCode=U_PARSE_ERROR;
 785         exit(U_PARSE_ERROR);
 786     }
 787
 788     if(U_FAILURE(*pErrorCode)) {
 789         return;
 790     }
 791
 792     for(i=0;
 793         0==uset_getItem(caseSensitive, i, &start, &end, NULL, 0, pErrorCode) && U_SUCCESS(*pErrorCode);
 794         ++i
 795     ) {
 796         addCaseSensitive(start, end);
 797     }
 798     if(*pErrorCode==U_INDEX_OUTOFBOUNDS_ERROR) {
 799         *pErrorCode=U_ZERO_ERROR;
 800     }
 801 }
 802
 803 /*
 804  * Hey, Emacs, please set the following:
 805  *
 806  * Local Variables:
 807  * indent-tabs-mode: nil
 808  * End:
 809  *
 810  */