icuSources/tools/gencase/gencase.c

   1 /*
   2 *******************************************************************************
   3 *
   4 *   Copyright (C) 2004-2005, International Business Machines
   5 *   Corporation and others.  All Rights Reserved.
   6 *
   7 *******************************************************************************
   8 *   file name:  gencase.c
   9 *   encoding:   US-ASCII
  10 *   tab size:   8 (not used)
  11 *   indentation:4
  12 *
  13 *   created on: 2004aug28
  14 *   created by: Markus W. Scherer
  15 *
  16 *   This program reads several of the Unicode character database text files,
  17 *   parses them, and extracts the case mapping properties for each character.
  18 *   It then writes a binary file containing the properties
  19 *   that is designed to be used directly for random-access to
  20 *   the properties of each Unicode character.
  21 */
  22
  23 #include <stdio.h>
  24 #include "unicode/utypes.h"
  25 #include "unicode/uchar.h"
  26 #include "unicode/uset.h"
  27 #include "unicode/putil.h"
  28 #include "unicode/uclean.h"
  29 #include "cmemory.h"
  30 #include "cstring.h"
  31 #include "uarrsort.h"
  32 #include "unewdata.h"
  33 #include "uoptions.h"
  34 #include "uparse.h"
  35 #include "uprops.h"
  36 #include "propsvec.h"
  37 #include "gencase.h"
  38
  39 #define LENGTHOF(array) (sizeof(array)/sizeof((array)[0]))
  40
  41 /* data --------------------------------------------------------------------- */
  42
  43 uint32_t *pv;
  44
  45 UBool beVerbose=FALSE, haveCopyright=TRUE;
  46
  47 /*
  48  * Unicode set collecting the case-sensitive characters;
  49  * see uchar.h UCHAR_CASE_SENSITIVE.
  50  * Add code points from case mappings/foldings in
  51  * the root locale and with default options.
  52  */
  53 static USet *caseSensitive;
  54
  55 /* prototypes --------------------------------------------------------------- */
  56
  57 static void
  58 parseSpecialCasing(const char *filename, UErrorCode *pErrorCode);
  59
  60 static void
  61 parseCaseFolding(const char *filename, UErrorCode *pErrorCode);
  62
  63 static void
  64 parseDB(const char *filename, UErrorCode *pErrorCode);
  65
  66 /* parse files with multiple binary properties ------------------------------ */
  67
  68 /* TODO: more common code, move functions to uparse.h|c */
  69
  70 /* TODO: similar to genprops/props2.c but not the same */
  71
  72 struct Binary {
  73     const char *propName;
  74     int32_t vecWord;
  75     uint32_t vecValue, vecMask;
  76 };
  77 typedef struct Binary Binary;
  78
  79 struct Binaries {
  80     const char *ucdFile;
  81     const Binary *binaries;
  82     int32_t binariesCount;
  83 };
  84 typedef struct Binaries Binaries;
  85
  86 static const Binary
  87 propListNames[]={
  88     { "Soft_Dotted",                        0, UCASE_SOFT_DOTTED,   UCASE_DOT_MASK }
  89 };
  90
  91 static const Binaries
  92 propListBinaries={
  93     "PropList", propListNames, LENGTHOF(propListNames)
  94 };
  95
  96 static const Binary
  97 derCorePropsNames[]={
  98     { "Lowercase",                          0, UCASE_LOWER,         UCASE_TYPE_MASK },
  99     { "Uppercase",                          0, UCASE_UPPER,         UCASE_TYPE_MASK }
 100 };
 101
 102 static const Binaries
 103 derCorePropsBinaries={
 104     "DerivedCoreProperties", derCorePropsNames, LENGTHOF(derCorePropsNames)
 105 };
 106
 107 /* treat Word_Break=MidLetter as a binary property (we ignore all other Word_Break values) */
 108 static const Binary
 109 wordBreakNames[]={
 110     { "MidLetter",                          1, U_MASK(UGENCASE_IS_MID_LETTER_SHIFT), U_MASK(UGENCASE_IS_MID_LETTER_SHIFT) }
 111 };
 112
 113 static const Binaries
 114 wordBreakBinaries={
 115     "WordBreakProperty", wordBreakNames, LENGTHOF(wordBreakNames)
 116 };
 117
 118 static void U_CALLCONV
 119 binariesLineFn(void *context,
 120                char *fields[][2], int32_t fieldCount,
 121                UErrorCode *pErrorCode) {
 122     const Binaries *bin;
 123     char *s;
 124     uint32_t start, limit;
 125     int32_t i;
 126
 127     bin=(const Binaries *)context;
 128
 129     u_parseCodePointRange(fields[0][0], &start, &limit, pErrorCode);
 130     if(U_FAILURE(*pErrorCode)) {
 131         fprintf(stderr, "gencase: syntax error in %s.txt field 0 at %s\n", bin->ucdFile, fields[0][0]);
 132         exit(*pErrorCode);
 133     }
 134     ++limit;
 135
 136     /* parse binary property name */
 137     s=(char *)u_skipWhitespace(fields[1][0]);
 138     for(i=0;; ++i) {
 139         if(i==bin->binariesCount) {
 140             /* ignore unrecognized properties */
 141             return;
 142         }
 143         if(isToken(bin->binaries[i].propName, s)) {
 144             break;
 145         }
 146     }
 147
 148     if(bin->binaries[i].vecMask==0) {
 149         fprintf(stderr, "gencase error: mask value %d==0 for %s %s\n",
 150                         (int)bin->binaries[i].vecMask, bin->ucdFile, bin->binaries[i].propName);
 151         exit(U_INTERNAL_PROGRAM_ERROR);
 152     }
 153
 154     if(!upvec_setValue(pv, start, limit, bin->binaries[i].vecWord, bin->binaries[i].vecValue, bin->binaries[i].vecMask, pErrorCode)) {
 155         fprintf(stderr, "gencase error: unable to set %s, code: %s\n",
 156                         bin->binaries[i].propName, u_errorName(*pErrorCode));
 157         exit(*pErrorCode);
 158     }
 159 }
 160
 161 static void
 162 parseBinariesFile(char *filename, char *basename, const char *suffix,
 163                   const Binaries *bin,
 164                   UErrorCode *pErrorCode) {
 165     char *fields[2][2];
 166
 167     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
 168         return;
 169     }
 170
 171     writeUCDFilename(basename, bin->ucdFile, suffix);
 172
 173     u_parseDelimitedFile(filename, ';', fields, 2, binariesLineFn, (void *)bin, pErrorCode);
 174     if(U_FAILURE(*pErrorCode)) {
 175         fprintf(stderr, "error parsing %s.txt: %s\n", bin->ucdFile, u_errorName(*pErrorCode));
 176     }
 177 }
 178
 179 /* -------------------------------------------------------------------------- */
 180
 181 enum
 182 {
 183     HELP_H,
 184     HELP_QUESTION_MARK,
 185     VERBOSE,
 186     COPYRIGHT,
 187     DESTDIR,
 188     SOURCEDIR,
 189     UNICODE_VERSION,
 190     ICUDATADIR,
 191     CSOURCE
 192 };
 193
 194 /* Keep these values in sync with the above enums */
 195 static UOption options[]={
 196     UOPTION_HELP_H,
 197     UOPTION_HELP_QUESTION_MARK,
 198     UOPTION_VERBOSE,
 199     UOPTION_COPYRIGHT,
 200     UOPTION_DESTDIR,
 201     UOPTION_SOURCEDIR,
 202     UOPTION_DEF("unicode", 'u', UOPT_REQUIRES_ARG),
 203     UOPTION_ICUDATADIR,
 204     UOPTION_DEF("csource", 'C', UOPT_NO_ARG)
 205 };
 206
 207 extern int
 208 main(int argc, char* argv[]) {
 209     char filename[300];
 210     const char *srcDir=NULL, *destDir=NULL, *suffix=NULL;
 211     char *basename=NULL;
 212     UErrorCode errorCode=U_ZERO_ERROR;
 213
 214     U_MAIN_INIT_ARGS(argc, argv);
 215
 216     /* preset then read command line options */
 217     options[DESTDIR].value=u_getDataDirectory();
 218     options[SOURCEDIR].value="";
 219     options[UNICODE_VERSION].value="";
 220     options[ICUDATADIR].value=u_getDataDirectory();
 221     argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), options);
 222
 223     /* error handling, printing usage message */
 224     if(argc<0) {
 225         fprintf(stderr,
 226             "error in command line argument \"%s\"\n",
 227             argv[-argc]);
 228     }
 229     if(argc<0 || options[HELP_H].doesOccur || options[HELP_QUESTION_MARK].doesOccur) {
 230         /*
 231          * Broken into chucks because the C89 standard says the minimum
 232          * required supported string length is 509 bytes.
 233          */
 234         fprintf(stderr,
 235             "Usage: %s [-options] [suffix]\n"
 236             "\n"
 237             "read the UnicodeData.txt file and other Unicode properties files and\n"
 238             "create a binary file " UCASE_DATA_NAME "." UCASE_DATA_TYPE " with the case mapping properties\n"
 239             "\n",
 240             argv[0]);
 241         fprintf(stderr,
 242             "Options:\n"
 243             "\t-h or -? or --help  this usage text\n"
 244             "\t-v or --verbose     verbose output\n"
 245             "\t-c or --copyright   include a copyright notice\n"
 246             "\t-u or --unicode     Unicode version, followed by the version like 3.0.0\n"
 247             "\t-C or --csource     generate a .c source file rather than the .icu binary\n");
 248         fprintf(stderr,
 249             "\t-d or --destdir     destination directory, followed by the path\n"
 250             "\t-s or --sourcedir   source directory, followed by the path\n"
 251             "\t-i or --icudatadir  directory for locating any needed intermediate data files,\n"
 252             "\t                    followed by path, defaults to %s\n"
 253             "\tsuffix              suffix that is to be appended with a '-'\n"
 254             "\t                    to the source file basenames before opening;\n"
 255             "\t                    'gencase new' will read UnicodeData-new.txt etc.\n",
 256             u_getDataDirectory());
 257         return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR;
 258     }
 259
 260     /* get the options values */
 261     beVerbose=options[VERBOSE].doesOccur;
 262     haveCopyright=options[COPYRIGHT].doesOccur;
 263     srcDir=options[SOURCEDIR].value;
 264     destDir=options[DESTDIR].value;
 265
 266     if(argc>=2) {
 267         suffix=argv[1];
 268     } else {
 269         suffix=NULL;
 270     }
 271
 272     if(options[UNICODE_VERSION].doesOccur) {
 273         setUnicodeVersion(options[UNICODE_VERSION].value);
 274     }
 275     /* else use the default dataVersion in store.c */
 276
 277     if (options[ICUDATADIR].doesOccur) {
 278         u_setDataDirectory(options[ICUDATADIR].value);
 279     }
 280
 281     /* prepare the filename beginning with the source dir */
 282     uprv_strcpy(filename, srcDir);
 283     basename=filename+uprv_strlen(filename);
 284     if(basename>filename && *(basename-1)!=U_FILE_SEP_CHAR) {
 285         *basename++=U_FILE_SEP_CHAR;
 286     }
 287
 288     /* initialize */
 289     pv=upvec_open(2, 10000);
 290     caseSensitive=uset_open(1, 0); /* empty set (start>end) */
 291
 292     /* process SpecialCasing.txt */
 293     writeUCDFilename(basename, "SpecialCasing", suffix);
 294     parseSpecialCasing(filename, &errorCode);
 295
 296     /* process CaseFolding.txt */
 297     writeUCDFilename(basename, "CaseFolding", suffix);
 298     parseCaseFolding(filename, &errorCode);
 299
 300     /* process additional properties files */
 301     *basename=0;
 302
 303     parseBinariesFile(filename, basename, suffix, &propListBinaries, &errorCode);
 304
 305     parseBinariesFile(filename, basename, suffix, &derCorePropsBinaries, &errorCode);
 306
 307     if(ucdVersion>=UNI_4_1) {
 308         parseBinariesFile(filename, basename, suffix, &wordBreakBinaries, &errorCode);
 309     }
 310
 311     /* process UnicodeData.txt */
 312     writeUCDFilename(basename, "UnicodeData", suffix);
 313     parseDB(filename, &errorCode);
 314
 315     /* process parsed data */
 316     makeCaseClosure();
 317
 318     makeExceptions();
 319
 320     if(U_SUCCESS(errorCode)) {
 321         /* write the properties data file */
 322         generateData(destDir, options[CSOURCE].doesOccur);
 323     }
 324
 325     u_cleanup();
 326     return errorCode;
 327 }
 328
 329 U_CFUNC void
 330 writeUCDFilename(char *basename, const char *filename, const char *suffix) {
 331     int32_t length=(int32_t)uprv_strlen(filename);
 332     uprv_strcpy(basename, filename);
 333     if(suffix!=NULL) {
 334         basename[length++]='-';
 335         uprv_strcpy(basename+length, suffix);
 336         length+=(int32_t)uprv_strlen(suffix);
 337     }
 338     uprv_strcpy(basename+length, ".txt");
 339 }
 340
 341 /* TODO: move to toolutil */
 342 U_CFUNC UBool
 343 isToken(const char *token, const char *s) {
 344     const char *z;
 345     int32_t j;
 346
 347     s=u_skipWhitespace(s);
 348     for(j=0;; ++j) {
 349         if(token[j]!=0) {
 350             if(s[j]!=token[j]) {
 351                 break;
 352             }
 353         } else {
 354             z=u_skipWhitespace(s+j);
 355             if(*z==';' || *z==0) {
 356                 return TRUE;
 357             } else {
 358                 break;
 359             }
 360         }
 361     }
 362
 363     return FALSE;
 364 }
 365
 366 static int32_t
 367 getTokenIndex(const char *const tokens[], int32_t countTokens, const char *s) {
 368     const char *t, *z;
 369     int32_t i, j;
 370
 371     s=u_skipWhitespace(s);
 372     for(i=0; i<countTokens; ++i) {
 373         t=tokens[i];
 374         if(t!=NULL) {
 375             for(j=0;; ++j) {
 376                 if(t[j]!=0) {
 377                     if(s[j]!=t[j]) {
 378                         break;
 379                     }
 380                 } else {
 381                     z=u_skipWhitespace(s+j);
 382                     if(*z==';' || *z==0 || *z=='#' || *z=='\r' || *z=='\n') {
 383                         return i;
 384                     } else {
 385                         break;
 386                     }
 387                 }
 388             }
 389         }
 390     }
 391     return -1;
 392 }
 393
 394 static void
 395 _set_addAll(USet *set, const UChar *s, int32_t length) {
 396     UChar32 c;
 397     int32_t i;
 398
 399     /* needs length>=0 */
 400     for(i=0; i<length; /* U16_NEXT advances i */) {
 401         U16_NEXT(s, i, length, c);
 402         uset_add(set, c);
 403     }
 404 }
 405
 406 /* parser for SpecialCasing.txt --------------------------------------------- */
 407
 408 #define MAX_SPECIAL_CASING_COUNT 500
 409
 410 static SpecialCasing specialCasings[MAX_SPECIAL_CASING_COUNT];
 411 static int32_t specialCasingCount=0;
 412
 413 static void U_CALLCONV
 414 specialCasingLineFn(void *context,
 415                     char *fields[][2], int32_t fieldCount,
 416                     UErrorCode *pErrorCode) {
 417     char *end;
 418
 419     /* get code point */
 420     specialCasings[specialCasingCount].code=(UChar32)uprv_strtoul(u_skipWhitespace(fields[0][0]), &end, 16);
 421     end=(char *)u_skipWhitespace(end);
 422     if(end<=fields[0][0] || end!=fields[0][1]) {
 423         fprintf(stderr, "gencase: syntax error in SpecialCasing.txt field 0 at %s\n", fields[0][0]);
 424         *pErrorCode=U_PARSE_ERROR;
 425         exit(U_PARSE_ERROR);
 426     }
 427
 428     /* is this a complex mapping? */
 429     if(*(end=(char *)u_skipWhitespace(fields[4][0]))!=0 && *end!=';' && *end!='#') {
 430         /* there is some condition text in the fifth field */
 431         specialCasings[specialCasingCount].isComplex=TRUE;
 432
 433         /* do not store any actual mappings for this */
 434         specialCasings[specialCasingCount].lowerCase[0]=0;
 435         specialCasings[specialCasingCount].upperCase[0]=0;
 436         specialCasings[specialCasingCount].titleCase[0]=0;
 437     } else {
 438         /* just set the "complex" flag and get the case mappings */
 439         specialCasings[specialCasingCount].isComplex=FALSE;
 440         specialCasings[specialCasingCount].lowerCase[0]=
 441             (UChar)u_parseString(fields[1][0], specialCasings[specialCasingCount].lowerCase+1, 31, NULL, pErrorCode);
 442         specialCasings[specialCasingCount].upperCase[0]=
 443             (UChar)u_parseString(fields[3][0], specialCasings[specialCasingCount].upperCase+1, 31, NULL, pErrorCode);
 444         specialCasings[specialCasingCount].titleCase[0]=
 445             (UChar)u_parseString(fields[2][0], specialCasings[specialCasingCount].titleCase+1, 31, NULL, pErrorCode);
 446         if(U_FAILURE(*pErrorCode)) {
 447             fprintf(stderr, "gencase: error parsing special casing at %s\n", fields[0][0]);
 448             exit(*pErrorCode);
 449         }
 450
 451         uset_add(caseSensitive, (UChar32)specialCasings[specialCasingCount].code);
 452         _set_addAll(caseSensitive, specialCasings[specialCasingCount].lowerCase+1, specialCasings[specialCasingCount].lowerCase[0]);
 453         _set_addAll(caseSensitive, specialCasings[specialCasingCount].upperCase+1, specialCasings[specialCasingCount].upperCase[0]);
 454         _set_addAll(caseSensitive, specialCasings[specialCasingCount].titleCase+1, specialCasings[specialCasingCount].titleCase[0]);
 455     }
 456
 457     if(++specialCasingCount==MAX_SPECIAL_CASING_COUNT) {
 458         fprintf(stderr, "gencase: too many special casing mappings\n");
 459         *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
 460         exit(U_INDEX_OUTOFBOUNDS_ERROR);
 461     }
 462 }
 463
 464 static int32_t U_CALLCONV
 465 compareSpecialCasings(const void *context, const void *left, const void *right) {
 466     return ((const SpecialCasing *)left)->code-((const SpecialCasing *)right)->code;
 467 }
 468
 469 static void
 470 parseSpecialCasing(const char *filename, UErrorCode *pErrorCode) {
 471     char *fields[5][2];
 472     int32_t i, j;
 473
 474     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
 475         return;
 476     }
 477
 478     u_parseDelimitedFile(filename, ';', fields, 5, specialCasingLineFn, NULL, pErrorCode);
 479
 480     /* sort the special casing entries by code point */
 481     if(specialCasingCount>0) {
 482         uprv_sortArray(specialCasings, specialCasingCount, sizeof(SpecialCasing),
 483                        compareSpecialCasings, NULL, FALSE, pErrorCode);
 484     }
 485     if(U_FAILURE(*pErrorCode)) {
 486         return;
 487     }
 488
 489     /* replace multiple entries for any code point by one "complex" one */
 490     j=0;
 491     for(i=1; i<specialCasingCount; ++i) {
 492         if(specialCasings[i-1].code==specialCasings[i].code) {
 493             /* there is a duplicate code point */
 494             specialCasings[i-1].code=0x7fffffff;    /* remove this entry in the following sorting */
 495             specialCasings[i].isComplex=TRUE;       /* make the following one complex */
 496             specialCasings[i].lowerCase[0]=0;
 497             specialCasings[i].upperCase[0]=0;
 498             specialCasings[i].titleCase[0]=0;
 499             ++j;
 500         }
 501     }
 502
 503     /* if some entries just were removed, then re-sort */
 504     if(j>0) {
 505         uprv_sortArray(specialCasings, specialCasingCount, sizeof(SpecialCasing),
 506                        compareSpecialCasings, NULL, FALSE, pErrorCode);
 507         specialCasingCount-=j;
 508     }
 509     if(U_FAILURE(*pErrorCode)) {
 510         return;
 511     }
 512
 513     /*
 514      * Add one complex mapping to caseSensitive that was filtered out above:
 515      * Greek final Sigma has a conditional mapping but not locale-sensitive,
 516      * and it is taken when lowercasing just U+03A3 alone.
 517      * 03A3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK CAPITAL LETTER SIGMA
 518      */
 519     uset_add(caseSensitive, 0x3c2);
 520 }
 521
 522 /* parser for CaseFolding.txt ----------------------------------------------- */
 523
 524 #define MAX_CASE_FOLDING_COUNT 2000
 525
 526 static CaseFolding caseFoldings[MAX_CASE_FOLDING_COUNT];
 527 static int32_t caseFoldingCount=0;
 528
 529 static void U_CALLCONV
 530 caseFoldingLineFn(void *context,
 531                   char *fields[][2], int32_t fieldCount,
 532                   UErrorCode *pErrorCode) {
 533     char *end;
 534     static UChar32 prevCode=0;
 535     int32_t count;
 536     char status;
 537
 538     /* get code point */
 539     caseFoldings[caseFoldingCount].code=(UChar32)uprv_strtoul(u_skipWhitespace(fields[0][0]), &end, 16);
 540     end=(char *)u_skipWhitespace(end);
 541     if(end<=fields[0][0] || end!=fields[0][1]) {
 542         fprintf(stderr, "gencase: syntax error in CaseFolding.txt field 0 at %s\n", fields[0][0]);
 543         *pErrorCode=U_PARSE_ERROR;
 544         exit(U_PARSE_ERROR);
 545     }
 546
 547     /* get the status of this mapping */
 548     caseFoldings[caseFoldingCount].status=status=*u_skipWhitespace(fields[1][0]);
 549     if(status!='L' && status!='E' && status!='C' && status!='S' && status!='F' && status!='I' && status!='T') {
 550         fprintf(stderr, "gencase: unrecognized status field in CaseFolding.txt at %s\n", fields[0][0]);
 551         *pErrorCode=U_PARSE_ERROR;
 552         exit(U_PARSE_ERROR);
 553     }
 554
 555     /* ignore all case folding mappings that are the same as the UnicodeData.txt lowercase mappings */
 556     if(status=='L') {
 557         return;
 558     }
 559
 560     /* get the mapping */
 561     count=caseFoldings[caseFoldingCount].full[0]=
 562         (UChar)u_parseString(fields[2][0], caseFoldings[caseFoldingCount].full+1, 31, (uint32_t *)&caseFoldings[caseFoldingCount].simple, pErrorCode);
 563     if(U_FAILURE(*pErrorCode)) {
 564         fprintf(stderr, "gencase: error parsing CaseFolding.txt mapping at %s\n", fields[0][0]);
 565         exit(*pErrorCode);
 566     }
 567
 568     /* there is a simple mapping only if there is exactly one code point (count is in UChars) */
 569     if(count==0 || count>2 || (count==2 && UTF_IS_SINGLE(caseFoldings[caseFoldingCount].full[1]))) {
 570         caseFoldings[caseFoldingCount].simple=0;
 571     }
 572
 573     /* update the case-sensitive set */
 574     if(status!='T') {
 575         uset_add(caseSensitive, (UChar32)caseFoldings[caseFoldingCount].code);
 576         _set_addAll(caseSensitive, caseFoldings[caseFoldingCount].full+1, caseFoldings[caseFoldingCount].full[0]);
 577     }
 578
 579     /* check the status */
 580     if(status=='S') {
 581         /* check if there was a full mapping for this code point before */
 582         if( caseFoldingCount>0 &&
 583             caseFoldings[caseFoldingCount-1].code==caseFoldings[caseFoldingCount].code &&
 584             caseFoldings[caseFoldingCount-1].status=='F'
 585         ) {
 586             /* merge the two entries */
 587             caseFoldings[caseFoldingCount-1].simple=caseFoldings[caseFoldingCount].simple;
 588             return;
 589         }
 590     } else if(status=='F') {
 591         /* check if there was a simple mapping for this code point before */
 592         if( caseFoldingCount>0 &&
 593             caseFoldings[caseFoldingCount-1].code==caseFoldings[caseFoldingCount].code &&
 594             caseFoldings[caseFoldingCount-1].status=='S'
 595         ) {
 596             /* merge the two entries */
 597             uprv_memcpy(caseFoldings[caseFoldingCount-1].full, caseFoldings[caseFoldingCount].full, 32*U_SIZEOF_UCHAR);
 598             return;
 599         }
 600     } else if(status=='I' || status=='T') {
 601         /* check if there was a default mapping for this code point before (remove it) */
 602         while(caseFoldingCount>0 &&
 603               caseFoldings[caseFoldingCount-1].code==caseFoldings[caseFoldingCount].code
 604         ) {
 605             prevCode=0;
 606             --caseFoldingCount;
 607         }
 608         /* store only a marker for special handling for cases like dotless i */
 609         caseFoldings[caseFoldingCount].simple=0;
 610         caseFoldings[caseFoldingCount].full[0]=0;
 611     }
 612
 613     /* check that the code points (caseFoldings[caseFoldingCount].code) are in ascending order */
 614     if(caseFoldings[caseFoldingCount].code<=prevCode && caseFoldings[caseFoldingCount].code>0) {
 615         fprintf(stderr, "gencase: error - CaseFolding entries out of order, U+%04lx after U+%04lx\n",
 616                 (unsigned long)caseFoldings[caseFoldingCount].code,
 617                 (unsigned long)prevCode);
 618         *pErrorCode=U_PARSE_ERROR;
 619         exit(U_PARSE_ERROR);
 620     }
 621     prevCode=caseFoldings[caseFoldingCount].code;
 622
 623     if(++caseFoldingCount==MAX_CASE_FOLDING_COUNT) {
 624         fprintf(stderr, "gencase: too many case folding mappings\n");
 625         *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
 626         exit(U_INDEX_OUTOFBOUNDS_ERROR);
 627     }
 628 }
 629
 630 static void
 631 parseCaseFolding(const char *filename, UErrorCode *pErrorCode) {
 632     char *fields[3][2];
 633
 634     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
 635         return;
 636     }
 637
 638     u_parseDelimitedFile(filename, ';', fields, 3, caseFoldingLineFn, NULL, pErrorCode);
 639 }
 640
 641 /* parser for UnicodeData.txt ----------------------------------------------- */
 642
 643 /* general categories */
 644 const char *const
 645 genCategoryNames[U_CHAR_CATEGORY_COUNT]={
 646     "Cn",
 647     "Lu", "Ll", "Lt", "Lm", "Lo", "Mn", "Me",
 648     "Mc", "Nd", "Nl", "No",
 649     "Zs", "Zl", "Zp",
 650     "Cc", "Cf", "Co", "Cs",
 651     "Pd", "Ps", "Pe", "Pc", "Po",
 652     "Sm", "Sc", "Sk", "So",
 653     "Pi", "Pf"
 654 };
 655
 656 static int32_t specialCasingIndex=0, caseFoldingIndex=0;
 657
 658 static void U_CALLCONV
 659 unicodeDataLineFn(void *context,
 660                   char *fields[][2], int32_t fieldCount,
 661                   UErrorCode *pErrorCode) {
 662     Props p;
 663     char *end;
 664     static UChar32 prevCode=0;
 665     UChar32 value;
 666     int32_t i;
 667
 668     /* reset the properties */
 669     uprv_memset(&p, 0, sizeof(Props));
 670
 671     /* get the character code, field 0 */
 672     p.code=(UChar32)uprv_strtoul(fields[0][0], &end, 16);
 673     if(end<=fields[0][0] || end!=fields[0][1]) {
 674         fprintf(stderr, "gencase: syntax error in field 0 at %s\n", fields[0][0]);
 675         *pErrorCode=U_PARSE_ERROR;
 676         exit(U_PARSE_ERROR);
 677     }
 678
 679     /* get general category, field 2 */
 680     i=getTokenIndex(genCategoryNames, U_CHAR_CATEGORY_COUNT, fields[2][0]);
 681     if(i>=0) {
 682         p.gc=(uint8_t)i;
 683     } else {
 684         fprintf(stderr, "gencase: unknown general category \"%s\" at code 0x%lx\n",
 685             fields[2][0], (unsigned long)p.code);
 686         *pErrorCode=U_PARSE_ERROR;
 687         exit(U_PARSE_ERROR);
 688     }
 689
 690     /* get canonical combining class, field 3 */
 691     value=(UChar32)uprv_strtoul(fields[3][0], &end, 10);
 692     if(end<=fields[3][0] || end!=fields[3][1] || value>0xff) {
 693         fprintf(stderr, "gencase: syntax error in field 3 at %s\n", fields[0][0]);
 694         *pErrorCode=U_PARSE_ERROR;
 695         exit(U_PARSE_ERROR);
 696     }
 697     p.cc=(uint8_t)value;
 698
 699     /* get uppercase mapping, field 12 */
 700     value=(UChar32)uprv_strtoul(fields[12][0], &end, 16);
 701     if(end!=fields[12][1]) {
 702         fprintf(stderr, "gencase: syntax error in field 12 at code 0x%lx\n",
 703             (unsigned long)p.code);
 704         *pErrorCode=U_PARSE_ERROR;
 705         exit(U_PARSE_ERROR);
 706     }
 707     if(value!=0 && value!=p.code) {
 708         p.upperCase=value;
 709         uset_add(caseSensitive, p.code);
 710         uset_add(caseSensitive, value);
 711     }
 712
 713     /* get lowercase value, field 13 */
 714     value=(UChar32)uprv_strtoul(fields[13][0], &end, 16);
 715     if(end!=fields[13][1]) {
 716         fprintf(stderr, "gencase: syntax error in field 13 at code 0x%lx\n",
 717             (unsigned long)p.code);
 718         *pErrorCode=U_PARSE_ERROR;
 719         exit(U_PARSE_ERROR);
 720     }
 721     if(value!=0 && value!=p.code) {
 722         p.lowerCase=value;
 723         uset_add(caseSensitive, p.code);
 724         uset_add(caseSensitive, value);
 725     }
 726
 727     /* get titlecase value, field 14 */
 728     value=(UChar32)uprv_strtoul(fields[14][0], &end, 16);
 729     if(end!=fields[14][1]) {
 730         fprintf(stderr, "gencase: syntax error in field 14 at code 0x%lx\n",
 731             (unsigned long)p.code);
 732         *pErrorCode=U_PARSE_ERROR;
 733         exit(U_PARSE_ERROR);
 734     }
 735     if(value!=0 && value!=p.code) {
 736         p.titleCase=value;
 737         uset_add(caseSensitive, p.code);
 738         uset_add(caseSensitive, value);
 739     }
 740
 741     /* set additional properties from previously parsed files */
 742     if(specialCasingIndex<specialCasingCount && p.code==specialCasings[specialCasingIndex].code) {
 743         p.specialCasing=specialCasings+specialCasingIndex++;
 744     } else {
 745         p.specialCasing=NULL;
 746     }
 747     if(caseFoldingIndex<caseFoldingCount && p.code==caseFoldings[caseFoldingIndex].code) {
 748         p.caseFolding=caseFoldings+caseFoldingIndex++;
 749
 750         /* ignore "Common" mappings (simple==full) that map to the same code point as the regular lowercase mapping */
 751         if( p.caseFolding->status=='C' &&
 752             p.caseFolding->simple==p.lowerCase
 753         ) {
 754             p.caseFolding=NULL;
 755         }
 756     } else {
 757         p.caseFolding=NULL;
 758     }
 759
 760     /* check for non-character code points */
 761     if((p.code&0xfffe)==0xfffe || (uint32_t)(p.code-0xfdd0)<0x20) {
 762         fprintf(stderr, "gencase: error - properties for non-character code point U+%04lx\n",
 763                 (unsigned long)p.code);
 764         *pErrorCode=U_PARSE_ERROR;
 765         exit(U_PARSE_ERROR);
 766     }
 767
 768     /* check that the code points (p.code) are in ascending order */
 769     if(p.code<=prevCode && p.code>0) {
 770         fprintf(stderr, "gencase: error - UnicodeData entries out of order, U+%04lx after U+%04lx\n",
 771                 (unsigned long)p.code, (unsigned long)prevCode);
 772         *pErrorCode=U_PARSE_ERROR;
 773         exit(U_PARSE_ERROR);
 774     }
 775
 776     /* properties for a single code point */
 777     setProps(&p);
 778
 779     prevCode=p.code;
 780 }
 781
 782 static void
 783 parseDB(const char *filename, UErrorCode *pErrorCode) {
 784     char *fields[15][2];
 785     UChar32 start, end;
 786     int32_t i;
 787
 788     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
 789         return;
 790     }
 791
 792     u_parseDelimitedFile(filename, ';', fields, 15, unicodeDataLineFn, NULL, pErrorCode);
 793
 794     /* are all sub-properties consumed? */
 795     if(specialCasingIndex<specialCasingCount) {
 796         fprintf(stderr, "gencase: error - some code points in SpecialCasing.txt are missing from UnicodeData.txt\n");
 797         *pErrorCode=U_PARSE_ERROR;
 798         exit(U_PARSE_ERROR);
 799     }
 800     if(caseFoldingIndex<caseFoldingCount) {
 801         fprintf(stderr, "gencase: error - some code points in CaseFolding.txt are missing from UnicodeData.txt\n");
 802         *pErrorCode=U_PARSE_ERROR;
 803         exit(U_PARSE_ERROR);
 804     }
 805
 806     if(U_FAILURE(*pErrorCode)) {
 807         return;
 808     }
 809
 810     for(i=0;
 811         0==uset_getItem(caseSensitive, i, &start, &end, NULL, 0, pErrorCode) && U_SUCCESS(*pErrorCode);
 812         ++i
 813     ) {
 814         addCaseSensitive(start, end);
 815     }
 816     if(*pErrorCode==U_INDEX_OUTOFBOUNDS_ERROR) {
 817         *pErrorCode=U_ZERO_ERROR;
 818     }
 819 }
 820
 821 /*
 822  * Hey, Emacs, please set the following:
 823  *
 824  * Local Variables:
 825  * indent-tabs-mode: nil
 826  * End:
 827  *
 828  */