icuSources/tools/gencase/gencase.c

   1 /*
   2 *******************************************************************************
   3 *
   4 *   Copyright (C) 2004-2008, International Business Machines
   5 *   Corporation and others.  All Rights Reserved.
   6 *
   7 *******************************************************************************
   8 *   file name:  gencase.c
   9 *   encoding:   US-ASCII
  10 *   tab size:   8 (not used)
  11 *   indentation:4
  12 *
  13 *   created on: 2004aug28
  14 *   created by: Markus W. Scherer
  15 *
  16 *   This program reads several of the Unicode character database text files,
   17 *   parses them, and extracts the case mapping properties for each character.
  18 *   It then writes a binary file containing the properties
  19 *   that is designed to be used directly for random-access to
  20 *   the properties of each Unicode character.
  21 */
  22
  23 #include <stdio.h>
  24 #include "unicode/utypes.h"
  25 #include "unicode/uchar.h"
  26 #include "unicode/uset.h"
  27 #include "unicode/putil.h"
  28 #include "unicode/uclean.h"
  29 #include "cmemory.h"
  30 #include "cstring.h"
  31 #include "uarrsort.h"
  32 #include "unewdata.h"
  33 #include "uoptions.h"
  34 #include "uparse.h"
  35 #include "uprops.h"
  36 #include "propsvec.h"
  37 #include "gencase.h"
  38
  39 #define LENGTHOF(array) (sizeof(array)/sizeof((array)[0]))
  40
  41 /* data --------------------------------------------------------------------- */
  42
/*
 * Properties vectors handle: opened in main() with upvec_open() and
 * filled by binariesLineFn() through upvec_setValue().
 * Not static - presumably shared with the other gencase source files;
 * confirm against gencase.h.
 */
uint32_t *pv;

/*
 * Command line flags. The initializers are only defaults: once the
 * options have been parsed, main() overwrites both from the
 * --verbose and --copyright options.
 */
UBool beVerbose=FALSE, haveCopyright=TRUE;

/*
 * Unicode set collecting the case-sensitive characters;
 * see uchar.h UCHAR_CASE_SENSITIVE.
 * The parsers in this file add code points from case mappings/foldings in
 * the root locale and with default options; parseDB() finally hands
 * every range of the set to addCaseSensitive().
 */
static USet *caseSensitive;
  54
  55 /* prototypes --------------------------------------------------------------- */
  56
/* Parses SpecialCasing.txt into specialCasings[]; defined further below. */
static void
parseSpecialCasing(const char *filename, UErrorCode *pErrorCode);

/* Parses CaseFolding.txt into caseFoldings[]; defined further below. */
static void
parseCaseFolding(const char *filename, UErrorCode *pErrorCode);

/* Parses UnicodeData.txt and calls setProps() for each line; defined further below. */
static void
parseDB(const char *filename, UErrorCode *pErrorCode);
  65
  66 /* parse files with multiple binary properties ------------------------------ */
  67
  68 /* TODO: more common code, move functions to uparse.h|c */
  69
  70 /* TODO: similar to genprops/props2.c but not the same */
  71
  72 struct Binary {
  73     const char *propName;
  74     int32_t vecWord;
  75     uint32_t vecValue, vecMask;
  76 };
  77 typedef struct Binary Binary;
  78
  79 struct Binaries {
  80     const char *ucdFile;
  81     const Binary *binaries;
  82     int32_t binariesCount;
  83 };
  84 typedef struct Binaries Binaries;
  85
/*
 * Binary properties read from PropList.txt:
 * Soft_Dotted is stored in properties vector word 0
 * as UCASE_SOFT_DOTTED under UCASE_DOT_MASK.
 */
static const Binary
propListNames[]={
    { "Soft_Dotted",                        0, UCASE_SOFT_DOTTED,   UCASE_DOT_MASK }
};

/* PropList.txt together with the properties that are read from it. */
static const Binaries
propListBinaries={
    "PropList", propListNames, LENGTHOF(propListNames)
};
  95
/*
 * Binary properties read from DerivedCoreProperties.txt:
 * Lowercase and Uppercase share the UCASE_TYPE_MASK bits of
 * properties vector word 0 and store UCASE_LOWER or UCASE_UPPER there.
 */
static const Binary
derCorePropsNames[]={
    { "Lowercase",                          0, UCASE_LOWER,         UCASE_TYPE_MASK },
    { "Uppercase",                          0, UCASE_UPPER,         UCASE_TYPE_MASK }
};

/* DerivedCoreProperties.txt together with the properties that are read from it. */
static const Binaries
derCorePropsBinaries={
    "DerivedCoreProperties", derCorePropsNames, LENGTHOF(derCorePropsNames)
};
 106
/*
 * Treat Word_Break=MidLetter and MidNumLet as a single binary property.
 * We need not distinguish between them because both add to case-ignorable.
 * We ignore all other Word_Break values.
 * Both names set the same single bit, UGENCASE_IS_MID_LETTER_SHIFT,
 * in properties vector word 1.
 */
static const Binary
wordBreakNames[]={
    { "MidLetter",                          1, U_MASK(UGENCASE_IS_MID_LETTER_SHIFT), U_MASK(UGENCASE_IS_MID_LETTER_SHIFT) },
    { "MidNumLet",                          1, U_MASK(UGENCASE_IS_MID_LETTER_SHIFT), U_MASK(UGENCASE_IS_MID_LETTER_SHIFT) }
};

/*
 * WordBreakProperty.txt together with the values that are read from it.
 * main() parses this file only if ucdVersion>=UNI_4_1.
 */
static const Binaries
wordBreakBinaries={
    "WordBreakProperty", wordBreakNames, LENGTHOF(wordBreakNames)
};
 122
 123 static void U_CALLCONV
 124 binariesLineFn(void *context,
 125                char *fields[][2], int32_t fieldCount,
 126                UErrorCode *pErrorCode) {
 127     const Binaries *bin;
 128     char *s;
 129     uint32_t start, limit;
 130     int32_t i;
 131
 132     bin=(const Binaries *)context;
 133
 134     u_parseCodePointRange(fields[0][0], &start, &limit, pErrorCode);
 135     if(U_FAILURE(*pErrorCode)) {
 136         fprintf(stderr, "gencase: syntax error in %s.txt field 0 at %s\n", bin->ucdFile, fields[0][0]);
 137         exit(*pErrorCode);
 138     }
 139     ++limit;
 140
 141     /* parse binary property name */
 142     s=(char *)u_skipWhitespace(fields[1][0]);
 143     for(i=0;; ++i) {
 144         if(i==bin->binariesCount) {
 145             /* ignore unrecognized properties */
 146             return;
 147         }
 148         if(isToken(bin->binaries[i].propName, s)) {
 149             break;
 150         }
 151     }
 152
 153     if(bin->binaries[i].vecMask==0) {
 154         fprintf(stderr, "gencase error: mask value %d==0 for %s %s\n",
 155                         (int)bin->binaries[i].vecMask, bin->ucdFile, bin->binaries[i].propName);
 156         exit(U_INTERNAL_PROGRAM_ERROR);
 157     }
 158
 159     if(!upvec_setValue(pv, start, limit, bin->binaries[i].vecWord, bin->binaries[i].vecValue, bin->binaries[i].vecMask, pErrorCode)) {
 160         fprintf(stderr, "gencase error: unable to set %s, code: %s\n",
 161                         bin->binaries[i].propName, u_errorName(*pErrorCode));
 162         exit(*pErrorCode);
 163     }
 164 }
 165
 166 static void
 167 parseBinariesFile(char *filename, char *basename, const char *suffix,
 168                   const Binaries *bin,
 169                   UErrorCode *pErrorCode) {
 170     char *fields[2][2];
 171
 172     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
 173         return;
 174     }
 175
 176     writeUCDFilename(basename, bin->ucdFile, suffix);
 177
 178     u_parseDelimitedFile(filename, ';', fields, 2, binariesLineFn, (void *)bin, pErrorCode);
 179     if(U_FAILURE(*pErrorCode)) {
 180         fprintf(stderr, "error parsing %s.txt: %s\n", bin->ucdFile, u_errorName(*pErrorCode));
 181     }
 182 }
 183
 184 /* -------------------------------------------------------------------------- */
 185
/*
 * Indexes into options[] below; main() uses them to read the
 * parsed command line options.
 */
enum
{
    HELP_H,
    HELP_QUESTION_MARK,
    VERBOSE,
    COPYRIGHT,
    DESTDIR,
    SOURCEDIR,
    UNICODE_VERSION,
    ICUDATADIR,
    CSOURCE
};

/*
 * Command line option definitions, passed to u_parseArgs() in main().
 * Keep these values in sync with the above enums:
 * each entry must sit at the index named by its enum constant.
 */
static UOption options[]={
    UOPTION_HELP_H,
    UOPTION_HELP_QUESTION_MARK,
    UOPTION_VERBOSE,
    UOPTION_COPYRIGHT,
    UOPTION_DESTDIR,
    UOPTION_SOURCEDIR,
    UOPTION_DEF("unicode", 'u', UOPT_REQUIRES_ARG),     /* UNICODE_VERSION */
    UOPTION_ICUDATADIR,
    UOPTION_DEF("csource", 'C', UOPT_NO_ARG)            /* CSOURCE */
};
 211
 212 extern int
 213 main(int argc, char* argv[]) {
 214     char filename[300];
 215     const char *srcDir=NULL, *destDir=NULL, *suffix=NULL;
 216     char *basename=NULL;
 217     UErrorCode errorCode=U_ZERO_ERROR;
 218
 219     U_MAIN_INIT_ARGS(argc, argv);
 220
 221     /* preset then read command line options */
 222     options[DESTDIR].value=u_getDataDirectory();
 223     options[SOURCEDIR].value="";
 224     options[UNICODE_VERSION].value="";
 225     options[ICUDATADIR].value=u_getDataDirectory();
 226     argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), options);
 227
 228     /* error handling, printing usage message */
 229     if(argc<0) {
 230         fprintf(stderr,
 231             "error in command line argument \"%s\"\n",
 232             argv[-argc]);
 233     }
 234     if(argc<0 || options[HELP_H].doesOccur || options[HELP_QUESTION_MARK].doesOccur) {
 235         /*
 236          * Broken into chucks because the C89 standard says the minimum
 237          * required supported string length is 509 bytes.
 238          */
 239         fprintf(stderr,
 240             "Usage: %s [-options] [suffix]\n"
 241             "\n"
 242             "read the UnicodeData.txt file and other Unicode properties files and\n"
 243             "create a binary file " UCASE_DATA_NAME "." UCASE_DATA_TYPE " with the case mapping properties\n"
 244             "\n",
 245             argv[0]);
 246         fprintf(stderr,
 247             "Options:\n"
 248             "\t-h or -? or --help  this usage text\n"
 249             "\t-v or --verbose     verbose output\n"
 250             "\t-c or --copyright   include a copyright notice\n"
 251             "\t-u or --unicode     Unicode version, followed by the version like 3.0.0\n"
 252             "\t-C or --csource     generate a .c source file rather than the .icu binary\n");
 253         fprintf(stderr,
 254             "\t-d or --destdir     destination directory, followed by the path\n"
 255             "\t-s or --sourcedir   source directory, followed by the path\n"
 256             "\t-i or --icudatadir  directory for locating any needed intermediate data files,\n"
 257             "\t                    followed by path, defaults to %s\n"
 258             "\tsuffix              suffix that is to be appended with a '-'\n"
 259             "\t                    to the source file basenames before opening;\n"
 260             "\t                    'gencase new' will read UnicodeData-new.txt etc.\n",
 261             u_getDataDirectory());
 262         return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR;
 263     }
 264
 265     /* get the options values */
 266     beVerbose=options[VERBOSE].doesOccur;
 267     haveCopyright=options[COPYRIGHT].doesOccur;
 268     srcDir=options[SOURCEDIR].value;
 269     destDir=options[DESTDIR].value;
 270
 271     if(argc>=2) {
 272         suffix=argv[1];
 273     } else {
 274         suffix=NULL;
 275     }
 276
 277     if(options[UNICODE_VERSION].doesOccur) {
 278         setUnicodeVersion(options[UNICODE_VERSION].value);
 279     }
 280     /* else use the default dataVersion in store.c */
 281
 282     if (options[ICUDATADIR].doesOccur) {
 283         u_setDataDirectory(options[ICUDATADIR].value);
 284     }
 285
 286     /* prepare the filename beginning with the source dir */
 287     uprv_strcpy(filename, srcDir);
 288     basename=filename+uprv_strlen(filename);
 289     if(basename>filename && *(basename-1)!=U_FILE_SEP_CHAR) {
 290         *basename++=U_FILE_SEP_CHAR;
 291     }
 292
 293     /* initialize */
 294     pv=upvec_open(2, 10000);
 295     caseSensitive=uset_open(1, 0); /* empty set (start>end) */
 296
 297     /* process SpecialCasing.txt */
 298     writeUCDFilename(basename, "SpecialCasing", suffix);
 299     parseSpecialCasing(filename, &errorCode);
 300
 301     /* process CaseFolding.txt */
 302     writeUCDFilename(basename, "CaseFolding", suffix);
 303     parseCaseFolding(filename, &errorCode);
 304
 305     /* process additional properties files */
 306     *basename=0;
 307
 308     parseBinariesFile(filename, basename, suffix, &propListBinaries, &errorCode);
 309
 310     parseBinariesFile(filename, basename, suffix, &derCorePropsBinaries, &errorCode);
 311
 312     if(ucdVersion>=UNI_4_1) {
 313         parseBinariesFile(filename, basename, suffix, &wordBreakBinaries, &errorCode);
 314     }
 315
 316     /* process UnicodeData.txt */
 317     writeUCDFilename(basename, "UnicodeData", suffix);
 318     parseDB(filename, &errorCode);
 319
 320     /* process parsed data */
 321     makeCaseClosure();
 322
 323     makeExceptions();
 324
 325     if(U_SUCCESS(errorCode)) {
 326         /* write the properties data file */
 327         generateData(destDir, options[CSOURCE].doesOccur);
 328     }
 329
 330     u_cleanup();
 331     return errorCode;
 332 }
 333
 334 U_CFUNC void
 335 writeUCDFilename(char *basename, const char *filename, const char *suffix) {
 336     int32_t length=(int32_t)uprv_strlen(filename);
 337     uprv_strcpy(basename, filename);
 338     if(suffix!=NULL) {
 339         basename[length++]='-';
 340         uprv_strcpy(basename+length, suffix);
 341         length+=(int32_t)uprv_strlen(suffix);
 342     }
 343     uprv_strcpy(basename+length, ".txt");
 344 }
 345
 346 /* TODO: move to toolutil */
 347 U_CFUNC UBool
 348 isToken(const char *token, const char *s) {
 349     const char *z;
 350     int32_t j;
 351
 352     s=u_skipWhitespace(s);
 353     for(j=0;; ++j) {
 354         if(token[j]!=0) {
 355             if(s[j]!=token[j]) {
 356                 break;
 357             }
 358         } else {
 359             z=u_skipWhitespace(s+j);
 360             if(*z==';' || *z==0) {
 361                 return TRUE;
 362             } else {
 363                 break;
 364             }
 365         }
 366     }
 367
 368     return FALSE;
 369 }
 370
 371 static int32_t
 372 getTokenIndex(const char *const tokens[], int32_t countTokens, const char *s) {
 373     const char *t, *z;
 374     int32_t i, j;
 375
 376     s=u_skipWhitespace(s);
 377     for(i=0; i<countTokens; ++i) {
 378         t=tokens[i];
 379         if(t!=NULL) {
 380             for(j=0;; ++j) {
 381                 if(t[j]!=0) {
 382                     if(s[j]!=t[j]) {
 383                         break;
 384                     }
 385                 } else {
 386                     z=u_skipWhitespace(s+j);
 387                     if(*z==';' || *z==0 || *z=='#' || *z=='\r' || *z=='\n') {
 388                         return i;
 389                     } else {
 390                         break;
 391                     }
 392                 }
 393             }
 394         }
 395     }
 396     return -1;
 397 }
 398
 399 static void
 400 _set_addAll(USet *set, const UChar *s, int32_t length) {
 401     UChar32 c;
 402     int32_t i;
 403
 404     /* needs length>=0 */
 405     for(i=0; i<length; /* U16_NEXT advances i */) {
 406         U16_NEXT(s, i, length, c);
 407         uset_add(set, c);
 408     }
 409 }
 410
 411 /* parser for SpecialCasing.txt --------------------------------------------- */
 412
/*
 * Capacity of specialCasings[]. specialCasingLineFn() terminates the
 * program as soon as the number of entries reaches this value.
 */
#define MAX_SPECIAL_CASING_COUNT 500

/* Entries parsed from SpecialCasing.txt, and how many of them are in use. */
static SpecialCasing specialCasings[MAX_SPECIAL_CASING_COUNT];
static int32_t specialCasingCount=0;
 417
 418 static void U_CALLCONV
 419 specialCasingLineFn(void *context,
 420                     char *fields[][2], int32_t fieldCount,
 421                     UErrorCode *pErrorCode) {
 422     char *end;
 423
 424     /* get code point */
 425     specialCasings[specialCasingCount].code=(UChar32)uprv_strtoul(u_skipWhitespace(fields[0][0]), &end, 16);
 426     end=(char *)u_skipWhitespace(end);
 427     if(end<=fields[0][0] || end!=fields[0][1]) {
 428         fprintf(stderr, "gencase: syntax error in SpecialCasing.txt field 0 at %s\n", fields[0][0]);
 429         *pErrorCode=U_PARSE_ERROR;
 430         exit(U_PARSE_ERROR);
 431     }
 432
 433     /* is this a complex mapping? */
 434     if(*(end=(char *)u_skipWhitespace(fields[4][0]))!=0 && *end!=';' && *end!='#') {
 435         /* there is some condition text in the fifth field */
 436         specialCasings[specialCasingCount].isComplex=TRUE;
 437
 438         /* do not store any actual mappings for this */
 439         specialCasings[specialCasingCount].lowerCase[0]=0;
 440         specialCasings[specialCasingCount].upperCase[0]=0;
 441         specialCasings[specialCasingCount].titleCase[0]=0;
 442     } else {
 443         /* just set the "complex" flag and get the case mappings */
 444         specialCasings[specialCasingCount].isComplex=FALSE;
 445         specialCasings[specialCasingCount].lowerCase[0]=
 446             (UChar)u_parseString(fields[1][0], specialCasings[specialCasingCount].lowerCase+1, 31, NULL, pErrorCode);
 447         specialCasings[specialCasingCount].upperCase[0]=
 448             (UChar)u_parseString(fields[3][0], specialCasings[specialCasingCount].upperCase+1, 31, NULL, pErrorCode);
 449         specialCasings[specialCasingCount].titleCase[0]=
 450             (UChar)u_parseString(fields[2][0], specialCasings[specialCasingCount].titleCase+1, 31, NULL, pErrorCode);
 451         if(U_FAILURE(*pErrorCode)) {
 452             fprintf(stderr, "gencase: error parsing special casing at %s\n", fields[0][0]);
 453             exit(*pErrorCode);
 454         }
 455
 456         uset_add(caseSensitive, (UChar32)specialCasings[specialCasingCount].code);
 457         _set_addAll(caseSensitive, specialCasings[specialCasingCount].lowerCase+1, specialCasings[specialCasingCount].lowerCase[0]);
 458         _set_addAll(caseSensitive, specialCasings[specialCasingCount].upperCase+1, specialCasings[specialCasingCount].upperCase[0]);
 459         _set_addAll(caseSensitive, specialCasings[specialCasingCount].titleCase+1, specialCasings[specialCasingCount].titleCase[0]);
 460     }
 461
 462     if(++specialCasingCount==MAX_SPECIAL_CASING_COUNT) {
 463         fprintf(stderr, "gencase: too many special casing mappings\n");
 464         *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
 465         exit(U_INDEX_OUTOFBOUNDS_ERROR);
 466     }
 467 }
 468
 469 static int32_t U_CALLCONV
 470 compareSpecialCasings(const void *context, const void *left, const void *right) {
 471     return ((const SpecialCasing *)left)->code-((const SpecialCasing *)right)->code;
 472 }
 473
 474 static void
 475 parseSpecialCasing(const char *filename, UErrorCode *pErrorCode) {
 476     char *fields[5][2];
 477     int32_t i, j;
 478
 479     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
 480         return;
 481     }
 482
 483     u_parseDelimitedFile(filename, ';', fields, 5, specialCasingLineFn, NULL, pErrorCode);
 484
 485     /* sort the special casing entries by code point */
 486     if(specialCasingCount>0) {
 487         uprv_sortArray(specialCasings, specialCasingCount, sizeof(SpecialCasing),
 488                        compareSpecialCasings, NULL, FALSE, pErrorCode);
 489     }
 490     if(U_FAILURE(*pErrorCode)) {
 491         return;
 492     }
 493
 494     /* replace multiple entries for any code point by one "complex" one */
 495     j=0;
 496     for(i=1; i<specialCasingCount; ++i) {
 497         if(specialCasings[i-1].code==specialCasings[i].code) {
 498             /* there is a duplicate code point */
 499             specialCasings[i-1].code=0x7fffffff;    /* remove this entry in the following sorting */
 500             specialCasings[i].isComplex=TRUE;       /* make the following one complex */
 501             specialCasings[i].lowerCase[0]=0;
 502             specialCasings[i].upperCase[0]=0;
 503             specialCasings[i].titleCase[0]=0;
 504             ++j;
 505         }
 506     }
 507
 508     /* if some entries just were removed, then re-sort */
 509     if(j>0) {
 510         uprv_sortArray(specialCasings, specialCasingCount, sizeof(SpecialCasing),
 511                        compareSpecialCasings, NULL, FALSE, pErrorCode);
 512         specialCasingCount-=j;
 513     }
 514     if(U_FAILURE(*pErrorCode)) {
 515         return;
 516     }
 517
 518     /*
 519      * Add one complex mapping to caseSensitive that was filtered out above:
 520      * Greek final Sigma has a conditional mapping but not locale-sensitive,
 521      * and it is taken when lowercasing just U+03A3 alone.
 522      * 03A3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK CAPITAL LETTER SIGMA
 523      */
 524     uset_add(caseSensitive, 0x3c2);
 525 }
 526
 527 /* parser for CaseFolding.txt ----------------------------------------------- */
 528
/*
 * Capacity of caseFoldings[]. caseFoldingLineFn() terminates the
 * program as soon as the number of entries reaches this value.
 */
#define MAX_CASE_FOLDING_COUNT 2000

/* Entries parsed from CaseFolding.txt, and how many of them are in use. */
static CaseFolding caseFoldings[MAX_CASE_FOLDING_COUNT];
static int32_t caseFoldingCount=0;
 533
/*
 * u_parseDelimitedFile() callback for CaseFolding.txt.
 *
 * Field 0 is the code point, field 1 the status letter, field 2 the mapping.
 * The line is parsed directly into the next free slot,
 * caseFoldings[caseFoldingCount]; the slot is only kept (the count
 * incremented) if the function reaches its end:
 * - status 'L' lines are dropped
 * - an 'S' line directly after an 'F' line for the same code point, or
 *   an 'F' line directly after an 'S' line, is merged into that previous entry
 * - an 'I' or 'T' line replaces all directly preceding entries for the
 *   same code point with a marker entry that has no mappings
 *
 * Code points must be in ascending order; this is tracked across calls
 * in the static prevCode. Any error terminates the program.
 */
static void U_CALLCONV
caseFoldingLineFn(void *context,
                  char *fields[][2], int32_t fieldCount,
                  UErrorCode *pErrorCode) {
    char *end;
    static UChar32 prevCode=0;
    int32_t count;
    char status;

    /* get code point */
    caseFoldings[caseFoldingCount].code=(UChar32)uprv_strtoul(u_skipWhitespace(fields[0][0]), &end, 16);
    end=(char *)u_skipWhitespace(end);
    if(end<=fields[0][0] || end!=fields[0][1]) {
        fprintf(stderr, "gencase: syntax error in CaseFolding.txt field 0 at %s\n", fields[0][0]);
        *pErrorCode=U_PARSE_ERROR;
        exit(U_PARSE_ERROR);
    }

    /* get the status of this mapping */
    caseFoldings[caseFoldingCount].status=status=*u_skipWhitespace(fields[1][0]);
    if(status!='L' && status!='E' && status!='C' && status!='S' && status!='F' && status!='I' && status!='T') {
        fprintf(stderr, "gencase: unrecognized status field in CaseFolding.txt at %s\n", fields[0][0]);
        *pErrorCode=U_PARSE_ERROR;
        exit(U_PARSE_ERROR);
    }

    /* ignore all case folding mappings that are the same as the UnicodeData.txt lowercase mappings */
    if(status=='L') {
        return;
    }

    /* get the mapping: full[0] receives the length, simple the first code point */
    count=caseFoldings[caseFoldingCount].full[0]=
        (UChar)u_parseString(fields[2][0], caseFoldings[caseFoldingCount].full+1, 31, (uint32_t *)&caseFoldings[caseFoldingCount].simple, pErrorCode);
    if(U_FAILURE(*pErrorCode)) {
        fprintf(stderr, "gencase: error parsing CaseFolding.txt mapping at %s\n", fields[0][0]);
        exit(*pErrorCode);
    }

    /* there is a simple mapping only if there is exactly one code point (count is in UChars) */
    if(count==0 || count>2 || (count==2 && UTF_IS_SINGLE(caseFoldings[caseFoldingCount].full[1]))) {
        caseFoldings[caseFoldingCount].simple=0;
    }

    /* update the case-sensitive set */
    if(status!='T') {
        uset_add(caseSensitive, (UChar32)caseFoldings[caseFoldingCount].code);
        _set_addAll(caseSensitive, caseFoldings[caseFoldingCount].full+1, caseFoldings[caseFoldingCount].full[0]);
    }

    /* check the status */
    if(status=='S') {
        /* check if there was a full mapping for this code point before */
        if( caseFoldingCount>0 &&
            caseFoldings[caseFoldingCount-1].code==caseFoldings[caseFoldingCount].code &&
            caseFoldings[caseFoldingCount-1].status=='F'
        ) {
            /* merge the two entries */
            caseFoldings[caseFoldingCount-1].simple=caseFoldings[caseFoldingCount].simple;
            return;
        }
    } else if(status=='F') {
        /* check if there was a simple mapping for this code point before */
        if( caseFoldingCount>0 &&
            caseFoldings[caseFoldingCount-1].code==caseFoldings[caseFoldingCount].code &&
            caseFoldings[caseFoldingCount-1].status=='S'
        ) {
            /* merge the two entries */
            uprv_memcpy(caseFoldings[caseFoldingCount-1].full, caseFoldings[caseFoldingCount].full, 32*U_SIZEOF_UCHAR);
            return;
        }
    } else if(status=='I' || status=='T') {
        /* check if there was a default mapping for this code point before (remove it) */
        while(caseFoldingCount>0 &&
              caseFoldings[caseFoldingCount-1].code==caseFoldings[caseFoldingCount].code
        ) {
            /* the same code point is stored again below: do not let the order check reject it */
            prevCode=0;
            --caseFoldingCount;
        }
        /*
         * store only a marker for special handling for cases like dotless i
         * NOTE(review): if entries were removed above, the slot that is used
         * now keeps the status letter of the removed entry, not 'I'/'T' -
         * confirm that the consumers of caseFoldings[] expect this.
         */
        caseFoldings[caseFoldingCount].simple=0;
        caseFoldings[caseFoldingCount].full[0]=0;
    }

    /* check that the code points (caseFoldings[caseFoldingCount].code) are in ascending order */
    if(caseFoldings[caseFoldingCount].code<=prevCode && caseFoldings[caseFoldingCount].code>0) {
        fprintf(stderr, "gencase: error - CaseFolding entries out of order, U+%04lx after U+%04lx\n",
                (unsigned long)caseFoldings[caseFoldingCount].code,
                (unsigned long)prevCode);
        *pErrorCode=U_PARSE_ERROR;
        exit(U_PARSE_ERROR);
    }
    prevCode=caseFoldings[caseFoldingCount].code;

    /* keep the entry; checking after the increment keeps the next slot inside the array */
    if(++caseFoldingCount==MAX_CASE_FOLDING_COUNT) {
        fprintf(stderr, "gencase: too many case folding mappings\n");
        *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
        exit(U_INDEX_OUTOFBOUNDS_ERROR);
    }
}
 634
 635 static void
 636 parseCaseFolding(const char *filename, UErrorCode *pErrorCode) {
 637     char *fields[3][2];
 638
 639     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
 640         return;
 641     }
 642
 643     u_parseDelimitedFile(filename, ';', fields, 3, caseFoldingLineFn, NULL, pErrorCode);
 644 }
 645
 646 /* parser for UnicodeData.txt ----------------------------------------------- */
 647
 648 /* general categories */
/*
 * Two-letter general category names as used in field 2 of UnicodeData.txt.
 * unicodeDataLineFn() stores the index of the matching name as Props.gc.
 * NOTE(review): the order presumably follows the UCharCategory enum in
 * uchar.h - confirm before reordering or inserting entries.
 */
const char *const
genCategoryNames[U_CHAR_CATEGORY_COUNT]={
    "Cn",
    "Lu", "Ll", "Lt", "Lm", "Lo", "Mn", "Me",
    "Mc", "Nd", "Nl", "No",
    "Zs", "Zl", "Zp",
    "Cc", "Cf", "Co", "Cs",
    "Pd", "Ps", "Pe", "Pc", "Po",
    "Sm", "Sc", "Sk", "So",
    "Pi", "Pf"
};
 660
/*
 * Indexes of the next specialCasings[] and caseFoldings[] entries that
 * unicodeDataLineFn() has not yet matched with a UnicodeData.txt line.
 * parseDB() checks at the end that both have reached their counts.
 */
static int32_t specialCasingIndex=0, caseFoldingIndex=0;
 662
 663 static void U_CALLCONV
 664 unicodeDataLineFn(void *context,
 665                   char *fields[][2], int32_t fieldCount,
 666                   UErrorCode *pErrorCode) {
 667     Props p;
 668     char *end;
 669     static UChar32 prevCode=0;
 670     UChar32 value;
 671     int32_t i;
 672
 673     /* reset the properties */
 674     uprv_memset(&p, 0, sizeof(Props));
 675
 676     /* get the character code, field 0 */
 677     p.code=(UChar32)uprv_strtoul(fields[0][0], &end, 16);
 678     if(end<=fields[0][0] || end!=fields[0][1]) {
 679         fprintf(stderr, "gencase: syntax error in field 0 at %s\n", fields[0][0]);
 680         *pErrorCode=U_PARSE_ERROR;
 681         exit(U_PARSE_ERROR);
 682     }
 683
 684     /* get general category, field 2 */
 685     i=getTokenIndex(genCategoryNames, U_CHAR_CATEGORY_COUNT, fields[2][0]);
 686     if(i>=0) {
 687         p.gc=(uint8_t)i;
 688     } else {
 689         fprintf(stderr, "gencase: unknown general category \"%s\" at code 0x%lx\n",
 690             fields[2][0], (unsigned long)p.code);
 691         *pErrorCode=U_PARSE_ERROR;
 692         exit(U_PARSE_ERROR);
 693     }
 694
 695     /* get canonical combining class, field 3 */
 696     value=(UChar32)uprv_strtoul(fields[3][0], &end, 10);
 697     if(end<=fields[3][0] || end!=fields[3][1] || value>0xff) {
 698         fprintf(stderr, "gencase: syntax error in field 3 at %s\n", fields[0][0]);
 699         *pErrorCode=U_PARSE_ERROR;
 700         exit(U_PARSE_ERROR);
 701     }
 702     p.cc=(uint8_t)value;
 703
 704     /* get uppercase mapping, field 12 */
 705     value=(UChar32)uprv_strtoul(fields[12][0], &end, 16);
 706     if(end!=fields[12][1]) {
 707         fprintf(stderr, "gencase: syntax error in field 12 at code 0x%lx\n",
 708             (unsigned long)p.code);
 709         *pErrorCode=U_PARSE_ERROR;
 710         exit(U_PARSE_ERROR);
 711     }
 712     if(value!=0 && value!=p.code) {
 713         p.upperCase=value;
 714         uset_add(caseSensitive, p.code);
 715         uset_add(caseSensitive, value);
 716     }
 717
 718     /* get lowercase value, field 13 */
 719     value=(UChar32)uprv_strtoul(fields[13][0], &end, 16);
 720     if(end!=fields[13][1]) {
 721         fprintf(stderr, "gencase: syntax error in field 13 at code 0x%lx\n",
 722             (unsigned long)p.code);
 723         *pErrorCode=U_PARSE_ERROR;
 724         exit(U_PARSE_ERROR);
 725     }
 726     if(value!=0 && value!=p.code) {
 727         p.lowerCase=value;
 728         uset_add(caseSensitive, p.code);
 729         uset_add(caseSensitive, value);
 730     }
 731
 732     /* get titlecase value, field 14 */
 733     value=(UChar32)uprv_strtoul(fields[14][0], &end, 16);
 734     if(end!=fields[14][1]) {
 735         fprintf(stderr, "gencase: syntax error in field 14 at code 0x%lx\n",
 736             (unsigned long)p.code);
 737         *pErrorCode=U_PARSE_ERROR;
 738         exit(U_PARSE_ERROR);
 739     }
 740     if(value!=0 && value!=p.code) {
 741         p.titleCase=value;
 742         uset_add(caseSensitive, p.code);
 743         uset_add(caseSensitive, value);
 744     }
 745
 746     /* set additional properties from previously parsed files */
 747     if(specialCasingIndex<specialCasingCount && p.code==specialCasings[specialCasingIndex].code) {
 748         p.specialCasing=specialCasings+specialCasingIndex++;
 749     } else {
 750         p.specialCasing=NULL;
 751     }
 752     if(caseFoldingIndex<caseFoldingCount && p.code==caseFoldings[caseFoldingIndex].code) {
 753         p.caseFolding=caseFoldings+caseFoldingIndex++;
 754
 755         /* ignore "Common" mappings (simple==full) that map to the same code point as the regular lowercase mapping */
 756         if( p.caseFolding->status=='C' &&
 757             p.caseFolding->simple==p.lowerCase
 758         ) {
 759             p.caseFolding=NULL;
 760         }
 761     } else {
 762         p.caseFolding=NULL;
 763     }
 764
 765     /* check for non-character code points */
 766     if((p.code&0xfffe)==0xfffe || (uint32_t)(p.code-0xfdd0)<0x20) {
 767         fprintf(stderr, "gencase: error - properties for non-character code point U+%04lx\n",
 768                 (unsigned long)p.code);
 769         *pErrorCode=U_PARSE_ERROR;
 770         exit(U_PARSE_ERROR);
 771     }
 772
 773     /* check that the code points (p.code) are in ascending order */
 774     if(p.code<=prevCode && p.code>0) {
 775         fprintf(stderr, "gencase: error - UnicodeData entries out of order, U+%04lx after U+%04lx\n",
 776                 (unsigned long)p.code, (unsigned long)prevCode);
 777         *pErrorCode=U_PARSE_ERROR;
 778         exit(U_PARSE_ERROR);
 779     }
 780
 781     /* properties for a single code point */
 782     setProps(&p);
 783
 784     prevCode=p.code;
 785 }
 786
 787 static void
 788 parseDB(const char *filename, UErrorCode *pErrorCode) {
 789     char *fields[15][2];
 790     UChar32 start, end;
 791     int32_t i;
 792
 793     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
 794         return;
 795     }
 796
 797     u_parseDelimitedFile(filename, ';', fields, 15, unicodeDataLineFn, NULL, pErrorCode);
 798
 799     /* are all sub-properties consumed? */
 800     if(specialCasingIndex<specialCasingCount) {
 801         fprintf(stderr, "gencase: error - some code points in SpecialCasing.txt are missing from UnicodeData.txt\n");
 802         *pErrorCode=U_PARSE_ERROR;
 803         exit(U_PARSE_ERROR);
 804     }
 805     if(caseFoldingIndex<caseFoldingCount) {
 806         fprintf(stderr, "gencase: error - some code points in CaseFolding.txt are missing from UnicodeData.txt\n");
 807         *pErrorCode=U_PARSE_ERROR;
 808         exit(U_PARSE_ERROR);
 809     }
 810
 811     if(U_FAILURE(*pErrorCode)) {
 812         return;
 813     }
 814
 815     for(i=0;
 816         0==uset_getItem(caseSensitive, i, &start, &end, NULL, 0, pErrorCode) && U_SUCCESS(*pErrorCode);
 817         ++i
 818     ) {
 819         addCaseSensitive(start, end);
 820     }
 821     if(*pErrorCode==U_INDEX_OUTOFBOUNDS_ERROR) {
 822         *pErrorCode=U_ZERO_ERROR;
 823     }
 824 }
 825
 826 /*
 827  * Hey, Emacs, please set the following:
 828  *
 829  * Local Variables:
 830  * indent-tabs-mode: nil
 831  * End:
 832  *
 833  */