icuSources/tools/makeconv/makeconv.cpp

   1 // © 2016 and later: Unicode, Inc. and others.
   2 // License & terms of use: http://www.unicode.org/copyright.html
   3 /*
   4  ********************************************************************************
   5  *
   6  *   Copyright (C) 1998-2015, International Business Machines
   7  *   Corporation and others.  All Rights Reserved.
   8  *
   9  ********************************************************************************
  10  *
  11  *
  12  *  makeconv.cpp:
  13  *  tool creating a binary (compressed) representation of the conversion mapping
  14  *  table (IBM NLTC ucmap format).
  15  *
  16  *  05/04/2000    helena     Added fallback mapping into the picture...
  17  *  06/29/2000  helena      Major rewrite of the callback APIs.
  18  */
  19
  20 #include <stdio.h>
  21 #include "unicode/putil.h"
  22 #include "unicode/ucnv_err.h"
  23 #include "charstr.h"
  24 #include "ucnv_bld.h"
  25 #include "ucnv_imp.h"
  26 #include "ucnv_cnv.h"
  27 #include "cstring.h"
  28 #include "cmemory.h"
  29 #include "uinvchar.h"
  30 #include "filestrm.h"
  31 #include "toolutil.h"
  32 #include "uoptions.h"
  33 #include "unicode/udata.h"
  34 #include "unewdata.h"
  35 #include "uparse.h"
  36 #include "ucm.h"
  37 #include "makeconv.h"
  38 #include "genmbcs.h"
  39
  40 #define DEBUG 0
  41
  42 typedef struct ConvData {
  43     UCMFile *ucm;
  44     NewConverter *cnvData, *extData;
  45     UConverterSharedData sharedData;
  46     UConverterStaticData staticData;
  47 } ConvData;
  48
  49 static void
  50 initConvData(ConvData *data) {
  51     uprv_memset(data, 0, sizeof(ConvData));
  52     data->sharedData.structSize=sizeof(UConverterSharedData);
  53     data->staticData.structSize=sizeof(UConverterStaticData);
  54     data->sharedData.staticData=&data->staticData;
  55 }
  56
  57 static void
  58 cleanupConvData(ConvData *data) {
  59     if(data!=NULL) {
  60         if(data->cnvData!=NULL) {
  61             data->cnvData->close(data->cnvData);
  62             data->cnvData=NULL;
  63         }
  64         if(data->extData!=NULL) {
  65             data->extData->close(data->extData);
  66             data->extData=NULL;
  67         }
  68         ucm_close(data->ucm);
  69         data->ucm=NULL;
  70     }
  71 }
  72
  73 /*
  74  * from ucnvstat.c - static prototypes of data-based converters
  75  */
  76 U_CAPI const UConverterStaticData * ucnv_converterStaticData[UCNV_NUMBER_OF_SUPPORTED_CONVERTER_TYPES];
  77
  78 /*
  79  * Global - verbosity
  80  */
/* set from -v/--verbose in main(); enables the "- Opened udata"/"- Wrote" progress lines */
UBool VERBOSE = FALSE;
/* set from -q/--quiet in main(); suppresses the "claims to be" name-mismatch warning */
UBool QUIET = FALSE;
/* set from --small in main(); not read anywhere in this file */
UBool SMALL = FALSE;
/* set by --ignore-siso-check in main(); passed to ucm_processStates() in readFile() */
UBool IGNORE_SISO_CHECK = FALSE;
  85
  86 static void
  87 createConverter(ConvData *data, const char* converterName, UErrorCode *pErrorCode);
  88
  89 /*
  90  * Set up the UNewData and write the converter..
  91  */
  92 static void
  93 writeConverterData(ConvData *data, const char *cnvName, const char *cnvDir, UErrorCode *status);
  94
/*
 * When TRUE, writeConverterData() passes U_COPYRIGHT_STRING to udata_create();
 * otherwise it passes NULL. main() overwrites this with the -c/--copyright
 * option, so the initial TRUE does not survive option parsing.
 */
UBool haveCopyright=TRUE;

/*
 * Data header handed to udata_create() for every .cnv file written.
 * The dataVersion field is overwritten in main() with the ICU version
 * obtained from u_getVersion().
 */
static UDataInfo dataInfo={
    sizeof(UDataInfo),
    0,

    U_IS_BIG_ENDIAN,
    U_CHARSET_FAMILY,
    sizeof(UChar),
    0,

    {0x63, 0x6e, 0x76, 0x74},     /* dataFormat="cnvt" */
    {6, 2, 0, 0},                 /* formatVersion */
    {0, 0, 0, 0}                  /* dataVersion (calculated at runtime) */
};
 110
 111 static void
 112 writeConverterData(ConvData *data, const char *cnvName, const char *cnvDir, UErrorCode *status)
 113 {
 114     UNewDataMemory *mem = NULL;
 115     uint32_t sz2;
 116     uint32_t size = 0;
 117     int32_t tableType;
 118
 119     if(U_FAILURE(*status))
 120       {
 121         return;
 122       }
 123
 124     tableType=TABLE_NONE;
 125     if(data->cnvData!=NULL) {
 126         tableType|=TABLE_BASE;
 127     }
 128     if(data->extData!=NULL) {
 129         tableType|=TABLE_EXT;
 130     }
 131
 132     mem = udata_create(cnvDir, "cnv", cnvName, &dataInfo, haveCopyright ? U_COPYRIGHT_STRING : NULL, status);
 133
 134     if(U_FAILURE(*status))
 135       {
 136         fprintf(stderr, "Couldn't create the udata %s.%s: %s\n",
 137                 cnvName,
 138                 "cnv",
 139                 u_errorName(*status));
 140         return;
 141       }
 142
 143     if(VERBOSE)
 144       {
 145         printf("- Opened udata %s.%s\n", cnvName, "cnv");
 146       }
 147
 148
 149     /* all read only, clean, platform independent data.  Mmmm. :)  */
 150     udata_writeBlock(mem, &data->staticData, sizeof(UConverterStaticData));
 151     size += sizeof(UConverterStaticData); /* Is 4-aligned  - by size */
 152     /* Now, write the table */
 153     if(tableType&TABLE_BASE) {
 154         size += data->cnvData->write(data->cnvData, &data->staticData, mem, tableType);
 155     }
 156     if(tableType&TABLE_EXT) {
 157         size += data->extData->write(data->extData, &data->staticData, mem, tableType);
 158     }
 159
 160     sz2 = udata_finish(mem, status);
 161     if(size != sz2)
 162     {
 163         fprintf(stderr, "error: wrote %u bytes to the .cnv file but counted %u bytes\n", (int)sz2, (int)size);
 164         *status=U_INTERNAL_PROGRAM_ERROR;
 165     }
 166     if(VERBOSE)
 167     {
 168       printf("- Wrote %u bytes to the udata.\n", (int)sz2);
 169     }
 170 }
 171
/*
 * Indexes into options[]. The order here must match the order of the
 * entries in the options[] array, because main() looks options up with
 * options[OPT_...].
 */
enum {
    OPT_HELP_H,
    OPT_HELP_QUESTION_MARK,
    OPT_COPYRIGHT,
    OPT_VERSION,
    OPT_DESTDIR,
    OPT_VERBOSE,
    OPT_SMALL,
    OPT_IGNORE_SISO_CHECK,
    OPT_QUIET,

    OPT_COUNT
};
 185
/*
 * Command line options, parsed by u_parseArgs() in main().
 * Entries are in the same order as the OPT_... enum constants.
 */
static UOption options[]={
    UOPTION_HELP_H,
    UOPTION_HELP_QUESTION_MARK,
    UOPTION_COPYRIGHT,
    UOPTION_VERSION,
    UOPTION_DESTDIR,
    UOPTION_VERBOSE,
    /* long-only options; NOTE(review): '\1' presumably means "no single-letter form" - confirm in uoptions.h */
    { "small", NULL, NULL, NULL, '\1', UOPT_NO_ARG, 0 },
    { "ignore-siso-check", NULL, NULL, NULL, '\1', UOPT_NO_ARG, 0 },
    UOPTION_QUIET,
};
 197
 198 int main(int argc, char* argv[])
 199 {
 200     ConvData data;
 201     char cnvName[UCNV_MAX_FULL_FILE_NAME_LENGTH];
 202
 203     U_MAIN_INIT_ARGS(argc, argv);
 204
 205     /* Set up the ICU version number */
 206     UVersionInfo icuVersion;
 207     u_getVersion(icuVersion);
 208     uprv_memcpy(&dataInfo.dataVersion, &icuVersion, sizeof(UVersionInfo));
 209
 210     /* preset then read command line options */
 211     options[OPT_DESTDIR].value=u_getDataDirectory();
 212     argc=u_parseArgs(argc, argv, UPRV_LENGTHOF(options), options);
 213
 214     /* error handling, printing usage message */
 215     if(argc<0) {
 216         fprintf(stderr,
 217             "error in command line argument \"%s\"\n",
 218             argv[-argc]);
 219     } else if(argc<2) {
 220         argc=-1;
 221     }
 222     if(argc<0 || options[OPT_HELP_H].doesOccur || options[OPT_HELP_QUESTION_MARK].doesOccur) {
 223         FILE *stdfile=argc<0 ? stderr : stdout;
 224         fprintf(stdfile,
 225             "usage: %s [-options] files...\n"
 226             "\tread .ucm codepage mapping files and write .cnv files\n"
 227             "options:\n"
 228             "\t-h or -? or --help  this usage text\n"
 229             "\t-V or --version     show a version message\n"
 230             "\t-c or --copyright   include a copyright notice\n"
 231             "\t-d or --destdir     destination directory, followed by the path\n"
 232             "\t-v or --verbose     Turn on verbose output\n"
 233             "\t-q or --quiet       do not display warnings and progress\n",
 234             argv[0]);
 235         fprintf(stdfile,
 236             "\t      --small       Generate smaller .cnv files. They will be\n"
 237             "\t                    significantly smaller but may not be compatible with\n"
 238             "\t                    older versions of ICU and will require heap memory\n"
 239             "\t                    allocation when loaded.\n"
 240             "\t      --ignore-siso-check         Use SI/SO other than 0xf/0xe.\n");
 241         return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR;
 242     }
 243
 244     if(options[OPT_VERSION].doesOccur) {
 245         printf("makeconv version %u.%u, ICU tool to read .ucm codepage mapping files and write .cnv files\n",
 246                dataInfo.formatVersion[0], dataInfo.formatVersion[1]);
 247         printf("%s\n", U_COPYRIGHT_STRING);
 248         exit(0);
 249     }
 250
 251     /* get the options values */
 252     haveCopyright = options[OPT_COPYRIGHT].doesOccur;
 253     const char *destdir = options[OPT_DESTDIR].value;
 254     VERBOSE = options[OPT_VERBOSE].doesOccur;
 255     QUIET = options[OPT_QUIET].doesOccur;
 256     SMALL = options[OPT_SMALL].doesOccur;
 257
 258     if (options[OPT_IGNORE_SISO_CHECK].doesOccur) {
 259         IGNORE_SISO_CHECK = TRUE;
 260     }
 261
 262     icu::CharString outFileName;
 263     UErrorCode err = U_ZERO_ERROR;
 264     if (destdir != NULL && *destdir != 0) {
 265         outFileName.append(destdir, err).ensureEndsWithFileSeparator(err);
 266         if (U_FAILURE(err)) {
 267             return err;
 268         }
 269     }
 270     int32_t outBasenameStart = outFileName.length();
 271
 272 #if DEBUG
 273     {
 274       int i;
 275       printf("makeconv: processing %d files...\n", argc - 1);
 276       for(i=1; i<argc; ++i) {
 277         printf("%s ", argv[i]);
 278       }
 279       printf("\n");
 280       fflush(stdout);
 281     }
 282 #endif
 283
 284     UBool printFilename = (UBool) (argc > 2 || VERBOSE);
 285     for (++argv; --argc; ++argv)
 286     {
 287         UErrorCode localError = U_ZERO_ERROR;
 288         const char *arg = getLongPathname(*argv);
 289
 290         /*produces the right destination path for display*/
 291         outFileName.truncate(outBasenameStart);
 292         if (outBasenameStart != 0)
 293         {
 294             /* find the last file sepator */
 295             const char *basename = findBasename(arg);
 296             outFileName.append(basename, localError);
 297         }
 298         else
 299         {
 300             outFileName.append(arg, localError);
 301         }
 302         if (U_FAILURE(localError)) {
 303             return localError;
 304         }
 305
 306         /*removes the extension if any is found*/
 307         int32_t lastDotIndex = outFileName.lastIndexOf('.');
 308         if (lastDotIndex >= outBasenameStart) {
 309             outFileName.truncate(lastDotIndex);
 310         }
 311
 312         /* the basename without extension is the converter name */
 313         if ((outFileName.length() - outBasenameStart) >= UPRV_LENGTHOF(cnvName)) {
 314             fprintf(stderr, "converter name %s too long\n", outFileName.data() + outBasenameStart);
 315             return U_BUFFER_OVERFLOW_ERROR;
 316         }
 317         uprv_strcpy(cnvName, outFileName.data() + outBasenameStart);
 318
 319         /*Adds the target extension*/
 320         outFileName.append(CONVERTER_FILE_EXTENSION, localError);
 321         if (U_FAILURE(localError)) {
 322             return localError;
 323         }
 324
 325 #if DEBUG
 326         printf("makeconv: processing %s  ...\n", arg);
 327         fflush(stdout);
 328 #endif
 329         initConvData(&data);
 330         createConverter(&data, arg, &localError);
 331
 332         if (U_FAILURE(localError))
 333         {
 334             /* if an error is found, print out an error msg and keep going */
 335             fprintf(stderr, "Error creating converter for \"%s\" file for \"%s\" (%s)\n",
 336                     outFileName.data(), arg, u_errorName(localError));
 337             if(U_SUCCESS(err)) {
 338                 err = localError;
 339             }
 340         }
 341         else
 342         {
 343             /* Insure the static data name matches the  file name */
 344             /* Changed to ignore directory and only compare base name
 345              LDH 1/2/08*/
 346             char *p;
 347             p = strrchr(cnvName, U_FILE_SEP_CHAR); /* Find last file separator */
 348
 349             if(p == NULL)            /* OK, try alternate */
 350             {
 351                 p = strrchr(cnvName, U_FILE_ALT_SEP_CHAR);
 352                 if(p == NULL)
 353                 {
 354                     p=cnvName; /* If no separators, no problem */
 355                 }
 356             }
 357             else
 358             {
 359                 p++;   /* If found separator, don't include it in compare */
 360             }
 361             if(uprv_stricmp(p,data.staticData.name) && !QUIET)
 362             {
 363                 fprintf(stderr, "Warning: %s%s claims to be '%s'\n",
 364                     cnvName,  CONVERTER_FILE_EXTENSION,
 365                     data.staticData.name);
 366             }
 367
 368             uprv_strcpy((char*)data.staticData.name, cnvName);
 369
 370             if(!uprv_isInvariantString((char*)data.staticData.name, -1)) {
 371                 fprintf(stderr,
 372                     "Error: A converter name must contain only invariant characters.\n"
 373                     "%s is not a valid converter name.\n",
 374                     data.staticData.name);
 375                 if(U_SUCCESS(err)) {
 376                     err = U_INVALID_TABLE_FORMAT;
 377                 }
 378             }
 379
 380             localError = U_ZERO_ERROR;
 381             writeConverterData(&data, cnvName, destdir, &localError);
 382
 383             if(U_FAILURE(localError))
 384             {
 385                 /* if an error is found, print out an error msg and keep going*/
 386                 fprintf(stderr, "Error writing \"%s\" file for \"%s\" (%s)\n", outFileName.data(), arg,
 387                     u_errorName(localError));
 388                 if(U_SUCCESS(err)) {
 389                     err = localError;
 390                 }
 391             }
 392             else if (printFilename)
 393             {
 394                 puts(outFileName.data() + outBasenameStart);
 395             }
 396         }
 397         fflush(stdout);
 398         fflush(stderr);
 399
 400         cleanupConvData(&data);
 401     }
 402
 403     return err;
 404 }
 405
 406 static void
 407 getPlatformAndCCSIDFromName(const char *name, int8_t *pPlatform, int32_t *pCCSID) {
 408     if( (name[0]=='i' || name[0]=='I') &&
 409         (name[1]=='b' || name[1]=='B') &&
 410         (name[2]=='m' || name[2]=='M')
 411     ) {
 412         name+=3;
 413         if(*name=='-') {
 414             ++name;
 415         }
 416         *pPlatform=UCNV_IBM;
 417         *pCCSID=(int32_t)uprv_strtoul(name, NULL, 10);
 418     } else {
 419         *pPlatform=UCNV_UNKNOWN;
 420         *pCCSID=0;
 421     }
 422 }
 423
 424 static void
 425 readHeader(ConvData *data,
 426            FileStream* convFile,
 427            UErrorCode *pErrorCode) {
 428     char line[1024];
 429     char *s, *key, *value;
 430     const UConverterStaticData *prototype;
 431     UConverterStaticData *staticData;
 432
 433     if(U_FAILURE(*pErrorCode)) {
 434         return;
 435     }
 436
 437     staticData=&data->staticData;
 438     staticData->platform=UCNV_IBM;
 439     staticData->subCharLen=0;
 440
 441     while(T_FileStream_readLine(convFile, line, sizeof(line))) {
 442         /* basic parsing and handling of state-related items */
 443         if(ucm_parseHeaderLine(data->ucm, line, &key, &value)) {
 444             continue;
 445         }
 446
 447         /* stop at the beginning of the mapping section */
 448         if(uprv_strcmp(line, "CHARMAP")==0) {
 449             break;
 450         }
 451
 452         /* collect the information from the header field, ignore unknown keys */
 453         if(uprv_strcmp(key, "code_set_name")==0) {
 454             if(*value!=0) {
 455                 uprv_strcpy((char *)staticData->name, value);
 456                 getPlatformAndCCSIDFromName(value, &staticData->platform, &staticData->codepage);
 457             }
 458         } else if(uprv_strcmp(key, "subchar")==0) {
 459             uint8_t bytes[UCNV_EXT_MAX_BYTES];
 460             int8_t length;
 461
 462             s=value;
 463             length=ucm_parseBytes(bytes, line, (const char **)&s);
 464             if(1<=length && length<=4 && *s==0) {
 465                 staticData->subCharLen=length;
 466                 uprv_memcpy(staticData->subChar, bytes, length);
 467             } else {
 468                 fprintf(stderr, "error: illegal <subchar> %s\n", value);
 469                 *pErrorCode=U_INVALID_TABLE_FORMAT;
 470                 return;
 471             }
 472         } else if(uprv_strcmp(key, "subchar1")==0) {
 473             uint8_t bytes[UCNV_EXT_MAX_BYTES];
 474
 475             s=value;
 476             if(1==ucm_parseBytes(bytes, line, (const char **)&s) && *s==0) {
 477                 staticData->subChar1=bytes[0];
 478             } else {
 479                 fprintf(stderr, "error: illegal <subchar1> %s\n", value);
 480                 *pErrorCode=U_INVALID_TABLE_FORMAT;
 481                 return;
 482             }
 483         }
 484     }
 485
 486     /* copy values from the UCMFile to the static data */
 487     staticData->maxBytesPerChar=(int8_t)data->ucm->states.maxCharLength;
 488     staticData->minBytesPerChar=(int8_t)data->ucm->states.minCharLength;
 489     staticData->conversionType=data->ucm->states.conversionType;
 490
 491     if(staticData->conversionType==UCNV_UNSUPPORTED_CONVERTER) {
 492         fprintf(stderr, "ucm error: missing conversion type (<uconv_class>)\n");
 493         *pErrorCode=U_INVALID_TABLE_FORMAT;
 494         return;
 495     }
 496
 497     /*
 498      * Now that we know the type, copy any 'default' values from the table.
 499      * We need not check the type any further because the parser only
 500      * recognizes what we have prototypes for.
 501      *
 502      * For delta (extension-only) tables, copy values from the base file
 503      * instead, see createConverter().
 504      */
 505     if(data->ucm->baseName[0]==0) {
 506         prototype=ucnv_converterStaticData[staticData->conversionType];
 507         if(prototype!=NULL) {
 508             if(staticData->name[0]==0) {
 509                 uprv_strcpy((char *)staticData->name, prototype->name);
 510             }
 511
 512             if(staticData->codepage==0) {
 513                 staticData->codepage=prototype->codepage;
 514             }
 515
 516             if(staticData->platform==0) {
 517                 staticData->platform=prototype->platform;
 518             }
 519
 520             if(staticData->minBytesPerChar==0) {
 521                 staticData->minBytesPerChar=prototype->minBytesPerChar;
 522             }
 523
 524             if(staticData->maxBytesPerChar==0) {
 525                 staticData->maxBytesPerChar=prototype->maxBytesPerChar;
 526             }
 527
 528             if(staticData->subCharLen==0) {
 529                 staticData->subCharLen=prototype->subCharLen;
 530                 if(prototype->subCharLen>0) {
 531                     uprv_memcpy(staticData->subChar, prototype->subChar, prototype->subCharLen);
 532                 }
 533             }
 534         }
 535     }
 536
 537     if(data->ucm->states.outputType<0) {
 538         data->ucm->states.outputType=(int8_t)data->ucm->states.maxCharLength-1;
 539     }
 540
 541     if( staticData->subChar1!=0 &&
 542             (staticData->minBytesPerChar>1 ||
 543                 (staticData->conversionType!=UCNV_MBCS &&
 544                  staticData->conversionType!=UCNV_EBCDIC_STATEFUL))
 545     ) {
 546         fprintf(stderr, "error: <subchar1> defined for a type other than MBCS or EBCDIC_STATEFUL\n");
 547         *pErrorCode=U_INVALID_TABLE_FORMAT;
 548     }
 549 }
 550
 551 /* return TRUE if a base table was read, FALSE for an extension table */
 552 static UBool
 553 readFile(ConvData *data, const char* converterName,
 554          UErrorCode *pErrorCode) {
 555     char line[1024];
 556     char *end;
 557     FileStream *convFile;
 558
 559     UCMStates *baseStates;
 560     UBool dataIsBase;
 561
 562     if(U_FAILURE(*pErrorCode)) {
 563         return FALSE;
 564     }
 565
 566     data->ucm=ucm_open();
 567
 568     convFile=T_FileStream_open(converterName, "r");
 569     if(convFile==NULL) {
 570         *pErrorCode=U_FILE_ACCESS_ERROR;
 571         return FALSE;
 572     }
 573
 574     readHeader(data, convFile, pErrorCode);
 575     if(U_FAILURE(*pErrorCode)) {
 576         return FALSE;
 577     }
 578
 579     if(data->ucm->baseName[0]==0) {
 580         dataIsBase=TRUE;
 581         baseStates=&data->ucm->states;
 582         ucm_processStates(baseStates, IGNORE_SISO_CHECK);
 583     } else {
 584         dataIsBase=FALSE;
 585         baseStates=NULL;
 586     }
 587
 588     /* read the base table */
 589     ucm_readTable(data->ucm, convFile, dataIsBase, baseStates, pErrorCode);
 590     if(U_FAILURE(*pErrorCode)) {
 591         return FALSE;
 592     }
 593
 594     /* read an extension table if there is one */
 595     while(T_FileStream_readLine(convFile, line, sizeof(line))) {
 596         end=uprv_strchr(line, 0);
 597         while(line<end &&
 598               (*(end-1)=='\n' || *(end-1)=='\r' || *(end-1)==' ' || *(end-1)=='\t')) {
 599             --end;
 600         }
 601         *end=0;
 602
 603         if(line[0]=='#' || u_skipWhitespace(line)==end) {
 604             continue; /* ignore empty and comment lines */
 605         }
 606
 607         if(0==uprv_strcmp(line, "CHARMAP")) {
 608             /* read the extension table */
 609             ucm_readTable(data->ucm, convFile, FALSE, baseStates, pErrorCode);
 610         } else {
 611             fprintf(stderr, "unexpected text after the base mapping table\n");
 612         }
 613         break;
 614     }
 615
 616     T_FileStream_close(convFile);
 617
 618     if(data->ucm->base->flagsType==UCM_FLAGS_MIXED || data->ucm->ext->flagsType==UCM_FLAGS_MIXED) {
 619         fprintf(stderr, "error: some entries have the mapping precision (with '|'), some do not\n");
 620         *pErrorCode=U_INVALID_TABLE_FORMAT;
 621     }
 622
 623     return dataIsBase;
 624 }
 625
 626 static void
 627 createConverter(ConvData *data, const char *converterName, UErrorCode *pErrorCode) {
 628     ConvData baseData;
 629     UBool dataIsBase;
 630
 631     UConverterStaticData *staticData;
 632     UCMStates *states, *baseStates;
 633
 634     if(U_FAILURE(*pErrorCode)) {
 635         return;
 636     }
 637
 638     initConvData(data);
 639
 640     dataIsBase=readFile(data, converterName, pErrorCode);
 641     if(U_FAILURE(*pErrorCode)) {
 642         return;
 643     }
 644
 645     staticData=&data->staticData;
 646     states=&data->ucm->states;
 647
 648     if(dataIsBase) {
 649         /*
 650          * Build a normal .cnv file with a base table
 651          * and an optional extension table.
 652          */
 653         data->cnvData=MBCSOpen(data->ucm);
 654         if(data->cnvData==NULL) {
 655             *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
 656
 657         } else if(!data->cnvData->isValid(data->cnvData,
 658                             staticData->subChar, staticData->subCharLen)
 659         ) {
 660             fprintf(stderr, "       the substitution character byte sequence is illegal in this codepage structure!\n");
 661             *pErrorCode=U_INVALID_TABLE_FORMAT;
 662
 663         } else if(staticData->subChar1!=0 &&
 664                     !data->cnvData->isValid(data->cnvData, &staticData->subChar1, 1)
 665         ) {
 666             fprintf(stderr, "       the subchar1 byte is illegal in this codepage structure!\n");
 667             *pErrorCode=U_INVALID_TABLE_FORMAT;
 668
 669         } else if(
 670             data->ucm->ext->mappingsLength>0 &&
 671             !ucm_checkBaseExt(states, data->ucm->base, data->ucm->ext, data->ucm->ext, FALSE)
 672         ) {
 673             *pErrorCode=U_INVALID_TABLE_FORMAT;
 674         } else if(data->ucm->base->flagsType&UCM_FLAGS_EXPLICIT) {
 675             /* sort the table so that it can be turned into UTF-8-friendly data */
 676             ucm_sortTable(data->ucm->base);
 677         }
 678
 679         if(U_SUCCESS(*pErrorCode)) {
 680             if(
 681                 /* add the base table after ucm_checkBaseExt()! */
 682                 !data->cnvData->addTable(data->cnvData, data->ucm->base, &data->staticData)
 683             ) {
 684                 *pErrorCode=U_INVALID_TABLE_FORMAT;
 685             } else {
 686                 /*
 687                  * addTable() may have requested moving more mappings to the extension table
 688                  * if they fit into the base toUnicode table but not into the
 689                  * base fromUnicode table.
 690                  * (Especially for UTF-8-friendly fromUnicode tables.)
 691                  * Such mappings will have the MBCS_FROM_U_EXT_FLAG set, which causes them
 692                  * to be excluded from the extension toUnicode data.
 693                  * See MBCSOkForBaseFromUnicode() for which mappings do not fit into
 694                  * the base fromUnicode table.
 695                  */
 696                 ucm_moveMappings(data->ucm->base, data->ucm->ext);
 697                 ucm_sortTable(data->ucm->ext);
 698                 if(data->ucm->ext->mappingsLength>0) {
 699                     /* prepare the extension table, if there is one */
 700                     data->extData=CnvExtOpen(data->ucm);
 701                     if(data->extData==NULL) {
 702                         *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
 703                     } else if(
 704                         !data->extData->addTable(data->extData, data->ucm->ext, &data->staticData)
 705                     ) {
 706                         *pErrorCode=U_INVALID_TABLE_FORMAT;
 707                     }
 708                 }
 709             }
 710         }
 711     } else {
 712         /* Build an extension-only .cnv file. */
 713         char baseFilename[500];
 714         char *basename;
 715
 716         initConvData(&baseData);
 717
 718         /* assemble a path/filename for data->ucm->baseName */
 719         uprv_strcpy(baseFilename, converterName);
 720         basename=(char *)findBasename(baseFilename);
 721         uprv_strcpy(basename, data->ucm->baseName);
 722         uprv_strcat(basename, ".ucm");
 723
 724         /* read the base table */
 725         dataIsBase=readFile(&baseData, baseFilename, pErrorCode);
 726         if(U_FAILURE(*pErrorCode)) {
 727             return;
 728         } else if(!dataIsBase) {
 729             fprintf(stderr, "error: the <icu:base> file \"%s\" is not a base table file\n", baseFilename);
 730             *pErrorCode=U_INVALID_TABLE_FORMAT;
 731         } else {
 732             /* prepare the extension table */
 733             data->extData=CnvExtOpen(data->ucm);
 734             if(data->extData==NULL) {
 735                 *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
 736             } else {
 737                 /* fill in gaps in extension file header fields */
 738                 UCMapping *m, *mLimit;
 739                 uint8_t fallbackFlags;
 740
 741                 baseStates=&baseData.ucm->states;
 742                 if(states->conversionType==UCNV_DBCS) {
 743                     staticData->minBytesPerChar=(int8_t)(states->minCharLength=2);
 744                 } else if(states->minCharLength==0) {
 745                     staticData->minBytesPerChar=(int8_t)(states->minCharLength=baseStates->minCharLength);
 746                 }
 747                 if(states->maxCharLength<states->minCharLength) {
 748                     staticData->maxBytesPerChar=(int8_t)(states->maxCharLength=baseStates->maxCharLength);
 749                 }
 750
 751                 if(staticData->subCharLen==0) {
 752                     uprv_memcpy(staticData->subChar, baseData.staticData.subChar, 4);
 753                     staticData->subCharLen=baseData.staticData.subCharLen;
 754                 }
 755                 /*
 756                  * do not copy subChar1 -
 757                  * only use what is explicitly specified
 758                  * because it cannot be unset in the extension file header
 759                  */
 760
 761                 /* get the fallback flags */
 762                 fallbackFlags=0;
 763                 for(m=baseData.ucm->base->mappings, mLimit=m+baseData.ucm->base->mappingsLength;
 764                     m<mLimit && fallbackFlags!=3;
 765                     ++m
 766                 ) {
 767                     if(m->f==1) {
 768                         fallbackFlags|=1;
 769                     } else if(m->f==3) {
 770                         fallbackFlags|=2;
 771                     }
 772                 }
 773
 774                 if(fallbackFlags&1) {
 775                     staticData->hasFromUnicodeFallback=TRUE;
 776                 }
 777                 if(fallbackFlags&2) {
 778                     staticData->hasToUnicodeFallback=TRUE;
 779                 }
 780
 781                 if(1!=ucm_countChars(baseStates, staticData->subChar, staticData->subCharLen)) {
 782                     fprintf(stderr, "       the substitution character byte sequence is illegal in this codepage structure!\n");
 783                     *pErrorCode=U_INVALID_TABLE_FORMAT;
 784
 785                 } else if(staticData->subChar1!=0 && 1!=ucm_countChars(baseStates, &staticData->subChar1, 1)) {
 786                     fprintf(stderr, "       the subchar1 byte is illegal in this codepage structure!\n");
 787                     *pErrorCode=U_INVALID_TABLE_FORMAT;
 788
 789                 } else if(
 790                     !ucm_checkValidity(data->ucm->ext, baseStates) ||
 791                     !ucm_checkBaseExt(baseStates, baseData.ucm->base, data->ucm->ext, data->ucm->ext, FALSE)
 792                 ) {
 793                     *pErrorCode=U_INVALID_TABLE_FORMAT;
 794                 } else {
 795                     if(states->maxCharLength>1) {
 796                         /*
 797                          * When building a normal .cnv file with a base table
 798                          * for an MBCS (not SBCS) table with explicit precision flags,
 799                          * the MBCSAddTable() function marks some mappings for moving
 800                          * to the extension table.
 801                          * They fit into the base toUnicode table but not into the
 802                          * base fromUnicode table.
 803                          * (Note: We do have explicit precision flags because they are
 804                          * required for extension table generation, and
 805                          * ucm_checkBaseExt() verified it.)
 806                          *
 807                          * We do not call MBCSAddTable() here (we probably could)
 808                          * so we need to do the analysis before building the extension table.
 809                          * We assume that MBCSAddTable() will build a UTF-8-friendly table.
 810                          * Redundant mappings in the extension table are ok except they cost some size.
 811                          *
 812                          * Do this after ucm_checkBaseExt().
 813                          */
 814                         const MBCSData *mbcsData=MBCSGetDummy();
 815                         int32_t needsMove=0;
 816                         for(m=baseData.ucm->base->mappings, mLimit=m+baseData.ucm->base->mappingsLength;
 817                             m<mLimit;
 818                             ++m
 819                         ) {
 820                             if(!MBCSOkForBaseFromUnicode(mbcsData, m->b.bytes, m->bLen, m->u, m->f)) {
 821                                 m->f|=MBCS_FROM_U_EXT_FLAG;
 822                                 m->moveFlag=UCM_MOVE_TO_EXT;
 823                                 ++needsMove;
 824                             }
 825                         }
 826
 827                         if(needsMove!=0) {
 828                             ucm_moveMappings(baseData.ucm->base, data->ucm->ext);
 829                             ucm_sortTable(data->ucm->ext);
 830                         }
 831                     }
 832                     if(!data->extData->addTable(data->extData, data->ucm->ext, &data->staticData)) {
 833                         *pErrorCode=U_INVALID_TABLE_FORMAT;
 834                     }
 835                 }
 836             }
 837         }
 838
 839         cleanupConvData(&baseData);
 840     }
 841 }
 842
 843 /*
 844  * Hey, Emacs, please set the following:
 845  *
 846  * Local Variables:
 847  * indent-tabs-mode: nil
 848  * End:
 849  *
 850  */