ICU-400.42.tar.gz

[apple/icu.git] / icuSources / tools / makeconv / makeconv.c
diff --git a/icuSources/tools/makeconv/makeconv.c b/icuSources/tools/makeconv/makeconv.c

index 328cda43aed3ef8a71d8d15d3fa8310e994f7d1e..a2815e8587a7aa7abe89e9620acdd79af8fbbc68 100644 (file)
--- a/icuSources/tools/makeconv/makeconv.c
+++ b/icuSources/tools/makeconv/makeconv.c
@@ -1,7 +1,7 @@
  /*
   ********************************************************************************
   *
- *   Copyright (C) 1998-2003, International Business Machines
+ *   Copyright (C) 1998-2008, International Business Machines
   *   Corporation and others.  All Rights Reserved.
   *
   ********************************************************************************
@@ -17,166 +17,77 @@
  
  #include <stdio.h>
  #include "unicode/putil.h"
-#include "ucnv_io.h"
  #include "unicode/ucnv_err.h"
  #include "ucnv_bld.h"
  #include "ucnv_imp.h"
  #include "ucnv_cnv.h"
  #include "cstring.h"
  #include "cmemory.h"
+#include "uinvchar.h"
  #include "filestrm.h"
  #include "toolutil.h"
  #include "uoptions.h"
  #include "unicode/udata.h"
  #include "unewdata.h"
-#include "ucmpwrit.h"
+#include "uparse.h"
+#include "ucm.h"
  #include "makeconv.h"
  #include "genmbcs.h"
  
-#define DEBUG 0
-
-/*
- * from ucnvstat.c - static prototypes of data-based converters
- */
-extern const UConverterStaticData * ucnv_converterStaticData[UCNV_NUMBER_OF_SUPPORTED_CONVERTER_TYPES];
-
-/*
- * Global - verbosity
- */
-UBool VERBOSE = FALSE;
-UBool TOUCHFILE = FALSE;
+#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
  
-/*Reads the header of the table file and fills in basic knowledge about the converter
- *in "converter"
- */
-static void readHeaderFromFile(UConverterSharedData* myConverter, FileStream* convFile, const char* converterName, UErrorCode* err);
-
-/*Reads the rest of the file, and fills up the shared objects if necessary
-Returns the UConverterTable. */
-static void loadTableFromFile(FileStream* convFile, UConverterSharedData* sharedData, UErrorCode* err);
-
-/* creates a UConverterSharedData from a mapping file.
- * Fills in:  *staticData, *table.  Converter is NOT otherwise useful.
- */
-static UConverterSharedData* createConverterFromTableFile(const char* realName, UErrorCode* err);
-
-/*
- * Set up the UNewData and write the converter..
- */
-void writeConverterData(UConverterSharedData *mySharedData, const char *cnvName, const char *cnvDir, UErrorCode *status);
+#define DEBUG 0
  
-static const char NLTC_SEPARATORS[9] = { '\r', '\n', '\t', ' ', '<', '>' ,'"' , 'U', '\0' };
-static const char FALLBACK_SEPARATOR = '|';
-static const char CODEPOINT_SEPARATORS[8] = {  '\r', '>', '\\', 'x', '\n', ' ', '\t', '\0' };
-static const char UNICODE_CODEPOINT_SEPARATORS[6] = {  '<', '>', 'U', ' ', '\t', '\0' };
+typedef struct ConvData {
+    UCMFile *ucm;
+    NewConverter *cnvData, *extData;
+    UConverterSharedData sharedData;
+    UConverterStaticData staticData;
+} ConvData;
  
-static const char *
-skipWhitespace(const char *s) {
-    while(*s==' ' || *s=='\t') {
-        ++s;
-    }
-    return s;
+static void
+initConvData(ConvData *data) {
+    uprv_memset(data, 0, sizeof(ConvData));
+    data->sharedData.structSize=sizeof(UConverterSharedData);
+    data->staticData.structSize=sizeof(UConverterStaticData);
+    data->sharedData.staticData=&data->staticData;
  }
  
-static int32_t
-parseCodepageBytes(const char *s, uint32_t *pBytes, const char **pEnd) {
-    char *end;
-    int32_t length=0;
-    uint32_t bytes=0, value;
-
-    while(s[0]=='\\' && s[1]=='x') {
-        if(length==4) {
-            return -1;
+static void
+cleanupConvData(ConvData *data) {
+    if(data!=NULL) {
+        if(data->cnvData!=NULL) {
+            data->cnvData->close(data->cnvData);
+            data->cnvData=NULL;
          }
-        value=uprv_strtoul(s+2, &end, 16);
-        s+=4;
-        if(end!=s) {
-            return -1;
+        if(data->extData!=NULL) {
+            data->extData->close(data->extData);
+            data->extData=NULL;
          }
-        bytes=(bytes<<8)|value;
-        ++length;
-    }
-    if(length==0) {
-        return -1;
+        ucm_close(data->ucm);
+        data->ucm=NULL;
      }
-    if(pEnd!=NULL) {
-        *pEnd=s;
-    }
-    *pBytes=bytes;
-    return length;
-}
-
-/* Remove all characters followed by '#'. There is an exception if there
- * is a fallback sign '|' after the comment and the comment does not
- * start in column 0. In this case, we just blank from '#' to just
- * before the '|' in order to support the fact that IBM official .ucm
- * files have the fallback information in comments!
- */
-static char *
-  removeComments (char *line)
-{
-  char *pound;
-
-  line = (char*)skipWhitespace(line);
-  pound = uprv_strchr (line, '#');
-  if (pound != NULL)
-  {
-      char *fallback = pound == line ? 0 : uprv_strchr(pound + 1, '|');
-      if (fallback != NULL)
-      {
-          uprv_memset(pound, ' ', fallback-pound);
-      }
-      else
-      {
-          *pound = '\0';
-      }
-  }
-  return line;
  }
  
-/* Returns true in c is a in set 'setOfChars', false otherwise
+/*
+ * from ucnvstat.c - static prototypes of data-based converters
   */
-static UBool
-  isInSet (char c, const char *setOfChars)
-{
-  uint8_t i = 0;
-
-  while (setOfChars[i] != '\0')
-    {
-      if (c == setOfChars[i++])
-        return TRUE;
-    }
-
-  return FALSE;
-}
+extern const UConverterStaticData * ucnv_converterStaticData[UCNV_NUMBER_OF_SUPPORTED_CONVERTER_TYPES];
  
-/* Returns pointer to the next non-whitespace (or non-separator)
+/*
+ * Global - verbosity
   */
-static int32_t
-  nextTokenOffset (const char *line, const char *separators)
-{
-    int32_t i = 0;
-
-    while (line[i] && isInSet(line[i], separators))
-        i++;
+UBool VERBOSE = FALSE;
+UBool SMALL = FALSE;
  
-    return i;
-}
+static void
+createConverter(ConvData *data, const char* converterName, UErrorCode *pErrorCode);
  
-/* Returns pointer to the next token based on the set of separators
+/*
+ * Set up the UNewData and write the converter..
   */
-static char *
-  getToken (char *token, char *line, const char *separators)
-{
-    int32_t i = nextTokenOffset (line, separators);
-    int8_t j = 0;
-
-    while (line[i] && (!isInSet(line[i], separators)))
-        token[j++] = line[i++];
-    token[j] = '\0';
-
-    return line + i;
-}
+static void
+writeConverterData(ConvData *data, const char *cnvName, const char *cnvDir, UErrorCode *status);
  
  UBool haveCopyright=TRUE;
  
@@ -194,20 +105,27 @@ static UDataInfo dataInfo={
      {0, 0, 0, 0}                  /* dataVersion (calculated at runtime) */
  };
  
-void writeConverterData(UConverterSharedData *mySharedData,
-                        const char *cnvName,
-                        const char *cnvDir,
-                        UErrorCode *status)
+static void
+writeConverterData(ConvData *data, const char *cnvName, const char *cnvDir, UErrorCode *status)
  {
      UNewDataMemory *mem = NULL;
      uint32_t sz2;
      uint32_t size = 0;
+    int32_t tableType;
  
      if(U_FAILURE(*status))
        {
          return;
        }
  
+    tableType=TABLE_NONE;
+    if(data->cnvData!=NULL) {
+        tableType|=TABLE_BASE;
+    }
+    if(data->extData!=NULL) {
+        tableType|=TABLE_EXT;
+    }
+
      mem = udata_create(cnvDir, "cnv", cnvName, &dataInfo, haveCopyright ? U_COPYRIGHT_STRING : NULL, status);
  
      if(U_FAILURE(*status))
@@ -221,51 +139,68 @@ void writeConverterData(UConverterSharedData *mySharedData,
  
      if(VERBOSE)
        {
-        fprintf(stderr, "- Opened udata %s.%s\n", cnvName, "cnv");
+        printf("- Opened udata %s.%s\n", cnvName, "cnv");
        }
  
+
      /* all read only, clean, platform independent data.  Mmmm. :)  */
-    udata_writeBlock(mem, mySharedData->staticData, sizeof(UConverterStaticData));
+    udata_writeBlock(mem, &data->staticData, sizeof(UConverterStaticData));
      size += sizeof(UConverterStaticData); /* Is 4-aligned  - by size */
      /* Now, write the table */
-    size += ((NewConverter *)mySharedData->table)->write((NewConverter *)mySharedData->table, mySharedData->staticData, mem);
+    if(tableType&TABLE_BASE) {
+        size += data->cnvData->write(data->cnvData, &data->staticData, mem, tableType);
+    }
+    if(tableType&TABLE_EXT) {
+        size += data->extData->write(data->extData, &data->staticData, mem, tableType);
+    }
  
      sz2 = udata_finish(mem, status);
      if(size != sz2)
      {
-        fprintf(stderr, "error: wrote %d bytes to the .cnv file but counted %d bytes\n", sz2, size);
+        fprintf(stderr, "error: wrote %u bytes to the .cnv file but counted %u bytes\n", (int)sz2, (int)size);
          *status=U_INTERNAL_PROGRAM_ERROR;
      }
      if(VERBOSE)
      {
-      fprintf(stderr, "- Wrote %d bytes to the udata.\n", sz2);
+      printf("- Wrote %u bytes to the udata.\n", (int)sz2);
      }
  }
  
+enum {
+    OPT_HELP_H,
+    OPT_HELP_QUESTION_MARK,
+    OPT_COPYRIGHT,
+    OPT_VERSION,
+    OPT_DESTDIR,
+    OPT_VERBOSE,
+    OPT_SMALL,
+    OPT_COUNT
+};
+
  static UOption options[]={
-    UOPTION_HELP_H,              /* 0  Numbers for those who*/
-    UOPTION_HELP_QUESTION_MARK,  /* 1   can't count. */
-    UOPTION_COPYRIGHT,           /* 2 */
-    UOPTION_VERSION,             /* 3 */
-    UOPTION_DESTDIR,             /* 4 */
-    UOPTION_VERBOSE,             /* 5 */
-    UOPTION_PACKAGE_NAME,        /* 6 */
-       UOPTION_DEF( "touchfile", 't', UOPT_NO_ARG) /* 7 */
+    UOPTION_HELP_H,
+    UOPTION_HELP_QUESTION_MARK,
+    UOPTION_COPYRIGHT,
+    UOPTION_VERSION,
+    UOPTION_DESTDIR,
+    UOPTION_VERBOSE,
+    { "small", NULL, NULL, NULL, '\1', UOPT_NO_ARG, 0 }
  };
  
  int main(int argc, char* argv[])
  {
-    UConverterSharedData* mySharedData = NULL;
-    UErrorCode err = U_ZERO_ERROR;
+    ConvData data;
+    UErrorCode err = U_ZERO_ERROR, localError;
      char outFileName[UCNV_MAX_FULL_FILE_NAME_LENGTH];
-    char touchFileName[UCNV_MAX_FULL_FILE_NAME_LENGTH];
      const char* destdir, *arg;
-    const char *pkgName = NULL;
      size_t destdirlen;
      char* dot = NULL, *outBasename;
      char cnvName[UCNV_MAX_FULL_FILE_NAME_LENGTH];
      char cnvNameWithPkg[UCNV_MAX_FULL_FILE_NAME_LENGTH];
      UVersionInfo icuVersion;
+    UBool printFilename;
+
+    err = U_ZERO_ERROR;
  
      U_MAIN_INIT_ARGS(argc, argv);
  
@@ -274,8 +209,8 @@ int main(int argc, char* argv[])
      uprv_memcpy(&dataInfo.dataVersion, &icuVersion, sizeof(UVersionInfo));
  
      /* preset then read command line options */
-    options[4].value=u_getDataDirectory();
-    argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), options);
+    options[OPT_DESTDIR].value=u_getDataDirectory();
+    argc=u_parseArgs(argc, argv, LENGTHOF(options), options);
  
      /* error handling, printing usage message */
      if(argc<0) {
@@ -285,8 +220,9 @@ int main(int argc, char* argv[])
      } else if(argc<2) {
          argc=-1;
      }
-    if(argc<0 || options[0].doesOccur || options[1].doesOccur) {
-        fprintf(stderr,
+    if(argc<0 || options[OPT_HELP_H].doesOccur || options[OPT_HELP_QUESTION_MARK].doesOccur) {
+        FILE *stdfile=argc<0 ? stderr : stdout;
+        fprintf(stdfile,
              "usage: %s [-options] files...\n"
              "\tread .ucm codepage mapping files and write .cnv files\n"
              "options:\n"
@@ -296,54 +232,26 @@ int main(int argc, char* argv[])
              "\t-d or --destdir     destination directory, followed by the path\n"
              "\t-v or --verbose     Turn on verbose output\n",
              argv[0]);
-        fprintf(stderr,
-            "\t-p or --pkgname     sets the 'package' name for output files.\n"
-            "\t                    If name is ICUDATA, then the default icu package\n"
-            "\t                    name will be used.\n"
-            "\t-t or --touchfile   Generate additional small file without packagename, for nmake\n");
+        fprintf(stdfile,
+            "\t      --small       Generate smaller .cnv files. They will be\n"
+            "\t                    significantly smaller but may not be compatible with\n"
+            "\t                    older versions of ICU and will require heap memory\n"
+            "\t                    allocation when loaded.\n");
          return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR;
      }
  
-    if(options[3].doesOccur) {
-      fprintf(stderr,"makeconv version %hu.%hu, ICU tool to read .ucm codepage mapping files and write .cnv files\n",
-            dataInfo.formatVersion[0], dataInfo.formatVersion[1]);
-      fprintf(stderr, "Copyright (C) 1998-2000, International Business Machines\n");
-      fprintf(stderr,"Corporation and others.  All Rights Reserved.\n");
+    if(options[OPT_VERSION].doesOccur) {
+        printf("makeconv version %hu.%hu, ICU tool to read .ucm codepage mapping files and write .cnv files\n",
+               dataInfo.formatVersion[0], dataInfo.formatVersion[1]);
+        printf("%s\n", U_COPYRIGHT_STRING);
          exit(0);
      }
  
-   TOUCHFILE = options[7].doesOccur;
-
-   if(!options[6].doesOccur)
-    {
-        fprintf(stderr, "%s :  option -p (package name) is required.\n",
-                argv[0]);
-        exit(1);
-    }
-    else
-    {
-        pkgName =options[6].value;
-        if(!strcmp(pkgName, "ICUDATA"))
-        {
-            pkgName = U_ICUDATA_NAME;
-        }
-        if(pkgName[0] == 0)
-        {
-            pkgName = NULL;
-
-            if(TOUCHFILE)
-            {
-                fprintf(stderr, "%s: Don't use touchfile option with an empty packagename.\n",
-                        argv[0]);
-                exit(1);
-            }
-        }
-    }
-
      /* get the options values */
-    haveCopyright = options[2].doesOccur;
-    destdir = options[4].value;
-    VERBOSE = options[5].doesOccur;
+    haveCopyright = options[OPT_COPYRIGHT].doesOccur;
+    destdir = options[OPT_DESTDIR].value;
+    VERBOSE = options[OPT_VERBOSE].doesOccur;
+    SMALL = options[OPT_SMALL].doesOccur;
  
      if (destdir != NULL && *destdir != 0) {
          uprv_strcpy(outFileName, destdir);
@@ -370,140 +278,128 @@ int main(int argc, char* argv[])
      }
  #endif
  
-  for (++argv; --argc; ++argv)
+    err = U_ZERO_ERROR;
+    printFilename = (UBool) (argc > 2 || VERBOSE);
+    for (++argv; --argc; ++argv)
      {
-      err = U_ZERO_ERROR;
-      arg = getLongPathname(*argv);
+        arg = getLongPathname(*argv);
  
-      /*produces the right destination path for display*/
-      if (destdirlen != 0)
+        /* Check for potential buffer overflow */
+        if(strlen(arg) > UCNV_MAX_FULL_FILE_NAME_LENGTH)
          {
-          const char *basename;
+            fprintf(stderr, "%s\n", u_errorName(U_BUFFER_OVERFLOW_ERROR));
+            return U_BUFFER_OVERFLOW_ERROR;
+        }
  
-          /* find the last file sepator */
-          basename = uprv_strrchr(arg, U_FILE_SEP_CHAR);
-          if (basename == NULL) {
-              basename = arg;
-          } else {
-              ++basename;
-          }
+        /*produces the right destination path for display*/
+        if (destdirlen != 0)
+        {
+            const char *basename;
  
-          uprv_strcpy(outBasename, basename);
+            /* find the last file sepator */
+            basename = findBasename(arg);
+            uprv_strcpy(outBasename, basename);
          }
-      else
+        else
          {
-          uprv_strcpy(outFileName, arg);
+            uprv_strcpy(outFileName, arg);
          }
  
-      /*removes the extension if any is found*/
-      dot = uprv_strrchr(outBasename, '.');
-      if (dot)
+        /*removes the extension if any is found*/
+        dot = uprv_strrchr(outBasename, '.');
+        if (dot)
          {
-          *dot = '\0';
+            *dot = '\0';
          }
  
-      /* the basename without extension is the converter name */
-      uprv_strcpy(cnvName, outBasename);
-
-      if(TOUCHFILE)
-      {
-          uprv_strcpy(touchFileName, outBasename);
-          uprv_strcat(touchFileName, ".cnv");
-      }
-
-      if(pkgName != NULL)
-      {
-          /* changes both baename and filename */
-          uprv_strcpy(outBasename, pkgName);
-          uprv_strcat(outBasename, "_");
-          uprv_strcat(outBasename, cnvName);
-      }
-
+        /* the basename without extension is the converter name */
+        uprv_strcpy(cnvName, outBasename);
  
-      /*Adds the target extension*/
-      uprv_strcat(outBasename, CONVERTER_FILE_EXTENSION);
+        /*Adds the target extension*/
+        uprv_strcat(outBasename, CONVERTER_FILE_EXTENSION);
  
  #if DEBUG
          printf("makeconv: processing %s  ...\n", arg);
          fflush(stdout);
  #endif
-      mySharedData = createConverterFromTableFile(arg, &err);
+        localError = U_ZERO_ERROR;
+        initConvData(&data);
+        createConverter(&data, arg, &localError);
  
-      if (U_FAILURE(err) || (mySharedData == NULL))
+        if (U_FAILURE(localError))
          {
-          /* if an error is found, print out an error msg and keep going */
-          fprintf(stderr, "Error creating converter for \"%s\" file for \"%s\" (error code %d - %s)\n", outFileName, arg, err,
-                        u_errorName(err));
-          err = U_ZERO_ERROR;
+            /* if an error is found, print out an error msg and keep going */
+            fprintf(stderr, "Error creating converter for \"%s\" file for \"%s\" (%s)\n", outFileName, arg,
+                u_errorName(localError));
+            if(U_SUCCESS(err)) {
+                err = localError;
+            }
          }
-      else
+        else
          {
-          /* Make the static data name equal to the file name */
-          if( /*VERBOSE &&  */ uprv_stricmp(cnvName,mySharedData->staticData->name))
-          {
-            fprintf(stderr, "Warning: %s%s claims to be '%s'\n",
-                    cnvName,
-                    CONVERTER_FILE_EXTENSION,
-                    mySharedData->staticData->name);
-          }
-
-          uprv_strcpy((char*)mySharedData->staticData->name, cnvName);
-
-          if(pkgName == NULL)
-          {
-              uprv_strcpy(cnvNameWithPkg, cnvName);
-          }
-          else
-          {
-              uprv_strcpy(cnvNameWithPkg, pkgName);
-              uprv_strcat(cnvNameWithPkg, "_");
-              uprv_strcat(cnvNameWithPkg, cnvName);
-          }
-
-          writeConverterData(mySharedData, cnvNameWithPkg, destdir, &err);
-          ((NewConverter *)mySharedData->table)->close((NewConverter *)mySharedData->table);
-          if(TOUCHFILE)
-          {
-              FileStream *q;
-              char msg[1024];
-
-              sprintf(msg, "This empty file tells nmake that %s in package %s has been updated.\n",
-                  cnvName, pkgName);
-
-              q = T_FileStream_open(touchFileName, "w");
-              if(q == NULL)
-              {
-                  fprintf(stderr, "Error writing touchfile \"%s\"\n", touchFileName);
-                  err = U_FILE_ACCESS_ERROR;
-              }
-
-              else
-              {
-                  T_FileStream_write(q, msg, uprv_strlen(msg));
-                  T_FileStream_close(q);
-              }
-          }
-
-    /* write the information data */
-          uprv_free((UConverterStaticData *)mySharedData->staticData);
-          uprv_free(mySharedData);
-
-          if(U_FAILURE(err))
-          {
-                  /* if an error is found, print out an error msg and keep going*/
-            fprintf(stderr, "Error writing \"%s\" file for \"%s\" (error code %d - %s)\n", outFileName, arg, err,
-                    u_errorName(err));
-          }
-          else
-          {
-              puts(outFileName);
-          }
+            /* Insure the static data name matches the  file name */
+            /* Changed to ignore directory and only compare base name
+             LDH 1/2/08*/
+            char *p;
+            p = strrchr(cnvName, U_FILE_SEP_CHAR); /* Find last file separator */
+
+            if(p == NULL)            /* OK, try alternate */
+            {
+                p = strrchr(cnvName, U_FILE_ALT_SEP_CHAR);
+                if(p == NULL)
+                {
+                    p=cnvName; /* If no separators, no problem */
+                }
+            }
+            else
+            {
+                p++;   /* If found separtor, don't include it in compare */
+            }
+            if(uprv_stricmp(p,data.staticData.name))
+            {
+                fprintf(stderr, "Warning: %s%s claims to be '%s'\n",
+                    cnvName,  CONVERTER_FILE_EXTENSION,
+                    data.staticData.name);
+            }
+
+            uprv_strcpy((char*)data.staticData.name, cnvName);
+
+            if(!uprv_isInvariantString((char*)data.staticData.name, -1)) {
+                fprintf(stderr,
+                    "Error: A converter name must contain only invariant characters.\n"
+                    "%s is not a valid converter name.\n",
+                    data.staticData.name);
+                if(U_SUCCESS(err)) {
+                    err = U_INVALID_TABLE_FORMAT;
+                }
+            }
+
+            uprv_strcpy(cnvNameWithPkg, cnvName);
+
+            localError = U_ZERO_ERROR;
+            writeConverterData(&data, cnvNameWithPkg, destdir, &localError);
+
+            if(U_FAILURE(localError))
+            {
+                /* if an error is found, print out an error msg and keep going*/
+                fprintf(stderr, "Error writing \"%s\" file for \"%s\" (%s)\n", outFileName, arg,
+                    u_errorName(localError));
+                if(U_SUCCESS(err)) {
+                    err = localError;
+                }
+            }
+            else if (printFilename)
+            {
+                puts(outBasename);
+            }
          }
-      fflush(stdout);
-      fflush(stderr);
+        fflush(stdout);
+        fflush(stderr);
+
+        cleanupConvData(&data);
      }
  
-  return err;
+    return err;
  }
  
  static void
@@ -524,517 +420,424 @@ getPlatformAndCCSIDFromName(const char *name, int8_t *pPlatform, int32_t *pCCSID
      }
  }
  
-/*Reads the header of the table file and fills in basic knowledge about the converter in "converter"*/
-void readHeaderFromFile(UConverterSharedData* mySharedData,
-                        FileStream* convFile,
-                        const char* converterName,
-                        UErrorCode *pErrorCode)
-{
+static void
+readHeader(ConvData *data,
+           FileStream* convFile,
+           const char* converterName,
+           UErrorCode *pErrorCode) {
      char line[200];
-    char *s, *end, *key, *value;
+    char *s, *key, *value;
+    const UConverterStaticData *prototype;
      UConverterStaticData *staticData;
-    char c;
  
      if(U_FAILURE(*pErrorCode)) {
          return;
      }
  
-    staticData=(UConverterStaticData *)mySharedData->staticData;
-    staticData->conversionType=UCNV_UNSUPPORTED_CONVERTER;
+    staticData=&data->staticData;
      staticData->platform=UCNV_IBM;
      staticData->subCharLen=0;
  
      while(T_FileStream_readLine(convFile, line, sizeof(line))) {
-        /* remove comments and trailing CR and LF and remove whitespace from the end */
-        for(end=line; (c=*end)!=0; ++end) {
-            if(c=='#' || c=='\r' || c=='\n') {
-                break;
-            }
-        }
-        while(end>line && (*(end-1)==' ' || *(end-1)=='\t')) {
-            --end;
-        }
-        *end=0;
-
-        /* skip leading white space and ignore empty lines */
-        s=(char *)skipWhitespace(line);
-        if(*s==0) {
+        /* basic parsing and handling of state-related items */
+        if(ucm_parseHeaderLine(data->ucm, line, &key, &value)) {
              continue;
          }
  
          /* stop at the beginning of the mapping section */
-        if(uprv_memcmp(s, "CHARMAP", 7)==0) {
+        if(uprv_strcmp(line, "CHARMAP")==0) {
              break;
          }
  
-        /* get the key name, bracketed in <> */
-        if(*s!='<') {
-            fprintf(stderr, "error: no header field <key> in line \"%s\"\n", line);
-            *pErrorCode=U_INVALID_TABLE_FORMAT;
-            return;
-        }
-        key=++s;
-        while(*s!='>') {
-            if(*s==0) {
-                fprintf(stderr, "error: incomplete header field <key> in line \"%s\"\n", line);
-                *pErrorCode=U_INVALID_TABLE_FORMAT;
-                return;
-            }
-            ++s;
-        }
-        *s=0;
-
-        /* get the value string, possibly quoted */
-        s=(char *)skipWhitespace(s+1);
-        if(*s!='"') {
-            value=s;
-        } else {
-            /* remove the quotes */
-            value=s+1;
-            if(end>value && *(end-1)=='"') {
-                *--end=0;
-            }
-        }
-
          /* collect the information from the header field, ignore unknown keys */
          if(uprv_strcmp(key, "code_set_name")==0) {
              if(*value!=0) {
-                uprv_strcpy((char*)staticData->name, value);
+                uprv_strcpy((char *)staticData->name, value);
                  getPlatformAndCCSIDFromName(value, &staticData->platform, &staticData->codepage);
              }
-        } else if(uprv_strcmp(key, "uconv_class")==0) {
-            const UConverterStaticData *prototype;
-
-            if(uprv_strcmp(value, "DBCS")==0) {
-                staticData->conversionType=UCNV_DBCS;
-            } else if(uprv_strcmp(value, "SBCS")==0) {
-                staticData->conversionType = UCNV_SBCS;
-            } else if(uprv_strcmp(value, "MBCS")==0) {
-                staticData->conversionType = UCNV_MBCS;
-            } else if(uprv_strcmp(value, "EBCDIC_STATEFUL")==0) {
-                staticData->conversionType = UCNV_EBCDIC_STATEFUL;
-            } else {
-                fprintf(stderr, "error: unknown <uconv_class> %s\n", value);
-                *pErrorCode=U_INVALID_TABLE_FORMAT;
-                return;
-            }
-
-            /* Now that we know the type, copy any 'default' values from the table. */
-            prototype=ucnv_converterStaticData[staticData->conversionType];
-            if(prototype!=NULL) {
-                if(staticData->name[0]==0) {
-                    uprv_strcpy((char*)staticData->name, prototype->name);
-                }
-
-                if(staticData->codepage==0) {
-                    staticData->codepage = prototype->codepage;
-                }
-
-                if(staticData->platform==0) {
-                    staticData->platform = prototype->platform;
-                }
-
-                if(staticData->minBytesPerChar==0) {
-                    staticData->minBytesPerChar = prototype->minBytesPerChar;
-                }
-
-                if(staticData->maxBytesPerChar==0) {
-                    staticData->maxBytesPerChar = prototype->maxBytesPerChar;
-                }
-
-                if(staticData->subCharLen==0) {
-                    staticData->subCharLen=prototype->subCharLen;
-                    if(prototype->subCharLen>0) {
-                        uprv_memcpy(staticData->subChar, prototype->subChar, prototype->subCharLen);
-                    }
-                }
-            }
-        } else if(uprv_strcmp(key, "mb_cur_max")==0) {
-            if('1'<=*value && *value<='4' && value[1]==0) {
-                staticData->maxBytesPerChar=(int8_t)(*value-'0');
-            } else {
-                fprintf(stderr, "error: illegal <mb_cur_max> %s\n", value);
-                *pErrorCode=U_INVALID_TABLE_FORMAT;
-                return;
-            }
-        } else if(uprv_strcmp(key, "mb_cur_min")==0) {
-            if('1'<=*value && *value<='4' && value[1]==0) {
-                staticData->minBytesPerChar=(int8_t)(*value-'0');
-            } else {
-                fprintf(stderr, "error: illegal <mb_cur_min> %s\n", value);
-                *pErrorCode=U_INVALID_TABLE_FORMAT;
-                return;
-            }
          } else if(uprv_strcmp(key, "subchar")==0) {
-            uint32_t bytes;
-            int32_t length;
-
-            length=parseCodepageBytes(value, &bytes, (const char **)&end);
-            if(length>0 && *end==0) {
-                staticData->subCharLen=(int8_t)length;
-                do {
-                    staticData->subChar[--length]=(uint8_t)bytes;
-                    bytes>>=8;
-                } while(length>0);
+            uint8_t bytes[UCNV_EXT_MAX_BYTES];
+            int8_t length;
+
+            s=value;
+            length=ucm_parseBytes(bytes, line, (const char **)&s);
+            if(1<=length && length<=4 && *s==0) {
+                staticData->subCharLen=length;
+                uprv_memcpy(staticData->subChar, bytes, length);
              } else {
                  fprintf(stderr, "error: illegal <subchar> %s\n", value);
                  *pErrorCode=U_INVALID_TABLE_FORMAT;
                  return;
              }
          } else if(uprv_strcmp(key, "subchar1")==0) {
-            uint32_t bytes;
+            uint8_t bytes[UCNV_EXT_MAX_BYTES];
  
-            if(1==parseCodepageBytes(value, &bytes, (const char **)&end) && *end==0) {
-                staticData->subChar1=(uint8_t)bytes;
+            s=value;
+            if(1==ucm_parseBytes(bytes, line, (const char **)&s) && *s==0) {
+                staticData->subChar1=bytes[0];
              } else {
                  fprintf(stderr, "error: illegal <subchar1> %s\n", value);
                  *pErrorCode=U_INVALID_TABLE_FORMAT;
                  return;
              }
-        } else if(uprv_strcmp(key, "icu:state")==0) {
-            /* if an SBCS/DBCS/EBCDIC_STATEFUL converter has icu:state, then turn it into MBCS */
-            switch(staticData->conversionType) {
-            case UCNV_SBCS:
-            case UCNV_DBCS:
-            case UCNV_EBCDIC_STATEFUL:
-                staticData->conversionType = UCNV_MBCS;
-                break;
-            case UCNV_MBCS:
-                break;
-            default:
-                fprintf(stderr, "error: <icu:state> entry for non-MBCS table or before the <uconv_class> line\n");
-                *pErrorCode=U_INVALID_TABLE_FORMAT;
-                return;
+        }
+    }
+
+    /* copy values from the UCMFile to the static data */
+    staticData->maxBytesPerChar=(int8_t)data->ucm->states.maxCharLength;
+    staticData->minBytesPerChar=(int8_t)data->ucm->states.minCharLength;
+    staticData->conversionType=data->ucm->states.conversionType;
+
+    if(staticData->conversionType==UCNV_UNSUPPORTED_CONVERTER) {
+        fprintf(stderr, "ucm error: missing conversion type (<uconv_class>)\n");
+        *pErrorCode=U_INVALID_TABLE_FORMAT;
+        return;
+    }
+
+    /*
+     * Now that we know the type, copy any 'default' values from the table.
+     * We need not check the type any further because the parser only
+     * recognizes what we have prototypes for.
+     *
+     * For delta (extension-only) tables, copy values from the base file
+     * instead, see createConverter().
+     */
+    if(data->ucm->baseName[0]==0) {
+        prototype=ucnv_converterStaticData[staticData->conversionType];
+        if(prototype!=NULL) {
+            if(staticData->name[0]==0) {
+                uprv_strcpy((char *)staticData->name, prototype->name);
+            }
+
+            if(staticData->codepage==0) {
+                staticData->codepage=prototype->codepage;
+            }
+
+            if(staticData->platform==0) {
+                staticData->platform=prototype->platform;
+            }
+
+            if(staticData->minBytesPerChar==0) {
+                staticData->minBytesPerChar=prototype->minBytesPerChar;
              }
  
              if(staticData->maxBytesPerChar==0) {
-                fprintf(stderr, "error: <icu:state> before the <mb_cur_max> line\n");
-                *pErrorCode=U_INVALID_TABLE_FORMAT;
-                return;
+                staticData->maxBytesPerChar=prototype->maxBytesPerChar;
              }
-            if(mySharedData->table==NULL) {
-                mySharedData->table=(UConverterTable *)MBCSOpen(staticData->maxBytesPerChar);
-                if(mySharedData->table==NULL) {
-                    *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
-                    return;
+
+            if(staticData->subCharLen==0) {
+                staticData->subCharLen=prototype->subCharLen;
+                if(prototype->subCharLen>0) {
+                    uprv_memcpy(staticData->subChar, prototype->subChar, prototype->subCharLen);
                  }
              }
-            if(!MBCSAddState((NewConverter *)mySharedData->table, value)) {
-                *pErrorCode=U_INVALID_TABLE_FORMAT;
-                return;
-            }
          }
      }
  
-    if(staticData->conversionType==UCNV_UNSUPPORTED_CONVERTER) {
-        *pErrorCode=U_INVALID_TABLE_FORMAT;
-    } else if(staticData->conversionType==UCNV_MBCS && mySharedData->table==NULL) {
-        fprintf(stderr, "error: missing state table information (<icu:state>) for MBCS\n");
-        *pErrorCode=U_INVALID_TABLE_FORMAT;
-    } else if(staticData->subChar1!=0 &&
-              !staticData->conversionType==UCNV_MBCS &&
-              !staticData->conversionType==UCNV_EBCDIC_STATEFUL
+    if(data->ucm->states.outputType<0) {
+        data->ucm->states.outputType=(int8_t)data->ucm->states.maxCharLength-1;
+    }
+
+    if( staticData->subChar1!=0 &&
+            (staticData->minBytesPerChar>1 ||
+                (staticData->conversionType!=UCNV_MBCS &&
+                 staticData->conversionType!=UCNV_EBCDIC_STATEFUL))
      ) {
          fprintf(stderr, "error: <subchar1> defined for a type other than MBCS or EBCDIC_STATEFUL\n");
          *pErrorCode=U_INVALID_TABLE_FORMAT;
      }
  }
  
-void loadTableFromFile(FileStream* convFile, UConverterSharedData* sharedData, UErrorCode* err)
-{
-    char storageLine[200];
-    char* line = NULL;
-    UConverterStaticData *staticData=(UConverterStaticData *)sharedData->staticData;
-    NewConverter *cnvData = (NewConverter *)sharedData->table;
-    UChar32 unicodeValue, codepageValue;
-    uint8_t mbcsBytes[8];
-    int32_t mbcsLength;
-    char codepointBytes[20];
-    UBool isOK = TRUE;
-    uint8_t precisionMask = 0, unicodeMask = 0;
-    char endOfLine;
-
-    if(cnvData->startMappings!=NULL)
-    {
-        if(!cnvData->startMappings(cnvData)) {
-            *err = U_INVALID_TABLE_FORMAT;
-            return;
-        }
-    }
-
-    if(cnvData->isValid!=NULL)
-    {
-        const uint8_t *p = staticData->subChar;
-        codepageValue = 0;
-        switch(staticData->subCharLen) {
-        case 4:     codepageValue = (codepageValue << 8) | *p++;
-        case 3:     codepageValue = (codepageValue << 8) | *p++;
-        case 2:     codepageValue = (codepageValue << 8) | *p++;
-        case 1:     codepageValue = (codepageValue << 8) | *p;
-        default:    break; /* must never occur */
-        }
-        if(!cnvData->isValid(cnvData, staticData->subChar, staticData->subCharLen, codepageValue)) {
-            fprintf(stderr, "       the substitution character byte sequence is illegal in this codepage structure!\n");
-            *err = U_INVALID_TABLE_FORMAT;
-            isOK = FALSE;
-        }
-    }
+/* return TRUE if a base table was read, FALSE for an extension table */
+static UBool
+readFile(ConvData *data, const char* converterName,
+         UErrorCode *pErrorCode) {
+    char line[200];
+    char *end;
+    FileStream *convFile;
  
-    staticData->hasFromUnicodeFallback = staticData->hasToUnicodeFallback = FALSE;
+    UCMStates *baseStates;
+    UBool dataIsBase;
  
-    while (T_FileStream_readLine(convFile, storageLine, sizeof(storageLine)))
-    {
-        removeComments(storageLine);
-        line = storageLine;
-        if (line[nextTokenOffset(line, NLTC_SEPARATORS)] != '\0')
-        {
-            /* get the Unicode code point */
-            line = getToken(codepointBytes, line, UNICODE_CODEPOINT_SEPARATORS);
-            if (uprv_strcmp(codepointBytes, "END") == 0)
-            {
-                break;
-            }
-            unicodeValue = (UChar32)T_CString_stringToInteger(codepointBytes, 16);
+    if(U_FAILURE(*pErrorCode)) {
+        return FALSE;
+    }
  
-            /* get the codepage bytes */
-            codepageValue = 0;
-            mbcsLength = 0;
-            do
-            {
-                line = getToken(codepointBytes, line, CODEPOINT_SEPARATORS);
-                mbcsBytes[mbcsLength] = (uint8_t)T_CString_stringToInteger(codepointBytes, 16);
-                codepageValue = codepageValue << 8 | mbcsBytes[mbcsLength++];
-
-                /* End of line could be \0 or | (if fallback) */
-                endOfLine= line[nextTokenOffset(line, CODEPOINT_SEPARATORS)];
-            } while((endOfLine != '\0') && (endOfLine != FALLBACK_SEPARATOR));
-
-            if(unicodeValue>=0x10000) {
-                unicodeMask|=UCNV_HAS_SUPPLEMENTARY;    /* there are supplementary code points */
-            } else if(UTF_IS_SURROGATE(unicodeValue)) {
-                unicodeMask|=UCNV_HAS_SURROGATES;       /* there are single surrogates */
-            }
+    data->ucm=ucm_open();
  
-            if((uint32_t)unicodeValue > 0x10ffff)
-            {
-                fprintf(stderr, "error: Unicode code point > U+10ffff in '%s'\n", storageLine);
-                isOK = FALSE;
-            }
-            else if(endOfLine == FALLBACK_SEPARATOR)
-            {
-                /* we know that there is a fallback separator */
-                precisionMask |= 1;
-                line = uprv_strchr(line, FALLBACK_SEPARATOR) + 1;
-                switch(*line)
-                {
-                case '0':
-                    /* set roundtrip mappings */
-                    isOK &= cnvData->addToUnicode(cnvData, mbcsBytes, mbcsLength, unicodeValue, codepageValue, 0) &&
-                            cnvData->addFromUnicode(cnvData, mbcsBytes, mbcsLength, unicodeValue, codepageValue, 0);
-                    break;
-                case '1':
-                    /* set only a fallback mapping from Unicode to codepage */
-                    staticData->hasFromUnicodeFallback = TRUE;
-                    isOK &= cnvData->addFromUnicode(cnvData, mbcsBytes, mbcsLength, unicodeValue, codepageValue, 1);
-                    break;
-                case '2':
-                    /* skip subchar mappings */
-                    break;
-                case '3':
-                    /* set only a fallback mapping from codepage to Unicode */
-                    staticData->hasToUnicodeFallback = TRUE;
-                    isOK &= cnvData->addToUnicode(cnvData, mbcsBytes, mbcsLength, unicodeValue, codepageValue, 1);
-                    break;
-                default:
-                    fprintf(stderr, "error: illegal fallback indicator '%s' in '%s'\n", line - 1, storageLine);
-                    *err = U_INVALID_TABLE_FORMAT;
-                    break;
-                }
-            }
-            else
-            {
-                precisionMask |= 2;
-                /* set the mappings */
-                isOK &= cnvData->addToUnicode(cnvData, mbcsBytes, mbcsLength, unicodeValue, codepageValue, -1) &&
-                        cnvData->addFromUnicode(cnvData, mbcsBytes, mbcsLength, unicodeValue, codepageValue, -1);
-            }
-        }
+    convFile=T_FileStream_open(converterName, "r");
+    if(convFile==NULL) {
+        *pErrorCode=U_FILE_ACCESS_ERROR;
+        return FALSE;
      }
  
-    if(unicodeMask == 3)
-    {
-        fprintf(stderr, "warning: contains mappings to both supplementary code points and single surrogates\n");
+    readHeader(data, convFile, converterName, pErrorCode);
+    if(U_FAILURE(*pErrorCode)) {
+        return FALSE;
      }
-    staticData->unicodeMask = unicodeMask;
  
-    if(cnvData->finishMappings!=NULL)
-    {
-        cnvData->finishMappings(cnvData, staticData);
+    if(data->ucm->baseName[0]==0) {
+        dataIsBase=TRUE;
+        baseStates=&data->ucm->states;
+        ucm_processStates(baseStates);
+    } else {
+        dataIsBase=FALSE;
+        baseStates=NULL;
      }
  
-    if(!isOK)
-    {
-        *err = U_INVALID_TABLE_FORMAT;
-    }
-    else if(precisionMask == 3)
-    {
-        fprintf(stderr, "error: some entries have the mapping precision (with '|'), some do not\n");
-        *err = U_INVALID_TABLE_FORMAT;
+    /* read the base table */
+    ucm_readTable(data->ucm, convFile, dataIsBase, baseStates, pErrorCode);
+    if(U_FAILURE(*pErrorCode)) {
+        return FALSE;
      }
-}
  
-/*creates a UConverterStaticData, fills in necessary links to it the appropriate function pointers*/
-UConverterSharedData* createConverterFromTableFile(const char* converterName, UErrorCode* err)
-{
-    FileStream* convFile = NULL;
-    UConverterSharedData* mySharedData = NULL;
-    UConverterStaticData* myStaticData = NULL;
+    /* read an extension table if there is one */
+    while(T_FileStream_readLine(convFile, line, sizeof(line))) {
+        end=uprv_strchr(line, 0);
+        while(line<end &&
+              (*(end-1)=='\n' || *(end-1)=='\r' || *(end-1)==' ' || *(end-1)=='\t')) {
+            --end;
+        }
+        *end=0;
  
-    if (U_FAILURE(*err)) return NULL;
+        if(line[0]=='#' || u_skipWhitespace(line)==end) {
+            continue; /* ignore empty and comment lines */
+        }
  
-    convFile = T_FileStream_open(converterName, "r");
-    if (convFile == NULL)
-    {
-        *err = U_FILE_ACCESS_ERROR;
-        return NULL;
+        if(0==uprv_strcmp(line, "CHARMAP")) {
+            /* read the extension table */
+            ucm_readTable(data->ucm, convFile, FALSE, baseStates, pErrorCode);
+        } else {
+            fprintf(stderr, "unexpected text after the base mapping table\n");
+        }
+        break;
      }
  
+    T_FileStream_close(convFile);
  
-    mySharedData = (UConverterSharedData*) uprv_malloc(sizeof(UConverterSharedData));
-    if (mySharedData == NULL)
-    {
-        *err = U_MEMORY_ALLOCATION_ERROR;
-        T_FileStream_close(convFile);
-        return NULL;
+    if(data->ucm->base->flagsType==UCM_FLAGS_MIXED || data->ucm->ext->flagsType==UCM_FLAGS_MIXED) {
+        fprintf(stderr, "error: some entries have the mapping precision (with '|'), some do not\n");
+        *pErrorCode=U_INVALID_TABLE_FORMAT;
      }
  
-    uprv_memset(mySharedData, 0, sizeof(UConverterSharedData));
+    return dataIsBase;
+}
+
+static void
+createConverter(ConvData *data, const char *converterName, UErrorCode *pErrorCode) {
+    ConvData baseData;
+    UBool dataIsBase;
  
-    mySharedData->structSize = sizeof(UConverterSharedData);
+    UConverterStaticData *staticData;
+    UCMStates *states, *baseStates;
  
-    myStaticData =  (UConverterStaticData*) uprv_malloc(sizeof(UConverterStaticData));
-    if (myStaticData == NULL)
-    {
-        *err = U_MEMORY_ALLOCATION_ERROR;
-        T_FileStream_close(convFile);
-        return NULL;
+    if(U_FAILURE(*pErrorCode)) {
+        return;
      }
-    uprv_memset(myStaticData, 0, sizeof(UConverterStaticData));
-    mySharedData->staticData = myStaticData;
-    myStaticData->structSize = sizeof(UConverterStaticData);
-    /*  mySharedData->staticDataOwned = FALSE; */ /* not owned if in udata */
-    mySharedData->sharedDataCached = FALSE;
  
-    mySharedData->dataMemory = NULL; /* for init */
+    initConvData(data);
  
-    readHeaderFromFile(mySharedData, convFile, converterName, err);
+    dataIsBase=readFile(data, converterName, pErrorCode);
+    if(U_FAILURE(*pErrorCode)) {
+        return;
+    }
+
+    staticData=&data->staticData;
+    states=&data->ucm->states;
+
+    if(dataIsBase) {
+        /*
+         * Build a normal .cnv file with a base table
+         * and an optional extension table.
+         */
+        data->cnvData=MBCSOpen(data->ucm);
+        if(data->cnvData==NULL) {
+            *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
+
+        } else if(!data->cnvData->isValid(data->cnvData,
+                            staticData->subChar, staticData->subCharLen)
+        ) {
+            fprintf(stderr, "       the substitution character byte sequence is illegal in this codepage structure!\n");
+            *pErrorCode=U_INVALID_TABLE_FORMAT;
  
-    if (U_FAILURE(*err)) return NULL;
+        } else if(staticData->subChar1!=0 &&
+                    !data->cnvData->isValid(data->cnvData, &staticData->subChar1, 1)
+        ) {
+            fprintf(stderr, "       the subchar1 byte is illegal in this codepage structure!\n");
+            *pErrorCode=U_INVALID_TABLE_FORMAT;
  
-    switch (myStaticData->conversionType)
-    {
-    case UCNV_SBCS:
-      {
-        /* SBCS: use MBCS data structure with a default state table */
-        if(mySharedData->staticData->maxBytesPerChar!=1) {
-            fprintf(stderr, "error: SBCS codepage with max bytes/char!=1\n");
-            *err = U_INVALID_TABLE_FORMAT;
-            break;
+        } else if(
+            data->ucm->ext->mappingsLength>0 &&
+            !ucm_checkBaseExt(states, data->ucm->base, data->ucm->ext, data->ucm->ext, FALSE)
+        ) {
+            *pErrorCode=U_INVALID_TABLE_FORMAT;
+        } else if(data->ucm->base->flagsType&UCM_FLAGS_EXPLICIT) {
+            /* sort the table so that it can be turned into UTF-8-friendly data */
+            ucm_sortTable(data->ucm->base);
          }
-        myStaticData->conversionType = UCNV_MBCS;
-        if(mySharedData->table == NULL) {
-            NewConverter *sharedDataTable = MBCSOpen(1);
-            if(sharedDataTable != NULL) {
-                if(!MBCSAddState(sharedDataTable, "0-ff")) {
-                    *err = U_INVALID_TABLE_FORMAT;
-                    sharedDataTable->close(sharedDataTable);
-                } else {
-                    mySharedData->table = (UConverterTable *)sharedDataTable;
-                }
+
+        if(U_SUCCESS(*pErrorCode)) {
+            if(
+                /* add the base table after ucm_checkBaseExt()! */
+                !data->cnvData->addTable(data->cnvData, data->ucm->base, &data->staticData)
+            ) {
+                *pErrorCode=U_INVALID_TABLE_FORMAT;
              } else {
-                *err = U_MEMORY_ALLOCATION_ERROR;
+                /*
+                 * addTable() may have requested moving more mappings to the extension table
+                 * if they fit into the base toUnicode table but not into the
+                 * base fromUnicode table.
+                 * (Especially for UTF-8-friendly fromUnicode tables.)
+                 * Such mappings will have the MBCS_FROM_U_EXT_FLAG set, which causes them
+                 * to be excluded from the extension toUnicode data.
+                 * See MBCSOkForBaseFromUnicode() for which mappings do not fit into
+                 * the base fromUnicode table.
+                 */
+                ucm_moveMappings(data->ucm->base, data->ucm->ext);
+                ucm_sortTable(data->ucm->ext);
+                if(data->ucm->ext->mappingsLength>0) {
+                    /* prepare the extension table, if there is one */
+                    data->extData=CnvExtOpen(data->ucm);
+                    if(data->extData==NULL) {
+                        *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
+                    } else if(
+                        !data->extData->addTable(data->extData, data->ucm->ext, &data->staticData)
+                    ) {
+                        *pErrorCode=U_INVALID_TABLE_FORMAT;
+                    }
+                }
              }
          }
-        break;
-      }
-    case UCNV_MBCS:
-      {
-        /* MBCSOpen() was called by readHeaderFromFile() */
-        break;
-      }
-    case UCNV_EBCDIC_STATEFUL:
-      {
-        /* EBCDIC_STATEFUL: use MBCS data structure with a default state table */
-        if(mySharedData->staticData->maxBytesPerChar!=2) {
-            fprintf(stderr, "error: DBCS codepage with max bytes/char!=2\n");
-            *err = U_INVALID_TABLE_FORMAT;
-            break;
-        }
-        myStaticData->conversionType = UCNV_MBCS;
-        if(mySharedData->table == NULL) {
-            NewConverter *sharedDataTable = MBCSOpen(2);
-            if(sharedDataTable != NULL) {
-                if( !MBCSAddState(sharedDataTable, "0-ff, e:1.s, f:0.s") ||
-                    !MBCSAddState(sharedDataTable, "initial, 0-3f:4, e:1.s, f:0.s, 40:3, 41-fe:2, ff:4") ||
-                    !MBCSAddState(sharedDataTable, "0-40:1.i, 41-fe:1., ff:1.i") ||
-                    !MBCSAddState(sharedDataTable, "0-ff:1.i, 40:1.") ||
-                    !MBCSAddState(sharedDataTable, "0-ff:1.i")
+    } else {
+        /* Build an extension-only .cnv file. */
+        char baseFilename[500];
+        char *basename;
+
+        initConvData(&baseData);
+
+        /* assemble a path/filename for data->ucm->baseName */
+        uprv_strcpy(baseFilename, converterName);
+        basename=(char *)findBasename(baseFilename);
+        uprv_strcpy(basename, data->ucm->baseName);
+        uprv_strcat(basename, ".ucm");
+
+        /* read the base table */
+        dataIsBase=readFile(&baseData, baseFilename, pErrorCode);
+        if(U_FAILURE(*pErrorCode)) {
+            return;
+        } else if(!dataIsBase) {
+            fprintf(stderr, "error: the <icu:base> file \"%s\" is not a base table file\n", baseFilename);
+            *pErrorCode=U_INVALID_TABLE_FORMAT;
+        } else {
+            /* prepare the extension table */
+            data->extData=CnvExtOpen(data->ucm);
+            if(data->extData==NULL) {
+                *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
+            } else {
+                /* fill in gaps in extension file header fields */
+                UCMapping *m, *mLimit;
+                uint8_t fallbackFlags;
+
+                baseStates=&baseData.ucm->states;
+                if(states->conversionType==UCNV_DBCS) {
+                    staticData->minBytesPerChar=(int8_t)(states->minCharLength=2);
+                } else if(states->minCharLength==0) {
+                    staticData->minBytesPerChar=(int8_t)(states->minCharLength=baseStates->minCharLength);
+                }
+                if(states->maxCharLength<states->minCharLength) {
+                    staticData->maxBytesPerChar=(int8_t)(states->maxCharLength=baseStates->maxCharLength);
+                }
+
+                if(staticData->subCharLen==0) {
+                    uprv_memcpy(staticData->subChar, baseData.staticData.subChar, 4);
+                    staticData->subCharLen=baseData.staticData.subCharLen;
+                }
+                /*
+                 * do not copy subChar1 -
+                 * only use what is explicitly specified
+                 * because it cannot be unset in the extension file header
+                 */
+
+                /* get the fallback flags */
+                fallbackFlags=0;
+                for(m=baseData.ucm->base->mappings, mLimit=m+baseData.ucm->base->mappingsLength;
+                    m<mLimit && fallbackFlags!=3;
+                    ++m
                  ) {
-                    *err = U_INVALID_TABLE_FORMAT;
-                    sharedDataTable->close(sharedDataTable);
-                } else {
-                    mySharedData->table = (UConverterTable *)sharedDataTable;
+                    if(m->f==1) {
+                        fallbackFlags|=1;
+                    } else if(m->f==3) {
+                        fallbackFlags|=2;
+                    }
                  }
-            } else {
-                *err = U_MEMORY_ALLOCATION_ERROR;
-            }
-        }
-        break;
-      }
-    case UCNV_DBCS:
-      {
-        /* DBCS: use MBCS data structure with a default state table */
-        if(mySharedData->staticData->maxBytesPerChar!=2) {
-            fprintf(stderr, "error: DBCS codepage with max bytes/char!=2\n");
-            *err = U_INVALID_TABLE_FORMAT;
-            break;
-        }
-        myStaticData->conversionType = UCNV_MBCS;
-        if(mySharedData->table == NULL) {
-            NewConverter *sharedDataTable = MBCSOpen(2);
-            if(sharedDataTable != NULL) {
-                if( !MBCSAddState(sharedDataTable, "0-3f:3, 40:2, 41-fe:1, ff:3") ||
-                    !MBCSAddState(sharedDataTable, "41-fe") ||
-                    !MBCSAddState(sharedDataTable, "40") ||
-                    !MBCSAddState(sharedDataTable, "")
+
+                if(fallbackFlags&1) {
+                    staticData->hasFromUnicodeFallback=TRUE;
+                }
+                if(fallbackFlags&2) {
+                    staticData->hasToUnicodeFallback=TRUE;
+                }
+
+                if(1!=ucm_countChars(baseStates, staticData->subChar, staticData->subCharLen)) {
+                    fprintf(stderr, "       the substitution character byte sequence is illegal in this codepage structure!\n");
+                    *pErrorCode=U_INVALID_TABLE_FORMAT;
+
+                } else if(1!=ucm_countChars(baseStates, &staticData->subChar1, 1)) {
+                    fprintf(stderr, "       the subchar1 byte is illegal in this codepage structure!\n");
+                    *pErrorCode=U_INVALID_TABLE_FORMAT;
+
+                } else if(
+                    !ucm_checkValidity(data->ucm->ext, baseStates) ||
+                    !ucm_checkBaseExt(baseStates, baseData.ucm->base, data->ucm->ext, data->ucm->ext, FALSE)
                  ) {
-                    *err = U_INVALID_TABLE_FORMAT;
-                    sharedDataTable->close(sharedDataTable);
+                    *pErrorCode=U_INVALID_TABLE_FORMAT;
                  } else {
-                    mySharedData->table = (UConverterTable *)sharedDataTable;
+                    if(states->maxCharLength>1) {
+                        /*
+                         * When building a normal .cnv file with a base table
+                         * for an MBCS (not SBCS) table with explicit precision flags,
+                         * the MBCSAddTable() function marks some mappings for moving
+                         * to the extension table.
+                         * They fit into the base toUnicode table but not into the
+                         * base fromUnicode table.
+                         * (Note: We do have explicit precision flags because they are
+                         * required for extension table generation, and
+                         * ucm_checkBaseExt() verified it.)
+                         *
+                         * We do not call MBCSAddTable() here (we probably could)
+                         * so we need to do the analysis before building the extension table.
+                         * We assume that MBCSAddTable() will build a UTF-8-friendly table.
+                         * Redundant mappings in the extension table are ok except they cost some size.
+                         *
+                         * Do this after ucm_checkBaseExt().
+                         */
+                        const MBCSData *mbcsData=MBCSGetDummy();
+                        int32_t needsMove=0;
+                        for(m=baseData.ucm->base->mappings, mLimit=m+baseData.ucm->base->mappingsLength;
+                            m<mLimit;
+                            ++m
+                        ) {
+                            if(!MBCSOkForBaseFromUnicode(mbcsData, m->b.bytes, m->bLen, m->u, m->f)) {
+                                m->f|=MBCS_FROM_U_EXT_FLAG;
+                                m->moveFlag=UCM_MOVE_TO_EXT;
+                                ++needsMove;
+                            }
+                        }
+
+                        if(needsMove!=0) {
+                            ucm_moveMappings(baseData.ucm->base, data->ucm->ext);
+                            ucm_sortTable(data->ucm->ext);
+                        }
+                    }
+                    if(!data->extData->addTable(data->extData, data->ucm->ext, &data->staticData)) {
+                        *pErrorCode=U_INVALID_TABLE_FORMAT;
+                    }
                  }
-            } else {
-                *err = U_MEMORY_ALLOCATION_ERROR;
              }
          }
-        break;
-      }
-
-    default :
-        fprintf(stderr, "error: <uconv_class> omitted\n");
-        *err = U_INVALID_TABLE_FORMAT;
-        mySharedData->table = NULL;
-        break;
-    };
  
-    if(U_SUCCESS(*err) && mySharedData->table != NULL)
-    {
-        loadTableFromFile(convFile, mySharedData, err);
+        cleanupConvData(&baseData);
      }
-
-    T_FileStream_close(convFile);
-
-    return mySharedData;
  }
  
  /*