]> git.saurik.com Git - apple/icu.git/blobdiff - icuSources/tools/makeconv/makeconv.c
ICU-531.48.tar.gz
[apple/icu.git] / icuSources / tools / makeconv / makeconv.c
index f3b6443054427531c7b694a875db8787ba4d8f2b..424c962251138ce7dc9de7cb865fe198d689f0d6 100644 (file)
@@ -1,7 +1,7 @@
 /*
  ********************************************************************************
  *
- *   Copyright (C) 1998-2004, International Business Machines
+ *   Copyright (C) 1998-2012, International Business Machines
  *   Corporation and others.  All Rights Reserved.
  *
  ********************************************************************************
@@ -17,7 +17,6 @@
 
 #include <stdio.h>
 #include "unicode/putil.h"
-#include "ucnv_io.h"
 #include "unicode/ucnv_err.h"
 #include "ucnv_bld.h"
 #include "ucnv_imp.h"
@@ -35,6 +34,8 @@
 #include "makeconv.h"
 #include "genmbcs.h"
 
+#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
+
 #define DEBUG 0
 
 typedef struct ConvData {
@@ -77,7 +78,8 @@ extern const UConverterStaticData * ucnv_converterStaticData[UCNV_NUMBER_OF_SUPP
  * Global - verbosity
  */
 UBool VERBOSE = FALSE;
-UBool TOUCHFILE = FALSE;
+UBool SMALL = FALSE;
+UBool IGNORE_SISO_CHECK = FALSE;
 
 static void
 createConverter(ConvData *data, const char* converterName, UErrorCode *pErrorCode);
@@ -138,7 +140,7 @@ writeConverterData(ConvData *data, const char *cnvName, const char *cnvDir, UErr
 
     if(VERBOSE)
       {
-        fprintf(stderr, "- Opened udata %s.%s\n", cnvName, "cnv");
+        printf("- Opened udata %s.%s\n", cnvName, "cnv");
       }
 
 
@@ -161,19 +163,31 @@ writeConverterData(ConvData *data, const char *cnvName, const char *cnvDir, UErr
     }
     if(VERBOSE)
     {
-      fprintf(stderr, "- Wrote %u bytes to the udata.\n", (int)sz2);
+      printf("- Wrote %u bytes to the udata.\n", (int)sz2);
     }
 }
 
+enum {
+    OPT_HELP_H,
+    OPT_HELP_QUESTION_MARK,
+    OPT_COPYRIGHT,
+    OPT_VERSION,
+    OPT_DESTDIR,
+    OPT_VERBOSE,
+    OPT_SMALL,
+    OPT_IGNORE_SISO_CHECK,
+    OPT_COUNT
+};
+
 static UOption options[]={
-    UOPTION_HELP_H,              /* 0  Numbers for those who*/
-    UOPTION_HELP_QUESTION_MARK,  /* 1   can't count. */
-    UOPTION_COPYRIGHT,           /* 2 */
-    UOPTION_VERSION,             /* 3 */
-    UOPTION_DESTDIR,             /* 4 */
-    UOPTION_VERBOSE,             /* 5 */
-    UOPTION_PACKAGE_NAME,        /* 6 */
-    UOPTION_DEF( "touchfile", 't', UOPT_NO_ARG) /* 7 */
+    UOPTION_HELP_H,
+    UOPTION_HELP_QUESTION_MARK,
+    UOPTION_COPYRIGHT,
+    UOPTION_VERSION,
+    UOPTION_DESTDIR,
+    UOPTION_VERBOSE,
+    { "small", NULL, NULL, NULL, '\1', UOPT_NO_ARG, 0 },
+    { "ignore-siso-check", NULL, NULL, NULL, '\1', UOPT_NO_ARG, 0 }
 };
 
 int main(int argc, char* argv[])
@@ -181,9 +195,7 @@ int main(int argc, char* argv[])
     ConvData data;
     UErrorCode err = U_ZERO_ERROR, localError;
     char outFileName[UCNV_MAX_FULL_FILE_NAME_LENGTH];
-    char touchFileName[UCNV_MAX_FULL_FILE_NAME_LENGTH];
     const char* destdir, *arg;
-    const char *pkgName = NULL;
     size_t destdirlen;
     char* dot = NULL, *outBasename;
     char cnvName[UCNV_MAX_FULL_FILE_NAME_LENGTH];
@@ -200,8 +212,8 @@ int main(int argc, char* argv[])
     uprv_memcpy(&dataInfo.dataVersion, &icuVersion, sizeof(UVersionInfo));
 
     /* preset then read command line options */
-    options[4].value=u_getDataDirectory();
-    argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), options);
+    options[OPT_DESTDIR].value=u_getDataDirectory();
+    argc=u_parseArgs(argc, argv, LENGTHOF(options), options);
 
     /* error handling, printing usage message */
     if(argc<0) {
@@ -211,8 +223,9 @@ int main(int argc, char* argv[])
     } else if(argc<2) {
         argc=-1;
     }
-    if(argc<0 || options[0].doesOccur || options[1].doesOccur) {
-        fprintf(stderr,
+    if(argc<0 || options[OPT_HELP_H].doesOccur || options[OPT_HELP_QUESTION_MARK].doesOccur) {
+        FILE *stdfile=argc<0 ? stderr : stdout;
+        fprintf(stdfile,
             "usage: %s [-options] files...\n"
             "\tread .ucm codepage mapping files and write .cnv files\n"
             "options:\n"
@@ -222,53 +235,32 @@ int main(int argc, char* argv[])
             "\t-d or --destdir     destination directory, followed by the path\n"
             "\t-v or --verbose     Turn on verbose output\n",
             argv[0]);
-        fprintf(stderr,
-            "\t-p or --pkgname     sets the 'package' name for output files.\n"
-            "\t                    If name is ICUDATA, then the default icu package\n"
-            "\t                    name will be used.\n"
-            "\t-t or --touchfile   Generate additional small file without packagename, for nmake\n");
+        fprintf(stdfile,
+            "\t      --small       Generate smaller .cnv files. They will be\n"
+            "\t                    significantly smaller but may not be compatible with\n"
+            "\t                    older versions of ICU and will require heap memory\n"
+            "\t                    allocation when loaded.\n"
+            "\t      --ignore-siso-check         Use SI/SO other than 0xf/0xe.\n");
         return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR;
     }
 
-    if(options[3].doesOccur) {
-      fprintf(stderr,"makeconv version %hu.%hu, ICU tool to read .ucm codepage mapping files and write .cnv files\n",
-            dataInfo.formatVersion[0], dataInfo.formatVersion[1]);
-      fprintf(stderr, "Copyright (C) 1998-2000, International Business Machines\n");
-      fprintf(stderr,"Corporation and others.  All Rights Reserved.\n");
+    if(options[OPT_VERSION].doesOccur) {
+        printf("makeconv version %u.%u, ICU tool to read .ucm codepage mapping files and write .cnv files\n",
+               dataInfo.formatVersion[0], dataInfo.formatVersion[1]);
+        printf("%s\n", U_COPYRIGHT_STRING);
         exit(0);
     }
 
-   TOUCHFILE = options[7].doesOccur;
-
-   if(!options[6].doesOccur)
-    {
-      pkgName=NULL;
-    }
-    else
-    {
-        pkgName =options[6].value;
-        if(!strcmp(pkgName, "ICUDATA"))
-        {
-            pkgName = U_ICUDATA_NAME;
-        }
-        if(pkgName[0] == 0)
-        {
-            pkgName = NULL;
+    /* get the options values */
+    haveCopyright = options[OPT_COPYRIGHT].doesOccur;
+    destdir = options[OPT_DESTDIR].value;
+    VERBOSE = options[OPT_VERBOSE].doesOccur;
+    SMALL = options[OPT_SMALL].doesOccur;
 
-            if(TOUCHFILE)
-            {
-                fprintf(stderr, "%s: Don't use touchfile option with an empty packagename.\n",
-                        argv[0]);
-                exit(1);
-            }
-        }
+    if (options[OPT_IGNORE_SISO_CHECK].doesOccur) {
+        IGNORE_SISO_CHECK = TRUE;
     }
 
-    /* get the options values */
-    haveCopyright = options[2].doesOccur;
-    destdir = options[4].value;
-    VERBOSE = options[5].doesOccur;
-
     if (destdir != NULL && *destdir != 0) {
         uprv_strcpy(outFileName, destdir);
         destdirlen = uprv_strlen(destdir);
@@ -300,6 +292,13 @@ int main(int argc, char* argv[])
     {
         arg = getLongPathname(*argv);
 
+        /* Check for potential buffer overflow */
+        if(strlen(arg) >= UCNV_MAX_FULL_FILE_NAME_LENGTH)
+        {
+            fprintf(stderr, "%s\n", u_errorName(U_BUFFER_OVERFLOW_ERROR));
+            return U_BUFFER_OVERFLOW_ERROR;
+        }
+
         /*produces the right destination path for display*/
         if (destdirlen != 0)
         {
@@ -324,21 +323,6 @@ int main(int argc, char* argv[])
         /* the basename without extension is the converter name */
         uprv_strcpy(cnvName, outBasename);
 
-        if(TOUCHFILE)
-        {
-            uprv_strcpy(touchFileName, outBasename);
-            uprv_strcat(touchFileName, ".cnv");
-        }
-
-        if(pkgName != NULL)
-        {
-            /* changes both basename and filename */
-            uprv_strcpy(outBasename, pkgName);
-            uprv_strcat(outBasename, "_");
-            uprv_strcat(outBasename, cnvName);
-        }
-
-
         /*Adds the target extension*/
         uprv_strcat(outBasename, CONVERTER_FILE_EXTENSION);
 
@@ -361,12 +345,28 @@ int main(int argc, char* argv[])
         }
         else
         {
-            /* Make the static data name equal to the file name */
-            if( /*VERBOSE &&  */ uprv_stricmp(cnvName,data.staticData.name))
+            /* Ensure the static data name matches the file name */
+            /* Changed to ignore directory and only compare base name
+             LDH 1/2/08*/
+            char *p;
+            p = strrchr(cnvName, U_FILE_SEP_CHAR); /* Find last file separator */
+
+            if(p == NULL)            /* OK, try alternate */
+            {
+                p = strrchr(cnvName, U_FILE_ALT_SEP_CHAR);
+                if(p == NULL)
+                {
+                    p=cnvName; /* If no separators, no problem */
+                }
+            }
+            else
+            {
+                p++;   /* If found separator, don't include it in compare */
+            }
+            if(uprv_stricmp(p,data.staticData.name))
             {
                 fprintf(stderr, "Warning: %s%s claims to be '%s'\n",
-                    cnvName,
-                    CONVERTER_FILE_EXTENSION,
+                    cnvName,  CONVERTER_FILE_EXTENSION,
                     data.staticData.name);
             }
 
@@ -382,40 +382,10 @@ int main(int argc, char* argv[])
                 }
             }
 
-            if(pkgName == NULL)
-            {
-                uprv_strcpy(cnvNameWithPkg, cnvName);
-            }
-            else
-            {
-                uprv_strcpy(cnvNameWithPkg, pkgName);
-                uprv_strcat(cnvNameWithPkg, "_");
-                uprv_strcat(cnvNameWithPkg, cnvName);
-            }
+            uprv_strcpy(cnvNameWithPkg, cnvName);
 
             localError = U_ZERO_ERROR;
             writeConverterData(&data, cnvNameWithPkg, destdir, &localError);
-            if(TOUCHFILE)
-            {
-                FileStream *q;
-                char msg[1024];
-
-                sprintf(msg, "This empty file tells nmake that %s in package %s has been updated.\n",
-                    cnvName, pkgName);
-
-                q = T_FileStream_open(touchFileName, "w");
-                if(q == NULL)
-                {
-                    fprintf(stderr, "Error writing touchfile \"%s\"\n", touchFileName);
-                    localError = U_FILE_ACCESS_ERROR;
-                }
-
-                else
-                {
-                    T_FileStream_write(q, msg, (int32_t)uprv_strlen(msg));
-                    T_FileStream_close(q);
-                }
-            }
 
             if(U_FAILURE(localError))
             {
@@ -428,7 +398,7 @@ int main(int argc, char* argv[])
             }
             else if (printFilename)
             {
-                puts(outFileName);
+                puts(outBasename);
             }
         }
         fflush(stdout);
@@ -463,7 +433,7 @@ readHeader(ConvData *data,
            FileStream* convFile,
            const char* converterName,
            UErrorCode *pErrorCode) {
-    char line[200];
+    char line[1024];
     char *s, *key, *value;
     const UConverterStaticData *prototype;
     UConverterStaticData *staticData;
@@ -590,7 +560,7 @@ readHeader(ConvData *data,
 static UBool
 readFile(ConvData *data, const char* converterName,
          UErrorCode *pErrorCode) {
-    char line[200];
+    char line[1024];
     char *end;
     FileStream *convFile;
 
@@ -617,7 +587,7 @@ readFile(ConvData *data, const char* converterName,
     if(data->ucm->baseName[0]==0) {
         dataIsBase=TRUE;
         baseStates=&data->ucm->states;
-        ucm_processStates(baseStates);
+        ucm_processStates(baseStates, IGNORE_SISO_CHECK);
     } else {
         dataIsBase=FALSE;
         baseStates=NULL;
@@ -684,6 +654,10 @@ createConverter(ConvData *data, const char *converterName, UErrorCode *pErrorCod
     states=&data->ucm->states;
 
     if(dataIsBase) {
+        /*
+         * Build a normal .cnv file with a base table
+         * and an optional extension table.
+         */
         data->cnvData=MBCSOpen(data->ucm);
         if(data->cnvData==NULL) {
             *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
@@ -700,27 +674,50 @@ createConverter(ConvData *data, const char *converterName, UErrorCode *pErrorCod
             fprintf(stderr, "       the subchar1 byte is illegal in this codepage structure!\n");
             *pErrorCode=U_INVALID_TABLE_FORMAT;
 
-        } else if(data->ucm->ext->mappingsLength>0) {
-            /* prepare the extension table, if there is one */
-            data->extData=CnvExtOpen(data->ucm);
-            if(data->extData==NULL) {
-                *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
+        } else if(
+            data->ucm->ext->mappingsLength>0 &&
+            !ucm_checkBaseExt(states, data->ucm->base, data->ucm->ext, data->ucm->ext, FALSE)
+        ) {
+            *pErrorCode=U_INVALID_TABLE_FORMAT;
+        } else if(data->ucm->base->flagsType&UCM_FLAGS_EXPLICIT) {
+            /* sort the table so that it can be turned into UTF-8-friendly data */
+            ucm_sortTable(data->ucm->base);
+        }
 
-            } else if(
-                !ucm_checkBaseExt(states, data->ucm->base, data->ucm->ext, data->ucm->ext, FALSE) ||
-                !data->extData->addTable(data->extData, data->ucm->ext, &data->staticData)
+        if(U_SUCCESS(*pErrorCode)) {
+            if(
+                /* add the base table after ucm_checkBaseExt()! */
+                !data->cnvData->addTable(data->cnvData, data->ucm->base, &data->staticData)
             ) {
                 *pErrorCode=U_INVALID_TABLE_FORMAT;
+            } else {
+                /*
+                 * addTable() may have requested moving more mappings to the extension table
+                 * if they fit into the base toUnicode table but not into the
+                 * base fromUnicode table.
+                 * (Especially for UTF-8-friendly fromUnicode tables.)
+                 * Such mappings will have the MBCS_FROM_U_EXT_FLAG set, which causes them
+                 * to be excluded from the extension toUnicode data.
+                 * See MBCSOkForBaseFromUnicode() for which mappings do not fit into
+                 * the base fromUnicode table.
+                 */
+                ucm_moveMappings(data->ucm->base, data->ucm->ext);
+                ucm_sortTable(data->ucm->ext);
+                if(data->ucm->ext->mappingsLength>0) {
+                    /* prepare the extension table, if there is one */
+                    data->extData=CnvExtOpen(data->ucm);
+                    if(data->extData==NULL) {
+                        *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
+                    } else if(
+                        !data->extData->addTable(data->extData, data->ucm->ext, &data->staticData)
+                    ) {
+                        *pErrorCode=U_INVALID_TABLE_FORMAT;
+                    }
+                }
             }
         }
-
-        /* add the base table after ucm_checkBaseExt()! */
-        if( U_SUCCESS(*pErrorCode) &&
-            !data->cnvData->addTable(data->cnvData, data->ucm->base, &data->staticData)
-        ) {
-            *pErrorCode=U_INVALID_TABLE_FORMAT;
-        }
     } else {
+        /* Build an extension-only .cnv file. */
         char baseFilename[500];
         char *basename;
 
@@ -744,7 +741,6 @@ createConverter(ConvData *data, const char *converterName, UErrorCode *pErrorCod
             data->extData=CnvExtOpen(data->ucm);
             if(data->extData==NULL) {
                 *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
-
             } else {
                 /* fill in gaps in extension file header fields */
                 UCMapping *m, *mLimit;
@@ -782,16 +778,6 @@ createConverter(ConvData *data, const char *converterName, UErrorCode *pErrorCod
                         fallbackFlags|=2;
                     }
                 }
-                for(m=data->ucm->base->mappings, mLimit=m+data->ucm->base->mappingsLength;
-                    m<mLimit && fallbackFlags!=3;
-                    ++m
-                ) {
-                    if(m->f==1) {
-                        fallbackFlags|=1;
-                    } else if(m->f==3) {
-                        fallbackFlags|=2;
-                    }
-                }
 
                 if(fallbackFlags&1) {
                     staticData->hasFromUnicodeFallback=TRUE;
@@ -804,16 +790,56 @@ createConverter(ConvData *data, const char *converterName, UErrorCode *pErrorCod
                     fprintf(stderr, "       the substitution character byte sequence is illegal in this codepage structure!\n");
                     *pErrorCode=U_INVALID_TABLE_FORMAT;
 
-                } else if(1!=ucm_countChars(baseStates, &staticData->subChar1, 1)) {
+                } else if(staticData->subChar1!=0 && 1!=ucm_countChars(baseStates, &staticData->subChar1, 1)) {
                     fprintf(stderr, "       the subchar1 byte is illegal in this codepage structure!\n");
                     *pErrorCode=U_INVALID_TABLE_FORMAT;
 
                 } else if(
                     !ucm_checkValidity(data->ucm->ext, baseStates) ||
-                    !ucm_checkBaseExt(baseStates, baseData.ucm->base, data->ucm->ext, data->ucm->ext, FALSE) ||
-                    !data->extData->addTable(data->extData, data->ucm->ext, &data->staticData)
+                    !ucm_checkBaseExt(baseStates, baseData.ucm->base, data->ucm->ext, data->ucm->ext, FALSE)
                 ) {
                     *pErrorCode=U_INVALID_TABLE_FORMAT;
+                } else {
+                    if(states->maxCharLength>1) {
+                        /*
+                         * When building a normal .cnv file with a base table
+                         * for an MBCS (not SBCS) table with explicit precision flags,
+                         * the MBCSAddTable() function marks some mappings for moving
+                         * to the extension table.
+                         * They fit into the base toUnicode table but not into the
+                         * base fromUnicode table.
+                         * (Note: We do have explicit precision flags because they are
+                         * required for extension table generation, and
+                         * ucm_checkBaseExt() verified it.)
+                         *
+                         * We do not call MBCSAddTable() here (we probably could)
+                         * so we need to do the analysis before building the extension table.
+                         * We assume that MBCSAddTable() will build a UTF-8-friendly table.
+                         * Redundant mappings in the extension table are ok except they cost some size.
+                         *
+                         * Do this after ucm_checkBaseExt().
+                         */
+                        const MBCSData *mbcsData=MBCSGetDummy();
+                        int32_t needsMove=0;
+                        for(m=baseData.ucm->base->mappings, mLimit=m+baseData.ucm->base->mappingsLength;
+                            m<mLimit;
+                            ++m
+                        ) {
+                            if(!MBCSOkForBaseFromUnicode(mbcsData, m->b.bytes, m->bLen, m->u, m->f)) {
+                                m->f|=MBCS_FROM_U_EXT_FLAG;
+                                m->moveFlag=UCM_MOVE_TO_EXT;
+                                ++needsMove;
+                            }
+                        }
+
+                        if(needsMove!=0) {
+                            ucm_moveMappings(baseData.ucm->base, data->ucm->ext);
+                            ucm_sortTable(data->ucm->ext);
+                        }
+                    }
+                    if(!data->extData->addTable(data->extData, data->ucm->ext, &data->staticData)) {
+                        *pErrorCode=U_INVALID_TABLE_FORMAT;
+                    }
                 }
             }
         }