git.saurik.com Git - apple/icu.git/blobdiff - icuSources/tools/gennorm2/gennorm2.cpp
ICU-66108.tar.gz
[apple/icu.git] / icuSources / tools / gennorm2 / gennorm2.cpp
index f0d981ec53b46b77dfac45f9a65e61a3b25a7a81..bce5336be628c4a7b2242a2ad937a09135b3c6d1 100644 (file)
@@ -1,12 +1,14 @@
+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
 /*
 *******************************************************************************
 *
-*   Copyright (C) 2009-2010, International Business Machines
+*   Copyright (C) 2009-2014, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 *
 *******************************************************************************
 *   file name:  gennorm2.cpp
-*   encoding:   US-ASCII
+*   encoding:   UTF-8
 *   tab size:   8 (not used)
 *   indentation:4
 *
 #include "unicode/utypes.h"
 #include "n2builder.h"
 
+#include <fstream>
 #include <stdio.h>
 #include <stdlib.h>
+#include <string>
 #include <string.h>
 #include "unicode/errorcode.h"
 #include "unicode/localpointer.h"
 #include "unewdata.h"
 #endif
 
-#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
-
 U_NAMESPACE_BEGIN
 
 UBool beVerbose=FALSE, haveCopyright=TRUE;
 
-U_DEFINE_LOCAL_OPEN_POINTER(LocalStdioFilePointer, FILE, fclose);
-
 #if !UCONFIG_NO_NORMALIZATION
-void parseFile(FILE *f, Normalizer2DataBuilder &builder);
+void parseFile(std::ifstream &f, Normalizer2DataBuilder &builder);
 #endif
 
 /* -------------------------------------------------------------------------- */
@@ -60,6 +60,8 @@ enum {
     SOURCEDIR,
     OUTPUT_FILENAME,
     UNICODE_VERSION,
+    WRITE_C_SOURCE,
+    WRITE_COMBINED_DATA,
     OPT_FAST
 };
 
@@ -71,6 +73,8 @@ static UOption options[]={
     UOPTION_SOURCEDIR,
     UOPTION_DEF("output", 'o', UOPT_REQUIRES_ARG),
     UOPTION_DEF("unicode", 'u', UOPT_REQUIRES_ARG),
+    UOPTION_DEF("csource", '\1', UOPT_NO_ARG),
+    UOPTION_DEF("combined", '\1', UOPT_NO_ARG),
     UOPTION_DEF("fast", '\1', UOPT_NO_ARG)
 };
 
@@ -80,7 +84,6 @@ main(int argc, char* argv[]) {
 
     /* preset then read command line options */
     options[SOURCEDIR].value="";
-    options[UNICODE_VERSION].value=U_UNICODE_VERSION;
     argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[HELP_H]), options);
 
     /* error handling, printing usage message */
@@ -95,17 +98,22 @@ main(int argc, char* argv[]) {
     if( argc<2 ||
         options[HELP_H].doesOccur || options[HELP_QUESTION_MARK].doesOccur
     ) {
-        /*
-         * Broken into chunks because the C89 standard says the minimum
-         * required supported string length is 509 bytes.
-         */
         fprintf(stderr,
             "Usage: %s [-options] infiles+ -o outputfilename\n"
             "\n"
             "Reads the infiles with normalization data and\n"
-            "creates a binary file (outputfilename) with the data.\n"
+            "creates a binary file, or a C source file (--csource), with the data,\n"
+            "or writes a data file with the combined data (--combined).\n"
+            "See http://userguide.icu-project.org/transforms/normalization#TOC-Data-File-Syntax\n"
+            "\n"
+            "Alternate usage: %s [-options] a.txt b.txt minus p.txt q.txt -o outputfilename\n"
+            "\n"
+            "Computes the difference of (a, b) minus (p, q) and writes the diff data\n"
+            "in input-file syntax to the outputfilename.\n"
+            "It is then possible to build (p, q, diff) to get the same data as (a, b).\n"
+            "(Useful for computing minimal incremental mapping data files.)\n"
             "\n",
-            argv[0]);
+            argv[0], argv[0]);
         fprintf(stderr,
             "Options:\n"
             "\t-h or -? or --help  this usage text\n"
@@ -114,9 +122,12 @@ main(int argc, char* argv[]) {
             "\t-u or --unicode     Unicode version, followed by the version like 5.2.0\n");
         fprintf(stderr,
             "\t-s or --sourcedir   source directory, followed by the path\n"
-            "\t-o or --output      output filename\n");
+            "\t-o or --output      output filename\n"
+            "\t      --csource     writes a C source file with initializers\n"
+            "\t      --combined    writes a .txt file (input-file syntax) with the\n"
+            "\t                    combined data from all of the input files\n");
         fprintf(stderr,
-            "\t      --fast        optimize the .nrm file for fast normalization,\n"
+            "\t      --fast        optimize the data for fast normalization,\n"
             "\t                    which might increase its size  (Writes fully decomposed\n"
             "\t                    regular mappings instead of delta mappings.\n"
             "\t                    You should measure the runtime speed to make sure that\n"
@@ -142,10 +153,15 @@ main(int argc, char* argv[]) {
 
 #else
 
-    LocalPointer<Normalizer2DataBuilder> builder(new Normalizer2DataBuilder(errorCode));
+    LocalPointer<Normalizer2DataBuilder> b1(new Normalizer2DataBuilder(errorCode), errorCode);
+    LocalPointer<Normalizer2DataBuilder> b2;
+    LocalPointer<Normalizer2DataBuilder> diff;
+    Normalizer2DataBuilder *builder = b1.getAlias();
     errorCode.assertSuccess();
 
-    builder->setUnicodeVersion(options[UNICODE_VERSION].value);
+    if(options[UNICODE_VERSION].doesOccur) {
+        builder->setUnicodeVersion(options[UNICODE_VERSION].value);
+    }
 
     if(options[OPT_FAST].doesOccur) {
         builder->setOptimization(Normalizer2DataBuilder::OPTIMIZE_FAST);
@@ -162,20 +178,50 @@ main(int argc, char* argv[]) {
         pathLength=filename.length();
     }
 
+    bool doMinus = false;
     for(int i=1; i<argc; ++i) {
         printf("gennorm2: processing %s\n", argv[i]);
+        if(strcmp(argv[i], "minus") == 0) {
+            if(doMinus) {
+                fprintf(stderr, "gennorm2 error: only one 'minus' can be specified\n");
+                exit(U_ILLEGAL_ARGUMENT_ERROR);
+            }
+            // Data from previous input files has been collected in b1.
+            // Collect data from further input files in b2.
+            b2.adoptInsteadAndCheckErrorCode(new Normalizer2DataBuilder(errorCode), errorCode);
+            diff.adoptInsteadAndCheckErrorCode(new Normalizer2DataBuilder(errorCode), errorCode);
+            errorCode.assertSuccess();
+            builder = b2.getAlias();
+            if(options[UNICODE_VERSION].doesOccur) {
+                builder->setUnicodeVersion(options[UNICODE_VERSION].value);
+            }
+            if(options[OPT_FAST].doesOccur) {
+                builder->setOptimization(Normalizer2DataBuilder::OPTIMIZE_FAST);
+            }
+            doMinus = true;
+            continue;
+        }
         filename.append(argv[i], errorCode);
-        LocalStdioFilePointer f(fopen(filename.data(), "r"));
-        if(f==NULL) {
+        std::ifstream f(filename.data());
+        if(f.fail()) {
             fprintf(stderr, "gennorm2 error: unable to open %s\n", filename.data());
             exit(U_FILE_ACCESS_ERROR);
         }
         builder->setOverrideHandling(Normalizer2DataBuilder::OVERRIDE_PREVIOUS);
-        parseFile(f.getAlias(), *builder);
+        parseFile(f, *builder);
         filename.truncate(pathLength);
     }
 
-    builder->writeBinaryFile(options[OUTPUT_FILENAME].value);
+    if(doMinus) {
+        Normalizer2DataBuilder::computeDiff(*b1, *b2, *diff);
+        diff->writeDataFile(options[OUTPUT_FILENAME].value, /* writeRemoved= */ true);
+    } else if(options[WRITE_COMBINED_DATA].doesOccur) {
+        builder->writeDataFile(options[OUTPUT_FILENAME].value, /* writeRemoved= */ false);
+    } else if(options[WRITE_C_SOURCE].doesOccur) {
+        builder->writeCSourceFile(options[OUTPUT_FILENAME].value);
+    } else {
+        builder->writeBinaryFile(options[OUTPUT_FILENAME].value);
+    }
 
     return errorCode.get();
 
@@ -184,11 +230,19 @@ main(int argc, char* argv[]) {
 
 #if !UCONFIG_NO_NORMALIZATION
 
-void parseFile(FILE *f, Normalizer2DataBuilder &builder) {
+void parseFile(std::ifstream &f, Normalizer2DataBuilder &builder) {
     IcuToolErrorCode errorCode("gennorm2/parseFile()");
-    char line[300];
+    std::string lineString;
     uint32_t startCP, endCP;
-    while(NULL!=fgets(line, (int)sizeof(line), f)) {
+    while(std::getline(f, lineString)) {
+        if (lineString.empty()) {
+            continue;  // skip empty lines.
+        }
+#if (U_CPLUSPLUS_VERSION >= 11)
+        char *line = &lineString.front();
+#else
+        char *line = &lineString.at(0);
+#endif
         char *comment=(char *)strchr(line, '#');
         if(comment!=NULL) {
             *comment=0;
@@ -198,6 +252,11 @@ void parseFile(FILE *f, Normalizer2DataBuilder &builder) {
             continue;  // skip empty and comment-only lines
         }
         if(line[0]=='*') {
+            const char *s=u_skipWhitespace(line+1);
+            if(0==strncmp(s, "Unicode", 7)) {
+                s=u_skipWhitespace(s+7);
+                builder.setUnicodeVersion(s);
+            }
             continue;  // reserved syntax
         }
         const char *delimiter;
@@ -207,6 +266,11 @@ void parseFile(FILE *f, Normalizer2DataBuilder &builder) {
             fprintf(stderr, "gennorm2 error: parsing code point range from %s\n", line);
             exit(errorCode.reset());
         }
+        if (endCP >= 0xd800 && startCP <= 0xdfff) {
+                fprintf(stderr, "gennorm2 error: value or mapping for surrogate code points: %s\n",
+                        line);
+                exit(U_ILLEGAL_ARGUMENT_ERROR);
+        }
         delimiter=u_skipWhitespace(delimiter);
         if(*delimiter==':') {
             const char *s=u_skipWhitespace(delimiter+1);
@@ -233,7 +297,7 @@ void parseFile(FILE *f, Normalizer2DataBuilder &builder) {
         }
         if(*delimiter=='=' || *delimiter=='>') {
             UChar uchars[Normalizer2Impl::MAPPING_LENGTH_MASK];
-            int32_t length=u_parseString(delimiter+1, uchars, LENGTHOF(uchars), NULL, errorCode);
+            int32_t length=u_parseString(delimiter+1, uchars, UPRV_LENGTHOF(uchars), NULL, errorCode);
             if(errorCode.isFailure()) {
                 fprintf(stderr, "gennorm2 error: parsing mapping string from %s\n", line);
                 exit(errorCode.reset());