icuSources/tools/gennorm2/gennorm2.cpp

   1 // © 2016 and later: Unicode, Inc. and others.
   2 // License & terms of use: http://www.unicode.org/copyright.html
   3 /*
   4 *******************************************************************************
   5 *
   6 *   Copyright (C) 2009-2014, International Business Machines
   7 *   Corporation and others.  All Rights Reserved.
   8 *
   9 *******************************************************************************
  10 *   file name:  gennorm2.cpp
  11 *   encoding:   UTF-8
  12 *   tab size:   8 (not used)
  13 *   indentation:4
  14 *
  15 *   created on: 2009nov25
  16 *   created by: Markus W. Scherer
  17 *
  18 *   This program reads text files that define Unicode normalization,
  19 *   parses them, and builds a binary data file.
  20 */
  21
  22 #include "unicode/utypes.h"
  23 #include "n2builder.h"
  24
  25 #include <fstream>
  26 #include <stdio.h>
  27 #include <stdlib.h>
  28 #include <string>
  29 #include <string.h>
  30 #include "unicode/errorcode.h"
  31 #include "unicode/localpointer.h"
  32 #include "unicode/putil.h"
  33 #include "unicode/uchar.h"
  34 #include "unicode/unistr.h"
  35 #include "charstr.h"
  36 #include "normalizer2impl.h"
  37 #include "toolutil.h"
  38 #include "uoptions.h"
  39 #include "uparse.h"
  40
  41 #if UCONFIG_NO_NORMALIZATION
  42 #include "unewdata.h"
  43 #endif
  44
  45 U_NAMESPACE_BEGIN
  46
  47 UBool beVerbose=FALSE, haveCopyright=TRUE;
  48
  49 #if !UCONFIG_NO_NORMALIZATION
  50 void parseFile(std::ifstream &f, Normalizer2DataBuilder &builder);
  51 #endif
  52
  53 /* -------------------------------------------------------------------------- */
  54
  55 enum {
  56     HELP_H,
  57     HELP_QUESTION_MARK,
  58     VERBOSE,
  59     COPYRIGHT,
  60     SOURCEDIR,
  61     OUTPUT_FILENAME,
  62     UNICODE_VERSION,
  63     WRITE_C_SOURCE,
  64     WRITE_COMBINED_DATA,
  65     OPT_FAST
  66 };
  67
  68 static UOption options[]={
  69     UOPTION_HELP_H,
  70     UOPTION_HELP_QUESTION_MARK,
  71     UOPTION_VERBOSE,
  72     UOPTION_COPYRIGHT,
  73     UOPTION_SOURCEDIR,
  74     UOPTION_DEF("output", 'o', UOPT_REQUIRES_ARG),
  75     UOPTION_DEF("unicode", 'u', UOPT_REQUIRES_ARG),
  76     UOPTION_DEF("csource", '\1', UOPT_NO_ARG),
  77     UOPTION_DEF("combined", '\1', UOPT_NO_ARG),
  78     UOPTION_DEF("fast", '\1', UOPT_NO_ARG)
  79 };
  80
  81 extern "C" int
  82 main(int argc, char* argv[]) {
  83     U_MAIN_INIT_ARGS(argc, argv);
  84
  85     /* preset then read command line options */
  86     options[SOURCEDIR].value="";
  87     argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[HELP_H]), options);
  88
  89     /* error handling, printing usage message */
  90     if(argc<0) {
  91         fprintf(stderr,
  92             "error in command line argument \"%s\"\n",
  93             argv[-argc]);
  94     }
  95     if(!options[OUTPUT_FILENAME].doesOccur) {
  96         argc=-1;
  97     }
  98     if( argc<2 ||
  99         options[HELP_H].doesOccur || options[HELP_QUESTION_MARK].doesOccur
 100     ) {
 101         fprintf(stderr,
 102             "Usage: %s [-options] infiles+ -o outputfilename\n"
 103             "\n"
 104             "Reads the infiles with normalization data and\n"
 105             "creates a binary file, or a C source file (--csource), with the data,\n"
 106             "or writes a data file with the combined data (--combined).\n"
 107             "See http://userguide.icu-project.org/transforms/normalization#TOC-Data-File-Syntax\n"
 108             "\n"
 109             "Alternate usage: %s [-options] a.txt b.txt minus p.txt q.txt -o outputfilename\n"
 110             "\n"
 111             "Computes the difference of (a, b) minus (p, q) and writes the diff data\n"
 112             "in input-file syntax to the outputfilename.\n"
 113             "It is then possible to build (p, q, diff) to get the same data as (a, b).\n"
 114             "(Useful for computing minimal incremental mapping data files.)\n"
 115             "\n",
 116             argv[0], argv[0]);
 117         fprintf(stderr,
 118             "Options:\n"
 119             "\t-h or -? or --help  this usage text\n"
 120             "\t-v or --verbose     verbose output\n"
 121             "\t-c or --copyright   include a copyright notice\n"
 122             "\t-u or --unicode     Unicode version, followed by the version like 5.2.0\n");
 123         fprintf(stderr,
 124             "\t-s or --sourcedir   source directory, followed by the path\n"
 125             "\t-o or --output      output filename\n"
 126             "\t      --csource     writes a C source file with initializers\n"
 127             "\t      --combined    writes a .txt file (input-file syntax) with the\n"
 128             "\t                    combined data from all of the input files\n");
 129         fprintf(stderr,
 130             "\t      --fast        optimize the data for fast normalization,\n"
 131             "\t                    which might increase its size  (Writes fully decomposed\n"
 132             "\t                    regular mappings instead of delta mappings.\n"
 133             "\t                    You should measure the runtime speed to make sure that\n"
 134             "\t                    this is a good trade-off.)\n");
 135         return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR;
 136     }
 137
 138     beVerbose=options[VERBOSE].doesOccur;
 139     haveCopyright=options[COPYRIGHT].doesOccur;
 140
 141     IcuToolErrorCode errorCode("gennorm2/main()");
 142
 143 #if UCONFIG_NO_NORMALIZATION
 144
 145     fprintf(stderr,
 146         "gennorm2 writes a dummy binary data file "
 147         "because UCONFIG_NO_NORMALIZATION is set, \n"
 148         "see icu/source/common/unicode/uconfig.h\n");
 149     udata_createDummy(NULL, NULL, options[OUTPUT_FILENAME].value, errorCode);
 150     // Should not return an error since this is the expected behaviour if UCONFIG_NO_NORMALIZATION is on.
 151     // return U_UNSUPPORTED_ERROR;
 152     return 0;
 153
 154 #else
 155
 156     LocalPointer<Normalizer2DataBuilder> b1(new Normalizer2DataBuilder(errorCode), errorCode);
 157     LocalPointer<Normalizer2DataBuilder> b2;
 158     LocalPointer<Normalizer2DataBuilder> diff;
 159     Normalizer2DataBuilder *builder = b1.getAlias();
 160     errorCode.assertSuccess();
 161
 162     if(options[UNICODE_VERSION].doesOccur) {
 163         builder->setUnicodeVersion(options[UNICODE_VERSION].value);
 164     }
 165
 166     if(options[OPT_FAST].doesOccur) {
 167         builder->setOptimization(Normalizer2DataBuilder::OPTIMIZE_FAST);
 168     }
 169
 170     // prepare the filename beginning with the source dir
 171     CharString filename(options[SOURCEDIR].value, errorCode);
 172     int32_t pathLength=filename.length();
 173     if( pathLength>0 &&
 174         filename[pathLength-1]!=U_FILE_SEP_CHAR &&
 175         filename[pathLength-1]!=U_FILE_ALT_SEP_CHAR
 176     ) {
 177         filename.append(U_FILE_SEP_CHAR, errorCode);
 178         pathLength=filename.length();
 179     }
 180
 181     bool doMinus = false;
 182     for(int i=1; i<argc; ++i) {
 183         printf("gennorm2: processing %s\n", argv[i]);
 184         if(strcmp(argv[i], "minus") == 0) {
 185             if(doMinus) {
 186                 fprintf(stderr, "gennorm2 error: only one 'minus' can be specified\n");
 187                 exit(U_ILLEGAL_ARGUMENT_ERROR);
 188             }
 189             // Data from previous input files has been collected in b1.
 190             // Collect data from further input files in b2.
 191             b2.adoptInsteadAndCheckErrorCode(new Normalizer2DataBuilder(errorCode), errorCode);
 192             diff.adoptInsteadAndCheckErrorCode(new Normalizer2DataBuilder(errorCode), errorCode);
 193             errorCode.assertSuccess();
 194             builder = b2.getAlias();
 195             if(options[UNICODE_VERSION].doesOccur) {
 196                 builder->setUnicodeVersion(options[UNICODE_VERSION].value);
 197             }
 198             if(options[OPT_FAST].doesOccur) {
 199                 builder->setOptimization(Normalizer2DataBuilder::OPTIMIZE_FAST);
 200             }
 201             doMinus = true;
 202             continue;
 203         }
 204         filename.append(argv[i], errorCode);
 205         std::ifstream f(filename.data());
 206         if(f.fail()) {
 207             fprintf(stderr, "gennorm2 error: unable to open %s\n", filename.data());
 208             exit(U_FILE_ACCESS_ERROR);
 209         }
 210         builder->setOverrideHandling(Normalizer2DataBuilder::OVERRIDE_PREVIOUS);
 211         parseFile(f, *builder);
 212         filename.truncate(pathLength);
 213     }
 214
 215     if(doMinus) {
 216         Normalizer2DataBuilder::computeDiff(*b1, *b2, *diff);
 217         diff->writeDataFile(options[OUTPUT_FILENAME].value, /* writeRemoved= */ true);
 218     } else if(options[WRITE_COMBINED_DATA].doesOccur) {
 219         builder->writeDataFile(options[OUTPUT_FILENAME].value, /* writeRemoved= */ false);
 220     } else if(options[WRITE_C_SOURCE].doesOccur) {
 221         builder->writeCSourceFile(options[OUTPUT_FILENAME].value);
 222     } else {
 223         builder->writeBinaryFile(options[OUTPUT_FILENAME].value);
 224     }
 225
 226     return errorCode.get();
 227
 228 #endif
 229 }
 230
 231 #if !UCONFIG_NO_NORMALIZATION
 232
 233 void parseFile(std::ifstream &f, Normalizer2DataBuilder &builder) {
 234     IcuToolErrorCode errorCode("gennorm2/parseFile()");
 235     std::string lineString;
 236     uint32_t startCP, endCP;
 237     while(std::getline(f, lineString)) {
 238         if (lineString.empty()) {
 239             continue;  // skip empty lines.
 240         }
 241 #if (U_CPLUSPLUS_VERSION >= 11)
 242         char *line = &lineString.front();
 243 #else
 244         char *line = &lineString.at(0);
 245 #endif
 246         char *comment=(char *)strchr(line, '#');
 247         if(comment!=NULL) {
 248             *comment=0;
 249         }
 250         u_rtrim(line);
 251         if(line[0]==0) {
 252             continue;  // skip empty and comment-only lines
 253         }
 254         if(line[0]=='*') {
 255             const char *s=u_skipWhitespace(line+1);
 256             if(0==strncmp(s, "Unicode", 7)) {
 257                 s=u_skipWhitespace(s+7);
 258                 builder.setUnicodeVersion(s);
 259             }
 260             continue;  // reserved syntax
 261         }
 262         const char *delimiter;
 263         int32_t rangeLength=
 264             u_parseCodePointRangeAnyTerminator(line, &startCP, &endCP, &delimiter, errorCode);
 265         if(errorCode.isFailure()) {
 266             fprintf(stderr, "gennorm2 error: parsing code point range from %s\n", line);
 267             exit(errorCode.reset());
 268         }
 269         if (endCP >= 0xd800 && startCP <= 0xdfff) {
 270                 fprintf(stderr, "gennorm2 error: value or mapping for surrogate code points: %s\n",
 271                         line);
 272                 exit(U_ILLEGAL_ARGUMENT_ERROR);
 273         }
 274         delimiter=u_skipWhitespace(delimiter);
 275         if(*delimiter==':') {
 276             const char *s=u_skipWhitespace(delimiter+1);
 277             char *end;
 278             unsigned long value=strtoul(s, &end, 10);
 279             if(end<=s || *u_skipWhitespace(end)!=0 || value>=0xff) {
 280                 fprintf(stderr, "gennorm2 error: parsing ccc from %s\n", line);
 281                 exit(U_PARSE_ERROR);
 282             }
 283             for(UChar32 c=(UChar32)startCP; c<=(UChar32)endCP; ++c) {
 284                 builder.setCC(c, (uint8_t)value);
 285             }
 286             continue;
 287         }
 288         if(*delimiter=='-') {
 289             if(*u_skipWhitespace(delimiter+1)!=0) {
 290                 fprintf(stderr, "gennorm2 error: parsing remove-mapping %s\n", line);
 291                 exit(U_PARSE_ERROR);
 292             }
 293             for(UChar32 c=(UChar32)startCP; c<=(UChar32)endCP; ++c) {
 294                 builder.removeMapping(c);
 295             }
 296             continue;
 297         }
 298         if(*delimiter=='=' || *delimiter=='>') {
 299             UChar uchars[Normalizer2Impl::MAPPING_LENGTH_MASK];
 300             int32_t length=u_parseString(delimiter+1, uchars, UPRV_LENGTHOF(uchars), NULL, errorCode);
 301             if(errorCode.isFailure()) {
 302                 fprintf(stderr, "gennorm2 error: parsing mapping string from %s\n", line);
 303                 exit(errorCode.reset());
 304             }
 305             UnicodeString mapping(FALSE, uchars, length);
 306             if(*delimiter=='=') {
 307                 if(rangeLength!=1) {
 308                     fprintf(stderr,
 309                             "gennorm2 error: round-trip mapping for more than 1 code point on %s\n",
 310                             line);
 311                     exit(U_PARSE_ERROR);
 312                 }
 313                 builder.setRoundTripMapping((UChar32)startCP, mapping);
 314             } else {
 315                 for(UChar32 c=(UChar32)startCP; c<=(UChar32)endCP; ++c) {
 316                     builder.setOneWayMapping(c, mapping);
 317                 }
 318             }
 319             continue;
 320         }
 321         fprintf(stderr, "gennorm2 error: unrecognized data line %s\n", line);
 322         exit(U_PARSE_ERROR);
 323     }
 324 }
 325
 326 #endif // !UCONFIG_NO_NORMALIZATION
 327
 328 U_NAMESPACE_END
 329
 330 /*
 331  * Hey, Emacs, please set the following:
 332  *
 333  * Local Variables:
 334  * indent-tabs-mode: nil
 335  * End:
 336  *
 337  */