1 // © 2016 and later: Unicode, Inc. and others. 
   2 // License & terms of use: http://www.unicode.org/copyright.html 
   4 ******************************************************************************* 
   6 *   Copyright (C) 2009-2014, International Business Machines 
   7 *   Corporation and others.  All Rights Reserved. 
   9 ******************************************************************************* 
  10 *   file name:  gennorm2.cpp 
  12 *   tab size:   8 (not used) 
  15 *   created on: 2009nov25 
  16 *   created by: Markus W. Scherer 
  18 *   This program reads text files that define Unicode normalization, 
  19 *   parses them, and builds a binary data file. 
  22 #include "unicode/utypes.h" 
  23 #include "n2builder.h" 
  30 #include "unicode/errorcode.h" 
  31 #include "unicode/localpointer.h" 
  32 #include "unicode/putil.h" 
  33 #include "unicode/uchar.h" 
  34 #include "unicode/unistr.h" 
  36 #include "normalizer2impl.h" 
  41 #if UCONFIG_NO_NORMALIZATION 
  47 UBool beVerbose
=FALSE
, haveCopyright
=TRUE
; 
  49 #if !UCONFIG_NO_NORMALIZATION 
  50 void parseFile(std::ifstream 
&f
, Normalizer2DataBuilder 
&builder
); 
  53 /* -------------------------------------------------------------------------- */ 
  // Command-line option table. main() passes it to u_parseArgs() and then
  // reads entries back by index (HELP_H, HELP_QUESTION_MARK, VERBOSE,
  // COPYRIGHT, SOURCEDIR, OUTPUT_FILENAME, UNICODE_VERSION, WRITE_C_SOURCE,
  // WRITE_COMBINED_DATA, OPT_FAST), so the order of entries must match those
  // index constants.
  // NOTE(review): the embedded numbering jumps 68 -> 70 -> 74 and nothing
  // follows 78, so some entries and the closing of the initializer appear to
  // be missing from this listing -- confirm against the full file.
  68 static UOption options
[]={ 
  70     UOPTION_HELP_QUESTION_MARK
, 
  // -o / --output <filename>: requires an argument.
  74     UOPTION_DEF("output", 'o', UOPT_REQUIRES_ARG
), 
  // -u / --unicode <version>: requires an argument.
  75     UOPTION_DEF("unicode", 'u', UOPT_REQUIRES_ARG
), 
  // The remaining three take no argument and use '\1' as their short-option
  // character; the usage text in main() lists them with a long form only.
  76     UOPTION_DEF("csource", '\1', UOPT_NO_ARG
), 
  77     UOPTION_DEF("combined", '\1', UOPT_NO_ARG
), 
  78     UOPTION_DEF("fast", '\1', UOPT_NO_ARG
) 
  // Entry point of the gennorm2 tool.
  //
  // Per the usage text below, it reads input files with normalization data and
  // writes one of: a binary data file (default), a C source file (--csource),
  // a combined .txt data file (--combined), or -- when the literal argument
  // "minus" separates two groups of input files -- a diff in input-file syntax.
  //
  // Return value: the usage/error path returns U_ILLEGAL_ARGUMENT_ERROR when
  // argc<0 and U_ZERO_ERROR otherwise; the last visible statement returns
  // errorCode.get(). Several failure paths call exit() instead of returning.
  //
  // NOTE(review): the numbers embedded at the start of many lines skip values
  // (e.g. 87 -> 89 -> 92), so this listing appears to omit lines of the
  // original function (conditions, fprintf openers, closing braces). The
  // comments below describe only what is visible.
  82 main(int argc
, char* argv
[]) { 
  83     U_MAIN_INIT_ARGS(argc
, argv
); 
  85     /* preset then read command line options */ 
  // Default the source directory to the empty string; presumably u_parseArgs()
  // replaces it when -s/--sourcedir is given -- confirm.
  86     options
[SOURCEDIR
].value
=""; 
  // The option table and its element count are handed to u_parseArgs(); the
  // result replaces argc. It is tested for <0 in the usage return below and
  // used as the upper bound of the input-file loop further down.
  87     argc
=u_parseArgs(argc
, argv
, sizeof(options
)/sizeof(options
[HELP_H
]), options
); 
  89     /* error handling, printing usage message */ 
  92             "error in command line argument \"%s\"\n", 
  // -o/--output not given (the body of this branch is not visible here).
  95     if(!options
[OUTPUT_FILENAME
].doesOccur
) { 
  // Help requested through either help option.
  99         options
[HELP_H
].doesOccur 
|| options
[HELP_QUESTION_MARK
].doesOccur
 
 102             "Usage: %s [-options] infiles+ -o outputfilename\n" 
 104             "Reads the infiles with normalization data and\n" 
 105             "creates a binary file, or a C source file (--csource), with the data,\n" 
 106             "or writes a data file with the combined data (--combined).\n" 
 107             "See http://userguide.icu-project.org/transforms/normalization#TOC-Data-File-Syntax\n" 
 109             "Alternate usage: %s [-options] a.txt b.txt minus p.txt q.txt -o outputfilename\n" 
 111             "Computes the difference of (a, b) minus (p, q) and writes the diff data\n" 
 112             "in input-file syntax to the outputfilename.\n" 
 113             "It is then possible to build (p, q, diff) to get the same data as (a, b).\n" 
 114             "(Useful for computing minimal incremental mapping data files.)\n" 
 119             "\t-h or -? or --help  this usage text\n" 
 120             "\t-v or --verbose     verbose output\n" 
 121             "\t-c or --copyright   include a copyright notice\n" 
 122             "\t-u or --unicode     Unicode version, followed by the version like 5.2.0\n"); 
 124             "\t-s or --sourcedir   source directory, followed by the path\n" 
 125             "\t-o or --output      output filename\n" 
 126             "\t      --csource     writes a C source file with initializers\n" 
 127             "\t      --combined    writes a .txt file (input-file syntax) with the\n" 
 128             "\t                    combined data from all of the input files\n"); 
 130             "\t      --fast        optimize the data for fast normalization,\n" 
 131             "\t                    which might increase its size  (Writes fully decomposed\n" 
 132             "\t                    regular mappings instead of delta mappings.\n" 
 133             "\t                    You should measure the runtime speed to make sure that\n" 
 134             "\t                    this is a good trade-off.)\n"); 
  // A negative argc is reported as an illegal-argument error; otherwise this
  // path returns success.
 135         return argc
<0 ? U_ILLEGAL_ARGUMENT_ERROR 
: U_ZERO_ERROR
; 
  // Publish the -v and -c flags through the file-level globals.
 138     beVerbose
=options
[VERBOSE
].doesOccur
; 
 139     haveCopyright
=options
[COPYRIGHT
].doesOccur
; 
  // Error-code object used for the rest of main(), labelled with this location.
 141     IcuToolErrorCode 
errorCode("gennorm2/main()"); 
  // Build configured without normalization: per the message below only a dummy
  // data file is written, via udata_createDummy() with the output filename.
  // NOTE(review): the matching #else/#endif is not visible in this listing.
 143 #if UCONFIG_NO_NORMALIZATION 
 146         "gennorm2 writes a dummy binary data file " 
 147         "because UCONFIG_NO_NORMALIZATION is set, \n" 
 148         "see icu/source/common/unicode/uconfig.h\n"); 
 149     udata_createDummy(NULL
, NULL
, options
[OUTPUT_FILENAME
].value
, errorCode
); 
 150     // Should not return an error since this is the expected behaviour if UCONFIG_NO_NORMALIZATION is on. 
 151     // return U_UNSUPPORTED_ERROR; 
  // b1 owns the builder for the first group of input files. b2 and diff are
  // declared without a builder and are populated only when "minus" is seen.
  // `builder` is a non-owning alias to whichever builder currently receives
  // parsed data; it starts at b1.
 156     LocalPointer
<Normalizer2DataBuilder
> b1(new Normalizer2DataBuilder(errorCode
), errorCode
); 
 157     LocalPointer
<Normalizer2DataBuilder
> b2
; 
 158     LocalPointer
<Normalizer2DataBuilder
> diff
; 
 159     Normalizer2DataBuilder 
*builder 
= b1
.getAlias(); 
 160     errorCode
.assertSuccess(); 
  // Apply -u/--unicode and --fast to the first builder when given.
 162     if(options
[UNICODE_VERSION
].doesOccur
) { 
 163         builder
->setUnicodeVersion(options
[UNICODE_VERSION
].value
); 
 166     if(options
[OPT_FAST
].doesOccur
) { 
 167         builder
->setOptimization(Normalizer2DataBuilder::OPTIMIZE_FAST
); 
 170     // prepare the filename beginning with the source dir 
 171     CharString 
filename(options
[SOURCEDIR
].value
, errorCode
); 
 172     int32_t pathLength
=filename
.length(); 
  // The visible conditions compare the last character of the directory prefix
  // with both file separator characters; a separator is then appended and
  // pathLength refreshed. pathLength is the prefix length that `filename` is
  // truncated back to after each input file.
  // NOTE(review): the opening of this `if` is not visible in this listing.
 174         filename
[pathLength
-1]!=U_FILE_SEP_CHAR 
&& 
 175         filename
[pathLength
-1]!=U_FILE_ALT_SEP_CHAR
 
 177         filename
.append(U_FILE_SEP_CHAR
, errorCode
); 
 178         pathLength
=filename
.length(); 
  // NOTE(review): doMinus starts false; the place where it is set and tested
  // is not visible in this listing.
 181     bool doMinus 
= false; 
  // Walk the arguments left after option parsing (index 0 is skipped).
 182     for(int i
=1; i
<argc
; ++i
) { 
 183         printf("gennorm2: processing %s\n", argv
[i
]); 
  // The literal argument "minus" is a separator, not a file name.
 184         if(strcmp(argv
[i
], "minus") == 0) { 
  // A repeated "minus" is fatal (the guarding condition is not visible).
 186                 fprintf(stderr
, "gennorm2 error: only one 'minus' can be specified\n"); 
 187                 exit(U_ILLEGAL_ARGUMENT_ERROR
); 
 189             // Data from previous input files has been collected in b1. 
 190             // Collect data from further input files in b2. 
 191             b2
.adoptInsteadAndCheckErrorCode(new Normalizer2DataBuilder(errorCode
), errorCode
); 
 192             diff
.adoptInsteadAndCheckErrorCode(new Normalizer2DataBuilder(errorCode
), errorCode
); 
 193             errorCode
.assertSuccess(); 
  // From here on parsed data goes to b2; give it the same -u / --fast settings
  // that b1 received above.
 194             builder 
= b2
.getAlias(); 
 195             if(options
[UNICODE_VERSION
].doesOccur
) { 
 196                 builder
->setUnicodeVersion(options
[UNICODE_VERSION
].value
); 
 198             if(options
[OPT_FAST
].doesOccur
) { 
 199                 builder
->setOptimization(Normalizer2DataBuilder::OPTIMIZE_FAST
); 
  // Build the full path as prefix + argument, then open it for reading.
 204         filename
.append(argv
[i
], errorCode
); 
 205         std::ifstream 
f(filename
.data()); 
  // Open failure is fatal (the guarding condition is not visible).
 207             fprintf(stderr
, "gennorm2 error: unable to open %s\n", filename
.data()); 
 208             exit(U_FILE_ACCESS_ERROR
); 
  // OVERRIDE_PREVIOUS is set before each file is parsed; judging by the name,
  // data in this file replaces data from earlier files -- confirm in
  // n2builder.h.
 210         builder
->setOverrideHandling(Normalizer2DataBuilder::OVERRIDE_PREVIOUS
); 
 211         parseFile(f
, *builder
); 
  // Drop the file name again so the next iteration starts from the bare prefix.
 212         filename
.truncate(pathLength
); 
  // Output selection. First branch: compute b1 minus b2 into diff and write it
  // in input-file syntax, including removals (writeRemoved=true).
  // NOTE(review): the condition opening this branch is not visible; presumably
  // it tests doMinus -- confirm.
 216         Normalizer2DataBuilder::computeDiff(*b1
, *b2
, *diff
); 
 217         diff
->writeDataFile(options
[OUTPUT_FILENAME
].value
, /* writeRemoved= */ true); 
  // --combined: write the merged data in input-file syntax.
 218     } else if(options
[WRITE_COMBINED_DATA
].doesOccur
) { 
 219         builder
->writeDataFile(options
[OUTPUT_FILENAME
].value
, /* writeRemoved= */ false); 
  // --csource: write a C source file.
 220     } else if(options
[WRITE_C_SOURCE
].doesOccur
) { 
 221         builder
->writeCSourceFile(options
[OUTPUT_FILENAME
].value
); 
  // Remaining case: write the binary data file.
 223         builder
->writeBinaryFile(options
[OUTPUT_FILENAME
].value
); 
  // Report the accumulated error code as the process result.
 226     return errorCode
.get(); 
 231 #if !UCONFIG_NO_NORMALIZATION 
 // Reads the stream `f` line by line and feeds every data line to `builder`.
 //
 // Parameters:
 //   f        already-opened input stream; consumed with std::getline().
 //   builder  receives setUnicodeVersion / setCC / removeMapping /
 //            setRoundTripMapping / setOneWayMapping calls.
 //
 // Visible line handling, dispatched on the character that follows the parsed
 // code point range:
 //   ':'  decimal ccc value, applied to every code point in the range
 //   '-'  remove the mapping of every code point in the range
 //   '='  round-trip mapping (the error text says: single code point only)
 //   '>'  one-way mapping for every code point in the range
 // Anything else is reported as an unrecognized data line. Parse failures are
 // printed to stderr; the visible failure paths call exit().
 //
 // NOTE(review): the numbers embedded at the start of many lines skip values
 // (e.g. 246 -> 252 -> 255), so this listing appears to omit lines of the
 // original function. The comments below describe only what is visible.
 233 void parseFile(std::ifstream 
&f
, Normalizer2DataBuilder 
&builder
) { 
 234     IcuToolErrorCode 
errorCode("gennorm2/parseFile()"); 
 235     std::string lineString
; 
 // First and last code point of the range parsed from the current line.
 236     uint32_t startCP
, endCP
; 
 237     while(std::getline(f
, lineString
)) { 
 238         if (lineString
.empty()) { 
 239             continue;  // skip empty lines. 
 // Two alternative ways to get a mutable pointer to the first character of
 // the line, selected by U_CPLUSPLUS_VERSION. The emptiness check above keeps
 // front()/at(0) from being applied to an empty string.
 // NOTE(review): the #else/#endif separating them is not visible here.
 241 #if (U_CPLUSPLUS_VERSION >= 11) 
 242         char *line 
= &lineString
.front(); 
 244         char *line 
= &lineString
.at(0); 
 // Locate the start of a '#' comment, if any.
 // NOTE(review): what is done with `comment`, and the condition guarding the
 // `continue` below, are not visible in this listing.
 246         char *comment
=(char *)strchr(line
, '#'); 
 252             continue;  // skip empty and comment-only lines 
 // Skips the first character of the line plus following whitespace; if the
 // text then starts with "Unicode", the remainder after further whitespace is
 // passed to the builder as the Unicode version.
 // NOTE(review): the condition selecting this branch is not visible.
 255             const char *s
=u_skipWhitespace(line
+1); 
 256             if(0==strncmp(s
, "Unicode", 7)) { 
 257                 s
=u_skipWhitespace(s
+7); 
 258                 builder
.setUnicodeVersion(s
); 
 260             continue;  // reserved syntax 
 // `delimiter` is set by the range parser through the pointer passed below.
 // NOTE(review): the start of the statement containing this call is not
 // visible, so it is unclear whether its return value is stored.
 262         const char *delimiter
; 
 264             u_parseCodePointRangeAnyTerminator(line
, &startCP
, &endCP
, &delimiter
, errorCode
); 
 265         if(errorCode
.isFailure()) { 
 266             fprintf(stderr
, "gennorm2 error: parsing code point range from %s\n", line
); 
 267             exit(errorCode
.reset()); 
 // Skip whitespace between the range and the delimiter character.
 269         delimiter
=u_skipWhitespace(delimiter
); 
 // "range : ccc" -- canonical combining class given as a decimal number.
 270         if(*delimiter
==':') { 
 271             const char *s
=u_skipWhitespace(delimiter
+1); 
 // NOTE(review): the declaration of `end` is not visible in this listing.
 273             unsigned long value
=strtoul(s
, &end
, 10); 
 // Rejected when no digits were consumed (end<=s), when anything other than
 // whitespace follows the number, or when the value is 0xff or larger.
 274             if(end
<=s 
|| *u_skipWhitespace(end
)!=0 || value
>=0xff) { 
 275                 fprintf(stderr
, "gennorm2 error: parsing ccc from %s\n", line
); 
 // Apply the ccc to every code point in the inclusive range.
 278             for(UChar32 c
=(UChar32
)startCP
; c
<=(UChar32
)endCP
; ++c
) { 
 279                 builder
.setCC(c
, (uint8_t)value
); 
 // "range -" -- remove mappings; only whitespace may follow the '-'.
 283         if(*delimiter
=='-') { 
 284             if(*u_skipWhitespace(delimiter
+1)!=0) { 
 285                 fprintf(stderr
, "gennorm2 error: parsing remove-mapping %s\n", line
); 
 288             for(UChar32 c
=(UChar32
)startCP
; c
<=(UChar32
)endCP
; ++c
) { 
 289                 builder
.removeMapping(c
); 
 // "range = mapping" or "range > mapping".
 293         if(*delimiter
=='=' || *delimiter
=='>') { 
 // The text after the delimiter is parsed into a fixed-size UTF-16 buffer;
 // its capacity is Normalizer2Impl::MAPPING_LENGTH_MASK code units.
 294             UChar uchars
[Normalizer2Impl::MAPPING_LENGTH_MASK
]; 
 295             int32_t length
=u_parseString(delimiter
+1, uchars
, UPRV_LENGTHOF(uchars
), NULL
, errorCode
); 
 296             if(errorCode
.isFailure()) { 
 297                 fprintf(stderr
, "gennorm2 error: parsing mapping string from %s\n", line
); 
 298                 exit(errorCode
.reset()); 
 // NOTE(review): presumably the (FALSE, buffer, length) constructor aliases
 // `uchars` rather than copying it -- confirm against the UnicodeString API.
 300             UnicodeString 
mapping(FALSE
, uchars
, length
); 
 // '=': round-trip mapping, set for startCP only. The message below is printed
 // for a range of more than one code point (its guard is not visible).
 301             if(*delimiter
=='=') { 
 304                             "gennorm2 error: round-trip mapping for more than 1 code point on %s\n", 
 308                 builder
.setRoundTripMapping((UChar32
)startCP
, mapping
); 
 // One-way mapping applied to every code point in the inclusive range.
 // NOTE(review): presumably the else branch for '>' -- the `else` itself is
 // not visible in this listing.
 310                 for(UChar32 c
=(UChar32
)startCP
; c
<=(UChar32
)endCP
; ++c
) { 
 311                     builder
.setOneWayMapping(c
, mapping
); 
 // None of the recognised delimiters matched.
 316         fprintf(stderr
, "gennorm2 error: unrecognized data line %s\n", line
); 
 321 #endif // !UCONFIG_NO_NORMALIZATION 
 326  * Hey, Emacs, please set the following: 
 329  * indent-tabs-mode: nil