]> git.saurik.com Git - apple/icu.git/blame - icuSources/tools/gennorm2/gennorm2.cpp
ICU-64232.0.1.tar.gz
[apple/icu.git] / icuSources / tools / gennorm2 / gennorm2.cpp
CommitLineData
f3c0d7a5
A
1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
729e4ab9
A
3/*
4*******************************************************************************
5*
b331163b 6* Copyright (C) 2009-2014, International Business Machines
729e4ab9
A
7* Corporation and others. All Rights Reserved.
8*
9*******************************************************************************
10* file name: gennorm2.cpp
f3c0d7a5 11* encoding: UTF-8
729e4ab9
A
12* tab size: 8 (not used)
13* indentation:4
14*
15* created on: 2009nov25
16* created by: Markus W. Scherer
17*
18* This program reads text files that define Unicode normalization,
19* parses them, and builds a binary data file.
20*/
21
22#include "unicode/utypes.h"
23#include "n2builder.h"
24
0f5d89e8 25#include <fstream>
729e4ab9
A
26#include <stdio.h>
27#include <stdlib.h>
0f5d89e8 28#include <string>
729e4ab9
A
29#include <string.h>
30#include "unicode/errorcode.h"
31#include "unicode/localpointer.h"
32#include "unicode/putil.h"
33#include "unicode/uchar.h"
34#include "unicode/unistr.h"
35#include "charstr.h"
36#include "normalizer2impl.h"
37#include "toolutil.h"
38#include "uoptions.h"
39#include "uparse.h"
40
41#if UCONFIG_NO_NORMALIZATION
42#include "unewdata.h"
43#endif
44
729e4ab9
A
45U_NAMESPACE_BEGIN
46
47UBool beVerbose=FALSE, haveCopyright=TRUE;
48
729e4ab9 49#if !UCONFIG_NO_NORMALIZATION
0f5d89e8 50void parseFile(std::ifstream &f, Normalizer2DataBuilder &builder);
729e4ab9
A
51#endif
52
53/* -------------------------------------------------------------------------- */
54
// Indexes into the options[] table; main() reads options[VERBOSE],
// options[OUTPUT_FILENAME] and so on, so the enumerator order has to
// mirror the order of the table entries.
enum {
    HELP_H,                 // -h
    HELP_QUESTION_MARK,     // -?
    VERBOSE,                // -v
    COPYRIGHT,              // -c
    SOURCEDIR,              // -s
    OUTPUT_FILENAME,        // -o / --output
    UNICODE_VERSION,        // -u / --unicode
    WRITE_C_SOURCE,         // --csource
    WRITE_COMBINED_DATA,    // --combined
    OPT_FAST                // --fast
};
67
68static UOption options[]={
69 UOPTION_HELP_H,
70 UOPTION_HELP_QUESTION_MARK,
71 UOPTION_VERBOSE,
72 UOPTION_COPYRIGHT,
73 UOPTION_SOURCEDIR,
74 UOPTION_DEF("output", 'o', UOPT_REQUIRES_ARG),
75 UOPTION_DEF("unicode", 'u', UOPT_REQUIRES_ARG),
b331163b 76 UOPTION_DEF("csource", '\1', UOPT_NO_ARG),
0f5d89e8 77 UOPTION_DEF("combined", '\1', UOPT_NO_ARG),
729e4ab9
A
78 UOPTION_DEF("fast", '\1', UOPT_NO_ARG)
79};
80
81extern "C" int
82main(int argc, char* argv[]) {
83 U_MAIN_INIT_ARGS(argc, argv);
84
85 /* preset then read command line options */
86 options[SOURCEDIR].value="";
729e4ab9
A
87 argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[HELP_H]), options);
88
89 /* error handling, printing usage message */
90 if(argc<0) {
91 fprintf(stderr,
92 "error in command line argument \"%s\"\n",
93 argv[-argc]);
94 }
95 if(!options[OUTPUT_FILENAME].doesOccur) {
96 argc=-1;
97 }
98 if( argc<2 ||
99 options[HELP_H].doesOccur || options[HELP_QUESTION_MARK].doesOccur
100 ) {
729e4ab9
A
101 fprintf(stderr,
102 "Usage: %s [-options] infiles+ -o outputfilename\n"
103 "\n"
104 "Reads the infiles with normalization data and\n"
0f5d89e8
A
105 "creates a binary file, or a C source file (--csource), with the data,\n"
106 "or writes a data file with the combined data (--combined).\n"
107 "See http://userguide.icu-project.org/transforms/normalization#TOC-Data-File-Syntax\n"
108 "\n"
109 "Alternate usage: %s [-options] a.txt b.txt minus p.txt q.txt -o outputfilename\n"
110 "\n"
111 "Computes the difference of (a, b) minus (p, q) and writes the diff data\n"
112 "in input-file syntax to the outputfilename.\n"
113 "It is then possible to build (p, q, diff) to get the same data as (a, b).\n"
114 "(Useful for computing minimal incremental mapping data files.)\n"
729e4ab9 115 "\n",
0f5d89e8 116 argv[0], argv[0]);
729e4ab9
A
117 fprintf(stderr,
118 "Options:\n"
119 "\t-h or -? or --help this usage text\n"
120 "\t-v or --verbose verbose output\n"
121 "\t-c or --copyright include a copyright notice\n"
122 "\t-u or --unicode Unicode version, followed by the version like 5.2.0\n");
123 fprintf(stderr,
124 "\t-s or --sourcedir source directory, followed by the path\n"
b331163b 125 "\t-o or --output output filename\n"
0f5d89e8
A
126 "\t --csource writes a C source file with initializers\n"
127 "\t --combined writes a .txt file (input-file syntax) with the\n"
128 "\t combined data from all of the input files\n");
729e4ab9 129 fprintf(stderr,
b331163b 130 "\t --fast optimize the data for fast normalization,\n"
729e4ab9
A
131 "\t which might increase its size (Writes fully decomposed\n"
132 "\t regular mappings instead of delta mappings.\n"
133 "\t You should measure the runtime speed to make sure that\n"
134 "\t this is a good trade-off.)\n");
135 return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR;
136 }
137
138 beVerbose=options[VERBOSE].doesOccur;
139 haveCopyright=options[COPYRIGHT].doesOccur;
140
141 IcuToolErrorCode errorCode("gennorm2/main()");
142
143#if UCONFIG_NO_NORMALIZATION
144
145 fprintf(stderr,
146 "gennorm2 writes a dummy binary data file "
147 "because UCONFIG_NO_NORMALIZATION is set, \n"
148 "see icu/source/common/unicode/uconfig.h\n");
149 udata_createDummy(NULL, NULL, options[OUTPUT_FILENAME].value, errorCode);
150 // Should not return an error since this is the expected behaviour if UCONFIG_NO_NORMALIZATION is on.
151 // return U_UNSUPPORTED_ERROR;
152 return 0;
153
154#else
155
0f5d89e8
A
156 LocalPointer<Normalizer2DataBuilder> b1(new Normalizer2DataBuilder(errorCode), errorCode);
157 LocalPointer<Normalizer2DataBuilder> b2;
158 LocalPointer<Normalizer2DataBuilder> diff;
159 Normalizer2DataBuilder *builder = b1.getAlias();
729e4ab9
A
160 errorCode.assertSuccess();
161
4388f060
A
162 if(options[UNICODE_VERSION].doesOccur) {
163 builder->setUnicodeVersion(options[UNICODE_VERSION].value);
164 }
729e4ab9
A
165
166 if(options[OPT_FAST].doesOccur) {
167 builder->setOptimization(Normalizer2DataBuilder::OPTIMIZE_FAST);
168 }
169
170 // prepare the filename beginning with the source dir
171 CharString filename(options[SOURCEDIR].value, errorCode);
172 int32_t pathLength=filename.length();
173 if( pathLength>0 &&
174 filename[pathLength-1]!=U_FILE_SEP_CHAR &&
175 filename[pathLength-1]!=U_FILE_ALT_SEP_CHAR
176 ) {
177 filename.append(U_FILE_SEP_CHAR, errorCode);
178 pathLength=filename.length();
179 }
180
0f5d89e8 181 bool doMinus = false;
729e4ab9
A
182 for(int i=1; i<argc; ++i) {
183 printf("gennorm2: processing %s\n", argv[i]);
0f5d89e8
A
184 if(strcmp(argv[i], "minus") == 0) {
185 if(doMinus) {
186 fprintf(stderr, "gennorm2 error: only one 'minus' can be specified\n");
187 exit(U_ILLEGAL_ARGUMENT_ERROR);
188 }
189 // Data from previous input files has been collected in b1.
190 // Collect data from further input files in b2.
191 b2.adoptInsteadAndCheckErrorCode(new Normalizer2DataBuilder(errorCode), errorCode);
192 diff.adoptInsteadAndCheckErrorCode(new Normalizer2DataBuilder(errorCode), errorCode);
193 errorCode.assertSuccess();
194 builder = b2.getAlias();
195 if(options[UNICODE_VERSION].doesOccur) {
196 builder->setUnicodeVersion(options[UNICODE_VERSION].value);
197 }
198 if(options[OPT_FAST].doesOccur) {
199 builder->setOptimization(Normalizer2DataBuilder::OPTIMIZE_FAST);
200 }
201 doMinus = true;
202 continue;
203 }
729e4ab9 204 filename.append(argv[i], errorCode);
0f5d89e8
A
205 std::ifstream f(filename.data());
206 if(f.fail()) {
729e4ab9
A
207 fprintf(stderr, "gennorm2 error: unable to open %s\n", filename.data());
208 exit(U_FILE_ACCESS_ERROR);
209 }
210 builder->setOverrideHandling(Normalizer2DataBuilder::OVERRIDE_PREVIOUS);
0f5d89e8 211 parseFile(f, *builder);
729e4ab9
A
212 filename.truncate(pathLength);
213 }
214
0f5d89e8
A
215 if(doMinus) {
216 Normalizer2DataBuilder::computeDiff(*b1, *b2, *diff);
217 diff->writeDataFile(options[OUTPUT_FILENAME].value, /* writeRemoved= */ true);
218 } else if(options[WRITE_COMBINED_DATA].doesOccur) {
219 builder->writeDataFile(options[OUTPUT_FILENAME].value, /* writeRemoved= */ false);
220 } else if(options[WRITE_C_SOURCE].doesOccur) {
b331163b
A
221 builder->writeCSourceFile(options[OUTPUT_FILENAME].value);
222 } else {
223 builder->writeBinaryFile(options[OUTPUT_FILENAME].value);
224 }
729e4ab9
A
225
226 return errorCode.get();
227
228#endif
229}
230
231#if !UCONFIG_NO_NORMALIZATION
232
0f5d89e8 233void parseFile(std::ifstream &f, Normalizer2DataBuilder &builder) {
729e4ab9 234 IcuToolErrorCode errorCode("gennorm2/parseFile()");
0f5d89e8 235 std::string lineString;
729e4ab9 236 uint32_t startCP, endCP;
0f5d89e8
A
237 while(std::getline(f, lineString)) {
238 if (lineString.empty()) {
239 continue; // skip empty lines.
240 }
241#if (U_CPLUSPLUS_VERSION >= 11)
242 char *line = &lineString.front();
243#else
244 char *line = &lineString.at(0);
245#endif
729e4ab9
A
246 char *comment=(char *)strchr(line, '#');
247 if(comment!=NULL) {
248 *comment=0;
249 }
250 u_rtrim(line);
251 if(line[0]==0) {
252 continue; // skip empty and comment-only lines
253 }
254 if(line[0]=='*') {
4388f060
A
255 const char *s=u_skipWhitespace(line+1);
256 if(0==strncmp(s, "Unicode", 7)) {
257 s=u_skipWhitespace(s+7);
258 builder.setUnicodeVersion(s);
259 }
729e4ab9
A
260 continue; // reserved syntax
261 }
262 const char *delimiter;
263 int32_t rangeLength=
264 u_parseCodePointRangeAnyTerminator(line, &startCP, &endCP, &delimiter, errorCode);
265 if(errorCode.isFailure()) {
266 fprintf(stderr, "gennorm2 error: parsing code point range from %s\n", line);
267 exit(errorCode.reset());
268 }
3d1f044b
A
269 if (endCP >= 0xd800 && startCP <= 0xdfff) {
270 fprintf(stderr, "gennorm2 error: value or mapping for surrogate code points: %s\n",
271 line);
272 exit(U_ILLEGAL_ARGUMENT_ERROR);
273 }
729e4ab9
A
274 delimiter=u_skipWhitespace(delimiter);
275 if(*delimiter==':') {
276 const char *s=u_skipWhitespace(delimiter+1);
277 char *end;
278 unsigned long value=strtoul(s, &end, 10);
279 if(end<=s || *u_skipWhitespace(end)!=0 || value>=0xff) {
280 fprintf(stderr, "gennorm2 error: parsing ccc from %s\n", line);
281 exit(U_PARSE_ERROR);
282 }
283 for(UChar32 c=(UChar32)startCP; c<=(UChar32)endCP; ++c) {
284 builder.setCC(c, (uint8_t)value);
285 }
286 continue;
287 }
288 if(*delimiter=='-') {
289 if(*u_skipWhitespace(delimiter+1)!=0) {
290 fprintf(stderr, "gennorm2 error: parsing remove-mapping %s\n", line);
291 exit(U_PARSE_ERROR);
292 }
293 for(UChar32 c=(UChar32)startCP; c<=(UChar32)endCP; ++c) {
294 builder.removeMapping(c);
295 }
296 continue;
297 }
298 if(*delimiter=='=' || *delimiter=='>') {
299 UChar uchars[Normalizer2Impl::MAPPING_LENGTH_MASK];
b331163b 300 int32_t length=u_parseString(delimiter+1, uchars, UPRV_LENGTHOF(uchars), NULL, errorCode);
729e4ab9
A
301 if(errorCode.isFailure()) {
302 fprintf(stderr, "gennorm2 error: parsing mapping string from %s\n", line);
303 exit(errorCode.reset());
304 }
305 UnicodeString mapping(FALSE, uchars, length);
306 if(*delimiter=='=') {
307 if(rangeLength!=1) {
308 fprintf(stderr,
309 "gennorm2 error: round-trip mapping for more than 1 code point on %s\n",
310 line);
311 exit(U_PARSE_ERROR);
312 }
313 builder.setRoundTripMapping((UChar32)startCP, mapping);
314 } else {
315 for(UChar32 c=(UChar32)startCP; c<=(UChar32)endCP; ++c) {
316 builder.setOneWayMapping(c, mapping);
317 }
318 }
319 continue;
320 }
321 fprintf(stderr, "gennorm2 error: unrecognized data line %s\n", line);
322 exit(U_PARSE_ERROR);
323 }
324}
325
326#endif // !UCONFIG_NO_NORMALIZATION
327
328U_NAMESPACE_END
329
330/*
331 * Hey, Emacs, please set the following:
332 *
333 * Local Variables:
334 * indent-tabs-mode: nil
335 * End:
336 *
337 */