]>
Commit | Line | Data |
---|---|---|
f3c0d7a5 A |
1 | // © 2016 and later: Unicode, Inc. and others. |
2 | // License & terms of use: http://www.unicode.org/copyright.html | |
729e4ab9 A |
3 | /* |
4 | ******************************************************************************* | |
5 | * | |
b331163b | 6 | * Copyright (C) 2009-2014, International Business Machines |
729e4ab9 A |
7 | * Corporation and others. All Rights Reserved. |
8 | * | |
9 | ******************************************************************************* | |
10 | * file name: gennorm2.cpp | |
f3c0d7a5 | 11 | * encoding: UTF-8 |
729e4ab9 A |
12 | * tab size: 8 (not used) |
13 | * indentation:4 | |
14 | * | |
15 | * created on: 2009nov25 | |
16 | * created by: Markus W. Scherer | |
17 | * | |
18 | * This program reads text files that define Unicode normalization, | |
19 | * parses them, and builds a binary data file. | |
20 | */ | |
21 | ||
22 | #include "unicode/utypes.h" | |
23 | #include "n2builder.h" | |
24 | ||
0f5d89e8 | 25 | #include <fstream> |
729e4ab9 A |
26 | #include <stdio.h> |
27 | #include <stdlib.h> | |
0f5d89e8 | 28 | #include <string> |
729e4ab9 A |
29 | #include <string.h> |
30 | #include "unicode/errorcode.h" | |
31 | #include "unicode/localpointer.h" | |
32 | #include "unicode/putil.h" | |
33 | #include "unicode/uchar.h" | |
34 | #include "unicode/unistr.h" | |
35 | #include "charstr.h" | |
36 | #include "normalizer2impl.h" | |
37 | #include "toolutil.h" | |
38 | #include "uoptions.h" | |
39 | #include "uparse.h" | |
40 | ||
41 | #if UCONFIG_NO_NORMALIZATION | |
42 | #include "unewdata.h" | |
43 | #endif | |
44 | ||
729e4ab9 A |
45 | U_NAMESPACE_BEGIN |
46 | ||
47 | UBool beVerbose=FALSE, haveCopyright=TRUE; | |
48 | ||
729e4ab9 | 49 | #if !UCONFIG_NO_NORMALIZATION |
0f5d89e8 | 50 | void parseFile(std::ifstream &f, Normalizer2DataBuilder &builder); |
729e4ab9 A |
51 | #endif |
52 | ||
53 | /* -------------------------------------------------------------------------- */ | |
54 | ||
55 | enum { | |
56 | HELP_H, | |
57 | HELP_QUESTION_MARK, | |
58 | VERBOSE, | |
59 | COPYRIGHT, | |
60 | SOURCEDIR, | |
61 | OUTPUT_FILENAME, | |
62 | UNICODE_VERSION, | |
b331163b | 63 | WRITE_C_SOURCE, |
0f5d89e8 | 64 | WRITE_COMBINED_DATA, |
729e4ab9 A |
65 | OPT_FAST |
66 | }; | |
67 | ||
68 | static UOption options[]={ | |
69 | UOPTION_HELP_H, | |
70 | UOPTION_HELP_QUESTION_MARK, | |
71 | UOPTION_VERBOSE, | |
72 | UOPTION_COPYRIGHT, | |
73 | UOPTION_SOURCEDIR, | |
74 | UOPTION_DEF("output", 'o', UOPT_REQUIRES_ARG), | |
75 | UOPTION_DEF("unicode", 'u', UOPT_REQUIRES_ARG), | |
b331163b | 76 | UOPTION_DEF("csource", '\1', UOPT_NO_ARG), |
0f5d89e8 | 77 | UOPTION_DEF("combined", '\1', UOPT_NO_ARG), |
729e4ab9 A |
78 | UOPTION_DEF("fast", '\1', UOPT_NO_ARG) |
79 | }; | |
80 | ||
81 | extern "C" int | |
82 | main(int argc, char* argv[]) { | |
83 | U_MAIN_INIT_ARGS(argc, argv); | |
84 | ||
85 | /* preset then read command line options */ | |
86 | options[SOURCEDIR].value=""; | |
729e4ab9 A |
87 | argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[HELP_H]), options); |
88 | ||
89 | /* error handling, printing usage message */ | |
90 | if(argc<0) { | |
91 | fprintf(stderr, | |
92 | "error in command line argument \"%s\"\n", | |
93 | argv[-argc]); | |
94 | } | |
95 | if(!options[OUTPUT_FILENAME].doesOccur) { | |
96 | argc=-1; | |
97 | } | |
98 | if( argc<2 || | |
99 | options[HELP_H].doesOccur || options[HELP_QUESTION_MARK].doesOccur | |
100 | ) { | |
729e4ab9 A |
101 | fprintf(stderr, |
102 | "Usage: %s [-options] infiles+ -o outputfilename\n" | |
103 | "\n" | |
104 | "Reads the infiles with normalization data and\n" | |
0f5d89e8 A |
105 | "creates a binary file, or a C source file (--csource), with the data,\n" |
106 | "or writes a data file with the combined data (--combined).\n" | |
107 | "See http://userguide.icu-project.org/transforms/normalization#TOC-Data-File-Syntax\n" | |
108 | "\n" | |
109 | "Alternate usage: %s [-options] a.txt b.txt minus p.txt q.txt -o outputfilename\n" | |
110 | "\n" | |
111 | "Computes the difference of (a, b) minus (p, q) and writes the diff data\n" | |
112 | "in input-file syntax to the outputfilename.\n" | |
113 | "It is then possible to build (p, q, diff) to get the same data as (a, b).\n" | |
114 | "(Useful for computing minimal incremental mapping data files.)\n" | |
729e4ab9 | 115 | "\n", |
0f5d89e8 | 116 | argv[0], argv[0]); |
729e4ab9 A |
117 | fprintf(stderr, |
118 | "Options:\n" | |
119 | "\t-h or -? or --help this usage text\n" | |
120 | "\t-v or --verbose verbose output\n" | |
121 | "\t-c or --copyright include a copyright notice\n" | |
122 | "\t-u or --unicode Unicode version, followed by the version like 5.2.0\n"); | |
123 | fprintf(stderr, | |
124 | "\t-s or --sourcedir source directory, followed by the path\n" | |
b331163b | 125 | "\t-o or --output output filename\n" |
0f5d89e8 A |
126 | "\t --csource writes a C source file with initializers\n" |
127 | "\t --combined writes a .txt file (input-file syntax) with the\n" | |
128 | "\t combined data from all of the input files\n"); | |
729e4ab9 | 129 | fprintf(stderr, |
b331163b | 130 | "\t --fast optimize the data for fast normalization,\n" |
729e4ab9 A |
131 | "\t which might increase its size (Writes fully decomposed\n" |
132 | "\t regular mappings instead of delta mappings.\n" | |
133 | "\t You should measure the runtime speed to make sure that\n" | |
134 | "\t this is a good trade-off.)\n"); | |
135 | return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR; | |
136 | } | |
137 | ||
138 | beVerbose=options[VERBOSE].doesOccur; | |
139 | haveCopyright=options[COPYRIGHT].doesOccur; | |
140 | ||
141 | IcuToolErrorCode errorCode("gennorm2/main()"); | |
142 | ||
143 | #if UCONFIG_NO_NORMALIZATION | |
144 | ||
145 | fprintf(stderr, | |
146 | "gennorm2 writes a dummy binary data file " | |
147 | "because UCONFIG_NO_NORMALIZATION is set, \n" | |
148 | "see icu/source/common/unicode/uconfig.h\n"); | |
149 | udata_createDummy(NULL, NULL, options[OUTPUT_FILENAME].value, errorCode); | |
150 | // Should not return an error since this is the expected behaviour if UCONFIG_NO_NORMALIZATION is on. | |
151 | // return U_UNSUPPORTED_ERROR; | |
152 | return 0; | |
153 | ||
154 | #else | |
155 | ||
0f5d89e8 A |
156 | LocalPointer<Normalizer2DataBuilder> b1(new Normalizer2DataBuilder(errorCode), errorCode); |
157 | LocalPointer<Normalizer2DataBuilder> b2; | |
158 | LocalPointer<Normalizer2DataBuilder> diff; | |
159 | Normalizer2DataBuilder *builder = b1.getAlias(); | |
729e4ab9 A |
160 | errorCode.assertSuccess(); |
161 | ||
4388f060 A |
162 | if(options[UNICODE_VERSION].doesOccur) { |
163 | builder->setUnicodeVersion(options[UNICODE_VERSION].value); | |
164 | } | |
729e4ab9 A |
165 | |
166 | if(options[OPT_FAST].doesOccur) { | |
167 | builder->setOptimization(Normalizer2DataBuilder::OPTIMIZE_FAST); | |
168 | } | |
169 | ||
170 | // prepare the filename beginning with the source dir | |
171 | CharString filename(options[SOURCEDIR].value, errorCode); | |
172 | int32_t pathLength=filename.length(); | |
173 | if( pathLength>0 && | |
174 | filename[pathLength-1]!=U_FILE_SEP_CHAR && | |
175 | filename[pathLength-1]!=U_FILE_ALT_SEP_CHAR | |
176 | ) { | |
177 | filename.append(U_FILE_SEP_CHAR, errorCode); | |
178 | pathLength=filename.length(); | |
179 | } | |
180 | ||
0f5d89e8 | 181 | bool doMinus = false; |
729e4ab9 A |
182 | for(int i=1; i<argc; ++i) { |
183 | printf("gennorm2: processing %s\n", argv[i]); | |
0f5d89e8 A |
184 | if(strcmp(argv[i], "minus") == 0) { |
185 | if(doMinus) { | |
186 | fprintf(stderr, "gennorm2 error: only one 'minus' can be specified\n"); | |
187 | exit(U_ILLEGAL_ARGUMENT_ERROR); | |
188 | } | |
189 | // Data from previous input files has been collected in b1. | |
190 | // Collect data from further input files in b2. | |
191 | b2.adoptInsteadAndCheckErrorCode(new Normalizer2DataBuilder(errorCode), errorCode); | |
192 | diff.adoptInsteadAndCheckErrorCode(new Normalizer2DataBuilder(errorCode), errorCode); | |
193 | errorCode.assertSuccess(); | |
194 | builder = b2.getAlias(); | |
195 | if(options[UNICODE_VERSION].doesOccur) { | |
196 | builder->setUnicodeVersion(options[UNICODE_VERSION].value); | |
197 | } | |
198 | if(options[OPT_FAST].doesOccur) { | |
199 | builder->setOptimization(Normalizer2DataBuilder::OPTIMIZE_FAST); | |
200 | } | |
201 | doMinus = true; | |
202 | continue; | |
203 | } | |
729e4ab9 | 204 | filename.append(argv[i], errorCode); |
0f5d89e8 A |
205 | std::ifstream f(filename.data()); |
206 | if(f.fail()) { | |
729e4ab9 A |
207 | fprintf(stderr, "gennorm2 error: unable to open %s\n", filename.data()); |
208 | exit(U_FILE_ACCESS_ERROR); | |
209 | } | |
210 | builder->setOverrideHandling(Normalizer2DataBuilder::OVERRIDE_PREVIOUS); | |
0f5d89e8 | 211 | parseFile(f, *builder); |
729e4ab9 A |
212 | filename.truncate(pathLength); |
213 | } | |
214 | ||
0f5d89e8 A |
215 | if(doMinus) { |
216 | Normalizer2DataBuilder::computeDiff(*b1, *b2, *diff); | |
217 | diff->writeDataFile(options[OUTPUT_FILENAME].value, /* writeRemoved= */ true); | |
218 | } else if(options[WRITE_COMBINED_DATA].doesOccur) { | |
219 | builder->writeDataFile(options[OUTPUT_FILENAME].value, /* writeRemoved= */ false); | |
220 | } else if(options[WRITE_C_SOURCE].doesOccur) { | |
b331163b A |
221 | builder->writeCSourceFile(options[OUTPUT_FILENAME].value); |
222 | } else { | |
223 | builder->writeBinaryFile(options[OUTPUT_FILENAME].value); | |
224 | } | |
729e4ab9 A |
225 | |
226 | return errorCode.get(); | |
227 | ||
228 | #endif | |
229 | } | |
230 | ||
231 | #if !UCONFIG_NO_NORMALIZATION | |
232 | ||
0f5d89e8 | 233 | void parseFile(std::ifstream &f, Normalizer2DataBuilder &builder) { |
729e4ab9 | 234 | IcuToolErrorCode errorCode("gennorm2/parseFile()"); |
0f5d89e8 | 235 | std::string lineString; |
729e4ab9 | 236 | uint32_t startCP, endCP; |
0f5d89e8 A |
237 | while(std::getline(f, lineString)) { |
238 | if (lineString.empty()) { | |
239 | continue; // skip empty lines. | |
240 | } | |
241 | #if (U_CPLUSPLUS_VERSION >= 11) | |
242 | char *line = &lineString.front(); | |
243 | #else | |
244 | char *line = &lineString.at(0); | |
245 | #endif | |
729e4ab9 A |
246 | char *comment=(char *)strchr(line, '#'); |
247 | if(comment!=NULL) { | |
248 | *comment=0; | |
249 | } | |
250 | u_rtrim(line); | |
251 | if(line[0]==0) { | |
252 | continue; // skip empty and comment-only lines | |
253 | } | |
254 | if(line[0]=='*') { | |
4388f060 A |
255 | const char *s=u_skipWhitespace(line+1); |
256 | if(0==strncmp(s, "Unicode", 7)) { | |
257 | s=u_skipWhitespace(s+7); | |
258 | builder.setUnicodeVersion(s); | |
259 | } | |
729e4ab9 A |
260 | continue; // reserved syntax |
261 | } | |
262 | const char *delimiter; | |
263 | int32_t rangeLength= | |
264 | u_parseCodePointRangeAnyTerminator(line, &startCP, &endCP, &delimiter, errorCode); | |
265 | if(errorCode.isFailure()) { | |
266 | fprintf(stderr, "gennorm2 error: parsing code point range from %s\n", line); | |
267 | exit(errorCode.reset()); | |
268 | } | |
3d1f044b A |
269 | if (endCP >= 0xd800 && startCP <= 0xdfff) { |
270 | fprintf(stderr, "gennorm2 error: value or mapping for surrogate code points: %s\n", | |
271 | line); | |
272 | exit(U_ILLEGAL_ARGUMENT_ERROR); | |
273 | } | |
729e4ab9 A |
274 | delimiter=u_skipWhitespace(delimiter); |
275 | if(*delimiter==':') { | |
276 | const char *s=u_skipWhitespace(delimiter+1); | |
277 | char *end; | |
278 | unsigned long value=strtoul(s, &end, 10); | |
279 | if(end<=s || *u_skipWhitespace(end)!=0 || value>=0xff) { | |
280 | fprintf(stderr, "gennorm2 error: parsing ccc from %s\n", line); | |
281 | exit(U_PARSE_ERROR); | |
282 | } | |
283 | for(UChar32 c=(UChar32)startCP; c<=(UChar32)endCP; ++c) { | |
284 | builder.setCC(c, (uint8_t)value); | |
285 | } | |
286 | continue; | |
287 | } | |
288 | if(*delimiter=='-') { | |
289 | if(*u_skipWhitespace(delimiter+1)!=0) { | |
290 | fprintf(stderr, "gennorm2 error: parsing remove-mapping %s\n", line); | |
291 | exit(U_PARSE_ERROR); | |
292 | } | |
293 | for(UChar32 c=(UChar32)startCP; c<=(UChar32)endCP; ++c) { | |
294 | builder.removeMapping(c); | |
295 | } | |
296 | continue; | |
297 | } | |
298 | if(*delimiter=='=' || *delimiter=='>') { | |
299 | UChar uchars[Normalizer2Impl::MAPPING_LENGTH_MASK]; | |
b331163b | 300 | int32_t length=u_parseString(delimiter+1, uchars, UPRV_LENGTHOF(uchars), NULL, errorCode); |
729e4ab9 A |
301 | if(errorCode.isFailure()) { |
302 | fprintf(stderr, "gennorm2 error: parsing mapping string from %s\n", line); | |
303 | exit(errorCode.reset()); | |
304 | } | |
305 | UnicodeString mapping(FALSE, uchars, length); | |
306 | if(*delimiter=='=') { | |
307 | if(rangeLength!=1) { | |
308 | fprintf(stderr, | |
309 | "gennorm2 error: round-trip mapping for more than 1 code point on %s\n", | |
310 | line); | |
311 | exit(U_PARSE_ERROR); | |
312 | } | |
313 | builder.setRoundTripMapping((UChar32)startCP, mapping); | |
314 | } else { | |
315 | for(UChar32 c=(UChar32)startCP; c<=(UChar32)endCP; ++c) { | |
316 | builder.setOneWayMapping(c, mapping); | |
317 | } | |
318 | } | |
319 | continue; | |
320 | } | |
321 | fprintf(stderr, "gennorm2 error: unrecognized data line %s\n", line); | |
322 | exit(U_PARSE_ERROR); | |
323 | } | |
324 | } | |
325 | ||
326 | #endif // !UCONFIG_NO_NORMALIZATION | |
327 | ||
328 | U_NAMESPACE_END | |
329 | ||
330 | /* | |
331 | * Hey, Emacs, please set the following: | |
332 | * | |
333 | * Local Variables: | |
334 | * indent-tabs-mode: nil | |
335 | * End: | |
336 | * | |
337 | */ |