]>
Commit | Line | Data |
---|---|---|
729e4ab9 A |
1 | /* |
2 | ******************************************************************************* | |
3 | * | |
b331163b | 4 | * Copyright (C) 2009-2014, International Business Machines |
729e4ab9 A |
5 | * Corporation and others. All Rights Reserved. |
6 | * | |
7 | ******************************************************************************* | |
8 | * file name: gennorm2.cpp | |
9 | * encoding: US-ASCII | |
10 | * tab size: 8 (not used) | |
11 | * indentation:4 | |
12 | * | |
13 | * created on: 2009nov25 | |
14 | * created by: Markus W. Scherer | |
15 | * | |
16 | * This program reads text files that define Unicode normalization, | |
17 | * parses them, and builds a binary data file. | |
18 | */ | |
19 | ||
20 | #include "unicode/utypes.h" | |
21 | #include "n2builder.h" | |
22 | ||
23 | #include <stdio.h> | |
24 | #include <stdlib.h> | |
25 | #include <string.h> | |
26 | #include "unicode/errorcode.h" | |
27 | #include "unicode/localpointer.h" | |
28 | #include "unicode/putil.h" | |
29 | #include "unicode/uchar.h" | |
30 | #include "unicode/unistr.h" | |
31 | #include "charstr.h" | |
32 | #include "normalizer2impl.h" | |
33 | #include "toolutil.h" | |
34 | #include "uoptions.h" | |
35 | #include "uparse.h" | |
36 | ||
37 | #if UCONFIG_NO_NORMALIZATION | |
38 | #include "unewdata.h" | |
39 | #endif | |
40 | ||
729e4ab9 A |
41 | U_NAMESPACE_BEGIN |
42 | ||
43 | UBool beVerbose=FALSE, haveCopyright=TRUE; | |
44 | ||
45 | U_DEFINE_LOCAL_OPEN_POINTER(LocalStdioFilePointer, FILE, fclose); | |
46 | ||
47 | #if !UCONFIG_NO_NORMALIZATION | |
48 | void parseFile(FILE *f, Normalizer2DataBuilder &builder); | |
49 | #endif | |
50 | ||
51 | /* -------------------------------------------------------------------------- */ | |
52 | ||
53 | enum { | |
54 | HELP_H, | |
55 | HELP_QUESTION_MARK, | |
56 | VERBOSE, | |
57 | COPYRIGHT, | |
58 | SOURCEDIR, | |
59 | OUTPUT_FILENAME, | |
60 | UNICODE_VERSION, | |
b331163b | 61 | WRITE_C_SOURCE, |
729e4ab9 A |
62 | OPT_FAST |
63 | }; | |
64 | ||
65 | static UOption options[]={ | |
66 | UOPTION_HELP_H, | |
67 | UOPTION_HELP_QUESTION_MARK, | |
68 | UOPTION_VERBOSE, | |
69 | UOPTION_COPYRIGHT, | |
70 | UOPTION_SOURCEDIR, | |
71 | UOPTION_DEF("output", 'o', UOPT_REQUIRES_ARG), | |
72 | UOPTION_DEF("unicode", 'u', UOPT_REQUIRES_ARG), | |
b331163b | 73 | UOPTION_DEF("csource", '\1', UOPT_NO_ARG), |
729e4ab9 A |
74 | UOPTION_DEF("fast", '\1', UOPT_NO_ARG) |
75 | }; | |
76 | ||
77 | extern "C" int | |
78 | main(int argc, char* argv[]) { | |
79 | U_MAIN_INIT_ARGS(argc, argv); | |
80 | ||
81 | /* preset then read command line options */ | |
82 | options[SOURCEDIR].value=""; | |
729e4ab9 A |
83 | argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[HELP_H]), options); |
84 | ||
85 | /* error handling, printing usage message */ | |
86 | if(argc<0) { | |
87 | fprintf(stderr, | |
88 | "error in command line argument \"%s\"\n", | |
89 | argv[-argc]); | |
90 | } | |
91 | if(!options[OUTPUT_FILENAME].doesOccur) { | |
92 | argc=-1; | |
93 | } | |
94 | if( argc<2 || | |
95 | options[HELP_H].doesOccur || options[HELP_QUESTION_MARK].doesOccur | |
96 | ) { | |
97 | /* | |
98 | * Broken into chunks because the C89 standard says the minimum | |
99 | * required supported string length is 509 bytes. | |
100 | */ | |
101 | fprintf(stderr, | |
102 | "Usage: %s [-options] infiles+ -o outputfilename\n" | |
103 | "\n" | |
104 | "Reads the infiles with normalization data and\n" | |
b331163b | 105 | "creates a binary or C source file (outputfilename) with the data.\n" |
729e4ab9 A |
106 | "\n", |
107 | argv[0]); | |
108 | fprintf(stderr, | |
109 | "Options:\n" | |
110 | "\t-h or -? or --help this usage text\n" | |
111 | "\t-v or --verbose verbose output\n" | |
112 | "\t-c or --copyright include a copyright notice\n" | |
113 | "\t-u or --unicode Unicode version, followed by the version like 5.2.0\n"); | |
114 | fprintf(stderr, | |
115 | "\t-s or --sourcedir source directory, followed by the path\n" | |
b331163b A |
116 | "\t-o or --output output filename\n" |
117 | "\t --csource writes a C source file with initializers\n"); | |
729e4ab9 | 118 | fprintf(stderr, |
b331163b | 119 | "\t --fast optimize the data for fast normalization,\n" |
729e4ab9 A |
120 | "\t which might increase its size (Writes fully decomposed\n" |
121 | "\t regular mappings instead of delta mappings.\n" | |
122 | "\t You should measure the runtime speed to make sure that\n" | |
123 | "\t this is a good trade-off.)\n"); | |
124 | return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR; | |
125 | } | |
126 | ||
127 | beVerbose=options[VERBOSE].doesOccur; | |
128 | haveCopyright=options[COPYRIGHT].doesOccur; | |
129 | ||
130 | IcuToolErrorCode errorCode("gennorm2/main()"); | |
131 | ||
132 | #if UCONFIG_NO_NORMALIZATION | |
133 | ||
134 | fprintf(stderr, | |
135 | "gennorm2 writes a dummy binary data file " | |
136 | "because UCONFIG_NO_NORMALIZATION is set, \n" | |
137 | "see icu/source/common/unicode/uconfig.h\n"); | |
138 | udata_createDummy(NULL, NULL, options[OUTPUT_FILENAME].value, errorCode); | |
139 | // Should not return an error since this is the expected behaviour if UCONFIG_NO_NORMALIZATION is on. | |
140 | // return U_UNSUPPORTED_ERROR; | |
141 | return 0; | |
142 | ||
143 | #else | |
144 | ||
b331163b | 145 | LocalPointer<Normalizer2DataBuilder> builder(new Normalizer2DataBuilder(errorCode), errorCode); |
729e4ab9 A |
146 | errorCode.assertSuccess(); |
147 | ||
4388f060 A |
148 | if(options[UNICODE_VERSION].doesOccur) { |
149 | builder->setUnicodeVersion(options[UNICODE_VERSION].value); | |
150 | } | |
729e4ab9 A |
151 | |
152 | if(options[OPT_FAST].doesOccur) { | |
153 | builder->setOptimization(Normalizer2DataBuilder::OPTIMIZE_FAST); | |
154 | } | |
155 | ||
156 | // prepare the filename beginning with the source dir | |
157 | CharString filename(options[SOURCEDIR].value, errorCode); | |
158 | int32_t pathLength=filename.length(); | |
159 | if( pathLength>0 && | |
160 | filename[pathLength-1]!=U_FILE_SEP_CHAR && | |
161 | filename[pathLength-1]!=U_FILE_ALT_SEP_CHAR | |
162 | ) { | |
163 | filename.append(U_FILE_SEP_CHAR, errorCode); | |
164 | pathLength=filename.length(); | |
165 | } | |
166 | ||
167 | for(int i=1; i<argc; ++i) { | |
168 | printf("gennorm2: processing %s\n", argv[i]); | |
169 | filename.append(argv[i], errorCode); | |
170 | LocalStdioFilePointer f(fopen(filename.data(), "r")); | |
171 | if(f==NULL) { | |
172 | fprintf(stderr, "gennorm2 error: unable to open %s\n", filename.data()); | |
173 | exit(U_FILE_ACCESS_ERROR); | |
174 | } | |
175 | builder->setOverrideHandling(Normalizer2DataBuilder::OVERRIDE_PREVIOUS); | |
176 | parseFile(f.getAlias(), *builder); | |
177 | filename.truncate(pathLength); | |
178 | } | |
179 | ||
b331163b A |
180 | if(options[WRITE_C_SOURCE].doesOccur) { |
181 | builder->writeCSourceFile(options[OUTPUT_FILENAME].value); | |
182 | } else { | |
183 | builder->writeBinaryFile(options[OUTPUT_FILENAME].value); | |
184 | } | |
729e4ab9 A |
185 | |
186 | return errorCode.get(); | |
187 | ||
188 | #endif | |
189 | } | |
190 | ||
191 | #if !UCONFIG_NO_NORMALIZATION | |
192 | ||
193 | void parseFile(FILE *f, Normalizer2DataBuilder &builder) { | |
194 | IcuToolErrorCode errorCode("gennorm2/parseFile()"); | |
195 | char line[300]; | |
196 | uint32_t startCP, endCP; | |
197 | while(NULL!=fgets(line, (int)sizeof(line), f)) { | |
198 | char *comment=(char *)strchr(line, '#'); | |
199 | if(comment!=NULL) { | |
200 | *comment=0; | |
201 | } | |
202 | u_rtrim(line); | |
203 | if(line[0]==0) { | |
204 | continue; // skip empty and comment-only lines | |
205 | } | |
206 | if(line[0]=='*') { | |
4388f060 A |
207 | const char *s=u_skipWhitespace(line+1); |
208 | if(0==strncmp(s, "Unicode", 7)) { | |
209 | s=u_skipWhitespace(s+7); | |
210 | builder.setUnicodeVersion(s); | |
211 | } | |
729e4ab9 A |
212 | continue; // reserved syntax |
213 | } | |
214 | const char *delimiter; | |
215 | int32_t rangeLength= | |
216 | u_parseCodePointRangeAnyTerminator(line, &startCP, &endCP, &delimiter, errorCode); | |
217 | if(errorCode.isFailure()) { | |
218 | fprintf(stderr, "gennorm2 error: parsing code point range from %s\n", line); | |
219 | exit(errorCode.reset()); | |
220 | } | |
221 | delimiter=u_skipWhitespace(delimiter); | |
222 | if(*delimiter==':') { | |
223 | const char *s=u_skipWhitespace(delimiter+1); | |
224 | char *end; | |
225 | unsigned long value=strtoul(s, &end, 10); | |
226 | if(end<=s || *u_skipWhitespace(end)!=0 || value>=0xff) { | |
227 | fprintf(stderr, "gennorm2 error: parsing ccc from %s\n", line); | |
228 | exit(U_PARSE_ERROR); | |
229 | } | |
230 | for(UChar32 c=(UChar32)startCP; c<=(UChar32)endCP; ++c) { | |
231 | builder.setCC(c, (uint8_t)value); | |
232 | } | |
233 | continue; | |
234 | } | |
235 | if(*delimiter=='-') { | |
236 | if(*u_skipWhitespace(delimiter+1)!=0) { | |
237 | fprintf(stderr, "gennorm2 error: parsing remove-mapping %s\n", line); | |
238 | exit(U_PARSE_ERROR); | |
239 | } | |
240 | for(UChar32 c=(UChar32)startCP; c<=(UChar32)endCP; ++c) { | |
241 | builder.removeMapping(c); | |
242 | } | |
243 | continue; | |
244 | } | |
245 | if(*delimiter=='=' || *delimiter=='>') { | |
246 | UChar uchars[Normalizer2Impl::MAPPING_LENGTH_MASK]; | |
b331163b | 247 | int32_t length=u_parseString(delimiter+1, uchars, UPRV_LENGTHOF(uchars), NULL, errorCode); |
729e4ab9 A |
248 | if(errorCode.isFailure()) { |
249 | fprintf(stderr, "gennorm2 error: parsing mapping string from %s\n", line); | |
250 | exit(errorCode.reset()); | |
251 | } | |
252 | UnicodeString mapping(FALSE, uchars, length); | |
253 | if(*delimiter=='=') { | |
254 | if(rangeLength!=1) { | |
255 | fprintf(stderr, | |
256 | "gennorm2 error: round-trip mapping for more than 1 code point on %s\n", | |
257 | line); | |
258 | exit(U_PARSE_ERROR); | |
259 | } | |
260 | builder.setRoundTripMapping((UChar32)startCP, mapping); | |
261 | } else { | |
262 | for(UChar32 c=(UChar32)startCP; c<=(UChar32)endCP; ++c) { | |
263 | builder.setOneWayMapping(c, mapping); | |
264 | } | |
265 | } | |
266 | continue; | |
267 | } | |
268 | fprintf(stderr, "gennorm2 error: unrecognized data line %s\n", line); | |
269 | exit(U_PARSE_ERROR); | |
270 | } | |
271 | } | |
272 | ||
273 | #endif // !UCONFIG_NO_NORMALIZATION | |
274 | ||
275 | U_NAMESPACE_END | |
276 | ||
277 | /* | |
278 | * Hey, Emacs, please set the following: | |
279 | * | |
280 | * Local Variables: | |
281 | * indent-tabs-mode: nil | |
282 | * End: | |
283 | * | |
284 | */ |