]>
Commit | Line | Data |
---|---|---|
729e4ab9 A |
1 | /* |
2 | ******************************************************************************* | |
3 | * | |
4388f060 | 4 | * Copyright (C) 2009-2012, International Business Machines |
729e4ab9 A |
5 | * Corporation and others. All Rights Reserved. |
6 | * | |
7 | ******************************************************************************* | |
8 | * file name: gennorm2.cpp | |
9 | * encoding: US-ASCII | |
10 | * tab size: 8 (not used) | |
11 | * indentation:4 | |
12 | * | |
13 | * created on: 2009nov25 | |
14 | * created by: Markus W. Scherer | |
15 | * | |
16 | * This program reads text files that define Unicode normalization, | |
17 | * parses them, and builds a binary data file. | |
18 | */ | |
19 | ||
20 | #include "unicode/utypes.h" | |
21 | #include "n2builder.h" | |
22 | ||
23 | #include <stdio.h> | |
24 | #include <stdlib.h> | |
25 | #include <string.h> | |
26 | #include "unicode/errorcode.h" | |
27 | #include "unicode/localpointer.h" | |
28 | #include "unicode/putil.h" | |
29 | #include "unicode/uchar.h" | |
30 | #include "unicode/unistr.h" | |
31 | #include "charstr.h" | |
32 | #include "normalizer2impl.h" | |
33 | #include "toolutil.h" | |
34 | #include "uoptions.h" | |
35 | #include "uparse.h" | |
36 | ||
37 | #if UCONFIG_NO_NORMALIZATION | |
38 | #include "unewdata.h" | |
39 | #endif | |
40 | ||
/*
 * Number of elements of a true array (not a pointer), as an int32_t.
 * The whole expansion is parenthesized so that it behaves as a single
 * operand wherever it is used.
 */
#define LENGTHOF(array) ((int32_t)(sizeof(array)/sizeof((array)[0])))
42 | ||
43 | U_NAMESPACE_BEGIN | |
44 | ||
45 | UBool beVerbose=FALSE, haveCopyright=TRUE; | |
46 | ||
47 | U_DEFINE_LOCAL_OPEN_POINTER(LocalStdioFilePointer, FILE, fclose); | |
48 | ||
49 | #if !UCONFIG_NO_NORMALIZATION | |
50 | void parseFile(FILE *f, Normalizer2DataBuilder &builder); | |
51 | #endif | |
52 | ||
53 | /* -------------------------------------------------------------------------- */ | |
54 | ||
// Indexes into the options[] array; keep in the same order as its initializers.
enum {
    HELP_H=0,               // -h
    HELP_QUESTION_MARK=1,   // -?
    VERBOSE=2,              // -v
    COPYRIGHT=3,            // -c
    SOURCEDIR=4,            // -s
    OUTPUT_FILENAME=5,      // -o
    UNICODE_VERSION=6,      // -u
    OPT_FAST=7              // --fast
};
65 | ||
// Command-line option definitions, handed to u_parseArgs() in main().
// main() looks entries up by the option-index enum constants
// (options[SOURCEDIR], options[OUTPUT_FILENAME], ...), so the order of the
// initializers must match the order of those constants.
static UOption options[]={
    UOPTION_HELP_H,
    UOPTION_HELP_QUESTION_MARK,
    UOPTION_VERBOSE,
    UOPTION_COPYRIGHT,
    UOPTION_SOURCEDIR,
    UOPTION_DEF("output", 'o', UOPT_REQUIRES_ARG),   // output filename (required by main())
    UOPTION_DEF("unicode", 'u', UOPT_REQUIRES_ARG),  // Unicode version string
    // NOTE(review): '\1' presumably means "no short option"; the usage text
    // lists only the long form --fast. Confirm against uoptions.h.
    UOPTION_DEF("fast", '\1', UOPT_NO_ARG)
};
76 | ||
77 | extern "C" int | |
78 | main(int argc, char* argv[]) { | |
79 | U_MAIN_INIT_ARGS(argc, argv); | |
80 | ||
81 | /* preset then read command line options */ | |
82 | options[SOURCEDIR].value=""; | |
729e4ab9 A |
83 | argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[HELP_H]), options); |
84 | ||
85 | /* error handling, printing usage message */ | |
86 | if(argc<0) { | |
87 | fprintf(stderr, | |
88 | "error in command line argument \"%s\"\n", | |
89 | argv[-argc]); | |
90 | } | |
91 | if(!options[OUTPUT_FILENAME].doesOccur) { | |
92 | argc=-1; | |
93 | } | |
94 | if( argc<2 || | |
95 | options[HELP_H].doesOccur || options[HELP_QUESTION_MARK].doesOccur | |
96 | ) { | |
97 | /* | |
98 | * Broken into chunks because the C89 standard says the minimum | |
99 | * required supported string length is 509 bytes. | |
100 | */ | |
101 | fprintf(stderr, | |
102 | "Usage: %s [-options] infiles+ -o outputfilename\n" | |
103 | "\n" | |
104 | "Reads the infiles with normalization data and\n" | |
105 | "creates a binary file (outputfilename) with the data.\n" | |
106 | "\n", | |
107 | argv[0]); | |
108 | fprintf(stderr, | |
109 | "Options:\n" | |
110 | "\t-h or -? or --help this usage text\n" | |
111 | "\t-v or --verbose verbose output\n" | |
112 | "\t-c or --copyright include a copyright notice\n" | |
113 | "\t-u or --unicode Unicode version, followed by the version like 5.2.0\n"); | |
114 | fprintf(stderr, | |
115 | "\t-s or --sourcedir source directory, followed by the path\n" | |
116 | "\t-o or --output output filename\n"); | |
117 | fprintf(stderr, | |
118 | "\t --fast optimize the .nrm file for fast normalization,\n" | |
119 | "\t which might increase its size (Writes fully decomposed\n" | |
120 | "\t regular mappings instead of delta mappings.\n" | |
121 | "\t You should measure the runtime speed to make sure that\n" | |
122 | "\t this is a good trade-off.)\n"); | |
123 | return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR; | |
124 | } | |
125 | ||
126 | beVerbose=options[VERBOSE].doesOccur; | |
127 | haveCopyright=options[COPYRIGHT].doesOccur; | |
128 | ||
129 | IcuToolErrorCode errorCode("gennorm2/main()"); | |
130 | ||
131 | #if UCONFIG_NO_NORMALIZATION | |
132 | ||
133 | fprintf(stderr, | |
134 | "gennorm2 writes a dummy binary data file " | |
135 | "because UCONFIG_NO_NORMALIZATION is set, \n" | |
136 | "see icu/source/common/unicode/uconfig.h\n"); | |
137 | udata_createDummy(NULL, NULL, options[OUTPUT_FILENAME].value, errorCode); | |
138 | // Should not return an error since this is the expected behaviour if UCONFIG_NO_NORMALIZATION is on. | |
139 | // return U_UNSUPPORTED_ERROR; | |
140 | return 0; | |
141 | ||
142 | #else | |
143 | ||
144 | LocalPointer<Normalizer2DataBuilder> builder(new Normalizer2DataBuilder(errorCode)); | |
145 | errorCode.assertSuccess(); | |
146 | ||
4388f060 A |
147 | if(options[UNICODE_VERSION].doesOccur) { |
148 | builder->setUnicodeVersion(options[UNICODE_VERSION].value); | |
149 | } | |
729e4ab9 A |
150 | |
151 | if(options[OPT_FAST].doesOccur) { | |
152 | builder->setOptimization(Normalizer2DataBuilder::OPTIMIZE_FAST); | |
153 | } | |
154 | ||
155 | // prepare the filename beginning with the source dir | |
156 | CharString filename(options[SOURCEDIR].value, errorCode); | |
157 | int32_t pathLength=filename.length(); | |
158 | if( pathLength>0 && | |
159 | filename[pathLength-1]!=U_FILE_SEP_CHAR && | |
160 | filename[pathLength-1]!=U_FILE_ALT_SEP_CHAR | |
161 | ) { | |
162 | filename.append(U_FILE_SEP_CHAR, errorCode); | |
163 | pathLength=filename.length(); | |
164 | } | |
165 | ||
166 | for(int i=1; i<argc; ++i) { | |
167 | printf("gennorm2: processing %s\n", argv[i]); | |
168 | filename.append(argv[i], errorCode); | |
169 | LocalStdioFilePointer f(fopen(filename.data(), "r")); | |
170 | if(f==NULL) { | |
171 | fprintf(stderr, "gennorm2 error: unable to open %s\n", filename.data()); | |
172 | exit(U_FILE_ACCESS_ERROR); | |
173 | } | |
174 | builder->setOverrideHandling(Normalizer2DataBuilder::OVERRIDE_PREVIOUS); | |
175 | parseFile(f.getAlias(), *builder); | |
176 | filename.truncate(pathLength); | |
177 | } | |
178 | ||
179 | builder->writeBinaryFile(options[OUTPUT_FILENAME].value); | |
180 | ||
181 | return errorCode.get(); | |
182 | ||
183 | #endif | |
184 | } | |
185 | ||
186 | #if !UCONFIG_NO_NORMALIZATION | |
187 | ||
188 | void parseFile(FILE *f, Normalizer2DataBuilder &builder) { | |
189 | IcuToolErrorCode errorCode("gennorm2/parseFile()"); | |
190 | char line[300]; | |
191 | uint32_t startCP, endCP; | |
192 | while(NULL!=fgets(line, (int)sizeof(line), f)) { | |
193 | char *comment=(char *)strchr(line, '#'); | |
194 | if(comment!=NULL) { | |
195 | *comment=0; | |
196 | } | |
197 | u_rtrim(line); | |
198 | if(line[0]==0) { | |
199 | continue; // skip empty and comment-only lines | |
200 | } | |
201 | if(line[0]=='*') { | |
4388f060 A |
202 | const char *s=u_skipWhitespace(line+1); |
203 | if(0==strncmp(s, "Unicode", 7)) { | |
204 | s=u_skipWhitespace(s+7); | |
205 | builder.setUnicodeVersion(s); | |
206 | } | |
729e4ab9 A |
207 | continue; // reserved syntax |
208 | } | |
209 | const char *delimiter; | |
210 | int32_t rangeLength= | |
211 | u_parseCodePointRangeAnyTerminator(line, &startCP, &endCP, &delimiter, errorCode); | |
212 | if(errorCode.isFailure()) { | |
213 | fprintf(stderr, "gennorm2 error: parsing code point range from %s\n", line); | |
214 | exit(errorCode.reset()); | |
215 | } | |
216 | delimiter=u_skipWhitespace(delimiter); | |
217 | if(*delimiter==':') { | |
218 | const char *s=u_skipWhitespace(delimiter+1); | |
219 | char *end; | |
220 | unsigned long value=strtoul(s, &end, 10); | |
221 | if(end<=s || *u_skipWhitespace(end)!=0 || value>=0xff) { | |
222 | fprintf(stderr, "gennorm2 error: parsing ccc from %s\n", line); | |
223 | exit(U_PARSE_ERROR); | |
224 | } | |
225 | for(UChar32 c=(UChar32)startCP; c<=(UChar32)endCP; ++c) { | |
226 | builder.setCC(c, (uint8_t)value); | |
227 | } | |
228 | continue; | |
229 | } | |
230 | if(*delimiter=='-') { | |
231 | if(*u_skipWhitespace(delimiter+1)!=0) { | |
232 | fprintf(stderr, "gennorm2 error: parsing remove-mapping %s\n", line); | |
233 | exit(U_PARSE_ERROR); | |
234 | } | |
235 | for(UChar32 c=(UChar32)startCP; c<=(UChar32)endCP; ++c) { | |
236 | builder.removeMapping(c); | |
237 | } | |
238 | continue; | |
239 | } | |
240 | if(*delimiter=='=' || *delimiter=='>') { | |
241 | UChar uchars[Normalizer2Impl::MAPPING_LENGTH_MASK]; | |
242 | int32_t length=u_parseString(delimiter+1, uchars, LENGTHOF(uchars), NULL, errorCode); | |
243 | if(errorCode.isFailure()) { | |
244 | fprintf(stderr, "gennorm2 error: parsing mapping string from %s\n", line); | |
245 | exit(errorCode.reset()); | |
246 | } | |
247 | UnicodeString mapping(FALSE, uchars, length); | |
248 | if(*delimiter=='=') { | |
249 | if(rangeLength!=1) { | |
250 | fprintf(stderr, | |
251 | "gennorm2 error: round-trip mapping for more than 1 code point on %s\n", | |
252 | line); | |
253 | exit(U_PARSE_ERROR); | |
254 | } | |
255 | builder.setRoundTripMapping((UChar32)startCP, mapping); | |
256 | } else { | |
257 | for(UChar32 c=(UChar32)startCP; c<=(UChar32)endCP; ++c) { | |
258 | builder.setOneWayMapping(c, mapping); | |
259 | } | |
260 | } | |
261 | continue; | |
262 | } | |
263 | fprintf(stderr, "gennorm2 error: unrecognized data line %s\n", line); | |
264 | exit(U_PARSE_ERROR); | |
265 | } | |
266 | } | |
267 | ||
268 | #endif // !UCONFIG_NO_NORMALIZATION | |
269 | ||
270 | U_NAMESPACE_END | |
271 | ||
272 | /* | |
273 | * Hey, Emacs, please set the following: | |
274 | * | |
275 | * Local Variables: | |
276 | * indent-tabs-mode: nil | |
277 | * End: | |
278 | * | |
279 | */ |