]>
Commit | Line | Data |
---|---|---|
729e4ab9 A |
1 | /* |
2 | ******************************************************************************* | |
3 | * | |
4 | * Copyright (C) 2009-2010, International Business Machines | |
5 | * Corporation and others. All Rights Reserved. | |
6 | * | |
7 | ******************************************************************************* | |
8 | * file name: gennorm2.cpp | |
9 | * encoding: US-ASCII | |
10 | * tab size: 8 (not used) | |
11 | * indentation:4 | |
12 | * | |
13 | * created on: 2009nov25 | |
14 | * created by: Markus W. Scherer | |
15 | * | |
16 | * This program reads text files that define Unicode normalization, | |
17 | * parses them, and builds a binary data file. | |
18 | */ | |
19 | ||
20 | #include "unicode/utypes.h" | |
21 | #include "n2builder.h" | |
22 | ||
23 | #include <stdio.h> | |
24 | #include <stdlib.h> | |
25 | #include <string.h> | |
26 | #include "unicode/errorcode.h" | |
27 | #include "unicode/localpointer.h" | |
28 | #include "unicode/putil.h" | |
29 | #include "unicode/uchar.h" | |
30 | #include "unicode/unistr.h" | |
31 | #include "charstr.h" | |
32 | #include "normalizer2impl.h" | |
33 | #include "toolutil.h" | |
34 | #include "uoptions.h" | |
35 | #include "uparse.h" | |
36 | ||
37 | #if UCONFIG_NO_NORMALIZATION | |
38 | #include "unewdata.h" | |
39 | #endif | |
40 | ||
/*
 * Number of elements in a true array (not a pointer), as an int32_t.
 * The whole expansion is parenthesized so that the cast cannot combine with
 * neighbouring operators (for example a preceding sizeof or a following
 * postfix operator) at the point of use.
 */
#define LENGTHOF(array) ((int32_t)(sizeof(array)/sizeof((array)[0])))
42 | ||
43 | U_NAMESPACE_BEGIN | |
44 | ||
45 | UBool beVerbose=FALSE, haveCopyright=TRUE; | |
46 | ||
47 | U_DEFINE_LOCAL_OPEN_POINTER(LocalStdioFilePointer, FILE, fclose); | |
48 | ||
49 | #if !UCONFIG_NO_NORMALIZATION | |
50 | void parseFile(FILE *f, Normalizer2DataBuilder &builder); | |
51 | #endif | |
52 | ||
53 | /* -------------------------------------------------------------------------- */ | |
54 | ||
/*
 * Indexes into the options[] table.
 * The values are consecutive from 0 and must stay in the same order as the
 * entries of options[].
 */
enum {
    HELP_H=0,
    HELP_QUESTION_MARK=1,
    VERBOSE=2,
    COPYRIGHT=3,
    SOURCEDIR=4,
    OUTPUT_FILENAME=5,
    UNICODE_VERSION=6,
    OPT_FAST=7
};
65 | ||
66 | static UOption options[]={ | |
67 | UOPTION_HELP_H, | |
68 | UOPTION_HELP_QUESTION_MARK, | |
69 | UOPTION_VERBOSE, | |
70 | UOPTION_COPYRIGHT, | |
71 | UOPTION_SOURCEDIR, | |
72 | UOPTION_DEF("output", 'o', UOPT_REQUIRES_ARG), | |
73 | UOPTION_DEF("unicode", 'u', UOPT_REQUIRES_ARG), | |
74 | UOPTION_DEF("fast", '\1', UOPT_NO_ARG) | |
75 | }; | |
76 | ||
77 | extern "C" int | |
78 | main(int argc, char* argv[]) { | |
79 | U_MAIN_INIT_ARGS(argc, argv); | |
80 | ||
81 | /* preset then read command line options */ | |
82 | options[SOURCEDIR].value=""; | |
83 | options[UNICODE_VERSION].value=U_UNICODE_VERSION; | |
84 | argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[HELP_H]), options); | |
85 | ||
86 | /* error handling, printing usage message */ | |
87 | if(argc<0) { | |
88 | fprintf(stderr, | |
89 | "error in command line argument \"%s\"\n", | |
90 | argv[-argc]); | |
91 | } | |
92 | if(!options[OUTPUT_FILENAME].doesOccur) { | |
93 | argc=-1; | |
94 | } | |
95 | if( argc<2 || | |
96 | options[HELP_H].doesOccur || options[HELP_QUESTION_MARK].doesOccur | |
97 | ) { | |
98 | /* | |
99 | * Broken into chunks because the C89 standard says the minimum | |
100 | * required supported string length is 509 bytes. | |
101 | */ | |
102 | fprintf(stderr, | |
103 | "Usage: %s [-options] infiles+ -o outputfilename\n" | |
104 | "\n" | |
105 | "Reads the infiles with normalization data and\n" | |
106 | "creates a binary file (outputfilename) with the data.\n" | |
107 | "\n", | |
108 | argv[0]); | |
109 | fprintf(stderr, | |
110 | "Options:\n" | |
111 | "\t-h or -? or --help this usage text\n" | |
112 | "\t-v or --verbose verbose output\n" | |
113 | "\t-c or --copyright include a copyright notice\n" | |
114 | "\t-u or --unicode Unicode version, followed by the version like 5.2.0\n"); | |
115 | fprintf(stderr, | |
116 | "\t-s or --sourcedir source directory, followed by the path\n" | |
117 | "\t-o or --output output filename\n"); | |
118 | fprintf(stderr, | |
119 | "\t --fast optimize the .nrm file for fast normalization,\n" | |
120 | "\t which might increase its size (Writes fully decomposed\n" | |
121 | "\t regular mappings instead of delta mappings.\n" | |
122 | "\t You should measure the runtime speed to make sure that\n" | |
123 | "\t this is a good trade-off.)\n"); | |
124 | return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR; | |
125 | } | |
126 | ||
127 | beVerbose=options[VERBOSE].doesOccur; | |
128 | haveCopyright=options[COPYRIGHT].doesOccur; | |
129 | ||
130 | IcuToolErrorCode errorCode("gennorm2/main()"); | |
131 | ||
132 | #if UCONFIG_NO_NORMALIZATION | |
133 | ||
134 | fprintf(stderr, | |
135 | "gennorm2 writes a dummy binary data file " | |
136 | "because UCONFIG_NO_NORMALIZATION is set, \n" | |
137 | "see icu/source/common/unicode/uconfig.h\n"); | |
138 | udata_createDummy(NULL, NULL, options[OUTPUT_FILENAME].value, errorCode); | |
139 | // Should not return an error since this is the expected behaviour if UCONFIG_NO_NORMALIZATION is on. | |
140 | // return U_UNSUPPORTED_ERROR; | |
141 | return 0; | |
142 | ||
143 | #else | |
144 | ||
145 | LocalPointer<Normalizer2DataBuilder> builder(new Normalizer2DataBuilder(errorCode)); | |
146 | errorCode.assertSuccess(); | |
147 | ||
148 | builder->setUnicodeVersion(options[UNICODE_VERSION].value); | |
149 | ||
150 | if(options[OPT_FAST].doesOccur) { | |
151 | builder->setOptimization(Normalizer2DataBuilder::OPTIMIZE_FAST); | |
152 | } | |
153 | ||
154 | // prepare the filename beginning with the source dir | |
155 | CharString filename(options[SOURCEDIR].value, errorCode); | |
156 | int32_t pathLength=filename.length(); | |
157 | if( pathLength>0 && | |
158 | filename[pathLength-1]!=U_FILE_SEP_CHAR && | |
159 | filename[pathLength-1]!=U_FILE_ALT_SEP_CHAR | |
160 | ) { | |
161 | filename.append(U_FILE_SEP_CHAR, errorCode); | |
162 | pathLength=filename.length(); | |
163 | } | |
164 | ||
165 | for(int i=1; i<argc; ++i) { | |
166 | printf("gennorm2: processing %s\n", argv[i]); | |
167 | filename.append(argv[i], errorCode); | |
168 | LocalStdioFilePointer f(fopen(filename.data(), "r")); | |
169 | if(f==NULL) { | |
170 | fprintf(stderr, "gennorm2 error: unable to open %s\n", filename.data()); | |
171 | exit(U_FILE_ACCESS_ERROR); | |
172 | } | |
173 | builder->setOverrideHandling(Normalizer2DataBuilder::OVERRIDE_PREVIOUS); | |
174 | parseFile(f.getAlias(), *builder); | |
175 | filename.truncate(pathLength); | |
176 | } | |
177 | ||
178 | builder->writeBinaryFile(options[OUTPUT_FILENAME].value); | |
179 | ||
180 | return errorCode.get(); | |
181 | ||
182 | #endif | |
183 | } | |
184 | ||
185 | #if !UCONFIG_NO_NORMALIZATION | |
186 | ||
187 | void parseFile(FILE *f, Normalizer2DataBuilder &builder) { | |
188 | IcuToolErrorCode errorCode("gennorm2/parseFile()"); | |
189 | char line[300]; | |
190 | uint32_t startCP, endCP; | |
191 | while(NULL!=fgets(line, (int)sizeof(line), f)) { | |
192 | char *comment=(char *)strchr(line, '#'); | |
193 | if(comment!=NULL) { | |
194 | *comment=0; | |
195 | } | |
196 | u_rtrim(line); | |
197 | if(line[0]==0) { | |
198 | continue; // skip empty and comment-only lines | |
199 | } | |
200 | if(line[0]=='*') { | |
201 | continue; // reserved syntax | |
202 | } | |
203 | const char *delimiter; | |
204 | int32_t rangeLength= | |
205 | u_parseCodePointRangeAnyTerminator(line, &startCP, &endCP, &delimiter, errorCode); | |
206 | if(errorCode.isFailure()) { | |
207 | fprintf(stderr, "gennorm2 error: parsing code point range from %s\n", line); | |
208 | exit(errorCode.reset()); | |
209 | } | |
210 | delimiter=u_skipWhitespace(delimiter); | |
211 | if(*delimiter==':') { | |
212 | const char *s=u_skipWhitespace(delimiter+1); | |
213 | char *end; | |
214 | unsigned long value=strtoul(s, &end, 10); | |
215 | if(end<=s || *u_skipWhitespace(end)!=0 || value>=0xff) { | |
216 | fprintf(stderr, "gennorm2 error: parsing ccc from %s\n", line); | |
217 | exit(U_PARSE_ERROR); | |
218 | } | |
219 | for(UChar32 c=(UChar32)startCP; c<=(UChar32)endCP; ++c) { | |
220 | builder.setCC(c, (uint8_t)value); | |
221 | } | |
222 | continue; | |
223 | } | |
224 | if(*delimiter=='-') { | |
225 | if(*u_skipWhitespace(delimiter+1)!=0) { | |
226 | fprintf(stderr, "gennorm2 error: parsing remove-mapping %s\n", line); | |
227 | exit(U_PARSE_ERROR); | |
228 | } | |
229 | for(UChar32 c=(UChar32)startCP; c<=(UChar32)endCP; ++c) { | |
230 | builder.removeMapping(c); | |
231 | } | |
232 | continue; | |
233 | } | |
234 | if(*delimiter=='=' || *delimiter=='>') { | |
235 | UChar uchars[Normalizer2Impl::MAPPING_LENGTH_MASK]; | |
236 | int32_t length=u_parseString(delimiter+1, uchars, LENGTHOF(uchars), NULL, errorCode); | |
237 | if(errorCode.isFailure()) { | |
238 | fprintf(stderr, "gennorm2 error: parsing mapping string from %s\n", line); | |
239 | exit(errorCode.reset()); | |
240 | } | |
241 | UnicodeString mapping(FALSE, uchars, length); | |
242 | if(*delimiter=='=') { | |
243 | if(rangeLength!=1) { | |
244 | fprintf(stderr, | |
245 | "gennorm2 error: round-trip mapping for more than 1 code point on %s\n", | |
246 | line); | |
247 | exit(U_PARSE_ERROR); | |
248 | } | |
249 | builder.setRoundTripMapping((UChar32)startCP, mapping); | |
250 | } else { | |
251 | for(UChar32 c=(UChar32)startCP; c<=(UChar32)endCP; ++c) { | |
252 | builder.setOneWayMapping(c, mapping); | |
253 | } | |
254 | } | |
255 | continue; | |
256 | } | |
257 | fprintf(stderr, "gennorm2 error: unrecognized data line %s\n", line); | |
258 | exit(U_PARSE_ERROR); | |
259 | } | |
260 | } | |
261 | ||
262 | #endif // !UCONFIG_NO_NORMALIZATION | |
263 | ||
264 | U_NAMESPACE_END | |
265 | ||
266 | /* | |
267 | * Hey, Emacs, please set the following: | |
268 | * | |
269 | * Local Variables: | |
270 | * indent-tabs-mode: nil | |
271 | * End: | |
272 | * | |
273 | */ |