]> git.saurik.com Git - apple/icu.git/blob - icuSources/tools/gennorm2/gennorm2.cpp
ICU-59152.0.1.tar.gz
[apple/icu.git] / icuSources / tools / gennorm2 / gennorm2.cpp
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 *
6 * Copyright (C) 2009-2014, International Business Machines
7 * Corporation and others. All Rights Reserved.
8 *
9 *******************************************************************************
10 * file name: gennorm2.cpp
11 * encoding: UTF-8
12 * tab size: 8 (not used)
13 * indentation:4
14 *
15 * created on: 2009nov25
16 * created by: Markus W. Scherer
17 *
18 * This program reads text files that define Unicode normalization,
19 * parses them, and builds a binary data file.
20 */
21
22 #include "unicode/utypes.h"
23 #include "n2builder.h"
24
25 #include <stdio.h>
26 #include <stdlib.h>
27 #include <string.h>
28 #include "unicode/errorcode.h"
29 #include "unicode/localpointer.h"
30 #include "unicode/putil.h"
31 #include "unicode/uchar.h"
32 #include "unicode/unistr.h"
33 #include "charstr.h"
34 #include "normalizer2impl.h"
35 #include "toolutil.h"
36 #include "uoptions.h"
37 #include "uparse.h"
38
39 #if UCONFIG_NO_NORMALIZATION
40 #include "unewdata.h"
41 #endif
42
43 U_NAMESPACE_BEGIN
44
45 UBool beVerbose=FALSE, haveCopyright=TRUE;
46
47 U_DEFINE_LOCAL_OPEN_POINTER(LocalStdioFilePointer, FILE, fclose);
48
49 #if !UCONFIG_NO_NORMALIZATION
50 void parseFile(FILE *f, Normalizer2DataBuilder &builder);
51 #endif
52
53 /* -------------------------------------------------------------------------- */
54
55 enum {
56 HELP_H,
57 HELP_QUESTION_MARK,
58 VERBOSE,
59 COPYRIGHT,
60 SOURCEDIR,
61 OUTPUT_FILENAME,
62 UNICODE_VERSION,
63 WRITE_C_SOURCE,
64 OPT_FAST
65 };
66
67 static UOption options[]={
68 UOPTION_HELP_H,
69 UOPTION_HELP_QUESTION_MARK,
70 UOPTION_VERBOSE,
71 UOPTION_COPYRIGHT,
72 UOPTION_SOURCEDIR,
73 UOPTION_DEF("output", 'o', UOPT_REQUIRES_ARG),
74 UOPTION_DEF("unicode", 'u', UOPT_REQUIRES_ARG),
75 UOPTION_DEF("csource", '\1', UOPT_NO_ARG),
76 UOPTION_DEF("fast", '\1', UOPT_NO_ARG)
77 };
78
79 extern "C" int
80 main(int argc, char* argv[]) {
81 U_MAIN_INIT_ARGS(argc, argv);
82
83 /* preset then read command line options */
84 options[SOURCEDIR].value="";
85 argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[HELP_H]), options);
86
87 /* error handling, printing usage message */
88 if(argc<0) {
89 fprintf(stderr,
90 "error in command line argument \"%s\"\n",
91 argv[-argc]);
92 }
93 if(!options[OUTPUT_FILENAME].doesOccur) {
94 argc=-1;
95 }
96 if( argc<2 ||
97 options[HELP_H].doesOccur || options[HELP_QUESTION_MARK].doesOccur
98 ) {
99 /*
100 * Broken into chunks because the C89 standard says the minimum
101 * required supported string length is 509 bytes.
102 */
103 fprintf(stderr,
104 "Usage: %s [-options] infiles+ -o outputfilename\n"
105 "\n"
106 "Reads the infiles with normalization data and\n"
107 "creates a binary or C source file (outputfilename) with the data.\n"
108 "\n",
109 argv[0]);
110 fprintf(stderr,
111 "Options:\n"
112 "\t-h or -? or --help this usage text\n"
113 "\t-v or --verbose verbose output\n"
114 "\t-c or --copyright include a copyright notice\n"
115 "\t-u or --unicode Unicode version, followed by the version like 5.2.0\n");
116 fprintf(stderr,
117 "\t-s or --sourcedir source directory, followed by the path\n"
118 "\t-o or --output output filename\n"
119 "\t --csource writes a C source file with initializers\n");
120 fprintf(stderr,
121 "\t --fast optimize the data for fast normalization,\n"
122 "\t which might increase its size (Writes fully decomposed\n"
123 "\t regular mappings instead of delta mappings.\n"
124 "\t You should measure the runtime speed to make sure that\n"
125 "\t this is a good trade-off.)\n");
126 return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR;
127 }
128
129 beVerbose=options[VERBOSE].doesOccur;
130 haveCopyright=options[COPYRIGHT].doesOccur;
131
132 IcuToolErrorCode errorCode("gennorm2/main()");
133
134 #if UCONFIG_NO_NORMALIZATION
135
136 fprintf(stderr,
137 "gennorm2 writes a dummy binary data file "
138 "because UCONFIG_NO_NORMALIZATION is set, \n"
139 "see icu/source/common/unicode/uconfig.h\n");
140 udata_createDummy(NULL, NULL, options[OUTPUT_FILENAME].value, errorCode);
141 // Should not return an error since this is the expected behaviour if UCONFIG_NO_NORMALIZATION is on.
142 // return U_UNSUPPORTED_ERROR;
143 return 0;
144
145 #else
146
147 LocalPointer<Normalizer2DataBuilder> builder(new Normalizer2DataBuilder(errorCode), errorCode);
148 errorCode.assertSuccess();
149
150 if(options[UNICODE_VERSION].doesOccur) {
151 builder->setUnicodeVersion(options[UNICODE_VERSION].value);
152 }
153
154 if(options[OPT_FAST].doesOccur) {
155 builder->setOptimization(Normalizer2DataBuilder::OPTIMIZE_FAST);
156 }
157
158 // prepare the filename beginning with the source dir
159 CharString filename(options[SOURCEDIR].value, errorCode);
160 int32_t pathLength=filename.length();
161 if( pathLength>0 &&
162 filename[pathLength-1]!=U_FILE_SEP_CHAR &&
163 filename[pathLength-1]!=U_FILE_ALT_SEP_CHAR
164 ) {
165 filename.append(U_FILE_SEP_CHAR, errorCode);
166 pathLength=filename.length();
167 }
168
169 for(int i=1; i<argc; ++i) {
170 printf("gennorm2: processing %s\n", argv[i]);
171 filename.append(argv[i], errorCode);
172 LocalStdioFilePointer f(fopen(filename.data(), "r"));
173 if(f==NULL) {
174 fprintf(stderr, "gennorm2 error: unable to open %s\n", filename.data());
175 exit(U_FILE_ACCESS_ERROR);
176 }
177 builder->setOverrideHandling(Normalizer2DataBuilder::OVERRIDE_PREVIOUS);
178 parseFile(f.getAlias(), *builder);
179 filename.truncate(pathLength);
180 }
181
182 if(options[WRITE_C_SOURCE].doesOccur) {
183 builder->writeCSourceFile(options[OUTPUT_FILENAME].value);
184 } else {
185 builder->writeBinaryFile(options[OUTPUT_FILENAME].value);
186 }
187
188 return errorCode.get();
189
190 #endif
191 }
192
193 #if !UCONFIG_NO_NORMALIZATION
194
195 void parseFile(FILE *f, Normalizer2DataBuilder &builder) {
196 IcuToolErrorCode errorCode("gennorm2/parseFile()");
197 char line[300];
198 uint32_t startCP, endCP;
199 while(NULL!=fgets(line, (int)sizeof(line), f)) {
200 char *comment=(char *)strchr(line, '#');
201 if(comment!=NULL) {
202 *comment=0;
203 }
204 u_rtrim(line);
205 if(line[0]==0) {
206 continue; // skip empty and comment-only lines
207 }
208 if(line[0]=='*') {
209 const char *s=u_skipWhitespace(line+1);
210 if(0==strncmp(s, "Unicode", 7)) {
211 s=u_skipWhitespace(s+7);
212 builder.setUnicodeVersion(s);
213 }
214 continue; // reserved syntax
215 }
216 const char *delimiter;
217 int32_t rangeLength=
218 u_parseCodePointRangeAnyTerminator(line, &startCP, &endCP, &delimiter, errorCode);
219 if(errorCode.isFailure()) {
220 fprintf(stderr, "gennorm2 error: parsing code point range from %s\n", line);
221 exit(errorCode.reset());
222 }
223 delimiter=u_skipWhitespace(delimiter);
224 if(*delimiter==':') {
225 const char *s=u_skipWhitespace(delimiter+1);
226 char *end;
227 unsigned long value=strtoul(s, &end, 10);
228 if(end<=s || *u_skipWhitespace(end)!=0 || value>=0xff) {
229 fprintf(stderr, "gennorm2 error: parsing ccc from %s\n", line);
230 exit(U_PARSE_ERROR);
231 }
232 for(UChar32 c=(UChar32)startCP; c<=(UChar32)endCP; ++c) {
233 builder.setCC(c, (uint8_t)value);
234 }
235 continue;
236 }
237 if(*delimiter=='-') {
238 if(*u_skipWhitespace(delimiter+1)!=0) {
239 fprintf(stderr, "gennorm2 error: parsing remove-mapping %s\n", line);
240 exit(U_PARSE_ERROR);
241 }
242 for(UChar32 c=(UChar32)startCP; c<=(UChar32)endCP; ++c) {
243 builder.removeMapping(c);
244 }
245 continue;
246 }
247 if(*delimiter=='=' || *delimiter=='>') {
248 UChar uchars[Normalizer2Impl::MAPPING_LENGTH_MASK];
249 int32_t length=u_parseString(delimiter+1, uchars, UPRV_LENGTHOF(uchars), NULL, errorCode);
250 if(errorCode.isFailure()) {
251 fprintf(stderr, "gennorm2 error: parsing mapping string from %s\n", line);
252 exit(errorCode.reset());
253 }
254 UnicodeString mapping(FALSE, uchars, length);
255 if(*delimiter=='=') {
256 if(rangeLength!=1) {
257 fprintf(stderr,
258 "gennorm2 error: round-trip mapping for more than 1 code point on %s\n",
259 line);
260 exit(U_PARSE_ERROR);
261 }
262 builder.setRoundTripMapping((UChar32)startCP, mapping);
263 } else {
264 for(UChar32 c=(UChar32)startCP; c<=(UChar32)endCP; ++c) {
265 builder.setOneWayMapping(c, mapping);
266 }
267 }
268 continue;
269 }
270 fprintf(stderr, "gennorm2 error: unrecognized data line %s\n", line);
271 exit(U_PARSE_ERROR);
272 }
273 }
274
275 #endif // !UCONFIG_NO_NORMALIZATION
276
277 U_NAMESPACE_END
278
279 /*
280 * Hey, Emacs, please set the following:
281 *
282 * Local Variables:
283 * indent-tabs-mode: nil
284 * End:
285 *
286 */