]>
Commit | Line | Data |
---|---|---|
1 | // © 2016 and later: Unicode, Inc. and others. | |
2 | // License & terms of use: http://www.unicode.org/copyright.html | |
3 | /* | |
4 | ******************************************************************************* | |
5 | * | |
6 | * Copyright (C) 2003-2016, International Business Machines | |
7 | * Corporation and others. All Rights Reserved. | |
8 | * | |
9 | ******************************************************************************* | |
10 | * file name: gensprep.c | |
11 | * encoding: UTF-8 | |
12 | * tab size: 8 (not used) | |
13 | * indentation:4 | |
14 | * | |
15 | * created on: 2003-02-06 | |
16 | * created by: Ram Viswanadha | |
17 | * | |
18 | * This program reads the Profile.txt files, | |
19 | * parses them, and extracts the data for StringPrep profile. | |
20 | * It then preprocesses it and writes a binary file for efficient use | |
21 | * in various StringPrep conversion processes. | |
22 | */ | |
23 | ||
24 | #define USPREP_TYPE_NAMES_ARRAY 1 | |
25 | ||
26 | #include <stdio.h> | |
27 | #include <stdlib.h> | |
28 | ||
29 | #include "cmemory.h" | |
30 | #include "cstring.h" | |
31 | #include "unewdata.h" | |
32 | #include "uoptions.h" | |
33 | #include "uparse.h" | |
34 | #include "sprpimpl.h" | |
35 | ||
36 | #include "unicode/uclean.h" | |
37 | #include "unicode/udata.h" | |
38 | #include "unicode/utypes.h" | |
39 | #include "unicode/putil.h" | |
40 | ||
41 | ||
42 | U_CDECL_BEGIN | |
43 | #include "gensprep.h" | |
44 | U_CDECL_END | |
45 | ||
/*
 * Global flags, assigned in main() from the -v/--verbose and -c/--copyright
 * command-line options. They are not static, so they are presumably read by
 * other gensprep translation units -- TODO confirm against gensprep.h.
 */
UBool beVerbose=FALSE, haveCopyright=TRUE;

/* File name that main() appends to the normalization-corrections directory. */
#define NORM_CORRECTIONS_FILE_NAME "NormalizationCorrections.txt"

/*
 * Directives that may follow a leading '@' in field 0 of a profile line.
 * Each *_LEN is the length of the directive text without the terminating NUL
 * and is used for the prefix comparison in strprepProfileLineFn().
 */
#define NORMALIZE_DIRECTIVE "normalize"
#define NORMALIZE_DIRECTIVE_LEN 9
#define CHECK_BIDI_DIRECTIVE "check-bidi"
#define CHECK_BIDI_DIRECTIVE_LEN 10
54 | ||
55 | /* prototypes --------------------------------------------------------------- */ | |
56 | ||
/*
 * Reads a StringPrep profile file line by line and stores the ranges and
 * mappings it contains. reportError selects whether a U_FILE_ACCESS_ERROR is
 * fatal here (TRUE) or left in *pErrorCode for the caller (FALSE).
 */
static void
parseMappings(const char *filename, UBool reportError, UErrorCode *pErrorCode);

/*
 * Reads NormalizationCorrections.txt and stores the mappings of the
 * corrections that are newer than Unicode 3.2.
 */
static void
parseNormalizationCorrections(const char *filename, UErrorCode *pErrorCode);
62 | ||
63 | ||
64 | /* -------------------------------------------------------------------------- */ | |
65 | ||
/*
 * Command-line option table handed to u_parseArgs() in main().
 *
 * The entries must stay in exactly the same order as the enum that follows:
 * the enum constants are used as indexes into this array, for example
 * options[DESTDIR] and options[UNICODE_VERSION].
 */
static UOption options[]={
    UOPTION_HELP_H,
    UOPTION_HELP_QUESTION_MARK,
    UOPTION_VERBOSE,
    UOPTION_COPYRIGHT,
    UOPTION_DESTDIR,
    UOPTION_SOURCEDIR,
    UOPTION_ICUDATADIR,
    UOPTION_BUNDLE_NAME,
    /* -n: directory of NormalizationCorrections.txt; main() enables normalization when it occurs */
    { "normalization", NULL, NULL, NULL, 'n', UOPT_REQUIRES_ARG, 0 },
    /* -m: directory used by main() when -n was not given */
    { "norm-correction", NULL, NULL, NULL, 'm', UOPT_REQUIRES_ARG, 0 },
    /* -k: main() sets _SPREP_CHECK_BIDI_ON when it occurs */
    { "check-bidi", NULL, NULL, NULL, 'k', UOPT_NO_ARG, 0},
    /* -u: Unicode version string passed to setUnicodeVersion(); main() prints help without it */
    { "unicode", NULL, NULL, NULL, 'u', UOPT_REQUIRES_ARG, 0 },
};

/* Indexes into options[]; keep in sync with the table above. */
enum{
    HELP,
    HELP_QUESTION_MARK,
    VERBOSE,
    COPYRIGHT,
    DESTDIR,
    SOURCEDIR,
    ICUDATADIR,
    BUNDLE_NAME,
    NORMALIZE,
    NORM_CORRECTION_DIR,
    CHECK_BIDI,
    UNICODE_VERSION
};
95 | ||
96 | static int printHelp(int argc, char* argv[]){ | |
97 | /* | |
98 | * Broken into chucks because the C89 standard says the minimum | |
99 | * required supported string length is 509 bytes. | |
100 | */ | |
101 | fprintf(stderr, | |
102 | "Usage: %s [-options] [file_name]\n" | |
103 | "\n" | |
104 | "Read the files specified and\n" | |
105 | "create a binary file [package-name]_[bundle-name]." DATA_TYPE " with the StringPrep profile data\n" | |
106 | "\n", | |
107 | argv[0]); | |
108 | fprintf(stderr, | |
109 | "Options:\n" | |
110 | "\t-h or -? or --help print this usage text\n" | |
111 | "\t-v or --verbose verbose output\n" | |
112 | "\t-c or --copyright include a copyright notice\n"); | |
113 | fprintf(stderr, | |
114 | "\t-d or --destdir destination directory, followed by the path\n" | |
115 | "\t-s or --sourcedir source directory of ICU data, followed by the path\n" | |
116 | "\t-b or --bundle-name generate the output data file with the name specified\n" | |
117 | "\t-i or --icudatadir directory for locating any needed intermediate data files,\n" | |
118 | "\t followed by path, defaults to %s\n", | |
119 | u_getDataDirectory()); | |
120 | fprintf(stderr, | |
121 | "\t-n or --normalize turn on the option for normalization and include mappings\n" | |
122 | "\t from NormalizationCorrections.txt from the given path,\n" | |
123 | "\t e.g: /test/icu/source/data/unidata\n"); | |
124 | fprintf(stderr, | |
125 | "\t-m or --norm-correction use NormalizationCorrections.txt from the given path\n" | |
126 | "\t when the input file contains a normalization directive.\n" | |
127 | "\t unlike -n/--normalize, this option does not force the\n" | |
128 | "\t normalization.\n"); | |
129 | fprintf(stderr, | |
130 | "\t-k or --check-bidi turn on the option for checking for BiDi in the profile\n" | |
131 | "\t-u or --unicode version of Unicode to be used with this profile followed by the version\n" | |
132 | ); | |
133 | return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR; | |
134 | } | |
135 | ||
136 | ||
137 | extern int | |
138 | main(int argc, char* argv[]) { | |
139 | #if !UCONFIG_NO_IDNA | |
140 | char* filename = NULL; | |
141 | #endif | |
142 | const char *srcDir=NULL, *destDir=NULL, *icuUniDataDir=NULL; | |
143 | const char *bundleName=NULL, *inputFileName = NULL; | |
144 | char *basename=NULL; | |
145 | int32_t sprepOptions = 0; | |
146 | ||
147 | UErrorCode errorCode=U_ZERO_ERROR; | |
148 | ||
149 | U_MAIN_INIT_ARGS(argc, argv); | |
150 | ||
151 | /* preset then read command line options */ | |
152 | options[DESTDIR].value=u_getDataDirectory(); | |
153 | options[SOURCEDIR].value=""; | |
154 | options[UNICODE_VERSION].value="0"; /* don't assume the unicode version */ | |
155 | options[BUNDLE_NAME].value = DATA_NAME; | |
156 | options[NORMALIZE].value = ""; | |
157 | ||
158 | argc=u_parseArgs(argc, argv, UPRV_LENGTHOF(options), options); | |
159 | ||
160 | /* error handling, printing usage message */ | |
161 | if(argc<0) { | |
162 | fprintf(stderr, | |
163 | "error in command line argument \"%s\"\n", | |
164 | argv[-argc]); | |
165 | } | |
166 | if(argc<0 || options[HELP].doesOccur || options[HELP_QUESTION_MARK].doesOccur) { | |
167 | return printHelp(argc, argv); | |
168 | ||
169 | } | |
170 | ||
171 | /* get the options values */ | |
172 | beVerbose=options[VERBOSE].doesOccur; | |
173 | haveCopyright=options[COPYRIGHT].doesOccur; | |
174 | srcDir=options[SOURCEDIR].value; | |
175 | destDir=options[DESTDIR].value; | |
176 | bundleName = options[BUNDLE_NAME].value; | |
177 | if(options[NORMALIZE].doesOccur) { | |
178 | icuUniDataDir = options[NORMALIZE].value; | |
179 | } else { | |
180 | icuUniDataDir = options[NORM_CORRECTION_DIR].value; | |
181 | } | |
182 | ||
183 | if(argc<2) { | |
184 | /* print the help message */ | |
185 | return printHelp(argc, argv); | |
186 | } else { | |
187 | inputFileName = argv[1]; | |
188 | } | |
189 | if(!options[UNICODE_VERSION].doesOccur){ | |
190 | return printHelp(argc, argv); | |
191 | } | |
192 | if(options[ICUDATADIR].doesOccur) { | |
193 | u_setDataDirectory(options[ICUDATADIR].value); | |
194 | } | |
195 | #if UCONFIG_NO_IDNA | |
196 | ||
197 | fprintf(stderr, | |
198 | "gensprep writes dummy " U_ICUDATA_NAME "_" DATA_NAME "." DATA_TYPE | |
199 | " because UCONFIG_NO_IDNA is set, \n" | |
200 | "see icu/source/common/unicode/uconfig.h\n"); | |
201 | generateData(destDir, bundleName); | |
202 | ||
203 | #else | |
204 | ||
205 | setUnicodeVersion(options[UNICODE_VERSION].value); | |
206 | filename = (char* ) uprv_malloc(uprv_strlen(srcDir) + uprv_strlen(inputFileName) + (icuUniDataDir == NULL ? 0 : uprv_strlen(icuUniDataDir)) + 40); /* hopefully this should be enough */ | |
207 | ||
208 | /* prepare the filename beginning with the source dir */ | |
209 | if(uprv_strchr(srcDir,U_FILE_SEP_CHAR) == NULL && uprv_strchr(srcDir,U_FILE_ALT_SEP_CHAR) == NULL){ | |
210 | filename[0] = '.'; | |
211 | filename[1] = U_FILE_SEP_CHAR; | |
212 | uprv_strcpy(filename+2,srcDir); | |
213 | }else{ | |
214 | uprv_strcpy(filename, srcDir); | |
215 | } | |
216 | ||
217 | basename=filename+uprv_strlen(filename); | |
218 | if(basename>filename && *(basename-1)!=U_FILE_SEP_CHAR) { | |
219 | *basename++=U_FILE_SEP_CHAR; | |
220 | } | |
221 | ||
222 | /* initialize */ | |
223 | init(); | |
224 | ||
225 | /* process the file */ | |
226 | uprv_strcpy(basename,inputFileName); | |
227 | parseMappings(filename,FALSE, &errorCode); | |
228 | if(U_FAILURE(errorCode)) { | |
229 | fprintf(stderr, "Could not open file %s for reading. Error: %s \n", filename, u_errorName(errorCode)); | |
230 | return errorCode; | |
231 | } | |
232 | ||
233 | if(options[NORMALIZE].doesOccur){ /* this option might be set by @normalize;; in the source file */ | |
234 | /* set up directory for NormalizationCorrections.txt */ | |
235 | uprv_strcpy(filename,icuUniDataDir); | |
236 | basename=filename+uprv_strlen(filename); | |
237 | if(basename>filename && *(basename-1)!=U_FILE_SEP_CHAR) { | |
238 | *basename++=U_FILE_SEP_CHAR; | |
239 | } | |
240 | ||
241 | *basename++=U_FILE_SEP_CHAR; | |
242 | uprv_strcpy(basename,NORM_CORRECTIONS_FILE_NAME); | |
243 | ||
244 | parseNormalizationCorrections(filename,&errorCode); | |
245 | if(U_FAILURE(errorCode)){ | |
246 | fprintf(stderr,"Could not open file %s for reading \n", filename); | |
247 | return errorCode; | |
248 | } | |
249 | sprepOptions |= _SPREP_NORMALIZATION_ON; | |
250 | } | |
251 | ||
252 | if(options[CHECK_BIDI].doesOccur){ /* this option might be set by @check-bidi;; in the source file */ | |
253 | sprepOptions |= _SPREP_CHECK_BIDI_ON; | |
254 | } | |
255 | ||
256 | setOptions(sprepOptions); | |
257 | ||
258 | /* process parsed data */ | |
259 | if(U_SUCCESS(errorCode)) { | |
260 | /* write the data file */ | |
261 | generateData(destDir, bundleName); | |
262 | ||
263 | cleanUpData(); | |
264 | } | |
265 | ||
266 | uprv_free(filename); | |
267 | ||
268 | u_cleanup(); | |
269 | ||
270 | #endif | |
271 | ||
272 | return errorCode; | |
273 | } | |
274 | ||
275 | #if !UCONFIG_NO_IDNA | |
276 | ||
277 | static void U_CALLCONV | |
278 | normalizationCorrectionsLineFn(void *context, | |
279 | char *fields[][2], int32_t fieldCount, | |
280 | UErrorCode *pErrorCode) { | |
281 | (void)context; // suppress compiler warnings about unused variable | |
282 | (void)fieldCount; // suppress compiler warnings about unused variable | |
283 | uint32_t mapping[40]; | |
284 | char *end, *s; | |
285 | uint32_t code; | |
286 | int32_t length; | |
287 | UVersionInfo version; | |
288 | UVersionInfo thisVersion; | |
289 | ||
290 | /* get the character code, field 0 */ | |
291 | code=(uint32_t)uprv_strtoul(fields[0][0], &end, 16); | |
292 | if(U_FAILURE(*pErrorCode)) { | |
293 | fprintf(stderr, "gensprep: error parsing NormalizationCorrections.txt mapping at %s\n", fields[0][0]); | |
294 | exit(*pErrorCode); | |
295 | } | |
296 | /* Original (erroneous) decomposition */ | |
297 | s = fields[1][0]; | |
298 | ||
299 | /* parse the mapping string */ | |
300 | length=u_parseCodePoints(s, mapping, sizeof(mapping)/4, pErrorCode); | |
301 | ||
302 | /* ignore corrected decomposition */ | |
303 | ||
304 | u_versionFromString(version,fields[3][0] ); | |
305 | u_versionFromString(thisVersion, "3.2.0"); | |
306 | ||
307 | ||
308 | ||
309 | if(U_FAILURE(*pErrorCode)) { | |
310 | fprintf(stderr, "gensprep error parsing NormalizationCorrections.txt of U+%04lx - %s\n", | |
311 | (long)code, u_errorName(*pErrorCode)); | |
312 | exit(*pErrorCode); | |
313 | } | |
314 | ||
315 | /* store the mapping */ | |
316 | if( version[0] > thisVersion[0] || | |
317 | ((version[0]==thisVersion[0]) && (version[1] > thisVersion[1])) | |
318 | ){ | |
319 | storeMapping(code,mapping, length, USPREP_MAP, pErrorCode); | |
320 | } | |
321 | setUnicodeVersionNC(version); | |
322 | } | |
323 | ||
324 | static void | |
325 | parseNormalizationCorrections(const char *filename, UErrorCode *pErrorCode) { | |
326 | char *fields[4][2]; | |
327 | ||
328 | if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { | |
329 | return; | |
330 | } | |
331 | ||
332 | u_parseDelimitedFile(filename, ';', fields, 4, normalizationCorrectionsLineFn, NULL, pErrorCode); | |
333 | ||
334 | /* fprintf(stdout,"Number of code points that have NormalizationCorrections mapping with length >1 : %i\n",len); */ | |
335 | ||
336 | if(U_FAILURE(*pErrorCode) && ( *pErrorCode!=U_FILE_ACCESS_ERROR)) { | |
337 | fprintf(stderr, "gensprep error: u_parseDelimitedFile(\"%s\") failed - %s\n", filename, u_errorName(*pErrorCode)); | |
338 | exit(*pErrorCode); | |
339 | } | |
340 | } | |
341 | ||
342 | static void U_CALLCONV | |
343 | strprepProfileLineFn(void *context, | |
344 | char *fields[][2], int32_t fieldCount, | |
345 | UErrorCode *pErrorCode) { | |
346 | (void)fieldCount; // suppress compiler warnings about unused variable | |
347 | uint32_t mapping[40]; | |
348 | char *end, *map; | |
349 | uint32_t code; | |
350 | int32_t length; | |
351 | /*UBool* mapWithNorm = (UBool*) context;*/ | |
352 | const char* typeName; | |
353 | uint32_t rangeStart=0,rangeEnd =0; | |
354 | const char* filename = (const char*) context; | |
355 | const char *s; | |
356 | ||
357 | s = u_skipWhitespace(fields[0][0]); | |
358 | if (*s == '@') { | |
359 | /* special directive */ | |
360 | s++; | |
361 | length = (int32_t)(fields[0][1] - s); | |
362 | if (length >= NORMALIZE_DIRECTIVE_LEN | |
363 | && uprv_strncmp(s, NORMALIZE_DIRECTIVE, NORMALIZE_DIRECTIVE_LEN) == 0) { | |
364 | options[NORMALIZE].doesOccur = TRUE; | |
365 | return; | |
366 | } | |
367 | else if (length >= CHECK_BIDI_DIRECTIVE_LEN | |
368 | && uprv_strncmp(s, CHECK_BIDI_DIRECTIVE, CHECK_BIDI_DIRECTIVE_LEN) == 0) { | |
369 | options[CHECK_BIDI].doesOccur = TRUE; | |
370 | return; | |
371 | } | |
372 | else { | |
373 | fprintf(stderr, "gensprep error parsing a directive %s.", fields[0][0]); | |
374 | } | |
375 | } | |
376 | ||
377 | typeName = fields[2][0]; | |
378 | map = fields[1][0]; | |
379 | ||
380 | if(uprv_strstr(typeName, usprepTypeNames[USPREP_UNASSIGNED])!=NULL){ | |
381 | ||
382 | u_parseCodePointRange(s, &rangeStart,&rangeEnd, pErrorCode); | |
383 | if(U_FAILURE(*pErrorCode)){ | |
384 | fprintf(stderr, "Could not parse code point range. Error: %s\n",u_errorName(*pErrorCode)); | |
385 | return; | |
386 | } | |
387 | ||
388 | /* store the range */ | |
389 | storeRange(rangeStart,rangeEnd,USPREP_UNASSIGNED, pErrorCode); | |
390 | ||
391 | }else if(uprv_strstr(typeName, usprepTypeNames[USPREP_PROHIBITED])!=NULL){ | |
392 | ||
393 | u_parseCodePointRange(s, &rangeStart,&rangeEnd, pErrorCode); | |
394 | if(U_FAILURE(*pErrorCode)){ | |
395 | fprintf(stderr, "Could not parse code point range. Error: %s\n",u_errorName(*pErrorCode)); | |
396 | return; | |
397 | } | |
398 | ||
399 | /* store the range */ | |
400 | storeRange(rangeStart,rangeEnd,USPREP_PROHIBITED, pErrorCode); | |
401 | ||
402 | }else if(uprv_strstr(typeName, usprepTypeNames[USPREP_MAP])!=NULL){ | |
403 | ||
404 | /* get the character code, field 0 */ | |
405 | code=(uint32_t)uprv_strtoul(s, &end, 16); | |
406 | if(end<=s || end!=fields[0][1]) { | |
407 | fprintf(stderr, "gensprep: syntax error in field 0 at %s\n", fields[0][0]); | |
408 | *pErrorCode=U_PARSE_ERROR; | |
409 | exit(U_PARSE_ERROR); | |
410 | } | |
411 | ||
412 | /* parse the mapping string */ | |
413 | length=u_parseCodePoints(map, mapping, sizeof(mapping)/4, pErrorCode); | |
414 | ||
415 | /* store the mapping */ | |
416 | storeMapping(code,mapping, length,USPREP_MAP, pErrorCode); | |
417 | ||
418 | }else{ | |
419 | *pErrorCode = U_INVALID_FORMAT_ERROR; | |
420 | } | |
421 | ||
422 | if(U_FAILURE(*pErrorCode)) { | |
423 | fprintf(stderr, "gensprep error parsing %s line %s at %s. Error: %s\n",filename, | |
424 | fields[0][0],fields[2][0],u_errorName(*pErrorCode)); | |
425 | exit(*pErrorCode); | |
426 | } | |
427 | ||
428 | } | |
429 | ||
430 | static void | |
431 | parseMappings(const char *filename, UBool reportError, UErrorCode *pErrorCode) { | |
432 | char *fields[3][2]; | |
433 | ||
434 | if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { | |
435 | return; | |
436 | } | |
437 | ||
438 | u_parseDelimitedFile(filename, ';', fields, 3, strprepProfileLineFn, (void*)filename, pErrorCode); | |
439 | ||
440 | /*fprintf(stdout,"Number of code points that have mappings with length >1 : %i\n",len);*/ | |
441 | ||
442 | if(U_FAILURE(*pErrorCode) && (reportError || *pErrorCode!=U_FILE_ACCESS_ERROR)) { | |
443 | fprintf(stderr, "gensprep error: u_parseDelimitedFile(\"%s\") failed - %s\n", filename, u_errorName(*pErrorCode)); | |
444 | exit(*pErrorCode); | |
445 | } | |
446 | } | |
447 | ||
448 | ||
449 | #endif /* #if !UCONFIG_NO_IDNA */ | |
450 | ||
451 | /* | |
452 | * Hey, Emacs, please set the following: | |
453 | * | |
454 | * Local Variables: | |
455 | * indent-tabs-mode: nil | |
456 | * End: | |
457 | * | |
458 | */ |