]>
Commit | Line | Data |
---|---|---|
b75a7d8f A |
1 | /* |
2 | ******************************************************************************* | |
3 | * | |
4 | * Copyright (C) 2001-2003, International Business Machines | |
5 | * Corporation and others. All Rights Reserved. | |
6 | * | |
7 | ******************************************************************************* | |
8 | * file name: gennorm.c | |
9 | * encoding: US-ASCII | |
10 | * tab size: 8 (not used) | |
11 | * indentation:4 | |
12 | * | |
13 | * created on: 2001may25 | |
14 | * created by: Markus W. Scherer | |
15 | * | |
16 | * This program reads the Unicode character database text file, | |
17 | * parses it, and extracts the data for normalization. | |
18 | * It then preprocesses it and writes a binary file for efficient use | |
19 | * in various Unicode text normalization processes. | |
20 | */ | |
21 | ||
22 | #include <stdio.h> | |
23 | #include <stdlib.h> | |
24 | #include "unicode/utypes.h" | |
25 | #include "unicode/uchar.h" | |
26 | #include "unicode/putil.h" | |
27 | #include "cmemory.h" | |
28 | #include "cstring.h" | |
29 | #include "unicode/udata.h" | |
30 | #include "unewdata.h" | |
31 | #include "uoptions.h" | |
32 | #include "uparse.h" | |
33 | #include "unormimp.h" | |
34 | ||
35 | U_CDECL_BEGIN | |
36 | #include "gennorm.h" | |
37 | U_CDECL_END | |
38 | ||
39 | #ifdef WIN32 | |
40 | # pragma warning(disable: 4100) | |
41 | #endif | |
42 | ||
/*
 * Global tool flags. main() overwrites both from the command line
 * (--verbose and --copyright respectively) before any data is processed.
 */
UBool beVerbose=FALSE, haveCopyright=TRUE;
44 | ||
45 | /* prototypes --------------------------------------------------------------- */ | |
46 | ||
/*
 * Parse the derived normalization properties file.
 * With reportError==FALSE a U_FILE_ACCESS_ERROR is left in *pErrorCode
 * for the caller instead of terminating the program.
 */
static void
parseDerivedNormalizationProperties(const char *filename, UErrorCode *pErrorCode, UBool reportError);

/* Parse UnicodeData.txt; any failure terminates the program. */
static void
parseDB(const char *filename, UErrorCode *pErrorCode);
52 | ||
53 | /* -------------------------------------------------------------------------- */ | |
54 | ||
/*
 * Command line options.
 * main() accesses the entries by index, so the order here must not change
 * without updating main().
 */
static UOption options[]={
    UOPTION_HELP_H,             /* [0] */
    UOPTION_HELP_QUESTION_MARK, /* [1] */
    UOPTION_VERBOSE,            /* [2] -> beVerbose */
    UOPTION_COPYRIGHT,          /* [3] -> haveCopyright */
    UOPTION_DESTDIR,            /* [4] -> destDir, preset to u_getDataDirectory() */
    UOPTION_SOURCEDIR,          /* [5] -> srcDir, preset to "" */
    { "unicode", NULL, NULL, NULL, 'u', UOPT_REQUIRES_ARG, 0 } /* [6] Unicode version, preset to "3.0.0" */
};
64 | ||
65 | extern int | |
66 | main(int argc, char* argv[]) { | |
67 | #if !UCONFIG_NO_NORMALIZATION | |
68 | char filename[300]; | |
69 | #endif | |
70 | const char *srcDir=NULL, *destDir=NULL, *suffix=NULL; | |
71 | char *basename=NULL; | |
72 | UErrorCode errorCode=U_ZERO_ERROR; | |
73 | ||
74 | U_MAIN_INIT_ARGS(argc, argv); | |
75 | ||
76 | /* preset then read command line options */ | |
77 | options[4].value=u_getDataDirectory(); | |
78 | options[5].value=""; | |
79 | options[6].value="3.0.0"; | |
80 | argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), options); | |
81 | ||
82 | /* error handling, printing usage message */ | |
83 | if(argc<0) { | |
84 | fprintf(stderr, | |
85 | "error in command line argument \"%s\"\n", | |
86 | argv[-argc]); | |
87 | } | |
88 | if(argc<0 || options[0].doesOccur || options[1].doesOccur) { | |
89 | /* | |
90 | * Broken into chucks because the C89 standard says the minimum | |
91 | * required supported string length is 509 bytes. | |
92 | */ | |
93 | fprintf(stderr, | |
94 | "Usage: %s [-options] [suffix]\n" | |
95 | "\n" | |
96 | "Read the UnicodeData.txt file and other Unicode properties files and\n" | |
97 | "create a binary file " U_ICUDATA_NAME "_" DATA_NAME "." DATA_TYPE " with the normalization data\n" | |
98 | "\n", | |
99 | argv[0]); | |
100 | fprintf(stderr, | |
101 | "Options:\n" | |
102 | "\t-h or -? or --help this usage text\n" | |
103 | "\t-v or --verbose verbose output\n" | |
104 | "\t-c or --copyright include a copyright notice\n" | |
105 | "\t-u or --unicode Unicode version, followed by the version like 3.0.0\n"); | |
106 | fprintf(stderr, | |
107 | "\t-d or --destdir destination directory, followed by the path\n" | |
108 | "\t-s or --sourcedir source directory, followed by the path\n" | |
109 | "\tsuffix suffix that is to be appended with a '-'\n" | |
110 | "\t to the source file basenames before opening;\n" | |
111 | "\t 'gennorm new' will read UnicodeData-new.txt etc.\n"); | |
112 | return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR; | |
113 | } | |
114 | ||
115 | /* get the options values */ | |
116 | beVerbose=options[2].doesOccur; | |
117 | haveCopyright=options[3].doesOccur; | |
118 | srcDir=options[5].value; | |
119 | destDir=options[4].value; | |
120 | ||
121 | if(argc>=2) { | |
122 | suffix=argv[1]; | |
123 | } else { | |
124 | suffix=NULL; | |
125 | } | |
126 | ||
127 | #if UCONFIG_NO_NORMALIZATION | |
128 | ||
129 | fprintf(stderr, | |
130 | "gennorm writes a dummy " U_ICUDATA_NAME "_" DATA_NAME "." DATA_TYPE | |
131 | " because UCONFIG_NO_NORMALIZATION is set, \n" | |
132 | "see icu/source/common/unicode/uconfig.h\n"); | |
133 | generateData(destDir); | |
134 | ||
135 | #else | |
136 | ||
137 | setUnicodeVersion(options[6].value); | |
138 | ||
139 | /* prepare the filename beginning with the source dir */ | |
140 | uprv_strcpy(filename, srcDir); | |
141 | basename=filename+uprv_strlen(filename); | |
142 | if(basename>filename && *(basename-1)!=U_FILE_SEP_CHAR) { | |
143 | *basename++=U_FILE_SEP_CHAR; | |
144 | } | |
145 | ||
146 | /* initialize */ | |
147 | init(); | |
148 | ||
149 | /* process DerivedNormalizationProps.txt (name changed for Unicode 3.2, to <=31 characters) */ | |
150 | if(suffix==NULL) { | |
151 | uprv_strcpy(basename, "DerivedNormalizationProps.txt"); | |
152 | } else { | |
153 | uprv_strcpy(basename, "DerivedNormalizationProps"); | |
154 | basename[30]='-'; | |
155 | uprv_strcpy(basename+31, suffix); | |
156 | uprv_strcat(basename+31, ".txt"); | |
157 | } | |
158 | parseDerivedNormalizationProperties(filename, &errorCode, FALSE); | |
159 | if(U_FAILURE(errorCode)) { | |
160 | /* can be only U_FILE_ACCESS_ERROR - try filename from before Unicode 3.2 */ | |
161 | if(suffix==NULL) { | |
162 | uprv_strcpy(basename, "DerivedNormalizationProperties.txt"); | |
163 | } else { | |
164 | uprv_strcpy(basename, "DerivedNormalizationProperties"); | |
165 | basename[30]='-'; | |
166 | uprv_strcpy(basename+31, suffix); | |
167 | uprv_strcat(basename+31, ".txt"); | |
168 | } | |
169 | parseDerivedNormalizationProperties(filename, &errorCode, TRUE); | |
170 | } | |
171 | ||
172 | /* process UnicodeData.txt */ | |
173 | if(suffix==NULL) { | |
174 | uprv_strcpy(basename, "UnicodeData.txt"); | |
175 | } else { | |
176 | uprv_strcpy(basename, "UnicodeData"); | |
177 | basename[11]='-'; | |
178 | uprv_strcpy(basename+12, suffix); | |
179 | uprv_strcat(basename+12, ".txt"); | |
180 | } | |
181 | parseDB(filename, &errorCode); | |
182 | ||
183 | /* process parsed data */ | |
184 | if(U_SUCCESS(errorCode)) { | |
185 | processData(); | |
186 | ||
187 | /* write the properties data file */ | |
188 | generateData(destDir); | |
189 | ||
190 | cleanUpData(); | |
191 | } | |
192 | ||
193 | #endif | |
194 | ||
195 | return errorCode; | |
196 | } | |
197 | ||
198 | #if !UCONFIG_NO_NORMALIZATION | |
199 | ||
200 | /* parser for DerivedNormalizationProperties.txt ---------------------------- */ | |
201 | ||
202 | static void U_CALLCONV | |
203 | derivedNormalizationPropertiesLineFn(void *context, | |
204 | char *fields[][2], int32_t fieldCount, | |
205 | UErrorCode *pErrorCode) { | |
206 | UChar string[32]; | |
207 | char *s; | |
208 | uint32_t start, end; | |
209 | int32_t count; | |
210 | uint8_t qcFlags; | |
211 | ||
212 | /* get code point range */ | |
213 | count=u_parseCodePointRange(fields[0][0], &start, &end, pErrorCode); | |
214 | if(U_FAILURE(*pErrorCode)) { | |
215 | fprintf(stderr, "gennorm: error parsing DerivedNormalizationProperties.txt mapping at %s\n", fields[0][0]); | |
216 | exit(*pErrorCode); | |
217 | } | |
218 | ||
219 | /* ignore hangul - handle explicitly */ | |
220 | if(start==0xac00) { | |
221 | return; | |
222 | } | |
223 | ||
224 | /* get property - ignore unrecognized ones */ | |
225 | s=(char *)u_skipWhitespace(fields[1][0]); | |
226 | if(*s=='N' && s[1]=='F') { | |
227 | /* quick check flag */ | |
228 | qcFlags=0x11; | |
229 | s+=2; | |
230 | if(*s=='K') { | |
231 | qcFlags<<=1; | |
232 | ++s; | |
233 | } | |
234 | ||
235 | if(*s=='C' && s[1]=='_') { | |
236 | s+=2; | |
237 | } else if(*s=='D' && s[1]=='_') { | |
238 | qcFlags<<=2; | |
239 | s+=2; | |
240 | } else { | |
241 | return; | |
242 | } | |
243 | ||
244 | if(0==uprv_memcmp(s, "NO", 2)) { | |
245 | qcFlags&=0xf; | |
246 | } else if(0==uprv_memcmp(s, "MAYBE", 5)) { | |
247 | qcFlags&=0x30; | |
248 | } else { | |
249 | return; | |
250 | } | |
251 | ||
252 | /* set this flag for all code points in this range */ | |
253 | while(start<=end) { | |
254 | setQCFlags(start++, qcFlags); | |
255 | } | |
256 | } else if(0==uprv_memcmp(s, "Comp_Ex", 7) || 0==uprv_memcmp(s, "Full_Composition_Exclusion", 26)) { | |
257 | /* full composition exclusion */ | |
258 | while(start<=end) { | |
259 | setCompositionExclusion(start++); | |
260 | } | |
261 | } else if(0==uprv_memcmp(s, "FNC", 3) && *(s=(char *)u_skipWhitespace(s+3))==';') { | |
262 | /* FC_NFKC_Closure, parse field 2 to get the string */ | |
263 | char *t; | |
264 | ||
265 | /* start of the field */ | |
266 | s=(char *)u_skipWhitespace(s+1); | |
267 | ||
268 | /* find the end of the field */ | |
269 | for(t=s; *t!=';' && *t!='#' && *t!=0 && *t!='\n' && *t!='\r'; ++t) {} | |
270 | *t=0; | |
271 | ||
272 | string[0]=(UChar)u_parseString(s, string+1, 31, NULL, pErrorCode); | |
273 | if(U_FAILURE(*pErrorCode)) { | |
274 | fprintf(stderr, "gennorm error: illegal FNC string at %s\n", fields[0][0]); | |
275 | exit(*pErrorCode); | |
276 | } | |
277 | while(start<=end) { | |
278 | setFNC(start++, string); | |
279 | } | |
280 | } | |
281 | } | |
282 | ||
283 | static void | |
284 | parseDerivedNormalizationProperties(const char *filename, UErrorCode *pErrorCode, UBool reportError) { | |
285 | char *fields[2][2]; | |
286 | ||
287 | if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { | |
288 | return; | |
289 | } | |
290 | ||
291 | u_parseDelimitedFile(filename, ';', fields, 2, derivedNormalizationPropertiesLineFn, NULL, pErrorCode); | |
292 | if(U_FAILURE(*pErrorCode) && (reportError || *pErrorCode!=U_FILE_ACCESS_ERROR)) { | |
293 | fprintf(stderr, "gennorm error: u_parseDelimitedFile(\"%s\") failed - %s\n", filename, u_errorName(*pErrorCode)); | |
294 | exit(*pErrorCode); | |
295 | } | |
296 | } | |
297 | ||
298 | /* parser for UnicodeData.txt ----------------------------------------------- */ | |
299 | ||
300 | static void U_CALLCONV | |
301 | unicodeDataLineFn(void *context, | |
302 | char *fields[][2], int32_t fieldCount, | |
303 | UErrorCode *pErrorCode) { | |
304 | uint32_t decomp[40]; | |
305 | Norm norm; | |
306 | const char *s; | |
307 | char *end; | |
308 | uint32_t code, value; | |
309 | int32_t length; | |
310 | UBool isCompat, something=FALSE; | |
311 | ||
312 | /* ignore First and Last entries for ranges */ | |
313 | if( *fields[1][0]=='<' && | |
314 | (length=(int32_t)(fields[1][1]-fields[1][0]))>=9 && | |
315 | (0==uprv_memcmp(", First>", fields[1][1]-8, 8) || 0==uprv_memcmp(", Last>", fields[1][1]-7, 7)) | |
316 | ) { | |
317 | return; | |
318 | } | |
319 | ||
320 | /* reset the properties */ | |
321 | uprv_memset(&norm, 0, sizeof(Norm)); | |
322 | ||
323 | /* get the character code, field 0 */ | |
324 | code=(uint32_t)uprv_strtoul(fields[0][0], &end, 16); | |
325 | if(end<=fields[0][0] || end!=fields[0][1]) { | |
326 | fprintf(stderr, "gennorm: syntax error in field 0 at %s\n", fields[0][0]); | |
327 | *pErrorCode=U_PARSE_ERROR; | |
328 | exit(U_PARSE_ERROR); | |
329 | } | |
330 | ||
331 | /* get canonical combining class, field 3 */ | |
332 | value=(uint32_t)uprv_strtoul(fields[3][0], &end, 10); | |
333 | if(end<=fields[3][0] || end!=fields[3][1] || value>0xff) { | |
334 | fprintf(stderr, "gennorm: syntax error in field 3 at %s\n", fields[0][0]); | |
335 | *pErrorCode=U_PARSE_ERROR; | |
336 | exit(U_PARSE_ERROR); | |
337 | } | |
338 | if(value>0) { | |
339 | norm.udataCC=(uint8_t)value; | |
340 | something=TRUE; | |
341 | } | |
342 | ||
343 | /* get the decomposition, field 5 */ | |
344 | if(fields[5][0]<fields[5][1]) { | |
345 | if(*(s=fields[5][0])=='<') { | |
346 | ++s; | |
347 | isCompat=TRUE; | |
348 | ||
349 | /* skip and ignore the compatibility type name */ | |
350 | do { | |
351 | if(s==fields[5][1]) { | |
352 | /* missing '>' */ | |
353 | fprintf(stderr, "gennorm: syntax error in field 5 at %s\n", fields[0][0]); | |
354 | *pErrorCode=U_PARSE_ERROR; | |
355 | exit(U_PARSE_ERROR); | |
356 | } | |
357 | } while(*s++!='>'); | |
358 | } else { | |
359 | isCompat=FALSE; | |
360 | } | |
361 | ||
362 | /* parse the decomposition string */ | |
363 | length=u_parseCodePoints(s, decomp, sizeof(decomp)/4, pErrorCode); | |
364 | if(U_FAILURE(*pErrorCode)) { | |
365 | fprintf(stderr, "gennorm error parsing UnicodeData.txt decomposition of U+%04lx - %s\n", | |
366 | (long)code, u_errorName(*pErrorCode)); | |
367 | exit(*pErrorCode); | |
368 | } | |
369 | ||
370 | /* store the string */ | |
371 | if(length>0) { | |
372 | something=TRUE; | |
373 | if(isCompat) { | |
374 | norm.lenNFKD=(uint8_t)length; | |
375 | norm.nfkd=decomp; | |
376 | } else { | |
377 | if(length>2) { | |
378 | fprintf(stderr, "gennorm: error - length of NFD(U+%04lx) = %ld >2 in UnicodeData - illegal\n", | |
379 | (long)code, (long)length); | |
380 | *pErrorCode=U_PARSE_ERROR; | |
381 | exit(U_PARSE_ERROR); | |
382 | } | |
383 | norm.lenNFD=(uint8_t)length; | |
384 | norm.nfd=decomp; | |
385 | } | |
386 | } | |
387 | } | |
388 | ||
389 | /* check for non-character code points */ | |
390 | if((code&0xfffe)==0xfffe || (uint32_t)(code-0xfdd0)<0x20 || code>0x10ffff) { | |
391 | fprintf(stderr, "gennorm: error - properties for non-character code point U+%04lx\n", | |
392 | (long)code); | |
393 | *pErrorCode=U_PARSE_ERROR; | |
394 | exit(U_PARSE_ERROR); | |
395 | } | |
396 | ||
397 | if(something) { | |
398 | /* there are normalization values, so store them */ | |
399 | #if 0 | |
400 | if(beVerbose) { | |
401 | printf("store values for U+%04lx: cc=%d, lenNFD=%ld, lenNFKD=%ld\n", | |
402 | (long)code, norm.udataCC, (long)norm.lenNFD, (long)norm.lenNFKD); | |
403 | } | |
404 | #endif | |
405 | storeNorm(code, &norm); | |
406 | } | |
407 | } | |
408 | ||
409 | static void | |
410 | parseDB(const char *filename, UErrorCode *pErrorCode) { | |
411 | char *fields[15][2]; | |
412 | ||
413 | if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { | |
414 | return; | |
415 | } | |
416 | ||
417 | u_parseDelimitedFile(filename, ';', fields, 15, unicodeDataLineFn, NULL, pErrorCode); | |
418 | if(U_FAILURE(*pErrorCode)) { | |
419 | fprintf(stderr, "gennorm error: u_parseDelimitedFile(\"%s\") failed - %s\n", filename, u_errorName(*pErrorCode)); | |
420 | exit(*pErrorCode); | |
421 | } | |
422 | } | |
423 | ||
424 | #endif /* #if !UCONFIG_NO_NORMALIZATION */ | |
425 | ||
426 | /* | |
427 | * Hey, Emacs, please set the following: | |
428 | * | |
429 | * Local Variables: | |
430 | * indent-tabs-mode: nil | |
431 | * End: | |
432 | * | |
433 | */ |