]>
Commit | Line | Data |
---|---|---|
b75a7d8f A |
1 | /* |
2 | ******************************************************************************* | |
3 | * | |
374ca955 | 4 | * Copyright (C) 2001-2004, International Business Machines |
b75a7d8f A |
5 | * Corporation and others. All Rights Reserved. |
6 | * | |
7 | ******************************************************************************* | |
8 | * file name: gennorm.c | |
9 | * encoding: US-ASCII | |
10 | * tab size: 8 (not used) | |
11 | * indentation:4 | |
12 | * | |
13 | * created on: 2001may25 | |
14 | * created by: Markus W. Scherer | |
15 | * | |
16 | * This program reads the Unicode character database text file, | |
17 | * parses it, and extracts the data for normalization. | |
18 | * It then preprocesses it and writes a binary file for efficient use | |
19 | * in various Unicode text normalization processes. | |
20 | */ | |
21 | ||
22 | #include <stdio.h> | |
23 | #include <stdlib.h> | |
24 | #include "unicode/utypes.h" | |
25 | #include "unicode/uchar.h" | |
374ca955 | 26 | #include "unicode/ustring.h" |
b75a7d8f | 27 | #include "unicode/putil.h" |
374ca955 A |
28 | #include "unicode/uclean.h" |
29 | #include "unicode/udata.h" | |
30 | #include "unicode/uset.h" | |
b75a7d8f A |
31 | #include "cmemory.h" |
32 | #include "cstring.h" | |
b75a7d8f A |
33 | #include "unewdata.h" |
34 | #include "uoptions.h" | |
35 | #include "uparse.h" | |
36 | #include "unormimp.h" | |
37 | ||
38 | U_CDECL_BEGIN | |
39 | #include "gennorm.h" | |
40 | U_CDECL_END | |
41 | ||
42 | #ifdef WIN32 | |
43 | # pragma warning(disable: 4100) | |
44 | #endif | |
45 | ||
46 | UBool beVerbose=FALSE, haveCopyright=TRUE; | |
47 | ||
48 | /* prototypes --------------------------------------------------------------- */ | |
49 | ||
50 | static void | |
51 | parseDerivedNormalizationProperties(const char *filename, UErrorCode *pErrorCode, UBool reportError); | |
52 | ||
53 | static void | |
54 | parseDB(const char *filename, UErrorCode *pErrorCode); | |
55 | ||
56 | /* -------------------------------------------------------------------------- */ | |
57 | ||
374ca955 A |
/*
 * Indexes into options[] below.
 * The order of these constants must match the order of the array entries.
 */
enum {
    HELP_H=0,               /* -h */
    HELP_QUESTION_MARK,     /* -? */
    VERBOSE,                /* -v */
    COPYRIGHT,              /* -c */
    DESTDIR,                /* -d */
    SOURCEDIR,              /* -s */
    UNICODE_VERSION,        /* -u */
    ICUDATADIR              /* -i */
};
68 | ||
b75a7d8f A |
69 | static UOption options[]={ |
70 | UOPTION_HELP_H, | |
71 | UOPTION_HELP_QUESTION_MARK, | |
72 | UOPTION_VERBOSE, | |
73 | UOPTION_COPYRIGHT, | |
74 | UOPTION_DESTDIR, | |
75 | UOPTION_SOURCEDIR, | |
374ca955 A |
76 | { "unicode", NULL, NULL, NULL, 'u', UOPT_REQUIRES_ARG, 0 }, |
77 | UOPTION_ICUDATADIR | |
b75a7d8f A |
78 | }; |
79 | ||
80 | extern int | |
81 | main(int argc, char* argv[]) { | |
82 | #if !UCONFIG_NO_NORMALIZATION | |
83 | char filename[300]; | |
84 | #endif | |
85 | const char *srcDir=NULL, *destDir=NULL, *suffix=NULL; | |
86 | char *basename=NULL; | |
87 | UErrorCode errorCode=U_ZERO_ERROR; | |
88 | ||
89 | U_MAIN_INIT_ARGS(argc, argv); | |
90 | ||
91 | /* preset then read command line options */ | |
92 | options[4].value=u_getDataDirectory(); | |
93 | options[5].value=""; | |
94 | options[6].value="3.0.0"; | |
374ca955 | 95 | options[ICUDATADIR].value=u_getDataDirectory(); |
b75a7d8f A |
96 | argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), options); |
97 | ||
98 | /* error handling, printing usage message */ | |
99 | if(argc<0) { | |
100 | fprintf(stderr, | |
101 | "error in command line argument \"%s\"\n", | |
102 | argv[-argc]); | |
103 | } | |
104 | if(argc<0 || options[0].doesOccur || options[1].doesOccur) { | |
105 | /* | |
106 | * Broken into chucks because the C89 standard says the minimum | |
107 | * required supported string length is 509 bytes. | |
108 | */ | |
109 | fprintf(stderr, | |
110 | "Usage: %s [-options] [suffix]\n" | |
111 | "\n" | |
112 | "Read the UnicodeData.txt file and other Unicode properties files and\n" | |
113 | "create a binary file " U_ICUDATA_NAME "_" DATA_NAME "." DATA_TYPE " with the normalization data\n" | |
114 | "\n", | |
115 | argv[0]); | |
116 | fprintf(stderr, | |
117 | "Options:\n" | |
118 | "\t-h or -? or --help this usage text\n" | |
119 | "\t-v or --verbose verbose output\n" | |
120 | "\t-c or --copyright include a copyright notice\n" | |
121 | "\t-u or --unicode Unicode version, followed by the version like 3.0.0\n"); | |
122 | fprintf(stderr, | |
123 | "\t-d or --destdir destination directory, followed by the path\n" | |
124 | "\t-s or --sourcedir source directory, followed by the path\n" | |
374ca955 A |
125 | "\t-i or --icudatadir directory for locating any needed intermediate data files,\n" |
126 | "\t followed by path, defaults to <%s>\n" | |
b75a7d8f A |
127 | "\tsuffix suffix that is to be appended with a '-'\n" |
128 | "\t to the source file basenames before opening;\n" | |
374ca955 A |
129 | "\t 'gennorm new' will read UnicodeData-new.txt etc.\n", |
130 | u_getDataDirectory()); | |
b75a7d8f A |
131 | return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR; |
132 | } | |
133 | ||
134 | /* get the options values */ | |
135 | beVerbose=options[2].doesOccur; | |
136 | haveCopyright=options[3].doesOccur; | |
137 | srcDir=options[5].value; | |
138 | destDir=options[4].value; | |
139 | ||
140 | if(argc>=2) { | |
141 | suffix=argv[1]; | |
142 | } else { | |
143 | suffix=NULL; | |
144 | } | |
145 | ||
146 | #if UCONFIG_NO_NORMALIZATION | |
147 | ||
148 | fprintf(stderr, | |
149 | "gennorm writes a dummy " U_ICUDATA_NAME "_" DATA_NAME "." DATA_TYPE | |
150 | " because UCONFIG_NO_NORMALIZATION is set, \n" | |
151 | "see icu/source/common/unicode/uconfig.h\n"); | |
152 | generateData(destDir); | |
153 | ||
154 | #else | |
155 | ||
156 | setUnicodeVersion(options[6].value); | |
157 | ||
374ca955 A |
158 | if (options[ICUDATADIR].doesOccur) { |
159 | u_setDataDirectory(options[ICUDATADIR].value); | |
160 | } | |
161 | ||
162 | /* | |
163 | * Verify that we can work with properties | |
164 | * but don't call u_init() because that needs unorm.icu which we are just | |
165 | * going to build here. | |
166 | */ | |
167 | { | |
168 | U_STRING_DECL(ideo, "[:Ideographic:]", 15); | |
169 | USet *set; | |
170 | ||
171 | U_STRING_INIT(ideo, "[:Ideographic:]", 15); | |
172 | set=uset_openPattern(ideo, -1, &errorCode); | |
173 | if(U_FAILURE(errorCode) || !uset_contains(set, 0xf900)) { | |
174 | fprintf(stderr, "gennorm is unable to work with properties (uprops.icu): %s\n", u_errorName(errorCode)); | |
175 | exit(errorCode); | |
176 | } | |
177 | uset_close(set); | |
178 | } | |
179 | ||
b75a7d8f A |
180 | /* prepare the filename beginning with the source dir */ |
181 | uprv_strcpy(filename, srcDir); | |
182 | basename=filename+uprv_strlen(filename); | |
183 | if(basename>filename && *(basename-1)!=U_FILE_SEP_CHAR) { | |
184 | *basename++=U_FILE_SEP_CHAR; | |
185 | } | |
186 | ||
187 | /* initialize */ | |
188 | init(); | |
189 | ||
190 | /* process DerivedNormalizationProps.txt (name changed for Unicode 3.2, to <=31 characters) */ | |
191 | if(suffix==NULL) { | |
192 | uprv_strcpy(basename, "DerivedNormalizationProps.txt"); | |
193 | } else { | |
194 | uprv_strcpy(basename, "DerivedNormalizationProps"); | |
195 | basename[30]='-'; | |
196 | uprv_strcpy(basename+31, suffix); | |
197 | uprv_strcat(basename+31, ".txt"); | |
198 | } | |
199 | parseDerivedNormalizationProperties(filename, &errorCode, FALSE); | |
200 | if(U_FAILURE(errorCode)) { | |
201 | /* can be only U_FILE_ACCESS_ERROR - try filename from before Unicode 3.2 */ | |
202 | if(suffix==NULL) { | |
203 | uprv_strcpy(basename, "DerivedNormalizationProperties.txt"); | |
204 | } else { | |
205 | uprv_strcpy(basename, "DerivedNormalizationProperties"); | |
206 | basename[30]='-'; | |
207 | uprv_strcpy(basename+31, suffix); | |
208 | uprv_strcat(basename+31, ".txt"); | |
209 | } | |
210 | parseDerivedNormalizationProperties(filename, &errorCode, TRUE); | |
211 | } | |
212 | ||
213 | /* process UnicodeData.txt */ | |
214 | if(suffix==NULL) { | |
215 | uprv_strcpy(basename, "UnicodeData.txt"); | |
216 | } else { | |
217 | uprv_strcpy(basename, "UnicodeData"); | |
218 | basename[11]='-'; | |
219 | uprv_strcpy(basename+12, suffix); | |
220 | uprv_strcat(basename+12, ".txt"); | |
221 | } | |
222 | parseDB(filename, &errorCode); | |
223 | ||
224 | /* process parsed data */ | |
225 | if(U_SUCCESS(errorCode)) { | |
226 | processData(); | |
227 | ||
228 | /* write the properties data file */ | |
229 | generateData(destDir); | |
230 | ||
231 | cleanUpData(); | |
232 | } | |
233 | ||
234 | #endif | |
235 | ||
236 | return errorCode; | |
237 | } | |
238 | ||
239 | #if !UCONFIG_NO_NORMALIZATION | |
240 | ||
241 | /* parser for DerivedNormalizationProperties.txt ---------------------------- */ | |
242 | ||
243 | static void U_CALLCONV | |
244 | derivedNormalizationPropertiesLineFn(void *context, | |
245 | char *fields[][2], int32_t fieldCount, | |
246 | UErrorCode *pErrorCode) { | |
247 | UChar string[32]; | |
248 | char *s; | |
249 | uint32_t start, end; | |
250 | int32_t count; | |
251 | uint8_t qcFlags; | |
252 | ||
253 | /* get code point range */ | |
254 | count=u_parseCodePointRange(fields[0][0], &start, &end, pErrorCode); | |
255 | if(U_FAILURE(*pErrorCode)) { | |
256 | fprintf(stderr, "gennorm: error parsing DerivedNormalizationProperties.txt mapping at %s\n", fields[0][0]); | |
257 | exit(*pErrorCode); | |
258 | } | |
259 | ||
260 | /* ignore hangul - handle explicitly */ | |
261 | if(start==0xac00) { | |
262 | return; | |
263 | } | |
264 | ||
265 | /* get property - ignore unrecognized ones */ | |
266 | s=(char *)u_skipWhitespace(fields[1][0]); | |
267 | if(*s=='N' && s[1]=='F') { | |
268 | /* quick check flag */ | |
269 | qcFlags=0x11; | |
270 | s+=2; | |
271 | if(*s=='K') { | |
272 | qcFlags<<=1; | |
273 | ++s; | |
274 | } | |
275 | ||
276 | if(*s=='C' && s[1]=='_') { | |
277 | s+=2; | |
278 | } else if(*s=='D' && s[1]=='_') { | |
279 | qcFlags<<=2; | |
280 | s+=2; | |
281 | } else { | |
282 | return; | |
283 | } | |
284 | ||
374ca955 | 285 | if(0==uprv_strncmp(s, "NO", 2)) { |
b75a7d8f | 286 | qcFlags&=0xf; |
374ca955 | 287 | } else if(0==uprv_strncmp(s, "MAYBE", 5)) { |
b75a7d8f | 288 | qcFlags&=0x30; |
374ca955 A |
289 | } else if(0==uprv_strncmp(s, "QC", 2) && *(s=(char *)u_skipWhitespace(s+2))==';') { |
290 | /* | |
291 | * Unicode 4.0.1: | |
292 | * changes single field "NFD_NO" -> two fields "NFD_QC; N" etc. | |
293 | */ | |
294 | /* start of the field */ | |
295 | s=(char *)u_skipWhitespace(s+1); | |
296 | if(*s=='N') { | |
297 | qcFlags&=0xf; | |
298 | } else if(*s=='M') { | |
299 | qcFlags&=0x30; | |
300 | } else { | |
301 | return; /* do nothing for "Yes" because it's the default value */ | |
302 | } | |
b75a7d8f | 303 | } else { |
374ca955 | 304 | return; /* do nothing for "Yes" because it's the default value */ |
b75a7d8f A |
305 | } |
306 | ||
307 | /* set this flag for all code points in this range */ | |
308 | while(start<=end) { | |
309 | setQCFlags(start++, qcFlags); | |
310 | } | |
311 | } else if(0==uprv_memcmp(s, "Comp_Ex", 7) || 0==uprv_memcmp(s, "Full_Composition_Exclusion", 26)) { | |
312 | /* full composition exclusion */ | |
313 | while(start<=end) { | |
314 | setCompositionExclusion(start++); | |
315 | } | |
374ca955 A |
316 | } else if( |
317 | ((0==uprv_memcmp(s, "FNC", 3) && *(s=(char *)u_skipWhitespace(s+3))==';') || | |
318 | (0==uprv_memcmp(s, "FC_NFKC", 7) && *(s=(char *)u_skipWhitespace(s+7))==';')) | |
319 | ||
320 | ) { | |
b75a7d8f A |
321 | /* FC_NFKC_Closure, parse field 2 to get the string */ |
322 | char *t; | |
323 | ||
324 | /* start of the field */ | |
325 | s=(char *)u_skipWhitespace(s+1); | |
326 | ||
327 | /* find the end of the field */ | |
328 | for(t=s; *t!=';' && *t!='#' && *t!=0 && *t!='\n' && *t!='\r'; ++t) {} | |
329 | *t=0; | |
330 | ||
331 | string[0]=(UChar)u_parseString(s, string+1, 31, NULL, pErrorCode); | |
332 | if(U_FAILURE(*pErrorCode)) { | |
333 | fprintf(stderr, "gennorm error: illegal FNC string at %s\n", fields[0][0]); | |
334 | exit(*pErrorCode); | |
335 | } | |
336 | while(start<=end) { | |
337 | setFNC(start++, string); | |
338 | } | |
339 | } | |
340 | } | |
341 | ||
342 | static void | |
343 | parseDerivedNormalizationProperties(const char *filename, UErrorCode *pErrorCode, UBool reportError) { | |
344 | char *fields[2][2]; | |
345 | ||
346 | if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { | |
347 | return; | |
348 | } | |
349 | ||
350 | u_parseDelimitedFile(filename, ';', fields, 2, derivedNormalizationPropertiesLineFn, NULL, pErrorCode); | |
351 | if(U_FAILURE(*pErrorCode) && (reportError || *pErrorCode!=U_FILE_ACCESS_ERROR)) { | |
352 | fprintf(stderr, "gennorm error: u_parseDelimitedFile(\"%s\") failed - %s\n", filename, u_errorName(*pErrorCode)); | |
353 | exit(*pErrorCode); | |
354 | } | |
355 | } | |
356 | ||
357 | /* parser for UnicodeData.txt ----------------------------------------------- */ | |
358 | ||
359 | static void U_CALLCONV | |
360 | unicodeDataLineFn(void *context, | |
361 | char *fields[][2], int32_t fieldCount, | |
362 | UErrorCode *pErrorCode) { | |
363 | uint32_t decomp[40]; | |
364 | Norm norm; | |
365 | const char *s; | |
366 | char *end; | |
367 | uint32_t code, value; | |
368 | int32_t length; | |
369 | UBool isCompat, something=FALSE; | |
370 | ||
371 | /* ignore First and Last entries for ranges */ | |
372 | if( *fields[1][0]=='<' && | |
373 | (length=(int32_t)(fields[1][1]-fields[1][0]))>=9 && | |
374 | (0==uprv_memcmp(", First>", fields[1][1]-8, 8) || 0==uprv_memcmp(", Last>", fields[1][1]-7, 7)) | |
375 | ) { | |
376 | return; | |
377 | } | |
378 | ||
379 | /* reset the properties */ | |
380 | uprv_memset(&norm, 0, sizeof(Norm)); | |
381 | ||
382 | /* get the character code, field 0 */ | |
383 | code=(uint32_t)uprv_strtoul(fields[0][0], &end, 16); | |
384 | if(end<=fields[0][0] || end!=fields[0][1]) { | |
385 | fprintf(stderr, "gennorm: syntax error in field 0 at %s\n", fields[0][0]); | |
386 | *pErrorCode=U_PARSE_ERROR; | |
387 | exit(U_PARSE_ERROR); | |
388 | } | |
389 | ||
390 | /* get canonical combining class, field 3 */ | |
391 | value=(uint32_t)uprv_strtoul(fields[3][0], &end, 10); | |
392 | if(end<=fields[3][0] || end!=fields[3][1] || value>0xff) { | |
393 | fprintf(stderr, "gennorm: syntax error in field 3 at %s\n", fields[0][0]); | |
394 | *pErrorCode=U_PARSE_ERROR; | |
395 | exit(U_PARSE_ERROR); | |
396 | } | |
397 | if(value>0) { | |
398 | norm.udataCC=(uint8_t)value; | |
399 | something=TRUE; | |
400 | } | |
401 | ||
402 | /* get the decomposition, field 5 */ | |
403 | if(fields[5][0]<fields[5][1]) { | |
404 | if(*(s=fields[5][0])=='<') { | |
405 | ++s; | |
406 | isCompat=TRUE; | |
407 | ||
408 | /* skip and ignore the compatibility type name */ | |
409 | do { | |
410 | if(s==fields[5][1]) { | |
411 | /* missing '>' */ | |
412 | fprintf(stderr, "gennorm: syntax error in field 5 at %s\n", fields[0][0]); | |
413 | *pErrorCode=U_PARSE_ERROR; | |
414 | exit(U_PARSE_ERROR); | |
415 | } | |
416 | } while(*s++!='>'); | |
417 | } else { | |
418 | isCompat=FALSE; | |
419 | } | |
420 | ||
421 | /* parse the decomposition string */ | |
422 | length=u_parseCodePoints(s, decomp, sizeof(decomp)/4, pErrorCode); | |
423 | if(U_FAILURE(*pErrorCode)) { | |
424 | fprintf(stderr, "gennorm error parsing UnicodeData.txt decomposition of U+%04lx - %s\n", | |
425 | (long)code, u_errorName(*pErrorCode)); | |
426 | exit(*pErrorCode); | |
427 | } | |
428 | ||
429 | /* store the string */ | |
430 | if(length>0) { | |
431 | something=TRUE; | |
432 | if(isCompat) { | |
433 | norm.lenNFKD=(uint8_t)length; | |
434 | norm.nfkd=decomp; | |
435 | } else { | |
436 | if(length>2) { | |
437 | fprintf(stderr, "gennorm: error - length of NFD(U+%04lx) = %ld >2 in UnicodeData - illegal\n", | |
438 | (long)code, (long)length); | |
439 | *pErrorCode=U_PARSE_ERROR; | |
440 | exit(U_PARSE_ERROR); | |
441 | } | |
442 | norm.lenNFD=(uint8_t)length; | |
443 | norm.nfd=decomp; | |
444 | } | |
445 | } | |
446 | } | |
447 | ||
448 | /* check for non-character code points */ | |
449 | if((code&0xfffe)==0xfffe || (uint32_t)(code-0xfdd0)<0x20 || code>0x10ffff) { | |
450 | fprintf(stderr, "gennorm: error - properties for non-character code point U+%04lx\n", | |
451 | (long)code); | |
452 | *pErrorCode=U_PARSE_ERROR; | |
453 | exit(U_PARSE_ERROR); | |
454 | } | |
455 | ||
456 | if(something) { | |
457 | /* there are normalization values, so store them */ | |
458 | #if 0 | |
459 | if(beVerbose) { | |
460 | printf("store values for U+%04lx: cc=%d, lenNFD=%ld, lenNFKD=%ld\n", | |
461 | (long)code, norm.udataCC, (long)norm.lenNFD, (long)norm.lenNFKD); | |
462 | } | |
463 | #endif | |
464 | storeNorm(code, &norm); | |
465 | } | |
466 | } | |
467 | ||
468 | static void | |
469 | parseDB(const char *filename, UErrorCode *pErrorCode) { | |
470 | char *fields[15][2]; | |
471 | ||
472 | if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { | |
473 | return; | |
474 | } | |
475 | ||
476 | u_parseDelimitedFile(filename, ';', fields, 15, unicodeDataLineFn, NULL, pErrorCode); | |
477 | if(U_FAILURE(*pErrorCode)) { | |
478 | fprintf(stderr, "gennorm error: u_parseDelimitedFile(\"%s\") failed - %s\n", filename, u_errorName(*pErrorCode)); | |
479 | exit(*pErrorCode); | |
480 | } | |
481 | } | |
482 | ||
483 | #endif /* #if !UCONFIG_NO_NORMALIZATION */ | |
484 | ||
485 | /* | |
486 | * Hey, Emacs, please set the following: | |
487 | * | |
488 | * Local Variables: | |
489 | * indent-tabs-mode: nil | |
490 | * End: | |
491 | * | |
492 | */ |