]> git.saurik.com Git - apple/icu.git/blob - icuSources/tools/gennorm/gennorm.c
ICU-6.2.10.tar.gz
[apple/icu.git] / icuSources / tools / gennorm / gennorm.c
1 /*
2 *******************************************************************************
3 *
4 * Copyright (C) 2001-2004, International Business Machines
5 * Corporation and others. All Rights Reserved.
6 *
7 *******************************************************************************
8 * file name: gennorm.c
9 * encoding: US-ASCII
10 * tab size: 8 (not used)
11 * indentation:4
12 *
13 * created on: 2001may25
14 * created by: Markus W. Scherer
15 *
16 * This program reads the Unicode character database text file,
17 * parses it, and extracts the data for normalization.
18 * It then preprocesses it and writes a binary file for efficient use
19 * in various Unicode text normalization processes.
20 */
21
22 #include <stdio.h>
23 #include <stdlib.h>
24 #include "unicode/utypes.h"
25 #include "unicode/uchar.h"
26 #include "unicode/ustring.h"
27 #include "unicode/putil.h"
28 #include "unicode/uclean.h"
29 #include "unicode/udata.h"
30 #include "unicode/uset.h"
31 #include "cmemory.h"
32 #include "cstring.h"
33 #include "unewdata.h"
34 #include "uoptions.h"
35 #include "uparse.h"
36 #include "unormimp.h"
37
38 U_CDECL_BEGIN
39 #include "gennorm.h"
40 U_CDECL_END
41
42 #ifdef WIN32
43 # pragma warning(disable: 4100)
44 #endif
45
/*
 * Tool-wide flags, assigned in main() from the command line:
 * beVerbose     - whether the verbose option occurred
 * haveCopyright - whether the copyright option occurred
 * Both have external linkage.
 * NOTE(review): presumably they are read by the other gennorm source files -
 * confirm against gennorm.h.
 */
UBool beVerbose=FALSE, haveCopyright=TRUE;

/* prototypes --------------------------------------------------------------- */

/*
 * Parse the derived normalization properties file.
 * With reportError==FALSE a U_FILE_ACCESS_ERROR is returned to the caller
 * via *pErrorCode instead of terminating the program.
 */
static void
parseDerivedNormalizationProperties(const char *filename, UErrorCode *pErrorCode, UBool reportError);

/*
 * Parse UnicodeData.txt; any failure terminates the program.
 */
static void
parseDB(const char *filename, UErrorCode *pErrorCode);
55
56 /* -------------------------------------------------------------------------- */
57
/*
 * Indexes into options[] below.
 * Each constant is the array index of the corresponding entry,
 * so the two lists must be kept in exactly the same order.
 */
enum {
    HELP_H,
    HELP_QUESTION_MARK,
    VERBOSE,
    COPYRIGHT,
    DESTDIR,
    SOURCEDIR,
    UNICODE_VERSION,
    ICUDATADIR
};

/*
 * Command line options, passed to u_parseArgs() in main().
 * All entries use the predefined UOPTION_ macros except for the
 * tool-specific "unicode" option (short form 'u'), which requires an argument.
 */
static UOption options[]={
    UOPTION_HELP_H,
    UOPTION_HELP_QUESTION_MARK,
    UOPTION_VERBOSE,
    UOPTION_COPYRIGHT,
    UOPTION_DESTDIR,
    UOPTION_SOURCEDIR,
    { "unicode", NULL, NULL, NULL, 'u', UOPT_REQUIRES_ARG, 0 },
    UOPTION_ICUDATADIR
};
79
80 extern int
81 main(int argc, char* argv[]) {
82 #if !UCONFIG_NO_NORMALIZATION
83 char filename[300];
84 #endif
85 const char *srcDir=NULL, *destDir=NULL, *suffix=NULL;
86 char *basename=NULL;
87 UErrorCode errorCode=U_ZERO_ERROR;
88
89 U_MAIN_INIT_ARGS(argc, argv);
90
91 /* preset then read command line options */
92 options[4].value=u_getDataDirectory();
93 options[5].value="";
94 options[6].value="3.0.0";
95 options[ICUDATADIR].value=u_getDataDirectory();
96 argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), options);
97
98 /* error handling, printing usage message */
99 if(argc<0) {
100 fprintf(stderr,
101 "error in command line argument \"%s\"\n",
102 argv[-argc]);
103 }
104 if(argc<0 || options[0].doesOccur || options[1].doesOccur) {
105 /*
106 * Broken into chucks because the C89 standard says the minimum
107 * required supported string length is 509 bytes.
108 */
109 fprintf(stderr,
110 "Usage: %s [-options] [suffix]\n"
111 "\n"
112 "Read the UnicodeData.txt file and other Unicode properties files and\n"
113 "create a binary file " U_ICUDATA_NAME "_" DATA_NAME "." DATA_TYPE " with the normalization data\n"
114 "\n",
115 argv[0]);
116 fprintf(stderr,
117 "Options:\n"
118 "\t-h or -? or --help this usage text\n"
119 "\t-v or --verbose verbose output\n"
120 "\t-c or --copyright include a copyright notice\n"
121 "\t-u or --unicode Unicode version, followed by the version like 3.0.0\n");
122 fprintf(stderr,
123 "\t-d or --destdir destination directory, followed by the path\n"
124 "\t-s or --sourcedir source directory, followed by the path\n"
125 "\t-i or --icudatadir directory for locating any needed intermediate data files,\n"
126 "\t followed by path, defaults to <%s>\n"
127 "\tsuffix suffix that is to be appended with a '-'\n"
128 "\t to the source file basenames before opening;\n"
129 "\t 'gennorm new' will read UnicodeData-new.txt etc.\n",
130 u_getDataDirectory());
131 return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR;
132 }
133
134 /* get the options values */
135 beVerbose=options[2].doesOccur;
136 haveCopyright=options[3].doesOccur;
137 srcDir=options[5].value;
138 destDir=options[4].value;
139
140 if(argc>=2) {
141 suffix=argv[1];
142 } else {
143 suffix=NULL;
144 }
145
146 #if UCONFIG_NO_NORMALIZATION
147
148 fprintf(stderr,
149 "gennorm writes a dummy " U_ICUDATA_NAME "_" DATA_NAME "." DATA_TYPE
150 " because UCONFIG_NO_NORMALIZATION is set, \n"
151 "see icu/source/common/unicode/uconfig.h\n");
152 generateData(destDir);
153
154 #else
155
156 setUnicodeVersion(options[6].value);
157
158 if (options[ICUDATADIR].doesOccur) {
159 u_setDataDirectory(options[ICUDATADIR].value);
160 }
161
162 /*
163 * Verify that we can work with properties
164 * but don't call u_init() because that needs unorm.icu which we are just
165 * going to build here.
166 */
167 {
168 U_STRING_DECL(ideo, "[:Ideographic:]", 15);
169 USet *set;
170
171 U_STRING_INIT(ideo, "[:Ideographic:]", 15);
172 set=uset_openPattern(ideo, -1, &errorCode);
173 if(U_FAILURE(errorCode) || !uset_contains(set, 0xf900)) {
174 fprintf(stderr, "gennorm is unable to work with properties (uprops.icu): %s\n", u_errorName(errorCode));
175 exit(errorCode);
176 }
177 uset_close(set);
178 }
179
180 /* prepare the filename beginning with the source dir */
181 uprv_strcpy(filename, srcDir);
182 basename=filename+uprv_strlen(filename);
183 if(basename>filename && *(basename-1)!=U_FILE_SEP_CHAR) {
184 *basename++=U_FILE_SEP_CHAR;
185 }
186
187 /* initialize */
188 init();
189
190 /* process DerivedNormalizationProps.txt (name changed for Unicode 3.2, to <=31 characters) */
191 if(suffix==NULL) {
192 uprv_strcpy(basename, "DerivedNormalizationProps.txt");
193 } else {
194 uprv_strcpy(basename, "DerivedNormalizationProps");
195 basename[30]='-';
196 uprv_strcpy(basename+31, suffix);
197 uprv_strcat(basename+31, ".txt");
198 }
199 parseDerivedNormalizationProperties(filename, &errorCode, FALSE);
200 if(U_FAILURE(errorCode)) {
201 /* can be only U_FILE_ACCESS_ERROR - try filename from before Unicode 3.2 */
202 if(suffix==NULL) {
203 uprv_strcpy(basename, "DerivedNormalizationProperties.txt");
204 } else {
205 uprv_strcpy(basename, "DerivedNormalizationProperties");
206 basename[30]='-';
207 uprv_strcpy(basename+31, suffix);
208 uprv_strcat(basename+31, ".txt");
209 }
210 parseDerivedNormalizationProperties(filename, &errorCode, TRUE);
211 }
212
213 /* process UnicodeData.txt */
214 if(suffix==NULL) {
215 uprv_strcpy(basename, "UnicodeData.txt");
216 } else {
217 uprv_strcpy(basename, "UnicodeData");
218 basename[11]='-';
219 uprv_strcpy(basename+12, suffix);
220 uprv_strcat(basename+12, ".txt");
221 }
222 parseDB(filename, &errorCode);
223
224 /* process parsed data */
225 if(U_SUCCESS(errorCode)) {
226 processData();
227
228 /* write the properties data file */
229 generateData(destDir);
230
231 cleanUpData();
232 }
233
234 #endif
235
236 return errorCode;
237 }
238
239 #if !UCONFIG_NO_NORMALIZATION
240
241 /* parser for DerivedNormalizationProperties.txt ---------------------------- */
242
243 static void U_CALLCONV
244 derivedNormalizationPropertiesLineFn(void *context,
245 char *fields[][2], int32_t fieldCount,
246 UErrorCode *pErrorCode) {
247 UChar string[32];
248 char *s;
249 uint32_t start, end;
250 int32_t count;
251 uint8_t qcFlags;
252
253 /* get code point range */
254 count=u_parseCodePointRange(fields[0][0], &start, &end, pErrorCode);
255 if(U_FAILURE(*pErrorCode)) {
256 fprintf(stderr, "gennorm: error parsing DerivedNormalizationProperties.txt mapping at %s\n", fields[0][0]);
257 exit(*pErrorCode);
258 }
259
260 /* ignore hangul - handle explicitly */
261 if(start==0xac00) {
262 return;
263 }
264
265 /* get property - ignore unrecognized ones */
266 s=(char *)u_skipWhitespace(fields[1][0]);
267 if(*s=='N' && s[1]=='F') {
268 /* quick check flag */
269 qcFlags=0x11;
270 s+=2;
271 if(*s=='K') {
272 qcFlags<<=1;
273 ++s;
274 }
275
276 if(*s=='C' && s[1]=='_') {
277 s+=2;
278 } else if(*s=='D' && s[1]=='_') {
279 qcFlags<<=2;
280 s+=2;
281 } else {
282 return;
283 }
284
285 if(0==uprv_strncmp(s, "NO", 2)) {
286 qcFlags&=0xf;
287 } else if(0==uprv_strncmp(s, "MAYBE", 5)) {
288 qcFlags&=0x30;
289 } else if(0==uprv_strncmp(s, "QC", 2) && *(s=(char *)u_skipWhitespace(s+2))==';') {
290 /*
291 * Unicode 4.0.1:
292 * changes single field "NFD_NO" -> two fields "NFD_QC; N" etc.
293 */
294 /* start of the field */
295 s=(char *)u_skipWhitespace(s+1);
296 if(*s=='N') {
297 qcFlags&=0xf;
298 } else if(*s=='M') {
299 qcFlags&=0x30;
300 } else {
301 return; /* do nothing for "Yes" because it's the default value */
302 }
303 } else {
304 return; /* do nothing for "Yes" because it's the default value */
305 }
306
307 /* set this flag for all code points in this range */
308 while(start<=end) {
309 setQCFlags(start++, qcFlags);
310 }
311 } else if(0==uprv_memcmp(s, "Comp_Ex", 7) || 0==uprv_memcmp(s, "Full_Composition_Exclusion", 26)) {
312 /* full composition exclusion */
313 while(start<=end) {
314 setCompositionExclusion(start++);
315 }
316 } else if(
317 ((0==uprv_memcmp(s, "FNC", 3) && *(s=(char *)u_skipWhitespace(s+3))==';') ||
318 (0==uprv_memcmp(s, "FC_NFKC", 7) && *(s=(char *)u_skipWhitespace(s+7))==';'))
319
320 ) {
321 /* FC_NFKC_Closure, parse field 2 to get the string */
322 char *t;
323
324 /* start of the field */
325 s=(char *)u_skipWhitespace(s+1);
326
327 /* find the end of the field */
328 for(t=s; *t!=';' && *t!='#' && *t!=0 && *t!='\n' && *t!='\r'; ++t) {}
329 *t=0;
330
331 string[0]=(UChar)u_parseString(s, string+1, 31, NULL, pErrorCode);
332 if(U_FAILURE(*pErrorCode)) {
333 fprintf(stderr, "gennorm error: illegal FNC string at %s\n", fields[0][0]);
334 exit(*pErrorCode);
335 }
336 while(start<=end) {
337 setFNC(start++, string);
338 }
339 }
340 }
341
342 static void
343 parseDerivedNormalizationProperties(const char *filename, UErrorCode *pErrorCode, UBool reportError) {
344 char *fields[2][2];
345
346 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
347 return;
348 }
349
350 u_parseDelimitedFile(filename, ';', fields, 2, derivedNormalizationPropertiesLineFn, NULL, pErrorCode);
351 if(U_FAILURE(*pErrorCode) && (reportError || *pErrorCode!=U_FILE_ACCESS_ERROR)) {
352 fprintf(stderr, "gennorm error: u_parseDelimitedFile(\"%s\") failed - %s\n", filename, u_errorName(*pErrorCode));
353 exit(*pErrorCode);
354 }
355 }
356
357 /* parser for UnicodeData.txt ----------------------------------------------- */
358
359 static void U_CALLCONV
360 unicodeDataLineFn(void *context,
361 char *fields[][2], int32_t fieldCount,
362 UErrorCode *pErrorCode) {
363 uint32_t decomp[40];
364 Norm norm;
365 const char *s;
366 char *end;
367 uint32_t code, value;
368 int32_t length;
369 UBool isCompat, something=FALSE;
370
371 /* ignore First and Last entries for ranges */
372 if( *fields[1][0]=='<' &&
373 (length=(int32_t)(fields[1][1]-fields[1][0]))>=9 &&
374 (0==uprv_memcmp(", First>", fields[1][1]-8, 8) || 0==uprv_memcmp(", Last>", fields[1][1]-7, 7))
375 ) {
376 return;
377 }
378
379 /* reset the properties */
380 uprv_memset(&norm, 0, sizeof(Norm));
381
382 /* get the character code, field 0 */
383 code=(uint32_t)uprv_strtoul(fields[0][0], &end, 16);
384 if(end<=fields[0][0] || end!=fields[0][1]) {
385 fprintf(stderr, "gennorm: syntax error in field 0 at %s\n", fields[0][0]);
386 *pErrorCode=U_PARSE_ERROR;
387 exit(U_PARSE_ERROR);
388 }
389
390 /* get canonical combining class, field 3 */
391 value=(uint32_t)uprv_strtoul(fields[3][0], &end, 10);
392 if(end<=fields[3][0] || end!=fields[3][1] || value>0xff) {
393 fprintf(stderr, "gennorm: syntax error in field 3 at %s\n", fields[0][0]);
394 *pErrorCode=U_PARSE_ERROR;
395 exit(U_PARSE_ERROR);
396 }
397 if(value>0) {
398 norm.udataCC=(uint8_t)value;
399 something=TRUE;
400 }
401
402 /* get the decomposition, field 5 */
403 if(fields[5][0]<fields[5][1]) {
404 if(*(s=fields[5][0])=='<') {
405 ++s;
406 isCompat=TRUE;
407
408 /* skip and ignore the compatibility type name */
409 do {
410 if(s==fields[5][1]) {
411 /* missing '>' */
412 fprintf(stderr, "gennorm: syntax error in field 5 at %s\n", fields[0][0]);
413 *pErrorCode=U_PARSE_ERROR;
414 exit(U_PARSE_ERROR);
415 }
416 } while(*s++!='>');
417 } else {
418 isCompat=FALSE;
419 }
420
421 /* parse the decomposition string */
422 length=u_parseCodePoints(s, decomp, sizeof(decomp)/4, pErrorCode);
423 if(U_FAILURE(*pErrorCode)) {
424 fprintf(stderr, "gennorm error parsing UnicodeData.txt decomposition of U+%04lx - %s\n",
425 (long)code, u_errorName(*pErrorCode));
426 exit(*pErrorCode);
427 }
428
429 /* store the string */
430 if(length>0) {
431 something=TRUE;
432 if(isCompat) {
433 norm.lenNFKD=(uint8_t)length;
434 norm.nfkd=decomp;
435 } else {
436 if(length>2) {
437 fprintf(stderr, "gennorm: error - length of NFD(U+%04lx) = %ld >2 in UnicodeData - illegal\n",
438 (long)code, (long)length);
439 *pErrorCode=U_PARSE_ERROR;
440 exit(U_PARSE_ERROR);
441 }
442 norm.lenNFD=(uint8_t)length;
443 norm.nfd=decomp;
444 }
445 }
446 }
447
448 /* check for non-character code points */
449 if((code&0xfffe)==0xfffe || (uint32_t)(code-0xfdd0)<0x20 || code>0x10ffff) {
450 fprintf(stderr, "gennorm: error - properties for non-character code point U+%04lx\n",
451 (long)code);
452 *pErrorCode=U_PARSE_ERROR;
453 exit(U_PARSE_ERROR);
454 }
455
456 if(something) {
457 /* there are normalization values, so store them */
458 #if 0
459 if(beVerbose) {
460 printf("store values for U+%04lx: cc=%d, lenNFD=%ld, lenNFKD=%ld\n",
461 (long)code, norm.udataCC, (long)norm.lenNFD, (long)norm.lenNFKD);
462 }
463 #endif
464 storeNorm(code, &norm);
465 }
466 }
467
468 static void
469 parseDB(const char *filename, UErrorCode *pErrorCode) {
470 char *fields[15][2];
471
472 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
473 return;
474 }
475
476 u_parseDelimitedFile(filename, ';', fields, 15, unicodeDataLineFn, NULL, pErrorCode);
477 if(U_FAILURE(*pErrorCode)) {
478 fprintf(stderr, "gennorm error: u_parseDelimitedFile(\"%s\") failed - %s\n", filename, u_errorName(*pErrorCode));
479 exit(*pErrorCode);
480 }
481 }
482
483 #endif /* #if !UCONFIG_NO_NORMALIZATION */
484
485 /*
486 * Hey, Emacs, please set the following:
487 *
488 * Local Variables:
489 * indent-tabs-mode: nil
490 * End:
491 *
492 */