]> git.saurik.com Git - apple/icu.git/blob - icuSources/tools/gennorm/gennorm.c
ICU-8.11.1.tar.gz
[apple/icu.git] / icuSources / tools / gennorm / gennorm.c
1 /*
2 *******************************************************************************
3 *
4 * Copyright (C) 2001-2005, International Business Machines
5 * Corporation and others. All Rights Reserved.
6 *
7 *******************************************************************************
8 * file name: gennorm.c
9 * encoding: US-ASCII
10 * tab size: 8 (not used)
11 * indentation:4
12 *
13 * created on: 2001may25
14 * created by: Markus W. Scherer
15 *
16 * This program reads the Unicode character database text file,
17 * parses it, and extracts the data for normalization.
18 * It then preprocesses it and writes a binary file for efficient use
19 * in various Unicode text normalization processes.
20 */
21
22 #include <stdio.h>
23 #include <stdlib.h>
24 #include "unicode/utypes.h"
25 #include "unicode/uchar.h"
26 #include "unicode/ustring.h"
27 #include "unicode/putil.h"
28 #include "unicode/uclean.h"
29 #include "unicode/udata.h"
30 #include "unicode/uset.h"
31 #include "cmemory.h"
32 #include "cstring.h"
33 #include "unewdata.h"
34 #include "uoptions.h"
35 #include "uparse.h"
36 #include "unormimp.h"
37
38 U_CDECL_BEGIN
39 #include "gennorm.h"
40 U_CDECL_END
41
42 UBool beVerbose=FALSE, haveCopyright=TRUE;
43
44 /* prototypes --------------------------------------------------------------- */
45
46 static void
47 parseDerivedNormalizationProperties(const char *filename, UErrorCode *pErrorCode, UBool reportError);
48
49 static void
50 parseDB(const char *filename, UErrorCode *pErrorCode);
51
52 /* -------------------------------------------------------------------------- */
53
/*
 * Indexes into the options[] table; the order of the enumerators must
 * stay in sync with the order of the entries in that table.
 */
enum {
    HELP_H,             /* -h */
    HELP_QUESTION_MARK, /* -? */
    VERBOSE,            /* -v */
    COPYRIGHT,          /* -c */
    DESTDIR,            /* -d */
    SOURCEDIR,          /* -s */
    UNICODE_VERSION,    /* -u */
    ICUDATADIR,         /* -i */
    CSOURCE,            /* -C */
    STORE_FLAGS         /* -p (prune) */
};
66
67 static UOption options[]={
68 UOPTION_HELP_H,
69 UOPTION_HELP_QUESTION_MARK,
70 UOPTION_VERBOSE,
71 UOPTION_COPYRIGHT,
72 UOPTION_DESTDIR,
73 UOPTION_SOURCEDIR,
74 UOPTION_DEF("unicode", 'u', UOPT_REQUIRES_ARG),
75 UOPTION_ICUDATADIR,
76 UOPTION_DEF("csource", 'C', UOPT_NO_ARG),
77 UOPTION_DEF("prune", 'p', UOPT_REQUIRES_ARG)
78 };
79
80 extern int
81 main(int argc, char* argv[]) {
82 #if !UCONFIG_NO_NORMALIZATION
83 char filename[300];
84 #endif
85 const char *srcDir=NULL, *destDir=NULL, *suffix=NULL;
86 char *basename=NULL;
87 UErrorCode errorCode=U_ZERO_ERROR;
88
89 U_MAIN_INIT_ARGS(argc, argv);
90
91 /* preset then read command line options */
92 options[4].value=u_getDataDirectory();
93 options[5].value="";
94 options[6].value="3.0.0";
95 options[ICUDATADIR].value=u_getDataDirectory();
96 argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), options);
97
98 /* error handling, printing usage message */
99 if(argc<0) {
100 fprintf(stderr,
101 "error in command line argument \"%s\"\n",
102 argv[-argc]);
103 }
104 if(argc<0 || options[0].doesOccur || options[1].doesOccur) {
105 /*
106 * Broken into chucks because the C89 standard says the minimum
107 * required supported string length is 509 bytes.
108 */
109 fprintf(stderr,
110 "Usage: %s [-options] [suffix]\n"
111 "\n"
112 "Read the UnicodeData.txt file and other Unicode properties files and\n"
113 "create a binary file " U_ICUDATA_NAME "_" DATA_NAME "." DATA_TYPE " with the normalization data\n"
114 "\n",
115 argv[0]);
116 fprintf(stderr,
117 "Options:\n"
118 "\t-h or -? or --help this usage text\n"
119 "\t-v or --verbose verbose output\n"
120 "\t-c or --copyright include a copyright notice\n"
121 "\t-u or --unicode Unicode version, followed by the version like 3.0.0\n"
122 "\t-C or --csource generate a .c source file rather than the .icu binary\n");
123 fprintf(stderr,
124 "\t-p or --prune flags Prune for data modularization:\n"
125 "\t Determine what data is to be stored.\n"
126 "\t 0 (zero) stores minimal data (only for NFD)\n"
127 "\t lowercase letters turn off data, uppercase turn on (use with 0)\n");
128 fprintf(stderr,
129 "\t k: compatibility decompositions (NFKC, NFKD)\n"
130 "\t c: composition data (NFC, NFKC)\n"
131 "\t f: FCD data (will be generated at load time)\n"
132 "\t a: auxiliary data (canonical closure etc.)\n"
133 "\t x: exclusion sets (Unicode 3.2-level normalization)\n");
134 fprintf(stderr,
135 "\t-d or --destdir destination directory, followed by the path\n"
136 "\t-s or --sourcedir source directory, followed by the path\n"
137 "\t-i or --icudatadir directory for locating any needed intermediate data files,\n"
138 "\t followed by path, defaults to <%s>\n"
139 "\tsuffix suffix that is to be appended with a '-'\n"
140 "\t to the source file basenames before opening;\n"
141 "\t 'gennorm new' will read UnicodeData-new.txt etc.\n",
142 u_getDataDirectory());
143 return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR;
144 }
145
146 /* get the options values */
147 beVerbose=options[2].doesOccur;
148 haveCopyright=options[3].doesOccur;
149 srcDir=options[5].value;
150 destDir=options[4].value;
151
152 if(argc>=2) {
153 suffix=argv[1];
154 } else {
155 suffix=NULL;
156 }
157
158 #if UCONFIG_NO_NORMALIZATION
159
160 fprintf(stderr,
161 "gennorm writes a dummy " U_ICUDATA_NAME "_" DATA_NAME "." DATA_TYPE
162 " because UCONFIG_NO_NORMALIZATION is set, \n"
163 "see icu/source/common/unicode/uconfig.h\n");
164 generateData(destDir, options[CSOURCE].doesOccur);
165
166 #else
167
168 setUnicodeVersion(options[6].value);
169
170 if (options[ICUDATADIR].doesOccur) {
171 u_setDataDirectory(options[ICUDATADIR].value);
172 }
173
174 if(options[STORE_FLAGS].doesOccur) {
175 const char *s=options[STORE_FLAGS].value;
176 char c;
177
178 while((c=*s++)!=0) {
179 switch(c) {
180 case '0':
181 gStoreFlags=0; /* store minimal data (only for NFD) */
182 break;
183
184 /* lowercase letters: omit data */
185 case 'k':
186 gStoreFlags&=~U_MASK(UGENNORM_STORE_COMPAT);
187 break;
188 case 'c':
189 gStoreFlags&=~U_MASK(UGENNORM_STORE_COMPOSITION);
190 break;
191 case 'f':
192 gStoreFlags&=~U_MASK(UGENNORM_STORE_FCD);
193 break;
194 case 'a':
195 gStoreFlags&=~U_MASK(UGENNORM_STORE_AUX);
196 break;
197 case 'x':
198 gStoreFlags&=~U_MASK(UGENNORM_STORE_EXCLUSIONS);
199 break;
200
201 /* uppercase letters: include data (use with 0) */
202 case 'K':
203 gStoreFlags|=U_MASK(UGENNORM_STORE_COMPAT);
204 break;
205 case 'C':
206 gStoreFlags|=U_MASK(UGENNORM_STORE_COMPOSITION);
207 break;
208 case 'F':
209 gStoreFlags|=U_MASK(UGENNORM_STORE_FCD);
210 break;
211 case 'A':
212 gStoreFlags|=U_MASK(UGENNORM_STORE_AUX);
213 break;
214 case 'X':
215 gStoreFlags|=U_MASK(UGENNORM_STORE_EXCLUSIONS);
216 break;
217
218 default:
219 fprintf(stderr, "ignoring undefined prune flag '%c'\n", c);
220 break;
221 }
222 }
223 }
224
225 /*
226 * Verify that we can work with properties
227 * but don't call u_init() because that needs unorm.icu which we are just
228 * going to build here.
229 */
230 {
231 U_STRING_DECL(ideo, "[:Ideographic:]", 15);
232 USet *set;
233
234 U_STRING_INIT(ideo, "[:Ideographic:]", 15);
235 set=uset_openPattern(ideo, -1, &errorCode);
236 if(U_FAILURE(errorCode) || !uset_contains(set, 0xf900)) {
237 fprintf(stderr, "gennorm is unable to work with properties (uprops.icu): %s\n", u_errorName(errorCode));
238 exit(errorCode);
239 }
240 uset_close(set);
241 }
242
243 /* prepare the filename beginning with the source dir */
244 uprv_strcpy(filename, srcDir);
245 basename=filename+uprv_strlen(filename);
246 if(basename>filename && *(basename-1)!=U_FILE_SEP_CHAR) {
247 *basename++=U_FILE_SEP_CHAR;
248 }
249
250 /* initialize */
251 init();
252
253 /* process DerivedNormalizationProps.txt (name changed for Unicode 3.2, to <=31 characters) */
254 if(suffix==NULL) {
255 uprv_strcpy(basename, "DerivedNormalizationProps.txt");
256 } else {
257 uprv_strcpy(basename, "DerivedNormalizationProps");
258 basename[30]='-';
259 uprv_strcpy(basename+31, suffix);
260 uprv_strcat(basename+31, ".txt");
261 }
262 parseDerivedNormalizationProperties(filename, &errorCode, FALSE);
263 if(U_FAILURE(errorCode)) {
264 /* can be only U_FILE_ACCESS_ERROR - try filename from before Unicode 3.2 */
265 if(suffix==NULL) {
266 uprv_strcpy(basename, "DerivedNormalizationProperties.txt");
267 } else {
268 uprv_strcpy(basename, "DerivedNormalizationProperties");
269 basename[30]='-';
270 uprv_strcpy(basename+31, suffix);
271 uprv_strcat(basename+31, ".txt");
272 }
273 parseDerivedNormalizationProperties(filename, &errorCode, TRUE);
274 }
275
276 /* process UnicodeData.txt */
277 if(suffix==NULL) {
278 uprv_strcpy(basename, "UnicodeData.txt");
279 } else {
280 uprv_strcpy(basename, "UnicodeData");
281 basename[11]='-';
282 uprv_strcpy(basename+12, suffix);
283 uprv_strcat(basename+12, ".txt");
284 }
285 parseDB(filename, &errorCode);
286
287 /* process parsed data */
288 if(U_SUCCESS(errorCode)) {
289 processData();
290
291 /* write the properties data file */
292 generateData(destDir, options[CSOURCE].doesOccur);
293
294 cleanUpData();
295 }
296
297 #endif
298
299 return errorCode;
300 }
301
302 #if !UCONFIG_NO_NORMALIZATION
303
304 /* parser for DerivedNormalizationProperties.txt ---------------------------- */
305
306 static void U_CALLCONV
307 derivedNormalizationPropertiesLineFn(void *context,
308 char *fields[][2], int32_t fieldCount,
309 UErrorCode *pErrorCode) {
310 UChar string[32];
311 char *s;
312 uint32_t start, end;
313 int32_t count;
314 uint8_t qcFlags;
315
316 /* get code point range */
317 count=u_parseCodePointRange(fields[0][0], &start, &end, pErrorCode);
318 if(U_FAILURE(*pErrorCode)) {
319 fprintf(stderr, "gennorm: error parsing DerivedNormalizationProperties.txt mapping at %s\n", fields[0][0]);
320 exit(*pErrorCode);
321 }
322
323 /* ignore hangul - handle explicitly */
324 if(start==0xac00) {
325 return;
326 }
327
328 /* get property - ignore unrecognized ones */
329 s=(char *)u_skipWhitespace(fields[1][0]);
330 if(*s=='N' && s[1]=='F') {
331 /* quick check flag */
332 qcFlags=0x11;
333 s+=2;
334 if(*s=='K') {
335 qcFlags<<=1;
336 ++s;
337 }
338
339 if(*s=='C' && s[1]=='_') {
340 s+=2;
341 } else if(*s=='D' && s[1]=='_') {
342 qcFlags<<=2;
343 s+=2;
344 } else {
345 return;
346 }
347
348 if(0==uprv_strncmp(s, "NO", 2)) {
349 qcFlags&=0xf;
350 } else if(0==uprv_strncmp(s, "MAYBE", 5)) {
351 qcFlags&=0x30;
352 } else if(0==uprv_strncmp(s, "QC", 2) && *(s=(char *)u_skipWhitespace(s+2))==';') {
353 /*
354 * Unicode 4.0.1:
355 * changes single field "NFD_NO" -> two fields "NFD_QC; N" etc.
356 */
357 /* start of the field */
358 s=(char *)u_skipWhitespace(s+1);
359 if(*s=='N') {
360 qcFlags&=0xf;
361 } else if(*s=='M') {
362 qcFlags&=0x30;
363 } else {
364 return; /* do nothing for "Yes" because it's the default value */
365 }
366 } else {
367 return; /* do nothing for "Yes" because it's the default value */
368 }
369
370 /* set this flag for all code points in this range */
371 while(start<=end) {
372 setQCFlags(start++, qcFlags);
373 }
374 } else if(0==uprv_memcmp(s, "Comp_Ex", 7) || 0==uprv_memcmp(s, "Full_Composition_Exclusion", 26)) {
375 /* full composition exclusion */
376 while(start<=end) {
377 setCompositionExclusion(start++);
378 }
379 } else if(
380 ((0==uprv_memcmp(s, "FNC", 3) && *(s=(char *)u_skipWhitespace(s+3))==';') ||
381 (0==uprv_memcmp(s, "FC_NFKC", 7) && *(s=(char *)u_skipWhitespace(s+7))==';'))
382
383 ) {
384 /* FC_NFKC_Closure, parse field 2 to get the string */
385 char *t;
386
387 /* start of the field */
388 s=(char *)u_skipWhitespace(s+1);
389
390 /* find the end of the field */
391 for(t=s; *t!=';' && *t!='#' && *t!=0 && *t!='\n' && *t!='\r'; ++t) {}
392 *t=0;
393
394 string[0]=(UChar)u_parseString(s, string+1, 31, NULL, pErrorCode);
395 if(U_FAILURE(*pErrorCode)) {
396 fprintf(stderr, "gennorm error: illegal FNC string at %s\n", fields[0][0]);
397 exit(*pErrorCode);
398 }
399 while(start<=end) {
400 setFNC(start++, string);
401 }
402 }
403 }
404
405 static void
406 parseDerivedNormalizationProperties(const char *filename, UErrorCode *pErrorCode, UBool reportError) {
407 char *fields[2][2];
408
409 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
410 return;
411 }
412
413 u_parseDelimitedFile(filename, ';', fields, 2, derivedNormalizationPropertiesLineFn, NULL, pErrorCode);
414 if(U_FAILURE(*pErrorCode) && (reportError || *pErrorCode!=U_FILE_ACCESS_ERROR)) {
415 fprintf(stderr, "gennorm error: u_parseDelimitedFile(\"%s\") failed - %s\n", filename, u_errorName(*pErrorCode));
416 exit(*pErrorCode);
417 }
418 }
419
420 /* parser for UnicodeData.txt ----------------------------------------------- */
421
422 static void U_CALLCONV
423 unicodeDataLineFn(void *context,
424 char *fields[][2], int32_t fieldCount,
425 UErrorCode *pErrorCode) {
426 uint32_t decomp[40];
427 Norm norm;
428 const char *s;
429 char *end;
430 uint32_t code, value;
431 int32_t length;
432 UBool isCompat, something=FALSE;
433
434 /* ignore First and Last entries for ranges */
435 if( *fields[1][0]=='<' &&
436 (length=(int32_t)(fields[1][1]-fields[1][0]))>=9 &&
437 (0==uprv_memcmp(", First>", fields[1][1]-8, 8) || 0==uprv_memcmp(", Last>", fields[1][1]-7, 7))
438 ) {
439 return;
440 }
441
442 /* reset the properties */
443 uprv_memset(&norm, 0, sizeof(Norm));
444
445 /*
446 * The combiningIndex must not be initialized to 0 because 0 is the
447 * combiningIndex of the first forward-combining character.
448 */
449 norm.combiningIndex=0xffff;
450
451 /* get the character code, field 0 */
452 code=(uint32_t)uprv_strtoul(fields[0][0], &end, 16);
453 if(end<=fields[0][0] || end!=fields[0][1]) {
454 fprintf(stderr, "gennorm: syntax error in field 0 at %s\n", fields[0][0]);
455 *pErrorCode=U_PARSE_ERROR;
456 exit(U_PARSE_ERROR);
457 }
458
459 /* get canonical combining class, field 3 */
460 value=(uint32_t)uprv_strtoul(fields[3][0], &end, 10);
461 if(end<=fields[3][0] || end!=fields[3][1] || value>0xff) {
462 fprintf(stderr, "gennorm: syntax error in field 3 at %s\n", fields[0][0]);
463 *pErrorCode=U_PARSE_ERROR;
464 exit(U_PARSE_ERROR);
465 }
466 if(value>0) {
467 norm.udataCC=(uint8_t)value;
468 something=TRUE;
469 }
470
471 /* get the decomposition, field 5 */
472 if(fields[5][0]<fields[5][1]) {
473 if(*(s=fields[5][0])=='<') {
474 ++s;
475 isCompat=TRUE;
476
477 /* skip and ignore the compatibility type name */
478 do {
479 if(s==fields[5][1]) {
480 /* missing '>' */
481 fprintf(stderr, "gennorm: syntax error in field 5 at %s\n", fields[0][0]);
482 *pErrorCode=U_PARSE_ERROR;
483 exit(U_PARSE_ERROR);
484 }
485 } while(*s++!='>');
486 } else {
487 isCompat=FALSE;
488 }
489
490 /* parse the decomposition string */
491 length=u_parseCodePoints(s, decomp, sizeof(decomp)/4, pErrorCode);
492 if(U_FAILURE(*pErrorCode)) {
493 fprintf(stderr, "gennorm error parsing UnicodeData.txt decomposition of U+%04lx - %s\n",
494 (long)code, u_errorName(*pErrorCode));
495 exit(*pErrorCode);
496 }
497
498 /* store the string */
499 if(length>0) {
500 something=TRUE;
501 if(isCompat) {
502 norm.lenNFKD=(uint8_t)length;
503 norm.nfkd=decomp;
504 } else {
505 if(length>2) {
506 fprintf(stderr, "gennorm: error - length of NFD(U+%04lx) = %ld >2 in UnicodeData - illegal\n",
507 (long)code, (long)length);
508 *pErrorCode=U_PARSE_ERROR;
509 exit(U_PARSE_ERROR);
510 }
511 norm.lenNFD=(uint8_t)length;
512 norm.nfd=decomp;
513 }
514 }
515 }
516
517 /* check for non-character code points */
518 if((code&0xfffe)==0xfffe || (uint32_t)(code-0xfdd0)<0x20 || code>0x10ffff) {
519 fprintf(stderr, "gennorm: error - properties for non-character code point U+%04lx\n",
520 (long)code);
521 *pErrorCode=U_PARSE_ERROR;
522 exit(U_PARSE_ERROR);
523 }
524
525 if(something) {
526 /* there are normalization values, so store them */
527 #if 0
528 if(beVerbose) {
529 printf("store values for U+%04lx: cc=%d, lenNFD=%ld, lenNFKD=%ld\n",
530 (long)code, norm.udataCC, (long)norm.lenNFD, (long)norm.lenNFKD);
531 }
532 #endif
533 storeNorm(code, &norm);
534 }
535 }
536
537 static void
538 parseDB(const char *filename, UErrorCode *pErrorCode) {
539 char *fields[15][2];
540
541 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
542 return;
543 }
544
545 u_parseDelimitedFile(filename, ';', fields, 15, unicodeDataLineFn, NULL, pErrorCode);
546 if(U_FAILURE(*pErrorCode)) {
547 fprintf(stderr, "gennorm error: u_parseDelimitedFile(\"%s\") failed - %s\n", filename, u_errorName(*pErrorCode));
548 exit(*pErrorCode);
549 }
550 }
551
552 #endif /* #if !UCONFIG_NO_NORMALIZATION */
553
554 /*
555 * Hey, Emacs, please set the following:
556 *
557 * Local Variables:
558 * indent-tabs-mode: nil
559 * End:
560 *
561 */