2 *******************************************************************************
4 * Copyright (C) 2001-2005, International Business Machines
5 * Corporation and others. All Rights Reserved.
7 *******************************************************************************
10 * tab size: 8 (not used)
13 * created on: 2001may25
14 * created by: Markus W. Scherer
16 * This program reads the Unicode character database text file,
17 * parses it, and extracts the data for normalization.
18 * It then preprocesses it and writes a binary file for efficient use
19 * in various Unicode text normalization processes.
24 #include "unicode/utypes.h"
25 #include "unicode/uchar.h"
26 #include "unicode/ustring.h"
27 #include "unicode/putil.h"
28 #include "unicode/uclean.h"
29 #include "unicode/udata.h"
30 #include "unicode/uset.h"
42 UBool beVerbose
=FALSE
, haveCopyright
=TRUE
;
44 /* prototypes --------------------------------------------------------------- */
47 parseDerivedNormalizationProperties(const char *filename
, UErrorCode
*pErrorCode
, UBool reportError
);
50 parseDB(const char *filename
, UErrorCode
*pErrorCode
);
52 /* -------------------------------------------------------------------------- */
67 static UOption options
[]={
69 UOPTION_HELP_QUESTION_MARK
,
74 UOPTION_DEF("unicode", 'u', UOPT_REQUIRES_ARG
),
76 UOPTION_DEF("csource", 'C', UOPT_NO_ARG
),
77 UOPTION_DEF("prune", 'p', UOPT_REQUIRES_ARG
)
81 main(int argc
, char* argv
[]) {
82 #if !UCONFIG_NO_NORMALIZATION
85 const char *srcDir
=NULL
, *destDir
=NULL
, *suffix
=NULL
;
87 UErrorCode errorCode
=U_ZERO_ERROR
;
89 U_MAIN_INIT_ARGS(argc
, argv
);
91 /* preset then read command line options */
92 options
[4].value
=u_getDataDirectory();
94 options
[6].value
="3.0.0";
95 options
[ICUDATADIR
].value
=u_getDataDirectory();
96 argc
=u_parseArgs(argc
, argv
, sizeof(options
)/sizeof(options
[0]), options
);
98 /* error handling, printing usage message */
101 "error in command line argument \"%s\"\n",
104 if(argc
<0 || options
[0].doesOccur
|| options
[1].doesOccur
) {
106 * Broken into chunks because the C89 standard says the minimum
107 * required supported string length is 509 bytes.
110 "Usage: %s [-options] [suffix]\n"
112 "Read the UnicodeData.txt file and other Unicode properties files and\n"
113 "create a binary file " U_ICUDATA_NAME
"_" DATA_NAME
"." DATA_TYPE
" with the normalization data\n"
118 "\t-h or -? or --help this usage text\n"
119 "\t-v or --verbose verbose output\n"
120 "\t-c or --copyright include a copyright notice\n"
121 "\t-u or --unicode Unicode version, followed by the version like 3.0.0\n"
122 "\t-C or --csource generate a .c source file rather than the .icu binary\n");
124 "\t-p or --prune flags Prune for data modularization:\n"
125 "\t Determine what data is to be stored.\n"
126 "\t 0 (zero) stores minimal data (only for NFD)\n"
127 "\t lowercase letters turn off data, uppercase turn on (use with 0)\n");
129 "\t k: compatibility decompositions (NFKC, NFKD)\n"
130 "\t c: composition data (NFC, NFKC)\n"
131 "\t f: FCD data (will be generated at load time)\n"
132 "\t a: auxiliary data (canonical closure etc.)\n"
133 "\t x: exclusion sets (Unicode 3.2-level normalization)\n");
135 "\t-d or --destdir destination directory, followed by the path\n"
136 "\t-s or --sourcedir source directory, followed by the path\n"
137 "\t-i or --icudatadir directory for locating any needed intermediate data files,\n"
138 "\t followed by path, defaults to <%s>\n"
139 "\tsuffix suffix that is to be appended with a '-'\n"
140 "\t to the source file basenames before opening;\n"
141 "\t 'gennorm new' will read UnicodeData-new.txt etc.\n",
142 u_getDataDirectory());
143 return argc
<0 ? U_ILLEGAL_ARGUMENT_ERROR
: U_ZERO_ERROR
;
146 /* get the options values */
147 beVerbose
=options
[2].doesOccur
;
148 haveCopyright
=options
[3].doesOccur
;
149 srcDir
=options
[5].value
;
150 destDir
=options
[4].value
;
158 #if UCONFIG_NO_NORMALIZATION
161 "gennorm writes a dummy " U_ICUDATA_NAME
"_" DATA_NAME
"." DATA_TYPE
162 " because UCONFIG_NO_NORMALIZATION is set, \n"
163 "see icu/source/common/unicode/uconfig.h\n");
164 generateData(destDir
, options
[CSOURCE
].doesOccur
);
168 setUnicodeVersion(options
[6].value
);
170 if (options
[ICUDATADIR
].doesOccur
) {
171 u_setDataDirectory(options
[ICUDATADIR
].value
);
174 if(options
[STORE_FLAGS
].doesOccur
) {
175 const char *s
=options
[STORE_FLAGS
].value
;
181 gStoreFlags
=0; /* store minimal data (only for NFD) */
184 /* lowercase letters: omit data */
186 gStoreFlags
&=~U_MASK(UGENNORM_STORE_COMPAT
);
189 gStoreFlags
&=~U_MASK(UGENNORM_STORE_COMPOSITION
);
192 gStoreFlags
&=~U_MASK(UGENNORM_STORE_FCD
);
195 gStoreFlags
&=~U_MASK(UGENNORM_STORE_AUX
);
198 gStoreFlags
&=~U_MASK(UGENNORM_STORE_EXCLUSIONS
);
201 /* uppercase letters: include data (use with 0) */
203 gStoreFlags
|=U_MASK(UGENNORM_STORE_COMPAT
);
206 gStoreFlags
|=U_MASK(UGENNORM_STORE_COMPOSITION
);
209 gStoreFlags
|=U_MASK(UGENNORM_STORE_FCD
);
212 gStoreFlags
|=U_MASK(UGENNORM_STORE_AUX
);
215 gStoreFlags
|=U_MASK(UGENNORM_STORE_EXCLUSIONS
);
219 fprintf(stderr
, "ignoring undefined prune flag '%c'\n", c
);
226 * Verify that we can work with properties
227 * but don't call u_init() because that needs unorm.icu which we are just
228 * going to build here.
231 U_STRING_DECL(ideo
, "[:Ideographic:]", 15);
234 U_STRING_INIT(ideo
, "[:Ideographic:]", 15);
235 set
=uset_openPattern(ideo
, -1, &errorCode
);
236 if(U_FAILURE(errorCode
) || !uset_contains(set
, 0xf900)) {
237 fprintf(stderr
, "gennorm is unable to work with properties (uprops.icu): %s\n", u_errorName(errorCode
));
243 /* prepare the filename beginning with the source dir */
244 uprv_strcpy(filename
, srcDir
);
245 basename
=filename
+uprv_strlen(filename
);
246 if(basename
>filename
&& *(basename
-1)!=U_FILE_SEP_CHAR
) {
247 *basename
++=U_FILE_SEP_CHAR
;
253 /* process DerivedNormalizationProps.txt (name changed for Unicode 3.2, to <=31 characters) */
255 uprv_strcpy(basename
, "DerivedNormalizationProps.txt");
257 uprv_strcpy(basename
, "DerivedNormalizationProps");
259 uprv_strcpy(basename
+31, suffix
);
260 uprv_strcat(basename
+31, ".txt");
262 parseDerivedNormalizationProperties(filename
, &errorCode
, FALSE
);
263 if(U_FAILURE(errorCode
)) {
264 /* can be only U_FILE_ACCESS_ERROR - try filename from before Unicode 3.2 */
266 uprv_strcpy(basename
, "DerivedNormalizationProperties.txt");
268 uprv_strcpy(basename
, "DerivedNormalizationProperties");
270 uprv_strcpy(basename
+31, suffix
);
271 uprv_strcat(basename
+31, ".txt");
273 parseDerivedNormalizationProperties(filename
, &errorCode
, TRUE
);
276 /* process UnicodeData.txt */
278 uprv_strcpy(basename
, "UnicodeData.txt");
280 uprv_strcpy(basename
, "UnicodeData");
282 uprv_strcpy(basename
+12, suffix
);
283 uprv_strcat(basename
+12, ".txt");
285 parseDB(filename
, &errorCode
);
287 /* process parsed data */
288 if(U_SUCCESS(errorCode
)) {
291 /* write the properties data file */
292 generateData(destDir
, options
[CSOURCE
].doesOccur
);
302 #if !UCONFIG_NO_NORMALIZATION
304 /* parser for DerivedNormalizationProperties.txt ---------------------------- */
306 static void U_CALLCONV
307 derivedNormalizationPropertiesLineFn(void *context
,
308 char *fields
[][2], int32_t fieldCount
,
309 UErrorCode
*pErrorCode
) {
316 /* get code point range */
317 count
=u_parseCodePointRange(fields
[0][0], &start
, &end
, pErrorCode
);
318 if(U_FAILURE(*pErrorCode
)) {
319 fprintf(stderr
, "gennorm: error parsing DerivedNormalizationProperties.txt mapping at %s\n", fields
[0][0]);
323 /* ignore hangul - handle explicitly */
328 /* get property - ignore unrecognized ones */
329 s
=(char *)u_skipWhitespace(fields
[1][0]);
330 if(*s
=='N' && s
[1]=='F') {
331 /* quick check flag */
339 if(*s
=='C' && s
[1]=='_') {
341 } else if(*s
=='D' && s
[1]=='_') {
348 if(0==uprv_strncmp(s
, "NO", 2)) {
350 } else if(0==uprv_strncmp(s
, "MAYBE", 5)) {
352 } else if(0==uprv_strncmp(s
, "QC", 2) && *(s
=(char *)u_skipWhitespace(s
+2))==';') {
355 * changes single field "NFD_NO" -> two fields "NFD_QC; N" etc.
357 /* start of the field */
358 s
=(char *)u_skipWhitespace(s
+1);
364 return; /* do nothing for "Yes" because it's the default value */
367 return; /* do nothing for "Yes" because it's the default value */
370 /* set this flag for all code points in this range */
372 setQCFlags(start
++, qcFlags
);
374 } else if(0==uprv_memcmp(s
, "Comp_Ex", 7) || 0==uprv_memcmp(s
, "Full_Composition_Exclusion", 26)) {
375 /* full composition exclusion */
377 setCompositionExclusion(start
++);
380 ((0==uprv_memcmp(s
, "FNC", 3) && *(s
=(char *)u_skipWhitespace(s
+3))==';') ||
381 (0==uprv_memcmp(s
, "FC_NFKC", 7) && *(s
=(char *)u_skipWhitespace(s
+7))==';'))
384 /* FC_NFKC_Closure, parse field 2 to get the string */
387 /* start of the field */
388 s
=(char *)u_skipWhitespace(s
+1);
390 /* find the end of the field */
391 for(t
=s
; *t
!=';' && *t
!='#' && *t
!=0 && *t
!='\n' && *t
!='\r'; ++t
) {}
394 string
[0]=(UChar
)u_parseString(s
, string
+1, 31, NULL
, pErrorCode
);
395 if(U_FAILURE(*pErrorCode
)) {
396 fprintf(stderr
, "gennorm error: illegal FNC string at %s\n", fields
[0][0]);
400 setFNC(start
++, string
);
406 parseDerivedNormalizationProperties(const char *filename
, UErrorCode
*pErrorCode
, UBool reportError
) {
409 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)) {
413 u_parseDelimitedFile(filename
, ';', fields
, 2, derivedNormalizationPropertiesLineFn
, NULL
, pErrorCode
);
414 if(U_FAILURE(*pErrorCode
) && (reportError
|| *pErrorCode
!=U_FILE_ACCESS_ERROR
)) {
415 fprintf(stderr
, "gennorm error: u_parseDelimitedFile(\"%s\") failed - %s\n", filename
, u_errorName(*pErrorCode
));
420 /* parser for UnicodeData.txt ----------------------------------------------- */
422 static void U_CALLCONV
423 unicodeDataLineFn(void *context
,
424 char *fields
[][2], int32_t fieldCount
,
425 UErrorCode
*pErrorCode
) {
430 uint32_t code
, value
;
432 UBool isCompat
, something
=FALSE
;
434 /* ignore First and Last entries for ranges */
435 if( *fields
[1][0]=='<' &&
436 (length
=(int32_t)(fields
[1][1]-fields
[1][0]))>=9 &&
437 (0==uprv_memcmp(", First>", fields
[1][1]-8, 8) || 0==uprv_memcmp(", Last>", fields
[1][1]-7, 7))
442 /* reset the properties */
443 uprv_memset(&norm
, 0, sizeof(Norm
));
446 * The combiningIndex must not be initialized to 0 because 0 is the
447 * combiningIndex of the first forward-combining character.
449 norm
.combiningIndex
=0xffff;
451 /* get the character code, field 0 */
452 code
=(uint32_t)uprv_strtoul(fields
[0][0], &end
, 16);
453 if(end
<=fields
[0][0] || end
!=fields
[0][1]) {
454 fprintf(stderr
, "gennorm: syntax error in field 0 at %s\n", fields
[0][0]);
455 *pErrorCode
=U_PARSE_ERROR
;
459 /* get canonical combining class, field 3 */
460 value
=(uint32_t)uprv_strtoul(fields
[3][0], &end
, 10);
461 if(end
<=fields
[3][0] || end
!=fields
[3][1] || value
>0xff) {
462 fprintf(stderr
, "gennorm: syntax error in field 3 at %s\n", fields
[0][0]);
463 *pErrorCode
=U_PARSE_ERROR
;
467 norm
.udataCC
=(uint8_t)value
;
471 /* get the decomposition, field 5 */
472 if(fields
[5][0]<fields
[5][1]) {
473 if(*(s
=fields
[5][0])=='<') {
477 /* skip and ignore the compatibility type name */
479 if(s
==fields
[5][1]) {
481 fprintf(stderr
, "gennorm: syntax error in field 5 at %s\n", fields
[0][0]);
482 *pErrorCode
=U_PARSE_ERROR
;
490 /* parse the decomposition string */
491 length
=u_parseCodePoints(s
, decomp
, sizeof(decomp
)/4, pErrorCode
);
492 if(U_FAILURE(*pErrorCode
)) {
493 fprintf(stderr
, "gennorm error parsing UnicodeData.txt decomposition of U+%04lx - %s\n",
494 (long)code
, u_errorName(*pErrorCode
));
498 /* store the string */
502 norm
.lenNFKD
=(uint8_t)length
;
506 fprintf(stderr
, "gennorm: error - length of NFD(U+%04lx) = %ld >2 in UnicodeData - illegal\n",
507 (long)code
, (long)length
);
508 *pErrorCode
=U_PARSE_ERROR
;
511 norm
.lenNFD
=(uint8_t)length
;
517 /* check for non-character code points */
518 if((code
&0xfffe)==0xfffe || (uint32_t)(code
-0xfdd0)<0x20 || code
>0x10ffff) {
519 fprintf(stderr
, "gennorm: error - properties for non-character code point U+%04lx\n",
521 *pErrorCode
=U_PARSE_ERROR
;
526 /* there are normalization values, so store them */
529 printf("store values for U+%04lx: cc=%d, lenNFD=%ld, lenNFKD=%ld\n",
530 (long)code
, norm
.udataCC
, (long)norm
.lenNFD
, (long)norm
.lenNFKD
);
533 storeNorm(code
, &norm
);
538 parseDB(const char *filename
, UErrorCode
*pErrorCode
) {
541 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)) {
545 u_parseDelimitedFile(filename
, ';', fields
, 15, unicodeDataLineFn
, NULL
, pErrorCode
);
546 if(U_FAILURE(*pErrorCode
)) {
547 fprintf(stderr
, "gennorm error: u_parseDelimitedFile(\"%s\") failed - %s\n", filename
, u_errorName(*pErrorCode
));
552 #endif /* #if !UCONFIG_NO_NORMALIZATION */
555 * Hey, Emacs, please set the following:
558 * indent-tabs-mode: nil