2 *******************************************************************************
4 * Copyright (C) 2001-2004, International Business Machines
5 * Corporation and others. All Rights Reserved.
7 *******************************************************************************
10 * tab size: 8 (not used)
13 * created on: 2001may25
14 * created by: Markus W. Scherer
16 * This program reads the Unicode character database text file,
17 * parses it, and extracts the data for normalization.
18 * It then preprocesses it and writes a binary file for efficient use
19 * in various Unicode text normalization processes.
24 #include "unicode/utypes.h"
25 #include "unicode/uchar.h"
26 #include "unicode/ustring.h"
27 #include "unicode/putil.h"
28 #include "unicode/uclean.h"
29 #include "unicode/udata.h"
30 #include "unicode/uset.h"
43 # pragma warning(disable: 4100)
/*
 * Global switches of the tool.
 * beVerbose starts out FALSE and haveCopyright starts out TRUE;
 * main() later assigns both from options[2] and options[3].
 */
46 UBool beVerbose
=FALSE
, haveCopyright
=TRUE
;
48 /* prototypes --------------------------------------------------------------- */
/*
 * Forward declarations of the two file parsers that main() calls.
 * Both take the file name to read and an in/out UErrorCode pointer;
 * reportError additionally controls whether a file access failure is printed.
 */
51 parseDerivedNormalizationProperties(const char *filename
, UErrorCode
*pErrorCode
, UBool reportError
);
54 parseDB(const char *filename
, UErrorCode
*pErrorCode
);
56 /* -------------------------------------------------------------------------- */
/*
 * Command line option table, read by numeric index in main():
 * [0] and [1] trigger the usage text, [2] sets beVerbose, [3] sets
 * haveCopyright, [4] supplies the destination directory, [5] the source
 * directory and [6] the value passed to setUnicodeVersion().
 * NOTE(review): the entries between the help option and "unicode" are not
 * visible in this view; the index mapping above is taken from how main()
 * uses the table - confirm against the full initializer.
 */
69 static UOption options
[]={
71 UOPTION_HELP_QUESTION_MARK
,
76 { "unicode", NULL
, NULL
, NULL
, 'u', UOPT_REQUIRES_ARG
, 0 },
/*
 * Program entry point.
 *
 * Presets some option values, parses the command line with u_parseArgs()
 * and prints the usage text when parsing fails or options[0]/options[1]
 * (help) occur.
 * With normalization enabled it then checks that property data is usable
 * by opening the "[:Ideographic:]" set and testing U+F900, builds the input
 * file names below srcDir, parses the DerivedNormalizationProps file
 * (retrying with the pre-Unicode-3.2 name on failure) and the UnicodeData
 * file, and calls generateData(destDir) if errorCode still indicates success.
 * With UCONFIG_NO_NORMALIZATION set, a message about writing a dummy file
 * is printed and generateData(destDir) is called directly.
 */
81 main(int argc
, char* argv
[]) {
82 #if !UCONFIG_NO_NORMALIZATION
85 const char *srcDir
=NULL
, *destDir
=NULL
, *suffix
=NULL
;
87 UErrorCode errorCode
=U_ZERO_ERROR
;
89 U_MAIN_INIT_ARGS(argc
, argv
);
91 /* preset then read command line options */
92 options
[4].value
=u_getDataDirectory();
/* preset for options[6]; this value is later handed to setUnicodeVersion() */
94 options
[6].value
="3.0.0";
95 options
[ICUDATADIR
].value
=u_getDataDirectory();
96 argc
=u_parseArgs(argc
, argv
, sizeof(options
)/sizeof(options
[0]), options
);
98 /* error handling, printing usage message */
101 "error in command line argument \"%s\"\n",
104 if(argc
<0 || options
[0].doesOccur
|| options
[1].doesOccur
) {
106 * Broken into chucks because the C89 standard says the minimum
107 * required supported string length is 509 bytes.
110 "Usage: %s [-options] [suffix]\n"
112 "Read the UnicodeData.txt file and other Unicode properties files and\n"
113 "create a binary file " U_ICUDATA_NAME
"_" DATA_NAME
"." DATA_TYPE
" with the normalization data\n"
118 "\t-h or -? or --help this usage text\n"
119 "\t-v or --verbose verbose output\n"
120 "\t-c or --copyright include a copyright notice\n"
121 "\t-u or --unicode Unicode version, followed by the version like 3.0.0\n");
123 "\t-d or --destdir destination directory, followed by the path\n"
124 "\t-s or --sourcedir source directory, followed by the path\n"
125 "\t-i or --icudatadir directory for locating any needed intermediate data files,\n"
126 "\t followed by path, defaults to <%s>\n"
127 "\tsuffix suffix that is to be appended with a '-'\n"
128 "\t to the source file basenames before opening;\n"
129 "\t 'gennorm new' will read UnicodeData-new.txt etc.\n",
130 u_getDataDirectory());
/* an explicit help request is not an error; only a failed argument parse (argc<0) is */
131 return argc
<0 ? U_ILLEGAL_ARGUMENT_ERROR
: U_ZERO_ERROR
;
134 /* get the options values */
135 beVerbose
=options
[2].doesOccur
;
136 haveCopyright
=options
[3].doesOccur
;
137 srcDir
=options
[5].value
;
138 destDir
=options
[4].value
;
146 #if UCONFIG_NO_NORMALIZATION
149 "gennorm writes a dummy " U_ICUDATA_NAME
"_" DATA_NAME
"." DATA_TYPE
150 " because UCONFIG_NO_NORMALIZATION is set, \n"
151 "see icu/source/common/unicode/uconfig.h\n");
152 generateData(destDir
);
156 setUnicodeVersion(options
[6].value
);
158 if (options
[ICUDATADIR
].doesOccur
) {
159 u_setDataDirectory(options
[ICUDATADIR
].value
);
163 * Verify that we can work with properties
164 * but don't call u_init() because that needs unorm.icu which we are just
165 * going to build here.
168 U_STRING_DECL(ideo
, "[:Ideographic:]", 15);
171 U_STRING_INIT(ideo
, "[:Ideographic:]", 15);
172 set
=uset_openPattern(ideo
, -1, &errorCode
);
173 if(U_FAILURE(errorCode
) || !uset_contains(set
, 0xf900)) {
174 fprintf(stderr
, "gennorm is unable to work with properties (uprops.icu): %s\n", u_errorName(errorCode
));
180 /* prepare the filename beginning with the source dir */
181 uprv_strcpy(filename
, srcDir
);
182 basename
=filename
+uprv_strlen(filename
);
183 if(basename
>filename
&& *(basename
-1)!=U_FILE_SEP_CHAR
) {
184 *basename
++=U_FILE_SEP_CHAR
;
190 /* process DerivedNormalizationProps.txt (name changed for Unicode 3.2, to <=31 characters) */
192 uprv_strcpy(basename
, "DerivedNormalizationProps.txt");
/*
 * NOTE(review): "DerivedNormalizationProps" has 25 characters, so (assuming
 * uprv_strcpy() behaves like strcpy()) its terminating NUL sits at
 * basename[25]. Text written at basename+31 lies behind that NUL and would
 * not become part of the file name. Offset 31 matches the 30-character
 * "DerivedNormalizationProperties" plus one separator character, as used
 * in the fallback below - confirm whether offset 26 is intended here.
 */
194 uprv_strcpy(basename
, "DerivedNormalizationProps");
196 uprv_strcpy(basename
+31, suffix
);
197 uprv_strcat(basename
+31, ".txt");
199 parseDerivedNormalizationProperties(filename
, &errorCode
, FALSE
);
200 if(U_FAILURE(errorCode
)) {
201 /* can be only U_FILE_ACCESS_ERROR - try filename from before Unicode 3.2 */
203 uprv_strcpy(basename
, "DerivedNormalizationProperties.txt");
205 uprv_strcpy(basename
, "DerivedNormalizationProperties");
207 uprv_strcpy(basename
+31, suffix
);
208 uprv_strcat(basename
+31, ".txt");
210 parseDerivedNormalizationProperties(filename
, &errorCode
, TRUE
);
213 /* process UnicodeData.txt */
215 uprv_strcpy(basename
, "UnicodeData.txt");
217 uprv_strcpy(basename
, "UnicodeData");
/* offset 12 = the 11 characters of "UnicodeData" plus one separator character */
219 uprv_strcpy(basename
+12, suffix
);
220 uprv_strcat(basename
+12, ".txt");
222 parseDB(filename
, &errorCode
);
224 /* process parsed data */
225 if(U_SUCCESS(errorCode
)) {
228 /* write the properties data file */
229 generateData(destDir
);
239 #if !UCONFIG_NO_NORMALIZATION
241 /* parser for DerivedNormalizationProperties.txt ---------------------------- */
/*
 * Line callback handed to u_parseDelimitedFile() for the
 * DerivedNormalizationProp(ertie)s file.
 *
 * Field 0 is parsed as a code point range; on failure an error is printed.
 * Field 1 (after skipping white space) selects the action:
 *  - a name starting with "NF": a quick check flag, applied with setQCFlags()
 *    to the code points of the range; "Yes" values are skipped because that
 *    is the default
 *  - "Comp_Ex" or "Full_Composition_Exclusion": setCompositionExclusion()
 *  - "FNC" or "FC_NFKC" followed by ';': the following string is parsed with
 *    u_parseString() and stored with setFNC()
 * Any other property name is ignored.
 */
243 static void U_CALLCONV
244 derivedNormalizationPropertiesLineFn(void *context
,
245 char *fields
[][2], int32_t fieldCount
,
246 UErrorCode
*pErrorCode
) {
253 /* get code point range */
254 count
=u_parseCodePointRange(fields
[0][0], &start
, &end
, pErrorCode
);
255 if(U_FAILURE(*pErrorCode
)) {
256 fprintf(stderr
, "gennorm: error parsing DerivedNormalizationProperties.txt mapping at %s\n", fields
[0][0]);
260 /* ignore hangul - handle explicitly */
265 /* get property - ignore unrecognized ones */
266 s
=(char *)u_skipWhitespace(fields
[1][0]);
267 if(*s
=='N' && s
[1]=='F') {
268 /* quick check flag */
276 if(*s
=='C' && s
[1]=='_') {
278 } else if(*s
=='D' && s
[1]=='_') {
285 if(0==uprv_strncmp(s
, "NO", 2)) {
287 } else if(0==uprv_strncmp(s
, "MAYBE", 5)) {
/* the assignment inside the condition moves s to the ';' that ends the "QC" name */
289 } else if(0==uprv_strncmp(s
, "QC", 2) && *(s
=(char *)u_skipWhitespace(s
+2))==';') {
292 * changes single field "NFD_NO" -> two fields "NFD_QC; N" etc.
294 /* start of the field */
295 s
=(char *)u_skipWhitespace(s
+1);
301 return; /* do nothing for "Yes" because it's the default value */
304 return; /* do nothing for "Yes" because it's the default value */
307 /* set this flag for all code points in this range */
309 setQCFlags(start
++, qcFlags
);
311 } else if(0==uprv_memcmp(s
, "Comp_Ex", 7) || 0==uprv_memcmp(s
, "Full_Composition_Exclusion", 26)) {
312 /* full composition exclusion */
314 setCompositionExclusion(start
++);
/* as above, s is advanced to the ';' after the property name as part of the test */
317 ((0==uprv_memcmp(s
, "FNC", 3) && *(s
=(char *)u_skipWhitespace(s
+3))==';') ||
318 (0==uprv_memcmp(s
, "FC_NFKC", 7) && *(s
=(char *)u_skipWhitespace(s
+7))==';'))
321 /* FC_NFKC_Closure, parse field 2 to get the string */
324 /* start of the field */
325 s
=(char *)u_skipWhitespace(s
+1);
327 /* find the end of the field */
328 for(t
=s
; *t
!=';' && *t
!='#' && *t
!=0 && *t
!='\n' && *t
!='\r'; ++t
) {}
/*
 * string[0] receives the return value of u_parseString() (presumably the
 * string length - confirm); the parsed text itself goes to string+1 with a
 * capacity of 31 units
 */
331 string
[0]=(UChar
)u_parseString(s
, string
+1, 31, NULL
, pErrorCode
);
332 if(U_FAILURE(*pErrorCode
)) {
333 fprintf(stderr
, "gennorm error: illegal FNC string at %s\n", fields
[0][0]);
337 setFNC(start
++, string
);
/*
 * Parse a DerivedNormalizationProp(ertie)s file.
 *
 * Runs u_parseDelimitedFile() on filename with ';' as the delimiter, 2 fields
 * per line and derivedNormalizationPropertiesLineFn as the line callback.
 *
 * filename    path of the file to read
 * pErrorCode  in/out error code; checked for NULL or a prior failure first
 * reportError if FALSE, a U_FILE_ACCESS_ERROR is not printed, which lets the
 *             caller retry with another file name; every other failure is
 *             always printed to stderr
 */
343 parseDerivedNormalizationProperties(const char *filename
, UErrorCode
*pErrorCode
, UBool reportError
) {
346 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)) {
350 u_parseDelimitedFile(filename
, ';', fields
, 2, derivedNormalizationPropertiesLineFn
, NULL
, pErrorCode
);
351 if(U_FAILURE(*pErrorCode
) && (reportError
|| *pErrorCode
!=U_FILE_ACCESS_ERROR
)) {
352 fprintf(stderr
, "gennorm error: u_parseDelimitedFile(\"%s\") failed - %s\n", filename
, u_errorName(*pErrorCode
));
357 /* parser for UnicodeData.txt ----------------------------------------------- */
/*
 * Line callback handed to u_parseDelimitedFile() for the UnicodeData file.
 *
 * For each line it
 *  - skips entries whose field 1 is a "<..., First>" or "<..., Last>" range marker
 *  - parses field 0 as the hexadecimal code point
 *  - parses field 3 as the decimal canonical combining class (must be <=0xff)
 *  - parses field 5, if not empty, as the decomposition; a leading '<'
 *    introduces a compatibility type name that is skipped
 *  - reports noncharacter code points and values above 0x10ffff as errors
 *  - hands the collected values to storeNorm()
 * Syntax problems are printed to stderr and set *pErrorCode to U_PARSE_ERROR.
 */
359 static void U_CALLCONV
360 unicodeDataLineFn(void *context
,
361 char *fields
[][2], int32_t fieldCount
,
362 UErrorCode
*pErrorCode
) {
367 uint32_t code
, value
;
369 UBool isCompat
, something
=FALSE
;
371 /* ignore First and Last entries for ranges */
/* fields[n][0] and fields[n][1] are used as start and limit pointers of field n */
372 if( *fields
[1][0]=='<' &&
373 (length
=(int32_t)(fields
[1][1]-fields
[1][0]))>=9 &&
374 (0==uprv_memcmp(", First>", fields
[1][1]-8, 8) || 0==uprv_memcmp(", Last>", fields
[1][1]-7, 7))
379 /* reset the properties */
380 uprv_memset(&norm
, 0, sizeof(Norm
));
382 /* get the character code, field 0 */
383 code
=(uint32_t)uprv_strtoul(fields
[0][0], &end
, 16);
/* the number must be non-empty and must fill the whole field */
384 if(end
<=fields
[0][0] || end
!=fields
[0][1]) {
385 fprintf(stderr
, "gennorm: syntax error in field 0 at %s\n", fields
[0][0]);
386 *pErrorCode
=U_PARSE_ERROR
;
390 /* get canonical combining class, field 3 */
391 value
=(uint32_t)uprv_strtoul(fields
[3][0], &end
, 10);
392 if(end
<=fields
[3][0] || end
!=fields
[3][1] || value
>0xff) {
393 fprintf(stderr
, "gennorm: syntax error in field 3 at %s\n", fields
[0][0]);
394 *pErrorCode
=U_PARSE_ERROR
;
398 norm
.udataCC
=(uint8_t)value
;
402 /* get the decomposition, field 5 */
403 if(fields
[5][0]<fields
[5][1]) {
404 if(*(s
=fields
[5][0])=='<') {
408 /* skip and ignore the compatibility type name */
410 if(s
==fields
[5][1]) {
412 fprintf(stderr
, "gennorm: syntax error in field 5 at %s\n", fields
[0][0]);
413 *pErrorCode
=U_PARSE_ERROR
;
421 /* parse the decomposition string */
/* capacity is sizeof(decomp)/4 - presumably decomp holds 4-byte code points; confirm against its declaration */
422 length
=u_parseCodePoints(s
, decomp
, sizeof(decomp
)/4, pErrorCode
);
423 if(U_FAILURE(*pErrorCode
)) {
424 fprintf(stderr
, "gennorm error parsing UnicodeData.txt decomposition of U+%04lx - %s\n",
425 (long)code
, u_errorName(*pErrorCode
));
429 /* store the string */
433 norm
.lenNFKD
=(uint8_t)length
;
437 fprintf(stderr
, "gennorm: error - length of NFD(U+%04lx) = %ld >2 in UnicodeData - illegal\n",
438 (long)code
, (long)length
);
439 *pErrorCode
=U_PARSE_ERROR
;
442 norm
.lenNFD
=(uint8_t)length
;
448 /* check for non-character code points */
/* matches xxFFFE/xxFFFF in any plane, U+FDD0..U+FDEF, and anything above U+10FFFF */
449 if((code
&0xfffe)==0xfffe || (uint32_t)(code
-0xfdd0)<0x20 || code
>0x10ffff) {
450 fprintf(stderr
, "gennorm: error - properties for non-character code point U+%04lx\n",
452 *pErrorCode
=U_PARSE_ERROR
;
457 /* there are normalization values, so store them */
460 printf("store values for U+%04lx: cc=%d, lenNFD=%ld, lenNFKD=%ld\n",
461 (long)code
, norm
.udataCC
, (long)norm
.lenNFD
, (long)norm
.lenNFKD
);
464 storeNorm(code
, &norm
);
/*
 * Parse the UnicodeData file.
 *
 * Runs u_parseDelimitedFile() on filename with ';' as the delimiter, 15 fields
 * per line and unicodeDataLineFn as the line callback.
 *
 * filename    path of the file to read
 * pErrorCode  in/out error code; checked for NULL or a prior failure first.
 *             Any failure of the parse is printed to stderr together with
 *             the file name and the error name.
 */
469 parseDB(const char *filename
, UErrorCode
*pErrorCode
) {
472 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)) {
476 u_parseDelimitedFile(filename
, ';', fields
, 15, unicodeDataLineFn
, NULL
, pErrorCode
);
477 if(U_FAILURE(*pErrorCode
)) {
478 fprintf(stderr
, "gennorm error: u_parseDelimitedFile(\"%s\") failed - %s\n", filename
, u_errorName(*pErrorCode
));
483 #endif /* #if !UCONFIG_NO_NORMALIZATION */
486 * Hey, Emacs, please set the following:
489 * indent-tabs-mode: nil