[apple/icu.git] / icuSources / tools / gennorm / gennorm.c

/*
*******************************************************************************
*
*   Copyright (C) 2001-2003, International Business Machines
*   Corporation and others.  All Rights Reserved.
*
*******************************************************************************
*   file name:  gennorm.c
*   encoding:   US-ASCII
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 2001may25
*   created by: Markus W. Scherer
*
*   This program reads the Unicode character database text file,
*   parses it, and extracts the data for normalization.
*   It then preprocesses it and writes a binary file for efficient use
*   in various Unicode text normalization processes.
*/

#include <stdio.h>
#include <stdlib.h>
#include "unicode/utypes.h"
#include "unicode/uchar.h"
#include "unicode/putil.h"
#include "cmemory.h"
#include "cstring.h"
#include "unicode/udata.h"
#include "unewdata.h"
#include "uoptions.h"
#include "uparse.h"
#include "unormimp.h"

U_CDECL_BEGIN
#include "gennorm.h"
U_CDECL_END

#ifdef WIN32
#   pragma warning(disable: 4100)
#endif

/*
 * Global flags; main() assigns both from the command line
 * (--verbose and --copyright) before any data is processed.
 * NOTE(review): they have external linkage, so they are presumably read by
 * other gennorm source files - confirm against gennorm.h.
 */
UBool beVerbose=FALSE, haveCopyright=TRUE;

/* prototypes --------------------------------------------------------------- */

/*
 * The parsers are defined and called only when normalization is enabled;
 * guard the prototypes the same way so that a UCONFIG_NO_NORMALIZATION build
 * does not see static functions that are declared but never defined.
 */
#if !UCONFIG_NO_NORMALIZATION

/* Parse DerivedNormalizationProps.txt (or its pre-Unicode 3.2 name). */
static void
parseDerivedNormalizationProperties(const char *filename, UErrorCode *pErrorCode, UBool reportError);

/* Parse UnicodeData.txt. */
static void
parseDB(const char *filename, UErrorCode *pErrorCode);

#endif

/* -------------------------------------------------------------------------- */

/*
 * Command line options.
 * main() addresses the entries by numeric index (options[0]..options[6]),
 * so the order of this table must stay in sync with those uses.
 */
static UOption options[]={
    UOPTION_HELP_H,                 /* 0: -h */
    UOPTION_HELP_QUESTION_MARK,     /* 1: -? */
    UOPTION_VERBOSE,                /* 2: sets beVerbose */
    UOPTION_COPYRIGHT,              /* 3: sets haveCopyright */
    UOPTION_DESTDIR,                /* 4: destination directory */
    UOPTION_SOURCEDIR,              /* 5: source directory */
    { "unicode", NULL, NULL, NULL, 'u', UOPT_REQUIRES_ARG, 0 }  /* 6: Unicode version string */
};

extern int
main(int argc, char* argv[]) {
#if !UCONFIG_NO_NORMALIZATION
    char filename[300];
#endif
    const char *srcDir=NULL, *destDir=NULL, *suffix=NULL;
    char *basename=NULL;
    UErrorCode errorCode=U_ZERO_ERROR;

    U_MAIN_INIT_ARGS(argc, argv);

    /* preset then read command line options */
    options[4].value=u_getDataDirectory();
    options[5].value="";
    options[6].value="3.0.0";
    argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), options);

    /* error handling, printing usage message */
    if(argc<0) {
        fprintf(stderr,
            "error in command line argument \"%s\"\n",
            argv[-argc]);
    }
    if(argc<0 || options[0].doesOccur || options[1].doesOccur) {
        /*
         * Broken into chucks because the C89 standard says the minimum
         * required supported string length is 509 bytes.
         */
        fprintf(stderr,
            "Usage: %s [-options] [suffix]\n"
            "\n"
            "Read the UnicodeData.txt file and other Unicode properties files and\n"
            "create a binary file " U_ICUDATA_NAME "_" DATA_NAME "." DATA_TYPE " with the normalization data\n"
            "\n",
            argv[0]);
        fprintf(stderr,
            "Options:\n"
            "\t-h or -? or --help  this usage text\n"
            "\t-v or --verbose     verbose output\n"
            "\t-c or --copyright   include a copyright notice\n"
            "\t-u or --unicode     Unicode version, followed by the version like 3.0.0\n");
        fprintf(stderr,
            "\t-d or --destdir     destination directory, followed by the path\n"
            "\t-s or --sourcedir   source directory, followed by the path\n"
            "\tsuffix              suffix that is to be appended with a '-'\n"
            "\t                    to the source file basenames before opening;\n"
            "\t                    'gennorm new' will read UnicodeData-new.txt etc.\n");
        return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR;
    }

    /* get the options values */
    beVerbose=options[2].doesOccur;
    haveCopyright=options[3].doesOccur;
    srcDir=options[5].value;
    destDir=options[4].value;

    if(argc>=2) {
        suffix=argv[1];
    } else {
        suffix=NULL;
    }

#if UCONFIG_NO_NORMALIZATION

    fprintf(stderr,
        "gennorm writes a dummy " U_ICUDATA_NAME "_" DATA_NAME "." DATA_TYPE
        " because UCONFIG_NO_NORMALIZATION is set, \n"
        "see icu/source/common/unicode/uconfig.h\n");
    generateData(destDir);

#else

    setUnicodeVersion(options[6].value);

    /* prepare the filename beginning with the source dir */
    uprv_strcpy(filename, srcDir);
    basename=filename+uprv_strlen(filename);
    if(basename>filename && *(basename-1)!=U_FILE_SEP_CHAR) {
        *basename++=U_FILE_SEP_CHAR;
    }

    /* initialize */
    init();

    /* process DerivedNormalizationProps.txt (name changed for Unicode 3.2, to <=31 characters) */
    if(suffix==NULL) {
        uprv_strcpy(basename, "DerivedNormalizationProps.txt");
    } else {
        uprv_strcpy(basename, "DerivedNormalizationProps");
        basename[30]='-';
        uprv_strcpy(basename+31, suffix);
        uprv_strcat(basename+31, ".txt");
    }
    parseDerivedNormalizationProperties(filename, &errorCode, FALSE);
    if(U_FAILURE(errorCode)) {
        /* can be only U_FILE_ACCESS_ERROR - try filename from before Unicode 3.2 */
        if(suffix==NULL) {
            uprv_strcpy(basename, "DerivedNormalizationProperties.txt");
        } else {
            uprv_strcpy(basename, "DerivedNormalizationProperties");
            basename[30]='-';
            uprv_strcpy(basename+31, suffix);
            uprv_strcat(basename+31, ".txt");
        }
        parseDerivedNormalizationProperties(filename, &errorCode, TRUE);
    }

    /* process UnicodeData.txt */
    if(suffix==NULL) {
        uprv_strcpy(basename, "UnicodeData.txt");
    } else {
        uprv_strcpy(basename, "UnicodeData");
        basename[11]='-';
        uprv_strcpy(basename+12, suffix);
        uprv_strcat(basename+12, ".txt");
    }
    parseDB(filename, &errorCode);

    /* process parsed data */
    if(U_SUCCESS(errorCode)) {
        processData();

        /* write the properties data file */
        generateData(destDir);

        cleanUpData();
    }

#endif

    return errorCode;
}

#if !UCONFIG_NO_NORMALIZATION

/* parser for DerivedNormalizationProperties.txt ---------------------------- */

/*
 * Line callback passed to u_parseDelimitedFile() for
 * DerivedNormalizationProps.txt.
 *
 * fields[0] holds a code point or code point range, fields[1] the property.
 * Recognized properties:
 *   NF[K]{C,D}_{NO,MAYBE}                 -> setQCFlags() for each code point
 *   Comp_Ex / Full_Composition_Exclusion  -> setCompositionExclusion()
 *   FNC; <string>                         -> setFNC()
 * Lines with any other property, and the range starting at U+AC00,
 * are skipped. Parse errors print a message and terminate the program.
 */
static void U_CALLCONV
derivedNormalizationPropertiesLineFn(void *context,
                                     char *fields[][2], int32_t fieldCount,
                                     UErrorCode *pErrorCode) {
    UChar string[32];
    char *s;
    uint32_t start, end;
    uint8_t qcFlags;

    /* part of the callback signature but not needed here */
    (void)context;
    (void)fieldCount;

    /* get code point range; only start, end and the error code are used */
    u_parseCodePointRange(fields[0][0], &start, &end, pErrorCode);
    if(U_FAILURE(*pErrorCode)) {
        fprintf(stderr, "gennorm: error parsing DerivedNormalizationProperties.txt mapping at %s\n", fields[0][0]);
        exit(*pErrorCode);
    }

    /* ignore hangul - handle explicitly */
    if(start==0xac00) {
        return;
    }

    /* get property - ignore unrecognized ones */
    s=(char *)u_skipWhitespace(fields[1][0]);
    if(*s=='N' && s[1]=='F') {
        /*
         * quick check flag:
         * start with 0x11, shift left by 1 for the 'K' forms and by 2 for
         * the 'D' forms, then keep the low nibble for "NO" or the 0x30 bits
         * for "MAYBE"
         */
        qcFlags=0x11;
        s+=2;
        if(*s=='K') {
            qcFlags<<=1;
            ++s;
        }

        if(*s=='C' && s[1]=='_') {
            s+=2;
        } else if(*s=='D' && s[1]=='_') {
            qcFlags<<=2;
            s+=2;
        } else {
            return;
        }

        if(0==uprv_memcmp(s, "NO", 2)) {
            qcFlags&=0xf;
        } else if(0==uprv_memcmp(s, "MAYBE", 5)) {
            qcFlags&=0x30;
        } else {
            return;
        }

        /* set this flag for all code points in this range */
        while(start<=end) {
            setQCFlags(start++, qcFlags);
        }
    } else if(0==uprv_memcmp(s, "Comp_Ex", 7) || 0==uprv_memcmp(s, "Full_Composition_Exclusion", 26)) {
        /* full composition exclusion */
        while(start<=end) {
            setCompositionExclusion(start++);
        }
    } else if(0==uprv_memcmp(s, "FNC", 3) && *(s=(char *)u_skipWhitespace(s+3))==';') {
        /*
         * FC_NFKC_Closure: the string is in a third field which the caller
         * did not split off (only 2 fields are requested), so find it here
         */
        char *t;

        /* start of the field */
        s=(char *)u_skipWhitespace(s+1);

        /* find the end of the field and terminate it in place */
        for(t=s; *t!=';' && *t!='#' && *t!=0 && *t!='\n' && *t!='\r'; ++t) {}
        *t=0;

        /*
         * the parsed string goes to string[1..]; string[0] receives the
         * return value of u_parseString() (presumably the string length)
         */
        string[0]=(UChar)u_parseString(s, string+1, 31, NULL, pErrorCode);
        if(U_FAILURE(*pErrorCode)) {
            fprintf(stderr, "gennorm error: illegal FNC string at %s\n", fields[0][0]);
            exit(*pErrorCode);
        }
        while(start<=end) {
            setFNC(start++, string);
        }
    }
}

/*
 * Run the DerivedNormalizationProps line parser over the given file.
 *
 * Does nothing if pErrorCode is NULL or already holds a failure.
 * Any failure prints a message and terminates the program, except that
 * U_FILE_ACCESS_ERROR is returned silently in *pErrorCode when reportError
 * is FALSE, which lets the caller react to a missing file.
 */
static void
parseDerivedNormalizationProperties(const char *filename, UErrorCode *pErrorCode, UBool reportError) {
    char *fields[2][2];

    if(pErrorCode!=NULL && !U_FAILURE(*pErrorCode)) {
        u_parseDelimitedFile(filename, ';', fields, 2, derivedNormalizationPropertiesLineFn, NULL, pErrorCode);

        if(!U_FAILURE(*pErrorCode)) {
            /* parsed successfully */
            return;
        }
        if(!reportError && *pErrorCode==U_FILE_ACCESS_ERROR) {
            /* tolerated: the file could not be opened, leave it to the caller */
            return;
        }

        fprintf(stderr, "gennorm error: u_parseDelimitedFile(\"%s\") failed - %s\n", filename, u_errorName(*pErrorCode));
        exit(*pErrorCode);
    }
}

/* parser for UnicodeData.txt ----------------------------------------------- */

/*
 * Line callback passed to u_parseDelimitedFile() for UnicodeData.txt.
 *
 * Reads the code point (field 0, hexadecimal), the canonical combining
 * class (field 3, decimal, at most 0xff) and the decomposition (field 5,
 * optionally preceded by a "<type>" tag that marks it as a compatibility
 * decomposition). If the line carries a non-zero combining class or a
 * decomposition, the values are handed to storeNorm().
 * Range boundary lines ("<..., First>" / "<..., Last>") are skipped.
 * Every syntax error prints a message and terminates the program.
 */
static void U_CALLCONV
unicodeDataLineFn(void *context,
                  char *fields[][2], int32_t fieldCount,
                  UErrorCode *pErrorCode) {
    uint32_t mapping[40];
    Norm norm;
    const char *p;
    char *stop;
    uint32_t cp, ccc;
    int32_t count;
    UBool compatibility, haveData=FALSE;

    /* range boundary lines carry no data of their own */
    if(*fields[1][0]=='<') {
        count=(int32_t)(fields[1][1]-fields[1][0]);
        if(count>=9) {
            if(0==uprv_memcmp(", First>", fields[1][1]-8, 8)) {
                return;
            }
            if(0==uprv_memcmp(", Last>", fields[1][1]-7, 7)) {
                return;
            }
        }
    }

    /* start from an all-zero record */
    uprv_memset(&norm, 0, sizeof(Norm));

    /* field 0: the code point; the number must fill the whole field */
    cp=(uint32_t)uprv_strtoul(fields[0][0], &stop, 16);
    if(stop<=fields[0][0] || stop!=fields[0][1]) {
        fprintf(stderr, "gennorm: syntax error in field 0 at %s\n", fields[0][0]);
        *pErrorCode=U_PARSE_ERROR;
        exit(U_PARSE_ERROR);
    }

    /* field 3: the canonical combining class */
    ccc=(uint32_t)uprv_strtoul(fields[3][0], &stop, 10);
    if(stop<=fields[3][0] || stop!=fields[3][1] || ccc>0xff) {
        fprintf(stderr, "gennorm: syntax error in field 3 at %s\n", fields[0][0]);
        *pErrorCode=U_PARSE_ERROR;
        exit(U_PARSE_ERROR);
    }
    if(ccc>0) {
        norm.udataCC=(uint8_t)ccc;
        haveData=TRUE;
    }

    /* field 5: the decomposition, if the field is not empty */
    if(fields[5][0]<fields[5][1]) {
        p=fields[5][0];
        compatibility=(UBool)(*p=='<');
        if(compatibility) {
            /* step over the "<type>" tag, its name is not used */
            ++p;
            for(;;) {
                if(p==fields[5][1]) {
                    /* the field ended before a '>' was seen */
                    fprintf(stderr, "gennorm: syntax error in field 5 at %s\n", fields[0][0]);
                    *pErrorCode=U_PARSE_ERROR;
                    exit(U_PARSE_ERROR);
                }
                if(*p++=='>') {
                    break;
                }
            }
        }

        /* turn the rest of the field into code points */
        count=u_parseCodePoints(p, mapping, sizeof(mapping)/sizeof(mapping[0]), pErrorCode);
        if(U_FAILURE(*pErrorCode)) {
            fprintf(stderr, "gennorm error parsing UnicodeData.txt decomposition of U+%04lx - %s\n",
                    (long)cp, u_errorName(*pErrorCode));
            exit(*pErrorCode);
        }

        /* remember the mapping; norm points into the local array until storeNorm() below */
        if(count>0) {
            haveData=TRUE;
            if(!compatibility) {
                /* a canonical decomposition is limited to 2 code points */
                if(count>2) {
                    fprintf(stderr, "gennorm: error - length of NFD(U+%04lx) = %ld >2 in UnicodeData - illegal\n",
                            (long)cp, (long)count);
                    *pErrorCode=U_PARSE_ERROR;
                    exit(U_PARSE_ERROR);
                }
                norm.lenNFD=(uint8_t)count;
                norm.nfd=mapping;
            } else {
                norm.lenNFKD=(uint8_t)count;
                norm.nfkd=mapping;
            }
        }
    }

    /* reject lines for non-character and out-of-range code points */
    if(cp>0x10ffff || (cp&0xfffe)==0xfffe || (uint32_t)(cp-0xfdd0)<0x20) {
        fprintf(stderr, "gennorm: error - properties for non-character code point U+%04lx\n",
                (long)cp);
        *pErrorCode=U_PARSE_ERROR;
        exit(U_PARSE_ERROR);
    }

    if(!haveData) {
        /* nothing relevant for normalization on this line */
        return;
    }

#if 0
    if(beVerbose) {
        printf("store values for U+%04lx: cc=%d, lenNFD=%ld, lenNFKD=%ld\n",
               (long)cp, norm.udataCC, (long)norm.lenNFD, (long)norm.lenNFKD);
    }
#endif
    storeNorm(cp, &norm);
}

/*
 * Run the UnicodeData.txt line parser over the given file,
 * splitting each line into 15 ';'-delimited fields.
 *
 * Does nothing if pErrorCode is NULL or already holds a failure.
 * Any failure of the parse prints a message and terminates the program.
 */
static void
parseDB(const char *filename, UErrorCode *pErrorCode) {
    char *fields[15][2];

    if(pErrorCode!=NULL && !U_FAILURE(*pErrorCode)) {
        u_parseDelimitedFile(filename, ';', fields, 15, unicodeDataLineFn, NULL, pErrorCode);
        if(!U_FAILURE(*pErrorCode)) {
            return;
        }

        fprintf(stderr, "gennorm error: u_parseDelimitedFile(\"%s\") failed - %s\n", filename, u_errorName(*pErrorCode));
        exit(*pErrorCode);
    }
}

#endif /* #if !UCONFIG_NO_NORMALIZATION */

/*
 * Hey, Emacs, please set the following:
 *
 * Local Variables:
 * indent-tabs-mode: nil
 * End:
 *
 */
Commit	Line	Data
b75a7d8f A	1	/*
	2	*******************************************************************************
	3	*
	4	* Copyright (C) 2001-2003, International Business Machines
	5	* Corporation and others. All Rights Reserved.
	6	*
	7	*******************************************************************************
	8	* file name: gennorm.c
	9	* encoding: US-ASCII
	10	* tab size: 8 (not used)
	11	* indentation:4
	12	*
	13	* created on: 2001may25
	14	* created by: Markus W. Scherer
	15	*
	16	* This program reads the Unicode character database text file,
	17	* parses it, and extracts the data for normalization.
	18	* It then preprocesses it and writes a binary file for efficient use
	19	* in various Unicode text normalization processes.
	20	*/
	21
	22	#include <stdio.h>
	23	#include <stdlib.h>
	24	#include "unicode/utypes.h"
	25	#include "unicode/uchar.h"
	26	#include "unicode/putil.h"
	27	#include "cmemory.h"
	28	#include "cstring.h"
	29	#include "unicode/udata.h"
	30	#include "unewdata.h"
	31	#include "uoptions.h"
	32	#include "uparse.h"
	33	#include "unormimp.h"
	34
	35	U_CDECL_BEGIN
	36	#include "gennorm.h"
	37	U_CDECL_END
	38
	39	#ifdef WIN32
	40	# pragma warning(disable: 4100)
	41	#endif
	42
	43	UBool beVerbose=FALSE, haveCopyright=TRUE;
	44
	45	/* prototypes --------------------------------------------------------------- */
	46
	47	static void
	48	parseDerivedNormalizationProperties(const char filename, UErrorCode pErrorCode, UBool reportError);
	49
	50	static void
	51	parseDB(const char filename, UErrorCode pErrorCode);
	52
	53	/* -------------------------------------------------------------------------- */
	54
	55	static UOption options[]={
	56	UOPTION_HELP_H,
	57	UOPTION_HELP_QUESTION_MARK,
	58	UOPTION_VERBOSE,
	59	UOPTION_COPYRIGHT,
	60	UOPTION_DESTDIR,
	61	UOPTION_SOURCEDIR,
	62	{ "unicode", NULL, NULL, NULL, 'u', UOPT_REQUIRES_ARG, 0 }
	63	};
	64
65	extern int
66	main(int argc, char* argv[]) {
67	#if !UCONFIG_NO_NORMALIZATION
68	char filename[300];
69	#endif
70	const char srcDir=NULL, destDir=NULL, *suffix=NULL;
71	char *basename=NULL;
72	UErrorCode errorCode=U_ZERO_ERROR;
73
74	U_MAIN_INIT_ARGS(argc, argv);
75
76	/* preset then read command line options */
77	options[4].value=u_getDataDirectory();
78	options[5].value="";
79	options[6].value="3.0.0";
80	argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), options);
81
82	/* error handling, printing usage message */
83	if(argc<0) {
84	fprintf(stderr,
85	"error in command line argument \"%s\"\n",
86	argv[-argc]);
87	}
88	if(argc<0 \|\| options[0].doesOccur \|\| options[1].doesOccur) {
89	/*
90	* Broken into chucks because the C89 standard says the minimum
91	* required supported string length is 509 bytes.
92	*/
93	fprintf(stderr,
94	"Usage: %s [-options] [suffix]\n"
95	"\n"
96	"Read the UnicodeData.txt file and other Unicode properties files and\n"
97	"create a binary file " U_ICUDATA_NAME "_" DATA_NAME "." DATA_TYPE " with the normalization data\n"
98	"\n",
99	argv[0]);
100	fprintf(stderr,
101	"Options:\n"
102	"\t-h or -? or --help this usage text\n"
103	"\t-v or --verbose verbose output\n"
104	"\t-c or --copyright include a copyright notice\n"
105	"\t-u or --unicode Unicode version, followed by the version like 3.0.0\n");
106	fprintf(stderr,
107	"\t-d or --destdir destination directory, followed by the path\n"
108	"\t-s or --sourcedir source directory, followed by the path\n"
109	"\tsuffix suffix that is to be appended with a '-'\n"
110	"\t to the source file basenames before opening;\n"
111	"\t 'gennorm new' will read UnicodeData-new.txt etc.\n");
112	return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR;
113	}
114
115	/* get the options values */
116	beVerbose=options[2].doesOccur;
117	haveCopyright=options[3].doesOccur;
118	srcDir=options[5].value;
119	destDir=options[4].value;
120
121	if(argc>=2) {
122	suffix=argv[1];
123	} else {
124	suffix=NULL;
125	}
126
127	#if UCONFIG_NO_NORMALIZATION
128
129	fprintf(stderr,
130	"gennorm writes a dummy " U_ICUDATA_NAME "_" DATA_NAME "." DATA_TYPE
131	" because UCONFIG_NO_NORMALIZATION is set, \n"
132	"see icu/source/common/unicode/uconfig.h\n");
133	generateData(destDir);
134
135	#else
136
137	setUnicodeVersion(options[6].value);
138
139	/* prepare the filename beginning with the source dir */
140	uprv_strcpy(filename, srcDir);
141	basename=filename+uprv_strlen(filename);
142	if(basename>filename && *(basename-1)!=U_FILE_SEP_CHAR) {
143	*basename++=U_FILE_SEP_CHAR;
144	}
145
146	/* initialize */
147	init();
148
149	/* process DerivedNormalizationProps.txt (name changed for Unicode 3.2, to <=31 characters) */
150	if(suffix==NULL) {
151	uprv_strcpy(basename, "DerivedNormalizationProps.txt");
152	} else {
153	uprv_strcpy(basename, "DerivedNormalizationProps");
154	basename[30]='-';
155	uprv_strcpy(basename+31, suffix);
156	uprv_strcat(basename+31, ".txt");
157	}
158	parseDerivedNormalizationProperties(filename, &errorCode, FALSE);
159	if(U_FAILURE(errorCode)) {
160	/* can be only U_FILE_ACCESS_ERROR - try filename from before Unicode 3.2 */
161	if(suffix==NULL) {
162	uprv_strcpy(basename, "DerivedNormalizationProperties.txt");
163	} else {
164	uprv_strcpy(basename, "DerivedNormalizationProperties");
165	basename[30]='-';
166	uprv_strcpy(basename+31, suffix);
167	uprv_strcat(basename+31, ".txt");
168	}
169	parseDerivedNormalizationProperties(filename, &errorCode, TRUE);
170	}
171
172	/* process UnicodeData.txt */
173	if(suffix==NULL) {
174	uprv_strcpy(basename, "UnicodeData.txt");
175	} else {
176	uprv_strcpy(basename, "UnicodeData");
177	basename[11]='-';
178	uprv_strcpy(basename+12, suffix);
179	uprv_strcat(basename+12, ".txt");
180	}
181	parseDB(filename, &errorCode);
182
183	/* process parsed data */
184	if(U_SUCCESS(errorCode)) {
185	processData();
186
187	/* write the properties data file */
188	generateData(destDir);
189
190	cleanUpData();
191	}
192
193	#endif
194
195	return errorCode;
196	}
197
198	#if !UCONFIG_NO_NORMALIZATION
199
200	/* parser for DerivedNormalizationProperties.txt ---------------------------- */
201
202	static void U_CALLCONV
203	derivedNormalizationPropertiesLineFn(void *context,
204	char *fields[][2], int32_t fieldCount,
205	UErrorCode *pErrorCode) {
206	UChar string[32];
207	char *s;
208	uint32_t start, end;
209	int32_t count;
210	uint8_t qcFlags;
211
212	/* get code point range */
213	count=u_parseCodePointRange(fields[0][0], &start, &end, pErrorCode);
214	if(U_FAILURE(*pErrorCode)) {
215	fprintf(stderr, "gennorm: error parsing DerivedNormalizationProperties.txt mapping at %s\n", fields[0][0]);
216	exit(*pErrorCode);
217	}
218
219	/* ignore hangul - handle explicitly */
220	if(start==0xac00) {
221	return;
222	}
223
224	/* get property - ignore unrecognized ones */
225	s=(char *)u_skipWhitespace(fields[1][0]);
226	if(*s=='N' && s[1]=='F') {
227	/* quick check flag */
228	qcFlags=0x11;
229	s+=2;
230	if(*s=='K') {
231	qcFlags<<=1;
232	++s;
233	}
234
235	if(*s=='C' && s[1]=='_') {
236	s+=2;
237	} else if(*s=='D' && s[1]=='_') {
238	qcFlags<<=2;
239	s+=2;
240	} else {
241	return;
242	}
243
244	if(0==uprv_memcmp(s, "NO", 2)) {
245	qcFlags&=0xf;
246	} else if(0==uprv_memcmp(s, "MAYBE", 5)) {
247	qcFlags&=0x30;
248	} else {
249	return;
250	}
251
252	/* set this flag for all code points in this range */
253	while(start<=end) {
254	setQCFlags(start++, qcFlags);
255	}
256	} else if(0==uprv_memcmp(s, "Comp_Ex", 7) \|\| 0==uprv_memcmp(s, "Full_Composition_Exclusion", 26)) {
257	/* full composition exclusion */
258	while(start<=end) {
259	setCompositionExclusion(start++);
260	}
261	} else if(0==uprv_memcmp(s, "FNC", 3) && (s=(char )u_skipWhitespace(s+3))==';') {
262	/* FC_NFKC_Closure, parse field 2 to get the string */
263	char *t;
264
265	/* start of the field */
266	s=(char *)u_skipWhitespace(s+1);
267
268	/* find the end of the field */
269	for(t=s; t!=';' && t!='#' && t!=0 && t!='\n' && *t!='\r'; ++t) {}
270	*t=0;
271
272	string[0]=(UChar)u_parseString(s, string+1, 31, NULL, pErrorCode);
273	if(U_FAILURE(*pErrorCode)) {
274	fprintf(stderr, "gennorm error: illegal FNC string at %s\n", fields[0][0]);
275	exit(*pErrorCode);
276	}
277	while(start<=end) {
278	setFNC(start++, string);
279	}
280	}
281	}
282
283	static void
284	parseDerivedNormalizationProperties(const char filename, UErrorCode pErrorCode, UBool reportError) {
285	char *fields[2][2];
286
287	if(pErrorCode==NULL \|\| U_FAILURE(*pErrorCode)) {
288	return;
289	}
290
291	u_parseDelimitedFile(filename, ';', fields, 2, derivedNormalizationPropertiesLineFn, NULL, pErrorCode);
292	if(U_FAILURE(pErrorCode) && (reportError \|\| pErrorCode!=U_FILE_ACCESS_ERROR)) {
293	fprintf(stderr, "gennorm error: u_parseDelimitedFile(\"%s\") failed - %s\n", filename, u_errorName(*pErrorCode));
294	exit(*pErrorCode);
295	}
296	}
297
298	/* parser for UnicodeData.txt ----------------------------------------------- */
299
300	static void U_CALLCONV
301	unicodeDataLineFn(void *context,
302	char *fields[][2], int32_t fieldCount,
303	UErrorCode *pErrorCode) {
304	uint32_t decomp[40];
305	Norm norm;
306	const char *s;
307	char *end;
308	uint32_t code, value;
309	int32_t length;
310	UBool isCompat, something=FALSE;
311
312	/* ignore First and Last entries for ranges */
313	if( *fields[1][0]=='<' &&
314	(length=(int32_t)(fields[1][1]-fields[1][0]))>=9 &&
315	(0==uprv_memcmp(", First>", fields[1][1]-8, 8) \|\| 0==uprv_memcmp(", Last>", fields[1][1]-7, 7))
316	) {
317	return;
318	}
319
320	/* reset the properties */
321	uprv_memset(&norm, 0, sizeof(Norm));
322
323	/* get the character code, field 0 */
324	code=(uint32_t)uprv_strtoul(fields[0][0], &end, 16);
325	if(end<=fields[0][0] \|\| end!=fields[0][1]) {
326	fprintf(stderr, "gennorm: syntax error in field 0 at %s\n", fields[0][0]);
327	*pErrorCode=U_PARSE_ERROR;
328	exit(U_PARSE_ERROR);
329	}
330
331	/* get canonical combining class, field 3 */
332	value=(uint32_t)uprv_strtoul(fields[3][0], &end, 10);
333	if(end<=fields[3][0] \|\| end!=fields[3][1] \|\| value>0xff) {
334	fprintf(stderr, "gennorm: syntax error in field 3 at %s\n", fields[0][0]);
335	*pErrorCode=U_PARSE_ERROR;
336	exit(U_PARSE_ERROR);
337	}
338	if(value>0) {
339	norm.udataCC=(uint8_t)value;
340	something=TRUE;
341	}
342
343	/* get the decomposition, field 5 */
344	if(fields[5][0]<fields[5][1]) {
345	if(*(s=fields[5][0])=='<') {
346	++s;
347	isCompat=TRUE;
348
349	/* skip and ignore the compatibility type name */
350	do {
351	if(s==fields[5][1]) {
352	/* missing '>' */
353	fprintf(stderr, "gennorm: syntax error in field 5 at %s\n", fields[0][0]);
354	*pErrorCode=U_PARSE_ERROR;
355	exit(U_PARSE_ERROR);
356	}
357	} while(*s++!='>');
358	} else {
359	isCompat=FALSE;
360	}
361
362	/* parse the decomposition string */
363	length=u_parseCodePoints(s, decomp, sizeof(decomp)/4, pErrorCode);
364	if(U_FAILURE(*pErrorCode)) {
365	fprintf(stderr, "gennorm error parsing UnicodeData.txt decomposition of U+%04lx - %s\n",
366	(long)code, u_errorName(*pErrorCode));
367	exit(*pErrorCode);
368	}
369
370	/* store the string */
371	if(length>0) {
372	something=TRUE;
373	if(isCompat) {
374	norm.lenNFKD=(uint8_t)length;
375	norm.nfkd=decomp;
376	} else {
377	if(length>2) {
378	fprintf(stderr, "gennorm: error - length of NFD(U+%04lx) = %ld >2 in UnicodeData - illegal\n",
379	(long)code, (long)length);
380	*pErrorCode=U_PARSE_ERROR;
381	exit(U_PARSE_ERROR);
382	}
383	norm.lenNFD=(uint8_t)length;
384	norm.nfd=decomp;
385	}
386	}
387	}
388
389	/* check for non-character code points */
390	if((code&0xfffe)==0xfffe \|\| (uint32_t)(code-0xfdd0)<0x20 \|\| code>0x10ffff) {
391	fprintf(stderr, "gennorm: error - properties for non-character code point U+%04lx\n",
392	(long)code);
393	*pErrorCode=U_PARSE_ERROR;
394	exit(U_PARSE_ERROR);
395	}
396
397	if(something) {
398	/* there are normalization values, so store them */
399	#if 0
400	if(beVerbose) {
401	printf("store values for U+%04lx: cc=%d, lenNFD=%ld, lenNFKD=%ld\n",
402	(long)code, norm.udataCC, (long)norm.lenNFD, (long)norm.lenNFKD);
403	}
404	#endif
405	storeNorm(code, &norm);
406	}
407	}
408
409	static void
410	parseDB(const char filename, UErrorCode pErrorCode) {
411	char *fields[15][2];
412
413	if(pErrorCode==NULL \|\| U_FAILURE(*pErrorCode)) {
414	return;
415	}
416
417	u_parseDelimitedFile(filename, ';', fields, 15, unicodeDataLineFn, NULL, pErrorCode);
418	if(U_FAILURE(*pErrorCode)) {
419	fprintf(stderr, "gennorm error: u_parseDelimitedFile(\"%s\") failed - %s\n", filename, u_errorName(*pErrorCode));
420	exit(*pErrorCode);
421	}
422	}
423
424	#endif /* #if !UCONFIG_NO_NORMALIZATION */
425
426	/*
427	* Hey, Emacs, please set the following:
428	*
429	* Local Variables:
430	* indent-tabs-mode: nil
431	* End:
432	*
433	*/