git.saurik.com Git - apple/icu.git/blame_incremental - icuSources/tools/gennorm/gennorm.c

... / ...

Commit	Line	Data
	1	/*
	2	*******************************************************************************
	3	*
	4	* Copyright (C) 2001-2004, International Business Machines
	5	* Corporation and others. All Rights Reserved.
	6	*
	7	*******************************************************************************
	8	* file name: gennorm.c
	9	* encoding: US-ASCII
	10	* tab size: 8 (not used)
	11	* indentation:4
	12	*
	13	* created on: 2001may25
	14	* created by: Markus W. Scherer
	15	*
	16	* This program reads the Unicode character database text file,
	17	* parses it, and extracts the data for normalization.
	18	* It then preprocesses it and writes a binary file for efficient use
	19	* in various Unicode text normalization processes.
	20	*/
	21
	22	#include <stdio.h>
	23	#include <stdlib.h>
	24	#include "unicode/utypes.h"
	25	#include "unicode/uchar.h"
	26	#include "unicode/ustring.h"
	27	#include "unicode/putil.h"
	28	#include "unicode/uclean.h"
	29	#include "unicode/udata.h"
	30	#include "unicode/uset.h"
	31	#include "cmemory.h"
	32	#include "cstring.h"
	33	#include "unewdata.h"
	34	#include "uoptions.h"
	35	#include "uparse.h"
	36	#include "unormimp.h"
	37
	38	U_CDECL_BEGIN
	39	#include "gennorm.h"
	40	U_CDECL_END
	41
	42	#ifdef WIN32
	43	# pragma warning(disable: 4100)
	44	#endif
	45
	46	UBool beVerbose=FALSE, haveCopyright=TRUE;
	47
	48	/* prototypes --------------------------------------------------------------- */
	49
	50	static void
	51	parseDerivedNormalizationProperties(const char filename, UErrorCode pErrorCode, UBool reportError);
	52
	53	static void
	54	parseDB(const char filename, UErrorCode pErrorCode);
	55
	56	/* -------------------------------------------------------------------------- */
	57
	/*
	 * Indexes into options[] below.
	 * The order of these constants must match the order of the options[] initializers.
	 */
	enum {
	    HELP_H=0,
	    HELP_QUESTION_MARK,
	    VERBOSE,
	    COPYRIGHT,
	    DESTDIR,
	    SOURCEDIR,
	    UNICODE_VERSION,
	    ICUDATADIR
	};
	68
	69	static UOption options[]={
	70	UOPTION_HELP_H,
	71	UOPTION_HELP_QUESTION_MARK,
	72	UOPTION_VERBOSE,
	73	UOPTION_COPYRIGHT,
	74	UOPTION_DESTDIR,
	75	UOPTION_SOURCEDIR,
	76	{ "unicode", NULL, NULL, NULL, 'u', UOPT_REQUIRES_ARG, 0 },
	77	UOPTION_ICUDATADIR
	78	};
	79
	80	extern int
	81	main(int argc, char* argv[]) {
	82	#if !UCONFIG_NO_NORMALIZATION
	83	char filename[300];
	84	#endif
	85	const char srcDir=NULL, destDir=NULL, *suffix=NULL;
	86	char *basename=NULL;
	87	UErrorCode errorCode=U_ZERO_ERROR;
	88
	89	U_MAIN_INIT_ARGS(argc, argv);
	90
	91	/* preset then read command line options */
	92	options[4].value=u_getDataDirectory();
	93	options[5].value="";
	94	options[6].value="3.0.0";
	95	options[ICUDATADIR].value=u_getDataDirectory();
	96	argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), options);
	97
	98	/* error handling, printing usage message */
	99	if(argc<0) {
	100	fprintf(stderr,
	101	"error in command line argument \"%s\"\n",
	102	argv[-argc]);
	103	}
	104	if(argc<0 \|\| options[0].doesOccur \|\| options[1].doesOccur) {
	105	/*
	106	* Broken into chucks because the C89 standard says the minimum
	107	* required supported string length is 509 bytes.
	108	*/
	109	fprintf(stderr,
	110	"Usage: %s [-options] [suffix]\n"
	111	"\n"
	112	"Read the UnicodeData.txt file and other Unicode properties files and\n"
	113	"create a binary file " U_ICUDATA_NAME "_" DATA_NAME "." DATA_TYPE " with the normalization data\n"
	114	"\n",
	115	argv[0]);
	116	fprintf(stderr,
	117	"Options:\n"
	118	"\t-h or -? or --help this usage text\n"
	119	"\t-v or --verbose verbose output\n"
	120	"\t-c or --copyright include a copyright notice\n"
	121	"\t-u or --unicode Unicode version, followed by the version like 3.0.0\n");
	122	fprintf(stderr,
	123	"\t-d or --destdir destination directory, followed by the path\n"
	124	"\t-s or --sourcedir source directory, followed by the path\n"
	125	"\t-i or --icudatadir directory for locating any needed intermediate data files,\n"
	126	"\t followed by path, defaults to <%s>\n"
	127	"\tsuffix suffix that is to be appended with a '-'\n"
	128	"\t to the source file basenames before opening;\n"
	129	"\t 'gennorm new' will read UnicodeData-new.txt etc.\n",
	130	u_getDataDirectory());
	131	return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR;
	132	}
	133
	134	/* get the options values */
	135	beVerbose=options[2].doesOccur;
	136	haveCopyright=options[3].doesOccur;
	137	srcDir=options[5].value;
	138	destDir=options[4].value;
	139
	140	if(argc>=2) {
	141	suffix=argv[1];
	142	} else {
	143	suffix=NULL;
	144	}
	145
	146	#if UCONFIG_NO_NORMALIZATION
	147
	148	fprintf(stderr,
	149	"gennorm writes a dummy " U_ICUDATA_NAME "_" DATA_NAME "." DATA_TYPE
	150	" because UCONFIG_NO_NORMALIZATION is set, \n"
	151	"see icu/source/common/unicode/uconfig.h\n");
	152	generateData(destDir);
	153
	154	#else
	155
	156	setUnicodeVersion(options[6].value);
	157
	158	if (options[ICUDATADIR].doesOccur) {
	159	u_setDataDirectory(options[ICUDATADIR].value);
	160	}
	161
	162	/*
	163	* Verify that we can work with properties
	164	* but don't call u_init() because that needs unorm.icu which we are just
	165	* going to build here.
	166	*/
	167	{
	168	U_STRING_DECL(ideo, "[:Ideographic:]", 15);
	169	USet *set;
	170
	171	U_STRING_INIT(ideo, "[:Ideographic:]", 15);
	172	set=uset_openPattern(ideo, -1, &errorCode);
	173	if(U_FAILURE(errorCode) \|\| !uset_contains(set, 0xf900)) {
	174	fprintf(stderr, "gennorm is unable to work with properties (uprops.icu): %s\n", u_errorName(errorCode));
	175	exit(errorCode);
	176	}
	177	uset_close(set);
	178	}
	179
	180	/* prepare the filename beginning with the source dir */
	181	uprv_strcpy(filename, srcDir);
	182	basename=filename+uprv_strlen(filename);
	183	if(basename>filename && *(basename-1)!=U_FILE_SEP_CHAR) {
	184	*basename++=U_FILE_SEP_CHAR;
	185	}
	186
	187	/* initialize */
	188	init();
	189
	190	/* process DerivedNormalizationProps.txt (name changed for Unicode 3.2, to <=31 characters) */
	191	if(suffix==NULL) {
	192	uprv_strcpy(basename, "DerivedNormalizationProps.txt");
	193	} else {
	194	uprv_strcpy(basename, "DerivedNormalizationProps");
	195	basename[30]='-';
	196	uprv_strcpy(basename+31, suffix);
	197	uprv_strcat(basename+31, ".txt");
	198	}
	199	parseDerivedNormalizationProperties(filename, &errorCode, FALSE);
	200	if(U_FAILURE(errorCode)) {
	201	/* can be only U_FILE_ACCESS_ERROR - try filename from before Unicode 3.2 */
	202	if(suffix==NULL) {
	203	uprv_strcpy(basename, "DerivedNormalizationProperties.txt");
	204	} else {
	205	uprv_strcpy(basename, "DerivedNormalizationProperties");
	206	basename[30]='-';
	207	uprv_strcpy(basename+31, suffix);
	208	uprv_strcat(basename+31, ".txt");
	209	}
	210	parseDerivedNormalizationProperties(filename, &errorCode, TRUE);
	211	}
	212
	213	/* process UnicodeData.txt */
	214	if(suffix==NULL) {
	215	uprv_strcpy(basename, "UnicodeData.txt");
	216	} else {
	217	uprv_strcpy(basename, "UnicodeData");
	218	basename[11]='-';
	219	uprv_strcpy(basename+12, suffix);
	220	uprv_strcat(basename+12, ".txt");
	221	}
	222	parseDB(filename, &errorCode);
	223
	224	/* process parsed data */
	225	if(U_SUCCESS(errorCode)) {
	226	processData();
	227
	228	/* write the properties data file */
	229	generateData(destDir);
	230
	231	cleanUpData();
	232	}
	233
	234	#endif
	235
	236	return errorCode;
	237	}
	238
	239	#if !UCONFIG_NO_NORMALIZATION
	240
	241	/* parser for DerivedNormalizationProperties.txt ---------------------------- */
	242
	243	static void U_CALLCONV
	244	derivedNormalizationPropertiesLineFn(void *context,
	245	char *fields[][2], int32_t fieldCount,
	246	UErrorCode *pErrorCode) {
	247	UChar string[32];
	248	char *s;
	249	uint32_t start, end;
	250	int32_t count;
	251	uint8_t qcFlags;
	252
	253	/* get code point range */
	254	count=u_parseCodePointRange(fields[0][0], &start, &end, pErrorCode);
	255	if(U_FAILURE(*pErrorCode)) {
	256	fprintf(stderr, "gennorm: error parsing DerivedNormalizationProperties.txt mapping at %s\n", fields[0][0]);
	257	exit(*pErrorCode);
	258	}
	259
	260	/* ignore hangul - handle explicitly */
	261	if(start==0xac00) {
	262	return;
	263	}
	264
	265	/* get property - ignore unrecognized ones */
	266	s=(char *)u_skipWhitespace(fields[1][0]);
	267	if(*s=='N' && s[1]=='F') {
	268	/* quick check flag */
	269	qcFlags=0x11;
	270	s+=2;
	271	if(*s=='K') {
	272	qcFlags<<=1;
	273	++s;
	274	}
	275
	276	if(*s=='C' && s[1]=='_') {
	277	s+=2;
	278	} else if(*s=='D' && s[1]=='_') {
	279	qcFlags<<=2;
	280	s+=2;
	281	} else {
	282	return;
	283	}
	284
	285	if(0==uprv_strncmp(s, "NO", 2)) {
	286	qcFlags&=0xf;
	287	} else if(0==uprv_strncmp(s, "MAYBE", 5)) {
	288	qcFlags&=0x30;
	289	} else if(0==uprv_strncmp(s, "QC", 2) && (s=(char )u_skipWhitespace(s+2))==';') {
	290	/*
	291	* Unicode 4.0.1:
	292	* changes single field "NFD_NO" -> two fields "NFD_QC; N" etc.
	293	*/
	294	/* start of the field */
	295	s=(char *)u_skipWhitespace(s+1);
	296	if(*s=='N') {
	297	qcFlags&=0xf;
	298	} else if(*s=='M') {
	299	qcFlags&=0x30;
	300	} else {
	301	return; /* do nothing for "Yes" because it's the default value */
	302	}
	303	} else {
	304	return; /* do nothing for "Yes" because it's the default value */
	305	}
	306
	307	/* set this flag for all code points in this range */
	308	while(start<=end) {
	309	setQCFlags(start++, qcFlags);
	310	}
	311	} else if(0==uprv_memcmp(s, "Comp_Ex", 7) \|\| 0==uprv_memcmp(s, "Full_Composition_Exclusion", 26)) {
	312	/* full composition exclusion */
	313	while(start<=end) {
	314	setCompositionExclusion(start++);
	315	}
	316	} else if(
	317	((0==uprv_memcmp(s, "FNC", 3) && (s=(char )u_skipWhitespace(s+3))==';') \|\|
	318	(0==uprv_memcmp(s, "FC_NFKC", 7) && (s=(char )u_skipWhitespace(s+7))==';'))
	319
	320	) {
	321	/* FC_NFKC_Closure, parse field 2 to get the string */
	322	char *t;
	323
	324	/* start of the field */
	325	s=(char *)u_skipWhitespace(s+1);
	326
	327	/* find the end of the field */
	328	for(t=s; t!=';' && t!='#' && t!=0 && t!='\n' && *t!='\r'; ++t) {}
	329	*t=0;
	330
	331	string[0]=(UChar)u_parseString(s, string+1, 31, NULL, pErrorCode);
	332	if(U_FAILURE(*pErrorCode)) {
	333	fprintf(stderr, "gennorm error: illegal FNC string at %s\n", fields[0][0]);
	334	exit(*pErrorCode);
	335	}
	336	while(start<=end) {
	337	setFNC(start++, string);
	338	}
	339	}
	340	}
	341
	342	static void
	343	parseDerivedNormalizationProperties(const char filename, UErrorCode pErrorCode, UBool reportError) {
	344	char *fields[2][2];
	345
	346	if(pErrorCode==NULL \|\| U_FAILURE(*pErrorCode)) {
	347	return;
	348	}
	349
	350	u_parseDelimitedFile(filename, ';', fields, 2, derivedNormalizationPropertiesLineFn, NULL, pErrorCode);
	351	if(U_FAILURE(pErrorCode) && (reportError \|\| pErrorCode!=U_FILE_ACCESS_ERROR)) {
	352	fprintf(stderr, "gennorm error: u_parseDelimitedFile(\"%s\") failed - %s\n", filename, u_errorName(*pErrorCode));
	353	exit(*pErrorCode);
	354	}
	355	}
	356
	357	/* parser for UnicodeData.txt ----------------------------------------------- */
	358
	359	static void U_CALLCONV
	360	unicodeDataLineFn(void *context,
	361	char *fields[][2], int32_t fieldCount,
	362	UErrorCode *pErrorCode) {
	363	uint32_t decomp[40];
	364	Norm norm;
	365	const char *s;
	366	char *end;
	367	uint32_t code, value;
	368	int32_t length;
	369	UBool isCompat, something=FALSE;
	370
	371	/* ignore First and Last entries for ranges */
	372	if( *fields[1][0]=='<' &&
	373	(length=(int32_t)(fields[1][1]-fields[1][0]))>=9 &&
	374	(0==uprv_memcmp(", First>", fields[1][1]-8, 8) \|\| 0==uprv_memcmp(", Last>", fields[1][1]-7, 7))
	375	) {
	376	return;
	377	}
	378
	379	/* reset the properties */
	380	uprv_memset(&norm, 0, sizeof(Norm));
	381
	382	/* get the character code, field 0 */
	383	code=(uint32_t)uprv_strtoul(fields[0][0], &end, 16);
	384	if(end<=fields[0][0] \|\| end!=fields[0][1]) {
	385	fprintf(stderr, "gennorm: syntax error in field 0 at %s\n", fields[0][0]);
	386	*pErrorCode=U_PARSE_ERROR;
	387	exit(U_PARSE_ERROR);
	388	}
	389
	390	/* get canonical combining class, field 3 */
	391	value=(uint32_t)uprv_strtoul(fields[3][0], &end, 10);
	392	if(end<=fields[3][0] \|\| end!=fields[3][1] \|\| value>0xff) {
	393	fprintf(stderr, "gennorm: syntax error in field 3 at %s\n", fields[0][0]);
	394	*pErrorCode=U_PARSE_ERROR;
	395	exit(U_PARSE_ERROR);
	396	}
	397	if(value>0) {
	398	norm.udataCC=(uint8_t)value;
	399	something=TRUE;
	400	}
	401
	402	/* get the decomposition, field 5 */
	403	if(fields[5][0]<fields[5][1]) {
	404	if(*(s=fields[5][0])=='<') {
	405	++s;
	406	isCompat=TRUE;
	407
	408	/* skip and ignore the compatibility type name */
	409	do {
	410	if(s==fields[5][1]) {
	411	/* missing '>' */
	412	fprintf(stderr, "gennorm: syntax error in field 5 at %s\n", fields[0][0]);
	413	*pErrorCode=U_PARSE_ERROR;
	414	exit(U_PARSE_ERROR);
	415	}
	416	} while(*s++!='>');
	417	} else {
	418	isCompat=FALSE;
	419	}
	420
	421	/* parse the decomposition string */
	422	length=u_parseCodePoints(s, decomp, sizeof(decomp)/4, pErrorCode);
	423	if(U_FAILURE(*pErrorCode)) {
	424	fprintf(stderr, "gennorm error parsing UnicodeData.txt decomposition of U+%04lx - %s\n",
	425	(long)code, u_errorName(*pErrorCode));
	426	exit(*pErrorCode);
	427	}
	428
	429	/* store the string */
	430	if(length>0) {
	431	something=TRUE;
	432	if(isCompat) {
	433	norm.lenNFKD=(uint8_t)length;
	434	norm.nfkd=decomp;
	435	} else {
	436	if(length>2) {
	437	fprintf(stderr, "gennorm: error - length of NFD(U+%04lx) = %ld >2 in UnicodeData - illegal\n",
	438	(long)code, (long)length);
	439	*pErrorCode=U_PARSE_ERROR;
	440	exit(U_PARSE_ERROR);
	441	}
	442	norm.lenNFD=(uint8_t)length;
	443	norm.nfd=decomp;
	444	}
	445	}
	446	}
	447
	448	/* check for non-character code points */
	449	if((code&0xfffe)==0xfffe \|\| (uint32_t)(code-0xfdd0)<0x20 \|\| code>0x10ffff) {
	450	fprintf(stderr, "gennorm: error - properties for non-character code point U+%04lx\n",
	451	(long)code);
	452	*pErrorCode=U_PARSE_ERROR;
	453	exit(U_PARSE_ERROR);
	454	}
	455
	456	if(something) {
	457	/* there are normalization values, so store them */
	458	#if 0
	459	if(beVerbose) {
	460	printf("store values for U+%04lx: cc=%d, lenNFD=%ld, lenNFKD=%ld\n",
	461	(long)code, norm.udataCC, (long)norm.lenNFD, (long)norm.lenNFKD);
	462	}
	463	#endif
	464	storeNorm(code, &norm);
	465	}
	466	}
	467
	468	static void
	469	parseDB(const char filename, UErrorCode pErrorCode) {
	470	char *fields[15][2];
	471
	472	if(pErrorCode==NULL \|\| U_FAILURE(*pErrorCode)) {
	473	return;
	474	}
	475
	476	u_parseDelimitedFile(filename, ';', fields, 15, unicodeDataLineFn, NULL, pErrorCode);
	477	if(U_FAILURE(*pErrorCode)) {
	478	fprintf(stderr, "gennorm error: u_parseDelimitedFile(\"%s\") failed - %s\n", filename, u_errorName(*pErrorCode));
	479	exit(*pErrorCode);
	480	}
	481	}
	482
	483	#endif /* #if !UCONFIG_NO_NORMALIZATION */
	484
	485	/*
	486	* Hey, Emacs, please set the following:
	487	*
	488	* Local Variables:
	489	* indent-tabs-mode: nil
	490	* End:
	491	*
	492	*/