git.saurik.com Git - apple/icu.git/blame_incremental - icuSources/tools/gennorm/gennorm.c

... / ...

Commit	Line	Data
	1	/*
	2	*******************************************************************************
	3	*
	4	* Copyright (C) 2001-2005, International Business Machines
	5	* Corporation and others. All Rights Reserved.
	6	*
	7	*******************************************************************************
	8	* file name: gennorm.c
	9	* encoding: US-ASCII
	10	* tab size: 8 (not used)
	11	* indentation:4
	12	*
	13	* created on: 2001may25
	14	* created by: Markus W. Scherer
	15	*
	16	* This program reads the Unicode character database text file,
	17	* parses it, and extracts the data for normalization.
	18	* It then preprocesses it and writes a binary file for efficient use
	19	* in various Unicode text normalization processes.
	20	*/
	21
	22	#include <stdio.h>
	23	#include <stdlib.h>
	24	#include "unicode/utypes.h"
	25	#include "unicode/uchar.h"
	26	#include "unicode/ustring.h"
	27	#include "unicode/putil.h"
	28	#include "unicode/uclean.h"
	29	#include "unicode/udata.h"
	30	#include "unicode/uset.h"
	31	#include "cmemory.h"
	32	#include "cstring.h"
	33	#include "unewdata.h"
	34	#include "uoptions.h"
	35	#include "uparse.h"
	36	#include "unormimp.h"
	37
	38	U_CDECL_BEGIN
	39	#include "gennorm.h"
	40	U_CDECL_END
	41
	42	UBool beVerbose=FALSE, haveCopyright=TRUE;
	43
	44	/* prototypes --------------------------------------------------------------- */
	45
	46	static void
	47	parseDerivedNormalizationProperties(const char filename, UErrorCode pErrorCode, UBool reportError);
	48
	49	static void
	50	parseDB(const char filename, UErrorCode pErrorCode);
	51
	52	/* -------------------------------------------------------------------------- */
	53
	/*
	 * Indexes into the options[] array.
	 * The order here must stay in sync with the initializer order of options[].
	 */
	enum {
	    HELP_H=0,           /* -h */
	    HELP_QUESTION_MARK, /* -? */
	    VERBOSE,            /* -v */
	    COPYRIGHT,          /* -c */
	    DESTDIR,            /* -d */
	    SOURCEDIR,          /* -s */
	    UNICODE_VERSION,    /* -u */
	    ICUDATADIR,         /* -i */
	    CSOURCE,            /* -C */
	    STORE_FLAGS         /* -p (prune flags) */
	};
	66
	67	static UOption options[]={
	68	UOPTION_HELP_H,
	69	UOPTION_HELP_QUESTION_MARK,
	70	UOPTION_VERBOSE,
	71	UOPTION_COPYRIGHT,
	72	UOPTION_DESTDIR,
	73	UOPTION_SOURCEDIR,
	74	UOPTION_DEF("unicode", 'u', UOPT_REQUIRES_ARG),
	75	UOPTION_ICUDATADIR,
	76	UOPTION_DEF("csource", 'C', UOPT_NO_ARG),
	77	UOPTION_DEF("prune", 'p', UOPT_REQUIRES_ARG)
	78	};
	79
	80	extern int
	81	main(int argc, char* argv[]) {
	82	#if !UCONFIG_NO_NORMALIZATION
	83	char filename[300];
	84	#endif
	85	const char srcDir=NULL, destDir=NULL, *suffix=NULL;
	86	char *basename=NULL;
	87	UErrorCode errorCode=U_ZERO_ERROR;
	88
	89	U_MAIN_INIT_ARGS(argc, argv);
	90
	91	/* preset then read command line options */
	92	options[4].value=u_getDataDirectory();
	93	options[5].value="";
	94	options[6].value="3.0.0";
	95	options[ICUDATADIR].value=u_getDataDirectory();
	96	argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), options);
	97
	98	/* error handling, printing usage message */
	99	if(argc<0) {
	100	fprintf(stderr,
	101	"error in command line argument \"%s\"\n",
	102	argv[-argc]);
	103	}
	104	if(argc<0 \|\| options[0].doesOccur \|\| options[1].doesOccur) {
	105	/*
	106	* Broken into chucks because the C89 standard says the minimum
	107	* required supported string length is 509 bytes.
	108	*/
	109	fprintf(stderr,
	110	"Usage: %s [-options] [suffix]\n"
	111	"\n"
	112	"Read the UnicodeData.txt file and other Unicode properties files and\n"
	113	"create a binary file " U_ICUDATA_NAME "_" DATA_NAME "." DATA_TYPE " with the normalization data\n"
	114	"\n",
	115	argv[0]);
	116	fprintf(stderr,
	117	"Options:\n"
	118	"\t-h or -? or --help this usage text\n"
	119	"\t-v or --verbose verbose output\n"
	120	"\t-c or --copyright include a copyright notice\n"
	121	"\t-u or --unicode Unicode version, followed by the version like 3.0.0\n"
	122	"\t-C or --csource generate a .c source file rather than the .icu binary\n");
	123	fprintf(stderr,
	124	"\t-p or --prune flags Prune for data modularization:\n"
	125	"\t Determine what data is to be stored.\n"
	126	"\t 0 (zero) stores minimal data (only for NFD)\n"
	127	"\t lowercase letters turn off data, uppercase turn on (use with 0)\n");
	128	fprintf(stderr,
	129	"\t k: compatibility decompositions (NFKC, NFKD)\n"
	130	"\t c: composition data (NFC, NFKC)\n"
	131	"\t f: FCD data (will be generated at load time)\n"
	132	"\t a: auxiliary data (canonical closure etc.)\n"
	133	"\t x: exclusion sets (Unicode 3.2-level normalization)\n");
	134	fprintf(stderr,
	135	"\t-d or --destdir destination directory, followed by the path\n"
	136	"\t-s or --sourcedir source directory, followed by the path\n"
	137	"\t-i or --icudatadir directory for locating any needed intermediate data files,\n"
	138	"\t followed by path, defaults to <%s>\n"
	139	"\tsuffix suffix that is to be appended with a '-'\n"
	140	"\t to the source file basenames before opening;\n"
	141	"\t 'gennorm new' will read UnicodeData-new.txt etc.\n",
	142	u_getDataDirectory());
	143	return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR;
	144	}
	145
	146	/* get the options values */
	147	beVerbose=options[2].doesOccur;
	148	haveCopyright=options[3].doesOccur;
	149	srcDir=options[5].value;
	150	destDir=options[4].value;
	151
	152	if(argc>=2) {
	153	suffix=argv[1];
	154	} else {
	155	suffix=NULL;
	156	}
	157
	158	#if UCONFIG_NO_NORMALIZATION
	159
	160	fprintf(stderr,
	161	"gennorm writes a dummy " U_ICUDATA_NAME "_" DATA_NAME "." DATA_TYPE
	162	" because UCONFIG_NO_NORMALIZATION is set, \n"
	163	"see icu/source/common/unicode/uconfig.h\n");
	164	generateData(destDir, options[CSOURCE].doesOccur);
	165
	166	#else
	167
	168	setUnicodeVersion(options[6].value);
	169
	170	if (options[ICUDATADIR].doesOccur) {
	171	u_setDataDirectory(options[ICUDATADIR].value);
	172	}
	173
	174	if(options[STORE_FLAGS].doesOccur) {
	175	const char *s=options[STORE_FLAGS].value;
	176	char c;
	177
	178	while((c=*s++)!=0) {
	179	switch(c) {
	180	case '0':
	181	gStoreFlags=0; /* store minimal data (only for NFD) */
	182	break;
	183
	184	/* lowercase letters: omit data */
	185	case 'k':
	186	gStoreFlags&=~U_MASK(UGENNORM_STORE_COMPAT);
	187	break;
	188	case 'c':
	189	gStoreFlags&=~U_MASK(UGENNORM_STORE_COMPOSITION);
	190	break;
	191	case 'f':
	192	gStoreFlags&=~U_MASK(UGENNORM_STORE_FCD);
	193	break;
	194	case 'a':
	195	gStoreFlags&=~U_MASK(UGENNORM_STORE_AUX);
	196	break;
	197	case 'x':
	198	gStoreFlags&=~U_MASK(UGENNORM_STORE_EXCLUSIONS);
	199	break;
	200
	201	/* uppercase letters: include data (use with 0) */
	202	case 'K':
	203	gStoreFlags\|=U_MASK(UGENNORM_STORE_COMPAT);
	204	break;
	205	case 'C':
	206	gStoreFlags\|=U_MASK(UGENNORM_STORE_COMPOSITION);
	207	break;
	208	case 'F':
	209	gStoreFlags\|=U_MASK(UGENNORM_STORE_FCD);
	210	break;
	211	case 'A':
	212	gStoreFlags\|=U_MASK(UGENNORM_STORE_AUX);
	213	break;
	214	case 'X':
	215	gStoreFlags\|=U_MASK(UGENNORM_STORE_EXCLUSIONS);
	216	break;
	217
	218	default:
	219	fprintf(stderr, "ignoring undefined prune flag '%c'\n", c);
	220	break;
	221	}
	222	}
	223	}
	224
	225	/*
	226	* Verify that we can work with properties
	227	* but don't call u_init() because that needs unorm.icu which we are just
	228	* going to build here.
	229	*/
	230	{
	231	U_STRING_DECL(ideo, "[:Ideographic:]", 15);
	232	USet *set;
	233
	234	U_STRING_INIT(ideo, "[:Ideographic:]", 15);
	235	set=uset_openPattern(ideo, -1, &errorCode);
	236	if(U_FAILURE(errorCode) \|\| !uset_contains(set, 0xf900)) {
	237	fprintf(stderr, "gennorm is unable to work with properties (uprops.icu): %s\n", u_errorName(errorCode));
	238	exit(errorCode);
	239	}
	240	uset_close(set);
	241	}
	242
	243	/* prepare the filename beginning with the source dir */
	244	uprv_strcpy(filename, srcDir);
	245	basename=filename+uprv_strlen(filename);
	246	if(basename>filename && *(basename-1)!=U_FILE_SEP_CHAR) {
	247	*basename++=U_FILE_SEP_CHAR;
	248	}
	249
	250	/* initialize */
	251	init();
	252
	253	/* process DerivedNormalizationProps.txt (name changed for Unicode 3.2, to <=31 characters) */
	254	if(suffix==NULL) {
	255	uprv_strcpy(basename, "DerivedNormalizationProps.txt");
	256	} else {
	257	uprv_strcpy(basename, "DerivedNormalizationProps");
	258	basename[30]='-';
	259	uprv_strcpy(basename+31, suffix);
	260	uprv_strcat(basename+31, ".txt");
	261	}
	262	parseDerivedNormalizationProperties(filename, &errorCode, FALSE);
	263	if(U_FAILURE(errorCode)) {
	264	/* can be only U_FILE_ACCESS_ERROR - try filename from before Unicode 3.2 */
	265	if(suffix==NULL) {
	266	uprv_strcpy(basename, "DerivedNormalizationProperties.txt");
	267	} else {
	268	uprv_strcpy(basename, "DerivedNormalizationProperties");
	269	basename[30]='-';
	270	uprv_strcpy(basename+31, suffix);
	271	uprv_strcat(basename+31, ".txt");
	272	}
	273	parseDerivedNormalizationProperties(filename, &errorCode, TRUE);
	274	}
	275
	276	/* process UnicodeData.txt */
	277	if(suffix==NULL) {
	278	uprv_strcpy(basename, "UnicodeData.txt");
	279	} else {
	280	uprv_strcpy(basename, "UnicodeData");
	281	basename[11]='-';
	282	uprv_strcpy(basename+12, suffix);
	283	uprv_strcat(basename+12, ".txt");
	284	}
	285	parseDB(filename, &errorCode);
	286
	287	/* process parsed data */
	288	if(U_SUCCESS(errorCode)) {
	289	processData();
	290
	291	/* write the properties data file */
	292	generateData(destDir, options[CSOURCE].doesOccur);
	293
	294	cleanUpData();
	295	}
	296
	297	#endif
	298
	299	return errorCode;
	300	}
	301
	302	#if !UCONFIG_NO_NORMALIZATION
	303
	304	/* parser for DerivedNormalizationProperties.txt ---------------------------- */
	305
	306	static void U_CALLCONV
	307	derivedNormalizationPropertiesLineFn(void *context,
	308	char *fields[][2], int32_t fieldCount,
	309	UErrorCode *pErrorCode) {
	310	UChar string[32];
	311	char *s;
	312	uint32_t start, end;
	313	int32_t count;
	314	uint8_t qcFlags;
	315
	316	/* get code point range */
	317	count=u_parseCodePointRange(fields[0][0], &start, &end, pErrorCode);
	318	if(U_FAILURE(*pErrorCode)) {
	319	fprintf(stderr, "gennorm: error parsing DerivedNormalizationProperties.txt mapping at %s\n", fields[0][0]);
	320	exit(*pErrorCode);
	321	}
	322
	323	/* ignore hangul - handle explicitly */
	324	if(start==0xac00) {
	325	return;
	326	}
	327
	328	/* get property - ignore unrecognized ones */
	329	s=(char *)u_skipWhitespace(fields[1][0]);
	330	if(*s=='N' && s[1]=='F') {
	331	/* quick check flag */
	332	qcFlags=0x11;
	333	s+=2;
	334	if(*s=='K') {
	335	qcFlags<<=1;
	336	++s;
	337	}
	338
	339	if(*s=='C' && s[1]=='_') {
	340	s+=2;
	341	} else if(*s=='D' && s[1]=='_') {
	342	qcFlags<<=2;
	343	s+=2;
	344	} else {
	345	return;
	346	}
	347
	348	if(0==uprv_strncmp(s, "NO", 2)) {
	349	qcFlags&=0xf;
	350	} else if(0==uprv_strncmp(s, "MAYBE", 5)) {
	351	qcFlags&=0x30;
	352	} else if(0==uprv_strncmp(s, "QC", 2) && (s=(char )u_skipWhitespace(s+2))==';') {
	353	/*
	354	* Unicode 4.0.1:
	355	* changes single field "NFD_NO" -> two fields "NFD_QC; N" etc.
	356	*/
	357	/* start of the field */
	358	s=(char *)u_skipWhitespace(s+1);
	359	if(*s=='N') {
	360	qcFlags&=0xf;
	361	} else if(*s=='M') {
	362	qcFlags&=0x30;
	363	} else {
	364	return; /* do nothing for "Yes" because it's the default value */
	365	}
	366	} else {
	367	return; /* do nothing for "Yes" because it's the default value */
	368	}
	369
	370	/* set this flag for all code points in this range */
	371	while(start<=end) {
	372	setQCFlags(start++, qcFlags);
	373	}
	374	} else if(0==uprv_memcmp(s, "Comp_Ex", 7) \|\| 0==uprv_memcmp(s, "Full_Composition_Exclusion", 26)) {
	375	/* full composition exclusion */
	376	while(start<=end) {
	377	setCompositionExclusion(start++);
	378	}
	379	} else if(
	380	((0==uprv_memcmp(s, "FNC", 3) && (s=(char )u_skipWhitespace(s+3))==';') \|\|
	381	(0==uprv_memcmp(s, "FC_NFKC", 7) && (s=(char )u_skipWhitespace(s+7))==';'))
	382
	383	) {
	384	/* FC_NFKC_Closure, parse field 2 to get the string */
	385	char *t;
	386
	387	/* start of the field */
	388	s=(char *)u_skipWhitespace(s+1);
	389
	390	/* find the end of the field */
	391	for(t=s; t!=';' && t!='#' && t!=0 && t!='\n' && *t!='\r'; ++t) {}
	392	*t=0;
	393
	394	string[0]=(UChar)u_parseString(s, string+1, 31, NULL, pErrorCode);
	395	if(U_FAILURE(*pErrorCode)) {
	396	fprintf(stderr, "gennorm error: illegal FNC string at %s\n", fields[0][0]);
	397	exit(*pErrorCode);
	398	}
	399	while(start<=end) {
	400	setFNC(start++, string);
	401	}
	402	}
	403	}
	404
	405	static void
	406	parseDerivedNormalizationProperties(const char filename, UErrorCode pErrorCode, UBool reportError) {
	407	char *fields[2][2];
	408
	409	if(pErrorCode==NULL \|\| U_FAILURE(*pErrorCode)) {
	410	return;
	411	}
	412
	413	u_parseDelimitedFile(filename, ';', fields, 2, derivedNormalizationPropertiesLineFn, NULL, pErrorCode);
	414	if(U_FAILURE(pErrorCode) && (reportError \|\| pErrorCode!=U_FILE_ACCESS_ERROR)) {
	415	fprintf(stderr, "gennorm error: u_parseDelimitedFile(\"%s\") failed - %s\n", filename, u_errorName(*pErrorCode));
	416	exit(*pErrorCode);
	417	}
	418	}
	419
	420	/* parser for UnicodeData.txt ----------------------------------------------- */
	421
	422	static void U_CALLCONV
	423	unicodeDataLineFn(void *context,
	424	char *fields[][2], int32_t fieldCount,
	425	UErrorCode *pErrorCode) {
	426	uint32_t decomp[40];
	427	Norm norm;
	428	const char *s;
	429	char *end;
	430	uint32_t code, value;
	431	int32_t length;
	432	UBool isCompat, something=FALSE;
	433
	434	/* ignore First and Last entries for ranges */
	435	if( *fields[1][0]=='<' &&
	436	(length=(int32_t)(fields[1][1]-fields[1][0]))>=9 &&
	437	(0==uprv_memcmp(", First>", fields[1][1]-8, 8) \|\| 0==uprv_memcmp(", Last>", fields[1][1]-7, 7))
	438	) {
	439	return;
	440	}
	441
	442	/* reset the properties */
	443	uprv_memset(&norm, 0, sizeof(Norm));
	444
	445	/*
	446	* The combiningIndex must not be initialized to 0 because 0 is the
	447	* combiningIndex of the first forward-combining character.
	448	*/
	449	norm.combiningIndex=0xffff;
	450
	451	/* get the character code, field 0 */
	452	code=(uint32_t)uprv_strtoul(fields[0][0], &end, 16);
	453	if(end<=fields[0][0] \|\| end!=fields[0][1]) {
	454	fprintf(stderr, "gennorm: syntax error in field 0 at %s\n", fields[0][0]);
	455	*pErrorCode=U_PARSE_ERROR;
	456	exit(U_PARSE_ERROR);
	457	}
	458
	459	/* get canonical combining class, field 3 */
	460	value=(uint32_t)uprv_strtoul(fields[3][0], &end, 10);
	461	if(end<=fields[3][0] \|\| end!=fields[3][1] \|\| value>0xff) {
	462	fprintf(stderr, "gennorm: syntax error in field 3 at %s\n", fields[0][0]);
	463	*pErrorCode=U_PARSE_ERROR;
	464	exit(U_PARSE_ERROR);
	465	}
	466	if(value>0) {
	467	norm.udataCC=(uint8_t)value;
	468	something=TRUE;
	469	}
	470
	471	/* get the decomposition, field 5 */
	472	if(fields[5][0]<fields[5][1]) {
	473	if(*(s=fields[5][0])=='<') {
	474	++s;
	475	isCompat=TRUE;
	476
	477	/* skip and ignore the compatibility type name */
	478	do {
	479	if(s==fields[5][1]) {
	480	/* missing '>' */
	481	fprintf(stderr, "gennorm: syntax error in field 5 at %s\n", fields[0][0]);
	482	*pErrorCode=U_PARSE_ERROR;
	483	exit(U_PARSE_ERROR);
	484	}
	485	} while(*s++!='>');
	486	} else {
	487	isCompat=FALSE;
	488	}
	489
	490	/* parse the decomposition string */
	491	length=u_parseCodePoints(s, decomp, sizeof(decomp)/4, pErrorCode);
	492	if(U_FAILURE(*pErrorCode)) {
	493	fprintf(stderr, "gennorm error parsing UnicodeData.txt decomposition of U+%04lx - %s\n",
	494	(long)code, u_errorName(*pErrorCode));
	495	exit(*pErrorCode);
	496	}
	497
	498	/* store the string */
	499	if(length>0) {
	500	something=TRUE;
	501	if(isCompat) {
	502	norm.lenNFKD=(uint8_t)length;
	503	norm.nfkd=decomp;
	504	} else {
	505	if(length>2) {
	506	fprintf(stderr, "gennorm: error - length of NFD(U+%04lx) = %ld >2 in UnicodeData - illegal\n",
	507	(long)code, (long)length);
	508	*pErrorCode=U_PARSE_ERROR;
	509	exit(U_PARSE_ERROR);
	510	}
	511	norm.lenNFD=(uint8_t)length;
	512	norm.nfd=decomp;
	513	}
	514	}
	515	}
	516
	517	/* check for non-character code points */
	518	if((code&0xfffe)==0xfffe \|\| (uint32_t)(code-0xfdd0)<0x20 \|\| code>0x10ffff) {
	519	fprintf(stderr, "gennorm: error - properties for non-character code point U+%04lx\n",
	520	(long)code);
	521	*pErrorCode=U_PARSE_ERROR;
	522	exit(U_PARSE_ERROR);
	523	}
	524
	525	if(something) {
	526	/* there are normalization values, so store them */
	527	#if 0
	528	if(beVerbose) {
	529	printf("store values for U+%04lx: cc=%d, lenNFD=%ld, lenNFKD=%ld\n",
	530	(long)code, norm.udataCC, (long)norm.lenNFD, (long)norm.lenNFKD);
	531	}
	532	#endif
	533	storeNorm(code, &norm);
	534	}
	535	}
	536
	537	static void
	538	parseDB(const char filename, UErrorCode pErrorCode) {
	539	char *fields[15][2];
	540
	541	if(pErrorCode==NULL \|\| U_FAILURE(*pErrorCode)) {
	542	return;
	543	}
	544
	545	u_parseDelimitedFile(filename, ';', fields, 15, unicodeDataLineFn, NULL, pErrorCode);
	546	if(U_FAILURE(*pErrorCode)) {
	547	fprintf(stderr, "gennorm error: u_parseDelimitedFile(\"%s\") failed - %s\n", filename, u_errorName(*pErrorCode));
	548	exit(*pErrorCode);
	549	}
	550	}
	551
	552	#endif /* #if !UCONFIG_NO_NORMALIZATION */
	553
	554	/*
	555	* Hey, Emacs, please set the following:
	556	*
	557	* Local Variables:
	558	* indent-tabs-mode: nil
	559	* End:
	560	*
	561	*/