git.saurik.com Git - apple/icu.git/blame_incremental - icuSources/tools/genprops/genprops.c

... / ...

Commit	Line	Data
	1	/*
	2	*******************************************************************************
	3	*
	4	* Copyright (C) 1999-2005, International Business Machines
	5	* Corporation and others. All Rights Reserved.
	6	*
	7	*******************************************************************************
	8	* file name: genprops.c
	9	* encoding: US-ASCII
	10	* tab size: 8 (not used)
	11	* indentation:4
	12	*
	13	* created on: 1999dec08
	14	* created by: Markus W. Scherer
	15	*
	16	* This program reads several of the Unicode character database text files,
	17	* parses them, and extracts most of the properties for each character.
	18	* It then writes a binary file containing the properties
	19	* that is designed to be used directly for random-access to
	20	* the properties of each Unicode character.
	21	*/
	22
	23	#include <stdio.h>
	24	#include <stdlib.h>
	25	#include "unicode/utypes.h"
	26	#include "unicode/uchar.h"
	27	#include "unicode/putil.h"
	28	#include "unicode/uclean.h"
	29	#include "cmemory.h"
	30	#include "cstring.h"
	31	#include "unewdata.h"
	32	#include "uoptions.h"
	33	#include "uparse.h"
	34	#include "uprops.h"
	35	#include "propsvec.h"
	36
	37	U_CDECL_BEGIN
	38	#include "genprops.h"
	39	U_CDECL_END
	40
	41	#define LENGTHOF(array) (sizeof(array)/sizeof((array)[0]))
	42
	43	UBool beVerbose=FALSE, haveCopyright=TRUE;
	44
	45	/* prototypes --------------------------------------------------------------- */
	46
	47	static void
	48	parseDB(const char filename, UErrorCode pErrorCode);
	49
	50	/* -------------------------------------------------------------------------- */
	51
	/*
	 * Indexes into the options[] table that follows.
	 * The enumerators must stay in the same order as the table entries.
	 */
	enum {
	    HELP_H,
	    HELP_QUESTION_MARK,
	    VERBOSE,
	    COPYRIGHT,
	    DESTDIR,
	    SOURCEDIR,
	    UNICODE_VERSION,
	    ICUDATADIR,
	    CSOURCE
	};
	64
	65	/* Keep these values in sync with the above enums */
	66	static UOption options[]={
	67	UOPTION_HELP_H,
	68	UOPTION_HELP_QUESTION_MARK,
	69	UOPTION_VERBOSE,
	70	UOPTION_COPYRIGHT,
	71	UOPTION_DESTDIR,
	72	UOPTION_SOURCEDIR,
	73	UOPTION_DEF("unicode", 'u', UOPT_REQUIRES_ARG),
	74	UOPTION_ICUDATADIR,
	75	UOPTION_DEF("csource", 'C', UOPT_NO_ARG)
	76	};
	77
	78	extern int
	79	main(int argc, char* argv[]) {
	80	char filename[300];
	81	const char srcDir=NULL, destDir=NULL, *suffix=NULL;
	82	char *basename=NULL;
	83	UErrorCode errorCode=U_ZERO_ERROR;
	84
	85	U_MAIN_INIT_ARGS(argc, argv);
	86
	87	/* preset then read command line options */
	88	options[DESTDIR].value=u_getDataDirectory();
	89	options[SOURCEDIR].value="";
	90	options[UNICODE_VERSION].value="";
	91	options[ICUDATADIR].value=u_getDataDirectory();
	92	argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), options);
	93
	94	/* error handling, printing usage message */
	95	if(argc<0) {
	96	fprintf(stderr,
	97	"error in command line argument \"%s\"\n",
	98	argv[-argc]);
	99	}
	100	if(argc<0 \|\| options[HELP_H].doesOccur \|\| options[HELP_QUESTION_MARK].doesOccur) {
	101	/*
	102	* Broken into chucks because the C89 standard says the minimum
	103	* required supported string length is 509 bytes.
	104	*/
	105	fprintf(stderr,
	106	"Usage: %s [-options] [suffix]\n"
	107	"\n"
	108	"read the UnicodeData.txt file and other Unicode properties files and\n"
	109	"create a binary file " DATA_NAME "." DATA_TYPE " with the character properties\n"
	110	"\n",
	111	argv[0]);
	112	fprintf(stderr,
	113	"Options:\n"
	114	"\t-h or -? or --help this usage text\n"
	115	"\t-v or --verbose verbose output\n"
	116	"\t-c or --copyright include a copyright notice\n"
	117	"\t-u or --unicode Unicode version, followed by the version like 3.0.0\n"
	118	"\t-C or --csource generate a .c source file rather than the .icu binary\n");
	119	fprintf(stderr,
	120	"\t-d or --destdir destination directory, followed by the path\n"
	121	"\t-s or --sourcedir source directory, followed by the path\n"
	122	"\t-i or --icudatadir directory for locating any needed intermediate data files,\n"
	123	"\t followed by path, defaults to %s\n"
	124	"\tsuffix suffix that is to be appended with a '-'\n"
	125	"\t to the source file basenames before opening;\n"
	126	"\t 'genprops new' will read UnicodeData-new.txt etc.\n",
	127	u_getDataDirectory());
	128	return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR;
	129	}
	130
	131	/* get the options values */
	132	beVerbose=options[VERBOSE].doesOccur;
	133	haveCopyright=options[COPYRIGHT].doesOccur;
	134	srcDir=options[SOURCEDIR].value;
	135	destDir=options[DESTDIR].value;
	136
	137	if(argc>=2) {
	138	suffix=argv[1];
	139	} else {
	140	suffix=NULL;
	141	}
	142
	143	if(options[UNICODE_VERSION].doesOccur) {
	144	setUnicodeVersion(options[UNICODE_VERSION].value);
	145	}
	146	/* else use the default dataVersion in store.c */
	147
	148	if (options[ICUDATADIR].doesOccur) {
	149	u_setDataDirectory(options[ICUDATADIR].value);
	150	}
	151
	152	/* prepare the filename beginning with the source dir */
	153	uprv_strcpy(filename, srcDir);
	154	basename=filename+uprv_strlen(filename);
	155	if(basename>filename && *(basename-1)!=U_FILE_SEP_CHAR) {
	156	*basename++=U_FILE_SEP_CHAR;
	157	}
	158
	159	/* initialize */
	160	initStore();
	161
	162	/* process UnicodeData.txt */
	163	writeUCDFilename(basename, "UnicodeData", suffix);
	164	parseDB(filename, &errorCode);
	165
	166	/* process additional properties files */
	167	*basename=0;
	168	generateAdditionalProperties(filename, suffix, &errorCode);
	169
	170	/* process parsed data */
	171	if(U_SUCCESS(errorCode)) {
	172	/* write the properties data file */
	173	generateData(destDir, options[CSOURCE].doesOccur);
	174	}
	175
	176	exitStore();
	177	u_cleanup();
	178	return errorCode;
	179	}
	180
	181	U_CFUNC void
	182	writeUCDFilename(char basename, const char filename, const char *suffix) {
	183	int32_t length=(int32_t)uprv_strlen(filename);
	184	uprv_strcpy(basename, filename);
	185	if(suffix!=NULL) {
	186	basename[length++]='-';
	187	uprv_strcpy(basename+length, suffix);
	188	length+=(int32_t)uprv_strlen(suffix);
	189	}
	190	uprv_strcpy(basename+length, ".txt");
	191	}
	192
	193	U_CFUNC UBool
	194	isToken(const char token, const char s) {
	195	const char *z;
	196	int32_t j;
	197
	198	s=u_skipWhitespace(s);
	199	for(j=0;; ++j) {
	200	if(token[j]!=0) {
	201	if(s[j]!=token[j]) {
	202	break;
	203	}
	204	} else {
	205	z=u_skipWhitespace(s+j);
	206	if(z==';' \|\| z==0) {
	207	return TRUE;
	208	} else {
	209	break;
	210	}
	211	}
	212	}
	213
	214	return FALSE;
	215	}
	216
	217	U_CFUNC int32_t
	218	getTokenIndex(const char const tokens[], int32_t countTokens, const char s) {
	219	const char t, z;
	220	int32_t i, j;
	221
	222	s=u_skipWhitespace(s);
	223	for(i=0; i<countTokens; ++i) {
	224	t=tokens[i];
	225	if(t!=NULL) {
	226	for(j=0;; ++j) {
	227	if(t[j]!=0) {
	228	if(s[j]!=t[j]) {
	229	break;
	230	}
	231	} else {
	232	z=u_skipWhitespace(s+j);
	233	if(z==';' \|\| z==0 \|\| z=='#' \|\| z=='\r' \|\| *z=='\n') {
	234	return i;
	235	} else {
	236	break;
	237	}
	238	}
	239	}
	240	}
	241	}
	242	return -1;
	243	}
	244
	245	/* parser for UnicodeData.txt ----------------------------------------------- */
	246
	247	/* general categories */
	248	const char *const
	249	genCategoryNames[U_CHAR_CATEGORY_COUNT]={
	250	"Cn",
	251	"Lu", "Ll", "Lt", "Lm", "Lo", "Mn", "Me",
	252	"Mc", "Nd", "Nl", "No",
	253	"Zs", "Zl", "Zp",
	254	"Cc", "Cf", "Co", "Cs",
	255	"Pd", "Ps", "Pe", "Pc", "Po",
	256	"Sm", "Sc", "Sk", "So",
	257	"Pi", "Pf"
	258	};
	259
	260	const char *const
	261	decompositionTypeNames[U_DT_COUNT]={
	262	NULL,
	263	NULL,
	264	"compat",
	265	"circle",
	266	"final",
	267	"font",
	268	"fraction",
	269	"initial",
	270	"isolated",
	271	"medial",
	272	"narrow",
	273	"noBreak",
	274	"small",
	275	"square",
	276	"sub",
	277	"super",
	278	"vertical",
	279	"wide"
	280	};
	281
	282	static struct {
	283	uint32_t first, last, props;
	284	char name[80];
	285	} unicodeAreas[32];
	286
	287	static int32_t unicodeAreaIndex=0;
	288
	289	static void U_CALLCONV
	290	unicodeDataLineFn(void *context,
	291	char *fields[][2], int32_t fieldCount,
	292	UErrorCode *pErrorCode) {
	293	Props p;
	294	char *end;
	295	static uint32_t prevCode=0;
	296	uint32_t value;
	297	int32_t i;
	298
	299	/* reset the properties */
	300	uprv_memset(&p, 0, sizeof(Props));
	301
	302	/* get the character code, field 0 */
	303	p.code=(uint32_t)uprv_strtoul(fields[0][0], &end, 16);
	304	if(end<=fields[0][0] \|\| end!=fields[0][1]) {
	305	fprintf(stderr, "genprops: syntax error in field 0 at %s\n", fields[0][0]);
	306	*pErrorCode=U_PARSE_ERROR;
	307	exit(U_PARSE_ERROR);
	308	}
	309
	310	/* get general category, field 2 */
	311	i=getTokenIndex(genCategoryNames, U_CHAR_CATEGORY_COUNT, fields[2][0]);
	312	if(i>=0) {
	313	p.generalCategory=(uint8_t)i;
	314	} else {
	315	fprintf(stderr, "genprops: unknown general category \"%s\" at code 0x%lx\n",
	316	fields[2][0], (unsigned long)p.code);
	317	*pErrorCode=U_PARSE_ERROR;
	318	exit(U_PARSE_ERROR);
	319	}
	320
	321	/* get decomposition type, field 5 */
	322	if(fields[5][0]<fields[5][1]) {
	323	/* there is some decomposition */
	324	if(*fields[5][0]!='<') {
	325	/* canonical */
	326	i=U_DT_CANONICAL;
	327	} else {
	328	/* get compatibility type */
	329	end=fields[5][0]+1;
	330	while(end<fields[5][1] && *end!='>') {
	331	++end;
	332	}
	333	*end='#';
	334	i=getTokenIndex(decompositionTypeNames, U_DT_COUNT, fields[5][0]+1);
	335	if(i<0) {
	336	fprintf(stderr, "genprops: unknown decomposition type \"%s\" at code 0x%lx\n",
	337	fields[5][0], (unsigned long)p.code);
	338	*pErrorCode=U_PARSE_ERROR;
	339	exit(U_PARSE_ERROR);
	340	}
	341	}
	342	if(!upvec_setValue(pv, p.code, p.code+1, 2, (uint32_t)i, UPROPS_DT_MASK, pErrorCode)) {
	343	fprintf(stderr, "genprops error: unable to set decomposition type: %s\n", u_errorName(*pErrorCode));
	344	exit(*pErrorCode);
	345	}
	346	}
	347
	348	/* decimal digit value, field 6 */
	349	if(fields[6][0]<fields[6][1]) {
	350	value=(uint32_t)uprv_strtoul(fields[6][0], &end, 10);
	351	if(end!=fields[6][1] \|\| value>0x7fff) {
	352	fprintf(stderr, "genprops: syntax error in field 6 at code 0x%lx\n",
	353	(unsigned long)p.code);
	354	*pErrorCode=U_PARSE_ERROR;
	355	exit(U_PARSE_ERROR);
	356	}
	357	p.numericValue=(int32_t)value;
	358	p.numericType=1;
	359	}
	360
	361	/* digit value, field 7 */
	362	if(fields[7][0]<fields[7][1]) {
	363	value=(uint32_t)uprv_strtoul(fields[7][0], &end, 10);
	364	if(end!=fields[7][1] \|\| value>0x7fff) {
	365	fprintf(stderr, "genprops: syntax error in field 7 at code 0x%lx\n",
	366	(unsigned long)p.code);
	367	*pErrorCode=U_PARSE_ERROR;
	368	exit(U_PARSE_ERROR);
	369	}
	370	if(p.numericType==0) {
	371	p.numericValue=(int32_t)value;
	372	p.numericType=2;
	373	} else if((int32_t)value!=p.numericValue) {
	374	fprintf(stderr, "genprops error: numeric values in fields 6 & 7 different at code 0x%lx\n",
	375	(unsigned long)p.code);
	376	*pErrorCode=U_PARSE_ERROR;
	377	exit(U_PARSE_ERROR);
	378	}
	379	}
	380
	381	/* numeric value, field 8 */
	382	if(fields[8][0]<fields[8][1]) {
	383	char *s=fields[8][0];
	384	UBool isNegative;
	385
	386	/* get a possible minus sign */
	387	if(*s=='-') {
	388	isNegative=TRUE;
	389	++s;
	390	} else {
	391	isNegative=FALSE;
	392	}
	393
	394	value=(uint32_t)uprv_strtoul(s, &end, 10);
	395	if(value>0 && *end=='/') {
	396	/* field 8 may contain a fractional value, get the denominator */
	397	if(p.numericType>0) {
	398	fprintf(stderr, "genprops error: numeric values in fields 6..8 different at code 0x%lx\n",
	399	(unsigned long)p.code);
	400	*pErrorCode=U_PARSE_ERROR;
	401	exit(U_PARSE_ERROR);
	402	}
	403
	404	p.denominator=(uint32_t)uprv_strtoul(end+1, &end, 10);
	405	if(p.denominator==0) {
	406	fprintf(stderr, "genprops: denominator is 0 in field 8 at code 0x%lx\n",
	407	(unsigned long)p.code);
	408	*pErrorCode=U_PARSE_ERROR;
	409	exit(U_PARSE_ERROR);
	410	}
	411	}
	412	if(end!=fields[8][1] \|\| value>0x7fffffff) {
	413	fprintf(stderr, "genprops: syntax error in field 8 at code 0x%lx\n",
	414	(unsigned long)p.code);
	415	*pErrorCode=U_PARSE_ERROR;
	416	exit(U_PARSE_ERROR);
	417	}
	418
	419	if(p.numericType==0) {
	420	if(isNegative) {
	421	p.numericValue=-(int32_t)value;
	422	} else {
	423	p.numericValue=(int32_t)value;
	424	}
	425	p.numericType=3;
	426	} else if((int32_t)value!=p.numericValue) {
	427	fprintf(stderr, "genprops error: numeric values in fields 6..8 different at code 0x%lx\n",
	428	(unsigned long)p.code);
	429	*pErrorCode=U_PARSE_ERROR;
	430	exit(U_PARSE_ERROR);
	431	}
	432	}
	433
	434	value=makeProps(&p);
	435
	436	if(*fields[1][0]=='<') {
	437	/* first or last entry of a Unicode area */
	438	size_t length=fields[1][1]-fields[1][0];
	439
	440	if(length<9) {
	441	/* name too short for an area name */
	442	} else if(0==uprv_memcmp(", First>", fields[1][1]-8, 8)) {
	443	/* set the current area */
	444	if(unicodeAreas[unicodeAreaIndex].first==0xffffffff) {
	445	length-=9;
	446	unicodeAreas[unicodeAreaIndex].first=p.code;
	447	unicodeAreas[unicodeAreaIndex].props=value;
	448	uprv_memcpy(unicodeAreas[unicodeAreaIndex].name, fields[1][0]+1, length);
	449	unicodeAreas[unicodeAreaIndex].name[length]=0;
	450	} else {
	451	/* error: a previous area is incomplete */
	452	fprintf(stderr, "genprops: error - area \"%s\" is incomplete\n", unicodeAreas[unicodeAreaIndex].name);
	453	*pErrorCode=U_PARSE_ERROR;
	454	exit(U_PARSE_ERROR);
	455	}
	456	return;
	457	} else if(0==uprv_memcmp(", Last>", fields[1][1]-7, 7)) {
	458	/* check that the current area matches, and complete it with the last code point */
	459	length-=8;
	460	if( unicodeAreas[unicodeAreaIndex].props==value &&
	461	0==uprv_memcmp(unicodeAreas[unicodeAreaIndex].name, fields[1][0]+1, length) &&
	462	unicodeAreas[unicodeAreaIndex].name[length]==0 &&
	463	unicodeAreas[unicodeAreaIndex].first<p.code
	464	) {
	465	unicodeAreas[unicodeAreaIndex].last=p.code;
	466	if(beVerbose) {
	467	printf("Unicode area U+%04lx..U+%04lx \"%s\"\n",
	468	(unsigned long)unicodeAreas[unicodeAreaIndex].first,
	469	(unsigned long)unicodeAreas[unicodeAreaIndex].last,
	470	unicodeAreas[unicodeAreaIndex].name);
	471	}
	472	unicodeAreas[++unicodeAreaIndex].first=0xffffffff;
	473	} else {
	474	/* error: different properties between first & last, different area name, first>=last */
	475	fprintf(stderr, "genprops: error - Last of area \"%s\" is incorrect\n", unicodeAreas[unicodeAreaIndex].name);
	476	*pErrorCode=U_PARSE_ERROR;
	477	exit(U_PARSE_ERROR);
	478	}
	479	return;
	480	} else {
	481	/* not an area name */
	482	}
	483	}
	484
	485	/* check for non-character code points */
	486	if((p.code&0xfffe)==0xfffe \|\| (uint32_t)(p.code-0xfdd0)<0x20) {
	487	fprintf(stderr, "genprops: error - properties for non-character code point U+%04lx\n",
	488	(unsigned long)p.code);
	489	*pErrorCode=U_PARSE_ERROR;
	490	exit(U_PARSE_ERROR);
	491	}
	492
	493	/* check that the code points (p.code) are in ascending order */
	494	if(p.code<=prevCode && p.code>0) {
	495	fprintf(stderr, "genprops: error - UnicodeData entries out of order, U+%04lx after U+%04lx\n",
	496	(unsigned long)p.code, (unsigned long)prevCode);
	497	*pErrorCode=U_PARSE_ERROR;
	498	exit(U_PARSE_ERROR);
	499	}
	500	prevCode=p.code;
	501
	502	/* properties for a single code point */
	503	addProps(p.code, value);
	504	}
	505
	506	/* set repeated properties for the areas */
	507	static void
	508	repeatAreaProps() {
	509	uint32_t puaProps;
	510	int32_t i;
	511	UBool hasPlane15PUA, hasPlane16PUA;
	512	UErrorCode errorCode;
	513
	514	/*
	515	* UnicodeData.txt before 3.0.1 did not contain the PUAs on
	516	* planes 15 and 16.
	517	* If that is the case, then we add them here, using the properties
	518	* from the BMP PUA.
	519	*/
	520	puaProps=0;
	521	hasPlane15PUA=hasPlane16PUA=FALSE;
	522
	523	for(i=0; i<unicodeAreaIndex; ++i) {
	524	repeatProps(unicodeAreas[i].first,
	525	unicodeAreas[i].last,
	526	unicodeAreas[i].props);
	527	if(unicodeAreas[i].first==0xe000) {
	528	puaProps=unicodeAreas[i].props;
	529	} else if(unicodeAreas[i].first==0xf0000) {
	530	hasPlane15PUA=TRUE;
	531	} else if(unicodeAreas[i].first==0x100000) {
	532	hasPlane16PUA=TRUE;
	533	}
	534	}
	535
	536	if(puaProps!=0) {
	537	if(!hasPlane15PUA) {
	538	repeatProps(0xf0000, 0xffffd, puaProps);
	539	}
	540	if(!hasPlane16PUA) {
	541	repeatProps(0x100000, 0x10fffd, puaProps);
	542	}
	543	}
	544
	545	/* Hangul have canonical decompositions */
	546	errorCode=U_ZERO_ERROR;
	547	if(!upvec_setValue(pv, 0xac00, 0xd7a4, 2, (uint32_t)U_DT_CANONICAL, UPROPS_DT_MASK, &errorCode)) {
	548	fprintf(stderr, "genprops error: unable to set decomposition type: %s\n", u_errorName(errorCode));
	549	exit(errorCode);
	550	}
	551	}
	552
	553	static void
	554	parseDB(const char filename, UErrorCode pErrorCode) {
	555	char *fields[15][2];
	556
	557	if(pErrorCode==NULL \|\| U_FAILURE(*pErrorCode)) {
	558	return;
	559	}
	560
	561	/* while unicodeAreas[unicodeAreaIndex] is unused, set its first to a bogus value */
	562	unicodeAreas[0].first=0xffffffff;
	563
	564	u_parseDelimitedFile(filename, ';', fields, 15, unicodeDataLineFn, NULL, pErrorCode);
	565
	566	if(unicodeAreas[unicodeAreaIndex].first!=0xffffffff) {
	567	fprintf(stderr, "genprops: error - the last area \"%s\" from U+%04lx is incomplete\n",
	568	unicodeAreas[unicodeAreaIndex].name,
	569	(unsigned long)unicodeAreas[unicodeAreaIndex].first);
	570	*pErrorCode=U_PARSE_ERROR;
	571	exit(U_PARSE_ERROR);
	572	}
	573
	574	repeatAreaProps();
	575
	576	if(U_FAILURE(*pErrorCode)) {
	577	return;
	578	}
	579	}
	580
	581	/*
	582	* Hey, Emacs, please set the following:
	583	*
	584	* Local Variables:
	585	* indent-tabs-mode: nil
	586	* End:
	587	*
	588	*/