git.saurik.com Git - apple/icu.git/blame_incremental - icuSources/tools/genctd/genctd.cpp

... / ...

Commit	Line	Data
	1	/*
	2	**********************************************************************
	3	* Copyright (C) 2002-2009, International Business Machines
	4	* Corporation and others. All Rights Reserved.
	5	**********************************************************************
	6	*
	7	* File genctd.cpp
	8	*/
	9
	10	//--------------------------------------------------------------------
	11	//
	12	// Tool for generating CompactTrieDictionary data files (.ctd files).
	13	//
	14	// Usage: genctd [options] -o output-file.ctd input-file
	15	//
	16	// options: -v verbose
	17	// -? or -h help
	18	//
	19	// The input file is a plain text file containing words, one per line.
	20	// Words end at the first whitespace; lines beginning with whitespace
	21	// are ignored.
	22	// The file can be encoded as UTF-8, or UTF-16 (either endian), or
	23	// in the default code page (platform dependent). UTF-encoded
	24	// files must include a BOM.
	25	//
	26	//--------------------------------------------------------------------
	27
	28	#include "unicode/utypes.h"
	29	#include "unicode/uchar.h"
	30	#include "unicode/ucnv.h"
	31	#include "unicode/uniset.h"
	32	#include "unicode/unistr.h"
	33	#include "unicode/uclean.h"
	34	#include "unicode/udata.h"
	35	#include "unicode/putil.h"
	36
	37	#include "uoptions.h"
	38	#include "unewdata.h"
	39	#include "ucmndata.h"
	40	#include "rbbidata.h"
	41	#include "triedict.h"
	42	#include "cmemory.h"
	43
	44	#include <stdio.h>
	45	#include <stdlib.h>
	46	#include <string.h>
	47
	48	U_NAMESPACE_USE
	49
	50	static char *progName; /* set to argv[0] in main(); printed by usageAndDie() */
	51	static UOption options[]={ /* main() reads entries by position, so keep order and indices in sync */
	52	UOPTION_HELP_H, /* 0: help; main() calls usageAndDie(0) when it occurs */
	53	UOPTION_HELP_QUESTION_MARK, /* 1: help; handled together with entry 0 */
	54	UOPTION_VERBOSE, /* 2: verbose; parsed, but main() in this file never reads it */
	55	{ "out", NULL, NULL, NULL, 'o', UOPT_REQUIRES_ARG, 0 }, /* 3: output file name; main() requires it and uses its value */
	56	UOPTION_ICUDATADIR, /* 4: value is passed to u_setDataDirectory() */
	57	UOPTION_DESTDIR, /* 5: value becomes the output directory given to udata_create() */
	58	UOPTION_COPYRIGHT, /* 6: when present, U_COPYRIGHT_STRING is written to the output */
	59	};
	60
	61	void usageAndDie(int retCode) {
	62	printf("Usage: %s [-v] [-options] -o output-file dictionary-file\n", progName);
	63	printf("\tRead in word list and write out compact trie dictionary\n"
	64	"options:\n"
	65	"\t-h or -? or --help this usage text\n"
	66	"\t-V or --version show a version message\n"
	67	"\t-c or --copyright include a copyright notice\n"
	68	"\t-v or --verbose turn on verbose output\n"
	69	"\t-i or --icudatadir directory for locating any needed intermediate data files,\n"
	70	"\t followed by path, defaults to %s\n"
	71	"\t-d or --destdir destination directory, followed by the path\n",
	72	u_getDataDirectory());
	73	exit (retCode);
	74	}
	75
	76
	77	#if UCONFIG_NO_BREAK_ITERATION \|\| UCONFIG_NO_FILE_IO
	78
	79	/* dummy UDataInfo cf. udata.h */
	80	static UDataInfo dummyDataInfo = {
	81	sizeof(UDataInfo),
	82	0,
	83
	84	U_IS_BIG_ENDIAN,
	85	U_CHARSET_FAMILY,
	86	U_SIZEOF_UCHAR,
	87	0,
	88
	89	{ 0, 0, 0, 0 }, /* dummy dataFormat */
	90	{ 0, 0, 0, 0 }, /* dummy formatVersion */
	91	{ 0, 0, 0, 0 } /* dummy dataVersion */
	92	};
	93
	94	#else
	95
	96	//
	97	// Set up the ICU data header, defined in ucmndata.h
	98	//
	99	DataHeader dh ={
	100	{sizeof(DataHeader), // Struct MappedData
	101	0xda,
	102	0x27},
	103
	104	{ // struct UDataInfo
	105	sizeof(UDataInfo), // size
	106	0, // reserved
	107	U_IS_BIG_ENDIAN,
	108	U_CHARSET_FAMILY,
	109	U_SIZEOF_UCHAR,
	110	0, // reserved
	111
	112	{ 0x54, 0x72, 0x44, 0x63 }, // "TrDc" Trie Dictionary
	113	{ 1, 0, 0, 0 }, // 1.0.0.0
	114	{ 0, 0, 0, 0 }, // Irrelevant for this data type
	115	}};
	116
	117	#endif
	118
	119	//----------------------------------------------------------------------------
	120	//
	121	// main for genctd
	122	//
	123	//----------------------------------------------------------------------------
	124	int main(int argc, char **argv) {
	125	UErrorCode status = U_ZERO_ERROR;
	126	const char *wordFileName;
	127	const char *outFileName;
	128	const char *outDir = NULL;
	129	const char *copyright = NULL;
	130
	131	//
	132	// Pick up and check the command line arguments,
	133	// using the standard ICU tool utils option handling.
	134	//
	135	U_MAIN_INIT_ARGS(argc, argv);
	136	progName = argv[0];
	137	argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), options);
	138	if(argc<0) {
	139	// Unrecognized option
	140	fprintf(stderr, "error in command line argument \"%s\"\n", argv[-argc]);
	141	usageAndDie(U_ILLEGAL_ARGUMENT_ERROR);
	142	}
	143
	144	if(options[0].doesOccur \|\| options[1].doesOccur) {
	145	// -? or -h for help.
	146	usageAndDie(0);
	147	}
	148
	149	if (!options[3].doesOccur \|\| argc < 2) {
	150	fprintf(stderr, "input and output file must both be specified.\n");
	151	usageAndDie(U_ILLEGAL_ARGUMENT_ERROR);
	152	}
	153	outFileName = options[3].value;
	154	wordFileName = argv[1];
	155
	156	if (options[4].doesOccur) {
	157	u_setDataDirectory(options[4].value);
	158	}
	159
	160	status = U_ZERO_ERROR;
	161
	162	/* Combine the directory with the file name */
	163	if(options[5].doesOccur) {
	164	outDir = options[5].value;
	165	}
	166	if (options[6].doesOccur) {
	167	copyright = U_COPYRIGHT_STRING;
	168	}
	169
	170	#if UCONFIG_NO_BREAK_ITERATION \|\| UCONFIG_NO_FILE_IO
	171
	172	UNewDataMemory *pData;
	173	char msg[1024];
	174
	175	/* write message with just the name */
	176	sprintf(msg, "genctd writes dummy %s because of UCONFIG_NO_BREAK_ITERATION and/or UCONFIG_NO_FILE_IO, see uconfig.h", outFileName);
	177	fprintf(stderr, "%s\n", msg);
	178
	179	/* write the dummy data file */
	180	pData = udata_create(outDir, NULL, outFileName, &dummyDataInfo, NULL, &status);
	181	udata_writeBlock(pData, msg, strlen(msg));
	182	udata_finish(pData, &status);
	183	return (int)status;
	184
	185	#else
	186	/* Initialize ICU */
	187	u_init(&status);
	188	if (U_FAILURE(status)) {
	189	fprintf(stderr, "%s: can not initialize ICU. status = %s\n",
	190	argv[0], u_errorName(status));
	191	exit(1);
	192	}
	193	status = U_ZERO_ERROR;
	194
	195	//
	196	// Read in the dictionary source file
	197	//
	198	long result;
	199	long wordFileSize;
	200	FILE *file;
	201	char *wordBufferC;
	202
	203	file = fopen(wordFileName, "rb");
	204	if( file == 0 ) {
	205	fprintf(stderr, "Could not open file \"%s\"\n", wordFileName);
	206	exit(-1);
	207	}
	208	fseek(file, 0, SEEK_END);
	209	wordFileSize = ftell(file);
	210	fseek(file, 0, SEEK_SET);
	211	wordBufferC = new char[wordFileSize+10];
	212
	213	result = (long)fread(wordBufferC, 1, wordFileSize, file);
	214	if (result != wordFileSize) {
	215	fprintf(stderr, "Error reading file \"%s\"\n", wordFileName);
	216	exit (-1);
	217	}
	218	wordBufferC[wordFileSize]=0;
	219	fclose(file);
	220
	221	//
	222	// Look for a Unicode Signature (BOM) on the word file
	223	//
	224	int32_t signatureLength;
	225	const char * wordSourceC = wordBufferC;
	226	const char* encoding = ucnv_detectUnicodeSignature(
	227	wordSourceC, wordFileSize, &signatureLength, &status);
	228	if (U_FAILURE(status)) {
	229	exit(status);
	230	}
	231	if(encoding!=NULL ){
	232	wordSourceC += signatureLength;
	233	wordFileSize -= signatureLength;
	234	}
	235
	236	//
	237	// Open a converter to take the rule file to UTF-16
	238	//
	239	UConverter* conv;
	240	conv = ucnv_open(encoding, &status);
	241	if (U_FAILURE(status)) {
	242	fprintf(stderr, "ucnv_open: ICU Error \"%s\"\n", u_errorName(status));
	243	exit(status);
	244	}
	245
	246	//
	247	// Convert the words to UChar.
	248	// Preflight first to determine required buffer size.
	249	//
	250	uint32_t destCap = ucnv_toUChars(conv,
	251	NULL, // dest,
	252	0, // destCapacity,
	253	wordSourceC,
	254	wordFileSize,
	255	&status);
	256	if (status != U_BUFFER_OVERFLOW_ERROR) {
	257	fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
	258	exit(status);
	259	};
	260
	261	status = U_ZERO_ERROR;
	262	UChar *wordSourceU = new UChar[destCap+1];
	263	ucnv_toUChars(conv,
	264	wordSourceU, // dest,
	265	destCap+1,
	266	wordSourceC,
	267	wordFileSize,
	268	&status);
	269	if (U_FAILURE(status)) {
	270	fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
	271	exit(status);
	272	};
	273	ucnv_close(conv);
	274
	275	// Get rid of the original file buffer
	276	delete[] wordBufferC;
	277
	278	// Create a MutableTrieDictionary, and loop through all the lines, inserting
	279	// words.
	280
	281	// First, pick a median character.
	282	UChar *current = wordSourceU + (destCap/2);
	283	UChar uc = *current++;
	284	UnicodeSet breaks;
	285	breaks.add(0x000A); // Line Feed
	286	breaks.add(0x000D); // Carriage Return
	287	breaks.add(0x2028); // Line Separator
	288	breaks.add(0x2029); // Paragraph Separator
	289
	290	do {
	291	// Look for line break
	292	while (uc && !breaks.contains(uc)) {
	293	uc = *current++;
	294	}
	295	// Now skip to first non-line-break
	296	while (uc && breaks.contains(uc)) {
	297	uc = *current++;
	298	}
	299	}
	300	while (uc && (breaks.contains(uc) \|\| u_isspace(uc)));
	301
	302	MutableTrieDictionary *mtd = new MutableTrieDictionary(uc, status);
	303
	304	if (U_FAILURE(status)) {
	305	fprintf(stderr, "new MutableTrieDictionary: ICU Error \"%s\"\n", u_errorName(status));
	306	exit(status);
	307	}
	308
	309	// Now add the words. Words are non-space characters at the beginning of
	310	// lines, and must be at least one UChar.
	311	current = wordSourceU;
	312	UChar *candidate = current;
	313	uc = *current++;
	314	int32_t length = 0;
	315
	316	while (uc) {
	317	while (uc && !u_isspace(uc)) {
	318	++length;
	319	uc = *current++;
	320	}
	321	if (length > 0) {
	322	mtd->addWord(candidate, length, status);
	323	if (U_FAILURE(status)) {
	324	fprintf(stderr, "MutableTrieDictionary::addWord: ICU Error \"%s\"\n",
	325	u_errorName(status));
	326	exit(status);
	327	}
	328	}
	329	// Find beginning of next line
	330	while (uc && !breaks.contains(uc)) {
	331	uc = *current++;
	332	}
	333	while (uc && breaks.contains(uc)) {
	334	uc = *current++;
	335	}
	336	candidate = current-1;
	337	length = 0;
	338	}
	339
	340	// Get rid of the Unicode text buffer
	341	delete[] wordSourceU;
	342
	343	// Now, create a CompactTrieDictionary from the mutable dictionary
	344	CompactTrieDictionary ctd = new CompactTrieDictionary(mtd, status);
	345	if (U_FAILURE(status)) {
	346	fprintf(stderr, "new CompactTrieDictionary: ICU Error \"%s\"\n", u_errorName(status));
	347	exit(status);
	348	}
	349
	350	// Get rid of the MutableTrieDictionary
	351	delete mtd;
	352
	353	//
	354	// Get the binary data from the dictionary.
	355	//
	356	uint32_t outDataSize = ctd->dataSize();
	357	const uint8_t outData = (const uint8_t )ctd->data();
	358
	359	//
	360	// Create the output file
	361	//
	362	size_t bytesWritten;
	363	UNewDataMemory *pData;
	364	pData = udata_create(outDir, NULL, outFileName, &(dh.info), copyright, &status);
	365	if(U_FAILURE(status)) {
	366	fprintf(stderr, "genctd: Could not open output file \"%s\", \"%s\"\n",
	367	outFileName, u_errorName(status));
	368	exit(status);
	369	}
	370
	371
	372	// Write the data itself.
	373	udata_writeBlock(pData, outData, outDataSize);
	374	// finish up
	375	bytesWritten = udata_finish(pData, &status);
	376	if(U_FAILURE(status)) {
	377	fprintf(stderr, "genctd: error \"%s\" writing the output file\n", u_errorName(status));
	378	exit(status);
	379	}
	380
	381	if (bytesWritten != outDataSize) {
	382	fprintf(stderr, "Error writing to output file \"%s\"\n", outFileName);
	383	exit(-1);
	384	}
	385
	386	// Get rid of the CompactTrieDictionary
	387	delete ctd;
	388
	389	u_cleanup();
	390
	391	printf("genctd: tool completed successfully.\n");
	392	return 0;
	393
	394	#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
	395	}
	396