[apple/icu.git] / icuSources / tools / gendict / gendict.cpp

/*
**********************************************************************
*   Copyright (C) 2002-2013, International Business Machines
*   Corporation and others.  All Rights Reserved.
**********************************************************************
*
* File gendict.cpp
*/

#include "unicode/utypes.h"
#include "unicode/uchar.h"
#include "unicode/ucnv.h"
#include "unicode/uniset.h"
#include "unicode/unistr.h"
#include "unicode/uclean.h"
#include "unicode/udata.h"
#include "unicode/putil.h"
#include "unicode/ucharstriebuilder.h"
#include "unicode/bytestriebuilder.h"
#include "unicode/ucharstrie.h"
#include "unicode/bytestrie.h"
#include "unicode/ucnv.h"
#include "unicode/utf16.h"

#include "charstr.h"
#include "dictionarydata.h"
#include "uoptions.h"
#include "unewdata.h"
#include "cmemory.h"
#include "uassert.h"
#include "ucbuf.h"
#include "toolutil.h"
#include "cstring.h"

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "putilimp.h"
UDate startTime;

static int elapsedTime() {
  return (int)uprv_floor((uprv_getRawUTCtime()-startTime)/1000.0);
}

#if U_PLATFORM_IMPLEMENTS_POSIX && !U_PLATFORM_HAS_WIN32_API

#include <signal.h>
#include <unistd.h>

// Names printed in the periodic progress notice; install_watchdog()
// replaces both placeholders with the real values.
const char *wToolname="gendict";
const char *wOutname="(some file)";

const int firstSeconds = 5; /* seconds before the first notice */
const int nextSeconds = 15; /* seconds between the following notices */

static void alarm_fn(int /*n*/);

// Registers alarm_fn as the SIGALRM handler and schedules the next SIGALRM
// to arrive after the given number of seconds.
static void armWatchdog(int seconds) {
  signal(SIGALRM, alarm_fn);
  alarm(seconds);
}

// SIGALRM handler: prints a "still writing" progress line with the elapsed
// time, then re-arms itself for the next notice.
// NOTE(review): printf() is called from a signal handler here; it is not on
// the POSIX list of async-signal-safe functions.
static void alarm_fn(int /*n*/) {
  printf("%s: still writing\t%s (%ds)\t...\n",    wToolname, wOutname, elapsedTime());

  armWatchdog(nextSeconds); // reset the alarm
}

// Remembers the tool and output file names for the progress notice and
// starts the watchdog timer.
static void install_watchdog(const char *toolName, const char *outFileName) {
  wToolname=toolName;
  wOutname=outFileName;

  armWatchdog(firstSeconds); // set the alarm
}

#else
// No watchdog on platforms without POSIX signals/alarm().
static void install_watchdog(const char*, const char*) {
  // not implemented
}
#endif


U_NAMESPACE_USE

// Program name as invoked; main() sets it from argv[0] and it is printed in
// the usage text and the final progress message.
static char *progName;
// Command-line option table handed to u_parseArgs().
// main() indexes this array with the `arguments` enum, so the order of the
// entries here must stay in sync with that enum (index shown per entry).
// The last three entries put '\1' in the short-name slot instead of a
// letter -- presumably meaning "long form only"; confirm in uoptions.h.
static UOption options[]={
    UOPTION_HELP_H,             /* 0: ARG_HELP */
    UOPTION_HELP_QUESTION_MARK, /* 1: ARG_QMARK */
    UOPTION_VERBOSE,            /* 2: ARG_VERBOSE */
    UOPTION_ICUDATADIR,         /* 3: ARG_ICUDATADIR */
    UOPTION_COPYRIGHT,          /* 4: ARG_COPYRIGHT */
    { "uchars", NULL, NULL, NULL, '\1', UOPT_NO_ARG, 0}, /* 5: ARG_UCHARS */
    { "bytes", NULL, NULL, NULL, '\1', UOPT_NO_ARG, 0}, /* 6: ARG_BYTES */
    { "transform", NULL, NULL, NULL, '\1', UOPT_REQUIRES_ARG, 0}, /* 7: ARG_TRANSFORM */
};

// Indexes into the options[] table; the enumerators must be listed in the
// same order as the entries of that table.
enum arguments {
    ARG_HELP = 0,       // -h / --help
    ARG_QMARK,          // -?
    ARG_VERBOSE,        // verbose progress output
    ARG_ICUDATADIR,     // value is passed to u_setDataDirectory()
    ARG_COPYRIGHT,      // include U_COPYRIGHT_STRING in the output file
    ARG_UCHARS,         // --uchars: build a UCharsTrie
    ARG_BYTES,          // --bytes: build a BytesTrie
    ARG_TRANSFORM       // --transform <spec>: required with --bytes
};

// Prints the standard usage message describing the command line arguments,
// then terminates the process with retCode as the exit code.
// The text goes to stdout when retCode is a success code (explicit help
// request) and to stderr otherwise.
// Only options that are actually present in the options[] table are listed;
// --uchars and --bytes have no single-letter form.
static void usageAndDie(UErrorCode retCode) {
    FILE *out = U_SUCCESS(retCode) ? stdout : stderr;
    fprintf(out, "Usage: %s -trietype [-options] input-dictionary-file output-file\n", progName);
    fprintf(out,
           "\tRead in a word list and write out a string trie dictionary\n"
           "options:\n"
           "\t-h or -? or --help  this usage text\n"
           "\t-c or --copyright   include a copyright notice\n"
           "\t-v or --verbose     turn on verbose output\n"
           "\t-i or --icudatadir  directory for locating any needed intermediate data files,\n" // TODO: figure out if we need this option
           "\t                    followed by path, defaults to %s\n"
           "\t--uchars            output a UCharsTrie (mutually exclusive with --bytes!)\n"
           "\t--bytes             output a BytesTrie (mutually exclusive with --uchars!)\n"
           "\t--transform         the kind of transform to use (eg --transform offset-40A3,\n"
           "\t                    which specifies an offset transform with constant 0x40A3)\n",
            u_getDataDirectory());
    exit(retCode);
}


/* UDataInfo cf. udata.h */
// Header descriptor that main() passes to udata_create() for the output file.
// Field names in the comments below follow the UDataInfo declaration.
static UDataInfo dataInfo = {
    sizeof(UDataInfo),              /* size */
    0,                              /* reservedWord */

    U_IS_BIG_ENDIAN,                /* isBigEndian */
    U_CHARSET_FAMILY,               /* charsetFamily */
    U_SIZEOF_UCHAR,                 /* sizeofUChar */
    0,                              /* reservedByte */

    { 0x44, 0x69, 0x63, 0x74 },     /* dataFormat: "Dict" */
    { 1, 0, 0, 0 },                 /* format version */
    { 0, 0, 0, 0 }                  /* data version */
};

#if !UCONFIG_NO_BREAK_ITERATION

// A wrapper for both BytesTrieBuilder and UCharsTrieBuilder.
// may want to put this somewhere in ICU, as it could be useful outside
// of this tool?
//
// Exactly one of the two builders is allocated, chosen by the constructor's
// isBytesTrie argument; the other pointer stays NULL. For a bytes trie every
// code point of a word is mapped to a single byte by the configured transform
// before it is added.
class DataDict {
private:
    BytesTrieBuilder *bt;        // non-NULL only when building a BytesTrie
    UCharsTrieBuilder *ut;       // non-NULL only when building a UCharsTrie
    UChar32 transformConstant;   // offset subtracted by the offset transform
    int32_t transformType;       // one of the DictionaryData::TRANSFORM_* values

    // The destructor deletes the builders, so a copy would delete them twice.
    // Copying is therefore disabled: declared private and never defined.
    DataDict(const DataDict &other);
    DataDict &operator=(const DataDict &other);
public:
    // constructs a new data dictionary. if there is an error, 
    // it will be returned in status
    // isBytesTrie != 0 will produce a BytesTrieBuilder,
    // isBytesTrie == 0 will produce a UCharsTrieBuilder
    // If the builder cannot be allocated, status is set to
    // U_MEMORY_ALLOCATION_ERROR (ICU allocation can yield NULL instead of
    // throwing), so callers never proceed with two NULL builders.
    DataDict(UBool isBytesTrie, UErrorCode &status) : bt(NULL), ut(NULL), 
        transformConstant(0), transformType(DictionaryData::TRANSFORM_NONE) {
        if (isBytesTrie) {
            bt = new BytesTrieBuilder(status);
            if (bt == NULL && U_SUCCESS(status)) {
                status = U_MEMORY_ALLOCATION_ERROR;
            }
        } else {
            ut = new UCharsTrieBuilder(status);
            if (ut == NULL && U_SUCCESS(status)) {
                status = U_MEMORY_ALLOCATION_ERROR;
            }
        }
    }

    ~DataDict() {
        delete bt;
        delete ut;
    }

private:
    // Maps one code point to the byte stored in the bytes trie.
    // Offset transform: U+200D -> 0xFF, U+200C -> 0xFE, everything else must
    // lie in [transformConstant, transformConstant + 0xFD] and maps to its
    // distance from transformConstant; anything outside that range terminates
    // the program. Any other transform type sets status to
    // U_INTERNAL_PROGRAM_ERROR.
    char transform(UChar32 c, UErrorCode &status) {
        if (transformType == DictionaryData::TRANSFORM_TYPE_OFFSET) {
            if (c == 0x200D) { return (char)0xFF; }
            else if (c == 0x200C) { return (char)0xFE; }
            int32_t delta = c - transformConstant;
            if (delta < 0 || 0xFD < delta) {
                fprintf(stderr, "Codepoint U+%04lx out of range for --transform offset-%04lx!\n",
                        (long)c, (long)transformConstant);
                exit(U_ILLEGAL_ARGUMENT_ERROR); // TODO: should return and print the line number
            }
            return (char)delta;
        } else { // no such transform type 
            status = U_INTERNAL_PROGRAM_ERROR;
            return (char)c; // it should be noted this transform type will not generally work
        }
    }

    // Transforms word code point by code point and appends the resulting
    // bytes to buf. The index advances by U16_LENGTH(c) so that a
    // supplementary code point (two UTF-16 units) is consumed as one step.
    void transform(const UnicodeString &word, CharString &buf, UErrorCode &errorCode) {
        UChar32 c = 0;
        int32_t len = word.length();
        for (int32_t i = 0; i < len; i += U16_LENGTH(c)) {
            c = word.char32At(i);
            buf.append(transform(c, errorCode), errorCode);
        }
    }

public:
    // sets the desired transformation data.
    // should be populated from a command line argument
    // so far the only acceptable format is offset-<hex constant>
    // eventually others (mask-<hex constant>?) may be enabled
    // more complex functions may be more difficult
    // A malformed or out-of-range specification prints an error plus the
    // usage text and terminates the program.
    void setTransform(const char *t) {
        if (strncmp(t, "offset-", 7) == 0) {
            char *end;
            unsigned long base = uprv_strtoul(t + 7, &end, 16);
            // Reject: no digits at all, trailing garbage, or a constant too large.
            if (end == (t + 7) || *end != 0 || base > 0x10FF80) {
                fprintf(stderr, "Syntax for offset value in --transform offset-%s invalid!\n", t + 7);
                usageAndDie(U_ILLEGAL_ARGUMENT_ERROR);
            }
            transformType = DictionaryData::TRANSFORM_TYPE_OFFSET;
            transformConstant = (UChar32)base;
        }
        else {
            fprintf(stderr, "Invalid transform specified: %s\n", t);
            usageAndDie(U_ILLEGAL_ARGUMENT_ERROR);
        }
    }

    // add a word to the trie
    // For a bytes trie the word is first converted with the transform; for a
    // uchars trie it is added unchanged. Errors are reported through status.
    void addWord(const UnicodeString &word, int32_t value, UErrorCode &status) {
        if (bt) {
            CharString buf;
            transform(word, buf, status);
            bt->add(buf.toStringPiece(), value, status);
        }
        if (ut) { ut->add(word, value, status); }
    }

    // if we are a bytestrie, give back the StringPiece representing the serialized version of us
    // Precondition: this object was constructed with isBytesTrie != 0.
    StringPiece serializeBytes(UErrorCode &status) {
        return bt->buildStringPiece(USTRINGTRIE_BUILD_SMALL, status);
    }

    // if we are a ucharstrie, produce the UnicodeString representing the serialized version of us
    // Precondition: this object was constructed with isBytesTrie == 0.
    void serializeUChars(UnicodeString &s, UErrorCode &status) {
        ut->buildUnicodeString(USTRINGTRIE_BUILD_SMALL, s, status);
    }

    // Returns the transform type and the transform constant OR-ed together
    // into one value; main() stores it in the output file's index array.
    int32_t getTransform() {
        return (int32_t)(transformType | transformConstant); 
    }
};
#endif

static const UChar LINEFEED_CHARACTER = 0x000A;
static const UChar CARRIAGE_RETURN_CHARACTER = 0x000D;

// Reads the next line from f and stores it in fileLine after cutting it at
// the first '#' (if any) and removing trailing CR/LF and whitespace.
// Returns FALSE when ucbuf_readline() yields no line or errorCode reports a
// failure, TRUE otherwise (fileLine may then be empty).
// NOTE(review): fileLine is set with the setTo(FALSE, ...) form, which
// presumably aliases the reader's buffer instead of copying it -- treat the
// contents as valid only until the next read; confirm in unistr.h.
static UBool readLine(UCHARBUF *f, UnicodeString &fileLine, IcuToolErrorCode &errorCode) {
    int32_t remaining;
    const UChar *text = ucbuf_readline(f, &remaining, errorCode);
    if(text == NULL) { return FALSE; }
    if(errorCode.isFailure()) { return FALSE; }

    const UChar *hashMark = u_memchr(text, 0x23, remaining);  // '#'
    if(hashMark == NULL) {
        // No comment on this line: drop the line terminator(s) at the end.
        for(; remaining > 0; --remaining) {
            const UChar last = text[remaining - 1];
            if(last != CARRIAGE_RETURN_CHARACTER && last != LINEFEED_CHARACTER) { break; }
        }
    } else {
        // Keep only what precedes the comment marker.
        remaining = (int32_t)(hashMark - text);
    }
    // Trim whatever whitespace is left at the end.
    for(; remaining > 0 && u_isspace(text[remaining - 1]); --remaining) {}

    fileLine.setTo(FALSE, text, remaining);
    return TRUE;
}

//----------------------------------------------------------------------------
//
//  main      for gendict
//
//----------------------------------------------------------------------------
// Entry point: parses the command line, reads the word list, builds the
// requested trie, and writes it to the output file as an ICU data file.
//
// Input lines have the form "word [whitespace value]"; '#' starts a comment.
// All malformed lines are reported before the tool gives up, and no output
// file is written if any line was bad.
// Returns 0 on success; every failure path terminates the process with a
// non-zero UErrorCode as the exit code.
int  main(int argc, char **argv) {
    //
    // Pick up and check the command line arguments,
    //    using the standard ICU tool utils option handling.
    //
    U_MAIN_INIT_ARGS(argc, argv);
    progName = argv[0];
    argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), options);
    if(argc<0) {
        // Unrecognized option
        fprintf(stderr, "error in command line argument \"%s\"\n", argv[-argc]);
        usageAndDie(U_ILLEGAL_ARGUMENT_ERROR);
    }

    if(options[ARG_HELP].doesOccur || options[ARG_QMARK].doesOccur) {
        //  -? or -h for help.
        usageAndDie(U_ZERO_ERROR);
    }

    UBool verbose = options[ARG_VERBOSE].doesOccur;

    if (argc < 3) {
        fprintf(stderr, "input and output file must both be specified.\n");
        usageAndDie(U_ILLEGAL_ARGUMENT_ERROR);
    }
    const char *outFileName  = argv[2];
    const char *wordFileName = argv[1];

    startTime = uprv_getRawUTCtime(); // initialize start timer
    // set up the watchdog
    install_watchdog(progName, outFileName);

    if (options[ARG_ICUDATADIR].doesOccur) {
        u_setDataDirectory(options[ARG_ICUDATADIR].value);
    }

    const char *copyright = NULL;
    if (options[ARG_COPYRIGHT].doesOccur) {
        copyright = U_COPYRIGHT_STRING;
    }

    // Exactly one of --uchars / --bytes must be given.
    if (options[ARG_UCHARS].doesOccur == options[ARG_BYTES].doesOccur) {
        fprintf(stderr, "you must specify exactly one type of trie to output!\n");
        usageAndDie(U_ILLEGAL_ARGUMENT_ERROR);
    }
    UBool isBytesTrie = options[ARG_BYTES].doesOccur;
    // --transform is required with --bytes and forbidden with --uchars.
    if (isBytesTrie != options[ARG_TRANSFORM].doesOccur) {
        fprintf(stderr, "you must provide a transformation for a bytes trie, and must not provide one for a uchars trie!\n");
        usageAndDie(U_ILLEGAL_ARGUMENT_ERROR);
    }

    IcuToolErrorCode status("gendict/main()");

#if UCONFIG_NO_BREAK_ITERATION || UCONFIG_NO_FILE_IO
    const char* outDir=NULL;

    UNewDataMemory *pData;
    char msg[1024];
    UErrorCode tempstatus = U_ZERO_ERROR;

    /* write message with just the name */
    // The fixed text is 100 characters long; the %.900s precision caps the
    // file name so the formatted message always fits into msg.
    sprintf(msg, "gendict writes dummy %.900s because of UCONFIG_NO_BREAK_ITERATION and/or UCONFIG_NO_FILE_IO, see uconfig.h", outFileName);
    fprintf(stderr, "%s\n", msg);

    /* write the dummy data file */
    pData = udata_create(outDir, NULL, outFileName, &dataInfo, NULL, &tempstatus);
    udata_writeBlock(pData, msg, (int32_t)strlen(msg));
    udata_finish(pData, &tempstatus);
    return (int)tempstatus;

#else
    //  Read in the dictionary source file
    if (verbose) { printf("Opening file %s...\n", wordFileName); }
    const char *codepage = "UTF-8";
    UCHARBUF *f = ucbuf_open(wordFileName, &codepage, TRUE, FALSE, status);
    if (status.isFailure()) {
        fprintf(stderr, "error opening input file: ICU Error \"%s\"\n", status.errorName());
        exit(status.reset());
    }
    if (verbose) { printf("Initializing dictionary builder of type %s...\n", (isBytesTrie ? "BytesTrie" : "UCharsTrie")); }
    DataDict dict(isBytesTrie, status);
    if (status.isFailure()) {
        fprintf(stderr, "new DataDict: ICU Error \"%s\"\n", status.errorName());
        exit(status.reset());
    }
    if (options[ARG_TRANSFORM].doesOccur) {
        dict.setTransform(options[ARG_TRANSFORM].value);
    }

    UnicodeString fileLine;
    if (verbose) { puts("Adding words to dictionary..."); }
    UBool hasValues = FALSE;
    UBool hasValuelessContents = FALSE;
    int lineCount = 0;
    int wordCount = 0;
    int minlen = 255;
    int maxlen = 0;
    UBool isOk = TRUE;   // cleared by any malformed line; reading continues to report them all
    while (readLine(f, fileLine, status)) {
        lineCount++;
        if (fileLine.isEmpty()) continue;
        
        // Parse word [spaces value].
        int32_t keyLen;
        for (keyLen = 0; keyLen < fileLine.length() && !u_isspace(fileLine[keyLen]); ++keyLen) {}
        if (keyLen == 0) {
            fprintf(stderr, "Error: no word on line %i!\n", lineCount);
            isOk = FALSE;
            continue;
        }
        int32_t valueStart;
        for (valueStart = keyLen;
            valueStart < fileLine.length() && u_isspace(fileLine[valueStart]);
            ++valueStart) {}

        if (keyLen < valueStart) {
            // Something follows the word: it must be a numeric value.
            int32_t valueLength = fileLine.length() - valueStart;
            if (valueLength > 15) {
                fprintf(stderr, "Error: value too long on line %i!\n", lineCount);
                isOk = FALSE;
                continue;
            }
            char s[16];
            fileLine.extract(valueStart, valueLength, s, 16, US_INV);
            char *end;
            unsigned long value = uprv_strtoul(s, &end, 0);
            // Reject empty or partially-numeric text, embedded NULs, and values beyond 32 bits.
            if (end == s || *end != 0 || (int32_t)uprv_strlen(s) != valueLength || value > 0xffffffff) {
                fprintf(stderr, "Error: value syntax error or value too large on line %i!\n", lineCount);
                isOk = FALSE;
                continue;
            }
            dict.addWord(fileLine.tempSubString(0, keyLen), (int32_t)value, status);
            hasValues = TRUE;
            wordCount++;
            if (keyLen < minlen) minlen = keyLen;
            if (keyLen > maxlen) maxlen = keyLen;
        } else {
            dict.addWord(fileLine.tempSubString(0, keyLen), 0, status);
            hasValuelessContents = TRUE;
            wordCount++;
            if (keyLen < minlen) minlen = keyLen;
            if (keyLen > maxlen) maxlen = keyLen;
        }

        if (status.isFailure()) {
            fprintf(stderr, "ICU Error \"%s\": Failed to add word to trie at input line %d in input file\n",
                status.errorName(), lineCount);
            exit(status.reset());
        }
    }
    // The input reader is no longer needed; release it (fileLine is not used past this point).
    ucbuf_close(f);
    if (verbose) { printf("Processed %d lines, added %d words, minlen %d, maxlen %d\n", lineCount, wordCount, minlen, maxlen); }

    // readLine() also stops on a read error; report it as such instead of
    // letting it surface later as a serialization failure.
    if (status.isFailure()) {
        fprintf(stderr, "gendict: error \"%s\" reading the input file \"%s\"\n", status.errorName(), wordFileName);
        exit(status.reset());
    }
    if (hasValues && hasValuelessContents) {
        fprintf(stderr, "warning: file contained both valued and unvalued strings!\n");
    }
    // Malformed lines were already reported individually; do not write any output.
    if (!isOk) {
        fprintf(stderr, "gendict: errors in input file \"%s\", no output written\n", wordFileName);
        exit(U_ILLEGAL_ARGUMENT_ERROR);
    }

    if (verbose) { printf("Serializing data...isBytesTrie? %d\n", isBytesTrie); }
    int32_t outDataSize;
    const void *outData;
    UnicodeString usp;   // declared here so the uchars buffer outlives the write below
    if (isBytesTrie) {
        StringPiece sp = dict.serializeBytes(status);
        outDataSize = sp.size();
        outData = sp.data();
    } else {
        dict.serializeUChars(usp, status);
        outDataSize = usp.length() * U_SIZEOF_UCHAR;
        outData = usp.getBuffer();
    }
    if (status.isFailure()) {
        fprintf(stderr, "gendict: got failure of type %s while serializing, if U_ILLEGAL_ARGUMENT_ERROR possibly due to duplicate dictionary entries\n", status.errorName());
        exit(status.reset());
    }
    if (verbose) { puts("Opening output file..."); }
    UNewDataMemory *pData = udata_create(NULL, NULL, outFileName, &dataInfo, copyright, status);
    if (status.isFailure()) {
        fprintf(stderr, "gendict: could not open output file \"%s\", \"%s\"\n", outFileName, status.errorName());
        exit(status.reset());
    }

    if (verbose) { puts("Writing to output file..."); }
    // The first index holds the size of the index array itself, which is
    // where the trie data starts; the remaining slots are filled in below.
    int32_t indexes[DictionaryData::IX_COUNT] = {
        DictionaryData::IX_COUNT * sizeof(int32_t), 0, 0, 0, 0, 0, 0, 0
    };
    int32_t size = outDataSize + indexes[DictionaryData::IX_STRING_TRIE_OFFSET];
    indexes[DictionaryData::IX_RESERVED1_OFFSET] = size;
    indexes[DictionaryData::IX_RESERVED2_OFFSET] = size;
    indexes[DictionaryData::IX_TOTAL_SIZE] = size;

    indexes[DictionaryData::IX_TRIE_TYPE] = isBytesTrie ? DictionaryData::TRIE_TYPE_BYTES : DictionaryData::TRIE_TYPE_UCHARS;
    if (hasValues) {
        indexes[DictionaryData::IX_TRIE_TYPE] |= DictionaryData::TRIE_HAS_VALUES;
    }

    indexes[DictionaryData::IX_TRANSFORM] = dict.getTransform();
    udata_writeBlock(pData, indexes, sizeof(indexes));
    udata_writeBlock(pData, outData, outDataSize);
    size_t bytesWritten = udata_finish(pData, status);
    if (status.isFailure()) {
        fprintf(stderr, "gendict: error \"%s\" writing the output file\n", status.errorName());
        exit(status.reset());
    }

    // The two blocks written above must add up to the size recorded in the indexes.
    if (bytesWritten != (size_t)size) {
        fprintf(stderr, "Error writing to output file \"%s\"\n", outFileName);
        exit(U_INTERNAL_PROGRAM_ERROR);
    }

    printf("%s: done writing\t%s (%ds).\n", progName, outFileName, elapsedTime());

#ifdef TEST_GENDICT
    // Debug aid: dump every entry of the trie that was just serialized.
    if (isBytesTrie) {
        BytesTrie::Iterator it(outData, outDataSize, status);
        while (it.hasNext()) {
            it.next(status);
            const StringPiece s = it.getString();
            int32_t val = it.getValue();
            printf("%s -> %i\n", s.data(), val);
        }
    } else {
        UCharsTrie::Iterator it((const UChar *)outData, outDataSize, status);
        while (it.hasNext()) {
            it.next(status);
            const UnicodeString s = it.getString();
            int32_t val = it.getValue();
            char tmp[1024];
            s.extract(0, s.length(), tmp, 1024);
            printf("%s -> %i\n", tmp, val);
        }
    }
#endif

    return 0;
#endif /* #if UCONFIG_NO_BREAK_ITERATION || UCONFIG_NO_FILE_IO */
}
Commit	Line	Data
51004dcb A	1	/*
51004dcb A	2	**********************************************************************
57a6839d	3	* Copyright (C) 2002-2013, International Business Machines
51004dcb A	4	* Corporation and others. All Rights Reserved.
	5	**********************************************************************
	6	*
	7	* File gendict.cpp
	8	*/
	9
	10	#include "unicode/utypes.h"
	11	#include "unicode/uchar.h"
	12	#include "unicode/ucnv.h"
	13	#include "unicode/uniset.h"
	14	#include "unicode/unistr.h"
	15	#include "unicode/uclean.h"
	16	#include "unicode/udata.h"
	17	#include "unicode/putil.h"
	18	#include "unicode/ucharstriebuilder.h"
	19	#include "unicode/bytestriebuilder.h"
	20	#include "unicode/ucharstrie.h"
	21	#include "unicode/bytestrie.h"
	22	#include "unicode/ucnv.h"
	23	#include "unicode/utf16.h"
	24
	25	#include "charstr.h"
	26	#include "dictionarydata.h"
	27	#include "uoptions.h"
	28	#include "unewdata.h"
	29	#include "cmemory.h"
	30	#include "uassert.h"
	31	#include "ucbuf.h"
	32	#include "toolutil.h"
	33	#include "cstring.h"
	34
	35	#include <stdio.h>
	36	#include <stdlib.h>
	37	#include <string.h>
	38
	39	#include "putilimp.h"
57a6839d	40	UDate startTime;
51004dcb A	41
	42	static int elapsedTime() {
	43	return (int)uprv_floor((uprv_getRawUTCtime()-startTime)/1000.0);
	44	}
	45
	46	#if U_PLATFORM_IMPLEMENTS_POSIX && !U_PLATFORM_HAS_WIN32_API
57a6839d	47
51004dcb A	48	#include <signal.h>
	49	#include <unistd.h>
	50
	51	const char *wToolname="gendict";
	52	const char *wOutname="(some file)";
	53
	54	const int firstSeconds = 5; /* seconds between notices*/
	55	const int nextSeconds = 15; /* seconds between notices*/
	56
	57	static void alarm_fn(int /n/) {
	58	printf("%s: still writing\t%s (%ds)\t...\n", wToolname, wOutname, elapsedTime());
	59
	60	signal(SIGALRM, &alarm_fn);
	61	alarm(nextSeconds); // reset the alarm
	62	}
	63
	64	static void install_watchdog(const char toolName, const char outFileName) {
	65	wToolname=toolName;
	66	wOutname=outFileName;
	67
51004dcb A	68	signal(SIGALRM, &alarm_fn);
	69
	70	alarm(firstSeconds); // set the alarm
	71	}
	72
	73	#else
	74	static void install_watchdog(const char, const char) {
	75	// not implemented
	76	}
	77	#endif
	78
	79
	80
	81
	82	U_NAMESPACE_USE
	83
	84	static char *progName;
	85	static UOption options[]={
	86	UOPTION_HELP_H, /* 0 */
	87	UOPTION_HELP_QUESTION_MARK, /* 1 */
	88	UOPTION_VERBOSE, /* 2 */
	89	UOPTION_ICUDATADIR, /* 4 */
	90	UOPTION_COPYRIGHT, /* 5 */
	91	{ "uchars", NULL, NULL, NULL, '\1', UOPT_NO_ARG, 0}, /* 6 */
	92	{ "bytes", NULL, NULL, NULL, '\1', UOPT_NO_ARG, 0}, /* 7 */
	93	{ "transform", NULL, NULL, NULL, '\1', UOPT_REQUIRES_ARG, 0}, /* 8 */
	94	};
	95
	96	enum arguments {
	97	ARG_HELP = 0,
	98	ARG_QMARK,
	99	ARG_VERBOSE,
	100	ARG_ICUDATADIR,
	101	ARG_COPYRIGHT,
	102	ARG_UCHARS,
	103	ARG_BYTES,
	104	ARG_TRANSFORM
	105	};
	106
	107	// prints out the standard usage method describing command line arguments,
	108	// then bails out with the desired exit code
	109	static void usageAndDie(UErrorCode retCode) {
	110	fprintf((U_SUCCESS(retCode) ? stdout : stderr), "Usage: %s -trietype [-options] input-dictionary-file output-file\n", progName);
	111	fprintf((U_SUCCESS(retCode) ? stdout : stderr),
	112	"\tRead in a word list and write out a string trie dictionary\n"
	113	"options:\n"
	114	"\t-h or -? or --help this usage text\n"
	115	"\t-V or --version show a version message\n"
	116	"\t-c or --copyright include a copyright notice\n"
	117	"\t-v or --verbose turn on verbose output\n"
	118	"\t-i or --icudatadir directory for locating any needed intermediate data files,\n" // TODO: figure out if we need this option
	119	"\t followed by path, defaults to %s\n"
	120	"\t--uchars output a UCharsTrie (mutually exclusive with -b!)\n"
	121	"\t--bytes output a BytesTrie (mutually exclusive with -u!)\n"
	122	"\t--transform the kind of transform to use (eg --transform offset-40A3,\n"
	123	"\t which specifies an offset transform with constant 0x40A3)\n",
	124	u_getDataDirectory());
	125	exit(retCode);
	126	}
	127
	128
	129	/* UDataInfo cf. udata.h */
	130	static UDataInfo dataInfo = {
	131	sizeof(UDataInfo),
132	0,
133
134	U_IS_BIG_ENDIAN,
135	U_CHARSET_FAMILY,
136	U_SIZEOF_UCHAR,
137	0,
138
139	{ 0x44, 0x69, 0x63, 0x74 }, /* "Dict" */
140	{ 1, 0, 0, 0 }, /* format version */
141	{ 0, 0, 0, 0 } /* data version */
142	};
143
144	#if !UCONFIG_NO_BREAK_ITERATION
145
146	// A wrapper for both BytesTrieBuilder and UCharsTrieBuilder.
147	// may want to put this somewhere in ICU, as it could be useful outside
148	// of this tool?
149	class DataDict {
150	private:
151	BytesTrieBuilder *bt;
152	UCharsTrieBuilder *ut;
153	UChar32 transformConstant;
154	int32_t transformType;
155	public:
156	// constructs a new data dictionary. if there is an error,
157	// it will be returned in status
158	// isBytesTrie != 0 will produce a BytesTrieBuilder,
159	// isBytesTrie == 0 will produce a UCharsTrieBuilder
160	DataDict(UBool isBytesTrie, UErrorCode &status) : bt(NULL), ut(NULL),
161	transformConstant(0), transformType(DictionaryData::TRANSFORM_NONE) {
162	if (isBytesTrie) {
163	bt = new BytesTrieBuilder(status);
164	} else {
165	ut = new UCharsTrieBuilder(status);
166	}
167	}
168
169	~DataDict() {
170	delete bt;
171	delete ut;
172	}
173
174	private:
175	char transform(UChar32 c, UErrorCode &status) {
176	if (transformType == DictionaryData::TRANSFORM_TYPE_OFFSET) {
177	if (c == 0x200D) { return (char)0xFF; }
178	else if (c == 0x200C) { return (char)0xFE; }
179	int32_t delta = c - transformConstant;
180	if (delta < 0 \|\| 0xFD < delta) {
181	fprintf(stderr, "Codepoint U+%04lx out of range for --transform offset-%04lx!\n",
182	(long)c, (long)transformConstant);
183	exit(U_ILLEGAL_ARGUMENT_ERROR); // TODO: should return and print the line number
184	}
185	return (char)delta;
186	} else { // no such transform type
187	status = U_INTERNAL_PROGRAM_ERROR;
188	return (char)c; // it should be noted this transform type will not generally work
189	}
190	}
191
192	void transform(const UnicodeString &word, CharString &buf, UErrorCode &errorCode) {
193	UChar32 c = 0;
194	int32_t len = word.length();
195	for (int32_t i = 0; i < len; i += U16_LENGTH(c)) {
196	c = word.char32At(i);
197	buf.append(transform(c, errorCode), errorCode);
198	}
199	}
200
201	public:
202	// sets the desired transformation data.
203	// should be populated from a command line argument
204	// so far the only acceptable format is offset-<hex constant>
205	// eventually others (mask-<hex constant>?) may be enabled
206	// more complex functions may be more difficult
207	void setTransform(const char *t) {
208	if (strncmp(t, "offset-", 7) == 0) {
209	char *end;
210	unsigned long base = uprv_strtoul(t + 7, &end, 16);
211	if (end == (t + 7) \|\| *end != 0 \|\| base > 0x10FF80) {
212	fprintf(stderr, "Syntax for offset value in --transform offset-%s invalid!\n", t + 7);
213	usageAndDie(U_ILLEGAL_ARGUMENT_ERROR);
214	}
215	transformType = DictionaryData::TRANSFORM_TYPE_OFFSET;
216	transformConstant = (UChar32)base;
217	}
218	else {
219	fprintf(stderr, "Invalid transform specified: %s\n", t);
220	usageAndDie(U_ILLEGAL_ARGUMENT_ERROR);
221	}
222	}
223
224	// add a word to the trie
225	void addWord(const UnicodeString &word, int32_t value, UErrorCode &status) {
226	if (bt) {
227	CharString buf;
228	transform(word, buf, status);
229	bt->add(buf.toStringPiece(), value, status);
230	}
231	if (ut) { ut->add(word, value, status); }
232	}
233
234	// if we are a bytestrie, give back the StringPiece representing the serialized version of us
235	StringPiece serializeBytes(UErrorCode &status) {
236	return bt->buildStringPiece(USTRINGTRIE_BUILD_SMALL, status);
237	}
238
239	// if we are a ucharstrie, produce the UnicodeString representing the serialized version of us
240	void serializeUChars(UnicodeString &s, UErrorCode &status) {
241	ut->buildUnicodeString(USTRINGTRIE_BUILD_SMALL, s, status);
242	}
243
244	int32_t getTransform() {
245	return (int32_t)(transformType \| transformConstant);
246	}
247	};
248	#endif
249
250	static const UChar LINEFEED_CHARACTER = 0x000A;
251	static const UChar CARRIAGE_RETURN_CHARACTER = 0x000D;
252
253	static UBool readLine(UCHARBUF *f, UnicodeString &fileLine, IcuToolErrorCode &errorCode) {
254	int32_t lineLength;
255	const UChar *line = ucbuf_readline(f, &lineLength, errorCode);
256	if(line == NULL \|\| errorCode.isFailure()) { return FALSE; }
257	// Strip trailing CR/LF, comments, and spaces.
258	const UChar *comment = u_memchr(line, 0x23, lineLength); // '#'
259	if(comment != NULL) {
260	lineLength = (int32_t)(comment - line);
261	} else {
262	while(lineLength > 0 && (line[lineLength - 1] == CARRIAGE_RETURN_CHARACTER \|\| line[lineLength - 1] == LINEFEED_CHARACTER)) { --lineLength; }
263	}
264	while(lineLength > 0 && u_isspace(line[lineLength - 1])) { --lineLength; }
265	fileLine.setTo(FALSE, line, lineLength);
266	return TRUE;
267	}
268
269	//----------------------------------------------------------------------------
270	//
271	// main for gendict
272	//
273	//----------------------------------------------------------------------------
274	int main(int argc, char **argv) {
275	//
276	// Pick up and check the command line arguments,
277	// using the standard ICU tool utils option handling.
278	//
279	U_MAIN_INIT_ARGS(argc, argv);
280	progName = argv[0];
281	argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), options);
282	if(argc<0) {
283	// Unrecognized option
284	fprintf(stderr, "error in command line argument \"%s\"\n", argv[-argc]);
285	usageAndDie(U_ILLEGAL_ARGUMENT_ERROR);
286	}
287
288	if(options[ARG_HELP].doesOccur \|\| options[ARG_QMARK].doesOccur) {
289	// -? or -h for help.
290	usageAndDie(U_ZERO_ERROR);
291	}
292
293	UBool verbose = options[ARG_VERBOSE].doesOccur;
294
295	if (argc < 3) {
296	fprintf(stderr, "input and output file must both be specified.\n");
297	usageAndDie(U_ILLEGAL_ARGUMENT_ERROR);
298	}
299	const char *outFileName = argv[2];
300	const char *wordFileName = argv[1];
301
57a6839d	302	startTime = uprv_getRawUTCtime(); // initialize start timer
51004dcb A	303	// set up the watchdog
	304	install_watchdog(progName, outFileName);
	305
	306	if (options[ARG_ICUDATADIR].doesOccur) {
	307	u_setDataDirectory(options[ARG_ICUDATADIR].value);
	308	}
	309
	310	const char *copyright = NULL;
	311	if (options[ARG_COPYRIGHT].doesOccur) {
	312	copyright = U_COPYRIGHT_STRING;
	313	}
	314
	315	if (options[ARG_UCHARS].doesOccur == options[ARG_BYTES].doesOccur) {
	316	fprintf(stderr, "you must specify exactly one type of trie to output!\n");
	317	usageAndDie(U_ILLEGAL_ARGUMENT_ERROR);
	318	}
	319	UBool isBytesTrie = options[ARG_BYTES].doesOccur;
	320	if (isBytesTrie != options[ARG_TRANSFORM].doesOccur) {
	321	fprintf(stderr, "you must provide a transformation for a bytes trie, and must not provide one for a uchars trie!\n");
	322	usageAndDie(U_ILLEGAL_ARGUMENT_ERROR);
	323	}
	324
	325	IcuToolErrorCode status("gendict/main()");
	326
	327	#if UCONFIG_NO_BREAK_ITERATION \|\| UCONFIG_NO_FILE_IO
	328	const char* outDir=NULL;
	329
	330	UNewDataMemory *pData;
	331	char msg[1024];
	332	UErrorCode tempstatus = U_ZERO_ERROR;
	333
	334	/* write message with just the name */ // potential for a buffer overflow here...
	335	sprintf(msg, "gendict writes dummy %s because of UCONFIG_NO_BREAK_ITERATION and/or UCONFIG_NO_FILE_IO, see uconfig.h", outFileName);
	336	fprintf(stderr, "%s\n", msg);
	337
	338	/* write the dummy data file */
	339	pData = udata_create(outDir, NULL, outFileName, &dataInfo, NULL, &tempstatus);
	340	udata_writeBlock(pData, msg, strlen(msg));
	341	udata_finish(pData, &tempstatus);
	342	return (int)tempstatus;
	343
	344	#else
	345	// Read in the dictionary source file
	346	if (verbose) { printf("Opening file %s...\n", wordFileName); }
	347	const char *codepage = "UTF-8";
	348	UCHARBUF *f = ucbuf_open(wordFileName, &codepage, TRUE, FALSE, status);
	349	if (status.isFailure()) {
	350	fprintf(stderr, "error opening input file: ICU Error \"%s\"\n", status.errorName());
	351	exit(status.reset());
	352	}
	353	if (verbose) { printf("Initializing dictionary builder of type %s...\n", (isBytesTrie ? "BytesTrie" : "UCharsTrie")); }
	354	DataDict dict(isBytesTrie, status);
	355	if (status.isFailure()) {
	356	fprintf(stderr, "new DataDict: ICU Error \"%s\"\n", status.errorName());
	357	exit(status.reset());
	358	}
	359	if (options[ARG_TRANSFORM].doesOccur) {
	360	dict.setTransform(options[ARG_TRANSFORM].value);
	361	}
	362
	363	UnicodeString fileLine;
	364	if (verbose) { puts("Adding words to dictionary..."); }
	365	UBool hasValues = FALSE;
	366	UBool hasValuelessContents = FALSE;
367	int lineCount = 0;
57a6839d A	368	int wordCount = 0;
	369	int minlen = 255;
	370	int maxlen = 0;
51004dcb A	371	UBool isOk = TRUE;
	372	while (readLine(f, fileLine, status)) {
	373	lineCount++;
	374	if (fileLine.isEmpty()) continue;
	375
	376	// Parse word [spaces value].
	377	int32_t keyLen;
	378	for (keyLen = 0; keyLen < fileLine.length() && !u_isspace(fileLine[keyLen]); ++keyLen) {}
	379	if (keyLen == 0) {
	380	fprintf(stderr, "Error: no word on line %i!\n", lineCount);
	381	isOk = FALSE;
	382	continue;
	383	}
	384	int32_t valueStart;
	385	for (valueStart = keyLen;
	386	valueStart < fileLine.length() && u_isspace(fileLine[valueStart]);
	387	++valueStart) {}
	388
	389	if (keyLen < valueStart) {
	390	int32_t valueLength = fileLine.length() - valueStart;
	391	if (valueLength > 15) {
	392	fprintf(stderr, "Error: value too long on line %i!\n", lineCount);
	393	isOk = FALSE;
	394	continue;
	395	}
	396	char s[16];
	397	fileLine.extract(valueStart, valueLength, s, 16, US_INV);
	398	char *end;
	399	unsigned long value = uprv_strtoul(s, &end, 0);
	400	if (end == s \|\| *end != 0 \|\| (int32_t)uprv_strlen(s) != valueLength \|\| value > 0xffffffff) {
	401	fprintf(stderr, "Error: value syntax error or value too large on line %i!\n", lineCount);
	402	isOk = FALSE;
	403	continue;
	404	}
	405	dict.addWord(fileLine.tempSubString(0, keyLen), (int32_t)value, status);
	406	hasValues = TRUE;
57a6839d A	407	wordCount++;
	408	if (keyLen < minlen) minlen = keyLen;
	409	if (keyLen > maxlen) maxlen = keyLen;
51004dcb A	410	} else {
51004dcb A	411	dict.addWord(fileLine.tempSubString(0, keyLen), 0, status);
57a6839d A	412	hasValuelessContents = TRUE;
	413	wordCount++;
	414	if (keyLen < minlen) minlen = keyLen;
	415	if (keyLen > maxlen) maxlen = keyLen;
51004dcb A	416	}
	417
	418	if (status.isFailure()) {
	419	fprintf(stderr, "ICU Error \"%s\": Failed to add word to trie at input line %d in input file\n",
	420	status.errorName(), lineCount);
	421	exit(status.reset());
	422	}
	423	}
57a6839d	424	if (verbose) { printf("Processed %d lines, added %d words, minlen %d, maxlen %d\n", lineCount, wordCount, minlen, maxlen); }
51004dcb A	425
	426	if (!isOk && status.isSuccess()) {
	427	status.set(U_ILLEGAL_ARGUMENT_ERROR);
	428	}
	429	if (hasValues && hasValuelessContents) {
	430	fprintf(stderr, "warning: file contained both valued and unvalued strings!\n");
	431	}
	432
57a6839d	433	if (verbose) { printf("Serializing data...isBytesTrie? %d\n", isBytesTrie); }
51004dcb A	434	int32_t outDataSize;
	435	const void *outData;
	436	UnicodeString usp;
	437	if (isBytesTrie) {
	438	StringPiece sp = dict.serializeBytes(status);
	439	outDataSize = sp.size();
	440	outData = sp.data();
	441	} else {
	442	dict.serializeUChars(usp, status);
	443	outDataSize = usp.length() * U_SIZEOF_UCHAR;
	444	outData = usp.getBuffer();
	445	}
	446	if (status.isFailure()) {
57a6839d	447	fprintf(stderr, "gendict: got failure of type %s while serializing, if U_ILLEGAL_ARGUMENT_ERROR possibly due to duplicate dictionary entries\n", status.errorName());
51004dcb A	448	exit(status.reset());
	449	}
	450	if (verbose) { puts("Opening output file..."); }
	451	UNewDataMemory *pData = udata_create(NULL, NULL, outFileName, &dataInfo, copyright, status);
	452	if (status.isFailure()) {
	453	fprintf(stderr, "gendict: could not open output file \"%s\", \"%s\"\n", outFileName, status.errorName());
	454	exit(status.reset());
	455	}
	456
	457	if (verbose) { puts("Writing to output file..."); }
	458	int32_t indexes[DictionaryData::IX_COUNT] = {
	459	DictionaryData::IX_COUNT * sizeof(int32_t), 0, 0, 0, 0, 0, 0, 0
	460	};
	461	int32_t size = outDataSize + indexes[DictionaryData::IX_STRING_TRIE_OFFSET];
	462	indexes[DictionaryData::IX_RESERVED1_OFFSET] = size;
	463	indexes[DictionaryData::IX_RESERVED2_OFFSET] = size;
	464	indexes[DictionaryData::IX_TOTAL_SIZE] = size;
	465
	466	indexes[DictionaryData::IX_TRIE_TYPE] = isBytesTrie ? DictionaryData::TRIE_TYPE_BYTES : DictionaryData::TRIE_TYPE_UCHARS;
	467	if (hasValues) {
	468	indexes[DictionaryData::IX_TRIE_TYPE] \|= DictionaryData::TRIE_HAS_VALUES;
	469	}
	470
	471	indexes[DictionaryData::IX_TRANSFORM] = dict.getTransform();
	472	udata_writeBlock(pData, indexes, sizeof(indexes));
	473	udata_writeBlock(pData, outData, outDataSize);
	474	size_t bytesWritten = udata_finish(pData, status);
	475	if (status.isFailure()) {
	476	fprintf(stderr, "gendict: error \"%s\" writing the output file\n", status.errorName());
	477	exit(status.reset());
	478	}
	479
	480	if (bytesWritten != (size_t)size) {
	481	fprintf(stderr, "Error writing to output file \"%s\"\n", outFileName);
	482	exit(U_INTERNAL_PROGRAM_ERROR);
	483	}
	484
	485	printf("%s: done writing\t%s (%ds).\n", progName, outFileName, elapsedTime());
	486
	487	#ifdef TEST_GENDICT
	488	if (isBytesTrie) {
	489	BytesTrie::Iterator it(outData, outDataSize, status);
	490	while (it.hasNext()) {
	491	it.next(status);
	492	const StringPiece s = it.getString();
	493	int32_t val = it.getValue();
	494	printf("%s -> %i\n", s.data(), val);
	495	}
	496	} else {
	497	UCharsTrie::Iterator it((const UChar *)outData, outDataSize, status);
	498	while (it.hasNext()) {
	499	it.next(status);
	500	const UnicodeString s = it.getString();
	501	int32_t val = it.getValue();
	502	char tmp[1024];
	503	s.extract(0, s.length(), tmp, 1024);
	504	printf("%s -> %i\n", tmp, val);
	505	}
	506	}
	507	#endif
	508
	509	return 0;
	510	#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
	511	}