[apple/icu.git] / icuSources / tools / gendict / gendict.cpp

// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
**********************************************************************
*   Copyright (C) 2002-2016, International Business Machines
*   Corporation and others.  All Rights Reserved.
**********************************************************************
*
* File gendict.cpp
*/

#include "unicode/utypes.h"
#include "unicode/uchar.h"
#include "unicode/ucnv.h"
#include "unicode/uniset.h"
#include "unicode/unistr.h"
#include "unicode/uclean.h"
#include "unicode/udata.h"
#include "unicode/putil.h"
#include "unicode/ucharstriebuilder.h"
#include "unicode/bytestriebuilder.h"
#include "unicode/ucharstrie.h"
#include "unicode/bytestrie.h"
#include "unicode/ucnv.h"
#include "unicode/ustring.h"
#include "unicode/utf16.h"

#include "charstr.h"
#include "dictionarydata.h"
#include "uoptions.h"
#include "unewdata.h"
#include "cmemory.h"
#include "uassert.h"
#include "ucbuf.h"
#include "toolutil.h"
#include "cstring.h"

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "putilimp.h"
UDate startTime;

static int elapsedTime() {
  return (int)uprv_floor((uprv_getRawUTCtime()-startTime)/1000.0);
}

U_NAMESPACE_USE

// Name this program was invoked with (argv[0]); used in usage and status output.
static char *progName;

// Command-line options understood by gendict.
// The position of every entry must match the value of the corresponding
// enumerator in `arguments` below, because the table is indexed with them.
static UOption options[]={
    UOPTION_HELP_H,             /* ARG_HELP */
    UOPTION_HELP_QUESTION_MARK, /* ARG_QMARK */
    UOPTION_VERBOSE,            /* ARG_VERBOSE */
    UOPTION_ICUDATADIR,         /* ARG_ICUDATADIR */
    UOPTION_COPYRIGHT,          /* ARG_COPYRIGHT */
    { "uchars", NULL, NULL, NULL, '\1', UOPT_NO_ARG, 0}, /* ARG_UCHARS */
    { "bytes", NULL, NULL, NULL, '\1', UOPT_NO_ARG, 0}, /* ARG_BYTES */
    { "transform", NULL, NULL, NULL, '\1', UOPT_REQUIRES_ARG, 0}, /* ARG_TRANSFORM */
    UOPTION_QUIET,              /* ARG_QUIET */
};

// Indexes into options[]; keep in the same order as the table above.
enum arguments {
    ARG_HELP = 0,
    ARG_QMARK = 1,
    ARG_VERBOSE = 2,
    ARG_ICUDATADIR = 3,
    ARG_COPYRIGHT = 4,
    ARG_UCHARS = 5,
    ARG_BYTES = 6,
    ARG_TRANSFORM = 7,
    ARG_QUIET = 8
};

// Prints the command line usage summary, then terminates the process.
//
// retCode selects both the output stream and the exit status: a success code
// (help was explicitly requested) prints to stdout, a failure code prints to
// stderr. In either case the process exits with retCode and never returns.
//
// The text lists only options that are actually present in options[]; the
// trie-type switches have no short forms, so they are referred to by their
// long names.
static void usageAndDie(UErrorCode retCode) {
    FILE *out = U_SUCCESS(retCode) ? stdout : stderr;
    fprintf(out, "Usage: %s {--uchars|--bytes} [-options] input-dictionary-file output-file\n", progName);
    fprintf(out,
           "\tRead in a word list and write out a string trie dictionary\n"
           "options:\n"
           "\t-h or -? or --help  this usage text\n"
           "\t-c or --copyright   include a copyright notice\n"
           "\t-v or --verbose     turn on verbose output\n"
           "\t-q or --quiet       do not display warnings and progress\n"
           "\t-i or --icudatadir  directory for locating any needed intermediate data files,\n" // TODO: figure out if we need this option
           "\t                    followed by path, defaults to %s\n"
           "\t--uchars            output a UCharsTrie (mutually exclusive with --bytes!)\n"
           "\t--bytes             output a BytesTrie (mutually exclusive with --uchars!)\n"
           "\t--transform         the kind of transform to use (eg --transform offset-40A3,\n"
           "\t                    which specifies an offset transform with constant 0x40A3)\n",
            u_getDataDirectory());
    exit(retCode);
}


/* UDataInfo cf. udata.h */
// Header descriptor handed to udata_create() for the generated dictionary file.
// The initializer is positional, so the order below must follow the member
// order of UDataInfo as declared in udata.h (size, reserved word, endianness,
// charset family, UChar size, reserved byte, then three 4-byte arrays).
static UDataInfo dataInfo = {
    sizeof(UDataInfo),
    0,

    // Platform properties of the machine the tool is built for.
    U_IS_BIG_ENDIAN,
    U_CHARSET_FAMILY,
    U_SIZEOF_UCHAR,
    0,

    // The format tag is spelled as numeric byte values rather than character literals.
    { 0x44, 0x69, 0x63, 0x74 },     /* "Dict" */
    { 1, 0, 0, 0 },                 /* format version */
    { 0, 0, 0, 0 }                  /* data version */
};

#if !UCONFIG_NO_BREAK_ITERATION

// A wrapper for both BytesTrieBuilder and UCharsTrieBuilder.
// may want to put this somewhere in ICU, as it could be useful outside
// of this tool?
class DataDict {
private:
    BytesTrieBuilder *bt;
    UCharsTrieBuilder *ut;
    UChar32 transformConstant;
    int32_t transformType;
public:
    // constructs a new data dictionary. if there is an error, 
    // it will be returned in status
    // isBytesTrie != 0 will produce a BytesTrieBuilder,
    // isBytesTrie == 0 will produce a UCharsTrieBuilder
    DataDict(UBool isBytesTrie, UErrorCode &status) : bt(NULL), ut(NULL), 
        transformConstant(0), transformType(DictionaryData::TRANSFORM_NONE) {
        if (isBytesTrie) {
            bt = new BytesTrieBuilder(status);
        } else {
            ut = new UCharsTrieBuilder(status);
        }
    }

    ~DataDict() {
        delete bt;
        delete ut;
    }

private:
    char transform(UChar32 c, UErrorCode &status) {
        if (transformType == DictionaryData::TRANSFORM_TYPE_OFFSET) {
            if (c == 0x200D) { return (char)0xFF; }
            else if (c == 0x200C) { return (char)0xFE; }
            int32_t delta = c - transformConstant;
            if (delta < 0 || 0xFD < delta) {
                fprintf(stderr, "Codepoint U+%04lx out of range for --transform offset-%04lx!\n",
                        (long)c, (long)transformConstant);
                exit(U_ILLEGAL_ARGUMENT_ERROR); // TODO: should return and print the line number
            }
            return (char)delta;
        } else { // no such transform type 
            status = U_INTERNAL_PROGRAM_ERROR;
            return (char)c; // it should be noted this transform type will not generally work
        }
    }

    void transform(const UnicodeString &word, CharString &buf, UErrorCode &errorCode) {
        UChar32 c = 0;
        int32_t len = word.length();
        for (int32_t i = 0; i < len; i += U16_LENGTH(c)) {
            c = word.char32At(i);
            buf.append(transform(c, errorCode), errorCode);
        }
    }

public:
    // sets the desired transformation data.
    // should be populated from a command line argument
    // so far the only acceptable format is offset-<hex constant>
    // eventually others (mask-<hex constant>?) may be enabled
    // more complex functions may be more difficult
    void setTransform(const char *t) {
        if (strncmp(t, "offset-", 7) == 0) {
            char *end;
            unsigned long base = uprv_strtoul(t + 7, &end, 16);
            if (end == (t + 7) || *end != 0 || base > 0x10FF80) {
                fprintf(stderr, "Syntax for offset value in --transform offset-%s invalid!\n", t + 7);
                usageAndDie(U_ILLEGAL_ARGUMENT_ERROR);
            }
            transformType = DictionaryData::TRANSFORM_TYPE_OFFSET;
            transformConstant = (UChar32)base;
        }
        else {
            fprintf(stderr, "Invalid transform specified: %s\n", t);
            usageAndDie(U_ILLEGAL_ARGUMENT_ERROR);
        }
    }

    // add a word to the trie
    void addWord(const UnicodeString &word, int32_t value, UErrorCode &status) {
        if (bt) {
            CharString buf;
            transform(word, buf, status);
            bt->add(buf.toStringPiece(), value, status);
        }
        if (ut) { ut->add(word, value, status); }
    }

    // if we are a bytestrie, give back the StringPiece representing the serialized version of us
    StringPiece serializeBytes(UErrorCode &status) {
        return bt->buildStringPiece(USTRINGTRIE_BUILD_SMALL, status);
    }

    // if we are a ucharstrie, produce the UnicodeString representing the serialized version of us
    void serializeUChars(UnicodeString &s, UErrorCode &status) {
        ut->buildUnicodeString(USTRINGTRIE_BUILD_SMALL, s, status);
    }

    int32_t getTransform() {
        return (int32_t)(transformType | transformConstant); 
    }
};
#endif

static const UChar LINEFEED_CHARACTER = 0x000A;
static const UChar CARRIAGE_RETURN_CHARACTER = 0x000D;

// Fetches the next line of the dictionary source from f.
//
// Returns FALSE when no further line is available or errorCode reports a
// failure. Otherwise fileLine is set to alias the line's text with any '#'
// comment (or, if there is none, the line terminator) and trailing white
// space removed, and TRUE is returned. fileLine may end up empty.
static UBool readLine(UCHARBUF *f, UnicodeString &fileLine, IcuToolErrorCode &errorCode) {
    int32_t length = 0;
    const UChar *text = ucbuf_readline(f, &length, errorCode);
    if (errorCode.isFailure() || text == NULL) {
        return FALSE;
    }

    const UChar *hash = u_memchr(text, 0x23, length);  // '#'
    if (hash == NULL) {
        // No comment on this line: drop the trailing CR/LF characters.
        for (; length > 0; --length) {
            const UChar last = text[length - 1];
            if (last != CARRIAGE_RETURN_CHARACTER && last != LINEFEED_CHARACTER) {
                break;
            }
        }
    } else {
        // Cut the line right before the comment marker.
        length = (int32_t)(hash - text);
    }

    // Remove whatever white space is left at the end.
    while (length > 0 && u_isspace(text[length - 1])) {
        --length;
    }

    fileLine.setTo(FALSE, text, length);
    return TRUE;
}

//----------------------------------------------------------------------------
//
//  main      for gendict
//
//----------------------------------------------------------------------------
// Entry point: parses the command line, reads the word list, builds the
// requested trie and writes it, preceded by the index array, to the output
// data file.
//
// Returns 0 on success. Every failure path prints a message to stderr and
// terminates the process with a UErrorCode value as exit status.
//
// When break iteration or file I/O is configured out, only a dummy data file
// containing an explanatory message is written.
int  main(int argc, char **argv) {
    //
    // Pick up and check the command line arguments,
    //    using the standard ICU tool utils option handling.
    //
    U_MAIN_INIT_ARGS(argc, argv);
    progName = argv[0];
    argc=u_parseArgs(argc, argv, UPRV_LENGTHOF(options), options);
    if(argc<0) {
        // Unrecognized option
        fprintf(stderr, "error in command line argument \"%s\"\n", argv[-argc]);
        usageAndDie(U_ILLEGAL_ARGUMENT_ERROR);
    }

    if(options[ARG_HELP].doesOccur || options[ARG_QMARK].doesOccur) {
        //  -? or -h for help.
        usageAndDie(U_ZERO_ERROR);
    }

    UBool verbose = options[ARG_VERBOSE].doesOccur;
    UBool quiet = options[ARG_QUIET].doesOccur;

    if (argc < 3) {
        fprintf(stderr, "input and output file must both be specified.\n");
        usageAndDie(U_ILLEGAL_ARGUMENT_ERROR);
    }
    const char *outFileName  = argv[2];
    const char *wordFileName = argv[1];

    startTime = uprv_getRawUTCtime(); // initialize start timer

    if (options[ARG_ICUDATADIR].doesOccur) {
        u_setDataDirectory(options[ARG_ICUDATADIR].value);
    }

    const char *copyright = NULL;
    if (options[ARG_COPYRIGHT].doesOccur) {
        copyright = U_COPYRIGHT_STRING;
    }

    // Exactly one of --uchars / --bytes must be given.
    if (options[ARG_UCHARS].doesOccur == options[ARG_BYTES].doesOccur) {
        fprintf(stderr, "you must specify exactly one type of trie to output!\n");
        usageAndDie(U_ILLEGAL_ARGUMENT_ERROR);
    }
    UBool isBytesTrie = options[ARG_BYTES].doesOccur;
    // --transform is mandatory for a bytes trie and forbidden for a uchars trie.
    if (isBytesTrie != options[ARG_TRANSFORM].doesOccur) {
        fprintf(stderr, "you must provide a transformation for a bytes trie, and must not provide one for a uchars trie!\n");
        usageAndDie(U_ILLEGAL_ARGUMENT_ERROR);
    }

    IcuToolErrorCode status("gendict/main()");

#if UCONFIG_NO_BREAK_ITERATION || UCONFIG_NO_FILE_IO
    const char* outDir=NULL;

    UNewDataMemory *pData;
    char msg[1024];
    UErrorCode tempstatus = U_ZERO_ERROR;

    /* write message with just the name */
    // The output file name comes from the command line and may be arbitrarily
    // long, so the message is formatted with a bounded write (and truncated
    // if necessary) instead of an unbounded sprintf.
    snprintf(msg, sizeof(msg), "gendict writes dummy %s because of UCONFIG_NO_BREAK_ITERATION and/or UCONFIG_NO_FILE_IO, see uconfig.h", outFileName);
    fprintf(stderr, "%s\n", msg);

    /* write the dummy data file */
    pData = udata_create(outDir, NULL, outFileName, &dataInfo, NULL, &tempstatus);
    udata_writeBlock(pData, msg, strlen(msg));
    udata_finish(pData, &tempstatus);
    return (int)tempstatus;

#else
    //  Read in the dictionary source file
    if (verbose) { printf("Opening file %s...\n", wordFileName); }
    const char *codepage = "UTF-8";
    LocalUCHARBUFPointer f(ucbuf_open(wordFileName, &codepage, TRUE, FALSE, status));
    if (status.isFailure()) {
        fprintf(stderr, "error opening input file: ICU Error \"%s\"\n", status.errorName());
        exit(status.reset());
    }
    if (verbose) { printf("Initializing dictionary builder of type %s...\n", (isBytesTrie ? "BytesTrie" : "UCharsTrie")); }
    DataDict dict(isBytesTrie, status);
    if (status.isFailure()) {
        fprintf(stderr, "new DataDict: ICU Error \"%s\"\n", status.errorName());
        exit(status.reset());
    }
    if (options[ARG_TRANSFORM].doesOccur) {
        dict.setTransform(options[ARG_TRANSFORM].value);
    }

    UnicodeString fileLine;
    if (verbose) { puts("Adding words to dictionary..."); }
    UBool hasValues = FALSE;
    UBool hasValuelessContents = FALSE;
    int lineCount = 0;
    int wordCount = 0;
    int minlen = 255;
    int maxlen = 0;
    UBool isOk = TRUE;   // cleared when a line cannot be parsed; parsing continues to report all such lines
    while (readLine(f.getAlias(), fileLine, status)) {
        lineCount++;
        if (fileLine.isEmpty()) continue;

        // Parse word [spaces value].
        int32_t keyLen;
        for (keyLen = 0; keyLen < fileLine.length() && !u_isspace(fileLine[keyLen]); ++keyLen) {}
        if (keyLen == 0) {
            fprintf(stderr, "Error: no word on line %i!\n", lineCount);
            isOk = FALSE;
            continue;
        }
        int32_t valueStart;
        for (valueStart = keyLen;
            valueStart < fileLine.length() && u_isspace(fileLine[valueStart]);
            ++valueStart) {}

        // A word without an explicit value is stored with the value 0.
        int32_t wordValue = 0;
        if (keyLen < valueStart) {
            int32_t valueLength = fileLine.length() - valueStart;
            if (valueLength > 15) {
                fprintf(stderr, "Error: value too long on line %i!\n", lineCount);
                isOk = FALSE;
                continue;
            }
            char s[16];
            fileLine.extract(valueStart, valueLength, s, 16, US_INV);
            char *end;
            unsigned long value = uprv_strtoul(s, &end, 0);
            // The whole value text must have been converted to chars and consumed as a number.
            if (end == s || *end != 0 || (int32_t)uprv_strlen(s) != valueLength || value > 0xffffffff) {
                fprintf(stderr, "Error: value syntax error or value too large on line %i!\n", lineCount);
                isOk = FALSE;
                continue;
            }
            wordValue = (int32_t)value;
            hasValues = TRUE;
        } else {
            hasValuelessContents = TRUE;
        }

        dict.addWord(fileLine.tempSubString(0, keyLen), wordValue, status);
        wordCount++;
        if (keyLen < minlen) minlen = keyLen;
        if (keyLen > maxlen) maxlen = keyLen;

        if (status.isFailure()) {
            fprintf(stderr, "ICU Error \"%s\": Failed to add word to trie at input line %d in input file\n",
                status.errorName(), lineCount);
            exit(status.reset());
        }
    }
    if (verbose) { printf("Processed %d lines, added %d words, minlen %d, maxlen %d\n", lineCount, wordCount, minlen, maxlen); }

    if (hasValues && hasValuelessContents) {
        fprintf(stderr, "warning: file contained both valued and unvalued strings!\n");
    }

    // Report input problems here, where they happened, rather than letting a
    // failed status surface later as a misleading serialization error.
    if (status.isFailure()) {
        fprintf(stderr, "gendict: ICU Error \"%s\" while reading input file \"%s\"\n", status.errorName(), wordFileName);
        exit(status.reset());
    }
    if (!isOk) {
        fprintf(stderr, "gendict: input file \"%s\" contains invalid lines (see messages above), no output written\n", wordFileName);
        exit(U_ILLEGAL_ARGUMENT_ERROR);
    }

    if (verbose) { printf("Serializing data...isBytesTrie? %d\n", isBytesTrie); }
    int32_t outDataSize;
    const void *outData;
    UnicodeString usp;   // declared here so the uchars buffer outlives the write below
    if (isBytesTrie) {
        StringPiece sp = dict.serializeBytes(status);
        outDataSize = sp.size();
        outData = sp.data();
    } else {
        dict.serializeUChars(usp, status);
        outDataSize = usp.length() * U_SIZEOF_UCHAR;
        outData = usp.getBuffer();
    }
    if (status.isFailure()) {
        fprintf(stderr, "gendict: got failure of type %s while serializing, if U_ILLEGAL_ARGUMENT_ERROR possibly due to duplicate dictionary entries\n", status.errorName());
        exit(status.reset());
    }
    if (verbose) { puts("Opening output file..."); }
    UNewDataMemory *pData = udata_create(NULL, NULL, outFileName, &dataInfo, copyright, status);
    if (status.isFailure()) {
        fprintf(stderr, "gendict: could not open output file \"%s\", \"%s\"\n", outFileName, status.errorName());
        exit(status.reset());
    }

    if (verbose) { puts("Writing to output file..."); }
    // The first slot holds the byte size of the index array itself, which is
    // where the trie data starts.
    int32_t indexes[DictionaryData::IX_COUNT] = {
        DictionaryData::IX_COUNT * sizeof(int32_t), 0, 0, 0, 0, 0, 0, 0
    };
    int32_t size = outDataSize + indexes[DictionaryData::IX_STRING_TRIE_OFFSET];
    indexes[DictionaryData::IX_RESERVED1_OFFSET] = size;
    indexes[DictionaryData::IX_RESERVED2_OFFSET] = size;
    indexes[DictionaryData::IX_TOTAL_SIZE] = size;

    indexes[DictionaryData::IX_TRIE_TYPE] = isBytesTrie ? DictionaryData::TRIE_TYPE_BYTES : DictionaryData::TRIE_TYPE_UCHARS;
    if (hasValues) {
        indexes[DictionaryData::IX_TRIE_TYPE] |= DictionaryData::TRIE_HAS_VALUES;
    }

    indexes[DictionaryData::IX_TRANSFORM] = dict.getTransform();
    udata_writeBlock(pData, indexes, sizeof(indexes));
    udata_writeBlock(pData, outData, outDataSize);
    size_t bytesWritten = udata_finish(pData, status);
    if (status.isFailure()) {
        fprintf(stderr, "gendict: error \"%s\" writing the output file\n", status.errorName());
        exit(status.reset());
    }

    if (bytesWritten != (size_t)size) {
        fprintf(stderr, "Error writing to output file \"%s\"\n", outFileName);
        exit(U_INTERNAL_PROGRAM_ERROR);
    }

    if (!quiet) { printf("%s: done writing\t%s (%ds).\n", progName, outFileName, elapsedTime()); }

#ifdef TEST_GENDICT
    // Debugging aid: dump every entry of the trie that was just written.
    if (isBytesTrie) {
        BytesTrie::Iterator it(outData, outDataSize, status);
        while (it.hasNext()) {
            it.next(status);
            const StringPiece s = it.getString();
            int32_t val = it.getValue();
            printf("%s -> %i\n", s.data(), val);
        }
    } else {
        UCharsTrie::Iterator it((const UChar *)outData, outDataSize, status);
        while (it.hasNext()) {
            it.next(status);
            const UnicodeString s = it.getString();
            int32_t val = it.getValue();
            char tmp[1024];
            s.extract(0, s.length(), tmp, 1024);
            printf("%s -> %i\n", tmp, val);
        }
    }
#endif

    return 0;
#endif /* UCONFIG_NO_BREAK_ITERATION || UCONFIG_NO_FILE_IO */
}
Commit	Line	Data
f3c0d7a5 A	1	// © 2016 and later: Unicode, Inc. and others.
f3c0d7a5 A	2	// License & terms of use: http://www.unicode.org/copyright.html
51004dcb A	3	/*
51004dcb A	4	**********************************************************************
2ca993e8	5	* Copyright (C) 2002-2016, International Business Machines
51004dcb A	6	* Corporation and others. All Rights Reserved.
	7	**********************************************************************
	8	*
	9	* File gendict.cpp
	10	*/
	11
	12	#include "unicode/utypes.h"
	13	#include "unicode/uchar.h"
	14	#include "unicode/ucnv.h"
	15	#include "unicode/uniset.h"
	16	#include "unicode/unistr.h"
	17	#include "unicode/uclean.h"
	18	#include "unicode/udata.h"
	19	#include "unicode/putil.h"
	20	#include "unicode/ucharstriebuilder.h"
	21	#include "unicode/bytestriebuilder.h"
	22	#include "unicode/ucharstrie.h"
	23	#include "unicode/bytestrie.h"
	24	#include "unicode/ucnv.h"
f3c0d7a5	25	#include "unicode/ustring.h"
51004dcb A	26	#include "unicode/utf16.h"
	27
	28	#include "charstr.h"
	29	#include "dictionarydata.h"
	30	#include "uoptions.h"
	31	#include "unewdata.h"
	32	#include "cmemory.h"
	33	#include "uassert.h"
	34	#include "ucbuf.h"
	35	#include "toolutil.h"
	36	#include "cstring.h"
	37
	38	#include <stdio.h>
	39	#include <stdlib.h>
	40	#include <string.h>
	41
	42	#include "putilimp.h"
57a6839d	43	UDate startTime;
51004dcb A	44
	45	static int elapsedTime() {
	46	return (int)uprv_floor((uprv_getRawUTCtime()-startTime)/1000.0);
	47	}
	48
51004dcb A	49	U_NAMESPACE_USE
	50
	51	static char *progName;
	52	static UOption options[]={
	53	UOPTION_HELP_H, /* 0 */
	54	UOPTION_HELP_QUESTION_MARK, /* 1 */
	55	UOPTION_VERBOSE, /* 2 */
	56	UOPTION_ICUDATADIR, /* 4 */
	57	UOPTION_COPYRIGHT, /* 5 */
	58	{ "uchars", NULL, NULL, NULL, '\1', UOPT_NO_ARG, 0}, /* 6 */
	59	{ "bytes", NULL, NULL, NULL, '\1', UOPT_NO_ARG, 0}, /* 7 */
	60	{ "transform", NULL, NULL, NULL, '\1', UOPT_REQUIRES_ARG, 0}, /* 8 */
2ca993e8	61	UOPTION_QUIET, /* 9 */
51004dcb A	62	};
	63
	64	enum arguments {
	65	ARG_HELP = 0,
	66	ARG_QMARK,
	67	ARG_VERBOSE,
	68	ARG_ICUDATADIR,
	69	ARG_COPYRIGHT,
	70	ARG_UCHARS,
	71	ARG_BYTES,
2ca993e8 A	72	ARG_TRANSFORM,
2ca993e8 A	73	ARG_QUIET
51004dcb A	74	};
	75
	76	// prints out the standard usage method describing command line arguments,
	77	// then bails out with the desired exit code
	78	static void usageAndDie(UErrorCode retCode) {
	79	fprintf((U_SUCCESS(retCode) ? stdout : stderr), "Usage: %s -trietype [-options] input-dictionary-file output-file\n", progName);
	80	fprintf((U_SUCCESS(retCode) ? stdout : stderr),
	81	"\tRead in a word list and write out a string trie dictionary\n"
	82	"options:\n"
	83	"\t-h or -? or --help this usage text\n"
	84	"\t-V or --version show a version message\n"
	85	"\t-c or --copyright include a copyright notice\n"
	86	"\t-v or --verbose turn on verbose output\n"
2ca993e8	87	"\t-q or --quiet do not display warnings and progress\n"
51004dcb A	88	"\t-i or --icudatadir directory for locating any needed intermediate data files,\n" // TODO: figure out if we need this option
	89	"\t followed by path, defaults to %s\n"
	90	"\t--uchars output a UCharsTrie (mutually exclusive with -b!)\n"
	91	"\t--bytes output a BytesTrie (mutually exclusive with -u!)\n"
	92	"\t--transform the kind of transform to use (eg --transform offset-40A3,\n"
	93	"\t which specifies an offset transform with constant 0x40A3)\n",
	94	u_getDataDirectory());
	95	exit(retCode);
	96	}
	97
	98
	99	/* UDataInfo cf. udata.h */
	100	static UDataInfo dataInfo = {
	101	sizeof(UDataInfo),
	102	0,
	103
	104	U_IS_BIG_ENDIAN,
	105	U_CHARSET_FAMILY,
	106	U_SIZEOF_UCHAR,
	107	0,
	108
	109	{ 0x44, 0x69, 0x63, 0x74 }, /* "Dict" */
	110	{ 1, 0, 0, 0 }, /* format version */
	111	{ 0, 0, 0, 0 } /* data version */
	112	};
	113
	114	#if !UCONFIG_NO_BREAK_ITERATION
	115
	116	// A wrapper for both BytesTrieBuilder and UCharsTrieBuilder.
	117	// may want to put this somewhere in ICU, as it could be useful outside
	118	// of this tool?
	119	class DataDict {
	120	private:
	121	BytesTrieBuilder *bt;
	122	UCharsTrieBuilder *ut;
	123	UChar32 transformConstant;
	124	int32_t transformType;
	125	public:
	126	// constructs a new data dictionary. if there is an error,
	127	// it will be returned in status
	128	// isBytesTrie != 0 will produce a BytesTrieBuilder,
	129	// isBytesTrie == 0 will produce a UCharsTrieBuilder
	130	DataDict(UBool isBytesTrie, UErrorCode &status) : bt(NULL), ut(NULL),
	131	transformConstant(0), transformType(DictionaryData::TRANSFORM_NONE) {
	132	if (isBytesTrie) {
	133	bt = new BytesTrieBuilder(status);
	134	} else {
	135	ut = new UCharsTrieBuilder(status);
	136	}
	137	}
	138
	139	~DataDict() {
	140	delete bt;
	141	delete ut;
	142	}
	143
	144	private:
	145	char transform(UChar32 c, UErrorCode &status) {
	146	if (transformType == DictionaryData::TRANSFORM_TYPE_OFFSET) {
	147	if (c == 0x200D) { return (char)0xFF; }
	148	else if (c == 0x200C) { return (char)0xFE; }
	149	int32_t delta = c - transformConstant;
	150	if (delta < 0 \|\| 0xFD < delta) {
	151	fprintf(stderr, "Codepoint U+%04lx out of range for --transform offset-%04lx!\n",
152	(long)c, (long)transformConstant);
153	exit(U_ILLEGAL_ARGUMENT_ERROR); // TODO: should return and print the line number
154	}
155	return (char)delta;
156	} else { // no such transform type
157	status = U_INTERNAL_PROGRAM_ERROR;
158	return (char)c; // it should be noted this transform type will not generally work
159	}
160	}
161
162	void transform(const UnicodeString &word, CharString &buf, UErrorCode &errorCode) {
163	UChar32 c = 0;
164	int32_t len = word.length();
165	for (int32_t i = 0; i < len; i += U16_LENGTH(c)) {
166	c = word.char32At(i);
167	buf.append(transform(c, errorCode), errorCode);
168	}
169	}
170
171	public:
172	// sets the desired transformation data.
173	// should be populated from a command line argument
174	// so far the only acceptable format is offset-<hex constant>
175	// eventually others (mask-<hex constant>?) may be enabled
176	// more complex functions may be more difficult
177	void setTransform(const char *t) {
178	if (strncmp(t, "offset-", 7) == 0) {
179	char *end;
180	unsigned long base = uprv_strtoul(t + 7, &end, 16);
181	if (end == (t + 7) \|\| *end != 0 \|\| base > 0x10FF80) {
182	fprintf(stderr, "Syntax for offset value in --transform offset-%s invalid!\n", t + 7);
183	usageAndDie(U_ILLEGAL_ARGUMENT_ERROR);
184	}
185	transformType = DictionaryData::TRANSFORM_TYPE_OFFSET;
186	transformConstant = (UChar32)base;
187	}
188	else {
189	fprintf(stderr, "Invalid transform specified: %s\n", t);
190	usageAndDie(U_ILLEGAL_ARGUMENT_ERROR);
191	}
192	}
193
194	// add a word to the trie
195	void addWord(const UnicodeString &word, int32_t value, UErrorCode &status) {
196	if (bt) {
197	CharString buf;
198	transform(word, buf, status);
199	bt->add(buf.toStringPiece(), value, status);
200	}
201	if (ut) { ut->add(word, value, status); }
202	}
203
204	// if we are a bytestrie, give back the StringPiece representing the serialized version of us
205	StringPiece serializeBytes(UErrorCode &status) {
206	return bt->buildStringPiece(USTRINGTRIE_BUILD_SMALL, status);
207	}
208
209	// if we are a ucharstrie, produce the UnicodeString representing the serialized version of us
210	void serializeUChars(UnicodeString &s, UErrorCode &status) {
211	ut->buildUnicodeString(USTRINGTRIE_BUILD_SMALL, s, status);
212	}
213
214	int32_t getTransform() {
215	return (int32_t)(transformType \| transformConstant);
216	}
217	};
218	#endif
219
220	static const UChar LINEFEED_CHARACTER = 0x000A;
221	static const UChar CARRIAGE_RETURN_CHARACTER = 0x000D;
222
223	static UBool readLine(UCHARBUF *f, UnicodeString &fileLine, IcuToolErrorCode &errorCode) {
224	int32_t lineLength;
225	const UChar *line = ucbuf_readline(f, &lineLength, errorCode);
226	if(line == NULL \|\| errorCode.isFailure()) { return FALSE; }
227	// Strip trailing CR/LF, comments, and spaces.
228	const UChar *comment = u_memchr(line, 0x23, lineLength); // '#'
229	if(comment != NULL) {
230	lineLength = (int32_t)(comment - line);
231	} else {
232	while(lineLength > 0 && (line[lineLength - 1] == CARRIAGE_RETURN_CHARACTER \|\| line[lineLength - 1] == LINEFEED_CHARACTER)) { --lineLength; }
233	}
234	while(lineLength > 0 && u_isspace(line[lineLength - 1])) { --lineLength; }
235	fileLine.setTo(FALSE, line, lineLength);
236	return TRUE;
237	}
238
239	//----------------------------------------------------------------------------
240	//
241	// main for gendict
242	//
243	//----------------------------------------------------------------------------
244	int main(int argc, char **argv) {
245	//
246	// Pick up and check the command line arguments,
247	// using the standard ICU tool utils option handling.
248	//
249	U_MAIN_INIT_ARGS(argc, argv);
250	progName = argv[0];
2ca993e8	251	argc=u_parseArgs(argc, argv, UPRV_LENGTHOF(options), options);
51004dcb A	252	if(argc<0) {
	253	// Unrecognized option
	254	fprintf(stderr, "error in command line argument \"%s\"\n", argv[-argc]);
	255	usageAndDie(U_ILLEGAL_ARGUMENT_ERROR);
	256	}
	257
	258	if(options[ARG_HELP].doesOccur \|\| options[ARG_QMARK].doesOccur) {
	259	// -? or -h for help.
	260	usageAndDie(U_ZERO_ERROR);
	261	}
	262
	263	UBool verbose = options[ARG_VERBOSE].doesOccur;
2ca993e8	264	UBool quiet = options[ARG_QUIET].doesOccur;
51004dcb A	265
	266	if (argc < 3) {
	267	fprintf(stderr, "input and output file must both be specified.\n");
	268	usageAndDie(U_ILLEGAL_ARGUMENT_ERROR);
	269	}
	270	const char *outFileName = argv[2];
	271	const char *wordFileName = argv[1];
	272
57a6839d	273	startTime = uprv_getRawUTCtime(); // initialize start timer
51004dcb	274
b331163b	275	if (options[ARG_ICUDATADIR].doesOccur) {
51004dcb A	276	u_setDataDirectory(options[ARG_ICUDATADIR].value);
	277	}
	278
	279	const char *copyright = NULL;
	280	if (options[ARG_COPYRIGHT].doesOccur) {
	281	copyright = U_COPYRIGHT_STRING;
	282	}
	283
	284	if (options[ARG_UCHARS].doesOccur == options[ARG_BYTES].doesOccur) {
	285	fprintf(stderr, "you must specify exactly one type of trie to output!\n");
	286	usageAndDie(U_ILLEGAL_ARGUMENT_ERROR);
	287	}
	288	UBool isBytesTrie = options[ARG_BYTES].doesOccur;
	289	if (isBytesTrie != options[ARG_TRANSFORM].doesOccur) {
	290	fprintf(stderr, "you must provide a transformation for a bytes trie, and must not provide one for a uchars trie!\n");
	291	usageAndDie(U_ILLEGAL_ARGUMENT_ERROR);
	292	}
	293
	294	IcuToolErrorCode status("gendict/main()");
	295
	296	#if UCONFIG_NO_BREAK_ITERATION \|\| UCONFIG_NO_FILE_IO
	297	const char* outDir=NULL;
	298
	299	UNewDataMemory *pData;
	300	char msg[1024];
	301	UErrorCode tempstatus = U_ZERO_ERROR;
	302
	303	/* write message with just the name */ // potential for a buffer overflow here...
	304	sprintf(msg, "gendict writes dummy %s because of UCONFIG_NO_BREAK_ITERATION and/or UCONFIG_NO_FILE_IO, see uconfig.h", outFileName);
	305	fprintf(stderr, "%s\n", msg);
	306
	307	/* write the dummy data file */
	308	pData = udata_create(outDir, NULL, outFileName, &dataInfo, NULL, &tempstatus);
	309	udata_writeBlock(pData, msg, strlen(msg));
	310	udata_finish(pData, &tempstatus);
	311	return (int)tempstatus;
	312
	313	#else
	314	// Read in the dictionary source file
	315	if (verbose) { printf("Opening file %s...\n", wordFileName); }
	316	const char *codepage = "UTF-8";
0f5d89e8	317	LocalUCHARBUFPointer f(ucbuf_open(wordFileName, &codepage, TRUE, FALSE, status));
51004dcb A	318	if (status.isFailure()) {
	319	fprintf(stderr, "error opening input file: ICU Error \"%s\"\n", status.errorName());
	320	exit(status.reset());
	321	}
	322	if (verbose) { printf("Initializing dictionary builder of type %s...\n", (isBytesTrie ? "BytesTrie" : "UCharsTrie")); }
	323	DataDict dict(isBytesTrie, status);
	324	if (status.isFailure()) {
	325	fprintf(stderr, "new DataDict: ICU Error \"%s\"\n", status.errorName());
	326	exit(status.reset());
	327	}
	328	if (options[ARG_TRANSFORM].doesOccur) {
	329	dict.setTransform(options[ARG_TRANSFORM].value);
	330	}
	331
	332	UnicodeString fileLine;
	333	if (verbose) { puts("Adding words to dictionary..."); }
	334	UBool hasValues = FALSE;
	335	UBool hasValuelessContents = FALSE;
	336	int lineCount = 0;
57a6839d A	337	int wordCount = 0;
	338	int minlen = 255;
	339	int maxlen = 0;
51004dcb	340	UBool isOk = TRUE;
0f5d89e8	341	while (readLine(f.getAlias(), fileLine, status)) {
51004dcb A	342	lineCount++;
51004dcb A	343	if (fileLine.isEmpty()) continue;
0f5d89e8	344
51004dcb A	345	// Parse word [spaces value].
	346	int32_t keyLen;
	347	for (keyLen = 0; keyLen < fileLine.length() && !u_isspace(fileLine[keyLen]); ++keyLen) {}
	348	if (keyLen == 0) {
	349	fprintf(stderr, "Error: no word on line %i!\n", lineCount);
	350	isOk = FALSE;
	351	continue;
	352	}
	353	int32_t valueStart;
	354	for (valueStart = keyLen;
	355	valueStart < fileLine.length() && u_isspace(fileLine[valueStart]);
	356	++valueStart) {}
	357
	358	if (keyLen < valueStart) {
	359	int32_t valueLength = fileLine.length() - valueStart;
	360	if (valueLength > 15) {
	361	fprintf(stderr, "Error: value too long on line %i!\n", lineCount);
	362	isOk = FALSE;
	363	continue;
	364	}
	365	char s[16];
	366	fileLine.extract(valueStart, valueLength, s, 16, US_INV);
	367	char *end;
	368	unsigned long value = uprv_strtoul(s, &end, 0);
	369	if (end == s \|\| *end != 0 \|\| (int32_t)uprv_strlen(s) != valueLength \|\| value > 0xffffffff) {
	370	fprintf(stderr, "Error: value syntax error or value too large on line %i!\n", lineCount);
	371	isOk = FALSE;
	372	continue;
	373	}
	374	dict.addWord(fileLine.tempSubString(0, keyLen), (int32_t)value, status);
	375	hasValues = TRUE;
57a6839d A	376	wordCount++;
	377	if (keyLen < minlen) minlen = keyLen;
	378	if (keyLen > maxlen) maxlen = keyLen;
51004dcb A	379	} else {
51004dcb A	380	dict.addWord(fileLine.tempSubString(0, keyLen), 0, status);
57a6839d A	381	hasValuelessContents = TRUE;
	382	wordCount++;
	383	if (keyLen < minlen) minlen = keyLen;
	384	if (keyLen > maxlen) maxlen = keyLen;
51004dcb A	385	}
	386
	387	if (status.isFailure()) {
	388	fprintf(stderr, "ICU Error \"%s\": Failed to add word to trie at input line %d in input file\n",
	389	status.errorName(), lineCount);
	390	exit(status.reset());
	391	}
	392	}
57a6839d	393	if (verbose) { printf("Processed %d lines, added %d words, minlen %d, maxlen %d\n", lineCount, wordCount, minlen, maxlen); }
51004dcb A	394
	395	if (!isOk && status.isSuccess()) {
	396	status.set(U_ILLEGAL_ARGUMENT_ERROR);
	397	}
	398	if (hasValues && hasValuelessContents) {
	399	fprintf(stderr, "warning: file contained both valued and unvalued strings!\n");
	400	}
	401
57a6839d	402	if (verbose) { printf("Serializing data...isBytesTrie? %d\n", isBytesTrie); }
51004dcb A	403	int32_t outDataSize;
	404	const void *outData;
	405	UnicodeString usp;
	406	if (isBytesTrie) {
	407	StringPiece sp = dict.serializeBytes(status);
	408	outDataSize = sp.size();
	409	outData = sp.data();
	410	} else {
	411	dict.serializeUChars(usp, status);
	412	outDataSize = usp.length() * U_SIZEOF_UCHAR;
	413	outData = usp.getBuffer();
	414	}
	415	if (status.isFailure()) {
57a6839d	416	fprintf(stderr, "gendict: got failure of type %s while serializing, if U_ILLEGAL_ARGUMENT_ERROR possibly due to duplicate dictionary entries\n", status.errorName());
51004dcb A	417	exit(status.reset());
	418	}
	419	if (verbose) { puts("Opening output file..."); }
	420	UNewDataMemory *pData = udata_create(NULL, NULL, outFileName, &dataInfo, copyright, status);
	421	if (status.isFailure()) {
	422	fprintf(stderr, "gendict: could not open output file \"%s\", \"%s\"\n", outFileName, status.errorName());
	423	exit(status.reset());
	424	}
	425
	426	if (verbose) { puts("Writing to output file..."); }
	427	int32_t indexes[DictionaryData::IX_COUNT] = {
	428	DictionaryData::IX_COUNT * sizeof(int32_t), 0, 0, 0, 0, 0, 0, 0
	429	};
	430	int32_t size = outDataSize + indexes[DictionaryData::IX_STRING_TRIE_OFFSET];
	431	indexes[DictionaryData::IX_RESERVED1_OFFSET] = size;
	432	indexes[DictionaryData::IX_RESERVED2_OFFSET] = size;
	433	indexes[DictionaryData::IX_TOTAL_SIZE] = size;
	434
	435	indexes[DictionaryData::IX_TRIE_TYPE] = isBytesTrie ? DictionaryData::TRIE_TYPE_BYTES : DictionaryData::TRIE_TYPE_UCHARS;
	436	if (hasValues) {
	437	indexes[DictionaryData::IX_TRIE_TYPE] \|= DictionaryData::TRIE_HAS_VALUES;
	438	}
	439
	440	indexes[DictionaryData::IX_TRANSFORM] = dict.getTransform();
	441	udata_writeBlock(pData, indexes, sizeof(indexes));
	442	udata_writeBlock(pData, outData, outDataSize);
	443	size_t bytesWritten = udata_finish(pData, status);
	444	if (status.isFailure()) {
	445	fprintf(stderr, "gendict: error \"%s\" writing the output file\n", status.errorName());
	446	exit(status.reset());
	447	}
	448
	449	if (bytesWritten != (size_t)size) {
	450	fprintf(stderr, "Error writing to output file \"%s\"\n", outFileName);
	451	exit(U_INTERNAL_PROGRAM_ERROR);
	452	}
	453
2ca993e8	454	if (!quiet) { printf("%s: done writing\t%s (%ds).\n", progName, outFileName, elapsedTime()); }
51004dcb A	455
	456	#ifdef TEST_GENDICT
	457	if (isBytesTrie) {
	458	BytesTrie::Iterator it(outData, outDataSize, status);
	459	while (it.hasNext()) {
	460	it.next(status);
	461	const StringPiece s = it.getString();
	462	int32_t val = it.getValue();
	463	printf("%s -> %i\n", s.data(), val);
	464	}
	465	} else {
	466	UCharsTrie::Iterator it((const UChar *)outData, outDataSize, status);
	467	while (it.hasNext()) {
	468	it.next(status);
	469	const UnicodeString s = it.getString();
	470	int32_t val = it.getValue();
	471	char tmp[1024];
	472	s.extract(0, s.length(), tmp, 1024);
	473	printf("%s -> %i\n", tmp, val);
	474	}
	475	}
	476	#endif
	477
	478	return 0;
	479	#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
	480	}