2 **********************************************************************
3 * Copyright (C) 2002-2009, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
10 //--------------------------------------------------------------------
12 // Tool for generating CompactTrieDictionary data files (.ctd files).
14 // Usage: genctd [options] -o output-file.ctd input-file
16 // options: -v verbose
19 // The input file is a plain text file containing words, one per line.
20 // Words end at the first whitespace; lines beginning with whitespace are ignored.
22 // The file can be encoded as UTF-8 or UTF-16 (either endian), or
23 // in the default code page (platform dependent). UTF-encoded
24 // files must include a BOM.
26 //--------------------------------------------------------------------
28 #include "unicode/utypes.h"
29 #include "unicode/uchar.h"
30 #include "unicode/ucnv.h"
31 #include "unicode/uniset.h"
32 #include "unicode/unistr.h"
33 #include "unicode/uclean.h"
34 #include "unicode/udata.h"
35 #include "unicode/putil.h"
50 static char *progName
;
51 static UOption options
[]={
52 UOPTION_HELP_H
, /* 0 */
53 UOPTION_HELP_QUESTION_MARK
, /* 1 */
54 UOPTION_VERBOSE
, /* 2 */
55 { "out", NULL
, NULL
, NULL
, 'o', UOPT_REQUIRES_ARG
, 0 }, /* 3 */
56 UOPTION_ICUDATADIR
, /* 4 */
57 UOPTION_DESTDIR
, /* 5 */
58 UOPTION_COPYRIGHT
, /* 6 */
61 void usageAndDie(int retCode
) {
62 printf("Usage: %s [-v] [-options] -o output-file dictionary-file\n", progName
);
63 printf("\tRead in word list and write out compact trie dictionary\n"
65 "\t-h or -? or --help this usage text\n"
66 "\t-V or --version show a version message\n"
67 "\t-c or --copyright include a copyright notice\n"
68 "\t-v or --verbose turn on verbose output\n"
69 "\t-i or --icudatadir directory for locating any needed intermediate data files,\n"
70 "\t followed by path, defaults to %s\n"
71 "\t-d or --destdir destination directory, followed by the path\n",
72 u_getDataDirectory());
77 #if UCONFIG_NO_BREAK_ITERATION || UCONFIG_NO_FILE_IO
79 /* dummy UDataInfo cf. udata.h */
80 static UDataInfo dummyDataInfo
= {
89 { 0, 0, 0, 0 }, /* dummy dataFormat */
90 { 0, 0, 0, 0 }, /* dummy formatVersion */
91 { 0, 0, 0, 0 } /* dummy dataVersion */
97 // Set up the ICU data header, defined in ucmndata.h
100 {sizeof(DataHeader
), // Struct MappedData
104 { // struct UDataInfo
105 sizeof(UDataInfo
), // size
112 { 0x54, 0x72, 0x44, 0x63 }, // "TrDc" Trie Dictionary
113 { 1, 0, 0, 0 }, // 1.0.0.0
114 { 0, 0, 0, 0 }, // Irrelevant for this data type
119 //----------------------------------------------------------------------------
123 //----------------------------------------------------------------------------
124 int main(int argc
, char **argv
) {
125 UErrorCode status
= U_ZERO_ERROR
;
126 const char *wordFileName
;
127 const char *outFileName
;
128 const char *outDir
= NULL
;
129 const char *copyright
= NULL
;
132 // Pick up and check the command line arguments,
133 // using the standard ICU tool utils option handling.
135 U_MAIN_INIT_ARGS(argc
, argv
);
137 argc
=u_parseArgs(argc
, argv
, sizeof(options
)/sizeof(options
[0]), options
);
139 // Unrecognized option
140 fprintf(stderr
, "error in command line argument \"%s\"\n", argv
[-argc
]);
141 usageAndDie(U_ILLEGAL_ARGUMENT_ERROR
);
144 if(options
[0].doesOccur
|| options
[1].doesOccur
) {
145 // -? or -h for help.
149 if (!options
[3].doesOccur
|| argc
< 2) {
150 fprintf(stderr
, "input and output file must both be specified.\n");
151 usageAndDie(U_ILLEGAL_ARGUMENT_ERROR
);
153 outFileName
= options
[3].value
;
154 wordFileName
= argv
[1];
156 if (options
[4].doesOccur
) {
157 u_setDataDirectory(options
[4].value
);
160 status
= U_ZERO_ERROR
;
162 /* Combine the directory with the file name */
163 if(options
[5].doesOccur
) {
164 outDir
= options
[5].value
;
166 if (options
[6].doesOccur
) {
167 copyright
= U_COPYRIGHT_STRING
;
170 #if UCONFIG_NO_BREAK_ITERATION || UCONFIG_NO_FILE_IO
172 UNewDataMemory
*pData
;
175 /* write message with just the name */
176 sprintf(msg
, "genctd writes dummy %s because of UCONFIG_NO_BREAK_ITERATION and/or UCONFIG_NO_FILE_IO, see uconfig.h", outFileName
);
177 fprintf(stderr
, "%s\n", msg
);
179 /* write the dummy data file */
180 pData
= udata_create(outDir
, NULL
, outFileName
, &dummyDataInfo
, NULL
, &status
);
181 udata_writeBlock(pData
, msg
, strlen(msg
));
182 udata_finish(pData
, &status
);
188 if (U_FAILURE(status
)) {
189 fprintf(stderr
, "%s: can not initialize ICU. status = %s\n",
190 argv
[0], u_errorName(status
));
193 status
= U_ZERO_ERROR
;
196 // Read in the dictionary source file
203 file
= fopen(wordFileName
, "rb");
205 fprintf(stderr
, "Could not open file \"%s\"\n", wordFileName
);
208 fseek(file
, 0, SEEK_END
);
209 wordFileSize
= ftell(file
);
210 fseek(file
, 0, SEEK_SET
);
211 wordBufferC
= new char[wordFileSize
+10];
213 result
= (long)fread(wordBufferC
, 1, wordFileSize
, file
);
214 if (result
!= wordFileSize
) {
215 fprintf(stderr
, "Error reading file \"%s\"\n", wordFileName
);
218 wordBufferC
[wordFileSize
]=0;
222 // Look for a Unicode Signature (BOM) on the word file
224 int32_t signatureLength
;
225 const char * wordSourceC
= wordBufferC
;
226 const char* encoding
= ucnv_detectUnicodeSignature(
227 wordSourceC
, wordFileSize
, &signatureLength
, &status
);
228 if (U_FAILURE(status
)) {
232 wordSourceC
+= signatureLength
;
233 wordFileSize
-= signatureLength
;
237 // Open a converter to take the word file to UTF-16
240 conv
= ucnv_open(encoding
, &status
);
241 if (U_FAILURE(status
)) {
242 fprintf(stderr
, "ucnv_open: ICU Error \"%s\"\n", u_errorName(status
));
247 // Convert the words to UChar.
248 // Preflight first to determine required buffer size.
250 uint32_t destCap
= ucnv_toUChars(conv
,
256 if (status
!= U_BUFFER_OVERFLOW_ERROR
) {
257 fprintf(stderr
, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status
));
261 status
= U_ZERO_ERROR
;
262 UChar
*wordSourceU
= new UChar
[destCap
+1];
264 wordSourceU
, // dest,
269 if (U_FAILURE(status
)) {
270 fprintf(stderr
, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status
));
275 // Get rid of the original file buffer
276 delete[] wordBufferC
;
278 // Create a MutableTrieDictionary, and loop through all the lines, inserting each word.
281 // First, pick a median character.
282 UChar
*current
= wordSourceU
+ (destCap
/2);
283 UChar uc
= *current
++;
285 breaks
.add(0x000A); // Line Feed
286 breaks
.add(0x000D); // Carriage Return
287 breaks
.add(0x2028); // Line Separator
288 breaks
.add(0x2029); // Paragraph Separator
291 // Look for line break
292 while (uc
&& !breaks
.contains(uc
)) {
295 // Now skip to first non-line-break
296 while (uc
&& breaks
.contains(uc
)) {
300 while (uc
&& (breaks
.contains(uc
) || u_isspace(uc
)));
302 MutableTrieDictionary
*mtd
= new MutableTrieDictionary(uc
, status
);
304 if (U_FAILURE(status
)) {
305 fprintf(stderr
, "new MutableTrieDictionary: ICU Error \"%s\"\n", u_errorName(status
));
309 // Now add the words. Words are non-space characters at the beginning of
310 // lines, and must be at least one UChar.
311 current
= wordSourceU
;
312 UChar
*candidate
= current
;
317 while (uc
&& !u_isspace(uc
)) {
322 mtd
->addWord(candidate
, length
, status
);
323 if (U_FAILURE(status
)) {
324 fprintf(stderr
, "MutableTrieDictionary::addWord: ICU Error \"%s\"\n",
325 u_errorName(status
));
329 // Find beginning of next line
330 while (uc
&& !breaks
.contains(uc
)) {
333 while (uc
&& breaks
.contains(uc
)) {
336 candidate
= current
-1;
340 // Get rid of the Unicode text buffer
341 delete[] wordSourceU
;
343 // Now, create a CompactTrieDictionary from the mutable dictionary
344 CompactTrieDictionary
*ctd
= new CompactTrieDictionary(*mtd
, status
);
345 if (U_FAILURE(status
)) {
346 fprintf(stderr
, "new CompactTrieDictionary: ICU Error \"%s\"\n", u_errorName(status
));
350 // Get rid of the MutableTrieDictionary
354 // Get the binary data from the dictionary.
356 uint32_t outDataSize
= ctd
->dataSize();
357 const uint8_t *outData
= (const uint8_t *)ctd
->data();
360 // Create the output file
363 UNewDataMemory
*pData
;
364 pData
= udata_create(outDir
, NULL
, outFileName
, &(dh
.info
), copyright
, &status
);
365 if(U_FAILURE(status
)) {
366 fprintf(stderr
, "genctd: Could not open output file \"%s\", \"%s\"\n",
367 outFileName
, u_errorName(status
));
372 // Write the data itself.
373 udata_writeBlock(pData
, outData
, outDataSize
);
375 bytesWritten
= udata_finish(pData
, &status
);
376 if(U_FAILURE(status
)) {
377 fprintf(stderr
, "genctd: error \"%s\" writing the output file\n", u_errorName(status
));
381 if (bytesWritten
!= outDataSize
) {
382 fprintf(stderr
, "Error writing to output file \"%s\"\n", outFileName
);
386 // Get rid of the CompactTrieDictionary
391 printf("genctd: tool completed successfully.\n");
394 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */