]> git.saurik.com Git - apple/icu.git/blob - icuSources/tools/genctd/genctd.cpp
ICU-8.11.1.tar.gz
[apple/icu.git] / icuSources / tools / genctd / genctd.cpp
1 /*
2 **********************************************************************
3 * Copyright (C) 2002-2006, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
6 *
*   File genctd.cpp
8 */
9
10 //--------------------------------------------------------------------
11 //
12 // Tool for generating CompactTrieDictionary data files (.ctd files).
13 //
14 // Usage: genctd [options] -o output-file.ctd input-file
15 //
16 // options: -v verbose
17 // -? or -h help
18 //
19 // The input file is a plain text file containing words, one per line.
20 // Words end at the first whitespace; lines beginning with whitespace
21 // are ignored.
//      The file can be encoded as UTF-8 or UTF-16 (either endianness), or
//      in the platform-dependent default code page.  UTF-encoded files
//      must begin with a BOM.
25 //
26 //--------------------------------------------------------------------
27
28 #include "unicode/utypes.h"
29 #include "unicode/uchar.h"
30 #include "unicode/ucnv.h"
31 #include "unicode/uniset.h"
32 #include "unicode/unistr.h"
33 #include "unicode/uclean.h"
34 #include "unicode/udata.h"
35 #include "unicode/putil.h"
36
37 #include "uoptions.h"
38 #include "unewdata.h"
39 #include "ucmndata.h"
40 #include "rbbidata.h"
41 #include "triedict.h"
42 #include "cmemory.h"
43
44 #include <stdio.h>
45 #include <stdlib.h>
46 #include <string.h>
47
48 static char *progName;
49 static UOption options[]={
50 UOPTION_HELP_H, /* 0 */
51 UOPTION_HELP_QUESTION_MARK, /* 1 */
52 UOPTION_VERBOSE, /* 2 */
53 { "out", NULL, NULL, NULL, 'o', UOPT_REQUIRES_ARG, 0 }, /* 3 */
54 UOPTION_ICUDATADIR, /* 4 */
55 UOPTION_DESTDIR, /* 5 */
56 UOPTION_COPYRIGHT, /* 6 */
57 };
58
59 void usageAndDie(int retCode) {
60 printf("Usage: %s [-v] [-options] -o output-file dictionary-file\n", progName);
61 printf("\tRead in word list and write out compact trie dictionary\n"
62 "options:\n"
63 "\t-h or -? or --help this usage text\n"
64 "\t-V or --version show a version message\n"
65 "\t-c or --copyright include a copyright notice\n"
66 "\t-v or --verbose turn on verbose output\n"
67 "\t-i or --icudatadir directory for locating any needed intermediate data files,\n"
68 "\t followed by path, defaults to %s\n"
69 "\t-d or --destdir destination directory, followed by the path\n",
70 u_getDataDirectory());
71 exit (retCode);
72 }
73
74
75 #if UCONFIG_NO_BREAK_ITERATION
76
77 /* dummy UDataInfo cf. udata.h */
78 static UDataInfo dummyDataInfo = {
79 sizeof(UDataInfo),
80 0,
81
82 U_IS_BIG_ENDIAN,
83 U_CHARSET_FAMILY,
84 U_SIZEOF_UCHAR,
85 0,
86
87 { 0, 0, 0, 0 }, /* dummy dataFormat */
88 { 0, 0, 0, 0 }, /* dummy formatVersion */
89 { 0, 0, 0, 0 } /* dummy dataVersion */
90 };
91
92 #else
93
94 //
95 // Set up the ICU data header, defined in ucmndata.h
96 //
97 DataHeader dh ={
98 {sizeof(DataHeader), // Struct MappedData
99 0xda,
100 0x27},
101
102 { // struct UDataInfo
103 sizeof(UDataInfo), // size
104 0, // reserved
105 U_IS_BIG_ENDIAN,
106 U_CHARSET_FAMILY,
107 U_SIZEOF_UCHAR,
108 0, // reserved
109
110 { 0x54, 0x72, 0x44, 0x63 }, // "TrDc" Trie Dictionary
111 { 1, 0, 0, 0 }, // 1.0.0.0
112 { 0, 0, 0, 0 }, // Irrelevant for this data type
113 }};
114
115 #endif
116
117 //----------------------------------------------------------------------------
118 //
119 // main for genctd
120 //
121 //----------------------------------------------------------------------------
122 int main(int argc, char **argv) {
123 UErrorCode status = U_ZERO_ERROR;
124 const char *wordFileName;
125 const char *outFileName;
126 const char *outDir = NULL;
127 const char *copyright = NULL;
128
129 //
130 // Pick up and check the command line arguments,
131 // using the standard ICU tool utils option handling.
132 //
133 U_MAIN_INIT_ARGS(argc, argv);
134 progName = argv[0];
135 argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), options);
136 if(argc<0) {
137 // Unrecognized option
138 fprintf(stderr, "error in command line argument \"%s\"\n", argv[-argc]);
139 usageAndDie(U_ILLEGAL_ARGUMENT_ERROR);
140 }
141
142 if(options[0].doesOccur || options[1].doesOccur) {
143 // -? or -h for help.
144 usageAndDie(0);
145 }
146
147 if (!options[3].doesOccur || argc < 2) {
148 fprintf(stderr, "input and output file must both be specified.\n");
149 usageAndDie(U_ILLEGAL_ARGUMENT_ERROR);
150 }
151 outFileName = options[3].value;
152 wordFileName = argv[1];
153
154 if (options[4].doesOccur) {
155 u_setDataDirectory(options[4].value);
156 }
157
158 /* Initialize ICU */
159 u_init(&status);
160 if (U_FAILURE(status)) {
161 fprintf(stderr, "%s: can not initialize ICU. status = %s\n",
162 argv[0], u_errorName(status));
163 exit(1);
164 }
165 status = U_ZERO_ERROR;
166
167 /* Combine the directory with the file name */
168 if(options[5].doesOccur) {
169 outDir = options[5].value;
170 }
171 if (options[6].doesOccur) {
172 copyright = U_COPYRIGHT_STRING;
173 }
174
175 #if UCONFIG_NO_BREAK_ITERATION
176
177 UNewDataMemory *pData;
178 char msg[1024];
179
180 /* write message with just the name */
181 sprintf(msg, "genctd writes dummy %s because of UCONFIG_NO_BREAK_ITERATION, see uconfig.h", outFileName);
182 fprintf(stderr, "%s\n", msg);
183
184 /* write the dummy data file */
185 pData = udata_create(outDir, NULL, outFileName, &dummyDataInfo, NULL, &status);
186 udata_writeBlock(pData, msg, strlen(msg));
187 udata_finish(pData, &status);
188 return (int)status;
189
190 #else
191
192 //
193 // Read in the dictionary source file
194 //
195 long result;
196 long wordFileSize;
197 FILE *file;
198 char *wordBufferC;
199
200 file = fopen(wordFileName, "rb");
201 if( file == 0 ) {
202 fprintf(stderr, "Could not open file \"%s\"\n", wordFileName);
203 exit(-1);
204 }
205 fseek(file, 0, SEEK_END);
206 wordFileSize = ftell(file);
207 fseek(file, 0, SEEK_SET);
208 wordBufferC = new char[wordFileSize+10];
209
210 result = (long)fread(wordBufferC, 1, wordFileSize, file);
211 if (result != wordFileSize) {
212 fprintf(stderr, "Error reading file \"%s\"\n", wordFileName);
213 exit (-1);
214 }
215 wordBufferC[wordFileSize]=0;
216 fclose(file);
217
218 //
219 // Look for a Unicode Signature (BOM) on the word file
220 //
221 int32_t signatureLength;
222 const char * wordSourceC = wordBufferC;
223 const char* encoding = ucnv_detectUnicodeSignature(
224 wordSourceC, wordFileSize, &signatureLength, &status);
225 if (U_FAILURE(status)) {
226 exit(status);
227 }
228 if(encoding!=NULL ){
229 wordSourceC += signatureLength;
230 wordFileSize -= signatureLength;
231 }
232
233 //
234 // Open a converter to take the rule file to UTF-16
235 //
236 UConverter* conv;
237 conv = ucnv_open(encoding, &status);
238 if (U_FAILURE(status)) {
239 fprintf(stderr, "ucnv_open: ICU Error \"%s\"\n", u_errorName(status));
240 exit(status);
241 }
242
243 //
244 // Convert the words to UChar.
245 // Preflight first to determine required buffer size.
246 //
247 uint32_t destCap = ucnv_toUChars(conv,
248 NULL, // dest,
249 0, // destCapacity,
250 wordSourceC,
251 wordFileSize,
252 &status);
253 if (status != U_BUFFER_OVERFLOW_ERROR) {
254 fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
255 exit(status);
256 };
257
258 status = U_ZERO_ERROR;
259 UChar *wordSourceU = new UChar[destCap+1];
260 ucnv_toUChars(conv,
261 wordSourceU, // dest,
262 destCap+1,
263 wordSourceC,
264 wordFileSize,
265 &status);
266 if (U_FAILURE(status)) {
267 fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
268 exit(status);
269 };
270 ucnv_close(conv);
271
272 // Get rid of the original file buffer
273 delete[] wordBufferC;
274
275 // Create a MutableTrieDictionary, and loop through all the lines, inserting
276 // words.
277
278 // First, pick a median character.
279 UChar *current = wordSourceU + (destCap/2);
280 UChar uc = *current++;
281 UnicodeSet breaks;
282 breaks.add(0x000A); // Line Feed
283 breaks.add(0x000D); // Carriage Return
284 breaks.add(0x2028); // Line Separator
285 breaks.add(0x2029); // Paragraph Separator
286
287 do {
288 // Look for line break
289 while (uc && !breaks.contains(uc)) {
290 uc = *current++;
291 }
292 // Now skip to first non-line-break
293 while (uc && breaks.contains(uc)) {
294 uc = *current++;
295 }
296 }
297 while (uc && (breaks.contains(uc) || u_isspace(uc)));
298
299 MutableTrieDictionary *mtd = new MutableTrieDictionary(uc, status);
300
301 if (U_FAILURE(status)) {
302 fprintf(stderr, "new MutableTrieDictionary: ICU Error \"%s\"\n", u_errorName(status));
303 exit(status);
304 }
305
306 // Now add the words. Words are non-space characters at the beginning of
307 // lines, and must be at least one UChar.
308 current = wordSourceU;
309 UChar *candidate = current;
310 uc = *current++;
311 int32_t length = 0;
312
313 while (uc) {
314 while (uc && !u_isspace(uc)) {
315 ++length;
316 uc = *current++;
317 }
318 if (length > 0) {
319 mtd->addWord(candidate, length, status);
320 if (U_FAILURE(status)) {
321 fprintf(stderr, "MutableTrieDictionary::addWord: ICU Error \"%s\"\n",
322 u_errorName(status));
323 exit(status);
324 }
325 }
326 // Find beginning of next line
327 while (uc && !breaks.contains(uc)) {
328 uc = *current++;
329 }
330 while (uc && breaks.contains(uc)) {
331 uc = *current++;
332 }
333 candidate = current-1;
334 length = 0;
335 }
336
337 // Get rid of the Unicode text buffer
338 delete[] wordSourceU;
339
340 // Now, create a CompactTrieDictionary from the mutable dictionary
341 CompactTrieDictionary *ctd = new CompactTrieDictionary(*mtd, status);
342 if (U_FAILURE(status)) {
343 fprintf(stderr, "new CompactTrieDictionary: ICU Error \"%s\"\n", u_errorName(status));
344 exit(status);
345 }
346
347 // Get rid of the MutableTrieDictionary
348 delete mtd;
349
350 //
351 // Get the binary data from the dictionary.
352 //
353 uint32_t outDataSize = ctd->dataSize();
354 const uint8_t *outData = (const uint8_t *)ctd->data();
355
356 //
357 // Create the output file
358 //
359 size_t bytesWritten;
360 UNewDataMemory *pData;
361 pData = udata_create(outDir, NULL, outFileName, &(dh.info), copyright, &status);
362 if(U_FAILURE(status)) {
363 fprintf(stderr, "genctd: Could not open output file \"%s\", \"%s\"\n",
364 outFileName, u_errorName(status));
365 exit(status);
366 }
367
368
369 // Write the data itself.
370 udata_writeBlock(pData, outData, outDataSize);
371 // finish up
372 bytesWritten = udata_finish(pData, &status);
373 if(U_FAILURE(status)) {
374 fprintf(stderr, "genctd: error \"%s\" writing the output file\n", u_errorName(status));
375 exit(status);
376 }
377
378 if (bytesWritten != outDataSize) {
379 fprintf(stderr, "Error writing to output file \"%s\"\n", outFileName);
380 exit(-1);
381 }
382
383 // Get rid of the CompactTrieDictionary
384 delete ctd;
385
386 u_cleanup();
387
388 printf("genctd: tool completed successfully.\n");
389 return 0;
390
391 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
392 }
393