]> git.saurik.com Git - apple/icu.git/blame - icuSources/tools/genbrk/genbrk.cpp
ICU-6.2.10.tar.gz
[apple/icu.git] / icuSources / tools / genbrk / genbrk.cpp
CommitLineData
b75a7d8f
A
/*
**********************************************************************
*   Copyright (C) 2002-2004, International Business Machines
*   Corporation and others.  All Rights Reserved.
**********************************************************************
*
* File genbrk.cpp
*/
9
//--------------------------------------------------------------------
//
//   Tool for generating RuleBasedBreakIterator data files (.brk files).
//   .brk files contain the precompiled rules for standard types
//   of iterators - word, line, sentence, etc.
//
//   Usage:  genbrk [options] -r rule-file.txt -o output-file.brk
//
//       options:   -v                 verbose
//                  -? or -h           help
//                  -c                 include a copyright notice
//                  -i                 ICU data directory, followed by the path
//                  -d                 destination directory, followed by the path
//
//   The input rule file is a plain text file containing break rules
//   in the input format accepted by RuleBasedBreakIterators.  The
//   file can be encoded as utf-8, or utf-16 (either endian), or
//   in the default code page (which is platform dependent).  utf-encoded
//   files must include a BOM.
//
//--------------------------------------------------------------------
28
b75a7d8f
A
29#include "unicode/utypes.h"
30#include "unicode/ucnv.h"
31#include "unicode/unistr.h"
32#include "unicode/rbbi.h"
33#include "unicode/uclean.h"
34#include "unicode/udata.h"
374ca955 35#include "unicode/putil.h"
b75a7d8f
A
36
37#include "uoptions.h"
38#include "unewdata.h"
39#include "ucmndata.h"
40
41#include <stdio.h>
42#include <stdlib.h>
43#include <string.h>
44
45#define DATA_TYPE "brk"
46
47static char *progName;
48static UOption options[]={
49 UOPTION_HELP_H, /* 0 */
50 UOPTION_HELP_QUESTION_MARK, /* 1 */
51 UOPTION_VERBOSE, /* 2 */
52 { "rules", NULL, NULL, NULL, 'r', UOPT_REQUIRES_ARG, 0 }, /* 3 */
53 { "out", NULL, NULL, NULL, 'o', UOPT_REQUIRES_ARG, 0 }, /* 4 */
54 UOPTION_ICUDATADIR, /* 5 */
374ca955
A
55 UOPTION_DESTDIR, /* 6 */
56 UOPTION_COPYRIGHT, /* 7 */
b75a7d8f
A
57};
58
59void usageAndDie(int retCode) {
374ca955
A
60 printf("Usage: %s [-v] [-options] -r rule-file -o output-file\n", progName);
61 printf("\tRead in break iteration rules text and write out the binary data\n"
62 "options:\n"
63 "\t-h or -? or --help this usage text\n"
64 "\t-V or --version show a version message\n"
65 "\t-c or --copyright include a copyright notice\n"
66 "\t-v or --verbose turn on verbose output\n"
67 "\t-i or --icudatadir directory for locating any needed intermediate data files,\n"
68 "\t followed by path, defaults to %s\n"
69 "\t-d or --destdir destination directory, followed by the path\n",
70 u_getDataDirectory());
b75a7d8f
A
71 exit (retCode);
72}
73
74
75#if UCONFIG_NO_BREAK_ITERATION
76
77/* dummy UDataInfo cf. udata.h */
78static UDataInfo dummyDataInfo = {
79 sizeof(UDataInfo),
80 0,
81
82 U_IS_BIG_ENDIAN,
83 U_CHARSET_FAMILY,
84 U_SIZEOF_UCHAR,
85 0,
86
87 { 0, 0, 0, 0 }, /* dummy dataFormat */
88 { 0, 0, 0, 0 }, /* dummy formatVersion */
89 { 0, 0, 0, 0 } /* dummy dataVersion */
90};
91
92#else
93
94//
95// Set up the ICU data header, defined in ucmndata.h
96//
97DataHeader dh ={
98 {sizeof(DataHeader), // Struct MappedData
99 0xda,
100 0x27},
101
102 { // struct UDataInfo
103 sizeof(UDataInfo), // size
104 0, // reserved
105 U_IS_BIG_ENDIAN,
106 U_CHARSET_FAMILY,
107 U_SIZEOF_UCHAR,
108 0, // reserved
109
110 { 0x42, 0x72, 0x6b, 0x20 }, // dataFormat="Brk "
374ca955
A
111 { 3, 0, 0, 0 }, // formatVersion
112 { 4, 0, 0, 0 } // dataVersion (Unicode version)
b75a7d8f
A
113 }};
114
115#endif
116
117//----------------------------------------------------------------------------
118//
119// main for genbrk
120//
121//----------------------------------------------------------------------------
122int main(int argc, char **argv) {
123 UErrorCode status = U_ZERO_ERROR;
124 const char *ruleFileName;
125 const char *outFileName;
126 const char *outDir = NULL;
374ca955 127 const char *copyright = NULL;
b75a7d8f
A
128
129 //
130 // Pick up and check the command line arguments,
131 // using the standard ICU tool utils option handling.
132 //
133 U_MAIN_INIT_ARGS(argc, argv);
134 progName = argv[0];
135 argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), options);
136 if(argc<0) {
137 // Unrecognized option
138 fprintf(stderr, "error in command line argument \"%s\"\n", argv[-argc]);
139 usageAndDie(U_ILLEGAL_ARGUMENT_ERROR);
140 }
141
142 if(options[0].doesOccur || options[1].doesOccur) {
143 // -? or -h for help.
144 usageAndDie(0);
145 }
146
147 if (!(options[3].doesOccur && options[4].doesOccur)) {
148 fprintf(stderr, "rule file and output file must both be specified.\n");
149 usageAndDie(U_ILLEGAL_ARGUMENT_ERROR);
150 }
151 ruleFileName = options[3].value;
152 outFileName = options[4].value;
b75a7d8f
A
153
154 if (options[5].doesOccur) {
155 u_setDataDirectory(options[5].value);
156 }
157
374ca955
A
158 /* Initialize ICU */
159 u_init(&status);
160 if (U_FAILURE(status)) {
161 fprintf(stderr, "%s: can not initialize ICU. status = %s\n",
162 argv[0], u_errorName(status));
163 exit(1);
164 }
165 status = U_ZERO_ERROR;
166
b75a7d8f
A
167 /* Combine the directory with the file name */
168 if(options[6].doesOccur) {
169 outDir = options[6].value;
b75a7d8f 170 }
374ca955
A
171 if (options[7].doesOccur) {
172 copyright = U_COPYRIGHT_STRING;
b75a7d8f 173 }
b75a7d8f
A
174
175#if UCONFIG_NO_BREAK_ITERATION
176
177 UNewDataMemory *pData;
178 char msg[2048], folder[2048], name[32];
179 char *basename;
180 int length;
181
182 /* split the outFileName into folder + name + type */
183 strcpy(folder, outFileName);
184 basename = strrchr(folder, U_FILE_SEP_CHAR);
185 if(basename == NULL) {
186 basename = folder;
187 } else {
188 ++basename;
189 }
190
191 /* copy the data name and remove it from the folder */
192 strcpy(name, basename);
193 *basename = 0;
194
195 /* write message with just the name */
196 sprintf(msg, "genbrk writes dummy %s because of UCONFIG_NO_BREAK_ITERATION, see uconfig.h", name);
197 fprintf(stderr, "%s\n", msg);
198
199 /* remove the type suffix (hardcode to DATA_TYPE) */
200 length = strlen(name);
201 if(length > 4 && name[length - 4] == '.') {
202 name[length - 4] = 0;
203 }
204
205 /* write the dummy data file */
206 pData = udata_create(folder, DATA_TYPE, name, &dummyDataInfo, NULL, &status);
207 udata_writeBlock(pData, msg, strlen(msg));
208 udata_finish(pData, &status);
209 return (int)status;
210
211#else
212
213 //
214 // Read in the rule source file
215 //
374ca955 216 long result;
b75a7d8f
A
217 long ruleFileSize;
218 FILE *file;
219 char *ruleBufferC;
220
221 file = fopen(ruleFileName, "rb");
222 if( file == 0 ) {
223 fprintf(stderr, "Could not open file \"%s\"\n", ruleFileName);
224 exit(-1);
225 }
226 fseek(file, 0, SEEK_END);
227 ruleFileSize = ftell(file);
228 fseek(file, 0, SEEK_SET);
229 ruleBufferC = new char[ruleFileSize+10];
230
374ca955 231 result = (long)fread(ruleBufferC, 1, ruleFileSize, file);
b75a7d8f
A
232 if (result != ruleFileSize) {
233 fprintf(stderr, "Error reading file \"%s\"\n", ruleFileName);
234 exit (-1);
235 }
236 ruleBufferC[ruleFileSize]=0;
237 fclose(file);
238
239 //
240 // Look for a Unicode Signature (BOM) on the rule file
241 //
242 int32_t signatureLength;
243 const char * ruleSourceC = ruleBufferC;
244 const char* encoding = ucnv_detectUnicodeSignature(
245 ruleSourceC, ruleFileSize, &signatureLength, &status);
246 if (U_FAILURE(status)) {
247 exit(status);
248 }
249 if(encoding!=NULL ){
250 ruleSourceC += signatureLength;
251 ruleFileSize -= signatureLength;
252 }
253
254 //
255 // Open a converter to take the rule file to UTF-16
256 //
257 UConverter* conv;
258 conv = ucnv_open(encoding, &status);
259 if (U_FAILURE(status)) {
260 fprintf(stderr, "ucnv_open: ICU Error \"%s\"\n", u_errorName(status));
261 exit(status);
262 }
263
264 //
265 // Convert the rules to UChar.
266 // Preflight first to determine required buffer size.
267 //
268 uint32_t destCap = ucnv_toUChars(conv,
269 NULL, // dest,
270 0, // destCapacity,
271 ruleSourceC,
272 ruleFileSize,
273 &status);
274 if (status != U_BUFFER_OVERFLOW_ERROR) {
275 fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
276 exit(status);
277 };
278
279 status = U_ZERO_ERROR;
280 UChar *ruleSourceU = new UChar[destCap+1];
281 ucnv_toUChars(conv,
282 ruleSourceU, // dest,
283 destCap+1,
284 ruleSourceC,
285 ruleFileSize,
286 &status);
287 if (U_FAILURE(status)) {
288 fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
289 exit(status);
290 };
291 ucnv_close(conv);
292
293
294 //
295 // Put the source rules into a UnicodeString
296 //
297 UnicodeString ruleSourceS(FALSE, ruleSourceU, destCap);
298
299 //
300 // Create the break iterator from the rules
301 // This will compile the rules.
302 //
303 UParseError parseError;
374ca955
A
304 parseError.line = 0;
305 parseError.offset = 0;
b75a7d8f
A
306 RuleBasedBreakIterator *bi = new RuleBasedBreakIterator(ruleSourceS, parseError, status);
307 if (U_FAILURE(status)) {
308 fprintf(stderr, "createRuleBasedBreakIterator: ICU Error \"%s\" at line %d, column %d\n",
374ca955 309 u_errorName(status), (int)parseError.line, (int)parseError.offset);
b75a7d8f
A
310 exit(status);
311 };
312
313
314 //
315 // Get the compiled rule data from the break iterator.
316 //
317 uint32_t outDataSize;
318 const uint8_t *outData;
319 outData = bi->getBinaryRules(outDataSize);
320
321
322 //
323 // Create the output file
324 //
325 size_t bytesWritten;
374ca955
A
326 UNewDataMemory *pData;
327 pData = udata_create(outDir, NULL, outFileName, &(dh.info), copyright, &status);
328 if(U_FAILURE(status)) {
329 fprintf(stderr, "genbrk: Could not open output file \"%s\", \"%s\"\n",
330 outFileName, u_errorName(status));
331 exit(status);
b75a7d8f 332 }
b75a7d8f 333 // Write the data itself.
374ca955
A
334 udata_writeBlock(pData, outData, outDataSize);
335 // finish up
336 bytesWritten = udata_finish(pData, &status);
337 if(U_FAILURE(status)) {
338 fprintf(stderr, "genbrk: error %d writing the output file\n", status);
339 exit(status);
340 }
341
b75a7d8f 342 if (bytesWritten != outDataSize) {
374ca955 343 fprintf(stderr, "Error writing to output file \"%s\"\n", outFileName);
b75a7d8f
A
344 exit(-1);
345 }
346
b75a7d8f
A
347 delete bi;
348 delete[] ruleSourceU;
349 delete[] ruleBufferC;
b75a7d8f
A
350 u_cleanup();
351
352
353 printf("genbrk: tool completed successfully.\n");
354 return 0;
355
356#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
357}
374ca955 358