]> git.saurik.com Git - apple/icu.git/blob - icuSources/tools/genbrk/genbrk.cpp
ICU-8.11.4.tar.gz
[apple/icu.git] / icuSources / tools / genbrk / genbrk.cpp
1 /*
2 **********************************************************************
3 * Copyright (C) 2002-2006, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
6 *
*   File genbrk.cpp
8 */
9
10 //--------------------------------------------------------------------
11 //
12 // Tool for generating RuleBasedBreakIterator data files (.brk files).
13 // .brk files contain the precompiled rules for standard types
14 // of iterators - word, line, sentence, etc.
15 //
16 // Usage: genbrk [options] -r rule-file.txt -o output-file.brk
17 //
18 // options: -v verbose
19 // -? or -h help
20 //
21 // The input rule file is a plain text file containing break rules
22 // in the input format accepted by RuleBasedBreakIterators. The
23 // file can be encoded as utf-8, or utf-16 (either endian), or
//           in the default code page (platform dependent).  UTF-encoded
//           files must include a BOM.
26 //
27 //--------------------------------------------------------------------
28
29 #include "unicode/utypes.h"
30 #include "unicode/ucnv.h"
31 #include "unicode/unistr.h"
32 #include "unicode/rbbi.h"
33 #include "unicode/uclean.h"
34 #include "unicode/udata.h"
35 #include "unicode/putil.h"
36
37 #include "uoptions.h"
38 #include "unewdata.h"
39 #include "ucmndata.h"
40 #include "rbbidata.h"
41 #include "cmemory.h"
42
43 #include <stdio.h>
44 #include <stdlib.h>
45 #include <string.h>
46
47 static char *progName;
48 static UOption options[]={
49 UOPTION_HELP_H, /* 0 */
50 UOPTION_HELP_QUESTION_MARK, /* 1 */
51 UOPTION_VERBOSE, /* 2 */
52 { "rules", NULL, NULL, NULL, 'r', UOPT_REQUIRES_ARG, 0 }, /* 3 */
53 { "out", NULL, NULL, NULL, 'o', UOPT_REQUIRES_ARG, 0 }, /* 4 */
54 UOPTION_ICUDATADIR, /* 5 */
55 UOPTION_DESTDIR, /* 6 */
56 UOPTION_COPYRIGHT, /* 7 */
57 };
58
59 void usageAndDie(int retCode) {
60 printf("Usage: %s [-v] [-options] -r rule-file -o output-file\n", progName);
61 printf("\tRead in break iteration rules text and write out the binary data\n"
62 "options:\n"
63 "\t-h or -? or --help this usage text\n"
64 "\t-V or --version show a version message\n"
65 "\t-c or --copyright include a copyright notice\n"
66 "\t-v or --verbose turn on verbose output\n"
67 "\t-i or --icudatadir directory for locating any needed intermediate data files,\n"
68 "\t followed by path, defaults to %s\n"
69 "\t-d or --destdir destination directory, followed by the path\n",
70 u_getDataDirectory());
71 exit (retCode);
72 }
73
74
75 #if UCONFIG_NO_BREAK_ITERATION
76
77 /* dummy UDataInfo cf. udata.h */
78 static UDataInfo dummyDataInfo = {
79 sizeof(UDataInfo),
80 0,
81
82 U_IS_BIG_ENDIAN,
83 U_CHARSET_FAMILY,
84 U_SIZEOF_UCHAR,
85 0,
86
87 { 0, 0, 0, 0 }, /* dummy dataFormat */
88 { 0, 0, 0, 0 }, /* dummy formatVersion */
89 { 0, 0, 0, 0 } /* dummy dataVersion */
90 };
91
92 #else
93
94 //
95 // Set up the ICU data header, defined in ucmndata.h
96 //
97 DataHeader dh ={
98 {sizeof(DataHeader), // Struct MappedData
99 0xda,
100 0x27},
101
102 { // struct UDataInfo
103 sizeof(UDataInfo), // size
104 0, // reserved
105 U_IS_BIG_ENDIAN,
106 U_CHARSET_FAMILY,
107 U_SIZEOF_UCHAR,
108 0, // reserved
109
110 { 0x42, 0x72, 0x6b, 0x20 }, // dataFormat="Brk "
111 { 0xff, 0, 0, 0 }, // formatVersion. Filled in later with values
112 // from the RBBI rule builder. The values declared
113 // here should never appear in any real RBBI data.
114 { 4, 1, 0, 0 } // dataVersion (Unicode version)
115 }};
116
117 #endif
118
119 //----------------------------------------------------------------------------
120 //
121 // main for genbrk
122 //
123 //----------------------------------------------------------------------------
124 int main(int argc, char **argv) {
125 UErrorCode status = U_ZERO_ERROR;
126 const char *ruleFileName;
127 const char *outFileName;
128 const char *outDir = NULL;
129 const char *copyright = NULL;
130
131 //
132 // Pick up and check the command line arguments,
133 // using the standard ICU tool utils option handling.
134 //
135 U_MAIN_INIT_ARGS(argc, argv);
136 progName = argv[0];
137 argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), options);
138 if(argc<0) {
139 // Unrecognized option
140 fprintf(stderr, "error in command line argument \"%s\"\n", argv[-argc]);
141 usageAndDie(U_ILLEGAL_ARGUMENT_ERROR);
142 }
143
144 if(options[0].doesOccur || options[1].doesOccur) {
145 // -? or -h for help.
146 usageAndDie(0);
147 }
148
149 if (!(options[3].doesOccur && options[4].doesOccur)) {
150 fprintf(stderr, "rule file and output file must both be specified.\n");
151 usageAndDie(U_ILLEGAL_ARGUMENT_ERROR);
152 }
153 ruleFileName = options[3].value;
154 outFileName = options[4].value;
155
156 if (options[5].doesOccur) {
157 u_setDataDirectory(options[5].value);
158 }
159
160 /* Initialize ICU */
161 u_init(&status);
162 if (U_FAILURE(status)) {
163 fprintf(stderr, "%s: can not initialize ICU. status = %s\n",
164 argv[0], u_errorName(status));
165 exit(1);
166 }
167 status = U_ZERO_ERROR;
168
169 /* Combine the directory with the file name */
170 if(options[6].doesOccur) {
171 outDir = options[6].value;
172 }
173 if (options[7].doesOccur) {
174 copyright = U_COPYRIGHT_STRING;
175 }
176
177 #if UCONFIG_NO_BREAK_ITERATION
178
179 UNewDataMemory *pData;
180 char msg[1024];
181
182 /* write message with just the name */
183 sprintf(msg, "genbrk writes dummy %s because of UCONFIG_NO_BREAK_ITERATION, see uconfig.h", outFileName);
184 fprintf(stderr, "%s\n", msg);
185
186 /* write the dummy data file */
187 pData = udata_create(outDir, NULL, outFileName, &dummyDataInfo, NULL, &status);
188 udata_writeBlock(pData, msg, strlen(msg));
189 udata_finish(pData, &status);
190 return (int)status;
191
192 #else
193
194 //
195 // Read in the rule source file
196 //
197 long result;
198 long ruleFileSize;
199 FILE *file;
200 char *ruleBufferC;
201
202 file = fopen(ruleFileName, "rb");
203 if( file == 0 ) {
204 fprintf(stderr, "Could not open file \"%s\"\n", ruleFileName);
205 exit(-1);
206 }
207 fseek(file, 0, SEEK_END);
208 ruleFileSize = ftell(file);
209 fseek(file, 0, SEEK_SET);
210 ruleBufferC = new char[ruleFileSize+10];
211
212 result = (long)fread(ruleBufferC, 1, ruleFileSize, file);
213 if (result != ruleFileSize) {
214 fprintf(stderr, "Error reading file \"%s\"\n", ruleFileName);
215 exit (-1);
216 }
217 ruleBufferC[ruleFileSize]=0;
218 fclose(file);
219
220 //
221 // Look for a Unicode Signature (BOM) on the rule file
222 //
223 int32_t signatureLength;
224 const char * ruleSourceC = ruleBufferC;
225 const char* encoding = ucnv_detectUnicodeSignature(
226 ruleSourceC, ruleFileSize, &signatureLength, &status);
227 if (U_FAILURE(status)) {
228 exit(status);
229 }
230 if(encoding!=NULL ){
231 ruleSourceC += signatureLength;
232 ruleFileSize -= signatureLength;
233 }
234
235 //
236 // Open a converter to take the rule file to UTF-16
237 //
238 UConverter* conv;
239 conv = ucnv_open(encoding, &status);
240 if (U_FAILURE(status)) {
241 fprintf(stderr, "ucnv_open: ICU Error \"%s\"\n", u_errorName(status));
242 exit(status);
243 }
244
245 //
246 // Convert the rules to UChar.
247 // Preflight first to determine required buffer size.
248 //
249 uint32_t destCap = ucnv_toUChars(conv,
250 NULL, // dest,
251 0, // destCapacity,
252 ruleSourceC,
253 ruleFileSize,
254 &status);
255 if (status != U_BUFFER_OVERFLOW_ERROR) {
256 fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
257 exit(status);
258 };
259
260 status = U_ZERO_ERROR;
261 UChar *ruleSourceU = new UChar[destCap+1];
262 ucnv_toUChars(conv,
263 ruleSourceU, // dest,
264 destCap+1,
265 ruleSourceC,
266 ruleFileSize,
267 &status);
268 if (U_FAILURE(status)) {
269 fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
270 exit(status);
271 };
272 ucnv_close(conv);
273
274
275 //
276 // Put the source rules into a UnicodeString
277 //
278 UnicodeString ruleSourceS(FALSE, ruleSourceU, destCap);
279
280 //
281 // Create the break iterator from the rules
282 // This will compile the rules.
283 //
284 UParseError parseError;
285 parseError.line = 0;
286 parseError.offset = 0;
287 RuleBasedBreakIterator *bi = new RuleBasedBreakIterator(ruleSourceS, parseError, status);
288 if (U_FAILURE(status)) {
289 fprintf(stderr, "createRuleBasedBreakIterator: ICU Error \"%s\" at line %d, column %d\n",
290 u_errorName(status), (int)parseError.line, (int)parseError.offset);
291 exit(status);
292 };
293
294
295 //
296 // Get the compiled rule data from the break iterator.
297 //
298 uint32_t outDataSize;
299 const uint8_t *outData;
300 outData = bi->getBinaryRules(outDataSize);
301
302 // Copy the data format version numbers from the RBBI data header into the UDataMemory header.
303 uprv_memcpy(dh.info.formatVersion, ((RBBIDataHeader *)outData)->fFormatVersion, sizeof(dh.info.formatVersion));
304
305 //
306 // Create the output file
307 //
308 size_t bytesWritten;
309 UNewDataMemory *pData;
310 pData = udata_create(outDir, NULL, outFileName, &(dh.info), copyright, &status);
311 if(U_FAILURE(status)) {
312 fprintf(stderr, "genbrk: Could not open output file \"%s\", \"%s\"\n",
313 outFileName, u_errorName(status));
314 exit(status);
315 }
316
317
318 // Write the data itself.
319 udata_writeBlock(pData, outData, outDataSize);
320 // finish up
321 bytesWritten = udata_finish(pData, &status);
322 if(U_FAILURE(status)) {
323 fprintf(stderr, "genbrk: error %d writing the output file\n", status);
324 exit(status);
325 }
326
327 if (bytesWritten != outDataSize) {
328 fprintf(stderr, "Error writing to output file \"%s\"\n", outFileName);
329 exit(-1);
330 }
331
332 delete bi;
333 delete[] ruleSourceU;
334 delete[] ruleBufferC;
335 u_cleanup();
336
337
338 printf("genbrk: tool completed successfully.\n");
339 return 0;
340
341 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
342 }
343