]> git.saurik.com Git - apple/icu.git/blob - icuSources/tools/genbrk/genbrk.cpp
ICU-57163.0.1.tar.gz
[apple/icu.git] / icuSources / tools / genbrk / genbrk.cpp
1 /*
2 **********************************************************************
3 * Copyright (C) 2002-2016, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
6 *
7 * File genbrk.cpp
8 */
9
10 //--------------------------------------------------------------------
11 //
12 // Tool for generating RuleBasedBreakIterator data files (.brk files).
13 // .brk files contain the precompiled rules for standard types
14 // of iterators - word, line, sentence, etc.
15 //
16 // Usage: genbrk [options] -r rule-file.txt -o output-file.brk
17 //
18 // options: -v verbose
19 // -? or -h help
20 //
21 // The input rule file is a plain text file containing break rules
22 // in the input format accepted by RuleBasedBreakIterators. The
23 // file can be encoded as UTF-8, or UTF-16 (either endian), or
24 // in the default code page (platform dependent). UTF-encoded
25 // files must include a BOM.
26 //
27 //--------------------------------------------------------------------
28
29 #include "unicode/utypes.h"
30 #include "unicode/ucnv.h"
31 #include "unicode/unistr.h"
32 #include "unicode/rbbi.h"
33 #include "unicode/uclean.h"
34 #include "unicode/udata.h"
35 #include "unicode/putil.h"
36
37 #include "uoptions.h"
38 #include "unewdata.h"
39 #include "ucmndata.h"
40 #include "rbbidata.h"
41 #include "cmemory.h"
42
43 #include <stdio.h>
44 #include <stdlib.h>
45 #include <string.h>
46
47 U_NAMESPACE_USE
48
49 static char *progName;
50 static UOption options[]={
51 UOPTION_HELP_H, /* 0 */
52 UOPTION_HELP_QUESTION_MARK, /* 1 */
53 UOPTION_VERBOSE, /* 2 */
54 { "rules", NULL, NULL, NULL, 'r', UOPT_REQUIRES_ARG, 0 }, /* 3 */
55 { "out", NULL, NULL, NULL, 'o', UOPT_REQUIRES_ARG, 0 }, /* 4 */
56 UOPTION_ICUDATADIR, /* 5 */
57 UOPTION_DESTDIR, /* 6 */
58 UOPTION_COPYRIGHT, /* 7 */
59 UOPTION_QUIET, /* 8 */
60 };
61
62 void usageAndDie(int retCode) {
63 printf("Usage: %s [-v] [-options] -r rule-file -o output-file\n", progName);
64 printf("\tRead in break iteration rules text and write out the binary data\n"
65 "options:\n"
66 "\t-h or -? or --help this usage text\n"
67 "\t-V or --version show a version message\n"
68 "\t-c or --copyright include a copyright notice\n"
69 "\t-v or --verbose turn on verbose output\n"
70 "\t-q or --quiet do not display warnings and progress\n"
71 "\t-i or --icudatadir directory for locating any needed intermediate data files,\n"
72 "\t followed by path, defaults to %s\n"
73 "\t-d or --destdir destination directory, followed by the path\n",
74 u_getDataDirectory());
75 exit (retCode);
76 }
77
78
79 #if UCONFIG_NO_BREAK_ITERATION || UCONFIG_NO_FILE_IO
80
81 /* dummy UDataInfo cf. udata.h */
82 static UDataInfo dummyDataInfo = {
83 sizeof(UDataInfo),
84 0,
85
86 U_IS_BIG_ENDIAN,
87 U_CHARSET_FAMILY,
88 U_SIZEOF_UCHAR,
89 0,
90
91 { 0, 0, 0, 0 }, /* dummy dataFormat */
92 { 0, 0, 0, 0 }, /* dummy formatVersion */
93 { 0, 0, 0, 0 } /* dummy dataVersion */
94 };
95
96 #else
97
98 //
99 // Set up the ICU data header, defined in ucmndata.h
100 //
101 DataHeader dh ={
102 {sizeof(DataHeader), // Struct MappedData
103 0xda,
104 0x27},
105
106 { // struct UDataInfo
107 sizeof(UDataInfo), // size
108 0, // reserved
109 U_IS_BIG_ENDIAN,
110 U_CHARSET_FAMILY,
111 U_SIZEOF_UCHAR,
112 0, // reserved
113
114 { 0x42, 0x72, 0x6b, 0x20 }, // dataFormat="Brk "
115 { 0xff, 0, 0, 0 }, // formatVersion. Filled in later with values
116 // from the RBBI rule builder. The values declared
117 // here should never appear in any real RBBI data.
118 { 4, 1, 0, 0 } // dataVersion (Unicode version)
119 }};
120
121 #endif
122
123 //----------------------------------------------------------------------------
124 //
125 // main for genbrk
126 //
127 //----------------------------------------------------------------------------
128 int main(int argc, char **argv) {
129 UErrorCode status = U_ZERO_ERROR;
130 const char *ruleFileName;
131 const char *outFileName;
132 const char *outDir = NULL;
133 const char *copyright = NULL;
134
135 //
136 // Pick up and check the command line arguments,
137 // using the standard ICU tool utils option handling.
138 //
139 U_MAIN_INIT_ARGS(argc, argv);
140 progName = argv[0];
141 argc=u_parseArgs(argc, argv, UPRV_LENGTHOF(options), options);
142 if(argc<0) {
143 // Unrecognized option
144 fprintf(stderr, "error in command line argument \"%s\"\n", argv[-argc]);
145 usageAndDie(U_ILLEGAL_ARGUMENT_ERROR);
146 }
147
148 if(options[0].doesOccur || options[1].doesOccur) {
149 // -? or -h for help.
150 usageAndDie(0);
151 }
152
153 if (!(options[3].doesOccur && options[4].doesOccur)) {
154 fprintf(stderr, "rule file and output file must both be specified.\n");
155 usageAndDie(U_ILLEGAL_ARGUMENT_ERROR);
156 }
157 ruleFileName = options[3].value;
158 outFileName = options[4].value;
159
160 if (options[5].doesOccur) {
161 u_setDataDirectory(options[5].value);
162 }
163
164 status = U_ZERO_ERROR;
165
166 /* Combine the directory with the file name */
167 if(options[6].doesOccur) {
168 outDir = options[6].value;
169 }
170 if (options[7].doesOccur) {
171 copyright = U_COPYRIGHT_STRING;
172 }
173
174 #if UCONFIG_NO_BREAK_ITERATION || UCONFIG_NO_FILE_IO
175
176 UNewDataMemory *pData;
177 char msg[1024];
178
179 /* write message with just the name */
180 sprintf(msg, "genbrk writes dummy %s because of UCONFIG_NO_BREAK_ITERATION and/or UCONFIG_NO_FILE_IO, see uconfig.h", outFileName);
181 fprintf(stderr, "%s\n", msg);
182
183 /* write the dummy data file */
184 pData = udata_create(outDir, NULL, outFileName, &dummyDataInfo, NULL, &status);
185 udata_writeBlock(pData, msg, strlen(msg));
186 udata_finish(pData, &status);
187 return (int)status;
188
189 #else
190 /* Initialize ICU */
191 u_init(&status);
192 if (U_FAILURE(status)) {
193 fprintf(stderr, "%s: can not initialize ICU. status = %s\n",
194 argv[0], u_errorName(status));
195 exit(1);
196 }
197 status = U_ZERO_ERROR;
198
199 //
200 // Read in the rule source file
201 //
202 long result;
203 long ruleFileSize;
204 FILE *file;
205 char *ruleBufferC;
206
207 file = fopen(ruleFileName, "rb");
208 if( file == 0 ) {
209 fprintf(stderr, "Could not open file \"%s\"\n", ruleFileName);
210 exit(-1);
211 }
212 fseek(file, 0, SEEK_END);
213 ruleFileSize = ftell(file);
214 fseek(file, 0, SEEK_SET);
215 ruleBufferC = new char[ruleFileSize+10];
216
217 result = (long)fread(ruleBufferC, 1, ruleFileSize, file);
218 if (result != ruleFileSize) {
219 fprintf(stderr, "Error reading file \"%s\"\n", ruleFileName);
220 exit (-1);
221 }
222 ruleBufferC[ruleFileSize]=0;
223 fclose(file);
224
225 //
226 // Look for a Unicode Signature (BOM) on the rule file
227 //
228 int32_t signatureLength;
229 const char * ruleSourceC = ruleBufferC;
230 const char* encoding = ucnv_detectUnicodeSignature(
231 ruleSourceC, ruleFileSize, &signatureLength, &status);
232 if (U_FAILURE(status)) {
233 exit(status);
234 }
235 if(encoding!=NULL ){
236 ruleSourceC += signatureLength;
237 ruleFileSize -= signatureLength;
238 }
239
240 //
241 // Open a converter to take the rule file to UTF-16
242 //
243 UConverter* conv;
244 conv = ucnv_open(encoding, &status);
245 if (U_FAILURE(status)) {
246 fprintf(stderr, "ucnv_open: ICU Error \"%s\"\n", u_errorName(status));
247 exit(status);
248 }
249
250 //
251 // Convert the rules to UChar.
252 // Preflight first to determine required buffer size.
253 //
254 uint32_t destCap = ucnv_toUChars(conv,
255 NULL, // dest,
256 0, // destCapacity,
257 ruleSourceC,
258 ruleFileSize,
259 &status);
260 if (status != U_BUFFER_OVERFLOW_ERROR) {
261 fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
262 exit(status);
263 };
264
265 status = U_ZERO_ERROR;
266 UChar *ruleSourceU = new UChar[destCap+1];
267 ucnv_toUChars(conv,
268 ruleSourceU, // dest,
269 destCap+1,
270 ruleSourceC,
271 ruleFileSize,
272 &status);
273 if (U_FAILURE(status)) {
274 fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
275 exit(status);
276 };
277 ucnv_close(conv);
278
279
280 //
281 // Put the source rules into a UnicodeString
282 //
283 UnicodeString ruleSourceS(FALSE, ruleSourceU, destCap);
284
285 //
286 // Create the break iterator from the rules
287 // This will compile the rules.
288 //
289 UParseError parseError;
290 parseError.line = 0;
291 parseError.offset = 0;
292 RuleBasedBreakIterator *bi = new RuleBasedBreakIterator(ruleSourceS, parseError, status);
293 if (U_FAILURE(status)) {
294 fprintf(stderr, "createRuleBasedBreakIterator: ICU Error \"%s\" at line %d, column %d\n",
295 u_errorName(status), (int)parseError.line, (int)parseError.offset);
296 exit(status);
297 };
298
299
300 //
301 // Get the compiled rule data from the break iterator.
302 //
303 uint32_t outDataSize;
304 const uint8_t *outData;
305 outData = bi->getBinaryRules(outDataSize);
306
307 // Copy the data format version numbers from the RBBI data header into the UDataMemory header.
308 uprv_memcpy(dh.info.formatVersion, ((RBBIDataHeader *)outData)->fFormatVersion, sizeof(dh.info.formatVersion));
309
310 //
311 // Create the output file
312 //
313 size_t bytesWritten;
314 UNewDataMemory *pData;
315 pData = udata_create(outDir, NULL, outFileName, &(dh.info), copyright, &status);
316 if(U_FAILURE(status)) {
317 fprintf(stderr, "genbrk: Could not open output file \"%s\", \"%s\"\n",
318 outFileName, u_errorName(status));
319 exit(status);
320 }
321
322
323 // Write the data itself.
324 udata_writeBlock(pData, outData, outDataSize);
325 // finish up
326 bytesWritten = udata_finish(pData, &status);
327 if(U_FAILURE(status)) {
328 fprintf(stderr, "genbrk: error %d writing the output file\n", status);
329 exit(status);
330 }
331
332 if (bytesWritten != outDataSize) {
333 fprintf(stderr, "Error writing to output file \"%s\"\n", outFileName);
334 exit(-1);
335 }
336
337 delete bi;
338 delete[] ruleSourceU;
339 delete[] ruleBufferC;
340 u_cleanup();
341
342
343 if(!options[8].doesOccur) {
344 printf("genbrk: tool completed successfully.\n");
345 }
346 return 0;
347
348 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
349 }
350