]> git.saurik.com Git - apple/icu.git/blame - icuSources/tools/genbrk/genbrk.cpp
ICU-551.51.4.tar.gz
[apple/icu.git] / icuSources / tools / genbrk / genbrk.cpp
CommitLineData
b75a7d8f
A
/*
**********************************************************************
*   Copyright (C) 2002-2009, International Business Machines
*   Corporation and others.  All Rights Reserved.
**********************************************************************
*
* File genbrk.cpp
*/
9
//--------------------------------------------------------------------
//
//   Tool for generating RuleBasedBreakIterator data files (.brk files).
//   .brk files contain the precompiled rules for standard types
//   of iterators - word, line, sentence, etc.
//
//   Usage:  genbrk [options] -r rule-file.txt  -o output-file.brk
//
//       options:   -v         verbose
//                  -? or -h   help
//                  -c         include a copyright notice
//                  -i dir     ICU data directory
//                  -d dir     destination directory
//
//   The input rule file is a plain text file containing break rules
//   in the input format accepted by RuleBasedBreakIterators.  The
//   file can be encoded as UTF-8, or UTF-16 (either endian), or
//   in the default code page (platform dependent).  UTF-encoded
//   files must include a BOM.
//
//--------------------------------------------------------------------
28
b75a7d8f
A
29#include "unicode/utypes.h"
30#include "unicode/ucnv.h"
31#include "unicode/unistr.h"
32#include "unicode/rbbi.h"
33#include "unicode/uclean.h"
34#include "unicode/udata.h"
374ca955 35#include "unicode/putil.h"
b75a7d8f
A
36
37#include "uoptions.h"
38#include "unewdata.h"
39#include "ucmndata.h"
73c04bcf
A
40#include "rbbidata.h"
41#include "cmemory.h"
b75a7d8f
A
42
43#include <stdio.h>
44#include <stdlib.h>
45#include <string.h>
46
46f4442e
A
47U_NAMESPACE_USE
48
b75a7d8f
A
49static char *progName;
50static UOption options[]={
51 UOPTION_HELP_H, /* 0 */
52 UOPTION_HELP_QUESTION_MARK, /* 1 */
53 UOPTION_VERBOSE, /* 2 */
54 { "rules", NULL, NULL, NULL, 'r', UOPT_REQUIRES_ARG, 0 }, /* 3 */
55 { "out", NULL, NULL, NULL, 'o', UOPT_REQUIRES_ARG, 0 }, /* 4 */
56 UOPTION_ICUDATADIR, /* 5 */
374ca955
A
57 UOPTION_DESTDIR, /* 6 */
58 UOPTION_COPYRIGHT, /* 7 */
b75a7d8f
A
59};
60
61void usageAndDie(int retCode) {
374ca955
A
62 printf("Usage: %s [-v] [-options] -r rule-file -o output-file\n", progName);
63 printf("\tRead in break iteration rules text and write out the binary data\n"
64 "options:\n"
65 "\t-h or -? or --help this usage text\n"
66 "\t-V or --version show a version message\n"
67 "\t-c or --copyright include a copyright notice\n"
68 "\t-v or --verbose turn on verbose output\n"
69 "\t-i or --icudatadir directory for locating any needed intermediate data files,\n"
70 "\t followed by path, defaults to %s\n"
71 "\t-d or --destdir destination directory, followed by the path\n",
72 u_getDataDirectory());
b75a7d8f
A
73 exit (retCode);
74}
75
76
729e4ab9 77#if UCONFIG_NO_BREAK_ITERATION || UCONFIG_NO_FILE_IO
b75a7d8f
A
78
79/* dummy UDataInfo cf. udata.h */
80static UDataInfo dummyDataInfo = {
81 sizeof(UDataInfo),
82 0,
83
84 U_IS_BIG_ENDIAN,
85 U_CHARSET_FAMILY,
86 U_SIZEOF_UCHAR,
87 0,
88
89 { 0, 0, 0, 0 }, /* dummy dataFormat */
90 { 0, 0, 0, 0 }, /* dummy formatVersion */
91 { 0, 0, 0, 0 } /* dummy dataVersion */
92};
93
94#else
95
96//
97// Set up the ICU data header, defined in ucmndata.h
98//
99DataHeader dh ={
100 {sizeof(DataHeader), // Struct MappedData
101 0xda,
102 0x27},
103
104 { // struct UDataInfo
105 sizeof(UDataInfo), // size
106 0, // reserved
107 U_IS_BIG_ENDIAN,
108 U_CHARSET_FAMILY,
109 U_SIZEOF_UCHAR,
110 0, // reserved
111
112 { 0x42, 0x72, 0x6b, 0x20 }, // dataFormat="Brk "
73c04bcf
A
113 { 0xff, 0, 0, 0 }, // formatVersion. Filled in later with values
114 // from the RBBI rule builder. The values declared
115 // here should never appear in any real RBBI data.
116 { 4, 1, 0, 0 } // dataVersion (Unicode version)
b75a7d8f
A
117 }};
118
119#endif
120
121//----------------------------------------------------------------------------
122//
123// main for genbrk
124//
125//----------------------------------------------------------------------------
126int main(int argc, char **argv) {
127 UErrorCode status = U_ZERO_ERROR;
128 const char *ruleFileName;
129 const char *outFileName;
130 const char *outDir = NULL;
374ca955 131 const char *copyright = NULL;
b75a7d8f
A
132
133 //
134 // Pick up and check the command line arguments,
135 // using the standard ICU tool utils option handling.
136 //
137 U_MAIN_INIT_ARGS(argc, argv);
138 progName = argv[0];
139 argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), options);
140 if(argc<0) {
141 // Unrecognized option
142 fprintf(stderr, "error in command line argument \"%s\"\n", argv[-argc]);
143 usageAndDie(U_ILLEGAL_ARGUMENT_ERROR);
144 }
145
146 if(options[0].doesOccur || options[1].doesOccur) {
147 // -? or -h for help.
148 usageAndDie(0);
149 }
150
151 if (!(options[3].doesOccur && options[4].doesOccur)) {
152 fprintf(stderr, "rule file and output file must both be specified.\n");
153 usageAndDie(U_ILLEGAL_ARGUMENT_ERROR);
154 }
155 ruleFileName = options[3].value;
156 outFileName = options[4].value;
b75a7d8f
A
157
158 if (options[5].doesOccur) {
159 u_setDataDirectory(options[5].value);
160 }
161
374ca955
A
162 status = U_ZERO_ERROR;
163
b75a7d8f
A
164 /* Combine the directory with the file name */
165 if(options[6].doesOccur) {
166 outDir = options[6].value;
b75a7d8f 167 }
374ca955
A
168 if (options[7].doesOccur) {
169 copyright = U_COPYRIGHT_STRING;
b75a7d8f 170 }
b75a7d8f 171
729e4ab9 172#if UCONFIG_NO_BREAK_ITERATION || UCONFIG_NO_FILE_IO
b75a7d8f
A
173
174 UNewDataMemory *pData;
73c04bcf 175 char msg[1024];
b75a7d8f
A
176
177 /* write message with just the name */
729e4ab9 178 sprintf(msg, "genbrk writes dummy %s because of UCONFIG_NO_BREAK_ITERATION and/or UCONFIG_NO_FILE_IO, see uconfig.h", outFileName);
b75a7d8f
A
179 fprintf(stderr, "%s\n", msg);
180
b75a7d8f 181 /* write the dummy data file */
73c04bcf 182 pData = udata_create(outDir, NULL, outFileName, &dummyDataInfo, NULL, &status);
b75a7d8f
A
183 udata_writeBlock(pData, msg, strlen(msg));
184 udata_finish(pData, &status);
185 return (int)status;
186
187#else
729e4ab9
A
188 /* Initialize ICU */
189 u_init(&status);
190 if (U_FAILURE(status)) {
191 fprintf(stderr, "%s: can not initialize ICU. status = %s\n",
192 argv[0], u_errorName(status));
193 exit(1);
194 }
195 status = U_ZERO_ERROR;
b75a7d8f
A
196
197 //
198 // Read in the rule source file
199 //
374ca955 200 long result;
b75a7d8f
A
201 long ruleFileSize;
202 FILE *file;
203 char *ruleBufferC;
204
205 file = fopen(ruleFileName, "rb");
206 if( file == 0 ) {
207 fprintf(stderr, "Could not open file \"%s\"\n", ruleFileName);
208 exit(-1);
209 }
210 fseek(file, 0, SEEK_END);
211 ruleFileSize = ftell(file);
212 fseek(file, 0, SEEK_SET);
213 ruleBufferC = new char[ruleFileSize+10];
214
374ca955 215 result = (long)fread(ruleBufferC, 1, ruleFileSize, file);
b75a7d8f
A
216 if (result != ruleFileSize) {
217 fprintf(stderr, "Error reading file \"%s\"\n", ruleFileName);
218 exit (-1);
219 }
220 ruleBufferC[ruleFileSize]=0;
221 fclose(file);
222
223 //
224 // Look for a Unicode Signature (BOM) on the rule file
225 //
226 int32_t signatureLength;
227 const char * ruleSourceC = ruleBufferC;
228 const char* encoding = ucnv_detectUnicodeSignature(
229 ruleSourceC, ruleFileSize, &signatureLength, &status);
230 if (U_FAILURE(status)) {
231 exit(status);
232 }
233 if(encoding!=NULL ){
234 ruleSourceC += signatureLength;
235 ruleFileSize -= signatureLength;
236 }
237
238 //
239 // Open a converter to take the rule file to UTF-16
240 //
241 UConverter* conv;
242 conv = ucnv_open(encoding, &status);
243 if (U_FAILURE(status)) {
244 fprintf(stderr, "ucnv_open: ICU Error \"%s\"\n", u_errorName(status));
245 exit(status);
246 }
247
248 //
249 // Convert the rules to UChar.
250 // Preflight first to determine required buffer size.
251 //
252 uint32_t destCap = ucnv_toUChars(conv,
253 NULL, // dest,
254 0, // destCapacity,
255 ruleSourceC,
256 ruleFileSize,
257 &status);
258 if (status != U_BUFFER_OVERFLOW_ERROR) {
259 fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
260 exit(status);
261 };
262
263 status = U_ZERO_ERROR;
264 UChar *ruleSourceU = new UChar[destCap+1];
265 ucnv_toUChars(conv,
266 ruleSourceU, // dest,
267 destCap+1,
268 ruleSourceC,
269 ruleFileSize,
270 &status);
271 if (U_FAILURE(status)) {
272 fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
273 exit(status);
274 };
275 ucnv_close(conv);
276
277
278 //
279 // Put the source rules into a UnicodeString
280 //
281 UnicodeString ruleSourceS(FALSE, ruleSourceU, destCap);
282
283 //
284 // Create the break iterator from the rules
285 // This will compile the rules.
286 //
287 UParseError parseError;
374ca955
A
288 parseError.line = 0;
289 parseError.offset = 0;
b75a7d8f
A
290 RuleBasedBreakIterator *bi = new RuleBasedBreakIterator(ruleSourceS, parseError, status);
291 if (U_FAILURE(status)) {
292 fprintf(stderr, "createRuleBasedBreakIterator: ICU Error \"%s\" at line %d, column %d\n",
374ca955 293 u_errorName(status), (int)parseError.line, (int)parseError.offset);
b75a7d8f
A
294 exit(status);
295 };
296
297
298 //
299 // Get the compiled rule data from the break iterator.
300 //
301 uint32_t outDataSize;
302 const uint8_t *outData;
303 outData = bi->getBinaryRules(outDataSize);
304
73c04bcf
A
305 // Copy the data format version numbers from the RBBI data header into the UDataMemory header.
306 uprv_memcpy(dh.info.formatVersion, ((RBBIDataHeader *)outData)->fFormatVersion, sizeof(dh.info.formatVersion));
b75a7d8f
A
307
308 //
309 // Create the output file
310 //
311 size_t bytesWritten;
374ca955
A
312 UNewDataMemory *pData;
313 pData = udata_create(outDir, NULL, outFileName, &(dh.info), copyright, &status);
314 if(U_FAILURE(status)) {
315 fprintf(stderr, "genbrk: Could not open output file \"%s\", \"%s\"\n",
316 outFileName, u_errorName(status));
317 exit(status);
b75a7d8f 318 }
73c04bcf
A
319
320
b75a7d8f 321 // Write the data itself.
374ca955
A
322 udata_writeBlock(pData, outData, outDataSize);
323 // finish up
324 bytesWritten = udata_finish(pData, &status);
325 if(U_FAILURE(status)) {
326 fprintf(stderr, "genbrk: error %d writing the output file\n", status);
327 exit(status);
328 }
329
b75a7d8f 330 if (bytesWritten != outDataSize) {
374ca955 331 fprintf(stderr, "Error writing to output file \"%s\"\n", outFileName);
b75a7d8f
A
332 exit(-1);
333 }
334
b75a7d8f
A
335 delete bi;
336 delete[] ruleSourceU;
337 delete[] ruleBufferC;
b75a7d8f
A
338 u_cleanup();
339
340
341 printf("genbrk: tool completed successfully.\n");
342 return 0;
343
344#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
345}
374ca955 346