2 **********************************************************************
3 * Copyright (C) 2002-2016, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
10 //--------------------------------------------------------------------
12 // Tool for generating RuleBasedBreakIterator data files (.brk files).
13 // .brk files contain the precompiled rules for standard types
14 // of iterators - word, line, sentence, etc.
16 // Usage: genbrk [options] -r rule-file.txt -o output-file.brk
18 // options: -v verbose
21 // The input rule file is a plain text file containing break rules
22 // in the input format accepted by RuleBasedBreakIterators. The
23 // file can be encoded as UTF-8 or UTF-16 (either endianness), or
24 // in the default code page (platform dependent). UTF-encoded
25 // files must include a BOM.
27 //--------------------------------------------------------------------
29 #include "unicode/utypes.h"
30 #include "unicode/ucnv.h"
31 #include "unicode/unistr.h"
32 #include "unicode/rbbi.h"
33 #include "unicode/uclean.h"
34 #include "unicode/udata.h"
35 #include "unicode/putil.h"
// Program name substituted into the "Usage:" line printed by usageAndDie().
// NOTE(review): no assignment to progName is visible near its uses - confirm
// it is set before usageAndDie() can run.
49 static char *progName
;
// Command-line option table handed to u_parseArgs() in main().
// main() addresses the entries by position (options[0] .. options[8]), so the
// index noted after each entry must stay in sync with those uses.
50 static UOption options
[]={
51 UOPTION_HELP_H
, /* 0: help; checked together with entry 1 */
52 UOPTION_HELP_QUESTION_MARK
, /* 1: help; checked together with entry 0 */
53 UOPTION_VERBOSE
, /* 2: verbose */
54 { "rules", NULL
, NULL
, NULL
, 'r', UOPT_REQUIRES_ARG
, 0 }, /* 3: -r/--rules; value becomes ruleFileName (mandatory) */
55 { "out", NULL
, NULL
, NULL
, 'o', UOPT_REQUIRES_ARG
, 0 }, /* 4: -o/--out; value becomes outFileName (mandatory) */
56 UOPTION_ICUDATADIR
, /* 5: value is passed to u_setDataDirectory() */
57 UOPTION_DESTDIR
, /* 6: value becomes outDir */
58 UOPTION_COPYRIGHT
, /* 7: selects U_COPYRIGHT_STRING for the output file */
59 UOPTION_QUIET
, /* 8: suppresses the final success message */
// Print the command-line usage summary to stdout.
//
// retCode: NOTE(review): not consumed in the statements shown below; by the
//          function's name it is presumably the process exit status - confirm.
//
// NOTE(review): the help text advertises "-V or --version", but the option
// table passed to u_parseArgs() has no version entry - confirm whether the
// text or the table should change.
62 void usageAndDie(int retCode
) {
63 printf("Usage: %s [-v] [-options] -r rule-file -o output-file\n", progName
);
// The single %s in the option list below is filled with the current ICU data
// directory, i.e. the default used when -i is not given.
64 printf("\tRead in break iteration rules text and write out the binary data\n"
66 "\t-h or -? or --help this usage text\n"
67 "\t-V or --version show a version message\n"
68 "\t-c or --copyright include a copyright notice\n"
69 "\t-v or --verbose turn on verbose output\n"
70 "\t-q or --quiet do not display warnings and progress\n"
71 "\t-i or --icudatadir directory for locating any needed intermediate data files,\n"
72 "\t followed by path, defaults to %s\n"
73 "\t-d or --destdir destination directory, followed by the path\n",
74 u_getDataDirectory());
79 #if UCONFIG_NO_BREAK_ITERATION || UCONFIG_NO_FILE_IO
// Only compiled when break iteration or file I/O is configured out.
81 /* Placeholder UDataInfo (cf. udata.h); main() passes it to udata_create() when writing the dummy output file. */
82 static UDataInfo dummyDataInfo
= {
91 { 0, 0, 0, 0 }, /* dummy dataFormat */
92 { 0, 0, 0, 0 }, /* dummy formatVersion */
93 { 0, 0, 0, 0 } /* dummy dataVersion */
99 // Set up the ICU data header, defined in ucmndata.h
// NOTE(review): presumably this initializes the object main() refers to as
// "dh" (main() overwrites dh.info.formatVersion and passes &dh.info to
// udata_create()); the declaration line itself is not visible - confirm.
102 {sizeof(DataHeader
), // Struct MappedData
106 { // struct UDataInfo
107 sizeof(UDataInfo
), // size
114 { 0x42, 0x72, 0x6b, 0x20 }, // dataFormat="Brk " (the ASCII bytes 'B','r','k',' ')
115 { 0xff, 0, 0, 0 }, // formatVersion. Filled in later with values
116 // from the RBBI rule builder. The values declared
117 // here should never appear in any real RBBI data.
118 { 4, 1, 0, 0 } // dataVersion (Unicode version)
123 //----------------------------------------------------------------------------
127 //----------------------------------------------------------------------------
// Entry point: parses the command line, reads and converts the rule file,
// compiles it with RuleBasedBreakIterator, and writes the binary rules out
// through udata_create() / udata_writeBlock() / udata_finish().
128 int main(int argc
, char **argv
) {
129 UErrorCode status
= U_ZERO_ERROR
;
// Both file names are assigned from the option table once -r and -o have
// been verified to be present.
130 const char *ruleFileName
;
131 const char *outFileName
;
// Optional settings; they stay NULL unless the matching option was given.
132 const char *outDir
= NULL
;
133 const char *copyright
= NULL
;
136 // Pick up and check the command line arguments,
137 // using the standard ICU tool utils option handling.
139 U_MAIN_INIT_ARGS(argc
, argv
);
141 argc
=u_parseArgs(argc
, argv
, UPRV_LENGTHOF(options
), options
);
143 // Unrecognized option
// The negated u_parseArgs() result is used as the argv index of the offending
// argument. NOTE(review): the guard limiting this to a negative result is not
// visible here - confirm.
144 fprintf(stderr
, "error in command line argument \"%s\"\n", argv
[-argc
]);
145 usageAndDie(U_ILLEGAL_ARGUMENT_ERROR
);
// options[0] and options[1] are the two help switches.
148 if(options
[0].doesOccur
|| options
[1].doesOccur
) {
149 // -? or -h for help.
// The rule file (options[3]) and the output file (options[4]) are both
// mandatory; if either is missing, complain and show the usage text.
153 if (!(options
[3].doesOccur
&& options
[4].doesOccur
)) {
154 fprintf(stderr
, "rule file and output file must both be specified.\n");
155 usageAndDie(U_ILLEGAL_ARGUMENT_ERROR
);
157 ruleFileName
= options
[3].value
;
158 outFileName
= options
[4].value
;
// Optional override of the ICU data directory.
160 if (options
[5].doesOccur
) {
161 u_setDataDirectory(options
[5].value
);
164 status
= U_ZERO_ERROR
;
166 /* Remember the destination directory; it is handed to udata_create() together with the output file name. */
167 if(options
[6].doesOccur
) {
168 outDir
= options
[6].value
;
// When requested, pass the ICU copyright string to udata_create() as the
// comment argument (otherwise NULL is passed).
170 if (options
[7].doesOccur
) {
171 copyright
= U_COPYRIGHT_STRING
;
174 #if UCONFIG_NO_BREAK_ITERATION || UCONFIG_NO_FILE_IO
// Break iteration or file I/O is configured out: write a placeholder file
// whose payload is an explanatory message instead of compiled rules.
176 UNewDataMemory
*pData
;
179 /* write message with just the name */
// NOTE(review): sprintf() is unbounded and the declaration of msg is not
// visible here - confirm that a long outFileName cannot overflow msg.
180 sprintf(msg
, "genbrk writes dummy %s because of UCONFIG_NO_BREAK_ITERATION and/or UCONFIG_NO_FILE_IO, see uconfig.h", outFileName
);
181 fprintf(stderr
, "%s\n", msg
);
183 /* write the dummy data file */
184 pData
= udata_create(outDir
, NULL
, outFileName
, &dummyDataInfo
, NULL
, &status
);
// The message text itself (without its terminating NUL) is the file payload.
// NOTE(review): no status check is visible between udata_create() and this
// write - confirm that pData is valid here.
185 udata_writeBlock(pData
, msg
, strlen(msg
));
186 udata_finish(pData
, &status
);
// Report a failed ICU initialization, naming the program via argv[0].
// NOTE(review): the call that sets status for this check is not visible here.
192 if (U_FAILURE(status
)) {
193 fprintf(stderr
, "%s: can not initialize ICU. status = %s\n",
194 argv
[0], u_errorName(status
));
// Start the remaining steps from a clean status.
197 status
= U_ZERO_ERROR
;
200 // Read in the rule source file
// The file is opened in binary mode; its raw bytes are later examined for a
// Unicode signature and handed to a converter.
207 file
= fopen(ruleFileName
, "rb");
209 fprintf(stderr
, "Could not open file \"%s\"\n", ruleFileName
);
// Determine the file size by seeking to the end, then rewind.
// NOTE(review): the return values of fseek()/ftell() are not checked in the
// lines shown; a failing ftell() would feed a bad size into the allocation.
212 fseek(file
, 0, SEEK_END
);
213 ruleFileSize
= ftell(file
);
214 fseek(file
, 0, SEEK_SET
);
// Ten spare bytes are allocated beyond the file size; one of them holds the
// NUL terminator written after the read.
215 ruleBufferC
= new char[ruleFileSize
+10];
// Read the whole file in one call; a short read is treated as an error.
217 result
= (long)fread(ruleBufferC
, 1, ruleFileSize
, file
);
218 if (result
!= ruleFileSize
) {
219 fprintf(stderr
, "Error reading file \"%s\"\n", ruleFileName
);
// NUL-terminate the raw bytes.
222 ruleBufferC
[ruleFileSize
]=0;
226 // Look for a Unicode Signature (BOM) on the rule file
228 int32_t signatureLength
;
// ruleSourceC walks the raw buffer; ruleBufferC keeps the original pointer so
// the allocation can be released with delete[] later.
229 const char * ruleSourceC
= ruleBufferC
;
// The returned name is later passed to ucnv_open().
230 const char* encoding
= ucnv_detectUnicodeSignature(
231 ruleSourceC
, ruleFileSize
, &signatureLength
, &status
);
232 if (U_FAILURE(status
)) {
// Step over the signature bytes so that conversion starts at the first real
// character, and shrink the remaining byte count to match.
// NOTE(review): any condition guarding this adjustment is not visible here.
236 ruleSourceC
+= signatureLength
;
237 ruleFileSize
-= signatureLength
;
241 // Open a converter to take the rule file to UTF-16
244 conv
= ucnv_open(encoding
, &status
);
245 if (U_FAILURE(status
)) {
246 fprintf(stderr
, "ucnv_open: ICU Error \"%s\"\n", u_errorName(status
));
251 // Convert the rules to UChar.
252 // Preflight first to determine required buffer size.
254 uint32_t destCap
= ucnv_toUChars(conv
,
// The preflight call is expected to end with U_BUFFER_OVERFLOW_ERROR; any
// other status is reported as a conversion failure.
260 if (status
!= U_BUFFER_OVERFLOW_ERROR
) {
261 fprintf(stderr
, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status
));
// Clear the expected overflow status and allocate destCap + 1 UChars for the
// real conversion.
265 status
= U_ZERO_ERROR
;
266 UChar
*ruleSourceU
= new UChar
[destCap
+1];
268 ruleSourceU
, // dest,
// A failure of the second, real conversion is reported with the same message.
273 if (U_FAILURE(status
)) {
274 fprintf(stderr
, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status
));
281 // Put the source rules into a UnicodeString
// NOTE(review): with FALSE as the first argument this is presumably the
// aliasing (non-copying) constructor, in which case ruleSourceU must outlive
// ruleSourceS - confirm against unistr.h.
283 UnicodeString
ruleSourceS(FALSE
, ruleSourceU
, destCap
);
286 // Create the break iterator from the rules
287 // This will compile the rules.
289 UParseError parseError
;
291 parseError
.offset
= 0;
292 RuleBasedBreakIterator
*bi
= new RuleBasedBreakIterator(ruleSourceS
, parseError
, status
);
// On failure, report the error name plus parseError.line and
// parseError.offset (the latter is printed as the column).
293 if (U_FAILURE(status
)) {
294 fprintf(stderr
, "createRuleBasedBreakIterator: ICU Error \"%s\" at line %d, column %d\n",
295 u_errorName(status
), (int)parseError
.line
, (int)parseError
.offset
);
301 // Get the compiled rule data from the break iterator.
303 uint32_t outDataSize
;
304 const uint8_t *outData
;
// NOTE(review): outDataSize is passed by name, so getBinaryRules() presumably
// fills it through a reference with the byte length of the returned data; it
// is used as the write length later - confirm against rbbi.h.
305 outData
= bi
->getBinaryRules(outDataSize
);
307 // Copy the data format version numbers from the RBBI data header into the UDataMemory header.
// The start of the compiled data is read through an RBBIDataHeader pointer;
// the copy replaces the formatVersion value that dh was declared with.
308 uprv_memcpy(dh
.info
.formatVersion
, ((RBBIDataHeader
*)outData
)->fFormatVersion
, sizeof(dh
.info
.formatVersion
));
311 // Create the output file
314 UNewDataMemory
*pData
;
// outDir and copyright are NULL unless the corresponding options were given.
315 pData
= udata_create(outDir
, NULL
, outFileName
, &(dh
.info
), copyright
, &status
);
316 if(U_FAILURE(status
)) {
317 fprintf(stderr
, "genbrk: Could not open output file \"%s\", \"%s\"\n",
318 outFileName
, u_errorName(status
));
323 // Write the data itself.
324 udata_writeBlock(pData
, outData
, outDataSize
);
// udata_finish() returns a byte count, which is compared with outDataSize
// below to detect an incomplete write.
326 bytesWritten
= udata_finish(pData
, &status
);
327 if(U_FAILURE(status
)) {
328 fprintf(stderr
, "genbrk: error %d writing the output file\n", status
);
332 if (bytesWritten
!= outDataSize
) {
333 fprintf(stderr
, "Error writing to output file \"%s\"\n", outFileName
);
// Release the UTF-16 rule buffer and the raw file buffer allocated with new[].
338 delete[] ruleSourceU
;
339 delete[] ruleBufferC
;
// The success message is suppressed when the quiet option (options[8]) is given.
343 if(!options
[8].doesOccur
) {
344 printf("genbrk: tool completed successfully.\n");
// NOTE(review): the visible matching #if tests
// UCONFIG_NO_BREAK_ITERATION || UCONFIG_NO_FILE_IO, while the trailing comment
// below names only the first macro - confirm and align the comment.
348 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */