]> git.saurik.com Git - apple/icu.git/blame - icuSources/tools/genbrk/genbrk.cpp
ICU-3.13.tar.gz
[apple/icu.git] / icuSources / tools / genbrk / genbrk.cpp
CommitLineData
b75a7d8f
A
/*
**********************************************************************
*   Copyright (C) 2002-2003, International Business Machines
*   Corporation and others.  All Rights Reserved.
**********************************************************************
*
* File genbrk.cpp
*/
9
//--------------------------------------------------------------------
//
//   Tool for generating RuleBasedBreakIterator data files (.brk files).
//   .brk files contain the precompiled rules for the standard types
//   of iterators - word, line, sentence, etc.
//
//   Usage:  genbrk [options] -r rule-file.txt  -o output-file.brk
//
//       options:   -v         verbose
//                  -? or -h   help
//                  -i dir     ICU data directory
//                  -d dir     destination directory for the output file
//
//   The input rule file is a plain text file containing break rules
//   in the input format accepted by RuleBasedBreakIterators.  The
//   file can be encoded as UTF-8, or UTF-16 (either endian), or
//   in the default code page (platform dependent).  UTF-encoded
//   files must include a BOM.
//
//--------------------------------------------------------------------
28
29#include <stdio.h>
30#include "unicode/utypes.h"
31#include "unicode/ucnv.h"
32#include "unicode/unistr.h"
33#include "unicode/rbbi.h"
34#include "unicode/uclean.h"
35#include "unicode/udata.h"
36
37#include "uoptions.h"
38#include "unewdata.h"
39#include "ucmndata.h"
40
41#include <stdio.h>
42#include <stdlib.h>
43#include <string.h>
44
45#define DATA_TYPE "brk"
46
47static char *progName;
48static UOption options[]={
49 UOPTION_HELP_H, /* 0 */
50 UOPTION_HELP_QUESTION_MARK, /* 1 */
51 UOPTION_VERBOSE, /* 2 */
52 { "rules", NULL, NULL, NULL, 'r', UOPT_REQUIRES_ARG, 0 }, /* 3 */
53 { "out", NULL, NULL, NULL, 'o', UOPT_REQUIRES_ARG, 0 }, /* 4 */
54 UOPTION_ICUDATADIR, /* 5 */
55 UOPTION_DESTDIR /* 6 */
56};
57
58void usageAndDie(int retCode) {
59 printf("Usage: %s [-v] -r rule-file -o output-file\n", progName);
60 exit (retCode);
61}
62
63
64#if UCONFIG_NO_BREAK_ITERATION
65
66/* dummy UDataInfo cf. udata.h */
67static UDataInfo dummyDataInfo = {
68 sizeof(UDataInfo),
69 0,
70
71 U_IS_BIG_ENDIAN,
72 U_CHARSET_FAMILY,
73 U_SIZEOF_UCHAR,
74 0,
75
76 { 0, 0, 0, 0 }, /* dummy dataFormat */
77 { 0, 0, 0, 0 }, /* dummy formatVersion */
78 { 0, 0, 0, 0 } /* dummy dataVersion */
79};
80
81#else
82
83//
84// Set up the ICU data header, defined in ucmndata.h
85//
86DataHeader dh ={
87 {sizeof(DataHeader), // Struct MappedData
88 0xda,
89 0x27},
90
91 { // struct UDataInfo
92 sizeof(UDataInfo), // size
93 0, // reserved
94 U_IS_BIG_ENDIAN,
95 U_CHARSET_FAMILY,
96 U_SIZEOF_UCHAR,
97 0, // reserved
98
99 { 0x42, 0x72, 0x6b, 0x20 }, // dataFormat="Brk "
100 { 2, 1, 0, 0 }, // formatVersion
101 { 3, 1, 0, 0 } // dataVersion (Unicode version)
102 }};
103
104#endif
105
106//----------------------------------------------------------------------------
107//
108// main for genbrk
109//
110//----------------------------------------------------------------------------
111int main(int argc, char **argv) {
112 UErrorCode status = U_ZERO_ERROR;
113 const char *ruleFileName;
114 const char *outFileName;
115 const char *outDir = NULL;
116 char *outFullFileName;
117 int32_t outFullFileNameLen;
118
119 //
120 // Pick up and check the command line arguments,
121 // using the standard ICU tool utils option handling.
122 //
123 U_MAIN_INIT_ARGS(argc, argv);
124 progName = argv[0];
125 argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), options);
126 if(argc<0) {
127 // Unrecognized option
128 fprintf(stderr, "error in command line argument \"%s\"\n", argv[-argc]);
129 usageAndDie(U_ILLEGAL_ARGUMENT_ERROR);
130 }
131
132 if(options[0].doesOccur || options[1].doesOccur) {
133 // -? or -h for help.
134 usageAndDie(0);
135 }
136
137 if (!(options[3].doesOccur && options[4].doesOccur)) {
138 fprintf(stderr, "rule file and output file must both be specified.\n");
139 usageAndDie(U_ILLEGAL_ARGUMENT_ERROR);
140 }
141 ruleFileName = options[3].value;
142 outFileName = options[4].value;
143 outFullFileNameLen = strlen(outFileName);
144
145 if (options[5].doesOccur) {
146 u_setDataDirectory(options[5].value);
147 }
148
149 /* Combine the directory with the file name */
150 if(options[6].doesOccur) {
151 outDir = options[6].value;
152 outFullFileNameLen += strlen(outDir);
153 }
154 outFullFileName = (char*)malloc(outFullFileNameLen + 2);
155 outFullFileName[0] = 0;
156 if (outDir) {
157 strcpy(outFullFileName, outDir);
158 strcat(outFullFileName, U_FILE_SEP_STRING);
159 }
160 strcat(outFullFileName, outFileName);
161
162#if UCONFIG_NO_BREAK_ITERATION
163
164 UNewDataMemory *pData;
165 char msg[2048], folder[2048], name[32];
166 char *basename;
167 int length;
168
169 /* split the outFileName into folder + name + type */
170 strcpy(folder, outFileName);
171 basename = strrchr(folder, U_FILE_SEP_CHAR);
172 if(basename == NULL) {
173 basename = folder;
174 } else {
175 ++basename;
176 }
177
178 /* copy the data name and remove it from the folder */
179 strcpy(name, basename);
180 *basename = 0;
181
182 /* write message with just the name */
183 sprintf(msg, "genbrk writes dummy %s because of UCONFIG_NO_BREAK_ITERATION, see uconfig.h", name);
184 fprintf(stderr, "%s\n", msg);
185
186 /* remove the type suffix (hardcode to DATA_TYPE) */
187 length = strlen(name);
188 if(length > 4 && name[length - 4] == '.') {
189 name[length - 4] = 0;
190 }
191
192 /* write the dummy data file */
193 pData = udata_create(folder, DATA_TYPE, name, &dummyDataInfo, NULL, &status);
194 udata_writeBlock(pData, msg, strlen(msg));
195 udata_finish(pData, &status);
196 return (int)status;
197
198#else
199
200 //
201 // Read in the rule source file
202 //
203 int result;
204 long ruleFileSize;
205 FILE *file;
206 char *ruleBufferC;
207
208 file = fopen(ruleFileName, "rb");
209 if( file == 0 ) {
210 fprintf(stderr, "Could not open file \"%s\"\n", ruleFileName);
211 exit(-1);
212 }
213 fseek(file, 0, SEEK_END);
214 ruleFileSize = ftell(file);
215 fseek(file, 0, SEEK_SET);
216 ruleBufferC = new char[ruleFileSize+10];
217
218 result = fread(ruleBufferC, 1, ruleFileSize, file);
219 if (result != ruleFileSize) {
220 fprintf(stderr, "Error reading file \"%s\"\n", ruleFileName);
221 exit (-1);
222 }
223 ruleBufferC[ruleFileSize]=0;
224 fclose(file);
225
226 //
227 // Look for a Unicode Signature (BOM) on the rule file
228 //
229 int32_t signatureLength;
230 const char * ruleSourceC = ruleBufferC;
231 const char* encoding = ucnv_detectUnicodeSignature(
232 ruleSourceC, ruleFileSize, &signatureLength, &status);
233 if (U_FAILURE(status)) {
234 exit(status);
235 }
236 if(encoding!=NULL ){
237 ruleSourceC += signatureLength;
238 ruleFileSize -= signatureLength;
239 }
240
241 //
242 // Open a converter to take the rule file to UTF-16
243 //
244 UConverter* conv;
245 conv = ucnv_open(encoding, &status);
246 if (U_FAILURE(status)) {
247 fprintf(stderr, "ucnv_open: ICU Error \"%s\"\n", u_errorName(status));
248 exit(status);
249 }
250
251 //
252 // Convert the rules to UChar.
253 // Preflight first to determine required buffer size.
254 //
255 uint32_t destCap = ucnv_toUChars(conv,
256 NULL, // dest,
257 0, // destCapacity,
258 ruleSourceC,
259 ruleFileSize,
260 &status);
261 if (status != U_BUFFER_OVERFLOW_ERROR) {
262 fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
263 exit(status);
264 };
265
266 status = U_ZERO_ERROR;
267 UChar *ruleSourceU = new UChar[destCap+1];
268 ucnv_toUChars(conv,
269 ruleSourceU, // dest,
270 destCap+1,
271 ruleSourceC,
272 ruleFileSize,
273 &status);
274 if (U_FAILURE(status)) {
275 fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
276 exit(status);
277 };
278 ucnv_close(conv);
279
280
281 //
282 // Put the source rules into a UnicodeString
283 //
284 UnicodeString ruleSourceS(FALSE, ruleSourceU, destCap);
285
286 //
287 // Create the break iterator from the rules
288 // This will compile the rules.
289 //
290 UParseError parseError;
291 parseError.line = 0;
292 parseError.offset = 0;
293 RuleBasedBreakIterator *bi = new RuleBasedBreakIterator(ruleSourceS, parseError, status);
294 if (U_FAILURE(status)) {
295 fprintf(stderr, "createRuleBasedBreakIterator: ICU Error \"%s\" at line %d, column %d\n",
296 u_errorName(status), parseError.line, parseError.offset);
297 exit(status);
298 };
299
300
301 //
302 // Get the compiled rule data from the break iterator.
303 //
304 uint32_t outDataSize;
305 const uint8_t *outData;
306 outData = bi->getBinaryRules(outDataSize);
307
308
309 //
310 // Create the output file
311 //
312 size_t bytesWritten;
313 file = fopen(outFullFileName, "wb");
314 if (file == 0) {
315 fprintf(stderr, "Could not open output file \"%s\"\n", outFullFileName);
316 exit(-1);
317 }
318
319 bytesWritten = fwrite(&dh, 1, sizeof(DataHeader), file);
320
321 //
322 // Write the data itself.
323 //
324 bytesWritten = fwrite(outData, 1, outDataSize, file);
325 if (bytesWritten != outDataSize) {
326 fprintf(stderr, "Error writing to output file \"%s\"\n", outFullFileName);
327 exit(-1);
328 }
329
330 fclose(file);
331 delete bi;
332 delete[] ruleSourceU;
333 delete[] ruleBufferC;
334 free(outFullFileName);
335 u_cleanup();
336
337
338 printf("genbrk: tool completed successfully.\n");
339 return 0;
340
341#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
342}