]> git.saurik.com Git - apple/icu.git/blame - icuSources/tools/gencfu/gencfu.cpp
ICU-62107.0.1.tar.gz
[apple/icu.git] / icuSources / tools / gencfu / gencfu.cpp
CommitLineData
f3c0d7a5
A
1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
729e4ab9
A
3/*
4**********************************************************************
2ca993e8 5* Copyright (C) 2009-2016, International Business Machines
729e4ab9
A
6* Corporation and others. All Rights Reserved.
7**********************************************************************
8*
9* File gencfu.cpp
10*/
11
12//--------------------------------------------------------------------
13//
14// Tool for generating Unicode Confusable data files (.cfu files).
15// .cfu files contain the compiled form of the confusable data
16// derived from the Unicode Consortium data described in
17// Unicode UAX 39.
18//
f3c0d7a5 19// Usage: gencfu [options] -r confusables-file.txt -o output-file.cfu
729e4ab9
A
20//
21// options: -v verbose
22// -? or -h help
23//
24// The input rule files are plain text files containing confusable character
25// definitions in the input format defined by Unicode UAX39 for the files
f3c0d7a5 26// confusables.txt. This source (.txt) format
729e4ab9
A
27// is also accepted directly by ICU spoof detectors. The
28// files must be encoded in utf-8 format, with or without a BOM.
29//
f3c0d7a5
A
30// The script used to compile confusablesWholeScript.txt into the CFU file
31// until the Unicode consortium deprecated it.
32//
729e4ab9
A
33//--------------------------------------------------------------------
34
35#include "unicode/utypes.h"
36#include "unicode/unistr.h"
37#include "unicode/uclean.h"
38#include "unicode/udata.h"
39#include "unicode/putil.h"
40
41#include "uoptions.h"
42#include "unewdata.h"
43#include "ucmndata.h"
44#include "uspoof_impl.h"
45#include "cmemory.h"
46
47#include <stdio.h>
48#include <stdlib.h>
49#include <string.h>
50
51U_NAMESPACE_USE
52
53static char *progName;
54static UOption options[]={
55 UOPTION_HELP_H, /* 0 */
56 UOPTION_HELP_QUESTION_MARK, /* 1 */
57 UOPTION_VERBOSE, /* 2 */
58 { "rules", NULL, NULL, NULL, 'r', UOPT_REQUIRES_ARG, 0 }, /* 3 */
f3c0d7a5 59 { "wsrules", NULL, NULL, NULL, 'w', UOPT_REQUIRES_ARG, 0}, /* 4 */ // deprecated
729e4ab9
A
60 { "out", NULL, NULL, NULL, 'o', UOPT_REQUIRES_ARG, 0 }, /* 5 */
61 UOPTION_ICUDATADIR, /* 6 */
62 UOPTION_DESTDIR, /* 7 */
63 UOPTION_COPYRIGHT, /* 8 */
2ca993e8 64 UOPTION_QUIET, /* 9 */
729e4ab9
A
65};
66
67void usageAndDie(int retCode) {
f3c0d7a5 68 printf("Usage: %s [-v] [-options] -r confusablesRules.txt -o output-file\n", progName);
729e4ab9
A
69 printf("\tRead in Unicode confusable character definitions and write out the binary data\n"
70 "options:\n"
71 "\t-h or -? or --help this usage text\n"
72 "\t-V or --version show a version message\n"
73 "\t-c or --copyright include a copyright notice\n"
74 "\t-v or --verbose turn on verbose output\n"
2ca993e8 75 "\t-q or --quiet do not display warnings and progress\n"
729e4ab9
A
76 "\t-i or --icudatadir directory for locating any needed intermediate data files,\n"
77 "\t followed by path, defaults to %s\n"
78 "\t-d or --destdir destination directory, followed by the path\n",
79 u_getDataDirectory());
80 exit (retCode);
81}
82
83
84#if UCONFIG_NO_REGULAR_EXPRESSIONS || UCONFIG_NO_NORMALIZATION || UCONFIG_NO_FILE_IO
85
86/* dummy UDataInfo cf. udata.h */
87static UDataInfo dummyDataInfo = {
88 sizeof(UDataInfo),
89 0,
90
91 U_IS_BIG_ENDIAN,
92 U_CHARSET_FAMILY,
93 U_SIZEOF_UCHAR,
94 0,
95
96 { 0, 0, 0, 0 }, /* dummy dataFormat */
97 { 0, 0, 0, 0 }, /* dummy formatVersion */
98 { 0, 0, 0, 0 } /* dummy dataVersion */
99};
100
101#else
102
103//
104// Set up the ICU data header, defined in ucmndata.h
105//
106DataHeader dh ={
107 {sizeof(DataHeader), // Struct MappedData
108 0xda,
109 0x27},
110
111 { // struct UDataInfo
112 sizeof(UDataInfo), // size
113 0, // reserved
114 U_IS_BIG_ENDIAN,
115 U_CHARSET_FAMILY,
116 U_SIZEOF_UCHAR,
117 0, // reserved
118
119 { 0x43, 0x66, 0x75, 0x20 }, // dataFormat="Cfu "
120 { 0xff, 0, 0, 0 }, // formatVersion. Filled in later with values
121 // from the builder. The values declared
122 // here should never appear in any real data.
123 { 5, 1, 0, 0 } // dataVersion (Unicode version)
124 }};
125
126#endif
127
128// Forward declaration for function for reading source files.
129static const char *readFile(const char *fileName, int32_t *len);
130
131//----------------------------------------------------------------------------
132//
133// main for gencfu
134//
135//----------------------------------------------------------------------------
136int main(int argc, char **argv) {
137 UErrorCode status = U_ZERO_ERROR;
138 const char *confFileName;
729e4ab9
A
139 const char *outFileName;
140 const char *outDir = NULL;
141 const char *copyright = NULL;
142
143 //
144 // Pick up and check the command line arguments,
145 // using the standard ICU tool utils option handling.
146 //
147 U_MAIN_INIT_ARGS(argc, argv);
148 progName = argv[0];
2ca993e8 149 argc=u_parseArgs(argc, argv, UPRV_LENGTHOF(options), options);
729e4ab9
A
150 if(argc<0) {
151 // Unrecognized option
152 fprintf(stderr, "error in command line argument \"%s\"\n", argv[-argc]);
153 usageAndDie(U_ILLEGAL_ARGUMENT_ERROR);
154 }
155
156 if(options[0].doesOccur || options[1].doesOccur) {
157 // -? or -h for help.
158 usageAndDie(0);
159 }
160
f3c0d7a5
A
161 if (!(options[3].doesOccur && options[5].doesOccur)) {
162 fprintf(stderr, "confusables file and output file must all be specified.\n");
729e4ab9
A
163 usageAndDie(U_ILLEGAL_ARGUMENT_ERROR);
164 }
165 confFileName = options[3].value;
729e4ab9
A
166 outFileName = options[5].value;
167
168 if (options[6].doesOccur) {
169 u_setDataDirectory(options[6].value);
170 }
171
172 status = U_ZERO_ERROR;
173
174 /* Combine the directory with the file name */
175 if(options[7].doesOccur) {
176 outDir = options[7].value;
177 }
178 if (options[8].doesOccur) {
179 copyright = U_COPYRIGHT_STRING;
180 }
181
2ca993e8
A
182 UBool quiet = FALSE;
183 if (options[9].doesOccur) {
184 quiet = TRUE;
185 }
186
729e4ab9
A
187#if UCONFIG_NO_REGULAR_EXPRESSIONS || UCONFIG_NO_NORMALIZATION || UCONFIG_NO_FILE_IO
188 // spoof detection data file parsing is dependent on regular expressions.
189 // TODO: have the tool return an error status. Requires fixing the ICU data build
190 // so that it doesn't abort entirely on that error.
191
192 UNewDataMemory *pData;
193 char msg[1024];
194
195 /* write message with just the name */
196 sprintf(msg, "gencfu writes dummy %s because of UCONFIG_NO_REGULAR_EXPRESSIONS and/or UCONFIG_NO_NORMALIZATION and/or UCONFIG_NO_FILE_IO, see uconfig.h", outFileName);
197 fprintf(stderr, "%s\n", msg);
198
199 /* write the dummy data file */
200 pData = udata_create(outDir, NULL, outFileName, &dummyDataInfo, NULL, &status);
201 udata_writeBlock(pData, msg, strlen(msg));
202 udata_finish(pData, &status);
203 return (int)status;
204
205#else
206 /* Initialize ICU */
207 u_init(&status);
208 if (U_FAILURE(status)) {
209 fprintf(stderr, "%s: can not initialize ICU. status = %s\n",
210 argv[0], u_errorName(status));
211 exit(1);
212 }
213 status = U_ZERO_ERROR;
214
215 // Read in the confusables source file
216
217 int32_t confusablesLen = 0;
218 const char *confusables = readFile(confFileName, &confusablesLen);
219 if (confusables == NULL) {
220 printf("gencfu: error reading file \"%s\"\n", confFileName);
221 exit(-1);
222 }
223
729e4ab9
A
224 //
225 // Create the Spoof Detector from the source confusables files.
226 // This will compile the data.
227 //
228 UParseError parseError;
229 parseError.line = 0;
230 parseError.offset = 0;
231 int32_t errType;
232 USpoofChecker *sc = uspoof_openFromSource(confusables, confusablesLen,
f3c0d7a5 233 NULL, 0,
729e4ab9
A
234 &errType, &parseError, &status);
235 if (U_FAILURE(status)) {
729e4ab9 236 fprintf(stderr, "gencfu: uspoof_openFromSource error \"%s\" at file %s, line %d, column %d\n",
f3c0d7a5 237 u_errorName(status), confFileName, (int)parseError.line, (int)parseError.offset);
729e4ab9
A
238 exit(status);
239 };
240
241
242 //
243 // Get the compiled rule data from the USpoofChecker.
244 //
245 uint32_t outDataSize;
246 uint8_t *outData;
247 outDataSize = uspoof_serialize(sc, NULL, 0, &status);
248 if (status != U_BUFFER_OVERFLOW_ERROR) {
249 fprintf(stderr, "gencfu: uspoof_serialize() returned %s\n", u_errorName(status));
250 exit(status);
251 }
252 status = U_ZERO_ERROR;
253 outData = new uint8_t[outDataSize];
254 uspoof_serialize(sc, outData, outDataSize, &status);
255
256 // Copy the data format version numbers from the spoof data header into the UDataMemory header.
257
258 uprv_memcpy(dh.info.formatVersion,
259 reinterpret_cast<SpoofDataHeader *>(outData)->fFormatVersion,
260 sizeof(dh.info.formatVersion));
261
262 //
263 // Create the output file
264 //
265 size_t bytesWritten;
266 UNewDataMemory *pData;
267 pData = udata_create(outDir, NULL, outFileName, &(dh.info), copyright, &status);
268 if(U_FAILURE(status)) {
269 fprintf(stderr, "gencfu: Could not open output file \"%s\", \"%s\"\n",
270 outFileName, u_errorName(status));
271 exit(status);
272 }
273
274
275 // Write the data itself.
276 udata_writeBlock(pData, outData, outDataSize);
277 // finish up
278 bytesWritten = udata_finish(pData, &status);
279 if(U_FAILURE(status)) {
280 fprintf(stderr, "gencfu: Error %d writing the output file\n", status);
281 exit(status);
282 }
283
284 if (bytesWritten != outDataSize) {
285 fprintf(stderr, "gencfu: Error writing to output file \"%s\"\n", outFileName);
286 exit(-1);
287 }
288
289 uspoof_close(sc);
290 delete [] outData;
4388f060 291 delete [] confusables;
729e4ab9 292 u_cleanup();
2ca993e8
A
293 if (!quiet) {
294 printf("gencfu: tool completed successfully.\n");
295 }
729e4ab9
A
296 return 0;
297#endif // UCONFIG_NO_REGULAR_EXPRESSIONS
298}
299
300
301 //
302 // Read in a confusables source file
303 //
//
//  Read a whole file into a newly allocated, NUL-terminated buffer.
//
//    fileName   path of the file to read; it is opened in binary mode.
//    len        receives the file size in bytes, not counting the added
//               terminator. Written only on success.
//
//  Returns the buffer, which the caller must release with delete [], or NULL when
//  the file cannot be opened, its size cannot be determined, the size does not fit
//  the int32_t length, or the contents cannot be read completely.
//
static const char *readFile(const char *fileName, int32_t *len) {
    FILE *file = fopen(fileName, "rb");
    if (file == NULL) {
        return NULL;
    }

    // Find the size by seeking to the end. Either call can fail (for example on a
    // stream that is not seekable); ftell() then reports -1, which must not be
    // used as an allocation size or as a read count.
    long fileSize = -1;
    if (fseek(file, 0, SEEK_END) == 0) {
        fileSize = ftell(file);
    }
    // The upper bound keeps both the padded allocation size and the reported
    // length within the range of int32_t.
    if (fileSize < 0 || fileSize > INT32_MAX - 10 || fseek(file, 0, SEEK_SET) != 0) {
        fclose(file);
        return NULL;
    }

    char *result = new char[fileSize+10];
    if (result==NULL) {
        // NOTE(review): a plain new[] normally reports failure by throwing, so this
        // branch is presumably only reachable in builds where it can return NULL.
        fclose(file);
        return NULL;
    }

    size_t bytesRead = fread(result, 1, static_cast<size_t>(fileSize), file);
    fclose(file);
    if (bytesRead != static_cast<size_t>(fileSize)) {
        delete [] result;
        return NULL;
    }

    result[fileSize]=0;
    *len = static_cast<int32_t>(fileSize);
    return result;
}
332 }