2 **********************************************************************
3 * Copyright (C) 2009-2016, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
10 //--------------------------------------------------------------------
12 // Tool for generating Unicode Confusable data files (.cfu files).
13 // .cfu files contain the compiled form of the confusable data
14 // derived from the Unicode Consortium data described in
17 // Usage: gencfu [options] -r confusables-file.txt -w whole-script-confusables.txt -o output-file.cfu
19 // options: -v verbose
22 // The input rule files are plain text files containing confusable character
23 // definitions in the input format defined by Unicode UAX39 for the files
24 // confusables.txt and confusablesWholeScript.txt. This source (.txt) format
25 // is also accepted directly by ICU spoof detectors. The
26 // files must be encoded in utf-8 format, with or without a BOM.
28 //--------------------------------------------------------------------
30 #include "unicode/utypes.h"
31 #include "unicode/unistr.h"
32 #include "unicode/uclean.h"
33 #include "unicode/udata.h"
34 #include "unicode/putil.h"
39 #include "uspoof_impl.h"
48 static char *progName
;
49 static UOption options
[]={
50 UOPTION_HELP_H
, /* 0 */
51 UOPTION_HELP_QUESTION_MARK
, /* 1 */
52 UOPTION_VERBOSE
, /* 2 */
53 { "rules", NULL
, NULL
, NULL
, 'r', UOPT_REQUIRES_ARG
, 0 }, /* 3 */
54 { "wsrules", NULL
, NULL
, NULL
, 'w', UOPT_REQUIRES_ARG
, 0}, /* 4 */
55 { "out", NULL
, NULL
, NULL
, 'o', UOPT_REQUIRES_ARG
, 0 }, /* 5 */
56 UOPTION_ICUDATADIR
, /* 6 */
57 UOPTION_DESTDIR
, /* 7 */
58 UOPTION_COPYRIGHT
, /* 8 */
59 UOPTION_QUIET
, /* 9 */
62 void usageAndDie(int retCode
) {
63 printf("Usage: %s [-v] [-options] -r confusablesRules.txt -w wholeScriptConfusables.txt -o output-file\n", progName
);
64 printf("\tRead in Unicode confusable character definitions and write out the binary data\n"
66 "\t-h or -? or --help this usage text\n"
67 "\t-V or --version show a version message\n"
68 "\t-c or --copyright include a copyright notice\n"
69 "\t-v or --verbose turn on verbose output\n"
70 "\t-q or --quiet do not display warnings and progress\n"
71 "\t-i or --icudatadir directory for locating any needed intermediate data files,\n"
72 "\t followed by path, defaults to %s\n"
73 "\t-d or --destdir destination directory, followed by the path\n",
74 u_getDataDirectory());
79 #if UCONFIG_NO_REGULAR_EXPRESSIONS || UCONFIG_NO_NORMALIZATION || UCONFIG_NO_FILE_IO
81 /* dummy UDataInfo cf. udata.h */
82 static UDataInfo dummyDataInfo
= {
91 { 0, 0, 0, 0 }, /* dummy dataFormat */
92 { 0, 0, 0, 0 }, /* dummy formatVersion */
93 { 0, 0, 0, 0 } /* dummy dataVersion */
99 // Set up the ICU data header, defined in ucmndata.h
102 {sizeof(DataHeader
), // Struct MappedData
106 { // struct UDataInfo
107 sizeof(UDataInfo
), // size
114 { 0x43, 0x66, 0x75, 0x20 }, // dataFormat="Cfu "
115 { 0xff, 0, 0, 0 }, // formatVersion. Filled in later with values
116 // from the builder. The values declared
117 // here should never appear in any real data.
118 { 5, 1, 0, 0 } // dataVersion (Unicode version)
123 // Forward declaration for function for reading source files.
124 static const char *readFile(const char *fileName
, int32_t *len
);
126 //----------------------------------------------------------------------------
130 //----------------------------------------------------------------------------
131 int main(int argc
, char **argv
) {
132 UErrorCode status
= U_ZERO_ERROR
;
133 const char *confFileName
;
134 const char *confWSFileName
;
135 const char *outFileName
;
136 const char *outDir
= NULL
;
137 const char *copyright
= NULL
;
140 // Pick up and check the command line arguments,
141 // using the standard ICU tool utils option handling.
143 U_MAIN_INIT_ARGS(argc
, argv
);
145 argc
=u_parseArgs(argc
, argv
, UPRV_LENGTHOF(options
), options
);
147 // Unrecognized option
148 fprintf(stderr
, "error in command line argument \"%s\"\n", argv
[-argc
]);
149 usageAndDie(U_ILLEGAL_ARGUMENT_ERROR
);
152 if(options
[0].doesOccur
|| options
[1].doesOccur
) {
153 // -? or -h for help.
157 if (!(options
[3].doesOccur
&& options
[4].doesOccur
&& options
[5].doesOccur
)) {
158 fprintf(stderr
, "confusables file, whole script confusables file and output file must all be specified.\n");
159 usageAndDie(U_ILLEGAL_ARGUMENT_ERROR
);
161 confFileName
= options
[3].value
;
162 confWSFileName
= options
[4].value
;
163 outFileName
= options
[5].value
;
165 if (options
[6].doesOccur
) {
166 u_setDataDirectory(options
[6].value
);
169 status
= U_ZERO_ERROR
;
171 /* Combine the directory with the file name */
172 if(options
[7].doesOccur
) {
173 outDir
= options
[7].value
;
175 if (options
[8].doesOccur
) {
176 copyright
= U_COPYRIGHT_STRING
;
180 if (options
[9].doesOccur
) {
184 #if UCONFIG_NO_REGULAR_EXPRESSIONS || UCONFIG_NO_NORMALIZATION || UCONFIG_NO_FILE_IO
185 // spoof detection data file parsing is dependent on regular expressions.
186 // TODO: have the tool return an error status. Requires fixing the ICU data build
187 // so that it doesn't abort entirely on that error.
189 UNewDataMemory
*pData
;
192 /* write message with just the name */
193 sprintf(msg
, "gencfu writes dummy %s because of UCONFIG_NO_REGULAR_EXPRESSIONS and/or UCONFIG_NO_NORMALIZATION and/or UCONFIG_NO_FILE_IO, see uconfig.h", outFileName
);
194 fprintf(stderr
, "%s\n", msg
);
196 /* write the dummy data file */
197 pData
= udata_create(outDir
, NULL
, outFileName
, &dummyDataInfo
, NULL
, &status
);
198 udata_writeBlock(pData
, msg
, strlen(msg
));
199 udata_finish(pData
, &status
);
205 if (U_FAILURE(status
)) {
206 fprintf(stderr
, "%s: can not initialize ICU. status = %s\n",
207 argv
[0], u_errorName(status
));
210 status
= U_ZERO_ERROR
;
212 // Read in the confusables source file
214 int32_t confusablesLen
= 0;
215 const char *confusables
= readFile(confFileName
, &confusablesLen
);
216 if (confusables
== NULL
) {
217 printf("gencfu: error reading file \"%s\"\n", confFileName
);
221 int32_t wsConfusablesLen
= 0;
222 const char *wsConfsables
= readFile(confWSFileName
, &wsConfusablesLen
);
// The whole-script confusables read failed, so report that file's name
// (confWSFileName), not the name of the plain confusables file.
223 if (wsConfsables
== NULL
) {
224 printf("gencfu: error reading file \"%s\"\n", confWSFileName
);
229 // Create the Spoof Detector from the source confusables files.
230 // This will compile the data.
232 UParseError parseError
;
234 parseError
.offset
= 0;
236 USpoofChecker
*sc
= uspoof_openFromSource(confusables
, confusablesLen
,
237 wsConfsables
, wsConfusablesLen
,
238 &errType
, &parseError
, &status
);
239 if (U_FAILURE(status
)) {
240 const char *errFile
=
241 (errType
== USPOOF_WHOLE_SCRIPT_CONFUSABLE
)? confWSFileName
: confFileName
;
242 fprintf(stderr
, "gencfu: uspoof_openFromSource error \"%s\" at file %s, line %d, column %d\n",
243 u_errorName(status
), errFile
, (int)parseError
.line
, (int)parseError
.offset
);
249 // Get the compiled rule data from the USpoofChecker.
251 uint32_t outDataSize
;
253 outDataSize
= uspoof_serialize(sc
, NULL
, 0, &status
);
254 if (status
!= U_BUFFER_OVERFLOW_ERROR
) {
255 fprintf(stderr
, "gencfu: uspoof_serialize() returned %s\n", u_errorName(status
));
258 status
= U_ZERO_ERROR
;
259 outData
= new uint8_t[outDataSize
];
260 uspoof_serialize(sc
, outData
, outDataSize
, &status
);
262 // Copy the data format version numbers from the spoof data header into the UDataMemory header.
264 uprv_memcpy(dh
.info
.formatVersion
,
265 reinterpret_cast<SpoofDataHeader
*>(outData
)->fFormatVersion
,
266 sizeof(dh
.info
.formatVersion
));
269 // Create the output file
272 UNewDataMemory
*pData
;
273 pData
= udata_create(outDir
, NULL
, outFileName
, &(dh
.info
), copyright
, &status
);
274 if(U_FAILURE(status
)) {
275 fprintf(stderr
, "gencfu: Could not open output file \"%s\", \"%s\"\n",
276 outFileName
, u_errorName(status
));
281 // Write the data itself.
282 udata_writeBlock(pData
, outData
, outDataSize
);
284 bytesWritten
= udata_finish(pData
, &status
);
285 if(U_FAILURE(status
)) {
286 fprintf(stderr
, "gencfu: Error %d writing the output file\n", status
);
290 if (bytesWritten
!= outDataSize
) {
291 fprintf(stderr
, "gencfu: Error writing to output file \"%s\"\n", outFileName
);
297 delete [] confusables
;
298 delete [] wsConfsables
;
301 printf("gencfu: tool completed successfully.\n");
304 #endif // UCONFIG_NO_REGULAR_EXPRESSIONS || UCONFIG_NO_NORMALIZATION || UCONFIG_NO_FILE_IO
309 // Read in a confusables source file
311 static const char *readFile(const char *fileName
, int32_t *len
) {
316 file
= fopen(fileName
, "rb");
320 fseek(file
, 0, SEEK_END
);
321 fileSize
= ftell(file
);
322 fseek(file
, 0, SEEK_SET
);
323 result
= new char[fileSize
+10];
329 long t
= fread(result
, 1, fileSize
, file
);
336 *len
= static_cast<int32_t>(fileSize
);