// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
*
*   Copyright (C) 2003-2016, International Business Machines
*   Corporation and others.  All Rights Reserved.
*
*******************************************************************************
*   file name:  gensprep.c
*   tab size:   8 (not used)
*
*   created on: 2003-02-06
*   created by: Ram Viswanadha
*
*   This program reads the Profile.txt files,
*   parses them, and extracts the data for a StringPrep profile.
*   It then preprocesses that data and writes a binary file for efficient use
*   in various StringPrep conversion processes.
*/
24 #define USPREP_TYPE_NAMES_ARRAY 1
36 #include "unicode/uclean.h"
37 #include "unicode/udata.h"
38 #include "unicode/utypes.h"
39 #include "unicode/putil.h"
46 UBool beVerbose
=FALSE
, haveCopyright
=TRUE
;
/* Name of the corrections file appended to the Unicode data directory. */
#define NORM_CORRECTIONS_FILE_NAME "NormalizationCorrections.txt"

/*
 * Directive keywords matched by prefix in the profile line handler.
 * The *_LEN values are derived from the literals (excluding the terminating
 * NUL) so they cannot drift out of sync with the strings. The cast to int
 * keeps comparisons against a signed length signed, exactly as with the
 * previous plain integer constants.
 */
#define NORMALIZE_DIRECTIVE "normalize"
#define NORMALIZE_DIRECTIVE_LEN ((int)(sizeof(NORMALIZE_DIRECTIVE) - 1))
#define CHECK_BIDI_DIRECTIVE "check-bidi"
#define CHECK_BIDI_DIRECTIVE_LEN ((int)(sizeof(CHECK_BIDI_DIRECTIVE) - 1))
55 /* prototypes --------------------------------------------------------------- */
58 parseMappings(const char *filename
, UBool reportError
, UErrorCode
*pErrorCode
);
61 parseNormalizationCorrections(const char *filename
, UErrorCode
*pErrorCode
);
64 /* -------------------------------------------------------------------------- */
66 static UOption options
[]={
68 UOPTION_HELP_QUESTION_MARK
,
75 { "normalization", NULL
, NULL
, NULL
, 'n', UOPT_REQUIRES_ARG
, 0 },
76 { "norm-correction", NULL
, NULL
, NULL
, 'm', UOPT_REQUIRES_ARG
, 0 },
77 { "check-bidi", NULL
, NULL
, NULL
, 'k', UOPT_NO_ARG
, 0},
78 { "unicode", NULL
, NULL
, NULL
, 'u', UOPT_REQUIRES_ARG
, 0 },
96 static int printHelp(int argc
, char* argv
[]){
    /*
     * Broken into chunks because the C89 standard says the minimum
     * required supported string length is 509 bytes.
     */
102 "Usage: %s [-options] [file_name]\n"
104 "Read the files specified and\n"
105 "create a binary file [package-name]_[bundle-name]." DATA_TYPE
" with the StringPrep profile data\n"
110 "\t-h or -? or --help print this usage text\n"
111 "\t-v or --verbose verbose output\n"
112 "\t-c or --copyright include a copyright notice\n");
114 "\t-d or --destdir destination directory, followed by the path\n"
115 "\t-s or --sourcedir source directory of ICU data, followed by the path\n"
116 "\t-b or --bundle-name generate the output data file with the name specified\n"
117 "\t-i or --icudatadir directory for locating any needed intermediate data files,\n"
118 "\t followed by path, defaults to %s\n",
119 u_getDataDirectory());
121 "\t-n or --normalize turn on the option for normalization and include mappings\n"
122 "\t from NormalizationCorrections.txt from the given path,\n"
123 "\t e.g: /test/icu/source/data/unidata\n");
125 "\t-m or --norm-correction use NormalizationCorrections.txt from the given path\n"
126 "\t when the input file contains a normalization directive.\n"
127 "\t unlike -n/--normalize, this option does not force the\n"
128 "\t normalization.\n");
130 "\t-k or --check-bidi turn on the option for checking for BiDi in the profile\n"
131 "\t-u or --unicode version of Unicode to be used with this profile followed by the version\n"
133 return argc
<0 ? U_ILLEGAL_ARGUMENT_ERROR
: U_ZERO_ERROR
;
138 main(int argc
, char* argv
[]) {
140 char* filename
= NULL
;
142 const char *srcDir
=NULL
, *destDir
=NULL
, *icuUniDataDir
=NULL
;
143 const char *bundleName
=NULL
, *inputFileName
= NULL
;
145 int32_t sprepOptions
= 0;
147 UErrorCode errorCode
=U_ZERO_ERROR
;
149 U_MAIN_INIT_ARGS(argc
, argv
);
151 /* preset then read command line options */
152 options
[DESTDIR
].value
=u_getDataDirectory();
153 options
[SOURCEDIR
].value
="";
154 options
[UNICODE_VERSION
].value
="0"; /* don't assume the unicode version */
155 options
[BUNDLE_NAME
].value
= DATA_NAME
;
156 options
[NORMALIZE
].value
= "";
158 argc
=u_parseArgs(argc
, argv
, UPRV_LENGTHOF(options
), options
);
160 /* error handling, printing usage message */
163 "error in command line argument \"%s\"\n",
166 if(argc
<0 || options
[HELP
].doesOccur
|| options
[HELP_QUESTION_MARK
].doesOccur
) {
167 return printHelp(argc
, argv
);
171 /* get the options values */
172 beVerbose
=options
[VERBOSE
].doesOccur
;
173 haveCopyright
=options
[COPYRIGHT
].doesOccur
;
174 srcDir
=options
[SOURCEDIR
].value
;
175 destDir
=options
[DESTDIR
].value
;
176 bundleName
= options
[BUNDLE_NAME
].value
;
177 if(options
[NORMALIZE
].doesOccur
) {
178 icuUniDataDir
= options
[NORMALIZE
].value
;
180 icuUniDataDir
= options
[NORM_CORRECTION_DIR
].value
;
184 /* print the help message */
185 return printHelp(argc
, argv
);
187 inputFileName
= argv
[1];
189 if(!options
[UNICODE_VERSION
].doesOccur
){
190 return printHelp(argc
, argv
);
192 if(options
[ICUDATADIR
].doesOccur
) {
193 u_setDataDirectory(options
[ICUDATADIR
].value
);
198 "gensprep writes dummy " U_ICUDATA_NAME
"_" DATA_NAME
"." DATA_TYPE
199 " because UCONFIG_NO_IDNA is set, \n"
200 "see icu/source/common/unicode/uconfig.h\n");
201 generateData(destDir
, bundleName
);
205 setUnicodeVersion(options
[UNICODE_VERSION
].value
);
206 filename
= (char* ) uprv_malloc(uprv_strlen(srcDir
) + uprv_strlen(inputFileName
) + (icuUniDataDir
== NULL
? 0 : uprv_strlen(icuUniDataDir
)) + 40); /* hopefully this should be enough */
208 /* prepare the filename beginning with the source dir */
209 if(uprv_strchr(srcDir
,U_FILE_SEP_CHAR
) == NULL
&& uprv_strchr(srcDir
,U_FILE_ALT_SEP_CHAR
) == NULL
){
211 filename
[1] = U_FILE_SEP_CHAR
;
212 uprv_strcpy(filename
+2,srcDir
);
214 uprv_strcpy(filename
, srcDir
);
217 basename
=filename
+uprv_strlen(filename
);
218 if(basename
>filename
&& *(basename
-1)!=U_FILE_SEP_CHAR
) {
219 *basename
++=U_FILE_SEP_CHAR
;
225 /* process the file */
226 uprv_strcpy(basename
,inputFileName
);
227 parseMappings(filename
,FALSE
, &errorCode
);
228 if(U_FAILURE(errorCode
)) {
229 fprintf(stderr
, "Could not open file %s for reading. Error: %s \n", filename
, u_errorName(errorCode
));
233 if(options
[NORMALIZE
].doesOccur
){ /* this option might be set by @normalize;; in the source file */
234 /* set up directory for NormalizationCorrections.txt */
235 uprv_strcpy(filename
,icuUniDataDir
);
236 basename
=filename
+uprv_strlen(filename
);
237 if(basename
>filename
&& *(basename
-1)!=U_FILE_SEP_CHAR
) {
238 *basename
++=U_FILE_SEP_CHAR
;
241 *basename
++=U_FILE_SEP_CHAR
;
242 uprv_strcpy(basename
,NORM_CORRECTIONS_FILE_NAME
);
244 parseNormalizationCorrections(filename
,&errorCode
);
245 if(U_FAILURE(errorCode
)){
246 fprintf(stderr
,"Could not open file %s for reading \n", filename
);
249 sprepOptions
|= _SPREP_NORMALIZATION_ON
;
252 if(options
[CHECK_BIDI
].doesOccur
){ /* this option might be set by @check-bidi;; in the source file */
253 sprepOptions
|= _SPREP_CHECK_BIDI_ON
;
256 setOptions(sprepOptions
);
258 /* process parsed data */
259 if(U_SUCCESS(errorCode
)) {
260 /* write the data file */
261 generateData(destDir
, bundleName
);
277 static void U_CALLCONV
278 normalizationCorrectionsLineFn(void *context
,
279 char *fields
[][2], int32_t fieldCount
,
280 UErrorCode
*pErrorCode
) {
281 (void)context
; // suppress compiler warnings about unused variable
282 (void)fieldCount
; // suppress compiler warnings about unused variable
283 uint32_t mapping
[40];
287 UVersionInfo version
;
288 UVersionInfo thisVersion
;
290 /* get the character code, field 0 */
291 code
=(uint32_t)uprv_strtoul(fields
[0][0], &end
, 16);
292 if(U_FAILURE(*pErrorCode
)) {
293 fprintf(stderr
, "gensprep: error parsing NormalizationCorrections.txt mapping at %s\n", fields
[0][0]);
296 /* Original (erroneous) decomposition */
299 /* parse the mapping string */
300 length
=u_parseCodePoints(s
, mapping
, sizeof(mapping
)/4, pErrorCode
);
302 /* ignore corrected decomposition */
304 u_versionFromString(version
,fields
[3][0] );
305 u_versionFromString(thisVersion
, "3.2.0");
309 if(U_FAILURE(*pErrorCode
)) {
310 fprintf(stderr
, "gensprep error parsing NormalizationCorrections.txt of U+%04lx - %s\n",
311 (long)code
, u_errorName(*pErrorCode
));
315 /* store the mapping */
316 if( version
[0] > thisVersion
[0] ||
317 ((version
[0]==thisVersion
[0]) && (version
[1] > thisVersion
[1]))
319 storeMapping(code
,mapping
, length
, USPREP_MAP
, pErrorCode
);
321 setUnicodeVersionNC(version
);
325 parseNormalizationCorrections(const char *filename
, UErrorCode
*pErrorCode
) {
328 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)) {
332 u_parseDelimitedFile(filename
, ';', fields
, 4, normalizationCorrectionsLineFn
, NULL
, pErrorCode
);
334 /* fprintf(stdout,"Number of code points that have NormalizationCorrections mapping with length >1 : %i\n",len); */
336 if(U_FAILURE(*pErrorCode
) && ( *pErrorCode
!=U_FILE_ACCESS_ERROR
)) {
337 fprintf(stderr
, "gensprep error: u_parseDelimitedFile(\"%s\") failed - %s\n", filename
, u_errorName(*pErrorCode
));
342 static void U_CALLCONV
343 strprepProfileLineFn(void *context
,
344 char *fields
[][2], int32_t fieldCount
,
345 UErrorCode
*pErrorCode
) {
346 (void)fieldCount
; // suppress compiler warnings about unused variable
347 uint32_t mapping
[40];
351 /*UBool* mapWithNorm = (UBool*) context;*/
352 const char* typeName
;
353 uint32_t rangeStart
=0,rangeEnd
=0;
354 const char* filename
= (const char*) context
;
357 s
= u_skipWhitespace(fields
[0][0]);
359 /* special directive */
361 length
= (int32_t)(fields
[0][1] - s
);
362 if (length
>= NORMALIZE_DIRECTIVE_LEN
363 && uprv_strncmp(s
, NORMALIZE_DIRECTIVE
, NORMALIZE_DIRECTIVE_LEN
) == 0) {
364 options
[NORMALIZE
].doesOccur
= TRUE
;
367 else if (length
>= CHECK_BIDI_DIRECTIVE_LEN
368 && uprv_strncmp(s
, CHECK_BIDI_DIRECTIVE
, CHECK_BIDI_DIRECTIVE_LEN
) == 0) {
369 options
[CHECK_BIDI
].doesOccur
= TRUE
;
373 fprintf(stderr
, "gensprep error parsing a directive %s.", fields
[0][0]);
377 typeName
= fields
[2][0];
380 if(uprv_strstr(typeName
, usprepTypeNames
[USPREP_UNASSIGNED
])!=NULL
){
382 u_parseCodePointRange(s
, &rangeStart
,&rangeEnd
, pErrorCode
);
383 if(U_FAILURE(*pErrorCode
)){
384 fprintf(stderr
, "Could not parse code point range. Error: %s\n",u_errorName(*pErrorCode
));
388 /* store the range */
389 storeRange(rangeStart
,rangeEnd
,USPREP_UNASSIGNED
, pErrorCode
);
391 }else if(uprv_strstr(typeName
, usprepTypeNames
[USPREP_PROHIBITED
])!=NULL
){
393 u_parseCodePointRange(s
, &rangeStart
,&rangeEnd
, pErrorCode
);
394 if(U_FAILURE(*pErrorCode
)){
395 fprintf(stderr
, "Could not parse code point range. Error: %s\n",u_errorName(*pErrorCode
));
399 /* store the range */
400 storeRange(rangeStart
,rangeEnd
,USPREP_PROHIBITED
, pErrorCode
);
402 }else if(uprv_strstr(typeName
, usprepTypeNames
[USPREP_MAP
])!=NULL
){
404 /* get the character code, field 0 */
405 code
=(uint32_t)uprv_strtoul(s
, &end
, 16);
406 if(end
<=s
|| end
!=fields
[0][1]) {
407 fprintf(stderr
, "gensprep: syntax error in field 0 at %s\n", fields
[0][0]);
408 *pErrorCode
=U_PARSE_ERROR
;
412 /* parse the mapping string */
413 length
=u_parseCodePoints(map
, mapping
, sizeof(mapping
)/4, pErrorCode
);
415 /* store the mapping */
416 storeMapping(code
,mapping
, length
,USPREP_MAP
, pErrorCode
);
419 *pErrorCode
= U_INVALID_FORMAT_ERROR
;
422 if(U_FAILURE(*pErrorCode
)) {
423 fprintf(stderr
, "gensprep error parsing %s line %s at %s. Error: %s\n",filename
,
424 fields
[0][0],fields
[2][0],u_errorName(*pErrorCode
));
431 parseMappings(const char *filename
, UBool reportError
, UErrorCode
*pErrorCode
) {
434 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)) {
438 u_parseDelimitedFile(filename
, ';', fields
, 3, strprepProfileLineFn
, (void*)filename
, pErrorCode
);
440 /*fprintf(stdout,"Number of code points that have mappings with length >1 : %i\n",len);*/
442 if(U_FAILURE(*pErrorCode
) && (reportError
|| *pErrorCode
!=U_FILE_ACCESS_ERROR
)) {
443 fprintf(stderr
, "gensprep error: u_parseDelimitedFile(\"%s\") failed - %s\n", filename
, u_errorName(*pErrorCode
));
449 #endif /* #if !UCONFIG_NO_IDNA */
452 * Hey, Emacs, please set the following:
455 * indent-tabs-mode: nil