2 *******************************************************************************
4 * Copyright (C) 2009-2014, International Business Machines
5 * Corporation and others. All Rights Reserved.
7 *******************************************************************************
8 * file name: gennorm2.cpp
10 * tab size: 8 (not used)
13 * created on: 2009nov25
14 * created by: Markus W. Scherer
16 * This program reads text files that define Unicode normalization,
17 * parses them, and builds a binary data file.
20 #include "unicode/utypes.h"
21 #include "n2builder.h"
26 #include "unicode/errorcode.h"
27 #include "unicode/localpointer.h"
28 #include "unicode/putil.h"
29 #include "unicode/uchar.h"
30 #include "unicode/unistr.h"
32 #include "normalizer2impl.h"
37 #if UCONFIG_NO_NORMALIZATION
// Global flags for the tool. main() overwrites both of them from the
// command-line options (VERBOSE and COPYRIGHT) after argument parsing.
43 UBool beVerbose
=FALSE
, haveCopyright
=TRUE
;
// Declares the LocalStdioFilePointer type from the ICU local-pointer macro,
// parameterized with FILE and fclose. main() uses it to hold the result of
// fopen() for each input file.
// NOTE(review): presumably the wrapper calls fclose() when it goes out of
// scope - confirm against unicode/localpointer.h.
45 U_DEFINE_LOCAL_OPEN_POINTER(LocalStdioFilePointer
, FILE, fclose
);
47 #if !UCONFIG_NO_NORMALIZATION
// Forward declaration so that main() can call parseFile() before its
// definition: parses the already opened file f and records its data in builder.
48 void parseFile(FILE *f
, Normalizer2DataBuilder
&builder
);
51 /* -------------------------------------------------------------------------- */
// Command-line option table handed to u_parseArgs() in main(). main() reads
// entries back through index constants such as OUTPUT_FILENAME,
// UNICODE_VERSION, WRITE_C_SOURCE and OPT_FAST.
65 static UOption options
[]={
67 UOPTION_HELP_QUESTION_MARK
,
71 UOPTION_DEF("output", 'o', UOPT_REQUIRES_ARG
),
72 UOPTION_DEF("unicode", 'u', UOPT_REQUIRES_ARG
),
// The next two entries put the char code 1 in the short-option slot; the usage
// text in main() lists them by their long names only.
73 UOPTION_DEF("csource", '\1', UOPT_NO_ARG
),
74 UOPTION_DEF("fast", '\1', UOPT_NO_ARG
)
// Program entry point.
// Parses the command line, feeds every remaining argument (an input file name,
// resolved against the source directory) to parseFile(), and then writes the
// collected data to the output file as C source or as a binary file.
// The return value is a UErrorCode: U_ILLEGAL_ARGUMENT_ERROR or U_ZERO_ERROR
// from the usage path, otherwise the final state of errorCode.
78 main(int argc
, char* argv
[]) {
79 U_MAIN_INIT_ARGS(argc
, argv
);
81 /* preset then read command line options */
82 options
[SOURCEDIR
].value
="";
// u_parseArgs() returns the new argument count. A negative result is reported
// as U_ILLEGAL_ARGUMENT_ERROR by the usage code below, and the arguments
// argv[1] up to argv[argc-1] are later treated as input file names.
83 argc
=u_parseArgs(argc
, argv
, sizeof(options
)/sizeof(options
[HELP_H
]), options
);
85 /* error handling, printing usage message */
88 "error in command line argument \"%s\"\n",
91 if(!options
[OUTPUT_FILENAME
].doesOccur
) {
95 options
[HELP_H
].doesOccur
|| options
[HELP_QUESTION_MARK
].doesOccur
98 * Broken into chunks because the C89 standard says the minimum
99 * required supported string length is 509 bytes.
102 "Usage: %s [-options] infiles+ -o outputfilename\n"
104 "Reads the infiles with normalization data and\n"
105 "creates a binary or C source file (outputfilename) with the data.\n"
110 "\t-h or -? or --help this usage text\n"
111 "\t-v or --verbose verbose output\n"
112 "\t-c or --copyright include a copyright notice\n"
113 "\t-u or --unicode Unicode version, followed by the version like 5.2.0\n");
115 "\t-s or --sourcedir source directory, followed by the path\n"
116 "\t-o or --output output filename\n"
117 "\t --csource writes a C source file with initializers\n");
119 "\t --fast optimize the data for fast normalization,\n"
120 "\t which might increase its size (Writes fully decomposed\n"
121 "\t regular mappings instead of delta mappings.\n"
122 "\t You should measure the runtime speed to make sure that\n"
123 "\t this is a good trade-off.)\n");
// After the usage text: a negative argc means the arguments were bad,
// otherwise the usage request itself is not an error.
124 return argc
<0 ? U_ILLEGAL_ARGUMENT_ERROR
: U_ZERO_ERROR
;
// Copy the verbose and copyright options into the global flags.
127 beVerbose
=options
[VERBOSE
].doesOccur
;
128 haveCopyright
=options
[COPYRIGHT
].doesOccur
;
// Error code object labelled with this function name; it is passed to the
// calls below and supplies the return value at the end of main().
130 IcuToolErrorCode
errorCode("gennorm2/main()");
// Configuration without normalization support: only a dummy data file is
// written to the output filename.
132 #if UCONFIG_NO_NORMALIZATION
135 "gennorm2 writes a dummy binary data file "
136 "because UCONFIG_NO_NORMALIZATION is set, \n"
137 "see icu/source/common/unicode/uconfig.h\n");
138 udata_createDummy(NULL
, NULL
, options
[OUTPUT_FILENAME
].value
, errorCode
);
139 // Should not return an error since this is the expected behaviour if UCONFIG_NO_NORMALIZATION is on.
140 // return U_UNSUPPORTED_ERROR;
// Create the data builder and check errorCode before the builder is used.
145 LocalPointer
<Normalizer2DataBuilder
> builder(new Normalizer2DataBuilder(errorCode
), errorCode
);
146 errorCode
.assertSuccess();
// Pass the Unicode version from the command line to the builder, if given.
148 if(options
[UNICODE_VERSION
].doesOccur
) {
149 builder
->setUnicodeVersion(options
[UNICODE_VERSION
].value
);
// The fast option selects the OPTIMIZE_FAST setting of the builder.
152 if(options
[OPT_FAST
].doesOccur
) {
153 builder
->setOptimization(Normalizer2DataBuilder::OPTIMIZE_FAST
);
156 // prepare the filename beginning with the source dir
157 CharString
filename(options
[SOURCEDIR
].value
, errorCode
);
158 int32_t pathLength
=filename
.length();
// Append a file separator only when the path does not already end with
// either of the two separator characters.
160 filename
[pathLength
-1]!=U_FILE_SEP_CHAR
&&
161 filename
[pathLength
-1]!=U_FILE_ALT_SEP_CHAR
163 filename
.append(U_FILE_SEP_CHAR
, errorCode
);
// Remember the length of the directory prefix so that the buffer can be
// truncated back to it after each input file.
164 pathLength
=filename
.length();
// Process each remaining argument as an input file name appended to the
// directory prefix.
167 for(int i
=1; i
<argc
; ++i
) {
168 printf("gennorm2: processing %s\n", argv
[i
]);
169 filename
.append(argv
[i
], errorCode
);
170 LocalStdioFilePointer
f(fopen(filename
.data(), "r"));
// Failing to open an input file is fatal: report it and exit.
172 fprintf(stderr
, "gennorm2 error: unable to open %s\n", filename
.data());
173 exit(U_FILE_ACCESS_ERROR
);
// NOTE(review): OVERRIDE_PREVIOUS presumably lets data from this file replace
// values set by earlier files - confirm against n2builder.h.
175 builder
->setOverrideHandling(Normalizer2DataBuilder::OVERRIDE_PREVIOUS
);
176 parseFile(f
.getAlias(), *builder
);
// Drop the file name again, keeping only the directory prefix.
177 filename
.truncate(pathLength
);
// Output: the csource option selects writeCSourceFile(); writeBinaryFile() is
// the other writer, both receive the output filename.
180 if(options
[WRITE_C_SOURCE
].doesOccur
) {
181 builder
->writeCSourceFile(options
[OUTPUT_FILENAME
].value
);
183 builder
->writeBinaryFile(options
[OUTPUT_FILENAME
].value
);
// The final error code doubles as the process exit status.
186 return errorCode
.get();
191 #if !UCONFIG_NO_NORMALIZATION
// Reads the open file f line by line and records the parsed data in builder.
// A data line starts with a code point or code point range, followed by a
// delimiter that selects the operation:
//   colon         decimal ccc value, applied with setCC() to each code point
//   hyphen        removeMapping() for each code point in the range
//   equals sign   round-trip mapping, set for startCP with setRoundTripMapping()
//   greater-than  one-way mapping, set with setOneWayMapping() for each code point
// Parse problems are reported on stderr with a gennorm2 error message.
193 void parseFile(FILE *f
, Normalizer2DataBuilder
&builder
) {
194 IcuToolErrorCode
errorCode("gennorm2/parseFile()");
196 uint32_t startCP
, endCP
;
// fgets() returns NULL at end of file or on a read error, which ends the loop.
197 while(NULL
!=fgets(line
, (int)sizeof(line
), f
)) {
// Locate a # character in the line, which starts a comment.
198 char *comment
=(char *)strchr(line
, '#');
204 continue; // skip empty and comment-only lines
// Skip the first character of the line plus any whitespace, then look for the
// Unicode keyword; the text after the keyword is handed to the builder as the
// Unicode version string.
207 const char *s
=u_skipWhitespace(line
+1);
208 if(0==strncmp(s
, "Unicode", 7)) {
209 s
=u_skipWhitespace(s
+7);
210 builder
.setUnicodeVersion(s
);
212 continue; // reserved syntax
214 const char *delimiter
;
// Parse the leading code point or range into startCP and endCP; delimiter
// receives the position in the line where the range text ended.
216 u_parseCodePointRangeAnyTerminator(line
, &startCP
, &endCP
, &delimiter
, errorCode
);
217 if(errorCode
.isFailure()) {
218 fprintf(stderr
, "gennorm2 error: parsing code point range from %s\n", line
);
219 exit(errorCode
.reset());
// The first non-whitespace character after the range decides the line type.
221 delimiter
=u_skipWhitespace(delimiter
);
// Colon: a ccc value follows, written as a decimal number.
222 if(*delimiter
==':') {
223 const char *s
=u_skipWhitespace(delimiter
+1);
225 unsigned long value
=strtoul(s
, &end
, 10);
// Reject a missing number, trailing non-whitespace text, or a value of 0xff
// or more.
226 if(end
<=s
|| *u_skipWhitespace(end
)!=0 || value
>=0xff) {
227 fprintf(stderr
, "gennorm2 error: parsing ccc from %s\n", line
);
230 for(UChar32 c
=(UChar32
)startCP
; c
<=(UChar32
)endCP
; ++c
) {
231 builder
.setCC(c
, (uint8_t)value
);
// Hyphen: remove the mapping; only whitespace may follow the hyphen.
235 if(*delimiter
=='-') {
236 if(*u_skipWhitespace(delimiter
+1)!=0) {
237 fprintf(stderr
, "gennorm2 error: parsing remove-mapping %s\n", line
);
240 for(UChar32 c
=(UChar32
)startCP
; c
<=(UChar32
)endCP
; ++c
) {
241 builder
.removeMapping(c
);
// Equals or greater-than sign: a mapping string follows. It is parsed into a
// fixed-size UChar buffer whose capacity is Normalizer2Impl::MAPPING_LENGTH_MASK.
245 if(*delimiter
=='=' || *delimiter
=='>') {
246 UChar uchars
[Normalizer2Impl::MAPPING_LENGTH_MASK
];
247 int32_t length
=u_parseString(delimiter
+1, uchars
, UPRV_LENGTHOF(uchars
), NULL
, errorCode
);
248 if(errorCode
.isFailure()) {
249 fprintf(stderr
, "gennorm2 error: parsing mapping string from %s\n", line
);
250 exit(errorCode
.reset());
// NOTE(review): the FALSE argument presumably selects the UnicodeString
// constructor for text that is not NUL-terminated and aliases uchars instead
// of copying it - confirm against unicode/unistr.h.
252 UnicodeString
mapping(FALSE
, uchars
, length
);
253 if(*delimiter
=='=') {
// Per the message below, a round-trip mapping is not accepted for more than
// one code point, so it is set for startCP only.
256 "gennorm2 error: round-trip mapping for more than 1 code point on %s\n",
260 builder
.setRoundTripMapping((UChar32
)startCP
, mapping
);
// One-way mapping: the same mapping is applied to each code point in the range.
262 for(UChar32 c
=(UChar32
)startCP
; c
<=(UChar32
)endCP
; ++c
) {
263 builder
.setOneWayMapping(c
, mapping
);
// No known delimiter matched: report the line as unrecognized.
268 fprintf(stderr
, "gennorm2 error: unrecognized data line %s\n", line
);
273 #endif // !UCONFIG_NO_NORMALIZATION
278 * Hey, Emacs, please set the following:
281 * indent-tabs-mode: nil