1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 *******************************************************************************
6 * Copyright (C) 2009-2014, International Business Machines
7 * Corporation and others. All Rights Reserved.
9 *******************************************************************************
10 * file name: gennorm2.cpp
12 * tab size: 8 (not used)
15 * created on: 2009nov25
16 * created by: Markus W. Scherer
18 * This program reads text files that define Unicode normalization,
19 * parses them, and builds a binary data file.
22 #include "unicode/utypes.h"
23 #include "n2builder.h"
30 #include "unicode/errorcode.h"
31 #include "unicode/localpointer.h"
32 #include "unicode/putil.h"
33 #include "unicode/uchar.h"
34 #include "unicode/unistr.h"
36 #include "normalizer2impl.h"
41 #if UCONFIG_NO_NORMALIZATION
// Tool-wide flags. main() assigns both from the command line:
// beVerbose from options[VERBOSE].doesOccur and
// haveCopyright from options[COPYRIGHT].doesOccur.
// NOTE(review): haveCopyright is initialized to TRUE but later overwritten
// with the option's doesOccur value — confirm that the initializer is intended.
47 UBool beVerbose
=FALSE
, haveCopyright
=TRUE
;
49 #if !UCONFIG_NO_NORMALIZATION
// Forward declaration of the input-file parser that is defined further down
// in this file. main() calls it once per input file with the currently
// active builder.
50 void parseFile(std::ifstream
&f
, Normalizer2DataBuilder
&builder
);
53 /* -------------------------------------------------------------------------- */
// Command-line option table handed to u_parseArgs() in main().
// main() indexes it with symbolic names (HELP_H, HELP_QUESTION_MARK, VERBOSE,
// COPYRIGHT, SOURCEDIR, OUTPUT_FILENAME, UNICODE_VERSION, WRITE_C_SOURCE,
// WRITE_COMBINED_DATA, OPT_FAST), so the entry order here has to agree with
// those indexes.
// NOTE(review): the definitions of those index names and some table entries
// are not shown next to this table — verify the order against them.
68 static UOption options
[]={
70 UOPTION_HELP_QUESTION_MARK
,
// --output and --unicode each require an argument.
74 UOPTION_DEF("output", 'o', UOPT_REQUIRES_ARG
),
75 UOPTION_DEF("unicode", 'u', UOPT_REQUIRES_ARG
),
// The following three take no argument and use '\1' as their short-option
// character; the usage text lists them with long names only.
// NOTE(review): presumably '\1' means "no short form" — confirm in uoptions.h.
76 UOPTION_DEF("csource", '\1', UOPT_NO_ARG
),
77 UOPTION_DEF("combined", '\1', UOPT_NO_ARG
),
78 UOPTION_DEF("fast", '\1', UOPT_NO_ARG
)
// Program entry point.
// Parses the command-line options, reads every remaining argument as an input
// file into a Normalizer2DataBuilder (switching to a second builder after the
// keyword "minus"), and then writes one output to the --output filename:
// a diff data file, a combined data file (--combined), a C source file
// (--csource), or otherwise a binary file.
// Returns errorCode.get() on the normal path; the usage path returns
// U_ILLEGAL_ARGUMENT_ERROR if u_parseArgs() reported a negative argc and
// U_ZERO_ERROR otherwise. Several failure paths call exit() directly.
82 main(int argc
, char* argv
[]) {
83 U_MAIN_INIT_ARGS(argc
, argv
);
85 /* preset then read command line options */
// Default source directory is the empty string.
86 options
[SOURCEDIR
].value
="";
// sizeof(options)/sizeof(options[HELP_H]) is the number of table entries.
// argc is replaced by the value u_parseArgs() returns; a negative value is
// reported as U_ILLEGAL_ARGUMENT_ERROR by the return statement below.
87 argc
=u_parseArgs(argc
, argv
, sizeof(options
)/sizeof(options
[HELP_H
]), options
);
89 /* error handling, printing usage message */
// NOTE(review): the statements that test argc and print the next message are
// not shown here — confirm against the full file.
92 "error in command line argument \"%s\"\n",
// A missing --output option is checked here.
95 if(!options
[OUTPUT_FILENAME
].doesOccur
) {
// Either help option being present is part of the condition that leads to
// the usage text below.
99 options
[HELP_H
].doesOccur
|| options
[HELP_QUESTION_MARK
].doesOccur
// Usage text: regular mode (infiles -> output file).
102 "Usage: %s [-options] infiles+ -o outputfilename\n"
104 "Reads the infiles with normalization data and\n"
105 "creates a binary file, or a C source file (--csource), with the data,\n"
106 "or writes a data file with the combined data (--combined).\n"
107 "See http://userguide.icu-project.org/transforms/normalization#TOC-Data-File-Syntax\n"
// Usage text: diff mode, triggered by the "minus" keyword among the inputs.
109 "Alternate usage: %s [-options] a.txt b.txt minus p.txt q.txt -o outputfilename\n"
111 "Computes the difference of (a, b) minus (p, q) and writes the diff data\n"
112 "in input-file syntax to the outputfilename.\n"
113 "It is then possible to build (p, q, diff) to get the same data as (a, b).\n"
114 "(Useful for computing minimal incremental mapping data files.)\n"
// Usage text: option list.
119 "\t-h or -? or --help this usage text\n"
120 "\t-v or --verbose verbose output\n"
121 "\t-c or --copyright include a copyright notice\n"
122 "\t-u or --unicode Unicode version, followed by the version like 5.2.0\n");
124 "\t-s or --sourcedir source directory, followed by the path\n"
125 "\t-o or --output output filename\n"
126 "\t --csource writes a C source file with initializers\n"
127 "\t --combined writes a .txt file (input-file syntax) with the\n"
128 "\t combined data from all of the input files\n");
130 "\t --fast optimize the data for fast normalization,\n"
131 "\t which might increase its size (Writes fully decomposed\n"
132 "\t regular mappings instead of delta mappings.\n"
133 "\t You should measure the runtime speed to make sure that\n"
134 "\t this is a good trade-off.)\n");
// Exit status after the usage text: an error only if option parsing failed.
135 return argc
<0 ? U_ILLEGAL_ARGUMENT_ERROR
: U_ZERO_ERROR
;
// Copy the option flags into the file-level globals.
138 beVerbose
=options
[VERBOSE
].doesOccur
;
139 haveCopyright
=options
[COPYRIGHT
].doesOccur
;
// Error code object used for the rest of main(); the string labels where it
// was created.
// NOTE(review): how IcuToolErrorCode reports failures is defined elsewhere — confirm.
141 IcuToolErrorCode
errorCode("gennorm2/main()");
// Build without normalization support: only a dummy data file is written to
// the output filename.
143 #if UCONFIG_NO_NORMALIZATION
146 "gennorm2 writes a dummy binary data file "
147 "because UCONFIG_NO_NORMALIZATION is set, \n"
148 "see icu/source/common/unicode/uconfig.h\n");
149 udata_createDummy(NULL
, NULL
, options
[OUTPUT_FILENAME
].value
, errorCode
);
150 // Should not return an error since this is the expected behaviour if UCONFIG_NO_NORMALIZATION is on.
151 // return U_UNSUPPORTED_ERROR;
// b1 is created up front and receives the data of the first group of input
// files. b2 and diff stay empty until a "minus" argument is seen in the loop
// below. builder always points at the builder that parseFile() should fill.
156 LocalPointer
<Normalizer2DataBuilder
> b1(new Normalizer2DataBuilder(errorCode
), errorCode
);
157 LocalPointer
<Normalizer2DataBuilder
> b2
;
158 LocalPointer
<Normalizer2DataBuilder
> diff
;
159 Normalizer2DataBuilder
*builder
= b1
.getAlias();
160 errorCode
.assertSuccess();
// Apply --unicode and --fast to the first builder.
162 if(options
[UNICODE_VERSION
].doesOccur
) {
163 builder
->setUnicodeVersion(options
[UNICODE_VERSION
].value
);
166 if(options
[OPT_FAST
].doesOccur
) {
167 builder
->setOptimization(Normalizer2DataBuilder::OPTIMIZE_FAST
);
170 // prepare the filename beginning with the source dir
// pathLength remembers the length of the directory prefix so that filename
// can be truncated back to it after each input file.
171 CharString
filename(options
[SOURCEDIR
].value
, errorCode
);
172 int32_t pathLength
=filename
.length();
// Append a file separator when the prefix does not already end with one
// (either U_FILE_SEP_CHAR or U_FILE_ALT_SEP_CHAR).
// NOTE(review): the guard that makes the [pathLength-1] access safe for an
// empty prefix is not shown here — confirm it exists.
174 filename
[pathLength
-1]!=U_FILE_SEP_CHAR
&&
175 filename
[pathLength
-1]!=U_FILE_ALT_SEP_CHAR
177 filename
.append(U_FILE_SEP_CHAR
, errorCode
);
178 pathLength
=filename
.length();
// doMinus records whether the "minus" keyword has been seen.
181 bool doMinus
= false;
// Every argument left after option parsing is either an input file name or
// the keyword "minus".
182 for(int i
=1; i
<argc
; ++i
) {
183 printf("gennorm2: processing %s\n", argv
[i
]);
184 if(strcmp(argv
[i
], "minus") == 0) {
// A second "minus" is a usage error.
// NOTE(review): the test that guards this error is not shown here.
186 fprintf(stderr
, "gennorm2 error: only one 'minus' can be specified\n");
187 exit(U_ILLEGAL_ARGUMENT_ERROR
);
189 // Data from previous input files has been collected in b1.
190 // Collect data from further input files in b2.
191 b2
.adoptInsteadAndCheckErrorCode(new Normalizer2DataBuilder(errorCode
), errorCode
);
192 diff
.adoptInsteadAndCheckErrorCode(new Normalizer2DataBuilder(errorCode
), errorCode
);
193 errorCode
.assertSuccess();
// From here on parseFile() fills b2; give it the same --unicode and --fast
// settings that b1 received.
194 builder
= b2
.getAlias();
195 if(options
[UNICODE_VERSION
].doesOccur
) {
196 builder
->setUnicodeVersion(options
[UNICODE_VERSION
].value
);
198 if(options
[OPT_FAST
].doesOccur
) {
199 builder
->setOptimization(Normalizer2DataBuilder::OPTIMIZE_FAST
);
// Regular input file: directory prefix + argument, opened as a text stream.
204 filename
.append(argv
[i
], errorCode
);
205 std::ifstream
f(filename
.data());
// Failure to open the file terminates the tool with U_FILE_ACCESS_ERROR.
207 fprintf(stderr
, "gennorm2 error: unable to open %s\n", filename
.data());
208 exit(U_FILE_ACCESS_ERROR
);
// Override handling is set to OVERRIDE_PREVIOUS before each file is parsed.
210 builder
->setOverrideHandling(Normalizer2DataBuilder::OVERRIDE_PREVIOUS
);
211 parseFile(f
, *builder
);
// Drop the file name again so that only the directory prefix remains.
212 filename
.truncate(pathLength
);
// Output selection. Diff mode computes b1 minus b2 into diff and writes it
// with writeRemoved=true.
// NOTE(review): the condition selecting this first branch is not shown here
// (presumably doMinus) — confirm.
216 Normalizer2DataBuilder::computeDiff(*b1
, *b2
, *diff
);
217 diff
->writeDataFile(options
[OUTPUT_FILENAME
].value
, /* writeRemoved= */ true);
// --combined: write the collected data in text form, with writeRemoved=false.
218 } else if(options
[WRITE_COMBINED_DATA
].doesOccur
) {
219 builder
->writeDataFile(options
[OUTPUT_FILENAME
].value
, /* writeRemoved= */ false);
// --csource: write a C source file.
220 } else if(options
[WRITE_C_SOURCE
].doesOccur
) {
221 builder
->writeCSourceFile(options
[OUTPUT_FILENAME
].value
);
// Otherwise: write the binary data file.
223 builder
->writeBinaryFile(options
[OUTPUT_FILENAME
].value
);
// The process exit status is the current value of errorCode.
226 return errorCode
.get();
231 #if !UCONFIG_NO_NORMALIZATION
// Reads normalization data from f line by line and records it in builder.
// Each data line starts with a code point or code point range, followed by a
// delimiter that selects the kind of entry:
//   ':'  decimal ccc value, passed to builder.setCC() for each code point
//   '-'  remove-mapping, passed to builder.removeMapping() for each code point
//   '='  round-trip mapping, passed to builder.setRoundTripMapping() for startCP
//   '>'  one-way mapping, passed to builder.setOneWayMapping() for each code point
// Parse errors are reported on stderr; the visible error paths terminate the
// tool with exit().
//   f        input stream, read until std::getline() fails
//   builder  receives all parsed values and mappings
233 void parseFile(std::ifstream
&f
, Normalizer2DataBuilder
&builder
) {
234 IcuToolErrorCode
errorCode("gennorm2/parseFile()");
235 std::string lineString
;
236 uint32_t startCP
, endCP
;
237 while(std::getline(f
, lineString
)) {
238 if (lineString
.empty()) {
239 continue; // skip empty lines.
// Obtain a writable pointer to the first character of the line. The string
// is known to be non-empty at this point. Which of the two forms is used
// depends on U_CPLUSPLUS_VERSION.
241 #if (U_CPLUSPLUS_VERSION >= 11)
242 char *line
= &lineString
.front();
244 char *line
= &lineString
.at(0);
// Locate the start of a '#' comment, if any.
// NOTE(review): the statements that use this pointer are not shown here
// (presumably the line is cut at '#' and trimmed) — confirm.
246 char *comment
=(char *)strchr(line
, '#');
252 continue; // skip empty and comment-only lines
// Special line: after its first character and optional whitespace, the word
// "Unicode" introduces a version string that is passed to the builder.
// NOTE(review): the test on the first character is not shown here — confirm.
255 const char *s
=u_skipWhitespace(line
+1);
256 if(0==strncmp(s
, "Unicode", 7)) {
257 s
=u_skipWhitespace(s
+7);
258 builder
.setUnicodeVersion(s
);
260 continue; // reserved syntax
// Parse the leading code point or range; delimiter is set by the parser and
// is examined below to decide the entry type.
262 const char *delimiter
;
264 u_parseCodePointRangeAnyTerminator(line
, &startCP
, &endCP
, &delimiter
, errorCode
);
265 if(errorCode
.isFailure()) {
266 fprintf(stderr
, "gennorm2 error: parsing code point range from %s\n", line
);
267 exit(errorCode
.reset());
// Reject any range that overlaps the surrogate code points D800..DFFF.
269 if (endCP
>= 0xd800 && startCP
<= 0xdfff) {
270 fprintf(stderr
, "gennorm2 error: value or mapping for surrogate code points: %s\n",
272 exit(U_ILLEGAL_ARGUMENT_ERROR
);
274 delimiter
=u_skipWhitespace(delimiter
);
// ':' entry: a decimal ccc value.
275 if(*delimiter
==':') {
276 const char *s
=u_skipWhitespace(delimiter
+1);
278 unsigned long value
=strtoul(s
, &end
, 10);
// Error if no digits were consumed, if anything other than whitespace
// follows the number, or if the value is 0xff or larger.
279 if(end
<=s
|| *u_skipWhitespace(end
)!=0 || value
>=0xff) {
280 fprintf(stderr
, "gennorm2 error: parsing ccc from %s\n", line
);
// Apply the value to every code point of the range.
283 for(UChar32 c
=(UChar32
)startCP
; c
<=(UChar32
)endCP
; ++c
) {
284 builder
.setCC(c
, (uint8_t)value
);
// '-' entry: remove the mapping; nothing but whitespace may follow the '-'.
288 if(*delimiter
=='-') {
289 if(*u_skipWhitespace(delimiter
+1)!=0) {
290 fprintf(stderr
, "gennorm2 error: parsing remove-mapping %s\n", line
);
293 for(UChar32 c
=(UChar32
)startCP
; c
<=(UChar32
)endCP
; ++c
) {
294 builder
.removeMapping(c
);
// '=' or '>' entry: a mapping string, parsed into a fixed-size buffer of
// Normalizer2Impl::MAPPING_LENGTH_MASK UChars.
298 if(*delimiter
=='=' || *delimiter
=='>') {
299 UChar uchars
[Normalizer2Impl::MAPPING_LENGTH_MASK
];
300 int32_t length
=u_parseString(delimiter
+1, uchars
, UPRV_LENGTHOF(uchars
), NULL
, errorCode
);
301 if(errorCode
.isFailure()) {
302 fprintf(stderr
, "gennorm2 error: parsing mapping string from %s\n", line
);
303 exit(errorCode
.reset());
305 UnicodeString
mapping(FALSE
, uchars
, length
);
// '=' is a round-trip mapping and is set for startCP only; the error message
// below is for a round-trip mapping given for more than one code point.
// NOTE(review): the test that triggers that message is not shown here.
306 if(*delimiter
=='=') {
309 "gennorm2 error: round-trip mapping for more than 1 code point on %s\n",
313 builder
.setRoundTripMapping((UChar32
)startCP
, mapping
);
// One-way mapping: applied to every code point of the range.
315 for(UChar32 c
=(UChar32
)startCP
; c
<=(UChar32
)endCP
; ++c
) {
316 builder
.setOneWayMapping(c
, mapping
);
// No known delimiter matched.
321 fprintf(stderr
, "gennorm2 error: unrecognized data line %s\n", line
);
326 #endif // !UCONFIG_NO_NORMALIZATION
331 * Hey, Emacs, please set the following:
334 * indent-tabs-mode: nil