2 **********************************************************************
3 * Copyright (C) 2002, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
8 * tab size: 8 (not used)
11 * created on: 2002apr17
12 * created by: Markus W. Scherer
14 * Performance test program for Unicode converters
15 * (converters that support all Unicode code points).
16 * Takes a text file as input (UTF-8 unless -e names another encoding or a Unicode signature is detected).
22 #include <fcntl.h> /* for _O_BINARY */
23 #include <io.h> /* for _setmode() */
25 #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
28 # include <sys/time.h>
34 return t
.tv_sec
*1000+t
.tv_usec
/1000;
38 #include "unicode/utypes.h"
39 #include "unicode/ucnv.h"
40 #include "unicode/ustring.h"
42 /* definitions and text buffers */
/* capacity of the UTF-16 input buffer, in UChars */
#define INPUT_CAPACITY (1024*1024)
/* capacity of the intermediate byte buffer for the main measurement, in bytes */
#define INTERMEDIATE_CAPACITY 4096
/* small intermediate capacity used for the second measurement of each encoding */
#define INTERMEDIATE_SMALL_CAPACITY 20
/* the roundtrip output must be able to hold everything that was read as input */
#define OUTPUT_CAPACITY INPUT_CAPACITY

/* target duration of one timed measurement run, in milliseconds */
#define TARGET_MEASURE_TIME_MS 2000

/*
 * a as a percentage of b, rounded to the nearest integer
 * (half rounds up for non-negative operands).
 * Adding (b), i.e. half of the divisor 2*(b), is what performs the rounding;
 * adding a constant 1 instead would effectively truncate.
 * b must not be zero; both arguments are evaluated more than once.
 */
#define PERCENT(a, b) ((int)(((a)*200+(b))/(2*(b))))

/* number of elements of a real array (not valid for pointers) */
#define ARRAY_LENGTH(a) (sizeof(a)/sizeof((a)[0]))
55 static UChar input
[INPUT_CAPACITY
], output
[OUTPUT_CAPACITY
];
56 static char intermediate
[INTERMEDIATE_CAPACITY
];
58 static int32_t inputLength
, encodedLength
, outputLength
, countInputCodePoints
;
60 static int32_t utf8Length
=0;
61 static double utf8Time
=0.;
63 static const char *const
65 "UTF-8", /* UTF-8 should always be first to serve as percentage reference */
66 "SCSU", "BOCU-1" /*, "CESU-8" *//*, "UTF-16BE", "UTF-16LE"*//*, "GB18030"*/
72 RoundtripFn(UConverter
*cnv
, int32_t intermediateCapacity
, UErrorCode
*pErrorCode
);
75 roundtrip(UConverter
*cnv
, int32_t intermediateCapacity
, UErrorCode
*pErrorCode
) {
76 const UChar
*pIn
, *pInLimit
;
77 UChar
*pOut
, *pOutLimit
;
78 char *pInter
, *pInterLimit
, *p
;
84 pInLimit
=input
+inputLength
;
87 pOutLimit
=output
+OUTPUT_CAPACITY
;
89 pInterLimit
=intermediate
+intermediateCapacity
;
91 encodedLength
=outputLength
=0;
94 while(pIn
<pInLimit
|| !flush
) {
95 /* convert a block of [pIn..pInLimit[ to the encoding in intermediate[] */
97 flush
=(UBool
)(pIn
==pInLimit
);
103 encodedLength
+=(int32_t)(pInter
-intermediate
);
105 if(*pErrorCode
==U_BUFFER_OVERFLOW_ERROR
) {
106 /* in case flush was TRUE make sure that we convert once more to really flush */
108 *pErrorCode
=U_ZERO_ERROR
;
109 } else if(U_FAILURE(*pErrorCode
)) {
113 /* convert the block [intermediate..pInter[ back to UTF-16 */
120 if(U_FAILURE(*pErrorCode
)) {
123 /* intermediate must have been consumed (p==pInter) because of the converter semantics */
126 outputLength
=pOut
-output
;
127 if(inputLength
!=outputLength
) {
128 fprintf(stderr
, "error: roundtrip failed, inputLength %d!=outputLength %d\n", inputLength
, outputLength
);
129 *pErrorCode
=U_INTERNAL_PROGRAM_ERROR
;
134 noop(UConverter
*cnv
, int32_t intermediateCapacity
, UErrorCode
*pErrorCode
) {
139 measureRoundtrips(RoundtripFn
*fn
, UConverter
*cnv
, const char *encName
, int32_t intermediateCapacity
, int32_t n
) {
141 UErrorCode errorCode
;
144 errorCode
=U_ZERO_ERROR
;
146 fn(cnv
, intermediateCapacity
, &errorCode
);
147 } while(U_SUCCESS(errorCode
) && --n
>0);
148 _time
=timeGetTime()-_time
;
150 if(U_FAILURE(errorCode
)) {
151 fprintf(stderr
, "error in roundtrip conversion (%s): %s\n", encName
, u_errorName(errorCode
));
155 if(0!=u_memcmp(input
, output
, inputLength
)) {
156 fprintf(stderr
, "error: roundtrip failed, input[]!=output[]\n");
164 perEncAndCapacity(UConverter
*cnv
, const char *encName
, int32_t intermediateCapacity
) {
169 /*printf("test performance for %s with intermediate capacity %d\n", encName, intermediateCapacity);*/
171 /* warm up caches and estimate loop time */
174 _time
=measureRoundtrips(roundtrip
, cnv
, encName
, intermediateCapacity
, n
);
175 if(_time
<500 && _time
<TARGET_MEASURE_TIME_MS
/10) {
182 if(_time
<TARGET_MEASURE_TIME_MS
) {
183 n
=(n
*TARGET_MEASURE_TIME_MS
)/_time
+1;
186 /* run actual measurement with a target test time of TARGET_MEASURE_TIME_MS */
187 _time
=measureRoundtrips(roundtrip
, cnv
, encName
, intermediateCapacity
, n
);
189 /* subtract same number of loops over no-operation function */
190 _time
-=measureRoundtrips(noop
, cnv
, encName
, intermediateCapacity
, n
);
192 rtTime
=((double)_time
*1000.)/(double)n
;
195 printf("* performance report for %8s:\n", encName
);
196 printf(" intermediate buffer capacity %8d B\n", intermediateCapacity
);
197 if(intermediateCapacity
==INTERMEDIATE_CAPACITY
&& utf8Length
!=0) {
198 printf(" number of encoding bytes %8d B (%3d%% of UTF-8)\n", encodedLength
, PERCENT(encodedLength
, utf8Length
));
199 printf(" roundtrip conversion time %8g μs (%3d%% of UTF-8)\n", rtTime
, PERCENT(rtTime
, utf8Time
));
201 printf(" number of encoding bytes %8d B\n", encodedLength
);
202 printf(" roundtrip conversion time %8g μs\n", rtTime
);
204 printf(" average bytes/code point %8g B/cp\n", (double)encodedLength
/countInputCodePoints
);
207 /* set UTF-8 values */
208 if(intermediateCapacity
==INTERMEDIATE_CAPACITY
&& 0==strcmp(encName
, "UTF-8")) {
209 utf8Length
=encodedLength
;
215 perEnc(UConverter
*cnv
, const char *encName
) {
216 /*printf("test performance for %s\n", encName);*/
217 perEncAndCapacity(cnv
, encName
, INTERMEDIATE_CAPACITY
);
218 perEncAndCapacity(cnv
, encName
, INTERMEDIATE_SMALL_CAPACITY
);
224 UErrorCode errorCode
;
227 printf("number of code points %8d cp\n", countInputCodePoints
);
228 printf("platform endianness: %8s-endian\n", U_IS_BIG_ENDIAN
? "big" : "little");
230 for(i
=0; i
<ARRAY_LENGTH(utfNames
); ++i
) {
231 errorCode
=U_ZERO_ERROR
;
232 cnv
=ucnv_open(utfNames
[i
], &errorCode
);
233 if(U_SUCCESS(errorCode
)) {
234 perEnc(cnv
, utfNames
[i
]);
237 fprintf(stderr
, "error opening converter for \"%s\" - %s\n", utfNames
[i
], u_errorName(errorCode
));
242 /* read a complete block from the input file */
244 readBlock(FILE *in
) {
245 int length
, blockLength
;
248 while(blockLength
<INTERMEDIATE_CAPACITY
&& !feof(in
)) {
249 length
=fread(intermediate
, 1, INTERMEDIATE_CAPACITY
-blockLength
, in
);
250 if(length
<0 || ferror(in
)) {
256 return (int32_t)blockLength
;
260 readInput(FILE *in
, const char *encName
) {
262 UChar
*pOut
, *pOutLimit
;
263 const char *p
, *limit
;
265 UErrorCode errorCode
;
268 pOutLimit
=input
+INPUT_CAPACITY
;
270 errorCode
=U_ZERO_ERROR
;
272 /* read the first block and open the converter */
273 length
=readBlock(in
);
279 int32_t signatureLength
;
280 encName
=ucnv_detectUnicodeSignature(intermediate
, length
,
283 if(U_FAILURE(errorCode
) || encName
==NULL
) {
284 /* default to UTF-8 */
285 printf("no Unicode signature - using UTF-8\n");
287 errorCode
=U_ZERO_ERROR
;
289 printf("detected signature for %s (removing %d bytes)\n", encName
, signatureLength
);
290 /* remove signature byte sequence */
291 memmove(intermediate
, intermediate
+signatureLength
, length
-=signatureLength
);
295 cnv
=ucnv_open(encName
, &errorCode
);
296 if(U_FAILURE(errorCode
)) {
297 fprintf(stderr
, "error: unable to ucnv_open(\"%s\") - %s\n", encName
, u_errorName(errorCode
));
302 /* convert the block */
311 if(U_FAILURE(errorCode
)) {
312 fprintf(stderr
, "error converting input to UTF-16: %s\n", u_errorName(errorCode
));
317 /* read the next block */
318 length
=readBlock(in
);
325 /* flush the converter */
333 if(U_FAILURE(errorCode
)) {
334 fprintf(stderr
, "error converting input to UTF-16: %s\n", u_errorName(errorCode
));
338 inputLength
=(int32_t)(pOut
-input
);
339 countInputCodePoints
=u_countChar32(input
, inputLength
);
341 fprintf(stderr
, "warning: input is empty\n");
349 showUsage(const char *myName
) {
352 "%s [-e encoding-name] filename | '-'\n"
353 " encoding-name must be the name of an encoding supported by ICU\n"
354 " the filename of the input file with text to be used\n"
355 " can be a dash (-) for standard input\n",
360 * Read file using some encoding, convert to 1M UTF-16 input buffer.
361 * For each UTF to be tested:
363 * convert from UTF-16 input buffer to UTF, 4kB buffer
364 * convert from 4kB buffer to 1M UTF-16 output buffer
365 * adjust n so that time elapsed is TARGET_MEASURE_TIME_MS (#define, currently 2s)
366 * ->divide the target time by the measured time, increase n by that factor, run 2nd time
369 * subtract out loop/function overhead
370 * display #code points - #UTF bytes - time per roundtrip
372 * * do the same again with an intermediate buffer size of 20 instead of 4kB
374 * Test following UTFs:
375 * UTF-16BE, UTF-16LE, UTF-8, SCSU, BOCU-1, CESU-8 (only UTF-8, SCSU and BOCU-1 are currently enabled in utfNames[])
377 * Command-line arguments:
378 * - encoding (default UTF-8, detect BOM)
379 * - filename (allow "-")
382 main(int argc
, const char *argv
[]) {
384 const char *myName
, *encName
, *filename
, *basename
;
392 /* get encoding name argument */
393 if(argv
[1][0]=='-' && argv
[1][1]=='e') {
410 /* get filename argument */
416 if(filename
[0]=='-' && filename
[1]==0) {
417 filename
="(standard input)";
419 /* set stdin to binary mode */
420 _setmode(_fileno(stdin
), _O_BINARY
);
422 in
=fopen(filename
, "rb");
424 fprintf(stderr
, "error opening \"%s\"\n", filename
);
431 basename
=strrchr(filename
, U_FILE_SEP_CHAR
);
437 printf("# testing converter performance with file \"%s\"\n", basename
);
438 if(!readInput(in
, encName
)) {
439 fprintf(stderr
, "error reading \"%s\" (encoding %s)\n", filename
, encName
);
447 /* test performance */