]> git.saurik.com Git - apple/icu.git/blob - icuSources/test/utfperf/utfperf.c
dfe7e3f81a2f5f294a2e1a097f622eca297e92e3
[apple/icu.git] / icuSources / test / utfperf / utfperf.c
1 /*
2 **********************************************************************
3 * Copyright (C) 2002, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
6 * file name: utfperf.c
7 * encoding: US-ASCII
8 * tab size: 8 (not used)
9 * indentation:4
10 *
11 * created on: 2002apr17
12 * created by: Markus W. Scherer
13 *
14 * Performance test program for Unicode converters
15 * (converters that support all Unicode code points).
16 * Takes a UTF-8 file as input.
17 */
18
19 #include <stdio.h>
20 #include <string.h>
21
22 #include <fcntl.h> /* for _O_BINARY */
23 #include <io.h> /* for _setmode() */
24
25 #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
26 # include <windows.h>
27 #else
28 # include <sys/time.h>
/*
 * Non-Windows fallback for timeGetTime(): returns a wall-clock time stamp
 * in milliseconds based on gettimeofday().
 * Only differences between two return values are meaningful; the arithmetic
 * is done in unsigned long so that it wraps around instead of overflowing
 * a signed type on platforms where long is 32 bits wide.
 */
static unsigned long
timeGetTime(void) {
    struct timeval t;

    gettimeofday(&t, NULL);
    return (unsigned long)t.tv_sec*1000UL+(unsigned long)(t.tv_usec/1000);
}
36 #endif
37
38 #include "unicode/utypes.h"
39 #include "unicode/ucnv.h"
40 #include "unicode/ustring.h"
41
42 /* definitions and text buffers */
43
/* capacity of the UTF-16 input[] buffer, in UChars */
#define INPUT_CAPACITY (1024*1024)
/* size of intermediate[] in bytes; also the default intermediate capacity */
#define INTERMEDIATE_CAPACITY 4096
/* second, deliberately tiny intermediate capacity used for a second test run */
#define INTERMEDIATE_SMALL_CAPACITY 20
/* capacity of the UTF-16 output[] buffer, in UChars */
#define OUTPUT_CAPACITY INPUT_CAPACITY

/* target duration of one timed measurement, in milliseconds */
#define TARGET_MEASURE_TIME_MS 2000

/*
 * a as a percentage of b, rounded to the nearest integer:
 * (a*200+b)/(2*b) == a*100/b + 1/2, then truncated by the division/cast.
 * b must not be 0. Both arguments are evaluated more than once.
 */
#define PERCENT(a, b) ((int)(((a)*200+(b))/(2*(b))))

/* number of elements of a true array (not of a pointer) */
#define ARRAY_LENGTH(a) (sizeof(a)/sizeof((a)[0]))
54
55 static UChar input[INPUT_CAPACITY], output[OUTPUT_CAPACITY];
56 static char intermediate[INTERMEDIATE_CAPACITY];
57
58 static int32_t inputLength, encodedLength, outputLength, countInputCodePoints;
59
60 static int32_t utf8Length=0;
61 static double utf8Time=0.;
62
/* names of the converters to be measured, in test order */
static const char *const utfNames[] = {
    /* UTF-8 should always be first to serve as percentage reference */
    "UTF-8",
    "SCSU",
    "BOCU-1"
    /* disabled: "CESU-8", "UTF-16BE", "UTF-16LE", "GB18030" */
};
68
69 /* functions */
70
71 typedef void
72 RoundtripFn(UConverter *cnv, int32_t intermediateCapacity, UErrorCode *pErrorCode);
73
74 static void
75 roundtrip(UConverter *cnv, int32_t intermediateCapacity, UErrorCode *pErrorCode) {
76 const UChar *pIn, *pInLimit;
77 UChar *pOut, *pOutLimit;
78 char *pInter, *pInterLimit, *p;
79 UBool flush;
80
81 ucnv_reset(cnv);
82
83 pIn=input;
84 pInLimit=input+inputLength;
85
86 pOut=output;
87 pOutLimit=output+OUTPUT_CAPACITY;
88
89 pInterLimit=intermediate+intermediateCapacity;
90
91 encodedLength=outputLength=0;
92 flush=FALSE;
93
94 while(pIn<pInLimit || !flush) {
95 /* convert a block of [pIn..pInLimit[ to the encoding in intermediate[] */
96 pInter=intermediate;
97 flush=(UBool)(pIn==pInLimit);
98 ucnv_fromUnicode(cnv,
99 &pInter, pInterLimit,
100 &pIn, pInLimit,
101 NULL, flush,
102 pErrorCode);
103 encodedLength+=(int32_t)(pInter-intermediate);
104
105 if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
106 /* in case flush was TRUE make sure that we convert once more to really flush */
107 flush=FALSE;
108 *pErrorCode=U_ZERO_ERROR;
109 } else if(U_FAILURE(*pErrorCode)) {
110 return;
111 }
112
113 /* convert the block [intermediate..pInter[ back to UTF-16 */
114 p=intermediate;
115 ucnv_toUnicode(cnv,
116 &pOut, pOutLimit,
117 &p, pInter,
118 NULL, flush,
119 pErrorCode);
120 if(U_FAILURE(*pErrorCode)) {
121 return;
122 }
123 /* intermediate must have been consumed (p==pInter) because of the converter semantics */
124 }
125
126 outputLength=pOut-output;
127 if(inputLength!=outputLength) {
128 fprintf(stderr, "error: roundtrip failed, inputLength %d!=outputLength %d\n", inputLength, outputLength);
129 *pErrorCode=U_INTERNAL_PROGRAM_ERROR;
130 }
131 }
132
133 static void
134 noop(UConverter *cnv, int32_t intermediateCapacity, UErrorCode *pErrorCode) {
135 /* do nothing */
136 }
137
138 static unsigned long
139 measureRoundtrips(RoundtripFn *fn, UConverter *cnv, const char *encName, int32_t intermediateCapacity, int32_t n) {
140 unsigned long _time;
141 UErrorCode errorCode;
142
143 _time=timeGetTime();
144 errorCode=U_ZERO_ERROR;
145 do {
146 fn(cnv, intermediateCapacity, &errorCode);
147 } while(U_SUCCESS(errorCode) && --n>0);
148 _time=timeGetTime()-_time;
149
150 if(U_FAILURE(errorCode)) {
151 fprintf(stderr, "error in roundtrip conversion (%s): %s\n", encName, u_errorName(errorCode));
152 return 0x7fffffff;
153 }
154
155 if(0!=u_memcmp(input, output, inputLength)) {
156 fprintf(stderr, "error: roundtrip failed, input[]!=output[]\n");
157 return 0x7fffffff;
158 }
159
160 return _time;
161 }
162
163 static void
164 perEncAndCapacity(UConverter *cnv, const char *encName, int32_t intermediateCapacity) {
165 double rtTime;
166 unsigned long _time;
167 int32_t n;
168
169 /*printf("test performance for %s with intermediate capacity %d\n", encName, intermediateCapacity);*/
170
171 /* warm up caches and estimate loop time */
172 n=10;
173 for(;;) {
174 _time=measureRoundtrips(roundtrip, cnv, encName, intermediateCapacity, n);
175 if(_time<500 && _time<TARGET_MEASURE_TIME_MS/10) {
176 n*=10;
177 } else {
178 break;
179 }
180 }
181
182 if(_time<TARGET_MEASURE_TIME_MS) {
183 n=(n*TARGET_MEASURE_TIME_MS)/_time+1;
184 }
185
186 /* run actual measurement with a target test time of 10s */
187 _time=measureRoundtrips(roundtrip, cnv, encName, intermediateCapacity, n);
188
189 /* subtract same number of loops over no-operation function */
190 _time-=measureRoundtrips(noop, cnv, encName, intermediateCapacity, n);
191
192 rtTime=((double)_time*1000.)/(double)n;
193
194 /* report */
195 printf("* performance report for %8s:\n", encName);
196 printf(" intermediate buffer capacity %8d B\n", intermediateCapacity);
197 if(intermediateCapacity==INTERMEDIATE_CAPACITY && utf8Length!=0) {
198 printf(" number of encoding bytes %8d B (%3d%% of UTF-8)\n", encodedLength, PERCENT(encodedLength, utf8Length));
199 printf(" roundtrip conversion time %8g &#956;s (%3d%% of UTF-8)\n", rtTime, PERCENT(rtTime, utf8Time));
200 } else {
201 printf(" number of encoding bytes %8d B\n", encodedLength);
202 printf(" roundtrip conversion time %8g &#956;s\n", rtTime);
203 }
204 printf(" average bytes/code point %8g B/cp\n", (double)encodedLength/countInputCodePoints);
205 puts("");
206
207 /* set UTF-8 values */
208 if(intermediateCapacity==INTERMEDIATE_CAPACITY && 0==strcmp(encName, "UTF-8")) {
209 utf8Length=encodedLength;
210 utf8Time=rtTime;
211 }
212 }
213
214 static void
215 perEnc(UConverter *cnv, const char *encName) {
216 /*printf("test performance for %s\n", encName);*/
217 perEncAndCapacity(cnv, encName, INTERMEDIATE_CAPACITY);
218 perEncAndCapacity(cnv, encName, INTERMEDIATE_SMALL_CAPACITY);
219 }
220
221 static void
222 testPerformance() {
223 UConverter *cnv;
224 UErrorCode errorCode;
225 int32_t i;
226
227 printf("number of code points %8d cp\n", countInputCodePoints);
228 printf("platform endianness: %8s-endian\n", U_IS_BIG_ENDIAN ? "big" : "little");
229 puts("");
230 for(i=0; i<ARRAY_LENGTH(utfNames); ++i) {
231 errorCode=U_ZERO_ERROR;
232 cnv=ucnv_open(utfNames[i], &errorCode);
233 if(U_SUCCESS(errorCode)) {
234 perEnc(cnv, utfNames[i]);
235 ucnv_close(cnv);
236 } else {
237 fprintf(stderr, "error opening converter for \"%s\" - %s\n", utfNames[i], u_errorName(errorCode));
238 }
239 }
240 }
241
242 /* read a complete block from the input file */
243 static int32_t
244 readBlock(FILE *in) {
245 int length, blockLength;
246
247 blockLength=0;
248 while(blockLength<INTERMEDIATE_CAPACITY && !feof(in)) {
249 length=fread(intermediate, 1, INTERMEDIATE_CAPACITY-blockLength, in);
250 if(length<0 || ferror(in)) {
251 return -1;
252 }
253 blockLength+=length;
254 }
255
256 return (int32_t)blockLength;
257 }
258
259 static UBool
260 readInput(FILE *in, const char *encName) {
261 UConverter *cnv;
262 UChar *pOut, *pOutLimit;
263 const char *p, *limit;
264 int32_t length;
265 UErrorCode errorCode;
266
267 pOut=input;
268 pOutLimit=input+INPUT_CAPACITY;
269
270 errorCode=U_ZERO_ERROR;
271
272 /* read the first block and open the converter */
273 length=readBlock(in);
274 if(length<0) {
275 return FALSE;
276 }
277
278 if(encName==NULL) {
279 int32_t signatureLength;
280 encName=ucnv_detectUnicodeSignature(intermediate, length,
281 &signatureLength,
282 &errorCode);
283 if(U_FAILURE(errorCode) || encName==NULL) {
284 /* default to UTF-8 */
285 printf("no Unicode signature - using UTF-8\n");
286 encName="UTF-8";
287 errorCode=U_ZERO_ERROR;
288 } else {
289 printf("detected signature for %s (removing %d bytes)\n", encName, signatureLength);
290 /* remove signature byte sequence */
291 memmove(intermediate, intermediate+signatureLength, length-=signatureLength);
292 }
293 }
294
295 cnv=ucnv_open(encName, &errorCode);
296 if(U_FAILURE(errorCode)) {
297 fprintf(stderr, "error: unable to ucnv_open(\"%s\") - %s\n", encName, u_errorName(errorCode));
298 return FALSE;
299 }
300
301 while(length>0) {
302 /* convert the block */
303 p=intermediate;
304 limit=p+length;
305
306 ucnv_toUnicode(cnv,
307 &pOut, pOutLimit,
308 &p, limit,
309 NULL, FALSE,
310 &errorCode);
311 if(U_FAILURE(errorCode)) {
312 fprintf(stderr, "error converting input to UTF-16: %s\n", u_errorName(errorCode));
313 ucnv_close(cnv);
314 return FALSE;
315 }
316
317 /* read the next block */
318 length=readBlock(in);
319 if(length<0) {
320 ucnv_close(cnv);
321 return FALSE;
322 }
323 }
324
325 /* flush the converter */
326 ucnv_toUnicode(cnv,
327 &pOut, pOutLimit,
328 &p, p,
329 NULL, TRUE,
330 &errorCode);
331 ucnv_close(cnv);
332
333 if(U_FAILURE(errorCode)) {
334 fprintf(stderr, "error converting input to UTF-16: %s\n", u_errorName(errorCode));
335 return FALSE;
336 }
337
338 inputLength=(int32_t)(pOut-input);
339 countInputCodePoints=u_countChar32(input, inputLength);
340 if(inputLength<=0) {
341 fprintf(stderr, "warning: input is empty\n");
342 return FALSE;
343 }
344
345 return TRUE;
346 }
347
/*
 * Print the command-line synopsis to stderr.
 * myName is the program name as invoked (argv[0]).
 */
static void
showUsage(const char *myName) {
    /* synopsis line, the only part that needs formatting */
    fprintf(stderr, "Usage:\n%s [-e encoding-name] filename | '-'\n", myName);

    /* fixed explanation of the arguments */
    fputs(" encoding-name must be the name of an encoding supported by ICU\n"
          " the filename of the input file with text to be used\n"
          " can be a dash (-) for standard input\n",
          stderr);
}
358
359 /*
360 * Read file using some encoding, convert to 1M UTF-16 input buffer.
361 * For each UTF to be tested:
362 * n times:
363 * convert from UTF-16 input buffer to UTF, 4kB buffer
364 * convert from 4kB buffer to 1M UTF-16 output buffer
 *     adjust n so that the time elapsed is about TARGET_MEASURE_TIME_MS (#define, currently 2s)
 *     ->divide the target time by the measured time, increase n by that factor, run 2nd time
367 * n times:
368 * empty function
369 * subtract out loop/function overhead
370 * display #code points - #UTF bytes - time per roundtrip
371 *
372 * * do the same again with an intermediate buffer size of 20 instead of 4kB
373 *
 * Test following UTFs:
 *     UTF-16BE, UTF-16LE, UTF-8, SCSU, BOCU-1, CESU-8
 *     (only UTF-8, SCSU and BOCU-1 are currently enabled in utfNames[])
376 *
377 * Command-line arguments:
378 * - encoding (default UTF-8, detect BOM)
379 * - filename (allow "-")
380 */
381 extern int
382 main(int argc, const char *argv[]) {
383 FILE *in;
384 const char *myName, *encName, *filename, *basename;
385
386 myName=argv[0];
387 if(argc<2) {
388 showUsage(myName);
389 return 1;
390 }
391
392 /* get encoding name argument */
393 if(argv[1][0]=='-' && argv[1][1]=='e') {
394 encName=argv[1]+2;
395 --argc;
396 ++argv;
397 if(*encName==0) {
398 if(argc<2) {
399 showUsage(myName);
400 return 1;
401 }
402 encName=argv[1];
403 --argc;
404 ++argv;
405 }
406 } else {
407 encName=NULL;
408 }
409
410 /* get filename argument */
411 if(argc<2) {
412 showUsage(myName);
413 return 1;
414 }
415 filename=argv[1];
416 if(filename[0]=='-' && filename[1]==0) {
417 filename="(standard input)";
418 in=stdin;
419 /* set stdin to binary mode */
420 _setmode(_fileno(stdin), _O_BINARY);
421 } else {
422 in=fopen(filename, "rb");
423 if(in==NULL) {
424 fprintf(stderr, "error opening \"%s\"\n", filename);
425 showUsage(myName);
426 return 2;
427 }
428 }
429
430 /* read input */
431 basename=strrchr(filename, U_FILE_SEP_CHAR);
432 if(basename!=NULL) {
433 ++basename;
434 } else {
435 basename=filename;
436 }
437 printf("# testing converter performance with file \"%s\"\n", basename);
438 if(!readInput(in, encName)) {
439 fprintf(stderr, "error reading \"%s\" (encoding %s)\n", filename, encName);
440 showUsage(myName);
441 return 2;
442 }
443 if(in!=stdin) {
444 fclose(in);
445 }
446
447 /* test performance */
448 testPerformance();
449 return 0;
450 }