+++ /dev/null
-/*
-**********************************************************************
-* Copyright (C) 2002, International Business Machines
-* Corporation and others. All Rights Reserved.
-**********************************************************************
-* file name: utfperf.c
-* encoding: US-ASCII
-* tab size: 8 (not used)
-* indentation:4
-*
-* created on: 2002apr17
-* created by: Markus W. Scherer
-*
-* Performance test program for Unicode converters
-* (converters that support all Unicode code points).
-* Takes a UTF-8 file as input.
-*/
-
-#include <stdio.h>
-#include <string.h>
-
-#include <fcntl.h> /* for _O_BINARY */
-#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
-# include <io.h> /* for _setmode(); this header is only needed for the Windows branch in main() */
-#endif
-
-#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
-# include <windows.h>
-#else
-# include <sys/time.h>
- /*
-  * Stand-in for the Windows timeGetTime() on other platforms.
-  *
-  * Returns a millisecond counter derived from gettimeofday(), or 0 if the
-  * clock cannot be read. The value may wrap around; callers in this file
-  * only use the difference between two readings, and unsigned subtraction
-  * stays correct across a single wrap.
-  */
- static unsigned long
- timeGetTime(void) {
-     struct timeval t;
-
-     if(gettimeofday(&t, NULL)!=0) {
-         return 0;
-     }
-     /* convert to unsigned first: tv_sec*1000 can overflow a signed 32-bit type */
-     return (unsigned long)t.tv_sec*1000UL+(unsigned long)t.tv_usec/1000UL;
- }
-#endif
-
-#include "unicode/utypes.h"
-#include "unicode/ucnv.h"
-#include "unicode/ustring.h"
-
-/* definitions and text buffers */
-
-#define INPUT_CAPACITY (1024*1024) /* capacity of input[] in UChars; OUTPUT_CAPACITY is defined from it */
-#define INTERMEDIATE_CAPACITY 4096 /* size of intermediate[] in bytes; the larger of the two block sizes tested */
-#define INTERMEDIATE_SMALL_CAPACITY 20 /* the smaller block size that perEnc() also tests */
-#define OUTPUT_CAPACITY INPUT_CAPACITY /* capacity of output[] in UChars */
-
-#define TARGET_MEASURE_TIME_MS 2000 /* duration that the timed run is scaled towards, in milliseconds */
-
-#define PERCENT(a, b) (int)(((a)*200+1)/(2*(b))) /* roughly 100*a/b as an int; b must not be 0 */
-
-#define ARRAY_LENGTH(a) (sizeof(a)/sizeof((a)[0])) /* element count; only meaningful for real arrays, not pointers */
-
-static UChar input[INPUT_CAPACITY], output[OUTPUT_CAPACITY]; /* UTF-16 text filled by readInput(), and the text written back by roundtrip() */
-static char intermediate[INTERMEDIATE_CAPACITY]; /* encoded bytes between the two conversions; readBlock() also reads file data into it */
-
-static int32_t inputLength, encodedLength, outputLength, countInputCodePoints; /* UChars in input[], encoded bytes, UChars in output[], code points in input[] */
-
-static int32_t utf8Length=0; /* encodedLength of the UTF-8 run, the percentage reference; 0 until that run has finished */
-static double utf8Time=0.; /* roundtrip time of the UTF-8 run, the percentage reference */
-
-static const char *const
-utfNames[]={ /* converter names that testPerformance() opens and measures, in this order */
- "UTF-8", /* UTF-8 should always be first to serve as percentage reference */
- "SCSU", "BOCU-1" /*, "CESU-8" *//*, "UTF-16BE", "UTF-16LE"*//*, "GB18030"*/
-};
-
-/* functions */
-
-typedef void
-RoundtripFn(UConverter *cnv, int32_t intermediateCapacity, UErrorCode *pErrorCode); /* common signature of roundtrip() and noop(), so measureRoundtrips() can time either one */
-
-/*
- * Convert input[] (UTF-16) to the converter's encoding and back into
- * output[], passing the encoded bytes through intermediate[] block by block.
- *
- * cnv                  converter used for both directions; it is reset first
- * intermediateCapacity number of bytes of intermediate[] to use per block;
- *                      must be in 1..INTERMEDIATE_CAPACITY
- * pErrorCode           in/out ICU error code; set to a failure code if the
- *                      capacity is out of range, a conversion fails, or the
- *                      output length differs from the input length
- *
- * Sets encodedLength (total encoded bytes) and outputLength (UChars written).
- */
-static void
-roundtrip(UConverter *cnv, int32_t intermediateCapacity, UErrorCode *pErrorCode) {
-    const UChar *pIn, *pInLimit;
-    UChar *pOut, *pOutLimit;
-    char *pInter, *pInterLimit;
-    const char *p;
-    UBool flush;
-
-    /*
-     * intermediate[] has a fixed size: a larger capacity would overrun it,
-     * and a capacity of 0 or less would never make progress.
-     */
-    if(intermediateCapacity<=0 || intermediateCapacity>INTERMEDIATE_CAPACITY) {
-        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
-        return;
-    }
-
-    ucnv_reset(cnv);
-
-    pIn=input;
-    pInLimit=input+inputLength;
-
-    pOut=output;
-    pOutLimit=output+OUTPUT_CAPACITY;
-
-    pInterLimit=intermediate+intermediateCapacity;
-
-    encodedLength=outputLength=0;
-    flush=FALSE;
-
-    /* loop until all input is consumed and one pass has completed with flush==TRUE */
-    while(pIn<pInLimit || !flush) {
-        /* convert a block of [pIn..pInLimit[ to the encoding in intermediate[] */
-        pInter=intermediate;
-        flush=(UBool)(pIn==pInLimit);
-        ucnv_fromUnicode(cnv,
-                         &pInter, pInterLimit,
-                         &pIn, pInLimit,
-                         NULL, flush,
-                         pErrorCode);
-        encodedLength+=(int32_t)(pInter-intermediate);
-
-        if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
-            /* in case flush was TRUE make sure that we convert once more to really flush */
-            flush=FALSE;
-            *pErrorCode=U_ZERO_ERROR;
-        } else if(U_FAILURE(*pErrorCode)) {
-            return;
-        }
-
-        /* convert the block [intermediate..pInter[ back to UTF-16 */
-        p=intermediate;
-        ucnv_toUnicode(cnv,
-                       &pOut, pOutLimit,
-                       &p, pInter,
-                       NULL, flush,
-                       pErrorCode);
-        if(U_FAILURE(*pErrorCode)) {
-            return;
-        }
-        /* intermediate must have been consumed (p==pInter) because of the converter semantics */
-    }
-
-    outputLength=(int32_t)(pOut-output);
-    if(inputLength!=outputLength) {
-        fprintf(stderr, "error: roundtrip failed, inputLength %d!=outputLength %d\n", (int)inputLength, (int)outputLength);
-        *pErrorCode=U_INTERNAL_PROGRAM_ERROR;
-    }
-}
-
-/*
- * Empty function with the RoundtripFn signature; it ignores every argument.
- * Timed in place of roundtrip() so the loop and call overhead can be subtracted.
- */
-static void
-noop(UConverter *cnv, int32_t intermediateCapacity, UErrorCode *pErrorCode) {
-    (void)cnv;
-    (void)intermediateCapacity;
-    (void)pErrorCode;
-}
-
-/*
- * Call fn up to n times (at least once) and return the elapsed time as
- * measured with timeGetTime().
- *
- * fn                   function to time, roundtrip() or noop()
- * cnv                  converter handed to fn
- * encName              converter name, used only in the error message
- * intermediateCapacity block size handed to fn
- * n                    number of calls; the loop stops early if fn reports a failure
- *
- * Returns 0x7fffffff instead of a time if fn failed or if output[] does not
- * match input[] afterwards; the error is written to stderr.
- */
-static unsigned long
-measureRoundtrips(RoundtripFn *fn, UConverter *cnv, const char *encName, int32_t intermediateCapacity, int32_t n) {
-    UErrorCode status=U_ZERO_ERROR;
-    unsigned long startTime, elapsed;
-
-    startTime=timeGetTime();
-    for(;;) {
-        fn(cnv, intermediateCapacity, &status);
-        if(U_FAILURE(status) || --n<=0) {
-            break;
-        }
-    }
-    elapsed=timeGetTime()-startTime;
-
-    if(U_FAILURE(status)) {
-        fprintf(stderr, "error in roundtrip conversion (%s): %s\n", encName, u_errorName(status));
-        return 0x7fffffff;
-    }
-
-    /* the text must have survived the roundtrip unchanged */
-    if(u_memcmp(input, output, inputLength)!=0) {
-        fprintf(stderr, "error: roundtrip failed, input[]!=output[]\n");
-        return 0x7fffffff;
-    }
-
-    return elapsed;
-}
-
-/*
- * Measure and report the roundtrip performance of one converter with one
- * intermediate buffer size.
- *
- * cnv                  open converter to test
- * encName              converter name, used in the report and to recognize "UTF-8"
- * intermediateCapacity block size in bytes handed on to roundtrip()
- *
- * Prints a report to stdout. When encName is "UTF-8" and the capacity is
- * INTERMEDIATE_CAPACITY, the results are stored in utf8Length/utf8Time as the
- * reference for the percentages of later reports. Returns without a report
- * if a measurement fails (measureRoundtrips() has already printed the error).
- */
-static void
-perEncAndCapacity(UConverter *cnv, const char *encName, int32_t intermediateCapacity) {
-    /* value that measureRoundtrips() returns when a roundtrip failed */
-    const unsigned long measureFailed=0x7fffffff;
-    /* largest n that can still be multiplied by 10 without overflowing int32_t */
-    const int32_t maxScalableN=0x7fffffff/10;
-    double rtTime;
-    unsigned long elapsed, overhead;
-    int64_t scaledN;
-    int32_t n;
-
-    /* warm up caches and estimate loop time */
-    n=10;
-    for(;;) {
-        elapsed=measureRoundtrips(roundtrip, cnv, encName, intermediateCapacity, n);
-        if(elapsed==measureFailed) {
-            return;
-        }
-        if(elapsed<500 && elapsed<TARGET_MEASURE_TIME_MS/10 && n<=maxScalableN) {
-            n*=10;
-        } else {
-            break;
-        }
-    }
-
-    /* scale n so that the timed run takes about TARGET_MEASURE_TIME_MS */
-    if(elapsed>0 && elapsed<TARGET_MEASURE_TIME_MS) {
-        /* 64-bit product: n*TARGET_MEASURE_TIME_MS does not fit into int32_t once n reaches 10 million */
-        scaledN=((int64_t)n*TARGET_MEASURE_TIME_MS)/(int64_t)elapsed+1;
-        n= scaledN<0x7fffffff ? (int32_t)scaledN : 0x7fffffff;
-    }
-
-    /* run actual measurement */
-    elapsed=measureRoundtrips(roundtrip, cnv, encName, intermediateCapacity, n);
-    if(elapsed==measureFailed) {
-        return;
-    }
-
-    /* subtract same number of loops over no-operation function */
-    overhead=measureRoundtrips(noop, cnv, encName, intermediateCapacity, n);
-    if(overhead==measureFailed) {
-        return;
-    }
-    /* clamp at 0: an unsigned subtraction would wrap if the overhead run happened to take longer */
-    elapsed= overhead<elapsed ? elapsed-overhead : 0;
-
-    /* elapsed is in milliseconds; rtTime is microseconds per roundtrip */
-    rtTime=((double)elapsed*1000.)/(double)n;
-
-    /* report */
-    printf("* performance report for %8s:\n", encName);
-    printf(" intermediate buffer capacity %8d B\n", (int)intermediateCapacity);
-    /* percentages need both UTF-8 reference values to be non-zero divisors */
-    if(intermediateCapacity==INTERMEDIATE_CAPACITY && utf8Length!=0 && utf8Time>0.) {
-        printf(" number of encoding bytes %8d B (%3d%% of UTF-8)\n", (int)encodedLength, PERCENT(encodedLength, utf8Length));
-        printf(" roundtrip conversion time %8g μs (%3d%% of UTF-8)\n", rtTime, PERCENT(rtTime, utf8Time));
-    } else {
-        printf(" number of encoding bytes %8d B\n", (int)encodedLength);
-        printf(" roundtrip conversion time %8g μs\n", rtTime);
-    }
-    printf(" average bytes/code point %8g B/cp\n", (double)encodedLength/countInputCodePoints);
-    puts("");
-
-    /* set UTF-8 values */
-    if(intermediateCapacity==INTERMEDIATE_CAPACITY && 0==strcmp(encName, "UTF-8")) {
-        utf8Length=encodedLength;
-        utf8Time=rtTime;
-    }
-}
-
-/*
- * Measure one converter with each intermediate buffer size:
- * INTERMEDIATE_CAPACITY first, then INTERMEDIATE_SMALL_CAPACITY.
- */
-static void
-perEnc(UConverter *cnv, const char *encName) {
-    static const int32_t capacities[2]={ INTERMEDIATE_CAPACITY, INTERMEDIATE_SMALL_CAPACITY };
-    int32_t k;
-
-    for(k=0; k<2; ++k) {
-        perEncAndCapacity(cnv, encName, capacities[k]);
-    }
-}
-
-/*
- * Print the general header lines, then open each converter named in
- * utfNames[] and measure it. A converter that cannot be opened is reported
- * on stderr and skipped.
- */
-static void
-testPerformance() {
-    const int32_t nameCount=(int32_t)ARRAY_LENGTH(utfNames);
-    int32_t idx;
-
-    printf("number of code points %8d cp\n", countInputCodePoints);
-    printf("platform endianness: %8s-endian\n", U_IS_BIG_ENDIAN ? "big" : "little");
-    puts("");
-
-    for(idx=0; idx<nameCount; ++idx) {
-        const char *name=utfNames[idx];
-        UErrorCode status=U_ZERO_ERROR;
-        UConverter *converter=ucnv_open(name, &status);
-
-        if(U_FAILURE(status)) {
-            fprintf(stderr, "error opening converter for \"%s\" - %s\n", name, u_errorName(status));
-            continue;
-        }
-        perEnc(converter, name);
-        ucnv_close(converter);
-    }
-}
-
-/*
- * Read a complete block of up to INTERMEDIATE_CAPACITY bytes from the input
- * file into intermediate[]. Keeps reading after a short read until the
- * buffer is full or the stream has ended.
- *
- * Returns the number of bytes now in intermediate[] (0 at end of file),
- * or -1 on a read error.
- */
-static int32_t
-readBlock(FILE *in) {
-    size_t count;
-    int32_t blockLength;
-
-    blockLength=0;
-    while(blockLength<INTERMEDIATE_CAPACITY && !feof(in)) {
-        /* append behind the bytes already read so a short read does not overwrite them */
-        count=fread(intermediate+blockLength, 1, (size_t)(INTERMEDIATE_CAPACITY-blockLength), in);
-        if(ferror(in)) {
-            return -1;
-        }
-        if(count==0) {
-            /* nothing was delivered: stop rather than spin, even if feof() is not set */
-            break;
-        }
-        blockLength+=(int32_t)count;
-    }
-
-    return blockLength;
-}
-
-/*
- * Read the whole stream block by block and convert it to UTF-16 in input[].
- *
- * in       stream to read, already open
- * encName  converter name for the stream contents; if NULL, the first block
- *          is checked for a Unicode signature, and UTF-8 is used if none is found
- *
- * Sets inputLength and countInputCodePoints.
- * Returns TRUE on success; FALSE on a read error, if the converter cannot be
- * opened, on a conversion error, or if the input is empty.
- */
-static UBool
-readInput(FILE *in, const char *encName) {
-    UConverter *cnv;
-    UChar *pOut, *pOutLimit;
-    const char *p, *limit;
-    int32_t length;
-    UErrorCode errorCode;
-
-    pOut=input;
-    pOutLimit=input+INPUT_CAPACITY;
-
-    /*
-     * Give p a valid value now: the conversion loop below does not run at all
-     * when the first block is empty, but the final flush call still reads p.
-     */
-    p=intermediate;
-
-    errorCode=U_ZERO_ERROR;
-
-    /* read the first block and open the converter */
-    length=readBlock(in);
-    if(length<0) {
-        return FALSE;
-    }
-
-    if(encName==NULL) {
-        int32_t signatureLength;
-        encName=ucnv_detectUnicodeSignature(intermediate, length,
-                                            &signatureLength,
-                                            &errorCode);
-        if(U_FAILURE(errorCode) || encName==NULL) {
-            /* default to UTF-8 */
-            printf("no Unicode signature - using UTF-8\n");
-            encName="UTF-8";
-            errorCode=U_ZERO_ERROR;
-        } else {
-            printf("detected signature for %s (removing %d bytes)\n", encName, (int)signatureLength);
-            /* remove signature byte sequence */
-            length-=signatureLength;
-            memmove(intermediate, intermediate+signatureLength, (size_t)length);
-        }
-    }
-
-    cnv=ucnv_open(encName, &errorCode);
-    if(U_FAILURE(errorCode)) {
-        fprintf(stderr, "error: unable to ucnv_open(\"%s\") - %s\n", encName, u_errorName(errorCode));
-        return FALSE;
-    }
-
-    while(length>0) {
-        /* convert the block */
-        p=intermediate;
-        limit=p+length;
-
-        ucnv_toUnicode(cnv,
-                       &pOut, pOutLimit,
-                       &p, limit,
-                       NULL, FALSE,
-                       &errorCode);
-        if(U_FAILURE(errorCode)) {
-            fprintf(stderr, "error converting input to UTF-16: %s\n", u_errorName(errorCode));
-            ucnv_close(cnv);
-            return FALSE;
-        }
-
-        /* read the next block */
-        length=readBlock(in);
-        if(length<0) {
-            ucnv_close(cnv);
-            return FALSE;
-        }
-    }
-
-    /* flush the converter: no new source bytes, flush flag set */
-    ucnv_toUnicode(cnv,
-                   &pOut, pOutLimit,
-                   &p, p,
-                   NULL, TRUE,
-                   &errorCode);
-    ucnv_close(cnv);
-
-    if(U_FAILURE(errorCode)) {
-        fprintf(stderr, "error converting input to UTF-16: %s\n", u_errorName(errorCode));
-        return FALSE;
-    }
-
-    inputLength=(int32_t)(pOut-input);
-    countInputCodePoints=u_countChar32(input, inputLength);
-    if(inputLength<=0) {
-        fprintf(stderr, "warning: input is empty\n");
-        return FALSE;
-    }
-
-    return TRUE;
-}
-
-/*
- * Write the command-line help to stderr.
- * myName is the program name shown at the start of the synopsis line.
- */
-static void
-showUsage(const char *myName) {
-    fputs("Usage:\n", stderr);
-    fprintf(stderr, "%s [-e encoding-name] filename | '-'\n", myName);
-    fputs(" encoding-name must be the name of an encoding supported by ICU\n"
-          " the filename of the input file with text to be used\n"
-          " can be a dash (-) for standard input\n",
-          stderr);
-}
-
-/*
- * Read file using some encoding, convert to 1M UTF-16 input buffer.
- * For each UTF to be tested:
- * n times:
- * convert from UTF-16 input buffer to UTF, 4kB buffer
- * convert from 4kB buffer to 1M UTF-16 output buffer
- * adjust n so that time elapsed is about TARGET_MEASURE_TIME_MS (#define, 2s)
- * ->divide the target time by the measured time, increase n by that factor, run 2nd time
- * n times:
- * empty function
- * subtract out loop/function overhead
- * display #code points - #UTF bytes - time per roundtrip
- *
- * * do the same again with an intermediate buffer size of 20 instead of 4kB
- *
- * Test the UTFs listed in utfNames[]:
- * UTF-8, SCSU, BOCU-1 (CESU-8, UTF-16BE, UTF-16LE and GB18030 are commented out there)
- *
- * Command-line arguments:
- * - encoding (default UTF-8, detect BOM)
- * - filename (allow "-")
- */
-extern int
-main(int argc, const char *argv[]) {
- FILE *in;
- const char *myName, *encName, *filename, *basename;
-
- myName=argv[0];
- if(argc<2) {
- showUsage(myName);
- return 1;
- }
-
- /* get encoding name argument */
- if(argv[1][0]=='-' && argv[1][1]=='e') {
- encName=argv[1]+2;
- --argc;
- ++argv;
- if(*encName==0) {
- if(argc<2) {
- showUsage(myName);
- return 1;
- }
- encName=argv[1];
- --argc;
- ++argv;
- }
- } else {
- encName=NULL;
- }
-
- /* get filename argument */
- if(argc<2) {
- showUsage(myName);
- return 1;
- }
- filename=argv[1];
- if(filename[0]=='-' && filename[1]==0) {
- filename="(standard input)";
- in=stdin;
- /* set stdin to binary mode */
- _setmode(_fileno(stdin), _O_BINARY);
- } else {
- in=fopen(filename, "rb");
- if(in==NULL) {
- fprintf(stderr, "error opening \"%s\"\n", filename);
- showUsage(myName);
- return 2;
- }
- }
-
- /* read input */
- basename=strrchr(filename, U_FILE_SEP_CHAR);
- if(basename!=NULL) {
- ++basename;
- } else {
- basename=filename;
- }
- printf("# testing converter performance with file \"%s\"\n", basename);
- if(!readInput(in, encName)) {
- fprintf(stderr, "error reading \"%s\" (encoding %s)\n", filename, encName);
- showUsage(myName);
- return 2;
- }
- if(in!=stdin) {
- fclose(in);
- }
-
- /* test performance */
- testPerformance();
- return 0;
-}