ICU-6.2.16.tar.gz

[apple/icu.git] / icuSources / test / utfperf / utfperf.c
diff --git a/icuSources/test/utfperf/utfperf.c b/icuSources/test/utfperf/utfperf.c

new file mode 100644 (file)

index 0000000..dfe7e3f
--- /dev/null
+++ b/icuSources/test/utfperf/utfperf.c
@@ -0,0 +1,450 @@
+/*  
+**********************************************************************
+*   Copyright (C) 2002, International Business Machines
+*   Corporation and others.  All Rights Reserved.
+**********************************************************************
+*   file name:  utfperf.c
+*   encoding:   US-ASCII
+*   tab size:   8 (not used)
+*   indentation:4
+*
+*   created on: 2002apr17
+*   created by: Markus W. Scherer
+*
+*   Performance test program for Unicode converters
+*   (converters that support all Unicode code points).
+*   Takes a UTF-8 file as input.
+*/
+
+#include <stdio.h>
+#include <string.h>
+
+/*
+ * Platform-specific headers and millisecond timer.
+ * <io.h> is a Windows-only header, and <fcntl.h> is needed here only for the
+ * Windows-specific _O_BINARY, so both are included only on Windows.
+ */
+#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
+#   include <fcntl.h>   /* for _O_BINARY */
+#   include <io.h>      /* for _setmode() */
+#   include <windows.h> /* for timeGetTime() */
+#else
+#   include <sys/time.h>
+    /*
+     * Replacement for the Windows function of the same name:
+     * returns the current time of day in milliseconds.
+     * The arithmetic is done in unsigned long so that a large tv_sec wraps
+     * around instead of overflowing a signed type; callers only use the
+     * difference between two return values.
+     */
+    static unsigned long
+    timeGetTime(void) {
+        struct timeval t;
+
+        gettimeofday(&t, NULL);
+        return (unsigned long)t.tv_sec*1000UL+(unsigned long)t.tv_usec/1000UL;
+    }
+#endif
+
+#include "unicode/utypes.h"
+#include "unicode/ucnv.h"
+#include "unicode/ustring.h"
+
+/* definitions and text buffers */
+
+/* number of elements in the input[] array (and, via OUTPUT_CAPACITY, in output[]) */
+#define INPUT_CAPACITY (1024*1024)
+/* size of intermediate[]; also the larger of the two intermediate capacities that are measured */
+#define INTERMEDIATE_CAPACITY 4096
+/* the smaller intermediate capacity that is measured (see perEnc()) */
+#define INTERMEDIATE_SMALL_CAPACITY 20
+#define OUTPUT_CAPACITY INPUT_CAPACITY
+
+/* perEncAndCapacity() scales its loop count so that a measurement takes about this long */
+#define TARGET_MEASURE_TIME_MS 2000
+
+/*
+ * a as a percentage of b, rounded to the nearest integer (half rounds up).
+ * Works for non-negative integer and floating-point arguments; b must not be 0.
+ * Adding b before dividing by 2*b is what adds the 0.5 needed for rounding.
+ */
+#define PERCENT(a, b) (int)(((a)*200+(b))/(2*(b)))
+
+/* number of elements of a real array (not of a pointer) */
+#define ARRAY_LENGTH(a) (sizeof(a)/sizeof((a)[0]))
+
+/* input[] is filled by readInput(); output[] receives the result of each roundtrip() */
+static UChar input[INPUT_CAPACITY], output[OUTPUT_CAPACITY];
+/* byte buffer shared by readBlock() (raw file data) and roundtrip() (encoded text) */
+static char intermediate[INTERMEDIATE_CAPACITY];
+
+/*
+ * inputLength and countInputCodePoints are set by readInput();
+ * encodedLength and outputLength are set by each call to roundtrip().
+ */
+static int32_t inputLength, encodedLength, outputLength, countInputCodePoints;
+
+/*
+ * Reference values for the "% of UTF-8" columns; perEncAndCapacity() stores them
+ * after measuring "UTF-8" with INTERMEDIATE_CAPACITY. utf8Length==0 means "not set yet".
+ */
+static int32_t utf8Length=0;
+static double utf8Time=0.;
+
+/*
+ * Names of the converters that testPerformance() opens and measures, in this order.
+ * Further names are present but commented out.
+ */
+static const char *const
+utfNames[]={
+    "UTF-8", /* UTF-8 should always be first to serve as percentage reference */
+    "SCSU", "BOCU-1" /*, "CESU-8" *//*, "UTF-16BE", "UTF-16LE"*//*, "GB18030"*/
+};
+
+/* functions */
+
+/*
+ * Signature of the functions that measureRoundtrips() calls repeatedly:
+ * roundtrip() for the real work and noop() for measuring the loop overhead.
+ */
+typedef void
+RoundtripFn(UConverter *cnv, int32_t intermediateCapacity, UErrorCode *pErrorCode);
+
+/*
+ * One complete roundtrip: encode input[] with cnv, chunk by chunk, into the first
+ * intermediateCapacity bytes of intermediate[], and decode every chunk right away
+ * with the same converter into output[].
+ * Sets encodedLength to the total number of encoded bytes and outputLength to the
+ * number of UChars written to output[]. On a converter error the function returns
+ * early with *pErrorCode set; a length mismatch between input and output is
+ * reported as U_INTERNAL_PROGRAM_ERROR.
+ */
+static void
+roundtrip(UConverter *cnv, int32_t intermediateCapacity, UErrorCode *pErrorCode) {
+    const UChar *src=input;
+    const UChar *const srcLimit=input+inputLength;
+    UChar *dst=output;
+    UChar *const dstLimit=output+OUTPUT_CAPACITY;
+    char *const chunkLimit=intermediate+intermediateCapacity;
+    char *chunkEnd;
+    const char *chunkPos;
+    UBool flush=FALSE;
+
+    ucnv_reset(cnv);
+
+    encodedLength=0;
+    outputLength=0;
+
+    /* keep going until all input is consumed and a flushing pass has completed */
+    while(!(flush && src>=srcLimit)) {
+        /* encode the next piece of [src..srcLimit[ into intermediate[] */
+        chunkEnd=intermediate;
+        flush=(UBool)(src==srcLimit);
+        ucnv_fromUnicode(cnv,
+                         &chunkEnd, chunkLimit,
+                         &src, srcLimit,
+                         NULL, flush,
+                         pErrorCode);
+        encodedLength+=(int32_t)(chunkEnd-intermediate);
+
+        if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
+            /* the chunk is full: not an error here, but a requested flush is not done yet */
+            flush=FALSE;
+            *pErrorCode=U_ZERO_ERROR;
+        } else if(U_FAILURE(*pErrorCode)) {
+            return;
+        }
+
+        /* decode the chunk [intermediate..chunkEnd[ back to UTF-16 */
+        chunkPos=intermediate;
+        ucnv_toUnicode(cnv,
+                       &dst, dstLimit,
+                       &chunkPos, chunkEnd,
+                       NULL, flush,
+                       pErrorCode);
+        if(U_FAILURE(*pErrorCode)) {
+            return;
+        }
+        /* the converter semantics guarantee that the whole chunk was consumed here */
+    }
+
+    outputLength=(int32_t)(dst-output);
+    if(outputLength!=inputLength) {
+        fprintf(stderr, "error: roundtrip failed, inputLength %d!=outputLength %d\n", inputLength, outputLength);
+        *pErrorCode=U_INTERNAL_PROGRAM_ERROR;
+    }
+}
+
+/*
+ * Empty RoundtripFn; measureRoundtrips() runs it to time the bare loop and
+ * function call overhead.
+ */
+static void
+noop(UConverter *cnv, int32_t intermediateCapacity, UErrorCode *pErrorCode) {
+    /* intentionally empty; the casts only mark the parameters as unused */
+    (void)cnv;
+    (void)intermediateCapacity;
+    (void)pErrorCode;
+}
+
+/*
+ * Call fn n times (at least once) and return the elapsed time in milliseconds.
+ * The loop stops early when fn reports an error.
+ * Returns 0x7fffffff, after printing a message to stderr, if fn failed or if
+ * output[] does not match input[] afterwards.
+ */
+static unsigned long
+measureRoundtrips(RoundtripFn *fn, UConverter *cnv, const char *encName, int32_t intermediateCapacity, int32_t n) {
+    UErrorCode status=U_ZERO_ERROR;
+    unsigned long start, elapsed;
+
+    start=timeGetTime();
+    for(;;) {
+        fn(cnv, intermediateCapacity, &status);
+        if(U_FAILURE(status) || --n<=0) {
+            break;
+        }
+    }
+    elapsed=timeGetTime()-start;
+
+    if(U_FAILURE(status)) {
+        fprintf(stderr, "error in roundtrip conversion (%s): %s\n", encName, u_errorName(status));
+        return 0x7fffffff;
+    }
+
+    if(u_memcmp(input, output, inputLength)!=0) {
+        fprintf(stderr, "error: roundtrip failed, input[]!=output[]\n");
+        return 0x7fffffff;
+    }
+
+    return elapsed;
+}
+
+/*
+ * Measure and report the roundtrip performance of one converter with one
+ * intermediate buffer capacity.
+ *
+ * cnv                  - open converter to be measured
+ * encName              - its name, used in the report and in error messages
+ * intermediateCapacity - number of bytes of intermediate[] that roundtrip() may use
+ *
+ * The loop count is first increased by factors of 10 until one measurement takes
+ * long enough to be meaningful, then scaled so that the real measurement takes
+ * about TARGET_MEASURE_TIME_MS. The time for the same number of noop() calls is
+ * subtracted. If a measurement fails (measureRoundtrips() returns its error value
+ * 0x7fffffff and has already printed a message), no report is printed.
+ * After measuring "UTF-8" with INTERMEDIATE_CAPACITY, the results are stored in
+ * utf8Length/utf8Time as the reference for the percentages of later reports.
+ */
+static void
+perEncAndCapacity(UConverter *cnv, const char *encName, int32_t intermediateCapacity) {
+    double rtTime;
+    unsigned long _time, noopTime;
+    int32_t n;
+
+    /*printf("test performance for %s with intermediate capacity %d\n", encName, intermediateCapacity);*/
+
+    /* warm up caches and estimate loop time */
+    n=10;
+    for(;;) {
+        _time=measureRoundtrips(roundtrip, cnv, encName, intermediateCapacity, n);
+        if(_time==0x7fffffff) {
+            /* conversion error, already reported: there is nothing meaningful to measure */
+            return;
+        }
+        if(_time<500 && _time<TARGET_MEASURE_TIME_MS/10) {
+            n*=10;
+        } else {
+            break;
+        }
+    }
+
+    if(_time<TARGET_MEASURE_TIME_MS) {
+        /*
+         * Scale n up to the target time. The product is computed in double because
+         * n*TARGET_MEASURE_TIME_MS does not fit into 32 bits for large n (small inputs).
+         * _time is not 0 here because the loop above only ends with
+         * _time>=TARGET_MEASURE_TIME_MS/10.
+         */
+        n=(int32_t)(((double)n*TARGET_MEASURE_TIME_MS)/(double)_time)+1;
+    }
+
+    /* run actual measurement with a target test time of TARGET_MEASURE_TIME_MS */
+    _time=measureRoundtrips(roundtrip, cnv, encName, intermediateCapacity, n);
+    if(_time==0x7fffffff) {
+        return;
+    }
+
+    /* subtract same number of loops over no-operation function, without wrapping below 0 */
+    noopTime=measureRoundtrips(noop, cnv, encName, intermediateCapacity, n);
+    _time= noopTime<_time ? _time-noopTime : 0;
+
+    /* milliseconds for n roundtrips -> microseconds for one roundtrip */
+    rtTime=((double)_time*1000.)/(double)n;
+
+    /* report */
+    printf("* performance report for                %8s:\n", encName);
+    printf("  intermediate buffer capacity          %8d B\n", intermediateCapacity);
+    if(intermediateCapacity==INTERMEDIATE_CAPACITY && utf8Length!=0 && utf8Time>0.) {
+        printf("  number of encoding bytes              %8d B  (%3d%% of UTF-8)\n", encodedLength, PERCENT(encodedLength, utf8Length));
+        printf("  roundtrip conversion time             %8g us (%3d%% of UTF-8)\n", rtTime, PERCENT(rtTime, utf8Time));
+    } else {
+        printf("  number of encoding bytes              %8d B\n", encodedLength);
+        printf("  roundtrip conversion time             %8g us\n", rtTime);
+    }
+    printf("  average bytes/code point              %8g B/cp\n", (double)encodedLength/countInputCodePoints);
+    puts("");
+
+    /* set UTF-8 values */
+    if(intermediateCapacity==INTERMEDIATE_CAPACITY && 0==strcmp(encName, "UTF-8")) {
+        utf8Length=encodedLength;
+        utf8Time=rtTime;
+    }
+}
+
+/*
+ * Measure one converter twice: first with the full intermediate buffer,
+ * then with the small intermediate capacity.
+ */
+static void
+perEnc(UConverter *cnv, const char *encName) {
+    static const int32_t capacities[]={ INTERMEDIATE_CAPACITY, INTERMEDIATE_SMALL_CAPACITY };
+    int32_t which;
+
+    /*printf("test performance for %s\n", encName);*/
+    for(which=0; which<2; ++which) {
+        perEncAndCapacity(cnv, encName, capacities[which]);
+    }
+}
+
+/*
+ * Print the general information about the input and the platform, then
+ * open, measure and close each converter listed in utfNames[].
+ * A converter that cannot be opened is reported on stderr and skipped.
+ */
+static void
+testPerformance() {
+    int32_t idx;
+
+    printf("number of code points                   %8d cp\n", countInputCodePoints);
+    printf("platform endianness:                    %8s-endian\n", U_IS_BIG_ENDIAN ? "big" : "little");
+    puts("");
+    for(idx=0; idx<ARRAY_LENGTH(utfNames); ++idx) {
+        const char *name=utfNames[idx];
+        UErrorCode status=U_ZERO_ERROR;
+        UConverter *converter=ucnv_open(name, &status);
+
+        if(U_FAILURE(status)) {
+            fprintf(stderr, "error opening converter for \"%s\" - %s\n", name, u_errorName(status));
+            continue;
+        }
+        perEnc(converter, name);
+        ucnv_close(converter);
+    }
+}
+
+/*
+ * Read a complete block from the input file into intermediate[].
+ * Keeps reading until the buffer is full or the end of the file is reached,
+ * because a single fread() may deliver less than requested (pipes, terminals).
+ * Each read appends after the bytes that are already in the buffer.
+ *
+ * Returns the number of bytes now in intermediate[] (0 at the end of the file),
+ * or -1 if a read error occurred.
+ */
+static int32_t
+readBlock(FILE *in) {
+    size_t count;
+    int32_t blockLength;
+
+    blockLength=0;
+    while(blockLength<INTERMEDIATE_CAPACITY && !feof(in)) {
+        count=fread(intermediate+blockLength, 1, (size_t)(INTERMEDIATE_CAPACITY-blockLength), in);
+        if(ferror(in)) {
+            return -1;
+        }
+        blockLength+=(int32_t)count;
+    }
+
+    return blockLength;
+}
+
+/*
+ * Read the whole input file and convert it to UTF-16 in input[].
+ *
+ * in      - input stream, read block by block with readBlock()
+ * encName - name of the encoding of the input; if NULL, a Unicode signature
+ *           at the start of the input selects the encoding (and is removed),
+ *           and UTF-8 is used if there is none
+ *
+ * On success sets inputLength and countInputCodePoints and returns TRUE.
+ * Returns FALSE, after printing a message to stderr where applicable, on a
+ * read error, if the converter cannot be opened, on a conversion error, or
+ * if the input is empty.
+ */
+static UBool
+readInput(FILE *in, const char *encName) {
+    UConverter *cnv;
+    UChar *pOut, *pOutLimit;
+    const char *p, *limit;
+    int32_t length;
+    UErrorCode errorCode;
+
+    pOut=input;
+    pOutLimit=input+INPUT_CAPACITY;
+
+    /*
+     * p is also passed to the final flushing call below; give it a valid value
+     * for the case that no block is converted at all (empty input).
+     */
+    p=intermediate;
+
+    errorCode=U_ZERO_ERROR;
+
+    /* read the first block and open the converter */
+    length=readBlock(in);
+    if(length<0) {
+        return FALSE;
+    }
+
+    if(encName==NULL) {
+        int32_t signatureLength;
+        encName=ucnv_detectUnicodeSignature(intermediate, length,
+                                            &signatureLength,
+                                            &errorCode);
+        if(U_FAILURE(errorCode) || encName==NULL) {
+            /* default to UTF-8 */
+            printf("no Unicode signature - using UTF-8\n");
+            encName="UTF-8";
+            errorCode=U_ZERO_ERROR;
+        } else {
+            printf("detected signature for %s (removing %d bytes)\n", encName, signatureLength);
+            /* remove signature byte sequence */
+            length-=signatureLength;
+            memmove(intermediate, intermediate+signatureLength, length);
+        }
+    }
+
+    cnv=ucnv_open(encName, &errorCode);
+    if(U_FAILURE(errorCode)) {
+        fprintf(stderr, "error: unable to ucnv_open(\"%s\") - %s\n", encName, u_errorName(errorCode));
+        return FALSE;
+    }
+
+    while(length>0) {
+        /* convert the block */
+        p=intermediate;
+        limit=p+length;
+
+        ucnv_toUnicode(cnv,
+                       &pOut, pOutLimit,
+                       &p, limit,
+                       NULL, FALSE,
+                       &errorCode);
+        if(U_FAILURE(errorCode)) {
+            fprintf(stderr, "error converting input to UTF-16: %s\n", u_errorName(errorCode));
+            ucnv_close(cnv);
+            return FALSE;
+        }
+
+        /* read the next block */
+        length=readBlock(in);
+        if(length<0) {
+            ucnv_close(cnv);
+            return FALSE;
+        }
+    }
+
+    /* flush the converter: no more source bytes, only pending state */
+    ucnv_toUnicode(cnv,
+                   &pOut, pOutLimit,
+                   &p, p,
+                   NULL, TRUE,
+                   &errorCode);
+    ucnv_close(cnv);
+
+    if(U_FAILURE(errorCode)) {
+        fprintf(stderr, "error converting input to UTF-16: %s\n", u_errorName(errorCode));
+        return FALSE;
+    }
+
+    inputLength=(int32_t)(pOut-input);
+    countInputCodePoints=u_countChar32(input, inputLength);
+    if(inputLength<=0) {
+        fprintf(stderr, "warning: input is empty\n");
+        return FALSE;
+    }
+
+    return TRUE;
+}
+
+/* Print the command-line syntax to stderr; myName is the program name to show. */
+static void
+showUsage(const char *myName) {
+    /* the synopsis line is the only part that needs formatting */
+    fprintf(stderr, "Usage:\n%s [-e encoding-name] filename | '-'\n", myName);
+    fputs("    encoding-name must be the name of an encoding supported by ICU\n"
+          "    the filename of the input file with text to be used\n"
+          "      can be a dash (-) for standard input\n",
+          stderr);
+}
+
+/*
+ * Read file using some encoding, convert to 1M UTF-16 input buffer.
+ * For each UTF to be tested:
+ *   n times:
+ *     convert from UTF-16 input buffer to UTF, 4kB buffer
+ *     convert from 4kB buffer to 1M UTF-16 output buffer
+ *   adjust n so that time elapsed is about TARGET_MEASURE_TIME_MS (#define)
+ *     ->divide the target time by the measured time, increase n by that factor, run 2nd time
+ *   n times:
+ *     empty function
+ *   subtract out loop/function overhead
+ *   display #code points - #UTF bytes - time per roundtrip
+ *
+ *   * do the same again with an intermediate buffer size of 20 instead of 4kB
+ *
+ * Test following UTFs (see utfNames[]):
+ * UTF-8, SCSU, BOCU-1
+ * (CESU-8, UTF-16BE, UTF-16LE and GB18030 are listed there but commented out)
+ *
+ * Command-line arguments:
+ * - encoding (default: detect a Unicode signature/BOM, otherwise UTF-8)
+ * - filename (allow "-" for standard input)
+ */
+/*
+ * Program entry point.
+ * Parses "[-e encoding-name] filename | '-'", reads the input text with
+ * readInput() and runs testPerformance().
+ * Returns 0 on success, 1 for a command-line error, and 2 if the input
+ * cannot be opened or read.
+ */
+extern int
+main(int argc, const char *argv[]) {
+    FILE *in;
+    const char *myName, *encName, *filename, *basename;
+
+    myName=argv[0];
+    if(argc<2) {
+        showUsage(myName);
+        return 1;
+    }
+
+    /* get encoding name argument: either "-eNAME" or "-e NAME" */
+    if(argv[1][0]=='-' && argv[1][1]=='e') {
+        encName=argv[1]+2;
+        --argc;
+        ++argv;
+        if(*encName==0) {
+            if(argc<2) {
+                showUsage(myName);
+                return 1;
+            }
+            encName=argv[1];
+            --argc;
+            ++argv;
+        }
+    } else {
+        /* NULL lets readInput() detect the encoding */
+        encName=NULL;
+    }
+
+    /* get filename argument */
+    if(argc<2) {
+        showUsage(myName);
+        return 1;
+    }
+    filename=argv[1];
+    if(filename[0]=='-' && filename[1]==0) {
+        filename="(standard input)";
+        in=stdin;
+#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
+        /* set stdin to binary mode; _setmode() and _O_BINARY exist only on Windows */
+        _setmode(_fileno(stdin), _O_BINARY);
+#endif
+    } else {
+        in=fopen(filename, "rb");
+        if(in==NULL) {
+            fprintf(stderr, "error opening \"%s\"\n", filename);
+            showUsage(myName);
+            return 2;
+        }
+    }
+
+    /* read input */
+    basename=strrchr(filename, U_FILE_SEP_CHAR);
+    if(basename!=NULL) {
+        ++basename;
+    } else {
+        basename=filename;
+    }
+    printf("# testing converter performance with file \"%s\"\n", basename);
+    if(!readInput(in, encName)) {
+        /* encName is NULL when no -e option was given; never pass NULL for %s */
+        fprintf(stderr, "error reading \"%s\" (encoding %s)\n",
+                filename, encName!=NULL ? encName : "(auto-detected)");
+        showUsage(myName);
+        if(in!=stdin) {
+            fclose(in);
+        }
+        return 2;
+    }
+    if(in!=stdin) {
+        fclose(in);
+    }
+
+    /* test performance */
+    testPerformance();
+    return 0;
+}