-/********************************************************************
- * COPYRIGHT:
- * Copyright (C) 2001 IBM, Inc. All Rights Reserved.
- *
- ********************************************************************/
-/********************************************************************************
-*
-* File CALLCOLL.C
-*
-* Modification History:
-* Name Description
-* Andy Heninger First Version
-*
-*********************************************************************************
-*/
-
-//
-// This program tests string collation and sort key generation performance.
-// Three APIs can be tested: ICU C, Unix strcoll/strxfrm, and Windows LCMapString.
-// A file of names is required as input, one per line. It must be in utf-8 or utf-16 format,
-// and include a byte order mark. Either LE or BE format is OK.
-//
-
-// Help text for the command line. One line per option, spelled the same
-// way as the entries of the opts[] table.
-const char gUsageString[] =
- "usage: collperf options...\n"
- "-help Display this message.\n"
- "-file file_name utf-8 or utf-16 format file of names (byte order mark required).\n"
- "-locale name ICU locale to use. Default is en_US\n"
- "-rules file_name Collation rules file (overrides locale)\n"
- "-langid 0x1234 Windows Language ID number. Default to value for -locale option\n"
- " see http://msdn.microsoft.com/library/psdk/winbase/nls_8xo3.htm\n"
- "-win Run test using Windows native services. (ICU is default)\n"
- "-unix Run test using Unix strxfrm, strcoll services.\n"
- "-uselen Use API with string lengths. Default is null-terminated strings\n"
- "-usekeys Run tests using sortkeys rather than strcoll\n"
- "-strcmp Run tests using u_strcmp rather than strcoll\n"
- "-strcmpCPO Run tests using u_strcmpCodePointOrder rather than strcoll\n"
- "-loop nnnn Loopcount for test. Adjust for reasonable total running time.\n"
- "-iloop n Inner Loop Count. Default = 1. Number of calls to function\n"
- " under test at each call point. For measuring test overhead.\n"
- "-terse Terse numbers-only output. Intended for use by scripts.\n"
- "-french French accent ordering\n"
- "-frenchoff No French accent ordering (for use with French locales.)\n"
- "-norm Normalizing mode on\n"
- "-shifted Shifted mode\n"
- "-lower Lower case first\n"
- "-upper Upper case first\n"
- "-case Enable separate case level\n"
- "-level n Sort level, 1 to 5, for Primary, Secondary, Tertiary, Quaternary, Identical\n"
- "-keyhist Produce a table sort key size vs. string length\n"
- "-binsearch Binary Search timing test\n"
- "-keygen Sort Key Generation timing test\n"
- "-qsort Quicksort timing test\n"
- "-iter Iteration Performance Test\n"
- "-dump Display strings, sort keys and CEs.\n"
- ;
-
-
-
-#include <stdio.h>
-#include <string.h>
-#include <stdlib.h>
-#include <math.h>
-#include <locale.h>
-#include <errno.h>
-
-#include <unicode/utypes.h>
-#include <unicode/ucol.h>
-#include <unicode/ucoleitr.h>
-#include <unicode/uloc.h>
-#include <unicode/ustring.h>
-#include <unicode/ures.h>
-#include <unicode/uchar.h>
-#include <unicode/ucnv.h>
-#include <unicode/utf8.h>
-
-#ifdef WIN32
-#include <windows.h>
-#else
-//
-// Stubs for Windows API functions when building on UNIXes.
-// CompareStringW and LCMapStringW are placeholders that always return 0;
-// timeGetTime is a real millisecond clock built on gettimeofday().
-//
-typedef int DWORD;
-inline int CompareStringW(DWORD, DWORD, UChar *, int, UChar *, int) {return 0;}
-#include <sys/time.h>
-unsigned long timeGetTime() {
-    struct timeval t;
-    gettimeofday(&t, 0);
-    // The millisecond count is allowed to wrap; callers only use differences.
-    // Do the arithmetic in unsigned long so that the wrap is well defined
-    // (overflow of the signed tv_sec type would be undefined behavior).
-    unsigned long val = (unsigned long)t.tv_sec * 1000UL;
-    val += (unsigned long)t.tv_usec / 1000UL;
-    return val;
-}
-inline int LCMapStringW(DWORD, DWORD, UChar *, int, UChar *, int) {return 0;}
-const int LCMAP_SORTKEY = 0;
-#define MAKELCID(a,b) 0
-const int SORT_DEFAULT = 0;
-#endif
-
-
-
-//
-// Command line option variables
-// These global variables are set according to the options specified
-// on the command line by the user.
-char * opt_fName = 0;   // -file: name of the input file of names
-char * opt_locale = "en_US";   // -locale: ICU locale name
-int opt_langid = 0; // -langid: Windows language ID. Defaults to value corresponding to opt_locale.
-char * opt_rules = 0;   // -rules: name of a collation rules file
-UBool opt_help = FALSE;   // -help or -?
-int opt_loopCount = 1;   // -loop: outer loop count for the timing tests
-int opt_iLoopCount = 1;   // -iloop: calls to the function under test at each call point
-UBool opt_terse = FALSE;   // -terse: numbers-only output
-UBool opt_qsort = FALSE;   // -qsort: run the quicksort timing test
-UBool opt_binsearch = FALSE;   // -binsearch: run the binary search timing test
-UBool opt_icu = TRUE;   // has no entry in the opts[] table; TRUE unless changed elsewhere
-UBool opt_win = FALSE; // Run with Windows native functions.
-UBool opt_unix = FALSE; // Run with UNIX strcoll, strxfrm functions.
-UBool opt_uselen = FALSE;   // -uselen: pass explicit string lengths instead of -1
-UBool opt_usekeys = FALSE;   // -usekeys: compare sort keys instead of strings
-UBool opt_strcmp = FALSE;   // -strcmp: use u_strcmp for the comparison
-UBool opt_strcmpCPO = FALSE;   // -strcmpCPO: use u_strcmpCodePointOrder for the comparison
-UBool opt_norm = FALSE;   // -norm
-UBool opt_keygen = FALSE;   // -keygen: run the sort key generation timing test
-UBool opt_french = FALSE;   // -french
-UBool opt_frenchoff = FALSE;   // -frenchoff
-UBool opt_shifted = FALSE;   // -shifted
-UBool opt_lower = FALSE;   // -lower
-UBool opt_upper = FALSE;   // -upper
-UBool opt_case = FALSE;   // -case
-int opt_level = 0;   // -level: sort level, 1 to 5 per the usage text; 0 when not given
-UBool opt_keyhist = FALSE;   // -keyhist: print the key size vs. string length table
-UBool opt_itertest = FALSE;   // -iter: run the iteration performance test
-UBool opt_dump = FALSE;   // -dump
-
-
-
-//
-// Definitions for the command line options
-//
-struct OptSpec {   // One entry of the command line option table.
- const char *name;   // option text exactly as typed, e.g. "-file"; 0 marks the end of the table
- enum {FLAG, NUM, STRING} type;   // FLAG takes no value; NUM and STRING consume the next argument
- void *pVar;   // address of the variable ProcessOptions() stores into (UBool, int or char * by type)
-};
-
-OptSpec opts[] = {   // Maps each command line switch to the opt_ variable it sets.
- {"-file", OptSpec::STRING, &opt_fName},
- {"-locale", OptSpec::STRING, &opt_locale},
- {"-langid", OptSpec::NUM, &opt_langid},
- {"-rules", OptSpec::STRING, &opt_rules},
- {"-qsort", OptSpec::FLAG, &opt_qsort},
- {"-binsearch", OptSpec::FLAG, &opt_binsearch},
- {"-iter", OptSpec::FLAG, &opt_itertest},
- {"-win", OptSpec::FLAG, &opt_win},
- {"-unix", OptSpec::FLAG, &opt_unix},
- {"-uselen", OptSpec::FLAG, &opt_uselen},
- {"-usekeys", OptSpec::FLAG, &opt_usekeys},
- {"-strcmp", OptSpec::FLAG, &opt_strcmp},
- {"-strcmpCPO", OptSpec::FLAG, &opt_strcmpCPO},
- {"-norm", OptSpec::FLAG, &opt_norm},
- {"-french", OptSpec::FLAG, &opt_french},
- {"-frenchoff", OptSpec::FLAG, &opt_frenchoff},
- {"-shifted", OptSpec::FLAG, &opt_shifted},
- {"-lower", OptSpec::FLAG, &opt_lower},
- {"-upper", OptSpec::FLAG, &opt_upper},
- {"-case", OptSpec::FLAG, &opt_case},
- {"-level", OptSpec::NUM, &opt_level},
- {"-keyhist", OptSpec::FLAG, &opt_keyhist},
- {"-keygen", OptSpec::FLAG, &opt_keygen},
- {"-loop", OptSpec::NUM, &opt_loopCount},
- {"-iloop", OptSpec::NUM, &opt_iLoopCount},
- {"-terse", OptSpec::FLAG, &opt_terse},
- {"-dump", OptSpec::FLAG, &opt_dump},
- {"-help", OptSpec::FLAG, &opt_help},
- {"-?", OptSpec::FLAG, &opt_help},   // alias for -help
- {0, OptSpec::FLAG, 0}   // terminator: ProcessOptions() stops searching at name == 0
-};
-
-
-//---------------------------------------------------------------------------
-//
-// Global variables pointing to and describing the test file
-//
-//---------------------------------------------------------------------------
-
-//
-// struct Line
-//
-// Each line from the source file (containing a name, presumably) gets
-// one of these structs.
-//
-struct Line {
- UChar *name;   // the line's text; passed with length -1 (null terminated) when -uselen is off
- int len;   // length of name in UChars, passed to the APIs when -uselen is on
- char *winSortKey;   // buffer that LCMapStringW writes the Windows sort key into
- char *icuSortKey;   // buffer that ucol_getSortKey writes the ICU sort key into
- char *unixSortKey;   // buffer that strxfrm writes the transformed string into
- char *unixName;   // name converted to UTF-8 by UnixConvert(), for strcoll/strxfrm
-};
-
-
-
-Line *gFileLines; // Ptr to array of Line structs, one per line in the file.
-int gNumFileLines;   // number of entries in gFileLines; also the loop bound for gSortedLines/gRandomLines
-UCollator *gCol;   // collator handed to every ucol_ call in the tests
-DWORD gWinLCID;   // locale id handed to CompareStringW and LCMapStringW
-
-Line **gSortedLines;   // array of Line pointers searched by doBinarySearch()
-Line **gRandomLines;   // array of Line pointers copied as the unsorted input of each doQSort() pass
-int gCount;   // running count of compares (or collation elements in the iterator tests)
-
-
-
-//---------------------------------------------------------------------------
-//
-// ProcessOptions() Function to read the command line options.
-//
-//---------------------------------------------------------------------------
-//
-// Match argv[1..argc-1] against the option table and store each value
-// through the matching entry's pVar:
-//    FLAG    stores TRUE into a UBool.
-//    STRING  stores a pointer to the following argument (the text is not copied).
-//    NUM     parses the following argument with strtol, base 0 (so decimal,
-//            0x hex and leading-0 octal are accepted), and stores it as an int.
-// Returns TRUE on success.  Returns FALSE, after writing a message to stderr,
-// for an unrecognized option, a missing value, or a value that is not a
-// complete integer that fits in an int.
-//
-UBool ProcessOptions(int argc, const char **argv, OptSpec opts[])
-{
-    int         argNum;
-    const char *pArgName;
-    OptSpec    *pOpt;
-
-    for (argNum=1; argNum<argc; argNum++) {
-        pArgName = argv[argNum];
-        for (pOpt = opts; pOpt->name != 0; pOpt++) {
-            if (strcmp(pOpt->name, pArgName) != 0) {
-                continue;
-            }
-            switch (pOpt->type) {
-            case OptSpec::FLAG:
-                *(UBool *)(pOpt->pVar) = TRUE;
-                break;
-            case OptSpec::STRING:
-                argNum ++;
-                if (argNum >= argc) {
-                    fprintf(stderr, "value expected for \"%s\" option.\n", pOpt->name);
-                    return FALSE;
-                }
-                *(const char **)(pOpt->pVar) = argv[argNum];
-                break;
-            case OptSpec::NUM:
-                {
-                    argNum ++;
-                    if (argNum >= argc) {
-                        fprintf(stderr, "value expected for \"%s\" option.\n", pOpt->name);
-                        return FALSE;
-                    }
-                    char *endp = 0;
-                    errno = 0;
-                    long val = strtol(argv[argNum], &endp, 0);
-                    int  i   = (int)val;
-                    // Reject an empty number, trailing junk such as "10x", and
-                    // values that strtol or the narrowing to int could not represent.
-                    if (endp == argv[argNum] || *endp != 0 || errno == ERANGE || (long)i != val) {
-                        fprintf(stderr, "integer value expected for \"%s\" option.\n", pOpt->name);
-                        return FALSE;
-                    }
-                    *(int *)(pOpt->pVar) = i;
-                }
-                break;
-            }
-            break;      // option matched and consumed; stop searching the table
-        }
-        if (pOpt->name == 0)
-        {
-            // Ran off the end of the table without a match.
-            fprintf(stderr, "Unrecognized option \"%s\"\n", pArgName);
-            return FALSE;
-        }
-    }
-    return TRUE;
-}
-
-//---------------------------------------------------------------------------------------
-//
-// Comparison functions for use by qsort.
-//
-// Seven flavors: ICU, Windows or UNIX; SortKey or String Compare; Strings with
-// length or null terminated. (Not every combination exists.)
-//
-//---------------------------------------------------------------------------------------
-// qsort comparator: orders two Line pointers by strcmp of their ICU sort keys.
-// a and b point at array elements, each of which is a Line *.
-int ICUstrcmpK(const void *a, const void *b) {
-    Line *lhs = *(Line **)a;
-    Line *rhs = *(Line **)b;
-    gCount++;
-    return strcmp(lhs->icuSortKey, rhs->icuSortKey);
-}
-
-
-// qsort comparator: ucol_strcoll on two Lines, passing explicit string lengths.
-// Maps UCOL_LESS / UCOL_GREATER / anything else to -1 / +1 / 0.
-int ICUstrcmpL(const void *a, const void *b) {
-    Line *lhs = *(Line **)a;
-    Line *rhs = *(Line **)b;
-    gCount++;
-    switch (ucol_strcoll(gCol, lhs->name, lhs->len, rhs->name, rhs->len)) {
-    case UCOL_LESS:
-        return -1;
-    case UCOL_GREATER:
-        return +1;
-    default:
-        return 0;
-    }
-}
-
-
-// qsort comparator: ucol_strcoll on two Lines, with length -1 for both strings
-// (null terminated).  Maps UCOL_LESS / UCOL_GREATER / anything else to -1 / +1 / 0.
-int ICUstrcmp(const void *a, const void *b) {
-    Line *lhs = *(Line **)a;
-    Line *rhs = *(Line **)b;
-    gCount++;
-    switch (ucol_strcoll(gCol, lhs->name, -1, rhs->name, -1)) {
-    case UCOL_LESS:
-        return -1;
-    case UCOL_GREATER:
-        return +1;
-    default:
-        return 0;
-    }
-}
-
-
-// qsort comparator: CompareStringW on two Lines, with length -1 for both strings.
-// The API result minus 2 is returned, so a result of 2 (equal) becomes 0.
-int Winstrcmp(const void *a, const void *b) {
-    Line *lhs = *(Line **)a;
-    Line *rhs = *(Line **)b;
-    gCount++;
-    return CompareStringW(gWinLCID, 0, lhs->name, -1, rhs->name, -1) - 2;
-}
-
-
-// qsort comparator: strcoll on the UTF-8 (unixName) form of two Lines.
-int UNIXstrcmp(const void *a, const void *b) {
-    Line *lhs = *(Line **)a;
-    Line *rhs = *(Line **)b;
-    gCount++;
-    return strcoll(lhs->unixName, rhs->unixName);
-}
-
-
-// qsort comparator: CompareStringW on two Lines, passing explicit string lengths.
-// The API result minus 2 is returned, so a result of 2 (equal) becomes 0.
-int WinstrcmpL(const void *a, const void *b) {
-    Line *lhs = *(Line **)a;
-    Line *rhs = *(Line **)b;
-    gCount++;
-    return CompareStringW(gWinLCID, 0, lhs->name, lhs->len, rhs->name, rhs->len) - 2;
-}
-
-
-// qsort comparator: orders two Line pointers by strcmp of their Windows sort keys.
-int WinstrcmpK(const void *a, const void *b) {
-    Line *lhs = *(Line **)a;
-    Line *rhs = *(Line **)b;
-    gCount++;
-    return strcmp(lhs->winSortKey, rhs->winSortKey);
-}
-
-
-//---------------------------------------------------------------------------------------
-//
-// Function for sorting the names (lines) into a random order.
-// Order is based on a hash of the ICU Sort key for the lines
-// The randomized order is used as input for the sorting timing tests.
-//
-//---------------------------------------------------------------------------------------
-// qsort comparator that orders Lines by a hash of their ICU sort key bytes.
-// Returns -1, 0 or 1 according to how the two hash values compare.
-int ICURandomCmp(const void *a, const void *b) {
-    const char *ask = (*(Line **)a)->icuSortKey;
-    const char *bsk = (*(Line **)b)->icuSortKey;
-
-    // The hash is expected to wrap around.  Wrapping is only well defined for
-    // unsigned types, so accumulate in unsigned int rather than int.
-    unsigned int aHash = 0;
-    unsigned int bHash = 0;
-    while (*ask != 0) {
-        aHash += aHash*37u + (unsigned int)(int)*ask++;
-    }
-    while (*bsk != 0) {
-        bHash += bHash*37u + (unsigned int)(int)*bsk++;
-    }
-
-    // Compare the hashes as signed ints: this yields the same shuffled order
-    // that wrapping signed arithmetic produces on two's-complement machines.
-    int aVal = (int)aHash;
-    int bVal = (int)bHash;
-    if (aVal == bVal) {
-        return 0;
-    }
-    return (aVal > bVal) ? 1 : -1;
-}
-
-//---------------------------------------------------------------------------------------
-//
-// doKeyGen() Key Generation Timing Test
-//
-//---------------------------------------------------------------------------------------
-// Times sort key generation for every line of the file with the selected API
-// (Windows, ICU or UNIX, tested in that order), then prints the time per key
-// and the average sort key length per character.
-void doKeyGen()
-{
-    int line;
-    int loops;
-    int iLoop;
-    int t;
-    int len = -1;       // -1 unless -uselen replaces it with a line's length
-
-    // Scale the loop count by file size; key generation should be order n.
-    double scaledLoops = double(opt_loopCount) * (1000. / double(gNumFileLines));
-    int adj_loopCount = int(scaledLoops);
-    if (adj_loopCount < 1) adj_loopCount = 1;
-
-    unsigned long startTime = timeGetTime();
-
-    if (opt_win) {
-        for (loops = 0; loops < adj_loopCount; loops++) {
-            for (line = 0; line < gNumFileLines; line++) {
-                Line *cur = &gFileLines[line];
-                if (opt_uselen) {
-                    len = cur->len;
-                }
-                iLoop = 0;
-                while (iLoop < opt_iLoopCount) {
-                    t = LCMapStringW(gWinLCID, LCMAP_SORTKEY,
-                                     cur->name, len,
-                                     (unsigned short *)cur->winSortKey, 5000); // TODO something with length.
-                    iLoop++;
-                }
-            }
-        }
-    }
-    else if (opt_icu) {
-        for (loops = 0; loops < adj_loopCount; loops++) {
-            for (line = 0; line < gNumFileLines; line++) {
-                Line *cur = &gFileLines[line];
-                if (opt_uselen) {
-                    len = cur->len;
-                }
-                iLoop = 0;
-                while (iLoop < opt_iLoopCount) {
-                    t = ucol_getSortKey(gCol, cur->name, len, (unsigned char *)cur->icuSortKey, 5000);
-                    iLoop++;
-                }
-            }
-        }
-    }
-    else if (opt_unix) {
-        for (loops = 0; loops < adj_loopCount; loops++) {
-            for (line = 0; line < gNumFileLines; line++) {
-                Line *cur = &gFileLines[line];
-                iLoop = 0;
-                while (iLoop < opt_iLoopCount) {
-                    t = strxfrm(cur->unixSortKey, cur->unixName, 5000);
-                    iLoop++;
-                }
-            }
-        }
-    }
-
-    unsigned long elapsedTime = timeGetTime() - startTime;
-    int ns = (int)(float(1000000) * (float)elapsedTime / (float)(adj_loopCount*gNumFileLines));
-
-    if (opt_terse) {
-        printf("%d, ", ns);
-    }
-    else {
-        printf("Sort Key Generation: total # of keys = %d\n", loops*gNumFileLines);
-        printf("Sort Key Generation: time per key = %d ns\n", ns);
-    }
-
-    // Sum the key lengths of the API that was just timed.
-    int totalKeyLen = 0;
-    int totalChars = 0;
-    for (line = 0; line < gNumFileLines; line++) {
-        Line *cur = &gFileLines[line];
-        totalChars += u_strlen(cur->name);
-        if (opt_win) {
-            totalKeyLen += strlen(cur->winSortKey);
-        }
-        else if (opt_icu) {
-            totalKeyLen += strlen(cur->icuSortKey);
-        }
-        else if (opt_unix) {
-            totalKeyLen += strlen(cur->unixSortKey);
-        }
-    }
-
-    if (opt_terse) {
-        printf("%f, ", (float)totalKeyLen / (float)totalChars);
-    } else {
-        printf("Key Length / character = %f\n", (float)totalKeyLen / (float)totalChars);
-    }
-}
-
-
-
-//---------------------------------------------------------------------------------------
-//
-// doBinarySearch() Binary Search timing test. Each name from the list
-// is looked up in the full sorted list of names.
-//
-//---------------------------------------------------------------------------------------
-// Looks every line up in gSortedLines by binary search and reports the time
-// per compare.  Exactly one variant runs; the first matching option wins, in
-// the order: -strcmp/-strcmpCPO, ICU, Windows, UNIX.  Each compare is repeated
-// opt_iLoopCount times but counted once in gCount.
-void doBinarySearch()
-{
-    int line;
-    int loops;
-    int iLoop;
-    unsigned long elapsedTime = 0;      // stays 0 when none of the variants is selected
-
-    gCount = 0;
-
-    // Adjust loop count to compensate for file size. Should be order n (lookups) * log n (compares/lookup)
-    // Accurate timings do not depend on this being perfect. The correction is just to try to
-    // get total running times of about the right order, so that the user doesn't need to
-    // manually adjust the loop count for every different file size.
-    double dLoopCount = double(opt_loopCount) * 3000. / (log10(gNumFileLines) * double(gNumFileLines));
-    if (opt_usekeys) dLoopCount *= 5;
-    int adj_loopCount = int(dLoopCount);
-    if (adj_loopCount < 1) adj_loopCount = 1;
-
-    if (opt_strcmp || opt_strcmpCPO)
-    {
-        unsigned long startTime = timeGetTime();
-        typedef int32_t (U_EXPORT2 *PF)(const UChar *, const UChar *);
-        PF pf = u_strcmp;
-        if (opt_strcmpCPO) {pf = u_strcmpCodePointOrder;}
-        if (opt_strcmp && opt_win) {pf = (PF)wcscmp;}  // wcscmp's signature differs from PF,
-                                                       //   which forces the use of a cast here.
-
-        int r = 0;
-        for (loops=0; loops<adj_loopCount; loops++) {
-            for (line=0; line < gNumFileLines; line++) {
-                int hi = gNumFileLines-1;
-                int lo = 0;
-                int guess = -1;
-                for (;;) {
-                    int newGuess = (hi + lo) / 2;
-                    if (newGuess == guess)
-                        break;              // the interval can not shrink any further
-                    guess = newGuess;
-                    for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) {
-                        r = (*pf)((gSortedLines[line])->name, (gSortedLines[guess])->name);
-                    }
-                    gCount++;
-                    if (r == 0)
-                        break;
-                    if (r < 0)
-                        hi = guess;
-                    else
-                        lo = guess;
-                }
-            }
-        }
-        elapsedTime = timeGetTime() - startTime;
-    }
-    else if (opt_icu)
-    {
-        unsigned long startTime = timeGetTime();
-        UCollationResult r = UCOL_EQUAL;
-        for (loops=0; loops<adj_loopCount; loops++) {
-            for (line=0; line < gNumFileLines; line++) {
-                int lineLen = -1;
-                int guessLen = -1;
-                if (opt_uselen) {
-                    lineLen = (gSortedLines[line])->len;
-                }
-                int hi = gNumFileLines-1;
-                int lo = 0;
-                int guess = -1;
-                for (;;) {
-                    int newGuess = (hi + lo) / 2;
-                    if (newGuess == guess)
-                        break;
-                    guess = newGuess;
-                    if (opt_usekeys) {
-                        int ri = 0;
-                        for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) {
-                            ri = strcmp((gSortedLines[line])->icuSortKey, (gSortedLines[guess])->icuSortKey);
-                        }
-                        gCount++;
-                        // Translate the strcmp result into a UCollationResult.
-                        r = UCOL_GREATER;
-                        if (ri < 0) {
-                            r = UCOL_LESS;
-                        } else if (ri == 0) {
-                            r = UCOL_EQUAL;
-                        }
-                    }
-                    else
-                    {
-                        if (opt_uselen) {
-                            guessLen = (gSortedLines[guess])->len;
-                        }
-                        for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) {
-                            r = ucol_strcoll(gCol, (gSortedLines[line])->name, lineLen, (gSortedLines[guess])->name, guessLen);
-                        }
-                        gCount++;
-                    }
-                    if (r == UCOL_EQUAL)
-                        break;
-                    if (r == UCOL_LESS)
-                        hi = guess;
-                    else
-                        lo = guess;
-                }
-            }
-        }
-        elapsedTime = timeGetTime() - startTime;
-    }
-    else if (opt_win)
-    {
-        unsigned long startTime = timeGetTime();
-        int r = 2;      // results use the CompareStringW convention: 1 less, 2 equal, 3 greater
-        for (loops=0; loops<adj_loopCount; loops++) {
-            for (line=0; line < gNumFileLines; line++) {
-                int lineLen = -1;
-                int guessLen = -1;
-                if (opt_uselen) {
-                    lineLen = (gSortedLines[line])->len;
-                }
-                int hi = gNumFileLines-1;
-                int lo = 0;
-                int guess = -1;
-                for (;;) {
-                    int newGuess = (hi + lo) / 2;
-                    if (newGuess == guess)
-                        break;
-                    guess = newGuess;
-                    if (opt_usekeys) {
-                        int cmp = 0;
-                        for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) {
-                            cmp = strcmp((gSortedLines[line])->winSortKey, (gSortedLines[guess])->winSortKey);
-                        }
-                        gCount++;
-                        // strcmp only promises the sign of its result, so map the
-                        // sign (not the value) onto the 1 / 2 / 3 convention.
-                        r = (cmp < 0) ? 1 : ((cmp == 0) ? 2 : 3);
-                    }
-                    else
-                    {
-                        if (opt_uselen) {
-                            guessLen = (gSortedLines[guess])->len;
-                        }
-                        for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) {
-                            r = CompareStringW(gWinLCID, 0, (gSortedLines[line])->name, lineLen, (gSortedLines[guess])->name, guessLen);
-                        }
-                        if (r == 0) {
-                            if (opt_terse == FALSE) {
-                                fprintf(stderr, "Error returned from Windows CompareStringW.\n");
-                            }
-                            exit(-1);
-                        }
-                        gCount++;
-                    }
-                    if (r == 2)         // strings ==
-                        break;
-                    if (r == 1)         // line < guess
-                        hi = guess;
-                    else                // line > guess
-                        lo = guess;
-                }
-            }
-        }
-        elapsedTime = timeGetTime() - startTime;
-    }
-    else if (opt_unix)
-    {
-        unsigned long startTime = timeGetTime();
-        int r = 0;
-        for (loops=0; loops<adj_loopCount; loops++) {
-            for (line=0; line < gNumFileLines; line++) {
-                int hi = gNumFileLines-1;
-                int lo = 0;
-                int guess = -1;
-                for (;;) {
-                    int newGuess = (hi + lo) / 2;
-                    if (newGuess == guess)
-                        break;
-                    guess = newGuess;
-                    if (opt_usekeys) {
-                        for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) {
-                            r = strcmp((gSortedLines[line])->unixSortKey, (gSortedLines[guess])->unixSortKey);
-                        }
-                        gCount++;
-                    }
-                    else
-                    {
-                        // strcoll has no error return value; errno is the only
-                        // failure signal, so it must be cleared before the calls.
-                        errno = 0;
-                        for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) {
-                            r = strcoll((gSortedLines[line])->unixName, (gSortedLines[guess])->unixName);
-                        }
-                        if (errno != 0) {
-                            fprintf(stderr, "Error %d returned from strcoll.\n", errno);
-                            exit(-1);
-                        }
-                        gCount++;
-                    }
-                    if (r == 0)         // strings ==
-                        break;
-                    if (r < 0)          // line < guess
-                        hi = guess;
-                    else                // line > guess
-                        lo = guess;
-                }
-            }
-        }
-        elapsedTime = timeGetTime() - startTime;
-    }
-
-    // gCount is 0 when no variant ran; report 0 ns rather than dividing by zero.
-    int ns = 0;
-    if (gCount > 0) {
-        ns = (int)(float(1000000) * (float)elapsedTime / (float)gCount);
-    }
-    if (opt_terse == FALSE) {
-        printf("binary search: total # of string compares = %d\n", gCount);
-        printf("binary search: compares per loop = %d\n", gCount / adj_loopCount);
-        printf("binary search: time per compare = %d ns\n", ns);
-    } else {
-        printf("%d, ", ns);
-    }
-
-}
-
-
-
-
-//---------------------------------------------------------------------------------------
-//
-// doQSort() The quick sort timing test. Uses the C library qsort function.
-//
-//---------------------------------------------------------------------------------------
-// Repeatedly copies gRandomLines into a scratch buffer and sorts it with the
-// C library qsort, using the comparator that matches the selected API and
-// options, then prints the time per compare.  No sort is run for UNIX with
-// -usekeys (there is no comparator for that combination).
-void doQSort() {
-    int i;
-    Line **sortBuf = new Line *[gNumFileLines];
-
-    // Adjust loop count to compensate for file size. QSort should be n log(n)
-    double dLoopCount = double(opt_loopCount) * 3000. / (log10(gNumFileLines) * double(gNumFileLines));
-    if (opt_usekeys) dLoopCount *= 5;
-    int adj_loopCount = int(dLoopCount);
-    if (adj_loopCount < 1) adj_loopCount = 1;
-
-    // Pick the comparator.  Windows takes precedence over ICU, ICU over UNIX;
-    // within an API, sort keys take precedence over explicit lengths.
-    int (*compare)(const void *, const void *) = 0;
-    if (opt_win) {
-        if (opt_usekeys)     { compare = WinstrcmpK; }
-        else if (opt_uselen) { compare = WinstrcmpL; }
-        else                 { compare = Winstrcmp;  }
-    }
-    else if (opt_icu) {
-        if (opt_usekeys)     { compare = ICUstrcmpK; }
-        else if (opt_uselen) { compare = ICUstrcmpL; }
-        else                 { compare = ICUstrcmp;  }
-    }
-    else if (opt_unix && !opt_usekeys) {
-        compare = UNIXstrcmp;
-    }
-
-    gCount = 0;
-    unsigned long startTime = timeGetTime();
-    if (compare != 0) {
-        // Every variant uses the size-adjusted loop count.
-        for (i=0; i<adj_loopCount; i++) {
-            memcpy(sortBuf, gRandomLines, gNumFileLines * sizeof(Line *));
-            qsort(sortBuf, gNumFileLines, sizeof(Line *), compare);
-        }
-    }
-    unsigned long elapsedTime = timeGetTime() - startTime;
-
-    delete [] sortBuf;
-
-    // gCount is 0 when no sort ran; report 0 ns rather than dividing by zero.
-    int ns = 0;
-    if (gCount > 0) {
-        ns = (int)(float(1000000) * (float)elapsedTime / (float)gCount);
-    }
-    if (opt_terse == FALSE) {
-        printf("qsort: total # of string compares = %d\n", gCount);
-        printf("qsort: time per compare = %d ns\n", ns);
-    } else {
-        printf("%d, ", ns);
-    }
-}
-
-
-
-//---------------------------------------------------------------------------------------
-//
-// doKeyHist() Output a table of data for
-// average sort key size vs. string length.
-//
-//---------------------------------------------------------------------------------------
-// Prints, for each string length that occurs in the file, the average ICU
-// sort key length and the average key length per character.  Lines of
-// length 0 are tallied but not printed.
-void doKeyHist() {
-    int i;
-    int maxLen = 0;
-
-    // Find the maximum string length
-    for (i=0; i<gNumFileLines; i++) {
-        if (gFileLines[i].len > maxLen) maxLen = gFileLines[i].len;
-    }
-
-    // Allocate arrays to hold the histogram data, indexed by string length
-    int *accumulatedLen = new int[maxLen+1];
-    int *numKeysOfSize = new int[maxLen+1];
-    for (i=0; i<=maxLen; i++) {
-        accumulatedLen[i] = 0;
-        numKeysOfSize[i] = 0;
-    }
-
-    // Fill the arrays...
-    for (i=0; i<gNumFileLines; i++) {
-        int len = gFileLines[i].len;
-        accumulatedLen[len] += (int)strlen(gFileLines[i].icuSortKey);
-        numKeysOfSize[len] += 1;
-    }
-
-    // And write out averages
-    printf("String Length, Avg Key Length, Avg Key Len per char\n");
-    for (i=1; i<=maxLen; i++) {
-        if (numKeysOfSize[i] > 0) {
-            printf("%d, %f, %f\n", i, (float)accumulatedLen[i] / (float)numKeysOfSize[i],
-                (float)accumulatedLen[i] / (float)(numKeysOfSize[i] * i));
-        }
-    }
-
-    delete [] accumulatedLen;
-    delete [] numKeysOfSize;
-}
-
-//---------------------------------------------------------------------------------------
-//
-// doForwardIterTest(UBool) Forward iteration test
-// argument null-terminated string used
-//
-//---------------------------------------------------------------------------------------
-// Forward collation element iteration timing.
-//   haslen  TRUE:  string lengths are passed to ICU explicitly.
-//           FALSE: -1 is passed, so ICU treats the text as null terminated.
-// Pass 1 iterates every line on its own.  Pass 2 iterates over all lines
-// concatenated into one buffer, reading 5 elements and then moving the
-// iterator to an offset 10 code units further on.  For both passes the cost
-// of the surrounding loop is timed separately and subtracted.
-void doForwardIterTest(UBool haslen) {
-    int count;
-    int linecount;
-    UErrorCode error = U_ZERO_ERROR;
-
-    printf("\n\nPerforming forward iteration performance test with ");
-    if (haslen) {
-        printf("non-null terminated data -----------\n");
-    }
-    else {
-        printf("null terminated data -----------\n");
-    }
-    printf("performance test on strings from file -----------\n");
-
-    UChar dummytext[] = {0, 0};
-    UCollationElements *iter = ucol_openElements(gCol, NULL, 0, &error);
-    ucol_setText(iter, dummytext, 1, &error);
-
-    // ---- Pass 1: one line at a time ----
-    gCount = 0;
-    unsigned long startTime = timeGetTime();
-    for (count = 0; count < opt_loopCount; count++) {
-        for (linecount = 0; linecount < gNumFileLines; linecount++) {
-            UChar *text = gFileLines[linecount].name;
-            int textLen = haslen ? gFileLines[linecount].len : -1;
-            ucol_setText(iter, text, textLen, &error);
-            while (ucol_next(iter, &error) != UCOL_NULLORDER) {
-                gCount++;
-            }
-        }
-    }
-    unsigned long elapsedTime = timeGetTime() - startTime;
-    printf("elapsedTime %lu\n", elapsedTime);
-
-    // empty loop recalculation: same loops without ucol_next()
-    startTime = timeGetTime();
-    for (count = 0; count < opt_loopCount; count++) {
-        for (linecount = 0; linecount < gNumFileLines; linecount++) {
-            UChar *text = gFileLines[linecount].name;
-            int textLen = haslen ? gFileLines[linecount].len : -1;
-            ucol_setText(iter, text, textLen, &error);
-        }
-    }
-    elapsedTime -= (timeGetTime() - startTime);
-    printf("elapsedTime %lu\n", elapsedTime);
-
-    ucol_closeElements(iter);
-
-    // gCount is 0 for an empty file; report 0 ns rather than dividing by zero.
-    int ns = 0;
-    if (gCount > 0) {
-        ns = (int)(float(1000000) * (float)elapsedTime / (float)gCount);
-    }
-    printf("Total number of strings compared %d in %d loops\n", gNumFileLines,
-                                                                opt_loopCount);
-    printf("Average time per ucol_next() nano seconds %d\n", ns);
-
-    // ---- Pass 2: all lines concatenated ----
-    printf("performance test on skipped-5 concatenated strings from file -----------\n");
-
-    int totalLen = 0;
-    for (linecount = 0; linecount < gNumFileLines; linecount++) {
-        totalLen += haslen ? gFileLines[linecount].len
-                           : u_strlen(gFileLines[linecount].name);
-    }
-
-    // One extra UChar for the terminator: with haslen FALSE the buffer is
-    // handed to ICU as a null terminated string.
-    UChar *str = (UChar *)malloc(sizeof(UChar) * (totalLen + 1));
-    if (str == NULL) {
-        fprintf(stderr, "Out of memory.\n");
-        exit(-1);
-    }
-    int strindex = 0;
-    for (linecount = 0; linecount < gNumFileLines; linecount++) {
-        int len = haslen ? gFileLines[linecount].len
-                         : u_strlen(gFileLines[linecount].name);
-        memcpy(str + strindex, gFileLines[linecount].name, sizeof(UChar) * len);
-        strindex += len;
-    }
-    str[totalLen] = 0;
-
-    printf("Total size of strings %d\n", totalLen);
-
-    gCount = 0;
-    iter = ucol_openElements(gCol, str, haslen ? totalLen : -1, &error);
-
-    int limit = totalLen - 5;   // any left over characters are not iterated,
-                                // this is to ensure the backwards and forwards iterators
-                                // gets the same position
-    startTime = timeGetTime();
-    for (count = 0; count < opt_loopCount; count++) {
-        int count5 = 5;
-        strindex = 0;
-        ucol_setOffset(iter, strindex, &error);
-        while (TRUE) {
-            if (ucol_next(iter, &error) == UCOL_NULLORDER) {
-                break;
-            }
-            gCount++;
-            count5 --;
-            if (count5 == 0) {
-                strindex += 10;
-                if (strindex > limit) {
-                    break;
-                }
-                ucol_setOffset(iter, strindex, &error);
-                count5 = 5;
-            }
-        }
-    }
-
-    elapsedTime = timeGetTime() - startTime;
-    printf("elapsedTime %lu\n", elapsedTime);
-
-    // empty loop recalculation: same offset stepping without ucol_next()
-    int tempgCount = 0;
-    startTime = timeGetTime();
-    for (count = 0; count < opt_loopCount; count++) {
-        int count5 = 5;
-        strindex = 0;
-        ucol_setOffset(iter, strindex, &error);
-        while (TRUE) {
-            tempgCount ++;
-            count5 --;
-            if (count5 == 0) {
-                strindex += 10;
-                if (strindex > limit) {
-                    break;
-                }
-                ucol_setOffset(iter, strindex, &error);
-                count5 = 5;
-            }
-        }
-    }
-    elapsedTime -= (timeGetTime() - startTime);
-    printf("elapsedTime %lu\n", elapsedTime);
-
-    ucol_closeElements(iter);
-    free(str);
-
-    printf("gCount %d\n", gCount);
-    ns = 0;
-    if (gCount > 0) {
-        ns = (int)(float(1000000) * (float)elapsedTime / (float)gCount);
-    }
-    printf("Average time per ucol_next() nano seconds %d\n", ns);
-}
-
-//---------------------------------------------------------------------------------------
-//
-// doBackwardIterTest(UBool) Backwards iteration test
-// argument null-terminated string used
-//
-//---------------------------------------------------------------------------------------
-// Backward collation element iteration timing.
-//   haslen  TRUE:  string lengths are passed to ICU explicitly.
-//           FALSE: -1 is passed, so ICU treats the text as null terminated.
-// Pass 1 iterates every line on its own with ucol_previous().  Pass 2 works
-// on all lines concatenated into one buffer: starting at offset 5 it reads 5
-// elements backwards, then moves the iterator to an offset 10 code units
-// further on.  For both passes the cost of the surrounding loop is timed
-// separately and subtracted.
-void doBackwardIterTest(UBool haslen) {
-    int count;
-    int linecount;
-    UErrorCode error = U_ZERO_ERROR;
-
-    printf("\n\nPerforming backward iteration performance test with ");
-    if (haslen) {
-        printf("non-null terminated data -----------\n");
-    }
-    else {
-        printf("null terminated data -----------\n");
-    }
-    printf("performance test on strings from file -----------\n");
-
-    UCollationElements *iter = ucol_openElements(gCol, NULL, 0, &error);
-    UChar dummytext[] = {0, 0};
-    ucol_setText(iter, dummytext, 1, &error);
-
-    // ---- Pass 1: one line at a time ----
-    gCount = 0;
-    unsigned long startTime = timeGetTime();
-    for (count = 0; count < opt_loopCount; count++) {
-        for (linecount = 0; linecount < gNumFileLines; linecount++) {
-            UChar *text = gFileLines[linecount].name;
-            int textLen = haslen ? gFileLines[linecount].len : -1;
-            ucol_setText(iter, text, textLen, &error);
-            while (ucol_previous(iter, &error) != UCOL_NULLORDER) {
-                gCount ++;
-            }
-        }
-    }
-    unsigned long elapsedTime = timeGetTime() - startTime;
-    printf("elapsedTime %lu\n", elapsedTime);
-
-    // empty loop recalculation: same loops without ucol_previous()
-    startTime = timeGetTime();
-    for (count = 0; count < opt_loopCount; count++) {
-        for (linecount = 0; linecount < gNumFileLines; linecount++) {
-            UChar *text = gFileLines[linecount].name;
-            int textLen = haslen ? gFileLines[linecount].len : -1;
-            ucol_setText(iter, text, textLen, &error);
-        }
-    }
-    elapsedTime -= (timeGetTime() - startTime);
-    printf("elapsedTime %lu\n", elapsedTime);
-
-    ucol_closeElements(iter);
-
-    // gCount is 0 for an empty file; report 0 ns rather than dividing by zero.
-    int ns = 0;
-    if (gCount > 0) {
-        ns = (int)(float(1000000) * (float)elapsedTime / (float)gCount);
-    }
-    printf("Total number of strings compared %d in %d loops\n", gNumFileLines,
-                                                                opt_loopCount);
-    printf("Average time per ucol_previous() nano seconds %d\n", ns);
-
-    // ---- Pass 2: all lines concatenated ----
-    printf("performance test on skipped-5 concatenated strings from file -----------\n");
-
-    int totalLen = 0;
-    for (linecount = 0; linecount < gNumFileLines; linecount++) {
-        totalLen += haslen ? gFileLines[linecount].len
-                           : u_strlen(gFileLines[linecount].name);
-    }
-
-    // One extra UChar for the terminator: with haslen FALSE the buffer is
-    // handed to ICU as a null terminated string.
-    UChar *str = (UChar *)malloc(sizeof(UChar) * (totalLen + 1));
-    if (str == NULL) {
-        fprintf(stderr, "Out of memory.\n");
-        exit(-1);
-    }
-    int strindex = 0;
-    for (linecount = 0; linecount < gNumFileLines; linecount++) {
-        int len = haslen ? gFileLines[linecount].len
-                         : u_strlen(gFileLines[linecount].name);
-        memcpy(str + strindex, gFileLines[linecount].name, sizeof(UChar) * len);
-        strindex += len;
-    }
-    str[totalLen] = 0;
-
-    printf("Total size of strings %d\n", totalLen);
-
-    gCount = 0;
-    iter = ucol_openElements(gCol, str, haslen ? totalLen : -1, &error);
-
-    startTime = timeGetTime();
-    for (count = 0; count < opt_loopCount; count++) {
-        int count5 = 5;
-        strindex = 5;
-        ucol_setOffset(iter, strindex, &error);
-        while (TRUE) {
-            if (ucol_previous(iter, &error) == UCOL_NULLORDER) {
-                break;
-            }
-            gCount ++;
-            count5 --;
-            if (count5 == 0) {
-                strindex += 10;
-                if (strindex > totalLen) {
-                    break;
-                }
-                ucol_setOffset(iter, strindex, &error);
-                count5 = 5;
-            }
-        }
-    }
-
-    elapsedTime = timeGetTime() - startTime;
-    printf("elapsedTime %lu\n", elapsedTime);
-
-    // empty loop recalculation: same offset stepping without ucol_previous()
-    int tempgCount = 0;
-    startTime = timeGetTime();
-    for (count = 0; count < opt_loopCount; count++) {
-        int count5 = 5;
-        strindex = 5;
-        ucol_setOffset(iter, strindex, &error);
-        while (TRUE) {
-            tempgCount ++;
-            count5 --;
-            if (count5 == 0) {
-                strindex += 10;
-                if (strindex > totalLen) {
-                    break;
-                }
-                ucol_setOffset(iter, strindex, &error);
-                count5 = 5;
-            }
-        }
-    }
-    elapsedTime -= (timeGetTime() - startTime);
-    printf("elapsedTime %lu\n", elapsedTime);
-
-    ucol_closeElements(iter);
-    free(str);
-
-    printf("gCount %d\n", gCount);
-    ns = 0;
-    if (gCount > 0) {
-        ns = (int)(float(1000000) * (float)elapsedTime / (float)gCount);
-    }
-    printf("Average time per ucol_previous() nano seconds %d\n", ns);
-}
-
-//---------------------------------------------------------------------------------------
-//
-// doIterTest() Iteration test
-//
-//---------------------------------------------------------------------------------------
-// Runs the forward iteration test and then the backward one, both with the
-// string-length mode selected by -uselen.
-void doIterTest() {
-    const UBool useLengths = opt_uselen;
-
-    doForwardIterTest(useLengths);
-    doBackwardIterTest(useLengths);
-}
-
-
-//----------------------------------------------------------------------------------------
-//
-// UnixConvert -- Convert the lines of the file to the encoding for UNIX
-// Since it appears that Unicode support is going in the general
-// direction of the use of UTF-8 locales, that is the approach
-// that is used here.
-//
-//----------------------------------------------------------------------------------------
-// Fills in unixName for every Line: the UTF-8 conversion of name, in a
-// freshly allocated, null terminated buffer.  Each string is converted twice:
-// once with an empty target to learn the size, once for real.
-// Any ICU failure prints a message to stderr and exits the program.
-void UnixConvert() {
-    int line;
-
-    UConverter *cvrtr;                       // An ICU code page converter.
-    UErrorCode status = U_ZERO_ERROR;
-
-
-    cvrtr = ucnv_open("utf-8", &status);     // we are just doing UTF-8 locales for now.
-    if (U_FAILURE(status)) {
-        fprintf(stderr, "ICU Converter open failed.: %d\n", (int)status);
-        exit(-1);
-    }
-
-    for (line=0; line < gNumFileLines; line++) {
-        // Sizing call: no target buffer, only the required length is wanted.
-        int sizeNeeded = ucnv_fromUChars(cvrtr,
-                                         0,            // ptr to target buffer.
-                                         0,            // length of target buffer.
-                                         gFileLines[line].name,
-                                         -1,           // source is null terminated
-                                         &status);
-        // A buffer overflow is the expected outcome of the sizing call, and
-        // ICU warning codes are not failures; only real errors are fatal.
-        if (status != U_BUFFER_OVERFLOW_ERROR && U_FAILURE(status)) {
-            fprintf(stderr, "Conversion from Unicode, something is wrong.\n");
-            exit(-1);
-        }
-        status = U_ZERO_ERROR;
-        gFileLines[line].unixName = new char[sizeNeeded+1];
-        sizeNeeded = ucnv_fromUChars(cvrtr,
-                                     gFileLines[line].unixName, // ptr to target buffer.
-                                     sizeNeeded+1,              // length of target buffer.
-                                     gFileLines[line].name,
-                                     -1,                        // source is null terminated
-                                     &status);
-        if (U_FAILURE(status)) {
-            fprintf(stderr, "ICU Conversion Failed.: %d\n", (int)status);
-            exit(-1);
-        }
-        gFileLines[line].unixName[sizeNeeded] = 0;
-    }
-    ucnv_close(cvrtr);
-}
-
-
-//----------------------------------------------------------------------------------------
-//
-//    class UCharFile    Hides the details of reading an input file and
-//                       turning it into a stream of UChars.
-//
-//                       Construct with a file name, then call get() repeatedly.
-//                       eof() and error() report the state flags maintained by
-//                       the constructor and by get().
-//
-//----------------------------------------------------------------------------------------
-class UCharFile {
-public:
-    UCharFile(const char *fileName);
-    ~UCharFile();
-
-    UChar get();                            // Next UTF-16 code unit from the file.
-    UBool eof()   { return fEof; }          // Value of the end-of-file flag.
-    UBool error() { return fError; }        // Value of the error flag.
-
-private:
-    // Copy construction and assignment are private so that client code
-    // can not copy a UCharFile.
-    UCharFile(const UCharFile &other) {}
-    UCharFile &operator=(const UCharFile &other) { return *this; }
-
-    FILE        *fFile;                     // Underlying stdio stream.
-    const char  *fName;                     // File name as given to the constructor.
-    UBool        fEof;
-    UBool        fError;
-    UChar        fPending2ndSurrogate;      // Trail surrogate held back by get(); 0 if none.
-
-    enum {UTF16LE, UTF16BE, UTF8} fEncoding;
-};
-
-//
-//  Open fileName in binary mode and determine its encoding from the byte
-//  order mark.  On any failure (file can not be opened, or no UTF-16 / UTF-8
-//  BOM is present) a message is written to stderr and the error flag is set;
-//  callers are expected to check error() before calling get().
-//
-UCharFile::UCharFile(const char * fileName) {
-    fEof                 = FALSE;
-    fError               = FALSE;
-    fName                = fileName;
-    fPending2ndSurrogate = 0;
-    fEncoding            = UTF8;     // Defined placeholder until the BOM has been examined.
-    fFile                = fopen(fName, "rb");
-    if (fFile == NULL) {
-        // Report the file this object was asked to open, which is not
-        // necessarily the -file option (it may be the rules file).
-        fprintf(stderr, "Can not open file \"%s\"\n", fName);
-        fError = TRUE;
-        return;
-    }
-
-    //
-    //  Look for the byte order mark at the start of the file.
-    //  The third byte is only consumed when the first two match the UTF-8 BOM.
-    //
-    int BOMC1 = fgetc(fFile);
-    int BOMC2 = fgetc(fFile);
-
-    if (BOMC1 == 0xff && BOMC2 == 0xfe) {
-        fEncoding = UTF16LE;
-    }
-    else if (BOMC1 == 0xfe && BOMC2 == 0xff) {
-        fEncoding = UTF16BE;
-    }
-    else if (BOMC1 == 0xEF && BOMC2 == 0xBB && fgetc(fFile) == 0xBF) {
-        fEncoding = UTF8;
-    }
-    else {
-        fprintf(stderr, "collperf:  file \"%s\" encoding must be UTF-8 or UTF-16, and "
-            "must include a BOM.\n", fileName);
-        fError = TRUE;
-        return;
-    }
-}
-
-
-//
-//  Close the underlying stream.  The constructor leaves fFile NULL when the
-//  file could not be opened, and fclose(NULL) is undefined, so guard for it.
-//
-UCharFile::~UCharFile() {
-    if (fFile != NULL) {
-        fclose(fFile);
-        fFile = NULL;
-    }
-}
-
-
-
-//
-//  Return the next UTF-16 code unit from the file.
-//
-//    - At end of input, returns 0 and sets the eof flag.
-//    - On malformed UTF-8, prints a message, sets the error flag and returns 0.
-//    - A UTF-8 sequence for a supplementary code point is returned as two
-//      code units: the lead surrogate on this call, the trail surrogate on
-//      the following call.
-//
-UChar UCharFile::get() {
-    UChar c = 0;
-    switch (fEncoding) {
-    case UTF16LE:
-    case UTF16BE:
-        {
-            // Read both bytes, and test for EOF *before* combining them:
-            // shifting EOF (a negative int) left is not well defined.
-            int first  = fgetc(fFile);
-            int second = fgetc(fFile);
-            if (first == EOF || second == EOF) {
-                c    = 0;
-                fEof = TRUE;
-            }
-            else if (fEncoding == UTF16LE) {
-                c = (UChar)(first | (second << 8));
-            }
-            else {
-                c = (UChar)(second | (first << 8));
-            }
-            break;
-        }
-    case UTF8:
-        {
-            // Second half of a surrogate pair left over from the previous call?
-            if (fPending2ndSurrogate != 0) {
-                c = fPending2ndSurrogate;
-                fPending2ndSurrogate = 0;
-                break;
-            }
-
-            int ch = fgetc(fFile);   // Note:  c and ch are separate because the eof test
-                                     //        doesn't work on the UChar type.
-            if (ch == EOF) {
-                c    = 0;
-                fEof = TRUE;
-                break;
-            }
-
-            if (ch <= 0x7f) {
-                // It's ascii.  No further utf-8 conversion.
-                c = (UChar)ch;
-                break;
-            }
-
-            // Figure out the length of the char from its lead byte.
-            // 0x80..0xBF are trail bytes and 0xF8..0xFF never appear in
-            // UTF-8, so neither can start a character.
-            int nBytes;
-            if (ch >= 0xF8)      {nBytes=0;}
-            else if (ch >= 0xF0) {nBytes=4;}
-            else if (ch >= 0xE0) {nBytes=3;}
-            else if (ch >= 0xC0) {nBytes=2;}
-            else                 {nBytes=0;}
-            if (nBytes == 0) {
-                fprintf(stderr, "utf-8 encoded file contains corrupt data.\n");
-                fError = TRUE;
-                return 0;
-            }
-
-            // Read the rest of the bytes of the char into a temp array.
-            // Each must be a trail byte (0x80..0xBF).  An EOF in the middle
-            // of a sequence becomes 0xFF here and is reported as corrupt data.
-            unsigned char  bytes[10];
-            bytes[0] = (unsigned char)ch;
-            int i;
-            for (i=1; i<nBytes; i++) {
-                bytes[i] = (unsigned char)fgetc(fFile);
-                if (bytes[i] < 0x80 || bytes[i] >= 0xc0) {
-                    fprintf(stderr, "utf-8 encoded file contains corrupt data.\n");
-                    fError = TRUE;
-                    return 0;
-                }
-            }
-
-            // Convert the bytes from the temp array to a Unicode char.
-            i = 0;
-            uint32_t  cp;
-            UTF8_NEXT_CHAR_UNSAFE(bytes, i, cp);
-            c = (UChar)cp;
-
-            if (cp >= 0x10000) {
-                // The code point needs to be broken up into a utf-16 surrogate pair.
-                //  Process first half this time through the main loop, and
-                //   remember the other half for the next time through.
-                UChar utf16Buf[3];
-                i = 0;
-                UTF16_APPEND_CHAR_UNSAFE(utf16Buf, i, cp);
-                fPending2ndSurrogate = utf16Buf[1];
-                c = utf16Buf[0];
-            }
-            break;
-        }
-    default:
-        // No other encodings are defined; c stays 0.
-        break;
-    }
-    return c;
-}
-
-//----------------------------------------------------------------------------------------
-//
-//   openRulesCollator  -  Command line specified a rules file.  Read it in
-//                         and open a collator with it.
-//
-//                         Returns the new collator, or 0 if the file could not
-//                         be read, memory ran out, or ucol_openRules() failed.
-//                         The temporary rules buffer is released on every path.
-//
-//----------------------------------------------------------------------------------------
-UCollator *openRulesCollator() {
-    UCharFile f(opt_rules);
-    if (f.error()) {
-        return 0;
-    }
-
-    const int  GROW_BY = 10000;            // Buffer growth step, in UChars.
-    int        bufLen  = GROW_BY;
-    UChar     *buf     = (UChar *)malloc(bufLen * sizeof(UChar));
-    if (buf == NULL) {
-        fprintf(stderr, "collperf: out of memory reading rules file.\n");
-        return 0;
-    }
-
-    int i = 0;
-    for (;;) {
-        buf[i] = f.get();
-        if (f.eof()) {
-            break;
-        }
-        if (f.error()) {
-            free(buf);
-            return 0;
-        }
-        i++;
-        if (i >= bufLen) {
-            // Grow the buffer.  The size passed to realloc is in bytes, so it
-            // must be scaled by sizeof(UChar).  Keep the old pointer until the
-            // reallocation is known to have succeeded.
-            bufLen += GROW_BY;
-            UChar *grown = (UChar *)realloc(buf, bufLen * sizeof(UChar));
-            if (grown == NULL) {
-                fprintf(stderr, "collperf: out of memory reading rules file.\n");
-                free(buf);
-                return 0;
-            }
-            buf = grown;
-        }
-    }
-    buf[i] = 0;      // i < bufLen is guaranteed by the growth check above.
-
-    UErrorCode status = U_ZERO_ERROR;
-    UCollator *coll = ucol_openRules(buf, u_strlen(buf), UCOL_OFF,
-                                     UCOL_DEFAULT_STRENGTH, NULL, &status);
-    free(buf);
-    if (U_FAILURE(status)) {
-        fprintf(stderr, "ICU ucol_openRules() open failed.: %d\n", status);
-        return 0;
-    }
-    return coll;
-}
-
-
-
-
-
-//----------------------------------------------------------------------------------------
-//
-//    Main   --  process command line, read in and pre-process the test file,
-//                 call other functions to do the actual tests.
-//
-//               Steps, in order:
-//                 1. Parse options; print usage and exit(1) if they are bad,
-//                    -help was given, or no input file was named.
-//                 2. Open the ICU collator (from a rules file or a locale) and
-//                    apply the attribute options.
-//                 3. Set up the Windows LCID and, for -unix, the C locale.
-//                 4. Read the input file into gFileLines, one entry per
-//                    non-empty line.
-//                 5. Pre-compute ICU, Windows and (for -unix) UNIX sort keys.
-//                 6. Optionally dump lines, collation elements and sort keys.
-//                 7. Build the sorted and randomized line orders.
-//                 8. Run whichever tests were requested.
-//
-//               Returns 0 on success.  Failures either return -1 or call exit().
-//
-//----------------------------------------------------------------------------------------
-int main(int argc, const char** argv) {
-    if (ProcessOptions(argc, argv, opts) != TRUE || opt_help || opt_fName == 0) {
-        // Never hand a non-literal string to printf as the format.
-        printf("%s", gUsageString);
-        exit (1);
-    }
-
-    // Make sure that we've only got one API selected.
-    if (opt_unix || opt_win) opt_icu = FALSE;
-    if (opt_unix) opt_win = FALSE;
-
-    //
-    //  Set up an ICU collator
-    //
-    UErrorCode status = U_ZERO_ERROR;
-
-    if (opt_rules != 0) {
-        gCol = openRulesCollator();
-        if (gCol == 0) {return -1;}
-    }
-    else {
-        gCol = ucol_open(opt_locale, &status);
-        if (U_FAILURE(status)) {
-            fprintf(stderr, "Collator creation failed.: %d\n", status);
-            return -1;
-        }
-    }
-    if (status==U_USING_DEFAULT_WARNING && opt_terse==FALSE) {
-        fprintf(stderr, "Warning, U_USING_DEFAULT_WARNING for %s\n", opt_locale);
-    }
-    if (status==U_USING_FALLBACK_WARNING && opt_terse==FALSE) {
-        // The message names the status code that was actually tested.
-        fprintf(stderr, "Warning, U_USING_FALLBACK_WARNING for %s\n", opt_locale);
-    }
-
-    if (opt_norm) {
-        ucol_setAttribute(gCol, UCOL_NORMALIZATION_MODE, UCOL_ON, &status);
-    }
-    if (opt_french && opt_frenchoff) {
-        fprintf(stderr, "collperf:  Error, specified both -french and -frenchoff options.\n");
-        exit(-1);
-    }
-    if (opt_french) {
-        ucol_setAttribute(gCol, UCOL_FRENCH_COLLATION, UCOL_ON, &status);
-    }
-    if (opt_frenchoff) {
-        ucol_setAttribute(gCol, UCOL_FRENCH_COLLATION, UCOL_OFF, &status);
-    }
-    if (opt_lower) {
-        ucol_setAttribute(gCol, UCOL_CASE_FIRST, UCOL_LOWER_FIRST, &status);
-    }
-    if (opt_upper) {
-        ucol_setAttribute(gCol, UCOL_CASE_FIRST, UCOL_UPPER_FIRST, &status);
-    }
-    if (opt_case) {
-        ucol_setAttribute(gCol, UCOL_CASE_LEVEL, UCOL_ON, &status);
-    }
-    if (opt_shifted) {
-        ucol_setAttribute(gCol, UCOL_ALTERNATE_HANDLING, UCOL_SHIFTED, &status);
-    }
-    if (opt_level != 0) {
-        switch (opt_level) {
-        case 1:
-            ucol_setAttribute(gCol, UCOL_STRENGTH, UCOL_PRIMARY, &status);
-            break;
-        case 2:
-            ucol_setAttribute(gCol, UCOL_STRENGTH, UCOL_SECONDARY, &status);
-            break;
-        case 3:
-            ucol_setAttribute(gCol, UCOL_STRENGTH, UCOL_TERTIARY, &status);
-            break;
-        case 4:
-            ucol_setAttribute(gCol, UCOL_STRENGTH, UCOL_QUATERNARY, &status);
-            break;
-        case 5:
-            ucol_setAttribute(gCol, UCOL_STRENGTH, UCOL_IDENTICAL, &status);
-            break;
-        default:
-            fprintf(stderr, "-level param must be between 1 and 5\n");
-            exit(-1);
-        }
-    }
-
-    if (U_FAILURE(status)) {
-        fprintf(stderr, "Collator attribute setting failed.: %d\n", status);
-        return -1;
-    }
-
-
-    //
-    //  Set up a Windows LCID
-    //
-    if (opt_langid != 0) {
-        gWinLCID = MAKELCID(opt_langid, SORT_DEFAULT);
-    }
-    else {
-        gWinLCID = uloc_getLCID(opt_locale);
-    }
-
-
-    //
-    //  Set the UNIX locale
-    //
-    if (opt_unix) {
-        if (setlocale(LC_ALL, opt_locale) == 0) {
-            fprintf(stderr, "setlocale(LC_ALL, %s) failed.\n", opt_locale);
-            exit(-1);
-        }
-    }
-
-    // Read in the input file.
-    //   The file may be UTF-8 or UTF-16; UCharFile hands back UTF-16 code units.
-    //   Lines go onto heap buffers.  Global index array to line starts is created.
-    //   Lines themselves are null terminated.
-    //
-
-    UCharFile f(opt_fName);
-    if (f.error()) {
-        exit(-1);
-    }
-
-    const int MAXLINES = 40000;
-    gFileLines = new Line[MAXLINES];
-    UChar buf[1024];
-    int   column = 0;
-
-    //  Read the file, split into lines, and save in memory.
-    //  Loop runs once per utf-16 value from the input file,
-    //    (The number of bytes read from file per loop iteration depends on external encoding.)
-    for (;;) {
-
-        UChar c = f.get();
-        if (f.error()){
-            exit(-1);
-        }
-
-
-        // We now have a good UTF-16 value in c.
-
-        // Watch for CR, LF, EOF; these finish off a line.
-        if (c == 0xd) {
-            continue;
-        }
-
-        if (f.eof() || c == 0x0a || c==0x2028) {  // Unipad inserts 2028 line separators!
-            buf[column++] = 0;
-            if (column > 1) {
-                // Non-empty line: copy it, terminator included, to the heap.
-                gFileLines[gNumFileLines].name  = new UChar[column];
-                gFileLines[gNumFileLines].len   = column-1;
-                memcpy(gFileLines[gNumFileLines].name, buf, column * sizeof(UChar));
-                gNumFileLines++;
-                if (gNumFileLines >= MAXLINES) {
-                    fprintf(stderr, "File too big.  Max number of lines is %d\n", MAXLINES);
-                    exit(-1);
-                }
-            }
-            // Always restart at the beginning of buf, for empty lines too.
-            // Otherwise the terminator just written for an empty line would
-            // remain at buf[0] and become the first character of the next line.
-            column = 0;
-            if (c == 0xa || c == 0x2028)
-                continue;
-            else
-                break;  // EOF
-        }
-        buf[column++] = c;
-        if (column >= 1023)
-        {
-            static UBool warnFlag = TRUE;
-            if (warnFlag) {
-                fprintf(stderr, "Warning - file line longer than 1023 chars truncated.\n");
-                warnFlag = FALSE;
-            }
-            // Stay put; further characters of this line overwrite the last slot.
-            column--;
-        }
-    }
-
-    if (opt_terse == FALSE) {
-        printf("file \"%s\", %d lines.\n", opt_fName, gNumFileLines);
-    }
-
-
-    // Convert the lines to the UNIX encoding.
-    if (opt_unix) {
-        UnixConvert();
-    }
-
-    //
-    //  Pre-compute ICU sort keys for the lines of the file.
-    //  buf is reused here as a byte scratch area; if the key does not fit,
-    //  it is generated a second time directly into the heap buffer.
-    //
-    int line;
-    int t;
-
-    for (line=0; line<gNumFileLines; line++) {
-        t = ucol_getSortKey(gCol, gFileLines[line].name, -1, (unsigned char *)buf, sizeof(buf));
-        gFileLines[line].icuSortKey  = new char[t];
-
-        if (t > sizeof(buf)) {
-            t = ucol_getSortKey(gCol, gFileLines[line].name, -1, (unsigned char *)gFileLines[line].icuSortKey , t);
-        }
-        else
-        {
-            memcpy(gFileLines[line].icuSortKey, buf, t);
-        }
-    }
-
-
-
-    //
-    //  Pre-compute Windows sort keys for the lines of the file.
-    //  NOTE(review): LCMapStringW is documented to return 0 rather than the
-    //  needed size when the destination is too small, so the (t > sizeof(buf))
-    //  branch presumably never runs - confirm and handle t == 0 if so.
-    //
-    for (line=0; line<gNumFileLines; line++) {
-        t=LCMapStringW(gWinLCID, LCMAP_SORTKEY, gFileLines[line].name, -1, buf, sizeof(buf));
-        gFileLines[line].winSortKey  = new char[t];
-        if (t > sizeof(buf)) {
-            t = LCMapStringW(gWinLCID, LCMAP_SORTKEY, gFileLines[line].name, -1, (unsigned short *)(gFileLines[line].winSortKey), t);
-        }
-        else
-        {
-            memcpy(gFileLines[line].winSortKey, buf, t);
-        }
-    }
-
-    //
-    //  Pre-compute UNIX sort keys for the lines of the file.
-    //  strxfrm() returns the key length NOT counting the terminating null, so
-    //  the key buffer needs one extra byte, and a key of exactly sizeof(buf)
-    //  bytes did not fit in buf either.
-    //
-    if (opt_unix) {
-        for (line=0; line<gNumFileLines; line++) {
-            size_t keyLen = strxfrm((char *)buf,  gFileLines[line].unixName,  sizeof(buf));
-            gFileLines[line].unixSortKey  = new char[keyLen + 1];
-            if (keyLen >= sizeof(buf)) {
-                // Scratch buffer was too small; transform straight into the key buffer.
-                strxfrm(gFileLines[line].unixSortKey,  gFileLines[line].unixName,  keyLen + 1);
-            }
-            else
-            {
-                // Copy the key together with its terminating null.
-                memcpy(gFileLines[line].unixSortKey, buf, keyLen + 1);
-            }
-        }
-    }
-
-
-    //
-    //  Dump file lines, CEs, Sort Keys if requested.
-    //
-    if (opt_dump) {
-        int  i;
-        for (line=0; line<gNumFileLines; line++) {
-            // The line itself; anything outside printable ASCII is shown as \uXXXX.
-            for (i=0;;i++) {
-                UChar  c = gFileLines[line].name[i];
-                if (c == 0)
-                    break;
-                if (c < 0x20 || c > 0x7e) {
-                    printf("\\u%.4x", c);
-                }
-                else {
-                    printf("%c", c);
-                }
-            }
-            printf("\n");
-
-            // Collation elements, wrapped after every ninth value.
-            printf(" CEs: ");
-            UCollationElements *CEiter = ucol_openElements(gCol, gFileLines[line].name, -1, &status);
-            int32_t ce;
-            i = 0;
-            for (;;) {
-                ce = ucol_next(CEiter, &status);
-                if (ce == UCOL_NULLORDER) {
-                    break;
-                }
-                printf(" %.8x", ce);
-                if (++i > 8) {
-                    printf("\n ");
-                    i = 0;
-                }
-            }
-            printf("\n");
-            ucol_closeElements(CEiter);
-
-
-            // ICU sort key bytes, up to and including the terminating zero byte.
-            printf(" ICU Sort Key: ");
-            for (i=0; ; i++) {
-                unsigned char c = gFileLines[line].icuSortKey[i];
-                printf("%02x ", c);
-                if (c == 0) {
-                    break;
-                }
-                if (i > 0 && i % 20 == 0) {
-                    printf("\n ");
-                }
-            }
-            printf("\n");
-        }
-    }
-
-
-    //
-    //  Pre-sort the lines.
-    //
-    int i;
-    gSortedLines = new Line *[gNumFileLines];
-    for (i=0; i<gNumFileLines; i++) {
-        gSortedLines[i] = &gFileLines[i];
-    }
-
-    if (opt_win) {
-        qsort(gSortedLines, gNumFileLines, sizeof(Line *), Winstrcmp);
-    }
-    else if (opt_unix) {
-        qsort(gSortedLines, gNumFileLines, sizeof(Line *), UNIXstrcmp);
-    }
-    else   /* ICU */
-    {
-        qsort(gSortedLines, gNumFileLines, sizeof(Line *), ICUstrcmp);
-    }
-
-
-    //
-    //  Make up a randomized order, will be used for sorting tests.
-    //
-    gRandomLines = new Line *[gNumFileLines];
-    for (i=0; i<gNumFileLines; i++) {
-        gRandomLines[i] = &gFileLines[i];
-    }
-    qsort(gRandomLines, gNumFileLines, sizeof(Line *), ICURandomCmp);
-
-
-
-
-    //
-    //  We've got the file read into memory.  Go do something with it.
-    //
-
-    if (opt_qsort)     doQSort();
-    if (opt_binsearch) doBinarySearch();
-    if (opt_keygen)    doKeyGen();
-    if (opt_keyhist)   doKeyHist();
-    if (opt_itertest)  doIterTest();
-
-    return 0;
-
-}