icuSources/test/perf/collationperf/collperf.cpp

   1 /********************************************************************
   2  * COPYRIGHT:
   3  * Copyright (C) 2001-2011 IBM, Inc.   All Rights Reserved.
   4  *
   5  ********************************************************************/
   6 /********************************************************************************
   7 *
   8 * File CALLCOLL.C
   9 *
  10 * Modification History:
  11 *        Name                     Description
  12 *     Andy Heninger             First Version
  13 *
  14 *********************************************************************************
  15 */
  16
  17 //
  18 //  This program tests string collation and sort key generation performance.
  19 //      Three APIs can be teste: ICU C , Unix strcoll, strxfrm and Windows LCMapString
  20 //      A file of names is required as input, one per line.  It must be in utf-8 or utf-16 format,
  21 //      and include a byte order mark.  Either LE or BE format is OK.
  22 //
  23
  24 const char gUsageString[] =
  25  "usage:  collperf options...\n"
  26     "-help                      Display this message.\n"
  27     "-file file_name            utf-16 format file of names.\n"
  28     "-locale name               ICU locale to use.  Default is en_US\n"
  29     "-rules file_name           Collation rules file (overrides locale)\n"
  30     "-langid 0x1234             Windows Language ID number.  Default to value for -locale option\n"
  31     "                              see http://msdn.microsoft.com/library/psdk/winbase/nls_8xo3.htm\n"
  32     "-win                       Run test using Windows native services.  (ICU is default)\n"
  33     "-unix                      Run test using Unix strxfrm, strcoll services.\n"
  34     "-uselen                    Use API with string lengths.  Default is null-terminated strings\n"
  35     "-usekeys                   Run tests using sortkeys rather than strcoll\n"
  36     "-strcmp                    Run tests using u_strcmp rather than strcoll\n"
  37     "-strcmpCPO                 Run tests using u_strcmpCodePointOrder rather than strcoll\n"
  38     "-loop nnnn                 Loopcount for test.  Adjust for reasonable total running time.\n"
  39     "-iloop n                   Inner Loop Count.  Default = 1.  Number of calls to function\n"
  40     "                               under test at each call point.  For measuring test overhead.\n"
  41     "-terse                     Terse numbers-only output.  Intended for use by scripts.\n"
  42     "-french                    French accent ordering\n"
  43     "-frenchoff                 No French accent ordering (for use with French locales.)\n"
  44     "-norm                      Normalizing mode on\n"
  45     "-shifted                   Shifted mode\n"
  46     "-lower                     Lower case first\n"
  47     "-upper                     Upper case first\n"
  48     "-case                      Enable separate case level\n"
  49     "-level n                   Sort level, 1 to 5, for Primary, Secndary, Tertiary, Quaternary, Identical\n"
  50     "-keyhist                   Produce a table sort key size vs. string length\n"
  51     "-binsearch                 Binary Search timing test\n"
  52     "-keygen                    Sort Key Generation timing test\n"
  53     "-qsort                     Quicksort timing test\n"
  54     "-iter                      Iteration Performance Test\n"
  55     "-dump                      Display strings, sort keys and CEs.\n"
  56     ;
  57
  58
  59
  60 #include <stdio.h>
  61 #include <string.h>
  62 #include <stdlib.h>
  63 #include <math.h>
  64 #include <locale.h>
  65 #include <errno.h>
  66
  67 #include <unicode/utypes.h>
  68 #include <unicode/ucol.h>
  69 #include <unicode/ucoleitr.h>
  70 #include <unicode/uloc.h>
  71 #include <unicode/ustring.h>
  72 #include <unicode/ures.h>
  73 #include <unicode/uchar.h>
  74 #include <unicode/ucnv.h>
  75 #include <unicode/utf8.h>
  76
  77 #ifdef WIN32
  78 #include <windows.h>
  79 #else
  80 //
  81 //  Stubs for Windows API functions when building on UNIXes.
  82 //
  83 typedef int DWORD;
  84 inline int CompareStringW(DWORD, DWORD, UChar *, int, UChar *, int) {return 0;}
  85 #include <sys/time.h>
  86 unsigned long timeGetTime() {
  87     struct timeval t;
  88     gettimeofday(&t, 0);
  89     unsigned long val = t.tv_sec * 1000;  // Let it overflow.  Who cares.
  90     val += t.tv_usec / 1000;
  91     return val;
  92 }
  93 inline int LCMapStringW(DWORD, DWORD, UChar *, int, UChar *, int) {return 0;}
  94 const int LCMAP_SORTKEY = 0;
  95 #define MAKELCID(a,b) 0
  96 const int SORT_DEFAULT = 0;
  97 #endif
  98
  99
 100
 101 //
 102 //  Command line option variables
 103 //     These global variables are set according to the options specified
 104 //     on the command line by the user.
 105 char * opt_fName      = 0;
 106 const char * opt_locale     = "en_US";
 107 int    opt_langid     = 0;         // Defaults to value corresponding to opt_locale.
 108 char * opt_rules      = 0;
 109 UBool  opt_help       = FALSE;
 110 int    opt_loopCount  = 1;
 111 int    opt_iLoopCount = 1;
 112 UBool  opt_terse      = FALSE;
 113 UBool  opt_qsort      = FALSE;
 114 UBool  opt_binsearch  = FALSE;
 115 UBool  opt_icu        = TRUE;
 116 UBool  opt_win        = FALSE;      // Run with Windows native functions.
 117 UBool  opt_unix       = FALSE;      // Run with UNIX strcoll, strxfrm functions.
 118 UBool  opt_uselen     = FALSE;
 119 UBool  opt_usekeys    = FALSE;
 120 UBool  opt_strcmp     = FALSE;
 121 UBool  opt_strcmpCPO  = FALSE;
 122 UBool  opt_norm       = FALSE;
 123 UBool  opt_keygen     = FALSE;
 124 UBool  opt_french     = FALSE;
 125 UBool  opt_frenchoff  = FALSE;
 126 UBool  opt_shifted    = FALSE;
 127 UBool  opt_lower      = FALSE;
 128 UBool  opt_upper      = FALSE;
 129 UBool  opt_case       = FALSE;
 130 int    opt_level      = 0;
 131 UBool  opt_keyhist    = FALSE;
 132 UBool  opt_itertest   = FALSE;
 133 UBool  opt_dump       = FALSE;
 134
 135
 136
 137 //
 138 //   Definitions for the command line options
 139 //
 140 struct OptSpec {
 141     const char *name;
 142     enum {FLAG, NUM, STRING} type;
 143     void *pVar;
 144 };
 145
 146 OptSpec opts[] = {
 147     {"-file",        OptSpec::STRING, &opt_fName},
 148     {"-locale",      OptSpec::STRING, &opt_locale},
 149     {"-langid",      OptSpec::NUM,    &opt_langid},
 150     {"-rules",       OptSpec::STRING, &opt_rules},
 151     {"-qsort",       OptSpec::FLAG,   &opt_qsort},
 152     {"-binsearch",   OptSpec::FLAG,   &opt_binsearch},
 153     {"-iter",        OptSpec::FLAG,   &opt_itertest},
 154     {"-win",         OptSpec::FLAG,   &opt_win},
 155     {"-unix",        OptSpec::FLAG,   &opt_unix},
 156     {"-uselen",      OptSpec::FLAG,   &opt_uselen},
 157     {"-usekeys",     OptSpec::FLAG,   &opt_usekeys},
 158     {"-strcmp",      OptSpec::FLAG,   &opt_strcmp},
 159     {"-strcmpCPO",   OptSpec::FLAG,   &opt_strcmpCPO},
 160     {"-norm",        OptSpec::FLAG,   &opt_norm},
 161     {"-french",      OptSpec::FLAG,   &opt_french},
 162     {"-frenchoff",   OptSpec::FLAG,   &opt_frenchoff},
 163     {"-shifted",     OptSpec::FLAG,   &opt_shifted},
 164     {"-lower",       OptSpec::FLAG,   &opt_lower},
 165     {"-upper",       OptSpec::FLAG,   &opt_upper},
 166     {"-case",        OptSpec::FLAG,   &opt_case},
 167     {"-level",       OptSpec::NUM,    &opt_level},
 168     {"-keyhist",     OptSpec::FLAG,   &opt_keyhist},
 169     {"-keygen",      OptSpec::FLAG,   &opt_keygen},
 170     {"-loop",        OptSpec::NUM,    &opt_loopCount},
 171     {"-iloop",       OptSpec::NUM,    &opt_iLoopCount},
 172     {"-terse",       OptSpec::FLAG,   &opt_terse},
 173     {"-dump",        OptSpec::FLAG,   &opt_dump},
 174     {"-help",        OptSpec::FLAG,   &opt_help},
 175     {"-?",           OptSpec::FLAG,   &opt_help},
 176     {0, OptSpec::FLAG, 0}
 177 };
 178
 179
 180 //---------------------------------------------------------------------------
 181 //
 182 //  Global variables pointing to and describing the test file
 183 //
 184 //---------------------------------------------------------------------------
 185
 186 //
 187 //   struct Line
 188 //
 189 //      Each line from the source file (containing a name, presumably) gets
 190 //      one of these structs.
 191 //
 192 struct  Line {
 193     UChar     *name;
 194     int        len;
 195     char      *winSortKey;
 196     char      *icuSortKey;
 197     char      *unixSortKey;
 198     char      *unixName;
 199 };
 200
 201
 202
 203 Line          *gFileLines;           // Ptr to array of Line structs, one per line in the file.
 204 int            gNumFileLines;
 205 UCollator     *gCol;
 206 DWORD          gWinLCID;
 207
 208 Line          **gSortedLines;
 209 Line          **gRandomLines;
 210 int            gCount;
 211
 212
 213
 214 //---------------------------------------------------------------------------
 215 //
 216 //  ProcessOptions()    Function to read the command line options.
 217 //
 218 //---------------------------------------------------------------------------
 219 UBool ProcessOptions(int argc, const char **argv, OptSpec opts[])
 220 {
 221     int         i;
 222     int         argNum;
 223     const char  *pArgName;
 224     OptSpec    *pOpt;
 225
 226     for (argNum=1; argNum<argc; argNum++) {
 227         pArgName = argv[argNum];
 228         for (pOpt = opts;  pOpt->name != 0; pOpt++) {
 229             if (strcmp(pOpt->name, pArgName) == 0) {
 230                 switch (pOpt->type) {
 231                 case OptSpec::FLAG:
 232                     *(UBool *)(pOpt->pVar) = TRUE;
 233                     break;
 234                 case OptSpec::STRING:
 235                     argNum ++;
 236                     if (argNum >= argc) {
 237                         fprintf(stderr, "value expected for \"%s\" option.\n", pOpt->name);
 238                         return FALSE;
 239                     }
 240                     *(const char **)(pOpt->pVar)  = argv[argNum];
 241                     break;
 242                 case OptSpec::NUM:
 243                     argNum ++;
 244                     if (argNum >= argc) {
 245                         fprintf(stderr, "value expected for \"%s\" option.\n", pOpt->name);
 246                         return FALSE;
 247                     }
 248                     char *endp;
 249                     i = strtol(argv[argNum], &endp, 0);
 250                     if (endp == argv[argNum]) {
 251                         fprintf(stderr, "integer value expected for \"%s\" option.\n", pOpt->name);
 252                         return FALSE;
 253                     }
 254                     *(int *)(pOpt->pVar) = i;
 255                 }
 256                 break;
 257             }
 258         }
 259         if (pOpt->name == 0)
 260         {
 261             fprintf(stderr, "Unrecognized option \"%s\"\n", pArgName);
 262             return FALSE;
 263         }
 264     }
 265 return TRUE;
 266 }
 267
 268 //---------------------------------------------------------------------------------------
 269 //
 270 //   Comparison functions for use by qsort.
 271 //
 272 //       Six flavors, ICU or Windows, SortKey or String Compare, Strings with length
 273 //           or null terminated.
 274 //
 275 //---------------------------------------------------------------------------------------
 276 int ICUstrcmpK(const void *a, const void *b) {
 277     gCount++;
 278     int t = strcmp((*(Line **)a)->icuSortKey, (*(Line **)b)->icuSortKey);
 279     return t;
 280 }
 281
 282
 283 int ICUstrcmpL(const void *a, const void *b) {
 284     gCount++;
 285     UCollationResult t;
 286     t = ucol_strcoll(gCol, (*(Line **)a)->name, (*(Line **)a)->len, (*(Line **)b)->name, (*(Line **)b)->len);
 287     if (t == UCOL_LESS) return -1;
 288     if (t == UCOL_GREATER) return +1;
 289     return 0;
 290 }
 291
 292
 293 int ICUstrcmp(const void *a, const void *b) {
 294     gCount++;
 295     UCollationResult t;
 296     t = ucol_strcoll(gCol, (*(Line **)a)->name, -1, (*(Line **)b)->name, -1);
 297     if (t == UCOL_LESS) return -1;
 298     if (t == UCOL_GREATER) return +1;
 299     return 0;
 300 }
 301
 302
 303 int Winstrcmp(const void *a, const void *b) {
 304     gCount++;
 305     int t;
 306     t = CompareStringW(gWinLCID, 0, (*(Line **)a)->name, -1, (*(Line **)b)->name, -1);
 307     return t-2;
 308 }
 309
 310
 311 int UNIXstrcmp(const void *a, const void *b) {
 312     gCount++;
 313     int t;
 314     t = strcoll((*(Line **)a)->unixName, (*(Line **)b)->unixName);
 315     return t;
 316 }
 317
 318
 319 int WinstrcmpL(const void *a, const void *b) {
 320     gCount++;
 321     int t;
 322     t = CompareStringW(gWinLCID, 0, (*(Line **)a)->name, (*(Line **)a)->len, (*(Line **)b)->name, (*(Line **)b)->len);
 323     return t-2;
 324 }
 325
 326
 327 int WinstrcmpK(const void *a, const void *b) {
 328     gCount++;
 329     int t = strcmp((*(Line **)a)->winSortKey, (*(Line **)b)->winSortKey);
 330     return t;
 331 }
 332
 333
 334 //---------------------------------------------------------------------------------------
 335 //
 336 //   Function for sorting the names (lines) into a random order.
 337 //      Order is based on a hash of the  ICU Sort key for the lines
 338 //      The randomized order is used as input for the sorting timing tests.
 339 //
 340 //---------------------------------------------------------------------------------------
 341 int ICURandomCmp(const void *a, const void *b) {
 342     char  *ask = (*(Line **)a)->icuSortKey;
 343     char  *bsk = (*(Line **)b)->icuSortKey;
 344     int   aVal = 0;
 345     int   bVal = 0;
 346     int   retVal;
 347     while (*ask != 0) {
 348         aVal += aVal*37 + *ask++;
 349     }
 350     while (*bsk != 0) {
 351         bVal += bVal*37 + *bsk++;
 352     }
 353     retVal = -1;
 354     if (aVal == bVal) {
 355         retVal = 0;
 356     }
 357     else if (aVal > bVal) {
 358         retVal = 1;
 359     }
 360     return retVal;
 361 }
 362
 363 //---------------------------------------------------------------------------------------
 364 //
 365 //   doKeyGen()     Key Generation Timing Test
 366 //
 367 //---------------------------------------------------------------------------------------
 368 void doKeyGen()
 369 {
 370     int  line;
 371     int  loops = 0;
 372     int  iLoop;
 373     int  t;
 374     int  len=-1;
 375
 376     // Adjust loop count to compensate for file size.   Should be order n
 377     double dLoopCount = double(opt_loopCount) * (1000. /  double(gNumFileLines));
 378     int adj_loopCount = int(dLoopCount);
 379     if (adj_loopCount < 1) adj_loopCount = 1;
 380
 381
 382     unsigned long startTime = timeGetTime();
 383
 384     if (opt_win) {
 385         for (loops=0; loops<adj_loopCount; loops++) {
 386             for (line=0; line < gNumFileLines; line++) {
 387                 if (opt_uselen) {
 388                     len = gFileLines[line].len;
 389                 }
 390                 for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) {
 391                     t=LCMapStringW(gWinLCID, LCMAP_SORTKEY,
 392                         gFileLines[line].name, len,
 393                         (unsigned short *)gFileLines[line].winSortKey, 5000);    // TODO  something with length.
 394                 }
 395             }
 396         }
 397     }
 398     else if (opt_icu)
 399     {
 400         for (loops=0; loops<adj_loopCount; loops++) {
 401             for (line=0; line < gNumFileLines; line++) {
 402                 if (opt_uselen) {
 403                     len = gFileLines[line].len;
 404                 }
 405                 for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) {
 406                     t = ucol_getSortKey(gCol, gFileLines[line].name, len, (unsigned char *)gFileLines[line].icuSortKey, 5000);
 407                 }
 408             }
 409         }
 410     }
 411     else if (opt_unix)
 412     {
 413         for (loops=0; loops<adj_loopCount; loops++) {
 414             for (line=0; line < gNumFileLines; line++) {
 415                 for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) {
 416                 t = strxfrm(gFileLines[line].unixSortKey, gFileLines[line].unixName, 5000);
 417                 }
 418             }
 419         }
 420     }
 421
 422     unsigned long elapsedTime = timeGetTime() - startTime;
 423     int ns = (int)(float(1000000) * (float)elapsedTime / (float)(adj_loopCount*gNumFileLines));
 424
 425     if (opt_terse == FALSE) {
 426         printf("Sort Key Generation:  total # of keys = %d\n", loops*gNumFileLines);
 427         printf("Sort Key Generation:  time per key = %d ns\n", ns);
 428     }
 429     else {
 430         printf("%d,  ", ns);
 431     }
 432
 433     int   totalKeyLen = 0;
 434     int   totalChars  = 0;
 435     for (line=0; line<gNumFileLines; line++) {
 436         totalChars += u_strlen(gFileLines[line].name);
 437         if (opt_win) {
 438             totalKeyLen += strlen(gFileLines[line].winSortKey);
 439         }
 440         else if (opt_icu) {
 441             totalKeyLen += strlen(gFileLines[line].icuSortKey);
 442         }
 443         else if (opt_unix) {
 444             totalKeyLen += strlen(gFileLines[line].unixSortKey);
 445         }
 446
 447     }
 448     if (opt_terse == FALSE) {
 449         printf("Key Length / character = %f\n", (float)totalKeyLen / (float)totalChars);
 450     } else {
 451         printf("%f, ", (float)totalKeyLen / (float)totalChars);
 452     }
 453 }
 454
 455
 456
 457 //---------------------------------------------------------------------------------------
 458 //
 459 //    doBinarySearch()    Binary Search timing test.  Each name from the list
 460 //                        is looked up in the full sorted list of names.
 461 //
 462 //---------------------------------------------------------------------------------------
 463 void doBinarySearch()
 464 {
 465
 466     gCount = 0;
 467     int  line;
 468     int  loops = 0;
 469     int  iLoop = 0;
 470     unsigned long elapsedTime = 0;
 471
 472     // Adjust loop count to compensate for file size.   Should be order n (lookups) * log n  (compares/lookup)
 473     // Accurate timings do not depend on this being perfect.  The correction is just to try to
 474     //   get total running times of about the right order, so the that user doesn't need to
 475     //   manually adjust the loop count for every different file size.
 476     double dLoopCount = double(opt_loopCount) * 3000. / (log10((double)gNumFileLines) * double(gNumFileLines));
 477     if (opt_usekeys) dLoopCount *= 5;
 478     int adj_loopCount = int(dLoopCount);
 479     if (adj_loopCount < 1) adj_loopCount = 1;
 480
 481
 482     for (;;) {  // not really a loop, just allows "break" to work, to simplify
 483                 //   inadvertantly running more than one test through here.
 484         if (opt_strcmp || opt_strcmpCPO)
 485         {
 486             unsigned long startTime = timeGetTime();
 487             typedef int32_t (U_EXPORT2 *PF)(const UChar *, const UChar *);
 488             PF pf = u_strcmp;
 489             if (opt_strcmpCPO) {pf = u_strcmpCodePointOrder;}
 490             //if (opt_strcmp && opt_win) {pf = (PF)wcscmp;}   // Damn the difference between int32_t and int
 491                                                             //   which forces the use of a cast here.
 492
 493             int r = 0;
 494             for (loops=0; loops<adj_loopCount; loops++) {
 495
 496                 for (line=0; line < gNumFileLines; line++) {
 497                     int hi      = gNumFileLines-1;
 498                     int lo      = 0;
 499                     int  guess = -1;
 500                     for (;;) {
 501                         int newGuess = (hi + lo) / 2;
 502                         if (newGuess == guess)
 503                             break;
 504                         guess = newGuess;
 505                         for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) {
 506                             r = (*pf)((gSortedLines[line])->name, (gSortedLines[guess])->name);
 507                         }
 508                         gCount++;
 509                         if (r== 0)
 510                             break;
 511                         if (r < 0)
 512                             hi = guess;
 513                         else
 514                             lo   = guess;
 515                     }
 516                 }
 517             }
 518             elapsedTime = timeGetTime() - startTime;
 519             break;
 520         }
 521
 522
 523         if (opt_icu)
 524         {
 525             unsigned long startTime = timeGetTime();
 526             UCollationResult  r = UCOL_EQUAL;
 527             for (loops=0; loops<adj_loopCount; loops++) {
 528
 529                 for (line=0; line < gNumFileLines; line++) {
 530                     int lineLen  = -1;
 531                     int guessLen = -1;
 532                     if (opt_uselen) {
 533                         lineLen = (gSortedLines[line])->len;
 534                     }
 535                     int hi      = gNumFileLines-1;
 536                     int lo      = 0;
 537                     int  guess = -1;
 538                     for (;;) {
 539                         int newGuess = (hi + lo) / 2;
 540                         if (newGuess == guess)
 541                             break;
 542                         guess = newGuess;
 543                         int ri = 0;
 544                         if (opt_usekeys) {
 545                             for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) {
 546                                 ri = strcmp((gSortedLines[line])->icuSortKey, (gSortedLines[guess])->icuSortKey);
 547                             }
 548                             gCount++;
 549                             r=UCOL_GREATER; if(ri<0) {r=UCOL_LESS;} else if (ri==0) {r=UCOL_EQUAL;}
 550                         }
 551                         else
 552                         {
 553                             if (opt_uselen) {
 554                                 guessLen = (gSortedLines[guess])->len;
 555                             }
 556                             for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) {
 557                                 r = ucol_strcoll(gCol, (gSortedLines[line])->name, lineLen, (gSortedLines[guess])->name, guessLen);
 558                             }
 559                             gCount++;
 560                         }
 561                         if (r== UCOL_EQUAL)
 562                             break;
 563                         if (r == UCOL_LESS)
 564                             hi = guess;
 565                         else
 566                             lo   = guess;
 567                     }
 568                 }
 569             }
 570             elapsedTime = timeGetTime() - startTime;
 571             break;
 572         }
 573
 574         if (opt_win)
 575         {
 576             unsigned long startTime = timeGetTime();
 577             int r = 0;
 578             for (loops=0; loops<adj_loopCount; loops++) {
 579
 580                 for (line=0; line < gNumFileLines; line++) {
 581                     int lineLen  = -1;
 582                     int guessLen = -1;
 583                     if (opt_uselen) {
 584                         lineLen = (gSortedLines[line])->len;
 585                     }
 586                     int hi   = gNumFileLines-1;
 587                     int lo   = 0;
 588                     int  guess = -1;
 589                     for (;;) {
 590                         int newGuess = (hi + lo) / 2;
 591                         if (newGuess == guess)
 592                             break;
 593                         guess = newGuess;
 594                         if (opt_usekeys) {
 595                             for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) {
 596                                 r = strcmp((gSortedLines[line])->winSortKey, (gSortedLines[guess])->winSortKey);
 597                             }
 598                             gCount++;
 599                             r+=2;
 600                         }
 601                         else
 602                         {
 603                             if (opt_uselen) {
 604                                 guessLen = (gSortedLines[guess])->len;
 605                             }
 606                             for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) {
 607                                 r = CompareStringW(gWinLCID, 0, (gSortedLines[line])->name, lineLen, (gSortedLines[guess])->name, guessLen);
 608                             }
 609                             if (r == 0) {
 610                                 if (opt_terse == FALSE) {
 611                                     fprintf(stderr, "Error returned from Windows CompareStringW.\n");
 612                                 }
 613                                 exit(-1);
 614                             }
 615                             gCount++;
 616                         }
 617                         if (r== 2)   //  strings ==
 618                             break;
 619                         if (r == 1)  //  line < guess
 620                             hi = guess;
 621                         else         //  line > guess
 622                             lo   = guess;
 623                     }
 624                 }
 625             }
 626             elapsedTime = timeGetTime() - startTime;
 627             break;
 628         }
 629
 630         if (opt_unix)
 631         {
 632             unsigned long startTime = timeGetTime();
 633             int r = 0;
 634             for (loops=0; loops<adj_loopCount; loops++) {
 635
 636                 for (line=0; line < gNumFileLines; line++) {
 637                     int hi   = gNumFileLines-1;
 638                     int lo   = 0;
 639                     int  guess = -1;
 640                     for (;;) {
 641                         int newGuess = (hi + lo) / 2;
 642                         if (newGuess == guess)
 643                             break;
 644                         guess = newGuess;
 645                         if (opt_usekeys) {
 646                             for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) {
 647                                  r = strcmp((gSortedLines[line])->unixSortKey, (gSortedLines[guess])->unixSortKey);
 648                             }
 649                             gCount++;
 650                         }
 651                         else
 652                         {
 653                             for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) {
 654                                 r = strcoll((gSortedLines[line])->unixName, (gSortedLines[guess])->unixName);
 655                             }
 656                             errno = 0;
 657                             if (errno != 0) {
 658                                 fprintf(stderr, "Error %d returned from strcoll.\n", errno);
 659                                 exit(-1);
 660                             }
 661                             gCount++;
 662                         }
 663                         if (r == 0)   //  strings ==
 664                             break;
 665                         if (r < 0)  //  line < guess
 666                             hi = guess;
 667                         else         //  line > guess
 668                             lo   = guess;
 669                     }
 670                 }
 671             }
 672             elapsedTime = timeGetTime() - startTime;
 673             break;
 674         }
 675         break;
 676     }
 677
 678     int ns = (int)(float(1000000) * (float)elapsedTime / (float)gCount);
 679     if (opt_terse == FALSE) {
 680         printf("binary search:  total # of string compares = %d\n", gCount);
 681         printf("binary search:  compares per loop = %d\n", gCount / loops);
 682         printf("binary search:  time per compare = %d ns\n", ns);
 683     } else {
 684         printf("%d, ", ns);
 685     }
 686
 687 }
 688
 689
 690
 691
 692 //---------------------------------------------------------------------------------------
 693 //
 694 //   doQSort()    The quick sort timing test.  Uses the C library qsort function.
 695 //
 696 //---------------------------------------------------------------------------------------
 697 void doQSort() {
 698     int i;
 699     Line **sortBuf = new Line *[gNumFileLines];
 700
 701     // Adjust loop count to compensate for file size.   QSort should be n log(n)
 702     double dLoopCount = double(opt_loopCount) * 3000. / (log10((double)gNumFileLines) * double(gNumFileLines));
 703     if (opt_usekeys) dLoopCount *= 5;
 704     int adj_loopCount = int(dLoopCount);
 705     if (adj_loopCount < 1) adj_loopCount = 1;
 706
 707
 708     gCount = 0;
 709     unsigned long startTime = timeGetTime();
 710     if (opt_win && opt_usekeys) {
 711         for (i=0; i<opt_loopCount; i++) {
 712             memcpy(sortBuf, gRandomLines, gNumFileLines * sizeof(Line *));
 713             qsort(sortBuf, gNumFileLines, sizeof(Line *), WinstrcmpK);
 714         }
 715     }
 716
 717     else if (opt_win && opt_uselen) {
 718         for (i=0; i<adj_loopCount; i++) {
 719             memcpy(sortBuf, gRandomLines, gNumFileLines * sizeof(Line *));
 720             qsort(sortBuf, gNumFileLines, sizeof(Line *), WinstrcmpL);
 721         }
 722     }
 723
 724
 725     else if (opt_win && !opt_uselen) {
 726         for (i=0; i<adj_loopCount; i++) {
 727             memcpy(sortBuf, gRandomLines, gNumFileLines * sizeof(Line *));
 728             qsort(sortBuf, gNumFileLines, sizeof(Line *), Winstrcmp);
 729         }
 730     }
 731
 732     else if (opt_icu && opt_usekeys) {
 733         for (i=0; i<adj_loopCount; i++) {
 734             memcpy(sortBuf, gRandomLines, gNumFileLines * sizeof(Line *));
 735             qsort(sortBuf, gNumFileLines, sizeof(Line *), ICUstrcmpK);
 736         }
 737     }
 738
 739     else if (opt_icu && opt_uselen) {
 740         for (i=0; i<adj_loopCount; i++) {
 741             memcpy(sortBuf, gRandomLines, gNumFileLines * sizeof(Line *));
 742             qsort(sortBuf, gNumFileLines, sizeof(Line *), ICUstrcmpL);
 743         }
 744     }
 745
 746
 747     else if (opt_icu && !opt_uselen) {
 748         for (i=0; i<adj_loopCount; i++) {
 749             memcpy(sortBuf, gRandomLines, gNumFileLines * sizeof(Line *));
 750             qsort(sortBuf, gNumFileLines, sizeof(Line *), ICUstrcmp);
 751         }
 752     }
 753
 754     else if (opt_unix && !opt_usekeys) {
 755         for (i=0; i<adj_loopCount; i++) {
 756             memcpy(sortBuf, gRandomLines, gNumFileLines * sizeof(Line *));
 757             qsort(sortBuf, gNumFileLines, sizeof(Line *), UNIXstrcmp);
 758         }
 759     }
 760
 761     unsigned long elapsedTime = timeGetTime() - startTime;
 762     int ns = (int)(float(1000000) * (float)elapsedTime / (float)gCount);
 763     if (opt_terse == FALSE) {
 764         printf("qsort:  total # of string compares = %d\n", gCount);
 765         printf("qsort:  time per compare = %d ns\n", ns);
 766     } else {
 767         printf("%d, ", ns);
 768     }
 769 }
 770
 771
 772
 773 //---------------------------------------------------------------------------------------
 774 //
 775 //    doKeyHist()       Output a table of data for
 776 //                        average sort key size vs. string length.
 777 //
 778 //---------------------------------------------------------------------------------------
 779 void doKeyHist() {
 780     int     i;
 781     int     maxLen = 0;
 782
 783     // Find the maximum string length
 784     for (i=0; i<gNumFileLines; i++) {
 785         if (gFileLines[i].len > maxLen) maxLen = gFileLines[i].len;
 786     }
 787
 788     // Allocate arrays to hold the histogram data
 789     int *accumulatedLen  = new int[maxLen+1];
 790     int *numKeysOfSize   = new int[maxLen+1];
 791     for (i=0; i<=maxLen; i++) {
 792         accumulatedLen[i] = 0;
 793         numKeysOfSize[i] = 0;
 794     }
 795
 796     // Fill the arrays...
 797     for (i=0; i<gNumFileLines; i++) {
 798         int len = gFileLines[i].len;
 799         accumulatedLen[len] += strlen(gFileLines[i].icuSortKey);
 800         numKeysOfSize[len] += 1;
 801     }
 802
 803     // And write out averages
 804     printf("String Length,  Avg Key Length,  Avg Key Len per char\n");
 805     for (i=1; i<=maxLen; i++) {
 806         if (numKeysOfSize[i] > 0) {
 807             printf("%d, %f, %f\n", i, (float)accumulatedLen[i] / (float)numKeysOfSize[i],
 808                 (float)accumulatedLen[i] / (float)(numKeysOfSize[i] * i));
 809         }
 810     }
 811     delete []accumulatedLen;
 812     delete []numKeysOfSize ;
 813 }
 814
 815 //---------------------------------------------------------------------------------------
 816 //
 817 //    doForwardIterTest(UBool)       Forward iteration test
 818 //                                   argument null-terminated string used
 819 //
 820 //---------------------------------------------------------------------------------------
 821 void doForwardIterTest(UBool haslen) {
 822     int count = 0;
 823
 824     UErrorCode error = U_ZERO_ERROR;
 825     printf("\n\nPerforming forward iteration performance test with ");
 826
 827     if (haslen) {
 828         printf("non-null terminated data -----------\n");
 829     }
 830     else {
 831         printf("null terminated data -----------\n");
 832     }
 833     printf("performance test on strings from file -----------\n");
 834
 835     UChar dummytext[] = {0, 0};
 836     UCollationElements *iter = ucol_openElements(gCol, NULL, 0, &error);
 837     ucol_setText(iter, dummytext, 1, &error);
 838
 839     gCount = 0;
 840     unsigned long startTime = timeGetTime();
 841     while (count < opt_loopCount) {
 842         int linecount = 0;
 843         while (linecount < gNumFileLines) {
 844             UChar *str = gFileLines[linecount].name;
 845             int strlen = haslen?gFileLines[linecount].len:-1;
 846             ucol_setText(iter, str, strlen, &error);
 847             while (ucol_next(iter, &error) != UCOL_NULLORDER) {
 848                 gCount++;
 849             }
 850
 851             linecount ++;
 852         }
 853         count ++;
 854     }
 855     unsigned long elapsedTime = timeGetTime() - startTime;
 856     printf("elapsedTime %ld\n", elapsedTime);
 857
 858     // empty loop recalculation
 859     count = 0;
 860     startTime = timeGetTime();
 861     while (count < opt_loopCount) {
 862         int linecount = 0;
 863         while (linecount < gNumFileLines) {
 864             UChar *str = gFileLines[linecount].name;
 865             int strlen = haslen?gFileLines[linecount].len:-1;
 866             ucol_setText(iter, str, strlen, &error);
 867             linecount ++;
 868         }
 869         count ++;
 870     }
 871     elapsedTime -= (timeGetTime() - startTime);
 872     printf("elapsedTime %ld\n", elapsedTime);
 873
 874     ucol_closeElements(iter);
 875
 876     int ns = (int)(float(1000000) * (float)elapsedTime / (float)gCount);
 877     printf("Total number of strings compared %d in %d loops\n", gNumFileLines,
 878                                                                 opt_loopCount);
 879     printf("Average time per ucol_next() nano seconds %d\n", ns);
 880
 881     printf("performance test on skipped-5 concatenated strings from file -----------\n");
 882
 883     UChar *str;
 884     int    strlen = 0;
 885     // appending all the strings
 886     int linecount = 0;
 887     while (linecount < gNumFileLines) {
 888         strlen += haslen?gFileLines[linecount].len:
 889                                       u_strlen(gFileLines[linecount].name);
 890         linecount ++;
 891     }
 892     str = (UChar *)malloc(sizeof(UChar) * strlen);
 893     int strindex = 0;
 894     linecount = 0;
 895     while (strindex < strlen) {
 896         int len = 0;
 897         len += haslen?gFileLines[linecount].len:
 898                                       u_strlen(gFileLines[linecount].name);
 899         memcpy(str + strindex, gFileLines[linecount].name,
 900                sizeof(UChar) * len);
 901         strindex += len;
 902         linecount ++;
 903     }
 904
 905     printf("Total size of strings %d\n", strlen);
 906
 907     gCount = 0;
 908     count  = 0;
 909
 910     if (!haslen) {
 911         strlen = -1;
 912     }
 913     iter = ucol_openElements(gCol, str, strlen, &error);
 914     if (!haslen) {
 915         strlen = u_strlen(str);
 916     }
 917     strlen -= 5; // any left over characters are not iterated,
 918                  // this is to ensure the backwards and forwards iterators
 919                  // gets the same position
 920     startTime = timeGetTime();
 921     while (count < opt_loopCount) {
 922         int count5 = 5;
 923         strindex = 0;
 924         ucol_setOffset(iter, strindex, &error);
 925         while (TRUE) {
 926             if (ucol_next(iter, &error) == UCOL_NULLORDER) {
 927                 break;
 928             }
 929             gCount++;
 930             count5 --;
 931             if (count5 == 0) {
 932                 strindex += 10;
 933                 if (strindex > strlen) {
 934                     break;
 935                 }
 936                 ucol_setOffset(iter, strindex, &error);
 937                 count5 = 5;
 938             }
 939         }
 940         count ++;
 941     }
 942
 943     elapsedTime = timeGetTime() - startTime;
 944     printf("elapsedTime %ld\n", elapsedTime);
 945
 946     // empty loop recalculation
 947     int tempgCount = 0;
 948     count = 0;
 949     startTime = timeGetTime();
 950     while (count < opt_loopCount) {
 951         int count5 = 5;
 952         strindex = 0;
 953         ucol_setOffset(iter, strindex, &error);
 954         while (TRUE) {
 955             tempgCount ++;
 956             count5 --;
 957             if (count5 == 0) {
 958                 strindex += 10;
 959                 if (strindex > strlen) {
 960                     break;
 961                 }
 962                 ucol_setOffset(iter, strindex, &error);
 963                 count5 = 5;
 964             }
 965         }
 966         count ++;
 967     }
 968     elapsedTime -= (timeGetTime() - startTime);
 969     printf("elapsedTime %ld\n", elapsedTime);
 970
 971     ucol_closeElements(iter);
 972
 973     printf("gCount %d\n", gCount);
 974     ns = (int)(float(1000000) * (float)elapsedTime / (float)gCount);
 975     printf("Average time per ucol_next() nano seconds %d\n", ns);
 976 }
 977
 978 //---------------------------------------------------------------------------------------
 979 //
 980 //    doBackwardIterTest(UBool)      Backwards iteration test
 981 //                                   argument null-terminated string used
 982 //
 983 //---------------------------------------------------------------------------------------
 984 void doBackwardIterTest(UBool haslen) {
 985     int count = 0;
 986     UErrorCode error = U_ZERO_ERROR;
 987     printf("\n\nPerforming backward iteration performance test with ");
 988
 989     if (haslen) {
 990         printf("non-null terminated data -----------\n");
 991     }
 992     else {
 993         printf("null terminated data -----------\n");
 994     }
 995
 996     printf("performance test on strings from file -----------\n");
 997
 998     UCollationElements *iter = ucol_openElements(gCol, NULL, 0, &error);
 999     UChar dummytext[] = {0, 0};
1000     ucol_setText(iter, dummytext, 1, &error);
1001
1002     gCount = 0;
1003     unsigned long startTime = timeGetTime();
1004     while (count < opt_loopCount) {
1005         int linecount = 0;
1006         while (linecount < gNumFileLines) {
1007             UChar *str = gFileLines[linecount].name;
1008             int strlen = haslen?gFileLines[linecount].len:-1;
1009             ucol_setText(iter, str, strlen, &error);
1010             while (ucol_previous(iter, &error) != UCOL_NULLORDER) {
1011                 gCount ++;
1012             }
1013
1014             linecount ++;
1015         }
1016         count ++;
1017     }
1018     unsigned long elapsedTime = timeGetTime() - startTime;
1019
1020     printf("elapsedTime %ld\n", elapsedTime);
1021
1022     // empty loop recalculation
1023     count = 0;
1024     startTime = timeGetTime();
1025     while (count < opt_loopCount) {
1026         int linecount = 0;
1027         while (linecount < gNumFileLines) {
1028             UChar *str = gFileLines[linecount].name;
1029             int strlen = haslen?gFileLines[linecount].len:-1;
1030             ucol_setText(iter, str, strlen, &error);
1031             linecount ++;
1032         }
1033         count ++;
1034     }
1035     elapsedTime -= (timeGetTime() - startTime);
1036
1037     printf("elapsedTime %ld\n", elapsedTime);
1038     ucol_closeElements(iter);
1039
1040     int ns = (int)(float(1000000) * (float)elapsedTime / (float)gCount);
1041     printf("Total number of strings compared %d in %d loops\n", gNumFileLines,
1042                                                                 opt_loopCount);
1043     printf("Average time per ucol_previous() nano seconds %d\n", ns);
1044
1045     printf("performance test on skipped-5 concatenated strings from file -----------\n");
1046
1047     UChar *str;
1048     int    strlen = 0;
1049     // appending all the strings
1050     int linecount = 0;
1051     while (linecount < gNumFileLines) {
1052         strlen += haslen?gFileLines[linecount].len:
1053                                       u_strlen(gFileLines[linecount].name);
1054         linecount ++;
1055     }
1056     str = (UChar *)malloc(sizeof(UChar) * strlen);
1057     int strindex = 0;
1058     linecount = 0;
1059     while (strindex < strlen) {
1060         int len = 0;
1061         len += haslen?gFileLines[linecount].len:
1062                                       u_strlen(gFileLines[linecount].name);
1063         memcpy(str + strindex, gFileLines[linecount].name,
1064                sizeof(UChar) * len);
1065         strindex += len;
1066         linecount ++;
1067     }
1068
1069     printf("Total size of strings %d\n", strlen);
1070
1071     gCount = 0;
1072     count  = 0;
1073
1074     if (!haslen) {
1075         strlen = -1;
1076     }
1077
1078     iter = ucol_openElements(gCol, str, strlen, &error);
1079     if (!haslen) {
1080         strlen = u_strlen(str);
1081     }
1082
1083     startTime = timeGetTime();
1084     while (count < opt_loopCount) {
1085         int count5 = 5;
1086         strindex = 5;
1087         ucol_setOffset(iter, strindex, &error);
1088         while (TRUE) {
1089             if (ucol_previous(iter, &error) == UCOL_NULLORDER) {
1090                 break;
1091             }
1092              gCount ++;
1093              count5 --;
1094              if (count5 == 0) {
1095                  strindex += 10;
1096                  if (strindex > strlen) {
1097                     break;
1098                  }
1099                  ucol_setOffset(iter, strindex, &error);
1100                  count5 = 5;
1101              }
1102         }
1103         count ++;
1104     }
1105
1106     elapsedTime = timeGetTime() - startTime;
1107     printf("elapsedTime %ld\n", elapsedTime);
1108
1109     // empty loop recalculation
1110     count = 0;
1111     int tempgCount = 0;
1112     startTime = timeGetTime();
1113     while (count < opt_loopCount) {
1114         int count5 = 5;
1115         strindex = 5;
1116         ucol_setOffset(iter, strindex, &error);
1117         while (TRUE) {
1118              tempgCount ++;
1119              count5 --;
1120              if (count5 == 0) {
1121                  strindex += 10;
1122                  if (strindex > strlen) {
1123                     break;
1124                  }
1125                  ucol_setOffset(iter, strindex, &error);
1126                  count5 = 5;
1127              }
1128         }
1129         count ++;
1130     }
1131     elapsedTime -= (timeGetTime() - startTime);
1132     printf("elapsedTime %ld\n", elapsedTime);
1133     ucol_closeElements(iter);
1134
1135     printf("gCount %d\n", gCount);
1136     ns = (int)(float(1000000) * (float)elapsedTime / (float)gCount);
1137     printf("Average time per ucol_previous() nano seconds %d\n", ns);
1138 }
1139
1140 //---------------------------------------------------------------------------------------
1141 //
1142 //    doIterTest()       Iteration test
1143 //
1144 //---------------------------------------------------------------------------------------
1145 void doIterTest() {
1146     doForwardIterTest(opt_uselen);
1147     doBackwardIterTest(opt_uselen);
1148 }
1149
1150
1151 //----------------------------------------------------------------------------------------
1152 //
1153 //   UnixConvert   -- Convert the lines of the file to the encoding for UNIX
1154 //                    Since it appears that Unicode support is going in the general
1155 //                    direction of the use of UTF-8 locales, that is the approach
1156 //                    that is used here.
1157 //
1158 //----------------------------------------------------------------------------------------
1159 void  UnixConvert() {
1160     int    line;
1161
1162     UConverter   *cvrtr;    // An ICU code page converter.
1163     UErrorCode    status = U_ZERO_ERROR;
1164
1165
1166     cvrtr = ucnv_open("utf-8", &status);    // we are just doing UTF-8 locales for now.
1167     if (U_FAILURE(status)) {
1168         fprintf(stderr, "ICU Converter open failed.: %s\n", u_errorName(status));
1169         exit(-1);
1170     }
1171
1172     for (line=0; line < gNumFileLines; line++) {
1173         int sizeNeeded = ucnv_fromUChars(cvrtr,
1174                                          0,            // ptr to target buffer.
1175                                          0,            // length of target buffer.
1176                                          gFileLines[line].name,
1177                                          -1,           //  source is null terminated
1178                                          &status);
1179         if (status != U_BUFFER_OVERFLOW_ERROR && status != U_ZERO_ERROR) {
1180             //fprintf(stderr, "Conversion from Unicode, something is wrong.\n");
1181             //exit(-1);
1182         }
1183         status = U_ZERO_ERROR;
1184         gFileLines[line].unixName = new char[sizeNeeded+1];
1185         sizeNeeded = ucnv_fromUChars(cvrtr,
1186                                          gFileLines[line].unixName, // ptr to target buffer.
1187                                          sizeNeeded+1, // length of target buffer.
1188                                          gFileLines[line].name,
1189                                          -1,           //  source is null terminated
1190                                          &status);
1191         if (U_FAILURE(status)) {
1192             fprintf(stderr, "ICU Conversion Failed.: %d\n", status);
1193             exit(-1);
1194         }
1195         gFileLines[line].unixName[sizeNeeded] = 0;
1196     };
1197     ucnv_close(cvrtr);
1198 }
1199
1200
1201 //----------------------------------------------------------------------------------------
1202 //
1203 //  class UCharFile   Class to hide all the gorp to read a file in
1204 //                    and produce a stream of UChars.
1205 //
1206 //----------------------------------------------------------------------------------------
1207 class UCharFile {
1208 public:
1209     UCharFile(const char *fileName);
1210     ~UCharFile();
1211     UChar   get();
1212     UBool   eof() {return fEof;};
1213     UBool   error() {return fError;};
1214
1215 private:
1216     UCharFile (const UCharFile & /*other*/) {};                         // No copy constructor.
1217     UCharFile & operator = (const UCharFile &/*other*/) {return *this;};   // No assignment op
1218
1219     FILE         *fFile;
1220     const char   *fName;
1221     UBool        fEof;
1222     UBool        fError;
1223     UChar        fPending2ndSurrogate;
1224
1225     enum {UTF16LE, UTF16BE, UTF8} fEncoding;
1226 };
1227
1228 UCharFile::UCharFile(const char * fileName) {
1229     fEof                 = FALSE;
1230     fError               = FALSE;
1231     fName                = fileName;
1232     fFile                = fopen(fName, "rb");
1233     fPending2ndSurrogate = 0;
1234     if (fFile == NULL) {
1235         fprintf(stderr, "Can not open file \"%s\"\n", opt_fName);
1236         fError = TRUE;
1237         return;
1238     }
1239     //
1240     //  Look for the byte order mark at the start of the file.
1241     //
1242     int BOMC1, BOMC2, BOMC3;
1243     BOMC1 = fgetc(fFile);
1244     BOMC2 = fgetc(fFile);
1245
1246     if (BOMC1 == 0xff && BOMC2 == 0xfe) {
1247         fEncoding = UTF16LE; }
1248     else if (BOMC1 == 0xfe && BOMC2 == 0xff) {
1249         fEncoding = UTF16BE; }
1250     else if (BOMC1 == 0xEF && BOMC2 == 0xBB && (BOMC3 = fgetc(fFile)) == 0xBF ) {
1251         fEncoding = UTF8; }
1252     else
1253     {
1254         fprintf(stderr, "collperf:  file \"%s\" encoding must be UTF-8 or UTF-16, and "
1255             "must include a BOM.\n", fileName);
1256         fError = true;
1257         return;
1258     }
1259 }
1260
1261
1262 UCharFile::~UCharFile() {
1263     fclose(fFile);
1264 }
1265
1266
1267
1268 UChar UCharFile::get() {
1269     UChar   c;
1270     switch (fEncoding) {
1271     case UTF16LE:
1272         {
1273             int  cL, cH;
1274             cL = fgetc(fFile);
1275             cH = fgetc(fFile);
1276             c  = cL  | (cH << 8);
1277             if (cH == EOF) {
1278                 c   = 0;
1279                 fEof = TRUE;
1280             }
1281             break;
1282         }
1283     case UTF16BE:
1284         {
1285             int  cL, cH;
1286             cH = fgetc(fFile);
1287             cL = fgetc(fFile);
1288             c  = cL  | (cH << 8);
1289             if (cL == EOF) {
1290                 c   = 0;
1291                 fEof = TRUE;
1292             }
1293             break;
1294         }
1295     case UTF8:
1296         {
1297             if (fPending2ndSurrogate != 0) {
1298                 c = fPending2ndSurrogate;
1299                 fPending2ndSurrogate = 0;
1300                 break;
1301             }
1302
1303             int ch = fgetc(fFile);   // Note:  c and ch are separate cause eof test doesn't work on UChar type.
1304             if (ch == EOF) {
1305                 c = 0;
1306                 fEof = TRUE;
1307                 break;
1308             }
1309
1310             if (ch <= 0x7f) {
1311                 // It's ascii.  No further utf-8 conversion.
1312                 c = ch;
1313                 break;
1314             }
1315
1316             // Figure out the lenght of the char and read the rest of the bytes
1317             //   into a temp array.
1318             int nBytes;
1319             if (ch >= 0xF0) {nBytes=4;}
1320             else if (ch >= 0xE0) {nBytes=3;}
1321             else if (ch >= 0xC0) {nBytes=2;}
1322             else {
1323                 fprintf(stderr, "utf-8 encoded file contains corrupt data.\n");
1324                 fError = TRUE;
1325                 return 0;
1326             }
1327
1328             unsigned char  bytes[10];
1329             bytes[0] = (unsigned char)ch;
1330             int i;
1331             for (i=1; i<nBytes; i++) {
1332                 bytes[i] = fgetc(fFile);
1333                 if (bytes[i] < 0x80 || bytes[i] >= 0xc0) {
1334                     fprintf(stderr, "utf-8 encoded file contains corrupt data.\n");
1335                     fError = TRUE;
1336                     return 0;
1337                 }
1338             }
1339
1340             // Convert the bytes from the temp array to a Unicode char.
1341             i = 0;
1342             uint32_t  cp;
1343             UTF8_NEXT_CHAR_UNSAFE(bytes, i, cp);
1344             c = (UChar)cp;
1345
1346             if (cp >= 0x10000) {
1347                 // The code point needs to be broken up into a utf-16 surrogate pair.
1348                 //  Process first half this time through the main loop, and
1349                 //   remember the other half for the next time through.
1350                 UChar utf16Buf[3];
1351                 i = 0;
1352                 UTF16_APPEND_CHAR_UNSAFE(utf16Buf, i, cp);
1353                 fPending2ndSurrogate = utf16Buf[1];
1354                 c = utf16Buf[0];
1355             }
1356             break;
1357         };
1358     default:
1359         c = 0xFFFD; /* Error, unspecified codepage*/
1360         fprintf(stderr, "UCharFile: Error: unknown fEncoding\n");
1361         exit(1);
1362     }
1363     return c;
1364 }
1365
1366 //----------------------------------------------------------------------------------------
1367 //
1368 //   openRulesCollator  - Command line specified a rules file.  Read it in
1369 //                        and open a collator with it.
1370 //
1371 //----------------------------------------------------------------------------------------
1372 UCollator *openRulesCollator() {
1373     UCharFile f(opt_rules);
1374     if (f.error()) {
1375         return 0;
1376     }
1377
1378     int  bufLen = 10000;
1379     UChar *buf = (UChar *)malloc(bufLen * sizeof(UChar));
1380     UChar *tmp;
1381     int i = 0;
1382
1383     for(;;) {
1384         buf[i] = f.get();
1385         if (f.eof()) {
1386             break;
1387         }
1388         if (f.error()) {
1389             return 0;
1390         }
1391         i++;
1392         if (i >= bufLen) {
1393             tmp = buf;
1394             bufLen += 10000;
1395             buf = (UChar *)realloc(buf, bufLen);
1396             if (buf == NULL) {
1397                 free(tmp);
1398                 return 0;
1399             }
1400         }
1401     }
1402     buf[i] = 0;
1403
1404     UErrorCode    status = U_ZERO_ERROR;
1405     UCollator *coll = ucol_openRules(buf, u_strlen(buf), UCOL_OFF,
1406                                          UCOL_DEFAULT_STRENGTH, NULL, &status);
1407     if (U_FAILURE(status)) {
1408         fprintf(stderr, "ICU ucol_openRules() open failed.: %d\n", status);
1409         return 0;
1410     }
1411     free(buf);
1412     return coll;
1413 }
1414
1415
1416
1417
1418
1419 //----------------------------------------------------------------------------------------
1420 //
1421 //    Main   --  process command line, read in and pre-process the test file,
1422 //                 call other functions to do the actual tests.
1423 //
1424 //----------------------------------------------------------------------------------------
1425 int main(int argc, const char** argv) {
1426     if (ProcessOptions(argc, argv, opts) != TRUE || opt_help || opt_fName == 0) {
1427         printf(gUsageString);
1428         exit (1);
1429     }
1430
1431     // Make sure that we've only got one API selected.
1432     if (opt_unix || opt_win) opt_icu = FALSE;
1433     if (opt_unix) opt_win = FALSE;
1434
1435     //
1436     //  Set up an ICU collator
1437     //
1438     UErrorCode          status = U_ZERO_ERROR;
1439
1440     if (opt_rules != 0) {
1441         gCol = openRulesCollator();
1442         if (gCol == 0) {return -1;}
1443     }
1444     else {
1445         gCol = ucol_open(opt_locale, &status);
1446         if (U_FAILURE(status)) {
1447             fprintf(stderr, "Collator creation failed.: %d\n", status);
1448             return -1;
1449         }
1450     }
1451     if (status==U_USING_DEFAULT_WARNING && opt_terse==FALSE) {
1452         fprintf(stderr, "Warning, U_USING_DEFAULT_WARNING for %s\n", opt_locale);
1453     }
1454     if (status==U_USING_FALLBACK_WARNING && opt_terse==FALSE) {
1455         fprintf(stderr, "Warning, U_USING_FALLBACK_ERROR for %s\n", opt_locale);
1456     }
1457
1458     if (opt_norm) {
1459         ucol_setAttribute(gCol, UCOL_NORMALIZATION_MODE, UCOL_ON, &status);
1460     }
1461     if (opt_french && opt_frenchoff) {
1462         fprintf(stderr, "collperf:  Error, specified both -french and -frenchoff options.");
1463         exit(-1);
1464     }
1465     if (opt_french) {
1466         ucol_setAttribute(gCol, UCOL_FRENCH_COLLATION, UCOL_ON, &status);
1467     }
1468     if (opt_frenchoff) {
1469         ucol_setAttribute(gCol, UCOL_FRENCH_COLLATION, UCOL_OFF, &status);
1470     }
1471     if (opt_lower) {
1472         ucol_setAttribute(gCol, UCOL_CASE_FIRST, UCOL_LOWER_FIRST, &status);
1473     }
1474     if (opt_upper) {
1475         ucol_setAttribute(gCol, UCOL_CASE_FIRST, UCOL_UPPER_FIRST, &status);
1476     }
1477     if (opt_case) {
1478         ucol_setAttribute(gCol, UCOL_CASE_LEVEL, UCOL_ON, &status);
1479     }
1480     if (opt_shifted) {
1481         ucol_setAttribute(gCol, UCOL_ALTERNATE_HANDLING, UCOL_SHIFTED, &status);
1482     }
1483     if (opt_level != 0) {
1484         switch (opt_level) {
1485         case 1:
1486             ucol_setAttribute(gCol, UCOL_STRENGTH, UCOL_PRIMARY, &status);
1487             break;
1488         case 2:
1489             ucol_setAttribute(gCol, UCOL_STRENGTH, UCOL_SECONDARY, &status);
1490             break;
1491         case 3:
1492             ucol_setAttribute(gCol, UCOL_STRENGTH, UCOL_TERTIARY, &status);
1493             break;
1494         case 4:
1495             ucol_setAttribute(gCol, UCOL_STRENGTH, UCOL_QUATERNARY, &status);
1496             break;
1497         case 5:
1498             ucol_setAttribute(gCol, UCOL_STRENGTH, UCOL_IDENTICAL, &status);
1499             break;
1500         default:
1501             fprintf(stderr, "-level param must be between 1 and 5\n");
1502             exit(-1);
1503         }
1504     }
1505
1506     if (U_FAILURE(status)) {
1507         fprintf(stderr, "Collator attribute setting failed.: %d\n", status);
1508         return -1;
1509     }
1510
1511
1512     //
1513     //  Set up a Windows LCID
1514     //
1515     if (opt_langid != 0) {
1516         gWinLCID = MAKELCID(opt_langid, SORT_DEFAULT);
1517     }
1518     else {
1519         gWinLCID = uloc_getLCID(opt_locale);
1520     }
1521
1522
1523     //
1524     //  Set the UNIX locale
1525     //
1526     if (opt_unix) {
1527         if (setlocale(LC_ALL, opt_locale) == 0) {
1528             fprintf(stderr, "setlocale(LC_ALL, %s) failed.\n", opt_locale);
1529             exit(-1);
1530         }
1531     }
1532
1533     // Read in  the input file.
1534     //   File assumed to be utf-16.
1535     //   Lines go onto heap buffers.  Global index array to line starts is created.
1536     //   Lines themselves are null terminated.
1537     //
1538
1539     UCharFile f(opt_fName);
1540     if (f.error()) {
1541         exit(-1);
1542     }
1543
1544     const int MAXLINES = 100000;
1545     gFileLines = new Line[MAXLINES];
1546     UChar buf[1024];
1547     int   column = 0;
1548
1549     //  Read the file, split into lines, and save in memory.
1550     //  Loop runs once per utf-16 value from the input file,
1551     //    (The number of bytes read from file per loop iteration depends on external encoding.)
1552     for (;;) {
1553
1554         UChar c = f.get();
1555         if (f.error()){
1556             exit(-1);
1557         }
1558
1559
1560         // We now have a good UTF-16 value in c.
1561
1562         // Watch for CR, LF, EOF; these finish off a line.
1563         if (c == 0xd) {
1564             continue;
1565         }
1566
1567         if (f.eof() || c == 0x0a || c==0x2028) {  // Unipad inserts 2028 line separators!
1568             buf[column++] = 0;
1569             if (column > 1) {
1570                 gFileLines[gNumFileLines].name  = new UChar[column];
1571                 gFileLines[gNumFileLines].len   = column-1;
1572                 memcpy(gFileLines[gNumFileLines].name, buf, column * sizeof(UChar));
1573                 gNumFileLines++;
1574                 column = 0;
1575                 if (gNumFileLines >= MAXLINES) {
1576                     fprintf(stderr, "File too big.  Max number of lines is %d\n", MAXLINES);
1577                     exit(-1);
1578                 }
1579
1580             }
1581             if (c == 0xa || c == 0x2028)
1582                 continue;
1583             else
1584                 break;  // EOF
1585         }
1586         buf[column++] = c;
1587         if (column >= 1023)
1588         {
1589             static UBool warnFlag = TRUE;
1590             if (warnFlag) {
1591                 fprintf(stderr, "Warning - file line longer than 1023 chars truncated.\n");
1592                 warnFlag = FALSE;
1593             }
1594             column--;
1595         }
1596     }
1597
1598     if (opt_terse == FALSE) {
1599         printf("file \"%s\", %d lines.\n", opt_fName, gNumFileLines);
1600     }
1601
1602
1603     // Convert the lines to the UNIX encoding.
1604     if (opt_unix) {
1605         UnixConvert();
1606     }
1607
1608     //
1609     //  Pre-compute ICU sort keys for the lines of the file.
1610     //
1611     int line;
1612     int32_t t;
1613
1614     for (line=0; line<gNumFileLines; line++) {
1615          t = ucol_getSortKey(gCol, gFileLines[line].name, -1, (unsigned char *)buf, sizeof(buf));
1616          gFileLines[line].icuSortKey  = new char[t];
1617
1618          if (t > (int32_t)sizeof(buf)) {
1619              t = ucol_getSortKey(gCol, gFileLines[line].name, -1, (unsigned char *)gFileLines[line].icuSortKey , t);
1620          }
1621          else
1622          {
1623              memcpy(gFileLines[line].icuSortKey, buf, t);
1624          }
1625     }
1626
1627
1628
1629     //
1630     //  Pre-compute Windows sort keys for the lines of the file.
1631     //
1632     for (line=0; line<gNumFileLines; line++) {
1633          t=LCMapStringW(gWinLCID, LCMAP_SORTKEY, gFileLines[line].name, -1, buf, sizeof(buf));
1634          gFileLines[line].winSortKey  = new char[t];
1635          if (t > (int32_t)sizeof(buf)) {
1636              t = LCMapStringW(gWinLCID, LCMAP_SORTKEY, gFileLines[line].name, -1, (unsigned short *)(gFileLines[line].winSortKey), t);
1637          }
1638          else
1639          {
1640              memcpy(gFileLines[line].winSortKey, buf, t);
1641          }
1642     }
1643
1644     //
1645     //  Pre-compute UNIX sort keys for the lines of the file.
1646     //
1647     if (opt_unix) {
1648         for (line=0; line<gNumFileLines; line++) {
1649             t=strxfrm((char *)buf,  gFileLines[line].unixName,  sizeof(buf));
1650             gFileLines[line].unixSortKey  = new char[t];
1651             if (t > (int32_t)sizeof(buf)) {
1652                 t = strxfrm(gFileLines[line].unixSortKey,  gFileLines[line].unixName,  sizeof(buf));
1653             }
1654             else
1655             {
1656                 memcpy(gFileLines[line].unixSortKey, buf, t);
1657             }
1658         }
1659     }
1660
1661
1662     //
1663     //  Dump file lines, CEs, Sort Keys if requested.
1664     //
1665     if (opt_dump) {
1666         int  i;
1667         for (line=0; line<gNumFileLines; line++) {
1668             for (i=0;;i++) {
1669                 UChar  c = gFileLines[line].name[i];
1670                 if (c == 0)
1671                     break;
1672                 if (c < 0x20 || c > 0x7e) {
1673                     printf("\\u%.4x", c);
1674                 }
1675                 else {
1676                     printf("%c", c);
1677                 }
1678             }
1679             printf("\n");
1680
1681             printf("   CEs: ");
1682             UCollationElements *CEiter = ucol_openElements(gCol, gFileLines[line].name, -1, &status);
1683             int32_t ce;
1684             i = 0;
1685             for (;;) {
1686                 ce = ucol_next(CEiter, &status);
1687                 if (ce == UCOL_NULLORDER) {
1688                     break;
1689                 }
1690                 printf(" %.8x", ce);
1691                 if (++i > 8) {
1692                     printf("\n        ");
1693                     i = 0;
1694                 }
1695             }
1696             printf("\n");
1697             ucol_closeElements(CEiter);
1698
1699
1700             printf("   ICU Sort Key: ");
1701             for (i=0; ; i++) {
1702                 unsigned char c = gFileLines[line].icuSortKey[i];
1703                 printf("%02x ", c);
1704                 if (c == 0) {
1705                     break;
1706                 }
1707                 if (i > 0 && i % 20 == 0) {
1708                     printf("\n                 ");
1709                 }
1710            }
1711             printf("\n");
1712         }
1713     }
1714
1715
1716     //
1717     //  Pre-sort the lines.
1718     //
1719     int i;
1720     gSortedLines = new Line *[gNumFileLines];
1721     for (i=0; i<gNumFileLines; i++) {
1722         gSortedLines[i] = &gFileLines[i];
1723     }
1724
1725     if (opt_win) {
1726         qsort(gSortedLines, gNumFileLines, sizeof(Line *), Winstrcmp);
1727     }
1728     else if (opt_unix) {
1729         qsort(gSortedLines, gNumFileLines, sizeof(Line *), UNIXstrcmp);
1730     }
1731     else   /* ICU */
1732     {
1733         qsort(gSortedLines, gNumFileLines, sizeof(Line *), ICUstrcmp);
1734     }
1735
1736
1737     //
1738     //  Make up a randomized order, will be used for sorting tests.
1739     //
1740     gRandomLines = new Line *[gNumFileLines];
1741     for (i=0; i<gNumFileLines; i++) {
1742         gRandomLines[i] = &gFileLines[i];
1743     }
1744     qsort(gRandomLines, gNumFileLines, sizeof(Line *), ICURandomCmp);
1745
1746
1747
1748
1749     //
1750     //  We've got the file read into memory.  Go do something with it.
1751     //
1752
1753     if (opt_qsort)     doQSort();
1754     if (opt_binsearch) doBinarySearch();
1755     if (opt_keygen)    doKeyGen();
1756     if (opt_keyhist)   doKeyHist();
1757     if (opt_itertest)  doIterTest();
1758
1759     return 0;
1760
1761 }