icuSources/test/perf/collationperf/collperf.cpp

   1 /***********************************************************************
   2  * © 2016 and later: Unicode, Inc. and others.
   3  * License & terms of use: http://www.unicode.org/copyright.html#License
   4  ***********************************************************************
   5  ***********************************************************************
   6  * COPYRIGHT:
   7  * Copyright (C) 2001-2012 IBM, Inc.   All Rights Reserved.
   8  *
   9  ***********************************************************************/
  10 /********************************************************************************
  11 *
  12 * File CALLCOLL.C
  13 *
  14 * Modification History:
  15 *        Name                     Description
  16 *     Andy Heninger             First Version
  17 *
  18 *********************************************************************************
  19 */
  20
  21 //
  22 //  This program tests string collation and sort key generation performance.
//      Three APIs can be tested: ICU C, Unix strcoll/strxfrm, and Windows LCMapString.
  24 //      A file of names is required as input, one per line.  It must be in utf-8 or utf-16 format,
  25 //      and include a byte order mark.  Either LE or BE format is OK.
  26 //
  27
  28 const char gUsageString[] =
  29  "usage:  collperf options...\n"
  30     "-help                      Display this message.\n"
  31     "-file file_name            utf-16 format file of names.\n"
  32     "-locale name               ICU locale to use.  Default is en_US\n"
  33     "-rules file_name           Collation rules file (overrides locale)\n"
  34     "-langid 0x1234             Windows Language ID number.  Default to value for -locale option\n"
  35     "                              see http://msdn.microsoft.com/library/psdk/winbase/nls_8xo3.htm\n"
  36     "-win                       Run test using Windows native services.  (ICU is default)\n"
  37     "-unix                      Run test using Unix strxfrm, strcoll services.\n"
  38     "-uselen                    Use API with string lengths.  Default is null-terminated strings\n"
  39     "-usekeys                   Run tests using sortkeys rather than strcoll\n"
  40     "-strcmp                    Run tests using u_strcmp rather than strcoll\n"
  41     "-strcmpCPO                 Run tests using u_strcmpCodePointOrder rather than strcoll\n"
  42     "-loop nnnn                 Loopcount for test.  Adjust for reasonable total running time.\n"
  43     "-iloop n                   Inner Loop Count.  Default = 1.  Number of calls to function\n"
  44     "                               under test at each call point.  For measuring test overhead.\n"
  45     "-terse                     Terse numbers-only output.  Intended for use by scripts.\n"
  46     "-french                    French accent ordering\n"
  47     "-frenchoff                 No French accent ordering (for use with French locales.)\n"
  48     "-norm                      Normalizing mode on\n"
  49     "-shifted                   Shifted mode\n"
  50     "-lower                     Lower case first\n"
  51     "-upper                     Upper case first\n"
  52     "-case                      Enable separate case level\n"
  53     "-level n                   Sort level, 1 to 5, for Primary, Secndary, Tertiary, Quaternary, Identical\n"
  54     "-keyhist                   Produce a table sort key size vs. string length\n"
  55     "-binsearch                 Binary Search timing test\n"
  56     "-keygen                    Sort Key Generation timing test\n"
  57     "-qsort                     Quicksort timing test\n"
  58     "-iter                      Iteration Performance Test\n"
  59     "-dump                      Display strings, sort keys and CEs.\n"
  60     ;
  61
  62
  63
  64 #include <stdio.h>
  65 #include <string.h>
  66 #include <stdlib.h>
  67 #include <math.h>
  68 #include <locale.h>
  69 #include <errno.h>
  70
  71 #include <unicode/utypes.h>
  72 #include <unicode/ucol.h>
  73 #include <unicode/ucoleitr.h>
  74 #include <unicode/uloc.h>
  75 #include <unicode/ustring.h>
  76 #include <unicode/ures.h>
  77 #include <unicode/uchar.h>
  78 #include <unicode/ucnv.h>
  79 #include <unicode/utf8.h>
  80
  81 #ifdef WIN32
  82 #include <windows.h>
  83 #else
  84 //
  85 //  Stubs for Windows API functions when building on UNIXes.
  86 //
  87 typedef int DWORD;
  88 inline int CompareStringW(DWORD, DWORD, UChar *, int, UChar *, int) {return 0;}
  89 #include <sys/time.h>
  90 unsigned long timeGetTime() {
  91     struct timeval t;
  92     gettimeofday(&t, 0);
  93     unsigned long val = t.tv_sec * 1000;  // Let it overflow.  Who cares.
  94     val += t.tv_usec / 1000;
  95     return val;
  96 }
  97 inline int LCMapStringW(DWORD, DWORD, UChar *, int, UChar *, int) {return 0;}
  98 const int LCMAP_SORTKEY = 0;
  99 #define MAKELCID(a,b) 0
 100 const int SORT_DEFAULT = 0;
 101 #endif
 102
 103
 104
 105 //
 106 //  Command line option variables
 107 //     These global variables are set according to the options specified
 108 //     on the command line by the user.
 109 char * opt_fName      = 0;
 110 const char * opt_locale     = "en_US";
 111 int    opt_langid     = 0;         // Defaults to value corresponding to opt_locale.
 112 char * opt_rules      = 0;
 113 UBool  opt_help       = FALSE;
 114 int    opt_loopCount  = 1;
 115 int    opt_iLoopCount = 1;
 116 UBool  opt_terse      = FALSE;
 117 UBool  opt_qsort      = FALSE;
 118 UBool  opt_binsearch  = FALSE;
 119 UBool  opt_icu        = TRUE;
 120 UBool  opt_win        = FALSE;      // Run with Windows native functions.
 121 UBool  opt_unix       = FALSE;      // Run with UNIX strcoll, strxfrm functions.
 122 UBool  opt_uselen     = FALSE;
 123 UBool  opt_usekeys    = FALSE;
 124 UBool  opt_strcmp     = FALSE;
 125 UBool  opt_strcmpCPO  = FALSE;
 126 UBool  opt_norm       = FALSE;
 127 UBool  opt_keygen     = FALSE;
 128 UBool  opt_french     = FALSE;
 129 UBool  opt_frenchoff  = FALSE;
 130 UBool  opt_shifted    = FALSE;
 131 UBool  opt_lower      = FALSE;
 132 UBool  opt_upper      = FALSE;
 133 UBool  opt_case       = FALSE;
 134 int    opt_level      = 0;
 135 UBool  opt_keyhist    = FALSE;
 136 UBool  opt_itertest   = FALSE;
 137 UBool  opt_dump       = FALSE;
 138
 139
 140
 141 //
 142 //   Definitions for the command line options
 143 //
 144 struct OptSpec {
 145     const char *name;
 146     enum {FLAG, NUM, STRING} type;
 147     void *pVar;
 148 };
 149
 150 OptSpec opts[] = {
 151     {"-file",        OptSpec::STRING, &opt_fName},
 152     {"-locale",      OptSpec::STRING, &opt_locale},
 153     {"-langid",      OptSpec::NUM,    &opt_langid},
 154     {"-rules",       OptSpec::STRING, &opt_rules},
 155     {"-qsort",       OptSpec::FLAG,   &opt_qsort},
 156     {"-binsearch",   OptSpec::FLAG,   &opt_binsearch},
 157     {"-iter",        OptSpec::FLAG,   &opt_itertest},
 158     {"-win",         OptSpec::FLAG,   &opt_win},
 159     {"-unix",        OptSpec::FLAG,   &opt_unix},
 160     {"-uselen",      OptSpec::FLAG,   &opt_uselen},
 161     {"-usekeys",     OptSpec::FLAG,   &opt_usekeys},
 162     {"-strcmp",      OptSpec::FLAG,   &opt_strcmp},
 163     {"-strcmpCPO",   OptSpec::FLAG,   &opt_strcmpCPO},
 164     {"-norm",        OptSpec::FLAG,   &opt_norm},
 165     {"-french",      OptSpec::FLAG,   &opt_french},
 166     {"-frenchoff",   OptSpec::FLAG,   &opt_frenchoff},
 167     {"-shifted",     OptSpec::FLAG,   &opt_shifted},
 168     {"-lower",       OptSpec::FLAG,   &opt_lower},
 169     {"-upper",       OptSpec::FLAG,   &opt_upper},
 170     {"-case",        OptSpec::FLAG,   &opt_case},
 171     {"-level",       OptSpec::NUM,    &opt_level},
 172     {"-keyhist",     OptSpec::FLAG,   &opt_keyhist},
 173     {"-keygen",      OptSpec::FLAG,   &opt_keygen},
 174     {"-loop",        OptSpec::NUM,    &opt_loopCount},
 175     {"-iloop",       OptSpec::NUM,    &opt_iLoopCount},
 176     {"-terse",       OptSpec::FLAG,   &opt_terse},
 177     {"-dump",        OptSpec::FLAG,   &opt_dump},
 178     {"-help",        OptSpec::FLAG,   &opt_help},
 179     {"-?",           OptSpec::FLAG,   &opt_help},
 180     {0, OptSpec::FLAG, 0}
 181 };
 182
 183
 184 //---------------------------------------------------------------------------
 185 //
 186 //  Global variables pointing to and describing the test file
 187 //
 188 //---------------------------------------------------------------------------
 189
 190 //
 191 //   struct Line
 192 //
 193 //      Each line from the source file (containing a name, presumably) gets
 194 //      one of these structs.
 195 //
 196 struct  Line {
 197     UChar     *name;
 198     int        len;
 199     char      *winSortKey;
 200     char      *icuSortKey;
 201     char      *unixSortKey;
 202     char      *unixName;
 203 };
 204
 205
 206
 207 Line          *gFileLines;           // Ptr to array of Line structs, one per line in the file.
 208 int            gNumFileLines;
 209 UCollator     *gCol;
 210 DWORD          gWinLCID;
 211
 212 Line          **gSortedLines;
 213 Line          **gRandomLines;
 214 int            gCount;
 215
 216
 217
 218 //---------------------------------------------------------------------------
 219 //
 220 //  ProcessOptions()    Function to read the command line options.
 221 //
 222 //---------------------------------------------------------------------------
 223 UBool ProcessOptions(int argc, const char **argv, OptSpec opts[])
 224 {
 225     int         i;
 226     int         argNum;
 227     const char  *pArgName;
 228     OptSpec    *pOpt;
 229
 230     for (argNum=1; argNum<argc; argNum++) {
 231         pArgName = argv[argNum];
 232         for (pOpt = opts;  pOpt->name != 0; pOpt++) {
 233             if (strcmp(pOpt->name, pArgName) == 0) {
 234                 switch (pOpt->type) {
 235                 case OptSpec::FLAG:
 236                     *(UBool *)(pOpt->pVar) = TRUE;
 237                     break;
 238                 case OptSpec::STRING:
 239                     argNum ++;
 240                     if (argNum >= argc) {
 241                         fprintf(stderr, "value expected for \"%s\" option.\n", pOpt->name);
 242                         return FALSE;
 243                     }
 244                     *(const char **)(pOpt->pVar)  = argv[argNum];
 245                     break;
 246                 case OptSpec::NUM:
 247                     argNum ++;
 248                     if (argNum >= argc) {
 249                         fprintf(stderr, "value expected for \"%s\" option.\n", pOpt->name);
 250                         return FALSE;
 251                     }
 252                     char *endp;
 253                     i = strtol(argv[argNum], &endp, 0);
 254                     if (endp == argv[argNum]) {
 255                         fprintf(stderr, "integer value expected for \"%s\" option.\n", pOpt->name);
 256                         return FALSE;
 257                     }
 258                     *(int *)(pOpt->pVar) = i;
 259                 }
 260                 break;
 261             }
 262         }
 263         if (pOpt->name == 0)
 264         {
 265             fprintf(stderr, "Unrecognized option \"%s\"\n", pArgName);
 266             return FALSE;
 267         }
 268     }
 269 return TRUE;
 270 }
 271
 272 //---------------------------------------------------------------------------------------
 273 //
 274 //   Comparison functions for use by qsort.
 275 //
//       Seven flavors: ICU, Windows or Unix; SortKey or String Compare; Strings with length
//           or null terminated.
 278 //
 279 //---------------------------------------------------------------------------------------
 280 int ICUstrcmpK(const void *a, const void *b) {
 281     gCount++;
 282     int t = strcmp((*(Line **)a)->icuSortKey, (*(Line **)b)->icuSortKey);
 283     return t;
 284 }
 285
 286
 287 int ICUstrcmpL(const void *a, const void *b) {
 288     gCount++;
 289     UCollationResult t;
 290     t = ucol_strcoll(gCol, (*(Line **)a)->name, (*(Line **)a)->len, (*(Line **)b)->name, (*(Line **)b)->len);
 291     if (t == UCOL_LESS) return -1;
 292     if (t == UCOL_GREATER) return +1;
 293     return 0;
 294 }
 295
 296
 297 int ICUstrcmp(const void *a, const void *b) {
 298     gCount++;
 299     UCollationResult t;
 300     t = ucol_strcoll(gCol, (*(Line **)a)->name, -1, (*(Line **)b)->name, -1);
 301     if (t == UCOL_LESS) return -1;
 302     if (t == UCOL_GREATER) return +1;
 303     return 0;
 304 }
 305
 306
 307 int Winstrcmp(const void *a, const void *b) {
 308     gCount++;
 309     int t;
 310     t = CompareStringW(gWinLCID, 0, (*(Line **)a)->name, -1, (*(Line **)b)->name, -1);
 311     return t-2;
 312 }
 313
 314
 315 int UNIXstrcmp(const void *a, const void *b) {
 316     gCount++;
 317     int t;
 318     t = strcoll((*(Line **)a)->unixName, (*(Line **)b)->unixName);
 319     return t;
 320 }
 321
 322
 323 int WinstrcmpL(const void *a, const void *b) {
 324     gCount++;
 325     int t;
 326     t = CompareStringW(gWinLCID, 0, (*(Line **)a)->name, (*(Line **)a)->len, (*(Line **)b)->name, (*(Line **)b)->len);
 327     return t-2;
 328 }
 329
 330
 331 int WinstrcmpK(const void *a, const void *b) {
 332     gCount++;
 333     int t = strcmp((*(Line **)a)->winSortKey, (*(Line **)b)->winSortKey);
 334     return t;
 335 }
 336
 337
 338 //---------------------------------------------------------------------------------------
 339 //
 340 //   Function for sorting the names (lines) into a random order.
 341 //      Order is based on a hash of the  ICU Sort key for the lines
 342 //      The randomized order is used as input for the sorting timing tests.
 343 //
 344 //---------------------------------------------------------------------------------------
 345 int ICURandomCmp(const void *a, const void *b) {
 346     char  *ask = (*(Line **)a)->icuSortKey;
 347     char  *bsk = (*(Line **)b)->icuSortKey;
 348     int   aVal = 0;
 349     int   bVal = 0;
 350     int   retVal;
 351     while (*ask != 0) {
 352         aVal += aVal*37 + *ask++;
 353     }
 354     while (*bsk != 0) {
 355         bVal += bVal*37 + *bsk++;
 356     }
 357     retVal = -1;
 358     if (aVal == bVal) {
 359         retVal = 0;
 360     }
 361     else if (aVal > bVal) {
 362         retVal = 1;
 363     }
 364     return retVal;
 365 }
 366
 367 //---------------------------------------------------------------------------------------
 368 //
 369 //   doKeyGen()     Key Generation Timing Test
 370 //
 371 //---------------------------------------------------------------------------------------
 372 void doKeyGen()
 373 {
 374     int  line;
 375     int  loops = 0;
 376     int  iLoop;
 377     int  len=-1;
 378
 379     // Adjust loop count to compensate for file size.   Should be order n
 380     double dLoopCount = double(opt_loopCount) * (1000. /  double(gNumFileLines));
 381     int adj_loopCount = int(dLoopCount);
 382     if (adj_loopCount < 1) adj_loopCount = 1;
 383
 384
 385     unsigned long startTime = timeGetTime();
 386
 387     if (opt_win) {
 388         for (loops=0; loops<adj_loopCount; loops++) {
 389             for (line=0; line < gNumFileLines; line++) {
 390                 if (opt_uselen) {
 391                     len = gFileLines[line].len;
 392                 }
 393                 for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) {
 394                     LCMapStringW(gWinLCID, LCMAP_SORTKEY,
 395                         gFileLines[line].name, len,
 396                         (UChar *)gFileLines[line].winSortKey, 5000);    // TODO  something with length.
 397                 }
 398             }
 399         }
 400     }
 401     else if (opt_icu)
 402     {
 403         for (loops=0; loops<adj_loopCount; loops++) {
 404             for (line=0; line < gNumFileLines; line++) {
 405                 if (opt_uselen) {
 406                     len = gFileLines[line].len;
 407                 }
 408                 for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) {
 409                     ucol_getSortKey(gCol, gFileLines[line].name, len, (unsigned char *)gFileLines[line].icuSortKey, 5000);
 410                 }
 411             }
 412         }
 413     }
 414     else if (opt_unix)
 415     {
 416         for (loops=0; loops<adj_loopCount; loops++) {
 417             for (line=0; line < gNumFileLines; line++) {
 418                 for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) {
 419                     strxfrm(gFileLines[line].unixSortKey, gFileLines[line].unixName, 5000);
 420                 }
 421             }
 422         }
 423     }
 424
 425     unsigned long elapsedTime = timeGetTime() - startTime;
 426     int ns = (int)(float(1000000) * (float)elapsedTime / (float)(adj_loopCount*gNumFileLines));
 427
 428     if (opt_terse == FALSE) {
 429         printf("Sort Key Generation:  total # of keys = %d\n", loops*gNumFileLines);
 430         printf("Sort Key Generation:  time per key = %d ns\n", ns);
 431     }
 432     else {
 433         printf("%d,  ", ns);
 434     }
 435
 436     int   totalKeyLen = 0;
 437     int   totalChars  = 0;
 438     for (line=0; line<gNumFileLines; line++) {
 439         totalChars += u_strlen(gFileLines[line].name);
 440         if (opt_win) {
 441             totalKeyLen += strlen(gFileLines[line].winSortKey);
 442         }
 443         else if (opt_icu) {
 444             totalKeyLen += strlen(gFileLines[line].icuSortKey);
 445         }
 446         else if (opt_unix) {
 447             totalKeyLen += strlen(gFileLines[line].unixSortKey);
 448         }
 449
 450     }
 451     if (opt_terse == FALSE) {
 452         printf("Key Length / character = %f\n", (float)totalKeyLen / (float)totalChars);
 453     } else {
 454         printf("%f, ", (float)totalKeyLen / (float)totalChars);
 455     }
 456 }
 457
 458
 459
 460 //---------------------------------------------------------------------------------------
 461 //
 462 //    doBinarySearch()    Binary Search timing test.  Each name from the list
 463 //                        is looked up in the full sorted list of names.
 464 //
 465 //---------------------------------------------------------------------------------------
 466 void doBinarySearch()
 467 {
 468
 469     gCount = 0;
 470     int  line;
 471     int  loops = 0;
 472     int  iLoop = 0;
 473     unsigned long elapsedTime = 0;
 474
 475     // Adjust loop count to compensate for file size.   Should be order n (lookups) * log n  (compares/lookup)
 476     // Accurate timings do not depend on this being perfect.  The correction is just to try to
 477     //   get total running times of about the right order, so the that user doesn't need to
 478     //   manually adjust the loop count for every different file size.
 479     double dLoopCount = double(opt_loopCount) * 3000. / (log10((double)gNumFileLines) * double(gNumFileLines));
 480     if (opt_usekeys) dLoopCount *= 5;
 481     int adj_loopCount = int(dLoopCount);
 482     if (adj_loopCount < 1) adj_loopCount = 1;
 483
 484
 485     for (;;) {  // not really a loop, just allows "break" to work, to simplify
 486                 //   inadvertantly running more than one test through here.
 487         if (opt_strcmp || opt_strcmpCPO)
 488         {
 489             unsigned long startTime = timeGetTime();
 490             typedef int32_t (U_EXPORT2 *PF)(const UChar *, const UChar *);
 491             PF pf = u_strcmp;
 492             if (opt_strcmpCPO) {pf = u_strcmpCodePointOrder;}
 493             //if (opt_strcmp && opt_win) {pf = (PF)wcscmp;}   // Damn the difference between int32_t and int
 494                                                             //   which forces the use of a cast here.
 495
 496             int r = 0;
 497             for (loops=0; loops<adj_loopCount; loops++) {
 498
 499                 for (line=0; line < gNumFileLines; line++) {
 500                     int hi      = gNumFileLines-1;
 501                     int lo      = 0;
 502                     int  guess = -1;
 503                     for (;;) {
 504                         int newGuess = (hi + lo) / 2;
 505                         if (newGuess == guess)
 506                             break;
 507                         guess = newGuess;
 508                         for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) {
 509                             r = (*pf)((gSortedLines[line])->name, (gSortedLines[guess])->name);
 510                         }
 511                         gCount++;
 512                         if (r== 0)
 513                             break;
 514                         if (r < 0)
 515                             hi = guess;
 516                         else
 517                             lo   = guess;
 518                     }
 519                 }
 520             }
 521             elapsedTime = timeGetTime() - startTime;
 522             break;
 523         }
 524
 525
 526         if (opt_icu)
 527         {
 528             unsigned long startTime = timeGetTime();
 529             UCollationResult  r = UCOL_EQUAL;
 530             for (loops=0; loops<adj_loopCount; loops++) {
 531
 532                 for (line=0; line < gNumFileLines; line++) {
 533                     int lineLen  = -1;
 534                     int guessLen = -1;
 535                     if (opt_uselen) {
 536                         lineLen = (gSortedLines[line])->len;
 537                     }
 538                     int hi      = gNumFileLines-1;
 539                     int lo      = 0;
 540                     int  guess = -1;
 541                     for (;;) {
 542                         int newGuess = (hi + lo) / 2;
 543                         if (newGuess == guess)
 544                             break;
 545                         guess = newGuess;
 546                         int ri = 0;
 547                         if (opt_usekeys) {
 548                             for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) {
 549                                 ri = strcmp((gSortedLines[line])->icuSortKey, (gSortedLines[guess])->icuSortKey);
 550                             }
 551                             gCount++;
 552                             r=UCOL_GREATER; if(ri<0) {r=UCOL_LESS;} else if (ri==0) {r=UCOL_EQUAL;}
 553                         }
 554                         else
 555                         {
 556                             if (opt_uselen) {
 557                                 guessLen = (gSortedLines[guess])->len;
 558                             }
 559                             for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) {
 560                                 r = ucol_strcoll(gCol, (gSortedLines[line])->name, lineLen, (gSortedLines[guess])->name, guessLen);
 561                             }
 562                             gCount++;
 563                         }
 564                         if (r== UCOL_EQUAL)
 565                             break;
 566                         if (r == UCOL_LESS)
 567                             hi = guess;
 568                         else
 569                             lo   = guess;
 570                     }
 571                 }
 572             }
 573             elapsedTime = timeGetTime() - startTime;
 574             break;
 575         }
 576
 577         if (opt_win)
 578         {
 579             unsigned long startTime = timeGetTime();
 580             int r = 0;
 581             for (loops=0; loops<adj_loopCount; loops++) {
 582
 583                 for (line=0; line < gNumFileLines; line++) {
 584                     int lineLen  = -1;
 585                     int guessLen = -1;
 586                     if (opt_uselen) {
 587                         lineLen = (gSortedLines[line])->len;
 588                     }
 589                     int hi   = gNumFileLines-1;
 590                     int lo   = 0;
 591                     int  guess = -1;
 592                     for (;;) {
 593                         int newGuess = (hi + lo) / 2;
 594                         if (newGuess == guess)
 595                             break;
 596                         guess = newGuess;
 597                         if (opt_usekeys) {
 598                             for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) {
 599                                 r = strcmp((gSortedLines[line])->winSortKey, (gSortedLines[guess])->winSortKey);
 600                             }
 601                             gCount++;
 602                             r+=2;
 603                         }
 604                         else
 605                         {
 606                             if (opt_uselen) {
 607                                 guessLen = (gSortedLines[guess])->len;
 608                             }
 609                             for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) {
 610                                 r = CompareStringW(gWinLCID, 0, (gSortedLines[line])->name, lineLen, (gSortedLines[guess])->name, guessLen);
 611                             }
 612                             if (r == 0) {
 613                                 if (opt_terse == FALSE) {
 614                                     fprintf(stderr, "Error returned from Windows CompareStringW.\n");
 615                                 }
 616                                 exit(-1);
 617                             }
 618                             gCount++;
 619                         }
 620                         if (r== 2)   //  strings ==
 621                             break;
 622                         if (r == 1)  //  line < guess
 623                             hi = guess;
 624                         else         //  line > guess
 625                             lo   = guess;
 626                     }
 627                 }
 628             }
 629             elapsedTime = timeGetTime() - startTime;
 630             break;
 631         }
 632
 633         if (opt_unix)
 634         {
 635             unsigned long startTime = timeGetTime();
 636             int r = 0;
 637             for (loops=0; loops<adj_loopCount; loops++) {
 638
 639                 for (line=0; line < gNumFileLines; line++) {
 640                     int hi   = gNumFileLines-1;
 641                     int lo   = 0;
 642                     int  guess = -1;
 643                     for (;;) {
 644                         int newGuess = (hi + lo) / 2;
 645                         if (newGuess == guess)
 646                             break;
 647                         guess = newGuess;
 648                         if (opt_usekeys) {
 649                             for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) {
 650                                  r = strcmp((gSortedLines[line])->unixSortKey, (gSortedLines[guess])->unixSortKey);
 651                             }
 652                             gCount++;
 653                         }
 654                         else
 655                         {
 656                             for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) {
 657                                 r = strcoll((gSortedLines[line])->unixName, (gSortedLines[guess])->unixName);
 658                             }
 659                             errno = 0;
 660                             if (errno != 0) {
 661                                 fprintf(stderr, "Error %d returned from strcoll.\n", errno);
 662                                 exit(-1);
 663                             }
 664                             gCount++;
 665                         }
 666                         if (r == 0)   //  strings ==
 667                             break;
 668                         if (r < 0)  //  line < guess
 669                             hi = guess;
 670                         else         //  line > guess
 671                             lo   = guess;
 672                     }
 673                 }
 674             }
 675             elapsedTime = timeGetTime() - startTime;
 676             break;
 677         }
 678         break;
 679     }
 680
 681     int ns = (int)(float(1000000) * (float)elapsedTime / (float)gCount);
 682     if (opt_terse == FALSE) {
 683         printf("binary search:  total # of string compares = %d\n", gCount);
 684         printf("binary search:  compares per loop = %d\n", gCount / loops);
 685         printf("binary search:  time per compare = %d ns\n", ns);
 686     } else {
 687         printf("%d, ", ns);
 688     }
 689
 690 }
 691
 692
 693
 694
 695 //---------------------------------------------------------------------------------------
 696 //
 697 //   doQSort()    The quick sort timing test.  Uses the C library qsort function.
 698 //
 699 //---------------------------------------------------------------------------------------
 700 void doQSort() {
 701     int i;
 702     Line **sortBuf = new Line *[gNumFileLines];
 703
 704     // Adjust loop count to compensate for file size.   QSort should be n log(n)
 705     double dLoopCount = double(opt_loopCount) * 3000. / (log10((double)gNumFileLines) * double(gNumFileLines));
 706     if (opt_usekeys) dLoopCount *= 5;
 707     int adj_loopCount = int(dLoopCount);
 708     if (adj_loopCount < 1) adj_loopCount = 1;
 709
 710
 711     gCount = 0;
 712     unsigned long startTime = timeGetTime();
 713     if (opt_win && opt_usekeys) {
 714         for (i=0; i<opt_loopCount; i++) {
 715             memcpy(sortBuf, gRandomLines, gNumFileLines * sizeof(Line *));
 716             qsort(sortBuf, gNumFileLines, sizeof(Line *), WinstrcmpK);
 717         }
 718     }
 719
 720     else if (opt_win && opt_uselen) {
 721         for (i=0; i<adj_loopCount; i++) {
 722             memcpy(sortBuf, gRandomLines, gNumFileLines * sizeof(Line *));
 723             qsort(sortBuf, gNumFileLines, sizeof(Line *), WinstrcmpL);
 724         }
 725     }
 726
 727
 728     else if (opt_win && !opt_uselen) {
 729         for (i=0; i<adj_loopCount; i++) {
 730             memcpy(sortBuf, gRandomLines, gNumFileLines * sizeof(Line *));
 731             qsort(sortBuf, gNumFileLines, sizeof(Line *), Winstrcmp);
 732         }
 733     }
 734
 735     else if (opt_icu && opt_usekeys) {
 736         for (i=0; i<adj_loopCount; i++) {
 737             memcpy(sortBuf, gRandomLines, gNumFileLines * sizeof(Line *));
 738             qsort(sortBuf, gNumFileLines, sizeof(Line *), ICUstrcmpK);
 739         }
 740     }
 741
 742     else if (opt_icu && opt_uselen) {
 743         for (i=0; i<adj_loopCount; i++) {
 744             memcpy(sortBuf, gRandomLines, gNumFileLines * sizeof(Line *));
 745             qsort(sortBuf, gNumFileLines, sizeof(Line *), ICUstrcmpL);
 746         }
 747     }
 748
 749
 750     else if (opt_icu && !opt_uselen) {
 751         for (i=0; i<adj_loopCount; i++) {
 752             memcpy(sortBuf, gRandomLines, gNumFileLines * sizeof(Line *));
 753             qsort(sortBuf, gNumFileLines, sizeof(Line *), ICUstrcmp);
 754         }
 755     }
 756
 757     else if (opt_unix && !opt_usekeys) {
 758         for (i=0; i<adj_loopCount; i++) {
 759             memcpy(sortBuf, gRandomLines, gNumFileLines * sizeof(Line *));
 760             qsort(sortBuf, gNumFileLines, sizeof(Line *), UNIXstrcmp);
 761         }
 762     }
 763
 764     unsigned long elapsedTime = timeGetTime() - startTime;
 765     int ns = (int)(float(1000000) * (float)elapsedTime / (float)gCount);
 766     if (opt_terse == FALSE) {
 767         printf("qsort:  total # of string compares = %d\n", gCount);
 768         printf("qsort:  time per compare = %d ns\n", ns);
 769     } else {
 770         printf("%d, ", ns);
 771     }
 772 }
 773
 774
 775
 776 //---------------------------------------------------------------------------------------
 777 //
 778 //    doKeyHist()       Output a table of data for
 779 //                        average sort key size vs. string length.
 780 //
 781 //---------------------------------------------------------------------------------------
 782 void doKeyHist() {
 783     int     i;
 784     int     maxLen = 0;
 785
 786     // Find the maximum string length
 787     for (i=0; i<gNumFileLines; i++) {
 788         if (gFileLines[i].len > maxLen) maxLen = gFileLines[i].len;
 789     }
 790
 791     // Allocate arrays to hold the histogram data
 792     int *accumulatedLen  = new int[maxLen+1];
 793     int *numKeysOfSize   = new int[maxLen+1];
 794     for (i=0; i<=maxLen; i++) {
 795         accumulatedLen[i] = 0;
 796         numKeysOfSize[i] = 0;
 797     }
 798
 799     // Fill the arrays...
 800     for (i=0; i<gNumFileLines; i++) {
 801         int len = gFileLines[i].len;
 802         accumulatedLen[len] += strlen(gFileLines[i].icuSortKey);
 803         numKeysOfSize[len] += 1;
 804     }
 805
 806     // And write out averages
 807     printf("String Length,  Avg Key Length,  Avg Key Len per char\n");
 808     for (i=1; i<=maxLen; i++) {
 809         if (numKeysOfSize[i] > 0) {
 810             printf("%d, %f, %f\n", i, (float)accumulatedLen[i] / (float)numKeysOfSize[i],
 811                 (float)accumulatedLen[i] / (float)(numKeysOfSize[i] * i));
 812         }
 813     }
 814     delete []accumulatedLen;
 815     delete []numKeysOfSize ;
 816 }
 817
 818 //---------------------------------------------------------------------------------------
 819 //
//    doForwardIterTest(UBool)       Forward iteration test
//                                   argument: TRUE to pass explicit string lengths,
//                                   FALSE to treat the strings as null-terminated
 822 //
 823 //---------------------------------------------------------------------------------------
 824 void doForwardIterTest(UBool haslen) {
 825     int count = 0;
 826
 827     UErrorCode error = U_ZERO_ERROR;
 828     printf("\n\nPerforming forward iteration performance test with ");
 829
 830     if (haslen) {
 831         printf("non-null terminated data -----------\n");
 832     }
 833     else {
 834         printf("null terminated data -----------\n");
 835     }
 836     printf("performance test on strings from file -----------\n");
 837
 838     UChar dummytext[] = {0, 0};
 839     UCollationElements *iter = ucol_openElements(gCol, NULL, 0, &error);
 840     ucol_setText(iter, dummytext, 1, &error);
 841
 842     gCount = 0;
 843     unsigned long startTime = timeGetTime();
 844     while (count < opt_loopCount) {
 845         int linecount = 0;
 846         while (linecount < gNumFileLines) {
 847             UChar *str = gFileLines[linecount].name;
 848             int strlen = haslen?gFileLines[linecount].len:-1;
 849             ucol_setText(iter, str, strlen, &error);
 850             while (ucol_next(iter, &error) != UCOL_NULLORDER) {
 851                 gCount++;
 852             }
 853
 854             linecount ++;
 855         }
 856         count ++;
 857     }
 858     unsigned long elapsedTime = timeGetTime() - startTime;
 859     printf("elapsedTime %ld\n", elapsedTime);
 860
 861     // empty loop recalculation
 862     count = 0;
 863     startTime = timeGetTime();
 864     while (count < opt_loopCount) {
 865         int linecount = 0;
 866         while (linecount < gNumFileLines) {
 867             UChar *str = gFileLines[linecount].name;
 868             int strlen = haslen?gFileLines[linecount].len:-1;
 869             ucol_setText(iter, str, strlen, &error);
 870             linecount ++;
 871         }
 872         count ++;
 873     }
 874     elapsedTime -= (timeGetTime() - startTime);
 875     printf("elapsedTime %ld\n", elapsedTime);
 876
 877     ucol_closeElements(iter);
 878
 879     int ns = (int)(float(1000000) * (float)elapsedTime / (float)gCount);
 880     printf("Total number of strings compared %d in %d loops\n", gNumFileLines,
 881                                                                 opt_loopCount);
 882     printf("Average time per ucol_next() nano seconds %d\n", ns);
 883
 884     printf("performance test on skipped-5 concatenated strings from file -----------\n");
 885
 886     UChar *str;
 887     int    strlen = 0;
 888     // appending all the strings
 889     int linecount = 0;
 890     while (linecount < gNumFileLines) {
 891         strlen += haslen?gFileLines[linecount].len:
 892                                       u_strlen(gFileLines[linecount].name);
 893         linecount ++;
 894     }
 895     str = (UChar *)malloc(sizeof(UChar) * strlen);
 896     int strindex = 0;
 897     linecount = 0;
 898     while (strindex < strlen) {
 899         int len = 0;
 900         len += haslen?gFileLines[linecount].len:
 901                                       u_strlen(gFileLines[linecount].name);
 902         memcpy(str + strindex, gFileLines[linecount].name,
 903                sizeof(UChar) * len);
 904         strindex += len;
 905         linecount ++;
 906     }
 907
 908     printf("Total size of strings %d\n", strlen);
 909
 910     gCount = 0;
 911     count  = 0;
 912
 913     if (!haslen) {
 914         strlen = -1;
 915     }
 916     iter = ucol_openElements(gCol, str, strlen, &error);
 917     if (!haslen) {
 918         strlen = u_strlen(str);
 919     }
 920     strlen -= 5; // any left over characters are not iterated,
 921                  // this is to ensure the backwards and forwards iterators
 922                  // gets the same position
 923     startTime = timeGetTime();
 924     while (count < opt_loopCount) {
 925         int count5 = 5;
 926         strindex = 0;
 927         ucol_setOffset(iter, strindex, &error);
 928         while (TRUE) {
 929             if (ucol_next(iter, &error) == UCOL_NULLORDER) {
 930                 break;
 931             }
 932             gCount++;
 933             count5 --;
 934             if (count5 == 0) {
 935                 strindex += 10;
 936                 if (strindex > strlen) {
 937                     break;
 938                 }
 939                 ucol_setOffset(iter, strindex, &error);
 940                 count5 = 5;
 941             }
 942         }
 943         count ++;
 944     }
 945
 946     elapsedTime = timeGetTime() - startTime;
 947     printf("elapsedTime %ld\n", elapsedTime);
 948
 949     // empty loop recalculation
 950     int tempgCount = 0;
 951     count = 0;
 952     startTime = timeGetTime();
 953     while (count < opt_loopCount) {
 954         int count5 = 5;
 955         strindex = 0;
 956         ucol_setOffset(iter, strindex, &error);
 957         while (TRUE) {
 958             tempgCount ++;
 959             count5 --;
 960             if (count5 == 0) {
 961                 strindex += 10;
 962                 if (strindex > strlen) {
 963                     break;
 964                 }
 965                 ucol_setOffset(iter, strindex, &error);
 966                 count5 = 5;
 967             }
 968         }
 969         count ++;
 970     }
 971     elapsedTime -= (timeGetTime() - startTime);
 972     printf("elapsedTime %ld\n", elapsedTime);
 973
 974     ucol_closeElements(iter);
 975
 976     printf("gCount %d\n", gCount);
 977     ns = (int)(float(1000000) * (float)elapsedTime / (float)gCount);
 978     printf("Average time per ucol_next() nano seconds %d\n", ns);
 979 }
 980
 981 //---------------------------------------------------------------------------------------
 982 //
//    doBackwardIterTest(UBool)      Backwards iteration test
//                                   argument: TRUE to pass explicit string lengths,
//                                   FALSE to treat the strings as null-terminated
 985 //
 986 //---------------------------------------------------------------------------------------
 987 void doBackwardIterTest(UBool haslen) {
 988     int count = 0;
 989     UErrorCode error = U_ZERO_ERROR;
 990     printf("\n\nPerforming backward iteration performance test with ");
 991
 992     if (haslen) {
 993         printf("non-null terminated data -----------\n");
 994     }
 995     else {
 996         printf("null terminated data -----------\n");
 997     }
 998
 999     printf("performance test on strings from file -----------\n");
1000
1001     UCollationElements *iter = ucol_openElements(gCol, NULL, 0, &error);
1002     UChar dummytext[] = {0, 0};
1003     ucol_setText(iter, dummytext, 1, &error);
1004
1005     gCount = 0;
1006     unsigned long startTime = timeGetTime();
1007     while (count < opt_loopCount) {
1008         int linecount = 0;
1009         while (linecount < gNumFileLines) {
1010             UChar *str = gFileLines[linecount].name;
1011             int strlen = haslen?gFileLines[linecount].len:-1;
1012             ucol_setText(iter, str, strlen, &error);
1013             while (ucol_previous(iter, &error) != UCOL_NULLORDER) {
1014                 gCount ++;
1015             }
1016
1017             linecount ++;
1018         }
1019         count ++;
1020     }
1021     unsigned long elapsedTime = timeGetTime() - startTime;
1022
1023     printf("elapsedTime %ld\n", elapsedTime);
1024
1025     // empty loop recalculation
1026     count = 0;
1027     startTime = timeGetTime();
1028     while (count < opt_loopCount) {
1029         int linecount = 0;
1030         while (linecount < gNumFileLines) {
1031             UChar *str = gFileLines[linecount].name;
1032             int strlen = haslen?gFileLines[linecount].len:-1;
1033             ucol_setText(iter, str, strlen, &error);
1034             linecount ++;
1035         }
1036         count ++;
1037     }
1038     elapsedTime -= (timeGetTime() - startTime);
1039
1040     printf("elapsedTime %ld\n", elapsedTime);
1041     ucol_closeElements(iter);
1042
1043     int ns = (int)(float(1000000) * (float)elapsedTime / (float)gCount);
1044     printf("Total number of strings compared %d in %d loops\n", gNumFileLines,
1045                                                                 opt_loopCount);
1046     printf("Average time per ucol_previous() nano seconds %d\n", ns);
1047
1048     printf("performance test on skipped-5 concatenated strings from file -----------\n");
1049
1050     UChar *str;
1051     int    strlen = 0;
1052     // appending all the strings
1053     int linecount = 0;
1054     while (linecount < gNumFileLines) {
1055         strlen += haslen?gFileLines[linecount].len:
1056                                       u_strlen(gFileLines[linecount].name);
1057         linecount ++;
1058     }
1059     str = (UChar *)malloc(sizeof(UChar) * strlen);
1060     int strindex = 0;
1061     linecount = 0;
1062     while (strindex < strlen) {
1063         int len = 0;
1064         len += haslen?gFileLines[linecount].len:
1065                                       u_strlen(gFileLines[linecount].name);
1066         memcpy(str + strindex, gFileLines[linecount].name,
1067                sizeof(UChar) * len);
1068         strindex += len;
1069         linecount ++;
1070     }
1071
1072     printf("Total size of strings %d\n", strlen);
1073
1074     gCount = 0;
1075     count  = 0;
1076
1077     if (!haslen) {
1078         strlen = -1;
1079     }
1080
1081     iter = ucol_openElements(gCol, str, strlen, &error);
1082     if (!haslen) {
1083         strlen = u_strlen(str);
1084     }
1085
1086     startTime = timeGetTime();
1087     while (count < opt_loopCount) {
1088         int count5 = 5;
1089         strindex = 5;
1090         ucol_setOffset(iter, strindex, &error);
1091         while (TRUE) {
1092             if (ucol_previous(iter, &error) == UCOL_NULLORDER) {
1093                 break;
1094             }
1095              gCount ++;
1096              count5 --;
1097              if (count5 == 0) {
1098                  strindex += 10;
1099                  if (strindex > strlen) {
1100                     break;
1101                  }
1102                  ucol_setOffset(iter, strindex, &error);
1103                  count5 = 5;
1104              }
1105         }
1106         count ++;
1107     }
1108
1109     elapsedTime = timeGetTime() - startTime;
1110     printf("elapsedTime %ld\n", elapsedTime);
1111
1112     // empty loop recalculation
1113     count = 0;
1114     int tempgCount = 0;
1115     startTime = timeGetTime();
1116     while (count < opt_loopCount) {
1117         int count5 = 5;
1118         strindex = 5;
1119         ucol_setOffset(iter, strindex, &error);
1120         while (TRUE) {
1121              tempgCount ++;
1122              count5 --;
1123              if (count5 == 0) {
1124                  strindex += 10;
1125                  if (strindex > strlen) {
1126                     break;
1127                  }
1128                  ucol_setOffset(iter, strindex, &error);
1129                  count5 = 5;
1130              }
1131         }
1132         count ++;
1133     }
1134     elapsedTime -= (timeGetTime() - startTime);
1135     printf("elapsedTime %ld\n", elapsedTime);
1136     ucol_closeElements(iter);
1137
1138     printf("gCount %d\n", gCount);
1139     ns = (int)(float(1000000) * (float)elapsedTime / (float)gCount);
1140     printf("Average time per ucol_previous() nano seconds %d\n", ns);
1141 }
1142
1143 //---------------------------------------------------------------------------------------
1144 //
1145 //    doIterTest()       Iteration test
1146 //
1147 //---------------------------------------------------------------------------------------
1148 void doIterTest() {
1149     doForwardIterTest(opt_uselen);
1150     doBackwardIterTest(opt_uselen);
1151 }
1152
1153
1154 //----------------------------------------------------------------------------------------
1155 //
1156 //   UnixConvert   -- Convert the lines of the file to the encoding for UNIX
1157 //                    Since it appears that Unicode support is going in the general
1158 //                    direction of the use of UTF-8 locales, that is the approach
1159 //                    that is used here.
1160 //
1161 //----------------------------------------------------------------------------------------
1162 void  UnixConvert() {
1163     int    line;
1164
1165     UConverter   *cvrtr;    // An ICU code page converter.
1166     UErrorCode    status = U_ZERO_ERROR;
1167
1168
1169     cvrtr = ucnv_open("utf-8", &status);    // we are just doing UTF-8 locales for now.
1170     if (U_FAILURE(status)) {
1171         fprintf(stderr, "ICU Converter open failed.: %s\n", u_errorName(status));
1172         exit(-1);
1173     }
1174
1175     for (line=0; line < gNumFileLines; line++) {
1176         int sizeNeeded = ucnv_fromUChars(cvrtr,
1177                                          0,            // ptr to target buffer.
1178                                          0,            // length of target buffer.
1179                                          gFileLines[line].name,
1180                                          -1,           //  source is null terminated
1181                                          &status);
1182         if (status != U_BUFFER_OVERFLOW_ERROR && status != U_ZERO_ERROR) {
1183             //fprintf(stderr, "Conversion from Unicode, something is wrong.\n");
1184             //exit(-1);
1185         }
1186         status = U_ZERO_ERROR;
1187         gFileLines[line].unixName = new char[sizeNeeded+1];
1188         sizeNeeded = ucnv_fromUChars(cvrtr,
1189                                          gFileLines[line].unixName, // ptr to target buffer.
1190                                          sizeNeeded+1, // length of target buffer.
1191                                          gFileLines[line].name,
1192                                          -1,           //  source is null terminated
1193                                          &status);
1194         if (U_FAILURE(status)) {
1195             fprintf(stderr, "ICU Conversion Failed.: %d\n", status);
1196             exit(-1);
1197         }
1198         gFileLines[line].unixName[sizeNeeded] = 0;
1199     };
1200     ucnv_close(cvrtr);
1201 }
1202
1203
1204 //----------------------------------------------------------------------------------------
1205 //
1206 //  class UCharFile   Class to hide all the gorp to read a file in
1207 //                    and produce a stream of UChars.
1208 //
1209 //----------------------------------------------------------------------------------------
1210 class UCharFile {
1211 public:
1212     UCharFile(const char *fileName);
1213     ~UCharFile();
1214     UChar   get();
1215     UBool   eof() {return fEof;};
1216     UBool   error() {return fError;};
1217
1218 private:
1219     UCharFile (const UCharFile & /*other*/) {};                         // No copy constructor.
1220     UCharFile & operator = (const UCharFile &/*other*/) {return *this;};   // No assignment op
1221
1222     FILE         *fFile;
1223     const char   *fName;
1224     UBool        fEof;
1225     UBool        fError;
1226     UChar        fPending2ndSurrogate;
1227
1228     enum {UTF16LE, UTF16BE, UTF8} fEncoding;
1229 };
1230
1231 UCharFile::UCharFile(const char * fileName) {
1232     fEof                 = FALSE;
1233     fError               = FALSE;
1234     fName                = fileName;
1235     fFile                = fopen(fName, "rb");
1236     fPending2ndSurrogate = 0;
1237     if (fFile == NULL) {
1238         fprintf(stderr, "Can not open file \"%s\"\n", opt_fName);
1239         fError = TRUE;
1240         return;
1241     }
1242     //
1243     //  Look for the byte order mark at the start of the file.
1244     //
1245     int BOMC1, BOMC2, BOMC3;
1246     BOMC1 = fgetc(fFile);
1247     BOMC2 = fgetc(fFile);
1248
1249     if (BOMC1 == 0xff && BOMC2 == 0xfe) {
1250         fEncoding = UTF16LE; }
1251     else if (BOMC1 == 0xfe && BOMC2 == 0xff) {
1252         fEncoding = UTF16BE; }
1253     else if (BOMC1 == 0xEF && BOMC2 == 0xBB && (BOMC3 = fgetc(fFile)) == 0xBF ) {
1254         fEncoding = UTF8; }
1255     else
1256     {
1257         fprintf(stderr, "collperf:  file \"%s\" encoding must be UTF-8 or UTF-16, and "
1258             "must include a BOM.\n", fileName);
1259         fError = true;
1260         return;
1261     }
1262 }
1263
1264
1265 UCharFile::~UCharFile() {
1266     fclose(fFile);
1267 }
1268
1269
1270
1271 UChar UCharFile::get() {
1272     UChar   c;
1273     switch (fEncoding) {
1274     case UTF16LE:
1275         {
1276             int  cL, cH;
1277             cL = fgetc(fFile);
1278             cH = fgetc(fFile);
1279             c  = cL  | (cH << 8);
1280             if (cH == EOF) {
1281                 c   = 0;
1282                 fEof = TRUE;
1283             }
1284             break;
1285         }
1286     case UTF16BE:
1287         {
1288             int  cL, cH;
1289             cH = fgetc(fFile);
1290             cL = fgetc(fFile);
1291             c  = cL  | (cH << 8);
1292             if (cL == EOF) {
1293                 c   = 0;
1294                 fEof = TRUE;
1295             }
1296             break;
1297         }
1298     case UTF8:
1299         {
1300             if (fPending2ndSurrogate != 0) {
1301                 c = fPending2ndSurrogate;
1302                 fPending2ndSurrogate = 0;
1303                 break;
1304             }
1305
1306             int ch = fgetc(fFile);   // Note:  c and ch are separate cause eof test doesn't work on UChar type.
1307             if (ch == EOF) {
1308                 c = 0;
1309                 fEof = TRUE;
1310                 break;
1311             }
1312
1313             if (ch <= 0x7f) {
1314                 // It's ascii.  No further utf-8 conversion.
1315                 c = ch;
1316                 break;
1317             }
1318
1319             // Figure out the lenght of the char and read the rest of the bytes
1320             //   into a temp array.
1321             int nBytes;
1322             if (ch >= 0xF0) {nBytes=4;}
1323             else if (ch >= 0xE0) {nBytes=3;}
1324             else if (ch >= 0xC0) {nBytes=2;}
1325             else {
1326                 fprintf(stderr, "utf-8 encoded file contains corrupt data.\n");
1327                 fError = TRUE;
1328                 return 0;
1329             }
1330
1331             unsigned char  bytes[10];
1332             bytes[0] = (unsigned char)ch;
1333             int i;
1334             for (i=1; i<nBytes; i++) {
1335                 bytes[i] = fgetc(fFile);
1336                 if (bytes[i] < 0x80 || bytes[i] >= 0xc0) {
1337                     fprintf(stderr, "utf-8 encoded file contains corrupt data.\n");
1338                     fError = TRUE;
1339                     return 0;
1340                 }
1341             }
1342
1343             // Convert the bytes from the temp array to a Unicode char.
1344             i = 0;
1345             uint32_t  cp;
1346             U8_NEXT_UNSAFE(bytes, i, cp);
1347             c = (UChar)cp;
1348
1349             if (cp >= 0x10000) {
1350                 // The code point needs to be broken up into a utf-16 surrogate pair.
1351                 //  Process first half this time through the main loop, and
1352                 //   remember the other half for the next time through.
1353                 UChar utf16Buf[3];
1354                 i = 0;
1355                 UTF16_APPEND_CHAR_UNSAFE(utf16Buf, i, cp);
1356                 fPending2ndSurrogate = utf16Buf[1];
1357                 c = utf16Buf[0];
1358             }
1359             break;
1360         };
1361     default:
1362         c = 0xFFFD; /* Error, unspecified codepage*/
1363         fprintf(stderr, "UCharFile: Error: unknown fEncoding\n");
1364         exit(1);
1365     }
1366     return c;
1367 }
1368
1369 //----------------------------------------------------------------------------------------
1370 //
1371 //   openRulesCollator  - Command line specified a rules file.  Read it in
1372 //                        and open a collator with it.
1373 //
1374 //----------------------------------------------------------------------------------------
1375 UCollator *openRulesCollator() {
1376     UCharFile f(opt_rules);
1377     if (f.error()) {
1378         return 0;
1379     }
1380
1381     int  bufLen = 10000;
1382     UChar *buf = (UChar *)malloc(bufLen * sizeof(UChar));
1383     UChar *tmp;
1384     int i = 0;
1385
1386     for(;;) {
1387         buf[i] = f.get();
1388         if (f.eof()) {
1389             break;
1390         }
1391         if (f.error()) {
1392             return 0;
1393         }
1394         i++;
1395         if (i >= bufLen) {
1396             tmp = buf;
1397             bufLen += 10000;
1398             buf = (UChar *)realloc(buf, bufLen);
1399             if (buf == NULL) {
1400                 free(tmp);
1401                 return 0;
1402             }
1403         }
1404     }
1405     buf[i] = 0;
1406
1407     UErrorCode    status = U_ZERO_ERROR;
1408     UCollator *coll = ucol_openRules(buf, u_strlen(buf), UCOL_OFF,
1409                                          UCOL_DEFAULT_STRENGTH, NULL, &status);
1410     if (U_FAILURE(status)) {
1411         fprintf(stderr, "ICU ucol_openRules() open failed.: %d\n", status);
1412         return 0;
1413     }
1414     free(buf);
1415     return coll;
1416 }
1417
1418
1419
1420
1421
1422 //----------------------------------------------------------------------------------------
1423 //
1424 //    Main   --  process command line, read in and pre-process the test file,
1425 //                 call other functions to do the actual tests.
1426 //
1427 //----------------------------------------------------------------------------------------
1428 int main(int argc, const char** argv) {
1429     if (ProcessOptions(argc, argv, opts) != TRUE || opt_help || opt_fName == 0) {
1430         printf(gUsageString);
1431         exit (1);
1432     }
1433
1434     // Make sure that we've only got one API selected.
1435     if (opt_unix || opt_win) opt_icu = FALSE;
1436     if (opt_unix) opt_win = FALSE;
1437
1438     //
1439     //  Set up an ICU collator
1440     //
1441     UErrorCode          status = U_ZERO_ERROR;
1442
1443     if (opt_rules != 0) {
1444         gCol = openRulesCollator();
1445         if (gCol == 0) {return -1;}
1446     }
1447     else {
1448         gCol = ucol_open(opt_locale, &status);
1449         if (U_FAILURE(status)) {
1450             fprintf(stderr, "Collator creation failed.: %d\n", status);
1451             return -1;
1452         }
1453     }
1454     if (status==U_USING_DEFAULT_WARNING && opt_terse==FALSE) {
1455         fprintf(stderr, "Warning, U_USING_DEFAULT_WARNING for %s\n", opt_locale);
1456     }
1457     if (status==U_USING_FALLBACK_WARNING && opt_terse==FALSE) {
1458         fprintf(stderr, "Warning, U_USING_FALLBACK_ERROR for %s\n", opt_locale);
1459     }
1460
1461     if (opt_norm) {
1462         ucol_setAttribute(gCol, UCOL_NORMALIZATION_MODE, UCOL_ON, &status);
1463     }
1464     if (opt_french && opt_frenchoff) {
1465         fprintf(stderr, "collperf:  Error, specified both -french and -frenchoff options.");
1466         exit(-1);
1467     }
1468     if (opt_french) {
1469         ucol_setAttribute(gCol, UCOL_FRENCH_COLLATION, UCOL_ON, &status);
1470     }
1471     if (opt_frenchoff) {
1472         ucol_setAttribute(gCol, UCOL_FRENCH_COLLATION, UCOL_OFF, &status);
1473     }
1474     if (opt_lower) {
1475         ucol_setAttribute(gCol, UCOL_CASE_FIRST, UCOL_LOWER_FIRST, &status);
1476     }
1477     if (opt_upper) {
1478         ucol_setAttribute(gCol, UCOL_CASE_FIRST, UCOL_UPPER_FIRST, &status);
1479     }
1480     if (opt_case) {
1481         ucol_setAttribute(gCol, UCOL_CASE_LEVEL, UCOL_ON, &status);
1482     }
1483     if (opt_shifted) {
1484         ucol_setAttribute(gCol, UCOL_ALTERNATE_HANDLING, UCOL_SHIFTED, &status);
1485     }
1486     if (opt_level != 0) {
1487         switch (opt_level) {
1488         case 1:
1489             ucol_setAttribute(gCol, UCOL_STRENGTH, UCOL_PRIMARY, &status);
1490             break;
1491         case 2:
1492             ucol_setAttribute(gCol, UCOL_STRENGTH, UCOL_SECONDARY, &status);
1493             break;
1494         case 3:
1495             ucol_setAttribute(gCol, UCOL_STRENGTH, UCOL_TERTIARY, &status);
1496             break;
1497         case 4:
1498             ucol_setAttribute(gCol, UCOL_STRENGTH, UCOL_QUATERNARY, &status);
1499             break;
1500         case 5:
1501             ucol_setAttribute(gCol, UCOL_STRENGTH, UCOL_IDENTICAL, &status);
1502             break;
1503         default:
1504             fprintf(stderr, "-level param must be between 1 and 5\n");
1505             exit(-1);
1506         }
1507     }
1508
1509     if (U_FAILURE(status)) {
1510         fprintf(stderr, "Collator attribute setting failed.: %d\n", status);
1511         return -1;
1512     }
1513
1514
1515     //
1516     //  Set up a Windows LCID
1517     //
1518     if (opt_langid != 0) {
1519         gWinLCID = MAKELCID(opt_langid, SORT_DEFAULT);
1520     }
1521     else {
1522         gWinLCID = uloc_getLCID(opt_locale);
1523     }
1524
1525
1526     //
1527     //  Set the UNIX locale
1528     //
1529     if (opt_unix) {
1530         if (setlocale(LC_ALL, opt_locale) == 0) {
1531             fprintf(stderr, "setlocale(LC_ALL, %s) failed.\n", opt_locale);
1532             exit(-1);
1533         }
1534     }
1535
1536     // Read in  the input file.
1537     //   File assumed to be utf-16.
1538     //   Lines go onto heap buffers.  Global index array to line starts is created.
1539     //   Lines themselves are null terminated.
1540     //
1541
1542     UCharFile f(opt_fName);
1543     if (f.error()) {
1544         exit(-1);
1545     }
1546
1547     const int MAXLINES = 100000;
1548     gFileLines = new Line[MAXLINES];
1549     UChar buf[1024];
1550     int   column = 0;
1551
1552     //  Read the file, split into lines, and save in memory.
1553     //  Loop runs once per utf-16 value from the input file,
1554     //    (The number of bytes read from file per loop iteration depends on external encoding.)
1555     for (;;) {
1556
1557         UChar c = f.get();
1558         if (f.error()){
1559             exit(-1);
1560         }
1561
1562
1563         // We now have a good UTF-16 value in c.
1564
1565         // Watch for CR, LF, EOF; these finish off a line.
1566         if (c == 0xd) {
1567             continue;
1568         }
1569
1570         if (f.eof() || c == 0x0a || c==0x2028) {  // Unipad inserts 2028 line separators!
1571             buf[column++] = 0;
1572             if (column > 1) {
1573                 gFileLines[gNumFileLines].name  = new UChar[column];
1574                 gFileLines[gNumFileLines].len   = column-1;
1575                 memcpy(gFileLines[gNumFileLines].name, buf, column * sizeof(UChar));
1576                 gNumFileLines++;
1577                 column = 0;
1578                 if (gNumFileLines >= MAXLINES) {
1579                     fprintf(stderr, "File too big.  Max number of lines is %d\n", MAXLINES);
1580                     exit(-1);
1581                 }
1582
1583             }
1584             if (c == 0xa || c == 0x2028)
1585                 continue;
1586             else
1587                 break;  // EOF
1588         }
1589         buf[column++] = c;
1590         if (column >= 1023)
1591         {
1592             static UBool warnFlag = TRUE;
1593             if (warnFlag) {
1594                 fprintf(stderr, "Warning - file line longer than 1023 chars truncated.\n");
1595                 warnFlag = FALSE;
1596             }
1597             column--;
1598         }
1599     }
1600
1601     if (opt_terse == FALSE) {
1602         printf("file \"%s\", %d lines.\n", opt_fName, gNumFileLines);
1603     }
1604
1605
1606     // Convert the lines to the UNIX encoding.
1607     if (opt_unix) {
1608         UnixConvert();
1609     }
1610
1611     //
1612     //  Pre-compute ICU sort keys for the lines of the file.
1613     //
1614     int line;
1615     int32_t t;
1616
1617     for (line=0; line<gNumFileLines; line++) {
1618          t = ucol_getSortKey(gCol, gFileLines[line].name, -1, (unsigned char *)buf, sizeof(buf));
1619          gFileLines[line].icuSortKey  = new char[t];
1620
1621          if (t > (int32_t)sizeof(buf)) {
1622              t = ucol_getSortKey(gCol, gFileLines[line].name, -1, (unsigned char *)gFileLines[line].icuSortKey , t);
1623          }
1624          else
1625          {
1626              memcpy(gFileLines[line].icuSortKey, buf, t);
1627          }
1628     }
1629
1630
1631
1632     //
1633     //  Pre-compute Windows sort keys for the lines of the file.
1634     //
1635     for (line=0; line<gNumFileLines; line++) {
1636          t=LCMapStringW(gWinLCID, LCMAP_SORTKEY, gFileLines[line].name, -1, buf, sizeof(buf));
1637          gFileLines[line].winSortKey  = new char[t];
1638          if (t > (int32_t)sizeof(buf)) {
1639              t = LCMapStringW(gWinLCID, LCMAP_SORTKEY, gFileLines[line].name, -1, (UChar *)(gFileLines[line].winSortKey), t);
1640          }
1641          else
1642          {
1643              memcpy(gFileLines[line].winSortKey, buf, t);
1644          }
1645     }
1646
1647     //
1648     //  Pre-compute UNIX sort keys for the lines of the file.
1649     //
1650     if (opt_unix) {
1651         for (line=0; line<gNumFileLines; line++) {
1652             t=strxfrm((char *)buf,  gFileLines[line].unixName,  sizeof(buf));
1653             gFileLines[line].unixSortKey  = new char[t];
1654             if (t > (int32_t)sizeof(buf)) {
1655                 t = strxfrm(gFileLines[line].unixSortKey,  gFileLines[line].unixName,  sizeof(buf));
1656             }
1657             else
1658             {
1659                 memcpy(gFileLines[line].unixSortKey, buf, t);
1660             }
1661         }
1662     }
1663
1664
1665     //
1666     //  Dump file lines, CEs, Sort Keys if requested.
1667     //
1668     if (opt_dump) {
1669         int  i;
1670         for (line=0; line<gNumFileLines; line++) {
1671             for (i=0;;i++) {
1672                 UChar  c = gFileLines[line].name[i];
1673                 if (c == 0)
1674                     break;
1675                 if (c < 0x20 || c > 0x7e) {
1676                     printf("\\u%.4x", c);
1677                 }
1678                 else {
1679                     printf("%c", c);
1680                 }
1681             }
1682             printf("\n");
1683
1684             printf("   CEs: ");
1685             UCollationElements *CEiter = ucol_openElements(gCol, gFileLines[line].name, -1, &status);
1686             int32_t ce;
1687             i = 0;
1688             for (;;) {
1689                 ce = ucol_next(CEiter, &status);
1690                 if (ce == UCOL_NULLORDER) {
1691                     break;
1692                 }
1693                 printf(" %.8x", ce);
1694                 if (++i > 8) {
1695                     printf("\n        ");
1696                     i = 0;
1697                 }
1698             }
1699             printf("\n");
1700             ucol_closeElements(CEiter);
1701
1702
1703             printf("   ICU Sort Key: ");
1704             for (i=0; ; i++) {
1705                 unsigned char c = gFileLines[line].icuSortKey[i];
1706                 printf("%02x ", c);
1707                 if (c == 0) {
1708                     break;
1709                 }
1710                 if (i > 0 && i % 20 == 0) {
1711                     printf("\n                 ");
1712                 }
1713            }
1714             printf("\n");
1715         }
1716     }
1717
1718
1719     //
1720     //  Pre-sort the lines.
1721     //
1722     int i;
1723     gSortedLines = new Line *[gNumFileLines];
1724     for (i=0; i<gNumFileLines; i++) {
1725         gSortedLines[i] = &gFileLines[i];
1726     }
1727
1728     if (opt_win) {
1729         qsort(gSortedLines, gNumFileLines, sizeof(Line *), Winstrcmp);
1730     }
1731     else if (opt_unix) {
1732         qsort(gSortedLines, gNumFileLines, sizeof(Line *), UNIXstrcmp);
1733     }
1734     else   /* ICU */
1735     {
1736         qsort(gSortedLines, gNumFileLines, sizeof(Line *), ICUstrcmp);
1737     }
1738
1739
1740     //
1741     //  Make up a randomized order, will be used for sorting tests.
1742     //
1743     gRandomLines = new Line *[gNumFileLines];
1744     for (i=0; i<gNumFileLines; i++) {
1745         gRandomLines[i] = &gFileLines[i];
1746     }
1747     qsort(gRandomLines, gNumFileLines, sizeof(Line *), ICURandomCmp);
1748
1749
1750
1751
1752     //
1753     //  We've got the file read into memory.  Go do something with it.
1754     //
1755
1756     if (opt_qsort)     doQSort();
1757     if (opt_binsearch) doBinarySearch();
1758     if (opt_keygen)    doKeyGen();
1759     if (opt_keyhist)   doKeyHist();
1760     if (opt_itertest)  doIterTest();
1761
1762     return 0;
1763
1764 }