icuSources/test/perf/ubrkperf/ubrkperfold.cpp

   1 /***********************************************************************
   2  * © 2016 and later: Unicode, Inc. and others.
   3  * License & terms of use: http://www.unicode.org/copyright.html#License
   4  *
   5  ***********************************************************************
   6  ***********************************************************************
   7  * COPYRIGHT:
   8  * Copyright (C) 2001-2012 IBM, Inc.   All Rights Reserved.
   9  *
  10  ***********************************************************************/
  11 /********************************************************************************
  12 *
  13 * File ubrkperf.cpp
  14 *
  15 * Modification History:
  16 *        Name                     Description
  17 *     Vladimir Weinstein          First Version, based on collperf
  18 *
  19 *********************************************************************************
  20 */
  21
  22 //
  23 //  This program tests break iterator performance
  24 //      Currently we test only ICU APIs with the future possibility of testing *nix & win32 APIs
  25 //      (if any)
  26 //      A text file is required as input.  It must be in utf-8 or utf-16 format,
  27 //      and include a byte order mark.  Either LE or BE format is OK.
  28 //
  29
  30 const char gUsageString[] =
  31  "usage:  ubrkperf options...\n"
  32     "-help                      Display this message.\n"
  33     "-file file_name            utf-16/utf-8 format file.\n"
  34     "-locale name               ICU locale to use.  Default is en_US\n"
  35     "-langid 0x1234             Windows Language ID number.  Default to value for -locale option\n"
  36     "                              see http://msdn.microsoft.com/library/psdk/winbase/nls_8xo3.htm\n"
  37     "-win                       Run test using Windows native services. (currently not working) (ICU is default)\n"
  38     "-unix                      Run test using Unix word breaking services. (currently not working) \n"
  39     "-mac                       Run test using MacOSX word breaking services.\n"
  40     "-uselen                    Use API with string lengths.  Default is null-terminated strings\n"
  41     "-char                      Use character break iterator\n"
  42     "-word                      Use word break iterator\n"
  43     "-line                      Use line break iterator\n"
  44     "-sentence                  Use sentence break iterator\n"
  45     "-loop nnnn                 Loopcount for test.  Adjust for reasonable total running time.\n"
  46     "-iloop n                   Inner Loop Count.  Default = 1.  Number of calls to function\n"
  47     "                               under test at each call point.  For measuring test overhead.\n"
  48     "-terse                     Terse numbers-only output.  Intended for use by scripts.\n"
  49     "-dump                      Display stuff.\n"
  50     "-capi                      Use C APIs instead of C++ APIs (currently not working)\n"
  51     "-next                      Do the next test\n"
  52     "-isBound                   Do the isBound test\n"
  53     ;
  54
  55
  56 #include <stdio.h>
  57 #include <string.h>
  58 #include <stdlib.h>
  59 #include <math.h>
  60 #include <locale.h>
  61 #include <errno.h>
  62 #include <sys/stat.h>
  63
  64 #include <unicode/utypes.h>
  65 #include <unicode/ucol.h>
  66 #include <unicode/ucoleitr.h>
  67 #include <unicode/uloc.h>
  68 #include <unicode/ustring.h>
  69 #include <unicode/ures.h>
  70 #include <unicode/uchar.h>
  71 #include <unicode/ucnv.h>
  72 #include <unicode/utf8.h>
  73
  74 #include <unicode/brkiter.h>
  75
  76
  77 #if U_PLATFORM_HAS_WIN32_API
  78 #include <windows.h>
  79 #else
  80 //
  81 //  Stubs for Windows API functions when building on UNIXes.
  82 //
  83 #include <sys/time.h>
  84 unsigned long timeGetTime() {
  85     struct timeval t;
  86     gettimeofday(&t, 0);
  87     unsigned long val = t.tv_sec * 1000;  // Let it overflow.  Who cares.
  88     val += t.tv_usec / 1000;
  89     return val;
  90 };
  91 #define MAKELCID(a,b) 0
  92 #endif
  93
  94
  95 //
  96 //  Command line option variables
  97 //     These global variables are set according to the options specified
  98 //     on the command line by the user.
  99 char * opt_fName      = 0;
 100 char * opt_locale     = "en_US";
 101 int    opt_langid     = 0;         // Defaults to value corresponding to opt_locale.
 102 char * opt_rules      = 0;
 103 UBool  opt_help       = FALSE;
 104 int    opt_time       = 0;
 105 int    opt_loopCount  = 0;
 106 int    opt_passesCount= 1;
 107 UBool  opt_terse      = FALSE;
 108 UBool  opt_icu        = TRUE;
 109 UBool  opt_win        = FALSE;      // Run with Windows native functions.
 110 UBool  opt_unix       = FALSE;      // Run with UNIX strcoll, strxfrm functions.
 111 UBool  opt_mac        = FALSE;      // Run with MacOSX word break services.
 112 UBool  opt_uselen     = FALSE;
 113 UBool  opt_dump       = FALSE;
 114 UBool  opt_char       = FALSE;
 115 UBool  opt_word       = FALSE;
 116 UBool  opt_line       = FALSE;
 117 UBool  opt_sentence   = FALSE;
 118 UBool  opt_capi       = FALSE;
 119
 120 UBool  opt_next       = FALSE;
 121 UBool  opt_isBound    = FALSE;
 122
 123
 124
 125 //
 126 //   Definitions for the command line options
 127 //
 128 struct OptSpec {
 129     const char *name;
 130     enum {FLAG, NUM, STRING} type;
 131     void *pVar;
 132 };
 133
 134 OptSpec opts[] = {
 135     {"-file",        OptSpec::STRING, &opt_fName},
 136     {"-locale",      OptSpec::STRING, &opt_locale},
 137     {"-langid",      OptSpec::NUM,    &opt_langid},
 138     {"-win",         OptSpec::FLAG,   &opt_win},
 139     {"-unix",        OptSpec::FLAG,   &opt_unix},
 140     {"-mac",         OptSpec::FLAG,   &opt_mac},
 141     {"-uselen",      OptSpec::FLAG,   &opt_uselen},
 142     {"-loop",        OptSpec::NUM,    &opt_loopCount},
 143     {"-time",        OptSpec::NUM,    &opt_time},
 144     {"-passes",      OptSpec::NUM,    &opt_passesCount},
 145     {"-char",        OptSpec::FLAG,   &opt_char},
 146     {"-word",        OptSpec::FLAG,   &opt_word},
 147     {"-line",        OptSpec::FLAG,   &opt_line},
 148     {"-sentence",    OptSpec::FLAG,   &opt_sentence},
 149     {"-terse",       OptSpec::FLAG,   &opt_terse},
 150     {"-dump",        OptSpec::FLAG,   &opt_dump},
 151     {"-capi",        OptSpec::FLAG,   &opt_capi},
 152     {"-next",        OptSpec::FLAG,   &opt_next},
 153     {"-isBound",     OptSpec::FLAG,   &opt_isBound},
 154     {"-help",        OptSpec::FLAG,   &opt_help},
 155     {"-?",           OptSpec::FLAG,   &opt_help},
 156     {0, OptSpec::FLAG, 0}
 157 };
 158
 159
 160 //---------------------------------------------------------------------------
 161 //
 162 //  Global variables pointing to and describing the test file
 163 //
 164 //---------------------------------------------------------------------------
 165
 166 //DWORD          gWinLCID;
 167 BreakIterator *brkit = NULL;
 168 UChar *text = NULL;
 169 int32_t textSize = 0;
 170
 171
 172
 173 #if U_PLATFORM_IS_DARWIN_BASED
 174 #include <ApplicationServices/ApplicationServices.h>
 175 enum{
 176   kUCTextBreakAllMask = (kUCTextBreakClusterMask | kUCTextBreakWordMask | kUCTextBreakLineMask)
 177     };
 178 UCTextBreakType breakTypes[4] = {kUCTextBreakCharMask, kUCTextBreakClusterMask, kUCTextBreakWordMask, kUCTextBreakLineMask};
 179 TextBreakLocatorRef breakRef;
 180 UCTextBreakType macBreakType;
 181
 182 void createMACBrkIt() {
 183   OSStatus status = noErr;
 184   LocaleRef lref;
 185   status = LocaleRefFromLocaleString(opt_locale, &lref);
 186   status = UCCreateTextBreakLocator(lref, 0, kUCTextBreakAllMask, (TextBreakLocatorRef*)&breakRef);
 187   if(opt_char == TRUE) {
 188     macBreakType = kUCTextBreakClusterMask;
 189   } else if(opt_word == TRUE) {
 190     macBreakType = kUCTextBreakWordMask;
 191   } else if(opt_line == TRUE) {
 192     macBreakType = kUCTextBreakLineMask;
 193   } else if(opt_sentence == TRUE) {
 194     // error
 195     // brkit = BreakIterator::createSentenceInstance(opt_locale, status);
 196   } else {
 197     // default is character iterator
 198     macBreakType = kUCTextBreakClusterMask;
 199       }
 200 }
 201 #endif
 202
 203 void createICUBrkIt() {
 204   //
 205   //  Set up an ICU break iterator
 206   //
 207   UErrorCode          status = U_ZERO_ERROR;
 208   if(opt_char == TRUE) {
 209     brkit = BreakIterator::createCharacterInstance(opt_locale, status);
 210   } else if(opt_word == TRUE) {
 211     brkit = BreakIterator::createWordInstance(opt_locale, status);
 212   } else if(opt_line == TRUE) {
 213     brkit = BreakIterator::createLineInstance(opt_locale, status);
 214   } else if(opt_sentence == TRUE) {
 215     brkit = BreakIterator::createSentenceInstance(opt_locale, status);
 216   } else {
 217     // default is character iterator
 218     brkit = BreakIterator::createCharacterInstance(opt_locale, status);
 219   }
 220   if (status==U_USING_DEFAULT_WARNING && opt_terse==FALSE) {
 221     fprintf(stderr, "Warning, U_USING_DEFAULT_WARNING for %s\n", opt_locale);
 222   }
 223   if (status==U_USING_FALLBACK_WARNING && opt_terse==FALSE) {
 224     fprintf(stderr, "Warning, U_USING_FALLBACK_ERROR for %s\n", opt_locale);
 225   }
 226
 227 }
 228
 229 //---------------------------------------------------------------------------
 230 //
 231 //  ProcessOptions()    Function to read the command line options.
 232 //
 233 //---------------------------------------------------------------------------
 234 UBool ProcessOptions(int argc, const char **argv, OptSpec opts[])
 235 {
 236     int         i;
 237     int         argNum;
 238     const char  *pArgName;
 239     OptSpec    *pOpt;
 240
 241     for (argNum=1; argNum<argc; argNum++) {
 242         pArgName = argv[argNum];
 243         for (pOpt = opts;  pOpt->name != 0; pOpt++) {
 244             if (strcmp(pOpt->name, pArgName) == 0) {
 245                 switch (pOpt->type) {
 246                 case OptSpec::FLAG:
 247                     *(UBool *)(pOpt->pVar) = TRUE;
 248                     break;
 249                 case OptSpec::STRING:
 250                     argNum ++;
 251                     if (argNum >= argc) {
 252                         fprintf(stderr, "value expected for \"%s\" option.\n", pOpt->name);
 253                         return FALSE;
 254                     }
 255                     *(const char **)(pOpt->pVar)  = argv[argNum];
 256                     break;
 257                 case OptSpec::NUM:
 258                     argNum ++;
 259                     if (argNum >= argc) {
 260                         fprintf(stderr, "value expected for \"%s\" option.\n", pOpt->name);
 261                         return FALSE;
 262                     }
 263                     char *endp;
 264                     i = strtol(argv[argNum], &endp, 0);
 265                     if (endp == argv[argNum]) {
 266                         fprintf(stderr, "integer value expected for \"%s\" option.\n", pOpt->name);
 267                         return FALSE;
 268                     }
 269                     *(int *)(pOpt->pVar) = i;
 270                 }
 271                 break;
 272             }
 273         }
 274         if (pOpt->name == 0)
 275         {
 276             fprintf(stderr, "Unrecognized option \"%s\"\n", pArgName);
 277             return FALSE;
 278         }
 279     }
 280 return TRUE;
 281 }
 282
 283
 284 void doForwardTest() {
 285   if (opt_terse == FALSE) {
 286     printf("Doing the forward test\n");
 287   }
 288   int32_t noBreaks = 0;
 289   int32_t i = 0;
 290   unsigned long startTime = timeGetTime();
 291   unsigned long elapsedTime = 0;
 292   if(opt_icu) {
 293     createICUBrkIt();
 294     brkit->setText(UnicodeString(text, textSize));
 295     brkit->first();
 296     if (opt_terse == FALSE) {
 297       printf("Warmup\n");
 298     }
 299     int j;
 300     while((j = brkit->next()) != BreakIterator::DONE) {
 301       noBreaks++;
 302       //fprintf(stderr, "%d ", j);
 303     }
 304
 305     if (opt_terse == FALSE) {
 306       printf("Measure\n");
 307     }
 308     startTime = timeGetTime();
 309     for(i = 0; i < opt_loopCount; i++) {
 310       brkit->first();
 311       while(brkit->next() != BreakIterator::DONE) {
 312       }
 313     }
 314
 315     elapsedTime = timeGetTime()-startTime;
 316   } else if(opt_mac) {
 317 #if U_PLATFORM_IS_DARWIN_BASED
 318     createMACBrkIt();
 319     UniChar* filePtr = text;
 320     OSStatus status = noErr;
 321     UniCharCount startOffset = 0, breakOffset = 0, numUniChars = textSize;
 322     startOffset = 0;
 323     //printf("\t---Search forward--\n");
 324
 325     while (startOffset < numUniChars)
 326     {
 327         status = UCFindTextBreak(breakRef, macBreakType, kUCTextBreakLeadingEdgeMask, filePtr, numUniChars,
 328                                startOffset, &breakOffset);
 329       //require_action(status == noErr, EXIT, printf( "**UCFindTextBreak failed: startOffset %d, status %d\n", (int)startOffset, (int)status));
 330       //require_action((breakOffset <= numUniChars),EXIT, printf("**UCFindTextBreak breakOffset too big: startOffset %d, breakOffset %d\n", (int)startOffset, (int)breakOffset));
 331
 332       // Output break
 333       //printf("\t%d\n", (int)breakOffset);
 334
 335       // Increment counters
 336         noBreaks++;
 337       startOffset = breakOffset;
 338     }
 339     startTime = timeGetTime();
 340     for(i = 0; i < opt_loopCount; i++) {
 341       startOffset = 0;
 342
 343       while (startOffset < numUniChars)
 344         {
 345           status = UCFindTextBreak(breakRef, macBreakType, kUCTextBreakLeadingEdgeMask, filePtr, numUniChars,
 346                                    startOffset, &breakOffset);
 347           // Increment counters
 348           startOffset = breakOffset;
 349         }
 350     }
 351     elapsedTime = timeGetTime()-startTime;
 352     UCDisposeTextBreakLocator(&breakRef);
 353 #endif
 354
 355
 356   }
 357
 358
 359   if (opt_terse == FALSE) {
 360   int32_t loopTime = (int)(float(1000) * ((float)elapsedTime/(float)opt_loopCount));
 361       int32_t timePerCU = (int)(float(1000) * ((float)loopTime/(float)textSize));
 362       int32_t timePerBreak = (int)(float(1000) * ((float)loopTime/(float)noBreaks));
 363       printf("forward break iteration average loop time %d\n", loopTime);
 364       printf("number of code units %d average time per code unit %d\n", textSize, timePerCU);
 365       printf("number of breaks %d average time per break %d\n", noBreaks, timePerBreak);
 366   } else {
 367       printf("time=%d\nevents=%d\nsize=%d\n", elapsedTime, noBreaks, textSize);
 368   }
 369
 370
 371 }
 372
 373 void doIsBoundTest() {
 374   int32_t noBreaks = 0, hit = 0;
 375   int32_t i = 0, j = 0;
 376   unsigned long startTime = timeGetTime();
 377   unsigned long elapsedTime = 0;
 378   createICUBrkIt();
 379   brkit->setText(UnicodeString(text, textSize));
 380   brkit->first();
 381   for(j = 0; j < textSize; j++) {
 382     if(brkit->isBoundary(j)) {
 383       noBreaks++;
 384       //fprintf(stderr, "%d ", j);
 385     }
 386   }
 387   /*
 388   while(brkit->next() != BreakIterator::DONE) {
 389     noBreaks++;
 390   }
 391   */
 392
 393   startTime = timeGetTime();
 394   for(i = 0; i < opt_loopCount; i++) {
 395     for(j = 0; j < textSize; j++) {
 396       if(brkit->isBoundary(j)) {
 397         hit++;
 398       }
 399     }
 400   }
 401
 402   elapsedTime = timeGetTime()-startTime;
 403   int32_t loopTime = (int)(float(1000) * ((float)elapsedTime/(float)opt_loopCount));
 404   if (opt_terse == FALSE) {
 405       int32_t timePerCU = (int)(float(1000) * ((float)loopTime/(float)textSize));
 406       int32_t timePerBreak = (int)(float(1000) * ((float)loopTime/(float)noBreaks));
 407       printf("forward break iteration average loop time %d\n", loopTime);
 408       printf("number of code units %d average time per code unit %d\n", textSize, timePerCU);
 409       printf("number of breaks %d average time per break %d\n", noBreaks, timePerBreak);
 410   } else {
 411       printf("time=%d\nevents=%d\nsize=%d\n", elapsedTime, noBreaks, textSize);
 412   }
 413 }
 414
 415 //----------------------------------------------------------------------------------------
 416 //
 417 //   UnixConvert   -- Convert the lines of the file to the encoding for UNIX
 418 //                    Since it appears that Unicode support is going in the general
 419 //                    direction of the use of UTF-8 locales, that is the approach
 420 //                    that is used here.
 421 //
 422 //----------------------------------------------------------------------------------------
 423 void  UnixConvert() {
 424 #if 0
 425     int    line;
 426
 427     UConverter   *cvrtr;    // An ICU code page converter.
 428     UErrorCode    status = U_ZERO_ERROR;
 429
 430
 431     cvrtr = ucnv_open("utf-8", &status);    // we are just doing UTF-8 locales for now.
 432     if (U_FAILURE(status)) {
 433         fprintf(stderr, "ICU Converter open failed.: %d\n", &status);
 434         exit(-1);
 435     }
 436     // redo for unix
 437     for (line=0; line < gNumFileLines; line++) {
 438         int sizeNeeded = ucnv_fromUChars(cvrtr,
 439                                          0,            // ptr to target buffer.
 440                                          0,            // length of target buffer.
 441                                          gFileLines[line].name,
 442                                          -1,           //  source is null terminated
 443                                          &status);
 444         if (status != U_BUFFER_OVERFLOW_ERROR && status != U_ZERO_ERROR) {
 445             fprintf(stderr, "Conversion from Unicode, something is wrong.\n");
 446             exit(-1);
 447         }
 448         status = U_ZERO_ERROR;
 449         gFileLines[line].unixName = new char[sizeNeeded+1];
 450         sizeNeeded = ucnv_fromUChars(cvrtr,
 451                                          gFileLines[line].unixName, // ptr to target buffer.
 452                                          sizeNeeded+1, // length of target buffer.
 453                                          gFileLines[line].name,
 454                                          -1,           //  source is null terminated
 455                                          &status);
 456         if (U_FAILURE(status)) {
 457             fprintf(stderr, "ICU Conversion Failed.: %d\n", status);
 458             exit(-1);
 459         }
 460         gFileLines[line].unixName[sizeNeeded] = 0;
 461     };
 462     ucnv_close(cvrtr);
 463 #endif
 464 }
 465
 466
 467 //----------------------------------------------------------------------------------------
 468 //
 469 //  class UCharFile   Class to hide all the gorp to read a file in
 470 //                    and produce a stream of UChars.
 471 //
 472 //----------------------------------------------------------------------------------------
 473 class UCharFile {
 474 public:
 475     UCharFile(const char *fileName);
 476     ~UCharFile();
 477     UChar   get();
 478     UBool   eof() {return fEof;};
 479     UBool   error() {return fError;};
 480     int32_t size() { return fFileSize; };
 481
 482 private:
 483     UCharFile (const UCharFile &other) {};                         // No copy constructor.
 484     UCharFile & operator = (const UCharFile &other) {return *this;};   // No assignment op
 485
 486     FILE         *fFile;
 487     const char   *fName;
 488     UBool        fEof;
 489     UBool        fError;
 490     UChar        fPending2ndSurrogate;
 491     int32_t      fFileSize;
 492
 493     enum {UTF16LE, UTF16BE, UTF8} fEncoding;
 494 };
 495
 496 UCharFile::UCharFile(const char * fileName) {
 497     fEof                 = FALSE;
 498     fError               = FALSE;
 499     fName                = fileName;
 500     struct stat buf;
 501     int32_t result = stat(fileName, &buf);
 502     if(result != 0) {
 503       fprintf(stderr, "Error getting info\n");
 504       fFileSize = -1;
 505     } else {
 506       fFileSize = buf.st_size;
 507     }
 508     fFile                = fopen(fName, "rb");
 509     fPending2ndSurrogate = 0;
 510     if (fFile == NULL) {
 511         fprintf(stderr, "Can not open file \"%s\"\n", opt_fName);
 512         fError = TRUE;
 513         return;
 514     }
 515     //
 516     //  Look for the byte order mark at the start of the file.
 517     //
 518     int BOMC1, BOMC2, BOMC3;
 519     BOMC1 = fgetc(fFile);
 520     BOMC2 = fgetc(fFile);
 521
 522     if (BOMC1 == 0xff && BOMC2 == 0xfe) {
 523         fEncoding = UTF16LE; }
 524     else if (BOMC1 == 0xfe && BOMC2 == 0xff) {
 525         fEncoding = UTF16BE; }
 526     else if (BOMC1 == 0xEF && BOMC2 == 0xBB && (BOMC3 = fgetc(fFile)) == 0xBF ) {
 527         fEncoding = UTF8; }
 528     else
 529     {
 530         fprintf(stderr, "collperf:  file \"%s\" encoding must be UTF-8 or UTF-16, and "
 531             "must include a BOM.\n", fileName);
 532         fError = true;
 533         return;
 534     }
 535 }
 536
 537
 538 UCharFile::~UCharFile() {
 539     fclose(fFile);
 540 }
 541
 542
 543
 544 UChar UCharFile::get() {
 545     UChar   c;
 546     switch (fEncoding) {
 547     case UTF16LE:
 548         {
 549             int  cL, cH;
 550             cL = fgetc(fFile);
 551             cH = fgetc(fFile);
 552             c  = cL  | (cH << 8);
 553             if (cH == EOF) {
 554                 c   = 0;
 555                 fEof = TRUE;
 556             }
 557             break;
 558         }
 559     case UTF16BE:
 560         {
 561             int  cL, cH;
 562             cH = fgetc(fFile);
 563             cL = fgetc(fFile);
 564             c  = cL  | (cH << 8);
 565             if (cL == EOF) {
 566                 c   = 0;
 567                 fEof = TRUE;
 568             }
 569             break;
 570         }
 571     case UTF8:
 572         {
 573             if (fPending2ndSurrogate != 0) {
 574                 c = fPending2ndSurrogate;
 575                 fPending2ndSurrogate = 0;
 576                 break;
 577             }
 578
 579             int ch = fgetc(fFile);   // Note:  c and ch are separate cause eof test doesn't work on UChar type.
 580             if (ch == EOF) {
 581                 c = 0;
 582                 fEof = TRUE;
 583                 break;
 584             }
 585
 586             if (ch <= 0x7f) {
 587                 // It's ascii.  No further utf-8 conversion.
 588                 c = ch;
 589                 break;
 590             }
 591
 592             // Figure out the lenght of the char and read the rest of the bytes
 593             //   into a temp array.
 594             int nBytes;
 595             if (ch >= 0xF0) {nBytes=4;}
 596             else if (ch >= 0xE0) {nBytes=3;}
 597             else if (ch >= 0xC0) {nBytes=2;}
 598             else {
 599                 fprintf(stderr, "not likely utf-8 encoded file %s contains corrupt data at offset %d.\n", fName, ftell(fFile));
 600                 fError = TRUE;
 601                 return 0;
 602             }
 603
 604             unsigned char  bytes[10];
 605             bytes[0] = (unsigned char)ch;
 606             int i;
 607             for (i=1; i<nBytes; i++) {
 608                 bytes[i] = fgetc(fFile);
 609                 if (bytes[i] < 0x80 || bytes[i] >= 0xc0) {
 610                     fprintf(stderr, "utf-8 encoded file %s contains corrupt data at offset %d. Expected %d bytes, byte %d is invalid. First byte is %02X\n", fName, ftell(fFile), nBytes, i, ch);
 611                     fError = TRUE;
 612                     return 0;
 613                 }
 614             }
 615
 616             // Convert the bytes from the temp array to a Unicode char.
 617             i = 0;
 618             uint32_t  cp;
 619             U8_NEXT_UNSAFE(bytes, i, cp);
 620             c = (UChar)cp;
 621
 622             if (cp >= 0x10000) {
 623                 // The code point needs to be broken up into a utf-16 surrogate pair.
 624                 //  Process first half this time through the main loop, and
 625                 //   remember the other half for the next time through.
 626                 UChar utf16Buf[3];
 627                 i = 0;
 628                 UTF16_APPEND_CHAR_UNSAFE(utf16Buf, i, cp);
 629                 fPending2ndSurrogate = utf16Buf[1];
 630                 c = utf16Buf[0];
 631             }
 632             break;
 633         };
 634     }
 635     return c;
 636 }
 637
 638
 639 //----------------------------------------------------------------------------------------
 640 //
 641 //    Main   --  process command line, read in and pre-process the test file,
 642 //                 call other functions to do the actual tests.
 643 //
 644 //----------------------------------------------------------------------------------------
 645 int main(int argc, const char** argv) {
 646     if (ProcessOptions(argc, argv, opts) != TRUE || opt_help || opt_fName == 0) {
 647         printf(gUsageString);
 648         exit (1);
 649     }
 650     // Make sure that we've only got one API selected.
 651     if (opt_mac || opt_unix || opt_win) opt_icu = FALSE;
 652     if (opt_mac || opt_unix) opt_win = FALSE;
 653     if (opt_mac) opt_unix = FALSE;
 654
 655     UErrorCode          status = U_ZERO_ERROR;
 656
 657
 658
 659     //
 660     //  Set up a Windows LCID
 661     //
 662   /*
 663     if (opt_langid != 0) {
 664         gWinLCID = MAKELCID(opt_langid, SORT_DEFAULT);
 665     }
 666     else {
 667         gWinLCID = uloc_getLCID(opt_locale);
 668     }
 669   */
 670
 671     //
 672     //  Set the UNIX locale
 673     //
 674     if (opt_unix) {
 675         if (setlocale(LC_ALL, opt_locale) == 0) {
 676             fprintf(stderr, "setlocale(LC_ALL, %s) failed.\n", opt_locale);
 677             exit(-1);
 678         }
 679     }
 680
 681     // Read in  the input file.
 682     //   File assumed to be utf-16.
 683     //   Lines go onto heap buffers.  Global index array to line starts is created.
 684     //   Lines themselves are null terminated.
 685     //
 686
 687     UCharFile f(opt_fName);
 688     if (f.error()) {
 689         exit(-1);
 690     }
 691     int32_t fileSize = f.size();
 692     const int STARTSIZE = 70000;
 693     int32_t bufSize = 0;
 694     int32_t charCount = 0;
 695     if(fileSize != -1) {
 696       text = (UChar *)malloc(fileSize*sizeof(UChar));
 697       bufSize = fileSize;
 698     } else {
 699       text = (UChar *)malloc(STARTSIZE*sizeof(UChar));
 700       bufSize = STARTSIZE;
 701     }
 702     if(text == NULL) {
 703       fprintf(stderr, "Allocating buffer failed\n");
 704       exit(-1);
 705     }
 706
 707
 708     //  Read the file, split into lines, and save in memory.
 709     //  Loop runs once per utf-16 value from the input file,
 710     //    (The number of bytes read from file per loop iteration depends on external encoding.)
 711     for (;;) {
 712
 713         UChar c = f.get();
 714         if(f.eof()) {
 715           break;
 716         }
 717         if (f.error()){
 718           exit(-1);
 719         }
 720         // We now have a good UTF-16 value in c.
 721         text[charCount++] = c;
 722         if(charCount == bufSize) {
 723           text = (UChar *)realloc(text, 2*bufSize*sizeof(UChar));
 724           if(text == NULL) {
 725             fprintf(stderr, "Reallocating buffer failed\n");
 726             exit(-1);
 727           }
 728           bufSize *= 2;
 729         }
 730     }
 731
 732
 733     if (opt_terse == FALSE) {
 734         printf("file \"%s\", %d charCount code units.\n", opt_fName, charCount);
 735     }
 736
 737     textSize = charCount;
 738
 739
 740
 741
 742     //
 743     //  Dump file contents if requested.
 744     //
 745     if (opt_dump) {
 746       // dump file, etc... possibly
 747     }
 748
 749
 750     //
 751     //  We've got the file read into memory.  Go do something with it.
 752     //
 753     int32_t i = 0;
 754     for(i = 0; i < opt_passesCount; i++) {
 755       if(opt_loopCount != 0) {
 756         if(opt_next) {
 757           doForwardTest();
 758         } else if(opt_isBound) {
 759           doIsBoundTest();
 760         } else {
 761           doForwardTest();
 762         }
 763       } else if(opt_time != 0) {
 764
 765       }
 766     }
 767
 768   if(text != NULL) {
 769     free(text);
 770   }
 771     if(brkit != NULL) {
 772       delete brkit;
 773     }
 774
 775     return 0;
 776 }