icuSources/test/perf/ubrkperf/ubrkperfold.cpp

   1 /********************************************************************
   2  * COPYRIGHT:
   3  * Copyright (C) 2001 IBM, Inc.   All Rights Reserved.
   4  *
   5  ********************************************************************/
   6 /********************************************************************************
   7 *
   8 * File ubrkperf.cpp
   9 *
  10 * Modification History:
  11 *        Name                     Description
  12 *     Vladimir Weinstein          First Version, based on collperf
  13 *
  14 *********************************************************************************
  15 */
  16
  17 //
  18 //  This program tests break iterator performance
  19 //      Currently we test only ICU APIs with the future possibility of testing *nix & win32 APIs
  20 //      (if any)
  21 //      A text file is required as input.  It must be in utf-8 or utf-16 format,
  22 //      and include a byte order mark.  Either LE or BE format is OK.
  23 //
  24
  25 const char gUsageString[] =
  26  "usage:  ubrkperf options...\n"
  27     "-help                      Display this message.\n"
  28     "-file file_name            utf-16/utf-8 format file.\n"
  29     "-locale name               ICU locale to use.  Default is en_US\n"
  30     "-langid 0x1234             Windows Language ID number.  Default to value for -locale option\n"
  31     "                              see http://msdn.microsoft.com/library/psdk/winbase/nls_8xo3.htm\n"
  32     "-win                       Run test using Windows native services. (currently not working) (ICU is default)\n"
  33     "-unix                      Run test using Unix word breaking services. (currently not working) \n"
  34     "-mac                       Run test using MacOSX word breaking services.\n"
  35     "-uselen                    Use API with string lengths.  Default is null-terminated strings\n"
  36     "-char                      Use character break iterator\n"
  37     "-word                      Use word break iterator\n"
  38     "-line                      Use line break iterator\n"
  39     "-sentence                  Use sentence break iterator\n"
  40     "-loop nnnn                 Loopcount for test.  Adjust for reasonable total running time.\n"
  41     "-iloop n                   Inner Loop Count.  Default = 1.  Number of calls to function\n"
  42     "                               under test at each call point.  For measuring test overhead.\n"
  43     "-terse                     Terse numbers-only output.  Intended for use by scripts.\n"
  44     "-dump                      Display stuff.\n"
  45     "-capi                      Use C APIs instead of C++ APIs (currently not working)\n"
  46     "-next                      Do the next test\n"
  47     "-isBound                   Do the isBound test\n"
  48     ;
  49
  50
  51 #include <stdio.h>
  52 #include <string.h>
  53 #include <stdlib.h>
  54 #include <math.h>
  55 #include <locale.h>
  56 #include <errno.h>
  57 #include <sys/stat.h>
  58
  59 #include <unicode/utypes.h>
  60 #include <unicode/ucol.h>
  61 #include <unicode/ucoleitr.h>
  62 #include <unicode/uloc.h>
  63 #include <unicode/ustring.h>
  64 #include <unicode/ures.h>
  65 #include <unicode/uchar.h>
  66 #include <unicode/ucnv.h>
  67 #include <unicode/utf8.h>
  68
  69 #include <unicode/brkiter.h>
  70
  71
  72 #ifdef WIN32
  73 #include <windows.h>
  74 #else
  75 //
  76 //  Stubs for Windows API functions when building on UNIXes.
  77 //
  78 #include <sys/time.h>
  79 unsigned long timeGetTime() {
  80     struct timeval t;
  81     gettimeofday(&t, 0);
  82     unsigned long val = t.tv_sec * 1000;  // Let it overflow.  Who cares.
  83     val += t.tv_usec / 1000;
  84     return val;
  85 };
  86 #define MAKELCID(a,b) 0
  87 #endif
  88
  89
  90 //
  91 //  Command line option variables
  92 //     These global variables are set according to the options specified
  93 //     on the command line by the user.
  94 char * opt_fName      = 0;
  95 char * opt_locale     = "en_US";
  96 int    opt_langid     = 0;         // Defaults to value corresponding to opt_locale.
  97 char * opt_rules      = 0;
  98 UBool  opt_help       = FALSE;
  99 int    opt_time       = 0;
 100 int    opt_loopCount  = 0;
 101 int    opt_passesCount= 1;
 102 UBool  opt_terse      = FALSE;
 103 UBool  opt_icu        = TRUE;
 104 UBool  opt_win        = FALSE;      // Run with Windows native functions.
 105 UBool  opt_unix       = FALSE;      // Run with UNIX strcoll, strxfrm functions.
 106 UBool  opt_mac        = FALSE;      // Run with MacOSX word break services.
 107 UBool  opt_uselen     = FALSE;
 108 UBool  opt_dump       = FALSE;
 109 UBool  opt_char       = FALSE;
 110 UBool  opt_word       = FALSE;
 111 UBool  opt_line       = FALSE;
 112 UBool  opt_sentence   = FALSE;
 113 UBool  opt_capi       = FALSE;
 114
 115 UBool  opt_next       = FALSE;
 116 UBool  opt_isBound    = FALSE;
 117
 118
 119
 120 //
 121 //   Definitions for the command line options
 122 //
 123 struct OptSpec {
 124     const char *name;
 125     enum {FLAG, NUM, STRING} type;
 126     void *pVar;
 127 };
 128
 129 OptSpec opts[] = {
 130     {"-file",        OptSpec::STRING, &opt_fName},
 131     {"-locale",      OptSpec::STRING, &opt_locale},
 132     {"-langid",      OptSpec::NUM,    &opt_langid},
 133     {"-win",         OptSpec::FLAG,   &opt_win},
 134     {"-unix",        OptSpec::FLAG,   &opt_unix},
 135     {"-mac",         OptSpec::FLAG,   &opt_mac},
 136     {"-uselen",      OptSpec::FLAG,   &opt_uselen},
 137     {"-loop",        OptSpec::NUM,    &opt_loopCount},
 138     {"-time",        OptSpec::NUM,    &opt_time},
 139     {"-passes",      OptSpec::NUM,    &opt_passesCount},
 140     {"-char",        OptSpec::FLAG,   &opt_char},
 141     {"-word",        OptSpec::FLAG,   &opt_word},
 142     {"-line",        OptSpec::FLAG,   &opt_line},
 143     {"-sentence",    OptSpec::FLAG,   &opt_sentence},
 144     {"-terse",       OptSpec::FLAG,   &opt_terse},
 145     {"-dump",        OptSpec::FLAG,   &opt_dump},
 146     {"-capi",        OptSpec::FLAG,   &opt_capi},
 147     {"-next",        OptSpec::FLAG,   &opt_next},
 148     {"-isBound",     OptSpec::FLAG,   &opt_isBound},
 149     {"-help",        OptSpec::FLAG,   &opt_help},
 150     {"-?",           OptSpec::FLAG,   &opt_help},
 151     {0, OptSpec::FLAG, 0}
 152 };
 153
 154
 155 //---------------------------------------------------------------------------
 156 //
 157 //  Global variables pointing to and describing the test file
 158 //
 159 //---------------------------------------------------------------------------
 160
 161 //DWORD          gWinLCID;
 162 BreakIterator *brkit = NULL;
 163 UChar *text = NULL;
 164 int32_t textSize = 0;
 165
 166
 167
 168 #ifdef U_DARWIN
 169 #include <ApplicationServices/ApplicationServices.h>
 170 enum{
 171   kUCTextBreakAllMask = (kUCTextBreakClusterMask | kUCTextBreakWordMask | kUCTextBreakLineMask)
 172     };
 173 UCTextBreakType breakTypes[4] = {kUCTextBreakCharMask, kUCTextBreakClusterMask, kUCTextBreakWordMask, kUCTextBreakLineMask};
 174 TextBreakLocatorRef breakRef;
 175 UCTextBreakType macBreakType;
 176
 177 void createMACBrkIt() {
 178   OSStatus status = noErr;
 179   LocaleRef lref;
 180   status = LocaleRefFromLocaleString(opt_locale, &lref);
 181   status = UCCreateTextBreakLocator(lref, 0, kUCTextBreakAllMask, (TextBreakLocatorRef*)&breakRef);
 182   if(opt_char == TRUE) {
 183     macBreakType = kUCTextBreakClusterMask;
 184   } else if(opt_word == TRUE) {
 185     macBreakType = kUCTextBreakWordMask;
 186   } else if(opt_line == TRUE) {
 187     macBreakType = kUCTextBreakLineMask;
 188   } else if(opt_sentence == TRUE) {
 189     // error
 190     // brkit = BreakIterator::createSentenceInstance(opt_locale, status);
 191   } else {
 192     // default is character iterator
 193     macBreakType = kUCTextBreakClusterMask;
 194       }
 195 }
 196 #endif
 197
 198 void createICUBrkIt() {
 199   //
 200   //  Set up an ICU break iterator
 201   //
 202   UErrorCode          status = U_ZERO_ERROR;
 203   if(opt_char == TRUE) {
 204     brkit = BreakIterator::createCharacterInstance(opt_locale, status);
 205   } else if(opt_word == TRUE) {
 206     brkit = BreakIterator::createWordInstance(opt_locale, status);
 207   } else if(opt_line == TRUE) {
 208     brkit = BreakIterator::createLineInstance(opt_locale, status);
 209   } else if(opt_sentence == TRUE) {
 210     brkit = BreakIterator::createSentenceInstance(opt_locale, status);
 211   } else {
 212     // default is character iterator
 213     brkit = BreakIterator::createCharacterInstance(opt_locale, status);
 214   }
 215   if (status==U_USING_DEFAULT_WARNING && opt_terse==FALSE) {
 216     fprintf(stderr, "Warning, U_USING_DEFAULT_WARNING for %s\n", opt_locale);
 217   }
 218   if (status==U_USING_FALLBACK_WARNING && opt_terse==FALSE) {
 219     fprintf(stderr, "Warning, U_USING_FALLBACK_ERROR for %s\n", opt_locale);
 220   }
 221
 222 }
 223
 224 //---------------------------------------------------------------------------
 225 //
 226 //  ProcessOptions()    Function to read the command line options.
 227 //
 228 //---------------------------------------------------------------------------
 229 UBool ProcessOptions(int argc, const char **argv, OptSpec opts[])
 230 {
 231     int         i;
 232     int         argNum;
 233     const char  *pArgName;
 234     OptSpec    *pOpt;
 235
 236     for (argNum=1; argNum<argc; argNum++) {
 237         pArgName = argv[argNum];
 238         for (pOpt = opts;  pOpt->name != 0; pOpt++) {
 239             if (strcmp(pOpt->name, pArgName) == 0) {
 240                 switch (pOpt->type) {
 241                 case OptSpec::FLAG:
 242                     *(UBool *)(pOpt->pVar) = TRUE;
 243                     break;
 244                 case OptSpec::STRING:
 245                     argNum ++;
 246                     if (argNum >= argc) {
 247                         fprintf(stderr, "value expected for \"%s\" option.\n", pOpt->name);
 248                         return FALSE;
 249                     }
 250                     *(const char **)(pOpt->pVar)  = argv[argNum];
 251                     break;
 252                 case OptSpec::NUM:
 253                     argNum ++;
 254                     if (argNum >= argc) {
 255                         fprintf(stderr, "value expected for \"%s\" option.\n", pOpt->name);
 256                         return FALSE;
 257                     }
 258                     char *endp;
 259                     i = strtol(argv[argNum], &endp, 0);
 260                     if (endp == argv[argNum]) {
 261                         fprintf(stderr, "integer value expected for \"%s\" option.\n", pOpt->name);
 262                         return FALSE;
 263                     }
 264                     *(int *)(pOpt->pVar) = i;
 265                 }
 266                 break;
 267             }
 268         }
 269         if (pOpt->name == 0)
 270         {
 271             fprintf(stderr, "Unrecognized option \"%s\"\n", pArgName);
 272             return FALSE;
 273         }
 274     }
 275 return TRUE;
 276 }
 277
 278
 279 void doForwardTest() {
 280   if (opt_terse == FALSE) {
 281     printf("Doing the forward test\n");
 282   }
 283   int32_t noBreaks = 0;
 284   int32_t i = 0;
 285   unsigned long startTime = timeGetTime();
 286   unsigned long elapsedTime = 0;
 287   if(opt_icu) {
 288     createICUBrkIt();
 289     brkit->setText(UnicodeString(text, textSize));
 290     brkit->first();
 291     if (opt_terse == FALSE) {
 292       printf("Warmup\n");
 293     }
 294     int j;
 295     while((j = brkit->next()) != BreakIterator::DONE) {
 296       noBreaks++;
 297       //fprintf(stderr, "%d ", j);
 298     }
 299
 300     if (opt_terse == FALSE) {
 301       printf("Measure\n");
 302     }
 303     startTime = timeGetTime();
 304     for(i = 0; i < opt_loopCount; i++) {
 305       brkit->first();
 306       while(brkit->next() != BreakIterator::DONE) {
 307       }
 308     }
 309
 310     elapsedTime = timeGetTime()-startTime;
 311   } else if(opt_mac) {
 312 #ifdef U_DARWIN
 313     createMACBrkIt();
 314     UniChar* filePtr = text;
 315     OSStatus status = noErr;
 316     UniCharCount startOffset = 0, breakOffset = 0, numUniChars = textSize;
 317     startOffset = 0;
 318     //printf("\t---Search forward--\n");
 319
 320     while (startOffset < numUniChars)
 321     {
 322         status = UCFindTextBreak(breakRef, macBreakType, kUCTextBreakLeadingEdgeMask, filePtr, numUniChars,
 323                                startOffset, &breakOffset);
 324       //require_action(status == noErr, EXIT, printf( "**UCFindTextBreak failed: startOffset %d, status %d\n", (int)startOffset, (int)status));
 325       //require_action((breakOffset <= numUniChars),EXIT, printf("**UCFindTextBreak breakOffset too big: startOffset %d, breakOffset %d\n", (int)startOffset, (int)breakOffset));
 326
 327       // Output break
 328       //printf("\t%d\n", (int)breakOffset);
 329
 330       // Increment counters
 331         noBreaks++;
 332       startOffset = breakOffset;
 333     }
 334     startTime = timeGetTime();
 335     for(i = 0; i < opt_loopCount; i++) {
 336       startOffset = 0;
 337
 338       while (startOffset < numUniChars)
 339         {
 340           status = UCFindTextBreak(breakRef, macBreakType, kUCTextBreakLeadingEdgeMask, filePtr, numUniChars,
 341                                    startOffset, &breakOffset);
 342           // Increment counters
 343           startOffset = breakOffset;
 344         }
 345     }
 346     elapsedTime = timeGetTime()-startTime;
 347     UCDisposeTextBreakLocator(&breakRef);
 348 #endif
 349
 350
 351   }
 352
 353
 354   if (opt_terse == FALSE) {
 355   int32_t loopTime = (int)(float(1000) * ((float)elapsedTime/(float)opt_loopCount));
 356       int32_t timePerCU = (int)(float(1000) * ((float)loopTime/(float)textSize));
 357       int32_t timePerBreak = (int)(float(1000) * ((float)loopTime/(float)noBreaks));
 358       printf("forward break iteration average loop time %d\n", loopTime);
 359       printf("number of code units %d average time per code unit %d\n", textSize, timePerCU);
 360       printf("number of breaks %d average time per break %d\n", noBreaks, timePerBreak);
 361   } else {
 362       printf("time=%d\nevents=%d\nsize=%d\n", elapsedTime, noBreaks, textSize);
 363   }
 364
 365
 366 }
 367
 368 void doIsBoundTest() {
 369   int32_t noBreaks = 0, hit = 0;
 370   int32_t i = 0, j = 0;
 371   unsigned long startTime = timeGetTime();
 372   unsigned long elapsedTime = 0;
 373   createICUBrkIt();
 374   brkit->setText(UnicodeString(text, textSize));
 375   brkit->first();
 376   for(j = 0; j < textSize; j++) {
 377     if(brkit->isBoundary(j)) {
 378       noBreaks++;
 379       //fprintf(stderr, "%d ", j);
 380     }
 381   }
 382   /*
 383   while(brkit->next() != BreakIterator::DONE) {
 384     noBreaks++;
 385   }
 386   */
 387
 388   startTime = timeGetTime();
 389   for(i = 0; i < opt_loopCount; i++) {
 390     for(j = 0; j < textSize; j++) {
 391       if(brkit->isBoundary(j)) {
 392         hit++;
 393       }
 394     }
 395   }
 396
 397   elapsedTime = timeGetTime()-startTime;
 398   int32_t loopTime = (int)(float(1000) * ((float)elapsedTime/(float)opt_loopCount));
 399   if (opt_terse == FALSE) {
 400       int32_t timePerCU = (int)(float(1000) * ((float)loopTime/(float)textSize));
 401       int32_t timePerBreak = (int)(float(1000) * ((float)loopTime/(float)noBreaks));
 402       printf("forward break iteration average loop time %d\n", loopTime);
 403       printf("number of code units %d average time per code unit %d\n", textSize, timePerCU);
 404       printf("number of breaks %d average time per break %d\n", noBreaks, timePerBreak);
 405   } else {
 406       printf("time=%d\nevents=%d\nsize=%d\n", elapsedTime, noBreaks, textSize);
 407   }
 408 }
 409
 410 //----------------------------------------------------------------------------------------
 411 //
 412 //   UnixConvert   -- Convert the lines of the file to the encoding for UNIX
 413 //                    Since it appears that Unicode support is going in the general
 414 //                    direction of the use of UTF-8 locales, that is the approach
 415 //                    that is used here.
 416 //
 417 //----------------------------------------------------------------------------------------
 418 void  UnixConvert() {
 419 #if 0
 420     int    line;
 421
 422     UConverter   *cvrtr;    // An ICU code page converter.
 423     UErrorCode    status = U_ZERO_ERROR;
 424
 425
 426     cvrtr = ucnv_open("utf-8", &status);    // we are just doing UTF-8 locales for now.
 427     if (U_FAILURE(status)) {
 428         fprintf(stderr, "ICU Converter open failed.: %d\n", &status);
 429         exit(-1);
 430     }
 431     // redo for unix
 432     for (line=0; line < gNumFileLines; line++) {
 433         int sizeNeeded = ucnv_fromUChars(cvrtr,
 434                                          0,            // ptr to target buffer.
 435                                          0,            // length of target buffer.
 436                                          gFileLines[line].name,
 437                                          -1,           //  source is null terminated
 438                                          &status);
 439         if (status != U_BUFFER_OVERFLOW_ERROR && status != U_ZERO_ERROR) {
 440             fprintf(stderr, "Conversion from Unicode, something is wrong.\n");
 441             exit(-1);
 442         }
 443         status = U_ZERO_ERROR;
 444         gFileLines[line].unixName = new char[sizeNeeded+1];
 445         sizeNeeded = ucnv_fromUChars(cvrtr,
 446                                          gFileLines[line].unixName, // ptr to target buffer.
 447                                          sizeNeeded+1, // length of target buffer.
 448                                          gFileLines[line].name,
 449                                          -1,           //  source is null terminated
 450                                          &status);
 451         if (U_FAILURE(status)) {
 452             fprintf(stderr, "ICU Conversion Failed.: %d\n", status);
 453             exit(-1);
 454         }
 455         gFileLines[line].unixName[sizeNeeded] = 0;
 456     };
 457     ucnv_close(cvrtr);
 458 #endif
 459 }
 460
 461
 462 //----------------------------------------------------------------------------------------
 463 //
 464 //  class UCharFile   Class to hide all the gorp to read a file in
 465 //                    and produce a stream of UChars.
 466 //
 467 //----------------------------------------------------------------------------------------
 468 class UCharFile {
 469 public:
 470     UCharFile(const char *fileName);
 471     ~UCharFile();
 472     UChar   get();
 473     UBool   eof() {return fEof;};
 474     UBool   error() {return fError;};
 475     int32_t size() { return fFileSize; };
 476
 477 private:
 478     UCharFile (const UCharFile &other) {};                         // No copy constructor.
 479     UCharFile & operator = (const UCharFile &other) {return *this;};   // No assignment op
 480
 481     FILE         *fFile;
 482     const char   *fName;
 483     UBool        fEof;
 484     UBool        fError;
 485     UChar        fPending2ndSurrogate;
 486     int32_t      fFileSize;
 487
 488     enum {UTF16LE, UTF16BE, UTF8} fEncoding;
 489 };
 490
 491 UCharFile::UCharFile(const char * fileName) {
 492     fEof                 = FALSE;
 493     fError               = FALSE;
 494     fName                = fileName;
 495     struct stat buf;
 496     int32_t result = stat(fileName, &buf);
 497     if(result != 0) {
 498       fprintf(stderr, "Error getting info\n");
 499       fFileSize = -1;
 500     } else {
 501       fFileSize = buf.st_size;
 502     }
 503     fFile                = fopen(fName, "rb");
 504     fPending2ndSurrogate = 0;
 505     if (fFile == NULL) {
 506         fprintf(stderr, "Can not open file \"%s\"\n", opt_fName);
 507         fError = TRUE;
 508         return;
 509     }
 510     //
 511     //  Look for the byte order mark at the start of the file.
 512     //
 513     int BOMC1, BOMC2, BOMC3;
 514     BOMC1 = fgetc(fFile);
 515     BOMC2 = fgetc(fFile);
 516
 517     if (BOMC1 == 0xff && BOMC2 == 0xfe) {
 518         fEncoding = UTF16LE; }
 519     else if (BOMC1 == 0xfe && BOMC2 == 0xff) {
 520         fEncoding = UTF16BE; }
 521     else if (BOMC1 == 0xEF && BOMC2 == 0xBB && (BOMC3 = fgetc(fFile)) == 0xBF ) {
 522         fEncoding = UTF8; }
 523     else
 524     {
 525         fprintf(stderr, "collperf:  file \"%s\" encoding must be UTF-8 or UTF-16, and "
 526             "must include a BOM.\n", fileName);
 527         fError = true;
 528         return;
 529     }
 530 }
 531
 532
 533 UCharFile::~UCharFile() {
 534     fclose(fFile);
 535 }
 536
 537
 538
 539 UChar UCharFile::get() {
 540     UChar   c;
 541     switch (fEncoding) {
 542     case UTF16LE:
 543         {
 544             int  cL, cH;
 545             cL = fgetc(fFile);
 546             cH = fgetc(fFile);
 547             c  = cL  | (cH << 8);
 548             if (cH == EOF) {
 549                 c   = 0;
 550                 fEof = TRUE;
 551             }
 552             break;
 553         }
 554     case UTF16BE:
 555         {
 556             int  cL, cH;
 557             cH = fgetc(fFile);
 558             cL = fgetc(fFile);
 559             c  = cL  | (cH << 8);
 560             if (cL == EOF) {
 561                 c   = 0;
 562                 fEof = TRUE;
 563             }
 564             break;
 565         }
 566     case UTF8:
 567         {
 568             if (fPending2ndSurrogate != 0) {
 569                 c = fPending2ndSurrogate;
 570                 fPending2ndSurrogate = 0;
 571                 break;
 572             }
 573
 574             int ch = fgetc(fFile);   // Note:  c and ch are separate cause eof test doesn't work on UChar type.
 575             if (ch == EOF) {
 576                 c = 0;
 577                 fEof = TRUE;
 578                 break;
 579             }
 580
 581             if (ch <= 0x7f) {
 582                 // It's ascii.  No further utf-8 conversion.
 583                 c = ch;
 584                 break;
 585             }
 586
 587             // Figure out the lenght of the char and read the rest of the bytes
 588             //   into a temp array.
 589             int nBytes;
 590             if (ch >= 0xF0) {nBytes=4;}
 591             else if (ch >= 0xE0) {nBytes=3;}
 592             else if (ch >= 0xC0) {nBytes=2;}
 593             else {
 594                 fprintf(stderr, "not likely utf-8 encoded file %s contains corrupt data at offset %d.\n", fName, ftell(fFile));
 595                 fError = TRUE;
 596                 return 0;
 597             }
 598
 599             unsigned char  bytes[10];
 600             bytes[0] = (unsigned char)ch;
 601             int i;
 602             for (i=1; i<nBytes; i++) {
 603                 bytes[i] = fgetc(fFile);
 604                 if (bytes[i] < 0x80 || bytes[i] >= 0xc0) {
 605                     fprintf(stderr, "utf-8 encoded file %s contains corrupt data at offset %d. Expected %d bytes, byte %d is invalid. First byte is %02X\n", fName, ftell(fFile), nBytes, i, ch);
 606                     fError = TRUE;
 607                     return 0;
 608                 }
 609             }
 610
 611             // Convert the bytes from the temp array to a Unicode char.
 612             i = 0;
 613             uint32_t  cp;
 614             UTF8_NEXT_CHAR_UNSAFE(bytes, i, cp);
 615             c = (UChar)cp;
 616
 617             if (cp >= 0x10000) {
 618                 // The code point needs to be broken up into a utf-16 surrogate pair.
 619                 //  Process first half this time through the main loop, and
 620                 //   remember the other half for the next time through.
 621                 UChar utf16Buf[3];
 622                 i = 0;
 623                 UTF16_APPEND_CHAR_UNSAFE(utf16Buf, i, cp);
 624                 fPending2ndSurrogate = utf16Buf[1];
 625                 c = utf16Buf[0];
 626             }
 627             break;
 628         };
 629     }
 630     return c;
 631 }
 632
 633
 634 //----------------------------------------------------------------------------------------
 635 //
 636 //    Main   --  process command line, read in and pre-process the test file,
 637 //                 call other functions to do the actual tests.
 638 //
 639 //----------------------------------------------------------------------------------------
 640 int main(int argc, const char** argv) {
 641     if (ProcessOptions(argc, argv, opts) != TRUE || opt_help || opt_fName == 0) {
 642         printf(gUsageString);
 643         exit (1);
 644     }
 645     // Make sure that we've only got one API selected.
 646     if (opt_mac || opt_unix || opt_win) opt_icu = FALSE;
 647     if (opt_mac || opt_unix) opt_win = FALSE;
 648     if (opt_mac) opt_unix = FALSE;
 649
 650     UErrorCode          status = U_ZERO_ERROR;
 651
 652
 653
 654     //
 655     //  Set up a Windows LCID
 656     //
 657   /*
 658     if (opt_langid != 0) {
 659         gWinLCID = MAKELCID(opt_langid, SORT_DEFAULT);
 660     }
 661     else {
 662         gWinLCID = uloc_getLCID(opt_locale);
 663     }
 664   */
 665
 666     //
 667     //  Set the UNIX locale
 668     //
 669     if (opt_unix) {
 670         if (setlocale(LC_ALL, opt_locale) == 0) {
 671             fprintf(stderr, "setlocale(LC_ALL, %s) failed.\n", opt_locale);
 672             exit(-1);
 673         }
 674     }
 675
 676     // Read in  the input file.
 677     //   File assumed to be utf-16.
 678     //   Lines go onto heap buffers.  Global index array to line starts is created.
 679     //   Lines themselves are null terminated.
 680     //
 681
 682     UCharFile f(opt_fName);
 683     if (f.error()) {
 684         exit(-1);
 685     }
 686     int32_t fileSize = f.size();
 687     const int STARTSIZE = 70000;
 688     int32_t bufSize = 0;
 689     int32_t charCount = 0;
 690     if(fileSize != -1) {
 691       text = (UChar *)malloc(fileSize*sizeof(UChar));
 692       bufSize = fileSize;
 693     } else {
 694       text = (UChar *)malloc(STARTSIZE*sizeof(UChar));
 695       bufSize = STARTSIZE;
 696     }
 697     if(text == NULL) {
 698       fprintf(stderr, "Allocating buffer failed\n");
 699       exit(-1);
 700     }
 701
 702
 703     //  Read the file, split into lines, and save in memory.
 704     //  Loop runs once per utf-16 value from the input file,
 705     //    (The number of bytes read from file per loop iteration depends on external encoding.)
 706     for (;;) {
 707
 708         UChar c = f.get();
 709         if(f.eof()) {
 710           break;
 711         }
 712         if (f.error()){
 713           exit(-1);
 714         }
 715         // We now have a good UTF-16 value in c.
 716         text[charCount++] = c;
 717         if(charCount == bufSize) {
 718           text = (UChar *)realloc(text, 2*bufSize*sizeof(UChar));
 719           if(text == NULL) {
 720             fprintf(stderr, "Reallocating buffer failed\n");
 721             exit(-1);
 722           }
 723           bufSize *= 2;
 724         }
 725     }
 726
 727
 728     if (opt_terse == FALSE) {
 729         printf("file \"%s\", %d charCount code units.\n", opt_fName, charCount);
 730     }
 731
 732     textSize = charCount;
 733
 734
 735
 736
 737     //
 738     //  Dump file contents if requested.
 739     //
 740     if (opt_dump) {
 741       // dump file, etc... possibly
 742     }
 743
 744
 745     //
 746     //  We've got the file read into memory.  Go do something with it.
 747     //
 748     int32_t i = 0;
 749     for(i = 0; i < opt_passesCount; i++) {
 750       if(opt_loopCount != 0) {
 751         if(opt_next) {
 752           doForwardTest();
 753         } else if(opt_isBound) {
 754           doIsBoundTest();
 755         } else {
 756           doForwardTest();
 757         }
 758       } else if(opt_time != 0) {
 759
 760       }
 761     }
 762
 763   if(text != NULL) {
 764     free(text);
 765   }
 766     if(brkit != NULL) {
 767       delete brkit;
 768     }
 769
 770     return 0;
 771 }