1 /********************************************************************
3 * Copyright (C) 2001-2005 IBM, Inc. All Rights Reserved.
5 ********************************************************************/
6 /********************************************************************************
10 * Modification History:
12 * Vladimir Weinstein First Version, based on collperf
14 *********************************************************************************
18 // This program tests break iterator performance
19 // Currently we test only ICU APIs with the future possibility of testing *nix & win32 APIs
21 // A text file is required as input. It must be in utf-8 or utf-16 format,
22 // and include a byte order mark. Either LE or BE format is OK.
25 const char gUsageString
[] =
26 "usage: ubrkperf options...\n"
27 "-help Display this message.\n"
28 "-file file_name utf-16/utf-8 format file.\n"
29 "-locale name ICU locale to use. Default is en_US\n"
30 "-langid 0x1234 Windows Language ID number. Default to value for -locale option\n"
31 " see http://msdn.microsoft.com/library/psdk/winbase/nls_8xo3.htm\n"
32 "-win Run test using Windows native services. (currently not working) (ICU is default)\n"
33 "-unix Run test using Unix word breaking services. (currently not working) \n"
34 "-mac Run test using MacOSX word breaking services.\n"
35 "-uselen Use API with string lengths. Default is null-terminated strings\n"
36 "-char Use character break iterator\n"
37 "-word Use word break iterator\n"
38 "-line Use line break iterator\n"
39 "-sentence Use sentence break iterator\n"
40 "-loop nnnn Loopcount for test. Adjust for reasonable total running time.\n"
41 "-iloop n Inner Loop Count. Default = 1. Number of calls to function\n"
42 " under test at each call point. For measuring test overhead.\n"
43 "-terse Terse numbers-only output. Intended for use by scripts.\n"
44 "-dump Display stuff.\n"
45 "-capi Use C APIs instead of C++ APIs (currently not working)\n"
46 "-next Do the next test\n"
47 "-isBound Do the isBound test\n"
59 #include <unicode/utypes.h>
60 #include <unicode/ucol.h>
61 #include <unicode/ucoleitr.h>
62 #include <unicode/uloc.h>
63 #include <unicode/ustring.h>
64 #include <unicode/ures.h>
65 #include <unicode/uchar.h>
66 #include <unicode/ucnv.h>
67 #include <unicode/utf8.h>
69 #include <unicode/brkiter.h>
76 // Stubs for Windows API functions when building on UNIXes.
79 unsigned long timeGetTime() {
82 unsigned long val
= t
.tv_sec
* 1000; // Let it overflow. Who cares.
83 val
+= t
.tv_usec
/ 1000;
86 #define MAKELCID(a,b) 0
91 // Command line option variables
92 // These global variables are set according to the options specified
93 // on the command line by the user.
95 char * opt_locale
= "en_US";
96 int opt_langid
= 0; // Defaults to value corresponding to opt_locale.
98 UBool opt_help
= FALSE
;
100 int opt_loopCount
= 0;
101 int opt_passesCount
= 1;
102 UBool opt_terse
= FALSE
;
103 UBool opt_icu
= TRUE
;
104 UBool opt_win
= FALSE
; // Run with Windows native functions.
105 UBool opt_unix
= FALSE
; // Run with UNIX strcoll, strxfrm functions.
106 UBool opt_mac
= FALSE
; // Run with MacOSX word break services.
107 UBool opt_uselen
= FALSE
;
108 UBool opt_dump
= FALSE
;
109 UBool opt_char
= FALSE
;
110 UBool opt_word
= FALSE
;
111 UBool opt_line
= FALSE
;
112 UBool opt_sentence
= FALSE
;
113 UBool opt_capi
= FALSE
;
115 UBool opt_next
= FALSE
;
116 UBool opt_isBound
= FALSE
;
121 // Definitions for the command line options
125 enum {FLAG
, NUM
, STRING
} type
;
130 {"-file", OptSpec::STRING
, &opt_fName
},
131 {"-locale", OptSpec::STRING
, &opt_locale
},
132 {"-langid", OptSpec::NUM
, &opt_langid
},
133 {"-win", OptSpec::FLAG
, &opt_win
},
134 {"-unix", OptSpec::FLAG
, &opt_unix
},
135 {"-mac", OptSpec::FLAG
, &opt_mac
},
136 {"-uselen", OptSpec::FLAG
, &opt_uselen
},
137 {"-loop", OptSpec::NUM
, &opt_loopCount
},
138 {"-time", OptSpec::NUM
, &opt_time
},
139 {"-passes", OptSpec::NUM
, &opt_passesCount
},
140 {"-char", OptSpec::FLAG
, &opt_char
},
141 {"-word", OptSpec::FLAG
, &opt_word
},
142 {"-line", OptSpec::FLAG
, &opt_line
},
143 {"-sentence", OptSpec::FLAG
, &opt_sentence
},
144 {"-terse", OptSpec::FLAG
, &opt_terse
},
145 {"-dump", OptSpec::FLAG
, &opt_dump
},
146 {"-capi", OptSpec::FLAG
, &opt_capi
},
147 {"-next", OptSpec::FLAG
, &opt_next
},
148 {"-isBound", OptSpec::FLAG
, &opt_isBound
},
149 {"-help", OptSpec::FLAG
, &opt_help
},
150 {"-?", OptSpec::FLAG
, &opt_help
},
151 {0, OptSpec::FLAG
, 0}
155 //---------------------------------------------------------------------------
157 // Global variables pointing to and describing the test file
159 //---------------------------------------------------------------------------
162 BreakIterator
*brkit
= NULL
;
164 int32_t textSize
= 0;
169 #include <ApplicationServices/ApplicationServices.h>
171 kUCTextBreakAllMask
= (kUCTextBreakClusterMask
| kUCTextBreakWordMask
| kUCTextBreakLineMask
)
173 UCTextBreakType breakTypes
[4] = {kUCTextBreakCharMask
, kUCTextBreakClusterMask
, kUCTextBreakWordMask
, kUCTextBreakLineMask
};
174 TextBreakLocatorRef breakRef
;
175 UCTextBreakType macBreakType
;
177 void createMACBrkIt() {
178 OSStatus status
= noErr
;
180 status
= LocaleRefFromLocaleString(opt_locale
, &lref
);
181 status
= UCCreateTextBreakLocator(lref
, 0, kUCTextBreakAllMask
, (TextBreakLocatorRef
*)&breakRef
);
182 if(opt_char
== TRUE
) {
183 macBreakType
= kUCTextBreakClusterMask
;
184 } else if(opt_word
== TRUE
) {
185 macBreakType
= kUCTextBreakWordMask
;
186 } else if(opt_line
== TRUE
) {
187 macBreakType
= kUCTextBreakLineMask
;
188 } else if(opt_sentence
== TRUE
) {
190 // brkit = BreakIterator::createSentenceInstance(opt_locale, status);
192 // default is character iterator
193 macBreakType
= kUCTextBreakClusterMask
;
198 void createICUBrkIt() {
200 // Set up an ICU break iterator
202 UErrorCode status
= U_ZERO_ERROR
;
203 if(opt_char
== TRUE
) {
204 brkit
= BreakIterator::createCharacterInstance(opt_locale
, status
);
205 } else if(opt_word
== TRUE
) {
206 brkit
= BreakIterator::createWordInstance(opt_locale
, status
);
207 } else if(opt_line
== TRUE
) {
208 brkit
= BreakIterator::createLineInstance(opt_locale
, status
);
209 } else if(opt_sentence
== TRUE
) {
210 brkit
= BreakIterator::createSentenceInstance(opt_locale
, status
);
212 // default is character iterator
213 brkit
= BreakIterator::createCharacterInstance(opt_locale
, status
);
215 if (status
==U_USING_DEFAULT_WARNING
&& opt_terse
==FALSE
) {
216 fprintf(stderr
, "Warning, U_USING_DEFAULT_WARNING for %s\n", opt_locale
);
218 if (status
==U_USING_FALLBACK_WARNING
&& opt_terse
==FALSE
) {
219 fprintf(stderr
, "Warning, U_USING_FALLBACK_ERROR for %s\n", opt_locale
);
224 //---------------------------------------------------------------------------
226 // ProcessOptions() Function to read the command line options.
228 //---------------------------------------------------------------------------
229 UBool
ProcessOptions(int argc
, const char **argv
, OptSpec opts
[])
233 const char *pArgName
;
236 for (argNum
=1; argNum
<argc
; argNum
++) {
237 pArgName
= argv
[argNum
];
238 for (pOpt
= opts
; pOpt
->name
!= 0; pOpt
++) {
239 if (strcmp(pOpt
->name
, pArgName
) == 0) {
240 switch (pOpt
->type
) {
242 *(UBool
*)(pOpt
->pVar
) = TRUE
;
244 case OptSpec::STRING
:
246 if (argNum
>= argc
) {
247 fprintf(stderr
, "value expected for \"%s\" option.\n", pOpt
->name
);
250 *(const char **)(pOpt
->pVar
) = argv
[argNum
];
254 if (argNum
>= argc
) {
255 fprintf(stderr
, "value expected for \"%s\" option.\n", pOpt
->name
);
259 i
= strtol(argv
[argNum
], &endp
, 0);
260 if (endp
== argv
[argNum
]) {
261 fprintf(stderr
, "integer value expected for \"%s\" option.\n", pOpt
->name
);
264 *(int *)(pOpt
->pVar
) = i
;
271 fprintf(stderr
, "Unrecognized option \"%s\"\n", pArgName
);
279 void doForwardTest() {
280 if (opt_terse
== FALSE
) {
281 printf("Doing the forward test\n");
283 int32_t noBreaks
= 0;
285 unsigned long startTime
= timeGetTime();
286 unsigned long elapsedTime
= 0;
289 brkit
->setText(UnicodeString(text
, textSize
));
291 if (opt_terse
== FALSE
) {
295 while((j
= brkit
->next()) != BreakIterator::DONE
) {
297 //fprintf(stderr, "%d ", j);
300 if (opt_terse
== FALSE
) {
303 startTime
= timeGetTime();
304 for(i
= 0; i
< opt_loopCount
; i
++) {
306 while(brkit
->next() != BreakIterator::DONE
) {
310 elapsedTime
= timeGetTime()-startTime
;
314 UniChar
* filePtr
= text
;
315 OSStatus status
= noErr
;
316 UniCharCount startOffset
= 0, breakOffset
= 0, numUniChars
= textSize
;
318 //printf("\t---Search forward--\n");
320 while (startOffset
< numUniChars
)
322 status
= UCFindTextBreak(breakRef
, macBreakType
, kUCTextBreakLeadingEdgeMask
, filePtr
, numUniChars
,
323 startOffset
, &breakOffset
);
324 //require_action(status == noErr, EXIT, printf( "**UCFindTextBreak failed: startOffset %d, status %d\n", (int)startOffset, (int)status));
325 //require_action((breakOffset <= numUniChars),EXIT, printf("**UCFindTextBreak breakOffset too big: startOffset %d, breakOffset %d\n", (int)startOffset, (int)breakOffset));
328 //printf("\t%d\n", (int)breakOffset);
330 // Increment counters
332 startOffset
= breakOffset
;
334 startTime
= timeGetTime();
335 for(i
= 0; i
< opt_loopCount
; i
++) {
338 while (startOffset
< numUniChars
)
340 status
= UCFindTextBreak(breakRef
, macBreakType
, kUCTextBreakLeadingEdgeMask
, filePtr
, numUniChars
,
341 startOffset
, &breakOffset
);
342 // Increment counters
343 startOffset
= breakOffset
;
346 elapsedTime
= timeGetTime()-startTime
;
347 UCDisposeTextBreakLocator(&breakRef
);
354 if (opt_terse
== FALSE
) {
355 int32_t loopTime
= (int)(float(1000) * ((float)elapsedTime
/(float)opt_loopCount
));
356 int32_t timePerCU
= (int)(float(1000) * ((float)loopTime
/(float)textSize
));
357 int32_t timePerBreak
= (int)(float(1000) * ((float)loopTime
/(float)noBreaks
));
358 printf("forward break iteration average loop time %d\n", loopTime
);
359 printf("number of code units %d average time per code unit %d\n", textSize
, timePerCU
);
360 printf("number of breaks %d average time per break %d\n", noBreaks
, timePerBreak
);
362 printf("time=%d\nevents=%d\nsize=%d\n", elapsedTime
, noBreaks
, textSize
);
368 void doIsBoundTest() {
369 int32_t noBreaks
= 0, hit
= 0;
370 int32_t i
= 0, j
= 0;
371 unsigned long startTime
= timeGetTime();
372 unsigned long elapsedTime
= 0;
374 brkit
->setText(UnicodeString(text
, textSize
));
376 for(j
= 0; j
< textSize
; j
++) {
377 if(brkit
->isBoundary(j
)) {
379 //fprintf(stderr, "%d ", j);
383 while(brkit->next() != BreakIterator::DONE) {
388 startTime
= timeGetTime();
389 for(i
= 0; i
< opt_loopCount
; i
++) {
390 for(j
= 0; j
< textSize
; j
++) {
391 if(brkit
->isBoundary(j
)) {
397 elapsedTime
= timeGetTime()-startTime
;
398 int32_t loopTime
= (int)(float(1000) * ((float)elapsedTime
/(float)opt_loopCount
));
399 if (opt_terse
== FALSE
) {
400 int32_t timePerCU
= (int)(float(1000) * ((float)loopTime
/(float)textSize
));
401 int32_t timePerBreak
= (int)(float(1000) * ((float)loopTime
/(float)noBreaks
));
402 printf("forward break iteration average loop time %d\n", loopTime
);
403 printf("number of code units %d average time per code unit %d\n", textSize
, timePerCU
);
404 printf("number of breaks %d average time per break %d\n", noBreaks
, timePerBreak
);
406 printf("time=%d\nevents=%d\nsize=%d\n", elapsedTime
, noBreaks
, textSize
);
410 //----------------------------------------------------------------------------------------
412 // UnixConvert -- Convert the lines of the file to the encoding for UNIX
413 // Since it appears that Unicode support is going in the general
414 // direction of the use of UTF-8 locales, that is the approach
415 // that is used here.
417 //----------------------------------------------------------------------------------------
422 UConverter
*cvrtr
; // An ICU code page converter.
423 UErrorCode status
= U_ZERO_ERROR
;
426 cvrtr
= ucnv_open("utf-8", &status
); // we are just doing UTF-8 locales for now.
427 if (U_FAILURE(status
)) {
428 fprintf(stderr
, "ICU Converter open failed.: %d\n", &status
);
432 for (line
=0; line
< gNumFileLines
; line
++) {
433 int sizeNeeded
= ucnv_fromUChars(cvrtr
,
434 0, // ptr to target buffer.
435 0, // length of target buffer.
436 gFileLines
[line
].name
,
437 -1, // source is null terminated
439 if (status
!= U_BUFFER_OVERFLOW_ERROR
&& status
!= U_ZERO_ERROR
) {
440 fprintf(stderr
, "Conversion from Unicode, something is wrong.\n");
443 status
= U_ZERO_ERROR
;
444 gFileLines
[line
].unixName
= new char[sizeNeeded
+1];
445 sizeNeeded
= ucnv_fromUChars(cvrtr
,
446 gFileLines
[line
].unixName
, // ptr to target buffer.
447 sizeNeeded
+1, // length of target buffer.
448 gFileLines
[line
].name
,
449 -1, // source is null terminated
451 if (U_FAILURE(status
)) {
452 fprintf(stderr
, "ICU Conversion Failed.: %d\n", status
);
455 gFileLines
[line
].unixName
[sizeNeeded
] = 0;
462 //----------------------------------------------------------------------------------------
464 // class UCharFile Class to hide all the gorp to read a file in
465 // and produce a stream of UChars.
467 //----------------------------------------------------------------------------------------
470 UCharFile(const char *fileName
);
473 UBool
eof() {return fEof
;};
474 UBool
error() {return fError
;};
475 int32_t size() { return fFileSize
; };
478 UCharFile (const UCharFile
&other
) {}; // No copy constructor.
479 UCharFile
& operator = (const UCharFile
&other
) {return *this;}; // No assignment op
485 UChar fPending2ndSurrogate
;
488 enum {UTF16LE
, UTF16BE
, UTF8
} fEncoding
;
491 UCharFile::UCharFile(const char * fileName
) {
496 int32_t result
= stat(fileName
, &buf
);
498 fprintf(stderr
, "Error getting info\n");
501 fFileSize
= buf
.st_size
;
503 fFile
= fopen(fName
, "rb");
504 fPending2ndSurrogate
= 0;
506 fprintf(stderr
, "Can not open file \"%s\"\n", opt_fName
);
511 // Look for the byte order mark at the start of the file.
513 int BOMC1
, BOMC2
, BOMC3
;
514 BOMC1
= fgetc(fFile
);
515 BOMC2
= fgetc(fFile
);
517 if (BOMC1
== 0xff && BOMC2
== 0xfe) {
518 fEncoding
= UTF16LE
; }
519 else if (BOMC1
== 0xfe && BOMC2
== 0xff) {
520 fEncoding
= UTF16BE
; }
521 else if (BOMC1
== 0xEF && BOMC2
== 0xBB && (BOMC3
= fgetc(fFile
)) == 0xBF ) {
525 fprintf(stderr
, "collperf: file \"%s\" encoding must be UTF-8 or UTF-16, and "
526 "must include a BOM.\n", fileName
);
533 UCharFile::~UCharFile() {
539 UChar
UCharFile::get() {
568 if (fPending2ndSurrogate
!= 0) {
569 c
= fPending2ndSurrogate
;
570 fPending2ndSurrogate
= 0;
574 int ch
= fgetc(fFile
); // Note: c and ch are separate cause eof test doesn't work on UChar type.
582 // It's ascii. No further utf-8 conversion.
587 // Figure out the length of the char and read the rest of the bytes
588 // into a temp array.
590 if (ch
>= 0xF0) {nBytes
=4;}
591 else if (ch
>= 0xE0) {nBytes
=3;}
592 else if (ch
>= 0xC0) {nBytes
=2;}
594 fprintf(stderr
, "not likely utf-8 encoded file %s contains corrupt data at offset %d.\n", fName
, ftell(fFile
));
599 unsigned char bytes
[10];
600 bytes
[0] = (unsigned char)ch
;
602 for (i
=1; i
<nBytes
; i
++) {
603 bytes
[i
] = fgetc(fFile
);
604 if (bytes
[i
] < 0x80 || bytes
[i
] >= 0xc0) {
605 fprintf(stderr
, "utf-8 encoded file %s contains corrupt data at offset %d. Expected %d bytes, byte %d is invalid. First byte is %02X\n", fName
, ftell(fFile
), nBytes
, i
, ch
);
611 // Convert the bytes from the temp array to a Unicode char.
614 UTF8_NEXT_CHAR_UNSAFE(bytes
, i
, cp
);
618 // The code point needs to be broken up into a utf-16 surrogate pair.
619 // Process first half this time through the main loop, and
620 // remember the other half for the next time through.
623 UTF16_APPEND_CHAR_UNSAFE(utf16Buf
, i
, cp
);
624 fPending2ndSurrogate
= utf16Buf
[1];
634 //----------------------------------------------------------------------------------------
636 // Main -- process command line, read in and pre-process the test file,
637 // call other functions to do the actual tests.
639 //----------------------------------------------------------------------------------------
640 int main(int argc
, const char** argv
) {
641 if (ProcessOptions(argc
, argv
, opts
) != TRUE
|| opt_help
|| opt_fName
== 0) {
642 printf(gUsageString
);
645 // Make sure that we've only got one API selected.
646 if (opt_mac
|| opt_unix
|| opt_win
) opt_icu
= FALSE
;
647 if (opt_mac
|| opt_unix
) opt_win
= FALSE
;
648 if (opt_mac
) opt_unix
= FALSE
;
650 UErrorCode status
= U_ZERO_ERROR
;
655 // Set up a Windows LCID
658 if (opt_langid != 0) {
659 gWinLCID = MAKELCID(opt_langid, SORT_DEFAULT);
662 gWinLCID = uloc_getLCID(opt_locale);
667 // Set the UNIX locale
670 if (setlocale(LC_ALL
, opt_locale
) == 0) {
671 fprintf(stderr
, "setlocale(LC_ALL, %s) failed.\n", opt_locale
);
676 // Read in the input file.
677 // File must be utf-8 or utf-16 and start with a byte order mark.
678 // Lines go onto heap buffers. Global index array to line starts is created.
679 // Lines themselves are null terminated.
682 UCharFile
f(opt_fName
);
686 int32_t fileSize
= f
.size();
687 const int STARTSIZE
= 70000;
689 int32_t charCount
= 0;
691 text
= (UChar
*)malloc(fileSize
*sizeof(UChar
));
694 text
= (UChar
*)malloc(STARTSIZE
*sizeof(UChar
));
698 fprintf(stderr
, "Allocating buffer failed\n");
703 // Read the file, split into lines, and save in memory.
704 // Loop runs once per utf-16 value from the input file,
705 // (The number of bytes read from file per loop iteration depends on external encoding.)
715 // We now have a good UTF-16 value in c.
716 text
[charCount
++] = c
;
717 if(charCount
== bufSize
) {
718 text
= (UChar
*)realloc(text
, 2*bufSize
*sizeof(UChar
));
720 fprintf(stderr
, "Reallocating buffer failed\n");
728 if (opt_terse
== FALSE
) {
729 printf("file \"%s\", %d charCount code units.\n", opt_fName
, charCount
);
732 textSize
= charCount
;
738 // Dump file contents if requested.
741 // dump file, etc... possibly
746 // We've got the file read into memory. Go do something with it.
749 for(i
= 0; i
< opt_passesCount
; i
++) {
750 if(opt_loopCount
!= 0) {
753 } else if(opt_isBound
) {
758 } else if(opt_time
!= 0) {