1 /***********************************************************************
2 * © 2016 and later: Unicode, Inc. and others.
3 * License & terms of use: http://www.unicode.org/copyright.html#License
5 ***********************************************************************
6 ***********************************************************************
8 * Copyright (C) 2001-2012 IBM, Inc. All Rights Reserved.
10 ***********************************************************************/
11 /********************************************************************************
15 * Modification History:
17 * Vladimir Weinstein First Version, based on collperf
19 *********************************************************************************
23 // This program tests break iterator performance
24 // Currently we test only ICU APIs with the future possibility of testing *nix & win32 APIs
26 // A text file is required as input. It must be in utf-8 or utf-16 format,
27 // and include a byte order mark. Either LE or BE format is OK.
30 const char gUsageString
[] =
31 "usage: ubrkperf options...\n"
32 "-help Display this message.\n"
33 "-file file_name utf-16/utf-8 format file.\n"
34 "-locale name ICU locale to use. Default is en_US\n"
35 "-langid 0x1234 Windows Language ID number. Default to value for -locale option\n"
36 " see http://msdn.microsoft.com/library/psdk/winbase/nls_8xo3.htm\n"
37 "-win Run test using Windows native services. (currently not working) (ICU is default)\n"
38 "-unix Run test using Unix word breaking services. (currently not working) \n"
39 "-mac Run test using MacOSX word breaking services.\n"
40 "-uselen Use API with string lengths. Default is null-terminated strings\n"
41 "-char Use character break iterator\n"
42 "-word Use word break iterator\n"
43 "-line Use line break iterator\n"
44 "-sentence Use sentence break iterator\n"
45 "-loop nnnn Loopcount for test. Adjust for reasonable total running time.\n"
46 "-iloop n Inner Loop Count. Default = 1. Number of calls to function\n"
47 " under test at each call point. For measuring test overhead.\n"
48 "-terse Terse numbers-only output. Intended for use by scripts.\n"
49 "-dump Display stuff.\n"
50 "-capi Use C APIs instead of C++ APIs (currently not working)\n"
51 "-next Do the next test\n"
52 "-isBound Do the isBound test\n"
64 #include <unicode/utypes.h>
65 #include <unicode/ucol.h>
66 #include <unicode/ucoleitr.h>
67 #include <unicode/uloc.h>
68 #include <unicode/ustring.h>
69 #include <unicode/ures.h>
70 #include <unicode/uchar.h>
71 #include <unicode/ucnv.h>
72 #include <unicode/utf8.h>
74 #include <unicode/brkiter.h>
77 #if U_PLATFORM_HAS_WIN32_API
81 // Stubs for Windows API functions when building on UNIXes.
84 unsigned long timeGetTime() {
87 unsigned long val
= t
.tv_sec
* 1000; // Let it overflow. Who cares.
88 val
+= t
.tv_usec
/ 1000;
91 #define MAKELCID(a,b) 0
96 // Command line option variables
97 // These global variables are set according to the options specified
98 // on the command line by the user.
100 char * opt_locale
= "en_US";
101 int opt_langid
= 0; // Defaults to value corresponding to opt_locale.
102 char * opt_rules
= 0;
103 UBool opt_help
= FALSE
;
105 int opt_loopCount
= 0;
106 int opt_passesCount
= 1;
107 UBool opt_terse
= FALSE
;
108 UBool opt_icu
= TRUE
;
109 UBool opt_win
= FALSE
; // Run with Windows native functions.
110 UBool opt_unix
= FALSE
; // Run with UNIX strcoll, strxfrm functions.
111 UBool opt_mac
= FALSE
; // Run with MacOSX word break services.
112 UBool opt_uselen
= FALSE
;
113 UBool opt_dump
= FALSE
;
114 UBool opt_char
= FALSE
;
115 UBool opt_word
= FALSE
;
116 UBool opt_line
= FALSE
;
117 UBool opt_sentence
= FALSE
;
118 UBool opt_capi
= FALSE
;
120 UBool opt_next
= FALSE
;
121 UBool opt_isBound
= FALSE
;
126 // Definitions for the command line options
130 enum {FLAG
, NUM
, STRING
} type
;
135 {"-file", OptSpec::STRING
, &opt_fName
},
136 {"-locale", OptSpec::STRING
, &opt_locale
},
137 {"-langid", OptSpec::NUM
, &opt_langid
},
138 {"-win", OptSpec::FLAG
, &opt_win
},
139 {"-unix", OptSpec::FLAG
, &opt_unix
},
140 {"-mac", OptSpec::FLAG
, &opt_mac
},
141 {"-uselen", OptSpec::FLAG
, &opt_uselen
},
142 {"-loop", OptSpec::NUM
, &opt_loopCount
},
143 {"-time", OptSpec::NUM
, &opt_time
},
144 {"-passes", OptSpec::NUM
, &opt_passesCount
},
145 {"-char", OptSpec::FLAG
, &opt_char
},
146 {"-word", OptSpec::FLAG
, &opt_word
},
147 {"-line", OptSpec::FLAG
, &opt_line
},
148 {"-sentence", OptSpec::FLAG
, &opt_sentence
},
149 {"-terse", OptSpec::FLAG
, &opt_terse
},
150 {"-dump", OptSpec::FLAG
, &opt_dump
},
151 {"-capi", OptSpec::FLAG
, &opt_capi
},
152 {"-next", OptSpec::FLAG
, &opt_next
},
153 {"-isBound", OptSpec::FLAG
, &opt_isBound
},
154 {"-help", OptSpec::FLAG
, &opt_help
},
155 {"-?", OptSpec::FLAG
, &opt_help
},
156 {0, OptSpec::FLAG
, 0}
160 //---------------------------------------------------------------------------
162 // Global variables pointing to and describing the test file
164 //---------------------------------------------------------------------------
167 BreakIterator
*brkit
= NULL
;
169 int32_t textSize
= 0;
173 #if U_PLATFORM_IS_DARWIN_BASED
174 #include <ApplicationServices/ApplicationServices.h>
176 kUCTextBreakAllMask
= (kUCTextBreakClusterMask
| kUCTextBreakWordMask
| kUCTextBreakLineMask
)
178 UCTextBreakType breakTypes
[4] = {kUCTextBreakCharMask
, kUCTextBreakClusterMask
, kUCTextBreakWordMask
, kUCTextBreakLineMask
};
179 TextBreakLocatorRef breakRef
;
180 UCTextBreakType macBreakType
;
182 void createMACBrkIt() {
183 OSStatus status
= noErr
;
185 status
= LocaleRefFromLocaleString(opt_locale
, &lref
);
186 status
= UCCreateTextBreakLocator(lref
, 0, kUCTextBreakAllMask
, (TextBreakLocatorRef
*)&breakRef
);
187 if(opt_char
== TRUE
) {
188 macBreakType
= kUCTextBreakClusterMask
;
189 } else if(opt_word
== TRUE
) {
190 macBreakType
= kUCTextBreakWordMask
;
191 } else if(opt_line
== TRUE
) {
192 macBreakType
= kUCTextBreakLineMask
;
193 } else if(opt_sentence
== TRUE
) {
195 // brkit = BreakIterator::createSentenceInstance(opt_locale, status);
197 // default is character iterator
198 macBreakType
= kUCTextBreakClusterMask
;
203 void createICUBrkIt() {
205 // Set up an ICU break iterator
207 UErrorCode status
= U_ZERO_ERROR
;
208 if(opt_char
== TRUE
) {
209 brkit
= BreakIterator::createCharacterInstance(opt_locale
, status
);
210 } else if(opt_word
== TRUE
) {
211 brkit
= BreakIterator::createWordInstance(opt_locale
, status
);
212 } else if(opt_line
== TRUE
) {
213 brkit
= BreakIterator::createLineInstance(opt_locale
, status
);
214 } else if(opt_sentence
== TRUE
) {
215 brkit
= BreakIterator::createSentenceInstance(opt_locale
, status
);
217 // default is character iterator
218 brkit
= BreakIterator::createCharacterInstance(opt_locale
, status
);
220 if (status
==U_USING_DEFAULT_WARNING
&& opt_terse
==FALSE
) {
221 fprintf(stderr
, "Warning, U_USING_DEFAULT_WARNING for %s\n", opt_locale
);
223 if (status
==U_USING_FALLBACK_WARNING
&& opt_terse
==FALSE
) {
224 fprintf(stderr
, "Warning, U_USING_FALLBACK_ERROR for %s\n", opt_locale
);
229 //---------------------------------------------------------------------------
231 // ProcessOptions() Function to read the command line options.
233 //---------------------------------------------------------------------------
234 UBool
ProcessOptions(int argc
, const char **argv
, OptSpec opts
[])
238 const char *pArgName
;
241 for (argNum
=1; argNum
<argc
; argNum
++) {
242 pArgName
= argv
[argNum
];
243 for (pOpt
= opts
; pOpt
->name
!= 0; pOpt
++) {
244 if (strcmp(pOpt
->name
, pArgName
) == 0) {
245 switch (pOpt
->type
) {
247 *(UBool
*)(pOpt
->pVar
) = TRUE
;
249 case OptSpec::STRING
:
251 if (argNum
>= argc
) {
252 fprintf(stderr
, "value expected for \"%s\" option.\n", pOpt
->name
);
255 *(const char **)(pOpt
->pVar
) = argv
[argNum
];
259 if (argNum
>= argc
) {
260 fprintf(stderr
, "value expected for \"%s\" option.\n", pOpt
->name
);
264 i
= strtol(argv
[argNum
], &endp
, 0);
265 if (endp
== argv
[argNum
]) {
266 fprintf(stderr
, "integer value expected for \"%s\" option.\n", pOpt
->name
);
269 *(int *)(pOpt
->pVar
) = i
;
276 fprintf(stderr
, "Unrecognized option \"%s\"\n", pArgName
);
284 void doForwardTest() {
285 if (opt_terse
== FALSE
) {
286 printf("Doing the forward test\n");
288 int32_t noBreaks
= 0;
290 unsigned long startTime
= timeGetTime();
291 unsigned long elapsedTime
= 0;
294 brkit
->setText(UnicodeString(text
, textSize
));
296 if (opt_terse
== FALSE
) {
300 while((j
= brkit
->next()) != BreakIterator::DONE
) {
302 //fprintf(stderr, "%d ", j);
305 if (opt_terse
== FALSE
) {
308 startTime
= timeGetTime();
309 for(i
= 0; i
< opt_loopCount
; i
++) {
311 while(brkit
->next() != BreakIterator::DONE
) {
315 elapsedTime
= timeGetTime()-startTime
;
317 #if U_PLATFORM_IS_DARWIN_BASED
319 UniChar
* filePtr
= text
;
320 OSStatus status
= noErr
;
321 UniCharCount startOffset
= 0, breakOffset
= 0, numUniChars
= textSize
;
323 //printf("\t---Search forward--\n");
325 while (startOffset
< numUniChars
)
327 status
= UCFindTextBreak(breakRef
, macBreakType
, kUCTextBreakLeadingEdgeMask
, filePtr
, numUniChars
,
328 startOffset
, &breakOffset
);
329 //require_action(status == noErr, EXIT, printf( "**UCFindTextBreak failed: startOffset %d, status %d\n", (int)startOffset, (int)status));
330 //require_action((breakOffset <= numUniChars),EXIT, printf("**UCFindTextBreak breakOffset too big: startOffset %d, breakOffset %d\n", (int)startOffset, (int)breakOffset));
333 //printf("\t%d\n", (int)breakOffset);
335 // Increment counters
337 startOffset
= breakOffset
;
339 startTime
= timeGetTime();
340 for(i
= 0; i
< opt_loopCount
; i
++) {
343 while (startOffset
< numUniChars
)
345 status
= UCFindTextBreak(breakRef
, macBreakType
, kUCTextBreakLeadingEdgeMask
, filePtr
, numUniChars
,
346 startOffset
, &breakOffset
);
347 // Increment counters
348 startOffset
= breakOffset
;
351 elapsedTime
= timeGetTime()-startTime
;
352 UCDisposeTextBreakLocator(&breakRef
);
359 if (opt_terse
== FALSE
) {
360 int32_t loopTime
= (int)(float(1000) * ((float)elapsedTime
/(float)opt_loopCount
));
361 int32_t timePerCU
= (int)(float(1000) * ((float)loopTime
/(float)textSize
));
362 int32_t timePerBreak
= (int)(float(1000) * ((float)loopTime
/(float)noBreaks
));
363 printf("forward break iteration average loop time %d\n", loopTime
);
364 printf("number of code units %d average time per code unit %d\n", textSize
, timePerCU
);
365 printf("number of breaks %d average time per break %d\n", noBreaks
, timePerBreak
);
367 printf("time=%d\nevents=%d\nsize=%d\n", elapsedTime
, noBreaks
, textSize
);
373 void doIsBoundTest() {
374 int32_t noBreaks
= 0, hit
= 0;
375 int32_t i
= 0, j
= 0;
376 unsigned long startTime
= timeGetTime();
377 unsigned long elapsedTime
= 0;
379 brkit
->setText(UnicodeString(text
, textSize
));
381 for(j
= 0; j
< textSize
; j
++) {
382 if(brkit
->isBoundary(j
)) {
384 //fprintf(stderr, "%d ", j);
388 while(brkit->next() != BreakIterator::DONE) {
393 startTime
= timeGetTime();
394 for(i
= 0; i
< opt_loopCount
; i
++) {
395 for(j
= 0; j
< textSize
; j
++) {
396 if(brkit
->isBoundary(j
)) {
402 elapsedTime
= timeGetTime()-startTime
;
403 int32_t loopTime
= (int)(float(1000) * ((float)elapsedTime
/(float)opt_loopCount
));
404 if (opt_terse
== FALSE
) {
405 int32_t timePerCU
= (int)(float(1000) * ((float)loopTime
/(float)textSize
));
406 int32_t timePerBreak
= (int)(float(1000) * ((float)loopTime
/(float)noBreaks
));
407 printf("forward break iteration average loop time %d\n", loopTime
);
408 printf("number of code units %d average time per code unit %d\n", textSize
, timePerCU
);
409 printf("number of breaks %d average time per break %d\n", noBreaks
, timePerBreak
);
411 printf("time=%d\nevents=%d\nsize=%d\n", elapsedTime
, noBreaks
, textSize
);
415 //----------------------------------------------------------------------------------------
417 // UnixConvert -- Convert the lines of the file to the encoding for UNIX
418 // Since it appears that Unicode support is going in the general
419 // direction of the use of UTF-8 locales, that is the approach
420 // that is used here.
422 //----------------------------------------------------------------------------------------
427 UConverter
*cvrtr
; // An ICU code page converter.
428 UErrorCode status
= U_ZERO_ERROR
;
431 cvrtr
= ucnv_open("utf-8", &status
); // we are just doing UTF-8 locales for now.
432 if (U_FAILURE(status
)) {
433 fprintf(stderr
, "ICU Converter open failed.: %d\n", &status
);
437 for (line
=0; line
< gNumFileLines
; line
++) {
438 int sizeNeeded
= ucnv_fromUChars(cvrtr
,
439 0, // ptr to target buffer.
440 0, // length of target buffer.
441 gFileLines
[line
].name
,
442 -1, // source is null terminated
444 if (status
!= U_BUFFER_OVERFLOW_ERROR
&& status
!= U_ZERO_ERROR
) {
445 fprintf(stderr
, "Conversion from Unicode, something is wrong.\n");
448 status
= U_ZERO_ERROR
;
449 gFileLines
[line
].unixName
= new char[sizeNeeded
+1];
450 sizeNeeded
= ucnv_fromUChars(cvrtr
,
451 gFileLines
[line
].unixName
, // ptr to target buffer.
452 sizeNeeded
+1, // length of target buffer.
453 gFileLines
[line
].name
,
454 -1, // source is null terminated
456 if (U_FAILURE(status
)) {
457 fprintf(stderr
, "ICU Conversion Failed.: %d\n", status
);
460 gFileLines
[line
].unixName
[sizeNeeded
] = 0;
467 //----------------------------------------------------------------------------------------
469 // class UCharFile Class to hide all the gorp to read a file in
470 // and produce a stream of UChars.
472 //----------------------------------------------------------------------------------------
475 UCharFile(const char *fileName
);
478 UBool
eof() {return fEof
;};
479 UBool
error() {return fError
;};
480 int32_t size() { return fFileSize
; };
483 UCharFile (const UCharFile
&other
) {}; // No copy constructor.
484 UCharFile
& operator = (const UCharFile
&other
) {return *this;}; // No assignment op
490 UChar fPending2ndSurrogate
;
493 enum {UTF16LE
, UTF16BE
, UTF8
} fEncoding
;
496 UCharFile::UCharFile(const char * fileName
) {
501 int32_t result
= stat(fileName
, &buf
);
503 fprintf(stderr
, "Error getting info\n");
506 fFileSize
= buf
.st_size
;
508 fFile
= fopen(fName
, "rb");
509 fPending2ndSurrogate
= 0;
511 fprintf(stderr
, "Can not open file \"%s\"\n", opt_fName
);
516 // Look for the byte order mark at the start of the file.
518 int BOMC1
, BOMC2
, BOMC3
;
519 BOMC1
= fgetc(fFile
);
520 BOMC2
= fgetc(fFile
);
522 if (BOMC1
== 0xff && BOMC2
== 0xfe) {
523 fEncoding
= UTF16LE
; }
524 else if (BOMC1
== 0xfe && BOMC2
== 0xff) {
525 fEncoding
= UTF16BE
; }
526 else if (BOMC1
== 0xEF && BOMC2
== 0xBB && (BOMC3
= fgetc(fFile
)) == 0xBF ) {
530 fprintf(stderr
, "collperf: file \"%s\" encoding must be UTF-8 or UTF-16, and "
531 "must include a BOM.\n", fileName
);
538 UCharFile::~UCharFile() {
544 UChar
UCharFile::get() {
573 if (fPending2ndSurrogate
!= 0) {
574 c
= fPending2ndSurrogate
;
575 fPending2ndSurrogate
= 0;
579 int ch
= fgetc(fFile
); // Note: c and ch are separate cause eof test doesn't work on UChar type.
587 // It's ascii. No further utf-8 conversion.
592 // Figure out the length of the char and read the rest of the bytes
593 // into a temp array.
595 if (ch
>= 0xF0) {nBytes
=4;}
596 else if (ch
>= 0xE0) {nBytes
=3;}
597 else if (ch
>= 0xC0) {nBytes
=2;}
599 fprintf(stderr
, "not likely utf-8 encoded file %s contains corrupt data at offset %d.\n", fName
, ftell(fFile
));
604 unsigned char bytes
[10];
605 bytes
[0] = (unsigned char)ch
;
607 for (i
=1; i
<nBytes
; i
++) {
608 bytes
[i
] = fgetc(fFile
);
609 if (bytes
[i
] < 0x80 || bytes
[i
] >= 0xc0) {
610 fprintf(stderr
, "utf-8 encoded file %s contains corrupt data at offset %d. Expected %d bytes, byte %d is invalid. First byte is %02X\n", fName
, ftell(fFile
), nBytes
, i
, ch
);
616 // Convert the bytes from the temp array to a Unicode char.
619 U8_NEXT_UNSAFE(bytes
, i
, cp
);
623 // The code point needs to be broken up into a utf-16 surrogate pair.
624 // Process first half this time through the main loop, and
625 // remember the other half for the next time through.
628 UTF16_APPEND_CHAR_UNSAFE(utf16Buf
, i
, cp
);
629 fPending2ndSurrogate
= utf16Buf
[1];
639 //----------------------------------------------------------------------------------------
641 // Main -- process command line, read in and pre-process the test file,
642 // call other functions to do the actual tests.
644 //----------------------------------------------------------------------------------------
645 int main(int argc
, const char** argv
) {
646 if (ProcessOptions(argc
, argv
, opts
) != TRUE
|| opt_help
|| opt_fName
== 0) {
647 printf(gUsageString
);
650 // Make sure that we've only got one API selected.
651 if (opt_mac
|| opt_unix
|| opt_win
) opt_icu
= FALSE
;
652 if (opt_mac
|| opt_unix
) opt_win
= FALSE
;
653 if (opt_mac
) opt_unix
= FALSE
;
655 UErrorCode status
= U_ZERO_ERROR
;
660 // Set up a Windows LCID
663 if (opt_langid != 0) {
664 gWinLCID = MAKELCID(opt_langid, SORT_DEFAULT);
667 gWinLCID = uloc_getLCID(opt_locale);
672 // Set the UNIX locale
675 if (setlocale(LC_ALL
, opt_locale
) == 0) {
676 fprintf(stderr
, "setlocale(LC_ALL, %s) failed.\n", opt_locale
);
681 // Read in the input file.
682 // File must be UTF-8 or UTF-16 and must start with a byte order mark.
683 // The decoded UTF-16 code units are collected in a single heap buffer (text),
684 // which is grown as needed while reading.
687 UCharFile
f(opt_fName
);
691 int32_t fileSize
= f
.size();
692 const int STARTSIZE
= 70000;
694 int32_t charCount
= 0;
696 text
= (UChar
*)malloc(fileSize
*sizeof(UChar
));
699 text
= (UChar
*)malloc(STARTSIZE
*sizeof(UChar
));
703 fprintf(stderr
, "Allocating buffer failed\n");
708 // Read the file and save its UTF-16 code units in memory.
709 // Loop runs once per utf-16 value from the input file,
710 // (The number of bytes read from file per loop iteration depends on external encoding.)
720 // We now have a good UTF-16 value in c.
721 text
[charCount
++] = c
;
722 if(charCount
== bufSize
) {
723 text
= (UChar
*)realloc(text
, 2*bufSize
*sizeof(UChar
));
725 fprintf(stderr
, "Reallocating buffer failed\n");
733 if (opt_terse
== FALSE
) {
734 printf("file \"%s\", %d charCount code units.\n", opt_fName
, charCount
);
737 textSize
= charCount
;
743 // Dump file contents if requested.
746 // dump file, etc... possibly
751 // We've got the file read into memory. Go do something with it.
754 for(i
= 0; i
< opt_passesCount
; i
++) {
755 if(opt_loopCount
!= 0) {
758 } else if(opt_isBound
) {
763 } else if(opt_time
!= 0) {