1 /***********************************************************************
2 * © 2016 and later: Unicode, Inc. and others.
3 * License & terms of use: http://www.unicode.org/copyright.html#License
4 ***********************************************************************
5 ***********************************************************************
7 * Copyright (C) 2001-2012 IBM, Inc. All Rights Reserved.
9 ***********************************************************************/
10 /********************************************************************************
14 * Modification History:
16 * Andy Heninger First Version
18 *********************************************************************************
22 // This program tests string collation and sort key generation performance.
23 // Three APIs can be tested: ICU C, Unix strcoll, strxfrm and Windows LCMapString
24 // A file of names is required as input, one per line. It must be in utf-8 or utf-16 format,
25 // and include a byte order mark. Either LE or BE format is OK.
28 const char gUsageString
[] =
29 "usage: collperf options...\n"
30 "-help Display this message.\n"
31 "-file file_name utf-16 format file of names.\n"
32 "-locale name ICU locale to use. Default is en_US\n"
33 "-rules file_name Collation rules file (overrides locale)\n"
34 "-langid 0x1234 Windows Language ID number. Default to value for -locale option\n"
35 " see http://msdn.microsoft.com/library/psdk/winbase/nls_8xo3.htm\n"
36 "-win Run test using Windows native services. (ICU is default)\n"
37 "-unix Run test using Unix strxfrm, strcoll services.\n"
38 "-uselen Use API with string lengths. Default is null-terminated strings\n"
39 "-usekeys Run tests using sortkeys rather than strcoll\n"
40 "-strcmp Run tests using u_strcmp rather than strcoll\n"
41 "-strcmpCPO Run tests using u_strcmpCodePointOrder rather than strcoll\n"
42 "-loop nnnn Loopcount for test. Adjust for reasonable total running time.\n"
43 "-iloop n Inner Loop Count. Default = 1. Number of calls to function\n"
44 " under test at each call point. For measuring test overhead.\n"
45 "-terse Terse numbers-only output. Intended for use by scripts.\n"
46 "-french French accent ordering\n"
47 "-frenchoff No French accent ordering (for use with French locales.)\n"
48 "-norm Normalizing mode on\n"
49 "-shifted Shifted mode\n"
50 "-lower Lower case first\n"
51 "-upper Upper case first\n"
52 "-case Enable separate case level\n"
53 "-level n Sort level, 1 to 5, for Primary, Secndary, Tertiary, Quaternary, Identical\n"
54 "-keyhist Produce a table sort key size vs. string length\n"
55 "-binsearch Binary Search timing test\n"
56 "-keygen Sort Key Generation timing test\n"
57 "-qsort Quicksort timing test\n"
58 "-iter Iteration Performance Test\n"
59 "-dump Display strings, sort keys and CEs.\n"
71 #include <unicode/utypes.h>
72 #include <unicode/ucol.h>
73 #include <unicode/ucoleitr.h>
74 #include <unicode/uloc.h>
75 #include <unicode/ustring.h>
76 #include <unicode/ures.h>
77 #include <unicode/uchar.h>
78 #include <unicode/ucnv.h>
79 #include <unicode/utf8.h>
85 // Stubs for Windows API functions when building on UNIXes.
// Stand-in for the Windows CompareStringW call so that code referencing it
// still builds on UNIXes. Every argument is ignored and the result is always 0.
88 inline int CompareStringW(DWORD
, DWORD
, UChar
*, int, UChar
*, int) {return 0;}
90 unsigned long timeGetTime() {
93 unsigned long val
= t
.tv_sec
* 1000; // Let it overflow. Who cares.
94 val
+= t
.tv_usec
/ 1000;
// Stand-in for the Windows LCMapStringW call so that code referencing it
// still builds on UNIXes. Every argument is ignored, nothing is written to the
// output buffer, and the result is always 0.
97 inline int LCMapStringW(DWORD
, DWORD
, UChar
*, int, UChar
*, int) {return 0;}
98 const int LCMAP_SORTKEY
= 0;
99 #define MAKELCID(a,b) 0
100 const int SORT_DEFAULT
= 0;
106 // Command line option variables
107 // These global variables are set according to the options specified
108 // on the command line by the user.
109 char * opt_fName
= 0;
110 const char * opt_locale
= "en_US";
111 int opt_langid
= 0; // Defaults to value corresponding to opt_locale.
112 char * opt_rules
= 0;
113 UBool opt_help
= FALSE
;
114 int opt_loopCount
= 1;
115 int opt_iLoopCount
= 1;
116 UBool opt_terse
= FALSE
;
117 UBool opt_qsort
= FALSE
;
118 UBool opt_binsearch
= FALSE
;
119 UBool opt_icu
= TRUE
;
120 UBool opt_win
= FALSE
; // Run with Windows native functions.
121 UBool opt_unix
= FALSE
; // Run with UNIX strcoll, strxfrm functions.
122 UBool opt_uselen
= FALSE
;
123 UBool opt_usekeys
= FALSE
;
124 UBool opt_strcmp
= FALSE
;
125 UBool opt_strcmpCPO
= FALSE
;
126 UBool opt_norm
= FALSE
;
127 UBool opt_keygen
= FALSE
;
128 UBool opt_french
= FALSE
;
129 UBool opt_frenchoff
= FALSE
;
130 UBool opt_shifted
= FALSE
;
131 UBool opt_lower
= FALSE
;
132 UBool opt_upper
= FALSE
;
133 UBool opt_case
= FALSE
;
135 UBool opt_keyhist
= FALSE
;
136 UBool opt_itertest
= FALSE
;
137 UBool opt_dump
= FALSE
;
142 // Definitions for the command line options
146 enum {FLAG
, NUM
, STRING
} type
;
151 {"-file", OptSpec::STRING
, &opt_fName
},
152 {"-locale", OptSpec::STRING
, &opt_locale
},
153 {"-langid", OptSpec::NUM
, &opt_langid
},
154 {"-rules", OptSpec::STRING
, &opt_rules
},
155 {"-qsort", OptSpec::FLAG
, &opt_qsort
},
156 {"-binsearch", OptSpec::FLAG
, &opt_binsearch
},
157 {"-iter", OptSpec::FLAG
, &opt_itertest
},
158 {"-win", OptSpec::FLAG
, &opt_win
},
159 {"-unix", OptSpec::FLAG
, &opt_unix
},
160 {"-uselen", OptSpec::FLAG
, &opt_uselen
},
161 {"-usekeys", OptSpec::FLAG
, &opt_usekeys
},
162 {"-strcmp", OptSpec::FLAG
, &opt_strcmp
},
163 {"-strcmpCPO", OptSpec::FLAG
, &opt_strcmpCPO
},
164 {"-norm", OptSpec::FLAG
, &opt_norm
},
165 {"-french", OptSpec::FLAG
, &opt_french
},
166 {"-frenchoff", OptSpec::FLAG
, &opt_frenchoff
},
167 {"-shifted", OptSpec::FLAG
, &opt_shifted
},
168 {"-lower", OptSpec::FLAG
, &opt_lower
},
169 {"-upper", OptSpec::FLAG
, &opt_upper
},
170 {"-case", OptSpec::FLAG
, &opt_case
},
171 {"-level", OptSpec::NUM
, &opt_level
},
172 {"-keyhist", OptSpec::FLAG
, &opt_keyhist
},
173 {"-keygen", OptSpec::FLAG
, &opt_keygen
},
174 {"-loop", OptSpec::NUM
, &opt_loopCount
},
175 {"-iloop", OptSpec::NUM
, &opt_iLoopCount
},
176 {"-terse", OptSpec::FLAG
, &opt_terse
},
177 {"-dump", OptSpec::FLAG
, &opt_dump
},
178 {"-help", OptSpec::FLAG
, &opt_help
},
179 {"-?", OptSpec::FLAG
, &opt_help
},
180 {0, OptSpec::FLAG
, 0}
184 //---------------------------------------------------------------------------
186 // Global variables pointing to and describing the test file
188 //---------------------------------------------------------------------------
193 // Each line from the source file (containing a name, presumably) gets
194 // one of these structs.
207 Line
*gFileLines
; // Ptr to array of Line structs, one per line in the file.
218 //---------------------------------------------------------------------------
220 // ProcessOptions() Function to read the command line options.
222 //---------------------------------------------------------------------------
223 UBool
ProcessOptions(int argc
, const char **argv
, OptSpec opts
[])
227 const char *pArgName
;
230 for (argNum
=1; argNum
<argc
; argNum
++) {
231 pArgName
= argv
[argNum
];
232 for (pOpt
= opts
; pOpt
->name
!= 0; pOpt
++) {
233 if (strcmp(pOpt
->name
, pArgName
) == 0) {
234 switch (pOpt
->type
) {
236 *(UBool
*)(pOpt
->pVar
) = TRUE
;
238 case OptSpec::STRING
:
240 if (argNum
>= argc
) {
241 fprintf(stderr
, "value expected for \"%s\" option.\n", pOpt
->name
);
244 *(const char **)(pOpt
->pVar
) = argv
[argNum
];
248 if (argNum
>= argc
) {
249 fprintf(stderr
, "value expected for \"%s\" option.\n", pOpt
->name
);
253 i
= strtol(argv
[argNum
], &endp
, 0);
254 if (endp
== argv
[argNum
]) {
255 fprintf(stderr
, "integer value expected for \"%s\" option.\n", pOpt
->name
);
258 *(int *)(pOpt
->pVar
) = i
;
265 fprintf(stderr
, "Unrecognized option \"%s\"\n", pArgName
);
272 //---------------------------------------------------------------------------------------
274 // Comparison functions for use by qsort.
276 // Six flavors, ICU or Windows, SortKey or String Compare, Strings with length
277 // or null terminated.
279 //---------------------------------------------------------------------------------------
280 int ICUstrcmpK(const void *a
, const void *b
) {
282 int t
= strcmp((*(Line
**)a
)->icuSortKey
, (*(Line
**)b
)->icuSortKey
);
287 int ICUstrcmpL(const void *a
, const void *b
) {
290 t
= ucol_strcoll(gCol
, (*(Line
**)a
)->name
, (*(Line
**)a
)->len
, (*(Line
**)b
)->name
, (*(Line
**)b
)->len
);
291 if (t
== UCOL_LESS
) return -1;
292 if (t
== UCOL_GREATER
) return +1;
297 int ICUstrcmp(const void *a
, const void *b
) {
300 t
= ucol_strcoll(gCol
, (*(Line
**)a
)->name
, -1, (*(Line
**)b
)->name
, -1);
301 if (t
== UCOL_LESS
) return -1;
302 if (t
== UCOL_GREATER
) return +1;
307 int Winstrcmp(const void *a
, const void *b
) {
310 t
= CompareStringW(gWinLCID
, 0, (*(Line
**)a
)->name
, -1, (*(Line
**)b
)->name
, -1);
315 int UNIXstrcmp(const void *a
, const void *b
) {
318 t
= strcoll((*(Line
**)a
)->unixName
, (*(Line
**)b
)->unixName
);
323 int WinstrcmpL(const void *a
, const void *b
) {
326 t
= CompareStringW(gWinLCID
, 0, (*(Line
**)a
)->name
, (*(Line
**)a
)->len
, (*(Line
**)b
)->name
, (*(Line
**)b
)->len
);
331 int WinstrcmpK(const void *a
, const void *b
) {
333 int t
= strcmp((*(Line
**)a
)->winSortKey
, (*(Line
**)b
)->winSortKey
);
338 //---------------------------------------------------------------------------------------
340 // Function for sorting the names (lines) into a random order.
341 // Order is based on a hash of the ICU Sort key for the lines
342 // The randomized order is used as input for the sorting timing tests.
344 //---------------------------------------------------------------------------------------
345 int ICURandomCmp(const void *a
, const void *b
) {
346 char *ask
= (*(Line
**)a
)->icuSortKey
;
347 char *bsk
= (*(Line
**)b
)->icuSortKey
;
352 aVal
+= aVal
*37 + *ask
++;
355 bVal
+= bVal
*37 + *bsk
++;
361 else if (aVal
> bVal
) {
367 //---------------------------------------------------------------------------------------
369 // doKeyGen() Key Generation Timing Test
371 //---------------------------------------------------------------------------------------
379 // Adjust loop count to compensate for file size. Should be order n
380 double dLoopCount
= double(opt_loopCount
) * (1000. / double(gNumFileLines
));
381 int adj_loopCount
= int(dLoopCount
);
382 if (adj_loopCount
< 1) adj_loopCount
= 1;
385 unsigned long startTime
= timeGetTime();
388 for (loops
=0; loops
<adj_loopCount
; loops
++) {
389 for (line
=0; line
< gNumFileLines
; line
++) {
391 len
= gFileLines
[line
].len
;
393 for (iLoop
=0; iLoop
< opt_iLoopCount
; iLoop
++) {
394 LCMapStringW(gWinLCID
, LCMAP_SORTKEY
,
395 gFileLines
[line
].name
, len
,
396 (UChar
*)gFileLines
[line
].winSortKey
, 5000); // TODO something with length.
403 for (loops
=0; loops
<adj_loopCount
; loops
++) {
404 for (line
=0; line
< gNumFileLines
; line
++) {
406 len
= gFileLines
[line
].len
;
408 for (iLoop
=0; iLoop
< opt_iLoopCount
; iLoop
++) {
409 ucol_getSortKey(gCol
, gFileLines
[line
].name
, len
, (unsigned char *)gFileLines
[line
].icuSortKey
, 5000);
416 for (loops
=0; loops
<adj_loopCount
; loops
++) {
417 for (line
=0; line
< gNumFileLines
; line
++) {
418 for (iLoop
=0; iLoop
< opt_iLoopCount
; iLoop
++) {
419 strxfrm(gFileLines
[line
].unixSortKey
, gFileLines
[line
].unixName
, 5000);
425 unsigned long elapsedTime
= timeGetTime() - startTime
;
426 int ns
= (int)(float(1000000) * (float)elapsedTime
/ (float)(adj_loopCount
*gNumFileLines
));
428 if (opt_terse
== FALSE
) {
429 printf("Sort Key Generation: total # of keys = %d\n", loops
*gNumFileLines
);
430 printf("Sort Key Generation: time per key = %d ns\n", ns
);
438 for (line
=0; line
<gNumFileLines
; line
++) {
439 totalChars
+= u_strlen(gFileLines
[line
].name
);
441 totalKeyLen
+= strlen(gFileLines
[line
].winSortKey
);
444 totalKeyLen
+= strlen(gFileLines
[line
].icuSortKey
);
447 totalKeyLen
+= strlen(gFileLines
[line
].unixSortKey
);
451 if (opt_terse
== FALSE
) {
452 printf("Key Length / character = %f\n", (float)totalKeyLen
/ (float)totalChars
);
454 printf("%f, ", (float)totalKeyLen
/ (float)totalChars
);
460 //---------------------------------------------------------------------------------------
462 // doBinarySearch() Binary Search timing test. Each name from the list
463 // is looked up in the full sorted list of names.
465 //---------------------------------------------------------------------------------------
466 void doBinarySearch()
473 unsigned long elapsedTime
= 0;
475 // Adjust loop count to compensate for file size. Should be order n (lookups) * log n (compares/lookup)
476 // Accurate timings do not depend on this being perfect. The correction is just to try to
477 // get total running times of about the right order, so that the user doesn't need to
478 // manually adjust the loop count for every different file size.
479 double dLoopCount
= double(opt_loopCount
) * 3000. / (log10((double)gNumFileLines
) * double(gNumFileLines
));
480 if (opt_usekeys
) dLoopCount
*= 5;
481 int adj_loopCount
= int(dLoopCount
);
482 if (adj_loopCount
< 1) adj_loopCount
= 1;
485 for (;;) { // not really a loop, just allows "break" to work, to simplify
486 // inadvertently running more than one test through here.
487 if (opt_strcmp
|| opt_strcmpCPO
)
489 unsigned long startTime
= timeGetTime();
490 typedef int32_t (U_EXPORT2
*PF
)(const UChar
*, const UChar
*);
492 if (opt_strcmpCPO
) {pf
= u_strcmpCodePointOrder
;}
493 //if (opt_strcmp && opt_win) {pf = (PF)wcscmp;} // Damn the difference between int32_t and int
494 // which forces the use of a cast here.
497 for (loops
=0; loops
<adj_loopCount
; loops
++) {
499 for (line
=0; line
< gNumFileLines
; line
++) {
500 int hi
= gNumFileLines
-1;
504 int newGuess
= (hi
+ lo
) / 2;
505 if (newGuess
== guess
)
508 for (iLoop
=0; iLoop
< opt_iLoopCount
; iLoop
++) {
509 r
= (*pf
)((gSortedLines
[line
])->name
, (gSortedLines
[guess
])->name
);
521 elapsedTime
= timeGetTime() - startTime
;
528 unsigned long startTime
= timeGetTime();
529 UCollationResult r
= UCOL_EQUAL
;
530 for (loops
=0; loops
<adj_loopCount
; loops
++) {
532 for (line
=0; line
< gNumFileLines
; line
++) {
536 lineLen
= (gSortedLines
[line
])->len
;
538 int hi
= gNumFileLines
-1;
542 int newGuess
= (hi
+ lo
) / 2;
543 if (newGuess
== guess
)
548 for (iLoop
=0; iLoop
< opt_iLoopCount
; iLoop
++) {
549 ri
= strcmp((gSortedLines
[line
])->icuSortKey
, (gSortedLines
[guess
])->icuSortKey
);
552 r
=UCOL_GREATER
; if(ri
<0) {r
=UCOL_LESS
;} else if (ri
==0) {r
=UCOL_EQUAL
;}
557 guessLen
= (gSortedLines
[guess
])->len
;
559 for (iLoop
=0; iLoop
< opt_iLoopCount
; iLoop
++) {
560 r
= ucol_strcoll(gCol
, (gSortedLines
[line
])->name
, lineLen
, (gSortedLines
[guess
])->name
, guessLen
);
573 elapsedTime
= timeGetTime() - startTime
;
579 unsigned long startTime
= timeGetTime();
581 for (loops
=0; loops
<adj_loopCount
; loops
++) {
583 for (line
=0; line
< gNumFileLines
; line
++) {
587 lineLen
= (gSortedLines
[line
])->len
;
589 int hi
= gNumFileLines
-1;
593 int newGuess
= (hi
+ lo
) / 2;
594 if (newGuess
== guess
)
598 for (iLoop
=0; iLoop
< opt_iLoopCount
; iLoop
++) {
599 r
= strcmp((gSortedLines
[line
])->winSortKey
, (gSortedLines
[guess
])->winSortKey
);
607 guessLen
= (gSortedLines
[guess
])->len
;
609 for (iLoop
=0; iLoop
< opt_iLoopCount
; iLoop
++) {
610 r
= CompareStringW(gWinLCID
, 0, (gSortedLines
[line
])->name
, lineLen
, (gSortedLines
[guess
])->name
, guessLen
);
613 if (opt_terse
== FALSE
) {
614 fprintf(stderr
, "Error returned from Windows CompareStringW.\n");
620 if (r
== 2) // strings ==
622 if (r
== 1) // line < guess
629 elapsedTime
= timeGetTime() - startTime
;
635 unsigned long startTime
= timeGetTime();
637 for (loops
=0; loops
<adj_loopCount
; loops
++) {
639 for (line
=0; line
< gNumFileLines
; line
++) {
640 int hi
= gNumFileLines
-1;
644 int newGuess
= (hi
+ lo
) / 2;
645 if (newGuess
== guess
)
649 for (iLoop
=0; iLoop
< opt_iLoopCount
; iLoop
++) {
650 r
= strcmp((gSortedLines
[line
])->unixSortKey
, (gSortedLines
[guess
])->unixSortKey
);
656 for (iLoop
=0; iLoop
< opt_iLoopCount
; iLoop
++) {
657 r
= strcoll((gSortedLines
[line
])->unixName
, (gSortedLines
[guess
])->unixName
);
661 fprintf(stderr
, "Error %d returned from strcoll.\n", errno
);
666 if (r
== 0) // strings ==
668 if (r
< 0) // line < guess
675 elapsedTime
= timeGetTime() - startTime
;
681 int ns
= (int)(float(1000000) * (float)elapsedTime
/ (float)gCount
);
682 if (opt_terse
== FALSE
) {
683 printf("binary search: total # of string compares = %d\n", gCount
);
684 printf("binary search: compares per loop = %d\n", gCount
/ loops
);
685 printf("binary search: time per compare = %d ns\n", ns
);
695 //---------------------------------------------------------------------------------------
697 // doQSort() The quick sort timing test. Uses the C library qsort function.
699 //---------------------------------------------------------------------------------------
702 Line
**sortBuf
= new Line
*[gNumFileLines
];
704 // Adjust loop count to compensate for file size. QSort should be n log(n)
705 double dLoopCount
= double(opt_loopCount
) * 3000. / (log10((double)gNumFileLines
) * double(gNumFileLines
));
706 if (opt_usekeys
) dLoopCount
*= 5;
707 int adj_loopCount
= int(dLoopCount
);
708 if (adj_loopCount
< 1) adj_loopCount
= 1;
712 unsigned long startTime
= timeGetTime();
713 if (opt_win
&& opt_usekeys
) {
714 for (i
=0; i
<opt_loopCount
; i
++) {
715 memcpy(sortBuf
, gRandomLines
, gNumFileLines
* sizeof(Line
*));
716 qsort(sortBuf
, gNumFileLines
, sizeof(Line
*), WinstrcmpK
);
720 else if (opt_win
&& opt_uselen
) {
721 for (i
=0; i
<adj_loopCount
; i
++) {
722 memcpy(sortBuf
, gRandomLines
, gNumFileLines
* sizeof(Line
*));
723 qsort(sortBuf
, gNumFileLines
, sizeof(Line
*), WinstrcmpL
);
728 else if (opt_win
&& !opt_uselen
) {
729 for (i
=0; i
<adj_loopCount
; i
++) {
730 memcpy(sortBuf
, gRandomLines
, gNumFileLines
* sizeof(Line
*));
731 qsort(sortBuf
, gNumFileLines
, sizeof(Line
*), Winstrcmp
);
735 else if (opt_icu
&& opt_usekeys
) {
736 for (i
=0; i
<adj_loopCount
; i
++) {
737 memcpy(sortBuf
, gRandomLines
, gNumFileLines
* sizeof(Line
*));
738 qsort(sortBuf
, gNumFileLines
, sizeof(Line
*), ICUstrcmpK
);
742 else if (opt_icu
&& opt_uselen
) {
743 for (i
=0; i
<adj_loopCount
; i
++) {
744 memcpy(sortBuf
, gRandomLines
, gNumFileLines
* sizeof(Line
*));
745 qsort(sortBuf
, gNumFileLines
, sizeof(Line
*), ICUstrcmpL
);
750 else if (opt_icu
&& !opt_uselen
) {
751 for (i
=0; i
<adj_loopCount
; i
++) {
752 memcpy(sortBuf
, gRandomLines
, gNumFileLines
* sizeof(Line
*));
753 qsort(sortBuf
, gNumFileLines
, sizeof(Line
*), ICUstrcmp
);
757 else if (opt_unix
&& !opt_usekeys
) {
758 for (i
=0; i
<adj_loopCount
; i
++) {
759 memcpy(sortBuf
, gRandomLines
, gNumFileLines
* sizeof(Line
*));
760 qsort(sortBuf
, gNumFileLines
, sizeof(Line
*), UNIXstrcmp
);
764 unsigned long elapsedTime
= timeGetTime() - startTime
;
765 int ns
= (int)(float(1000000) * (float)elapsedTime
/ (float)gCount
);
766 if (opt_terse
== FALSE
) {
767 printf("qsort: total # of string compares = %d\n", gCount
);
768 printf("qsort: time per compare = %d ns\n", ns
);
776 //---------------------------------------------------------------------------------------
778 // doKeyHist() Output a table of data for
779 // average sort key size vs. string length.
781 //---------------------------------------------------------------------------------------
786 // Find the maximum string length
787 for (i
=0; i
<gNumFileLines
; i
++) {
788 if (gFileLines
[i
].len
> maxLen
) maxLen
= gFileLines
[i
].len
;
791 // Allocate arrays to hold the histogram data
792 int *accumulatedLen
= new int[maxLen
+1];
793 int *numKeysOfSize
= new int[maxLen
+1];
794 for (i
=0; i
<=maxLen
; i
++) {
795 accumulatedLen
[i
] = 0;
796 numKeysOfSize
[i
] = 0;
799 // Fill the arrays...
800 for (i
=0; i
<gNumFileLines
; i
++) {
801 int len
= gFileLines
[i
].len
;
802 accumulatedLen
[len
] += strlen(gFileLines
[i
].icuSortKey
);
803 numKeysOfSize
[len
] += 1;
806 // And write out averages
807 printf("String Length, Avg Key Length, Avg Key Len per char\n");
808 for (i
=1; i
<=maxLen
; i
++) {
809 if (numKeysOfSize
[i
] > 0) {
810 printf("%d, %f, %f\n", i
, (float)accumulatedLen
[i
] / (float)numKeysOfSize
[i
],
811 (float)accumulatedLen
[i
] / (float)(numKeysOfSize
[i
] * i
));
814 delete []accumulatedLen
;
815 delete []numKeysOfSize
;
818 //---------------------------------------------------------------------------------------
820 // doForwardIterTest(UBool) Forward iteration test
821 // argument null-terminated string used
823 //---------------------------------------------------------------------------------------
824 void doForwardIterTest(UBool haslen
) {
827 UErrorCode error
= U_ZERO_ERROR
;
828 printf("\n\nPerforming forward iteration performance test with ");
831 printf("non-null terminated data -----------\n");
834 printf("null terminated data -----------\n");
836 printf("performance test on strings from file -----------\n");
838 UChar dummytext
[] = {0, 0};
839 UCollationElements
*iter
= ucol_openElements(gCol
, NULL
, 0, &error
);
840 ucol_setText(iter
, dummytext
, 1, &error
);
843 unsigned long startTime
= timeGetTime();
844 while (count
< opt_loopCount
) {
846 while (linecount
< gNumFileLines
) {
847 UChar
*str
= gFileLines
[linecount
].name
;
848 int strlen
= haslen
?gFileLines
[linecount
].len
:-1;
849 ucol_setText(iter
, str
, strlen
, &error
);
850 while (ucol_next(iter
, &error
) != UCOL_NULLORDER
) {
858 unsigned long elapsedTime
= timeGetTime() - startTime
;
859 printf("elapsedTime %ld\n", elapsedTime
);
861 // empty loop recalculation
863 startTime
= timeGetTime();
864 while (count
< opt_loopCount
) {
866 while (linecount
< gNumFileLines
) {
867 UChar
*str
= gFileLines
[linecount
].name
;
868 int strlen
= haslen
?gFileLines
[linecount
].len
:-1;
869 ucol_setText(iter
, str
, strlen
, &error
);
874 elapsedTime
-= (timeGetTime() - startTime
);
875 printf("elapsedTime %ld\n", elapsedTime
);
877 ucol_closeElements(iter
);
879 int ns
= (int)(float(1000000) * (float)elapsedTime
/ (float)gCount
);
880 printf("Total number of strings compared %d in %d loops\n", gNumFileLines
,
882 printf("Average time per ucol_next() nano seconds %d\n", ns
);
884 printf("performance test on skipped-5 concatenated strings from file -----------\n");
888 // appending all the strings
890 while (linecount
< gNumFileLines
) {
891 strlen
+= haslen
?gFileLines
[linecount
].len
:
892 u_strlen(gFileLines
[linecount
].name
);
895 str
= (UChar
*)malloc(sizeof(UChar
) * strlen
);
898 while (strindex
< strlen
) {
900 len
+= haslen
?gFileLines
[linecount
].len
:
901 u_strlen(gFileLines
[linecount
].name
);
902 memcpy(str
+ strindex
, gFileLines
[linecount
].name
,
903 sizeof(UChar
) * len
);
908 printf("Total size of strings %d\n", strlen
);
916 iter
= ucol_openElements(gCol
, str
, strlen
, &error
);
918 strlen
= u_strlen(str
);
920 strlen
-= 5; // any left over characters are not iterated,
921 // this is to ensure the backwards and forwards iterators
922 // gets the same position
923 startTime
= timeGetTime();
924 while (count
< opt_loopCount
) {
927 ucol_setOffset(iter
, strindex
, &error
);
929 if (ucol_next(iter
, &error
) == UCOL_NULLORDER
) {
936 if (strindex
> strlen
) {
939 ucol_setOffset(iter
, strindex
, &error
);
946 elapsedTime
= timeGetTime() - startTime
;
947 printf("elapsedTime %ld\n", elapsedTime
);
949 // empty loop recalculation
952 startTime
= timeGetTime();
953 while (count
< opt_loopCount
) {
956 ucol_setOffset(iter
, strindex
, &error
);
962 if (strindex
> strlen
) {
965 ucol_setOffset(iter
, strindex
, &error
);
971 elapsedTime
-= (timeGetTime() - startTime
);
972 printf("elapsedTime %ld\n", elapsedTime
);
974 ucol_closeElements(iter
);
976 printf("gCount %d\n", gCount
);
977 ns
= (int)(float(1000000) * (float)elapsedTime
/ (float)gCount
);
978 printf("Average time per ucol_next() nano seconds %d\n", ns
);
981 //---------------------------------------------------------------------------------------
983 // doBackwardIterTest(UBool) Backwards iteration test
984 // argument null-terminated string used
986 //---------------------------------------------------------------------------------------
987 void doBackwardIterTest(UBool haslen
) {
989 UErrorCode error
= U_ZERO_ERROR
;
990 printf("\n\nPerforming backward iteration performance test with ");
993 printf("non-null terminated data -----------\n");
996 printf("null terminated data -----------\n");
999 printf("performance test on strings from file -----------\n");
1001 UCollationElements
*iter
= ucol_openElements(gCol
, NULL
, 0, &error
);
1002 UChar dummytext
[] = {0, 0};
1003 ucol_setText(iter
, dummytext
, 1, &error
);
1006 unsigned long startTime
= timeGetTime();
1007 while (count
< opt_loopCount
) {
1009 while (linecount
< gNumFileLines
) {
1010 UChar
*str
= gFileLines
[linecount
].name
;
1011 int strlen
= haslen
?gFileLines
[linecount
].len
:-1;
1012 ucol_setText(iter
, str
, strlen
, &error
);
1013 while (ucol_previous(iter
, &error
) != UCOL_NULLORDER
) {
1021 unsigned long elapsedTime
= timeGetTime() - startTime
;
1023 printf("elapsedTime %ld\n", elapsedTime
);
1025 // empty loop recalculation
1027 startTime
= timeGetTime();
1028 while (count
< opt_loopCount
) {
1030 while (linecount
< gNumFileLines
) {
1031 UChar
*str
= gFileLines
[linecount
].name
;
1032 int strlen
= haslen
?gFileLines
[linecount
].len
:-1;
1033 ucol_setText(iter
, str
, strlen
, &error
);
1038 elapsedTime
-= (timeGetTime() - startTime
);
1040 printf("elapsedTime %ld\n", elapsedTime
);
1041 ucol_closeElements(iter
);
1043 int ns
= (int)(float(1000000) * (float)elapsedTime
/ (float)gCount
);
1044 printf("Total number of strings compared %d in %d loops\n", gNumFileLines
,
1046 printf("Average time per ucol_previous() nano seconds %d\n", ns
);
1048 printf("performance test on skipped-5 concatenated strings from file -----------\n");
1052 // appending all the strings
1054 while (linecount
< gNumFileLines
) {
1055 strlen
+= haslen
?gFileLines
[linecount
].len
:
1056 u_strlen(gFileLines
[linecount
].name
);
1059 str
= (UChar
*)malloc(sizeof(UChar
) * strlen
);
1062 while (strindex
< strlen
) {
1064 len
+= haslen
?gFileLines
[linecount
].len
:
1065 u_strlen(gFileLines
[linecount
].name
);
1066 memcpy(str
+ strindex
, gFileLines
[linecount
].name
,
1067 sizeof(UChar
) * len
);
1072 printf("Total size of strings %d\n", strlen
);
1081 iter
= ucol_openElements(gCol
, str
, strlen
, &error
);
1083 strlen
= u_strlen(str
);
1086 startTime
= timeGetTime();
1087 while (count
< opt_loopCount
) {
1090 ucol_setOffset(iter
, strindex
, &error
);
1092 if (ucol_previous(iter
, &error
) == UCOL_NULLORDER
) {
1099 if (strindex
> strlen
) {
1102 ucol_setOffset(iter
, strindex
, &error
);
1109 elapsedTime
= timeGetTime() - startTime
;
1110 printf("elapsedTime %ld\n", elapsedTime
);
1112 // empty loop recalculation
1115 startTime
= timeGetTime();
1116 while (count
< opt_loopCount
) {
1119 ucol_setOffset(iter
, strindex
, &error
);
1125 if (strindex
> strlen
) {
1128 ucol_setOffset(iter
, strindex
, &error
);
1134 elapsedTime
-= (timeGetTime() - startTime
);
1135 printf("elapsedTime %ld\n", elapsedTime
);
1136 ucol_closeElements(iter
);
1138 printf("gCount %d\n", gCount
);
1139 ns
= (int)(float(1000000) * (float)elapsedTime
/ (float)gCount
);
1140 printf("Average time per ucol_previous() nano seconds %d\n", ns
);
1143 //---------------------------------------------------------------------------------------
1145 // doIterTest() Iteration test
1147 //---------------------------------------------------------------------------------------
1149 doForwardIterTest(opt_uselen
);
1150 doBackwardIterTest(opt_uselen
);
1154 //----------------------------------------------------------------------------------------
1156 // UnixConvert -- Convert the lines of the file to the encoding for UNIX
1157 // Since it appears that Unicode support is going in the general
1158 // direction of the use of UTF-8 locales, that is the approach
1159 // that is used here.
1161 //----------------------------------------------------------------------------------------
1162 void UnixConvert() {
1165 UConverter
*cvrtr
; // An ICU code page converter.
1166 UErrorCode status
= U_ZERO_ERROR
;
1169 cvrtr
= ucnv_open("utf-8", &status
); // we are just doing UTF-8 locales for now.
1170 if (U_FAILURE(status
)) {
1171 fprintf(stderr
, "ICU Converter open failed.: %s\n", u_errorName(status
));
1175 for (line
=0; line
< gNumFileLines
; line
++) {
1176 int sizeNeeded
= ucnv_fromUChars(cvrtr
,
1177 0, // ptr to target buffer.
1178 0, // length of target buffer.
1179 gFileLines
[line
].name
,
1180 -1, // source is null terminated
1182 if (status
!= U_BUFFER_OVERFLOW_ERROR
&& status
!= U_ZERO_ERROR
) {
1183 //fprintf(stderr, "Conversion from Unicode, something is wrong.\n");
1186 status
= U_ZERO_ERROR
;
1187 gFileLines
[line
].unixName
= new char[sizeNeeded
+1];
1188 sizeNeeded
= ucnv_fromUChars(cvrtr
,
1189 gFileLines
[line
].unixName
, // ptr to target buffer.
1190 sizeNeeded
+1, // length of target buffer.
1191 gFileLines
[line
].name
,
1192 -1, // source is null terminated
1194 if (U_FAILURE(status
)) {
1195 fprintf(stderr
, "ICU Conversion Failed.: %d\n", status
);
1198 gFileLines
[line
].unixName
[sizeNeeded
] = 0;
1204 //----------------------------------------------------------------------------------------
1206 // class UCharFile Class to hide all the gorp to read a file in
1207 // and produce a stream of UChars.
1209 //----------------------------------------------------------------------------------------
1212 UCharFile(const char *fileName
);
1215 UBool
eof() {return fEof
;};
1216 UBool
error() {return fError
;};
1219 UCharFile (const UCharFile
& /*other*/) {}; // No copy constructor.
1220 UCharFile
& operator = (const UCharFile
&/*other*/) {return *this;}; // No assignment op
1226 UChar fPending2ndSurrogate
;
1228 enum {UTF16LE
, UTF16BE
, UTF8
} fEncoding
;
1231 UCharFile::UCharFile(const char * fileName
) {
1235 fFile
= fopen(fName
, "rb");
1236 fPending2ndSurrogate
= 0;
1237 if (fFile
== NULL
) {
1238 fprintf(stderr
, "Can not open file \"%s\"\n", opt_fName
);
1243 // Look for the byte order mark at the start of the file.
1245 int BOMC1
, BOMC2
, BOMC3
;
1246 BOMC1
= fgetc(fFile
);
1247 BOMC2
= fgetc(fFile
);
1249 if (BOMC1
== 0xff && BOMC2
== 0xfe) {
1250 fEncoding
= UTF16LE
; }
1251 else if (BOMC1
== 0xfe && BOMC2
== 0xff) {
1252 fEncoding
= UTF16BE
; }
1253 else if (BOMC1
== 0xEF && BOMC2
== 0xBB && (BOMC3
= fgetc(fFile
)) == 0xBF ) {
1257 fprintf(stderr
, "collperf: file \"%s\" encoding must be UTF-8 or UTF-16, and "
1258 "must include a BOM.\n", fileName
);
1265 UCharFile::~UCharFile() {
1271 UChar
UCharFile::get() {
1273 switch (fEncoding
) {
1300 if (fPending2ndSurrogate
!= 0) {
1301 c
= fPending2ndSurrogate
;
1302 fPending2ndSurrogate
= 0;
1306 int ch
= fgetc(fFile
); // Note: c and ch are separate cause eof test doesn't work on UChar type.
1314 // It's ascii. No further utf-8 conversion.
1319 // Figure out the length of the char and read the rest of the bytes
1320 // into a temp array.
1322 if (ch
>= 0xF0) {nBytes
=4;}
1323 else if (ch
>= 0xE0) {nBytes
=3;}
1324 else if (ch
>= 0xC0) {nBytes
=2;}
1326 fprintf(stderr
, "utf-8 encoded file contains corrupt data.\n");
1331 unsigned char bytes
[10];
1332 bytes
[0] = (unsigned char)ch
;
1334 for (i
=1; i
<nBytes
; i
++) {
1335 bytes
[i
] = fgetc(fFile
);
1336 if (bytes
[i
] < 0x80 || bytes
[i
] >= 0xc0) {
1337 fprintf(stderr
, "utf-8 encoded file contains corrupt data.\n");
1343 // Convert the bytes from the temp array to a Unicode char.
1346 U8_NEXT_UNSAFE(bytes
, i
, cp
);
1349 if (cp
>= 0x10000) {
1350 // The code point needs to be broken up into a utf-16 surrogate pair.
1351 // Process first half this time through the main loop, and
1352 // remember the other half for the next time through.
1355 UTF16_APPEND_CHAR_UNSAFE(utf16Buf
, i
, cp
);
1356 fPending2ndSurrogate
= utf16Buf
[1];
1362 c
= 0xFFFD; /* Error, unspecified codepage*/
1363 fprintf(stderr
, "UCharFile: Error: unknown fEncoding\n");
1369 //----------------------------------------------------------------------------------------
1371 // openRulesCollator - Command line specified a rules file. Read it in
1372 // and open a collator with it.
1374 //----------------------------------------------------------------------------------------
// Opens the file named by opt_rules as a UCharFile, accumulates its contents in
// a malloc'ed UChar buffer, and passes that buffer to ucol_openRules().
//   NOTE(review): the read loop and the return paths are not visible in this
//   listing; the summary above is based on the visible calls only.
1375 UCollator
*openRulesCollator() {
1376 UCharFile
f(opt_rules
);
// Initial buffer: bufLen elements of UChar (size in bytes = bufLen * sizeof(UChar)).
1382 UChar
*buf
= (UChar
*)malloc(bufLen
* sizeof(UChar
));
// NOTE(review): the malloc above sizes the buffer as bufLen * sizeof(UChar), but
// this realloc passes bufLen alone, i.e. a byte count without the sizeof(UChar)
// factor. That looks undersized for bufLen UChars -- confirm and fix.
// The result is also assigned straight back to buf, so a failed realloc would
// lose the original pointer.
1398 buf
= (UChar
*)realloc(buf
, bufLen
);
1407 UErrorCode status
= U_ZERO_ERROR
;
// The rules are passed with their length measured by u_strlen(), so buf has to
// be NUL-terminated by this point (the terminating store is not visible here).
// Per the ucol_openRules() signature, the remaining arguments are the
// normalization mode (UCOL_OFF), the strength (UCOL_DEFAULT_STRENGTH) and an
// optional UParseError pointer (NULL).
1408 UCollator
*coll
= ucol_openRules(buf
, u_strlen(buf
), UCOL_OFF
,
1409 UCOL_DEFAULT_STRENGTH
, NULL
, &status
);
// On failure the numeric UErrorCode value is printed to stderr.
1410 if (U_FAILURE(status
)) {
1411 fprintf(stderr
, "ICU ucol_openRules() open failed.: %d\n", status
);
1422 //----------------------------------------------------------------------------------------
1424 // Main -- process command line, read in and pre-process the test file,
1425 // call other functions to do the actual tests.
1427 //----------------------------------------------------------------------------------------
1428 int main(int argc
, const char** argv
) {
1429 if (ProcessOptions(argc
, argv
, opts
) != TRUE
|| opt_help
|| opt_fName
== 0) {
1430 printf(gUsageString
);
1434 // Make sure that we've only got one API selected.
1435 if (opt_unix
|| opt_win
) opt_icu
= FALSE
;
1436 if (opt_unix
) opt_win
= FALSE
;
1439 // Set up an ICU collator
1441 UErrorCode status
= U_ZERO_ERROR
;
1443 if (opt_rules
!= 0) {
1444 gCol
= openRulesCollator();
1445 if (gCol
== 0) {return -1;}
1448 gCol
= ucol_open(opt_locale
, &status
);
1449 if (U_FAILURE(status
)) {
1450 fprintf(stderr
, "Collator creation failed.: %d\n", status
);
1454 if (status
==U_USING_DEFAULT_WARNING
&& opt_terse
==FALSE
) {
1455 fprintf(stderr
, "Warning, U_USING_DEFAULT_WARNING for %s\n", opt_locale
);
1457 if (status
==U_USING_FALLBACK_WARNING
&& opt_terse
==FALSE
) {
1458 fprintf(stderr
, "Warning, U_USING_FALLBACK_ERROR for %s\n", opt_locale
);
1462 ucol_setAttribute(gCol
, UCOL_NORMALIZATION_MODE
, UCOL_ON
, &status
);
1464 if (opt_french
&& opt_frenchoff
) {
1465 fprintf(stderr
, "collperf: Error, specified both -french and -frenchoff options.");
1469 ucol_setAttribute(gCol
, UCOL_FRENCH_COLLATION
, UCOL_ON
, &status
);
1471 if (opt_frenchoff
) {
1472 ucol_setAttribute(gCol
, UCOL_FRENCH_COLLATION
, UCOL_OFF
, &status
);
1475 ucol_setAttribute(gCol
, UCOL_CASE_FIRST
, UCOL_LOWER_FIRST
, &status
);
1478 ucol_setAttribute(gCol
, UCOL_CASE_FIRST
, UCOL_UPPER_FIRST
, &status
);
1481 ucol_setAttribute(gCol
, UCOL_CASE_LEVEL
, UCOL_ON
, &status
);
1484 ucol_setAttribute(gCol
, UCOL_ALTERNATE_HANDLING
, UCOL_SHIFTED
, &status
);
1486 if (opt_level
!= 0) {
1487 switch (opt_level
) {
1489 ucol_setAttribute(gCol
, UCOL_STRENGTH
, UCOL_PRIMARY
, &status
);
1492 ucol_setAttribute(gCol
, UCOL_STRENGTH
, UCOL_SECONDARY
, &status
);
1495 ucol_setAttribute(gCol
, UCOL_STRENGTH
, UCOL_TERTIARY
, &status
);
1498 ucol_setAttribute(gCol
, UCOL_STRENGTH
, UCOL_QUATERNARY
, &status
);
1501 ucol_setAttribute(gCol
, UCOL_STRENGTH
, UCOL_IDENTICAL
, &status
);
1504 fprintf(stderr
, "-level param must be between 1 and 5\n");
1509 if (U_FAILURE(status
)) {
1510 fprintf(stderr
, "Collator attribute setting failed.: %d\n", status
);
1516 // Set up a Windows LCID
1518 if (opt_langid
!= 0) {
1519 gWinLCID
= MAKELCID(opt_langid
, SORT_DEFAULT
);
1522 gWinLCID
= uloc_getLCID(opt_locale
);
1527 // Set the UNIX locale
1530 if (setlocale(LC_ALL
, opt_locale
) == 0) {
1531 fprintf(stderr
, "setlocale(LC_ALL, %s) failed.\n", opt_locale
);
1536 // Read in the input file.
1537 // File must be utf-8 or utf-16 and begin with a byte order mark.
1538 // Lines go onto heap buffers. Global index array to line starts is created.
1539 // Lines themselves are null terminated.
1542 UCharFile
f(opt_fName
);
1547 const int MAXLINES
= 100000;
1548 gFileLines
= new Line
[MAXLINES
];
1552 // Read the file, split into lines, and save in memory.
1553 // Loop runs once per utf-16 value from the input file,
1554 // (The number of bytes read from file per loop iteration depends on external encoding.)
1563 // We now have a good UTF-16 value in c.
1565 // Watch for CR, LF, EOF; these finish off a line.
1570 if (f
.eof() || c
== 0x0a || c
==0x2028) { // Unipad inserts 2028 line separators!
1573 gFileLines
[gNumFileLines
].name
= new UChar
[column
];
1574 gFileLines
[gNumFileLines
].len
= column
-1;
1575 memcpy(gFileLines
[gNumFileLines
].name
, buf
, column
* sizeof(UChar
));
1578 if (gNumFileLines
>= MAXLINES
) {
1579 fprintf(stderr
, "File too big. Max number of lines is %d\n", MAXLINES
);
1584 if (c
== 0xa || c
== 0x2028)
1592 static UBool warnFlag
= TRUE
;
1594 fprintf(stderr
, "Warning - file line longer than 1023 chars truncated.\n");
1601 if (opt_terse
== FALSE
) {
1602 printf("file \"%s\", %d lines.\n", opt_fName
, gNumFileLines
);
1606 // Convert the lines to the UNIX encoding.
1612 // Pre-compute ICU sort keys for the lines of the file.
1617 for (line
=0; line
<gNumFileLines
; line
++) {
1618 t
= ucol_getSortKey(gCol
, gFileLines
[line
].name
, -1, (unsigned char *)buf
, sizeof(buf
));
1619 gFileLines
[line
].icuSortKey
= new char[t
];
1621 if (t
> (int32_t)sizeof(buf
)) {
1622 t
= ucol_getSortKey(gCol
, gFileLines
[line
].name
, -1, (unsigned char *)gFileLines
[line
].icuSortKey
, t
);
1626 memcpy(gFileLines
[line
].icuSortKey
, buf
, t
);
1633 // Pre-compute Windows sort keys for the lines of the file.
1635 for (line
=0; line
<gNumFileLines
; line
++) {
1636 t
=LCMapStringW(gWinLCID
, LCMAP_SORTKEY
, gFileLines
[line
].name
, -1, buf
, sizeof(buf
));
1637 gFileLines
[line
].winSortKey
= new char[t
];
1638 if (t
> (int32_t)sizeof(buf
)) {
1639 t
= LCMapStringW(gWinLCID
, LCMAP_SORTKEY
, gFileLines
[line
].name
, -1, (UChar
*)(gFileLines
[line
].winSortKey
), t
);
1643 memcpy(gFileLines
[line
].winSortKey
, buf
, t
);
1648 // Pre-compute UNIX sort keys for the lines of the file.
1651 for (line
=0; line
<gNumFileLines
; line
++) {
1652 t
=strxfrm((char *)buf
, gFileLines
[line
].unixName
, sizeof(buf
));
1653 gFileLines
[line
].unixSortKey
= new char[t
];
1654 if (t
> (int32_t)sizeof(buf
)) {
1655 t
= strxfrm(gFileLines
[line
].unixSortKey
, gFileLines
[line
].unixName
, sizeof(buf
));
1659 memcpy(gFileLines
[line
].unixSortKey
, buf
, t
);
1666 // Dump file lines, CEs, Sort Keys if requested.
1670 for (line
=0; line
<gNumFileLines
; line
++) {
1672 UChar c
= gFileLines
[line
].name
[i
];
1675 if (c
< 0x20 || c
> 0x7e) {
1676 printf("\\u%.4x", c
);
1685 UCollationElements
*CEiter
= ucol_openElements(gCol
, gFileLines
[line
].name
, -1, &status
);
1689 ce
= ucol_next(CEiter
, &status
);
1690 if (ce
== UCOL_NULLORDER
) {
1693 printf(" %.8x", ce
);
1700 ucol_closeElements(CEiter
);
1703 printf(" ICU Sort Key: ");
1705 unsigned char c
= gFileLines
[line
].icuSortKey
[i
];
1710 if (i
> 0 && i
% 20 == 0) {
1720 // Pre-sort the lines.
1723 gSortedLines
= new Line
*[gNumFileLines
];
1724 for (i
=0; i
<gNumFileLines
; i
++) {
1725 gSortedLines
[i
] = &gFileLines
[i
];
1729 qsort(gSortedLines
, gNumFileLines
, sizeof(Line
*), Winstrcmp
);
1731 else if (opt_unix
) {
1732 qsort(gSortedLines
, gNumFileLines
, sizeof(Line
*), UNIXstrcmp
);
1736 qsort(gSortedLines
, gNumFileLines
, sizeof(Line
*), ICUstrcmp
);
1741 // Make up a randomized order, will be used for sorting tests.
1743 gRandomLines
= new Line
*[gNumFileLines
];
1744 for (i
=0; i
<gNumFileLines
; i
++) {
1745 gRandomLines
[i
] = &gFileLines
[i
];
1747 qsort(gRandomLines
, gNumFileLines
, sizeof(Line
*), ICURandomCmp
);
1753 // We've got the file read into memory. Go do something with it.
1756 if (opt_qsort
) doQSort();
1757 if (opt_binsearch
) doBinarySearch();
1758 if (opt_keygen
) doKeyGen();
1759 if (opt_keyhist
) doKeyHist();
1760 if (opt_itertest
) doIterTest();