git.saurik.com Git - apple/icu.git/blame - icuSources/test/perf/collationperf/collperf.cpp

Commit	Line	Data
f3c0d7a5 A	1	/***********************************************************************
	2	* © 2016 and later: Unicode, Inc. and others.
	3	* License & terms of use: http://www.unicode.org/copyright.html#License
	4	***********************************************************************
	5	***********************************************************************
46f4442e	6	* COPYRIGHT:
51004dcb	7	* Copyright (C) 2001-2012 IBM, Inc. All Rights Reserved.
46f4442e	8	*
f3c0d7a5	9	***********************************************************************/
46f4442e A	10	/********************************************************************************
	11	*
	12	* File CALLCOLL.C
	13	*
	14	* Modification History:
	15	* Name Description
	16	* Andy Heninger First Version
	17	*
	18	*********************************************************************************
	19	*/
	20
	21	//
	22	// This program tests string collation and sort key generation performance.
	23	// Three APIs can be tested: ICU C, Unix strcoll/strxfrm, and Windows LCMapString.
	24	// A file of names is required as input, one per line. It must be in utf-8 or utf-16 format,
	25	// and include a byte order mark. Either LE or BE format is OK.
	26	//
	27
	// Usage text: one entry per command line option accepted by collperf.
	// Lines that begin with a space continue the description of the option above them.
	const char gUsageString[] =
	    "usage: collperf options...\n"
	    "-help Display this message.\n"
	    "-file file_name utf-16 format file of names.\n"
	    "-locale name ICU locale to use. Default is en_US\n"
	    "-rules file_name Collation rules file (overrides locale)\n"
	    "-langid 0x1234 Windows Language ID number. Default to value for -locale option\n"
	    " see http://msdn.microsoft.com/library/psdk/winbase/nls_8xo3.htm\n"
	    "-win Run test using Windows native services. (ICU is default)\n"
	    "-unix Run test using Unix strxfrm, strcoll services.\n"
	    "-uselen Use API with string lengths. Default is null-terminated strings\n"
	    "-usekeys Run tests using sortkeys rather than strcoll\n"
	    "-strcmp Run tests using u_strcmp rather than strcoll\n"
	    "-strcmpCPO Run tests using u_strcmpCodePointOrder rather than strcoll\n"
	    "-loop nnnn Loopcount for test. Adjust for reasonable total running time.\n"
	    "-iloop n Inner Loop Count. Default = 1. Number of calls to function\n"
	    " under test at each call point. For measuring test overhead.\n"
	    "-terse Terse numbers-only output. Intended for use by scripts.\n"
	    "-french French accent ordering\n"
	    "-frenchoff No French accent ordering (for use with French locales.)\n"
	    "-norm Normalizing mode on\n"
	    "-shifted Shifted mode\n"
	    "-lower Lower case first\n"
	    "-upper Upper case first\n"
	    "-case Enable separate case level\n"
	    "-level n Sort level, 1 to 5, for Primary, Secondary, Tertiary, Quaternary, Identical\n"
	    "-keyhist Produce a table sort key size vs. string length\n"
	    "-binsearch Binary Search timing test\n"
	    "-keygen Sort Key Generation timing test\n"
	    "-qsort Quicksort timing test\n"
	    "-iter Iteration Performance Test\n"
	    "-dump Display strings, sort keys and CEs.\n"
	    ;
	61
	62
	63
	64	#include <stdio.h>
	65	#include <string.h>
	66	#include <stdlib.h>
	67	#include <math.h>
	68	#include <locale.h>
	69	#include <errno.h>
	70
	71	#include <unicode/utypes.h>
	72	#include <unicode/ucol.h>
	73	#include <unicode/ucoleitr.h>
74	#include <unicode/uloc.h>
75	#include <unicode/ustring.h>
76	#include <unicode/ures.h>
77	#include <unicode/uchar.h>
78	#include <unicode/ucnv.h>
79	#include <unicode/utf8.h>
80
81	#ifdef WIN32
82	#include <windows.h>
83	#else
84	//
85	// Stubs for Windows API functions when building on UNIXes.
86	//
87	typedef int DWORD;
729e4ab9	88	inline int CompareStringW(DWORD, DWORD, UChar , int, UChar , int) {return 0;}
46f4442e A	89	#include <sys/time.h>
	90	unsigned long timeGetTime() {
	91	struct timeval t;
	92	gettimeofday(&t, 0);
	93	unsigned long val = t.tv_sec * 1000; // Let it overflow. Who cares.
	94	val += t.tv_usec / 1000;
	95	return val;
729e4ab9 A	96	}
729e4ab9 A	97	inline int LCMapStringW(DWORD, DWORD, UChar , int, UChar , int) {return 0;}
46f4442e A	98	const int LCMAP_SORTKEY = 0;
	99	#define MAKELCID(a,b) 0
	100	const int SORT_DEFAULT = 0;
	101	#endif
	102
	103
	104
	105	//
	106	// Command line option variables
	107	// These global variables are set according to the options specified
	108	// on the command line by the user.
	109	char * opt_fName = 0;
729e4ab9	110	const char * opt_locale = "en_US";
46f4442e A	111	int opt_langid = 0; // Defaults to value corresponding to opt_locale.
	112	char * opt_rules = 0;
	113	UBool opt_help = FALSE;
	114	int opt_loopCount = 1;
	115	int opt_iLoopCount = 1;
	116	UBool opt_terse = FALSE;
	117	UBool opt_qsort = FALSE;
	118	UBool opt_binsearch = FALSE;
	119	UBool opt_icu = TRUE;
	120	UBool opt_win = FALSE; // Run with Windows native functions.
	121	UBool opt_unix = FALSE; // Run with UNIX strcoll, strxfrm functions.
	122	UBool opt_uselen = FALSE;
	123	UBool opt_usekeys = FALSE;
	124	UBool opt_strcmp = FALSE;
	125	UBool opt_strcmpCPO = FALSE;
	126	UBool opt_norm = FALSE;
	127	UBool opt_keygen = FALSE;
	128	UBool opt_french = FALSE;
	129	UBool opt_frenchoff = FALSE;
	130	UBool opt_shifted = FALSE;
	131	UBool opt_lower = FALSE;
	132	UBool opt_upper = FALSE;
	133	UBool opt_case = FALSE;
	134	int opt_level = 0;
	135	UBool opt_keyhist = FALSE;
	136	UBool opt_itertest = FALSE;
	137	UBool opt_dump = FALSE;
	138
	139
	140
	141	//
	142	// Definitions for the command line options
	143	//
	// One row of the command line option table.
	struct OptSpec {
	    // Option text exactly as typed on the command line, e.g. "-file".
	    // A row whose name is 0 terminates a table.
	    const char *name;

	    // Kind of value the option carries: FLAG takes no argument,
	    // NUM and STRING take the following command line argument.
	    enum {FLAG, NUM, STRING} type;

	    // Address of the variable that receives the value.  The pointee type
	    // is implied by `type`: UBool for FLAG, int for NUM, char pointer for STRING.
	    void *pVar;
	};
	149
	150	OptSpec opts[] = {
	151	{"-file", OptSpec::STRING, &opt_fName},
	152	{"-locale", OptSpec::STRING, &opt_locale},
	153	{"-langid", OptSpec::NUM, &opt_langid},
	154	{"-rules", OptSpec::STRING, &opt_rules},
	155	{"-qsort", OptSpec::FLAG, &opt_qsort},
	156	{"-binsearch", OptSpec::FLAG, &opt_binsearch},
	157	{"-iter", OptSpec::FLAG, &opt_itertest},
	158	{"-win", OptSpec::FLAG, &opt_win},
	159	{"-unix", OptSpec::FLAG, &opt_unix},
	160	{"-uselen", OptSpec::FLAG, &opt_uselen},
	161	{"-usekeys", OptSpec::FLAG, &opt_usekeys},
	162	{"-strcmp", OptSpec::FLAG, &opt_strcmp},
	163	{"-strcmpCPO", OptSpec::FLAG, &opt_strcmpCPO},
	164	{"-norm", OptSpec::FLAG, &opt_norm},
	165	{"-french", OptSpec::FLAG, &opt_french},
	166	{"-frenchoff", OptSpec::FLAG, &opt_frenchoff},
	167	{"-shifted", OptSpec::FLAG, &opt_shifted},
	168	{"-lower", OptSpec::FLAG, &opt_lower},
	169	{"-upper", OptSpec::FLAG, &opt_upper},
	170	{"-case", OptSpec::FLAG, &opt_case},
	171	{"-level", OptSpec::NUM, &opt_level},
	172	{"-keyhist", OptSpec::FLAG, &opt_keyhist},
	173	{"-keygen", OptSpec::FLAG, &opt_keygen},
	174	{"-loop", OptSpec::NUM, &opt_loopCount},
175	{"-iloop", OptSpec::NUM, &opt_iLoopCount},
176	{"-terse", OptSpec::FLAG, &opt_terse},
177	{"-dump", OptSpec::FLAG, &opt_dump},
178	{"-help", OptSpec::FLAG, &opt_help},
179	{"-?", OptSpec::FLAG, &opt_help},
180	{0, OptSpec::FLAG, 0}
181	};
182
183
184	//---------------------------------------------------------------------------
185	//
186	// Global variables pointing to and describing the test file
187	//
188	//---------------------------------------------------------------------------
189
190	//
191	// struct Line
192	//
193	// Each line from the source file (containing a name, presumably) gets
194	// one of these structs.
195	//
196	struct Line {
197	UChar *name;
198	int len;
199	char *winSortKey;
200	char *icuSortKey;
201	char *unixSortKey;
202	char *unixName;
203	};
204
205
206
207	Line *gFileLines; // Ptr to array of Line structs, one per line in the file.
208	int gNumFileLines;
209	UCollator *gCol;
210	DWORD gWinLCID;
211
212	Line **gSortedLines;
213	Line **gRandomLines;
214	int gCount;
215
216
217
218	//---------------------------------------------------------------------------
219	//
220	// ProcessOptions() Function to read the command line options.
221	//
222	//---------------------------------------------------------------------------
223	UBool ProcessOptions(int argc, const char **argv, OptSpec opts[])
224	{
225	int i;
226	int argNum;
227	const char *pArgName;
228	OptSpec *pOpt;
229
230	for (argNum=1; argNum<argc; argNum++) {
231	pArgName = argv[argNum];
232	for (pOpt = opts; pOpt->name != 0; pOpt++) {
233	if (strcmp(pOpt->name, pArgName) == 0) {
234	switch (pOpt->type) {
235	case OptSpec::FLAG:
236	(UBool )(pOpt->pVar) = TRUE;
237	break;
238	case OptSpec::STRING:
239	argNum ++;
240	if (argNum >= argc) {
241	fprintf(stderr, "value expected for \"%s\" option.\n", pOpt->name);
242	return FALSE;
243	}
244	(const char *)(pOpt->pVar) = argv[argNum];
245	break;
246	case OptSpec::NUM:
247	argNum ++;
248	if (argNum >= argc) {
249	fprintf(stderr, "value expected for \"%s\" option.\n", pOpt->name);
250	return FALSE;
251	}
252	char *endp;
253	i = strtol(argv[argNum], &endp, 0);
254	if (endp == argv[argNum]) {
255	fprintf(stderr, "integer value expected for \"%s\" option.\n", pOpt->name);
256	return FALSE;
257	}
258	(int )(pOpt->pVar) = i;
259	}
260	break;
261	}
262	}
263	if (pOpt->name == 0)
264	{
265	fprintf(stderr, "Unrecognized option \"%s\"\n", pArgName);
266	return FALSE;
267	}
268	}
269	return TRUE;
270	}
271
272	//---------------------------------------------------------------------------------------
273	//
274	// Comparison functions for use by qsort.
275	//
	276	// Seven flavors: ICU or Windows, each as SortKey or String Compare with strings
	277	// passed with length or null terminated, plus UNIX strcoll on null terminated strings.
278	//
279	//---------------------------------------------------------------------------------------
280	int ICUstrcmpK(const void a, const void b) {
281	gCount++;
282	int t = strcmp(((Line )a)->icuSortKey, ((Line **)b)->icuSortKey);
283	return t;
284	}
285
286
287	int ICUstrcmpL(const void a, const void b) {
288	gCount++;
289	UCollationResult t;
290	t = ucol_strcoll(gCol, ((Line )a)->name, ((Line *)a)->len, ((Line *)b)->name, ((Line **)b)->len);
291	if (t == UCOL_LESS) return -1;
292	if (t == UCOL_GREATER) return +1;
293	return 0;
294	}
295
296
297	int ICUstrcmp(const void a, const void b) {
298	gCount++;
299	UCollationResult t;
300	t = ucol_strcoll(gCol, ((Line )a)->name, -1, ((Line **)b)->name, -1);
301	if (t == UCOL_LESS) return -1;
302	if (t == UCOL_GREATER) return +1;
303	return 0;
304	}
305
306
307	int Winstrcmp(const void a, const void b) {
308	gCount++;
309	int t;
310	t = CompareStringW(gWinLCID, 0, ((Line )a)->name, -1, ((Line **)b)->name, -1);
311	return t-2;
312	}
313
314
315	int UNIXstrcmp(const void a, const void b) {
316	gCount++;
317	int t;
318	t = strcoll(((Line )a)->unixName, ((Line **)b)->unixName);
319	return t;
320	}
321
322
323	int WinstrcmpL(const void a, const void b) {
324	gCount++;
325	int t;
326	t = CompareStringW(gWinLCID, 0, ((Line )a)->name, ((Line *)a)->len, ((Line *)b)->name, ((Line **)b)->len);
327	return t-2;
328	}
329
330
331	int WinstrcmpK(const void a, const void b) {
332	gCount++;
333	int t = strcmp(((Line )a)->winSortKey, ((Line **)b)->winSortKey);
334	return t;
335	}
336
337
338	//---------------------------------------------------------------------------------------
339	//
340	// Function for sorting the names (lines) into a random order.
341	// Order is based on a hash of the ICU Sort key for the lines
342	// The randomized order is used as input for the sorting timing tests.
343	//
344	//---------------------------------------------------------------------------------------
345	int ICURandomCmp(const void a, const void b) {
346	char ask = ((Line **)a)->icuSortKey;
347	char bsk = ((Line **)b)->icuSortKey;
348	int aVal = 0;
349	int bVal = 0;
350	int retVal;
351	while (*ask != 0) {
352	aVal += aVal37 + ask++;
353	}
354	while (*bsk != 0) {
355	bVal += bVal37 + bsk++;
356	}
357	retVal = -1;
358	if (aVal == bVal) {
359	retVal = 0;
360	}
361	else if (aVal > bVal) {
362	retVal = 1;
363	}
364	return retVal;
365	}
366
367	//---------------------------------------------------------------------------------------
368	//
369	// doKeyGen() Key Generation Timing Test
370	//
371	//---------------------------------------------------------------------------------------
372	void doKeyGen()
373	{
374	int line;
729e4ab9	375	int loops = 0;
46f4442e	376	int iLoop;
46f4442e A	377	int len=-1;
	378
	379	// Adjust loop count to compensate for file size. Should be order n
	380	double dLoopCount = double(opt_loopCount) * (1000. / double(gNumFileLines));
	381	int adj_loopCount = int(dLoopCount);
	382	if (adj_loopCount < 1) adj_loopCount = 1;
	383
	384
	385	unsigned long startTime = timeGetTime();
	386
	387	if (opt_win) {
	388	for (loops=0; loops<adj_loopCount; loops++) {
	389	for (line=0; line < gNumFileLines; line++) {
	390	if (opt_uselen) {
	391	len = gFileLines[line].len;
	392	}
	393	for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) {
3d1f044b	394	LCMapStringW(gWinLCID, LCMAP_SORTKEY,
46f4442e	395	gFileLines[line].name, len,
3d1f044b	396	(UChar *)gFileLines[line].winSortKey, 5000); // TODO something with length.
46f4442e A	397	}
	398	}
	399	}
	400	}
	401	else if (opt_icu)
	402	{
	403	for (loops=0; loops<adj_loopCount; loops++) {
	404	for (line=0; line < gNumFileLines; line++) {
	405	if (opt_uselen) {
	406	len = gFileLines[line].len;
	407	}
	408	for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) {
3d1f044b	409	ucol_getSortKey(gCol, gFileLines[line].name, len, (unsigned char *)gFileLines[line].icuSortKey, 5000);
46f4442e A	410	}
	411	}
	412	}
	413	}
	414	else if (opt_unix)
	415	{
	416	for (loops=0; loops<adj_loopCount; loops++) {
	417	for (line=0; line < gNumFileLines; line++) {
	418	for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) {
3d1f044b	419	strxfrm(gFileLines[line].unixSortKey, gFileLines[line].unixName, 5000);
46f4442e A	420	}
	421	}
	422	}
	423	}
	424
	425	unsigned long elapsedTime = timeGetTime() - startTime;
	426	int ns = (int)(float(1000000) * (float)elapsedTime / (float)(adj_loopCount*gNumFileLines));
	427
	428	if (opt_terse == FALSE) {
	429	printf("Sort Key Generation: total # of keys = %d\n", loops*gNumFileLines);
	430	printf("Sort Key Generation: time per key = %d ns\n", ns);
	431	}
	432	else {
	433	printf("%d, ", ns);
	434	}
	435
	436	int totalKeyLen = 0;
	437	int totalChars = 0;
	438	for (line=0; line<gNumFileLines; line++) {
	439	totalChars += u_strlen(gFileLines[line].name);
	440	if (opt_win) {
	441	totalKeyLen += strlen(gFileLines[line].winSortKey);
	442	}
	443	else if (opt_icu) {
	444	totalKeyLen += strlen(gFileLines[line].icuSortKey);
	445	}
	446	else if (opt_unix) {
	447	totalKeyLen += strlen(gFileLines[line].unixSortKey);
	448	}
	449
	450	}
	451	if (opt_terse == FALSE) {
	452	printf("Key Length / character = %f\n", (float)totalKeyLen / (float)totalChars);
	453	} else {
	454	printf("%f, ", (float)totalKeyLen / (float)totalChars);
	455	}
	456	}
	457
	458
	459
	460	//---------------------------------------------------------------------------------------
	461	//
	462	// doBinarySearch() Binary Search timing test. Each name from the list
	463	// is looked up in the full sorted list of names.
	464	//
	465	//---------------------------------------------------------------------------------------
	466	void doBinarySearch()
	467	{
	468
	469	gCount = 0;
	470	int line;
729e4ab9 A	471	int loops = 0;
	472	int iLoop = 0;
	473	unsigned long elapsedTime = 0;
46f4442e A	474
	475	// Adjust loop count to compensate for file size. Should be order n (lookups) * log n (compares/lookup)
	476	// Accurate timings do not depend on this being perfect. The correction is just to try to
	477	// get total running times of about the right order, so the that user doesn't need to
	478	// manually adjust the loop count for every different file size.
4388f060	479	double dLoopCount = double(opt_loopCount) * 3000. / (log10((double)gNumFileLines) * double(gNumFileLines));
46f4442e A	480	if (opt_usekeys) dLoopCount *= 5;
	481	int adj_loopCount = int(dLoopCount);
	482	if (adj_loopCount < 1) adj_loopCount = 1;
	483
	484
	485	for (;;) { // not really a loop, just allows "break" to work, to simplify
	486	// inadvertantly running more than one test through here.
	487	if (opt_strcmp \|\| opt_strcmpCPO)
	488	{
	489	unsigned long startTime = timeGetTime();
	490	typedef int32_t (U_EXPORT2 PF)(const UChar , const UChar *);
	491	PF pf = u_strcmp;
	492	if (opt_strcmpCPO) {pf = u_strcmpCodePointOrder;}
	493	//if (opt_strcmp && opt_win) {pf = (PF)wcscmp;} // Damn the difference between int32_t and int
	494	// which forces the use of a cast here.
	495
729e4ab9	496	int r = 0;
46f4442e A	497	for (loops=0; loops<adj_loopCount; loops++) {
	498
	499	for (line=0; line < gNumFileLines; line++) {
	500	int hi = gNumFileLines-1;
	501	int lo = 0;
	502	int guess = -1;
	503	for (;;) {
	504	int newGuess = (hi + lo) / 2;
	505	if (newGuess == guess)
	506	break;
	507	guess = newGuess;
	508	for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) {
	509	r = (*pf)((gSortedLines[line])->name, (gSortedLines[guess])->name);
	510	}
	511	gCount++;
	512	if (r== 0)
	513	break;
	514	if (r < 0)
	515	hi = guess;
	516	else
	517	lo = guess;
	518	}
	519	}
	520	}
	521	elapsedTime = timeGetTime() - startTime;
	522	break;
	523	}
	524
	525
	526	if (opt_icu)
	527	{
	528	unsigned long startTime = timeGetTime();
729e4ab9	529	UCollationResult r = UCOL_EQUAL;
46f4442e A	530	for (loops=0; loops<adj_loopCount; loops++) {
	531
	532	for (line=0; line < gNumFileLines; line++) {
	533	int lineLen = -1;
	534	int guessLen = -1;
	535	if (opt_uselen) {
	536	lineLen = (gSortedLines[line])->len;
	537	}
	538	int hi = gNumFileLines-1;
	539	int lo = 0;
	540	int guess = -1;
	541	for (;;) {
	542	int newGuess = (hi + lo) / 2;
	543	if (newGuess == guess)
	544	break;
	545	guess = newGuess;
729e4ab9	546	int ri = 0;
46f4442e A	547	if (opt_usekeys) {
	548	for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) {
	549	ri = strcmp((gSortedLines[line])->icuSortKey, (gSortedLines[guess])->icuSortKey);
	550	}
	551	gCount++;
	552	r=UCOL_GREATER; if(ri<0) {r=UCOL_LESS;} else if (ri==0) {r=UCOL_EQUAL;}
	553	}
	554	else
	555	{
	556	if (opt_uselen) {
	557	guessLen = (gSortedLines[guess])->len;
	558	}
	559	for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) {
	560	r = ucol_strcoll(gCol, (gSortedLines[line])->name, lineLen, (gSortedLines[guess])->name, guessLen);
	561	}
	562	gCount++;
	563	}
	564	if (r== UCOL_EQUAL)
	565	break;
	566	if (r == UCOL_LESS)
	567	hi = guess;
	568	else
	569	lo = guess;
	570	}
	571	}
	572	}
	573	elapsedTime = timeGetTime() - startTime;
	574	break;
	575	}
	576
	577	if (opt_win)
	578	{
	579	unsigned long startTime = timeGetTime();
729e4ab9	580	int r = 0;
46f4442e A	581	for (loops=0; loops<adj_loopCount; loops++) {
	582
	583	for (line=0; line < gNumFileLines; line++) {
	584	int lineLen = -1;
	585	int guessLen = -1;
	586	if (opt_uselen) {
	587	lineLen = (gSortedLines[line])->len;
	588	}
	589	int hi = gNumFileLines-1;
	590	int lo = 0;
	591	int guess = -1;
	592	for (;;) {
	593	int newGuess = (hi + lo) / 2;
	594	if (newGuess == guess)
	595	break;
	596	guess = newGuess;
	597	if (opt_usekeys) {
	598	for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) {
	599	r = strcmp((gSortedLines[line])->winSortKey, (gSortedLines[guess])->winSortKey);
	600	}
	601	gCount++;
	602	r+=2;
	603	}
	604	else
	605	{
	606	if (opt_uselen) {
	607	guessLen = (gSortedLines[guess])->len;
	608	}
	609	for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) {
	610	r = CompareStringW(gWinLCID, 0, (gSortedLines[line])->name, lineLen, (gSortedLines[guess])->name, guessLen);
	611	}
	612	if (r == 0) {
	613	if (opt_terse == FALSE) {
	614	fprintf(stderr, "Error returned from Windows CompareStringW.\n");
	615	}
	616	exit(-1);
	617	}
	618	gCount++;
	619	}
	620	if (r== 2) // strings ==
	621	break;
	622	if (r == 1) // line < guess
	623	hi = guess;
	624	else // line > guess
	625	lo = guess;
	626	}
	627	}
	628	}
	629	elapsedTime = timeGetTime() - startTime;
	630	break;
	631	}
	632
	633	if (opt_unix)
	634	{
	635	unsigned long startTime = timeGetTime();
729e4ab9	636	int r = 0;
46f4442e A	637	for (loops=0; loops<adj_loopCount; loops++) {
	638
	639	for (line=0; line < gNumFileLines; line++) {
	640	int hi = gNumFileLines-1;
	641	int lo = 0;
	642	int guess = -1;
	643	for (;;) {
	644	int newGuess = (hi + lo) / 2;
	645	if (newGuess == guess)
	646	break;
	647	guess = newGuess;
	648	if (opt_usekeys) {
	649	for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) {
	650	r = strcmp((gSortedLines[line])->unixSortKey, (gSortedLines[guess])->unixSortKey);
	651	}
	652	gCount++;
	653	}
	654	else
	655	{
	656	for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) {
	657	r = strcoll((gSortedLines[line])->unixName, (gSortedLines[guess])->unixName);
	658	}
	659	errno = 0;
	660	if (errno != 0) {
	661	fprintf(stderr, "Error %d returned from strcoll.\n", errno);
	662	exit(-1);
	663	}
	664	gCount++;
	665	}
	666	if (r == 0) // strings ==
	667	break;
	668	if (r < 0) // line < guess
	669	hi = guess;
	670	else // line > guess
	671	lo = guess;
	672	}
	673	}
	674	}
	675	elapsedTime = timeGetTime() - startTime;
	676	break;
	677	}
	678	break;
	679	}
	680
	681	int ns = (int)(float(1000000) * (float)elapsedTime / (float)gCount);
	682	if (opt_terse == FALSE) {
	683	printf("binary search: total # of string compares = %d\n", gCount);
	684	printf("binary search: compares per loop = %d\n", gCount / loops);
	685	printf("binary search: time per compare = %d ns\n", ns);
	686	} else {
	687	printf("%d, ", ns);
	688	}
	689
	690	}
	691
	692
	693
	694
	695	//---------------------------------------------------------------------------------------
	696	//
	697	// doQSort() The quick sort timing test. Uses the C library qsort function.
	698	//
	699	//---------------------------------------------------------------------------------------
	700	void doQSort() {
701	int i;
702	Line *sortBuf = new Line [gNumFileLines];
703
704	// Adjust loop count to compensate for file size. QSort should be n log(n)
4388f060	705	double dLoopCount = double(opt_loopCount) * 3000. / (log10((double)gNumFileLines) * double(gNumFileLines));
46f4442e A	706	if (opt_usekeys) dLoopCount *= 5;
	707	int adj_loopCount = int(dLoopCount);
	708	if (adj_loopCount < 1) adj_loopCount = 1;
	709
	710
	711	gCount = 0;
	712	unsigned long startTime = timeGetTime();
	713	if (opt_win && opt_usekeys) {
	714	for (i=0; i<opt_loopCount; i++) {
	715	memcpy(sortBuf, gRandomLines, gNumFileLines * sizeof(Line *));
	716	qsort(sortBuf, gNumFileLines, sizeof(Line *), WinstrcmpK);
	717	}
	718	}
	719
	720	else if (opt_win && opt_uselen) {
	721	for (i=0; i<adj_loopCount; i++) {
	722	memcpy(sortBuf, gRandomLines, gNumFileLines * sizeof(Line *));
	723	qsort(sortBuf, gNumFileLines, sizeof(Line *), WinstrcmpL);
	724	}
	725	}
	726
	727
	728	else if (opt_win && !opt_uselen) {
	729	for (i=0; i<adj_loopCount; i++) {
	730	memcpy(sortBuf, gRandomLines, gNumFileLines * sizeof(Line *));
	731	qsort(sortBuf, gNumFileLines, sizeof(Line *), Winstrcmp);
	732	}
	733	}
	734
	735	else if (opt_icu && opt_usekeys) {
	736	for (i=0; i<adj_loopCount; i++) {
	737	memcpy(sortBuf, gRandomLines, gNumFileLines * sizeof(Line *));
	738	qsort(sortBuf, gNumFileLines, sizeof(Line *), ICUstrcmpK);
	739	}
	740	}
	741
	742	else if (opt_icu && opt_uselen) {
	743	for (i=0; i<adj_loopCount; i++) {
	744	memcpy(sortBuf, gRandomLines, gNumFileLines * sizeof(Line *));
	745	qsort(sortBuf, gNumFileLines, sizeof(Line *), ICUstrcmpL);
	746	}
	747	}
	748
	749
	750	else if (opt_icu && !opt_uselen) {
	751	for (i=0; i<adj_loopCount; i++) {
	752	memcpy(sortBuf, gRandomLines, gNumFileLines * sizeof(Line *));
	753	qsort(sortBuf, gNumFileLines, sizeof(Line *), ICUstrcmp);
	754	}
	755	}
	756
	757	else if (opt_unix && !opt_usekeys) {
	758	for (i=0; i<adj_loopCount; i++) {
	759	memcpy(sortBuf, gRandomLines, gNumFileLines * sizeof(Line *));
	760	qsort(sortBuf, gNumFileLines, sizeof(Line *), UNIXstrcmp);
	761	}
	762	}
	763
	764	unsigned long elapsedTime = timeGetTime() - startTime;
	765	int ns = (int)(float(1000000) * (float)elapsedTime / (float)gCount);
	766	if (opt_terse == FALSE) {
	767	printf("qsort: total # of string compares = %d\n", gCount);
	768	printf("qsort: time per compare = %d ns\n", ns);
	769	} else {
770	printf("%d, ", ns);
771	}
729e4ab9	772	}
46f4442e A	773
	774
	775
	776	//---------------------------------------------------------------------------------------
	777	//
	778	// doKeyHist() Output a table of data for
	779	// average sort key size vs. string length.
	780	//
	781	//---------------------------------------------------------------------------------------
	782	void doKeyHist() {
	783	int i;
	784	int maxLen = 0;
	785
	786	// Find the maximum string length
	787	for (i=0; i<gNumFileLines; i++) {
	788	if (gFileLines[i].len > maxLen) maxLen = gFileLines[i].len;
	789	}
	790
	791	// Allocate arrays to hold the histogram data
	792	int *accumulatedLen = new int[maxLen+1];
	793	int *numKeysOfSize = new int[maxLen+1];
	794	for (i=0; i<=maxLen; i++) {
	795	accumulatedLen[i] = 0;
	796	numKeysOfSize[i] = 0;
	797	}
	798
	799	// Fill the arrays...
	800	for (i=0; i<gNumFileLines; i++) {
	801	int len = gFileLines[i].len;
	802	accumulatedLen[len] += strlen(gFileLines[i].icuSortKey);
	803	numKeysOfSize[len] += 1;
	804	}
	805
	806	// And write out averages
	807	printf("String Length, Avg Key Length, Avg Key Len per char\n");
	808	for (i=1; i<=maxLen; i++) {
	809	if (numKeysOfSize[i] > 0) {
	810	printf("%d, %f, %f\n", i, (float)accumulatedLen[i] / (float)numKeysOfSize[i],
	811	(float)accumulatedLen[i] / (float)(numKeysOfSize[i] * i));
	812	}
	813	}
729e4ab9 A	814	delete []accumulatedLen;
729e4ab9 A	815	delete []numKeysOfSize ;
46f4442e A	816	}
	817
	818	//---------------------------------------------------------------------------------------
	819	//
	820	// doForwardIterTest(UBool) Forward iteration test
	821	// argument null-terminated string used
	822	//
	823	//---------------------------------------------------------------------------------------
	824	void doForwardIterTest(UBool haslen) {
	825	int count = 0;
	826
	827	UErrorCode error = U_ZERO_ERROR;
	828	printf("\n\nPerforming forward iteration performance test with ");
	829
	830	if (haslen) {
	831	printf("non-null terminated data -----------\n");
	832	}
	833	else {
	834	printf("null terminated data -----------\n");
	835	}
	836	printf("performance test on strings from file -----------\n");
	837
	838	UChar dummytext[] = {0, 0};
	839	UCollationElements *iter = ucol_openElements(gCol, NULL, 0, &error);
	840	ucol_setText(iter, dummytext, 1, &error);
	841
	842	gCount = 0;
	843	unsigned long startTime = timeGetTime();
	844	while (count < opt_loopCount) {
	845	int linecount = 0;
	846	while (linecount < gNumFileLines) {
	847	UChar *str = gFileLines[linecount].name;
	848	int strlen = haslen?gFileLines[linecount].len:-1;
	849	ucol_setText(iter, str, strlen, &error);
	850	while (ucol_next(iter, &error) != UCOL_NULLORDER) {
	851	gCount++;
	852	}
	853
	854	linecount ++;
	855	}
	856	count ++;
	857	}
	858	unsigned long elapsedTime = timeGetTime() - startTime;
729e4ab9	859	printf("elapsedTime %ld\n", elapsedTime);
46f4442e A	860
	861	// empty loop recalculation
	862	count = 0;
	863	startTime = timeGetTime();
	864	while (count < opt_loopCount) {
	865	int linecount = 0;
	866	while (linecount < gNumFileLines) {
	867	UChar *str = gFileLines[linecount].name;
	868	int strlen = haslen?gFileLines[linecount].len:-1;
	869	ucol_setText(iter, str, strlen, &error);
	870	linecount ++;
	871	}
	872	count ++;
	873	}
	874	elapsedTime -= (timeGetTime() - startTime);
729e4ab9	875	printf("elapsedTime %ld\n", elapsedTime);
46f4442e A	876
	877	ucol_closeElements(iter);
	878
	879	int ns = (int)(float(1000000) * (float)elapsedTime / (float)gCount);
	880	printf("Total number of strings compared %d in %d loops\n", gNumFileLines,
	881	opt_loopCount);
	882	printf("Average time per ucol_next() nano seconds %d\n", ns);
	883
	884	printf("performance test on skipped-5 concatenated strings from file -----------\n");
	885
	886	UChar *str;
	887	int strlen = 0;
	888	// appending all the strings
	889	int linecount = 0;
	890	while (linecount < gNumFileLines) {
	891	strlen += haslen?gFileLines[linecount].len:
	892	u_strlen(gFileLines[linecount].name);
	893	linecount ++;
	894	}
	895	str = (UChar )malloc(sizeof(UChar) strlen);
	896	int strindex = 0;
	897	linecount = 0;
	898	while (strindex < strlen) {
	899	int len = 0;
	900	len += haslen?gFileLines[linecount].len:
	901	u_strlen(gFileLines[linecount].name);
	902	memcpy(str + strindex, gFileLines[linecount].name,
	903	sizeof(UChar) * len);
	904	strindex += len;
	905	linecount ++;
	906	}
	907
	908	printf("Total size of strings %d\n", strlen);
	909
	910	gCount = 0;
	911	count = 0;
	912
	913	if (!haslen) {
	914	strlen = -1;
	915	}
	916	iter = ucol_openElements(gCol, str, strlen, &error);
	917	if (!haslen) {
	918	strlen = u_strlen(str);
	919	}
	920	strlen -= 5; // any left over characters are not iterated,
	921	// this is to ensure the backwards and forwards iterators
	922	// gets the same position
	923	startTime = timeGetTime();
	924	while (count < opt_loopCount) {
	925	int count5 = 5;
	926	strindex = 0;
	927	ucol_setOffset(iter, strindex, &error);
	928	while (TRUE) {
	929	if (ucol_next(iter, &error) == UCOL_NULLORDER) {
	930	break;
	931	}
	932	gCount++;
	933	count5 --;
	934	if (count5 == 0) {
	935	strindex += 10;
	936	if (strindex > strlen) {
	937	break;
	938	}
	939	ucol_setOffset(iter, strindex, &error);
940	count5 = 5;
941	}
942	}
943	count ++;
944	}
945
946	elapsedTime = timeGetTime() - startTime;
729e4ab9	947	printf("elapsedTime %ld\n", elapsedTime);
46f4442e A	948
	949	// empty loop recalculation
	950	int tempgCount = 0;
	951	count = 0;
	952	startTime = timeGetTime();
	953	while (count < opt_loopCount) {
	954	int count5 = 5;
	955	strindex = 0;
	956	ucol_setOffset(iter, strindex, &error);
	957	while (TRUE) {
	958	tempgCount ++;
	959	count5 --;
	960	if (count5 == 0) {
	961	strindex += 10;
	962	if (strindex > strlen) {
	963	break;
	964	}
	965	ucol_setOffset(iter, strindex, &error);
	966	count5 = 5;
	967	}
	968	}
	969	count ++;
	970	}
	971	elapsedTime -= (timeGetTime() - startTime);
729e4ab9	972	printf("elapsedTime %ld\n", elapsedTime);
46f4442e A	973
	974	ucol_closeElements(iter);
	975
	976	printf("gCount %d\n", gCount);
	977	ns = (int)(float(1000000) * (float)elapsedTime / (float)gCount);
	978	printf("Average time per ucol_next() nano seconds %d\n", ns);
	979	}
	980
	981	//---------------------------------------------------------------------------------------
	982	//
	983	// doBackwardIterTest(UBool) Backwards iteration test
	984	// argument null-terminated string used
	985	//
	986	//---------------------------------------------------------------------------------------
	987	void doBackwardIterTest(UBool haslen) {
	988	int count = 0;
	989	UErrorCode error = U_ZERO_ERROR;
	990	printf("\n\nPerforming backward iteration performance test with ");
	991
	992	if (haslen) {
	993	printf("non-null terminated data -----------\n");
	994	}
	995	else {
	996	printf("null terminated data -----------\n");
	997	}
	998
	999	printf("performance test on strings from file -----------\n");
	1000
	1001	UCollationElements *iter = ucol_openElements(gCol, NULL, 0, &error);
	1002	UChar dummytext[] = {0, 0};
	1003	ucol_setText(iter, dummytext, 1, &error);
	1004
	1005	gCount = 0;
	1006	unsigned long startTime = timeGetTime();
	1007	while (count < opt_loopCount) {
	1008	int linecount = 0;
	1009	while (linecount < gNumFileLines) {
	1010	UChar *str = gFileLines[linecount].name;
	1011	int strlen = haslen?gFileLines[linecount].len:-1;
	1012	ucol_setText(iter, str, strlen, &error);
	1013	while (ucol_previous(iter, &error) != UCOL_NULLORDER) {
	1014	gCount ++;
	1015	}
	1016
	1017	linecount ++;
	1018	}
	1019	count ++;
	1020	}
	1021	unsigned long elapsedTime = timeGetTime() - startTime;
	1022
729e4ab9	1023	printf("elapsedTime %ld\n", elapsedTime);
46f4442e A	1024
	1025	// empty loop recalculation
	1026	count = 0;
	1027	startTime = timeGetTime();
	1028	while (count < opt_loopCount) {
	1029	int linecount = 0;
	1030	while (linecount < gNumFileLines) {
	1031	UChar *str = gFileLines[linecount].name;
	1032	int strlen = haslen?gFileLines[linecount].len:-1;
	1033	ucol_setText(iter, str, strlen, &error);
	1034	linecount ++;
	1035	}
	1036	count ++;
	1037	}
	1038	elapsedTime -= (timeGetTime() - startTime);
	1039
729e4ab9	1040	printf("elapsedTime %ld\n", elapsedTime);
46f4442e A	1041	ucol_closeElements(iter);
	1042
	1043	int ns = (int)(float(1000000) * (float)elapsedTime / (float)gCount);
	1044	printf("Total number of strings compared %d in %d loops\n", gNumFileLines,
	1045	opt_loopCount);
	1046	printf("Average time per ucol_previous() nano seconds %d\n", ns);
	1047
	1048	printf("performance test on skipped-5 concatenated strings from file -----------\n");
	1049
	1050	UChar *str;
	1051	int strlen = 0;
	1052	// appending all the strings
	1053	int linecount = 0;
	1054	while (linecount < gNumFileLines) {
	1055	strlen += haslen?gFileLines[linecount].len:
	1056	u_strlen(gFileLines[linecount].name);
	1057	linecount ++;
	1058	}
	1059	str = (UChar )malloc(sizeof(UChar) strlen);
	1060	int strindex = 0;
	1061	linecount = 0;
	1062	while (strindex < strlen) {
	1063	int len = 0;
	1064	len += haslen?gFileLines[linecount].len:
	1065	u_strlen(gFileLines[linecount].name);
	1066	memcpy(str + strindex, gFileLines[linecount].name,
	1067	sizeof(UChar) * len);
	1068	strindex += len;
	1069	linecount ++;
	1070	}
	1071
	1072	printf("Total size of strings %d\n", strlen);
	1073
	1074	gCount = 0;
	1075	count = 0;
	1076
	1077	if (!haslen) {
	1078	strlen = -1;
	1079	}
	1080
	1081	iter = ucol_openElements(gCol, str, strlen, &error);
	1082	if (!haslen) {
	1083	strlen = u_strlen(str);
	1084	}
	1085
	1086	startTime = timeGetTime();
	1087	while (count < opt_loopCount) {
	1088	int count5 = 5;
	1089	strindex = 5;
	1090	ucol_setOffset(iter, strindex, &error);
	1091	while (TRUE) {
	1092	if (ucol_previous(iter, &error) == UCOL_NULLORDER) {
	1093	break;
	1094	}
	1095	gCount ++;
	1096	count5 --;
	1097	if (count5 == 0) {
	1098	strindex += 10;
	1099	if (strindex > strlen) {
	1100	break;
	1101	}
	1102	ucol_setOffset(iter, strindex, &error);
	1103	count5 = 5;
	1104	}
1105	}
1106	count ++;
1107	}
1108
1109	elapsedTime = timeGetTime() - startTime;
729e4ab9	1110	printf("elapsedTime %ld\n", elapsedTime);
46f4442e A	1111
	1112	// empty loop recalculation
	1113	count = 0;
	1114	int tempgCount = 0;
	1115	startTime = timeGetTime();
	1116	while (count < opt_loopCount) {
	1117	int count5 = 5;
	1118	strindex = 5;
	1119	ucol_setOffset(iter, strindex, &error);
	1120	while (TRUE) {
	1121	tempgCount ++;
	1122	count5 --;
	1123	if (count5 == 0) {
	1124	strindex += 10;
	1125	if (strindex > strlen) {
	1126	break;
	1127	}
	1128	ucol_setOffset(iter, strindex, &error);
	1129	count5 = 5;
	1130	}
	1131	}
	1132	count ++;
	1133	}
	1134	elapsedTime -= (timeGetTime() - startTime);
729e4ab9	1135	printf("elapsedTime %ld\n", elapsedTime);
46f4442e A	1136	ucol_closeElements(iter);
	1137
	1138	printf("gCount %d\n", gCount);
	1139	ns = (int)(float(1000000) * (float)elapsedTime / (float)gCount);
	1140	printf("Average time per ucol_previous() nano seconds %d\n", ns);
	1141	}
	1142
	1143	//---------------------------------------------------------------------------------------
	1144	//
	1145	// doIterTest() Iteration test
	1146	//
	1147	//---------------------------------------------------------------------------------------
	1148	void doIterTest() {
	1149	doForwardIterTest(opt_uselen);
	1150	doBackwardIterTest(opt_uselen);
	1151	}
	1152
	1153
	1154	//----------------------------------------------------------------------------------------
	1155	//
	1156	// UnixConvert -- Convert the lines of the file to the encoding for UNIX
	1157	// Since it appears that Unicode support is going in the general
	1158	// direction of the use of UTF-8 locales, that is the approach
	1159	// that is used here.
	1160	//
	1161	//----------------------------------------------------------------------------------------
	1162	void UnixConvert() {
	1163	int line;
	1164
	1165	UConverter *cvrtr; // An ICU code page converter.
	1166	UErrorCode status = U_ZERO_ERROR;
	1167
	1168
	1169	cvrtr = ucnv_open("utf-8", &status); // we are just doing UTF-8 locales for now.
	1170	if (U_FAILURE(status)) {
729e4ab9	1171	fprintf(stderr, "ICU Converter open failed.: %s\n", u_errorName(status));
46f4442e A	1172	exit(-1);
	1173	}
	1174
	1175	for (line=0; line < gNumFileLines; line++) {
	1176	int sizeNeeded = ucnv_fromUChars(cvrtr,
	1177	0, // ptr to target buffer.
	1178	0, // length of target buffer.
	1179	gFileLines[line].name,
	1180	-1, // source is null terminated
	1181	&status);
	1182	if (status != U_BUFFER_OVERFLOW_ERROR && status != U_ZERO_ERROR) {
	1183	//fprintf(stderr, "Conversion from Unicode, something is wrong.\n");
	1184	//exit(-1);
	1185	}
	1186	status = U_ZERO_ERROR;
	1187	gFileLines[line].unixName = new char[sizeNeeded+1];
	1188	sizeNeeded = ucnv_fromUChars(cvrtr,
	1189	gFileLines[line].unixName, // ptr to target buffer.
	1190	sizeNeeded+1, // length of target buffer.
	1191	gFileLines[line].name,
	1192	-1, // source is null terminated
	1193	&status);
	1194	if (U_FAILURE(status)) {
	1195	fprintf(stderr, "ICU Conversion Failed.: %d\n", status);
	1196	exit(-1);
	1197	}
	1198	gFileLines[line].unixName[sizeNeeded] = 0;
	1199	};
	1200	ucnv_close(cvrtr);
	1201	}
	1202
	1203
	1204	//----------------------------------------------------------------------------------------
	1205	//
	1206	// class UCharFile Class to hide all the gorp to read a file in
	1207	// and produce a stream of UChars.
	1208	//
	1209	//----------------------------------------------------------------------------------------
	1210	class UCharFile {
	1211	public:
	1212	UCharFile(const char *fileName);
	1213	~UCharFile();
	1214	UChar get();
	1215	UBool eof() {return fEof;};
	1216	UBool error() {return fError;};
	1217
	1218	private:
729e4ab9 A	1219	UCharFile (const UCharFile & /other/) {}; // No copy constructor.
729e4ab9 A	1220	UCharFile & operator = (const UCharFile &/other/) {return *this;}; // No assignment op
46f4442e A	1221
	1222	FILE *fFile;
	1223	const char *fName;
	1224	UBool fEof;
	1225	UBool fError;
	1226	UChar fPending2ndSurrogate;
	1227
	1228	enum {UTF16LE, UTF16BE, UTF8} fEncoding;
	1229	};
	1230
	1231	UCharFile::UCharFile(const char * fileName) {
	1232	fEof = FALSE;
	1233	fError = FALSE;
	1234	fName = fileName;
	1235	fFile = fopen(fName, "rb");
	1236	fPending2ndSurrogate = 0;
	1237	if (fFile == NULL) {
	1238	fprintf(stderr, "Can not open file \"%s\"\n", opt_fName);
	1239	fError = TRUE;
	1240	return;
	1241	}
	1242	//
	1243	// Look for the byte order mark at the start of the file.
	1244	//
	1245	int BOMC1, BOMC2, BOMC3;
	1246	BOMC1 = fgetc(fFile);
	1247	BOMC2 = fgetc(fFile);
	1248
	1249	if (BOMC1 == 0xff && BOMC2 == 0xfe) {
	1250	fEncoding = UTF16LE; }
	1251	else if (BOMC1 == 0xfe && BOMC2 == 0xff) {
	1252	fEncoding = UTF16BE; }
	1253	else if (BOMC1 == 0xEF && BOMC2 == 0xBB && (BOMC3 = fgetc(fFile)) == 0xBF ) {
	1254	fEncoding = UTF8; }
	1255	else
	1256	{
	1257	fprintf(stderr, "collperf: file \"%s\" encoding must be UTF-8 or UTF-16, and "
	1258	"must include a BOM.\n", fileName);
	1259	fError = true;
	1260	return;
	1261	}
	1262	}
	1263
	1264
	1265	UCharFile::~UCharFile() {
	1266	fclose(fFile);
	1267	}
	1268
	1269
	1270
	1271	UChar UCharFile::get() {
	1272	UChar c;
	1273	switch (fEncoding) {
	1274	case UTF16LE:
	1275	{
	1276	int cL, cH;
	1277	cL = fgetc(fFile);
	1278	cH = fgetc(fFile);
	1279	c = cL \| (cH << 8);
	1280	if (cH == EOF) {
	1281	c = 0;
	1282	fEof = TRUE;
	1283	}
	1284	break;
1285	}
1286	case UTF16BE:
1287	{
1288	int cL, cH;
1289	cH = fgetc(fFile);
1290	cL = fgetc(fFile);
1291	c = cL \| (cH << 8);
1292	if (cL == EOF) {
1293	c = 0;
1294	fEof = TRUE;
1295	}
1296	break;
1297	}
1298	case UTF8:
1299	{
1300	if (fPending2ndSurrogate != 0) {
1301	c = fPending2ndSurrogate;
1302	fPending2ndSurrogate = 0;
1303	break;
1304	}
1305
1306	int ch = fgetc(fFile); // Note: c and ch are separate cause eof test doesn't work on UChar type.
1307	if (ch == EOF) {
1308	c = 0;
1309	fEof = TRUE;
1310	break;
1311	}
1312
1313	if (ch <= 0x7f) {
1314	// It's ascii. No further utf-8 conversion.
1315	c = ch;
1316	break;
1317	}
1318
1319	// Figure out the lenght of the char and read the rest of the bytes
1320	// into a temp array.
1321	int nBytes;
1322	if (ch >= 0xF0) {nBytes=4;}
1323	else if (ch >= 0xE0) {nBytes=3;}
1324	else if (ch >= 0xC0) {nBytes=2;}
1325	else {
1326	fprintf(stderr, "utf-8 encoded file contains corrupt data.\n");
1327	fError = TRUE;
1328	return 0;
1329	}
1330
1331	unsigned char bytes[10];
1332	bytes[0] = (unsigned char)ch;
1333	int i;
1334	for (i=1; i<nBytes; i++) {
1335	bytes[i] = fgetc(fFile);
1336	if (bytes[i] < 0x80 \|\| bytes[i] >= 0xc0) {
1337	fprintf(stderr, "utf-8 encoded file contains corrupt data.\n");
1338	fError = TRUE;
1339	return 0;
1340	}
1341	}
1342
1343	// Convert the bytes from the temp array to a Unicode char.
1344	i = 0;
1345	uint32_t cp;
51004dcb	1346	U8_NEXT_UNSAFE(bytes, i, cp);
46f4442e A	1347	c = (UChar)cp;
	1348
	1349	if (cp >= 0x10000) {
	1350	// The code point needs to be broken up into a utf-16 surrogate pair.
	1351	// Process first half this time through the main loop, and
	1352	// remember the other half for the next time through.
	1353	UChar utf16Buf[3];
	1354	i = 0;
	1355	UTF16_APPEND_CHAR_UNSAFE(utf16Buf, i, cp);
	1356	fPending2ndSurrogate = utf16Buf[1];
	1357	c = utf16Buf[0];
	1358	}
	1359	break;
	1360	};
729e4ab9 A	1361	default:
	1362	c = 0xFFFD; /* Error, unspecified codepage*/
	1363	fprintf(stderr, "UCharFile: Error: unknown fEncoding\n");
	1364	exit(1);
46f4442e A	1365	}
	1366	return c;
	1367	}
	1368
	1369	//----------------------------------------------------------------------------------------
	1370	//
	1371	// openRulesCollator - Command line specified a rules file. Read it in
	1372	// and open a collator with it.
	1373	//
	1374	//----------------------------------------------------------------------------------------
	1375	UCollator *openRulesCollator() {
	1376	UCharFile f(opt_rules);
	1377	if (f.error()) {
	1378	return 0;
	1379	}
	1380
	1381	int bufLen = 10000;
	1382	UChar buf = (UChar )malloc(bufLen * sizeof(UChar));
4388f060	1383	UChar *tmp;
46f4442e A	1384	int i = 0;
	1385
	1386	for(;;) {
	1387	buf[i] = f.get();
	1388	if (f.eof()) {
	1389	break;
	1390	}
	1391	if (f.error()) {
	1392	return 0;
	1393	}
	1394	i++;
	1395	if (i >= bufLen) {
4388f060	1396	tmp = buf;
46f4442e A	1397	bufLen += 10000;
46f4442e A	1398	buf = (UChar *)realloc(buf, bufLen);
4388f060 A	1399	if (buf == NULL) {
	1400	free(tmp);
	1401	return 0;
	1402	}
46f4442e A	1403	}
	1404	}
	1405	buf[i] = 0;
	1406
	1407	UErrorCode status = U_ZERO_ERROR;
	1408	UCollator *coll = ucol_openRules(buf, u_strlen(buf), UCOL_OFF,
	1409	UCOL_DEFAULT_STRENGTH, NULL, &status);
	1410	if (U_FAILURE(status)) {
	1411	fprintf(stderr, "ICU ucol_openRules() open failed.: %d\n", status);
	1412	return 0;
	1413	}
	1414	free(buf);
	1415	return coll;
	1416	}
	1417
	1418
	1419
	1420
	1421
	1422	//----------------------------------------------------------------------------------------
	1423	//
	1424	// Main -- process command line, read in and pre-process the test file,
	1425	// call other functions to do the actual tests.
	1426	//
	1427	//----------------------------------------------------------------------------------------
	1428	int main(int argc, const char** argv) {
	1429	if (ProcessOptions(argc, argv, opts) != TRUE \|\| opt_help \|\| opt_fName == 0) {
	1430	printf(gUsageString);
	1431	exit (1);
	1432	}
	1433
	1434	// Make sure that we've only got one API selected.
	1435	if (opt_unix \|\| opt_win) opt_icu = FALSE;
	1436	if (opt_unix) opt_win = FALSE;
	1437
	1438	//
	1439	// Set up an ICU collator
	1440	//
	1441	UErrorCode status = U_ZERO_ERROR;
	1442
	1443	if (opt_rules != 0) {
	1444	gCol = openRulesCollator();
	1445	if (gCol == 0) {return -1;}
	1446	}
	1447	else {
	1448	gCol = ucol_open(opt_locale, &status);
	1449	if (U_FAILURE(status)) {
	1450	fprintf(stderr, "Collator creation failed.: %d\n", status);
	1451	return -1;
	1452	}
	1453	}
	1454	if (status==U_USING_DEFAULT_WARNING && opt_terse==FALSE) {
	1455	fprintf(stderr, "Warning, U_USING_DEFAULT_WARNING for %s\n", opt_locale);
	1456	}
	1457	if (status==U_USING_FALLBACK_WARNING && opt_terse==FALSE) {
	1458	fprintf(stderr, "Warning, U_USING_FALLBACK_ERROR for %s\n", opt_locale);
	1459	}
	1460
	1461	if (opt_norm) {
	1462	ucol_setAttribute(gCol, UCOL_NORMALIZATION_MODE, UCOL_ON, &status);
	1463	}
	1464	if (opt_french && opt_frenchoff) {
	1465	fprintf(stderr, "collperf: Error, specified both -french and -frenchoff options.");
	1466	exit(-1);
1467	}
1468	if (opt_french) {
1469	ucol_setAttribute(gCol, UCOL_FRENCH_COLLATION, UCOL_ON, &status);
1470	}
1471	if (opt_frenchoff) {
1472	ucol_setAttribute(gCol, UCOL_FRENCH_COLLATION, UCOL_OFF, &status);
1473	}
1474	if (opt_lower) {
1475	ucol_setAttribute(gCol, UCOL_CASE_FIRST, UCOL_LOWER_FIRST, &status);
1476	}
1477	if (opt_upper) {
1478	ucol_setAttribute(gCol, UCOL_CASE_FIRST, UCOL_UPPER_FIRST, &status);
1479	}
1480	if (opt_case) {
1481	ucol_setAttribute(gCol, UCOL_CASE_LEVEL, UCOL_ON, &status);
1482	}
1483	if (opt_shifted) {
1484	ucol_setAttribute(gCol, UCOL_ALTERNATE_HANDLING, UCOL_SHIFTED, &status);
1485	}
1486	if (opt_level != 0) {
1487	switch (opt_level) {
1488	case 1:
1489	ucol_setAttribute(gCol, UCOL_STRENGTH, UCOL_PRIMARY, &status);
1490	break;
1491	case 2:
1492	ucol_setAttribute(gCol, UCOL_STRENGTH, UCOL_SECONDARY, &status);
1493	break;
1494	case 3:
1495	ucol_setAttribute(gCol, UCOL_STRENGTH, UCOL_TERTIARY, &status);
1496	break;
1497	case 4:
1498	ucol_setAttribute(gCol, UCOL_STRENGTH, UCOL_QUATERNARY, &status);
1499	break;
1500	case 5:
1501	ucol_setAttribute(gCol, UCOL_STRENGTH, UCOL_IDENTICAL, &status);
1502	break;
1503	default:
1504	fprintf(stderr, "-level param must be between 1 and 5\n");
1505	exit(-1);
1506	}
1507	}
1508
1509	if (U_FAILURE(status)) {
1510	fprintf(stderr, "Collator attribute setting failed.: %d\n", status);
1511	return -1;
1512	}
1513
1514
1515	//
1516	// Set up a Windows LCID
1517	//
1518	if (opt_langid != 0) {
1519	gWinLCID = MAKELCID(opt_langid, SORT_DEFAULT);
1520	}
1521	else {
1522	gWinLCID = uloc_getLCID(opt_locale);
1523	}
1524
1525
1526	//
1527	// Set the UNIX locale
1528	//
1529	if (opt_unix) {
1530	if (setlocale(LC_ALL, opt_locale) == 0) {
1531	fprintf(stderr, "setlocale(LC_ALL, %s) failed.\n", opt_locale);
1532	exit(-1);
1533	}
1534	}
1535
1536	// Read in the input file.
1537	// File assumed to be utf-16.
1538	// Lines go onto heap buffers. Global index array to line starts is created.
1539	// Lines themselves are null terminated.
1540	//
1541
1542	UCharFile f(opt_fName);
1543	if (f.error()) {
1544	exit(-1);
1545	}
1546
1547	const int MAXLINES = 100000;
1548	gFileLines = new Line[MAXLINES];
1549	UChar buf[1024];
1550	int column = 0;
1551
1552	// Read the file, split into lines, and save in memory.
1553	// Loop runs once per utf-16 value from the input file,
1554	// (The number of bytes read from file per loop iteration depends on external encoding.)
1555	for (;;) {
1556
1557	UChar c = f.get();
1558	if (f.error()){
1559	exit(-1);
1560	}
1561
1562
1563	// We now have a good UTF-16 value in c.
1564
1565	// Watch for CR, LF, EOF; these finish off a line.
1566	if (c == 0xd) {
1567	continue;
1568	}
1569
1570	if (f.eof() \|\| c == 0x0a \|\| c==0x2028) { // Unipad inserts 2028 line separators!
1571	buf[column++] = 0;
1572	if (column > 1) {
1573	gFileLines[gNumFileLines].name = new UChar[column];
1574	gFileLines[gNumFileLines].len = column-1;
1575	memcpy(gFileLines[gNumFileLines].name, buf, column * sizeof(UChar));
1576	gNumFileLines++;
1577	column = 0;
1578	if (gNumFileLines >= MAXLINES) {
1579	fprintf(stderr, "File too big. Max number of lines is %d\n", MAXLINES);
1580	exit(-1);
1581	}
1582
1583	}
1584	if (c == 0xa \|\| c == 0x2028)
1585	continue;
1586	else
1587	break; // EOF
1588	}
1589	buf[column++] = c;
1590	if (column >= 1023)
1591	{
1592	static UBool warnFlag = TRUE;
1593	if (warnFlag) {
1594	fprintf(stderr, "Warning - file line longer than 1023 chars truncated.\n");
1595	warnFlag = FALSE;
1596	}
1597	column--;
1598	}
1599	}
1600
1601	if (opt_terse == FALSE) {
1602	printf("file \"%s\", %d lines.\n", opt_fName, gNumFileLines);
1603	}
1604
1605
1606	// Convert the lines to the UNIX encoding.
1607	if (opt_unix) {
1608	UnixConvert();
1609	}
1610
1611	//
1612	// Pre-compute ICU sort keys for the lines of the file.
1613	//
1614	int line;
729e4ab9	1615	int32_t t;
46f4442e A	1616
	1617	for (line=0; line<gNumFileLines; line++) {
	1618	t = ucol_getSortKey(gCol, gFileLines[line].name, -1, (unsigned char *)buf, sizeof(buf));
	1619	gFileLines[line].icuSortKey = new char[t];
	1620
729e4ab9	1621	if (t > (int32_t)sizeof(buf)) {
46f4442e A	1622	t = ucol_getSortKey(gCol, gFileLines[line].name, -1, (unsigned char *)gFileLines[line].icuSortKey , t);
	1623	}
	1624	else
	1625	{
	1626	memcpy(gFileLines[line].icuSortKey, buf, t);
	1627	}
	1628	}
	1629
	1630
	1631
	1632	//
	1633	// Pre-compute Windows sort keys for the lines of the file.
	1634	//
	1635	for (line=0; line<gNumFileLines; line++) {
	1636	t=LCMapStringW(gWinLCID, LCMAP_SORTKEY, gFileLines[line].name, -1, buf, sizeof(buf));
	1637	gFileLines[line].winSortKey = new char[t];
729e4ab9	1638	if (t > (int32_t)sizeof(buf)) {
3d1f044b	1639	t = LCMapStringW(gWinLCID, LCMAP_SORTKEY, gFileLines[line].name, -1, (UChar *)(gFileLines[line].winSortKey), t);
46f4442e A	1640	}
	1641	else
	1642	{
	1643	memcpy(gFileLines[line].winSortKey, buf, t);
	1644	}
	1645	}
	1646
	1647	//
	1648	// Pre-compute UNIX sort keys for the lines of the file.
	1649	//
	1650	if (opt_unix) {
	1651	for (line=0; line<gNumFileLines; line++) {
	1652	t=strxfrm((char *)buf, gFileLines[line].unixName, sizeof(buf));
	1653	gFileLines[line].unixSortKey = new char[t];
729e4ab9	1654	if (t > (int32_t)sizeof(buf)) {
46f4442e A	1655	t = strxfrm(gFileLines[line].unixSortKey, gFileLines[line].unixName, sizeof(buf));
	1656	}
	1657	else
	1658	{
	1659	memcpy(gFileLines[line].unixSortKey, buf, t);
	1660	}
	1661	}
	1662	}
	1663
	1664
	1665	//
	1666	// Dump file lines, CEs, Sort Keys if requested.
	1667	//
	1668	if (opt_dump) {
	1669	int i;
	1670	for (line=0; line<gNumFileLines; line++) {
	1671	for (i=0;;i++) {
	1672	UChar c = gFileLines[line].name[i];
	1673	if (c == 0)
	1674	break;
	1675	if (c < 0x20 \|\| c > 0x7e) {
	1676	printf("\\u%.4x", c);
	1677	}
	1678	else {
	1679	printf("%c", c);
	1680	}
	1681	}
	1682	printf("\n");
	1683
	1684	printf(" CEs: ");
	1685	UCollationElements *CEiter = ucol_openElements(gCol, gFileLines[line].name, -1, &status);
	1686	int32_t ce;
	1687	i = 0;
	1688	for (;;) {
	1689	ce = ucol_next(CEiter, &status);
	1690	if (ce == UCOL_NULLORDER) {
	1691	break;
	1692	}
	1693	printf(" %.8x", ce);
	1694	if (++i > 8) {
	1695	printf("\n ");
	1696	i = 0;
	1697	}
	1698	}
	1699	printf("\n");
	1700	ucol_closeElements(CEiter);
	1701
	1702
	1703	printf(" ICU Sort Key: ");
	1704	for (i=0; ; i++) {
	1705	unsigned char c = gFileLines[line].icuSortKey[i];
	1706	printf("%02x ", c);
	1707	if (c == 0) {
	1708	break;
	1709	}
	1710	if (i > 0 && i % 20 == 0) {
	1711	printf("\n ");
	1712	}
	1713	}
	1714	printf("\n");
	1715	}
	1716	}
	1717
	1718
1719	//
1720	// Pre-sort the lines.
1721	//
1722	int i;
1723	gSortedLines = new Line *[gNumFileLines];
1724	for (i=0; i<gNumFileLines; i++) {
1725	gSortedLines[i] = &gFileLines[i];
1726	}
1727
1728	if (opt_win) {
1729	qsort(gSortedLines, gNumFileLines, sizeof(Line *), Winstrcmp);
1730	}
1731	else if (opt_unix) {
1732	qsort(gSortedLines, gNumFileLines, sizeof(Line *), UNIXstrcmp);
1733	}
1734	else /* ICU */
1735	{
1736	qsort(gSortedLines, gNumFileLines, sizeof(Line *), ICUstrcmp);
1737	}
1738
1739
1740	//
1741	// Make up a randomized order, will be used for sorting tests.
1742	//
1743	gRandomLines = new Line *[gNumFileLines];
1744	for (i=0; i<gNumFileLines; i++) {
1745	gRandomLines[i] = &gFileLines[i];
1746	}
1747	qsort(gRandomLines, gNumFileLines, sizeof(Line *), ICURandomCmp);
1748
1749
1750
1751
1752	//
1753	// We've got the file read into memory. Go do something with it.
1754	//
1755
1756	if (opt_qsort) doQSort();
1757	if (opt_binsearch) doBinarySearch();
1758	if (opt_keygen) doKeyGen();
1759	if (opt_keyhist) doKeyHist();
1760	if (opt_itertest) doIterTest();
1761
1762	return 0;
1763
1764	}