git.saurik.com Git - apple/icu.git/blame - icuSources/test/perf/ubrkperf/ubrkperfold.cpp

Commit	Line	Data
f3c0d7a5 A	1	/***********************************************************************
	2	* © 2016 and later: Unicode, Inc. and others.
	3	* License & terms of use: http://www.unicode.org/copyright.html#License
	4	*
	5	***********************************************************************
	6	***********************************************************************
b75a7d8f	7	* COPYRIGHT:
51004dcb	8	* Copyright (C) 2001-2012 IBM, Inc. All Rights Reserved.
b75a7d8f	9	*
f3c0d7a5	10	***********************************************************************/
b75a7d8f A	11	/********************************************************************************
	12	*
	13	* File ubrkperf.cpp
	14	*
	15	* Modification History:
	16	* Name Description
	17	* Vladimir Weinstein First Version, based on collperf
	18	*
	19	*********************************************************************************
	20	*/
	21
	22	//
	23	// This program tests break iterator performance
	24	// Currently we test only ICU APIs with the future possibility of testing *nix & win32 APIs
	25	// (if any)
	26	// A text file is required as input. It must be in utf-8 or utf-16 format,
	27	// and include a byte order mark. Either LE or BE format is OK.
	28	//
	29
	30	const char gUsageString[] =
	31	"usage: ubrkperf options...\n"
	32	"-help Display this message.\n"
	33	"-file file_name utf-16/utf-8 format file.\n"
	34	"-locale name ICU locale to use. Default is en_US\n"
	35	"-langid 0x1234 Windows Language ID number. Default to value for -locale option\n"
	36	" see http://msdn.microsoft.com/library/psdk/winbase/nls_8xo3.htm\n"
	37	"-win Run test using Windows native services. (currently not working) (ICU is default)\n"
	38	"-unix Run test using Unix word breaking services. (currently not working) \n"
	39	"-mac Run test using MacOSX word breaking services.\n"
	40	"-uselen Use API with string lengths. Default is null-terminated strings\n"
	41	"-char Use character break iterator\n"
	42	"-word Use word break iterator\n"
	43	"-line Use line break iterator\n"
	44	"-sentence Use sentence break iterator\n"
	45	"-loop nnnn Loopcount for test. Adjust for reasonable total running time.\n"
	46	"-iloop n Inner Loop Count. Default = 1. Number of calls to function\n"
	47	" under test at each call point. For measuring test overhead.\n"
	48	"-terse Terse numbers-only output. Intended for use by scripts.\n"
	49	"-dump Display stuff.\n"
	50	"-capi Use C APIs instead of C++ APIs (currently not working)\n"
	51	"-next Do the next test\n"
	52	"-isBound Do the isBound test\n"
	53	;
	54
	55
	56	#include <stdio.h>
	57	#include <string.h>
	58	#include <stdlib.h>
	59	#include <math.h>
	60	#include <locale.h>
	61	#include <errno.h>
	62	#include <sys/stat.h>
	63
	64	#include <unicode/utypes.h>
	65	#include <unicode/ucol.h>
	66	#include <unicode/ucoleitr.h>
	67	#include <unicode/uloc.h>
	68	#include <unicode/ustring.h>
	69	#include <unicode/ures.h>
	70	#include <unicode/uchar.h>
	71	#include <unicode/ucnv.h>
	72	#include <unicode/utf8.h>
	73
	74	#include <unicode/brkiter.h>
75
76
#if U_PLATFORM_HAS_WIN32_API
#include <windows.h>
#else
//
// Stubs for Windows API functions when building on UNIXes.
//
#include <sys/time.h>

// Stand-in for the Win32 timeGetTime(): returns the current gettimeofday()
// value expressed in milliseconds.  The value is allowed to wrap around, so
// callers should only ever use the difference between two readings.
unsigned long timeGetTime() {
    struct timeval now;
    gettimeofday(&now, 0);
    // Convert to unsigned long BEFORE multiplying: unsigned arithmetic wraps
    // with defined behaviour, whereas multiplying a signed time_t could overflow.
    unsigned long millis = (unsigned long)now.tv_sec * 1000UL;
    millis += (unsigned long)now.tv_usec / 1000UL;
    return millis;
}
#define MAKELCID(a,b) 0
#endif
	93
	94
	95	//
	96	// Command line option variables
	97	// These global variables are set according to the options specified
	98	// on the command line by the user.
	99	char * opt_fName = 0;
	100	char * opt_locale = "en_US";
	101	int opt_langid = 0; // Defaults to value corresponding to opt_locale.
	102	char * opt_rules = 0;
	103	UBool opt_help = FALSE;
	104	int opt_time = 0;
	105	int opt_loopCount = 0;
	106	int opt_passesCount= 1;
	107	UBool opt_terse = FALSE;
	108	UBool opt_icu = TRUE;
	109	UBool opt_win = FALSE; // Run with Windows native functions.
	110	UBool opt_unix = FALSE; // Run with UNIX strcoll, strxfrm functions.
	111	UBool opt_mac = FALSE; // Run with MacOSX word break services.
	112	UBool opt_uselen = FALSE;
	113	UBool opt_dump = FALSE;
	114	UBool opt_char = FALSE;
	115	UBool opt_word = FALSE;
	116	UBool opt_line = FALSE;
	117	UBool opt_sentence = FALSE;
	118	UBool opt_capi = FALSE;
	119
	120	UBool opt_next = FALSE;
	121	UBool opt_isBound = FALSE;
	122
	123
	124
	125	//
	126	// Definitions for the command line options
	127	//
	128	struct OptSpec {
	129	const char *name;
	130	enum {FLAG, NUM, STRING} type;
	131	void *pVar;
	132	};
	133
	134	OptSpec opts[] = {
	135	{"-file", OptSpec::STRING, &opt_fName},
	136	{"-locale", OptSpec::STRING, &opt_locale},
	137	{"-langid", OptSpec::NUM, &opt_langid},
	138	{"-win", OptSpec::FLAG, &opt_win},
	139	{"-unix", OptSpec::FLAG, &opt_unix},
	140	{"-mac", OptSpec::FLAG, &opt_mac},
	141	{"-uselen", OptSpec::FLAG, &opt_uselen},
142	{"-loop", OptSpec::NUM, &opt_loopCount},
143	{"-time", OptSpec::NUM, &opt_time},
144	{"-passes", OptSpec::NUM, &opt_passesCount},
145	{"-char", OptSpec::FLAG, &opt_char},
146	{"-word", OptSpec::FLAG, &opt_word},
147	{"-line", OptSpec::FLAG, &opt_line},
148	{"-sentence", OptSpec::FLAG, &opt_sentence},
149	{"-terse", OptSpec::FLAG, &opt_terse},
150	{"-dump", OptSpec::FLAG, &opt_dump},
151	{"-capi", OptSpec::FLAG, &opt_capi},
152	{"-next", OptSpec::FLAG, &opt_next},
153	{"-isBound", OptSpec::FLAG, &opt_isBound},
154	{"-help", OptSpec::FLAG, &opt_help},
155	{"-?", OptSpec::FLAG, &opt_help},
156	{0, OptSpec::FLAG, 0}
157	};
158
159
160	//---------------------------------------------------------------------------
161	//
162	// Global variables pointing to and describing the test file
163	//
164	//---------------------------------------------------------------------------
165
166	//DWORD gWinLCID;
167	BreakIterator *brkit = NULL;
168	UChar *text = NULL;
169	int32_t textSize = 0;
170
171
172
#if U_PLATFORM_IS_DARWIN_BASED
#include <ApplicationServices/ApplicationServices.h>
enum {
    kUCTextBreakAllMask = (kUCTextBreakClusterMask | kUCTextBreakWordMask | kUCTextBreakLineMask)
};
UCTextBreakType breakTypes[4] = {kUCTextBreakCharMask, kUCTextBreakClusterMask, kUCTextBreakWordMask, kUCTextBreakLineMask};
TextBreakLocatorRef breakRef;
UCTextBreakType macBreakType;

// Creates the Mac text break locator for opt_locale (stored in breakRef) and
// picks macBreakType from the -char / -word / -line options.  The status
// values returned by the system calls are stored but not examined.
void createMACBrkIt() {
    LocaleRef localeRef;
    OSStatus status = LocaleRefFromLocaleString(opt_locale, &localeRef);
    status = UCCreateTextBreakLocator(localeRef, 0, kUCTextBreakAllMask, (TextBreakLocatorRef*)&breakRef);

    if (opt_char == TRUE) {
        macBreakType = kUCTextBreakClusterMask;
        return;
    }
    if (opt_word == TRUE) {
        macBreakType = kUCTextBreakWordMask;
        return;
    }
    if (opt_line == TRUE) {
        macBreakType = kUCTextBreakLineMask;
        return;
    }
    if (opt_sentence == TRUE) {
        // error: no sentence break type is selected here, so macBreakType
        // is left untouched.
        // brkit = BreakIterator::createSentenceInstance(opt_locale, status);
        return;
    }
    // default is character iterator
    macBreakType = kUCTextBreakClusterMask;
}
#endif
	202
	203	void createICUBrkIt() {
	204	//
	205	// Set up an ICU break iterator
	206	//
	207	UErrorCode status = U_ZERO_ERROR;
	208	if(opt_char == TRUE) {
	209	brkit = BreakIterator::createCharacterInstance(opt_locale, status);
	210	} else if(opt_word == TRUE) {
	211	brkit = BreakIterator::createWordInstance(opt_locale, status);
	212	} else if(opt_line == TRUE) {
	213	brkit = BreakIterator::createLineInstance(opt_locale, status);
	214	} else if(opt_sentence == TRUE) {
	215	brkit = BreakIterator::createSentenceInstance(opt_locale, status);
	216	} else {
	217	// default is character iterator
	218	brkit = BreakIterator::createCharacterInstance(opt_locale, status);
	219	}
	220	if (status==U_USING_DEFAULT_WARNING && opt_terse==FALSE) {
	221	fprintf(stderr, "Warning, U_USING_DEFAULT_WARNING for %s\n", opt_locale);
	222	}
	223	if (status==U_USING_FALLBACK_WARNING && opt_terse==FALSE) {
	224	fprintf(stderr, "Warning, U_USING_FALLBACK_ERROR for %s\n", opt_locale);
	225	}
	226
	227	}
	228
	229	//---------------------------------------------------------------------------
	230	//
	231	// ProcessOptions() Function to read the command line options.
	232	//
	233	//---------------------------------------------------------------------------
	234	UBool ProcessOptions(int argc, const char **argv, OptSpec opts[])
	235	{
	236	int i;
	237	int argNum;
238	const char *pArgName;
239	OptSpec *pOpt;
240
241	for (argNum=1; argNum<argc; argNum++) {
242	pArgName = argv[argNum];
243	for (pOpt = opts; pOpt->name != 0; pOpt++) {
244	if (strcmp(pOpt->name, pArgName) == 0) {
245	switch (pOpt->type) {
246	case OptSpec::FLAG:
247	(UBool )(pOpt->pVar) = TRUE;
248	break;
249	case OptSpec::STRING:
250	argNum ++;
251	if (argNum >= argc) {
252	fprintf(stderr, "value expected for \"%s\" option.\n", pOpt->name);
253	return FALSE;
254	}
255	(const char *)(pOpt->pVar) = argv[argNum];
256	break;
257	case OptSpec::NUM:
258	argNum ++;
259	if (argNum >= argc) {
260	fprintf(stderr, "value expected for \"%s\" option.\n", pOpt->name);
261	return FALSE;
262	}
263	char *endp;
264	i = strtol(argv[argNum], &endp, 0);
265	if (endp == argv[argNum]) {
266	fprintf(stderr, "integer value expected for \"%s\" option.\n", pOpt->name);
267	return FALSE;
268	}
269	(int )(pOpt->pVar) = i;
270	}
271	break;
272	}
273	}
274	if (pOpt->name == 0)
275	{
276	fprintf(stderr, "Unrecognized option \"%s\"\n", pArgName);
277	return FALSE;
278	}
279	}
280	return TRUE;
281	}
282
283
284	void doForwardTest() {
285	if (opt_terse == FALSE) {
286	printf("Doing the forward test\n");
287	}
288	int32_t noBreaks = 0;
289	int32_t i = 0;
290	unsigned long startTime = timeGetTime();
291	unsigned long elapsedTime = 0;
292	if(opt_icu) {
293	createICUBrkIt();
294	brkit->setText(UnicodeString(text, textSize));
295	brkit->first();
296	if (opt_terse == FALSE) {
297	printf("Warmup\n");
298	}
299	int j;
300	while((j = brkit->next()) != BreakIterator::DONE) {
301	noBreaks++;
302	//fprintf(stderr, "%d ", j);
303	}
304
305	if (opt_terse == FALSE) {
306	printf("Measure\n");
307	}
308	startTime = timeGetTime();
309	for(i = 0; i < opt_loopCount; i++) {
310	brkit->first();
311	while(brkit->next() != BreakIterator::DONE) {
312	}
313	}
314
315	elapsedTime = timeGetTime()-startTime;
316	} else if(opt_mac) {
4388f060	317	#if U_PLATFORM_IS_DARWIN_BASED
b75a7d8f A	318	createMACBrkIt();
	319	UniChar* filePtr = text;
	320	OSStatus status = noErr;
	321	UniCharCount startOffset = 0, breakOffset = 0, numUniChars = textSize;
	322	startOffset = 0;
	323	//printf("\t---Search forward--\n");
	324
	325	while (startOffset < numUniChars)
	326	{
	327	status = UCFindTextBreak(breakRef, macBreakType, kUCTextBreakLeadingEdgeMask, filePtr, numUniChars,
	328	startOffset, &breakOffset);
	329	//require_action(status == noErr, EXIT, printf( "**UCFindTextBreak failed: startOffset %d, status %d\n", (int)startOffset, (int)status));
	330	//require_action((breakOffset <= numUniChars),EXIT, printf("**UCFindTextBreak breakOffset too big: startOffset %d, breakOffset %d\n", (int)startOffset, (int)breakOffset));
	331
	332	// Output break
	333	//printf("\t%d\n", (int)breakOffset);
	334
	335	// Increment counters
	336	noBreaks++;
	337	startOffset = breakOffset;
	338	}
	339	startTime = timeGetTime();
	340	for(i = 0; i < opt_loopCount; i++) {
	341	startOffset = 0;
	342
	343	while (startOffset < numUniChars)
	344	{
	345	status = UCFindTextBreak(breakRef, macBreakType, kUCTextBreakLeadingEdgeMask, filePtr, numUniChars,
	346	startOffset, &breakOffset);
	347	// Increment counters
	348	startOffset = breakOffset;
	349	}
	350	}
	351	elapsedTime = timeGetTime()-startTime;
	352	UCDisposeTextBreakLocator(&breakRef);
	353	#endif
	354
	355
	356	}
	357
	358
	359	if (opt_terse == FALSE) {
	360	int32_t loopTime = (int)(float(1000) * ((float)elapsedTime/(float)opt_loopCount));
	361	int32_t timePerCU = (int)(float(1000) * ((float)loopTime/(float)textSize));
	362	int32_t timePerBreak = (int)(float(1000) * ((float)loopTime/(float)noBreaks));
	363	printf("forward break iteration average loop time %d\n", loopTime);
	364	printf("number of code units %d average time per code unit %d\n", textSize, timePerCU);
	365	printf("number of breaks %d average time per break %d\n", noBreaks, timePerBreak);
	366	} else {
	367	printf("time=%d\nevents=%d\nsize=%d\n", elapsedTime, noBreaks, textSize);
	368	}
	369
	370
	371	}
	372
	373	void doIsBoundTest() {
	374	int32_t noBreaks = 0, hit = 0;
	375	int32_t i = 0, j = 0;
	376	unsigned long startTime = timeGetTime();
	377	unsigned long elapsedTime = 0;
	378	createICUBrkIt();
	379	brkit->setText(UnicodeString(text, textSize));
	380	brkit->first();
	381	for(j = 0; j < textSize; j++) {
382	if(brkit->isBoundary(j)) {
383	noBreaks++;
384	//fprintf(stderr, "%d ", j);
385	}
386	}
387	/*
388	while(brkit->next() != BreakIterator::DONE) {
389	noBreaks++;
390	}
391	*/
392
393	startTime = timeGetTime();
394	for(i = 0; i < opt_loopCount; i++) {
395	for(j = 0; j < textSize; j++) {
396	if(brkit->isBoundary(j)) {
397	hit++;
398	}
399	}
400	}
401
402	elapsedTime = timeGetTime()-startTime;
403	int32_t loopTime = (int)(float(1000) * ((float)elapsedTime/(float)opt_loopCount));
404	if (opt_terse == FALSE) {
405	int32_t timePerCU = (int)(float(1000) * ((float)loopTime/(float)textSize));
406	int32_t timePerBreak = (int)(float(1000) * ((float)loopTime/(float)noBreaks));
407	printf("forward break iteration average loop time %d\n", loopTime);
408	printf("number of code units %d average time per code unit %d\n", textSize, timePerCU);
409	printf("number of breaks %d average time per break %d\n", noBreaks, timePerBreak);
410	} else {
411	printf("time=%d\nevents=%d\nsize=%d\n", elapsedTime, noBreaks, textSize);
412	}
413	}
414
415	//----------------------------------------------------------------------------------------
416	//
417	// UnixConvert -- Convert the lines of the file to the encoding for UNIX
418	// Since it appears that Unicode support is going in the general
419	// direction of the use of UTF-8 locales, that is the approach
420	// that is used here.
421	//
422	//----------------------------------------------------------------------------------------
// Currently a no-op: the whole body is compiled out with #if 0.
// The disabled code converts each entry of gFileLines to UTF-8 with an ICU
// converter; gFileLines and gNumFileLines are not declared in this file as
// shown, so the code cannot be enabled as it stands.
void UnixConvert() {
#if 0
    int line;

    UConverter *cvrtr;    // An ICU code page converter.
    UErrorCode status = U_ZERO_ERROR;

    cvrtr = ucnv_open("utf-8", &status);    // we are just doing UTF-8 locales for now.
    if (U_FAILURE(status)) {
        // Print the status value itself, not its address.
        fprintf(stderr, "ICU Converter open failed.: %d\n", status);
        exit(-1);
    }
    // redo for unix
    for (line=0; line < gNumFileLines; line++) {
        // First call with no target buffer just measures the space needed.
        int sizeNeeded = ucnv_fromUChars(cvrtr,
                                         0,            // ptr to target buffer.
                                         0,            // length of target buffer.
                                         gFileLines[line].name,
                                         -1,           // source is null terminated
                                         &status);
        if (status != U_BUFFER_OVERFLOW_ERROR && status != U_ZERO_ERROR) {
            fprintf(stderr, "Conversion from Unicode, something is wrong.\n");
            exit(-1);
        }
        status = U_ZERO_ERROR;
        gFileLines[line].unixName = new char[sizeNeeded+1];
        sizeNeeded = ucnv_fromUChars(cvrtr,
                                     gFileLines[line].unixName, // ptr to target buffer.
                                     sizeNeeded+1,              // length of target buffer.
                                     gFileLines[line].name,
                                     -1,                        // source is null terminated
                                     &status);
        if (U_FAILURE(status)) {
            fprintf(stderr, "ICU Conversion Failed.: %d\n", status);
            exit(-1);
        }
        gFileLines[line].unixName[sizeNeeded] = 0;
    }
    ucnv_close(cvrtr);
#endif
}
465
466
467	//----------------------------------------------------------------------------------------
468	//
469	// class UCharFile Class to hide all the gorp to read a file in
470	// and produce a stream of UChars.
471	//
472	//----------------------------------------------------------------------------------------
473	class UCharFile {
474	public:
475	UCharFile(const char *fileName);
476	~UCharFile();
477	UChar get();
478	UBool eof() {return fEof;};
479	UBool error() {return fError;};
480	int32_t size() { return fFileSize; };
481
482	private:
483	UCharFile (const UCharFile &other) {}; // No copy constructor.
484	UCharFile & operator = (const UCharFile &other) {return *this;}; // No assignment op
485
486	FILE *fFile;
487	const char *fName;
488	UBool fEof;
489	UBool fError;
490	UChar fPending2ndSurrogate;
491	int32_t fFileSize;
492
493	enum {UTF16LE, UTF16BE, UTF8} fEncoding;
494	};
495
496	UCharFile::UCharFile(const char * fileName) {
497	fEof = FALSE;
498	fError = FALSE;
499	fName = fileName;
500	struct stat buf;
501	int32_t result = stat(fileName, &buf);
502	if(result != 0) {
503	fprintf(stderr, "Error getting info\n");
504	fFileSize = -1;
505	} else {
506	fFileSize = buf.st_size;
507	}
508	fFile = fopen(fName, "rb");
509	fPending2ndSurrogate = 0;
510	if (fFile == NULL) {
511	fprintf(stderr, "Can not open file \"%s\"\n", opt_fName);
512	fError = TRUE;
513	return;
514	}
515	//
516	// Look for the byte order mark at the start of the file.
517	//
518	int BOMC1, BOMC2, BOMC3;
519	BOMC1 = fgetc(fFile);
520	BOMC2 = fgetc(fFile);
521
522	if (BOMC1 == 0xff && BOMC2 == 0xfe) {
523	fEncoding = UTF16LE; }
524	else if (BOMC1 == 0xfe && BOMC2 == 0xff) {
525	fEncoding = UTF16BE; }
526	else if (BOMC1 == 0xEF && BOMC2 == 0xBB && (BOMC3 = fgetc(fFile)) == 0xBF ) {
527	fEncoding = UTF8; }
528	else
529	{
530	fprintf(stderr, "collperf: file \"%s\" encoding must be UTF-8 or UTF-16, and "
531	"must include a BOM.\n", fileName);
532	fError = true;
533	return;
534	}
535	}
536
537
538	UCharFile::~UCharFile() {
539	fclose(fFile);
540	}
541
542
543
544	UChar UCharFile::get() {
545	UChar c;
546	switch (fEncoding) {
547	case UTF16LE:
548	{
549	int cL, cH;
550	cL = fgetc(fFile);
551	cH = fgetc(fFile);
552	c = cL \| (cH << 8);
553	if (cH == EOF) {
554	c = 0;
555	fEof = TRUE;
556	}
557	break;
558	}
559	case UTF16BE:
560	{
561	int cL, cH;
562	cH = fgetc(fFile);
563	cL = fgetc(fFile);
564	c = cL \| (cH << 8);
565	if (cL == EOF) {
566	c = 0;
567	fEof = TRUE;
568	}
569	break;
570	}
571	case UTF8:
572	{
573	if (fPending2ndSurrogate != 0) {
574	c = fPending2ndSurrogate;
575	fPending2ndSurrogate = 0;
576	break;
577	}
578
579	int ch = fgetc(fFile); // Note: c and ch are separate cause eof test doesn't work on UChar type.
580	if (ch == EOF) {
581	c = 0;
582	fEof = TRUE;
583	break;
584	}
585
586	if (ch <= 0x7f) {
587	// It's ascii. No further utf-8 conversion.
588	c = ch;
589	break;
590	}
591
592	// Figure out the lenght of the char and read the rest of the bytes
593	// into a temp array.
594	int nBytes;
595	if (ch >= 0xF0) {nBytes=4;}
596	else if (ch >= 0xE0) {nBytes=3;}
597	else if (ch >= 0xC0) {nBytes=2;}
598	else {
599	fprintf(stderr, "not likely utf-8 encoded file %s contains corrupt data at offset %d.\n", fName, ftell(fFile));
600	fError = TRUE;
601	return 0;
602	}
603
604	unsigned char bytes[10];
605	bytes[0] = (unsigned char)ch;
606	int i;
607	for (i=1; i<nBytes; i++) {
608	bytes[i] = fgetc(fFile);
609	if (bytes[i] < 0x80 \|\| bytes[i] >= 0xc0) {
610	fprintf(stderr, "utf-8 encoded file %s contains corrupt data at offset %d. Expected %d bytes, byte %d is invalid. First byte is %02X\n", fName, ftell(fFile), nBytes, i, ch);
611	fError = TRUE;
612	return 0;
613	}
614	}
615
616	// Convert the bytes from the temp array to a Unicode char.
617	i = 0;
618	uint32_t cp;
51004dcb	619	U8_NEXT_UNSAFE(bytes, i, cp);
b75a7d8f A	620	c = (UChar)cp;
	621
	622	if (cp >= 0x10000) {
	623	// The code point needs to be broken up into a utf-16 surrogate pair.
	624	// Process first half this time through the main loop, and
	625	// remember the other half for the next time through.
	626	UChar utf16Buf[3];
	627	i = 0;
	628	UTF16_APPEND_CHAR_UNSAFE(utf16Buf, i, cp);
	629	fPending2ndSurrogate = utf16Buf[1];
	630	c = utf16Buf[0];
	631	}
	632	break;
	633	};
	634	}
	635	return c;
	636	}
	637
	638
	639	//----------------------------------------------------------------------------------------
	640	//
	641	// Main -- process command line, read in and pre-process the test file,
	642	// call other functions to do the actual tests.
	643	//
	644	//----------------------------------------------------------------------------------------
	645	int main(int argc, const char** argv) {
	646	if (ProcessOptions(argc, argv, opts) != TRUE \|\| opt_help \|\| opt_fName == 0) {
	647	printf(gUsageString);
	648	exit (1);
	649	}
	650	// Make sure that we've only got one API selected.
	651	if (opt_mac \|\| opt_unix \|\| opt_win) opt_icu = FALSE;
	652	if (opt_mac \|\| opt_unix) opt_win = FALSE;
	653	if (opt_mac) opt_unix = FALSE;
	654
	655	UErrorCode status = U_ZERO_ERROR;
	656
	657
	658
	659	//
	660	// Set up a Windows LCID
	661	//
	662	/*
	663	if (opt_langid != 0) {
	664	gWinLCID = MAKELCID(opt_langid, SORT_DEFAULT);
	665	}
	666	else {
	667	gWinLCID = uloc_getLCID(opt_locale);
	668	}
	669	*/
	670
	671	//
	672	// Set the UNIX locale
	673	//
	674	if (opt_unix) {
	675	if (setlocale(LC_ALL, opt_locale) == 0) {
	676	fprintf(stderr, "setlocale(LC_ALL, %s) failed.\n", opt_locale);
	677	exit(-1);
	678	}
	679	}
	680
	681	// Read in the input file.
	682	// File assumed to be utf-16.
	683	// Lines go onto heap buffers. Global index array to line starts is created.
684	// Lines themselves are null terminated.
685	//
686
687	UCharFile f(opt_fName);
688	if (f.error()) {
689	exit(-1);
690	}
691	int32_t fileSize = f.size();
692	const int STARTSIZE = 70000;
693	int32_t bufSize = 0;
694	int32_t charCount = 0;
695	if(fileSize != -1) {
696	text = (UChar )malloc(fileSizesizeof(UChar));
697	bufSize = fileSize;
698	} else {
699	text = (UChar )malloc(STARTSIZEsizeof(UChar));
700	bufSize = STARTSIZE;
701	}
702	if(text == NULL) {
703	fprintf(stderr, "Allocating buffer failed\n");
704	exit(-1);
705	}
706
707
708	// Read the file, split into lines, and save in memory.
709	// Loop runs once per utf-16 value from the input file,
710	// (The number of bytes read from file per loop iteration depends on external encoding.)
711	for (;;) {
712
713	UChar c = f.get();
714	if(f.eof()) {
715	break;
716	}
717	if (f.error()){
718	exit(-1);
719	}
720	// We now have a good UTF-16 value in c.
721	text[charCount++] = c;
722	if(charCount == bufSize) {
723	text = (UChar )realloc(text, 2bufSize*sizeof(UChar));
724	if(text == NULL) {
725	fprintf(stderr, "Reallocating buffer failed\n");
726	exit(-1);
727	}
728	bufSize *= 2;
729	}
730	}
731
732
733	if (opt_terse == FALSE) {
734	printf("file \"%s\", %d charCount code units.\n", opt_fName, charCount);
735	}
736
737	textSize = charCount;
738
739
740
741
742	//
743	// Dump file contents if requested.
744	//
745	if (opt_dump) {
746	// dump file, etc... possibly
747	}
748
749
750	//
751	// We've got the file read into memory. Go do something with it.
752	//
753	int32_t i = 0;
754	for(i = 0; i < opt_passesCount; i++) {
755	if(opt_loopCount != 0) {
756	if(opt_next) {
757	doForwardTest();
758	} else if(opt_isBound) {
759	doIsBoundTest();
760	} else {
761	doForwardTest();
762	}
763	} else if(opt_time != 0) {
764
765	}
766	}
767
768	if(text != NULL) {
769	free(text);
770	}
771	if(brkit != NULL) {
772	delete brkit;
773	}
774
775	return 0;
776	}