]>
Commit | Line | Data |
---|---|---|
1 | /******************************************************************** | |
2 | * COPYRIGHT: | |
3 | * Copyright (C) 2001-2012 IBM, Inc. All Rights Reserved. | |
4 | * | |
5 | ********************************************************************/ | |
6 | /******************************************************************************** | |
7 | * | |
8 | * File ubrkperf.cpp | |
9 | * | |
10 | * Modification History: | |
11 | * Name Description | |
12 | * Vladimir Weinstein First Version, based on collperf | |
13 | * | |
14 | ********************************************************************************* | |
15 | */ | |
16 | ||
17 | // | |
18 | // This program tests break iterator performance | |
19 | // Currently we test only ICU APIs with the future possibility of testing *nix & win32 APIs | |
20 | // (if any) | |
21 | // A text file is required as input. It must be in utf-8 or utf-16 format, | |
22 | // and include a byte order mark. Either LE or BE format is OK. | |
23 | // | |
24 | ||
// Help text printed by main() when -help/-? is given, when option parsing
// fails, or when no -file argument was supplied.  Every option listed here
// must have a matching entry in the opts[] table; "-passes" is listed because
// the table accepts it, and "-iloop" is not because the table rejects it.
const char gUsageString[] =
    "usage: ubrkperf options...\n"
    "-help Display this message.\n"
    "-file file_name utf-16/utf-8 format file.\n"
    "-locale name ICU locale to use. Default is en_US\n"
    "-langid 0x1234 Windows Language ID number. Default to value for -locale option\n"
    " see http://msdn.microsoft.com/library/psdk/winbase/nls_8xo3.htm\n"
    "-win Run test using Windows native services. (currently not working) (ICU is default)\n"
    "-unix Run test using Unix word breaking services. (currently not working) \n"
    "-mac Run test using MacOSX word breaking services.\n"
    "-uselen Use API with string lengths. Default is null-terminated strings\n"
    "-char Use character break iterator\n"
    "-word Use word break iterator\n"
    "-line Use line break iterator\n"
    "-sentence Use sentence break iterator\n"
    "-loop nnnn Loopcount for test. Adjust for reasonable total running time.\n"
    "-passes n Number of times the selected test is run. Default = 1.\n"
    "-terse Terse numbers-only output. Intended for use by scripts.\n"
    "-dump Display stuff.\n"
    "-capi Use C APIs instead of C++ APIs (currently not working)\n"
    "-next Do the next test\n"
    "-isBound Do the isBound test\n"
    ;
49 | ||
50 | ||
51 | #include <stdio.h> | |
52 | #include <string.h> | |
53 | #include <stdlib.h> | |
54 | #include <math.h> | |
55 | #include <locale.h> | |
56 | #include <errno.h> | |
57 | #include <sys/stat.h> | |
58 | ||
59 | #include <unicode/utypes.h> | |
60 | #include <unicode/ucol.h> | |
61 | #include <unicode/ucoleitr.h> | |
62 | #include <unicode/uloc.h> | |
63 | #include <unicode/ustring.h> | |
64 | #include <unicode/ures.h> | |
65 | #include <unicode/uchar.h> | |
66 | #include <unicode/ucnv.h> | |
67 | #include <unicode/utf8.h> | |
68 | ||
69 | #include <unicode/brkiter.h> | |
70 | ||
71 | ||
72 | #if U_PLATFORM_HAS_WIN32_API | |
73 | #include <windows.h> | |
74 | #else | |
75 | // | |
76 | // Stubs for Windows API functions when building on UNIXes. | |
77 | // | |
78 | #include <sys/time.h> | |
// Stand-in for the Win32 timeGetTime(): returns the current wall-clock time
// from gettimeofday() expressed in milliseconds.  Callers only ever subtract
// two readings, so the value is allowed to wrap; the arithmetic is carried
// out in unsigned long so that the wrap-around is well defined instead of
// being a signed overflow.
unsigned long timeGetTime() {
    struct timeval t;
    gettimeofday(&t, 0);
    unsigned long val = (unsigned long)t.tv_sec * 1000UL;
    val += (unsigned long)t.tv_usec / 1000UL;
    return val;
}
86 | #define MAKELCID(a,b) 0 | |
87 | #endif | |
88 | ||
89 | ||
90 | // | |
91 | // Command line option variables | |
92 | // These global variables are set according to the options specified | |
93 | // on the command line by the user. | |
94 | char * opt_fName = 0; | |
95 | char * opt_locale = "en_US"; | |
96 | int opt_langid = 0; // Defaults to value corresponding to opt_locale. | |
97 | char * opt_rules = 0; | |
98 | UBool opt_help = FALSE; | |
99 | int opt_time = 0; | |
100 | int opt_loopCount = 0; | |
101 | int opt_passesCount= 1; | |
102 | UBool opt_terse = FALSE; | |
103 | UBool opt_icu = TRUE; | |
104 | UBool opt_win = FALSE; // Run with Windows native functions. | |
105 | UBool opt_unix = FALSE; // Run with UNIX strcoll, strxfrm functions. | |
106 | UBool opt_mac = FALSE; // Run with MacOSX word break services. | |
107 | UBool opt_uselen = FALSE; | |
108 | UBool opt_dump = FALSE; | |
109 | UBool opt_char = FALSE; | |
110 | UBool opt_word = FALSE; | |
111 | UBool opt_line = FALSE; | |
112 | UBool opt_sentence = FALSE; | |
113 | UBool opt_capi = FALSE; | |
114 | ||
115 | UBool opt_next = FALSE; | |
116 | UBool opt_isBound = FALSE; | |
117 | ||
118 | ||
119 | ||
120 | // | |
121 | // Definitions for the command line options | |
122 | // | |
// One row of the command line option table.
//   name  - the option exactly as typed on the command line, e.g. "-loop"
//   type  - FLAG takes no value; NUM and STRING consume the next argument
//   pVar  - address of the global that receives the value; ProcessOptions()
//           writes through it as UBool (FLAG), int (NUM) or a char pointer
//           (STRING)
struct OptSpec {
    enum Kind {FLAG, NUM, STRING};

    const char *name;
    Kind        type;
    void       *pVar;
};
128 | ||
129 | OptSpec opts[] = { | |
130 | {"-file", OptSpec::STRING, &opt_fName}, | |
131 | {"-locale", OptSpec::STRING, &opt_locale}, | |
132 | {"-langid", OptSpec::NUM, &opt_langid}, | |
133 | {"-win", OptSpec::FLAG, &opt_win}, | |
134 | {"-unix", OptSpec::FLAG, &opt_unix}, | |
135 | {"-mac", OptSpec::FLAG, &opt_mac}, | |
136 | {"-uselen", OptSpec::FLAG, &opt_uselen}, | |
137 | {"-loop", OptSpec::NUM, &opt_loopCount}, | |
138 | {"-time", OptSpec::NUM, &opt_time}, | |
139 | {"-passes", OptSpec::NUM, &opt_passesCount}, | |
140 | {"-char", OptSpec::FLAG, &opt_char}, | |
141 | {"-word", OptSpec::FLAG, &opt_word}, | |
142 | {"-line", OptSpec::FLAG, &opt_line}, | |
143 | {"-sentence", OptSpec::FLAG, &opt_sentence}, | |
144 | {"-terse", OptSpec::FLAG, &opt_terse}, | |
145 | {"-dump", OptSpec::FLAG, &opt_dump}, | |
146 | {"-capi", OptSpec::FLAG, &opt_capi}, | |
147 | {"-next", OptSpec::FLAG, &opt_next}, | |
148 | {"-isBound", OptSpec::FLAG, &opt_isBound}, | |
149 | {"-help", OptSpec::FLAG, &opt_help}, | |
150 | {"-?", OptSpec::FLAG, &opt_help}, | |
151 | {0, OptSpec::FLAG, 0} | |
152 | }; | |
153 | ||
154 | ||
155 | //--------------------------------------------------------------------------- | |
156 | // | |
157 | // Global variables pointing to and describing the test file | |
158 | // | |
159 | //--------------------------------------------------------------------------- | |
160 | ||
161 | //DWORD gWinLCID; | |
162 | BreakIterator *brkit = NULL; | |
163 | UChar *text = NULL; | |
164 | int32_t textSize = 0; | |
165 | ||
166 | ||
167 | ||
168 | #if U_PLATFORM_IS_DARWIN_BASED | |
169 | #include <ApplicationServices/ApplicationServices.h> | |
170 | enum{ | |
171 | kUCTextBreakAllMask = (kUCTextBreakClusterMask | kUCTextBreakWordMask | kUCTextBreakLineMask) | |
172 | }; | |
173 | UCTextBreakType breakTypes[4] = {kUCTextBreakCharMask, kUCTextBreakClusterMask, kUCTextBreakWordMask, kUCTextBreakLineMask}; | |
174 | TextBreakLocatorRef breakRef; | |
175 | UCTextBreakType macBreakType; | |
176 | ||
177 | void createMACBrkIt() { | |
178 | OSStatus status = noErr; | |
179 | LocaleRef lref; | |
180 | status = LocaleRefFromLocaleString(opt_locale, &lref); | |
181 | status = UCCreateTextBreakLocator(lref, 0, kUCTextBreakAllMask, (TextBreakLocatorRef*)&breakRef); | |
182 | if(opt_char == TRUE) { | |
183 | macBreakType = kUCTextBreakClusterMask; | |
184 | } else if(opt_word == TRUE) { | |
185 | macBreakType = kUCTextBreakWordMask; | |
186 | } else if(opt_line == TRUE) { | |
187 | macBreakType = kUCTextBreakLineMask; | |
188 | } else if(opt_sentence == TRUE) { | |
189 | // error | |
190 | // brkit = BreakIterator::createSentenceInstance(opt_locale, status); | |
191 | } else { | |
192 | // default is character iterator | |
193 | macBreakType = kUCTextBreakClusterMask; | |
194 | } | |
195 | } | |
196 | #endif | |
197 | ||
198 | void createICUBrkIt() { | |
199 | // | |
200 | // Set up an ICU break iterator | |
201 | // | |
202 | UErrorCode status = U_ZERO_ERROR; | |
203 | if(opt_char == TRUE) { | |
204 | brkit = BreakIterator::createCharacterInstance(opt_locale, status); | |
205 | } else if(opt_word == TRUE) { | |
206 | brkit = BreakIterator::createWordInstance(opt_locale, status); | |
207 | } else if(opt_line == TRUE) { | |
208 | brkit = BreakIterator::createLineInstance(opt_locale, status); | |
209 | } else if(opt_sentence == TRUE) { | |
210 | brkit = BreakIterator::createSentenceInstance(opt_locale, status); | |
211 | } else { | |
212 | // default is character iterator | |
213 | brkit = BreakIterator::createCharacterInstance(opt_locale, status); | |
214 | } | |
215 | if (status==U_USING_DEFAULT_WARNING && opt_terse==FALSE) { | |
216 | fprintf(stderr, "Warning, U_USING_DEFAULT_WARNING for %s\n", opt_locale); | |
217 | } | |
218 | if (status==U_USING_FALLBACK_WARNING && opt_terse==FALSE) { | |
219 | fprintf(stderr, "Warning, U_USING_FALLBACK_ERROR for %s\n", opt_locale); | |
220 | } | |
221 | ||
222 | } | |
223 | ||
224 | //--------------------------------------------------------------------------- | |
225 | // | |
226 | // ProcessOptions() Function to read the command line options. | |
227 | // | |
228 | //--------------------------------------------------------------------------- | |
229 | UBool ProcessOptions(int argc, const char **argv, OptSpec opts[]) | |
230 | { | |
231 | int i; | |
232 | int argNum; | |
233 | const char *pArgName; | |
234 | OptSpec *pOpt; | |
235 | ||
236 | for (argNum=1; argNum<argc; argNum++) { | |
237 | pArgName = argv[argNum]; | |
238 | for (pOpt = opts; pOpt->name != 0; pOpt++) { | |
239 | if (strcmp(pOpt->name, pArgName) == 0) { | |
240 | switch (pOpt->type) { | |
241 | case OptSpec::FLAG: | |
242 | *(UBool *)(pOpt->pVar) = TRUE; | |
243 | break; | |
244 | case OptSpec::STRING: | |
245 | argNum ++; | |
246 | if (argNum >= argc) { | |
247 | fprintf(stderr, "value expected for \"%s\" option.\n", pOpt->name); | |
248 | return FALSE; | |
249 | } | |
250 | *(const char **)(pOpt->pVar) = argv[argNum]; | |
251 | break; | |
252 | case OptSpec::NUM: | |
253 | argNum ++; | |
254 | if (argNum >= argc) { | |
255 | fprintf(stderr, "value expected for \"%s\" option.\n", pOpt->name); | |
256 | return FALSE; | |
257 | } | |
258 | char *endp; | |
259 | i = strtol(argv[argNum], &endp, 0); | |
260 | if (endp == argv[argNum]) { | |
261 | fprintf(stderr, "integer value expected for \"%s\" option.\n", pOpt->name); | |
262 | return FALSE; | |
263 | } | |
264 | *(int *)(pOpt->pVar) = i; | |
265 | } | |
266 | break; | |
267 | } | |
268 | } | |
269 | if (pOpt->name == 0) | |
270 | { | |
271 | fprintf(stderr, "Unrecognized option \"%s\"\n", pArgName); | |
272 | return FALSE; | |
273 | } | |
274 | } | |
275 | return TRUE; | |
276 | } | |
277 | ||
278 | ||
279 | void doForwardTest() { | |
280 | if (opt_terse == FALSE) { | |
281 | printf("Doing the forward test\n"); | |
282 | } | |
283 | int32_t noBreaks = 0; | |
284 | int32_t i = 0; | |
285 | unsigned long startTime = timeGetTime(); | |
286 | unsigned long elapsedTime = 0; | |
287 | if(opt_icu) { | |
288 | createICUBrkIt(); | |
289 | brkit->setText(UnicodeString(text, textSize)); | |
290 | brkit->first(); | |
291 | if (opt_terse == FALSE) { | |
292 | printf("Warmup\n"); | |
293 | } | |
294 | int j; | |
295 | while((j = brkit->next()) != BreakIterator::DONE) { | |
296 | noBreaks++; | |
297 | //fprintf(stderr, "%d ", j); | |
298 | } | |
299 | ||
300 | if (opt_terse == FALSE) { | |
301 | printf("Measure\n"); | |
302 | } | |
303 | startTime = timeGetTime(); | |
304 | for(i = 0; i < opt_loopCount; i++) { | |
305 | brkit->first(); | |
306 | while(brkit->next() != BreakIterator::DONE) { | |
307 | } | |
308 | } | |
309 | ||
310 | elapsedTime = timeGetTime()-startTime; | |
311 | } else if(opt_mac) { | |
312 | #if U_PLATFORM_IS_DARWIN_BASED | |
313 | createMACBrkIt(); | |
314 | UniChar* filePtr = text; | |
315 | OSStatus status = noErr; | |
316 | UniCharCount startOffset = 0, breakOffset = 0, numUniChars = textSize; | |
317 | startOffset = 0; | |
318 | //printf("\t---Search forward--\n"); | |
319 | ||
320 | while (startOffset < numUniChars) | |
321 | { | |
322 | status = UCFindTextBreak(breakRef, macBreakType, kUCTextBreakLeadingEdgeMask, filePtr, numUniChars, | |
323 | startOffset, &breakOffset); | |
324 | //require_action(status == noErr, EXIT, printf( "**UCFindTextBreak failed: startOffset %d, status %d\n", (int)startOffset, (int)status)); | |
325 | //require_action((breakOffset <= numUniChars),EXIT, printf("**UCFindTextBreak breakOffset too big: startOffset %d, breakOffset %d\n", (int)startOffset, (int)breakOffset)); | |
326 | ||
327 | // Output break | |
328 | //printf("\t%d\n", (int)breakOffset); | |
329 | ||
330 | // Increment counters | |
331 | noBreaks++; | |
332 | startOffset = breakOffset; | |
333 | } | |
334 | startTime = timeGetTime(); | |
335 | for(i = 0; i < opt_loopCount; i++) { | |
336 | startOffset = 0; | |
337 | ||
338 | while (startOffset < numUniChars) | |
339 | { | |
340 | status = UCFindTextBreak(breakRef, macBreakType, kUCTextBreakLeadingEdgeMask, filePtr, numUniChars, | |
341 | startOffset, &breakOffset); | |
342 | // Increment counters | |
343 | startOffset = breakOffset; | |
344 | } | |
345 | } | |
346 | elapsedTime = timeGetTime()-startTime; | |
347 | UCDisposeTextBreakLocator(&breakRef); | |
348 | #endif | |
349 | ||
350 | ||
351 | } | |
352 | ||
353 | ||
354 | if (opt_terse == FALSE) { | |
355 | int32_t loopTime = (int)(float(1000) * ((float)elapsedTime/(float)opt_loopCount)); | |
356 | int32_t timePerCU = (int)(float(1000) * ((float)loopTime/(float)textSize)); | |
357 | int32_t timePerBreak = (int)(float(1000) * ((float)loopTime/(float)noBreaks)); | |
358 | printf("forward break iteration average loop time %d\n", loopTime); | |
359 | printf("number of code units %d average time per code unit %d\n", textSize, timePerCU); | |
360 | printf("number of breaks %d average time per break %d\n", noBreaks, timePerBreak); | |
361 | } else { | |
362 | printf("time=%d\nevents=%d\nsize=%d\n", elapsedTime, noBreaks, textSize); | |
363 | } | |
364 | ||
365 | ||
366 | } | |
367 | ||
368 | void doIsBoundTest() { | |
369 | int32_t noBreaks = 0, hit = 0; | |
370 | int32_t i = 0, j = 0; | |
371 | unsigned long startTime = timeGetTime(); | |
372 | unsigned long elapsedTime = 0; | |
373 | createICUBrkIt(); | |
374 | brkit->setText(UnicodeString(text, textSize)); | |
375 | brkit->first(); | |
376 | for(j = 0; j < textSize; j++) { | |
377 | if(brkit->isBoundary(j)) { | |
378 | noBreaks++; | |
379 | //fprintf(stderr, "%d ", j); | |
380 | } | |
381 | } | |
382 | /* | |
383 | while(brkit->next() != BreakIterator::DONE) { | |
384 | noBreaks++; | |
385 | } | |
386 | */ | |
387 | ||
388 | startTime = timeGetTime(); | |
389 | for(i = 0; i < opt_loopCount; i++) { | |
390 | for(j = 0; j < textSize; j++) { | |
391 | if(brkit->isBoundary(j)) { | |
392 | hit++; | |
393 | } | |
394 | } | |
395 | } | |
396 | ||
397 | elapsedTime = timeGetTime()-startTime; | |
398 | int32_t loopTime = (int)(float(1000) * ((float)elapsedTime/(float)opt_loopCount)); | |
399 | if (opt_terse == FALSE) { | |
400 | int32_t timePerCU = (int)(float(1000) * ((float)loopTime/(float)textSize)); | |
401 | int32_t timePerBreak = (int)(float(1000) * ((float)loopTime/(float)noBreaks)); | |
402 | printf("forward break iteration average loop time %d\n", loopTime); | |
403 | printf("number of code units %d average time per code unit %d\n", textSize, timePerCU); | |
404 | printf("number of breaks %d average time per break %d\n", noBreaks, timePerBreak); | |
405 | } else { | |
406 | printf("time=%d\nevents=%d\nsize=%d\n", elapsedTime, noBreaks, textSize); | |
407 | } | |
408 | } | |
409 | ||
410 | //---------------------------------------------------------------------------------------- | |
411 | // | |
412 | // UnixConvert -- Convert the lines of the file to the encoding for UNIX | |
413 | // Since it appears that Unicode support is going in the general | |
414 | // direction of the use of UTF-8 locales, that is the approach | |
415 | // that is used here. | |
416 | // | |
417 | //---------------------------------------------------------------------------------------- | |
void UnixConvert() {
    // Intentionally empty.  The previous body was compiled out with #if 0
    // and referred to per-line globals (gNumFileLines, gFileLines) that are
    // not defined in this file, so it could not have been re-enabled as
    // written.  The function is kept so that any existing caller still
    // links; a UNIX back end would convert `text` to the locale encoding
    // here.
}
460 | ||
461 | ||
462 | //---------------------------------------------------------------------------------------- | |
463 | // | |
464 | // class UCharFile Class to hide all the gorp to read a file in | |
465 | // and produce a stream of UChars. | |
466 | // | |
467 | //---------------------------------------------------------------------------------------- | |
468 | class UCharFile { | |
469 | public: | |
470 | UCharFile(const char *fileName); | |
471 | ~UCharFile(); | |
472 | UChar get(); | |
473 | UBool eof() {return fEof;}; | |
474 | UBool error() {return fError;}; | |
475 | int32_t size() { return fFileSize; }; | |
476 | ||
477 | private: | |
478 | UCharFile (const UCharFile &other) {}; // No copy constructor. | |
479 | UCharFile & operator = (const UCharFile &other) {return *this;}; // No assignment op | |
480 | ||
481 | FILE *fFile; | |
482 | const char *fName; | |
483 | UBool fEof; | |
484 | UBool fError; | |
485 | UChar fPending2ndSurrogate; | |
486 | int32_t fFileSize; | |
487 | ||
488 | enum {UTF16LE, UTF16BE, UTF8} fEncoding; | |
489 | }; | |
490 | ||
491 | UCharFile::UCharFile(const char * fileName) { | |
492 | fEof = FALSE; | |
493 | fError = FALSE; | |
494 | fName = fileName; | |
495 | struct stat buf; | |
496 | int32_t result = stat(fileName, &buf); | |
497 | if(result != 0) { | |
498 | fprintf(stderr, "Error getting info\n"); | |
499 | fFileSize = -1; | |
500 | } else { | |
501 | fFileSize = buf.st_size; | |
502 | } | |
503 | fFile = fopen(fName, "rb"); | |
504 | fPending2ndSurrogate = 0; | |
505 | if (fFile == NULL) { | |
506 | fprintf(stderr, "Can not open file \"%s\"\n", opt_fName); | |
507 | fError = TRUE; | |
508 | return; | |
509 | } | |
510 | // | |
511 | // Look for the byte order mark at the start of the file. | |
512 | // | |
513 | int BOMC1, BOMC2, BOMC3; | |
514 | BOMC1 = fgetc(fFile); | |
515 | BOMC2 = fgetc(fFile); | |
516 | ||
517 | if (BOMC1 == 0xff && BOMC2 == 0xfe) { | |
518 | fEncoding = UTF16LE; } | |
519 | else if (BOMC1 == 0xfe && BOMC2 == 0xff) { | |
520 | fEncoding = UTF16BE; } | |
521 | else if (BOMC1 == 0xEF && BOMC2 == 0xBB && (BOMC3 = fgetc(fFile)) == 0xBF ) { | |
522 | fEncoding = UTF8; } | |
523 | else | |
524 | { | |
525 | fprintf(stderr, "collperf: file \"%s\" encoding must be UTF-8 or UTF-16, and " | |
526 | "must include a BOM.\n", fileName); | |
527 | fError = true; | |
528 | return; | |
529 | } | |
530 | } | |
531 | ||
532 | ||
533 | UCharFile::~UCharFile() { | |
534 | fclose(fFile); | |
535 | } | |
536 | ||
537 | ||
538 | ||
539 | UChar UCharFile::get() { | |
540 | UChar c; | |
541 | switch (fEncoding) { | |
542 | case UTF16LE: | |
543 | { | |
544 | int cL, cH; | |
545 | cL = fgetc(fFile); | |
546 | cH = fgetc(fFile); | |
547 | c = cL | (cH << 8); | |
548 | if (cH == EOF) { | |
549 | c = 0; | |
550 | fEof = TRUE; | |
551 | } | |
552 | break; | |
553 | } | |
554 | case UTF16BE: | |
555 | { | |
556 | int cL, cH; | |
557 | cH = fgetc(fFile); | |
558 | cL = fgetc(fFile); | |
559 | c = cL | (cH << 8); | |
560 | if (cL == EOF) { | |
561 | c = 0; | |
562 | fEof = TRUE; | |
563 | } | |
564 | break; | |
565 | } | |
566 | case UTF8: | |
567 | { | |
568 | if (fPending2ndSurrogate != 0) { | |
569 | c = fPending2ndSurrogate; | |
570 | fPending2ndSurrogate = 0; | |
571 | break; | |
572 | } | |
573 | ||
574 | int ch = fgetc(fFile); // Note: c and ch are separate cause eof test doesn't work on UChar type. | |
575 | if (ch == EOF) { | |
576 | c = 0; | |
577 | fEof = TRUE; | |
578 | break; | |
579 | } | |
580 | ||
581 | if (ch <= 0x7f) { | |
582 | // It's ascii. No further utf-8 conversion. | |
583 | c = ch; | |
584 | break; | |
585 | } | |
586 | ||
587 | // Figure out the lenght of the char and read the rest of the bytes | |
588 | // into a temp array. | |
589 | int nBytes; | |
590 | if (ch >= 0xF0) {nBytes=4;} | |
591 | else if (ch >= 0xE0) {nBytes=3;} | |
592 | else if (ch >= 0xC0) {nBytes=2;} | |
593 | else { | |
594 | fprintf(stderr, "not likely utf-8 encoded file %s contains corrupt data at offset %d.\n", fName, ftell(fFile)); | |
595 | fError = TRUE; | |
596 | return 0; | |
597 | } | |
598 | ||
599 | unsigned char bytes[10]; | |
600 | bytes[0] = (unsigned char)ch; | |
601 | int i; | |
602 | for (i=1; i<nBytes; i++) { | |
603 | bytes[i] = fgetc(fFile); | |
604 | if (bytes[i] < 0x80 || bytes[i] >= 0xc0) { | |
605 | fprintf(stderr, "utf-8 encoded file %s contains corrupt data at offset %d. Expected %d bytes, byte %d is invalid. First byte is %02X\n", fName, ftell(fFile), nBytes, i, ch); | |
606 | fError = TRUE; | |
607 | return 0; | |
608 | } | |
609 | } | |
610 | ||
611 | // Convert the bytes from the temp array to a Unicode char. | |
612 | i = 0; | |
613 | uint32_t cp; | |
614 | U8_NEXT_UNSAFE(bytes, i, cp); | |
615 | c = (UChar)cp; | |
616 | ||
617 | if (cp >= 0x10000) { | |
618 | // The code point needs to be broken up into a utf-16 surrogate pair. | |
619 | // Process first half this time through the main loop, and | |
620 | // remember the other half for the next time through. | |
621 | UChar utf16Buf[3]; | |
622 | i = 0; | |
623 | UTF16_APPEND_CHAR_UNSAFE(utf16Buf, i, cp); | |
624 | fPending2ndSurrogate = utf16Buf[1]; | |
625 | c = utf16Buf[0]; | |
626 | } | |
627 | break; | |
628 | }; | |
629 | } | |
630 | return c; | |
631 | } | |
632 | ||
633 | ||
634 | //---------------------------------------------------------------------------------------- | |
635 | // | |
636 | // Main -- process command line, read in and pre-process the test file, | |
637 | // call other functions to do the actual tests. | |
638 | // | |
639 | //---------------------------------------------------------------------------------------- | |
640 | int main(int argc, const char** argv) { | |
641 | if (ProcessOptions(argc, argv, opts) != TRUE || opt_help || opt_fName == 0) { | |
642 | printf(gUsageString); | |
643 | exit (1); | |
644 | } | |
645 | // Make sure that we've only got one API selected. | |
646 | if (opt_mac || opt_unix || opt_win) opt_icu = FALSE; | |
647 | if (opt_mac || opt_unix) opt_win = FALSE; | |
648 | if (opt_mac) opt_unix = FALSE; | |
649 | ||
650 | UErrorCode status = U_ZERO_ERROR; | |
651 | ||
652 | ||
653 | ||
654 | // | |
655 | // Set up a Windows LCID | |
656 | // | |
657 | /* | |
658 | if (opt_langid != 0) { | |
659 | gWinLCID = MAKELCID(opt_langid, SORT_DEFAULT); | |
660 | } | |
661 | else { | |
662 | gWinLCID = uloc_getLCID(opt_locale); | |
663 | } | |
664 | */ | |
665 | ||
666 | // | |
667 | // Set the UNIX locale | |
668 | // | |
669 | if (opt_unix) { | |
670 | if (setlocale(LC_ALL, opt_locale) == 0) { | |
671 | fprintf(stderr, "setlocale(LC_ALL, %s) failed.\n", opt_locale); | |
672 | exit(-1); | |
673 | } | |
674 | } | |
675 | ||
676 | // Read in the input file. | |
677 | // File assumed to be utf-16. | |
678 | // Lines go onto heap buffers. Global index array to line starts is created. | |
679 | // Lines themselves are null terminated. | |
680 | // | |
681 | ||
682 | UCharFile f(opt_fName); | |
683 | if (f.error()) { | |
684 | exit(-1); | |
685 | } | |
686 | int32_t fileSize = f.size(); | |
687 | const int STARTSIZE = 70000; | |
688 | int32_t bufSize = 0; | |
689 | int32_t charCount = 0; | |
690 | if(fileSize != -1) { | |
691 | text = (UChar *)malloc(fileSize*sizeof(UChar)); | |
692 | bufSize = fileSize; | |
693 | } else { | |
694 | text = (UChar *)malloc(STARTSIZE*sizeof(UChar)); | |
695 | bufSize = STARTSIZE; | |
696 | } | |
697 | if(text == NULL) { | |
698 | fprintf(stderr, "Allocating buffer failed\n"); | |
699 | exit(-1); | |
700 | } | |
701 | ||
702 | ||
703 | // Read the file, split into lines, and save in memory. | |
704 | // Loop runs once per utf-16 value from the input file, | |
705 | // (The number of bytes read from file per loop iteration depends on external encoding.) | |
706 | for (;;) { | |
707 | ||
708 | UChar c = f.get(); | |
709 | if(f.eof()) { | |
710 | break; | |
711 | } | |
712 | if (f.error()){ | |
713 | exit(-1); | |
714 | } | |
715 | // We now have a good UTF-16 value in c. | |
716 | text[charCount++] = c; | |
717 | if(charCount == bufSize) { | |
718 | text = (UChar *)realloc(text, 2*bufSize*sizeof(UChar)); | |
719 | if(text == NULL) { | |
720 | fprintf(stderr, "Reallocating buffer failed\n"); | |
721 | exit(-1); | |
722 | } | |
723 | bufSize *= 2; | |
724 | } | |
725 | } | |
726 | ||
727 | ||
728 | if (opt_terse == FALSE) { | |
729 | printf("file \"%s\", %d charCount code units.\n", opt_fName, charCount); | |
730 | } | |
731 | ||
732 | textSize = charCount; | |
733 | ||
734 | ||
735 | ||
736 | ||
737 | // | |
738 | // Dump file contents if requested. | |
739 | // | |
740 | if (opt_dump) { | |
741 | // dump file, etc... possibly | |
742 | } | |
743 | ||
744 | ||
745 | // | |
746 | // We've got the file read into memory. Go do something with it. | |
747 | // | |
748 | int32_t i = 0; | |
749 | for(i = 0; i < opt_passesCount; i++) { | |
750 | if(opt_loopCount != 0) { | |
751 | if(opt_next) { | |
752 | doForwardTest(); | |
753 | } else if(opt_isBound) { | |
754 | doIsBoundTest(); | |
755 | } else { | |
756 | doForwardTest(); | |
757 | } | |
758 | } else if(opt_time != 0) { | |
759 | ||
760 | } | |
761 | } | |
762 | ||
763 | if(text != NULL) { | |
764 | free(text); | |
765 | } | |
766 | if(brkit != NULL) { | |
767 | delete brkit; | |
768 | } | |
769 | ||
770 | return 0; | |
771 | } |