]>
Commit | Line | Data |
---|---|---|
f3c0d7a5 A |
1 | /*********************************************************************** |
2 | * © 2016 and later: Unicode, Inc. and others. | |
3 | * License & terms of use: http://www.unicode.org/copyright.html#License | |
4 | * | |
5 | *********************************************************************** | |
6 | *********************************************************************** | |
b75a7d8f | 7 | * COPYRIGHT: |
51004dcb | 8 | * Copyright (C) 2001-2012 IBM, Inc. All Rights Reserved. |
b75a7d8f | 9 | * |
f3c0d7a5 | 10 | ***********************************************************************/ |
b75a7d8f A |
11 | /******************************************************************************** |
12 | * | |
13 | * File ubrkperf.cpp | |
14 | * | |
15 | * Modification History: | |
16 | * Name Description | |
17 | * Vladimir Weinstein First Version, based on collperf | |
18 | * | |
19 | ********************************************************************************* | |
20 | */ | |
21 | ||
22 | // | |
23 | // This program tests break iterator performance | |
24 | // Currently we test only ICU APIs with the future possibility of testing *nix & win32 APIs | |
25 | // (if any) | |
26 | // A text file is required as input. It must be in utf-8 or utf-16 format, | |
27 | // and include a byte order mark. Either LE or BE format is OK. | |
28 | // | |
29 | ||
// Help text printed by main() when the options are invalid, -help is given,
// or no -file is supplied.
// Note: -iloop is described here but has no entry in the opts[] table, so it
// is currently rejected as an unrecognized option.
const char gUsageString[] =
    "usage: ubrkperf options...\n"
    "-help Display this message.\n"
    "-file file_name utf-16/utf-8 format file.\n"
    "-locale name ICU locale to use. Default is en_US\n"
    "-langid 0x1234 Windows Language ID number. Default to value for -locale option\n"
    " see http://msdn.microsoft.com/library/psdk/winbase/nls_8xo3.htm\n"
    "-win Run test using Windows native services. (currently not working) (ICU is default)\n"
    "-unix Run test using Unix word breaking services. (currently not working) \n"
    "-mac Run test using MacOSX word breaking services.\n"
    "-uselen Use API with string lengths. Default is null-terminated strings\n"
    "-char Use character break iterator\n"
    "-word Use word break iterator\n"
    "-line Use line break iterator\n"
    "-sentence Use sentence break iterator\n"
    "-loop nnnn Loopcount for test. Adjust for reasonable total running time.\n"
    "-passes n Number of times the whole test is repeated. Default = 1.\n"
    "-iloop n Inner Loop Count. Default = 1. Number of calls to function\n"
    " under test at each call point. For measuring test overhead.\n"
    "-terse Terse numbers-only output. Intended for use by scripts.\n"
    "-dump Display stuff.\n"
    "-capi Use C APIs instead of C++ APIs (currently not working)\n"
    "-next Do the next test\n"
    "-isBound Do the isBound test\n"
    ;
54 | ||
55 | ||
56 | #include <stdio.h> | |
57 | #include <string.h> | |
58 | #include <stdlib.h> | |
59 | #include <math.h> | |
60 | #include <locale.h> | |
61 | #include <errno.h> | |
62 | #include <sys/stat.h> | |
63 | ||
64 | #include <unicode/utypes.h> | |
65 | #include <unicode/ucol.h> | |
66 | #include <unicode/ucoleitr.h> | |
67 | #include <unicode/uloc.h> | |
68 | #include <unicode/ustring.h> | |
69 | #include <unicode/ures.h> | |
70 | #include <unicode/uchar.h> | |
71 | #include <unicode/ucnv.h> | |
72 | #include <unicode/utf8.h> | |
73 | ||
74 | #include <unicode/brkiter.h> | |
75 | ||
76 | ||
4388f060 | 77 | #if U_PLATFORM_HAS_WIN32_API |
b75a7d8f A |
78 | #include <windows.h> |
79 | #else | |
80 | // | |
81 | // Stubs for Windows API functions when building on UNIXes. | |
82 | // | |
83 | #include <sys/time.h> | |
// Stand-in for the Win32 timeGetTime() on non-Windows builds.
// Returns the current wall-clock time in milliseconds. The value is allowed
// to wrap around; callers only use differences between two readings.
unsigned long timeGetTime() {
    struct timeval now;
    gettimeofday(&now, NULL);
    // Convert to unsigned before multiplying so that wraparound is
    // well-defined instead of a signed overflow in time_t arithmetic.
    unsigned long millis = (unsigned long)now.tv_sec * 1000UL;
    millis += (unsigned long)now.tv_usec / 1000UL;
    return millis;
}
91 | #define MAKELCID(a,b) 0 | |
92 | #endif | |
93 | ||
94 | ||
95 | // | |
96 | // Command line option variables | |
97 | // These global variables are set according to the options specified | |
98 | // on the command line by the user. | |
99 | char * opt_fName = 0; | |
100 | char * opt_locale = "en_US"; | |
101 | int opt_langid = 0; // Defaults to value corresponding to opt_locale. | |
102 | char * opt_rules = 0; | |
103 | UBool opt_help = FALSE; | |
104 | int opt_time = 0; | |
105 | int opt_loopCount = 0; | |
106 | int opt_passesCount= 1; | |
107 | UBool opt_terse = FALSE; | |
108 | UBool opt_icu = TRUE; | |
109 | UBool opt_win = FALSE; // Run with Windows native functions. | |
110 | UBool opt_unix = FALSE; // Run with UNIX strcoll, strxfrm functions. | |
111 | UBool opt_mac = FALSE; // Run with MacOSX word break services. | |
112 | UBool opt_uselen = FALSE; | |
113 | UBool opt_dump = FALSE; | |
114 | UBool opt_char = FALSE; | |
115 | UBool opt_word = FALSE; | |
116 | UBool opt_line = FALSE; | |
117 | UBool opt_sentence = FALSE; | |
118 | UBool opt_capi = FALSE; | |
119 | ||
120 | UBool opt_next = FALSE; | |
121 | UBool opt_isBound = FALSE; | |
122 | ||
123 | ||
124 | ||
125 | // | |
126 | // Definitions for the command line options | |
127 | // | |
// One entry of the command line option table.
//   name  - option text as typed by the user, e.g. "-file"
//   type  - how the option is parsed (flag only, integer value, string value)
//   pVar  - address of the global variable that receives the value
struct OptSpec {
    enum Type {
        FLAG,
        NUM,
        STRING
    };

    const char *name;
    Type        type;
    void       *pVar;
};
133 | ||
134 | OptSpec opts[] = { | |
135 | {"-file", OptSpec::STRING, &opt_fName}, | |
136 | {"-locale", OptSpec::STRING, &opt_locale}, | |
137 | {"-langid", OptSpec::NUM, &opt_langid}, | |
138 | {"-win", OptSpec::FLAG, &opt_win}, | |
139 | {"-unix", OptSpec::FLAG, &opt_unix}, | |
140 | {"-mac", OptSpec::FLAG, &opt_mac}, | |
141 | {"-uselen", OptSpec::FLAG, &opt_uselen}, | |
142 | {"-loop", OptSpec::NUM, &opt_loopCount}, | |
143 | {"-time", OptSpec::NUM, &opt_time}, | |
144 | {"-passes", OptSpec::NUM, &opt_passesCount}, | |
145 | {"-char", OptSpec::FLAG, &opt_char}, | |
146 | {"-word", OptSpec::FLAG, &opt_word}, | |
147 | {"-line", OptSpec::FLAG, &opt_line}, | |
148 | {"-sentence", OptSpec::FLAG, &opt_sentence}, | |
149 | {"-terse", OptSpec::FLAG, &opt_terse}, | |
150 | {"-dump", OptSpec::FLAG, &opt_dump}, | |
151 | {"-capi", OptSpec::FLAG, &opt_capi}, | |
152 | {"-next", OptSpec::FLAG, &opt_next}, | |
153 | {"-isBound", OptSpec::FLAG, &opt_isBound}, | |
154 | {"-help", OptSpec::FLAG, &opt_help}, | |
155 | {"-?", OptSpec::FLAG, &opt_help}, | |
156 | {0, OptSpec::FLAG, 0} | |
157 | }; | |
158 | ||
159 | ||
160 | //--------------------------------------------------------------------------- | |
161 | // | |
162 | // Global variables pointing to and describing the test file | |
163 | // | |
164 | //--------------------------------------------------------------------------- | |
165 | ||
166 | //DWORD gWinLCID; | |
167 | BreakIterator *brkit = NULL; | |
168 | UChar *text = NULL; | |
169 | int32_t textSize = 0; | |
170 | ||
171 | ||
172 | ||
4388f060 | 173 | #if U_PLATFORM_IS_DARWIN_BASED |
b75a7d8f A |
174 | #include <ApplicationServices/ApplicationServices.h> |
175 | enum{ | |
176 | kUCTextBreakAllMask = (kUCTextBreakClusterMask | kUCTextBreakWordMask | kUCTextBreakLineMask) | |
177 | }; | |
178 | UCTextBreakType breakTypes[4] = {kUCTextBreakCharMask, kUCTextBreakClusterMask, kUCTextBreakWordMask, kUCTextBreakLineMask}; | |
179 | TextBreakLocatorRef breakRef; | |
180 | UCTextBreakType macBreakType; | |
181 | ||
182 | void createMACBrkIt() { | |
183 | OSStatus status = noErr; | |
184 | LocaleRef lref; | |
185 | status = LocaleRefFromLocaleString(opt_locale, &lref); | |
186 | status = UCCreateTextBreakLocator(lref, 0, kUCTextBreakAllMask, (TextBreakLocatorRef*)&breakRef); | |
187 | if(opt_char == TRUE) { | |
188 | macBreakType = kUCTextBreakClusterMask; | |
189 | } else if(opt_word == TRUE) { | |
190 | macBreakType = kUCTextBreakWordMask; | |
191 | } else if(opt_line == TRUE) { | |
192 | macBreakType = kUCTextBreakLineMask; | |
193 | } else if(opt_sentence == TRUE) { | |
194 | // error | |
195 | // brkit = BreakIterator::createSentenceInstance(opt_locale, status); | |
196 | } else { | |
197 | // default is character iterator | |
198 | macBreakType = kUCTextBreakClusterMask; | |
199 | } | |
200 | } | |
201 | #endif | |
202 | ||
203 | void createICUBrkIt() { | |
204 | // | |
205 | // Set up an ICU break iterator | |
206 | // | |
207 | UErrorCode status = U_ZERO_ERROR; | |
208 | if(opt_char == TRUE) { | |
209 | brkit = BreakIterator::createCharacterInstance(opt_locale, status); | |
210 | } else if(opt_word == TRUE) { | |
211 | brkit = BreakIterator::createWordInstance(opt_locale, status); | |
212 | } else if(opt_line == TRUE) { | |
213 | brkit = BreakIterator::createLineInstance(opt_locale, status); | |
214 | } else if(opt_sentence == TRUE) { | |
215 | brkit = BreakIterator::createSentenceInstance(opt_locale, status); | |
216 | } else { | |
217 | // default is character iterator | |
218 | brkit = BreakIterator::createCharacterInstance(opt_locale, status); | |
219 | } | |
220 | if (status==U_USING_DEFAULT_WARNING && opt_terse==FALSE) { | |
221 | fprintf(stderr, "Warning, U_USING_DEFAULT_WARNING for %s\n", opt_locale); | |
222 | } | |
223 | if (status==U_USING_FALLBACK_WARNING && opt_terse==FALSE) { | |
224 | fprintf(stderr, "Warning, U_USING_FALLBACK_ERROR for %s\n", opt_locale); | |
225 | } | |
226 | ||
227 | } | |
228 | ||
229 | //--------------------------------------------------------------------------- | |
230 | // | |
231 | // ProcessOptions() Function to read the command line options. | |
232 | // | |
233 | //--------------------------------------------------------------------------- | |
234 | UBool ProcessOptions(int argc, const char **argv, OptSpec opts[]) | |
235 | { | |
236 | int i; | |
237 | int argNum; | |
238 | const char *pArgName; | |
239 | OptSpec *pOpt; | |
240 | ||
241 | for (argNum=1; argNum<argc; argNum++) { | |
242 | pArgName = argv[argNum]; | |
243 | for (pOpt = opts; pOpt->name != 0; pOpt++) { | |
244 | if (strcmp(pOpt->name, pArgName) == 0) { | |
245 | switch (pOpt->type) { | |
246 | case OptSpec::FLAG: | |
247 | *(UBool *)(pOpt->pVar) = TRUE; | |
248 | break; | |
249 | case OptSpec::STRING: | |
250 | argNum ++; | |
251 | if (argNum >= argc) { | |
252 | fprintf(stderr, "value expected for \"%s\" option.\n", pOpt->name); | |
253 | return FALSE; | |
254 | } | |
255 | *(const char **)(pOpt->pVar) = argv[argNum]; | |
256 | break; | |
257 | case OptSpec::NUM: | |
258 | argNum ++; | |
259 | if (argNum >= argc) { | |
260 | fprintf(stderr, "value expected for \"%s\" option.\n", pOpt->name); | |
261 | return FALSE; | |
262 | } | |
263 | char *endp; | |
264 | i = strtol(argv[argNum], &endp, 0); | |
265 | if (endp == argv[argNum]) { | |
266 | fprintf(stderr, "integer value expected for \"%s\" option.\n", pOpt->name); | |
267 | return FALSE; | |
268 | } | |
269 | *(int *)(pOpt->pVar) = i; | |
270 | } | |
271 | break; | |
272 | } | |
273 | } | |
274 | if (pOpt->name == 0) | |
275 | { | |
276 | fprintf(stderr, "Unrecognized option \"%s\"\n", pArgName); | |
277 | return FALSE; | |
278 | } | |
279 | } | |
280 | return TRUE; | |
281 | } | |
282 | ||
283 | ||
284 | void doForwardTest() { | |
285 | if (opt_terse == FALSE) { | |
286 | printf("Doing the forward test\n"); | |
287 | } | |
288 | int32_t noBreaks = 0; | |
289 | int32_t i = 0; | |
290 | unsigned long startTime = timeGetTime(); | |
291 | unsigned long elapsedTime = 0; | |
292 | if(opt_icu) { | |
293 | createICUBrkIt(); | |
294 | brkit->setText(UnicodeString(text, textSize)); | |
295 | brkit->first(); | |
296 | if (opt_terse == FALSE) { | |
297 | printf("Warmup\n"); | |
298 | } | |
299 | int j; | |
300 | while((j = brkit->next()) != BreakIterator::DONE) { | |
301 | noBreaks++; | |
302 | //fprintf(stderr, "%d ", j); | |
303 | } | |
304 | ||
305 | if (opt_terse == FALSE) { | |
306 | printf("Measure\n"); | |
307 | } | |
308 | startTime = timeGetTime(); | |
309 | for(i = 0; i < opt_loopCount; i++) { | |
310 | brkit->first(); | |
311 | while(brkit->next() != BreakIterator::DONE) { | |
312 | } | |
313 | } | |
314 | ||
315 | elapsedTime = timeGetTime()-startTime; | |
316 | } else if(opt_mac) { | |
4388f060 | 317 | #if U_PLATFORM_IS_DARWIN_BASED |
b75a7d8f A |
318 | createMACBrkIt(); |
319 | UniChar* filePtr = text; | |
320 | OSStatus status = noErr; | |
321 | UniCharCount startOffset = 0, breakOffset = 0, numUniChars = textSize; | |
322 | startOffset = 0; | |
323 | //printf("\t---Search forward--\n"); | |
324 | ||
325 | while (startOffset < numUniChars) | |
326 | { | |
327 | status = UCFindTextBreak(breakRef, macBreakType, kUCTextBreakLeadingEdgeMask, filePtr, numUniChars, | |
328 | startOffset, &breakOffset); | |
329 | //require_action(status == noErr, EXIT, printf( "**UCFindTextBreak failed: startOffset %d, status %d\n", (int)startOffset, (int)status)); | |
330 | //require_action((breakOffset <= numUniChars),EXIT, printf("**UCFindTextBreak breakOffset too big: startOffset %d, breakOffset %d\n", (int)startOffset, (int)breakOffset)); | |
331 | ||
332 | // Output break | |
333 | //printf("\t%d\n", (int)breakOffset); | |
334 | ||
335 | // Increment counters | |
336 | noBreaks++; | |
337 | startOffset = breakOffset; | |
338 | } | |
339 | startTime = timeGetTime(); | |
340 | for(i = 0; i < opt_loopCount; i++) { | |
341 | startOffset = 0; | |
342 | ||
343 | while (startOffset < numUniChars) | |
344 | { | |
345 | status = UCFindTextBreak(breakRef, macBreakType, kUCTextBreakLeadingEdgeMask, filePtr, numUniChars, | |
346 | startOffset, &breakOffset); | |
347 | // Increment counters | |
348 | startOffset = breakOffset; | |
349 | } | |
350 | } | |
351 | elapsedTime = timeGetTime()-startTime; | |
352 | UCDisposeTextBreakLocator(&breakRef); | |
353 | #endif | |
354 | ||
355 | ||
356 | } | |
357 | ||
358 | ||
359 | if (opt_terse == FALSE) { | |
360 | int32_t loopTime = (int)(float(1000) * ((float)elapsedTime/(float)opt_loopCount)); | |
361 | int32_t timePerCU = (int)(float(1000) * ((float)loopTime/(float)textSize)); | |
362 | int32_t timePerBreak = (int)(float(1000) * ((float)loopTime/(float)noBreaks)); | |
363 | printf("forward break iteration average loop time %d\n", loopTime); | |
364 | printf("number of code units %d average time per code unit %d\n", textSize, timePerCU); | |
365 | printf("number of breaks %d average time per break %d\n", noBreaks, timePerBreak); | |
366 | } else { | |
367 | printf("time=%d\nevents=%d\nsize=%d\n", elapsedTime, noBreaks, textSize); | |
368 | } | |
369 | ||
370 | ||
371 | } | |
372 | ||
373 | void doIsBoundTest() { | |
374 | int32_t noBreaks = 0, hit = 0; | |
375 | int32_t i = 0, j = 0; | |
376 | unsigned long startTime = timeGetTime(); | |
377 | unsigned long elapsedTime = 0; | |
378 | createICUBrkIt(); | |
379 | brkit->setText(UnicodeString(text, textSize)); | |
380 | brkit->first(); | |
381 | for(j = 0; j < textSize; j++) { | |
382 | if(brkit->isBoundary(j)) { | |
383 | noBreaks++; | |
384 | //fprintf(stderr, "%d ", j); | |
385 | } | |
386 | } | |
387 | /* | |
388 | while(brkit->next() != BreakIterator::DONE) { | |
389 | noBreaks++; | |
390 | } | |
391 | */ | |
392 | ||
393 | startTime = timeGetTime(); | |
394 | for(i = 0; i < opt_loopCount; i++) { | |
395 | for(j = 0; j < textSize; j++) { | |
396 | if(brkit->isBoundary(j)) { | |
397 | hit++; | |
398 | } | |
399 | } | |
400 | } | |
401 | ||
402 | elapsedTime = timeGetTime()-startTime; | |
403 | int32_t loopTime = (int)(float(1000) * ((float)elapsedTime/(float)opt_loopCount)); | |
404 | if (opt_terse == FALSE) { | |
405 | int32_t timePerCU = (int)(float(1000) * ((float)loopTime/(float)textSize)); | |
406 | int32_t timePerBreak = (int)(float(1000) * ((float)loopTime/(float)noBreaks)); | |
407 | printf("forward break iteration average loop time %d\n", loopTime); | |
408 | printf("number of code units %d average time per code unit %d\n", textSize, timePerCU); | |
409 | printf("number of breaks %d average time per break %d\n", noBreaks, timePerBreak); | |
410 | } else { | |
411 | printf("time=%d\nevents=%d\nsize=%d\n", elapsedTime, noBreaks, textSize); | |
412 | } | |
413 | } | |
414 | ||
415 | //---------------------------------------------------------------------------------------- | |
416 | // | |
417 | // UnixConvert -- Convert the lines of the file to the encoding for UNIX | |
418 | // Since it appears that Unicode support is going in the general | |
419 | // direction of the use of UTF-8 locales, that is the approach | |
420 | // that is used here. | |
421 | // | |
422 | //---------------------------------------------------------------------------------------- | |
// Currently a no-op: the whole body is compiled out with "#if 0".
// The disabled code converts per-line UTF-16 strings to UTF-8 with an ICU
// converter. It refers to gNumFileLines and gFileLines, which are not
// declared in the visible part of this file, so it has to be reworked
// ("redo for unix") before it can be enabled.
void UnixConvert() {
#if 0
    int line;

    UConverter *cvrtr; // An ICU code page converter.
    UErrorCode status = U_ZERO_ERROR;


    cvrtr = ucnv_open("utf-8", &status); // we are just doing UTF-8 locales for now.
    if (U_FAILURE(status)) {
        // Pass the error code itself, not its address, to %d.
        fprintf(stderr, "ICU Converter open failed.: %d\n", status);
        exit(-1);
    }
    // redo for unix
    for (line=0; line < gNumFileLines; line++) {
        // First call only computes the required buffer size.
        int sizeNeeded = ucnv_fromUChars(cvrtr,
                                         0, // ptr to target buffer.
                                         0, // length of target buffer.
                                         gFileLines[line].name,
                                         -1, // source is null terminated
                                         &status);
        if (status != U_BUFFER_OVERFLOW_ERROR && status != U_ZERO_ERROR) {
            fprintf(stderr, "Conversion from Unicode, something is wrong.\n");
            exit(-1);
        }
        status = U_ZERO_ERROR;
        gFileLines[line].unixName = new char[sizeNeeded+1];
        sizeNeeded = ucnv_fromUChars(cvrtr,
                                     gFileLines[line].unixName, // ptr to target buffer.
                                     sizeNeeded+1, // length of target buffer.
                                     gFileLines[line].name,
                                     -1, // source is null terminated
                                     &status);
        if (U_FAILURE(status)) {
            fprintf(stderr, "ICU Conversion Failed.: %d\n", status);
            exit(-1);
        }
        gFileLines[line].unixName[sizeNeeded] = 0;
    };
    ucnv_close(cvrtr);
#endif
}
465 | ||
466 | ||
467 | //---------------------------------------------------------------------------------------- | |
468 | // | |
469 | // class UCharFile Class to hide all the gorp to read a file in | |
470 | // and produce a stream of UChars. | |
471 | // | |
472 | //---------------------------------------------------------------------------------------- | |
473 | class UCharFile { | |
474 | public: | |
475 | UCharFile(const char *fileName); | |
476 | ~UCharFile(); | |
477 | UChar get(); | |
478 | UBool eof() {return fEof;}; | |
479 | UBool error() {return fError;}; | |
480 | int32_t size() { return fFileSize; }; | |
481 | ||
482 | private: | |
483 | UCharFile (const UCharFile &other) {}; // No copy constructor. | |
484 | UCharFile & operator = (const UCharFile &other) {return *this;}; // No assignment op | |
485 | ||
486 | FILE *fFile; | |
487 | const char *fName; | |
488 | UBool fEof; | |
489 | UBool fError; | |
490 | UChar fPending2ndSurrogate; | |
491 | int32_t fFileSize; | |
492 | ||
493 | enum {UTF16LE, UTF16BE, UTF8} fEncoding; | |
494 | }; | |
495 | ||
496 | UCharFile::UCharFile(const char * fileName) { | |
497 | fEof = FALSE; | |
498 | fError = FALSE; | |
499 | fName = fileName; | |
500 | struct stat buf; | |
501 | int32_t result = stat(fileName, &buf); | |
502 | if(result != 0) { | |
503 | fprintf(stderr, "Error getting info\n"); | |
504 | fFileSize = -1; | |
505 | } else { | |
506 | fFileSize = buf.st_size; | |
507 | } | |
508 | fFile = fopen(fName, "rb"); | |
509 | fPending2ndSurrogate = 0; | |
510 | if (fFile == NULL) { | |
511 | fprintf(stderr, "Can not open file \"%s\"\n", opt_fName); | |
512 | fError = TRUE; | |
513 | return; | |
514 | } | |
515 | // | |
516 | // Look for the byte order mark at the start of the file. | |
517 | // | |
518 | int BOMC1, BOMC2, BOMC3; | |
519 | BOMC1 = fgetc(fFile); | |
520 | BOMC2 = fgetc(fFile); | |
521 | ||
522 | if (BOMC1 == 0xff && BOMC2 == 0xfe) { | |
523 | fEncoding = UTF16LE; } | |
524 | else if (BOMC1 == 0xfe && BOMC2 == 0xff) { | |
525 | fEncoding = UTF16BE; } | |
526 | else if (BOMC1 == 0xEF && BOMC2 == 0xBB && (BOMC3 = fgetc(fFile)) == 0xBF ) { | |
527 | fEncoding = UTF8; } | |
528 | else | |
529 | { | |
530 | fprintf(stderr, "collperf: file \"%s\" encoding must be UTF-8 or UTF-16, and " | |
531 | "must include a BOM.\n", fileName); | |
532 | fError = true; | |
533 | return; | |
534 | } | |
535 | } | |
536 | ||
537 | ||
538 | UCharFile::~UCharFile() { | |
539 | fclose(fFile); | |
540 | } | |
541 | ||
542 | ||
543 | ||
544 | UChar UCharFile::get() { | |
545 | UChar c; | |
546 | switch (fEncoding) { | |
547 | case UTF16LE: | |
548 | { | |
549 | int cL, cH; | |
550 | cL = fgetc(fFile); | |
551 | cH = fgetc(fFile); | |
552 | c = cL | (cH << 8); | |
553 | if (cH == EOF) { | |
554 | c = 0; | |
555 | fEof = TRUE; | |
556 | } | |
557 | break; | |
558 | } | |
559 | case UTF16BE: | |
560 | { | |
561 | int cL, cH; | |
562 | cH = fgetc(fFile); | |
563 | cL = fgetc(fFile); | |
564 | c = cL | (cH << 8); | |
565 | if (cL == EOF) { | |
566 | c = 0; | |
567 | fEof = TRUE; | |
568 | } | |
569 | break; | |
570 | } | |
571 | case UTF8: | |
572 | { | |
573 | if (fPending2ndSurrogate != 0) { | |
574 | c = fPending2ndSurrogate; | |
575 | fPending2ndSurrogate = 0; | |
576 | break; | |
577 | } | |
578 | ||
579 | int ch = fgetc(fFile); // Note: c and ch are separate cause eof test doesn't work on UChar type. | |
580 | if (ch == EOF) { | |
581 | c = 0; | |
582 | fEof = TRUE; | |
583 | break; | |
584 | } | |
585 | ||
586 | if (ch <= 0x7f) { | |
587 | // It's ascii. No further utf-8 conversion. | |
588 | c = ch; | |
589 | break; | |
590 | } | |
591 | ||
592 | // Figure out the lenght of the char and read the rest of the bytes | |
593 | // into a temp array. | |
594 | int nBytes; | |
595 | if (ch >= 0xF0) {nBytes=4;} | |
596 | else if (ch >= 0xE0) {nBytes=3;} | |
597 | else if (ch >= 0xC0) {nBytes=2;} | |
598 | else { | |
599 | fprintf(stderr, "not likely utf-8 encoded file %s contains corrupt data at offset %d.\n", fName, ftell(fFile)); | |
600 | fError = TRUE; | |
601 | return 0; | |
602 | } | |
603 | ||
604 | unsigned char bytes[10]; | |
605 | bytes[0] = (unsigned char)ch; | |
606 | int i; | |
607 | for (i=1; i<nBytes; i++) { | |
608 | bytes[i] = fgetc(fFile); | |
609 | if (bytes[i] < 0x80 || bytes[i] >= 0xc0) { | |
610 | fprintf(stderr, "utf-8 encoded file %s contains corrupt data at offset %d. Expected %d bytes, byte %d is invalid. First byte is %02X\n", fName, ftell(fFile), nBytes, i, ch); | |
611 | fError = TRUE; | |
612 | return 0; | |
613 | } | |
614 | } | |
615 | ||
616 | // Convert the bytes from the temp array to a Unicode char. | |
617 | i = 0; | |
618 | uint32_t cp; | |
51004dcb | 619 | U8_NEXT_UNSAFE(bytes, i, cp); |
b75a7d8f A |
620 | c = (UChar)cp; |
621 | ||
622 | if (cp >= 0x10000) { | |
623 | // The code point needs to be broken up into a utf-16 surrogate pair. | |
624 | // Process first half this time through the main loop, and | |
625 | // remember the other half for the next time through. | |
626 | UChar utf16Buf[3]; | |
627 | i = 0; | |
628 | UTF16_APPEND_CHAR_UNSAFE(utf16Buf, i, cp); | |
629 | fPending2ndSurrogate = utf16Buf[1]; | |
630 | c = utf16Buf[0]; | |
631 | } | |
632 | break; | |
633 | }; | |
634 | } | |
635 | return c; | |
636 | } | |
637 | ||
638 | ||
639 | //---------------------------------------------------------------------------------------- | |
640 | // | |
641 | // Main -- process command line, read in and pre-process the test file, | |
642 | // call other functions to do the actual tests. | |
643 | // | |
644 | //---------------------------------------------------------------------------------------- | |
645 | int main(int argc, const char** argv) { | |
646 | if (ProcessOptions(argc, argv, opts) != TRUE || opt_help || opt_fName == 0) { | |
647 | printf(gUsageString); | |
648 | exit (1); | |
649 | } | |
650 | // Make sure that we've only got one API selected. | |
651 | if (opt_mac || opt_unix || opt_win) opt_icu = FALSE; | |
652 | if (opt_mac || opt_unix) opt_win = FALSE; | |
653 | if (opt_mac) opt_unix = FALSE; | |
654 | ||
655 | UErrorCode status = U_ZERO_ERROR; | |
656 | ||
657 | ||
658 | ||
659 | // | |
660 | // Set up a Windows LCID | |
661 | // | |
662 | /* | |
663 | if (opt_langid != 0) { | |
664 | gWinLCID = MAKELCID(opt_langid, SORT_DEFAULT); | |
665 | } | |
666 | else { | |
667 | gWinLCID = uloc_getLCID(opt_locale); | |
668 | } | |
669 | */ | |
670 | ||
671 | // | |
672 | // Set the UNIX locale | |
673 | // | |
674 | if (opt_unix) { | |
675 | if (setlocale(LC_ALL, opt_locale) == 0) { | |
676 | fprintf(stderr, "setlocale(LC_ALL, %s) failed.\n", opt_locale); | |
677 | exit(-1); | |
678 | } | |
679 | } | |
680 | ||
681 | // Read in the input file. | |
682 | // File assumed to be utf-16. | |
683 | // Lines go onto heap buffers. Global index array to line starts is created. | |
684 | // Lines themselves are null terminated. | |
685 | // | |
686 | ||
687 | UCharFile f(opt_fName); | |
688 | if (f.error()) { | |
689 | exit(-1); | |
690 | } | |
691 | int32_t fileSize = f.size(); | |
692 | const int STARTSIZE = 70000; | |
693 | int32_t bufSize = 0; | |
694 | int32_t charCount = 0; | |
695 | if(fileSize != -1) { | |
696 | text = (UChar *)malloc(fileSize*sizeof(UChar)); | |
697 | bufSize = fileSize; | |
698 | } else { | |
699 | text = (UChar *)malloc(STARTSIZE*sizeof(UChar)); | |
700 | bufSize = STARTSIZE; | |
701 | } | |
702 | if(text == NULL) { | |
703 | fprintf(stderr, "Allocating buffer failed\n"); | |
704 | exit(-1); | |
705 | } | |
706 | ||
707 | ||
708 | // Read the file, split into lines, and save in memory. | |
709 | // Loop runs once per utf-16 value from the input file, | |
710 | // (The number of bytes read from file per loop iteration depends on external encoding.) | |
711 | for (;;) { | |
712 | ||
713 | UChar c = f.get(); | |
714 | if(f.eof()) { | |
715 | break; | |
716 | } | |
717 | if (f.error()){ | |
718 | exit(-1); | |
719 | } | |
720 | // We now have a good UTF-16 value in c. | |
721 | text[charCount++] = c; | |
722 | if(charCount == bufSize) { | |
723 | text = (UChar *)realloc(text, 2*bufSize*sizeof(UChar)); | |
724 | if(text == NULL) { | |
725 | fprintf(stderr, "Reallocating buffer failed\n"); | |
726 | exit(-1); | |
727 | } | |
728 | bufSize *= 2; | |
729 | } | |
730 | } | |
731 | ||
732 | ||
733 | if (opt_terse == FALSE) { | |
734 | printf("file \"%s\", %d charCount code units.\n", opt_fName, charCount); | |
735 | } | |
736 | ||
737 | textSize = charCount; | |
738 | ||
739 | ||
740 | ||
741 | ||
742 | // | |
743 | // Dump file contents if requested. | |
744 | // | |
745 | if (opt_dump) { | |
746 | // dump file, etc... possibly | |
747 | } | |
748 | ||
749 | ||
750 | // | |
751 | // We've got the file read into memory. Go do something with it. | |
752 | // | |
753 | int32_t i = 0; | |
754 | for(i = 0; i < opt_passesCount; i++) { | |
755 | if(opt_loopCount != 0) { | |
756 | if(opt_next) { | |
757 | doForwardTest(); | |
758 | } else if(opt_isBound) { | |
759 | doIsBoundTest(); | |
760 | } else { | |
761 | doForwardTest(); | |
762 | } | |
763 | } else if(opt_time != 0) { | |
764 | ||
765 | } | |
766 | } | |
767 | ||
768 | if(text != NULL) { | |
769 | free(text); | |
770 | } | |
771 | if(brkit != NULL) { | |
772 | delete brkit; | |
773 | } | |
774 | ||
775 | return 0; | |
776 | } |