]> git.saurik.com Git - apple/icu.git/blob - icuSources/test/thaitest/thaitest.cpp
ICU-6.2.4.tar.gz
[apple/icu.git] / icuSources / test / thaitest / thaitest.cpp
1 /*
2 ******************************************************************************
3 * Copyright (C) 1998-2003, International Business Machines Corporation and *
4 * others. All Rights Reserved. *
5 ******************************************************************************
6 */
7
8 #include <errno.h>
9 #include <stdio.h>
10 #include <string.h>
11
12 #include "unicode/utypes.h"
13 #include "unicode/uchar.h"
14 #include "unicode/uchriter.h"
15 #include "unicode/brkiter.h"
16 #include "unicode/locid.h"
17 #include "unicode/unistr.h"
18
/*
 * This program takes a Unicode text file containing Thai text with
 * spaces inserted where the word breaks are. It computes a copy of
 * the text without spaces and uses a word instance of a Thai BreakIterator
 * to compute the word breaks. The program reports any differences in the
 * breaks.
 *
 * NOTE: by its very nature, Thai word breaking is not exact, so it is
 * expected that this program will always report some differences.
 */
29
30 /*
31 * This class is a break iterator that counts words and spaces.
32 */
33 class SpaceBreakIterator
34 {
35 public:
36 // The constructor:
37 // text - pointer to an array of UChars to iterate over
38 // count - the number of UChars in text
39 SpaceBreakIterator(const UChar *text, int32_t count);
40
41 // the destructor
42 ~SpaceBreakIterator();
43
44 // return next break position
45 int32_t next();
46
47 // return current word count
48 int32_t getWordCount();
49
50 // return current space count
51 int32_t getSpaceCount();
52
53 private:
54 // No arg constructor: private so clients can't call it.
55 SpaceBreakIterator();
56
57 // The underlying BreakIterator
58 BreakIterator *fBreakIter;
59
60 // address of the UChar array
61 const UChar *fText;
62
63 // number of UChars in fText
64 int32_t fTextCount;
65
66 // current word count
67 int32_t fWordCount;
68
69 // current space count
70 int32_t fSpaceCount;
71
72 // true when fBreakIter has returned DONE
73 UBool fDone;
74 };
75
76 /*
77 * This is the main class. It compares word breaks and reports the differences.
78 */
79 class ThaiWordbreakTest
80 {
81 public:
82 // The main constructor:
83 // spaces - pointer to a UChar array for the text with spaces
84 // spaceCount - the number of characters in the spaces array
85 // noSpaces - pointer to a UChar array for the text without spaces
86 // noSpaceCount - the number of characters in the noSpaces array
87 // verbose - report all breaks if true, otherwise just report differences
88 ThaiWordbreakTest(const UChar *spaces, int32_t spaceCount, const UChar *noSpaces, int32_t noSpaceCount, UBool verbose);
89 ~ThaiWordbreakTest();
90
91 // returns the number of breaks that are in the spaces array
92 // but aren't found in the noSpaces array
93 int32_t getBreaksNotFound();
94
95 // returns the number of breaks which are found in the noSpaces
96 // array but aren't in the spaces array
97 int32_t getInvalidBreaks();
98
99 // returns the number of words found in the spaces array
100 int32_t getWordCount();
101
102 // reads the input Unicode text file:
103 // fileName - the path name of the file
104 // charCount - set to the number of UChars read from the file
105 // returns - the address of the UChar array containing the characters
106 static const UChar *readFile(char *fileName, int32_t &charCount);
107
108 // removes spaces form the input UChar array:
109 // spaces - pointer to the input UChar array
110 // count - number of UChars in the spaces array
111 // nonSpaceCount - the number of UChars in the result array
112 // returns - the address of the UChar array with spaces removed
113 static const UChar *crunchSpaces(const UChar *spaces, int32_t count, int32_t &nonSpaceCount);
114
115 private:
116 // The no arg constructor - private so clients can't call it
117 ThaiWordbreakTest();
118
119 // This does the actual comparison:
120 // spaces - the address of the UChar array for the text with spaces
121 // spaceCount - the number of UChars in the spaces array
122 // noSpaces - the address of the UChar array for the text without spaces
123 // noSpaceCount - the number of UChars in the noSpaces array
124 // returns - true if all breaks match, FALSE otherwise
125 UBool compareWordBreaks(const UChar *spaces, int32_t spaceCount,
126 const UChar *noSpaces, int32_t noSpaceCount);
127
128 // helper method to report a break in the spaces
129 // array that's not found in the noSpaces array
130 void breakNotFound(int32_t br);
131
132 // helper method to report a break that's found in
133 // the noSpaces array that's not in the spaces array
134 void foundInvalidBreak(int32_t br);
135
136 // count of breaks in the spaces array that
137 // aren't found in the noSpaces array
138 int32_t fBreaksNotFound;
139
140 // count of breaks found in the noSpaces array
141 // that aren't in the spaces array
142 int32_t fInvalidBreaks;
143
144 // number of words found in the spaces array
145 int32_t fWordCount;
146
147 // report all breaks if true, otherwise just report differences
148 UBool fVerbose;
149 };
150
151 /*
152 * The main constructor: it calls compareWordBreaks and reports any differences
153 */
154 ThaiWordbreakTest::ThaiWordbreakTest(const UChar *spaces, int32_t spaceCount,
155 const UChar *noSpaces, int32_t noSpaceCount, UBool verbose)
156 : fBreaksNotFound(0), fInvalidBreaks(0), fWordCount(0), fVerbose(verbose)
157 {
158 compareWordBreaks(spaces, spaceCount, noSpaces, noSpaceCount);
159 }
160
161 /*
162 * The no arg constructor
163 */
164 ThaiWordbreakTest::ThaiWordbreakTest()
165 {
166 // nothing
167 }
168
169 /*
170 * The destructor
171 */
172 ThaiWordbreakTest::~ThaiWordbreakTest()
173 {
174 // nothing?
175 }
176
177 /*
178 * returns the number of breaks in the spaces array
179 * that aren't found in the noSpaces array
180 */
181 inline int32_t ThaiWordbreakTest::getBreaksNotFound()
182 {
183 return fBreaksNotFound;
184 }
185
186 /*
187 * Returns the number of breaks found in the noSpaces
188 * array that aren't in the spaces array
189 */
190 inline int32_t ThaiWordbreakTest::getInvalidBreaks()
191 {
192 return fInvalidBreaks;
193 }
194
195 /*
196 * Returns the number of words found in the spaces array
197 */
198 inline int32_t ThaiWordbreakTest::getWordCount()
199 {
200 return fWordCount;
201 }
202
203 /*
204 * This method does the acutal break comparison and reports the results.
205 * It uses a SpaceBreakIterator to iterate over the text with spaces,
206 * and a word instance of a Thai BreakIterator to iterate over the text
207 * without spaces.
208 */
209 UBool ThaiWordbreakTest::compareWordBreaks(const UChar *spaces, int32_t spaceCount,
210 const UChar *noSpaces, int32_t noSpaceCount)
211 {
212 UBool result = TRUE;
213 Locale thai("th");
214 UCharCharacterIterator *noSpaceIter = new UCharCharacterIterator(noSpaces, noSpaceCount);
215 UErrorCode status = U_ZERO_ERROR;
216
217 BreakIterator *breakIter = BreakIterator::createWordInstance(thai, status);
218 breakIter->adoptText(noSpaceIter);
219
220 SpaceBreakIterator spaceIter(spaces, spaceCount);
221
222 int32_t nextBreak = 0;
223 int32_t nextSpaceBreak = 0;
224 int32_t iterCount = 0;
225
226 while (TRUE) {
227 nextSpaceBreak = spaceIter.next();
228 nextBreak = breakIter->next();
229
230 if (nextSpaceBreak == BreakIterator::DONE || nextBreak == BreakIterator::DONE) {
231 if (nextBreak != BreakIterator::DONE) {
232 fprintf(stderr, "break iterator didn't end.\n");
233 } else if (nextSpaceBreak != BreakIterator::DONE) {
234 fprintf(stderr, "premature break iterator end.\n");
235 }
236
237 break;
238 }
239
240 while (nextSpaceBreak != nextBreak &&
241 nextSpaceBreak != BreakIterator::DONE && nextBreak != BreakIterator::DONE) {
242 if (nextSpaceBreak < nextBreak) {
243 breakNotFound(nextSpaceBreak);
244 result = FALSE;
245 nextSpaceBreak = spaceIter.next();
246 } else if (nextSpaceBreak > nextBreak) {
247 foundInvalidBreak(nextBreak);
248 result = FALSE;
249 nextBreak = breakIter->next();
250 }
251 }
252
253 if (fVerbose) {
254 printf("%d %d\n", nextSpaceBreak, nextBreak);
255 }
256 }
257
258
259 fWordCount = spaceIter.getWordCount();
260
261 delete breakIter;
262
263 return result;
264 }
265
266 /*
267 * Report a break that's in the text with spaces but
268 * not found in the text without spaces.
269 */
270 void ThaiWordbreakTest::breakNotFound(int32_t br)
271 {
272 if (fVerbose) {
273 printf("%d ****\n", br);
274 } else {
275 fprintf(stderr, "break not found: %d\n", br);
276 }
277
278 fBreaksNotFound += 1;
279 }
280
281 /*
282 * Report a break that's found in the text without spaces
283 * that isn't in the text with spaces.
284 */
285 void ThaiWordbreakTest::foundInvalidBreak(int32_t br)
286 {
287 if (fVerbose) {
288 printf("**** %d\n", br);
289 } else {
290 fprintf(stderr, "found invalid break: %d\n", br);
291 }
292
293 fInvalidBreaks += 1;
294 }
295
296 /*
297 * Read the text from a file. The text must start with a Unicode Byte
298 * Order Mark (BOM) so that we know what order to read the bytes in.
299 */
300 const UChar *ThaiWordbreakTest::readFile(char *fileName, int32_t &charCount)
301 {
302 FILE *f;
303 int32_t fileSize;
304
305 UChar *buffer;
306 char *bufferChars;
307
308 f = fopen(fileName, "rb");
309
310 if( f == NULL ) {
311 fprintf(stderr,"Couldn't open %s reason: %s \n", fileName, strerror(errno));
312 return 0;
313 }
314
315 fseek(f, 0, SEEK_END);
316 fileSize = ftell(f);
317
318 fseek(f, 0, SEEK_SET);
319 bufferChars = new char[fileSize];
320
321 if(bufferChars == 0) {
322 fprintf(stderr,"Couldn't get memory for reading %s reason: %s \n", fileName, strerror(errno));
323 fclose(f);
324 return 0;
325 }
326
327 fread(bufferChars, sizeof(char), fileSize, f);
328 if( ferror(f) ) {
329 fprintf(stderr,"Couldn't read %s reason: %s \n", fileName, strerror(errno));
330 fclose(f);
331 delete[] bufferChars;
332 return 0;
333 }
334 fclose(f);
335
336 UnicodeString myText(bufferChars, fileSize, "UTF-8");
337
338 delete[] bufferChars;
339
340 charCount = myText.length();
341 buffer = new UChar[charCount];
342 if(buffer == 0) {
343 fprintf(stderr,"Couldn't get memory for reading %s reason: %s \n", fileName, strerror(errno));
344 return 0;
345 }
346
347 myText.extract(1, myText.length(), buffer);
348 charCount--; // skip the BOM
349 buffer[charCount] = 0; // NULL terminate for easier reading in the debugger
350
351 return buffer;
352 }
353
354 /*
355 * Remove spaces from the input UChar array.
356 *
357 * We check explicitly for a Unicode code value of 0x0020
358 * because Unicode::isSpaceChar returns true for CR, LF, etc.
359 *
360 */
361 const UChar *ThaiWordbreakTest::crunchSpaces(const UChar *spaces, int32_t count, int32_t &nonSpaceCount)
362 {
363 int32_t i, out, spaceCount;
364
365 spaceCount = 0;
366 for (i = 0; i < count; i += 1) {
367 if (spaces[i] == 0x0020 /*Unicode::isSpaceChar(spaces[i])*/) {
368 spaceCount += 1;
369 }
370 }
371
372 nonSpaceCount = count - spaceCount;
373 UChar *noSpaces = new UChar[nonSpaceCount];
374
375 if (noSpaces == 0) {
376 fprintf(stderr, "Couldn't allocate memory for the space stripped text.\n");
377 return 0;
378 }
379
380 for (out = 0, i = 0; i < count; i += 1) {
381 if (spaces[i] != 0x0020 /*! Unicode::isSpaceChar(spaces[i])*/) {
382 noSpaces[out++] = spaces[i];
383 }
384 }
385
386 return noSpaces;
387 }
388
389 /*
390 * The main routine. Read the command line arguments, read the text file,
391 * remove the spaces, do the comparison and report the final results
392 */
393 int main(int argc, char **argv)
394 {
395 char *fileName = "space.txt";
396 int arg = 1;
397 UBool verbose = FALSE;
398
399 if (argc >= 2 && strcmp(argv[1], "-verbose") == 0) {
400 verbose = TRUE;
401 arg += 1;
402 }
403
404 if (arg == argc - 1) {
405 fileName = argv[arg++];
406 }
407
408 if (arg != argc) {
409 fprintf(stderr, "Usage: %s [-verbose] [<file>]\n", argv[0]);
410 return 1;
411 }
412
413 int32_t spaceCount, nonSpaceCount;
414 const UChar *spaces, *noSpaces;
415
416 spaces = ThaiWordbreakTest::readFile(fileName, spaceCount);
417
418 if (spaces == 0) {
419 return 1;
420 }
421
422 noSpaces = ThaiWordbreakTest::crunchSpaces(spaces, spaceCount, nonSpaceCount);
423
424 if (noSpaces == 0) {
425 return 1;
426 }
427
428 ThaiWordbreakTest test(spaces, spaceCount, noSpaces, nonSpaceCount, verbose);
429
430 printf("word count: %d\n", test.getWordCount());
431 printf("breaks not found: %d\n", test.getBreaksNotFound());
432 printf("invalid breaks found: %d\n", test.getInvalidBreaks());
433
434 return 0;
435 }
436
437 /*
438 * The main constructor. Clear all the counts and construct a default
439 * word instance of a BreakIterator.
440 */
441 SpaceBreakIterator::SpaceBreakIterator(const UChar *text, int32_t count)
442 : fBreakIter(0), fText(text), fTextCount(count), fWordCount(0), fSpaceCount(0), fDone(FALSE)
443 {
444 UCharCharacterIterator *iter = new UCharCharacterIterator(text, count);
445 UErrorCode status = U_ZERO_ERROR;
446 Locale us("us");
447
448 fBreakIter = BreakIterator::createWordInstance(us, status);
449 fBreakIter->adoptText(iter);
450 }
451
452 SpaceBreakIterator::SpaceBreakIterator()
453 {
454 // nothing
455 }
456
457 /*
458 * The destructor. delete the underlying BreakIterator
459 */
460 SpaceBreakIterator::~SpaceBreakIterator()
461 {
462 delete fBreakIter;
463 }
464
465 /*
466 * Return the next break, counting words and spaces.
467 */
468 int32_t SpaceBreakIterator::next()
469 {
470 if (fDone) {
471 return BreakIterator::DONE;
472 }
473
474 int32_t nextBreak = fBreakIter->next();
475
476 if (nextBreak == BreakIterator::DONE) {
477 fDone = TRUE;
478 return BreakIterator::DONE;
479 }
480
481 int32_t result = nextBreak - fSpaceCount;
482
483 if (nextBreak < fTextCount) {
484 if (fText[nextBreak] == 0x0020 /*Unicode::isSpaceChar(fText[nextBreak])*/) {
485 fSpaceCount += fBreakIter->next() - nextBreak;
486 }
487 }
488
489 fWordCount += 1;
490
491 return result;
492 }
493
494 /*
495 * Returns the current space count
496 */
497 int32_t SpaceBreakIterator::getSpaceCount()
498 {
499 return fSpaceCount;
500 }
501
502 /*
503 * Returns the current word count
504 */
505 int32_t SpaceBreakIterator::getWordCount()
506 {
507 return fWordCount;
508 }
509
510