]>
git.saurik.com Git - apple/icu.git/blob - icuSources/test/thaitest/thaitest.cpp
2 ******************************************************************************
3 * Copyright (C) 1998-2003, International Business Machines Corporation and *
4 * others. All Rights Reserved. *
5 ******************************************************************************
12 #include "unicode/utypes.h"
13 #include "unicode/uchar.h"
14 #include "unicode/uchriter.h"
15 #include "unicode/brkiter.h"
16 #include "unicode/locid.h"
17 #include "unicode/unistr.h"
20 * This program takes a Unicode text file containing Thai text with
21 * spaces inserted where the word breaks are. It computes a copy of
22 * the text without spaces and uses a word instance of a Thai BreakIterator
23 * to compute the word breaks. The program reports any differences in the
26 * NOTE: by its very nature, Thai word breaking is not exact, so it is
27 * expected that this program will always report some differences.
31 * This class is a break iterator that counts words and spaces.
33 class SpaceBreakIterator
37 // text - pointer to an array of UChars to iterate over
38 // count - the number of UChars in text
39 SpaceBreakIterator(const UChar
*text
, int32_t count
);
42 ~SpaceBreakIterator();
44 // return next break position
47 // return current word count
48 int32_t getWordCount();
50 // return current space count
51 int32_t getSpaceCount();
54 // No arg constructor: private so clients can't call it.
57 // The underlying BreakIterator
58 BreakIterator
*fBreakIter
;
60 // address of the UChar array
63 // number of UChars in fText
69 // current space count
72 // true when fBreakIter has returned DONE
77 * This is the main class. It compares word breaks and reports the differences.
79 class ThaiWordbreakTest
82 // The main constructor:
83 // spaces - pointer to a UChar array for the text with spaces
84 // spaceCount - the number of characters in the spaces array
85 // noSpaces - pointer to a UChar array for the text without spaces
86 // noSpaceCount - the number of characters in the noSpaces array
87 // verbose - report all breaks if true, otherwise just report differences
88 ThaiWordbreakTest(const UChar
*spaces
, int32_t spaceCount
, const UChar
*noSpaces
, int32_t noSpaceCount
, UBool verbose
);
91 // returns the number of breaks that are in the spaces array
92 // but aren't found in the noSpaces array
93 int32_t getBreaksNotFound();
95 // returns the number of breaks which are found in the noSpaces
96 // array but aren't in the spaces array
97 int32_t getInvalidBreaks();
99 // returns the number of words found in the spaces array
100 int32_t getWordCount();
102 // reads the input Unicode text file:
103 // fileName - the path name of the file
104 // charCount - set to the number of UChars read from the file
105 // returns - the address of the UChar array containing the characters
106 static const UChar
*readFile(char *fileName
, int32_t &charCount
);
108 // removes spaces from the input UChar array:
109 // spaces - pointer to the input UChar array
110 // count - number of UChars in the spaces array
111 // nonSpaceCount - the number of UChars in the result array
112 // returns - the address of the UChar array with spaces removed
113 static const UChar
*crunchSpaces(const UChar
*spaces
, int32_t count
, int32_t &nonSpaceCount
);
116 // The no arg constructor - private so clients can't call it
119 // This does the actual comparison:
120 // spaces - the address of the UChar array for the text with spaces
121 // spaceCount - the number of UChars in the spaces array
122 // noSpaces - the address of the UChar array for the text without spaces
123 // noSpaceCount - the number of UChars in the noSpaces array
124 // returns - true if all breaks match, FALSE otherwise
125 UBool
compareWordBreaks(const UChar
*spaces
, int32_t spaceCount
,
126 const UChar
*noSpaces
, int32_t noSpaceCount
);
128 // helper method to report a break in the spaces
129 // array that's not found in the noSpaces array
130 void breakNotFound(int32_t br
);
132 // helper method to report a break that's found in
133 // the noSpaces array that's not in the spaces array
134 void foundInvalidBreak(int32_t br
);
136 // count of breaks in the spaces array that
137 // aren't found in the noSpaces array
138 int32_t fBreaksNotFound
;
140 // count of breaks found in the noSpaces array
141 // that aren't in the spaces array
142 int32_t fInvalidBreaks
;
144 // number of words found in the spaces array
147 // report all breaks if true, otherwise just report differences
152 * The main constructor: it calls compareWordBreaks and reports any differences
// Main constructor: zeroes the three counters, stores the verbose flag, and
// runs the comparison immediately. The UBool returned by compareWordBreaks is
// not used here; results are read back through the counter accessors.
154 ThaiWordbreakTest::ThaiWordbreakTest(const UChar
*spaces
, int32_t spaceCount
,
155 const UChar
*noSpaces
, int32_t noSpaceCount
, UBool verbose
)
156 : fBreaksNotFound(0), fInvalidBreaks(0), fWordCount(0), fVerbose(verbose
)
158 compareWordBreaks(spaces
, spaceCount
, noSpaces
, noSpaceCount
);
162 * The no arg constructor
164 ThaiWordbreakTest::ThaiWordbreakTest()
172 ThaiWordbreakTest::~ThaiWordbreakTest()
178 * returns the number of breaks in the spaces array
179 * that aren't found in the noSpaces array
// Accessor for fBreaksNotFound, the counter incremented by breakNotFound().
181 inline int32_t ThaiWordbreakTest::getBreaksNotFound()
183 return fBreaksNotFound
;
187 * Returns the number of breaks found in the noSpaces
188 * array that aren't in the spaces array
// Accessor for fInvalidBreaks, which the main constructor initializes to 0.
190 inline int32_t ThaiWordbreakTest::getInvalidBreaks()
192 return fInvalidBreaks
;
196 * Returns the number of words found in the spaces array
198 inline int32_t ThaiWordbreakTest::getWordCount()
204 * This method does the actual break comparison and reports the results.
205 * It uses a SpaceBreakIterator to iterate over the text with spaces,
206 * and a word instance of a Thai BreakIterator to iterate over the text
// Walks the text with spaces (through a SpaceBreakIterator) and the text
// without spaces (through a word BreakIterator created for `thai`) in step,
// reports every position where the two disagree, and finally stores the word
// count seen in the spaced text into fWordCount.
209 UBool
ThaiWordbreakTest::compareWordBreaks(const UChar
*spaces
, int32_t spaceCount
,
210 const UChar
*noSpaces
, int32_t noSpaceCount
)
// NOTE(review): noSpaceIter is handed to adoptText() below; presumably the
// BreakIterator takes ownership of it (ICU "adopt" naming) - confirm.
214 UCharCharacterIterator
*noSpaceIter
= new UCharCharacterIterator(noSpaces
, noSpaceCount
);
215 UErrorCode status
= U_ZERO_ERROR
;
// NOTE(review): no check of `status` (or of breakIter being null) is visible
// before breakIter is dereferenced - confirm it is handled on lines not shown.
217 BreakIterator
*breakIter
= BreakIterator::createWordInstance(thai
, status
);
218 breakIter
->adoptText(noSpaceIter
);
220 SpaceBreakIterator
spaceIter(spaces
, spaceCount
);
222 int32_t nextBreak
= 0;
223 int32_t nextSpaceBreak
= 0;
224 int32_t iterCount
= 0;
// Advance both iterators by one break.
227 nextSpaceBreak
= spaceIter
.next();
228 nextBreak
= breakIter
->next();
// If either side is exhausted, say which one stopped first.
230 if (nextSpaceBreak
== BreakIterator::DONE
|| nextBreak
== BreakIterator::DONE
) {
231 if (nextBreak
!= BreakIterator::DONE
) {
232 fprintf(stderr
, "break iterator didn't end.\n");
233 } else if (nextSpaceBreak
!= BreakIterator::DONE
) {
234 fprintf(stderr
, "premature break iterator end.\n");
// Resynchronize: step whichever iterator is behind until the two positions
// match or one side ends, recording each unmatched break along the way.
240 while (nextSpaceBreak
!= nextBreak
&&
241 nextSpaceBreak
!= BreakIterator::DONE
&& nextBreak
!= BreakIterator::DONE
) {
242 if (nextSpaceBreak
< nextBreak
) {
243 breakNotFound(nextSpaceBreak
);
245 nextSpaceBreak
= spaceIter
.next();
246 } else if (nextSpaceBreak
> nextBreak
) {
247 foundInvalidBreak(nextBreak
);
249 nextBreak
= breakIter
->next();
// Prints the two current break positions; any guarding condition is on lines
// not shown here.
254 printf("%d %d\n", nextSpaceBreak
, nextBreak
);
// Record how many words the space-delimited iterator counted.
259 fWordCount
= spaceIter
.getWordCount();
267 * Report a break that's in the text with spaces but
268 * not found in the text without spaces.
// Reports offset br as a break that is present in the spaced text but was not
// produced for the unspaced text, then increments fBreaksNotFound.
// NOTE(review): both a stdout and a stderr message are visible; the condition
// choosing between them is not shown here (presumably fVerbose) - confirm.
270 void ThaiWordbreakTest::breakNotFound(int32_t br
)
273 printf("%d ****\n", br
);
275 fprintf(stderr
, "break not found: %d\n", br
);
278 fBreaksNotFound
+= 1;
282 * Report a break that's found in the text without spaces
283 * that isn't in the text with spaces.
// Reports offset br as a break produced for the unspaced text that has no
// counterpart in the spaced text.
// NOTE(review): both a stdout and a stderr message are visible; the condition
// choosing between them, and any update of fInvalidBreaks, are on lines not
// shown here - confirm.
285 void ThaiWordbreakTest::foundInvalidBreak(int32_t br
)
288 printf("**** %d\n", br
);
290 fprintf(stderr
, "found invalid break: %d\n", br
);
297 * Read the text from a file. The text must start with a Unicode Byte
298 * Order Mark (BOM) so that we know what order to read the bytes in.
// Reads the whole file into a byte buffer, converts it to a UnicodeString
// using the "UTF-8" codepage name, and copies the text - minus its first code
// unit - into a newly allocated UChar array. charCount receives the number of
// UChars in that array, not counting the terminating 0.
300 const UChar
*ThaiWordbreakTest::readFile(char *fileName
, int32_t &charCount
)
308 f
= fopen(fileName
, "rb");
311 fprintf(stderr
,"Couldn't open %s reason: %s \n", fileName
, strerror(errno
));
// Seek to the end and back to the start; fileSize is assigned on a line not
// shown here (presumably from ftell) - confirm.
315 fseek(f
, 0, SEEK_END
);
318 fseek(f
, 0, SEEK_SET
);
// NOTE(review): a standard (throwing) operator new never yields 0, so the
// check below only fires if the build uses a non-throwing allocator - confirm.
319 bufferChars
= new char[fileSize
];
321 if(bufferChars
== 0) {
322 fprintf(stderr
,"Couldn't get memory for reading %s reason: %s \n", fileName
, strerror(errno
));
327 fread(bufferChars
, sizeof(char), fileSize
, f
);
329 fprintf(stderr
,"Couldn't read %s reason: %s \n", fileName
, strerror(errno
));
331 delete[] bufferChars
;
// Convert the raw bytes to UTF-16; the byte buffer is released right after.
336 UnicodeString
myText(bufferChars
, fileSize
, "UTF-8");
338 delete[] bufferChars
;
340 charCount
= myText
.length();
341 buffer
= new UChar
[charCount
];
343 fprintf(stderr
,"Couldn't get memory for reading %s reason: %s \n", fileName
, strerror(errno
));
// Copying starts at index 1, so the first code unit is dropped
// unconditionally; nothing visible verifies that it really is a BOM.
// NOTE(review): if the converted text is empty, charCount is 0 here, the
// decrement below makes it -1, and buffer[charCount] then writes before the
// start of a zero-length array - confirm that empty input is rejected earlier.
347 myText
.extract(1, myText
.length(), buffer
);
348 charCount
--; // skip the BOM
349 buffer
[charCount
] = 0; // NULL terminate for easier reading in the debugger
355 * Remove spaces from the input UChar array.
357 * We check explicitly for a Unicode code value of 0x0020
358 * because Unicode::isSpaceChar returns true for CR, LF, etc.
// Builds a copy of `spaces` with every U+0020 removed. Two passes over the
// input: the first feeds the size computation, the second copies the
// remaining characters. Only 0x0020 is treated as a space; other whitespace
// (CR, LF, ...) is kept. nonSpaceCount is set to count - spaceCount.
361 const UChar
*ThaiWordbreakTest::crunchSpaces(const UChar
*spaces
, int32_t count
, int32_t &nonSpaceCount
)
// NOTE(review): spaceCount is declared without an initializer; its
// initialization and per-space increment are presumably on lines not shown
// here - confirm.
363 int32_t i
, out
, spaceCount
;
// Pass 1: find the spaces.
366 for (i
= 0; i
< count
; i
+= 1) {
367 if (spaces
[i
] == 0x0020 /*Unicode::isSpaceChar(spaces[i])*/) {
372 nonSpaceCount
= count
- spaceCount
;
373 UChar
*noSpaces
= new UChar
[nonSpaceCount
];
376 fprintf(stderr
, "Couldn't allocate memory for the space stripped text.\n");
// Pass 2: copy everything that is not U+0020, preserving order.
380 for (out
= 0, i
= 0; i
< count
; i
+= 1) {
381 if (spaces
[i
] != 0x0020 /*! Unicode::isSpaceChar(spaces[i])*/) {
382 noSpaces
[out
++] = spaces
[i
];
390 * The main routine. Read the command line arguments, read the text file,
391 * remove the spaces, do the comparison and report the final results
// Entry point. Usage: <program> [-verbose] [<file>]. Reads the input file
// (default "space.txt"), strips the spaces, runs the comparison by
// constructing a ThaiWordbreakTest, and prints the three resulting counters.
393 int main(int argc
, char **argv
)
// NOTE(review): binding a string literal to a non-const char* is ill-formed
// since C++11 (deprecated before that). readFile takes `char *fileName`, so
// both would need to become const char* to fix this.
395 char *fileName
= "space.txt";
397 UBool verbose
= FALSE
;
399 if (argc
>= 2 && strcmp(argv
[1], "-verbose") == 0) {
// When `arg` indexes the last argument, that argument is used as the file
// name.
404 if (arg
== argc
- 1) {
405 fileName
= argv
[arg
++];
409 fprintf(stderr
, "Usage: %s [-verbose] [<file>]\n", argv
[0]);
413 int32_t spaceCount
, nonSpaceCount
;
414 const UChar
*spaces
, *noSpaces
;
416 spaces
= ThaiWordbreakTest::readFile(fileName
, spaceCount
);
422 noSpaces
= ThaiWordbreakTest::crunchSpaces(spaces
, spaceCount
, nonSpaceCount
);
// Constructing the test object performs the whole comparison.
428 ThaiWordbreakTest
test(spaces
, spaceCount
, noSpaces
, nonSpaceCount
, verbose
);
430 printf("word count: %d\n", test
.getWordCount());
431 printf("breaks not found: %d\n", test
.getBreaksNotFound());
432 printf("invalid breaks found: %d\n", test
.getInvalidBreaks());
438 * The main constructor. Clear all the counts and construct a default
439 * word instance of a BreakIterator.
// Main constructor: stores the text pointer and length, zeroes the word and
// space counters, clears fDone, and creates a word BreakIterator (for `us`)
// over the text.
441 SpaceBreakIterator::SpaceBreakIterator(const UChar
*text
, int32_t count
)
442 : fBreakIter(0), fText(text
), fTextCount(count
), fWordCount(0), fSpaceCount(0), fDone(FALSE
)
// NOTE(review): `iter` is handed to adoptText() below; presumably the
// BreakIterator takes ownership of it (ICU "adopt" naming) - confirm.
444 UCharCharacterIterator
*iter
= new UCharCharacterIterator(text
, count
);
445 UErrorCode status
= U_ZERO_ERROR
;
// NOTE(review): no check of `status` (or of fBreakIter being null) is visible
// before fBreakIter is dereferenced - confirm.
448 fBreakIter
= BreakIterator::createWordInstance(us
, status
);
449 fBreakIter
->adoptText(iter
);
452 SpaceBreakIterator::SpaceBreakIterator()
458 * The destructor. delete the underlying BreakIterator
460 SpaceBreakIterator::~SpaceBreakIterator()
466 * Return the next break, counting words and spaces.
// Returns the next break position expressed as an offset into the text with
// the spaces removed: the underlying iterator's offset minus the spaces
// consumed so far. Returns BreakIterator::DONE once the underlying iterator
// is exhausted.
468 int32_t SpaceBreakIterator::next()
471 return BreakIterator::DONE
;
474 int32_t nextBreak
= fBreakIter
->next();
476 if (nextBreak
== BreakIterator::DONE
) {
478 return BreakIterator::DONE
;
// Translate from spaced-text offset to unspaced-text offset.
481 int32_t result
= nextBreak
- fSpaceCount
;
// If the break sits on a U+0020, step the underlying iterator once more and
// add the distance it moved to fSpaceCount.
// NOTE(review): the value returned by this second next() is not compared with
// DONE in the visible lines - confirm a boundary always follows here.
483 if (nextBreak
< fTextCount
) {
484 if (fText
[nextBreak
] == 0x0020 /*Unicode::isSpaceChar(fText[nextBreak])*/) {
485 fSpaceCount
+= fBreakIter
->next() - nextBreak
;
495 * Returns the current space count
497 int32_t SpaceBreakIterator::getSpaceCount()
503 * Returns the current word count
505 int32_t SpaceBreakIterator::getWordCount()