/*
 ******************************************************************************
 * Copyright (C) 1998-2003, 2006, International Business Machines Corporation *
 * and others. All Rights Reserved.                                           *
 ******************************************************************************
 */
#include <errno.h>
#include <stdio.h>
#include <string.h>

#include <new>

#include "unicode/utypes.h"
#include "unicode/uchar.h"
#include "unicode/uchriter.h"
#include "unicode/brkiter.h"
#include "unicode/locid.h"
#include "unicode/unistr.h"
#include "unicode/uniset.h"
#include "unicode/ustring.h"
/*
 * This program takes a Unicode text file containing Thai text with
 * spaces inserted where the word breaks are. It computes a copy of
 * the text without spaces and uses a word instance of a Thai BreakIterator
 * to compute the word breaks. The program reports any differences in the
 * breaks.
 *
 * NOTE: by its very nature, Thai word breaking is not exact, so it is
 * expected that this program will always report some differences.
 */
33 * This class is a break iterator that counts words and spaces.
35 class SpaceBreakIterator
39 // text - pointer to an array of UChars to iterate over
40 // count - the number of UChars in text
41 SpaceBreakIterator(const UChar
*text
, int32_t count
);
44 ~SpaceBreakIterator();
46 // return next break position
49 // return current word count
50 int32_t getWordCount();
52 // return current space count
53 int32_t getSpaceCount();
56 // No arg constructor: private so clients can't call it.
59 // The underlying BreakIterator
60 BreakIterator
*fBreakIter
;
62 // address of the UChar array
65 // number of UChars in fText
71 // current space count
74 // UnicodeSet of SA characters
75 UnicodeSet fComplexContext
;
77 // true when fBreakIter has returned DONE
82 * This is the main class. It compares word breaks and reports the differences.
84 class ThaiWordbreakTest
87 // The main constructor:
88 // spaces - pointer to a UChar array for the text with spaces
89 // spaceCount - the number of characters in the spaces array
90 // noSpaces - pointer to a UChar array for the text without spaces
91 // noSpaceCount - the number of characters in the noSpaces array
92 // verbose - report all breaks if true, otherwise just report differences
93 ThaiWordbreakTest(const UChar
*spaces
, int32_t spaceCount
, const UChar
*noSpaces
, int32_t noSpaceCount
, UBool verbose
);
96 // returns the number of breaks that are in the spaces array
97 // but aren't found in the noSpaces array
98 int32_t getBreaksNotFound();
100 // returns the number of breaks which are found in the noSpaces
101 // array but aren't in the spaces array
102 int32_t getInvalidBreaks();
104 // returns the number of words found in the spaces array
105 int32_t getWordCount();
107 // reads the input Unicode text file:
108 // fileName - the path name of the file
109 // charCount - set to the number of UChars read from the file
110 // returns - the address of the UChar array containing the characters
111 static const UChar
*readFile(char *fileName
, int32_t &charCount
);
113 // removes spaces form the input UChar array:
114 // spaces - pointer to the input UChar array
115 // count - number of UChars in the spaces array
116 // nonSpaceCount - the number of UChars in the result array
117 // returns - the address of the UChar array with spaces removed
118 static const UChar
*crunchSpaces(const UChar
*spaces
, int32_t count
, int32_t &nonSpaceCount
);
121 // The no arg constructor - private so clients can't call it
124 // This does the actual comparison:
125 // spaces - the address of the UChar array for the text with spaces
126 // spaceCount - the number of UChars in the spaces array
127 // noSpaces - the address of the UChar array for the text without spaces
128 // noSpaceCount - the number of UChars in the noSpaces array
129 // returns - true if all breaks match, FALSE otherwise
130 UBool
compareWordBreaks(const UChar
*spaces
, int32_t spaceCount
,
131 const UChar
*noSpaces
, int32_t noSpaceCount
);
133 // helper method to report a break in the spaces
134 // array that's not found in the noSpaces array
135 void breakNotFound(int32_t br
);
137 // helper method to report a break that's found in
138 // the noSpaces array that's not in the spaces array
139 void foundInvalidBreak(int32_t br
);
141 // count of breaks in the spaces array that
142 // aren't found in the noSpaces array
143 int32_t fBreaksNotFound
;
145 // count of breaks found in the noSpaces array
146 // that aren't in the spaces array
147 int32_t fInvalidBreaks
;
149 // number of words found in the spaces array
152 // report all breaks if true, otherwise just report differences
157 * The main constructor: it calls compareWordBreaks and reports any differences
159 ThaiWordbreakTest::ThaiWordbreakTest(const UChar
*spaces
, int32_t spaceCount
,
160 const UChar
*noSpaces
, int32_t noSpaceCount
, UBool verbose
)
161 : fBreaksNotFound(0), fInvalidBreaks(0), fWordCount(0), fVerbose(verbose
)
163 compareWordBreaks(spaces
, spaceCount
, noSpaces
, noSpaceCount
);
167 * The no arg constructor
169 ThaiWordbreakTest::ThaiWordbreakTest()
177 ThaiWordbreakTest::~ThaiWordbreakTest()
183 * returns the number of breaks in the spaces array
184 * that aren't found in the noSpaces array
186 inline int32_t ThaiWordbreakTest::getBreaksNotFound()
188 return fBreaksNotFound
;
192 * Returns the number of breaks found in the noSpaces
193 * array that aren't in the spaces array
195 inline int32_t ThaiWordbreakTest::getInvalidBreaks()
197 return fInvalidBreaks
;
201 * Returns the number of words found in the spaces array
203 inline int32_t ThaiWordbreakTest::getWordCount()
209 * This method does the acutal break comparison and reports the results.
210 * It uses a SpaceBreakIterator to iterate over the text with spaces,
211 * and a word instance of a Thai BreakIterator to iterate over the text
214 UBool
ThaiWordbreakTest::compareWordBreaks(const UChar
*spaces
, int32_t spaceCount
,
215 const UChar
*noSpaces
, int32_t noSpaceCount
)
219 UCharCharacterIterator
*noSpaceIter
= new UCharCharacterIterator(noSpaces
, noSpaceCount
);
220 UErrorCode status
= U_ZERO_ERROR
;
222 BreakIterator
*breakIter
= BreakIterator::createWordInstance(thai
, status
);
223 breakIter
->adoptText(noSpaceIter
);
225 SpaceBreakIterator
spaceIter(spaces
, spaceCount
);
227 int32_t nextBreak
= 0;
228 int32_t nextSpaceBreak
= 0;
229 int32_t iterCount
= 0;
232 nextSpaceBreak
= spaceIter
.next();
233 nextBreak
= breakIter
->next();
235 if (nextSpaceBreak
== BreakIterator::DONE
|| nextBreak
== BreakIterator::DONE
) {
236 if (nextBreak
!= BreakIterator::DONE
) {
237 fprintf(stderr
, "break iterator didn't end.\n");
238 } else if (nextSpaceBreak
!= BreakIterator::DONE
) {
239 fprintf(stderr
, "premature break iterator end.\n");
245 while (nextSpaceBreak
!= nextBreak
&&
246 nextSpaceBreak
!= BreakIterator::DONE
&& nextBreak
!= BreakIterator::DONE
) {
247 if (nextSpaceBreak
< nextBreak
) {
248 breakNotFound(nextSpaceBreak
);
250 nextSpaceBreak
= spaceIter
.next();
251 } else if (nextSpaceBreak
> nextBreak
) {
252 foundInvalidBreak(nextBreak
);
254 nextBreak
= breakIter
->next();
259 printf("%d %d\n", nextSpaceBreak
, nextBreak
);
264 fWordCount
= spaceIter
.getWordCount();
272 * Report a break that's in the text with spaces but
273 * not found in the text without spaces.
275 void ThaiWordbreakTest::breakNotFound(int32_t br
)
278 printf("%d ****\n", br
);
280 fprintf(stderr
, "break not found: %d\n", br
);
283 fBreaksNotFound
+= 1;
287 * Report a break that's found in the text without spaces
288 * that isn't in the text with spaces.
290 void ThaiWordbreakTest::foundInvalidBreak(int32_t br
)
293 printf("**** %d\n", br
);
295 fprintf(stderr
, "found invalid break: %d\n", br
);
302 * Read the text from a file. The text must start with a Unicode Byte
303 * Order Mark (BOM) so that we know what order to read the bytes in.
305 const UChar
*ThaiWordbreakTest::readFile(char *fileName
, int32_t &charCount
)
313 f
= fopen(fileName
, "rb");
316 fprintf(stderr
,"Couldn't open %s reason: %s \n", fileName
, strerror(errno
));
320 fseek(f
, 0, SEEK_END
);
323 fseek(f
, 0, SEEK_SET
);
324 bufferChars
= new char[fileSize
];
326 if(bufferChars
== 0) {
327 fprintf(stderr
,"Couldn't get memory for reading %s reason: %s \n", fileName
, strerror(errno
));
332 fread(bufferChars
, sizeof(char), fileSize
, f
);
334 fprintf(stderr
,"Couldn't read %s reason: %s \n", fileName
, strerror(errno
));
336 delete[] bufferChars
;
341 UnicodeString
myText(bufferChars
, fileSize
, "UTF-8");
343 delete[] bufferChars
;
345 charCount
= myText
.length();
346 buffer
= new UChar
[charCount
];
348 fprintf(stderr
,"Couldn't get memory for reading %s reason: %s \n", fileName
, strerror(errno
));
352 myText
.extract(1, myText
.length(), buffer
);
353 charCount
--; // skip the BOM
354 buffer
[charCount
] = 0; // NULL terminate for easier reading in the debugger
360 * Remove spaces from the input UChar array.
362 * We check explicitly for a Unicode code value of 0x0020
363 * because Unicode::isSpaceChar returns true for CR, LF, etc.
366 const UChar
*ThaiWordbreakTest::crunchSpaces(const UChar
*spaces
, int32_t count
, int32_t &nonSpaceCount
)
368 int32_t i
, out
, spaceCount
;
371 for (i
= 0; i
< count
; i
+= 1) {
372 if (spaces
[i
] == 0x0020 /*Unicode::isSpaceChar(spaces[i])*/) {
377 nonSpaceCount
= count
- spaceCount
;
378 UChar
*noSpaces
= new UChar
[nonSpaceCount
];
381 fprintf(stderr
, "Couldn't allocate memory for the space stripped text.\n");
385 for (out
= 0, i
= 0; i
< count
; i
+= 1) {
386 if (spaces
[i
] != 0x0020 /*! Unicode::isSpaceChar(spaces[i])*/) {
387 noSpaces
[out
++] = spaces
[i
];
395 * Generate a text file with spaces in it from a file without.
397 int generateFile(const UChar
*chars
, int32_t length
) {
399 UCharCharacterIterator
*noSpaceIter
= new UCharCharacterIterator(chars
, length
);
400 UErrorCode status
= U_ZERO_ERROR
;
402 UnicodeSet
complexContext(UNICODE_STRING_SIMPLE("[:LineBreak=SA:]"), status
);
403 BreakIterator
*breakIter
= BreakIterator::createWordInstance(root
, status
);
404 breakIter
->adoptText(noSpaceIter
);
409 printf("%s", u_strToUTF8(outbuf
, sizeof(outbuf
), &strlength
, &bom
, 1, &status
));
410 int32_t prevbreak
= 0;
411 while (U_SUCCESS(status
)) {
412 int32_t nextbreak
= breakIter
->next();
413 if (nextbreak
== BreakIterator::DONE
) {
416 printf("%s", u_strToUTF8(outbuf
, sizeof(outbuf
), &strlength
, &chars
[prevbreak
],
417 nextbreak
-prevbreak
, &status
));
418 if (nextbreak
> 0 && complexContext
.contains(chars
[nextbreak
-1])
419 && complexContext
.contains(chars
[nextbreak
])) {
422 prevbreak
= nextbreak
;
425 if (U_FAILURE(status
)) {
426 fprintf(stderr
, "generate failed: %s\n", u_errorName(status
));
435 * The main routine. Read the command line arguments, read the text file,
436 * remove the spaces, do the comparison and report the final results
438 int main(int argc
, char **argv
)
440 char *fileName
= "space.txt";
442 UBool verbose
= FALSE
;
443 UBool generate
= FALSE
;
445 if (argc
>= 2 && strcmp(argv
[1], "-generate") == 0) {
450 if (argc
>= 2 && strcmp(argv
[1], "-verbose") == 0) {
455 if (arg
== argc
- 1) {
456 fileName
= argv
[arg
++];
460 fprintf(stderr
, "Usage: %s [-verbose] [<file>]\n", argv
[0]);
464 int32_t spaceCount
, nonSpaceCount
;
465 const UChar
*spaces
, *noSpaces
;
467 spaces
= ThaiWordbreakTest::readFile(fileName
, spaceCount
);
474 return generateFile(spaces
, spaceCount
);
477 noSpaces
= ThaiWordbreakTest::crunchSpaces(spaces
, spaceCount
, nonSpaceCount
);
483 ThaiWordbreakTest
test(spaces
, spaceCount
, noSpaces
, nonSpaceCount
, verbose
);
485 printf("word count: %d\n", test
.getWordCount());
486 printf("breaks not found: %d\n", test
.getBreaksNotFound());
487 printf("invalid breaks found: %d\n", test
.getInvalidBreaks());
493 * The main constructor. Clear all the counts and construct a default
494 * word instance of a BreakIterator.
496 SpaceBreakIterator::SpaceBreakIterator(const UChar
*text
, int32_t count
)
497 : fBreakIter(0), fText(text
), fTextCount(count
), fWordCount(0), fSpaceCount(0), fDone(FALSE
)
499 UCharCharacterIterator
*iter
= new UCharCharacterIterator(text
, count
);
500 UErrorCode status
= U_ZERO_ERROR
;
501 fComplexContext
.applyPattern(UNICODE_STRING_SIMPLE("[:LineBreak=SA:]"), status
);
504 fBreakIter
= BreakIterator::createWordInstance(root
, status
);
505 fBreakIter
->adoptText(iter
);
508 SpaceBreakIterator::SpaceBreakIterator()
514 * The destructor. delete the underlying BreakIterator
516 SpaceBreakIterator::~SpaceBreakIterator()
522 * Return the next break, counting words and spaces.
524 int32_t SpaceBreakIterator::next()
527 return BreakIterator::DONE
;
532 nextBreak
= fBreakIter
->next();
534 if (nextBreak
== BreakIterator::DONE
) {
536 return BreakIterator::DONE
;
539 while(nextBreak
> 0 && fComplexContext
.contains(fText
[nextBreak
-1])
540 && fComplexContext
.contains(fText
[nextBreak
]));
542 int32_t result
= nextBreak
- fSpaceCount
;
544 if (nextBreak
< fTextCount
) {
545 if (fText
[nextBreak
] == 0x0020 /*Unicode::isSpaceChar(fText[nextBreak])*/) {
546 fSpaceCount
+= fBreakIter
->next() - nextBreak
;
556 * Returns the current space count
558 int32_t SpaceBreakIterator::getSpaceCount()
564 * Returns the current word count
566 int32_t SpaceBreakIterator::getWordCount()