1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 ******************************************************************************
5 * Copyright (C) 1998-2003, 2006, International Business Machines Corporation *
6 * and others. All Rights Reserved. *
7 ******************************************************************************
14 #include "unicode/utypes.h"
15 #include "unicode/uchar.h"
16 #include "unicode/uchriter.h"
17 #include "unicode/brkiter.h"
18 #include "unicode/locid.h"
19 #include "unicode/unistr.h"
20 #include "unicode/uniset.h"
21 #include "unicode/ustring.h"
24 * This program takes a Unicode text file containing Thai text with
25 * spaces inserted where the word breaks are. It computes a copy of
26 * the text without spaces and uses a word instance of a Thai BreakIterator
27 * to compute the word breaks. The program reports any differences in the
30 * NOTE: by its very nature, Thai word breaking is not exact, so it is
31 * expected that this program will always report some differences.
35 * This class is a break iterator that counts words and spaces.
37 class SpaceBreakIterator
41 // text - pointer to an array of UChars to iterate over
42 // count - the number of UChars in text
43 SpaceBreakIterator(const UChar
*text
, int32_t count
);
46 ~SpaceBreakIterator();
48 // return next break position
51 // return current word count
52 int32_t getWordCount();
54 // return current space count
55 int32_t getSpaceCount();
58 // No arg constructor: private so clients can't call it.
61 // The underlying BreakIterator
62 BreakIterator
*fBreakIter
;
64 // address of the UChar array
67 // number of UChars in fText
73 // current space count
76 // UnicodeSet of SA characters
77 UnicodeSet fComplexContext
;
79 // true when fBreakIter has returned DONE
84 * This is the main class. It compares word breaks and reports the differences.
86 class ThaiWordbreakTest
89 // The main constructor:
90 // spaces - pointer to a UChar array for the text with spaces
91 // spaceCount - the number of characters in the spaces array
92 // noSpaces - pointer to a UChar array for the text without spaces
93 // noSpaceCount - the number of characters in the noSpaces array
94 // verbose - report all breaks if true, otherwise just report differences
95 ThaiWordbreakTest(const UChar
*spaces
, int32_t spaceCount
, const UChar
*noSpaces
, int32_t noSpaceCount
, UBool verbose
);
98 // returns the number of breaks that are in the spaces array
99 // but aren't found in the noSpaces array
100 int32_t getBreaksNotFound();
102 // returns the number of breaks which are found in the noSpaces
103 // array but aren't in the spaces array
104 int32_t getInvalidBreaks();
106 // returns the number of words found in the spaces array
107 int32_t getWordCount();
109 // reads the input Unicode text file:
110 // fileName - the path name of the file
111 // charCount - set to the number of UChars read from the file
112 // returns - the address of the UChar array containing the characters
113 static const UChar
*readFile(char *fileName
, int32_t &charCount
);
115 // removes spaces from the input UChar array:
116 // spaces - pointer to the input UChar array
117 // count - number of UChars in the spaces array
118 // nonSpaceCount - the number of UChars in the result array
119 // returns - the address of the UChar array with spaces removed
120 static const UChar
*crunchSpaces(const UChar
*spaces
, int32_t count
, int32_t &nonSpaceCount
);
123 // The no arg constructor - private so clients can't call it
126 // This does the actual comparison:
127 // spaces - the address of the UChar array for the text with spaces
128 // spaceCount - the number of UChars in the spaces array
129 // noSpaces - the address of the UChar array for the text without spaces
130 // noSpaceCount - the number of UChars in the noSpaces array
131 // returns - true if all breaks match, FALSE otherwise
132 UBool
compareWordBreaks(const UChar
*spaces
, int32_t spaceCount
,
133 const UChar
*noSpaces
, int32_t noSpaceCount
);
135 // helper method to report a break in the spaces
136 // array that's not found in the noSpaces array
137 void breakNotFound(int32_t br
);
139 // helper method to report a break that's found in
140 // the noSpaces array that's not in the spaces array
141 void foundInvalidBreak(int32_t br
);
143 // count of breaks in the spaces array that
144 // aren't found in the noSpaces array
145 int32_t fBreaksNotFound
;
147 // count of breaks found in the noSpaces array
148 // that aren't in the spaces array
149 int32_t fInvalidBreaks
;
151 // number of words found in the spaces array
154 // report all breaks if true, otherwise just report differences
159 * The main constructor: it calls compareWordBreaks and reports any differences
161 ThaiWordbreakTest::ThaiWordbreakTest(const UChar
*spaces
, int32_t spaceCount
,
162 const UChar
*noSpaces
, int32_t noSpaceCount
, UBool verbose
)
163 : fBreaksNotFound(0), fInvalidBreaks(0), fWordCount(0), fVerbose(verbose
)
165 compareWordBreaks(spaces
, spaceCount
, noSpaces
, noSpaceCount
);
169 * The no arg constructor
171 ThaiWordbreakTest::ThaiWordbreakTest()
179 ThaiWordbreakTest::~ThaiWordbreakTest()
185 * returns the number of breaks in the spaces array
186 * that aren't found in the noSpaces array
188 inline int32_t ThaiWordbreakTest::getBreaksNotFound()
190 return fBreaksNotFound
;
194 * Returns the number of breaks found in the noSpaces
195 * array that aren't in the spaces array
197 inline int32_t ThaiWordbreakTest::getInvalidBreaks()
199 return fInvalidBreaks
;
203 * Returns the number of words found in the spaces array
205 inline int32_t ThaiWordbreakTest::getWordCount()
211 * This method does the actual break comparison and reports the results.
212 * It uses a SpaceBreakIterator to iterate over the text with spaces,
213 * and a word instance of a Thai BreakIterator to iterate over the text
216 UBool
ThaiWordbreakTest::compareWordBreaks(const UChar
*spaces
, int32_t spaceCount
,
217 const UChar
*noSpaces
, int32_t noSpaceCount
)
221 UCharCharacterIterator
*noSpaceIter
= new UCharCharacterIterator(noSpaces
, noSpaceCount
);
222 UErrorCode status
= U_ZERO_ERROR
;
224 BreakIterator
*breakIter
= BreakIterator::createWordInstance(thai
, status
);
225 breakIter
->adoptText(noSpaceIter
);
227 SpaceBreakIterator
spaceIter(spaces
, spaceCount
);
229 int32_t nextBreak
= 0;
230 int32_t nextSpaceBreak
= 0;
231 int32_t iterCount
= 0;
234 nextSpaceBreak
= spaceIter
.next();
235 nextBreak
= breakIter
->next();
237 if (nextSpaceBreak
== BreakIterator::DONE
|| nextBreak
== BreakIterator::DONE
) {
238 if (nextBreak
!= BreakIterator::DONE
) {
239 fprintf(stderr
, "break iterator didn't end.\n");
240 } else if (nextSpaceBreak
!= BreakIterator::DONE
) {
241 fprintf(stderr
, "premature break iterator end.\n");
247 while (nextSpaceBreak
!= nextBreak
&&
248 nextSpaceBreak
!= BreakIterator::DONE
&& nextBreak
!= BreakIterator::DONE
) {
249 if (nextSpaceBreak
< nextBreak
) {
250 breakNotFound(nextSpaceBreak
);
252 nextSpaceBreak
= spaceIter
.next();
253 } else if (nextSpaceBreak
> nextBreak
) {
254 foundInvalidBreak(nextBreak
);
256 nextBreak
= breakIter
->next();
261 printf("%d %d\n", nextSpaceBreak
, nextBreak
);
266 fWordCount
= spaceIter
.getWordCount();
274 * Report a break that's in the text with spaces but
275 * not found in the text without spaces.
277 void ThaiWordbreakTest::breakNotFound(int32_t br
)
280 printf("%d ****\n", br
);
282 fprintf(stderr
, "break not found: %d\n", br
);
285 fBreaksNotFound
+= 1;
289 * Report a break that's found in the text without spaces
290 * that isn't in the text with spaces.
292 void ThaiWordbreakTest::foundInvalidBreak(int32_t br
)
295 printf("**** %d\n", br
);
297 fprintf(stderr
, "found invalid break: %d\n", br
);
304 * Read the text from a file. The file contents are decoded as UTF-8 and
305 * must start with a Unicode Byte Order Mark (BOM), which is skipped.
307 const UChar
*ThaiWordbreakTest::readFile(char *fileName
, int32_t &charCount
)
315 f
= fopen(fileName
, "rb");
318 fprintf(stderr
,"Couldn't open %s reason: %s \n", fileName
, strerror(errno
));
322 fseek(f
, 0, SEEK_END
);
325 fseek(f
, 0, SEEK_SET
);
326 bufferChars
= new char[fileSize
];
328 if(bufferChars
== 0) {
329 fprintf(stderr
,"Couldn't get memory for reading %s reason: %s \n", fileName
, strerror(errno
));
334 fread(bufferChars
, sizeof(char), fileSize
, f
);
336 fprintf(stderr
,"Couldn't read %s reason: %s \n", fileName
, strerror(errno
));
338 delete[] bufferChars
;
343 UnicodeString
myText(bufferChars
, fileSize
, "UTF-8");
345 delete[] bufferChars
;
347 charCount
= myText
.length();
348 buffer
= new UChar
[charCount
];
350 fprintf(stderr
,"Couldn't get memory for reading %s reason: %s \n", fileName
, strerror(errno
));
354 myText
.extract(1, myText
.length(), buffer
);
355 charCount
--; // skip the BOM
356 buffer
[charCount
] = 0; // NULL terminate for easier reading in the debugger
362 * Remove spaces from the input UChar array.
364 * We check explicitly for a Unicode code value of 0x0020
365 * because Unicode::isSpaceChar returns true for CR, LF, etc.
368 const UChar
*ThaiWordbreakTest::crunchSpaces(const UChar
*spaces
, int32_t count
, int32_t &nonSpaceCount
)
370 int32_t i
, out
, spaceCount
;
373 for (i
= 0; i
< count
; i
+= 1) {
374 if (spaces
[i
] == 0x0020 /*Unicode::isSpaceChar(spaces[i])*/) {
379 nonSpaceCount
= count
- spaceCount
;
380 UChar
*noSpaces
= new UChar
[nonSpaceCount
];
383 fprintf(stderr
, "Couldn't allocate memory for the space stripped text.\n");
387 for (out
= 0, i
= 0; i
< count
; i
+= 1) {
388 if (spaces
[i
] != 0x0020 /*! Unicode::isSpaceChar(spaces[i])*/) {
389 noSpaces
[out
++] = spaces
[i
];
397 * Generate a text file with spaces in it from a file without.
399 int generateFile(const UChar
*chars
, int32_t length
) {
401 UCharCharacterIterator
*noSpaceIter
= new UCharCharacterIterator(chars
, length
);
402 UErrorCode status
= U_ZERO_ERROR
;
404 UnicodeSet
complexContext(UNICODE_STRING_SIMPLE("[:LineBreak=SA:]"), status
);
405 BreakIterator
*breakIter
= BreakIterator::createWordInstance(root
, status
);
406 breakIter
->adoptText(noSpaceIter
);
411 printf("%s", u_strToUTF8(outbuf
, sizeof(outbuf
), &strlength
, &bom
, 1, &status
));
412 int32_t prevbreak
= 0;
413 while (U_SUCCESS(status
)) {
414 int32_t nextbreak
= breakIter
->next();
415 if (nextbreak
== BreakIterator::DONE
) {
418 printf("%s", u_strToUTF8(outbuf
, sizeof(outbuf
), &strlength
, &chars
[prevbreak
],
419 nextbreak
-prevbreak
, &status
));
420 if (nextbreak
> 0 && complexContext
.contains(chars
[nextbreak
-1])
421 && complexContext
.contains(chars
[nextbreak
])) {
424 prevbreak
= nextbreak
;
427 if (U_FAILURE(status
)) {
428 fprintf(stderr
, "generate failed: %s\n", u_errorName(status
));
437 * The main routine. Read the command line arguments, read the text file,
438 * remove the spaces, do the comparison and report the final results
440 int main(int argc
, char **argv
)
442 char *fileName
= "space.txt";
444 UBool verbose
= FALSE
;
445 UBool generate
= FALSE
;
447 if (argc
>= 2 && strcmp(argv
[1], "-generate") == 0) {
452 if (argc
>= 2 && strcmp(argv
[1], "-verbose") == 0) {
457 if (arg
== argc
- 1) {
458 fileName
= argv
[arg
++];
462 fprintf(stderr
, "Usage: %s [-verbose] [<file>]\n", argv
[0]);
466 int32_t spaceCount
, nonSpaceCount
;
467 const UChar
*spaces
, *noSpaces
;
469 spaces
= ThaiWordbreakTest::readFile(fileName
, spaceCount
);
476 return generateFile(spaces
, spaceCount
);
479 noSpaces
= ThaiWordbreakTest::crunchSpaces(spaces
, spaceCount
, nonSpaceCount
);
485 ThaiWordbreakTest
test(spaces
, spaceCount
, noSpaces
, nonSpaceCount
, verbose
);
487 printf("word count: %d\n", test
.getWordCount());
488 printf("breaks not found: %d\n", test
.getBreaksNotFound());
489 printf("invalid breaks found: %d\n", test
.getInvalidBreaks());
495 * The main constructor. Clear all the counts and construct a default
496 * word instance of a BreakIterator.
498 SpaceBreakIterator::SpaceBreakIterator(const UChar
*text
, int32_t count
)
499 : fBreakIter(0), fText(text
), fTextCount(count
), fWordCount(0), fSpaceCount(0), fDone(FALSE
)
501 UCharCharacterIterator
*iter
= new UCharCharacterIterator(text
, count
);
502 UErrorCode status
= U_ZERO_ERROR
;
503 fComplexContext
.applyPattern(UNICODE_STRING_SIMPLE("[:LineBreak=SA:]"), status
);
506 fBreakIter
= BreakIterator::createWordInstance(root
, status
);
507 fBreakIter
->adoptText(iter
);
510 SpaceBreakIterator::SpaceBreakIterator()
516 * The destructor. delete the underlying BreakIterator
518 SpaceBreakIterator::~SpaceBreakIterator()
524 * Return the next break, counting words and spaces.
526 int32_t SpaceBreakIterator::next()
529 return BreakIterator::DONE
;
534 nextBreak
= fBreakIter
->next();
536 if (nextBreak
== BreakIterator::DONE
) {
538 return BreakIterator::DONE
;
541 while(nextBreak
> 0 && fComplexContext
.contains(fText
[nextBreak
-1])
542 && fComplexContext
.contains(fText
[nextBreak
]));
544 int32_t result
= nextBreak
- fSpaceCount
;
546 if (nextBreak
< fTextCount
) {
547 if (fText
[nextBreak
] == 0x0020 /*Unicode::isSpaceChar(fText[nextBreak])*/) {
548 fSpaceCount
+= fBreakIter
->next() - nextBreak
;
558 * Returns the current space count
560 int32_t SpaceBreakIterator::getSpaceCount()
566 * Returns the current word count
568 int32_t SpaceBreakIterator::getWordCount()