[apple/icu.git] / icuSources / test / thaitest / thaitest.cpp

/*
 ******************************************************************************
 * Copyright (C) 1998-2003, International Business Machines Corporation and   *
 * others. All Rights Reserved.                                               *
 ******************************************************************************
 */

#include <errno.h>
#include <stdio.h>
#include <string.h>

#include "unicode/utypes.h"
#include "unicode/uchar.h"
#include "unicode/uchriter.h"
#include "unicode/brkiter.h"
#include "unicode/locid.h"
#include "unicode/unistr.h"

/*
 * This program takes a Unicode text file containing Thai text with
 * spaces inserted where the word breaks are. It computes a copy of
 * the text without spaces and uses a word instance of a Thai BreakIterator
 * to compute the word breaks. The program reports any differences in the
 * breaks.
 *
 * NOTE: by its very nature, Thai word breaking is not exact, so it is
 * expected that this program will always report some differences.
 */

/*
 * This class is a break iterator that counts words and spaces.
 *
 * It wraps a word BreakIterator running over text that contains spaces
 * and reports each break as an offset with the spaces seen so far
 * subtracted out, so the offsets can be compared with breaks computed
 * on a copy of the text that has no spaces.
 */
class SpaceBreakIterator
{
public:
    // The constructor:
    // text  - pointer to an array of UChars to iterate over
    // count - the number of UChars in text
    SpaceBreakIterator(const UChar *text, int32_t count);

    // the destructor: deletes the underlying BreakIterator
    ~SpaceBreakIterator();

    // return next break position (with the space count subtracted),
    // or BreakIterator::DONE when there are no more breaks
    int32_t next();

    // return current word count
    int32_t getWordCount();

    // return current space count
    int32_t getSpaceCount();

private:
    // No arg constructor: private so clients can't call it.
    SpaceBreakIterator();

    // Copying is disabled: the object owns fBreakIter and deletes it in
    // the destructor, so a member-wise copy would delete it twice.
    // These are declared but intentionally never defined.
    SpaceBreakIterator(const SpaceBreakIterator &other);
    SpaceBreakIterator &operator=(const SpaceBreakIterator &other);

    // The underlying BreakIterator (owned)
    BreakIterator *fBreakIter;

    // address of the UChar array (not owned)
    const UChar *fText;

    // number of UChars in fText
    int32_t fTextCount;

    // current word count
    int32_t fWordCount;

    // current space count
    int32_t fSpaceCount;

    // true when fBreakIter has returned DONE
    UBool fDone;
};

/*
 * This is the main class. It compares word breaks and reports the differences.
 *
 * The comparison runs from the main constructor; afterwards the counts
 * can be read back through the getters. readFile and crunchSpaces are
 * static helpers used to prepare the two UChar arrays.
 */
class ThaiWordbreakTest
{
public:
    // The main constructor:
    // spaces       - pointer to a UChar array for the text with spaces
    // spaceCount   - the number of characters in the spaces array
    // noSpaces     - pointer to a UChar array for the text without spaces
    // noSpaceCount - the number of characters in the noSpaces array
    // verbose      - report all breaks if true, otherwise just report differences
    ThaiWordbreakTest(const UChar *spaces, int32_t spaceCount, const UChar *noSpaces, int32_t noSpaceCount, UBool verbose);
    ~ThaiWordbreakTest();

    // returns the number of breaks that are in the spaces array
    // but aren't found in the noSpaces array
    int32_t getBreaksNotFound();

    // returns the number of breaks which are found in the noSpaces
    // array but aren't in the spaces array
    int32_t getInvalidBreaks();

    // returns the number of words found in the spaces array
    int32_t getWordCount();

    // reads the input Unicode text file:
    // fileName  - the path name of the file
    // charCount - set to the number of UChars read from the file
    // returns   - the address of the UChar array containing the characters,
    //             or 0 if the file could not be read
    static const UChar *readFile(char *fileName, int32_t &charCount);

    // removes spaces from the input UChar array:
    // spaces        - pointer to the input UChar array
    // count         - number of UChars in the spaces array
    // nonSpaceCount - the number of UChars in the result array
    // returns       - the address of the UChar array with spaces removed
    static const UChar *crunchSpaces(const UChar *spaces, int32_t count, int32_t &nonSpaceCount);

private:
    // The no arg constructor - private so clients can't call it
    ThaiWordbreakTest();

    // This does the actual comparison:
    // spaces - the address of the UChar array for the text with spaces
    // spaceCount - the number of UChars in the spaces array
    // noSpaces   - the address of the UChar array for the text without spaces
    // noSpaceCount - the number of UChars in the noSpaces array
    // returns      - true if all breaks match, FALSE otherwise
    UBool compareWordBreaks(const UChar *spaces, int32_t spaceCount,
                            const UChar *noSpaces, int32_t noSpaceCount);

    // helper method to report a break in the spaces
    // array that's not found in the noSpaces array;
    // also increments fBreaksNotFound
    void breakNotFound(int32_t br);

    // helper method to report a break that's found in
    // the noSpaces array that's not in the spaces array;
    // also increments fInvalidBreaks
    void foundInvalidBreak(int32_t br);

    // count of breaks in the spaces array that
    // aren't found in the noSpaces array
    int32_t fBreaksNotFound;

    // count of breaks found in the noSpaces array
    // that aren't in the spaces array
    int32_t fInvalidBreaks;

    // number of words found in the spaces array
    int32_t fWordCount;

    // report all breaks if true, otherwise just report differences
    UBool fVerbose;
};

/*
 * The main constructor: zero the counters, remember the verbose flag,
 * then run the comparison right away. Differences are reported by
 * compareWordBreaks as it goes; its return value is not used here.
 */
ThaiWordbreakTest::ThaiWordbreakTest(const UChar *spaces, int32_t spaceCount,
                                     const UChar *noSpaces, int32_t noSpaceCount, UBool verbose)
    : fBreaksNotFound(0),
      fInvalidBreaks(0),
      fWordCount(0),
      fVerbose(verbose)
{
    (void) compareWordBreaks(spaces, spaceCount, noSpaces, noSpaceCount);
}

/*
 * The no arg constructor. It is private and performs no comparison;
 * it just puts every member into a defined (zero / FALSE) state so the
 * getters never read indeterminate values.
 */
ThaiWordbreakTest::ThaiWordbreakTest()
: fBreaksNotFound(0), fInvalidBreaks(0), fWordCount(0), fVerbose(FALSE)
{
    // nothing else to do
}

/*
 * The destructor. The class only holds counters and a flag,
 * so there is nothing to release.
 */
ThaiWordbreakTest::~ThaiWordbreakTest()
{
}

/*
 * Accessor: how many breaks from the text with spaces
 * were not produced for the text without spaces.
 */
inline int32_t ThaiWordbreakTest::getBreaksNotFound()
{
    return this->fBreaksNotFound;
}

/*
 * Accessor: how many breaks were produced for the text without
 * spaces that do not exist in the text with spaces.
 */
inline int32_t ThaiWordbreakTest::getInvalidBreaks()
{
    return this->fInvalidBreaks;
}

/*
 * Accessor: the word count recorded by the comparison
 * for the text with spaces.
 */
inline int32_t ThaiWordbreakTest::getWordCount()
{
    return this->fWordCount;
}

/*
 * This method does the actual break comparison and reports the results.
 * It uses a SpaceBreakIterator to iterate over the text with spaces,
 * and a word instance of a Thai BreakIterator to iterate over the text
 * without spaces.
 *
 * spaces       - the text with spaces
 * spaceCount   - the number of UChars in spaces
 * noSpaces     - the text without spaces
 * noSpaceCount - the number of UChars in noSpaces
 * returns      - TRUE if every break matched and both iterators ended
 *                together; FALSE otherwise, including when the Thai
 *                break iterator could not be created.
 *
 * Side effects: updates fBreaksNotFound / fInvalidBreaks through the
 * reporting helpers and sets fWordCount from the SpaceBreakIterator.
 */
UBool ThaiWordbreakTest::compareWordBreaks(const UChar *spaces, int32_t spaceCount,
                                           const UChar *noSpaces, int32_t noSpaceCount)
{
    UBool result = TRUE;
    Locale thai("th");
    UErrorCode status = U_ZERO_ERROR;

    BreakIterator *breakIter = BreakIterator::createWordInstance(thai, status);

    // Without this check a failed creation would be dereferenced below.
    if (U_FAILURE(status) || breakIter == 0) {
        fprintf(stderr, "Couldn't create the Thai word break iterator: %s\n", u_errorName(status));
        delete breakIter;
        return FALSE;
    }

    // The character iterator is only allocated once there is a break
    // iterator to adopt (and later delete) it, so it can't leak.
    breakIter->adoptText(new UCharCharacterIterator(noSpaces, noSpaceCount));

    SpaceBreakIterator spaceIter(spaces, spaceCount);

    int32_t nextBreak = 0;
    int32_t nextSpaceBreak = 0;

    while (TRUE) {
        nextSpaceBreak = spaceIter.next();
        nextBreak = breakIter->next();

        // Resynchronize: advance whichever iterator is behind, reporting
        // each break that the other one doesn't have, until the two agree
        // or one of them runs out.
        while (nextSpaceBreak != nextBreak &&
               nextSpaceBreak != BreakIterator::DONE && nextBreak != BreakIterator::DONE) {
            if (nextSpaceBreak < nextBreak) {
                breakNotFound(nextSpaceBreak);
                result = FALSE;
                nextSpaceBreak = spaceIter.next();
            } else {
                foundInvalidBreak(nextBreak);
                result = FALSE;
                nextBreak = breakIter->next();
            }
        }

        // The end-of-text check comes after the resync so that an iterator
        // that runs out during the resync is reported immediately, before
        // either iterator is advanced again.
        if (nextSpaceBreak == BreakIterator::DONE || nextBreak == BreakIterator::DONE) {
            if (nextBreak != BreakIterator::DONE) {
                fprintf(stderr, "break iterator didn't end.\n");
                result = FALSE;
            } else if (nextSpaceBreak != BreakIterator::DONE) {
                fprintf(stderr, "premature break iterator end.\n");
                result = FALSE;
            }

            break;
        }

        if (fVerbose) {
            printf("%d   %d\n", nextSpaceBreak, nextBreak);
        }
    }

    fWordCount = spaceIter.getWordCount();

    delete breakIter;

    return result;
}

/*
 * Count and report a break that exists in the text with spaces
 * but was not produced for the text without spaces.
 *
 * br - the offset of the missing break
 *
 * Verbose mode prints to stdout; otherwise the report goes to stderr.
 */
void ThaiWordbreakTest::breakNotFound(int32_t br)
{
    ++fBreaksNotFound;

    if (!fVerbose) {
        fprintf(stderr, "break not found: %d\n", br);
        return;
    }

    printf("%d   ****\n", br);
}

/*
 * Count and report a break that was produced for the text without
 * spaces but does not exist in the text with spaces.
 *
 * br - the offset of the unexpected break
 *
 * Verbose mode prints to stdout; otherwise the report goes to stderr.
 */
void ThaiWordbreakTest::foundInvalidBreak(int32_t br)
{
    ++fInvalidBreaks;

    if (!fVerbose) {
        fprintf(stderr, "found invalid break: %d\n", br);
        return;
    }

    printf("****   %d\n", br);
}

/*
 * Read the text from a file. The bytes are converted to UChars with the
 * "UTF-8" converter. If the converted text starts with a Unicode Byte
 * Order Mark (U+FEFF) it is skipped; text without one is returned whole.
 *
 * fileName  - the path name of the file
 * charCount - set to the number of UChars in the returned array, not
 *             counting the terminating zero; set to 0 on failure
 * returns   - a new[]-allocated, zero-terminated UChar array that the
 *             caller must release with delete[], or 0 on failure
 */
const UChar *ThaiWordbreakTest::readFile(char *fileName, int32_t &charCount)
{
    FILE *f;
    long fileSize = -1;

    UChar *buffer;
    char *bufferChars;

    charCount = 0;

    f = fopen(fileName, "rb");

    if( f == NULL ) {
        fprintf(stderr,"Couldn't open %s reason: %s \n", fileName, strerror(errno));
        return 0;
    }

    // Find the file size; both fseek and ftell can fail (e.g. on a pipe).
    if (fseek(f, 0, SEEK_END) == 0) {
        fileSize = ftell(f);
    }

    if (fileSize < 0 || fseek(f, 0, SEEK_SET) != 0) {
        fprintf(stderr,"Couldn't read %s reason: %s \n", fileName, strerror(errno));
        fclose(f);
        return 0;
    }

    // The byte count is handed to UnicodeString as an int32_t.
    if (fileSize > 0x7FFFFFFFL) {
        fprintf(stderr,"Couldn't read %s reason: file is too large \n", fileName);
        fclose(f);
        return 0;
    }

    bufferChars = new char[fileSize];

    if(bufferChars == 0) {
        fprintf(stderr,"Couldn't get memory for reading %s reason: %s \n", fileName, strerror(errno));
        fclose(f);
        return 0;
    }

    size_t bytesRead = fread(bufferChars, sizeof(char), (size_t) fileSize, f);
    if( ferror(f) ) {
        fprintf(stderr,"Couldn't read %s reason: %s \n", fileName, strerror(errno));
        fclose(f);
        delete[] bufferChars;
        return 0;
    }
    fclose(f);

    // Convert only the bytes that were actually read, so a short read
    // never feeds uninitialized memory to the converter.
    UnicodeString myText(bufferChars, (int32_t) bytesRead, "UTF-8");

    delete[] bufferChars;

    // Skip a leading BOM only when one is really there.
    int32_t textLength = myText.length();
    int32_t start = (textLength > 0 && myText.charAt(0) == 0xFEFF) ? 1 : 0;

    charCount = textLength - start;

    // One extra UChar for the terminator; this also keeps the write below
    // in bounds when the text is empty.
    buffer = new UChar[charCount + 1];
    if(buffer == 0) {
        fprintf(stderr,"Couldn't get memory for reading %s reason: %s \n", fileName, strerror(errno));
        charCount = 0;
        return 0;
    }

    myText.extract(start, charCount, buffer);
    buffer[charCount] = 0;    // NULL terminate for easier reading in the debugger

    return buffer;
}

/*
 * Build a copy of the input UChar array with every U+0020 removed.
 *
 * Only the code value 0x0020 is treated as a space: a general
 * "is space" test would also match CR, LF, etc.
 *
 * spaces        - the input UChar array
 * count         - the number of UChars in spaces
 * nonSpaceCount - set to the number of UChars in the result
 * returns       - a new[]-allocated array holding the non-space UChars,
 *                 or 0 if the allocation failed
 */
const UChar *ThaiWordbreakTest::crunchSpaces(const UChar *spaces, int32_t count, int32_t &nonSpaceCount)
{
    const UChar kSpace = 0x0020;

    // First pass: find out how many spaces will be dropped.
    int32_t dropped = 0;
    int32_t src = 0;

    while (src < count) {
        if (spaces[src] == kSpace) {
            ++dropped;
        }

        ++src;
    }

    nonSpaceCount = count - dropped;

    UChar *stripped = new UChar[nonSpaceCount];

    if (stripped == 0) {
        fprintf(stderr, "Couldn't allocate memory for the space stripped text.\n");
        return 0;
    }

    // Second pass: copy everything that isn't a space.
    int32_t dst = 0;

    for (src = 0; src < count; ++src) {
        const UChar ch = spaces[src];

        if (ch == kSpace) {
            continue;
        }

        stripped[dst] = ch;
        ++dst;
    }

    return stripped;
}

/*
 * The main routine. Read the command line arguments, read the text file,
 * remove the spaces, do the comparison and report the final results
 */
int main(int argc, char **argv)
{
    char *fileName = "space.txt";
    int arg = 1;
    UBool verbose = FALSE;

    if (argc >= 2 && strcmp(argv[1], "-verbose") == 0) {
        verbose = TRUE;
        arg += 1;
    }

    if (arg == argc - 1) {
        fileName = argv[arg++];
    }

    if (arg != argc) {
        fprintf(stderr, "Usage: %s [-verbose] [<file>]\n", argv[0]);
        return 1;
    }

    int32_t spaceCount, nonSpaceCount;
    const UChar *spaces, *noSpaces;

    spaces = ThaiWordbreakTest::readFile(fileName, spaceCount);

    if (spaces == 0) {
        return 1;
    }

    noSpaces = ThaiWordbreakTest::crunchSpaces(spaces, spaceCount, nonSpaceCount);

    if (noSpaces == 0) {
        return 1;
    }

    ThaiWordbreakTest test(spaces, spaceCount, noSpaces, nonSpaceCount, verbose);

    printf("word count: %d\n", test.getWordCount());
    printf("breaks not found: %d\n", test.getBreaksNotFound());
    printf("invalid breaks found: %d\n", test.getInvalidBreaks());

    return 0;
}

/*
 * The main constructor. Clear all the counts and construct a
 * word instance of a BreakIterator over the given text.
 *
 * text  - pointer to an array of UChars to iterate over
 * count - the number of UChars in text
 *
 * If the BreakIterator can't be created, an error is printed and the
 * object is marked as done, so next() returns BreakIterator::DONE
 * instead of dereferencing a null pointer.
 */
SpaceBreakIterator::SpaceBreakIterator(const UChar *text, int32_t count)
  : fBreakIter(0), fText(text), fTextCount(count), fWordCount(0), fSpaceCount(0), fDone(FALSE)
{
    UErrorCode status = U_ZERO_ERROR;
    // NOTE(review): "us" looks like it was meant to name a US English
    // locale - confirm; left unchanged here.
    Locale us("us");

    fBreakIter = BreakIterator::createWordInstance(us, status);

    if (U_FAILURE(status) || fBreakIter == 0) {
        fprintf(stderr, "Couldn't create the word break iterator: %s\n", u_errorName(status));
        delete fBreakIter;
        fBreakIter = 0;
        fDone = TRUE;
        return;
    }

    // Allocated only after the break iterator exists to adopt it,
    // so it can't leak on the failure path above.
    fBreakIter->adoptText(new UCharCharacterIterator(text, count));
}

/*
 * The no arg constructor. It is private; it leaves the object empty but
 * well defined: no break iterator (so the destructor deletes a null
 * pointer), zero counts, and already done so next() returns DONE.
 */
SpaceBreakIterator::SpaceBreakIterator()
  : fBreakIter(0), fText(0), fTextCount(0), fWordCount(0), fSpaceCount(0), fDone(TRUE)
{
    // nothing else to do
}

/*
 * The destructor: release the underlying BreakIterator.
 */
SpaceBreakIterator::~SpaceBreakIterator()
{
    // deleting a null pointer is a no-op, so no guard is needed
    delete this->fBreakIter;
}

/*
 * Advance to the next break, counting words and spaces.
 *
 * Returns the break offset with the spaces counted before this call
 * subtracted, or BreakIterator::DONE once the underlying iterator is
 * exhausted (and on every call after that).
 */
int32_t SpaceBreakIterator::next()
{
    if (!fDone) {
        const int32_t rawBreak = fBreakIter->next();

        if (rawBreak != BreakIterator::DONE) {
            // Computed before fSpaceCount is updated below.
            const int32_t adjusted = rawBreak - fSpaceCount;

            // If the break is followed by a space (U+0020 only), step the
            // underlying iterator once more and add the distance it moved
            // to the space count.
            if (rawBreak < fTextCount && fText[rawBreak] == 0x0020) {
                fSpaceCount += fBreakIter->next() - rawBreak;
            }

            ++fWordCount;

            return adjusted;
        }

        fDone = TRUE;
    }

    return BreakIterator::DONE;
}

/*
 * Accessor: the number of spaces counted so far.
 */
int32_t SpaceBreakIterator::getSpaceCount()
{
    return this->fSpaceCount;
}

/*
 * Accessor: the number of words counted so far.
 */
int32_t SpaceBreakIterator::getWordCount()
{
    return this->fWordCount;
}
Commit	Line	Data
b75a7d8f A	1	/*
b75a7d8f A	2	******************************************************************************
374ca955	3	* Copyright (C) 1998-2003, International Business Machines Corporation and *
b75a7d8f A	4	* others. All Rights Reserved. *
	5	******************************************************************************
	6	*/
	7
	8	#include <errno.h>
	9	#include <stdio.h>
	10	#include <string.h>
	11
	12	#include "unicode/utypes.h"
	13	#include "unicode/uchar.h"
	14	#include "unicode/uchriter.h"
	15	#include "unicode/brkiter.h"
	16	#include "unicode/locid.h"
	17	#include "unicode/unistr.h"
	18
	19	/*
	20	* This program takes a Unicode text file containing Thai text with
	21	* spaces inserted where the word breaks are. It computes a copy of
	22	* the text without spaces and uses a word instance of a Thai BreakIterator
	23	* to compute the word breaks. The program reports any differences in the
	24	* breaks.
	25	*
	26	* NOTE: by it's very nature, Thai word breaking is not exact, so it is
	27	* exptected that this program will always report some differences.
	28	*/
	29
	30	/*
	31	* This class is a break iterator that counts words and spaces.
	32	*/
	33	class SpaceBreakIterator
	34	{
	35	public:
	36	// The constructor:
	37	// text - pointer to an array of UChars to iterate over
	38	// count - the number of UChars in text
	39	SpaceBreakIterator(const UChar *text, int32_t count);
	40
	41	// the destructor
	42	~SpaceBreakIterator();
	43
	44	// return next break position
	45	int32_t next();
	46
	47	// return current word count
	48	int32_t getWordCount();
	49
	50	// return current space count
	51	int32_t getSpaceCount();
	52
	53	private:
	54	// No arg constructor: private so clients can't call it.
	55	SpaceBreakIterator();
	56
	57	// The underlying BreakIterator
	58	BreakIterator *fBreakIter;
	59
	60	// address of the UChar array
	61	const UChar *fText;
	62
	63	// number of UChars in fText
	64	int32_t fTextCount;
	65
	66	// current word count
	67	int32_t fWordCount;
68
69	// current space count
70	int32_t fSpaceCount;
71
72	// true when fBreakIter has returned DONE
73	UBool fDone;
74	};
75
76	/*
77	* This is the main class. It compares word breaks and reports the differences.
78	*/
79	class ThaiWordbreakTest
80	{
81	public:
82	// The main constructor:
83	// spaces - pointer to a UChar array for the text with spaces
84	// spaceCount - the number of characters in the spaces array
85	// noSpaces - pointer to a UChar array for the text without spaces
86	// noSpaceCount - the number of characters in the noSpaces array
87	// verbose - report all breaks if true, otherwise just report differences
88	ThaiWordbreakTest(const UChar spaces, int32_t spaceCount, const UChar noSpaces, int32_t noSpaceCount, UBool verbose);
89	~ThaiWordbreakTest();
90
91	// returns the number of breaks that are in the spaces array
92	// but aren't found in the noSpaces array
93	int32_t getBreaksNotFound();
94
95	// returns the number of breaks which are found in the noSpaces
96	// array but aren't in the spaces array
97	int32_t getInvalidBreaks();
98
99	// returns the number of words found in the spaces array
100	int32_t getWordCount();
101
102	// reads the input Unicode text file:
103	// fileName - the path name of the file
104	// charCount - set to the number of UChars read from the file
105	// returns - the address of the UChar array containing the characters
106	static const UChar readFile(char fileName, int32_t &charCount);
107
108	// removes spaces form the input UChar array:
109	// spaces - pointer to the input UChar array
110	// count - number of UChars in the spaces array
111	// nonSpaceCount - the number of UChars in the result array
112	// returns - the address of the UChar array with spaces removed
113	static const UChar crunchSpaces(const UChar spaces, int32_t count, int32_t &nonSpaceCount);
114
115	private:
116	// The no arg constructor - private so clients can't call it
117	ThaiWordbreakTest();
118
119	// This does the actual comparison:
120	// spaces - the address of the UChar array for the text with spaces
121	// spaceCount - the number of UChars in the spaces array
122	// noSpaces - the address of the UChar array for the text without spaces
123	// noSpaceCount - the number of UChars in the noSpaces array
124	// returns - true if all breaks match, FALSE otherwise
125	UBool compareWordBreaks(const UChar *spaces, int32_t spaceCount,
126	const UChar *noSpaces, int32_t noSpaceCount);
127
128	// helper method to report a break in the spaces
129	// array that's not found in the noSpaces array
130	void breakNotFound(int32_t br);
131
132	// helper method to report a break that's found in
133	// the noSpaces array that's not in the spaces array
134	void foundInvalidBreak(int32_t br);
135
136	// count of breaks in the spaces array that
137	// aren't found in the noSpaces array
138	int32_t fBreaksNotFound;
139
140	// count of breaks found in the noSpaces array
141	// that aren't in the spaces array
142	int32_t fInvalidBreaks;
143
144	// number of words found in the spaces array
145	int32_t fWordCount;
146
147	// report all breaks if true, otherwise just report differences
148	UBool fVerbose;
149	};
150
151	/*
152	* The main constructor: it calls compareWordBreaks and reports any differences
153	*/
154	ThaiWordbreakTest::ThaiWordbreakTest(const UChar *spaces, int32_t spaceCount,
155	const UChar *noSpaces, int32_t noSpaceCount, UBool verbose)
156	: fBreaksNotFound(0), fInvalidBreaks(0), fWordCount(0), fVerbose(verbose)
157	{
158	compareWordBreaks(spaces, spaceCount, noSpaces, noSpaceCount);
159	}
160
161	/*
162	* The no arg constructor
163	*/
164	ThaiWordbreakTest::ThaiWordbreakTest()
165	{
166	// nothing
167	}
168
169	/*
170	* The destructor
171	*/
172	ThaiWordbreakTest::~ThaiWordbreakTest()
173	{
174	// nothing?
175	}
176
177	/*
178	* returns the number of breaks in the spaces array
179	* that aren't found in the noSpaces array
180	*/
181	inline int32_t ThaiWordbreakTest::getBreaksNotFound()
182	{
183	return fBreaksNotFound;
184	}
185
186	/*
187	* Returns the number of breaks found in the noSpaces
188	* array that aren't in the spaces array
189	*/
190	inline int32_t ThaiWordbreakTest::getInvalidBreaks()
191	{
192	return fInvalidBreaks;
193	}
194
195	/*
196	* Returns the number of words found in the spaces array
197	*/
198	inline int32_t ThaiWordbreakTest::getWordCount()
199	{
200	return fWordCount;
201	}
202
203	/*
204	* This method does the acutal break comparison and reports the results.
205	* It uses a SpaceBreakIterator to iterate over the text with spaces,
206	* and a word instance of a Thai BreakIterator to iterate over the text
207	* without spaces.
208	*/
209	UBool ThaiWordbreakTest::compareWordBreaks(const UChar *spaces, int32_t spaceCount,
210	const UChar *noSpaces, int32_t noSpaceCount)
211	{
212	UBool result = TRUE;
213	Locale thai("th");
214	UCharCharacterIterator *noSpaceIter = new UCharCharacterIterator(noSpaces, noSpaceCount);
215	UErrorCode status = U_ZERO_ERROR;
216
217	BreakIterator *breakIter = BreakIterator::createWordInstance(thai, status);
218	breakIter->adoptText(noSpaceIter);
219
220	SpaceBreakIterator spaceIter(spaces, spaceCount);
221
222	int32_t nextBreak = 0;
223	int32_t nextSpaceBreak = 0;
224	int32_t iterCount = 0;
225
226	while (TRUE) {
227	nextSpaceBreak = spaceIter.next();
228	nextBreak = breakIter->next();
229
230	if (nextSpaceBreak == BreakIterator::DONE \|\| nextBreak == BreakIterator::DONE) {
231	if (nextBreak != BreakIterator::DONE) {
232	fprintf(stderr, "break iterator didn't end.\n");
233	} else if (nextSpaceBreak != BreakIterator::DONE) {
234	fprintf(stderr, "premature break iterator end.\n");
235	}
236
237	break;
238	}
239
240	while (nextSpaceBreak != nextBreak &&
241	nextSpaceBreak != BreakIterator::DONE && nextBreak != BreakIterator::DONE) {
242	if (nextSpaceBreak < nextBreak) {
243	breakNotFound(nextSpaceBreak);
244	result = FALSE;
245	nextSpaceBreak = spaceIter.next();
246	} else if (nextSpaceBreak > nextBreak) {
247	foundInvalidBreak(nextBreak);
248	result = FALSE;
249	nextBreak = breakIter->next();
250	}
251	}
252
253	if (fVerbose) {
254	printf("%d %d\n", nextSpaceBreak, nextBreak);
255	}
256	}
257
258
259	fWordCount = spaceIter.getWordCount();
260
261	delete breakIter;
262
263	return result;
264	}
265
266	/*
267	* Report a break that's in the text with spaces but
268	* not found in the text without spaces.
269	*/
270	void ThaiWordbreakTest::breakNotFound(int32_t br)
271	{
272	if (fVerbose) {
273	printf("%d ****\n", br);
274	} else {
275	fprintf(stderr, "break not found: %d\n", br);
276	}
277
278	fBreaksNotFound += 1;
279	}
280
281	/*
282	* Report a break that's found in the text without spaces
283	* that isn't in the text with spaces.
284	*/
285	void ThaiWordbreakTest::foundInvalidBreak(int32_t br)
286	{
287	if (fVerbose) {
288	printf("**** %d\n", br);
289	} else {
290	fprintf(stderr, "found invalid break: %d\n", br);
291	}
292
293	fInvalidBreaks += 1;
294	}
295
296	/*
297	* Read the text from a file. The text must start with a Unicode Byte
298	* Order Mark (BOM) so that we know what order to read the bytes in.
299	*/
300	const UChar ThaiWordbreakTest::readFile(char fileName, int32_t &charCount)
301	{
302	FILE *f;
303	int32_t fileSize;
304
305	UChar *buffer;
306	char *bufferChars;
307
308	f = fopen(fileName, "rb");
309
310	if( f == NULL ) {
311	fprintf(stderr,"Couldn't open %s reason: %s \n", fileName, strerror(errno));
312	return 0;
313	}
314
315	fseek(f, 0, SEEK_END);
316	fileSize = ftell(f);
317
318	fseek(f, 0, SEEK_SET);
319	bufferChars = new char[fileSize];
320
321	if(bufferChars == 0) {
322	fprintf(stderr,"Couldn't get memory for reading %s reason: %s \n", fileName, strerror(errno));
323	fclose(f);
324	return 0;
325	}
326
327	fread(bufferChars, sizeof(char), fileSize, f);
328	if( ferror(f) ) {
329	fprintf(stderr,"Couldn't read %s reason: %s \n", fileName, strerror(errno));
330	fclose(f);
331	delete[] bufferChars;
332	return 0;
333	}
334	fclose(f);
335
336	UnicodeString myText(bufferChars, fileSize, "UTF-8");
337
338	delete[] bufferChars;
339
340	charCount = myText.length();
341	buffer = new UChar[charCount];
342	if(buffer == 0) {
343	fprintf(stderr,"Couldn't get memory for reading %s reason: %s \n", fileName, strerror(errno));
344	return 0;
345	}
346
347	myText.extract(1, myText.length(), buffer);
348	charCount--; // skip the BOM
349	buffer[charCount] = 0; // NULL terminate for easier reading in the debugger
350
351	return buffer;
352	}
353
354	/*
355	* Remove spaces from the input UChar array.
356	*
357	* We check explicitly for a Unicode code value of 0x0020
358	* because Unicode::isSpaceChar returns true for CR, LF, etc.
359	*
360	*/
361	const UChar ThaiWordbreakTest::crunchSpaces(const UChar spaces, int32_t count, int32_t &nonSpaceCount)
362	{
363	int32_t i, out, spaceCount;
364
365	spaceCount = 0;
366	for (i = 0; i < count; i += 1) {
367	if (spaces[i] == 0x0020 /Unicode::isSpaceChar(spaces[i])/) {
368	spaceCount += 1;
369	}
370	}
371
372	nonSpaceCount = count - spaceCount;
373	UChar *noSpaces = new UChar[nonSpaceCount];
374
375	if (noSpaces == 0) {
376	fprintf(stderr, "Couldn't allocate memory for the space stripped text.\n");
377	return 0;
378	}
379
380	for (out = 0, i = 0; i < count; i += 1) {
381	if (spaces[i] != 0x0020 /! Unicode::isSpaceChar(spaces[i])/) {
382	noSpaces[out++] = spaces[i];
383	}
384	}
385
386	return noSpaces;
387	}
388
389	/*
390	* The main routine. Read the command line arguments, read the text file,
391	* remove the spaces, do the comparison and report the final results
392	*/
393	int main(int argc, char **argv)
394	{
395	char *fileName = "space.txt";
396	int arg = 1;
397	UBool verbose = FALSE;
398
399	if (argc >= 2 && strcmp(argv[1], "-verbose") == 0) {
400	verbose = TRUE;
401	arg += 1;
402	}
403
404	if (arg == argc - 1) {
405	fileName = argv[arg++];
406	}
407
408	if (arg != argc) {
409	fprintf(stderr, "Usage: %s [-verbose] [<file>]\n", argv[0]);
410	return 1;
411	}
412
413	int32_t spaceCount, nonSpaceCount;
414	const UChar spaces, noSpaces;
415
416	spaces = ThaiWordbreakTest::readFile(fileName, spaceCount);
417
418	if (spaces == 0) {
419	return 1;
420	}
421
422	noSpaces = ThaiWordbreakTest::crunchSpaces(spaces, spaceCount, nonSpaceCount);
423
424	if (noSpaces == 0) {
425	return 1;
426	}
427
428	ThaiWordbreakTest test(spaces, spaceCount, noSpaces, nonSpaceCount, verbose);
429
430	printf("word count: %d\n", test.getWordCount());
431	printf("breaks not found: %d\n", test.getBreaksNotFound());
432	printf("invalid breaks found: %d\n", test.getInvalidBreaks());
433
434	return 0;
435	}
436
437	/*
438	* The main constructor. Clear all the counts and construct a default
439	* word instance of a BreakIterator.
440	*/
441	SpaceBreakIterator::SpaceBreakIterator(const UChar *text, int32_t count)
442	: fBreakIter(0), fText(text), fTextCount(count), fWordCount(0), fSpaceCount(0), fDone(FALSE)
443	{
444	UCharCharacterIterator *iter = new UCharCharacterIterator(text, count);
445	UErrorCode status = U_ZERO_ERROR;
446	Locale us("us");
447
448	fBreakIter = BreakIterator::createWordInstance(us, status);
449	fBreakIter->adoptText(iter);
450	}
451
452	SpaceBreakIterator::SpaceBreakIterator()
453	{
454	// nothing
455	}
456
457	/*
458	* The destructor. delete the underlying BreakIterator
459	*/
460	SpaceBreakIterator::~SpaceBreakIterator()
461	{
462	delete fBreakIter;
463	}
464
465	/*
466	* Return the next break, counting words and spaces.
467	*/
468	int32_t SpaceBreakIterator::next()
469	{
470	if (fDone) {
471	return BreakIterator::DONE;
472	}
473
474	int32_t nextBreak = fBreakIter->next();
475
476	if (nextBreak == BreakIterator::DONE) {
477	fDone = TRUE;
478	return BreakIterator::DONE;
479	}
480
481	int32_t result = nextBreak - fSpaceCount;
482
483	if (nextBreak < fTextCount) {
484	if (fText[nextBreak] == 0x0020 /Unicode::isSpaceChar(fText[nextBreak])/) {
485	fSpaceCount += fBreakIter->next() - nextBreak;
486	}
487	}
488
489	fWordCount += 1;
490
491	return result;
492	}
493
494	/*
495	* Returns the current space count
496	*/
497	int32_t SpaceBreakIterator::getSpaceCount()
498	{
499	return fSpaceCount;
500	}
501
502	/*
503	* Returns the current word count
504	*/
505	int32_t SpaceBreakIterator::getWordCount()
506	{
507	return fWordCount;
508	}
509
510