[apple/icu.git] / icuSources / test / thaitest / thaitest.cpp

/*
 ******************************************************************************
 * Copyright (C) 1998-2003, 2006, International Business Machines Corporation *
 * and others. All Rights Reserved.                                           *
 ******************************************************************************
 */

#include <errno.h>
#include <stdio.h>
#include <string.h>

#include "unicode/utypes.h"
#include "unicode/uchar.h"
#include "unicode/uchriter.h"
#include "unicode/brkiter.h"
#include "unicode/locid.h"
#include "unicode/unistr.h"
#include "unicode/uniset.h"
#include "unicode/ustring.h"

/*
 * This program takes a Unicode text file containing Thai text with
 * spaces inserted where the word breaks are. It computes a copy of
 * the text without spaces and uses a word instance of a Thai BreakIterator
 * to compute the word breaks. The program reports any differences in the
 * breaks.
 *
 * NOTE: by its very nature, Thai word breaking is not exact, so it is
 * expected that this program will always report some differences.
 */

/*
 * This class is a break iterator over text that contains explicit spaces.
 * It wraps a word BreakIterator, counts the words and spaces it passes,
 * and reports each break position with the number of spaces seen so far
 * subtracted, i.e. as an offset into the same text with the spaces removed.
 */
class SpaceBreakIterator
{
public:
    // The constructor:
    // text  - pointer to an array of UChars to iterate over; the array is
    //         not copied, so it must outlive this object
    // count - the number of UChars in text
    SpaceBreakIterator(const UChar *text, int32_t count);

    // the destructor: deletes the underlying BreakIterator
    ~SpaceBreakIterator();

    // return next break position, or BreakIterator::DONE at the end
    int32_t next();

    // return current word count
    int32_t getWordCount();

    // return current space count
    int32_t getSpaceCount();

private:
    // No arg constructor: private so clients can't call it.
    SpaceBreakIterator();

    // Copying is disabled: the object owns fBreakIter and deletes it in
    // the destructor, so a member-wise copy would delete it twice.
    // These are declared but intentionally never defined.
    SpaceBreakIterator(const SpaceBreakIterator &other);
    SpaceBreakIterator &operator=(const SpaceBreakIterator &other);

    // The underlying BreakIterator (owned by this object)
    BreakIterator *fBreakIter;

    // address of the UChar array
    const UChar *fText;

    // number of UChars in fText
    int32_t fTextCount;

    // current word count
    int32_t fWordCount;

    // current space count
    int32_t fSpaceCount;
    
    // UnicodeSet of SA characters
    UnicodeSet fComplexContext;

    // true when fBreakIter has returned DONE
    UBool fDone;
};

/*
 * This is the main class. It compares word breaks and reports the differences.
 * All of the work is done by the main constructor; afterwards the getters
 * return the totals that were accumulated.
 */
class ThaiWordbreakTest
{
public:
    // The main constructor (runs the comparison immediately):
    // spaces       - pointer to a UChar array for the text with spaces
    // spaceCount   - the number of characters in the spaces array
    // noSpaces     - pointer to a UChar array for the text without spaces
    // noSpaceCount - the number of characters in the noSpaces array
    // verbose      - report all breaks if true, otherwise just report differences
    ThaiWordbreakTest(const UChar *spaces, int32_t spaceCount, const UChar *noSpaces, int32_t noSpaceCount, UBool verbose);
    ~ThaiWordbreakTest();

    // returns the number of breaks that are in the spaces array
    // but aren't found in the noSpaces array
    int32_t getBreaksNotFound();

    // returns the number of breaks which are found in the noSpaces
    // array but aren't in the spaces array
    int32_t getInvalidBreaks();

    // returns the number of words found in the spaces array
    int32_t getWordCount();

    // reads the input Unicode text file:
    // fileName  - the path name of the file
    // charCount - set to the number of UChars read from the file
    // returns   - the address of the UChar array containing the characters
    //             (allocated with new[]), or 0 if the file couldn't be read
    static const UChar *readFile(char *fileName, int32_t &charCount);

    // removes spaces from the input UChar array:
    // spaces        - pointer to the input UChar array
    // count         - number of UChars in the spaces array
    // nonSpaceCount - the number of UChars in the result array
    // returns       - the address of the UChar array with spaces removed
    //                 (allocated with new[])
    static const UChar *crunchSpaces(const UChar *spaces, int32_t count, int32_t &nonSpaceCount);

private:
    // The no arg constructor - private so clients can't call it
    ThaiWordbreakTest();

    // This does the actual comparison:
    // spaces - the address of the UChar array for the text with spaces
    // spaceCount - the number of UChars in the spaces array
    // noSpaces   - the address of the UChar array for the text without spaces
    // noSpaceCount - the number of UChars in the noSpaces array
    // returns      - TRUE if all breaks match, FALSE otherwise
    UBool compareWordBreaks(const UChar *spaces, int32_t spaceCount,
                            const UChar *noSpaces, int32_t noSpaceCount);

    // helper method to report a break in the spaces
    // array that's not found in the noSpaces array
    // (also increments fBreaksNotFound)
    void breakNotFound(int32_t br);

    // helper method to report a break that's found in
    // the noSpaces array that's not in the spaces array
    // (also increments fInvalidBreaks)
    void foundInvalidBreak(int32_t br);

    // count of breaks in the spaces array that
    // aren't found in the noSpaces array
    int32_t fBreaksNotFound;

    // count of breaks found in the noSpaces array
    // that aren't in the spaces array
    int32_t fInvalidBreaks;

    // number of words found in the spaces array
    int32_t fWordCount;

    // report all breaks if true, otherwise just report differences
    UBool fVerbose;
};

/*
 * The main constructor: zero the counters, remember the verbosity setting,
 * and run the break comparison right away. The TRUE/FALSE result of the
 * comparison is deliberately discarded; callers read the counters instead.
 */
ThaiWordbreakTest::ThaiWordbreakTest(const UChar *spaces, int32_t spaceCount,
                                     const UChar *noSpaces, int32_t noSpaceCount, UBool verbose)
    : fBreaksNotFound(0),
      fInvalidBreaks(0),
      fWordCount(0),
      fVerbose(verbose)
{
    (void) compareWordBreaks(spaces, spaceCount, noSpaces, noSpaceCount);
}

/*
 * The no arg constructor. It is private and only exists so that clients
 * can't default-construct the class, but it still leaves the object in a
 * well defined state: all counts are zero and verbose reporting is off.
 */
ThaiWordbreakTest::ThaiWordbreakTest()
: fBreaksNotFound(0), fInvalidBreaks(0), fWordCount(0), fVerbose(FALSE)
{
    // nothing else to do
}

/*
 * The destructor. The class holds only plain counters and a flag,
 * so there is nothing to release here.
 */
ThaiWordbreakTest::~ThaiWordbreakTest()
{
}

/*
 * Accessor: how many breaks in the text with spaces
 * had no matching break in the text without spaces.
 */
inline int32_t ThaiWordbreakTest::getBreaksNotFound()
{
    return this->fBreaksNotFound;
}

/*
 * Accessor: how many breaks were found in the text without
 * spaces that do not appear in the text with spaces.
 */
inline int32_t ThaiWordbreakTest::getInvalidBreaks()
{
    return this->fInvalidBreaks;
}

/*
 * Accessor: the number of words counted in the text with spaces.
 */
inline int32_t ThaiWordbreakTest::getWordCount()
{
    return this->fWordCount;
}

/*
 * This method does the actual break comparison and reports the results.
 * It uses a SpaceBreakIterator to iterate over the text with spaces,
 * and a word instance of a Thai BreakIterator to iterate over the text
 * without spaces.
 *
 * spaces / spaceCount     - the text with spaces and its length
 * noSpaces / noSpaceCount - the text without spaces and its length
 * returns                 - TRUE if every break matched; FALSE if any
 *                           difference was reported or if the Thai break
 *                           iterator could not be created
 *
 * On return fWordCount holds the word count from the SpaceBreakIterator.
 */
UBool ThaiWordbreakTest::compareWordBreaks(const UChar *spaces, int32_t spaceCount,
                                           const UChar *noSpaces, int32_t noSpaceCount)
{
    UBool result = TRUE;
    Locale thai("th");
    UErrorCode status = U_ZERO_ERROR;
    
    BreakIterator *breakIter = BreakIterator::createWordInstance(thai, status);
    
    // Without this check a failed creation would be dereferenced below.
    if (U_FAILURE(status) || breakIter == NULL) {
        fprintf(stderr, "Couldn't create the Thai word break iterator: %s\n", u_errorName(status));
        delete breakIter;
        return FALSE;
    }
    
    // The character iterator is only created once there is a break
    // iterator to adopt (and later delete) it.
    breakIter->adoptText(new UCharCharacterIterator(noSpaces, noSpaceCount));
    
    SpaceBreakIterator spaceIter(spaces, spaceCount);
    
    int32_t nextBreak = 0;
    int32_t nextSpaceBreak = 0;
    
    while (TRUE) {
        nextSpaceBreak = spaceIter.next();
        nextBreak = breakIter->next();
        
        if (nextSpaceBreak == BreakIterator::DONE || nextBreak == BreakIterator::DONE) {
            // Only complain if exactly one of the two iterators is finished.
            if (nextBreak != BreakIterator::DONE) {
                fprintf(stderr, "break iterator didn't end.\n");
            } else if (nextSpaceBreak != BreakIterator::DONE) {
                fprintf(stderr, "premature break iterator end.\n");
            }
            
            break;
        }
        
        // The iterators disagree: advance whichever one is behind,
        // reporting each break it passes, until they meet again or
        // one of them runs out.
        while (nextSpaceBreak != nextBreak &&
               nextSpaceBreak != BreakIterator::DONE && nextBreak != BreakIterator::DONE) {
            if (nextSpaceBreak < nextBreak) {
                breakNotFound(nextSpaceBreak);
                result = FALSE;
                nextSpaceBreak = spaceIter.next();
            } else if (nextSpaceBreak > nextBreak) {
                foundInvalidBreak(nextBreak);
                result = FALSE;
                nextBreak = breakIter->next();
            }
        }
        
        if (fVerbose) {
            printf("%d   %d\n", nextSpaceBreak, nextBreak);
        }
    }
    
    fWordCount = spaceIter.getWordCount();
    
    delete breakIter;

    return result;
}

/*
 * Record and report a break that exists in the text with spaces
 * but was not produced for the text without spaces.
 *
 * In verbose mode the break is printed to stdout in the left-hand
 * column; otherwise a message goes to stderr.
 */
void ThaiWordbreakTest::breakNotFound(int32_t br)
{
    ++fBreaksNotFound;

    if (!fVerbose) {
        fprintf(stderr, "break not found: %d\n", br);
        return;
    }

    printf("%d   ****\n", br);
}

/*
 * Record and report a break that was produced for the text without
 * spaces but does not exist in the text with spaces.
 *
 * In verbose mode the break is printed to stdout in the right-hand
 * column; otherwise a message goes to stderr.
 */
void ThaiWordbreakTest::foundInvalidBreak(int32_t br)
{
    ++fInvalidBreaks;

    if (!fVerbose) {
        fprintf(stderr, "found invalid break: %d\n", br);
        return;
    }

    printf("****   %d\n", br);
}

/*
 * Read the text from a file. The file is decoded as UTF-8 and is expected
 * to start with a Unicode Byte Order Mark (BOM, U+FEFF), which is not
 * included in the result. If the decoded text does not start with a BOM,
 * nothing is skipped.
 *
 * fileName  - the path name of the file
 * charCount - set to the number of UChars in the returned array (not
 *             counting the terminating zero); untouched on failure
 * returns   - a new[]-allocated, zero terminated UChar array that the
 *             caller must delete[], or 0 if the file could not be read
 */
const UChar *ThaiWordbreakTest::readFile(char *fileName, int32_t &charCount)
{
    FILE *f = fopen(fileName, "rb");
    
    if( f == NULL ) {
        fprintf(stderr,"Couldn't open %s reason: %s \n", fileName, strerror(errno));
        return 0;
    }
    
    // Find the file size by seeking to the end. Any failure here must be
    // caught: ftell() returns -1 on error, which must never reach new[].
    long fileLength = -1;
    
    if (fseek(f, 0, SEEK_END) == 0) {
        fileLength = ftell(f);
    }
    
    if (fileLength < 0 || fileLength > 0x7fffffffL || fseek(f, 0, SEEK_SET) != 0) {
        fprintf(stderr,"Couldn't determine the size of %s reason: %s \n", fileName, strerror(errno));
        fclose(f);
        return 0;
    }
    
    int32_t fileSize = (int32_t) fileLength;
    char *bufferChars = new char[fileSize];
    
    // Kept from the original in case the build uses a non-throwing allocator.
    if(bufferChars == 0) {
        fprintf(stderr,"Couldn't get memory for reading %s reason: %s \n", fileName, strerror(errno));
        fclose(f);
        return 0;
    }
    
    size_t bytesRead = fread(bufferChars, sizeof(char), fileSize, f);
    if( ferror(f) ) {
        fprintf(stderr,"Couldn't read %s reason: %s \n", fileName, strerror(errno));
        fclose(f);
        delete[] bufferChars;
        return 0;
    }
    fclose(f);
    
    // Convert only the bytes that were actually read, so that a short
    // read can't feed uninitialized memory to the converter.
    UnicodeString myText(bufferChars, (int32_t) bytesRead, "UTF-8");

    delete[] bufferChars;
    
    if (myText.isBogus()) {
        fprintf(stderr,"Couldn't convert the contents of %s to Unicode.\n", fileName);
        return 0;
    }
    
    // Skip the BOM, but only if there is one. An empty text has nothing
    // to skip; blindly subtracting one would produce a count of -1 and a
    // write in front of the buffer.
    int32_t textLength = myText.length();
    int32_t start = (textLength > 0 && myText.charAt(0) == 0xFEFF) ? 1 : 0;
    int32_t resultCount = textLength - start;
    
    // one extra UChar for the terminating zero
    UChar *buffer = new UChar[resultCount + 1];
    if(buffer == 0) {
        fprintf(stderr,"Couldn't get memory for reading %s reason: %s \n", fileName, strerror(errno));
        return 0;
    }
    
    myText.extract(start, resultCount, buffer);
    buffer[resultCount] = 0;    // NULL terminate for easier reading in the debugger
    
    charCount = resultCount;
    
    return buffer;
}

/*
 * Build a copy of the input UChar array with every U+0020 removed.
 *
 * Only the code value 0x0020 is treated as a space; a general
 * "is space" property test is deliberately not used because it
 * would also match CR, LF, etc.
 *
 * spaces        - pointer to the input UChar array
 * count         - number of UChars in the spaces array
 * nonSpaceCount - set to the number of UChars in the result array
 * returns       - the new[]-allocated array with the spaces removed
 */
const UChar *ThaiWordbreakTest::crunchSpaces(const UChar *spaces, int32_t count, int32_t &nonSpaceCount)
{
    const UChar kSpace = 0x0020;
    int32_t src;

    // First pass: tally the spaces so the result can be sized exactly.
    int32_t numSpaces = 0;
    for (src = 0; src < count; ++src) {
        if (spaces[src] == kSpace) {
            ++numSpaces;
        }
    }

    nonSpaceCount = count - numSpaces;

    UChar *stripped = new UChar[nonSpaceCount];
    if (stripped == 0) {
        fprintf(stderr, "Couldn't allocate memory for the space stripped text.\n");
        return 0;
    }

    // Second pass: copy everything that isn't a space.
    int32_t dst = 0;
    for (src = 0; src < count; ++src) {
        const UChar ch = spaces[src];

        if (ch == kSpace) {
            continue;
        }

        stripped[dst++] = ch;
    }

    return stripped;
}

/*
 * Generate a text file with spaces in it from a file without.
 */
int generateFile(const UChar *chars, int32_t length) {
    Locale root("");
    UCharCharacterIterator *noSpaceIter = new UCharCharacterIterator(chars, length);
    UErrorCode status = U_ZERO_ERROR;
    
    UnicodeSet complexContext(UNICODE_STRING_SIMPLE("[:LineBreak=SA:]"), status);
    BreakIterator *breakIter = BreakIterator::createWordInstance(root, status);
    breakIter->adoptText(noSpaceIter);
    char outbuf[1024];
    int32_t strlength;
    UChar bom = 0xFEFF;
    
    printf("%s", u_strToUTF8(outbuf, sizeof(outbuf), &strlength, &bom, 1, &status));
    int32_t prevbreak = 0;
    while (U_SUCCESS(status)) {
        int32_t nextbreak = breakIter->next();
        if (nextbreak == BreakIterator::DONE) {
            break;
        }
        printf("%s", u_strToUTF8(outbuf, sizeof(outbuf), &strlength, &chars[prevbreak],
                                    nextbreak-prevbreak, &status));
        if (nextbreak > 0 && complexContext.contains(chars[nextbreak-1])
            && complexContext.contains(chars[nextbreak])) {
            printf(" ");
        }
        prevbreak = nextbreak;
    }
    
    if (U_FAILURE(status)) {
        fprintf(stderr, "generate failed: %s\n", u_errorName(status));
        return status;
    }
    else {
        return 0;
    }
}

/*
 * The main routine. Read the command line arguments, read the text file,
 * remove the spaces, do the comparison and report the final results
 */
int main(int argc, char **argv)
{
    char *fileName = "space.txt";
    int arg = 1;
    UBool verbose = FALSE;
    UBool generate = FALSE;

    if (argc >= 2 && strcmp(argv[1], "-generate") == 0) {
        generate = TRUE;
        arg += 1;
    }

    if (argc >= 2 && strcmp(argv[1], "-verbose") == 0) {
        verbose = TRUE;
        arg += 1;
    }

    if (arg == argc - 1) {
        fileName = argv[arg++];
    }

    if (arg != argc) {
        fprintf(stderr, "Usage: %s [-verbose] [<file>]\n", argv[0]);
        return 1;
    }

    int32_t spaceCount, nonSpaceCount;
    const UChar *spaces, *noSpaces;

    spaces = ThaiWordbreakTest::readFile(fileName, spaceCount);

    if (spaces == 0) {
        return 1;
    }
    
    if (generate) {
        return generateFile(spaces, spaceCount);
    }

    noSpaces = ThaiWordbreakTest::crunchSpaces(spaces, spaceCount, nonSpaceCount);

    if (noSpaces == 0) {
        return 1;
    }

    ThaiWordbreakTest test(spaces, spaceCount, noSpaces, nonSpaceCount, verbose);

    printf("word count: %d\n", test.getWordCount());
    printf("breaks not found: %d\n", test.getBreaksNotFound());
    printf("invalid breaks found: %d\n", test.getInvalidBreaks());

    return 0;
}

/*
 * The main constructor. Clear all the counts, build the set of
 * LineBreak=SA characters and construct a root-locale word instance
 * of a BreakIterator over the text.
 *
 * text  - pointer to the UChars to iterate over (not copied)
 * count - the number of UChars in text
 *
 * If the set or the break iterator can't be created, a message is
 * written to stderr and the object is marked as done, so that next()
 * returns BreakIterator::DONE instead of using a null iterator.
 */
SpaceBreakIterator::SpaceBreakIterator(const UChar *text, int32_t count)
  : fBreakIter(0), fText(text), fTextCount(count), fWordCount(0), fSpaceCount(0), fDone(FALSE)
{
    UErrorCode status = U_ZERO_ERROR;
    Locale root("");

    fComplexContext.applyPattern(UNICODE_STRING_SIMPLE("[:LineBreak=SA:]"), status);
    fBreakIter = BreakIterator::createWordInstance(root, status);

    if (U_FAILURE(status) || fBreakIter == 0) {
        fprintf(stderr, "Couldn't create the word break iterator: %s\n", u_errorName(status));
        delete fBreakIter;
        fBreakIter = 0;
        fDone = TRUE;
        return;
    }

    // The character iterator is only created once there is a break
    // iterator to adopt (and later delete) it.
    fBreakIter->adoptText(new UCharCharacterIterator(text, count));
}

/*
 * The no arg constructor. It is private and only exists so that clients
 * can't default-construct the class. It still initializes every member:
 * the destructor deletes fBreakIter, so that pointer must never be left
 * indeterminate, and fDone is set so that next() returns DONE right away
 * instead of using the null iterator.
 */
SpaceBreakIterator::SpaceBreakIterator()
  : fBreakIter(0), fText(0), fTextCount(0), fWordCount(0), fSpaceCount(0), fDone(TRUE)
{
    // nothing else to do
}

/*
 * The destructor: release the BreakIterator held in fBreakIter.
 */
SpaceBreakIterator::~SpaceBreakIterator()
{
    delete this->fBreakIter;
}

/*
 * Return the next break, counting words and spaces.
 *
 * Breaks that fall between two LineBreak=SA characters are skipped. The
 * returned position has the number of spaces seen so far subtracted from
 * it. If the break is followed by a space, the underlying iterator is
 * advanced once more and the distance it moved is added to the space count.
 *
 * Returns BreakIterator::DONE once the underlying iterator is exhausted,
 * and on every call after that.
 */
int32_t SpaceBreakIterator::next()
{
    if (fDone) {
        return BreakIterator::DONE;
    }
    
    int32_t nextBreak;
    do {
        nextBreak = fBreakIter->next();
        
        if (nextBreak == BreakIterator::DONE) {
            fDone = TRUE;
            return BreakIterator::DONE;
        }
    }
    // nextBreak < fTextCount keeps fText[nextBreak] inside the array:
    // the last break is at fTextCount itself, one past the final UChar.
    while(nextBreak > 0 && nextBreak < fTextCount
            && fComplexContext.contains(fText[nextBreak-1])
            && fComplexContext.contains(fText[nextBreak]));
    
    // compute the result before this break's own spaces are counted
    int32_t result = nextBreak - fSpaceCount;
    
    if (nextBreak < fTextCount) {
        if (fText[nextBreak] == 0x0020 /*Unicode::isSpaceChar(fText[nextBreak])*/) {
            // step over the spaces and count how many UChars were passed
            fSpaceCount += fBreakIter->next() - nextBreak;
        }
    }
    
    fWordCount += 1;

    return result;
}

/*
 * Accessor: the number of spaces counted so far.
 */
int32_t SpaceBreakIterator::getSpaceCount()
{
    return this->fSpaceCount;
}

/*
 * Accessor: the number of words counted so far.
 */
int32_t SpaceBreakIterator::getWordCount()
{
    return this->fWordCount;
}
Commit	Line	Data
b75a7d8f A	1	/*
b75a7d8f A	2	******************************************************************************
73c04bcf A	3	* Copyright (C) 1998-2003, 2006, International Business Machines Corporation *
73c04bcf A	4	* and others. All Rights Reserved. *
b75a7d8f A	5	******************************************************************************
	6	*/
	7
	8	#include <errno.h>
	9	#include <stdio.h>
	10	#include <string.h>
	11
	12	#include "unicode/utypes.h"
	13	#include "unicode/uchar.h"
	14	#include "unicode/uchriter.h"
	15	#include "unicode/brkiter.h"
	16	#include "unicode/locid.h"
	17	#include "unicode/unistr.h"
73c04bcf A	18	#include "unicode/uniset.h"
73c04bcf A	19	#include "unicode/ustring.h"
b75a7d8f A	20
	21	/*
	22	* This program takes a Unicode text file containing Thai text with
	23	* spaces inserted where the word breaks are. It computes a copy of
	24	* the text without spaces and uses a word instance of a Thai BreakIterator
	25	* to compute the word breaks. The program reports any differences in the
	26	* breaks.
	27	*
	28	* NOTE: by it's very nature, Thai word breaking is not exact, so it is
	29	* exptected that this program will always report some differences.
	30	*/
	31
	32	/*
	33	* This class is a break iterator that counts words and spaces.
	34	*/
	35	class SpaceBreakIterator
	36	{
	37	public:
	38	// The constructor:
	39	// text - pointer to an array of UChars to iterate over
	40	// count - the number of UChars in text
	41	SpaceBreakIterator(const UChar *text, int32_t count);
	42
	43	// the destructor
	44	~SpaceBreakIterator();
	45
	46	// return next break position
	47	int32_t next();
	48
	49	// return current word count
	50	int32_t getWordCount();
	51
	52	// return current space count
	53	int32_t getSpaceCount();
	54
	55	private:
	56	// No arg constructor: private so clients can't call it.
	57	SpaceBreakIterator();
	58
	59	// The underlying BreakIterator
	60	BreakIterator *fBreakIter;
	61
	62	// address of the UChar array
	63	const UChar *fText;
	64
	65	// number of UChars in fText
	66	int32_t fTextCount;
	67
	68	// current word count
	69	int32_t fWordCount;
	70
	71	// current space count
	72	int32_t fSpaceCount;
73c04bcf A	73
	74	// UnicodeSet of SA characters
	75	UnicodeSet fComplexContext;
b75a7d8f A	76
	77	// true when fBreakIter has returned DONE
	78	UBool fDone;
	79	};
	80
	81	/*
	82	* This is the main class. It compares word breaks and reports the differences.
	83	*/
	84	class ThaiWordbreakTest
	85	{
	86	public:
	87	// The main constructor:
	88	// spaces - pointer to a UChar array for the text with spaces
	89	// spaceCount - the number of characters in the spaces array
	90	// noSpaces - pointer to a UChar array for the text without spaces
	91	// noSpaceCount - the number of characters in the noSpaces array
	92	// verbose - report all breaks if true, otherwise just report differences
	93	ThaiWordbreakTest(const UChar spaces, int32_t spaceCount, const UChar noSpaces, int32_t noSpaceCount, UBool verbose);
	94	~ThaiWordbreakTest();
	95
	96	// returns the number of breaks that are in the spaces array
	97	// but aren't found in the noSpaces array
	98	int32_t getBreaksNotFound();
	99
	100	// returns the number of breaks which are found in the noSpaces
	101	// array but aren't in the spaces array
	102	int32_t getInvalidBreaks();
	103
	104	// returns the number of words found in the spaces array
	105	int32_t getWordCount();
	106
	107	// reads the input Unicode text file:
	108	// fileName - the path name of the file
	109	// charCount - set to the number of UChars read from the file
	110	// returns - the address of the UChar array containing the characters
	111	static const UChar readFile(char fileName, int32_t &charCount);
	112
	113	// removes spaces form the input UChar array:
	114	// spaces - pointer to the input UChar array
	115	// count - number of UChars in the spaces array
	116	// nonSpaceCount - the number of UChars in the result array
	117	// returns - the address of the UChar array with spaces removed
	118	static const UChar crunchSpaces(const UChar spaces, int32_t count, int32_t &nonSpaceCount);
	119
	120	private:
	121	// The no arg constructor - private so clients can't call it
	122	ThaiWordbreakTest();
	123
	124	// This does the actual comparison:
	125	// spaces - the address of the UChar array for the text with spaces
	126	// spaceCount - the number of UChars in the spaces array
	127	// noSpaces - the address of the UChar array for the text without spaces
	128	// noSpaceCount - the number of UChars in the noSpaces array
	129	// returns - true if all breaks match, FALSE otherwise
	130	UBool compareWordBreaks(const UChar *spaces, int32_t spaceCount,
	131	const UChar *noSpaces, int32_t noSpaceCount);
	132
	133	// helper method to report a break in the spaces
	134	// array that's not found in the noSpaces array
	135	void breakNotFound(int32_t br);
	136
	137	// helper method to report a break that's found in
	138	// the noSpaces array that's not in the spaces array
	139	void foundInvalidBreak(int32_t br);
140
141	// count of breaks in the spaces array that
142	// aren't found in the noSpaces array
143	int32_t fBreaksNotFound;
144
145	// count of breaks found in the noSpaces array
146	// that aren't in the spaces array
147	int32_t fInvalidBreaks;
148
149	// number of words found in the spaces array
150	int32_t fWordCount;
151
152	// report all breaks if true, otherwise just report differences
153	UBool fVerbose;
154	};
155
156	/*
157	* The main constructor: it calls compareWordBreaks and reports any differences
158	*/
159	ThaiWordbreakTest::ThaiWordbreakTest(const UChar *spaces, int32_t spaceCount,
160	const UChar *noSpaces, int32_t noSpaceCount, UBool verbose)
161	: fBreaksNotFound(0), fInvalidBreaks(0), fWordCount(0), fVerbose(verbose)
162	{
163	compareWordBreaks(spaces, spaceCount, noSpaces, noSpaceCount);
164	}
165
166	/*
167	* The no arg constructor
168	*/
169	ThaiWordbreakTest::ThaiWordbreakTest()
170	{
171	// nothing
172	}
173
174	/*
175	* The destructor
176	*/
177	ThaiWordbreakTest::~ThaiWordbreakTest()
178	{
179	// nothing?
180	}
181
182	/*
183	* returns the number of breaks in the spaces array
184	* that aren't found in the noSpaces array
185	*/
186	inline int32_t ThaiWordbreakTest::getBreaksNotFound()
187	{
188	return fBreaksNotFound;
189	}
190
191	/*
192	* Returns the number of breaks found in the noSpaces
193	* array that aren't in the spaces array
194	*/
195	inline int32_t ThaiWordbreakTest::getInvalidBreaks()
196	{
197	return fInvalidBreaks;
198	}
199
200	/*
201	* Returns the number of words found in the spaces array
202	*/
203	inline int32_t ThaiWordbreakTest::getWordCount()
204	{
205	return fWordCount;
206	}
207
208	/*
209	* This method does the acutal break comparison and reports the results.
210	* It uses a SpaceBreakIterator to iterate over the text with spaces,
211	* and a word instance of a Thai BreakIterator to iterate over the text
212	* without spaces.
213	*/
214	UBool ThaiWordbreakTest::compareWordBreaks(const UChar *spaces, int32_t spaceCount,
215	const UChar *noSpaces, int32_t noSpaceCount)
216	{
217	UBool result = TRUE;
218	Locale thai("th");
219	UCharCharacterIterator *noSpaceIter = new UCharCharacterIterator(noSpaces, noSpaceCount);
220	UErrorCode status = U_ZERO_ERROR;
221
222	BreakIterator *breakIter = BreakIterator::createWordInstance(thai, status);
223	breakIter->adoptText(noSpaceIter);
224
225	SpaceBreakIterator spaceIter(spaces, spaceCount);
226
227	int32_t nextBreak = 0;
228	int32_t nextSpaceBreak = 0;
229	int32_t iterCount = 0;
230
231	while (TRUE) {
232	nextSpaceBreak = spaceIter.next();
233	nextBreak = breakIter->next();
234
235	if (nextSpaceBreak == BreakIterator::DONE \|\| nextBreak == BreakIterator::DONE) {
236	if (nextBreak != BreakIterator::DONE) {
237	fprintf(stderr, "break iterator didn't end.\n");
238	} else if (nextSpaceBreak != BreakIterator::DONE) {
239	fprintf(stderr, "premature break iterator end.\n");
240	}
241
242	break;
243	}
244
245	while (nextSpaceBreak != nextBreak &&
246	nextSpaceBreak != BreakIterator::DONE && nextBreak != BreakIterator::DONE) {
247	if (nextSpaceBreak < nextBreak) {
248	breakNotFound(nextSpaceBreak);
249	result = FALSE;
250	nextSpaceBreak = spaceIter.next();
251	} else if (nextSpaceBreak > nextBreak) {
252	foundInvalidBreak(nextBreak);
253	result = FALSE;
254	nextBreak = breakIter->next();
255	}
256	}
257
258	if (fVerbose) {
259	printf("%d %d\n", nextSpaceBreak, nextBreak);
260	}
261	}
262
263
264	fWordCount = spaceIter.getWordCount();
265
266	delete breakIter;
267
268	return result;
269	}
270
271	/*
272	* Report a break that's in the text with spaces but
273	* not found in the text without spaces.
274	*/
275	void ThaiWordbreakTest::breakNotFound(int32_t br)
276	{
277	if (fVerbose) {
278	printf("%d ****\n", br);
279	} else {
280	fprintf(stderr, "break not found: %d\n", br);
281	}
282
283	fBreaksNotFound += 1;
284	}
285
286	/*
287	* Report a break that's found in the text without spaces
288	* that isn't in the text with spaces.
289	*/
290	void ThaiWordbreakTest::foundInvalidBreak(int32_t br)
291	{
292	if (fVerbose) {
293	printf("**** %d\n", br);
294	} else {
295	fprintf(stderr, "found invalid break: %d\n", br);
296	}
297
298	fInvalidBreaks += 1;
299	}
300
301	/*
302	* Read the text from a file. The text must start with a Unicode Byte
303	* Order Mark (BOM) so that we know what order to read the bytes in.
304	*/
305	const UChar ThaiWordbreakTest::readFile(char fileName, int32_t &charCount)
306	{
307	FILE *f;
308	int32_t fileSize;
309
310	UChar *buffer;
311	char *bufferChars;
312
313	f = fopen(fileName, "rb");
314
315	if( f == NULL ) {
316	fprintf(stderr,"Couldn't open %s reason: %s \n", fileName, strerror(errno));
317	return 0;
318	}
319
320	fseek(f, 0, SEEK_END);
321	fileSize = ftell(f);
322
323	fseek(f, 0, SEEK_SET);
324	bufferChars = new char[fileSize];
325
326	if(bufferChars == 0) {
327	fprintf(stderr,"Couldn't get memory for reading %s reason: %s \n", fileName, strerror(errno));
328	fclose(f);
329	return 0;
330	}
331
332	fread(bufferChars, sizeof(char), fileSize, f);
333	if( ferror(f) ) {
334	fprintf(stderr,"Couldn't read %s reason: %s \n", fileName, strerror(errno));
335	fclose(f);
336	delete[] bufferChars;
337	return 0;
338	}
339	fclose(f);
340
341	UnicodeString myText(bufferChars, fileSize, "UTF-8");
342
343	delete[] bufferChars;
344
345	charCount = myText.length();
346	buffer = new UChar[charCount];
347	if(buffer == 0) {
348	fprintf(stderr,"Couldn't get memory for reading %s reason: %s \n", fileName, strerror(errno));
349	return 0;
350	}
351
352	myText.extract(1, myText.length(), buffer);
353	charCount--; // skip the BOM
354	buffer[charCount] = 0; // NULL terminate for easier reading in the debugger
355
356	return buffer;
357	}
358
359	/*
360	* Remove spaces from the input UChar array.
361	*
362	* We check explicitly for a Unicode code value of 0x0020
363	* because Unicode::isSpaceChar returns true for CR, LF, etc.
364	*
365	*/
366	const UChar ThaiWordbreakTest::crunchSpaces(const UChar spaces, int32_t count, int32_t &nonSpaceCount)
367	{
368	int32_t i, out, spaceCount;
369
370	spaceCount = 0;
371	for (i = 0; i < count; i += 1) {
372	if (spaces[i] == 0x0020 /Unicode::isSpaceChar(spaces[i])/) {
373	spaceCount += 1;
374	}
375	}
376
377	nonSpaceCount = count - spaceCount;
378	UChar *noSpaces = new UChar[nonSpaceCount];
379
380	if (noSpaces == 0) {
381	fprintf(stderr, "Couldn't allocate memory for the space stripped text.\n");
382	return 0;
383	}
384
385	for (out = 0, i = 0; i < count; i += 1) {
386	if (spaces[i] != 0x0020 /! Unicode::isSpaceChar(spaces[i])/) {
387	noSpaces[out++] = spaces[i];
388	}
389	}
390
391	return noSpaces;
392	}
393
73c04bcf A	394	/*
	395	* Generate a text file with spaces in it from a file without.
	396	*/
	397	int generateFile(const UChar *chars, int32_t length) {
	398	Locale root("");
	399	UCharCharacterIterator *noSpaceIter = new UCharCharacterIterator(chars, length);
	400	UErrorCode status = U_ZERO_ERROR;
	401
	402	UnicodeSet complexContext(UNICODE_STRING_SIMPLE("[:LineBreak=SA:]"), status);
	403	BreakIterator *breakIter = BreakIterator::createWordInstance(root, status);
	404	breakIter->adoptText(noSpaceIter);
	405	char outbuf[1024];
	406	int32_t strlength;
	407	UChar bom = 0xFEFF;
	408
	409	printf("%s", u_strToUTF8(outbuf, sizeof(outbuf), &strlength, &bom, 1, &status));
	410	int32_t prevbreak = 0;
	411	while (U_SUCCESS(status)) {
	412	int32_t nextbreak = breakIter->next();
	413	if (nextbreak == BreakIterator::DONE) {
	414	break;
	415	}
	416	printf("%s", u_strToUTF8(outbuf, sizeof(outbuf), &strlength, &chars[prevbreak],
	417	nextbreak-prevbreak, &status));
	418	if (nextbreak > 0 && complexContext.contains(chars[nextbreak-1])
	419	&& complexContext.contains(chars[nextbreak])) {
	420	printf(" ");
	421	}
	422	prevbreak = nextbreak;
	423	}
	424
	425	if (U_FAILURE(status)) {
	426	fprintf(stderr, "generate failed: %s\n", u_errorName(status));
	427	return status;
	428	}
	429	else {
	430	return 0;
	431	}
	432	}
	433
b75a7d8f A	434	/*
	435	* The main routine. Read the command line arguments, read the text file,
	436	* remove the spaces, do the comparison and report the final results
	437	*/
	438	int main(int argc, char **argv)
	439	{
	440	char *fileName = "space.txt";
	441	int arg = 1;
	442	UBool verbose = FALSE;
73c04bcf A	443	UBool generate = FALSE;
	444
	445	if (argc >= 2 && strcmp(argv[1], "-generate") == 0) {
	446	generate = TRUE;
	447	arg += 1;
	448	}
b75a7d8f A	449
	450	if (argc >= 2 && strcmp(argv[1], "-verbose") == 0) {
	451	verbose = TRUE;
	452	arg += 1;
	453	}
	454
	455	if (arg == argc - 1) {
	456	fileName = argv[arg++];
	457	}
	458
	459	if (arg != argc) {
	460	fprintf(stderr, "Usage: %s [-verbose] [<file>]\n", argv[0]);
	461	return 1;
	462	}
	463
	464	int32_t spaceCount, nonSpaceCount;
	465	const UChar spaces, noSpaces;
	466
	467	spaces = ThaiWordbreakTest::readFile(fileName, spaceCount);
	468
	469	if (spaces == 0) {
	470	return 1;
	471	}
73c04bcf A	472
	473	if (generate) {
	474	return generateFile(spaces, spaceCount);
	475	}
b75a7d8f A	476
	477	noSpaces = ThaiWordbreakTest::crunchSpaces(spaces, spaceCount, nonSpaceCount);
	478
	479	if (noSpaces == 0) {
	480	return 1;
	481	}
	482
	483	ThaiWordbreakTest test(spaces, spaceCount, noSpaces, nonSpaceCount, verbose);
	484
	485	printf("word count: %d\n", test.getWordCount());
	486	printf("breaks not found: %d\n", test.getBreaksNotFound());
	487	printf("invalid breaks found: %d\n", test.getInvalidBreaks());
	488
	489	return 0;
	490	}
	491
	492	/*
	493	* The main constructor. Clear all the counts and construct a default
	494	* word instance of a BreakIterator.
	495	*/
	496	SpaceBreakIterator::SpaceBreakIterator(const UChar *text, int32_t count)
	497	: fBreakIter(0), fText(text), fTextCount(count), fWordCount(0), fSpaceCount(0), fDone(FALSE)
	498	{
	499	UCharCharacterIterator *iter = new UCharCharacterIterator(text, count);
	500	UErrorCode status = U_ZERO_ERROR;
73c04bcf A	501	fComplexContext.applyPattern(UNICODE_STRING_SIMPLE("[:LineBreak=SA:]"), status);
73c04bcf A	502	Locale root("");
b75a7d8f	503
73c04bcf	504	fBreakIter = BreakIterator::createWordInstance(root, status);
b75a7d8f A	505	fBreakIter->adoptText(iter);
	506	}
	507
	508	SpaceBreakIterator::SpaceBreakIterator()
	509	{
	510	// nothing
	511	}
	512
	513	/*
	514	* The destructor. delete the underlying BreakIterator
	515	*/
	516	SpaceBreakIterator::~SpaceBreakIterator()
	517	{
	518	delete fBreakIter;
	519	}
	520
	521	/*
	522	* Return the next break, counting words and spaces.
	523	*/
	524	int32_t SpaceBreakIterator::next()
	525	{
	526	if (fDone) {
	527	return BreakIterator::DONE;
	528	}
	529
73c04bcf A	530	int32_t nextBreak;
	531	do {
	532	nextBreak = fBreakIter->next();
	533
	534	if (nextBreak == BreakIterator::DONE) {
	535	fDone = TRUE;
	536	return BreakIterator::DONE;
	537	}
b75a7d8f	538	}
73c04bcf A	539	while(nextBreak > 0 && fComplexContext.contains(fText[nextBreak-1])
73c04bcf A	540	&& fComplexContext.contains(fText[nextBreak]));
b75a7d8f A	541
	542	int32_t result = nextBreak - fSpaceCount;
	543
	544	if (nextBreak < fTextCount) {
	545	if (fText[nextBreak] == 0x0020 /Unicode::isSpaceChar(fText[nextBreak])/) {
	546	fSpaceCount += fBreakIter->next() - nextBreak;
	547	}
	548	}
	549
	550	fWordCount += 1;
	551
	552	return result;
	553	}
	554
	555	/*
	556	* Returns the current space count
	557	*/
	558	int32_t SpaceBreakIterator::getSpaceCount()
	559	{
	560	return fSpaceCount;
	561	}
	562
	563	/*
	564	* Returns the current word count
	565	*/
	566	int32_t SpaceBreakIterator::getWordCount()
	567	{
	568	return fWordCount;
	569	}
	570
	571