git.saurik.com Git - apple/icu.git/blame - icuSources/test/thaitest/thaitest.cpp

Commit	Line	Data
f3c0d7a5 A	1	// © 2016 and later: Unicode, Inc. and others.
f3c0d7a5 A	2	// License & terms of use: http://www.unicode.org/copyright.html
b75a7d8f A	3	/*
b75a7d8f A	4	******************************************************************************
73c04bcf A	5	* Copyright (C) 1998-2003, 2006, International Business Machines Corporation *
73c04bcf A	6	* and others. All Rights Reserved. *
b75a7d8f A	7	******************************************************************************
	8	*/
	9
	10	#include <errno.h>
	11	#include <stdio.h>
	12	#include <string.h>
	13
	14	#include "unicode/utypes.h"
	15	#include "unicode/uchar.h"
	16	#include "unicode/uchriter.h"
	17	#include "unicode/brkiter.h"
	18	#include "unicode/locid.h"
	19	#include "unicode/unistr.h"
73c04bcf A	20	#include "unicode/uniset.h"
73c04bcf A	21	#include "unicode/ustring.h"
b75a7d8f A	22
	23	/*
	24	* This program takes a Unicode text file containing Thai text with
	25	* spaces inserted where the word breaks are. It computes a copy of
	26	* the text without spaces and uses a word instance of a Thai BreakIterator
	27	* to compute the word breaks. The program reports any differences in the
	28	* breaks.
	29	*
	30	* NOTE: by its very nature, Thai word breaking is not exact, so it is
	31	* expected that this program will always report some differences.
	32	*/
	33
	34	/*
	35	* This class is a break iterator that counts words and spaces.
	36	*/
	37	class SpaceBreakIterator
	38	{
	39	public:
	40	// The constructor:
	41	// text - pointer to an array of UChars to iterate over
	42	// count - the number of UChars in text
	43	SpaceBreakIterator(const UChar *text, int32_t count);
	44
	45	// the destructor
	46	~SpaceBreakIterator();
	47
	48	// return next break position
	49	int32_t next();
	50
	51	// return current word count
	52	int32_t getWordCount();
	53
	54	// return current space count
	55	int32_t getSpaceCount();
	56
	57	private:
	58	// No arg constructor: private so clients can't call it.
	59	SpaceBreakIterator();
	60
	61	// The underlying BreakIterator
	62	BreakIterator *fBreakIter;
	63
	64	// address of the UChar array
	65	const UChar *fText;
	66
	67	// number of UChars in fText
	68	int32_t fTextCount;
	69
	70	// current word count
	71	int32_t fWordCount;
	72
	73	// current space count
	74	int32_t fSpaceCount;
73c04bcf A	75
	76	// UnicodeSet of SA characters
	77	UnicodeSet fComplexContext;
b75a7d8f A	78
	79	// true when fBreakIter has returned DONE
	80	UBool fDone;
	81	};
	82
	83	/*
	84	* This is the main class. It compares word breaks and reports the differences.
	85	*/
	86	class ThaiWordbreakTest
	87	{
	88	public:
	89	// The main constructor:
	90	// spaces - pointer to a UChar array for the text with spaces
	91	// spaceCount - the number of characters in the spaces array
	92	// noSpaces - pointer to a UChar array for the text without spaces
	93	// noSpaceCount - the number of characters in the noSpaces array
	94	// verbose - report all breaks if true, otherwise just report differences
	95	ThaiWordbreakTest(const UChar spaces, int32_t spaceCount, const UChar noSpaces, int32_t noSpaceCount, UBool verbose);
	96	~ThaiWordbreakTest();
	97
	98	// returns the number of breaks that are in the spaces array
	99	// but aren't found in the noSpaces array
	100	int32_t getBreaksNotFound();
	101
	102	// returns the number of breaks which are found in the noSpaces
	103	// array but aren't in the spaces array
	104	int32_t getInvalidBreaks();
	105
	106	// returns the number of words found in the spaces array
	107	int32_t getWordCount();
	108
	109	// reads the input Unicode text file:
	110	// fileName - the path name of the file
	111	// charCount - set to the number of UChars read from the file
	112	// returns - the address of the UChar array containing the characters
	113	static const UChar readFile(char fileName, int32_t &charCount);
	114
	115	// removes spaces form the input UChar array:
	116	// spaces - pointer to the input UChar array
	117	// count - number of UChars in the spaces array
	118	// nonSpaceCount - the number of UChars in the result array
	119	// returns - the address of the UChar array with spaces removed
	120	static const UChar crunchSpaces(const UChar spaces, int32_t count, int32_t &nonSpaceCount);
	121
	122	private:
	123	// The no arg constructor - private so clients can't call it
	124	ThaiWordbreakTest();
	125
	126	// This does the actual comparison:
	127	// spaces - the address of the UChar array for the text with spaces
	128	// spaceCount - the number of UChars in the spaces array
	129	// noSpaces - the address of the UChar array for the text without spaces
	130	// noSpaceCount - the number of UChars in the noSpaces array
	131	// returns - true if all breaks match, FALSE otherwise
	132	UBool compareWordBreaks(const UChar *spaces, int32_t spaceCount,
	133	const UChar *noSpaces, int32_t noSpaceCount);
	134
	135	// helper method to report a break in the spaces
	136	// array that's not found in the noSpaces array
	137	void breakNotFound(int32_t br);
	138
	139	// helper method to report a break that's found in
	140	// the noSpaces array that's not in the spaces array
	141	void foundInvalidBreak(int32_t br);
142
143	// count of breaks in the spaces array that
144	// aren't found in the noSpaces array
145	int32_t fBreaksNotFound;
146
147	// count of breaks found in the noSpaces array
148	// that aren't in the spaces array
149	int32_t fInvalidBreaks;
150
151	// number of words found in the spaces array
152	int32_t fWordCount;
153
154	// report all breaks if true, otherwise just report differences
155	UBool fVerbose;
156	};
157
158	/*
159	* The main constructor: it calls compareWordBreaks and reports any differences
160	*/
161	ThaiWordbreakTest::ThaiWordbreakTest(const UChar *spaces, int32_t spaceCount,
162	const UChar *noSpaces, int32_t noSpaceCount, UBool verbose)
163	: fBreaksNotFound(0), fInvalidBreaks(0), fWordCount(0), fVerbose(verbose)
164	{
165	compareWordBreaks(spaces, spaceCount, noSpaces, noSpaceCount);
166	}
167
168	/*
169	* The no arg constructor
170	*/
171	ThaiWordbreakTest::ThaiWordbreakTest()
172	{
173	// nothing
174	}
175
176	/*
177	* The destructor
178	*/
179	ThaiWordbreakTest::~ThaiWordbreakTest()
180	{
181	// nothing?
182	}
183
184	/*
185	* returns the number of breaks in the spaces array
186	* that aren't found in the noSpaces array
187	*/
188	inline int32_t ThaiWordbreakTest::getBreaksNotFound()
189	{
190	return fBreaksNotFound;
191	}
192
193	/*
194	* Returns the number of breaks found in the noSpaces
195	* array that aren't in the spaces array
196	*/
197	inline int32_t ThaiWordbreakTest::getInvalidBreaks()
198	{
199	return fInvalidBreaks;
200	}
201
202	/*
203	* Returns the number of words found in the spaces array
204	*/
205	inline int32_t ThaiWordbreakTest::getWordCount()
206	{
207	return fWordCount;
208	}
209
210	/*
211	* This method does the acutal break comparison and reports the results.
212	* It uses a SpaceBreakIterator to iterate over the text with spaces,
213	* and a word instance of a Thai BreakIterator to iterate over the text
214	* without spaces.
215	*/
216	UBool ThaiWordbreakTest::compareWordBreaks(const UChar *spaces, int32_t spaceCount,
217	const UChar *noSpaces, int32_t noSpaceCount)
218	{
219	UBool result = TRUE;
220	Locale thai("th");
221	UCharCharacterIterator *noSpaceIter = new UCharCharacterIterator(noSpaces, noSpaceCount);
222	UErrorCode status = U_ZERO_ERROR;
223
224	BreakIterator *breakIter = BreakIterator::createWordInstance(thai, status);
225	breakIter->adoptText(noSpaceIter);
226
227	SpaceBreakIterator spaceIter(spaces, spaceCount);
228
229	int32_t nextBreak = 0;
230	int32_t nextSpaceBreak = 0;
231	int32_t iterCount = 0;
232
233	while (TRUE) {
234	nextSpaceBreak = spaceIter.next();
235	nextBreak = breakIter->next();
236
237	if (nextSpaceBreak == BreakIterator::DONE \|\| nextBreak == BreakIterator::DONE) {
238	if (nextBreak != BreakIterator::DONE) {
239	fprintf(stderr, "break iterator didn't end.\n");
240	} else if (nextSpaceBreak != BreakIterator::DONE) {
241	fprintf(stderr, "premature break iterator end.\n");
242	}
243
244	break;
245	}
246
247	while (nextSpaceBreak != nextBreak &&
248	nextSpaceBreak != BreakIterator::DONE && nextBreak != BreakIterator::DONE) {
249	if (nextSpaceBreak < nextBreak) {
250	breakNotFound(nextSpaceBreak);
251	result = FALSE;
252	nextSpaceBreak = spaceIter.next();
253	} else if (nextSpaceBreak > nextBreak) {
254	foundInvalidBreak(nextBreak);
255	result = FALSE;
256	nextBreak = breakIter->next();
257	}
258	}
259
260	if (fVerbose) {
261	printf("%d %d\n", nextSpaceBreak, nextBreak);
262	}
263	}
264
265
266	fWordCount = spaceIter.getWordCount();
267
268	delete breakIter;
269
270	return result;
271	}
272
273	/*
274	* Report a break that's in the text with spaces but
275	* not found in the text without spaces.
276	*/
277	void ThaiWordbreakTest::breakNotFound(int32_t br)
278	{
279	if (fVerbose) {
280	printf("%d ****\n", br);
281	} else {
282	fprintf(stderr, "break not found: %d\n", br);
283	}
284
285	fBreaksNotFound += 1;
286	}
287
288	/*
289	* Report a break that's found in the text without spaces
290	* that isn't in the text with spaces.
291	*/
292	void ThaiWordbreakTest::foundInvalidBreak(int32_t br)
293	{
294	if (fVerbose) {
295	printf("**** %d\n", br);
296	} else {
297	fprintf(stderr, "found invalid break: %d\n", br);
298	}
299
300	fInvalidBreaks += 1;
301	}
302
303	/*
304	* Read the text from a file. The text must start with a Unicode Byte
305	* Order Mark (BOM) so that we know what order to read the bytes in.
306	*/
307	const UChar ThaiWordbreakTest::readFile(char fileName, int32_t &charCount)
308	{
309	FILE *f;
310	int32_t fileSize;
311
312	UChar *buffer;
313	char *bufferChars;
314
315	f = fopen(fileName, "rb");
316
317	if( f == NULL ) {
318	fprintf(stderr,"Couldn't open %s reason: %s \n", fileName, strerror(errno));
319	return 0;
320	}
321
322	fseek(f, 0, SEEK_END);
323	fileSize = ftell(f);
324
325	fseek(f, 0, SEEK_SET);
326	bufferChars = new char[fileSize];
327
328	if(bufferChars == 0) {
329	fprintf(stderr,"Couldn't get memory for reading %s reason: %s \n", fileName, strerror(errno));
330	fclose(f);
331	return 0;
332	}
333
334	fread(bufferChars, sizeof(char), fileSize, f);
335	if( ferror(f) ) {
336	fprintf(stderr,"Couldn't read %s reason: %s \n", fileName, strerror(errno));
337	fclose(f);
338	delete[] bufferChars;
339	return 0;
340	}
341	fclose(f);
342
343	UnicodeString myText(bufferChars, fileSize, "UTF-8");
344
345	delete[] bufferChars;
346
347	charCount = myText.length();
348	buffer = new UChar[charCount];
349	if(buffer == 0) {
350	fprintf(stderr,"Couldn't get memory for reading %s reason: %s \n", fileName, strerror(errno));
351	return 0;
352	}
353
354	myText.extract(1, myText.length(), buffer);
355	charCount--; // skip the BOM
356	buffer[charCount] = 0; // NULL terminate for easier reading in the debugger
357
358	return buffer;
359	}
360
361	/*
362	* Remove spaces from the input UChar array.
363	*
364	* We check explicitly for a Unicode code value of 0x0020
365	* because Unicode::isSpaceChar returns true for CR, LF, etc.
366	*
367	*/
368	const UChar ThaiWordbreakTest::crunchSpaces(const UChar spaces, int32_t count, int32_t &nonSpaceCount)
369	{
370	int32_t i, out, spaceCount;
371
372	spaceCount = 0;
373	for (i = 0; i < count; i += 1) {
374	if (spaces[i] == 0x0020 /Unicode::isSpaceChar(spaces[i])/) {
375	spaceCount += 1;
376	}
377	}
378
379	nonSpaceCount = count - spaceCount;
380	UChar *noSpaces = new UChar[nonSpaceCount];
381
382	if (noSpaces == 0) {
383	fprintf(stderr, "Couldn't allocate memory for the space stripped text.\n");
384	return 0;
385	}
386
387	for (out = 0, i = 0; i < count; i += 1) {
388	if (spaces[i] != 0x0020 /! Unicode::isSpaceChar(spaces[i])/) {
389	noSpaces[out++] = spaces[i];
390	}
391	}
392
393	return noSpaces;
394	}
395
73c04bcf A	396	/*
	397	* Generate a text file with spaces in it from a file without.
	398	*/
	399	int generateFile(const UChar *chars, int32_t length) {
	400	Locale root("");
	401	UCharCharacterIterator *noSpaceIter = new UCharCharacterIterator(chars, length);
	402	UErrorCode status = U_ZERO_ERROR;
	403
	404	UnicodeSet complexContext(UNICODE_STRING_SIMPLE("[:LineBreak=SA:]"), status);
	405	BreakIterator *breakIter = BreakIterator::createWordInstance(root, status);
	406	breakIter->adoptText(noSpaceIter);
	407	char outbuf[1024];
	408	int32_t strlength;
	409	UChar bom = 0xFEFF;
	410
	411	printf("%s", u_strToUTF8(outbuf, sizeof(outbuf), &strlength, &bom, 1, &status));
	412	int32_t prevbreak = 0;
	413	while (U_SUCCESS(status)) {
	414	int32_t nextbreak = breakIter->next();
	415	if (nextbreak == BreakIterator::DONE) {
	416	break;
	417	}
	418	printf("%s", u_strToUTF8(outbuf, sizeof(outbuf), &strlength, &chars[prevbreak],
	419	nextbreak-prevbreak, &status));
	420	if (nextbreak > 0 && complexContext.contains(chars[nextbreak-1])
	421	&& complexContext.contains(chars[nextbreak])) {
	422	printf(" ");
	423	}
	424	prevbreak = nextbreak;
	425	}
	426
	427	if (U_FAILURE(status)) {
	428	fprintf(stderr, "generate failed: %s\n", u_errorName(status));
	429	return status;
	430	}
	431	else {
	432	return 0;
	433	}
	434	}
	435
b75a7d8f A	436	/*
	437	* The main routine. Read the command line arguments, read the text file,
	438	* remove the spaces, do the comparison and report the final results
	439	*/
	440	int main(int argc, char **argv)
	441	{
	442	char *fileName = "space.txt";
	443	int arg = 1;
	444	UBool verbose = FALSE;
73c04bcf A	445	UBool generate = FALSE;
	446
	447	if (argc >= 2 && strcmp(argv[1], "-generate") == 0) {
	448	generate = TRUE;
	449	arg += 1;
	450	}
b75a7d8f A	451
	452	if (argc >= 2 && strcmp(argv[1], "-verbose") == 0) {
	453	verbose = TRUE;
	454	arg += 1;
	455	}
	456
	457	if (arg == argc - 1) {
	458	fileName = argv[arg++];
	459	}
	460
	461	if (arg != argc) {
	462	fprintf(stderr, "Usage: %s [-verbose] [<file>]\n", argv[0]);
	463	return 1;
	464	}
	465
	466	int32_t spaceCount, nonSpaceCount;
	467	const UChar spaces, noSpaces;
	468
	469	spaces = ThaiWordbreakTest::readFile(fileName, spaceCount);
	470
	471	if (spaces == 0) {
	472	return 1;
	473	}
73c04bcf A	474
	475	if (generate) {
	476	return generateFile(spaces, spaceCount);
	477	}
b75a7d8f A	478
	479	noSpaces = ThaiWordbreakTest::crunchSpaces(spaces, spaceCount, nonSpaceCount);
	480
	481	if (noSpaces == 0) {
	482	return 1;
	483	}
	484
	485	ThaiWordbreakTest test(spaces, spaceCount, noSpaces, nonSpaceCount, verbose);
	486
	487	printf("word count: %d\n", test.getWordCount());
	488	printf("breaks not found: %d\n", test.getBreaksNotFound());
	489	printf("invalid breaks found: %d\n", test.getInvalidBreaks());
	490
	491	return 0;
	492	}
	493
	494	/*
	495	* The main constructor. Clear all the counts and construct a default
	496	* word instance of a BreakIterator.
	497	*/
	498	SpaceBreakIterator::SpaceBreakIterator(const UChar *text, int32_t count)
	499	: fBreakIter(0), fText(text), fTextCount(count), fWordCount(0), fSpaceCount(0), fDone(FALSE)
	500	{
	501	UCharCharacterIterator *iter = new UCharCharacterIterator(text, count);
	502	UErrorCode status = U_ZERO_ERROR;
73c04bcf A	503	fComplexContext.applyPattern(UNICODE_STRING_SIMPLE("[:LineBreak=SA:]"), status);
73c04bcf A	504	Locale root("");
b75a7d8f	505
73c04bcf	506	fBreakIter = BreakIterator::createWordInstance(root, status);
b75a7d8f A	507	fBreakIter->adoptText(iter);
	508	}
	509
	510	SpaceBreakIterator::SpaceBreakIterator()
	511	{
	512	// nothing
	513	}
	514
	515	/*
	516	* The destructor. delete the underlying BreakIterator
	517	*/
	518	SpaceBreakIterator::~SpaceBreakIterator()
	519	{
	520	delete fBreakIter;
	521	}
	522
	523	/*
	524	* Return the next break, counting words and spaces.
	525	*/
	526	int32_t SpaceBreakIterator::next()
	527	{
	528	if (fDone) {
	529	return BreakIterator::DONE;
	530	}
	531
73c04bcf A	532	int32_t nextBreak;
	533	do {
	534	nextBreak = fBreakIter->next();
	535
	536	if (nextBreak == BreakIterator::DONE) {
	537	fDone = TRUE;
	538	return BreakIterator::DONE;
	539	}
b75a7d8f	540	}
73c04bcf A	541	while(nextBreak > 0 && fComplexContext.contains(fText[nextBreak-1])
73c04bcf A	542	&& fComplexContext.contains(fText[nextBreak]));
b75a7d8f A	543
	544	int32_t result = nextBreak - fSpaceCount;
	545
	546	if (nextBreak < fTextCount) {
	547	if (fText[nextBreak] == 0x0020 /Unicode::isSpaceChar(fText[nextBreak])/) {
	548	fSpaceCount += fBreakIter->next() - nextBreak;
	549	}
	550	}
	551
	552	fWordCount += 1;
	553
	554	return result;
	555	}
	556
	557	/*
	558	* Returns the current space count
	559	*/
	560	int32_t SpaceBreakIterator::getSpaceCount()
	561	{
	562	return fSpaceCount;
	563	}
	564
	565	/*
	566	* Returns the current word count
	567	*/
	568	int32_t SpaceBreakIterator::getWordCount()
	569	{
	570	return fWordCount;
	571	}
	572
	573