]>
Commit | Line | Data |
---|---|---|
b75a7d8f A |
1 | /* |
2 | ****************************************************************************** | |
374ca955 | 3 | * Copyright (C) 1998-2003, International Business Machines Corporation and * |
b75a7d8f A |
4 | * others. All Rights Reserved. * |
5 | ****************************************************************************** | |
6 | */ | |
7 | ||
8 | #include <errno.h> | |
9 | #include <stdio.h> | |
10 | #include <string.h> | |
11 | ||
12 | #include "unicode/utypes.h" | |
13 | #include "unicode/uchar.h" | |
14 | #include "unicode/uchriter.h" | |
15 | #include "unicode/brkiter.h" | |
16 | #include "unicode/locid.h" | |
17 | #include "unicode/unistr.h" | |
18 | ||
19 | /* | |
20 | * This program takes a Unicode text file containing Thai text with | |
21 | * spaces inserted where the word breaks are. It computes a copy of | |
22 | * the text without spaces and uses a word instance of a Thai BreakIterator | |
23 | * to compute the word breaks. The program reports any differences in the | |
24 | * breaks. | |
25 | * | |
26 | * NOTE: by its very nature, Thai word breaking is not exact, so it is | |
27 | * expected that this program will always report some differences. | |
28 | */ | |
29 | ||
30 | /* | |
31 | * This class is a break iterator that counts words and spaces. | |
32 | */ | |
33 | class SpaceBreakIterator | |
34 | { | |
35 | public: | |
36 | // The constructor: | |
37 | // text - pointer to an array of UChars to iterate over | |
38 | // count - the number of UChars in text | |
39 | SpaceBreakIterator(const UChar *text, int32_t count); | |
40 | ||
41 | // the destructor | |
42 | ~SpaceBreakIterator(); | |
43 | ||
44 | // return next break position | |
45 | int32_t next(); | |
46 | ||
47 | // return current word count | |
48 | int32_t getWordCount(); | |
49 | ||
50 | // return current space count | |
51 | int32_t getSpaceCount(); | |
52 | ||
53 | private: | |
54 | // No arg constructor: private so clients can't call it. | |
55 | SpaceBreakIterator(); | |
56 | ||
57 | // The underlying BreakIterator | |
58 | BreakIterator *fBreakIter; | |
59 | ||
60 | // address of the UChar array | |
61 | const UChar *fText; | |
62 | ||
63 | // number of UChars in fText | |
64 | int32_t fTextCount; | |
65 | ||
66 | // current word count | |
67 | int32_t fWordCount; | |
68 | ||
69 | // current space count | |
70 | int32_t fSpaceCount; | |
71 | ||
72 | // true when fBreakIter has returned DONE | |
73 | UBool fDone; | |
74 | }; | |
75 | ||
76 | /* | |
77 | * This is the main class. It compares word breaks and reports the differences. | |
78 | */ | |
79 | class ThaiWordbreakTest | |
80 | { | |
81 | public: | |
82 | // The main constructor: | |
83 | // spaces - pointer to a UChar array for the text with spaces | |
84 | // spaceCount - the number of characters in the spaces array | |
85 | // noSpaces - pointer to a UChar array for the text without spaces | |
86 | // noSpaceCount - the number of characters in the noSpaces array | |
87 | // verbose - report all breaks if true, otherwise just report differences | |
88 | ThaiWordbreakTest(const UChar *spaces, int32_t spaceCount, const UChar *noSpaces, int32_t noSpaceCount, UBool verbose); | |
89 | ~ThaiWordbreakTest(); | |
90 | ||
91 | // returns the number of breaks that are in the spaces array | |
92 | // but aren't found in the noSpaces array | |
93 | int32_t getBreaksNotFound(); | |
94 | ||
95 | // returns the number of breaks which are found in the noSpaces | |
96 | // array but aren't in the spaces array | |
97 | int32_t getInvalidBreaks(); | |
98 | ||
99 | // returns the number of words found in the spaces array | |
100 | int32_t getWordCount(); | |
101 | ||
102 | // reads the input Unicode text file: | |
103 | // fileName - the path name of the file | |
104 | // charCount - set to the number of UChars read from the file | |
105 | // returns - the address of the UChar array containing the characters | |
106 | static const UChar *readFile(char *fileName, int32_t &charCount); | |
107 | ||
108 | // removes spaces form the input UChar array: | |
109 | // spaces - pointer to the input UChar array | |
110 | // count - number of UChars in the spaces array | |
111 | // nonSpaceCount - the number of UChars in the result array | |
112 | // returns - the address of the UChar array with spaces removed | |
113 | static const UChar *crunchSpaces(const UChar *spaces, int32_t count, int32_t &nonSpaceCount); | |
114 | ||
115 | private: | |
116 | // The no arg constructor - private so clients can't call it | |
117 | ThaiWordbreakTest(); | |
118 | ||
119 | // This does the actual comparison: | |
120 | // spaces - the address of the UChar array for the text with spaces | |
121 | // spaceCount - the number of UChars in the spaces array | |
122 | // noSpaces - the address of the UChar array for the text without spaces | |
123 | // noSpaceCount - the number of UChars in the noSpaces array | |
124 | // returns - true if all breaks match, FALSE otherwise | |
125 | UBool compareWordBreaks(const UChar *spaces, int32_t spaceCount, | |
126 | const UChar *noSpaces, int32_t noSpaceCount); | |
127 | ||
128 | // helper method to report a break in the spaces | |
129 | // array that's not found in the noSpaces array | |
130 | void breakNotFound(int32_t br); | |
131 | ||
132 | // helper method to report a break that's found in | |
133 | // the noSpaces array that's not in the spaces array | |
134 | void foundInvalidBreak(int32_t br); | |
135 | ||
136 | // count of breaks in the spaces array that | |
137 | // aren't found in the noSpaces array | |
138 | int32_t fBreaksNotFound; | |
139 | ||
140 | // count of breaks found in the noSpaces array | |
141 | // that aren't in the spaces array | |
142 | int32_t fInvalidBreaks; | |
143 | ||
144 | // number of words found in the spaces array | |
145 | int32_t fWordCount; | |
146 | ||
147 | // report all breaks if true, otherwise just report differences | |
148 | UBool fVerbose; | |
149 | }; | |
150 | ||
151 | /* | |
152 | * The main constructor: it calls compareWordBreaks and reports any differences | |
153 | */ | |
154 | ThaiWordbreakTest::ThaiWordbreakTest(const UChar *spaces, int32_t spaceCount, | |
155 | const UChar *noSpaces, int32_t noSpaceCount, UBool verbose) | |
156 | : fBreaksNotFound(0), fInvalidBreaks(0), fWordCount(0), fVerbose(verbose) | |
157 | { | |
158 | compareWordBreaks(spaces, spaceCount, noSpaces, noSpaceCount); | |
159 | } | |
160 | ||
161 | /* | |
162 | * The no arg constructor | |
163 | */ | |
164 | ThaiWordbreakTest::ThaiWordbreakTest() | |
165 | { | |
166 | // nothing | |
167 | } | |
168 | ||
169 | /* | |
170 | * The destructor | |
171 | */ | |
172 | ThaiWordbreakTest::~ThaiWordbreakTest() | |
173 | { | |
174 | // nothing? | |
175 | } | |
176 | ||
177 | /* | |
178 | * returns the number of breaks in the spaces array | |
179 | * that aren't found in the noSpaces array | |
180 | */ | |
181 | inline int32_t ThaiWordbreakTest::getBreaksNotFound() | |
182 | { | |
183 | return fBreaksNotFound; | |
184 | } | |
185 | ||
186 | /* | |
187 | * Returns the number of breaks found in the noSpaces | |
188 | * array that aren't in the spaces array | |
189 | */ | |
190 | inline int32_t ThaiWordbreakTest::getInvalidBreaks() | |
191 | { | |
192 | return fInvalidBreaks; | |
193 | } | |
194 | ||
195 | /* | |
196 | * Returns the number of words found in the spaces array | |
197 | */ | |
198 | inline int32_t ThaiWordbreakTest::getWordCount() | |
199 | { | |
200 | return fWordCount; | |
201 | } | |
202 | ||
203 | /* | |
204 | * This method does the acutal break comparison and reports the results. | |
205 | * It uses a SpaceBreakIterator to iterate over the text with spaces, | |
206 | * and a word instance of a Thai BreakIterator to iterate over the text | |
207 | * without spaces. | |
208 | */ | |
209 | UBool ThaiWordbreakTest::compareWordBreaks(const UChar *spaces, int32_t spaceCount, | |
210 | const UChar *noSpaces, int32_t noSpaceCount) | |
211 | { | |
212 | UBool result = TRUE; | |
213 | Locale thai("th"); | |
214 | UCharCharacterIterator *noSpaceIter = new UCharCharacterIterator(noSpaces, noSpaceCount); | |
215 | UErrorCode status = U_ZERO_ERROR; | |
216 | ||
217 | BreakIterator *breakIter = BreakIterator::createWordInstance(thai, status); | |
218 | breakIter->adoptText(noSpaceIter); | |
219 | ||
220 | SpaceBreakIterator spaceIter(spaces, spaceCount); | |
221 | ||
222 | int32_t nextBreak = 0; | |
223 | int32_t nextSpaceBreak = 0; | |
224 | int32_t iterCount = 0; | |
225 | ||
226 | while (TRUE) { | |
227 | nextSpaceBreak = spaceIter.next(); | |
228 | nextBreak = breakIter->next(); | |
229 | ||
230 | if (nextSpaceBreak == BreakIterator::DONE || nextBreak == BreakIterator::DONE) { | |
231 | if (nextBreak != BreakIterator::DONE) { | |
232 | fprintf(stderr, "break iterator didn't end.\n"); | |
233 | } else if (nextSpaceBreak != BreakIterator::DONE) { | |
234 | fprintf(stderr, "premature break iterator end.\n"); | |
235 | } | |
236 | ||
237 | break; | |
238 | } | |
239 | ||
240 | while (nextSpaceBreak != nextBreak && | |
241 | nextSpaceBreak != BreakIterator::DONE && nextBreak != BreakIterator::DONE) { | |
242 | if (nextSpaceBreak < nextBreak) { | |
243 | breakNotFound(nextSpaceBreak); | |
244 | result = FALSE; | |
245 | nextSpaceBreak = spaceIter.next(); | |
246 | } else if (nextSpaceBreak > nextBreak) { | |
247 | foundInvalidBreak(nextBreak); | |
248 | result = FALSE; | |
249 | nextBreak = breakIter->next(); | |
250 | } | |
251 | } | |
252 | ||
253 | if (fVerbose) { | |
254 | printf("%d %d\n", nextSpaceBreak, nextBreak); | |
255 | } | |
256 | } | |
257 | ||
258 | ||
259 | fWordCount = spaceIter.getWordCount(); | |
260 | ||
261 | delete breakIter; | |
262 | ||
263 | return result; | |
264 | } | |
265 | ||
266 | /* | |
267 | * Report a break that's in the text with spaces but | |
268 | * not found in the text without spaces. | |
269 | */ | |
270 | void ThaiWordbreakTest::breakNotFound(int32_t br) | |
271 | { | |
272 | if (fVerbose) { | |
273 | printf("%d ****\n", br); | |
274 | } else { | |
275 | fprintf(stderr, "break not found: %d\n", br); | |
276 | } | |
277 | ||
278 | fBreaksNotFound += 1; | |
279 | } | |
280 | ||
281 | /* | |
282 | * Report a break that's found in the text without spaces | |
283 | * that isn't in the text with spaces. | |
284 | */ | |
285 | void ThaiWordbreakTest::foundInvalidBreak(int32_t br) | |
286 | { | |
287 | if (fVerbose) { | |
288 | printf("**** %d\n", br); | |
289 | } else { | |
290 | fprintf(stderr, "found invalid break: %d\n", br); | |
291 | } | |
292 | ||
293 | fInvalidBreaks += 1; | |
294 | } | |
295 | ||
296 | /* | |
297 | * Read the text from a file. The text must start with a Unicode Byte | |
298 | * Order Mark (BOM) so that we know what order to read the bytes in. | |
299 | */ | |
300 | const UChar *ThaiWordbreakTest::readFile(char *fileName, int32_t &charCount) | |
301 | { | |
302 | FILE *f; | |
303 | int32_t fileSize; | |
304 | ||
305 | UChar *buffer; | |
306 | char *bufferChars; | |
307 | ||
308 | f = fopen(fileName, "rb"); | |
309 | ||
310 | if( f == NULL ) { | |
311 | fprintf(stderr,"Couldn't open %s reason: %s \n", fileName, strerror(errno)); | |
312 | return 0; | |
313 | } | |
314 | ||
315 | fseek(f, 0, SEEK_END); | |
316 | fileSize = ftell(f); | |
317 | ||
318 | fseek(f, 0, SEEK_SET); | |
319 | bufferChars = new char[fileSize]; | |
320 | ||
321 | if(bufferChars == 0) { | |
322 | fprintf(stderr,"Couldn't get memory for reading %s reason: %s \n", fileName, strerror(errno)); | |
323 | fclose(f); | |
324 | return 0; | |
325 | } | |
326 | ||
327 | fread(bufferChars, sizeof(char), fileSize, f); | |
328 | if( ferror(f) ) { | |
329 | fprintf(stderr,"Couldn't read %s reason: %s \n", fileName, strerror(errno)); | |
330 | fclose(f); | |
331 | delete[] bufferChars; | |
332 | return 0; | |
333 | } | |
334 | fclose(f); | |
335 | ||
336 | UnicodeString myText(bufferChars, fileSize, "UTF-8"); | |
337 | ||
338 | delete[] bufferChars; | |
339 | ||
340 | charCount = myText.length(); | |
341 | buffer = new UChar[charCount]; | |
342 | if(buffer == 0) { | |
343 | fprintf(stderr,"Couldn't get memory for reading %s reason: %s \n", fileName, strerror(errno)); | |
344 | return 0; | |
345 | } | |
346 | ||
347 | myText.extract(1, myText.length(), buffer); | |
348 | charCount--; // skip the BOM | |
349 | buffer[charCount] = 0; // NULL terminate for easier reading in the debugger | |
350 | ||
351 | return buffer; | |
352 | } | |
353 | ||
354 | /* | |
355 | * Remove spaces from the input UChar array. | |
356 | * | |
357 | * We check explicitly for a Unicode code value of 0x0020 | |
358 | * because Unicode::isSpaceChar returns true for CR, LF, etc. | |
359 | * | |
360 | */ | |
361 | const UChar *ThaiWordbreakTest::crunchSpaces(const UChar *spaces, int32_t count, int32_t &nonSpaceCount) | |
362 | { | |
363 | int32_t i, out, spaceCount; | |
364 | ||
365 | spaceCount = 0; | |
366 | for (i = 0; i < count; i += 1) { | |
367 | if (spaces[i] == 0x0020 /*Unicode::isSpaceChar(spaces[i])*/) { | |
368 | spaceCount += 1; | |
369 | } | |
370 | } | |
371 | ||
372 | nonSpaceCount = count - spaceCount; | |
373 | UChar *noSpaces = new UChar[nonSpaceCount]; | |
374 | ||
375 | if (noSpaces == 0) { | |
376 | fprintf(stderr, "Couldn't allocate memory for the space stripped text.\n"); | |
377 | return 0; | |
378 | } | |
379 | ||
380 | for (out = 0, i = 0; i < count; i += 1) { | |
381 | if (spaces[i] != 0x0020 /*! Unicode::isSpaceChar(spaces[i])*/) { | |
382 | noSpaces[out++] = spaces[i]; | |
383 | } | |
384 | } | |
385 | ||
386 | return noSpaces; | |
387 | } | |
388 | ||
389 | /* | |
390 | * The main routine. Read the command line arguments, read the text file, | |
391 | * remove the spaces, do the comparison and report the final results | |
392 | */ | |
393 | int main(int argc, char **argv) | |
394 | { | |
395 | char *fileName = "space.txt"; | |
396 | int arg = 1; | |
397 | UBool verbose = FALSE; | |
398 | ||
399 | if (argc >= 2 && strcmp(argv[1], "-verbose") == 0) { | |
400 | verbose = TRUE; | |
401 | arg += 1; | |
402 | } | |
403 | ||
404 | if (arg == argc - 1) { | |
405 | fileName = argv[arg++]; | |
406 | } | |
407 | ||
408 | if (arg != argc) { | |
409 | fprintf(stderr, "Usage: %s [-verbose] [<file>]\n", argv[0]); | |
410 | return 1; | |
411 | } | |
412 | ||
413 | int32_t spaceCount, nonSpaceCount; | |
414 | const UChar *spaces, *noSpaces; | |
415 | ||
416 | spaces = ThaiWordbreakTest::readFile(fileName, spaceCount); | |
417 | ||
418 | if (spaces == 0) { | |
419 | return 1; | |
420 | } | |
421 | ||
422 | noSpaces = ThaiWordbreakTest::crunchSpaces(spaces, spaceCount, nonSpaceCount); | |
423 | ||
424 | if (noSpaces == 0) { | |
425 | return 1; | |
426 | } | |
427 | ||
428 | ThaiWordbreakTest test(spaces, spaceCount, noSpaces, nonSpaceCount, verbose); | |
429 | ||
430 | printf("word count: %d\n", test.getWordCount()); | |
431 | printf("breaks not found: %d\n", test.getBreaksNotFound()); | |
432 | printf("invalid breaks found: %d\n", test.getInvalidBreaks()); | |
433 | ||
434 | return 0; | |
435 | } | |
436 | ||
437 | /* | |
438 | * The main constructor. Clear all the counts and construct a default | |
439 | * word instance of a BreakIterator. | |
440 | */ | |
441 | SpaceBreakIterator::SpaceBreakIterator(const UChar *text, int32_t count) | |
442 | : fBreakIter(0), fText(text), fTextCount(count), fWordCount(0), fSpaceCount(0), fDone(FALSE) | |
443 | { | |
444 | UCharCharacterIterator *iter = new UCharCharacterIterator(text, count); | |
445 | UErrorCode status = U_ZERO_ERROR; | |
446 | Locale us("us"); | |
447 | ||
448 | fBreakIter = BreakIterator::createWordInstance(us, status); | |
449 | fBreakIter->adoptText(iter); | |
450 | } | |
451 | ||
452 | SpaceBreakIterator::SpaceBreakIterator() | |
453 | { | |
454 | // nothing | |
455 | } | |
456 | ||
457 | /* | |
458 | * The destructor. delete the underlying BreakIterator | |
459 | */ | |
460 | SpaceBreakIterator::~SpaceBreakIterator() | |
461 | { | |
462 | delete fBreakIter; | |
463 | } | |
464 | ||
465 | /* | |
466 | * Return the next break, counting words and spaces. | |
467 | */ | |
468 | int32_t SpaceBreakIterator::next() | |
469 | { | |
470 | if (fDone) { | |
471 | return BreakIterator::DONE; | |
472 | } | |
473 | ||
474 | int32_t nextBreak = fBreakIter->next(); | |
475 | ||
476 | if (nextBreak == BreakIterator::DONE) { | |
477 | fDone = TRUE; | |
478 | return BreakIterator::DONE; | |
479 | } | |
480 | ||
481 | int32_t result = nextBreak - fSpaceCount; | |
482 | ||
483 | if (nextBreak < fTextCount) { | |
484 | if (fText[nextBreak] == 0x0020 /*Unicode::isSpaceChar(fText[nextBreak])*/) { | |
485 | fSpaceCount += fBreakIter->next() - nextBreak; | |
486 | } | |
487 | } | |
488 | ||
489 | fWordCount += 1; | |
490 | ||
491 | return result; | |
492 | } | |
493 | ||
494 | /* | |
495 | * Returns the current space count | |
496 | */ | |
497 | int32_t SpaceBreakIterator::getSpaceCount() | |
498 | { | |
499 | return fSpaceCount; | |
500 | } | |
501 | ||
502 | /* | |
503 | * Returns the current word count | |
504 | */ | |
505 | int32_t SpaceBreakIterator::getWordCount() | |
506 | { | |
507 | return fWordCount; | |
508 | } | |
509 | ||
510 |