]>
Commit | Line | Data |
---|---|---|
b75a7d8f A |
1 | /* |
2 | ****************************************************************************** | |
73c04bcf A |
3 | * Copyright (C) 1998-2003, 2006, International Business Machines Corporation * |
4 | * and others. All Rights Reserved. * | |
b75a7d8f A |
5 | ****************************************************************************** |
6 | */ | |
7 | ||
8 | #include <errno.h> | |
9 | #include <stdio.h> | |
10 | #include <string.h> | |
11 | ||
12 | #include "unicode/utypes.h" | |
13 | #include "unicode/uchar.h" | |
14 | #include "unicode/uchriter.h" | |
15 | #include "unicode/brkiter.h" | |
16 | #include "unicode/locid.h" | |
17 | #include "unicode/unistr.h" | |
73c04bcf A |
18 | #include "unicode/uniset.h" |
19 | #include "unicode/ustring.h" | |
b75a7d8f A |
20 | |
21 | /* | |
22 | * This program takes a Unicode text file containing Thai text with | |
23 | * spaces inserted where the word breaks are. It computes a copy of | |
24 | * the text without spaces and uses a word instance of a Thai BreakIterator | |
25 | * to compute the word breaks. The program reports any differences in the | |
26 | * breaks. | |
27 | * | |
28 | * NOTE: by its very nature, Thai word breaking is not exact, so it is | |
29 | * expected that this program will always report some differences. | |
30 | */ | |
31 | ||
32 | /* | |
33 | * This class is a break iterator that counts words and spaces. | |
34 | */ | |
35 | class SpaceBreakIterator | |
36 | { | |
37 | public: | |
38 | // The constructor: | |
39 | // text - pointer to an array of UChars to iterate over | |
40 | // count - the number of UChars in text | |
41 | SpaceBreakIterator(const UChar *text, int32_t count); | |
42 | ||
43 | // the destructor | |
44 | ~SpaceBreakIterator(); | |
45 | ||
46 | // return next break position | |
47 | int32_t next(); | |
48 | ||
49 | // return current word count | |
50 | int32_t getWordCount(); | |
51 | ||
52 | // return current space count | |
53 | int32_t getSpaceCount(); | |
54 | ||
55 | private: | |
56 | // No arg constructor: private so clients can't call it. | |
57 | SpaceBreakIterator(); | |
58 | ||
59 | // The underlying BreakIterator | |
60 | BreakIterator *fBreakIter; | |
61 | ||
62 | // address of the UChar array | |
63 | const UChar *fText; | |
64 | ||
65 | // number of UChars in fText | |
66 | int32_t fTextCount; | |
67 | ||
68 | // current word count | |
69 | int32_t fWordCount; | |
70 | ||
71 | // current space count | |
72 | int32_t fSpaceCount; | |
73c04bcf A |
73 | |
74 | // UnicodeSet of SA characters | |
75 | UnicodeSet fComplexContext; | |
b75a7d8f A |
76 | |
77 | // true when fBreakIter has returned DONE | |
78 | UBool fDone; | |
79 | }; | |
80 | ||
81 | /* | |
82 | * This is the main class. It compares word breaks and reports the differences. | |
83 | */ | |
84 | class ThaiWordbreakTest | |
85 | { | |
86 | public: | |
87 | // The main constructor: | |
88 | // spaces - pointer to a UChar array for the text with spaces | |
89 | // spaceCount - the number of characters in the spaces array | |
90 | // noSpaces - pointer to a UChar array for the text without spaces | |
91 | // noSpaceCount - the number of characters in the noSpaces array | |
92 | // verbose - report all breaks if true, otherwise just report differences | |
93 | ThaiWordbreakTest(const UChar *spaces, int32_t spaceCount, const UChar *noSpaces, int32_t noSpaceCount, UBool verbose); | |
94 | ~ThaiWordbreakTest(); | |
95 | ||
96 | // returns the number of breaks that are in the spaces array | |
97 | // but aren't found in the noSpaces array | |
98 | int32_t getBreaksNotFound(); | |
99 | ||
100 | // returns the number of breaks which are found in the noSpaces | |
101 | // array but aren't in the spaces array | |
102 | int32_t getInvalidBreaks(); | |
103 | ||
104 | // returns the number of words found in the spaces array | |
105 | int32_t getWordCount(); | |
106 | ||
107 | // reads the input Unicode text file: | |
108 | // fileName - the path name of the file | |
109 | // charCount - set to the number of UChars read from the file | |
110 | // returns - the address of the UChar array containing the characters | |
111 | static const UChar *readFile(char *fileName, int32_t &charCount); | |
112 | ||
113 | // removes spaces form the input UChar array: | |
114 | // spaces - pointer to the input UChar array | |
115 | // count - number of UChars in the spaces array | |
116 | // nonSpaceCount - the number of UChars in the result array | |
117 | // returns - the address of the UChar array with spaces removed | |
118 | static const UChar *crunchSpaces(const UChar *spaces, int32_t count, int32_t &nonSpaceCount); | |
119 | ||
120 | private: | |
121 | // The no arg constructor - private so clients can't call it | |
122 | ThaiWordbreakTest(); | |
123 | ||
124 | // This does the actual comparison: | |
125 | // spaces - the address of the UChar array for the text with spaces | |
126 | // spaceCount - the number of UChars in the spaces array | |
127 | // noSpaces - the address of the UChar array for the text without spaces | |
128 | // noSpaceCount - the number of UChars in the noSpaces array | |
129 | // returns - true if all breaks match, FALSE otherwise | |
130 | UBool compareWordBreaks(const UChar *spaces, int32_t spaceCount, | |
131 | const UChar *noSpaces, int32_t noSpaceCount); | |
132 | ||
133 | // helper method to report a break in the spaces | |
134 | // array that's not found in the noSpaces array | |
135 | void breakNotFound(int32_t br); | |
136 | ||
137 | // helper method to report a break that's found in | |
138 | // the noSpaces array that's not in the spaces array | |
139 | void foundInvalidBreak(int32_t br); | |
140 | ||
141 | // count of breaks in the spaces array that | |
142 | // aren't found in the noSpaces array | |
143 | int32_t fBreaksNotFound; | |
144 | ||
145 | // count of breaks found in the noSpaces array | |
146 | // that aren't in the spaces array | |
147 | int32_t fInvalidBreaks; | |
148 | ||
149 | // number of words found in the spaces array | |
150 | int32_t fWordCount; | |
151 | ||
152 | // report all breaks if true, otherwise just report differences | |
153 | UBool fVerbose; | |
154 | }; | |
155 | ||
156 | /* | |
157 | * The main constructor: it calls compareWordBreaks and reports any differences | |
158 | */ | |
159 | ThaiWordbreakTest::ThaiWordbreakTest(const UChar *spaces, int32_t spaceCount, | |
160 | const UChar *noSpaces, int32_t noSpaceCount, UBool verbose) | |
161 | : fBreaksNotFound(0), fInvalidBreaks(0), fWordCount(0), fVerbose(verbose) | |
162 | { | |
163 | compareWordBreaks(spaces, spaceCount, noSpaces, noSpaceCount); | |
164 | } | |
165 | ||
166 | /* | |
167 | * The no arg constructor | |
168 | */ | |
169 | ThaiWordbreakTest::ThaiWordbreakTest() | |
170 | { | |
171 | // nothing | |
172 | } | |
173 | ||
174 | /* | |
175 | * The destructor | |
176 | */ | |
177 | ThaiWordbreakTest::~ThaiWordbreakTest() | |
178 | { | |
179 | // nothing? | |
180 | } | |
181 | ||
182 | /* | |
183 | * returns the number of breaks in the spaces array | |
184 | * that aren't found in the noSpaces array | |
185 | */ | |
186 | inline int32_t ThaiWordbreakTest::getBreaksNotFound() | |
187 | { | |
188 | return fBreaksNotFound; | |
189 | } | |
190 | ||
191 | /* | |
192 | * Returns the number of breaks found in the noSpaces | |
193 | * array that aren't in the spaces array | |
194 | */ | |
195 | inline int32_t ThaiWordbreakTest::getInvalidBreaks() | |
196 | { | |
197 | return fInvalidBreaks; | |
198 | } | |
199 | ||
200 | /* | |
201 | * Returns the number of words found in the spaces array | |
202 | */ | |
203 | inline int32_t ThaiWordbreakTest::getWordCount() | |
204 | { | |
205 | return fWordCount; | |
206 | } | |
207 | ||
208 | /* | |
209 | * This method does the acutal break comparison and reports the results. | |
210 | * It uses a SpaceBreakIterator to iterate over the text with spaces, | |
211 | * and a word instance of a Thai BreakIterator to iterate over the text | |
212 | * without spaces. | |
213 | */ | |
214 | UBool ThaiWordbreakTest::compareWordBreaks(const UChar *spaces, int32_t spaceCount, | |
215 | const UChar *noSpaces, int32_t noSpaceCount) | |
216 | { | |
217 | UBool result = TRUE; | |
218 | Locale thai("th"); | |
219 | UCharCharacterIterator *noSpaceIter = new UCharCharacterIterator(noSpaces, noSpaceCount); | |
220 | UErrorCode status = U_ZERO_ERROR; | |
221 | ||
222 | BreakIterator *breakIter = BreakIterator::createWordInstance(thai, status); | |
223 | breakIter->adoptText(noSpaceIter); | |
224 | ||
225 | SpaceBreakIterator spaceIter(spaces, spaceCount); | |
226 | ||
227 | int32_t nextBreak = 0; | |
228 | int32_t nextSpaceBreak = 0; | |
229 | int32_t iterCount = 0; | |
230 | ||
231 | while (TRUE) { | |
232 | nextSpaceBreak = spaceIter.next(); | |
233 | nextBreak = breakIter->next(); | |
234 | ||
235 | if (nextSpaceBreak == BreakIterator::DONE || nextBreak == BreakIterator::DONE) { | |
236 | if (nextBreak != BreakIterator::DONE) { | |
237 | fprintf(stderr, "break iterator didn't end.\n"); | |
238 | } else if (nextSpaceBreak != BreakIterator::DONE) { | |
239 | fprintf(stderr, "premature break iterator end.\n"); | |
240 | } | |
241 | ||
242 | break; | |
243 | } | |
244 | ||
245 | while (nextSpaceBreak != nextBreak && | |
246 | nextSpaceBreak != BreakIterator::DONE && nextBreak != BreakIterator::DONE) { | |
247 | if (nextSpaceBreak < nextBreak) { | |
248 | breakNotFound(nextSpaceBreak); | |
249 | result = FALSE; | |
250 | nextSpaceBreak = spaceIter.next(); | |
251 | } else if (nextSpaceBreak > nextBreak) { | |
252 | foundInvalidBreak(nextBreak); | |
253 | result = FALSE; | |
254 | nextBreak = breakIter->next(); | |
255 | } | |
256 | } | |
257 | ||
258 | if (fVerbose) { | |
259 | printf("%d %d\n", nextSpaceBreak, nextBreak); | |
260 | } | |
261 | } | |
262 | ||
263 | ||
264 | fWordCount = spaceIter.getWordCount(); | |
265 | ||
266 | delete breakIter; | |
267 | ||
268 | return result; | |
269 | } | |
270 | ||
271 | /* | |
272 | * Report a break that's in the text with spaces but | |
273 | * not found in the text without spaces. | |
274 | */ | |
275 | void ThaiWordbreakTest::breakNotFound(int32_t br) | |
276 | { | |
277 | if (fVerbose) { | |
278 | printf("%d ****\n", br); | |
279 | } else { | |
280 | fprintf(stderr, "break not found: %d\n", br); | |
281 | } | |
282 | ||
283 | fBreaksNotFound += 1; | |
284 | } | |
285 | ||
286 | /* | |
287 | * Report a break that's found in the text without spaces | |
288 | * that isn't in the text with spaces. | |
289 | */ | |
290 | void ThaiWordbreakTest::foundInvalidBreak(int32_t br) | |
291 | { | |
292 | if (fVerbose) { | |
293 | printf("**** %d\n", br); | |
294 | } else { | |
295 | fprintf(stderr, "found invalid break: %d\n", br); | |
296 | } | |
297 | ||
298 | fInvalidBreaks += 1; | |
299 | } | |
300 | ||
301 | /* | |
302 | * Read the text from a file. The text must start with a Unicode Byte | |
303 | * Order Mark (BOM) so that we know what order to read the bytes in. | |
304 | */ | |
305 | const UChar *ThaiWordbreakTest::readFile(char *fileName, int32_t &charCount) | |
306 | { | |
307 | FILE *f; | |
308 | int32_t fileSize; | |
309 | ||
310 | UChar *buffer; | |
311 | char *bufferChars; | |
312 | ||
313 | f = fopen(fileName, "rb"); | |
314 | ||
315 | if( f == NULL ) { | |
316 | fprintf(stderr,"Couldn't open %s reason: %s \n", fileName, strerror(errno)); | |
317 | return 0; | |
318 | } | |
319 | ||
320 | fseek(f, 0, SEEK_END); | |
321 | fileSize = ftell(f); | |
322 | ||
323 | fseek(f, 0, SEEK_SET); | |
324 | bufferChars = new char[fileSize]; | |
325 | ||
326 | if(bufferChars == 0) { | |
327 | fprintf(stderr,"Couldn't get memory for reading %s reason: %s \n", fileName, strerror(errno)); | |
328 | fclose(f); | |
329 | return 0; | |
330 | } | |
331 | ||
332 | fread(bufferChars, sizeof(char), fileSize, f); | |
333 | if( ferror(f) ) { | |
334 | fprintf(stderr,"Couldn't read %s reason: %s \n", fileName, strerror(errno)); | |
335 | fclose(f); | |
336 | delete[] bufferChars; | |
337 | return 0; | |
338 | } | |
339 | fclose(f); | |
340 | ||
341 | UnicodeString myText(bufferChars, fileSize, "UTF-8"); | |
342 | ||
343 | delete[] bufferChars; | |
344 | ||
345 | charCount = myText.length(); | |
346 | buffer = new UChar[charCount]; | |
347 | if(buffer == 0) { | |
348 | fprintf(stderr,"Couldn't get memory for reading %s reason: %s \n", fileName, strerror(errno)); | |
349 | return 0; | |
350 | } | |
351 | ||
352 | myText.extract(1, myText.length(), buffer); | |
353 | charCount--; // skip the BOM | |
354 | buffer[charCount] = 0; // NULL terminate for easier reading in the debugger | |
355 | ||
356 | return buffer; | |
357 | } | |
358 | ||
359 | /* | |
360 | * Remove spaces from the input UChar array. | |
361 | * | |
362 | * We check explicitly for a Unicode code value of 0x0020 | |
363 | * because Unicode::isSpaceChar returns true for CR, LF, etc. | |
364 | * | |
365 | */ | |
366 | const UChar *ThaiWordbreakTest::crunchSpaces(const UChar *spaces, int32_t count, int32_t &nonSpaceCount) | |
367 | { | |
368 | int32_t i, out, spaceCount; | |
369 | ||
370 | spaceCount = 0; | |
371 | for (i = 0; i < count; i += 1) { | |
372 | if (spaces[i] == 0x0020 /*Unicode::isSpaceChar(spaces[i])*/) { | |
373 | spaceCount += 1; | |
374 | } | |
375 | } | |
376 | ||
377 | nonSpaceCount = count - spaceCount; | |
378 | UChar *noSpaces = new UChar[nonSpaceCount]; | |
379 | ||
380 | if (noSpaces == 0) { | |
381 | fprintf(stderr, "Couldn't allocate memory for the space stripped text.\n"); | |
382 | return 0; | |
383 | } | |
384 | ||
385 | for (out = 0, i = 0; i < count; i += 1) { | |
386 | if (spaces[i] != 0x0020 /*! Unicode::isSpaceChar(spaces[i])*/) { | |
387 | noSpaces[out++] = spaces[i]; | |
388 | } | |
389 | } | |
390 | ||
391 | return noSpaces; | |
392 | } | |
393 | ||
73c04bcf A |
394 | /* |
395 | * Generate a text file with spaces in it from a file without. | |
396 | */ | |
397 | int generateFile(const UChar *chars, int32_t length) { | |
398 | Locale root(""); | |
399 | UCharCharacterIterator *noSpaceIter = new UCharCharacterIterator(chars, length); | |
400 | UErrorCode status = U_ZERO_ERROR; | |
401 | ||
402 | UnicodeSet complexContext(UNICODE_STRING_SIMPLE("[:LineBreak=SA:]"), status); | |
403 | BreakIterator *breakIter = BreakIterator::createWordInstance(root, status); | |
404 | breakIter->adoptText(noSpaceIter); | |
405 | char outbuf[1024]; | |
406 | int32_t strlength; | |
407 | UChar bom = 0xFEFF; | |
408 | ||
409 | printf("%s", u_strToUTF8(outbuf, sizeof(outbuf), &strlength, &bom, 1, &status)); | |
410 | int32_t prevbreak = 0; | |
411 | while (U_SUCCESS(status)) { | |
412 | int32_t nextbreak = breakIter->next(); | |
413 | if (nextbreak == BreakIterator::DONE) { | |
414 | break; | |
415 | } | |
416 | printf("%s", u_strToUTF8(outbuf, sizeof(outbuf), &strlength, &chars[prevbreak], | |
417 | nextbreak-prevbreak, &status)); | |
418 | if (nextbreak > 0 && complexContext.contains(chars[nextbreak-1]) | |
419 | && complexContext.contains(chars[nextbreak])) { | |
420 | printf(" "); | |
421 | } | |
422 | prevbreak = nextbreak; | |
423 | } | |
424 | ||
425 | if (U_FAILURE(status)) { | |
426 | fprintf(stderr, "generate failed: %s\n", u_errorName(status)); | |
427 | return status; | |
428 | } | |
429 | else { | |
430 | return 0; | |
431 | } | |
432 | } | |
433 | ||
b75a7d8f A |
434 | /* |
435 | * The main routine. Read the command line arguments, read the text file, | |
436 | * remove the spaces, do the comparison and report the final results | |
437 | */ | |
438 | int main(int argc, char **argv) | |
439 | { | |
440 | char *fileName = "space.txt"; | |
441 | int arg = 1; | |
442 | UBool verbose = FALSE; | |
73c04bcf A |
443 | UBool generate = FALSE; |
444 | ||
445 | if (argc >= 2 && strcmp(argv[1], "-generate") == 0) { | |
446 | generate = TRUE; | |
447 | arg += 1; | |
448 | } | |
b75a7d8f A |
449 | |
450 | if (argc >= 2 && strcmp(argv[1], "-verbose") == 0) { | |
451 | verbose = TRUE; | |
452 | arg += 1; | |
453 | } | |
454 | ||
455 | if (arg == argc - 1) { | |
456 | fileName = argv[arg++]; | |
457 | } | |
458 | ||
459 | if (arg != argc) { | |
460 | fprintf(stderr, "Usage: %s [-verbose] [<file>]\n", argv[0]); | |
461 | return 1; | |
462 | } | |
463 | ||
464 | int32_t spaceCount, nonSpaceCount; | |
465 | const UChar *spaces, *noSpaces; | |
466 | ||
467 | spaces = ThaiWordbreakTest::readFile(fileName, spaceCount); | |
468 | ||
469 | if (spaces == 0) { | |
470 | return 1; | |
471 | } | |
73c04bcf A |
472 | |
473 | if (generate) { | |
474 | return generateFile(spaces, spaceCount); | |
475 | } | |
b75a7d8f A |
476 | |
477 | noSpaces = ThaiWordbreakTest::crunchSpaces(spaces, spaceCount, nonSpaceCount); | |
478 | ||
479 | if (noSpaces == 0) { | |
480 | return 1; | |
481 | } | |
482 | ||
483 | ThaiWordbreakTest test(spaces, spaceCount, noSpaces, nonSpaceCount, verbose); | |
484 | ||
485 | printf("word count: %d\n", test.getWordCount()); | |
486 | printf("breaks not found: %d\n", test.getBreaksNotFound()); | |
487 | printf("invalid breaks found: %d\n", test.getInvalidBreaks()); | |
488 | ||
489 | return 0; | |
490 | } | |
491 | ||
492 | /* | |
493 | * The main constructor. Clear all the counts and construct a default | |
494 | * word instance of a BreakIterator. | |
495 | */ | |
496 | SpaceBreakIterator::SpaceBreakIterator(const UChar *text, int32_t count) | |
497 | : fBreakIter(0), fText(text), fTextCount(count), fWordCount(0), fSpaceCount(0), fDone(FALSE) | |
498 | { | |
499 | UCharCharacterIterator *iter = new UCharCharacterIterator(text, count); | |
500 | UErrorCode status = U_ZERO_ERROR; | |
73c04bcf A |
501 | fComplexContext.applyPattern(UNICODE_STRING_SIMPLE("[:LineBreak=SA:]"), status); |
502 | Locale root(""); | |
b75a7d8f | 503 | |
73c04bcf | 504 | fBreakIter = BreakIterator::createWordInstance(root, status); |
b75a7d8f A |
505 | fBreakIter->adoptText(iter); |
506 | } | |
507 | ||
508 | SpaceBreakIterator::SpaceBreakIterator() | |
509 | { | |
510 | // nothing | |
511 | } | |
512 | ||
513 | /* | |
514 | * The destructor. delete the underlying BreakIterator | |
515 | */ | |
516 | SpaceBreakIterator::~SpaceBreakIterator() | |
517 | { | |
518 | delete fBreakIter; | |
519 | } | |
520 | ||
521 | /* | |
522 | * Return the next break, counting words and spaces. | |
523 | */ | |
524 | int32_t SpaceBreakIterator::next() | |
525 | { | |
526 | if (fDone) { | |
527 | return BreakIterator::DONE; | |
528 | } | |
529 | ||
73c04bcf A |
530 | int32_t nextBreak; |
531 | do { | |
532 | nextBreak = fBreakIter->next(); | |
533 | ||
534 | if (nextBreak == BreakIterator::DONE) { | |
535 | fDone = TRUE; | |
536 | return BreakIterator::DONE; | |
537 | } | |
b75a7d8f | 538 | } |
73c04bcf A |
539 | while(nextBreak > 0 && fComplexContext.contains(fText[nextBreak-1]) |
540 | && fComplexContext.contains(fText[nextBreak])); | |
b75a7d8f A |
541 | |
542 | int32_t result = nextBreak - fSpaceCount; | |
543 | ||
544 | if (nextBreak < fTextCount) { | |
545 | if (fText[nextBreak] == 0x0020 /*Unicode::isSpaceChar(fText[nextBreak])*/) { | |
546 | fSpaceCount += fBreakIter->next() - nextBreak; | |
547 | } | |
548 | } | |
549 | ||
550 | fWordCount += 1; | |
551 | ||
552 | return result; | |
553 | } | |
554 | ||
555 | /* | |
556 | * Returns the current space count | |
557 | */ | |
558 | int32_t SpaceBreakIterator::getSpaceCount() | |
559 | { | |
560 | return fSpaceCount; | |
561 | } | |
562 | ||
563 | /* | |
564 | * Returns the current word count | |
565 | */ | |
566 | int32_t SpaceBreakIterator::getWordCount() | |
567 | { | |
568 | return fWordCount; | |
569 | } | |
570 | ||
571 |