]>
Commit | Line | Data |
---|---|---|
f3c0d7a5 A |
1 | // © 2016 and later: Unicode, Inc. and others. |
2 | // License & terms of use: http://www.unicode.org/copyright.html | |
b75a7d8f A |
3 | /* |
4 | ****************************************************************************** | |
73c04bcf A |
5 | * Copyright (C) 1998-2003, 2006, International Business Machines Corporation * |
6 | * and others. All Rights Reserved. * | |
b75a7d8f A |
7 | ****************************************************************************** |
8 | */ | |
9 | ||
10 | #include <errno.h> | |
11 | #include <stdio.h> | |
12 | #include <string.h> | |
13 | ||
14 | #include "unicode/utypes.h" | |
15 | #include "unicode/uchar.h" | |
16 | #include "unicode/uchriter.h" | |
17 | #include "unicode/brkiter.h" | |
18 | #include "unicode/locid.h" | |
19 | #include "unicode/unistr.h" | |
73c04bcf A |
20 | #include "unicode/uniset.h" |
21 | #include "unicode/ustring.h" | |
b75a7d8f A |
22 | |
23 | /* | |
24 | * This program takes a Unicode text file containing Thai text with | |
25 | * spaces inserted where the word breaks are. It computes a copy of | |
26 | * the text without spaces and uses a word instance of a Thai BreakIterator | |
27 | * to compute the word breaks. The program reports any differences in the | |
28 | * breaks. | |
29 | * | |
30 | * NOTE: by its very nature, Thai word breaking is not exact, so it is | |
31 | * expected that this program will always report some differences. | |
32 | */ | |
33 | ||
34 | /* | |
35 | * This class is a break iterator that counts words and spaces. | |
36 | */ | |
37 | class SpaceBreakIterator | |
38 | { | |
39 | public: | |
40 | // The constructor: | |
41 | // text - pointer to an array of UChars to iterate over | |
42 | // count - the number of UChars in text | |
43 | SpaceBreakIterator(const UChar *text, int32_t count); | |
44 | ||
45 | // the destructor | |
46 | ~SpaceBreakIterator(); | |
47 | ||
48 | // return next break position | |
49 | int32_t next(); | |
50 | ||
51 | // return current word count | |
52 | int32_t getWordCount(); | |
53 | ||
54 | // return current space count | |
55 | int32_t getSpaceCount(); | |
56 | ||
57 | private: | |
58 | // No arg constructor: private so clients can't call it. | |
59 | SpaceBreakIterator(); | |
60 | ||
61 | // The underlying BreakIterator | |
62 | BreakIterator *fBreakIter; | |
63 | ||
64 | // address of the UChar array | |
65 | const UChar *fText; | |
66 | ||
67 | // number of UChars in fText | |
68 | int32_t fTextCount; | |
69 | ||
70 | // current word count | |
71 | int32_t fWordCount; | |
72 | ||
73 | // current space count | |
74 | int32_t fSpaceCount; | |
73c04bcf A |
75 | |
76 | // UnicodeSet of SA characters | |
77 | UnicodeSet fComplexContext; | |
b75a7d8f A |
78 | |
79 | // true when fBreakIter has returned DONE | |
80 | UBool fDone; | |
81 | }; | |
82 | ||
83 | /* | |
84 | * This is the main class. It compares word breaks and reports the differences. | |
85 | */ | |
86 | class ThaiWordbreakTest | |
87 | { | |
88 | public: | |
89 | // The main constructor: | |
90 | // spaces - pointer to a UChar array for the text with spaces | |
91 | // spaceCount - the number of characters in the spaces array | |
92 | // noSpaces - pointer to a UChar array for the text without spaces | |
93 | // noSpaceCount - the number of characters in the noSpaces array | |
94 | // verbose - report all breaks if true, otherwise just report differences | |
95 | ThaiWordbreakTest(const UChar *spaces, int32_t spaceCount, const UChar *noSpaces, int32_t noSpaceCount, UBool verbose); | |
96 | ~ThaiWordbreakTest(); | |
97 | ||
98 | // returns the number of breaks that are in the spaces array | |
99 | // but aren't found in the noSpaces array | |
100 | int32_t getBreaksNotFound(); | |
101 | ||
102 | // returns the number of breaks which are found in the noSpaces | |
103 | // array but aren't in the spaces array | |
104 | int32_t getInvalidBreaks(); | |
105 | ||
106 | // returns the number of words found in the spaces array | |
107 | int32_t getWordCount(); | |
108 | ||
109 | // reads the input Unicode text file: | |
110 | // fileName - the path name of the file | |
111 | // charCount - set to the number of UChars read from the file | |
112 | // returns - the address of the UChar array containing the characters | |
113 | static const UChar *readFile(char *fileName, int32_t &charCount); | |
114 | ||
115 | // removes spaces form the input UChar array: | |
116 | // spaces - pointer to the input UChar array | |
117 | // count - number of UChars in the spaces array | |
118 | // nonSpaceCount - the number of UChars in the result array | |
119 | // returns - the address of the UChar array with spaces removed | |
120 | static const UChar *crunchSpaces(const UChar *spaces, int32_t count, int32_t &nonSpaceCount); | |
121 | ||
122 | private: | |
123 | // The no arg constructor - private so clients can't call it | |
124 | ThaiWordbreakTest(); | |
125 | ||
126 | // This does the actual comparison: | |
127 | // spaces - the address of the UChar array for the text with spaces | |
128 | // spaceCount - the number of UChars in the spaces array | |
129 | // noSpaces - the address of the UChar array for the text without spaces | |
130 | // noSpaceCount - the number of UChars in the noSpaces array | |
131 | // returns - true if all breaks match, FALSE otherwise | |
132 | UBool compareWordBreaks(const UChar *spaces, int32_t spaceCount, | |
133 | const UChar *noSpaces, int32_t noSpaceCount); | |
134 | ||
135 | // helper method to report a break in the spaces | |
136 | // array that's not found in the noSpaces array | |
137 | void breakNotFound(int32_t br); | |
138 | ||
139 | // helper method to report a break that's found in | |
140 | // the noSpaces array that's not in the spaces array | |
141 | void foundInvalidBreak(int32_t br); | |
142 | ||
143 | // count of breaks in the spaces array that | |
144 | // aren't found in the noSpaces array | |
145 | int32_t fBreaksNotFound; | |
146 | ||
147 | // count of breaks found in the noSpaces array | |
148 | // that aren't in the spaces array | |
149 | int32_t fInvalidBreaks; | |
150 | ||
151 | // number of words found in the spaces array | |
152 | int32_t fWordCount; | |
153 | ||
154 | // report all breaks if true, otherwise just report differences | |
155 | UBool fVerbose; | |
156 | }; | |
157 | ||
158 | /* | |
159 | * The main constructor: it calls compareWordBreaks and reports any differences | |
160 | */ | |
161 | ThaiWordbreakTest::ThaiWordbreakTest(const UChar *spaces, int32_t spaceCount, | |
162 | const UChar *noSpaces, int32_t noSpaceCount, UBool verbose) | |
163 | : fBreaksNotFound(0), fInvalidBreaks(0), fWordCount(0), fVerbose(verbose) | |
164 | { | |
165 | compareWordBreaks(spaces, spaceCount, noSpaces, noSpaceCount); | |
166 | } | |
167 | ||
168 | /* | |
169 | * The no arg constructor | |
170 | */ | |
171 | ThaiWordbreakTest::ThaiWordbreakTest() | |
172 | { | |
173 | // nothing | |
174 | } | |
175 | ||
176 | /* | |
177 | * The destructor | |
178 | */ | |
179 | ThaiWordbreakTest::~ThaiWordbreakTest() | |
180 | { | |
181 | // nothing? | |
182 | } | |
183 | ||
184 | /* | |
185 | * returns the number of breaks in the spaces array | |
186 | * that aren't found in the noSpaces array | |
187 | */ | |
188 | inline int32_t ThaiWordbreakTest::getBreaksNotFound() | |
189 | { | |
190 | return fBreaksNotFound; | |
191 | } | |
192 | ||
193 | /* | |
194 | * Returns the number of breaks found in the noSpaces | |
195 | * array that aren't in the spaces array | |
196 | */ | |
197 | inline int32_t ThaiWordbreakTest::getInvalidBreaks() | |
198 | { | |
199 | return fInvalidBreaks; | |
200 | } | |
201 | ||
202 | /* | |
203 | * Returns the number of words found in the spaces array | |
204 | */ | |
205 | inline int32_t ThaiWordbreakTest::getWordCount() | |
206 | { | |
207 | return fWordCount; | |
208 | } | |
209 | ||
210 | /* | |
211 | * This method does the acutal break comparison and reports the results. | |
212 | * It uses a SpaceBreakIterator to iterate over the text with spaces, | |
213 | * and a word instance of a Thai BreakIterator to iterate over the text | |
214 | * without spaces. | |
215 | */ | |
216 | UBool ThaiWordbreakTest::compareWordBreaks(const UChar *spaces, int32_t spaceCount, | |
217 | const UChar *noSpaces, int32_t noSpaceCount) | |
218 | { | |
219 | UBool result = TRUE; | |
220 | Locale thai("th"); | |
221 | UCharCharacterIterator *noSpaceIter = new UCharCharacterIterator(noSpaces, noSpaceCount); | |
222 | UErrorCode status = U_ZERO_ERROR; | |
223 | ||
224 | BreakIterator *breakIter = BreakIterator::createWordInstance(thai, status); | |
225 | breakIter->adoptText(noSpaceIter); | |
226 | ||
227 | SpaceBreakIterator spaceIter(spaces, spaceCount); | |
228 | ||
229 | int32_t nextBreak = 0; | |
230 | int32_t nextSpaceBreak = 0; | |
231 | int32_t iterCount = 0; | |
232 | ||
233 | while (TRUE) { | |
234 | nextSpaceBreak = spaceIter.next(); | |
235 | nextBreak = breakIter->next(); | |
236 | ||
237 | if (nextSpaceBreak == BreakIterator::DONE || nextBreak == BreakIterator::DONE) { | |
238 | if (nextBreak != BreakIterator::DONE) { | |
239 | fprintf(stderr, "break iterator didn't end.\n"); | |
240 | } else if (nextSpaceBreak != BreakIterator::DONE) { | |
241 | fprintf(stderr, "premature break iterator end.\n"); | |
242 | } | |
243 | ||
244 | break; | |
245 | } | |
246 | ||
247 | while (nextSpaceBreak != nextBreak && | |
248 | nextSpaceBreak != BreakIterator::DONE && nextBreak != BreakIterator::DONE) { | |
249 | if (nextSpaceBreak < nextBreak) { | |
250 | breakNotFound(nextSpaceBreak); | |
251 | result = FALSE; | |
252 | nextSpaceBreak = spaceIter.next(); | |
253 | } else if (nextSpaceBreak > nextBreak) { | |
254 | foundInvalidBreak(nextBreak); | |
255 | result = FALSE; | |
256 | nextBreak = breakIter->next(); | |
257 | } | |
258 | } | |
259 | ||
260 | if (fVerbose) { | |
261 | printf("%d %d\n", nextSpaceBreak, nextBreak); | |
262 | } | |
263 | } | |
264 | ||
265 | ||
266 | fWordCount = spaceIter.getWordCount(); | |
267 | ||
268 | delete breakIter; | |
269 | ||
270 | return result; | |
271 | } | |
272 | ||
273 | /* | |
274 | * Report a break that's in the text with spaces but | |
275 | * not found in the text without spaces. | |
276 | */ | |
277 | void ThaiWordbreakTest::breakNotFound(int32_t br) | |
278 | { | |
279 | if (fVerbose) { | |
280 | printf("%d ****\n", br); | |
281 | } else { | |
282 | fprintf(stderr, "break not found: %d\n", br); | |
283 | } | |
284 | ||
285 | fBreaksNotFound += 1; | |
286 | } | |
287 | ||
288 | /* | |
289 | * Report a break that's found in the text without spaces | |
290 | * that isn't in the text with spaces. | |
291 | */ | |
292 | void ThaiWordbreakTest::foundInvalidBreak(int32_t br) | |
293 | { | |
294 | if (fVerbose) { | |
295 | printf("**** %d\n", br); | |
296 | } else { | |
297 | fprintf(stderr, "found invalid break: %d\n", br); | |
298 | } | |
299 | ||
300 | fInvalidBreaks += 1; | |
301 | } | |
302 | ||
303 | /* | |
304 | * Read the text from a file. The text must start with a Unicode Byte | |
305 | * Order Mark (BOM) so that we know what order to read the bytes in. | |
306 | */ | |
307 | const UChar *ThaiWordbreakTest::readFile(char *fileName, int32_t &charCount) | |
308 | { | |
309 | FILE *f; | |
310 | int32_t fileSize; | |
311 | ||
312 | UChar *buffer; | |
313 | char *bufferChars; | |
314 | ||
315 | f = fopen(fileName, "rb"); | |
316 | ||
317 | if( f == NULL ) { | |
318 | fprintf(stderr,"Couldn't open %s reason: %s \n", fileName, strerror(errno)); | |
319 | return 0; | |
320 | } | |
321 | ||
322 | fseek(f, 0, SEEK_END); | |
323 | fileSize = ftell(f); | |
324 | ||
325 | fseek(f, 0, SEEK_SET); | |
326 | bufferChars = new char[fileSize]; | |
327 | ||
328 | if(bufferChars == 0) { | |
329 | fprintf(stderr,"Couldn't get memory for reading %s reason: %s \n", fileName, strerror(errno)); | |
330 | fclose(f); | |
331 | return 0; | |
332 | } | |
333 | ||
334 | fread(bufferChars, sizeof(char), fileSize, f); | |
335 | if( ferror(f) ) { | |
336 | fprintf(stderr,"Couldn't read %s reason: %s \n", fileName, strerror(errno)); | |
337 | fclose(f); | |
338 | delete[] bufferChars; | |
339 | return 0; | |
340 | } | |
341 | fclose(f); | |
342 | ||
343 | UnicodeString myText(bufferChars, fileSize, "UTF-8"); | |
344 | ||
345 | delete[] bufferChars; | |
346 | ||
347 | charCount = myText.length(); | |
348 | buffer = new UChar[charCount]; | |
349 | if(buffer == 0) { | |
350 | fprintf(stderr,"Couldn't get memory for reading %s reason: %s \n", fileName, strerror(errno)); | |
351 | return 0; | |
352 | } | |
353 | ||
354 | myText.extract(1, myText.length(), buffer); | |
355 | charCount--; // skip the BOM | |
356 | buffer[charCount] = 0; // NULL terminate for easier reading in the debugger | |
357 | ||
358 | return buffer; | |
359 | } | |
360 | ||
361 | /* | |
362 | * Remove spaces from the input UChar array. | |
363 | * | |
364 | * We check explicitly for a Unicode code value of 0x0020 | |
365 | * because Unicode::isSpaceChar returns true for CR, LF, etc. | |
366 | * | |
367 | */ | |
368 | const UChar *ThaiWordbreakTest::crunchSpaces(const UChar *spaces, int32_t count, int32_t &nonSpaceCount) | |
369 | { | |
370 | int32_t i, out, spaceCount; | |
371 | ||
372 | spaceCount = 0; | |
373 | for (i = 0; i < count; i += 1) { | |
374 | if (spaces[i] == 0x0020 /*Unicode::isSpaceChar(spaces[i])*/) { | |
375 | spaceCount += 1; | |
376 | } | |
377 | } | |
378 | ||
379 | nonSpaceCount = count - spaceCount; | |
380 | UChar *noSpaces = new UChar[nonSpaceCount]; | |
381 | ||
382 | if (noSpaces == 0) { | |
383 | fprintf(stderr, "Couldn't allocate memory for the space stripped text.\n"); | |
384 | return 0; | |
385 | } | |
386 | ||
387 | for (out = 0, i = 0; i < count; i += 1) { | |
388 | if (spaces[i] != 0x0020 /*! Unicode::isSpaceChar(spaces[i])*/) { | |
389 | noSpaces[out++] = spaces[i]; | |
390 | } | |
391 | } | |
392 | ||
393 | return noSpaces; | |
394 | } | |
395 | ||
73c04bcf A |
396 | /* |
397 | * Generate a text file with spaces in it from a file without. | |
398 | */ | |
399 | int generateFile(const UChar *chars, int32_t length) { | |
400 | Locale root(""); | |
401 | UCharCharacterIterator *noSpaceIter = new UCharCharacterIterator(chars, length); | |
402 | UErrorCode status = U_ZERO_ERROR; | |
403 | ||
404 | UnicodeSet complexContext(UNICODE_STRING_SIMPLE("[:LineBreak=SA:]"), status); | |
405 | BreakIterator *breakIter = BreakIterator::createWordInstance(root, status); | |
406 | breakIter->adoptText(noSpaceIter); | |
407 | char outbuf[1024]; | |
408 | int32_t strlength; | |
409 | UChar bom = 0xFEFF; | |
410 | ||
411 | printf("%s", u_strToUTF8(outbuf, sizeof(outbuf), &strlength, &bom, 1, &status)); | |
412 | int32_t prevbreak = 0; | |
413 | while (U_SUCCESS(status)) { | |
414 | int32_t nextbreak = breakIter->next(); | |
415 | if (nextbreak == BreakIterator::DONE) { | |
416 | break; | |
417 | } | |
418 | printf("%s", u_strToUTF8(outbuf, sizeof(outbuf), &strlength, &chars[prevbreak], | |
419 | nextbreak-prevbreak, &status)); | |
420 | if (nextbreak > 0 && complexContext.contains(chars[nextbreak-1]) | |
421 | && complexContext.contains(chars[nextbreak])) { | |
422 | printf(" "); | |
423 | } | |
424 | prevbreak = nextbreak; | |
425 | } | |
426 | ||
427 | if (U_FAILURE(status)) { | |
428 | fprintf(stderr, "generate failed: %s\n", u_errorName(status)); | |
429 | return status; | |
430 | } | |
431 | else { | |
432 | return 0; | |
433 | } | |
434 | } | |
435 | ||
b75a7d8f A |
436 | /* |
437 | * The main routine. Read the command line arguments, read the text file, | |
438 | * remove the spaces, do the comparison and report the final results | |
439 | */ | |
440 | int main(int argc, char **argv) | |
441 | { | |
442 | char *fileName = "space.txt"; | |
443 | int arg = 1; | |
444 | UBool verbose = FALSE; | |
73c04bcf A |
445 | UBool generate = FALSE; |
446 | ||
447 | if (argc >= 2 && strcmp(argv[1], "-generate") == 0) { | |
448 | generate = TRUE; | |
449 | arg += 1; | |
450 | } | |
b75a7d8f A |
451 | |
452 | if (argc >= 2 && strcmp(argv[1], "-verbose") == 0) { | |
453 | verbose = TRUE; | |
454 | arg += 1; | |
455 | } | |
456 | ||
457 | if (arg == argc - 1) { | |
458 | fileName = argv[arg++]; | |
459 | } | |
460 | ||
461 | if (arg != argc) { | |
462 | fprintf(stderr, "Usage: %s [-verbose] [<file>]\n", argv[0]); | |
463 | return 1; | |
464 | } | |
465 | ||
466 | int32_t spaceCount, nonSpaceCount; | |
467 | const UChar *spaces, *noSpaces; | |
468 | ||
469 | spaces = ThaiWordbreakTest::readFile(fileName, spaceCount); | |
470 | ||
471 | if (spaces == 0) { | |
472 | return 1; | |
473 | } | |
73c04bcf A |
474 | |
475 | if (generate) { | |
476 | return generateFile(spaces, spaceCount); | |
477 | } | |
b75a7d8f A |
478 | |
479 | noSpaces = ThaiWordbreakTest::crunchSpaces(spaces, spaceCount, nonSpaceCount); | |
480 | ||
481 | if (noSpaces == 0) { | |
482 | return 1; | |
483 | } | |
484 | ||
485 | ThaiWordbreakTest test(spaces, spaceCount, noSpaces, nonSpaceCount, verbose); | |
486 | ||
487 | printf("word count: %d\n", test.getWordCount()); | |
488 | printf("breaks not found: %d\n", test.getBreaksNotFound()); | |
489 | printf("invalid breaks found: %d\n", test.getInvalidBreaks()); | |
490 | ||
491 | return 0; | |
492 | } | |
493 | ||
494 | /* | |
495 | * The main constructor. Clear all the counts and construct a default | |
496 | * word instance of a BreakIterator. | |
497 | */ | |
498 | SpaceBreakIterator::SpaceBreakIterator(const UChar *text, int32_t count) | |
499 | : fBreakIter(0), fText(text), fTextCount(count), fWordCount(0), fSpaceCount(0), fDone(FALSE) | |
500 | { | |
501 | UCharCharacterIterator *iter = new UCharCharacterIterator(text, count); | |
502 | UErrorCode status = U_ZERO_ERROR; | |
73c04bcf A |
503 | fComplexContext.applyPattern(UNICODE_STRING_SIMPLE("[:LineBreak=SA:]"), status); |
504 | Locale root(""); | |
b75a7d8f | 505 | |
73c04bcf | 506 | fBreakIter = BreakIterator::createWordInstance(root, status); |
b75a7d8f A |
507 | fBreakIter->adoptText(iter); |
508 | } | |
509 | ||
510 | SpaceBreakIterator::SpaceBreakIterator() | |
511 | { | |
512 | // nothing | |
513 | } | |
514 | ||
515 | /* | |
516 | * The destructor. delete the underlying BreakIterator | |
517 | */ | |
518 | SpaceBreakIterator::~SpaceBreakIterator() | |
519 | { | |
520 | delete fBreakIter; | |
521 | } | |
522 | ||
523 | /* | |
524 | * Return the next break, counting words and spaces. | |
525 | */ | |
526 | int32_t SpaceBreakIterator::next() | |
527 | { | |
528 | if (fDone) { | |
529 | return BreakIterator::DONE; | |
530 | } | |
531 | ||
73c04bcf A |
532 | int32_t nextBreak; |
533 | do { | |
534 | nextBreak = fBreakIter->next(); | |
535 | ||
536 | if (nextBreak == BreakIterator::DONE) { | |
537 | fDone = TRUE; | |
538 | return BreakIterator::DONE; | |
539 | } | |
b75a7d8f | 540 | } |
73c04bcf A |
541 | while(nextBreak > 0 && fComplexContext.contains(fText[nextBreak-1]) |
542 | && fComplexContext.contains(fText[nextBreak])); | |
b75a7d8f A |
543 | |
544 | int32_t result = nextBreak - fSpaceCount; | |
545 | ||
546 | if (nextBreak < fTextCount) { | |
547 | if (fText[nextBreak] == 0x0020 /*Unicode::isSpaceChar(fText[nextBreak])*/) { | |
548 | fSpaceCount += fBreakIter->next() - nextBreak; | |
549 | } | |
550 | } | |
551 | ||
552 | fWordCount += 1; | |
553 | ||
554 | return result; | |
555 | } | |
556 | ||
557 | /* | |
558 | * Returns the current space count | |
559 | */ | |
560 | int32_t SpaceBreakIterator::getSpaceCount() | |
561 | { | |
562 | return fSpaceCount; | |
563 | } | |
564 | ||
565 | /* | |
566 | * Returns the current word count | |
567 | */ | |
568 | int32_t SpaceBreakIterator::getWordCount() | |
569 | { | |
570 | return fWordCount; | |
571 | } | |
572 | ||
573 |