icuSources/test/thaitest/thaitest.cpp

   1 /*
   2  ******************************************************************************
   3  * Copyright (C) 1998-2003, 2006, International Business Machines Corporation *
   4  * and others. All Rights Reserved.                                           *
   5  ******************************************************************************
   6  */
   7
   8 #include <errno.h>
   9 #include <stdio.h>
  10 #include <string.h>
  11
  12 #include "unicode/utypes.h"
  13 #include "unicode/uchar.h"
  14 #include "unicode/uchriter.h"
  15 #include "unicode/brkiter.h"
  16 #include "unicode/locid.h"
  17 #include "unicode/unistr.h"
  18 #include "unicode/uniset.h"
  19 #include "unicode/ustring.h"
  20
  21 /*
  22  * This program takes a Unicode text file containing Thai text with
  23  * spaces inserted where the word breaks are. It computes a copy of
  24  * the text without spaces and uses a word instance of a Thai BreakIterator
  25  * to compute the word breaks. The program reports any differences in the
  26  * breaks.
  27  *
  28  * NOTE: by its very nature, Thai word breaking is not exact, so it is
  29  * expected that this program will always report some differences.
  30  */
  31
  32 /*
  33  * This class is a break iterator that counts words and spaces.
  34  */
  35 class SpaceBreakIterator
  36 {
  37 public:
  38     // The constructor:
  39     // text  - pointer to an array of UChars to iterate over
  40     // count - the number of UChars in text
  41     SpaceBreakIterator(const UChar *text, int32_t count);
  42
  43     // the destructor
  44     ~SpaceBreakIterator();
  45
  46     // return next break position
  47     int32_t next();
  48
  49     // return current word count
  50     int32_t getWordCount();
  51
  52     // return current space count
  53     int32_t getSpaceCount();
  54
  55 private:
  56     // No arg constructor: private so clients can't call it.
  57     SpaceBreakIterator();
  58
  59     // The underlying BreakIterator
  60     BreakIterator *fBreakIter;
  61
  62     // address of the UChar array
  63     const UChar *fText;
  64
  65     // number of UChars in fText
  66     int32_t fTextCount;
  67
  68     // current word count
  69     int32_t fWordCount;
  70
  71     // current space count
  72     int32_t fSpaceCount;
  73
  74     // UnicodeSet of SA characters
  75     UnicodeSet fComplexContext;
  76
  77     // true when fBreakIter has returned DONE
  78     UBool fDone;
  79 };
  80
  81 /*
  82  * This is the main class. It compares word breaks and reports the differences.
  83  */
  84 class ThaiWordbreakTest
  85 {
  86 public:
  87     // The main constructor:
  88     // spaces       - pointer to a UChar array for the text with spaces
  89     // spaceCount   - the number of characters in the spaces array
  90     // noSpaces     - pointer to a UChar array for the text without spaces
  91     // noSpaceCount - the number of characters in the noSpaces array
  92     // verbose      - report all breaks if true, otherwise just report differences
  93     ThaiWordbreakTest(const UChar *spaces, int32_t spaceCount, const UChar *noSpaces, int32_t noSpaceCount, UBool verbose);
  94     ~ThaiWordbreakTest();
  95
  96     // returns the number of breaks that are in the spaces array
  97     // but aren't found in the noSpaces array
  98     int32_t getBreaksNotFound();
  99
 100     // returns the number of breaks which are found in the noSpaces
 101     // array but aren't in the spaces array
 102     int32_t getInvalidBreaks();
 103
 104     // returns the number of words found in the spaces array
 105     int32_t getWordCount();
 106
 107     // reads the input Unicode text file:
 108     // fileName  - the path name of the file
 109     // charCount - set to the number of UChars read from the file
 110     // returns   - the address of the UChar array containing the characters
 111     static const UChar *readFile(char *fileName, int32_t &charCount);
 112
 113     // removes spaces form the input UChar array:
 114     // spaces        - pointer to the input UChar array
 115     // count         - number of UChars in the spaces array
 116     // nonSpaceCount - the number of UChars in the result array
 117     // returns       - the address of the UChar array with spaces removed
 118     static const UChar *crunchSpaces(const UChar *spaces, int32_t count, int32_t &nonSpaceCount);
 119
 120 private:
 121     // The no arg constructor - private so clients can't call it
 122     ThaiWordbreakTest();
 123
 124     // This does the actual comparison:
 125     // spaces - the address of the UChar array for the text with spaces
 126     // spaceCount - the number of UChars in the spaces array
 127     // noSpaces   - the address of the UChar array for the text without spaces
 128     // noSpaceCount - the number of UChars in the noSpaces array
 129     // returns      - true if all breaks match, FALSE otherwise
 130     UBool compareWordBreaks(const UChar *spaces, int32_t spaceCount,
 131                             const UChar *noSpaces, int32_t noSpaceCount);
 132
 133     // helper method to report a break in the spaces
 134     // array that's not found in the noSpaces array
 135     void breakNotFound(int32_t br);
 136
 137     // helper method to report a break that's found in
 138     // the noSpaces array that's not in the spaces array
 139     void foundInvalidBreak(int32_t br);
 140
 141     // count of breaks in the spaces array that
 142     // aren't found in the noSpaces array
 143     int32_t fBreaksNotFound;
 144
 145     // count of breaks found in the noSpaces array
 146     // that aren't in the spaces array
 147     int32_t fInvalidBreaks;
 148
 149     // number of words found in the spaces array
 150     int32_t fWordCount;
 151
 152     // report all breaks if true, otherwise just report differences
 153     UBool fVerbose;
 154 };
 155
 156 /*
 157  * The main constructor: it calls compareWordBreaks and reports any differences
 158  */
 159 ThaiWordbreakTest::ThaiWordbreakTest(const UChar *spaces, int32_t spaceCount,
 160                                      const UChar *noSpaces, int32_t noSpaceCount, UBool verbose)
 161 : fBreaksNotFound(0), fInvalidBreaks(0), fWordCount(0), fVerbose(verbose)
 162 {
 163     compareWordBreaks(spaces, spaceCount, noSpaces, noSpaceCount);
 164 }
 165
 166 /*
 167  * The no arg constructor
 168  */
 169 ThaiWordbreakTest::ThaiWordbreakTest()
 170 {
 171     // nothing
 172 }
 173
 174 /*
 175  * The destructor
 176  */
 177 ThaiWordbreakTest::~ThaiWordbreakTest()
 178 {
 179     // nothing?
 180 }
 181
 182 /*
 183  * returns the number of breaks in the spaces array
 184  * that aren't found in the noSpaces array
 185  */
 186 inline int32_t ThaiWordbreakTest::getBreaksNotFound()
 187 {
 188     return fBreaksNotFound;
 189 }
 190
 191 /*
 192  * Returns the number of breaks found in the noSpaces
 193  * array that aren't in the spaces array
 194  */
 195 inline int32_t ThaiWordbreakTest::getInvalidBreaks()
 196 {
 197     return fInvalidBreaks;
 198 }
 199
 200 /*
 201  * Returns the number of words found in the spaces array
 202  */
 203 inline int32_t ThaiWordbreakTest::getWordCount()
 204 {
 205     return fWordCount;
 206 }
 207
 208 /*
 209  * This method does the acutal break comparison and reports the results.
 210  * It uses a SpaceBreakIterator to iterate over the text with spaces,
 211  * and a word instance of a Thai BreakIterator to iterate over the text
 212  * without spaces.
 213  */
 214 UBool ThaiWordbreakTest::compareWordBreaks(const UChar *spaces, int32_t spaceCount,
 215                                            const UChar *noSpaces, int32_t noSpaceCount)
 216 {
 217     UBool result = TRUE;
 218     Locale thai("th");
 219     UCharCharacterIterator *noSpaceIter = new UCharCharacterIterator(noSpaces, noSpaceCount);
 220     UErrorCode status = U_ZERO_ERROR;
 221
 222     BreakIterator *breakIter = BreakIterator::createWordInstance(thai, status);
 223     breakIter->adoptText(noSpaceIter);
 224
 225     SpaceBreakIterator spaceIter(spaces, spaceCount);
 226
 227     int32_t nextBreak = 0;
 228     int32_t nextSpaceBreak = 0;
 229     int32_t iterCount = 0;
 230
 231     while (TRUE) {
 232         nextSpaceBreak = spaceIter.next();
 233         nextBreak = breakIter->next();
 234
 235         if (nextSpaceBreak == BreakIterator::DONE || nextBreak == BreakIterator::DONE) {
 236             if (nextBreak != BreakIterator::DONE) {
 237                 fprintf(stderr, "break iterator didn't end.\n");
 238             } else if (nextSpaceBreak != BreakIterator::DONE) {
 239                 fprintf(stderr, "premature break iterator end.\n");
 240             }
 241
 242             break;
 243         }
 244
 245         while (nextSpaceBreak != nextBreak &&
 246                nextSpaceBreak != BreakIterator::DONE && nextBreak != BreakIterator::DONE) {
 247             if (nextSpaceBreak < nextBreak) {
 248                 breakNotFound(nextSpaceBreak);
 249                 result = FALSE;
 250                 nextSpaceBreak = spaceIter.next();
 251             } else if (nextSpaceBreak > nextBreak) {
 252                 foundInvalidBreak(nextBreak);
 253                 result = FALSE;
 254                 nextBreak = breakIter->next();
 255             }
 256         }
 257
 258         if (fVerbose) {
 259             printf("%d   %d\n", nextSpaceBreak, nextBreak);
 260         }
 261     }
 262
 263
 264     fWordCount = spaceIter.getWordCount();
 265
 266     delete breakIter;
 267
 268     return result;
 269 }
 270
 271 /*
 272  * Report a break that's in the text with spaces but
 273  * not found in the text without spaces.
 274  */
 275 void ThaiWordbreakTest::breakNotFound(int32_t br)
 276 {
 277     if (fVerbose) {
 278         printf("%d   ****\n", br);
 279     } else {
 280         fprintf(stderr, "break not found: %d\n", br);
 281     }
 282
 283     fBreaksNotFound += 1;
 284 }
 285
 286 /*
 287  * Report a break that's found in the text without spaces
 288  * that isn't in the text with spaces.
 289  */
 290 void ThaiWordbreakTest::foundInvalidBreak(int32_t br)
 291 {
 292     if (fVerbose) {
 293         printf("****   %d\n", br);
 294     } else {
 295         fprintf(stderr, "found invalid break: %d\n", br);
 296     }
 297
 298     fInvalidBreaks += 1;
 299 }
 300
 301 /*
 302  * Read the text from a file. The text must start with a Unicode Byte
 303  * Order Mark (BOM) so that we know what order to read the bytes in.
 304  */
 305 const UChar *ThaiWordbreakTest::readFile(char *fileName, int32_t &charCount)
 306 {
 307     FILE *f;
 308     int32_t fileSize;
 309
 310     UChar *buffer;
 311     char *bufferChars;
 312
 313     f = fopen(fileName, "rb");
 314
 315     if( f == NULL ) {
 316         fprintf(stderr,"Couldn't open %s reason: %s \n", fileName, strerror(errno));
 317         return 0;
 318     }
 319
 320     fseek(f, 0, SEEK_END);
 321     fileSize = ftell(f);
 322
 323     fseek(f, 0, SEEK_SET);
 324     bufferChars = new char[fileSize];
 325
 326     if(bufferChars == 0) {
 327         fprintf(stderr,"Couldn't get memory for reading %s reason: %s \n", fileName, strerror(errno));
 328         fclose(f);
 329         return 0;
 330     }
 331
 332     fread(bufferChars, sizeof(char), fileSize, f);
 333     if( ferror(f) ) {
 334         fprintf(stderr,"Couldn't read %s reason: %s \n", fileName, strerror(errno));
 335         fclose(f);
 336         delete[] bufferChars;
 337         return 0;
 338     }
 339     fclose(f);
 340
 341     UnicodeString myText(bufferChars, fileSize, "UTF-8");
 342
 343     delete[] bufferChars;
 344
 345     charCount = myText.length();
 346     buffer = new UChar[charCount];
 347     if(buffer == 0) {
 348         fprintf(stderr,"Couldn't get memory for reading %s reason: %s \n", fileName, strerror(errno));
 349         return 0;
 350     }
 351
 352     myText.extract(1, myText.length(), buffer);
 353     charCount--;  // skip the BOM
 354     buffer[charCount] = 0;    // NULL terminate for easier reading in the debugger
 355
 356     return buffer;
 357 }
 358
 359 /*
 360  * Remove spaces from the input UChar array.
 361  *
 362  * We check explicitly for a Unicode code value of 0x0020
 363  * because Unicode::isSpaceChar returns true for CR, LF, etc.
 364  *
 365  */
 366 const UChar *ThaiWordbreakTest::crunchSpaces(const UChar *spaces, int32_t count, int32_t &nonSpaceCount)
 367 {
 368     int32_t i, out, spaceCount;
 369
 370     spaceCount = 0;
 371     for (i = 0; i < count; i += 1) {
 372         if (spaces[i] == 0x0020 /*Unicode::isSpaceChar(spaces[i])*/) {
 373             spaceCount += 1;
 374         }
 375     }
 376
 377     nonSpaceCount = count - spaceCount;
 378     UChar *noSpaces = new UChar[nonSpaceCount];
 379
 380     if (noSpaces == 0) {
 381         fprintf(stderr, "Couldn't allocate memory for the space stripped text.\n");
 382         return 0;
 383     }
 384
 385     for (out = 0, i = 0; i < count; i += 1) {
 386         if (spaces[i] != 0x0020 /*! Unicode::isSpaceChar(spaces[i])*/) {
 387             noSpaces[out++] = spaces[i];
 388         }
 389     }
 390
 391     return noSpaces;
 392 }
 393
 394 /*
 395  * Generate a text file with spaces in it from a file without.
 396  */
 397 int generateFile(const UChar *chars, int32_t length) {
 398     Locale root("");
 399     UCharCharacterIterator *noSpaceIter = new UCharCharacterIterator(chars, length);
 400     UErrorCode status = U_ZERO_ERROR;
 401
 402     UnicodeSet complexContext(UNICODE_STRING_SIMPLE("[:LineBreak=SA:]"), status);
 403     BreakIterator *breakIter = BreakIterator::createWordInstance(root, status);
 404     breakIter->adoptText(noSpaceIter);
 405     char outbuf[1024];
 406     int32_t strlength;
 407     UChar bom = 0xFEFF;
 408
 409     printf("%s", u_strToUTF8(outbuf, sizeof(outbuf), &strlength, &bom, 1, &status));
 410     int32_t prevbreak = 0;
 411     while (U_SUCCESS(status)) {
 412         int32_t nextbreak = breakIter->next();
 413         if (nextbreak == BreakIterator::DONE) {
 414             break;
 415         }
 416         printf("%s", u_strToUTF8(outbuf, sizeof(outbuf), &strlength, &chars[prevbreak],
 417                                     nextbreak-prevbreak, &status));
 418         if (nextbreak > 0 && complexContext.contains(chars[nextbreak-1])
 419             && complexContext.contains(chars[nextbreak])) {
 420             printf(" ");
 421         }
 422         prevbreak = nextbreak;
 423     }
 424
 425     if (U_FAILURE(status)) {
 426         fprintf(stderr, "generate failed: %s\n", u_errorName(status));
 427         return status;
 428     }
 429     else {
 430         return 0;
 431     }
 432 }
 433
 434 /*
 435  * The main routine. Read the command line arguments, read the text file,
 436  * remove the spaces, do the comparison and report the final results
 437  */
 438 int main(int argc, char **argv)
 439 {
 440     char *fileName = "space.txt";
 441     int arg = 1;
 442     UBool verbose = FALSE;
 443     UBool generate = FALSE;
 444
 445     if (argc >= 2 && strcmp(argv[1], "-generate") == 0) {
 446         generate = TRUE;
 447         arg += 1;
 448     }
 449
 450     if (argc >= 2 && strcmp(argv[1], "-verbose") == 0) {
 451         verbose = TRUE;
 452         arg += 1;
 453     }
 454
 455     if (arg == argc - 1) {
 456         fileName = argv[arg++];
 457     }
 458
 459     if (arg != argc) {
 460         fprintf(stderr, "Usage: %s [-verbose] [<file>]\n", argv[0]);
 461         return 1;
 462     }
 463
 464     int32_t spaceCount, nonSpaceCount;
 465     const UChar *spaces, *noSpaces;
 466
 467     spaces = ThaiWordbreakTest::readFile(fileName, spaceCount);
 468
 469     if (spaces == 0) {
 470         return 1;
 471     }
 472
 473     if (generate) {
 474         return generateFile(spaces, spaceCount);
 475     }
 476
 477     noSpaces = ThaiWordbreakTest::crunchSpaces(spaces, spaceCount, nonSpaceCount);
 478
 479     if (noSpaces == 0) {
 480         return 1;
 481     }
 482
 483     ThaiWordbreakTest test(spaces, spaceCount, noSpaces, nonSpaceCount, verbose);
 484
 485     printf("word count: %d\n", test.getWordCount());
 486     printf("breaks not found: %d\n", test.getBreaksNotFound());
 487     printf("invalid breaks found: %d\n", test.getInvalidBreaks());
 488
 489     return 0;
 490 }
 491
 492 /*
 493  * The main constructor. Clear all the counts and construct a default
 494  * word instance of a BreakIterator.
 495  */
 496 SpaceBreakIterator::SpaceBreakIterator(const UChar *text, int32_t count)
 497   : fBreakIter(0), fText(text), fTextCount(count), fWordCount(0), fSpaceCount(0), fDone(FALSE)
 498 {
 499     UCharCharacterIterator *iter = new UCharCharacterIterator(text, count);
 500     UErrorCode status = U_ZERO_ERROR;
 501     fComplexContext.applyPattern(UNICODE_STRING_SIMPLE("[:LineBreak=SA:]"), status);
 502     Locale root("");
 503
 504     fBreakIter = BreakIterator::createWordInstance(root, status);
 505     fBreakIter->adoptText(iter);
 506 }
 507
 508 SpaceBreakIterator::SpaceBreakIterator()
 509 {
 510     // nothing
 511 }
 512
 513 /*
 514  * The destructor. delete the underlying BreakIterator
 515  */
 516 SpaceBreakIterator::~SpaceBreakIterator()
 517 {
 518     delete fBreakIter;
 519 }
 520
 521 /*
 522  * Return the next break, counting words and spaces.
 523  */
 524 int32_t SpaceBreakIterator::next()
 525 {
 526     if (fDone) {
 527         return BreakIterator::DONE;
 528     }
 529
 530     int32_t nextBreak;
 531     do {
 532         nextBreak = fBreakIter->next();
 533
 534         if (nextBreak == BreakIterator::DONE) {
 535             fDone = TRUE;
 536             return BreakIterator::DONE;
 537         }
 538     }
 539     while(nextBreak > 0 && fComplexContext.contains(fText[nextBreak-1])
 540             && fComplexContext.contains(fText[nextBreak]));
 541
 542    int32_t result = nextBreak - fSpaceCount;
 543
 544     if (nextBreak < fTextCount) {
 545         if (fText[nextBreak] == 0x0020 /*Unicode::isSpaceChar(fText[nextBreak])*/) {
 546             fSpaceCount += fBreakIter->next() - nextBreak;
 547         }
 548     }
 549
 550     fWordCount += 1;
 551
 552     return result;
 553 }
 554
 555 /*
 556  * Returns the current space count
 557  */
 558 int32_t SpaceBreakIterator::getSpaceCount()
 559 {
 560     return fSpaceCount;
 561 }
 562
 563 /*
 564  * Returns the current word count
 565  */
 566 int32_t SpaceBreakIterator::getWordCount()
 567 {
 568     return fWordCount;
 569 }
 570
 571