icuSources/test/thaitest/thaitest.cpp

   1 /*
   2  ******************************************************************************
   3  * Copyright (C) 1998-2003, International Business Machines Corporation and   *
   4  * others. All Rights Reserved.                                               *
   5  ******************************************************************************
   6  */
   7
   8 #include <errno.h>
   9 #include <stdio.h>
  10 #include <string.h>
  11
  12 #include "unicode/utypes.h"
  13 #include "unicode/uchar.h"
  14 #include "unicode/uchriter.h"
  15 #include "unicode/brkiter.h"
  16 #include "unicode/locid.h"
  17 #include "unicode/unistr.h"
  18
  19 /*
  20  * This program takes a Unicode text file containing Thai text with
  21  * spaces inserted where the word breaks are. It computes a copy of
  22  * the text without spaces and uses a word instance of a Thai BreakIterator
  23  * to compute the word breaks. The program reports any differences in the
  24  * breaks.
  25  *
 * NOTE: by its very nature, Thai word breaking is not exact, so it is
 * expected that this program will always report some differences.
  28  */
  29
  30 /*
  31  * This class is a break iterator that counts words and spaces.
  32  */
  33 class SpaceBreakIterator
  34 {
  35 public:
  36     // The constructor:
  37     // text  - pointer to an array of UChars to iterate over
  38     // count - the number of UChars in text
  39     SpaceBreakIterator(const UChar *text, int32_t count);
  40
  41     // the destructor
  42     ~SpaceBreakIterator();
  43
  44     // return next break position
  45     int32_t next();
  46
  47     // return current word count
  48     int32_t getWordCount();
  49
  50     // return current space count
  51     int32_t getSpaceCount();
  52
  53 private:
  54     // No arg constructor: private so clients can't call it.
  55     SpaceBreakIterator();
  56
  57     // The underlying BreakIterator
  58     BreakIterator *fBreakIter;
  59
  60     // address of the UChar array
  61     const UChar *fText;
  62
  63     // number of UChars in fText
  64     int32_t fTextCount;
  65
  66     // current word count
  67     int32_t fWordCount;
  68
  69     // current space count
  70     int32_t fSpaceCount;
  71
  72     // true when fBreakIter has returned DONE
  73     UBool fDone;
  74 };
  75
  76 /*
  77  * This is the main class. It compares word breaks and reports the differences.
  78  */
  79 class ThaiWordbreakTest
  80 {
  81 public:
  82     // The main constructor:
  83     // spaces       - pointer to a UChar array for the text with spaces
  84     // spaceCount   - the number of characters in the spaces array
  85     // noSpaces     - pointer to a UChar array for the text without spaces
  86     // noSpaceCount - the number of characters in the noSpaces array
  87     // verbose      - report all breaks if true, otherwise just report differences
  88     ThaiWordbreakTest(const UChar *spaces, int32_t spaceCount, const UChar *noSpaces, int32_t noSpaceCount, UBool verbose);
  89     ~ThaiWordbreakTest();
  90
  91     // returns the number of breaks that are in the spaces array
  92     // but aren't found in the noSpaces array
  93     int32_t getBreaksNotFound();
  94
  95     // returns the number of breaks which are found in the noSpaces
  96     // array but aren't in the spaces array
  97     int32_t getInvalidBreaks();
  98
  99     // returns the number of words found in the spaces array
 100     int32_t getWordCount();
 101
 102     // reads the input Unicode text file:
 103     // fileName  - the path name of the file
 104     // charCount - set to the number of UChars read from the file
 105     // returns   - the address of the UChar array containing the characters
 106     static const UChar *readFile(char *fileName, int32_t &charCount);
 107
 108     // removes spaces form the input UChar array:
 109     // spaces        - pointer to the input UChar array
 110     // count         - number of UChars in the spaces array
 111     // nonSpaceCount - the number of UChars in the result array
 112     // returns       - the address of the UChar array with spaces removed
 113     static const UChar *crunchSpaces(const UChar *spaces, int32_t count, int32_t &nonSpaceCount);
 114
 115 private:
 116     // The no arg constructor - private so clients can't call it
 117     ThaiWordbreakTest();
 118
 119     // This does the actual comparison:
 120     // spaces - the address of the UChar array for the text with spaces
 121     // spaceCount - the number of UChars in the spaces array
 122     // noSpaces   - the address of the UChar array for the text without spaces
 123     // noSpaceCount - the number of UChars in the noSpaces array
 124     // returns      - true if all breaks match, FALSE otherwise
 125     UBool compareWordBreaks(const UChar *spaces, int32_t spaceCount,
 126                             const UChar *noSpaces, int32_t noSpaceCount);
 127
 128     // helper method to report a break in the spaces
 129     // array that's not found in the noSpaces array
 130     void breakNotFound(int32_t br);
 131
 132     // helper method to report a break that's found in
 133     // the noSpaces array that's not in the spaces array
 134     void foundInvalidBreak(int32_t br);
 135
 136     // count of breaks in the spaces array that
 137     // aren't found in the noSpaces array
 138     int32_t fBreaksNotFound;
 139
 140     // count of breaks found in the noSpaces array
 141     // that aren't in the spaces array
 142     int32_t fInvalidBreaks;
 143
 144     // number of words found in the spaces array
 145     int32_t fWordCount;
 146
 147     // report all breaks if true, otherwise just report differences
 148     UBool fVerbose;
 149 };
 150
 151 /*
 152  * The main constructor: it calls compareWordBreaks and reports any differences
 153  */
 154 ThaiWordbreakTest::ThaiWordbreakTest(const UChar *spaces, int32_t spaceCount,
 155                                      const UChar *noSpaces, int32_t noSpaceCount, UBool verbose)
 156 : fBreaksNotFound(0), fInvalidBreaks(0), fWordCount(0), fVerbose(verbose)
 157 {
 158     compareWordBreaks(spaces, spaceCount, noSpaces, noSpaceCount);
 159 }
 160
 161 /*
 162  * The no arg constructor
 163  */
 164 ThaiWordbreakTest::ThaiWordbreakTest()
 165 {
 166     // nothing
 167 }
 168
 169 /*
 170  * The destructor
 171  */
 172 ThaiWordbreakTest::~ThaiWordbreakTest()
 173 {
 174     // nothing?
 175 }
 176
 177 /*
 178  * returns the number of breaks in the spaces array
 179  * that aren't found in the noSpaces array
 180  */
 181 inline int32_t ThaiWordbreakTest::getBreaksNotFound()
 182 {
 183     return fBreaksNotFound;
 184 }
 185
 186 /*
 187  * Returns the number of breaks found in the noSpaces
 188  * array that aren't in the spaces array
 189  */
 190 inline int32_t ThaiWordbreakTest::getInvalidBreaks()
 191 {
 192     return fInvalidBreaks;
 193 }
 194
 195 /*
 196  * Returns the number of words found in the spaces array
 197  */
 198 inline int32_t ThaiWordbreakTest::getWordCount()
 199 {
 200     return fWordCount;
 201 }
 202
 203 /*
 204  * This method does the acutal break comparison and reports the results.
 205  * It uses a SpaceBreakIterator to iterate over the text with spaces,
 206  * and a word instance of a Thai BreakIterator to iterate over the text
 207  * without spaces.
 208  */
 209 UBool ThaiWordbreakTest::compareWordBreaks(const UChar *spaces, int32_t spaceCount,
 210                                            const UChar *noSpaces, int32_t noSpaceCount)
 211 {
 212     UBool result = TRUE;
 213     Locale thai("th");
 214     UCharCharacterIterator *noSpaceIter = new UCharCharacterIterator(noSpaces, noSpaceCount);
 215     UErrorCode status = U_ZERO_ERROR;
 216
 217     BreakIterator *breakIter = BreakIterator::createWordInstance(thai, status);
 218     breakIter->adoptText(noSpaceIter);
 219
 220     SpaceBreakIterator spaceIter(spaces, spaceCount);
 221
 222     int32_t nextBreak = 0;
 223     int32_t nextSpaceBreak = 0;
 224     int32_t iterCount = 0;
 225
 226     while (TRUE) {
 227         nextSpaceBreak = spaceIter.next();
 228         nextBreak = breakIter->next();
 229
 230         if (nextSpaceBreak == BreakIterator::DONE || nextBreak == BreakIterator::DONE) {
 231             if (nextBreak != BreakIterator::DONE) {
 232                 fprintf(stderr, "break iterator didn't end.\n");
 233             } else if (nextSpaceBreak != BreakIterator::DONE) {
 234                 fprintf(stderr, "premature break iterator end.\n");
 235             }
 236
 237             break;
 238         }
 239
 240         while (nextSpaceBreak != nextBreak &&
 241                nextSpaceBreak != BreakIterator::DONE && nextBreak != BreakIterator::DONE) {
 242             if (nextSpaceBreak < nextBreak) {
 243                 breakNotFound(nextSpaceBreak);
 244                 result = FALSE;
 245                 nextSpaceBreak = spaceIter.next();
 246             } else if (nextSpaceBreak > nextBreak) {
 247                 foundInvalidBreak(nextBreak);
 248                 result = FALSE;
 249                 nextBreak = breakIter->next();
 250             }
 251         }
 252
 253         if (fVerbose) {
 254             printf("%d   %d\n", nextSpaceBreak, nextBreak);
 255         }
 256     }
 257
 258
 259     fWordCount = spaceIter.getWordCount();
 260
 261     delete breakIter;
 262
 263     return result;
 264 }
 265
 266 /*
 267  * Report a break that's in the text with spaces but
 268  * not found in the text without spaces.
 269  */
 270 void ThaiWordbreakTest::breakNotFound(int32_t br)
 271 {
 272     if (fVerbose) {
 273         printf("%d   ****\n", br);
 274     } else {
 275         fprintf(stderr, "break not found: %d\n", br);
 276     }
 277
 278     fBreaksNotFound += 1;
 279 }
 280
 281 /*
 282  * Report a break that's found in the text without spaces
 283  * that isn't in the text with spaces.
 284  */
 285 void ThaiWordbreakTest::foundInvalidBreak(int32_t br)
 286 {
 287     if (fVerbose) {
 288         printf("****   %d\n", br);
 289     } else {
 290         fprintf(stderr, "found invalid break: %d\n", br);
 291     }
 292
 293     fInvalidBreaks += 1;
 294 }
 295
 296 /*
 297  * Read the text from a file. The text must start with a Unicode Byte
 298  * Order Mark (BOM) so that we know what order to read the bytes in.
 299  */
 300 const UChar *ThaiWordbreakTest::readFile(char *fileName, int32_t &charCount)
 301 {
 302     FILE *f;
 303     int32_t fileSize;
 304
 305     UChar *buffer;
 306     char *bufferChars;
 307
 308     f = fopen(fileName, "rb");
 309
 310     if( f == NULL ) {
 311         fprintf(stderr,"Couldn't open %s reason: %s \n", fileName, strerror(errno));
 312         return 0;
 313     }
 314
 315     fseek(f, 0, SEEK_END);
 316     fileSize = ftell(f);
 317
 318     fseek(f, 0, SEEK_SET);
 319     bufferChars = new char[fileSize];
 320
 321     if(bufferChars == 0) {
 322         fprintf(stderr,"Couldn't get memory for reading %s reason: %s \n", fileName, strerror(errno));
 323         fclose(f);
 324         return 0;
 325     }
 326
 327     fread(bufferChars, sizeof(char), fileSize, f);
 328     if( ferror(f) ) {
 329         fprintf(stderr,"Couldn't read %s reason: %s \n", fileName, strerror(errno));
 330         fclose(f);
 331         delete[] bufferChars;
 332         return 0;
 333     }
 334     fclose(f);
 335
 336     UnicodeString myText(bufferChars, fileSize, "UTF-8");
 337
 338     delete[] bufferChars;
 339
 340     charCount = myText.length();
 341     buffer = new UChar[charCount];
 342     if(buffer == 0) {
 343         fprintf(stderr,"Couldn't get memory for reading %s reason: %s \n", fileName, strerror(errno));
 344         return 0;
 345     }
 346
 347     myText.extract(1, myText.length(), buffer);
 348     charCount--;  // skip the BOM
 349     buffer[charCount] = 0;    // NULL terminate for easier reading in the debugger
 350
 351     return buffer;
 352 }
 353
 354 /*
 355  * Remove spaces from the input UChar array.
 356  *
 357  * We check explicitly for a Unicode code value of 0x0020
 358  * because Unicode::isSpaceChar returns true for CR, LF, etc.
 359  *
 360  */
 361 const UChar *ThaiWordbreakTest::crunchSpaces(const UChar *spaces, int32_t count, int32_t &nonSpaceCount)
 362 {
 363     int32_t i, out, spaceCount;
 364
 365     spaceCount = 0;
 366     for (i = 0; i < count; i += 1) {
 367         if (spaces[i] == 0x0020 /*Unicode::isSpaceChar(spaces[i])*/) {
 368             spaceCount += 1;
 369         }
 370     }
 371
 372     nonSpaceCount = count - spaceCount;
 373     UChar *noSpaces = new UChar[nonSpaceCount];
 374
 375     if (noSpaces == 0) {
 376         fprintf(stderr, "Couldn't allocate memory for the space stripped text.\n");
 377         return 0;
 378     }
 379
 380     for (out = 0, i = 0; i < count; i += 1) {
 381         if (spaces[i] != 0x0020 /*! Unicode::isSpaceChar(spaces[i])*/) {
 382             noSpaces[out++] = spaces[i];
 383         }
 384     }
 385
 386     return noSpaces;
 387 }
 388
 389 /*
 390  * The main routine. Read the command line arguments, read the text file,
 391  * remove the spaces, do the comparison and report the final results
 392  */
 393 int main(int argc, char **argv)
 394 {
 395     char *fileName = "space.txt";
 396     int arg = 1;
 397     UBool verbose = FALSE;
 398
 399     if (argc >= 2 && strcmp(argv[1], "-verbose") == 0) {
 400         verbose = TRUE;
 401         arg += 1;
 402     }
 403
 404     if (arg == argc - 1) {
 405         fileName = argv[arg++];
 406     }
 407
 408     if (arg != argc) {
 409         fprintf(stderr, "Usage: %s [-verbose] [<file>]\n", argv[0]);
 410         return 1;
 411     }
 412
 413     int32_t spaceCount, nonSpaceCount;
 414     const UChar *spaces, *noSpaces;
 415
 416     spaces = ThaiWordbreakTest::readFile(fileName, spaceCount);
 417
 418     if (spaces == 0) {
 419         return 1;
 420     }
 421
 422     noSpaces = ThaiWordbreakTest::crunchSpaces(spaces, spaceCount, nonSpaceCount);
 423
 424     if (noSpaces == 0) {
 425         return 1;
 426     }
 427
 428     ThaiWordbreakTest test(spaces, spaceCount, noSpaces, nonSpaceCount, verbose);
 429
 430     printf("word count: %d\n", test.getWordCount());
 431     printf("breaks not found: %d\n", test.getBreaksNotFound());
 432     printf("invalid breaks found: %d\n", test.getInvalidBreaks());
 433
 434     return 0;
 435 }
 436
 437 /*
 438  * The main constructor. Clear all the counts and construct a default
 439  * word instance of a BreakIterator.
 440  */
 441 SpaceBreakIterator::SpaceBreakIterator(const UChar *text, int32_t count)
 442   : fBreakIter(0), fText(text), fTextCount(count), fWordCount(0), fSpaceCount(0), fDone(FALSE)
 443 {
 444     UCharCharacterIterator *iter = new UCharCharacterIterator(text, count);
 445     UErrorCode status = U_ZERO_ERROR;
 446     Locale us("us");
 447
 448     fBreakIter = BreakIterator::createWordInstance(us, status);
 449     fBreakIter->adoptText(iter);
 450 }
 451
 452 SpaceBreakIterator::SpaceBreakIterator()
 453 {
 454     // nothing
 455 }
 456
 457 /*
 458  * The destructor. delete the underlying BreakIterator
 459  */
 460 SpaceBreakIterator::~SpaceBreakIterator()
 461 {
 462     delete fBreakIter;
 463 }
 464
 465 /*
 466  * Return the next break, counting words and spaces.
 467  */
 468 int32_t SpaceBreakIterator::next()
 469 {
 470     if (fDone) {
 471         return BreakIterator::DONE;
 472     }
 473
 474     int32_t nextBreak = fBreakIter->next();
 475
 476     if (nextBreak == BreakIterator::DONE) {
 477         fDone = TRUE;
 478         return BreakIterator::DONE;
 479     }
 480
 481    int32_t result = nextBreak - fSpaceCount;
 482
 483     if (nextBreak < fTextCount) {
 484         if (fText[nextBreak] == 0x0020 /*Unicode::isSpaceChar(fText[nextBreak])*/) {
 485             fSpaceCount += fBreakIter->next() - nextBreak;
 486         }
 487     }
 488
 489     fWordCount += 1;
 490
 491     return result;
 492 }
 493
 494 /*
 495  * Returns the current space count
 496  */
 497 int32_t SpaceBreakIterator::getSpaceCount()
 498 {
 499     return fSpaceCount;
 500 }
 501
 502 /*
 503  * Returns the current word count
 504  */
 505 int32_t SpaceBreakIterator::getWordCount()
 506 {
 507     return fWordCount;
 508 }
 509
 510