icuSources/common/dbbi.cpp

   1 /*
   2 **********************************************************************
   3 *   Copyright (C) 1999-2004 IBM Corp. All rights reserved.
   4 **********************************************************************
   5 *   Date        Name        Description
   6 *   12/1/99    rgillam     Complete port from Java.
   7 *   01/13/2000 helena      Added UErrorCode to ctors.
   8 **********************************************************************
   9 */
  10
  11 #include "unicode/utypes.h"
  12
  13 #if !UCONFIG_NO_BREAK_ITERATION
  14
  15 #include "unicode/dbbi.h"
  16 #include "unicode/schriter.h"
  17 #include "dbbi_tbl.h"
  18 #include "uvector.h"
  19 #include "cmemory.h"
  20 #include "uassert.h"
  21
  22 U_NAMESPACE_BEGIN
  23
  24 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(DictionaryBasedBreakIterator)
  25
  26
  27 //------------------------------------------------------------------------------
  28 //
  29 // constructors
  30 //
  31 //------------------------------------------------------------------------------
  32
  33 DictionaryBasedBreakIterator::DictionaryBasedBreakIterator() :
  34 RuleBasedBreakIterator() {
  35     init();
  36 }
  37
  38
  39 DictionaryBasedBreakIterator::DictionaryBasedBreakIterator(UDataMemory* rbbiData,
  40                                                            const char* dictionaryFilename,
  41                                                            UErrorCode& status)
  42 : RuleBasedBreakIterator(rbbiData, status)
  43 {
  44     init();
  45     if (U_FAILURE(status)) {return;};
  46     fTables = new DictionaryBasedBreakIteratorTables(dictionaryFilename, status);
  47     if (U_FAILURE(status)) {
  48         if (fTables != NULL) {
  49             fTables->removeReference();
  50             fTables = NULL;
  51         }
  52         return;
  53     }
  54     /* test for NULL */
  55     if(fTables == 0) {
  56         status = U_MEMORY_ALLOCATION_ERROR;
  57         return;
  58     }
  59 }
  60
  61
  62 DictionaryBasedBreakIterator::DictionaryBasedBreakIterator(const DictionaryBasedBreakIterator &other) :
  63 RuleBasedBreakIterator(other)
  64 {
  65     init();
  66     if (other.fTables != NULL) {
  67         fTables = other.fTables;
  68         fTables->addReference();
  69     }
  70 }
  71
  72
  73
  74
  75 //------------------------------------------------------------------------------
  76 //
  77 //   Destructor
  78 //
  79 //------------------------------------------------------------------------------
  80 DictionaryBasedBreakIterator::~DictionaryBasedBreakIterator()
  81 {
  82     uprv_free(cachedBreakPositions);
  83     cachedBreakPositions = NULL;
  84     if (fTables != NULL) {fTables->removeReference();};
  85 }
  86
  87 //------------------------------------------------------------------------------
  88 //
  89 //   Assignment operator.     Sets this iterator to have the same behavior,
  90 //                            and iterate over the same text, as the one passed in.
  91 //
  92 //------------------------------------------------------------------------------
  93 DictionaryBasedBreakIterator&
  94 DictionaryBasedBreakIterator::operator=(const DictionaryBasedBreakIterator& that) {
  95     if (this == &that) {
  96         return *this;
  97     }
  98     reset();      // clears out cached break positions.
  99     RuleBasedBreakIterator::operator=(that);
 100     if (this->fTables != that.fTables) {
 101         if (this->fTables != NULL) {this->fTables->removeReference();};
 102         this->fTables = that.fTables;
 103         if (this->fTables != NULL) {this->fTables->addReference();};
 104     }
 105     return *this;
 106 }
 107
 108 //------------------------------------------------------------------------------
 109 //
 110 //   Clone()    Returns a newly-constructed RuleBasedBreakIterator with the same
 111 //              behavior, and iterating over the same text, as this one.
 112 //
 113 //------------------------------------------------------------------------------
 114 BreakIterator*
 115 DictionaryBasedBreakIterator::clone() const {
 116     return new DictionaryBasedBreakIterator(*this);
 117 }
 118
 119 //=======================================================================
 120 // BreakIterator overrides
 121 //=======================================================================
 122
 123 /**
 124  * Advances the iterator one step backwards.
 125  * @return The position of the last boundary position before the
 126  * current iteration position
 127  */
 128 int32_t
 129 DictionaryBasedBreakIterator::previous()
 130 {
 131     // if we have cached break positions and we're still in the range
 132     // covered by them, just move one step backward in the cache
 133     if (cachedBreakPositions != NULL && positionInCache > 0) {
 134         --positionInCache;
 135         fText->setIndex(cachedBreakPositions[positionInCache]);
 136         return cachedBreakPositions[positionInCache];
 137     }
 138
 139     // otherwise, dump the cache and use the inherited previous() method to move
 140     // backward.  This may fill up the cache with new break positions, in which
 141     // case we have to mark our position in the cache
 142     else {
 143         reset();
 144         int32_t result = RuleBasedBreakIterator::previous();
 145         if (cachedBreakPositions != NULL) {
 146             for (positionInCache=0;
 147                 cachedBreakPositions[positionInCache] != result;
 148                 positionInCache++);
 149             U_ASSERT(positionInCache < numCachedBreakPositions);
 150             if (positionInCache >= numCachedBreakPositions) {
 151                 // Something has gone wrong.  Dump the cache.
 152                 reset();
 153             }
 154         }
 155         return result;
 156     }
 157 }
 158
 159 /**
 160  * Sets the current iteration position to the last boundary position
 161  * before the specified position.
 162  * @param offset The position to begin searching from
 163  * @return The position of the last boundary before "offset"
 164  */
 165 int32_t
 166 DictionaryBasedBreakIterator::preceding(int32_t offset)
 167 {
 168     // if the offset passed in is already past the end of the text,
 169     // just return DONE; if it's before the beginning, return the
 170     // text's starting offset
 171     if (fText == NULL || offset > fText->endIndex()) {
 172         return BreakIterator::DONE;
 173     }
 174     else if (offset < fText->startIndex()) {
 175         return fText->startIndex();
 176     }
 177
 178     // if we have no cached break positions, or "offset" is outside the
 179     // range covered by the cache, we can just call the inherited routine
 180     // (which will eventually call other routines in this class that may
 181     // refresh the cache)
 182     if (cachedBreakPositions == NULL || offset <= cachedBreakPositions[0] ||
 183             offset > cachedBreakPositions[numCachedBreakPositions - 1]) {
 184         reset();
 185         return RuleBasedBreakIterator::preceding(offset);
 186     }
 187
 188     // on the other hand, if "offset" is within the range covered by the cache,
 189     // then all we have to do is search the cache for the last break position
 190     // before "offset"
 191     else {
 192         positionInCache = 0;
 193         while (positionInCache < numCachedBreakPositions
 194                && offset > cachedBreakPositions[positionInCache])
 195             ++positionInCache;
 196         --positionInCache;
 197         fText->setIndex(cachedBreakPositions[positionInCache]);
 198         return fText->getIndex();
 199     }
 200 }
 201
 202 /**
 203  * Sets the current iteration position to the first boundary position after
 204  * the specified position.
 205  * @param offset The position to begin searching forward from
 206  * @return The position of the first boundary after "offset"
 207  */
 208 int32_t
 209 DictionaryBasedBreakIterator::following(int32_t offset)
 210 {
 211     // if the offset passed in is already past the end of the text,
 212     // just return DONE; if it's before the beginning, return the
 213     // text's starting offset
 214     if (fText == NULL || offset > fText->endIndex()) {
 215         return BreakIterator::DONE;
 216     }
 217     else if (offset < fText->startIndex()) {
 218         return fText->startIndex();
 219     }
 220
 221     // if we have no cached break positions, or if "offset" is outside the
 222     // range covered by the cache, then dump the cache and call our
 223     // inherited following() method.  This will call other methods in this
 224     // class that may refresh the cache.
 225     if (cachedBreakPositions == NULL || offset < cachedBreakPositions[0] ||
 226             offset >= cachedBreakPositions[numCachedBreakPositions - 1]) {
 227         reset();
 228         return RuleBasedBreakIterator::following(offset);
 229     }
 230
 231     // on the other hand, if "offset" is within the range covered by the
 232     // cache, then just search the cache for the first break position
 233     // after "offset"
 234     else {
 235         positionInCache = 0;
 236         while (positionInCache < numCachedBreakPositions
 237                && offset >= cachedBreakPositions[positionInCache])
 238             ++positionInCache;
 239         fText->setIndex(cachedBreakPositions[positionInCache]);
 240         return fText->getIndex();
 241     }
 242 }
 243
 244 /**
 245  * This is the implementation function for next().
 246  */
 247 int32_t
 248 DictionaryBasedBreakIterator::handleNext()
 249 {
 250     UErrorCode status = U_ZERO_ERROR;
 251     // if there are no cached break positions, or if we've just moved
 252     // off the end of the range covered by the cache, we have to dump
 253     // and possibly regenerate the cache
 254     if (cachedBreakPositions == NULL || positionInCache == numCachedBreakPositions - 1) {
 255
 256         // start by using the inherited handleNext() to find a tentative return
 257         // value.   dictionaryCharCount tells us how many dictionary characters
 258         // we passed over on our way to the tentative return value
 259         int32_t startPos = fText->getIndex();
 260         fDictionaryCharCount = 0;
 261         int32_t result = RuleBasedBreakIterator::handleNext();
 262
 263         // if we passed over more than one dictionary character, then we use
 264         // divideUpDictionaryRange() to regenerate the cached break positions
 265         // for the new range
 266         if (fDictionaryCharCount > 1 && result - startPos > 1) {
 267             divideUpDictionaryRange(startPos, result, status);
 268             U_ASSERT(U_SUCCESS(status));
 269             if (U_FAILURE(status)) {
 270                 // Something went badly wrong, an internal error.
 271                 // We have no way from here to report it to caller.
 272                 // Treat as if this is if the dictionary did not apply to range.
 273                 reset();
 274                 return result;
 275             }
 276         }
 277
 278         // otherwise, the value we got back from the inherited fuction
 279         // is our return value, and we can dump the cache
 280         else {
 281             reset();
 282             return result;
 283         }
 284     }
 285
 286     // if the cache of break positions has been regenerated (or existed all
 287     // along), then just advance to the next break position in the cache
 288     // and return it
 289     if (cachedBreakPositions != NULL) {
 290         ++positionInCache;
 291         fText->setIndex(cachedBreakPositions[positionInCache]);
 292         return cachedBreakPositions[positionInCache];
 293     }
 294     return -9999;   // SHOULD NEVER GET HERE!
 295 }
 296
 297 void
 298 DictionaryBasedBreakIterator::reset()
 299 {
 300     uprv_free(cachedBreakPositions);
 301     cachedBreakPositions = NULL;
 302     numCachedBreakPositions = 0;
 303     fDictionaryCharCount = 0;
 304     positionInCache = 0;
 305 }
 306
 307
 308
 309 //------------------------------------------------------------------------------
 310 //
 311 //    init()    Common initialization routine, for use by constructors, etc.
 312 //
 313 //------------------------------------------------------------------------------
 314 void DictionaryBasedBreakIterator::init() {
 315     cachedBreakPositions    = NULL;
 316     fTables                 = NULL;
 317     numCachedBreakPositions = 0;
 318     fDictionaryCharCount    = 0;
 319     positionInCache         = 0;
 320 }
 321
 322
 323 //------------------------------------------------------------------------------
 324 //
 325 //    BufferClone
 326 //
 327 //------------------------------------------------------------------------------
 328 BreakIterator *  DictionaryBasedBreakIterator::createBufferClone(void *stackBuffer,
 329                                    int32_t &bufferSize,
 330                                    UErrorCode &status)
 331 {
 332     if (U_FAILURE(status)){
 333         return NULL;
 334     }
 335
 336     //
 337     //  If user buffer size is zero this is a preflight operation to
 338     //    obtain the needed buffer size, allowing for worst case misalignment.
 339     //
 340     if (bufferSize == 0) {
 341         bufferSize = sizeof(DictionaryBasedBreakIterator) + U_ALIGNMENT_OFFSET_UP(0);
 342         return NULL;
 343     }
 344
 345     //
 346     //  Check the alignment and size of the user supplied buffer.
 347     //  Allocate heap memory if the user supplied memory is insufficient.
 348     //
 349     char     *buf   = (char *)stackBuffer;
 350     uint32_t s      = bufferSize;
 351
 352     if (stackBuffer == NULL) {
 353         s = 0;   // Ignore size, force allocation if user didn't give us a buffer.
 354     }
 355     if (U_ALIGNMENT_OFFSET(stackBuffer) != 0) {
 356         int32_t offsetUp = (int32_t)U_ALIGNMENT_OFFSET_UP(buf);
 357         s   -= offsetUp;
 358         buf += offsetUp;
 359     }
 360     if (s < sizeof(DictionaryBasedBreakIterator)) {
 361         buf = (char *) new DictionaryBasedBreakIterator();
 362         if (buf == 0) {
 363             status = U_MEMORY_ALLOCATION_ERROR;
 364             return NULL;
 365         }
 366         status = U_SAFECLONE_ALLOCATED_WARNING;
 367     }
 368
 369     //
 370     //  Initialize the clone object.
 371     //    TODO:  using an overloaded C++ "operator new" to directly initialize the
 372     //           copy in the user's buffer would be better, but it doesn't seem
 373     //           to get along with namespaces.  Investigate why.
 374     //
 375     //           The memcpy is only safe with an empty (default constructed)
 376     //           break iterator.  Use on others can screw up reference counts
 377     //           to data.  memcpy-ing objects is not really a good idea...
 378     //
 379     DictionaryBasedBreakIterator localIter;        // Empty break iterator, source for memcpy
 380     DictionaryBasedBreakIterator *clone = (DictionaryBasedBreakIterator *)buf;
 381     uprv_memcpy(clone, &localIter, sizeof(DictionaryBasedBreakIterator)); // clone = empty, but initialized, iterator.
 382     *clone = *this;                               // clone = the real one we want.
 383     if (status != U_SAFECLONE_ALLOCATED_WARNING) {
 384         clone->fBufferClone = TRUE;
 385     }
 386     return clone;
 387 }
 388
 389
 390
 391
 392 /**
 393  * This is the function that actually implements the dictionary-based
 394  * algorithm.  Given the endpoints of a range of text, it uses the
 395  * dictionary to determine the positions of any boundaries in this
 396  * range.  It stores all the boundary positions it discovers in
 397  * cachedBreakPositions so that we only have to do this work once
 398  * for each time we enter the range.
 399  */
 400 void
 401 DictionaryBasedBreakIterator::divideUpDictionaryRange(int32_t startPos, int32_t endPos, UErrorCode &status)
 402 {
 403     // the range we're dividing may begin or end with non-dictionary characters
 404     // (i.e., for line breaking, we may have leading or trailing punctuation
 405     // that needs to be kept with the word).  Seek from the beginning of the
 406     // range to the first dictionary character
 407     fText->setIndex(startPos);
 408     UChar c = fText->current();
 409     while (isDictionaryChar(c) == FALSE) {
 410         c = fText->next();
 411     }
 412
 413     if (U_FAILURE(status)) {
 414         return; // UStack below overwrites the status error codes
 415     }
 416
 417     // initialize.  We maintain two stacks: currentBreakPositions contains
 418     // the list of break positions that will be returned if we successfully
 419     // finish traversing the whole range now.  possibleBreakPositions lists
 420     // all other possible word ends we've passed along the way.  (Whenever
 421     // we reach an error [a sequence of characters that can't begin any word
 422     // in the dictionary], we back up, possibly delete some breaks from
 423     // currentBreakPositions, move a break from possibleBreakPositions
 424     // to currentBreakPositions, and start over from there.  This process
 425     // continues in this way until we either successfully make it all the way
 426     // across the range, or exhaust all of our combinations of break
 427     // positions.) wrongBreakPositions is used to keep track of paths we've
 428     // tried on previous iterations.  As the iterator backs up further and
 429     // further, this saves us from having to follow each possible path
 430     // through the text all the way to the error (hopefully avoiding many
 431     // future recursive calls as well).
 432     // there can be only one kind of error in UStack and UVector, so we'll
 433     // just let the error fall through
 434     UStack currentBreakPositions(status);
 435     UStack possibleBreakPositions(status);
 436     UVector wrongBreakPositions(status);
 437
 438     // the dictionary is implemented as a trie, which is treated as a state
 439     // machine.  -1 represents the end of a legal word.  Every word in the
 440     // dictionary is represented by a path from the root node to -1.  A path
 441     // that ends in state 0 is an illegal combination of characters.
 442     int16_t state = 0;
 443
 444     // these two variables are used for error handling.  We keep track of the
 445     // farthest we've gotten through the range being divided, and the combination
 446     // of breaks that got us that far.  If we use up all possible break
 447     // combinations, the text contains an error or a word that's not in the
 448     // dictionary.  In this case, we "bless" the break positions that got us the
 449     // farthest as real break positions, and then start over from scratch with
 450     // the character where the error occurred.
 451     int32_t farthestEndPoint = fText->getIndex();
 452     UStack bestBreakPositions(status);
 453     UBool bestBreakPositionsInitialized = FALSE;
 454
 455     if (U_FAILURE(status)) {
 456         return;
 457     }
 458     // initialize (we always exit the loop with a break statement)
 459     c = fText->current();
 460     for (;;) {
 461
 462         // if we can transition to state "-1" from our current state, we're
 463         // on the last character of a legal word.  Push that position onto
 464         // the possible-break-positions stack
 465         if (fTables->fDictionary->at(state, (int32_t)0) == -1) {
 466             possibleBreakPositions.push(fText->getIndex(), status);
 467             if (U_FAILURE(status)) {
 468                 return;
 469             }
 470         }
 471
 472         // look up the new state to transition to in the dictionary
 473         state = fTables->fDictionary->at(state, c);
 474
 475         // if the character we're sitting on causes us to transition to
 476         // the "end of word" state, then it was a non-dictionary character
 477         // and we've successfully traversed the whole range.  Drop out
 478         // of the loop.
 479         if (state == -1) {
 480             currentBreakPositions.push(fText->getIndex(), status);
 481             if (U_FAILURE(status)) {
 482                 return;
 483             }
 484             break;
 485         }
 486
 487         // if the character we're sitting on causes us to transition to
 488         // the error state, or if we've gone off the end of the range
 489         // without transitioning to the "end of word" state, we've hit
 490         // an error...
 491         else if (state == 0 || fText->getIndex() >= endPos) {
 492
 493             // if this is the farthest we've gotten, take note of it in
 494             // case there's an error in the text
 495             if (fText->getIndex() > farthestEndPoint) {
 496                 farthestEndPoint = fText->getIndex();
 497                 bestBreakPositions.removeAllElements();
 498                 bestBreakPositionsInitialized = TRUE;
 499                 for (int32_t i = 0; i < currentBreakPositions.size(); i++) {
 500                     bestBreakPositions.push(currentBreakPositions.elementAti(i), status);
 501                 }
 502             }
 503
 504             // wrongBreakPositions is a list of all break positions we've tried starting
 505             // that didn't allow us to traverse all the way through the text.  Every time
 506             // we pop a break position off of currentBreakPositions, we put it into
 507             // wrongBreakPositions to avoid trying it again later.  If we make it to this
 508             // spot, we're either going to back up to a break in possibleBreakPositions
 509             // and try starting over from there, or we've exhausted all possible break
 510             // positions and are going to do the fallback procedure.  This loop prevents
 511             // us from messing with anything in possibleBreakPositions that didn't work as
 512             // a starting point the last time we tried it (this is to prevent a bunch of
 513             // repetitive checks from slowing down some extreme cases)
 514             while (!possibleBreakPositions.isEmpty() && wrongBreakPositions.contains(
 515                         possibleBreakPositions.peeki())) {
 516                 possibleBreakPositions.popi();
 517             }
 518
 519             // if we've used up all possible break-position combinations, there's
 520             // an error or an unknown word in the text.  In this case, we start
 521             // over, treating the farthest character we've reached as the beginning
 522             // of the range, and "blessing" the break positions that got us that
 523             // far as real break positions
 524             if (possibleBreakPositions.isEmpty()) {
 525                 if (bestBreakPositionsInitialized) {
 526                     currentBreakPositions.removeAllElements();
 527                     for (int32_t i = 0; i < bestBreakPositions.size(); i++) {
 528                         currentBreakPositions.push(bestBreakPositions.elementAti(i), status);
 529                         if (U_FAILURE(status)) {
 530                             return;
 531                         }
 532                     }
 533                     bestBreakPositions.removeAllElements();
 534                     if (farthestEndPoint < endPos) {
 535                         fText->setIndex(farthestEndPoint + 1);
 536                     }
 537                     else {
 538                         break;
 539                     }
 540                 }
 541                 else {
 542                     if ((currentBreakPositions.isEmpty()
 543                             || currentBreakPositions.peeki() != fText->getIndex())
 544                             && fText->getIndex() != startPos) {
 545                         currentBreakPositions.push(fText->getIndex(), status);
 546                         if (U_FAILURE(status)) {
 547                             return;
 548                         }
 549                     }
 550                     fText->next();
 551                     currentBreakPositions.push(fText->getIndex(), status);
 552                     if (U_FAILURE(status)) {
 553                         return;
 554                     }
 555                 }
 556             }
 557
 558             // if we still have more break positions we can try, then promote the
 559             // last break in possibleBreakPositions into currentBreakPositions,
 560             // and get rid of all entries in currentBreakPositions that come after
 561             // it.  Then back up to that position and start over from there (i.e.,
 562             // treat that position as the beginning of a new word)
 563             else {
 564                 int32_t temp = possibleBreakPositions.popi();
 565                 int32_t temp2 = 0;
 566                 while (!currentBreakPositions.isEmpty() && temp <
 567                        currentBreakPositions.peeki()) {
 568                     temp2 = currentBreakPositions.popi();
 569                     wrongBreakPositions.addElement(temp2, status);
 570                 }
 571                 currentBreakPositions.push(temp, status);
 572                 fText->setIndex(currentBreakPositions.peeki());
 573             }
 574
 575             // re-sync "c" for the next go-round, and drop out of the loop if
 576             // we've made it off the end of the range
 577             c = fText->current();
 578             if (fText->getIndex() >= endPos) {
 579                 break;
 580             }
 581         }
 582
 583         // if we didn't hit any exceptional conditions on this last iteration,
 584         // just advance to the next character and loop
 585         else {
 586             c = fText->next();
 587         }
 588     }
 589
 590     // dump the last break position in the list, and replace it with the actual
 591     // end of the range (which may be the same character, or may be further on
 592     // because the range actually ended with non-dictionary characters we want to
 593     // keep with the word)
 594     if (!currentBreakPositions.isEmpty()) {
 595         currentBreakPositions.popi();
 596     }
 597     currentBreakPositions.push(endPos, status);
 598     if (U_FAILURE(status)) {
 599         return;
 600     }
 601
 602     // create a regular array to hold the break positions and copy
 603     // the break positions from the stack to the array (in addition,
 604     // our starting position goes into this array as a break position).
 605     // This array becomes the cache of break positions used by next()
 606     // and previous(), so this is where we actually refresh the cache.
 607     if (cachedBreakPositions != NULL) {
 608         uprv_free(cachedBreakPositions);
 609     }
 610     cachedBreakPositions = (int32_t *)uprv_malloc((currentBreakPositions.size() + 1) * sizeof(int32_t));
 611     /* Test for NULL */
 612     if(cachedBreakPositions == NULL) {
 613         status = U_MEMORY_ALLOCATION_ERROR;
 614         return;
 615     }
 616     numCachedBreakPositions = currentBreakPositions.size() + 1;
 617     cachedBreakPositions[0] = startPos;
 618
 619     for (int32_t i = 0; i < currentBreakPositions.size(); i++) {
 620         cachedBreakPositions[i + 1] = currentBreakPositions.elementAti(i);
 621     }
 622     positionInCache = 0;
 623 }
 624
 625 U_NAMESPACE_END
 626
 627 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
 628
 629 /* eof */