icuSources/common/rbbi.cpp

   1 /*
   2 ***************************************************************************
   3 *   Copyright (C) 1999-2008 International Business Machines Corporation   *
   4 *   and others. All rights reserved.                                      *
   5 ***************************************************************************
   6 */
   7 //
   8 //  file:  rbbi.c    Contains the implementation of the rule based break iterator
   9 //                   runtime engine and the API implementation for
  10 //                   class RuleBasedBreakIterator
  11 //
  12
  13 #include "unicode/utypes.h"
  14
  15 #if !UCONFIG_NO_BREAK_ITERATION
  16
  17 #include "unicode/rbbi.h"
  18 #include "unicode/schriter.h"
  19 #include "unicode/uchriter.h"
  20 #include "unicode/udata.h"
  21 #include "unicode/uclean.h"
  22 #include "rbbidata.h"
  23 #include "rbbirb.h"
  24 #include "cmemory.h"
  25 #include "cstring.h"
  26 #include "umutex.h"
  27 #include "ucln_cmn.h"
  28 #include "brkeng.h"
  29
  30 #include "uassert.h"
  31 #include "uvector.h"
  32
  33 // if U_LOCAL_SERVICE_HOOK is defined, then localsvc.cpp is expected to be included.
  34 #if U_LOCAL_SERVICE_HOOK
  35 #include "localsvc.h"
  36 #endif
  37
  38 #ifdef RBBI_DEBUG
  39 static UBool fTrace = FALSE;
  40 #endif
  41
  42 U_NAMESPACE_BEGIN
  43
  44 // The state number of the starting state
  45 #define START_STATE 1
  46
  47 // The state-transition value indicating "stop"
  48 #define STOP_STATE  0
  49
  50
  51 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedBreakIterator)
  52
  53
  54 //=======================================================================
  55 // constructors
  56 //=======================================================================
  57
  58 /**
  59  * Constructs a RuleBasedBreakIterator that uses the already-created
  60  * tables object that is passed in as a parameter.
  61  */
  62 RuleBasedBreakIterator::RuleBasedBreakIterator(RBBIDataHeader* data, UErrorCode &status)
  63 {
  64     init();
  65     fData = new RBBIDataWrapper(data, status); // status checked in constructor
  66     if (U_FAILURE(status)) {return;}
  67     if(fData == 0) {
  68         status = U_MEMORY_ALLOCATION_ERROR;
  69         return;
  70     }
  71 }
  72
  73 /**
  74  * Same as above but does not adopt memory
  75  */
  76 RuleBasedBreakIterator::RuleBasedBreakIterator(const RBBIDataHeader* data, enum EDontAdopt, UErrorCode &status)
  77 {
  78     init();
  79     fData = new RBBIDataWrapper(data, RBBIDataWrapper::kDontAdopt, status); // status checked in constructor
  80     if (U_FAILURE(status)) {return;}
  81     if(fData == 0) {
  82         status = U_MEMORY_ALLOCATION_ERROR;
  83         return;
  84     }
  85 }
  86
  87 //-------------------------------------------------------------------------------
  88 //
  89 //   Constructor   from a UDataMemory handle to precompiled break rules
  90 //                 stored in an ICU data file.
  91 //
  92 //-------------------------------------------------------------------------------
  93 RuleBasedBreakIterator::RuleBasedBreakIterator(UDataMemory* udm, UErrorCode &status)
  94 {
  95     init();
  96     fData = new RBBIDataWrapper(udm, status); // status checked in constructor
  97     if (U_FAILURE(status)) {return;}
  98     if(fData == 0) {
  99         status = U_MEMORY_ALLOCATION_ERROR;
 100         return;
 101     }
 102 }
 103
 104
 105
 106 //-------------------------------------------------------------------------------
 107 //
 108 //   Constructor       from a set of rules supplied as a string.
 109 //
 110 //-------------------------------------------------------------------------------
 111 RuleBasedBreakIterator::RuleBasedBreakIterator( const UnicodeString  &rules,
 112                                                 UParseError          &parseError,
 113                                                 UErrorCode           &status)
 114 {
 115     init();
 116     if (U_FAILURE(status)) {return;}
 117     RuleBasedBreakIterator *bi = (RuleBasedBreakIterator *)
 118         RBBIRuleBuilder::createRuleBasedBreakIterator(rules, &parseError, status);
 119     // Note:  This is a bit awkward.  The RBBI ruleBuilder has a factory method that
 120     //        creates and returns a complete RBBI.  From here, in a constructor, we
 121     //        can't just return the object created by the builder factory, hence
 122     //        the assignment of the factory created object to "this".
 123     if (U_SUCCESS(status)) {
 124         *this = *bi;
 125         delete bi;
 126     }
 127 }
 128
 129
 130 //-------------------------------------------------------------------------------
 131 //
 132 // Default Constructor.      Create an empty shell that can be set up later.
 133 //                           Used when creating a RuleBasedBreakIterator from a set
 134 //                           of rules.
 135 //-------------------------------------------------------------------------------
 136 RuleBasedBreakIterator::RuleBasedBreakIterator() {
 137     init();
 138 }
 139
 140
 141 //-------------------------------------------------------------------------------
 142 //
 143 //   Copy constructor.  Will produce a break iterator with the same behavior,
 144 //                      and which iterates over the same text, as the one passed in.
 145 //
 146 //-------------------------------------------------------------------------------
 147 RuleBasedBreakIterator::RuleBasedBreakIterator(const RuleBasedBreakIterator& other)
 148 : BreakIterator(other)
 149 {
 150     this->init();
 151     *this = other;
 152 }
 153
 154
 155 /**
 156  * Destructor
 157  */
 158 RuleBasedBreakIterator::~RuleBasedBreakIterator() {
 159     if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
 160         // fCharIter was adopted from the outside.
 161         delete fCharIter;
 162     }
 163     fCharIter = NULL;
 164     delete fSCharIter;
 165     fCharIter = NULL;
 166     delete fDCharIter;
 167     fDCharIter = NULL;
 168
 169     utext_close(fText);
 170
 171     if (fData != NULL) {
 172         fData->removeReference();
 173         fData = NULL;
 174     }
 175     if (fCachedBreakPositions) {
 176         uprv_free(fCachedBreakPositions);
 177         fCachedBreakPositions = NULL;
 178     }
 179     if (fLanguageBreakEngines) {
 180         delete fLanguageBreakEngines;
 181         fLanguageBreakEngines = NULL;
 182     }
 183     if (fUnhandledBreakEngine) {
 184         delete fUnhandledBreakEngine;
 185         fUnhandledBreakEngine = NULL;
 186     }
 187 }
 188
 189 /**
 190  * Assignment operator.  Sets this iterator to have the same behavior,
 191  * and iterate over the same text, as the one passed in.
 192  */
 193 RuleBasedBreakIterator&
 194 RuleBasedBreakIterator::operator=(const RuleBasedBreakIterator& that) {
 195     if (this == &that) {
 196         return *this;
 197     }
 198     reset();    // Delete break cache information
 199     fBreakType = that.fBreakType;
 200     if (fLanguageBreakEngines != NULL) {
 201         delete fLanguageBreakEngines;
 202         fLanguageBreakEngines = NULL;   // Just rebuild for now
 203     }
 204     // TODO: clone fLanguageBreakEngines from "that"
 205     UErrorCode status = U_ZERO_ERROR;
 206     fText = utext_clone(fText, that.fText, FALSE, TRUE, &status);
 207
 208     if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
 209         delete fCharIter;
 210     }
 211     fCharIter = NULL;
 212
 213     if (that.fCharIter != NULL ) {
 214         // This is a little bit tricky - it will intially appear that
 215         //  this->fCharIter is adopted, even if that->fCharIter was
 216         //  not adopted.  That's ok.
 217         fCharIter = that.fCharIter->clone();
 218     }
 219
 220     if (fData != NULL) {
 221         fData->removeReference();
 222         fData = NULL;
 223     }
 224     if (that.fData != NULL) {
 225         fData = that.fData->addReference();
 226     }
 227
 228     return *this;
 229 }
 230
 231
 232
 233 //-----------------------------------------------------------------------------
 234 //
 235 //    init()      Shared initialization routine.   Used by all the constructors.
 236 //                Initializes all fields, leaving the object in a consistent state.
 237 //
 238 //-----------------------------------------------------------------------------
 239 void RuleBasedBreakIterator::init() {
 240     UErrorCode  status    = U_ZERO_ERROR;
 241     fBufferClone          = FALSE;
 242     fText                 = utext_openUChars(NULL, NULL, 0, &status);
 243     fCharIter             = NULL;
 244     fSCharIter            = NULL;
 245     fDCharIter            = NULL;
 246     fData                 = NULL;
 247     fLastRuleStatusIndex  = 0;
 248     fLastStatusIndexValid = TRUE;
 249     fDictionaryCharCount  = 0;
 250     fBreakType            = -1;
 251
 252     fCachedBreakPositions    = NULL;
 253     fLanguageBreakEngines    = NULL;
 254     fUnhandledBreakEngine    = NULL;
 255     fNumCachedBreakPositions = 0;
 256     fPositionInCache         = 0;
 257
 258 #ifdef RBBI_DEBUG
 259     static UBool debugInitDone = FALSE;
 260     if (debugInitDone == FALSE) {
 261         char *debugEnv = getenv("U_RBBIDEBUG");
 262         if (debugEnv && uprv_strstr(debugEnv, "trace")) {
 263             fTrace = TRUE;
 264         }
 265         debugInitDone = TRUE;
 266     }
 267 #endif
 268 }
 269
 270
 271
 272 //-----------------------------------------------------------------------------
 273 //
 274 //    clone - Returns a newly-constructed RuleBasedBreakIterator with the same
 275 //            behavior, and iterating over the same text, as this one.
 276 //            Virtual function: does the right thing with subclasses.
 277 //
 278 //-----------------------------------------------------------------------------
 279 BreakIterator*
 280 RuleBasedBreakIterator::clone(void) const {
 281     return new RuleBasedBreakIterator(*this);
 282 }
 283
 284 /**
 285  * Equality operator.  Returns TRUE if both BreakIterators are of the
 286  * same class, have the same behavior, and iterate over the same text.
 287  */
 288 UBool
 289 RuleBasedBreakIterator::operator==(const BreakIterator& that) const {
 290     if (that.getDynamicClassID() != getDynamicClassID()) {
 291         return FALSE;
 292     }
 293
 294     const RuleBasedBreakIterator& that2 = (const RuleBasedBreakIterator&) that;
 295
 296     if (!utext_equals(fText, that2.fText)) {
 297         // The two break iterators are operating on different text,
 298         //   or have a different interation position.
 299         return FALSE;
 300     };
 301
 302     // TODO:  need a check for when in a dictionary region at different offsets.
 303
 304     if (that2.fData == fData ||
 305         (fData != NULL && that2.fData != NULL && *that2.fData == *fData)) {
 306             // The two break iterators are using the same rules.
 307             return TRUE;
 308         }
 309     return FALSE;
 310 }
 311
 312 /**
 313  * Compute a hash code for this BreakIterator
 314  * @return A hash code
 315  */
 316 int32_t
 317 RuleBasedBreakIterator::hashCode(void) const {
 318     int32_t   hash = 0;
 319     if (fData != NULL) {
 320         hash = fData->hashCode();
 321     }
 322     return hash;
 323 }
 324
 325
 326 void RuleBasedBreakIterator::setText(UText *ut, UErrorCode &status) {
 327     if (U_FAILURE(status)) {
 328         return;
 329     }
 330     reset();
 331     fText = utext_clone(fText, ut, FALSE, TRUE, &status);
 332
 333     // Set up a dummy CharacterIterator to be returned if anyone
 334     //   calls getText().  With input from UText, there is no reasonable
 335     //   way to return a characterIterator over the actual input text.
 336     //   Return one over an empty string instead - this is the closest
 337     //   we can come to signaling a failure.
 338     //   (GetText() is obsolete, this failure is sort of OK)
 339     if (fDCharIter == NULL) {
 340         static const UChar c = 0;
 341         fDCharIter = new UCharCharacterIterator(&c, 0);
 342         if (fDCharIter == NULL) {
 343             status = U_MEMORY_ALLOCATION_ERROR;
 344             return;
 345         }
 346     }
 347
 348     if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
 349         // existing fCharIter was adopted from the outside.  Delete it now.
 350         delete fCharIter;
 351     }
 352     fCharIter = fDCharIter;
 353
 354     this->first();
 355 }
 356
 357
 358 UText *RuleBasedBreakIterator::getUText(UText *fillIn, UErrorCode &status) const {
 359     UText *result = utext_clone(fillIn, fText, FALSE, TRUE, &status);
 360     return result;
 361 }
 362
 363
 364
 365 /**
 366  * Returns the description used to create this iterator
 367  */
 368 const UnicodeString&
 369 RuleBasedBreakIterator::getRules() const {
 370     if (fData != NULL) {
 371         return fData->getRuleSourceString();
 372     } else {
 373         static const UnicodeString *s;
 374         if (s == NULL) {
 375             // TODO:  something more elegant here.
 376             //        perhaps API should return the string by value.
 377             //        Note:  thread unsafe init & leak are semi-ok, better than
 378             //               what was before.  Sould be cleaned up, though.
 379             s = new UnicodeString;
 380         }
 381         return *s;
 382     }
 383 }
 384
 385 //=======================================================================
 386 // BreakIterator overrides
 387 //=======================================================================
 388
 389 /**
 390  * Return a CharacterIterator over the text being analyzed.
 391  */
 392 CharacterIterator&
 393 RuleBasedBreakIterator::getText() const {
 394     return *fCharIter;
 395 }
 396
 397 /**
 398  * Set the iterator to analyze a new piece of text.  This function resets
 399  * the current iteration position to the beginning of the text.
 400  * @param newText An iterator over the text to analyze.
 401  */
 402 void
 403 RuleBasedBreakIterator::adoptText(CharacterIterator* newText) {
 404     // If we are holding a CharacterIterator adopted from a
 405     //   previous call to this function, delete it now.
 406     if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
 407         delete fCharIter;
 408     }
 409
 410     fCharIter = newText;
 411     UErrorCode status = U_ZERO_ERROR;
 412     reset();
 413     if (newText==NULL || newText->startIndex() != 0) {
 414         // startIndex !=0 wants to be an error, but there's no way to report it.
 415         // Make the iterator text be an empty string.
 416         fText = utext_openUChars(fText, NULL, 0, &status);
 417     } else {
 418         fText = utext_openCharacterIterator(fText, newText, &status);
 419     }
 420     this->first();
 421 }
 422
 423 /**
 424  * Set the iterator to analyze a new piece of text.  This function resets
 425  * the current iteration position to the beginning of the text.
 426  * @param newText An iterator over the text to analyze.
 427  */
 428 void
 429 RuleBasedBreakIterator::setText(const UnicodeString& newText) {
 430     UErrorCode status = U_ZERO_ERROR;
 431     reset();
 432     fText = utext_openConstUnicodeString(fText, &newText, &status);
 433
 434     // Set up a character iterator on the string.
 435     //   Needed in case someone calls getText().
 436     //  Can not, unfortunately, do this lazily on the (probably never)
 437     //  call to getText(), because getText is const.
 438     if (fSCharIter == NULL) {
 439         fSCharIter = new StringCharacterIterator(newText);
 440     } else {
 441         fSCharIter->setText(newText);
 442     }
 443
 444     if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
 445         // old fCharIter was adopted from the outside.  Delete it.
 446         delete fCharIter;
 447     }
 448     fCharIter = fSCharIter;
 449
 450     this->first();
 451 }
 452
 453
 454
 455 /**
 456  * Sets the current iteration position to the beginning of the text.
 457  * @return The offset of the beginning of the text.
 458  */
 459 int32_t RuleBasedBreakIterator::first(void) {
 460     reset();
 461     fLastRuleStatusIndex  = 0;
 462     fLastStatusIndexValid = TRUE;
 463     //if (fText == NULL)
 464     //    return BreakIterator::DONE;
 465
 466     utext_setNativeIndex(fText, 0);
 467     return 0;
 468 }
 469
 470 /**
 471  * Sets the current iteration position to the end of the text.
 472  * @return The text's past-the-end offset.
 473  */
 474 int32_t RuleBasedBreakIterator::last(void) {
 475     reset();
 476     if (fText == NULL) {
 477         fLastRuleStatusIndex  = 0;
 478         fLastStatusIndexValid = TRUE;
 479         return BreakIterator::DONE;
 480     }
 481
 482     fLastStatusIndexValid = FALSE;
 483     int32_t pos = (int32_t)utext_nativeLength(fText);
 484     utext_setNativeIndex(fText, pos);
 485     return pos;
 486 }
 487
 488 /**
 489  * Advances the iterator either forward or backward the specified number of steps.
 490  * Negative values move backward, and positive values move forward.  This is
 491  * equivalent to repeatedly calling next() or previous().
 492  * @param n The number of steps to move.  The sign indicates the direction
 493  * (negative is backwards, and positive is forwards).
 494  * @return The character offset of the boundary position n boundaries away from
 495  * the current one.
 496  */
 497 int32_t RuleBasedBreakIterator::next(int32_t n) {
 498     int32_t result = current();
 499     while (n > 0) {
 500         result = next();
 501         --n;
 502     }
 503     while (n < 0) {
 504         result = previous();
 505         ++n;
 506     }
 507     return result;
 508 }
 509
 510 /**
 511  * Advances the iterator to the next boundary position.
 512  * @return The position of the first boundary after this one.
 513  */
 514 int32_t RuleBasedBreakIterator::next(void) {
 515     // if we have cached break positions and we're still in the range
 516     // covered by them, just move one step forward in the cache
 517     if (fCachedBreakPositions != NULL) {
 518         if (fPositionInCache < fNumCachedBreakPositions - 1) {
 519             ++fPositionInCache;
 520             int32_t pos = fCachedBreakPositions[fPositionInCache];
 521             utext_setNativeIndex(fText, pos);
 522             return pos;
 523         }
 524         else {
 525             reset();
 526         }
 527     }
 528
 529     int32_t startPos = current();
 530     int32_t result = handleNext(fData->fForwardTable);
 531     if (fDictionaryCharCount > 0) {
 532         result = checkDictionary(startPos, result, FALSE);
 533     }
 534     return result;
 535 }
 536
 537 /**
 538  * Advances the iterator backwards, to the last boundary preceding this one.
 539  * @return The position of the last boundary position preceding this one.
 540  */
 541 int32_t RuleBasedBreakIterator::previous(void) {
 542     int32_t result;
 543     int32_t startPos;
 544
 545     // if we have cached break positions and we're still in the range
 546     // covered by them, just move one step backward in the cache
 547     if (fCachedBreakPositions != NULL) {
 548         if (fPositionInCache > 0) {
 549             --fPositionInCache;
 550             // If we're at the beginning of the cache, need to reevaluate the
 551             // rule status
 552             if (fPositionInCache <= 0) {
 553                 fLastStatusIndexValid = FALSE;
 554             }
 555             int32_t pos = fCachedBreakPositions[fPositionInCache];
 556             utext_setNativeIndex(fText, pos);
 557             return pos;
 558         }
 559         else {
 560             reset();
 561         }
 562     }
 563
 564     // if we're already sitting at the beginning of the text, return DONE
 565     if (fText == NULL || (startPos = current()) == 0) {
 566         fLastRuleStatusIndex  = 0;
 567         fLastStatusIndexValid = TRUE;
 568         return BreakIterator::DONE;
 569     }
 570
 571     if (fData->fSafeRevTable != NULL || fData->fSafeFwdTable != NULL) {
 572         result = handlePrevious(fData->fReverseTable);
 573         if (fDictionaryCharCount > 0) {
 574             result = checkDictionary(result, startPos, TRUE);
 575         }
 576         return result;
 577     }
 578
 579     // old rule syntax
 580     // set things up.  handlePrevious() will back us up to some valid
 581     // break position before the current position (we back our internal
 582     // iterator up one step to prevent handlePrevious() from returning
 583     // the current position), but not necessarily the last one before
 584
 585     // where we started
 586
 587     int32_t start = current();
 588
 589     UTEXT_PREVIOUS32(fText);
 590     int32_t lastResult    = handlePrevious(fData->fReverseTable);
 591     if (lastResult == UBRK_DONE) {
 592         lastResult = 0;
 593         utext_setNativeIndex(fText, 0);
 594     }
 595     result = lastResult;
 596     int32_t lastTag       = 0;
 597     UBool   breakTagValid = FALSE;
 598
 599     // iterate forward from the known break position until we pass our
 600     // starting point.  The last break position before the starting
 601     // point is our return value
 602
 603     for (;;) {
 604         result         = next();
 605         if (result == BreakIterator::DONE || result >= start) {
 606             break;
 607         }
 608         lastResult     = result;
 609         lastTag        = fLastRuleStatusIndex;
 610         breakTagValid  = TRUE;
 611     }
 612
 613     // fLastBreakTag wants to have the value for section of text preceding
 614     // the result position that we are to return (in lastResult.)  If
 615     // the backwards rules overshot and the above loop had to do two or more
 616     // next()s to move up to the desired return position, we will have a valid
 617     // tag value. But, if handlePrevious() took us to exactly the correct result positon,
 618     // we wont have a tag value for that position, which is only set by handleNext().
 619
 620     // set the current iteration position to be the last break position
 621     // before where we started, and then return that value
 622     utext_setNativeIndex(fText, lastResult);
 623     fLastRuleStatusIndex  = lastTag;       // for use by getRuleStatus()
 624     fLastStatusIndexValid = breakTagValid;
 625
 626     // No need to check the dictionary; it will have been handled by
 627     // next()
 628
 629     return lastResult;
 630 }
 631
 632 /**
 633  * Sets the iterator to refer to the first boundary position following
 634  * the specified position.
 635  * @offset The position from which to begin searching for a break position.
 636  * @return The position of the first break after the current position.
 637  */
 638 int32_t RuleBasedBreakIterator::following(int32_t offset) {
 639     // if we have cached break positions and offset is in the range
 640     // covered by them, use them
 641     // TODO: could use binary search
 642     // TODO: what if offset is outside range, but break is not?
 643     if (fCachedBreakPositions != NULL) {
 644         if (offset >= fCachedBreakPositions[0]
 645                 && offset < fCachedBreakPositions[fNumCachedBreakPositions - 1]) {
 646             fPositionInCache = 0;
 647             // We are guaranteed not to leave the array due to range test above
 648             while (offset >= fCachedBreakPositions[fPositionInCache]) {
 649                 ++fPositionInCache;
 650             }
 651             int32_t pos = fCachedBreakPositions[fPositionInCache];
 652             utext_setNativeIndex(fText, pos);
 653             return pos;
 654         }
 655         else {
 656             reset();
 657         }
 658     }
 659
 660     // if the offset passed in is already past the end of the text,
 661     // just return DONE; if it's before the beginning, return the
 662     // text's starting offset
 663     fLastRuleStatusIndex  = 0;
 664     fLastStatusIndexValid = TRUE;
 665     if (fText == NULL || offset >= utext_nativeLength(fText)) {
 666         last();
 667         return next();
 668     }
 669     else if (offset < 0) {
 670         return first();
 671     }
 672
 673     // otherwise, set our internal iteration position (temporarily)
 674     // to the position passed in.  If this is the _beginning_ position,
 675     // then we can just use next() to get our return value
 676
 677     int32_t result = 0;
 678
 679     if (fData->fSafeRevTable != NULL) {
 680         // new rule syntax
 681         utext_setNativeIndex(fText, offset);
 682         // move forward one codepoint to prepare for moving back to a
 683         // safe point.
 684         // this handles offset being between a supplementary character
 685         UTEXT_NEXT32(fText);
 686         // handlePrevious will move most of the time to < 1 boundary away
 687         handlePrevious(fData->fSafeRevTable);
 688         int32_t result = next();
 689         while (result <= offset) {
 690             result = next();
 691         }
 692         return result;
 693     }
 694     if (fData->fSafeFwdTable != NULL) {
 695         // backup plan if forward safe table is not available
 696         utext_setNativeIndex(fText, offset);
 697         UTEXT_PREVIOUS32(fText);
 698         // handle next will give result >= offset
 699         handleNext(fData->fSafeFwdTable);
 700         // previous will give result 0 or 1 boundary away from offset,
 701         // most of the time
 702         // we have to
 703         int32_t oldresult = previous();
 704         while (oldresult > offset) {
 705             int32_t result = previous();
 706             if (result <= offset) {
 707                 return oldresult;
 708             }
 709             oldresult = result;
 710         }
 711         int32_t result = next();
 712         if (result <= offset) {
 713             return next();
 714         }
 715         return result;
 716     }
 717     // otherwise, we have to sync up first.  Use handlePrevious() to back
 718     // up to a known break position before the specified position (if
 719     // we can determine that the specified position is a break position,
 720     // we don't back up at all).  This may or may not be the last break
 721     // position at or before our starting position.  Advance forward
 722     // from here until we've passed the starting position.  The position
 723     // we stop on will be the first break position after the specified one.
 724     // old rule syntax
 725
 726     utext_setNativeIndex(fText, offset);
 727     if (offset==0 ||
 728         offset==1  && utext_getNativeIndex(fText)==0) {
 729         return next();
 730     }
 731     result = previous();
 732
 733     while (result != BreakIterator::DONE && result <= offset) {
 734         result = next();
 735     }
 736
 737     return result;
 738 }
 739
 740 /**
 741  * Sets the iterator to refer to the last boundary position before the
 742  * specified position.
 743  * @offset The position to begin searching for a break from.
 744  * @return The position of the last boundary before the starting position.
 745  */
 746 int32_t RuleBasedBreakIterator::preceding(int32_t offset) {
 747     // if we have cached break positions and offset is in the range
 748     // covered by them, use them
 749     if (fCachedBreakPositions != NULL) {
 750         // TODO: binary search?
 751         // TODO: What if offset is outside range, but break is not?
 752         if (offset > fCachedBreakPositions[0]
 753                 && offset <= fCachedBreakPositions[fNumCachedBreakPositions - 1]) {
 754             fPositionInCache = 0;
 755             while (fPositionInCache < fNumCachedBreakPositions
 756                    && offset > fCachedBreakPositions[fPositionInCache])
 757                 ++fPositionInCache;
 758             --fPositionInCache;
 759             // If we're at the beginning of the cache, need to reevaluate the
 760             // rule status
 761             if (fPositionInCache <= 0) {
 762                 fLastStatusIndexValid = FALSE;
 763             }
 764             utext_setNativeIndex(fText, fCachedBreakPositions[fPositionInCache]);
 765             return fCachedBreakPositions[fPositionInCache];
 766         }
 767         else {
 768             reset();
 769         }
 770     }
 771
 772     // if the offset passed in is already past the end of the text,
 773     // just return DONE; if it's before the beginning, return the
 774     // text's starting offset
 775     if (fText == NULL || offset > utext_nativeLength(fText)) {
 776         // return BreakIterator::DONE;
 777         return last();
 778     }
 779     else if (offset < 0) {
 780         return first();
 781     }
 782
 783     // if we start by updating the current iteration position to the
 784     // position specified by the caller, we can just use previous()
 785     // to carry out this operation
 786
 787     if (fData->fSafeFwdTable != NULL) {
 788         // new rule syntax
 789         utext_setNativeIndex(fText, offset);
 790         int32_t newOffset = (int32_t)UTEXT_GETNATIVEINDEX(fText);
 791         if (newOffset != offset) {
 792             // Will come here if specified offset was not a code point boundary AND
 793             //   the underlying implmentation is using UText, which snaps any non-code-point-boundary
 794             //   indices to the containing code point.
 795             // For breakitereator::preceding only, these non-code-point indices need to be moved
 796             //   up to refer to the following codepoint.
 797             UTEXT_NEXT32(fText);
 798             offset = (int32_t)UTEXT_GETNATIVEINDEX(fText);
 799         }
 800
 801         // TODO:  (synwee) would it be better to just check for being in the middle of a surrogate pair,
 802         //        rather than adjusting the position unconditionally?
 803         //        (Change would interact with safe rules.)
 804         // TODO:  change RBBI behavior for off-boundary indices to match that of UText?
 805         //        affects only preceding(), seems cleaner, but is slightly different.
 806         UTEXT_PREVIOUS32(fText);
 807         handleNext(fData->fSafeFwdTable);
 808         int32_t result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
 809         while (result >= offset) {
 810             result = previous();
 811         }
 812         return result;
 813     }
 814     if (fData->fSafeRevTable != NULL) {
 815         // backup plan if forward safe table is not available
 816         //  TODO:  check whether this path can be discarded
 817         //         It's probably OK to say that rules must supply both safe tables
 818         //            if they use safe tables at all.  We have certainly never described
 819         //            to anyone how to work with just one safe table.
 820         utext_setNativeIndex(fText, offset);
 821         UTEXT_NEXT32(fText);
 822
 823         // handle previous will give result <= offset
 824         handlePrevious(fData->fSafeRevTable);
 825
 826         // next will give result 0 or 1 boundary away from offset,
 827         // most of the time
 828         // we have to
 829         int32_t oldresult = next();
 830         while (oldresult < offset) {
 831             int32_t result = next();
 832             if (result >= offset) {
 833                 return oldresult;
 834             }
 835             oldresult = result;
 836         }
 837         int32_t result = previous();
 838         if (result >= offset) {
 839             return previous();
 840         }
 841         return result;
 842     }
 843
 844     // old rule syntax
 845     utext_setNativeIndex(fText, offset);
 846     return previous();
 847 }
 848
 849 /**
 850  * Returns true if the specfied position is a boundary position.  As a side
 851  * effect, leaves the iterator pointing to the first boundary position at
 852  * or after "offset".
 853  * @param offset the offset to check.
 854  * @return True if "offset" is a boundary position.
 855  */
 856 UBool RuleBasedBreakIterator::isBoundary(int32_t offset) {
 857     // the beginning index of the iterator is always a boundary position by definition
 858     if (offset == 0) {
 859         first();       // For side effects on current position, tag values.
 860         return TRUE;
 861     }
 862
 863     if (offset == (int32_t)utext_nativeLength(fText)) {
 864         last();       // For side effects on current position, tag values.
 865         return TRUE;
 866     }
 867
 868     // out-of-range indexes are never boundary positions
 869     if (offset < 0) {
 870         first();       // For side effects on current position, tag values.
 871         return FALSE;
 872     }
 873
 874     if (offset > utext_nativeLength(fText)) {
 875         last();        // For side effects on current position, tag values.
 876         return FALSE;
 877     }
 878
 879     // otherwise, we can use following() on the position before the specified
 880     // one and return true if the position we get back is the one the user
 881     // specified
 882     utext_previous32From(fText, offset);
 883     int32_t backOne = (int32_t)UTEXT_GETNATIVEINDEX(fText);
 884     UBool    result  = following(backOne) == offset;
 885     return result;
 886 }
 887
 888 /**
 889  * Returns the current iteration position.
 890  * @return The current iteration position.
 891  */
 892 int32_t RuleBasedBreakIterator::current(void) const {
 893     int32_t  pos = (int32_t)UTEXT_GETNATIVEINDEX(fText);
 894     return pos;
 895 }
 896
 897 //=======================================================================
 898 // implementation
 899 //=======================================================================
 900
 901 //
 902 // RBBIRunMode  -  the state machine runs an extra iteration at the beginning and end
 903 //                 of user text.  A variable with this enum type keeps track of where we
 904 //                 are.  The state machine only fetches user input while in the RUN mode.
 905 //
 906 enum RBBIRunMode {
 907     RBBI_START,     // state machine processing is before first char of input
 908     RBBI_RUN,       // state machine processing is in the user text
 909     RBBI_END        // state machine processing is after end of user text.
 910 };
 911
 912
 913 //-----------------------------------------------------------------------------------
 914 //
 915 //  handleNext(stateTable)
 916 //     This method is the actual implementation of the rbbi next() method.
 917 //     This method initializes the state machine to state 1
 918 //     and advances through the text character by character until we reach the end
 919 //     of the text or the state machine transitions to state 0.  We update our return
 920 //     value every time the state machine passes through an accepting state.
 921 //
 922 //-----------------------------------------------------------------------------------
 923 int32_t RuleBasedBreakIterator::handleNext(const RBBIStateTable *statetable) {
 924     int32_t             state;
 925     int16_t             category        = 0;
 926     RBBIRunMode         mode;
 927
 928     RBBIStateTableRow  *row;
 929     UChar32             c;
 930     int32_t             lookaheadStatus = 0;
 931     int32_t             lookaheadTagIdx = 0;
 932     int32_t             result          = 0;
 933     int32_t             initialPosition = 0;
 934     int32_t             lookaheadResult = 0;
 935     UBool               lookAheadHardBreak = (statetable->fFlags & RBBI_LOOKAHEAD_HARD_BREAK) != 0;
 936     const char         *tableData       = statetable->fTableData;
 937     uint32_t            tableRowLen     = statetable->fRowLen;
 938
 939     #ifdef RBBI_DEBUG
 940         if (fTrace) {
 941             RBBIDebugPuts("Handle Next   pos   char  state category");
 942         }
 943     #endif
 944
 945     // No matter what, handleNext alway correctly sets the break tag value.
 946     fLastStatusIndexValid = TRUE;
 947     fLastRuleStatusIndex = 0;
 948
 949     // if we're already at the end of the text, return DONE.
 950     initialPosition = (int32_t)UTEXT_GETNATIVEINDEX(fText);
 951     result          = initialPosition;
 952     c               = UTEXT_NEXT32(fText);
 953     if (fData == NULL || c==U_SENTINEL) {
 954         return BreakIterator::DONE;
 955     }
 956
 957     //  Set the initial state for the state machine
 958     state = START_STATE;
 959     row = (RBBIStateTableRow *)
 960             //(statetable->fTableData + (statetable->fRowLen * state));
 961             (tableData + tableRowLen * state);
 962
 963
 964     mode     = RBBI_RUN;
 965     if (statetable->fFlags & RBBI_BOF_REQUIRED) {
 966         category = 2;
 967         mode     = RBBI_START;
 968     }
 969
 970
 971     // loop until we reach the end of the text or transition to state 0
 972     //
 973     for (;;) {
 974         if (c == U_SENTINEL) {
 975             // Reached end of input string.
 976             if (mode == RBBI_END) {
 977                 // We have already run the loop one last time with the
 978                 //   character set to the psueudo {eof} value.  Now it is time
 979                 //   to unconditionally bail out.
 980                 if (lookaheadResult > result) {
 981                     // We ran off the end of the string with a pending look-ahead match.
 982                     // Treat this as if the look-ahead condition had been met, and return
 983                     //  the match at the / position from the look-ahead rule.
 984                     result               = lookaheadResult;
 985                     fLastRuleStatusIndex = lookaheadTagIdx;
 986                     lookaheadStatus = 0;
 987                 }
 988                 break;
 989             }
 990             // Run the loop one last time with the fake end-of-input character category.
 991             mode = RBBI_END;
 992             category = 1;
 993         }
 994
 995         //
 996         // Get the char category.  An incoming category of 1 or 2 means that
 997         //      we are preset for doing the beginning or end of input, and
 998         //      that we shouldn't get a category from an actual text input character.
 999         //
1000         if (mode == RBBI_RUN) {
1001             // look up the current character's character category, which tells us
1002             // which column in the state table to look at.
1003             // Note:  the 16 in UTRIE_GET16 refers to the size of the data being returned,
1004             //        not the size of the character going in, which is a UChar32.
1005             //
1006             UTRIE_GET16(&fData->fTrie, c, category);
1007
1008             // Check the dictionary bit in the character's category.
1009             //    Counter is only used by dictionary based iterators (subclasses).
1010             //    Chars that need to be handled by a dictionary have a flag bit set
1011             //    in their category values.
1012             //
1013             if ((category & 0x4000) != 0)  {
1014                 fDictionaryCharCount++;
1015                 //  And off the dictionary flag bit.
1016                 category &= ~0x4000;
1017             }
1018         }
1019
1020         #ifdef RBBI_DEBUG
1021             if (fTrace) {
1022                 RBBIDebugPrintf("             %4d   ", utext_getNativeIndex(fText));
1023                 if (0x20<=c && c<0x7f) {
1024                     RBBIDebugPrintf("\"%c\"  ", c);
1025                 } else {
1026                     RBBIDebugPrintf("%5x  ", c);
1027                 }
1028                 RBBIDebugPrintf("%3d  %3d\n", state, category);
1029             }
1030         #endif
1031
1032         // State Transition - move machine to its next state
1033         //
1034         state = row->fNextState[category];
1035         row = (RBBIStateTableRow *)
1036             // (statetable->fTableData + (statetable->fRowLen * state));
1037             (tableData + tableRowLen * state);
1038
1039
1040         if (row->fAccepting == -1) {
1041             // Match found, common case.
1042             if (mode != RBBI_START) {
1043                 result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
1044             }
1045             fLastRuleStatusIndex = row->fTagIdx;   // Remember the break status (tag) values.
1046         }
1047
1048         if (row->fLookAhead != 0) {
1049             if (lookaheadStatus != 0
1050                 && row->fAccepting == lookaheadStatus) {
1051                 // Lookahead match is completed.
1052                 result               = lookaheadResult;
1053                 fLastRuleStatusIndex = lookaheadTagIdx;
1054                 lookaheadStatus      = 0;
1055                 // TODO:  make a standalone hard break in a rule work.
1056                 if (lookAheadHardBreak) {
1057                     UTEXT_SETNATIVEINDEX(fText, result);
1058                     return result;
1059                 }
1060                 // Look-ahead completed, but other rules may match further.  Continue on
1061                 //  TODO:  junk this feature?  I don't think it's used anywhwere.
1062                 goto continueOn;
1063             }
1064
1065             int32_t  r = (int32_t)UTEXT_GETNATIVEINDEX(fText);
1066             lookaheadResult = r;
1067             lookaheadStatus = row->fLookAhead;
1068             lookaheadTagIdx = row->fTagIdx;
1069             goto continueOn;
1070         }
1071
1072
1073         if (row->fAccepting != 0) {
1074             // Because this is an accepting state, any in-progress look-ahead match
1075             //   is no longer relavant.  Clear out the pending lookahead status.
1076             lookaheadStatus = 0;           // clear out any pending look-ahead match.
1077         }
1078
1079 continueOn:
1080         if (state == STOP_STATE) {
1081             // This is the normal exit from the lookup state machine.
1082             // We have advanced through the string until it is certain that no
1083             //   longer match is possible, no matter what characters follow.
1084             break;
1085         }
1086
1087         // Advance to the next character.
1088         // If this is a beginning-of-input loop iteration, don't advance
1089         //    the input position.  The next iteration will be processing the
1090         //    first real input character.
1091         if (mode == RBBI_RUN) {
1092             c = UTEXT_NEXT32(fText);
1093         } else {
1094             if (mode == RBBI_START) {
1095                 mode = RBBI_RUN;
1096             }
1097         }
1098
1099
1100     }
1101
1102     // The state machine is done.  Check whether it found a match...
1103
1104     // If the iterator failed to advance in the match engine, force it ahead by one.
1105     //   (This really indicates a defect in the break rules.  They should always match
1106     //    at least one character.)
1107     if (result == initialPosition) {
1108         UTEXT_SETNATIVEINDEX(fText, initialPosition);
1109         UTEXT_NEXT32(fText);
1110         result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
1111     }
1112
1113     // Leave the iterator at our result position.
1114     UTEXT_SETNATIVEINDEX(fText, result);
1115     #ifdef RBBI_DEBUG
1116         if (fTrace) {
1117             RBBIDebugPrintf("result = %d\n\n", result);
1118         }
1119     #endif
1120     return result;
1121 }
1122
1123
1124
1125 //-----------------------------------------------------------------------------------
1126 //
1127 //  handlePrevious()
1128 //
1129 //      Iterate backwards, according to the logic of the reverse rules.
1130 //      This version handles the exact style backwards rules.
1131 //
1132 //      The logic of this function is very similar to handleNext(), above.
1133 //
1134 //-----------------------------------------------------------------------------------
1135 int32_t RuleBasedBreakIterator::handlePrevious(const RBBIStateTable *statetable) {
1136     int32_t             state;
1137     int16_t             category        = 0;
1138     RBBIRunMode         mode;
1139     RBBIStateTableRow  *row;
1140     UChar32             c;
1141     int32_t             lookaheadStatus = 0;
1142     int32_t             result          = 0;
1143     int32_t             initialPosition = 0;
1144     int32_t             lookaheadResult = 0;
1145     UBool               lookAheadHardBreak = (statetable->fFlags & RBBI_LOOKAHEAD_HARD_BREAK) != 0;
1146
1147     #ifdef RBBI_DEBUG
1148         if (fTrace) {
1149             RBBIDebugPuts("Handle Previous   pos   char  state category");
1150         }
1151     #endif
1152
1153     // handlePrevious() never gets the rule status.
1154     // Flag the status as invalid; if the user ever asks for status, we will need
1155     // to back up, then re-find the break position using handleNext(), which does
1156     // get the status value.
1157     fLastStatusIndexValid = FALSE;
1158     fLastRuleStatusIndex = 0;
1159
1160     // if we're already at the start of the text, return DONE.
1161     if (fText == NULL || fData == NULL || UTEXT_GETNATIVEINDEX(fText)==0) {
1162         return BreakIterator::DONE;
1163     }
1164
1165     //  Set up the starting char.
1166     initialPosition = (int32_t)UTEXT_GETNATIVEINDEX(fText);
1167     result          = initialPosition;
1168     c               = UTEXT_PREVIOUS32(fText);
1169
1170     //  Set the initial state for the state machine
1171     state = START_STATE;
1172     row = (RBBIStateTableRow *)
1173             (statetable->fTableData + (statetable->fRowLen * state));
1174     category = 3;
1175     mode     = RBBI_RUN;
1176     if (statetable->fFlags & RBBI_BOF_REQUIRED) {
1177         category = 2;
1178         mode     = RBBI_START;
1179     }
1180
1181
1182     // loop until we reach the start of the text or transition to state 0
1183     //
1184     for (;;) {
1185         if (c == U_SENTINEL) {
1186             // Reached end of input string.
1187             if (mode == RBBI_END ||
1188                 *(int32_t *)fData->fHeader->fFormatVersion == 1 ) {
1189                 // We have already run the loop one last time with the
1190                 //   character set to the psueudo {eof} value.  Now it is time
1191                 //   to unconditionally bail out.
1192                 //  (Or we have an old format binary rule file that does not support {eof}.)
1193                 if (lookaheadResult < result) {
1194                     // We ran off the end of the string with a pending look-ahead match.
1195                     // Treat this as if the look-ahead condition had been met, and return
1196                     //  the match at the / position from the look-ahead rule.
1197                     result               = lookaheadResult;
1198                     lookaheadStatus = 0;
1199                 } else if (result == initialPosition) {
1200                     // Ran off start, no match found.
1201                     // move one index one (towards the start, since we are doing a previous())
1202                     UTEXT_SETNATIVEINDEX(fText, initialPosition);
1203                     UTEXT_PREVIOUS32(fText);   // TODO:  shouldn't be necessary.  We're already at beginning.  Check.
1204                 }
1205                 break;
1206             }
1207             // Run the loop one last time with the fake end-of-input character category.
1208             mode = RBBI_END;
1209             category = 1;
1210         }
1211
1212         //
1213         // Get the char category.  An incoming category of 1 or 2 means that
1214         //      we are preset for doing the beginning or end of input, and
1215         //      that we shouldn't get a category from an actual text input character.
1216         //
1217         if (mode == RBBI_RUN) {
1218             // look up the current character's character category, which tells us
1219             // which column in the state table to look at.
1220             // Note:  the 16 in UTRIE_GET16 refers to the size of the data being returned,
1221             //        not the size of the character going in, which is a UChar32.
1222             //
1223             UTRIE_GET16(&fData->fTrie, c, category);
1224
1225             // Check the dictionary bit in the character's category.
1226             //    Counter is only used by dictionary based iterators (subclasses).
1227             //    Chars that need to be handled by a dictionary have a flag bit set
1228             //    in their category values.
1229             //
1230             if ((category & 0x4000) != 0)  {
1231                 fDictionaryCharCount++;
1232                 //  And off the dictionary flag bit.
1233                 category &= ~0x4000;
1234             }
1235         }
1236
1237         #ifdef RBBI_DEBUG
1238             if (fTrace) {
1239                 RBBIDebugPrintf("             %4d   ", (int32_t)utext_getNativeIndex(fText));
1240                 if (0x20<=c && c<0x7f) {
1241                     RBBIDebugPrintf("\"%c\"  ", c);
1242                 } else {
1243                     RBBIDebugPrintf("%5x  ", c);
1244                 }
1245                 RBBIDebugPrintf("%3d  %3d\n", state, category);
1246             }
1247         #endif
1248
1249         // State Transition - move machine to its next state
1250         //
1251         state = row->fNextState[category];
1252         row = (RBBIStateTableRow *)
1253             (statetable->fTableData + (statetable->fRowLen * state));
1254
1255         if (row->fAccepting == -1) {
1256             // Match found, common case.
1257             result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
1258         }
1259
1260         if (row->fLookAhead != 0) {
1261             if (lookaheadStatus != 0
1262                 && row->fAccepting == lookaheadStatus) {
1263                 // Lookahead match is completed.
1264                 result               = lookaheadResult;
1265                 lookaheadStatus      = 0;
1266                 // TODO:  make a standalone hard break in a rule work.
1267                 if (lookAheadHardBreak) {
1268                     UTEXT_SETNATIVEINDEX(fText, result);
1269                     return result;
1270                 }
1271                 // Look-ahead completed, but other rules may match further.  Continue on
1272                 //  TODO:  junk this feature?  I don't think it's used anywhwere.
1273                 goto continueOn;
1274             }
1275
1276             int32_t  r = (int32_t)UTEXT_GETNATIVEINDEX(fText);
1277             lookaheadResult = r;
1278             lookaheadStatus = row->fLookAhead;
1279             goto continueOn;
1280         }
1281
1282
1283         if (row->fAccepting != 0) {
1284             // Because this is an accepting state, any in-progress look-ahead match
1285             //   is no longer relavant.  Clear out the pending lookahead status.
1286             lookaheadStatus = 0;
1287         }
1288
1289 continueOn:
1290         if (state == STOP_STATE) {
1291             // This is the normal exit from the lookup state machine.
1292             // We have advanced through the string until it is certain that no
1293             //   longer match is possible, no matter what characters follow.
1294             break;
1295         }
1296
1297         // Move (backwards) to the next character to process.
1298         // If this is a beginning-of-input loop iteration, don't advance
1299         //    the input position.  The next iteration will be processing the
1300         //    first real input character.
1301         if (mode == RBBI_RUN) {
1302             c = UTEXT_PREVIOUS32(fText);
1303         } else {
1304             if (mode == RBBI_START) {
1305                 mode = RBBI_RUN;
1306             }
1307         }
1308     }
1309
1310     // The state machine is done.  Check whether it found a match...
1311
1312     // If the iterator failed to advance in the match engine, force it ahead by one.
1313     //   (This really indicates a defect in the break rules.  They should always match
1314     //    at least one character.)
1315     if (result == initialPosition) {
1316         UTEXT_SETNATIVEINDEX(fText, initialPosition);
1317         UTEXT_PREVIOUS32(fText);
1318         result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
1319     }
1320
1321     // Leave the iterator at our result position.
1322     UTEXT_SETNATIVEINDEX(fText, result);
1323     #ifdef RBBI_DEBUG
1324         if (fTrace) {
1325             RBBIDebugPrintf("result = %d\n\n", result);
1326         }
1327     #endif
1328     return result;
1329 }
1330
1331
1332 void
1333 RuleBasedBreakIterator::reset()
1334 {
1335     if (fCachedBreakPositions) {
1336         uprv_free(fCachedBreakPositions);
1337     }
1338     fCachedBreakPositions = NULL;
1339     fNumCachedBreakPositions = 0;
1340     fDictionaryCharCount = 0;
1341     fPositionInCache = 0;
1342 }
1343
1344
1345
1346 //-------------------------------------------------------------------------------
1347 //
1348 //   getRuleStatus()   Return the break rule tag associated with the current
1349 //                     iterator position.  If the iterator arrived at its current
1350 //                     position by iterating forwards, the value will have been
1351 //                     cached by the handleNext() function.
1352 //
1353 //                     If no cached status value is available, the status is
1354 //                     found by doing a previous() followed by a next(), which
1355 //                     leaves the iterator where it started, and computes the
1356 //                     status while doing the next().
1357 //
1358 //-------------------------------------------------------------------------------
1359 void RuleBasedBreakIterator::makeRuleStatusValid() {
1360     if (fLastStatusIndexValid == FALSE) {
1361         //  No cached status is available.
1362         if (fText == NULL || current() == 0) {
1363             //  At start of text, or there is no text.  Status is always zero.
1364             fLastRuleStatusIndex = 0;
1365             fLastStatusIndexValid = TRUE;
1366         } else {
1367             //  Not at start of text.  Find status the tedious way.
1368             int32_t pa = current();
1369             previous();
1370             if (fNumCachedBreakPositions > 0) {
1371                 reset();                // Blow off the dictionary cache
1372             }
1373             int32_t pb = next();
1374             if (pa != pb) {
1375                 // note: the if (pa != pb) test is here only to eliminate warnings for
1376                 //       unused local variables on gcc.  Logically, it isn't needed.
1377                 U_ASSERT(pa == pb);
1378             }
1379         }
1380     }
1381     U_ASSERT(fLastRuleStatusIndex >= 0  &&  fLastRuleStatusIndex < fData->fStatusMaxIdx);
1382 }
1383
1384
1385 int32_t  RuleBasedBreakIterator::getRuleStatus() const {
1386     RuleBasedBreakIterator *nonConstThis  = (RuleBasedBreakIterator *)this;
1387     nonConstThis->makeRuleStatusValid();
1388
1389     // fLastRuleStatusIndex indexes to the start of the appropriate status record
1390     //                                                 (the number of status values.)
1391     //   This function returns the last (largest) of the array of status values.
1392     int32_t  idx = fLastRuleStatusIndex + fData->fRuleStatusTable[fLastRuleStatusIndex];
1393     int32_t  tagVal = fData->fRuleStatusTable[idx];
1394
1395     return tagVal;
1396 }
1397
1398
1399
1400
1401 int32_t RuleBasedBreakIterator::getRuleStatusVec(
1402              int32_t *fillInVec, int32_t capacity, UErrorCode &status)
1403 {
1404     if (U_FAILURE(status)) {
1405         return 0;
1406     }
1407
1408     RuleBasedBreakIterator *nonConstThis  = (RuleBasedBreakIterator *)this;
1409     nonConstThis->makeRuleStatusValid();
1410     int32_t  numVals = fData->fRuleStatusTable[fLastRuleStatusIndex];
1411     int32_t  numValsToCopy = numVals;
1412     if (numVals > capacity) {
1413         status = U_BUFFER_OVERFLOW_ERROR;
1414         numValsToCopy = capacity;
1415     }
1416     int i;
1417     for (i=0; i<numValsToCopy; i++) {
1418         fillInVec[i] = fData->fRuleStatusTable[fLastRuleStatusIndex + i + 1];
1419     }
1420     return numVals;
1421 }
1422
1423
1424
1425 //-------------------------------------------------------------------------------
1426 //
1427 //   getBinaryRules        Access to the compiled form of the rules,
1428 //                         for use by build system tools that save the data
1429 //                         for standard iterator types.
1430 //
1431 //-------------------------------------------------------------------------------
1432 const uint8_t  *RuleBasedBreakIterator::getBinaryRules(uint32_t &length) {
1433     const uint8_t  *retPtr = NULL;
1434     length = 0;
1435
1436     if (fData != NULL) {
1437         retPtr = (const uint8_t *)fData->fHeader;
1438         length = fData->fHeader->fLength;
1439     }
1440     return retPtr;
1441 }
1442
1443
1444
1445
1446 //-------------------------------------------------------------------------------
1447 //
1448 //  BufferClone       TODO:  In my (Andy) opinion, this function should be deprecated.
1449 //                    Saving one heap allocation isn't worth the trouble.
1450 //                    Cloning shouldn't be done in tight loops, and
1451 //                    making the clone copy involves other heap operations anyway.
1452 //                    And the application code for correctly dealing with buffer
1453 //                    size problems and the eventual object destruction is ugly.
1454 //
1455 //-------------------------------------------------------------------------------
1456 BreakIterator *  RuleBasedBreakIterator::createBufferClone(void *stackBuffer,
1457                                    int32_t &bufferSize,
1458                                    UErrorCode &status)
1459 {
1460     if (U_FAILURE(status)){
1461         return NULL;
1462     }
1463
1464     //
1465     //  If user buffer size is zero this is a preflight operation to
1466     //    obtain the needed buffer size, allowing for worst case misalignment.
1467     //
1468     if (bufferSize == 0) {
1469         bufferSize = sizeof(RuleBasedBreakIterator) + U_ALIGNMENT_OFFSET_UP(0);
1470         return NULL;
1471     }
1472
1473
1474     //
1475     //  Check the alignment and size of the user supplied buffer.
1476     //  Allocate heap memory if the user supplied memory is insufficient.
1477     //
1478     char    *buf   = (char *)stackBuffer;
1479     uint32_t s      = bufferSize;
1480
1481     if (stackBuffer == NULL) {
1482         s = 0;   // Ignore size, force allocation if user didn't give us a buffer.
1483     }
1484     if (U_ALIGNMENT_OFFSET(stackBuffer) != 0) {
1485         uint32_t offsetUp = (uint32_t)U_ALIGNMENT_OFFSET_UP(buf);
1486         s   -= offsetUp;
1487         buf += offsetUp;
1488     }
1489     if (s < sizeof(RuleBasedBreakIterator)) {
1490         // Not enough room in the caller-supplied buffer.
1491         // Do a plain-vanilla heap based clone and return that, along with
1492         //   a warning that the clone was allocated.
1493         RuleBasedBreakIterator *clonedBI = new RuleBasedBreakIterator(*this);
1494         if (clonedBI == 0) {
1495             status = U_MEMORY_ALLOCATION_ERROR;
1496         } else {
1497             status = U_SAFECLONE_ALLOCATED_WARNING;
1498         }
1499         return clonedBI;
1500     }
1501
1502     //
1503     //  Clone the source BI into the caller-supplied buffer.
1504     //    TODO:  using an overloaded operator new to directly initialize the
1505     //           copy in the user's buffer would be better, but it doesn't seem
1506     //           to get along with namespaces.  Investigate why.
1507     //
1508     //           The memcpy is only safe with an empty (default constructed)
1509     //           break iterator.  Use on others can screw up reference counts
1510     //           to data.  memcpy-ing objects is not really a good idea...
1511     //
1512     RuleBasedBreakIterator localIter;        // Empty break iterator, source for memcpy
1513     RuleBasedBreakIterator *clone = (RuleBasedBreakIterator *)buf;
1514     uprv_memcpy(clone, &localIter, sizeof(RuleBasedBreakIterator)); // init C++ gorp, BreakIterator base class part
1515     clone->init();                // Init RuleBasedBreakIterator part, (user default constructor)
1516     *clone = *this;               // clone = the real BI we want.
1517     clone->fBufferClone = TRUE;   // Flag to prevent deleting storage on close (From C code)
1518
1519     return clone;
1520 }
1521
1522
1523 //-------------------------------------------------------------------------------
1524 //
1525 //  isDictionaryChar      Return true if the category lookup for this char
1526 //                        indicates that it is in the set of dictionary lookup
1527 //                        chars.
1528 //
1529 //                        This function is intended for use by dictionary based
1530 //                        break iterators.
1531 //
1532 //-------------------------------------------------------------------------------
1533 /*UBool RuleBasedBreakIterator::isDictionaryChar(UChar32   c) {
1534     if (fData == NULL) {
1535         return FALSE;
1536     }
1537     uint16_t category;
1538     UTRIE_GET16(&fData->fTrie, c, category);
1539     return (category & 0x4000) != 0;
1540 }*/
1541
1542
1543 //-------------------------------------------------------------------------------
1544 //
1545 //  checkDictionary       This function handles all processing of characters in
1546 //                        the "dictionary" set. It will determine the appropriate
1547 //                        course of action, and possibly set up a cache in the
1548 //                        process.
1549 //
1550 //-------------------------------------------------------------------------------
1551 int32_t RuleBasedBreakIterator::checkDictionary(int32_t startPos,
1552                             int32_t endPos,
1553                             UBool reverse) {
1554     // Reset the old break cache first.
1555     uint32_t dictionaryCount = fDictionaryCharCount;
1556     reset();
1557
1558     if (dictionaryCount <= 1 || (endPos - startPos) <= 1) {
1559         return (reverse ? startPos : endPos);
1560     }
1561
1562     // Starting from the starting point, scan towards the proposed result,
1563     // looking for the first dictionary character (which may be the one
1564     // we're on, if we're starting in the middle of a range).
1565     utext_setNativeIndex(fText, reverse ? endPos : startPos);
1566     if (reverse) {
1567         UTEXT_PREVIOUS32(fText);
1568     }
1569
1570     int32_t rangeStart = startPos;
1571     int32_t rangeEnd = endPos;
1572
1573     uint16_t    category;
1574     int32_t     current;
1575     UErrorCode  status = U_ZERO_ERROR;
1576     UStack      breaks(status);
1577     int32_t     foundBreakCount = 0;
1578     UChar32     c = utext_current32(fText);
1579
1580     UTRIE_GET16(&fData->fTrie, c, category);
1581
1582     // Is the character we're starting on a dictionary character? If so, we
1583     // need to back up to include the entire run; otherwise the results of
1584     // the break algorithm will differ depending on where we start. Since
1585     // the result is cached and there is typically a non-dictionary break
1586     // within a small number of words, there should be little performance impact.
1587     if (category & 0x4000) {
1588         if (reverse) {
1589             do {
1590                 utext_next32(fText);          // TODO:  recast to work directly with postincrement.
1591                 c = utext_current32(fText);
1592                 UTRIE_GET16(&fData->fTrie, c, category);
1593             } while (c != U_SENTINEL && (category & 0x4000));
1594             // Back up to the last dictionary character
1595             rangeEnd = (int32_t)UTEXT_GETNATIVEINDEX(fText);
1596             if (c == U_SENTINEL) {
1597                 // c = fText->last32();
1598                 //   TODO:  why was this if needed?
1599                 c = UTEXT_PREVIOUS32(fText);
1600             }
1601             else {
1602                 c = UTEXT_PREVIOUS32(fText);
1603             }
1604         }
1605         else {
1606             do {
1607                 c = UTEXT_PREVIOUS32(fText);
1608                 UTRIE_GET16(&fData->fTrie, c, category);
1609             }
1610             while (c != U_SENTINEL && (category & 0x4000));
1611             // Back up to the last dictionary character
1612             if (c == U_SENTINEL) {
1613                 // c = fText->first32();
1614                 c = utext_current32(fText);
1615             }
1616             else {
1617                 utext_next32(fText);
1618                 c = utext_current32(fText);
1619             }
1620             rangeStart = (int32_t)UTEXT_GETNATIVEINDEX(fText);;
1621         }
1622         UTRIE_GET16(&fData->fTrie, c, category);
1623     }
1624
1625     // Loop through the text, looking for ranges of dictionary characters.
1626     // For each span, find the appropriate break engine, and ask it to find
1627     // any breaks within the span.
1628     // Note: we always do this in the forward direction, so that the break
1629     // cache is built in the right order.
1630     if (reverse) {
1631         utext_setNativeIndex(fText, rangeStart);
1632         c = utext_current32(fText);
1633         UTRIE_GET16(&fData->fTrie, c, category);
1634     }
1635     while(U_SUCCESS(status)) {
1636         while((current = (int32_t)UTEXT_GETNATIVEINDEX(fText)) < rangeEnd && (category & 0x4000) == 0) {
1637             utext_next32(fText);           // TODO:  tweak for post-increment operation
1638             c = utext_current32(fText);
1639             UTRIE_GET16(&fData->fTrie, c, category);
1640         }
1641         if (current >= rangeEnd) {
1642             break;
1643         }
1644
1645         // We now have a dictionary character. Get the appropriate language object
1646         // to deal with it.
1647         const LanguageBreakEngine *lbe = getLanguageBreakEngine(c);
1648
1649         // Ask the language object if there are any breaks. It will leave the text
1650         // pointer on the other side of its range, ready to search for the next one.
1651         if (lbe != NULL) {
1652             foundBreakCount += lbe->findBreaks(fText, rangeStart, rangeEnd, FALSE, fBreakType, breaks);
1653         }
1654
1655         // Reload the loop variables for the next go-round
1656         c = utext_current32(fText);
1657         UTRIE_GET16(&fData->fTrie, c, category);
1658     }
1659
1660     // If we found breaks, build a new break cache. The first and last entries must
1661     // be the original starting and ending position.
1662     if (foundBreakCount > 0) {
1663         int32_t totalBreaks = foundBreakCount;
1664         if (startPos < breaks.elementAti(0)) {
1665             totalBreaks += 1;
1666         }
1667         if (endPos > breaks.peeki()) {
1668             totalBreaks += 1;
1669         }
1670         fCachedBreakPositions = (int32_t *)uprv_malloc(totalBreaks * sizeof(int32_t));
1671         if (fCachedBreakPositions != NULL) {
1672             int32_t out = 0;
1673             fNumCachedBreakPositions = totalBreaks;
1674             if (startPos < breaks.elementAti(0)) {
1675                 fCachedBreakPositions[out++] = startPos;
1676             }
1677             for (int32_t i = 0; i < foundBreakCount; ++i) {
1678                 fCachedBreakPositions[out++] = breaks.elementAti(i);
1679             }
1680             if (endPos > fCachedBreakPositions[out-1]) {
1681                 fCachedBreakPositions[out] = endPos;
1682             }
1683             // If there are breaks, then by definition, we are replacing the original
1684             // proposed break by one of the breaks we found. Use following() and
1685             // preceding() to do the work. They should never recurse in this case.
1686             if (reverse) {
1687                 return preceding(endPos - 1);
1688             }
1689             else {
1690                 return following(startPos);
1691             }
1692         }
1693         // If the allocation failed, just fall through to the "no breaks found" case.
1694     }
1695
1696     // If we get here, there were no language-based breaks. Set the text pointer
1697     // to the original proposed break.
1698     utext_setNativeIndex(fText, reverse ? startPos : endPos);
1699     return (reverse ? startPos : endPos);
1700 }
1701
1702 U_NAMESPACE_END
1703
1704 // defined in ucln_cmn.h
1705
1706 static U_NAMESPACE_QUALIFIER UStack *gLanguageBreakFactories = NULL;
1707
1708 /**
1709  * Release all static memory held by breakiterator.
1710  */
1711 U_CDECL_BEGIN
1712 static UBool U_CALLCONV breakiterator_cleanup_dict(void) {
1713     if (gLanguageBreakFactories) {
1714         delete gLanguageBreakFactories;
1715         gLanguageBreakFactories = NULL;
1716     }
1717     return TRUE;
1718 }
1719 U_CDECL_END
1720
1721 U_CDECL_BEGIN
1722 static void U_CALLCONV _deleteFactory(void *obj) {
1723     delete (U_NAMESPACE_QUALIFIER LanguageBreakFactory *) obj;
1724 }
1725 U_CDECL_END
1726 U_NAMESPACE_BEGIN
1727
1728 static const LanguageBreakEngine*
1729 getLanguageBreakEngineFromFactory(UChar32 c, int32_t breakType)
1730 {
1731     UBool       needsInit;
1732     UErrorCode  status = U_ZERO_ERROR;
1733     UMTX_CHECK(NULL, (UBool)(gLanguageBreakFactories == NULL), needsInit);
1734
1735     if (needsInit) {
1736         UStack  *factories = new UStack(_deleteFactory, NULL, status);
1737         if (factories != NULL && U_SUCCESS(status)) {
1738             ICULanguageBreakFactory *builtIn = new ICULanguageBreakFactory(status);
1739             factories->push(builtIn, status);
1740 #ifdef U_LOCAL_SERVICE_HOOK
1741             LanguageBreakFactory *extra = (LanguageBreakFactory *)uprv_svc_hook("languageBreakFactory", &status);
1742             if (extra != NULL) {
1743                 factories->push(extra, status);
1744             }
1745 #endif
1746         }
1747         umtx_lock(NULL);
1748         if (gLanguageBreakFactories == NULL) {
1749             gLanguageBreakFactories = factories;
1750             factories = NULL;
1751             ucln_common_registerCleanup(UCLN_COMMON_BREAKITERATOR_DICT, breakiterator_cleanup_dict);
1752         }
1753         umtx_unlock(NULL);
1754         delete factories;
1755     }
1756
1757     if (gLanguageBreakFactories == NULL) {
1758         return NULL;
1759     }
1760
1761     int32_t i = gLanguageBreakFactories->size();
1762     const LanguageBreakEngine *lbe = NULL;
1763     while (--i >= 0) {
1764         LanguageBreakFactory *factory = (LanguageBreakFactory *)(gLanguageBreakFactories->elementAt(i));
1765         lbe = factory->getEngineFor(c, breakType);
1766         if (lbe != NULL) {
1767             break;
1768         }
1769     }
1770     return lbe;
1771 }
1772
1773
1774 //-------------------------------------------------------------------------------
1775 //
1776 //  getLanguageBreakEngine  Find an appropriate LanguageBreakEngine for the
1777 //                          the characer c.
1778 //
1779 //-------------------------------------------------------------------------------
1780 const LanguageBreakEngine *
1781 RuleBasedBreakIterator::getLanguageBreakEngine(UChar32 c) {
1782     const LanguageBreakEngine *lbe = NULL;
1783     UErrorCode status = U_ZERO_ERROR;
1784
1785     if (fLanguageBreakEngines == NULL) {
1786         fLanguageBreakEngines = new UStack(status);
1787         if (fLanguageBreakEngines == NULL || U_FAILURE(status)) {
1788             delete fLanguageBreakEngines;
1789             fLanguageBreakEngines = 0;
1790             return NULL;
1791         }
1792     }
1793
1794     int32_t i = fLanguageBreakEngines->size();
1795     while (--i >= 0) {
1796         lbe = (const LanguageBreakEngine *)(fLanguageBreakEngines->elementAt(i));
1797         if (lbe->handles(c, fBreakType)) {
1798             return lbe;
1799         }
1800     }
1801
1802     // No existing dictionary took the character. See if a factory wants to
1803     // give us a new LanguageBreakEngine for this character.
1804     lbe = getLanguageBreakEngineFromFactory(c, fBreakType);
1805
1806     // If we got one, use it and push it on our stack.
1807     if (lbe != NULL) {
1808         fLanguageBreakEngines->push((void *)lbe, status);
1809         // Even if we can't remember it, we can keep looking it up, so
1810         // return it even if the push fails.
1811         return lbe;
1812     }
1813
1814     // No engine is forthcoming for this character. Add it to the
1815     // reject set. Create the reject break engine if needed.
1816     if (fUnhandledBreakEngine == NULL) {
1817         fUnhandledBreakEngine = new UnhandledEngine(status);
1818         if (U_SUCCESS(status) && fUnhandledBreakEngine == NULL) {
1819             status = U_MEMORY_ALLOCATION_ERROR;
1820         }
1821         // Put it last so that scripts for which we have an engine get tried
1822         // first.
1823         fLanguageBreakEngines->insertElementAt(fUnhandledBreakEngine, 0, status);
1824         // If we can't insert it, or creation failed, get rid of it
1825         if (U_FAILURE(status)) {
1826             delete fUnhandledBreakEngine;
1827             fUnhandledBreakEngine = 0;
1828             return NULL;
1829         }
1830     }
1831
1832     // Tell the reject engine about the character; at its discretion, it may
1833     // add more than just the one character.
1834     fUnhandledBreakEngine->handleCharacter(c, fBreakType);
1835
1836     return fUnhandledBreakEngine;
1837 }
1838
1839
1840
1841 /*int32_t RuleBasedBreakIterator::getBreakType() const {
1842     return fBreakType;
1843 }*/
1844
1845 void RuleBasedBreakIterator::setBreakType(int32_t type) {
1846     fBreakType = type;
1847     reset();
1848 }
1849
1850 U_NAMESPACE_END
1851
1852 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */