icuSources/common/rbbi.cpp

   1 /*
   2 ***************************************************************************
   3 *   Copyright (C) 1999-2010 International Business Machines Corporation
   4 *   and others. All rights reserved.
   5 ***************************************************************************
   6 */
   7 //
   8 //  file:  rbbi.c    Contains the implementation of the rule based break iterator
   9 //                   runtime engine and the API implementation for
  10 //                   class RuleBasedBreakIterator
  11 //
  12
  13 #include <typeinfo>  // for 'typeid' to work
  14
  15 #include "unicode/utypes.h"
  16
  17 #if !UCONFIG_NO_BREAK_ITERATION
  18
  19 #include "unicode/rbbi.h"
  20 #include "unicode/schriter.h"
  21 #include "unicode/uchriter.h"
  22 #include "unicode/udata.h"
  23 #include "unicode/uclean.h"
  24 #include "rbbidata.h"
  25 #include "rbbirb.h"
  26 #include "cmemory.h"
  27 #include "cstring.h"
  28 #include "umutex.h"
  29 #include "ucln_cmn.h"
  30 #include "brkeng.h"
  31
  32 #include "uassert.h"
  33 #include "uvector.h"
  34
  35 // if U_LOCAL_SERVICE_HOOK is defined, then localsvc.cpp is expected to be included.
  36 #if U_LOCAL_SERVICE_HOOK
  37 #include "localsvc.h"
  38 #endif
  39
#ifdef RBBI_DEBUG
// Debug-build trace switch, file-local.  RuleBasedBreakIterator::init() sets
// it to TRUE when the U_RBBIDEBUG environment variable contains "trace".
static UBool fTrace = FALSE;
#endif
  43
  44 U_NAMESPACE_BEGIN
  45
// The state number of the starting state.
#define START_STATE 1

// The state-transition value indicating "stop".
#define STOP_STATE  0


// RTTI boilerplate for RuleBasedBreakIterator.  The macro is defined outside
// this file; presumably it supplies the class-ID functions - TODO confirm.
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedBreakIterator)
  54
  55
  56 //=======================================================================
  57 // constructors
  58 //=======================================================================
  59
  60 /**
  61  * Constructs a RuleBasedBreakIterator that uses the already-created
  62  * tables object that is passed in as a parameter.
  63  */
  64 RuleBasedBreakIterator::RuleBasedBreakIterator(RBBIDataHeader* data, UErrorCode &status)
  65 {
  66     init();
  67     fData = new RBBIDataWrapper(data, status); // status checked in constructor
  68     if (U_FAILURE(status)) {return;}
  69     if(fData == 0) {
  70         status = U_MEMORY_ALLOCATION_ERROR;
  71         return;
  72     }
  73 }
  74
  75 /**
  76  * Same as above but does not adopt memory
  77  */
  78 RuleBasedBreakIterator::RuleBasedBreakIterator(const RBBIDataHeader* data, enum EDontAdopt, UErrorCode &status)
  79 {
  80     init();
  81     fData = new RBBIDataWrapper(data, RBBIDataWrapper::kDontAdopt, status); // status checked in constructor
  82     if (U_FAILURE(status)) {return;}
  83     if(fData == 0) {
  84         status = U_MEMORY_ALLOCATION_ERROR;
  85         return;
  86     }
  87 }
  88
  89 //-------------------------------------------------------------------------------
  90 //
  91 //   Constructor   from a UDataMemory handle to precompiled break rules
  92 //                 stored in an ICU data file.
  93 //
  94 //-------------------------------------------------------------------------------
  95 RuleBasedBreakIterator::RuleBasedBreakIterator(UDataMemory* udm, UErrorCode &status)
  96 {
  97     init();
  98     fData = new RBBIDataWrapper(udm, status); // status checked in constructor
  99     if (U_FAILURE(status)) {return;}
 100     if(fData == 0) {
 101         status = U_MEMORY_ALLOCATION_ERROR;
 102         return;
 103     }
 104 }
 105
 106
 107
 108 //-------------------------------------------------------------------------------
 109 //
 110 //   Constructor       from a set of rules supplied as a string.
 111 //
 112 //-------------------------------------------------------------------------------
 113 RuleBasedBreakIterator::RuleBasedBreakIterator( const UnicodeString  &rules,
 114                                                 UParseError          &parseError,
 115                                                 UErrorCode           &status)
 116 {
 117     init();
 118     if (U_FAILURE(status)) {return;}
 119     RuleBasedBreakIterator *bi = (RuleBasedBreakIterator *)
 120         RBBIRuleBuilder::createRuleBasedBreakIterator(rules, &parseError, status);
 121     // Note:  This is a bit awkward.  The RBBI ruleBuilder has a factory method that
 122     //        creates and returns a complete RBBI.  From here, in a constructor, we
 123     //        can't just return the object created by the builder factory, hence
 124     //        the assignment of the factory created object to "this".
 125     if (U_SUCCESS(status)) {
 126         *this = *bi;
 127         delete bi;
 128     }
 129 }
 130
 131
 132 //-------------------------------------------------------------------------------
 133 //
 134 // Default Constructor.      Create an empty shell that can be set up later.
 135 //                           Used when creating a RuleBasedBreakIterator from a set
 136 //                           of rules.
 137 //-------------------------------------------------------------------------------
 138 RuleBasedBreakIterator::RuleBasedBreakIterator() {
 139     init();
 140 }
 141
 142
 143 //-------------------------------------------------------------------------------
 144 //
 145 //   Copy constructor.  Will produce a break iterator with the same behavior,
 146 //                      and which iterates over the same text, as the one passed in.
 147 //
 148 //-------------------------------------------------------------------------------
 149 RuleBasedBreakIterator::RuleBasedBreakIterator(const RuleBasedBreakIterator& other)
 150 : BreakIterator(other)
 151 {
 152     this->init();
 153     *this = other;
 154 }
 155
 156
 157 /**
 158  * Destructor
 159  */
 160 RuleBasedBreakIterator::~RuleBasedBreakIterator() {
 161     if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
 162         // fCharIter was adopted from the outside.
 163         delete fCharIter;
 164     }
 165     fCharIter = NULL;
 166     delete fSCharIter;
 167     fCharIter = NULL;
 168     delete fDCharIter;
 169     fDCharIter = NULL;
 170
 171     utext_close(fText);
 172
 173     if (fData != NULL) {
 174         fData->removeReference();
 175         fData = NULL;
 176     }
 177     if (fCachedBreakPositions) {
 178         uprv_free(fCachedBreakPositions);
 179         fCachedBreakPositions = NULL;
 180     }
 181     if (fLanguageBreakEngines) {
 182         delete fLanguageBreakEngines;
 183         fLanguageBreakEngines = NULL;
 184     }
 185     if (fUnhandledBreakEngine) {
 186         delete fUnhandledBreakEngine;
 187         fUnhandledBreakEngine = NULL;
 188     }
 189 }
 190
 191 /**
 192  * Assignment operator.  Sets this iterator to have the same behavior,
 193  * and iterate over the same text, as the one passed in.
 194  */
 195 RuleBasedBreakIterator&
 196 RuleBasedBreakIterator::operator=(const RuleBasedBreakIterator& that) {
 197     if (this == &that) {
 198         return *this;
 199     }
 200     reset();    // Delete break cache information
 201     fBreakType = that.fBreakType;
 202     if (fLanguageBreakEngines != NULL) {
 203         delete fLanguageBreakEngines;
 204         fLanguageBreakEngines = NULL;   // Just rebuild for now
 205     }
 206     // TODO: clone fLanguageBreakEngines from "that"
 207     UErrorCode status = U_ZERO_ERROR;
 208     fText = utext_clone(fText, that.fText, FALSE, TRUE, &status);
 209
 210     if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
 211         delete fCharIter;
 212     }
 213     fCharIter = NULL;
 214
 215     if (that.fCharIter != NULL ) {
 216         // This is a little bit tricky - it will intially appear that
 217         //  this->fCharIter is adopted, even if that->fCharIter was
 218         //  not adopted.  That's ok.
 219         fCharIter = that.fCharIter->clone();
 220     }
 221
 222     if (fData != NULL) {
 223         fData->removeReference();
 224         fData = NULL;
 225     }
 226     if (that.fData != NULL) {
 227         fData = that.fData->addReference();
 228     }
 229
 230     return *this;
 231 }
 232
 233
 234
 235 //-----------------------------------------------------------------------------
 236 //
 237 //    init()      Shared initialization routine.   Used by all the constructors.
 238 //                Initializes all fields, leaving the object in a consistent state.
 239 //
 240 //-----------------------------------------------------------------------------
 241 void RuleBasedBreakIterator::init() {
 242     UErrorCode  status    = U_ZERO_ERROR;
 243     fBufferClone          = FALSE;
 244     fText                 = utext_openUChars(NULL, NULL, 0, &status);
 245     fCharIter             = NULL;
 246     fSCharIter            = NULL;
 247     fDCharIter            = NULL;
 248     fData                 = NULL;
 249     fLastRuleStatusIndex  = 0;
 250     fLastStatusIndexValid = TRUE;
 251     fDictionaryCharCount  = 0;
 252     fBreakType            = UBRK_WORD;  // Defaulting BreakType to word gives reasonable
 253                                         //   dictionary behavior for Break Iterators that are
 254                                         //   built from rules.  Even better would be the ability to
 255                                         //   declare the type in the rules.
 256
 257     fCachedBreakPositions    = NULL;
 258     fLanguageBreakEngines    = NULL;
 259     fUnhandledBreakEngine    = NULL;
 260     fNumCachedBreakPositions = 0;
 261     fPositionInCache         = 0;
 262
 263 #ifdef RBBI_DEBUG
 264     static UBool debugInitDone = FALSE;
 265     if (debugInitDone == FALSE) {
 266         char *debugEnv = getenv("U_RBBIDEBUG");
 267         if (debugEnv && uprv_strstr(debugEnv, "trace")) {
 268             fTrace = TRUE;
 269         }
 270         debugInitDone = TRUE;
 271     }
 272 #endif
 273 }
 274
 275
 276
 277 //-----------------------------------------------------------------------------
 278 //
 279 //    clone - Returns a newly-constructed RuleBasedBreakIterator with the same
 280 //            behavior, and iterating over the same text, as this one.
 281 //            Virtual function: does the right thing with subclasses.
 282 //
 283 //-----------------------------------------------------------------------------
 284 BreakIterator*
 285 RuleBasedBreakIterator::clone(void) const {
 286     return new RuleBasedBreakIterator(*this);
 287 }
 288
 289 /**
 290  * Equality operator.  Returns TRUE if both BreakIterators are of the
 291  * same class, have the same behavior, and iterate over the same text.
 292  */
 293 UBool
 294 RuleBasedBreakIterator::operator==(const BreakIterator& that) const {
 295     if (typeid(*this) != typeid(that)) {
 296         return FALSE;
 297     }
 298
 299     const RuleBasedBreakIterator& that2 = (const RuleBasedBreakIterator&) that;
 300
 301     if (!utext_equals(fText, that2.fText)) {
 302         // The two break iterators are operating on different text,
 303         //   or have a different interation position.
 304         return FALSE;
 305     };
 306
 307     // TODO:  need a check for when in a dictionary region at different offsets.
 308
 309     if (that2.fData == fData ||
 310         (fData != NULL && that2.fData != NULL && *that2.fData == *fData)) {
 311             // The two break iterators are using the same rules.
 312             return TRUE;
 313         }
 314     return FALSE;
 315 }
 316
 317 /**
 318  * Compute a hash code for this BreakIterator
 319  * @return A hash code
 320  */
 321 int32_t
 322 RuleBasedBreakIterator::hashCode(void) const {
 323     int32_t   hash = 0;
 324     if (fData != NULL) {
 325         hash = fData->hashCode();
 326     }
 327     return hash;
 328 }
 329
 330
 331 void RuleBasedBreakIterator::setText(UText *ut, UErrorCode &status) {
 332     if (U_FAILURE(status)) {
 333         return;
 334     }
 335     reset();
 336     fText = utext_clone(fText, ut, FALSE, TRUE, &status);
 337
 338     // Set up a dummy CharacterIterator to be returned if anyone
 339     //   calls getText().  With input from UText, there is no reasonable
 340     //   way to return a characterIterator over the actual input text.
 341     //   Return one over an empty string instead - this is the closest
 342     //   we can come to signaling a failure.
 343     //   (GetText() is obsolete, this failure is sort of OK)
 344     if (fDCharIter == NULL) {
 345         static const UChar c = 0;
 346         fDCharIter = new UCharCharacterIterator(&c, 0);
 347         if (fDCharIter == NULL) {
 348             status = U_MEMORY_ALLOCATION_ERROR;
 349             return;
 350         }
 351     }
 352
 353     if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
 354         // existing fCharIter was adopted from the outside.  Delete it now.
 355         delete fCharIter;
 356     }
 357     fCharIter = fDCharIter;
 358
 359     this->first();
 360 }
 361
 362
 363 UText *RuleBasedBreakIterator::getUText(UText *fillIn, UErrorCode &status) const {
 364     UText *result = utext_clone(fillIn, fText, FALSE, TRUE, &status);
 365     return result;
 366 }
 367
 368
 369
 370 /**
 371  * Returns the description used to create this iterator
 372  */
 373 const UnicodeString&
 374 RuleBasedBreakIterator::getRules() const {
 375     if (fData != NULL) {
 376         return fData->getRuleSourceString();
 377     } else {
 378         static const UnicodeString *s;
 379         if (s == NULL) {
 380             // TODO:  something more elegant here.
 381             //        perhaps API should return the string by value.
 382             //        Note:  thread unsafe init & leak are semi-ok, better than
 383             //               what was before.  Sould be cleaned up, though.
 384             s = new UnicodeString;
 385         }
 386         return *s;
 387     }
 388 }
 389
 390 //=======================================================================
 391 // BreakIterator overrides
 392 //=======================================================================
 393
 394 /**
 395  * Return a CharacterIterator over the text being analyzed.
 396  */
 397 CharacterIterator&
 398 RuleBasedBreakIterator::getText() const {
 399     return *fCharIter;
 400 }
 401
 402 /**
 403  * Set the iterator to analyze a new piece of text.  This function resets
 404  * the current iteration position to the beginning of the text.
 405  * @param newText An iterator over the text to analyze.
 406  */
 407 void
 408 RuleBasedBreakIterator::adoptText(CharacterIterator* newText) {
 409     // If we are holding a CharacterIterator adopted from a
 410     //   previous call to this function, delete it now.
 411     if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
 412         delete fCharIter;
 413     }
 414
 415     fCharIter = newText;
 416     UErrorCode status = U_ZERO_ERROR;
 417     reset();
 418     if (newText==NULL || newText->startIndex() != 0) {
 419         // startIndex !=0 wants to be an error, but there's no way to report it.
 420         // Make the iterator text be an empty string.
 421         fText = utext_openUChars(fText, NULL, 0, &status);
 422     } else {
 423         fText = utext_openCharacterIterator(fText, newText, &status);
 424     }
 425     this->first();
 426 }
 427
 428 /**
 429  * Set the iterator to analyze a new piece of text.  This function resets
 430  * the current iteration position to the beginning of the text.
 431  * @param newText An iterator over the text to analyze.
 432  */
 433 void
 434 RuleBasedBreakIterator::setText(const UnicodeString& newText) {
 435     UErrorCode status = U_ZERO_ERROR;
 436     reset();
 437     fText = utext_openConstUnicodeString(fText, &newText, &status);
 438
 439     // Set up a character iterator on the string.
 440     //   Needed in case someone calls getText().
 441     //  Can not, unfortunately, do this lazily on the (probably never)
 442     //  call to getText(), because getText is const.
 443     if (fSCharIter == NULL) {
 444         fSCharIter = new StringCharacterIterator(newText);
 445     } else {
 446         fSCharIter->setText(newText);
 447     }
 448
 449     if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
 450         // old fCharIter was adopted from the outside.  Delete it.
 451         delete fCharIter;
 452     }
 453     fCharIter = fSCharIter;
 454
 455     this->first();
 456 }
 457
 458
 459
 460 /**
 461  * Sets the current iteration position to the beginning of the text.
 462  * @return The offset of the beginning of the text.
 463  */
 464 int32_t RuleBasedBreakIterator::first(void) {
 465     reset();
 466     fLastRuleStatusIndex  = 0;
 467     fLastStatusIndexValid = TRUE;
 468     //if (fText == NULL)
 469     //    return BreakIterator::DONE;
 470
 471     utext_setNativeIndex(fText, 0);
 472     return 0;
 473 }
 474
 475 /**
 476  * Sets the current iteration position to the end of the text.
 477  * @return The text's past-the-end offset.
 478  */
 479 int32_t RuleBasedBreakIterator::last(void) {
 480     reset();
 481     if (fText == NULL) {
 482         fLastRuleStatusIndex  = 0;
 483         fLastStatusIndexValid = TRUE;
 484         return BreakIterator::DONE;
 485     }
 486
 487     fLastStatusIndexValid = FALSE;
 488     int32_t pos = (int32_t)utext_nativeLength(fText);
 489     utext_setNativeIndex(fText, pos);
 490     return pos;
 491 }
 492
 493 /**
 494  * Advances the iterator either forward or backward the specified number of steps.
 495  * Negative values move backward, and positive values move forward.  This is
 496  * equivalent to repeatedly calling next() or previous().
 497  * @param n The number of steps to move.  The sign indicates the direction
 498  * (negative is backwards, and positive is forwards).
 499  * @return The character offset of the boundary position n boundaries away from
 500  * the current one.
 501  */
 502 int32_t RuleBasedBreakIterator::next(int32_t n) {
 503     int32_t result = current();
 504     while (n > 0) {
 505         result = next();
 506         --n;
 507     }
 508     while (n < 0) {
 509         result = previous();
 510         ++n;
 511     }
 512     return result;
 513 }
 514
 515 /**
 516  * Advances the iterator to the next boundary position.
 517  * @return The position of the first boundary after this one.
 518  */
 519 int32_t RuleBasedBreakIterator::next(void) {
 520     // if we have cached break positions and we're still in the range
 521     // covered by them, just move one step forward in the cache
 522     if (fCachedBreakPositions != NULL) {
 523         if (fPositionInCache < fNumCachedBreakPositions - 1) {
 524             ++fPositionInCache;
 525             int32_t pos = fCachedBreakPositions[fPositionInCache];
 526             utext_setNativeIndex(fText, pos);
 527             return pos;
 528         }
 529         else {
 530             reset();
 531         }
 532     }
 533
 534     int32_t startPos = current();
 535     int32_t result = handleNext(fData->fForwardTable);
 536     if (fDictionaryCharCount > 0) {
 537         result = checkDictionary(startPos, result, FALSE);
 538     }
 539     return result;
 540 }
 541
 542 /**
 543  * Advances the iterator backwards, to the last boundary preceding this one.
 544  * @return The position of the last boundary position preceding this one.
 545  */
 546 int32_t RuleBasedBreakIterator::previous(void) {
 547     int32_t result;
 548     int32_t startPos;
 549
 550     // if we have cached break positions and we're still in the range
 551     // covered by them, just move one step backward in the cache
 552     if (fCachedBreakPositions != NULL) {
 553         if (fPositionInCache > 0) {
 554             --fPositionInCache;
 555             // If we're at the beginning of the cache, need to reevaluate the
 556             // rule status
 557             if (fPositionInCache <= 0) {
 558                 fLastStatusIndexValid = FALSE;
 559             }
 560             int32_t pos = fCachedBreakPositions[fPositionInCache];
 561             utext_setNativeIndex(fText, pos);
 562             return pos;
 563         }
 564         else {
 565             reset();
 566         }
 567     }
 568
 569     // if we're already sitting at the beginning of the text, return DONE
 570     if (fText == NULL || (startPos = current()) == 0) {
 571         fLastRuleStatusIndex  = 0;
 572         fLastStatusIndexValid = TRUE;
 573         return BreakIterator::DONE;
 574     }
 575
 576     if (fData->fSafeRevTable != NULL || fData->fSafeFwdTable != NULL) {
 577         result = handlePrevious(fData->fReverseTable);
 578         if (fDictionaryCharCount > 0) {
 579             result = checkDictionary(result, startPos, TRUE);
 580         }
 581         return result;
 582     }
 583
 584     // old rule syntax
 585     // set things up.  handlePrevious() will back us up to some valid
 586     // break position before the current position (we back our internal
 587     // iterator up one step to prevent handlePrevious() from returning
 588     // the current position), but not necessarily the last one before
 589
 590     // where we started
 591
 592     int32_t start = current();
 593
 594     UTEXT_PREVIOUS32(fText);
 595     int32_t lastResult    = handlePrevious(fData->fReverseTable);
 596     if (lastResult == UBRK_DONE) {
 597         lastResult = 0;
 598         utext_setNativeIndex(fText, 0);
 599     }
 600     result = lastResult;
 601     int32_t lastTag       = 0;
 602     UBool   breakTagValid = FALSE;
 603
 604     // iterate forward from the known break position until we pass our
 605     // starting point.  The last break position before the starting
 606     // point is our return value
 607
 608     for (;;) {
 609         result         = next();
 610         if (result == BreakIterator::DONE || result >= start) {
 611             break;
 612         }
 613         lastResult     = result;
 614         lastTag        = fLastRuleStatusIndex;
 615         breakTagValid  = TRUE;
 616     }
 617
 618     // fLastBreakTag wants to have the value for section of text preceding
 619     // the result position that we are to return (in lastResult.)  If
 620     // the backwards rules overshot and the above loop had to do two or more
 621     // next()s to move up to the desired return position, we will have a valid
 622     // tag value. But, if handlePrevious() took us to exactly the correct result positon,
 623     // we wont have a tag value for that position, which is only set by handleNext().
 624
 625     // set the current iteration position to be the last break position
 626     // before where we started, and then return that value
 627     utext_setNativeIndex(fText, lastResult);
 628     fLastRuleStatusIndex  = lastTag;       // for use by getRuleStatus()
 629     fLastStatusIndexValid = breakTagValid;
 630
 631     // No need to check the dictionary; it will have been handled by
 632     // next()
 633
 634     return lastResult;
 635 }
 636
 637 /**
 638  * Sets the iterator to refer to the first boundary position following
 639  * the specified position.
 640  * @offset The position from which to begin searching for a break position.
 641  * @return The position of the first break after the current position.
 642  */
 643 int32_t RuleBasedBreakIterator::following(int32_t offset) {
 644     // if we have cached break positions and offset is in the range
 645     // covered by them, use them
 646     // TODO: could use binary search
 647     // TODO: what if offset is outside range, but break is not?
 648     if (fCachedBreakPositions != NULL) {
 649         if (offset >= fCachedBreakPositions[0]
 650                 && offset < fCachedBreakPositions[fNumCachedBreakPositions - 1]) {
 651             fPositionInCache = 0;
 652             // We are guaranteed not to leave the array due to range test above
 653             while (offset >= fCachedBreakPositions[fPositionInCache]) {
 654                 ++fPositionInCache;
 655             }
 656             int32_t pos = fCachedBreakPositions[fPositionInCache];
 657             utext_setNativeIndex(fText, pos);
 658             return pos;
 659         }
 660         else {
 661             reset();
 662         }
 663     }
 664
 665     // if the offset passed in is already past the end of the text,
 666     // just return DONE; if it's before the beginning, return the
 667     // text's starting offset
 668     fLastRuleStatusIndex  = 0;
 669     fLastStatusIndexValid = TRUE;
 670     if (fText == NULL || offset >= utext_nativeLength(fText)) {
 671         last();
 672         return next();
 673     }
 674     else if (offset < 0) {
 675         return first();
 676     }
 677
 678     // otherwise, set our internal iteration position (temporarily)
 679     // to the position passed in.  If this is the _beginning_ position,
 680     // then we can just use next() to get our return value
 681
 682     int32_t result = 0;
 683
 684     if (fData->fSafeRevTable != NULL) {
 685         // new rule syntax
 686         utext_setNativeIndex(fText, offset);
 687         // move forward one codepoint to prepare for moving back to a
 688         // safe point.
 689         // this handles offset being between a supplementary character
 690         UTEXT_NEXT32(fText);
 691         // handlePrevious will move most of the time to < 1 boundary away
 692         handlePrevious(fData->fSafeRevTable);
 693         int32_t result = next();
 694         while (result <= offset) {
 695             result = next();
 696         }
 697         return result;
 698     }
 699     if (fData->fSafeFwdTable != NULL) {
 700         // backup plan if forward safe table is not available
 701         utext_setNativeIndex(fText, offset);
 702         UTEXT_PREVIOUS32(fText);
 703         // handle next will give result >= offset
 704         handleNext(fData->fSafeFwdTable);
 705         // previous will give result 0 or 1 boundary away from offset,
 706         // most of the time
 707         // we have to
 708         int32_t oldresult = previous();
 709         while (oldresult > offset) {
 710             int32_t result = previous();
 711             if (result <= offset) {
 712                 return oldresult;
 713             }
 714             oldresult = result;
 715         }
 716         int32_t result = next();
 717         if (result <= offset) {
 718             return next();
 719         }
 720         return result;
 721     }
 722     // otherwise, we have to sync up first.  Use handlePrevious() to back
 723     // up to a known break position before the specified position (if
 724     // we can determine that the specified position is a break position,
 725     // we don't back up at all).  This may or may not be the last break
 726     // position at or before our starting position.  Advance forward
 727     // from here until we've passed the starting position.  The position
 728     // we stop on will be the first break position after the specified one.
 729     // old rule syntax
 730
 731     utext_setNativeIndex(fText, offset);
 732     if (offset==0 ||
 733         (offset==1  && utext_getNativeIndex(fText)==0)) {
 734         return next();
 735     }
 736     result = previous();
 737
 738     while (result != BreakIterator::DONE && result <= offset) {
 739         result = next();
 740     }
 741
 742     return result;
 743 }
 744
 745 /**
 746  * Sets the iterator to refer to the last boundary position before the
 747  * specified position.
 748  * @offset The position to begin searching for a break from.
 749  * @return The position of the last boundary before the starting position.
 750  */
 751 int32_t RuleBasedBreakIterator::preceding(int32_t offset) {
 752     // if we have cached break positions and offset is in the range
 753     // covered by them, use them
 754     if (fCachedBreakPositions != NULL) {
 755         // TODO: binary search?
 756         // TODO: What if offset is outside range, but break is not?
 757         if (offset > fCachedBreakPositions[0]
 758                 && offset <= fCachedBreakPositions[fNumCachedBreakPositions - 1]) {
 759             fPositionInCache = 0;
 760             while (fPositionInCache < fNumCachedBreakPositions
 761                    && offset > fCachedBreakPositions[fPositionInCache])
 762                 ++fPositionInCache;
 763             --fPositionInCache;
 764             // If we're at the beginning of the cache, need to reevaluate the
 765             // rule status
 766             if (fPositionInCache <= 0) {
 767                 fLastStatusIndexValid = FALSE;
 768             }
 769             utext_setNativeIndex(fText, fCachedBreakPositions[fPositionInCache]);
 770             return fCachedBreakPositions[fPositionInCache];
 771         }
 772         else {
 773             reset();
 774         }
 775     }
 776
 777     // if the offset passed in is already past the end of the text,
 778     // just return DONE; if it's before the beginning, return the
 779     // text's starting offset
 780     if (fText == NULL || offset > utext_nativeLength(fText)) {
 781         // return BreakIterator::DONE;
 782         return last();
 783     }
 784     else if (offset < 0) {
 785         return first();
 786     }
 787
 788     // if we start by updating the current iteration position to the
 789     // position specified by the caller, we can just use previous()
 790     // to carry out this operation
 791
 792     if (fData->fSafeFwdTable != NULL) {
 793         // new rule syntax
 794         utext_setNativeIndex(fText, offset);
 795         int32_t newOffset = (int32_t)UTEXT_GETNATIVEINDEX(fText);
 796         if (newOffset != offset) {
 797             // Will come here if specified offset was not a code point boundary AND
 798             //   the underlying implmentation is using UText, which snaps any non-code-point-boundary
 799             //   indices to the containing code point.
 800             // For breakitereator::preceding only, these non-code-point indices need to be moved
 801             //   up to refer to the following codepoint.
 802             UTEXT_NEXT32(fText);
 803             offset = (int32_t)UTEXT_GETNATIVEINDEX(fText);
 804         }
 805
 806         // TODO:  (synwee) would it be better to just check for being in the middle of a surrogate pair,
 807         //        rather than adjusting the position unconditionally?
 808         //        (Change would interact with safe rules.)
 809         // TODO:  change RBBI behavior for off-boundary indices to match that of UText?
 810         //        affects only preceding(), seems cleaner, but is slightly different.
 811         UTEXT_PREVIOUS32(fText);
 812         handleNext(fData->fSafeFwdTable);
 813         int32_t result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
 814         while (result >= offset) {
 815             result = previous();
 816         }
 817         return result;
 818     }
 819     if (fData->fSafeRevTable != NULL) {
 820         // backup plan if forward safe table is not available
 821         //  TODO:  check whether this path can be discarded
 822         //         It's probably OK to say that rules must supply both safe tables
 823         //            if they use safe tables at all.  We have certainly never described
 824         //            to anyone how to work with just one safe table.
 825         utext_setNativeIndex(fText, offset);
 826         UTEXT_NEXT32(fText);
 827
 828         // handle previous will give result <= offset
 829         handlePrevious(fData->fSafeRevTable);
 830
 831         // next will give result 0 or 1 boundary away from offset,
 832         // most of the time
 833         // we have to
 834         int32_t oldresult = next();
 835         while (oldresult < offset) {
 836             int32_t result = next();
 837             if (result >= offset) {
 838                 return oldresult;
 839             }
 840             oldresult = result;
 841         }
 842         int32_t result = previous();
 843         if (result >= offset) {
 844             return previous();
 845         }
 846         return result;
 847     }
 848
 849     // old rule syntax
 850     utext_setNativeIndex(fText, offset);
 851     return previous();
 852 }
 853
 854 /**
 855  * Tests whether "offset" is a break boundary.  As a side effect, the
 856  * iterator is left pointing at the first boundary position at or after
 857  * "offset".
 858  * @param offset the offset to check.
 859  * @return TRUE if "offset" is a boundary position, FALSE otherwise.
 860  */
 861 UBool RuleBasedBreakIterator::isBoundary(int32_t offset) {
 862     // Index zero is a boundary by definition.  first() is still called for its
 863     // side effects on the current position and the tag values.
 864     if (offset == 0) {
 865         first();
 866         return TRUE;
 867     }
 868
 869     // The length is fetched only after the zero case has been ruled out.
 870     const int64_t textLength = utext_nativeLength(fText);
 871
 872     // The end of the text is always a boundary as well.
 873     if (offset == (int32_t)textLength) {
 874         last();
 875         return TRUE;
 876     }
 877
 878     // Out-of-range offsets are never boundaries.  The iterator is parked at the
 879     // nearer end of the text so the position and tag side effects still occur.
 880     if (offset < 0 || offset > textLength) {
 881         if (offset < 0) { first(); } else { last(); }
 882         return FALSE;
 883     }
 884
 885     // Interior offset: back up one code point from "offset" and ask following()
 886     // for the next boundary after that point.  "offset" is a boundary exactly
 887     // when that is the position following() reports.
 888     utext_previous32From(fText, offset);
 889     int32_t priorIndex = (int32_t)UTEXT_GETNATIVEINDEX(fText);
 890     return following(priorIndex) == offset;
 891 }
 892
 893 /**
 894  * Reports the iterator's present position, i.e. the native index of fText.
 895  * @return The current iteration position.
 896  */
 897 int32_t RuleBasedBreakIterator::current(void) const {
 898     // The native index is narrowed to the 32 bit offsets used by this API.
 899     return (int32_t)UTEXT_GETNATIVEINDEX(fText);
 900 }
 901
 902 //=======================================================================
 903 // implementation
 904 //=======================================================================
 905
 906 //
 907 // RBBIRunMode  -  Phase of a handleNext() / handlePrevious() state machine run.  The
 908 //                 machine can run one extra iteration before the first input character
 909 //                 and one after the input is exhausted; text is only fetched in RBBI_RUN.
 910 //
 911 enum RBBIRunMode {
 912     RBBI_START,     // extra iteration before the first char of input (category 2 is used)
 913     RBBI_RUN,       // state machine processing is in the user text
 914     RBBI_END        // extra iteration after the input ran out (category 1 is used)
 915 };
 916
 917
 918 //-----------------------------------------------------------------------------------
 919 //
 920 //  handleNext(stateTable)
 921 //     The actual implementation of the rbbi next() method.  Starting at the current
 922 //     text position with the state machine in state 1 (START_STATE), it advances
 923 //     character by character until the text ends or the machine reaches state 0
 924 //     (STOP_STATE), updating the result whenever an accepting state is passed.
 925 //     Returns the new position (the text is left there), or DONE if fData is NULL
 926 //     or the text is already at its end.
 927 //-----------------------------------------------------------------------------------
 928 int32_t RuleBasedBreakIterator::handleNext(const RBBIStateTable *statetable) {
 929     int32_t             state;
 930     int16_t             category        = 0;
 931     RBBIRunMode         mode;
 932
 933     RBBIStateTableRow  *row;
 934     UChar32             c;
 935     int32_t             lookaheadStatus = 0;    // non-zero while a look-ahead match is pending
 936     int32_t             lookaheadTagIdx = 0;    // tag index saved with the pending look-ahead
 937     int32_t             result          = 0;
 938     int32_t             initialPosition = 0;
 939     int32_t             lookaheadResult = 0;    // text position saved with the pending look-ahead
 940     UBool               lookAheadHardBreak = (statetable->fFlags & RBBI_LOOKAHEAD_HARD_BREAK) != 0;
 941     const char         *tableData       = statetable->fTableData;
 942     uint32_t            tableRowLen     = statetable->fRowLen;
 943
 944     #ifdef RBBI_DEBUG
 945         if (fTrace) {
 946             RBBIDebugPuts("Handle Next   pos   char  state category");
 947         }
 948     #endif
 949
 950     // No matter what, handleNext always correctly sets the break tag value.
 951     fLastStatusIndexValid = TRUE;
 952     fLastRuleStatusIndex = 0;
 953
 954     // Fetch the first character.  If there is none (or no data), return DONE.
 955     initialPosition = (int32_t)UTEXT_GETNATIVEINDEX(fText);
 956     result          = initialPosition;
 957     c               = UTEXT_NEXT32(fText);
 958     if (fData == NULL || c==U_SENTINEL) {
 959         return BreakIterator::DONE;
 960     }
 961
 962     //  Set the initial state for the state machine
 963     state = START_STATE;
 964     row = (RBBIStateTableRow *)
 965             // i.e. (statetable->fTableData + (statetable->fRowLen * state)), via the cached locals
 966             (tableData + tableRowLen * state);
 967
 968
 969     mode     = RBBI_RUN;
 970     if (statetable->fFlags & RBBI_BOF_REQUIRED) {
 971         category = 2;
 972         mode     = RBBI_START;
 973     }
 974
 975
 976     // loop until we reach the end of the text or transition to state 0
 977     //
 978     for (;;) {
 979         if (c == U_SENTINEL) {
 980             // Reached end of input string.
 981             if (mode == RBBI_END) {
 982                 // We have already run the loop one last time with the
 983                 //   character set to the pseudo {eof} value.  Now it is time
 984                 //   to unconditionally bail out.
 985                 if (lookaheadResult > result) {
 986                     // We ran off the end of the string with a pending look-ahead match.
 987                     // Treat this as if the look-ahead condition had been met, and return
 988                     //  the match at the / position from the look-ahead rule.
 989                     result               = lookaheadResult;
 990                     fLastRuleStatusIndex = lookaheadTagIdx;
 991                     lookaheadStatus = 0;
 992                 }
 993                 break;
 994             }
 995             // Run the loop one last time with the fake end-of-input character category.
 996             mode = RBBI_END;
 997             category = 1;
 998         }
 999
1000         //
1001         // Get the char category.  An incoming category of 1 or 2 means that
1002         //      we are preset for doing the beginning or end of input, and
1003         //      that we shouldn't get a category from an actual text input character.
1004         //
1005         if (mode == RBBI_RUN) {
1006             // look up the current character's character category, which tells us
1007             // which column in the state table to look at.
1008             // Note:  the 16 in UTRIE_GET16 refers to the size of the data being returned,
1009             //        not the size of the character going in, which is a UChar32.
1010             //
1011             UTRIE_GET16(&fData->fTrie, c, category);
1012
1013             // Check the dictionary bit in the character's category.
1014             //    Counter is only used by dictionary based iterators (subclasses).
1015             //    Chars that need to be handled by a dictionary have a flag bit set
1016             //    in their category values.
1017             //
1018             if ((category & 0x4000) != 0)  {
1019                 fDictionaryCharCount++;
1020                 //  Mask off the dictionary flag bit, leaving the plain category number.
1021                 category &= ~0x4000;
1022             }
1023         }
1024
1025         #ifdef RBBI_DEBUG
1026             if (fTrace) {
1027                 RBBIDebugPrintf("             %4ld   ", utext_getNativeIndex(fText));
1028                 if (0x20<=c && c<0x7f) {
1029                     RBBIDebugPrintf("\"%c\"  ", c);
1030                 } else {
1031                     RBBIDebugPrintf("%5x  ", c);
1032                 }
1033                 RBBIDebugPrintf("%3d  %3d\n", state, category);
1034             }
1035         #endif
1036
1037         // State Transition - move machine to its next state
1038         //
1039         state = row->fNextState[category];
1040         row = (RBBIStateTableRow *)
1041             // i.e. (statetable->fTableData + (statetable->fRowLen * state)), via the cached locals
1042             (tableData + tableRowLen * state);
1043
1044
1045         if (row->fAccepting == -1) {
1046             // Match found, common case.  No position is recorded during the RBBI_START iteration.
1047             if (mode != RBBI_START) {
1048                 result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
1049             }
1050             fLastRuleStatusIndex = row->fTagIdx;   // Remember the break status (tag) values.
1051         }
1052
1053         if (row->fLookAhead != 0) {
1054             if (lookaheadStatus != 0
1055                 && row->fAccepting == lookaheadStatus) {
1056                 // Lookahead match is completed.
1057                 result               = lookaheadResult;
1058                 fLastRuleStatusIndex = lookaheadTagIdx;
1059                 lookaheadStatus      = 0;
1060                 // TODO:  make a standalone hard break in a rule work.
1061                 if (lookAheadHardBreak) {
1062                     UTEXT_SETNATIVEINDEX(fText, result);
1063                     return result;
1064                 }
1065                 // Look-ahead completed, but other rules may match further.  Continue on
1066                 //  TODO:  junk this feature?  I don't think it's used anywhere.
1067                 goto continueOn;
1068             }
1069
1070             int32_t  r = (int32_t)UTEXT_GETNATIVEINDEX(fText);
1071             lookaheadResult = r;
1072             lookaheadStatus = row->fLookAhead;
1073             lookaheadTagIdx = row->fTagIdx;
1074             goto continueOn;
1075         }
1076
1077
1078         if (row->fAccepting != 0) {
1079             // Because this is an accepting state, any in-progress look-ahead match
1080             //   is no longer relevant.  Clear out the pending lookahead status.
1081             lookaheadStatus = 0;           // clear out any pending look-ahead match.
1082         }
1083
1084 continueOn:
1085         if (state == STOP_STATE) {
1086             // This is the normal exit from the lookup state machine.
1087             // We have advanced through the string until it is certain that no
1088             //   longer match is possible, no matter what characters follow.
1089             break;
1090         }
1091
1092         // Advance to the next character.
1093         // If this is a beginning-of-input loop iteration, don't advance
1094         //    the input position.  The next iteration will be processing the
1095         //    first real input character.
1096         if (mode == RBBI_RUN) {
1097             c = UTEXT_NEXT32(fText);
1098         } else {
1099             if (mode == RBBI_START) {
1100                 mode = RBBI_RUN;
1101             }
1102         }
1103
1104
1105     }
1106
1107     // The state machine is done.  Check whether it found a match...
1108
1109     // If the iterator failed to advance in the match engine, force it ahead by one.
1110     //   (This really indicates a defect in the break rules.  They should always match
1111     //    at least one character.)
1112     if (result == initialPosition) {
1113         UTEXT_SETNATIVEINDEX(fText, initialPosition);
1114         UTEXT_NEXT32(fText);
1115         result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
1116     }
1117
1118     // Leave the iterator at our result position.
1119     UTEXT_SETNATIVEINDEX(fText, result);
1120     #ifdef RBBI_DEBUG
1121         if (fTrace) {
1122             RBBIDebugPrintf("result = %d\n\n", result);
1123         }
1124     #endif
1125     return result;
1126 }
1127
1128
1129
1130 //-----------------------------------------------------------------------------------
1131 //
1132 //  handlePrevious(stateTable)
1133 //
1134 //      Iterate backwards, according to the logic of the reverse rules.
1135 //      This version handles the exact style backwards rules.
1136 //      Returns the new position (the text is left there), or DONE if fText or fData
1137 //      is NULL or the text is already at index 0.  The rule status is not computed.
1138 //      The logic of this function is very similar to handleNext(), above.
1139 //-----------------------------------------------------------------------------------
1140 int32_t RuleBasedBreakIterator::handlePrevious(const RBBIStateTable *statetable) {
1141     int32_t             state;
1142     int16_t             category        = 0;
1143     RBBIRunMode         mode;
1144     RBBIStateTableRow  *row;
1145     UChar32             c;
1146     int32_t             lookaheadStatus = 0;    // non-zero while a look-ahead match is pending
1147     int32_t             result          = 0;
1148     int32_t             initialPosition = 0;
1149     int32_t             lookaheadResult = 0;    // text position saved with the pending look-ahead
1150     UBool               lookAheadHardBreak = (statetable->fFlags & RBBI_LOOKAHEAD_HARD_BREAK) != 0;
1151
1152     #ifdef RBBI_DEBUG
1153         if (fTrace) {
1154             RBBIDebugPuts("Handle Previous   pos   char  state category");
1155         }
1156     #endif
1157
1158     // handlePrevious() never gets the rule status.
1159     // Flag the status as invalid; if the user ever asks for status, we will need
1160     // to back up, then re-find the break position using handleNext(), which does
1161     // get the status value.
1162     fLastStatusIndexValid = FALSE;
1163     fLastRuleStatusIndex = 0;
1164
1165     // if there is no text or data, or we're already at the start of the text, return DONE.
1166     if (fText == NULL || fData == NULL || UTEXT_GETNATIVEINDEX(fText)==0) {
1167         return BreakIterator::DONE;
1168     }
1169
1170     //  Set up the starting char: the one just before the current position.
1171     initialPosition = (int32_t)UTEXT_GETNATIVEINDEX(fText);
1172     result          = initialPosition;
1173     c               = UTEXT_PREVIOUS32(fText);
1174
1175     //  Set the initial state for the state machine
1176     state = START_STATE;
1177     row = (RBBIStateTableRow *)
1178             (statetable->fTableData + (statetable->fRowLen * state));
1179     category = 3;
1180     mode     = RBBI_RUN;
1181     if (statetable->fFlags & RBBI_BOF_REQUIRED) {
1182         category = 2;
1183         mode     = RBBI_START;
1184     }
1185
1186
1187     // loop until we reach the start of the text or transition to state 0
1188     //
1189     for (;;) {
1190         if (c == U_SENTINEL) {
1191             // No more characters: moving backwards, we have run off the start of the text.
1192             if (mode == RBBI_END) {
1193                 // We have already run the loop one last time with the
1194                 //   character set to the pseudo {eof} value.  Now it is time
1195                 //   to unconditionally bail out.
1196                 if (lookaheadResult < result) {
1197                     // Ran off the start with a look-ahead position below the current result;
1198                     // treat the look-ahead as met.  NOTE(review): lookaheadResult starts at 0,
1199                     // so this also fires with no look-ahead recorded - confirm that is intended.
1200                     result               = lookaheadResult;
1201                     lookaheadStatus = 0;
1202                 } else if (result == initialPosition) {
1203                     // Ran off start, no match found.
1204                     // move back by one index (towards the start, since we are doing a previous())
1205                     UTEXT_SETNATIVEINDEX(fText, initialPosition);
1206                     UTEXT_PREVIOUS32(fText);   // TODO:  shouldn't be necessary.  We're already at beginning.  Check.
1207                 }
1208                 break;
1209             }
1210             // Run the loop one last time with the fake end-of-input character category.
1211             mode = RBBI_END;
1212             category = 1;
1213         }
1214
1215         //
1216         // Get the char category.  An incoming category of 1 or 2 means that
1217         //      we are preset for doing the beginning or end of input, and
1218         //      that we shouldn't get a category from an actual text input character.
1219         //
1220         if (mode == RBBI_RUN) {
1221             // look up the current character's character category, which tells us
1222             // which column in the state table to look at.
1223             // Note:  the 16 in UTRIE_GET16 refers to the size of the data being returned,
1224             //        not the size of the character going in, which is a UChar32.
1225             //
1226             UTRIE_GET16(&fData->fTrie, c, category);
1227
1228             // Check the dictionary bit in the character's category.
1229             //    Counter is only used by dictionary based iterators (subclasses).
1230             //    Chars that need to be handled by a dictionary have a flag bit set
1231             //    in their category values.
1232             //
1233             if ((category & 0x4000) != 0)  {
1234                 fDictionaryCharCount++;
1235                 //  Mask off the dictionary flag bit, leaving the plain category number.
1236                 category &= ~0x4000;
1237             }
1238         }
1239
1240         #ifdef RBBI_DEBUG
1241             if (fTrace) {
1242                 RBBIDebugPrintf("             %4d   ", (int32_t)utext_getNativeIndex(fText));
1243                 if (0x20<=c && c<0x7f) {
1244                     RBBIDebugPrintf("\"%c\"  ", c);
1245                 } else {
1246                     RBBIDebugPrintf("%5x  ", c);
1247                 }
1248                 RBBIDebugPrintf("%3d  %3d\n", state, category);
1249             }
1250         #endif
1251
1252         // State Transition - move machine to its next state
1253         //
1254         state = row->fNextState[category];
1255         row = (RBBIStateTableRow *)
1256             (statetable->fTableData + (statetable->fRowLen * state));
1257
1258         if (row->fAccepting == -1) {
1259             // Match found, common case.
1260             result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
1261         }
1262
1263         if (row->fLookAhead != 0) {
1264             if (lookaheadStatus != 0
1265                 && row->fAccepting == lookaheadStatus) {
1266                 // Lookahead match is completed.
1267                 result               = lookaheadResult;
1268                 lookaheadStatus      = 0;
1269                 // TODO:  make a standalone hard break in a rule work.
1270                 if (lookAheadHardBreak) {
1271                     UTEXT_SETNATIVEINDEX(fText, result);
1272                     return result;
1273                 }
1274                 // Look-ahead completed, but other rules may match further.  Continue on
1275                 //  TODO:  junk this feature?  I don't think it's used anywhere.
1276                 goto continueOn;
1277             }
1278
1279             int32_t  r = (int32_t)UTEXT_GETNATIVEINDEX(fText);
1280             lookaheadResult = r;
1281             lookaheadStatus = row->fLookAhead;
1282             goto continueOn;
1283         }
1284
1285
1286         if (row->fAccepting != 0) {
1287             // Because this is an accepting state, any in-progress look-ahead match
1288             //   is no longer relevant.  Clear out the pending lookahead status.
1289             lookaheadStatus = 0;
1290         }
1291
1292 continueOn:
1293         if (state == STOP_STATE) {
1294             // This is the normal exit from the lookup state machine.
1295             // We have moved back through the string until it is certain that no
1296             //   longer match is possible, no matter what characters precede.
1297             break;
1298         }
1299
1300         // Move (backwards) to the next character to process.
1301         // If this is a beginning-of-input loop iteration, don't advance
1302         //    the input position.  The next iteration will be processing the
1303         //    first real input character.
1304         if (mode == RBBI_RUN) {
1305             c = UTEXT_PREVIOUS32(fText);
1306         } else {
1307             if (mode == RBBI_START) {
1308                 mode = RBBI_RUN;
1309             }
1310         }
1311     }
1312
1313     // The state machine is done.  Check whether it found a match...
1314
1315     // If the iterator failed to move in the match engine, force it back by one.
1316     //   (This really indicates a defect in the break rules.  They should always match
1317     //    at least one character.)
1318     if (result == initialPosition) {
1319         UTEXT_SETNATIVEINDEX(fText, initialPosition);
1320         UTEXT_PREVIOUS32(fText);
1321         result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
1322     }
1323
1324     // Leave the iterator at our result position.
1325     UTEXT_SETNATIVEINDEX(fText, result);
1326     #ifdef RBBI_DEBUG
1327         if (fTrace) {
1328             RBBIDebugPrintf("result = %d\n\n", result);
1329         }
1330     #endif
1331     return result;
1332 }
1333
1334
1335 // Drops any cached break positions and zeroes the cache / dictionary bookkeeping.
1336 void RuleBasedBreakIterator::reset()
1337 {
1338     fPositionInCache = 0;
1339     fDictionaryCharCount = 0;
1340     fNumCachedBreakPositions = 0;
1341     if (fCachedBreakPositions != NULL) {
1342         uprv_free(fCachedBreakPositions);
1343         fCachedBreakPositions = NULL;
1344     }
1345 }
1346
1347
1348
1349 //-------------------------------------------------------------------------------
1350 //
1351 //   makeRuleStatusValid()   Make sure fLastRuleStatusIndex describes the break rule
1352 //                     tag for the current iterator position.  If the iterator arrived
1353 //                     at its position by iterating forwards, the value will already
1354 //                     have been cached by the handleNext() function.
1355 //
1356 //                     If no cached status value is available, the status is
1357 //                     found by doing a previous() followed by a next(), which
1358 //                     leaves the iterator where it started, and computes the
1359 //                     status while doing the next().
1360 //
1361 //-------------------------------------------------------------------------------
1362 void RuleBasedBreakIterator::makeRuleStatusValid() {
1363     if (!fLastStatusIndexValid) {
1364         if (fText != NULL && current() != 0) {
1365             // Past the start of the text.  Find the status the tedious way: back up
1366             // one boundary, then step forward again to recompute it.
1367             int32_t startPos = current();
1368             previous();
1369             if (fNumCachedBreakPositions > 0) {
1370                 reset();                // Discard the dictionary cache
1371             }
1372             int32_t endPos = next();
1373             if (startPos != endPos) {
1374                 // The enclosing test only keeps both locals "used" when U_ASSERT
1375                 // compiles to nothing; logically it is not needed.
1376                 U_ASSERT(startPos == endPos);
1377             }
1378         } else {
1379             // No text, or positioned at its very start: status is always zero.
1380             fLastStatusIndexValid = TRUE;
1381             fLastRuleStatusIndex = 0;
1382         }
1383     }
1384     U_ASSERT(fLastRuleStatusIndex >= 0  &&  fLastRuleStatusIndex < fData->fStatusMaxIdx);
1385 }
1386
1387
1388 int32_t  RuleBasedBreakIterator::getRuleStatus() const {
1389     // Refreshing the cached status index mutates the object, so const is cast away here.
1390     ((RuleBasedBreakIterator *)this)->makeRuleStatusValid();
1391
1392     // fLastRuleStatusIndex addresses the head of a status record.  The head slot
1393     // holds the number of status values and the values themselves follow it, so
1394     // head + count is the slot of the record's last value (documented as the
1395     // largest one), which is what this function returns.
1396     int32_t  valueCount = fData->fRuleStatusTable[fLastRuleStatusIndex];
1397     int32_t  lastSlot   = fLastRuleStatusIndex + valueCount;
1398     return fData->fRuleStatusTable[lastSlot];
1399 }
1400
1401
1402
1403
1404 int32_t RuleBasedBreakIterator::getRuleStatusVec(
1405              int32_t *fillInVec, int32_t capacity, UErrorCode &status)
1406 {
1407     // An error already recorded by the caller means: do nothing, report zero values.
1408     if (U_FAILURE(status)) {
1409         return 0;
1410     }
1411     makeRuleStatusValid();
1412
1413     // The status record starts with its value count; the values follow immediately.
1414     const int32_t available = fData->fRuleStatusTable[fLastRuleStatusIndex];
1415     const UBool   truncated = available > capacity;
1416     if (truncated) {
1417         status = U_BUFFER_OVERFLOW_ERROR;   // flagged, but the values that fit are still copied
1418     }
1419     const int32_t copyCount = truncated ? capacity : available;
1420     for (int32_t slot = 1; slot <= copyCount; ++slot) {
1421         fillInVec[slot - 1] = fData->fRuleStatusTable[fLastRuleStatusIndex + slot];
1422     }
1423     return available;   // the full count, even when fewer values were copied
1424 }
1425
1426
1427
1428 //-------------------------------------------------------------------------------
1429 //
1430 //   getBinaryRules        Hands out the compiled form of the rules (fData->fHeader) and
1431 //                         stores the header's fLength in "length"; NULL and zero when no
1432 //                         data is attached.  For build tools that save standard iterators.
1433 //
1434 //-------------------------------------------------------------------------------
1435 const uint8_t  *RuleBasedBreakIterator::getBinaryRules(uint32_t &length) {
1436     if (fData == NULL) {
1437         // Nothing compiled is attached to this iterator.
1438         length = 0;
1439         return NULL;
1440     }
1441
1442     length = fData->fHeader->fLength;
1443     return (const uint8_t *)fData->fHeader;
1444 }
1445
1446
1447
1448
1449 //-------------------------------------------------------------------------------
1450 //
1451 //  createBufferClone   Clone this iterator, into caller-supplied memory when it is usable.
1452 //
1453 //      stackBuffer   Caller's buffer for the clone; may be NULL or misaligned.
1454 //      bufferSize    Size of stackBuffer in bytes.  Zero requests a preflight: the size
1455 //                    needed (allowing for worst case misalignment) is stored here.
1456 //      status        Set to U_SAFECLONE_ALLOCATED_WARNING when the clone had to be heap
1457 //                    allocated, U_MEMORY_ALLOCATION_ERROR when that allocation failed.
1458 //      returns       The clone, or NULL on preflight, incoming error or failed allocation.
1459 //
1460 //  TODO:  In my (Andy) opinion, this function should be deprecated.  Saving one heap
1461 //         allocation isn't worth the trouble: cloning shouldn't be done in tight loops,
1462 //         making the clone copy involves other heap operations anyway, and application
1463 //         code for handling buffer size problems and eventual destruction is ugly.
1464 //
1465 //-------------------------------------------------------------------------------
1466 BreakIterator *  RuleBasedBreakIterator::createBufferClone(void *stackBuffer,
1467                                    int32_t &bufferSize,
1468                                    UErrorCode &status)
1469 {
1470     if (U_FAILURE(status)){
1471         return NULL;
1472     }
1473
1474     // A zero size is a preflight request: report the space needed, clone nothing.
1475     if (bufferSize == 0) {
1476         bufferSize = sizeof(RuleBasedBreakIterator) + U_ALIGNMENT_OFFSET_UP(0);
1477         return NULL;
1478     }
1479
1480     // Work out how many bytes of the caller's buffer are usable once its start has
1481     // been rounded up to an aligned address.  A missing buffer or a negative size
1482     // counts as zero usable bytes, and the alignment adjustment is clamped so the
1483     // unsigned count can never wrap around to a huge value; in those cases the size
1484     // test below falls back to a heap clone instead of writing past the buffer.
1485     char    *buf   = (char *)stackBuffer;
1486     uint32_t s      = (stackBuffer != NULL && bufferSize > 0) ? (uint32_t)bufferSize : 0;
1487     if (U_ALIGNMENT_OFFSET(stackBuffer) != 0) {
1488         uint32_t offsetUp = (uint32_t)U_ALIGNMENT_OFFSET_UP(buf);
1489         s    = (s > offsetUp) ? (s - offsetUp) : 0;
1490         buf += offsetUp;
1491     }
1492     if (s < sizeof(RuleBasedBreakIterator)) {
1493         // Not enough room in the caller-supplied buffer.
1494         // Do a plain-vanilla heap based clone and return that, along with
1495         //   a warning that the clone was allocated.
1496         RuleBasedBreakIterator *clonedBI = new RuleBasedBreakIterator(*this);
1497         if (clonedBI == 0) {
1498             status = U_MEMORY_ALLOCATION_ERROR;
1499         } else {
1500             status = U_SAFECLONE_ALLOCATED_WARNING;
1501         }
1502         return clonedBI;
1503     }
1504
1505     //
1506     //  Clone the source BI into the caller-supplied buffer.
1507     //    TODO:  using an overloaded operator new to directly initialize the
1508     //           copy in the user's buffer would be better, but it doesn't seem
1509     //           to get along with namespaces.  Investigate why.
1510     //
1511     //           The memcpy is only safe with an empty (default constructed)
1512     //           break iterator.  Use on others can screw up reference counts
1513     //           to data.  memcpy-ing objects is not really a good idea...
1514     //
1515     RuleBasedBreakIterator localIter;        // Empty break iterator, source for memcpy
1516     RuleBasedBreakIterator *clone = (RuleBasedBreakIterator *)buf;
1517     uprv_memcpy(clone, &localIter, sizeof(RuleBasedBreakIterator)); // init C++ gorp, BreakIterator base class part
1518     clone->init();                // Init RuleBasedBreakIterator part, (user default constructor)
1519     *clone = *this;               // clone = the real BI we want.
1520     clone->fBufferClone = TRUE;   // Flag to prevent deleting storage on close (From C code)
1521
1522     return clone;
1523 }
1524
1525
1526 //-------------------------------------------------------------------------------
1527 //
1528 //  isDictionaryChar      Return true if the category lookup for this char
1529 //                        indicates that it is in the set of dictionary lookup
1530 //                        chars.
1531 //
1532 //                        This function is intended for use by dictionary based
1533 //                        break iterators.
1534 //
1535 //-------------------------------------------------------------------------------
1536 /*UBool RuleBasedBreakIterator::isDictionaryChar(UChar32   c) {
1537     if (fData == NULL) {
1538         return FALSE;
1539     }
1540     uint16_t category;
1541     UTRIE_GET16(&fData->fTrie, c, category);
1542     return (category & 0x4000) != 0;
1543 }*/
1544
1545
1546 //-------------------------------------------------------------------------------
1547 //
1548 //  checkDictionary       Handles all processing of characters in the "dictionary"
1549 //                        set for the span startPos..endPos. Returns the break to
1550 //                        use in place of the proposed one (endPos going forward,
1551 //                        startPos when 'reverse'), possibly setting up a cache.
1552 //
1553 //-------------------------------------------------------------------------------
1554 int32_t RuleBasedBreakIterator::checkDictionary(int32_t startPos,
1555                             int32_t endPos,
1556                             UBool reverse) {
1557     // Reset the old break cache first, keeping a copy of the count taken before reset().
1558     uint32_t dictionaryCount = fDictionaryCharCount;
1559     reset();
1560     // With at most one dictionary character, or a span of at most one index, keep the proposed break.
1561     if (dictionaryCount <= 1 || (endPos - startPos) <= 1) {
1562         return (reverse ? startPos : endPos);
1563     }
1564 
1565     // Bug 5532.  The dictionary code will crash if the input text is UTF-8
1566     //      because native indexes are different from UTF-16 indexes.
1567     //      Temporary hack: skip dictionary lookup for UTF-8 encoded text.
1568     //      It won't give the right breaks, but it's better than a crash.
1569     //
1570     //      Check the type of the UText by checking its pFuncs field, which
1571     //      is UText's function dispatch table.  It will be the same for all
1572     //      UTF-8 UTexts and different for any other UText type.
1573     //
1574     //      We have no other type of UText available with non-UTF-16 native indexing.
1575     //      This whole check will go away once the dictionary code is fixed.
1576     static const void *utext_utf8Funcs;   // NOTE(review): set lazily below with no visible locking - confirm
1577     if (utext_utf8Funcs == NULL) {
1578         // Cache the UTF-8 UText function pointer value.
1579         UErrorCode status = U_ZERO_ERROR;
1580         UText tempUText = UTEXT_INITIALIZER;
1581         utext_openUTF8(&tempUText, NULL, 0, &status);
1582         utext_utf8Funcs = tempUText.pFuncs;
1583         utext_close(&tempUText);
1584     }
1585     if (fText->pFuncs == utext_utf8Funcs) {
1586         return (reverse ? startPos : endPos);
1587     }
1588 
1589     // Starting from the starting point, scan towards the proposed result,
1590     // looking for the first dictionary character (which may be the one
1591     // we're on, if we're starting in the middle of a range).
1592     utext_setNativeIndex(fText, reverse ? endPos : startPos);
1593     if (reverse) {
1594         UTEXT_PREVIOUS32(fText);
1595     }
1596 
1597     int32_t rangeStart = startPos;
1598     int32_t rangeEnd = endPos;
1599 
1600     uint16_t    category;
1601     int32_t     current;
1602     UErrorCode  status = U_ZERO_ERROR;
1603     UStack      breaks(status);
1604     int32_t     foundBreakCount = 0;
1605     UChar32     c = utext_current32(fText);
1606 
1607     UTRIE_GET16(&fData->fTrie, c, category);
1608 
1609     // Is the character we're starting on a dictionary character? If so, we
1610     // need to back up to include the entire run; otherwise the results of
1611     // the break algorithm will differ depending on where we start. Since
1612     // the result is cached and there is typically a non-dictionary break
1613     // within a small number of words, there should be little performance impact.
1614     if (category & 0x4000) {
1615         if (reverse) {
1616             do {
1617                 utext_next32(fText);          // TODO:  recast to work directly with postincrement.
1618                 c = utext_current32(fText);
1619                 UTRIE_GET16(&fData->fTrie, c, category);
1620             } while (c != U_SENTINEL && (category & 0x4000));
1621             // Back up to the last dictionary character
1622             rangeEnd = (int32_t)UTEXT_GETNATIVEINDEX(fText);
1623             if (c == U_SENTINEL) {
1624                 // c = fText->last32();
1625                 //   TODO:  why was this if needed?  (both arms now do the same thing)
1626                 c = UTEXT_PREVIOUS32(fText);
1627             }
1628             else {
1629                 c = UTEXT_PREVIOUS32(fText);
1630             }
1631         }
1632         else {
1633             do {
1634                 c = UTEXT_PREVIOUS32(fText);
1635                 UTRIE_GET16(&fData->fTrie, c, category);
1636             }
1637             while (c != U_SENTINEL && (category & 0x4000));
1638             // Back up to the last dictionary character
1639             if (c == U_SENTINEL) {
1640                 // c = fText->first32();
1641                 c = utext_current32(fText);
1642             }
1643             else {
1644                 utext_next32(fText);
1645                 c = utext_current32(fText);
1646             }
1647             rangeStart = (int32_t)UTEXT_GETNATIVEINDEX(fText);
1648         }
1649         UTRIE_GET16(&fData->fTrie, c, category);
1650     }
1651 
1652     // Loop through the text, looking for ranges of dictionary characters.
1653     // For each span, find the appropriate break engine, and ask it to find
1654     // any breaks within the span.
1655     // Note: we always do this in the forward direction, so that the break
1656     // cache is built in the right order.
1657     if (reverse) {
1658         utext_setNativeIndex(fText, rangeStart);
1659         c = utext_current32(fText);
1660         UTRIE_GET16(&fData->fTrie, c, category);
1661     }
1662     while(U_SUCCESS(status)) {
1663         while((current = (int32_t)UTEXT_GETNATIVEINDEX(fText)) < rangeEnd && (category & 0x4000) == 0) {
1664             utext_next32(fText);           // TODO:  tweak for post-increment operation
1665             c = utext_current32(fText);
1666             UTRIE_GET16(&fData->fTrie, c, category);
1667         }
1668         if (current >= rangeEnd) {
1669             break;
1670         }
1671 
1672         // We now have a dictionary character. Get the appropriate language object
1673         // to deal with it.
1674         const LanguageBreakEngine *lbe = getLanguageBreakEngine(c);
1675         if (lbe == NULL) {
1676             // No engine could be obtained. Nothing below would move the text
1677             // pointer off this character, so stop rather than spin on it forever.
1678             break;
1679         }
1680         // Ask the engine for breaks; it leaves the text pointer on the other side of its range.
1681         foundBreakCount += lbe->findBreaks(fText, rangeStart, rangeEnd, FALSE, fBreakType, breaks);
1682         // Reload the loop variables for the next go-round
1683         c = utext_current32(fText);
1684         UTRIE_GET16(&fData->fTrie, c, category);
1685     }
1686 
1687     // If we found breaks, build a new break cache. The first and last entries must
1688     // be the original starting and ending position.
1689     if (foundBreakCount > 0) {
1690         int32_t totalBreaks = foundBreakCount;
1691         if (startPos < breaks.elementAti(0)) {
1692             totalBreaks += 1;
1693         }
1694         if (endPos > breaks.peeki()) {
1695             totalBreaks += 1;
1696         }
1697         fCachedBreakPositions = (int32_t *)uprv_malloc(totalBreaks * sizeof(int32_t));
1698         if (fCachedBreakPositions != NULL) {
1699             int32_t out = 0;
1700             fNumCachedBreakPositions = totalBreaks;
1701             if (startPos < breaks.elementAti(0)) {
1702                 fCachedBreakPositions[out++] = startPos;
1703             }
1704             for (int32_t i = 0; i < foundBreakCount; ++i) {
1705                 fCachedBreakPositions[out++] = breaks.elementAti(i);
1706             }
1707             if (endPos > fCachedBreakPositions[out-1]) {
1708                 fCachedBreakPositions[out] = endPos;   // slot counted in totalBreaks via the peeki() test above
1709             }
1710             // If there are breaks, then by definition, we are replacing the original
1711             // proposed break by one of the breaks we found. Use following() and
1712             // preceding() to do the work. They should never recurse in this case.
1713             if (reverse) {
1714                 return preceding(endPos - 1);
1715             }
1716             else {
1717                 return following(startPos);
1718             }
1719         }
1720         // If the allocation failed, just fall through to the "no breaks found" case.
1721     }
1722 
1723     // If we get here, there were no language-based breaks. Set the text pointer
1724     // to the original proposed break.
1725     utext_setNativeIndex(fText, reverse ? startPos : endPos);
1726     return (reverse ? startPos : endPos);
1727 }
1728
1729 U_NAMESPACE_END
1730
1731 // defined in ucln_cmn.h  (NOTE(review): presumably refers to the cleanup registration used with this global - confirm)
1732 // File-scope stack of LanguageBreakFactory objects: filled in by getLanguageBreakEngineFromFactory(), released by breakiterator_cleanup_dict().
1733 static U_NAMESPACE_QUALIFIER UStack *gLanguageBreakFactories = NULL;
1734
1735 /**
1736  * Cleanup callback: frees the language break factory stack and clears the global pointer.
1737  */
1738 U_CDECL_BEGIN
1739 static UBool U_CALLCONV breakiterator_cleanup_dict(void) {
1740     // Deleting a NULL pointer does nothing, so no explicit test is needed here.
1741     delete gLanguageBreakFactories;
1742     gLanguageBreakFactories = NULL;
1743     // Always reports success.
1744     return TRUE;
1745 }
1746 U_CDECL_END
1747
1748 U_CDECL_BEGIN
1749 static void U_CALLCONV _deleteFactory(void *obj) {   // element deleter passed to the UStack of factories
1750     delete static_cast<U_NAMESPACE_QUALIFIER LanguageBreakFactory *>(obj);
1751 }
1752 U_CDECL_END
1753 U_NAMESPACE_BEGIN
1754 // Ask the file-scope factory stack (built on first call) for an engine handling c for breakType; NULL if none.
1755 static const LanguageBreakEngine*
1756 getLanguageBreakEngineFromFactory(UChar32 c, int32_t breakType)
1757 {
1758     UBool       needsInit;
1759     UErrorCode  status = U_ZERO_ERROR;
1760     UMTX_CHECK(NULL, (UBool)(gLanguageBreakFactories == NULL), needsInit);
1761     // First call: build a candidate stack before taking the lock, then try to install it.
1762     if (needsInit) {
1763         UStack  *factories = new UStack(_deleteFactory, NULL, status);
1764         if (factories != NULL && U_SUCCESS(status)) {
1765             ICULanguageBreakFactory *builtIn = new ICULanguageBreakFactory(status);
1766             factories->push(builtIn, status);
1767 #if U_LOCAL_SERVICE_HOOK   // same test that guards the localsvc.h include at the top of the file
1768             LanguageBreakFactory *extra = (LanguageBreakFactory *)uprv_svc_hook("languageBreakFactory", &status);
1769             if (extra != NULL) {
1770                 factories->push(extra, status);
1771             }
1772 #endif
1773         }
1774         umtx_lock(NULL);
1775         if (gLanguageBreakFactories == NULL) {
1776             gLanguageBreakFactories = factories;
1777             factories = NULL;
1778             ucln_common_registerCleanup(UCLN_COMMON_BREAKITERATOR_DICT, breakiterator_cleanup_dict);
1779         }
1780         umtx_unlock(NULL);
1781         delete factories;   // still non-NULL only if the global was already set when the lock was taken
1782     }
1783 
1784     if (gLanguageBreakFactories == NULL) {
1785         return NULL;
1786     }
1787 
1788     int32_t i = gLanguageBreakFactories->size();
1789     const LanguageBreakEngine *lbe = NULL;
1790     // Walk from the top of the stack down, stopping at the first factory that yields an engine.
1791     while (lbe == NULL && --i >= 0) {
1792         LanguageBreakFactory *factory = (LanguageBreakFactory *)(gLanguageBreakFactories->elementAt(i));
1793         if (factory != NULL) {   // the unchecked 'new' above may have left a NULL entry; never dereference it
1794             lbe = factory->getEngineFor(c, breakType);
1795         }
1796     }
1797     return lbe;
1798 }
1799
1800
1801 //-------------------------------------------------------------------------------
1802 //
1803 //  getLanguageBreakEngine  Return the LanguageBreakEngine to use for character c,
1804 //                          or NULL if the engine list or reject engine can't be set up.
1805 //
1806 //-------------------------------------------------------------------------------
1807 const LanguageBreakEngine *
1808 RuleBasedBreakIterator::getLanguageBreakEngine(UChar32 c) {
1809     UErrorCode ec = U_ZERO_ERROR;
1810 
1811     // Create this iterator's own list of engines the first time it is needed.
1812     if (fLanguageBreakEngines == NULL) {
1813         fLanguageBreakEngines = new UStack(ec);
1814         if (fLanguageBreakEngines == NULL || U_FAILURE(ec)) {
1815             delete fLanguageBreakEngines;
1816             fLanguageBreakEngines = NULL;
1817             return NULL;
1818         }
1819     }
1820 
1821     // Search the engines we already hold, highest index first.
1822     for (int32_t idx = fLanguageBreakEngines->size() - 1; idx >= 0; --idx) {
1823         const LanguageBreakEngine *known =
1824             (const LanguageBreakEngine *)(fLanguageBreakEngines->elementAt(idx));
1825         if (known->handles(c, fBreakType)) {
1826             return known;
1827         }
1828     }
1829 
1830     // Nothing we already hold accepts the character; see whether a factory
1831     // can supply a new engine for it.
1832     const LanguageBreakEngine *fresh = getLanguageBreakEngineFromFactory(c, fBreakType);
1833     if (fresh != NULL) {
1834         // Remember it for next time. The engine is usable even when the push
1835         // fails (it can simply be looked up again), so ec is not examined.
1836         fLanguageBreakEngines->push((void *)fresh, ec);
1837         return fresh;
1838     }
1839 
1840     // No engine is forthcoming for this character, so it goes to the
1841     // reject engine, which is created on first need.
1842     if (fUnhandledBreakEngine == NULL) {
1843         fUnhandledBreakEngine = new UnhandledEngine(ec);
1844         if (fUnhandledBreakEngine == NULL && U_SUCCESS(ec)) {
1845             ec = U_MEMORY_ALLOCATION_ERROR;
1846         }
1847 
1848         // Index 0 is the last slot the search loop above visits, so engines
1849         // for scripts we do handle are always tried before the reject engine.
1850         fLanguageBreakEngines->insertElementAt(fUnhandledBreakEngine, 0, ec);
1851         if (U_FAILURE(ec)) {
1852             // Creation or insertion failed: discard the engine.
1853             delete fUnhandledBreakEngine;
1854             fUnhandledBreakEngine = NULL;
1855             return NULL;
1856         }
1857     }
1858 
1859     // Tell the reject engine about the character; at its discretion, it may
1860     // add more than just the one character.
1861     fUnhandledBreakEngine->handleCharacter(c, fBreakType);
1862 
1863     return fUnhandledBreakEngine;
1864 }
1865
1866
1867
1868 /*int32_t RuleBasedBreakIterator::getBreakType() const {
1869     return fBreakType;
1870 }*/
1871 // Record the break type passed to the language break engines, then call reset().
1872 void RuleBasedBreakIterator::setBreakType(int32_t type) {
1873     this->fBreakType = type;
1874     this->reset();   // checkDictionary() describes this call as resetting the old break cache
1875 }
1876
1877 U_NAMESPACE_END
1878
1879 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */