icuSources/common/rbbi.cpp

   1 /*
   2 ***************************************************************************
   3 *   Copyright (C) 1999-2016 International Business Machines Corporation
   4 *   and others. All rights reserved.
   5 ***************************************************************************
   6 */
   7 //
    8 //  file:  rbbi.cpp  Contains the implementation of the rule based break iterator
   9 //                   runtime engine and the API implementation for
  10 //                   class RuleBasedBreakIterator
  11 //
  12
  13 #include "utypeinfo.h"  // for 'typeid' to work
  14
  15 #include "unicode/utypes.h"
  16
  17 #if !UCONFIG_NO_BREAK_ITERATION
  18
  19 #include "unicode/rbbi.h"
  20 #include "unicode/schriter.h"
  21 #include "unicode/uchriter.h"
  22 #include "unicode/udata.h"
  23 #include "unicode/uclean.h"
  24 #include "rbbidata.h"
  25 #include "rbbirb.h"
  26 #include "cmemory.h"
  27 #include "cstring.h"
  28 #include "umutex.h"
  29 #include "ucln_cmn.h"
  30 #include "brkeng.h"
  31
  32 #include "uassert.h"
  33 #include "uvector.h"
  34
  35 // if U_LOCAL_SERVICE_HOOK is defined, then localsvc.cpp is expected to be included.
  36 #if U_LOCAL_SERVICE_HOOK
  37 #include "localsvc.h"
  38 #endif
  39
#ifdef RBBI_DEBUG
// Debug-build-only tracing switch.  It starts out FALSE and is turned on by
// RuleBasedBreakIterator::init() when the U_RBBIDEBUG environment variable
// contains the substring "trace".
static UBool fTrace = FALSE;
#endif
  43
U_NAMESPACE_BEGIN

// The state number of the starting state of the break state machine.
#define START_STATE 1

// The state-transition value indicating "stop": reaching it ends a match.
#define STOP_STATE  0


// NOTE(review): presumably expands to the ICU class-ID (RTTI) boilerplate for
// RuleBasedBreakIterator; the macro is defined outside this file - confirm.
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedBreakIterator)
  54
  55
  56 //=======================================================================
  57 // constructors
  58 //=======================================================================
  59
  60 /**
  61  * Constructs a RuleBasedBreakIterator that uses the already-created
  62  * tables object that is passed in as a parameter.
  63  */
  64 RuleBasedBreakIterator::RuleBasedBreakIterator(RBBIDataHeader* data, UErrorCode &status)
  65 {
  66     init();
  67     fData = new RBBIDataWrapper(data, status); // status checked in constructor
  68     if (U_FAILURE(status)) {return;}
  69     if(fData == 0) {
  70         status = U_MEMORY_ALLOCATION_ERROR;
  71         return;
  72     }
  73 }
  74
  75 /**
  76  * Same as above but does not adopt memory
  77  */
  78 RuleBasedBreakIterator::RuleBasedBreakIterator(const RBBIDataHeader* data, enum EDontAdopt, UErrorCode &status)
  79 {
  80     init();
  81     fData = new RBBIDataWrapper(data, RBBIDataWrapper::kDontAdopt, status); // status checked in constructor
  82     if (U_FAILURE(status)) {return;}
  83     if(fData == 0) {
  84         status = U_MEMORY_ALLOCATION_ERROR;
  85         return;
  86     }
  87 }
  88
  89
  90 //
  91 //  Construct from precompiled binary rules (tables).  This constructor is public API,
  92 //  taking the rules as a (const uint8_t *) to match the type produced by getBinaryRules().
  93 //
  94 RuleBasedBreakIterator::RuleBasedBreakIterator(const uint8_t *compiledRules,
  95                        uint32_t       ruleLength,
  96                        UErrorCode     &status) {
  97     init();
  98     if (U_FAILURE(status)) {
  99         return;
 100     }
 101     if (compiledRules == NULL || ruleLength < sizeof(RBBIDataHeader)) {
 102         status = U_ILLEGAL_ARGUMENT_ERROR;
 103         return;
 104     }
 105     const RBBIDataHeader *data = (const RBBIDataHeader *)compiledRules;
 106     if (data->fLength > ruleLength) {
 107         status = U_ILLEGAL_ARGUMENT_ERROR;
 108         return;
 109     }
 110     fData = new RBBIDataWrapper(data, RBBIDataWrapper::kDontAdopt, status);
 111     if (U_FAILURE(status)) {return;}
 112     if(fData == 0) {
 113         status = U_MEMORY_ALLOCATION_ERROR;
 114         return;
 115     }
 116 }
 117
 118
 119 //-------------------------------------------------------------------------------
 120 //
 121 //   Constructor   from a UDataMemory handle to precompiled break rules
 122 //                 stored in an ICU data file.
 123 //
 124 //-------------------------------------------------------------------------------
 125 RuleBasedBreakIterator::RuleBasedBreakIterator(UDataMemory* udm, UErrorCode &status)
 126 {
 127     init();
 128     fData = new RBBIDataWrapper(udm, status); // status checked in constructor
 129     if (U_FAILURE(status)) {return;}
 130     if(fData == 0) {
 131         status = U_MEMORY_ALLOCATION_ERROR;
 132         return;
 133     }
 134 }
 135
 136
 137
 138 //-------------------------------------------------------------------------------
 139 //
 140 //   Constructor       from a set of rules supplied as a string.
 141 //
 142 //-------------------------------------------------------------------------------
 143 RuleBasedBreakIterator::RuleBasedBreakIterator( const UnicodeString  &rules,
 144                                                 UParseError          &parseError,
 145                                                 UErrorCode           &status)
 146 {
 147     init();
 148     if (U_FAILURE(status)) {return;}
 149     RuleBasedBreakIterator *bi = (RuleBasedBreakIterator *)
 150         RBBIRuleBuilder::createRuleBasedBreakIterator(rules, &parseError, status);
 151     // Note:  This is a bit awkward.  The RBBI ruleBuilder has a factory method that
 152     //        creates and returns a complete RBBI.  From here, in a constructor, we
 153     //        can't just return the object created by the builder factory, hence
 154     //        the assignment of the factory created object to "this".
 155     if (U_SUCCESS(status)) {
 156         *this = *bi;
 157         delete bi;
 158     }
 159 }
 160
 161
 162 //-------------------------------------------------------------------------------
 163 //
 164 // Default Constructor.      Create an empty shell that can be set up later.
 165 //                           Used when creating a RuleBasedBreakIterator from a set
 166 //                           of rules.
 167 //-------------------------------------------------------------------------------
 168 RuleBasedBreakIterator::RuleBasedBreakIterator() {
 169     init();
 170 }
 171
 172
 173 //-------------------------------------------------------------------------------
 174 //
 175 //   Copy constructor.  Will produce a break iterator with the same behavior,
 176 //                      and which iterates over the same text, as the one passed in.
 177 //
 178 //-------------------------------------------------------------------------------
 179 RuleBasedBreakIterator::RuleBasedBreakIterator(const RuleBasedBreakIterator& other)
 180 : BreakIterator(other)
 181 {
 182     this->init();
 183     *this = other;
 184 }
 185
 186
 187 /**
 188  * Destructor
 189  */
 190 RuleBasedBreakIterator::~RuleBasedBreakIterator() {
 191     if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
 192         // fCharIter was adopted from the outside.
 193         delete fCharIter;
 194     }
 195     fCharIter = NULL;
 196     delete fSCharIter;
 197     fCharIter = NULL;
 198     delete fDCharIter;
 199     fDCharIter = NULL;
 200
 201     utext_close(fText);
 202
 203     if (fData != NULL) {
 204         fData->removeReference();
 205         fData = NULL;
 206     }
 207     if (fCachedBreakPositions) {
 208         uprv_free(fCachedBreakPositions);
 209         fCachedBreakPositions = NULL;
 210     }
 211     if (fLanguageBreakEngines) {
 212         delete fLanguageBreakEngines;
 213         fLanguageBreakEngines = NULL;
 214     }
 215     if (fUnhandledBreakEngine) {
 216         delete fUnhandledBreakEngine;
 217         fUnhandledBreakEngine = NULL;
 218     }
 219 }
 220
 221 /**
 222  * Assignment operator.  Sets this iterator to have the same behavior,
 223  * and iterate over the same text, as the one passed in.
 224  */
 225 RuleBasedBreakIterator&
 226 RuleBasedBreakIterator::operator=(const RuleBasedBreakIterator& that) {
 227     if (this == &that) {
 228         return *this;
 229     }
 230     fKeepAll = that.fKeepAll;
 231     reset();    // Delete break cache information
 232     fBreakType = that.fBreakType;
 233     if (fLanguageBreakEngines != NULL) {
 234         delete fLanguageBreakEngines;
 235         fLanguageBreakEngines = NULL;   // Just rebuild for now
 236     }
 237     // TODO: clone fLanguageBreakEngines from "that"
 238     UErrorCode status = U_ZERO_ERROR;
 239     fText = utext_clone(fText, that.fText, FALSE, TRUE, &status);
 240
 241     if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
 242         delete fCharIter;
 243     }
 244     fCharIter = NULL;
 245
 246     if (that.fCharIter != NULL ) {
 247         // This is a little bit tricky - it will intially appear that
 248         //  this->fCharIter is adopted, even if that->fCharIter was
 249         //  not adopted.  That's ok.
 250         fCharIter = that.fCharIter->clone();
 251     }
 252
 253     if (fData != NULL) {
 254         fData->removeReference();
 255         fData = NULL;
 256     }
 257     if (that.fData != NULL) {
 258         fData = that.fData->addReference();
 259     }
 260
 261     return *this;
 262 }
 263
 264
 265
 266 //-----------------------------------------------------------------------------
 267 //
 268 //    init()      Shared initialization routine.   Used by all the constructors.
 269 //                Initializes all fields, leaving the object in a consistent state.
 270 //
 271 //-----------------------------------------------------------------------------
 272 void RuleBasedBreakIterator::init() {
 273     UErrorCode  status    = U_ZERO_ERROR;
 274     fText                 = utext_openUChars(NULL, NULL, 0, &status);
 275     fCharIter             = NULL;
 276     fSCharIter            = NULL;
 277     fDCharIter            = NULL;
 278     fData                 = NULL;
 279     fLastRuleStatusIndex  = 0;
 280     fLastStatusIndexValid = TRUE;
 281     fDictionaryCharCount  = 0;
 282     fBreakType            = UBRK_WORD;  // Defaulting BreakType to word gives reasonable
 283                                         //   dictionary behavior for Break Iterators that are
 284                                         //   built from rules.  Even better would be the ability to
 285                                         //   declare the type in the rules.
 286
 287     fCachedBreakPositions    = NULL;
 288     fLanguageBreakEngines    = NULL;
 289     fUnhandledBreakEngine    = NULL;
 290     fNumCachedBreakPositions = 0;
 291     fPositionInCache         = 0;
 292
 293 #ifdef RBBI_DEBUG
 294     static UBool debugInitDone = FALSE;
 295     if (debugInitDone == FALSE) {
 296         char *debugEnv = getenv("U_RBBIDEBUG");
 297         if (debugEnv && uprv_strstr(debugEnv, "trace")) {
 298             fTrace = TRUE;
 299         }
 300         debugInitDone = TRUE;
 301     }
 302 #endif
 303 }
 304
 305
 306
 307 //-----------------------------------------------------------------------------
 308 //
 309 //    clone - Returns a newly-constructed RuleBasedBreakIterator with the same
 310 //            behavior, and iterating over the same text, as this one.
 311 //            Virtual function: does the right thing with subclasses.
 312 //
 313 //-----------------------------------------------------------------------------
 314 BreakIterator*
 315 RuleBasedBreakIterator::clone(void) const {
 316     return new RuleBasedBreakIterator(*this);
 317 }
 318
 319 /**
 320  * Equality operator.  Returns TRUE if both BreakIterators are of the
 321  * same class, have the same behavior, and iterate over the same text.
 322  */
 323 UBool
 324 RuleBasedBreakIterator::operator==(const BreakIterator& that) const {
 325     if (typeid(*this) != typeid(that)) {
 326         return FALSE;
 327     }
 328
 329     const RuleBasedBreakIterator& that2 = (const RuleBasedBreakIterator&) that;
 330     if (that2.fKeepAll != fKeepAll) {
 331         return FALSE;
 332     }
 333
 334     if (!utext_equals(fText, that2.fText)) {
 335         // The two break iterators are operating on different text,
 336         //   or have a different interation position.
 337         return FALSE;
 338     };
 339
 340     // TODO:  need a check for when in a dictionary region at different offsets.
 341
 342     if (that2.fData == fData ||
 343         (fData != NULL && that2.fData != NULL && *that2.fData == *fData)) {
 344             // The two break iterators are using the same rules.
 345             return TRUE;
 346         }
 347     return FALSE;
 348 }
 349
 350 /**
 351  * Compute a hash code for this BreakIterator
 352  * @return A hash code
 353  */
 354 int32_t
 355 RuleBasedBreakIterator::hashCode(void) const {
 356     int32_t   hash = 0;
 357     if (fData != NULL) {
 358         hash = fData->hashCode();
 359     }
 360     return hash;
 361 }
 362
 363
 364 void RuleBasedBreakIterator::setText(UText *ut, UErrorCode &status) {
 365     if (U_FAILURE(status)) {
 366         return;
 367     }
 368     reset();
 369     fText = utext_clone(fText, ut, FALSE, TRUE, &status);
 370
 371     // Set up a dummy CharacterIterator to be returned if anyone
 372     //   calls getText().  With input from UText, there is no reasonable
 373     //   way to return a characterIterator over the actual input text.
 374     //   Return one over an empty string instead - this is the closest
 375     //   we can come to signaling a failure.
 376     //   (GetText() is obsolete, this failure is sort of OK)
 377     if (fDCharIter == NULL) {
 378         static const UChar c = 0;
 379         fDCharIter = new UCharCharacterIterator(&c, 0);
 380         if (fDCharIter == NULL) {
 381             status = U_MEMORY_ALLOCATION_ERROR;
 382             return;
 383         }
 384     }
 385
 386     if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
 387         // existing fCharIter was adopted from the outside.  Delete it now.
 388         delete fCharIter;
 389     }
 390     fCharIter = fDCharIter;
 391
 392     this->first();
 393 }
 394
 395
 396 UText *RuleBasedBreakIterator::getUText(UText *fillIn, UErrorCode &status) const {
 397     UText *result = utext_clone(fillIn, fText, FALSE, TRUE, &status);
 398     return result;
 399 }
 400
 401
 402
 403 /**
 404  * Returns the description used to create this iterator
 405  */
 406 const UnicodeString&
 407 RuleBasedBreakIterator::getRules() const {
 408     if (fData != NULL) {
 409         return fData->getRuleSourceString();
 410     } else {
 411         static const UnicodeString *s;
 412         if (s == NULL) {
 413             // TODO:  something more elegant here.
 414             //        perhaps API should return the string by value.
 415             //        Note:  thread unsafe init & leak are semi-ok, better than
 416             //               what was before.  Sould be cleaned up, though.
 417             s = new UnicodeString;
 418         }
 419         return *s;
 420     }
 421 }
 422
 423 //=======================================================================
 424 // BreakIterator overrides
 425 //=======================================================================
 426
 427 /**
 428  * Return a CharacterIterator over the text being analyzed.
 429  */
 430 CharacterIterator&
 431 RuleBasedBreakIterator::getText() const {
 432     return *fCharIter;
 433 }
 434
 435 /**
 436  * Set the iterator to analyze a new piece of text.  This function resets
 437  * the current iteration position to the beginning of the text.
 438  * @param newText An iterator over the text to analyze.
 439  */
 440 void
 441 RuleBasedBreakIterator::adoptText(CharacterIterator* newText) {
 442     // If we are holding a CharacterIterator adopted from a
 443     //   previous call to this function, delete it now.
 444     if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
 445         delete fCharIter;
 446     }
 447
 448     fCharIter = newText;
 449     UErrorCode status = U_ZERO_ERROR;
 450     reset();
 451     if (newText==NULL || newText->startIndex() != 0) {
 452         // startIndex !=0 wants to be an error, but there's no way to report it.
 453         // Make the iterator text be an empty string.
 454         fText = utext_openUChars(fText, NULL, 0, &status);
 455     } else {
 456         fText = utext_openCharacterIterator(fText, newText, &status);
 457     }
 458     this->first();
 459 }
 460
 461 /**
 462  * Set the iterator to analyze a new piece of text.  This function resets
 463  * the current iteration position to the beginning of the text.
 464  * @param newText An iterator over the text to analyze.
 465  */
 466 void
 467 RuleBasedBreakIterator::setText(const UnicodeString& newText) {
 468     UErrorCode status = U_ZERO_ERROR;
 469     reset();
 470     fText = utext_openConstUnicodeString(fText, &newText, &status);
 471
 472     // Set up a character iterator on the string.
 473     //   Needed in case someone calls getText().
 474     //  Can not, unfortunately, do this lazily on the (probably never)
 475     //  call to getText(), because getText is const.
 476     if (fSCharIter == NULL) {
 477         fSCharIter = new StringCharacterIterator(newText);
 478     } else {
 479         fSCharIter->setText(newText);
 480     }
 481
 482     if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
 483         // old fCharIter was adopted from the outside.  Delete it.
 484         delete fCharIter;
 485     }
 486     fCharIter = fSCharIter;
 487
 488     this->first();
 489 }
 490
 491
 492 /**
 493  *  Provide a new UText for the input text.  Must reference text with contents identical
 494  *  to the original.
 495  *  Intended for use with text data originating in Java (garbage collected) environments
 496  *  where the data may be moved in memory at arbitrary times.
 497  */
 498 RuleBasedBreakIterator &RuleBasedBreakIterator::refreshInputText(UText *input, UErrorCode &status) {
 499     if (U_FAILURE(status)) {
 500         return *this;
 501     }
 502     if (input == NULL) {
 503         status = U_ILLEGAL_ARGUMENT_ERROR;
 504         return *this;
 505     }
 506     int64_t pos = utext_getNativeIndex(fText);
 507     //  Shallow read-only clone of the new UText into the existing input UText
 508     fText = utext_clone(fText, input, FALSE, TRUE, &status);
 509     if (U_FAILURE(status)) {
 510         return *this;
 511     }
 512     utext_setNativeIndex(fText, pos);
 513     if (utext_getNativeIndex(fText) != pos) {
 514         // Sanity check.  The new input utext is supposed to have the exact same
 515         // contents as the old.  If we can't set to the same position, it doesn't.
 516         // The contents underlying the old utext might be invalid at this point,
 517         // so it's not safe to check directly.
 518         status = U_ILLEGAL_ARGUMENT_ERROR;
 519     }
 520     return *this;
 521 }
 522
 523
 524 /**
 525  * Sets the current iteration position to the beginning of the text, position zero.
 526  * @return The new iterator position, which is zero.
 527  */
 528 int32_t RuleBasedBreakIterator::first(void) {
 529     reset();
 530     fLastRuleStatusIndex  = 0;
 531     fLastStatusIndexValid = TRUE;
 532     //if (fText == NULL)
 533     //    return BreakIterator::DONE;
 534
 535     utext_setNativeIndex(fText, 0);
 536     return 0;
 537 }
 538
 539 /**
 540  * Sets the current iteration position to the end of the text.
 541  * @return The text's past-the-end offset.
 542  */
 543 int32_t RuleBasedBreakIterator::last(void) {
 544     reset();
 545     if (fText == NULL) {
 546         fLastRuleStatusIndex  = 0;
 547         fLastStatusIndexValid = TRUE;
 548         return BreakIterator::DONE;
 549     }
 550
 551     fLastStatusIndexValid = FALSE;
 552     int32_t pos = (int32_t)utext_nativeLength(fText);
 553     utext_setNativeIndex(fText, pos);
 554     return pos;
 555 }
 556
 557 /**
 558  * Advances the iterator either forward or backward the specified number of steps.
 559  * Negative values move backward, and positive values move forward.  This is
 560  * equivalent to repeatedly calling next() or previous().
 561  * @param n The number of steps to move.  The sign indicates the direction
 562  * (negative is backwards, and positive is forwards).
 563  * @return The character offset of the boundary position n boundaries away from
 564  * the current one.
 565  */
 566 int32_t RuleBasedBreakIterator::next(int32_t n) {
 567     int32_t result = current();
 568     while (n > 0) {
 569         result = next();
 570         --n;
 571     }
 572     while (n < 0) {
 573         result = previous();
 574         ++n;
 575     }
 576     return result;
 577 }
 578
 579 /**
 580  * Advances the iterator to the next boundary position.
 581  * @return The position of the first boundary after this one.
 582  */
 583 int32_t RuleBasedBreakIterator::next(void) {
 584     // if we have cached break positions and we're still in the range
 585     // covered by them, just move one step forward in the cache
 586     if (fCachedBreakPositions != NULL) {
 587         if (fPositionInCache < fNumCachedBreakPositions - 1) {
 588             ++fPositionInCache;
 589             int32_t pos = fCachedBreakPositions[fPositionInCache];
 590             utext_setNativeIndex(fText, pos);
 591             return pos;
 592         }
 593         else {
 594             reset();
 595         }
 596     }
 597
 598     int32_t startPos = current();
 599     fDictionaryCharCount = 0;
 600     int32_t result = handleNext(fData->fForwardTable);
 601     while (fKeepAll) {
 602         UChar32 prevChr = utext_char32At(fText, result-1);
 603         UChar32 currChr = utext_char32At(fText, result);
 604         if (currChr == U_SENTINEL || prevChr == U_SENTINEL || !u_isalpha(currChr) || !u_isalpha(prevChr)) {
 605             break;
 606         }
 607         int32_t nextResult = handleNext(fData->fForwardTable);
 608         if (nextResult <= result) {
 609             break;
 610         }
 611         result = nextResult;
 612     }
 613     if (fDictionaryCharCount > 0) {
 614         result = checkDictionary(startPos, result, FALSE);
 615     }
 616     return result;
 617 }
 618
 619 /**
 620  * Advances the iterator backwards, to the last boundary preceding this one.
 621  * @return The position of the last boundary position preceding this one.
 622  */
 623 int32_t RuleBasedBreakIterator::previous(void) {
 624     int32_t result;
 625     int32_t startPos;
 626
 627     // if we have cached break positions and we're still in the range
 628     // covered by them, just move one step backward in the cache
 629     if (fCachedBreakPositions != NULL) {
 630         if (fPositionInCache > 0) {
 631             --fPositionInCache;
 632             // If we're at the beginning of the cache, need to reevaluate the
 633             // rule status
 634             if (fPositionInCache <= 0) {
 635                 fLastStatusIndexValid = FALSE;
 636             }
 637             int32_t pos = fCachedBreakPositions[fPositionInCache];
 638             utext_setNativeIndex(fText, pos);
 639             return pos;
 640         }
 641         else {
 642             reset();
 643         }
 644     }
 645
 646     // if we're already sitting at the beginning of the text, return DONE
 647     if (fText == NULL || (startPos = current()) == 0) {
 648         fLastRuleStatusIndex  = 0;
 649         fLastStatusIndexValid = TRUE;
 650         return BreakIterator::DONE;
 651     }
 652
 653     if (fData->fSafeRevTable != NULL || fData->fSafeFwdTable != NULL) {
 654         result = handlePrevious(fData->fReverseTable);
 655         while (fKeepAll) {
 656             UChar32 prevChr = utext_char32At(fText, result-1);
 657             UChar32 currChr = utext_char32At(fText, result);
 658             if (currChr == U_SENTINEL || prevChr == U_SENTINEL || !u_isalpha(currChr) || !u_isalpha(prevChr)) {
 659                 break;
 660             }
 661             int32_t prevResult = handlePrevious(fData->fReverseTable);
 662             if (prevResult >= result) {
 663                 break;
 664             }
 665             result = prevResult;
 666         }
 667         if (fDictionaryCharCount > 0) {
 668             result = checkDictionary(result, startPos, TRUE);
 669         }
 670         return result;
 671     }
 672
 673     // old rule syntax
 674     // set things up.  handlePrevious() will back us up to some valid
 675     // break position before the current position (we back our internal
 676     // iterator up one step to prevent handlePrevious() from returning
 677     // the current position), but not necessarily the last one before
 678     // where we started
 679
 680     int32_t start = current();
 681
 682     (void)UTEXT_PREVIOUS32(fText);
 683     int32_t lastResult    = handlePrevious(fData->fReverseTable);
 684     if (lastResult == UBRK_DONE) {
 685         lastResult = 0;
 686         utext_setNativeIndex(fText, 0);
 687     }
 688     result = lastResult;
 689     int32_t lastTag       = 0;
 690     UBool   breakTagValid = FALSE;
 691
 692     // iterate forward from the known break position until we pass our
 693     // starting point.  The last break position before the starting
 694     // point is our return value
 695
 696     for (;;) {
 697         result         = next();
 698         if (result == BreakIterator::DONE || result >= start) {
 699             break;
 700         }
 701         lastResult     = result;
 702         lastTag        = fLastRuleStatusIndex;
 703         breakTagValid  = TRUE;
 704     }
 705
 706     // fLastBreakTag wants to have the value for section of text preceding
 707     // the result position that we are to return (in lastResult.)  If
 708     // the backwards rules overshot and the above loop had to do two or more
 709     // next()s to move up to the desired return position, we will have a valid
 710     // tag value. But, if handlePrevious() took us to exactly the correct result position,
 711     // we wont have a tag value for that position, which is only set by handleNext().
 712
 713     // Set the current iteration position to be the last break position
 714     // before where we started, and then return that value.
 715     utext_setNativeIndex(fText, lastResult);
 716     fLastRuleStatusIndex  = lastTag;       // for use by getRuleStatus()
 717     fLastStatusIndexValid = breakTagValid;
 718
 719     // No need to check the dictionary; it will have been handled by
 720     // next()
 721
 722     return lastResult;
 723 }
 724
 725 /**
 726  * Sets the iterator to refer to the first boundary position following
 727  * the specified position.
 728  * @offset The position from which to begin searching for a break position.
 729  * @return The position of the first break after the current position.
 730  */
 731 int32_t RuleBasedBreakIterator::following(int32_t offset) {
 732     // if the offset passed in is already past the end of the text,
 733     // just return DONE; if it's before the beginning, return the
 734     // text's starting offset
 735     if (fText == NULL || offset >= utext_nativeLength(fText)) {
 736         last();
 737         return next();
 738     }
 739     else if (offset < 0) {
 740         return first();
 741     }
 742
 743     // Move requested offset to a code point start. It might be on a trail surrogate,
 744     // or on a trail byte if the input is UTF-8.
 745     utext_setNativeIndex(fText, offset);
 746     offset = (int32_t)utext_getNativeIndex(fText);
 747
 748     // if we have cached break positions and offset is in the range
 749     // covered by them, use them
 750     // TODO: could use binary search
 751     // TODO: what if offset is outside range, but break is not?
 752     if (fCachedBreakPositions != NULL) {
 753         if (offset >= fCachedBreakPositions[0]
 754                 && offset < fCachedBreakPositions[fNumCachedBreakPositions - 1]) {
 755             fPositionInCache = 0;
 756             // We are guaranteed not to leave the array due to range test above
 757             while (offset >= fCachedBreakPositions[fPositionInCache]) {
 758                 ++fPositionInCache;
 759             }
 760             int32_t pos = fCachedBreakPositions[fPositionInCache];
 761             utext_setNativeIndex(fText, pos);
 762             return pos;
 763         }
 764         else {
 765             reset();
 766         }
 767     }
 768
 769     // Set our internal iteration position (temporarily)
 770     // to the position passed in.  If this is the _beginning_ position,
 771     // then we can just use next() to get our return value
 772
 773     int32_t result = 0;
 774
 775     if (fData->fSafeRevTable != NULL) {
 776         // new rule syntax
 777         utext_setNativeIndex(fText, offset);
 778         // move forward one codepoint to prepare for moving back to a
 779         // safe point.
 780         // this handles offset being between a supplementary character
 781         // TODO: is this still needed, with move to code point boundary handled above?
 782         (void)UTEXT_NEXT32(fText);
 783         // handlePrevious will move most of the time to < 1 boundary away
 784         handlePrevious(fData->fSafeRevTable);
 785         int32_t result = next();
 786         while (result <= offset) {
 787             result = next();
 788         }
 789         return result;
 790     }
 791     if (fData->fSafeFwdTable != NULL) {
 792         // backup plan if forward safe table is not available
 793         utext_setNativeIndex(fText, offset);
 794         (void)UTEXT_PREVIOUS32(fText);
 795         // handle next will give result >= offset
 796         handleNext(fData->fSafeFwdTable);
 797         // previous will give result 0 or 1 boundary away from offset,
 798         // most of the time
 799         // we have to
 800         int32_t oldresult = previous();
 801         while (oldresult > offset) {
 802             int32_t result = previous();
 803             if (result <= offset) {
 804                 return oldresult;
 805             }
 806             oldresult = result;
 807         }
 808         int32_t result = next();
 809         if (result <= offset) {
 810             return next();
 811         }
 812         return result;
 813     }
 814     // otherwise, we have to sync up first.  Use handlePrevious() to back
 815     // up to a known break position before the specified position (if
 816     // we can determine that the specified position is a break position,
 817     // we don't back up at all).  This may or may not be the last break
 818     // position at or before our starting position.  Advance forward
 819     // from here until we've passed the starting position.  The position
 820     // we stop on will be the first break position after the specified one.
 821     // old rule syntax
 822
 823     utext_setNativeIndex(fText, offset);
 824     if (offset==0 ||
 825         (offset==1  && utext_getNativeIndex(fText)==0)) {
 826         return next();
 827     }
 828     result = previous();
 829
 830     while (result != BreakIterator::DONE && result <= offset) {
 831         result = next();
 832     }
 833
 834     return result;
 835 }
 836
 837 /**
 838  * Sets the iterator to refer to the last boundary position before the
 839  * specified position.
 840  * @param offset The position to begin searching for a break from.
 841  * @return The position of the last boundary before the starting position.
      *         When there is no text, or offset is past the end of the text, the
      *         result of last() is returned instead; when offset is negative the
      *         result of first() is returned.
 842  */
 843 int32_t RuleBasedBreakIterator::preceding(int32_t offset) {
 844     // if there is no text, or the offset passed in is already past the
 845     // end of the text, defer to last(); if it's before the beginning,
 846     // defer to first()
 847     if (fText == NULL || offset > utext_nativeLength(fText)) {
 848         return last();
 849     }
 850     else if (offset < 0) {
 851         return first();
 852     }
 853
 854     // Move requested offset to a code point start. It might be on a trail surrogate,
 855     // or on a trail byte if the input is UTF-8.
 856     utext_setNativeIndex(fText, offset);
 857     offset = (int32_t)utext_getNativeIndex(fText);
 858
 859     // if we have cached break positions and offset is in the range
 860     // covered by them, use them
 861     if (fCachedBreakPositions != NULL) {
 862         // TODO: binary search?
 863         // TODO: What if offset is outside range, but break is not?
 864         if (offset > fCachedBreakPositions[0]
 865                 && offset <= fCachedBreakPositions[fNumCachedBreakPositions - 1]) {
                 // Linear scan: advance to the first cached position that is not
                 // below offset, then step back one entry to the last one below it.
 866             fPositionInCache = 0;
 867             while (fPositionInCache < fNumCachedBreakPositions
 868                    && offset > fCachedBreakPositions[fPositionInCache])
 869                 ++fPositionInCache;
 870             --fPositionInCache;
 871             // If we're at the beginning of the cache, need to reevaluate the
 872             // rule status
 873             if (fPositionInCache <= 0) {
 874                 fLastStatusIndexValid = FALSE;
 875             }
 876             utext_setNativeIndex(fText, fCachedBreakPositions[fPositionInCache]);
 877             return fCachedBreakPositions[fPositionInCache];
 878         }
 879         else {
                 // offset is not covered by the cache: discard the cache and
                 // fall through to the rule-driven paths below.
 880             reset();
 881         }
 882     }
 883
 884     // if we start by updating the current iteration position to the
 885     // position specified by the caller, we can just use previous()
 886     // to carry out this operation
 887
 888     if (fData->fSafeFwdTable != NULL) {
 889         // new rule syntax
 890         utext_setNativeIndex(fText, offset);
 891         int32_t newOffset = (int32_t)UTEXT_GETNATIVEINDEX(fText);
             // NOTE(review): offset was already re-read from the text index near the
             // top of this function, so this branch looks unreachable unless setting
             // the index is not idempotent -- confirm before relying on it.
 892         if (newOffset != offset) {
 893             // Will come here if specified offset was not a code point boundary AND
 894             //   the underlying implementation is using UText, which snaps any non-code-point-boundary
 895             //   indices to the containing code point.
 896             // For BreakIterator::preceding only, these non-code-point indices need to be moved
 897             //   up to refer to the following codepoint.
 898             (void)UTEXT_NEXT32(fText);
 899             offset = (int32_t)UTEXT_GETNATIVEINDEX(fText);
 900         }
 901
 902         // TODO:  (synwee) would it be better to just check for being in the middle of a surrogate pair,
 903         //        rather than adjusting the position unconditionally?
 904         //        (Change would interact with safe rules.)
 905         // TODO:  change RBBI behavior for off-boundary indices to match that of UText?
 906         //        affects only preceding(), seems cleaner, but is slightly different.
             // Back up one code point, run the forward safe table from there, then
             // step backwards with previous() until the position is strictly
             // before offset.
 907         (void)UTEXT_PREVIOUS32(fText);
 908         handleNext(fData->fSafeFwdTable);
 909         int32_t result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
 910         while (result >= offset) {
 911             result = previous();
 912         }
 913         return result;
 914     }
 915     if (fData->fSafeRevTable != NULL) {
 916         // backup plan if forward safe table is not available
 917         //  TODO:  check whether this path can be discarded
 918         //         It's probably OK to say that rules must supply both safe tables
 919         //            if they use safe tables at all.  We have certainly never described
 920         //            to anyone how to work with just one safe table.
 921         utext_setNativeIndex(fText, offset);
 922         (void)UTEXT_NEXT32(fText);
 923
 924         // handle previous will give result <= offset
 925         handlePrevious(fData->fSafeRevTable);
 926
 927         // next will give result 0 or 1 boundary away from offset,
 928         // most of the time
 929         // we have to walk forward until a boundary at or past offset shows up;
             // the boundary seen just before that one is the answer.
 930         int32_t oldresult = next();
 931         while (oldresult < offset) {
 932             int32_t result = next();
 933             if (result >= offset) {
 934                 return oldresult;
 935             }
 936             oldresult = result;
 937         }
             // The first next() already landed at or past offset: step back with
             // previous(), and once more if that is still not before offset.
 938         int32_t result = previous();
 939         if (result >= offset) {
 940             return previous();
 941         }
 942         return result;
 943     }
 944
 945     // old rule syntax
 946     utext_setNativeIndex(fText, offset);
 947     return previous();
 948 }
 949
 950 /**
 951  * Returns true if the specified position is a boundary position.  As a side
 952  * effect, leaves the iterator pointing to the first boundary position at
 953  * or after "offset".  For offsets that are zero or negative the iterator is
      * moved with first(); for offsets at or beyond the text length it is moved
      * with last().
 954  * @param offset the offset to check.
 955  * @return True if "offset" is a boundary position.
 956  */
 957 UBool RuleBasedBreakIterator::isBoundary(int32_t offset) {
 958     // the beginning index of the iterator is always a boundary position by definition
 959     if (offset == 0) {
 960         first();       // For side effects on current position, tag values.
 961         return TRUE;
 962     }
 963
         // the end of the text is likewise always reported as a boundary
 964     if (offset == (int32_t)utext_nativeLength(fText)) {
 965         last();       // For side effects on current position, tag values.
 966         return TRUE;
 967     }
 968
 969     // out-of-range indexes are never boundary positions
 970     if (offset < 0) {
 971         first();       // For side effects on current position, tag values.
 972         return FALSE;
 973     }
 974
 975     if (offset > utext_nativeLength(fText)) {
 976         last();        // For side effects on current position, tag values.
 977         return FALSE;
 978     }
 979
 980     // otherwise, we can use following() on the position before the specified
 981     // one and return true if the position we get back is the one the user
 982     // specified
         // backOne is the index the text reports after utext_previous32From(fText, offset).
 983     utext_previous32From(fText, offset);
 984     int32_t backOne = (int32_t)UTEXT_GETNATIVEINDEX(fText);
 985     UBool    result  = following(backOne) == offset;
 986     return result;
 987 }
 988
 989 /**
 990  * Returns the current iteration position.
      * The value is the native index reported for fText, narrowed to int32_t.
 991  * @return The current iteration position.
 992  */
 993 int32_t RuleBasedBreakIterator::current(void) const {
 994     int32_t  pos = (int32_t)UTEXT_GETNATIVEINDEX(fText);
 995     return pos;
 996 }
 997
 998 //=======================================================================
 999 // implementation
1000 //=======================================================================
1001
1002 //
1003 // RBBIRunMode  -  the state machine runs an extra iteration at the beginning and end
1004 //                 of user text.  A variable with this enum type keeps track of where we
1005 //                 are.  The state machine only fetches user input while in the RUN mode.
     //
     //                 handleNext() and handlePrevious() start in RBBI_START only when the
     //                 state table has the RBBI_BOF_REQUIRED flag set (otherwise they start
     //                 in RBBI_RUN), and switch to RBBI_END when U_SENTINEL is read.
1006 //
1007 enum RBBIRunMode {
1008     RBBI_START,     // state machine processing is before first char of input
1009     RBBI_RUN,       // state machine processing is in the user text
1010     RBBI_END        // state machine processing is after end of user text.
1011 };
1012
1013
1014 // Map from look-ahead break states (corresponds to rules) to boundary positions.
1015 // Allows multiple lookahead break rules to be in flight at the same time.
1016 //
1017 // This is a temporary approach for ICU 57. A better fix is to make the look-ahead numbers
1018 // in the state table be sequential, then we can just index an array. And the
1019 // table could also tell us in advance how big that array needs to be.
1020 //
1021 // Before ICU 57 there was just a single simple variable for a look-ahead match that
1022 // was in progress. Two rules at once did not work.
1023
     // Capacity of the map.  Both arrays in LookAheadResults are sized from this
     // constant so that the bounds check in setPosition() always matches the storage.
1024 static const int32_t kMaxLookaheads = 8;
1025 struct LookAheadResults {
1026     int32_t    fUsedSlotLimit;               // number of slots currently in use
1027     int32_t    fPositions[kMaxLookaheads];   // recorded text position, per slot
1028     int16_t    fKeys[kMaxLookaheads];        // look-ahead rule key, per slot
1029
1030     LookAheadResults() : fUsedSlotLimit(0), fPositions(), fKeys() {}
1031
         // Return the position recorded for key.  An unknown key trips U_ASSERT
         // and yields -1.
1032     int32_t getPosition(int16_t key) {
1033         for (int32_t i=0; i<fUsedSlotLimit; ++i) {
1034             if (fKeys[i] == key) {
1035                 return fPositions[i];
1036             }
1037         }
1038         U_ASSERT(FALSE);
1039         return -1;
1040     }
1041
         // Record position for key, overwriting any earlier entry for the same key.
         // When every slot is taken, U_ASSERT fires and the last slot is reused
         // instead of writing past the arrays.
1042     void setPosition(int16_t key, int32_t position) {
1043         int32_t i;
1044         for (i=0; i<fUsedSlotLimit; ++i) {
1045             if (fKeys[i] == key) {
1046                 fPositions[i] = position;
1047                 return;
1048             }
1049         }
1050         if (i >= kMaxLookaheads) {
1051             U_ASSERT(FALSE);
1052             i = kMaxLookaheads - 1;
1053         }
1054         fKeys[i] = key;
1055         fPositions[i] = position;
1056         U_ASSERT(fUsedSlotLimit == i);
1057         fUsedSlotLimit = i + 1;
1058     }
1059 };
1060
1061
1062 //-----------------------------------------------------------------------------------
1063 //
1064 //  handleNext(stateTable)
1065 //     This method is the actual implementation of the rbbi next() method.
1066 //     This method initializes the state machine to state 1
1067 //     and advances through the text character by character until we reach the end
1068 //     of the text or the state machine transitions to state 0.  We update our return
1069 //     value every time the state machine passes through an accepting state.
     //
     //     Returns BreakIterator::DONE when fData is NULL or the text is already at its
     //     end; otherwise returns the boundary position and leaves the text index there.
     //     The rule status is always marked valid (fLastStatusIndexValid = TRUE) and
     //     fLastRuleStatusIndex is updated from the accepting rows that are reached.
1070 //
1071 //-----------------------------------------------------------------------------------
1072 int32_t RuleBasedBreakIterator::handleNext(const RBBIStateTable *statetable) {
1073     int32_t             state;
1074     uint16_t            category        = 0;
1075     RBBIRunMode         mode;
1076
1077     RBBIStateTableRow  *row;
1078     UChar32             c;
1079     LookAheadResults    lookAheadMatches;
1080     int32_t             result             = 0;
1081     int32_t             initialPosition    = 0;
1082     const char         *tableData          = statetable->fTableData;
1083     uint32_t            tableRowLen        = statetable->fRowLen;
1084
1085     #ifdef RBBI_DEBUG
1086         if (fTrace) {
1087             RBBIDebugPuts("Handle Next   pos   char  state category");
1088         }
1089     #endif
1090
1091     // No matter what, handleNext always correctly sets the break tag value.
1092     fLastStatusIndexValid = TRUE;
1093     fLastRuleStatusIndex = 0;
1094
1095     // if we're already at the end of the text, return DONE.
1096     initialPosition = (int32_t)UTEXT_GETNATIVEINDEX(fText);
1097     result          = initialPosition;
1098     c               = UTEXT_NEXT32(fText);
1099     if (fData == NULL || c==U_SENTINEL) {
1100         return BreakIterator::DONE;
1101     }
1102
1103     //  Set the initial state for the state machine
1104     state = START_STATE;
1105     row = (RBBIStateTableRow *)
1106             //(statetable->fTableData + (statetable->fRowLen * state));
1107             (tableData + tableRowLen * state);
1108
1109
         // Tables flagged RBBI_BOF_REQUIRED get one extra iteration with the
         // beginning-of-input category (2) before any real character is processed.
1110     mode     = RBBI_RUN;
1111     if (statetable->fFlags & RBBI_BOF_REQUIRED) {
1112         category = 2;
1113         mode     = RBBI_START;
1114     }
1115
1116
1117     // loop until we reach the end of the text or transition to state 0
1118     //
1119     for (;;) {
1120         if (c == U_SENTINEL) {
1121             // Reached end of input string.
1122             if (mode == RBBI_END) {
1123                 // We have already run the loop one last time with the
1124                 //   character set to the pseudo {eof} value.  Now it is time
1125                 //   to unconditionally bail out.
1126                 break;
1127             }
1128             // Run the loop one last time with the fake end-of-input character category.
1129             mode = RBBI_END;
1130             category = 1;
1131         }
1132
1133         //
1134         // Get the char category.  An incoming category of 1 or 2 means that
1135         //      we are preset for doing the beginning or end of input, and
1136         //      that we shouldn't get a category from an actual text input character.
1137         //
1138         if (mode == RBBI_RUN) {
1139             // look up the current character's character category, which tells us
1140             // which column in the state table to look at.
1141             // Note:  the 16 in UTRIE_GET16 refers to the size of the data being returned,
1142             //        not the size of the character going in, which is a UChar32.
1143             //
1144             UTRIE_GET16(&fData->fTrie, c, category);
1145
1146             // Check the dictionary bit in the character's category.
1147             //    Counter is only used by dictionary based iterators (subclasses).
1148             //    Chars that need to be handled by a dictionary have a flag bit set
1149             //    in their category values.
1150             //
1151             if ((category & 0x4000) != 0)  {
1152                 fDictionaryCharCount++;
1153                 //  And off the dictionary flag bit.
1154                 category &= ~0x4000;
1155             }
1156         }
1157
1158         #ifdef RBBI_DEBUG
1159             if (fTrace) {
                     // utext_getNativeIndex() yields a 64-bit value; narrow it so the
                     // argument matches the %d conversion (same form as handlePrevious()).
1160                 RBBIDebugPrintf("             %4d   ", (int32_t)utext_getNativeIndex(fText));
1161                 if (0x20<=c && c<0x7f) {
1162                     RBBIDebugPrintf("\"%c\"  ", c);
1163                 } else {
1164                     RBBIDebugPrintf("%5x  ", c);
1165                 }
1166                 RBBIDebugPrintf("%3d  %3d\n", state, category);
1167             }
1168         #endif
1169
1170         // State Transition - move machine to its next state
1171         //
1172
1173         // Note: fNextState is defined as uint16_t[2], but we are casting
1174         // a generated RBBI table to RBBIStateTableRow and some tables
1175         // actually have more than 2 categories.
1176         U_ASSERT(category<fData->fHeader->fCatCount);
1177         state = row->fNextState[category];  /*Not accessing beyond memory*/
1178         row = (RBBIStateTableRow *)
1179             // (statetable->fTableData + (statetable->fRowLen * state));
1180             (tableData + tableRowLen * state);
1181
1182
1183         if (row->fAccepting == -1) {
1184             // Match found, common case.
                 // During the beginning-of-input iteration no character has been
                 // consumed yet, so the result position is left untouched.
1185             if (mode != RBBI_START) {
1186                 result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
1187             }
1188             fLastRuleStatusIndex = row->fTagIdx;   // Remember the break status (tag) values.
1189         }
1190
1191         int16_t completedRule = row->fAccepting;
1192         if (completedRule > 0) {
1193             // Lookahead match is completed.
                 // The boundary is the position recorded earlier for this rule,
                 // not the current text position.
1194             int32_t lookaheadResult = lookAheadMatches.getPosition(completedRule);
1195             if (lookaheadResult >= 0) {
1196                 fLastRuleStatusIndex = row->fTagIdx;
1197                 UTEXT_SETNATIVEINDEX(fText, lookaheadResult);
1198                 return lookaheadResult;
1199             }
1200         }
1201         int16_t rule = row->fLookAhead;
1202         if (rule != 0) {
1203             // At the position of a '/' in a look-ahead match. Record it.
1204             int32_t  pos = (int32_t)UTEXT_GETNATIVEINDEX(fText);
1205             lookAheadMatches.setPosition(rule, pos);
1206         }
1207
1208         if (state == STOP_STATE) {
1209             // This is the normal exit from the lookup state machine.
1210             // We have advanced through the string until it is certain that no
1211             //   longer match is possible, no matter what characters follow.
1212             break;
1213         }
1214
1215         // Advance to the next character.
1216         // If this is a beginning-of-input loop iteration, don't advance
1217         //    the input position.  The next iteration will be processing the
1218         //    first real input character.
1219         if (mode == RBBI_RUN) {
1220             c = UTEXT_NEXT32(fText);
1221         } else {
1222             if (mode == RBBI_START) {
1223                 mode = RBBI_RUN;
1224             }
1225         }
1226
1227
1228     }
1229
1230     // The state machine is done.  Check whether it found a match...
1231
1232     // If the iterator failed to advance in the match engine, force it ahead by one.
1233     //   (This really indicates a defect in the break rules.  They should always match
1234     //    at least one character.)
1235     if (result == initialPosition) {
1236         UTEXT_SETNATIVEINDEX(fText, initialPosition);
1237         UTEXT_NEXT32(fText);
1238         result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
1239     }
1240
1241     // Leave the iterator at our result position.
1242     UTEXT_SETNATIVEINDEX(fText, result);
1243     #ifdef RBBI_DEBUG
1244         if (fTrace) {
1245             RBBIDebugPrintf("result = %d\n\n", result);
1246         }
1247     #endif
1248     return result;
1249 }
1250
1251
1252
1253 //-----------------------------------------------------------------------------------
1254 //
1255 //  handlePrevious()
1256 //
1257 //      Iterate backwards, according to the logic of the reverse rules.
1258 //      This version handles the exact style backwards rules.
1259 //
1260 //      The logic of this function is very similar to handleNext(), above.
     //
     //      Returns BreakIterator::DONE when there is no text, no data, or the text
     //      index is already 0; otherwise returns the position found and leaves the
     //      text index there.  The rule status is always flagged as not valid.
1261 //
1262 //-----------------------------------------------------------------------------------
1263 int32_t RuleBasedBreakIterator::handlePrevious(const RBBIStateTable *statetable) {
1264     int32_t             state;
1265     uint16_t            category        = 0;
1266     RBBIRunMode         mode;
1267     RBBIStateTableRow  *row;
1268     UChar32             c;
1269     LookAheadResults    lookAheadMatches;
1270     int32_t             result          = 0;
1271     int32_t             initialPosition = 0;
1272
1273     #ifdef RBBI_DEBUG
1274         if (fTrace) {
1275             RBBIDebugPuts("Handle Previous   pos   char  state category");
1276         }
1277     #endif
1278
1279     // handlePrevious() never gets the rule status.
1280     // Flag the status as invalid; if the user ever asks for status, we will need
1281     // to back up, then re-find the break position using handleNext(), which does
1282     // get the status value.
1283     fLastStatusIndexValid = FALSE;
1284     fLastRuleStatusIndex = 0;
1285
1286     // if we're already at the start of the text, return DONE.
1287     if (fText == NULL || fData == NULL || UTEXT_GETNATIVEINDEX(fText)==0) {
1288         return BreakIterator::DONE;
1289     }
1290
1291     //  Set up the starting char.
1292     initialPosition = (int32_t)UTEXT_GETNATIVEINDEX(fText);
1293     result          = initialPosition;
1294     c               = UTEXT_PREVIOUS32(fText);
1295
1296     //  Set the initial state for the state machine
1297     state = START_STATE;
1298     row = (RBBIStateTableRow *)
1299             (statetable->fTableData + (statetable->fRowLen * state));
         // NOTE(review): this initial category of 3 appears to be replaced (by 2, by 1,
         // or by the trie lookup) before the first table lookup -- confirm it is unused.
1300     category = 3;
1301     mode     = RBBI_RUN;
1302     if (statetable->fFlags & RBBI_BOF_REQUIRED) {
1303         category = 2;
1304         mode     = RBBI_START;
1305     }
1306
1307
1308     // loop until we reach the start of the text or transition to state 0
1309     //
1310     for (;;) {
1311         if (c == U_SENTINEL) {
1312             // Ran out of input (moving backwards, this is the start of the text).
1313             if (mode == RBBI_END) {
1314                 // We have already run the loop one last time with the
1315                 //   character set to the pseudo {eof} value.  Now it is time
1316                 //   to unconditionally bail out.
1317                 if (result == initialPosition) {
1318                     // Ran off start, no match found.
1319                     // move one index one (towards the start, since we are doing a previous())
1320                     UTEXT_SETNATIVEINDEX(fText, initialPosition);
1321                     (void)UTEXT_PREVIOUS32(fText);   // TODO:  shouldn't be necessary.  We're already at beginning.  Check.
1322                 }
1323                 break;
1324             }
1325             // Run the loop one last time with the fake end-of-input character category.
1326             mode = RBBI_END;
1327             category = 1;
1328         }
1329
1330         //
1331         // Get the char category.  An incoming category of 1 or 2 means that
1332         //      we are preset for doing the beginning or end of input, and
1333         //      that we shouldn't get a category from an actual text input character.
1334         //
1335         if (mode == RBBI_RUN) {
1336             // look up the current character's character category, which tells us
1337             // which column in the state table to look at.
1338             // Note:  the 16 in UTRIE_GET16 refers to the size of the data being returned,
1339             //        not the size of the character going in, which is a UChar32.
1340             //
1341             UTRIE_GET16(&fData->fTrie, c, category);
1342
1343             // Check the dictionary bit in the character's category.
1344             //    Counter is only used by dictionary based iterators (subclasses).
1345             //    Chars that need to be handled by a dictionary have a flag bit set
1346             //    in their category values.
1347             //
1348             if ((category & 0x4000) != 0)  {
1349                 fDictionaryCharCount++;
1350                 //  And off the dictionary flag bit.
1351                 category &= ~0x4000;
1352             }
1353         }
1354
1355         #ifdef RBBI_DEBUG
1356             if (fTrace) {
1357                 RBBIDebugPrintf("             %4d   ", (int32_t)utext_getNativeIndex(fText));
1358                 if (0x20<=c && c<0x7f) {
1359                     RBBIDebugPrintf("\"%c\"  ", c);
1360                 } else {
1361                     RBBIDebugPrintf("%5x  ", c);
1362                 }
1363                 RBBIDebugPrintf("%3d  %3d\n", state, category);
1364             }
1365         #endif
1366
1367         // State Transition - move machine to its next state
1368         //
1369
1370         // Note: fNextState is defined as uint16_t[2], but we are casting
1371         // a generated RBBI table to RBBIStateTableRow and some tables
1372         // actually have more than 2 categories.
1373         U_ASSERT(category<fData->fHeader->fCatCount);
1374         state = row->fNextState[category];  /*Not accessing beyond memory*/
1375         row = (RBBIStateTableRow *)
1376             (statetable->fTableData + (statetable->fRowLen * state));
1377
1378         if (row->fAccepting == -1) {
1379             // Match found, common case.
1380             result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
1381         }
1382
1383         int16_t completedRule = row->fAccepting;
1384         if (completedRule > 0) {
1385             // Lookahead match is completed.
                 // Return the position recorded earlier for this rule rather than
                 // the current text position.
1386             int32_t lookaheadResult = lookAheadMatches.getPosition(completedRule);
1387             if (lookaheadResult >= 0) {
1388                 UTEXT_SETNATIVEINDEX(fText, lookaheadResult);
1389                 return lookaheadResult;
1390             }
1391         }
1392         int16_t rule = row->fLookAhead;
1393         if (rule != 0) {
1394             // At the position of a '/' in a look-ahead match. Record it.
1395             int32_t  pos = (int32_t)UTEXT_GETNATIVEINDEX(fText);
1396             lookAheadMatches.setPosition(rule, pos);
1397         }
1398
1399         if (state == STOP_STATE) {
1400             // This is the normal exit from the lookup state machine.
1401             // We have advanced through the string until it is certain that no
1402             //   longer match is possible, no matter what characters follow.
1403             break;
1404         }
1405
1406         // Move (backwards) to the next character to process.
1407         // If this is a beginning-of-input loop iteration, don't advance
1408         //    the input position.  The next iteration will be processing the
1409         //    first real input character.
1410         if (mode == RBBI_RUN) {
1411             c = UTEXT_PREVIOUS32(fText);
1412         } else {
1413             if (mode == RBBI_START) {
1414                 mode = RBBI_RUN;
1415             }
1416         }
1417     }
1418
1419     // The state machine is done.  Check whether it found a match...
1420
1421     // If the iterator failed to move in the match engine, force it back by one.
1422     //   (This really indicates a defect in the break rules.  They should always match
1423     //    at least one character.)
1424     if (result == initialPosition) {
1425         UTEXT_SETNATIVEINDEX(fText, initialPosition);
1426         UTEXT_PREVIOUS32(fText);
1427         result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
1428     }
1429
1430     // Leave the iterator at our result position.
1431     UTEXT_SETNATIVEINDEX(fText, result);
1432     #ifdef RBBI_DEBUG
1433         if (fTrace) {
1434             RBBIDebugPrintf("result = %d\n\n", result);
1435         }
1436     #endif
1437     return result;
1438 }
1439
1440
1441 void
1442 RuleBasedBreakIterator::reset()
1443 {
         // Discard the cached break positions: free the array if one is held and
         // return the cache bookkeeping (count, position in cache, dictionary
         // character count) to zero.
1444     if (fCachedBreakPositions) {
1445         uprv_free(fCachedBreakPositions);
1446     }
1447     fCachedBreakPositions = NULL;
1448     fNumCachedBreakPositions = 0;
1449     fDictionaryCharCount = 0;
1450     fPositionInCache = 0;
1451 }
1452
1453
1454
1455 //-------------------------------------------------------------------------------
1456 //
1457 //   makeRuleStatusValid()   Make sure fLastRuleStatusIndex holds the break rule
1458 //                     tag index for the current iterator position.  Called by
1459 //                     getRuleStatus() and getRuleStatusVec().  If the iterator
1460 //                     arrived at its current position by iterating forwards, the
     //                     value will have been cached by the handleNext() function.
1461 //
1462 //                     If no cached status value is available, the status is
1463 //                     found by doing a previous() followed by a next(), which
1464 //                     leaves the iterator where it started, and computes the
1465 //                     status while doing the next().
1466 //
1467 //-------------------------------------------------------------------------------
1468 void RuleBasedBreakIterator::makeRuleStatusValid() {
1469     if (fLastStatusIndexValid == FALSE) {
1470         //  No cached status is available.
1471         if (fText == NULL || current() == 0) {
1472             //  At start of text, or there is no text.  Status is always zero.
1473             fLastRuleStatusIndex = 0;
1474             fLastStatusIndexValid = TRUE;
1475         } else {
1476             //  Not at start of text.  Find status the tedious way.
1477             int32_t pa = current();
1478             previous();
1479             if (fNumCachedBreakPositions > 0) {
1480                 reset();                // Blow off the dictionary cache
1481             }
1482             int32_t pb = next();
1483             if (pa != pb) {
1484                 // note: the if (pa != pb) test is here only to eliminate warnings for
1485                 //       unused local variables on gcc.  Logically, it isn't needed.
1486                 U_ASSERT(pa == pb);
1487             }
1488         }
1489     }
         // Sanity check: the index must fall inside the rule status table.
1490     U_ASSERT(fLastRuleStatusIndex >= 0  &&  fLastRuleStatusIndex < fData->fStatusMaxIdx);
1491 }
1492
1493
1494 int32_t  RuleBasedBreakIterator::getRuleStatus() const {
         // Return the last status value of the status record for the current
         // position.  The method is const, but bringing the cached status index
         // up to date may move the iterator and rewrite members, hence the cast.
1495     RuleBasedBreakIterator *nonConstThis  = (RuleBasedBreakIterator *)this;
1496     nonConstThis->makeRuleStatusValid();
1497
1498     // fLastRuleStatusIndex indexes to the start of the appropriate status record
1499     //                                                 (the number of status values.)
1500     //   This function returns the last (largest) of the array of status values.
1501     int32_t  idx = fLastRuleStatusIndex + fData->fRuleStatusTable[fLastRuleStatusIndex];
1502     int32_t  tagVal = fData->fRuleStatusTable[idx];
1503
1504     return tagVal;
1505 }
1506
1507
1508
1509
1510 int32_t RuleBasedBreakIterator::getRuleStatusVec(
1511              int32_t *fillInVec, int32_t capacity, UErrorCode &status)
1512 {
         // Copy the status values of the current position's status record into
         // fillInVec.
         //   fillInVec  destination array; at most 'capacity' entries are written.
         //   capacity   number of entries available in fillInVec.
         //   status     left untouched on success; set to U_BUFFER_OVERFLOW_ERROR
         //              when the record holds more values than 'capacity'.
         //   returns    0 if status already indicates failure on entry; otherwise
         //              the total number of values in the record, which may exceed
         //              the number actually copied.
1513     if (U_FAILURE(status)) {
1514         return 0;
1515     }
1516
1517     RuleBasedBreakIterator *nonConstThis  = (RuleBasedBreakIterator *)this;
1518     nonConstThis->makeRuleStatusValid();
         // The entry at fLastRuleStatusIndex is the value count; the values follow it.
1519     int32_t  numVals = fData->fRuleStatusTable[fLastRuleStatusIndex];
1520     int32_t  numValsToCopy = numVals;
1521     if (numVals > capacity) {
1522         status = U_BUFFER_OVERFLOW_ERROR;
1523         numValsToCopy = capacity;
1524     }
1525     int i;
1526     for (i=0; i<numValsToCopy; i++) {
1527         fillInVec[i] = fData->fRuleStatusTable[fLastRuleStatusIndex + i + 1];
1528     }
1529     return numVals;
1530 }
1531
1532
1533
1534 //-------------------------------------------------------------------------------
1535 //
1536 //   getBinaryRules        Access to the compiled form of the rules,
1537 //                         for use by build system tools that save the data
1538 //                         for standard iterator types.
     //
     //                         length   output: set to fHeader->fLength, or to 0 when
     //                                  fData is NULL.
     //                         returns  fData->fHeader viewed as bytes, or NULL when
     //                                  fData is NULL.
1539 //
1540 //-------------------------------------------------------------------------------
1541 const uint8_t  *RuleBasedBreakIterator::getBinaryRules(uint32_t &length) {
1542     const uint8_t  *retPtr = NULL;
1543     length = 0;
1544
1545     if (fData != NULL) {
1546         retPtr = (const uint8_t *)fData->fHeader;
1547         length = fData->fHeader->fLength;
1548     }
1549     return retPtr;
1550 }
1551
1552
1553 BreakIterator *  RuleBasedBreakIterator::createBufferClone(void * /*stackBuffer*/,
1554                                    int32_t &bufferSize,
1555                                    UErrorCode &status)
1556 {
         // Clone this iterator.  The stack buffer argument is not used; the clone
         // always comes from clone().
         //   bufferSize  when 0 on entry, it is set to 1 and NULL is returned
         //               without cloning.
         //   status      nothing is done if it already indicates failure; set to
         //               U_MEMORY_ALLOCATION_ERROR when clone() returns NULL,
         //               otherwise to U_SAFECLONE_ALLOCATED_WARNING.
         //   returns     the clone, or NULL in the cases above.
1557     if (U_FAILURE(status)){
1558         return NULL;
1559     }
1560
1561     if (bufferSize == 0) {
1562         bufferSize = 1;  // preflighting for deprecated functionality
1563         return NULL;
1564     }
1565
1566     BreakIterator *clonedBI = clone();
1567     if (clonedBI == NULL) {
1568         status = U_MEMORY_ALLOCATION_ERROR;
1569     } else {
1570         status = U_SAFECLONE_ALLOCATED_WARNING;
1571     }
1572     return (RuleBasedBreakIterator *)clonedBI;
1573 }
1574
1575
1576 //-------------------------------------------------------------------------------
1577 //
1578 //  isDictionaryChar      Return true if the category lookup for this char
1579 //                        indicates that it is in the set of dictionary lookup
1580 //                        chars.
1581 //
1582 //                        This function is intended for use by dictionary based
1583 //                        break iterators.
1584 //
1585 //-------------------------------------------------------------------------------
1586 /*UBool RuleBasedBreakIterator::isDictionaryChar(UChar32   c) {
1587     if (fData == NULL) {
1588         return FALSE;
1589     }
1590     uint16_t category;
1591     UTRIE_GET16(&fData->fTrie, c, category);
1592     return (category & 0x4000) != 0;
1593 }*/
1594
1595
1596 //-------------------------------------------------------------------------------
1597 //
1598 //  checkDictionary       This function handles all processing of characters in
1599 //                        the "dictionary" set. It will determine the appropriate
1600 //                        course of action, and possibly set up a cache in the
1601 //                        process.
1602 //
1603 //-------------------------------------------------------------------------------
1604 int32_t RuleBasedBreakIterator::checkDictionary(int32_t startPos,
1605                             int32_t endPos,
1606                             UBool reverse) {
1607     // Reset the old break cache first.
1608     reset();
1609
1610     // note: code segment below assumes that dictionary chars are in the
1611     // startPos-endPos range
1612     // value returned should be next character in sequence
1613     if ((endPos - startPos) <= 1) {
1614         return (reverse ? startPos : endPos);
1615     }
1616
1617     // Starting from the starting point, scan towards the proposed result,
1618     // looking for the first dictionary character (which may be the one
1619     // we're on, if we're starting in the middle of a range).
1620     utext_setNativeIndex(fText, reverse ? endPos : startPos);
1621     if (reverse) {
1622         UTEXT_PREVIOUS32(fText);
1623     }
1624
1625     int32_t rangeStart = startPos;
1626     int32_t rangeEnd = endPos;
1627
1628     uint16_t    category;
1629     int32_t     current;
1630     UErrorCode  status = U_ZERO_ERROR;
1631     UStack      breaks(status);
1632     int32_t     foundBreakCount = 0;
1633     UChar32     c = utext_current32(fText);
1634
1635     UTRIE_GET16(&fData->fTrie, c, category);
1636
1637     // Is the character we're starting on a dictionary character? If so, we
1638     // need to back up to include the entire run; otherwise the results of
1639     // the break algorithm will differ depending on where we start. Since
1640     // the result is cached and there is typically a non-dictionary break
1641     // within a small number of words, there should be little performance impact.
1642     if (category & 0x4000) {
1643         if (reverse) {
1644             do {
1645                 utext_next32(fText);          // TODO:  recast to work directly with postincrement.
1646                 c = utext_current32(fText);
1647                 UTRIE_GET16(&fData->fTrie, c, category);
1648             } while (c != U_SENTINEL && (category & 0x4000));
1649             // Back up to the last dictionary character
1650             rangeEnd = (int32_t)UTEXT_GETNATIVEINDEX(fText);
1651             if (c == U_SENTINEL) {
1652                 // c = fText->last32();
1653                 //   TODO:  why was this if needed?
1654                 c = UTEXT_PREVIOUS32(fText);
1655             }
1656             else {
1657                 c = UTEXT_PREVIOUS32(fText);
1658             }
1659         }
1660         else {
1661             do {
1662                 c = UTEXT_PREVIOUS32(fText);
1663                 UTRIE_GET16(&fData->fTrie, c, category);
1664             }
1665             while (c != U_SENTINEL && (category & 0x4000));
1666             // Back up to the last dictionary character
1667             if (c == U_SENTINEL) {
1668                 // c = fText->first32();
1669                 c = utext_current32(fText);
1670             }
1671             else {
1672                 utext_next32(fText);
1673                 c = utext_current32(fText);
1674             }
1675             rangeStart = (int32_t)UTEXT_GETNATIVEINDEX(fText);;
1676         }
1677         UTRIE_GET16(&fData->fTrie, c, category);
1678     }
1679
1680     // Loop through the text, looking for ranges of dictionary characters.
1681     // For each span, find the appropriate break engine, and ask it to find
1682     // any breaks within the span.
1683     // Note: we always do this in the forward direction, so that the break
1684     // cache is built in the right order.
1685     if (reverse) {
1686         utext_setNativeIndex(fText, rangeStart);
1687         c = utext_current32(fText);
1688         UTRIE_GET16(&fData->fTrie, c, category);
1689     }
1690     while(U_SUCCESS(status)) {
1691         while((current = (int32_t)UTEXT_GETNATIVEINDEX(fText)) < rangeEnd && (category & 0x4000) == 0) {
1692             utext_next32(fText);           // TODO:  tweak for post-increment operation
1693             c = utext_current32(fText);
1694             UTRIE_GET16(&fData->fTrie, c, category);
1695         }
1696         if (current >= rangeEnd) {
1697             break;
1698         }
1699
1700         // We now have a dictionary character. Get the appropriate language object
1701         // to deal with it.
1702         const LanguageBreakEngine *lbe = getLanguageBreakEngine(c);
1703
1704         // Ask the language object if there are any breaks. It will leave the text
1705         // pointer on the other side of its range, ready to search for the next one.
1706         if (lbe != NULL) {
1707             foundBreakCount += lbe->findBreaks(fText, rangeStart, rangeEnd, FALSE, fBreakType, breaks);
1708         }
1709
1710         // Reload the loop variables for the next go-round
1711         c = utext_current32(fText);
1712         UTRIE_GET16(&fData->fTrie, c, category);
1713     }
1714
1715     // If we found breaks, build a new break cache. The first and last entries must
1716     // be the original starting and ending position.
1717     if (foundBreakCount > 0) {
1718         U_ASSERT(foundBreakCount == breaks.size());
1719         int32_t totalBreaks = foundBreakCount;
1720         if (startPos < breaks.elementAti(0)) {
1721             totalBreaks += 1;
1722         }
1723         if (endPos > breaks.peeki()) {
1724             totalBreaks += 1;
1725         }
1726         fCachedBreakPositions = (int32_t *)uprv_malloc(totalBreaks * sizeof(int32_t));
1727         if (fCachedBreakPositions != NULL) {
1728             int32_t out = 0;
1729             fNumCachedBreakPositions = totalBreaks;
1730             if (startPos < breaks.elementAti(0)) {
1731                 fCachedBreakPositions[out++] = startPos;
1732             }
1733             for (int32_t i = 0; i < foundBreakCount; ++i) {
1734                 fCachedBreakPositions[out++] = breaks.elementAti(i);
1735             }
1736             if (endPos > fCachedBreakPositions[out-1]) {
1737                 fCachedBreakPositions[out] = endPos;
1738             }
1739             // If there are breaks, then by definition, we are replacing the original
1740             // proposed break by one of the breaks we found. Use following() and
1741             // preceding() to do the work. They should never recurse in this case.
1742             if (reverse) {
1743                 return preceding(endPos);
1744             }
1745             else {
1746                 return following(startPos);
1747             }
1748         }
1749         // If the allocation failed, just fall through to the "no breaks found" case.
1750     }
1751
1752     // If we get here, there were no language-based breaks. Set the text pointer
1753     // to the original proposed break.
1754     utext_setNativeIndex(fText, reverse ? startPos : endPos);
1755     return (reverse ? startPos : endPos);
1756 }
1757
1758 U_NAMESPACE_END
1759
1760
1761 static icu::UStack *gLanguageBreakFactories = NULL;
1762 static icu::UInitOnce gLanguageBreakFactoriesInitOnce = U_INITONCE_INITIALIZER;
1763
1764 /**
1765  * Release all static memory held by breakiterator.
1766  */
1767 U_CDECL_BEGIN
1768 static UBool U_CALLCONV breakiterator_cleanup_dict(void) {
1769     if (gLanguageBreakFactories) {
1770         delete gLanguageBreakFactories;
1771         gLanguageBreakFactories = NULL;
1772     }
1773     gLanguageBreakFactoriesInitOnce.reset();
1774     return TRUE;
1775 }
1776 U_CDECL_END
1777
1778 U_CDECL_BEGIN
1779 static void U_CALLCONV _deleteFactory(void *obj) {
1780     delete (icu::LanguageBreakFactory *) obj;
1781 }
1782 U_CDECL_END
1783 U_NAMESPACE_BEGIN
1784
1785 static void U_CALLCONV initLanguageFactories() {
1786     UErrorCode status = U_ZERO_ERROR;
1787     U_ASSERT(gLanguageBreakFactories == NULL);
1788     gLanguageBreakFactories = new UStack(_deleteFactory, NULL, status);
1789     if (gLanguageBreakFactories != NULL && U_SUCCESS(status)) {
1790         ICULanguageBreakFactory *builtIn = new ICULanguageBreakFactory(status);
1791         gLanguageBreakFactories->push(builtIn, status);
1792 #ifdef U_LOCAL_SERVICE_HOOK
1793         LanguageBreakFactory *extra = (LanguageBreakFactory *)uprv_svc_hook("languageBreakFactory", &status);
1794         if (extra != NULL) {
1795             gLanguageBreakFactories->push(extra, status);
1796         }
1797 #endif
1798     }
1799     ucln_common_registerCleanup(UCLN_COMMON_BREAKITERATOR_DICT, breakiterator_cleanup_dict);
1800 }
1801
1802
1803 static const LanguageBreakEngine*
1804 getLanguageBreakEngineFromFactory(UChar32 c, int32_t breakType)
1805 {
1806     umtx_initOnce(gLanguageBreakFactoriesInitOnce, &initLanguageFactories);
1807     if (gLanguageBreakFactories == NULL) {
1808         return NULL;
1809     }
1810
1811     int32_t i = gLanguageBreakFactories->size();
1812     const LanguageBreakEngine *lbe = NULL;
1813     while (--i >= 0) {
1814         LanguageBreakFactory *factory = (LanguageBreakFactory *)(gLanguageBreakFactories->elementAt(i));
1815         lbe = factory->getEngineFor(c, breakType);
1816         if (lbe != NULL) {
1817             break;
1818         }
1819     }
1820     return lbe;
1821 }
1822
1823
1824 //-------------------------------------------------------------------------------
1825 //
1826 //  getLanguageBreakEngine  Find an appropriate LanguageBreakEngine for the
1827 //                          the character c.
1828 //
1829 //-------------------------------------------------------------------------------
1830 const LanguageBreakEngine *
1831 RuleBasedBreakIterator::getLanguageBreakEngine(UChar32 c) {
1832     const LanguageBreakEngine *lbe = NULL;
1833     UErrorCode status = U_ZERO_ERROR;
1834
1835     if (fLanguageBreakEngines == NULL) {
1836         fLanguageBreakEngines = new UStack(status);
1837         if (fLanguageBreakEngines == NULL || U_FAILURE(status)) {
1838             delete fLanguageBreakEngines;
1839             fLanguageBreakEngines = 0;
1840             return NULL;
1841         }
1842     }
1843
1844     int32_t i = fLanguageBreakEngines->size();
1845     while (--i >= 0) {
1846         lbe = (const LanguageBreakEngine *)(fLanguageBreakEngines->elementAt(i));
1847         if (lbe->handles(c, fBreakType)) {
1848             return lbe;
1849         }
1850     }
1851
1852     // No existing dictionary took the character. See if a factory wants to
1853     // give us a new LanguageBreakEngine for this character.
1854     lbe = getLanguageBreakEngineFromFactory(c, fBreakType);
1855
1856     // If we got one, use it and push it on our stack.
1857     if (lbe != NULL) {
1858         fLanguageBreakEngines->push((void *)lbe, status);
1859         // Even if we can't remember it, we can keep looking it up, so
1860         // return it even if the push fails.
1861         return lbe;
1862     }
1863
1864     // No engine is forthcoming for this character. Add it to the
1865     // reject set. Create the reject break engine if needed.
1866     if (fUnhandledBreakEngine == NULL) {
1867         fUnhandledBreakEngine = new UnhandledEngine(status);
1868         if (U_SUCCESS(status) && fUnhandledBreakEngine == NULL) {
1869             status = U_MEMORY_ALLOCATION_ERROR;
1870         }
1871         // Put it last so that scripts for which we have an engine get tried
1872         // first.
1873         fLanguageBreakEngines->insertElementAt(fUnhandledBreakEngine, 0, status);
1874         // If we can't insert it, or creation failed, get rid of it
1875         if (U_FAILURE(status)) {
1876             delete fUnhandledBreakEngine;
1877             fUnhandledBreakEngine = 0;
1878             return NULL;
1879         }
1880     }
1881
1882     // Tell the reject engine about the character; at its discretion, it may
1883     // add more than just the one character.
1884     fUnhandledBreakEngine->handleCharacter(c, fBreakType);
1885
1886     return fUnhandledBreakEngine;
1887 }
1888
1889
1890
1891 /*int32_t RuleBasedBreakIterator::getBreakType() const {
1892     return fBreakType;
1893 }*/
1894
1895 void RuleBasedBreakIterator::setBreakType(int32_t type) {
1896     fBreakType = type;
1897     reset();
1898 }
1899
1900 U_NAMESPACE_END
1901
1902 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */