icuSources/common/rbbi.cpp

   1 // © 2016 and later: Unicode, Inc. and others.
   2 // License & terms of use: http://www.unicode.org/copyright.html
   3 /*
   4 ***************************************************************************
   5 *   Copyright (C) 1999-2016 International Business Machines Corporation
   6 *   and others. All rights reserved.
   7 ***************************************************************************
   8 */
   9 //
  10 //  file:  rbbi.c    Contains the implementation of the rule based break iterator
  11 //                   runtime engine and the API implementation for
  12 //                   class RuleBasedBreakIterator
  13 //
  14
  15 #include "utypeinfo.h"  // for 'typeid' to work
  16
  17 #include "unicode/utypes.h"
  18
  19 #if !UCONFIG_NO_BREAK_ITERATION
  20
  21 #include "unicode/rbbi.h"
  22 #include "unicode/schriter.h"
  23 #include "unicode/uchriter.h"
  24 #include "unicode/udata.h"
  25 #include "unicode/uclean.h"
  26 #include "rbbidata.h"
  27 #include "rbbirb.h"
  28 #include "cmemory.h"
  29 #include "cstring.h"
  30 #include "umutex.h"
  31 #include "ucln_cmn.h"
  32 #include "brkeng.h"
  33
  34 #include "uassert.h"
  35 #include "uvector.h"
  36
  37 // if U_LOCAL_SERVICE_HOOK is defined, then localsvc.cpp is expected to be included.
  38 #if U_LOCAL_SERVICE_HOOK
  39 #include "localsvc.h"
  40 #endif
  41
  42 #ifdef RBBI_DEBUG
  43 static UBool fTrace = FALSE;
  44 #endif
  45
  46 U_NAMESPACE_BEGIN
  47
  48 // The state number of the starting state
  49 #define START_STATE 1
  50
  51 // The state-transition value indicating "stop"
  52 #define STOP_STATE  0
  53
  54
  55 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedBreakIterator)
  56
  57
  58 //=======================================================================
  59 // constructors
  60 //=======================================================================
  61
  62 /**
  63  * Constructs a RuleBasedBreakIterator that uses the already-created
  64  * tables object that is passed in as a parameter.
  65  */
  66 RuleBasedBreakIterator::RuleBasedBreakIterator(RBBIDataHeader* data, UErrorCode &status)
  67 {
  68     init();
  69     fData = new RBBIDataWrapper(data, status); // status checked in constructor
  70     if (U_FAILURE(status)) {return;}
  71     if(fData == 0) {
  72         status = U_MEMORY_ALLOCATION_ERROR;
  73         return;
  74     }
  75 }
  76
  77 /**
  78  * Same as above but does not adopt memory
  79  * Open-source ICU eliminated this method in #12071, but Apple code needs it, so restore it.
  80  */
  81 RuleBasedBreakIterator::RuleBasedBreakIterator(const RBBIDataHeader* data, enum EDontAdopt, UErrorCode &status)
  82 {
  83     init();
  84     fData = new RBBIDataWrapper(data, RBBIDataWrapper::kDontAdopt, status); // status checked in constructor
  85     if (U_FAILURE(status)) {return;}
  86     if(fData == 0) {
  87         status = U_MEMORY_ALLOCATION_ERROR;
  88         return;
  89     }
  90 }
  91
  92
  93 //
  94 //  Construct from precompiled binary rules (tables).  This constructor is public API,
  95 //  taking the rules as a (const uint8_t *) to match the type produced by getBinaryRules().
  96 //
  97 RuleBasedBreakIterator::RuleBasedBreakIterator(const uint8_t *compiledRules,
  98                        uint32_t       ruleLength,
  99                        UErrorCode     &status) {
 100     init();
 101     if (U_FAILURE(status)) {
 102         return;
 103     }
 104     if (compiledRules == NULL || ruleLength < sizeof(RBBIDataHeader)) {
 105         status = U_ILLEGAL_ARGUMENT_ERROR;
 106         return;
 107     }
 108     const RBBIDataHeader *data = (const RBBIDataHeader *)compiledRules;
 109     if (data->fLength > ruleLength) {
 110         status = U_ILLEGAL_ARGUMENT_ERROR;
 111         return;
 112     }
 113     fData = new RBBIDataWrapper(data, RBBIDataWrapper::kDontAdopt, status);
 114     if (U_FAILURE(status)) {return;}
 115     if(fData == 0) {
 116         status = U_MEMORY_ALLOCATION_ERROR;
 117         return;
 118     }
 119 }
 120
 121
 122 //-------------------------------------------------------------------------------
 123 //
 124 //   Constructor   from a UDataMemory handle to precompiled break rules
 125 //                 stored in an ICU data file.
 126 //
 127 //-------------------------------------------------------------------------------
 128 RuleBasedBreakIterator::RuleBasedBreakIterator(UDataMemory* udm, UErrorCode &status)
 129 {
 130     init();
 131     fData = new RBBIDataWrapper(udm, status); // status checked in constructor
 132     if (U_FAILURE(status)) {return;}
 133     if(fData == 0) {
 134         status = U_MEMORY_ALLOCATION_ERROR;
 135         return;
 136     }
 137 }
 138
 139
 140
 141 //-------------------------------------------------------------------------------
 142 //
 143 //   Constructor       from a set of rules supplied as a string.
 144 //
 145 //-------------------------------------------------------------------------------
 146 RuleBasedBreakIterator::RuleBasedBreakIterator( const UnicodeString  &rules,
 147                                                 UParseError          &parseError,
 148                                                 UErrorCode           &status)
 149 {
 150     init();
 151     if (U_FAILURE(status)) {return;}
 152     RuleBasedBreakIterator *bi = (RuleBasedBreakIterator *)
 153         RBBIRuleBuilder::createRuleBasedBreakIterator(rules, &parseError, status);
 154     // Note:  This is a bit awkward.  The RBBI ruleBuilder has a factory method that
 155     //        creates and returns a complete RBBI.  From here, in a constructor, we
 156     //        can't just return the object created by the builder factory, hence
 157     //        the assignment of the factory created object to "this".
 158     if (U_SUCCESS(status)) {
 159         *this = *bi;
 160         delete bi;
 161     }
 162 }
 163
 164
 165 //-------------------------------------------------------------------------------
 166 //
 167 // Default Constructor.      Create an empty shell that can be set up later.
 168 //                           Used when creating a RuleBasedBreakIterator from a set
 169 //                           of rules.
 170 //-------------------------------------------------------------------------------
 171 RuleBasedBreakIterator::RuleBasedBreakIterator() {
 172     init();
 173 }
 174
 175
 176 //-------------------------------------------------------------------------------
 177 //
 178 //   Copy constructor.  Will produce a break iterator with the same behavior,
 179 //                      and which iterates over the same text, as the one passed in.
 180 //
 181 //-------------------------------------------------------------------------------
 182 RuleBasedBreakIterator::RuleBasedBreakIterator(const RuleBasedBreakIterator& other)
 183 : BreakIterator(other)
 184 {
 185     this->init();
 186     *this = other;
 187 }
 188
 189
 190 /**
 191  * Destructor
 192  */
 193 RuleBasedBreakIterator::~RuleBasedBreakIterator() {
 194     if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
 195         // fCharIter was adopted from the outside.
 196         delete fCharIter;
 197     }
 198     fCharIter = NULL;
 199     delete fSCharIter;
 200     fCharIter = NULL;
 201     delete fDCharIter;
 202     fDCharIter = NULL;
 203
 204     utext_close(fText);
 205
 206     if (fData != NULL) {
 207         fData->removeReference();
 208         fData = NULL;
 209     }
 210     if (fCachedBreakPositions) {
 211         uprv_free(fCachedBreakPositions);
 212         fCachedBreakPositions = NULL;
 213     }
 214     if (fLanguageBreakEngines) {
 215         delete fLanguageBreakEngines;
 216         fLanguageBreakEngines = NULL;
 217     }
 218     if (fUnhandledBreakEngine) {
 219         delete fUnhandledBreakEngine;
 220         fUnhandledBreakEngine = NULL;
 221     }
 222 }
 223
 224 /**
 225  * Assignment operator.  Sets this iterator to have the same behavior,
 226  * and iterate over the same text, as the one passed in.
 227  */
 228 RuleBasedBreakIterator&
 229 RuleBasedBreakIterator::operator=(const RuleBasedBreakIterator& that) {
 230     if (this == &that) {
 231         return *this;
 232     }
 233     fKeepAll = that.fKeepAll;
 234     reset();    // Delete break cache information
 235     fBreakType = that.fBreakType;
 236     if (fLanguageBreakEngines != NULL) {
 237         delete fLanguageBreakEngines;
 238         fLanguageBreakEngines = NULL;   // Just rebuild for now
 239     }
 240     // TODO: clone fLanguageBreakEngines from "that"
 241     UErrorCode status = U_ZERO_ERROR;
 242     fText = utext_clone(fText, that.fText, FALSE, TRUE, &status);
 243
 244     if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
 245         delete fCharIter;
 246     }
 247     fCharIter = NULL;
 248
 249     if (that.fCharIter != NULL ) {
 250         // This is a little bit tricky - it will intially appear that
 251         //  this->fCharIter is adopted, even if that->fCharIter was
 252         //  not adopted.  That's ok.
 253         fCharIter = that.fCharIter->clone();
 254     }
 255
 256     if (fData != NULL) {
 257         fData->removeReference();
 258         fData = NULL;
 259     }
 260     if (that.fData != NULL) {
 261         fData = that.fData->addReference();
 262     }
 263
 264     return *this;
 265 }
 266
 267
 268
 269 //-----------------------------------------------------------------------------
 270 //
 271 //    init()      Shared initialization routine.   Used by all the constructors.
 272 //                Initializes all fields, leaving the object in a consistent state.
 273 //
 274 //-----------------------------------------------------------------------------
 275 void RuleBasedBreakIterator::init() {
 276     UErrorCode  status    = U_ZERO_ERROR;
 277     fText                 = utext_openUChars(NULL, NULL, 0, &status);
 278     fCharIter             = NULL;
 279     fSCharIter            = NULL;
 280     fDCharIter            = NULL;
 281     fData                 = NULL;
 282     fLastRuleStatusIndex  = 0;
 283     fLastStatusIndexValid = TRUE;
 284     fDictionaryCharCount  = 0;
 285     fBreakType            = UBRK_WORD;  // Defaulting BreakType to word gives reasonable
 286                                         //   dictionary behavior for Break Iterators that are
 287                                         //   built from rules.  Even better would be the ability to
 288                                         //   declare the type in the rules.
 289
 290     fCachedBreakPositions    = NULL;
 291     fLanguageBreakEngines    = NULL;
 292     fUnhandledBreakEngine    = NULL;
 293     fNumCachedBreakPositions = 0;
 294     fPositionInCache         = 0;
 295
 296 #ifdef RBBI_DEBUG
 297     static UBool debugInitDone = FALSE;
 298     if (debugInitDone == FALSE) {
 299         char *debugEnv = getenv("U_RBBIDEBUG");
 300         if (debugEnv && uprv_strstr(debugEnv, "trace")) {
 301             fTrace = TRUE;
 302         }
 303         debugInitDone = TRUE;
 304     }
 305 #endif
 306 }
 307
 308
 309
 310 //-----------------------------------------------------------------------------
 311 //
 312 //    clone - Returns a newly-constructed RuleBasedBreakIterator with the same
 313 //            behavior, and iterating over the same text, as this one.
 314 //            Virtual function: does the right thing with subclasses.
 315 //
 316 //-----------------------------------------------------------------------------
 317 BreakIterator*
 318 RuleBasedBreakIterator::clone(void) const {
 319     return new RuleBasedBreakIterator(*this);
 320 }
 321
 322 /**
 323  * Equality operator.  Returns TRUE if both BreakIterators are of the
 324  * same class, have the same behavior, and iterate over the same text.
 325  */
 326 UBool
 327 RuleBasedBreakIterator::operator==(const BreakIterator& that) const {
 328     if (typeid(*this) != typeid(that)) {
 329         return FALSE;
 330     }
 331
 332     const RuleBasedBreakIterator& that2 = (const RuleBasedBreakIterator&) that;
 333     if (that2.fKeepAll != fKeepAll) {
 334         return FALSE;
 335     }
 336
 337     if (!utext_equals(fText, that2.fText)) {
 338         // The two break iterators are operating on different text,
 339         //   or have a different interation position.
 340         return FALSE;
 341     };
 342
 343     // TODO:  need a check for when in a dictionary region at different offsets.
 344
 345     if (that2.fData == fData ||
 346         (fData != NULL && that2.fData != NULL && *that2.fData == *fData)) {
 347             // The two break iterators are using the same rules.
 348             return TRUE;
 349         }
 350     return FALSE;
 351 }
 352
 353 /**
 354  * Compute a hash code for this BreakIterator
 355  * @return A hash code
 356  */
 357 int32_t
 358 RuleBasedBreakIterator::hashCode(void) const {
 359     int32_t   hash = 0;
 360     if (fData != NULL) {
 361         hash = fData->hashCode();
 362     }
 363     return hash;
 364 }
 365
 366
 367 void RuleBasedBreakIterator::setText(UText *ut, UErrorCode &status) {
 368     if (U_FAILURE(status)) {
 369         return;
 370     }
 371     reset();
 372     fText = utext_clone(fText, ut, FALSE, TRUE, &status);
 373
 374     // Set up a dummy CharacterIterator to be returned if anyone
 375     //   calls getText().  With input from UText, there is no reasonable
 376     //   way to return a characterIterator over the actual input text.
 377     //   Return one over an empty string instead - this is the closest
 378     //   we can come to signaling a failure.
 379     //   (GetText() is obsolete, this failure is sort of OK)
 380     if (fDCharIter == NULL) {
 381         static const UChar c = 0;
 382         fDCharIter = new UCharCharacterIterator(&c, 0);
 383         if (fDCharIter == NULL) {
 384             status = U_MEMORY_ALLOCATION_ERROR;
 385             return;
 386         }
 387     }
 388
 389     if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
 390         // existing fCharIter was adopted from the outside.  Delete it now.
 391         delete fCharIter;
 392     }
 393     fCharIter = fDCharIter;
 394
 395     this->first();
 396 }
 397
 398
 399 UText *RuleBasedBreakIterator::getUText(UText *fillIn, UErrorCode &status) const {
 400     UText *result = utext_clone(fillIn, fText, FALSE, TRUE, &status);
 401     return result;
 402 }
 403
 404
 405
 406 /**
 407  * Returns the description used to create this iterator
 408  */
 409 const UnicodeString&
 410 RuleBasedBreakIterator::getRules() const {
 411     if (fData != NULL) {
 412         return fData->getRuleSourceString();
 413     } else {
 414         static const UnicodeString *s;
 415         if (s == NULL) {
 416             // TODO:  something more elegant here.
 417             //        perhaps API should return the string by value.
 418             //        Note:  thread unsafe init & leak are semi-ok, better than
 419             //               what was before.  Sould be cleaned up, though.
 420             s = new UnicodeString;
 421         }
 422         return *s;
 423     }
 424 }
 425
 426 //=======================================================================
 427 // BreakIterator overrides
 428 //=======================================================================
 429
 430 /**
 431  * Return a CharacterIterator over the text being analyzed.
 432  */
 433 CharacterIterator&
 434 RuleBasedBreakIterator::getText() const {
 435     return *fCharIter;
 436 }
 437
 438 /**
 439  * Set the iterator to analyze a new piece of text.  This function resets
 440  * the current iteration position to the beginning of the text.
 441  * @param newText An iterator over the text to analyze.
 442  */
 443 void
 444 RuleBasedBreakIterator::adoptText(CharacterIterator* newText) {
 445     // If we are holding a CharacterIterator adopted from a
 446     //   previous call to this function, delete it now.
 447     if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
 448         delete fCharIter;
 449     }
 450
 451     fCharIter = newText;
 452     UErrorCode status = U_ZERO_ERROR;
 453     reset();
 454     if (newText==NULL || newText->startIndex() != 0) {
 455         // startIndex !=0 wants to be an error, but there's no way to report it.
 456         // Make the iterator text be an empty string.
 457         fText = utext_openUChars(fText, NULL, 0, &status);
 458     } else {
 459         fText = utext_openCharacterIterator(fText, newText, &status);
 460     }
 461     this->first();
 462 }
 463
 464 /**
 465  * Set the iterator to analyze a new piece of text.  This function resets
 466  * the current iteration position to the beginning of the text.
 467  * @param newText An iterator over the text to analyze.
 468  */
 469 void
 470 RuleBasedBreakIterator::setText(const UnicodeString& newText) {
 471     UErrorCode status = U_ZERO_ERROR;
 472     reset();
 473     fText = utext_openConstUnicodeString(fText, &newText, &status);
 474
 475     // Set up a character iterator on the string.
 476     //   Needed in case someone calls getText().
 477     //  Can not, unfortunately, do this lazily on the (probably never)
 478     //  call to getText(), because getText is const.
 479     if (fSCharIter == NULL) {
 480         fSCharIter = new StringCharacterIterator(newText);
 481     } else {
 482         fSCharIter->setText(newText);
 483     }
 484
 485     if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
 486         // old fCharIter was adopted from the outside.  Delete it.
 487         delete fCharIter;
 488     }
 489     fCharIter = fSCharIter;
 490
 491     this->first();
 492 }
 493
 494
 495 /**
 496  *  Provide a new UText for the input text.  Must reference text with contents identical
 497  *  to the original.
 498  *  Intended for use with text data originating in Java (garbage collected) environments
 499  *  where the data may be moved in memory at arbitrary times.
 500  */
 501 RuleBasedBreakIterator &RuleBasedBreakIterator::refreshInputText(UText *input, UErrorCode &status) {
 502     if (U_FAILURE(status)) {
 503         return *this;
 504     }
 505     if (input == NULL) {
 506         status = U_ILLEGAL_ARGUMENT_ERROR;
 507         return *this;
 508     }
 509     int64_t pos = utext_getNativeIndex(fText);
 510     //  Shallow read-only clone of the new UText into the existing input UText
 511     fText = utext_clone(fText, input, FALSE, TRUE, &status);
 512     if (U_FAILURE(status)) {
 513         return *this;
 514     }
 515     utext_setNativeIndex(fText, pos);
 516     if (utext_getNativeIndex(fText) != pos) {
 517         // Sanity check.  The new input utext is supposed to have the exact same
 518         // contents as the old.  If we can't set to the same position, it doesn't.
 519         // The contents underlying the old utext might be invalid at this point,
 520         // so it's not safe to check directly.
 521         status = U_ILLEGAL_ARGUMENT_ERROR;
 522     }
 523     return *this;
 524 }
 525
 526
 527 /**
 528  * Sets the current iteration position to the beginning of the text, position zero.
 529  * @return The new iterator position, which is zero.
 530  */
 531 int32_t RuleBasedBreakIterator::first(void) {
 532     reset();
 533     fLastRuleStatusIndex  = 0;
 534     fLastStatusIndexValid = TRUE;
 535     //if (fText == NULL)
 536     //    return BreakIterator::DONE;
 537
 538     utext_setNativeIndex(fText, 0);
 539     return 0;
 540 }
 541
 542 /**
 543  * Sets the current iteration position to the end of the text.
 544  * @return The text's past-the-end offset.
 545  */
 546 int32_t RuleBasedBreakIterator::last(void) {
 547     reset();
 548     if (fText == NULL) {
 549         fLastRuleStatusIndex  = 0;
 550         fLastStatusIndexValid = TRUE;
 551         return BreakIterator::DONE;
 552     }
 553
 554     fLastStatusIndexValid = FALSE;
 555     int32_t pos = (int32_t)utext_nativeLength(fText);
 556     utext_setNativeIndex(fText, pos);
 557     return pos;
 558 }
 559
 560 /**
 561  * Advances the iterator either forward or backward the specified number of steps.
 562  * Negative values move backward, and positive values move forward.  This is
 563  * equivalent to repeatedly calling next() or previous().
 564  * @param n The number of steps to move.  The sign indicates the direction
 565  * (negative is backwards, and positive is forwards).
 566  * @return The character offset of the boundary position n boundaries away from
 567  * the current one.
 568  */
 569 int32_t RuleBasedBreakIterator::next(int32_t n) {
 570     int32_t result = current();
 571     while (n > 0) {
 572         result = next();
 573         --n;
 574     }
 575     while (n < 0) {
 576         result = previous();
 577         ++n;
 578     }
 579     return result;
 580 }
 581
 582 /**
 583  * Advances the iterator to the next boundary position.
 584  * @return The position of the first boundary after this one.
 585  */
 586 int32_t RuleBasedBreakIterator::next(void) {
 587     // if we have cached break positions and we're still in the range
 588     // covered by them, just move one step forward in the cache
 589     if (fCachedBreakPositions != NULL) {
 590         if (fPositionInCache < fNumCachedBreakPositions - 1) {
 591             ++fPositionInCache;
 592             int32_t pos = fCachedBreakPositions[fPositionInCache];
 593             utext_setNativeIndex(fText, pos);
 594             return pos;
 595         }
 596         else {
 597             reset();
 598         }
 599     }
 600
 601     int32_t startPos = current();
 602     fDictionaryCharCount = 0;
 603     int32_t result = handleNext(fData->fForwardTable);
 604     while (fKeepAll) {
 605         UChar32 prevChr = utext_char32At(fText, result-1);
 606         UChar32 currChr = utext_char32At(fText, result);
 607         if (currChr == U_SENTINEL || prevChr == U_SENTINEL || !u_isalpha(currChr) || !u_isalpha(prevChr)) {
 608             break;
 609         }
 610         int32_t nextResult = handleNext(fData->fForwardTable);
 611         if (nextResult <= result) {
 612             break;
 613         }
 614         result = nextResult;
 615     }
 616     if (fDictionaryCharCount > 0) {
 617         result = checkDictionary(startPos, result, FALSE);
 618     }
 619     return result;
 620 }
 621
 622 /**
 623  * Advances the iterator backwards, to the last boundary preceding this one.
 624  * @return The position of the last boundary position preceding this one.
 625  */
 626 int32_t RuleBasedBreakIterator::previous(void) {
 627     int32_t result;
 628     int32_t startPos;
 629
 630     // if we have cached break positions and we're still in the range
 631     // covered by them, just move one step backward in the cache
 632     if (fCachedBreakPositions != NULL) {
 633         if (fPositionInCache > 0) {
 634             --fPositionInCache;
 635             // If we're at the beginning of the cache, need to reevaluate the
 636             // rule status
 637             if (fPositionInCache <= 0) {
 638                 fLastStatusIndexValid = FALSE;
 639             }
 640             int32_t pos = fCachedBreakPositions[fPositionInCache];
 641             utext_setNativeIndex(fText, pos);
 642             return pos;
 643         }
 644         else {
 645             reset();
 646         }
 647     }
 648
 649     // if we're already sitting at the beginning of the text, return DONE
 650     if (fText == NULL || (startPos = current()) == 0) {
 651         fLastRuleStatusIndex  = 0;
 652         fLastStatusIndexValid = TRUE;
 653         return BreakIterator::DONE;
 654     }
 655
 656     if (fData->fSafeRevTable != NULL || fData->fSafeFwdTable != NULL) {
 657         result = handlePrevious(fData->fReverseTable);
 658         while (fKeepAll) {
 659             UChar32 prevChr = utext_char32At(fText, result-1);
 660             UChar32 currChr = utext_char32At(fText, result);
 661             if (currChr == U_SENTINEL || prevChr == U_SENTINEL || !u_isalpha(currChr) || !u_isalpha(prevChr)) {
 662                 break;
 663             }
 664             int32_t prevResult = handlePrevious(fData->fReverseTable);
 665             if (prevResult >= result) {
 666                 break;
 667             }
 668             result = prevResult;
 669         }
 670         if (fDictionaryCharCount > 0) {
 671             result = checkDictionary(result, startPos, TRUE);
 672         }
 673         return result;
 674     }
 675
 676     // old rule syntax
 677     // set things up.  handlePrevious() will back us up to some valid
 678     // break position before the current position (we back our internal
 679     // iterator up one step to prevent handlePrevious() from returning
 680     // the current position), but not necessarily the last one before
 681     // where we started
 682
 683     int32_t start = current();
 684
 685     (void)UTEXT_PREVIOUS32(fText);
 686     int32_t lastResult    = handlePrevious(fData->fReverseTable);
 687     if (lastResult == UBRK_DONE) {
 688         lastResult = 0;
 689         utext_setNativeIndex(fText, 0);
 690     }
 691     result = lastResult;
 692     int32_t lastTag       = 0;
 693     UBool   breakTagValid = FALSE;
 694
 695     // iterate forward from the known break position until we pass our
 696     // starting point.  The last break position before the starting
 697     // point is our return value
 698
 699     for (;;) {
 700         result         = next();
 701         if (result == BreakIterator::DONE || result >= start) {
 702             break;
 703         }
 704         lastResult     = result;
 705         lastTag        = fLastRuleStatusIndex;
 706         breakTagValid  = TRUE;
 707     }
 708
 709     // fLastBreakTag wants to have the value for section of text preceding
 710     // the result position that we are to return (in lastResult.)  If
 711     // the backwards rules overshot and the above loop had to do two or more
 712     // next()s to move up to the desired return position, we will have a valid
 713     // tag value. But, if handlePrevious() took us to exactly the correct result position,
 714     // we wont have a tag value for that position, which is only set by handleNext().
 715
 716     // Set the current iteration position to be the last break position
 717     // before where we started, and then return that value.
 718     utext_setNativeIndex(fText, lastResult);
 719     fLastRuleStatusIndex  = lastTag;       // for use by getRuleStatus()
 720     fLastStatusIndexValid = breakTagValid;
 721
 722     // No need to check the dictionary; it will have been handled by
 723     // next()
 724
 725     return lastResult;
 726 }
 727
 728 /**
 729  * Sets the iterator to refer to the first boundary position following
 730  * the specified position.
 731  * @offset The position from which to begin searching for a break position.
 732  * @return The position of the first break after the current position.
 733  */
 734 int32_t RuleBasedBreakIterator::following(int32_t offset) {
 735     // if the offset passed in is already past the end of the text,
 736     // just return DONE; if it's before the beginning, return the
 737     // text's starting offset
 738     if (fText == NULL || offset >= utext_nativeLength(fText)) {
 739         last();
 740         return next();
 741     }
 742     else if (offset < 0) {
 743         return first();
 744     }
 745
 746     // Move requested offset to a code point start. It might be on a trail surrogate,
 747     // or on a trail byte if the input is UTF-8.
 748     utext_setNativeIndex(fText, offset);
 749     offset = (int32_t)utext_getNativeIndex(fText);
 750
 751     // if we have cached break positions and offset is in the range
 752     // covered by them, use them
 753     // TODO: could use binary search
 754     // TODO: what if offset is outside range, but break is not?
 755     if (fCachedBreakPositions != NULL) {
 756         if (offset >= fCachedBreakPositions[0]
 757                 && offset < fCachedBreakPositions[fNumCachedBreakPositions - 1]) {
 758             fPositionInCache = 0;
 759             // We are guaranteed not to leave the array due to range test above
 760             while (offset >= fCachedBreakPositions[fPositionInCache]) {
 761                 ++fPositionInCache;
 762             }
 763             int32_t pos = fCachedBreakPositions[fPositionInCache];
 764             utext_setNativeIndex(fText, pos);
 765             return pos;
 766         }
 767         else {
 768             reset();
 769         }
 770     }
 771
 772     // Set our internal iteration position (temporarily)
 773     // to the position passed in.  If this is the _beginning_ position,
 774     // then we can just use next() to get our return value
 775
 776     int32_t result = 0;
 777
 778     if (fData->fSafeRevTable != NULL) {
 779         // new rule syntax
 780         utext_setNativeIndex(fText, offset);
 781         // move forward one codepoint to prepare for moving back to a
 782         // safe point.
 783         // this handles offset being between a supplementary character
 784         // TODO: is this still needed, with move to code point boundary handled above?
 785         (void)UTEXT_NEXT32(fText);
 786         // handlePrevious will move most of the time to < 1 boundary away
 787         handlePrevious(fData->fSafeRevTable);
 788         int32_t result = next();
 789         while (result <= offset) {
 790             result = next();
 791         }
 792         return result;
 793     }
 794     if (fData->fSafeFwdTable != NULL) {
 795         // backup plan if forward safe table is not available
 796         utext_setNativeIndex(fText, offset);
 797         (void)UTEXT_PREVIOUS32(fText);
 798         // handle next will give result >= offset
 799         handleNext(fData->fSafeFwdTable);
 800         // previous will give result 0 or 1 boundary away from offset,
 801         // most of the time
 802         // we have to
 803         int32_t oldresult = previous();
 804         while (oldresult > offset) {
 805             int32_t result = previous();
 806             if (result <= offset) {
 807                 return oldresult;
 808             }
 809             oldresult = result;
 810         }
 811         int32_t result = next();
 812         if (result <= offset) {
 813             return next();
 814         }
 815         return result;
 816     }
 817     // otherwise, we have to sync up first.  Use handlePrevious() to back
 818     // up to a known break position before the specified position (if
 819     // we can determine that the specified position is a break position,
 820     // we don't back up at all).  This may or may not be the last break
 821     // position at or before our starting position.  Advance forward
 822     // from here until we've passed the starting position.  The position
 823     // we stop on will be the first break position after the specified one.
 824     // old rule syntax
 825
 826     utext_setNativeIndex(fText, offset);
 827     if (offset==0 ||
 828         (offset==1  && utext_getNativeIndex(fText)==0)) {
 829         return next();
 830     }
 831     result = previous();
 832
 833     while (result != BreakIterator::DONE && result <= offset) {
 834         result = next();
 835     }
 836
 837     return result;
 838 }
 839
 840 /**
 841  * Sets the iterator to refer to the last boundary position before the
 842  * specified position.
 843  * @offset The position to begin searching for a break from.
 844  * @return The position of the last boundary before the starting position.
 845  */
 846 int32_t RuleBasedBreakIterator::preceding(int32_t offset) {
 847     // if the offset passed in is already past the end of the text,
 848     // just return DONE; if it's before the beginning, return the
 849     // text's starting offset
 850     if (fText == NULL || offset > utext_nativeLength(fText)) {
 851         return last();
 852     }
 853     else if (offset < 0) {
 854         return first();
 855     }
 856
 857     // Move requested offset to a code point start. It might be on a trail surrogate,
 858     // or on a trail byte if the input is UTF-8.
 859     utext_setNativeIndex(fText, offset);
 860     offset = (int32_t)utext_getNativeIndex(fText);
 861
 862     // if we have cached break positions and offset is in the range
 863     // covered by them, use them
 864     if (fCachedBreakPositions != NULL) {
 865         // TODO: binary search?
 866         // TODO: What if offset is outside range, but break is not?
 867         if (offset > fCachedBreakPositions[0]
 868                 && offset <= fCachedBreakPositions[fNumCachedBreakPositions - 1]) {
 869             fPositionInCache = 0;
 870             while (fPositionInCache < fNumCachedBreakPositions
 871                    && offset > fCachedBreakPositions[fPositionInCache])
 872                 ++fPositionInCache;
 873             --fPositionInCache;
 874             // If we're at the beginning of the cache, need to reevaluate the
 875             // rule status
 876             if (fPositionInCache <= 0) {
 877                 fLastStatusIndexValid = FALSE;
 878             }
 879             utext_setNativeIndex(fText, fCachedBreakPositions[fPositionInCache]);
 880             return fCachedBreakPositions[fPositionInCache];
 881         }
 882         else {
 883             reset();
 884         }
 885     }
 886
 887     // if we start by updating the current iteration position to the
 888     // position specified by the caller, we can just use previous()
 889     // to carry out this operation
 890
 891     if (fData->fSafeFwdTable != NULL) {
 892         // new rule syntax
 893         utext_setNativeIndex(fText, offset);
 894         int32_t newOffset = (int32_t)UTEXT_GETNATIVEINDEX(fText);
 895         if (newOffset != offset) {
 896             // Will come here if specified offset was not a code point boundary AND
 897             //   the underlying implmentation is using UText, which snaps any non-code-point-boundary
 898             //   indices to the containing code point.
 899             // For breakitereator::preceding only, these non-code-point indices need to be moved
 900             //   up to refer to the following codepoint.
 901             (void)UTEXT_NEXT32(fText);
 902             offset = (int32_t)UTEXT_GETNATIVEINDEX(fText);
 903         }
 904
 905         // TODO:  (synwee) would it be better to just check for being in the middle of a surrogate pair,
 906         //        rather than adjusting the position unconditionally?
 907         //        (Change would interact with safe rules.)
 908         // TODO:  change RBBI behavior for off-boundary indices to match that of UText?
 909         //        affects only preceding(), seems cleaner, but is slightly different.
 910         (void)UTEXT_PREVIOUS32(fText);
 911         handleNext(fData->fSafeFwdTable);
 912         int32_t result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
 913         while (result >= offset) {
 914             result = previous();
 915         }
 916         return result;
 917     }
 918     if (fData->fSafeRevTable != NULL) {
 919         // backup plan if forward safe table is not available
 920         //  TODO:  check whether this path can be discarded
 921         //         It's probably OK to say that rules must supply both safe tables
 922         //            if they use safe tables at all.  We have certainly never described
 923         //            to anyone how to work with just one safe table.
 924         utext_setNativeIndex(fText, offset);
 925         (void)UTEXT_NEXT32(fText);
 926
 927         // handle previous will give result <= offset
 928         handlePrevious(fData->fSafeRevTable);
 929
 930         // next will give result 0 or 1 boundary away from offset,
 931         // most of the time
 932         // we have to
 933         int32_t oldresult = next();
 934         while (oldresult < offset) {
 935             int32_t result = next();
 936             if (result >= offset) {
 937                 return oldresult;
 938             }
 939             oldresult = result;
 940         }
 941         int32_t result = previous();
 942         if (result >= offset) {
 943             return previous();
 944         }
 945         return result;
 946     }
 947
 948     // old rule syntax
 949     utext_setNativeIndex(fText, offset);
 950     return previous();
 951 }
 952
 953 /**
 954  * Returns true if the specfied position is a boundary position.  As a side
 955  * effect, leaves the iterator pointing to the first boundary position at
 956  * or after "offset".
 957  * @param offset the offset to check.
 958  * @return True if "offset" is a boundary position.
 959  */
 960 UBool RuleBasedBreakIterator::isBoundary(int32_t offset) {
 961     // the beginning index of the iterator is always a boundary position by definition
 962     if (offset == 0) {
 963         first();       // For side effects on current position, tag values.
 964         return TRUE;
 965     }
 966
 967     if (offset == (int32_t)utext_nativeLength(fText)) {
 968         last();       // For side effects on current position, tag values.
 969         return TRUE;
 970     }
 971
 972     // out-of-range indexes are never boundary positions
 973     if (offset < 0) {
 974         first();       // For side effects on current position, tag values.
 975         return FALSE;
 976     }
 977
 978     if (offset > utext_nativeLength(fText)) {
 979         last();        // For side effects on current position, tag values.
 980         return FALSE;
 981     }
 982
 983     // otherwise, we can use following() on the position before the specified
 984     // one and return true if the position we get back is the one the user
 985     // specified
 986     utext_previous32From(fText, offset);
 987     int32_t backOne = (int32_t)UTEXT_GETNATIVEINDEX(fText);
 988     UBool    result  = following(backOne) == offset;
 989     return result;
 990 }
 991
 992 /**
 993  * Returns the current iteration position.
 994  * @return The current iteration position.
 995  */
 996 int32_t RuleBasedBreakIterator::current(void) const {
 997     int32_t  pos = (int32_t)UTEXT_GETNATIVEINDEX(fText);
 998     return pos;
 999 }
1000
1001 //=======================================================================
1002 // implementation
1003 //=======================================================================
1004
//
// RBBIRunMode  -  tracks where the state machine is relative to the user text.
//                 The machine runs one extra iteration before the first character
//                 and one after the last character of the input; it only fetches
//                 real input characters while in the RUN mode.
//
enum RBBIRunMode {
    // Processing is positioned before the first character of the input.
    RBBI_START,
    // Processing is inside the user text.
    RBBI_RUN,
    // Processing is positioned after the end of the user text.
    RBBI_END
};
1015
1016
1017 // Map from look-ahead break states (corresponds to rules) to boundary positions.
1018 // Allows multiple lookahead break rules to be in flight at the same time.
1019 //
1020 // This is a temporary approach for ICU 57. A better fix is to make the look-ahead numbers
1021 // in the state table be sequential, then we can just index an array. And the
1022 // table could also tell us in advance how big that array needs to be.
1023 //
1024 // Before ICU 57 there was just a single simple variable for a look-ahead match that
1025 // was in progress. Two rules at once did not work.
1026
1027 static const int32_t kMaxLookaheads = 8;
1028 struct LookAheadResults {
1029     int32_t    fUsedSlotLimit;
1030     int32_t    fPositions[8];
1031     int16_t    fKeys[8];
1032
1033     LookAheadResults() : fUsedSlotLimit(0), fPositions(), fKeys() {};
1034
1035     int32_t getPosition(int16_t key) {
1036         for (int32_t i=0; i<fUsedSlotLimit; ++i) {
1037             if (fKeys[i] == key) {
1038                 return fPositions[i];
1039             }
1040         }
1041         U_ASSERT(FALSE);
1042         return -1;
1043     }
1044
1045     void setPosition(int16_t key, int32_t position) {
1046         int32_t i;
1047         for (i=0; i<fUsedSlotLimit; ++i) {
1048             if (fKeys[i] == key) {
1049                 fPositions[i] = position;
1050                 return;
1051             }
1052         }
1053         if (i >= kMaxLookaheads) {
1054             U_ASSERT(FALSE);
1055             i = kMaxLookaheads - 1;
1056         }
1057         fKeys[i] = key;
1058         fPositions[i] = position;
1059         U_ASSERT(fUsedSlotLimit == i);
1060         fUsedSlotLimit = i + 1;
1061     }
1062 };
1063
1064
1065 //-----------------------------------------------------------------------------------
1066 //
1067 //  handleNext(stateTable)
1068 //     This method is the actual implementation of the rbbi next() method.
1069 //     This method initializes the state machine to state 1
1070 //     and advances through the text character by character until we reach the end
1071 //     of the text or the state machine transitions to state 0.  We update our return
1072 //     value every time the state machine passes through an accepting state.
1073 //
1074 //-----------------------------------------------------------------------------------
1075 int32_t RuleBasedBreakIterator::handleNext(const RBBIStateTable *statetable) {
1076     int32_t             state;
1077     uint16_t            category        = 0;
1078     RBBIRunMode         mode;
1079
1080     RBBIStateTableRow  *row;
1081     UChar32             c;
1082     LookAheadResults    lookAheadMatches;
1083     int32_t             result             = 0;
1084     int32_t             initialPosition    = 0;
1085     const char         *tableData          = statetable->fTableData;
1086     uint32_t            tableRowLen        = statetable->fRowLen;
1087
1088     #ifdef RBBI_DEBUG
1089         if (fTrace) {
1090             RBBIDebugPuts("Handle Next   pos   char  state category");
1091         }
1092     #endif
1093
1094     // No matter what, handleNext alway correctly sets the break tag value.
1095     fLastStatusIndexValid = TRUE;
1096     fLastRuleStatusIndex = 0;
1097
1098     // if we're already at the end of the text, return DONE.
1099     initialPosition = (int32_t)UTEXT_GETNATIVEINDEX(fText);
1100     result          = initialPosition;
1101     c               = UTEXT_NEXT32(fText);
1102     if (fData == NULL || c==U_SENTINEL) {
1103         return BreakIterator::DONE;
1104     }
1105
1106     //  Set the initial state for the state machine
1107     state = START_STATE;
1108     row = (RBBIStateTableRow *)
1109             //(statetable->fTableData + (statetable->fRowLen * state));
1110             (tableData + tableRowLen * state);
1111
1112
1113     mode     = RBBI_RUN;
1114     if (statetable->fFlags & RBBI_BOF_REQUIRED) {
1115         category = 2;
1116         mode     = RBBI_START;
1117     }
1118
1119
1120     // loop until we reach the end of the text or transition to state 0
1121     //
1122     for (;;) {
1123         if (c == U_SENTINEL) {
1124             // Reached end of input string.
1125             if (mode == RBBI_END) {
1126                 // We have already run the loop one last time with the
1127                 //   character set to the psueudo {eof} value.  Now it is time
1128                 //   to unconditionally bail out.
1129                 break;
1130             }
1131             // Run the loop one last time with the fake end-of-input character category.
1132             mode = RBBI_END;
1133             category = 1;
1134         }
1135
1136         //
1137         // Get the char category.  An incoming category of 1 or 2 means that
1138         //      we are preset for doing the beginning or end of input, and
1139         //      that we shouldn't get a category from an actual text input character.
1140         //
1141         if (mode == RBBI_RUN) {
1142             // look up the current character's character category, which tells us
1143             // which column in the state table to look at.
1144             // Note:  the 16 in UTRIE_GET16 refers to the size of the data being returned,
1145             //        not the size of the character going in, which is a UChar32.
1146             //
1147             UTRIE_GET16(&fData->fTrie, c, category);
1148
1149             // Check the dictionary bit in the character's category.
1150             //    Counter is only used by dictionary based iterators (subclasses).
1151             //    Chars that need to be handled by a dictionary have a flag bit set
1152             //    in their category values.
1153             //
1154             if ((category & 0x4000) != 0)  {
1155                 fDictionaryCharCount++;
1156                 //  And off the dictionary flag bit.
1157                 category &= ~0x4000;
1158             }
1159         }
1160
1161        #ifdef RBBI_DEBUG
1162             if (fTrace) {
1163                 RBBIDebugPrintf("             %4lld   ", utext_getNativeIndex(fText));
1164                 if (0x20<=c && c<0x7f) {
1165                     RBBIDebugPrintf("\"%c\"  ", c);
1166                 } else {
1167                     RBBIDebugPrintf("%5x  ", c);
1168                 }
1169                 RBBIDebugPrintf("%3d  %3d\n", state, category);
1170             }
1171         #endif
1172
1173         // State Transition - move machine to its next state
1174         //
1175
1176         // Note: fNextState is defined as uint16_t[2], but we are casting
1177         // a generated RBBI table to RBBIStateTableRow and some tables
1178         // actually have more than 2 categories.
1179         U_ASSERT(category<fData->fHeader->fCatCount);
1180         state = row->fNextState[category];  /*Not accessing beyond memory*/
1181         row = (RBBIStateTableRow *)
1182             // (statetable->fTableData + (statetable->fRowLen * state));
1183             (tableData + tableRowLen * state);
1184
1185
1186         if (row->fAccepting == -1) {
1187             // Match found, common case.
1188             if (mode != RBBI_START) {
1189                 result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
1190             }
1191             fLastRuleStatusIndex = row->fTagIdx;   // Remember the break status (tag) values.
1192         }
1193
1194         int16_t completedRule = row->fAccepting;
1195         if (completedRule > 0) {
1196             // Lookahead match is completed.
1197             int32_t lookaheadResult = lookAheadMatches.getPosition(completedRule);
1198             if (lookaheadResult >= 0) {
1199                 fLastRuleStatusIndex = row->fTagIdx;
1200                 UTEXT_SETNATIVEINDEX(fText, lookaheadResult);
1201                 return lookaheadResult;
1202             }
1203         }
1204         int16_t rule = row->fLookAhead;
1205         if (rule != 0) {
1206             // At the position of a '/' in a look-ahead match. Record it.
1207             int32_t  pos = (int32_t)UTEXT_GETNATIVEINDEX(fText);
1208             lookAheadMatches.setPosition(rule, pos);
1209         }
1210
1211         if (state == STOP_STATE) {
1212             // This is the normal exit from the lookup state machine.
1213             // We have advanced through the string until it is certain that no
1214             //   longer match is possible, no matter what characters follow.
1215             break;
1216         }
1217
1218         // Advance to the next character.
1219         // If this is a beginning-of-input loop iteration, don't advance
1220         //    the input position.  The next iteration will be processing the
1221         //    first real input character.
1222         if (mode == RBBI_RUN) {
1223             c = UTEXT_NEXT32(fText);
1224         } else {
1225             if (mode == RBBI_START) {
1226                 mode = RBBI_RUN;
1227             }
1228         }
1229
1230
1231     }
1232
1233     // The state machine is done.  Check whether it found a match...
1234
1235     // If the iterator failed to advance in the match engine, force it ahead by one.
1236     //   (This really indicates a defect in the break rules.  They should always match
1237     //    at least one character.)
1238     if (result == initialPosition) {
1239         UTEXT_SETNATIVEINDEX(fText, initialPosition);
1240         UTEXT_NEXT32(fText);
1241         result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
1242     }
1243
1244     // Leave the iterator at our result position.
1245     UTEXT_SETNATIVEINDEX(fText, result);
1246     #ifdef RBBI_DEBUG
1247         if (fTrace) {
1248             RBBIDebugPrintf("result = %d\n\n", result);
1249         }
1250     #endif
1251     return result;
1252 }
1253
1254
1255
1256 //-----------------------------------------------------------------------------------
1257 //
1258 //  handlePrevious()
1259 //
1260 //      Iterate backwards, according to the logic of the reverse rules.
1261 //      This version handles the exact style backwards rules.
1262 //
1263 //      The logic of this function is very similar to handleNext(), above.
1264 //
1265 //-----------------------------------------------------------------------------------
1266 int32_t RuleBasedBreakIterator::handlePrevious(const RBBIStateTable *statetable) {
1267     int32_t             state;
1268     uint16_t            category        = 0;
1269     RBBIRunMode         mode;
1270     RBBIStateTableRow  *row;
1271     UChar32             c;
1272     LookAheadResults    lookAheadMatches;
1273     int32_t             result          = 0;
1274     int32_t             initialPosition = 0;
1275
1276     #ifdef RBBI_DEBUG
1277         if (fTrace) {
1278             RBBIDebugPuts("Handle Previous   pos   char  state category");
1279         }
1280     #endif
1281
1282     // handlePrevious() never gets the rule status.
1283     // Flag the status as invalid; if the user ever asks for status, we will need
1284     // to back up, then re-find the break position using handleNext(), which does
1285     // get the status value.
1286     fLastStatusIndexValid = FALSE;
1287     fLastRuleStatusIndex = 0;
1288
1289     // if we're already at the start of the text, return DONE.
1290     if (fText == NULL || fData == NULL || UTEXT_GETNATIVEINDEX(fText)==0) {
1291         return BreakIterator::DONE;
1292     }
1293
1294     //  Set up the starting char.
1295     initialPosition = (int32_t)UTEXT_GETNATIVEINDEX(fText);
1296     result          = initialPosition;
1297     c               = UTEXT_PREVIOUS32(fText);
1298
1299     //  Set the initial state for the state machine
1300     state = START_STATE;
1301     row = (RBBIStateTableRow *)
1302             (statetable->fTableData + (statetable->fRowLen * state));
1303     category = 3;
1304     mode     = RBBI_RUN;
1305     if (statetable->fFlags & RBBI_BOF_REQUIRED) {
1306         category = 2;
1307         mode     = RBBI_START;
1308     }
1309
1310
1311     // loop until we reach the start of the text or transition to state 0
1312     //
1313     for (;;) {
1314         if (c == U_SENTINEL) {
1315             // Reached end of input string.
1316             if (mode == RBBI_END) {
1317                 // We have already run the loop one last time with the
1318                 //   character set to the psueudo {eof} value.  Now it is time
1319                 //   to unconditionally bail out.
1320                 if (result == initialPosition) {
1321                     // Ran off start, no match found.
1322                     // move one index one (towards the start, since we are doing a previous())
1323                     UTEXT_SETNATIVEINDEX(fText, initialPosition);
1324                     (void)UTEXT_PREVIOUS32(fText);   // TODO:  shouldn't be necessary.  We're already at beginning.  Check.
1325                 }
1326                 break;
1327             }
1328             // Run the loop one last time with the fake end-of-input character category.
1329             mode = RBBI_END;
1330             category = 1;
1331         }
1332
1333         //
1334         // Get the char category.  An incoming category of 1 or 2 means that
1335         //      we are preset for doing the beginning or end of input, and
1336         //      that we shouldn't get a category from an actual text input character.
1337         //
1338         if (mode == RBBI_RUN) {
1339             // look up the current character's character category, which tells us
1340             // which column in the state table to look at.
1341             // Note:  the 16 in UTRIE_GET16 refers to the size of the data being returned,
1342             //        not the size of the character going in, which is a UChar32.
1343             //
1344             UTRIE_GET16(&fData->fTrie, c, category);
1345
1346             // Check the dictionary bit in the character's category.
1347             //    Counter is only used by dictionary based iterators (subclasses).
1348             //    Chars that need to be handled by a dictionary have a flag bit set
1349             //    in their category values.
1350             //
1351             if ((category & 0x4000) != 0)  {
1352                 fDictionaryCharCount++;
1353                 //  And off the dictionary flag bit.
1354                 category &= ~0x4000;
1355             }
1356         }
1357
1358         #ifdef RBBI_DEBUG
1359             if (fTrace) {
1360                 RBBIDebugPrintf("             %4d   ", (int32_t)utext_getNativeIndex(fText));
1361                 if (0x20<=c && c<0x7f) {
1362                     RBBIDebugPrintf("\"%c\"  ", c);
1363                 } else {
1364                     RBBIDebugPrintf("%5x  ", c);
1365                 }
1366                 RBBIDebugPrintf("%3d  %3d\n", state, category);
1367             }
1368         #endif
1369
1370         // State Transition - move machine to its next state
1371         //
1372
1373         // Note: fNextState is defined as uint16_t[2], but we are casting
1374         // a generated RBBI table to RBBIStateTableRow and some tables
1375         // actually have more than 2 categories.
1376         U_ASSERT(category<fData->fHeader->fCatCount);
1377         state = row->fNextState[category];  /*Not accessing beyond memory*/
1378         row = (RBBIStateTableRow *)
1379             (statetable->fTableData + (statetable->fRowLen * state));
1380
1381         if (row->fAccepting == -1) {
1382             // Match found, common case.
1383             result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
1384         }
1385
1386         int16_t completedRule = row->fAccepting;
1387         if (completedRule > 0) {
1388             // Lookahead match is completed.
1389             int32_t lookaheadResult = lookAheadMatches.getPosition(completedRule);
1390             if (lookaheadResult >= 0) {
1391                 UTEXT_SETNATIVEINDEX(fText, lookaheadResult);
1392                 return lookaheadResult;
1393             }
1394         }
1395         int16_t rule = row->fLookAhead;
1396         if (rule != 0) {
1397             // At the position of a '/' in a look-ahead match. Record it.
1398             int32_t  pos = (int32_t)UTEXT_GETNATIVEINDEX(fText);
1399             lookAheadMatches.setPosition(rule, pos);
1400         }
1401
1402         if (state == STOP_STATE) {
1403             // This is the normal exit from the lookup state machine.
1404             // We have advanced through the string until it is certain that no
1405             //   longer match is possible, no matter what characters follow.
1406             break;
1407         }
1408
1409         // Move (backwards) to the next character to process.
1410         // If this is a beginning-of-input loop iteration, don't advance
1411         //    the input position.  The next iteration will be processing the
1412         //    first real input character.
1413         if (mode == RBBI_RUN) {
1414             c = UTEXT_PREVIOUS32(fText);
1415         } else {
1416             if (mode == RBBI_START) {
1417                 mode = RBBI_RUN;
1418             }
1419         }
1420     }
1421
1422     // The state machine is done.  Check whether it found a match...
1423
1424     // If the iterator failed to advance in the match engine, force it ahead by one.
1425     //   (This really indicates a defect in the break rules.  They should always match
1426     //    at least one character.)
1427     if (result == initialPosition) {
1428         UTEXT_SETNATIVEINDEX(fText, initialPosition);
1429         UTEXT_PREVIOUS32(fText);
1430         result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
1431     }
1432
1433     // Leave the iterator at our result position.
1434     UTEXT_SETNATIVEINDEX(fText, result);
1435     #ifdef RBBI_DEBUG
1436         if (fTrace) {
1437             RBBIDebugPrintf("result = %d\n\n", result);
1438         }
1439     #endif
1440     return result;
1441 }
1442
1443
1444 void
1445 RuleBasedBreakIterator::reset()
1446 {
1447     if (fCachedBreakPositions) {
1448         uprv_free(fCachedBreakPositions);
1449     }
1450     fCachedBreakPositions = NULL;
1451     fNumCachedBreakPositions = 0;
1452     fDictionaryCharCount = 0;
1453     fPositionInCache = 0;
1454 }
1455
1456
1457
1458 //-------------------------------------------------------------------------------
1459 //
1460 //   getRuleStatus()   Return the break rule tag associated with the current
1461 //                     iterator position.  If the iterator arrived at its current
1462 //                     position by iterating forwards, the value will have been
1463 //                     cached by the handleNext() function.
1464 //
1465 //                     If no cached status value is available, the status is
1466 //                     found by doing a previous() followed by a next(), which
1467 //                     leaves the iterator where it started, and computes the
1468 //                     status while doing the next().
1469 //
1470 //-------------------------------------------------------------------------------
1471 void RuleBasedBreakIterator::makeRuleStatusValid() {
1472     if (fLastStatusIndexValid == FALSE) {
1473         //  No cached status is available.
1474         if (fText == NULL || current() == 0) {
1475             //  At start of text, or there is no text.  Status is always zero.
1476             fLastRuleStatusIndex = 0;
1477             fLastStatusIndexValid = TRUE;
1478         } else {
1479             //  Not at start of text.  Find status the tedious way.
1480             int32_t pa = current();
1481             previous();
1482             if (fNumCachedBreakPositions > 0) {
1483                 reset();                // Blow off the dictionary cache
1484             }
1485             int32_t pb = next();
1486             if (pa != pb) {
1487                 // note: the if (pa != pb) test is here only to eliminate warnings for
1488                 //       unused local variables on gcc.  Logically, it isn't needed.
1489                 U_ASSERT(pa == pb);
1490             }
1491         }
1492     }
1493     U_ASSERT(fLastRuleStatusIndex >= 0  &&  fLastRuleStatusIndex < fData->fStatusMaxIdx);
1494 }
1495
1496
1497 int32_t  RuleBasedBreakIterator::getRuleStatus() const {
1498     RuleBasedBreakIterator *nonConstThis  = (RuleBasedBreakIterator *)this;
1499     nonConstThis->makeRuleStatusValid();
1500
1501     // fLastRuleStatusIndex indexes to the start of the appropriate status record
1502     //                                                 (the number of status values.)
1503     //   This function returns the last (largest) of the array of status values.
1504     int32_t  idx = fLastRuleStatusIndex + fData->fRuleStatusTable[fLastRuleStatusIndex];
1505     int32_t  tagVal = fData->fRuleStatusTable[idx];
1506
1507     return tagVal;
1508 }
1509
1510
1511
1512
1513 int32_t RuleBasedBreakIterator::getRuleStatusVec(
1514              int32_t *fillInVec, int32_t capacity, UErrorCode &status)
1515 {
1516     if (U_FAILURE(status)) {
1517         return 0;
1518     }
1519
1520     RuleBasedBreakIterator *nonConstThis  = (RuleBasedBreakIterator *)this;
1521     nonConstThis->makeRuleStatusValid();
1522     int32_t  numVals = fData->fRuleStatusTable[fLastRuleStatusIndex];
1523     int32_t  numValsToCopy = numVals;
1524     if (numVals > capacity) {
1525         status = U_BUFFER_OVERFLOW_ERROR;
1526         numValsToCopy = capacity;
1527     }
1528     int i;
1529     for (i=0; i<numValsToCopy; i++) {
1530         fillInVec[i] = fData->fRuleStatusTable[fLastRuleStatusIndex + i + 1];
1531     }
1532     return numVals;
1533 }
1534
1535
1536
1537 //-------------------------------------------------------------------------------
1538 //
1539 //   getBinaryRules        Access to the compiled form of the rules,
1540 //                         for use by build system tools that save the data
1541 //                         for standard iterator types.
1542 //
1543 //-------------------------------------------------------------------------------
1544 const uint8_t  *RuleBasedBreakIterator::getBinaryRules(uint32_t &length) {
1545     const uint8_t  *retPtr = NULL;
1546     length = 0;
1547
1548     if (fData != NULL) {
1549         retPtr = (const uint8_t *)fData->fHeader;
1550         length = fData->fHeader->fLength;
1551     }
1552     return retPtr;
1553 }
1554
1555
1556 BreakIterator *  RuleBasedBreakIterator::createBufferClone(void * /*stackBuffer*/,
1557                                    int32_t &bufferSize,
1558                                    UErrorCode &status)
1559 {
1560     if (U_FAILURE(status)){
1561         return NULL;
1562     }
1563
1564     if (bufferSize == 0) {
1565         bufferSize = 1;  // preflighting for deprecated functionality
1566         return NULL;
1567     }
1568
1569     BreakIterator *clonedBI = clone();
1570     if (clonedBI == NULL) {
1571         status = U_MEMORY_ALLOCATION_ERROR;
1572     } else {
1573         status = U_SAFECLONE_ALLOCATED_WARNING;
1574     }
1575     return (RuleBasedBreakIterator *)clonedBI;
1576 }
1577
1578
1579 //-------------------------------------------------------------------------------
1580 //
1581 //  isDictionaryChar      Return true if the category lookup for this char
1582 //                        indicates that it is in the set of dictionary lookup
1583 //                        chars.
1584 //
1585 //                        This function is intended for use by dictionary based
1586 //                        break iterators.
1587 //
1588 //-------------------------------------------------------------------------------
1589 /*UBool RuleBasedBreakIterator::isDictionaryChar(UChar32   c) {
1590     if (fData == NULL) {
1591         return FALSE;
1592     }
1593     uint16_t category;
1594     UTRIE_GET16(&fData->fTrie, c, category);
1595     return (category & 0x4000) != 0;
1596 }*/
1597
1598
1599 //-------------------------------------------------------------------------------
1600 //
1601 //  checkDictionary       This function handles all processing of characters in
1602 //                        the "dictionary" set. It will determine the appropriate
1603 //                        course of action, and possibly set up a cache in the
1604 //                        process.
1605 //
1606 //-------------------------------------------------------------------------------
1607 int32_t RuleBasedBreakIterator::checkDictionary(int32_t startPos,
1608                             int32_t endPos,
1609                             UBool reverse) {
1610     // Reset the old break cache first.
1611     reset();
1612
1613     // note: code segment below assumes that dictionary chars are in the
1614     // startPos-endPos range
1615     // value returned should be next character in sequence
1616     if ((endPos - startPos) <= 1) {
1617         return (reverse ? startPos : endPos);
1618     }
1619
1620     // Starting from the starting point, scan towards the proposed result,
1621     // looking for the first dictionary character (which may be the one
1622     // we're on, if we're starting in the middle of a range).
1623     utext_setNativeIndex(fText, reverse ? endPos : startPos);
1624     if (reverse) {
1625         UTEXT_PREVIOUS32(fText);
1626     }
1627
1628     int32_t rangeStart = startPos;
1629     int32_t rangeEnd = endPos;
1630
1631     uint16_t    category;
1632     int32_t     current;
1633     UErrorCode  status = U_ZERO_ERROR;
1634     UStack      breaks(status);
1635     int32_t     foundBreakCount = 0;
1636     UChar32     c = utext_current32(fText);
1637
1638     UTRIE_GET16(&fData->fTrie, c, category);
1639
1640     // Is the character we're starting on a dictionary character? If so, we
1641     // need to back up to include the entire run; otherwise the results of
1642     // the break algorithm will differ depending on where we start. Since
1643     // the result is cached and there is typically a non-dictionary break
1644     // within a small number of words, there should be little performance impact.
1645     if (category & 0x4000) {
1646         if (reverse) {
1647             do {
1648                 utext_next32(fText);          // TODO:  recast to work directly with postincrement.
1649                 c = utext_current32(fText);
1650                 UTRIE_GET16(&fData->fTrie, c, category);
1651             } while (c != U_SENTINEL && (category & 0x4000));
1652             // Back up to the last dictionary character
1653             rangeEnd = (int32_t)UTEXT_GETNATIVEINDEX(fText);
1654             if (c == U_SENTINEL) {
1655                 // c = fText->last32();
1656                 //   TODO:  why was this if needed?
1657                 c = UTEXT_PREVIOUS32(fText);
1658             }
1659             else {
1660                 c = UTEXT_PREVIOUS32(fText);
1661             }
1662         }
1663         else {
1664             do {
1665                 c = UTEXT_PREVIOUS32(fText);
1666                 UTRIE_GET16(&fData->fTrie, c, category);
1667             }
1668             while (c != U_SENTINEL && (category & 0x4000));
1669             // Back up to the last dictionary character
1670             if (c == U_SENTINEL) {
1671                 // c = fText->first32();
1672                 c = utext_current32(fText);
1673             }
1674             else {
1675                 utext_next32(fText);
1676                 c = utext_current32(fText);
1677             }
1678             rangeStart = (int32_t)UTEXT_GETNATIVEINDEX(fText);;
1679         }
1680         UTRIE_GET16(&fData->fTrie, c, category);
1681     }
1682
1683     // Loop through the text, looking for ranges of dictionary characters.
1684     // For each span, find the appropriate break engine, and ask it to find
1685     // any breaks within the span.
1686     // Note: we always do this in the forward direction, so that the break
1687     // cache is built in the right order.
1688     if (reverse) {
1689         utext_setNativeIndex(fText, rangeStart);
1690         c = utext_current32(fText);
1691         UTRIE_GET16(&fData->fTrie, c, category);
1692     }
1693     while(U_SUCCESS(status)) {
1694         while((current = (int32_t)UTEXT_GETNATIVEINDEX(fText)) < rangeEnd && (category & 0x4000) == 0) {
1695             utext_next32(fText);           // TODO:  tweak for post-increment operation
1696             c = utext_current32(fText);
1697             UTRIE_GET16(&fData->fTrie, c, category);
1698         }
1699         if (current >= rangeEnd) {
1700             break;
1701         }
1702
1703         // We now have a dictionary character. Get the appropriate language object
1704         // to deal with it.
1705         const LanguageBreakEngine *lbe = getLanguageBreakEngine(c);
1706
1707         // Ask the language object if there are any breaks. It will leave the text
1708         // pointer on the other side of its range, ready to search for the next one.
1709         if (lbe != NULL) {
1710             foundBreakCount += lbe->findBreaks(fText, rangeStart, rangeEnd, FALSE, fBreakType, breaks);
1711         }
1712
1713         // Reload the loop variables for the next go-round
1714         c = utext_current32(fText);
1715         UTRIE_GET16(&fData->fTrie, c, category);
1716     }
1717
1718     // If we found breaks, build a new break cache. The first and last entries must
1719     // be the original starting and ending position.
1720     if (foundBreakCount > 0) {
1721         U_ASSERT(foundBreakCount == breaks.size());
1722         int32_t totalBreaks = foundBreakCount;
1723         if (startPos < breaks.elementAti(0)) {
1724             totalBreaks += 1;
1725         }
1726         if (endPos > breaks.peeki()) {
1727             totalBreaks += 1;
1728         }
1729         fCachedBreakPositions = (int32_t *)uprv_malloc(totalBreaks * sizeof(int32_t));
1730         if (fCachedBreakPositions != NULL) {
1731             int32_t out = 0;
1732             fNumCachedBreakPositions = totalBreaks;
1733             if (startPos < breaks.elementAti(0)) {
1734                 fCachedBreakPositions[out++] = startPos;
1735             }
1736             for (int32_t i = 0; i < foundBreakCount; ++i) {
1737                 fCachedBreakPositions[out++] = breaks.elementAti(i);
1738             }
1739             if (endPos > fCachedBreakPositions[out-1]) {
1740                 fCachedBreakPositions[out] = endPos;
1741             }
1742             // If there are breaks, then by definition, we are replacing the original
1743             // proposed break by one of the breaks we found. Use following() and
1744             // preceding() to do the work. They should never recurse in this case.
1745             if (reverse) {
1746                 return preceding(endPos);
1747             }
1748             else {
1749                 return following(startPos);
1750             }
1751         }
1752         // If the allocation failed, just fall through to the "no breaks found" case.
1753     }
1754
1755     // If we get here, there were no language-based breaks. Set the text pointer
1756     // to the original proposed break.
1757     utext_setNativeIndex(fText, reverse ? startPos : endPos);
1758     return (reverse ? startPos : endPos);
1759 }
1760
1761 U_NAMESPACE_END
1762
1763
1764 static icu::UStack *gLanguageBreakFactories = NULL;
1765 static icu::UInitOnce gLanguageBreakFactoriesInitOnce = U_INITONCE_INITIALIZER;
1766
1767 /**
1768  * Release all static memory held by breakiterator.
1769  */
1770 U_CDECL_BEGIN
1771 static UBool U_CALLCONV breakiterator_cleanup_dict(void) {
1772     if (gLanguageBreakFactories) {
1773         delete gLanguageBreakFactories;
1774         gLanguageBreakFactories = NULL;
1775     }
1776     gLanguageBreakFactoriesInitOnce.reset();
1777     return TRUE;
1778 }
1779 U_CDECL_END
1780
1781 U_CDECL_BEGIN
1782 static void U_CALLCONV _deleteFactory(void *obj) {
1783     delete (icu::LanguageBreakFactory *) obj;
1784 }
1785 U_CDECL_END
1786 U_NAMESPACE_BEGIN
1787
1788 static void U_CALLCONV initLanguageFactories() {
1789     UErrorCode status = U_ZERO_ERROR;
1790     U_ASSERT(gLanguageBreakFactories == NULL);
1791     gLanguageBreakFactories = new UStack(_deleteFactory, NULL, status);
1792     if (gLanguageBreakFactories != NULL && U_SUCCESS(status)) {
1793         ICULanguageBreakFactory *builtIn = new ICULanguageBreakFactory(status);
1794         gLanguageBreakFactories->push(builtIn, status);
1795 #ifdef U_LOCAL_SERVICE_HOOK
1796         LanguageBreakFactory *extra = (LanguageBreakFactory *)uprv_svc_hook("languageBreakFactory", &status);
1797         if (extra != NULL) {
1798             gLanguageBreakFactories->push(extra, status);
1799         }
1800 #endif
1801     }
1802     ucln_common_registerCleanup(UCLN_COMMON_BREAKITERATOR_DICT, breakiterator_cleanup_dict);
1803 }
1804
1805
1806 static const LanguageBreakEngine*
1807 getLanguageBreakEngineFromFactory(UChar32 c, int32_t breakType)
1808 {
1809     umtx_initOnce(gLanguageBreakFactoriesInitOnce, &initLanguageFactories);
1810     if (gLanguageBreakFactories == NULL) {
1811         return NULL;
1812     }
1813
1814     int32_t i = gLanguageBreakFactories->size();
1815     const LanguageBreakEngine *lbe = NULL;
1816     while (--i >= 0) {
1817         LanguageBreakFactory *factory = (LanguageBreakFactory *)(gLanguageBreakFactories->elementAt(i));
1818         lbe = factory->getEngineFor(c, breakType);
1819         if (lbe != NULL) {
1820             break;
1821         }
1822     }
1823     return lbe;
1824 }
1825
1826
1827 //-------------------------------------------------------------------------------
1828 //
1829 //  getLanguageBreakEngine  Find an appropriate LanguageBreakEngine for the
1830 //                          the character c.
1831 //
1832 //-------------------------------------------------------------------------------
1833 const LanguageBreakEngine *
1834 RuleBasedBreakIterator::getLanguageBreakEngine(UChar32 c) {
1835     const LanguageBreakEngine *lbe = NULL;
1836     UErrorCode status = U_ZERO_ERROR;
1837
1838     if (fLanguageBreakEngines == NULL) {
1839         fLanguageBreakEngines = new UStack(status);
1840         if (fLanguageBreakEngines == NULL || U_FAILURE(status)) {
1841             delete fLanguageBreakEngines;
1842             fLanguageBreakEngines = 0;
1843             return NULL;
1844         }
1845     }
1846
1847     int32_t i = fLanguageBreakEngines->size();
1848     while (--i >= 0) {
1849         lbe = (const LanguageBreakEngine *)(fLanguageBreakEngines->elementAt(i));
1850         if (lbe->handles(c, fBreakType)) {
1851             return lbe;
1852         }
1853     }
1854
1855     // No existing dictionary took the character. See if a factory wants to
1856     // give us a new LanguageBreakEngine for this character.
1857     lbe = getLanguageBreakEngineFromFactory(c, fBreakType);
1858
1859     // If we got one, use it and push it on our stack.
1860     if (lbe != NULL) {
1861         fLanguageBreakEngines->push((void *)lbe, status);
1862         // Even if we can't remember it, we can keep looking it up, so
1863         // return it even if the push fails.
1864         return lbe;
1865     }
1866
1867     // No engine is forthcoming for this character. Add it to the
1868     // reject set. Create the reject break engine if needed.
1869     if (fUnhandledBreakEngine == NULL) {
1870         fUnhandledBreakEngine = new UnhandledEngine(status);
1871         if (U_SUCCESS(status) && fUnhandledBreakEngine == NULL) {
1872             status = U_MEMORY_ALLOCATION_ERROR;
1873         }
1874         // Put it last so that scripts for which we have an engine get tried
1875         // first.
1876         fLanguageBreakEngines->insertElementAt(fUnhandledBreakEngine, 0, status);
1877         // If we can't insert it, or creation failed, get rid of it
1878         if (U_FAILURE(status)) {
1879             delete fUnhandledBreakEngine;
1880             fUnhandledBreakEngine = 0;
1881             return NULL;
1882         }
1883     }
1884
1885     // Tell the reject engine about the character; at its discretion, it may
1886     // add more than just the one character.
1887     fUnhandledBreakEngine->handleCharacter(c, fBreakType);
1888
1889     return fUnhandledBreakEngine;
1890 }
1891
1892
1893
1894 /*int32_t RuleBasedBreakIterator::getBreakType() const {
1895     return fBreakType;
1896 }*/
1897
1898 void RuleBasedBreakIterator::setBreakType(int32_t type) {
1899     fBreakType = type;
1900     reset();
1901 }
1902
1903 U_NAMESPACE_END
1904
1905 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */