icuSources/common/rbbi.cpp

   1 // © 2016 and later: Unicode, Inc. and others.
   2 // License & terms of use: http://www.unicode.org/copyright.html
   3 /*
   4 ***************************************************************************
   5 *   Copyright (C) 1999-2016 International Business Machines Corporation
   6 *   and others. All rights reserved.
   7 ***************************************************************************
   8 */
   9 //
  10 //  file:  rbbi.cpp  Contains the implementation of the rule based break iterator
  11 //                   runtime engine and the API implementation for
  12 //                   class RuleBasedBreakIterator
  13 //
  14
  15 #include "utypeinfo.h"  // for 'typeid' to work
  16
  17 #include "unicode/utypes.h"
  18
  19 #if !UCONFIG_NO_BREAK_ITERATION
  20
  21 #include <cinttypes>
  22
  23 #include "unicode/rbbi.h"
  24 #include "unicode/schriter.h"
  25 #include "unicode/uchriter.h"
  26 #include "unicode/uclean.h"
  27 #include "unicode/udata.h"
  28
  29 #include "brkeng.h"
  30 #include "ucln_cmn.h"
  31 #include "cmemory.h"
  32 #include "cstring.h"
  33 #include "localsvc.h"
  34 #include "rbbidata.h"
  35 #include "rbbi_cache.h"
  36 #include "rbbirb.h"
  37 #include "uassert.h"
  38 #include "umutex.h"
  39 #include "uvectr32.h"
  40
#ifdef RBBI_DEBUG
// Debug-build switch for state-machine trace output.  init() turns it on when
// the U_RBBIDEBUG environment variable contains the substring "trace".
static UBool gTrace = FALSE;
#endif
  44
  45 U_NAMESPACE_BEGIN
  46
  47 // The state number of the starting state
  48 constexpr int32_t START_STATE = 1;
  49
  50 // The state-transition value indicating "stop"
  51 constexpr int32_t STOP_STATE = 0;
  52
  53
  54 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedBreakIterator)
  55
  56
  57 //=======================================================================
  58 // constructors
  59 //=======================================================================
  60
  61 /**
  62  * Constructs a RuleBasedBreakIterator that uses the already-created
  63  * tables object that is passed in as a parameter.
  64  */
  65 RuleBasedBreakIterator::RuleBasedBreakIterator(RBBIDataHeader* data, UErrorCode &status)
  66  : fSCharIter(UnicodeString())
  67 {
  68     init(status);
  69     fData = new RBBIDataWrapper(data, status); // status checked in constructor
  70     if (U_FAILURE(status)) {return;}
  71     if(fData == 0) {
  72         status = U_MEMORY_ALLOCATION_ERROR;
  73         return;
  74     }
  75 }
  76
  77 //
  78 //  Construct from precompiled binary rules (tables).  This constructor is public API,
  79 //  taking the rules as a (const uint8_t *) to match the type produced by getBinaryRules().
  80 //
  81 RuleBasedBreakIterator::RuleBasedBreakIterator(const uint8_t *compiledRules,
  82                        uint32_t       ruleLength,
  83                        UErrorCode     &status)
  84  : fSCharIter(UnicodeString())
  85 {
  86     init(status);
  87     if (U_FAILURE(status)) {
  88         return;
  89     }
  90     if (compiledRules == NULL || ruleLength < sizeof(RBBIDataHeader)) {
  91         status = U_ILLEGAL_ARGUMENT_ERROR;
  92         return;
  93     }
  94     const RBBIDataHeader *data = (const RBBIDataHeader *)compiledRules;
  95     if (data->fLength > ruleLength) {
  96         status = U_ILLEGAL_ARGUMENT_ERROR;
  97         return;
  98     }
  99     fData = new RBBIDataWrapper(data, RBBIDataWrapper::kDontAdopt, status);
 100     if (U_FAILURE(status)) {return;}
 101     if(fData == 0) {
 102         status = U_MEMORY_ALLOCATION_ERROR;
 103         return;
 104     }
 105 }
 106
 107
 108 //-------------------------------------------------------------------------------
 109 //
 110 //   Constructor   from a UDataMemory handle to precompiled break rules
 111 //                 stored in an ICU data file.
 112 //
 113 //-------------------------------------------------------------------------------
 114 RuleBasedBreakIterator::RuleBasedBreakIterator(UDataMemory* udm, UErrorCode &status)
 115  : fSCharIter(UnicodeString())
 116 {
 117     init(status);
 118     fData = new RBBIDataWrapper(udm, status); // status checked in constructor
 119     if (U_FAILURE(status)) {return;}
 120     if(fData == 0) {
 121         status = U_MEMORY_ALLOCATION_ERROR;
 122         return;
 123     }
 124 }
 125
 126
 127
 128 //-------------------------------------------------------------------------------
 129 //
 130 //   Constructor       from a set of rules supplied as a string.
 131 //
 132 //-------------------------------------------------------------------------------
 133 RuleBasedBreakIterator::RuleBasedBreakIterator( const UnicodeString  &rules,
 134                                                 UParseError          &parseError,
 135                                                 UErrorCode           &status)
 136  : fSCharIter(UnicodeString())
 137 {
 138     init(status);
 139     if (U_FAILURE(status)) {return;}
 140     RuleBasedBreakIterator *bi = (RuleBasedBreakIterator *)
 141         RBBIRuleBuilder::createRuleBasedBreakIterator(rules, &parseError, status);
 142     // Note:  This is a bit awkward.  The RBBI ruleBuilder has a factory method that
 143     //        creates and returns a complete RBBI.  From here, in a constructor, we
 144     //        can't just return the object created by the builder factory, hence
 145     //        the assignment of the factory created object to "this".
 146     if (U_SUCCESS(status)) {
 147         *this = *bi;
 148         delete bi;
 149     }
 150 }
 151
 152
 153 //-------------------------------------------------------------------------------
 154 //
 155 // Default Constructor.      Create an empty shell that can be set up later.
 156 //                           Used when creating a RuleBasedBreakIterator from a set
 157 //                           of rules.
 158 //-------------------------------------------------------------------------------
 159 RuleBasedBreakIterator::RuleBasedBreakIterator()
 160  : fSCharIter(UnicodeString())
 161 {
 162     UErrorCode status = U_ZERO_ERROR;
 163     init(status);
 164 }
 165
 166
 167 //-------------------------------------------------------------------------------
 168 //
 169 //   Copy constructor.  Will produce a break iterator with the same behavior,
 170 //                      and which iterates over the same text, as the one passed in.
 171 //
 172 //-------------------------------------------------------------------------------
 173 RuleBasedBreakIterator::RuleBasedBreakIterator(const RuleBasedBreakIterator& other)
 174 : BreakIterator(other),
 175   fSCharIter(UnicodeString())
 176 {
 177     UErrorCode status = U_ZERO_ERROR;
 178     this->init(status);
 179     *this = other;
 180 }
 181
 182
 183 /**
 184  * Destructor
 185  */
 186 RuleBasedBreakIterator::~RuleBasedBreakIterator() {
 187     if (fCharIter != &fSCharIter) {
 188         // fCharIter was adopted from the outside.
 189         delete fCharIter;
 190     }
 191     fCharIter = NULL;
 192
 193     utext_close(&fText);
 194
 195     if (fData != NULL) {
 196         fData->removeReference();
 197         fData = NULL;
 198     }
 199     delete fBreakCache;
 200     fBreakCache = NULL;
 201
 202     delete fDictionaryCache;
 203     fDictionaryCache = NULL;
 204
 205     delete fLanguageBreakEngines;
 206     fLanguageBreakEngines = NULL;
 207
 208     delete fUnhandledBreakEngine;
 209     fUnhandledBreakEngine = NULL;
 210
 211     delete [] fLatin1Cat;
 212     fLatin1Cat = NULL;
 213 }
 214
 215 /**
 216  * Assignment operator.  Sets this iterator to have the same behavior,
 217  * and iterate over the same text, as the one passed in.
 218  */
 219 RuleBasedBreakIterator&
 220 RuleBasedBreakIterator::operator=(const RuleBasedBreakIterator& that) {
 221     if (this == &that) {
 222         return *this;
 223     }
 224     BreakIterator::operator=(that);
 225     fLineWordOpts = that.fLineWordOpts;
 226
 227     if (fLanguageBreakEngines != NULL) {
 228         delete fLanguageBreakEngines;
 229         fLanguageBreakEngines = NULL;   // Just rebuild for now
 230     }
 231     // TODO: clone fLanguageBreakEngines from "that"
 232     UErrorCode status = U_ZERO_ERROR;
 233     utext_clone(&fText, &that.fText, FALSE, TRUE, &status);
 234
 235     if (fCharIter != &fSCharIter) {
 236         delete fCharIter;
 237     }
 238     fCharIter = &fSCharIter;
 239
 240     if (that.fCharIter != NULL && that.fCharIter != &that.fSCharIter) {
 241         // This is a little bit tricky - it will intially appear that
 242         //  this->fCharIter is adopted, even if that->fCharIter was
 243         //  not adopted.  That's ok.
 244         fCharIter = that.fCharIter->clone();
 245     }
 246     fSCharIter = that.fSCharIter;
 247     if (fCharIter == NULL) {
 248         fCharIter = &fSCharIter;
 249     }
 250
 251     if (fData != NULL) {
 252         fData->removeReference();
 253         fData = NULL;
 254     }
 255     if (that.fData != NULL) {
 256         fData = that.fData->addReference();
 257     }
 258
 259     delete [] fLatin1Cat;
 260     fLatin1Cat = NULL;
 261
 262     fPosition = that.fPosition;
 263     fRuleStatusIndex = that.fRuleStatusIndex;
 264     fDone = that.fDone;
 265
 266     // TODO: both the dictionary and the main cache need to be copied.
 267     //       Current position could be within a dictionary range. Trying to continue
 268     //       the iteration without the caches present would go to the rules, with
 269     //       the assumption that the current position is on a rule boundary.
 270     fBreakCache->reset(fPosition, fRuleStatusIndex);
 271     fDictionaryCache->reset();
 272
 273     return *this;
 274 }
 275
 276
 277
 278 //-----------------------------------------------------------------------------
 279 //
 280 //    init()      Shared initialization routine.   Used by all the constructors.
 281 //                Initializes all fields, leaving the object in a consistent state.
 282 //
 283 //-----------------------------------------------------------------------------
 284 void RuleBasedBreakIterator::init(UErrorCode &status) {
 285     fCharIter             = NULL;
 286     fData                 = NULL;
 287     fLatin1Cat            = NULL;
 288     fPosition             = 0;
 289     fRuleStatusIndex      = 0;
 290     fDone                 = false;
 291     fDictionaryCharCount  = 0;
 292     fLanguageBreakEngines = NULL;
 293     fUnhandledBreakEngine = NULL;
 294     fBreakCache           = NULL;
 295     fDictionaryCache      = NULL;
 296
 297     // Note: IBM xlC is unable to assign or initialize member fText from UTEXT_INITIALIZER.
 298     // fText                 = UTEXT_INITIALIZER;
 299     static const UText initializedUText = UTEXT_INITIALIZER;
 300     uprv_memcpy(&fText, &initializedUText, sizeof(UText));
 301
 302    if (U_FAILURE(status)) {
 303         return;
 304     }
 305
 306     utext_openUChars(&fText, NULL, 0, &status);
 307     fDictionaryCache = new DictionaryCache(this, status);
 308     fBreakCache      = new BreakCache(this, status);
 309     if (U_SUCCESS(status) && (fDictionaryCache == NULL || fBreakCache == NULL)) {
 310         status = U_MEMORY_ALLOCATION_ERROR;
 311     }
 312
 313 #ifdef RBBI_DEBUG
 314     static UBool debugInitDone = FALSE;
 315     if (debugInitDone == FALSE) {
 316         char *debugEnv = getenv("U_RBBIDEBUG");
 317         if (debugEnv && uprv_strstr(debugEnv, "trace")) {
 318             gTrace = TRUE;
 319         }
 320         debugInitDone = TRUE;
 321     }
 322 #endif
 323 }
 324
 325
 326 void RuleBasedBreakIterator::initLatin1Cat(void) {
 327     fLatin1Cat = new uint16_t[256];
 328     for (UChar32 c = 0; c < 256; ++c) {
 329         fLatin1Cat[c] = UTRIE2_GET16(fData->fTrie, c);
 330     }
 331 }
 332
 333 //-----------------------------------------------------------------------------
 334 //
 335 //    clone - Returns a newly-constructed RuleBasedBreakIterator with the same
 336 //            behavior, and iterating over the same text, as this one.
 337 //            Virtual function: does the right thing with subclasses.
 338 //
 339 //-----------------------------------------------------------------------------
 340 BreakIterator*
 341 RuleBasedBreakIterator::clone(void) const {
 342     return new RuleBasedBreakIterator(*this);
 343 }
 344
 345 /**
 346  * Equality operator.  Returns TRUE if both BreakIterators are of the
 347  * same class, have the same behavior, and iterate over the same text.
 348  */
 349 UBool
 350 RuleBasedBreakIterator::operator==(const BreakIterator& that) const {
 351     if (typeid(*this) != typeid(that)) {
 352         return FALSE;
 353     }
 354     if (this == &that) {
 355         return TRUE;
 356     }
 357
 358     // The base class BreakIterator carries no state that participates in equality,
 359     // and does not implement an equality function that would otherwise be
 360     // checked at this point.
 361
 362     const RuleBasedBreakIterator& that2 = (const RuleBasedBreakIterator&) that;
 363     if (that2.fLineWordOpts != fLineWordOpts) {
 364         return FALSE;
 365     }
 366
 367     if (!utext_equals(&fText, &that2.fText)) {
 368         // The two break iterators are operating on different text,
 369         //   or have a different iteration position.
 370         //   Note that fText's position is always the same as the break iterator's position.
 371         return FALSE;
 372     };
 373
 374     if (!(fPosition == that2.fPosition &&
 375             fRuleStatusIndex == that2.fRuleStatusIndex &&
 376             fDone == that2.fDone)) {
 377         return FALSE;
 378     }
 379
 380     if (that2.fData == fData ||
 381         (fData != NULL && that2.fData != NULL && *that2.fData == *fData)) {
 382             // The two break iterators are using the same rules.
 383             return TRUE;
 384         }
 385     return FALSE;
 386 }
 387
 388 /**
 389  * Compute a hash code for this BreakIterator
 390  * @return A hash code
 391  */
 392 int32_t
 393 RuleBasedBreakIterator::hashCode(void) const {
 394     int32_t   hash = 0;
 395     if (fData != NULL) {
 396         hash = fData->hashCode();
 397     }
 398     return hash;
 399 }
 400
 401
 402 void RuleBasedBreakIterator::setText(UText *ut, UErrorCode &status) {
 403     if (U_FAILURE(status)) {
 404         return;
 405     }
 406     fBreakCache->reset();
 407     fDictionaryCache->reset();
 408     utext_clone(&fText, ut, FALSE, TRUE, &status);
 409
 410     // Set up a dummy CharacterIterator to be returned if anyone
 411     //   calls getText().  With input from UText, there is no reasonable
 412     //   way to return a characterIterator over the actual input text.
 413     //   Return one over an empty string instead - this is the closest
 414     //   we can come to signaling a failure.
 415     //   (GetText() is obsolete, this failure is sort of OK)
 416     fSCharIter.setText(UnicodeString());
 417
 418     if (fCharIter != &fSCharIter) {
 419         // existing fCharIter was adopted from the outside.  Delete it now.
 420         delete fCharIter;
 421     }
 422     fCharIter = &fSCharIter;
 423
 424     this->first();
 425 }
 426
 427
 428 UText *RuleBasedBreakIterator::getUText(UText *fillIn, UErrorCode &status) const {
 429     UText *result = utext_clone(fillIn, &fText, FALSE, TRUE, &status);
 430     return result;
 431 }
 432
 433
 434 //=======================================================================
 435 // BreakIterator overrides
 436 //=======================================================================
 437
 438 /**
 439  * Return a CharacterIterator over the text being analyzed.
 440  */
 441 CharacterIterator&
 442 RuleBasedBreakIterator::getText() const {
 443     return *fCharIter;
 444 }
 445
 446 /**
 447  * Set the iterator to analyze a new piece of text.  This function resets
 448  * the current iteration position to the beginning of the text.
 449  * @param newText An iterator over the text to analyze.
 450  */
 451 void
 452 RuleBasedBreakIterator::adoptText(CharacterIterator* newText) {
 453     // If we are holding a CharacterIterator adopted from a
 454     //   previous call to this function, delete it now.
 455     if (fCharIter != &fSCharIter) {
 456         delete fCharIter;
 457     }
 458
 459     fCharIter = newText;
 460     UErrorCode status = U_ZERO_ERROR;
 461     fBreakCache->reset();
 462     fDictionaryCache->reset();
 463     if (newText==NULL || newText->startIndex() != 0) {
 464         // startIndex !=0 wants to be an error, but there's no way to report it.
 465         // Make the iterator text be an empty string.
 466         utext_openUChars(&fText, NULL, 0, &status);
 467     } else {
 468         utext_openCharacterIterator(&fText, newText, &status);
 469     }
 470     this->first();
 471 }
 472
 473 /**
 474  * Set the iterator to analyze a new piece of text.  This function resets
 475  * the current iteration position to the beginning of the text.
 476  * @param newText An iterator over the text to analyze.
 477  */
 478 void
 479 RuleBasedBreakIterator::setText(const UnicodeString& newText) {
 480     UErrorCode status = U_ZERO_ERROR;
 481     fBreakCache->reset();
 482     fDictionaryCache->reset();
 483     utext_openConstUnicodeString(&fText, &newText, &status);
 484
 485     // Set up a character iterator on the string.
 486     //   Needed in case someone calls getText().
 487     //  Can not, unfortunately, do this lazily on the (probably never)
 488     //  call to getText(), because getText is const.
 489     fSCharIter.setText(newText);
 490
 491     if (fCharIter != &fSCharIter) {
 492         // old fCharIter was adopted from the outside.  Delete it.
 493         delete fCharIter;
 494     }
 495     fCharIter = &fSCharIter;
 496
 497     this->first();
 498 }
 499
 500
 501 /**
 502  *  Provide a new UText for the input text.  Must reference text with contents identical
 503  *  to the original.
 504  *  Intended for use with text data originating in Java (garbage collected) environments
 505  *  where the data may be moved in memory at arbitrary times.
 506  */
 507 RuleBasedBreakIterator &RuleBasedBreakIterator::refreshInputText(UText *input, UErrorCode &status) {
 508     if (U_FAILURE(status)) {
 509         return *this;
 510     }
 511     if (input == NULL) {
 512         status = U_ILLEGAL_ARGUMENT_ERROR;
 513         return *this;
 514     }
 515     int64_t pos = utext_getNativeIndex(&fText);
 516     //  Shallow read-only clone of the new UText into the existing input UText
 517     utext_clone(&fText, input, FALSE, TRUE, &status);
 518     if (U_FAILURE(status)) {
 519         return *this;
 520     }
 521     utext_setNativeIndex(&fText, pos);
 522     if (utext_getNativeIndex(&fText) != pos) {
 523         // Sanity check.  The new input utext is supposed to have the exact same
 524         // contents as the old.  If we can't set to the same position, it doesn't.
 525         // The contents underlying the old utext might be invalid at this point,
 526         // so it's not safe to check directly.
 527         status = U_ILLEGAL_ARGUMENT_ERROR;
 528     }
 529     return *this;
 530 }
 531
 532
 533 /**
 534  * Sets the current iteration position to the beginning of the text, position zero.
 535  * @return The new iterator position, which is zero.
 536  */
 537 int32_t RuleBasedBreakIterator::first(void) {
 538     UErrorCode status = U_ZERO_ERROR;
 539     if (!fBreakCache->seek(0)) {
 540         fBreakCache->populateNear(0, status);
 541     }
 542     fBreakCache->current();
 543     U_ASSERT(fPosition == 0);
 544     return 0;
 545 }
 546
 547 /**
 548  * Sets the current iteration position to the end of the text.
 549  * @return The text's past-the-end offset.
 550  */
 551 int32_t RuleBasedBreakIterator::last(void) {
 552     int32_t endPos = (int32_t)utext_nativeLength(&fText);
 553     UBool endShouldBeBoundary = isBoundary(endPos);      // Has side effect of setting iterator position.
 554     (void)endShouldBeBoundary;
 555     U_ASSERT(endShouldBeBoundary);
 556     U_ASSERT(fPosition == endPos);
 557     return endPos;
 558 }
 559
 560 /**
 561  * Advances the iterator either forward or backward the specified number of steps.
 562  * Negative values move backward, and positive values move forward.  This is
 563  * equivalent to repeatedly calling next() or previous().
 564  * @param n The number of steps to move.  The sign indicates the direction
 565  * (negative is backwards, and positive is forwards).
 566  * @return The character offset of the boundary position n boundaries away from
 567  * the current one.
 568  */
 569 int32_t RuleBasedBreakIterator::next(int32_t n) {
 570     int32_t result = 0;
 571     if (n > 0) {
 572         for (; n > 0 && result != UBRK_DONE; --n) {
 573             result = next();
 574         }
 575     } else if (n < 0) {
 576         for (; n < 0 && result != UBRK_DONE; ++n) {
 577             result = previous();
 578         }
 579     } else {
 580         result = current();
 581     }
 582     return result;
 583 }
 584
 585 /**
 586  * Advances the iterator to the next boundary position.
 587  * @return The position of the first boundary after this one.
 588  */
 589 int32_t RuleBasedBreakIterator::next(void) {
 590     fBreakCache->next();
 591     return fDone ? UBRK_DONE : fPosition;
 592 }
 593
 594 /**
 595  * Move the iterator backwards, to the boundary preceding the current one.
 596  *
 597  *         Starts from the current position within fText.
 598  *         Starting position need not be on a boundary.
 599  *
 600  * @return The position of the boundary position immediately preceding the starting position.
 601  */
 602 int32_t RuleBasedBreakIterator::previous(void) {
 603     UErrorCode status = U_ZERO_ERROR;
 604     fBreakCache->previous(status);
 605     return fDone ? UBRK_DONE : fPosition;
 606 }
 607
 608 /**
 609  * Sets the iterator to refer to the first boundary position following
 610  * the specified position.
 611  * @param startPos The position from which to begin searching for a break position.
 612  * @return The position of the first break after the current position.
 613  */
 614 int32_t RuleBasedBreakIterator::following(int32_t startPos) {
 615     // if the supplied position is before the beginning, return the
 616     // text's starting offset
 617     if (startPos < 0) {
 618         return first();
 619     }
 620
 621     // Move requested offset to a code point start. It might be on a trail surrogate,
 622     // or on a trail byte if the input is UTF-8. Or it may be beyond the end of the text.
 623     utext_setNativeIndex(&fText, startPos);
 624     startPos = (int32_t)utext_getNativeIndex(&fText);
 625
 626     UErrorCode status = U_ZERO_ERROR;
 627     fBreakCache->following(startPos, status);
 628     return fDone ? UBRK_DONE : fPosition;
 629 }
 630
 631 /**
 632  * Sets the iterator to refer to the last boundary position before the
 633  * specified position.
 634  * @param offset The position to begin searching for a break from.
 635  * @return The position of the last boundary before the starting position.
 636  */
 637 int32_t RuleBasedBreakIterator::preceding(int32_t offset) {
 638     if (offset > utext_nativeLength(&fText)) {
 639         return last();
 640     }
 641
 642     // Move requested offset to a code point start. It might be on a trail surrogate,
 643     // or on a trail byte if the input is UTF-8.
 644
 645     utext_setNativeIndex(&fText, offset);
 646     int32_t adjustedOffset = static_cast<int32_t>(utext_getNativeIndex(&fText));
 647
 648     UErrorCode status = U_ZERO_ERROR;
 649     fBreakCache->preceding(adjustedOffset, status);
 650     return fDone ? UBRK_DONE : fPosition;
 651 }
 652
 653 /**
 654  * Returns true if the specfied position is a boundary position.  As a side
 655  * effect, leaves the iterator pointing to the first boundary position at
 656  * or after "offset".
 657  *
 658  * @param offset the offset to check.
 659  * @return True if "offset" is a boundary position.
 660  */
 661 UBool RuleBasedBreakIterator::isBoundary(int32_t offset) {
 662     // out-of-range indexes are never boundary positions
 663     if (offset < 0) {
 664         first();       // For side effects on current position, tag values.
 665         return FALSE;
 666     }
 667
 668     // Adjust offset to be on a code point boundary and not beyond the end of the text.
 669     // Note that isBoundary() is always false for offsets that are not on code point boundaries.
 670     // But we still need the side effect of leaving iteration at the following boundary.
 671
 672     utext_setNativeIndex(&fText, offset);
 673     int32_t adjustedOffset = static_cast<int32_t>(utext_getNativeIndex(&fText));
 674
 675     bool result = false;
 676     UErrorCode status = U_ZERO_ERROR;
 677     if (fBreakCache->seek(adjustedOffset) || fBreakCache->populateNear(adjustedOffset, status)) {
 678         result = (fBreakCache->current() == offset);
 679     }
 680
 681     if (result && adjustedOffset < offset && utext_char32At(&fText, offset) == U_SENTINEL) {
 682         // Original offset is beyond the end of the text. Return FALSE, it's not a boundary,
 683         // but the iteration position remains set to the end of the text, which is a boundary.
 684         return FALSE;
 685     }
 686     if (!result) {
 687         // Not on a boundary. isBoundary() must leave iterator on the following boundary.
 688         // Cache->seek(), above, left us on the preceding boundary, so advance one.
 689         next();
 690     }
 691     return result;
 692 }
 693
 694
 695 /**
 696  * Returns the current iteration position.
 697  * @return The current iteration position.
 698  */
 699 int32_t RuleBasedBreakIterator::current(void) const {
 700     return fPosition;
 701 }
 702
 703
 704 //=======================================================================
 705 // implementation
 706 //=======================================================================
 707
 708 //
 709 // RBBIRunMode  -  the state machine runs an extra iteration at the beginning and end
 710 //                 of user text.  A variable with this enum type keeps track of where we
 711 //                 are.  The state machine only fetches user input while in the RUN mode.
 712 //
 713 enum RBBIRunMode {
 714     RBBI_START,     // state machine processing is before first char of input
 715     RBBI_RUN,       // state machine processing is in the user text
 716     RBBI_END        // state machine processing is after end of user text.
 717 };
 718
 719
 720 // Map from look-ahead break states (corresponds to rules) to boundary positions.
 721 // Allows multiple lookahead break rules to be in flight at the same time.
 722 //
 723 // This is a temporary approach for ICU 57. A better fix is to make the look-ahead numbers
 724 // in the state table be sequential, then we can just index an array. And the
 725 // table could also tell us in advance how big that array needs to be.
 726 //
 727 // Before ICU 57 there was just a single simple variable for a look-ahead match that
 728 // was in progress. Two rules at once did not work.
 729
 730 static const int32_t kMaxLookaheads = 8;
 731 struct LookAheadResults {
 732     int32_t    fUsedSlotLimit;
 733     int32_t    fPositions[8];
 734     int16_t    fKeys[8];
 735
 736     LookAheadResults() : fUsedSlotLimit(0), fPositions(), fKeys() {}
 737
 738     int32_t getPosition(int16_t key) {
 739         for (int32_t i=0; i<fUsedSlotLimit; ++i) {
 740             if (fKeys[i] == key) {
 741                 return fPositions[i];
 742             }
 743         }
 744         // with NLLT source rules, Latn sample and ubrk_next, we see a request for key 79 here
 745         // near the end of text, when setPosition has only ever set positions for key 80 or 82.
 746         //UPRV_UNREACHABLE;
 747         return -1;
 748     }
 749
 750     void setPosition(int16_t key, int32_t position) {
 751         int32_t i;
 752         for (i=0; i<fUsedSlotLimit; ++i) {
 753             if (fKeys[i] == key) {
 754                 fPositions[i] = position;
 755                 return;
 756             }
 757         }
 758         if (i >= kMaxLookaheads) {
 759             UPRV_UNREACHABLE;
 760             i = kMaxLookaheads - 1; // Apple addition
 761         }
 762         fKeys[i] = key;
 763         fPositions[i] = position;
 764         U_ASSERT(fUsedSlotLimit == i);
 765         fUsedSlotLimit = i + 1;
 766     }
 767 };
 768
 769
 770 //-----------------------------------------------------------------------------------
 771 //
 772 //  handleNext()
 773 //     Run the state machine to find a boundary
 774 //
 775 //-----------------------------------------------------------------------------------
 776 // Route handleNext calls through the following to handleNextInternal,
 777 // in order to handle fLineWordOpts.
 778 int32_t RuleBasedBreakIterator::handleNext() {
 779     int32_t result = handleNextInternal();
 780     while (fLineWordOpts != UBRK_LINEWORD_NORMAL) {
 781         UChar32 prevChr = utext_char32At(&fText, result-1);
 782         UChar32 currChr = utext_char32At(&fText, result);
 783         if (currChr == U_SENTINEL || prevChr == U_SENTINEL) {
 784             break;
 785         }
 786         if (fLineWordOpts == UBRK_LINEWORD_KEEP_HANGUL) {
 787             UErrorCode status = U_ZERO_ERROR;
 788             if (uscript_getScript(currChr, &status) != USCRIPT_HANGUL || uscript_getScript(prevChr, &status) != USCRIPT_HANGUL) {
 789                 break;
 790             }
 791         } else {
 792             if (!u_isalpha(currChr) || !u_isalpha(prevChr)) {
 793                 break;
 794             }
 795         }
 796         int32_t nextResult = handleNextInternal();
 797         if (nextResult <= result) {
 798             break;
 799         }
 800         result = nextResult;
 801     }
 802     return result;
 803 }
 804
 805 int32_t RuleBasedBreakIterator::handleNextInternal() {
 806     int32_t             state;
 807     uint16_t            category        = 0;
 808     RBBIRunMode         mode;
 809
 810     RBBIStateTableRow  *row;
 811     UChar32             c;
 812     LookAheadResults    lookAheadMatches;
 813     int32_t             result             = 0;
 814     int32_t             initialPosition    = 0;
 815     const RBBIStateTable *statetable       = fData->fForwardTable;
 816     const char         *tableData          = statetable->fTableData;
 817     uint32_t            tableRowLen        = statetable->fRowLen;
 818     #ifdef RBBI_DEBUG
 819         if (gTrace) {
 820             RBBIDebugPuts("Handle Next   pos   char  state category");
 821         }
 822     #endif
 823
 824     // handleNext alway sets the break tag value.
 825     // Set the default for it.
 826     fRuleStatusIndex = 0;
 827
 828     fDictionaryCharCount = 0;
 829
 830     // if we're already at the end of the text, return DONE.
 831     initialPosition = fPosition;
 832     UTEXT_SETNATIVEINDEX(&fText, initialPosition);
 833     result          = initialPosition;
 834     c               = UTEXT_NEXT32(&fText);
 835     if (c==U_SENTINEL) {
 836         fDone = TRUE;
 837         return UBRK_DONE;
 838     }
 839
 840     //  Set the initial state for the state machine
 841     state = START_STATE;
 842     row = (RBBIStateTableRow *)
 843             //(statetable->fTableData + (statetable->fRowLen * state));
 844             (tableData + tableRowLen * state);
 845
 846
 847     mode     = RBBI_RUN;
 848     if (statetable->fFlags & RBBI_BOF_REQUIRED) {
 849         category = 2;
 850         mode     = RBBI_START;
 851     }
 852
 853
 854     // loop until we reach the end of the text or transition to state 0
 855     //
 856     for (;;) {
 857         if (c == U_SENTINEL) {
 858             // Reached end of input string.
 859             if (mode == RBBI_END) {
 860                 // We have already run the loop one last time with the
 861                 //   character set to the psueudo {eof} value.  Now it is time
 862                 //   to unconditionally bail out.
 863                 break;
 864             }
 865             // Run the loop one last time with the fake end-of-input character category.
 866             mode = RBBI_END;
 867             category = 1;
 868         }
 869
 870         //
 871         // Get the char category.  An incoming category of 1 or 2 means that
 872         //      we are preset for doing the beginning or end of input, and
 873         //      that we shouldn't get a category from an actual text input character.
 874         //
 875         if (mode == RBBI_RUN) {
 876             // look up the current character's character category, which tells us
 877             // which column in the state table to look at.
 878             // Note:  the 16 in UTRIE_GET16 refers to the size of the data being returned,
 879             //        not the size of the character going in, which is a UChar32.
 880             //
 881             category = (fLatin1Cat!=NULL && c<0x100)? fLatin1Cat[c]: UTRIE2_GET16(fData->fTrie, c);
 882
 883             // Check the dictionary bit in the character's category.
 884             //    Counter is only used by dictionary based iteration.
 885             //    Chars that need to be handled by a dictionary have a flag bit set
 886             //    in their category values.
 887             //
 888             if ((category & 0x4000) != 0)  {
 889                 fDictionaryCharCount++;
 890                 //  And off the dictionary flag bit.
 891                 category &= ~0x4000;
 892             }
 893         }
 894
 895        #ifdef RBBI_DEBUG
 896             if (gTrace) {
 897                 RBBIDebugPrintf("             %4" PRId64 "   ", utext_getNativeIndex(&fText));
 898                 if (0x20<=c && c<0x7f) {
 899                     RBBIDebugPrintf("\"%c\"  ", c);
 900                 } else {
 901                     RBBIDebugPrintf("%5x  ", c);
 902                 }
 903                 RBBIDebugPrintf("%3d  %3d\n", state, category);
 904             }
 905         #endif
 906
 907         // State Transition - move machine to its next state
 908         //
 909
 910         // fNextState is a variable-length array.
 911         U_ASSERT(category<fData->fHeader->fCatCount);
 912         state = row->fNextState[category];  /*Not accessing beyond memory*/
 913         row = (RBBIStateTableRow *)
 914             // (statetable->fTableData + (statetable->fRowLen * state));
 915             (tableData + tableRowLen * state);
 916
 917
 918         if (row->fAccepting == -1) {
 919             // Match found, common case.
 920             if (mode != RBBI_START) {
 921                 result = (int32_t)UTEXT_GETNATIVEINDEX(&fText);
 922             }
 923             fRuleStatusIndex = row->fTagIdx;   // Remember the break status (tag) values.
 924         }
 925
 926         int16_t completedRule = row->fAccepting;
 927         if (completedRule > 0) {
 928             // Lookahead match is completed.
 929             int32_t lookaheadResult = lookAheadMatches.getPosition(completedRule);
 930             if (lookaheadResult >= 0) {
 931                 fRuleStatusIndex = row->fTagIdx;
 932                 fPosition = lookaheadResult;
 933                 return lookaheadResult;
 934             }
 935         }
 936         int16_t rule = row->fLookAhead;
 937         if (rule != 0) {
 938             // At the position of a '/' in a look-ahead match. Record it.
 939             int32_t  pos = (int32_t)UTEXT_GETNATIVEINDEX(&fText);
 940             lookAheadMatches.setPosition(rule, pos);
 941         }
 942
 943         if (state == STOP_STATE) {
 944             // This is the normal exit from the lookup state machine.
 945             // We have advanced through the string until it is certain that no
 946             //   longer match is possible, no matter what characters follow.
 947             break;
 948         }
 949
 950         // Advance to the next character.
 951         // If this is a beginning-of-input loop iteration, don't advance
 952         //    the input position.  The next iteration will be processing the
 953         //    first real input character.
 954         if (mode == RBBI_RUN) {
 955             c = UTEXT_NEXT32(&fText);
 956         } else {
 957             if (mode == RBBI_START) {
 958                 mode = RBBI_RUN;
 959             }
 960         }
 961     }
 962
 963     // The state machine is done.  Check whether it found a match...
 964
 965     // If the iterator failed to advance in the match engine, force it ahead by one.
 966     //   (This really indicates a defect in the break rules.  They should always match
 967     //    at least one character.)
 968     if (result == initialPosition) {
 969         utext_setNativeIndex(&fText, initialPosition);
 970         utext_next32(&fText);
 971         result = (int32_t)utext_getNativeIndex(&fText);
 972         fRuleStatusIndex = 0;
 973     }
 974
 975     // Leave the iterator at our result position.
 976     fPosition = result;
 977     #ifdef RBBI_DEBUG
 978         if (gTrace) {
 979             RBBIDebugPrintf("result = %d\n\n", result);
 980         }
 981     #endif
 982     return result;
 983 }
 984
 985
 986 //-----------------------------------------------------------------------------------
 987 //
 988 //  handleSafePrevious()
 989 //
 990 //      Iterate backwards using the safe reverse rules.
 991 //      The logic of this function is similar to handleNext(), but simpler
 992 //      because the safe table does not require as many options.
 993 //
 994 //-----------------------------------------------------------------------------------
 995 int32_t RuleBasedBreakIterator::handleSafePrevious(int32_t fromPosition) {
 996     int32_t             state;
 997     uint16_t            category        = 0;
 998     RBBIStateTableRow  *row;
 999     UChar32             c;
1000     int32_t             result          = 0;
1001
1002     const RBBIStateTable *stateTable = fData->fReverseTable;
1003     UTEXT_SETNATIVEINDEX(&fText, fromPosition);
1004     #ifdef RBBI_DEBUG
1005         if (gTrace) {
1006             RBBIDebugPuts("Handle Previous   pos   char  state category");
1007         }
1008     #endif
1009
1010     // if we're already at the start of the text, return DONE.
1011     if (fData == NULL || UTEXT_GETNATIVEINDEX(&fText)==0) {
1012         return BreakIterator::DONE;
1013     }
1014
1015     //  Set the initial state for the state machine
1016     c = UTEXT_PREVIOUS32(&fText);
1017     state = START_STATE;
1018     row = (RBBIStateTableRow *)
1019             (stateTable->fTableData + (stateTable->fRowLen * state));
1020
1021     // loop until we reach the start of the text or transition to state 0
1022     //
1023     for (; c != U_SENTINEL; c = UTEXT_PREVIOUS32(&fText)) {
1024
1025         // look up the current character's character category, which tells us
1026         // which column in the state table to look at.
1027         // Note:  the 16 in UTRIE_GET16 refers to the size of the data being returned,
1028         //        not the size of the character going in, which is a UChar32.
1029         //
1030         //  And off the dictionary flag bit. For reverse iteration it is not used.
1031         category = UTRIE2_GET16(fData->fTrie, c);
1032         category &= ~0x4000;
1033
1034         #ifdef RBBI_DEBUG
1035             if (gTrace) {
1036                 RBBIDebugPrintf("             %4d   ", (int32_t)utext_getNativeIndex(&fText));
1037                 if (0x20<=c && c<0x7f) {
1038                     RBBIDebugPrintf("\"%c\"  ", c);
1039                 } else {
1040                     RBBIDebugPrintf("%5x  ", c);
1041                 }
1042                 RBBIDebugPrintf("%3d  %3d\n", state, category);
1043             }
1044         #endif
1045
1046         // State Transition - move machine to its next state
1047         //
1048         // fNextState is a variable-length array.
1049         U_ASSERT(category<fData->fHeader->fCatCount);
1050         state = row->fNextState[category];  /*Not accessing beyond memory*/
1051         row = (RBBIStateTableRow *)
1052             (stateTable->fTableData + (stateTable->fRowLen * state));
1053
1054         if (state == STOP_STATE) {
1055             // This is the normal exit from the lookup state machine.
1056             // Transistion to state zero means we have found a safe point.
1057             break;
1058         }
1059     }
1060
1061     // The state machine is done.  Check whether it found a match...
1062     result = (int32_t)UTEXT_GETNATIVEINDEX(&fText);
1063     #ifdef RBBI_DEBUG
1064         if (gTrace) {
1065             RBBIDebugPrintf("result = %d\n\n", result);
1066         }
1067     #endif
1068     return result;
1069 }
1070
1071 //-------------------------------------------------------------------------------
1072 //
1073 //   getRuleStatus()   Return the break rule tag associated with the current
1074 //                     iterator position.  If the iterator arrived at its current
1075 //                     position by iterating forwards, the value will have been
1076 //                     cached by the handleNext() function.
1077 //
1078 //-------------------------------------------------------------------------------
1079
1080 int32_t  RuleBasedBreakIterator::getRuleStatus() const {
1081
1082     // fLastRuleStatusIndex indexes to the start of the appropriate status record
1083     //                                                 (the number of status values.)
1084     //   This function returns the last (largest) of the array of status values.
1085     int32_t  idx = fRuleStatusIndex + fData->fRuleStatusTable[fRuleStatusIndex];
1086     int32_t  tagVal = fData->fRuleStatusTable[idx];
1087
1088     return tagVal;
1089 }
1090
1091
1092 int32_t RuleBasedBreakIterator::getRuleStatusVec(
1093              int32_t *fillInVec, int32_t capacity, UErrorCode &status) {
1094     if (U_FAILURE(status)) {
1095         return 0;
1096     }
1097
1098     int32_t  numVals = fData->fRuleStatusTable[fRuleStatusIndex];
1099     int32_t  numValsToCopy = numVals;
1100     if (numVals > capacity) {
1101         status = U_BUFFER_OVERFLOW_ERROR;
1102         numValsToCopy = capacity;
1103     }
1104     int i;
1105     for (i=0; i<numValsToCopy; i++) {
1106         fillInVec[i] = fData->fRuleStatusTable[fRuleStatusIndex + i + 1];
1107     }
1108     return numVals;
1109 }
1110
1111 // Apple custom addition
1112 int32_t RuleBasedBreakIterator::tokenize(int32_t maxTokens, RuleBasedTokenRange *outTokenRanges, unsigned long *outTokenFlags)
1113 {
1114     if (fDone) {
1115         return 0;
1116     }
1117     RuleBasedTokenRange *outTokenLimit = outTokenRanges + maxTokens;
1118     RuleBasedTokenRange *outTokenP = outTokenRanges;
1119     int32_t lastOffset = fPosition;
1120     while (outTokenP < outTokenLimit) {
1121         // start portion from inlining populateFollowing()
1122         int32_t pos = 0;
1123         int32_t ruleStatusIdx = 0;
1124         int32_t startPos = fPosition;
1125
1126         if (fDictionaryCache->following(startPos, &pos, &ruleStatusIdx)) {
1127             fPosition = pos;
1128             fRuleStatusIndex = ruleStatusIdx;
1129         } else {
1130             pos = handleNextInternal(); // sets fRuleStatusIndex for the pos it returns, updates fPosition
1131             if (pos == UBRK_DONE) {
1132                 // fDone = TRUE; already set by handleNextInternal
1133                 break;
1134             }
1135             // Use current result from handleNextInternal(), including fRuleStatusIndex,
1136             // unless overridden by dictionary subdivisions
1137             fPosition = pos;
1138             if (fDictionaryCharCount > 0) {
1139                 // The text segment obtained from the rules includes dictionary characters.
1140                 // Subdivide it, with subdivided results going into the dictionary cache.
1141                 fDictionaryCache->populateDictionary(startPos, pos, fRuleStatusIndex, fRuleStatusIndex);
1142                 if (fDictionaryCache->following(startPos, &pos, &ruleStatusIdx)) {
1143                     fPosition = pos;
1144                     fRuleStatusIndex = ruleStatusIdx;
1145                 }
1146             }
1147         }
1148         // end portion from inlining populateFollowing()
1149         int32_t flagCount = fData->fRuleStatusTable[fRuleStatusIndex];
1150         const int32_t* flagPtr = fData->fRuleStatusTable + fRuleStatusIndex + flagCount;
1151         int32_t flagSet = *flagPtr; // if -1 then skip token
1152         if (flagSet != -1) {
1153             outTokenP->location = lastOffset;
1154             outTokenP++->length = fPosition - lastOffset;
1155             if (outTokenFlags) {
1156                 // flagSet should be the OR of all flags returned by getRuleStatusVec;
1157                 // here we collect from high-order to low-order.
1158                 while (--flagCount > 0) {
1159                    flagSet |=  *--flagPtr;
1160                 }
1161                 *outTokenFlags++ = (unsigned long)flagSet;
1162             }
1163         }
1164         lastOffset = fPosition;
1165     }
1166     return (outTokenP - outTokenRanges);
1167 }
1168
1169 //-------------------------------------------------------------------------------
1170 //
1171 //   getBinaryRules        Access to the compiled form of the rules,
1172 //                         for use by build system tools that save the data
1173 //                         for standard iterator types.
1174 //
1175 //-------------------------------------------------------------------------------
1176 const uint8_t  *RuleBasedBreakIterator::getBinaryRules(uint32_t &length) {
1177     const uint8_t  *retPtr = NULL;
1178     length = 0;
1179
1180     if (fData != NULL) {
1181         retPtr = (const uint8_t *)fData->fHeader;
1182         length = fData->fHeader->fLength;
1183     }
1184     return retPtr;
1185 }
1186
1187
1188 BreakIterator *  RuleBasedBreakIterator::createBufferClone(void * /*stackBuffer*/,
1189                                    int32_t &bufferSize,
1190                                    UErrorCode &status)
1191 {
1192     if (U_FAILURE(status)){
1193         return NULL;
1194     }
1195
1196     if (bufferSize == 0) {
1197         bufferSize = 1;  // preflighting for deprecated functionality
1198         return NULL;
1199     }
1200
1201     BreakIterator *clonedBI = clone();
1202     if (clonedBI == NULL) {
1203         status = U_MEMORY_ALLOCATION_ERROR;
1204     } else {
1205         status = U_SAFECLONE_ALLOCATED_WARNING;
1206     }
1207     return (RuleBasedBreakIterator *)clonedBI;
1208 }
1209
1210 U_NAMESPACE_END
1211
1212
1213 static icu::UStack *gLanguageBreakFactories = nullptr;
1214 static const icu::UnicodeString *gEmptyString = nullptr;
1215 static icu::UInitOnce gLanguageBreakFactoriesInitOnce = U_INITONCE_INITIALIZER;
1216 static icu::UInitOnce gRBBIInitOnce = U_INITONCE_INITIALIZER;
1217
1218 /**
1219  * Release all static memory held by breakiterator.
1220  */
1221 U_CDECL_BEGIN
1222 static UBool U_CALLCONV rbbi_cleanup(void) {
1223     delete gLanguageBreakFactories;
1224     gLanguageBreakFactories = nullptr;
1225     delete gEmptyString;
1226     gEmptyString = nullptr;
1227     gLanguageBreakFactoriesInitOnce.reset();
1228     gRBBIInitOnce.reset();
1229     return TRUE;
1230 }
1231 U_CDECL_END
1232
1233 U_CDECL_BEGIN
1234 static void U_CALLCONV _deleteFactory(void *obj) {
1235     delete (icu::LanguageBreakFactory *) obj;
1236 }
1237 U_CDECL_END
1238 U_NAMESPACE_BEGIN
1239
1240 static void U_CALLCONV rbbiInit() {
1241     gEmptyString = new UnicodeString();
1242     ucln_common_registerCleanup(UCLN_COMMON_RBBI, rbbi_cleanup);
1243 }
1244
1245 static void U_CALLCONV initLanguageFactories() {
1246     UErrorCode status = U_ZERO_ERROR;
1247     U_ASSERT(gLanguageBreakFactories == NULL);
1248     gLanguageBreakFactories = new UStack(_deleteFactory, NULL, status);
1249     if (gLanguageBreakFactories != NULL && U_SUCCESS(status)) {
1250         ICULanguageBreakFactory *builtIn = new ICULanguageBreakFactory(status);
1251         gLanguageBreakFactories->push(builtIn, status);
1252 #ifdef U_LOCAL_SERVICE_HOOK
1253         LanguageBreakFactory *extra = (LanguageBreakFactory *)uprv_svc_hook("languageBreakFactory", &status);
1254         if (extra != NULL) {
1255             gLanguageBreakFactories->push(extra, status);
1256         }
1257 #endif
1258     }
1259     ucln_common_registerCleanup(UCLN_COMMON_RBBI, rbbi_cleanup);
1260 }
1261
1262
1263 static const LanguageBreakEngine*
1264 getLanguageBreakEngineFromFactory(UChar32 c)
1265 {
1266     umtx_initOnce(gLanguageBreakFactoriesInitOnce, &initLanguageFactories);
1267     if (gLanguageBreakFactories == NULL) {
1268         return NULL;
1269     }
1270
1271     int32_t i = gLanguageBreakFactories->size();
1272     const LanguageBreakEngine *lbe = NULL;
1273     while (--i >= 0) {
1274         LanguageBreakFactory *factory = (LanguageBreakFactory *)(gLanguageBreakFactories->elementAt(i));
1275         lbe = factory->getEngineFor(c);
1276         if (lbe != NULL) {
1277             break;
1278         }
1279     }
1280     return lbe;
1281 }
1282
1283
1284 //-------------------------------------------------------------------------------
1285 //
1286 //  getLanguageBreakEngine  Find an appropriate LanguageBreakEngine for the
1287 //                          the character c.
1288 //
1289 //-------------------------------------------------------------------------------
1290 const LanguageBreakEngine *
1291 RuleBasedBreakIterator::getLanguageBreakEngine(UChar32 c) {
1292     const LanguageBreakEngine *lbe = NULL;
1293     UErrorCode status = U_ZERO_ERROR;
1294
1295     if (fLanguageBreakEngines == NULL) {
1296         fLanguageBreakEngines = new UStack(status);
1297         if (fLanguageBreakEngines == NULL || U_FAILURE(status)) {
1298             delete fLanguageBreakEngines;
1299             fLanguageBreakEngines = 0;
1300             return NULL;
1301         }
1302     }
1303
1304     int32_t i = fLanguageBreakEngines->size();
1305     while (--i >= 0) {
1306         lbe = (const LanguageBreakEngine *)(fLanguageBreakEngines->elementAt(i));
1307         if (lbe->handles(c)) {
1308             return lbe;
1309         }
1310     }
1311
1312     // No existing dictionary took the character. See if a factory wants to
1313     // give us a new LanguageBreakEngine for this character.
1314     lbe = getLanguageBreakEngineFromFactory(c);
1315
1316     // If we got one, use it and push it on our stack.
1317     if (lbe != NULL) {
1318         fLanguageBreakEngines->push((void *)lbe, status);
1319         // Even if we can't remember it, we can keep looking it up, so
1320         // return it even if the push fails.
1321         return lbe;
1322     }
1323
1324     // No engine is forthcoming for this character. Add it to the
1325     // reject set. Create the reject break engine if needed.
1326     if (fUnhandledBreakEngine == NULL) {
1327         fUnhandledBreakEngine = new UnhandledEngine(status);
1328         if (U_SUCCESS(status) && fUnhandledBreakEngine == NULL) {
1329             status = U_MEMORY_ALLOCATION_ERROR;
1330             return nullptr;
1331         }
1332         // Put it last so that scripts for which we have an engine get tried
1333         // first.
1334         fLanguageBreakEngines->insertElementAt(fUnhandledBreakEngine, 0, status);
1335         // If we can't insert it, or creation failed, get rid of it
1336         if (U_FAILURE(status)) {
1337             delete fUnhandledBreakEngine;
1338             fUnhandledBreakEngine = 0;
1339             return NULL;
1340         }
1341     }
1342
1343     // Tell the reject engine about the character; at its discretion, it may
1344     // add more than just the one character.
1345     fUnhandledBreakEngine->handleCharacter(c);
1346
1347     return fUnhandledBreakEngine;
1348 }
1349
1350 void RuleBasedBreakIterator::dumpCache() {
1351     fBreakCache->dumpCache();
1352 }
1353
1354 void RuleBasedBreakIterator::dumpTables() {
1355     fData->printData();
1356 }
1357
1358 /**
1359  * Returns the description used to create this iterator
1360  */
1361
1362 const UnicodeString&
1363 RuleBasedBreakIterator::getRules() const {
1364     if (fData != NULL) {
1365         return fData->getRuleSourceString();
1366     } else {
1367         umtx_initOnce(gRBBIInitOnce, &rbbiInit);
1368         return *gEmptyString;
1369     }
1370 }
1371
1372 U_NAMESPACE_END
1373
1374 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */