icuSources/common/rbbi.cpp

   1 /*
   2 ***************************************************************************
   3 *   Copyright (C) 1999-2004 International Business Machines Corporation   *
   4 *   and others. All rights reserved.                                      *
   5 ***************************************************************************
   6 */
   7 //
   8 //  file:  rbbi.c    Contains the implementation of the rule based break iterator
   9 //                   runtime engine and the API implementation for
  10 //                   class RuleBasedBreakIterator
  11 //
  12
  13 #include "unicode/utypes.h"
  14
  15 #if !UCONFIG_NO_BREAK_ITERATION
  16
  17 #include "unicode/rbbi.h"
  18 #include "unicode/schriter.h"
  19 #include "unicode/udata.h"
  20 #include "unicode/uclean.h"
  21 #include "rbbidata.h"
  22 #include "rbbirb.h"
  23 #include "cmemory.h"
  24 #include "cstring.h"
  25
  26 #include "uassert.h"
  27
  28 U_NAMESPACE_BEGIN
  29
  30
  31 static const int16_t START_STATE = 1;     // The state number of the starting state
  32 static const int16_t STOP_STATE  = 0;     // The state-transition value indicating "stop"
  33
  34
  35 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedBreakIterator)
  36
  37
  38 //=======================================================================
  39 // constructors
  40 //=======================================================================
  41
  42 /**
  43  * Constructs a RuleBasedBreakIterator that uses the already-created
  44  * tables object that is passed in as a parameter.
  45  */
  46 RuleBasedBreakIterator::RuleBasedBreakIterator(RBBIDataHeader* data, UErrorCode &status)
  47 {
  48     init();
  49     fData = new RBBIDataWrapper(data, status); // status checked in constructor
  50     if (U_FAILURE(status)) {return;}
  51     if(fData == 0) {
  52         status = U_MEMORY_ALLOCATION_ERROR;
  53         return;
  54     }
  55 }
  56
  57 //-------------------------------------------------------------------------------
  58 //
  59 //   Constructor   from a UDataMemory handle to precompiled break rules
  60 //                 stored in an ICU data file.
  61 //
  62 //-------------------------------------------------------------------------------
  63 RuleBasedBreakIterator::RuleBasedBreakIterator(UDataMemory* udm, UErrorCode &status)
  64 {
  65     init();
  66     fData = new RBBIDataWrapper(udm, status); // status checked in constructor
  67     if (U_FAILURE(status)) {return;}
  68     if(fData == 0) {
  69         status = U_MEMORY_ALLOCATION_ERROR;
  70         return;
  71     }
  72 }
  73
  74
  75
  76 //-------------------------------------------------------------------------------
  77 //
  78 //   Constructor       from a set of rules supplied as a string.
  79 //
  80 //-------------------------------------------------------------------------------
  81 RuleBasedBreakIterator::RuleBasedBreakIterator( const UnicodeString  &rules,
  82                                                 UParseError          &parseError,
  83                                                 UErrorCode           &status)
  84 {
  85     u_init(&status);      // Just in case ICU is not yet initialized
  86     init();
  87     if (U_FAILURE(status)) {return;}
  88     RuleBasedBreakIterator *bi = (RuleBasedBreakIterator *)
  89         RBBIRuleBuilder::createRuleBasedBreakIterator(rules, parseError, status);
  90     // Note:  This is a bit awkward.  The RBBI ruleBuilder has a factory method that
  91     //        creates and returns a complete RBBI.  From here, in a constructor, we
  92     //        can't just return the object created by the builder factory, hence
  93     //        the assignment of the factory created object to "this".
  94     if (U_SUCCESS(status)) {
  95         *this = *bi;
  96         delete bi;
  97     }
  98 }
  99
 100
 101 //-------------------------------------------------------------------------------
 102 //
 103 // Default Constructor.      Create an empty shell that can be set up later.
 104 //                           Used when creating a RuleBasedBreakIterator from a set
 105 //                           of rules.
 106 //-------------------------------------------------------------------------------
 107 RuleBasedBreakIterator::RuleBasedBreakIterator() {
 108     init();
 109 }
 110
 111
 112 //-------------------------------------------------------------------------------
 113 //
 114 //   Copy constructor.  Will produce a break iterator with the same behavior,
 115 //                      and which iterates over the same text, as the one passed in.
 116 //
 117 //-------------------------------------------------------------------------------
 118 RuleBasedBreakIterator::RuleBasedBreakIterator(const RuleBasedBreakIterator& other)
 119 : BreakIterator(other)
 120 {
 121     this->init();
 122     *this = other;
 123 }
 124
 125
 126 /**
 127  * Destructor
 128  */
 129 RuleBasedBreakIterator::~RuleBasedBreakIterator() {
 130     delete fText;
 131     fText = NULL;
 132     if (fData != NULL) {
 133         fData->removeReference();
 134         fData = NULL;
 135     }
 136 }
 137
 138 /**
 139  * Assignment operator.  Sets this iterator to have the same behavior,
 140  * and iterate over the same text, as the one passed in.
 141  */
 142 RuleBasedBreakIterator&
 143 RuleBasedBreakIterator::operator=(const RuleBasedBreakIterator& that) {
 144     if (this == &that) {
 145         return *this;
 146     }
 147     delete fText;
 148     fText = NULL;
 149     if (that.fText != NULL) {
 150         fText = that.fText->clone();
 151     }
 152
 153     if (fData != NULL) {
 154         fData->removeReference();
 155         fData = NULL;
 156     }
 157     if (that.fData != NULL) {
 158         fData = that.fData->addReference();
 159     }
 160     fTrace = that.fTrace;
 161
 162     return *this;
 163 }
 164
 165
 166
 167 //-----------------------------------------------------------------------------
 168 //
 169 //    init()      Shared initialization routine.   Used by all the constructors.
 170 //                Initializes all fields, leaving the object in a consistent state.
 171 //
 172 //-----------------------------------------------------------------------------
 173 UBool RuleBasedBreakIterator::fTrace = FALSE;
 174 void RuleBasedBreakIterator::init() {
 175
 176     fText                 = NULL;
 177     fData                 = NULL;
 178     fLastRuleStatusIndex  = 0;
 179     fLastStatusIndexValid = TRUE;
 180     fDictionaryCharCount  = 0;
 181
 182 #ifdef RBBI_DEBUG
 183     static UBool debugInitDone = FALSE;
 184     if (debugInitDone == FALSE) {
 185         char *debugEnv = getenv("U_RBBIDEBUG");
 186         if (debugEnv && uprv_strstr(debugEnv, "trace")) {
 187             fTrace = TRUE;
 188         }
 189         debugInitDone = TRUE;
 190     }
 191 #endif
 192 }
 193
 194
 195
 196 //-----------------------------------------------------------------------------
 197 //
 198 //    clone - Returns a newly-constructed RuleBasedBreakIterator with the same
 199 //            behavior, and iterating over the same text, as this one.
 200 //            Virtual function: does the right thing with subclasses.
 201 //
 202 //-----------------------------------------------------------------------------
 203 BreakIterator*
 204 RuleBasedBreakIterator::clone(void) const {
 205     return new RuleBasedBreakIterator(*this);
 206 }
 207
 208 /**
 209  * Equality operator.  Returns TRUE if both BreakIterators are of the
 210  * same class, have the same behavior, and iterate over the same text.
 211  */
 212 UBool
 213 RuleBasedBreakIterator::operator==(const BreakIterator& that) const {
 214     UBool r = FALSE;
 215     if (that.getDynamicClassID() != getDynamicClassID()) {
 216         return r;
 217     }
 218
 219     const RuleBasedBreakIterator& that2 = (const RuleBasedBreakIterator&) that;
 220     if (fText == that2.fText ||
 221         (fText != NULL && that2.fText != NULL && *that2.fText == *fText)) {
 222         if (that2.fData == fData ||
 223             (fData != NULL && that2.fData != NULL && *that2.fData == *fData)) {
 224             r = TRUE;
 225         }
 226     }
 227     return r;
 228 }
 229
 230 /**
 231  * Compute a hash code for this BreakIterator
 232  * @return A hash code
 233  */
 234 int32_t
 235 RuleBasedBreakIterator::hashCode(void) const {
 236     int32_t   hash = 0;
 237     if (fData != NULL) {
 238         hash = fData->hashCode();
 239     }
 240     return hash;
 241 }
 242
 243 /**
 244  * Returns the description used to create this iterator
 245  */
 246 const UnicodeString&
 247 RuleBasedBreakIterator::getRules() const {
 248     if (fData != NULL) {
 249         return fData->getRuleSourceString();
 250     } else {
 251         static const UnicodeString *s;
 252         if (s == NULL) {
 253             // TODO:  something more elegant here.
 254             //        perhaps API should return the string by value.
 255             //        Note:  thread unsafe init & leak are semi-ok, better than
 256             //               what was before.  Sould be cleaned up, though.
 257             s = new UnicodeString;
 258         }
 259         return *s;
 260     }
 261 }
 262
 263 //=======================================================================
 264 // BreakIterator overrides
 265 //=======================================================================
 266
 267 /**
 268  * Return a CharacterIterator over the text being analyzed.  This version
 269  * of this method returns the actual CharacterIterator we're using internally.
 270  * Changing the state of this iterator can have undefined consequences.  If
 271  * you need to change it, clone it first.
 272  * @return An iterator over the text being analyzed.
 273  */
 274 const CharacterIterator&
 275 RuleBasedBreakIterator::getText() const {
 276     RuleBasedBreakIterator* nonConstThis = (RuleBasedBreakIterator*)this;
 277
 278     // The iterator is initialized pointing to no text at all, so if this
 279     // function is called while we're in that state, we have to fudge an
 280     // an iterator to return.
 281     if (nonConstThis->fText == NULL) {
 282         nonConstThis->fText = new StringCharacterIterator(UnicodeString());
 283     }
 284     return *nonConstThis->fText;
 285 }
 286
 287 /**
 288  * Set the iterator to analyze a new piece of text.  This function resets
 289  * the current iteration position to the beginning of the text.
 290  * @param newText An iterator over the text to analyze.
 291  */
 292 void
 293 RuleBasedBreakIterator::adoptText(CharacterIterator* newText) {
 294     reset();
 295     delete fText;
 296     fText = newText;
 297     this->first();
 298 }
 299
 300 /**
 301  * Set the iterator to analyze a new piece of text.  This function resets
 302  * the current iteration position to the beginning of the text.
 303  * @param newText An iterator over the text to analyze.
 304  */
 305 void
 306 RuleBasedBreakIterator::setText(const UnicodeString& newText) {
 307     reset();
 308     if (fText != NULL && fText->getDynamicClassID()
 309             == StringCharacterIterator::getStaticClassID()) {
 310         ((StringCharacterIterator*)fText)->setText(newText);
 311     }
 312     else {
 313         delete fText;
 314         fText = new StringCharacterIterator(newText);
 315     }
 316     this->first();
 317 }
 318
 319
 320
 321 /**
 322  * Sets the current iteration position to the beginning of the text.
 323  * (i.e., the CharacterIterator's starting offset).
 324  * @return The offset of the beginning of the text.
 325  */
 326 int32_t RuleBasedBreakIterator::first(void) {
 327     reset();
 328     fLastRuleStatusIndex  = 0;
 329     fLastStatusIndexValid = TRUE;
 330     if (fText == NULL)
 331         return BreakIterator::DONE;
 332
 333     fText->first();
 334     return fText->getIndex();
 335 }
 336
 337 /**
 338  * Sets the current iteration position to the end of the text.
 339  * (i.e., the CharacterIterator's ending offset).
 340  * @return The text's past-the-end offset.
 341  */
 342 int32_t RuleBasedBreakIterator::last(void) {
 343     reset();
 344     if (fText == NULL) {
 345         fLastRuleStatusIndex  = 0;
 346         fLastStatusIndexValid = TRUE;
 347         return BreakIterator::DONE;
 348     }
 349
 350     // I'm not sure why, but t.last() returns the offset of the last character,
 351     // rather than the past-the-end offset
 352     //
 353     //   (It's so a loop like for(p=it.last(); p!=DONE; p=it.previous()) ...
 354     //     will work correctly.)
 355
 356
 357     fLastStatusIndexValid = FALSE;
 358     int32_t pos = fText->endIndex();
 359     fText->setIndex(pos);
 360
 361     return pos;
 362 }
 363
 364 /**
 365  * Advances the iterator either forward or backward the specified number of steps.
 366  * Negative values move backward, and positive values move forward.  This is
 367  * equivalent to repeatedly calling next() or previous().
 368  * @param n The number of steps to move.  The sign indicates the direction
 369  * (negative is backwards, and positive is forwards).
 370  * @return The character offset of the boundary position n boundaries away from
 371  * the current one.
 372  */
 373 int32_t RuleBasedBreakIterator::next(int32_t n) {
 374     int32_t result = current();
 375     while (n > 0) {
 376         result = handleNext();
 377         --n;
 378     }
 379     while (n < 0) {
 380         result = previous();
 381         ++n;
 382     }
 383     return result;
 384 }
 385
 386 /**
 387  * Advances the iterator to the next boundary position.
 388  * @return The position of the first boundary after this one.
 389  */
 390 int32_t RuleBasedBreakIterator::next(void) {
 391     return handleNext();
 392 }
 393
 394 /**
 395  * Advances the iterator backwards, to the last boundary preceding this one.
 396  * @return The position of the last boundary position preceding this one.
 397  */
 398 int32_t RuleBasedBreakIterator::previous(void) {
 399     // if we're already sitting at the beginning of the text, return DONE
 400     if (fText == NULL || current() == fText->startIndex()) {
 401         fLastRuleStatusIndex  = 0;
 402         fLastStatusIndexValid = TRUE;
 403         return BreakIterator::DONE;
 404     }
 405
 406     if (fData->fSafeRevTable != NULL || fData->fSafeFwdTable != NULL) {
 407         return handlePrevious(fData->fReverseTable);
 408     }
 409
 410     // old rule syntax
 411     // set things up.  handlePrevious() will back us up to some valid
 412     // break position before the current position (we back our internal
 413     // iterator up one step to prevent handlePrevious() from returning
 414     // the current position), but not necessarily the last one before
 415     // where we started
 416
 417     int32_t start = current();
 418
 419     fText->previous32();
 420     int32_t lastResult    = handlePrevious();
 421     int32_t result        = lastResult;
 422     int32_t lastTag       = 0;
 423     UBool   breakTagValid = FALSE;
 424
 425     // iterate forward from the known break position until we pass our
 426     // starting point.  The last break position before the starting
 427     // point is our return value
 428
 429     for (;;) {
 430         result         = handleNext();
 431         if (result == BreakIterator::DONE || result >= start) {
 432             break;
 433         }
 434         lastResult     = result;
 435         lastTag        = fLastRuleStatusIndex;
 436         breakTagValid  = TRUE;
 437     }
 438
 439     // fLastBreakTag wants to have the value for section of text preceding
 440     // the result position that we are to return (in lastResult.)  If
 441     // the backwards rules overshot and the above loop had to do two or more
 442     // handleNext()s to move up to the desired return position, we will have a valid
 443     // tag value. But, if handlePrevious() took us to exactly the correct result positon,
 444     // we wont have a tag value for that position, which is only set by handleNext().
 445
 446     // set the current iteration position to be the last break position
 447     // before where we started, and then return that value
 448     fText->setIndex(lastResult);
 449     fLastRuleStatusIndex  = lastTag;       // for use by getRuleStatus()
 450     fLastStatusIndexValid = breakTagValid;
 451     return lastResult;
 452 }
 453
 454 /**
 455  * Sets the iterator to refer to the first boundary position following
 456  * the specified position.
 457  * @offset The position from which to begin searching for a break position.
 458  * @return The position of the first break after the current position.
 459  */
 460 int32_t RuleBasedBreakIterator::following(int32_t offset) {
 461     // if the offset passed in is already past the end of the text,
 462     // just return DONE; if it's before the beginning, return the
 463     // text's starting offset
 464     fLastRuleStatusIndex  = 0;
 465     fLastStatusIndexValid = TRUE;
 466     if (fText == NULL || offset >= fText->endIndex()) {
 467         last();
 468         return next();
 469     }
 470     else if (offset < fText->startIndex()) {
 471         return first();
 472     }
 473
 474     // otherwise, set our internal iteration position (temporarily)
 475     // to the position passed in.  If this is the _beginning_ position,
 476     // then we can just use next() to get our return value
 477
 478     int32_t result = 0;
 479
 480     if (fData->fSafeRevTable != NULL) {
 481         // new rule syntax
 482         /// todo synwee
 483         fText->setIndex(offset);
 484         // move forward one codepoint to prepare for moving back to a
 485         // safe point.
 486         // this handles offset being between a supplementary character
 487         fText->next32();
 488         // handlePrevious will move most of the time to < 1 boundary away
 489         handlePrevious(fData->fSafeRevTable);
 490         int32_t result = next();
 491         while (result <= offset) {
 492             result = next();
 493         }
 494         return result;
 495     }
 496     if (fData->fSafeFwdTable != NULL) {
 497         // backup plan if forward safe table is not available
 498         fText->setIndex(offset);
 499         fText->previous32();
 500         // handle next will give result >= offset
 501         handleNext(fData->fSafeFwdTable);
 502         // previous will give result 0 or 1 boundary away from offset,
 503         // most of the time
 504         // we have to
 505         int32_t oldresult = previous();
 506         while (oldresult > offset) {
 507             int32_t result = previous();
 508             if (result <= offset) {
 509                 return oldresult;
 510             }
 511             oldresult = result;
 512         }
 513         int32_t result = next();
 514         if (result <= offset) {
 515             return next();
 516         }
 517         return result;
 518     }
 519     // otherwise, we have to sync up first.  Use handlePrevious() to back
 520     // us up to a known break position before the specified position (if
 521     // we can determine that the specified position is a break position,
 522     // we don't back up at all).  This may or may not be the last break
 523     // position at or before our starting position.  Advance forward
 524     // from here until we've passed the starting position.  The position
 525     // we stop on will be the first break position after the specified one.
 526     // old rule syntax
 527
 528     fText->setIndex(offset);
 529     if (offset == fText->startIndex()) {
 530         return handleNext();
 531     }
 532     result = previous();
 533
 534     while (result != BreakIterator::DONE && result <= offset) {
 535         result = next();
 536     }
 537
 538     return result;
 539 }
 540
 541 /**
 542  * Sets the iterator to refer to the last boundary position before the
 543  * specified position.
 544  * @offset The position to begin searching for a break from.
 545  * @return The position of the last boundary before the starting position.
 546  */
 547 int32_t RuleBasedBreakIterator::preceding(int32_t offset) {
 548     // if the offset passed in is already past the end of the text,
 549     // just return DONE; if it's before the beginning, return the
 550
 551     // text's starting offset
 552     if (fText == NULL || offset > fText->endIndex()) {
 553         // return BreakIterator::DONE;
 554         return last();
 555     }
 556     else if (offset < fText->startIndex()) {
 557         return first();
 558     }
 559
 560     // if we start by updating the current iteration position to the
 561     // position specified by the caller, we can just use previous()
 562     // to carry out this operation
 563
 564     if (fData->fSafeFwdTable != NULL) {
 565         /// todo synwee
 566         // new rule syntax
 567         fText->setIndex(offset);
 568         // move backwards one codepoint to prepare for moving forwards to a
 569         // safe point.
 570         // this handles offset being between a supplementary character
 571         // TODO:  would it be better to just check for being in the middle of a surrogate pair,
 572         //        rather than adjusting the position unconditionally?
 573         //        (Change would interact with safe rules.)
 574         fText->previous32();
 575         handleNext(fData->fSafeFwdTable);
 576         int32_t result = fText->getIndex();
 577         while (result >= offset) {
 578             result = previous();
 579         }
 580         return result;
 581     }
 582     if (fData->fSafeRevTable != NULL) {
 583         // backup plan if forward safe table is not available
 584         fText->setIndex(offset);
 585         fText->next32();
 586         // handle previous will give result <= offset
 587         handlePrevious(fData->fSafeRevTable);
 588
 589         // next will give result 0 or 1 boundary away from offset,
 590         // most of the time
 591         // we have to
 592         int32_t oldresult = next();
 593         while (oldresult < offset) {
 594             int32_t result = next();
 595             if (result >= offset) {
 596                 return oldresult;
 597             }
 598             oldresult = result;
 599         }
 600         int32_t result = previous();
 601         if (result >= offset) {
 602             return previous();
 603         }
 604         return result;
 605     }
 606
 607     // old rule syntax
 608     fText->setIndex(offset);
 609     return previous();
 610 }
 611
 612 /**
 613  * Returns true if the specfied position is a boundary position.  As a side
 614  * effect, leaves the iterator pointing to the first boundary position at
 615  * or after "offset".
 616  * @param offset the offset to check.
 617  * @return True if "offset" is a boundary position.
 618  */
 619 UBool RuleBasedBreakIterator::isBoundary(int32_t offset) {
 620     // the beginning index of the iterator is always a boundary position by definition
 621     if (fText == NULL || offset == fText->startIndex()) {
 622         first();       // For side effects on current position, tag values.
 623         return TRUE;
 624     }
 625
 626     if (offset == fText->endIndex()) {
 627         last();       // For side effects on current position, tag values.
 628         return TRUE;
 629     }
 630
 631     // out-of-range indexes are never boundary positions
 632     if (offset < fText->startIndex()) {
 633         first();       // For side effects on current position, tag values.
 634         return FALSE;
 635     }
 636
 637     if (offset > fText->endIndex()) {
 638         last();        // For side effects on current position, tag values.
 639         return FALSE;
 640     }
 641
 642     // otherwise, we can use following() on the position before the specified
 643     // one and return true if the position we get back is the one the user
 644     // specified
 645     return following(offset - 1) == offset;
 646 }
 647
 648 /**
 649  * Returns the current iteration position.
 650  * @return The current iteration position.
 651  */
 652 int32_t RuleBasedBreakIterator::current(void) const {
 653     return (fText != NULL) ? fText->getIndex() : BreakIterator::DONE;
 654 }
 655
 656 //=======================================================================
 657 // implementation
 658 //=======================================================================
 659
 660
 661 //-----------------------------------------------------------------------------------
 662 //
 663 //  handleNext()
 664 //     This method is the actual implementation of the next() method.  All iteration
 665 //     vectors through here.  This method initializes the state machine to state 1
 666 //     and advances through the text character by character until we reach the end
 667 //     of the text or the state machine transitions to state 0.  We update our return
 668 //     value every time the state machine passes through an accepting state.
 669 //
 670 //-----------------------------------------------------------------------------------
 671 int32_t RuleBasedBreakIterator::handleNext() {
 672     return handleNext(fData->fForwardTable);
 673 }
 674
 675 int32_t RuleBasedBreakIterator::handleNext(const RBBIStateTable *statetable) {
 676     if (fTrace) {
 677         RBBIDebugPuts("Handle Next   pos   char  state category");
 678     }
 679
 680     // No matter what, handleNext alway correctly sets the break tag value.
 681     fLastStatusIndexValid = TRUE;
 682
 683     // if we're already at the end of the text, return DONE.
 684     if (fText == NULL || fData == NULL || fText->hasNext() == FALSE) {
 685         fLastRuleStatusIndex = 0;
 686         return BreakIterator::DONE;
 687     }
 688
 689     int32_t initialPosition = fText->getIndex();
 690     int32_t result          = initialPosition;
 691     int32_t lookaheadResult = 0;
 692
 693     // Initialize the state machine.  Begin in state 1
 694     int32_t            state           = START_STATE;
 695     int16_t            category;
 696     UChar32            c               = fText->current32();
 697     RBBIStateTableRow *row;
 698     int32_t            lookaheadStatus = 0;
 699     int32_t            lookaheadTagIdx = 0;
 700
 701     fLastRuleStatusIndex = 0;
 702
 703     row = (RBBIStateTableRow *)    // Point to starting row of state table.
 704         (statetable->fTableData + (statetable->fRowLen * state));
 705
 706     // Character Category fetch for starting character.
 707     //    See comments on character category code within loop, below.
 708     UTRIE_GET16(&fData->fTrie, c, category);
 709     if ((category & 0x4000) != 0)  {
 710           fDictionaryCharCount++;
 711           category &= ~0x4000;
 712         }
 713
 714     // loop until we reach the end of the text or transition to state 0
 715     for (;;) {
 716         if (c == CharacterIterator::DONE && fText->hasNext()==FALSE) {
 717             // Reached end of input string.
 718             //    Note: CharacterIterator::DONE is 0xffff, which is also a legal
 719             //          character value.  Check for DONE first, because it's quicker,
 720             //          but also need to check fText->hasNext() to be certain.
 721
 722             if (lookaheadResult > result) {
 723                 // We ran off the end of the string with a pending look-ahead match.
 724                 // Treat this as if the look-ahead condition had been met, and return
 725                 //  the match at the / position from the look-ahead rule.
 726                 result               = lookaheadResult;
 727                 fLastRuleStatusIndex = lookaheadTagIdx;
 728                 lookaheadStatus = 0;
 729             } else if (result == initialPosition) {
 730                 // Ran off end, no match found.
 731                 // move forward one
 732                 fText->setIndex(initialPosition);
 733                 fText->next32();
 734                 fText->getIndex();
 735             }
 736             break;
 737         }
 738         // look up the current character's character category, which tells us
 739         // which column in the state table to look at.
 740         // Note:  the 16 in UTRIE_GET16 refers to the size of the data being returned,
 741         //        not the size of the character going in, which is a UChar32.
 742         //
 743         UTRIE_GET16(&fData->fTrie, c, category);
 744
 745         // Check the dictionary bit in the character's category.
 746         //    Counter is only used by dictionary based iterators (subclasses).
 747         //    Chars that need to be handled by a dictionary have a flag bit set
 748         //    in their category values.
 749         //
 750         if ((category & 0x4000) != 0)  {
 751             fDictionaryCharCount++;
 752             //  And off the dictionary flag bit.
 753             category &= ~0x4000;
 754         }
 755
 756         #ifdef RBBI_DEBUG
 757             if (fTrace) {
 758                 RBBIDebugPrintf("             %4d   ", fText->getIndex());
 759                 if (0x20<=c && c<0x7f) {
 760                     RBBIDebugPrintf("\"%c\"  ", c);
 761                 } else {
 762                     RBBIDebugPrintf("%5x  ", c);
 763                 }
 764                 RBBIDebugPrintf("%3d  %3d\n", state, category);
 765             }
 766         #endif
 767
 768         // look up a state transition in the state table
 769         state = row->fNextState[category];
 770         row = (RBBIStateTableRow *)
 771             (statetable->fTableData + (statetable->fRowLen * state));
 772
 773         // Get the next character.  Doing it here positions the iterator
 774         //    to the correct position for recording matches in the code that
 775         //    follows.
 776         c = fText->next32();
 777
 778         if (row->fAccepting == -1) {
 779             // Match found, common case, could have lookahead so we move on to check it
 780             result = fText->getIndex();
 781             /// added
 782             fLastRuleStatusIndex = row->fTagIdx;   // Remember the break status (tag) values.
 783         }
 784
 785         if (row->fLookAhead != 0) {
 786             if (lookaheadStatus != 0
 787                 && row->fAccepting == lookaheadStatus) {
 788                 // Lookahead match is completed.  Set the result accordingly, but only
 789                 // if no other rule has matched further in the mean time.
 790                 result               = lookaheadResult;
 791                 fLastRuleStatusIndex = lookaheadTagIdx;
 792                 lookaheadStatus      = 0;
 793                 /// i think we have to back up to read the lookahead character again
 794                 /// fText->setIndex(lookaheadResult);
 795                 /// TODO: this is a simple hack since reverse rules only have simple
 796                 /// lookahead rules that we can definitely break out from.
 797                 /// we need to make the lookahead rules not chain eventually.
 798                 /// return result;
 799                 /// this is going to be the longest match again
 800                 goto continueOn;
 801             }
 802
 803             int32_t  r = fText->getIndex();
 804             lookaheadResult = r;
 805             lookaheadStatus = row->fLookAhead;
 806             lookaheadTagIdx = row->fTagIdx;
 807             goto continueOn;
 808         }
 809
 810
 811         if (row->fAccepting == 0) {
 812             // No match, nothing of interest happening, common case.
 813             goto continueOn;
 814         }
 815
 816         lookaheadStatus = 0;           // clear out any pending look-ahead matches.
 817
 818 continueOn:
 819         if (state == STOP_STATE) {
 820             // This is the normal exit from the lookup state machine.
 821             // We have advanced through the string until it is certain that no
 822             //   longer match is possible, no matter what characters follow.
 823             break;
 824         }
 825     }
 826
 827     // The state machine is done.  Check whether it found a match...
 828
 829     // If the iterator failed to advance in the match engine, force it ahead by one.
 830     //   (This really indicates a defect in the break rules.  They should always match
 831     //    at least one character.)
 832     if (result == initialPosition) {
 833         result = fText->setIndex(initialPosition);
 834         fText ->next32();
 835         result = fText->getIndex();
 836     }
 837
 838     // Leave the iterator at our result position.
 839     fText->setIndex(result);
 840     if (fTrace) {
 841         RBBIDebugPrintf("result = %d\n\n", result);
 842     }
 843     return result;
 844 }
 845
 846
 847 //----------------------------------------------------------------
 848 //
 849 //   handlePrevious(void)     This is the variant used with old style rules
 850 //                            (Overshoot to a safe point, then move forward)
 851 //
 852 //----------------------------------------------------------------
 853 int32_t RuleBasedBreakIterator::handlePrevious(void) {
 854     if (fText == NULL || fData == NULL) {
 855         return 0;
 856     }
 857     if (fData->fReverseTable == NULL) {
 858         return fText->setToStart();
 859     }
 860
 861     int32_t            state           = START_STATE;
 862     int32_t            category;
 863     int32_t            lastCategory    = 0;
 864     int32_t            result          = fText->getIndex();
 865     int32_t            lookaheadStatus = 0;
 866     int32_t            lookaheadResult = 0;
 867     int32_t            lookaheadTagIdx = 0;
 868     UChar32            c               = fText->current32();
 869     RBBIStateTableRow *row;
 870
 871     row = (RBBIStateTableRow *)
 872         (this->fData->fReverseTable->fTableData + (state * fData->fReverseTable->fRowLen));
 873     UTRIE_GET16(&fData->fTrie, c, category);
 874     if ((category & 0x4000) != 0)  {
 875         fDictionaryCharCount++;
 876         category &= ~0x4000;
 877     }
 878
 879     if (fTrace) {
 880         RBBIDebugPuts("Handle Prev   pos   char  state category");
 881     }
 882
 883     // loop until we reach the beginning of the text or transition to state 0
 884     for (;;) {
 885         if (c == CharacterIterator::DONE && fText->hasPrevious()==FALSE) {
 886             break;
 887         }
 888
 889         // save the last character's category and look up the current
 890         // character's category
 891         lastCategory = category;
 892         UTRIE_GET16(&fData->fTrie, c, category);
 893
 894         // Check the dictionary bit in the character's category.
 895         //    Counter is only used by dictionary based iterators.
 896         //
 897         if ((category & 0x4000) != 0)  {
 898             fDictionaryCharCount++;
 899             category &= ~0x4000;
 900         }
 901
 902         #ifdef RBBI_DEBUG
 903             if (fTrace) {
 904                 RBBIDebugPrintf("             %4d   ", fText->getIndex());
 905                 if (0x20<=c && c<0x7f) {
 906                     RBBIDebugPrintf("\"%c\"  ", c);
 907                 } else {
 908                     RBBIDebugPrintf("%5x  ", c);
 909                 }
 910                 RBBIDebugPrintf("%3d  %3d\n", state, category);
 911             }
 912         #endif
 913
 914         // look up a state transition in the backwards state table
 915         state = row->fNextState[category];
 916         row = (RBBIStateTableRow *)
 917             (this->fData->fReverseTable->fTableData + (state * fData->fReverseTable->fRowLen));
 918
 919         if (row->fAccepting == 0 && row->fLookAhead == 0) {
 920             // No match, nothing of interest happening, common case.
 921             goto continueOn;
 922         }
 923
 924         if (row->fAccepting == -1) {
 925             // Match found, common case, no lookahead involved.
 926             result = fText->getIndex();
 927             lookaheadStatus = 0;     // clear out any pending look-ahead matches.
 928             goto continueOn;
 929         }
 930
 931         if (row->fAccepting == 0 && row->fLookAhead != 0) {
 932             // Lookahead match point.  Remember it, but only if no other rule
 933             //                         has unconditionally matched to this point.
 934             // TODO:  handle case where there's a pending match from a different rule
 935             //        where lookaheadStatus != 0  && lookaheadStatus != row->fLookAhead.
 936             int32_t  r = fText->getIndex();
 937             if (r > result) {
 938                 lookaheadResult = r;
 939                 lookaheadStatus = row->fLookAhead;
 940                 lookaheadTagIdx = row->fTagIdx;
 941             }
 942             goto continueOn;
 943         }
 944
 945         if (row->fAccepting != 0 && row->fLookAhead != 0) {
 946             // Lookahead match is completed.  Set the result accordingly, but only
 947             //   if no other rule has matched further in the mean time.
 948             if (lookaheadResult > result) {
 949                 U_ASSERT(row->fAccepting == lookaheadStatus);   // TODO:  handle this case
 950                 //    of overlapping lookahead matches.
 951                 result               = lookaheadResult;
 952                 fLastRuleStatusIndex = lookaheadTagIdx;
 953                 lookaheadStatus = 0;
 954             }
 955             goto continueOn;
 956         }
 957
 958 continueOn:
 959         if (state == STOP_STATE) {
 960             break;
 961         }
 962
 963         // then advance one character backwards
 964         c = fText->previous32();
 965     }
 966
 967     // Note:  the result postion isn't what is returned to the user by previous(),
 968     //        but where the implementation of previous() turns around and
 969     //        starts iterating forward again.
 970     if (c == CharacterIterator::DONE && fText->hasPrevious()==FALSE) {
 971         result = fText->startIndex();
 972     }
 973     fText->setIndex(result);
 974
 975     return result;
 976 }
 977
 978
 979 //-----------------------------------------------------------------------------------
 980 //
 981 //  handlePrevious()
 982 //
 983 //      This method backs the iterator back up to a "safe position" in the text.
 984 //      This is a position that we know, without any context, may be any position
 985 //      not more than 2 breaks away. Occasionally, the position may be less than
 986 //      one break away.
 987 //      The various calling methods then iterate forward from this safe position to
 988 //      the appropriate position to return.
 989 //
 990 //      The logic of this function is very similar to handleNext(), above.
 991 //
 992 //-----------------------------------------------------------------------------------
 993 int32_t RuleBasedBreakIterator::handlePrevious(const RBBIStateTable *statetable) {
 994     if (fText == NULL || statetable == NULL) {
 995         return 0;
 996     }
 997     // break tag is no longer valid after icu switched to exact backwards
 998     // positioning.
 999     fLastStatusIndexValid = FALSE;
1000     if (statetable == NULL) {
1001         return fText->setToStart();
1002     }
1003
1004     int32_t            state              = START_STATE;
1005     int32_t            category;
1006     int32_t            lastCategory       = 0;
1007     UBool              hasPassedStartText = !fText->hasPrevious();
1008     UChar32            c                  = fText->previous32();
1009     // previous character
1010     int32_t            result             = fText->getIndex();
1011     int32_t            lookaheadStatus    = 0;
1012     int32_t            lookaheadResult    = 0;
1013     int32_t            lookaheadTagIdx    = 0;
1014     UBool              lookAheadHardBreak = (statetable->fFlags & RBBI_LOOKAHEAD_HARD_BREAK) != 0;
1015
1016     RBBIStateTableRow *row;
1017
1018     row = (RBBIStateTableRow *)
1019         (statetable->fTableData + (state * statetable->fRowLen));
1020     UTRIE_GET16(&fData->fTrie, c, category);
1021     if ((category & 0x4000) != 0)  {
1022         fDictionaryCharCount++;
1023         category &= ~0x4000;
1024     }
1025
1026     if (fTrace) {
1027         RBBIDebugPuts("Handle Prev   pos   char  state category");
1028     }
1029
1030     // loop until we reach the beginning of the text or transition to state 0
1031     for (;;) {
1032         // if (c == CharacterIterator::DONE && fText->hasPrevious()==FALSE) {
1033         if (hasPassedStartText) {
1034             // if we have already considered the start of the text
1035             if (row->fLookAhead != 0 && lookaheadResult == 0) {
1036                 result = 0;
1037             }
1038             break;
1039         }
1040
1041         // save the last character's category and look up the current
1042         // character's category
1043         lastCategory = category;
1044         UTRIE_GET16(&fData->fTrie, c, category);
1045
1046         // Check the dictionary bit in the character's category.
1047         //    Counter is only used by dictionary based iterators.
1048         //
1049         if ((category & 0x4000) != 0)  {
1050             fDictionaryCharCount++;
1051             category &= ~0x4000;
1052         }
1053
1054         #ifdef RBBI_DEBUG
1055             if (fTrace) {
1056                 RBBIDebugPrintf("             %4d   ", fText->getIndex());
1057                 if (0x20<=c && c<0x7f) {
1058                     RBBIDebugPrintf("\"%c\"  ", c);
1059                 } else {
1060                     RBBIDebugPrintf("%5x  ", c);
1061                 }
1062                 RBBIDebugPrintf("%3d  %3d\n", state, category);
1063             }
1064         #endif
1065
1066         // look up a state transition in the backwards state table
1067         state = row->fNextState[category];
1068         row = (RBBIStateTableRow *)
1069             (statetable->fTableData + (state * statetable->fRowLen));
1070
1071         if (row->fAccepting == -1) {
1072             // Match found, common case, could have lookahead so we move on to check it
1073             result = fText->getIndex();
1074             /// added
1075             fLastRuleStatusIndex   = row->fTagIdx;   // Remember the break status (tag) value.
1076         }
1077
1078         if (row->fLookAhead != 0) {
1079             if (lookaheadStatus != 0
1080                 && row->fAccepting == lookaheadStatus) {
1081                 // Lookahead match is completed.  Set the result accordingly, but only
1082                 // if no other rule has matched further in the mean time.
1083                 result               = lookaheadResult;
1084                 fLastRuleStatusIndex = lookaheadTagIdx;
1085                 lookaheadStatus      = 0;
1086                 /// i think we have to back up to read the lookahead character again
1087                 /// fText->setIndex(lookaheadResult);
1088                 /// TODO: this is a simple hack since reverse rules only have simple
1089                 /// lookahead rules that we can definitely break out from.
1090                 /// we need to make the lookahead rules not chain eventually.
1091                 /// return result;
1092                 /// this is going to be the longest match again
1093
1094                 /// syn wee todo hard coded for line breaks stuff
1095                 /// needs to provide a tag in rules to ensure a stop.
1096
1097                 if (lookAheadHardBreak) {
1098                     fText->setIndex(result);
1099                     return result;
1100                 }
1101                 category = lastCategory;
1102                 fText->setIndex(result);
1103
1104                 goto continueOn;
1105             }
1106
1107             int32_t    r         = fText->getIndex();
1108             lookaheadResult      = r;
1109             lookaheadStatus      = row->fLookAhead;
1110             fLastRuleStatusIndex = row->fTagIdx;
1111             goto continueOn;
1112         }
1113
1114         // not lookahead
1115         if (row->fAccepting == 0) {
1116             // No match, nothing of interest happening, common case.
1117             goto continueOn;
1118         }
1119
1120         lookaheadStatus = 0;     // clear out any pending look-ahead matches.
1121
1122 continueOn:
1123         if (state == STOP_STATE) {
1124             break;
1125         }
1126
1127         // then advance one character backwards
1128         hasPassedStartText = !fText->hasPrevious();
1129         c = fText->previous32();
1130     }
1131
1132     // Note:  the result postion isn't what is returned to the user by previous(),
1133     //        but where the implementation of previous() turns around and
1134     //        starts iterating forward again.
1135     fText->setIndex(result);
1136
1137     return result;
1138 }
1139
1140
1141 void
1142 RuleBasedBreakIterator::reset()
1143 {
1144     // Base-class version of this function is a no-op.
1145     // Subclasses may override with their own reset behavior.
1146 }
1147
1148
1149
1150 //-------------------------------------------------------------------------------
1151 //
1152 //   getRuleStatus()   Return the break rule tag associated with the current
1153 //                     iterator position.  If the iterator arrived at its current
1154 //                     position by iterating forwards, the value will have been
1155 //                     cached by the handleNext() function.
1156 //
1157 //                     If no cached status value is available, the status is
1158 //                     found by doing a previous() followed by a next(), which
1159 //                     leaves the iterator where it started, and computes the
1160 //                     status while doing the next().
1161 //
1162 //-------------------------------------------------------------------------------
1163 void RuleBasedBreakIterator::makeRuleStatusValid() {
1164     if (fLastStatusIndexValid == FALSE) {
1165         //  No cached status is available.
1166         if (fText == NULL || current() == fText->startIndex()) {
1167             //  At start of text, or there is no text.  Status is always zero.
1168             fLastRuleStatusIndex = 0;
1169             fLastStatusIndexValid = TRUE;
1170         } else {
1171             //  Not at start of text.  Find status the tedious way.
1172             int32_t pa = current();
1173             previous();
1174             int32_t pb = next();
1175             if (pa != pb) {
1176                 // note: the if (pa != pb) test is here only to eliminate warnings for
1177                 //       unused local variables on gcc.  Logically, it isn't needed.
1178                 U_ASSERT(pa == pb);
1179             }
1180         }
1181     }
1182     U_ASSERT(fLastStatusIndexValid == TRUE);
1183     U_ASSERT(fLastRuleStatusIndex >= 0  &&  fLastRuleStatusIndex < fData->fStatusMaxIdx);
1184 }
1185
1186
1187 int32_t  RuleBasedBreakIterator::getRuleStatus() const {
1188     RuleBasedBreakIterator *nonConstThis  = (RuleBasedBreakIterator *)this;
1189     nonConstThis->makeRuleStatusValid();
1190
1191     // fLastRuleStatusIndex indexes to the start of the appropriate status record
1192     //                                                 (the number of status values.)
1193     //   This function returns the last (largest) of the array of status values.
1194     int32_t  idx = fLastRuleStatusIndex + fData->fRuleStatusTable[fLastRuleStatusIndex];
1195     int32_t  tagVal = fData->fRuleStatusTable[idx];
1196
1197     return tagVal;
1198 }
1199
1200
1201
1202
1203 int32_t RuleBasedBreakIterator::getRuleStatusVec(
1204              int32_t *fillInVec, int32_t capacity, UErrorCode &status)
1205 {
1206     if (U_FAILURE(status)) {
1207         return 0;
1208     }
1209
1210     RuleBasedBreakIterator *nonConstThis  = (RuleBasedBreakIterator *)this;
1211     nonConstThis->makeRuleStatusValid();
1212     int32_t  numVals = fData->fRuleStatusTable[fLastRuleStatusIndex];
1213     int32_t  numValsToCopy = numVals;
1214     if (numVals > capacity) {
1215         status = U_BUFFER_OVERFLOW_ERROR;
1216         numValsToCopy = capacity;
1217     }
1218     int i;
1219     for (i=0; i<numValsToCopy; i++) {
1220         fillInVec[i] = fData->fRuleStatusTable[fLastRuleStatusIndex + i + 1];
1221     }
1222     return numVals;
1223 }
1224
1225
1226
1227 //-------------------------------------------------------------------------------
1228 //
1229 //   getBinaryRules        Access to the compiled form of the rules,
1230 //                         for use by build system tools that save the data
1231 //                         for standard iterator types.
1232 //
1233 //-------------------------------------------------------------------------------
1234 const uint8_t  *RuleBasedBreakIterator::getBinaryRules(uint32_t &length) {
1235     const uint8_t  *retPtr = NULL;
1236     length = 0;
1237
1238     if (fData != NULL) {
1239         retPtr = (const uint8_t *)fData->fHeader;
1240         length = fData->fHeader->fLength;
1241     }
1242     return retPtr;
1243 }
1244
1245
1246
1247
1248 //-------------------------------------------------------------------------------
1249 //
1250 //  BufferClone       TODO:  In my (Andy) opinion, this function should be deprecated.
1251 //                    Saving one heap allocation isn't worth the trouble.
1252 //                    Cloning shouldn't be done in tight loops, and
1253 //                    making the clone copy involves other heap operations anyway.
1254 //                    And the application code for correctly dealing with buffer
1255 //                    size problems and the eventual object destruction is ugly.
1256 //
1257 //-------------------------------------------------------------------------------
1258 BreakIterator *  RuleBasedBreakIterator::createBufferClone(void *stackBuffer,
1259                                    int32_t &bufferSize,
1260                                    UErrorCode &status)
1261 {
1262     if (U_FAILURE(status)){
1263         return NULL;
1264     }
1265
1266     //
1267     //  If user buffer size is zero this is a preflight operation to
1268     //    obtain the needed buffer size, allowing for worst case misalignment.
1269     //
1270     if (bufferSize == 0) {
1271         bufferSize = sizeof(RuleBasedBreakIterator) + U_ALIGNMENT_OFFSET_UP(0);
1272         return NULL;
1273     }
1274
1275
1276     //
1277     //  Check the alignment and size of the user supplied buffer.
1278     //  Allocate heap memory if the user supplied memory is insufficient.
1279     //
1280     char    *buf   = (char *)stackBuffer;
1281     uint32_t s      = bufferSize;
1282
1283     if (stackBuffer == NULL) {
1284         s = 0;   // Ignore size, force allocation if user didn't give us a buffer.
1285     }
1286     if (U_ALIGNMENT_OFFSET(stackBuffer) != 0) {
1287         uint32_t offsetUp = (uint32_t)U_ALIGNMENT_OFFSET_UP(buf);
1288         s   -= offsetUp;
1289         buf += offsetUp;
1290     }
1291     if (s < sizeof(RuleBasedBreakIterator)) {
1292         buf = (char *) new RuleBasedBreakIterator;
1293         if (buf == 0) {
1294             status = U_MEMORY_ALLOCATION_ERROR;
1295             return NULL;
1296         }
1297         status = U_SAFECLONE_ALLOCATED_WARNING;
1298     }
1299
1300     //
1301     //  Clone the object.
1302     //    TODO:  using an overloaded operator new to directly initialize the
1303     //           copy in the user's buffer would be better, but it doesn't seem
1304     //           to get along with namespaces.  Investigate why.
1305     //
1306     //           The memcpy is only safe with an empty (default constructed)
1307     //           break iterator.  Use on others can screw up reference counts
1308     //           to data.  memcpy-ing objects is not really a good idea...
1309     //
1310     RuleBasedBreakIterator localIter;        // Empty break iterator, source for memcpy
1311     RuleBasedBreakIterator *clone = (RuleBasedBreakIterator *)buf;
1312     uprv_memcpy(clone, &localIter, sizeof(RuleBasedBreakIterator)); // clone = empty, but initialized, iterator.
1313     *clone = *this;                          // clone = the real one we want.
1314     if (status != U_SAFECLONE_ALLOCATED_WARNING) {
1315         clone->fBufferClone = TRUE;
1316     }
1317
1318     return clone;
1319 }
1320
1321
1322
1323 //-------------------------------------------------------------------------------
1324 //
1325 //  isDictionaryChar      Return true if the category lookup for this char
1326 //                        indicates that it is in the set of dictionary lookup
1327 //                        chars.
1328 //
1329 //                        This function is intended for use by dictionary based
1330 //                        break iterators.
1331 //
1332 //-------------------------------------------------------------------------------
1333 UBool RuleBasedBreakIterator::isDictionaryChar(UChar32   c) {
1334     if (fData == NULL) {
1335         return FALSE;
1336     }
1337     uint16_t category;
1338     UTRIE_GET16(&fData->fTrie, c, category);
1339     return (category & 0x4000) != 0;
1340 }
1341
1342
1343
1344 U_NAMESPACE_END
1345
1346 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */