2 *************************************************************************** 
   3 *   Copyright (C) 1999-2012 International Business Machines Corporation 
   4 *   and others. All rights reserved. 
   5 *************************************************************************** 
   8 //  file:  rbbi.c    Contains the implementation of the rule based break iterator 
   9 //                   runtime engine and the API implementation for 
  10 //                   class RuleBasedBreakIterator 
  13 #include <typeinfo>  // for 'typeid' to work 
  15 #include "unicode/utypes.h" 
  17 #if !UCONFIG_NO_BREAK_ITERATION 
  19 #include "unicode/rbbi.h" 
  20 #include "unicode/schriter.h" 
  21 #include "unicode/uchriter.h" 
  22 #include "unicode/udata.h" 
  23 #include "unicode/uclean.h" 
  35 // if U_LOCAL_SERVICE_HOOK is defined, then localsvc.cpp is expected to be included. 
  36 #if U_LOCAL_SERVICE_HOOK 
  41 static UBool fTrace 
= FALSE
; 
  46 // The state number of the starting state 
  49 // The state-transition value indicating "stop" 
  53 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedBreakIterator
) 
  56 //======================================================================= 
  58 //======================================================================= 
  61  * Constructs a RuleBasedBreakIterator that uses the already-created 
  62  * tables object that is passed in as a parameter. 
  64 RuleBasedBreakIterator::RuleBasedBreakIterator(RBBIDataHeader
* data
, UErrorCode 
&status
) 
  67     fData 
= new RBBIDataWrapper(data
, status
); // status checked in constructor 
  68     if (U_FAILURE(status
)) {return;} 
  70         status 
= U_MEMORY_ALLOCATION_ERROR
; 
  76  * Same as above but does not adopt memory 
  78 RuleBasedBreakIterator::RuleBasedBreakIterator(const RBBIDataHeader
* data
, enum EDontAdopt
, UErrorCode 
&status
) 
  81     fData 
= new RBBIDataWrapper(data
, RBBIDataWrapper::kDontAdopt
, status
); // status checked in constructor 
  82     if (U_FAILURE(status
)) {return;} 
  84         status 
= U_MEMORY_ALLOCATION_ERROR
; 
  91 //  Construct from precompiled binary rules (tables).  This constructor is public API, 
  92 //  taking the rules as a (const uint8_t *) to match the type produced by getBinaryRules(). 
  94 RuleBasedBreakIterator::RuleBasedBreakIterator(const uint8_t *compiledRules
, 
  98     if (U_FAILURE(status
)) { 
 101     if (compiledRules 
== NULL 
|| ruleLength 
< sizeof(RBBIDataHeader
)) { 
 102         status 
= U_ILLEGAL_ARGUMENT_ERROR
; 
 105     const RBBIDataHeader 
*data 
= (const RBBIDataHeader 
*)compiledRules
; 
 106     if (data
->fLength 
> ruleLength
) { 
 107         status 
= U_ILLEGAL_ARGUMENT_ERROR
; 
 110     fData 
= new RBBIDataWrapper(data
, RBBIDataWrapper::kDontAdopt
, status
);  
 111     if (U_FAILURE(status
)) {return;} 
 113         status 
= U_MEMORY_ALLOCATION_ERROR
; 
 119 //------------------------------------------------------------------------------- 
 121 //   Constructor   from a UDataMemory handle to precompiled break rules 
 122 //                 stored in an ICU data file. 
 124 //------------------------------------------------------------------------------- 
 125 RuleBasedBreakIterator::RuleBasedBreakIterator(UDataMemory
* udm
, UErrorCode 
&status
) 
 128     fData 
= new RBBIDataWrapper(udm
, status
); // status checked in constructor 
 129     if (U_FAILURE(status
)) {return;} 
 131         status 
= U_MEMORY_ALLOCATION_ERROR
; 
 138 //------------------------------------------------------------------------------- 
 140 //   Constructor       from a set of rules supplied as a string. 
 142 //------------------------------------------------------------------------------- 
 143 RuleBasedBreakIterator::RuleBasedBreakIterator( const UnicodeString  
&rules
, 
 144                                                 UParseError          
&parseError
, 
 148     if (U_FAILURE(status
)) {return;} 
 149     RuleBasedBreakIterator 
*bi 
= (RuleBasedBreakIterator 
*) 
 150         RBBIRuleBuilder::createRuleBasedBreakIterator(rules
, &parseError
, status
); 
 151     // Note:  This is a bit awkward.  The RBBI ruleBuilder has a factory method that 
 152     //        creates and returns a complete RBBI.  From here, in a constructor, we 
 153     //        can't just return the object created by the builder factory, hence 
 154     //        the assignment of the factory created object to "this". 
 155     if (U_SUCCESS(status
)) { 
 162 //------------------------------------------------------------------------------- 
 164 // Default Constructor.      Create an empty shell that can be set up later. 
 165 //                           Used when creating a RuleBasedBreakIterator from a set 
 167 //------------------------------------------------------------------------------- 
 168 RuleBasedBreakIterator::RuleBasedBreakIterator() { 
 173 //------------------------------------------------------------------------------- 
 175 //   Copy constructor.  Will produce a break iterator with the same behavior, 
 176 //                      and which iterates over the same text, as the one passed in. 
 178 //------------------------------------------------------------------------------- 
 179 RuleBasedBreakIterator::RuleBasedBreakIterator(const RuleBasedBreakIterator
& other
) 
 180 : BreakIterator(other
) 
 190 RuleBasedBreakIterator::~RuleBasedBreakIterator() { 
 191     if (fCharIter
!=fSCharIter 
&& fCharIter
!=fDCharIter
) { 
 192         // fCharIter was adopted from the outside. 
 204         fData
->removeReference(); 
 207     if (fCachedBreakPositions
) { 
 208         uprv_free(fCachedBreakPositions
); 
 209         fCachedBreakPositions 
= NULL
; 
 211     if (fLanguageBreakEngines
) { 
 212         delete fLanguageBreakEngines
; 
 213         fLanguageBreakEngines 
= NULL
; 
 215     if (fUnhandledBreakEngine
) { 
 216         delete fUnhandledBreakEngine
; 
 217         fUnhandledBreakEngine 
= NULL
; 
 222  * Assignment operator.  Sets this iterator to have the same behavior, 
 223  * and iterate over the same text, as the one passed in. 
 225 RuleBasedBreakIterator
& 
 226 RuleBasedBreakIterator::operator=(const RuleBasedBreakIterator
& that
) { 
 230     reset();    // Delete break cache information 
 231     fBreakType 
= that
.fBreakType
; 
 232     if (fLanguageBreakEngines 
!= NULL
) { 
 233         delete fLanguageBreakEngines
; 
 234         fLanguageBreakEngines 
= NULL
;   // Just rebuild for now 
 236     // TODO: clone fLanguageBreakEngines from "that" 
 237     UErrorCode status 
= U_ZERO_ERROR
; 
 238     fText 
= utext_clone(fText
, that
.fText
, FALSE
, TRUE
, &status
); 
 240     if (fCharIter
!=fSCharIter 
&& fCharIter
!=fDCharIter
) { 
 245     if (that
.fCharIter 
!= NULL 
) { 
 246         // This is a little bit tricky - it will initially appear that 
 247         //  this->fCharIter is adopted, even if that->fCharIter was 
 248         //  not adopted.  That's ok. 
 249         fCharIter 
= that
.fCharIter
->clone(); 
 253         fData
->removeReference(); 
 256     if (that
.fData 
!= NULL
) { 
 257         fData 
= that
.fData
->addReference(); 
 265 //----------------------------------------------------------------------------- 
 267 //    init()      Shared initialization routine.   Used by all the constructors. 
 268 //                Initializes all fields, leaving the object in a consistent state. 
 270 //----------------------------------------------------------------------------- 
 271 void RuleBasedBreakIterator::init() { 
 272     UErrorCode  status    
= U_ZERO_ERROR
; 
 273     fBufferClone          
= FALSE
; 
 274     fText                 
= utext_openUChars(NULL
, NULL
, 0, &status
); 
 279     fLastRuleStatusIndex  
= 0; 
 280     fLastStatusIndexValid 
= TRUE
; 
 281     fDictionaryCharCount  
= 0; 
 282     fBreakType            
= UBRK_WORD
;  // Defaulting BreakType to word gives reasonable 
 283                                         //   dictionary behavior for Break Iterators that are 
 284                                         //   built from rules.  Even better would be the ability to 
 285                                         //   declare the type in the rules. 
 287     fCachedBreakPositions    
= NULL
; 
 288     fLanguageBreakEngines    
= NULL
; 
 289     fUnhandledBreakEngine    
= NULL
; 
 290     fNumCachedBreakPositions 
= 0; 
 291     fPositionInCache         
= 0; 
 294     static UBool debugInitDone 
= FALSE
; 
 295     if (debugInitDone 
== FALSE
) { 
 296         char *debugEnv 
= getenv("U_RBBIDEBUG"); 
 297         if (debugEnv 
&& uprv_strstr(debugEnv
, "trace")) { 
 300         debugInitDone 
= TRUE
; 
 307 //----------------------------------------------------------------------------- 
 309 //    clone - Returns a newly-constructed RuleBasedBreakIterator with the same 
 310 //            behavior, and iterating over the same text, as this one. 
 311 //            Virtual function: does the right thing with subclasses. 
 313 //----------------------------------------------------------------------------- 
 315 RuleBasedBreakIterator::clone(void) const { 
 316     return new RuleBasedBreakIterator(*this); 
 320  * Equality operator.  Returns TRUE if both BreakIterators are of the 
 321  * same class, have the same behavior, and iterate over the same text. 
 324 RuleBasedBreakIterator::operator==(const BreakIterator
& that
) const { 
 325     if (typeid(*this) != typeid(that
)) { 
 329     const RuleBasedBreakIterator
& that2 
= (const RuleBasedBreakIterator
&) that
; 
 331     if (!utext_equals(fText
, that2
.fText
)) { 
 332         // The two break iterators are operating on different text, 
 333         //   or have a different iteration position. 
 337     // TODO:  need a check for when in a dictionary region at different offsets. 
 339     if (that2
.fData 
== fData 
|| 
 340         (fData 
!= NULL 
&& that2
.fData 
!= NULL 
&& *that2
.fData 
== *fData
)) { 
 341             // The two break iterators are using the same rules. 
 348  * Compute a hash code for this BreakIterator 
 349  * @return A hash code 
 352 RuleBasedBreakIterator::hashCode(void) const { 
 355         hash 
= fData
->hashCode(); 
 361 void RuleBasedBreakIterator::setText(UText 
*ut
, UErrorCode 
&status
) { 
 362     if (U_FAILURE(status
)) { 
 366     fText 
= utext_clone(fText
, ut
, FALSE
, TRUE
, &status
); 
 368     // Set up a dummy CharacterIterator to be returned if anyone 
 369     //   calls getText().  With input from UText, there is no reasonable 
 370     //   way to return a characterIterator over the actual input text. 
 371     //   Return one over an empty string instead - this is the closest 
 372     //   we can come to signaling a failure. 
 373     //   (GetText() is obsolete, this failure is sort of OK) 
 374     if (fDCharIter 
== NULL
) { 
 375         static const UChar c 
= 0; 
 376         fDCharIter 
= new UCharCharacterIterator(&c
, 0); 
 377         if (fDCharIter 
== NULL
) { 
 378             status 
= U_MEMORY_ALLOCATION_ERROR
; 
 383     if (fCharIter
!=fSCharIter 
&& fCharIter
!=fDCharIter
) { 
 384         // existing fCharIter was adopted from the outside.  Delete it now. 
 387     fCharIter 
= fDCharIter
; 
 393 UText 
*RuleBasedBreakIterator::getUText(UText 
*fillIn
, UErrorCode 
&status
) const { 
 394     UText 
*result 
= utext_clone(fillIn
, fText
, FALSE
, TRUE
, &status
);   
 401  * Returns the description used to create this iterator 
 404 RuleBasedBreakIterator::getRules() const { 
 406         return fData
->getRuleSourceString(); 
 408         static const UnicodeString 
*s
; 
 410             // TODO:  something more elegant here. 
 411             //        perhaps API should return the string by value. 
 412             //        Note:  thread unsafe init & leak are semi-ok, better than 
 413             //               what was before.  Should be cleaned up, though. 
 414             s 
= new UnicodeString
; 
 420 //======================================================================= 
 421 // BreakIterator overrides 
 422 //======================================================================= 
 425  * Return a CharacterIterator over the text being analyzed.   
 428 RuleBasedBreakIterator::getText() const { 
 433  * Set the iterator to analyze a new piece of text.  This function resets 
 434  * the current iteration position to the beginning of the text. 
 435  * @param newText An iterator over the text to analyze. 
 438 RuleBasedBreakIterator::adoptText(CharacterIterator
* newText
) { 
 439     // If we are holding a CharacterIterator adopted from a  
 440     //   previous call to this function, delete it now. 
 441     if (fCharIter
!=fSCharIter 
&& fCharIter
!=fDCharIter
) { 
 446     UErrorCode status 
= U_ZERO_ERROR
; 
 448     if (newText
==NULL 
|| newText
->startIndex() != 0) {    
 449         // startIndex !=0 wants to be an error, but there's no way to report it. 
 450         // Make the iterator text be an empty string. 
 451         fText 
= utext_openUChars(fText
, NULL
, 0, &status
); 
 453         fText 
= utext_openCharacterIterator(fText
, newText
, &status
); 
 459  * Set the iterator to analyze a new piece of text.  This function resets 
 460  * the current iteration position to the beginning of the text. 
 461  * @param newText The string containing the text to analyze. 
 464 RuleBasedBreakIterator::setText(const UnicodeString
& newText
) { 
 465     UErrorCode status 
= U_ZERO_ERROR
; 
 467     fText 
= utext_openConstUnicodeString(fText
, &newText
, &status
); 
 469     // Set up a character iterator on the string.   
 470     //   Needed in case someone calls getText(). 
 471     //  Can not, unfortunately, do this lazily on the (probably never) 
 472     //  call to getText(), because getText is const. 
 473     if (fSCharIter 
== NULL
) { 
 474         fSCharIter 
= new StringCharacterIterator(newText
); 
 476         fSCharIter
->setText(newText
); 
 479     if (fCharIter
!=fSCharIter 
&& fCharIter
!=fDCharIter
) { 
 480         // old fCharIter was adopted from the outside.  Delete it. 
 483     fCharIter 
= fSCharIter
; 
 490  *  Provide a new UText for the input text.  Must reference text with contents identical 
 492  *  Intended for use with text data originating in Java (garbage collected) environments 
 493  *  where the data may be moved in memory at arbitrary times. 
 495 RuleBasedBreakIterator 
&RuleBasedBreakIterator::refreshInputText(UText 
*input
, UErrorCode 
&status
) { 
 496     if (U_FAILURE(status
)) { 
 500         status 
= U_ILLEGAL_ARGUMENT_ERROR
; 
 503     int64_t pos 
= utext_getNativeIndex(fText
); 
 504     //  Shallow read-only clone of the new UText into the existing input UText 
 505     fText 
= utext_clone(fText
, input
, FALSE
, TRUE
, &status
); 
 506     if (U_FAILURE(status
)) { 
 509     utext_setNativeIndex(fText
, pos
); 
 510     if (utext_getNativeIndex(fText
) != pos
) { 
 511         // Sanity check.  The new input utext is supposed to have the exact same 
 512         // contents as the old.  If we can't set to the same position, it doesn't. 
 513         // The contents underlying the old utext might be invalid at this point, 
 514         // so it's not safe to check directly. 
 515         status 
= U_ILLEGAL_ARGUMENT_ERROR
; 
 522  * Sets the current iteration position to the beginning of the text. 
 523  * @return The offset of the beginning of the text. 
 525 int32_t RuleBasedBreakIterator::first(void) { 
 527     fLastRuleStatusIndex  
= 0; 
 528     fLastStatusIndexValid 
= TRUE
; 
 530     //    return BreakIterator::DONE; 
 532     utext_setNativeIndex(fText
, 0); 
 537  * Sets the current iteration position to the end of the text. 
 538  * @return The text's past-the-end offset. 
 540 int32_t RuleBasedBreakIterator::last(void) { 
 543         fLastRuleStatusIndex  
= 0; 
 544         fLastStatusIndexValid 
= TRUE
; 
 545         return BreakIterator::DONE
; 
 548     fLastStatusIndexValid 
= FALSE
; 
 549     int32_t pos 
= (int32_t)utext_nativeLength(fText
); 
 550     utext_setNativeIndex(fText
, pos
); 
 555  * Advances the iterator either forward or backward the specified number of steps. 
 556  * Negative values move backward, and positive values move forward.  This is 
 557  * equivalent to repeatedly calling next() or previous(). 
 558  * @param n The number of steps to move.  The sign indicates the direction 
 559  * (negative is backwards, and positive is forwards). 
 560  * @return The character offset of the boundary position n boundaries away from 
 563 int32_t RuleBasedBreakIterator::next(int32_t n
) { 
 564     int32_t result 
= current(); 
 577  * Advances the iterator to the next boundary position. 
 578  * @return The position of the first boundary after this one. 
 580 int32_t RuleBasedBreakIterator::next(void) { 
 581     // if we have cached break positions and we're still in the range 
 582     // covered by them, just move one step forward in the cache 
 583     if (fCachedBreakPositions 
!= NULL
) { 
 584         if (fPositionInCache 
< fNumCachedBreakPositions 
- 1) { 
 586             int32_t pos 
= fCachedBreakPositions
[fPositionInCache
]; 
 587             utext_setNativeIndex(fText
, pos
); 
 595     int32_t startPos 
= current(); 
 596     int32_t result 
= handleNext(fData
->fForwardTable
); 
 597     if (fDictionaryCharCount 
> 0) { 
 598         result 
= checkDictionary(startPos
, result
, FALSE
); 
 604  * Advances the iterator backwards, to the last boundary preceding this one. 
 605  * @return The position of the last boundary position preceding this one. 
 607 int32_t RuleBasedBreakIterator::previous(void) { 
 611     // if we have cached break positions and we're still in the range 
 612     // covered by them, just move one step backward in the cache 
 613     if (fCachedBreakPositions 
!= NULL
) { 
 614         if (fPositionInCache 
> 0) { 
 616             // If we're at the beginning of the cache, need to reevaluate the 
 618             if (fPositionInCache 
<= 0) { 
 619                 fLastStatusIndexValid 
= FALSE
; 
 621             int32_t pos 
= fCachedBreakPositions
[fPositionInCache
]; 
 622             utext_setNativeIndex(fText
, pos
); 
 630     // if we're already sitting at the beginning of the text, return DONE 
 631     if (fText 
== NULL 
|| (startPos 
= current()) == 0) { 
 632         fLastRuleStatusIndex  
= 0; 
 633         fLastStatusIndexValid 
= TRUE
; 
 634         return BreakIterator::DONE
; 
 637     if (fData
->fSafeRevTable 
!= NULL 
|| fData
->fSafeFwdTable 
!= NULL
) { 
 638         result 
= handlePrevious(fData
->fReverseTable
); 
 639         if (fDictionaryCharCount 
> 0) { 
 640             result 
= checkDictionary(result
, startPos
, TRUE
); 
 646     // set things up.  handlePrevious() will back us up to some valid 
 647     // break position before the current position (we back our internal 
 648     // iterator up one step to prevent handlePrevious() from returning 
 649     // the current position), but not necessarily the last one before 
 653     int32_t start 
= current(); 
 655     (void)UTEXT_PREVIOUS32(fText
); 
 656     int32_t lastResult    
= handlePrevious(fData
->fReverseTable
); 
 657     if (lastResult 
== UBRK_DONE
) { 
 659         utext_setNativeIndex(fText
, 0); 
 663     UBool   breakTagValid 
= FALSE
; 
 665     // iterate forward from the known break position until we pass our 
 666     // starting point.  The last break position before the starting 
 667     // point is our return value 
 671         if (result 
== BreakIterator::DONE 
|| result 
>= start
) { 
 675         lastTag        
= fLastRuleStatusIndex
; 
 676         breakTagValid  
= TRUE
; 
 679     // fLastBreakTag wants to have the value for section of text preceding 
 680     // the result position that we are to return (in lastResult.)  If 
 681     // the backwards rules overshot and the above loop had to do two or more 
 682     // next()s to move up to the desired return position, we will have a valid 
 683     // tag value. But, if handlePrevious() took us to exactly the correct result position, 
 684     // we won't have a tag value for that position, which is only set by handleNext(). 
 686     // set the current iteration position to be the last break position 
 687     // before where we started, and then return that value 
 688     utext_setNativeIndex(fText
, lastResult
); 
 689     fLastRuleStatusIndex  
= lastTag
;       // for use by getRuleStatus() 
 690     fLastStatusIndexValid 
= breakTagValid
; 
 692     // No need to check the dictionary; it will have been handled by 
 699  * Sets the iterator to refer to the first boundary position following 
 700  * the specified position. 
 701  * @param offset The position from which to begin searching for a break position. 
 702  * @return The position of the first break after the current position. 
 704 int32_t RuleBasedBreakIterator::following(int32_t offset
) { 
 705     // if we have cached break positions and offset is in the range 
 706     // covered by them, use them 
 707     // TODO: could use binary search 
 708     // TODO: what if offset is outside range, but break is not? 
 709     if (fCachedBreakPositions 
!= NULL
) { 
 710         if (offset 
>= fCachedBreakPositions
[0] 
 711                 && offset 
< fCachedBreakPositions
[fNumCachedBreakPositions 
- 1]) { 
 712             fPositionInCache 
= 0; 
 713             // We are guaranteed not to leave the array due to range test above 
 714             while (offset 
>= fCachedBreakPositions
[fPositionInCache
]) { 
 717             int32_t pos 
= fCachedBreakPositions
[fPositionInCache
]; 
 718             utext_setNativeIndex(fText
, pos
); 
 726     // if the offset passed in is already past the end of the text, 
 727     // just return DONE; if it's before the beginning, return the 
 728     // text's starting offset 
 729     fLastRuleStatusIndex  
= 0; 
 730     fLastStatusIndexValid 
= TRUE
; 
 731     if (fText 
== NULL 
|| offset 
>= utext_nativeLength(fText
)) { 
 735     else if (offset 
< 0) { 
 739     // otherwise, set our internal iteration position (temporarily) 
 740     // to the position passed in.  If this is the _beginning_ position, 
 741     // then we can just use next() to get our return value 
 745     if (fData
->fSafeRevTable 
!= NULL
) { 
 747         utext_setNativeIndex(fText
, offset
); 
 748         // move forward one codepoint to prepare for moving back to a 
 750         // this handles offset being between a supplementary character 
 751         (void)UTEXT_NEXT32(fText
); 
 752         // handlePrevious will move most of the time to < 1 boundary away 
 753         handlePrevious(fData
->fSafeRevTable
); 
 754         int32_t result 
= next(); 
 755         while (result 
<= offset
) { 
 760     if (fData
->fSafeFwdTable 
!= NULL
) { 
 761         // backup plan if reverse safe table is not available 
 762         utext_setNativeIndex(fText
, offset
); 
 763         (void)UTEXT_PREVIOUS32(fText
); 
 764         // handle next will give result >= offset 
 765         handleNext(fData
->fSafeFwdTable
); 
 766         // previous will give result 0 or 1 boundary away from offset, 
 769         int32_t oldresult 
= previous(); 
 770         while (oldresult 
> offset
) { 
 771             int32_t result 
= previous(); 
 772             if (result 
<= offset
) { 
 777         int32_t result 
= next(); 
 778         if (result 
<= offset
) { 
 783     // otherwise, we have to sync up first.  Use handlePrevious() to back 
 784     // up to a known break position before the specified position (if 
 785     // we can determine that the specified position is a break position, 
 786     // we don't back up at all).  This may or may not be the last break 
 787     // position at or before our starting position.  Advance forward 
 788     // from here until we've passed the starting position.  The position 
 789     // we stop on will be the first break position after the specified one. 
 792     utext_setNativeIndex(fText
, offset
); 
 794         (offset
==1  && utext_getNativeIndex(fText
)==0)) { 
 799     while (result 
!= BreakIterator::DONE 
&& result 
<= offset
) { 
 807  * Sets the iterator to refer to the last boundary position before the 
 808  * specified position. 
 809  * @param offset The position to begin searching for a break from. 
 810  * @return The position of the last boundary before the starting position. 
 812 int32_t RuleBasedBreakIterator::preceding(int32_t offset
) { 
 813     // if we have cached break positions and offset is in the range 
 814     // covered by them, use them 
 815     if (fCachedBreakPositions 
!= NULL
) { 
 816         // TODO: binary search? 
 817         // TODO: What if offset is outside range, but break is not? 
 818         if (offset 
> fCachedBreakPositions
[0] 
 819                 && offset 
<= fCachedBreakPositions
[fNumCachedBreakPositions 
- 1]) { 
 820             fPositionInCache 
= 0; 
 821             while (fPositionInCache 
< fNumCachedBreakPositions
 
 822                    && offset 
> fCachedBreakPositions
[fPositionInCache
]) 
 825             // If we're at the beginning of the cache, need to reevaluate the 
 827             if (fPositionInCache 
<= 0) { 
 828                 fLastStatusIndexValid 
= FALSE
; 
 830             utext_setNativeIndex(fText
, fCachedBreakPositions
[fPositionInCache
]); 
 831             return fCachedBreakPositions
[fPositionInCache
]; 
 838     // if the offset passed in is already past the end of the text, 
 839     // just return DONE; if it's before the beginning, return the 
 840     // text's starting offset 
 841     if (fText 
== NULL 
|| offset 
> utext_nativeLength(fText
)) { 
 842         // return BreakIterator::DONE; 
 845     else if (offset 
< 0) { 
 849     // if we start by updating the current iteration position to the 
 850     // position specified by the caller, we can just use previous() 
 851     // to carry out this operation 
 853     if (fData
->fSafeFwdTable 
!= NULL
) { 
 855         utext_setNativeIndex(fText
, offset
); 
 856         int32_t newOffset 
= (int32_t)UTEXT_GETNATIVEINDEX(fText
); 
 857         if (newOffset 
!= offset
) { 
 858             // Will come here if specified offset was not a code point boundary AND 
 859             //   the underlying implementation is using UText, which snaps any non-code-point-boundary 
 860             //   indices to the containing code point. 
 861             // For BreakIterator::preceding only, these non-code-point indices need to be moved 
 862             //   up to refer to the following codepoint. 
 863             (void)UTEXT_NEXT32(fText
); 
 864             offset 
= (int32_t)UTEXT_GETNATIVEINDEX(fText
); 
 867         // TODO:  (synwee) would it be better to just check for being in the middle of a surrogate pair, 
 868         //        rather than adjusting the position unconditionally? 
 869         //        (Change would interact with safe rules.) 
 870         // TODO:  change RBBI behavior for off-boundary indices to match that of UText? 
 871         //        affects only preceding(), seems cleaner, but is slightly different. 
 872         (void)UTEXT_PREVIOUS32(fText
); 
 873         handleNext(fData
->fSafeFwdTable
); 
 874         int32_t result 
= (int32_t)UTEXT_GETNATIVEINDEX(fText
); 
 875         while (result 
>= offset
) { 
 880     if (fData
->fSafeRevTable 
!= NULL
) { 
 881         // backup plan if forward safe table is not available 
 882         //  TODO:  check whether this path can be discarded 
 883         //         It's probably OK to say that rules must supply both safe tables 
 884         //            if they use safe tables at all.  We have certainly never described 
 885         //            to anyone how to work with just one safe table. 
 886         utext_setNativeIndex(fText
, offset
); 
 887         (void)UTEXT_NEXT32(fText
); 
 889         // handle previous will give result <= offset 
 890         handlePrevious(fData
->fSafeRevTable
); 
 892         // next will give result 0 or 1 boundary away from offset, 
 895         int32_t oldresult 
= next(); 
 896         while (oldresult 
< offset
) { 
 897             int32_t result 
= next(); 
 898             if (result 
>= offset
) { 
 903         int32_t result 
= previous(); 
 904         if (result 
>= offset
) { 
 911     utext_setNativeIndex(fText
, offset
); 
 916  * Returns true if the specified position is a boundary position.  As a side 
 917  * effect, leaves the iterator pointing to the first boundary position at 
 919  * @param offset the offset to check. 
 920  * @return True if "offset" is a boundary position. 
 922 UBool 
RuleBasedBreakIterator::isBoundary(int32_t offset
) { 
 923     // the beginning index of the iterator is always a boundary position by definition 
 925         first();       // For side effects on current position, tag values. 
 929     if (offset 
== (int32_t)utext_nativeLength(fText
)) { 
 930         last();       // For side effects on current position, tag values. 
 934     // out-of-range indexes are never boundary positions 
 936         first();       // For side effects on current position, tag values. 
 940     if (offset 
> utext_nativeLength(fText
)) { 
 941         last();        // For side effects on current position, tag values. 
 945     // otherwise, we can use following() on the position before the specified 
 946     // one and return true if the position we get back is the one the user 
 948     utext_previous32From(fText
, offset
); 
 949     int32_t backOne 
= (int32_t)UTEXT_GETNATIVEINDEX(fText
); 
 950     UBool    result  
= following(backOne
) == offset
; 
 955  * Returns the current iteration position. 
 956  * @return The current iteration position. 
 958 int32_t RuleBasedBreakIterator::current(void) const { 
 959     int32_t  pos 
= (int32_t)UTEXT_GETNATIVEINDEX(fText
); 
 963 //======================================================================= 
 965 //======================================================================= 
 968 // RBBIRunMode  -  the state machine runs an extra iteration at the beginning and end 
 969 //                 of user text.  A variable with this enum type keeps track of where we 
 970 //                 are.  The state machine only fetches user input while in the RUN mode. 
 973     RBBI_START
,     // state machine processing is before first char of input 
 974     RBBI_RUN
,       // state machine processing is in the user text 
 975     RBBI_END        
// state machine processing is after end of user text. 
 979 //----------------------------------------------------------------------------------- 
 981 //  handleNext(stateTable) 
 982 //     This method is the actual implementation of the rbbi next() method.  
 983 //     This method initializes the state machine to state 1 
 984 //     and advances through the text character by character until we reach the end 
 985 //     of the text or the state machine transitions to state 0.  We update our return 
 986 //     value every time the state machine passes through an accepting state. 
 988 //----------------------------------------------------------------------------------- 
 989 int32_t RuleBasedBreakIterator::handleNext(const RBBIStateTable 
*statetable
) { 
 991     uint16_t            category        
= 0; 
 994     RBBIStateTableRow  
*row
; 
 996     int32_t             lookaheadStatus 
= 0; 
 997     int32_t             lookaheadTagIdx 
= 0; 
 999     int32_t             initialPosition 
= 0; 
1000     int32_t             lookaheadResult 
= 0; 
1001     UBool               lookAheadHardBreak 
= (statetable
->fFlags 
& RBBI_LOOKAHEAD_HARD_BREAK
) != 0; 
1002     const char         *tableData       
= statetable
->fTableData
; 
1003     uint32_t            tableRowLen     
= statetable
->fRowLen
; 
1007             RBBIDebugPuts("Handle Next   pos   char  state category"); 
1011     // No matter what, handleNext alway correctly sets the break tag value. 
1012     fLastStatusIndexValid 
= TRUE
; 
1013     fLastRuleStatusIndex 
= 0; 
1015     // if we're already at the end of the text, return DONE. 
1016     initialPosition 
= (int32_t)UTEXT_GETNATIVEINDEX(fText
);  
1017     result          
= initialPosition
; 
1018     c               
= UTEXT_NEXT32(fText
); 
1019     if (fData 
== NULL 
|| c
==U_SENTINEL
) { 
1020         return BreakIterator::DONE
; 
1023     //  Set the initial state for the state machine 
1024     state 
= START_STATE
; 
1025     row 
= (RBBIStateTableRow 
*) 
1026             //(statetable->fTableData + (statetable->fRowLen * state)); 
1027             (tableData 
+ tableRowLen 
* state
); 
1031     if (statetable
->fFlags 
& RBBI_BOF_REQUIRED
) { 
1037     // loop until we reach the end of the text or transition to state 0 
1040         if (c 
== U_SENTINEL
) { 
1041             // Reached end of input string. 
1042             if (mode 
== RBBI_END
) { 
1043                 // We have already run the loop one last time with the  
1044                 //   character set to the psueudo {eof} value.  Now it is time 
1045                 //   to unconditionally bail out. 
1046                 if (lookaheadResult 
> result
) { 
1047                     // We ran off the end of the string with a pending look-ahead match. 
1048                     // Treat this as if the look-ahead condition had been met, and return 
1049                     //  the match at the / position from the look-ahead rule. 
1050                     result               
= lookaheadResult
; 
1051                     fLastRuleStatusIndex 
= lookaheadTagIdx
; 
1052                     lookaheadStatus 
= 0; 
1056             // Run the loop one last time with the fake end-of-input character category. 
1062         // Get the char category.  An incoming category of 1 or 2 means that 
1063         //      we are preset for doing the beginning or end of input, and 
1064         //      that we shouldn't get a category from an actual text input character. 
1066         if (mode 
== RBBI_RUN
) { 
1067             // look up the current character's character category, which tells us 
1068             // which column in the state table to look at. 
1069             // Note:  the 16 in UTRIE_GET16 refers to the size of the data being returned, 
1070             //        not the size of the character going in, which is a UChar32. 
1072             UTRIE_GET16(&fData
->fTrie
, c
, category
); 
1074             // Check the dictionary bit in the character's category. 
1075             //    Counter is only used by dictionary based iterators (subclasses). 
1076             //    Chars that need to be handled by a dictionary have a flag bit set 
1077             //    in their category values. 
1079             if ((category 
& 0x4000) != 0)  { 
1080                 fDictionaryCharCount
++; 
1081                 //  And off the dictionary flag bit. 
1082                 category 
&= ~0x4000; 
1088                 RBBIDebugPrintf("             %4ld   ", utext_getNativeIndex(fText
)); 
1089                 if (0x20<=c 
&& c
<0x7f) { 
1090                     RBBIDebugPrintf("\"%c\"  ", c
); 
1092                     RBBIDebugPrintf("%5x  ", c
); 
1094                 RBBIDebugPrintf("%3d  %3d\n", state
, category
); 
1098         // State Transition - move machine to its next state 
1101         // Note: fNextState is defined as uint16_t[2], but we are casting 
1102         // a generated RBBI table to RBBIStateTableRow and some tables 
1103         // actually have more than 2 categories. 
1104         U_ASSERT(category
<fData
->fHeader
->fCatCount
); 
1105         state 
= row
->fNextState
[category
];  /*Not accessing beyond memory*/ 
1106         row 
= (RBBIStateTableRow 
*) 
1107             // (statetable->fTableData + (statetable->fRowLen * state)); 
1108             (tableData 
+ tableRowLen 
* state
); 
1111         if (row
->fAccepting 
== -1) { 
1112             // Match found, common case. 
1113             if (mode 
!= RBBI_START
) { 
1114                 result 
= (int32_t)UTEXT_GETNATIVEINDEX(fText
); 
1116             fLastRuleStatusIndex 
= row
->fTagIdx
;   // Remember the break status (tag) values. 
1119         if (row
->fLookAhead 
!= 0) { 
1120             if (lookaheadStatus 
!= 0 
1121                 && row
->fAccepting 
== lookaheadStatus
) { 
1122                 // Lookahead match is completed.   
1123                 result               
= lookaheadResult
; 
1124                 fLastRuleStatusIndex 
= lookaheadTagIdx
; 
1125                 lookaheadStatus      
= 0; 
1126                 // TODO:  make a standalone hard break in a rule work. 
1127                 if (lookAheadHardBreak
) { 
1128                     UTEXT_SETNATIVEINDEX(fText
, result
); 
1131                 // Look-ahead completed, but other rules may match further.  Continue on 
1132                 //  TODO:  junk this feature?  I don't think it's used anywhwere. 
1136             int32_t  r 
= (int32_t)UTEXT_GETNATIVEINDEX(fText
); 
1137             lookaheadResult 
= r
; 
1138             lookaheadStatus 
= row
->fLookAhead
; 
1139             lookaheadTagIdx 
= row
->fTagIdx
; 
1144         if (row
->fAccepting 
!= 0) { 
1145             // Because this is an accepting state, any in-progress look-ahead match 
1146             //   is no longer relavant.  Clear out the pending lookahead status. 
1147             lookaheadStatus 
= 0;           // clear out any pending look-ahead match. 
1151         if (state 
== STOP_STATE
) { 
1152             // This is the normal exit from the lookup state machine. 
1153             // We have advanced through the string until it is certain that no 
1154             //   longer match is possible, no matter what characters follow. 
1158         // Advance to the next character.   
1159         // If this is a beginning-of-input loop iteration, don't advance 
1160         //    the input position.  The next iteration will be processing the 
1161         //    first real input character. 
1162         if (mode 
== RBBI_RUN
) { 
1163             c 
= UTEXT_NEXT32(fText
); 
1165             if (mode 
== RBBI_START
) { 
1173     // The state machine is done.  Check whether it found a match... 
1175     // If the iterator failed to advance in the match engine, force it ahead by one. 
1176     //   (This really indicates a defect in the break rules.  They should always match 
1177     //    at least one character.) 
1178     if (result 
== initialPosition
) { 
1179         UTEXT_SETNATIVEINDEX(fText
, initialPosition
); 
1180         UTEXT_NEXT32(fText
); 
1181         result 
= (int32_t)UTEXT_GETNATIVEINDEX(fText
); 
1184     // Leave the iterator at our result position. 
1185     UTEXT_SETNATIVEINDEX(fText
, result
); 
1188             RBBIDebugPrintf("result = %d\n\n", result
); 
1196 //----------------------------------------------------------------------------------- 
1200 //      Iterate backwards, according to the logic of the reverse rules. 
1201 //      This version handles the exact style backwards rules. 
1203 //      The logic of this function is very similar to handleNext(), above. 
1205 //----------------------------------------------------------------------------------- 
1206 int32_t RuleBasedBreakIterator::handlePrevious(const RBBIStateTable 
*statetable
) { 
1208     uint16_t            category        
= 0; 
1210     RBBIStateTableRow  
*row
; 
1212     int32_t             lookaheadStatus 
= 0; 
1214     int32_t             initialPosition 
= 0; 
1215     int32_t             lookaheadResult 
= 0; 
1216     UBool               lookAheadHardBreak 
= (statetable
->fFlags 
& RBBI_LOOKAHEAD_HARD_BREAK
) != 0; 
1220             RBBIDebugPuts("Handle Previous   pos   char  state category"); 
1224     // handlePrevious() never gets the rule status. 
1225     // Flag the status as invalid; if the user ever asks for status, we will need 
1226     // to back up, then re-find the break position using handleNext(), which does 
1227     // get the status value. 
1228     fLastStatusIndexValid 
= FALSE
; 
1229     fLastRuleStatusIndex 
= 0; 
1231     // if we're already at the start of the text, return DONE. 
1232     if (fText 
== NULL 
|| fData 
== NULL 
|| UTEXT_GETNATIVEINDEX(fText
)==0) { 
1233         return BreakIterator::DONE
; 
1236     //  Set up the starting char. 
1237     initialPosition 
= (int32_t)UTEXT_GETNATIVEINDEX(fText
); 
1238     result          
= initialPosition
; 
1239     c               
= UTEXT_PREVIOUS32(fText
); 
1241     //  Set the initial state for the state machine 
1242     state 
= START_STATE
; 
1243     row 
= (RBBIStateTableRow 
*) 
1244             (statetable
->fTableData 
+ (statetable
->fRowLen 
* state
)); 
1247     if (statetable
->fFlags 
& RBBI_BOF_REQUIRED
) { 
1253     // loop until we reach the start of the text or transition to state 0 
1256         if (c 
== U_SENTINEL
) { 
1257             // Reached end of input string. 
1258             if (mode 
== RBBI_END
) { 
1259                 // We have already run the loop one last time with the  
1260                 //   character set to the psueudo {eof} value.  Now it is time 
1261                 //   to unconditionally bail out. 
1262                 if (lookaheadResult 
< result
) { 
1263                     // We ran off the end of the string with a pending look-ahead match. 
1264                     // Treat this as if the look-ahead condition had been met, and return 
1265                     //  the match at the / position from the look-ahead rule. 
1266                     result               
= lookaheadResult
; 
1267                     lookaheadStatus 
= 0; 
1268                 } else if (result 
== initialPosition
) { 
1269                     // Ran off start, no match found. 
1270                     // move one index one (towards the start, since we are doing a previous()) 
1271                     UTEXT_SETNATIVEINDEX(fText
, initialPosition
); 
1272                     (void)UTEXT_PREVIOUS32(fText
);   // TODO:  shouldn't be necessary.  We're already at beginning.  Check. 
1276             // Run the loop one last time with the fake end-of-input character category. 
1282         // Get the char category.  An incoming category of 1 or 2 means that 
1283         //      we are preset for doing the beginning or end of input, and 
1284         //      that we shouldn't get a category from an actual text input character. 
1286         if (mode 
== RBBI_RUN
) { 
1287             // look up the current character's character category, which tells us 
1288             // which column in the state table to look at. 
1289             // Note:  the 16 in UTRIE_GET16 refers to the size of the data being returned, 
1290             //        not the size of the character going in, which is a UChar32. 
1292             UTRIE_GET16(&fData
->fTrie
, c
, category
); 
1294             // Check the dictionary bit in the character's category. 
1295             //    Counter is only used by dictionary based iterators (subclasses). 
1296             //    Chars that need to be handled by a dictionary have a flag bit set 
1297             //    in their category values. 
1299             if ((category 
& 0x4000) != 0)  { 
1300                 fDictionaryCharCount
++; 
1301                 //  And off the dictionary flag bit. 
1302                 category 
&= ~0x4000; 
1308                 RBBIDebugPrintf("             %4d   ", (int32_t)utext_getNativeIndex(fText
)); 
1309                 if (0x20<=c 
&& c
<0x7f) { 
1310                     RBBIDebugPrintf("\"%c\"  ", c
); 
1312                     RBBIDebugPrintf("%5x  ", c
); 
1314                 RBBIDebugPrintf("%3d  %3d\n", state
, category
); 
1318         // State Transition - move machine to its next state 
1321         // Note: fNextState is defined as uint16_t[2], but we are casting 
1322         // a generated RBBI table to RBBIStateTableRow and some tables 
1323         // actually have more than 2 categories. 
1324         U_ASSERT(category
<fData
->fHeader
->fCatCount
); 
1325         state 
= row
->fNextState
[category
];  /*Not accessing beyond memory*/ 
1326         row 
= (RBBIStateTableRow 
*) 
1327             (statetable
->fTableData 
+ (statetable
->fRowLen 
* state
)); 
1329         if (row
->fAccepting 
== -1) { 
1330             // Match found, common case. 
1331             result 
= (int32_t)UTEXT_GETNATIVEINDEX(fText
); 
1334         if (row
->fLookAhead 
!= 0) { 
1335             if (lookaheadStatus 
!= 0 
1336                 && row
->fAccepting 
== lookaheadStatus
) { 
1337                 // Lookahead match is completed.   
1338                 result               
= lookaheadResult
; 
1339                 lookaheadStatus      
= 0; 
1340                 // TODO:  make a standalone hard break in a rule work. 
1341                 if (lookAheadHardBreak
) { 
1342                     UTEXT_SETNATIVEINDEX(fText
, result
); 
1345                 // Look-ahead completed, but other rules may match further.  Continue on 
1346                 //  TODO:  junk this feature?  I don't think it's used anywhwere. 
1350             int32_t  r 
= (int32_t)UTEXT_GETNATIVEINDEX(fText
); 
1351             lookaheadResult 
= r
; 
1352             lookaheadStatus 
= row
->fLookAhead
; 
1357         if (row
->fAccepting 
!= 0) { 
1358             // Because this is an accepting state, any in-progress look-ahead match 
1359             //   is no longer relavant.  Clear out the pending lookahead status. 
1360             lookaheadStatus 
= 0;     
1364         if (state 
== STOP_STATE
) { 
1365             // This is the normal exit from the lookup state machine. 
1366             // We have advanced through the string until it is certain that no 
1367             //   longer match is possible, no matter what characters follow. 
1371         // Move (backwards) to the next character to process.   
1372         // If this is a beginning-of-input loop iteration, don't advance 
1373         //    the input position.  The next iteration will be processing the 
1374         //    first real input character. 
1375         if (mode 
== RBBI_RUN
) { 
1376             c 
= UTEXT_PREVIOUS32(fText
); 
1378             if (mode 
== RBBI_START
) { 
1384     // The state machine is done.  Check whether it found a match... 
1386     // If the iterator failed to advance in the match engine, force it ahead by one. 
1387     //   (This really indicates a defect in the break rules.  They should always match 
1388     //    at least one character.) 
1389     if (result 
== initialPosition
) { 
1390         UTEXT_SETNATIVEINDEX(fText
, initialPosition
); 
1391         UTEXT_PREVIOUS32(fText
); 
1392         result 
= (int32_t)UTEXT_GETNATIVEINDEX(fText
); 
1395     // Leave the iterator at our result position. 
1396     UTEXT_SETNATIVEINDEX(fText
, result
); 
1399             RBBIDebugPrintf("result = %d\n\n", result
); 
1407 RuleBasedBreakIterator::reset() 
1409     if (fCachedBreakPositions
) { 
1410         uprv_free(fCachedBreakPositions
); 
1412     fCachedBreakPositions 
= NULL
; 
1413     fNumCachedBreakPositions 
= 0; 
1414     fDictionaryCharCount 
= 0; 
1415     fPositionInCache 
= 0; 
1420 //------------------------------------------------------------------------------- 
1422 //   getRuleStatus()   Return the break rule tag associated with the current 
1423 //                     iterator position.  If the iterator arrived at its current 
1424 //                     position by iterating forwards, the value will have been 
1425 //                     cached by the handleNext() function. 
1427 //                     If no cached status value is available, the status is 
1428 //                     found by doing a previous() followed by a next(), which 
1429 //                     leaves the iterator where it started, and computes the 
1430 //                     status while doing the next(). 
1432 //------------------------------------------------------------------------------- 
1433 void RuleBasedBreakIterator::makeRuleStatusValid() { 
1434     if (fLastStatusIndexValid 
== FALSE
) { 
1435         //  No cached status is available. 
1436         if (fText 
== NULL 
|| current() == 0) { 
1437             //  At start of text, or there is no text.  Status is always zero. 
1438             fLastRuleStatusIndex 
= 0; 
1439             fLastStatusIndexValid 
= TRUE
; 
1441             //  Not at start of text.  Find status the tedious way. 
1442             int32_t pa 
= current(); 
1444             if (fNumCachedBreakPositions 
> 0) { 
1445                 reset();                // Blow off the dictionary cache 
1447             int32_t pb 
= next(); 
1449                 // note: the if (pa != pb) test is here only to eliminate warnings for 
1450                 //       unused local variables on gcc.  Logically, it isn't needed. 
1455     U_ASSERT(fLastRuleStatusIndex 
>= 0  &&  fLastRuleStatusIndex 
< fData
->fStatusMaxIdx
); 
1459 int32_t  RuleBasedBreakIterator::getRuleStatus() const { 
1460     RuleBasedBreakIterator 
*nonConstThis  
= (RuleBasedBreakIterator 
*)this; 
1461     nonConstThis
->makeRuleStatusValid(); 
1463     // fLastRuleStatusIndex indexes to the start of the appropriate status record 
1464     //                                                 (the number of status values.) 
1465     //   This function returns the last (largest) of the array of status values. 
1466     int32_t  idx 
= fLastRuleStatusIndex 
+ fData
->fRuleStatusTable
[fLastRuleStatusIndex
]; 
1467     int32_t  tagVal 
= fData
->fRuleStatusTable
[idx
]; 
1475 int32_t RuleBasedBreakIterator::getRuleStatusVec( 
1476              int32_t *fillInVec
, int32_t capacity
, UErrorCode 
&status
) 
1478     if (U_FAILURE(status
)) { 
1482     RuleBasedBreakIterator 
*nonConstThis  
= (RuleBasedBreakIterator 
*)this; 
1483     nonConstThis
->makeRuleStatusValid(); 
1484     int32_t  numVals 
= fData
->fRuleStatusTable
[fLastRuleStatusIndex
]; 
1485     int32_t  numValsToCopy 
= numVals
; 
1486     if (numVals 
> capacity
) { 
1487         status 
= U_BUFFER_OVERFLOW_ERROR
; 
1488         numValsToCopy 
= capacity
; 
1491     for (i
=0; i
<numValsToCopy
; i
++) { 
1492         fillInVec
[i
] = fData
->fRuleStatusTable
[fLastRuleStatusIndex 
+ i 
+ 1]; 
1499 //------------------------------------------------------------------------------- 
1501 //   getBinaryRules        Access to the compiled form of the rules, 
1502 //                         for use by build system tools that save the data 
1503 //                         for standard iterator types. 
1505 //------------------------------------------------------------------------------- 
1506 const uint8_t  *RuleBasedBreakIterator::getBinaryRules(uint32_t &length
) { 
1507     const uint8_t  *retPtr 
= NULL
; 
1510     if (fData 
!= NULL
) { 
1511         retPtr 
= (const uint8_t *)fData
->fHeader
; 
1512         length 
= fData
->fHeader
->fLength
; 
1520 //------------------------------------------------------------------------------- 
1522 //  BufferClone       TODO:  In my (Andy) opinion, this function should be deprecated. 
1523 //                    Saving one heap allocation isn't worth the trouble. 
1524 //                    Cloning shouldn't be done in tight loops, and 
1525 //                    making the clone copy involves other heap operations anyway. 
1526 //                    And the application code for correctly dealing with buffer 
1527 //                    size problems and the eventual object destruction is ugly. 
1529 //------------------------------------------------------------------------------- 
1530 BreakIterator 
*  RuleBasedBreakIterator::createBufferClone(void *stackBuffer
, 
1531                                    int32_t &bufferSize
, 
1534     if (U_FAILURE(status
)){ 
1539     //  If user buffer size is zero this is a preflight operation to 
1540     //    obtain the needed buffer size, allowing for worst case misalignment. 
1542     if (bufferSize 
== 0) { 
1543         bufferSize 
= sizeof(RuleBasedBreakIterator
) + U_ALIGNMENT_OFFSET_UP(0); 
1549     //  Check the alignment and size of the user supplied buffer. 
1550     //  Allocate heap memory if the user supplied memory is insufficient. 
1552     char    *buf   
= (char *)stackBuffer
; 
1553     uint32_t s      
= bufferSize
; 
1555     if (stackBuffer 
== NULL
) { 
1556         s 
= 0;   // Ignore size, force allocation if user didn't give us a buffer. 
1558     if (U_ALIGNMENT_OFFSET(stackBuffer
) != 0) { 
1559         uint32_t offsetUp 
= (uint32_t)U_ALIGNMENT_OFFSET_UP(buf
); 
1563     if (s 
< sizeof(RuleBasedBreakIterator
)) { 
1564         // Not enough room in the caller-supplied buffer. 
1565         // Do a plain-vanilla heap based clone and return that, along with 
1566         //   a warning that the clone was allocated. 
1567         RuleBasedBreakIterator 
*clonedBI 
= new RuleBasedBreakIterator(*this); 
1568         if (clonedBI 
== 0) { 
1569             status 
= U_MEMORY_ALLOCATION_ERROR
; 
1571             status 
= U_SAFECLONE_ALLOCATED_WARNING
; 
1577     //  Clone the source BI into the caller-supplied buffer. 
1579     RuleBasedBreakIterator 
*clone 
= new(buf
) RuleBasedBreakIterator(*this); 
1580     clone
->fBufferClone 
= TRUE
;   // Flag to prevent deleting storage on close (From C code) 
1586 //------------------------------------------------------------------------------- 
1588 //  isDictionaryChar      Return true if the category lookup for this char 
1589 //                        indicates that it is in the set of dictionary lookup 
1592 //                        This function is intended for use by dictionary based 
1595 //------------------------------------------------------------------------------- 
1596 /*UBool RuleBasedBreakIterator::isDictionaryChar(UChar32   c) { 
1597     if (fData == NULL) { 
1601     UTRIE_GET16(&fData->fTrie, c, category); 
1602     return (category & 0x4000) != 0; 
1606 //------------------------------------------------------------------------------- 
1608 //  checkDictionary       This function handles all processing of characters in 
1609 //                        the "dictionary" set. It will determine the appropriate 
1610 //                        course of action, and possibly set up a cache in the 
1613 //------------------------------------------------------------------------------- 
1614 int32_t RuleBasedBreakIterator::checkDictionary(int32_t startPos
, 
1617     // Reset the old break cache first. 
1618     uint32_t dictionaryCount 
= fDictionaryCharCount
; 
1621     if (dictionaryCount 
<= 1 || (endPos 
- startPos
) <= 1) { 
1622         return (reverse 
? startPos 
: endPos
); 
1625     // Bug 5532.  The dictionary code will crash if the input text is UTF-8 
1626     //      because native indexes are different from UTF-16 indexes. 
1627     //      Temporary hack: skip dictionary lookup for UTF-8 encoded text. 
1628     //      It wont give the right breaks, but it's better than a crash. 
1630     //      Check the type of the UText by checking its pFuncs field, which 
1631     //      is UText's function dispatch table.  It will be the same for all 
1632     //      UTF-8 UTexts and different for any other UText type. 
1634     //      We have no other type of UText available with non-UTF-16 native indexing. 
1635     //      This whole check will go away once the dictionary code is fixed. 
1636     static const void *utext_utf8Funcs
; 
1637     if (utext_utf8Funcs 
== NULL
) { 
1638         // Cache the UTF-8 UText function pointer value. 
1639         UErrorCode status 
= U_ZERO_ERROR
; 
1640         UText tempUText 
= UTEXT_INITIALIZER
;  
1641         utext_openUTF8(&tempUText
, NULL
, 0, &status
); 
1642         utext_utf8Funcs 
= tempUText
.pFuncs
; 
1643         utext_close(&tempUText
); 
1645     if (fText
->pFuncs 
== utext_utf8Funcs
) { 
1646         return (reverse 
? startPos 
: endPos
); 
1649     // Starting from the starting point, scan towards the proposed result, 
1650     // looking for the first dictionary character (which may be the one 
1651     // we're on, if we're starting in the middle of a range). 
1652     utext_setNativeIndex(fText
, reverse 
? endPos 
: startPos
); 
1654         UTEXT_PREVIOUS32(fText
); 
1657     int32_t rangeStart 
= startPos
; 
1658     int32_t rangeEnd 
= endPos
; 
1662     UErrorCode  status 
= U_ZERO_ERROR
; 
1663     UStack      
breaks(status
); 
1664     int32_t     foundBreakCount 
= 0; 
1665     UChar32     c 
= utext_current32(fText
); 
1667     UTRIE_GET16(&fData
->fTrie
, c
, category
); 
1669     // Is the character we're starting on a dictionary character? If so, we 
1670     // need to back up to include the entire run; otherwise the results of 
1671     // the break algorithm will differ depending on where we start. Since 
1672     // the result is cached and there is typically a non-dictionary break 
1673     // within a small number of words, there should be little performance impact. 
1674     if (category 
& 0x4000) { 
1677                 utext_next32(fText
);          // TODO:  recast to work directly with postincrement. 
1678                 c 
= utext_current32(fText
); 
1679                 UTRIE_GET16(&fData
->fTrie
, c
, category
); 
1680             } while (c 
!= U_SENTINEL 
&& (category 
& 0x4000)); 
1681             // Back up to the last dictionary character 
1682             rangeEnd 
= (int32_t)UTEXT_GETNATIVEINDEX(fText
); 
1683             if (c 
== U_SENTINEL
) { 
1684                 // c = fText->last32(); 
1685                 //   TODO:  why was this if needed? 
1686                 c 
= UTEXT_PREVIOUS32(fText
); 
1689                 c 
= UTEXT_PREVIOUS32(fText
); 
1694                 c 
= UTEXT_PREVIOUS32(fText
); 
1695                 UTRIE_GET16(&fData
->fTrie
, c
, category
); 
1697             while (c 
!= U_SENTINEL 
&& (category 
& 0x4000)); 
1698             // Back up to the last dictionary character 
1699             if (c 
== U_SENTINEL
) { 
1700                 // c = fText->first32(); 
1701                 c 
= utext_current32(fText
); 
1704                 utext_next32(fText
); 
1705                 c 
= utext_current32(fText
); 
1707             rangeStart 
= (int32_t)UTEXT_GETNATIVEINDEX(fText
);; 
1709         UTRIE_GET16(&fData
->fTrie
, c
, category
); 
1712     // Loop through the text, looking for ranges of dictionary characters. 
1713     // For each span, find the appropriate break engine, and ask it to find 
1714     // any breaks within the span. 
1715     // Note: we always do this in the forward direction, so that the break 
1716     // cache is built in the right order. 
1718         utext_setNativeIndex(fText
, rangeStart
); 
1719         c 
= utext_current32(fText
); 
1720         UTRIE_GET16(&fData
->fTrie
, c
, category
); 
1722     while(U_SUCCESS(status
)) { 
1723         while((current 
= (int32_t)UTEXT_GETNATIVEINDEX(fText
)) < rangeEnd 
&& (category 
& 0x4000) == 0) { 
1724             utext_next32(fText
);           // TODO:  tweak for post-increment operation 
1725             c 
= utext_current32(fText
); 
1726             UTRIE_GET16(&fData
->fTrie
, c
, category
); 
1728         if (current 
>= rangeEnd
) { 
1732         // We now have a dictionary character. Get the appropriate language object 
1734         const LanguageBreakEngine 
*lbe 
= getLanguageBreakEngine(c
); 
1736         // Ask the language object if there are any breaks. It will leave the text 
1737         // pointer on the other side of its range, ready to search for the next one. 
1739             foundBreakCount 
+= lbe
->findBreaks(fText
, rangeStart
, rangeEnd
, FALSE
, fBreakType
, breaks
); 
1742         // Reload the loop variables for the next go-round 
1743         c 
= utext_current32(fText
); 
1744         UTRIE_GET16(&fData
->fTrie
, c
, category
); 
1747     // If we found breaks, build a new break cache. The first and last entries must 
1748     // be the original starting and ending position. 
1749     if (foundBreakCount 
> 0) { 
1750         int32_t totalBreaks 
= foundBreakCount
; 
1751         if (startPos 
< breaks
.elementAti(0)) { 
1754         if (endPos 
> breaks
.peeki()) { 
1757         fCachedBreakPositions 
= (int32_t *)uprv_malloc(totalBreaks 
* sizeof(int32_t)); 
1758         if (fCachedBreakPositions 
!= NULL
) { 
1760             fNumCachedBreakPositions 
= totalBreaks
; 
1761             if (startPos 
< breaks
.elementAti(0)) { 
1762                 fCachedBreakPositions
[out
++] = startPos
; 
1764             for (int32_t i 
= 0; i 
< foundBreakCount
; ++i
) { 
1765                 fCachedBreakPositions
[out
++] = breaks
.elementAti(i
); 
1767             if (endPos 
> fCachedBreakPositions
[out
-1]) { 
1768                 fCachedBreakPositions
[out
] = endPos
; 
1770             // If there are breaks, then by definition, we are replacing the original 
1771             // proposed break by one of the breaks we found. Use following() and 
1772             // preceding() to do the work. They should never recurse in this case. 
1774                 return preceding(endPos 
- 1); 
1777                 return following(startPos
); 
1780         // If the allocation failed, just fall through to the "no breaks found" case. 
1783     // If we get here, there were no language-based breaks. Set the text pointer 
1784     // to the original proposed break. 
1785     utext_setNativeIndex(fText
, reverse 
? startPos 
: endPos
); 
1786     return (reverse 
? startPos 
: endPos
); 
1791 // defined in ucln_cmn.h 
1793 static icu::UStack 
*gLanguageBreakFactories 
= NULL
; 
1796  * Release all static memory held by breakiterator.   
1799 static UBool U_CALLCONV 
breakiterator_cleanup_dict(void) { 
1800     if (gLanguageBreakFactories
) { 
1801         delete gLanguageBreakFactories
; 
1802         gLanguageBreakFactories 
= NULL
; 
1809 static void U_CALLCONV 
_deleteFactory(void *obj
) { 
1810     delete (icu::LanguageBreakFactory 
*) obj
; 
1815 static const LanguageBreakEngine
* 
1816 getLanguageBreakEngineFromFactory(UChar32 c
, int32_t breakType
) 
1819     UErrorCode  status 
= U_ZERO_ERROR
; 
1820     UMTX_CHECK(NULL
, (UBool
)(gLanguageBreakFactories 
== NULL
), needsInit
); 
1823         UStack  
*factories 
= new UStack(_deleteFactory
, NULL
, status
); 
1824         if (factories 
!= NULL 
&& U_SUCCESS(status
)) { 
1825             ICULanguageBreakFactory 
*builtIn 
= new ICULanguageBreakFactory(status
); 
1826             factories
->push(builtIn
, status
); 
1827 #ifdef U_LOCAL_SERVICE_HOOK 
1828             LanguageBreakFactory 
*extra 
= (LanguageBreakFactory 
*)uprv_svc_hook("languageBreakFactory", &status
); 
1829             if (extra 
!= NULL
) { 
1830                 factories
->push(extra
, status
); 
1835         if (gLanguageBreakFactories 
== NULL
) { 
1836             gLanguageBreakFactories 
= factories
; 
1838             ucln_common_registerCleanup(UCLN_COMMON_BREAKITERATOR_DICT
, breakiterator_cleanup_dict
); 
1844     if (gLanguageBreakFactories 
== NULL
) { 
1848     int32_t i 
= gLanguageBreakFactories
->size(); 
1849     const LanguageBreakEngine 
*lbe 
= NULL
; 
1851         LanguageBreakFactory 
*factory 
= (LanguageBreakFactory 
*)(gLanguageBreakFactories
->elementAt(i
)); 
1852         lbe 
= factory
->getEngineFor(c
, breakType
); 
1861 //------------------------------------------------------------------------------- 
1863 //  getLanguageBreakEngine  Find an appropriate LanguageBreakEngine for the 
1866 //------------------------------------------------------------------------------- 
1867 const LanguageBreakEngine 
* 
1868 RuleBasedBreakIterator::getLanguageBreakEngine(UChar32 c
) { 
1869     const LanguageBreakEngine 
*lbe 
= NULL
; 
1870     UErrorCode status 
= U_ZERO_ERROR
; 
1872     if (fLanguageBreakEngines 
== NULL
) { 
1873         fLanguageBreakEngines 
= new UStack(status
); 
1874         if (fLanguageBreakEngines 
== NULL 
|| U_FAILURE(status
)) { 
1875             delete fLanguageBreakEngines
; 
1876             fLanguageBreakEngines 
= 0; 
1881     int32_t i 
= fLanguageBreakEngines
->size(); 
1883         lbe 
= (const LanguageBreakEngine 
*)(fLanguageBreakEngines
->elementAt(i
)); 
1884         if (lbe
->handles(c
, fBreakType
)) { 
1889     // No existing dictionary took the character. See if a factory wants to 
1890     // give us a new LanguageBreakEngine for this character. 
1891     lbe 
= getLanguageBreakEngineFromFactory(c
, fBreakType
); 
1893     // If we got one, use it and push it on our stack. 
1895         fLanguageBreakEngines
->push((void *)lbe
, status
); 
1896         // Even if we can't remember it, we can keep looking it up, so 
1897         // return it even if the push fails. 
1901     // No engine is forthcoming for this character. Add it to the 
1902     // reject set. Create the reject break engine if needed. 
1903     if (fUnhandledBreakEngine 
== NULL
) { 
1904         fUnhandledBreakEngine 
= new UnhandledEngine(status
); 
1905         if (U_SUCCESS(status
) && fUnhandledBreakEngine 
== NULL
) { 
1906             status 
= U_MEMORY_ALLOCATION_ERROR
; 
1908         // Put it last so that scripts for which we have an engine get tried 
1910         fLanguageBreakEngines
->insertElementAt(fUnhandledBreakEngine
, 0, status
); 
1911         // If we can't insert it, or creation failed, get rid of it 
1912         if (U_FAILURE(status
)) { 
1913             delete fUnhandledBreakEngine
; 
1914             fUnhandledBreakEngine 
= 0; 
1919     // Tell the reject engine about the character; at its discretion, it may 
1920     // add more than just the one character. 
1921     fUnhandledBreakEngine
->handleCharacter(c
, fBreakType
); 
1923     return fUnhandledBreakEngine
; 
1928 /*int32_t RuleBasedBreakIterator::getBreakType() const { 
1932 void RuleBasedBreakIterator::setBreakType(int32_t type
) { 
1939 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */