2 *************************************************************************** 
   3 *   Copyright (C) 1999-2010 International Business Machines Corporation 
   4 *   and others. All rights reserved. 
   5 *************************************************************************** 
   8 //  file:  rbbi.c    Contains the implementation of the rule based break iterator 
   9 //                   runtime engine and the API implementation for 
  10 //                   class RuleBasedBreakIterator 
  13 #include <typeinfo>  // for 'typeid' to work 
  15 #include "unicode/utypes.h" 
  17 #if !UCONFIG_NO_BREAK_ITERATION 
  19 #include "unicode/rbbi.h" 
  20 #include "unicode/schriter.h" 
  21 #include "unicode/uchriter.h" 
  22 #include "unicode/udata.h" 
  23 #include "unicode/uclean.h" 
  35 // if U_LOCAL_SERVICE_HOOK is defined, then localsvc.cpp is expected to be included. 
  36 #if U_LOCAL_SERVICE_HOOK 
  41 static UBool fTrace 
= FALSE
; 
  46 // The state number of the starting state 
  49 // The state-transition value indicating "stop" 
  53 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedBreakIterator
) 
  56 //======================================================================= 
  58 //======================================================================= 
  61  * Constructs a RuleBasedBreakIterator that uses the already-created 
  62  * tables object that is passed in as a parameter. 
  64 RuleBasedBreakIterator::RuleBasedBreakIterator(RBBIDataHeader
* data
, UErrorCode 
&status
) 
  67     fData 
= new RBBIDataWrapper(data
, status
); // status checked in constructor 
  68     if (U_FAILURE(status
)) {return;} 
  70         status 
= U_MEMORY_ALLOCATION_ERROR
; 
  76  * Same as above but does not adopt memory 
  78 RuleBasedBreakIterator::RuleBasedBreakIterator(const RBBIDataHeader
* data
, enum EDontAdopt
, UErrorCode 
&status
) 
  81     fData 
= new RBBIDataWrapper(data
, RBBIDataWrapper::kDontAdopt
, status
); // status checked in constructor 
  82     if (U_FAILURE(status
)) {return;} 
  84         status 
= U_MEMORY_ALLOCATION_ERROR
; 
  89 //------------------------------------------------------------------------------- 
  91 //   Constructor   from a UDataMemory handle to precompiled break rules 
  92 //                 stored in an ICU data file. 
  94 //------------------------------------------------------------------------------- 
  95 RuleBasedBreakIterator::RuleBasedBreakIterator(UDataMemory
* udm
, UErrorCode 
&status
) 
  98     fData 
= new RBBIDataWrapper(udm
, status
); // status checked in constructor 
  99     if (U_FAILURE(status
)) {return;} 
 101         status 
= U_MEMORY_ALLOCATION_ERROR
; 
 108 //------------------------------------------------------------------------------- 
 110 //   Constructor       from a set of rules supplied as a string. 
 112 //------------------------------------------------------------------------------- 
 113 RuleBasedBreakIterator::RuleBasedBreakIterator( const UnicodeString  
&rules
, 
 114                                                 UParseError          
&parseError
, 
 118     if (U_FAILURE(status
)) {return;} 
 119     RuleBasedBreakIterator 
*bi 
= (RuleBasedBreakIterator 
*) 
 120         RBBIRuleBuilder::createRuleBasedBreakIterator(rules
, &parseError
, status
); 
 121     // Note:  This is a bit awkward.  The RBBI ruleBuilder has a factory method that 
 122     //        creates and returns a complete RBBI.  From here, in a constructor, we 
 123     //        can't just return the object created by the builder factory, hence 
 124     //        the assignment of the factory created object to "this". 
 125     if (U_SUCCESS(status
)) { 
 132 //------------------------------------------------------------------------------- 
 134 // Default Constructor.      Create an empty shell that can be set up later. 
 135 //                           Used when creating a RuleBasedBreakIterator from a set 
 137 //------------------------------------------------------------------------------- 
 138 RuleBasedBreakIterator::RuleBasedBreakIterator() { 
 143 //------------------------------------------------------------------------------- 
 145 //   Copy constructor.  Will produce a break iterator with the same behavior, 
 146 //                      and which iterates over the same text, as the one passed in. 
 148 //------------------------------------------------------------------------------- 
 149 RuleBasedBreakIterator::RuleBasedBreakIterator(const RuleBasedBreakIterator
& other
) 
 150 : BreakIterator(other
) 
 160 RuleBasedBreakIterator::~RuleBasedBreakIterator() { 
 161     if (fCharIter
!=fSCharIter 
&& fCharIter
!=fDCharIter
) { 
 162         // fCharIter was adopted from the outside. 
 174         fData
->removeReference(); 
 177     if (fCachedBreakPositions
) { 
 178         uprv_free(fCachedBreakPositions
); 
 179         fCachedBreakPositions 
= NULL
; 
 181     if (fLanguageBreakEngines
) { 
 182         delete fLanguageBreakEngines
; 
 183         fLanguageBreakEngines 
= NULL
; 
 185     if (fUnhandledBreakEngine
) { 
 186         delete fUnhandledBreakEngine
; 
 187         fUnhandledBreakEngine 
= NULL
; 
 192  * Assignment operator.  Sets this iterator to have the same behavior, 
 193  * and iterate over the same text, as the one passed in. 
 195 RuleBasedBreakIterator
& 
 196 RuleBasedBreakIterator::operator=(const RuleBasedBreakIterator
& that
) { 
 200     reset();    // Delete break cache information 
 201     fBreakType 
= that
.fBreakType
; 
 202     if (fLanguageBreakEngines 
!= NULL
) { 
 203         delete fLanguageBreakEngines
; 
 204         fLanguageBreakEngines 
= NULL
;   // Just rebuild for now 
 206     // TODO: clone fLanguageBreakEngines from "that" 
 207     UErrorCode status 
= U_ZERO_ERROR
; 
 208     fText 
= utext_clone(fText
, that
.fText
, FALSE
, TRUE
, &status
); 
 210     if (fCharIter
!=fSCharIter 
&& fCharIter
!=fDCharIter
) { 
 215     if (that
.fCharIter 
!= NULL 
) { 
 216         // This is a little bit tricky - it will initially appear that 
 217         //  this->fCharIter is adopted, even if that->fCharIter was 
 218         //  not adopted.  That's ok. 
 219         fCharIter 
= that
.fCharIter
->clone(); 
 223         fData
->removeReference(); 
 226     if (that
.fData 
!= NULL
) { 
 227         fData 
= that
.fData
->addReference(); 
 235 //----------------------------------------------------------------------------- 
 237 //    init()      Shared initialization routine.   Used by all the constructors. 
 238 //                Initializes all fields, leaving the object in a consistent state. 
 240 //----------------------------------------------------------------------------- 
 241 void RuleBasedBreakIterator::init() { 
 242     UErrorCode  status    
= U_ZERO_ERROR
; 
 243     fBufferClone          
= FALSE
; 
 244     fText                 
= utext_openUChars(NULL
, NULL
, 0, &status
); 
 249     fLastRuleStatusIndex  
= 0; 
 250     fLastStatusIndexValid 
= TRUE
; 
 251     fDictionaryCharCount  
= 0; 
 252     fBreakType            
= UBRK_WORD
;  // Defaulting BreakType to word gives reasonable 
 253                                         //   dictionary behavior for Break Iterators that are 
 254                                         //   built from rules.  Even better would be the ability to 
 255                                         //   declare the type in the rules. 
 257     fCachedBreakPositions    
= NULL
; 
 258     fLanguageBreakEngines    
= NULL
; 
 259     fUnhandledBreakEngine    
= NULL
; 
 260     fNumCachedBreakPositions 
= 0; 
 261     fPositionInCache         
= 0; 
 264     static UBool debugInitDone 
= FALSE
; 
 265     if (debugInitDone 
== FALSE
) { 
 266         char *debugEnv 
= getenv("U_RBBIDEBUG"); 
 267         if (debugEnv 
&& uprv_strstr(debugEnv
, "trace")) { 
 270         debugInitDone 
= TRUE
; 
 277 //----------------------------------------------------------------------------- 
 279 //    clone - Returns a newly-constructed RuleBasedBreakIterator with the same 
 280 //            behavior, and iterating over the same text, as this one. 
 281 //            Virtual function: does the right thing with subclasses. 
 283 //----------------------------------------------------------------------------- 
 285 RuleBasedBreakIterator::clone(void) const { 
 286     return new RuleBasedBreakIterator(*this); 
 290  * Equality operator.  Returns TRUE if both BreakIterators are of the 
 291  * same class, have the same behavior, and iterate over the same text. 
 294 RuleBasedBreakIterator::operator==(const BreakIterator
& that
) const { 
 295     if (typeid(*this) != typeid(that
)) { 
 299     const RuleBasedBreakIterator
& that2 
= (const RuleBasedBreakIterator
&) that
; 
 301     if (!utext_equals(fText
, that2
.fText
)) { 
 302         // The two break iterators are operating on different text, 
 303         //   or have a different iteration position. 
 307     // TODO:  need a check for when in a dictionary region at different offsets. 
 309     if (that2
.fData 
== fData 
|| 
 310         (fData 
!= NULL 
&& that2
.fData 
!= NULL 
&& *that2
.fData 
== *fData
)) { 
 311             // The two break iterators are using the same rules. 
 318  * Compute a hash code for this BreakIterator 
 319  * @return A hash code 
 322 RuleBasedBreakIterator::hashCode(void) const { 
 325         hash 
= fData
->hashCode(); 
 331 void RuleBasedBreakIterator::setText(UText 
*ut
, UErrorCode 
&status
) { 
 332     if (U_FAILURE(status
)) { 
 336     fText 
= utext_clone(fText
, ut
, FALSE
, TRUE
, &status
); 
 338     // Set up a dummy CharacterIterator to be returned if anyone 
 339     //   calls getText().  With input from UText, there is no reasonable 
 340     //   way to return a characterIterator over the actual input text. 
 341     //   Return one over an empty string instead - this is the closest 
 342     //   we can come to signaling a failure. 
 343     //   (GetText() is obsolete, this failure is sort of OK) 
 344     if (fDCharIter 
== NULL
) { 
 345         static const UChar c 
= 0; 
 346         fDCharIter 
= new UCharCharacterIterator(&c
, 0); 
 347         if (fDCharIter 
== NULL
) { 
 348             status 
= U_MEMORY_ALLOCATION_ERROR
; 
 353     if (fCharIter
!=fSCharIter 
&& fCharIter
!=fDCharIter
) { 
 354         // existing fCharIter was adopted from the outside.  Delete it now. 
 357     fCharIter 
= fDCharIter
; 
 363 UText 
*RuleBasedBreakIterator::getUText(UText 
*fillIn
, UErrorCode 
&status
) const { 
 364     UText 
*result 
= utext_clone(fillIn
, fText
, FALSE
, TRUE
, &status
);   
 371  * Returns the description used to create this iterator 
 374 RuleBasedBreakIterator::getRules() const { 
 376         return fData
->getRuleSourceString(); 
 378         static const UnicodeString 
*s
; 
 380             // TODO:  something more elegant here. 
 381             //        perhaps API should return the string by value. 
 382             //        Note:  thread unsafe init & leak are semi-ok, better than 
 383             //               what was before.  Should be cleaned up, though. 
 384             s 
= new UnicodeString
; 
 390 //======================================================================= 
 391 // BreakIterator overrides 
 392 //======================================================================= 
 395  * Return a CharacterIterator over the text being analyzed.   
 398 RuleBasedBreakIterator::getText() const { 
 403  * Set the iterator to analyze a new piece of text.  This function resets 
 404  * the current iteration position to the beginning of the text. 
 405  * @param newText An iterator over the text to analyze. 
 408 RuleBasedBreakIterator::adoptText(CharacterIterator
* newText
) { 
 409     // If we are holding a CharacterIterator adopted from a  
 410     //   previous call to this function, delete it now. 
 411     if (fCharIter
!=fSCharIter 
&& fCharIter
!=fDCharIter
) { 
 416     UErrorCode status 
= U_ZERO_ERROR
; 
 418     if (newText
==NULL 
|| newText
->startIndex() != 0) {    
 419         // startIndex !=0 wants to be an error, but there's no way to report it. 
 420         // Make the iterator text be an empty string. 
 421         fText 
= utext_openUChars(fText
, NULL
, 0, &status
); 
 423         fText 
= utext_openCharacterIterator(fText
, newText
, &status
); 
 429  * Set the iterator to analyze a new piece of text.  This function resets 
 430  * the current iteration position to the beginning of the text. 
 431  * @param newText The text (a UnicodeString) to analyze. 
 434 RuleBasedBreakIterator::setText(const UnicodeString
& newText
) { 
 435     UErrorCode status 
= U_ZERO_ERROR
; 
 437     fText 
= utext_openConstUnicodeString(fText
, &newText
, &status
); 
 439     // Set up a character iterator on the string.   
 440     //   Needed in case someone calls getText(). 
 441     //  Can not, unfortunately, do this lazily on the (probably never) 
 442     //  call to getText(), because getText is const. 
 443     if (fSCharIter 
== NULL
) { 
 444         fSCharIter 
= new StringCharacterIterator(newText
); 
 446         fSCharIter
->setText(newText
); 
 449     if (fCharIter
!=fSCharIter 
&& fCharIter
!=fDCharIter
) { 
 450         // old fCharIter was adopted from the outside.  Delete it. 
 453     fCharIter 
= fSCharIter
; 
 461  * Sets the current iteration position to the beginning of the text. 
 462  * @return The offset of the beginning of the text. 
 464 int32_t RuleBasedBreakIterator::first(void) { 
 466     fLastRuleStatusIndex  
= 0; 
 467     fLastStatusIndexValid 
= TRUE
; 
 469     //    return BreakIterator::DONE; 
 471     utext_setNativeIndex(fText
, 0); 
 476  * Sets the current iteration position to the end of the text. 
 477  * @return The text's past-the-end offset. 
 479 int32_t RuleBasedBreakIterator::last(void) { 
 482         fLastRuleStatusIndex  
= 0; 
 483         fLastStatusIndexValid 
= TRUE
; 
 484         return BreakIterator::DONE
; 
 487     fLastStatusIndexValid 
= FALSE
; 
 488     int32_t pos 
= (int32_t)utext_nativeLength(fText
); 
 489     utext_setNativeIndex(fText
, pos
); 
 494  * Advances the iterator either forward or backward the specified number of steps. 
 495  * Negative values move backward, and positive values move forward.  This is 
 496  * equivalent to repeatedly calling next() or previous(). 
 497  * @param n The number of steps to move.  The sign indicates the direction 
 498  * (negative is backwards, and positive is forwards). 
 499  * @return The character offset of the boundary position n boundaries away from 
 502 int32_t RuleBasedBreakIterator::next(int32_t n
) { 
 503     int32_t result 
= current(); 
 516  * Advances the iterator to the next boundary position. 
 517  * @return The position of the first boundary after this one. 
 519 int32_t RuleBasedBreakIterator::next(void) { 
 520     // if we have cached break positions and we're still in the range 
 521     // covered by them, just move one step forward in the cache 
 522     if (fCachedBreakPositions 
!= NULL
) { 
 523         if (fPositionInCache 
< fNumCachedBreakPositions 
- 1) { 
 525             int32_t pos 
= fCachedBreakPositions
[fPositionInCache
]; 
 526             utext_setNativeIndex(fText
, pos
); 
 534     int32_t startPos 
= current(); 
 535     int32_t result 
= handleNext(fData
->fForwardTable
); 
 536     if (fDictionaryCharCount 
> 0) { 
 537         result 
= checkDictionary(startPos
, result
, FALSE
); 
 543  * Advances the iterator backwards, to the last boundary preceding this one. 
 544  * @return The position of the last boundary position preceding this one. 
 546 int32_t RuleBasedBreakIterator::previous(void) { 
 550     // if we have cached break positions and we're still in the range 
 551     // covered by them, just move one step backward in the cache 
 552     if (fCachedBreakPositions 
!= NULL
) { 
 553         if (fPositionInCache 
> 0) { 
 555             // If we're at the beginning of the cache, need to reevaluate the 
 557             if (fPositionInCache 
<= 0) { 
 558                 fLastStatusIndexValid 
= FALSE
; 
 560             int32_t pos 
= fCachedBreakPositions
[fPositionInCache
]; 
 561             utext_setNativeIndex(fText
, pos
); 
 569     // if we're already sitting at the beginning of the text, return DONE 
 570     if (fText 
== NULL 
|| (startPos 
= current()) == 0) { 
 571         fLastRuleStatusIndex  
= 0; 
 572         fLastStatusIndexValid 
= TRUE
; 
 573         return BreakIterator::DONE
; 
 576     if (fData
->fSafeRevTable 
!= NULL 
|| fData
->fSafeFwdTable 
!= NULL
) { 
 577         result 
= handlePrevious(fData
->fReverseTable
); 
 578         if (fDictionaryCharCount 
> 0) { 
 579             result 
= checkDictionary(result
, startPos
, TRUE
); 
 585     // set things up.  handlePrevious() will back us up to some valid 
 586     // break position before the current position (we back our internal 
 587     // iterator up one step to prevent handlePrevious() from returning 
 588     // the current position), but not necessarily the last one before 
 592     int32_t start 
= current(); 
 594     UTEXT_PREVIOUS32(fText
); 
 595     int32_t lastResult    
= handlePrevious(fData
->fReverseTable
); 
 596     if (lastResult 
== UBRK_DONE
) { 
 598         utext_setNativeIndex(fText
, 0); 
 602     UBool   breakTagValid 
= FALSE
; 
 604     // iterate forward from the known break position until we pass our 
 605     // starting point.  The last break position before the starting 
 606     // point is our return value 
 610         if (result 
== BreakIterator::DONE 
|| result 
>= start
) { 
 614         lastTag        
= fLastRuleStatusIndex
; 
 615         breakTagValid  
= TRUE
; 
 618     // fLastBreakTag wants to have the value for section of text preceding 
 619     // the result position that we are to return (in lastResult.)  If 
 620     // the backwards rules overshot and the above loop had to do two or more 
 621     // next()s to move up to the desired return position, we will have a valid 
 622     // tag value. But, if handlePrevious() took us to exactly the correct result position, 
 623     // we won't have a tag value for that position, which is only set by handleNext(). 
 625     // set the current iteration position to be the last break position 
 626     // before where we started, and then return that value 
 627     utext_setNativeIndex(fText
, lastResult
); 
 628     fLastRuleStatusIndex  
= lastTag
;       // for use by getRuleStatus() 
 629     fLastStatusIndexValid 
= breakTagValid
; 
 631     // No need to check the dictionary; it will have been handled by 
 638  * Sets the iterator to refer to the first boundary position following 
 639  * the specified position. 
 640  * @param offset The position from which to begin searching for a break position. 
 641  * @return The position of the first break after the current position. 
 643 int32_t RuleBasedBreakIterator::following(int32_t offset
) { 
 644     // if we have cached break positions and offset is in the range 
 645     // covered by them, use them 
 646     // TODO: could use binary search 
 647     // TODO: what if offset is outside range, but break is not? 
 648     if (fCachedBreakPositions 
!= NULL
) { 
 649         if (offset 
>= fCachedBreakPositions
[0] 
 650                 && offset 
< fCachedBreakPositions
[fNumCachedBreakPositions 
- 1]) { 
 651             fPositionInCache 
= 0; 
 652             // We are guaranteed not to leave the array due to range test above 
 653             while (offset 
>= fCachedBreakPositions
[fPositionInCache
]) { 
 656             int32_t pos 
= fCachedBreakPositions
[fPositionInCache
]; 
 657             utext_setNativeIndex(fText
, pos
); 
 665     // if the offset passed in is already past the end of the text, 
 666     // just return DONE; if it's before the beginning, return the 
 667     // text's starting offset 
 668     fLastRuleStatusIndex  
= 0; 
 669     fLastStatusIndexValid 
= TRUE
; 
 670     if (fText 
== NULL 
|| offset 
>= utext_nativeLength(fText
)) { 
 674     else if (offset 
< 0) { 
 678     // otherwise, set our internal iteration position (temporarily) 
 679     // to the position passed in.  If this is the _beginning_ position, 
 680     // then we can just use next() to get our return value 
 684     if (fData
->fSafeRevTable 
!= NULL
) { 
 686         utext_setNativeIndex(fText
, offset
); 
 687         // move forward one codepoint to prepare for moving back to a 
 689         // this handles offset being between a supplementary character 
 691         // handlePrevious will move most of the time to < 1 boundary away 
 692         handlePrevious(fData
->fSafeRevTable
); 
 693         int32_t result 
= next(); 
 694         while (result 
<= offset
) { 
 699     if (fData
->fSafeFwdTable 
!= NULL
) { 
 700         // backup plan if reverse safe table is not available 
 701         utext_setNativeIndex(fText
, offset
); 
 702         UTEXT_PREVIOUS32(fText
); 
 703         // handle next will give result >= offset 
 704         handleNext(fData
->fSafeFwdTable
); 
 705         // previous will give result 0 or 1 boundary away from offset, 
 708         int32_t oldresult 
= previous(); 
 709         while (oldresult 
> offset
) { 
 710             int32_t result 
= previous(); 
 711             if (result 
<= offset
) { 
 716         int32_t result 
= next(); 
 717         if (result 
<= offset
) { 
 722     // otherwise, we have to sync up first.  Use handlePrevious() to back 
 723     // up to a known break position before the specified position (if 
 724     // we can determine that the specified position is a break position, 
 725     // we don't back up at all).  This may or may not be the last break 
 726     // position at or before our starting position.  Advance forward 
 727     // from here until we've passed the starting position.  The position 
 728     // we stop on will be the first break position after the specified one. 
 731     utext_setNativeIndex(fText
, offset
); 
 733         (offset
==1  && utext_getNativeIndex(fText
)==0)) { 
 738     while (result 
!= BreakIterator::DONE 
&& result 
<= offset
) { 
 746  * Sets the iterator to refer to the last boundary position before the 
 747  * specified position. 
 748  * @param offset The position to begin searching for a break from. 
 749  * @return The position of the last boundary before the starting position. 
 751 int32_t RuleBasedBreakIterator::preceding(int32_t offset
) { 
 752     // if we have cached break positions and offset is in the range 
 753     // covered by them, use them 
 754     if (fCachedBreakPositions 
!= NULL
) { 
 755         // TODO: binary search? 
 756         // TODO: What if offset is outside range, but break is not? 
 757         if (offset 
> fCachedBreakPositions
[0] 
 758                 && offset 
<= fCachedBreakPositions
[fNumCachedBreakPositions 
- 1]) { 
 759             fPositionInCache 
= 0; 
 760             while (fPositionInCache 
< fNumCachedBreakPositions
 
 761                    && offset 
> fCachedBreakPositions
[fPositionInCache
]) 
 764             // If we're at the beginning of the cache, need to reevaluate the 
 766             if (fPositionInCache 
<= 0) { 
 767                 fLastStatusIndexValid 
= FALSE
; 
 769             utext_setNativeIndex(fText
, fCachedBreakPositions
[fPositionInCache
]); 
 770             return fCachedBreakPositions
[fPositionInCache
]; 
 777     // if the offset passed in is already past the end of the text, 
 778     // just return DONE; if it's before the beginning, return the 
 779     // text's starting offset 
 780     if (fText 
== NULL 
|| offset 
> utext_nativeLength(fText
)) { 
 781         // return BreakIterator::DONE; 
 784     else if (offset 
< 0) { 
 788     // if we start by updating the current iteration position to the 
 789     // position specified by the caller, we can just use previous() 
 790     // to carry out this operation 
 792     if (fData
->fSafeFwdTable 
!= NULL
) { 
 794         utext_setNativeIndex(fText
, offset
); 
 795         int32_t newOffset 
= (int32_t)UTEXT_GETNATIVEINDEX(fText
); 
 796         if (newOffset 
!= offset
) { 
 797             // Will come here if specified offset was not a code point boundary AND 
 798             //   the underlying implementation is using UText, which snaps any non-code-point-boundary 
 799             //   indices to the containing code point. 
 800             // For BreakIterator::preceding only, these non-code-point indices need to be moved 
 801             //   up to refer to the following codepoint. 
 803             offset 
= (int32_t)UTEXT_GETNATIVEINDEX(fText
); 
 806         // TODO:  (synwee) would it be better to just check for being in the middle of a surrogate pair, 
 807         //        rather than adjusting the position unconditionally? 
 808         //        (Change would interact with safe rules.) 
 809         // TODO:  change RBBI behavior for off-boundary indices to match that of UText? 
 810         //        affects only preceding(), seems cleaner, but is slightly different. 
 811         UTEXT_PREVIOUS32(fText
); 
 812         handleNext(fData
->fSafeFwdTable
); 
 813         int32_t result 
= (int32_t)UTEXT_GETNATIVEINDEX(fText
); 
 814         while (result 
>= offset
) { 
 819     if (fData
->fSafeRevTable 
!= NULL
) { 
 820         // backup plan if forward safe table is not available 
 821         //  TODO:  check whether this path can be discarded 
 822         //         It's probably OK to say that rules must supply both safe tables 
 823         //            if they use safe tables at all.  We have certainly never described 
 824         //            to anyone how to work with just one safe table. 
 825         utext_setNativeIndex(fText
, offset
); 
 828         // handle previous will give result <= offset 
 829         handlePrevious(fData
->fSafeRevTable
); 
 831         // next will give result 0 or 1 boundary away from offset, 
 834         int32_t oldresult 
= next(); 
 835         while (oldresult 
< offset
) { 
 836             int32_t result 
= next(); 
 837             if (result 
>= offset
) { 
 842         int32_t result 
= previous(); 
 843         if (result 
>= offset
) { 
 850     utext_setNativeIndex(fText
, offset
); 
 855  * Returns true if the specified position is a boundary position.  As a side 
 856  * effect, leaves the iterator pointing to the first boundary position at 
 858  * @param offset the offset to check. 
 859  * @return True if "offset" is a boundary position. 
 861 UBool 
RuleBasedBreakIterator::isBoundary(int32_t offset
) { 
 862     // the beginning index of the iterator is always a boundary position by definition 
 864         first();       // For side effects on current position, tag values. 
 868     if (offset 
== (int32_t)utext_nativeLength(fText
)) { 
 869         last();       // For side effects on current position, tag values. 
 873     // out-of-range indexes are never boundary positions 
 875         first();       // For side effects on current position, tag values. 
 879     if (offset 
> utext_nativeLength(fText
)) { 
 880         last();        // For side effects on current position, tag values. 
 884     // otherwise, we can use following() on the position before the specified 
 885     // one and return true if the position we get back is the one the user 
 887     utext_previous32From(fText
, offset
); 
 888     int32_t backOne 
= (int32_t)UTEXT_GETNATIVEINDEX(fText
); 
 889     UBool    result  
= following(backOne
) == offset
; 
 894  * Returns the current iteration position. 
 895  * @return The current iteration position. 
 897 int32_t RuleBasedBreakIterator::current(void) const { 
 898     int32_t  pos 
= (int32_t)UTEXT_GETNATIVEINDEX(fText
); 
 902 //======================================================================= 
 904 //======================================================================= 
 907 // RBBIRunMode  -  the state machine runs an extra iteration at the beginning and end 
 908 //                 of user text.  A variable with this enum type keeps track of where we 
 909 //                 are.  The state machine only fetches user input while in the RUN mode. 
 912     RBBI_START
,     // state machine processing is before first char of input 
 913     RBBI_RUN
,       // state machine processing is in the user text 
 914     RBBI_END        
// state machine processing is after end of user text. 
 918 //----------------------------------------------------------------------------------- 
 920 //  handleNext(stateTable) 
 921 //     This method is the actual implementation of the rbbi next() method.  
 922 //     This method initializes the state machine to state 1 
 923 //     and advances through the text character by character until we reach the end 
 924 //     of the text or the state machine transitions to state 0.  We update our return 
 925 //     value every time the state machine passes through an accepting state. 
 927 //----------------------------------------------------------------------------------- 
 928 int32_t RuleBasedBreakIterator::handleNext(const RBBIStateTable 
*statetable
) { 
 930     int16_t             category        
= 0; 
 933     RBBIStateTableRow  
*row
; 
 935     int32_t             lookaheadStatus 
= 0; 
 936     int32_t             lookaheadTagIdx 
= 0; 
 938     int32_t             initialPosition 
= 0; 
 939     int32_t             lookaheadResult 
= 0; 
 940     UBool               lookAheadHardBreak 
= (statetable
->fFlags 
& RBBI_LOOKAHEAD_HARD_BREAK
) != 0; 
 941     const char         *tableData       
= statetable
->fTableData
; 
 942     uint32_t            tableRowLen     
= statetable
->fRowLen
; 
 946             RBBIDebugPuts("Handle Next   pos   char  state category"); 
 950     // No matter what, handleNext alway correctly sets the break tag value. 
 951     fLastStatusIndexValid 
= TRUE
; 
 952     fLastRuleStatusIndex 
= 0; 
 954     // if we're already at the end of the text, return DONE. 
 955     initialPosition 
= (int32_t)UTEXT_GETNATIVEINDEX(fText
);  
 956     result          
= initialPosition
; 
 957     c               
= UTEXT_NEXT32(fText
); 
 958     if (fData 
== NULL 
|| c
==U_SENTINEL
) { 
 959         return BreakIterator::DONE
; 
 962     //  Set the initial state for the state machine 
 964     row 
= (RBBIStateTableRow 
*) 
 965             //(statetable->fTableData + (statetable->fRowLen * state)); 
 966             (tableData 
+ tableRowLen 
* state
); 
 970     if (statetable
->fFlags 
& RBBI_BOF_REQUIRED
) { 
 976     // loop until we reach the end of the text or transition to state 0 
 979         if (c 
== U_SENTINEL
) { 
 980             // Reached end of input string. 
 981             if (mode 
== RBBI_END
) { 
 982                 // We have already run the loop one last time with the  
 983                 //   character set to the psueudo {eof} value.  Now it is time 
 984                 //   to unconditionally bail out. 
 985                 if (lookaheadResult 
> result
) { 
 986                     // We ran off the end of the string with a pending look-ahead match. 
 987                     // Treat this as if the look-ahead condition had been met, and return 
 988                     //  the match at the / position from the look-ahead rule. 
 989                     result               
= lookaheadResult
; 
 990                     fLastRuleStatusIndex 
= lookaheadTagIdx
; 
 995             // Run the loop one last time with the fake end-of-input character category. 
1001         // Get the char category.  An incoming category of 1 or 2 means that 
1002         //      we are preset for doing the beginning or end of input, and 
1003         //      that we shouldn't get a category from an actual text input character. 
1005         if (mode 
== RBBI_RUN
) { 
1006             // look up the current character's character category, which tells us 
1007             // which column in the state table to look at. 
1008             // Note:  the 16 in UTRIE_GET16 refers to the size of the data being returned, 
1009             //        not the size of the character going in, which is a UChar32. 
1011             UTRIE_GET16(&fData
->fTrie
, c
, category
); 
1013             // Check the dictionary bit in the character's category. 
1014             //    Counter is only used by dictionary based iterators (subclasses). 
1015             //    Chars that need to be handled by a dictionary have a flag bit set 
1016             //    in their category values. 
1018             if ((category 
& 0x4000) != 0)  { 
1019                 fDictionaryCharCount
++; 
1020                 //  And off the dictionary flag bit. 
1021                 category 
&= ~0x4000; 
1027                 RBBIDebugPrintf("             %4ld   ", utext_getNativeIndex(fText
)); 
1028                 if (0x20<=c 
&& c
<0x7f) { 
1029                     RBBIDebugPrintf("\"%c\"  ", c
); 
1031                     RBBIDebugPrintf("%5x  ", c
); 
1033                 RBBIDebugPrintf("%3d  %3d\n", state
, category
); 
1037         // State Transition - move machine to its next state 
1039         state 
= row
->fNextState
[category
]; 
1040         row 
= (RBBIStateTableRow 
*) 
1041             // (statetable->fTableData + (statetable->fRowLen * state)); 
1042             (tableData 
+ tableRowLen 
* state
); 
1045         if (row
->fAccepting 
== -1) { 
1046             // Match found, common case. 
1047             if (mode 
!= RBBI_START
) { 
1048                 result 
= (int32_t)UTEXT_GETNATIVEINDEX(fText
); 
1050             fLastRuleStatusIndex 
= row
->fTagIdx
;   // Remember the break status (tag) values. 
1053         if (row
->fLookAhead 
!= 0) { 
1054             if (lookaheadStatus 
!= 0 
1055                 && row
->fAccepting 
== lookaheadStatus
) { 
1056                 // Lookahead match is completed.   
1057                 result               
= lookaheadResult
; 
1058                 fLastRuleStatusIndex 
= lookaheadTagIdx
; 
1059                 lookaheadStatus      
= 0; 
1060                 // TODO:  make a standalone hard break in a rule work. 
1061                 if (lookAheadHardBreak
) { 
1062                     UTEXT_SETNATIVEINDEX(fText
, result
); 
1065                 // Look-ahead completed, but other rules may match further.  Continue on 
1066                 //  TODO:  junk this feature?  I don't think it's used anywhwere. 
1070             int32_t  r 
= (int32_t)UTEXT_GETNATIVEINDEX(fText
); 
1071             lookaheadResult 
= r
; 
1072             lookaheadStatus 
= row
->fLookAhead
; 
1073             lookaheadTagIdx 
= row
->fTagIdx
; 
1078         if (row
->fAccepting 
!= 0) { 
1079             // Because this is an accepting state, any in-progress look-ahead match 
1080             //   is no longer relavant.  Clear out the pending lookahead status. 
1081             lookaheadStatus 
= 0;           // clear out any pending look-ahead match. 
1085         if (state 
== STOP_STATE
) { 
1086             // This is the normal exit from the lookup state machine. 
1087             // We have advanced through the string until it is certain that no 
1088             //   longer match is possible, no matter what characters follow. 
1092         // Advance to the next character.   
1093         // If this is a beginning-of-input loop iteration, don't advance 
1094         //    the input position.  The next iteration will be processing the 
1095         //    first real input character. 
1096         if (mode 
== RBBI_RUN
) { 
1097             c 
= UTEXT_NEXT32(fText
); 
1099             if (mode 
== RBBI_START
) { 
1107     // The state machine is done.  Check whether it found a match... 
1109     // If the iterator failed to advance in the match engine, force it ahead by one. 
1110     //   (This really indicates a defect in the break rules.  They should always match 
1111     //    at least one character.) 
1112     if (result 
== initialPosition
) { 
1113         UTEXT_SETNATIVEINDEX(fText
, initialPosition
); 
1114         UTEXT_NEXT32(fText
); 
1115         result 
= (int32_t)UTEXT_GETNATIVEINDEX(fText
); 
1118     // Leave the iterator at our result position. 
1119     UTEXT_SETNATIVEINDEX(fText
, result
); 
1122             RBBIDebugPrintf("result = %d\n\n", result
); 
1130 //----------------------------------------------------------------------------------- 
1134 //      Iterate backwards, according to the logic of the reverse rules. 
1135 //      This version handles the exact style backwards rules. 
1137 //      The logic of this function is very similar to handleNext(), above. 
1139 //----------------------------------------------------------------------------------- 
1140 int32_t RuleBasedBreakIterator::handlePrevious(const RBBIStateTable 
*statetable
) { 
1142     int16_t             category        
= 0; 
1144     RBBIStateTableRow  
*row
; 
1146     int32_t             lookaheadStatus 
= 0; 
1148     int32_t             initialPosition 
= 0; 
1149     int32_t             lookaheadResult 
= 0; 
1150     UBool               lookAheadHardBreak 
= (statetable
->fFlags 
& RBBI_LOOKAHEAD_HARD_BREAK
) != 0; 
1154             RBBIDebugPuts("Handle Previous   pos   char  state category"); 
1158     // handlePrevious() never gets the rule status. 
1159     // Flag the status as invalid; if the user ever asks for status, we will need 
1160     // to back up, then re-find the break position using handleNext(), which does 
1161     // get the status value. 
1162     fLastStatusIndexValid 
= FALSE
; 
1163     fLastRuleStatusIndex 
= 0; 
1165     // if we're already at the start of the text, return DONE. 
1166     if (fText 
== NULL 
|| fData 
== NULL 
|| UTEXT_GETNATIVEINDEX(fText
)==0) { 
1167         return BreakIterator::DONE
; 
1170     //  Set up the starting char. 
1171     initialPosition 
= (int32_t)UTEXT_GETNATIVEINDEX(fText
); 
1172     result          
= initialPosition
; 
1173     c               
= UTEXT_PREVIOUS32(fText
); 
1175     //  Set the initial state for the state machine 
1176     state 
= START_STATE
; 
1177     row 
= (RBBIStateTableRow 
*) 
1178             (statetable
->fTableData 
+ (statetable
->fRowLen 
* state
)); 
1181     if (statetable
->fFlags 
& RBBI_BOF_REQUIRED
) { 
1187     // loop until we reach the start of the text or transition to state 0 
1190         if (c 
== U_SENTINEL
) { 
1191             // Reached end of input string. 
1192             if (mode 
== RBBI_END
) { 
1193                 // We have already run the loop one last time with the  
1194                 //   character set to the psueudo {eof} value.  Now it is time 
1195                 //   to unconditionally bail out. 
1196                 if (lookaheadResult 
< result
) { 
1197                     // We ran off the end of the string with a pending look-ahead match. 
1198                     // Treat this as if the look-ahead condition had been met, and return 
1199                     //  the match at the / position from the look-ahead rule. 
1200                     result               
= lookaheadResult
; 
1201                     lookaheadStatus 
= 0; 
1202                 } else if (result 
== initialPosition
) { 
1203                     // Ran off start, no match found. 
1204                     // move one index one (towards the start, since we are doing a previous()) 
1205                     UTEXT_SETNATIVEINDEX(fText
, initialPosition
); 
1206                     UTEXT_PREVIOUS32(fText
);   // TODO:  shouldn't be necessary.  We're already at beginning.  Check. 
1210             // Run the loop one last time with the fake end-of-input character category. 
1216         // Get the char category.  An incoming category of 1 or 2 means that 
1217         //      we are preset for doing the beginning or end of input, and 
1218         //      that we shouldn't get a category from an actual text input character. 
1220         if (mode 
== RBBI_RUN
) { 
1221             // look up the current character's character category, which tells us 
1222             // which column in the state table to look at. 
1223             // Note:  the 16 in UTRIE_GET16 refers to the size of the data being returned, 
1224             //        not the size of the character going in, which is a UChar32. 
1226             UTRIE_GET16(&fData
->fTrie
, c
, category
); 
1228             // Check the dictionary bit in the character's category. 
1229             //    Counter is only used by dictionary based iterators (subclasses). 
1230             //    Chars that need to be handled by a dictionary have a flag bit set 
1231             //    in their category values. 
1233             if ((category 
& 0x4000) != 0)  { 
1234                 fDictionaryCharCount
++; 
1235                 //  And off the dictionary flag bit. 
1236                 category 
&= ~0x4000; 
1242                 RBBIDebugPrintf("             %4d   ", (int32_t)utext_getNativeIndex(fText
)); 
1243                 if (0x20<=c 
&& c
<0x7f) { 
1244                     RBBIDebugPrintf("\"%c\"  ", c
); 
1246                     RBBIDebugPrintf("%5x  ", c
); 
1248                 RBBIDebugPrintf("%3d  %3d\n", state
, category
); 
1252         // State Transition - move machine to its next state 
1254         state 
= row
->fNextState
[category
]; 
1255         row 
= (RBBIStateTableRow 
*) 
1256             (statetable
->fTableData 
+ (statetable
->fRowLen 
* state
)); 
1258         if (row
->fAccepting 
== -1) { 
1259             // Match found, common case. 
1260             result 
= (int32_t)UTEXT_GETNATIVEINDEX(fText
); 
1263         if (row
->fLookAhead 
!= 0) { 
1264             if (lookaheadStatus 
!= 0 
1265                 && row
->fAccepting 
== lookaheadStatus
) { 
1266                 // Lookahead match is completed.   
1267                 result               
= lookaheadResult
; 
1268                 lookaheadStatus      
= 0; 
1269                 // TODO:  make a standalone hard break in a rule work. 
1270                 if (lookAheadHardBreak
) { 
1271                     UTEXT_SETNATIVEINDEX(fText
, result
); 
1274                 // Look-ahead completed, but other rules may match further.  Continue on 
1275                 //  TODO:  junk this feature?  I don't think it's used anywhwere. 
1279             int32_t  r 
= (int32_t)UTEXT_GETNATIVEINDEX(fText
); 
1280             lookaheadResult 
= r
; 
1281             lookaheadStatus 
= row
->fLookAhead
; 
1286         if (row
->fAccepting 
!= 0) { 
1287             // Because this is an accepting state, any in-progress look-ahead match 
1288             //   is no longer relavant.  Clear out the pending lookahead status. 
1289             lookaheadStatus 
= 0;     
1293         if (state 
== STOP_STATE
) { 
1294             // This is the normal exit from the lookup state machine. 
1295             // We have advanced through the string until it is certain that no 
1296             //   longer match is possible, no matter what characters follow. 
1300         // Move (backwards) to the next character to process.   
1301         // If this is a beginning-of-input loop iteration, don't advance 
1302         //    the input position.  The next iteration will be processing the 
1303         //    first real input character. 
1304         if (mode 
== RBBI_RUN
) { 
1305             c 
= UTEXT_PREVIOUS32(fText
); 
1307             if (mode 
== RBBI_START
) { 
1313     // The state machine is done.  Check whether it found a match... 
1315     // If the iterator failed to advance in the match engine, force it ahead by one. 
1316     //   (This really indicates a defect in the break rules.  They should always match 
1317     //    at least one character.) 
1318     if (result 
== initialPosition
) { 
1319         UTEXT_SETNATIVEINDEX(fText
, initialPosition
); 
1320         UTEXT_PREVIOUS32(fText
); 
1321         result 
= (int32_t)UTEXT_GETNATIVEINDEX(fText
); 
1324     // Leave the iterator at our result position. 
1325     UTEXT_SETNATIVEINDEX(fText
, result
); 
1328             RBBIDebugPrintf("result = %d\n\n", result
); 
1336 RuleBasedBreakIterator::reset() 
1338     if (fCachedBreakPositions
) { 
1339         uprv_free(fCachedBreakPositions
); 
1341     fCachedBreakPositions 
= NULL
; 
1342     fNumCachedBreakPositions 
= 0; 
1343     fDictionaryCharCount 
= 0; 
1344     fPositionInCache 
= 0; 
1349 //------------------------------------------------------------------------------- 
1351 //   getRuleStatus()   Return the break rule tag associated with the current 
1352 //                     iterator position.  If the iterator arrived at its current 
1353 //                     position by iterating forwards, the value will have been 
1354 //                     cached by the handleNext() function. 
1356 //                     If no cached status value is available, the status is 
1357 //                     found by doing a previous() followed by a next(), which 
1358 //                     leaves the iterator where it started, and computes the 
1359 //                     status while doing the next(). 
1361 //------------------------------------------------------------------------------- 
1362 void RuleBasedBreakIterator::makeRuleStatusValid() { 
1363     if (fLastStatusIndexValid 
== FALSE
) { 
1364         //  No cached status is available. 
1365         if (fText 
== NULL 
|| current() == 0) { 
1366             //  At start of text, or there is no text.  Status is always zero. 
1367             fLastRuleStatusIndex 
= 0; 
1368             fLastStatusIndexValid 
= TRUE
; 
1370             //  Not at start of text.  Find status the tedious way. 
1371             int32_t pa 
= current(); 
1373             if (fNumCachedBreakPositions 
> 0) { 
1374                 reset();                // Blow off the dictionary cache 
1376             int32_t pb 
= next(); 
1378                 // note: the if (pa != pb) test is here only to eliminate warnings for 
1379                 //       unused local variables on gcc.  Logically, it isn't needed. 
1384     U_ASSERT(fLastRuleStatusIndex 
>= 0  &&  fLastRuleStatusIndex 
< fData
->fStatusMaxIdx
); 
1388 int32_t  RuleBasedBreakIterator::getRuleStatus() const { 
1389     RuleBasedBreakIterator 
*nonConstThis  
= (RuleBasedBreakIterator 
*)this; 
1390     nonConstThis
->makeRuleStatusValid(); 
1392     // fLastRuleStatusIndex indexes to the start of the appropriate status record 
1393     //                                                 (the number of status values.) 
1394     //   This function returns the last (largest) of the array of status values. 
1395     int32_t  idx 
= fLastRuleStatusIndex 
+ fData
->fRuleStatusTable
[fLastRuleStatusIndex
]; 
1396     int32_t  tagVal 
= fData
->fRuleStatusTable
[idx
]; 
1404 int32_t RuleBasedBreakIterator::getRuleStatusVec( 
1405              int32_t *fillInVec
, int32_t capacity
, UErrorCode 
&status
) 
1407     if (U_FAILURE(status
)) { 
1411     RuleBasedBreakIterator 
*nonConstThis  
= (RuleBasedBreakIterator 
*)this; 
1412     nonConstThis
->makeRuleStatusValid(); 
1413     int32_t  numVals 
= fData
->fRuleStatusTable
[fLastRuleStatusIndex
]; 
1414     int32_t  numValsToCopy 
= numVals
; 
1415     if (numVals 
> capacity
) { 
1416         status 
= U_BUFFER_OVERFLOW_ERROR
; 
1417         numValsToCopy 
= capacity
; 
1420     for (i
=0; i
<numValsToCopy
; i
++) { 
1421         fillInVec
[i
] = fData
->fRuleStatusTable
[fLastRuleStatusIndex 
+ i 
+ 1]; 
1428 //------------------------------------------------------------------------------- 
1430 //   getBinaryRules        Access to the compiled form of the rules, 
1431 //                         for use by build system tools that save the data 
1432 //                         for standard iterator types. 
1434 //------------------------------------------------------------------------------- 
1435 const uint8_t  *RuleBasedBreakIterator::getBinaryRules(uint32_t &length
) { 
1436     const uint8_t  *retPtr 
= NULL
; 
1439     if (fData 
!= NULL
) { 
1440         retPtr 
= (const uint8_t *)fData
->fHeader
; 
1441         length 
= fData
->fHeader
->fLength
; 
1449 //------------------------------------------------------------------------------- 
1451 //  BufferClone       TODO:  In my (Andy) opinion, this function should be deprecated. 
1452 //                    Saving one heap allocation isn't worth the trouble. 
1453 //                    Cloning shouldn't be done in tight loops, and 
1454 //                    making the clone copy involves other heap operations anyway. 
1455 //                    And the application code for correctly dealing with buffer 
1456 //                    size problems and the eventual object destruction is ugly. 
1458 //------------------------------------------------------------------------------- 
1459 BreakIterator 
*  RuleBasedBreakIterator::createBufferClone(void *stackBuffer
, 
1460                                    int32_t &bufferSize
, 
1463     if (U_FAILURE(status
)){ 
1468     //  If user buffer size is zero this is a preflight operation to 
1469     //    obtain the needed buffer size, allowing for worst case misalignment. 
1471     if (bufferSize 
== 0) { 
1472         bufferSize 
= sizeof(RuleBasedBreakIterator
) + U_ALIGNMENT_OFFSET_UP(0); 
1478     //  Check the alignment and size of the user supplied buffer. 
1479     //  Allocate heap memory if the user supplied memory is insufficient. 
1481     char    *buf   
= (char *)stackBuffer
; 
1482     uint32_t s      
= bufferSize
; 
1484     if (stackBuffer 
== NULL
) { 
1485         s 
= 0;   // Ignore size, force allocation if user didn't give us a buffer. 
1487     if (U_ALIGNMENT_OFFSET(stackBuffer
) != 0) { 
1488         uint32_t offsetUp 
= (uint32_t)U_ALIGNMENT_OFFSET_UP(buf
); 
1492     if (s 
< sizeof(RuleBasedBreakIterator
)) { 
1493         // Not enough room in the caller-supplied buffer. 
1494         // Do a plain-vanilla heap based clone and return that, along with 
1495         //   a warning that the clone was allocated. 
1496         RuleBasedBreakIterator 
*clonedBI 
= new RuleBasedBreakIterator(*this); 
1497         if (clonedBI 
== 0) { 
1498             status 
= U_MEMORY_ALLOCATION_ERROR
; 
1500             status 
= U_SAFECLONE_ALLOCATED_WARNING
; 
1506     //  Clone the source BI into the caller-supplied buffer. 
1507     //    TODO:  using an overloaded operator new to directly initialize the 
1508     //           copy in the user's buffer would be better, but it doesn't seem 
1509     //           to get along with namespaces.  Investigate why. 
1511     //           The memcpy is only safe with an empty (default constructed) 
1512     //           break iterator.  Use on others can screw up reference counts 
1513     //           to data.  memcpy-ing objects is not really a good idea... 
1515     RuleBasedBreakIterator localIter
;        // Empty break iterator, source for memcpy 
1516     RuleBasedBreakIterator 
*clone 
= (RuleBasedBreakIterator 
*)buf
; 
1517     uprv_memcpy(clone
, &localIter
, sizeof(RuleBasedBreakIterator
)); // init C++ gorp, BreakIterator base class part 
1518     clone
->init();                // Init RuleBasedBreakIterator part, (user default constructor) 
1519     *clone 
= *this;               // clone = the real BI we want. 
1520     clone
->fBufferClone 
= TRUE
;   // Flag to prevent deleting storage on close (From C code) 
1526 //------------------------------------------------------------------------------- 
1528 //  isDictionaryChar      Return true if the category lookup for this char 
1529 //                        indicates that it is in the set of dictionary lookup 
1532 //                        This function is intended for use by dictionary based 
1535 //------------------------------------------------------------------------------- 
/*UBool RuleBasedBreakIterator::isDictionaryChar(UChar32   c) {
    if (fData == NULL) {
        return FALSE;
    }
    uint16_t category;
    UTRIE_GET16(&fData->fTrie, c, category);
    return (category & 0x4000) != 0;
}*/
1546 //------------------------------------------------------------------------------- 
1548 //  checkDictionary       This function handles all processing of characters in 
1549 //                        the "dictionary" set. It will determine the appropriate 
1550 //                        course of action, and possibly set up a cache in the 
1553 //------------------------------------------------------------------------------- 
1554 int32_t RuleBasedBreakIterator::checkDictionary(int32_t startPos
, 
1557     // Reset the old break cache first. 
1558     uint32_t dictionaryCount 
= fDictionaryCharCount
; 
1561     if (dictionaryCount 
<= 1 || (endPos 
- startPos
) <= 1) { 
1562         return (reverse 
? startPos 
: endPos
); 
1565     // Bug 5532.  The dictionary code will crash if the input text is UTF-8 
1566     //      because native indexes are different from UTF-16 indexes. 
1567     //      Temporary hack: skip dictionary lookup for UTF-8 encoded text. 
1568     //      It wont give the right breaks, but it's better than a crash. 
1570     //      Check the type of the UText by checking its pFuncs field, which 
1571     //      is UText's function dispatch table.  It will be the same for all 
1572     //      UTF-8 UTexts and different for any other UText type. 
1574     //      We have no other type of UText available with non-UTF-16 native indexing. 
1575     //      This whole check will go away once the dictionary code is fixed. 
1576     static const void *utext_utf8Funcs
; 
1577     if (utext_utf8Funcs 
== NULL
) { 
1578         // Cache the UTF-8 UText function pointer value. 
1579         UErrorCode status 
= U_ZERO_ERROR
; 
1580         UText tempUText 
= UTEXT_INITIALIZER
;  
1581         utext_openUTF8(&tempUText
, NULL
, 0, &status
); 
1582         utext_utf8Funcs 
= tempUText
.pFuncs
; 
1583         utext_close(&tempUText
); 
1585     if (fText
->pFuncs 
== utext_utf8Funcs
) { 
1586         return (reverse 
? startPos 
: endPos
); 
1589     // Starting from the starting point, scan towards the proposed result, 
1590     // looking for the first dictionary character (which may be the one 
1591     // we're on, if we're starting in the middle of a range). 
1592     utext_setNativeIndex(fText
, reverse 
? endPos 
: startPos
); 
1594         UTEXT_PREVIOUS32(fText
); 
1597     int32_t rangeStart 
= startPos
; 
1598     int32_t rangeEnd 
= endPos
; 
1602     UErrorCode  status 
= U_ZERO_ERROR
; 
1603     UStack      
breaks(status
); 
1604     int32_t     foundBreakCount 
= 0; 
1605     UChar32     c 
= utext_current32(fText
); 
1607     UTRIE_GET16(&fData
->fTrie
, c
, category
); 
1609     // Is the character we're starting on a dictionary character? If so, we 
1610     // need to back up to include the entire run; otherwise the results of 
1611     // the break algorithm will differ depending on where we start. Since 
1612     // the result is cached and there is typically a non-dictionary break 
1613     // within a small number of words, there should be little performance impact. 
1614     if (category 
& 0x4000) { 
1617                 utext_next32(fText
);          // TODO:  recast to work directly with postincrement. 
1618                 c 
= utext_current32(fText
); 
1619                 UTRIE_GET16(&fData
->fTrie
, c
, category
); 
1620             } while (c 
!= U_SENTINEL 
&& (category 
& 0x4000)); 
1621             // Back up to the last dictionary character 
1622             rangeEnd 
= (int32_t)UTEXT_GETNATIVEINDEX(fText
); 
1623             if (c 
== U_SENTINEL
) { 
1624                 // c = fText->last32(); 
1625                 //   TODO:  why was this if needed? 
1626                 c 
= UTEXT_PREVIOUS32(fText
); 
1629                 c 
= UTEXT_PREVIOUS32(fText
); 
1634                 c 
= UTEXT_PREVIOUS32(fText
); 
1635                 UTRIE_GET16(&fData
->fTrie
, c
, category
); 
1637             while (c 
!= U_SENTINEL 
&& (category 
& 0x4000)); 
1638             // Back up to the last dictionary character 
1639             if (c 
== U_SENTINEL
) { 
1640                 // c = fText->first32(); 
1641                 c 
= utext_current32(fText
); 
1644                 utext_next32(fText
); 
1645                 c 
= utext_current32(fText
); 
1647             rangeStart 
= (int32_t)UTEXT_GETNATIVEINDEX(fText
);; 
1649         UTRIE_GET16(&fData
->fTrie
, c
, category
); 
1652     // Loop through the text, looking for ranges of dictionary characters. 
1653     // For each span, find the appropriate break engine, and ask it to find 
1654     // any breaks within the span. 
1655     // Note: we always do this in the forward direction, so that the break 
1656     // cache is built in the right order. 
1658         utext_setNativeIndex(fText
, rangeStart
); 
1659         c 
= utext_current32(fText
); 
1660         UTRIE_GET16(&fData
->fTrie
, c
, category
); 
1662     while(U_SUCCESS(status
)) { 
1663         while((current 
= (int32_t)UTEXT_GETNATIVEINDEX(fText
)) < rangeEnd 
&& (category 
& 0x4000) == 0) { 
1664             utext_next32(fText
);           // TODO:  tweak for post-increment operation 
1665             c 
= utext_current32(fText
); 
1666             UTRIE_GET16(&fData
->fTrie
, c
, category
); 
1668         if (current 
>= rangeEnd
) { 
1672         // We now have a dictionary character. Get the appropriate language object 
1674         const LanguageBreakEngine 
*lbe 
= getLanguageBreakEngine(c
); 
1676         // Ask the language object if there are any breaks. It will leave the text 
1677         // pointer on the other side of its range, ready to search for the next one. 
1679             foundBreakCount 
+= lbe
->findBreaks(fText
, rangeStart
, rangeEnd
, FALSE
, fBreakType
, breaks
); 
1682         // Reload the loop variables for the next go-round 
1683         c 
= utext_current32(fText
); 
1684         UTRIE_GET16(&fData
->fTrie
, c
, category
); 
1687     // If we found breaks, build a new break cache. The first and last entries must 
1688     // be the original starting and ending position. 
1689     if (foundBreakCount 
> 0) { 
1690         int32_t totalBreaks 
= foundBreakCount
; 
1691         if (startPos 
< breaks
.elementAti(0)) { 
1694         if (endPos 
> breaks
.peeki()) { 
1697         fCachedBreakPositions 
= (int32_t *)uprv_malloc(totalBreaks 
* sizeof(int32_t)); 
1698         if (fCachedBreakPositions 
!= NULL
) { 
1700             fNumCachedBreakPositions 
= totalBreaks
; 
1701             if (startPos 
< breaks
.elementAti(0)) { 
1702                 fCachedBreakPositions
[out
++] = startPos
; 
1704             for (int32_t i 
= 0; i 
< foundBreakCount
; ++i
) { 
1705                 fCachedBreakPositions
[out
++] = breaks
.elementAti(i
); 
1707             if (endPos 
> fCachedBreakPositions
[out
-1]) { 
1708                 fCachedBreakPositions
[out
] = endPos
; 
1710             // If there are breaks, then by definition, we are replacing the original 
1711             // proposed break by one of the breaks we found. Use following() and 
1712             // preceding() to do the work. They should never recurse in this case. 
1714                 return preceding(endPos 
- 1); 
1717                 return following(startPos
); 
1720         // If the allocation failed, just fall through to the "no breaks found" case. 
1723     // If we get here, there were no language-based breaks. Set the text pointer 
1724     // to the original proposed break. 
1725     utext_setNativeIndex(fText
, reverse 
? startPos 
: endPos
); 
1726     return (reverse 
? startPos 
: endPos
); 
1731 // defined in ucln_cmn.h 
1733 static U_NAMESPACE_QUALIFIER UStack 
*gLanguageBreakFactories 
= NULL
; 
1736  * Release all static memory held by breakiterator.   
1739 static UBool U_CALLCONV 
breakiterator_cleanup_dict(void) { 
1740     if (gLanguageBreakFactories
) { 
1741         delete gLanguageBreakFactories
; 
1742         gLanguageBreakFactories 
= NULL
; 
1749 static void U_CALLCONV 
_deleteFactory(void *obj
) { 
1750     delete (U_NAMESPACE_QUALIFIER LanguageBreakFactory 
*) obj
; 
// Asks the registered language break factories for an engine that can handle
// character c for the given breakType.
//
// The factory stack gLanguageBreakFactories is built on demand: a UStack is
// created with _deleteFactory as its first constructor argument, an
// ICULanguageBreakFactory is pushed onto it, and, when U_LOCAL_SERVICE_HOOK
// is defined, a factory obtained from uprv_svc_hook("languageBreakFactory")
// is also pushed if it is non-NULL. The new stack is stored in the global only
// if the global is still NULL at that point, and breakiterator_cleanup_dict
// is then registered with ucln_common_registerCleanup().
//
// Each factory taken from the stack is queried with getEngineFor(c, breakType)
// and the result is kept in lbe, which starts out NULL.
//
// NOTE(review): several lines of this function are not visible in this
// listing (the needsInit declaration, whatever surrounds the publication of
// the stack, the loop header and the return statements) -- confirm the
// details against the full file.
// NOTE(review): builtIn is pushed right after `new` with no NULL check on the
// visible lines; confirm that a failed allocation cannot leave a NULL factory
// on the stack, since every element is later dereferenced.
1755 static const LanguageBreakEngine
* 
1756 getLanguageBreakEngineFromFactory(UChar32 c
, int32_t breakType
) 
1759     UErrorCode  status 
= U_ZERO_ERROR
; 
1760     UMTX_CHECK(NULL
, (UBool
)(gLanguageBreakFactories 
== NULL
), needsInit
); 
        // Build a candidate stack locally; it is only stored in the global below.
1763         UStack  
*factories 
= new UStack(_deleteFactory
, NULL
, status
); 
1764         if (factories 
!= NULL 
&& U_SUCCESS(status
)) { 
1765             ICULanguageBreakFactory 
*builtIn 
= new ICULanguageBreakFactory(status
); 
1766             factories
->push(builtIn
, status
); 
1767 #ifdef U_LOCAL_SERVICE_HOOK 
1768             LanguageBreakFactory 
*extra 
= (LanguageBreakFactory 
*)uprv_svc_hook("languageBreakFactory", &status
); 
1769             if (extra 
!= NULL
) { 
1770                 factories
->push(extra
, status
); 
        // Re-check the global before storing, so an already-installed stack is kept.
1775         if (gLanguageBreakFactories 
== NULL
) { 
1776             gLanguageBreakFactories 
= factories
; 
1778             ucln_common_registerCleanup(UCLN_COMMON_BREAKITERATOR_DICT
, breakiterator_cleanup_dict
); 
    // The global can still be NULL here if the initialization above did not install a stack.
1784     if (gLanguageBreakFactories 
== NULL
) { 
    // i starts at the element count; the loop header that steps it is not visible in this listing.
1788     int32_t i 
= gLanguageBreakFactories
->size(); 
1789     const LanguageBreakEngine 
*lbe 
= NULL
; 
1791         LanguageBreakFactory 
*factory 
= (LanguageBreakFactory 
*)(gLanguageBreakFactories
->elementAt(i
)); 
1792         lbe 
= factory
->getEngineFor(c
, breakType
); 
1801 //------------------------------------------------------------------------------- 
1803 //  getLanguageBreakEngine  Find an appropriate LanguageBreakEngine for the character c.
1806 //------------------------------------------------------------------------------- 
// Returns a LanguageBreakEngine for character c, using fBreakType as the
// break type in every query.
//
// Steps visible in this listing:
//   1. Create the engine stack fLanguageBreakEngines on first use; if the
//      allocation or construction fails, delete it and reset it to 0.
//   2. Ask the engines already on the stack whether they handle
//      (c, fBreakType).
//   3. Otherwise ask getLanguageBreakEngineFromFactory() for an engine and
//      push the result onto the stack.
//   4. Otherwise create fUnhandledBreakEngine if it does not exist yet,
//      insert it at index 0 of the stack, report c to it through
//      handleCharacter(), and return it.
//
// NOTE(review): the loop header, the test guarding step 3 and the early
// return statements are not visible in this listing -- confirm them against
// the full file.
1807 const LanguageBreakEngine 
* 
1808 RuleBasedBreakIterator::getLanguageBreakEngine(UChar32 c
) { 
1809     const LanguageBreakEngine 
*lbe 
= NULL
; 
1810     UErrorCode status 
= U_ZERO_ERROR
; 
    // Create the engine stack the first time it is needed.
1812     if (fLanguageBreakEngines 
== NULL
) { 
1813         fLanguageBreakEngines 
= new UStack(status
); 
1814         if (fLanguageBreakEngines 
== NULL 
|| U_FAILURE(status
)) { 
1815             delete fLanguageBreakEngines
; 
1816             fLanguageBreakEngines 
= 0; 
    // Check the engines already on the stack before consulting the factories.
1821     int32_t i 
= fLanguageBreakEngines
->size(); 
1823         lbe 
= (const LanguageBreakEngine 
*)(fLanguageBreakEngines
->elementAt(i
)); 
1824         if (lbe
->handles(c
, fBreakType
)) { 
1829     // No existing dictionary took the character. See if a factory wants to 
1830     // give us a new LanguageBreakEngine for this character. 
1831     lbe 
= getLanguageBreakEngineFromFactory(c
, fBreakType
); 
1833     // If we got one, use it and push it on our stack. 
1835         fLanguageBreakEngines
->push((void *)lbe
, status
); 
1836         // Even if we can't remember it, we can keep looking it up, so 
1837         // return it even if the push fails. 
1841     // No engine is forthcoming for this character. Add it to the 
1842     // reject set. Create the reject break engine if needed. 
1843     if (fUnhandledBreakEngine 
== NULL
) { 
1844         fUnhandledBreakEngine 
= new UnhandledEngine(status
); 
        // A NULL result with a still-successful status is recorded as an allocation failure.
1845         if (U_SUCCESS(status
) && fUnhandledBreakEngine 
== NULL
) { 
1846             status 
= U_MEMORY_ALLOCATION_ERROR
; 
1848         // Put it last so that scripts for which we have an engine get tried 
        // first.
1850         fLanguageBreakEngines
->insertElementAt(fUnhandledBreakEngine
, 0, status
); 
1851         // If we can't insert it, or creation failed, get rid of it 
1852         if (U_FAILURE(status
)) { 
1853             delete fUnhandledBreakEngine
; 
1854             fUnhandledBreakEngine 
= 0; 
1859     // Tell the reject engine about the character; at its discretion, it may 
1860     // add more than just the one character. 
1861     fUnhandledBreakEngine
->handleCharacter(c
, fBreakType
); 
1863     return fUnhandledBreakEngine
; 
1868 /*int32_t RuleBasedBreakIterator::getBreakType() const { 
1872 void RuleBasedBreakIterator::setBreakType(int32_t type
) { 
1879 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */