2 ***************************************************************************
3 * Copyright (C) 1999-2004 International Business Machines Corporation *
4 * and others. All rights reserved. *
5 ***************************************************************************
8 // file: rbbi.c Contains the implementation of the rule based break iterator
9 // runtime engine and the API implementation for
10 // class RuleBasedBreakIterator
13 #include "unicode/utypes.h"
15 #if !UCONFIG_NO_BREAK_ITERATION
17 #include "unicode/rbbi.h"
18 #include "unicode/schriter.h"
19 #include "unicode/udata.h"
20 #include "unicode/uclean.h"
31 static const int16_t START_STATE
= 1; // The state number of the starting state
32 static const int16_t STOP_STATE
= 0; // The state-transition value indicating "stop"
35 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedBreakIterator
)
38 //=======================================================================
40 //=======================================================================
43 * Constructs a RuleBasedBreakIterator that uses the already-created
44 * tables object that is passed in as a parameter.
46 RuleBasedBreakIterator::RuleBasedBreakIterator(RBBIDataHeader
* data
, UErrorCode
&status
)
49 fData
= new RBBIDataWrapper(data
, status
); // status checked in constructor
50 if (U_FAILURE(status
)) {return;}
52 status
= U_MEMORY_ALLOCATION_ERROR
;
57 //-------------------------------------------------------------------------------
59 // Constructor from a UDataMemory handle to precompiled break rules
60 // stored in an ICU data file.
62 //-------------------------------------------------------------------------------
63 RuleBasedBreakIterator::RuleBasedBreakIterator(UDataMemory
* udm
, UErrorCode
&status
)
66 fData
= new RBBIDataWrapper(udm
, status
); // status checked in constructor
67 if (U_FAILURE(status
)) {return;}
69 status
= U_MEMORY_ALLOCATION_ERROR
;
76 //-------------------------------------------------------------------------------
78 // Constructor from a set of rules supplied as a string.
80 //-------------------------------------------------------------------------------
81 RuleBasedBreakIterator::RuleBasedBreakIterator( const UnicodeString
&rules
,
82 UParseError
&parseError
,
85 u_init(&status
); // Just in case ICU is not yet initialized
87 if (U_FAILURE(status
)) {return;}
88 RuleBasedBreakIterator
*bi
= (RuleBasedBreakIterator
*)
89 RBBIRuleBuilder::createRuleBasedBreakIterator(rules
, parseError
, status
);
90 // Note: This is a bit awkward. The RBBI ruleBuilder has a factory method that
91 // creates and returns a complete RBBI. From here, in a constructor, we
92 // can't just return the object created by the builder factory, hence
93 // the assignment of the factory created object to "this".
94 if (U_SUCCESS(status
)) {
101 //-------------------------------------------------------------------------------
103 // Default Constructor. Create an empty shell that can be set up later.
104 // Used when creating a RuleBasedBreakIterator from a set
106 //-------------------------------------------------------------------------------
107 RuleBasedBreakIterator::RuleBasedBreakIterator() {
112 //-------------------------------------------------------------------------------
114 // Copy constructor. Will produce a break iterator with the same behavior,
115 // and which iterates over the same text, as the one passed in.
117 //-------------------------------------------------------------------------------
118 RuleBasedBreakIterator::RuleBasedBreakIterator(const RuleBasedBreakIterator
& other
)
119 : BreakIterator(other
)
129 RuleBasedBreakIterator::~RuleBasedBreakIterator() {
133 fData
->removeReference();
139 * Assignment operator. Sets this iterator to have the same behavior,
140 * and iterate over the same text, as the one passed in.
142 RuleBasedBreakIterator
&
143 RuleBasedBreakIterator::operator=(const RuleBasedBreakIterator
& that
) {
149 if (that
.fText
!= NULL
) {
150 fText
= that
.fText
->clone();
154 fData
->removeReference();
157 if (that
.fData
!= NULL
) {
158 fData
= that
.fData
->addReference();
160 fTrace
= that
.fTrace
;
167 //-----------------------------------------------------------------------------
169 // init() Shared initialization routine. Used by all the constructors.
170 // Initializes all fields, leaving the object in a consistent state.
172 //-----------------------------------------------------------------------------
173 UBool
RuleBasedBreakIterator::fTrace
= FALSE
;
174 void RuleBasedBreakIterator::init() {
178 fLastRuleStatusIndex
= 0;
179 fLastStatusIndexValid
= TRUE
;
180 fDictionaryCharCount
= 0;
183 static UBool debugInitDone
= FALSE
;
184 if (debugInitDone
== FALSE
) {
185 char *debugEnv
= getenv("U_RBBIDEBUG");
186 if (debugEnv
&& uprv_strstr(debugEnv
, "trace")) {
189 debugInitDone
= TRUE
;
196 //-----------------------------------------------------------------------------
198 // clone - Returns a newly-constructed RuleBasedBreakIterator with the same
199 // behavior, and iterating over the same text, as this one.
200 // Virtual function: does the right thing with subclasses.
202 //-----------------------------------------------------------------------------
204 RuleBasedBreakIterator::clone(void) const {
205 return new RuleBasedBreakIterator(*this);
209 * Equality operator. Returns TRUE if both BreakIterators are of the
210 * same class, have the same behavior, and iterate over the same text.
213 RuleBasedBreakIterator::operator==(const BreakIterator
& that
) const {
215 if (that
.getDynamicClassID() != getDynamicClassID()) {
219 const RuleBasedBreakIterator
& that2
= (const RuleBasedBreakIterator
&) that
;
220 if (fText
== that2
.fText
||
221 (fText
!= NULL
&& that2
.fText
!= NULL
&& *that2
.fText
== *fText
)) {
222 if (that2
.fData
== fData
||
223 (fData
!= NULL
&& that2
.fData
!= NULL
&& *that2
.fData
== *fData
)) {
231 * Compute a hash code for this BreakIterator
232 * @return A hash code
235 RuleBasedBreakIterator::hashCode(void) const {
238 hash
= fData
->hashCode();
244 * Returns the description used to create this iterator
247 RuleBasedBreakIterator::getRules() const {
249 return fData
->getRuleSourceString();
251 static const UnicodeString
*s
;
253 // TODO: something more elegant here.
254 // perhaps API should return the string by value.
255 // Note: thread unsafe init & leak are semi-ok, better than
256 // what was before. Sould be cleaned up, though.
257 s
= new UnicodeString
;
263 //=======================================================================
264 // BreakIterator overrides
265 //=======================================================================
268 * Return a CharacterIterator over the text being analyzed. This version
269 * of this method returns the actual CharacterIterator we're using internally.
270 * Changing the state of this iterator can have undefined consequences. If
271 * you need to change it, clone it first.
272 * @return An iterator over the text being analyzed.
274 const CharacterIterator
&
275 RuleBasedBreakIterator::getText() const {
276 RuleBasedBreakIterator
* nonConstThis
= (RuleBasedBreakIterator
*)this;
278 // The iterator is initialized pointing to no text at all, so if this
279 // function is called while we're in that state, we have to fudge an
280 // an iterator to return.
281 if (nonConstThis
->fText
== NULL
) {
282 nonConstThis
->fText
= new StringCharacterIterator(UnicodeString());
284 return *nonConstThis
->fText
;
288 * Set the iterator to analyze a new piece of text. This function resets
289 * the current iteration position to the beginning of the text.
290 * @param newText An iterator over the text to analyze.
293 RuleBasedBreakIterator::adoptText(CharacterIterator
* newText
) {
301 * Set the iterator to analyze a new piece of text. This function resets
302 * the current iteration position to the beginning of the text.
303 * @param newText An iterator over the text to analyze.
306 RuleBasedBreakIterator::setText(const UnicodeString
& newText
) {
308 if (fText
!= NULL
&& fText
->getDynamicClassID()
309 == StringCharacterIterator::getStaticClassID()) {
310 ((StringCharacterIterator
*)fText
)->setText(newText
);
314 fText
= new StringCharacterIterator(newText
);
322 * Sets the current iteration position to the beginning of the text.
323 * (i.e., the CharacterIterator's starting offset).
324 * @return The offset of the beginning of the text.
326 int32_t RuleBasedBreakIterator::first(void) {
328 fLastRuleStatusIndex
= 0;
329 fLastStatusIndexValid
= TRUE
;
331 return BreakIterator::DONE
;
334 return fText
->getIndex();
338 * Sets the current iteration position to the end of the text.
339 * (i.e., the CharacterIterator's ending offset).
340 * @return The text's past-the-end offset.
342 int32_t RuleBasedBreakIterator::last(void) {
345 fLastRuleStatusIndex
= 0;
346 fLastStatusIndexValid
= TRUE
;
347 return BreakIterator::DONE
;
350 // I'm not sure why, but t.last() returns the offset of the last character,
351 // rather than the past-the-end offset
353 // (It's so a loop like for(p=it.last(); p!=DONE; p=it.previous()) ...
354 // will work correctly.)
357 fLastStatusIndexValid
= FALSE
;
358 int32_t pos
= fText
->endIndex();
359 fText
->setIndex(pos
);
365 * Advances the iterator either forward or backward the specified number of steps.
366 * Negative values move backward, and positive values move forward. This is
367 * equivalent to repeatedly calling next() or previous().
368 * @param n The number of steps to move. The sign indicates the direction
369 * (negative is backwards, and positive is forwards).
370 * @return The character offset of the boundary position n boundaries away from
373 int32_t RuleBasedBreakIterator::next(int32_t n
) {
374 int32_t result
= current();
376 result
= handleNext();
387 * Advances the iterator to the next boundary position.
388 * @return The position of the first boundary after this one.
390 int32_t RuleBasedBreakIterator::next(void) {
395 * Advances the iterator backwards, to the last boundary preceding this one.
396 * @return The position of the last boundary position preceding this one.
398 int32_t RuleBasedBreakIterator::previous(void) {
399 // if we're already sitting at the beginning of the text, return DONE
400 if (fText
== NULL
|| current() == fText
->startIndex()) {
401 fLastRuleStatusIndex
= 0;
402 fLastStatusIndexValid
= TRUE
;
403 return BreakIterator::DONE
;
406 if (fData
->fSafeRevTable
!= NULL
|| fData
->fSafeFwdTable
!= NULL
) {
407 return handlePrevious(fData
->fReverseTable
);
411 // set things up. handlePrevious() will back us up to some valid
412 // break position before the current position (we back our internal
413 // iterator up one step to prevent handlePrevious() from returning
414 // the current position), but not necessarily the last one before
417 int32_t start
= current();
420 int32_t lastResult
= handlePrevious();
421 int32_t result
= lastResult
;
423 UBool breakTagValid
= FALSE
;
425 // iterate forward from the known break position until we pass our
426 // starting point. The last break position before the starting
427 // point is our return value
430 result
= handleNext();
431 if (result
== BreakIterator::DONE
|| result
>= start
) {
435 lastTag
= fLastRuleStatusIndex
;
436 breakTagValid
= TRUE
;
439 // fLastBreakTag wants to have the value for section of text preceding
440 // the result position that we are to return (in lastResult.) If
441 // the backwards rules overshot and the above loop had to do two or more
442 // handleNext()s to move up to the desired return position, we will have a valid
443 // tag value. But, if handlePrevious() took us to exactly the correct result positon,
444 // we wont have a tag value for that position, which is only set by handleNext().
446 // set the current iteration position to be the last break position
447 // before where we started, and then return that value
448 fText
->setIndex(lastResult
);
449 fLastRuleStatusIndex
= lastTag
; // for use by getRuleStatus()
450 fLastStatusIndexValid
= breakTagValid
;
455 * Sets the iterator to refer to the first boundary position following
456 * the specified position.
457 * @offset The position from which to begin searching for a break position.
458 * @return The position of the first break after the current position.
460 int32_t RuleBasedBreakIterator::following(int32_t offset
) {
461 // if the offset passed in is already past the end of the text,
462 // just return DONE; if it's before the beginning, return the
463 // text's starting offset
464 fLastRuleStatusIndex
= 0;
465 fLastStatusIndexValid
= TRUE
;
466 if (fText
== NULL
|| offset
>= fText
->endIndex()) {
470 else if (offset
< fText
->startIndex()) {
474 // otherwise, set our internal iteration position (temporarily)
475 // to the position passed in. If this is the _beginning_ position,
476 // then we can just use next() to get our return value
480 if (fData
->fSafeRevTable
!= NULL
) {
483 fText
->setIndex(offset
);
484 // move forward one codepoint to prepare for moving back to a
486 // this handles offset being between a supplementary character
488 // handlePrevious will move most of the time to < 1 boundary away
489 handlePrevious(fData
->fSafeRevTable
);
490 int32_t result
= next();
491 while (result
<= offset
) {
496 if (fData
->fSafeFwdTable
!= NULL
) {
497 // backup plan if forward safe table is not available
498 fText
->setIndex(offset
);
500 // handle next will give result >= offset
501 handleNext(fData
->fSafeFwdTable
);
502 // previous will give result 0 or 1 boundary away from offset,
505 int32_t oldresult
= previous();
506 while (oldresult
> offset
) {
507 int32_t result
= previous();
508 if (result
<= offset
) {
513 int32_t result
= next();
514 if (result
<= offset
) {
519 // otherwise, we have to sync up first. Use handlePrevious() to back
520 // us up to a known break position before the specified position (if
521 // we can determine that the specified position is a break position,
522 // we don't back up at all). This may or may not be the last break
523 // position at or before our starting position. Advance forward
524 // from here until we've passed the starting position. The position
525 // we stop on will be the first break position after the specified one.
528 fText
->setIndex(offset
);
529 if (offset
== fText
->startIndex()) {
534 while (result
!= BreakIterator::DONE
&& result
<= offset
) {
542 * Sets the iterator to refer to the last boundary position before the
543 * specified position.
544 * @offset The position to begin searching for a break from.
545 * @return The position of the last boundary before the starting position.
547 int32_t RuleBasedBreakIterator::preceding(int32_t offset
) {
548 // if the offset passed in is already past the end of the text,
549 // just return DONE; if it's before the beginning, return the
551 // text's starting offset
552 if (fText
== NULL
|| offset
> fText
->endIndex()) {
553 // return BreakIterator::DONE;
556 else if (offset
< fText
->startIndex()) {
560 // if we start by updating the current iteration position to the
561 // position specified by the caller, we can just use previous()
562 // to carry out this operation
564 if (fData
->fSafeFwdTable
!= NULL
) {
567 fText
->setIndex(offset
);
568 // move backwards one codepoint to prepare for moving forwards to a
570 // this handles offset being between a supplementary character
571 // TODO: would it be better to just check for being in the middle of a surrogate pair,
572 // rather than adjusting the position unconditionally?
573 // (Change would interact with safe rules.)
575 handleNext(fData
->fSafeFwdTable
);
576 int32_t result
= fText
->getIndex();
577 while (result
>= offset
) {
582 if (fData
->fSafeRevTable
!= NULL
) {
583 // backup plan if forward safe table is not available
584 fText
->setIndex(offset
);
586 // handle previous will give result <= offset
587 handlePrevious(fData
->fSafeRevTable
);
589 // next will give result 0 or 1 boundary away from offset,
592 int32_t oldresult
= next();
593 while (oldresult
< offset
) {
594 int32_t result
= next();
595 if (result
>= offset
) {
600 int32_t result
= previous();
601 if (result
>= offset
) {
608 fText
->setIndex(offset
);
613 * Returns true if the specfied position is a boundary position. As a side
614 * effect, leaves the iterator pointing to the first boundary position at
616 * @param offset the offset to check.
617 * @return True if "offset" is a boundary position.
619 UBool
RuleBasedBreakIterator::isBoundary(int32_t offset
) {
620 // the beginning index of the iterator is always a boundary position by definition
621 if (fText
== NULL
|| offset
== fText
->startIndex()) {
622 first(); // For side effects on current position, tag values.
626 if (offset
== fText
->endIndex()) {
627 last(); // For side effects on current position, tag values.
631 // out-of-range indexes are never boundary positions
632 if (offset
< fText
->startIndex()) {
633 first(); // For side effects on current position, tag values.
637 if (offset
> fText
->endIndex()) {
638 last(); // For side effects on current position, tag values.
642 // otherwise, we can use following() on the position before the specified
643 // one and return true if the position we get back is the one the user
645 return following(offset
- 1) == offset
;
649 * Returns the current iteration position.
650 * @return The current iteration position.
652 int32_t RuleBasedBreakIterator::current(void) const {
653 return (fText
!= NULL
) ? fText
->getIndex() : BreakIterator::DONE
;
656 //=======================================================================
658 //=======================================================================
661 //-----------------------------------------------------------------------------------
664 // This method is the actual implementation of the next() method. All iteration
665 // vectors through here. This method initializes the state machine to state 1
666 // and advances through the text character by character until we reach the end
667 // of the text or the state machine transitions to state 0. We update our return
668 // value every time the state machine passes through an accepting state.
670 //-----------------------------------------------------------------------------------
671 int32_t RuleBasedBreakIterator::handleNext() {
672 return handleNext(fData
->fForwardTable
);
675 int32_t RuleBasedBreakIterator::handleNext(const RBBIStateTable
*statetable
) {
677 RBBIDebugPuts("Handle Next pos char state category");
680 // No matter what, handleNext alway correctly sets the break tag value.
681 fLastStatusIndexValid
= TRUE
;
683 // if we're already at the end of the text, return DONE.
684 if (fText
== NULL
|| fData
== NULL
|| fText
->hasNext() == FALSE
) {
685 fLastRuleStatusIndex
= 0;
686 return BreakIterator::DONE
;
689 int32_t initialPosition
= fText
->getIndex();
690 int32_t result
= initialPosition
;
691 int32_t lookaheadResult
= 0;
693 // Initialize the state machine. Begin in state 1
694 int32_t state
= START_STATE
;
696 UChar32 c
= fText
->current32();
697 RBBIStateTableRow
*row
;
698 int32_t lookaheadStatus
= 0;
699 int32_t lookaheadTagIdx
= 0;
701 fLastRuleStatusIndex
= 0;
703 row
= (RBBIStateTableRow
*) // Point to starting row of state table.
704 (statetable
->fTableData
+ (statetable
->fRowLen
* state
));
706 // Character Category fetch for starting character.
707 // See comments on character category code within loop, below.
708 UTRIE_GET16(&fData
->fTrie
, c
, category
);
709 if ((category
& 0x4000) != 0) {
710 fDictionaryCharCount
++;
714 // loop until we reach the end of the text or transition to state 0
716 if (c
== CharacterIterator::DONE
&& fText
->hasNext()==FALSE
) {
717 // Reached end of input string.
718 // Note: CharacterIterator::DONE is 0xffff, which is also a legal
719 // character value. Check for DONE first, because it's quicker,
720 // but also need to check fText->hasNext() to be certain.
722 if (lookaheadResult
> result
) {
723 // We ran off the end of the string with a pending look-ahead match.
724 // Treat this as if the look-ahead condition had been met, and return
725 // the match at the / position from the look-ahead rule.
726 result
= lookaheadResult
;
727 fLastRuleStatusIndex
= lookaheadTagIdx
;
729 } else if (result
== initialPosition
) {
730 // Ran off end, no match found.
732 fText
->setIndex(initialPosition
);
738 // look up the current character's character category, which tells us
739 // which column in the state table to look at.
740 // Note: the 16 in UTRIE_GET16 refers to the size of the data being returned,
741 // not the size of the character going in, which is a UChar32.
743 UTRIE_GET16(&fData
->fTrie
, c
, category
);
745 // Check the dictionary bit in the character's category.
746 // Counter is only used by dictionary based iterators (subclasses).
747 // Chars that need to be handled by a dictionary have a flag bit set
748 // in their category values.
750 if ((category
& 0x4000) != 0) {
751 fDictionaryCharCount
++;
752 // And off the dictionary flag bit.
758 RBBIDebugPrintf(" %4d ", fText
->getIndex());
759 if (0x20<=c
&& c
<0x7f) {
760 RBBIDebugPrintf("\"%c\" ", c
);
762 RBBIDebugPrintf("%5x ", c
);
764 RBBIDebugPrintf("%3d %3d\n", state
, category
);
768 // look up a state transition in the state table
769 state
= row
->fNextState
[category
];
770 row
= (RBBIStateTableRow
*)
771 (statetable
->fTableData
+ (statetable
->fRowLen
* state
));
773 // Get the next character. Doing it here positions the iterator
774 // to the correct position for recording matches in the code that
778 if (row
->fAccepting
== -1) {
779 // Match found, common case, could have lookahead so we move on to check it
780 result
= fText
->getIndex();
782 fLastRuleStatusIndex
= row
->fTagIdx
; // Remember the break status (tag) values.
785 if (row
->fLookAhead
!= 0) {
786 if (lookaheadStatus
!= 0
787 && row
->fAccepting
== lookaheadStatus
) {
788 // Lookahead match is completed. Set the result accordingly, but only
789 // if no other rule has matched further in the mean time.
790 result
= lookaheadResult
;
791 fLastRuleStatusIndex
= lookaheadTagIdx
;
793 /// i think we have to back up to read the lookahead character again
794 /// fText->setIndex(lookaheadResult);
795 /// TODO: this is a simple hack since reverse rules only have simple
796 /// lookahead rules that we can definitely break out from.
797 /// we need to make the lookahead rules not chain eventually.
799 /// this is going to be the longest match again
803 int32_t r
= fText
->getIndex();
805 lookaheadStatus
= row
->fLookAhead
;
806 lookaheadTagIdx
= row
->fTagIdx
;
811 if (row
->fAccepting
== 0) {
812 // No match, nothing of interest happening, common case.
816 lookaheadStatus
= 0; // clear out any pending look-ahead matches.
819 if (state
== STOP_STATE
) {
820 // This is the normal exit from the lookup state machine.
821 // We have advanced through the string until it is certain that no
822 // longer match is possible, no matter what characters follow.
827 // The state machine is done. Check whether it found a match...
829 // If the iterator failed to advance in the match engine, force it ahead by one.
830 // (This really indicates a defect in the break rules. They should always match
831 // at least one character.)
832 if (result
== initialPosition
) {
833 result
= fText
->setIndex(initialPosition
);
835 result
= fText
->getIndex();
838 // Leave the iterator at our result position.
839 fText
->setIndex(result
);
841 RBBIDebugPrintf("result = %d\n\n", result
);
847 //----------------------------------------------------------------
849 // handlePrevious(void) This is the variant used with old style rules
850 // (Overshoot to a safe point, then move forward)
852 //----------------------------------------------------------------
853 int32_t RuleBasedBreakIterator::handlePrevious(void) {
854 if (fText
== NULL
|| fData
== NULL
) {
857 if (fData
->fReverseTable
== NULL
) {
858 return fText
->setToStart();
861 int32_t state
= START_STATE
;
863 int32_t lastCategory
= 0;
864 int32_t result
= fText
->getIndex();
865 int32_t lookaheadStatus
= 0;
866 int32_t lookaheadResult
= 0;
867 int32_t lookaheadTagIdx
= 0;
868 UChar32 c
= fText
->current32();
869 RBBIStateTableRow
*row
;
871 row
= (RBBIStateTableRow
*)
872 (this->fData
->fReverseTable
->fTableData
+ (state
* fData
->fReverseTable
->fRowLen
));
873 UTRIE_GET16(&fData
->fTrie
, c
, category
);
874 if ((category
& 0x4000) != 0) {
875 fDictionaryCharCount
++;
880 RBBIDebugPuts("Handle Prev pos char state category");
883 // loop until we reach the beginning of the text or transition to state 0
885 if (c
== CharacterIterator::DONE
&& fText
->hasPrevious()==FALSE
) {
889 // save the last character's category and look up the current
890 // character's category
891 lastCategory
= category
;
892 UTRIE_GET16(&fData
->fTrie
, c
, category
);
894 // Check the dictionary bit in the character's category.
895 // Counter is only used by dictionary based iterators.
897 if ((category
& 0x4000) != 0) {
898 fDictionaryCharCount
++;
904 RBBIDebugPrintf(" %4d ", fText
->getIndex());
905 if (0x20<=c
&& c
<0x7f) {
906 RBBIDebugPrintf("\"%c\" ", c
);
908 RBBIDebugPrintf("%5x ", c
);
910 RBBIDebugPrintf("%3d %3d\n", state
, category
);
914 // look up a state transition in the backwards state table
915 state
= row
->fNextState
[category
];
916 row
= (RBBIStateTableRow
*)
917 (this->fData
->fReverseTable
->fTableData
+ (state
* fData
->fReverseTable
->fRowLen
));
919 if (row
->fAccepting
== 0 && row
->fLookAhead
== 0) {
920 // No match, nothing of interest happening, common case.
924 if (row
->fAccepting
== -1) {
925 // Match found, common case, no lookahead involved.
926 result
= fText
->getIndex();
927 lookaheadStatus
= 0; // clear out any pending look-ahead matches.
931 if (row
->fAccepting
== 0 && row
->fLookAhead
!= 0) {
932 // Lookahead match point. Remember it, but only if no other rule
933 // has unconditionally matched to this point.
934 // TODO: handle case where there's a pending match from a different rule
935 // where lookaheadStatus != 0 && lookaheadStatus != row->fLookAhead.
936 int32_t r
= fText
->getIndex();
939 lookaheadStatus
= row
->fLookAhead
;
940 lookaheadTagIdx
= row
->fTagIdx
;
945 if (row
->fAccepting
!= 0 && row
->fLookAhead
!= 0) {
946 // Lookahead match is completed. Set the result accordingly, but only
947 // if no other rule has matched further in the mean time.
948 if (lookaheadResult
> result
) {
949 U_ASSERT(row
->fAccepting
== lookaheadStatus
); // TODO: handle this case
950 // of overlapping lookahead matches.
951 result
= lookaheadResult
;
952 fLastRuleStatusIndex
= lookaheadTagIdx
;
959 if (state
== STOP_STATE
) {
963 // then advance one character backwards
964 c
= fText
->previous32();
967 // Note: the result postion isn't what is returned to the user by previous(),
968 // but where the implementation of previous() turns around and
969 // starts iterating forward again.
970 if (c
== CharacterIterator::DONE
&& fText
->hasPrevious()==FALSE
) {
971 result
= fText
->startIndex();
973 fText
->setIndex(result
);
979 //-----------------------------------------------------------------------------------
983 // This method backs the iterator back up to a "safe position" in the text.
984 // This is a position that we know, without any context, may be any position
985 // not more than 2 breaks away. Occasionally, the position may be less than
987 // The various calling methods then iterate forward from this safe position to
988 // the appropriate position to return.
990 // The logic of this function is very similar to handleNext(), above.
992 //-----------------------------------------------------------------------------------
993 int32_t RuleBasedBreakIterator::handlePrevious(const RBBIStateTable
*statetable
) {
994 if (fText
== NULL
|| statetable
== NULL
) {
997 // break tag is no longer valid after icu switched to exact backwards
999 fLastStatusIndexValid
= FALSE
;
1000 if (statetable
== NULL
) {
1001 return fText
->setToStart();
1004 int32_t state
= START_STATE
;
1006 int32_t lastCategory
= 0;
1007 UBool hasPassedStartText
= !fText
->hasPrevious();
1008 UChar32 c
= fText
->previous32();
1009 // previous character
1010 int32_t result
= fText
->getIndex();
1011 int32_t lookaheadStatus
= 0;
1012 int32_t lookaheadResult
= 0;
1013 int32_t lookaheadTagIdx
= 0;
1014 UBool lookAheadHardBreak
= (statetable
->fFlags
& RBBI_LOOKAHEAD_HARD_BREAK
) != 0;
1016 RBBIStateTableRow
*row
;
1018 row
= (RBBIStateTableRow
*)
1019 (statetable
->fTableData
+ (state
* statetable
->fRowLen
));
1020 UTRIE_GET16(&fData
->fTrie
, c
, category
);
1021 if ((category
& 0x4000) != 0) {
1022 fDictionaryCharCount
++;
1023 category
&= ~0x4000;
1027 RBBIDebugPuts("Handle Prev pos char state category");
1030 // loop until we reach the beginning of the text or transition to state 0
1032 // if (c == CharacterIterator::DONE && fText->hasPrevious()==FALSE) {
1033 if (hasPassedStartText
) {
1034 // if we have already considered the start of the text
1035 if (row
->fLookAhead
!= 0 && lookaheadResult
== 0) {
1041 // save the last character's category and look up the current
1042 // character's category
1043 lastCategory
= category
;
1044 UTRIE_GET16(&fData
->fTrie
, c
, category
);
1046 // Check the dictionary bit in the character's category.
1047 // Counter is only used by dictionary based iterators.
1049 if ((category
& 0x4000) != 0) {
1050 fDictionaryCharCount
++;
1051 category
&= ~0x4000;
1056 RBBIDebugPrintf(" %4d ", fText
->getIndex());
1057 if (0x20<=c
&& c
<0x7f) {
1058 RBBIDebugPrintf("\"%c\" ", c
);
1060 RBBIDebugPrintf("%5x ", c
);
1062 RBBIDebugPrintf("%3d %3d\n", state
, category
);
1066 // look up a state transition in the backwards state table
1067 state
= row
->fNextState
[category
];
1068 row
= (RBBIStateTableRow
*)
1069 (statetable
->fTableData
+ (state
* statetable
->fRowLen
));
1071 if (row
->fAccepting
== -1) {
1072 // Match found, common case, could have lookahead so we move on to check it
1073 result
= fText
->getIndex();
1075 fLastRuleStatusIndex
= row
->fTagIdx
; // Remember the break status (tag) value.
1078 if (row
->fLookAhead
!= 0) {
1079 if (lookaheadStatus
!= 0
1080 && row
->fAccepting
== lookaheadStatus
) {
1081 // Lookahead match is completed. Set the result accordingly, but only
1082 // if no other rule has matched further in the mean time.
1083 result
= lookaheadResult
;
1084 fLastRuleStatusIndex
= lookaheadTagIdx
;
1085 lookaheadStatus
= 0;
1086 /// i think we have to back up to read the lookahead character again
1087 /// fText->setIndex(lookaheadResult);
1088 /// TODO: this is a simple hack since reverse rules only have simple
1089 /// lookahead rules that we can definitely break out from.
1090 /// we need to make the lookahead rules not chain eventually.
1092 /// this is going to be the longest match again
1094 /// syn wee todo hard coded for line breaks stuff
1095 /// needs to provide a tag in rules to ensure a stop.
1097 if (lookAheadHardBreak
) {
1098 fText
->setIndex(result
);
1101 category
= lastCategory
;
1102 fText
->setIndex(result
);
1107 int32_t r
= fText
->getIndex();
1108 lookaheadResult
= r
;
1109 lookaheadStatus
= row
->fLookAhead
;
1110 fLastRuleStatusIndex
= row
->fTagIdx
;
1115 if (row
->fAccepting
== 0) {
1116 // No match, nothing of interest happening, common case.
1120 lookaheadStatus
= 0; // clear out any pending look-ahead matches.
1123 if (state
== STOP_STATE
) {
1127 // then advance one character backwards
1128 hasPassedStartText
= !fText
->hasPrevious();
1129 c
= fText
->previous32();
1132 // Note: the result postion isn't what is returned to the user by previous(),
1133 // but where the implementation of previous() turns around and
1134 // starts iterating forward again.
1135 fText
->setIndex(result
);
1142 RuleBasedBreakIterator::reset()
1144 // Base-class version of this function is a no-op.
1145 // Subclasses may override with their own reset behavior.
1150 //-------------------------------------------------------------------------------
1152 // getRuleStatus() Return the break rule tag associated with the current
1153 // iterator position. If the iterator arrived at its current
1154 // position by iterating forwards, the value will have been
1155 // cached by the handleNext() function.
1157 // If no cached status value is available, the status is
1158 // found by doing a previous() followed by a next(), which
1159 // leaves the iterator where it started, and computes the
1160 // status while doing the next().
1162 //-------------------------------------------------------------------------------
1163 void RuleBasedBreakIterator::makeRuleStatusValid() {
1164 if (fLastStatusIndexValid
== FALSE
) {
1165 // No cached status is available.
1166 if (fText
== NULL
|| current() == fText
->startIndex()) {
1167 // At start of text, or there is no text. Status is always zero.
1168 fLastRuleStatusIndex
= 0;
1169 fLastStatusIndexValid
= TRUE
;
1171 // Not at start of text. Find status the tedious way.
1172 int32_t pa
= current();
1174 int32_t pb
= next();
1176 // note: the if (pa != pb) test is here only to eliminate warnings for
1177 // unused local variables on gcc. Logically, it isn't needed.
1182 U_ASSERT(fLastStatusIndexValid
== TRUE
);
1183 U_ASSERT(fLastRuleStatusIndex
>= 0 && fLastRuleStatusIndex
< fData
->fStatusMaxIdx
);
1187 int32_t RuleBasedBreakIterator::getRuleStatus() const {
1188 RuleBasedBreakIterator
*nonConstThis
= (RuleBasedBreakIterator
*)this;
1189 nonConstThis
->makeRuleStatusValid();
1191 // fLastRuleStatusIndex indexes to the start of the appropriate status record
1192 // (the number of status values.)
1193 // This function returns the last (largest) of the array of status values.
1194 int32_t idx
= fLastRuleStatusIndex
+ fData
->fRuleStatusTable
[fLastRuleStatusIndex
];
1195 int32_t tagVal
= fData
->fRuleStatusTable
[idx
];
1203 int32_t RuleBasedBreakIterator::getRuleStatusVec(
1204 int32_t *fillInVec
, int32_t capacity
, UErrorCode
&status
)
1206 if (U_FAILURE(status
)) {
1210 RuleBasedBreakIterator
*nonConstThis
= (RuleBasedBreakIterator
*)this;
1211 nonConstThis
->makeRuleStatusValid();
1212 int32_t numVals
= fData
->fRuleStatusTable
[fLastRuleStatusIndex
];
1213 int32_t numValsToCopy
= numVals
;
1214 if (numVals
> capacity
) {
1215 status
= U_BUFFER_OVERFLOW_ERROR
;
1216 numValsToCopy
= capacity
;
1219 for (i
=0; i
<numValsToCopy
; i
++) {
1220 fillInVec
[i
] = fData
->fRuleStatusTable
[fLastRuleStatusIndex
+ i
+ 1];
1227 //-------------------------------------------------------------------------------
1229 // getBinaryRules Access to the compiled form of the rules,
1230 // for use by build system tools that save the data
1231 // for standard iterator types.
1233 //-------------------------------------------------------------------------------
1234 const uint8_t *RuleBasedBreakIterator::getBinaryRules(uint32_t &length
) {
1235 const uint8_t *retPtr
= NULL
;
1238 if (fData
!= NULL
) {
1239 retPtr
= (const uint8_t *)fData
->fHeader
;
1240 length
= fData
->fHeader
->fLength
;
1248 //-------------------------------------------------------------------------------
1250 // BufferClone TODO: In my (Andy) opinion, this function should be deprecated.
1251 // Saving one heap allocation isn't worth the trouble.
1252 // Cloning shouldn't be done in tight loops, and
1253 // making the clone copy involves other heap operations anyway.
1254 // And the application code for correctly dealing with buffer
1255 // size problems and the eventual object destruction is ugly.
1257 //-------------------------------------------------------------------------------
1258 BreakIterator
* RuleBasedBreakIterator::createBufferClone(void *stackBuffer
,
1259 int32_t &bufferSize
,
1262 if (U_FAILURE(status
)){
1267 // If user buffer size is zero this is a preflight operation to
1268 // obtain the needed buffer size, allowing for worst case misalignment.
1270 if (bufferSize
== 0) {
1271 bufferSize
= sizeof(RuleBasedBreakIterator
) + U_ALIGNMENT_OFFSET_UP(0);
1277 // Check the alignment and size of the user supplied buffer.
1278 // Allocate heap memory if the user supplied memory is insufficient.
1280 char *buf
= (char *)stackBuffer
;
1281 uint32_t s
= bufferSize
;
1283 if (stackBuffer
== NULL
) {
1284 s
= 0; // Ignore size, force allocation if user didn't give us a buffer.
1286 if (U_ALIGNMENT_OFFSET(stackBuffer
) != 0) {
1287 uint32_t offsetUp
= (uint32_t)U_ALIGNMENT_OFFSET_UP(buf
);
1291 if (s
< sizeof(RuleBasedBreakIterator
)) {
1292 buf
= (char *) new RuleBasedBreakIterator
;
1294 status
= U_MEMORY_ALLOCATION_ERROR
;
1297 status
= U_SAFECLONE_ALLOCATED_WARNING
;
1301 // Clone the object.
1302 // TODO: using an overloaded operator new to directly initialize the
1303 // copy in the user's buffer would be better, but it doesn't seem
1304 // to get along with namespaces. Investigate why.
1306 // The memcpy is only safe with an empty (default constructed)
1307 // break iterator. Use on others can screw up reference counts
1308 // to data. memcpy-ing objects is not really a good idea...
1310 RuleBasedBreakIterator localIter
; // Empty break iterator, source for memcpy
1311 RuleBasedBreakIterator
*clone
= (RuleBasedBreakIterator
*)buf
;
1312 uprv_memcpy(clone
, &localIter
, sizeof(RuleBasedBreakIterator
)); // clone = empty, but initialized, iterator.
1313 *clone
= *this; // clone = the real one we want.
1314 if (status
!= U_SAFECLONE_ALLOCATED_WARNING
) {
1315 clone
->fBufferClone
= TRUE
;
1323 //-------------------------------------------------------------------------------
1325 // isDictionaryChar Return true if the category lookup for this char
1326 // indicates that it is in the set of dictionary lookup
1329 // This function is intended for use by dictionary based
1332 //-------------------------------------------------------------------------------
1333 UBool
RuleBasedBreakIterator::isDictionaryChar(UChar32 c
) {
1334 if (fData
== NULL
) {
1338 UTRIE_GET16(&fData
->fTrie
, c
, category
);
1339 return (category
& 0x4000) != 0;
1346 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */