2 ***************************************************************************
3 * Copyright (C) 1999-2008 International Business Machines Corporation *
4 * and others. All rights reserved. *
5 ***************************************************************************
8 // file: rbbi.c Contains the implementation of the rule based break iterator
9 // runtime engine and the API implementation for
10 // class RuleBasedBreakIterator
13 #include "unicode/utypes.h"
15 #if !UCONFIG_NO_BREAK_ITERATION
17 #include "unicode/rbbi.h"
18 #include "unicode/schriter.h"
19 #include "unicode/uchriter.h"
20 #include "unicode/udata.h"
21 #include "unicode/uclean.h"
33 // if U_LOCAL_SERVICE_HOOK is defined, then localsvc.cpp is expected to be included.
34 #if U_LOCAL_SERVICE_HOOK
39 static UBool fTrace
= FALSE
;
44 // The state number of the starting state
47 // The state-transition value indicating "stop"
51 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedBreakIterator
)
54 //=======================================================================
56 //=======================================================================
59 * Constructs a RuleBasedBreakIterator that uses the already-created
60 * tables object that is passed in as a parameter.
62 RuleBasedBreakIterator::RuleBasedBreakIterator(RBBIDataHeader
* data
, UErrorCode
&status
)
65 fData
= new RBBIDataWrapper(data
, status
); // status checked in constructor
66 if (U_FAILURE(status
)) {return;}
68 status
= U_MEMORY_ALLOCATION_ERROR
;
74 * Same as above but does not adopt memory
76 RuleBasedBreakIterator::RuleBasedBreakIterator(const RBBIDataHeader
* data
, enum EDontAdopt
, UErrorCode
&status
)
79 fData
= new RBBIDataWrapper(data
, RBBIDataWrapper::kDontAdopt
, status
); // status checked in constructor
80 if (U_FAILURE(status
)) {return;}
82 status
= U_MEMORY_ALLOCATION_ERROR
;
87 //-------------------------------------------------------------------------------
89 // Constructor from a UDataMemory handle to precompiled break rules
90 // stored in an ICU data file.
92 //-------------------------------------------------------------------------------
93 RuleBasedBreakIterator::RuleBasedBreakIterator(UDataMemory
* udm
, UErrorCode
&status
)
96 fData
= new RBBIDataWrapper(udm
, status
); // status checked in constructor
97 if (U_FAILURE(status
)) {return;}
99 status
= U_MEMORY_ALLOCATION_ERROR
;
106 //-------------------------------------------------------------------------------
108 // Constructor from a set of rules supplied as a string.
110 //-------------------------------------------------------------------------------
111 RuleBasedBreakIterator::RuleBasedBreakIterator( const UnicodeString
&rules
,
112 UParseError
&parseError
,
116 if (U_FAILURE(status
)) {return;}
117 RuleBasedBreakIterator
*bi
= (RuleBasedBreakIterator
*)
118 RBBIRuleBuilder::createRuleBasedBreakIterator(rules
, &parseError
, status
);
119 // Note: This is a bit awkward. The RBBI ruleBuilder has a factory method that
120 // creates and returns a complete RBBI. From here, in a constructor, we
121 // can't just return the object created by the builder factory, hence
122 // the assignment of the factory created object to "this".
123 if (U_SUCCESS(status
)) {
130 //-------------------------------------------------------------------------------
132 // Default Constructor. Create an empty shell that can be set up later.
133 // Used when creating a RuleBasedBreakIterator from a set
135 //-------------------------------------------------------------------------------
136 RuleBasedBreakIterator::RuleBasedBreakIterator() {
141 //-------------------------------------------------------------------------------
143 // Copy constructor. Will produce a break iterator with the same behavior,
144 // and which iterates over the same text, as the one passed in.
146 //-------------------------------------------------------------------------------
147 RuleBasedBreakIterator::RuleBasedBreakIterator(const RuleBasedBreakIterator
& other
)
148 : BreakIterator(other
)
158 RuleBasedBreakIterator::~RuleBasedBreakIterator() {
159 if (fCharIter
!=fSCharIter
&& fCharIter
!=fDCharIter
) {
160 // fCharIter was adopted from the outside.
172 fData
->removeReference();
175 if (fCachedBreakPositions
) {
176 uprv_free(fCachedBreakPositions
);
177 fCachedBreakPositions
= NULL
;
179 if (fLanguageBreakEngines
) {
180 delete fLanguageBreakEngines
;
181 fLanguageBreakEngines
= NULL
;
183 if (fUnhandledBreakEngine
) {
184 delete fUnhandledBreakEngine
;
185 fUnhandledBreakEngine
= NULL
;
190 * Assignment operator. Sets this iterator to have the same behavior,
191 * and iterate over the same text, as the one passed in.
193 RuleBasedBreakIterator
&
194 RuleBasedBreakIterator::operator=(const RuleBasedBreakIterator
& that
) {
198 reset(); // Delete break cache information
199 fBreakType
= that
.fBreakType
;
200 if (fLanguageBreakEngines
!= NULL
) {
201 delete fLanguageBreakEngines
;
202 fLanguageBreakEngines
= NULL
; // Just rebuild for now
204 // TODO: clone fLanguageBreakEngines from "that"
205 UErrorCode status
= U_ZERO_ERROR
;
206 fText
= utext_clone(fText
, that
.fText
, FALSE
, TRUE
, &status
);
208 if (fCharIter
!=fSCharIter
&& fCharIter
!=fDCharIter
) {
213 if (that
.fCharIter
!= NULL
) {
214 // This is a little bit tricky - it will initially appear that
215 // this->fCharIter is adopted, even if that->fCharIter was
216 // not adopted. That's ok.
217 fCharIter
= that
.fCharIter
->clone();
221 fData
->removeReference();
224 if (that
.fData
!= NULL
) {
225 fData
= that
.fData
->addReference();
233 //-----------------------------------------------------------------------------
235 // init() Shared initialization routine. Used by all the constructors.
236 // Initializes all fields, leaving the object in a consistent state.
238 //-----------------------------------------------------------------------------
239 void RuleBasedBreakIterator::init() {
240 UErrorCode status
= U_ZERO_ERROR
;
241 fBufferClone
= FALSE
;
242 fText
= utext_openUChars(NULL
, NULL
, 0, &status
);
247 fLastRuleStatusIndex
= 0;
248 fLastStatusIndexValid
= TRUE
;
249 fDictionaryCharCount
= 0;
252 fCachedBreakPositions
= NULL
;
253 fLanguageBreakEngines
= NULL
;
254 fUnhandledBreakEngine
= NULL
;
255 fNumCachedBreakPositions
= 0;
256 fPositionInCache
= 0;
259 static UBool debugInitDone
= FALSE
;
260 if (debugInitDone
== FALSE
) {
261 char *debugEnv
= getenv("U_RBBIDEBUG");
262 if (debugEnv
&& uprv_strstr(debugEnv
, "trace")) {
265 debugInitDone
= TRUE
;
272 //-----------------------------------------------------------------------------
274 // clone - Returns a newly-constructed RuleBasedBreakIterator with the same
275 // behavior, and iterating over the same text, as this one.
276 // Virtual function: does the right thing with subclasses.
278 //-----------------------------------------------------------------------------
280 RuleBasedBreakIterator::clone(void) const {
281 return new RuleBasedBreakIterator(*this);
285 * Equality operator. Returns TRUE if both BreakIterators are of the
286 * same class, have the same behavior, and iterate over the same text.
289 RuleBasedBreakIterator::operator==(const BreakIterator
& that
) const {
290 if (that
.getDynamicClassID() != getDynamicClassID()) {
294 const RuleBasedBreakIterator
& that2
= (const RuleBasedBreakIterator
&) that
;
296 if (!utext_equals(fText
, that2
.fText
)) {
297 // The two break iterators are operating on different text,
298 // or have a different iteration position.
302 // TODO: need a check for when in a dictionary region at different offsets.
304 if (that2
.fData
== fData
||
305 (fData
!= NULL
&& that2
.fData
!= NULL
&& *that2
.fData
== *fData
)) {
306 // The two break iterators are using the same rules.
313 * Compute a hash code for this BreakIterator
314 * @return A hash code
317 RuleBasedBreakIterator::hashCode(void) const {
320 hash
= fData
->hashCode();
326 void RuleBasedBreakIterator::setText(UText
*ut
, UErrorCode
&status
) {
327 if (U_FAILURE(status
)) {
331 fText
= utext_clone(fText
, ut
, FALSE
, TRUE
, &status
);
333 // Set up a dummy CharacterIterator to be returned if anyone
334 // calls getText(). With input from UText, there is no reasonable
335 // way to return a characterIterator over the actual input text.
336 // Return one over an empty string instead - this is the closest
337 // we can come to signaling a failure.
338 // (GetText() is obsolete, this failure is sort of OK)
339 if (fDCharIter
== NULL
) {
340 static const UChar c
= 0;
341 fDCharIter
= new UCharCharacterIterator(&c
, 0);
342 if (fDCharIter
== NULL
) {
343 status
= U_MEMORY_ALLOCATION_ERROR
;
348 if (fCharIter
!=fSCharIter
&& fCharIter
!=fDCharIter
) {
349 // existing fCharIter was adopted from the outside. Delete it now.
352 fCharIter
= fDCharIter
;
358 UText
*RuleBasedBreakIterator::getUText(UText
*fillIn
, UErrorCode
&status
) const {
359 UText
*result
= utext_clone(fillIn
, fText
, FALSE
, TRUE
, &status
);
366 * Returns the description used to create this iterator
369 RuleBasedBreakIterator::getRules() const {
371 return fData
->getRuleSourceString();
373 static const UnicodeString
*s
;
375 // TODO: something more elegant here.
376 // perhaps API should return the string by value.
377 // Note: thread unsafe init & leak are semi-ok, better than
378 // what was before. Should be cleaned up, though.
379 s
= new UnicodeString
;
385 //=======================================================================
386 // BreakIterator overrides
387 //=======================================================================
390 * Return a CharacterIterator over the text being analyzed.
393 RuleBasedBreakIterator::getText() const {
398 * Set the iterator to analyze a new piece of text. This function resets
399 * the current iteration position to the beginning of the text.
400 * @param newText An iterator over the text to analyze.
403 RuleBasedBreakIterator::adoptText(CharacterIterator
* newText
) {
404 // If we are holding a CharacterIterator adopted from a
405 // previous call to this function, delete it now.
406 if (fCharIter
!=fSCharIter
&& fCharIter
!=fDCharIter
) {
411 UErrorCode status
= U_ZERO_ERROR
;
413 if (newText
==NULL
|| newText
->startIndex() != 0) {
414 // startIndex !=0 wants to be an error, but there's no way to report it.
415 // Make the iterator text be an empty string.
416 fText
= utext_openUChars(fText
, NULL
, 0, &status
);
418 fText
= utext_openCharacterIterator(fText
, newText
, &status
);
424 * Set the iterator to analyze a new piece of text. This function resets
425 * the current iteration position to the beginning of the text.
426 * @param newText An iterator over the text to analyze.
429 RuleBasedBreakIterator::setText(const UnicodeString
& newText
) {
430 UErrorCode status
= U_ZERO_ERROR
;
432 fText
= utext_openConstUnicodeString(fText
, &newText
, &status
);
434 // Set up a character iterator on the string.
435 // Needed in case someone calls getText().
436 // Can not, unfortunately, do this lazily on the (probably never)
437 // call to getText(), because getText is const.
438 if (fSCharIter
== NULL
) {
439 fSCharIter
= new StringCharacterIterator(newText
);
441 fSCharIter
->setText(newText
);
444 if (fCharIter
!=fSCharIter
&& fCharIter
!=fDCharIter
) {
445 // old fCharIter was adopted from the outside. Delete it.
448 fCharIter
= fSCharIter
;
456 * Sets the current iteration position to the beginning of the text.
457 * @return The offset of the beginning of the text.
459 int32_t RuleBasedBreakIterator::first(void) {
461 fLastRuleStatusIndex
= 0;
462 fLastStatusIndexValid
= TRUE
;
464 // return BreakIterator::DONE;
466 utext_setNativeIndex(fText
, 0);
471 * Sets the current iteration position to the end of the text.
472 * @return The text's past-the-end offset.
474 int32_t RuleBasedBreakIterator::last(void) {
477 fLastRuleStatusIndex
= 0;
478 fLastStatusIndexValid
= TRUE
;
479 return BreakIterator::DONE
;
482 fLastStatusIndexValid
= FALSE
;
483 int32_t pos
= (int32_t)utext_nativeLength(fText
);
484 utext_setNativeIndex(fText
, pos
);
489 * Advances the iterator either forward or backward the specified number of steps.
490 * Negative values move backward, and positive values move forward. This is
491 * equivalent to repeatedly calling next() or previous().
492 * @param n The number of steps to move. The sign indicates the direction
493 * (negative is backwards, and positive is forwards).
494 * @return The character offset of the boundary position n boundaries away from
497 int32_t RuleBasedBreakIterator::next(int32_t n
) {
498 int32_t result
= current();
511 * Advances the iterator to the next boundary position.
512 * @return The position of the first boundary after this one.
514 int32_t RuleBasedBreakIterator::next(void) {
515 // if we have cached break positions and we're still in the range
516 // covered by them, just move one step forward in the cache
517 if (fCachedBreakPositions
!= NULL
) {
518 if (fPositionInCache
< fNumCachedBreakPositions
- 1) {
520 int32_t pos
= fCachedBreakPositions
[fPositionInCache
];
521 utext_setNativeIndex(fText
, pos
);
529 int32_t startPos
= current();
530 int32_t result
= handleNext(fData
->fForwardTable
);
531 if (fDictionaryCharCount
> 0) {
532 result
= checkDictionary(startPos
, result
, FALSE
);
538 * Advances the iterator backwards, to the last boundary preceding this one.
539 * @return The position of the last boundary position preceding this one.
541 int32_t RuleBasedBreakIterator::previous(void) {
545 // if we have cached break positions and we're still in the range
546 // covered by them, just move one step backward in the cache
547 if (fCachedBreakPositions
!= NULL
) {
548 if (fPositionInCache
> 0) {
550 // If we're at the beginning of the cache, need to reevaluate the
552 if (fPositionInCache
<= 0) {
553 fLastStatusIndexValid
= FALSE
;
555 int32_t pos
= fCachedBreakPositions
[fPositionInCache
];
556 utext_setNativeIndex(fText
, pos
);
564 // if we're already sitting at the beginning of the text, return DONE
565 if (fText
== NULL
|| (startPos
= current()) == 0) {
566 fLastRuleStatusIndex
= 0;
567 fLastStatusIndexValid
= TRUE
;
568 return BreakIterator::DONE
;
571 if (fData
->fSafeRevTable
!= NULL
|| fData
->fSafeFwdTable
!= NULL
) {
572 result
= handlePrevious(fData
->fReverseTable
);
573 if (fDictionaryCharCount
> 0) {
574 result
= checkDictionary(result
, startPos
, TRUE
);
580 // set things up. handlePrevious() will back us up to some valid
581 // break position before the current position (we back our internal
582 // iterator up one step to prevent handlePrevious() from returning
583 // the current position), but not necessarily the last one before
587 int32_t start
= current();
589 UTEXT_PREVIOUS32(fText
);
590 int32_t lastResult
= handlePrevious(fData
->fReverseTable
);
591 if (lastResult
== UBRK_DONE
) {
593 utext_setNativeIndex(fText
, 0);
597 UBool breakTagValid
= FALSE
;
599 // iterate forward from the known break position until we pass our
600 // starting point. The last break position before the starting
601 // point is our return value
605 if (result
== BreakIterator::DONE
|| result
>= start
) {
609 lastTag
= fLastRuleStatusIndex
;
610 breakTagValid
= TRUE
;
613 // fLastBreakTag wants to have the value for section of text preceding
614 // the result position that we are to return (in lastResult.) If
615 // the backwards rules overshot and the above loop had to do two or more
616 // next()s to move up to the desired return position, we will have a valid
617 // tag value. But, if handlePrevious() took us to exactly the correct result position,
618 // we won't have a tag value for that position, which is only set by handleNext().
620 // set the current iteration position to be the last break position
621 // before where we started, and then return that value
622 utext_setNativeIndex(fText
, lastResult
);
623 fLastRuleStatusIndex
= lastTag
; // for use by getRuleStatus()
624 fLastStatusIndexValid
= breakTagValid
;
626 // No need to check the dictionary; it will have been handled by
633 * Sets the iterator to refer to the first boundary position following
634 * the specified position.
635 * @param offset The position from which to begin searching for a break position.
636 * @return The position of the first break after the current position.
638 int32_t RuleBasedBreakIterator::following(int32_t offset
) {
639 // if we have cached break positions and offset is in the range
640 // covered by them, use them
641 // TODO: could use binary search
642 // TODO: what if offset is outside range, but break is not?
643 if (fCachedBreakPositions
!= NULL
) {
644 if (offset
>= fCachedBreakPositions
[0]
645 && offset
< fCachedBreakPositions
[fNumCachedBreakPositions
- 1]) {
646 fPositionInCache
= 0;
647 // We are guaranteed not to leave the array due to range test above
648 while (offset
>= fCachedBreakPositions
[fPositionInCache
]) {
651 int32_t pos
= fCachedBreakPositions
[fPositionInCache
];
652 utext_setNativeIndex(fText
, pos
);
660 // if the offset passed in is already past the end of the text,
661 // just return DONE; if it's before the beginning, return the
662 // text's starting offset
663 fLastRuleStatusIndex
= 0;
664 fLastStatusIndexValid
= TRUE
;
665 if (fText
== NULL
|| offset
>= utext_nativeLength(fText
)) {
669 else if (offset
< 0) {
673 // otherwise, set our internal iteration position (temporarily)
674 // to the position passed in. If this is the _beginning_ position,
675 // then we can just use next() to get our return value
679 if (fData
->fSafeRevTable
!= NULL
) {
681 utext_setNativeIndex(fText
, offset
);
682 // move forward one codepoint to prepare for moving back to a
684 // this handles offset being between a supplementary character
686 // handlePrevious will move most of the time to < 1 boundary away
687 handlePrevious(fData
->fSafeRevTable
);
688 int32_t result
= next();
689 while (result
<= offset
) {
694 if (fData
->fSafeFwdTable
!= NULL
) {
695 // backup plan if reverse safe table is not available
696 utext_setNativeIndex(fText
, offset
);
697 UTEXT_PREVIOUS32(fText
);
698 // handle next will give result >= offset
699 handleNext(fData
->fSafeFwdTable
);
700 // previous will give result 0 or 1 boundary away from offset,
703 int32_t oldresult
= previous();
704 while (oldresult
> offset
) {
705 int32_t result
= previous();
706 if (result
<= offset
) {
711 int32_t result
= next();
712 if (result
<= offset
) {
717 // otherwise, we have to sync up first. Use handlePrevious() to back
718 // up to a known break position before the specified position (if
719 // we can determine that the specified position is a break position,
720 // we don't back up at all). This may or may not be the last break
721 // position at or before our starting position. Advance forward
722 // from here until we've passed the starting position. The position
723 // we stop on will be the first break position after the specified one.
726 utext_setNativeIndex(fText
, offset
);
728 offset
==1 && utext_getNativeIndex(fText
)==0) {
733 while (result
!= BreakIterator::DONE
&& result
<= offset
) {
741 * Sets the iterator to refer to the last boundary position before the
742 * specified position.
743 * @param offset The position to begin searching for a break from.
744 * @return The position of the last boundary before the starting position.
746 int32_t RuleBasedBreakIterator::preceding(int32_t offset
) {
747 // if we have cached break positions and offset is in the range
748 // covered by them, use them
749 if (fCachedBreakPositions
!= NULL
) {
750 // TODO: binary search?
751 // TODO: What if offset is outside range, but break is not?
752 if (offset
> fCachedBreakPositions
[0]
753 && offset
<= fCachedBreakPositions
[fNumCachedBreakPositions
- 1]) {
754 fPositionInCache
= 0;
755 while (fPositionInCache
< fNumCachedBreakPositions
756 && offset
> fCachedBreakPositions
[fPositionInCache
])
759 // If we're at the beginning of the cache, need to reevaluate the
761 if (fPositionInCache
<= 0) {
762 fLastStatusIndexValid
= FALSE
;
764 utext_setNativeIndex(fText
, fCachedBreakPositions
[fPositionInCache
]);
765 return fCachedBreakPositions
[fPositionInCache
];
772 // if the offset passed in is already past the end of the text,
773 // just return DONE; if it's before the beginning, return the
774 // text's starting offset
775 if (fText
== NULL
|| offset
> utext_nativeLength(fText
)) {
776 // return BreakIterator::DONE;
779 else if (offset
< 0) {
783 // if we start by updating the current iteration position to the
784 // position specified by the caller, we can just use previous()
785 // to carry out this operation
787 if (fData
->fSafeFwdTable
!= NULL
) {
789 utext_setNativeIndex(fText
, offset
);
790 int32_t newOffset
= (int32_t)UTEXT_GETNATIVEINDEX(fText
);
791 if (newOffset
!= offset
) {
792 // Will come here if specified offset was not a code point boundary AND
793 // the underlying implementation is using UText, which snaps any non-code-point-boundary
794 // indices to the containing code point.
795 // For BreakIterator::preceding only, these non-code-point indices need to be moved
796 // up to refer to the following codepoint.
798 offset
= (int32_t)UTEXT_GETNATIVEINDEX(fText
);
801 // TODO: (synwee) would it be better to just check for being in the middle of a surrogate pair,
802 // rather than adjusting the position unconditionally?
803 // (Change would interact with safe rules.)
804 // TODO: change RBBI behavior for off-boundary indices to match that of UText?
805 // affects only preceding(), seems cleaner, but is slightly different.
806 UTEXT_PREVIOUS32(fText
);
807 handleNext(fData
->fSafeFwdTable
);
808 int32_t result
= (int32_t)UTEXT_GETNATIVEINDEX(fText
);
809 while (result
>= offset
) {
814 if (fData
->fSafeRevTable
!= NULL
) {
815 // backup plan if forward safe table is not available
816 // TODO: check whether this path can be discarded
817 // It's probably OK to say that rules must supply both safe tables
818 // if they use safe tables at all. We have certainly never described
819 // to anyone how to work with just one safe table.
820 utext_setNativeIndex(fText
, offset
);
823 // handle previous will give result <= offset
824 handlePrevious(fData
->fSafeRevTable
);
826 // next will give result 0 or 1 boundary away from offset,
829 int32_t oldresult
= next();
830 while (oldresult
< offset
) {
831 int32_t result
= next();
832 if (result
>= offset
) {
837 int32_t result
= previous();
838 if (result
>= offset
) {
845 utext_setNativeIndex(fText
, offset
);
850 * Returns true if the specified position is a boundary position. As a side
851 * effect, leaves the iterator pointing to the first boundary position at
853 * @param offset the offset to check.
854 * @return True if "offset" is a boundary position.
856 UBool
RuleBasedBreakIterator::isBoundary(int32_t offset
) {
857 // the beginning index of the iterator is always a boundary position by definition
859 first(); // For side effects on current position, tag values.
863 if (offset
== (int32_t)utext_nativeLength(fText
)) {
864 last(); // For side effects on current position, tag values.
868 // out-of-range indexes are never boundary positions
870 first(); // For side effects on current position, tag values.
874 if (offset
> utext_nativeLength(fText
)) {
875 last(); // For side effects on current position, tag values.
879 // otherwise, we can use following() on the position before the specified
880 // one and return true if the position we get back is the one the user
882 utext_previous32From(fText
, offset
);
883 int32_t backOne
= (int32_t)UTEXT_GETNATIVEINDEX(fText
);
884 UBool result
= following(backOne
) == offset
;
889 * Returns the current iteration position.
890 * @return The current iteration position.
892 int32_t RuleBasedBreakIterator::current(void) const {
893 int32_t pos
= (int32_t)UTEXT_GETNATIVEINDEX(fText
);
897 //=======================================================================
899 //=======================================================================
902 // RBBIRunMode - the state machine runs an extra iteration at the beginning and end
903 // of user text. A variable with this enum type keeps track of where we
904 // are. The state machine only fetches user input while in the RUN mode.
907 RBBI_START
, // state machine processing is before first char of input
908 RBBI_RUN
, // state machine processing is in the user text
909 RBBI_END
// state machine processing is after end of user text.
913 //-----------------------------------------------------------------------------------
915 // handleNext(stateTable)
916 // This method is the actual implementation of the rbbi next() method.
917 // This method initializes the state machine to state 1
918 // and advances through the text character by character until we reach the end
919 // of the text or the state machine transitions to state 0. We update our return
920 // value every time the state machine passes through an accepting state.
922 //-----------------------------------------------------------------------------------
923 int32_t RuleBasedBreakIterator::handleNext(const RBBIStateTable
*statetable
) {
925 int16_t category
= 0;
928 RBBIStateTableRow
*row
;
930 int32_t lookaheadStatus
= 0;
931 int32_t lookaheadTagIdx
= 0;
933 int32_t initialPosition
= 0;
934 int32_t lookaheadResult
= 0;
935 UBool lookAheadHardBreak
= (statetable
->fFlags
& RBBI_LOOKAHEAD_HARD_BREAK
) != 0;
936 const char *tableData
= statetable
->fTableData
;
937 uint32_t tableRowLen
= statetable
->fRowLen
;
941 RBBIDebugPuts("Handle Next pos char state category");
945 // No matter what, handleNext always correctly sets the break tag value.
946 fLastStatusIndexValid
= TRUE
;
947 fLastRuleStatusIndex
= 0;
949 // if we're already at the end of the text, return DONE.
950 initialPosition
= (int32_t)UTEXT_GETNATIVEINDEX(fText
);
951 result
= initialPosition
;
952 c
= UTEXT_NEXT32(fText
);
953 if (fData
== NULL
|| c
==U_SENTINEL
) {
954 return BreakIterator::DONE
;
957 // Set the initial state for the state machine
959 row
= (RBBIStateTableRow
*)
960 //(statetable->fTableData + (statetable->fRowLen * state));
961 (tableData
+ tableRowLen
* state
);
965 if (statetable
->fFlags
& RBBI_BOF_REQUIRED
) {
971 // loop until we reach the end of the text or transition to state 0
974 if (c
== U_SENTINEL
) {
975 // Reached end of input string.
976 if (mode
== RBBI_END
) {
977 // We have already run the loop one last time with the
978 // character set to the psueudo {eof} value. Now it is time
979 // to unconditionally bail out.
980 if (lookaheadResult
> result
) {
981 // We ran off the end of the string with a pending look-ahead match.
982 // Treat this as if the look-ahead condition had been met, and return
983 // the match at the / position from the look-ahead rule.
984 result
= lookaheadResult
;
985 fLastRuleStatusIndex
= lookaheadTagIdx
;
990 // Run the loop one last time with the fake end-of-input character category.
996 // Get the char category. An incoming category of 1 or 2 means that
997 // we are preset for doing the beginning or end of input, and
998 // that we shouldn't get a category from an actual text input character.
1000 if (mode
== RBBI_RUN
) {
1001 // look up the current character's character category, which tells us
1002 // which column in the state table to look at.
1003 // Note: the 16 in UTRIE_GET16 refers to the size of the data being returned,
1004 // not the size of the character going in, which is a UChar32.
1006 UTRIE_GET16(&fData
->fTrie
, c
, category
);
1008 // Check the dictionary bit in the character's category.
1009 // Counter is only used by dictionary based iterators (subclasses).
1010 // Chars that need to be handled by a dictionary have a flag bit set
1011 // in their category values.
1013 if ((category
& 0x4000) != 0) {
1014 fDictionaryCharCount
++;
1015 // And off the dictionary flag bit.
1016 category
&= ~0x4000;
1022 RBBIDebugPrintf(" %4d ", utext_getNativeIndex(fText
));
1023 if (0x20<=c
&& c
<0x7f) {
1024 RBBIDebugPrintf("\"%c\" ", c
);
1026 RBBIDebugPrintf("%5x ", c
);
1028 RBBIDebugPrintf("%3d %3d\n", state
, category
);
1032 // State Transition - move machine to its next state
1034 state
= row
->fNextState
[category
];
1035 row
= (RBBIStateTableRow
*)
1036 // (statetable->fTableData + (statetable->fRowLen * state));
1037 (tableData
+ tableRowLen
* state
);
1040 if (row
->fAccepting
== -1) {
1041 // Match found, common case.
1042 if (mode
!= RBBI_START
) {
1043 result
= (int32_t)UTEXT_GETNATIVEINDEX(fText
);
1045 fLastRuleStatusIndex
= row
->fTagIdx
; // Remember the break status (tag) values.
1048 if (row
->fLookAhead
!= 0) {
1049 if (lookaheadStatus
!= 0
1050 && row
->fAccepting
== lookaheadStatus
) {
1051 // Lookahead match is completed.
1052 result
= lookaheadResult
;
1053 fLastRuleStatusIndex
= lookaheadTagIdx
;
1054 lookaheadStatus
= 0;
1055 // TODO: make a standalone hard break in a rule work.
1056 if (lookAheadHardBreak
) {
1057 UTEXT_SETNATIVEINDEX(fText
, result
);
1060 // Look-ahead completed, but other rules may match further. Continue on
1061 // TODO: junk this feature? I don't think it's used anywhere.
1065 int32_t r
= (int32_t)UTEXT_GETNATIVEINDEX(fText
);
1066 lookaheadResult
= r
;
1067 lookaheadStatus
= row
->fLookAhead
;
1068 lookaheadTagIdx
= row
->fTagIdx
;
1073 if (row
->fAccepting
!= 0) {
1074 // Because this is an accepting state, any in-progress look-ahead match
1075 // is no longer relevant. Clear out the pending lookahead status.
1076 lookaheadStatus
= 0; // clear out any pending look-ahead match.
1080 if (state
== STOP_STATE
) {
1081 // This is the normal exit from the lookup state machine.
1082 // We have advanced through the string until it is certain that no
1083 // longer match is possible, no matter what characters follow.
1087 // Advance to the next character.
1088 // If this is a beginning-of-input loop iteration, don't advance
1089 // the input position. The next iteration will be processing the
1090 // first real input character.
1091 if (mode
== RBBI_RUN
) {
1092 c
= UTEXT_NEXT32(fText
);
1094 if (mode
== RBBI_START
) {
1102 // The state machine is done. Check whether it found a match...
1104 // If the iterator failed to advance in the match engine, force it ahead by one.
1105 // (This really indicates a defect in the break rules. They should always match
1106 // at least one character.)
1107 if (result
== initialPosition
) {
1108 UTEXT_SETNATIVEINDEX(fText
, initialPosition
);
1109 UTEXT_NEXT32(fText
);
1110 result
= (int32_t)UTEXT_GETNATIVEINDEX(fText
);
1113 // Leave the iterator at our result position.
1114 UTEXT_SETNATIVEINDEX(fText
, result
);
1117 RBBIDebugPrintf("result = %d\n\n", result
);
1125 //-----------------------------------------------------------------------------------
1129 // Iterate backwards, according to the logic of the reverse rules.
1130 // This version handles the exact style backwards rules.
1132 // The logic of this function is very similar to handleNext(), above.
1134 //-----------------------------------------------------------------------------------
// Runs the given (reverse) state table backwards over fText, starting at the
// text's current native index, to locate the preceding break position.
//
// statetable  State table to run. Its fFlags are tested for
//             RBBI_LOOKAHEAD_HARD_BREAK and RBBI_BOF_REQUIRED; its fTableData
//             and fRowLen are used to address the row for each state.
//
// Returns BreakIterator::DONE when fText or fData is NULL, or when the text
// index is already 0. Otherwise fText is left positioned at `result`;
// presumably `result` is also the return value -- the return statement is not
// visible in this listing, confirm against the full file.
//
// Side effects visible here: the cached rule status is invalidated
// (fLastStatusIndexValid / fLastRuleStatusIndex) and fDictionaryCharCount is
// incremented for every character whose category carries the 0x4000 flag.
//
// NOTE(review): the declarations of `state`, `mode`, `c` and `result`, and the
// loop / else / closing-brace lines, do not appear in this listing.
1135 int32_t RuleBasedBreakIterator::handlePrevious(const RBBIStateTable
*statetable
) {
1137 int16_t category
= 0;
1139 RBBIStateTableRow
*row
;
1141 int32_t lookaheadStatus
= 0;
1143 int32_t initialPosition
= 0;
1144 int32_t lookaheadResult
= 0;
// Cache the table-level flag once; it is consulted when a look-ahead match completes.
1145 UBool lookAheadHardBreak
= (statetable
->fFlags
& RBBI_LOOKAHEAD_HARD_BREAK
) != 0;
1149 RBBIDebugPuts("Handle Previous pos char state category");
1153 // handlePrevious() never gets the rule status.
1154 // Flag the status as invalid; if the user ever asks for status, we will need
1155 // to back up, then re-find the break position using handleNext(), which does
1156 // get the status value.
1157 fLastStatusIndexValid
= FALSE
;
1158 fLastRuleStatusIndex
= 0;
1160 // If there is no text, no rule data, or we're already at the start of the text, return DONE.
1161 if (fText
== NULL
|| fData
== NULL
|| UTEXT_GETNATIVEINDEX(fText
)==0) {
1162 return BreakIterator::DONE
;
1165 // Set up the starting char: remember where we began, then step back one code point.
1166 initialPosition
= (int32_t)UTEXT_GETNATIVEINDEX(fText
);
1167 result
= initialPosition
;
1168 c
= UTEXT_PREVIOUS32(fText
);
1170 // Set the initial state for the state machine
1171 state
= START_STATE
;
// Row address = table base + (row length * state number).
1172 row
= (RBBIStateTableRow
*)
1173 (statetable
->fTableData
+ (statetable
->fRowLen
* state
));
// NOTE(review): the body of this RBBI_BOF_REQUIRED branch is not visible in this listing.
1176 if (statetable
->fFlags
& RBBI_BOF_REQUIRED
) {
1182 // loop until we reach the start of the text or transition to state 0
1185 if (c
== U_SENTINEL
) {
1186 // Ran out of input (we are iterating backwards, so this is the start of the text).
1187 if (mode
== RBBI_END
||
1188 *(int32_t *)fData
->fHeader
->fFormatVersion
== 1 ) {
1189 // We have already run the loop one last time with the
1190 // character set to the pseudo {eof} value. Now it is time
1191 // to unconditionally bail out.
1192 // (Or we have an old format binary rule file that does not support {eof}.)
1193 if (lookaheadResult
< result
) {
1194 // We ran off the end of the string with a pending look-ahead match.
1195 // Treat this as if the look-ahead condition had been met, and return
1196 // the match at the / position from the look-ahead rule.
1197 result
= lookaheadResult
;
1198 lookaheadStatus
= 0;
1199 } else if (result
== initialPosition
) {
1200 // Ran off start, no match found.
1201 // Move the index by one (towards the start, since we are doing a previous())
1202 UTEXT_SETNATIVEINDEX(fText
, initialPosition
);
1203 UTEXT_PREVIOUS32(fText
); // TODO: shouldn't be necessary. We're already at beginning. Check.
1207 // Run the loop one last time with the fake end-of-input character category.
1213 // Get the char category. An incoming category of 1 or 2 means that
1214 // we are preset for doing the beginning or end of input, and
1215 // that we shouldn't get a category from an actual text input character.
1217 if (mode
== RBBI_RUN
) {
1218 // look up the current character's character category, which tells us
1219 // which column in the state table to look at.
1220 // Note: the 16 in UTRIE_GET16 refers to the size of the data being returned,
1221 // not the size of the character going in, which is a UChar32.
1223 UTRIE_GET16(&fData
->fTrie
, c
, category
);
1225 // Check the dictionary bit in the character's category.
1226 // Counter is only used by dictionary based iterators (subclasses).
1227 // Chars that need to be handled by a dictionary have a flag bit set
1228 // in their category values.
1230 if ((category
& 0x4000) != 0) {
1231 fDictionaryCharCount
++;
1232 // And off the dictionary flag bit.
1233 category
&= ~0x4000;
// Debug trace: position, the character (printable ASCII as-is, otherwise hex), state and category.
1239 RBBIDebugPrintf(" %4d ", (int32_t)utext_getNativeIndex(fText
));
1240 if (0x20<=c
&& c
<0x7f) {
1241 RBBIDebugPrintf("\"%c\" ", c
);
1243 RBBIDebugPrintf("%5x ", c
);
1245 RBBIDebugPrintf("%3d %3d\n", state
, category
);
1249 // State Transition - move machine to its next state
1251 state
= row
->fNextState
[category
];
1252 row
= (RBBIStateTableRow
*)
1253 (statetable
->fTableData
+ (statetable
->fRowLen
* state
));
1255 if (row
->fAccepting
== -1) {
1256 // Match found, common case.
1257 result
= (int32_t)UTEXT_GETNATIVEINDEX(fText
);
1260 if (row
->fLookAhead
!= 0) {
1261 if (lookaheadStatus
!= 0
1262 && row
->fAccepting
== lookaheadStatus
) {
1263 // Lookahead match is completed.
1264 result
= lookaheadResult
;
1265 lookaheadStatus
= 0;
1266 // TODO: make a standalone hard break in a rule work.
1267 if (lookAheadHardBreak
) {
1268 UTEXT_SETNATIVEINDEX(fText
, result
);
1271 // Look-ahead completed, but other rules may match further. Continue on
1272 // TODO: junk this feature? I don't think it's used anywhere.
// Record a pending look-ahead: the current text position and this row's look-ahead tag.
1276 int32_t r
= (int32_t)UTEXT_GETNATIVEINDEX(fText
);
1277 lookaheadResult
= r
;
1278 lookaheadStatus
= row
->fLookAhead
;
1283 if (row
->fAccepting
!= 0) {
1284 // Because this is an accepting state, any in-progress look-ahead match
1285 // is no longer relevant. Clear out the pending lookahead status.
1286 lookaheadStatus
= 0;
1290 if (state
== STOP_STATE
) {
1291 // This is the normal exit from the lookup state machine.
1292 // We have advanced through the string until it is certain that no
1293 // longer match is possible, no matter what characters follow.
1297 // Move (backwards) to the next character to process.
1298 // If this is a beginning-of-input loop iteration, don't advance
1299 // the input position. The next iteration will be processing the
1300 // first real input character.
1301 if (mode
== RBBI_RUN
) {
1302 c
= UTEXT_PREVIOUS32(fText
);
1304 if (mode
== RBBI_START
) {
1310 // The state machine is done. Check whether it found a match...
1312 // If the iterator failed to move in the match engine, force it back by one code point.
1313 // (This really indicates a defect in the break rules. They should always match
1314 // at least one character.)
1315 if (result
== initialPosition
) {
1316 UTEXT_SETNATIVEINDEX(fText
, initialPosition
);
1317 UTEXT_PREVIOUS32(fText
);
1318 result
= (int32_t)UTEXT_GETNATIVEINDEX(fText
);
1321 // Leave the iterator at our result position.
1322 UTEXT_SETNATIVEINDEX(fText
, result
);
1325 RBBIDebugPrintf("result = %d\n\n", result
);
// Discards the cached break positions: frees the fCachedBreakPositions array
// with uprv_free when one exists, then clears the pointer and zeroes the cache
// size, the dictionary-character counter and the cache cursor.
// NOTE(review): the return-type line and the braces of this definition are not
// visible in this listing.
1333 RuleBasedBreakIterator::reset()
1335 if (fCachedBreakPositions
) {
1336 uprv_free(fCachedBreakPositions
);
// Always null the pointer so a later reset() cannot free the same array twice.
1338 fCachedBreakPositions
= NULL
;
1339 fNumCachedBreakPositions
= 0;
1340 fDictionaryCharCount
= 0;
1341 fPositionInCache
= 0;
1346 //-------------------------------------------------------------------------------
1348 // getRuleStatus() Return the break rule tag associated with the current
1349 // iterator position. If the iterator arrived at its current
1350 // position by iterating forwards, the value will have been
1351 // cached by the handleNext() function.
1353 // If no cached status value is available, the status is
1354 // found by doing a previous() followed by a next(), which
1355 // leaves the iterator where it started, and computes the
1356 // status while doing the next().
1358 //-------------------------------------------------------------------------------
// Makes fLastRuleStatusIndex usable for the current iterator position.
// Nothing is done when fLastStatusIndexValid is already set. With no text, or
// at position 0, the index is simply set to 0 and marked valid. Otherwise the
// current position is noted, any dictionary break cache is dropped via reset(),
// and next() is called to recompute the status.
// NOTE(review): the step that moves backwards before next() is not visible in
// this listing; confirm against the full file.
1359 void RuleBasedBreakIterator::makeRuleStatusValid() {
1360 if (fLastStatusIndexValid
== FALSE
) {
1361 // No cached status is available.
1362 if (fText
== NULL
|| current() == 0) {
1363 // At start of text, or there is no text. Status is always zero.
1364 fLastRuleStatusIndex
= 0;
1365 fLastStatusIndexValid
= TRUE
;
1367 // Not at start of text. Find status the tedious way.
1368 int32_t pa
= current();
1370 if (fNumCachedBreakPositions
> 0) {
1371 reset(); // Blow off the dictionary cache
1373 int32_t pb
= next();
1375 // note: the if (pa != pb) test is here only to eliminate warnings for
1376 // unused local variables on gcc. Logically, it isn't needed.
// Sanity check: the status index must fall inside the rule status table.
1381 U_ASSERT(fLastRuleStatusIndex
>= 0 && fLastRuleStatusIndex
< fData
->fStatusMaxIdx
);
// Looks up the rule status tag for the current break position.
// The status record at fLastRuleStatusIndex starts with a count, followed by
// that many values; `idx` addresses the last value of the record and `tagVal`
// reads it. Presumably `tagVal` is returned -- the return statement is not
// visible in this listing.
1385 int32_t RuleBasedBreakIterator::getRuleStatus() const {
// This method is const, but refreshing the cached status mutates the iterator,
// so const is cast away for the call.
1386 RuleBasedBreakIterator
*nonConstThis
= (RuleBasedBreakIterator
*)this;
1387 nonConstThis
->makeRuleStatusValid();
1389 // fLastRuleStatusIndex indexes to the start of the appropriate status record
1390 // (the number of status values.)
1391 // This function returns the last (largest) of the array of status values.
1392 int32_t idx
= fLastRuleStatusIndex
+ fData
->fRuleStatusTable
[fLastRuleStatusIndex
];
1393 int32_t tagVal
= fData
->fRuleStatusTable
[idx
];
// Copies the status values for the current break position into fillInVec.
//
// fillInVec  Destination array supplied by the caller.
// capacity   Number of int32_t slots available in fillInVec.
// status     In/out error code. An incoming failure is tested first; it is
//            set to U_BUFFER_OVERFLOW_ERROR when the record holds more values
//            than `capacity`, in which case only `capacity` values are copied.
//
// NOTE(review): the return statements and the declaration of `i` are not
// visible in this listing.
1401 int32_t RuleBasedBreakIterator::getRuleStatusVec(
1402 int32_t *fillInVec
, int32_t capacity
, UErrorCode
&status
)
1404 if (U_FAILURE(status
)) {
// Refresh the cached status index; const is cast away because doing so mutates the iterator.
1408 RuleBasedBreakIterator
*nonConstThis
= (RuleBasedBreakIterator
*)this;
1409 nonConstThis
->makeRuleStatusValid();
// The first slot of the status record is the number of values that follow it.
1410 int32_t numVals
= fData
->fRuleStatusTable
[fLastRuleStatusIndex
];
1411 int32_t numValsToCopy
= numVals
;
1412 if (numVals
> capacity
) {
1413 status
= U_BUFFER_OVERFLOW_ERROR
;
1414 numValsToCopy
= capacity
;
// The values start one slot past the count, hence the "+ 1".
1417 for (i
=0; i
<numValsToCopy
; i
++) {
1418 fillInVec
[i
] = fData
->fRuleStatusTable
[fLastRuleStatusIndex
+ i
+ 1];
1425 //-------------------------------------------------------------------------------
1427 // getBinaryRules Access to the compiled form of the rules,
1428 // for use by build system tools that save the data
1429 // for standard iterator types.
1431 //-------------------------------------------------------------------------------
// Exposes the compiled rule data held by this iterator.
//
// length  Out parameter; when fData is present it receives fData->fHeader->fLength.
//
// `retPtr` starts as NULL and, when fData is present, is pointed at the data
// header reinterpreted as bytes. Presumably `retPtr` is returned -- the return
// statement is not visible in this listing.
1432 const uint8_t *RuleBasedBreakIterator::getBinaryRules(uint32_t &length
) {
1433 const uint8_t *retPtr
= NULL
;
1436 if (fData
!= NULL
) {
1437 retPtr
= (const uint8_t *)fData
->fHeader
;
1438 length
= fData
->fHeader
->fLength
;
1446 //-------------------------------------------------------------------------------
1448 // BufferClone TODO: In my (Andy) opinion, this function should be deprecated.
1449 // Saving one heap allocation isn't worth the trouble.
1450 // Cloning shouldn't be done in tight loops, and
1451 // making the clone copy involves other heap operations anyway.
1452 // And the application code for correctly dealing with buffer
1453 // size problems and the eventual object destruction is ugly.
1455 //-------------------------------------------------------------------------------
// Clones this iterator, preferring caller-supplied storage over the heap.
//
// stackBuffer  Caller-supplied storage for the clone; NULL forces a heap clone.
// bufferSize   In/out. A value of 0 requests a preflight: it is set to
//              sizeof(RuleBasedBreakIterator) + U_ALIGNMENT_OFFSET_UP(0).
// status       In/out error code (its parameter line is not visible in this
//              listing). Set to U_MEMORY_ALLOCATION_ERROR when the heap clone
//              cannot be allocated, or to U_SAFECLONE_ALLOCATED_WARNING when a
//              heap clone was made because the buffer was too small.
//
// NOTE(review): the return statements, the alignment adjustment that follows
// `offsetUp`, and several braces / else lines are not visible in this listing.
1456 BreakIterator
* RuleBasedBreakIterator::createBufferClone(void *stackBuffer
,
1457 int32_t &bufferSize
,
1460 if (U_FAILURE(status
)){
1465 // If user buffer size is zero this is a preflight operation to
1466 // obtain the needed buffer size, allowing for worst case misalignment.
1468 if (bufferSize
== 0) {
1469 bufferSize
= sizeof(RuleBasedBreakIterator
) + U_ALIGNMENT_OFFSET_UP(0);
1475 // Check the alignment and size of the user supplied buffer.
1476 // Allocate heap memory if the user supplied memory is insufficient.
1478 char *buf
= (char *)stackBuffer
;
1479 uint32_t s
= bufferSize
;
1481 if (stackBuffer
== NULL
) {
1482 s
= 0; // Ignore size, force allocation if user didn't give us a buffer.
// A misaligned buffer needs `offsetUp` bytes of padding before the clone can be placed in it.
1484 if (U_ALIGNMENT_OFFSET(stackBuffer
) != 0) {
1485 uint32_t offsetUp
= (uint32_t)U_ALIGNMENT_OFFSET_UP(buf
);
1489 if (s
< sizeof(RuleBasedBreakIterator
)) {
1490 // Not enough room in the caller-supplied buffer.
1491 // Do a plain-vanilla heap based clone and return that, along with
1492 // a warning that the clone was allocated.
1493 RuleBasedBreakIterator
*clonedBI
= new RuleBasedBreakIterator(*this);
1494 if (clonedBI
== 0) {
1495 status
= U_MEMORY_ALLOCATION_ERROR
;
1497 status
= U_SAFECLONE_ALLOCATED_WARNING
;
1503 // Clone the source BI into the caller-supplied buffer.
1504 // TODO: using an overloaded operator new to directly initialize the
1505 // copy in the user's buffer would be better, but it doesn't seem
1506 // to get along with namespaces. Investigate why.
1508 // The memcpy is only safe with an empty (default constructed)
1509 // break iterator. Use on others can screw up reference counts
1510 // to data. memcpy-ing objects is not really a good idea...
// Sequence: byte-copy an empty iterator into the buffer, run init() on it,
// then use the assignment operator to take on the state of *this.
1512 RuleBasedBreakIterator localIter
; // Empty break iterator, source for memcpy
1513 RuleBasedBreakIterator
*clone
= (RuleBasedBreakIterator
*)buf
;
1514 uprv_memcpy(clone
, &localIter
, sizeof(RuleBasedBreakIterator
)); // init C++ gorp, BreakIterator base class part
1515 clone
->init(); // Init RuleBasedBreakIterator part, (user default constructor)
1516 *clone
= *this; // clone = the real BI we want.
1517 clone
->fBufferClone
= TRUE
; // Flag to prevent deleting storage on close (From C code)
1523 //-------------------------------------------------------------------------------
1525 // isDictionaryChar Return true if the category lookup for this char
1526 // indicates that it is in the set of dictionary lookup characters.
1529 // This function is intended for use by dictionary based iterators.
1532 //-------------------------------------------------------------------------------
1533 /*UBool RuleBasedBreakIterator::isDictionaryChar(UChar32 c) {
1534 if (fData == NULL) {
1538 UTRIE_GET16(&fData->fTrie, c, category);
1539 return (category & 0x4000) != 0;
1543 //-------------------------------------------------------------------------------
1545 // checkDictionary This function handles all processing of characters in
1546 // the "dictionary" set. It will determine the appropriate
1547 // course of action, and possibly set up a cache of break positions.
1550 //-------------------------------------------------------------------------------
// Refines a rule-based boundary with dictionary (language break engine) breaks.
//
// startPos  Start of the range proposed by the rule engine.
// endPos, reverse
//           Used throughout the body; their parameter lines are not visible in
//           this listing. `reverse` selects which end of the range is the
//           proposed result (startPos when true, endPos otherwise).
//
// Outline of what the visible code does:
//  - Returns the proposed boundary unchanged when at most one dictionary
//    character was counted or the range spans at most one index.
//  - Widens [rangeStart, rangeEnd) so it covers the whole run of dictionary
//    characters (category flag 0x4000) that the scan starts in.
//  - Walks the range forwards, asking the language break engine for each
//    dictionary run to push its breaks onto `breaks`.
//  - If any breaks were found, stores them in fCachedBreakPositions, adding
//    startPos / endPos as first / last entries where needed, and resolves the
//    result through preceding() or following().
//  - Otherwise repositions fText at the proposed boundary and returns it.
//
// NOTE(review): the declarations of `category`, `current` and `out`, the
// increments of `totalBreaks`, and several loop / branch lines are not visible
// in this listing.
1551 int32_t RuleBasedBreakIterator::checkDictionary(int32_t startPos
,
1554 // Reset the old break cache first.
// Capture the counter first (presumably because the cache reset zeroes it -- the reset call is not visible here).
1555 uint32_t dictionaryCount
= fDictionaryCharCount
;
// Nothing for a dictionary to split: keep the boundary the rules proposed.
1558 if (dictionaryCount
<= 1 || (endPos
- startPos
) <= 1) {
1559 return (reverse
? startPos
: endPos
);
1562 // Starting from the starting point, scan towards the proposed result,
1563 // looking for the first dictionary character (which may be the one
1564 // we're on, if we're starting in the middle of a range).
1565 utext_setNativeIndex(fText
, reverse
? endPos
: startPos
);
1567 UTEXT_PREVIOUS32(fText
);
1570 int32_t rangeStart
= startPos
;
1571 int32_t rangeEnd
= endPos
;
1575 UErrorCode status
= U_ZERO_ERROR
;
1576 UStack
breaks(status
);
1577 int32_t foundBreakCount
= 0;
1578 UChar32 c
= utext_current32(fText
);
1580 UTRIE_GET16(&fData
->fTrie
, c
, category
);
1582 // Is the character we're starting on a dictionary character? If so, we
1583 // need to back up to include the entire run; otherwise the results of
1584 // the break algorithm will differ depending on where we start. Since
1585 // the result is cached and there is typically a non-dictionary break
1586 // within a small number of words, there should be little performance impact.
1587 if (category
& 0x4000) {
// Scan forwards until the text ends or a non-dictionary character is found.
1590 utext_next32(fText
); // TODO: recast to work directly with postincrement.
1591 c
= utext_current32(fText
);
1592 UTRIE_GET16(&fData
->fTrie
, c
, category
);
1593 } while (c
!= U_SENTINEL
&& (category
& 0x4000));
1594 // Back up to the last dictionary character
1595 rangeEnd
= (int32_t)UTEXT_GETNATIVEINDEX(fText
);
1596 if (c
== U_SENTINEL
) {
1597 // c = fText->last32();
1598 // TODO: why was this if needed?
1599 c
= UTEXT_PREVIOUS32(fText
);
1602 c
= UTEXT_PREVIOUS32(fText
);
// Scan backwards until the text starts or a non-dictionary character is found.
1607 c
= UTEXT_PREVIOUS32(fText
);
1608 UTRIE_GET16(&fData
->fTrie
, c
, category
);
1610 while (c
!= U_SENTINEL
&& (category
& 0x4000));
1611 // Back up to the last dictionary character
1612 if (c
== U_SENTINEL
) {
1613 // c = fText->first32();
1614 c
= utext_current32(fText
);
1617 utext_next32(fText
);
1618 c
= utext_current32(fText
);
1620 rangeStart
= (int32_t)UTEXT_GETNATIVEINDEX(fText
);;
1622 UTRIE_GET16(&fData
->fTrie
, c
, category
);
1625 // Loop through the text, looking for ranges of dictionary characters.
1626 // For each span, find the appropriate break engine, and ask it to find
1627 // any breaks within the span.
1628 // Note: we always do this in the forward direction, so that the break
1629 // cache is built in the right order.
1631 utext_setNativeIndex(fText
, rangeStart
);
1632 c
= utext_current32(fText
);
1633 UTRIE_GET16(&fData
->fTrie
, c
, category
);
1635 while(U_SUCCESS(status
)) {
// Skip over non-dictionary characters inside the range.
1636 while((current
= (int32_t)UTEXT_GETNATIVEINDEX(fText
)) < rangeEnd
&& (category
& 0x4000) == 0) {
1637 utext_next32(fText
); // TODO: tweak for post-increment operation
1638 c
= utext_current32(fText
);
1639 UTRIE_GET16(&fData
->fTrie
, c
, category
);
1641 if (current
>= rangeEnd
) {
1645 // We now have a dictionary character. Get the appropriate language object
1647 const LanguageBreakEngine
*lbe
= getLanguageBreakEngine(c
);
1649 // Ask the language object if there are any breaks. It will leave the text
1650 // pointer on the other side of its range, ready to search for the next one.
1652 foundBreakCount
+= lbe
->findBreaks(fText
, rangeStart
, rangeEnd
, FALSE
, fBreakType
, breaks
);
1655 // Reload the loop variables for the next go-round
1656 c
= utext_current32(fText
);
1657 UTRIE_GET16(&fData
->fTrie
, c
, category
);
1660 // If we found breaks, build a new break cache. The first and last entries must
1661 // be the original starting and ending position.
1662 if (foundBreakCount
> 0) {
1663 int32_t totalBreaks
= foundBreakCount
;
// These two tests decide whether startPos / endPos need their own cache slots
// (the matching increments of totalBreaks are not visible in this listing).
1664 if (startPos
< breaks
.elementAti(0)) {
1667 if (endPos
> breaks
.peeki()) {
1670 fCachedBreakPositions
= (int32_t *)uprv_malloc(totalBreaks
* sizeof(int32_t));
1671 if (fCachedBreakPositions
!= NULL
) {
1673 fNumCachedBreakPositions
= totalBreaks
;
1674 if (startPos
< breaks
.elementAti(0)) {
1675 fCachedBreakPositions
[out
++] = startPos
;
1677 for (int32_t i
= 0; i
< foundBreakCount
; ++i
) {
1678 fCachedBreakPositions
[out
++] = breaks
.elementAti(i
);
1680 if (endPos
> fCachedBreakPositions
[out
-1]) {
1681 fCachedBreakPositions
[out
] = endPos
;
1683 // If there are breaks, then by definition, we are replacing the original
1684 // proposed break by one of the breaks we found. Use following() and
1685 // preceding() to do the work. They should never recurse in this case.
1687 return preceding(endPos
- 1);
1690 return following(startPos
);
1693 // If the allocation failed, just fall through to the "no breaks found" case.
1696 // If we get here, there were no language-based breaks. Set the text pointer
1697 // to the original proposed break.
1698 utext_setNativeIndex(fText
, reverse
? startPos
: endPos
);
1699 return (reverse
? startPos
: endPos
);
1704 // defined in ucln_cmn.h
// File-scope stack of LanguageBreakFactory objects. It starts out NULL, is
// filled in by getLanguageBreakEngineFromFactory(), and is deleted by
// breakiterator_cleanup_dict().
1706 static U_NAMESPACE_QUALIFIER UStack
*gLanguageBreakFactories
= NULL
;
1709 * Release all static memory held by breakiterator.
// Cleanup callback registered through ucln_common_registerCleanup: deletes the
// global factory stack, if one was created, and resets the pointer to NULL so
// it can be built again later.
// NOTE(review): the return statement is not visible in this listing.
1712 static UBool U_CALLCONV
breakiterator_cleanup_dict(void) {
1713 if (gLanguageBreakFactories
) {
1714 delete gLanguageBreakFactories
;
1715 gLanguageBreakFactories
= NULL
;
// Element deleter handed to the UStack of factories: casts the untyped element
// back to LanguageBreakFactory* and deletes it.
1722 static void U_CALLCONV
_deleteFactory(void *obj
) {
1723 delete (U_NAMESPACE_QUALIFIER LanguageBreakFactory
*) obj
;
// Asks the registered language break factories for an engine that can handle
// character `c` for the given `breakType`.
//
// On first use (gLanguageBreakFactories still NULL) a factory stack is built:
// the built-in ICULanguageBreakFactory is pushed, plus an optional factory
// obtained from uprv_svc_hook when U_LOCAL_SERVICE_HOOK is defined. The stack
// is published in gLanguageBreakFactories and a cleanup function is registered.
// Factories are then queried starting from index size().
//
// NOTE(review): the declaration of `needsInit`, any locking around the
// publication of the global, the loop header and the return statements are not
// visible in this listing.
1728 static const LanguageBreakEngine
*
1729 getLanguageBreakEngineFromFactory(UChar32 c
, int32_t breakType
)
1732 UErrorCode status
= U_ZERO_ERROR
;
// Read the global under UMTX_CHECK to decide whether initialization is needed.
1733 UMTX_CHECK(NULL
, (UBool
)(gLanguageBreakFactories
== NULL
), needsInit
);
1736 UStack
*factories
= new UStack(_deleteFactory
, NULL
, status
);
1737 if (factories
!= NULL
&& U_SUCCESS(status
)) {
// NOTE(review): builtIn is pushed without a visible NULL check; if the
// allocation fails, a NULL entry may later be dereferenced by getEngineFor()
// below -- confirm how UStack::push treats NULL.
1738 ICULanguageBreakFactory
*builtIn
= new ICULanguageBreakFactory(status
);
1739 factories
->push(builtIn
, status
);
1740 #ifdef U_LOCAL_SERVICE_HOOK
1741 LanguageBreakFactory
*extra
= (LanguageBreakFactory
*)uprv_svc_hook("languageBreakFactory", &status
);
1742 if (extra
!= NULL
) {
1743 factories
->push(extra
, status
);
// Re-test the global before publishing (presumably another thread may have
// initialized it in the meantime -- confirm against the full file).
1748 if (gLanguageBreakFactories
== NULL
) {
1749 gLanguageBreakFactories
= factories
;
1751 ucln_common_registerCleanup(UCLN_COMMON_BREAKITERATOR_DICT
, breakiterator_cleanup_dict
);
// Initialization may have failed; without factories there is nothing to ask.
1757 if (gLanguageBreakFactories
== NULL
) {
1761 int32_t i
= gLanguageBreakFactories
->size();
1762 const LanguageBreakEngine
*lbe
= NULL
;
1764 LanguageBreakFactory
*factory
= (LanguageBreakFactory
*)(gLanguageBreakFactories
->elementAt(i
));
1765 lbe
= factory
->getEngineFor(c
, breakType
);
1774 //-------------------------------------------------------------------------------
1776 // getLanguageBreakEngine Find an appropriate LanguageBreakEngine for the character c.
1779 //-------------------------------------------------------------------------------
// Finds the LanguageBreakEngine to use for character `c` with this iterator's
// fBreakType.
//
// Lookup order in the visible code:
//  1. Engines already held in fLanguageBreakEngines (the stack is created on
//     first use), tested with handles(c, fBreakType).
//  2. The global factories, via getLanguageBreakEngineFromFactory(); an engine
//     obtained this way is pushed onto fLanguageBreakEngines.
//  3. fUnhandledBreakEngine, created on demand and inserted at index 0 of the
//     stack; `c` is reported to it through handleCharacter() and it is returned.
//
// NOTE(review): the loop header and the early `return` lines are not visible
// in this listing.
1780 const LanguageBreakEngine
*
1781 RuleBasedBreakIterator::getLanguageBreakEngine(UChar32 c
) {
1782 const LanguageBreakEngine
*lbe
= NULL
;
1783 UErrorCode status
= U_ZERO_ERROR
;
// Create the per-iterator engine stack on first use; discard it again if creation failed.
1785 if (fLanguageBreakEngines
== NULL
) {
1786 fLanguageBreakEngines
= new UStack(status
);
1787 if (fLanguageBreakEngines
== NULL
|| U_FAILURE(status
)) {
1788 delete fLanguageBreakEngines
;
1789 fLanguageBreakEngines
= 0;
1794 int32_t i
= fLanguageBreakEngines
->size();
1796 lbe
= (const LanguageBreakEngine
*)(fLanguageBreakEngines
->elementAt(i
));
1797 if (lbe
->handles(c
, fBreakType
)) {
1802 // No existing dictionary took the character. See if a factory wants to
1803 // give us a new LanguageBreakEngine for this character.
1804 lbe
= getLanguageBreakEngineFromFactory(c
, fBreakType
);
1806 // If we got one, use it and push it on our stack.
1808 fLanguageBreakEngines
->push((void *)lbe
, status
);
1809 // Even if we can't remember it, we can keep looking it up, so
1810 // return it even if the push fails.
1814 // No engine is forthcoming for this character. Add it to the
1815 // reject set. Create the reject break engine if needed.
1816 if (fUnhandledBreakEngine
== NULL
) {
1817 fUnhandledBreakEngine
= new UnhandledEngine(status
);
1818 if (U_SUCCESS(status
) && fUnhandledBreakEngine
== NULL
) {
1819 status
= U_MEMORY_ALLOCATION_ERROR
;
1821 // Put it last so that scripts for which we have an engine get tried
1823 fLanguageBreakEngines
->insertElementAt(fUnhandledBreakEngine
, 0, status
);
1824 // If we can't insert it, or creation failed, get rid of it
1825 if (U_FAILURE(status
)) {
1826 delete fUnhandledBreakEngine
;
1827 fUnhandledBreakEngine
= 0;
1832 // Tell the reject engine about the character; at its discretion, it may
1833 // add more than just the one character.
1834 fUnhandledBreakEngine
->handleCharacter(c
, fBreakType
);
1836 return fUnhandledBreakEngine
;
1841 /*int32_t RuleBasedBreakIterator::getBreakType() const {
// Setter taking the break type as an int32_t.
// NOTE(review): the body is not visible in this listing; presumably it stores
// `type` in fBreakType, the member passed to the language break engines
// elsewhere in this file -- confirm against the full file.
1845 void RuleBasedBreakIterator::setBreakType(int32_t type
) {
1852 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */