1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 ***************************************************************************
5 * Copyright (C) 1999-2016 International Business Machines Corporation
6 * and others. All rights reserved.
7 ***************************************************************************
9 **********************************************************************
10 * Legacy version of RuleBasedBreakIterator from ICU 57,
11 * only for use by Apple RuleBasedTokenizer
12 **********************************************************************
15 #include "utypeinfo.h" // for 'typeid' to work
17 #include "unicode/utypes.h"
19 #if !UCONFIG_NO_BREAK_ITERATION
21 #include "unicode/schriter.h"
22 #include "unicode/uchriter.h"
23 #include "unicode/udata.h"
24 #include "unicode/uclean.h"
25 #include "unicode/utext.h"
26 #include "rbbidata57.h"
39 // if U_LOCAL_SERVICE_HOOK is defined, then localsvc.cpp is expected to be included.
40 #if U_LOCAL_SERVICE_HOOK
45 static UBool fTrace
= FALSE
;
50 // The state number of the starting state
53 // The state-transition value indicating "stop"
57 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedBreakIterator57
)
60 //=======================================================================
62 //=======================================================================
65 * Constructs a RuleBasedBreakIterator57 that uses the already-created
66 * tables object that is passed in as a parameter.
68 RuleBasedBreakIterator57::RuleBasedBreakIterator57(RBBIDataHeader57
* data
, UErrorCode
&status
)
71 fData
= new RBBIDataWrapper57(data
, status
); // status checked in constructor
72 if (U_FAILURE(status
)) {return;}
74 status
= U_MEMORY_ALLOCATION_ERROR
;
80 * Same as above but does not adopt memory
82 RuleBasedBreakIterator57::RuleBasedBreakIterator57(const RBBIDataHeader57
* data
, enum EDontAdopt
, UErrorCode
&status
)
85 fData
= new RBBIDataWrapper57(data
, RBBIDataWrapper57::kDontAdopt
, status
); // status checked in constructor
86 if (U_FAILURE(status
)) {return;}
88 status
= U_MEMORY_ALLOCATION_ERROR
;
95 // not used by rbtok.cpp
98 // Construct from precompiled binary rules (tables). This constructor is public API,
99 // taking the rules as a (const uint8_t *) to match the type produced by getBinaryRules().
101 RuleBasedBreakIterator57::RuleBasedBreakIterator57(const uint8_t *compiledRules
,
103 UErrorCode
&status
) {
105 if (U_FAILURE(status
)) {
108 if (compiledRules
== NULL
|| ruleLength
< sizeof(RBBIDataHeader57
)) {
109 status
= U_ILLEGAL_ARGUMENT_ERROR
;
112 const RBBIDataHeader57
*data
= (const RBBIDataHeader57
*)compiledRules
;
113 if (data
->fLength
> ruleLength
) {
114 status
= U_ILLEGAL_ARGUMENT_ERROR
;
117 fData
= new RBBIDataWrapper57(data
, RBBIDataWrapper57::kDontAdopt
, status
);
118 if (U_FAILURE(status
)) {return;}
120 status
= U_MEMORY_ALLOCATION_ERROR
;
126 //-------------------------------------------------------------------------------
128 // Constructor from a UDataMemory handle to precompiled break rules
129 // stored in an ICU data file.
131 //-------------------------------------------------------------------------------
132 RuleBasedBreakIterator57::RuleBasedBreakIterator57(UDataMemory
* udm
, UErrorCode
&status
)
135 fData
= new RBBIDataWrapper57(udm
, status
); // status checked in constructor
136 if (U_FAILURE(status
)) {return;}
138 status
= U_MEMORY_ALLOCATION_ERROR
;
146 //-------------------------------------------------------------------------------
148 // Constructor from a set of rules supplied as a string.
150 //-------------------------------------------------------------------------------
151 RuleBasedBreakIterator57::RuleBasedBreakIterator57( const UnicodeString
&rules
,
152 UParseError
&parseError
,
156 if (U_FAILURE(status
)) {return;}
157 RuleBasedBreakIterator57
*bi
= (RuleBasedBreakIterator57
*)
158 RBBIRuleBuilder57::createRuleBasedBreakIterator(rules
, &parseError
, status
);
159 // Note: This is a bit awkward. The RBBI ruleBuilder has a factory method that
160 // creates and returns a complete RBBI. From here, in a constructor, we
161 // can't just return the object created by the builder factory, hence
162 // the assignment of the factory created object to "this".
163 if (U_SUCCESS(status
)) {
170 //-------------------------------------------------------------------------------
172 // Default Constructor. Create an empty shell that can be set up later.
173 // Used when creating a RuleBasedBreakIterator57 from a set
175 //-------------------------------------------------------------------------------
176 RuleBasedBreakIterator57::RuleBasedBreakIterator57() {
181 //-------------------------------------------------------------------------------
183 // Copy constructor. Will produce a break iterator with the same behavior,
184 // and which iterates over the same text, as the one passed in.
186 //-------------------------------------------------------------------------------
187 RuleBasedBreakIterator57::RuleBasedBreakIterator57(const RuleBasedBreakIterator57
& other
)
188 : BreakIterator(other
)
198 RuleBasedBreakIterator57::~RuleBasedBreakIterator57() {
199 if (fCharIter
!=fSCharIter
&& fCharIter
!=fDCharIter
) {
200 // fCharIter was adopted from the outside.
212 fData
->removeReference();
215 if (fCachedBreakPositions
) {
216 uprv_free(fCachedBreakPositions
);
217 fCachedBreakPositions
= NULL
;
219 if (fLanguageBreakEngines
) {
220 delete fLanguageBreakEngines
;
221 fLanguageBreakEngines
= NULL
;
223 if (fUnhandledBreakEngine
) {
224 delete fUnhandledBreakEngine
;
225 fUnhandledBreakEngine
= NULL
;
230 * Assignment operator. Sets this iterator to have the same behavior,
231 * and iterate over the same text, as the one passed in.
233 RuleBasedBreakIterator57
&
234 RuleBasedBreakIterator57::operator=(const RuleBasedBreakIterator57
& that
) {
238 fLineWordOpts
= that
.fLineWordOpts
;
239 reset(); // Delete break cache information
240 fBreakType
= that
.fBreakType
;
241 if (fLanguageBreakEngines
!= NULL
) {
242 delete fLanguageBreakEngines
;
243 fLanguageBreakEngines
= NULL
; // Just rebuild for now
245 // TODO: clone fLanguageBreakEngines from "that"
246 UErrorCode status
= U_ZERO_ERROR
;
247 fText
= utext_clone(fText
, that
.fText
, FALSE
, TRUE
, &status
);
249 if (fCharIter
!=fSCharIter
&& fCharIter
!=fDCharIter
) {
254 if (that
.fCharIter
!= NULL
) {
255 // This is a little bit tricky - it will initially appear that
256 // this->fCharIter is adopted, even if that->fCharIter was
257 // not adopted. That's ok.
258 fCharIter
= that
.fCharIter
->clone();
262 fData
->removeReference();
265 if (that
.fData
!= NULL
) {
266 fData
= that
.fData
->addReference();
274 //-----------------------------------------------------------------------------
276 // init() Shared initialization routine. Used by all the constructors.
277 // Initializes all fields, leaving the object in a consistent state.
279 //-----------------------------------------------------------------------------
280 void RuleBasedBreakIterator57::init() {
281 UErrorCode status
= U_ZERO_ERROR
;
282 fText
= utext_openUChars(NULL
, NULL
, 0, &status
);
287 fLastRuleStatusIndex
= 0;
288 fLastStatusIndexValid
= TRUE
;
289 fDictionaryCharCount
= 0;
290 fBreakType
= UBRK_WORD
; // Defaulting BreakType to word gives reasonable
291 // dictionary behavior for Break Iterators that are
292 // built from rules. Even better would be the ability to
293 // declare the type in the rules.
295 fCachedBreakPositions
= NULL
;
296 fLanguageBreakEngines
= NULL
;
297 fUnhandledBreakEngine
= NULL
;
298 fNumCachedBreakPositions
= 0;
299 fPositionInCache
= 0;
302 static UBool debugInitDone
= FALSE
;
303 if (debugInitDone
== FALSE
) {
304 char *debugEnv
= getenv("U_RBBIDEBUG");
305 if (debugEnv
&& uprv_strstr(debugEnv
, "trace")) {
308 debugInitDone
= TRUE
;
315 //-----------------------------------------------------------------------------
317 // clone - Returns a newly-constructed RuleBasedBreakIterator57 with the same
318 // behavior, and iterating over the same text, as this one.
319 // Virtual function: does the right thing with subclasses.
321 //-----------------------------------------------------------------------------
323 RuleBasedBreakIterator57::clone(void) const {
324 return new RuleBasedBreakIterator57(*this);
328 * Equality operator. Returns TRUE if both BreakIterators are of the
329 * same class, have the same behavior, and iterate over the same text.
332 RuleBasedBreakIterator57::operator==(const BreakIterator
& that
) const {
333 if (typeid(*this) != typeid(that
)) {
337 const RuleBasedBreakIterator57
& that2
= (const RuleBasedBreakIterator57
&) that
;
338 if (that2
.fLineWordOpts
!= fLineWordOpts
) {
342 if (!utext_equals(fText
, that2
.fText
)) {
343 // The two break iterators are operating on different text,
344 // or have a different iteration position.
348 // TODO: need a check for when in a dictionary region at different offsets.
350 if (that2
.fData
== fData
||
351 (fData
!= NULL
&& that2
.fData
!= NULL
&& *that2
.fData
== *fData
)) {
352 // The two break iterators are using the same rules.
359 * Compute a hash code for this BreakIterator
360 * @return A hash code
363 RuleBasedBreakIterator57::hashCode(void) const {
366 hash
= fData
->hashCode();
372 void RuleBasedBreakIterator57::setText(UText
*ut
, UErrorCode
&status
) {
373 if (U_FAILURE(status
)) {
377 fText
= utext_clone(fText
, ut
, FALSE
, TRUE
, &status
);
379 // Set up a dummy CharacterIterator to be returned if anyone
380 // calls getText(). With input from UText, there is no reasonable
381 // way to return a characterIterator over the actual input text.
382 // Return one over an empty string instead - this is the closest
383 // we can come to signaling a failure.
384 // (GetText() is obsolete, this failure is sort of OK)
385 if (fDCharIter
== NULL
) {
386 static const UChar c
= 0;
387 fDCharIter
= new UCharCharacterIterator(&c
, 0);
388 if (fDCharIter
== NULL
) {
389 status
= U_MEMORY_ALLOCATION_ERROR
;
394 if (fCharIter
!=fSCharIter
&& fCharIter
!=fDCharIter
) {
395 // existing fCharIter was adopted from the outside. Delete it now.
398 fCharIter
= fDCharIter
;
404 UText
*RuleBasedBreakIterator57::getUText(UText
*fillIn
, UErrorCode
&status
) const {
405 UText
*result
= utext_clone(fillIn
, fText
, FALSE
, TRUE
, &status
);
412 // not used by rbtok.cpp
414 * Returns the description used to create this iterator
417 RuleBasedBreakIterator57::getRules() const {
419 return fData
->getRuleSourceString();
421 static const UnicodeString
*s
;
423 // TODO: something more elegant here.
424 // perhaps API should return the string by value.
425 // Note: thread unsafe init & leak are semi-ok, better than
426 // what was before. Should be cleaned up, though.
427 s
= new UnicodeString
;
434 //=======================================================================
435 // BreakIterator overrides
436 //=======================================================================
439 * Return a CharacterIterator over the text being analyzed.
442 RuleBasedBreakIterator57::getText() const {
447 * Set the iterator to analyze a new piece of text. This function resets
448 * the current iteration position to the beginning of the text.
449 * @param newText An iterator over the text to analyze.
452 RuleBasedBreakIterator57::adoptText(CharacterIterator
* newText
) {
453 // If we are holding a CharacterIterator adopted from a
454 // previous call to this function, delete it now.
455 if (fCharIter
!=fSCharIter
&& fCharIter
!=fDCharIter
) {
460 UErrorCode status
= U_ZERO_ERROR
;
462 if (newText
==NULL
|| newText
->startIndex() != 0) {
463 // startIndex !=0 wants to be an error, but there's no way to report it.
464 // Make the iterator text be an empty string.
465 fText
= utext_openUChars(fText
, NULL
, 0, &status
);
467 fText
= utext_openCharacterIterator(fText
, newText
, &status
);
473 * Set the iterator to analyze a new piece of text. This function resets
474 * the current iteration position to the beginning of the text.
475 * @param newText An iterator over the text to analyze.
478 RuleBasedBreakIterator57::setText(const UnicodeString
& newText
) {
479 UErrorCode status
= U_ZERO_ERROR
;
481 fText
= utext_openConstUnicodeString(fText
, &newText
, &status
);
483 // Set up a character iterator on the string.
484 // Needed in case someone calls getText().
485 // Can not, unfortunately, do this lazily on the (probably never)
486 // call to getText(), because getText is const.
487 if (fSCharIter
== NULL
) {
488 fSCharIter
= new StringCharacterIterator(newText
);
490 fSCharIter
->setText(newText
);
493 if (fCharIter
!=fSCharIter
&& fCharIter
!=fDCharIter
) {
494 // old fCharIter was adopted from the outside. Delete it.
497 fCharIter
= fSCharIter
;
504 * Provide a new UText for the input text. Must reference text with contents identical
506 * Intended for use with text data originating in Java (garbage collected) environments
507 * where the data may be moved in memory at arbitrary times.
509 RuleBasedBreakIterator57
&RuleBasedBreakIterator57::refreshInputText(UText
*input
, UErrorCode
&status
) {
510 if (U_FAILURE(status
)) {
514 status
= U_ILLEGAL_ARGUMENT_ERROR
;
517 int64_t pos
= utext_getNativeIndex(fText
);
518 // Shallow read-only clone of the new UText into the existing input UText
519 fText
= utext_clone(fText
, input
, FALSE
, TRUE
, &status
);
520 if (U_FAILURE(status
)) {
523 utext_setNativeIndex(fText
, pos
);
524 if (utext_getNativeIndex(fText
) != pos
) {
525 // Sanity check. The new input utext is supposed to have the exact same
526 // contents as the old. If we can't set to the same position, it doesn't.
527 // The contents underlying the old utext might be invalid at this point,
528 // so it's not safe to check directly.
529 status
= U_ILLEGAL_ARGUMENT_ERROR
;
536 * Sets the current iteration position to the beginning of the text, position zero.
537 * @return The new iterator position, which is zero.
539 int32_t RuleBasedBreakIterator57::first(void) {
541 fLastRuleStatusIndex
= 0;
542 fLastStatusIndexValid
= TRUE
;
544 // return BreakIterator::DONE;
546 utext_setNativeIndex(fText
, 0);
551 * Sets the current iteration position to the end of the text.
552 * @return The text's past-the-end offset.
554 int32_t RuleBasedBreakIterator57::last(void) {
557 fLastRuleStatusIndex
= 0;
558 fLastStatusIndexValid
= TRUE
;
559 return BreakIterator::DONE
;
562 fLastStatusIndexValid
= FALSE
;
563 int32_t pos
= (int32_t)utext_nativeLength(fText
);
564 utext_setNativeIndex(fText
, pos
);
569 * Advances the iterator either forward or backward the specified number of steps.
570 * Negative values move backward, and positive values move forward. This is
571 * equivalent to repeatedly calling next() or previous().
572 * @param n The number of steps to move. The sign indicates the direction
573 * (negative is backwards, and positive is forwards).
574 * @return The character offset of the boundary position n boundaries away from
577 int32_t RuleBasedBreakIterator57::next(int32_t n
) {
578 int32_t result
= current();
591 * Advances the iterator to the next boundary position.
592 * @return The position of the first boundary after this one.
594 int32_t RuleBasedBreakIterator57::next(void) {
595 // if we have cached break positions and we're still in the range
596 // covered by them, just move one step forward in the cache
597 if (fCachedBreakPositions
!= NULL
) {
598 if (fPositionInCache
< fNumCachedBreakPositions
- 1) {
600 int32_t pos
= fCachedBreakPositions
[fPositionInCache
];
601 utext_setNativeIndex(fText
, pos
);
609 int32_t startPos
= current();
610 fDictionaryCharCount
= 0;
611 int32_t result
= handleNext(fData
->fForwardTable
);
612 while (fLineWordOpts
!= UBRK_LINEWORD_NORMAL
) {
613 UChar32 prevChr
= utext_char32At(fText
, result
-1);
614 UChar32 currChr
= utext_char32At(fText
, result
);
615 if (currChr
== U_SENTINEL
|| prevChr
== U_SENTINEL
|| !u_isalpha(currChr
) || !u_isalpha(prevChr
)) {
618 int32_t nextResult
= handleNext(fData
->fForwardTable
);
619 if (nextResult
<= result
) {
624 if (fDictionaryCharCount
> 0) {
625 result
= checkDictionary(startPos
, result
, FALSE
);
631 * Advances the iterator backwards, to the last boundary preceding this one.
632 * @return The position of the last boundary position preceding this one.
634 int32_t RuleBasedBreakIterator57::previous(void) {
638 // if we have cached break positions and we're still in the range
639 // covered by them, just move one step backward in the cache
640 if (fCachedBreakPositions
!= NULL
) {
641 if (fPositionInCache
> 0) {
643 // If we're at the beginning of the cache, need to reevaluate the
645 if (fPositionInCache
<= 0) {
646 fLastStatusIndexValid
= FALSE
;
648 int32_t pos
= fCachedBreakPositions
[fPositionInCache
];
649 utext_setNativeIndex(fText
, pos
);
657 // if we're already sitting at the beginning of the text, return DONE
658 if (fText
== NULL
|| (startPos
= current()) == 0) {
659 fLastRuleStatusIndex
= 0;
660 fLastStatusIndexValid
= TRUE
;
661 return BreakIterator::DONE
;
664 if (fData
->fSafeRevTable
!= NULL
|| fData
->fSafeFwdTable
!= NULL
) {
665 result
= handlePrevious(fData
->fReverseTable
);
666 while (fLineWordOpts
!= UBRK_LINEWORD_NORMAL
) {
667 UChar32 prevChr
= utext_char32At(fText
, result
-1);
668 UChar32 currChr
= utext_char32At(fText
, result
);
669 if (currChr
== U_SENTINEL
|| prevChr
== U_SENTINEL
|| !u_isalpha(currChr
) || !u_isalpha(prevChr
)) {
672 int32_t prevResult
= handlePrevious(fData
->fReverseTable
);
673 if (prevResult
>= result
) {
678 if (fDictionaryCharCount
> 0) {
679 result
= checkDictionary(result
, startPos
, TRUE
);
685 // set things up. handlePrevious() will back us up to some valid
686 // break position before the current position (we back our internal
687 // iterator up one step to prevent handlePrevious() from returning
688 // the current position), but not necessarily the last one before
691 int32_t start
= current();
693 (void)UTEXT_PREVIOUS32(fText
);
694 int32_t lastResult
= handlePrevious(fData
->fReverseTable
);
695 if (lastResult
== UBRK_DONE
) {
697 utext_setNativeIndex(fText
, 0);
701 UBool breakTagValid
= FALSE
;
703 // iterate forward from the known break position until we pass our
704 // starting point. The last break position before the starting
705 // point is our return value
709 if (result
== BreakIterator::DONE
|| result
>= start
) {
713 lastTag
= fLastRuleStatusIndex
;
714 breakTagValid
= TRUE
;
717 // fLastBreakTag wants to have the value for section of text preceding
718 // the result position that we are to return (in lastResult.) If
719 // the backwards rules overshot and the above loop had to do two or more
720 // next()s to move up to the desired return position, we will have a valid
721 // tag value. But, if handlePrevious() took us to exactly the correct result position,
722 // we won't have a tag value for that position, which is only set by handleNext().
724 // Set the current iteration position to be the last break position
725 // before where we started, and then return that value.
726 utext_setNativeIndex(fText
, lastResult
);
727 fLastRuleStatusIndex
= lastTag
; // for use by getRuleStatus()
728 fLastStatusIndexValid
= breakTagValid
;
730 // No need to check the dictionary; it will have been handled by
737 * Sets the iterator to refer to the first boundary position following
738 * the specified position.
739 * @offset The position from which to begin searching for a break position.
740 * @return The position of the first break after the current position.
742 int32_t RuleBasedBreakIterator57::following(int32_t offset
) {
743 // if the offset passed in is already past the end of the text,
744 // just return DONE; if it's before the beginning, return the
745 // text's starting offset
746 if (fText
== NULL
|| offset
>= utext_nativeLength(fText
)) {
750 else if (offset
< 0) {
754 // Move requested offset to a code point start. It might be on a trail surrogate,
755 // or on a trail byte if the input is UTF-8.
756 utext_setNativeIndex(fText
, offset
);
757 offset
= (int32_t)utext_getNativeIndex(fText
);
759 // if we have cached break positions and offset is in the range
760 // covered by them, use them
761 // TODO: could use binary search
762 // TODO: what if offset is outside range, but break is not?
763 if (fCachedBreakPositions
!= NULL
) {
764 if (offset
>= fCachedBreakPositions
[0]
765 && offset
< fCachedBreakPositions
[fNumCachedBreakPositions
- 1]) {
766 fPositionInCache
= 0;
767 // We are guaranteed not to leave the array due to range test above
768 while (offset
>= fCachedBreakPositions
[fPositionInCache
]) {
771 int32_t pos
= fCachedBreakPositions
[fPositionInCache
];
772 utext_setNativeIndex(fText
, pos
);
780 // Set our internal iteration position (temporarily)
781 // to the position passed in. If this is the _beginning_ position,
782 // then we can just use next() to get our return value
786 if (fData
->fSafeRevTable
!= NULL
) {
788 utext_setNativeIndex(fText
, offset
);
789 // move forward one codepoint to prepare for moving back to a
791 // this handles offset being between a supplementary character
792 // TODO: is this still needed, with move to code point boundary handled above?
793 (void)UTEXT_NEXT32(fText
);
794 // handlePrevious will move most of the time to < 1 boundary away
795 handlePrevious(fData
->fSafeRevTable
);
796 int32_t result
= next();
797 while (result
<= offset
) {
802 if (fData
->fSafeFwdTable
!= NULL
) {
803 // backup plan if reverse safe table is not available
804 utext_setNativeIndex(fText
, offset
);
805 (void)UTEXT_PREVIOUS32(fText
);
806 // handle next will give result >= offset
807 handleNext(fData
->fSafeFwdTable
);
808 // previous will give result 0 or 1 boundary away from offset,
811 int32_t oldresult
= previous();
812 while (oldresult
> offset
) {
813 int32_t result
= previous();
814 if (result
<= offset
) {
819 int32_t result
= next();
820 if (result
<= offset
) {
825 // otherwise, we have to sync up first. Use handlePrevious() to back
826 // up to a known break position before the specified position (if
827 // we can determine that the specified position is a break position,
828 // we don't back up at all). This may or may not be the last break
829 // position at or before our starting position. Advance forward
830 // from here until we've passed the starting position. The position
831 // we stop on will be the first break position after the specified one.
834 utext_setNativeIndex(fText
, offset
);
836 (offset
==1 && utext_getNativeIndex(fText
)==0)) {
841 while (result
!= BreakIterator::DONE
&& result
<= offset
) {
849 * Sets the iterator to refer to the last boundary position before the
850 * specified position.
851 * @offset The position to begin searching for a break from.
852 * @return The position of the last boundary before the starting position.
854 int32_t RuleBasedBreakIterator57::preceding(int32_t offset
) {
855 // if the offset passed in is already past the end of the text,
856 // just return DONE; if it's before the beginning, return the
857 // text's starting offset
858 if (fText
== NULL
|| offset
> utext_nativeLength(fText
)) {
861 else if (offset
< 0) {
865 // Move requested offset to a code point start. It might be on a trail surrogate,
866 // or on a trail byte if the input is UTF-8.
867 utext_setNativeIndex(fText
, offset
);
868 offset
= (int32_t)utext_getNativeIndex(fText
);
870 // if we have cached break positions and offset is in the range
871 // covered by them, use them
872 if (fCachedBreakPositions
!= NULL
) {
873 // TODO: binary search?
874 // TODO: What if offset is outside range, but break is not?
875 if (offset
> fCachedBreakPositions
[0]
876 && offset
<= fCachedBreakPositions
[fNumCachedBreakPositions
- 1]) {
877 fPositionInCache
= 0;
878 while (fPositionInCache
< fNumCachedBreakPositions
879 && offset
> fCachedBreakPositions
[fPositionInCache
])
882 // If we're at the beginning of the cache, need to reevaluate the
884 if (fPositionInCache
<= 0) {
885 fLastStatusIndexValid
= FALSE
;
887 utext_setNativeIndex(fText
, fCachedBreakPositions
[fPositionInCache
]);
888 return fCachedBreakPositions
[fPositionInCache
];
895 // if we start by updating the current iteration position to the
896 // position specified by the caller, we can just use previous()
897 // to carry out this operation
899 if (fData
->fSafeFwdTable
!= NULL
) {
901 utext_setNativeIndex(fText
, offset
);
902 int32_t newOffset
= (int32_t)UTEXT_GETNATIVEINDEX(fText
);
903 if (newOffset
!= offset
) {
904 // Will come here if specified offset was not a code point boundary AND
905 // the underlying implementation is using UText, which snaps any non-code-point-boundary
906 // indices to the containing code point.
907 // For BreakIterator::preceding only, these non-code-point indices need to be moved
908 // up to refer to the following codepoint.
909 (void)UTEXT_NEXT32(fText
);
910 offset
= (int32_t)UTEXT_GETNATIVEINDEX(fText
);
913 // TODO: (synwee) would it be better to just check for being in the middle of a surrogate pair,
914 // rather than adjusting the position unconditionally?
915 // (Change would interact with safe rules.)
916 // TODO: change RBBI behavior for off-boundary indices to match that of UText?
917 // affects only preceding(), seems cleaner, but is slightly different.
918 (void)UTEXT_PREVIOUS32(fText
);
919 handleNext(fData
->fSafeFwdTable
);
920 int32_t result
= (int32_t)UTEXT_GETNATIVEINDEX(fText
);
921 while (result
>= offset
) {
926 if (fData
->fSafeRevTable
!= NULL
) {
927 // backup plan if forward safe table is not available
928 // TODO: check whether this path can be discarded
929 // It's probably OK to say that rules must supply both safe tables
930 // if they use safe tables at all. We have certainly never described
931 // to anyone how to work with just one safe table.
932 utext_setNativeIndex(fText
, offset
);
933 (void)UTEXT_NEXT32(fText
);
935 // handle previous will give result <= offset
936 handlePrevious(fData
->fSafeRevTable
);
938 // next will give result 0 or 1 boundary away from offset,
941 int32_t oldresult
= next();
942 while (oldresult
< offset
) {
943 int32_t result
= next();
944 if (result
>= offset
) {
949 int32_t result
= previous();
950 if (result
>= offset
) {
957 utext_setNativeIndex(fText
, offset
);
962 * Returns true if the specified position is a boundary position. As a side
963 * effect, leaves the iterator pointing to the first boundary position at
965 * @param offset the offset to check.
966 * @return True if "offset" is a boundary position.
968 UBool
RuleBasedBreakIterator57::isBoundary(int32_t offset
) {
969 // the beginning index of the iterator is always a boundary position by definition
971 first(); // For side effects on current position, tag values.
975 if (offset
== (int32_t)utext_nativeLength(fText
)) {
976 last(); // For side effects on current position, tag values.
980 // out-of-range indexes are never boundary positions
982 first(); // For side effects on current position, tag values.
986 if (offset
> utext_nativeLength(fText
)) {
987 last(); // For side effects on current position, tag values.
991 // otherwise, we can use following() on the position before the specified
992 // one and return true if the position we get back is the one the user
994 utext_previous32From(fText
, offset
);
995 int32_t backOne
= (int32_t)UTEXT_GETNATIVEINDEX(fText
);
996 UBool result
= following(backOne
) == offset
;
1001 * Returns the current iteration position.
1002 * @return The current iteration position.
1004 int32_t RuleBasedBreakIterator57::current(void) const {
1005 int32_t pos
= (int32_t)UTEXT_GETNATIVEINDEX(fText
);
1009 //=======================================================================
1011 //=======================================================================
//   RBBIRunMode - the state machine runs one extra iteration before the first
//                 and after the last character of user text.  A variable of
//                 this type records which phase the machine is in; user input
//                 is only fetched while in RBBI_RUN.
//
enum RBBIRunMode {
    RBBI_START = 0,   // processing is before the first char of input
    RBBI_RUN   = 1,   // processing is inside the user text
    RBBI_END   = 2    // processing is after the end of the user text
};
//  Map from look-ahead break states (corresponds to rules) to boundary positions.
//  Allows multiple lookahead break rules to be in flight at the same time.
//
//  This is a temporary approach for ICU 57. A better fix is to make the look-ahead numbers
//  in the state table be sequential, then we can just index an array. And the
//  table could also tell us in advance how big that array needs to be.
//
//  Before ICU 57 there was just a single simple variable for a look-ahead match that
//  was in progress. Two rules at once did not work.

// Capacity of the fixed-size key/position table below.
static const int32_t kMaxLookaheads = 8;

struct LookAheadResults {
    int32_t    fUsedSlotLimit;               // number of slots currently in use
    int32_t    fPositions[kMaxLookaheads];   // recorded position for each key
    int16_t    fKeys[kMaxLookaheads];        // look-ahead rule number for each slot

    LookAheadResults() : fUsedSlotLimit(0), fPositions(), fKeys() {}

    // Returns the position recorded for key.  A missing key trips U_ASSERT
    // and yields -1.
    int32_t getPosition(int16_t key) const {
        for (int32_t i = 0; i < fUsedSlotLimit; ++i) {
            if (fKeys[i] == key) {
                return fPositions[i];
            }
        }
        U_ASSERT(FALSE);
        return -1;
    }

    // Records position for key, updating the existing slot if the key is
    // already present and appending a new slot otherwise.  If the table is
    // full, U_ASSERT fires and the last slot is overwritten.
    void setPosition(int16_t key, int32_t position) {
        int32_t i;
        for (i = 0; i < fUsedSlotLimit; ++i) {
            if (fKeys[i] == key) {
                fPositions[i] = position;
                return;
            }
        }
        if (i >= kMaxLookaheads) {
            U_ASSERT(FALSE);
            i = kMaxLookaheads - 1;   // clamp so the writes below stay in bounds
        }
        fKeys[i] = key;
        fPositions[i] = position;
        U_ASSERT(fUsedSlotLimit == i);
        fUsedSlotLimit = i + 1;
    }
};
1073 //-----------------------------------------------------------------------------------
1075 // handleNext(stateTable)
1076 // This method is the actual implementation of the rbbi next() method.
1077 // This method initializes the state machine to state 1
1078 // and advances through the text character by character until we reach the end
1079 // of the text or the state machine transitions to state 0. We update our return
1080 // value every time the state machine passes through an accepting state.
1082 //-----------------------------------------------------------------------------------
1083 int32_t RuleBasedBreakIterator57::handleNext(const RBBIStateTable
*statetable
) {
1085 uint16_t category
= 0;
1088 RBBIStateTableRow
*row
;
1090 LookAheadResults lookAheadMatches
;
1092 int32_t initialPosition
= 0;
1093 const char *tableData
= statetable
->fTableData
;
1094 uint32_t tableRowLen
= statetable
->fRowLen
;
1098 RBBIDebugPuts("Handle Next pos char state category");
1102 // No matter what, handleNext always correctly sets the break tag value.
1103 fLastStatusIndexValid
= TRUE
;
1104 fLastRuleStatusIndex
= 0;
1106 // if we're already at the end of the text, return DONE.
1107 initialPosition
= (int32_t)UTEXT_GETNATIVEINDEX(fText
);
1108 result
= initialPosition
;
1109 c
= UTEXT_NEXT32(fText
);
1110 if (fData
== NULL
|| c
==U_SENTINEL
) {
1111 return BreakIterator::DONE
;
1114 // Set the initial state for the state machine
1115 state
= START_STATE
;
1116 row
= (RBBIStateTableRow
*)
1117 //(statetable->fTableData + (statetable->fRowLen * state));
1118 (tableData
+ tableRowLen
* state
);
1122 if (statetable
->fFlags
& RBBI_BOF_REQUIRED
) {
1128 // loop until we reach the end of the text or transition to state 0
1131 if (c
== U_SENTINEL
) {
1132 // Reached end of input string.
1133 if (mode
== RBBI_END
) {
1134 // We have already run the loop one last time with the
1135 // character set to the pseudo {eof} value. Now it is time
1136 // to unconditionally bail out.
1139 // Run the loop one last time with the fake end-of-input character category.
1145 // Get the char category. An incoming category of 1 or 2 means that
1146 // we are preset for doing the beginning or end of input, and
1147 // that we shouldn't get a category from an actual text input character.
1149 if (mode
== RBBI_RUN
) {
1150 // look up the current character's character category, which tells us
1151 // which column in the state table to look at.
1152 // Note: the 16 in UTRIE_GET16 refers to the size of the data being returned,
1153 // not the size of the character going in, which is a UChar32.
1155 UTRIE_GET16(&fData
->fTrie
, c
, category
);
1157 // Check the dictionary bit in the character's category.
1158 // Counter is only used by dictionary based iterators (subclasses).
1159 // Chars that need to be handled by a dictionary have a flag bit set
1160 // in their category values.
1162 if ((category
& 0x4000) != 0) {
1163 fDictionaryCharCount
++;
1164 // And off the dictionary flag bit.
1165 category
&= ~0x4000;
1171 RBBIDebugPrintf(" %4lld ", utext_getNativeIndex(fText
));
1172 if (0x20<=c
&& c
<0x7f) {
1173 RBBIDebugPrintf("\"%c\" ", c
);
1175 RBBIDebugPrintf("%5x ", c
);
1177 RBBIDebugPrintf("%3d %3d\n", state
, category
);
1181 // State Transition - move machine to its next state
1184 // Note: fNextState is defined as uint16_t[2], but we are casting
1185 // a generated RBBI table to RBBIStateTableRow and some tables
1186 // actually have more than 2 categories.
1187 U_ASSERT(category
<fData
->fHeader
->fCatCount
);
1188 state
= row
->fNextState
[category
]; /*Not accessing beyond memory*/
1189 row
= (RBBIStateTableRow
*)
1190 // (statetable->fTableData + (statetable->fRowLen * state));
1191 (tableData
+ tableRowLen
* state
);
1194 if (row
->fAccepting
== -1) {
1195 // Match found, common case.
1196 if (mode
!= RBBI_START
) {
1197 result
= (int32_t)UTEXT_GETNATIVEINDEX(fText
);
1199 fLastRuleStatusIndex
= row
->fTagIdx
; // Remember the break status (tag) values.
1202 int16_t completedRule
= row
->fAccepting
;
1203 if (completedRule
> 0) {
1204 // Lookahead match is completed.
1205 int32_t lookaheadResult
= lookAheadMatches
.getPosition(completedRule
);
1206 if (lookaheadResult
>= 0) {
1207 fLastRuleStatusIndex
= row
->fTagIdx
;
1208 UTEXT_SETNATIVEINDEX(fText
, lookaheadResult
);
1209 return lookaheadResult
;
1212 int16_t rule
= row
->fLookAhead
;
1214 // At the position of a '/' in a look-ahead match. Record it.
1215 int32_t pos
= (int32_t)UTEXT_GETNATIVEINDEX(fText
);
1216 lookAheadMatches
.setPosition(rule
, pos
);
1219 if (state
== STOP_STATE
) {
1220 // This is the normal exit from the lookup state machine.
1221 // We have advanced through the string until it is certain that no
1222 // longer match is possible, no matter what characters follow.
1226 // Advance to the next character.
1227 // If this is a beginning-of-input loop iteration, don't advance
1228 // the input position. The next iteration will be processing the
1229 // first real input character.
1230 if (mode
== RBBI_RUN
) {
1231 c
= UTEXT_NEXT32(fText
);
1233 if (mode
== RBBI_START
) {
1241 // The state machine is done. Check whether it found a match...
1243 // If the iterator failed to advance in the match engine, force it ahead by one.
1244 // (This really indicates a defect in the break rules. They should always match
1245 // at least one character.)
1246 if (result
== initialPosition
) {
1247 UTEXT_SETNATIVEINDEX(fText
, initialPosition
);
1248 UTEXT_NEXT32(fText
);
1249 result
= (int32_t)UTEXT_GETNATIVEINDEX(fText
);
1252 // Leave the iterator at our result position.
1253 UTEXT_SETNATIVEINDEX(fText
, result
);
1256 RBBIDebugPrintf("result = %d\n\n", result
);
1264 //-----------------------------------------------------------------------------------
1268 // Iterate backwards, according to the logic of the reverse rules.
1269 // This version handles the exact style backwards rules.
1271 // The logic of this function is very similar to handleNext(), above.
1273 //-----------------------------------------------------------------------------------
1274 int32_t RuleBasedBreakIterator57::handlePrevious(const RBBIStateTable
*statetable
) {
1276 uint16_t category
= 0;
1278 RBBIStateTableRow
*row
;
1280 LookAheadResults lookAheadMatches
;
1282 int32_t initialPosition
= 0;
1286 RBBIDebugPuts("Handle Previous pos char state category");
1290 // handlePrevious() never gets the rule status.
1291 // Flag the status as invalid; if the user ever asks for status, we will need
1292 // to back up, then re-find the break position using handleNext(), which does
1293 // get the status value.
1294 fLastStatusIndexValid
= FALSE
;
1295 fLastRuleStatusIndex
= 0;
1297 // if we're already at the start of the text, return DONE.
1298 if (fText
== NULL
|| fData
== NULL
|| UTEXT_GETNATIVEINDEX(fText
)==0) {
1299 return BreakIterator::DONE
;
1302 // Set up the starting char.
1303 initialPosition
= (int32_t)UTEXT_GETNATIVEINDEX(fText
);
1304 result
= initialPosition
;
1305 c
= UTEXT_PREVIOUS32(fText
);
1307 // Set the initial state for the state machine
1308 state
= START_STATE
;
1309 row
= (RBBIStateTableRow
*)
1310 (statetable
->fTableData
+ (statetable
->fRowLen
* state
));
1313 if (statetable
->fFlags
& RBBI_BOF_REQUIRED
) {
1319 // loop until we reach the start of the text or transition to state 0
1322 if (c
== U_SENTINEL
) {
1323 // Reached the start of the input string (we are iterating backwards).
1324 if (mode
== RBBI_END
) {
1325 // We have already run the loop one last time with the
1326 // character set to the pseudo {eof} value. Now it is time
1327 // to unconditionally bail out.
1328 if (result
== initialPosition
) {
1329 // Ran off start, no match found.
1330 // move the index back by one (towards the start, since we are doing a previous())
1331 UTEXT_SETNATIVEINDEX(fText
, initialPosition
);
1332 (void)UTEXT_PREVIOUS32(fText
); // TODO: shouldn't be necessary. We're already at beginning. Check.
1336 // Run the loop one last time with the fake end-of-input character category.
1342 // Get the char category. An incoming category of 1 or 2 means that
1343 // we are preset for doing the beginning or end of input, and
1344 // that we shouldn't get a category from an actual text input character.
1346 if (mode
== RBBI_RUN
) {
1347 // look up the current character's character category, which tells us
1348 // which column in the state table to look at.
1349 // Note: the 16 in UTRIE_GET16 refers to the size of the data being returned,
1350 // not the size of the character going in, which is a UChar32.
1352 UTRIE_GET16(&fData
->fTrie
, c
, category
);
1354 // Check the dictionary bit in the character's category.
1355 // Counter is only used by dictionary based iterators (subclasses).
1356 // Chars that need to be handled by a dictionary have a flag bit set
1357 // in their category values.
1359 if ((category
& 0x4000) != 0) {
1360 fDictionaryCharCount
++;
1361 // And off the dictionary flag bit.
1362 category
&= ~0x4000;
1368 RBBIDebugPrintf(" %4d ", (int32_t)utext_getNativeIndex(fText
));
1369 if (0x20<=c
&& c
<0x7f) {
1370 RBBIDebugPrintf("\"%c\" ", c
);
1372 RBBIDebugPrintf("%5x ", c
);
1374 RBBIDebugPrintf("%3d %3d\n", state
, category
);
1378 // State Transition - move machine to its next state
1381 // Note: fNextState is defined as uint16_t[2], but we are casting
1382 // a generated RBBI table to RBBIStateTableRow and some tables
1383 // actually have more than 2 categories.
1384 U_ASSERT(category
<fData
->fHeader
->fCatCount
);
1385 state
= row
->fNextState
[category
]; /*Not accessing beyond memory*/
1386 row
= (RBBIStateTableRow
*)
1387 (statetable
->fTableData
+ (statetable
->fRowLen
* state
));
1389 if (row
->fAccepting
== -1) {
1390 // Match found, common case.
1391 result
= (int32_t)UTEXT_GETNATIVEINDEX(fText
);
1394 int16_t completedRule
= row
->fAccepting
;
1395 if (completedRule
> 0) {
1396 // Lookahead match is completed.
1397 int32_t lookaheadResult
= lookAheadMatches
.getPosition(completedRule
);
1398 if (lookaheadResult
>= 0) {
1399 UTEXT_SETNATIVEINDEX(fText
, lookaheadResult
);
1400 return lookaheadResult
;
1403 int16_t rule
= row
->fLookAhead
;
1405 // At the position of a '/' in a look-ahead match. Record it.
1406 int32_t pos
= (int32_t)UTEXT_GETNATIVEINDEX(fText
);
1407 lookAheadMatches
.setPosition(rule
, pos
);
1410 if (state
== STOP_STATE
) {
1411 // This is the normal exit from the lookup state machine.
1412 // We have advanced through the string until it is certain that no
1413 // longer match is possible, no matter what characters follow.
1417 // Move (backwards) to the next character to process.
1418 // If this is a beginning-of-input loop iteration, don't advance
1419 // the input position. The next iteration will be processing the
1420 // first real input character.
1421 if (mode
== RBBI_RUN
) {
1422 c
= UTEXT_PREVIOUS32(fText
);
1424 if (mode
== RBBI_START
) {
1430 // The state machine is done. Check whether it found a match...
1432 // If the iterator failed to advance in the match engine, force it ahead by one.
1433 // (This really indicates a defect in the break rules. They should always match
1434 // at least one character.)
1435 if (result
== initialPosition
) {
1436 UTEXT_SETNATIVEINDEX(fText
, initialPosition
);
1437 UTEXT_PREVIOUS32(fText
);
1438 result
= (int32_t)UTEXT_GETNATIVEINDEX(fText
);
1441 // Leave the iterator at our result position.
1442 UTEXT_SETNATIVEINDEX(fText
, result
);
1445 RBBIDebugPrintf("result = %d\n\n", result
);
1453 RuleBasedBreakIterator57::reset()
1455 if (fCachedBreakPositions
) {
1456 uprv_free(fCachedBreakPositions
);
1458 fCachedBreakPositions
= NULL
;
1459 fNumCachedBreakPositions
= 0;
1460 fDictionaryCharCount
= 0;
1461 fPositionInCache
= 0;
1466 //-------------------------------------------------------------------------------
1468 // getRuleStatus() Return the break rule tag associated with the current
1469 // iterator position. If the iterator arrived at its current
1470 // position by iterating forwards, the value will have been
1471 // cached by the handleNext() function.
1473 // If no cached status value is available, the status is
1474 // found by doing a previous() followed by a next(), which
1475 // leaves the iterator where it started, and computes the
1476 // status while doing the next().
1478 //-------------------------------------------------------------------------------
1479 void RuleBasedBreakIterator57::makeRuleStatusValid() {
1480 if (fLastStatusIndexValid
== FALSE
) {
1481 // No cached status is available.
1482 if (fText
== NULL
|| current() == 0) {
1483 // At start of text, or there is no text. Status is always zero.
1484 fLastRuleStatusIndex
= 0;
1485 fLastStatusIndexValid
= TRUE
;
1487 // Not at start of text. Find status the tedious way.
1488 int32_t pa
= current();
1490 if (fNumCachedBreakPositions
> 0) {
1491 reset(); // Blow off the dictionary cache
1493 int32_t pb
= next();
1495 // note: the if (pa != pb) test is here only to eliminate warnings for
1496 // unused local variables on gcc. Logically, it isn't needed.
1501 U_ASSERT(fLastRuleStatusIndex
>= 0 && fLastRuleStatusIndex
< fData
->fStatusMaxIdx
);
1505 int32_t RuleBasedBreakIterator57::getRuleStatus() const {
1506 RuleBasedBreakIterator57
*nonConstThis
= (RuleBasedBreakIterator57
*)this;
1507 nonConstThis
->makeRuleStatusValid();
1509 // fLastRuleStatusIndex indexes to the start of the appropriate status record
1510 // (the number of status values.)
1511 // This function returns the last (largest) of the array of status values.
1512 int32_t idx
= fLastRuleStatusIndex
+ fData
->fRuleStatusTable
[fLastRuleStatusIndex
];
1513 int32_t tagVal
= fData
->fRuleStatusTable
[idx
];
1521 int32_t RuleBasedBreakIterator57::getRuleStatusVec(
1522 int32_t *fillInVec
, int32_t capacity
, UErrorCode
&status
)
1524 if (U_FAILURE(status
)) {
1528 RuleBasedBreakIterator57
*nonConstThis
= (RuleBasedBreakIterator57
*)this;
1529 nonConstThis
->makeRuleStatusValid();
1530 int32_t numVals
= fData
->fRuleStatusTable
[fLastRuleStatusIndex
];
1531 int32_t numValsToCopy
= numVals
;
1532 if (numVals
> capacity
) {
1533 status
= U_BUFFER_OVERFLOW_ERROR
;
1534 numValsToCopy
= capacity
;
1537 for (i
=0; i
<numValsToCopy
; i
++) {
1538 fillInVec
[i
] = fData
->fRuleStatusTable
[fLastRuleStatusIndex
+ i
+ 1];
1545 //-------------------------------------------------------------------------------
1547 // getBinaryRules Access to the compiled form of the rules,
1548 // for use by build system tools that save the data
1549 // for standard iterator types.
1551 //-------------------------------------------------------------------------------
1552 const uint8_t *RuleBasedBreakIterator57::getBinaryRules(uint32_t &length
) {
1553 const uint8_t *retPtr
= NULL
;
1556 if (fData
!= NULL
) {
1557 retPtr
= (const uint8_t *)fData
->fHeader
;
1558 length
= fData
->fHeader
->fLength
;
1564 BreakIterator
* RuleBasedBreakIterator57::createBufferClone(void * /*stackBuffer*/,
1565 int32_t &bufferSize
,
1568 if (U_FAILURE(status
)){
1572 if (bufferSize
== 0) {
1573 bufferSize
= 1; // preflighting for deprecated functionality
1577 BreakIterator
*clonedBI
= clone();
1578 if (clonedBI
== NULL
) {
1579 status
= U_MEMORY_ALLOCATION_ERROR
;
1581 status
= U_SAFECLONE_ALLOCATED_WARNING
;
1583 return (RuleBasedBreakIterator57
*)clonedBI
;
1586 //-------------------------------------------------------------------------------
1588 // checkDictionary This function handles all processing of characters in
1589 // the "dictionary" set. It will determine the appropriate
1590 // course of action, and possibly set up a cache in the
1593 //-------------------------------------------------------------------------------
1594 int32_t RuleBasedBreakIterator57::checkDictionary(int32_t startPos
,
1597 // Reset the old break cache first.
1600 // note: code segment below assumes that dictionary chars are in the
1601 // startPos-endPos range
1602 // value returned should be next character in sequence
1603 if ((endPos
- startPos
) <= 1) {
1604 return (reverse
? startPos
: endPos
);
1607 // Starting from the starting point, scan towards the proposed result,
1608 // looking for the first dictionary character (which may be the one
1609 // we're on, if we're starting in the middle of a range).
1610 utext_setNativeIndex(fText
, reverse
? endPos
: startPos
);
1612 UTEXT_PREVIOUS32(fText
);
1615 int32_t rangeStart
= startPos
;
1616 int32_t rangeEnd
= endPos
;
1620 UErrorCode status
= U_ZERO_ERROR
;
1621 UVector32
breaks(status
); // changed from UStack in ICU 57
1622 int32_t foundBreakCount
= 0;
1623 UChar32 c
= utext_current32(fText
);
1625 UTRIE_GET16(&fData
->fTrie
, c
, category
);
1627 // Is the character we're starting on a dictionary character? If so, we
1628 // need to back up to include the entire run; otherwise the results of
1629 // the break algorithm will differ depending on where we start. Since
1630 // the result is cached and there is typically a non-dictionary break
1631 // within a small number of words, there should be little performance impact.
1632 if (category
& 0x4000) {
1635 utext_next32(fText
); // TODO: recast to work directly with postincrement.
1636 c
= utext_current32(fText
);
1637 UTRIE_GET16(&fData
->fTrie
, c
, category
);
1638 } while (c
!= U_SENTINEL
&& (category
& 0x4000));
1639 // Back up to the last dictionary character
1640 rangeEnd
= (int32_t)UTEXT_GETNATIVEINDEX(fText
);
1641 if (c
== U_SENTINEL
) {
1642 // c = fText->last32();
1643 // TODO: why was this if needed?
1644 c
= UTEXT_PREVIOUS32(fText
);
1647 c
= UTEXT_PREVIOUS32(fText
);
1652 c
= UTEXT_PREVIOUS32(fText
);
1653 UTRIE_GET16(&fData
->fTrie
, c
, category
);
1655 while (c
!= U_SENTINEL
&& (category
& 0x4000));
1656 // Back up to the last dictionary character
1657 if (c
== U_SENTINEL
) {
1658 // c = fText->first32();
1659 c
= utext_current32(fText
);
1662 utext_next32(fText
);
1663 c
= utext_current32(fText
);
1665 rangeStart
= (int32_t)UTEXT_GETNATIVEINDEX(fText
);;
1667 UTRIE_GET16(&fData
->fTrie
, c
, category
);
1670 // Loop through the text, looking for ranges of dictionary characters.
1671 // For each span, find the appropriate break engine, and ask it to find
1672 // any breaks within the span.
1673 // Note: we always do this in the forward direction, so that the break
1674 // cache is built in the right order.
1676 utext_setNativeIndex(fText
, rangeStart
);
1677 c
= utext_current32(fText
);
1678 UTRIE_GET16(&fData
->fTrie
, c
, category
);
1680 while(U_SUCCESS(status
)) {
1681 while((current
= (int32_t)UTEXT_GETNATIVEINDEX(fText
)) < rangeEnd
&& (category
& 0x4000) == 0) {
1682 utext_next32(fText
); // TODO: tweak for post-increment operation
1683 c
= utext_current32(fText
);
1684 UTRIE_GET16(&fData
->fTrie
, c
, category
);
1686 if (current
>= rangeEnd
) {
1690 // We now have a dictionary character. Get the appropriate language object
1692 const LanguageBreakEngine
*lbe
= getLanguageBreakEngine(c
);
1694 // Ask the language object if there are any breaks. It will leave the text
1695 // pointer on the other side of its range, ready to search for the next one.
1697 foundBreakCount
+= lbe
->findBreaks(fText
, rangeStart
, rangeEnd
, breaks
);
1700 // Reload the loop variables for the next go-round
1701 c
= utext_current32(fText
);
1702 UTRIE_GET16(&fData
->fTrie
, c
, category
);
1705 // If we found breaks, build a new break cache. The first and last entries must
1706 // be the original starting and ending position.
1707 if (foundBreakCount
> 0) {
1708 U_ASSERT(foundBreakCount
== breaks
.size());
1709 int32_t totalBreaks
= foundBreakCount
;
1710 if (startPos
< breaks
.elementAti(0)) {
1713 if (endPos
> breaks
.peeki()) {
1716 fCachedBreakPositions
= (int32_t *)uprv_malloc(totalBreaks
* sizeof(int32_t));
1717 if (fCachedBreakPositions
!= NULL
) {
1719 fNumCachedBreakPositions
= totalBreaks
;
1720 if (startPos
< breaks
.elementAti(0)) {
1721 fCachedBreakPositions
[out
++] = startPos
;
1723 for (int32_t i
= 0; i
< foundBreakCount
; ++i
) {
1724 fCachedBreakPositions
[out
++] = breaks
.elementAti(i
);
1726 if (endPos
> fCachedBreakPositions
[out
-1]) {
1727 fCachedBreakPositions
[out
] = endPos
;
1729 // If there are breaks, then by definition, we are replacing the original
1730 // proposed break by one of the breaks we found. Use following() and
1731 // preceding() to do the work. They should never recurse in this case.
1733 return preceding(endPos
);
1736 return following(startPos
);
1739 // If the allocation failed, just fall through to the "no breaks found" case.
1742 // If we get here, there were no language-based breaks. Set the text pointer
1743 // to the original proposed break.
1744 utext_setNativeIndex(fText
, reverse
? startPos
: endPos
);
1745 return (reverse
? startPos
: endPos
);
1751 static icu::UStack
*gLanguageBreakFactories
= NULL
;
1752 static icu::UInitOnce gLanguageBreakFactoriesInitOnce
= U_INITONCE_INITIALIZER
;
1755 * Release all static memory held by breakiterator.
1758 static UBool U_CALLCONV
breakiterator_cleanup_dict(void) {
1759 if (gLanguageBreakFactories
) {
1760 delete gLanguageBreakFactories
;
1761 gLanguageBreakFactories
= NULL
;
1763 gLanguageBreakFactoriesInitOnce
.reset();
1769 static void U_CALLCONV
_deleteFactory(void *obj
) {
1770 delete (icu::LanguageBreakFactory
*) obj
;
1775 static void U_CALLCONV
initLanguageFactories() {
1776 UErrorCode status
= U_ZERO_ERROR
;
1777 U_ASSERT(gLanguageBreakFactories
== NULL
);
1778 gLanguageBreakFactories
= new UStack(_deleteFactory
, NULL
, status
);
1779 if (gLanguageBreakFactories
!= NULL
&& U_SUCCESS(status
)) {
1780 ICULanguageBreakFactory
*builtIn
= new ICULanguageBreakFactory(status
);
1781 gLanguageBreakFactories
->push(builtIn
, status
);
1782 #ifdef U_LOCAL_SERVICE_HOOK
1783 LanguageBreakFactory
*extra
= (LanguageBreakFactory
*)uprv_svc_hook("languageBreakFactory", &status
);
1784 if (extra
!= NULL
) {
1785 gLanguageBreakFactories
->push(extra
, status
);
1789 ucln_common_registerCleanup(UCLN_COMMON_RBBI57
, breakiterator_cleanup_dict
);
1793 static const LanguageBreakEngine
*
1794 getLanguageBreakEngineFromFactory(UChar32 c
, int32_t breakType
)
1796 umtx_initOnce(gLanguageBreakFactoriesInitOnce
, &initLanguageFactories
);
1797 if (gLanguageBreakFactories
== NULL
) {
1801 int32_t i
= gLanguageBreakFactories
->size();
1802 const LanguageBreakEngine
*lbe
= NULL
;
1804 LanguageBreakFactory
*factory
= (LanguageBreakFactory
*)(gLanguageBreakFactories
->elementAt(i
));
1805 lbe
= factory
->getEngineFor(c
);
1814 //-------------------------------------------------------------------------------
1816 // getLanguageBreakEngine Find an appropriate LanguageBreakEngine for the
1819 //-------------------------------------------------------------------------------
1820 const LanguageBreakEngine
*
1821 RuleBasedBreakIterator57::getLanguageBreakEngine(UChar32 c
) {
1822 const LanguageBreakEngine
*lbe
= NULL
;
1823 UErrorCode status
= U_ZERO_ERROR
;
1825 if (fLanguageBreakEngines
== NULL
) {
1826 fLanguageBreakEngines
= new UStack(status
);
1827 if (fLanguageBreakEngines
== NULL
|| U_FAILURE(status
)) {
1828 delete fLanguageBreakEngines
;
1829 fLanguageBreakEngines
= 0;
1834 int32_t i
= fLanguageBreakEngines
->size();
1836 lbe
= (const LanguageBreakEngine
*)(fLanguageBreakEngines
->elementAt(i
));
1837 if (lbe
->handles(c
)) {
1842 // No existing dictionary took the character. See if a factory wants to
1843 // give us a new LanguageBreakEngine for this character.
1844 lbe
= getLanguageBreakEngineFromFactory(c
, fBreakType
);
1846 // If we got one, use it and push it on our stack.
1848 fLanguageBreakEngines
->push((void *)lbe
, status
);
1849 // Even if we can't remember it, we can keep looking it up, so
1850 // return it even if the push fails.
1854 // No engine is forthcoming for this character. Add it to the
1855 // reject set. Create the reject break engine if needed.
1856 if (fUnhandledBreakEngine
== NULL
) {
1857 fUnhandledBreakEngine
= new UnhandledEngine(status
);
1858 if (U_SUCCESS(status
) && fUnhandledBreakEngine
== NULL
) {
1859 status
= U_MEMORY_ALLOCATION_ERROR
;
1861 // Put it last so that scripts for which we have an engine get tried
1863 fLanguageBreakEngines
->insertElementAt(fUnhandledBreakEngine
, 0, status
);
1864 // If we can't insert it, or creation failed, get rid of it
1865 if (U_FAILURE(status
)) {
1866 delete fUnhandledBreakEngine
;
1867 fUnhandledBreakEngine
= 0;
1872 // Tell the reject engine about the character; at its discretion, it may
1873 // add more than just the one character.
1874 fUnhandledBreakEngine
->handleCharacter(c
);
1876 return fUnhandledBreakEngine
;
1879 void RuleBasedBreakIterator57::setBreakType(int32_t type
) {
1886 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */