#include "unicode/udata.h"
#include "unicode/parseerr.h"
#include "unicode/schriter.h"
-#include "unicode/uchriter.h"
-
-
-struct UTrie;
+// for Apple addition:
+#include "unicode/urbtok.h"
#if U_SHOW_CPLUSPLUS_API
U_NAMESPACE_BEGIN
/** @internal */
+class LanguageBreakEngine;
struct RBBIDataHeader;
-class RuleBasedBreakIteratorTables;
-class BreakIterator;
class RBBIDataWrapper;
-class UStack;
-class LanguageBreakEngine;
class UnhandledEngine;
-struct RBBIStateTable;
-
-
-
+class UStack;
/**
*
*/
class U_COMMON_API RuleBasedBreakIterator /*U_FINAL*/ : public BreakIterator {
-// The following was changed from protected to private per #12071.
-// However Apple code needs these, so change back to protected.
-protected:
+private:
/**
* The UText through which this BreakIterator accesses the text
- * @internal
- */
- UText *fText;
-
- /**
- * A character iterator that refers to the same text as the UText, above.
- * Only included for compatibility with old API, which was based on CharacterIterators.
- * Value may be adopted from outside, or one of fSCharIter or fDCharIter, below.
- */
- CharacterIterator *fCharIter;
-
- /**
- * When the input text is provided by a UnicodeString, this will point to
- * a characterIterator that wraps that data. Needed only for the
- * implementation of getText(), a backwards compatibility issue.
+ * @internal (private)
*/
- StringCharacterIterator *fSCharIter;
-
- /**
- * When the input text is provided by a UText, this
- * dummy CharacterIterator over an empty string will
- * be returned from getText()
- */
- UCharCharacterIterator *fDCharIter;
+ UText fText;
+#ifndef U_HIDE_INTERNAL_API
+public:
+#endif /* U_HIDE_INTERNAL_API */
/**
- * The rule data for this BreakIterator instance
+ * The rule data for this BreakIterator instance.
+ * Not for general use; Public only for testing purposes.
* @internal
*/
RBBIDataWrapper *fData;
-
- /** Index of the Rule {tag} values for the most recent match.
- * @internal
- */
- int32_t fLastRuleStatusIndex;
+private:
/**
- * Rule tag value valid flag.
- * Some iterator operations don't intrinsically set the correct tag value.
- * This flag lets us lazily compute the value if we are ever asked for it.
- * @internal
+ * Character categories for the Latin1 subset of Unicode
+ * @internal Apple-only
*/
- UBool fLastStatusIndexValid;
+ uint16_t *fLatin1Cat;
/**
- * Counter for the number of characters encountered with the "dictionary"
- * flag set.
- * @internal
- */
- uint32_t fDictionaryCharCount;
+ * The current position of the iterator. Pinned, 0 <= fPosition <= text.length.
+ * Never has the value UBRK_DONE (-1).
+ */
+ int32_t fPosition;
/**
- * When a range of characters is divided up using the dictionary, the break
- * positions that are discovered are stored here, preventing us from having
- * to use either the dictionary or the state table again until the iterator
- * leaves this range of text. Has the most impact for line breaking.
- * @internal
- */
- int32_t* fCachedBreakPositions;
+ * Info from the state table indicating which rules caused the boundary
+ * at the current position.
+ */
+ int32_t fRuleStatusIndex;
/**
- * The number of elements in fCachedBreakPositions
- * @internal
+ * Cache of previously determined boundary positions.
*/
- int32_t fNumCachedBreakPositions;
+ class BreakCache;
+ BreakCache *fBreakCache;
/**
- * if fCachedBreakPositions is not null, this indicates which item in the
- * cache the current iteration position refers to
- * @internal
+ * Cache of boundary positions within a region of text that has been
+ * sub-divided by dictionary based breaking.
*/
- int32_t fPositionInCache;
+ class DictionaryCache;
+ DictionaryCache *fDictionaryCache;
/**
*
UnhandledEngine *fUnhandledBreakEngine;
/**
- *
- * The type of the break iterator, or -1 if it has not been set.
+ * Counter for the number of characters encountered with the "dictionary"
+ * flag set.
* @internal
*/
- int32_t fBreakType;
+ uint32_t fDictionaryCharCount;
- //=======================================================================
- // constructors
- //=======================================================================
+ /**
+ * A character iterator that refers to the same text as the UText, above.
+ * Only included for compatibility with old API, which was based on CharacterIterators.
+ * Value may be adopted from outside, or may refer to fSCharIter, below.
+ */
+ CharacterIterator *fCharIter;
/**
- * Open-source ICU eliminated this enum in #12071, restored here since Apple needs it
- * Constant to be used in the constructor
- * RuleBasedBreakIterator(RBBIDataHeader*, EDontAdopt, UErrorCode &);
- * which does not adopt the memory indicated by the RBBIDataHeader*
- * parameter.
- *
- * @internal
+ * When the input text is provided by a UnicodeString, this is
+ * a CharacterIterator that wraps that data. Needed only for the
+ * implementation of getText(), a backwards compatibility issue.
*/
- enum EDontAdopt {
- kDontAdopt
- };
+ StringCharacterIterator fSCharIter;
+
+ /**
+ * True when iteration has run off the end, and iterator functions should return UBRK_DONE.
+ */
+ UBool fDone;
+ //=======================================================================
+ // constructors
+ //=======================================================================
+
+// The following is intended to be private in open-source.
+// However Apple needs it to be public for urbtok.cpp
+public:
/**
* Constructor from a flattened set of RBBI data in malloced memory.
* RulesBasedBreakIterators built from a custom set of rules
* @internal
*/
RuleBasedBreakIterator(RBBIDataHeader* data, UErrorCode &status);
+private:
- /**
- * Open-source ICU eliminated this method in #12071, restored here since Apple needs it
- * Constructor from a flattened set of RBBI data in memory which need not
- * be malloced (e.g. it may be a memory-mapped file, etc.).
- *
- * This version does not adopt the memory, and does not
- * free it when done.
- * @internal
- */
- RuleBasedBreakIterator(const RBBIDataHeader* data, enum EDontAdopt dontAdopt, UErrorCode &status);
-
-
+ /** @internal */
friend class RBBIRuleBuilder;
/** @internal */
friend class BreakIterator;
-
-
public:
/** Default constructor. Creates an empty shell of an iterator, with no
UErrorCode &status);
/**
- * Contruct a RuleBasedBreakIterator from a set of precompiled binary rules.
+ * Construct a RuleBasedBreakIterator from a set of precompiled binary rules.
* Binary rules are obtained from RulesBasedBreakIterator::getBinaryRules().
* Construction of a break iterator in this way is substantially faster than
- * constuction from source rules.
+ * construction from source rules.
*
* Ownership of the storage containing the compiled rules remains with the
* caller of this function. The compiled rules must not be modified or
* deleted during the life of the break iterator.
*
* The compiled rules are not compatible across different major versions of ICU.
- * The compiled rules are comaptible only between machines with the same
+ * The compiled rules are compatible only between machines with the same
* byte ordering (little or big endian) and the same base character set family
* (ASCII or EBCDIC).
*
* behavior, and iterating over the same text, as this one.
* Differs from the copy constructor in that it is polymorphic, and
* will correctly clone (copy) a derived class.
- * clone() is thread safe. Multiple threads may simultaeneously
+ * clone() is thread safe. Multiple threads may simultaneously
* clone the same source break iterator.
* @return a newly-constructed RuleBasedBreakIterator
* @stable ICU 2.0
virtual int32_t preceding(int32_t offset);
/**
- * Returns true if the specfied position is a boundary position. As a side
+ * Returns true if the specified position is a boundary position. As a side
* effect, leaves the iterator pointing to the first boundary position at
* or after "offset".
* @param offset the offset to check.
virtual UBool isBoundary(int32_t offset);
/**
- * Returns the current iteration position.
+ * Returns the current iteration position. Note that UBRK_DONE is never
+ * returned from this function; if iteration has run to the end of a
+ * string, current() will return the length of the string while
+ * next() will return UBRK_DONE.
* @return The current iteration position.
* @stable ICU 2.0
*/
/**
- * Return the status tag from the break rule that determined the most recently
- * returned break position. For break rules that do not specify a
+ * Return the status tag from the break rule that determined the boundary at
+ * the current iteration position. For break rules that do not specify a
* status, a default value of 0 is returned. If more than one break rule
* would cause a boundary to be located at some position in the text,
* the numerically largest of the applicable status values is returned.
* position from <code>next()</code>, <code>previous()</code>, or
* any other break iterator functions that returns a boundary position.
* <p>
+ * Note that <code>getRuleStatus()</code> returns the value corresponding to
+ * <code>current()</code> index even after <code>next()</code> has returned DONE.
+ * <p>
* When creating custom break rules, one is free to define whatever
* status values may be convenient for the application.
* <p>
- * Note: this function is not thread safe. It should not have been
- * declared const, and the const remains only for compatibility
- * reasons. (The function is logically const, but not bit-wise const).
- * <p>
- * @return the status from the break rule that determined the most recently
- * returned break position.
+ * @return the status from the break rule that determined the boundary
+ * at the current iteration position.
*
* @see UWordBreak
* @stable ICU 2.2
virtual int32_t getRuleStatus() const;
/**
- * Get the status (tag) values from the break rule(s) that determined the most
- * recently returned break position.
+ * Get the status (tag) values from the break rule(s) that determined the boundary
+ * at the current iteration position.
* <p>
* The returned status value(s) are stored into an array provided by the caller.
* The values are stored in sorted (ascending) order.
* @param fillInVec an array to be filled in with the status values.
* @param capacity the length of the supplied vector. A length of zero causes
* the function to return the number of status values, in the
- * normal way, without attemtping to store any values.
+ * normal way, without attempting to store any values.
* @param status receives error codes.
- * @return The number of rule status values from rules that determined
- * the most recent boundary returned by the break iterator.
+ * @return The number of rule status values from the rules that determined
+ * the boundary at the current iteration position.
* In the event of a U_BUFFER_OVERFLOW_ERROR, the return value
* is the total number of status values that were available,
* not the reduced number that were actually returned.
*/
virtual int32_t getRuleStatusVec(int32_t *fillInVec, int32_t capacity, UErrorCode &status);
+ /**
+ * Apple custom extension
+ * Initializes Latin1 category
+ * @internal
+ */
+ void initLatin1Cat(void);
+
+ /**
+ * Apple custom extension
+ * Fetch the next set of tokens.
+ * @param maxTokens The maximum number of tokens to return.
+ * @param outTokenRanges Pointer to output array of token ranges.
+ * @param outTokenFlags (optional) pointer to output array of token flags.
+ * @internal
+ */
+ int32_t tokenize(int32_t maxTokens, RuleBasedTokenRange *outTokenRanges, unsigned long *outTokenFlags);
+
/**
* Returns a unique class ID POLYMORPHICALLY. Pure virtual override.
* This method is to implement a simple version of RTTI, since not all
*
* Create a clone (copy) of this break iterator in memory provided
* by the caller. The idea is to increase performance by avoiding
- * a storage allocation. Use of this functoin is NOT RECOMMENDED.
+ * a storage allocation. Use of this function is NOT RECOMMENDED.
* Performance gains are minimal, and correct buffer management is
* tricky. Use clone() instead.
*
* storage for the cloned object.
*
* @param status Error status. U_SAFECLONE_ALLOCATED_WARNING will be
- * returned if the the provided buffer was too small, and
+ * returned if the provided buffer was too small, and
* the clone was therefore put on the heap.
*
* @return Pointer to the clone object. This may differ from the stackBuffer
* The binary data can only be used with the same version of ICU
* and on the same platform type (processor endian-ness)
*
- * @param length Returns the length of the binary data. (Out paramter.)
+ * @param length Returns the length of the binary data. (Out parameter.)
*
* @return A pointer to the binary (compiled) rule data. The storage
* belongs to the RulesBasedBreakIterator object, not the
virtual RuleBasedBreakIterator &refreshInputText(UText *input, UErrorCode &status);
-// The following was changed from protected to private per #12071.
-// However Apple code needs these, so change back to protected.
-protected:
+private:
//=======================================================================
// implementation
//=======================================================================
/**
* Dumps caches and performs other actions associated with a complete change
* in text or iteration position.
- * @internal
+ * @internal (private)
*/
void reset(void);
- /**
- * Set the type of the break iterator.
- * @internal
- */
- void setBreakType(int32_t type);
-
/**
* Common initialization function, used by constructors and bufferClone.
- * @internal
+ * @internal (private)
*/
- void init();
-
-private:
-
- /**
- * This method backs the iterator back up to a "safe position" in the text.
- * This is a position that we know, without any context, must be a break position.
- * The various calling methods then iterate forward from this safe position to
- * the appropriate position to return. (For more information, see the description
- * of buildBackwardsStateTable() in RuleBasedBreakIterator.Builder.)
- * @param statetable state table used of moving backwards
- * @internal
- */
- int32_t handlePrevious(const RBBIStateTable *statetable);
+ void init(UErrorCode &status);
/**
- * This method is the actual implementation of the next() method. All iteration
- * vectors through here. This method initializes the state machine to state 1
- * and advances through the text character by character until we reach the end
- * of the text or the state machine transitions to state 0. We update our return
- * value every time the state machine passes through a possible end state.
- * @param statetable state table used of moving forwards
- * @internal
+ * Iterate backwards from an arbitrary position in the input text using the
+ * synthesized Safe Reverse rules.
+ * This locates a "Safe Position" from which the forward break rules
+ * will operate correctly. A Safe Position is not necessarily a boundary itself.
+ *
+ * @param fromPosition the position in the input text to begin the iteration.
+ * @internal (private)
*/
- int32_t handleNext(const RBBIStateTable *statetable);
-
-protected:
+ int32_t handleSafePrevious(int32_t fromPosition);
/**
- * This is the function that actually implements dictionary-based
- * breaking. Covering at least the range from startPos to endPos,
- * it checks for dictionary characters, and if it finds them determines
- * the appropriate object to deal with them. It may cache found breaks in
- * fCachedBreakPositions as it goes. It may well also look at text outside
- * the range startPos to endPos.
- * If going forward, endPos is the normal Unicode break result, and
- * if goind in reverse, startPos is the normal Unicode break result
- * @param startPos The start position of a range of text
- * @param endPos The end position of a range of text
- * @param reverse The call is for the reverse direction
- * @internal
+ * Find a rule-based boundary by running the state machine.
+ * Input
+ * fPosition, the position in the text to begin from.
+ * Output
+ * fPosition: the boundary following the starting position.
+ * fDictionaryCharCount the number of dictionary characters encountered.
+ * If > 0, the segment will be further subdivided
+ * fRuleStatusIndex Info from the state table indicating which rules caused the boundary.
+ *
+ * @internal (private)
*/
- int32_t checkDictionary(int32_t startPos, int32_t endPos, UBool reverse);
+ int32_t handleNext();
+ int32_t handleNextInternal();
-private:
/**
* This function returns the appropriate LanguageBreakEngine for a
* given character c.
* @param c A character in the dictionary set
- * @internal
+ * @internal (private)
*/
const LanguageBreakEngine *getLanguageBreakEngine(UChar32 c);
+ public:
+#ifndef U_HIDE_INTERNAL_API
/**
- * @internal
+ * Debugging function only.
+ * @internal
+ */
+ void dumpCache();
+
+ /**
+ * Debugging function only.
+ * @internal
*/
- void makeRuleStatusValid();
+ void dumpTables();
+#endif /* U_HIDE_INTERNAL_API */
};
//------------------------------------------------------------------------------