/*
********************************************************************************
-* Copyright (C) 1997-2006, International Business Machines
+* Copyright (C) 1997-2016, International Business Machines
* Corporation and others. All Rights Reserved.
********************************************************************************
*
* \file
* \brief C++ API: Break Iterator.
*/
-
+
#if UCONFIG_NO_BREAK_ITERATION
U_NAMESPACE_BEGIN
* plain C API with equivalent functionality is defined in the
* file ubrk.h
* <p>
- * Code snippits illustrating the use of the Break Iterator APIs
- * are available in the ICU User Guide,
- * http://icu.sourceforge.net/userguide/boundaryAnalysis.html
- * and in the sample program icu/source/samples/break/break.cpp"
+ * Code snippets illustrating the use of the Break Iterator APIs
+ * are available in the ICU User Guide,
+ * http://icu-project.org/userguide/boundaryAnalysis.html
+ * and in the sample program icu/source/samples/break/break.cpp
*
*/
class U_COMMON_API BreakIterator : public UObject {
* @param status receives any error codes.
* @return The current UText for this break iterator. If an input
* UText was provided, it will always be returned.
- * @draft ICU 3.4
+ * @stable ICU 3.4
*/
virtual UText *getUText(UText *fillIn, UErrorCode &status) const = 0;
virtual void setText(const UnicodeString &text) = 0;
/**
- * Reset the break iterator to operate over the text represented by
+ * Reset the break iterator to operate over the text represented by
* the UText. The iterator position is reset to the start.
*
* This function makes a shallow clone of the supplied UText. This means
* UText that was passed as a parameter, but that the underlying text itself
* must not be altered while being referenced by the break iterator.
*
+ * All index positions returned by break iterator functions are
+ * native indices from the UText. For example, when breaking UTF-8
+ * encoded text, the break positions returned by next(), previous(), etc.
+ * will be UTF-8 string indices, not UTF-16 positions.
+ *
* @param text The UText used to change the text.
* @param status receives any error codes.
- * @draft ICU 3.4
+ * @stable ICU 3.4
*/
virtual void setText(UText *text, UErrorCode &status) = 0;
};
/**
- * Return the index of the first character in the text being scanned.
+ * Sets the current iteration position to the beginning of the text, position zero.
+ * @return The offset of the beginning of the text, zero.
* @stable ICU 2.0
*/
virtual int32_t first(void) = 0;
/**
- * Return the index immediately BEYOND the last character in the text being scanned.
+ * Set the iterator position to the index immediately BEYOND the last character in the text being scanned.
+ * @return The index immediately BEYOND the last character in the text being scanned.
* @stable ICU 2.0
*/
virtual int32_t last(void) = 0;
/**
- * Return the boundary preceding the current boundary.
+ * Set the iterator position to the boundary preceding the current boundary.
* @return The character index of the previous text boundary or DONE if all
* boundaries have been returned.
* @stable ICU 2.0
virtual int32_t previous(void) = 0;
/**
- * Return the boundary following the current boundary.
+ * Advance the iterator to the boundary following the current boundary.
* @return The character index of the next text boundary or DONE if all
* boundaries have been returned.
* @stable ICU 2.0
virtual int32_t current(void) const = 0;
/**
- * Return the first boundary following the specified offset.
+ * Advance the iterator to the first boundary following the specified offset.
* The value returned is always greater than the offset or
* the value BreakIterator::DONE
* @param offset the offset to begin scanning.
virtual int32_t following(int32_t offset) = 0;
/**
- * Return the first boundary preceding the specified offset.
+ * Set the iterator position to the first boundary preceding the specified offset.
* The value returned is always smaller than the offset or
* the value BreakIterator::DONE
* @param offset the offset to begin scanning.
virtual UBool isBoundary(int32_t offset) = 0;
/**
- * Return the nth boundary from the current boundary
- * @param n which boundary to return. A value of 0
+ * Set the iterator position to the nth boundary from the current boundary
+ * @param n the number of boundaries to move by. A value of 0
* does nothing. Negative values move to previous boundaries
* and positive values move to later boundaries.
- * @return The index of the nth boundary from the current position, or
+ * @return The new iterator position, or
* DONE if there are fewer than |n| boundaries in the specified direction.
* @stable ICU 2.0
*/
virtual int32_t next(int32_t n) = 0;
+ /**
+ * For RuleBasedBreakIterators, return the status tag from the
+ * break rule that determined the most recently
+ * returned break position.
+ * <p>
+ * For break iterator types that do not support a rule status,
+ * a default value of 0 is returned.
+ * <p>
+ * @return the status from the break rule that determined the most recently
+ * returned break position.
+ * @see RuleBasedBreakIterator::getRuleStatus()
+ * @see UWordBreak
+ * @stable ICU 52
+ */
+ virtual int32_t getRuleStatus() const;
+
+ /**
+ * For RuleBasedBreakIterators, get the status (tag) values from the break rule(s)
+ * that determined the most recently returned break position.
+ * <p>
+ * For break iterator types that do not support rule status,
+ * no values are returned.
+ * <p>
+ * The returned status value(s) are stored into an array provided by the caller.
+ * The values are stored in sorted (ascending) order.
+ * If the capacity of the output array is insufficient to hold the data,
+ * the output will be truncated to the available length, and a
+ * U_BUFFER_OVERFLOW_ERROR will be signaled.
+ * <p>
+ * @see RuleBasedBreakIterator::getRuleStatusVec
+ *
+ * @param fillInVec an array to be filled in with the status values.
+ * @param capacity the length of the supplied vector. A length of zero causes
+ * the function to return the number of status values, in the
+ * normal way, without attempting to store any values.
+ * @param status receives error codes.
+ * @return The number of rule status values from rules that determined
+ * the most recent boundary returned by the break iterator.
+ * In the event of a U_BUFFER_OVERFLOW_ERROR, the return value
+ * is the total number of status values that were available,
+ * not the reduced number that were actually returned.
+ * @see getRuleStatus
+ * @stable ICU 52
+ */
+ virtual int32_t getRuleStatusVec(int32_t *fillInVec, int32_t capacity, UErrorCode &status);
+
/**
* Create BreakIterator for word-breaks using the given locale.
* Returns an instance of a BreakIterator implementing word breaks.
UnicodeString& name);
/**
+ * Deprecated functionality. Use clone() instead.
+ *
* Thread safe client-buffer-based cloning operation
* Do NOT call delete on a safeclone, since 'new' is not used to create it.
* @param stackBuffer user allocated space for the new clone. If NULL, new memory will be allocated.
* necessary.
* @return pointer to the new clone
*
- * @stable ICU 2.0
+ * @deprecated ICU 52. Use clone() instead.
*/
virtual BreakIterator * createBufferClone(void *stackBuffer,
int32_t &BufferSize,
UErrorCode &status) = 0;
+#ifndef U_HIDE_DEPRECATED_API
+
/**
* Determine whether the BreakIterator was created in user memory by
* createBufferClone(), and thus should not be deleted. Such objects
* must be closed by an explicit call to the destructor (not delete).
- * @stable ICU 2.0
+ * @deprecated ICU 52. Always delete the BreakIterator.
*/
inline UBool isBufferClone(void);
+#endif /* U_HIDE_DEPRECATED_API */
+
#if !UCONFIG_NO_SERVICE
/**
* Register a new break iterator of the indicated kind, to use in the given locale.
* The break iterator will be adopted. Clones of the iterator will be returned
* if a request for a break iterator of the given kind matches or falls back to
* this locale.
+ * Because ICU may choose to cache BreakIterators internally, this must
+ * be called at application startup, prior to any calls to
+ * BreakIterator::createXXXInstance to avoid undefined behavior.
* @param toAdopt the BreakIterator instance to be adopted
* @param locale the Locale for which this instance is to be registered
* @param kind the type of iterator for which this instance is to be registered
* Unregister a previously-registered BreakIterator using the key returned from the
* register call. Key becomes invalid after a successful call and should not be used again.
* The BreakIterator corresponding to the key will be deleted.
+ * Because ICU may choose to cache BreakIterators internally, this should
+ * be called during application shutdown, after all calls to
+ * BreakIterator::createXXXInstance to avoid undefined behavior.
* @param key the registry key returned by a previous call to registerInstance
* @param status the in/out status code, no special meanings are assigned
* @return TRUE if the iterator for the key was successfully unregistered
*/
Locale getLocale(ULocDataLocaleType type, UErrorCode& status) const;
+#ifndef U_HIDE_INTERNAL_API
/** Get the locale for this break iterator object. You can choose between valid and actual locale.
* @param type type of the locale we're looking for (valid or actual)
* @param status error code for the operation
* @internal
*/
const char *getLocaleID(ULocDataLocaleType type, UErrorCode& status) const;
+#endif /* U_HIDE_INTERNAL_API */
+
+ /**
+ * Set the subject text string upon which the break iterator is operating
+ * without changing any other aspect of the matching state.
+ * The new and previous text strings must have the same content.
+ *
+ * This function is intended for use in environments where ICU is operating on
+ * strings that may move around in memory. It provides a mechanism for notifying
+ * ICU that the string has been relocated, and providing a new UText to access the
+ * string in its new position.
+ *
+ * Note that the break iterator implementation never copies the underlying text
+ * of a string being processed, but always operates directly on the original text
+ * provided by the user. Refreshing simply drops the references to the old text
+ * and replaces them with references to the new.
+ *
+ * Caution: this function is normally used only by very specialized,
+ * system-level code. One example use case is with garbage collection that moves
+ * the text in memory.
+ *
+ * @param input The new (moved) text string.
+ * @param status Receives errors detected by this function.
+ * @return *this
+ *
+ * @stable ICU 49
+ */
+ virtual BreakIterator &refreshInputText(UText *input, UErrorCode &status) = 0;
private:
static BreakIterator* buildInstance(const Locale& loc, const char *type, int32_t kind, UErrorCode& status);
friend class ICUBreakIteratorService;
protected:
+ // Do not enclose protected default/copy constructors with #ifndef U_HIDE_INTERNAL_API
+ // or else the compiler will create public ones.
/** @internal */
BreakIterator();
/** @internal */
- UBool fBufferClone;
+ BreakIterator (const BreakIterator &other) : UObject(other) {}
+#ifndef U_HIDE_INTERNAL_API
/** @internal */
- BreakIterator (const BreakIterator &other) : UObject(other), fBufferClone(FALSE) {}
+ BreakIterator (const Locale& valid, const Locale& actual);
+#endif /* U_HIDE_INTERNAL_API */
+ UBool fKeepAll;
private:
* It's provided to make the compiler happy. Do not call.
*/
BreakIterator& operator=(const BreakIterator&);
+ void setKeepAll(UBool keepAll);
};
+inline void BreakIterator::setKeepAll(UBool keepAll)  // inline setter for the fKeepAll member
+{
+ fKeepAll = keepAll;  // store the caller's flag as-is; nothing else is modified here
+}
+
+#ifndef U_HIDE_DEPRECATED_API
+
inline UBool BreakIterator::isBufferClone()
{
- return fBufferClone;
+ return FALSE;  // constant result: this change removes the fBufferClone member that was previously returned
}
+#endif /* U_HIDE_DEPRECATED_API */
+
U_NAMESPACE_END
#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
#endif // _BRKITER
//eof
-