+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
/*
********************************************************************************
-* Copyright (C) 1997-2007, International Business Machines
+* Copyright (C) 1997-2016, International Business Machines
* Corporation and others. All Rights Reserved.
********************************************************************************
*
#if UCONFIG_NO_BREAK_ITERATION
+#if U_SHOW_CPLUSPLUS_API
U_NAMESPACE_BEGIN
/*
class BreakIterator;
U_NAMESPACE_END
+#endif // U_SHOW_CPLUSPLUS_API
#else
#include "unicode/utext.h"
#include "unicode/umisc.h"
+#if U_SHOW_CPLUSPLUS_API
U_NAMESPACE_BEGIN
/**
* plain C API with equivalent functionality is defined in the
* file ubrk.h
* <p>
- * Code snippits illustrating the use of the Break Iterator APIs
+ * Code snippets illustrating the use of the Break Iterator APIs
* are available in the ICU User Guide,
* http://icu-project.org/userguide/boundaryAnalysis.html
- * and in the sample program icu/source/samples/break/break.cpp"
+ * and in the sample program icu/source/samples/break/break.cpp
*
*/
class U_COMMON_API BreakIterator : public UObject {
/**
* Change the text over which this operates. The text boundary is
* reset to the start.
+ *
+ * The BreakIterator will retain a reference to the supplied string.
+ * The caller must not modify or delete the text while the BreakIterator
+ * retains the reference.
+ *
* @param text The UnicodeString used to change the text.
* @stable ICU 2.0
*/
* Utext that was passed as a parameter, but that the underlying text itself
* must not be altered while being referenced by the break iterator.
*
+ * All index positions returned by break iterator functions are
+ * native indices from the UText. For example, when breaking UTF-8
+ * encoded text, the break positions returned by next(), previous(), etc.
+ * will be UTF-8 string indices, not UTF-16 positions.
+ *
* @param text The UText used to change the text.
* @param status receives any error codes.
* @stable ICU 3.4
};
/**
- * Return the index of the first character in the text being scanned.
+ * Sets the current iteration position to the beginning of the text, position zero.
+ * @return The offset of the beginning of the text, zero.
* @stable ICU 2.0
*/
virtual int32_t first(void) = 0;
/**
- * Return the index immediately BEYOND the last character in the text being scanned.
+ * Set the iterator position to the index immediately BEYOND the last character in the text being scanned.
+ * @return The index immediately BEYOND the last character in the text being scanned.
* @stable ICU 2.0
*/
virtual int32_t last(void) = 0;
/**
- * Return the boundary preceding the current boundary.
+ * Set the iterator position to the boundary preceding the current boundary.
* @return The character index of the previous text boundary or DONE if all
* boundaries have been returned.
* @stable ICU 2.0
virtual int32_t previous(void) = 0;
/**
- * Return the boundary following the current boundary.
+ * Advance the iterator to the boundary following the current boundary.
* @return The character index of the next text boundary or DONE if all
* boundaries have been returned.
* @stable ICU 2.0
virtual int32_t next(void) = 0;
/**
- * Return character index of the current interator position within the text.
+ * Return character index of the current iterator position within the text.
* @return The boundary most recently returned.
* @stable ICU 2.0
*/
virtual int32_t current(void) const = 0;
/**
- * Return the first boundary following the specified offset.
+ * Advance the iterator to the first boundary following the specified offset.
* The value returned is always greater than the offset or
* the value BreakIterator::DONE
* @param offset the offset to begin scanning.
virtual int32_t following(int32_t offset) = 0;
/**
- * Return the first boundary preceding the specified offset.
+ * Set the iterator position to the first boundary preceding the specified offset.
* The value returned is always smaller than the offset or
* the value BreakIterator::DONE
* @param offset the offset to begin scanning.
virtual int32_t preceding(int32_t offset) = 0;
/**
- * Return true if the specfied position is a boundary position.
+ * Return true if the specified position is a boundary position.
* As a side effect, the current position of the iterator is set
* to the first boundary position at or following the specified offset.
* @param offset the offset to check.
virtual UBool isBoundary(int32_t offset) = 0;
/**
- * Return the nth boundary from the current boundary
- * @param n which boundary to return. A value of 0
+ * Set the iterator position to the nth boundary from the current boundary
+ * @param n the number of boundaries to move by. A value of 0
* does nothing. Negative values move to previous boundaries
* and positive values move to later boundaries.
- * @return The index of the nth boundary from the current position, or
- * DONE if there are fewer than |n| boundaries in the specfied direction.
+ * @return The new iterator position, or
+ * DONE if there are fewer than |n| boundaries in the specified direction.
* @stable ICU 2.0
*/
virtual int32_t next(int32_t n) = 0;
+ /**
+ * For RuleBasedBreakIterators, return the status tag from the break rule
+ * that determined the boundary at the current iteration position.
+ * <p>
+ * For break iterator types that do not support a rule status,
+ * a default value of 0 is returned.
+ * <p>
+ * @return the status from the break rule that determined the boundary at
+ * the current iteration position.
+ * @see RuleBasedBreakIterator::getRuleStatus()
+ * @see UWordBreak
+ * @stable ICU 52
+ */
+ virtual int32_t getRuleStatus() const;
+
+ /**
+ * For RuleBasedBreakIterators, get the status (tag) values from the break rule(s)
+ * that determined the boundary at the current iteration position.
+ * <p>
+ * For break iterator types that do not support rule status,
+ * no values are returned.
+ * <p>
+ * The returned status value(s) are stored into an array provided by the caller.
+ * The values are stored in sorted (ascending) order.
+ * If the capacity of the output array is insufficient to hold the data,
+ * the output will be truncated to the available length, and a
+ * U_BUFFER_OVERFLOW_ERROR will be signaled.
+ * <p>
+ * @see RuleBasedBreakIterator::getRuleStatusVec
+ *
+ * @param fillInVec an array to be filled in with the status values.
+ * @param capacity the length of the supplied vector. A length of zero causes
+ * the function to return the number of status values, in the
+ * normal way, without attempting to store any values.
+ * @param status receives error codes.
+ * @return The number of rule status values from rules that determined
+ * the boundary at the current iteration position.
+ * In the event of a U_BUFFER_OVERFLOW_ERROR, the return value
+ * is the total number of status values that were available,
+ * not the reduced number that were actually returned.
+ * @see getRuleStatus
+ * @stable ICU 52
+ */
+ virtual int32_t getRuleStatusVec(int32_t *fillInVec, int32_t capacity, UErrorCode &status);
+
/**
* Create BreakIterator for word-breaks using the given locale.
* Returns an instance of a BreakIterator implementing word breaks.
static BreakIterator* U_EXPORT2
createSentenceInstance(const Locale& where, UErrorCode& status);
+#ifndef U_HIDE_DEPRECATED_API
/**
* Create BreakIterator for title-casing breaks using the specified locale
* Returns an instance of a BreakIterator implementing title breaks.
* The iterator returned locates title boundaries as described for
* Unicode 3.2 only. For Unicode 4.0 and above title boundary iteration,
- * please use Word Boundary iterator.{@link #createWordInstance }
+ * please use a word boundary iterator. See {@link #createWordInstance }.
*
* @param where the locale.
* @param status The error code.
* used; neither the requested locale nor any of its fall back locales
* could be found.
* The caller owns the returned object and is responsible for deleting it.
- * @stable ICU 2.1
+ * @deprecated ICU 64 Use createWordInstance instead.
*/
static BreakIterator* U_EXPORT2
createTitleInstance(const Locale& where, UErrorCode& status);
+#endif /* U_HIDE_DEPRECATED_API */
/**
* Get the set of Locales for which TextBoundaries are installed.
static const Locale* U_EXPORT2 getAvailableLocales(int32_t& count);
/**
- * Get name of the object for the desired Locale, in the desired langauge.
+ * Get name of the object for the desired Locale, in the desired language.
* @param objectLocale must be from getAvailableLocales.
* @param displayLocale specifies the desired locale for output.
* @param name the fill-in parameter of the return value
UnicodeString& name);
/**
- * Get name of the object for the desired Locale, in the langauge of the
+ * Get name of the object for the desired Locale, in the language of the
* default locale.
* @param objectLocale must be from getAvailableLocales
* @param name the fill-in parameter of the return value
UnicodeString& name);
/**
+ * Deprecated functionality. Use clone() instead.
+ *
* Thread safe client-buffer-based cloning operation
* Do NOT call delete on a safeclone, since 'new' is not used to create it.
* @param stackBuffer user allocated space for the new clone. If NULL new memory will be allocated.
* necessary.
* @return pointer to the new clone
*
- * @stable ICU 2.0
+ * @deprecated ICU 52. Use clone() instead.
*/
virtual BreakIterator * createBufferClone(void *stackBuffer,
int32_t &BufferSize,
UErrorCode &status) = 0;
+#ifndef U_HIDE_DEPRECATED_API
+
/**
* Determine whether the BreakIterator was created in user memory by
* createBufferClone(), and thus should not be deleted. Such objects
* must be closed by an explicit call to the destructor (not delete).
- * @stable ICU 2.0
+ * @deprecated ICU 52. Always delete the BreakIterator.
*/
inline UBool isBufferClone(void);
+#endif /* U_HIDE_DEPRECATED_API */
+
#if !UCONFIG_NO_SERVICE
/**
* Register a new break iterator of the indicated kind, to use in the given locale.
* The break iterator will be adopted. Clones of the iterator will be returned
* if a request for a break iterator of the given kind matches or falls back to
* this locale.
+ * Because ICU may choose to cache BreakIterators internally, this must
+ * be called at application startup, prior to any calls to
+ * BreakIterator::createXXXInstance to avoid undefined behavior.
* @param toAdopt the BreakIterator instance to be adopted
* @param locale the Locale for which this instance is to be registered
* @param kind the type of iterator for which this instance is to be registered
* Unregister a previously-registered BreakIterator using the key returned from the
* register call. Key becomes invalid after a successful call and should not be used again.
* The BreakIterator corresponding to the key will be deleted.
+ * Because ICU may choose to cache BreakIterators internally, this should
+ * be called during application shutdown, after all calls to
+ * BreakIterator::createXXXInstance to avoid undefined behavior.
* @param key the registry key returned by a previous call to registerInstance
* @param status the in/out status code, no special meanings are assigned
* @return TRUE if the iterator for the key was successfully unregistered
*/
Locale getLocale(ULocDataLocaleType type, UErrorCode& status) const;
+#ifndef U_HIDE_INTERNAL_API
/** Get the locale for this break iterator object. You can choose between valid and actual locale.
* @param type type of the locale we're looking for (valid or actual)
* @param status error code for the operation
* @internal
*/
const char *getLocaleID(ULocDataLocaleType type, UErrorCode& status) const;
+#endif /* U_HIDE_INTERNAL_API */
+
+ /**
+ * Set the subject text string upon which the break iterator is operating
+ * without changing any other aspect of the matching state.
+ * The new and previous text strings must have the same content.
+ *
+ * This function is intended for use in environments where ICU is operating on
+ * strings that may move around in memory. It provides a mechanism for notifying
+ * ICU that the string has been relocated, and providing a new UText to access the
+ * string in its new position.
+ *
+ * Note that the break iterator implementation never copies the underlying text
+ * of a string being processed, but always operates directly on the original text
+ * provided by the user. Refreshing simply drops the references to the old text
+ * and replaces them with references to the new.
+ *
+ * Caution: this function is normally used only by very specialized,
+ * system-level code. One example use case is with garbage collection that moves
+ * the text in memory.
+ *
+ * @param input The new (moved) text string.
+ * @param status Receives errors detected by this function.
+ * @return *this
+ *
+ * @stable ICU 49
+ */
+ virtual BreakIterator &refreshInputText(UText *input, UErrorCode &status) = 0;
+
+#ifndef U_HIDE_INTERNAL_API
+ /**
+ * Set the ULineWordOptions for this break iterator.
+ * @param lineWordOpts The ULineWordOptions to set.
+ * @internal Apple only
+ */
+ void setLineWordOpts(ULineWordOptions lineWordOpts);
+#endif /* U_HIDE_INTERNAL_API */
private:
- static BreakIterator* buildInstance(const Locale& loc, const char *type, int32_t kind, UErrorCode& status);
+ static BreakIterator* buildInstance(const Locale& loc, const char *type, UErrorCode& status);
static BreakIterator* createInstance(const Locale& loc, int32_t kind, UErrorCode& status);
static BreakIterator* makeInstance(const Locale& loc, int32_t kind, UErrorCode& status);
friend class ICUBreakIteratorService;
protected:
+ // Do not enclose protected default/copy constructors with #ifndef U_HIDE_INTERNAL_API
+ // or else the compiler will create public ones.
/** @internal */
BreakIterator();
/** @internal */
- UBool fBufferClone;
+ BreakIterator (const BreakIterator &other);
+#ifndef U_HIDE_INTERNAL_API
/** @internal */
- BreakIterator (const BreakIterator &other) : UObject(other), fBufferClone(FALSE) {}
+ BreakIterator (const Locale& valid, const Locale &actual);
+ /** @internal. Assignment Operator, used by RuleBasedBreakIterator. */
+ BreakIterator &operator = (const BreakIterator &other);
+#endif /* U_HIDE_INTERNAL_API */
+ ULineWordOptions fLineWordOpts;
private:
- /** @internal */
+ /** @internal (private) */
char actualLocale[ULOC_FULLNAME_CAPACITY];
char validLocale[ULOC_FULLNAME_CAPACITY];
-
- /**
- * The assignment operator has no real implementation.
- * It's provided to make the compiler happy. Do not call.
- */
- BreakIterator& operator=(const BreakIterator&);
};
+inline void BreakIterator::setLineWordOpts(ULineWordOptions lineWordOpts)
+{
+ fLineWordOpts = lineWordOpts; // plain store into the fLineWordOpts member; nothing else is touched here
+}
+
+#ifndef U_HIDE_DEPRECATED_API
+
inline UBool BreakIterator::isBufferClone()
{
- return fBufferClone;
+ return FALSE; // constant result: the fBufferClone member previously returned here is removed by this change
}
+#endif /* U_HIDE_DEPRECATED_API */
+
U_NAMESPACE_END
+#endif // U_SHOW_CPLUSPLUS_API
#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
-#endif // _BRKITER
+#endif // BRKITER_H
//eof
-