X-Git-Url: https://git.saurik.com/apple/icu.git/blobdiff_plain/b75a7d8f3b4adbae880cab104ce2c6a50eee4db2..a01113dcd0f39d5da295ef82785beff9ed86fe38:/icuSources/common/unicode/brkiter.h?ds=sidebyside diff --git a/icuSources/common/unicode/brkiter.h b/icuSources/common/unicode/brkiter.h index 1c48876d..42a0fbfa 100644 --- a/icuSources/common/unicode/brkiter.h +++ b/icuSources/common/unicode/brkiter.h @@ -1,6 +1,8 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html /* ******************************************************************************** -* Copyright (C) 1997-2003, International Business Machines +* Copyright (C) 1997-2016, International Business Machines * Corporation and others. All Rights Reserved. ******************************************************************************** * @@ -22,8 +24,14 @@ #include "unicode/utypes.h" +/** + * \file + * \brief C++ API: Break Iterator. + */ + #if UCONFIG_NO_BREAK_ITERATION +#if U_SHOW_CPLUSPLUS_API U_NAMESPACE_BEGIN /* @@ -33,6 +41,7 @@ U_NAMESPACE_BEGIN class BreakIterator; U_NAMESPACE_END +#endif // U_SHOW_CPLUSPLUS_API #else @@ -42,32 +51,33 @@ U_NAMESPACE_END #include "unicode/locid.h" #include "unicode/ubrk.h" #include "unicode/strenum.h" +#include "unicode/utext.h" +#include "unicode/umisc.h" +#if U_SHOW_CPLUSPLUS_API U_NAMESPACE_BEGIN -typedef const void* URegistryKey; - /** * The BreakIterator class implements methods for finding the location * of boundaries in text. BreakIterator is an abstract base class. * Instances of BreakIterator maintain a current position and scan over * text returning the index of characters where boundaries occur. - *
+ *
* Line boundary analysis determines where a text string can be broken * when line-wrapping. The mechanism correctly handles punctuation and * hyphenated words. - *
+ *
* Sentence boundary analysis allows selection with correct * interpretation of periods within numbers and abbreviations, and * trailing punctuation marks such as quotation marks and parentheses. - *
+ *
* Word boundary analysis is used by search and replace functions, as * well as within text editing applications that allow the user to * select words with a double click. Word selection provides correct * interpretation of punctuation marks within and following * words. Characters that are not part of a word, such as symbols or * punctuation marks, have word-breaks on both sides. - *
+ *
* Character boundary analysis allows users to interact with * characters as they expect to, for example, when moving the cursor * through a text string. Character boundary analysis provides correct @@ -75,126 +85,22 @@ typedef const void* URegistryKey; * character is stored. For example, an accented character might be * stored as a base character and a diacritical mark. What users * consider to be a character can differ between languages. - *
- * This is the interface for all text boundaries. - *
- * Examples: - *
- * Helper function to output text - *
- * \code - * void printTextRange( BreakIterator& iterator, int32_t start, int32_t end ) - * { - * UnicodeString textBuffer, temp; - * CharacterIterator *strIter = iterator.createText(); - * strIter->getText(temp); - * cout << " " << start << " " << end << " |" - * << temp.extractBetween(start, end, textBuffer) - * << "|" << endl; - * delete strIter; - * } - * \endcode - *- * Print each element in order: - *
- * \code - * void printEachForward( BreakIterator& boundary) - * { - * int32_t start = boundary.first(); - * for (int32_t end = boundary.next(); - * end != BreakIterator::DONE; - * start = end, end = boundary.next()) - * { - * printTextRange( boundary, start, end ); - * } - * } - * \code - *- * Print each element in reverse order: - *
- * \code - * void printEachBackward( BreakIterator& boundary) - * { - * int32_t end = boundary.last(); - * for (int32_t start = boundary.previous(); - * start != BreakIterator::DONE; - * end = start, start = boundary.previous()) - * { - * printTextRange( boundary, start, end ); - * } - * } - * \endcode - *- * Print first element - *
- * \code - * void printFirst(BreakIterator& boundary) - * { - * int32_t start = boundary.first(); - * int32_t end = boundary.next(); - * printTextRange( boundary, start, end ); - * } - * \endcode - *- * Print last element - *
- * \code - * void printLast(BreakIterator& boundary) - * { - * int32_t end = boundary.last(); - * int32_t start = boundary.previous(); - * printTextRange( boundary, start, end ); - * } - * \endcode - *- * Print the element at a specified position - *
- * \code - * void printAt(BreakIterator &boundary, int32_t pos ) - * { - * int32_t end = boundary.following(pos); - * int32_t start = boundary.previous(); - * printTextRange( boundary, start, end ); - * } - * \endcode - *- * Creating and using text boundaries - *
- * \code - * void BreakIterator_Example( void ) - * { - * BreakIterator* boundary; - * UnicodeString stringToExamine("Aaa bbb ccc. Ddd eee fff."); - * cout << "Examining: " << stringToExamine << endl; + **/ class U_COMMON_API BreakIterator : public UObject { public: @@ -243,49 +149,95 @@ public: /** * Return a CharacterIterator over the text being analyzed. - * Changing the state of the returned iterator can have undefined consequences - * on the operation of the break iterator. If you need to change it, clone it first. * @stable ICU 2.0 */ - virtual const CharacterIterator& getText(void) const = 0; + virtual CharacterIterator& getText(void) const = 0; + + + /** + * Get a UText for the text being analyzed. + * The returned UText is a shallow clone of the UText used internally + * by the break iterator implementation. It can safely be used to + * access the text without impacting any break iterator operations, + * but the underlying text itself must not be altered. + * + * @param fillIn A UText to be filled in. If NULL, a new UText will be + * allocated to hold the result. + * @param status receives any error codes. + * @return The current UText for this break iterator. If an input + * UText was provided, it will always be returned. + * @stable ICU 3.4 + */ + virtual UText *getUText(UText *fillIn, UErrorCode &status) const = 0; /** * Change the text over which this operates. The text boundary is * reset to the start. + * + * The BreakIterator will retain a reference to the supplied string. + * The caller must not modify or delete the text while the BreakIterator + * retains the reference. + * * @param text The UnicodeString used to change the text. * @stable ICU 2.0 */ virtual void setText(const UnicodeString &text) = 0; + /** + * Reset the break iterator to operate over the text represented by + * the UText. The iterator position is reset to the start. + * + * This function makes a shallow clone of the supplied UText. 
This means + * that the caller is free to immediately close or otherwise reuse the + * Utext that was passed as a parameter, but that the underlying text itself + * must not be altered while being referenced by the break iterator. + * + * All index positions returned by break iterator functions are + * native indices from the UText. For example, when breaking UTF-8 + * encoded text, the break positions returned by next(), previous(), etc. + * will be UTF-8 string indices, not UTF-16 positions. + * + * @param text The UText used to change the text. + * @param status receives any error codes. + * @stable ICU 3.4 + */ + virtual void setText(UText *text, UErrorCode &status) = 0; + /** * Change the text over which this operates. The text boundary is * reset to the start. + * Note that setText(UText *) provides similar functionality to this function, + * and is more efficient. * @param it The CharacterIterator used to change the text. * @stable ICU 2.0 */ virtual void adoptText(CharacterIterator* it) = 0; - /** - * DONE is returned by previous() and next() after all valid - * boundaries have been returned. - * @stable ICU 2.0 - */ - static const int32_t DONE; + enum { + /** + * DONE is returned by previous() and next() after all valid + * boundaries have been returned. + * @stable ICU 2.0 + */ + DONE = (int32_t)-1 + }; /** - * Return the index of the first character in the text being scanned. + * Sets the current iteration position to the beginning of the text, position zero. + * @return The offset of the beginning of the text, zero. * @stable ICU 2.0 */ virtual int32_t first(void) = 0; /** - * Return the index immediately BEYOND the last character in the text being scanned. + * Set the iterator position to the index immediately BEYOND the last character in the text being scanned. + * @return The index immediately BEYOND the last character in the text being scanned. 
* @stable ICU 2.0 */ virtual int32_t last(void) = 0; /** - * Return the boundary preceding the current boundary. + * Set the iterator position to the boundary preceding the current boundary. * @return The character index of the previous text boundary or DONE if all * boundaries have been returned. * @stable ICU 2.0 @@ -293,7 +245,7 @@ public: virtual int32_t previous(void) = 0; /** - * Return the boundary following the current boundary. + * Advance the iterator to the boundary following the current boundary. * @return The character index of the next text boundary or DONE if all * boundaries have been returned. * @stable ICU 2.0 @@ -301,14 +253,14 @@ public: virtual int32_t next(void) = 0; /** - * Return character index of the current interator position within the text. + * Return character index of the current iterator position within the text. * @return The boundary most recently returned. * @stable ICU 2.0 */ virtual int32_t current(void) const = 0; /** - * Return the first boundary following the specified offset. + * Advance the iterator to the first boundary following the specified offset. * The value returned is always greater than the offset or * the value BreakIterator.DONE * @param offset the offset to begin scanning. @@ -318,7 +270,7 @@ public: virtual int32_t following(int32_t offset) = 0; /** - * Return the first boundary preceding the specified offset. + * Set the iterator position to the first boundary preceding the specified offset. * The value returned is always smaller than the offset or * the value BreakIterator.DONE * @param offset the offset to begin scanning. @@ -328,7 +280,7 @@ public: virtual int32_t preceding(int32_t offset) = 0; /** - * Return true if the specfied position is a boundary position. + * Return true if the specified position is a boundary position. * As a side effect, the current position of the iterator is set * to the first boundary position at or following the specified offset. * @param offset the offset to check. 
@@ -338,16 +290,61 @@ public: virtual UBool isBoundary(int32_t offset) = 0; /** - * Return the nth boundary from the current boundary - * @param n which boundary to return. A value of 0 + * Set the iterator position to the nth boundary from the current boundary + * @param n the number of boundaries to move by. A value of 0 * does nothing. Negative values move to previous boundaries * and positive values move to later boundaries. - * @return The index of the nth boundary from the current position, or - * DONE if there are fewer than |n| boundaries in the specfied direction. + * @return The new iterator position, or + * DONE if there are fewer than |n| boundaries in the specified direction. * @stable ICU 2.0 */ virtual int32_t next(int32_t n) = 0; + /** + * For RuleBasedBreakIterators, return the status tag from the break rule + * that determined the boundary at the current iteration position. + *+ * The text boundary positions are found according to the rules + * described in Unicode Standard Annex #29, Text Boundaries, and + * Unicode Standard Annex #14, Line Breaking Properties. These + * are available at http://www.unicode.org/reports/tr14/ and + * http://www.unicode.org/reports/tr29/. + *
+ * In addition to the C++ API defined in this header file, a + * plain C API with equivalent functionality is defined in the + * file ubrk.h + *
+ * Code snippets illustrating the use of the Break Iterator APIs + * are available in the ICU User Guide, + * http://icu-project.org/userguide/boundaryAnalysis.html + * and in the sample program icu/source/samples/break/break.cpp * - * //print each sentence in forward and reverse order - * boundary = BreakIterator::createSentenceInstance( Locale::US ); - * boundary->setText(stringToExamine); - * cout << "----- forward: -----------" << endl; - * printEachForward(*boundary); - * cout << "----- backward: ----------" << endl; - * printEachBackward(*boundary); - * delete boundary; - * - * //print each word in order - * boundary = BreakIterator::createWordInstance(); - * boundary->setText(stringToExamine); - * cout << "----- forward: -----------" << endl; - * printEachForward(*boundary); - * //print first element - * cout << "----- first: -------------" << endl; - * printFirst(*boundary); - * //print last element - * cout << "----- last: --------------" << endl; - * printLast(*boundary); - * //print word at charpos 10 - * cout << "----- at pos 10: ---------" << endl; - * printAt(*boundary, 10 ); - * - * delete boundary; - * } - * \endcode - *
+ * For break iterator types that do not support a rule status, + * a default value of 0 is returned. + *
+ * @return the status from the break rule that determined the boundary at + * the current iteration position. + * @see RuleBasedBreakIterator::getRuleStatus() + * @see UWordBreak + * @stable ICU 52 + */ + virtual int32_t getRuleStatus() const; + + /** + * For RuleBasedBreakIterators, get the status (tag) values from the break rule(s) + * that determined the boundary at the current iteration position. + *
+ * For break iterator types that do not support rule status, + * no values are returned. + *
+ * The returned status value(s) are stored into an array provided by the caller. + * The values are stored in sorted (ascending) order. + * If the capacity of the output array is insufficient to hold the data, + * the output will be truncated to the available length, and a + * U_BUFFER_OVERFLOW_ERROR will be signaled. + *
+ * @see RuleBasedBreakIterator::getRuleStatusVec + * + * @param fillInVec an array to be filled in with the status values. + * @param capacity the length of the supplied vector. A length of zero causes + * the function to return the number of status values, in the + * normal way, without attempting to store any values. + * @param status receives error codes. + * @return The number of rule status values from rules that determined + * the boundary at the current iteration position. + * In the event of a U_BUFFER_OVERFLOW_ERROR, the return value + * is the total number of status values that were available, + * not the reduced number that were actually returned. + * @see getRuleStatus + * @stable ICU 52 + */ + virtual int32_t getRuleStatusVec(int32_t *fillInVec, int32_t capacity, UErrorCode &status); + /** * Create BreakIterator for word-breaks using the given locale. * Returns an instance of a BreakIterator implementing word breaks. @@ -367,8 +364,8 @@ public: * The caller owns the returned object and is responsible for deleting it. * @stable ICU 2.0 */ - static BreakIterator* createWordInstance(const Locale& where, - UErrorCode& status); + static BreakIterator* U_EXPORT2 + createWordInstance(const Locale& where, UErrorCode& status); /** * Create BreakIterator for line-breaks using specified locale. @@ -391,8 +388,8 @@ public: * The caller owns the returned object and is responsible for deleting it. * @stable ICU 2.0 */ - static BreakIterator* createLineInstance(const Locale& where, - UErrorCode& status); + static BreakIterator* U_EXPORT2 + createLineInstance(const Locale& where, UErrorCode& status); /** * Create BreakIterator for character-breaks using specified locale @@ -413,8 +410,8 @@ public: * The caller owns the returned object and is responsible for deleting it. 
* @stable ICU 2.0 */ - static BreakIterator* createCharacterInstance(const Locale& where, - UErrorCode& status); + static BreakIterator* U_EXPORT2 + createCharacterInstance(const Locale& where, UErrorCode& status); /** * Create BreakIterator for sentence-breaks using specified locale @@ -434,15 +431,16 @@ public: * The caller owns the returned object and is responsible for deleting it. * @stable ICU 2.0 */ - static BreakIterator* createSentenceInstance(const Locale& where, - UErrorCode& status); + static BreakIterator* U_EXPORT2 + createSentenceInstance(const Locale& where, UErrorCode& status); +#ifndef U_HIDE_DEPRECATED_API /** * Create BreakIterator for title-casing breaks using the specified locale * Returns an instance of a BreakIterator implementing title breaks. - * The iterator returned locates title boundaries as described for + * The iterator returned locates title boundaries as described for * Unicode 3.2 only. For Unicode 4.0 and above title boundary iteration, - * please use Word Boundary iterator.{@link createWordInstance()} + * please use a word boundary iterator. See {@link #createWordInstance }. * * @param where the locale. * @param status The error code. @@ -457,23 +455,25 @@ public: * used; neither the requested locale nor any of its fall back locales * could be found. * The caller owns the returned object and is responsible for deleting it. - * @stable ICU 2.1 + * @deprecated ICU 64 Use createWordInstance instead. */ - static BreakIterator* createTitleInstance(const Locale& where, - UErrorCode& status); + static BreakIterator* U_EXPORT2 + createTitleInstance(const Locale& where, UErrorCode& status); +#endif /* U_HIDE_DEPRECATED_API */ /** * Get the set of Locales for which TextBoundaries are installed. *
Note: this will not return locales added through the register - * call.
+ * call. To see the registered locales too, use the getAvailableLocales + * function that returns a StringEnumeration object * @param count the output parameter of number of elements in the locale list * @return available locales * @stable ICU 2.0 */ - static const Locale* getAvailableLocales(int32_t& count); + static const Locale* U_EXPORT2 getAvailableLocales(int32_t& count); /** - * Get name of the object for the desired Locale, in the desired langauge. + * Get name of the object for the desired Locale, in the desired language. * @param objectLocale must be from getAvailableLocales. * @param displayLocale specifies the desired locale for output. * @param name the fill-in parameter of the return value @@ -481,22 +481,24 @@ public: * @return user-displayable name * @stable ICU 2.0 */ - static UnicodeString& getDisplayName(const Locale& objectLocale, + static UnicodeString& U_EXPORT2 getDisplayName(const Locale& objectLocale, const Locale& displayLocale, UnicodeString& name); /** - * Get name of the object for the desired Locale, in the langauge of the + * Get name of the object for the desired Locale, in the language of the * default locale. * @param objectLocale must be from getMatchingLocales * @param name the fill-in parameter of the return value * @return user-displayable name * @stable ICU 2.0 */ - static UnicodeString& getDisplayName(const Locale& objectLocale, + static UnicodeString& U_EXPORT2 getDisplayName(const Locale& objectLocale, UnicodeString& name); /** + * Deprecated functionality. Use clone() instead. + * * Thread safe client-buffer-based cloning operation * Do NOT call delete on a safeclone, since 'new' is not used to create it. * @param stackBuffer user allocated space for the new clone. If NULL new memory will be allocated. @@ -511,90 +513,169 @@ public: * necessary. * @return pointer to the new clone * - * @stable ICU 2.0 + * @deprecated ICU 52. Use clone() instead. 
*/ virtual BreakIterator * createBufferClone(void *stackBuffer, int32_t &BufferSize, UErrorCode &status) = 0; +#ifndef U_HIDE_DEPRECATED_API + /** * Determine whether the BreakIterator was created in user memory by * createBufferClone(), and thus should not be deleted. Such objects * must be closed by an explicit call to the destructor (not delete). - * @stable ICU 2.0 + * @deprecated ICU 52. Always delete the BreakIterator. */ inline UBool isBufferClone(void); +#endif /* U_HIDE_DEPRECATED_API */ + +#if !UCONFIG_NO_SERVICE /** * Register a new break iterator of the indicated kind, to use in the given locale. - * The break iterator will be adoped. Clones of the iterator will be returned + * The break iterator will be adopted. Clones of the iterator will be returned * if a request for a break iterator of the given kind matches or falls back to * this locale. + * Because ICU may choose to cache BreakIterators internally, this must + * be called at application startup, prior to any calls to + * BreakIterator::createXXXInstance to avoid undefined behavior. * @param toAdopt the BreakIterator instance to be adopted * @param locale the Locale for which this instance is to be registered * @param kind the type of iterator for which this instance is to be registered * @param status the in/out status code, no special meanings are assigned * @return a registry key that can be used to unregister this instance - * @draft ICU 2.4 + * @stable ICU 2.4 */ - static URegistryKey registerInstance(BreakIterator* toAdopt, const Locale& locale, UBreakIteratorType kind, UErrorCode& status); + static URegistryKey U_EXPORT2 registerInstance(BreakIterator* toAdopt, + const Locale& locale, + UBreakIteratorType kind, + UErrorCode& status); /** * Unregister a previously-registered BreakIterator using the key returned from the * register call. Key becomes invalid after a successful call and should not be used again. * The BreakIterator corresponding to the key will be deleted. 
+ * Because ICU may choose to cache BreakIterators internally, this should + * be called during application shutdown, after all calls to + * BreakIterator::createXXXInstance to avoid undefined behavior. * @param key the registry key returned by a previous call to registerInstance * @param status the in/out status code, no special meanings are assigned * @return TRUE if the iterator for the key was successfully unregistered - * @draft ICU 2.4 + * @stable ICU 2.4 */ - static UBool unregister(URegistryKey key, UErrorCode& status); + static UBool U_EXPORT2 unregister(URegistryKey key, UErrorCode& status); /** - * Return a StringEnumeration over the locales available at the time of the call, + * Return a StringEnumeration over the locales available at the time of the call, * including registered locales. * @return a StringEnumeration over the locales available at the time of the call - * @draft ICU 2.4 + * @stable ICU 2.4 */ - static StringEnumeration* getAvailableLocales(void); + static StringEnumeration* U_EXPORT2 getAvailableLocales(void); +#endif - private: - static BreakIterator* makeCharacterInstance(const Locale& loc, UErrorCode& status); - static BreakIterator* makeWordInstance(const Locale& loc, UErrorCode& status); - static BreakIterator* makeLineInstance(const Locale& loc, UErrorCode& status); - static BreakIterator* makeSentenceInstance(const Locale& loc, UErrorCode& status); - static BreakIterator* makeTitleInstance(const Locale& loc, UErrorCode& status); + /** + * Returns the locale for this break iterator. Two flavors are available: valid and + * actual locale. + * @stable ICU 2.8 + */ + Locale getLocale(ULocDataLocaleType type, UErrorCode& status) const; + +#ifndef U_HIDE_INTERNAL_API + /** Get the locale for this break iterator object. You can choose between valid and actual locale. 
+ * @param type type of the locale we're looking for (valid or actual) + * @param status error code for the operation + * @return the locale + * @internal + */ + const char *getLocaleID(ULocDataLocaleType type, UErrorCode& status) const; +#endif /* U_HIDE_INTERNAL_API */ + + /** + * Set the subject text string upon which the break iterator is operating + * without changing any other aspect of the matching state. + * The new and previous text strings must have the same content. + * + * This function is intended for use in environments where ICU is operating on + * strings that may move around in memory. It provides a mechanism for notifying + * ICU that the string has been relocated, and providing a new UText to access the + * string in its new position. + * + * Note that the break iterator implementation never copies the underlying text + * of a string being processed, but always operates directly on the original text + * provided by the user. Refreshing simply drops the references to the old text + * and replaces them with references to the new. + * + * Caution: this function is normally used only by very specialized, + * system-level code. One example use case is with garbage collection that moves + * the text in memory. + * + * @param input The new (moved) text string. + * @param status Receives errors detected by this function. + * @return *this + * + * @stable ICU 49 + */ + virtual BreakIterator &refreshInputText(UText *input, UErrorCode &status) = 0; - static BreakIterator* createInstance(const Locale& loc, UBreakIteratorType kind, UErrorCode& status); +#ifndef U_HIDE_INTERNAL_API + /** + * Set the ULineWordOptions for this break iterator. + * @param lineWordOpts The ULineWordOptions to set. 
+ * @internal Apple only + */ + void setLineWordOpts(ULineWordOptions lineWordOpts); +#endif /* U_HIDE_INTERNAL_API */ + + private: + static BreakIterator* buildInstance(const Locale& loc, const char *type, UErrorCode& status); + static BreakIterator* createInstance(const Locale& loc, int32_t kind, UErrorCode& status); static BreakIterator* makeInstance(const Locale& loc, int32_t kind, UErrorCode& status); friend class ICUBreakIteratorFactory; friend class ICUBreakIteratorService; protected: + // Do not enclose protected default/copy constructors with #ifndef U_HIDE_INTERNAL_API + // or else the compiler will create a public ones. /** @internal */ BreakIterator(); /** @internal */ - UBool fBufferClone; + BreakIterator (const BreakIterator &other); +#ifndef U_HIDE_INTERNAL_API /** @internal */ - BreakIterator (const BreakIterator &other) : UObject(other), fBufferClone(FALSE) {} + BreakIterator (const Locale& valid, const Locale &actual); + /** @internal. Assignment Operator, used by RuleBasedBreakIterator. */ + BreakIterator &operator = (const BreakIterator &other); +#endif /* U_HIDE_INTERNAL_API */ + ULineWordOptions fLineWordOpts; + private: - /** - * The assignment operator has no real implementation. - * It's provided to make the compiler happy. Do not call. - */ - BreakIterator& operator=(const BreakIterator&) { return *this; } + + /** @internal (private) */ + char actualLocale[ULOC_FULLNAME_CAPACITY]; + char validLocale[ULOC_FULLNAME_CAPACITY]; }; +inline void BreakIterator::setLineWordOpts(ULineWordOptions lineWordOpts) +{ + fLineWordOpts = lineWordOpts; +} + +#ifndef U_HIDE_DEPRECATED_API + inline UBool BreakIterator::isBufferClone() { - return fBufferClone; + return FALSE; } +#endif /* U_HIDE_DEPRECATED_API */ + U_NAMESPACE_END +#endif // U_SHOW_CPLUSPLUS_API #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ -#endif // _BRKITER +#endif // BRKITER_H //eof -