/*
********************************************************************************
-* Copyright (C) 1997-2003, International Business Machines
+* Copyright (C) 1997-2010, International Business Machines
* Corporation and others. All Rights Reserved.
********************************************************************************
*
#include "unicode/utypes.h"
+/**
+ * \file
+ * \brief C++ API: Break Iterator.
+ */
+
#if UCONFIG_NO_BREAK_ITERATION
U_NAMESPACE_BEGIN
#include "unicode/locid.h"
#include "unicode/ubrk.h"
#include "unicode/strenum.h"
+#include "unicode/utext.h"
+#include "unicode/umisc.h"
U_NAMESPACE_BEGIN
-typedef const void* URegistryKey;
-
/**
* The BreakIterator class implements methods for finding the location
* of boundaries in text. BreakIterator is an abstract base class.
* Instances of BreakIterator maintain a current position and scan over
* text returning the index of characters where boundaries occur.
- * <P>
+ * <p>
* Line boundary analysis determines where a text string can be broken
* when line-wrapping. The mechanism correctly handles punctuation and
* hyphenated words.
- * <P>
+ * <p>
* Sentence boundary analysis allows selection with correct
* interpretation of periods within numbers and abbreviations, and
* trailing punctuation marks such as quotation marks and parentheses.
- * <P>
+ * <p>
* Word boundary analysis is used by search and replace functions, as
* well as within text editing applications that allow the user to
* select words with a double click. Word selection provides correct
* interpretation of punctuation marks within and following
* words. Characters that are not part of a word, such as symbols or
* punctuation marks, have word-breaks on both sides.
- * <P>
+ * <p>
* Character boundary analysis allows users to interact with
* characters as they expect to, for example, when moving the cursor
* through a text string. Character boundary analysis provides correct
* character is stored. For example, an accented character might be
* stored as a base character and a diacritical mark. What users
* consider to be a character can differ between languages.
- * <P>
- * This is the interface for all text boundaries.
- * <P>
- * Examples:
- * <P>
- * Helper function to output text
- * <pre>
- * \code
- * void printTextRange( BreakIterator& iterator, int32_t start, int32_t end )
- * {
- * UnicodeString textBuffer, temp;
- * CharacterIterator *strIter = iterator.createText();
- * strIter->getText(temp);
- * cout << " " << start << " " << end << " |"
- * << temp.extractBetween(start, end, textBuffer)
- * << "|" << endl;
- * delete strIter;
- * }
- * \endcode
- * </pre>
- * Print each element in order:
- * <pre>
- * \code
- * void printEachForward( BreakIterator& boundary)
- * {
- * int32_t start = boundary.first();
- * for (int32_t end = boundary.next();
- * end != BreakIterator::DONE;
- * start = end, end = boundary.next())
- * {
- * printTextRange( boundary, start, end );
- * }
- * }
- * \code
- * </pre>
- * Print each element in reverse order:
- * <pre>
- * \code
- * void printEachBackward( BreakIterator& boundary)
- * {
- * int32_t end = boundary.last();
- * for (int32_t start = boundary.previous();
- * start != BreakIterator::DONE;
- * end = start, start = boundary.previous())
- * {
- * printTextRange( boundary, start, end );
- * }
- * }
- * \endcode
- * </pre>
- * Print first element
- * <pre>
- * \code
- * void printFirst(BreakIterator& boundary)
- * {
- * int32_t start = boundary.first();
- * int32_t end = boundary.next();
- * printTextRange( boundary, start, end );
- * }
- * \endcode
- * </pre>
- * Print last element
- * <pre>
- * \code
- * void printLast(BreakIterator& boundary)
- * {
- * int32_t end = boundary.last();
- * int32_t start = boundary.previous();
- * printTextRange( boundary, start, end );
- * }
- * \endcode
- * </pre>
- * Print the element at a specified position
- * <pre>
- * \code
- * void printAt(BreakIterator &boundary, int32_t pos )
- * {
- * int32_t end = boundary.following(pos);
- * int32_t start = boundary.previous();
- * printTextRange( boundary, start, end );
- * }
- * \endcode
- * </pre>
- * Creating and using text boundaries
- * <pre>
- * \code
- * void BreakIterator_Example( void )
- * {
- * BreakIterator* boundary;
- * UnicodeString stringToExamine("Aaa bbb ccc. Ddd eee fff.");
- * cout << "Examining: " << stringToExamine << endl;
+ * <p>
+ * The text boundary positions are found according to the rules
+ * described in Unicode Standard Annex #29, Text Boundaries, and
+ * Unicode Standard Annex #14, Line Breaking Properties. These
+ * are available at http://www.unicode.org/reports/tr14/ and
+ * http://www.unicode.org/reports/tr29/.
+ * <p>
+ * In addition to the C++ API defined in this header file, a
+ * plain C API with equivalent functionality is defined in the
+ * file ubrk.h
+ * <p>
+ * Code snippets illustrating the use of the Break Iterator APIs
+ * are available in the ICU User Guide,
+ * http://icu-project.org/userguide/boundaryAnalysis.html
+ * and in the sample program icu/source/samples/break/break.cpp
*
- * //print each sentence in forward and reverse order
- * boundary = BreakIterator::createSentenceInstance( Locale::US );
- * boundary->setText(stringToExamine);
- * cout << "----- forward: -----------" << endl;
- * printEachForward(*boundary);
- * cout << "----- backward: ----------" << endl;
- * printEachBackward(*boundary);
- * delete boundary;
- *
- * //print each word in order
- * boundary = BreakIterator::createWordInstance();
- * boundary->setText(stringToExamine);
- * cout << "----- forward: -----------" << endl;
- * printEachForward(*boundary);
- * //print first element
- * cout << "----- first: -------------" << endl;
- * printFirst(*boundary);
- * //print last element
- * cout << "----- last: --------------" << endl;
- * printLast(*boundary);
- * //print word at charpos 10
- * cout << "----- at pos 10: ---------" << endl;
- * printAt(*boundary, 10 );
- *
- * delete boundary;
- * }
- * \endcode
- * </pre>
*/
class U_COMMON_API BreakIterator : public UObject {
public:
/**
* Return a CharacterIterator over the text being analyzed.
- * Changing the state of the returned iterator can have undefined consequences
- * on the operation of the break iterator. If you need to change it, clone it first.
* @stable ICU 2.0
*/
- virtual const CharacterIterator& getText(void) const = 0;
+ virtual CharacterIterator& getText(void) const = 0;
+
+
+ /**
+ * Get a UText for the text being analyzed.
+ * The returned UText is a shallow clone of the UText used internally
+ * by the break iterator implementation. It can safely be used to
+ * access the text without impacting any break iterator operations,
+ * but the underlying text itself must not be altered.
+ *
+ * @param fillIn A UText to be filled in. If NULL, a new UText will be
+ * allocated to hold the result.
+ * @param status receives any error codes.
+ * @return The current UText for this break iterator. If an input
+ * UText was provided, it will always be returned.
+ * @stable ICU 3.4
+ */
+ virtual UText *getUText(UText *fillIn, UErrorCode &status) const = 0;
/**
* Change the text over which this operates. The text boundary is
*/
virtual void setText(const UnicodeString &text) = 0;
+ /**
+ * Reset the break iterator to operate over the text represented by
+ * the UText. The iterator position is reset to the start.
+ *
+ * This function makes a shallow clone of the supplied UText. This means
+ * that the caller is free to immediately close or otherwise reuse the
+ * UText that was passed as a parameter, but that the underlying text itself
+ * must not be altered while being referenced by the break iterator.
+ *
+ * @param text The UText used to change the text.
+ * @param status receives any error codes.
+ * @stable ICU 3.4
+ */
+ virtual void setText(UText *text, UErrorCode &status) = 0;
+
/**
* Change the text over which this operates. The text boundary is
* reset to the start.
+ * Note that setText(UText *) provides similar functionality to this function,
+ * and is more efficient.
* @param it The CharacterIterator used to change the text.
* @stable ICU 2.0
*/
virtual void adoptText(CharacterIterator* it) = 0;
- /**
- * DONE is returned by previous() and next() after all valid
- * boundaries have been returned.
- * @stable ICU 2.0
- */
- static const int32_t DONE;
+ enum {
+ /**
+ * DONE is returned by previous() and next() after all valid
+ * boundaries have been returned.
+ * @stable ICU 2.0
+ */
+ DONE = (int32_t)-1
+ };
/**
* Return the index of the first character in the text being scanned.
* The caller owns the returned object and is responsible for deleting it.
* @stable ICU 2.0
*/
- static BreakIterator* createWordInstance(const Locale& where,
- UErrorCode& status);
+ static BreakIterator* U_EXPORT2
+ createWordInstance(const Locale& where, UErrorCode& status);
/**
* Create BreakIterator for line-breaks using specified locale.
* The caller owns the returned object and is responsible for deleting it.
* @stable ICU 2.0
*/
- static BreakIterator* createLineInstance(const Locale& where,
- UErrorCode& status);
+ static BreakIterator* U_EXPORT2
+ createLineInstance(const Locale& where, UErrorCode& status);
/**
* Create BreakIterator for character-breaks using specified locale
* The caller owns the returned object and is responsible for deleting it.
* @stable ICU 2.0
*/
- static BreakIterator* createCharacterInstance(const Locale& where,
- UErrorCode& status);
+ static BreakIterator* U_EXPORT2
+ createCharacterInstance(const Locale& where, UErrorCode& status);
/**
* Create BreakIterator for sentence-breaks using specified locale
* The caller owns the returned object and is responsible for deleting it.
* @stable ICU 2.0
*/
- static BreakIterator* createSentenceInstance(const Locale& where,
- UErrorCode& status);
+ static BreakIterator* U_EXPORT2
+ createSentenceInstance(const Locale& where, UErrorCode& status);
/**
* Create BreakIterator for title-casing breaks using the specified locale
* Returns an instance of a BreakIterator implementing title breaks.
- * The iterator returned locates title boundaries as described for
+ * The iterator returned locates title boundaries as described for
* Unicode 3.2 only. For Unicode 4.0 and above title boundary iteration,
- * please use Word Boundary iterator.{@link createWordInstance()}
+ * please use Word Boundary iterator. {@link #createWordInstance }
*
* @param where the locale.
* @param status The error code.
* The caller owns the returned object and is responsible for deleting it.
* @stable ICU 2.1
*/
- static BreakIterator* createTitleInstance(const Locale& where,
- UErrorCode& status);
+ static BreakIterator* U_EXPORT2
+ createTitleInstance(const Locale& where, UErrorCode& status);
/**
* Get the set of Locales for which TextBoundaries are installed.
* <p><b>Note:</b> this will not return locales added through the register
- * call.</p>
+ * call. To see the registered locales too, use the getAvailableLocales
+ * function that returns a StringEnumeration object.</p>
* @param count the output parameter of number of elements in the locale list
* @return available locales
* @stable ICU 2.0
*/
- static const Locale* getAvailableLocales(int32_t& count);
+ static const Locale* U_EXPORT2 getAvailableLocales(int32_t& count);
/**
* Get name of the object for the desired Locale, in the desired language.
* @return user-displayable name
* @stable ICU 2.0
*/
- static UnicodeString& getDisplayName(const Locale& objectLocale,
+ static UnicodeString& U_EXPORT2 getDisplayName(const Locale& objectLocale,
const Locale& displayLocale,
UnicodeString& name);
* @return user-displayable name
* @stable ICU 2.0
*/
- static UnicodeString& getDisplayName(const Locale& objectLocale,
+ static UnicodeString& U_EXPORT2 getDisplayName(const Locale& objectLocale,
UnicodeString& name);
/**
*/
inline UBool isBufferClone(void);
+#if !UCONFIG_NO_SERVICE
/**
* Register a new break iterator of the indicated kind, to use in the given locale.
- * The break iterator will be adoped. Clones of the iterator will be returned
+ * The break iterator will be adopted. Clones of the iterator will be returned
* if a request for a break iterator of the given kind matches or falls back to
* this locale.
* @param toAdopt the BreakIterator instance to be adopted
* @param kind the type of iterator for which this instance is to be registered
* @param status the in/out status code, no special meanings are assigned
* @return a registry key that can be used to unregister this instance
- * @draft ICU 2.4
+ * @stable ICU 2.4
*/
- static URegistryKey registerInstance(BreakIterator* toAdopt, const Locale& locale, UBreakIteratorType kind, UErrorCode& status);
+ static URegistryKey U_EXPORT2 registerInstance(BreakIterator* toAdopt,
+ const Locale& locale,
+ UBreakIteratorType kind,
+ UErrorCode& status);
/**
* Unregister a previously-registered BreakIterator using the key returned from the
* @param key the registry key returned by a previous call to registerInstance
* @param status the in/out status code, no special meanings are assigned
* @return TRUE if the iterator for the key was successfully unregistered
- * @draft ICU 2.4
+ * @stable ICU 2.4
*/
- static UBool unregister(URegistryKey key, UErrorCode& status);
+ static UBool U_EXPORT2 unregister(URegistryKey key, UErrorCode& status);
/**
- * Return a StringEnumeration over the locales available at the time of the call,
+ * Return a StringEnumeration over the locales available at the time of the call,
* including registered locales.
* @return a StringEnumeration over the locales available at the time of the call
- * @draft ICU 2.4
+ * @stable ICU 2.4
*/
- static StringEnumeration* getAvailableLocales(void);
+ static StringEnumeration* U_EXPORT2 getAvailableLocales(void);
+#endif
- private:
- static BreakIterator* makeCharacterInstance(const Locale& loc, UErrorCode& status);
- static BreakIterator* makeWordInstance(const Locale& loc, UErrorCode& status);
- static BreakIterator* makeLineInstance(const Locale& loc, UErrorCode& status);
- static BreakIterator* makeSentenceInstance(const Locale& loc, UErrorCode& status);
- static BreakIterator* makeTitleInstance(const Locale& loc, UErrorCode& status);
+ /**
+ * Returns the locale for this break iterator. Two flavors are available: valid and
+ * actual locale.
+ * @stable ICU 2.8
+ */
+ Locale getLocale(ULocDataLocaleType type, UErrorCode& status) const;
- static BreakIterator* createInstance(const Locale& loc, UBreakIteratorType kind, UErrorCode& status);
+ /** Get the locale for this break iterator object. You can choose between valid and actual locale.
+ * @param type type of the locale we're looking for (valid or actual)
+ * @param status error code for the operation
+ * @return the locale
+ * @internal
+ */
+ const char *getLocaleID(ULocDataLocaleType type, UErrorCode& status) const;
+
+ private:
+ static BreakIterator* buildInstance(const Locale& loc, const char *type, int32_t kind, UErrorCode& status);
+ static BreakIterator* createInstance(const Locale& loc, int32_t kind, UErrorCode& status);
static BreakIterator* makeInstance(const Locale& loc, int32_t kind, UErrorCode& status);
friend class ICUBreakIteratorFactory;
UBool fBufferClone;
/** @internal */
BreakIterator (const BreakIterator &other) : UObject(other), fBufferClone(FALSE) {}
+
private:
+
+ /** @internal */
+ char actualLocale[ULOC_FULLNAME_CAPACITY];
+ char validLocale[ULOC_FULLNAME_CAPACITY];
+
/**
* The assignment operator has no real implementation.
* It's provided to make the compiler happy. Do not call.
*/
- BreakIterator& operator=(const BreakIterator&) { return *this; }
+ BreakIterator& operator=(const BreakIterator&);
};
inline UBool BreakIterator::isBufferClone()