[apple/icu.git] / icuSources / common / unicode / ubrk.h

/*
******************************************************************************
* Copyright (C) 1996-2007, International Business Machines Corporation and others.
* All Rights Reserved.
******************************************************************************
*/

#ifndef UBRK_H
#define UBRK_H

#include "unicode/utypes.h"
#include "unicode/uloc.h"
#include "unicode/utext.h"

/**
 * A text-break iterator, for use in C programs.
 *
 * The typedef below is wrapped in a UBRK_TYPEDEF_UBREAK_ITERATOR guard so that
 * it is emitted at most once, even if that macro was already defined before
 * this point.  The typedef is placed ahead of the UCONFIG_NO_BREAK_ITERATION
 * check, so the type name is declared regardless of that setting.
 */
#ifndef UBRK_TYPEDEF_UBREAK_ITERATOR
#   define UBRK_TYPEDEF_UBREAK_ITERATOR
    /**
     *  Opaque type representing an ICU break iterator object.
     *  It is declared as void, so client code only ever handles it
     *  through UBreakIterator pointers.
     *  @stable ICU 2.0
     */
    typedef void UBreakIterator;
#endif

#if !UCONFIG_NO_BREAK_ITERATION

#include "unicode/parseerr.h"

/**
 * \file
 * \brief C API: BreakIterator
 *
 * <h2> BreakIterator C API </h2>
 *
 * The BreakIterator C API defines functions for finding the location
 * of boundaries in text. A UBreakIterator maintains a current position
 * and scans over text, returning the index of characters
 * where boundaries occur.
 * <p>
 * Line boundary analysis determines where a text string can be broken
 * when line-wrapping. The mechanism correctly handles punctuation and
 * hyphenated words.
 * <p>
 * Sentence boundary analysis allows selection with correct
 * interpretation of periods within numbers and abbreviations, and
 * trailing punctuation marks such as quotation marks and parentheses.
 * <p>
 * Word boundary analysis is used by search and replace functions, as
 * well as within text editing applications that allow the user to
 * select words with a double click. Word selection provides correct
 * interpretation of punctuation marks within and following
 * words. Characters that are not part of a word, such as symbols or
 * punctuation marks, have word-breaks on both sides.
 * <p>
 * Character boundary analysis allows users to interact with
 * characters as they expect to, for example, when moving the cursor
 * through a text string. Character boundary analysis provides correct
 * navigation through character strings, regardless of how the
 * character is stored.  For example, an accented character might be
 * stored as a base character and a diacritical mark. What users
 * consider to be a character can differ between languages.
 * <p>
 * Title boundary analysis locates all positions,
 * typically starts of words, that should be set to Title Case
 * when title casing the text.
 * <p>
 * The text boundary positions are found according to the rules
 * described in Unicode Standard Annex #29, Text Boundaries, and
 * Unicode Standard Annex #14, Line Breaking Properties.  These
 * are available at http://www.unicode.org/reports/tr29/ and
 * http://www.unicode.org/reports/tr14/, respectively.
 * <p>
 * In addition to the plain C API defined in this header file, an
 * object oriented C++ API with equivalent functionality is defined in the
 * file brkiter.h.
 * <p>
 * Code snippets illustrating the use of the Break Iterator APIs
 * are available in the ICU User Guide,
 * http://icu-project.org/userguide/boundaryAnalysis.html
 * and in the sample program icu/source/samples/break/break.cpp
 */

/**
 * The possible types of text boundaries.
 * A value of this type is passed to ubrk_open() to select the kind of
 * boundary analysis the new iterator performs.
 * @stable ICU 2.0
 */
typedef enum UBreakIteratorType {
  /** Character breaks  @stable ICU 2.0 */
  UBRK_CHARACTER = 0,
  /** Word breaks @stable ICU 2.0 */
  UBRK_WORD = 1,
  /** Line breaks @stable ICU 2.0 */
  UBRK_LINE = 2,
  /** Sentence breaks @stable ICU 2.0 */
  UBRK_SENTENCE = 3,

#ifndef U_HIDE_DEPRECATED_API
  /**
   * Title Case breaks
   * The iterator created using this type locates title boundaries as described for
   * Unicode 3.2 only. For Unicode 4.0 and above title boundary iteration,
   * please use Word Boundary iterator.
   *
   * @deprecated ICU 2.8 Use the word break iterator for titlecasing for Unicode 4 and later.
   */
  UBRK_TITLE = 4,
#endif /* U_HIDE_DEPRECATED_API */
  /**
   * One more than the highest type value declared above.
   * The value stays 5 even when U_HIDE_DEPRECATED_API hides UBRK_TITLE.
   */
  UBRK_COUNT = 5
} UBreakIteratorType;

/**
 *  Value indicating all text boundaries have been returned.
 *  This header documents it as a possible result of ubrk_next(),
 *  ubrk_previous(), ubrk_preceding() and ubrk_following().
 *  @stable ICU 2.0
 */
#define UBRK_DONE ((int32_t) -1)


/**
 *  Enum constants for the word break tags returned by
 *  ubrk_getRuleStatus().  A range of values is defined for each category of
 *  word, to allow for further subdivisions of a category in future releases.
 *  Applications should check for tag values falling within the range, rather
 *  than for single individual values.
 *  <p>
 *  Each *_LIMIT constant has the same value as the lower bound of the next
 *  category, so a tag belongs to a category when
 *  lower bound <= tag < limit, for example
 *  (tag >= UBRK_WORD_NUMBER && tag < UBRK_WORD_NUMBER_LIMIT).
 *  @stable ICU 2.2
*/
typedef enum UWordBreak {
    /** Tag value for "words" that do not fit into any of the other categories.
     *  Includes spaces and most punctuation. */
    UBRK_WORD_NONE           = 0,
    /** Upper bound for tags for uncategorized words. */
    UBRK_WORD_NONE_LIMIT     = 100,
    /** Tag value for words that appear to be numbers, lower limit.    */
    UBRK_WORD_NUMBER         = 100,
    /** Tag value for words that appear to be numbers, upper limit.    */
    UBRK_WORD_NUMBER_LIMIT   = 200,
    /** Tag value for words that contain letters, excluding
     *  hiragana, katakana or ideographic characters, lower limit.    */
    UBRK_WORD_LETTER         = 200,
    /** Tag value for words containing letters, upper limit  */
    UBRK_WORD_LETTER_LIMIT   = 300,
    /** Tag value for words containing kana characters, lower limit */
    UBRK_WORD_KANA           = 300,
    /** Tag value for words containing kana characters, upper limit */
    UBRK_WORD_KANA_LIMIT     = 400,
    /** Tag value for words containing ideographic characters, lower limit */
    UBRK_WORD_IDEO           = 400,
    /** Tag value for words containing ideographic characters, upper limit */
    UBRK_WORD_IDEO_LIMIT     = 500
} UWordBreak;

/**
 *  Enum constants for the line break tags returned by ubrk_getRuleStatus().
 *  A range of values is defined for each category of
 *  line break, to allow for further subdivisions of a category in future releases.
 *  Applications should check for tag values falling within the range, rather
 *  than for single individual values.
 *  <p>
 *  UBRK_LINE_SOFT_LIMIT has the same value as UBRK_LINE_HARD, so a tag
 *  belongs to a category when lower bound <= tag < limit.
 *  @stable ICU 2.8
*/
typedef enum ULineBreakTag {
    /** Tag value for soft line breaks, positions at which a line break
      *  is acceptable but not required                */
    UBRK_LINE_SOFT            = 0,
    /** Upper bound for soft line breaks.              */
    UBRK_LINE_SOFT_LIMIT      = 100,
    /** Tag value for a hard, or mandatory line break  */
    UBRK_LINE_HARD            = 100,
    /** Upper bound for hard line breaks.              */
    UBRK_LINE_HARD_LIMIT      = 200
} ULineBreakTag;


/**
 *  Enum constants for the sentence break tags returned by ubrk_getRuleStatus().
 *  A range of values is defined for each category of
 *  sentence, to allow for further subdivisions of a category in future releases.
 *  Applications should check for tag values falling within the range, rather
 *  than for single individual values.
 *  <p>
 *  UBRK_SENTENCE_TERM_LIMIT has the same value as UBRK_SENTENCE_SEP, so a tag
 *  belongs to a category when lower bound <= tag < limit.
 *  @stable ICU 2.8
*/
typedef enum USentenceBreakTag {
    /** Tag value for sentences ending with a sentence terminator
      * ('.', '?', '!', etc.) character, possibly followed by a
      * hard separator (CR, LF, PS, etc.)
      */
    UBRK_SENTENCE_TERM       = 0,
    /** Upper bound for tags for sentences ended by sentence terminators.    */
    UBRK_SENTENCE_TERM_LIMIT = 100,
    /** Tag value for sentences that do not contain an ending
      * sentence terminator ('.', '?', '!', etc.) character, but
      * are ended only by a hard separator (CR, LF, PS, etc.) or end of input.
      */
    UBRK_SENTENCE_SEP        = 100,
    /** Upper bound for tags for sentences ended by a separator.              */
    UBRK_SENTENCE_SEP_LIMIT  = 200
} USentenceBreakTag;


/**
 * Open a new UBreakIterator for locating text boundaries for a specified locale.
 * A UBreakIterator may be used for detecting character, line, word,
 * and sentence breaks in text.
 * An iterator that is no longer needed is closed with ubrk_close().
 * @param type The type of UBreakIterator to open: one of UBRK_CHARACTER, UBRK_WORD,
 * UBRK_LINE, UBRK_SENTENCE (UBRK_TITLE is also declared, but deprecated).
 * @param locale The locale specifying the text-breaking conventions.
 * @param text The text to be iterated over.
 * @param textLength The number of characters in text, or -1 if null-terminated.
 * @param status A UErrorCode to receive any errors.
 * @return A UBreakIterator for the specified locale.
 * @see ubrk_openRules
 * @see ubrk_close
 * @stable ICU 2.0
 */
U_STABLE UBreakIterator* U_EXPORT2
ubrk_open(UBreakIteratorType type,
      const char *locale,
      const UChar *text,
      int32_t textLength,
      UErrorCode *status);

/**
 * Open a new UBreakIterator for locating text boundaries using specified breaking rules.
 * The rule syntax is not described in this header.
 * An iterator that is no longer needed is closed with ubrk_close().
 * @param rules A set of rules specifying the text breaking conventions.
 * @param rulesLength The number of characters in rules, or -1 if null-terminated.
 * @param text The text to be iterated over.  May be null, in which case ubrk_setText() is
 *        used to specify the text to be iterated.
 * @param textLength The number of characters in text, or -1 if null-terminated.
 * @param parseErr   Receives position and context information for any syntax errors
 *                   detected while parsing the rules.
 * @param status A UErrorCode to receive any errors.
 * @return A UBreakIterator for the specified rules.
 * @see ubrk_open
 * @see ubrk_close
 * @stable ICU 2.2
 */
U_STABLE UBreakIterator* U_EXPORT2
ubrk_openRules(const UChar     *rules,
               int32_t         rulesLength,
               const UChar     *text,
               int32_t          textLength,
               UParseError     *parseErr,
               UErrorCode      *status);

/**
 * Thread-safe cloning operation.
 * @param bi iterator to be cloned
 * @param stackBuffer user-allocated space for the new clone. If NULL, new memory will be allocated.
 *  If the buffer is not large enough, new memory will be allocated.
 *  Clients can supply a buffer of U_BRK_SAFECLONE_BUFFERSIZE bytes. This will probably be enough
 *  to avoid memory allocations.
 * @param pBufferSize pointer to the size of the allocated space.
 *  If *pBufferSize == 0, a sufficient size for use in cloning will
 *  be returned ('pre-flighting').
 *  If *pBufferSize is not enough for a stack-based safe clone,
 *  new memory will be allocated.
 * @param status indicates whether the operation succeeded or there were errors.
 *  An informational status value, U_SAFECLONE_ALLOCATED_ERROR, is used if any allocations were necessary.
 * @return pointer to the new clone
 * @see U_BRK_SAFECLONE_BUFFERSIZE
 * @stable ICU 2.0
 */
U_STABLE UBreakIterator * U_EXPORT2
ubrk_safeClone(
          const UBreakIterator *bi,
          void *stackBuffer,
          int32_t *pBufferSize,
          UErrorCode *status);

/**
  * A recommended size (in bytes) for the memory buffer to be passed to ubrk_safeClone().
  * @stable ICU 2.0
  */
#define U_BRK_SAFECLONE_BUFFERSIZE 512

/**
 * Close a UBreakIterator.
 * Once closed, a UBreakIterator may no longer be used.
 * @param bi The break iterator to close.
 * @see ubrk_open
 * @see ubrk_openRules
 * @stable ICU 2.0
 */
U_STABLE void U_EXPORT2
ubrk_close(UBreakIterator *bi);

/**
 * Sets an existing iterator to point to a new piece of text.
 * @param bi The iterator to use
 * @param text The text to be set
 * @param textLength The length of the text.
 *        NOTE(review): ubrk_open() documents -1 as meaning null-terminated text;
 *        confirm whether the same convention applies here.
 * @param status The error code
 * @see ubrk_setUText
 * @stable ICU 2.0
 */
U_STABLE void U_EXPORT2
ubrk_setText(UBreakIterator* bi,
             const UChar*    text,
             int32_t         textLength,
             UErrorCode*     status);


/**
 * Sets an existing iterator to point to a new piece of text, supplied as a UText.
 * @param bi The iterator to use
 * @param text The text to be set.
 *             This function makes a shallow clone of the supplied UText.  This means
 *             that the caller is free to immediately close or otherwise reuse the
 *             UText that was passed as a parameter, but that the underlying text itself
 *             must not be altered while being referenced by the break iterator.
 * @param status The error code
 * @see ubrk_setText
 * @stable ICU 3.4
 */
U_STABLE void U_EXPORT2
ubrk_setUText(UBreakIterator* bi,
             UText*          text,
             UErrorCode*     status);


/**
 * Determine the most recently returned text boundary.
 * The iterator is passed as a pointer to const.
 *
 * @param bi The break iterator to use.
 * @return The character index most recently returned by \ref ubrk_next, \ref ubrk_previous,
 * \ref ubrk_first, or \ref ubrk_last.
 * @stable ICU 2.0
 */
U_STABLE int32_t U_EXPORT2
ubrk_current(const UBreakIterator *bi);

/**
 * Determine the text boundary following the current text boundary.
 * The index returned here is what a subsequent ubrk_current() call reports.
 *
 * @param bi The break iterator to use.
 * @return The character index of the next text boundary, or UBRK_DONE
 * if all text boundaries have been returned.
 * @see ubrk_previous
 * @see ubrk_current
 * @stable ICU 2.0
 */
U_STABLE int32_t U_EXPORT2
ubrk_next(UBreakIterator *bi);

/**
 * Determine the text boundary preceding the current text boundary.
 * The index returned here is what a subsequent ubrk_current() call reports.
 *
 * @param bi The break iterator to use.
 * @return The character index of the preceding text boundary, or UBRK_DONE
 * if all text boundaries have been returned.
 * @see ubrk_next
 * @see ubrk_current
 * @stable ICU 2.0
 */
U_STABLE int32_t U_EXPORT2
ubrk_previous(UBreakIterator *bi);

/**
 * Determine the index of the first character in the text being scanned.
 * This is not always the same as index 0 of the text.
 * The index returned here is what a subsequent ubrk_current() call reports.
 * @param bi The break iterator to use.
 * @return The character index of the first character in the text being scanned.
 * @see ubrk_last
 * @see ubrk_current
 * @stable ICU 2.0
 */
U_STABLE int32_t U_EXPORT2
ubrk_first(UBreakIterator *bi);

/**
 * Determine the index immediately <EM>beyond</EM> the last character in the text being
 * scanned.
 * This is not the same as the index of the last character.
 * The index returned here is what a subsequent ubrk_current() call reports.
 * @param bi The break iterator to use.
 * @return The character offset immediately <EM>beyond</EM> the last character in the
 * text being scanned.
 * @see ubrk_first
 * @see ubrk_current
 * @stable ICU 2.0
 */
U_STABLE int32_t U_EXPORT2
ubrk_last(UBreakIterator *bi);

/**
 * Determine the text boundary preceding the specified offset.
 * The result is either UBRK_DONE or a boundary smaller than offset.
 * @param bi The break iterator to use.
 * @param offset The offset to begin scanning.
 * @return The text boundary preceding offset, or UBRK_DONE.
 * @see ubrk_following
 * @stable ICU 2.0
 */
U_STABLE int32_t U_EXPORT2
ubrk_preceding(UBreakIterator *bi,
           int32_t offset);

/**
 * Determine the text boundary following the specified offset.
 * The result is either UBRK_DONE or a boundary greater than offset.
 * @param bi The break iterator to use.
 * @param offset The offset to begin scanning.
 * @return The text boundary following offset, or UBRK_DONE.
 * @see ubrk_preceding
 * @stable ICU 2.0
 */
U_STABLE int32_t U_EXPORT2
ubrk_following(UBreakIterator *bi,
           int32_t offset);

/**
 * Get a locale for which text breaking information is available.
 * A UBreakIterator in a locale returned by this function will perform the correct
 * text breaking for the locale.
 * @param index The index of the desired locale.  ubrk_countAvailable() gives the
 *        number of locales that can be requested.
 * @return A locale for which text breaking information is available, or 0 (a null
 *         pointer) if none.
 * @see ubrk_countAvailable
 * @stable ICU 2.0
 */
U_STABLE const char* U_EXPORT2
ubrk_getAvailable(int32_t index);

/**
 * Determine how many locales have text breaking information available.
 * This function is most useful for determining the loop ending condition for
 * calls to \ref ubrk_getAvailable.
 * @return The number of locales for which text breaking information is available.
 * @see ubrk_getAvailable
 * @stable ICU 2.0
 */
U_STABLE int32_t U_EXPORT2
ubrk_countAvailable(void);


/**
 * Returns true if the specified position is a boundary position.  As a side
 * effect, leaves the iterator pointing to the first boundary position at
 * or after "offset".
 * @param bi The break iterator to use.
 * @param offset The offset to check.
 * @return True if "offset" is a boundary position.
 * @stable ICU 2.0
 */
U_STABLE  UBool U_EXPORT2
ubrk_isBoundary(UBreakIterator *bi, int32_t offset);

/**
 * Return the status from the break rule that determined the most recently
 * returned break position.  The values appear in the rule source
 * within brackets, {123}, for example.  For rules that do not specify a
 * status, a default value of 0 is returned.
 * <p>
 * For word break iterators, the possible values are defined in enum UWordBreak.
 * This header also declares enum ULineBreakTag and enum USentenceBreakTag for the
 * tags of line and sentence break iterators.
 * @param bi The break iterator to use.
 * @return The status of the rule that determined the most recently returned break
 *         position, or 0 if that rule does not specify a status.
 * @see ubrk_getRuleStatusVec
 * @stable ICU 2.2
 */
U_STABLE  int32_t U_EXPORT2
ubrk_getRuleStatus(UBreakIterator *bi);

/**
 * Get the statuses from the break rules that determined the most recently
 * returned break position.  The values appear in the rule source
 * within brackets, {123}, for example.  The default status value for rules
 * that do not explicitly provide one is zero.
 * <p>
 * For word break iterators, the possible values are defined in enum UWordBreak.
 * @param bi        The break iterator to use
 * @param fillInVec an array to be filled in with the status values.
 * @param capacity  the length of the supplied vector.  A length of zero causes
 *                  the function to return the number of status values, in the
 *                  normal way, without attempting to store any values.
 * @param status    receives error codes.
 * @return          The number of rule status values from rules that determined
 *                  the most recent boundary returned by the break iterator.
 * @see ubrk_getRuleStatus
 * @stable ICU 3.0
 */
U_STABLE  int32_t U_EXPORT2
ubrk_getRuleStatusVec(UBreakIterator *bi, int32_t *fillInVec, int32_t capacity, UErrorCode *status);

/**
 * Return the locale of the break iterator. The caller chooses between the valid
 * and the actual locale through the type argument.
 * @param bi The break iterator to query; passed as a pointer to const.
 * @param type The locale type to return (valid or actual).
 * @param status The error code.
 * @return The locale string.
 * @stable ICU 2.8
 */
U_STABLE const char* U_EXPORT2
ubrk_getLocaleByType(const UBreakIterator *bi, ULocDataLocaleType type, UErrorCode* status);


#endif /* #if !UCONFIG_NO_BREAK_ITERATION */

#endif
Commit	Line	Data
b75a7d8f	1	/*
73c04bcf	2	******************************************************************************
46f4442e	3	* Copyright (C) 1996-2007, International Business Machines Corporation and others.
73c04bcf A	4	* All Rights Reserved.
73c04bcf A	5	******************************************************************************
b75a7d8f A	6	*/
	7
	8	#ifndef UBRK_H
	9	#define UBRK_H
	10
	11	#include "unicode/utypes.h"
374ca955	12	#include "unicode/uloc.h"
73c04bcf	13	#include "unicode/utext.h"
b75a7d8f A	14
	15	/**
	16	* A text-break iterator.
	17	* For usage in C programs.
	18	*/
	19	#ifndef UBRK_TYPEDEF_UBREAK_ITERATOR
	20	# define UBRK_TYPEDEF_UBREAK_ITERATOR
	21	/**
	22	* Opaque type representing an ICU Break iterator object.
	23	* @stable ICU 2.0
	24	*/
	25	typedef void UBreakIterator;
	26	#endif
	27
	28	#if !UCONFIG_NO_BREAK_ITERATION
	29
	30	#include "unicode/parseerr.h"
	31
	32	/**
	33	* \file
	34	* \brief C API: BreakIterator
	35	*
	36	* <h2> BreakIterator C API </h2>
	37	*
	38	* The BreakIterator C API defines methods for finding the location
	39	* of boundaries in text. Pointer to a UBreakIterator maintain a
	40	* current position and scan over text returning the index of characters
	41	* where boundaries occur.
73c04bcf	42	* <p>
b75a7d8f A	43	* Line boundary analysis determines where a text string can be broken
	44	* when line-wrapping. The mechanism correctly handles punctuation and
	45	* hyphenated words.
73c04bcf	46	* <p>
b75a7d8f A	47	* Sentence boundary analysis allows selection with correct
	48	* interpretation of periods within numbers and abbreviations, and
	49	* trailing punctuation marks such as quotation marks and parentheses.
73c04bcf	50	* <p>
b75a7d8f A	51	* Word boundary analysis is used by search and replace functions, as
	52	* well as within text editing applications that allow the user to
	53	* select words with a double click. Word selection provides correct
	54	* interpretation of punctuation marks within and following
	55	* words. Characters that are not part of a word, such as symbols or
	56	* punctuation marks, have word-breaks on both sides.
73c04bcf	57	* <p>
b75a7d8f A	58	* Character boundary analysis allows users to interact with
	59	* characters as they expect to, for example, when moving the cursor
	60	* through a text string. Character boundary analysis provides correct
	61	* navigation of through character strings, regardless of how the
	62	* character is stored. For example, an accented character might be
	63	* stored as a base character and a diacritical mark. What users
	64	* consider to be a character can differ between languages.
73c04bcf	65	* <p>
b75a7d8f A	66	* Title boundary analysis locates all positions,
	67	* typically starts of words, that should be set to Title Case
	68	* when title casing the text.
73c04bcf A	69	* <p>
	70	* The text boundary positions are found according to the rules
	71	* described in Unicode Standard Annex #29, Text Boundaries, and
	72	* Unicode Standard Annex #14, Line Breaking Properties. These
	73	* are available at http://www.unicode.org/reports/tr14/ and
	74	* http://www.unicode.org/reports/tr29/.
	75	* <p>
	76	* In addition to the plain C API defined in this header file, an
	77	* object oriented C++ API with equivalent functionality is defined in the
	78	* file brkiter.h.
	79	* <p>
	80	* Code snippits illustrating the use of the Break Iterator APIs
46f4442e A	81	* are available in the ICU User Guide,
46f4442e A	82	* http://icu-project.org/userguide/boundaryAnalysis.html
73c04bcf	83	* and in the sample program icu/source/samples/break/break.cpp"
b75a7d8f A	84	*/
	85
	86	/** The possible types of text boundaries. @stable ICU 2.0 */
	87	typedef enum UBreakIteratorType {
	88	/** Character breaks @stable ICU 2.0 */
73c04bcf	89	UBRK_CHARACTER = 0,
b75a7d8f	90	/** Word breaks @stable ICU 2.0 */
73c04bcf	91	UBRK_WORD = 1,
b75a7d8f	92	/** Line breaks @stable ICU 2.0 */
73c04bcf	93	UBRK_LINE = 2,
b75a7d8f	94	/** Sentence breaks @stable ICU 2.0 */
73c04bcf	95	UBRK_SENTENCE = 3,
374ca955 A	96
374ca955 A	97	#ifndef U_HIDE_DEPRECATED_API
46f4442e A	98	/**
	99	* Title Case breaks
	100	* The iterator created using this type locates title boundaries as described for
b75a7d8f	101	* Unicode 3.2 only. For Unicode 4.0 and above title boundary iteration,
374ca955	102	* please use Word Boundary iterator.
b75a7d8f	103	*
374ca955	104	* @deprecated ICU 2.8 Use the word break iterator for titlecasing for Unicode 4 and later.
b75a7d8f	105	*/
73c04bcf	106	UBRK_TITLE = 4,
374ca955	107	#endif /* U_HIDE_DEPRECATED_API */
73c04bcf	108	UBRK_COUNT = 5
b75a7d8f A	109	} UBreakIteratorType;
	110
	111	/** Value indicating all text boundaries have been returned.
46f4442e	112	* @stable ICU 2.0
b75a7d8f A	113	*/
	114	#define UBRK_DONE ((int32_t) -1)
	115
	116
	117	/**
	118	* Enum constants for the word break tags returned by
	119	* getRuleStatus(). A range of values is defined for each category of
	120	* word, to allow for further subdivisions of a category in future releases.
	121	* Applications should check for tag values falling within the range, rather
	122	* than for single individual values.
374ca955	123	* @stable ICU 2.2
b75a7d8f A	124	*/
b75a7d8f A	125	typedef enum UWordBreak {
46f4442e	126	/** Tag value for "words" that do not fit into any of other categories.
b75a7d8f A	127	* Includes spaces and most punctuation. */
	128	UBRK_WORD_NONE = 0,
	129	/** Upper bound for tags for uncategorized words. */
	130	UBRK_WORD_NONE_LIMIT = 100,
	131	/** Tag value for words that appear to be numbers, lower limit. */
	132	UBRK_WORD_NUMBER = 100,
	133	/** Tag value for words that appear to be numbers, upper limit. */
	134	UBRK_WORD_NUMBER_LIMIT = 200,
	135	/** Tag value for words that contain letters, excluding
	136	* hiragana, katakana or ideographic characters, lower limit. */
	137	UBRK_WORD_LETTER = 200,
	138	/** Tag value for words containing letters, upper limit */
	139	UBRK_WORD_LETTER_LIMIT = 300,
	140	/** Tag value for words containing kana characters, lower limit */
	141	UBRK_WORD_KANA = 300,
	142	/** Tag value for words containing kana characters, upper limit */
	143	UBRK_WORD_KANA_LIMIT = 400,
	144	/** Tag value for words containing ideographic characters, lower limit */
	145	UBRK_WORD_IDEO = 400,
	146	/** Tag value for words containing ideographic characters, upper limit */
	147	UBRK_WORD_IDEO_LIMIT = 500
	148	} UWordBreak;
	149
374ca955 A	150	/**
	151	* Enum constants for the line break tags returned by getRuleStatus().
	152	* A range of values is defined for each category of
	153	* word, to allow for further subdivisions of a category in future releases.
	154	* Applications should check for tag values falling within the range, rather
	155	* than for single individual values.
73c04bcf	156	* @stable ICU 2.8
374ca955 A	157	*/
	158	typedef enum ULineBreakTag {
	159	/** Tag value for soft line breaks, positions at which a line break
	160	* is acceptable but not required */
	161	UBRK_LINE_SOFT = 0,
	162	/** Upper bound for soft line breaks. */
	163	UBRK_LINE_SOFT_LIMIT = 100,
	164	/** Tag value for a hard, or mandatory line break */
	165	UBRK_LINE_HARD = 100,
	166	/** Upper bound for hard line breaks. */
	167	UBRK_LINE_HARD_LIMIT = 200
	168	} ULineBreakTag;
	169
	170
	171
	172	/**
	173	* Enum constants for the sentence break tags returned by getRuleStatus().
	174	* A range of values is defined for each category of
	175	* sentence, to allow for further subdivisions of a category in future releases.
	176	* Applications should check for tag values falling within the range, rather
	177	* than for single individual values.
73c04bcf	178	* @stable ICU 2.8
374ca955 A	179	*/
	180	typedef enum USentenceBreakTag {
	181	/** Tag value for for sentences ending with a sentence terminator
	182	* ('.', '?', '!', etc.) character, possibly followed by a
	183	* hard separator (CR, LF, PS, etc.)
	184	*/
	185	UBRK_SENTENCE_TERM = 0,
	186	/** Upper bound for tags for sentences ended by sentence terminators. */
	187	UBRK_SENTENCE_TERM_LIMIT = 100,
	188	/** Tag value for for sentences that do not contain an ending
46f4442e	189	* sentence terminator ('.', '?', '!', etc.) character, but
374ca955 A	190	* are ended only by a hard separator (CR, LF, PS, etc.) or end of input.
	191	*/
	192	UBRK_SENTENCE_SEP = 100,
	193	/** Upper bound for tags for sentences ended by a separator. */
	194	UBRK_SENTENCE_SEP_LIMIT = 200
	195	/** Tag value for a hard, or mandatory line break */
	196	} USentenceBreakTag;
	197
b75a7d8f A	198
	199	/**
	200	* Open a new UBreakIterator for locating text boundaries for a specified locale.
	201	* A UBreakIterator may be used for detecting character, line, word,
	202	* and sentence breaks in text.
	203	* @param type The type of UBreakIterator to open: one of UBRK_CHARACTER, UBRK_WORD,
	204	* UBRK_LINE, UBRK_SENTENCE
	205	* @param locale The locale specifying the text-breaking conventions.
	206	* @param text The text to be iterated over.
	207	* @param textLength The number of characters in text, or -1 if null-terminated.
	208	* @param status A UErrorCode to receive any errors.
	209	* @return A UBreakIterator for the specified locale.
	210	* @see ubrk_openRules
	211	* @stable ICU 2.0
	212	*/
374ca955	213	U_STABLE UBreakIterator* U_EXPORT2
b75a7d8f A	214	ubrk_open(UBreakIteratorType type,
	215	const char *locale,
	216	const UChar *text,
	217	int32_t textLength,
	218	UErrorCode *status);
	219
	220	/**
	221	* Open a new UBreakIterator for locating text boundaries using specified breaking rules.
	222	* The rule syntax is ... (TBD)
	223	* @param rules A set of rules specifying the text breaking conventions.
	224	* @param rulesLength The number of characters in rules, or -1 if null-terminated.
	225	* @param text The text to be iterated over. May be null, in which case ubrk_setText() is
	226	* used to specify the text to be iterated.
	227	* @param textLength The number of characters in text, or -1 if null-terminated.
	228	* @param parseErr Receives position and context information for any syntax errors
	229	* detected while parsing the rules.
	230	* @param status A UErrorCode to receive any errors.
	231	* @return A UBreakIterator for the specified rules.
	232	* @see ubrk_open
374ca955	233	* @stable ICU 2.2
b75a7d8f	234	*/
374ca955	235	U_STABLE UBreakIterator* U_EXPORT2
b75a7d8f A	236	ubrk_openRules(const UChar *rules,
	237	int32_t rulesLength,
	238	const UChar *text,
	239	int32_t textLength,
	240	UParseError *parseErr,
	241	UErrorCode *status);
	242
	243	/**
	244	* Thread safe cloning operation
	245	* @param bi iterator to be cloned
	246	* @param stackBuffer user allocated space for the new clone. If NULL new memory will be allocated.
	247	* If buffer is not large enough, new memory will be allocated.
	248	* Clients can use the U_BRK_SAFECLONE_BUFFERSIZE. This will probably be enough to avoid memory allocations.
	249	* @param pBufferSize pointer to size of allocated space.
	250	* If *pBufferSize == 0, a sufficient size for use in cloning will
	251	* be returned ('pre-flighting')
	252	* If *pBufferSize is not enough for a stack-based safe clone,
	253	* new memory will be allocated.
	254	* @param status to indicate whether the operation went on smoothly or there were errors
	255	* An informational status value, U_SAFECLONE_ALLOCATED_ERROR, is used if any allocations were necessary.
	256	* @return pointer to the new clone
	257	* @stable ICU 2.0
	258	*/
374ca955	259	U_STABLE UBreakIterator * U_EXPORT2
b75a7d8f A	260	ubrk_safeClone(
	261	const UBreakIterator *bi,
	262	void *stackBuffer,
	263	int32_t *pBufferSize,
	264	UErrorCode *status);
	265
	266	/**
	267	* A recommended size (in bytes) for the memory buffer to be passed to ubrk_saveClone().
	268	* @stable ICU 2.0
	269	*/
	270	#define U_BRK_SAFECLONE_BUFFERSIZE 512
	271
	272	/**
	273	* Close a UBreakIterator.
	274	* Once closed, a UBreakIterator may no longer be used.
	275	* @param bi The break iterator to close.
	276	* @stable ICU 2.0
	277	*/
374ca955	278	U_STABLE void U_EXPORT2
b75a7d8f A	279	ubrk_close(UBreakIterator *bi);
	280
	281	/**
	282	* Sets an existing iterator to point to a new piece of text
	283	* @param bi The iterator to use
	284	* @param text The text to be set
	285	* @param textLength The length of the text
	286	* @param status The error code
	287	* @stable ICU 2.0
	288	*/
374ca955	289	U_STABLE void U_EXPORT2
b75a7d8f A	290	ubrk_setText(UBreakIterator* bi,
	291	const UChar* text,
	292	int32_t textLength,
	293	UErrorCode* status);
	294
73c04bcf A	295
	296	/**
	297	* Sets an existing iterator to point to a new piece of text
	298	* @param bi The iterator to use
46f4442e A	299	* @param text The text to be set.
	300	* This function makes a shallow clone of the supplied UText. This means
	301	* that the caller is free to immediately close or otherwise reuse the
	302	* UText that was passed as a parameter, but that the underlying text itself
	303	* must not be altered while being referenced by the break iterator.
73c04bcf	304	* @param status The error code
46f4442e	305	* @stable ICU 3.4
73c04bcf	306	*/
46f4442e	307	U_STABLE void U_EXPORT2
73c04bcf A	308	ubrk_setUText(UBreakIterator* bi,
	309	UText* text,
	310	UErrorCode* status);
	311
	312
	313
b75a7d8f A	314	/**
	315	* Determine the most recently-returned text boundary.
	316	*
	317	* @param bi The break iterator to use.
374ca955 A	318	* @return The character index most recently returned by \ref ubrk_next, \ref ubrk_previous,
374ca955 A	319	* \ref ubrk_first, or \ref ubrk_last.
b75a7d8f A	320	* @stable ICU 2.0
b75a7d8f A	321	*/
374ca955	322	U_STABLE int32_t U_EXPORT2
b75a7d8f A	323	ubrk_current(const UBreakIterator *bi);
	324
	325	/**
	326	* Determine the text boundary following the current text boundary.
	327	*
	328	* @param bi The break iterator to use.
	329	* @return The character index of the next text boundary, or UBRK_DONE
	330	* if all text boundaries have been returned.
	331	* @see ubrk_previous
	332	* @stable ICU 2.0
	333	*/
374ca955	334	U_STABLE int32_t U_EXPORT2
b75a7d8f A	335	ubrk_next(UBreakIterator *bi);
	336
	337	/**
	338	* Determine the text boundary preceding the current text boundary.
	339	*
	340	* @param bi The break iterator to use.
	341	* @return The character index of the preceding text boundary, or UBRK_DONE
	342	* if all text boundaries have been returned.
	343	* @see ubrk_next
	344	* @stable ICU 2.0
	345	*/
374ca955	346	U_STABLE int32_t U_EXPORT2
b75a7d8f A	347	ubrk_previous(UBreakIterator *bi);
	348
	349	/**
	350	* Determine the index of the first character in the text being scanned.
	351	* This is not always the same as index 0 of the text.
	352	* @param bi The break iterator to use.
	353	* @return The character index of the first character in the text being scanned.
	354	* @see ubrk_last
	355	* @stable ICU 2.0
	356	*/
374ca955	357	U_STABLE int32_t U_EXPORT2
b75a7d8f A	358	ubrk_first(UBreakIterator *bi);
	359
	360	/**
	361	* Determine the index immediately <EM>beyond</EM> the last character in the text being
	362	* scanned.
	363	* This is not the same as the last character.
	364	* @param bi The break iterator to use.
	365	* @return The character offset immediately <EM>beyond</EM> the last character in the
	366	* text being scanned.
	367	* @see ubrk_first
	368	* @stable ICU 2.0
	369	*/
374ca955	370	U_STABLE int32_t U_EXPORT2
b75a7d8f A	371	ubrk_last(UBreakIterator *bi);
	372
	373	/**
	374	* Determine the text boundary preceding the specified offset.
	375	* The value returned is always smaller than offset, or UBRK_DONE.
	376	* @param bi The break iterator to use.
	377	* @param offset The offset to begin scanning.
	378	* @return The text boundary preceding offset, or UBRK_DONE.
	379	* @see ubrk_following
	380	* @stable ICU 2.0
	381	*/
374ca955	382	U_STABLE int32_t U_EXPORT2
b75a7d8f A	383	ubrk_preceding(UBreakIterator *bi,
	384	int32_t offset);
	385
	386	/**
	387	* Determine the text boundary following the specified offset.
	388	* The value returned is always greater than offset, or UBRK_DONE.
	389	* @param bi The break iterator to use.
	390	* @param offset The offset to begin scanning.
	391	* @return The text boundary following offset, or UBRK_DONE.
	392	* @see ubrk_preceding
	393	* @stable ICU 2.0
	394	*/
374ca955	395	U_STABLE int32_t U_EXPORT2
b75a7d8f A	396	ubrk_following(UBreakIterator *bi,
	397	int32_t offset);
	398
	399	/**
	400	* Get a locale for which text breaking information is available.
	401	* A UBreakIterator in a locale returned by this function will perform the correct
	402	* text breaking for the locale.
	403	* @param index The index of the desired locale.
	404	* @return A locale for which number text breaking information is available, or 0 if none.
	405	* @see ubrk_countAvailable
	406	* @stable ICU 2.0
	407	*/
374ca955	408	U_STABLE const char* U_EXPORT2
b75a7d8f A	409	ubrk_getAvailable(int32_t index);
	410
	411	/**
	412	* Determine how many locales have text breaking information available.
	413	* This function is most useful as determining the loop ending condition for
374ca955	414	* calls to \ref ubrk_getAvailable.
b75a7d8f A	415	* @return The number of locales for which text breaking information is available.
	416	* @see ubrk_getAvailable
	417	* @stable ICU 2.0
	418	*/
374ca955	419	U_STABLE int32_t U_EXPORT2
b75a7d8f A	420	ubrk_countAvailable(void);
	421
	422
	423	/**
	424	* Returns true if the specfied position is a boundary position. As a side
	425	* effect, leaves the iterator pointing to the first boundary position at
	426	* or after "offset".
	427	* @param bi The break iterator to use.
	428	* @param offset the offset to check.
	429	* @return True if "offset" is a boundary position.
	430	* @stable ICU 2.0
	431	*/
374ca955	432	U_STABLE UBool U_EXPORT2
b75a7d8f A	433	ubrk_isBoundary(UBreakIterator *bi, int32_t offset);
	434
	435	/**
	436	* Return the status from the break rule that determined the most recently
	437	* returned break position. The values appear in the rule source
	438	* within brackets, {123}, for example. For rules that do not specify a
	439	* status, a default value of 0 is returned.
	440	* <p>
	441	* For word break iterators, the possible values are defined in enum UWordBreak.
374ca955	442	* @stable ICU 2.2
b75a7d8f	443	*/
374ca955	444	U_STABLE int32_t U_EXPORT2
b75a7d8f A	445	ubrk_getRuleStatus(UBreakIterator *bi);
b75a7d8f A	446
374ca955 A	447	/**
	448	* Get the statuses from the break rules that determined the most recently
	449	* returned break position. The values appear in the rule source
	450	* within brackets, {123}, for example. The default status value for rules
	451	* that do not explicitly provide one is zero.
	452	* <p>
	453	* For word break iterators, the possible values are defined in enum UWordBreak.
	454	* @param bi The break iterator to use
46f4442e	455	* @param fillInVec an array to be filled in with the status values.
374ca955 A	456	* @param capacity the length of the supplied vector. A length of zero causes
	457	* the function to return the number of status values, in the
	458	* normal way, without attemtping to store any values.
46f4442e A	459	* @param status receives error codes.
46f4442e A	460	* @return The number of rule status values from rules that determined
374ca955	461	* the most recent boundary returned by the break iterator.
73c04bcf	462	* @stable ICU 3.0
374ca955	463	*/
73c04bcf	464	U_STABLE int32_t U_EXPORT2
374ca955 A	465	ubrk_getRuleStatusVec(UBreakIterator bi, int32_t fillInVec, int32_t capacity, UErrorCode *status);
	466
	467	/**
	468	* Return the locale of the break iterator. You can choose between the valid and
	469	* the actual locale.
	470	* @param bi break iterator
	471	* @param type locale type (valid or actual)
	472	* @param status error code
	473	* @return locale string
73c04bcf	474	* @stable ICU 2.8
374ca955	475	*/
73c04bcf	476	U_STABLE const char* U_EXPORT2
374ca955 A	477	ubrk_getLocaleByType(const UBreakIterator bi, ULocDataLocaleType type, UErrorCode status);
	478
	479
b75a7d8f A	480	#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
	481
	482	#endif