ICU-66108.tar.gz

[apple/icu.git] / icuSources / common / unicode / ubrk.h
diff --git a/icuSources/common/unicode/ubrk.h b/icuSources/common/unicode/ubrk.h

index 2506540dfb572137cf82d8abcf94942b5b2c295b..826e1e9fee459bcb015e6704f1fdd7b10a66638c 100644 (file)
--- a/icuSources/common/unicode/ubrk.h
+++ b/icuSources/common/unicode/ubrk.h
@@ -1,6 +1,8 @@
+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
  /*
  ******************************************************************************
-* Copyright (C) 1996-2013, International Business Machines Corporation and others.
+* Copyright (C) 1996-2015, International Business Machines Corporation and others.
  * All Rights Reserved.
  ******************************************************************************
  */
@@ -45,10 +47,20 @@
   * when line-wrapping. The mechanism correctly handles punctuation and
   * hyphenated words.
   * <p>
+ * Note: The locale keyword "lb" can be used to modify line break
+ * behavior according to the CSS level 3 line-break options, see
+ * <http://dev.w3.org/csswg/css-text/#line-breaking>. For example:
+ * "ja@lb=strict", "zh@lb=loose".
+ * <p>
   * Sentence boundary analysis allows selection with correct
   * interpretation of periods within numbers and abbreviations, and
   * trailing punctuation marks such as quotation marks and parentheses.
   * <p>
+ * Note: The locale keyword "ss" can be used to enable use of
+ * segmentation suppression data (preventing breaks in English after
+ * abbreviations such as "Mr." or "Est.", for example), as follows:
+ * "en@ss=standard".
+ * <p>
   * Word boundary analysis is used by search and replace functions, as
   * well as within text editing applications that allow the user to
   * select words with a double click. Word selection provides correct
@@ -60,7 +72,7 @@
   * "Extended Grapheme Clusters", which are groupings of codepoints
   * that should be treated as character-like units for many text operations.
   * Please see Unicode Standard Annex #29, Unicode Text Segmentation,
- * http://www.unicode.org/reports/tr29/ for additional information 
+ * http://www.unicode.org/reports/tr29/ for additional information
   * on grapheme clusters and guidelines on their use.
   * <p>
   * Title boundary analysis locates all positions,
@@ -104,8 +116,12 @@ typedef enum UBreakIteratorType {
     * @deprecated ICU 2.8 Use the word break iterator for titlecasing for Unicode 4 and later.
     */
    UBRK_TITLE = 4,
-#endif /* U_HIDE_DEPRECATED_API */
-  UBRK_COUNT = 5
+    /**
+     * One more than the highest normal UBreakIteratorType value.
+     * @deprecated ICU 58 The numeric value may change over time, see ICU ticket #12420.
+     */
+    UBRK_COUNT = 5
+#endif  // U_HIDE_DEPRECATED_API
  } UBreakIteratorType;
  
  /** Value indicating all text boundaries have been returned.
@@ -120,7 +136,10 @@ typedef enum UBreakIteratorType {
   *  word, to allow for further subdivisions of a category in future releases.
   *  Applications should check for tag values falling within the range, rather
   *  than for single individual values.
- *  @stable ICU 2.2
+ *
+ * The numeric values of all of these constants are stable (will not change).
+ *
+ * @stable ICU 2.2
  */
  typedef enum UWordBreak {
      /** Tag value for "words" that do not fit into any of other categories.
@@ -153,7 +172,10 @@ typedef enum UWordBreak {
   *  word, to allow for further subdivisions of a category in future releases.
   *  Applications should check for tag values falling within the range, rather
   *  than for single individual values.
- *  @stable ICU 2.8
+ *
+ * The numeric values of all of these constants are stable (will not change).
+ *
+ * @stable ICU 2.8
  */
  typedef enum ULineBreakTag {
      /** Tag value for soft line breaks, positions at which a line break
@@ -175,7 +197,10 @@ typedef enum ULineBreakTag {
   *  sentence, to allow for further subdivisions of a category in future releases.
   *  Applications should check for tag values falling within the range, rather
   *  than for single individual values.
- *  @stable ICU 2.8
+ *
+ * The numeric values of all of these constants are stable (will not change).
+ *
+ * @stable ICU 2.8
  */
  typedef enum USentenceBreakTag {
      /** Tag value for sentences ending with a sentence terminator
@@ -196,14 +221,36 @@ typedef enum USentenceBreakTag {
  } USentenceBreakTag;
  
  
+/**
+ *  Masks to control line break word options (per the CSS word-break property).
+ *  NORMAL allows breaks between CJK characters in the middle of words. Other masks
+ *  prohibit breaks between characters of specific scripts (or all scripts) except as
+ *  determined by a dictionary, or by spaces or other mechanisms (Western-style breaking).
+ *
+ * @see ubrk_setLineWordOpts
+ * @internal Apple only
+*/
+typedef enum ULineWordOptions {
+    /** Allow breaks between characters of all CJK scripts */
+    UBRK_LINEWORD_NORMAL      = 0,
+    /** Prevent breaks between Hangul characters, except as determined by a dictionary. */
+    UBRK_LINEWORD_KEEP_HANGUL = 1,
+    /** Prevent breaks between characters of any script, except as determined by a dictionary. */
+    UBRK_LINEWORD_KEEP_ALL    = 0x7F
+} ULineWordOptions;
+
+
+
  /**
   * Open a new UBreakIterator for locating text boundaries for a specified locale.
   * A UBreakIterator may be used for detecting character, line, word,
   * and sentence breaks in text.
   * @param type The type of UBreakIterator to open: one of UBRK_CHARACTER, UBRK_WORD,
   * UBRK_LINE, UBRK_SENTENCE
- * @param locale The locale specifying the text-breaking conventions.
- * @param text The text to be iterated over.
+ * @param locale The locale specifying the text-breaking conventions. Note that
+ * locale keys such as "lb" and "ss" may be used to modify text break behavior,
+ * see general discussion of BreakIterator C API.
+ * @param text The text to be iterated over. May be null, in which case ubrk_setText() is
+ *        used to specify the text to be iterated.
   * @param textLength The number of characters in text, or -1 if null-terminated.
   * @param status A UErrorCode to receive any errors.
   * @return A UBreakIterator for the specified locale.
@@ -240,13 +287,40 @@ ubrk_openRules(const UChar     *rules,
                 UParseError     *parseErr,
                 UErrorCode      *status);
  
+/**
+ * Open a new UBreakIterator for locating text boundaries using precompiled binary rules.
+ * Opening a UBreakIterator this way is substantially faster than using ubrk_openRules.
+ * Binary rules may be obtained using ubrk_getBinaryRules. The compiled rules are not
+ * compatible across different major versions of ICU, nor across platforms of different
+ * endianness or different base character set family (ASCII vs EBCDIC).
+ * @param binaryRules A set of compiled binary rules specifying the text breaking
+ *                    conventions. Ownership of the storage containing the compiled
+ *                    rules remains with the caller of this function. The compiled
+ *                    rules must not be modified or deleted during the life of the
+ *                    break iterator.
+ * @param rulesLength The length of binaryRules in bytes; must be >= 0.
+ * @param text        The text to be iterated over.  May be null, in which case
+ *                    ubrk_setText() is used to specify the text to be iterated.
+ * @param textLength  The number of characters in text, or -1 if null-terminated.
+ * @param status      Pointer to UErrorCode to receive any errors.
+ * @return            UBreakIterator for the specified rules.
+ * @see ubrk_getBinaryRules
+ * @stable ICU 59
+ */
+U_STABLE UBreakIterator* U_EXPORT2
+ubrk_openBinaryRules(const uint8_t *binaryRules, int32_t rulesLength,
+                     const UChar *  text, int32_t textLength,
+                     UErrorCode *   status);
+
  /**
   * Thread safe cloning operation
   * @param bi iterator to be cloned
- * @param stackBuffer user allocated space for the new clone. If NULL new memory will be allocated.
+ * @param stackBuffer <em>Deprecated functionality as of ICU 52, use NULL.</em><br>
+ *  user allocated space for the new clone. If NULL new memory will be allocated.
   *  If buffer is not large enough, new memory will be allocated.
- *  Clients can use the U_BRK_SAFECLONE_BUFFERSIZE. This will probably be enough to avoid memory allocations.
- * @param pBufferSize pointer to size of allocated space.
+ *  Clients can use the U_BRK_SAFECLONE_BUFFERSIZE.
+ * @param pBufferSize <em>Deprecated functionality as of ICU 52, use NULL or 1.</em><br>
+ *  pointer to size of allocated space.
   *  If *pBufferSize == 0, a sufficient size for use in cloning will
   *  be returned ('pre-flighting')
   *  If *pBufferSize is not enough for a stack-based safe clone,
@@ -263,11 +337,15 @@ ubrk_safeClone(
            int32_t *pBufferSize,
            UErrorCode *status);
  
+#ifndef U_HIDE_DEPRECATED_API
+
  /**
    * A recommended size (in bytes) for the memory buffer to be passed to ubrk_safeClone().
-  * @stable ICU 2.0
+  * @deprecated ICU 52. Do not rely on ubrk_safeClone() cloning into any provided buffer.
    */
-#define U_BRK_SAFECLONE_BUFFERSIZE 528
+#define U_BRK_SAFECLONE_BUFFERSIZE 1
+
+#endif /* U_HIDE_DEPRECATED_API */
  
  /**
  * Close a UBreakIterator.
@@ -297,8 +375,26 @@ U_NAMESPACE_END
  
  #endif
  
+#ifndef U_HIDE_INTERNAL_API
  /**
- * Sets an existing iterator to point to a new piece of text
+ * Set the ULineWordOptions for the specified break iterator.
+ *
+ * @param bi The iterator to use
+ * @param lineWordOpts The ULineWordOptions to set.
+ * @internal Apple only
+ */
+U_INTERNAL void U_EXPORT2
+ubrk_setLineWordOpts(UBreakIterator* bi,
+                     ULineWordOptions lineWordOpts);
+
+#endif  /* U_HIDE_INTERNAL_API */
+
+/**
+ * Sets an existing iterator to point to a new piece of text.
+ * The break iterator retains a pointer to the supplied text.
+ * The caller must not modify or delete the text while the BreakIterator
+ * retains the reference.
+ *
   * @param bi The iterator to use
   * @param text The text to be set
   * @param textLength The length of the text
@@ -372,10 +468,9 @@ U_STABLE int32_t U_EXPORT2
  ubrk_previous(UBreakIterator *bi);
  
  /**
- * Set the iterator position to the index of the first character in the text being scanned.
- * This is not always the same as index 0 of the text.
+ * Set the iterator position to zero, the start of the text being scanned.
   * @param bi The break iterator to use.
- * @return The character index of the first character in the text being scanned.
+ * @return The new iterator position (zero).
   * @see ubrk_last
   * @stable ICU 2.0
   */
@@ -445,7 +540,7 @@ ubrk_countAvailable(void);
  
  
  /**
-* Returns true if the specfied position is a boundary position.  As a side
+* Returns true if the specified position is a boundary position.  As a side
  * effect, leaves the iterator pointing to the first boundary position at
  * or after "offset".
  * @param bi The break iterator to use.
@@ -479,7 +574,7 @@ ubrk_getRuleStatus(UBreakIterator *bi);
   * @param fillInVec an array to be filled in with the status values.
   * @param capacity  the length of the supplied vector.  A length of zero causes
   *                  the function to return the number of status values, in the
- *                  normal way, without attemtping to store any values.
+ *                  normal way, without attempting to store any values.
   * @param status    receives error codes.
   * @return          The number of rule status values from rules that determined
   *                  the most recent boundary returned by the break iterator.
@@ -530,6 +625,37 @@ ubrk_refreshUText(UBreakIterator *bi,
                         UText          *text,
                         UErrorCode     *status);
  
+
+/**
+ * Get a compiled binary version of the rules specifying the behavior of a UBreakIterator.
+ * The binary rules may be used with ubrk_openBinaryRules to open a new UBreakIterator
+ * more quickly than using ubrk_openRules. The compiled rules are not compatible across
+ * different major versions of ICU, nor across platforms of different endianness or
+ * different base character set family (ASCII vs EBCDIC). Supports preflighting (with
+ * binaryRules=NULL and rulesCapacity=0) to get the rules length without copying them to
+ * the binaryRules buffer. However, whether preflighting or not, if the actual length
+ * is greater than INT32_MAX, then the function returns 0 and sets *status to
+ * U_INDEX_OUTOFBOUNDS_ERROR.
+ *
+ * @param bi            The break iterator to use.
+ * @param binaryRules   Buffer to receive the compiled binary rules; set to NULL for
+ *                      preflighting.
+ * @param rulesCapacity Capacity (in bytes) of the binaryRules buffer; set to 0 for
+ *                      preflighting. Must be >= 0.
+ * @param status        Pointer to UErrorCode to receive any errors, such as
+ *                      U_BUFFER_OVERFLOW_ERROR, U_INDEX_OUTOFBOUNDS_ERROR, or
+ *                      U_ILLEGAL_ARGUMENT_ERROR.
+ * @return              The actual byte length of the binary rules, if <= INT32_MAX;
+ *                      otherwise 0. If not preflighting and this is larger than
+ *                      rulesCapacity, *status will be set to an error.
+ * @see ubrk_openBinaryRules
+ * @stable ICU 59
+ */
+U_STABLE int32_t U_EXPORT2
+ubrk_getBinaryRules(UBreakIterator *bi,
+                    uint8_t *       binaryRules, int32_t rulesCapacity,
+                    UErrorCode *    status);
+
  #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
  
  #endif