ICU-551.24.tar.gz

[apple/icu.git] / icuSources / common / unicode / ustring.h
diff --git a/icuSources/common/unicode/ustring.h b/icuSources/common/unicode/ustring.h

index 12411ef6d996c6551016ee669d6fd96d5c69c1ab..6d141e8df6338dc8e4ed0c9f44b76bb3ac51a057 100644 (file)
--- a/icuSources/common/unicode/ustring.h
+++ b/icuSources/common/unicode/ustring.h
@@ -1,6 +1,6 @@
  /*
  **********************************************************************
-*   Copyright (C) 1998-2008, International Business Machines
+*   Copyright (C) 1998-2014, International Business Machines
  *   Corporation and others.  All Rights Reserved.
  **********************************************************************
  *
@@ -20,10 +20,15 @@
  #include "unicode/putil.h"
  #include "unicode/uiter.h"
  
-/** Simple declaration for u_strToTitle() to avoid including unicode/ubrk.h. @stable ICU 2.1*/
+/**
+ * \def UBRK_TYPEDEF_UBREAK_ITERATOR
+ * @internal 
+ */
+
  #ifndef UBRK_TYPEDEF_UBREAK_ITERATOR
  #   define UBRK_TYPEDEF_UBREAK_ITERATOR
-    typedef void UBreakIterator;
+/** Simple declaration for u_strToTitle() to avoid including unicode/ubrk.h. @stable ICU 2.1*/
+    typedef struct UBreakIterator UBreakIterator;
  #endif
  
  /**
@@ -146,8 +151,8 @@ u_strcat(UChar     *dst,
   * If <code>n&lt;=0</code> then dst is not modified.
   *
   * @param dst The destination string.
- * @param src The source string.
- * @param n The maximum number of characters to compare.
+ * @param src The source string (can be NULL/invalid if n<=0).
+ * @param n The maximum number of characters to append; no-op if <=0.
   * @return A pointer to <code>dst</code>.
   * @stable ICU 2.0
   */
@@ -550,9 +555,9 @@ u_strCaseCompare(const UChar *s1, int32_t length1,
   * Compare two ustrings for bitwise equality. 
   * Compares at most <code>n</code> characters.
   *
- * @param ucs1 A string to compare.
- * @param ucs2 A string to compare.
- * @param n The maximum number of characters to compare.
+ * @param ucs1 A string to compare (can be NULL/invalid if n<=0).
+ * @param ucs2 A string to compare (can be NULL/invalid if n<=0).
+ * @param n The maximum number of characters to compare; always returns 0 if n<=0.
   * @return 0 if <code>s1</code> and <code>s2</code> are bitwise equal; a negative
   * value if <code>s1</code> is bitwise less than <code>s2</code>; a positive
   * value if <code>s1</code> is bitwise greater than <code>s2</code>.
@@ -667,8 +672,8 @@ u_strcpy(UChar     *dst,
   * if the length of <code>src</code> is less than <code>n</code>.
   *
   * @param dst The destination string.
- * @param src The source string.
- * @param n The maximum number of characters to copy.
+ * @param src The source string (can be NULL/invalid if n<=0).
+ * @param n The maximum number of characters to copy; no-op if <=0.
   * @return A pointer to <code>dst</code>.
   * @stable ICU 2.0
   */
@@ -742,8 +747,8 @@ U_STABLE char* U_EXPORT2 u_austrncpy(char *dst,
  /**
   * Synonym for memcpy(), but with UChars only.
   * @param dest The destination string
- * @param src The source string
- * @param count The number of characters to copy
+ * @param src The source string (can be NULL/invalid if count<=0)
+ * @param count The number of characters to copy; no-op if <=0
   * @return A pointer to <code>dest</code>
   * @stable ICU 2.0
   */
@@ -753,8 +758,8 @@ u_memcpy(UChar *dest, const UChar *src, int32_t count);
  /**
   * Synonym for memmove(), but with UChars only.
   * @param dest The destination string
- * @param src The source string
- * @param count The number of characters to move
+ * @param src The source string (can be NULL/invalid if count<=0)
+ * @param count The number of characters to move; no-op if <=0
   * @return A pointer to <code>dest</code>
   * @stable ICU 2.0
   */
@@ -917,10 +922,24 @@ u_memrchr32(const UChar *s, UChar32 c, int32_t count);
   *        return u_strcmp(ustringVar1, ustringVar2);
   *    }
   * </pre>
+ * 
+ * Note that the macros will NOT consistently work if their argument is another <code>#define</code>.
+ * The following will not work on all platforms; do not use it.
+ * 
+ * <pre>
+ *     #define GLUCK "Mr. Gluck"
+ *     U_STRING_DECL(var, GLUCK, 9)
+ *     U_STRING_INIT(var, GLUCK, 9)
+ * </pre>
+ * 
+ * Instead, use the string literal "Mr. Gluck" as the argument to both macro
+ * calls.
+ *
+ *
   * @stable ICU 2.0
   */
  #if defined(U_DECLARE_UTF16)
-#   define U_STRING_DECL(var, cs, length) static const UChar var[(length)+1]=U_DECLARE_UTF16(cs)
+#   define U_STRING_DECL(var, cs, length) static const UChar *var=(const UChar *)U_DECLARE_UTF16(cs)
      /**@stable ICU 2.0 */
  #   define U_STRING_INIT(var, cs, length)
  #elif U_SIZEOF_WCHAR_T==U_SIZEOF_UCHAR && (U_CHARSET_FAMILY==U_ASCII_FAMILY || (U_SIZEOF_UCHAR == 2 && defined(U_WCHAR_IS_UTF16)))
@@ -953,7 +972,7 @@ u_memrchr32(const UChar *s, UChar32 c, int32_t count);
   *
   * \\a => U+0007, \\b => U+0008, \\t => U+0009, \\n => U+000A,
   * \\v => U+000B, \\f => U+000C, \\r => U+000D, \\e => U+001B,
- * \\" => U+0022, \\' => U+0027, \\? => U+003F, \\\\ => U+005C
+ * \\&quot; => U+0022, \\' => U+0027, \\? => U+003F, \\\\ => U+005C
   *
   * Anything else following a backslash is generically escaped.  For
   * example, "[a\\-z]" returns "[a-z]".
@@ -1140,10 +1159,12 @@ u_strToTitle(UChar *dest, int32_t destCapacity,
  #endif
  
  /**
- * Case-fold the characters in a string.
+ * Case-folds the characters in a string.
+ *
   * Case-folding is locale-independent and not context-sensitive,
   * but there is an option for whether to include or exclude mappings for dotted I
- * and dotless i that are marked with 'I' in CaseFolding.txt.
+ * and dotless i that are marked with 'T' in CaseFolding.txt.
+ *
   * The result may be longer or shorter than the original.
   * The source string and the destination buffer are allowed to overlap.
   *
@@ -1169,7 +1190,10 @@ u_strFoldCase(UChar *dest, int32_t destCapacity,
  
  #if defined(U_WCHAR_IS_UTF16) || defined(U_WCHAR_IS_UTF32) || !UCONFIG_NO_CONVERSION
  /**
- * Converts a sequence of UChars to wchar_t units.
+ * Convert a UTF-16 string to a wchar_t string.
+ * If it is known at compile time that wchar_t strings are in UTF-16 or UTF-32, then
+ * this function simply calls the fast, dedicated function for that encoding.
+ * Otherwise, two conversions UTF-16 -> default charset -> wchar_t* are performed.
   *
   * @param dest          A buffer for the result string. The result will be zero-terminated if
   *                      the buffer is large enough.
@@ -1195,7 +1219,10 @@ u_strToWCS(wchar_t *dest,
             int32_t srcLength,
             UErrorCode *pErrorCode);
  /**
- * Converts a sequence of wchar_t units to UChars
+ * Convert a wchar_t string to UTF-16.
+ * If it is known at compile time that wchar_t strings are in UTF-16 or UTF-32, then
+ * this function simply calls the fast, dedicated function for that encoding.
+ * Otherwise, two conversions wchar_t* -> default charset -> UTF-16 are performed.
   *
   * @param dest          A buffer for the result string. The result will be zero-terminated if
   *                      the buffer is large enough.
@@ -1223,7 +1250,8 @@ u_strFromWCS(UChar   *dest,
  #endif /* defined(U_WCHAR_IS_UTF16) || defined(U_WCHAR_IS_UTF32) || !UCONFIG_NO_CONVERSION */
  
  /**
- * Converts a sequence of UChars (UTF-16) to UTF-8 bytes
+ * Convert a UTF-16 string to UTF-8.
+ * If the input string is not well-formed, then the U_INVALID_CHAR_FOUND error code is set.
   *
   * @param dest          A buffer for the result string. The result will be zero-terminated if
   *                      the buffer is large enough.
@@ -1252,7 +1280,8 @@ u_strToUTF8(char *dest,
              UErrorCode *pErrorCode);
  
  /**
- * Converts a sequence of UTF-8 bytes to UChars (UTF-16).
+ * Convert a UTF-8 string to UTF-16.
+ * If the input string is not well-formed, then the U_INVALID_CHAR_FOUND error code is set.
   *
   * @param dest          A buffer for the result string. The result will be zero-terminated if
   *                      the buffer is large enough.
@@ -1281,7 +1310,8 @@ u_strFromUTF8(UChar *dest,
                UErrorCode *pErrorCode);
  
  /**
- * Converts a sequence of UChars (UTF-16) to UTF-8 bytes.
+ * Convert a UTF-16 string to UTF-8.
+ *
   * Same as u_strToUTF8() except for the additional subchar which is output for
   * illegal input sequences, instead of stopping with the U_INVALID_CHAR_FOUND error code.
   * With subchar==U_SENTINEL, this function behaves exactly like u_strToUTF8().
@@ -1324,7 +1354,8 @@ u_strToUTF8WithSub(char *dest,
              UErrorCode *pErrorCode);
  
  /**
- * Converts a sequence of UTF-8 bytes to UChars (UTF-16).
+ * Convert a UTF-8 string to UTF-16.
+ *
   * Same as u_strFromUTF8() except for the additional subchar which is output for
   * illegal input sequences, instead of stopping with the U_INVALID_CHAR_FOUND error code.
   * With subchar==U_SENTINEL, this function behaves exactly like u_strFromUTF8().
@@ -1368,7 +1399,8 @@ u_strFromUTF8WithSub(UChar *dest,
                UErrorCode *pErrorCode);
  
  /**
- * Converts a sequence of UTF-8 bytes to UChars (UTF-16).
+ * Convert a UTF-8 string to UTF-16.
+ *
   * Same as u_strFromUTF8() except that this function is designed to be very fast,
   * which it achieves by being lenient about malformed UTF-8 sequences.
   * This function is intended for use in environments where UTF-8 text is
@@ -1387,6 +1419,9 @@ u_strFromUTF8WithSub(UChar *dest,
   * For further performance improvement, if srcLength is given (>=0),
   * then it must be destCapacity>=srcLength.
   *
+ * There is no inverse u_strToUTF8Lenient() function because there is practically
+ * no performance gain from not checking that a UTF-16 string is well-formed.
+ *
   * @param dest          A buffer for the result string. The result will be zero-terminated if
   *                      the buffer is large enough.
   * @param destCapacity  The size of the buffer (number of UChars). If it is 0, then
@@ -1423,7 +1458,8 @@ u_strFromUTF8Lenient(UChar *dest,
                       UErrorCode *pErrorCode);
  
  /**
- * Converts a sequence of UChars (UTF-16) to UTF32 units.
+ * Convert a UTF-16 string to UTF-32.
+ * If the input string is not well-formed, then the U_INVALID_CHAR_FOUND error code is set.
   *
   * @param dest          A buffer for the result string. The result will be zero-terminated if
   *                      the buffer is large enough.
@@ -1439,6 +1475,8 @@ u_strFromUTF8Lenient(UChar *dest,
   * @param pErrorCode    Must be a valid pointer to an error code value,
   *                      which must not indicate a failure before the function call.
   * @return The pointer to destination buffer.
+ * @see u_strToUTF32WithSub
+ * @see u_strFromUTF32
   * @stable ICU 2.0
   */
  U_STABLE UChar32* U_EXPORT2 
@@ -1450,7 +1488,8 @@ u_strToUTF32(UChar32 *dest,
               UErrorCode *pErrorCode);
  
  /**
- * Converts a sequence of UTF32 units to UChars (UTF-16)
+ * Convert a UTF-32 string to UTF-16.
+ * If the input string is not well-formed, then the U_INVALID_CHAR_FOUND error code is set.
   *
   * @param dest          A buffer for the result string. The result will be zero-terminated if
   *                      the buffer is large enough.
@@ -1466,6 +1505,8 @@ u_strToUTF32(UChar32 *dest,
   * @param pErrorCode    Must be a valid pointer to an error code value,
   *                      which must not indicate a failure before the function call.
   * @return The pointer to destination buffer.
+ * @see u_strFromUTF32WithSub
+ * @see u_strToUTF32
   * @stable ICU 2.0
   */
  U_STABLE UChar* U_EXPORT2 
@@ -1476,4 +1517,184 @@ u_strFromUTF32(UChar   *dest,
                 int32_t srcLength,
                 UErrorCode *pErrorCode);
  
+/**
+ * Convert a UTF-16 string to UTF-32.
+ *
+ * Same as u_strToUTF32() except for the additional subchar which is output for
+ * illegal input sequences, instead of stopping with the U_INVALID_CHAR_FOUND error code.
+ * With subchar==U_SENTINEL, this function behaves exactly like u_strToUTF32().
+ *
+ * @param dest          A buffer for the result string. The result will be zero-terminated if
+ *                      the buffer is large enough.
+ * @param destCapacity  The size of the buffer (number of UChar32s). If it is 0, then
+ *                      dest may be NULL and the function will only return the length of the
+ *                      result without writing any of the result string (pre-flighting).
+ * @param pDestLength   A pointer to receive the number of units written to the destination. If
+ *                      pDestLength!=NULL then *pDestLength is always set to the
+ *                      number of output units corresponding to the transformation of
+ *                      all the input units, even in case of a buffer overflow.
+ * @param src           The original source string
+ * @param srcLength     The length of the original string. If -1, then src must be zero-terminated.
+ * @param subchar       The substitution character to use in place of an illegal input sequence,
+ *                      or U_SENTINEL if the function is to return with U_INVALID_CHAR_FOUND instead.
+ *                      A substitution character can be any valid Unicode code point (up to U+10FFFF)
+ *                      except for surrogate code points (U+D800..U+DFFF).
+ *                      The recommended value is U+FFFD "REPLACEMENT CHARACTER".
+ * @param pNumSubstitutions Output parameter receiving the number of substitutions if subchar>=0.
+ *                      Set to 0 if no substitutions occur or subchar<0.
+ *                      pNumSubstitutions can be NULL.
+ * @param pErrorCode    Pointer to a standard ICU error code. Its input value must
+ *                      pass the U_SUCCESS() test, or else the function returns
+ *                      immediately. Check for U_FAILURE() on output or use with
+ *                      function chaining. (See User Guide for details.)
+ * @return The pointer to destination buffer.
+ * @see u_strToUTF32
+ * @see u_strFromUTF32WithSub
+ * @stable ICU 4.2
+ */
+U_STABLE UChar32* U_EXPORT2
+u_strToUTF32WithSub(UChar32 *dest,
+             int32_t destCapacity,
+             int32_t *pDestLength,
+             const UChar *src,
+             int32_t srcLength,
+             UChar32 subchar, int32_t *pNumSubstitutions,
+             UErrorCode *pErrorCode);
+
+/**
+ * Convert a UTF-32 string to UTF-16.
+ *
+ * Same as u_strFromUTF32() except for the additional subchar which is output for
+ * illegal input sequences, instead of stopping with the U_INVALID_CHAR_FOUND error code.
+ * With subchar==U_SENTINEL, this function behaves exactly like u_strFromUTF32().
+ *
+ * @param dest          A buffer for the result string. The result will be zero-terminated if
+ *                      the buffer is large enough.
+ * @param destCapacity  The size of the buffer (number of UChars). If it is 0, then
+ *                      dest may be NULL and the function will only return the length of the
+ *                      result without writing any of the result string (pre-flighting).
+ * @param pDestLength   A pointer to receive the number of units written to the destination. If
+ *                      pDestLength!=NULL then *pDestLength is always set to the
+ *                      number of output units corresponding to the transformation of
+ *                      all the input units, even in case of a buffer overflow.
+ * @param src           The original source string
+ * @param srcLength     The length of the original string. If -1, then src must be zero-terminated.
+ * @param subchar       The substitution character to use in place of an illegal input sequence,
+ *                      or U_SENTINEL if the function is to return with U_INVALID_CHAR_FOUND instead.
+ *                      A substitution character can be any valid Unicode code point (up to U+10FFFF)
+ *                      except for surrogate code points (U+D800..U+DFFF).
+ *                      The recommended value is U+FFFD "REPLACEMENT CHARACTER".
+ * @param pNumSubstitutions Output parameter receiving the number of substitutions if subchar>=0.
+ *                      Set to 0 if no substitutions occur or subchar<0.
+ *                      pNumSubstitutions can be NULL.
+ * @param pErrorCode    Pointer to a standard ICU error code. Its input value must
+ *                      pass the U_SUCCESS() test, or else the function returns
+ *                      immediately. Check for U_FAILURE() on output or use with
+ *                      function chaining. (See User Guide for details.)
+ * @return The pointer to destination buffer.
+ * @see u_strFromUTF32
+ * @see u_strToUTF32WithSub
+ * @stable ICU 4.2
+ */
+U_STABLE UChar* U_EXPORT2
+u_strFromUTF32WithSub(UChar *dest,
+               int32_t destCapacity,
+               int32_t *pDestLength,
+               const UChar32 *src,
+               int32_t srcLength,
+               UChar32 subchar, int32_t *pNumSubstitutions,
+               UErrorCode *pErrorCode);
+
+/**
+ * Convert a 16-bit Unicode string to Java Modified UTF-8.
+ * See http://java.sun.com/javase/6/docs/api/java/io/DataInput.html#modified-utf-8
+ *
+ * This function behaves according to the documentation for Java DataOutput.writeUTF()
+ * except that it does not encode the output length in the destination buffer
+ * and does not have an output length restriction.
+ * See http://java.sun.com/javase/6/docs/api/java/io/DataOutput.html#writeUTF(java.lang.String)
+ *
+ * The input string need not be well-formed UTF-16.
+ * (Therefore there is no subchar parameter.)
+ *
+ * @param dest          A buffer for the result string. The result will be zero-terminated if
+ *                      the buffer is large enough.
+ * @param destCapacity  The size of the buffer (number of chars). If it is 0, then
+ *                      dest may be NULL and the function will only return the length of the 
+ *                      result without writing any of the result string (pre-flighting).
+ * @param pDestLength   A pointer to receive the number of units written to the destination. If 
+ *                      pDestLength!=NULL then *pDestLength is always set to the 
+ *                      number of output units corresponding to the transformation of 
+ *                      all the input units, even in case of a buffer overflow.
+ * @param src           The original source string
+ * @param srcLength     The length of the original string. If -1, then src must be zero-terminated.
+ * @param pErrorCode    Pointer to a standard ICU error code. Its input value must
+ *                      pass the U_SUCCESS() test, or else the function returns
+ *                      immediately. Check for U_FAILURE() on output or use with
+ *                      function chaining. (See User Guide for details.)
+ * @return The pointer to destination buffer.
+ * @stable ICU 4.4
+ * @see u_strToUTF8WithSub
+ * @see u_strFromJavaModifiedUTF8WithSub
+ */
+U_STABLE char* U_EXPORT2 
+u_strToJavaModifiedUTF8(
+        char *dest,
+        int32_t destCapacity,
+        int32_t *pDestLength,
+        const UChar *src, 
+        int32_t srcLength,
+        UErrorCode *pErrorCode);
+
+/**
+ * Convert a Java Modified UTF-8 string to a 16-bit Unicode string.
+ * If the input string is not well-formed and no substitution char is specified, 
+ * then the U_INVALID_CHAR_FOUND error code is set.
+ *
+ * This function behaves according to the documentation for Java DataInput.readUTF()
+ * except that it takes a length parameter rather than
+ * interpreting the first two input bytes as the length.
+ * See http://java.sun.com/javase/6/docs/api/java/io/DataInput.html#readUTF()
+ *
+ * The output string is not guaranteed to be well-formed UTF-16.
+ *
+ * @param dest          A buffer for the result string. The result will be zero-terminated if
+ *                      the buffer is large enough.
+ * @param destCapacity  The size of the buffer (number of UChars). If it is 0, then
+ *                      dest may be NULL and the function will only return the length of the 
+ *                      result without writing any of the result string (pre-flighting).
+ * @param pDestLength   A pointer to receive the number of units written to the destination. If 
+ *                      pDestLength!=NULL then *pDestLength is always set to the 
+ *                      number of output units corresponding to the transformation of 
+ *                      all the input units, even in case of a buffer overflow.
+ * @param src           The original source string
+ * @param srcLength     The length of the original string. If -1, then src must be zero-terminated.
+ * @param subchar       The substitution character to use in place of an illegal input sequence,
+ *                      or U_SENTINEL if the function is to return with U_INVALID_CHAR_FOUND instead.
+ *                      A substitution character can be any valid Unicode code point (up to U+10FFFF)
+ *                      except for surrogate code points (U+D800..U+DFFF).
+ *                      The recommended value is U+FFFD "REPLACEMENT CHARACTER".
+ * @param pNumSubstitutions Output parameter receiving the number of substitutions if subchar>=0.
+ *                      Set to 0 if no substitutions occur or subchar<0.
+ *                      pNumSubstitutions can be NULL.
+ * @param pErrorCode    Pointer to a standard ICU error code. Its input value must
+ *                      pass the U_SUCCESS() test, or else the function returns
+ *                      immediately. Check for U_FAILURE() on output or use with
+ *                      function chaining. (See User Guide for details.)
+ * @return The pointer to destination buffer.
+ * @see u_strFromUTF8WithSub
+ * @see u_strFromUTF8Lenient
+ * @see u_strToJavaModifiedUTF8
+ * @stable ICU 4.4
+ */
+U_STABLE UChar* U_EXPORT2
+u_strFromJavaModifiedUTF8WithSub(
+        UChar *dest,
+        int32_t destCapacity,
+        int32_t *pDestLength,
+        const char *src,
+        int32_t srcLength,
+        UChar32 subchar, int32_t *pNumSubstitutions,
+        UErrorCode *pErrorCode);
+
  #endif