X-Git-Url: https://git.saurik.com/apple/icu.git/blobdiff_plain/73c04bcfe1096173b00431f0cdc742894b15eef0..ba6d6ed23dec08b1cd5700a128c0752491c10ac9:/icuSources/common/unicode/ustring.h diff --git a/icuSources/common/unicode/ustring.h b/icuSources/common/unicode/ustring.h index 4777c269..388bc74a 100644 --- a/icuSources/common/unicode/ustring.h +++ b/icuSources/common/unicode/ustring.h @@ -1,6 +1,6 @@ /* ********************************************************************** -* Copyright (C) 1998-2006, International Business Machines +* Copyright (C) 1998-2010, International Business Machines * Corporation and others. All Rights Reserved. ********************************************************************** * @@ -23,7 +23,7 @@ /** Simple declaration for u_strToTitle() to avoid including unicode/ubrk.h. @stable ICU 2.1*/ #ifndef UBRK_TYPEDEF_UBREAK_ITERATOR # define UBRK_TYPEDEF_UBREAK_ITERATOR - typedef void UBreakIterator; + typedef struct UBreakIterator UBreakIterator; #endif /** @@ -65,13 +65,14 @@ * their occurrence is rare. Almost all characters in modern use require only * a single UChar code unit (i.e., their code point values are <=0xffff). * - * For more details see the User Guide Strings chapter (http://icu.sourceforge.net/userguide/strings.html). + * For more details see the User Guide Strings chapter (http://icu-project.org/userguide/strings.html). * For a discussion of the handling of unpaired surrogates see also * Jitterbug 2145 and its icu mailing list proposal on 2002-sep-18. */ /** -* \defgroup ustring_ustrlen + * \defgroup ustring_ustrlen String Length + * \ingroup ustring_strlen */ /*@{*/ /** @@ -146,7 +147,7 @@ u_strcat(UChar *dst, * * @param dst The destination string. * @param src The source string. - * @param n The maximum number of characters to compare. + * @param n The maximum number of characters to append. * @return A pointer to dst. * @stable ICU 2.0 */ @@ -916,14 +917,32 @@ u_memrchr32(const UChar *s, UChar32 c, int32_t count); * return u_strcmp(ustringVar1, ustringVar2); * } * + * + * Note that the macros will NOT consistently work if their argument is another #define. + * The following will not work on all platforms, don't use it. + * + *
+ *     #define GLUCK "Mr. Gluck"
+ *     U_STRING_DECL(var, GLUCK, 9)
+ *     U_STRING_INIT(var, GLUCK, 9)
+ * 
+ * + * Instead, use the string literal "Mr. Gluck" as the argument to both macro + * calls. + * + * * @stable ICU 2.0 */ -#if U_SIZEOF_WCHAR_T==U_SIZEOF_UCHAR && (U_CHARSET_FAMILY==U_ASCII_FAMILY || (U_SIZEOF_UCHAR == 2 && defined(U_WCHAR_IS_UTF16))) -# define U_STRING_DECL(var, cs, length) static const wchar_t var[(length)+1]={ L ## cs } +#if defined(U_DECLARE_UTF16) +# define U_STRING_DECL(var, cs, length) static const UChar var[(length)+1]=U_DECLARE_UTF16(cs) + /**@stable ICU 2.0 */ +# define U_STRING_INIT(var, cs, length) +#elif U_SIZEOF_WCHAR_T==U_SIZEOF_UCHAR && (U_CHARSET_FAMILY==U_ASCII_FAMILY || (U_SIZEOF_UCHAR == 2 && defined(U_WCHAR_IS_UTF16))) +# define U_STRING_DECL(var, cs, length) static const UChar var[(length)+1]=L ## cs /**@stable ICU 2.0 */ # define U_STRING_INIT(var, cs, length) #elif U_SIZEOF_UCHAR==1 && U_CHARSET_FAMILY==U_ASCII_FAMILY -# define U_STRING_DECL(var, cs, length) static const UChar var[(length)+1]={ (const UChar *)cs } +# define U_STRING_DECL(var, cs, length) static const UChar var[(length)+1]=cs /**@stable ICU 2.0 */ # define U_STRING_INIT(var, cs, length) #else @@ -948,7 +967,7 @@ u_memrchr32(const UChar *s, UChar32 c, int32_t count); * * \\a => U+0007, \\b => U+0008, \\t => U+0009, \\n => U+000A, * \\v => U+000B, \\f => U+000C, \\r => U+000D, \\e => U+001B, - * \\" => U+0022, \\' => U+0027, \\? => U+003F, \\\\ => U+005C + * \\" => U+0022, \\' => U+0027, \\? => U+003F, \\\\ => U+005C * * Anything else following a backslash is generically escaped. For * example, "[a\\-z]" returns "[a-z]". @@ -1101,7 +1120,7 @@ u_strToLower(UChar *dest, int32_t destCapacity, * The standard titlecase iterator for the root locale implements the * algorithm of Unicode TR 21. * - * This function uses only the first() and next() methods of the + * This function uses only the setText(), first() and next() methods of the * provided break iterator. * * The result may be longer or shorter than the original. @@ -1164,7 +1183,10 @@ u_strFoldCase(UChar *dest, int32_t destCapacity, #if defined(U_WCHAR_IS_UTF16) || defined(U_WCHAR_IS_UTF32) || !UCONFIG_NO_CONVERSION /** - * Converts a sequence of UChars to wchar_t units. + * Convert a UTF-16 string to a wchar_t string. + * If it is known at compile time that wchar_t strings are in UTF-16 or UTF-32, then + * this function simply calls the fast, dedicated function for that. + * Otherwise, two conversions UTF-16 -> default charset -> wchar_t* are performed. * * @param dest A buffer for the result string. The result will be zero-terminated if * the buffer is large enough. @@ -1190,7 +1212,10 @@ u_strToWCS(wchar_t *dest, int32_t srcLength, UErrorCode *pErrorCode); /** - * Converts a sequence of wchar_t units to UChars + * Convert a wchar_t string to UTF-16. + * If it is known at compile time that wchar_t strings are in UTF-16 or UTF-32, then + * this function simply calls the fast, dedicated function for that. + * Otherwise, two conversions wchar_t* -> default charset -> UTF-16 are performed. * * @param dest A buffer for the result string. The result will be zero-terminated if * the buffer is large enough. @@ -1218,7 +1243,8 @@ u_strFromWCS(UChar *dest, #endif /* defined(U_WCHAR_IS_UTF16) || defined(U_WCHAR_IS_UTF32) || !UCONFIG_NO_CONVERSION */ /** - * Converts a sequence of UChars (UTF-16) to UTF-8 bytes + * Convert a UTF-16 string to UTF-8. + * If the input string is not well-formed, then the U_INVALID_CHAR_FOUND error code is set. * * @param dest A buffer for the result string. The result will be zero-terminated if * the buffer is large enough. @@ -1247,7 +1273,8 @@ u_strToUTF8(char *dest, UErrorCode *pErrorCode); /** - * Converts a sequence of UTF-8 bytes to UChars (UTF-16). + * Convert a UTF-8 string to UTF-16. + * If the input string is not well-formed, then the U_INVALID_CHAR_FOUND error code is set. * * @param dest A buffer for the result string. The result will be zero-terminated if * the buffer is large enough. @@ -1276,7 +1303,9 @@ u_strFromUTF8(UChar *dest, UErrorCode *pErrorCode); /** - * Converts a sequence of UChars (UTF-16) to UTF-8 bytes. + * Convert a UTF-16 string to UTF-8. + * If the input string is not well-formed, then the U_INVALID_CHAR_FOUND error code is set. + * * Same as u_strToUTF8() except for the additional subchar which is output for * illegal input sequences, instead of stopping with the U_INVALID_CHAR_FOUND error code. * With subchar==U_SENTINEL, this function behaves exactly like u_strToUTF8(). @@ -1307,9 +1336,9 @@ u_strFromUTF8(UChar *dest, * @return The pointer to destination buffer. * @see u_strToUTF8 * @see u_strFromUTF8WithSub - * @draft ICU 3.6 + * @stable ICU 3.6 */ -U_DRAFT char* U_EXPORT2 +U_STABLE char* U_EXPORT2 u_strToUTF8WithSub(char *dest, int32_t destCapacity, int32_t *pDestLength, @@ -1319,7 +1348,9 @@ u_strToUTF8WithSub(char *dest, UErrorCode *pErrorCode); /** - * Converts a sequence of UTF-8 bytes to UChars (UTF-16). + * Convert a UTF-8 string to UTF-16. + * If the input string is not well-formed, then the U_INVALID_CHAR_FOUND error code is set. + * * Same as u_strFromUTF8() except for the additional subchar which is output for * illegal input sequences, instead of stopping with the U_INVALID_CHAR_FOUND error code. * With subchar==U_SENTINEL, this function behaves exactly like u_strFromUTF8(). @@ -1351,9 +1382,9 @@ u_strToUTF8WithSub(char *dest, * @see u_strFromUTF8 * @see u_strFromUTF8Lenient * @see u_strToUTF8WithSub - * @draft ICU 3.6 + * @stable ICU 3.6 */ -U_DRAFT UChar* U_EXPORT2 +U_STABLE UChar* U_EXPORT2 u_strFromUTF8WithSub(UChar *dest, int32_t destCapacity, int32_t *pDestLength, @@ -1363,7 +1394,8 @@ u_strFromUTF8WithSub(UChar *dest, UErrorCode *pErrorCode); /** - * Converts a sequence of UTF-8 bytes to UChars (UTF-16). + * Convert a UTF-8 string to UTF-16. + * * Same as u_strFromUTF8() except that this function is designed to be very fast, * which it achieves by being lenient about malformed UTF-8 sequences. * This function is intended for use in environments where UTF-8 text is @@ -1382,6 +1414,9 @@ u_strFromUTF8WithSub(UChar *dest, * For further performance improvement, if srcLength is given (>=0), * then it must be destCapacity>=srcLength. * + * There is no inverse u_strToUTF8Lenient() function because there is practically + * no performance gain from not checking that a UTF-16 string is well-formed. + * * @param dest A buffer for the result string. The result will be zero-terminated if * the buffer is large enough. * @param destCapacity The size of the buffer (number of UChars). If it is 0, then @@ -1407,9 +1442,9 @@ u_strFromUTF8WithSub(UChar *dest, * @see u_strFromUTF8 * @see u_strFromUTF8WithSub * @see u_strToUTF8WithSub - * @draft ICU 3.6 + * @stable ICU 3.6 */ -U_CAPI UChar * U_EXPORT2 +U_STABLE UChar * U_EXPORT2 u_strFromUTF8Lenient(UChar *dest, int32_t destCapacity, int32_t *pDestLength, @@ -1418,7 +1453,8 @@ u_strFromUTF8Lenient(UChar *dest, UErrorCode *pErrorCode); /** - * Converts a sequence of UChars (UTF-16) to UTF32 units. + * Convert a UTF-16 string to UTF-32. + * If the input string is not well-formed, then the U_INVALID_CHAR_FOUND error code is set. * * @param dest A buffer for the result string. The result will be zero-terminated if * the buffer is large enough. @@ -1434,6 +1470,8 @@ u_strFromUTF8Lenient(UChar *dest, * @param pErrorCode Must be a valid pointer to an error code value, * which must not indicate a failure before the function call. * @return The pointer to destination buffer. + * @see u_strToUTF32WithSub + * @see u_strFromUTF32 * @stable ICU 2.0 */ U_STABLE UChar32* U_EXPORT2 @@ -1445,7 +1483,8 @@ u_strToUTF32(UChar32 *dest, UErrorCode *pErrorCode); /** - * Converts a sequence of UTF32 units to UChars (UTF-16) + * Convert a UTF-32 string to UTF-16. + * If the input string is not well-formed, then the U_INVALID_CHAR_FOUND error code is set. * * @param dest A buffer for the result string. The result will be zero-terminated if * the buffer is large enough. @@ -1461,6 +1500,8 @@ u_strToUTF32(UChar32 *dest, * @param pErrorCode Must be a valid pointer to an error code value, * which must not indicate a failure before the function call. * @return The pointer to destination buffer. + * @see u_strFromUTF32WithSub + * @see u_strToUTF32 * @stable ICU 2.0 */ U_STABLE UChar* U_EXPORT2 @@ -1471,4 +1512,185 @@ u_strFromUTF32(UChar *dest, int32_t srcLength, UErrorCode *pErrorCode); +/** + * Convert a UTF-16 string to UTF-32. + * If the input string is not well-formed, then the U_INVALID_CHAR_FOUND error code is set. + * + * Same as u_strToUTF32() except for the additional subchar which is output for + * illegal input sequences, instead of stopping with the U_INVALID_CHAR_FOUND error code. + * With subchar==U_SENTINEL, this function behaves exactly like u_strToUTF32(). + * + * @param dest A buffer for the result string. The result will be zero-terminated if + * the buffer is large enough. + * @param destCapacity The size of the buffer (number of UChar32s). If it is 0, then + * dest may be NULL and the function will only return the length of the + * result without writing any of the result string (pre-flighting). + * @param pDestLength A pointer to receive the number of units written to the destination. If + * pDestLength!=NULL then *pDestLength is always set to the + * number of output units corresponding to the transformation of + * all the input units, even in case of a buffer overflow. + * @param src The original source string + * @param srcLength The length of the original string. If -1, then src must be zero-terminated. + * @param subchar The substitution character to use in place of an illegal input sequence, + * or U_SENTINEL if the function is to return with U_INVALID_CHAR_FOUND instead. + * A substitution character can be any valid Unicode code point (up to U+10FFFF) + * except for surrogate code points (U+D800..U+DFFF). + * The recommended value is U+FFFD "REPLACEMENT CHARACTER". + * @param pNumSubstitutions Output parameter receiving the number of substitutions if subchar>=0. + * Set to 0 if no substitutions occur or subchar<0. + * pNumSubstitutions can be NULL. + * @param pErrorCode Pointer to a standard ICU error code. Its input value must + * pass the U_SUCCESS() test, or else the function returns + * immediately. Check for U_FAILURE() on output or use with + * function chaining. (See User Guide for details.) + * @return The pointer to destination buffer. + * @see u_strToUTF32 + * @see u_strFromUTF32WithSub + * @stable ICU 4.2 + */ +U_STABLE UChar32* U_EXPORT2 +u_strToUTF32WithSub(UChar32 *dest, + int32_t destCapacity, + int32_t *pDestLength, + const UChar *src, + int32_t srcLength, + UChar32 subchar, int32_t *pNumSubstitutions, + UErrorCode *pErrorCode); + +/** + * Convert a UTF-32 string to UTF-16. + * If the input string is not well-formed, then the U_INVALID_CHAR_FOUND error code is set. + * + * Same as u_strFromUTF32() except for the additional subchar which is output for + * illegal input sequences, instead of stopping with the U_INVALID_CHAR_FOUND error code. + * With subchar==U_SENTINEL, this function behaves exactly like u_strFromUTF32(). + * + * @param dest A buffer for the result string. The result will be zero-terminated if + * the buffer is large enough. + * @param destCapacity The size of the buffer (number of UChars). If it is 0, then + * dest may be NULL and the function will only return the length of the + * result without writing any of the result string (pre-flighting). + * @param pDestLength A pointer to receive the number of units written to the destination. If + * pDestLength!=NULL then *pDestLength is always set to the + * number of output units corresponding to the transformation of + * all the input units, even in case of a buffer overflow. + * @param src The original source string + * @param srcLength The length of the original string. If -1, then src must be zero-terminated. + * @param subchar The substitution character to use in place of an illegal input sequence, + * or U_SENTINEL if the function is to return with U_INVALID_CHAR_FOUND instead. + * A substitution character can be any valid Unicode code point (up to U+10FFFF) + * except for surrogate code points (U+D800..U+DFFF). + * The recommended value is U+FFFD "REPLACEMENT CHARACTER". + * @param pNumSubstitutions Output parameter receiving the number of substitutions if subchar>=0. + * Set to 0 if no substitutions occur or subchar<0. + * pNumSubstitutions can be NULL. + * @param pErrorCode Pointer to a standard ICU error code. Its input value must + * pass the U_SUCCESS() test, or else the function returns + * immediately. Check for U_FAILURE() on output or use with + * function chaining. (See User Guide for details.) + * @return The pointer to destination buffer. + * @see u_strFromUTF32 + * @see u_strToUTF32WithSub + * @stable ICU 4.2 + */ +U_STABLE UChar* U_EXPORT2 +u_strFromUTF32WithSub(UChar *dest, + int32_t destCapacity, + int32_t *pDestLength, + const UChar32 *src, + int32_t srcLength, + UChar32 subchar, int32_t *pNumSubstitutions, + UErrorCode *pErrorCode); + +/** + * Convert a 16-bit Unicode string to Java Modified UTF-8. + * See http://java.sun.com/javase/6/docs/api/java/io/DataInput.html#modified-utf-8 + * + * This function behaves according to the documentation for Java DataOutput.writeUTF() + * except that it does not encode the output length in the destination buffer + * and does not have an output length restriction. + * See http://java.sun.com/javase/6/docs/api/java/io/DataOutput.html#writeUTF(java.lang.String) + * + * The input string need not be well-formed UTF-16. + * (Therefore there is no subchar parameter.) + * + * @param dest A buffer for the result string. The result will be zero-terminated if + * the buffer is large enough. + * @param destCapacity The size of the buffer (number of chars). If it is 0, then + * dest may be NULL and the function will only return the length of the + * result without writing any of the result string (pre-flighting). + * @param pDestLength A pointer to receive the number of units written to the destination. If + * pDestLength!=NULL then *pDestLength is always set to the + * number of output units corresponding to the transformation of + * all the input units, even in case of a buffer overflow. + * @param src The original source string + * @param srcLength The length of the original string. If -1, then src must be zero-terminated. + * @param pErrorCode Pointer to a standard ICU error code. Its input value must + * pass the U_SUCCESS() test, or else the function returns + * immediately. Check for U_FAILURE() on output or use with + * function chaining. (See User Guide for details.) + * @return The pointer to destination buffer. + * @stable ICU 4.4 + * @see u_strToUTF8WithSub + * @see u_strFromJavaModifiedUTF8WithSub + */ +U_STABLE char* U_EXPORT2 +u_strToJavaModifiedUTF8( + char *dest, + int32_t destCapacity, + int32_t *pDestLength, + const UChar *src, + int32_t srcLength, + UErrorCode *pErrorCode); + +/** + * Convert a Java Modified UTF-8 string to a 16-bit Unicode string. + * If the input string is not well-formed, then the U_INVALID_CHAR_FOUND error code is set. + * + * This function behaves according to the documentation for Java DataInput.readUTF() + * except that it takes a length parameter rather than + * interpreting the first two input bytes as the length. + * See http://java.sun.com/javase/6/docs/api/java/io/DataInput.html#readUTF() + * + * The output string may not be well-formed UTF-16. + * + * @param dest A buffer for the result string. The result will be zero-terminated if + * the buffer is large enough. + * @param destCapacity The size of the buffer (number of UChars). If it is 0, then + * dest may be NULL and the function will only return the length of the + * result without writing any of the result string (pre-flighting). + * @param pDestLength A pointer to receive the number of units written to the destination. If + * pDestLength!=NULL then *pDestLength is always set to the + * number of output units corresponding to the transformation of + * all the input units, even in case of a buffer overflow. + * @param src The original source string + * @param srcLength The length of the original string. If -1, then src must be zero-terminated. + * @param subchar The substitution character to use in place of an illegal input sequence, + * or U_SENTINEL if the function is to return with U_INVALID_CHAR_FOUND instead. + * A substitution character can be any valid Unicode code point (up to U+10FFFF) + * except for surrogate code points (U+D800..U+DFFF). + * The recommended value is U+FFFD "REPLACEMENT CHARACTER". + * @param pNumSubstitutions Output parameter receiving the number of substitutions if subchar>=0. + * Set to 0 if no substitutions occur or subchar<0. + * pNumSubstitutions can be NULL. + * @param pErrorCode Pointer to a standard ICU error code. Its input value must + * pass the U_SUCCESS() test, or else the function returns + * immediately. Check for U_FAILURE() on output or use with + * function chaining. (See User Guide for details.) + * @return The pointer to destination buffer. + * @see u_strFromUTF8WithSub + * @see u_strFromUTF8Lenient + * @see u_strToJavaModifiedUTF8 + * @stable ICU 4.4 + */ +U_STABLE UChar* U_EXPORT2 +u_strFromJavaModifiedUTF8WithSub( + UChar *dest, + int32_t destCapacity, + int32_t *pDestLength, + const char *src, + int32_t srcLength, + UChar32 subchar, int32_t *pNumSubstitutions, + UErrorCode *pErrorCode); + #endif