X-Git-Url: https://git.saurik.com/apple/icu.git/blobdiff_plain/b75a7d8f3b4adbae880cab104ce2c6a50eee4db2..fd0068a84e9996f225edba706498f6ed413d0673:/icuSources/common/unicode/ucnv.h?ds=sidebyside diff --git a/icuSources/common/unicode/ucnv.h b/icuSources/common/unicode/ucnv.h index 4bb0e6c5..f8fce55b 100644 --- a/icuSources/common/unicode/ucnv.h +++ b/icuSources/common/unicode/ucnv.h @@ -1,6 +1,6 @@ /* ********************************************************************** -* Copyright (C) 1999-2003, International Business Machines +* Copyright (C) 1999-2006, International Business Machines * Corporation and others. All Rights Reserved. ********************************************************************** * ucnv.h: @@ -23,37 +23,30 @@ *

Character Conversion C API

* *

This API is used to convert codepage or character encoded data to and - * from UTF-16. You can open a converter with {@link ucnv_open()}. With that + * from UTF-16. You can open a converter with {@link ucnv_open() }. With that * converter, you can get its properties, set options, convert your data and * close the converter.

* *

Since many software programs recognize different converter names for * different types of converters, there are other functions in this API to - * iterate over the converter aliases. The functions {@link ucnv_getAvailableName()}, - * {@link ucnv_getAlias()} and {@link ucnv_getStandardName()} are some of the + * iterate over the converter aliases. The functions {@link ucnv_getAvailableName() }, + * {@link ucnv_getAlias() } and {@link ucnv_getStandardName() } are some of the * more frequently used alias functions to get this information.

* *

When a converter encounters an illegal, irregular, invalid or unmappable character * its default behavior is to use a substitution character to replace the - * bad byte sequence. This behavior can be changed by using {@link ucnv_getFromUCallBack()} - * or {@link ucnv_getToUCallBack()} on the converter. The header ucnv_err.h defines + * bad byte sequence. This behavior can be changed by using {@link ucnv_setFromUCallBack() } + * or {@link ucnv_setToUCallBack() } on the converter. The header ucnv_err.h defines * many other callback actions that can be used instead of a character substitution.

* *

More information about this API can be found in our - * User's + * User's * Guide.

*/ #ifndef UCNV_H #define UCNV_H -/** Forward declaring the UConverter structure. @stable ICU 2.0 */ -struct UConverter; - -/** @stable ICU 2.0 */ -typedef struct UConverter UConverter; - -#include "unicode/utypes.h" #include "unicode/ucnv_err.h" #include "unicode/uenum.h" @@ -66,13 +59,16 @@ typedef struct UConverter UConverter; * See unicode/uset.h * * @see ucnv_getUnicodeSet - * @draft ICU 2.6 + * @stable ICU 2.6 */ struct USet; +/** @stable ICU 2.6 */ typedef struct USet USet; #endif +#if !UCONFIG_NO_CONVERSION + U_CDECL_BEGIN /** Maximum length of a converter name including the terminating NULL @stable ICU 2.0 */ @@ -155,6 +151,9 @@ typedef enum { * @param codeUnits Points to 'length' bytes of the concerned codepage sequence * @param length Size (in bytes) of the concerned codepage sequence * @param reason Defines the reason the callback was invoked + * @param pErrorCode ICU error code in/out parameter. + * For converter callback functions, set to a conversion error + * before the call, and the callback may reset it to U_ZERO_ERROR. * @see ucnv_setToUCallBack * @see UConverterToUnicodeArgs * @stable ICU 2.0 @@ -165,7 +164,7 @@ typedef void (U_EXPORT2 *UConverterToUCallback) ( const char *codeUnits, int32_t length, UConverterCallbackReason reason, - UErrorCode *); + UErrorCode *pErrorCode); /** * Function pointer for error callback in the unicode to codepage direction. @@ -176,6 +175,9 @@ typedef void (U_EXPORT2 *UConverterToUCallback) ( * @param length Size (in bytes) of the concerned codepage sequence * @param codePoint Single UChar32 (UTF-32) containing the concerend Unicode codepoint. * @param reason Defines the reason the callback was invoked + * @param pErrorCode ICU error code in/out parameter. + * For converter callback functions, set to a conversion error + * before the call, and the callback may reset it to U_ZERO_ERROR. 
* @see ucnv_setFromUCallBack * @stable ICU 2.0 */ @@ -186,7 +188,7 @@ typedef void (U_EXPORT2 *UConverterFromUCallback) ( int32_t length, UChar32 codePoint, UConverterCallbackReason reason, - UErrorCode *); + UErrorCode *pErrorCode); U_CDECL_END @@ -234,7 +236,7 @@ U_CDECL_END * See convrtrs.txt. * * @see ucnv_open - * @draft ICU 2.4 + * @stable ICU 2.4 */ #define UCNV_VERSION_OPTION_STRING ",version=" @@ -246,16 +248,18 @@ U_CDECL_END * See convrtrs.txt. * * @see ucnv_open - * @draft ICU 2.4 + * @stable ICU 2.4 */ #define UCNV_SWAP_LFNL_OPTION_STRING ",swaplfnl" /** - * Do a fuzzy compare of a two converter/alias names. The comparison - * is case-insensitive. It also ignores the characters '-', '_', and - * ' ' (dash, underscore, and space). Thus the strings "UTF-8", - * "utf_8", and "Utf 8" are exactly equivalent. - * + * Do a fuzzy compare of two converter/alias names. + * The comparison is case-insensitive, ignores leading zeroes if they are not + * followed by further digits, and ignores all but letters and digits. + * Thus the strings "UTF-8", "utf_8", "u*T@f08" and "Utf 8" are exactly equivalent. + * See section 1.4, Charset Alias Matching in Unicode Technical Standard #22 + * at http://www.unicode.org/reports/tr22/ + * * @param name1 a converter name or alias, zero-terminated * @param name2 a converter name or alias, zero-terminated * @return 0 if the names match, or a negative value if the name1 @@ -263,16 +267,17 @@ U_CDECL_END * lexically follows name2. * @stable ICU 2.0 */ -U_CAPI int U_EXPORT2 +U_STABLE int U_EXPORT2 ucnv_compareNames(const char *name1, const char *name2); /** - * Creates a UConverter object with the names specified as a C string. + * Creates a UConverter object with the name of a coded character set specified as a C string. * The actual name will be resolved with the alias file * using a case-insensitive string comparison that ignores - * the delimiters '-', '_', and ' ' (dash, underscore, and space). 
- * E.g., the names "UTF8", "utf-8", and "Utf 8" are all equivalent. + * leading zeroes and all non-alphanumeric characters. + * E.g., the names "UTF8", "utf-8", "u*T@f08" and "Utf 8" are all equivalent. + * (See also ucnv_compareNames().) * If NULL is passed for the converter name, it will create one with the * getDefaultName return value. * @@ -291,18 +296,30 @@ ucnv_compareNames(const char *name1, const char *name2); * *

The conversion behavior and names can vary between platforms. ICU may * convert some characters differently from other platforms. Details on this topic - * are in the User's - * Guide.

- * - * @param converterName Name of the uconv table, may have options appended + * are in the User's + * Guide. Aliases starting with a "cp" prefix have no specific meaning + * other than it is an alias starting with the letters "cp". Please do not + * associate any meaning to these aliases.

+ * + * @param converterName Name of the coded character set table. + * This may have options appended to the string. + * IANA alias character set names, IBM CCSIDs starting with "ibm-", + * Windows codepage numbers starting with "windows-" are frequently + * used for this parameter. See ucnv_getAvailableName and + * ucnv_getAlias for a complete list that is available. + * If this parameter is NULL, the default converter will be used. * @param err outgoing error status U_MEMORY_ALLOCATION_ERROR, U_FILE_ACCESS_ERROR * @return the created Unicode converter object, or NULL if an error occurred * @see ucnv_openU * @see ucnv_openCCSID + * @see ucnv_getAvailableName + * @see ucnv_getAlias + * @see ucnv_getDefaultName * @see ucnv_close + * @see ucnv_compareNames * @stable ICU 2.0 */ -U_CAPI UConverter* U_EXPORT2 +U_STABLE UConverter* U_EXPORT2 ucnv_open(const char *converterName, UErrorCode *err); @@ -311,13 +328,16 @@ ucnv_open(const char *converterName, UErrorCode *err); * The name should be limited to the ASCII-7 alphanumerics range. * The actual name will be resolved with the alias file * using a case-insensitive string comparison that ignores - * the delimiters '-', '_', and ' ' (dash, underscore, and space). - * E.g., the names "UTF8", "utf-8", and "Utf 8" are all equivalent. + * leading zeroes and all non-alphanumeric characters. + * E.g., the names "UTF8", "utf-8", "u*T@f08" and "Utf 8" are all equivalent. + * (See also ucnv_compareNames().) * If NULL is passed for the converter name, it will create * one with the ucnv_getDefaultName() return value. * If the alias is ambiguous, then the preferred converter is used * and the status is set to U_AMBIGUOUS_ALIAS_WARNING. - * @param name : name of the uconv table in a zero terminated + * + *

See ucnv_open for the complete details

+ * @param name Name of the UConverter table in a zero terminated * Unicode string * @param err outgoing error status U_MEMORY_ALLOCATION_ERROR, * U_FILE_ACCESS_ERROR @@ -326,10 +346,10 @@ ucnv_open(const char *converterName, UErrorCode *err); * @see ucnv_open * @see ucnv_openCCSID * @see ucnv_close - * @see ucnv_getDefaultName + * @see ucnv_compareNames * @stable ICU 2.0 */ -U_CAPI UConverter* U_EXPORT2 +U_STABLE UConverter* U_EXPORT2 ucnv_openU(const UChar *name, UErrorCode *err); @@ -397,7 +417,7 @@ ucnv_openU(const UChar *name, * @see UConverterPlatform * @stable ICU 2.0 */ -U_CAPI UConverter* U_EXPORT2 +U_STABLE UConverter* U_EXPORT2 ucnv_openCCSID(int32_t codepage, UConverterPlatform platform, UErrorCode * err); @@ -408,12 +428,15 @@ ucnv_openCCSID(int32_t codepage, *

The packageName and converterName must point to an ICU udata object, as defined by * udata_open( packageName, "cnv", converterName, err) or equivalent. * Typically, packageName will refer to a (.dat) file, or to a package registered with - * udata_setAppData().

+ * udata_setAppData(). Using a full file or directory pathname for packageName is deprecated.

* *

The name will NOT be looked up in the alias mechanism, nor will the converter be * stored in the converter cache or the alias table. The only way to open further converters * is to call this function multiple times, or use the ucnv_safeClone() function to clone a * 'master' converter.

+ * + *

A future version of ICU may add alias table lookups and/or caching + * to this function.

* *

Example Use: * cnv = ucnv_openPackage("myapp", "myconverter", &err); @@ -427,35 +450,61 @@ ucnv_openCCSID(int32_t codepage, * @see ucnv_open * @see ucnv_safeClone * @see ucnv_close - * @draft ICU 2.2 + * @stable ICU 2.2 */ -U_CAPI UConverter* U_EXPORT2 +U_STABLE UConverter* U_EXPORT2 ucnv_openPackage(const char *packageName, const char *converterName, UErrorCode *err); /** - * Thread safe cloning operation + * Thread safe converter cloning operation. + * For most efficient operation, pass in a stackBuffer (and a *pBufferSize) + * with at least U_CNV_SAFECLONE_BUFFERSIZE bytes of space. + * If the buffer size is sufficient, then the clone will use the stack buffer; + * otherwise, it will be allocated, and *pBufferSize will indicate + * the actual size. (This should not occur with U_CNV_SAFECLONE_BUFFERSIZE.) + * + * You must ucnv_close() the clone in any case. + * + * If *pBufferSize==0, (regardless of whether stackBuffer==NULL or not) + * then *pBufferSize will be changed to a sufficient size + * for cloning this converter, + * without actually cloning the converter ("pure pre-flighting"). + * + * If *pBufferSize is greater than zero but not large enough for a stack-based + * clone, then the converter is cloned using newly allocated memory + * and *pBufferSize is changed to the necessary size. + * + * If the converter clone fits into the stack buffer but the stack buffer is not + * sufficiently aligned for the clone, then the clone will use an + * adjusted pointer and use an accordingly smaller buffer size. + * * @param cnv converter to be cloned * @param stackBuffer user allocated space for the new clone. If NULL new memory will be allocated. * If buffer is not large enough, new memory will be allocated. * Clients can use the U_CNV_SAFECLONE_BUFFERSIZE. This will probably be enough to avoid memory allocations. - * @param pBufferSize pointer to size of allocated space. 
- * If *pBufferSize == 0, a sufficient size for use in cloning will - * be returned ('pre-flighting') - * If *pBufferSize is not enough for a stack-based safe clone, - * new memory will be allocated. + * @param pBufferSize pointer to size of allocated space. pBufferSize must not be NULL. * @param status to indicate whether the operation went on smoothly or there were errors - * An informational status value, U_SAFECLONE_ALLOCATED_ERROR, is used if any allocations were necessary. + * An informational status value, U_SAFECLONE_ALLOCATED_WARNING, + * is used if any allocations were necessary. + * However, it is better to check if *pBufferSize grew for checking for + * allocations because warning codes can be overridden by subsequent + * function calls. * @return pointer to the new clone * @stable ICU 2.0 */ -U_CAPI UConverter * U_EXPORT2 +U_STABLE UConverter * U_EXPORT2 ucnv_safeClone(const UConverter *cnv, void *stackBuffer, int32_t *pBufferSize, UErrorCode *status); -/** @stable ICU 2.0 */ -#define U_CNV_SAFECLONE_BUFFERSIZE 3072 +/** + * \def U_CNV_SAFECLONE_BUFFERSIZE + * Definition of a buffer size that is designed to be large enough for + * converters to be cloned with ucnv_safeClone(). + * @stable ICU 2.0 + */ +#define U_CNV_SAFECLONE_BUFFERSIZE 1024 /** * Deletes the unicode converter and releases resources associated @@ -468,12 +517,14 @@ ucnv_safeClone(const UConverter *cnv, * @see ucnv_openCCSID * @stable ICU 2.0 */ -U_CAPI void U_EXPORT2 +U_STABLE void U_EXPORT2 ucnv_close(UConverter * converter); /** * Fills in the output parameter, subChars, with the substitution characters * as multiple bytes. + * If ucnv_setSubstString() set a Unicode string because the converter is + * stateful, then subChars will be an empty string. * * @param converter the Unicode converter * @param subChars the subsitution characters @@ -482,10 +533,11 @@ ucnv_close(UConverter * converter); * @param err the outgoing error status code. 
* If the substitution character array is too small, an * U_INDEX_OUTOFBOUNDS_ERROR will be returned. + * @see ucnv_setSubstString * @see ucnv_setSubstChars * @stable ICU 2.0 */ -U_CAPI void U_EXPORT2 +U_STABLE void U_EXPORT2 ucnv_getSubstChars(const UConverter *converter, char *subChars, int8_t *len, @@ -494,21 +546,61 @@ ucnv_getSubstChars(const UConverter *converter, /** * Sets the substitution chars when converting from unicode to a codepage. The * substitution is specified as a string of 1-4 bytes, and may contain - * NULL byte. + * NULL bytes. + * The subChars must represent a single character. The caller needs to know the + * byte sequence of a valid character in the converter's charset. + * For some converters, for example some ISO 2022 variants, only single-byte + * substitution characters may be supported. + * The newer ucnv_setSubstString() function relaxes these limitations. + * * @param converter the Unicode converter * @param subChars the substitution character byte sequence we want set * @param len the number of bytes in subChars * @param err the error status code. U_INDEX_OUTOFBOUNDS_ERROR if * len is bigger than the maximum number of bytes allowed in subchars + * @see ucnv_setSubstString * @see ucnv_getSubstChars * @stable ICU 2.0 */ -U_CAPI void U_EXPORT2 +U_STABLE void U_EXPORT2 ucnv_setSubstChars(UConverter *converter, const char *subChars, int8_t len, UErrorCode *err); +/** + * Set a substitution string for converting from Unicode to a charset. + * The caller need not know the charset byte sequence for each charset. + * + * Unlike ucnv_setSubstChars() which is designed to set a charset byte sequence + * for a single character, this function takes a Unicode string with + * zero, one or more characters, and immediately verifies that the string can be + * converted to the charset. + * If not, or if the result is too long (more than 32 bytes as of ICU 3.6), + * then the function returns with an error accordingly. 
+ * + * Also unlike ucnv_setSubstChars(), this function works for stateful charsets + * by converting on the fly at the point of substitution rather than setting + * a fixed byte sequence. + * + * @param cnv The UConverter object. + * @param s The Unicode string. + * @param length The number of UChars in s, or -1 for a NUL-terminated string. + * @param err Pointer to a standard ICU error code. Its input value must + * pass the U_SUCCESS() test, or else the function returns + * immediately. Check for U_FAILURE() on output or use with + * function chaining. (See User Guide for details.) + * + * @see ucnv_setSubstChars + * @see ucnv_getSubstChars + * @draft ICU 3.6 + */ +U_DRAFT void U_EXPORT2 +ucnv_setSubstString(UConverter *cnv, + const UChar *s, + int32_t length, + UErrorCode *err); + /** * Fills in the output parameter, errBytes, with the error characters from the * last failing conversion. @@ -522,7 +614,7 @@ ucnv_setSubstChars(UConverter *converter, * U_INDEX_OUTOFBOUNDS_ERROR will be returned. * @stable ICU 2.0 */ -U_CAPI void U_EXPORT2 +U_STABLE void U_EXPORT2 ucnv_getInvalidChars(const UConverter *converter, char *errBytes, int8_t *len, @@ -541,7 +633,7 @@ ucnv_getInvalidChars(const UConverter *converter, * U_INDEX_OUTOFBOUNDS_ERROR will be returned. 
* @stable ICU 2.0 */ -U_CAPI void U_EXPORT2 +U_STABLE void U_EXPORT2 ucnv_getInvalidUChars(const UConverter *converter, UChar *errUChars, int8_t *len, @@ -554,7 +646,7 @@ ucnv_getInvalidUChars(const UConverter *converter, * @param converter the Unicode converter * @stable ICU 2.0 */ -U_CAPI void U_EXPORT2 +U_STABLE void U_EXPORT2 ucnv_reset(UConverter *converter); /** @@ -565,7 +657,7 @@ ucnv_reset(UConverter *converter); * @param converter the Unicode converter * @stable ICU 2.0 */ -U_CAPI void U_EXPORT2 +U_STABLE void U_EXPORT2 ucnv_resetToUnicode(UConverter *converter); /** @@ -576,29 +668,91 @@ ucnv_resetToUnicode(UConverter *converter); * @param converter the Unicode converter * @stable ICU 2.0 */ -U_CAPI void U_EXPORT2 +U_STABLE void U_EXPORT2 ucnv_resetFromUnicode(UConverter *converter); /** - * Returns the maximum length of bytes used by a character. This varies - * between 1 and 4 - * @param converter the Unicode converter - * @return the maximum number of bytes allowed by this particular converter + * Returns the maximum number of bytes that are output per UChar in conversion + * from Unicode using this converter. + * The returned number can be used with UCNV_GET_MAX_BYTES_FOR_STRING + * to calculate the size of a target buffer for conversion from Unicode. + * + * Note: Before ICU 2.8, this function did not return reliable numbers for + * some stateful converters (EBCDIC_STATEFUL, ISO-2022) and LMBCS. + * + * This number may not be the same as the maximum number of bytes per + * "conversion unit". In other words, it may not be the intuitively expected + * number of bytes per character that would be published for a charset, + * and may not fulfill any other purpose than the allocation of an output + * buffer of guaranteed sufficient size for a given input length and converter. + * + * Examples for special cases that are taken into account: + * - Supplementary code points may convert to more bytes than BMP code points. 
+ * This function returns bytes per UChar (UTF-16 code unit), not per + * Unicode code point, for efficient buffer allocation. + * - State-shifting output (SI/SO, escapes, etc.) from stateful converters. + * - When m input UChars are converted to n output bytes, then the maximum m/n + * is taken into account. + * + * The number returned here does not take into account + * (see UCNV_GET_MAX_BYTES_FOR_STRING): + * - callbacks which output more than one charset character sequence per call, + * like escape callbacks + * - initial and final non-character bytes that are output by some converters + * (automatic BOMs, initial escape sequence, final SI, etc.) + * + * Examples for returned values: + * - SBCS charsets: 1 + * - Shift-JIS: 2 + * - UTF-16: 2 (2 per BMP, 4 per surrogate _pair_, BOM not counted) + * - UTF-8: 3 (3 per BMP, 4 per surrogate _pair_) + * - EBCDIC_STATEFUL (EBCDIC mixed SBCS/DBCS): 3 (SO + DBCS) + * - ISO-2022: 3 (always outputs UTF-8) + * - ISO-2022-JP: 6 (4-byte escape sequences + DBCS) + * - ISO-2022-CN: 8 (4-byte designator sequences + 2-byte SS2/SS3 + DBCS) + * + * @param converter The Unicode converter. + * @return The maximum number of bytes per UChar that are output by ucnv_fromUnicode(), + * to be used together with UCNV_GET_MAX_BYTES_FOR_STRING for buffer allocation. + * + * @see UCNV_GET_MAX_BYTES_FOR_STRING * @see ucnv_getMinCharSize * @stable ICU 2.0 */ -U_CAPI int8_t U_EXPORT2 +U_STABLE int8_t U_EXPORT2 ucnv_getMaxCharSize(const UConverter *converter); +/** + * Calculates the size of a buffer for conversion from Unicode to a charset. + * The calculated size is guaranteed to be sufficient for this conversion. + * + * It takes into account initial and final non-character bytes that are output + * by some converters. + * It does not take into account callbacks which output more than one charset + * character sequence per call, like escape callbacks. + * The default (substitution) callback only outputs one charset character sequence. 
+ * + * @param length Number of UChars to be converted. + * @param maxCharSize Return value from ucnv_getMaxCharSize() for the converter + * that will be used. + * @return Size of a buffer that will be large enough to hold the output bytes of + * converting length UChars with the converter that returned the maxCharSize. + * + * @see ucnv_getMaxCharSize + * @stable ICU 2.8 + */ +#define UCNV_GET_MAX_BYTES_FOR_STRING(length, maxCharSize) \ + (((int32_t)(length)+10)*(int32_t)(maxCharSize)) + /** * Returns the minimum byte length for characters in this codepage. - * This is either 1 or 2 for all supported codepages. + * This is usually either 1 or 2. * @param converter the Unicode converter * @return the minimum number of bytes allowed by this particular converter * @see ucnv_getMaxCharSize * @stable ICU 2.0 */ -U_CAPI int8_t U_EXPORT2 +U_STABLE int8_t U_EXPORT2 ucnv_getMinCharSize(const UConverter *converter); /** @@ -615,7 +769,7 @@ ucnv_getMinCharSize(const UConverter *converter); * @see ucnv_getName * @stable ICU 2.0 */ -U_CAPI int32_t U_EXPORT2 +U_STABLE int32_t U_EXPORT2 ucnv_getDisplayName(const UConverter *converter, const char *displayLocale, UChar *displayName, @@ -632,7 +786,7 @@ ucnv_getDisplayName(const UConverter *converter, * @see ucnv_getDisplayName * @stable ICU 2.0 */ -U_CAPI const char * U_EXPORT2 +U_STABLE const char * U_EXPORT2 ucnv_getName(const UConverter *converter, UErrorCode *err); /** @@ -658,7 +812,7 @@ ucnv_getName(const UConverter *converter, UErrorCode *err); * @see ucnv_getPlatform * @stable ICU 2.0 */ -U_CAPI int32_t U_EXPORT2 +U_STABLE int32_t U_EXPORT2 ucnv_getCCSID(const UConverter *converter, UErrorCode *err); @@ -672,7 +826,7 @@ ucnv_getCCSID(const UConverter *converter, * @return The codepage platform * @stable ICU 2.0 */ -U_CAPI UConverterPlatform U_EXPORT2 +U_STABLE UConverterPlatform U_EXPORT2 ucnv_getPlatform(const UConverter *converter, UErrorCode *err); @@ -684,7 +838,7 @@ ucnv_getPlatform(const UConverter *converter, * 
@return the type of the converter * @stable ICU 2.0 */ -U_CAPI UConverterType U_EXPORT2 +U_STABLE UConverterType U_EXPORT2 ucnv_getType(const UConverter * converter); /** @@ -702,23 +856,25 @@ ucnv_getType(const UConverter * converter); * @see ucnv_getType * @stable ICU 2.0 */ -U_CAPI void U_EXPORT2 +U_STABLE void U_EXPORT2 ucnv_getStarters(const UConverter* converter, UBool starters[256], UErrorCode* err); + /** * Selectors for Unicode sets that can be returned by ucnv_getUnicodeSet(). * @see ucnv_getUnicodeSet - * @draft ICU 2.6 + * @stable ICU 2.6 */ typedef enum UConverterUnicodeSet { - /** Select the set of roundtrippable Unicode code points. @draft ICU 2.6 */ + /** Select the set of roundtrippable Unicode code points. @stable ICU 2.6 */ UCNV_ROUNDTRIP_SET, - /** Number of UConverterUnicodeSet selectors. @draft ICU 2.6 */ + /** Number of UConverterUnicodeSet selectors. @stable ICU 2.6 */ UCNV_SET_COUNT } UConverterUnicodeSet; + /** * Returns the set of Unicode code points that can be converted by an ICU converter. * @@ -741,9 +897,9 @@ typedef enum UConverterUnicodeSet { * sets with different properties. * * @param cnv The converter for which a set is requested. - * @param set A valid USet; it will be cleared by the function before + * @param setFillIn A valid USet *. It will be cleared by this function before * the converter's specific set is filled into the USet. - * @param which A UConverterUnicodeSet selector; + * @param whichSet A UConverterUnicodeSet selector; * currently UCNV_ROUNDTRIP_SET is the only supported value. * @param pErrorCode ICU error code in/out parameter. * Must fulfill U_SUCCESS before the function call. 
@@ -751,12 +907,12 @@ typedef enum UConverterUnicodeSet { * @see UConverterUnicodeSet * @see uset_open * @see uset_close - * @draft ICU 2.6 + * @stable ICU 2.6 */ -U_CAPI void U_EXPORT2 +U_STABLE void U_EXPORT2 ucnv_getUnicodeSet(const UConverter *cnv, - USet *set, - UConverterUnicodeSet which, + USet *setFillIn, + UConverterUnicodeSet whichSet, UErrorCode *pErrorCode); /** @@ -770,7 +926,7 @@ ucnv_getUnicodeSet(const UConverter *cnv, * @see ucnv_setToUCallBack * @stable ICU 2.0 */ -U_CAPI void U_EXPORT2 +U_STABLE void U_EXPORT2 ucnv_getToUCallBack (const UConverter * converter, UConverterToUCallback *action, const void **context); @@ -786,7 +942,7 @@ ucnv_getToUCallBack (const UConverter * converter, * @see ucnv_setFromUCallBack * @stable ICU 2.0 */ -U_CAPI void U_EXPORT2 +U_STABLE void U_EXPORT2 ucnv_getFromUCallBack (const UConverter * converter, UConverterFromUCallback *action, const void **context); @@ -806,7 +962,7 @@ ucnv_getFromUCallBack (const UConverter * converter, * @see ucnv_getToUCallBack * @stable ICU 2.0 */ -U_CAPI void U_EXPORT2 +U_STABLE void U_EXPORT2 ucnv_setToUCallBack (UConverter * converter, UConverterToUCallback newAction, const void* newContext, @@ -829,7 +985,7 @@ ucnv_setToUCallBack (UConverter * converter, * @see ucnv_getFromUCallBack * @stable ICU 2.0 */ -U_CAPI void U_EXPORT2 +U_STABLE void U_EXPORT2 ucnv_setFromUCallBack (UConverter * converter, UConverterFromUCallback newAction, const void *newContext, @@ -856,6 +1012,12 @@ ucnv_setFromUCallBack (UConverter * converter, * consumed. At that point, the caller should reset the source and * sourceLimit pointers to point to the next chunk. * + * At the end of the stream (flush==TRUE), the input is completely consumed + * when *source==sourceLimit and no error code is set. + * The converter object is then automatically reset by this function. + * (This means that a converter need not be reset explicitly between data + * streams if it finishes the previous stream without errors.) 
+ * * This is a stateful conversion. Additionally, even when all source data has * been consumed, some data may be in the converters' internal state. * Call this function repeatedly, updating the target pointers with @@ -889,7 +1051,7 @@ ucnv_setFromUCallBack (UConverter * converter, * @see ucnv_setToUCallBack * @stable ICU 2.0 */ -U_CAPI void U_EXPORT2 +U_STABLE void U_EXPORT2 ucnv_fromUnicode (UConverter * converter, char **target, const char *targetLimit, @@ -918,6 +1080,12 @@ ucnv_fromUnicode (UConverter * converter, * returned, it means that all of the source buffer has been * consumed. At that point, the caller should reset the source and * sourceLimit pointers to point to the next chunk. + * + * At the end of the stream (flush==TRUE), the input is completely consumed + * when *source==sourceLimit and no error code is set. + * The converter object is then automatically reset by this function. + * (This means that a converter need not be reset explicitly between data + * streams if it finishes the previous stream without errors.) * * This is a stateful conversion. Additionally, even when all source data has * been consumed, some data may be in the converters' internal state. @@ -952,7 +1120,7 @@ ucnv_fromUnicode (UConverter * converter, * @see ucnv_getNextUChar * @stable ICU 2.0 */ -U_CAPI void U_EXPORT2 +U_STABLE void U_EXPORT2 ucnv_toUnicode(UConverter *converter, UChar **target, const UChar *targetLimit, @@ -970,7 +1138,7 @@ ucnv_toUnicode(UConverter *converter, * It is only useful for whole strings, not for streaming conversion. * * The maximum output buffer capacity required (barring output from callbacks) will be - * srcLength*ucnv_getMaxCharSize(cnv). + * UCNV_GET_MAX_BYTES_FOR_STRING(srcLength, ucnv_getMaxCharSize(cnv)). 
* * @param cnv the converter object to be used (ucnv_resetFromUnicode() will be called) * @param src the input Unicode string @@ -986,9 +1154,10 @@ ucnv_toUnicode(UConverter *converter, * and a buffer of the indicated length would need to be passed in * @see ucnv_fromUnicode * @see ucnv_convert + * @see UCNV_GET_MAX_BYTES_FOR_STRING * @stable ICU 2.0 */ -U_CAPI int32_t U_EXPORT2 +U_STABLE int32_t U_EXPORT2 ucnv_fromUChars(UConverter *cnv, char *dest, int32_t destCapacity, const UChar *src, int32_t srcLength, @@ -1020,18 +1189,46 @@ ucnv_fromUChars(UConverter *cnv, * @see ucnv_convert * @stable ICU 2.0 */ -U_CAPI int32_t U_EXPORT2 +U_STABLE int32_t U_EXPORT2 ucnv_toUChars(UConverter *cnv, UChar *dest, int32_t destCapacity, const char *src, int32_t srcLength, UErrorCode *pErrorCode); /** - * Will convert a codepage buffer into unicode one character at a time. - *

This function was written to be efficient when transcoding small - * amounts of data at a time. - * In that case it will be more efficient than \Ref{ucnv_toUnicode}. - * When converting large buffers use \Ref{ucnv_toUnicode}.

+ * Convert a codepage buffer into Unicode one character at a time. + * The input is completely consumed when the U_INDEX_OUTOFBOUNDS_ERROR is set. + * + * Advantages compared to ucnv_toUnicode() or ucnv_toUChars(): + * - Faster for small amounts of data, for most converters, e.g., + * US-ASCII, ISO-8859-1, UTF-8/16/32, and most "normal" charsets. + * (For complex converters, e.g., SCSU, UTF-7 and ISO 2022 variants, + * it uses ucnv_toUnicode() internally.) + * - Convenient. + * + * Limitations compared to ucnv_toUnicode(): + * - Always assumes flush=TRUE. + * This makes ucnv_getNextUChar() unsuitable for "streaming" conversion, + * that is, for cases where the input is supplied in multiple buffers, + * because ucnv_getNextUChar() will assume the end of the input at the end + * of the first buffer. + * - Does not provide offset output. + * + * It is possible to "mix" ucnv_getNextUChar() and ucnv_toUnicode() because + * ucnv_getNextUChar() uses the current state of the converter + * (unlike ucnv_toUChars() which always resets first). + * However, if ucnv_getNextUChar() is called after ucnv_toUnicode() + * stopped in the middle of a character sequence (with flush=FALSE), + * then ucnv_getNextUChar() will always use the slower ucnv_toUnicode() + * internally until the next character boundary. + * (This is new in ICU 2.6. In earlier releases, ucnv_getNextUChar() had to + * start at a character boundary.) + * + * Instead of using ucnv_getNextUChar(), it is recommended + * to convert using ucnv_toUnicode() or ucnv_toUChars() + * and then iterate over the text using U16_NEXT() or a UCharIterator (uiter.h) + * or a C++ CharacterIterator or similar. + * This allows streaming conversion and offset output, for example. * * 

Handling of surrogate pairs and supplementary-plane code points:
* There are two different kinds of codepages that provide mappings for surrogate characters: @@ -1068,7 +1265,7 @@ ucnv_toUChars(UConverter *cnv, * @see ucnv_convert * @stable ICU 2.0 */ -U_CAPI UChar32 U_EXPORT2 +U_STABLE UChar32 U_EXPORT2 ucnv_getNextUChar(UConverter * converter, const char **source, const char * sourceLimit, @@ -1079,6 +1276,12 @@ ucnv_getNextUChar(UConverter * converter, * Internally, two conversions - ucnv_toUnicode() and ucnv_fromUnicode() - * are used, "pivoting" through 16-bit Unicode. * + * Important: For streaming conversion (multiple function calls for successive + * parts of a text stream), the caller must provide a pivot buffer explicitly, + * and must preserve the pivot buffer and associated pointers from one + * call to another. (The buffer may be moved if its contents and the relative + * pointer positions are preserved.) + * * There is a similar function, ucnv_convert(), * which has the following limitations: * - it takes charset names, not converter objects, so that @@ -1090,7 +1293,7 @@ ucnv_getNextUChar(UConverter * converter, * * By contrast, ucnv_convertEx() * - takes UConverter parameters instead of charset names - * - fully exposes the pivot buffer for complete error handling + * - fully exposes the pivot buffer for streaming conversion and complete error handling * * ucnv_convertEx() also provides further convenience: * - an option to reset the converters at the beginning @@ -1104,6 +1307,7 @@ ucnv_getNextUChar(UConverter * converter, * or set U_STRING_NOT_TERMINATED_WARNING if the output exactly fills * the target buffer * - the pivot buffer can be provided internally; + * possible only for whole-string conversion, not streaming conversion; * in this case, the caller will not be able to get details about where an * error occurred * (if pivotStart==NULL, see below) @@ -1200,9 +1404,9 @@ ucnv_getNextUChar(UConverter * converter, * @see ucnv_toUnicode * @see ucnv_fromUChars * @see ucnv_toUChars - * @draft ICU 2.6 + * @stable 
ICU 2.6 */ -U_CAPI void U_EXPORT2 +U_STABLE void U_EXPORT2 ucnv_convertEx(UConverter *targetCnv, UConverter *sourceCnv, char **target, const char *targetLimit, const char **source, const char *sourceLimit, @@ -1266,7 +1470,7 @@ ucnv_convertEx(UConverter *targetCnv, UConverter *sourceCnv, * @see ucnv_getNextUChar * @stable ICU 2.0 */ -U_CAPI int32_t U_EXPORT2 +U_STABLE int32_t U_EXPORT2 ucnv_convert(const char *toConverterName, const char *fromConverterName, char *target, @@ -1318,9 +1522,9 @@ ucnv_convert(const char *toConverterName, * @see ucnv_toUnicode * @see ucnv_fromUChars * @see ucnv_toUChars - * @draft ICU 2.6 + * @stable ICU 2.6 */ -U_CAPI int32_t U_EXPORT2 +U_STABLE int32_t U_EXPORT2 ucnv_toAlgorithmic(UConverterType algorithmicType, UConverter *cnv, char *target, int32_t targetCapacity, @@ -1370,9 +1574,9 @@ ucnv_toAlgorithmic(UConverterType algorithmicType, * @see ucnv_toUnicode * @see ucnv_fromUChars * @see ucnv_toUChars - * @draft ICU 2.6 + * @stable ICU 2.6 */ -U_CAPI int32_t U_EXPORT2 +U_STABLE int32_t U_EXPORT2 ucnv_fromAlgorithmic(UConverter *cnv, UConverterType algorithmicType, char *target, int32_t targetCapacity, @@ -1386,7 +1590,7 @@ ucnv_fromAlgorithmic(UConverter *cnv, * @see ucnv_close * @stable ICU 2.0 */ -U_CAPI int32_t U_EXPORT2 +U_STABLE int32_t U_EXPORT2 ucnv_flushCache(void); /** @@ -1396,7 +1600,7 @@ ucnv_flushCache(void); * @see ucnv_getAvailableName * @stable ICU 2.0 */ -U_CAPI int32_t U_EXPORT2 +U_STABLE int32_t U_EXPORT2 ucnv_countAvailable(void); /** @@ -1409,7 +1613,7 @@ ucnv_countAvailable(void); * @see ucnv_countAvailable * @stable ICU 2.0 */ -U_CAPI const char* U_EXPORT2 +U_STABLE const char* U_EXPORT2 ucnv_getAvailableName(int32_t n); /** @@ -1422,9 +1626,9 @@ ucnv_getAvailableName(int32_t n); * @see ucnv_getAvailableName * @see uenum_close * @see uenum_next - * @draft ICU 2.4 + * @stable ICU 2.4 */ -U_CAPI UEnumeration * U_EXPORT2 +U_STABLE UEnumeration * U_EXPORT2 ucnv_openAllNames(UErrorCode *pErrorCode); /** @@ -1437,7 
+1641,7 @@ ucnv_openAllNames(UErrorCode *pErrorCode); * @return number of names on alias list for given alias * @stable ICU 2.0 */ -U_CAPI uint16_t U_EXPORT2 +U_STABLE uint16_t U_EXPORT2 ucnv_countAliases(const char *alias, UErrorCode *pErrorCode); /** @@ -1452,7 +1656,7 @@ ucnv_countAliases(const char *alias, UErrorCode *pErrorCode); * @see ucnv_countAliases * @stable ICU 2.0 */ -U_CAPI const char * U_EXPORT2 +U_STABLE const char * U_EXPORT2 ucnv_getAlias(const char *alias, uint16_t n, UErrorCode *pErrorCode); /** @@ -1468,7 +1672,7 @@ ucnv_getAlias(const char *alias, uint16_t n, UErrorCode *pErrorCode); * @param pErrorCode result of operation * @stable ICU 2.0 */ -U_CAPI void U_EXPORT2 +U_STABLE void U_EXPORT2 ucnv_getAliases(const char *alias, const char **aliases, UErrorCode *pErrorCode); /** @@ -1492,9 +1696,9 @@ ucnv_getAliases(const char *alias, const char **aliases, UErrorCode *pErrorCode) * @see ucnv_getStandardName * @see uenum_close * @see uenum_next - * @draft ICU 2.2 + * @stable ICU 2.2 */ -U_CAPI UEnumeration * U_EXPORT2 +U_STABLE UEnumeration * U_EXPORT2 ucnv_openStandardNames(const char *convName, const char *standard, UErrorCode *pErrorCode); @@ -1504,7 +1708,7 @@ ucnv_openStandardNames(const char *convName, * @return number of standards * @stable ICU 2.0 */ -U_CAPI uint16_t U_EXPORT2 +U_STABLE uint16_t U_EXPORT2 ucnv_countStandards(void); /** @@ -1514,7 +1718,7 @@ ucnv_countStandards(void); * @return returns the name of the standard at given index. Owned by the library. * @stable ICU 2.0 */ -U_CAPI const char * U_EXPORT2 +U_STABLE const char * U_EXPORT2 ucnv_getStandard(uint16_t n, UErrorCode *pErrorCode); /** @@ -1536,7 +1740,7 @@ ucnv_getStandard(uint16_t n, UErrorCode *pErrorCode); * then NULL is returned. Owned by the library. 
* @stable ICU 2.0 */ -U_CAPI const char * U_EXPORT2 +U_STABLE const char * U_EXPORT2 ucnv_getStandardName(const char *name, const char *standard, UErrorCode *pErrorCode); /** @@ -1556,9 +1760,9 @@ ucnv_getStandardName(const char *name, const char *standard, UErrorCode *pErrorC * then NULL is returned. The returned string is * owned by the library. * @see ucnv_getStandardName - * @draft ICU 2.4 + * @stable ICU 2.4 */ -U_CAPI const char * U_EXPORT2 +U_STABLE const char * U_EXPORT2 ucnv_getCanonicalName(const char *alias, const char *standard, UErrorCode *pErrorCode); /** @@ -1571,18 +1775,21 @@ ucnv_getCanonicalName(const char *alias, const char *standard, UErrorCode *pErro * @see ucnv_setDefaultName * @stable ICU 2.0 */ -U_CAPI const char * U_EXPORT2 +U_STABLE const char * U_EXPORT2 ucnv_getDefaultName(void); /** - * sets the current default converter name. Caller must own the storage for 'name' - * and preserve it indefinitely. + * This function sets the current default converter name. + * DO NOT call this function from multiple threads! This function is not + * thread safe. If this function needs to be called, it should be called + * during application initialization. Most of the time, the result from + * ucnv_getDefaultName() is sufficient for your application. * @param name the converter name to be the default (must exist). * @see ucnv_getDefaultName - * @system SYSTEM API + * @system * @stable ICU 2.0 */ -U_CAPI void U_EXPORT2 +U_STABLE void U_EXPORT2 ucnv_setDefaultName(const char *name); /** @@ -1602,7 +1809,7 @@ ucnv_setDefaultName(const char *name); * @see ucnv_isAmbiguous * @stable ICU 2.0 */ -U_CAPI void U_EXPORT2 +U_STABLE void U_EXPORT2 ucnv_fixFileSeparator(const UConverter *cnv, UChar *source, int32_t sourceLen); /** @@ -1613,7 +1820,7 @@ ucnv_fixFileSeparator(const UConverter *cnv, UChar *source, int32_t sourceLen); * character, FALSE otherwise. 
* @stable ICU 2.0 */ -U_CAPI UBool U_EXPORT2 +U_STABLE UBool U_EXPORT2 ucnv_isAmbiguous(const UConverter *cnv); /** @@ -1623,7 +1830,7 @@ ucnv_isAmbiguous(const UConverter *cnv); * mapping, FALSE otherwise. * @stable ICU 2.0 */ -U_CAPI void U_EXPORT2 +U_STABLE void U_EXPORT2 ucnv_setFallback(UConverter *cnv, UBool usesFallback); /** @@ -1632,7 +1839,7 @@ ucnv_setFallback(UConverter *cnv, UBool usesFallback); * @return TRUE if the converter uses fallback, FALSE otherwise. * @stable ICU 2.0 */ -U_CAPI UBool U_EXPORT2 +U_STABLE UBool U_EXPORT2 ucnv_usesFallback(const UConverter *cnv); /** @@ -1656,7 +1863,7 @@ ucnv_usesFallback(const UConverter *cnv); * UErrorCode err = U_ZERO_ERROR; * char input[] = { '\xEF','\xBB', '\xBF','\x41','\x42','\x43' }; * int32_t signatureLength = 0; - * char *encoding = ucnv_detectUnicodeSignatures(input,sizeof(input),&signatureLength,&err); + * char *encoding = ucnv_detectUnicodeSignature(input,sizeof(input),&signatureLength,&err); * UConverter *conv = NULL; * UChar output[100]; * UChar *target = output, *out; @@ -1686,18 +1893,46 @@ ucnv_usesFallback(const UConverter *cnv); * @param signatureLength A pointer to int32_t to receive the number of bytes that make up the signature * of the detected UTF. 0 if not detected. * Can be a NULL pointer. - * @param pErrorCode A pointer to receive information about any errors that may occur during detection. - * Must be a valid pointer to an error code value, which must not indicate a failure - * before the function call. + * @param pErrorCode ICU error code in/out parameter. + * Must fulfill U_SUCCESS before the function call. * @return The name of the encoding detected. NULL if encoding is not detected. 
* @stable ICU 2.4 */ -U_CAPI const char* U_EXPORT2 +U_STABLE const char* U_EXPORT2 ucnv_detectUnicodeSignature(const char* source, int32_t sourceLength, int32_t *signatureLength, UErrorCode *pErrorCode); +/** + * Returns the number of UChars held in the converter's internal state + * because more input is needed for completing the conversion. This function is + * useful for mapping semantics of ICU's converter interface to those of iconv, + * and this information is not needed for normal conversion. + * @param cnv The converter in which the input is held + * @param status ICU error code in/out parameter. + * Must fulfill U_SUCCESS before the function call. + * @return The number of UChars in the state. -1 if an error is encountered. + * @draft ICU 3.4 + */ +U_DRAFT int32_t U_EXPORT2 +ucnv_fromUCountPending(const UConverter* cnv, UErrorCode* status); + +/** + * Returns the number of chars held in the converter's internal state + * because more input is needed for completing the conversion. This function is + * useful for mapping semantics of ICU's converter interface to those of iconv, + * and this information is not needed for normal conversion. + * @param cnv The converter in which the input is held as internal state + * @param status ICU error code in/out parameter. + * Must fulfill U_SUCCESS before the function call. + * @return The number of chars in the state. -1 if an error is encountered. + * @draft ICU 3.4 + */ +U_DRAFT int32_t U_EXPORT2 +ucnv_toUCountPending(const UConverter* cnv, UErrorCode* status); + #endif -/*_UCNV*/ +#endif +/*_UCNV*/