[apple/icu.git] / icuSources / common / ucnv_cnv.h

/*
**********************************************************************
*   Copyright (C) 1999-2004, International Business Machines
*   Corporation and others.  All Rights Reserved.
**********************************************************************
*
*   ucnv_cnv.h:
*   defines all the low level conversion functions
*   T_UnicodeConverter_{to,from}Unicode_$ConversionType
*
* Modification History:
*
*   Date        Name        Description
*   05/09/00    helena      Added implementation to handle fallback mappings.
*   06/29/2000  helena      Major rewrite of the callback APIs.
*/

#ifndef UCNV_CNV_H
#define UCNV_CNV_H

#include "unicode/utypes.h"

#if !UCONFIG_NO_CONVERSION

#include "unicode/ucnv.h"
#include "unicode/ucnv_err.h"
#include "unicode/uset.h"
#include "uset_imp.h"

U_CDECL_BEGIN

/* this is used in fromUnicode DBCS tables as an "unassigned" marker */
#define missingCharMarker 0xFFFF

/*
 * #define missingUCharMarker 0xfffe
 *
 * commented out because there are actually two values used in toUnicode tables:
 * U+fffe "unassigned"
 * U+ffff "illegal"
 */

/**
 * Forward declaration, see ucnv_bld.h.
 * The incomplete type is sufficient here: this header only uses
 * UConverterSharedData through pointers and extern declarations.
 */
struct UConverterSharedData;
typedef struct UConverterSharedData UConverterSharedData;

/* function types for UConverterImpl ---------------------------------------- */

/*
 * Argument bundle passed to UConverterLoad implementations and ucnv_load().
 * The three leading 32-bit fields plus options keep the two pointers that
 * follow well aligned (see the comment on reserved).
 */
typedef struct {
    /* sizeof(UConverterLoadArgs) */
    int32_t size;
    /* count nested ucnv_load() calls */
    int32_t nestedLoads;
    /* reserved - for good alignment of the pointers */
    int32_t reserved;
    uint32_t options;
    /* NOTE(review): presumably the data package name - confirm against ucnv_load() */
    const char *pkg;
    /* NOTE(review): presumably the converter name - confirm against ucnv_load() */
    const char *name;
} UConverterLoadArgs;

/*
 * Function type for the UConverterImpl.load slot.
 * Takes the shared data object, the UConverterLoadArgs bundle,
 * a pointer to raw bytes, and the error code to report through.
 */
typedef void (*UConverterLoad) (UConverterSharedData *sharedData,
                                UConverterLoadArgs *pArgs,
                                const uint8_t *raw, UErrorCode *pErrorCode);
/* Function type for the UConverterImpl.unload slot. */
typedef void (*UConverterUnload) (UConverterSharedData *sharedData);

/*
 * Function type for the UConverterImpl.open slot.
 * When open() fails, then close() will be called, if present.
 */
typedef void (*UConverterOpen) (UConverter *cnv,
                                const char *name,
                                const char *locale,
                                uint32_t options,
                                UErrorCode *pErrorCode);

/* Function type for the UConverterImpl.close slot. */
typedef void (*UConverterClose) (UConverter *cnv);

/*
 * Choice argument handed to UConverterReset implementations.
 * The enumerator names indicate resetting both directions, only the
 * to-Unicode side, or only the from-Unicode side.
 */
typedef enum UConverterResetChoice {
    UCNV_RESET_BOTH = 0,
    UCNV_RESET_TO_UNICODE = 1,
    UCNV_RESET_FROM_UNICODE = 2
} UConverterResetChoice;

/* Function type for the UConverterImpl.reset slot; choice is a UConverterResetChoice value. */
typedef void (*UConverterReset) (UConverter *cnv, UConverterResetChoice choice);

/*
 * Converter implementation function(s) for ucnv_toUnicode().
 * If the toUnicodeWithOffsets function pointer is NULL,
 * then the toUnicode function will be used and the offsets will be set to -1.
 *
 * Must maintain state across buffers. Use toUBytes[toULength] for partial input
 * sequences; it will be checked in ucnv.c at the end of the input stream
 * to detect truncated input.
 * Some converters may need additional detection and may then set U_TRUNCATED_CHAR_FOUND.
 *
 * The toUnicodeWithOffsets must write exactly as many offset values as target
 * units. Write offset values of -1 for when the source index corresponding to
 * the output unit is not known (e.g., the character started in an earlier buffer).
 * The pArgs->offsets pointer need not be moved forward.
 *
 * At function return, one of the following conditions must be true:
 * - U_BUFFER_OVERFLOW_ERROR and the target is full: target==targetLimit
 * - another error code with toUBytes[toULength] set to the offending input
 * - no error, and the source is consumed: source==sourceLimit
 *
 * The ucnv.c code will handle the end of the input
 * (reset, and truncation detection) and callbacks.
 */
typedef void (*UConverterToUnicode) (UConverterToUnicodeArgs *, UErrorCode *);

/*
 * Function type for the fromUnicode and fromUnicodeWithOffsets slots
 * of UConverterImpl.
 *
 * Same rules as for UConverterToUnicode.
 * A lead surrogate is kept in fromUChar32 across buffers, and if an error
 * occurs, then the offending input code point must be put into fromUChar32
 * as well.
 */
typedef void (*UConverterFromUnicode) (UConverterFromUnicodeArgs *, UErrorCode *);

/*
 * Converter implementation function for ucnv_getNextUChar().
 * If the function pointer is NULL, then the toUnicode function will be used.
 *
 * Will be called at a character boundary (toULength==0).
 * The function may finish in one of these ways:
 * - set U_INDEX_OUTOFBOUNDS_ERROR if there was no output for the input
 *   (the return value will be ignored)
 * - set U_TRUNCATED_CHAR_FOUND or another error code (never U_BUFFER_OVERFLOW_ERROR!)
 *   with toUBytes[toULength] set to the offending input
 *   (the return value will be ignored)
 * - return UCNV_GET_NEXT_UCHAR_USE_TO_U, without moving the source pointer,
 *   to indicate that the ucnv.c code shall call the toUnicode function instead
 * - return a real code point result
 *
 * Unless UCNV_GET_NEXT_UCHAR_USE_TO_U is returned, the source bytes must be consumed.
 *
 * The ucnv.c code will handle the end of the input (reset)
 * (except for truncation detection!) and callbacks.
 */
typedef UChar32 (*UConverterGetNextUChar) (UConverterToUnicodeArgs *, UErrorCode *);

/*
 * Function type for the UConverterImpl.getStarters slot.
 * The starters array has 256 UBool entries.
 * NOTE(review): presumably one entry per byte value, set when that byte
 * can start a character - confirm against ucnv_getStarters().
 */
typedef void (*UConverterGetStarters)(const UConverter* converter,
                                      UBool starters[256],
                                      UErrorCode *pErrorCode);

/*
 * If this function pointer is NULL, or if the function returns NULL,
 * then the ucnv_getName() API function should return the name field
 * of the static data struct instead.
 */
typedef const char * (*UConverterGetName) (const UConverter *cnv);

/**
 * Write the codepage substitution character.
 * If this function is not set, then ucnv_cbFromUWriteSub() writes
 * the substitution character from UConverter.
 * For stateful converters, it is typically necessary to handle this
 * specifically for the converter in order to properly maintain the state.
 */
typedef void (*UConverterWriteSub) (UConverterFromUnicodeArgs *pArgs, int32_t offsetIndex, UErrorCode *pErrorCode);

/**
 * For converter-specific safeClone processing.
 * If this function is not set, then ucnv_safeClone assumes that the
 * converter has no private data that changes after the converter is done
 * opening.
 * If this function is set, then it is called just after a memcpy() of
 * converter data to the new, empty converter, and is expected to set up
 * the initial state of the converter.  It is not expected to increment the
 * reference counts of the standard data types such as the shared data.
 */
typedef UConverter * (*UConverterSafeClone) (const UConverter   *cnv, 
                                             void               *stackBuffer,
                                             int32_t            *pBufferSize, 
                                             UErrorCode         *status);

/**
 * Fills the set of Unicode code points that can be converted by an ICU converter.
 * The API function ucnv_getUnicodeSet() clears the USet before calling
 * the converter's getUnicodeSet() implementation; the implementation itself
 * must only add code points (never clear or remove), so that it can be
 * used recursively.
 * For example, the ISO-2022-JP converter will call each subconverter's
 * getUnicodeSet() implementation to consecutively add code points to
 * the same USet, which will result in a union of the sets of all subconverters.
 *
 * For more documentation, see ucnv_getUnicodeSet() in ucnv.h.
 */
typedef void (*UConverterGetUnicodeSet) (const UConverter *cnv,
                                         const USetAdder *sa,
                                         UConverterUnicodeSet which,
                                         UErrorCode *pErrorCode);

/*
 * NOTE(review): presumably reports whether err counts as success for
 * conversion purposes; the definition is not visible in this header - confirm.
 */
UBool CONVERSION_U_SUCCESS (UErrorCode err);

/**
 * UConverterImpl contains all the data and functions for a converter type.
 * Its function pointers work much like a C++ vtable.
 * Many converter types need to define only a subset of the functions;
 * when a function pointer is NULL, then a default action will be performed.
 *
 * Every converter type must implement toUnicode, fromUnicode, and getNextUChar,
 * otherwise the converter may crash.
 * NOTE(review): the comment on the UConverterGetNextUChar function type says
 * that a NULL getNextUChar falls back to the toUnicode function; confirm
 * which of the two statements is current.
 * Every converter type that has variable-length codepage sequences should
 * also implement toUnicodeWithOffsets and fromUnicodeWithOffsets for
 * correct offset handling.
 * All other functions may or may not be implemented - it depends only on
 * whether the converter type needs them.
 *
 * When open() fails, then close() will be called, if present.
 */
struct UConverterImpl {
    UConverterType type;

    /* functions that take the UConverterSharedData object */
    UConverterLoad load;
    UConverterUnload unload;

    /* functions that take the UConverter itself */
    UConverterOpen open;
    UConverterClose close;
    UConverterReset reset;

    /* conversion functions; see the comments on their function types */
    UConverterToUnicode toUnicode;
    UConverterToUnicode toUnicodeWithOffsets;
    UConverterFromUnicode fromUnicode;
    UConverterFromUnicode fromUnicodeWithOffsets;
    UConverterGetNextUChar getNextUChar;

    /* optional hooks; see the comments on their function types */
    UConverterGetStarters getStarters;
    UConverterGetName getName;
    UConverterWriteSub writeSub;
    UConverterSafeClone safeClone;
    UConverterGetUnicodeSet getUnicodeSet;
};

/*
 * Declarations of the shared-data objects for the individual converter
 * types; the definitions live outside this header.
 */
extern const UConverterSharedData _MBCSData, _Latin1Data;
extern const UConverterSharedData _UTF8Data, _UTF16BEData, _UTF16LEData,
                                  _UTF32BEData, _UTF32LEData;
extern const UConverterSharedData _ISO2022Data;
extern const UConverterSharedData _LMBCSData1, _LMBCSData2, _LMBCSData3,
                                  _LMBCSData4, _LMBCSData5, _LMBCSData6,
                                  _LMBCSData8, _LMBCSData11, _LMBCSData16,
                                  _LMBCSData17, _LMBCSData18, _LMBCSData19;
extern const UConverterSharedData _HZData, _ISCIIData, _SCSUData, _ASCIIData;
extern const UConverterSharedData _UTF7Data, _Bocu1Data, _UTF16Data,
                                  _UTF32Data, _CESU8Data, _IMAPData;

U_CDECL_END

/**
 * Always use fallbacks from codepage to Unicode.
 * Both macros ignore their argument and expand to TRUE.
 */
#define TO_U_USE_FALLBACK(useFallback) TRUE
#define UCNV_TO_U_USE_FALLBACK(cnv) TRUE

/**
 * Use fallbacks from Unicode to codepage when cnv->useFallback or for private-use code points.
 *
 * IS_PRIVATE_USE(c) is true for U+E000..U+F8FF and for U+F0000..U+10FFFF.
 * Each range test is a single unsigned comparison: c is converted to
 * uint32_t first and the range start is subtracted in unsigned arithmetic,
 * so values below the start wrap around to large numbers and fail the "<"
 * test. Converting before subtracting keeps the arithmetic well-defined for
 * every input; a signed subtraction could overflow for very negative c.
 *
 * Note: IS_PRIVATE_USE evaluates c twice; do not pass an expression with
 * side effects.
 */
#define IS_PRIVATE_USE(c) ((uint32_t)((uint32_t)(c)-0xe000u)<0x1900u || (uint32_t)((uint32_t)(c)-0xf0000u)<0x20000u)
#define FROM_U_USE_FALLBACK(useFallback, c) ((useFallback) || IS_PRIVATE_USE(c))
#define UCNV_FROM_U_USE_FALLBACK(cnv, c) FROM_U_USE_FALLBACK((cnv)->useFallback, c)

/**
 * Magic number for ucnv_getNextUChar(), returned by a
 * getNextUChar() implementation to indicate to use the converter's toUnicode()
 * instead of the native function.
 * The value is parenthesized so that the macro expands as a single
 * operand in any surrounding expression.
 * @internal
 */
#define UCNV_GET_NEXT_UCHAR_USE_TO_U (-9)

/*
 * Has the same parameter list as the UConverterGetUnicodeSet function type.
 * NOTE(review): presumably adds all Unicode code points to the set behind
 * sa, for converters that can convert everything - only the prototype is
 * visible here, confirm against the definition.
 */
U_CFUNC void
ucnv_getCompleteUnicodeSet(const UConverter *cnv,
                           const USetAdder *sa,
                           UConverterUnicodeSet which,
                           UErrorCode *pErrorCode);

/*
 * Has the same parameter list as the UConverterGetUnicodeSet function type.
 * NOTE(review): presumably adds all Unicode code points except the
 * surrogate code points to the set behind sa - only the prototype is
 * visible here, confirm against the definition.
 */
U_CFUNC void
ucnv_getNonSurrogateUnicodeSet(const UConverter *cnv,
                               const USetAdder *sa,
                               UConverterUnicodeSet which,
                               UErrorCode *pErrorCode);

/*
 * Helper for fromUnicode implementations: takes length bytes plus in/out
 * target and offsets pointers (passed by address so they can be advanced).
 * NOTE(review): presumably copies the bytes to *target up to targetLimit,
 * records sourceIndex in *offsets when offsets are requested, and reports
 * overflow through pErrorCode - only the prototype is visible here, confirm
 * against the definition.
 */
U_CFUNC void
ucnv_fromUWriteBytes(UConverter *cnv,
                     const char *bytes, int32_t length,
                     char **target, const char *targetLimit,
                     int32_t **offsets,
                     int32_t sourceIndex,
                     UErrorCode *pErrorCode);
/*
 * Helper for toUnicode implementations: takes length UChars plus in/out
 * target and offsets pointers (passed by address so they can be advanced).
 * NOTE(review): presumably the UChar counterpart of ucnv_fromUWriteBytes() -
 * only the prototype is visible here, confirm against the definition.
 */
U_CFUNC void
ucnv_toUWriteUChars(UConverter *cnv,
                    const UChar *uchars, int32_t length,
                    UChar **target, const UChar *targetLimit,
                    int32_t **offsets,
                    int32_t sourceIndex,
                    UErrorCode *pErrorCode);

/*
 * Helper for toUnicode implementations: takes a single code point c plus
 * in/out target and offsets pointers (passed by address so they can be
 * advanced).
 * NOTE(review): presumably writes c as one or two UChars to *target and
 * reports overflow through pErrorCode - only the prototype is visible here,
 * confirm against the definition.
 */
U_CFUNC void
ucnv_toUWriteCodePoint(UConverter *cnv,
                       UChar32 c,
                       UChar **target, const UChar *targetLimit,
                       int32_t **offsets,
                       int32_t sourceIndex,
                       UErrorCode *pErrorCode);

#endif /* !UCONFIG_NO_CONVERSION */

#endif /* UCNV_CNV_H */
Commit	Line	Data
b75a7d8f A	1	/*
b75a7d8f A	2	**********************************************************************
374ca955	3	* Copyright (C) 1999-2004, International Business Machines
b75a7d8f A	4	* Corporation and others. All Rights Reserved.
	5	**********************************************************************
	6	*
	7	* uconv_cnv.h:
	8	* defines all the low level conversion functions
	9	* T_UnicodeConverter_{to,from}Unicode_$ConversionType
	10	*
	11	* Modification History:
	12	*
	13	* Date Name Description
	14	* 05/09/00 helena Added implementation to handle fallback mappings.
	15	* 06/29/2000 helena Major rewrite of the callback APIs.
	16	*/
	17
	18	#ifndef UCNV_CNV_H
	19	#define UCNV_CNV_H
	20
	21	#include "unicode/utypes.h"
b75a7d8f	22
374ca955	23	#if !UCONFIG_NO_CONVERSION
b75a7d8f	24
374ca955 A	25	#include "unicode/ucnv.h"
	26	#include "unicode/ucnv_err.h"
	27	#include "unicode/uset.h"
	28	#include "uset_imp.h"
b75a7d8f A	29
	30	U_CDECL_BEGIN
	31
	32	/* this is used in fromUnicode DBCS tables as an "unassigned" marker */
	33	#define missingCharMarker 0xFFFF
	34
	35	/*
	36	* #define missingUCharMarker 0xfffe
	37	*
374ca955	38	* commented out because there are actually two values used in toUnicode tables:
b75a7d8f A	39	* U+fffe "unassigned"
	40	* U+ffff "illegal"
	41	*/
	42
374ca955 A	43	/** Forward declaration, see ucnv_bld.h */
	44	struct UConverterSharedData;
	45	typedef struct UConverterSharedData UConverterSharedData;
	46
	47	/* function types for UConverterImpl ---------------------------------------- */
b75a7d8f	48
374ca955 A	49	/* struct with arguments for UConverterLoad and ucnv_load() */
	50	typedef struct {
	51	int32_t size; /* sizeof(UConverterLoadArgs) */
	52	int32_t nestedLoads; /* count nested ucnv_load() calls */
	53	int32_t reserved; /* reserved - for good alignment of the pointers */
	54	uint32_t options;
	55	const char *pkg, *name;
	56	} UConverterLoadArgs;
b75a7d8f	57
374ca955 A	58	typedef void (*UConverterLoad) (UConverterSharedData *sharedData,
	59	UConverterLoadArgs *pArgs,
	60	const uint8_t *raw, UErrorCode *pErrorCode);
b75a7d8f A	61	typedef void (*UConverterUnload) (UConverterSharedData *sharedData);
	62
	63	typedef void (*UConverterOpen) (UConverter *cnv, const char *name, const char *locale,uint32_t options, UErrorCode *pErrorCode);
	64	typedef void (*UConverterClose) (UConverter *cnv);
	65
	66	typedef enum UConverterResetChoice {
	67	UCNV_RESET_BOTH,
	68	UCNV_RESET_TO_UNICODE,
	69	UCNV_RESET_FROM_UNICODE
	70	} UConverterResetChoice;
	71
	72	typedef void (*UConverterReset) (UConverter *cnv, UConverterResetChoice choice);
	73
374ca955 A	74	/*
	75	* Converter implementation function(s) for ucnv_toUnicode().
	76	* If the toUnicodeWithOffsets function pointer is NULL,
	77	* then the toUnicode function will be used and the offsets will be set to -1.
	78	*
	79	* Must maintain state across buffers. Use toUBytes[toULength] for partial input
	80	* sequences; it will be checked in ucnv.c at the end of the input stream
	81	* to detect truncated input.
	82	* Some converters may need additional detection and may then set U_TRUNCATED_CHAR_FOUND.
	83	*
	84	* The toUnicodeWithOffsets must write exactly as many offset values as target
	85	* units. Write offset values of -1 for when the source index corresponding to
	86	* the output unit is not known (e.g., the character started in an earlier buffer).
	87	* The pArgs->offsets pointer need not be moved forward.
	88	*
	89	* At function return, either one of the following conditions must be true:
	90	* - U_BUFFER_OVERFLOW_ERROR and the target is full: target==targetLimit
	91	* - another error code with toUBytes[toULength] set to the offending input
	92	* - no error, and the source is consumed: source==sourceLimit
	93	*
	94	* The ucnv.c code will handle the end of the input (reset)
	95	* (reset, and truncation detection) and callbacks.
	96	*/
	97	typedef void (*UConverterToUnicode) (UConverterToUnicodeArgs *, UErrorCode *);
b75a7d8f	98
374ca955 A	99	/*
	100	* Same rules as for UConverterToUnicode.
	101	* A lead surrogate is kept in fromUChar32 across buffers, and if an error
	102	* occurs, then the offending input code point must be put into fromUChar32
	103	* as well.
	104	*/
	105	typedef void (*UConverterFromUnicode) (UConverterFromUnicodeArgs *, UErrorCode *);
b75a7d8f	106
374ca955 A	107	/*
	108	* Converter implementation function for ucnv_getNextUChar().
	109	* If the function pointer is NULL, then the toUnicode function will be used.
	110	*
	111	* Will be called at a character boundary (toULength==0).
	112	* May return with
	113	* - U_INDEX_OUTOFBOUNDS_ERROR if there was no output for the input
	114	* (the return value will be ignored)
	115	* - U_TRUNCATED_CHAR_FOUND or another error code (never U_BUFFER_OVERFLOW_ERROR!)
	116	* with toUBytes[toULength] set to the offending input
	117	* (the return value will be ignored)
	118	* - return UCNV_GET_NEXT_UCHAR_USE_TO_U, without moving the source pointer,
	119	* to indicate that the ucnv.c code shall call the toUnicode function instead
	120	* - return a real code point result
	121	*
	122	* Unless UCNV_GET_NEXT_UCHAR_USE_TO_U is returned, the source bytes must be consumed.
	123	*
	124	* The ucnv.c code will handle the end of the input (reset)
	125	* (except for truncation detection!) and callbacks.
	126	*/
	127	typedef UChar32 (*UConverterGetNextUChar) (UConverterToUnicodeArgs *, UErrorCode *);
b75a7d8f A	128
	129	typedef void (*UConverterGetStarters)(const UConverter* converter,
	130	UBool starters[256],
	131	UErrorCode *pErrorCode);
	132
	133	/* If this function pointer is null or if the function returns null
	134	* the name field in static data struct should be returned by
	135	* ucnv_getName() API function
	136	*/
	137	typedef const char * (*UConverterGetName) (const UConverter *cnv);
	138
	139	/**
	140	* Write the codepage substitution character.
	141	* If this function is not set, then ucnv_cbFromUWriteSub() writes
	142	* the substitution character from UConverter.
	143	* For stateful converters, it is typically necessary to handle this
	144	* specificially for the converter in order to properly maintain the state.
	145	*/
	146	typedef void (*UConverterWriteSub) (UConverterFromUnicodeArgs *pArgs, int32_t offsetIndex, UErrorCode *pErrorCode);
	147
	148	/**
	149	* For converter-specific safeClone processing
	150	* If this function is not set, then ucnv_safeClone assumes that the converter has no private data that changes
	151	* after the converter is done opening.
	152	* If this function is set, then it is called just after a memcpy() of
	153	* converter data to the new, empty converter, and is expected to set up
	154	* the initial state of the converter. It is not expected to increment the
	155	* reference counts of the standard data types such as the shared data.
	156	*/
	157	typedef UConverter * (*UConverterSafeClone) (const UConverter *cnv,
	158	void *stackBuffer,
	159	int32_t *pBufferSize,
	160	UErrorCode *status);
	161
	162	/**
	163	* Fills the set of Unicode code points that can be converted by an ICU converter.
	164	* The API function ucnv_getUnicodeSet() clears the USet before calling
	165	* the converter's getUnicodeSet() implementation; the converter should only
	166	* add the appropriate code points to allow recursive use.
	167	* For example, the ISO-2022-JP converter will call each subconverter's
	168	* getUnicodeSet() implementation to consecutively add code points to
	169	* the same USet, which will result in a union of the sets of all subconverters.
	170	*
	171	* For more documentation, see ucnv_getUnicodeSet() in ucnv.h.
	172	*/
	173	typedef void (*UConverterGetUnicodeSet) (const UConverter *cnv,
73c04bcf	174	const USetAdder *sa,
b75a7d8f A	175	UConverterUnicodeSet which,
	176	UErrorCode *pErrorCode);
	177
	178	UBool CONVERSION_U_SUCCESS (UErrorCode err);
	179
b75a7d8f A	180	/**
	181	* UConverterImpl contains all the data and functions for a converter type.
	182	* Its function pointers work much like a C++ vtable.
	183	* Many converter types need to define only a subset of the functions;
	184	* when a function pointer is NULL, then a default action will be performed.
	185	*
	186	* Every converter type must implement toUnicode, fromUnicode, and getNextUChar,
	187	* otherwise the converter may crash.
	188	* Every converter type that has variable-length codepage sequences should
	189	* also implement toUnicodeWithOffsets and fromUnicodeWithOffsets for
	190	* correct offset handling.
	191	* All other functions may or may not be implemented - it depends only on
	192	* whether the converter type needs them.
	193	*
	194	* When open() fails, then close() will be called, if present.
	195	*/
	196	struct UConverterImpl {
	197	UConverterType type;
	198
	199	UConverterLoad load;
	200	UConverterUnload unload;
	201
	202	UConverterOpen open;
	203	UConverterClose close;
	204	UConverterReset reset;
	205
374ca955 A	206	UConverterToUnicode toUnicode;
	207	UConverterToUnicode toUnicodeWithOffsets;
	208	UConverterFromUnicode fromUnicode;
	209	UConverterFromUnicode fromUnicodeWithOffsets;
	210	UConverterGetNextUChar getNextUChar;
b75a7d8f A	211
	212	UConverterGetStarters getStarters;
	213	UConverterGetName getName;
	214	UConverterWriteSub writeSub;
	215	UConverterSafeClone safeClone;
	216	UConverterGetUnicodeSet getUnicodeSet;
	217	};
	218
	219	extern const UConverterSharedData
	220	_MBCSData, _Latin1Data,
	221	_UTF8Data, _UTF16BEData, _UTF16LEData, _UTF32BEData, _UTF32LEData,
	222	_ISO2022Data,
	223	_LMBCSData1,_LMBCSData2, _LMBCSData3, _LMBCSData4, _LMBCSData5, _LMBCSData6,
	224	_LMBCSData8,_LMBCSData11,_LMBCSData16,_LMBCSData17,_LMBCSData18,_LMBCSData19,
	225	_HZData,_ISCIIData, _SCSUData, _ASCIIData,
	226	_UTF7Data, _Bocu1Data, _UTF16Data, _UTF32Data, _CESU8Data, _IMAPData;
	227
	228	U_CDECL_END
	229
b75a7d8f A	230	/** Always use fallbacks from codepage to Unicode */
	231	#define TO_U_USE_FALLBACK(useFallback) TRUE
	232	#define UCNV_TO_U_USE_FALLBACK(cnv) TRUE
	233
	234	/** Use fallbacks from Unicode to codepage when cnv->useFallback or for private-use code points */
	235	#define IS_PRIVATE_USE(c) ((uint32_t)((c)-0xe000)<0x1900 || (uint32_t)((c)-0xf0000)<0x20000)
	236	#define FROM_U_USE_FALLBACK(useFallback, c) ((useFallback) || IS_PRIVATE_USE(c))
	237	#define UCNV_FROM_U_USE_FALLBACK(cnv, c) FROM_U_USE_FALLBACK((cnv)->useFallback, c)
	238
	239	/**
374ca955 A	240	* Magic number for ucnv_getNextUChar(), returned by a
	241	* getNextUChar() implementation to indicate to use the converter's toUnicode()
	242	* instead of the native function.
	243	* @internal
b75a7d8f	244	*/
374ca955	245	#define UCNV_GET_NEXT_UCHAR_USE_TO_U -9
b75a7d8f A	246
	247	U_CFUNC void
	248	ucnv_getCompleteUnicodeSet(const UConverter *cnv,
73c04bcf	249	const USetAdder *sa,
b75a7d8f A	250	UConverterUnicodeSet which,
	251	UErrorCode *pErrorCode);
	252
	253	U_CFUNC void
	254	ucnv_getNonSurrogateUnicodeSet(const UConverter *cnv,
73c04bcf	255	const USetAdder *sa,
b75a7d8f A	256	UConverterUnicodeSet which,
	257	UErrorCode *pErrorCode);
	258
374ca955 A	259	U_CFUNC void
	260	ucnv_fromUWriteBytes(UConverter *cnv,
	261	const char *bytes, int32_t length,
	262	char **target, const char *targetLimit,
	263	int32_t **offsets,
	264	int32_t sourceIndex,
	265	UErrorCode *pErrorCode);
	266	U_CFUNC void
	267	ucnv_toUWriteUChars(UConverter *cnv,
	268	const UChar *uchars, int32_t length,
	269	UChar **target, const UChar *targetLimit,
	270	int32_t **offsets,
	271	int32_t sourceIndex,
	272	UErrorCode *pErrorCode);
	273
	274	U_CFUNC void
	275	ucnv_toUWriteCodePoint(UConverter *cnv,
	276	UChar32 c,
	277	UChar **target, const UChar *targetLimit,
	278	int32_t **offsets,
	279	int32_t sourceIndex,
	280	UErrorCode *pErrorCode);
	281
	282	#endif
	283
b75a7d8f	284	#endif /* UCNV_CNV */