/*
**********************************************************************
-* Copyright (C) 2002-2003, International Business Machines
+* Copyright (C) 2002-2004, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
* file name: ucnv_u8.c
*/
#include "unicode/utypes.h"
+
+#if !UCONFIG_NO_CONVERSION
+
#include "unicode/ucnv.h"
-#include "unicode/ucnv_err.h"
#include "ucnv_bld.h"
#include "ucnv_cnv.h"
#include "cmemory.h"
/* Keep these here to make finicky compilers happy */
-U_CFUNC void T_UConverter_toUnicode_UTF8(UConverterToUnicodeArgs *args,
- UErrorCode *err);
-U_CFUNC void T_UConverter_toUnicode_UTF8_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
- UErrorCode *err);
-U_CFUNC void T_UConverter_fromUnicode_UTF8(UConverterFromUnicodeArgs *args,
+U_CFUNC void ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs *args,
UErrorCode *err);
-U_CFUNC void T_UConverter_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs *args,
+U_CFUNC void ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs *args,
UErrorCode *err);
-U_CFUNC UChar32 T_UConverter_getNextUChar_UTF8(UConverterToUnicodeArgs *args,
- UErrorCode *err);
/* UTF-8 -------------------------------------------------------------------- */
static const uint32_t
utf8_minChar32[7]={ 0, 0, 0x80, 0x800, 0x10000, 0xffffffff, 0xffffffff }; /* smallest code point accepted for a sequence of [index] bytes; the 0xffffffff entries make every 5- and 6-byte sequence fail the minimum-value check */
-/**
- * Calls invalid char callback when an invalid character sequence is encountered.
- * It presumes that the converter has a callback to call.
- *
- * @returns true when callback fails
- */
-static UBool
-T_UConverter_toUnicode_InvalidChar_Callback(UConverterToUnicodeArgs * args,
- UConverterCallbackReason reason,
- UErrorCode *err)
-{
- UConverter *converter = args->converter;
-
- if (U_SUCCESS(*err))
- {
- if (reason == UCNV_ILLEGAL) {
- *err = U_ILLEGAL_CHAR_FOUND;
- } else {
- *err = U_INVALID_CHAR_FOUND;
- }
- }
-
- /* copy the toUBytes to the invalidCharBuffer */
- uprv_memcpy(converter->invalidCharBuffer,
- converter->toUBytes,
- converter->toULength);
- converter->invalidCharLength = converter->toULength;
-
- /* Call the ErrorFunction */
- args->converter->fromCharErrorBehaviour(converter->toUContext,
- args,
- converter->invalidCharBuffer,
- converter->invalidCharLength,
- reason,
- err);
-
- return (UBool)U_FAILURE(*err);
-}
-
-static UBool
-T_UConverter_toUnicode_InvalidChar_OffsetCallback(UConverterToUnicodeArgs * args,
- int32_t currentOffset,
- UConverterCallbackReason reason,
- UErrorCode *err)
-{
- int32_t *saveOffsets = args->offsets;
- UBool result;
-
- result = T_UConverter_toUnicode_InvalidChar_Callback(args, reason, err);
-
- while (saveOffsets < args->offsets)
- {
- *(saveOffsets++) = currentOffset;
- }
- return result;
-}
-
-U_CFUNC void T_UConverter_toUnicode_UTF8 (UConverterToUnicodeArgs * args,
+static void ucnv_toUnicode_UTF8 (UConverterToUnicodeArgs * args,
UErrorCode * err)
{
const unsigned char *mySource = (unsigned char *) args->source;
int32_t i, inBytes;
/* Restore size of current sequence */
-start:
if (args->converter->toUnicodeStatus && myTarget < targetLimit)
{
inBytes = args->converter->mode; /* restore # of bytes to consume */
}
else
{
- if (args->flush)
- {
- if (U_SUCCESS(*err))
- {
- *err = U_TRUNCATED_CHAR_FOUND;
- }
- }
- else
- { /* stores a partially calculated target*/
- args->converter->toUnicodeStatus = ch;
- args->converter->mode = inBytes;
- args->converter->toULength = (int8_t) i;
- }
+ /* store the partially decoded code point so that a later call can resume this sequence */
+ args->converter->toUnicodeStatus = ch;
+ args->converter->mode = inBytes;
+ args->converter->toULength = (int8_t) i;
goto donefornow;
}
}
(isCESU8 ? i <= 3 : !UTF_IS_SURROGATE(ch)))
{
/* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
+ args->converter->toULength = 0;
if (ch <= MAXIMUM_UCS2)
{
/* fits in 16 bits */
}
else
{
- args->source = (const char *) mySource;
- args->target = myTarget;
-
args->converter->toULength = (int8_t)i;
- if (T_UConverter_toUnicode_InvalidChar_Callback(args, UCNV_ILLEGAL, err))
- {
- /* Stop if the error wasn't handled */
- /* args and err should already be set properly */
- return;
- }
-
- mySource = (unsigned char *) args->source;
- myTarget = args->target;
-
- /* goto the start to handle state left behind by the callback */
- goto start;
+ *err = U_ILLEGAL_CHAR_FOUND;
+ break;
}
}
}
args->source = (const char *) mySource;
}
-U_CFUNC void T_UConverter_toUnicode_UTF8_OFFSETS_LOGIC (UConverterToUnicodeArgs * args,
+static void ucnv_toUnicode_UTF8_OFFSETS_LOGIC (UConverterToUnicodeArgs * args,
UErrorCode * err)
{
const unsigned char *mySource = (unsigned char *) args->source;
int32_t i, inBytes;
/* Restore size of current sequence */
-start:
if (args->converter->toUnicodeStatus && myTarget < targetLimit)
{
inBytes = args->converter->mode; /* restore # of bytes to consume */
}
else
{
- if (args->flush)
- {
- if (U_SUCCESS(*err))
- {
- *err = U_TRUNCATED_CHAR_FOUND;
- args->converter->toUnicodeStatus = 0;
- }
- }
- else
- {
- args->converter->toUnicodeStatus = ch;
- args->converter->mode = inBytes;
- args->converter->toULength = (int8_t)i;
- }
+ args->converter->toUnicodeStatus = ch;
+ args->converter->mode = inBytes;
+ args->converter->toULength = (int8_t)i;
goto donefornow;
}
}
(isCESU8 ? i <= 3 : !UTF_IS_SURROGATE(ch)))
{
/* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
+ args->converter->toULength = 0;
if (ch <= MAXIMUM_UCS2)
{
/* fits in 16 bits */
}
else
{
- args->source = (const char *) mySource;
- args->target = myTarget;
- args->offsets = myOffsets;
-
args->converter->toULength = (int8_t)i;
- if (T_UConverter_toUnicode_InvalidChar_OffsetCallback(args,
- offsetNum, UCNV_ILLEGAL, err))
- {
- /* Stop if the error wasn't handled */
- /* args and err should already be set properly */
- return;
- }
-
- offsetNum += i + ((unsigned char *) args->source - mySource);
- mySource = (unsigned char *) args->source;
- myTarget = args->target;
- myOffsets = args->offsets;
-
- /* goto the start to handle state left behind by the callback */
- goto start;
+ *err = U_ILLEGAL_CHAR_FOUND;
+ break;
}
}
}
args->offsets = myOffsets;
}
-U_CFUNC void T_UConverter_fromUnicode_UTF8 (UConverterFromUnicodeArgs * args,
+U_CFUNC void ucnv_fromUnicode_UTF8 (UConverterFromUnicodeArgs * args,
UErrorCode * err)
{
UConverter *cnv = args->converter;
const UChar *sourceLimit = args->sourceLimit;
const unsigned char *targetLimit = (unsigned char *) args->targetLimit;
UBool isCESU8 = (UBool)(args->converter->sharedData == &_CESU8Data);
- uint32_t ch, ch2;
+ UChar32 ch, ch2;
int16_t indexToWrite;
char temp[4];
- if (cnv->fromUSurrogateLead && myTarget < targetLimit)
+ if (cnv->fromUChar32 && myTarget < targetLimit)
{
- ch = cnv->fromUSurrogateLead;
- cnv->fromUSurrogateLead = 0;
+ ch = cnv->fromUChar32;
+ cnv->fromUChar32 = 0;
goto lowsurrogate;
}
} else {
/* this is an unmatched lead code unit (1st surrogate) */
/* callback(illegal) */
- ch2 = ch;
+ cnv->fromUChar32 = ch;
+ *err = U_ILLEGAL_CHAR_FOUND;
+ break;
}
} else {
/* no more input */
- cnv->fromUSurrogateLead = (UChar)ch;
+ cnv->fromUChar32 = ch;
break;
}
} else {
/* this is an unmatched trail code unit (2nd surrogate) */
/* callback(illegal) */
- ch2 = ch;
- }
-
- if(ch2 != 0) {
- /* call the callback function with all the preparations and post-processing */
+ cnv->fromUChar32 = ch;
*err = U_ILLEGAL_CHAR_FOUND;
-
- /* update the arguments structure */
- args->source=mySource;
- args->target=(char *)myTarget;
-
- /* write the code point as code units */
- cnv->invalidUCharBuffer[0] = (UChar)ch2;
- cnv->invalidUCharLength = 1;
-
- /* call the callback function */
- cnv->fromUCharErrorBehaviour(cnv->fromUContext, args, cnv->invalidUCharBuffer, 1, ch2, UCNV_ILLEGAL, err);
-
- /* get the converter state from UConverter */
- ch = cnv->fromUSurrogateLead;
- cnv->fromUSurrogateLead = 0;
-
- myTarget=(uint8_t *)args->target;
- mySource=args->source;
-
- /*
- * If the callback overflowed the target, then we need to
- * stop here with an overflow indication.
- */
- if(*err==U_BUFFER_OVERFLOW_ERROR) {
- break;
- } else if(U_FAILURE(*err)) {
- /* break on error */
- break;
- } else if(cnv->charErrorBufferLength>0) {
- /* target is full */
- *err=U_BUFFER_OVERFLOW_ERROR;
- break;
- /*
- * } else if(ch != 0) { ...
- * ### TODO 2002jul01 markus: It looks like this code (from ucnvmbcs.c)
- * does not handle the case where the callback leaves ch=fromUSurrogateLead!=0 .
- * We would have to check myTarget<targetLimit and goto lowsurrogate?!
- */
- }
-
- continue;
+ break;
}
}
{
*err = U_BUFFER_OVERFLOW_ERROR;
}
- if(args->flush && mySource >= sourceLimit && cnv->fromUSurrogateLead != 0 && U_SUCCESS(*err)) {
- /* a Unicode code point remains incomplete (only a first surrogate) */
- *err = U_TRUNCATED_CHAR_FOUND;
- cnv->fromUSurrogateLead = 0;
- }
args->target = (char *) myTarget;
args->source = mySource;
}
-U_CFUNC void T_UConverter_fromUnicode_UTF8_OFFSETS_LOGIC (UConverterFromUnicodeArgs * args,
+U_CFUNC void ucnv_fromUnicode_UTF8_OFFSETS_LOGIC (UConverterFromUnicodeArgs * args,
UErrorCode * err)
{
UConverter *cnv = args->converter;
const UChar *sourceLimit = args->sourceLimit;
const unsigned char *targetLimit = (unsigned char *) args->targetLimit;
UBool isCESU8 = (UBool)(args->converter->sharedData == &_CESU8Data);
- uint32_t ch, ch2;
+ UChar32 ch, ch2;
int32_t offsetNum, nextSourceIndex;
int16_t indexToWrite;
char temp[4];
- if (cnv->fromUSurrogateLead && myTarget < targetLimit)
+ if (cnv->fromUChar32 && myTarget < targetLimit)
{
- ch = cnv->fromUSurrogateLead;
- cnv->fromUSurrogateLead = 0;
+ ch = cnv->fromUChar32;
+ cnv->fromUChar32 = 0;
offsetNum = -1;
nextSourceIndex = 0;
goto lowsurrogate;
} else {
/* this is an unmatched lead code unit (1st surrogate) */
/* callback(illegal) */
- ch2 = ch;
+ cnv->fromUChar32 = ch;
+ *err = U_ILLEGAL_CHAR_FOUND;
+ break;
}
} else {
/* no more input */
- cnv->fromUSurrogateLead = (UChar)ch;
+ cnv->fromUChar32 = ch;
break;
}
} else {
/* this is an unmatched trail code unit (2nd surrogate) */
/* callback(illegal) */
- ch2 = ch;
- }
-
- if(ch2 != 0) {
- /* call the callback function with all the preparations and post-processing */
+ cnv->fromUChar32 = ch;
*err = U_ILLEGAL_CHAR_FOUND;
-
- /* update the arguments structure */
- args->source=mySource;
- args->target=(char *)myTarget;
- args->offsets=myOffsets;
-
- /* write the code point as code units */
- cnv->invalidUCharBuffer[0] = (UChar)ch2;
- cnv->invalidUCharLength = 1;
-
- /* call the callback function */
- cnv->fromUCharErrorBehaviour(cnv->fromUContext, args, cnv->invalidUCharBuffer, 1, ch2, UCNV_ILLEGAL, err);
-
- /* get the converter state from UConverter */
- ch = cnv->fromUSurrogateLead;
- cnv->fromUSurrogateLead = 0;
-
- /* update target and deal with offsets if necessary */
- myOffsets=ucnv_updateCallbackOffsets(myOffsets, ((uint8_t *)args->target)-myTarget, offsetNum);
- myTarget=(uint8_t *)args->target;
-
- /* update the source pointer and index */
- offsetNum=nextSourceIndex+(args->source-mySource);
- mySource=args->source;
-
- /*
- * If the callback overflowed the target, then we need to
- * stop here with an overflow indication.
- */
- if(*err==U_BUFFER_OVERFLOW_ERROR) {
- break;
- } else if(U_FAILURE(*err)) {
- /* break on error */
- break;
- } else if(cnv->charErrorBufferLength>0) {
- /* target is full */
- *err=U_BUFFER_OVERFLOW_ERROR;
- break;
- /*
- * } else if(ch != 0) { ...
- * ### TODO 2002jul01 markus: It looks like this code (from ucnvmbcs.c)
- * does not handle the case where the callback leaves ch=fromUSurrogateLead!=0 .
- * We would have to check myTarget<targetLimit and goto lowsurrogate?!
- */
- }
-
- continue;
+ break;
}
}
{
*err = U_BUFFER_OVERFLOW_ERROR;
}
- if(args->flush && mySource >= sourceLimit && cnv->fromUSurrogateLead != 0 && U_SUCCESS(*err)) {
- /* a Unicode code point remains incomplete (only a first surrogate) */
- *err = U_TRUNCATED_CHAR_FOUND;
- cnv->fromUSurrogateLead = 0;
- }
args->target = (char *) myTarget;
args->source = mySource;
args->offsets = myOffsets;
}
-U_CFUNC UChar32 T_UConverter_getNextUChar_UTF8(UConverterToUnicodeArgs *args,
+static UChar32 ucnv_getNextUChar_UTF8(UConverterToUnicodeArgs *args,
UErrorCode *err) {
- UChar buffer[2];
- const char *sourceInitial;
+ UConverter *cnv;
+ const uint8_t *sourceInitial; /* start of the sequence, kept so its bytes can be copied to toUBytes on error */
const uint8_t *source;
- UChar* myUCharPtr;
uint16_t extraBytesToWrite;
uint8_t myByte;
UChar32 ch;
- int8_t isLegalSequence;
- UBool isCESU8 = (UBool)(args->converter->sharedData == &_CESU8Data);
+ int8_t i, isLegalSequence;
- while (args->source < args->sourceLimit)
+ /*
+ * Decode exactly one code point starting at args->source.
+ * On success args->source is advanced past the sequence and the code point is returned.
+ * On failure 0xffff is returned and *err is set:
+ *   U_INDEX_OUTOFBOUNDS_ERROR - there is no input at all
+ *   U_TRUNCATED_CHAR_FOUND    - the input ends inside a sequence whose bytes so far are valid
+ *   U_ILLEGAL_CHAR_FOUND      - bad lead or trail byte, non-shortest form,
+ *                               value above MAXIMUM_UTF, or a surrogate code point
+ * For the latter two errors the consumed bytes are saved in cnv->toUBytes/toULength
+ * and args->source is advanced past them; no callback is invoked from here.
+ *
+ * UTF-8 only here, the framework handles CESU-8 to combine surrogate pairs
+ */
+
+ cnv = args->converter;
+ sourceInitial = source = (const uint8_t *)args->source;
+ if (source >= (const uint8_t *)args->sourceLimit)
{
- sourceInitial = args->source;
- myByte = (uint8_t)*(args->source++);
- if (myByte < 0x80)
- {
- return (UChar32)myByte;
- }
+ /* no input */
+ *err = U_INDEX_OUTOFBOUNDS_ERROR;
+ return 0xffff;
+ }
- extraBytesToWrite = (uint16_t)bytesFromUTF8[myByte];
- if (extraBytesToWrite == 0) {
- isLegalSequence = FALSE;
- ch = 0;
- goto CALL_ERROR_FUNCTION;
- }
+ myByte = (uint8_t)*(source++);
+ if (myByte < 0x80) /* single byte: the byte value is the code point */
+ {
+ args->source = (const char *)source;
+ return (UChar32)myByte;
+ }
- /*The byte sequence is longer than the buffer area passed*/
- source = (const uint8_t *)args->source;
- if (((const char *)source + extraBytesToWrite - 1) > args->sourceLimit)
- {
- *err = U_TRUNCATED_CHAR_FOUND;
- return 0xffff;
- }
- else
- {
- isLegalSequence = 1;
- ch = myByte << 6;
- switch(extraBytesToWrite)
- {
- /* note: code falls through cases! (sic)*/
- case 6:
- ch += (myByte = *source++);
- ch <<= 6;
- if (!UTF8_IS_TRAIL(myByte))
- {
- isLegalSequence = 0;
- break;
- }
- case 5:
- ch += (myByte = *source++);
- ch <<= 6;
- if (!UTF8_IS_TRAIL(myByte))
- {
- isLegalSequence = 0;
- break;
- }
- case 4:
- ch += (myByte = *source++);
- ch <<= 6;
- if (!UTF8_IS_TRAIL(myByte))
- {
- isLegalSequence = 0;
- break;
- }
- case 3:
- ch += (myByte = *source++);
- ch <<= 6;
- if (!UTF8_IS_TRAIL(myByte))
- {
- isLegalSequence = 0;
- break;
- }
- case 2:
- ch += (myByte = *source++);
- if (!UTF8_IS_TRAIL(myByte))
- {
- isLegalSequence = 0;
- }
- };
- }
- ch -= offsetsFromUTF8[extraBytesToWrite];
+ extraBytesToWrite = (uint16_t)bytesFromUTF8[myByte]; /* used below as the total byte count of the sequence (lead byte included); 0 is treated as an illegal lead byte */
+ if (extraBytesToWrite == 0) {
+ cnv->toUBytes[0] = myByte;
+ cnv->toULength = 1;
+ *err = U_ILLEGAL_CHAR_FOUND;
args->source = (const char *)source;
+ return 0xffff;
+ }
- /*
- * Legal UTF-8 byte sequences in Unicode 3.0.1 and up:
- * - use only trail bytes after a lead byte (checked above)
- * - use the right number of trail bytes for a given lead byte
- * - encode a code point <= U+10ffff
- * - use the fewest possible number of bytes for their code points
- * - use at most 4 bytes (for i>=5 it is 0x10ffff<utf8_minChar32[])
- *
- * Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8.
- * There are no irregular sequences any more.
- * In CESU-8, only surrogates, not supplementary code points, are encoded directly.
- */
- if (isLegalSequence && (uint32_t)ch <= MAXIMUM_UTF && (uint32_t)ch >= utf8_minChar32[extraBytesToWrite]) {
- if(isCESU8) {
- if(extraBytesToWrite <= 3) {
- if( UTF_IS_FIRST_SURROGATE(ch) &&
- (const char *)(source + 3) <= args->sourceLimit &&
- source[0] == 0xed && (source[1] & 0xf0) == 0xb0 && (source[2] & 0xc0) == 0x80
- ) {
- /* ch is a lead surrogate followed by a trail surrogate */
- ch = (ch << 10) +
- ((source[1] & 0xf) << 6) + (source[2] & 0x3f) -
- ((0xd800 << 10) - 0x10000);
- args->source = (const char *)(source + 3);
- }
- return ch; /* return the code point */
- }
- /* illegal CESU-8 */
+ /* The sequence needs more bytes than remain in the input (the lead byte is already consumed) */
+ if (((const char *)source + extraBytesToWrite - 1) > args->sourceLimit)
+ {
+ /* check if all of the remaining bytes are trail bytes */
+ cnv->toUBytes[0] = myByte;
+ i = 1;
+ *err = U_TRUNCATED_CHAR_FOUND; /* default outcome; replaced below if a non-trail byte is found */
+ while(source < (const uint8_t *)args->sourceLimit) {
+ if(U8_IS_TRAIL(myByte = *source)) {
+ cnv->toUBytes[i++] = myByte;
+ ++source;
} else {
- if(!UTF_IS_SURROGATE(ch)) {
- return ch; /* return the code point */
- }
- /* illegal UTF-8 */
+ /* error even before we run out of input */
+ *err = U_ILLEGAL_CHAR_FOUND;
+ break;
}
}
+ cnv->toULength = i;
+ args->source = (const char *)source;
+ return 0xffff;
+ }
-CALL_ERROR_FUNCTION:
- extraBytesToWrite = (uint16_t)(args->source - sourceInitial);
- args->converter->invalidCharLength = (uint8_t)extraBytesToWrite;
- uprv_memcpy(args->converter->invalidCharBuffer, sourceInitial, extraBytesToWrite);
-
- myUCharPtr = buffer;
- *err = U_ILLEGAL_CHAR_FOUND;
- args->target = myUCharPtr;
- args->targetLimit = buffer + 2;
- args->converter->fromCharErrorBehaviour(args->converter->toUContext,
- args,
- sourceInitial,
- extraBytesToWrite,
- UCNV_ILLEGAL,
- err);
-
- if(U_SUCCESS(*err)) {
- extraBytesToWrite = (uint16_t)(args->target - buffer);
- if(extraBytesToWrite > 0) {
- return ucnv_getUChar32KeepOverflow(args->converter, buffer, extraBytesToWrite);
- }
- /* else (callback did not write anything) continue */
- } else if(*err == U_BUFFER_OVERFLOW_ERROR) {
- *err = U_ZERO_ERROR;
- return ucnv_getUChar32KeepOverflow(args->converter, buffer, 2);
- } else {
- /* break on error */
- /* ### what if a callback set an error but _also_ generated output?! */
- return 0xffff;
+ isLegalSequence = 1;
+ ch = myByte << 6; /* lead byte is accumulated unmasked; offsetsFromUTF8[] is subtracted after the switch */
+ switch(extraBytesToWrite)
+ {
+ /* note: the cases fall through on purpose; each case reads one trail byte */
+ case 6:
+ ch += (myByte = *source);
+ ch <<= 6;
+ if (!UTF8_IS_TRAIL(myByte))
+ {
+ isLegalSequence = 0;
+ break;
}
+ ++source; /* advance only after the byte was validated, so a non-trail byte stays unconsumed */
+ case 5:
+ ch += (myByte = *source);
+ ch <<= 6;
+ if (!UTF8_IS_TRAIL(myByte))
+ {
+ isLegalSequence = 0;
+ break;
+ }
+ ++source;
+ case 4:
+ ch += (myByte = *source);
+ ch <<= 6;
+ if (!UTF8_IS_TRAIL(myByte))
+ {
+ isLegalSequence = 0;
+ break;
+ }
+ ++source;
+ case 3:
+ ch += (myByte = *source);
+ ch <<= 6;
+ if (!UTF8_IS_TRAIL(myByte))
+ {
+ isLegalSequence = 0;
+ break;
+ }
+ ++source;
+ case 2:
+ ch += (myByte = *source);
+ if (!UTF8_IS_TRAIL(myByte))
+ {
+ isLegalSequence = 0;
+ break;
+ }
+ ++source;
+ };
+ ch -= offsetsFromUTF8[extraBytesToWrite];
+ args->source = (const char *)source;
+
+ /*
+ * Legal UTF-8 byte sequences in Unicode 3.0.1 and up:
+ * - use only trail bytes after a lead byte (checked above)
+ * - use the right number of trail bytes for a given lead byte
+ * - encode a code point <= U+10ffff
+ * - use the fewest possible number of bytes for their code points
+ * - use at most 4 bytes (for i>=5 it is 0x10ffff<utf8_minChar32[])
+ *
+ * Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8.
+ * There are no irregular sequences any more.
+ */
+ if (isLegalSequence &&
+ (uint32_t)ch <= MAXIMUM_UTF &&
+ (uint32_t)ch >= utf8_minChar32[extraBytesToWrite] &&
+ !U_IS_SURROGATE(ch)
+ ) {
+ return ch; /* return the code point */
}
- /* no input or only skipping callback calls */
- *err = U_INDEX_OUTOFBOUNDS_ERROR;
+ for(i = 0; sourceInitial < source; ++i) { /* save the consumed bytes of the illegal sequence */
+ cnv->toUBytes[i] = *sourceInitial++;
+ }
+ cnv->toULength = i;
+ *err = U_ILLEGAL_CHAR_FOUND;
return 0xffff;
}
NULL,
NULL,
- T_UConverter_toUnicode_UTF8,
- T_UConverter_toUnicode_UTF8_OFFSETS_LOGIC,
- T_UConverter_fromUnicode_UTF8,
- T_UConverter_fromUnicode_UTF8_OFFSETS_LOGIC,
- T_UConverter_getNextUChar_UTF8,
+ ucnv_toUnicode_UTF8,
+ ucnv_toUnicode_UTF8_OFFSETS_LOGIC,
+ ucnv_fromUnicode_UTF8,
+ ucnv_fromUnicode_UTF8_OFFSETS_LOGIC,
+ ucnv_getNextUChar_UTF8,
NULL,
NULL,
static const UConverterStaticData _UTF8StaticData={
sizeof(UConverterStaticData),
"UTF-8",
- 1208, UCNV_IBM, UCNV_UTF8, 1, 4,
+ 1208, UCNV_IBM, UCNV_UTF8,
+ 1, 3, /* max 3 bytes per UChar from UTF-8 (4 bytes from surrogate _pair_) */
{ 0xef, 0xbf, 0xbd, 0 },3,FALSE,FALSE,
0,
0,
/* CESU-8 converter data ---------------------------------------------------- */
+static const UConverterImpl _CESU8Impl={ /* CESU-8 function table: same conversion functions as UTF-8, but no getNextUChar */
+ UCNV_CESU8,
+
+ NULL,
+ NULL,
+
+ NULL,
+ NULL,
+ NULL,
+
+ ucnv_toUnicode_UTF8,
+ ucnv_toUnicode_UTF8_OFFSETS_LOGIC,
+ ucnv_fromUnicode_UTF8,
+ ucnv_fromUnicode_UTF8_OFFSETS_LOGIC,
+ NULL, /* getNextUChar slot left empty: ucnv_getNextUChar_UTF8 handles UTF-8 only and leaves CESU-8 surrogate pairing to the framework */
+
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ ucnv_getCompleteUnicodeSet
+};
+
static const UConverterStaticData _CESU8StaticData={
sizeof(UConverterStaticData),
"CESU-8",
const UConverterSharedData _CESU8Data={
sizeof(UConverterSharedData), ~((uint32_t) 0),
- NULL, NULL, &_CESU8StaticData, FALSE, &_UTF8Impl,
+ NULL, NULL, &_CESU8StaticData, FALSE, &_CESU8Impl,
0
};
+
+#endif