X-Git-Url: https://git.saurik.com/apple/icu.git/blobdiff_plain/374ca955a76ecab1204ca8bfa63ff9238d998416..08b89b0a244153b9f5bbb2f49c55ab0f7298122e:/icuSources/common/ucnv.c diff --git a/icuSources/common/ucnv.c b/icuSources/common/ucnv.c index 635c78fa..5cf4d15c 100644 --- a/icuSources/common/ucnv.c +++ b/icuSources/common/ucnv.c @@ -1,7 +1,7 @@ /* ****************************************************************************** * -* Copyright (C) 1998-2004, International Business Machines +* Copyright (C) 1998-2013, International Business Machines * Corporation and others. All Rights Reserved. * ****************************************************************************** @@ -27,6 +27,8 @@ #include "unicode/ucnv.h" #include "unicode/ucnv_err.h" #include "unicode/uset.h" +#include "unicode/utf.h" +#include "unicode/utf16.h" #include "putilimp.h" #include "cmemory.h" #include "cstring.h" @@ -34,7 +36,6 @@ #include "utracimp.h" #include "ustr_imp.h" #include "ucnv_imp.h" -#include "ucnv_io.h" #include "ucnv_cnv.h" #include "ucnv_bld.h" @@ -47,26 +48,22 @@ typedef struct UAmbiguousConverter { } UAmbiguousConverter; static const UAmbiguousConverter ambiguousConverters[]={ + { "ibm-897_P100-1995", 0xa5 }, { "ibm-942_P120-1999", 0xa5 }, { "ibm-943_P130-1999", 0xa5 }, - { "ibm-897_P100-1995", 0xa5 }, + { "ibm-946_P100-1995", 0xa5 }, { "ibm-33722_P120-1999", 0xa5 }, + { "ibm-1041_P100-1995", 0xa5 }, + /*{ "ibm-54191_P100-2006", 0xa5 },*/ + /*{ "ibm-62383_P100-2007", 0xa5 },*/ + /*{ "ibm-891_P100-1995", 0x20a9 },*/ + { "ibm-944_P100-1995", 0x20a9 }, { "ibm-949_P110-1999", 0x20a9 }, { "ibm-1363_P110-1997", 0x20a9 }, - { "ISO_2022,locale=ko,version=0", 0x20a9 } + { "ISO_2022,locale=ko,version=0", 0x20a9 }, + { "ibm-1088_P100-1995", 0x20a9 } }; -U_CAPI const char* U_EXPORT2 -ucnv_getDefaultName () -{ - return ucnv_io_getDefaultConverterName(); -} - -U_CAPI void U_EXPORT2 -ucnv_setDefaultName (const char *converterName) -{ - ucnv_io_setDefaultConverterName(converterName); -} /*Calls through createConverter */ U_CAPI UConverter* U_EXPORT2 ucnv_open (const char *name, @@ -107,6 +104,28 @@ ucnv_openU (const UChar * name, return ucnv_open(u_austrcpy(asciiName, name), err); } +/* Copy the string that is represented by the UConverterPlatform enum + * @param platformString An output buffer + * @param platform An enum representing a platform + * @return the length of the copied string. + */ +static int32_t +ucnv_copyPlatformString(char *platformString, UConverterPlatform pltfrm) +{ + switch (pltfrm) + { + case UCNV_IBM: + uprv_strcpy(platformString, "ibm-"); + return 4; + case UCNV_UNKNOWN: + break; + } + + /* default to empty string */ + *platformString = 0; + return 0; +} + /*Assumes a $platform-#codepage.$CONVERTER_FILE_EXTENSION scheme and calls *through createConverter*/ U_CAPI UConverter* U_EXPORT2 @@ -135,6 +154,7 @@ U_CAPI UConverter* U_EXPORT2 ucnv_safeClone(const UConverter* cnv, void *stackBuffer, int32_t *pBufferSize, UErrorCode *status) { UConverter *localConverter, *allocatedConverter; + int32_t stackBufferSize; int32_t bufferSizeNeeded; char *stackBufferChars = (char *)stackBuffer; UErrorCode cbErr; @@ -163,13 +183,13 @@ ucnv_safeClone(const UConverter* cnv, void *stackBuffer, int32_t *pBufferSize, U if (status == NULL || U_FAILURE(*status)){ UTRACE_EXIT_STATUS(status? *status: U_ILLEGAL_ARGUMENT_ERROR); - return 0; + return NULL; } - if (!pBufferSize || !cnv){ + if (cnv == NULL) { *status = U_ILLEGAL_ARGUMENT_ERROR; UTRACE_EXIT_STATUS(*status); - return 0; + return NULL; } UTRACE_DATA3(UTRACE_OPEN_CLOSE, "clone converter %s at %p into stackBuffer %p", @@ -179,6 +199,10 @@ ucnv_safeClone(const UConverter* cnv, void *stackBuffer, int32_t *pBufferSize, U /* call the custom safeClone function for sizing */ bufferSizeNeeded = 0; cnv->sharedData->impl->safeClone(cnv, NULL, &bufferSizeNeeded, status); + if (U_FAILURE(*status)) { + UTRACE_EXIT_STATUS(*status); + return NULL; + } } else { @@ -186,10 +210,16 @@ ucnv_safeClone(const UConverter* cnv, void *stackBuffer, int32_t *pBufferSize, U bufferSizeNeeded = sizeof(UConverter); } - if (*pBufferSize <= 0){ /* 'preflighting' request - set needed size into *pBufferSize */ - *pBufferSize = bufferSizeNeeded; - UTRACE_EXIT_VALUE(bufferSizeNeeded); - return 0; + if (pBufferSize == NULL) { + stackBufferSize = 1; + pBufferSize = &stackBufferSize; + } else { + stackBufferSize = *pBufferSize; + if (stackBufferSize <= 0){ /* 'preflighting' request - set needed size into *pBufferSize */ + *pBufferSize = bufferSizeNeeded; + UTRACE_EXIT_VALUE(bufferSizeNeeded); + return NULL; + } } @@ -198,19 +228,19 @@ ucnv_safeClone(const UConverter* cnv, void *stackBuffer, int32_t *pBufferSize, U */ if (U_ALIGNMENT_OFFSET(stackBuffer) != 0) { int32_t offsetUp = (int32_t)U_ALIGNMENT_OFFSET_UP(stackBufferChars); - if(*pBufferSize > offsetUp) { - *pBufferSize -= offsetUp; + if(stackBufferSize > offsetUp) { + stackBufferSize -= offsetUp; stackBufferChars += offsetUp; } else { /* prevent using the stack buffer but keep the size > 0 so that we do not just preflight */ - *pBufferSize = 1; + stackBufferSize = 1; } } stackBuffer = (void *)stackBufferChars; /* Now, see if we must allocate any memory */ - if (*pBufferSize < bufferSizeNeeded || stackBuffer == NULL) + if (stackBufferSize < bufferSizeNeeded || stackBuffer == NULL) { /* allocate one here...*/ localConverter = allocatedConverter = (UConverter *) uprv_malloc (bufferSizeNeeded); @@ -220,11 +250,8 @@ ucnv_safeClone(const UConverter* cnv, void *stackBuffer, int32_t *pBufferSize, U UTRACE_EXIT_STATUS(*status); return NULL; } - - if (U_SUCCESS(*status)) { - *status = U_SAFECLONE_ALLOCATED_WARNING; - } - + *status = U_SAFECLONE_ALLOCATED_WARNING; + /* record the fact that memory was allocated */ *pBufferSize = bufferSizeNeeded; } else { @@ -239,6 +266,19 @@ ucnv_safeClone(const UConverter* cnv, void *stackBuffer, int32_t *pBufferSize, U uprv_memcpy(localConverter, cnv, sizeof(UConverter)); localConverter->isCopyLocal = localConverter->isExtraLocal = FALSE; + /* copy the substitution string */ + if (cnv->subChars == (uint8_t *)cnv->subUChars) { + localConverter->subChars = (uint8_t *)localConverter->subUChars; + } else { + localConverter->subChars = (uint8_t *)uprv_malloc(UCNV_ERROR_BUFFER_LENGTH * U_SIZEOF_UCHAR); + if (localConverter->subChars == NULL) { + uprv_free(allocatedConverter); + UTRACE_EXIT_STATUS(*status); + return NULL; + } + uprv_memcpy(localConverter->subChars, cnv->subChars, UCNV_ERROR_BUFFER_LENGTH * U_SIZEOF_UCHAR); + } + /* now either call the safeclone fcn or not */ if (cnv->sharedData->impl->safeClone != NULL) { /* call the custom safeClone function */ @@ -246,6 +286,9 @@ ucnv_safeClone(const UConverter* cnv, void *stackBuffer, int32_t *pBufferSize, U } if(localConverter==NULL || U_FAILURE(*status)) { + if (allocatedConverter != NULL && allocatedConverter->subChars != (uint8_t *)allocatedConverter->subUChars) { + uprv_free(allocatedConverter->subChars); + } uprv_free(allocatedConverter); UTRACE_EXIT_STATUS(*status); return NULL; @@ -285,27 +328,6 @@ ucnv_safeClone(const UConverter* cnv, void *stackBuffer, int32_t *pBufferSize, U U_CAPI void U_EXPORT2 ucnv_close (UConverter * converter) { - /* first, notify the callback functions that the converter is closed */ - UConverterToUnicodeArgs toUArgs = { - sizeof(UConverterToUnicodeArgs), - TRUE, - NULL, - NULL, - NULL, - NULL, - NULL, - NULL - }; - UConverterFromUnicodeArgs fromUArgs = { - sizeof(UConverterFromUnicodeArgs), - TRUE, - NULL, - NULL, - NULL, - NULL, - NULL, - NULL - }; UErrorCode errorCode = U_ZERO_ERROR; UTRACE_ENTRY_OC(UTRACE_UCNV_CLOSE); @@ -319,16 +341,50 @@ ucnv_close (UConverter * converter) UTRACE_DATA3(UTRACE_OPEN_CLOSE, "close converter %s at %p, isCopyLocal=%b", ucnv_getName(converter, &errorCode), converter, converter->isCopyLocal); - toUArgs.converter = fromUArgs.converter = converter; + /* In order to speed up the close, only call the callbacks when they have been changed. + This performance check will only work when the callbacks are set within a shared library + or from user code that statically links this code. */ + /* first, notify the callback functions that the converter is closed */ + if (converter->fromCharErrorBehaviour != UCNV_TO_U_DEFAULT_CALLBACK) { + UConverterToUnicodeArgs toUArgs = { + sizeof(UConverterToUnicodeArgs), + TRUE, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL + }; - converter->fromCharErrorBehaviour(converter->toUContext, &toUArgs, NULL, 0, UCNV_CLOSE, &errorCode); - errorCode = U_ZERO_ERROR; - converter->fromUCharErrorBehaviour(converter->fromUContext, &fromUArgs, NULL, 0, 0, UCNV_CLOSE, &errorCode); + toUArgs.converter = converter; + errorCode = U_ZERO_ERROR; + converter->fromCharErrorBehaviour(converter->toUContext, &toUArgs, NULL, 0, UCNV_CLOSE, &errorCode); + } + if (converter->fromUCharErrorBehaviour != UCNV_FROM_U_DEFAULT_CALLBACK) { + UConverterFromUnicodeArgs fromUArgs = { + sizeof(UConverterFromUnicodeArgs), + TRUE, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL + }; + fromUArgs.converter = converter; + errorCode = U_ZERO_ERROR; + converter->fromUCharErrorBehaviour(converter->fromUContext, &fromUArgs, NULL, 0, 0, UCNV_CLOSE, &errorCode); + } if (converter->sharedData->impl->close != NULL) { converter->sharedData->impl->close(converter); } + if (converter->subChars != (uint8_t *)converter->subUChars) { + uprv_free(converter->subChars); + } + /* Checking whether it's an algorithic converter is okay in multithreaded applications because the value never changes. @@ -339,7 +395,7 @@ ucnv_close (UConverter * converter) } if(!converter->isCopyLocal){ - uprv_free (converter); + uprv_free(converter); } UTRACE_EXIT(); @@ -350,47 +406,21 @@ ucnv_close (UConverter * converter) U_CAPI const char* U_EXPORT2 ucnv_getAvailableName (int32_t n) { - if (0 <= n && n <= 0xffff) { - UErrorCode err = U_ZERO_ERROR; - const char *name = ucnv_io_getAvailableConverter((uint16_t)n, &err); - if (U_SUCCESS(err)) { - return name; + if (0 <= n && n <= 0xffff) { + UErrorCode err = U_ZERO_ERROR; + const char *name = ucnv_bld_getAvailableConverter((uint16_t)n, &err); + if (U_SUCCESS(err)) { + return name; + } } - } - return NULL; + return NULL; } U_CAPI int32_t U_EXPORT2 ucnv_countAvailable () { UErrorCode err = U_ZERO_ERROR; - return ucnv_io_countAvailableConverters(&err); -} - -U_CAPI uint16_t U_EXPORT2 -ucnv_countAliases(const char *alias, UErrorCode *pErrorCode) -{ - return ucnv_io_countAliases(alias, pErrorCode); -} - - -U_CAPI const char* U_EXPORT2 -ucnv_getAlias(const char *alias, uint16_t n, UErrorCode *pErrorCode) -{ - return ucnv_io_getAlias(alias, n, pErrorCode); -} - -U_CAPI void U_EXPORT2 -ucnv_getAliases(const char *alias, const char **aliases, UErrorCode *pErrorCode) -{ - ucnv_io_getAliases(alias, 0, aliases, pErrorCode); -} - -U_CAPI uint16_t U_EXPORT2 -ucnv_countStandards(void) -{ - UErrorCode err = U_ZERO_ERROR; - return ucnv_io_countStandards(&err); + return ucnv_bld_countAvailableConverters(&err); } U_CAPI void U_EXPORT2 @@ -402,15 +432,19 @@ ucnv_getSubstChars (const UConverter * converter, if (U_FAILURE (*err)) return; + if (converter->subCharLen <= 0) { + /* Unicode string or empty string from ucnv_setSubstString(). */ + *len = 0; + return; + } + if (*len < converter->subCharLen) /*not enough space in subChars */ { *err = U_INDEX_OUTOFBOUNDS_ERROR; return; } - uprv_memcpy (mySubChar, converter->subChar, converter->subCharLen); /*fills in the subchars */ - *len = converter->subCharLen; /*store # of bytes copied to buffer */ - uprv_memcpy (mySubChar, converter->subChar, converter->subCharLen); /*fills in the subchars */ + uprv_memcpy (mySubChar, converter->subChars, converter->subCharLen); /*fills in the subchars */ *len = converter->subCharLen; /*store # of bytes copied to buffer */ } @@ -431,7 +465,7 @@ ucnv_setSubstChars (UConverter * converter, return; } - uprv_memcpy (converter->subChar, mySubChar, len); /*copies the subchars */ + uprv_memcpy (converter->subChars, mySubChar, len); /*copies the subchars */ converter->subCharLen = len; /*sets the new len */ /* @@ -444,6 +478,93 @@ ucnv_setSubstChars (UConverter * converter, return; } +U_CAPI void U_EXPORT2 +ucnv_setSubstString(UConverter *cnv, + const UChar *s, + int32_t length, + UErrorCode *err) { + UAlignedMemory cloneBuffer[U_CNV_SAFECLONE_BUFFERSIZE / sizeof(UAlignedMemory) + 1]; + char chars[UCNV_ERROR_BUFFER_LENGTH]; + + UConverter *clone; + uint8_t *subChars; + int32_t cloneSize, length8; + + /* Let the following functions check all arguments. */ + cloneSize = sizeof(cloneBuffer); + clone = ucnv_safeClone(cnv, cloneBuffer, &cloneSize, err); + ucnv_setFromUCallBack(clone, UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, err); + length8 = ucnv_fromUChars(clone, chars, (int32_t)sizeof(chars), s, length, err); + ucnv_close(clone); + if (U_FAILURE(*err)) { + return; + } + + if (cnv->sharedData->impl->writeSub == NULL +#if !UCONFIG_NO_LEGACY_CONVERSION + || (cnv->sharedData->staticData->conversionType == UCNV_MBCS && + ucnv_MBCSGetType(cnv) != UCNV_EBCDIC_STATEFUL) +#endif + ) { + /* The converter is not stateful. Store the charset bytes as a fixed string. */ + subChars = (uint8_t *)chars; + } else { + /* + * The converter has a non-default writeSub() function, indicating + * that it is stateful. + * Store the Unicode string for on-the-fly conversion for correct + * state handling. + */ + if (length > UCNV_ERROR_BUFFER_LENGTH) { + /* + * Should not occur. The converter should output at least one byte + * per UChar, which means that ucnv_fromUChars() should catch all + * overflows. + */ + *err = U_BUFFER_OVERFLOW_ERROR; + return; + } + subChars = (uint8_t *)s; + if (length < 0) { + length = u_strlen(s); + } + length8 = length * U_SIZEOF_UCHAR; + } + + /* + * For storing the substitution string, select either the small buffer inside + * UConverter or allocate a subChars buffer. + */ + if (length8 > UCNV_MAX_SUBCHAR_LEN) { + /* Use a separate buffer for the string. Outside UConverter to not make it too large. */ + if (cnv->subChars == (uint8_t *)cnv->subUChars) { + /* Allocate a new buffer for the string. */ + cnv->subChars = (uint8_t *)uprv_malloc(UCNV_ERROR_BUFFER_LENGTH * U_SIZEOF_UCHAR); + if (cnv->subChars == NULL) { + cnv->subChars = (uint8_t *)cnv->subUChars; + *err = U_MEMORY_ALLOCATION_ERROR; + return; + } + uprv_memset(cnv->subChars, 0, UCNV_ERROR_BUFFER_LENGTH * U_SIZEOF_UCHAR); + } + } + + /* Copy the substitution string into the UConverter or its subChars buffer. */ + if (length8 == 0) { + cnv->subCharLen = 0; + } else { + uprv_memcpy(cnv->subChars, subChars, length8); + if (subChars == (uint8_t *)chars) { + cnv->subCharLen = (int8_t)length8; + } else /* subChars == s */ { + cnv->subCharLen = (int8_t)-length; + } + } + + /* See comment in ucnv_setSubstChars(). */ + cnv->subChar1 = 0; +} + /*resets the internal states of a converter *goal : have the same behaviour than a freshly created converter */ @@ -455,8 +576,11 @@ static void _reset(UConverter *converter, UConverterResetChoice choice, if(callCallback) { /* first, notify the callback functions that the converter is reset */ - UConverterToUnicodeArgs toUArgs = { - sizeof(UConverterToUnicodeArgs), + UErrorCode errorCode; + + if(choice<=UCNV_RESET_TO_UNICODE && converter->fromCharErrorBehaviour != UCNV_TO_U_DEFAULT_CALLBACK) { + UConverterToUnicodeArgs toUArgs = { + sizeof(UConverterToUnicodeArgs), TRUE, NULL, NULL, @@ -464,9 +588,14 @@ static void _reset(UConverter *converter, UConverterResetChoice choice, NULL, NULL, NULL - }; - UConverterFromUnicodeArgs fromUArgs = { - sizeof(UConverterFromUnicodeArgs), + }; + toUArgs.converter = converter; + errorCode = U_ZERO_ERROR; + converter->fromCharErrorBehaviour(converter->toUContext, &toUArgs, NULL, 0, UCNV_RESET, &errorCode); + } + if(choice!=UCNV_RESET_TO_UNICODE && converter->fromUCharErrorBehaviour != UCNV_FROM_U_DEFAULT_CALLBACK) { + UConverterFromUnicodeArgs fromUArgs = { + sizeof(UConverterFromUnicodeArgs), TRUE, NULL, NULL, @@ -474,15 +603,8 @@ static void _reset(UConverter *converter, UConverterResetChoice choice, NULL, NULL, NULL - }; - UErrorCode errorCode; - - toUArgs.converter = fromUArgs.converter = converter; - if(choice<=UCNV_RESET_TO_UNICODE) { - errorCode = U_ZERO_ERROR; - converter->fromCharErrorBehaviour(converter->toUContext, &toUArgs, NULL, 0, UCNV_RESET, &errorCode); - } - if(choice!=UCNV_RESET_TO_UNICODE) { + }; + fromUArgs.converter = converter; errorCode = U_ZERO_ERROR; converter->fromUCharErrorBehaviour(converter->fromUContext, &fromUArgs, NULL, 0, 0, UCNV_RESET, &errorCode); } @@ -566,7 +688,7 @@ ucnv_getCCSID(const UConverter * converter, ccsid = converter->sharedData->staticData->codepage; if (ccsid == 0) { /* Rare case. This is for cases like gb18030, - which doesn't have an IBM cannonical name, but does have an IBM alias. */ + which doesn't have an IBM canonical name, but does have an IBM alias. */ const char *standardName = ucnv_getStandardName(ucnv_getName(converter, err), "IBM", err); if (U_SUCCESS(*err) && standardName) { const char *ccsidStr = uprv_strchr(standardName, '-'); @@ -785,20 +907,25 @@ _fromUnicodeWithCallback(UConverterFromUnicodeArgs *pArgs, UErrorCode *err) { * } */ for(;;) { - /* convert */ - fromUnicode(pArgs, err); + if(U_SUCCESS(*err)) { + /* convert */ + fromUnicode(pArgs, err); - /* - * set a flag for whether the converter - * successfully processed the end of the input - * - * need not check cnv->preFromULength==0 because a replay (<0) will cause - * sflush && pArgs->source==pArgs->sourceLimit && - cnv->fromUChar32==0); + /* + * set a flag for whether the converter + * successfully processed the end of the input + * + * need not check cnv->preFromULength==0 because a replay (<0) will cause + * sflush && pArgs->source==pArgs->sourceLimit && + cnv->fromUChar32==0); + } else { + /* handle error from ucnv_convertEx() */ + converterSawEndOfInput=FALSE; + } /* no callback called yet for this iteration */ calledCallback=FALSE; @@ -989,6 +1116,64 @@ _fromUnicodeWithCallback(UConverterFromUnicodeArgs *pArgs, UErrorCode *err) { } } +/* + * Output the fromUnicode overflow buffer. + * Call this function if(cnv->charErrorBufferLength>0). + * @return TRUE if overflow + */ +static UBool +ucnv_outputOverflowFromUnicode(UConverter *cnv, + char **target, const char *targetLimit, + int32_t **pOffsets, + UErrorCode *err) { + int32_t *offsets; + char *overflow, *t; + int32_t i, length; + + t=*target; + if(pOffsets!=NULL) { + offsets=*pOffsets; + } else { + offsets=NULL; + } + + overflow=(char *)cnv->charErrorBuffer; + length=cnv->charErrorBufferLength; + i=0; + while(icharErrorBufferLength=(int8_t)j; + *target=t; + if(offsets!=NULL) { + *pOffsets=offsets; + } + *err=U_BUFFER_OVERFLOW_ERROR; + return TRUE; + } + + /* copy the overflow contents to the target */ + *t++=overflow[i++]; + if(offsets!=NULL) { + *offsets++=-1; /* no source index available for old output */ + } + } + + /* the overflow buffer is completely copied to the target */ + cnv->charErrorBufferLength=0; + *target=t; + if(offsets!=NULL) { + *pOffsets=offsets; + } + return FALSE; +} + U_CAPI void U_EXPORT2 ucnv_fromUnicode(UConverter *cnv, char **target, const char *targetLimit, @@ -1012,13 +1197,22 @@ ucnv_fromUnicode(UConverter *cnv, s=*source; t=*target; - if(sourceLimit= to the address source or target + * + * 2) Make sure that the buffer sizes do not exceed the number range for * int32_t because some functions use the size (in units or bytes) * rather than comparing pointers, and because offsets are int32_t values. * @@ -1028,52 +1222,30 @@ ucnv_fromUnicode(UConverter *cnv, * not be able to maintain the semantics that either the source must be * consumed or the target filled (unless an error occurs). * An adjustment would be targetLimit=t+0x7fffffff; for example. + * + * 3) Make sure that the user didn't incorrectly cast a UChar * pointer + * to a char * pointer and provide an incomplete UChar code unit. */ - if( + if (sourceLimit(size_t)0x3fffffff && sourceLimit>s) || - ((size_t)(targetLimit-t)>(size_t)0x7fffffff && targetLimit>t) - ) { + ((size_t)(targetLimit-t)>(size_t)0x7fffffff && targetLimit>t) || + (((const char *)sourceLimit-(const char *)s) & 1) != 0) + { *err=U_ILLEGAL_ARGUMENT_ERROR; return; } - /* flush the target overflow buffer */ - if(cnv->charErrorBufferLength>0) { - char *overflow; - int32_t i, length; - - overflow=(char *)cnv->charErrorBuffer; - length=cnv->charErrorBufferLength; - i=0; - do { - if(t==targetLimit) { - /* the overflow buffer contains too much, keep the rest */ - int32_t j=0; - - do { - overflow[j++]=overflow[i++]; - } while(icharErrorBufferLength=(int8_t)j; - *target=t; - *err=U_BUFFER_OVERFLOW_ERROR; - return; - } - - /* copy the overflow contents to the target */ - *t++=overflow[i++]; - if(offsets!=NULL) { - *offsets++=-1; /* no source index available for old output */ - } - } while(icharErrorBufferLength=0; + /* output the target overflow buffer */ + if( cnv->charErrorBufferLength>0 && + ucnv_outputOverflowFromUnicode(cnv, target, targetLimit, &offsets, err) + ) { + /* U_BUFFER_OVERFLOW_ERROR */ + return; } + /* *target may have moved, therefore stop using t */ if(!flush && s==sourceLimit && cnv->preFromULength>=0) { /* the overflow buffer is emptied and there is no new input: we are done */ - *target=t; return; } @@ -1091,7 +1263,7 @@ ucnv_fromUnicode(UConverter *cnv, args.offsets=offsets; args.source=s; args.sourceLimit=sourceLimit; - args.target=t; + args.target=*target; args.targetLimit=targetLimit; args.size=sizeof(args); @@ -1196,7 +1368,7 @@ _toUnicodeWithCallback(UConverterToUnicodeArgs *pArgs, UErrorCode *err) { pArgs->flush && pArgs->source==pArgs->sourceLimit && cnv->toULength==0); } else { - /* handle error from getNextUChar() */ + /* handle error from getNextUChar() or ucnv_convertEx() */ converterSawEndOfInput=FALSE; } @@ -1369,11 +1541,14 @@ _toUnicodeWithCallback(UConverterToUnicodeArgs *pArgs, UErrorCode *err) { cnv->toULength=0; /* call the callback function */ + if(cnv->toUCallbackReason==UCNV_ILLEGAL && *err==U_INVALID_CHAR_FOUND) { + cnv->toUCallbackReason = UCNV_UNASSIGNED; + } cnv->fromCharErrorBehaviour(cnv->toUContext, pArgs, cnv->invalidCharBuffer, errorInputLength, - (*err==U_INVALID_CHAR_FOUND || *err==U_UNSUPPORTED_ESCAPE_SEQUENCE) ? - UCNV_UNASSIGNED : UCNV_ILLEGAL, + cnv->toUCallbackReason, err); + cnv->toUCallbackReason = UCNV_ILLEGAL; /* reset to default value */ /* * loop back to the offset handling @@ -1387,6 +1562,64 @@ _toUnicodeWithCallback(UConverterToUnicodeArgs *pArgs, UErrorCode *err) { } } +/* + * Output the toUnicode overflow buffer. + * Call this function if(cnv->UCharErrorBufferLength>0). + * @return TRUE if overflow + */ +static UBool +ucnv_outputOverflowToUnicode(UConverter *cnv, + UChar **target, const UChar *targetLimit, + int32_t **pOffsets, + UErrorCode *err) { + int32_t *offsets; + UChar *overflow, *t; + int32_t i, length; + + t=*target; + if(pOffsets!=NULL) { + offsets=*pOffsets; + } else { + offsets=NULL; + } + + overflow=cnv->UCharErrorBuffer; + length=cnv->UCharErrorBufferLength; + i=0; + while(iUCharErrorBufferLength=(int8_t)j; + *target=t; + if(offsets!=NULL) { + *pOffsets=offsets; + } + *err=U_BUFFER_OVERFLOW_ERROR; + return TRUE; + } + + /* copy the overflow contents to the target */ + *t++=overflow[i++]; + if(offsets!=NULL) { + *offsets++=-1; /* no source index available for old output */ + } + } + + /* the overflow buffer is completely copied to the target */ + cnv->UCharErrorBufferLength=0; + *target=t; + if(offsets!=NULL) { + *pOffsets=offsets; + } + return FALSE; +} + U_CAPI void U_EXPORT2 ucnv_toUnicode(UConverter *cnv, UChar **target, const UChar *targetLimit, @@ -1410,13 +1643,22 @@ ucnv_toUnicode(UConverter *cnv, s=*source; t=*target; - if(sourceLimit= to the address source or target + * + * 2) Make sure that the buffer sizes do not exceed the number range for * int32_t because some functions use the size (in units or bytes) * rather than comparing pointers, and because offsets are int32_t values. * @@ -1426,52 +1668,30 @@ ucnv_toUnicode(UConverter *cnv, * not be able to maintain the semantics that either the source must be * consumed or the target filled (unless an error occurs). * An adjustment would be sourceLimit=t+0x7fffffff; for example. + * + * 3) Make sure that the user didn't incorrectly cast a UChar * pointer + * to a char * pointer and provide an incomplete UChar code unit. */ - if( + if (sourceLimit(size_t)0x7fffffff && sourceLimit>s) || - ((size_t)(targetLimit-t)>(size_t)0x3fffffff && targetLimit>t) + ((size_t)(targetLimit-t)>(size_t)0x3fffffff && targetLimit>t) || + (((const char *)targetLimit-(const char *)t) & 1) != 0 ) { *err=U_ILLEGAL_ARGUMENT_ERROR; return; } - /* flush the target overflow buffer */ - if(cnv->UCharErrorBufferLength>0) { - UChar *overflow; - int32_t i, length; - - overflow=cnv->UCharErrorBuffer; - length=cnv->UCharErrorBufferLength; - i=0; - do { - if(t==targetLimit) { - /* the overflow buffer contains too much, keep the rest */ - int32_t j=0; - - do { - overflow[j++]=overflow[i++]; - } while(iUCharErrorBufferLength=(int8_t)j; - *target=t; - *err=U_BUFFER_OVERFLOW_ERROR; - return; - } - - /* copy the overflow contents to the target */ - *t++=overflow[i++]; - if(offsets!=NULL) { - *offsets++=-1; /* no source index available for old output */ - } - } while(iUCharErrorBufferLength=0; + /* output the target overflow buffer */ + if( cnv->UCharErrorBufferLength>0 && + ucnv_outputOverflowToUnicode(cnv, target, targetLimit, &offsets, err) + ) { + /* U_BUFFER_OVERFLOW_ERROR */ + return; } + /* *target may have moved, therefore stop using t */ if(!flush && s==sourceLimit && cnv->preToULength>=0) { /* the overflow buffer is emptied and there is no new input: we are done */ - *target=t; return; } @@ -1489,7 +1709,7 @@ ucnv_toUnicode(UConverter *cnv, args.offsets=offsets; args.source=s; args.sourceLimit=sourceLimit; - args.target=t; + args.target=*target; args.targetLimit=targetLimit; args.size=sizeof(args); @@ -1587,7 +1807,7 @@ ucnv_toUChars(UConverter *cnv, ucnv_resetToUnicode(cnv); originalDest=dest; if(srcLength==-1) { - srcLength=uprv_strlen(src); + srcLength=(int32_t)uprv_strlen(src); } if(srcLength>0) { srcLimit=src+srcLength; @@ -1839,7 +2059,14 @@ ucnv_convertEx(UConverter *targetCnv, UConverter *sourceCnv, UBool reset, UBool flush, UErrorCode *pErrorCode) { UChar pivotBuffer[CHUNK_SIZE]; - UChar *myPivotSource, *myPivotTarget; + const UChar *myPivotSource; + UChar *myPivotTarget; + const char *s; + char *t; + + UConverterToUnicodeArgs toUArgs; + UConverterFromUnicodeArgs fromUArgs; + UConverterConvert convert; /* error checking */ if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { @@ -1854,10 +2081,35 @@ ucnv_convertEx(UConverter *targetCnv, UConverter *sourceCnv, return; } + s=*source; + t=*target; + if((sourceLimit!=NULL && sourceLimit(size_t)0x7fffffff && sourceLimit>s)) || + ((size_t)(targetLimit-t)>(size_t)0x7fffffff && targetLimit>t) + ) { + *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; + return; + } + if(pivotStart==NULL) { + if(!flush) { + /* streaming conversion requires an explicit pivot buffer */ + *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; + return; + } + /* use the stack pivot buffer */ - pivotStart=myPivotSource=myPivotTarget=pivotBuffer; - pivotSource=&myPivotSource; + myPivotSource=myPivotTarget=pivotStart=pivotBuffer; + pivotSource=(UChar **)&myPivotSource; pivotTarget=&myPivotTarget; pivotLimit=pivotBuffer+CHUNK_SIZE; } else if( pivotStart>=pivotLimit || @@ -1877,51 +2129,260 @@ ucnv_convertEx(UConverter *targetCnv, UConverter *sourceCnv, if(reset) { ucnv_resetToUnicode(sourceCnv); ucnv_resetFromUnicode(targetCnv); - *pivotTarget=*pivotSource=pivotStart; + *pivotSource=*pivotTarget=pivotStart; + } else if(targetCnv->charErrorBufferLength>0) { + /* output the targetCnv overflow buffer */ + if(ucnv_outputOverflowFromUnicode(targetCnv, target, targetLimit, NULL, pErrorCode)) { + /* U_BUFFER_OVERFLOW_ERROR */ + return; + } + /* *target has moved, therefore stop using t */ + + if( !flush && + targetCnv->preFromULength>=0 && *pivotSource==*pivotTarget && + sourceCnv->UCharErrorBufferLength==0 && sourceCnv->preToULength>=0 && s==sourceLimit + ) { + /* the fromUnicode overflow buffer is emptied and there is no new input: we are done */ + return; + } + } + + /* Is direct-UTF-8 conversion available? */ + if( sourceCnv->sharedData->staticData->conversionType==UCNV_UTF8 && + targetCnv->sharedData->impl->fromUTF8!=NULL + ) { + convert=targetCnv->sharedData->impl->fromUTF8; + } else if( targetCnv->sharedData->staticData->conversionType==UCNV_UTF8 && + sourceCnv->sharedData->impl->toUTF8!=NULL + ) { + convert=sourceCnv->sharedData->impl->toUTF8; + } else { + convert=NULL; + } + + /* + * If direct-UTF-8 conversion is available, then we use a smaller + * pivot buffer for error handling and partial matches + * so that we quickly return to direct conversion. + * + * 32 is large enough for UCNV_EXT_MAX_UCHARS and UCNV_ERROR_BUFFER_LENGTH. + * + * We could reduce the pivot buffer size further, at the cost of + * buffer overflows from callbacks. + * The pivot buffer should not be smaller than the maximum number of + * fromUnicode extension table input UChars + * (for m:n conversion, see + * targetCnv->sharedData->mbcs.extIndexes[UCNV_EXT_COUNT_UCHARS]) + * or 2 for surrogate pairs. + * + * Too small a buffer can cause thrashing between pivoting and direct + * conversion, with function call overhead outweighing the benefits + * of direct conversion. + */ + if(convert!=NULL && (pivotLimit-pivotStart)>32) { + pivotLimit=pivotStart+32; } - /* conversion loop */ + /* prepare the converter arguments */ + fromUArgs.converter=targetCnv; + fromUArgs.flush=FALSE; + fromUArgs.offsets=NULL; + fromUArgs.target=*target; + fromUArgs.targetLimit=targetLimit; + fromUArgs.size=sizeof(fromUArgs); + + toUArgs.converter=sourceCnv; + toUArgs.flush=flush; + toUArgs.offsets=NULL; + toUArgs.source=s; + toUArgs.sourceLimit=sourceLimit; + toUArgs.targetLimit=pivotLimit; + toUArgs.size=sizeof(toUArgs); + + /* + * TODO: Consider separating this function into two functions, + * extracting exactly the conversion loop, + * for readability and to reduce the set of visible variables. + * + * Otherwise stop using s and t from here on. + */ + s=t=NULL; + + /* + * conversion loop + * + * The sequence of steps in the loop may appear backward, + * but the principle is simple: + * In the chain of + * source - sourceCnv overflow - pivot - targetCnv overflow - target + * empty out later buffers before refilling them from earlier ones. + * + * The targetCnv overflow buffer is flushed out only once before the loop. + */ for(;;) { - if(reset) { + /* + * if(pivot not empty or error or replay or flush fromUnicode) { + * fromUnicode(pivot -> target); + * } + * + * For pivoting conversion; and for direct conversion for + * error callback handling and flushing the replay buffer. + */ + if( *pivotSource<*pivotTarget || + U_FAILURE(*pErrorCode) || + targetCnv->preFromULength<0 || + fromUArgs.flush + ) { + fromUArgs.source=*pivotSource; + fromUArgs.sourceLimit=*pivotTarget; + _fromUnicodeWithCallback(&fromUArgs, pErrorCode); + if(U_FAILURE(*pErrorCode)) { + /* target overflow, or conversion error */ + *pivotSource=(UChar *)fromUArgs.source; + break; + } + /* - * if we did a reset in this function, we know that there is nothing - * to convert to the target yet, so we save a function call + * _fromUnicodeWithCallback() must have consumed the pivot contents + * (*pivotSource==*pivotTarget) since it returned with U_SUCCESS() */ - reset=FALSE; - } else { + } + + /* The pivot buffer is empty; reset it so we start at pivotStart. */ + *pivotSource=*pivotTarget=pivotStart; + + /* + * if(sourceCnv overflow buffer not empty) { + * move(sourceCnv overflow buffer -> pivot); + * continue; + * } + */ + /* output the sourceCnv overflow buffer */ + if(sourceCnv->UCharErrorBufferLength>0) { + if(ucnv_outputOverflowToUnicode(sourceCnv, pivotTarget, pivotLimit, NULL, pErrorCode)) { + /* U_BUFFER_OVERFLOW_ERROR */ + *pErrorCode=U_ZERO_ERROR; + } + continue; + } + + /* + * check for end of input and break if done + * + * Checking both flush and fromUArgs.flush ensures that the converters + * have been called with the flush flag set if the ucnv_convertEx() + * caller set it. + */ + if( toUArgs.source==sourceLimit && + sourceCnv->preToULength>=0 && sourceCnv->toULength==0 && + (!flush || fromUArgs.flush) + ) { + /* done successfully */ + break; + } + + /* + * use direct conversion if available + * but not if continuing a partial match + * or flushing the toUnicode replay buffer + */ + if(convert!=NULL && targetCnv->preFromUFirstCP<0 && sourceCnv->preToULength==0) { + if(*pErrorCode==U_USING_DEFAULT_WARNING) { + /* remove a warning that may be set by this function */ + *pErrorCode=U_ZERO_ERROR; + } + convert(&fromUArgs, &toUArgs, pErrorCode); + if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) { + break; + } else if(U_FAILURE(*pErrorCode)) { + if(sourceCnv->toULength>0) { + /* + * Fall through to calling _toUnicodeWithCallback() + * for callback handling. + * + * The pivot buffer will be reset with + * *pivotSource=*pivotTarget=pivotStart; + * which indicates a toUnicode error to the caller + * (*pivotSource==pivotStart shows no pivot UChars consumed). + */ + } else { + /* + * Indicate a fromUnicode error to the caller + * (*pivotSource>pivotStart shows some pivot UChars consumed). + */ + *pivotSource=*pivotTarget=pivotStart+1; + /* + * Loop around to calling _fromUnicodeWithCallbacks() + * for callback handling. + */ + continue; + } + } else if(*pErrorCode==U_USING_DEFAULT_WARNING) { + /* + * No error, but the implementation requested to temporarily + * fall back to pivoting. + */ + *pErrorCode=U_ZERO_ERROR; /* - * convert to the target first in case the pivot is filled at entry - * or the targetCnv has some output bytes in its state + * The following else branches are almost identical to the end-of-input + * handling in _toUnicodeWithCallback(). + * Avoid calling it just for the end of input. */ - ucnv_fromUnicode(targetCnv, - target, targetLimit, - (const UChar **)pivotSource, *pivotTarget, - NULL, - (UBool)(flush && *source==sourceLimit), - pErrorCode); - if(U_FAILURE(*pErrorCode)) { + } else if(flush && sourceCnv->toULength>0) { /* flush==toUArgs.flush */ + /* + * the entire input stream is consumed + * and there is a partial, truncated input sequence left + */ + + /* inject an error and continue with callback handling */ + *pErrorCode=U_TRUNCATED_CHAR_FOUND; + } else { + /* input consumed */ + if(flush) { + /* reset the converters without calling the callback functions */ + _reset(sourceCnv, UCNV_RESET_TO_UNICODE, FALSE); + _reset(targetCnv, UCNV_RESET_FROM_UNICODE, FALSE); + } + + /* done successfully */ break; } - - /* ucnv_fromUnicode() must have consumed the pivot contents since it returned with U_SUCCESS() */ - *pivotSource=*pivotTarget=pivotStart; } - - /* convert from the source to the pivot */ - ucnv_toUnicode(sourceCnv, - pivotTarget, pivotLimit, - source, sourceLimit, - NULL, - flush, - pErrorCode); + + /* + * toUnicode(source -> pivot); + * + * For pivoting conversion; and for direct conversion for + * error callback handling, continuing partial matches + * and flushing the replay buffer. + * + * The pivot buffer is empty and reset. + */ + toUArgs.target=pivotStart; /* ==*pivotTarget */ + /* toUArgs.targetLimit=pivotLimit; already set before the loop */ + _toUnicodeWithCallback(&toUArgs, pErrorCode); + *pivotTarget=toUArgs.target; if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) { /* pivot overflow: continue with the conversion loop */ *pErrorCode=U_ZERO_ERROR; - } else if(U_FAILURE(*pErrorCode) || *pivotTarget==pivotStart) { + } else if(U_FAILURE(*pErrorCode) || (!flush && *pivotTarget==pivotStart)) { /* conversion error, or there was nothing left to convert */ break; } - /* else ucnv_toUnicode() wrote into the pivot buffer: continue */ + /* + * else: + * _toUnicodeWithCallback() wrote into the pivot buffer, + * continue with fromUnicode conversion. + * + * Set the fromUnicode flush flag if we flush and if toUnicode has + * processed the end of the input. + */ + if( flush && toUArgs.source==sourceLimit && + sourceCnv->preToULength>=0 && + sourceCnv->UCharErrorBufferLength==0 + ) { + fromUArgs.flush=TRUE; + } } /* @@ -1931,6 +2392,9 @@ ucnv_convertEx(UConverter *targetCnv, UConverter *sourceCnv, * - a conversion error occurred */ + *source=toUArgs.source; + *target=fromUArgs.target; + /* terminate the target buffer if possible */ if(flush && U_SUCCESS(*pErrorCode)) { if(*target!=targetLimit) { @@ -1984,7 +2448,7 @@ ucnv_internalConvert(UConverter *outConverter, UConverter *inConverter, FALSE, TRUE, pErrorCode); - targetLength=myTarget-target; + targetLength=(int32_t)(myTarget-target); } /* @@ -2007,7 +2471,7 @@ ucnv_internalConvert(UConverter *outConverter, UConverter *inConverter, FALSE, TRUE, pErrorCode); - targetLength+=(myTarget-targetBuffer); + targetLength+=(int32_t)(myTarget-targetBuffer); } while(*pErrorCode==U_BUFFER_OVERFLOW_ERROR); /* done with preflighting, set warnings and errors as appropriate */ @@ -2322,7 +2786,7 @@ ucnv_detectUnicodeSignature( const char* source, } if(sourceLength==-1){ - sourceLength=uprv_strlen(source); + sourceLength=(int32_t)uprv_strlen(source); } @@ -2384,6 +2848,72 @@ ucnv_detectUnicodeSignature( const char* source, return NULL; } +U_CAPI int32_t U_EXPORT2 +ucnv_fromUCountPending(const UConverter* cnv, UErrorCode* status) +{ + if(status == NULL || U_FAILURE(*status)){ + return -1; + } + if(cnv == NULL){ + *status = U_ILLEGAL_ARGUMENT_ERROR; + return -1; + } + + if(cnv->preFromUFirstCP >= 0){ + return U16_LENGTH(cnv->preFromUFirstCP)+cnv->preFromULength ; + }else if(cnv->preFromULength < 0){ + return -cnv->preFromULength ; + }else if(cnv->fromUChar32 > 0){ + return 1; + } + return 0; + +} + +U_CAPI int32_t U_EXPORT2 +ucnv_toUCountPending(const UConverter* cnv, UErrorCode* status){ + + if(status == NULL || U_FAILURE(*status)){ + return -1; + } + if(cnv == NULL){ + *status = U_ILLEGAL_ARGUMENT_ERROR; + return -1; + } + + if(cnv->preToULength > 0){ + return cnv->preToULength ; + }else if(cnv->preToULength < 0){ + return -cnv->preToULength; + }else if(cnv->toULength > 0){ + return cnv->toULength; + } + return 0; +} + +U_CAPI UBool U_EXPORT2 +ucnv_isFixedWidth(UConverter *cnv, UErrorCode *status){ + if (U_FAILURE(*status)) { + return FALSE; + } + + if (cnv == NULL) { + *status = U_ILLEGAL_ARGUMENT_ERROR; + return FALSE; + } + + switch (ucnv_getType(cnv)) { + case UCNV_SBCS: + case UCNV_DBCS: + case UCNV_UTF32_BigEndian: + case UCNV_UTF32_LittleEndian: + case UCNV_UTF32: + case UCNV_US_ASCII: + return TRUE; + default: + return FALSE; + } +} #endif /*