/*
**********************************************************************
-* Copyright (C) 2002-2003, International Business Machines
+* Copyright (C) 2002-2011, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
* file name: ucnv_u7.c
*/
#include "unicode/utypes.h"
+
+#if !UCONFIG_NO_CONVERSION
+
#include "unicode/ucnv.h"
-#include "unicode/ucnv_err.h"
#include "ucnv_bld.h"
#include "ucnv_cnv.h"
+#include "uassert.h"
/* UTF-7 -------------------------------------------------------------------- */
-/* ### TODO: in user guide, document version option (=1 for escaping set O characters) */
/*
* UTF-7 is a stateful encoding of Unicode.
* It is defined in RFC 2152. (http://www.ietf.org/rfc/rfc2152.txt)
static void
_UTF7Open(UConverter *cnv,
- const char *name,
- const char *locale,
- uint32_t options,
+ UConverterLoadArgs *pArgs,
UErrorCode *pErrorCode) {
- if((options&0xf)<=1) {
- cnv->fromUnicodeStatus=(options&0xf)<<28;
+ if(UCNV_GET_VERSION(cnv)<=1) {
+ /* TODO(markus): Should just use cnv->options rather than copying the version number. */
+ cnv->fromUnicodeStatus=UCNV_GET_VERSION(cnv)<<28;
_UTF7Reset(cnv, UCNV_RESET_BOTH);
} else {
*pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
sourceIndex=byteIndex==0 ? 0 : -1;
nextSourceIndex=0;
-loop:
if(inDirectMode) {
directMode:
/*
* In Direct Mode, only the sourceIndex is used.
*/
byteIndex=0;
- length=sourceLimit-source;
- targetCapacity=targetLimit-target;
+ length=(int32_t)(sourceLimit-source);
+ targetCapacity=(int32_t)(targetLimit-target);
if(length>targetCapacity) {
length=targetCapacity;
}
/* illegal */
bytes[0]=b;
byteIndex=1;
- nextSourceIndex=sourceIndex+1;
- goto callback;
+ *pErrorCode=U_ILLEGAL_CHAR_FOUND;
+ break;
} else if(b!=PLUS) {
/* write directly encoded character */
*target++=b;
if(target<targetLimit) {
bytes[byteIndex++]=b=*source++;
++nextSourceIndex;
- if(b>=126) {
- /* illegal - test other illegal US-ASCII values by base64Value==-3 */
+ base64Value = -3; /* initialize as illegal */
+ if(b>=126 || (base64Value=fromBase64[b])==-3 || base64Value==-1) {
+ /* either
+ * base64Value==-1 for any legal character except base64 and minus sign, or
+ * base64Value==-3 for illegal characters:
+ * 1. In either case, leave Unicode mode.
+ * 2.1. If we ended with an incomplete UChar or none after the +, then
+ * generate an error for the preceding erroneous sequence and deal with
+ * the current (possibly illegal) character next time through.
+ * 2.2. Else the current char comes after a complete UChar, which was already
+ * pushed to the output buf, so:
+ * 2.2.1. If the current char is legal, just save it for processing next time.
+ * It may be, for example, a plus sign, which we need to deal with in direct mode.
+ * 2.2.2. Else if the current char is illegal, we might as well deal with it here.
+ */
inDirectMode=TRUE;
- goto callback;
- } else if((base64Value=fromBase64[b])>=0) {
+ if(base64Counter==-1) {
+ /* illegal: + immediately followed by something other than base64 or minus sign */
+ /* include the plus sign in the reported sequence, but not the subsequent char */
+ --source;
+ bytes[0]=PLUS;
+ byteIndex=1;
+ *pErrorCode=U_ILLEGAL_CHAR_FOUND;
+ break;
+ } else if(bits!=0) {
+ /* bits are illegally left over, a UChar is incomplete */
+ /* don't include current char (legal or illegal) in error seq */
+ --source;
+ --byteIndex;
+ *pErrorCode=U_ILLEGAL_CHAR_FOUND;
+ break;
+ } else {
+ /* previous UChar was complete */
+ if(base64Value==-3) {
+ /* current character is illegal, deal with it here */
+ *pErrorCode=U_ILLEGAL_CHAR_FOUND;
+ break;
+ } else {
+ /* un-read the current character in case it is a plus sign */
+ --source;
+ sourceIndex=nextSourceIndex-1;
+ goto directMode;
+ }
+ }
+ } else if(base64Value>=0) {
/* collect base64 bytes into UChars */
switch(base64Counter) {
case -1: /* -1 is immediately after the + */
/* will never occur */
break;
}
- } else if(base64Value==-2) {
+ } else /*base64Value==-2*/ {
/* minus sign terminates the base64 sequence */
inDirectMode=TRUE;
if(base64Counter==-1) {
/* absorb the minus and leave the Unicode Mode */
if(bits!=0) {
/* bits are illegally left over, a UChar is incomplete */
- goto callback;
+ *pErrorCode=U_ILLEGAL_CHAR_FOUND;
+ break;
}
}
sourceIndex=nextSourceIndex;
goto directMode;
- } else if(base64Value==-1) /* for any legal character except base64 and minus sign */ {
- /* leave the Unicode Mode */
- inDirectMode=TRUE;
- if(base64Counter==-1) {
- /* illegal: + immediately followed by something other than base64 or minus sign */
- /* include the plus sign in the reported sequence */
- --sourceIndex;
- bytes[0]=PLUS;
- bytes[1]=b;
- byteIndex=2;
- goto callback;
- } else if(bits==0) {
- /* un-read the character in case it is a plus sign */
- --source;
- sourceIndex=nextSourceIndex-1;
- goto directMode;
- } else {
- /* bits are illegally left over, a UChar is incomplete */
- goto callback;
- }
- } else /* base64Value==-3 for illegal characters */ {
- /* illegal */
- inDirectMode=TRUE;
- goto callback;
}
} else {
/* target is full */
}
}
}
-endloop:
- if(pArgs->flush && source>=sourceLimit) {
- /* reset the state for the next conversion */
- if(!inDirectMode && bits!=0 && U_SUCCESS(*pErrorCode)) {
- /* a character byte sequence remains incomplete */
- *pErrorCode=U_TRUNCATED_CHAR_FOUND;
- }
- cnv->toUnicodeStatus=0x1000000; /* inDirectMode=TRUE */
- cnv->toULength=0;
- } else {
- /* set the converter state back into UConverter */
- cnv->toUnicodeStatus=((uint32_t)inDirectMode<<24)|((uint32_t)((uint8_t)base64Counter)<<16)|(uint32_t)bits;
- cnv->toULength=byteIndex;
+ if(U_SUCCESS(*pErrorCode) && pArgs->flush && source==sourceLimit && bits==0) {
+ /*
+ * if we are in Unicode mode, then the byteIndex might not be 0,
+ * but that is ok if bits==0
+ * -> we set byteIndex=0 at the end of the stream to avoid a truncated error
+ * (not true for IMAP-mailbox-name where we must end in direct mode)
+ */
+ byteIndex=0;
}
-finish:
+ /* set the converter state back into UConverter */
+ cnv->toUnicodeStatus=((uint32_t)inDirectMode<<24)|((uint32_t)((uint8_t)base64Counter)<<16)|(uint32_t)bits;
+ cnv->toULength=byteIndex;
+
/* write back the updated pointers */
pArgs->source=(const char *)source;
pArgs->target=target;
pArgs->offsets=offsets;
return;
-
-callback:
- /* call the callback function with all the preparations and post-processing */
- /* update the arguments structure */
- pArgs->source=(const char *)source;
- pArgs->target=target;
- pArgs->offsets=offsets;
-
- /* copy the current bytes to invalidCharBuffer */
- for(b=0; b<(uint8_t)byteIndex; ++b) {
- cnv->invalidCharBuffer[b]=(char)bytes[b];
- }
- cnv->invalidCharLength=byteIndex;
-
- /* set the converter state in UConverter to deal with the next character */
- cnv->toUnicodeStatus=(uint32_t)inDirectMode<<24;
- cnv->toULength=0;
-
- /* call the callback function */
- *pErrorCode=U_ILLEGAL_CHAR_FOUND;
- cnv->fromCharErrorBehaviour(cnv->toUContext, pArgs, cnv->invalidCharBuffer, cnv->invalidCharLength, UCNV_ILLEGAL, pErrorCode);
-
- /* get the converter state from UConverter */
- {
- uint32_t status=cnv->toUnicodeStatus;
- inDirectMode=(UBool)((status>>24)&1);
- base64Counter=(int8_t)(status>>16);
- bits=(uint16_t)status;
- }
- byteIndex=cnv->toULength;
-
- /* update target and deal with offsets if necessary */
- offsets=ucnv_updateCallbackOffsets(offsets, pArgs->target-target, sourceIndex);
- target=pArgs->target;
-
- /* update the source pointer and index */
- sourceIndex=nextSourceIndex+((const uint8_t *)pArgs->source-source);
- source=(const uint8_t *)pArgs->source;
-
- /*
- * If the callback overflowed the target, then we need to
- * stop here with an overflow indication.
- */
- if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
- goto endloop;
- } else if(cnv->UCharErrorBufferLength>0) {
- /* target is full */
- *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
- goto endloop;
- } else if(U_FAILURE(*pErrorCode)) {
- /* break on error */
- cnv->toUnicodeStatus=0x1000000; /* inDirectMode=TRUE */
- cnv->toULength=0;
- goto finish;
- } else {
- goto loop;
- }
-}
-
-static UChar32
-_UTF7GetNextUChar(UConverterToUnicodeArgs *pArgs,
- UErrorCode *pErrorCode) {
- return ucnv_getNextUCharFromToUImpl(pArgs, pArgs->converter->sharedData->impl->toUnicode, TRUE, pErrorCode);
}
static void
inDirectMode=(UBool)((status>>24)&1);
base64Counter=(int8_t)(status>>16);
bits=(uint8_t)status;
+ U_ASSERT(bits<=sizeof(toBase64)/sizeof(toBase64[0]));
}
/* UTF-7 always encodes UTF-16 code units, therefore we need only a simple sourceIndex */
if(inDirectMode) {
directMode:
- length=sourceLimit-source;
- targetCapacity=targetLimit-target;
+ length=(int32_t)(sourceLimit-source);
+ targetCapacity=(int32_t)(targetLimit-target);
if(length>targetCapacity) {
length=targetCapacity;
}
if(pArgs->flush && source>=sourceLimit) {
/* flush remaining bits to the target */
- if(!inDirectMode && base64Counter!=0) {
+ if(!inDirectMode) {
+ if (base64Counter!=0) {
+ if(target<targetLimit) {
+ *target++=toBase64[bits];
+ if(offsets!=NULL) {
+ *offsets++=sourceIndex-1;
+ }
+ } else {
+ cnv->charErrorBuffer[cnv->charErrorBufferLength++]=toBase64[bits];
+ *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
+ }
+ }
+ /* Add final MINUS to terminate unicodeMode */
if(target<targetLimit) {
- *target++=toBase64[bits];
+ *target++=MINUS;
if(offsets!=NULL) {
*offsets++=sourceIndex-1;
}
} else {
- cnv->charErrorBuffer[cnv->charErrorBufferLength++]=toBase64[bits];
+ cnv->charErrorBuffer[cnv->charErrorBufferLength++]=MINUS;
*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
}
}
_UTF7ToUnicodeWithOffsets,
_UTF7FromUnicodeWithOffsets,
_UTF7FromUnicodeWithOffsets,
- _UTF7GetNextUChar,
+ NULL,
NULL,
_UTF7GetName,
sourceIndex=byteIndex==0 ? 0 : -1;
nextSourceIndex=0;
-loop:
if(inDirectMode) {
directMode:
/*
* In Direct Mode, only the sourceIndex is used.
*/
byteIndex=0;
- length=sourceLimit-source;
- targetCapacity=targetLimit-target;
+ length=(int32_t)(sourceLimit-source);
+ targetCapacity=(int32_t)(targetLimit-target);
if(length>targetCapacity) {
length=targetCapacity;
}
/* illegal */
bytes[0]=b;
byteIndex=1;
- nextSourceIndex=sourceIndex+1;
- goto callback;
+ *pErrorCode=U_ILLEGAL_CHAR_FOUND;
+ break;
} else if(b!=AMPERSAND) {
/* write directly encoded character */
*target++=b;
if(b>0x7e) {
/* illegal - test other illegal US-ASCII values by base64Value==-3 */
inDirectMode=TRUE;
- goto callback;
+ *pErrorCode=U_ILLEGAL_CHAR_FOUND;
+ break;
} else if((base64Value=FROM_BASE64_IMAP(b))>=0) {
/* collect base64 bytes into UChars */
switch(base64Counter) {
if(isLegalIMAP(c)) {
/* illegal */
inDirectMode=TRUE;
- goto callback;
+ *pErrorCode=U_ILLEGAL_CHAR_FOUND;
+ goto endloop;
}
*target++=c;
if(offsets!=NULL) {
if(isLegalIMAP(c)) {
/* illegal */
inDirectMode=TRUE;
- goto callback;
+ *pErrorCode=U_ILLEGAL_CHAR_FOUND;
+ goto endloop;
}
*target++=c;
if(offsets!=NULL) {
if(isLegalIMAP(c)) {
/* illegal */
inDirectMode=TRUE;
- goto callback;
+ *pErrorCode=U_ILLEGAL_CHAR_FOUND;
+ goto endloop;
}
*target++=c;
if(offsets!=NULL) {
if(bits!=0 || (base64Counter!=0 && base64Counter!=3 && base64Counter!=6)) {
/* bits are illegally left over, a UChar is incomplete */
/* base64Counter other than 0, 3, 6 means non-minimal zero-padding, also illegal */
- goto callback;
+ *pErrorCode=U_ILLEGAL_CHAR_FOUND;
+ break;
}
}
sourceIndex=nextSourceIndex;
/* base64Value==-3 for illegal characters */
/* illegal */
inDirectMode=TRUE;
- goto callback;
+ *pErrorCode=U_ILLEGAL_CHAR_FOUND;
+ break;
}
} else {
/* target is full */
}
endloop:
- if(pArgs->flush && source>=sourceLimit) {
- /* reset the state for the next conversion */
- if(!inDirectMode && U_SUCCESS(*pErrorCode)) {
- /* a character byte sequence remains incomplete - IMAP must end in ASCII/direct mode */
- *pErrorCode=U_TRUNCATED_CHAR_FOUND;
+ /*
+ * the end of the input stream and detection of truncated input
+ * are handled by the framework, but here we must check if we are in Unicode
+ * mode and byteIndex==0 because we must end in direct mode
+ *
+ * conditions:
+ * successful
+ * in Unicode mode and byteIndex==0
+ * end of input and no truncated input
+ */
+ if( U_SUCCESS(*pErrorCode) &&
+ !inDirectMode && byteIndex==0 &&
+ pArgs->flush && source>=sourceLimit
+ ) {
+ if(base64Counter==-1) {
+ /* & at the very end of the input */
+ /* make the ampersand the reported sequence */
+ bytes[0]=AMPERSAND;
+ byteIndex=1;
}
- cnv->toUnicodeStatus=0x1000000; /* inDirectMode=TRUE */
- cnv->toULength=0;
- } else {
- /* set the converter state back into UConverter */
- cnv->toUnicodeStatus=((uint32_t)inDirectMode<<24)|((uint32_t)((uint8_t)base64Counter)<<16)|(uint32_t)bits;
- cnv->toULength=byteIndex;
+ /* else if(base64Counter!=-1) byteIndex remains 0 because there is no particular byte sequence */
+
+ inDirectMode=TRUE; /* avoid looping */
+ *pErrorCode=U_TRUNCATED_CHAR_FOUND;
}
-finish:
+ /* set the converter state back into UConverter */
+ cnv->toUnicodeStatus=((uint32_t)inDirectMode<<24)|((uint32_t)((uint8_t)base64Counter)<<16)|(uint32_t)bits;
+ cnv->toULength=byteIndex;
+
/* write back the updated pointers */
pArgs->source=(const char *)source;
pArgs->target=target;
pArgs->offsets=offsets;
return;
-
-callback:
- /* call the callback function with all the preparations and post-processing */
- /* update the arguments structure */
- pArgs->source=(const char *)source;
- pArgs->target=target;
- pArgs->offsets=offsets;
-
- /* copy the current bytes to invalidCharBuffer */
- for(b=0; b<(uint8_t)byteIndex; ++b) {
- cnv->invalidCharBuffer[b]=(char)bytes[b];
- }
- cnv->invalidCharLength=byteIndex;
-
- /* set the converter state in UConverter to deal with the next character */
- cnv->toUnicodeStatus=(uint32_t)inDirectMode<<24;
- cnv->toULength=0;
-
- /* call the callback function */
- *pErrorCode=U_ILLEGAL_CHAR_FOUND;
- cnv->fromCharErrorBehaviour(cnv->toUContext, pArgs, cnv->invalidCharBuffer, cnv->invalidCharLength, UCNV_ILLEGAL, pErrorCode);
-
- /* get the converter state from UConverter */
- {
- uint32_t status=cnv->toUnicodeStatus;
- inDirectMode=(UBool)((status>>24)&1);
- base64Counter=(int8_t)(status>>16);
- bits=(uint16_t)status;
- }
- byteIndex=cnv->toULength;
-
- /* update target and deal with offsets if necessary */
- offsets=ucnv_updateCallbackOffsets(offsets, pArgs->target-target, sourceIndex);
- target=pArgs->target;
-
- /* update the source pointer and index */
- sourceIndex=nextSourceIndex+((const uint8_t *)pArgs->source-source);
- source=(const uint8_t *)pArgs->source;
-
- /*
- * If the callback overflowed the target, then we need to
- * stop here with an overflow indication.
- */
- if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
- goto endloop;
- } else if(cnv->UCharErrorBufferLength>0) {
- /* target is full */
- *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
- goto endloop;
- } else if(U_FAILURE(*pErrorCode)) {
- /* break on error */
- cnv->toUnicodeStatus=0x1000000; /* inDirectMode=TRUE */
- cnv->toULength=0;
- goto finish;
- } else {
- goto loop;
- }
}
static void
if(inDirectMode) {
directMode:
- length=sourceLimit-source;
- targetCapacity=targetLimit-target;
+ length=(int32_t)(sourceLimit-source);
+ targetCapacity=(int32_t)(targetLimit-target);
if(length>targetCapacity) {
length=targetCapacity;
}
_IMAPToUnicodeWithOffsets,
_IMAPFromUnicodeWithOffsets,
_IMAPFromUnicodeWithOffsets,
- _UTF7GetNextUChar,
+ NULL,
NULL,
NULL,
static const UConverterStaticData _IMAPStaticData={
sizeof(UConverterStaticData),
"IMAP-mailbox-name",
- 0, /* TODO CCSID for UTF-7 */
+ 0, /* TODO CCSID for IMAP-mailbox-name */
UCNV_IBM, UCNV_IMAP_MAILBOX,
1, 4,
{ 0x3f, 0, 0, 0 }, 1, /* the subchar is not used */
NULL, NULL, &_IMAPStaticData, FALSE, &_IMAPImpl,
0
};
+
+#endif