+ /* use optimized function if possible */
+ outputType=cnv->sharedData->mbcs.outputType;
+ unicodeMask=cnv->sharedData->mbcs.unicodeMask;
+ if(outputType==MBCS_OUTPUT_1 && !(unicodeMask&UCNV_HAS_SURROGATES)) {
+ if(!(unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
+ ucnv_MBCSSingleFromBMPWithOffsets(pArgs, pErrorCode);
+ } else {
+ ucnv_MBCSSingleFromUnicodeWithOffsets(pArgs, pErrorCode);
+ }
+ return;
+ } else if(outputType==MBCS_OUTPUT_2 && cnv->sharedData->mbcs.utf8Friendly) {
+ ucnv_MBCSDoubleFromUnicodeWithOffsets(pArgs, pErrorCode);
+ return;
+ }
+
+ /* set up the local pointers */
+ source=pArgs->source;
+ sourceLimit=pArgs->sourceLimit;
+ target=(uint8_t *)pArgs->target;
+ targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
+ offsets=pArgs->offsets;
+
+ table=cnv->sharedData->mbcs.fromUnicodeTable;
+ if(cnv->sharedData->mbcs.utf8Friendly) {
+ mbcsIndex=cnv->sharedData->mbcs.mbcsIndex;
+ } else {
+ mbcsIndex=NULL;
+ }
+ if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
+ bytes=cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes;
+ } else {
+ bytes=cnv->sharedData->mbcs.fromUnicodeBytes;
+ }
+ asciiRoundtrips=cnv->sharedData->mbcs.asciiRoundtrips;
+
+ /* get the converter state from UConverter */
+ c=cnv->fromUChar32;
+
+ if(outputType==MBCS_OUTPUT_2_SISO) {
+ prevLength=cnv->fromUnicodeStatus;
+ if(prevLength==0) {
+ /* set the real value */
+ prevLength=1;
+ }
+ } else {
+ /* prevent fromUnicodeStatus from being set to something non-0 */
+ prevLength=0;
+ }
+
+ /* sourceIndex=-1 if the current character began in the previous buffer */
+ prevSourceIndex=-1;
+ sourceIndex= c==0 ? 0 : -1;
+ nextSourceIndex=0;
+
+ /* Get the SI/SO character for the converter */
+ siLength = getSISOBytes(SI, cnv->options, siBytes);
+ soLength = getSISOBytes(SO, cnv->options, soBytes);
+
+ /* conversion loop */
+ /*
+ * This is another piece of ugly code:
+ * A goto into the loop if the converter state contains a first surrogate
+ * from the previous function call.
+ * It saves me from having to check if(c==0) in each loop iteration
+ * and from duplicating the trail-surrogate-handling code in the else
+ * branch of that check.
+ * I could not find any other way to get around this other than
+ * using a function call for the conversion and callback, which would
+ * be even more inefficient.
+ *
+ * Markus Scherer 2000-jul-19
+ */
+ if(c!=0 && targetCapacity>0) {
+ goto getTrail;
+ }
+
+ while(source<sourceLimit) {
+ /*
+ * This following test is to see if available input would overflow the output.
+ * It does not catch output of more than one byte that
+ * overflows as a result of a multi-byte character or callback output
+ * from the last source character.
+ * Therefore, those situations also test for overflows and will
+ * then break the loop, too.
+ */
+ if(targetCapacity>0) {
+ /*
+ * Get a correct Unicode code point:
+ * a single UChar for a BMP code point or
+ * a matched surrogate pair for a "supplementary code point".
+ */
+ c=*source++;
+ ++nextSourceIndex;
+ if(c<=0x7f && IS_ASCII_ROUNDTRIP(c, asciiRoundtrips)) {
+ *target++=(uint8_t)c;
+ if(offsets!=NULL) {
+ *offsets++=sourceIndex;
+ prevSourceIndex=sourceIndex;
+ sourceIndex=nextSourceIndex;
+ }
+ --targetCapacity;
+ c=0;
+ continue;
+ }
+ /*
+ * utf8Friendly table: Test for <=0xd7ff rather than <=MBCS_FAST_MAX
+ * to avoid dealing with surrogates.
+ * MBCS_FAST_MAX must be >=0xd7ff.
+ */
+ if(c<=0xd7ff && mbcsIndex!=NULL) {
+ value=mbcsIndex[c>>6];
+
+ /* get the bytes and the length for the output (copied from below and adapted for utf8Friendly data) */
+ /* There are only roundtrips (!=0) and no-mapping (==0) entries. */
+ switch(outputType) {
+ case MBCS_OUTPUT_2:
+ value=((const uint16_t *)bytes)[value +(c&0x3f)];
+ if(value<=0xff) {
+ if(value==0) {
+ goto unassigned;
+ } else {
+ length=1;
+ }
+ } else {
+ length=2;
+ }
+ break;
+ case MBCS_OUTPUT_2_SISO:
+ /* 1/2-byte stateful with Shift-In/Shift-Out */
+ /*
+ * Save the old state in the converter object
+ * right here, then change the local prevLength state variable if necessary.
+ * Then, if this character turns out to be unassigned or a fallback that
+ * is not taken, the callback code must not save the new state in the converter
+ * because the new state is for a character that is not output.
+ * However, the callback must still restore the state from the converter
+ * in case the callback function changed it for its output.
+ */
+ cnv->fromUnicodeStatus=prevLength; /* save the old state */
+ value=((const uint16_t *)bytes)[value +(c&0x3f)];
+ if(value<=0xff) {
+ if(value==0) {
+ goto unassigned;
+ } else if(prevLength<=1) {
+ length=1;
+ } else {
+ /* change from double-byte mode to single-byte */
+ if (siLength == 1) {
+ value|=(uint32_t)siBytes[0]<<8;
+ length = 2;
+ } else if (siLength == 2) {
+ value|=(uint32_t)siBytes[1]<<8;
+ value|=(uint32_t)siBytes[0]<<16;
+ length = 3;
+ }
+ prevLength=1;
+ }
+ } else {
+ if(prevLength==2) {
+ length=2;
+ } else {
+ /* change from single-byte mode to double-byte */
+ if (soLength == 1) {
+ value|=(uint32_t)soBytes[0]<<16;
+ length = 3;
+ } else if (soLength == 2) {
+ value|=(uint32_t)soBytes[1]<<16;
+ value|=(uint32_t)soBytes[0]<<24;
+ length = 4;
+ }
+ prevLength=2;
+ }
+ }
+ break;
+ case MBCS_OUTPUT_DBCS_ONLY:
+ /* table with single-byte results, but only DBCS mappings used */
+ value=((const uint16_t *)bytes)[value +(c&0x3f)];
+ if(value<=0xff) {
+ /* no mapping or SBCS result, not taken for DBCS-only */
+ goto unassigned;
+ } else {
+ length=2;
+ }
+ break;
+ case MBCS_OUTPUT_3:
+ p=bytes+(value+(c&0x3f))*3;
+ value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
+ if(value<=0xff) {
+ if(value==0) {
+ goto unassigned;
+ } else {
+ length=1;
+ }
+ } else if(value<=0xffff) {
+ length=2;
+ } else {
+ length=3;
+ }
+ break;
+ case MBCS_OUTPUT_4:
+ value=((const uint32_t *)bytes)[value +(c&0x3f)];
+ if(value<=0xff) {
+ if(value==0) {
+ goto unassigned;
+ } else {
+ length=1;
+ }
+ } else if(value<=0xffff) {
+ length=2;
+ } else if(value<=0xffffff) {
+ length=3;
+ } else {
+ length=4;
+ }
+ break;
+ case MBCS_OUTPUT_3_EUC:
+ value=((const uint16_t *)bytes)[value +(c&0x3f)];
+ /* EUC 16-bit fixed-length representation */
+ if(value<=0xff) {
+ if(value==0) {
+ goto unassigned;
+ } else {
+ length=1;
+ }
+ } else if((value&0x8000)==0) {
+ value|=0x8e8000;
+ length=3;
+ } else if((value&0x80)==0) {
+ value|=0x8f0080;
+ length=3;
+ } else {
+ length=2;
+ }
+ break;
+ case MBCS_OUTPUT_4_EUC:
+ p=bytes+(value+(c&0x3f))*3;
+ value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
+ /* EUC 16-bit fixed-length representation applied to the first two bytes */
+ if(value<=0xff) {
+ if(value==0) {
+ goto unassigned;
+ } else {
+ length=1;
+ }
+ } else if(value<=0xffff) {
+ length=2;
+ } else if((value&0x800000)==0) {
+ value|=0x8e800000;
+ length=4;
+ } else if((value&0x8000)==0) {
+ value|=0x8f008000;
+ length=4;
+ } else {
+ length=3;
+ }
+ break;
+ default:
+ /* must not occur */
+ /*
+ * To avoid compiler warnings that value & length may be
+ * used without having been initialized, we set them here.
+ * In reality, this is unreachable code.
+ * Not having a default branch also causes warnings with
+ * some compilers.
+ */
+ value=0;
+ length=0;
+ break;
+ }
+ /* output the value */
+ } else {
+ /*
+ * This also tests if the codepage maps single surrogates.
+ * If it does, then surrogates are not paired but mapped separately.
+ * Note that in this case unmatched surrogates are not detected.
+ */
+ if(U16_IS_SURROGATE(c) && !(unicodeMask&UCNV_HAS_SURROGATES)) {
+ if(U16_IS_SURROGATE_LEAD(c)) {
+getTrail:
+ if(source<sourceLimit) {
+ /* test the following code unit */
+ UChar trail=*source;
+ if(U16_IS_TRAIL(trail)) {
+ ++source;
+ ++nextSourceIndex;
+ c=U16_GET_SUPPLEMENTARY(c, trail);
+ if(!(unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
+ /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
+ cnv->fromUnicodeStatus=prevLength; /* save the old state */
+ /* callback(unassigned) */
+ goto unassigned;
+ }
+ /* convert this supplementary code point */
+ /* exit this condition tree */
+ } else {
+ /* this is an unmatched lead code unit (1st surrogate) */
+ /* callback(illegal) */
+ *pErrorCode=U_ILLEGAL_CHAR_FOUND;
+ break;
+ }
+ } else {
+ /* no more input */
+ break;
+ }
+ } else {
+ /* this is an unmatched trail code unit (2nd surrogate) */
+ /* callback(illegal) */
+ *pErrorCode=U_ILLEGAL_CHAR_FOUND;
+ break;
+ }
+ }
+
+ /* convert the Unicode code point in c into codepage bytes */
+
+ /*
+ * The basic lookup is a triple-stage compact array (trie) lookup.
+ * For details see the beginning of this file.
+ *
+ * Single-byte codepages are handled with a different data structure
+ * by _MBCSSingle... functions.
+ *
+ * The result consists of a 32-bit value from stage 2 and
+ * a pointer to as many bytes as are stored per character.
+ * The pointer points to the character's bytes in stage 3.
+ * Bits 15..0 of the stage 2 entry contain the stage 3 index
+ * for that pointer, while bits 31..16 are flags for which of
+ * the 16 characters in the block are roundtrip-assigned.
+ *
+ * For 2-byte and 4-byte codepages, the bytes are stored as uint16_t
+ * respectively as uint32_t, in the platform encoding.
+ * For 3-byte codepages, the bytes are always stored in big-endian order.
+ *
+ * For EUC encodings that use only either 0x8e or 0x8f as the first
+ * byte of their longest byte sequences, the first two bytes in
+ * this third stage indicate with their 7th bits whether these bytes
+ * are to be written directly or actually need to be preceded by
+ * one of the two Single-Shift codes. With this, the third stage
+ * stores one byte fewer per character than the actual maximum length of
+ * EUC byte sequences.
+ *
+ * Other than that, leading zero bytes are removed and the other
+ * bytes output. A single zero byte may be output if the "assigned"
+ * bit in stage 2 was on.
+ * The data structure does not support zero byte output as a fallback,
+ * and also does not allow output of leading zeros.
+ */
+ stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
+
+ /* get the bytes and the length for the output */
+ switch(outputType) {
+ case MBCS_OUTPUT_2:
+ value=MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c);
+ if(value<=0xff) {
+ length=1;
+ } else {
+ length=2;
+ }
+ break;
+ case MBCS_OUTPUT_2_SISO:
+ /* 1/2-byte stateful with Shift-In/Shift-Out */
+ /*
+ * Save the old state in the converter object
+ * right here, then change the local prevLength state variable if necessary.
+ * Then, if this character turns out to be unassigned or a fallback that
+ * is not taken, the callback code must not save the new state in the converter
+ * because the new state is for a character that is not output.
+ * However, the callback must still restore the state from the converter
+ * in case the callback function changed it for its output.
+ */
+ cnv->fromUnicodeStatus=prevLength; /* save the old state */
+ value=MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c);
+ if(value<=0xff) {
+ if(value==0 && MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c)==0) {
+ /* no mapping, leave value==0 */
+ length=0;
+ } else if(prevLength<=1) {
+ length=1;
+ } else {
+ /* change from double-byte mode to single-byte */
+ if (siLength == 1) {
+ value|=(uint32_t)siBytes[0]<<8;
+ length = 2;
+ } else if (siLength == 2) {
+ value|=(uint32_t)siBytes[1]<<8;
+ value|=(uint32_t)siBytes[0]<<16;
+ length = 3;
+ }
+ prevLength=1;
+ }
+ } else {
+ if(prevLength==2) {
+ length=2;
+ } else {
+ /* change from single-byte mode to double-byte */
+ if (soLength == 1) {
+ value|=(uint32_t)soBytes[0]<<16;
+ length = 3;
+ } else if (soLength == 2) {
+ value|=(uint32_t)soBytes[1]<<16;
+ value|=(uint32_t)soBytes[0]<<24;
+ length = 4;
+ }
+ prevLength=2;
+ }
+ }
+ break;
+ case MBCS_OUTPUT_DBCS_ONLY:
+ /* table with single-byte results, but only DBCS mappings used */
+ value=MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c);
+ if(value<=0xff) {
+ /* no mapping or SBCS result, not taken for DBCS-only */
+ value=stage2Entry=0; /* stage2Entry=0 to reset roundtrip flags */
+ length=0;
+ } else {
+ length=2;
+ }
+ break;
+ case MBCS_OUTPUT_3:
+ p=MBCS_POINTER_3_FROM_STAGE_2(bytes, stage2Entry, c);
+ value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
+ if(value<=0xff) {
+ length=1;
+ } else if(value<=0xffff) {
+ length=2;
+ } else {
+ length=3;
+ }
+ break;
+ case MBCS_OUTPUT_4:
+ value=MBCS_VALUE_4_FROM_STAGE_2(bytes, stage2Entry, c);
+ if(value<=0xff) {
+ length=1;
+ } else if(value<=0xffff) {
+ length=2;
+ } else if(value<=0xffffff) {
+ length=3;
+ } else {
+ length=4;
+ }
+ break;
+ case MBCS_OUTPUT_3_EUC:
+ value=MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c);
+ /* EUC 16-bit fixed-length representation */
+ if(value<=0xff) {
+ length=1;
+ } else if((value&0x8000)==0) {
+ value|=0x8e8000;
+ length=3;
+ } else if((value&0x80)==0) {
+ value|=0x8f0080;
+ length=3;
+ } else {
+ length=2;
+ }
+ break;
+ case MBCS_OUTPUT_4_EUC:
+ p=MBCS_POINTER_3_FROM_STAGE_2(bytes, stage2Entry, c);
+ value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
+ /* EUC 16-bit fixed-length representation applied to the first two bytes */
+ if(value<=0xff) {
+ length=1;
+ } else if(value<=0xffff) {
+ length=2;
+ } else if((value&0x800000)==0) {
+ value|=0x8e800000;
+ length=4;
+ } else if((value&0x8000)==0) {
+ value|=0x8f008000;
+ length=4;
+ } else {
+ length=3;
+ }
+ break;
+ default:
+ /* must not occur */
+ /*
+ * To avoid compiler warnings that value & length may be
+ * used without having been initialized, we set them here.
+ * In reality, this is unreachable code.
+ * Not having a default branch also causes warnings with
+ * some compilers.
+ */
+ value=stage2Entry=0; /* stage2Entry=0 to reset roundtrip flags */
+ length=0;
+ break;
+ }
+
+ /* is this code point assigned, or do we use fallbacks? */
+ if(!(MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c)!=0 ||
+ (UCNV_FROM_U_USE_FALLBACK(cnv, c) && value!=0))
+ ) {
+ /*
+ * We allow a 0 byte output if the "assigned" bit is set for this entry.
+ * There is no way with this data structure for fallback output
+ * to be a zero byte.
+ */
+
+unassigned:
+ /* try an extension mapping */
+ pArgs->source=source;
+ c=_extFromU(cnv, cnv->sharedData,
+ c, &source, sourceLimit,
+ &target, target+targetCapacity,
+ &offsets, sourceIndex,
+ pArgs->flush,
+ pErrorCode);
+ nextSourceIndex+=(int32_t)(source-pArgs->source);
+ prevLength=cnv->fromUnicodeStatus; /* restore SISO state */
+
+ if(U_FAILURE(*pErrorCode)) {
+ /* not mappable or buffer overflow */
+ break;
+ } else {
+ /* a mapping was written to the target, continue */
+
+ /* recalculate the targetCapacity after an extension mapping */
+ targetCapacity=(int32_t)(pArgs->targetLimit-(char *)target);
+
+ /* normal end of conversion: prepare for a new character */
+ if(offsets!=NULL) {
+ prevSourceIndex=sourceIndex;
+ sourceIndex=nextSourceIndex;
+ }
+ continue;
+ }
+ }
+ }
+
+ /* write the output character bytes from value and length */
+ /* from the first if in the loop we know that targetCapacity>0 */
+ if(length<=targetCapacity) {
+ if(offsets==NULL) {
+ switch(length) {
+ /* each branch falls through to the next one */
+ case 4:
+ *target++=(uint8_t)(value>>24);
+ case 3: /*fall through*/
+ *target++=(uint8_t)(value>>16);
+ case 2: /*fall through*/
+ *target++=(uint8_t)(value>>8);
+ case 1: /*fall through*/
+ *target++=(uint8_t)value;
+ default:
+ /* will never occur */
+ break;
+ }
+ } else {
+ switch(length) {
+ /* each branch falls through to the next one */
+ case 4:
+ *target++=(uint8_t)(value>>24);
+ *offsets++=sourceIndex;
+ case 3: /*fall through*/
+ *target++=(uint8_t)(value>>16);
+ *offsets++=sourceIndex;
+ case 2: /*fall through*/
+ *target++=(uint8_t)(value>>8);
+ *offsets++=sourceIndex;
+ case 1: /*fall through*/
+ *target++=(uint8_t)value;
+ *offsets++=sourceIndex;
+ default:
+ /* will never occur */
+ break;
+ }
+ }
+ targetCapacity-=length;
+ } else {
+ uint8_t *charErrorBuffer;
+
+ /*
+ * We actually do this backwards here:
+ * In order to save an intermediate variable, we output
+ * first to the overflow buffer what does not fit into the
+ * regular target.
+ */
+ /* we know that 1<=targetCapacity<length<=4 */
+ length-=targetCapacity;
+ charErrorBuffer=(uint8_t *)cnv->charErrorBuffer;
+ switch(length) {
+ /* each branch falls through to the next one */
+ case 3:
+ *charErrorBuffer++=(uint8_t)(value>>16);
+ case 2: /*fall through*/
+ *charErrorBuffer++=(uint8_t)(value>>8);
+ case 1: /*fall through*/
+ *charErrorBuffer=(uint8_t)value;
+ default:
+ /* will never occur */
+ break;
+ }
+ cnv->charErrorBufferLength=(int8_t)length;
+
+ /* now output what fits into the regular target */
+ value>>=8*length; /* length was reduced by targetCapacity */
+ switch(targetCapacity) {
+ /* each branch falls through to the next one */
+ case 3:
+ *target++=(uint8_t)(value>>16);
+ if(offsets!=NULL) {
+ *offsets++=sourceIndex;
+ }
+ case 2: /*fall through*/
+ *target++=(uint8_t)(value>>8);
+ if(offsets!=NULL) {
+ *offsets++=sourceIndex;
+ }
+ case 1: /*fall through*/
+ *target++=(uint8_t)value;
+ if(offsets!=NULL) {
+ *offsets++=sourceIndex;
+ }
+ default:
+ /* will never occur */
+ break;
+ }
+
+ /* target overflow */
+ targetCapacity=0;
+ *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
+ c=0;
+ break;
+ }
+
+ /* normal end of conversion: prepare for a new character */
+ c=0;
+ if(offsets!=NULL) {
+ prevSourceIndex=sourceIndex;
+ sourceIndex=nextSourceIndex;
+ }
+ continue;
+ } else {
+ /* target is full */
+ *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
+ break;
+ }
+ }
+
+ /*
+ * the end of the input stream and detection of truncated input
+ * are handled by the framework, but for EBCDIC_STATEFUL conversion
+ * we need to emit an SI at the very end
+ *
+ * conditions:
+ * successful
+ * EBCDIC_STATEFUL in DBCS mode
+ * end of input and no truncated input
+ */
+ if( U_SUCCESS(*pErrorCode) &&
+ outputType==MBCS_OUTPUT_2_SISO && prevLength==2 &&
+ pArgs->flush && source>=sourceLimit && c==0
+ ) {
+ /* EBCDIC_STATEFUL ending with DBCS: emit an SI to return the output stream to SBCS */
+ if(targetCapacity>0) {
+ *target++=(uint8_t)siBytes[0];
+ if (siLength == 2) {
+ if (targetCapacity<2) {
+ cnv->charErrorBuffer[0]=(uint8_t)siBytes[1];
+ cnv->charErrorBufferLength=1;
+ *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
+ } else {
+ *target++=(uint8_t)siBytes[1];
+ }
+ }
+ if(offsets!=NULL) {
+ /* set the last source character's index (sourceIndex points at sourceLimit now) */
+ *offsets++=prevSourceIndex;
+ }
+ } else {
+ /* target is full */
+ cnv->charErrorBuffer[0]=(uint8_t)siBytes[0];
+ if (siLength == 2) {
+ cnv->charErrorBuffer[1]=(uint8_t)siBytes[1];
+ }
+ cnv->charErrorBufferLength=siLength;
+ *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
+ }
+ prevLength=1; /* we switched into SBCS */
+ }
+
+ /* set the converter state back into UConverter */
+ cnv->fromUChar32=c;
+ cnv->fromUnicodeStatus=prevLength;
+
+ /* write back the updated pointers */
+ pArgs->source=source;
+ pArgs->target=(char *)target;
+ pArgs->offsets=offsets;
+}
+
+/*
+ * This is another simple conversion function for internal use by other
+ * conversion implementations.
+ * It does not use the converter state nor call callbacks.
+ * It does not handle the EBCDIC swaplfnl option (set in UConverter).
+ * It handles conversion extensions but not GB 18030.
+ *
+ * It converts one single Unicode code point into codepage bytes, encoded
+ * as one 32-bit value.
+ *
+ * Lookup order:
+ * 1. The base from-Unicode table of sharedData, unless c is above U+FFFF
+ *    and the table's unicodeMask lacks UCNV_HAS_SUPPLEMENTARY.
+ * 2. If step 1 produced no usable result and sharedData->mbcs.extIndexes
+ *    is not NULL: the extension data, via ucnv_extSimpleMatchFromU().
+ * 3. Otherwise the code point is reported as unassigned.
+ *
+ * Parameters:
+ * sharedData   supplies mbcs.fromUnicodeTable, mbcs.fromUnicodeBytes,
+ *              mbcs.outputType, mbcs.unicodeMask and mbcs.extIndexes
+ * c            the code point to convert
+ * pValue       receives the output bytes (see below)
+ * useFallback  selects whether fallback mappings are accepted in addition
+ *              to roundtrip mappings
+ *
+ * The function returns the number of bytes in *pValue:
+ * 1..4 the number of bytes in *pValue
+ * 0 unassigned (*pValue undefined)
+ * -1 the outputType is neither MBCS_OUTPUT_1 nor one of the cases compiled
+ *    into the switch below (must not occur, *pValue undefined)
+ *
+ * With the "#if 0" sections disabled, only MBCS_OUTPUT_1 and MBCS_OUTPUT_2
+ * are handled by the base-table lookup.
+ *
+ * *pValue will contain the resulting bytes with the last byte in bits 7..0,
+ * the second to last byte in bits 15..8, etc.
+ * Currently, the function assumes but does not check that 0<=c<=0x10ffff.
+ */
+U_CFUNC int32_t
+ucnv_MBCSFromUChar32(UConverterSharedData *sharedData,
+ UChar32 c, uint32_t *pValue,
+ UBool useFallback) {
+ const int32_t *cx; /* sharedData->mbcs.extIndexes; may be NULL */
+ const uint16_t *table; /* sharedData->mbcs.fromUnicodeTable */
+#if 0
+/* #if 0 because this is not currently used in ICU - reduce code, increase code coverage */
+ const uint8_t *p;
+#endif
+ uint32_t stage2Entry; /* stage 2 entry for c; also tested for the roundtrip flag */
+ uint32_t value; /* result bytes read from the table for c */
+ int32_t length; /* number of result bytes in value; set only on the multi-byte path */
+
+ /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
+ if(c<=0xffff || (sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
+ table=sharedData->mbcs.fromUnicodeTable;
+
+ /* convert the Unicode code point in c into codepage bytes (same as in _MBCSFromUnicodeWithOffsets) */
+ if(sharedData->mbcs.outputType==MBCS_OUTPUT_1) {
+ value=MBCS_SINGLE_RESULT_FROM_U(table, (uint16_t *)sharedData->mbcs.fromUnicodeBytes, c);
+ /*
+  * is this code point assigned, or do we use fallbacks?
+  * The same thresholds are described at minValue in ucnv_SBCSFromUTF8():
+  * >=0x800 accepts roundtrip and fallback results,
+  * >=0xc00 accepts only roundtrips and fallbacks from private-use characters.
+  * The output byte is the low 8 bits of the result.
+  * If the test fails, control falls through to the extension lookup below.
+  */
+ if(useFallback ? value>=0x800 : value>=0xc00) {
+ *pValue=value&0xff;
+ return 1;
+ }
+ } else /* outputType!=MBCS_OUTPUT_1 */ {
+ stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
+
+ /* get the bytes and the length for the output */
+ switch(sharedData->mbcs.outputType) {
+ case MBCS_OUTPUT_2:
+ value=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
+ if(value<=0xff) {
+ length=1;
+ } else {
+ length=2;
+ }
+ break;
+#if 0
+/* #if 0 because this is not currently used in ICU - reduce code, increase code coverage */
+ case MBCS_OUTPUT_DBCS_ONLY:
+ /* table with single-byte results, but only DBCS mappings used */
+ value=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
+ if(value<=0xff) {
+ /* no mapping or SBCS result, not taken for DBCS-only */
+ value=stage2Entry=0; /* stage2Entry=0 to reset roundtrip flags */
+ length=0;
+ } else {
+ length=2;
+ }
+ break;
+ case MBCS_OUTPUT_3:
+ p=MBCS_POINTER_3_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
+ value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
+ if(value<=0xff) {
+ length=1;
+ } else if(value<=0xffff) {
+ length=2;
+ } else {
+ length=3;
+ }
+ break;
+ case MBCS_OUTPUT_4:
+ value=MBCS_VALUE_4_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
+ if(value<=0xff) {
+ length=1;
+ } else if(value<=0xffff) {
+ length=2;
+ } else if(value<=0xffffff) {
+ length=3;
+ } else {
+ length=4;
+ }
+ break;
+ case MBCS_OUTPUT_3_EUC:
+ value=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
+ /* EUC 16-bit fixed-length representation */
+ if(value<=0xff) {
+ length=1;
+ } else if((value&0x8000)==0) {
+ value|=0x8e8000;
+ length=3;
+ } else if((value&0x80)==0) {
+ value|=0x8f0080;
+ length=3;
+ } else {
+ length=2;
+ }
+ break;
+ case MBCS_OUTPUT_4_EUC:
+ p=MBCS_POINTER_3_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
+ value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
+ /* EUC 16-bit fixed-length representation applied to the first two bytes */
+ if(value<=0xff) {
+ length=1;
+ } else if(value<=0xffff) {
+ length=2;
+ } else if((value&0x800000)==0) {
+ value|=0x8e800000;
+ length=4;
+ } else if((value&0x8000)==0) {
+ value|=0x8f008000;
+ length=4;
+ } else {
+ length=3;
+ }
+ break;
+#endif
+ default:
+ /* must not occur; note that this returns without trying the extension data */
+ return -1;
+ }
+
+ /* is this code point assigned, or do we use fallbacks? */
+ if( MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c) ||
+ (FROM_U_USE_FALLBACK(useFallback, c) && value!=0)
+ ) {
+ /*
+ * We allow a 0 byte output if the "assigned" bit is set for this entry.
+ * There is no way with this data structure for fallback output
+ * to be a zero byte.
+ */
+ /* assigned */
+ *pValue=value;
+ return length;
+ }
+ }
+ }
+
+ cx=sharedData->mbcs.extIndexes;
+ if(cx!=NULL) { /* no usable base-table result: try the extension data */
+ length=ucnv_extSimpleMatchFromU(cx, c, pValue, useFallback);
+ return length>=0 ? length : -length; /* return abs(length); NOTE(review): the meaning of a negative result is defined by ucnv_extSimpleMatchFromU() - confirm there */
+ }
+
+ /* unassigned */
+ return 0;
+}
+
+
+#if 0
+/*
+ * This function has been moved to ucnv2022.c for inlining.
+ * This implementation is here only for documentation purposes
+ * and is excluded from compilation by the surrounding "#if 0".
+ */
+
+/**
+ * This version of ucnv_MBCSFromUChar32() is optimized for single-byte codepages.
+ * It does not handle the EBCDIC swaplfnl option (set in UConverter).
+ * It does not handle conversion extensions (_extFromU()).
+ *
+ * sharedData   supplies mbcs.unicodeMask, mbcs.fromUnicodeTable and
+ *              mbcs.fromUnicodeBytes
+ * c            the code point to convert
+ * useFallback  selects the acceptance threshold for the table result:
+ *              >=0x800 if TRUE, >=0xc00 if FALSE
+ *
+ * It returns the codepage byte for the code point, or -1 if it is unassigned.
+ * The byte is the low 8 bits of the table result.
+ * A supplementary code point yields -1 without a table lookup if the
+ * codepage's unicodeMask lacks UCNV_HAS_SUPPLEMENTARY.
+ */
+U_CFUNC int32_t
+ucnv_MBCSSingleFromUChar32(UConverterSharedData *sharedData,
+ UChar32 c,
+ UBool useFallback) {
+ const uint16_t *table; /* sharedData->mbcs.fromUnicodeTable */
+ int32_t value; /* table result for c */
+
+ /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
+ if(c>=0x10000 && !(sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
+ return -1;
+ }
+
+ /* convert the Unicode code point in c into codepage bytes (same as in _MBCSFromUnicodeWithOffsets) */
+ table=sharedData->mbcs.fromUnicodeTable;
+
+ /* get the byte for the output */
+ value=MBCS_SINGLE_RESULT_FROM_U(table, (uint16_t *)sharedData->mbcs.fromUnicodeBytes, c);
+ /* is this code point assigned, or do we use fallbacks? */
+ if(useFallback ? value>=0x800 : value>=0xc00) {
+ return value&0xff;
+ } else {
+ return -1;
+ }
+}
+#endif
+
+/* MBCS-from-UTF-8 conversion functions ------------------------------------- */
+
+/*
+ * minimum code point values for n-byte UTF-8 sequences, n=0..4
+ *
+ * Indexed by the sequence length in bytes.
+ * ucnv_SBCSFromUTF8() compares a decoded code point against the entry for
+ * its sequence length and treats a smaller value as an illegal sequence.
+ */
+static const UChar32
+utf8_minLegal[5]={ 0, 0, 0x80, 0x800, 0x10000 };
+
+/*
+ * offsets for n-byte UTF-8 sequences that were calculated with ((lead<<6)+trail)<<6+trail...
+ *
+ * Indexed by the sequence length in bytes.
+ * The decoders accumulate whole bytes with c=(c<<6)+b, that is, including the
+ * marker bits of the lead byte and the 0x80 of each trail byte.
+ * Subtracting the entry for the sequence length removes those marker bits:
+ *   2 bytes: (0xc0<<6)+0x80                           = 0x3080
+ *   3 bytes: (0xe0<<12)+(0x80<<6)+0x80                = 0xE2080
+ *   4 bytes: (0xf0<<18)+(0x80<<12)+(0x80<<6)+0x80     = 0x3C82080
+ *
+ * The array is declared with 7 elements but has only 5 initializers;
+ * the remaining elements are zero-initialized.
+ */
+static const UChar32
+utf8_offsets[7]={ 0, 0, 0x3080, 0xE2080, 0x3C82080 };
+
+static void
+ucnv_SBCSFromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
+ UConverterToUnicodeArgs *pToUArgs,
+ UErrorCode *pErrorCode) { /*
+ * Converts UTF-8 input directly into single-byte codepage output,
+ * without going through UTF-16.
+ *
+ * pFromUArgs   supplies the codepage converter (converter), the output
+ *              buffer (target..targetLimit) and the flush flag;
+ *              target is updated on return
+ * pToUArgs     supplies the UTF-8 converter (converter) and the input
+ *              buffer (source..sourceLimit); source is updated on return
+ * pErrorCode   set to
+ *              U_BUFFER_OVERFLOW_ERROR  when the target is full,
+ *              U_ILLEGAL_CHAR_FOUND     for an illegal UTF-8 byte sequence
+ *                                       (its bytes are stored in utf8->toUBytes),
+ *              U_USING_DEFAULT_WARNING  when an extension lookup ended in a
+ *                                       partial match and the caller must
+ *                                       revert to pivoting,
+ *              or to whatever failure _extFromU() reports
+ *
+ * A UTF-8 sequence that is cut off by the end of the input is stored in the
+ * UTF-8 converter (toUBytes, toULength, toUnicodeStatus, mode) and is
+ * picked up again at the start of the next call.
+ */
+ UConverter *utf8, *cnv; /* utf8: from pToUArgs, holds partial-sequence state; cnv: from pFromUArgs, the codepage converter */
+ const uint8_t *source, *sourceLimit; /* sourceLimit may be moved back below; the real limit stays in pToUArgs */
+ uint8_t *target;
+ int32_t targetCapacity; /* remaining output bytes */
+
+ const uint16_t *table, *sbcsIndex;
+ const uint16_t *results;
+
+ int8_t oldToULength, toULength, toULimit; /* bytes of the current sequence: taken from a previous buffer / consumed so far / expected in total */
+
+ UChar32 c; /* current code point or raw accumulated bytes; -1 marks "not handled inline" */
+ uint8_t b, t1, t2; /* current byte; first and second trail byte minus 0x80 */
+
+ uint32_t asciiRoundtrips;
+ uint16_t value, minValue; /* value: table result for c; minValue: smallest result accepted as a mapping */
+ UBool hasSupplementary;
+
+ /* set up the local pointers */
+ utf8=pToUArgs->converter;
+ cnv=pFromUArgs->converter;
+ source=(uint8_t *)pToUArgs->source;
+ sourceLimit=(uint8_t *)pToUArgs->sourceLimit;
+ target=(uint8_t *)pFromUArgs->target;
+ targetCapacity=(int32_t)(pFromUArgs->targetLimit-pFromUArgs->target);
+
+ table=cnv->sharedData->mbcs.fromUnicodeTable;
+ sbcsIndex=cnv->sharedData->mbcs.sbcsIndex;
+ if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
+ results=(uint16_t *)cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes;
+ } else {
+ results=(uint16_t *)cnv->sharedData->mbcs.fromUnicodeBytes;
+ }
+ asciiRoundtrips=cnv->sharedData->mbcs.asciiRoundtrips;
+
+ if(cnv->useFallback) {
+ /* use all roundtrip and fallback results */
+ minValue=0x800;
+ } else {
+ /* use only roundtrips and fallbacks from private-use characters */
+ minValue=0xc00;
+ }
+ hasSupplementary=(UBool)(cnv->sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY);
+
+ /*
+  * get the converter state from the UTF-8 UConverter
+  * A non-zero toUnicodeStatus means that a previous call stored
+  * a partial UTF-8 sequence.
+  */
+ c=(UChar32)utf8->toUnicodeStatus;
+ if(c!=0) {
+ toULength=oldToULength=utf8->toULength;
+ toULimit=(int8_t)utf8->mode;
+ } else {
+ toULength=oldToULength=toULimit=0;
+ }
+
+ /*
+ * Make sure that the last byte sequence before sourceLimit is complete
+ * or runs into a lead byte.
+ * Do not go back into the bytes that will be read for finishing a partial
+ * sequence from the previous buffer.
+ * In the conversion loop compare source with sourceLimit only once
+ * per multi-byte character.
+ *
+ * This is what allows the inline 2-byte and 3-byte paths in the loop
+ * to read trail bytes without comparing against sourceLimit again.
+ */
+ {
+ int32_t i, length;
+
+ length=(int32_t)(sourceLimit-source) - (toULimit-oldToULength);
+ for(i=0; i<3 && i<length;) {
+ b=*(sourceLimit-i-1);
+ if(U8_IS_TRAIL(b)) {
+ ++i;
+ } else {
+ if(i<U8_COUNT_TRAIL_BYTES(b)) {
+ /* exit the conversion loop before the lead byte if there are not enough trail bytes for it */
+ sourceLimit-=i+1;
+ }
+ break;
+ }
+ }
+ }
+
+ if(c!=0 && targetCapacity>0) {
+ utf8->toUnicodeStatus=0; /* the pending sequence now lives in the locals c, toULength, toULimit */
+ utf8->toULength=0;
+ goto moreBytes;
+ /*
+ * Note: We could avoid the goto by duplicating some of the moreBytes
+ * code, but only up to the point of collecting a complete UTF-8
+ * sequence; then recurse for the toUBytes[toULength]
+ * and then continue with normal conversion.
+ *
+ * If so, move this code to just after initializing the minimum
+ * set of local variables for reading the UTF-8 input
+ * (utf8, source, target, limits but not cnv, table, minValue, etc.).
+ *
+ * Potential advantages:
+ * - avoid the goto
+ * - oldToULength could become a local variable in just those code blocks
+ * that deal with buffer boundaries
+ * - possibly faster if the goto prevents some compiler optimizations
+ * (this would need measuring to confirm)
+ * Disadvantage:
+ * - code duplication
+ */
+ }
+
+ /* conversion loop */
+ while(source<sourceLimit) {
+ if(targetCapacity>0) {
+ b=*source++;
+ if((int8_t)b>=0) { /* i.e. b<=0x7f: a single-byte UTF-8 sequence */
+ /* convert ASCII */
+ if(IS_ASCII_ROUNDTRIP(b, asciiRoundtrips)) {
+ *target++=(uint8_t)b;
+ --targetCapacity;
+ continue;
+ } else {
+ c=b;
+ value=SBCS_RESULT_FROM_UTF8(sbcsIndex, results, 0, c);
+ }
+ } else {
+ if(b<0xe0) {
+ if( /* handle U+0080..U+07FF inline */
+ b>=0xc2 && /* excludes trail bytes and the lead bytes 0xc0/0xc1, which cannot start a sequence in this range */
+ (t1=(uint8_t)(*source-0x80)) <= 0x3f
+ ) {
+ c=b&0x1f;
+ ++source;
+ value=SBCS_RESULT_FROM_UTF8(sbcsIndex, results, c, t1);
+ if(value>=minValue) {
+ *target++=(uint8_t)value;
+ --targetCapacity;
+ continue;
+ } else {
+ c=(c<<6)|t1; /* assemble the full code point for the unassigned handling below */
+ }
+ } else {
+ c=-1; /* not handled inline: take the generic path below */
+ }
+ } else if(b==0xe0) {
+ if( /* handle U+0800..U+0FFF inline */
+ (t1=(uint8_t)(source[0]-0x80)) <= 0x3f && t1 >= 0x20 &&
+ (t2=(uint8_t)(source[1]-0x80)) <= 0x3f
+ ) {
+ c=t1;
+ source+=2;
+ value=SBCS_RESULT_FROM_UTF8(sbcsIndex, results, c, t2);
+ if(value>=minValue) {
+ *target++=(uint8_t)value;
+ --targetCapacity;
+ continue;
+ } else {
+ c=(c<<6)|t2; /* assemble the full code point for the unassigned handling below */
+ }
+ } else {
+ c=-1;
+ }
+ } else {
+ c=-1;
+ }
+
+ if(c<0) {
+ /* handle "complicated" and error cases, and continuing partial characters */
+ oldToULength=0;
+ toULength=1;
+ toULimit=U8_COUNT_TRAIL_BYTES(b)+1;
+ c=b;
+moreBytes:
+ while(toULength<toULimit) {
+ /*
+ * The sourceLimit may have been adjusted before the conversion loop
+ * to stop before a truncated sequence.
+ * Here we need to use the real limit in case we have two truncated
+ * sequences at the end.
+ * See ticket #7492.
+ */
+ if(source<(uint8_t *)pToUArgs->sourceLimit) {
+ b=*source;
+ if(U8_IS_TRAIL(b)) {
+ ++source;
+ ++toULength;
+ c=(c<<6)+b; /* accumulates whole bytes; the marker bits are removed later via utf8_offsets[] */
+ } else {
+ break; /* sequence too short, stop with toULength<toULimit */
+ }
+ } else {
+ /* store the partial UTF-8 character, compatible with the regular UTF-8 converter */
+ source-=(toULength-oldToULength); /* rewind only over the bytes read from this buffer */
+ while(oldToULength<toULength) {
+ utf8->toUBytes[oldToULength++]=*source++;
+ }
+ utf8->toUnicodeStatus=c;
+ utf8->toULength=toULength;
+ utf8->mode=toULimit;
+ pToUArgs->source=(char *)source;
+ pFromUArgs->target=(char *)target;
+ return;
+ }
+ }
+
+ if( toULength==toULimit && /* consumed all trail bytes */
+ (toULength==3 || toULength==2) && /* BMP */
+ (c-=utf8_offsets[toULength])>=utf8_minLegal[toULength] && /* remove the marker bits; reject values below the minimum for this length */
+ (c<=0xd7ff || 0xe000<=c) /* not a surrogate */
+ ) {
+ value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
+ } else if(
+ toULength==toULimit && toULength==4 && /* c is still the raw accumulated value here: the subtraction above is not evaluated for 4-byte sequences */
+ (0x10000<=(c-=utf8_offsets[4]) && c<=0x10ffff)
+ ) {
+ /* supplementary code point */
+ if(!hasSupplementary) {
+ /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
+ value=0; /* below minValue, so c is handled as unassigned below */
+ } else {
+ value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
+ }
+ } else {
+ /* error handling: illegal UTF-8 byte sequence */
+ source-=(toULength-oldToULength);
+ while(oldToULength<toULength) {
+ utf8->toUBytes[oldToULength++]=*source++;
+ }
+ utf8->toULength=toULength;
+ pToUArgs->source=(char *)source;
+ pFromUArgs->target=(char *)target;
+ *pErrorCode=U_ILLEGAL_CHAR_FOUND;
+ return;
+ }
+ }
+ }
+
+ if(value>=minValue) {
+ /* output the mapping for c */
+ *target++=(uint8_t)value;
+ --targetCapacity;
+ } else {
+ /* value<minValue means c is unassigned (unmappable) */
+ /*
+ * Try an extension mapping.
+ * Pass in no source because we don't have UTF-16 input.
+ * If we have a partial match on c, we will return and revert
+ * to UTF-8->UTF-16->charset conversion.
+ */
+ static const UChar nul=0;
+ const UChar *noSource=&nul; /* start and limit are the same pointer: an empty UTF-16 source */
+ c=_extFromU(cnv, cnv->sharedData,
+ c, &noSource, noSource,
+ &target, target+targetCapacity,
+ NULL, -1,
+ pFromUArgs->flush,
+ pErrorCode);
+
+ if(U_FAILURE(*pErrorCode)) {
+ /* not mappable or buffer overflow */
+ cnv->fromUChar32=c; /* keep the code point returned by _extFromU() in the converter */
+ break;
+ } else if(cnv->preFromUFirstCP>=0) {
+ /*
+ * Partial match, return and revert to pivoting.
+ * In normal from-UTF-16 conversion, we would just continue
+ * but then exit the loop because the extension match would
+ * have consumed the source.
+ */
+ *pErrorCode=U_USING_DEFAULT_WARNING;
+ break;
+ } else {
+ /* a mapping was written to the target, continue */
+
+ /* recalculate the targetCapacity after an extension mapping */
+ targetCapacity=(int32_t)(pFromUArgs->targetLimit-(char *)target);
+ }
+ }
+ } else {
+ /* target is full */
+ *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
+ break;
+ }
+ }
+
+ /*
+ * The sourceLimit may have been adjusted before the conversion loop
+ * to stop before a truncated sequence.
+ * If so, then collect the truncated sequence now.
+ *
+ * This is skipped after an error or warning and while an extension
+ * partial match is pending (preFromUFirstCP>=0).
+ * The bytes and the accumulated value are stored in the UTF-8 converter
+ * in the same form that the start of this function reads back.
+ */
+ if(U_SUCCESS(*pErrorCode) &&
+ cnv->preFromUFirstCP<0 &&
+ source<(sourceLimit=(uint8_t *)pToUArgs->sourceLimit)) {
+ c=utf8->toUBytes[0]=b=*source++;
+ toULength=1;
+ toULimit=U8_COUNT_TRAIL_BYTES(b)+1;
+ while(source<sourceLimit) {
+ utf8->toUBytes[toULength++]=b=*source++;
+ c=(c<<6)+b;
+ }
+ utf8->toUnicodeStatus=c;
+ utf8->toULength=toULength;
+ utf8->mode=toULimit;
+ }
+
+ /* write back the updated pointers */
+ pToUArgs->source=(char *)source;
+ pFromUArgs->target=(char *)target;
+}
+
+static void
+ucnv_DBCSFromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
+ UConverterToUnicodeArgs *pToUArgs,
+ UErrorCode *pErrorCode) {
+ UConverter *utf8, *cnv;
+ const uint8_t *source, *sourceLimit;
+ uint8_t *target;
+ int32_t targetCapacity;
+
+ const uint16_t *table, *mbcsIndex;
+ const uint16_t *results;
+
+ int8_t oldToULength, toULength, toULimit;
+
+ UChar32 c;
+ uint8_t b, t1, t2;