+/* MBCS-from-UTF-8 conversion functions ------------------------------------- */
+
+/*
+ * Minimum code point values for n-byte UTF-8 sequences, n=0..4.
+ * The conversion functions in this section treat a decoded 2- or 3-byte
+ * sequence whose value is smaller than utf8_minLegal[n] as illegal.
+ */
+static const UChar32 utf8_minLegal[5]={
+    0,          /* n=0 */
+    0,          /* n=1 */
+    0x80,       /* n=2 */
+    0x800,      /* n=3 */
+    0x10000     /* n=4 */
+};
+
+/*
+ * Offsets for n-byte UTF-8 sequences, indexed by n.
+ * The conversion functions in this section accumulate the raw bytes of a
+ * sequence with c=(c<<6)+byte, that is (((lead<<6)+trail)<<6)+trail...,
+ * and then subtract utf8_offsets[n] to remove the lead/trail marker bits.
+ * Only the first five elements have explicit initializers;
+ * the remaining elements are zero-initialized.
+ */
+static const UChar32 utf8_offsets[7]={
+    0,          /* n=0 */
+    0,          /* n=1 */
+    0x3080,     /* n=2: (0xC0<<6)+0x80 */
+    0xE2080,    /* n=3: (((0xE0<<6)+0x80)<<6)+0x80 */
+    0x3C82080   /* n=4: (((((0xF0<<6)+0x80)<<6)+0x80)<<6)+0x80 */
+};
+
+/*
+ * Convert UTF-8 input directly to a charset whose results are written as
+ * one byte per code point, without going through UTF-16.
+ *
+ * Input is read from pToUArgs->source up to pToUArgs->sourceLimit.
+ * Output is written to pFromUArgs->target up to pFromUArgs->targetLimit.
+ * Both pointers are written back before the function returns.
+ *
+ * A UTF-8 sequence that is cut off at the end of the input is stored in the
+ * UTF-8 converter (toUBytes[], toULength, toUnicodeStatus, mode)
+ * and is continued from there on the next call.
+ *
+ * Results reported through *pErrorCode:
+ * - U_ILLEGAL_CHAR_FOUND: illegal UTF-8 byte sequence;
+ *   its bytes are stored in utf8->toUBytes[0..toULength[
+ * - U_BUFFER_OVERFLOW_ERROR: the target is full
+ * - any failure set by _extFromU() for a code point without a regular mapping;
+ *   cnv->fromUChar32 is then set to the value returned by _extFromU()
+ *
+ * If _extFromU() finds only a partial match (cnv->preFromUFirstCP>=0)
+ * then the function returns without an error and without consuming further
+ * input, so that the conversion can revert to pivoting through UTF-16.
+ */
+static void
+ucnv_SBCSFromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
+                  UConverterToUnicodeArgs *pToUArgs,
+                  UErrorCode *pErrorCode) {
+    UConverter *utf8, *cnv;
+    const uint8_t *source, *sourceLimit;
+    uint8_t *target;
+    int32_t targetCapacity;
+
+    const uint16_t *table, *sbcsIndex;
+    const uint16_t *results;
+
+    /*
+     * oldToULength: number of bytes of the current sequence that came from utf8->toUBytes[]
+     * toULength: number of bytes of the current sequence collected so far
+     * toULimit: total number of bytes expected for the current sequence
+     */
+    int8_t oldToULength, toULength, toULimit;
+
+    UChar32 c;
+    uint8_t b, t1, t2;
+
+    uint32_t asciiRoundtrips;
+    uint16_t value, minValue;
+    UBool hasSupplementary;
+
+    /* set up the local pointers */
+    utf8=pToUArgs->converter;
+    cnv=pFromUArgs->converter;
+    source=(uint8_t *)pToUArgs->source;
+    sourceLimit=(uint8_t *)pToUArgs->sourceLimit;
+    target=(uint8_t *)pFromUArgs->target;
+    targetCapacity=(int32_t)(pFromUArgs->targetLimit-pFromUArgs->target);
+
+    table=cnv->sharedData->mbcs.fromUnicodeTable;
+    sbcsIndex=cnv->sharedData->mbcs.sbcsIndex;
+    if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
+        results=(uint16_t *)cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes;
+    } else {
+        results=(uint16_t *)cnv->sharedData->mbcs.fromUnicodeBytes;
+    }
+    asciiRoundtrips=cnv->sharedData->mbcs.asciiRoundtrips;
+
+    if(cnv->useFallback) {
+        /* use all roundtrip and fallback results */
+        minValue=0x800;
+    } else {
+        /* use only roundtrips and fallbacks from private-use characters */
+        minValue=0xc00;
+    }
+    hasSupplementary=(UBool)(cnv->sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY);
+
+    /* get the converter state from the UTF-8 UConverter */
+    c=(UChar32)utf8->toUnicodeStatus;
+    if(c!=0) {
+        toULength=oldToULength=utf8->toULength;
+        toULimit=(int8_t)utf8->mode;
+    } else {
+        toULength=oldToULength=toULimit=0;
+    }
+
+    /*
+     * Make sure that the last byte sequence before sourceLimit is complete
+     * or runs into a lead byte.
+     * Do not go back into the bytes that will be read for finishing a partial
+     * sequence from the previous buffer.
+     * In the conversion loop compare source with sourceLimit only once
+     * per multi-byte character.
+     */
+    {
+        int32_t i, length;
+
+        length=(int32_t)(sourceLimit-source) - (toULimit-oldToULength);
+        for(i=0; i<3 && i<length;) {
+            b=*(sourceLimit-i-1);
+            if(U8_IS_TRAIL(b)) {
+                ++i;
+            } else {
+                if(i<utf8_countTrailBytes[b]) {
+                    /* exit the conversion loop before the lead byte if there are not enough trail bytes for it */
+                    sourceLimit-=i+1;
+                }
+                break;
+            }
+        }
+    }
+
+    if(c!=0 && targetCapacity>0) {
+        utf8->toUnicodeStatus=0;
+        utf8->toULength=0;
+        goto moreBytes;
+        /*
+         * Note: We could avoid the goto by duplicating some of the moreBytes
+         * code, but only up to the point of collecting a complete UTF-8
+         * sequence; then recurse for the toUBytes[toULength]
+         * and then continue with normal conversion.
+         *
+         * If so, move this code to just after initializing the minimum
+         * set of local variables for reading the UTF-8 input
+         * (utf8, source, target, limits but not cnv, table, minValue, etc.).
+         *
+         * Potential advantages:
+         * - avoid the goto
+         * - oldToULength could become a local variable in just those code blocks
+         *   that deal with buffer boundaries
+         * - possibly faster if the goto prevents some compiler optimizations
+         *   (this would need measuring to confirm)
+         * Disadvantage:
+         * - code duplication
+         */
+    }
+
+    /* conversion loop */
+    while(source<sourceLimit) {
+        if(targetCapacity>0) {
+            b=*source++;
+            if((int8_t)b>=0) {
+                /* convert ASCII */
+                if(IS_ASCII_ROUNDTRIP(b, asciiRoundtrips)) {
+                    *target++=(uint8_t)b;
+                    --targetCapacity;
+                    continue;
+                } else {
+                    c=b;
+                    value=SBCS_RESULT_FROM_UTF8(sbcsIndex, results, 0, c);
+                }
+            } else {
+                if(b<0xe0) {
+                    if( /* handle U+0080..U+07FF inline */
+                        b>=0xc2 &&
+                        (t1=(uint8_t)(*source-0x80)) <= 0x3f
+                    ) {
+                        c=b&0x1f;
+                        ++source;
+                        value=SBCS_RESULT_FROM_UTF8(sbcsIndex, results, c, t1);
+                        if(value>=minValue) {
+                            *target++=(uint8_t)value;
+                            --targetCapacity;
+                            continue;
+                        } else {
+                            /* no usable result: assemble the code point for the extension lookup below */
+                            c=(c<<6)|t1;
+                        }
+                    } else {
+                        c=-1;
+                    }
+                } else if(b==0xe0) {
+                    if( /* handle U+0800..U+0FFF inline */
+                        (t1=(uint8_t)(source[0]-0x80)) <= 0x3f && t1 >= 0x20 &&
+                        (t2=(uint8_t)(source[1]-0x80)) <= 0x3f
+                    ) {
+                        c=t1;
+                        source+=2;
+                        value=SBCS_RESULT_FROM_UTF8(sbcsIndex, results, c, t2);
+                        if(value>=minValue) {
+                            *target++=(uint8_t)value;
+                            --targetCapacity;
+                            continue;
+                        } else {
+                            /* no usable result: assemble the code point for the extension lookup below */
+                            c=(c<<6)|t2;
+                        }
+                    } else {
+                        c=-1;
+                    }
+                } else {
+                    c=-1;
+                }
+
+                if(c<0) {
+                    /* handle "complicated" and error cases, and continuing partial characters */
+                    oldToULength=0;
+                    toULength=1;
+                    toULimit=utf8_countTrailBytes[b]+1;
+                    c=b;
+moreBytes:
+                    while(toULength<toULimit) {
+                        /*
+                         * The sourceLimit may have been adjusted before the conversion loop
+                         * to stop before a truncated sequence.
+                         * Here we need to use the real limit in case we have two truncated
+                         * sequences at the end.
+                         * See ticket #7492.
+                         */
+                        if(source<(uint8_t *)pToUArgs->sourceLimit) {
+                            b=*source;
+                            if(U8_IS_TRAIL(b)) {
+                                ++source;
+                                ++toULength;
+                                c=(c<<6)+b;
+                            } else {
+                                break; /* sequence too short, stop with toULength<toULimit */
+                            }
+                        } else {
+                            /* store the partial UTF-8 character, compatible with the regular UTF-8 converter */
+                            source-=(toULength-oldToULength);
+                            while(oldToULength<toULength) {
+                                utf8->toUBytes[oldToULength++]=*source++;
+                            }
+                            utf8->toUnicodeStatus=c;
+                            utf8->toULength=toULength;
+                            utf8->mode=toULimit;
+                            pToUArgs->source=(char *)source;
+                            pFromUArgs->target=(char *)target;
+                            return;
+                        }
+                    }
+
+                    if( toULength==toULimit && /* consumed all trail bytes */
+                        (toULength==3 || toULength==2) && /* BMP */
+                        (c-=utf8_offsets[toULength])>=utf8_minLegal[toULength] &&
+                        (c<=0xd7ff || 0xe000<=c) /* not a surrogate */
+                    ) {
+                        value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
+                    } else if(
+                        toULength==toULimit && toULength==4 &&
+                        (0x10000<=(c-=utf8_offsets[4]) && c<=0x10ffff)
+                    ) {
+                        /* supplementary code point */
+                        if(!hasSupplementary) {
+                            /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
+                            value=0;
+                        } else {
+                            value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
+                        }
+                    } else {
+                        /* error handling: illegal UTF-8 byte sequence */
+                        source-=(toULength-oldToULength);
+                        while(oldToULength<toULength) {
+                            utf8->toUBytes[oldToULength++]=*source++;
+                        }
+                        utf8->toULength=toULength;
+                        pToUArgs->source=(char *)source;
+                        pFromUArgs->target=(char *)target;
+                        *pErrorCode=U_ILLEGAL_CHAR_FOUND;
+                        return;
+                    }
+                }
+            }
+
+            if(value>=minValue) {
+                /* output the mapping for c */
+                *target++=(uint8_t)value;
+                --targetCapacity;
+            } else {
+                /* value<minValue means c is unassigned (unmappable) */
+                /*
+                 * Try an extension mapping.
+                 * Pass in no source because we don't have UTF-16 input.
+                 * If we have a partial match on c, we will return and revert
+                 * to UTF-8->UTF-16->charset conversion.
+                 */
+                static const UChar nul=0;
+                const UChar *noSource=&nul;
+                c=_extFromU(cnv, cnv->sharedData,
+                            c, &noSource, noSource,
+                            &target, target+targetCapacity,
+                            NULL, -1,
+                            pFromUArgs->flush,
+                            pErrorCode);
+
+                if(U_FAILURE(*pErrorCode)) {
+                    /* not mappable or buffer overflow */
+                    cnv->fromUChar32=c;
+                    break;
+                } else if(cnv->preFromUFirstCP>=0) {
+                    /*
+                     * Partial match, return and revert to pivoting.
+                     * In normal from-UTF-16 conversion, we would just continue
+                     * but then exit the loop because the extension match would
+                     * have consumed the source.
+                     */
+                    break;
+                } else {
+                    /* a mapping was written to the target, continue */
+
+                    /* recalculate the targetCapacity after an extension mapping */
+                    targetCapacity=(int32_t)(pFromUArgs->targetLimit-(char *)target);
+                }
+            }
+        } else {
+            /* target is full */
+            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
+            break;
+        }
+    }
+
+    /*
+     * The sourceLimit may have been adjusted before the conversion loop
+     * to stop before a truncated sequence.
+     * If so, then collect the truncated sequence now.
+     *
+     * Do not do this after a partial extension match (cnv->preFromUFirstCP>=0):
+     * In that case the loop was exited without an error while arbitrary
+     * amounts of input may remain. Treating all of the remaining input as
+     * one truncated sequence would write past the end of utf8->toUBytes[]
+     * and consume input that must be left for the pivoting conversion.
+     */
+    if(U_SUCCESS(*pErrorCode) &&
+            cnv->preFromUFirstCP<0 &&
+            source<(sourceLimit=(uint8_t *)pToUArgs->sourceLimit)) {
+        c=utf8->toUBytes[0]=b=*source++;
+        toULength=1;
+        toULimit=utf8_countTrailBytes[b]+1;
+        while(source<sourceLimit) {
+            utf8->toUBytes[toULength++]=b=*source++;
+            c=(c<<6)+b;
+        }
+        utf8->toUnicodeStatus=c;
+        utf8->toULength=toULength;
+        utf8->mode=toULimit;
+    }
+
+    /* write back the updated pointers */
+    pToUArgs->source=(char *)source;
+    pFromUArgs->target=(char *)target;
+}
+
+/*
+ * Convert UTF-8 input directly to a charset whose results are written as
+ * one or two bytes per code point, without going through UTF-16.
+ *
+ * Input is read from pToUArgs->source up to pToUArgs->sourceLimit.
+ * Output is written to pFromUArgs->target up to pFromUArgs->targetLimit.
+ * Both pointers are written back before the function returns.
+ *
+ * A UTF-8 sequence that is cut off at the end of the input is stored in the
+ * UTF-8 converter (toUBytes[], toULength, toUnicodeStatus, mode)
+ * and is continued from there on the next call.
+ *
+ * Results reported through *pErrorCode:
+ * - U_ILLEGAL_CHAR_FOUND: illegal UTF-8 byte sequence;
+ *   its bytes are stored in utf8->toUBytes[0..toULength[
+ * - U_BUFFER_OVERFLOW_ERROR: the target is full; if only the first byte of
+ *   a two-byte result fit, the second byte is stored in cnv->charErrorBuffer[0]
+ * - any failure set by _extFromU() for a code point without a regular mapping;
+ *   cnv->fromUChar32 is then set to the value returned by _extFromU()
+ *
+ * If _extFromU() finds only a partial match (cnv->preFromUFirstCP>=0)
+ * then the function returns without an error and without consuming further
+ * input, so that the conversion can revert to pivoting through UTF-16.
+ */
+static void
+ucnv_DBCSFromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
+                  UConverterToUnicodeArgs *pToUArgs,
+                  UErrorCode *pErrorCode) {
+    UConverter *utf8, *cnv;
+    const uint8_t *source, *sourceLimit;
+    uint8_t *target;
+    int32_t targetCapacity;
+
+    const uint16_t *table, *mbcsIndex;
+    const uint16_t *results;
+
+    /*
+     * oldToULength: number of bytes of the current sequence that came from utf8->toUBytes[]
+     * toULength: number of bytes of the current sequence collected so far
+     * toULimit: total number of bytes expected for the current sequence
+     */
+    int8_t oldToULength, toULength, toULimit;
+
+    UChar32 c;
+    uint8_t b, t1, t2;
+
+    uint32_t stage2Entry;
+    uint32_t asciiRoundtrips;
+    uint16_t value, minValue;
+    UBool hasSupplementary;
+
+    /* set up the local pointers */
+    utf8=pToUArgs->converter;
+    cnv=pFromUArgs->converter;
+    source=(uint8_t *)pToUArgs->source;
+    sourceLimit=(uint8_t *)pToUArgs->sourceLimit;
+    target=(uint8_t *)pFromUArgs->target;
+    targetCapacity=(int32_t)(pFromUArgs->targetLimit-pFromUArgs->target);
+
+    table=cnv->sharedData->mbcs.fromUnicodeTable;
+    mbcsIndex=cnv->sharedData->mbcs.mbcsIndex;
+    if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
+        results=(uint16_t *)cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes;
+    } else {
+        results=(uint16_t *)cnv->sharedData->mbcs.fromUnicodeBytes;
+    }
+    asciiRoundtrips=cnv->sharedData->mbcs.asciiRoundtrips;
+
+    if(cnv->useFallback) {
+        /* use all roundtrip and fallback results */
+        minValue=0x800;
+    } else {
+        /* use only roundtrips and fallbacks from private-use characters */
+        minValue=0xc00;
+    }
+    hasSupplementary=(UBool)(cnv->sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY);
+
+    /* get the converter state from the UTF-8 UConverter */
+    c=(UChar32)utf8->toUnicodeStatus;
+    if(c!=0) {
+        toULength=oldToULength=utf8->toULength;
+        toULimit=(int8_t)utf8->mode;
+    } else {
+        toULength=oldToULength=toULimit=0;
+    }
+
+    /*
+     * Make sure that the last byte sequence before sourceLimit is complete
+     * or runs into a lead byte.
+     * Do not go back into the bytes that will be read for finishing a partial
+     * sequence from the previous buffer.
+     * In the conversion loop compare source with sourceLimit only once
+     * per multi-byte character.
+     */
+    {
+        int32_t i, length;
+
+        length=(int32_t)(sourceLimit-source) - (toULimit-oldToULength);
+        for(i=0; i<3 && i<length;) {
+            b=*(sourceLimit-i-1);
+            if(U8_IS_TRAIL(b)) {
+                ++i;
+            } else {
+                if(i<utf8_countTrailBytes[b]) {
+                    /* exit the conversion loop before the lead byte if there are not enough trail bytes for it */
+                    sourceLimit-=i+1;
+                }
+                break;
+            }
+        }
+    }
+
+    if(c!=0 && targetCapacity>0) {
+        utf8->toUnicodeStatus=0;
+        utf8->toULength=0;
+        goto moreBytes;
+        /* See note in ucnv_SBCSFromUTF8() about this goto. */
+    }
+
+    /* conversion loop */
+    while(source<sourceLimit) {
+        if(targetCapacity>0) {
+            b=*source++;
+            if((int8_t)b>=0) {
+                /* convert ASCII */
+                if(IS_ASCII_ROUNDTRIP(b, asciiRoundtrips)) {
+                    *target++=b;
+                    --targetCapacity;
+                    continue;
+                } else {
+                    value=DBCS_RESULT_FROM_UTF8(mbcsIndex, results, 0, b);
+                    if(value==0) {
+                        c=b;
+                        goto unassigned;
+                    }
+                }
+            } else {
+                if(b>0xe0) {
+                    if( /* handle U+1000..U+D7FF inline */
+                        (((t1=(uint8_t)(source[0]-0x80), b<0xed) && (t1 <= 0x3f)) ||
+                                                        (b==0xed && (t1 <= 0x1f))) &&
+                        (t2=(uint8_t)(source[1]-0x80)) <= 0x3f
+                    ) {
+                        c=((b&0xf)<<6)|t1;
+                        source+=2;
+                        value=DBCS_RESULT_FROM_UTF8(mbcsIndex, results, c, t2);
+                        if(value==0) {
+                            /* no result: assemble the code point for the extension lookup */
+                            c=(c<<6)|t2;
+                            goto unassigned;
+                        }
+                    } else {
+                        c=-1;
+                    }
+                } else if(b<0xe0) {
+                    if( /* handle U+0080..U+07FF inline */
+                        b>=0xc2 &&
+                        (t1=(uint8_t)(*source-0x80)) <= 0x3f
+                    ) {
+                        c=b&0x1f;
+                        ++source;
+                        value=DBCS_RESULT_FROM_UTF8(mbcsIndex, results, c, t1);
+                        if(value==0) {
+                            /* no result: assemble the code point for the extension lookup */
+                            c=(c<<6)|t1;
+                            goto unassigned;
+                        }
+                    } else {
+                        c=-1;
+                    }
+                } else {
+                    c=-1;
+                }
+
+                if(c<0) {
+                    /* handle "complicated" and error cases, and continuing partial characters */
+                    oldToULength=0;
+                    toULength=1;
+                    toULimit=utf8_countTrailBytes[b]+1;
+                    c=b;
+moreBytes:
+                    while(toULength<toULimit) {
+                        /*
+                         * The sourceLimit may have been adjusted before the conversion loop
+                         * to stop before a truncated sequence.
+                         * Here we need to use the real limit in case we have two truncated
+                         * sequences at the end.
+                         * See ticket #7492.
+                         */
+                        if(source<(uint8_t *)pToUArgs->sourceLimit) {
+                            b=*source;
+                            if(U8_IS_TRAIL(b)) {
+                                ++source;
+                                ++toULength;
+                                c=(c<<6)+b;
+                            } else {
+                                break; /* sequence too short, stop with toULength<toULimit */
+                            }
+                        } else {
+                            /* store the partial UTF-8 character, compatible with the regular UTF-8 converter */
+                            source-=(toULength-oldToULength);
+                            while(oldToULength<toULength) {
+                                utf8->toUBytes[oldToULength++]=*source++;
+                            }
+                            utf8->toUnicodeStatus=c;
+                            utf8->toULength=toULength;
+                            utf8->mode=toULimit;
+                            pToUArgs->source=(char *)source;
+                            pFromUArgs->target=(char *)target;
+                            return;
+                        }
+                    }
+
+                    if( toULength==toULimit && /* consumed all trail bytes */
+                        (toULength==3 || toULength==2) && /* BMP */
+                        (c-=utf8_offsets[toULength])>=utf8_minLegal[toULength] &&
+                        (c<=0xd7ff || 0xe000<=c) /* not a surrogate */
+                    ) {
+                        stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
+                    } else if(
+                        toULength==toULimit && toULength==4 &&
+                        (0x10000<=(c-=utf8_offsets[4]) && c<=0x10ffff)
+                    ) {
+                        /* supplementary code point */
+                        if(!hasSupplementary) {
+                            /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
+                            stage2Entry=0;
+                        } else {
+                            stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
+                        }
+                    } else {
+                        /* error handling: illegal UTF-8 byte sequence */
+                        source-=(toULength-oldToULength);
+                        while(oldToULength<toULength) {
+                            utf8->toUBytes[oldToULength++]=*source++;
+                        }
+                        utf8->toULength=toULength;
+                        pToUArgs->source=(char *)source;
+                        pFromUArgs->target=(char *)target;
+                        *pErrorCode=U_ILLEGAL_CHAR_FOUND;
+                        return;
+                    }
+
+                    /* get the bytes and the length for the output */
+                    /* MBCS_OUTPUT_2 */
+                    value=MBCS_VALUE_2_FROM_STAGE_2(results, stage2Entry, c);
+
+                    /* is this code point assigned, or do we use fallbacks? */
+                    if(!(MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c) ||
+                         (UCNV_FROM_U_USE_FALLBACK(cnv, c) && value!=0))
+                    ) {
+                        goto unassigned;
+                    }
+                }
+            }
+
+            /* write the output character bytes from value and length */
+            /* from the first if in the loop we know that targetCapacity>0 */
+            if(value<=0xff) {
+                /* this is easy because we know that there is enough space */
+                *target++=(uint8_t)value;
+                --targetCapacity;
+            } else /* length==2 */ {
+                *target++=(uint8_t)(value>>8);
+                if(2<=targetCapacity) {
+                    *target++=(uint8_t)value;
+                    targetCapacity-=2;
+                } else {
+                    /* only the first byte fit: keep the second byte in the overflow buffer */
+                    cnv->charErrorBuffer[0]=(char)value;
+                    cnv->charErrorBufferLength=1;
+
+                    /* target overflow */
+                    *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
+                    break;
+                }
+            }
+            continue;
+
+unassigned:
+            {
+                /*
+                 * Try an extension mapping.
+                 * Pass in no source because we don't have UTF-16 input.
+                 * If we have a partial match on c, we will return and revert
+                 * to UTF-8->UTF-16->charset conversion.
+                 */
+                static const UChar nul=0;
+                const UChar *noSource=&nul;
+                c=_extFromU(cnv, cnv->sharedData,
+                            c, &noSource, noSource,
+                            &target, target+targetCapacity,
+                            NULL, -1,
+                            pFromUArgs->flush,
+                            pErrorCode);
+
+                if(U_FAILURE(*pErrorCode)) {
+                    /* not mappable or buffer overflow */
+                    cnv->fromUChar32=c;
+                    break;
+                } else if(cnv->preFromUFirstCP>=0) {
+                    /*
+                     * Partial match, return and revert to pivoting.
+                     * In normal from-UTF-16 conversion, we would just continue
+                     * but then exit the loop because the extension match would
+                     * have consumed the source.
+                     */
+                    break;
+                } else {
+                    /* a mapping was written to the target, continue */
+
+                    /* recalculate the targetCapacity after an extension mapping */
+                    targetCapacity=(int32_t)(pFromUArgs->targetLimit-(char *)target);
+                    continue;
+                }
+            }
+        } else {
+            /* target is full */
+            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
+            break;
+        }
+    }
+
+    /*
+     * The sourceLimit may have been adjusted before the conversion loop
+     * to stop before a truncated sequence.
+     * If so, then collect the truncated sequence now.
+     *
+     * Do not do this after a partial extension match (cnv->preFromUFirstCP>=0):
+     * In that case the loop was exited without an error while arbitrary
+     * amounts of input may remain. Treating all of the remaining input as
+     * one truncated sequence would write past the end of utf8->toUBytes[]
+     * and consume input that must be left for the pivoting conversion.
+     */
+    if(U_SUCCESS(*pErrorCode) &&
+            cnv->preFromUFirstCP<0 &&
+            source<(sourceLimit=(uint8_t *)pToUArgs->sourceLimit)) {
+        c=utf8->toUBytes[0]=b=*source++;
+        toULength=1;
+        toULimit=utf8_countTrailBytes[b]+1;
+        while(source<sourceLimit) {
+            utf8->toUBytes[toULength++]=b=*source++;
+            c=(c<<6)+b;
+        }
+        utf8->toUnicodeStatus=c;
+        utf8->toULength=toULength;
+        utf8->mode=toULimit;
+    }
+
+    /* write back the updated pointers */
+    pToUArgs->source=(char *)source;
+    pFromUArgs->target=(char *)target;
+}
+