/*
******************************************************************************
*
-* Copyright (C) 2000-2010, International Business Machines
+* Copyright (C) 2000-2013, International Business Machines
* Corporation and others. All Rights Reserved.
*
******************************************************************************
#include "unicode/ucnv_cb.h"
#include "unicode/udata.h"
#include "unicode/uset.h"
+#include "unicode/utf8.h"
+#include "unicode/utf16.h"
#include "ucnv_bld.h"
#include "ucnvmbcs.h"
#include "ucnv_ext.h"
#include "ucnv_cnv.h"
-#include "umutex.h"
#include "cmemory.h"
#include "cstring.h"
+#include "cmutex.h"
/* control optimizations according to the platform */
#define MBCS_UNROLL_SINGLE_TO_BMP 1
* as of the re-released mapping tables from 2000-nov-30.
*/
static const uint32_t
-gb18030Ranges[13][4]={
+gb18030Ranges[14][4]={
{0x10000, 0x10FFFF, LINEAR(0x90308130), LINEAR(0xE3329A35)},
{0x9FA6, 0xD7FF, LINEAR(0x82358F33), LINEAR(0x8336C738)},
- {0x0452, 0x200F, LINEAR(0x8130D330), LINEAR(0x8136A531)},
+ {0x0452, 0x1E3E, LINEAR(0x8130D330), LINEAR(0x8135F436)},
+ {0x1E40, 0x200F, LINEAR(0x8135F438), LINEAR(0x8136A531)},
{0xE865, 0xF92B, LINEAR(0x8336D030), LINEAR(0x84308534)},
{0x2643, 0x2E80, LINEAR(0x8137A839), LINEAR(0x8138FD38)},
{0xFA2A, 0xFE2F, LINEAR(0x84309C38), LINEAR(0x84318537)},
switch(st3Multiplier) {
case 4:
b|=*stage3++;
- case 3:
+ case 3: /*fall through*/
b|=*stage3++;
- case 2:
+ case 2: /*fall through*/
b|=stage3[0]|stage3[1];
stage3+=2;
default:
UErrorCode *pErrorCode) {
uint16_t *stage1;
uint32_t *stage2;
- uint8_t *bytes;
uint32_t dataLength=stage1Length*2+fullStage2Length*4+mbcsTable->fromUBytesLength;
mbcsTable->reconstitutedData=(uint8_t *)uprv_malloc(dataLength);
if(mbcsTable->reconstitutedData==NULL) {
stage2Length*4);
mbcsTable->fromUnicodeTable=stage1;
- mbcsTable->fromUnicodeBytes=bytes=(uint8_t *)(stage2+fullStage2Length);
+ mbcsTable->fromUnicodeBytes=(uint8_t *)(stage2+fullStage2Length);
/* indexes into stage 2 count from the bottom of the fromUnicodeTable */
stage2=(uint32_t *)stage1;
* If it does, then surrogates are not paired but mapped separately.
* Note that in this case unmatched surrogates are not detected.
*/
- if(UTF_IS_SURROGATE(c) && !(unicodeMask&UCNV_HAS_SURROGATES)) {
- if(UTF_IS_SURROGATE_FIRST(c)) {
+ if(U16_IS_SURROGATE(c) && !(unicodeMask&UCNV_HAS_SURROGATES)) {
+ if(U16_IS_SURROGATE_LEAD(c)) {
getTrail:
if(source<sourceLimit) {
/* test the following code unit */
UChar trail=*source;
- if(UTF_IS_SECOND_SURROGATE(trail)) {
+ if(U16_IS_TRAIL(trail)) {
++source;
++nextSourceIndex;
- c=UTF16_GET_PAIR_VALUE(c, trail);
+ c=U16_GET_SUPPLEMENTARY(c, trail);
if(!(unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
/* BMP-only codepages are stored without stage 1 entries for supplementary code points */
/* callback(unassigned) */
*/
c=*source++;
++nextSourceIndex;
- if(UTF_IS_SURROGATE(c)) {
- if(UTF_IS_SURROGATE_FIRST(c)) {
+ if(U16_IS_SURROGATE(c)) {
+ if(U16_IS_SURROGATE_LEAD(c)) {
getTrail:
if(source<sourceLimit) {
/* test the following code unit */
UChar trail=*source;
- if(UTF_IS_SECOND_SURROGATE(trail)) {
+ if(U16_IS_TRAIL(trail)) {
++source;
++nextSourceIndex;
- c=UTF16_GET_PAIR_VALUE(c, trail);
+ c=U16_GET_SUPPLEMENTARY(c, trail);
if(!hasSupplementary) {
/* BMP-only codepages are stored without stage 1 entries for supplementary code points */
/* callback(unassigned) */
/* normal end of conversion: prepare for a new character */
c=0;
continue;
- } else if(!UTF_IS_SURROGATE(c)) {
+ } else if(!U16_IS_SURROGATE(c)) {
/* normal, unassigned BMP character */
- } else if(UTF_IS_SURROGATE_FIRST(c)) {
+ } else if(U16_IS_SURROGATE_LEAD(c)) {
getTrail:
if(source<sourceLimit) {
/* test the following code unit */
UChar trail=*source;
- if(UTF_IS_SECOND_SURROGATE(trail)) {
+ if(U16_IS_TRAIL(trail)) {
++source;
- c=UTF16_GET_PAIR_VALUE(c, trail);
+ c=U16_GET_SUPPLEMENTARY(c, trail);
/* this codepage does not map supplementary code points */
/* callback(unassigned) */
} else {
uint32_t stage2Entry;
uint32_t asciiRoundtrips;
uint32_t value;
- uint8_t si_value[2] = {0, 0};
- uint8_t so_value[2] = {0, 0};
- uint8_t si_value_length, so_value_length;
+ /* Shift-In and Shift-Out byte sequences differ by encoding scheme. */
+ uint8_t siBytes[2] = {0, 0};
+ uint8_t soBytes[2] = {0, 0};
+ uint8_t siLength, soLength;
int32_t length = 0, prevLength;
uint8_t unicodeMask;
nextSourceIndex=0;
/* Get the SI/SO character for the converter */
- si_value_length = getSISOBytes(SI, cnv->options, si_value);
- so_value_length = getSISOBytes(SO, cnv->options, so_value);
+ siLength = getSISOBytes(SI, cnv->options, siBytes);
+ soLength = getSISOBytes(SO, cnv->options, soBytes);
/* conversion loop */
/*
length=1;
} else {
/* change from double-byte mode to single-byte */
- if (si_value_length == 1) {
- value|=(uint32_t)si_value[0]<<8;
+ if (siLength == 1) {
+ value|=(uint32_t)siBytes[0]<<8;
length = 2;
- } else if (si_value_length == 2) {
- value|=(uint32_t)si_value[1]<<8;
- value|=(uint32_t)si_value[0]<<16;
+ } else if (siLength == 2) {
+ value|=(uint32_t)siBytes[1]<<8;
+ value|=(uint32_t)siBytes[0]<<16;
length = 3;
}
prevLength=1;
length=2;
} else {
/* change from single-byte mode to double-byte */
- if (so_value_length == 1) {
- value|=(uint32_t)so_value[0]<<16;
+ if (soLength == 1) {
+ value|=(uint32_t)soBytes[0]<<16;
length = 3;
- } else if (so_value_length == 2) {
- value|=(uint32_t)so_value[1]<<16;
- value|=(uint32_t)so_value[0]<<24;
+ } else if (soLength == 2) {
+ value|=(uint32_t)soBytes[1]<<16;
+ value|=(uint32_t)soBytes[0]<<24;
length = 4;
}
prevLength=2;
* If it does, then surrogates are not paired but mapped separately.
* Note that in this case unmatched surrogates are not detected.
*/
- if(UTF_IS_SURROGATE(c) && !(unicodeMask&UCNV_HAS_SURROGATES)) {
- if(UTF_IS_SURROGATE_FIRST(c)) {
+ if(U16_IS_SURROGATE(c) && !(unicodeMask&UCNV_HAS_SURROGATES)) {
+ if(U16_IS_SURROGATE_LEAD(c)) {
getTrail:
if(source<sourceLimit) {
/* test the following code unit */
UChar trail=*source;
- if(UTF_IS_SECOND_SURROGATE(trail)) {
+ if(U16_IS_TRAIL(trail)) {
++source;
++nextSourceIndex;
- c=UTF16_GET_PAIR_VALUE(c, trail);
+ c=U16_GET_SUPPLEMENTARY(c, trail);
if(!(unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
/* BMP-only codepages are stored without stage 1 entries for supplementary code points */
cnv->fromUnicodeStatus=prevLength; /* save the old state */
length=1;
} else {
/* change from double-byte mode to single-byte */
- if (si_value_length == 1) {
- value|=(uint32_t)si_value[0]<<8;
+ if (siLength == 1) {
+ value|=(uint32_t)siBytes[0]<<8;
length = 2;
- } else if (si_value_length == 2) {
- value|=(uint32_t)si_value[1]<<8;
- value|=(uint32_t)si_value[0]<<16;
+ } else if (siLength == 2) {
+ value|=(uint32_t)siBytes[1]<<8;
+ value|=(uint32_t)siBytes[0]<<16;
length = 3;
}
prevLength=1;
length=2;
} else {
/* change from single-byte mode to double-byte */
- if (so_value_length == 1) {
- value|=(uint32_t)so_value[0]<<16;
+ if (soLength == 1) {
+ value|=(uint32_t)soBytes[0]<<16;
length = 3;
- } else if (so_value_length == 2) {
- value|=(uint32_t)so_value[1]<<16;
- value|=(uint32_t)so_value[0]<<24;
+ } else if (soLength == 2) {
+ value|=(uint32_t)soBytes[1]<<16;
+ value|=(uint32_t)soBytes[0]<<24;
length = 4;
}
prevLength=2;
/* each branch falls through to the next one */
case 4:
*target++=(uint8_t)(value>>24);
- case 3:
+ case 3: /*fall through*/
*target++=(uint8_t)(value>>16);
- case 2:
+ case 2: /*fall through*/
*target++=(uint8_t)(value>>8);
- case 1:
+ case 1: /*fall through*/
*target++=(uint8_t)value;
default:
/* will never occur */
case 4:
*target++=(uint8_t)(value>>24);
*offsets++=sourceIndex;
- case 3:
+ case 3: /*fall through*/
*target++=(uint8_t)(value>>16);
*offsets++=sourceIndex;
- case 2:
+ case 2: /*fall through*/
*target++=(uint8_t)(value>>8);
*offsets++=sourceIndex;
- case 1:
+ case 1: /*fall through*/
*target++=(uint8_t)value;
*offsets++=sourceIndex;
default:
/* each branch falls through to the next one */
case 3:
*charErrorBuffer++=(uint8_t)(value>>16);
- case 2:
+ case 2: /*fall through*/
*charErrorBuffer++=(uint8_t)(value>>8);
- case 1:
+ case 1: /*fall through*/
*charErrorBuffer=(uint8_t)value;
default:
/* will never occur */
if(offsets!=NULL) {
*offsets++=sourceIndex;
}
- case 2:
+ case 2: /*fall through*/
*target++=(uint8_t)(value>>8);
if(offsets!=NULL) {
*offsets++=sourceIndex;
}
- case 1:
+ case 1: /*fall through*/
*target++=(uint8_t)value;
if(offsets!=NULL) {
*offsets++=sourceIndex;
) {
/* EBCDIC_STATEFUL ending with DBCS: emit an SI to return the output stream to SBCS */
if(targetCapacity>0) {
- *target++=(uint8_t)si_value[0];
- if (si_value_length == 2) {
+ *target++=(uint8_t)siBytes[0];
+ if (siLength == 2) {
if (targetCapacity<2) {
- cnv->charErrorBuffer[0]=(uint8_t)si_value[1];
+ cnv->charErrorBuffer[0]=(uint8_t)siBytes[1];
cnv->charErrorBufferLength=1;
*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
} else {
- *target++=(uint8_t)si_value[1];
+ *target++=(uint8_t)siBytes[1];
}
}
if(offsets!=NULL) {
}
} else {
/* target is full */
- cnv->charErrorBuffer[0]=(uint8_t)si_value[0];
- if (si_value_length == 2) {
- cnv->charErrorBuffer[1]=(uint8_t)si_value[1];
+ cnv->charErrorBuffer[0]=(uint8_t)siBytes[0];
+ if (siLength == 2) {
+ cnv->charErrorBuffer[1]=(uint8_t)siBytes[1];
}
- cnv->charErrorBufferLength=si_value_length;
+ cnv->charErrorBufferLength=siLength;
*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
}
prevLength=1; /* we switched into SBCS */
if(U8_IS_TRAIL(b)) {
++i;
} else {
- if(i<utf8_countTrailBytes[b]) {
+ if(i<U8_COUNT_TRAIL_BYTES(b)) {
/* exit the conversion loop before the lead byte if there are not enough trail bytes for it */
sourceLimit-=i+1;
}
/* handle "complicated" and error cases, and continuing partial characters */
oldToULength=0;
toULength=1;
- toULimit=utf8_countTrailBytes[b]+1;
+ toULimit=U8_COUNT_TRAIL_BYTES(b)+1;
c=b;
moreBytes:
while(toULength<toULimit) {
* but then exit the loop because the extension match would
* have consumed the source.
*/
+ *pErrorCode=U_USING_DEFAULT_WARNING;
break;
} else {
/* a mapping was written to the target, continue */
* to stop before a truncated sequence.
* If so, then collect the truncated sequence now.
*/
- if(U_SUCCESS(*pErrorCode) && source<(sourceLimit=(uint8_t *)pToUArgs->sourceLimit)) {
+ if(U_SUCCESS(*pErrorCode) &&
+ cnv->preFromUFirstCP<0 &&
+ source<(sourceLimit=(uint8_t *)pToUArgs->sourceLimit)) {
c=utf8->toUBytes[0]=b=*source++;
toULength=1;
- toULimit=utf8_countTrailBytes[b]+1;
+ toULimit=U8_COUNT_TRAIL_BYTES(b)+1;
while(source<sourceLimit) {
utf8->toUBytes[toULength++]=b=*source++;
c=(c<<6)+b;
uint32_t stage2Entry;
uint32_t asciiRoundtrips;
- uint16_t value, minValue;
+ uint16_t value;
UBool hasSupplementary;
/* set up the local pointers */
}
asciiRoundtrips=cnv->sharedData->mbcs.asciiRoundtrips;
- if(cnv->useFallback) {
- /* use all roundtrip and fallback results */
- minValue=0x800;
- } else {
- /* use only roundtrips and fallbacks from private-use characters */
- minValue=0xc00;
- }
hasSupplementary=(UBool)(cnv->sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY);
/* get the converter state from the UTF-8 UConverter */
if(U8_IS_TRAIL(b)) {
++i;
} else {
- if(i<utf8_countTrailBytes[b]) {
+ if(i<U8_COUNT_TRAIL_BYTES(b)) {
/* exit the conversion loop before the lead byte if there are not enough trail bytes for it */
sourceLimit-=i+1;
}
/* handle "complicated" and error cases, and continuing partial characters */
oldToULength=0;
toULength=1;
- toULimit=utf8_countTrailBytes[b]+1;
+ toULimit=U8_COUNT_TRAIL_BYTES(b)+1;
c=b;
moreBytes:
while(toULength<toULimit) {
* but then exit the loop because the extension match would
* have consumed the source.
*/
+ *pErrorCode=U_USING_DEFAULT_WARNING;
break;
} else {
/* a mapping was written to the target, continue */
* to stop before a truncated sequence.
* If so, then collect the truncated sequence now.
*/
- if(U_SUCCESS(*pErrorCode) && source<(sourceLimit=(uint8_t *)pToUArgs->sourceLimit)) {
+ if(U_SUCCESS(*pErrorCode) &&
+ cnv->preFromUFirstCP<0 &&
+ source<(sourceLimit=(uint8_t *)pToUArgs->sourceLimit)) {
c=utf8->toUBytes[0]=b=*source++;
toULength=1;
- toULimit=utf8_countTrailBytes[b]+1;
+ toULimit=U8_COUNT_TRAIL_BYTES(b)+1;
while(source<sourceLimit) {
utf8->toUBytes[toULength++]=b=*source++;
c=(c<<6)+b;