/*
******************************************************************************
*
-* Copyright (C) 2000-2008, International Business Machines
+* Copyright (C) 2000-2013, International Business Machines
* Corporation and others. All Rights Reserved.
*
******************************************************************************
#include "unicode/ucnv_cb.h"
#include "unicode/udata.h"
#include "unicode/uset.h"
+#include "unicode/utf8.h"
+#include "unicode/utf16.h"
#include "ucnv_bld.h"
#include "ucnvmbcs.h"
#include "ucnv_ext.h"
#include "ucnv_cnv.h"
-#include "umutex.h"
#include "cmemory.h"
#include "cstring.h"
+#include "cmutex.h"
/* control optimizations according to the platform */
#define MBCS_UNROLL_SINGLE_TO_BMP 1
* as of the re-released mapping tables from 2000-nov-30.
*/
static const uint32_t
-gb18030Ranges[13][4]={
+gb18030Ranges[14][4]={
{0x10000, 0x10FFFF, LINEAR(0x90308130), LINEAR(0xE3329A35)},
{0x9FA6, 0xD7FF, LINEAR(0x82358F33), LINEAR(0x8336C738)},
- {0x0452, 0x200F, LINEAR(0x8130D330), LINEAR(0x8136A531)},
+ {0x0452, 0x1E3E, LINEAR(0x8130D330), LINEAR(0x8135F436)},
+ {0x1E40, 0x200F, LINEAR(0x8135F438), LINEAR(0x8136A531)},
{0xE865, 0xF92B, LINEAR(0x8336D030), LINEAR(0x84308534)},
{0x2643, 0x2E80, LINEAR(0x8137A839), LINEAR(0x8138FD38)},
{0xFA2A, 0xFE2F, LINEAR(0x84309C38), LINEAR(0x84318537)},
/* bit flag for UConverter.options indicating GB 18030 special handling */
#define _MBCS_OPTION_GB18030 0x8000
+/* bit flags for UConverter.options indicating KEIS, JEF, JIPS special handling */
+#define _MBCS_OPTION_KEIS 0x01000
+#define _MBCS_OPTION_JEF 0x02000
+#define _MBCS_OPTION_JIPS 0x04000
+
+#define KEIS_SO_CHAR_1 0x0A
+#define KEIS_SO_CHAR_2 0x42
+#define KEIS_SI_CHAR_1 0x0A
+#define KEIS_SI_CHAR_2 0x41
+
+#define JEF_SO_CHAR 0x28
+#define JEF_SI_CHAR 0x29
+
+#define JIPS_SO_CHAR_1 0x1A
+#define JIPS_SO_CHAR_2 0x70
+#define JIPS_SI_CHAR_1 0x1A
+#define JIPS_SI_CHAR_2 0x71
+
+enum SISO_Option { /* selects which shift sequence getSISOBytes() should produce */
+ SI, /* Shift-In: switch the output from double-byte back to single-byte mode */
+ SO /* Shift-Out: switch the output from single-byte to double-byte mode */
+};
+typedef enum SISO_Option SISO_Option;
+
+static int32_t getSISOBytes(SISO_Option option, uint32_t cnvOption, uint8_t *value) { /*
+ * Store the Shift-In (SI) or Shift-Out (SO) byte sequence selected by the
+ * converter option bits into value and return how many bytes were stored.
+ *
+ * option    - SI or SO: which of the two sequences to produce.
+ * cnvOption - converter option bits. _MBCS_OPTION_KEIS, _MBCS_OPTION_JEF and
+ *             _MBCS_OPTION_JIPS are tested in that order; if none of them is
+ *             set, the single byte UCNV_SI or UCNV_SO is used.
+ * value     - output buffer. Up to two bytes are written, so it must have
+ *             room for at least two.
+ *
+ * Returns 1 or 2, or 0 (with value left untouched) if option is neither
+ * SI nor SO.
+ */
+ int32_t SISOLength = 0;
+
+ switch (option) {
+ case SI:
+ if ((cnvOption&_MBCS_OPTION_KEIS)!=0) { /* KEIS: two-byte shift-in sequence */
+ value[0] = KEIS_SI_CHAR_1;
+ value[1] = KEIS_SI_CHAR_2;
+ SISOLength = 2;
+ } else if ((cnvOption&_MBCS_OPTION_JEF)!=0) { /* JEF: single shift-in byte */
+ value[0] = JEF_SI_CHAR;
+ SISOLength = 1;
+ } else if ((cnvOption&_MBCS_OPTION_JIPS)!=0) { /* JIPS: two-byte shift-in sequence */
+ value[0] = JIPS_SI_CHAR_1;
+ value[1] = JIPS_SI_CHAR_2;
+ SISOLength = 2;
+ } else { /* none of the three flags is set: the plain UCNV_SI byte */
+ value[0] = UCNV_SI;
+ SISOLength = 1;
+ }
+ break;
+ case SO:
+ if ((cnvOption&_MBCS_OPTION_KEIS)!=0) { /* KEIS: two-byte shift-out sequence */
+ value[0] = KEIS_SO_CHAR_1;
+ value[1] = KEIS_SO_CHAR_2;
+ SISOLength = 2;
+ } else if ((cnvOption&_MBCS_OPTION_JEF)!=0) { /* JEF: single shift-out byte */
+ value[0] = JEF_SO_CHAR;
+ SISOLength = 1;
+ } else if ((cnvOption&_MBCS_OPTION_JIPS)!=0) { /* JIPS: two-byte shift-out sequence */
+ value[0] = JIPS_SO_CHAR_1;
+ value[1] = JIPS_SO_CHAR_2;
+ SISOLength = 2;
+ } else { /* none of the three flags is set: the plain UCNV_SO byte */
+ value[0] = UCNV_SO;
+ SISOLength = 1;
+ }
+ break;
+ default:
+ /* Should never happen; SISOLength stays 0 and value is not written. */
+ break;
+ }
+
+ return SISOLength;
+}
+
/* Miscellaneous ------------------------------------------------------------ */
/**
switch(st3Multiplier) {
case 4:
b|=*stage3++;
- case 3:
+ case 3: /*fall through*/
b|=*stage3++;
- case 2:
+ case 2: /*fall through*/
b|=stage3[0]|stage3[1];
stage3+=2;
default:
UErrorCode *pErrorCode) {
uint16_t *stage1;
uint32_t *stage2;
- uint8_t *bytes;
uint32_t dataLength=stage1Length*2+fullStage2Length*4+mbcsTable->fromUBytesLength;
mbcsTable->reconstitutedData=(uint8_t *)uprv_malloc(dataLength);
if(mbcsTable->reconstitutedData==NULL) {
stage2Length*4);
mbcsTable->fromUnicodeTable=stage1;
- mbcsTable->fromUnicodeBytes=bytes=(uint8_t *)(stage2+fullStage2Length);
+ mbcsTable->fromUnicodeBytes=(uint8_t *)(stage2+fullStage2Length);
/* indexes into stage 2 count from the bottom of the fromUnicodeTable */
stage2=(uint32_t *)stage1;
/* TODO parse package name out of the prefix of the base name in the extension .cnv file? */
args.size=sizeof(UConverterLoadArgs);
args.nestedLoads=2;
+ args.onlyTestIsLoadable=pArgs->onlyTestIsLoadable;
args.reserved=pArgs->reserved;
args.options=pArgs->options;
args.pkg=pArgs->pkg;
*pErrorCode=U_INVALID_TABLE_FORMAT;
return;
}
+ if(pArgs->onlyTestIsLoadable) {
+ /*
+ * Exit as soon as we know that we can load the converter
+ * and the format is valid and supported.
+ * The worst that can happen in the following code is a memory
+ * allocation error.
+ */
+ ucnv_unload(baseSharedData);
+ return;
+ }
/* copy the base table data */
uprv_memcpy(mbcsTable, &baseSharedData->mbcs, sizeof(UConverterMBCSTable));
*pErrorCode=U_INVALID_TABLE_FORMAT;
return;
}
+ if(pArgs->onlyTestIsLoadable) {
+ /*
+ * Exit as soon as we know that we can load the converter
+ * and the format is valid and supported.
+ * The worst that can happen in the following code is a memory
+ * allocation error.
+ */
+ return;
+ }
mbcsTable->countStates=(uint8_t)header->countStates;
mbcsTable->countToUFallbacks=header->countToUFallbacks;
static void
ucnv_MBCSOpen(UConverter *cnv,
- const char *name,
- const char *locale,
- uint32_t options,
- UErrorCode *pErrorCode) {
+ UConverterLoadArgs *pArgs,
+ UErrorCode *pErrorCode) {
UConverterMBCSTable *mbcsTable;
const int32_t *extIndexes;
uint8_t outputType;
int8_t maxBytesPerUChar;
+ if(pArgs->onlyTestIsLoadable) {
+ return;
+ }
+
mbcsTable=&cnv->sharedData->mbcs;
outputType=mbcsTable->outputType;
if(outputType==MBCS_OUTPUT_DBCS_ONLY) {
/* the swaplfnl option does not apply, remove it */
- cnv->options=options&=~UCNV_OPTION_SWAP_LFNL;
+ cnv->options=pArgs->options&=~UCNV_OPTION_SWAP_LFNL;
}
- if((options&UCNV_OPTION_SWAP_LFNL)!=0) {
+ if((pArgs->options&UCNV_OPTION_SWAP_LFNL)!=0) {
/* do this because double-checked locking is broken */
UBool isCached;
}
/* the option does not apply, remove it */
- cnv->options=options&=~UCNV_OPTION_SWAP_LFNL;
+ cnv->options=pArgs->options&=~UCNV_OPTION_SWAP_LFNL;
}
}
}
- if(uprv_strstr(name, "18030")!=NULL) {
- if(uprv_strstr(name, "gb18030")!=NULL || uprv_strstr(name, "GB18030")!=NULL) {
+ if(uprv_strstr(pArgs->name, "18030")!=NULL) {
+ if(uprv_strstr(pArgs->name, "gb18030")!=NULL || uprv_strstr(pArgs->name, "GB18030")!=NULL) {
/* set a flag for GB 18030 mode, which changes the callback behavior */
cnv->options|=_MBCS_OPTION_GB18030;
}
+ } else if((uprv_strstr(pArgs->name, "KEIS")!=NULL) || (uprv_strstr(pArgs->name, "keis")!=NULL)) {
+ /* set a flag for KEIS converter, which changes the SI/SO character sequence */
+ cnv->options|=_MBCS_OPTION_KEIS;
+ } else if((uprv_strstr(pArgs->name, "JEF")!=NULL) || (uprv_strstr(pArgs->name, "jef")!=NULL)) {
+ /* set a flag for JEF converter, which changes the SI/SO character sequence */
+ cnv->options|=_MBCS_OPTION_JEF;
+ } else if((uprv_strstr(pArgs->name, "JIPS")!=NULL) || (uprv_strstr(pArgs->name, "jips")!=NULL)) {
+ /* set a flag for JIPS converter, which changes the SI/SO character sequence */
+ cnv->options|=_MBCS_OPTION_JIPS;
}
/* fix maxBytesPerUChar depending on outputType and options etc. */
#endif
/* conversion loop */
- while(targetCapacity>0) {
+ while(targetCapacity > 0 && source < sourceLimit) {
entry=stateTable[0][*source++];
/* MBCS_ENTRY_IS_FINAL(entry) */
* If it does, then surrogates are not paired but mapped separately.
* Note that in this case unmatched surrogates are not detected.
*/
- if(UTF_IS_SURROGATE(c) && !(unicodeMask&UCNV_HAS_SURROGATES)) {
- if(UTF_IS_SURROGATE_FIRST(c)) {
+ if(U16_IS_SURROGATE(c) && !(unicodeMask&UCNV_HAS_SURROGATES)) {
+ if(U16_IS_SURROGATE_LEAD(c)) {
getTrail:
if(source<sourceLimit) {
/* test the following code unit */
UChar trail=*source;
- if(UTF_IS_SECOND_SURROGATE(trail)) {
+ if(U16_IS_TRAIL(trail)) {
++source;
++nextSourceIndex;
- c=UTF16_GET_PAIR_VALUE(c, trail);
+ c=U16_GET_SUPPLEMENTARY(c, trail);
if(!(unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
/* BMP-only codepages are stored without stage 1 entries for supplementary code points */
/* callback(unassigned) */
*/
c=*source++;
++nextSourceIndex;
- if(UTF_IS_SURROGATE(c)) {
- if(UTF_IS_SURROGATE_FIRST(c)) {
+ if(U16_IS_SURROGATE(c)) {
+ if(U16_IS_SURROGATE_LEAD(c)) {
getTrail:
if(source<sourceLimit) {
/* test the following code unit */
UChar trail=*source;
- if(UTF_IS_SECOND_SURROGATE(trail)) {
+ if(U16_IS_TRAIL(trail)) {
++source;
++nextSourceIndex;
- c=UTF16_GET_PAIR_VALUE(c, trail);
+ c=U16_GET_SUPPLEMENTARY(c, trail);
if(!hasSupplementary) {
/* BMP-only codepages are stored without stage 1 entries for supplementary code points */
/* callback(unassigned) */
/* normal end of conversion: prepare for a new character */
c=0;
continue;
- } else if(!UTF_IS_SURROGATE(c)) {
+ } else if(!U16_IS_SURROGATE(c)) {
/* normal, unassigned BMP character */
- } else if(UTF_IS_SURROGATE_FIRST(c)) {
+ } else if(U16_IS_SURROGATE_LEAD(c)) {
getTrail:
if(source<sourceLimit) {
/* test the following code unit */
UChar trail=*source;
- if(UTF_IS_SECOND_SURROGATE(trail)) {
+ if(U16_IS_TRAIL(trail)) {
++source;
- c=UTF16_GET_PAIR_VALUE(c, trail);
+ c=U16_GET_SUPPLEMENTARY(c, trail);
/* this codepage does not map supplementary code points */
/* callback(unassigned) */
} else {
uint32_t stage2Entry;
uint32_t asciiRoundtrips;
uint32_t value;
- int32_t length, prevLength;
+ /* Shift-In and Shift-Out byte sequences differ by encoding scheme. */
+ uint8_t siBytes[2] = {0, 0};
+ uint8_t soBytes[2] = {0, 0};
+ uint8_t siLength, soLength;
+ int32_t length = 0, prevLength;
uint8_t unicodeMask;
cnv=pArgs->converter;
sourceIndex= c==0 ? 0 : -1;
nextSourceIndex=0;
+ /* Get the SI/SO character for the converter */
+ siLength = getSISOBytes(SI, cnv->options, siBytes);
+ soLength = getSISOBytes(SO, cnv->options, soBytes);
+
/* conversion loop */
/*
* This is another piece of ugly code:
length=1;
} else {
/* change from double-byte mode to single-byte */
- value|=(uint32_t)UCNV_SI<<8;
- length=2;
+ if (siLength == 1) {
+ value|=(uint32_t)siBytes[0]<<8;
+ length = 2;
+ } else if (siLength == 2) {
+ value|=(uint32_t)siBytes[1]<<8;
+ value|=(uint32_t)siBytes[0]<<16;
+ length = 3;
+ }
prevLength=1;
}
} else {
length=2;
} else {
/* change from single-byte mode to double-byte */
- value|=(uint32_t)UCNV_SO<<16;
- length=3;
+ if (soLength == 1) {
+ value|=(uint32_t)soBytes[0]<<16;
+ length = 3;
+ } else if (soLength == 2) {
+ value|=(uint32_t)soBytes[1]<<16;
+ value|=(uint32_t)soBytes[0]<<24;
+ length = 4;
+ }
prevLength=2;
}
}
* If it does, then surrogates are not paired but mapped separately.
* Note that in this case unmatched surrogates are not detected.
*/
- if(UTF_IS_SURROGATE(c) && !(unicodeMask&UCNV_HAS_SURROGATES)) {
- if(UTF_IS_SURROGATE_FIRST(c)) {
+ if(U16_IS_SURROGATE(c) && !(unicodeMask&UCNV_HAS_SURROGATES)) {
+ if(U16_IS_SURROGATE_LEAD(c)) {
getTrail:
if(source<sourceLimit) {
/* test the following code unit */
UChar trail=*source;
- if(UTF_IS_SECOND_SURROGATE(trail)) {
+ if(U16_IS_TRAIL(trail)) {
++source;
++nextSourceIndex;
- c=UTF16_GET_PAIR_VALUE(c, trail);
+ c=U16_GET_SUPPLEMENTARY(c, trail);
if(!(unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
/* BMP-only codepages are stored without stage 1 entries for supplementary code points */
cnv->fromUnicodeStatus=prevLength; /* save the old state */
length=1;
} else {
/* change from double-byte mode to single-byte */
- value|=(uint32_t)UCNV_SI<<8;
- length=2;
+ if (siLength == 1) {
+ value|=(uint32_t)siBytes[0]<<8;
+ length = 2;
+ } else if (siLength == 2) {
+ value|=(uint32_t)siBytes[1]<<8;
+ value|=(uint32_t)siBytes[0]<<16;
+ length = 3;
+ }
prevLength=1;
}
} else {
length=2;
} else {
/* change from single-byte mode to double-byte */
- value|=(uint32_t)UCNV_SO<<16;
- length=3;
+ if (soLength == 1) {
+ value|=(uint32_t)soBytes[0]<<16;
+ length = 3;
+ } else if (soLength == 2) {
+ value|=(uint32_t)soBytes[1]<<16;
+ value|=(uint32_t)soBytes[0]<<24;
+ length = 4;
+ }
prevLength=2;
}
}
/* each branch falls through to the next one */
case 4:
*target++=(uint8_t)(value>>24);
- case 3:
+ case 3: /*fall through*/
*target++=(uint8_t)(value>>16);
- case 2:
+ case 2: /*fall through*/
*target++=(uint8_t)(value>>8);
- case 1:
+ case 1: /*fall through*/
*target++=(uint8_t)value;
default:
/* will never occur */
case 4:
*target++=(uint8_t)(value>>24);
*offsets++=sourceIndex;
- case 3:
+ case 3: /*fall through*/
*target++=(uint8_t)(value>>16);
*offsets++=sourceIndex;
- case 2:
+ case 2: /*fall through*/
*target++=(uint8_t)(value>>8);
*offsets++=sourceIndex;
- case 1:
+ case 1: /*fall through*/
*target++=(uint8_t)value;
*offsets++=sourceIndex;
default:
/* each branch falls through to the next one */
case 3:
*charErrorBuffer++=(uint8_t)(value>>16);
- case 2:
+ case 2: /*fall through*/
*charErrorBuffer++=(uint8_t)(value>>8);
- case 1:
+ case 1: /*fall through*/
*charErrorBuffer=(uint8_t)value;
default:
/* will never occur */
if(offsets!=NULL) {
*offsets++=sourceIndex;
}
- case 2:
+ case 2: /*fall through*/
*target++=(uint8_t)(value>>8);
if(offsets!=NULL) {
*offsets++=sourceIndex;
}
- case 1:
+ case 1: /*fall through*/
*target++=(uint8_t)value;
if(offsets!=NULL) {
*offsets++=sourceIndex;
) {
/* EBCDIC_STATEFUL ending with DBCS: emit an SI to return the output stream to SBCS */
if(targetCapacity>0) {
- *target++=(uint8_t)UCNV_SI;
+ *target++=(uint8_t)siBytes[0];
+ if (siLength == 2) {
+ if (targetCapacity<2) {
+ cnv->charErrorBuffer[0]=(uint8_t)siBytes[1];
+ cnv->charErrorBufferLength=1;
+ *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
+ } else {
+ *target++=(uint8_t)siBytes[1];
+ }
+ }
if(offsets!=NULL) {
/* set the last source character's index (sourceIndex points at sourceLimit now) */
*offsets++=prevSourceIndex;
}
} else {
/* target is full */
- cnv->charErrorBuffer[0]=(char)UCNV_SI;
- cnv->charErrorBufferLength=1;
+ cnv->charErrorBuffer[0]=(uint8_t)siBytes[0];
+ if (siLength == 2) {
+ cnv->charErrorBuffer[1]=(uint8_t)siBytes[1];
+ }
+ cnv->charErrorBufferLength=siLength;
*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
}
prevLength=1; /* we switched into SBCS */
if(U8_IS_TRAIL(b)) {
++i;
} else {
- if(i<utf8_countTrailBytes[b]) {
+ if(i<U8_COUNT_TRAIL_BYTES(b)) {
/* exit the conversion loop before the lead byte if there are not enough trail bytes for it */
sourceLimit-=i+1;
}
/* handle "complicated" and error cases, and continuing partial characters */
oldToULength=0;
toULength=1;
- toULimit=utf8_countTrailBytes[b]+1;
+ toULimit=U8_COUNT_TRAIL_BYTES(b)+1;
c=b;
moreBytes:
while(toULength<toULimit) {
- if(source<sourceLimit) {
+ /*
+ * The sourceLimit may have been adjusted before the conversion loop
+ * to stop before a truncated sequence.
+ * Here we need to use the real limit in case we have two truncated
+ * sequences at the end.
+ * See ticket #7492.
+ */
+ if(source<(uint8_t *)pToUArgs->sourceLimit) {
b=*source;
if(U8_IS_TRAIL(b)) {
++source;
* but then exit the loop because the extension match would
* have consumed the source.
*/
+ *pErrorCode=U_USING_DEFAULT_WARNING;
break;
} else {
/* a mapping was written to the target, continue */
* to stop before a truncated sequence.
* If so, then collect the truncated sequence now.
*/
- if(U_SUCCESS(*pErrorCode) && source<(sourceLimit=(uint8_t *)pToUArgs->sourceLimit)) {
+ if(U_SUCCESS(*pErrorCode) &&
+ cnv->preFromUFirstCP<0 &&
+ source<(sourceLimit=(uint8_t *)pToUArgs->sourceLimit)) {
c=utf8->toUBytes[0]=b=*source++;
toULength=1;
- toULimit=utf8_countTrailBytes[b]+1;
+ toULimit=U8_COUNT_TRAIL_BYTES(b)+1;
while(source<sourceLimit) {
utf8->toUBytes[toULength++]=b=*source++;
c=(c<<6)+b;
uint32_t stage2Entry;
uint32_t asciiRoundtrips;
- uint16_t value, minValue;
+ uint16_t value;
UBool hasSupplementary;
/* set up the local pointers */
}
asciiRoundtrips=cnv->sharedData->mbcs.asciiRoundtrips;
- if(cnv->useFallback) {
- /* use all roundtrip and fallback results */
- minValue=0x800;
- } else {
- /* use only roundtrips and fallbacks from private-use characters */
- minValue=0xc00;
- }
hasSupplementary=(UBool)(cnv->sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY);
/* get the converter state from the UTF-8 UConverter */
if(U8_IS_TRAIL(b)) {
++i;
} else {
- if(i<utf8_countTrailBytes[b]) {
+ if(i<U8_COUNT_TRAIL_BYTES(b)) {
/* exit the conversion loop before the lead byte if there are not enough trail bytes for it */
sourceLimit-=i+1;
}
/* handle "complicated" and error cases, and continuing partial characters */
oldToULength=0;
toULength=1;
- toULimit=utf8_countTrailBytes[b]+1;
+ toULimit=U8_COUNT_TRAIL_BYTES(b)+1;
c=b;
moreBytes:
while(toULength<toULimit) {
- if(source<sourceLimit) {
+ /*
+ * The sourceLimit may have been adjusted before the conversion loop
+ * to stop before a truncated sequence.
+ * Here we need to use the real limit in case we have two truncated
+ * sequences at the end.
+ * See ticket #7492.
+ */
+ if(source<(uint8_t *)pToUArgs->sourceLimit) {
b=*source;
if(U8_IS_TRAIL(b)) {
++source;
* but then exit the loop because the extension match would
* have consumed the source.
*/
+ *pErrorCode=U_USING_DEFAULT_WARNING;
break;
} else {
/* a mapping was written to the target, continue */
* to stop before a truncated sequence.
* If so, then collect the truncated sequence now.
*/
- if(U_SUCCESS(*pErrorCode) && source<(sourceLimit=(uint8_t *)pToUArgs->sourceLimit)) {
+ if(U_SUCCESS(*pErrorCode) &&
+ cnv->preFromUFirstCP<0 &&
+ source<(sourceLimit=(uint8_t *)pToUArgs->sourceLimit)) {
c=utf8->toUBytes[0]=b=*source++;
toULength=1;
- toULimit=utf8_countTrailBytes[b]+1;
+ toULimit=U8_COUNT_TRAIL_BYTES(b)+1;
while(source<sourceLimit) {
utf8->toUBytes[toULength++]=b=*source++;
c=(c<<6)+b;