/*
******************************************************************************
*
-* Copyright (C) 1998-2006, International Business Machines
+* Copyright (C) 1998-2006,2008 International Business Machines
* Corporation and others. All Rights Reserved.
*
******************************************************************************
e!=U_ILLEGAL_CHAR_FOUND &&
e!=U_TRUNCATED_CHAR_FOUND &&
e!=U_ILLEGAL_ESCAPE_SEQUENCE &&
- e!=U_UNSUPPORTED_ESCAPE_SEQUENCE)
+ e!=U_UNSUPPORTED_ESCAPE_SEQUENCE &&
+ e!=U_PARSE_ERROR) /* temporary err to flag empty segment, will be reset to U_ILLEGAL_ESCAPE_SEQUENCE below */
) {
/*
* the callback did not or cannot resolve the error:
cnv->toULength=0;
/* call the callback function */
- cnv->fromCharErrorBehaviour(cnv->toUContext, pArgs,
- cnv->invalidCharBuffer, errorInputLength,
- (*err==U_INVALID_CHAR_FOUND || *err==U_UNSUPPORTED_ESCAPE_SEQUENCE) ?
- UCNV_UNASSIGNED : UCNV_ILLEGAL,
- err);
+ {
+ UConverterCallbackReason reason;
+ if (*err == U_PARSE_ERROR) { /* Here U_PARSE_ERROR indicates empty segment */
+ *err = U_ILLEGAL_ESCAPE_SEQUENCE;
+ reason = UCNV_IRREGULAR;
+ } else {
+ reason = (*err==U_INVALID_CHAR_FOUND || *err==U_UNSUPPORTED_ESCAPE_SEQUENCE) ?
+ UCNV_UNASSIGNED : UCNV_ILLEGAL;
+ }
+ cnv->fromCharErrorBehaviour(cnv->toUContext, pArgs,
+ cnv->invalidCharBuffer, errorInputLength, reason, err);
+ }
/*
* loop back to the offset handling
/*
**********************************************************************
-* Copyright (C) 2000-2006, International Business Machines
+* Copyright (C) 2000-2006,2008 International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
* file name: ucnv2022.c
#ifdef U_ENABLE_GENERIC_ISO_2022
UBool isFirstBuffer;
#endif
+ UBool isEmptySegment;
char name[30];
char locale[3];
}UConverterDataISO2022;
if(choice<=UCNV_RESET_TO_UNICODE) {
uprv_memset(&myConverterData->toU2022State, 0, sizeof(ISO2022State));
myConverterData->key = 0;
+ myConverterData->isEmptySegment = FALSE;
}
if(choice!=UCNV_RESET_TO_UNICODE) {
uprv_memset(&myConverterData->fromU2022State, 0, sizeof(ISO2022State));
continue;
} else {
/* only JIS7 uses SI/SO, not ISO-2022-JP-x */
+ myData->isEmptySegment = FALSE; /* reset this, we have a different error */
break;
}
continue;
} else {
/* only JIS7 uses SI/SO, not ISO-2022-JP-x */
+ myData->isEmptySegment = FALSE; /* reset this, we have a different error */
break;
}
case ESC_2022:
mySource--;
escape:
- changeState_2022(args->converter,&(mySource),
- mySourceLimit, ISO_2022_JP,err);
+ {
+ const char * mySourceBefore = mySource;
+ int8_t toULengthBefore = args->converter->toULength;
+
+ changeState_2022(args->converter,&(mySource),
+ mySourceLimit, ISO_2022_JP,err);
+
+ /* If in ISO-2022-JP only and we successfully completed an escape sequence, but previous segment was empty, create an error */
+ if ( myData->version == 0 && myData->key == 0 && U_SUCCESS(*err) && myData->isEmptySegment ) {
+ *err = U_PARSE_ERROR; /* temporary err to flag empty segment, will be reset to U_ILLEGAL_ESCAPE_SEQUENCE in _toUnicodeWithCallback */
+ args->converter->toULength = toULengthBefore + (mySource - mySourceBefore);
+ }
+ }
/* invalid or illegal escape sequence */
if(U_FAILURE(*err)){
args->target = myTarget;
args->source = mySource;
+ myData->isEmptySegment = FALSE; /* Reset to avoid future spurious errors */
return;
}
+ /* If we successfully completed an escape sequence, we begin a new segment, empty so far */
+ if (myData->key == 0) {
+ myData->isEmptySegment = TRUE;
+ }
continue;
/* ISO-2022-JP does not use single-byte (C1) SS2 and SS3 */
/* falls through */
default:
/* convert one or two bytes */
+ myData->isEmptySegment = FALSE;
cs = (StateEnum)pToU2022State->cs[pToU2022State->g];
if( (uint8_t)(mySourceChar - 0xa1) <= (0xdf - 0xa1) && myData->version==4 &&
!IS_JP_DBCS(cs)
if(mySourceChar==UCNV_SI){
myData->toU2022State.g = 0;
+ if (myData->isEmptySegment) {
+ myData->isEmptySegment = FALSE; /* we are handling it, reset to avoid future spurious errors */
+ *err = U_PARSE_ERROR; /* temporary err to flag empty segment, will be reset to U_ILLEGAL_ESCAPE_SEQUENCE in _toUnicodeWithCallback */
+ args->converter->toUBytes[0] = mySourceChar;
+ args->converter->toULength = 1;
+ args->target = myTarget;
+ args->source = mySource;
+ return;
+ }
/*consume the source */
continue;
}else if(mySourceChar==UCNV_SO){
myData->toU2022State.g = 1;
+ myData->isEmptySegment = TRUE; /* Begin a new segment, empty so far */
/*consume the source */
continue;
}else if(mySourceChar==ESC_2022){
mySource--;
escape:
+ myData->isEmptySegment = FALSE; /* Any invalid ESC sequences will be detected separately, so just reset this */
changeState_2022(args->converter,&(mySource),
mySourceLimit, ISO_2022_KR, err);
if(U_FAILURE(*err)){
continue;
}
+ myData->isEmptySegment = FALSE; /* Any invalid char errors will be detected separately, so just reset this */
if(myData->toU2022State.g == 1) {
if(mySource < mySourceLimit) {
char trailByte;
switch(mySourceChar){
case UCNV_SI:
pToU2022State->g=0;
+ if (myData->isEmptySegment) {
+ myData->isEmptySegment = FALSE; /* we are handling it, reset to avoid future spurious errors */
+ *err = U_PARSE_ERROR; /* temporary err to flag empty segment, will be reset to U_ILLEGAL_ESCAPE_SEQUENCE in _toUnicodeWithCallback */
+ args->converter->toUBytes[0] = mySourceChar;
+ args->converter->toULength = 1;
+ args->target = myTarget;
+ args->source = mySource;
+ return;
+ }
continue;
case UCNV_SO:
if(pToU2022State->cs[1] != 0) {
pToU2022State->g=1;
+ myData->isEmptySegment = TRUE; /* Begin a new segment, empty so far */
continue;
} else {
/* illegal to have SO before a matching designator */
+ myData->isEmptySegment = FALSE; /* Handling a different error, reset this to avoid future spurious errs */
break;
}
case ESC_2022:
mySource--;
escape:
- changeState_2022(args->converter,&(mySource),
- mySourceLimit, ISO_2022_CN,err);
+ {
+ const char * mySourceBefore = mySource;
+ int8_t toULengthBefore = args->converter->toULength;
+
+ changeState_2022(args->converter,&(mySource),
+ mySourceLimit, ISO_2022_CN,err);
+
+ /* After SO there must be at least one character before a designator (designator error handled separately) */
+ if ( myData->key == 0 && U_SUCCESS(*err) && myData->isEmptySegment ) {
+ *err = U_PARSE_ERROR; /* temporary err to flag empty segment, will be reset to U_ILLEGAL_ESCAPE_SEQUENCE in _toUnicodeWithCallback */
+ args->converter->toULength = toULengthBefore + (mySource - mySourceBefore);
+ }
+ }
/* invalid or illegal escape sequence */
if(U_FAILURE(*err)){
args->target = myTarget;
args->source = mySource;
+ myData->isEmptySegment = FALSE; /* Reset to avoid future spurious errors */
return;
}
continue;
/* falls through */
default:
/* convert one or two bytes */
+ myData->isEmptySegment = FALSE;
if(pToU2022State->g != 0) {
if(mySource < mySourceLimit) {
UConverterSharedData *cnv;
/*
**********************************************************************
-* Copyright (C) 2000-2006, International Business Machines
+* Copyright (C) 2000-2006, 2008 International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
* file name: ucnvhz.c
UBool isEscapeAppended;
UBool isStateDBCS;
UBool isTargetUCharDBCS;
+ UBool isEmptySegment;
}UConverterDataHZ;
cnv->mode=0;
if(cnv->extraInfo != NULL){
((UConverterDataHZ*)cnv->extraInfo)->isStateDBCS = FALSE;
+ ((UConverterDataHZ*)cnv->extraInfo)->isEmptySegment = FALSE;
}
}
if(choice!=UCNV_RESET_TO_UNICODE) {
}
*(myTarget++)=(UChar)mySourceChar;
+ myData->isEmptySegment = FALSE;
continue;
case UCNV_TILDE:
if(args->converter->mode ==UCNV_TILDE){
*(myTarget++)=(UChar)mySourceChar;
args->converter->mode=0;
+ myData->isEmptySegment = FALSE;
continue;
}
if(args->converter->mode == UCNV_TILDE){
args->converter->mode=0;
myData->isStateDBCS = TRUE;
+ myData->isEmptySegment = TRUE;
continue;
}
else{
if(args->converter->mode == UCNV_TILDE){
args->converter->mode=0;
myData->isStateDBCS = FALSE;
+ if (myData->isEmptySegment) {
+ myData->isEmptySegment = FALSE; /* we are handling it, reset to avoid future spurious errors */
+ *err = U_PARSE_ERROR; /* temporary err to flag empty segment, will be reset to U_ILLEGAL_ESCAPE_SEQUENCE in _toUnicodeWithCallback */
+ args->converter->toUBytes[0] = UCNV_TILDE;
+ args->converter->toUBytes[1] = mySourceChar;
+ args->converter->toULength = 2;
+ goto EXIT;
+ }
+ myData->isEmptySegment = TRUE;
continue;
}
else{
if(args->converter->mode == UCNV_TILDE){
args->converter->mode=0;
mySourceChar= (UChar)(((UCNV_TILDE+0x80) << 8) | ((mySourceChar & 0x00ff)+0x80));
+ myData->isEmptySegment = FALSE; /* different error here, reset this to avoid spurious future error */
goto SAVE_STATE;
}
}
+ myData->isEmptySegment = FALSE; /* the segment has something, either valid or will produce a different error, so reset this */
if(myData->isStateDBCS){
if(args->converter->toUnicodeStatus == 0x00){
args->converter->toUnicodeStatus = (UChar) mySourceChar;
break;
}
}
-
+EXIT:
args->target = myTarget;
args->source = mySource;
}
/********************************************************************
* COPYRIGHT:
- * Copyright (c) 1997-2006, International Business Machines Corporation and
+ * Copyright (c) 1997-2006,2008 International Business Machines Corporation and
* others. All Rights Reserved.
********************************************************************/
/*******************************************************************************
static void TestCoverageMBCS(void);
static void TestJitterbug2346(void);
static void TestJitterbug2411(void);
+static void TestJitterbug6175(void);
#endif
static void TestRoundTrippingAllUTF(void);
#if !UCONFIG_NO_LEGACY_CONVERSION
addTest(root, &TestJitterbug2346, "tsconv/nucnvtst/TestJitterbug2346");
addTest(root, &TestJitterbug2411, "tsconv/nucnvtst/TestJitterbug2411");
+ addTest(root, &TestJitterbug6175, "tsconv/nucnvtst/TestJitterbug6175");
#endif
-
}
free(offsets);
}
+/* Tests for empty segments in ISO-2022-JP/KR/CN and HZ: check that the toUnicode callback receives UConverterCallbackReason UCNV_IRREGULAR. Each EmptySegmentTest is one converter/input pair. */
+typedef struct {
+ const char * converterName; /* name handed to ucnv_open(); NULL marks the end of the table */
+ const char * inputText; /* raw input bytes; the length is carried separately in inputTextLength */
+ int inputTextLength; /* number of bytes in inputText */
+} EmptySegmentTest;
+
+/* Callback for TestJitterbug6175, should only get called for empty segment errors; logs a test error for any other error reason. context, codeUnits and length are not used. */
+static void UCNV_TO_U_CALLBACK_EMPTYSEGMENT( const void *context, UConverterToUnicodeArgs *toArgs, const char* codeUnits,
+ int32_t length, UConverterCallbackReason reason, UErrorCode * err ) {
+ if (reason > UCNV_IRREGULAR) /* presumably the non-error notifications (reset/close/clone) - confirm against UConverterCallbackReason */
+ return;
+ if (reason != UCNV_IRREGULAR) /* only reasons below UCNV_IRREGULAR reach this point */
+ log_err("toUnicode callback invoked for empty segment but reason is not UCNV_IRREGULAR\n");
+ /* Standard stuff below from UCNV_TO_U_CALLBACK_SUBSTITUTE: clear the incoming error, then write the substitution */
+ *err = U_ZERO_ERROR;
+ ucnv_cbToUWriteSub(toArgs,0,err);
+}
+
+enum { kEmptySegmentToUCharsMax = 64 }; /* capacity, in UChars, of the output buffer in TestJitterbug6175 */
+static void TestJitterbug6175(void) {
+ static const char iso2022jp_a[] = { 0x61, 0x62, 0x1B,0x24,0x42, 0x1B,0x28,0x42, 0x63, 0x64, 0x0D, 0x0A }; /* "ab", two ESC sequences back to back, "cd", CR LF */
+ static const char iso2022kr_a[] = { 0x1B,0x24,0x29,0x43, 0x61, 0x0E, 0x0F, 0x62, 0x0D, 0x0A }; /* ESC sequence, "a", 0x0E directly followed by 0x0F, "b", CR LF */
+ static const char iso2022cn_a[] = { 0x61, 0x1B,0x24,0x29,0x41, 0x62, 0x0E, 0x0F, 0x1B,0x24,0x2A,0x48, 0x1B,0x4E, 0x6A,0x65, 0x63, 0x0D, 0x0A }; /* 0x0E directly followed by 0x0F */
+ static const char iso2022cn_b[] = { 0x61, 0x1B,0x24,0x29,0x41, 0x62, 0x0E, 0x1B,0x24,0x29,0x47, 0x68,0x64, 0x0F, 0x63, 0x0D, 0x0A }; /* 0x0E directly followed by an ESC sequence */
+ static const char hzGB2312_a[] = { 0x61, 0x62, 0x7E,0x7B, 0x7E,0x7D, 0x63, 0x64 }; /* "ab", "~{" directly followed by "~}", "cd" */
+ static const EmptySegmentTest emptySegmentTests[] = {
+ /* converterName inputText inputTextLength */
+ { "ISO-2022-JP", iso2022jp_a, sizeof(iso2022jp_a) },
+ { "ISO-2022-KR", iso2022kr_a, sizeof(iso2022kr_a) },
+ { "ISO-2022-CN", iso2022cn_a, sizeof(iso2022cn_a) },
+ { "ISO-2022-CN", iso2022cn_b, sizeof(iso2022cn_b) },
+ { "HZ-GB-2312", hzGB2312_a, sizeof(hzGB2312_a) },
+ /* terminator: a NULL converterName ends the loop below */
+ { NULL, NULL, 0, }
+ };
+ const EmptySegmentTest * testPtr;
+ for (testPtr = emptySegmentTests; testPtr->converterName != NULL; ++testPtr) {
+ UErrorCode err = U_ZERO_ERROR;
+ UConverter * cnv = ucnv_open(testPtr->converterName, &err);
+ if (U_FAILURE(err)) {
+ log_data_err("Unable to open %s converter: %s\n", testPtr->converterName, u_errorName(err));
+ return; /* the remaining table entries are not run either */
+ }
+ ucnv_setToUCallBack(cnv, UCNV_TO_U_CALLBACK_EMPTYSEGMENT, NULL, NULL, NULL, &err);
+ if (U_FAILURE(err)) {
+ log_data_err("Unable to setToUCallBack for %s converter: %s\n", testPtr->converterName, u_errorName(err));
+ ucnv_close(cnv);
+ return; /* the remaining table entries are not run either */
+ }
+ {
+ UChar toUChars[kEmptySegmentToUCharsMax];
+ UChar * toUCharsPtr = toUChars;
+ const UChar * toUCharsLimit = toUCharsPtr + kEmptySegmentToUCharsMax;
+ const char * inCharsPtr = testPtr->inputText;
+ const char * inCharsLimit = inCharsPtr + testPtr->inputTextLength;
+ ucnv_toUnicode(cnv, &toUCharsPtr, toUCharsLimit, &inCharsPtr, inCharsLimit, NULL, TRUE, &err); /* NOTE(review): neither err nor the converted output is inspected afterwards; the only check is the log_err inside the callback */
+ }
+ ucnv_close(cnv);
+ }
+}
+
static void
TestEBCDIC_STATEFUL() {
/* test input */
:intvector{ 0, 5, 7, 9, 9, 9, 9, 9, 9, 9, 9, 12 },
:int{1}, :int{1}, "", "&", :bin{""}
}
+ // empty segment (using substitution and stop)
+ {
+ "ISO-2022-KR",
+ :bin{ 1b242943610e0f620d0a },
+ "a\uFFFDb\u000D\u000A",
+ :intvector{ 4, 6, 7, 8, 9 },
+ :int{1}, :int{1}, "", "?", :bin{""}
+ }
+ {
+ "ISO-2022-KR",
+ :bin{ 1b242943610e0f620d0a },
+ "a",
+ :intvector{ 4 },
+ :int{1}, :int{1}, "illesc", ".", :bin{"0f"}
+ }
// ISO-2022-JP
:bin{ 41c15c1b284a5cc242 }, "A\uff81\\\xa5\uff82B", :intvector{ 0, 1, 2, 6, 7, 8 },
:int{1}, :int{1}, "", ".", :bin{""}
}
+ // empty segment (using substitution and stop)
+ {
+ "ISO-2022-JP",
+ :bin{ 61621b24421b284263640d0a },
+ "ab\uFFFDcd\u000D\u000A",
+ :intvector{ 0, 1, 5, 8, 9, 10, 11 },
+ :int{1}, :int{1}, "", "?", :bin{""}
+ }
+ {
+ "ISO-2022-JP",
+ :bin{ 61621b24421b284263640d0a },
+ "ab",
+ :intvector{ 0, 1 },
+ :int{1}, :int{1}, "illesc", ".", :bin{"1b2842"}
+ }
// ISO-2022-CN
:bin{ 411b242b491b4f2121 }, "\x41", :intvector{ 0 },
:int{1}, :int{1}, "unsuppesc", ".", :bin{ 1b242b49 }
}
+ // empty segment 1 (using substitution and stop)
+ {
+ "ISO-2022-CN",
+ :bin{ 611b242941620e0f1b242a481b4e6a65630d0a },
+ "ab\uFFFD\u994Cc\u000D\u000A",
+ :intvector{ 0, 5, 7, 14, 16, 17, 18 },
+ :int{1}, :int{1}, "", "?", :bin{""}
+ }
+ {
+ "ISO-2022-CN",
+ :bin{ 611b242941620e0f1b242a481b4e6a65630d0a },
+ "ab",
+ :intvector{ 0, 5 },
+ :int{1}, :int{1}, "illesc", ".", :bin{"0f"}
+ }
+ // empty segment 2 (using substitution and stop)
+ {
+ "ISO-2022-CN",
+ :bin{ 611b242941620e1b24294768640f630d0a },
+ "ab\uFFFD\u5F70c\u000D\u000A",
+ :intvector{ 0, 5, 7, 11, 14, 15, 16 },
+ :int{1}, :int{1}, "", "?", :bin{""}
+ }
+ {
+ "ISO-2022-CN",
+ :bin{ 611b242941620e1b24294768640f630d0a },
+ "ab",
+ :intvector{ 0, 5 },
+ :int{1}, :int{1}, "illesc", ".", :bin{"1b242947"}
+ }
// ISO-2022 SBCS
// [U_ENABLE_GENERIC_ISO_2022]
// :int{1}, :int{1}, "", ".", :bin{""}
//}
+ // HZ-GB-2312
+
+ // empty segment (using substitution and stop)
+ {
+ "HZ-GB-2312",
+ :bin{ 61627e7b7e7d6364 },
+ "ab\uFFFDcd",
+ :intvector{ 0, 1, 4, 6, 7 },
+ :int{1}, :int{1}, "", "?", :bin{""}
+ }
+ {
+ "HZ-GB-2312",
+ :bin{ 61627e7b7e7d63640d0a },
+ "ab",
+ :intvector{ 0, 1 },
+ :int{1}, :int{1}, "illesc", ".", :bin{"7e7d"}
+ }
+
// DBCS-only extensions
{
"ibm-970",