From: Apple Date: Fri, 7 Mar 2008 01:20:00 +0000 (+0000) Subject: ICU-8.11.1.tar.gz X-Git-Tag: mac-os-x-1053^0 X-Git-Url: https://git.saurik.com/apple/icu.git/commitdiff_plain/d5d484b0fbe924d3663b177965538d517ee412c1?ds=inline ICU-8.11.1.tar.gz --- diff --git a/icuSources/common/ucnv.c b/icuSources/common/ucnv.c index ff6eebaf..f764f361 100644 --- a/icuSources/common/ucnv.c +++ b/icuSources/common/ucnv.c @@ -1,7 +1,7 @@ /* ****************************************************************************** * -* Copyright (C) 1998-2006, International Business Machines +* Copyright (C) 1998-2006,2008 International Business Machines * Corporation and others. All Rights Reserved. * ****************************************************************************** @@ -1429,7 +1429,8 @@ _toUnicodeWithCallback(UConverterToUnicodeArgs *pArgs, UErrorCode *err) { e!=U_ILLEGAL_CHAR_FOUND && e!=U_TRUNCATED_CHAR_FOUND && e!=U_ILLEGAL_ESCAPE_SEQUENCE && - e!=U_UNSUPPORTED_ESCAPE_SEQUENCE) + e!=U_UNSUPPORTED_ESCAPE_SEQUENCE && + e!=U_PARSE_ERROR) /* temporary err to flag empty segment, will be reset to U_ILLEGAL_ESCAPE_SEQUENCE below */ ) { /* * the callback did not or cannot resolve the error: @@ -1473,11 +1474,18 @@ _toUnicodeWithCallback(UConverterToUnicodeArgs *pArgs, UErrorCode *err) { cnv->toULength=0; /* call the callback function */ - cnv->fromCharErrorBehaviour(cnv->toUContext, pArgs, - cnv->invalidCharBuffer, errorInputLength, - (*err==U_INVALID_CHAR_FOUND || *err==U_UNSUPPORTED_ESCAPE_SEQUENCE) ? - UCNV_UNASSIGNED : UCNV_ILLEGAL, - err); + { + UConverterCallbackReason reason; + if (*err == U_PARSE_ERROR) { /* Here U_PARSE_ERROR indicates empty segment */ + *err = U_ILLEGAL_ESCAPE_SEQUENCE; + reason = UCNV_IRREGULAR; + } else { + reason = (*err==U_INVALID_CHAR_FOUND || *err==U_UNSUPPORTED_ESCAPE_SEQUENCE) ? 
+ UCNV_UNASSIGNED : UCNV_ILLEGAL; + } + cnv->fromCharErrorBehaviour(cnv->toUContext, pArgs, + cnv->invalidCharBuffer, errorInputLength, reason, err); + } /* * loop back to the offset handling diff --git a/icuSources/common/ucnv2022.c b/icuSources/common/ucnv2022.c index 9dc1c283..cd83c69c 100644 --- a/icuSources/common/ucnv2022.c +++ b/icuSources/common/ucnv2022.c @@ -1,6 +1,6 @@ /* ********************************************************************** -* Copyright (C) 2000-2006, International Business Machines +* Copyright (C) 2000-2006,2008 International Business Machines * Corporation and others. All Rights Reserved. ********************************************************************** * file name: ucnv2022.c @@ -181,6 +181,7 @@ typedef struct{ #ifdef U_ENABLE_GENERIC_ISO_2022 UBool isFirstBuffer; #endif + UBool isEmptySegment; char name[30]; char locale[3]; }UConverterDataISO2022; @@ -590,6 +591,7 @@ _ISO2022Reset(UConverter *converter, UConverterResetChoice choice) { if(choice<=UCNV_RESET_TO_UNICODE) { uprv_memset(&myConverterData->toU2022State, 0, sizeof(ISO2022State)); myConverterData->key = 0; + myConverterData->isEmptySegment = FALSE; } if(choice!=UCNV_RESET_TO_UNICODE) { uprv_memset(&myConverterData->fromU2022State, 0, sizeof(ISO2022State)); @@ -1705,6 +1707,7 @@ UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterToUnicodeArgs *args, continue; } else { /* only JIS7 uses SI/SO, not ISO-2022-JP-x */ + myData->isEmptySegment = FALSE; /* reset this, we have a different error */ break; } @@ -1716,21 +1719,38 @@ UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterToUnicodeArgs *args, continue; } else { /* only JIS7 uses SI/SO, not ISO-2022-JP-x */ + myData->isEmptySegment = FALSE; /* reset this, we have a different error */ break; } case ESC_2022: mySource--; escape: - changeState_2022(args->converter,&(mySource), - mySourceLimit, ISO_2022_JP,err); + { + const char * mySourceBefore = mySource; + int8_t toULengthBefore = args->converter->toULength; + 
+ changeState_2022(args->converter,&(mySource), + mySourceLimit, ISO_2022_JP,err); + + /* If in ISO-2022-JP only and we successfully completed an escape sequence, but previous segment was empty, create an error */ + if ( myData->version == 0 && myData->key == 0 && U_SUCCESS(*err) && myData->isEmptySegment ) { + *err = U_PARSE_ERROR; /* temporary err to flag empty segment, will be reset to U_ILLEGAL_ESCAPE_SEQUENCE in _toUnicodeWithCallback */ + args->converter->toULength = toULengthBefore + (mySource - mySourceBefore); + } + } /* invalid or illegal escape sequence */ if(U_FAILURE(*err)){ args->target = myTarget; args->source = mySource; + myData->isEmptySegment = FALSE; /* Reset to avoid future spurious errors */ return; } + /* If we successfully completed an escape sequence, we begin a new segment, empty so far */ + if (myData->key == 0) { + myData->isEmptySegment = TRUE; + } continue; /* ISO-2022-JP does not use single-byte (C1) SS2 and SS3 */ @@ -1747,6 +1767,7 @@ escape: /* falls through */ default: /* convert one or two bytes */ + myData->isEmptySegment = FALSE; cs = (StateEnum)pToU2022State->cs[pToU2022State->g]; if( (uint8_t)(mySourceChar - 0xa1) <= (0xdf - 0xa1) && myData->version==4 && !IS_JP_DBCS(cs) @@ -2240,15 +2261,26 @@ UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterToUnicodeArgs *args, if(mySourceChar==UCNV_SI){ myData->toU2022State.g = 0; + if (myData->isEmptySegment) { + myData->isEmptySegment = FALSE; /* we are handling it, reset to avoid future spurious errors */ + *err = U_PARSE_ERROR; /* temporary err to flag empty segment, will be reset to U_ILLEGAL_ESCAPE_SEQUENCE in _toUnicodeWithCallback */ + args->converter->toUBytes[0] = mySourceChar; + args->converter->toULength = 1; + args->target = myTarget; + args->source = mySource; + return; + } /*consume the source */ continue; }else if(mySourceChar==UCNV_SO){ myData->toU2022State.g = 1; + myData->isEmptySegment = TRUE; /* Begin a new segment, empty so far */ /*consume the source */ 
continue; }else if(mySourceChar==ESC_2022){ mySource--; escape: + myData->isEmptySegment = FALSE; /* Any invalid ESC sequences will be detected separately, so just reset this */ changeState_2022(args->converter,&(mySource), mySourceLimit, ISO_2022_KR, err); if(U_FAILURE(*err)){ @@ -2259,6 +2291,7 @@ escape: continue; } + myData->isEmptySegment = FALSE; /* Any invalid char errors will be detected separately, so just reset this */ if(myData->toU2022State.g == 1) { if(mySource < mySourceLimit) { char trailByte; @@ -2759,27 +2792,50 @@ UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterToUnicodeArgs *args, switch(mySourceChar){ case UCNV_SI: pToU2022State->g=0; + if (myData->isEmptySegment) { + myData->isEmptySegment = FALSE; /* we are handling it, reset to avoid future spurious errors */ + *err = U_PARSE_ERROR; /* temporary err to flag empty segment, will be reset to U_ILLEGAL_ESCAPE_SEQUENCE in _toUnicodeWithCallback */ + args->converter->toUBytes[0] = mySourceChar; + args->converter->toULength = 1; + args->target = myTarget; + args->source = mySource; + return; + } continue; case UCNV_SO: if(pToU2022State->cs[1] != 0) { pToU2022State->g=1; + myData->isEmptySegment = TRUE; /* Begin a new segment, empty so far */ continue; } else { /* illegal to have SO before a matching designator */ + myData->isEmptySegment = FALSE; /* Handling a different error, reset this to avoid future spurious errs */ break; } case ESC_2022: mySource--; escape: - changeState_2022(args->converter,&(mySource), - mySourceLimit, ISO_2022_CN,err); + { + const char * mySourceBefore = mySource; + int8_t toULengthBefore = args->converter->toULength; + + changeState_2022(args->converter,&(mySource), + mySourceLimit, ISO_2022_CN,err); + + /* After SO there must be at least one character before a designator (designator error handled separately) */ + if ( myData->key == 0 && U_SUCCESS(*err) && myData->isEmptySegment ) { + *err = U_PARSE_ERROR; /* temporary err to flag empty segment, will be reset to 
U_ILLEGAL_ESCAPE_SEQUENCE in _toUnicodeWithCallback */ + args->converter->toULength = toULengthBefore + (mySource - mySourceBefore); + } + } /* invalid or illegal escape sequence */ if(U_FAILURE(*err)){ args->target = myTarget; args->source = mySource; + myData->isEmptySegment = FALSE; /* Reset to avoid future spurious errors */ return; } continue; @@ -2793,6 +2849,7 @@ escape: /* falls through */ default: /* convert one or two bytes */ + myData->isEmptySegment = FALSE; if(pToU2022State->g != 0) { if(mySource < mySourceLimit) { UConverterSharedData *cnv; diff --git a/icuSources/common/ucnvhz.c b/icuSources/common/ucnvhz.c index b94811b8..c3f63fca 100644 --- a/icuSources/common/ucnvhz.c +++ b/icuSources/common/ucnvhz.c @@ -1,6 +1,6 @@ /* ********************************************************************** -* Copyright (C) 2000-2006, International Business Machines +* Copyright (C) 2000-2006, 2008 International Business Machines * Corporation and others. All Rights Reserved. ********************************************************************** * file name: ucnvhz.c @@ -59,6 +59,7 @@ typedef struct{ UBool isEscapeAppended; UBool isStateDBCS; UBool isTargetUCharDBCS; + UBool isEmptySegment; }UConverterDataHZ; @@ -98,6 +99,7 @@ _HZReset(UConverter *cnv, UConverterResetChoice choice){ cnv->mode=0; if(cnv->extraInfo != NULL){ ((UConverterDataHZ*)cnv->extraInfo)->isStateDBCS = FALSE; + ((UConverterDataHZ*)cnv->extraInfo)->isEmptySegment = FALSE; } } if(choice!=UCNV_RESET_TO_UNICODE) { @@ -163,12 +165,14 @@ UConverter_toUnicode_HZ_OFFSETS_LOGIC(UConverterToUnicodeArgs *args, } *(myTarget++)=(UChar)mySourceChar; + myData->isEmptySegment = FALSE; continue; case UCNV_TILDE: if(args->converter->mode ==UCNV_TILDE){ *(myTarget++)=(UChar)mySourceChar; args->converter->mode=0; + myData->isEmptySegment = FALSE; continue; } @@ -186,6 +190,7 @@ UConverter_toUnicode_HZ_OFFSETS_LOGIC(UConverterToUnicodeArgs *args, if(args->converter->mode == UCNV_TILDE){ args->converter->mode=0; 
myData->isStateDBCS = TRUE; + myData->isEmptySegment = TRUE; continue; } else{ @@ -197,6 +202,15 @@ UConverter_toUnicode_HZ_OFFSETS_LOGIC(UConverterToUnicodeArgs *args, if(args->converter->mode == UCNV_TILDE){ args->converter->mode=0; myData->isStateDBCS = FALSE; + if (myData->isEmptySegment) { + myData->isEmptySegment = FALSE; /* we are handling it, reset to avoid future spurious errors */ + *err = U_PARSE_ERROR; /* temporary err to flag empty segment, will be reset to U_ILLEGAL_ESCAPE_SEQUENCE in _toUnicodeWithCallback */ + args->converter->toUBytes[0] = UCNV_TILDE; + args->converter->toUBytes[1] = mySourceChar; + args->converter->toULength = 2; + goto EXIT; + } + myData->isEmptySegment = TRUE; continue; } else{ @@ -210,6 +224,7 @@ UConverter_toUnicode_HZ_OFFSETS_LOGIC(UConverterToUnicodeArgs *args, if(args->converter->mode == UCNV_TILDE){ args->converter->mode=0; mySourceChar= (UChar)(((UCNV_TILDE+0x80) << 8) | ((mySourceChar & 0x00ff)+0x80)); + myData->isEmptySegment = FALSE; /* different error here, reset this to avoid spurious future error */ goto SAVE_STATE; } @@ -217,6 +232,7 @@ UConverter_toUnicode_HZ_OFFSETS_LOGIC(UConverterToUnicodeArgs *args, } + myData->isEmptySegment = FALSE; /* the segment has something, either valid or will produce a different error, so reset this */ if(myData->isStateDBCS){ if(args->converter->toUnicodeStatus == 0x00){ args->converter->toUnicodeStatus = (UChar) mySourceChar; @@ -281,7 +297,7 @@ SAVE_STATE: break; } } - +EXIT: args->target = myTarget; args->source = mySource; } diff --git a/icuSources/test/cintltst/nucnvtst.c b/icuSources/test/cintltst/nucnvtst.c index baecb458..6791a1eb 100644 --- a/icuSources/test/cintltst/nucnvtst.c +++ b/icuSources/test/cintltst/nucnvtst.c @@ -1,6 +1,6 @@ /******************************************************************** * COPYRIGHT: - * Copyright (c) 1997-2006, International Business Machines Corporation and + * Copyright (c) 1997-2006,2008 International Business Machines Corporation and * 
others. All Rights Reserved. ********************************************************************/ /******************************************************************************* @@ -79,6 +79,7 @@ static void TestISCII(void); static void TestCoverageMBCS(void); static void TestJitterbug2346(void); static void TestJitterbug2411(void); +static void TestJitterbug6175(void); #endif static void TestRoundTrippingAllUTF(void); @@ -294,8 +295,8 @@ void addTestNewConvert(TestNode** root) #if !UCONFIG_NO_LEGACY_CONVERSION addTest(root, &TestJitterbug2346, "tsconv/nucnvtst/TestJitterbug2346"); addTest(root, &TestJitterbug2411, "tsconv/nucnvtst/TestJitterbug2411"); + addTest(root, &TestJitterbug6175, "tsconv/nucnvtst/TestJitterbug6175"); #endif - } @@ -4454,6 +4455,68 @@ TestISO_2022_CN() { free(offsets); } +/* Tests for empty segments in ISO-2022-JP/KR/CN, HZ, check that UConverterCallbackReason is UCNV_IRREGULAR */ +typedef struct { + const char * converterName; + const char * inputText; + int inputTextLength; +} EmptySegmentTest; + +/* Callback for TestJitterbug6175, should only get called for empty segment errors */ +static void UCNV_TO_U_CALLBACK_EMPTYSEGMENT( const void *context, UConverterToUnicodeArgs *toArgs, const char* codeUnits, + int32_t length, UConverterCallbackReason reason, UErrorCode * err ) { + if (reason > UCNV_IRREGULAR) + return; + if (reason != UCNV_IRREGULAR) + log_err("toUnicode callback invoked for empty segment but reason is not UCNV_IRREGULAR\n"); + /* Standard stuff below from UCNV_TO_U_CALLBACK_SUBSTITUTE */ + *err = U_ZERO_ERROR; + ucnv_cbToUWriteSub(toArgs,0,err); +} + +enum { kEmptySegmentToUCharsMax = 64 }; +static void TestJitterbug6175(void) { + static const char iso2022jp_a[] = { 0x61, 0x62, 0x1B,0x24,0x42, 0x1B,0x28,0x42, 0x63, 0x64, 0x0D, 0x0A }; + static const char iso2022kr_a[] = { 0x1B,0x24,0x29,0x43, 0x61, 0x0E, 0x0F, 0x62, 0x0D, 0x0A }; + static const char iso2022cn_a[] = { 0x61, 0x1B,0x24,0x29,0x41, 0x62, 0x0E, 0x0F, 
0x1B,0x24,0x2A,0x48, 0x1B,0x4E, 0x6A,0x65, 0x63, 0x0D, 0x0A }; + static const char iso2022cn_b[] = { 0x61, 0x1B,0x24,0x29,0x41, 0x62, 0x0E, 0x1B,0x24,0x29,0x47, 0x68,0x64, 0x0F, 0x63, 0x0D, 0x0A }; + static const char hzGB2312_a[] = { 0x61, 0x62, 0x7E,0x7B, 0x7E,0x7D, 0x63, 0x64 }; + static const EmptySegmentTest emptySegmentTests[] = { + /* converterName inputText inputTextLength */ + { "ISO-2022-JP", iso2022jp_a, sizeof(iso2022jp_a) }, + { "ISO-2022-KR", iso2022kr_a, sizeof(iso2022kr_a) }, + { "ISO-2022-CN", iso2022cn_a, sizeof(iso2022cn_a) }, + { "ISO-2022-CN", iso2022cn_b, sizeof(iso2022cn_b) }, + { "HZ-GB-2312", hzGB2312_a, sizeof(hzGB2312_a) }, + /* terminator: */ + { NULL, NULL, 0, } + }; + const EmptySegmentTest * testPtr; + for (testPtr = emptySegmentTests; testPtr->converterName != NULL; ++testPtr) { + UErrorCode err = U_ZERO_ERROR; + UConverter * cnv = ucnv_open(testPtr->converterName, &err); + if (U_FAILURE(err)) { + log_data_err("Unable to open %s converter: %s\n", testPtr->converterName, u_errorName(err)); + return; + } + ucnv_setToUCallBack(cnv, UCNV_TO_U_CALLBACK_EMPTYSEGMENT, NULL, NULL, NULL, &err); + if (U_FAILURE(err)) { + log_data_err("Unable to setToUCallBack for %s converter: %s\n", testPtr->converterName, u_errorName(err)); + ucnv_close(cnv); + return; + } + { + UChar toUChars[kEmptySegmentToUCharsMax]; + UChar * toUCharsPtr = toUChars; + const UChar * toUCharsLimit = toUCharsPtr + kEmptySegmentToUCharsMax; + const char * inCharsPtr = testPtr->inputText; + const char * inCharsLimit = inCharsPtr + testPtr->inputTextLength; + ucnv_toUnicode(cnv, &toUCharsPtr, toUCharsLimit, &inCharsPtr, inCharsLimit, NULL, TRUE, &err); + } + ucnv_close(cnv); + } +} + static void TestEBCDIC_STATEFUL() { /* test input */ diff --git a/icuSources/test/testdata/conversion.txt b/icuSources/test/testdata/conversion.txt index 4e94f471..a8cd8a90 100644 --- a/icuSources/test/testdata/conversion.txt +++ b/icuSources/test/testdata/conversion.txt @@ -182,6 +182,21 @@ 
conversion:table(nofallback) { :intvector{ 0, 5, 7, 9, 9, 9, 9, 9, 9, 9, 9, 12 }, :int{1}, :int{1}, "", "&", :bin{""} } + // empty segment (using substitution and stop) + { + "ISO-2022-KR", + :bin{ 1b242943610e0f620d0a }, + "a\uFFFDb\u000D\u000A", + :intvector{ 4, 6, 7, 8, 9 }, + :int{1}, :int{1}, "", "?", :bin{""} + } + { + "ISO-2022-KR", + :bin{ 1b242943610e0f620d0a }, + "a", + :intvector{ 4 }, + :int{1}, :int{1}, "illesc", ".", :bin{"0f"} + } // ISO-2022-JP @@ -232,6 +247,21 @@ conversion:table(nofallback) { :bin{ 41c15c1b284a5cc242 }, "A\uff81\\\xa5\uff82B", :intvector{ 0, 1, 2, 6, 7, 8 }, :int{1}, :int{1}, "", ".", :bin{""} } + // empty segment (using substitution and stop) + { + "ISO-2022-JP", + :bin{ 61621b24421b284263640d0a }, + "ab\uFFFDcd\u000D\u000A", + :intvector{ 0, 1, 5, 8, 9, 10, 11 }, + :int{1}, :int{1}, "", "?", :bin{""} + } + { + "ISO-2022-JP", + :bin{ 61621b24421b284263640d0a }, + "ab", + :intvector{ 0, 1 }, + :int{1}, :int{1}, "illesc", ".", :bin{"1b2842"} + } // ISO-2022-CN @@ -302,6 +332,36 @@ conversion:table(nofallback) { :bin{ 411b242b491b4f2121 }, "\x41", :intvector{ 0 }, :int{1}, :int{1}, "unsuppesc", ".", :bin{ 1b242b49 } } + // empty segment 1 (using substitution and stop) + { + "ISO-2022-CN", + :bin{ 611b242941620e0f1b242a481b4e6a65630d0a }, + "ab\uFFFD\u994Cc\u000D\u000A", + :intvector{ 0, 5, 7, 14, 16, 17, 18 }, + :int{1}, :int{1}, "", "?", :bin{""} + } + { + "ISO-2022-CN", + :bin{ 611b242941620e0f1b242a481b4e6a65630d0a }, + "ab", + :intvector{ 0, 5 }, + :int{1}, :int{1}, "illesc", ".", :bin{"0f"} + } + // empty segment 2 (using substitution and stop) + { + "ISO-2022-CN", + :bin{ 611b242941620e1b24294768640f630d0a }, + "ab\uFFFD\u5F70c\u000D\u000A", + :intvector{ 0, 5, 7, 11, 14, 15, 16 }, + :int{1}, :int{1}, "", "?", :bin{""} + } + { + "ISO-2022-CN", + :bin{ 611b242941620e1b24294768640f630d0a }, + "ab", + :intvector{ 0, 5 }, + :int{1}, :int{1}, "illesc", ".", :bin{"1b242947"} + } // ISO-2022 SBCS // [U_ENABLE_GENERIC_ISO_2022] @@ 
-316,6 +376,24 @@ conversion:table(nofallback) { // :int{1}, :int{1}, "", ".", :bin{""} //} + // HZ-GB-2312 + + // empty segment (using substitution and stop) + { + "HZ-GB-2312", + :bin{ 61627e7b7e7d6364 }, + "ab\uFFFDcd", + :intvector{ 0, 1, 4, 6, 7 }, + :int{1}, :int{1}, "", "?", :bin{""} + } + { + "HZ-GB-2312", + :bin{ 61627e7b7e7d63640d0a }, + "ab", + :intvector{ 0, 1 }, + :int{1}, :int{1}, "illesc", ".", :bin{"7e7d"} + } + // DBCS-only extensions { "ibm-970",