+/*
+ * Test illegal UTF-8 input: Data and functions for TestConvertExFromUTF8().
+ *
+ * Each entry is one ill-formed byte sequence, stored as a NUL-terminated C string;
+ * the tests take its length with strlen(), so no entry may contain a 00 byte.
+ * testFromBadUTF8() feeds every entry to the converter, while
+ * testFromTruncatedUTF8() uses only the entries that isOneTruncatedUTF8() accepts.
+ */
+static const char *const badUTF8[]={
+ /* trail byte */
+ "\x80",
+
+ /* truncated multi-byte sequences */
+ /* lead byte only (F8 and FC start the obsolete 5- and 6-byte forms) */
+ "\xd0",
+ "\xe0",
+ "\xe1",
+ "\xed",
+ "\xee",
+ "\xf0",
+ "\xf1",
+ "\xf4",
+ "\xf8",
+ "\xfc",
+
+ /* lead byte plus one more byte */
+ "\xe0\x80",
+ "\xe0\xa0",
+ "\xe1\x80",
+ "\xed\x80",
+ "\xed\xa0",
+ "\xee\x80",
+ "\xf0\x80",
+ "\xf0\x90",
+ "\xf1\x80",
+ "\xf4\x80",
+ "\xf4\x90",
+ "\xf8\x80",
+ "\xfc\x80",
+
+ /* lead byte plus two more bytes */
+ "\xf0\x80\x80",
+ "\xf0\x90\x80",
+ "\xf1\x80\x80",
+ "\xf4\x80\x80",
+ "\xf4\x90\x80",
+ "\xf8\x80\x80",
+ "\xfc\x80\x80",
+
+ /* lead byte plus three more bytes */
+ "\xf8\x80\x80\x80",
+ "\xfc\x80\x80\x80",
+
+ /* lead byte plus four more bytes */
+ "\xfc\x80\x80\x80\x80",
+
+ /* complete sequences but non-shortest forms or out of range etc. */
+ "\xc0\x80",
+ "\xe0\x80\x80",
+ "\xed\xa0\x80",
+ "\xf0\x80\x80\x80",
+ "\xf4\x90\x80\x80",
+ "\xf8\x80\x80\x80\x80",
+ "\xfc\x80\x80\x80\x80\x80",
+ /* FE and FF never occur in UTF-8 */
+ "\xfe",
+ "\xff"
+};
+
+/* Capacity of the char0/char1 output buffers filled by getTestChar(). */
+#define ARG_CHAR_ARR_SIZE 8
+
+/*
+ * Get some character that can be converted and convert it.
+ *
+ * Picks the code point in the middle of the converter's roundtrip set and writes
+ * - its UTF-8 form to charUTF8 (length in *pCharUTF8Length),
+ * - the result of a first ucnv_fromUnicode() call to char0 (*pChar0Length),
+ * - the result of a second ucnv_fromUnicode() call, made without resetting
+ *   the converter in between, to char1 (*pChar1Length).
+ *
+ * Returns TRUE on success. On failure an error is logged, FALSE is returned,
+ * and the caller must not use the output buffers; all three lengths are 0
+ * unless one of the ucnv_fromUnicode() calls was the step that failed.
+ */
+static UBool getTestChar(UConverter *cnv, const char *converterName,
+                         char charUTF8[4], int32_t *pCharUTF8Length,
+                         char char0[ARG_CHAR_ARR_SIZE], int32_t *pChar0Length,
+                         char char1[ARG_CHAR_ARR_SIZE], int32_t *pChar1Length) {
+    UChar utf16[U16_MAX_LENGTH];
+    int32_t utf16Length;
+
+    const UChar *utf16Source;
+    char *target;
+
+    USet *set;
+    UChar32 c;
+    UErrorCode errorCode;
+
+    /* Give the outputs defined values on the early failure paths. */
+    *pCharUTF8Length=0;
+    *pChar0Length=0;
+    *pChar1Length=0;
+
+    errorCode=U_ZERO_ERROR;
+    set=uset_open(1, 0); /* start>end: begin with an empty set */
+    ucnv_getUnicodeSet(cnv, set, UCNV_ROUNDTRIP_SET, &errorCode);
+    if(U_FAILURE(errorCode)) {
+        /* Do not query a set that was never filled in. */
+        uset_close(set);
+        log_err("unable to get test character for %s - %s\n", converterName, u_errorName(errorCode));
+        return FALSE;
+    }
+    c=uset_charAt(set, uset_size(set)/2);
+    uset_close(set);
+    if(c<0) {
+        /*
+         * uset_charAt() reports an out-of-range index (here: an empty set) with a
+         * negative value, which must not reach the _UNSAFE append macros below.
+         */
+        log_err("unable to get test character for %s - empty roundtrip set\n", converterName);
+        return FALSE;
+    }
+
+    utf16Length=0;
+    U16_APPEND_UNSAFE(utf16, utf16Length, c);
+    U8_APPEND_UNSAFE(charUTF8, *pCharUTF8Length, c);
+
+    /* first conversion, right after the converter was handed to us */
+    utf16Source=utf16;
+    target=char0;
+    ucnv_fromUnicode(cnv,
+                     &target, char0+ARG_CHAR_ARR_SIZE,
+                     &utf16Source, utf16+utf16Length,
+                     NULL, FALSE, &errorCode);
+    *pChar0Length=(int32_t)(target-char0);
+
+    /* second conversion of the same character, continuing from the first one */
+    utf16Source=utf16;
+    target=char1;
+    ucnv_fromUnicode(cnv,
+                     &target, char1+ARG_CHAR_ARR_SIZE,
+                     &utf16Source, utf16+utf16Length,
+                     NULL, FALSE, &errorCode);
+    *pChar1Length=(int32_t)(target-char1);
+
+    /* One check covers both conversions; errorCode is shared by the two calls. */
+    if(U_FAILURE(errorCode)) {
+        log_err("unable to get test character for %s - %s\n", converterName, u_errorName(errorCode));
+        return FALSE;
+    }
+    return TRUE;
+}
+
+/*
+ * Does s[0..length[ consist of exactly one truncated UTF-8 sequence?
+ * A single byte qualifies if it is a lead byte. A longer string qualifies if it
+ * has no more bytes than the lead byte's trail count and U8_FWD_1() steps
+ * over all of them in one go.
+ */
+static UBool isOneTruncatedUTF8(const char *s, int32_t length) {
+    int32_t trailCount;
+    int32_t consumed;
+
+    switch(length) {
+    case 0:
+        return FALSE;
+    case 1:
+        return U8_IS_LEAD(s[0]);
+    default:
+        break;
+    }
+
+    trailCount=U8_COUNT_TRAIL_BYTES(s[0]);
+    if(length>trailCount) {
+        /* As many bytes as the lead byte asks for, or more: not a truncation. */
+        return FALSE;
+    }
+
+    /*
+     * 2 or more bytes, but fewer than the lead byte indicates.
+     * Truncated only if one step forward reaches the end of the string.
+     * That fails when the lead byte and first trail byte do not start
+     * a valid sequence, e.g., E0 80 -> consumed=1.
+     */
+    consumed=0;
+    U8_FWD_1(s, consumed, length);
+    return consumed==length;
+}
+
+/*
+ * For every badUTF8[] entry that isOneTruncatedUTF8() accepts: convert
+ * "test character + truncated sequence" from UTF-8 to cnv with flush=TRUE and
+ * verify that the conversion stops with U_TRUNCATED_CHAR_FOUND, that the pivot
+ * source pointer is still at the start of the pivot buffer, and that
+ * ucnv_getInvalidChars() on the UTF-8 converter returns exactly the truncated bytes.
+ * The UTF-8 converter's toUnicode callback is set to UCNV_TO_U_CALLBACK_STOP.
+ * char0/char1 and their lengths are accepted but not used here.
+ */
+static void testFromTruncatedUTF8(UConverter *utf8Cnv, UConverter *cnv, const char *converterName,
+                                  char charUTF8[4], int32_t charUTF8Length,
+                                  char char0[8], int32_t char0Length,
+                                  char char1[8], int32_t char1Length) {
+    char input[16];
+    char converted[16];
+    char rejected[8];
+    UChar pivot[8];
+
+    UErrorCode status=U_ZERO_ERROR;
+    int32_t idx;
+
+    ucnv_setToUCallBack(utf8Cnv, UCNV_TO_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
+
+    /* The test character is a fixed prefix; only the tail changes per iteration. */
+    memcpy(input, charUTF8, charUTF8Length);
+
+    for(idx=0; idx<UPRV_LENGTHOF(badUTF8); ++idx) {
+        const char *bad=badUTF8[idx];
+        int32_t badLength=(int32_t)strlen(bad);
+
+        const char *src=input;
+        char *dst=converted;
+        UChar *pivotRead=pivot, *pivotWrite=pivot;
+        int8_t rejectedLength=(int8_t)sizeof(rejected);
+
+        if(isOneTruncatedUTF8(bad, badLength)) {
+            /* put the truncated sequence right behind the test character */
+            memcpy(input+charUTF8Length, bad, badLength);
+
+            status=U_ZERO_ERROR;
+            ucnv_convertEx(cnv, utf8Cnv,
+                           &dst, converted+sizeof(converted),
+                           &src, input+charUTF8Length+badLength,
+                           pivot, &pivotRead, &pivotWrite, pivot+UPRV_LENGTHOF(pivot),
+                           TRUE, TRUE, /* reset & flush */
+                           &status);
+
+            if(status==U_TRUNCATED_CHAR_FOUND && pivotRead==pivot) {
+                /* the bytes reported as invalid must be the truncated sequence itself */
+                status=U_ZERO_ERROR;
+                ucnv_getInvalidChars(utf8Cnv, rejected, &rejectedLength, &status);
+                if(rejectedLength!=badLength || 0!=memcmp(rejected, bad, badLength)) {
+                    log_err("wrong invalidChars from %s badUTF8[%ld]\n", converterName, (long)idx);
+                }
+            } else {
+                log_err("unexpected error %s from %s badUTF8[%ld]\n", u_errorName(status), converterName, (long)idx);
+            }
+        }
+    }
+}
+
+/*
+ * Convert a UTF-8 string that interleaves the test character with every
+ * badUTF8[] entry, using UCNV_TO_U_CALLBACK_SKIP on the UTF-8 converter, and
+ * let convertExMultiStreaming() compare the output with the expected string:
+ * char0 once (first output of the test character), then char1 once per bad
+ * sequence (each later output of the test character).
+ *
+ * The work buffers are fixed-size; if the assembled strings would not fit,
+ * an error is logged and the test is abandoned instead of overrunning them.
+ * A converterName too long for the test name buffer is truncated.
+ */
+static void testFromBadUTF8(UConverter *utf8Cnv, UConverter *cnv, const char *converterName,
+                            char charUTF8[4], int32_t charUTF8Length,
+                            char char0[8], int32_t char0Length,
+                            char char1[8], int32_t char1Length) {
+    static const char testNamePrefix[]="from bad UTF-8 to ";
+
+    char utf8[600], expect[600];
+    int32_t utf8Length, expectLength;
+
+    char testName[32];
+    size_t prefixLength, nameLength;
+
+    UErrorCode errorCode;
+    int32_t i;
+
+    errorCode=U_ZERO_ERROR;
+    ucnv_setToUCallBack(utf8Cnv, UCNV_TO_U_CALLBACK_SKIP, NULL, NULL, NULL, &errorCode);
+
+    /*
+     * assemble an input string with the test character between each
+     * bad sequence,
+     * and an expected string with repeated test character output
+     */
+    memcpy(utf8, charUTF8, charUTF8Length);
+    utf8Length=charUTF8Length;
+
+    memcpy(expect, char0, char0Length);
+    expectLength=char0Length;
+
+    for(i=0; i<UPRV_LENGTHOF(badUTF8); ++i) {
+        int32_t length = (int32_t)strlen(badUTF8[i]);
+
+        /* Guard the fixed-size buffers in case badUTF8[] grows. */
+        if( utf8Length+length+charUTF8Length>(int32_t)sizeof(utf8) ||
+            expectLength+char1Length>(int32_t)sizeof(expect)
+        ) {
+            log_err("test buffers too small for %s at badUTF8[%ld]\n", converterName, (long)i);
+            return;
+        }
+
+        memcpy(utf8+utf8Length, badUTF8[i], length);
+        utf8Length+=length;
+
+        memcpy(utf8+utf8Length, charUTF8, charUTF8Length);
+        utf8Length+=charUTF8Length;
+
+        memcpy(expect+expectLength, char1, char1Length);
+        expectLength+=char1Length;
+    }
+
+    /*
+     * Build "from bad UTF-8 to <converterName>" without writing past testName;
+     * the converter name is cut short if it does not fit.
+     */
+    prefixLength=sizeof(testNamePrefix)-1;
+    nameLength=strlen(converterName);
+    if(nameLength>sizeof(testName)-1-prefixLength) {
+        nameLength=sizeof(testName)-1-prefixLength;
+    }
+    memcpy(testName, testNamePrefix, prefixLength);
+    memcpy(testName+prefixLength, converterName, nameLength);
+    testName[prefixLength+nameLength]=0;
+
+    /* expect that each bad UTF-8 sequence is detected and skipped */
+    convertExMultiStreaming(utf8Cnv, cnv,
+                            utf8, utf8Length,
+                            expect, expectLength,
+                            testName,
+                            U_ZERO_ERROR);
+}
+
+/*
+ * Test illegal UTF-8 input.
+ *
+ * Opens one UTF-8 converter as the source side, then for each target converter
+ * in converterNames[] obtains a convertible test character with getTestChar()
+ * and runs testFromTruncatedUTF8() and testFromBadUTF8() against it.
+ * A converter that cannot be opened, or for which no test character can be
+ * obtained, is skipped.
+ */
+static void TestConvertExFromUTF8() {
+    static const char *const converterNames[]={
+#if !UCONFIG_NO_LEGACY_CONVERSION
+        "windows-1252",
+        "shift-jis",
+#endif
+        "us-ascii",
+        "iso-8859-1",
+        "utf-8"
+    };
+
+    UConverter *utf8Cnv, *cnv;
+    UErrorCode errorCode;
+    int32_t i;
+
+    /* fromUnicode versions of some character, from initial state and later */
+    char charUTF8[4], char0[8], char1[8];
+    int32_t charUTF8Length, char0Length, char1Length;
+
+    errorCode=U_ZERO_ERROR;
+    utf8Cnv=ucnv_open("UTF-8", &errorCode);
+    if(U_FAILURE(errorCode)) {
+        log_data_err("unable to open UTF-8 converter - %s\n", u_errorName(errorCode));
+        return;
+    }
+
+    for(i=0; i<UPRV_LENGTHOF(converterNames); ++i) {
+        errorCode=U_ZERO_ERROR;
+        cnv=ucnv_open(converterNames[i], &errorCode);
+        if(U_FAILURE(errorCode)) {
+            log_data_err("unable to open %s converter - %s\n", converterNames[i], u_errorName(errorCode));
+            continue;
+        }
+        if(!getTestChar(cnv, converterNames[i], charUTF8, &charUTF8Length, char0, &char0Length, char1, &char1Length)) {
+            /* The converter was opened successfully: release it before skipping. */
+            ucnv_close(cnv);
+            continue;
+        }
+        testFromTruncatedUTF8(utf8Cnv, cnv, converterNames[i], charUTF8, charUTF8Length, char0, char0Length, char1, char1Length);
+        testFromBadUTF8(utf8Cnv, cnv, converterNames[i], charUTF8, charUTF8Length, char0, char0Length, char1, char1Length);
+        ucnv_close(cnv);
+    }
+    ucnv_close(utf8Cnv);
+}
+
+/*
+ * Convert the ill-formed two-byte UTF-8 input C5 F0 to each converter in
+ * converterNames[] with the decimal-NCR escape fromUnicode callback installed.
+ *
+ * Expected output: two U+FFFD, written directly as EF BF BD EF BF BD when the
+ * target is UTF-8 (the last table entry), or as two "&#65533;" NCRs otherwise.
+ * The output limit is set to exactly the expected length, and the test
+ * requires U_STRING_NOT_TERMINATED_WARNING, full consumption of the input,
+ * a completely filled output, and an untouched byte right after it.
+ */
+static void TestConvertExFromUTF8_C5F0() {
+    static const char *const converterNames[]={
+#if !UCONFIG_NO_LEGACY_CONVERSION
+        "windows-1251",
+        "shift-jis",
+#endif
+        "us-ascii",
+        "iso-8859-1",
+        "utf-8"
+    };
+
+    UConverter *utf8Cnv, *cnv;
+    UErrorCode errorCode;
+    int32_t i;
+
+    static const char bad_utf8[2]={ (char)0xC5, (char)0xF0 };
+    /* Expect "&#65533;&#65533;" (2x U+FFFD as decimal NCRs) */
+    static const char twoNCRs[16]={
+        0x26, 0x23, 0x36, 0x35, 0x35, 0x33, 0x33, 0x3B,
+        0x26, 0x23, 0x36, 0x35, 0x35, 0x33, 0x33, 0x3B
+    };
+    /* 2x U+FFFD in UTF-8 */
+    static const char twoFFFD[6]={
+        (char)0xef, (char)0xbf, (char)0xbd,
+        (char)0xef, (char)0xbf, (char)0xbd
+    };
+    const char *expected;
+    int32_t expectedLength;
+    char dest[20];  /* longer than longest expectedLength */
+
+    const char *src;
+    char *target;
+
+    UChar pivotBuffer[128];
+    UChar *pivotSource, *pivotTarget;
+
+    errorCode=U_ZERO_ERROR;
+    utf8Cnv=ucnv_open("UTF-8", &errorCode);
+    if(U_FAILURE(errorCode)) {
+        log_data_err("unable to open UTF-8 converter - %s\n", u_errorName(errorCode));
+        return;
+    }
+
+    for(i=0; i<UPRV_LENGTHOF(converterNames); ++i) {
+        errorCode=U_ZERO_ERROR;
+        cnv=ucnv_open(converterNames[i], &errorCode);
+        if(U_FAILURE(errorCode)) {
+            log_data_err("unable to open %s converter - %s\n",
+                         converterNames[i], u_errorName(errorCode));
+            continue;
+        }
+        ucnv_setFromUCallBack(cnv, UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_DEC,
+                              NULL, NULL, &errorCode);
+        if(U_FAILURE(errorCode)) {
+            log_err("unable to set the decimal NCR escape callback on %s converter - %s\n",
+                    converterNames[i], u_errorName(errorCode));
+            /* The converter is open at this point: do not leak it. */
+            ucnv_close(cnv);
+            continue;
+        }
+        src=bad_utf8;
+        target=dest;
+        /* Fill with a marker value so a write past the expected output is detectable. */
+        uprv_memset(dest, 9, sizeof(dest));
+        if(i==UPRV_LENGTHOF(converterNames)-1) {
+            /* conversion to UTF-8 yields two U+FFFD directly */
+            expected=twoFFFD;
+            expectedLength=6;
+        } else {
+            /* conversion to a non-Unicode charset yields two NCRs */
+            expected=twoNCRs;
+            expectedLength=16;
+        }
+        /* NOTE(review): presumably just gives the start of the pivot buffer known contents. */
+        pivotBuffer[0]=0;
+        pivotBuffer[1]=1;
+        pivotBuffer[2]=2;
+        pivotSource=pivotTarget=pivotBuffer;
+        /* The target limit leaves no room beyond the expected bytes, hence the warning below. */
+        ucnv_convertEx(
+            cnv, utf8Cnv,
+            &target, dest+expectedLength,
+            &src, bad_utf8+sizeof(bad_utf8),
+            pivotBuffer, &pivotSource, &pivotTarget, pivotBuffer+UPRV_LENGTHOF(pivotBuffer),
+            TRUE, TRUE, &errorCode);
+        if( errorCode!=U_STRING_NOT_TERMINATED_WARNING || src!=bad_utf8+2 ||
+            target!=dest+expectedLength || 0!=uprv_memcmp(dest, expected, expectedLength) ||
+            dest[expectedLength]!=9
+        ) {
+            log_err("ucnv_convertEx(UTF-8 C5 F0 -> %s/decimal NCRs) failed\n", converterNames[i]);
+        }
+        ucnv_close(cnv);
+    }
+    ucnv_close(utf8Cnv);
+}
+