git.saurik.com Git - apple/icu.git/blame_incremental - icuSources/test/intltest/convtest.cpp

... / ...

Commit	Line	Data
	1	// © 2016 and later: Unicode, Inc. and others.
	2	// License & terms of use: http://www.unicode.org/copyright.html
	3	/*
	4	*******************************************************************************
	5	*
	6	* Copyright (C) 2003-2014, International Business Machines
	7	* Corporation and others. All Rights Reserved.
	8	*
	9	*******************************************************************************
	10	* file name: convtest.cpp
	11	* encoding: UTF-8
	12	* tab size: 8 (not used)
	13	* indentation:4
	14	*
	15	* created on: 2003jul15
	16	* created by: Markus W. Scherer
	17	*
	18	* Test file for data-driven conversion tests.
	19	*/
	20
	21	#include "unicode/utypes.h"
	22
	23	#if !UCONFIG_NO_LEGACY_CONVERSION
	24	/*
	25	* Note: Turning off all of convtest.cpp when UCONFIG_NO_LEGACY_CONVERSION is set
	26	* is slightly unnecessary - it removes tests for Unicode charsets
	27	* like UTF-8 that should work.
	28	* However, there is no easy way for the test to detect whether a test case
	29	* is for a Unicode charset, so it would be difficult to only exclude those.
	30	* Also, regular testing of ICU is done with all modules on, therefore
	31	* not testing conversion for a custom configuration like this should be ok.
	32	*/
	33
	34	#include "unicode/ucnv.h"
	35	#include "unicode/unistr.h"
	36	#include "unicode/parsepos.h"
	37	#include "unicode/uniset.h"
	38	#include "unicode/ustring.h"
	39	#include "unicode/ures.h"
	40	#include "convtest.h"
	41	#include "cmemory.h"
	42	#include "unicode/tstdtmod.h"
	43	#include <string.h>
	44	#include <stdlib.h>
	45
	/**
	 * Characters used in the test data to select a converter callback.
	 * The first character of a test case's "callback" string is compared
	 * against these values.
	 */
	enum {
	    SUB_CB='?',     // substitute callback
	    SKIP_CB='0',    // skip callback
	    STOP_CB='.',    // stop callback
	    ESC_CB='&'      // escape callback
	};
	53
	54	ConversionTest::ConversionTest() {
	55	UErrorCode errorCode=U_ZERO_ERROR;
	56	utf8Cnv=ucnv_open("UTF-8", &errorCode);
	57	ucnv_setToUCallBack(utf8Cnv, UCNV_TO_U_CALLBACK_STOP, NULL, NULL, NULL, &errorCode);
	58	if(U_FAILURE(errorCode)) {
	59	errln("unable to open UTF-8 converter");
	60	}
	61	}
	62
	63	ConversionTest::~ConversionTest() {
	64	ucnv_close(utf8Cnv);
	65	}
	66
	67	void
	68	ConversionTest::runIndexedTest(int32_t index, UBool exec, const char &name, char /par/) {
	69	if (exec) logln("TestSuite ConversionTest: ");
	70	switch (index) {
	71	#if !UCONFIG_NO_FILE_IO
	72	case 0: name="TestToUnicode"; if (exec) TestToUnicode(); break;
	73	case 1: name="TestFromUnicode"; if (exec) TestFromUnicode(); break;
	74	case 2: name="TestGetUnicodeSet"; if (exec) TestGetUnicodeSet(); break;
	75	case 3: name="TestDefaultIgnorableCallback"; if (exec) TestDefaultIgnorableCallback(); break;
	76	#else
	77	case 0:
	78	case 1:
	79	case 2:
	80	case 3: name="skip"; break;
	81	#endif
	82	case 4: name="TestGetUnicodeSet2"; if (exec) TestGetUnicodeSet2(); break;
	83	default: name=""; break; //needed to end loop
	84	}
	85	}
	86
	87	// test data interface ----------------------------------------------------- ***
	88
	89	void
	90	ConversionTest::TestToUnicode() {
	91	ConversionCase cc;
	92	char charset[100], cbopt[4];
	93	const char *option;
	94	UnicodeString s, unicode;
	95	int32_t offsetsLength;
	96	UConverterToUCallback callback;
	97
	98	TestDataModule *dataModule;
	99	TestData *testData;
	100	const DataMap *testCase;
	101	UErrorCode errorCode;
	102	int32_t i;
	103
	104	errorCode=U_ZERO_ERROR;
	105	dataModule=TestDataModule::getTestDataModule("conversion", *this, errorCode);
	106	if(U_SUCCESS(errorCode)) {
	107	testData=dataModule->createTestData("toUnicode", errorCode);
	108	if(U_SUCCESS(errorCode)) {
	109	for(i=0; testData->nextCase(testCase, errorCode); ++i) {
	110	if(U_FAILURE(errorCode)) {
	111	errln("error retrieving conversion/toUnicode test case %d - %s",
	112	i, u_errorName(errorCode));
	113	errorCode=U_ZERO_ERROR;
	114	continue;
	115	}
	116
	117	cc.caseNr=i;
	118
	119	s=testCase->getString("charset", errorCode);
	120	s.extract(0, 0x7fffffff, charset, sizeof(charset), "");
	121	cc.charset=charset;
	122
	123	cc.bytes=testCase->getBinary(cc.bytesLength, "bytes", errorCode);
	124	unicode=testCase->getString("unicode", errorCode);
	125	cc.unicode=unicode.getBuffer();
	126	cc.unicodeLength=unicode.length();
	127
	128	offsetsLength=0;
	129	cc.offsets=testCase->getIntVector(offsetsLength, "offsets", errorCode);
	130	if(offsetsLength==0) {
	131	cc.offsets=NULL;
	132	} else if(offsetsLength!=unicode.length()) {
	133	errln("toUnicode[%d] unicode[%d] and offsets[%d] must have the same length",
	134	i, unicode.length(), offsetsLength);
	135	errorCode=U_ILLEGAL_ARGUMENT_ERROR;
	136	}
	137
	138	cc.finalFlush= 0!=testCase->getInt28("flush", errorCode);
	139	cc.fallbacks= 0!=testCase->getInt28("fallbacks", errorCode);
	140
	141	s=testCase->getString("errorCode", errorCode);
	142	if(s==UNICODE_STRING("invalid", 7)) {
	143	cc.outErrorCode=U_INVALID_CHAR_FOUND;
	144	} else if(s==UNICODE_STRING("illegal", 7)) {
	145	cc.outErrorCode=U_ILLEGAL_CHAR_FOUND;
	146	} else if(s==UNICODE_STRING("truncated", 9)) {
	147	cc.outErrorCode=U_TRUNCATED_CHAR_FOUND;
	148	} else if(s==UNICODE_STRING("illesc", 6)) {
	149	cc.outErrorCode=U_ILLEGAL_ESCAPE_SEQUENCE;
	150	} else if(s==UNICODE_STRING("unsuppesc", 9)) {
	151	cc.outErrorCode=U_UNSUPPORTED_ESCAPE_SEQUENCE;
	152	} else {
	153	cc.outErrorCode=U_ZERO_ERROR;
	154	}
	155
	156	s=testCase->getString("callback", errorCode);
	157	s.extract(0, 0x7fffffff, cbopt, sizeof(cbopt), "");
	158	cc.cbopt=cbopt;
	159	switch(cbopt[0]) {
	160	case SUB_CB:
	161	callback=UCNV_TO_U_CALLBACK_SUBSTITUTE;
	162	break;
	163	case SKIP_CB:
	164	callback=UCNV_TO_U_CALLBACK_SKIP;
	165	break;
	166	case STOP_CB:
	167	callback=UCNV_TO_U_CALLBACK_STOP;
	168	break;
	169	case ESC_CB:
	170	callback=UCNV_TO_U_CALLBACK_ESCAPE;
	171	break;
	172	default:
	173	callback=NULL;
	174	break;
	175	}
	176	option=callback==NULL ? cbopt : cbopt+1;
	177	if(*option==0) {
	178	option=NULL;
	179	}
	180
	181	cc.invalidChars=testCase->getBinary(cc.invalidLength, "invalidChars", errorCode);
	182
	183	if(U_FAILURE(errorCode)) {
	184	errln("error parsing conversion/toUnicode test case %d - %s",
	185	i, u_errorName(errorCode));
	186	errorCode=U_ZERO_ERROR;
	187	} else {
	188	logln("TestToUnicode[%d] %s", i, charset);
	189	ToUnicodeCase(cc, callback, option);
	190	}
	191	}
	192	delete testData;
	193	}
	194	delete dataModule;
	195	}
	196	else {
	197	dataerrln("Could not load test conversion data");
	198	}
	199	}
	200
	201	void
	202	ConversionTest::TestFromUnicode() {
	203	ConversionCase cc;
	204	char charset[100], cbopt[4];
	205	const char *option;
	206	UnicodeString s, unicode, invalidUChars;
	207	int32_t offsetsLength, index;
	208	UConverterFromUCallback callback;
	209
	210	TestDataModule *dataModule;
	211	TestData *testData;
	212	const DataMap *testCase;
	213	const UChar *p;
	214	UErrorCode errorCode;
	215	int32_t i, length;
	216
	217	errorCode=U_ZERO_ERROR;
	218	dataModule=TestDataModule::getTestDataModule("conversion", *this, errorCode);
	219	if(U_SUCCESS(errorCode)) {
	220	testData=dataModule->createTestData("fromUnicode", errorCode);
	221	if(U_SUCCESS(errorCode)) {
	222	for(i=0; testData->nextCase(testCase, errorCode); ++i) {
	223	if(U_FAILURE(errorCode)) {
	224	errln("error retrieving conversion/fromUnicode test case %d - %s",
	225	i, u_errorName(errorCode));
	226	errorCode=U_ZERO_ERROR;
	227	continue;
	228	}
	229
	230	cc.caseNr=i;
	231
	232	s=testCase->getString("charset", errorCode);
	233	s.extract(0, 0x7fffffff, charset, sizeof(charset), "");
	234	cc.charset=charset;
	235
	236	unicode=testCase->getString("unicode", errorCode);
	237	cc.unicode=unicode.getBuffer();
	238	cc.unicodeLength=unicode.length();
	239	cc.bytes=testCase->getBinary(cc.bytesLength, "bytes", errorCode);
	240
	241	offsetsLength=0;
	242	cc.offsets=testCase->getIntVector(offsetsLength, "offsets", errorCode);
	243	if(offsetsLength==0) {
	244	cc.offsets=NULL;
	245	} else if(offsetsLength!=cc.bytesLength) {
	246	errln("fromUnicode[%d] bytes[%d] and offsets[%d] must have the same length",
	247	i, cc.bytesLength, offsetsLength);
	248	errorCode=U_ILLEGAL_ARGUMENT_ERROR;
	249	}
	250
	251	cc.finalFlush= 0!=testCase->getInt28("flush", errorCode);
	252	cc.fallbacks= 0!=testCase->getInt28("fallbacks", errorCode);
	253
	254	s=testCase->getString("errorCode", errorCode);
	255	if(s==UNICODE_STRING("invalid", 7)) {
	256	cc.outErrorCode=U_INVALID_CHAR_FOUND;
	257	} else if(s==UNICODE_STRING("illegal", 7)) {
	258	cc.outErrorCode=U_ILLEGAL_CHAR_FOUND;
	259	} else if(s==UNICODE_STRING("truncated", 9)) {
	260	cc.outErrorCode=U_TRUNCATED_CHAR_FOUND;
	261	} else {
	262	cc.outErrorCode=U_ZERO_ERROR;
	263	}
	264
	265	s=testCase->getString("callback", errorCode);
	266	cc.setSub=0; // default: no subchar
	267
	268	if((index=s.indexOf((UChar)0))>0) {
	269	// read NUL-separated subchar first, if any
	270	// copy the subchar from Latin-1 characters
	271	// start after the NUL
	272	p=s.getTerminatedBuffer();
	273	length=index+1;
	274	p+=length;
	275	length=s.length()-length;
	276	if(length<=0 \|\| length>=(int32_t)sizeof(cc.subchar)) {
	277	errorCode=U_ILLEGAL_ARGUMENT_ERROR;
	278	} else {
	279	int32_t j;
	280
	281	for(j=0; j<length; ++j) {
	282	cc.subchar[j]=(char)p[j];
	283	}
	284	// NUL-terminate the subchar
	285	cc.subchar[j]=0;
	286	cc.setSub=1;
	287	}
	288
	289	// remove the NUL and subchar from s
	290	s.truncate(index);
	291	} else if((index=s.indexOf((UChar)0x3d))>0) /* '=' */ {
	292	// read a substitution string, separated by an equal sign
	293	p=s.getBuffer()+index+1;
	294	length=s.length()-(index+1);
	295	if(length<0 \|\| length>=UPRV_LENGTHOF(cc.subString)) {
	296	errorCode=U_ILLEGAL_ARGUMENT_ERROR;
	297	} else {
	298	u_memcpy(cc.subString, p, length);
	299	// NUL-terminate the subString
	300	cc.subString[length]=0;
	301	cc.setSub=-1;
	302	}
	303
	304	// remove the equal sign and subString from s
	305	s.truncate(index);
	306	}
	307
	308	s.extract(0, 0x7fffffff, cbopt, sizeof(cbopt), "");
	309	cc.cbopt=cbopt;
	310	switch(cbopt[0]) {
	311	case SUB_CB:
	312	callback=UCNV_FROM_U_CALLBACK_SUBSTITUTE;
	313	break;
	314	case SKIP_CB:
	315	callback=UCNV_FROM_U_CALLBACK_SKIP;
	316	break;
	317	case STOP_CB:
	318	callback=UCNV_FROM_U_CALLBACK_STOP;
	319	break;
	320	case ESC_CB:
	321	callback=UCNV_FROM_U_CALLBACK_ESCAPE;
	322	break;
	323	default:
	324	callback=NULL;
	325	break;
	326	}
	327	option=callback==NULL ? cbopt : cbopt+1;
	328	if(*option==0) {
	329	option=NULL;
	330	}
	331
	332	invalidUChars=testCase->getString("invalidUChars", errorCode);
	333	cc.invalidUChars=invalidUChars.getBuffer();
	334	cc.invalidLength=invalidUChars.length();
	335
	336	if(U_FAILURE(errorCode)) {
	337	errln("error parsing conversion/fromUnicode test case %d - %s",
	338	i, u_errorName(errorCode));
	339	errorCode=U_ZERO_ERROR;
	340	} else {
	341	logln("TestFromUnicode[%d] %s", i, charset);
	342	FromUnicodeCase(cc, callback, option);
	343	}
	344	}
	345	delete testData;
	346	}
	347	delete dataModule;
	348	}
	349	else {
	350	dataerrln("Could not load test conversion data");
	351	}
	352	}
	353
	354	static const UChar ellipsis[]={ 0x2e, 0x2e, 0x2e };
	355
	356	void
	357	ConversionTest::TestGetUnicodeSet() {
	358	char charset[100];
	359	UnicodeString s, map, mapnot;
	360	int32_t which;
	361
	362	ParsePosition pos;
	363	UnicodeSet cnvSet, mapSet, mapnotSet, diffSet;
	364	UnicodeSet *cnvSetPtr = &cnvSet;
	365	LocalUConverterPointer cnv;
	366
	367	TestDataModule *dataModule;
	368	TestData *testData;
	369	const DataMap *testCase;
	370	UErrorCode errorCode;
	371	int32_t i;
	372
	373	errorCode=U_ZERO_ERROR;
	374	dataModule=TestDataModule::getTestDataModule("conversion", *this, errorCode);
	375	if(U_SUCCESS(errorCode)) {
	376	testData=dataModule->createTestData("getUnicodeSet", errorCode);
	377	if(U_SUCCESS(errorCode)) {
	378	for(i=0; testData->nextCase(testCase, errorCode); ++i) {
	379	if(U_FAILURE(errorCode)) {
	380	errln("error retrieving conversion/getUnicodeSet test case %d - %s",
	381	i, u_errorName(errorCode));
	382	errorCode=U_ZERO_ERROR;
	383	continue;
	384	}
	385
	386	s=testCase->getString("charset", errorCode);
	387	s.extract(0, 0x7fffffff, charset, sizeof(charset), "");
	388
	389	map=testCase->getString("map", errorCode);
	390	mapnot=testCase->getString("mapnot", errorCode);
	391
	392	which=testCase->getInt28("which", errorCode);
	393
	394	if(U_FAILURE(errorCode)) {
	395	errln("error parsing conversion/getUnicodeSet test case %d - %s",
	396	i, u_errorName(errorCode));
	397	errorCode=U_ZERO_ERROR;
	398	continue;
	399	}
	400
	401	// test this test case
	402	mapSet.clear();
	403	mapnotSet.clear();
	404
	405	pos.setIndex(0);
	406	mapSet.applyPattern(map, pos, 0, NULL, errorCode);
	407	if(U_FAILURE(errorCode) \|\| pos.getIndex()!=map.length()) {
	408	errln("error creating the map set for conversion/getUnicodeSet test case %d - %s\n"
	409	" error index %d index %d U+%04x",
	410	i, u_errorName(errorCode), pos.getErrorIndex(), pos.getIndex(), map.char32At(pos.getIndex()));
	411	errorCode=U_ZERO_ERROR;
	412	continue;
	413	}
	414
	415	pos.setIndex(0);
	416	mapnotSet.applyPattern(mapnot, pos, 0, NULL, errorCode);
	417	if(U_FAILURE(errorCode) \|\| pos.getIndex()!=mapnot.length()) {
	418	errln("error creating the mapnot set for conversion/getUnicodeSet test case %d - %s\n"
	419	" error index %d index %d U+%04x",
	420	i, u_errorName(errorCode), pos.getErrorIndex(), pos.getIndex(), mapnot.char32At(pos.getIndex()));
	421	errorCode=U_ZERO_ERROR;
	422	continue;
	423	}
	424
	425	logln("TestGetUnicodeSet[%d] %s", i, charset);
	426
	427	cnv.adoptInstead(cnv_open(charset, errorCode));
	428	if(U_FAILURE(errorCode)) {
	429	errcheckln(errorCode, "error opening \"%s\" for conversion/getUnicodeSet test case %d - %s",
	430	charset, i, u_errorName(errorCode));
	431	errorCode=U_ZERO_ERROR;
	432	continue;
	433	}
	434
	435	ucnv_getUnicodeSet(cnv.getAlias(), cnvSetPtr->toUSet(), (UConverterUnicodeSet)which, &errorCode);
	436
	437	if(U_FAILURE(errorCode)) {
	438	errln("error in ucnv_getUnicodeSet(\"%s\") for conversion/getUnicodeSet test case %d - %s",
	439	charset, i, u_errorName(errorCode));
	440	errorCode=U_ZERO_ERROR;
	441	continue;
	442	}
	443
	444	// are there items that must be in cnvSet but are not?
	445	(diffSet=mapSet).removeAll(cnvSet);
	446	if(!diffSet.isEmpty()) {
	447	diffSet.toPattern(s, TRUE);
	448	if(s.length()>100) {
	449	s.replace(100, 0x7fffffff, ellipsis, UPRV_LENGTHOF(ellipsis));
	450	}
	451	errln("error: ucnv_getUnicodeSet(\"%s\") is missing items - conversion/getUnicodeSet test case %d",
	452	charset, i);
	453	errln(s);
	454	}
	455
	456	// are there items that must not be in cnvSet but are?
	457	(diffSet=mapnotSet).retainAll(cnvSet);
	458	if(!diffSet.isEmpty()) {
	459	diffSet.toPattern(s, TRUE);
	460	if(s.length()>100) {
	461	s.replace(100, 0x7fffffff, ellipsis, UPRV_LENGTHOF(ellipsis));
	462	}
	463	errln("error: ucnv_getUnicodeSet(\"%s\") contains unexpected items - conversion/getUnicodeSet test case %d",
	464	charset, i);
	465	errln(s);
	466	}
	467	}
	468	delete testData;
	469	}
	470	delete dataModule;
	471	}
	472	else {
	473	dataerrln("Could not load test conversion data");
	474	}
	475	}
	476
	477	U_CDECL_BEGIN
	478	static void U_CALLCONV
	479	getUnicodeSetCallback(const void *context,
	480	UConverterFromUnicodeArgs * /fromUArgs/,
	481	const UChar* /codeUnits/,
	482	int32_t /length/,
	483	UChar32 codePoint,
	484	UConverterCallbackReason reason,
	485	UErrorCode *pErrorCode) {
	486	if(reason<=UCNV_IRREGULAR) {
	487	((UnicodeSet *)context)->remove(codePoint); // the converter cannot convert this code point
	488	*pErrorCode=U_ZERO_ERROR; // skip
	489	} // else ignore the reset, close and clone calls.
	490	}
	491	U_CDECL_END
	492
	493	// Compare ucnv_getUnicodeSet() with the set of characters that can be converted.
	494	void
	495	ConversionTest::TestGetUnicodeSet2() {
	496	// Build a string with all code points.
	497	UChar32 cpLimit;
	498	int32_t s0Length;
	499	if(quick) {
	500	cpLimit=s0Length=0x10000; // BMP only
	501	} else {
	502	cpLimit=0x110000;
	503	s0Length=0x10000+0x200000; // BMP + surrogate pairs
	504	}
	505	UChar *s0=new UChar[s0Length];
	506	if(s0==NULL) {
	507	return;
	508	}
	509	UChar *s=s0;
	510	UChar32 c;
	511	UChar c2;
	512	// low BMP
	513	for(c=0; c<=0xd7ff; ++c) {
	514	*s++=(UChar)c;
	515	}
	516	// trail surrogates
	517	for(c=0xdc00; c<=0xdfff; ++c) {
	518	*s++=(UChar)c;
	519	}
	520	// lead surrogates
	521	// (after trails so that there is not even one surrogate pair in between)
	522	for(c=0xd800; c<=0xdbff; ++c) {
	523	*s++=(UChar)c;
	524	}
	525	// high BMP
	526	for(c=0xe000; c<=0xffff; ++c) {
	527	*s++=(UChar)c;
	528	}
	529	// supplementary code points = surrogate pairs
	530	if(cpLimit==0x110000) {
	531	for(c=0xd800; c<=0xdbff; ++c) {
	532	for(c2=0xdc00; c2<=0xdfff; ++c2) {
	533	*s++=(UChar)c;
	534	*s++=c2;
	535	}
	536	}
	537	}
	538
	539	static const char *const cnvNames[]={
	540	"UTF-8",
	541	"UTF-7",
	542	"UTF-16",
	543	"US-ASCII",
	544	"ISO-8859-1",
	545	"windows-1252",
	546	"Shift-JIS",
	547	"ibm-1390", // EBCDIC_STATEFUL table
	548	"ibm-16684", // DBCS-only extension table based on EBCDIC_STATEFUL table
	549	"HZ",
	550	"ISO-2022-JP",
	551	"JIS7",
	552	"ISO-2022-CN",
	553	"ISO-2022-CN-EXT",
	554	"LMBCS"
	555	};
	556	LocalUConverterPointer cnv;
	557	char buffer[1024];
	558	int32_t i;
	559	for(i=0; i<UPRV_LENGTHOF(cnvNames); ++i) {
	560	UErrorCode errorCode=U_ZERO_ERROR;
	561	cnv.adoptInstead(cnv_open(cnvNames[i], errorCode));
	562	if(U_FAILURE(errorCode)) {
	563	errcheckln(errorCode, "failed to open converter %s - %s", cnvNames[i], u_errorName(errorCode));
	564	continue;
	565	}
	566	UnicodeSet expected;
	567	ucnv_setFromUCallBack(cnv.getAlias(), getUnicodeSetCallback, &expected, NULL, NULL, &errorCode);
	568	if(U_FAILURE(errorCode)) {
	569	errln("failed to set the callback on converter %s - %s", cnvNames[i], u_errorName(errorCode));
	570	continue;
	571	}
	572	UConverterUnicodeSet which;
	573	for(which=UCNV_ROUNDTRIP_SET; which<UCNV_SET_COUNT; which=(UConverterUnicodeSet)((int)which+1)) {
	574	if(which==UCNV_ROUNDTRIP_AND_FALLBACK_SET) {
	575	ucnv_setFallback(cnv.getAlias(), TRUE);
	576	}
	577	expected.add(0, cpLimit-1);
	578	s=s0;
	579	UBool flush;
	580	do {
	581	char *t=buffer;
	582	flush=(UBool)(s==s0+s0Length);
	583	ucnv_fromUnicode(cnv.getAlias(), &t, buffer+sizeof(buffer), (const UChar **)&s, s0+s0Length, NULL, flush, &errorCode);
	584	if(U_FAILURE(errorCode)) {
	585	if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
	586	errorCode=U_ZERO_ERROR;
	587	continue;
	588	} else {
	589	break; // unexpected error, should not occur
	590	}
	591	}
	592	} while(!flush);
	593	UnicodeSet set;
	594	ucnv_getUnicodeSet(cnv.getAlias(), set.toUSet(), which, &errorCode);
	595	if(cpLimit<0x110000) {
	596	set.remove(cpLimit, 0x10ffff);
	597	}
	598	if(which==UCNV_ROUNDTRIP_SET) {
	599	// ignore PUA code points because they will be converted even if they
	600	// are fallbacks and when other fallbacks are turned off,
	601	// but ucnv_getUnicodeSet(UCNV_ROUNDTRIP_SET) delivers true roundtrips
	602	expected.remove(0xe000, 0xf8ff);
	603	expected.remove(0xf0000, 0xffffd);
	604	expected.remove(0x100000, 0x10fffd);
	605	set.remove(0xe000, 0xf8ff);
	606	set.remove(0xf0000, 0xffffd);
	607	set.remove(0x100000, 0x10fffd);
	608	}
	609	if(set!=expected) {
	610	// First try to see if we have different sets because ucnv_getUnicodeSet()
	611	// added strings: The above conversion method does not tell us what strings might be convertible.
	612	// Remove strings from the set and compare again.
	613	// Unfortunately, there are no good, direct set methods for finding out whether there are strings
	614	// in the set, nor for enumerating or removing just them.
	615	// Intersect all code points with the set. The intersection will not contain strings.
	616	UnicodeSet temp(0, 0x10ffff);
	617	temp.retainAll(set);
	618	set=temp;
	619	}
	620	if(set!=expected) {
	621	UnicodeSet diffSet;
	622	UnicodeString out;
	623
	624	// are there items that must be in the set but are not?
	625	(diffSet=expected).removeAll(set);
	626	if(!diffSet.isEmpty()) {
	627	diffSet.toPattern(out, TRUE);
	628	if(out.length()>100) {
	629	out.replace(100, 0x7fffffff, ellipsis, UPRV_LENGTHOF(ellipsis));
	630	}
	631	errln("error: ucnv_getUnicodeSet(\"%s\") is missing items - which set: %d",
	632	cnvNames[i], which);
	633	errln(out);
	634	}
	635
	636	// are there items that must not be in the set but are?
	637	(diffSet=set).removeAll(expected);
	638	if(!diffSet.isEmpty()) {
	639	diffSet.toPattern(out, TRUE);
	640	if(out.length()>100) {
	641	out.replace(100, 0x7fffffff, ellipsis, UPRV_LENGTHOF(ellipsis));
	642	}
	643	errln("error: ucnv_getUnicodeSet(\"%s\") contains unexpected items - which set: %d",
	644	cnvNames[i], which);
	645	errln(out);
	646	}
	647	}
	648	}
	649	}
	650
	651	delete [] s0;
	652	}
	653
	654	// Test all codepoints which has the default ignorable Unicode property are ignored if they have no mapping
	655	// If there are any failures, the hard coded list (IS_DEFAULT_IGNORABLE_CODE_POINT) in ucnv_err.c should be updated
	656	void
	657	ConversionTest::TestDefaultIgnorableCallback() {
	658	UErrorCode status = U_ZERO_ERROR;
	659	const char *cnv_name = "euc-jp-2007";
	660	const char *pattern_ignorable = "[:Default_Ignorable_Code_Point:]";
	661	const char *pattern_not_ignorable = "[:^Default_Ignorable_Code_Point:]";
	662
	663	UnicodeSet *set_ignorable = new UnicodeSet(pattern_ignorable, status);
	664	if (U_FAILURE(status)) {
	665	dataerrln("Unable to create Unicodeset: %s - %s\n", pattern_ignorable, u_errorName(status));
	666	return;
	667	}
	668
	669	UnicodeSet *set_not_ignorable = new UnicodeSet(pattern_not_ignorable, status);
	670	if (U_FAILURE(status)) {
	671	dataerrln("Unable to create Unicodeset: %s - %s\n", pattern_not_ignorable, u_errorName(status));
	672	return;
	673	}
	674
	675	UConverter *cnv = cnv_open(cnv_name, status);
	676	if (U_FAILURE(status)) {
	677	dataerrln("Unable to open converter: %s - %s\n", cnv_name, u_errorName(status));
	678	return;
	679	}
	680
	681	// set callback for the converter
	682	ucnv_setFromUCallBack(cnv, UCNV_FROM_U_CALLBACK_SUBSTITUTE, NULL, NULL, NULL, &status);
	683
	684	UChar32 input[1];
	685	char output[10];
	686	int32_t outputLength;
	687
	688	// test default ignorables are ignored
	689	int size = set_ignorable->size();
	690	for (int i = 0; i < size; i++) {
	691	status = U_ZERO_ERROR;
	692	outputLength= 0;
	693
	694	input[0] = set_ignorable->charAt(i);
	695
	696	outputLength = ucnv_fromUChars(cnv, output, 10, UnicodeString::fromUTF32(input, 1).getTerminatedBuffer(), -1, &status);
	697	if (U_FAILURE(status) \|\| outputLength != 0) {
	698	errln("Ignorable code point: U+%04X not skipped as expected - %s", input[0], u_errorName(status));
	699	}
	700	}
	701
	702	// test non-ignorables are not ignored
	703	size = set_not_ignorable->size();
	704	for (int i = 0; i < size; i++) {
	705	status = U_ZERO_ERROR;
	706	outputLength= 0;
	707
	708	input[0] = set_not_ignorable->charAt(i);
	709
	710	if (input[0] == 0) {
	711	continue;
	712	}
	713
	714	outputLength = ucnv_fromUChars(cnv, output, 10, UnicodeString::fromUTF32(input, 1).getTerminatedBuffer(), -1, &status);
	715	if (U_FAILURE(status) \|\| outputLength <= 0) {
	716	errln("Non-ignorable code point: U+%04X skipped unexpectedly - %s", input[0], u_errorName(status));
	717	}
	718	}
	719
	720	ucnv_close(cnv);
	721	delete set_not_ignorable;
	722	delete set_ignorable;
	723	}
	724
	725	// open testdata or ICU data converter ------------------------------------- ***
	726
	727	UConverter *
	728	ConversionTest::cnv_open(const char *name, UErrorCode &errorCode) {
	729	if(name!=NULL && *name=='+') {
	730	// Converter names that start with '+' are ignored in ICU4J tests.
	731	++name;
	732	}
	733	if(name!=NULL && name=='') {
	734	/* loadTestData(): set the data directory */
	735	return ucnv_openPackage(loadTestData(errorCode), name+1, &errorCode);
	736	} else {
	737	return ucnv_open(name, &errorCode);
	738	}
	739	}
	740
	741	// output helpers ---------------------------------------------------------- ***
	742
	/**
	 * Returns the lowercase hexadecimal digit character for a value.
	 *
	 * @param digit the value to print; callers pass 0..15
	 * @return '0'..'9' for 0..9, 'a'..'f' for 10..15
	 */
	static inline char
	hexDigit(uint8_t digit) {
	    return digit<=9 ? (char)('0'+digit) : (char)('a'-10+digit);
	}
	747
	748	static char *
	749	printBytes(const uint8_t bytes, int32_t length, char out) {
	750	uint8_t b;
	751
	752	if(length>0) {
	753	b=*bytes++;
	754	--length;
	755	*out++=hexDigit((uint8_t)(b>>4));
	756	*out++=hexDigit((uint8_t)(b&0xf));
	757	}
	758
	759	while(length>0) {
	760	b=*bytes++;
	761	--length;
	762	*out++=' ';
	763	*out++=hexDigit((uint8_t)(b>>4));
	764	*out++=hexDigit((uint8_t)(b&0xf));
	765	}
	766	*out++=0;
	767	return out;
	768	}
	769
	770	static char *
	771	printUnicode(const UChar unicode, int32_t length, char out) {
	772	UChar32 c;
	773	int32_t i;
	774
	775	for(i=0; i<length;) {
	776	if(i>0) {
	777	*out++=' ';
	778	}
	779	U16_NEXT(unicode, i, length, c);
	780	// write 4..6 digits
	781	if(c>=0x100000) {
	782	*out++='1';
	783	}
	784	if(c>=0x10000) {
	785	*out++=hexDigit((uint8_t)((c>>16)&0xf));
	786	}
	787	*out++=hexDigit((uint8_t)((c>>12)&0xf));
	788	*out++=hexDigit((uint8_t)((c>>8)&0xf));
	789	*out++=hexDigit((uint8_t)((c>>4)&0xf));
	790	*out++=hexDigit((uint8_t)(c&0xf));
	791	}
	792	*out++=0;
	793	return out;
	794	}
	795
	/**
	 * Writes offsets with exactly 2 characters each, separated by single
	 * spaces, followed by a terminating NUL. Values that do not fit into
	 * 2 characters are printed as "-x" (below -9) or "xx" (above 99).
	 *
	 * @param offsets the offsets to print; may be NULL, which prints nothing
	 * @param length  number of offsets
	 * @param out     output buffer; the caller must provide enough room
	 *                (3 chars per offset)
	 * @return pointer just past the terminating NUL
	 */
	static char *
	printOffsets(const int32_t *offsets, int32_t length, char *out) {
	    int32_t i, o, d;

	    if(offsets==NULL) {
	        length=0;
	    }

	    for(i=0; i<length; ++i) {
	        if(i>0) {
	            *out++=' ';
	        }
	        o=offsets[i];

	        // print all offsets with 2 characters each (-x, -9..99, xx)
	        if(o<-9) {
	            *out++='-';
	            *out++='x';
	        } else if(o<0) {
	            *out++='-';
	            *out++=(char)('0'-o);
	        } else if(o<=99) {
	            // a leading zero is printed as a space
	            *out++=(d=o/10)==0 ? ' ' : (char)('0'+d);
	            *out++=(char)('0'+o%10);
	        } else /* o>99 */ {
	            *out++='x';
	            *out++='x';
	        }
	    }
	    *out++=0;
	    return out;
	}
	828
	829	// toUnicode test worker functions ----------------------------------------- ***
	830
	831	static int32_t
	832	stepToUnicode(ConversionCase &cc, UConverter *cnv,
	833	UChar *result, int32_t resultCapacity,
	834	int32_t resultOffsets, / also resultCapacity */
	835	int32_t step,
	836	UErrorCode *pErrorCode) {
	837	const char source, sourceLimit, *bytesLimit;
	838	UChar target, targetLimit, *resultLimit;
	839	UBool flush;
	840
	841	source=(const char *)cc.bytes;
	842	target=result;
	843	bytesLimit=source+cc.bytesLength;
	844	resultLimit=result+resultCapacity;
	845
	846	if(step>=0) {
	847	// call ucnv_toUnicode() with in/out buffers no larger than (step) at a time
	848	// move only one buffer (in vs. out) at a time to be extra mean
	849	// step==0 performs bulk conversion and generates offsets
	850
	851	// initialize the partial limits for the loop
	852	if(step==0) {
	853	// use the entire buffers
	854	sourceLimit=bytesLimit;
	855	targetLimit=resultLimit;
	856	flush=cc.finalFlush;
	857	} else {
	858	// start with empty partial buffers
	859	sourceLimit=source;
	860	targetLimit=target;
	861	flush=FALSE;
	862
	863	// output offsets only for bulk conversion
	864	resultOffsets=NULL;
	865	}
	866
	867	for(;;) {
	868	// resetting the opposite conversion direction must not affect this one
	869	ucnv_resetFromUnicode(cnv);
	870
	871	// convert
	872	ucnv_toUnicode(cnv,
	873	&target, targetLimit,
	874	&source, sourceLimit,
	875	resultOffsets,
	876	flush, pErrorCode);
	877
	878	// check pointers and errors
	879	if(source>sourceLimit \|\| target>targetLimit) {
	880	*pErrorCode=U_INTERNAL_PROGRAM_ERROR;
	881	break;
	882	} else if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
	883	if(target!=targetLimit) {
	884	// buffer overflow must only be set when the target is filled
	885	*pErrorCode=U_INTERNAL_PROGRAM_ERROR;
	886	break;
	887	} else if(targetLimit==resultLimit) {
	888	// not just a partial overflow
	889	break;
	890	}
	891
	892	// the partial target is filled, set a new limit, reset the error and continue
	893	targetLimit=(resultLimit-target)>=step ? target+step : resultLimit;
	894	*pErrorCode=U_ZERO_ERROR;
	895	} else if(U_FAILURE(*pErrorCode)) {
	896	// some other error occurred, done
	897	break;
	898	} else {
	899	if(source!=sourceLimit) {
	900	// when no error occurs, then the input must be consumed
	901	*pErrorCode=U_INTERNAL_PROGRAM_ERROR;
	902	break;
	903	}
	904
	905	if(sourceLimit==bytesLimit) {
	906	// we are done
	907	break;
	908	}
	909
	910	// the partial conversion succeeded, set a new limit and continue
	911	sourceLimit=(bytesLimit-source)>=step ? source+step : bytesLimit;
	912	flush=(UBool)(cc.finalFlush && sourceLimit==bytesLimit);
	913	}
	914	}
	915	} else /* step<0 */ {
	916	/*
	917	* step==-1: call only ucnv_getNextUChar()
	918	* otherwise alternate between ucnv_toUnicode() and ucnv_getNextUChar()
	919	* if step==-2 or -3, then give ucnv_toUnicode() the whole remaining input,
	920	* else give it at most (-step-2)/2 bytes
	921	*/
	922	UChar32 c;
	923
	924	// end the loop by getting an index out of bounds error
	925	for(;;) {
	926	// resetting the opposite conversion direction must not affect this one
	927	ucnv_resetFromUnicode(cnv);
	928
	929	// convert
	930	if((step&1)!=0 /* odd: -1, -3, -5, ... */) {
	931	sourceLimit=source; // use sourceLimit not as a real limit
	932	// but to remember the pre-getNextUChar source pointer
	933	c=ucnv_getNextUChar(cnv, &source, bytesLimit, pErrorCode);
	934
	935	// check pointers and errors
	936	if(*pErrorCode==U_INDEX_OUTOFBOUNDS_ERROR) {
	937	if(source!=bytesLimit) {
	938	*pErrorCode=U_INTERNAL_PROGRAM_ERROR;
	939	} else {
	940	*pErrorCode=U_ZERO_ERROR;
	941	}
	942	break;
	943	} else if(U_FAILURE(*pErrorCode)) {
	944	break;
	945	}
	946	// source may not move if c is from previous overflow
	947
	948	if(target==resultLimit) {
	949	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
	950	break;
	951	}
	952	if(c<=0xffff) {
	953	*target++=(UChar)c;
	954	} else {
	955	*target++=U16_LEAD(c);
	956	if(target==resultLimit) {
	957	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
	958	break;
	959	}
	960	*target++=U16_TRAIL(c);
	961	}
	962
	963	// alternate between -n-1 and -n but leave -1 alone
	964	if(step<-1) {
	965	++step;
	966	}
	967	} else /* step is even */ {
	968	// allow only one UChar output
	969	targetLimit=target<resultLimit ? target+1 : resultLimit;
	970
	971	// as with ucnv_getNextUChar(), we always flush (if we go to bytesLimit)
	972	// and never output offsets
	973	if(step==-2) {
	974	sourceLimit=bytesLimit;
	975	} else {
	976	sourceLimit=source+(-step-2)/2;
	977	if(sourceLimit>bytesLimit) {
	978	sourceLimit=bytesLimit;
	979	}
	980	}
	981
	982	ucnv_toUnicode(cnv,
	983	&target, targetLimit,
	984	&source, sourceLimit,
	985	NULL, (UBool)(sourceLimit==bytesLimit), pErrorCode);
	986
	987	// check pointers and errors
	988	if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
	989	if(target!=targetLimit) {
	990	// buffer overflow must only be set when the target is filled
	991	*pErrorCode=U_INTERNAL_PROGRAM_ERROR;
	992	break;
	993	} else if(targetLimit==resultLimit) {
	994	// not just a partial overflow
	995	break;
	996	}
	997
	998	// the partial target is filled, set a new limit and continue
	999	*pErrorCode=U_ZERO_ERROR;
	1000	} else if(U_FAILURE(*pErrorCode)) {
	1001	// some other error occurred, done
	1002	break;
	1003	} else {
	1004	if(source!=sourceLimit) {
	1005	// when no error occurs, then the input must be consumed
	1006	*pErrorCode=U_INTERNAL_PROGRAM_ERROR;
	1007	break;
	1008	}
	1009
	1010	// we are done (flush==TRUE) but we continue, to get the index out of bounds error above
	1011	}
	1012
	1013	--step;
	1014	}
	1015	}
	1016	}
	1017
	1018	return (int32_t)(target-result);
	1019	}
	1020
	1021	UBool
	1022	ConversionTest::ToUnicodeCase(ConversionCase &cc, UConverterToUCallback callback, const char *option) {
	1023	// open the converter
	1024	IcuTestErrorCode errorCode(*this, "ToUnicodeCase");
	1025	LocalUConverterPointer cnv(cnv_open(cc.charset, errorCode));
	1026	// with no data, the above crashes with "pointer being freed was not allocated" for charset "x11-compound-text", see #13078
	1027	if(errorCode.isFailure()) {
	1028	errcheckln(errorCode, "toUnicode[%d](%s cb=\"%s\" fb=%d flush=%d) ucnv_open() failed - %s",
	1029	cc.caseNr, cc.charset, cc.cbopt, cc.fallbacks, cc.finalFlush, errorCode.errorName());
	1030	errorCode.reset();
	1031	return FALSE;
	1032	}
	1033
	1034	// set the callback
	1035	if(callback!=NULL) {
	1036	ucnv_setToUCallBack(cnv.getAlias(), callback, option, NULL, NULL, errorCode);
	1037	if(U_FAILURE(errorCode)) {
	1038	errln("toUnicode[%d](%s cb=\"%s\" fb=%d flush=%d) ucnv_setToUCallBack() failed - %s",
	1039	cc.caseNr, cc.charset, cc.cbopt, cc.fallbacks, cc.finalFlush, u_errorName(errorCode));
	1040	return FALSE;
	1041	}
	1042	}
	1043
	1044	int32_t resultOffsets[256];
	1045	UChar result[256];
	1046	int32_t resultLength;
	1047	UBool ok;
	1048
	1049	static const struct {
	1050	int32_t step;
	1051	const char *name;
	1052	} steps[]={
	1053	{ 0, "bulk" }, // must be first for offsets to be checked
	1054	{ 1, "step=1" },
	1055	{ 3, "step=3" },
	1056	{ 7, "step=7" },
	1057	{ -1, "getNext" },
	1058	{ -2, "toU(bulk)+getNext" },
	1059	{ -3, "getNext+toU(bulk)" },
	1060	{ -4, "toU(1)+getNext" },
	1061	{ -5, "getNext+toU(1)" },
	1062	{ -12, "toU(5)+getNext" },
	1063	{ -13, "getNext+toU(5)" },
	1064	};
	1065	int32_t i, step;
	1066
	1067	ok=TRUE;
	1068	for(i=0; i<UPRV_LENGTHOF(steps) && ok; ++i) {
	1069	step=steps[i].step;
	1070	if(step<0 && !cc.finalFlush) {
	1071	// skip ucnv_getNextUChar() if !finalFlush because
	1072	// ucnv_getNextUChar() always implies flush
	1073	continue;
	1074	}
	1075	if(step!=0) {
	1076	// bulk test is first, then offsets are not checked any more
	1077	cc.offsets=NULL;
	1078	}
	1079	else {
	1080	memset(resultOffsets, -1, UPRV_LENGTHOF(resultOffsets));
	1081	}
	1082	memset(result, -1, UPRV_LENGTHOF(result));
	1083	errorCode.reset();
	1084	resultLength=stepToUnicode(cc, cnv.getAlias(),
	1085	result, UPRV_LENGTHOF(result),
	1086	step==0 ? resultOffsets : NULL,
	1087	step, errorCode);
	1088	ok=checkToUnicode(
	1089	cc, cnv.getAlias(), steps[i].name,
	1090	result, resultLength,
	1091	cc.offsets!=NULL ? resultOffsets : NULL,
	1092	errorCode);
	1093	if(errorCode.isFailure() \|\| !cc.finalFlush) {
	1094	// reset if an error occurred or we did not flush
	1095	// otherwise do nothing to make sure that flushing resets
	1096	ucnv_resetToUnicode(cnv.getAlias());
	1097	}
	1098	if (cc.offsets != NULL && resultOffsets[resultLength] != -1) {
	1099	errln("toUnicode[%d](%s) Conversion wrote too much to offsets at index %d",
	1100	cc.caseNr, cc.charset, resultLength);
	1101	}
	1102	if (result[resultLength] != (UChar)-1) {
	1103	errln("toUnicode[%d](%s) Conversion wrote too much to result at index %d",
	1104	cc.caseNr, cc.charset, resultLength);
	1105	}
	1106	}
	1107
	1108	// not a real loop, just a convenience for breaking out of the block
	1109	while(ok && cc.finalFlush) {
	1110	// test ucnv_toUChars()
	1111	memset(result, 0, sizeof(result));
	1112
	1113	errorCode.reset();
	1114	resultLength=ucnv_toUChars(cnv.getAlias(),
	1115	result, UPRV_LENGTHOF(result),
	1116	(const char *)cc.bytes, cc.bytesLength,
	1117	errorCode);
	1118	ok=checkToUnicode(
	1119	cc, cnv.getAlias(), "toUChars",
	1120	result, resultLength,
	1121	NULL,
	1122	errorCode);
	1123	if(!ok) {
	1124	break;
	1125	}
	1126
	1127	// test preflighting
	1128	// keep the correct result for simple checking
	1129	errorCode.reset();
	1130	resultLength=ucnv_toUChars(cnv.getAlias(),
	1131	NULL, 0,
	1132	(const char *)cc.bytes, cc.bytesLength,
	1133	errorCode);
	1134	if(errorCode.get()==U_STRING_NOT_TERMINATED_WARNING \|\| errorCode.get()==U_BUFFER_OVERFLOW_ERROR) {
	1135	errorCode.reset();
	1136	}
	1137	ok=checkToUnicode(
	1138	cc, cnv.getAlias(), "preflight toUChars",
	1139	result, resultLength,
	1140	NULL,
	1141	errorCode);
	1142	break;
	1143	}
	1144
	1145	errorCode.reset(); // all errors have already been reported
	1146	return ok;
	1147	}
	1148
	1149	UBool
	1150	ConversionTest::checkToUnicode(ConversionCase &cc, UConverter cnv, const char name,
	1151	const UChar *result, int32_t resultLength,
	1152	const int32_t *resultOffsets,
	1153	UErrorCode resultErrorCode) {
	1154	char resultInvalidChars[8];
	1155	int8_t resultInvalidLength;
	1156	UErrorCode errorCode;
	1157
	1158	const char *msg;
	1159
	1160	// reset the message; NULL will mean "ok"
	1161	msg=NULL;
	1162
	1163	errorCode=U_ZERO_ERROR;
	1164	resultInvalidLength=sizeof(resultInvalidChars);
	1165	ucnv_getInvalidChars(cnv, resultInvalidChars, &resultInvalidLength, &errorCode);
	1166	if(U_FAILURE(errorCode)) {
	1167	errln("toUnicode[%d](%s cb=\"%s\" fb=%d flush=%d %s) ucnv_getInvalidChars() failed - %s",
	1168	cc.caseNr, cc.charset, cc.cbopt, cc.fallbacks, cc.finalFlush, name, u_errorName(errorCode));
	1169	return FALSE;
	1170	}
	1171
	1172	// check everything that might have gone wrong
	1173	if(cc.unicodeLength!=resultLength) {
	1174	msg="wrong result length";
	1175	} else if(0!=u_memcmp(cc.unicode, result, cc.unicodeLength)) {
	1176	msg="wrong result string";
	1177	} else if(cc.offsets!=NULL && 0!=memcmp(cc.offsets, resultOffsets, cc.unicodeLengthsizeof(cc.offsets))) {
	1178	msg="wrong offsets";
	1179	} else if(cc.outErrorCode!=resultErrorCode) {
	1180	msg="wrong error code";
	1181	} else if(cc.invalidLength!=resultInvalidLength) {
	1182	msg="wrong length of last invalid input";
	1183	} else if(0!=memcmp(cc.invalidChars, resultInvalidChars, cc.invalidLength)) {
	1184	msg="wrong last invalid input";
	1185	}
	1186
	1187	if(msg==NULL) {
	1188	return TRUE;
	1189	} else {
	1190	char buffer[2000]; // one buffer for all strings
	1191	char s, bytesString, unicodeString, resultString,
	1192	offsetsString, resultOffsetsString,
	1193	invalidCharsString, resultInvalidCharsString;
	1194
	1195	bytesString=s=buffer;
	1196	s=printBytes(cc.bytes, cc.bytesLength, bytesString);
	1197	s=printUnicode(cc.unicode, cc.unicodeLength, unicodeString=s);
	1198	s=printUnicode(result, resultLength, resultString=s);
	1199	s=printOffsets(cc.offsets, cc.unicodeLength, offsetsString=s);
	1200	s=printOffsets(resultOffsets, resultLength, resultOffsetsString=s);
	1201	s=printBytes(cc.invalidChars, cc.invalidLength, invalidCharsString=s);
	1202	s=printBytes((uint8_t *)resultInvalidChars, resultInvalidLength, resultInvalidCharsString=s);
	1203
	1204	if((s-buffer)>(int32_t)sizeof(buffer)) {
	1205	errln("toUnicode[%d](%s cb=\"%s\" fb=%d flush=%d %s) fatal error: checkToUnicode() test output buffer overflow writing %d chars\n",
	1206	cc.caseNr, cc.charset, cc.cbopt, cc.fallbacks, cc.finalFlush, name, (int)(s-buffer));
	1207	exit(1);
	1208	}
	1209
	1210	errln("toUnicode[%d](%s cb=\"%s\" fb=%d flush=%d %s) failed: %s\n"
	1211	" bytes <%s>[%d]\n"
	1212	" expected <%s>[%d]\n"
	1213	" result <%s>[%d]\n"
	1214	" offsets <%s>\n"
	1215	" result offsets <%s>\n"
	1216	" error code expected %s got %s\n"
	1217	" invalidChars expected <%s> got <%s>\n",
	1218	cc.caseNr, cc.charset, cc.cbopt, cc.fallbacks, cc.finalFlush, name, msg,
	1219	bytesString, cc.bytesLength,
	1220	unicodeString, cc.unicodeLength,
	1221	resultString, resultLength,
	1222	offsetsString,
	1223	resultOffsetsString,
	1224	u_errorName(cc.outErrorCode), u_errorName(resultErrorCode),
	1225	invalidCharsString, resultInvalidCharsString);
	1226
	1227	return FALSE;
	1228	}
	1229	}
	1230
	1231	// fromUnicode test worker functions --------------------------------------- ***
	1232
	1233	static int32_t
	1234	stepFromUTF8(ConversionCase &cc,
	1235	UConverter utf8Cnv, UConverter cnv,
	1236	char *result, int32_t resultCapacity,
	1237	int32_t step,
	1238	UErrorCode *pErrorCode) {
	1239	const char source, sourceLimit, *utf8Limit;
	1240	UChar pivotBuffer[32];
	1241	UChar pivotSource, pivotTarget, *pivotLimit;
	1242	char target, targetLimit, *resultLimit;
	1243	UBool flush;
	1244
	1245	source=cc.utf8;
	1246	pivotSource=pivotTarget=pivotBuffer;
	1247	target=result;
	1248	utf8Limit=source+cc.utf8Length;
	1249	resultLimit=result+resultCapacity;
	1250
	1251	// call ucnv_convertEx() with in/out buffers no larger than (step) at a time
	1252	// move only one buffer (in vs. out) at a time to be extra mean
	1253	// step==0 performs bulk conversion
	1254
	1255	// initialize the partial limits for the loop
	1256	if(step==0) {
	1257	// use the entire buffers
	1258	sourceLimit=utf8Limit;
	1259	targetLimit=resultLimit;
	1260	flush=cc.finalFlush;
	1261
	1262	pivotLimit=pivotBuffer+UPRV_LENGTHOF(pivotBuffer);
	1263	} else {
	1264	// start with empty partial buffers
	1265	sourceLimit=source;
	1266	targetLimit=target;
	1267	flush=FALSE;
	1268
	1269	// empty pivot is not allowed, make it of length step
	1270	pivotLimit=pivotBuffer+step;
	1271	}
	1272
	1273	for(;;) {
	1274	// resetting the opposite conversion direction must not affect this one
	1275	ucnv_resetFromUnicode(utf8Cnv);
	1276	ucnv_resetToUnicode(cnv);
	1277
	1278	// convert
	1279	ucnv_convertEx(cnv, utf8Cnv,
	1280	&target, targetLimit,
	1281	&source, sourceLimit,
	1282	pivotBuffer, &pivotSource, &pivotTarget, pivotLimit,
	1283	FALSE, flush, pErrorCode);
	1284
	1285	// check pointers and errors
	1286	if(source>sourceLimit \|\| target>targetLimit) {
	1287	*pErrorCode=U_INTERNAL_PROGRAM_ERROR;
	1288	break;
	1289	} else if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
	1290	if(target!=targetLimit) {
	1291	// buffer overflow must only be set when the target is filled
	1292	*pErrorCode=U_INTERNAL_PROGRAM_ERROR;
	1293	break;
	1294	} else if(targetLimit==resultLimit) {
	1295	// not just a partial overflow
	1296	break;
	1297	}
	1298
	1299	// the partial target is filled, set a new limit, reset the error and continue
	1300	targetLimit=(resultLimit-target)>=step ? target+step : resultLimit;
	1301	*pErrorCode=U_ZERO_ERROR;
	1302	} else if(U_FAILURE(*pErrorCode)) {
	1303	if(pivotSource==pivotBuffer) {
	1304	// toUnicode error, should not occur
	1305	// toUnicode errors are tested in cintltst TestConvertExFromUTF8()
	1306	break;
	1307	} else {
	1308	// fromUnicode error
	1309	// some other error occurred, done
	1310	break;
	1311	}
	1312	} else {
	1313	if(source!=sourceLimit) {
	1314	// when no error occurs, then the input must be consumed
	1315	*pErrorCode=U_INTERNAL_PROGRAM_ERROR;
	1316	break;
	1317	}
	1318
	1319	if(sourceLimit==utf8Limit) {
	1320	// we are done
	1321	if(*pErrorCode==U_STRING_NOT_TERMINATED_WARNING) {
	1322	// ucnv_convertEx() warns about not terminating the output
	1323	// but ucnv_fromUnicode() does not and so
	1324	// checkFromUnicode() does not expect it
	1325	*pErrorCode=U_ZERO_ERROR;
	1326	}
	1327	break;
	1328	}
	1329
	1330	// the partial conversion succeeded, set a new limit and continue
	1331	sourceLimit=(utf8Limit-source)>=step ? source+step : utf8Limit;
	1332	flush=(UBool)(cc.finalFlush && sourceLimit==utf8Limit);
	1333	}
	1334	}
	1335
	1336	return (int32_t)(target-result);
	1337	}
	1338
	1339	static int32_t
	1340	stepFromUnicode(ConversionCase &cc, UConverter *cnv,
	1341	char *result, int32_t resultCapacity,
	1342	int32_t resultOffsets, / also resultCapacity */
	1343	int32_t step,
	1344	UErrorCode *pErrorCode) {
	1345	const UChar source, sourceLimit, *unicodeLimit;
	1346	char target, targetLimit, *resultLimit;
	1347	UBool flush;
	1348
	1349	source=cc.unicode;
	1350	target=result;
	1351	unicodeLimit=source+cc.unicodeLength;
	1352	resultLimit=result+resultCapacity;
	1353
	1354	// call ucnv_fromUnicode() with in/out buffers no larger than (step) at a time
	1355	// move only one buffer (in vs. out) at a time to be extra mean
	1356	// step==0 performs bulk conversion and generates offsets
	1357
	1358	// initialize the partial limits for the loop
	1359	if(step==0) {
	1360	// use the entire buffers
	1361	sourceLimit=unicodeLimit;
	1362	targetLimit=resultLimit;
	1363	flush=cc.finalFlush;
	1364	} else {
	1365	// start with empty partial buffers
	1366	sourceLimit=source;
	1367	targetLimit=target;
	1368	flush=FALSE;
	1369
	1370	// output offsets only for bulk conversion
	1371	resultOffsets=NULL;
	1372	}
	1373
	1374	for(;;) {
	1375	// resetting the opposite conversion direction must not affect this one
	1376	ucnv_resetToUnicode(cnv);
	1377
	1378	// convert
	1379	ucnv_fromUnicode(cnv,
	1380	&target, targetLimit,
	1381	&source, sourceLimit,
	1382	resultOffsets,
	1383	flush, pErrorCode);
	1384
	1385	// check pointers and errors
	1386	if(source>sourceLimit \|\| target>targetLimit) {
	1387	*pErrorCode=U_INTERNAL_PROGRAM_ERROR;
	1388	break;
	1389	} else if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
	1390	if(target!=targetLimit) {
	1391	// buffer overflow must only be set when the target is filled
	1392	*pErrorCode=U_INTERNAL_PROGRAM_ERROR;
	1393	break;
	1394	} else if(targetLimit==resultLimit) {
	1395	// not just a partial overflow
	1396	break;
	1397	}
	1398
	1399	// the partial target is filled, set a new limit, reset the error and continue
	1400	targetLimit=(resultLimit-target)>=step ? target+step : resultLimit;
	1401	*pErrorCode=U_ZERO_ERROR;
	1402	} else if(U_FAILURE(*pErrorCode)) {
	1403	// some other error occurred, done
	1404	break;
	1405	} else {
	1406	if(source!=sourceLimit) {
	1407	// when no error occurs, then the input must be consumed
	1408	*pErrorCode=U_INTERNAL_PROGRAM_ERROR;
	1409	break;
	1410	}
	1411
	1412	if(sourceLimit==unicodeLimit) {
	1413	// we are done
	1414	break;
	1415	}
	1416
	1417	// the partial conversion succeeded, set a new limit and continue
	1418	sourceLimit=(unicodeLimit-source)>=step ? source+step : unicodeLimit;
	1419	flush=(UBool)(cc.finalFlush && sourceLimit==unicodeLimit);
	1420	}
	1421	}
	1422
	1423	return (int32_t)(target-result);
	1424	}
	1425
	1426	UBool
	1427	ConversionTest::FromUnicodeCase(ConversionCase &cc, UConverterFromUCallback callback, const char *option) {
	1428	UConverter *cnv;
	1429	UErrorCode errorCode;
	1430
	1431	// open the converter
	1432	errorCode=U_ZERO_ERROR;
	1433	cnv=cnv_open(cc.charset, errorCode);
	1434	if(U_FAILURE(errorCode)) {
	1435	errcheckln(errorCode, "fromUnicode[%d](%s cb=\"%s\" fb=%d flush=%d) ucnv_open() failed - %s",
	1436	cc.caseNr, cc.charset, cc.cbopt, cc.fallbacks, cc.finalFlush, u_errorName(errorCode));
	1437	return FALSE;
	1438	}
	1439	ucnv_resetToUnicode(utf8Cnv);
	1440
	1441	// set the callback
	1442	if(callback!=NULL) {
	1443	ucnv_setFromUCallBack(cnv, callback, option, NULL, NULL, &errorCode);
	1444	if(U_FAILURE(errorCode)) {
	1445	errln("fromUnicode[%d](%s cb=\"%s\" fb=%d flush=%d) ucnv_setFromUCallBack() failed - %s",
	1446	cc.caseNr, cc.charset, cc.cbopt, cc.fallbacks, cc.finalFlush, u_errorName(errorCode));
	1447	ucnv_close(cnv);
	1448	return FALSE;
	1449	}
	1450	}
	1451
	1452	// set the fallbacks flag
	1453	// TODO change with Jitterbug 2401, then add a similar call for toUnicode too
	1454	ucnv_setFallback(cnv, cc.fallbacks);
	1455
	1456	// set the subchar
	1457	int32_t length;
	1458
	1459	if(cc.setSub>0) {
	1460	length=(int32_t)strlen(cc.subchar);
	1461	ucnv_setSubstChars(cnv, cc.subchar, (int8_t)length, &errorCode);
	1462	if(U_FAILURE(errorCode)) {
	1463	errln("fromUnicode[%d](%s cb=\"%s\" fb=%d flush=%d) ucnv_setSubstChars() failed - %s",
	1464	cc.caseNr, cc.charset, cc.cbopt, cc.fallbacks, cc.finalFlush, u_errorName(errorCode));
	1465	ucnv_close(cnv);
	1466	return FALSE;
	1467	}
	1468	} else if(cc.setSub<0) {
	1469	ucnv_setSubstString(cnv, cc.subString, -1, &errorCode);
	1470	if(U_FAILURE(errorCode)) {
	1471	errln("fromUnicode[%d](%s cb=\"%s\" fb=%d flush=%d) ucnv_setSubstString() failed - %s",
	1472	cc.caseNr, cc.charset, cc.cbopt, cc.fallbacks, cc.finalFlush, u_errorName(errorCode));
	1473	ucnv_close(cnv);
	1474	return FALSE;
	1475	}
	1476	}
	1477
	1478	// convert unicode to utf8
	1479	char utf8[256];
	1480	cc.utf8=utf8;
	1481	u_strToUTF8(utf8, UPRV_LENGTHOF(utf8), &cc.utf8Length,
	1482	cc.unicode, cc.unicodeLength,
	1483	&errorCode);
	1484	if(U_FAILURE(errorCode)) {
	1485	// skip UTF-8 testing of a string with an unpaired surrogate,
	1486	// or of one that's too long
	1487	// toUnicode errors are tested in cintltst TestConvertExFromUTF8()
	1488	cc.utf8Length=-1;
	1489	}
	1490
	1491	int32_t resultOffsets[256];
	1492	char result[256];
	1493	int32_t resultLength;
	1494	UBool ok;
	1495
	1496	static const struct {
	1497	int32_t step;
	1498	const char name, utf8Name;
	1499	} steps[]={
	1500	{ 0, "bulk", "utf8" }, // must be first for offsets to be checked
	1501	{ 1, "step=1", "utf8 step=1" },
	1502	{ 3, "step=3", "utf8 step=3" },
	1503	{ 7, "step=7", "utf8 step=7" }
	1504	};
	1505	int32_t i, step;
	1506
	1507	ok=TRUE;
	1508	for(i=0; i<UPRV_LENGTHOF(steps) && ok; ++i) {
	1509	step=steps[i].step;
	1510	memset(resultOffsets, -1, UPRV_LENGTHOF(resultOffsets));
	1511	memset(result, -1, UPRV_LENGTHOF(result));
	1512	errorCode=U_ZERO_ERROR;
	1513	resultLength=stepFromUnicode(cc, cnv,
	1514	result, UPRV_LENGTHOF(result),
	1515	step==0 ? resultOffsets : NULL,
	1516	step, &errorCode);
	1517	ok=checkFromUnicode(
	1518	cc, cnv, steps[i].name,
	1519	(uint8_t *)result, resultLength,
	1520	cc.offsets!=NULL ? resultOffsets : NULL,
	1521	errorCode);
	1522	if(U_FAILURE(errorCode) \|\| !cc.finalFlush) {
	1523	// reset if an error occurred or we did not flush
	1524	// otherwise do nothing to make sure that flushing resets
	1525	ucnv_resetFromUnicode(cnv);
	1526	}
	1527	if (resultOffsets[resultLength] != -1) {
	1528	errln("fromUnicode[%d](%s) Conversion wrote too much to offsets at index %d",
	1529	cc.caseNr, cc.charset, resultLength);
	1530	}
	1531	if (result[resultLength] != (char)-1) {
	1532	errln("fromUnicode[%d](%s) Conversion wrote too much to result at index %d",
	1533	cc.caseNr, cc.charset, resultLength);
	1534	}
	1535
	1536	// bulk test is first, then offsets are not checked any more
	1537	cc.offsets=NULL;
	1538
	1539	// test direct conversion from UTF-8
	1540	if(cc.utf8Length>=0) {
	1541	errorCode=U_ZERO_ERROR;
	1542	resultLength=stepFromUTF8(cc, utf8Cnv, cnv,
	1543	result, UPRV_LENGTHOF(result),
	1544	step, &errorCode);
	1545	ok=checkFromUnicode(
	1546	cc, cnv, steps[i].utf8Name,
	1547	(uint8_t *)result, resultLength,
	1548	NULL,
	1549	errorCode);
	1550	if(U_FAILURE(errorCode) \|\| !cc.finalFlush) {
	1551	// reset if an error occurred or we did not flush
	1552	// otherwise do nothing to make sure that flushing resets
	1553	ucnv_resetToUnicode(utf8Cnv);
	1554	ucnv_resetFromUnicode(cnv);
	1555	}
	1556	}
	1557	}
	1558
	1559	// not a real loop, just a convenience for breaking out of the block
	1560	while(ok && cc.finalFlush) {
	1561	// test ucnv_fromUChars()
	1562	memset(result, 0, sizeof(result));
	1563
	1564	errorCode=U_ZERO_ERROR;
	1565	resultLength=ucnv_fromUChars(cnv,
	1566	result, UPRV_LENGTHOF(result),
	1567	cc.unicode, cc.unicodeLength,
	1568	&errorCode);
	1569	ok=checkFromUnicode(
	1570	cc, cnv, "fromUChars",
	1571	(uint8_t *)result, resultLength,
	1572	NULL,
	1573	errorCode);
	1574	if(!ok) {
	1575	break;
	1576	}
	1577
	1578	// test preflighting
	1579	// keep the correct result for simple checking
	1580	errorCode=U_ZERO_ERROR;
	1581	resultLength=ucnv_fromUChars(cnv,
	1582	NULL, 0,
	1583	cc.unicode, cc.unicodeLength,
	1584	&errorCode);
	1585	if(errorCode==U_STRING_NOT_TERMINATED_WARNING \|\| errorCode==U_BUFFER_OVERFLOW_ERROR) {
	1586	errorCode=U_ZERO_ERROR;
	1587	}
	1588	ok=checkFromUnicode(
	1589	cc, cnv, "preflight fromUChars",
	1590	(uint8_t *)result, resultLength,
	1591	NULL,
	1592	errorCode);
	1593	break;
	1594	}
	1595
	1596	ucnv_close(cnv);
	1597	return ok;
	1598	}
	1599
	1600	UBool
	1601	ConversionTest::checkFromUnicode(ConversionCase &cc, UConverter cnv, const char name,
	1602	const uint8_t *result, int32_t resultLength,
	1603	const int32_t *resultOffsets,
	1604	UErrorCode resultErrorCode) {
	1605	UChar resultInvalidUChars[8];
	1606	int8_t resultInvalidLength;
	1607	UErrorCode errorCode;
	1608
	1609	const char *msg;
	1610
	1611	// reset the message; NULL will mean "ok"
	1612	msg=NULL;
	1613
	1614	errorCode=U_ZERO_ERROR;
	1615	resultInvalidLength=UPRV_LENGTHOF(resultInvalidUChars);
	1616	ucnv_getInvalidUChars(cnv, resultInvalidUChars, &resultInvalidLength, &errorCode);
	1617	if(U_FAILURE(errorCode)) {
	1618	errln("fromUnicode[%d](%s cb=\"%s\" fb=%d flush=%d %s) ucnv_getInvalidUChars() failed - %s",
	1619	cc.caseNr, cc.charset, cc.cbopt, cc.fallbacks, cc.finalFlush, name, u_errorName(errorCode));
	1620	return FALSE;
	1621	}
	1622
	1623	// check everything that might have gone wrong
	1624	if(cc.bytesLength!=resultLength) {
	1625	msg="wrong result length";
	1626	} else if(0!=memcmp(cc.bytes, result, cc.bytesLength)) {
	1627	msg="wrong result string";
	1628	} else if(cc.offsets!=NULL && 0!=memcmp(cc.offsets, resultOffsets, cc.bytesLengthsizeof(cc.offsets))) {
	1629	msg="wrong offsets";
	1630	} else if(cc.outErrorCode!=resultErrorCode) {
	1631	msg="wrong error code";
	1632	} else if(cc.invalidLength!=resultInvalidLength) {
	1633	msg="wrong length of last invalid input";
	1634	} else if(0!=u_memcmp(cc.invalidUChars, resultInvalidUChars, cc.invalidLength)) {
	1635	msg="wrong last invalid input";
	1636	}
	1637
	1638	if(msg==NULL) {
	1639	return TRUE;
	1640	} else {
	1641	char buffer[2000]; // one buffer for all strings
	1642	char s, unicodeString, bytesString, resultString,
	1643	offsetsString, resultOffsetsString,
	1644	invalidCharsString, resultInvalidUCharsString;
	1645
	1646	unicodeString=s=buffer;
	1647	s=printUnicode(cc.unicode, cc.unicodeLength, unicodeString);
	1648	s=printBytes(cc.bytes, cc.bytesLength, bytesString=s);
	1649	s=printBytes(result, resultLength, resultString=s);
	1650	s=printOffsets(cc.offsets, cc.bytesLength, offsetsString=s);
	1651	s=printOffsets(resultOffsets, resultLength, resultOffsetsString=s);
	1652	s=printUnicode(cc.invalidUChars, cc.invalidLength, invalidCharsString=s);
	1653	s=printUnicode(resultInvalidUChars, resultInvalidLength, resultInvalidUCharsString=s);
	1654
	1655	if((s-buffer)>(int32_t)sizeof(buffer)) {
	1656	errln("fromUnicode[%d](%s cb=\"%s\" fb=%d flush=%d %s) fatal error: checkFromUnicode() test output buffer overflow writing %d chars\n",
	1657	cc.caseNr, cc.charset, cc.cbopt, cc.fallbacks, cc.finalFlush, name, (int)(s-buffer));
	1658	exit(1);
	1659	}
	1660
	1661	errln("fromUnicode[%d](%s cb=\"%s\" fb=%d flush=%d %s) failed: %s\n"
	1662	" unicode <%s>[%d]\n"
	1663	" expected <%s>[%d]\n"
	1664	" result <%s>[%d]\n"
	1665	" offsets <%s>\n"
	1666	" result offsets <%s>\n"
	1667	" error code expected %s got %s\n"
	1668	" invalidChars expected <%s> got <%s>\n",
	1669	cc.caseNr, cc.charset, cc.cbopt, cc.fallbacks, cc.finalFlush, name, msg,
	1670	unicodeString, cc.unicodeLength,
	1671	bytesString, cc.bytesLength,
	1672	resultString, resultLength,
	1673	offsetsString,
	1674	resultOffsetsString,
	1675	u_errorName(cc.outErrorCode), u_errorName(resultErrorCode),
	1676	invalidCharsString, resultInvalidUCharsString);
	1677
	1678	return FALSE;
	1679	}
	1680	}
	1681
	1682	#endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */