git.saurik.com Git - wxWidgets.git/blame - tests/strings/unicode.cpp

Commit	Line	Data
387f829e VS	1	///////////////////////////////////////////////////////////////////////////////
	2	// Name: tests/strings/unicode.cpp
	3	// Purpose: Unicode unit test
	4	// Author: Vadim Zeitlin, Wlodzimierz ABX Skiba
	5	// Created: 2004-04-28
387f829e VS	6	// Copyright: (c) 2004 Vadim Zeitlin, Wlodzimierz Skiba
	7	///////////////////////////////////////////////////////////////////////////////
	8
	9	// ----------------------------------------------------------------------------
	10	// headers
	11	// ----------------------------------------------------------------------------
	12
8899b155	13	#include "testprec.h"
387f829e VS	14
	15	#ifdef __BORLANDC__
	16	#pragma hdrstop
	17	#endif
	18
	19	#ifndef WX_PRECOMP
31c06391	20	#include "wx/wx.h"
387f829e VS	21	#endif // WX_PRECOMP
387f829e VS	22
210bfffb FM	23	#include "wx/encconv.h"
	24
	25	// ----------------------------------------------------------------------------
42e8b52f	26	// helper class holding the matching MB and WC strings
210bfffb FM	27	// ----------------------------------------------------------------------------
210bfffb FM	28
42e8b52f VZ	29	struct StringConversionData
42e8b52f VZ	30	{
210bfffb FM	31	// either str or wcs (but not both) may be NULL, this means that the conversion
210bfffb FM	32	// to it should fail
527587d3 VZ	33	StringConversionData(const char str_, const wchar_t wcs_, int flags_ = 0)
	34	: str(str_), wcs(wcs_), flags(flags_)
	35	{
	36	}
	37
	38	const char * const str;
	39	const wchar_t * const wcs;
42e8b52f VZ	40
	41	enum
	42	{
	43	TEST_BOTH = 0, // test both str -> wcs and wcs -> str
	44	ONLY_MB2WC = 1 // only test str -> wcs conversion
	45	};
	46
527587d3	47	const int flags;
42e8b52f VZ	48
	49	// test that the conversion between str and wcs (subject to flags) succeeds
	50	//
	51	// the first argument is the index in the test array and is used solely for
	52	// diagnostics
	53	void Test(size_t n, wxMBConv& conv) const
	54	{
	55	if ( str )
	56	{
	57	wxWCharBuffer wbuf = conv.cMB2WC(str);
	58
	59	if ( wcs )
	60	{
	61	CPPUNIT_ASSERT_MESSAGE
	62	(
	63	Message(n, "MB2WC failed"),
	64	wbuf.data()
	65	);
	66
	67	CPPUNIT_ASSERT_MESSAGE
	68	(
	69	Message(n, "MB2WC", wbuf, wcs),
	70	wxStrcmp(wbuf, wcs) == 0
	71	);
	72	}
	73	else // conversion is supposed to fail
	74	{
	75	CPPUNIT_ASSERT_MESSAGE
	76	(
	77	Message(n, "MB2WC succeeded"),
	78	!wbuf.data()
	79	);
	80	}
	81	}
	82
	83	if ( wcs && !(flags & ONLY_MB2WC) )
	84	{
	85	wxCharBuffer buf = conv.cWC2MB(wcs);
	86
	87	if ( str )
	88	{
	89	CPPUNIT_ASSERT_MESSAGE
	90	(
	91	Message(n, "WC2MB failed"),
	92	buf.data()
	93	);
	94
	95	CPPUNIT_ASSERT_MESSAGE
	96	(
	97	Message(n, "WC2MB", buf, str),
	98	strcmp(buf, str) == 0
	99	);
	100	}
	101	else
	102	{
	103	CPPUNIT_ASSERT_MESSAGE
	104	(
	105	Message(n, "WC2MB succeeded"),
	106	!buf.data()
	107	);
	108	}
	109	}
	110	}
	111
112	private:
113	static std::string
114	Message(size_t n, const wxString& msg)
115	{
116	return std::string(wxString::Format("#%lu: %s", (unsigned long)n, msg));
117	}
118
119	template <typename T>
120	static std::string
121	Message(size_t n,
122	const char *func,
123	const wxCharTypeBuffer<T>& actual,
124	const T *expected)
125	{
126	return Message(n,
127	wxString::Format("%s returned \"%s\", expected \"%s\"",
128	func, actual.data(), expected));
129	}
130	};
131
387f829e VS	132	// ----------------------------------------------------------------------------
	133	// test class
	134	// ----------------------------------------------------------------------------
	135
	136	class UnicodeTestCase : public CppUnit::TestCase
	137	{
	138	public:
	139	UnicodeTestCase();
	140
	141	private:
	142	CPPUNIT_TEST_SUITE( UnicodeTestCase );
	143	CPPUNIT_TEST( ToFromAscii );
a65ca3e6	144	CPPUNIT_TEST( ConstructorsWithConversion );
bbb0ff36	145	CPPUNIT_TEST( ConversionFixed );
5975f198	146	CPPUNIT_TEST( ConversionWithNULs );
a65ca3e6 VZ	147	CPPUNIT_TEST( ConversionUTF7 );
a65ca3e6 VZ	148	CPPUNIT_TEST( ConversionUTF8 );
5975f198	149	CPPUNIT_TEST( ConversionUTF16 );
a7823b26	150	CPPUNIT_TEST( ConversionUTF32 );
0f0298b1	151	CPPUNIT_TEST( IsConvOk );
b0c4d5d7 VS	152	#if wxUSE_UNICODE
	153	CPPUNIT_TEST( Iteration );
	154	#endif
387f829e VS	155	CPPUNIT_TEST_SUITE_END();
	156
	157	void ToFromAscii();
a65ca3e6	158	void ConstructorsWithConversion();
bbb0ff36	159	void ConversionFixed();
5975f198	160	void ConversionWithNULs();
a65ca3e6 VZ	161	void ConversionUTF7();
a65ca3e6 VZ	162	void ConversionUTF8();
5975f198	163	void ConversionUTF16();
a7823b26	164	void ConversionUTF32();
0f0298b1	165	void IsConvOk();
b0c4d5d7 VS	166	#if wxUSE_UNICODE
	167	void Iteration();
	168	#endif
a65ca3e6	169
387f829e VS	170	DECLARE_NO_COPY_CLASS(UnicodeTestCase)
	171	};
	172
	173	// register in the unnamed registry so that these tests are run by default
	174	CPPUNIT_TEST_SUITE_REGISTRATION( UnicodeTestCase );
	175
e3778b4d	176	// also include in its own registry so that these tests can be run alone
81e9dec6	177	CPPUNIT_TEST_SUITE_NAMED_REGISTRATION( UnicodeTestCase, "UnicodeTestCase" );
387f829e VS	178
	179	UnicodeTestCase::UnicodeTestCase()
	180	{
	181	}
	182
	183	void UnicodeTestCase::ToFromAscii()
	184	{
	185
	186	#define TEST_TO_FROM_ASCII(txt) \
	187	{ \
	188	static const char *msg = txt; \
	189	wxString s = wxString::FromAscii(msg); \
	190	CPPUNIT_ASSERT( strcmp( s.ToAscii() , msg ) == 0 ); \
	191	}
	192
	193	TEST_TO_FROM_ASCII( "Hello, world!" );
	194	TEST_TO_FROM_ASCII( "additional \" special \t test \\ component \n :-)" );
	195	}
	196
a65ca3e6 VZ	197	void UnicodeTestCase::ConstructorsWithConversion()
a65ca3e6 VZ	198	{
4bc9acbe	199	// the string "Déjà" in UTF-8 and wchar_t:
a65ca3e6	200	const unsigned char utf8Buf[] = {0x44,0xC3,0xA9,0x6A,0xC3,0xA0,0};
4bc9acbe	201	const unsigned char utf8subBuf[] = {0x44,0xC3,0xA9,0x6A,0}; // just "Déj"
a65ca3e6 VZ	202	const char utf8 = (char )utf8Buf;
	203	const char utf8sub = (char )utf8subBuf;
	204
	205	wxString s1(utf8, wxConvUTF8);
a65ca3e6 VZ	206
a65ca3e6 VZ	207	#if wxUSE_UNICODE
ae431935	208	const wchar_t wchar[] = {0x44,0xE9,0x6A,0xE0,0};
1de532f5	209	CPPUNIT_ASSERT_EQUAL( wchar, s1 );
ae431935 VZ	210
ae431935 VZ	211	wxString s2(wchar);
1de532f5	212	CPPUNIT_ASSERT_EQUAL( wchar, s2 );
727e8d84	213	CPPUNIT_ASSERT_EQUAL( wxString::FromUTF8(utf8), s2 );
a65ca3e6	214	#else
1de532f5	215	CPPUNIT_ASSERT_EQUAL( utf8, s1 );
a65ca3e6 VZ	216	#endif
	217
	218	wxString sub(utf8sub, wxConvUTF8); // "Dej" substring
	219	wxString s3(utf8, wxConvUTF8, 4);
fa0584f1	220	CPPUNIT_ASSERT_EQUAL( sub, s3 );
a65ca3e6 VZ	221
a65ca3e6 VZ	222	#if wxUSE_UNICODE
ae431935 VZ	223	wxString s4(wchar, wxConvUTF8, 3);
	224	CPPUNIT_ASSERT_EQUAL( sub, s4 );
	225
fa0584f1	226	// conversion should stop with failure at pos 35
4bc9acbe	227	wxString s("\t[pl]open.format.Sformatuj dyskietk\xea=gfloppy %f", wxConvUTF8);
fa0584f1	228	CPPUNIT_ASSERT( s.empty() );
ae431935	229	#endif // wxUSE_UNICODE
d7330233 VS	230
	231
	232	// test using Unicode strings together with char* strings (this must work
	233	// in ANSI mode as well, of course):
	234	wxString s5("ascii");
1de532f5	235	CPPUNIT_ASSERT_EQUAL( "ascii", s5 );
d7330233 VS	236
	237	s5 += " value";
	238
	239	CPPUNIT_ASSERT( strcmp(s5.mb_str(), "ascii value") == 0 );
1de532f5	240	CPPUNIT_ASSERT_EQUAL( "ascii value", s5 );
d7330233	241	CPPUNIT_ASSERT( s5 != "SomethingElse" );
a65ca3e6 VZ	242	}
a65ca3e6 VZ	243
bbb0ff36	244	void UnicodeTestCase::ConversionFixed()
85d3e5a9 VZ	245	{
	246	size_t len;
	247
	248	#if wxUSE_UNICODE
93a800a9	249	wxConvLibc.cWC2MB(L"", 0, &len);
85d3e5a9	250	#else // !wxUSE_UNICODE
93a800a9	251	wxConvLibc.cMB2WC("", 0, &len);
85d3e5a9 VZ	252	#endif // wxUSE_UNICODE/!wxUSE_UNICODE
85d3e5a9 VZ	253
93a800a9	254	CPPUNIT_ASSERT_EQUAL( 0, len );
bbb0ff36 VZ	255
	256	#if wxUSE_UNICODE
	257	// check that when we convert a fixed number of characters we obtain the
	258	// expected return value
	259	CPPUNIT_ASSERT_EQUAL( 0, wxConvLibc.ToWChar(NULL, 0, "", 0) );
	260	CPPUNIT_ASSERT_EQUAL( 1, wxConvLibc.ToWChar(NULL, 0, "x", 1) );
	261	CPPUNIT_ASSERT_EQUAL( 2, wxConvLibc.ToWChar(NULL, 0, "x", 2) );
	262	CPPUNIT_ASSERT_EQUAL( 2, wxConvLibc.ToWChar(NULL, 0, "xy", 2) );
	263	#endif // wxUSE_UNICODE
85d3e5a9 VZ	264	}
85d3e5a9 VZ	265
5975f198	266	void UnicodeTestCase::ConversionWithNULs()
a65ca3e6 VZ	267	{
a65ca3e6 VZ	268	#if wxUSE_UNICODE
85d3e5a9	269	static const size_t lenNulString = 10;
7ce0c58f	270
85d3e5a9 VZ	271	wxString szTheString(L"The\0String", wxConvLibc, lenNulString);
85d3e5a9 VZ	272	wxCharBuffer theBuffer = szTheString.mb_str();
a65ca3e6	273
85d3e5a9 VZ	274	CPPUNIT_ASSERT( memcmp(theBuffer.data(), "The\0String",
85d3e5a9 VZ	275	lenNulString + 1) == 0 );
a65ca3e6	276
85d3e5a9 VZ	277	wxString szTheString2("The\0String", wxConvLocal, lenNulString);
	278	CPPUNIT_ASSERT_EQUAL( lenNulString, szTheString2.length() );
	279	CPPUNIT_ASSERT( wxTmemcmp(szTheString2.c_str(), L"The\0String",
	280	lenNulString + 1) == 0 );
2877b828	281	#else // !wxUSE_UNICODE
ae431935	282	wxString szTheString("TheString");
85d3e5a9 VZ	283	szTheString.insert(3, 1, '\0');
85d3e5a9 VZ	284	wxWCharBuffer theBuffer = szTheString.wc_str(wxConvLibc);
a65ca3e6	285
85d3e5a9	286	CPPUNIT_ASSERT( memcmp(theBuffer.data(), L"The\0String", 11 * sizeof(wchar_t)) == 0 );
a65ca3e6	287
ae431935	288	wxString szLocalTheString("TheString");
85d3e5a9 VZ	289	szLocalTheString.insert(3, 1, '\0');
85d3e5a9 VZ	290	wxWCharBuffer theLocalBuffer = szLocalTheString.wc_str(wxConvLocal);
a65ca3e6	291
85d3e5a9	292	CPPUNIT_ASSERT( memcmp(theLocalBuffer.data(), L"The\0String", 11 * sizeof(wchar_t)) == 0 );
2877b828	293	#endif // wxUSE_UNICODE/!wxUSE_UNICODE
a65ca3e6 VZ	294	}
a65ca3e6 VZ	295
a65ca3e6 VZ	296	void UnicodeTestCase::ConversionUTF7()
	297	{
	298	static const StringConversionData utf7data[] =
	299	{
ae431935	300	// normal fragments
527587d3 VZ	301	StringConversionData("+AKM-", L"\xa3"),
527587d3 VZ	302	StringConversionData("+AOk-t+AOk-", L"\xe9t\xe9"),
ae431935	303
42e8b52f	304	// this one is an alternative valid encoding of the same string
527587d3 VZ	305	StringConversionData("+AOk-t+AOk", L"\xe9t\xe9",
527587d3 VZ	306	StringConversionData::ONLY_MB2WC),
42e8b52f	307
ae431935	308	// some special cases
527587d3 VZ	309	StringConversionData("+-", L"+"),
527587d3 VZ	310	StringConversionData("+--", L"+-"),
8da7a00a	311
8da7a00a	312	// the following are invalid UTF-7 sequences
527587d3 VZ	313	StringConversionData("\xa3", NULL),
	314	StringConversionData("+", NULL),
	315	StringConversionData("+~", NULL),
	316	StringConversionData("a+", NULL),
a65ca3e6 VZ	317	};
a65ca3e6 VZ	318
a65ca3e6 VZ	319	for ( size_t n = 0; n < WXSIZEOF(utf7data); n++ )
	320	{
	321	const StringConversionData& d = utf7data[n];
b901ac2c VZ	322
	323	// converting to/from UTF-7 using iconv() currently doesn't work
	324	// because of several problems:
	325	// - GetMBNulLen() doesn't return correct result (iconv converts L'\0'
	326	// to an incomplete and anyhow nonsensical "+AA" string)
	327	// - iconv refuses to convert "+-" (although it converts "+-\n" just
	328	// fine, go figure)
	329	//
	330	// I have no idea how to fix this so just disable the test for now
	331	#if 0
42e8b52f	332	d.Test(n, wxCSConv("utf-7"));
b901ac2c	333	#endif
42e8b52f	334	d.Test(n, wxConvUTF7);
a65ca3e6 VZ	335	}
	336	}
	337
	338	void UnicodeTestCase::ConversionUTF8()
	339	{
	340	static const StringConversionData utf8data[] =
	341	{
8da7a00a	342	#ifdef wxHAVE_U_ESCAPE
527587d3	343	StringConversionData("\xc2\xa3", L"\u00a3"),
a65ca3e6	344	#endif
527587d3	345	StringConversionData("\xc2", NULL),
a65ca3e6 VZ	346	};
a65ca3e6 VZ	347
9a83f860	348	wxCSConv conv(wxT("utf-8"));
a65ca3e6 VZ	349	for ( size_t n = 0; n < WXSIZEOF(utf8data); n++ )
	350	{
	351	const StringConversionData& d = utf8data[n];
42e8b52f VZ	352	d.Test(n, conv);
42e8b52f VZ	353	d.Test(n, wxConvUTF8);
a65ca3e6 VZ	354	}
	355	}
	356
5975f198 VZ	357	void UnicodeTestCase::ConversionUTF16()
	358	{
	359	static const StringConversionData utf16data[] =
	360	{
	361	#ifdef wxHAVE_U_ESCAPE
527587d3 VZ	362	StringConversionData(
	363	"\x04\x1f\x04\x40\x04\x38\x04\x32\x04\x35\x04\x42\0\0",
	364	L"\u041f\u0440\u0438\u0432\u0435\u0442"),
	365	StringConversionData(
	366	"\x01\0\0b\x01\0\0a\x01\0\0r\0\0",
	367	L"\u0100b\u0100a\u0100r"),
5975f198	368	#endif
527587d3	369	StringConversionData("\0f\0o\0o\0\0", L"foo"),
5975f198 VZ	370	};
	371
	372	wxCSConv conv(wxFONTENCODING_UTF16BE);
	373	for ( size_t n = 0; n < WXSIZEOF(utf16data); n++ )
	374	{
	375	const StringConversionData& d = utf16data[n];
42e8b52f	376	d.Test(n, conv);
5975f198	377	}
2877b828 VZ	378
	379	// special case: this string has consecutive NULs inside it which don't
	380	// terminate the string, this exposed a bug in our conversion code which
	381	// got confused in this case
	382	size_t len;
93a800a9 VZ	383	conv.cMB2WC("\x01\0\0B\0C" /* A macron BC */, 6, &len);
93a800a9 VZ	384	CPPUNIT_ASSERT_EQUAL( 3, len );
5975f198 VZ	385	}
5975f198 VZ	386
a7823b26 VZ	387	void UnicodeTestCase::ConversionUTF32()
	388	{
	389	static const StringConversionData utf32data[] =
	390	{
	391	#ifdef wxHAVE_U_ESCAPE
527587d3	392	StringConversionData(
72b2fc5c	393	"\0\0\x04\x1f\0\0\x04\x40\0\0\x04\x38\0\0\x04\x32\0\0\x04\x35\0\0\x04\x42\0\0\0\0",
527587d3	394	L"\u041f\u0440\u0438\u0432\u0435\u0442"),
a7823b26	395	#endif
527587d3	396	StringConversionData("\0\0\0f\0\0\0o\0\0\0o\0\0\0\0", L"foo"),
a7823b26 VZ	397	};
	398
	399	wxCSConv conv(wxFONTENCODING_UTF32BE);
	400	for ( size_t n = 0; n < WXSIZEOF(utf32data); n++ )
	401	{
	402	const StringConversionData& d = utf32data[n];
42e8b52f	403	d.Test(n, conv);
a7823b26 VZ	404	}
	405
	406	size_t len;
93a800a9 VZ	407	conv.cMB2WC("\0\0\x01\0\0\0\0B\0\0\0C" /* A macron BC */, 12, &len);
93a800a9 VZ	408	CPPUNIT_ASSERT_EQUAL( 3, len );
a7823b26 VZ	409	}
a7823b26 VZ	410
0f0298b1 VZ	411	void UnicodeTestCase::IsConvOk()
	412	{
	413	CPPUNIT_ASSERT( wxCSConv(wxFONTENCODING_SYSTEM).IsOk() );
e3276230 VZ	414	CPPUNIT_ASSERT( wxCSConv("US-ASCII").IsOk() );
	415	CPPUNIT_ASSERT( wxCSConv("UTF-8").IsOk() );
	416	CPPUNIT_ASSERT( !wxCSConv("NoSuchConversion").IsOk() );
0f0298b1 VZ	417
0f0298b1 VZ	418	#ifdef __WINDOWS__
e3276230	419	CPPUNIT_ASSERT( wxCSConv("WINDOWS-437").IsOk() );
0f0298b1 VZ	420	#endif
	421	}
	422
#if wxUSE_UNICODE
// Check reading and writing a string containing non-ASCII characters through
// wxString iterators, including that previously obtained end iterators still
// compare equal to end() after writing through another iterator.
void UnicodeTestCase::Iteration()
{
    // "czech" in Czech ("cestina"):
    static const char *textUTF8 = "\304\215e\305\241tina";
    static const wchar_t textUTF16[] = {0x10D, 0x65, 0x161, 0x74, 0x69, 0x6E, 0x61, 0};

    wxString text(wxString::FromUTF8(textUTF8));
    CPPUNIT_ASSERT( wxStrcmp(text.wc_str(), textUTF16) == 0 );

    // verify the string was decoded correctly:
    {
        size_t idx = 0;
        wxString::const_iterator i = text.begin();
        while ( i != text.end() )
        {
            CPPUNIT_ASSERT( *i == textUTF16[idx] );

            ++i;
            ++idx;
        }
    }

    // overwrite the string with something that is shorter in UTF-8:
    {
        wxString::iterator i = text.begin();
        while ( i != text.end() )
        {
            *i = 'x';
            ++i;
        }
    }

    // restore the original text now:
    {
        // these must remain equal to end() all along
        wxString::iterator end1 = text.end();
        wxString::const_iterator end2 = text.end();

        size_t idx = 0;
        for ( wxString::iterator i = text.begin(); i != text.end(); ++i, ++idx )
        {
            *i = textUTF16[idx];

            CPPUNIT_ASSERT( end1 == text.end() );
            CPPUNIT_ASSERT( end2 == text.end() );
        }

        CPPUNIT_ASSERT( end1 == text.end() );
        CPPUNIT_ASSERT( end2 == text.end() );
    }

    // and verify it again:
    {
        size_t idx = 0;
        wxString::const_iterator i = text.begin();
        while ( i != text.end() )
        {
            CPPUNIT_ASSERT( *i == textUTF16[idx] );

            ++i;
            ++idx;
        }
    }
}
#endif // wxUSE_UNICODE
210bfffb	476