don't run the tests which can't succeed in ANSI build; remove the tests for wxUSE_WCH...
[wxWidgets.git] / tests / strings / unicode.cpp
1 ///////////////////////////////////////////////////////////////////////////////
2 // Name: tests/strings/unicode.cpp
3 // Purpose: Unicode unit test
4 // Author: Vadim Zeitlin, Wlodzimierz ABX Skiba
5 // Created: 2004-04-28
6 // RCS-ID: $Id$
7 // Copyright: (c) 2004 Vadim Zeitlin, Wlodzimierz Skiba
8 ///////////////////////////////////////////////////////////////////////////////
9
10 // ----------------------------------------------------------------------------
11 // headers
12 // ----------------------------------------------------------------------------
13
14 #include "testprec.h"
15
16 #ifdef __BORLANDC__
17 #pragma hdrstop
18 #endif
19
20 #ifndef WX_PRECOMP
21 #include "wx/wx.h"
22 #endif // WX_PRECOMP
23
24 // ----------------------------------------------------------------------------
25 // test class
26 // ----------------------------------------------------------------------------
27
28 class UnicodeTestCase : public CppUnit::TestCase
29 {
30 public:
31 UnicodeTestCase();
32
33 private:
34 CPPUNIT_TEST_SUITE( UnicodeTestCase );
35 CPPUNIT_TEST( ToFromAscii );
36 CPPUNIT_TEST( ConstructorsWithConversion );
37 CPPUNIT_TEST( ConversionEmpty );
38 CPPUNIT_TEST( ConversionWithNULs );
39 CPPUNIT_TEST( ConversionUTF7 );
40 CPPUNIT_TEST( ConversionUTF8 );
41 CPPUNIT_TEST( ConversionUTF16 );
42 CPPUNIT_TEST( ConversionUTF32 );
43 CPPUNIT_TEST( IsConvOk );
44 #if wxUSE_UNICODE
45 CPPUNIT_TEST( Iteration );
46 #endif
47 CPPUNIT_TEST_SUITE_END();
48
49 void ToFromAscii();
50 void ConstructorsWithConversion();
51 void ConversionEmpty();
52 void ConversionWithNULs();
53 void ConversionUTF7();
54 void ConversionUTF8();
55 void ConversionUTF16();
56 void ConversionUTF32();
57 void IsConvOk();
58 #if wxUSE_UNICODE
59 void Iteration();
60 #endif
61
62 // test if converting s using the given encoding gives ws and vice versa
63 //
64 // if either of the first 2 arguments is NULL, the conversion is supposed
65 // to fail
66 void DoTestConversion(const char *s, const wchar_t *w, wxMBConv& conv);
67
68
69 DECLARE_NO_COPY_CLASS(UnicodeTestCase)
70 };
71
72 // register in the unnamed registry so that these tests are run by default
73 CPPUNIT_TEST_SUITE_REGISTRATION( UnicodeTestCase );
74
75 // also include in it's own registry so that these tests can be run alone
76 CPPUNIT_TEST_SUITE_NAMED_REGISTRATION( UnicodeTestCase, "UnicodeTestCase" );
77
78 UnicodeTestCase::UnicodeTestCase()
79 {
80 }
81
82 void UnicodeTestCase::ToFromAscii()
83 {
84
85 #define TEST_TO_FROM_ASCII(txt) \
86 { \
87 static const char *msg = txt; \
88 wxString s = wxString::FromAscii(msg); \
89 CPPUNIT_ASSERT( strcmp( s.ToAscii() , msg ) == 0 ); \
90 }
91
92 TEST_TO_FROM_ASCII( "Hello, world!" );
93 TEST_TO_FROM_ASCII( "additional \" special \t test \\ component \n :-)" );
94 }
95
96 void UnicodeTestCase::ConstructorsWithConversion()
97 {
98 // the string "Déjà" in UTF-8 and wchar_t:
99 const unsigned char utf8Buf[] = {0x44,0xC3,0xA9,0x6A,0xC3,0xA0,0};
100 const unsigned char utf8subBuf[] = {0x44,0xC3,0xA9,0x6A,0}; // just "Déj"
101 const char *utf8 = (char *)utf8Buf;
102 const char *utf8sub = (char *)utf8subBuf;
103
104 wxString s1(utf8, wxConvUTF8);
105
106 #if wxUSE_UNICODE
107 const wchar_t wchar[] = {0x44,0xE9,0x6A,0xE0,0};
108 WX_ASSERT_STR_EQUAL( wchar, s1 );
109
110 wxString s2(wchar);
111 WX_ASSERT_STR_EQUAL( wchar, s2 );
112 WX_ASSERT_STR_EQUAL( utf8, s2 );
113 #else
114 WX_ASSERT_STR_EQUAL( utf8, s1 );
115 #endif
116
117 wxString sub(utf8sub, wxConvUTF8); // "Dej" substring
118 wxString s3(utf8, wxConvUTF8, 4);
119 CPPUNIT_ASSERT_EQUAL( sub, s3 );
120
121 #if wxUSE_UNICODE
122 wxString s4(wchar, wxConvUTF8, 3);
123 CPPUNIT_ASSERT_EQUAL( sub, s4 );
124
125 // conversion should stop with failure at pos 35
126 wxString s("\t[pl]open.format.Sformatuj dyskietkê=gfloppy %f", wxConvUTF8);
127 CPPUNIT_ASSERT( s.empty() );
128 #endif // wxUSE_UNICODE
129
130
131 // test using Unicode strings together with char* strings (this must work
132 // in ANSI mode as well, of course):
133 wxString s5("ascii");
134 WX_ASSERT_STR_EQUAL( "ascii", s5 );
135
136 s5 += " value";
137
138 CPPUNIT_ASSERT( strcmp(s5.mb_str(), "ascii value") == 0 );
139 WX_ASSERT_STR_EQUAL( "ascii value", s5 );
140 CPPUNIT_ASSERT( s5 != "SomethingElse" );
141 }
142
143 void UnicodeTestCase::ConversionEmpty()
144 {
145 size_t len;
146
147 #if wxUSE_UNICODE
148 wxCharBuffer buf = wxConvLibc.cWC2MB(L"", 0, &len);
149 #else // !wxUSE_UNICODE
150 wxWCharBuffer wbuf = wxConvLibc.cMB2WC("", 0, &len);
151 #endif // wxUSE_UNICODE/!wxUSE_UNICODE
152
153 CPPUNIT_ASSERT(len == 0);
154 }
155
156 void UnicodeTestCase::ConversionWithNULs()
157 {
158 #if wxUSE_UNICODE
159 static const size_t lenNulString = 10;
160
161 wxString szTheString(L"The\0String", wxConvLibc, lenNulString);
162 wxCharBuffer theBuffer = szTheString.mb_str();
163
164 CPPUNIT_ASSERT( memcmp(theBuffer.data(), "The\0String",
165 lenNulString + 1) == 0 );
166
167 wxString szTheString2("The\0String", wxConvLocal, lenNulString);
168 CPPUNIT_ASSERT_EQUAL( lenNulString, szTheString2.length() );
169 CPPUNIT_ASSERT( wxTmemcmp(szTheString2.c_str(), L"The\0String",
170 lenNulString + 1) == 0 );
171 #else // !wxUSE_UNICODE
172 wxString szTheString("TheString");
173 szTheString.insert(3, 1, '\0');
174 wxWCharBuffer theBuffer = szTheString.wc_str(wxConvLibc);
175
176 CPPUNIT_ASSERT( memcmp(theBuffer.data(), L"The\0String", 11 * sizeof(wchar_t)) == 0 );
177
178 wxString szLocalTheString("TheString");
179 szLocalTheString.insert(3, 1, '\0');
180 wxWCharBuffer theLocalBuffer = szLocalTheString.wc_str(wxConvLocal);
181
182 CPPUNIT_ASSERT( memcmp(theLocalBuffer.data(), L"The\0String", 11 * sizeof(wchar_t)) == 0 );
183 #endif // wxUSE_UNICODE/!wxUSE_UNICODE
184 }
185
186 void
187 UnicodeTestCase::DoTestConversion(const char *s,
188 const wchar_t *ws,
189 wxMBConv& conv)
190 {
191 if ( ws )
192 {
193 wxCharBuffer buf = conv.cWC2MB(ws, (size_t)-1, NULL);
194
195 CPPUNIT_ASSERT( strcmp(buf, s) == 0 );
196 }
197
198 if ( s )
199 {
200 wxWCharBuffer wbuf = conv.cMB2WC(s, (size_t)-1, NULL);
201
202 if ( ws )
203 {
204 CPPUNIT_ASSERT( wbuf.data() );
205 CPPUNIT_ASSERT( wxStrcmp(wbuf, ws) == 0 );
206 }
207 else // conversion is supposed to fail
208 {
209 CPPUNIT_ASSERT_EQUAL( (wchar_t *)NULL, wbuf.data() );
210 }
211 }
212 }
213
// An element of the conversion test tables: a multibyte string together
// with the wide string it is compared against. The tables in this file use
// NULL for wcs to mark an invalid multibyte sequence.
struct StringConversionData
{
    const char *str;        // the text as a multibyte string
    const wchar_t *wcs;     // the same text as a wide string
};
219
220 void UnicodeTestCase::ConversionUTF7()
221 {
222 static const StringConversionData utf7data[] =
223 {
224 // normal fragments
225 { "+AKM-", L"\xa3" },
226 { "+AOk-t+AOk-", L"\xe9t\xe9" },
227
228 // some special cases
229 { "+-", L"+" },
230 { "+--", L"+-" },
231
232 // the following are invalid UTF-7 sequences
233 { "\xa3", NULL },
234 { "+", NULL },
235 { "+~", NULL },
236 { "a+", NULL },
237 };
238
239 for ( size_t n = 0; n < WXSIZEOF(utf7data); n++ )
240 {
241 const StringConversionData& d = utf7data[n];
242
243 // converting to/from UTF-7 using iconv() currently doesn't work
244 // because of several problems:
245 // - GetMBNulLen() doesn't return correct result (iconv converts L'\0'
246 // to an incomplete and anyhow nonsensical "+AA" string)
247 // - iconv refuses to convert "+-" (although it converts "+-\n" just
248 // fine, go figure)
249 //
250 // I have no idea how to fix this so just disable the test for now
251 #if 0
252 DoTestConversion(d.str, d.wcs, wxCSConv("utf-7"));
253 #endif
254 DoTestConversion(d.str, d.wcs, wxConvUTF7);
255 }
256 }
257
258 void UnicodeTestCase::ConversionUTF8()
259 {
260 static const StringConversionData utf8data[] =
261 {
262 #ifdef wxHAVE_U_ESCAPE
263 { "\xc2\xa3", L"\u00a3" },
264 #endif
265 { "\xc2", NULL },
266 };
267
268 wxCSConv conv(_T("utf-8"));
269 for ( size_t n = 0; n < WXSIZEOF(utf8data); n++ )
270 {
271 const StringConversionData& d = utf8data[n];
272 DoTestConversion(d.str, d.wcs, conv);
273 DoTestConversion(d.str, d.wcs, wxConvUTF8);
274 }
275 }
276
277 void UnicodeTestCase::ConversionUTF16()
278 {
279 static const StringConversionData utf16data[] =
280 {
281 #ifdef wxHAVE_U_ESCAPE
282 { "\x04\x1f\x04\x40\x04\x38\x04\x32\x04\x35\x04\x42\0\0",
283 L"\u041f\u0440\u0438\u0432\u0435\u0442" },
284 { "\x01\0\0b\x01\0\0a\x01\0\0r\0\0", L"\u0100b\u0100a\u0100r" },
285 #endif
286 { "\0f\0o\0o\0\0", L"foo" },
287 };
288
289 wxCSConv conv(wxFONTENCODING_UTF16BE);
290 for ( size_t n = 0; n < WXSIZEOF(utf16data); n++ )
291 {
292 const StringConversionData& d = utf16data[n];
293 DoTestConversion(d.str, d.wcs, conv);
294 }
295
296 // special case: this string has consecutive NULs inside it which don't
297 // terminate the string, this exposed a bug in our conversion code which
298 // got confused in this case
299 size_t len;
300 wxWCharBuffer wbuf(conv.cMB2WC("\x01\0\0B\0C" /* A macron BC */, 6, &len));
301 CPPUNIT_ASSERT_EQUAL( (size_t)3, len );
302 }
303
304 void UnicodeTestCase::ConversionUTF32()
305 {
306 static const StringConversionData utf32data[] =
307 {
308 #ifdef wxHAVE_U_ESCAPE
309 {
310 "\0\0\x04\x1f\0\0\x04\x40\0\0\x04\x38\0\0\x04\x32\0\0\x04\x35\0\0\x04\x42\0\0\0\0",
311 L"\u041f\u0440\u0438\u0432\u0435\u0442" },
312 #endif
313 { "\0\0\0f\0\0\0o\0\0\0o\0\0\0\0", L"foo" },
314 };
315
316 wxCSConv conv(wxFONTENCODING_UTF32BE);
317 for ( size_t n = 0; n < WXSIZEOF(utf32data); n++ )
318 {
319 const StringConversionData& d = utf32data[n];
320 DoTestConversion(d.str, d.wcs, conv);
321 }
322
323 size_t len;
324 wxWCharBuffer wbuf(conv.cMB2WC("\0\0\x01\0\0\0\0B\0\0\0C" /* A macron BC */,
325 12, &len));
326 CPPUNIT_ASSERT_EQUAL( (size_t)3, len );
327 }
328
329 void UnicodeTestCase::IsConvOk()
330 {
331 CPPUNIT_ASSERT( wxCSConv(wxFONTENCODING_SYSTEM).IsOk() );
332 CPPUNIT_ASSERT( wxCSConv(_T("UTF-8")).IsOk() );
333 CPPUNIT_ASSERT( !wxCSConv(_T("NoSuchConversion")).IsOk() );
334
335 #ifdef __WINDOWS__
336 CPPUNIT_ASSERT( wxCSConv(_T("WINDOWS-437")).IsOk() );
337 #endif
338 }
339
#if wxUSE_UNICODE
// Check reading and writing the characters of a string containing non-ASCII
// characters through wxString iterators, and that the end iterators obtained
// before writing still compare equal to end() afterwards.
void UnicodeTestCase::Iteration()
{
    // "czech" in Czech ("cestina"):
    static const char *textUTF8 = "\304\215e\305\241tina";
    static const wchar_t textUTF16[] = {0x10D, 0x65, 0x161, 0x74, 0x69, 0x6E, 0x61, 0};

    wxString text(wxString::FromUTF8(textUTF8));
    CPPUNIT_ASSERT( wxStrcmp(text.wc_str(), textUTF16) == 0 );

    // check that iterating over the string yields the expected characters:
    {
        size_t pos = 0;
        wxString::const_iterator it = text.begin();
        while ( it != text.end() )
        {
            CPPUNIT_ASSERT( *it == textUTF16[pos] );

            ++it;
            ++pos;
        }
    }

    // replace all characters with something that is shorter in UTF-8:
    {
        wxString::iterator it = text.begin();
        while ( it != text.end() )
        {
            *it = 'x';

            ++it;
        }
    }

    // now put the original characters back:
    {
        // these iterators are obtained before the string is modified and are
        // compared with end() after each modification
        wxString::iterator end1 = text.end();
        wxString::const_iterator end2 = text.end();

        size_t pos = 0;
        wxString::iterator it = text.begin();
        while ( it != text.end() )
        {
            *it = textUTF16[pos];

            CPPUNIT_ASSERT( end1 == text.end() );
            CPPUNIT_ASSERT( end2 == text.end() );

            ++it;
            ++pos;
        }

        CPPUNIT_ASSERT( end1 == text.end() );
        CPPUNIT_ASSERT( end2 == text.end() );
    }

    // and check the string contents once more:
    {
        size_t pos = 0;
        wxString::const_iterator it = text.begin();
        while ( it != text.end() )
        {
            CPPUNIT_ASSERT( *it == textUTF16[pos] );

            ++it;
            ++pos;
        }
    }
}
#endif // wxUSE_UNICODE