]> git.saurik.com Git - wxWidgets.git/blob - tests/strings/unicode.cpp
Added quick implementation of wxDataViewChoiceByIndexRenderer, closes #11970 (wxDataV...
[wxWidgets.git] / tests / strings / unicode.cpp
1 ///////////////////////////////////////////////////////////////////////////////
2 // Name: tests/strings/unicode.cpp
3 // Purpose: Unicode unit test
4 // Author: Vadim Zeitlin, Wlodzimierz ABX Skiba
5 // Created: 2004-04-28
6 // RCS-ID: $Id$
7 // Copyright: (c) 2004 Vadim Zeitlin, Wlodzimierz Skiba
8 ///////////////////////////////////////////////////////////////////////////////
9
10 // ----------------------------------------------------------------------------
11 // headers
12 // ----------------------------------------------------------------------------
13
14 #include "testprec.h"
15
16 #ifdef __BORLANDC__
17 #pragma hdrstop
18 #endif
19
20 #ifndef WX_PRECOMP
21 #include "wx/wx.h"
22 #endif // WX_PRECOMP
23
24 #include "wx/encconv.h"
25
26 // ----------------------------------------------------------------------------
27 // helper class holding the matching MB and WC strings
28 // ----------------------------------------------------------------------------
29
30 struct StringConversionData
31 {
32 // either str or wcs (but not both) may be NULL, this means that the conversion
33 // to it should fail
34 StringConversionData(const char *str_, const wchar_t *wcs_, int flags_ = 0)
35 : str(str_), wcs(wcs_), flags(flags_)
36 {
37 }
38
39 const char * const str;
40 const wchar_t * const wcs;
41
42 enum
43 {
44 TEST_BOTH = 0, // test both str -> wcs and wcs -> str
45 ONLY_MB2WC = 1 // only test str -> wcs conversion
46 };
47
48 const int flags;
49
50 // test that the conversion between str and wcs (subject to flags) succeeds
51 //
52 // the first argument is the index in the test array and is used solely for
53 // diagnostics
54 void Test(size_t n, wxMBConv& conv) const
55 {
56 if ( str )
57 {
58 wxWCharBuffer wbuf = conv.cMB2WC(str);
59
60 if ( wcs )
61 {
62 CPPUNIT_ASSERT_MESSAGE
63 (
64 Message(n, "MB2WC failed"),
65 wbuf.data()
66 );
67
68 CPPUNIT_ASSERT_MESSAGE
69 (
70 Message(n, "MB2WC", wbuf, wcs),
71 wxStrcmp(wbuf, wcs) == 0
72 );
73 }
74 else // conversion is supposed to fail
75 {
76 CPPUNIT_ASSERT_MESSAGE
77 (
78 Message(n, "MB2WC succeeded"),
79 !wbuf.data()
80 );
81 }
82 }
83
84 if ( wcs && !(flags & ONLY_MB2WC) )
85 {
86 wxCharBuffer buf = conv.cWC2MB(wcs);
87
88 if ( str )
89 {
90 CPPUNIT_ASSERT_MESSAGE
91 (
92 Message(n, "WC2MB failed"),
93 buf.data()
94 );
95
96 CPPUNIT_ASSERT_MESSAGE
97 (
98 Message(n, "WC2MB", buf, str),
99 strcmp(buf, str) == 0
100 );
101 }
102 else
103 {
104 CPPUNIT_ASSERT_MESSAGE
105 (
106 Message(n, "WC2MB succeeded"),
107 !buf.data()
108 );
109 }
110 }
111 }
112
113 private:
114 static std::string
115 Message(size_t n, const wxString& msg)
116 {
117 return std::string(wxString::Format("#%lu: %s", (unsigned long)n, msg));
118 }
119
120 template <typename T>
121 static std::string
122 Message(size_t n,
123 const char *func,
124 const wxCharTypeBuffer<T>& actual,
125 const T *expected)
126 {
127 return Message(n,
128 wxString::Format("%s returned \"%s\", expected \"%s\"",
129 func, actual.data(), expected));
130 }
131 };
132
133 // ----------------------------------------------------------------------------
134 // test data for UnicodeTestCase::Utf8()
135 // ----------------------------------------------------------------------------
136
// NUL-terminated UTF-8 sample made of six words of two-byte (Cyrillic)
// characters separated by single spaces; one word per row below, each row
// ending with the separator (32) or the terminating 0.
static const unsigned char utf8koi8r[] =
{
    208, 157, 208, 181, 209, 129, 208, 186, 208, 176,
    208, 183, 208, 176, 208, 189, 208, 189, 208, 190, 32,

    208, 191, 208, 190, 209, 128, 208, 176, 208, 180,
    208, 190, 208, 178, 208, 176, 208, 187, 32,

    208, 188, 208, 181, 208, 189, 209, 143, 32,

    209, 129, 208, 178, 208, 190, 208, 181, 208, 185, 32,

    208, 186, 209, 128, 209, 131, 209, 130, 208, 181,
    208, 185, 209, 136, 208, 181, 208, 185, 32,

    208, 189, 208, 190, 208, 178, 208, 190, 209, 129,
    209, 130, 209, 140, 209, 142, 0
};
147
// NUL-terminated UTF-8 sample consisting of ASCII letters plus a few
// two-byte sequences (0xc3 0xa8 and 0xc3 0xa9); one space-separated word per
// row below.
static const unsigned char utf8iso8859_1[] =
{
    0x53, 0x79, 0x73, 0x74, 0xc3, 0xa8, 0x6d, 0x65, 0x73, 0x20,
    0x49, 0x6e, 0x74, 0xc3, 0xa9, 0x67, 0x72, 0x61, 0x62, 0x6c, 0x65, 0x73, 0x20,
    0x65, 0x6e, 0x20,
    0x4d, 0xc3, 0xa9, 0x63, 0x61, 0x6e, 0x69, 0x71, 0x75, 0x65, 0x20,
    0x43, 0x6c, 0x61, 0x73, 0x73, 0x69, 0x71, 0x75, 0x65, 0x20,
    0x65, 0x74, 0x20,
    0x51, 0x75, 0x61, 0x6e, 0x74, 0x69, 0x71, 0x75, 0x65, 0
};
156
// NUL-terminated sample mixing ASCII markup and digits with three-byte
// sequences; one token per row below.
//
// NOTE(review): in spite of the name, the bytes look like well-formed UTF-8
// and Utf8() expects their conversion to succeed -- confirm what "Invalid"
// refers to.
static const unsigned char utf8Invalid[] =
{
    0x3c, 0x64, 0x69, 0x73, 0x70, 0x6c, 0x61, 0x79, 0x3e,   // "<display>"
    0x32, 0x30, 0x30, 0x32,                                 // "2002"
    0xe5, 0xb9, 0xb4,
    0x30, 0x39,                                             // "09"
    0xe6, 0x9c, 0x88,
    0x32, 0x35,                                             // "25"
    0xe6, 0x97, 0xa5,
    0x20,                                                   // " "
    0x30, 0x37,                                             // "07"
    0xe6, 0x99, 0x82,
    0x33, 0x39,                                             // "39"
    0xe5, 0x88, 0x86,
    0x35, 0x37,                                             // "57"
    0xe7, 0xa7, 0x92,
    0x3c, 0x2f, 0x64, 0x69, 0x73, 0x70, 0x6c, 0x61, 0x79,   // "</display"
    0
};
165
166 static const struct Utf8Data
167 {
168 const unsigned char *text;
169 size_t len;
170 const wxChar *charset;
171 wxFontEncoding encoding;
172 } utf8data[] =
173 {
174 { utf8Invalid, WXSIZEOF(utf8Invalid), wxT("iso8859-1"), wxFONTENCODING_ISO8859_1 },
175 { utf8koi8r, WXSIZEOF(utf8koi8r), wxT("koi8-r"), wxFONTENCODING_KOI8 },
176 { utf8iso8859_1, WXSIZEOF(utf8iso8859_1), wxT("iso8859-1"), wxFONTENCODING_ISO8859_1 },
177 };
178
179
180 // ----------------------------------------------------------------------------
181 // test class
182 // ----------------------------------------------------------------------------
183
184 class UnicodeTestCase : public CppUnit::TestCase
185 {
186 public:
187 UnicodeTestCase();
188
189 private:
190 CPPUNIT_TEST_SUITE( UnicodeTestCase );
191 CPPUNIT_TEST( ToFromAscii );
192 CPPUNIT_TEST( ConstructorsWithConversion );
193 CPPUNIT_TEST( ConversionFixed );
194 CPPUNIT_TEST( ConversionWithNULs );
195 CPPUNIT_TEST( ConversionUTF7 );
196 CPPUNIT_TEST( ConversionUTF8 );
197 CPPUNIT_TEST( ConversionUTF16 );
198 CPPUNIT_TEST( ConversionUTF32 );
199 CPPUNIT_TEST( IsConvOk );
200 #if wxUSE_UNICODE
201 CPPUNIT_TEST( Iteration );
202 #endif
203 CPPUNIT_TEST( Utf8 );
204 CPPUNIT_TEST( EncodingConverter );
205 CPPUNIT_TEST_SUITE_END();
206
207 void ToFromAscii();
208 void ConstructorsWithConversion();
209 void ConversionFixed();
210 void ConversionWithNULs();
211 void ConversionUTF7();
212 void ConversionUTF8();
213 void ConversionUTF16();
214 void ConversionUTF32();
215 void IsConvOk();
216 #if wxUSE_UNICODE
217 void Iteration();
218 #endif
219 void Utf8();
220 void EncodingConverter();
221
222 DECLARE_NO_COPY_CLASS(UnicodeTestCase)
223 };
224
225 // register in the unnamed registry so that these tests are run by default
226 CPPUNIT_TEST_SUITE_REGISTRATION( UnicodeTestCase );
227
228 // also include in it's own registry so that these tests can be run alone
229 CPPUNIT_TEST_SUITE_NAMED_REGISTRATION( UnicodeTestCase, "UnicodeTestCase" );
230
231 UnicodeTestCase::UnicodeTestCase()
232 {
233 }
234
235 void UnicodeTestCase::ToFromAscii()
236 {
237
238 #define TEST_TO_FROM_ASCII(txt) \
239 { \
240 static const char *msg = txt; \
241 wxString s = wxString::FromAscii(msg); \
242 CPPUNIT_ASSERT( strcmp( s.ToAscii() , msg ) == 0 ); \
243 }
244
245 TEST_TO_FROM_ASCII( "Hello, world!" );
246 TEST_TO_FROM_ASCII( "additional \" special \t test \\ component \n :-)" );
247 }
248
249 void UnicodeTestCase::ConstructorsWithConversion()
250 {
251 // the string "Déjà" in UTF-8 and wchar_t:
252 const unsigned char utf8Buf[] = {0x44,0xC3,0xA9,0x6A,0xC3,0xA0,0};
253 const unsigned char utf8subBuf[] = {0x44,0xC3,0xA9,0x6A,0}; // just "Déj"
254 const char *utf8 = (char *)utf8Buf;
255 const char *utf8sub = (char *)utf8subBuf;
256
257 wxString s1(utf8, wxConvUTF8);
258
259 #if wxUSE_UNICODE
260 const wchar_t wchar[] = {0x44,0xE9,0x6A,0xE0,0};
261 CPPUNIT_ASSERT_EQUAL( wchar, s1 );
262
263 wxString s2(wchar);
264 CPPUNIT_ASSERT_EQUAL( wchar, s2 );
265 CPPUNIT_ASSERT_EQUAL( wxString::FromUTF8(utf8), s2 );
266 #else
267 CPPUNIT_ASSERT_EQUAL( utf8, s1 );
268 #endif
269
270 wxString sub(utf8sub, wxConvUTF8); // "Dej" substring
271 wxString s3(utf8, wxConvUTF8, 4);
272 CPPUNIT_ASSERT_EQUAL( sub, s3 );
273
274 #if wxUSE_UNICODE
275 wxString s4(wchar, wxConvUTF8, 3);
276 CPPUNIT_ASSERT_EQUAL( sub, s4 );
277
278 // conversion should stop with failure at pos 35
279 wxString s("\t[pl]open.format.Sformatuj dyskietk\xea=gfloppy %f", wxConvUTF8);
280 CPPUNIT_ASSERT( s.empty() );
281 #endif // wxUSE_UNICODE
282
283
284 // test using Unicode strings together with char* strings (this must work
285 // in ANSI mode as well, of course):
286 wxString s5("ascii");
287 CPPUNIT_ASSERT_EQUAL( "ascii", s5 );
288
289 s5 += " value";
290
291 CPPUNIT_ASSERT( strcmp(s5.mb_str(), "ascii value") == 0 );
292 CPPUNIT_ASSERT_EQUAL( "ascii value", s5 );
293 CPPUNIT_ASSERT( s5 != "SomethingElse" );
294 }
295
296 void UnicodeTestCase::ConversionFixed()
297 {
298 size_t len;
299
300 #if wxUSE_UNICODE
301 wxConvLibc.cWC2MB(L"", 0, &len);
302 #else // !wxUSE_UNICODE
303 wxConvLibc.cMB2WC("", 0, &len);
304 #endif // wxUSE_UNICODE/!wxUSE_UNICODE
305
306 CPPUNIT_ASSERT_EQUAL( 0, len );
307
308 #if wxUSE_UNICODE
309 // check that when we convert a fixed number of characters we obtain the
310 // expected return value
311 CPPUNIT_ASSERT_EQUAL( 0, wxConvLibc.ToWChar(NULL, 0, "", 0) );
312 CPPUNIT_ASSERT_EQUAL( 1, wxConvLibc.ToWChar(NULL, 0, "x", 1) );
313 CPPUNIT_ASSERT_EQUAL( 2, wxConvLibc.ToWChar(NULL, 0, "x", 2) );
314 CPPUNIT_ASSERT_EQUAL( 2, wxConvLibc.ToWChar(NULL, 0, "xy", 2) );
315 #endif // wxUSE_UNICODE
316 }
317
318 void UnicodeTestCase::ConversionWithNULs()
319 {
320 #if wxUSE_UNICODE
321 static const size_t lenNulString = 10;
322
323 wxString szTheString(L"The\0String", wxConvLibc, lenNulString);
324 wxCharBuffer theBuffer = szTheString.mb_str();
325
326 CPPUNIT_ASSERT( memcmp(theBuffer.data(), "The\0String",
327 lenNulString + 1) == 0 );
328
329 wxString szTheString2("The\0String", wxConvLocal, lenNulString);
330 CPPUNIT_ASSERT_EQUAL( lenNulString, szTheString2.length() );
331 CPPUNIT_ASSERT( wxTmemcmp(szTheString2.c_str(), L"The\0String",
332 lenNulString + 1) == 0 );
333 #else // !wxUSE_UNICODE
334 wxString szTheString("TheString");
335 szTheString.insert(3, 1, '\0');
336 wxWCharBuffer theBuffer = szTheString.wc_str(wxConvLibc);
337
338 CPPUNIT_ASSERT( memcmp(theBuffer.data(), L"The\0String", 11 * sizeof(wchar_t)) == 0 );
339
340 wxString szLocalTheString("TheString");
341 szLocalTheString.insert(3, 1, '\0');
342 wxWCharBuffer theLocalBuffer = szLocalTheString.wc_str(wxConvLocal);
343
344 CPPUNIT_ASSERT( memcmp(theLocalBuffer.data(), L"The\0String", 11 * sizeof(wchar_t)) == 0 );
345 #endif // wxUSE_UNICODE/!wxUSE_UNICODE
346 }
347
348 void UnicodeTestCase::ConversionUTF7()
349 {
350 static const StringConversionData utf7data[] =
351 {
352 // normal fragments
353 StringConversionData("+AKM-", L"\xa3"),
354 StringConversionData("+AOk-t+AOk-", L"\xe9t\xe9"),
355
356 // this one is an alternative valid encoding of the same string
357 StringConversionData("+AOk-t+AOk", L"\xe9t\xe9",
358 StringConversionData::ONLY_MB2WC),
359
360 // some special cases
361 StringConversionData("+-", L"+"),
362 StringConversionData("+--", L"+-"),
363
364 // the following are invalid UTF-7 sequences
365 StringConversionData("\xa3", NULL),
366 StringConversionData("+", NULL),
367 StringConversionData("+~", NULL),
368 StringConversionData("a+", NULL),
369 };
370
371 for ( size_t n = 0; n < WXSIZEOF(utf7data); n++ )
372 {
373 const StringConversionData& d = utf7data[n];
374
375 // converting to/from UTF-7 using iconv() currently doesn't work
376 // because of several problems:
377 // - GetMBNulLen() doesn't return correct result (iconv converts L'\0'
378 // to an incomplete and anyhow nonsensical "+AA" string)
379 // - iconv refuses to convert "+-" (although it converts "+-\n" just
380 // fine, go figure)
381 //
382 // I have no idea how to fix this so just disable the test for now
383 #if 0
384 d.Test(n, wxCSConv("utf-7"));
385 #endif
386 d.Test(n, wxConvUTF7);
387 }
388 }
389
390 void UnicodeTestCase::ConversionUTF8()
391 {
392 static const StringConversionData utf8data[] =
393 {
394 #ifdef wxHAVE_U_ESCAPE
395 StringConversionData("\xc2\xa3", L"\u00a3"),
396 #endif
397 StringConversionData("\xc2", NULL),
398 };
399
400 wxCSConv conv(wxT("utf-8"));
401 for ( size_t n = 0; n < WXSIZEOF(utf8data); n++ )
402 {
403 const StringConversionData& d = utf8data[n];
404 d.Test(n, conv);
405 d.Test(n, wxConvUTF8);
406 }
407 }
408
409 void UnicodeTestCase::ConversionUTF16()
410 {
411 static const StringConversionData utf16data[] =
412 {
413 #ifdef wxHAVE_U_ESCAPE
414 StringConversionData(
415 "\x04\x1f\x04\x40\x04\x38\x04\x32\x04\x35\x04\x42\0\0",
416 L"\u041f\u0440\u0438\u0432\u0435\u0442"),
417 StringConversionData(
418 "\x01\0\0b\x01\0\0a\x01\0\0r\0\0",
419 L"\u0100b\u0100a\u0100r"),
420 #endif
421 StringConversionData("\0f\0o\0o\0\0", L"foo"),
422 };
423
424 wxCSConv conv(wxFONTENCODING_UTF16BE);
425 for ( size_t n = 0; n < WXSIZEOF(utf16data); n++ )
426 {
427 const StringConversionData& d = utf16data[n];
428 d.Test(n, conv);
429 }
430
431 // special case: this string has consecutive NULs inside it which don't
432 // terminate the string, this exposed a bug in our conversion code which
433 // got confused in this case
434 size_t len;
435 conv.cMB2WC("\x01\0\0B\0C" /* A macron BC */, 6, &len);
436 CPPUNIT_ASSERT_EQUAL( 3, len );
437 }
438
439 void UnicodeTestCase::ConversionUTF32()
440 {
441 static const StringConversionData utf32data[] =
442 {
443 #ifdef wxHAVE_U_ESCAPE
444 StringConversionData(
445 "\0\0\x04\x1f\0\0\x04\x40\0\0\x04\x38\0\0\x04\x32\0\0\x04\x35\0\0\x04\x42\0\0\0\0",
446 L"\u041f\u0440\u0438\u0432\u0435\u0442"),
447 #endif
448 StringConversionData("\0\0\0f\0\0\0o\0\0\0o\0\0\0\0", L"foo"),
449 };
450
451 wxCSConv conv(wxFONTENCODING_UTF32BE);
452 for ( size_t n = 0; n < WXSIZEOF(utf32data); n++ )
453 {
454 const StringConversionData& d = utf32data[n];
455 d.Test(n, conv);
456 }
457
458 size_t len;
459 conv.cMB2WC("\0\0\x01\0\0\0\0B\0\0\0C" /* A macron BC */, 12, &len);
460 CPPUNIT_ASSERT_EQUAL( 3, len );
461 }
462
463 void UnicodeTestCase::IsConvOk()
464 {
465 CPPUNIT_ASSERT( wxCSConv(wxFONTENCODING_SYSTEM).IsOk() );
466 CPPUNIT_ASSERT( wxCSConv("US-ASCII").IsOk() );
467 CPPUNIT_ASSERT( wxCSConv("UTF-8").IsOk() );
468 CPPUNIT_ASSERT( !wxCSConv("NoSuchConversion").IsOk() );
469
470 #ifdef __WINDOWS__
471 CPPUNIT_ASSERT( wxCSConv("WINDOWS-437").IsOk() );
472 #endif
473 }
474
#if wxUSE_UNICODE

// Check reading and writing the characters of a wxString via its iterators.
void UnicodeTestCase::Iteration()
{
    // the same 7 character word in UTF-8 and as wide characters, its 1st and
    // 3rd characters (U+010D and U+0161) take two bytes each in UTF-8
    static const char *textUTF8 = "\304\215e\305\241tina";
    static const wchar_t textUTF16[] = {0x10D, 0x65, 0x161, 0x74, 0x69, 0x6E, 0x61, 0};

    wxString text(wxString::FromUTF8(textUTF8));
    CPPUNIT_ASSERT( wxStrcmp(text.wc_str(), textUTF16) == 0 );

    // step 1: reading via const iterators must give the expected characters
    {
        size_t idx = 0;
        wxString::const_iterator i = text.begin();
        while ( i != text.end() )
        {
            CPPUNIT_ASSERT( *i == textUTF16[idx] );

            ++i;
            ++idx;
        }
    }

    // step 2: overwrite the string with something that is shorter in UTF-8
    {
        wxString::iterator i = text.begin();
        while ( i != text.end() )
        {
            *i = 'x';
            ++i;
        }
    }

    // step 3: restore the original text, the iterators obtained before the
    // modifications must still compare equal to end() during and after them
    {
        wxString::iterator end1 = text.end();
        wxString::const_iterator end2 = text.end();

        size_t idx = 0;
        wxString::iterator i = text.begin();
        while ( i != text.end() )
        {
            *i = textUTF16[idx];

            CPPUNIT_ASSERT( end1 == text.end() );
            CPPUNIT_ASSERT( end2 == text.end() );

            ++i;
            ++idx;
        }

        CPPUNIT_ASSERT( end1 == text.end() );
        CPPUNIT_ASSERT( end2 == text.end() );
    }

    // step 4: the string must have its original contents again
    {
        size_t idx = 0;
        wxString::const_iterator i = text.begin();
        while ( i != text.end() )
        {
            CPPUNIT_ASSERT( *i == textUTF16[idx] );

            ++i;
            ++idx;
        }
    }
}

#endif // wxUSE_UNICODE
528
529 void UnicodeTestCase::Utf8()
530 {
531 // test code extracted from console sample r64320
532
533 char buf[1024];
534 wchar_t wbuf[1024];
535
536 for ( size_t n = 0; n < WXSIZEOF(utf8data); n++ )
537 {
538 const Utf8Data& u8d = utf8data[n];
539 CPPUNIT_ASSERT( wxConvUTF8.MB2WC(wbuf, (const char *)u8d.text, WXSIZEOF(wbuf)) != (size_t)-1 );
540
541 #if 0 // FIXME: this conversion seem not to work...
542 wxCSConv conv(u8d.charset);
543 CPPUNIT_ASSERT( conv.WC2MB(buf, wbuf, WXSIZEOF(buf)) != (size_t)-1 );
544 #endif
545 wxString s(wxConvUTF8.cMB2WC((const char *)u8d.text));
546 CPPUNIT_ASSERT( !s.empty() );
547 }
548 }
549
550 void UnicodeTestCase::EncodingConverter()
551 {
552 // test code extracted from console sample r64320
553
554 #if 0
555 char buf[1024];
556 wchar_t wbuf[1024];
557
558 CPPUNIT_ASSERT( wxConvUTF8.MB2WC(wbuf, (const char *)utf8koi8r, WXSIZEOF(utf8koi8r)) != (size_t)-1 );
559
560 wxString s1(wxConvUTF8.cMB2WC((const char *)utf8koi8r));
561 CPPUNIT_ASSERT( !s1.empty() );
562
563 wxEncodingConverter ec;
564 ec.Init(wxFONTENCODING_UNICODE, wxFONTENCODING_KOI8);
565 ec.Convert(wbuf, buf);
566 wxString s2(buf);
567
568 CPPUNIT_ASSERT_EQUAL( s1, s2 );
569 #endif
570 }