]> git.saurik.com Git - wxWidgets.git/blob - tests/strings/unicode.cpp
added some Replace() benchmarks (#9802)
[wxWidgets.git] / tests / strings / unicode.cpp
1 ///////////////////////////////////////////////////////////////////////////////
2 // Name: tests/strings/unicode.cpp
3 // Purpose: Unicode unit test
4 // Author: Vadim Zeitlin, Wlodzimierz ABX Skiba
5 // Created: 2004-04-28
6 // RCS-ID: $Id$
7 // Copyright: (c) 2004 Vadim Zeitlin, Wlodzimierz Skiba
8 ///////////////////////////////////////////////////////////////////////////////
9
10 // ----------------------------------------------------------------------------
11 // headers
12 // ----------------------------------------------------------------------------
13
14 #include "testprec.h"
15
16 #ifdef __BORLANDC__
17 #pragma hdrstop
18 #endif
19
20 #ifndef WX_PRECOMP
21 #include "wx/wx.h"
22 #endif // WX_PRECOMP
23
24 // helper class holding the matching MB and WC strings
25 //
26 // either str or wcs (but not both) may be NULL, this means that the conversion
27 // to it should fail
28 struct StringConversionData
29 {
30 const char *str;
31 const wchar_t *wcs;
32
33 enum
34 {
35 TEST_BOTH = 0, // test both str -> wcs and wcs -> str
36 ONLY_MB2WC = 1 // only test str -> wcs conversion
37 };
38
39 int flags;
40
41 // test that the conversion between str and wcs (subject to flags) succeeds
42 //
43 // the first argument is the index in the test array and is used solely for
44 // diagnostics
45 void Test(size_t n, wxMBConv& conv) const
46 {
47 if ( str )
48 {
49 wxWCharBuffer wbuf = conv.cMB2WC(str);
50
51 if ( wcs )
52 {
53 CPPUNIT_ASSERT_MESSAGE
54 (
55 Message(n, "MB2WC failed"),
56 wbuf.data()
57 );
58
59 CPPUNIT_ASSERT_MESSAGE
60 (
61 Message(n, "MB2WC", wbuf, wcs),
62 wxStrcmp(wbuf, wcs) == 0
63 );
64 }
65 else // conversion is supposed to fail
66 {
67 CPPUNIT_ASSERT_MESSAGE
68 (
69 Message(n, "MB2WC succeeded"),
70 !wbuf.data()
71 );
72 }
73 }
74
75 if ( wcs && !(flags & ONLY_MB2WC) )
76 {
77 wxCharBuffer buf = conv.cWC2MB(wcs);
78
79 if ( str )
80 {
81 CPPUNIT_ASSERT_MESSAGE
82 (
83 Message(n, "WC2MB failed"),
84 buf.data()
85 );
86
87 CPPUNIT_ASSERT_MESSAGE
88 (
89 Message(n, "WC2MB", buf, str),
90 strcmp(buf, str) == 0
91 );
92 }
93 else
94 {
95 CPPUNIT_ASSERT_MESSAGE
96 (
97 Message(n, "WC2MB succeeded"),
98 !buf.data()
99 );
100 }
101 }
102 }
103
104 private:
105 static std::string
106 Message(size_t n, const wxString& msg)
107 {
108 return std::string(wxString::Format("#%lu: %s", (unsigned long)n, msg));
109 }
110
111 template <typename T>
112 static std::string
113 Message(size_t n,
114 const char *func,
115 const wxCharTypeBuffer<T>& actual,
116 const T *expected)
117 {
118 return Message(n,
119 wxString::Format("%s returned \"%s\", expected \"%s\"",
120 func, actual.data(), expected));
121 }
122 };
123
124 // ----------------------------------------------------------------------------
125 // test class
126 // ----------------------------------------------------------------------------
127
128 class UnicodeTestCase : public CppUnit::TestCase
129 {
130 public:
131 UnicodeTestCase();
132
133 private:
134 CPPUNIT_TEST_SUITE( UnicodeTestCase );
135 CPPUNIT_TEST( ToFromAscii );
136 CPPUNIT_TEST( ConstructorsWithConversion );
137 CPPUNIT_TEST( ConversionEmpty );
138 CPPUNIT_TEST( ConversionWithNULs );
139 CPPUNIT_TEST( ConversionUTF7 );
140 CPPUNIT_TEST( ConversionUTF8 );
141 CPPUNIT_TEST( ConversionUTF16 );
142 CPPUNIT_TEST( ConversionUTF32 );
143 CPPUNIT_TEST( IsConvOk );
144 #if wxUSE_UNICODE
145 CPPUNIT_TEST( Iteration );
146 #endif
147 CPPUNIT_TEST_SUITE_END();
148
149 void ToFromAscii();
150 void ConstructorsWithConversion();
151 void ConversionEmpty();
152 void ConversionWithNULs();
153 void ConversionUTF7();
154 void ConversionUTF8();
155 void ConversionUTF16();
156 void ConversionUTF32();
157 void IsConvOk();
158 #if wxUSE_UNICODE
159 void Iteration();
160 #endif
161
162 DECLARE_NO_COPY_CLASS(UnicodeTestCase)
163 };
164
165 // register in the unnamed registry so that these tests are run by default
166 CPPUNIT_TEST_SUITE_REGISTRATION( UnicodeTestCase );
167
168 // also include in its own registry so that these tests can be run alone
169 CPPUNIT_TEST_SUITE_NAMED_REGISTRATION( UnicodeTestCase, "UnicodeTestCase" );
170
171 UnicodeTestCase::UnicodeTestCase()
172 {
173 }
174
175 void UnicodeTestCase::ToFromAscii()
176 {
177
178 #define TEST_TO_FROM_ASCII(txt) \
179 { \
180 static const char *msg = txt; \
181 wxString s = wxString::FromAscii(msg); \
182 CPPUNIT_ASSERT( strcmp( s.ToAscii() , msg ) == 0 ); \
183 }
184
185 TEST_TO_FROM_ASCII( "Hello, world!" );
186 TEST_TO_FROM_ASCII( "additional \" special \t test \\ component \n :-)" );
187 }
188
189 void UnicodeTestCase::ConstructorsWithConversion()
190 {
191 // the string "Déjà" in UTF-8 and wchar_t:
192 const unsigned char utf8Buf[] = {0x44,0xC3,0xA9,0x6A,0xC3,0xA0,0};
193 const unsigned char utf8subBuf[] = {0x44,0xC3,0xA9,0x6A,0}; // just "Déj"
194 const char *utf8 = (char *)utf8Buf;
195 const char *utf8sub = (char *)utf8subBuf;
196
197 wxString s1(utf8, wxConvUTF8);
198
199 #if wxUSE_UNICODE
200 const wchar_t wchar[] = {0x44,0xE9,0x6A,0xE0,0};
201 CPPUNIT_ASSERT_EQUAL( wchar, s1 );
202
203 wxString s2(wchar);
204 CPPUNIT_ASSERT_EQUAL( wchar, s2 );
205 CPPUNIT_ASSERT_EQUAL( utf8, s2 );
206 #else
207 CPPUNIT_ASSERT_EQUAL( utf8, s1 );
208 #endif
209
210 wxString sub(utf8sub, wxConvUTF8); // "Dej" substring
211 wxString s3(utf8, wxConvUTF8, 4);
212 CPPUNIT_ASSERT_EQUAL( sub, s3 );
213
214 #if wxUSE_UNICODE
215 wxString s4(wchar, wxConvUTF8, 3);
216 CPPUNIT_ASSERT_EQUAL( sub, s4 );
217
218 // conversion should stop with failure at pos 35
219 wxString s("\t[pl]open.format.Sformatuj dyskietkê=gfloppy %f", wxConvUTF8);
220 CPPUNIT_ASSERT( s.empty() );
221 #endif // wxUSE_UNICODE
222
223
224 // test using Unicode strings together with char* strings (this must work
225 // in ANSI mode as well, of course):
226 wxString s5("ascii");
227 CPPUNIT_ASSERT_EQUAL( "ascii", s5 );
228
229 s5 += " value";
230
231 CPPUNIT_ASSERT( strcmp(s5.mb_str(), "ascii value") == 0 );
232 CPPUNIT_ASSERT_EQUAL( "ascii value", s5 );
233 CPPUNIT_ASSERT( s5 != "SomethingElse" );
234 }
235
236 void UnicodeTestCase::ConversionEmpty()
237 {
238 size_t len;
239
240 #if wxUSE_UNICODE
241 wxCharBuffer buf = wxConvLibc.cWC2MB(L"", 0, &len);
242 #else // !wxUSE_UNICODE
243 wxWCharBuffer wbuf = wxConvLibc.cMB2WC("", 0, &len);
244 #endif // wxUSE_UNICODE/!wxUSE_UNICODE
245
246 CPPUNIT_ASSERT(len == 0);
247 }
248
249 void UnicodeTestCase::ConversionWithNULs()
250 {
251 #if wxUSE_UNICODE
252 static const size_t lenNulString = 10;
253
254 wxString szTheString(L"The\0String", wxConvLibc, lenNulString);
255 wxCharBuffer theBuffer = szTheString.mb_str();
256
257 CPPUNIT_ASSERT( memcmp(theBuffer.data(), "The\0String",
258 lenNulString + 1) == 0 );
259
260 wxString szTheString2("The\0String", wxConvLocal, lenNulString);
261 CPPUNIT_ASSERT_EQUAL( lenNulString, szTheString2.length() );
262 CPPUNIT_ASSERT( wxTmemcmp(szTheString2.c_str(), L"The\0String",
263 lenNulString + 1) == 0 );
264 #else // !wxUSE_UNICODE
265 wxString szTheString("TheString");
266 szTheString.insert(3, 1, '\0');
267 wxWCharBuffer theBuffer = szTheString.wc_str(wxConvLibc);
268
269 CPPUNIT_ASSERT( memcmp(theBuffer.data(), L"The\0String", 11 * sizeof(wchar_t)) == 0 );
270
271 wxString szLocalTheString("TheString");
272 szLocalTheString.insert(3, 1, '\0');
273 wxWCharBuffer theLocalBuffer = szLocalTheString.wc_str(wxConvLocal);
274
275 CPPUNIT_ASSERT( memcmp(theLocalBuffer.data(), L"The\0String", 11 * sizeof(wchar_t)) == 0 );
276 #endif // wxUSE_UNICODE/!wxUSE_UNICODE
277 }
278
279 void UnicodeTestCase::ConversionUTF7()
280 {
281 static const StringConversionData utf7data[] =
282 {
283 // normal fragments
284 { "+AKM-", L"\xa3" },
285 { "+AOk-t+AOk-", L"\xe9t\xe9" },
286
287 // this one is an alternative valid encoding of the same string
288 { "+AOk-t+AOk", L"\xe9t\xe9", StringConversionData::ONLY_MB2WC },
289
290 // some special cases
291 { "+-", L"+" },
292 { "+--", L"+-" },
293
294 // the following are invalid UTF-7 sequences
295 { "\xa3", NULL },
296 { "+", NULL },
297 { "+~", NULL },
298 { "a+", NULL },
299 };
300
301 for ( size_t n = 0; n < WXSIZEOF(utf7data); n++ )
302 {
303 const StringConversionData& d = utf7data[n];
304
305 // converting to/from UTF-7 using iconv() currently doesn't work
306 // because of several problems:
307 // - GetMBNulLen() doesn't return correct result (iconv converts L'\0'
308 // to an incomplete and anyhow nonsensical "+AA" string)
309 // - iconv refuses to convert "+-" (although it converts "+-\n" just
310 // fine, go figure)
311 //
312 // I have no idea how to fix this so just disable the test for now
313 #if 0
314 d.Test(n, wxCSConv("utf-7"));
315 #endif
316 d.Test(n, wxConvUTF7);
317 }
318 }
319
320 void UnicodeTestCase::ConversionUTF8()
321 {
322 static const StringConversionData utf8data[] =
323 {
324 #ifdef wxHAVE_U_ESCAPE
325 { "\xc2\xa3", L"\u00a3" },
326 #endif
327 { "\xc2", NULL },
328 };
329
330 wxCSConv conv(_T("utf-8"));
331 for ( size_t n = 0; n < WXSIZEOF(utf8data); n++ )
332 {
333 const StringConversionData& d = utf8data[n];
334 d.Test(n, conv);
335 d.Test(n, wxConvUTF8);
336 }
337 }
338
339 void UnicodeTestCase::ConversionUTF16()
340 {
341 static const StringConversionData utf16data[] =
342 {
343 #ifdef wxHAVE_U_ESCAPE
344 { "\x04\x1f\x04\x40\x04\x38\x04\x32\x04\x35\x04\x42\0\0",
345 L"\u041f\u0440\u0438\u0432\u0435\u0442" },
346 { "\x01\0\0b\x01\0\0a\x01\0\0r\0\0", L"\u0100b\u0100a\u0100r" },
347 #endif
348 { "\0f\0o\0o\0\0", L"foo" },
349 };
350
351 wxCSConv conv(wxFONTENCODING_UTF16BE);
352 for ( size_t n = 0; n < WXSIZEOF(utf16data); n++ )
353 {
354 const StringConversionData& d = utf16data[n];
355 d.Test(n, conv);
356 }
357
358 // special case: this string has consecutive NULs inside it which don't
359 // terminate the string, this exposed a bug in our conversion code which
360 // got confused in this case
361 size_t len;
362 wxWCharBuffer wbuf(conv.cMB2WC("\x01\0\0B\0C" /* A macron BC */, 6, &len));
363 CPPUNIT_ASSERT_EQUAL( (size_t)3, len );
364 }
365
366 void UnicodeTestCase::ConversionUTF32()
367 {
368 static const StringConversionData utf32data[] =
369 {
370 #ifdef wxHAVE_U_ESCAPE
371 {
372 "\0\0\x04\x1f\0\0\x04\x40\0\0\x04\x38\0\0\x04\x32\0\0\x04\x35\0\0\x04\x42\0\0\0\0",
373 L"\u041f\u0440\u0438\u0432\u0435\u0442" },
374 #endif
375 { "\0\0\0f\0\0\0o\0\0\0o\0\0\0\0", L"foo" },
376 };
377
378 wxCSConv conv(wxFONTENCODING_UTF32BE);
379 for ( size_t n = 0; n < WXSIZEOF(utf32data); n++ )
380 {
381 const StringConversionData& d = utf32data[n];
382 d.Test(n, conv);
383 }
384
385 size_t len;
386 wxWCharBuffer wbuf(conv.cMB2WC("\0\0\x01\0\0\0\0B\0\0\0C" /* A macron BC */,
387 12, &len));
388 CPPUNIT_ASSERT_EQUAL( (size_t)3, len );
389 }
390
391 void UnicodeTestCase::IsConvOk()
392 {
393 CPPUNIT_ASSERT( wxCSConv(wxFONTENCODING_SYSTEM).IsOk() );
394 CPPUNIT_ASSERT( wxCSConv("US-ASCII").IsOk() );
395 CPPUNIT_ASSERT( wxCSConv("UTF-8").IsOk() );
396 CPPUNIT_ASSERT( !wxCSConv("NoSuchConversion").IsOk() );
397
398 #ifdef __WINDOWS__
399 CPPUNIT_ASSERT( wxCSConv("WINDOWS-437").IsOk() );
400 #endif
401 }
402
#if wxUSE_UNICODE
// Check reading and writing the characters of a non-ASCII string through
// wxString iterators.
void UnicodeTestCase::Iteration()
{
    // "czech" in Czech ("cestina"):
    static const char *textUTF8 = "\304\215e\305\241tina";
    static const wchar_t textUTF16[] = {0x10D, 0x65, 0x161, 0x74, 0x69, 0x6E, 0x61, 0};

    wxString text(wxString::FromUTF8(textUTF8));
    CPPUNIT_ASSERT( wxStrcmp(text.wc_str(), textUTF16) == 0 );

    // first pass: each character read through a const iterator must match
    // the expected wide string
    {
        size_t idx = 0;
        wxString::const_iterator i = text.begin();
        while ( i != text.end() )
        {
            CPPUNIT_ASSERT( *i == textUTF16[idx] );

            ++i;
            ++idx;
        }
    }

    // replace every character with something that is shorter in UTF-8
    {
        wxString::iterator i = text.begin();
        while ( i != text.end() )
        {
            *i = 'x';
            ++i;
        }
    }

    // write the original characters back: the end iterators obtained before
    // the writes must keep comparing equal to text.end() all along
    {
        wxString::iterator end1 = text.end();
        wxString::const_iterator end2 = text.end();

        size_t idx = 0;
        wxString::iterator i = text.begin();
        while ( i != text.end() )
        {
            *i = textUTF16[idx];

            CPPUNIT_ASSERT( end1 == text.end() );
            CPPUNIT_ASSERT( end2 == text.end() );

            ++i;
            ++idx;
        }

        CPPUNIT_ASSERT( end1 == text.end() );
        CPPUNIT_ASSERT( end2 == text.end() );
    }

    // final pass: the string must read back as the original text again
    {
        size_t idx = 0;
        wxString::const_iterator i = text.begin();
        while ( i != text.end() )
        {
            CPPUNIT_ASSERT( *i == textUTF16[idx] );

            ++i;
            ++idx;
        }
    }
}
#endif // wxUSE_UNICODE