]>
Commit | Line | Data |
---|---|---|
1 | /////////////////////////////////////////////////////////////////////////////// | |
2 | // Name: tests/strings/unicode.cpp | |
3 | // Purpose: Unicode unit test | |
4 | // Author: Vadim Zeitlin, Wlodzimierz ABX Skiba | |
5 | // Created: 2004-04-28 | |
6 | // RCS-ID: $Id$ | |
7 | // Copyright: (c) 2004 Vadim Zeitlin, Wlodzimierz Skiba | |
8 | /////////////////////////////////////////////////////////////////////////////// | |
9 | ||
10 | // ---------------------------------------------------------------------------- | |
11 | // headers | |
12 | // ---------------------------------------------------------------------------- | |
13 | ||
14 | #include "testprec.h" | |
15 | ||
16 | #ifdef __BORLANDC__ | |
17 | #pragma hdrstop | |
18 | #endif | |
19 | ||
20 | #ifndef WX_PRECOMP | |
21 | #include "wx/wx.h" | |
22 | #endif // WX_PRECOMP | |
23 | ||
24 | // helper class holding the matching MB and WC strings | |
25 | // | |
26 | // either str or wcs (but not both) may be NULL, this means that the conversion | |
27 | // to it should fail | |
28 | struct StringConversionData | |
29 | { | |
30 | StringConversionData(const char *str_, const wchar_t *wcs_, int flags_ = 0) | |
31 | : str(str_), wcs(wcs_), flags(flags_) | |
32 | { | |
33 | } | |
34 | ||
35 | const char * const str; | |
36 | const wchar_t * const wcs; | |
37 | ||
38 | enum | |
39 | { | |
40 | TEST_BOTH = 0, // test both str -> wcs and wcs -> str | |
41 | ONLY_MB2WC = 1 // only test str -> wcs conversion | |
42 | }; | |
43 | ||
44 | const int flags; | |
45 | ||
46 | // test that the conversion between str and wcs (subject to flags) succeeds | |
47 | // | |
48 | // the first argument is the index in the test array and is used solely for | |
49 | // diagnostics | |
50 | void Test(size_t n, wxMBConv& conv) const | |
51 | { | |
52 | if ( str ) | |
53 | { | |
54 | wxWCharBuffer wbuf = conv.cMB2WC(str); | |
55 | ||
56 | if ( wcs ) | |
57 | { | |
58 | CPPUNIT_ASSERT_MESSAGE | |
59 | ( | |
60 | Message(n, "MB2WC failed"), | |
61 | wbuf.data() | |
62 | ); | |
63 | ||
64 | CPPUNIT_ASSERT_MESSAGE | |
65 | ( | |
66 | Message(n, "MB2WC", wbuf, wcs), | |
67 | wxStrcmp(wbuf, wcs) == 0 | |
68 | ); | |
69 | } | |
70 | else // conversion is supposed to fail | |
71 | { | |
72 | CPPUNIT_ASSERT_MESSAGE | |
73 | ( | |
74 | Message(n, "MB2WC succeeded"), | |
75 | !wbuf.data() | |
76 | ); | |
77 | } | |
78 | } | |
79 | ||
80 | if ( wcs && !(flags & ONLY_MB2WC) ) | |
81 | { | |
82 | wxCharBuffer buf = conv.cWC2MB(wcs); | |
83 | ||
84 | if ( str ) | |
85 | { | |
86 | CPPUNIT_ASSERT_MESSAGE | |
87 | ( | |
88 | Message(n, "WC2MB failed"), | |
89 | buf.data() | |
90 | ); | |
91 | ||
92 | CPPUNIT_ASSERT_MESSAGE | |
93 | ( | |
94 | Message(n, "WC2MB", buf, str), | |
95 | strcmp(buf, str) == 0 | |
96 | ); | |
97 | } | |
98 | else | |
99 | { | |
100 | CPPUNIT_ASSERT_MESSAGE | |
101 | ( | |
102 | Message(n, "WC2MB succeeded"), | |
103 | !buf.data() | |
104 | ); | |
105 | } | |
106 | } | |
107 | } | |
108 | ||
109 | private: | |
110 | static std::string | |
111 | Message(size_t n, const wxString& msg) | |
112 | { | |
113 | return std::string(wxString::Format("#%lu: %s", (unsigned long)n, msg)); | |
114 | } | |
115 | ||
116 | template <typename T> | |
117 | static std::string | |
118 | Message(size_t n, | |
119 | const char *func, | |
120 | const wxCharTypeBuffer<T>& actual, | |
121 | const T *expected) | |
122 | { | |
123 | return Message(n, | |
124 | wxString::Format("%s returned \"%s\", expected \"%s\"", | |
125 | func, actual.data(), expected)); | |
126 | } | |
127 | }; | |
128 | ||
129 | // ---------------------------------------------------------------------------- | |
130 | // test class | |
131 | // ---------------------------------------------------------------------------- | |
132 | ||
133 | class UnicodeTestCase : public CppUnit::TestCase | |
134 | { | |
135 | public: | |
136 | UnicodeTestCase(); | |
137 | ||
138 | private: | |
139 | CPPUNIT_TEST_SUITE( UnicodeTestCase ); | |
140 | CPPUNIT_TEST( ToFromAscii ); | |
141 | CPPUNIT_TEST( ConstructorsWithConversion ); | |
142 | CPPUNIT_TEST( ConversionFixed ); | |
143 | CPPUNIT_TEST( ConversionWithNULs ); | |
144 | CPPUNIT_TEST( ConversionUTF7 ); | |
145 | CPPUNIT_TEST( ConversionUTF8 ); | |
146 | CPPUNIT_TEST( ConversionUTF16 ); | |
147 | CPPUNIT_TEST( ConversionUTF32 ); | |
148 | CPPUNIT_TEST( IsConvOk ); | |
149 | #if wxUSE_UNICODE | |
150 | CPPUNIT_TEST( Iteration ); | |
151 | #endif | |
152 | CPPUNIT_TEST_SUITE_END(); | |
153 | ||
154 | void ToFromAscii(); | |
155 | void ConstructorsWithConversion(); | |
156 | void ConversionFixed(); | |
157 | void ConversionWithNULs(); | |
158 | void ConversionUTF7(); | |
159 | void ConversionUTF8(); | |
160 | void ConversionUTF16(); | |
161 | void ConversionUTF32(); | |
162 | void IsConvOk(); | |
163 | #if wxUSE_UNICODE | |
164 | void Iteration(); | |
165 | #endif | |
166 | ||
167 | DECLARE_NO_COPY_CLASS(UnicodeTestCase) | |
168 | }; | |
169 | ||
// register in the unnamed registry so that these tests are run by default
CPPUNIT_TEST_SUITE_REGISTRATION( UnicodeTestCase );

// also include in its own registry so that these tests can be run alone
CPPUNIT_TEST_SUITE_NAMED_REGISTRATION( UnicodeTestCase, "UnicodeTestCase" );
175 | ||
// Default constructor: the test case class declares no data members of its
// own, so there is nothing to initialize here.
UnicodeTestCase::UnicodeTestCase()
{
}
179 | ||
180 | void UnicodeTestCase::ToFromAscii() | |
181 | { | |
182 | ||
183 | #define TEST_TO_FROM_ASCII(txt) \ | |
184 | { \ | |
185 | static const char *msg = txt; \ | |
186 | wxString s = wxString::FromAscii(msg); \ | |
187 | CPPUNIT_ASSERT( strcmp( s.ToAscii() , msg ) == 0 ); \ | |
188 | } | |
189 | ||
190 | TEST_TO_FROM_ASCII( "Hello, world!" ); | |
191 | TEST_TO_FROM_ASCII( "additional \" special \t test \\ component \n :-)" ); | |
192 | } | |
193 | ||
194 | void UnicodeTestCase::ConstructorsWithConversion() | |
195 | { | |
196 | // the string "Déjà" in UTF-8 and wchar_t: | |
197 | const unsigned char utf8Buf[] = {0x44,0xC3,0xA9,0x6A,0xC3,0xA0,0}; | |
198 | const unsigned char utf8subBuf[] = {0x44,0xC3,0xA9,0x6A,0}; // just "Déj" | |
199 | const char *utf8 = (char *)utf8Buf; | |
200 | const char *utf8sub = (char *)utf8subBuf; | |
201 | ||
202 | wxString s1(utf8, wxConvUTF8); | |
203 | ||
204 | #if wxUSE_UNICODE | |
205 | const wchar_t wchar[] = {0x44,0xE9,0x6A,0xE0,0}; | |
206 | CPPUNIT_ASSERT_EQUAL( wchar, s1 ); | |
207 | ||
208 | wxString s2(wchar); | |
209 | CPPUNIT_ASSERT_EQUAL( wchar, s2 ); | |
210 | CPPUNIT_ASSERT_EQUAL( wxString::FromUTF8(utf8), s2 ); | |
211 | #else | |
212 | CPPUNIT_ASSERT_EQUAL( utf8, s1 ); | |
213 | #endif | |
214 | ||
215 | wxString sub(utf8sub, wxConvUTF8); // "Dej" substring | |
216 | wxString s3(utf8, wxConvUTF8, 4); | |
217 | CPPUNIT_ASSERT_EQUAL( sub, s3 ); | |
218 | ||
219 | #if wxUSE_UNICODE | |
220 | wxString s4(wchar, wxConvUTF8, 3); | |
221 | CPPUNIT_ASSERT_EQUAL( sub, s4 ); | |
222 | ||
223 | // conversion should stop with failure at pos 35 | |
224 | wxString s("\t[pl]open.format.Sformatuj dyskietk\xea=gfloppy %f", wxConvUTF8); | |
225 | CPPUNIT_ASSERT( s.empty() ); | |
226 | #endif // wxUSE_UNICODE | |
227 | ||
228 | ||
229 | // test using Unicode strings together with char* strings (this must work | |
230 | // in ANSI mode as well, of course): | |
231 | wxString s5("ascii"); | |
232 | CPPUNIT_ASSERT_EQUAL( "ascii", s5 ); | |
233 | ||
234 | s5 += " value"; | |
235 | ||
236 | CPPUNIT_ASSERT( strcmp(s5.mb_str(), "ascii value") == 0 ); | |
237 | CPPUNIT_ASSERT_EQUAL( "ascii value", s5 ); | |
238 | CPPUNIT_ASSERT( s5 != "SomethingElse" ); | |
239 | } | |
240 | ||
241 | void UnicodeTestCase::ConversionFixed() | |
242 | { | |
243 | size_t len; | |
244 | ||
245 | #if wxUSE_UNICODE | |
246 | wxConvLibc.cWC2MB(L"", 0, &len); | |
247 | #else // !wxUSE_UNICODE | |
248 | wxConvLibc.cMB2WC("", 0, &len); | |
249 | #endif // wxUSE_UNICODE/!wxUSE_UNICODE | |
250 | ||
251 | CPPUNIT_ASSERT_EQUAL( 0, len ); | |
252 | ||
253 | #if wxUSE_UNICODE | |
254 | // check that when we convert a fixed number of characters we obtain the | |
255 | // expected return value | |
256 | CPPUNIT_ASSERT_EQUAL( 0, wxConvLibc.ToWChar(NULL, 0, "", 0) ); | |
257 | CPPUNIT_ASSERT_EQUAL( 1, wxConvLibc.ToWChar(NULL, 0, "x", 1) ); | |
258 | CPPUNIT_ASSERT_EQUAL( 2, wxConvLibc.ToWChar(NULL, 0, "x", 2) ); | |
259 | CPPUNIT_ASSERT_EQUAL( 2, wxConvLibc.ToWChar(NULL, 0, "xy", 2) ); | |
260 | #endif // wxUSE_UNICODE | |
261 | } | |
262 | ||
// Check that strings containing an embedded NUL are converted in their
// entirety, i.e. that the conversion doesn't stop at the embedded NUL.
void UnicodeTestCase::ConversionWithNULs()
{
#if wxUSE_UNICODE
    // number of characters in "The\0String", counting the embedded NUL but
    // not the terminating one
    static const size_t lenNulString = 10;

    // wide -> multibyte direction, using wxConvLibc
    wxString szTheString(L"The\0String", wxConvLibc, lenNulString);
    wxCharBuffer theBuffer = szTheString.mb_str();

    // "+ 1" makes the comparison cover the terminating NUL as well
    CPPUNIT_ASSERT( memcmp(theBuffer.data(), "The\0String",
                           lenNulString + 1) == 0 );

    // multibyte -> wide direction, using wxConvLocal
    wxString szTheString2("The\0String", wxConvLocal, lenNulString);
    CPPUNIT_ASSERT_EQUAL( lenNulString, szTheString2.length() );
    CPPUNIT_ASSERT( wxTmemcmp(szTheString2.c_str(), L"The\0String",
                              lenNulString + 1) == 0 );
#else // !wxUSE_UNICODE
    // build "The\0String" by inserting the NUL into an ordinary string
    wxString szTheString("TheString");
    szTheString.insert(3, 1, '\0');
    wxWCharBuffer theBuffer = szTheString.wc_str(wxConvLibc);

    // memcmp() takes a size in bytes: 11 wide characters are the 10
    // characters of the string plus its terminating NUL
    CPPUNIT_ASSERT( memcmp(theBuffer.data(), L"The\0String", 11 * sizeof(wchar_t)) == 0 );

    // same check but using wxConvLocal for the conversion
    wxString szLocalTheString("TheString");
    szLocalTheString.insert(3, 1, '\0');
    wxWCharBuffer theLocalBuffer = szLocalTheString.wc_str(wxConvLocal);

    CPPUNIT_ASSERT( memcmp(theLocalBuffer.data(), L"The\0String", 11 * sizeof(wchar_t)) == 0 );
#endif // wxUSE_UNICODE/!wxUSE_UNICODE
}
292 | ||
293 | void UnicodeTestCase::ConversionUTF7() | |
294 | { | |
295 | static const StringConversionData utf7data[] = | |
296 | { | |
297 | // normal fragments | |
298 | StringConversionData("+AKM-", L"\xa3"), | |
299 | StringConversionData("+AOk-t+AOk-", L"\xe9t\xe9"), | |
300 | ||
301 | // this one is an alternative valid encoding of the same string | |
302 | StringConversionData("+AOk-t+AOk", L"\xe9t\xe9", | |
303 | StringConversionData::ONLY_MB2WC), | |
304 | ||
305 | // some special cases | |
306 | StringConversionData("+-", L"+"), | |
307 | StringConversionData("+--", L"+-"), | |
308 | ||
309 | // the following are invalid UTF-7 sequences | |
310 | StringConversionData("\xa3", NULL), | |
311 | StringConversionData("+", NULL), | |
312 | StringConversionData("+~", NULL), | |
313 | StringConversionData("a+", NULL), | |
314 | }; | |
315 | ||
316 | for ( size_t n = 0; n < WXSIZEOF(utf7data); n++ ) | |
317 | { | |
318 | const StringConversionData& d = utf7data[n]; | |
319 | ||
320 | // converting to/from UTF-7 using iconv() currently doesn't work | |
321 | // because of several problems: | |
322 | // - GetMBNulLen() doesn't return correct result (iconv converts L'\0' | |
323 | // to an incomplete and anyhow nonsensical "+AA" string) | |
324 | // - iconv refuses to convert "+-" (although it converts "+-\n" just | |
325 | // fine, go figure) | |
326 | // | |
327 | // I have no idea how to fix this so just disable the test for now | |
328 | #if 0 | |
329 | d.Test(n, wxCSConv("utf-7")); | |
330 | #endif | |
331 | d.Test(n, wxConvUTF7); | |
332 | } | |
333 | } | |
334 | ||
335 | void UnicodeTestCase::ConversionUTF8() | |
336 | { | |
337 | static const StringConversionData utf8data[] = | |
338 | { | |
339 | #ifdef wxHAVE_U_ESCAPE | |
340 | StringConversionData("\xc2\xa3", L"\u00a3"), | |
341 | #endif | |
342 | StringConversionData("\xc2", NULL), | |
343 | }; | |
344 | ||
345 | wxCSConv conv(wxT("utf-8")); | |
346 | for ( size_t n = 0; n < WXSIZEOF(utf8data); n++ ) | |
347 | { | |
348 | const StringConversionData& d = utf8data[n]; | |
349 | d.Test(n, conv); | |
350 | d.Test(n, wxConvUTF8); | |
351 | } | |
352 | } | |
353 | ||
354 | void UnicodeTestCase::ConversionUTF16() | |
355 | { | |
356 | static const StringConversionData utf16data[] = | |
357 | { | |
358 | #ifdef wxHAVE_U_ESCAPE | |
359 | StringConversionData( | |
360 | "\x04\x1f\x04\x40\x04\x38\x04\x32\x04\x35\x04\x42\0\0", | |
361 | L"\u041f\u0440\u0438\u0432\u0435\u0442"), | |
362 | StringConversionData( | |
363 | "\x01\0\0b\x01\0\0a\x01\0\0r\0\0", | |
364 | L"\u0100b\u0100a\u0100r"), | |
365 | #endif | |
366 | StringConversionData("\0f\0o\0o\0\0", L"foo"), | |
367 | }; | |
368 | ||
369 | wxCSConv conv(wxFONTENCODING_UTF16BE); | |
370 | for ( size_t n = 0; n < WXSIZEOF(utf16data); n++ ) | |
371 | { | |
372 | const StringConversionData& d = utf16data[n]; | |
373 | d.Test(n, conv); | |
374 | } | |
375 | ||
376 | // special case: this string has consecutive NULs inside it which don't | |
377 | // terminate the string, this exposed a bug in our conversion code which | |
378 | // got confused in this case | |
379 | size_t len; | |
380 | conv.cMB2WC("\x01\0\0B\0C" /* A macron BC */, 6, &len); | |
381 | CPPUNIT_ASSERT_EQUAL( 3, len ); | |
382 | } | |
383 | ||
384 | void UnicodeTestCase::ConversionUTF32() | |
385 | { | |
386 | static const StringConversionData utf32data[] = | |
387 | { | |
388 | #ifdef wxHAVE_U_ESCAPE | |
389 | StringConversionData( | |
390 | "\0\0\x04\x1f\0\0\x04\x40\0\0\x04\x38\0\0\x04\x32\0\0\x04\x35\0\0\x04\x42\0\0\0\0", | |
391 | L"\u041f\u0440\u0438\u0432\u0435\u0442"), | |
392 | #endif | |
393 | StringConversionData("\0\0\0f\0\0\0o\0\0\0o\0\0\0\0", L"foo"), | |
394 | }; | |
395 | ||
396 | wxCSConv conv(wxFONTENCODING_UTF32BE); | |
397 | for ( size_t n = 0; n < WXSIZEOF(utf32data); n++ ) | |
398 | { | |
399 | const StringConversionData& d = utf32data[n]; | |
400 | d.Test(n, conv); | |
401 | } | |
402 | ||
403 | size_t len; | |
404 | conv.cMB2WC("\0\0\x01\0\0\0\0B\0\0\0C" /* A macron BC */, 12, &len); | |
405 | CPPUNIT_ASSERT_EQUAL( 3, len ); | |
406 | } | |
407 | ||
// Check that wxCSConv::IsOk() returns true for the conversions expected to be
// available and false for a charset name which doesn't exist.
void UnicodeTestCase::IsConvOk()
{
    CPPUNIT_ASSERT( wxCSConv(wxFONTENCODING_SYSTEM).IsOk() );
    CPPUNIT_ASSERT( wxCSConv("US-ASCII").IsOk() );
    CPPUNIT_ASSERT( wxCSConv("UTF-8").IsOk() );

    // a made up charset name: creating a conversion for it must not succeed
    CPPUNIT_ASSERT( !wxCSConv("NoSuchConversion").IsOk() );

    // this one is only checked when building for Windows
#ifdef __WINDOWS__
    CPPUNIT_ASSERT( wxCSConv("WINDOWS-437").IsOk() );
#endif
}
419 | ||
#if wxUSE_UNICODE

// Assert that each character obtained by iterating over text is equal to the
// element of textUTF16 at the same position.
//
// The string is deliberately taken by non-const reference: like this exactly
// the same begin()/end() overloads are used as when iterating directly over
// the (non-const) string of the caller.
static void CheckIteratedChars(wxString& text, const wchar_t *textUTF16)
{
    size_t idx = 0;
    wxString::const_iterator i = text.begin();
    while ( i != text.end() )
    {
        CPPUNIT_ASSERT( *i == textUTF16[idx] );

        ++i;
        ++idx;
    }
}

// Check reading and writing the string characters using iterators, including
// that the previously obtained end iterators remain equal to end() while the
// characters are being overwritten.
void UnicodeTestCase::Iteration()
{
    // "czech" in Czech ("cestina"):
    static const char *textUTF8 = "\304\215e\305\241tina";
    static const wchar_t textUTF16[] = {0x10D, 0x65, 0x161, 0x74, 0x69, 0x6E, 0x61, 0};

    wxString text(wxString::FromUTF8(textUTF8));
    CPPUNIT_ASSERT( wxStrcmp(text.wc_str(), textUTF16) == 0 );

    // verify the string was decoded correctly:
    CheckIteratedChars(text, textUTF16);

    // overwrite the string with something that is shorter in UTF-8:
    for ( wxString::iterator i = text.begin(); i != text.end(); ++i )
        *i = 'x';

    // restore the original text now:
    {
        wxString::iterator end1 = text.end();
        wxString::const_iterator end2 = text.end();

        size_t idx = 0;
        wxString::iterator i = text.begin();
        while ( i != text.end() )
        {
            *i = textUTF16[idx];

            // the end iterators obtained before the assignment must still
            // be equal to the current end of the string
            CPPUNIT_ASSERT( end1 == text.end() );
            CPPUNIT_ASSERT( end2 == text.end() );

            ++i;
            ++idx;
        }

        CPPUNIT_ASSERT( end1 == text.end() );
        CPPUNIT_ASSERT( end2 == text.end() );
    }

    // and verify it again:
    CheckIteratedChars(text, textUTF16);
}

#endif // wxUSE_UNICODE