]>
Commit | Line | Data |
---|---|---|
1 | /////////////////////////////////////////////////////////////////////////////// | |
2 | // Name: tests/strings/unicode.cpp | |
3 | // Purpose: Unicode unit test | |
4 | // Author: Vadim Zeitlin, Wlodzimierz ABX Skiba | |
5 | // Created: 2004-04-28 | |
6 | // Copyright: (c) 2004 Vadim Zeitlin, Wlodzimierz Skiba | |
7 | /////////////////////////////////////////////////////////////////////////////// | |
8 | ||
9 | // ---------------------------------------------------------------------------- | |
10 | // headers | |
11 | // ---------------------------------------------------------------------------- | |
12 | ||
13 | #include "testprec.h" | |
14 | ||
15 | #ifdef __BORLANDC__ | |
16 | #pragma hdrstop | |
17 | #endif | |
18 | ||
19 | #ifndef WX_PRECOMP | |
20 | #include "wx/wx.h" | |
21 | #endif // WX_PRECOMP | |
22 | ||
23 | #include "wx/encconv.h" | |
24 | ||
25 | // ---------------------------------------------------------------------------- | |
26 | // helper class holding the matching MB and WC strings | |
27 | // ---------------------------------------------------------------------------- | |
28 | ||
29 | struct StringConversionData | |
30 | { | |
31 | // either str or wcs (but not both) may be NULL, this means that the conversion | |
32 | // to it should fail | |
33 | StringConversionData(const char *str_, const wchar_t *wcs_, int flags_ = 0) | |
34 | : str(str_), wcs(wcs_), flags(flags_) | |
35 | { | |
36 | } | |
37 | ||
38 | const char * const str; | |
39 | const wchar_t * const wcs; | |
40 | ||
41 | enum | |
42 | { | |
43 | TEST_BOTH = 0, // test both str -> wcs and wcs -> str | |
44 | ONLY_MB2WC = 1 // only test str -> wcs conversion | |
45 | }; | |
46 | ||
47 | const int flags; | |
48 | ||
49 | // test that the conversion between str and wcs (subject to flags) succeeds | |
50 | // | |
51 | // the first argument is the index in the test array and is used solely for | |
52 | // diagnostics | |
53 | void Test(size_t n, wxMBConv& conv) const | |
54 | { | |
55 | if ( str ) | |
56 | { | |
57 | wxWCharBuffer wbuf = conv.cMB2WC(str); | |
58 | ||
59 | if ( wcs ) | |
60 | { | |
61 | CPPUNIT_ASSERT_MESSAGE | |
62 | ( | |
63 | Message(n, "MB2WC failed"), | |
64 | wbuf.data() | |
65 | ); | |
66 | ||
67 | CPPUNIT_ASSERT_MESSAGE | |
68 | ( | |
69 | Message(n, "MB2WC", wbuf, wcs), | |
70 | wxStrcmp(wbuf, wcs) == 0 | |
71 | ); | |
72 | } | |
73 | else // conversion is supposed to fail | |
74 | { | |
75 | CPPUNIT_ASSERT_MESSAGE | |
76 | ( | |
77 | Message(n, "MB2WC succeeded"), | |
78 | !wbuf.data() | |
79 | ); | |
80 | } | |
81 | } | |
82 | ||
83 | if ( wcs && !(flags & ONLY_MB2WC) ) | |
84 | { | |
85 | wxCharBuffer buf = conv.cWC2MB(wcs); | |
86 | ||
87 | if ( str ) | |
88 | { | |
89 | CPPUNIT_ASSERT_MESSAGE | |
90 | ( | |
91 | Message(n, "WC2MB failed"), | |
92 | buf.data() | |
93 | ); | |
94 | ||
95 | CPPUNIT_ASSERT_MESSAGE | |
96 | ( | |
97 | Message(n, "WC2MB", buf, str), | |
98 | strcmp(buf, str) == 0 | |
99 | ); | |
100 | } | |
101 | else | |
102 | { | |
103 | CPPUNIT_ASSERT_MESSAGE | |
104 | ( | |
105 | Message(n, "WC2MB succeeded"), | |
106 | !buf.data() | |
107 | ); | |
108 | } | |
109 | } | |
110 | } | |
111 | ||
112 | private: | |
113 | static std::string | |
114 | Message(size_t n, const wxString& msg) | |
115 | { | |
116 | return std::string(wxString::Format("#%lu: %s", (unsigned long)n, msg)); | |
117 | } | |
118 | ||
119 | template <typename T> | |
120 | static std::string | |
121 | Message(size_t n, | |
122 | const char *func, | |
123 | const wxCharTypeBuffer<T>& actual, | |
124 | const T *expected) | |
125 | { | |
126 | return Message(n, | |
127 | wxString::Format("%s returned \"%s\", expected \"%s\"", | |
128 | func, actual.data(), expected)); | |
129 | } | |
130 | }; | |
131 | ||
132 | // ---------------------------------------------------------------------------- | |
133 | // test class | |
134 | // ---------------------------------------------------------------------------- | |
135 | ||
136 | class UnicodeTestCase : public CppUnit::TestCase | |
137 | { | |
138 | public: | |
139 | UnicodeTestCase(); | |
140 | ||
141 | private: | |
142 | CPPUNIT_TEST_SUITE( UnicodeTestCase ); | |
143 | CPPUNIT_TEST( ToFromAscii ); | |
144 | CPPUNIT_TEST( ConstructorsWithConversion ); | |
145 | CPPUNIT_TEST( ConversionFixed ); | |
146 | CPPUNIT_TEST( ConversionWithNULs ); | |
147 | CPPUNIT_TEST( ConversionUTF7 ); | |
148 | CPPUNIT_TEST( ConversionUTF8 ); | |
149 | CPPUNIT_TEST( ConversionUTF16 ); | |
150 | CPPUNIT_TEST( ConversionUTF32 ); | |
151 | CPPUNIT_TEST( IsConvOk ); | |
152 | #if wxUSE_UNICODE | |
153 | CPPUNIT_TEST( Iteration ); | |
154 | #endif | |
155 | CPPUNIT_TEST_SUITE_END(); | |
156 | ||
157 | void ToFromAscii(); | |
158 | void ConstructorsWithConversion(); | |
159 | void ConversionFixed(); | |
160 | void ConversionWithNULs(); | |
161 | void ConversionUTF7(); | |
162 | void ConversionUTF8(); | |
163 | void ConversionUTF16(); | |
164 | void ConversionUTF32(); | |
165 | void IsConvOk(); | |
166 | #if wxUSE_UNICODE | |
167 | void Iteration(); | |
168 | #endif | |
169 | ||
170 | DECLARE_NO_COPY_CLASS(UnicodeTestCase) | |
171 | }; | |
172 | ||
173 | // register in the unnamed registry so that these tests are run by default | |
174 | CPPUNIT_TEST_SUITE_REGISTRATION( UnicodeTestCase ); | |
175 | ||
176 | // also include in its own registry so that these tests can be run alone | |
177 | CPPUNIT_TEST_SUITE_NAMED_REGISTRATION( UnicodeTestCase, "UnicodeTestCase" ); | |
178 | ||
179 | UnicodeTestCase::UnicodeTestCase() | |
180 | { | |
181 | } | |
182 | ||
183 | void UnicodeTestCase::ToFromAscii() | |
184 | { | |
185 | ||
186 | #define TEST_TO_FROM_ASCII(txt) \ | |
187 | { \ | |
188 | static const char *msg = txt; \ | |
189 | wxString s = wxString::FromAscii(msg); \ | |
190 | CPPUNIT_ASSERT( strcmp( s.ToAscii() , msg ) == 0 ); \ | |
191 | } | |
192 | ||
193 | TEST_TO_FROM_ASCII( "Hello, world!" ); | |
194 | TEST_TO_FROM_ASCII( "additional \" special \t test \\ component \n :-)" ); | |
195 | } | |
196 | ||
197 | void UnicodeTestCase::ConstructorsWithConversion() | |
198 | { | |
199 | // the string "Déjà" in UTF-8 and wchar_t: | |
200 | const unsigned char utf8Buf[] = {0x44,0xC3,0xA9,0x6A,0xC3,0xA0,0}; | |
201 | const unsigned char utf8subBuf[] = {0x44,0xC3,0xA9,0x6A,0}; // just "Déj" | |
202 | const char *utf8 = (char *)utf8Buf; | |
203 | const char *utf8sub = (char *)utf8subBuf; | |
204 | ||
205 | wxString s1(utf8, wxConvUTF8); | |
206 | ||
207 | #if wxUSE_UNICODE | |
208 | const wchar_t wchar[] = {0x44,0xE9,0x6A,0xE0,0}; | |
209 | CPPUNIT_ASSERT_EQUAL( wchar, s1 ); | |
210 | ||
211 | wxString s2(wchar); | |
212 | CPPUNIT_ASSERT_EQUAL( wchar, s2 ); | |
213 | CPPUNIT_ASSERT_EQUAL( wxString::FromUTF8(utf8), s2 ); | |
214 | #else | |
215 | CPPUNIT_ASSERT_EQUAL( utf8, s1 ); | |
216 | #endif | |
217 | ||
218 | wxString sub(utf8sub, wxConvUTF8); // "Dej" substring | |
219 | wxString s3(utf8, wxConvUTF8, 4); | |
220 | CPPUNIT_ASSERT_EQUAL( sub, s3 ); | |
221 | ||
222 | #if wxUSE_UNICODE | |
223 | wxString s4(wchar, wxConvUTF8, 3); | |
224 | CPPUNIT_ASSERT_EQUAL( sub, s4 ); | |
225 | ||
226 | // conversion should stop with failure at pos 35 | |
227 | wxString s("\t[pl]open.format.Sformatuj dyskietk\xea=gfloppy %f", wxConvUTF8); | |
228 | CPPUNIT_ASSERT( s.empty() ); | |
229 | #endif // wxUSE_UNICODE | |
230 | ||
231 | ||
232 | // test using Unicode strings together with char* strings (this must work | |
233 | // in ANSI mode as well, of course): | |
234 | wxString s5("ascii"); | |
235 | CPPUNIT_ASSERT_EQUAL( "ascii", s5 ); | |
236 | ||
237 | s5 += " value"; | |
238 | ||
239 | CPPUNIT_ASSERT( strcmp(s5.mb_str(), "ascii value") == 0 ); | |
240 | CPPUNIT_ASSERT_EQUAL( "ascii value", s5 ); | |
241 | CPPUNIT_ASSERT( s5 != "SomethingElse" ); | |
242 | } | |
243 | ||
244 | void UnicodeTestCase::ConversionFixed() | |
245 | { | |
246 | size_t len; | |
247 | ||
248 | #if wxUSE_UNICODE | |
249 | wxConvLibc.cWC2MB(L"", 0, &len); | |
250 | #else // !wxUSE_UNICODE | |
251 | wxConvLibc.cMB2WC("", 0, &len); | |
252 | #endif // wxUSE_UNICODE/!wxUSE_UNICODE | |
253 | ||
254 | CPPUNIT_ASSERT_EQUAL( 0, len ); | |
255 | ||
256 | #if wxUSE_UNICODE | |
257 | // check that when we convert a fixed number of characters we obtain the | |
258 | // expected return value | |
259 | CPPUNIT_ASSERT_EQUAL( 0, wxConvLibc.ToWChar(NULL, 0, "", 0) ); | |
260 | CPPUNIT_ASSERT_EQUAL( 1, wxConvLibc.ToWChar(NULL, 0, "x", 1) ); | |
261 | CPPUNIT_ASSERT_EQUAL( 2, wxConvLibc.ToWChar(NULL, 0, "x", 2) ); | |
262 | CPPUNIT_ASSERT_EQUAL( 2, wxConvLibc.ToWChar(NULL, 0, "xy", 2) ); | |
263 | #endif // wxUSE_UNICODE | |
264 | } | |
265 | ||
266 | void UnicodeTestCase::ConversionWithNULs() | |
267 | { | |
268 | #if wxUSE_UNICODE | |
269 | static const size_t lenNulString = 10; | |
270 | ||
271 | wxString szTheString(L"The\0String", wxConvLibc, lenNulString); | |
272 | wxCharBuffer theBuffer = szTheString.mb_str(); | |
273 | ||
274 | CPPUNIT_ASSERT( memcmp(theBuffer.data(), "The\0String", | |
275 | lenNulString + 1) == 0 ); | |
276 | ||
277 | wxString szTheString2("The\0String", wxConvLocal, lenNulString); | |
278 | CPPUNIT_ASSERT_EQUAL( lenNulString, szTheString2.length() ); | |
279 | CPPUNIT_ASSERT( wxTmemcmp(szTheString2.c_str(), L"The\0String", | |
280 | lenNulString + 1) == 0 ); | |
281 | #else // !wxUSE_UNICODE | |
282 | wxString szTheString("TheString"); | |
283 | szTheString.insert(3, 1, '\0'); | |
284 | wxWCharBuffer theBuffer = szTheString.wc_str(wxConvLibc); | |
285 | ||
286 | CPPUNIT_ASSERT( memcmp(theBuffer.data(), L"The\0String", 11 * sizeof(wchar_t)) == 0 ); | |
287 | ||
288 | wxString szLocalTheString("TheString"); | |
289 | szLocalTheString.insert(3, 1, '\0'); | |
290 | wxWCharBuffer theLocalBuffer = szLocalTheString.wc_str(wxConvLocal); | |
291 | ||
292 | CPPUNIT_ASSERT( memcmp(theLocalBuffer.data(), L"The\0String", 11 * sizeof(wchar_t)) == 0 ); | |
293 | #endif // wxUSE_UNICODE/!wxUSE_UNICODE | |
294 | } | |
295 | ||
296 | void UnicodeTestCase::ConversionUTF7() | |
297 | { | |
298 | static const StringConversionData utf7data[] = | |
299 | { | |
300 | // normal fragments | |
301 | StringConversionData("+AKM-", L"\xa3"), | |
302 | StringConversionData("+AOk-t+AOk-", L"\xe9t\xe9"), | |
303 | ||
304 | // this one is an alternative valid encoding of the same string | |
305 | StringConversionData("+AOk-t+AOk", L"\xe9t\xe9", | |
306 | StringConversionData::ONLY_MB2WC), | |
307 | ||
308 | // some special cases | |
309 | StringConversionData("+-", L"+"), | |
310 | StringConversionData("+--", L"+-"), | |
311 | ||
312 | // the following are invalid UTF-7 sequences | |
313 | StringConversionData("\xa3", NULL), | |
314 | StringConversionData("+", NULL), | |
315 | StringConversionData("+~", NULL), | |
316 | StringConversionData("a+", NULL), | |
317 | }; | |
318 | ||
319 | for ( size_t n = 0; n < WXSIZEOF(utf7data); n++ ) | |
320 | { | |
321 | const StringConversionData& d = utf7data[n]; | |
322 | ||
323 | // converting to/from UTF-7 using iconv() currently doesn't work | |
324 | // because of several problems: | |
325 | // - GetMBNulLen() doesn't return correct result (iconv converts L'\0' | |
326 | // to an incomplete and anyhow nonsensical "+AA" string) | |
327 | // - iconv refuses to convert "+-" (although it converts "+-\n" just | |
328 | // fine, go figure) | |
329 | // | |
330 | // I have no idea how to fix this so just disable the test for now | |
331 | #if 0 | |
332 | d.Test(n, wxCSConv("utf-7")); | |
333 | #endif | |
334 | d.Test(n, wxConvUTF7); | |
335 | } | |
336 | } | |
337 | ||
338 | void UnicodeTestCase::ConversionUTF8() | |
339 | { | |
340 | static const StringConversionData utf8data[] = | |
341 | { | |
342 | #ifdef wxHAVE_U_ESCAPE | |
343 | StringConversionData("\xc2\xa3", L"\u00a3"), | |
344 | #endif | |
345 | StringConversionData("\xc2", NULL), | |
346 | }; | |
347 | ||
348 | wxCSConv conv(wxT("utf-8")); | |
349 | for ( size_t n = 0; n < WXSIZEOF(utf8data); n++ ) | |
350 | { | |
351 | const StringConversionData& d = utf8data[n]; | |
352 | d.Test(n, conv); | |
353 | d.Test(n, wxConvUTF8); | |
354 | } | |
355 | } | |
356 | ||
357 | void UnicodeTestCase::ConversionUTF16() | |
358 | { | |
359 | static const StringConversionData utf16data[] = | |
360 | { | |
361 | #ifdef wxHAVE_U_ESCAPE | |
362 | StringConversionData( | |
363 | "\x04\x1f\x04\x40\x04\x38\x04\x32\x04\x35\x04\x42\0\0", | |
364 | L"\u041f\u0440\u0438\u0432\u0435\u0442"), | |
365 | StringConversionData( | |
366 | "\x01\0\0b\x01\0\0a\x01\0\0r\0\0", | |
367 | L"\u0100b\u0100a\u0100r"), | |
368 | #endif | |
369 | StringConversionData("\0f\0o\0o\0\0", L"foo"), | |
370 | }; | |
371 | ||
372 | wxCSConv conv(wxFONTENCODING_UTF16BE); | |
373 | for ( size_t n = 0; n < WXSIZEOF(utf16data); n++ ) | |
374 | { | |
375 | const StringConversionData& d = utf16data[n]; | |
376 | d.Test(n, conv); | |
377 | } | |
378 | ||
379 | // special case: this string has consecutive NULs inside it which don't | |
380 | // terminate the string, this exposed a bug in our conversion code which | |
381 | // got confused in this case | |
382 | size_t len; | |
383 | conv.cMB2WC("\x01\0\0B\0C" /* A macron BC */, 6, &len); | |
384 | CPPUNIT_ASSERT_EQUAL( 3, len ); | |
385 | } | |
386 | ||
387 | void UnicodeTestCase::ConversionUTF32() | |
388 | { | |
389 | static const StringConversionData utf32data[] = | |
390 | { | |
391 | #ifdef wxHAVE_U_ESCAPE | |
392 | StringConversionData( | |
393 | "\0\0\x04\x1f\0\0\x04\x40\0\0\x04\x38\0\0\x04\x32\0\0\x04\x35\0\0\x04\x42\0\0\0\0", | |
394 | L"\u041f\u0440\u0438\u0432\u0435\u0442"), | |
395 | #endif | |
396 | StringConversionData("\0\0\0f\0\0\0o\0\0\0o\0\0\0\0", L"foo"), | |
397 | }; | |
398 | ||
399 | wxCSConv conv(wxFONTENCODING_UTF32BE); | |
400 | for ( size_t n = 0; n < WXSIZEOF(utf32data); n++ ) | |
401 | { | |
402 | const StringConversionData& d = utf32data[n]; | |
403 | d.Test(n, conv); | |
404 | } | |
405 | ||
406 | size_t len; | |
407 | conv.cMB2WC("\0\0\x01\0\0\0\0B\0\0\0C" /* A macron BC */, 12, &len); | |
408 | CPPUNIT_ASSERT_EQUAL( 3, len ); | |
409 | } | |
410 | ||
411 | void UnicodeTestCase::IsConvOk() | |
412 | { | |
413 | CPPUNIT_ASSERT( wxCSConv(wxFONTENCODING_SYSTEM).IsOk() ); | |
414 | CPPUNIT_ASSERT( wxCSConv("US-ASCII").IsOk() ); | |
415 | CPPUNIT_ASSERT( wxCSConv("UTF-8").IsOk() ); | |
416 | CPPUNIT_ASSERT( !wxCSConv("NoSuchConversion").IsOk() ); | |
417 | ||
418 | #ifdef __WINDOWS__ | |
419 | CPPUNIT_ASSERT( wxCSConv("WINDOWS-437").IsOk() ); | |
420 | #endif | |
421 | } | |
422 | ||
#if wxUSE_UNICODE
// Check reading and writing wxString contents through its iterators,
// including that end iterators obtained earlier still compare equal to
// text.end() after characters have been overwritten.
void UnicodeTestCase::Iteration()
{
    // "czech" in Czech ("cestina"):
    static const char *textUTF8 = "\304\215e\305\241tina";
    static const wchar_t textUTF16[] =
        { 0x10D, 0x65, 0x161, 0x74, 0x69, 0x6E, 0x61, 0 };

    wxString text(wxString::FromUTF8(textUTF8));
    CPPUNIT_ASSERT( wxStrcmp(text.wc_str(), textUTF16) == 0 );

    // verify the string was decoded correctly:
    {
        size_t idx = 0;
        wxString::const_iterator i = text.begin();
        while ( i != text.end() )
        {
            CPPUNIT_ASSERT( *i == textUTF16[idx] );

            ++i;
            ++idx;
        }
    }

    // overwrite the string with something that is shorter in UTF-8:
    {
        wxString::iterator i = text.begin();
        while ( i != text.end() )
        {
            *i = 'x';
            ++i;
        }
    }

    // restore the original text now:
    {
        // taken before any character is written back, these are compared
        // with text.end() after each assignment and once more at the end
        wxString::iterator end1 = text.end();
        wxString::const_iterator end2 = text.end();

        size_t idx = 0;
        wxString::iterator i = text.begin();
        while ( i != text.end() )
        {
            *i = textUTF16[idx];

            CPPUNIT_ASSERT( end1 == text.end() );
            CPPUNIT_ASSERT( end2 == text.end() );

            ++i;
            ++idx;
        }

        CPPUNIT_ASSERT( end1 == text.end() );
        CPPUNIT_ASSERT( end2 == text.end() );
    }

    // and verify it again:
    {
        size_t idx = 0;
        wxString::const_iterator i = text.begin();
        while ( i != text.end() )
        {
            CPPUNIT_ASSERT( *i == textUTF16[idx] );

            ++i;
            ++idx;
        }
    }
}
#endif // wxUSE_UNICODE
476 |