]>
Commit | Line | Data |
---|---|---|
1 | /////////////////////////////////////////////////////////////////////////////// | |
2 | // Name: tests/strings/unicode.cpp | |
3 | // Purpose: Unicode unit test | |
4 | // Author: Vadim Zeitlin, Wlodzimierz ABX Skiba | |
5 | // Created: 2004-04-28 | |
6 | // RCS-ID: $Id$ | |
7 | // Copyright: (c) 2004 Vadim Zeitlin, Wlodzimierz Skiba | |
8 | /////////////////////////////////////////////////////////////////////////////// | |
9 | ||
10 | // ---------------------------------------------------------------------------- | |
11 | // headers | |
12 | // ---------------------------------------------------------------------------- | |
13 | ||
14 | #include "testprec.h" | |
15 | ||
16 | #ifdef __BORLANDC__ | |
17 | #pragma hdrstop | |
18 | #endif | |
19 | ||
20 | #ifndef WX_PRECOMP | |
21 | #include "wx/wx.h" | |
22 | #endif // WX_PRECOMP | |
23 | ||
24 | #include "wx/encconv.h" | |
25 | ||
26 | // ---------------------------------------------------------------------------- | |
27 | // helper class holding the matching MB and WC strings | |
28 | // ---------------------------------------------------------------------------- | |
29 | ||
30 | struct StringConversionData | |
31 | { | |
32 | // either str or wcs (but not both) may be NULL, this means that the conversion | |
33 | // to it should fail | |
34 | StringConversionData(const char *str_, const wchar_t *wcs_, int flags_ = 0) | |
35 | : str(str_), wcs(wcs_), flags(flags_) | |
36 | { | |
37 | } | |
38 | ||
39 | const char * const str; | |
40 | const wchar_t * const wcs; | |
41 | ||
42 | enum | |
43 | { | |
44 | TEST_BOTH = 0, // test both str -> wcs and wcs -> str | |
45 | ONLY_MB2WC = 1 // only test str -> wcs conversion | |
46 | }; | |
47 | ||
48 | const int flags; | |
49 | ||
50 | // test that the conversion between str and wcs (subject to flags) succeeds | |
51 | // | |
52 | // the first argument is the index in the test array and is used solely for | |
53 | // diagnostics | |
54 | void Test(size_t n, wxMBConv& conv) const | |
55 | { | |
56 | if ( str ) | |
57 | { | |
58 | wxWCharBuffer wbuf = conv.cMB2WC(str); | |
59 | ||
60 | if ( wcs ) | |
61 | { | |
62 | CPPUNIT_ASSERT_MESSAGE | |
63 | ( | |
64 | Message(n, "MB2WC failed"), | |
65 | wbuf.data() | |
66 | ); | |
67 | ||
68 | CPPUNIT_ASSERT_MESSAGE | |
69 | ( | |
70 | Message(n, "MB2WC", wbuf, wcs), | |
71 | wxStrcmp(wbuf, wcs) == 0 | |
72 | ); | |
73 | } | |
74 | else // conversion is supposed to fail | |
75 | { | |
76 | CPPUNIT_ASSERT_MESSAGE | |
77 | ( | |
78 | Message(n, "MB2WC succeeded"), | |
79 | !wbuf.data() | |
80 | ); | |
81 | } | |
82 | } | |
83 | ||
84 | if ( wcs && !(flags & ONLY_MB2WC) ) | |
85 | { | |
86 | wxCharBuffer buf = conv.cWC2MB(wcs); | |
87 | ||
88 | if ( str ) | |
89 | { | |
90 | CPPUNIT_ASSERT_MESSAGE | |
91 | ( | |
92 | Message(n, "WC2MB failed"), | |
93 | buf.data() | |
94 | ); | |
95 | ||
96 | CPPUNIT_ASSERT_MESSAGE | |
97 | ( | |
98 | Message(n, "WC2MB", buf, str), | |
99 | strcmp(buf, str) == 0 | |
100 | ); | |
101 | } | |
102 | else | |
103 | { | |
104 | CPPUNIT_ASSERT_MESSAGE | |
105 | ( | |
106 | Message(n, "WC2MB succeeded"), | |
107 | !buf.data() | |
108 | ); | |
109 | } | |
110 | } | |
111 | } | |
112 | ||
113 | private: | |
114 | static std::string | |
115 | Message(size_t n, const wxString& msg) | |
116 | { | |
117 | return std::string(wxString::Format("#%lu: %s", (unsigned long)n, msg)); | |
118 | } | |
119 | ||
120 | template <typename T> | |
121 | static std::string | |
122 | Message(size_t n, | |
123 | const char *func, | |
124 | const wxCharTypeBuffer<T>& actual, | |
125 | const T *expected) | |
126 | { | |
127 | return Message(n, | |
128 | wxString::Format("%s returned \"%s\", expected \"%s\"", | |
129 | func, actual.data(), expected)); | |
130 | } | |
131 | }; | |
132 | ||
133 | // ---------------------------------------------------------------------------- | |
134 | // test class | |
135 | // ---------------------------------------------------------------------------- | |
136 | ||
137 | class UnicodeTestCase : public CppUnit::TestCase | |
138 | { | |
139 | public: | |
140 | UnicodeTestCase(); | |
141 | ||
142 | private: | |
143 | CPPUNIT_TEST_SUITE( UnicodeTestCase ); | |
144 | CPPUNIT_TEST( ToFromAscii ); | |
145 | CPPUNIT_TEST( ConstructorsWithConversion ); | |
146 | CPPUNIT_TEST( ConversionFixed ); | |
147 | CPPUNIT_TEST( ConversionWithNULs ); | |
148 | CPPUNIT_TEST( ConversionUTF7 ); | |
149 | CPPUNIT_TEST( ConversionUTF8 ); | |
150 | CPPUNIT_TEST( ConversionUTF16 ); | |
151 | CPPUNIT_TEST( ConversionUTF32 ); | |
152 | CPPUNIT_TEST( IsConvOk ); | |
153 | #if wxUSE_UNICODE | |
154 | CPPUNIT_TEST( Iteration ); | |
155 | #endif | |
156 | CPPUNIT_TEST_SUITE_END(); | |
157 | ||
158 | void ToFromAscii(); | |
159 | void ConstructorsWithConversion(); | |
160 | void ConversionFixed(); | |
161 | void ConversionWithNULs(); | |
162 | void ConversionUTF7(); | |
163 | void ConversionUTF8(); | |
164 | void ConversionUTF16(); | |
165 | void ConversionUTF32(); | |
166 | void IsConvOk(); | |
167 | #if wxUSE_UNICODE | |
168 | void Iteration(); | |
169 | #endif | |
170 | ||
171 | DECLARE_NO_COPY_CLASS(UnicodeTestCase) | |
172 | }; | |
173 | ||
174 | // register in the unnamed registry so that these tests are run by default | |
175 | CPPUNIT_TEST_SUITE_REGISTRATION( UnicodeTestCase ); | |
176 | ||
177 | // also include in its own registry so that these tests can be run alone | |
178 | CPPUNIT_TEST_SUITE_NAMED_REGISTRATION( UnicodeTestCase, "UnicodeTestCase" ); | |
179 | ||
180 | UnicodeTestCase::UnicodeTestCase() | |
181 | { | |
182 | } | |
183 | ||
184 | void UnicodeTestCase::ToFromAscii() | |
185 | { | |
186 | ||
187 | #define TEST_TO_FROM_ASCII(txt) \ | |
188 | { \ | |
189 | static const char *msg = txt; \ | |
190 | wxString s = wxString::FromAscii(msg); \ | |
191 | CPPUNIT_ASSERT( strcmp( s.ToAscii() , msg ) == 0 ); \ | |
192 | } | |
193 | ||
194 | TEST_TO_FROM_ASCII( "Hello, world!" ); | |
195 | TEST_TO_FROM_ASCII( "additional \" special \t test \\ component \n :-)" ); | |
196 | } | |
197 | ||
198 | void UnicodeTestCase::ConstructorsWithConversion() | |
199 | { | |
200 | // the string "Déjà" in UTF-8 and wchar_t: | |
201 | const unsigned char utf8Buf[] = {0x44,0xC3,0xA9,0x6A,0xC3,0xA0,0}; | |
202 | const unsigned char utf8subBuf[] = {0x44,0xC3,0xA9,0x6A,0}; // just "Déj" | |
203 | const char *utf8 = (char *)utf8Buf; | |
204 | const char *utf8sub = (char *)utf8subBuf; | |
205 | ||
206 | wxString s1(utf8, wxConvUTF8); | |
207 | ||
208 | #if wxUSE_UNICODE | |
209 | const wchar_t wchar[] = {0x44,0xE9,0x6A,0xE0,0}; | |
210 | CPPUNIT_ASSERT_EQUAL( wchar, s1 ); | |
211 | ||
212 | wxString s2(wchar); | |
213 | CPPUNIT_ASSERT_EQUAL( wchar, s2 ); | |
214 | CPPUNIT_ASSERT_EQUAL( wxString::FromUTF8(utf8), s2 ); | |
215 | #else | |
216 | CPPUNIT_ASSERT_EQUAL( utf8, s1 ); | |
217 | #endif | |
218 | ||
219 | wxString sub(utf8sub, wxConvUTF8); // "Dej" substring | |
220 | wxString s3(utf8, wxConvUTF8, 4); | |
221 | CPPUNIT_ASSERT_EQUAL( sub, s3 ); | |
222 | ||
223 | #if wxUSE_UNICODE | |
224 | wxString s4(wchar, wxConvUTF8, 3); | |
225 | CPPUNIT_ASSERT_EQUAL( sub, s4 ); | |
226 | ||
227 | // conversion should stop with failure at pos 35 | |
228 | wxString s("\t[pl]open.format.Sformatuj dyskietk\xea=gfloppy %f", wxConvUTF8); | |
229 | CPPUNIT_ASSERT( s.empty() ); | |
230 | #endif // wxUSE_UNICODE | |
231 | ||
232 | ||
233 | // test using Unicode strings together with char* strings (this must work | |
234 | // in ANSI mode as well, of course): | |
235 | wxString s5("ascii"); | |
236 | CPPUNIT_ASSERT_EQUAL( "ascii", s5 ); | |
237 | ||
238 | s5 += " value"; | |
239 | ||
240 | CPPUNIT_ASSERT( strcmp(s5.mb_str(), "ascii value") == 0 ); | |
241 | CPPUNIT_ASSERT_EQUAL( "ascii value", s5 ); | |
242 | CPPUNIT_ASSERT( s5 != "SomethingElse" ); | |
243 | } | |
244 | ||
245 | void UnicodeTestCase::ConversionFixed() | |
246 | { | |
247 | size_t len; | |
248 | ||
249 | #if wxUSE_UNICODE | |
250 | wxConvLibc.cWC2MB(L"", 0, &len); | |
251 | #else // !wxUSE_UNICODE | |
252 | wxConvLibc.cMB2WC("", 0, &len); | |
253 | #endif // wxUSE_UNICODE/!wxUSE_UNICODE | |
254 | ||
255 | CPPUNIT_ASSERT_EQUAL( 0, len ); | |
256 | ||
257 | #if wxUSE_UNICODE | |
258 | // check that when we convert a fixed number of characters we obtain the | |
259 | // expected return value | |
260 | CPPUNIT_ASSERT_EQUAL( 0, wxConvLibc.ToWChar(NULL, 0, "", 0) ); | |
261 | CPPUNIT_ASSERT_EQUAL( 1, wxConvLibc.ToWChar(NULL, 0, "x", 1) ); | |
262 | CPPUNIT_ASSERT_EQUAL( 2, wxConvLibc.ToWChar(NULL, 0, "x", 2) ); | |
263 | CPPUNIT_ASSERT_EQUAL( 2, wxConvLibc.ToWChar(NULL, 0, "xy", 2) ); | |
264 | #endif // wxUSE_UNICODE | |
265 | } | |
266 | ||
267 | void UnicodeTestCase::ConversionWithNULs() | |
268 | { | |
269 | #if wxUSE_UNICODE | |
270 | static const size_t lenNulString = 10; | |
271 | ||
272 | wxString szTheString(L"The\0String", wxConvLibc, lenNulString); | |
273 | wxCharBuffer theBuffer = szTheString.mb_str(); | |
274 | ||
275 | CPPUNIT_ASSERT( memcmp(theBuffer.data(), "The\0String", | |
276 | lenNulString + 1) == 0 ); | |
277 | ||
278 | wxString szTheString2("The\0String", wxConvLocal, lenNulString); | |
279 | CPPUNIT_ASSERT_EQUAL( lenNulString, szTheString2.length() ); | |
280 | CPPUNIT_ASSERT( wxTmemcmp(szTheString2.c_str(), L"The\0String", | |
281 | lenNulString + 1) == 0 ); | |
282 | #else // !wxUSE_UNICODE | |
283 | wxString szTheString("TheString"); | |
284 | szTheString.insert(3, 1, '\0'); | |
285 | wxWCharBuffer theBuffer = szTheString.wc_str(wxConvLibc); | |
286 | ||
287 | CPPUNIT_ASSERT( memcmp(theBuffer.data(), L"The\0String", 11 * sizeof(wchar_t)) == 0 ); | |
288 | ||
289 | wxString szLocalTheString("TheString"); | |
290 | szLocalTheString.insert(3, 1, '\0'); | |
291 | wxWCharBuffer theLocalBuffer = szLocalTheString.wc_str(wxConvLocal); | |
292 | ||
293 | CPPUNIT_ASSERT( memcmp(theLocalBuffer.data(), L"The\0String", 11 * sizeof(wchar_t)) == 0 ); | |
294 | #endif // wxUSE_UNICODE/!wxUSE_UNICODE | |
295 | } | |
296 | ||
297 | void UnicodeTestCase::ConversionUTF7() | |
298 | { | |
299 | static const StringConversionData utf7data[] = | |
300 | { | |
301 | // normal fragments | |
302 | StringConversionData("+AKM-", L"\xa3"), | |
303 | StringConversionData("+AOk-t+AOk-", L"\xe9t\xe9"), | |
304 | ||
305 | // this one is an alternative valid encoding of the same string | |
306 | StringConversionData("+AOk-t+AOk", L"\xe9t\xe9", | |
307 | StringConversionData::ONLY_MB2WC), | |
308 | ||
309 | // some special cases | |
310 | StringConversionData("+-", L"+"), | |
311 | StringConversionData("+--", L"+-"), | |
312 | ||
313 | // the following are invalid UTF-7 sequences | |
314 | StringConversionData("\xa3", NULL), | |
315 | StringConversionData("+", NULL), | |
316 | StringConversionData("+~", NULL), | |
317 | StringConversionData("a+", NULL), | |
318 | }; | |
319 | ||
320 | for ( size_t n = 0; n < WXSIZEOF(utf7data); n++ ) | |
321 | { | |
322 | const StringConversionData& d = utf7data[n]; | |
323 | ||
324 | // converting to/from UTF-7 using iconv() currently doesn't work | |
325 | // because of several problems: | |
326 | // - GetMBNulLen() doesn't return correct result (iconv converts L'\0' | |
327 | // to an incomplete and anyhow nonsensical "+AA" string) | |
328 | // - iconv refuses to convert "+-" (although it converts "+-\n" just | |
329 | // fine, go figure) | |
330 | // | |
331 | // I have no idea how to fix this so just disable the test for now | |
332 | #if 0 | |
333 | d.Test(n, wxCSConv("utf-7")); | |
334 | #endif | |
335 | d.Test(n, wxConvUTF7); | |
336 | } | |
337 | } | |
338 | ||
339 | void UnicodeTestCase::ConversionUTF8() | |
340 | { | |
341 | static const StringConversionData utf8data[] = | |
342 | { | |
343 | #ifdef wxHAVE_U_ESCAPE | |
344 | StringConversionData("\xc2\xa3", L"\u00a3"), | |
345 | #endif | |
346 | StringConversionData("\xc2", NULL), | |
347 | }; | |
348 | ||
349 | wxCSConv conv(wxT("utf-8")); | |
350 | for ( size_t n = 0; n < WXSIZEOF(utf8data); n++ ) | |
351 | { | |
352 | const StringConversionData& d = utf8data[n]; | |
353 | d.Test(n, conv); | |
354 | d.Test(n, wxConvUTF8); | |
355 | } | |
356 | } | |
357 | ||
358 | void UnicodeTestCase::ConversionUTF16() | |
359 | { | |
360 | static const StringConversionData utf16data[] = | |
361 | { | |
362 | #ifdef wxHAVE_U_ESCAPE | |
363 | StringConversionData( | |
364 | "\x04\x1f\x04\x40\x04\x38\x04\x32\x04\x35\x04\x42\0\0", | |
365 | L"\u041f\u0440\u0438\u0432\u0435\u0442"), | |
366 | StringConversionData( | |
367 | "\x01\0\0b\x01\0\0a\x01\0\0r\0\0", | |
368 | L"\u0100b\u0100a\u0100r"), | |
369 | #endif | |
370 | StringConversionData("\0f\0o\0o\0\0", L"foo"), | |
371 | }; | |
372 | ||
373 | wxCSConv conv(wxFONTENCODING_UTF16BE); | |
374 | for ( size_t n = 0; n < WXSIZEOF(utf16data); n++ ) | |
375 | { | |
376 | const StringConversionData& d = utf16data[n]; | |
377 | d.Test(n, conv); | |
378 | } | |
379 | ||
380 | // special case: this string has consecutive NULs inside it which don't | |
381 | // terminate the string, this exposed a bug in our conversion code which | |
382 | // got confused in this case | |
383 | size_t len; | |
384 | conv.cMB2WC("\x01\0\0B\0C" /* A macron BC */, 6, &len); | |
385 | CPPUNIT_ASSERT_EQUAL( 3, len ); | |
386 | } | |
387 | ||
388 | void UnicodeTestCase::ConversionUTF32() | |
389 | { | |
390 | static const StringConversionData utf32data[] = | |
391 | { | |
392 | #ifdef wxHAVE_U_ESCAPE | |
393 | StringConversionData( | |
394 | "\0\0\x04\x1f\0\0\x04\x40\0\0\x04\x38\0\0\x04\x32\0\0\x04\x35\0\0\x04\x42\0\0\0\0", | |
395 | L"\u041f\u0440\u0438\u0432\u0435\u0442"), | |
396 | #endif | |
397 | StringConversionData("\0\0\0f\0\0\0o\0\0\0o\0\0\0\0", L"foo"), | |
398 | }; | |
399 | ||
400 | wxCSConv conv(wxFONTENCODING_UTF32BE); | |
401 | for ( size_t n = 0; n < WXSIZEOF(utf32data); n++ ) | |
402 | { | |
403 | const StringConversionData& d = utf32data[n]; | |
404 | d.Test(n, conv); | |
405 | } | |
406 | ||
407 | size_t len; | |
408 | conv.cMB2WC("\0\0\x01\0\0\0\0B\0\0\0C" /* A macron BC */, 12, &len); | |
409 | CPPUNIT_ASSERT_EQUAL( 3, len ); | |
410 | } | |
411 | ||
412 | void UnicodeTestCase::IsConvOk() | |
413 | { | |
414 | CPPUNIT_ASSERT( wxCSConv(wxFONTENCODING_SYSTEM).IsOk() ); | |
415 | CPPUNIT_ASSERT( wxCSConv("US-ASCII").IsOk() ); | |
416 | CPPUNIT_ASSERT( wxCSConv("UTF-8").IsOk() ); | |
417 | CPPUNIT_ASSERT( !wxCSConv("NoSuchConversion").IsOk() ); | |
418 | ||
419 | #ifdef __WINDOWS__ | |
420 | CPPUNIT_ASSERT( wxCSConv("WINDOWS-437").IsOk() ); | |
421 | #endif | |
422 | } | |
423 | ||
#if wxUSE_UNICODE
// Check reading and writing the characters of a non-ASCII string through
// wxString iterators.
void UnicodeTestCase::Iteration()
{
    // "czech" in Czech ("cestina"), as UTF-8 bytes and as wide characters:
    static const char *czechUTF8 = "\304\215e\305\241tina";
    static const wchar_t textUTF16[] = {0x10D, 0x65, 0x161, 0x74, 0x69, 0x6E, 0x61, 0};

    wxString text(wxString::FromUTF8(czechUTF8));
    CPPUNIT_ASSERT( wxStrcmp(text.wc_str(), textUTF16) == 0 );

    // first pass: the characters read via a const iterator must be the
    // expected ones
    {
        size_t idx = 0;
        wxString::const_iterator i = text.begin();
        while ( i != text.end() )
        {
            CPPUNIT_ASSERT( *i == textUTF16[idx] );

            ++i;
            ++idx;
        }
    }

    // replace every character with one which is shorter in UTF-8
    {
        wxString::iterator i = text.begin();
        while ( i != text.end() )
        {
            *i = 'x';
            ++i;
        }
    }

    // write the original characters back, checking after each write that
    // the end iterators obtained beforehand still compare equal to end()
    {
        wxString::iterator end1 = text.end();
        wxString::const_iterator end2 = text.end();

        size_t idx = 0;
        wxString::iterator i = text.begin();
        while ( i != text.end() )
        {
            *i = textUTF16[idx];

            CPPUNIT_ASSERT( end1 == text.end() );
            CPPUNIT_ASSERT( end2 == text.end() );

            ++i;
            ++idx;
        }

        CPPUNIT_ASSERT( end1 == text.end() );
        CPPUNIT_ASSERT( end2 == text.end() );
    }

    // final pass: the string must contain the original text again
    {
        size_t idx = 0;
        wxString::const_iterator i = text.begin();
        while ( i != text.end() )
        {
            CPPUNIT_ASSERT( *i == textUTF16[idx] );

            ++i;
            ++idx;
        }
    }
}
#endif // wxUSE_UNICODE
477 |