1 ///////////////////////////////////////////////////////////////////////////////
2 // Name: tests/strings/unicode.cpp
3 // Purpose: Unicode unit test
4 // Author: Vadim Zeitlin, Wlodzimierz ABX Skiba
7 // Copyright: (c) 2004 Vadim Zeitlin, Wlodzimierz Skiba
8 ///////////////////////////////////////////////////////////////////////////////
10 // ----------------------------------------------------------------------------
12 // ----------------------------------------------------------------------------
24 #include "wx/encconv.h"
26 // ----------------------------------------------------------------------------
27 // helper class holding the matching MB and WC strings
28 // ----------------------------------------------------------------------------
// Pairs a multibyte string (str) with its wide character counterpart (wcs)
// and a set of flags, and provides Test() to verify the conversions between
// the two through a given wxMBConv.
30 struct StringConversionData
32 // either str or wcs (but not both) may be NULL, this means that the conversion
34 StringConversionData(const char *str_
, const wchar_t *wcs_
, int flags_
= 0)
35 : str(str_
), wcs(wcs_
), flags(flags_
)
39 const char * const str
;
40 const wchar_t * const wcs
;
44 TEST_BOTH
= 0, // test both str -> wcs and wcs -> str
45 ONLY_MB2WC
= 1 // only test str -> wcs conversion
// Test() first converts str with conv.cMB2WC() and then, if enabled, wcs
// with conv.cWC2MB(); every outcome is reported via CPPUNIT_ASSERT_MESSAGE
// using a message built by Message().
50 // test that the conversion between str and wcs (subject to flags) succeeds
52 // the first argument is the index in the test array and is used solely for
54 void Test(size_t n
, wxMBConv
& conv
) const
58 wxWCharBuffer wbuf
= conv
.cMB2WC(str
);
62 CPPUNIT_ASSERT_MESSAGE
64 Message(n
, "MB2WC failed"),
68 CPPUNIT_ASSERT_MESSAGE
70 Message(n
, "MB2WC", wbuf
, wcs
),
71 wxStrcmp(wbuf
, wcs
) == 0
74 else // conversion is supposed to fail
76 CPPUNIT_ASSERT_MESSAGE
78 Message(n
, "MB2WC succeeded"),
// reverse direction: only done when a wide string is available and the
// ONLY_MB2WC flag is not set
84 if ( wcs
&& !(flags
& ONLY_MB2WC
) )
86 wxCharBuffer buf
= conv
.cWC2MB(wcs
);
90 CPPUNIT_ASSERT_MESSAGE
92 Message(n
, "WC2MB failed"),
96 CPPUNIT_ASSERT_MESSAGE
98 Message(n
, "WC2MB", buf
, str
),
104 CPPUNIT_ASSERT_MESSAGE
106 Message(n
, "WC2MB succeeded"),
// builds the assertion message "#<n>: <msg>" where n is the index of the
// test array entry being checked
115 Message(size_t n
, const wxString
& msg
)
117 return std::string(wxString::Format("#%lu: %s", (unsigned long)n
, msg
));
// templated helper formatting a mismatch report of the form
// '<func> returned "<actual>", expected "<expected>"'; presumably this is the
// 4-argument Message() overload used by Test() above -- confirm
120 template <typename T
>
124 const wxCharTypeBuffer
<T
>& actual
,
128 wxString::Format("%s returned \"%s\", expected \"%s\"",
129 func
, actual
.data(), expected
));
133 // ----------------------------------------------------------------------------
134 // test data for UnicodeTestCase::Utf8()
135 // ----------------------------------------------------------------------------
// UTF-8 bytes of a Cyrillic sentence (2-byte sequences starting with 208 or
// 209, words separated by 32, i.e. space), terminated by a 0 byte
137 static const unsigned char utf8koi8r
[] =
139 208, 157, 208, 181, 209, 129, 208, 186, 208, 176, 208, 183, 208, 176,
140 208, 189, 208, 189, 208, 190, 32, 208, 191, 208, 190, 209, 128, 208,
141 176, 208, 180, 208, 190, 208, 178, 208, 176, 208, 187, 32, 208, 188,
142 208, 181, 208, 189, 209, 143, 32, 209, 129, 208, 178, 208, 190, 208,
143 181, 208, 185, 32, 208, 186, 209, 128, 209, 131, 209, 130, 208, 181,
144 208, 185, 209, 136, 208, 181, 208, 185, 32, 208, 189, 208, 190, 208,
145 178, 208, 190, 209, 129, 209, 130, 209, 140, 209, 142, 0
// UTF-8 bytes of a French phrase: plain ASCII letters plus accented letters
// encoded as 2-byte sequences (0xc3 0xa8 and 0xc3 0xa9), terminated by 0
148 static const unsigned char utf8iso8859_1
[] =
150 0x53, 0x79, 0x73, 0x74, 0xc3, 0xa8, 0x6d, 0x65, 0x73, 0x20, 0x49, 0x6e,
151 0x74, 0xc3, 0xa9, 0x67, 0x72, 0x61, 0x62, 0x6c, 0x65, 0x73, 0x20, 0x65,
152 0x6e, 0x20, 0x4d, 0xc3, 0xa9, 0x63, 0x61, 0x6e, 0x69, 0x71, 0x75, 0x65,
153 0x20, 0x43, 0x6c, 0x61, 0x73, 0x73, 0x69, 0x71, 0x75, 0x65, 0x20, 0x65,
154 0x74, 0x20, 0x51, 0x75, 0x61, 0x6e, 0x74, 0x69, 0x71, 0x75, 0x65, 0
// "<display>..."-like fragment mixing ASCII with 3-byte UTF-8 sequences
// (lead bytes 0xe5, 0xe6, 0xe7)
// NOTE(review): the name suggests this data is meant to be invalid UTF-8 (or
// not representable in the target charset) -- confirm what makes it so
157 static const unsigned char utf8Invalid
[] =
159 0x3c, 0x64, 0x69, 0x73, 0x70, 0x6c, 0x61, 0x79, 0x3e, 0x32, 0x30, 0x30,
160 0x32, 0xe5, 0xb9, 0xb4, 0x30, 0x39, 0xe6, 0x9c, 0x88, 0x32, 0x35, 0xe6,
161 0x97, 0xa5, 0x20, 0x30, 0x37, 0xe6, 0x99, 0x82, 0x33, 0x39, 0xe5, 0x88,
162 0x86, 0x35, 0x37, 0xe7, 0xa7, 0x92, 0x3c, 0x2f, 0x64, 0x69, 0x73, 0x70,
// Table of the UTF-8 samples iterated over by UnicodeTestCase::Utf8(): each
// entry is initialized with the sample bytes, the number of elements in the
// sample array (WXSIZEOF), a charset name and the corresponding
// wxFontEncoding value.
166 static const struct Utf8Data
168 const unsigned char *text
;
170 const wxChar
*charset
;
171 wxFontEncoding encoding
;
174 { utf8Invalid
, WXSIZEOF(utf8Invalid
), wxT("iso8859-1"), wxFONTENCODING_ISO8859_1
},
175 { utf8koi8r
, WXSIZEOF(utf8koi8r
), wxT("koi8-r"), wxFONTENCODING_KOI8
},
176 { utf8iso8859_1
, WXSIZEOF(utf8iso8859_1
), wxT("iso8859-1"), wxFONTENCODING_ISO8859_1
},
180 // ----------------------------------------------------------------------------
182 // ----------------------------------------------------------------------------
// CppUnit test case grouping the Unicode and encoding conversion tests; each
// test method is added to the suite with CPPUNIT_TEST below.
184 class UnicodeTestCase
: public CppUnit::TestCase
190 CPPUNIT_TEST_SUITE( UnicodeTestCase
);
191 CPPUNIT_TEST( ToFromAscii
);
192 CPPUNIT_TEST( ConstructorsWithConversion
);
193 CPPUNIT_TEST( ConversionFixed
);
194 CPPUNIT_TEST( ConversionWithNULs
);
195 CPPUNIT_TEST( ConversionUTF7
);
196 CPPUNIT_TEST( ConversionUTF8
);
197 CPPUNIT_TEST( ConversionUTF16
);
198 CPPUNIT_TEST( ConversionUTF32
);
199 CPPUNIT_TEST( IsConvOk
);
201 CPPUNIT_TEST( Iteration
);
203 CPPUNIT_TEST( Utf8
);
204 CPPUNIT_TEST( EncodingConverter
);
205 CPPUNIT_TEST_SUITE_END();
// declarations of the test methods registered above
208 void ConstructorsWithConversion();
209 void ConversionFixed();
210 void ConversionWithNULs();
211 void ConversionUTF7();
212 void ConversionUTF8();
213 void ConversionUTF16();
214 void ConversionUTF32();
220 void EncodingConverter();
222 DECLARE_NO_COPY_CLASS(UnicodeTestCase
)
225 // register in the unnamed registry so that these tests are run by default
226 CPPUNIT_TEST_SUITE_REGISTRATION( UnicodeTestCase
);
228 // also include in its own registry so that these tests can be run alone
229 CPPUNIT_TEST_SUITE_NAMED_REGISTRATION( UnicodeTestCase
, "UnicodeTestCase" );
231 UnicodeTestCase::UnicodeTestCase()
// Checks that an ASCII literal converted with wxString::FromAscii() and back
// with ToAscii() compares equal (strcmp) to the original, both for plain text
// and for text containing escaped characters (quote, tab, backslash, newline).
235 void UnicodeTestCase::ToFromAscii()
238 #define TEST_TO_FROM_ASCII(txt) \
240 static const char *msg = txt; \
241 wxString s = wxString::FromAscii(msg); \
242 CPPUNIT_ASSERT( strcmp( s.ToAscii() , msg ) == 0 ); \
245 TEST_TO_FROM_ASCII( "Hello, world!" );
246 TEST_TO_FROM_ASCII( "additional \" special \t test \\ component \n :-)" );
// Exercises the wxString constructors taking a conversion object (wxConvUTF8),
// with and without an explicit length argument, and then checks that wxString
// can be compared with plain char* literals.
249 void UnicodeTestCase::ConstructorsWithConversion()
251 // the string "Déjà " in UTF-8 and wchar_t:
252 const unsigned char utf8Buf
[] = {0x44,0xC3,0xA9,0x6A,0xC3,0xA0,0};
253 const unsigned char utf8subBuf
[] = {0x44,0xC3,0xA9,0x6A,0}; // just "Déj"
254 const char *utf8
= (char *)utf8Buf
;
255 const char *utf8sub
= (char *)utf8subBuf
;
257 wxString
s1(utf8
, wxConvUTF8
);
260 const wchar_t wchar
[] = {0x44,0xE9,0x6A,0xE0,0};
261 CPPUNIT_ASSERT_EQUAL( wchar
, s1
);
264 CPPUNIT_ASSERT_EQUAL( wchar
, s2
);
265 CPPUNIT_ASSERT_EQUAL( wxString::FromUTF8(utf8
), s2
);
267 CPPUNIT_ASSERT_EQUAL( utf8
, s1
);
// length-limited construction: the first 4 bytes of utf8 are exactly the
// bytes of utf8subBuf (without its terminating 0), so s3 must equal sub
270 wxString
sub(utf8sub
, wxConvUTF8
); // "Dej" substring
271 wxString
s3(utf8
, wxConvUTF8
, 4);
272 CPPUNIT_ASSERT_EQUAL( sub
, s3
);
// same check starting from the wide string, with a length argument of 3
275 wxString
s4(wchar
, wxConvUTF8
, 3);
276 CPPUNIT_ASSERT_EQUAL( sub
, s4
);
278 // conversion should stop with failure at pos 35
279 wxString
s("\t[pl]open.format.Sformatuj dyskietk\xea=gfloppy %f", wxConvUTF8
);
280 CPPUNIT_ASSERT( s
.empty() );
281 #endif // wxUSE_UNICODE
284 // test using Unicode strings together with char* strings (this must work
285 // in ANSI mode as well, of course):
286 wxString
s5("ascii");
287 CPPUNIT_ASSERT_EQUAL( "ascii", s5
);
291 CPPUNIT_ASSERT( strcmp(s5
.mb_str(), "ascii value") == 0 );
292 CPPUNIT_ASSERT_EQUAL( "ascii value", s5
);
293 CPPUNIT_ASSERT( s5
!= "SomethingElse" );
// Checks the lengths reported by wxConvLibc when an explicit input length is
// given: converting an empty string of length 0 must report len == 0, and
// ToWChar() called with a NULL output buffer must return the expected counts
// for the given source lengths.
296 void UnicodeTestCase::ConversionFixed()
301 wxConvLibc
.cWC2MB(L
"", 0, &len
);
302 #else // !wxUSE_UNICODE
303 wxConvLibc
.cMB2WC("", 0, &len
);
304 #endif // wxUSE_UNICODE/!wxUSE_UNICODE
306 CPPUNIT_ASSERT_EQUAL( 0, len
);
309 // check that when we convert a fixed number of characters we obtain the
310 // expected return value
311 CPPUNIT_ASSERT_EQUAL( 0, wxConvLibc
.ToWChar(NULL
, 0, "", 0) );
312 CPPUNIT_ASSERT_EQUAL( 1, wxConvLibc
.ToWChar(NULL
, 0, "x", 1) );
313 CPPUNIT_ASSERT_EQUAL( 2, wxConvLibc
.ToWChar(NULL
, 0, "x", 2) );
314 CPPUNIT_ASSERT_EQUAL( 2, wxConvLibc
.ToWChar(NULL
, 0, "xy", 2) );
315 #endif // wxUSE_UNICODE
// Checks that a string with an embedded NUL ("The\0String") is converted in
// its entirety, i.e. that the data after the NUL is still present in the
// result, using both wxConvLibc and wxConvLocal.
318 void UnicodeTestCase::ConversionWithNULs()
321 static const size_t lenNulString
= 10;
323 wxString
szTheString(L
"The\0String", wxConvLibc
, lenNulString
);
324 wxCharBuffer theBuffer
= szTheString
.mb_str();
// compare lenNulString + 1 elements so that the terminating NUL is checked too
326 CPPUNIT_ASSERT( memcmp(theBuffer
.data(), "The\0String",
327 lenNulString
+ 1) == 0 );
329 wxString
szTheString2("The\0String", wxConvLocal
, lenNulString
);
330 CPPUNIT_ASSERT_EQUAL( lenNulString
, szTheString2
.length() );
331 CPPUNIT_ASSERT( wxTmemcmp(szTheString2
.c_str(), L
"The\0String",
332 lenNulString
+ 1) == 0 );
333 #else // !wxUSE_UNICODE
// non-Unicode build: the embedded NUL is inserted into the string explicitly
// and the wide character conversion result is compared instead
334 wxString
szTheString("TheString");
335 szTheString
.insert(3, 1, '\0');
336 wxWCharBuffer theBuffer
= szTheString
.wc_str(wxConvLibc
);
338 CPPUNIT_ASSERT( memcmp(theBuffer
.data(), L
"The\0String", 11 * sizeof(wchar_t)) == 0 );
340 wxString
szLocalTheString("TheString");
341 szLocalTheString
.insert(3, 1, '\0');
342 wxWCharBuffer theLocalBuffer
= szLocalTheString
.wc_str(wxConvLocal
);
344 CPPUNIT_ASSERT( memcmp(theLocalBuffer
.data(), L
"The\0String", 11 * sizeof(wchar_t)) == 0 );
345 #endif // wxUSE_UNICODE/!wxUSE_UNICODE
// Runs a table of UTF-7 test strings through StringConversionData::Test().
// The table holds valid encodings, special cases involving '+', and invalid
// sequences (wcs == NULL) for which the conversion is expected to fail.
348 void UnicodeTestCase::ConversionUTF7()
350 static const StringConversionData utf7data
[] =
353 StringConversionData("+AKM-", L
"\xa3"),
354 StringConversionData("+AOk-t+AOk-", L
"\xe9t\xe9"),
356 // this one is an alternative valid encoding of the same string
357 StringConversionData("+AOk-t+AOk", L
"\xe9t\xe9",
358 StringConversionData::ONLY_MB2WC
),
360 // some special cases
361 StringConversionData("+-", L
"+"),
362 StringConversionData("+--", L
"+-"),
364 // the following are invalid UTF-7 sequences
365 StringConversionData("\xa3", NULL
),
366 StringConversionData("+", NULL
),
367 StringConversionData("+~", NULL
),
368 StringConversionData("a+", NULL
),
371 for ( size_t n
= 0; n
< WXSIZEOF(utf7data
); n
++ )
373 const StringConversionData
& d
= utf7data
[n
];
375 // converting to/from UTF-7 using iconv() currently doesn't work
376 // because of several problems:
377 // - GetMBNulLen() doesn't return correct result (iconv converts L'\0'
378 // to an incomplete and anyhow nonsensical "+AA" string)
379 // - iconv refuses to convert "+-" (although it converts "+-\n" just
382 // I have no idea how to fix this so just disable the test for now
384 d
.Test(n
, wxCSConv("utf-7"));
386 d
.Test(n
, wxConvUTF7
);
// Runs a small table of UTF-8 test strings through
// StringConversionData::Test(); the "\xc2" entry has wcs == NULL, i.e. its
// conversion is expected to fail.
390 void UnicodeTestCase::ConversionUTF8()
392 static const StringConversionData utf8data
[] =
394 #ifdef wxHAVE_U_ESCAPE
395 StringConversionData("\xc2\xa3", L
"\u00a3"),
397 StringConversionData("\xc2", NULL
),
400 wxCSConv
conv(wxT("utf-8"));
401 for ( size_t n
= 0; n
< WXSIZEOF(utf8data
); n
++ )
403 const StringConversionData
& d
= utf8data
[n
];
405 d
.Test(n
, wxConvUTF8
);
// Tests conversions with a wxCSConv created for wxFONTENCODING_UTF16BE: a
// table of multibyte/wide string pairs, followed by a regression check that
// a 6-byte input containing NUL bytes is reported as 3 characters.
409 void UnicodeTestCase::ConversionUTF16()
411 static const StringConversionData utf16data
[] =
413 #ifdef wxHAVE_U_ESCAPE
414 StringConversionData(
415 "\x04\x1f\x04\x40\x04\x38\x04\x32\x04\x35\x04\x42\0\0",
416 L
"\u041f\u0440\u0438\u0432\u0435\u0442"),
417 StringConversionData(
418 "\x01\0\0b\x01\0\0a\x01\0\0r\0\0",
419 L
"\u0100b\u0100a\u0100r"),
421 StringConversionData("\0f\0o\0o\0\0", L
"foo"),
424 wxCSConv
conv(wxFONTENCODING_UTF16BE
);
425 for ( size_t n
= 0; n
< WXSIZEOF(utf16data
); n
++ )
427 const StringConversionData
& d
= utf16data
[n
];
431 // special case: this string has consecutive NULs inside it which don't
432 // terminate the string, this exposed a bug in our conversion code which
433 // got confused in this case
435 conv
.cMB2WC("\x01\0\0B\0C" /* A macron BC */, 6, &len
);
436 CPPUNIT_ASSERT_EQUAL( 3, len
);
// Tests conversions with a wxCSConv created for wxFONTENCODING_UTF32BE: a
// table of multibyte/wide string pairs, followed by a check that a 12-byte
// input containing NUL bytes is reported as 3 characters.
439 void UnicodeTestCase::ConversionUTF32()
441 static const StringConversionData utf32data
[] =
443 #ifdef wxHAVE_U_ESCAPE
444 StringConversionData(
445 "\0\0\x04\x1f\0\0\x04\x40\0\0\x04\x38\0\0\x04\x32\0\0\x04\x35\0\0\x04\x42\0\0\0\0",
446 L
"\u041f\u0440\u0438\u0432\u0435\u0442"),
448 StringConversionData("\0\0\0f\0\0\0o\0\0\0o\0\0\0\0", L
"foo"),
451 wxCSConv
conv(wxFONTENCODING_UTF32BE
);
452 for ( size_t n
= 0; n
< WXSIZEOF(utf32data
); n
++ )
454 const StringConversionData
& d
= utf32data
[n
];
459 conv
.cMB2WC("\0\0\x01\0\0\0\0B\0\0\0C" /* A macron BC */, 12, &len
);
460 CPPUNIT_ASSERT_EQUAL( 3, len
);
// Checks wxCSConv::IsOk(): it must be true for the system encoding and for
// the known charset names used here, and false for a charset name that does
// not exist.
463 void UnicodeTestCase::IsConvOk()
465 CPPUNIT_ASSERT( wxCSConv(wxFONTENCODING_SYSTEM
).IsOk() );
466 CPPUNIT_ASSERT( wxCSConv("US-ASCII").IsOk() );
467 CPPUNIT_ASSERT( wxCSConv("UTF-8").IsOk() );
468 CPPUNIT_ASSERT( !wxCSConv("NoSuchConversion").IsOk() );
471 CPPUNIT_ASSERT( wxCSConv("WINDOWS-437").IsOk() );
// Checks iteration over a wxString created from UTF-8 text containing
// non-ASCII characters: the characters read through const_iterator must match
// the expected wide characters, and the end iterators saved in end1/end2 must
// still compare equal to text.end() at the points asserted below.
476 void UnicodeTestCase::Iteration()
478 // "czech" in Czech ("cestina"):
479 static const char *textUTF8
= "\304\215e\305\241tina";
480 static const wchar_t textUTF16
[] = {0x10D, 0x65, 0x161, 0x74, 0x69, 0x6E, 0x61, 0};
482 wxString
text(wxString::FromUTF8(textUTF8
));
483 CPPUNIT_ASSERT( wxStrcmp(text
.wc_str(), textUTF16
) == 0 );
485 // verify the string was decoded correctly:
488 for ( wxString::const_iterator i
= text
.begin(); i
!= text
.end(); ++i
, ++idx
)
490 CPPUNIT_ASSERT( *i
== textUTF16
[idx
] );
494 // overwrite the string with something that is shorter in UTF-8:
496 for ( wxString::iterator i
= text
.begin(); i
!= text
.end(); ++i
)
500 // restore the original text now:
502 wxString::iterator end1
= text
.end();
503 wxString::const_iterator end2
= text
.end();
506 for ( wxString::iterator i
= text
.begin(); i
!= text
.end(); ++i
, ++idx
)
510 CPPUNIT_ASSERT( end1
== text
.end() );
511 CPPUNIT_ASSERT( end2
== text
.end() );
514 CPPUNIT_ASSERT( end1
== text
.end() );
515 CPPUNIT_ASSERT( end2
== text
.end() );
518 // and verify it again:
521 for ( wxString::const_iterator i
= text
.begin(); i
!= text
.end(); ++i
, ++idx
)
523 CPPUNIT_ASSERT( *i
== textUTF16
[idx
] );
527 #endif // wxUSE_UNICODE
// For every entry of utf8data, checks that wxConvUTF8.MB2WC() does not report
// an error ((size_t)-1) for the sample text and that a wxString built from
// the converted text is not empty.
529 void UnicodeTestCase::Utf8()
531 // test code extracted from console sample r64320
536 for ( size_t n
= 0; n
< WXSIZEOF(utf8data
); n
++ )
538 const Utf8Data
& u8d
= utf8data
[n
];
539 CPPUNIT_ASSERT( wxConvUTF8
.MB2WC(wbuf
, (const char *)u8d
.text
, WXSIZEOF(wbuf
)) != (size_t)-1 );
541 #if 0 // FIXME: this conversion seems not to work...
542 wxCSConv
conv(u8d
.charset
);
543 CPPUNIT_ASSERT( conv
.WC2MB(buf
, wbuf
, WXSIZEOF(buf
)) != (size_t)-1 );
545 wxString
s(wxConvUTF8
.cMB2WC((const char *)u8d
.text
));
546 CPPUNIT_ASSERT( !s
.empty() );
// Converts the utf8koi8r sample to wide characters with wxConvUTF8, then runs
// the wide text through a wxEncodingConverter initialized for
// wxFONTENCODING_UNICODE -> wxFONTENCODING_KOI8 and finally asserts that s1
// and s2 are equal.
// NOTE(review): presumably s2 is built from the KOI8 output in buf -- confirm
550 void UnicodeTestCase::EncodingConverter()
552 // test code extracted from console sample r64320
558 CPPUNIT_ASSERT( wxConvUTF8
.MB2WC(wbuf
, (const char *)utf8koi8r
, WXSIZEOF(utf8koi8r
)) != (size_t)-1 );
560 wxString
s1(wxConvUTF8
.cMB2WC((const char *)utf8koi8r
));
561 CPPUNIT_ASSERT( !s1
.empty() );
563 wxEncodingConverter ec
;
564 ec
.Init(wxFONTENCODING_UNICODE
, wxFONTENCODING_KOI8
);
565 ec
.Convert(wbuf
, buf
);
568 CPPUNIT_ASSERT_EQUAL( s1
, s2
);