1 /////////////////////////////////////////////////////////////////////////////// 
   2 // Name:        tests/strings/unicode.cpp 
   3 // Purpose:     Unicode unit test 
   4 // Author:      Vadim Zeitlin, Wlodzimierz ABX Skiba 
   7 // Copyright:   (c) 2004 Vadim Zeitlin, Wlodzimierz Skiba 
   8 /////////////////////////////////////////////////////////////////////////////// 
  10 // ---------------------------------------------------------------------------- 
  12 // ---------------------------------------------------------------------------- 
  24 // helper class holding the matching MB and WC strings 
  26 // either str or wcs (but not both) may be NULL, this means that the conversion 
     // of the other string is supposed to fail (cf. the "succeeded" assertion
     // messages in Test())
  28 struct StringConversionData
 
       // stores the two string pointers and the flags; nothing is copied
  30     StringConversionData(const char *str_
, const wchar_t *wcs_
, int flags_ 
= 0) 
  31         : str(str_
), wcs(wcs_
), flags(flags_
) 
  35     const char * const str
; 
  36     const wchar_t * const wcs
; 
  40         TEST_BOTH  
= 0, // test both str -> wcs and wcs -> str 
  41         ONLY_MB2WC 
= 1  // only test str -> wcs conversion 
  46     // test that the conversion between str and wcs (subject to flags) succeeds 
  48     // the first argument is the index in the test array and is used solely for 
         // building the assertion messages passed to CPPUNIT_ASSERT_MESSAGE
  50     void Test(size_t n
, wxMBConv
& conv
) const 
             // str -> wcs direction: convert str with conv and compare with wcs
  54             wxWCharBuffer wbuf 
= conv
.cMB2WC(str
); 
  58                 CPPUNIT_ASSERT_MESSAGE
 
  60                     Message(n
, "MB2WC failed"), 
  64                 CPPUNIT_ASSERT_MESSAGE
 
  66                     Message(n
, "MB2WC", wbuf
, wcs
), 
  67                     wxStrcmp(wbuf
, wcs
) == 0 
  70             else // conversion is supposed to fail 
  72                 CPPUNIT_ASSERT_MESSAGE
 
  74                     Message(n
, "MB2WC succeeded"), 
         // wcs -> str direction: skipped when there is no wide string or when
         // the ONLY_MB2WC flag is set
  80         if ( wcs 
&& !(flags 
& ONLY_MB2WC
) ) 
  82             wxCharBuffer buf 
= conv
.cWC2MB(wcs
); 
  86                 CPPUNIT_ASSERT_MESSAGE
 
  88                     Message(n
, "WC2MB failed"), 
  92                 CPPUNIT_ASSERT_MESSAGE
 
  94                     Message(n
, "WC2MB", buf
, str
), 
 100                 CPPUNIT_ASSERT_MESSAGE
 
 102                     Message(n
, "WC2MB succeeded"), 
     // builds an assertion message of the form "#<n>: <msg>"
 111     Message(size_t n
, const wxString
& msg
) 
 113         return std::string(wxString::Format("#%lu: %s", (unsigned long)n
, msg
)); 
     // overload used when the conversion produced a result: the message names
     // the function and shows the returned string next to the expected one
 116     template <typename T
> 
 120             const wxCharTypeBuffer
<T
>& actual
, 
 124                        wxString::Format("%s returned \"%s\", expected \"%s\"", 
 125                                         func
, actual
.data(), expected
)); 
 129 // ---------------------------------------------------------------------------- 
 131 // ---------------------------------------------------------------------------- 
 // CppUnit test case grouping the Unicode conversion tests listed in the suite below
 133 class UnicodeTestCase 
: public CppUnit::TestCase
 
     // the suite definition: one CPPUNIT_TEST entry per test method
 139     CPPUNIT_TEST_SUITE( UnicodeTestCase 
); 
 140         CPPUNIT_TEST( ToFromAscii 
); 
 141         CPPUNIT_TEST( ConstructorsWithConversion 
); 
 142         CPPUNIT_TEST( ConversionEmpty 
); 
 143         CPPUNIT_TEST( ConversionWithNULs 
); 
 144         CPPUNIT_TEST( ConversionUTF7 
); 
 145         CPPUNIT_TEST( ConversionUTF8 
); 
 146         CPPUNIT_TEST( ConversionUTF16 
); 
 147         CPPUNIT_TEST( ConversionUTF32 
); 
 148         CPPUNIT_TEST( IsConvOk 
); 
 150         CPPUNIT_TEST( Iteration 
); 
 152     CPPUNIT_TEST_SUITE_END(); 
     // declarations of test methods registered in the suite above
 155     void ConstructorsWithConversion(); 
 156     void ConversionEmpty(); 
 157     void ConversionWithNULs(); 
 158     void ConversionUTF7(); 
 159     void ConversionUTF8(); 
 160     void ConversionUTF16(); 
 161     void ConversionUTF32(); 
     // as the macro name says, the test case is not meant to be copied
 167     DECLARE_NO_COPY_CLASS(UnicodeTestCase
) 
 170 // register in the unnamed registry so that these tests are run by default 
 171 CPPUNIT_TEST_SUITE_REGISTRATION( UnicodeTestCase 
); 
 173 // also include in its own registry so that these tests can be run alone 
 174 CPPUNIT_TEST_SUITE_NAMED_REGISTRATION( UnicodeTestCase
, "UnicodeTestCase" ); 
 // default constructor of the test case; it takes no arguments
 176 UnicodeTestCase::UnicodeTestCase() 
 // Checks that ASCII text survives a wxString::FromAscii() / ToAscii() round trip.
 180 void UnicodeTestCase::ToFromAscii() 
     // builds a wxString from txt with FromAscii() and asserts that ToAscii()
     // gives back exactly the original characters
 183 #define TEST_TO_FROM_ASCII(txt)                              \ 
 185         static const char *msg = txt;                        \ 
 186         wxString s = wxString::FromAscii(msg);               \ 
 187         CPPUNIT_ASSERT( strcmp( s.ToAscii() , msg ) == 0 );  \ 
     // plain text first, then text with a quote, a tab, a backslash and a newline
 190     TEST_TO_FROM_ASCII( "Hello, world!" ); 
 191     TEST_TO_FROM_ASCII( "additional \" special \t test \\ component \n :-)" ); 
 // Exercises the wxString constructors taking a conversion object, using the
 // UTF-8 and wchar_t spellings of the same text, with and without a length.
 194 void UnicodeTestCase::ConstructorsWithConversion() 
 196     // the string "Déjà" in UTF-8 and wchar_t: 
 197     const unsigned char utf8Buf
[] = {0x44,0xC3,0xA9,0x6A,0xC3,0xA0,0}; 
 198     const unsigned char utf8subBuf
[] = {0x44,0xC3,0xA9,0x6A,0}; // just "Déj" 
 199     const char *utf8 
= (char *)utf8Buf
; 
 200     const char *utf8sub 
= (char *)utf8subBuf
; 
 202     wxString 
s1(utf8
, wxConvUTF8
); 
 205     const wchar_t wchar
[] = {0x44,0xE9,0x6A,0xE0,0}; 
 206     CPPUNIT_ASSERT_EQUAL( wchar
, s1 
); 
 209     CPPUNIT_ASSERT_EQUAL( wchar
, s2 
); 
 210     CPPUNIT_ASSERT_EQUAL( utf8
, s2 
); 
 212     CPPUNIT_ASSERT_EQUAL( utf8
, s1 
); 
 215     wxString 
sub(utf8sub
, wxConvUTF8
); // "Déj" substring 
     // the first 4 bytes of utf8 are exactly the bytes of utf8subBuf
 216     wxString 
s3(utf8
, wxConvUTF8
, 4); 
 217     CPPUNIT_ASSERT_EQUAL( sub
, s3 
); 
     // likewise, the first 3 wide characters of wchar spell "Déj"
 220     wxString 
s4(wchar
, wxConvUTF8
, 3); 
 221     CPPUNIT_ASSERT_EQUAL( sub
, s4 
); 
 223     // conversion should stop with failure at pos 35 
     // (index 35 of the literal is its only non-ASCII character; presumably it
     // is not valid UTF-8 in the original file -- TODO confirm the file encoding)
 224     wxString 
s("\t[pl]open.format.Sformatuj dyskietkê=gfloppy %f", wxConvUTF8
); 
 225     CPPUNIT_ASSERT( s
.empty() ); 
 226 #endif // wxUSE_UNICODE 
 229     // test using Unicode strings together with char* strings (this must work 
 230     // in ANSI mode as well, of course): 
 231     wxString 
s5("ascii"); 
 232     CPPUNIT_ASSERT_EQUAL( "ascii", s5 
); 
 236     CPPUNIT_ASSERT( strcmp(s5
.mb_str(), "ascii value") == 0 ); 
 237     CPPUNIT_ASSERT_EQUAL( "ascii value", s5 
); 
 238     CPPUNIT_ASSERT( s5 
!= "SomethingElse" ); 
 // Converting an empty string with wxConvLibc must report a length of 0.
 241 void UnicodeTestCase::ConversionEmpty() 
     // the direction tested depends on the build: wide -> multibyte in the
     // Unicode branch, multibyte -> wide otherwise
 246     wxConvLibc
.cWC2MB(L
"", 0, &len
); 
 247 #else // !wxUSE_UNICODE 
 248     wxConvLibc
.cMB2WC("", 0, &len
); 
 249 #endif // wxUSE_UNICODE/!wxUSE_UNICODE 
 251     CPPUNIT_ASSERT_EQUAL( 0, len 
); 
 // Checks that a string with an embedded NUL is converted in full instead of
 // being cut off at the NUL.
 254 void UnicodeTestCase::ConversionWithNULs() 
     // "The\0String" is 10 characters long, counting the embedded NUL
 257     static const size_t lenNulString 
= 10; 
 259     wxString 
szTheString(L
"The\0String", wxConvLibc
, lenNulString
); 
 260     wxCharBuffer theBuffer 
= szTheString
.mb_str(); 
     // compare lenNulString + 1 elements so the terminating NUL is checked too
 262     CPPUNIT_ASSERT( memcmp(theBuffer
.data(), "The\0String", 
 263                     lenNulString 
+ 1) == 0 ); 
 265     wxString 
szTheString2("The\0String", wxConvLocal
, lenNulString
); 
 266     CPPUNIT_ASSERT_EQUAL( lenNulString
, szTheString2
.length() ); 
 267     CPPUNIT_ASSERT( wxTmemcmp(szTheString2
.c_str(), L
"The\0String", 
 268                     lenNulString 
+ 1) == 0 ); 
 269 #else // !wxUSE_UNICODE 
     // build the same 10-character string by inserting a NUL after "The"
 270     wxString 
szTheString("TheString"); 
 271     szTheString
.insert(3, 1, '\0'); 
 272     wxWCharBuffer theBuffer 
= szTheString
.wc_str(wxConvLibc
); 
     // 11 = the 10 characters of the string plus the terminating NUL
 274     CPPUNIT_ASSERT( memcmp(theBuffer
.data(), L
"The\0String", 11 * sizeof(wchar_t)) == 0 ); 
 276     wxString 
szLocalTheString("TheString"); 
 277     szLocalTheString
.insert(3, 1, '\0'); 
 278     wxWCharBuffer theLocalBuffer 
= szLocalTheString
.wc_str(wxConvLocal
); 
 280     CPPUNIT_ASSERT( memcmp(theLocalBuffer
.data(), L
"The\0String", 11 * sizeof(wchar_t)) == 0 ); 
 281 #endif // wxUSE_UNICODE/!wxUSE_UNICODE 
 // Runs a table of UTF-7 test vectors (valid encodings, special cases and
 // invalid sequences) through StringConversionData::Test().
 284 void UnicodeTestCase::ConversionUTF7() 
 286     static const StringConversionData utf7data
[] = 
 289         StringConversionData("+AKM-", L
"\xa3"), 
 290         StringConversionData("+AOk-t+AOk-", L
"\xe9t\xe9"), 
 292         // this one is an alternative valid encoding of the same string 
         // (hence only the str -> wcs direction is tested for it)
 293         StringConversionData("+AOk-t+AOk", L
"\xe9t\xe9", 
 294                              StringConversionData::ONLY_MB2WC
), 
 296         // some special cases 
 297         StringConversionData("+-", L
"+"), 
 298         StringConversionData("+--", L
"+-"), 
 300         // the following are invalid UTF-7 sequences 
         // (a NULL wide string means the conversion is supposed to fail)
 301         StringConversionData("\xa3", NULL
), 
 302         StringConversionData("+", NULL
), 
 303         StringConversionData("+~", NULL
), 
 304         StringConversionData("a+", NULL
), 
 307     for ( size_t n 
= 0; n 
< WXSIZEOF(utf7data
); n
++ ) 
 309         const StringConversionData
& d 
= utf7data
[n
]; 
 311         // converting to/from UTF-7 using iconv() currently doesn't work 
 312         // because of several problems: 
 313         //  - GetMBNulLen() doesn't return correct result (iconv converts L'\0' 
 314         //    to an incomplete and anyhow nonsensical "+AA" string) 
 315         //  - iconv refuses to convert "+-" (although it converts "+-\n" just 
 318         // I have no idea how to fix this so just disable the test for now 
         // NOTE(review): the directive that disables the next call is not
         // visible in this listing -- confirm against the full file
 320         d
.Test(n
, wxCSConv("utf-7")); 
 322         d
.Test(n
, wxConvUTF7
); 
 // Checks UTF-8 conversion of a valid two-byte sequence and of a truncated one.
 326 void UnicodeTestCase::ConversionUTF8() 
 328     static const StringConversionData utf8data
[] = 
 330 #ifdef wxHAVE_U_ESCAPE 
 331         StringConversionData("\xc2\xa3", L
"\u00a3"), 
         // a lone lead byte: the NULL wide string marks it as expected to fail
 333         StringConversionData("\xc2", NULL
), 
 336     wxCSConv 
conv(_T("utf-8")); 
 337     for ( size_t n 
= 0; n 
< WXSIZEOF(utf8data
); n
++ ) 
 339         const StringConversionData
& d 
= utf8data
[n
]; 
 341         d
.Test(n
, wxConvUTF8
); 
 // Checks big-endian UTF-16 conversion with a wxCSConv built for
 // wxFONTENCODING_UTF16BE.
 345 void UnicodeTestCase::ConversionUTF16() 
     // the multibyte strings are UTF-16BE byte sequences ending in two NUL bytes
 347     static const StringConversionData utf16data
[] = 
 349 #ifdef wxHAVE_U_ESCAPE 
 350         StringConversionData( 
 351             "\x04\x1f\x04\x40\x04\x38\x04\x32\x04\x35\x04\x42\0\0", 
 352             L
"\u041f\u0440\u0438\u0432\u0435\u0442"), 
 353         StringConversionData( 
 354             "\x01\0\0b\x01\0\0a\x01\0\0r\0\0", 
 355             L
"\u0100b\u0100a\u0100r"), 
 357         StringConversionData("\0f\0o\0o\0\0", L
"foo"), 
 360     wxCSConv 
conv(wxFONTENCODING_UTF16BE
); 
 361     for ( size_t n 
= 0; n 
< WXSIZEOF(utf16data
); n
++ ) 
 363         const StringConversionData
& d 
= utf16data
[n
]; 
 367     // special case: this string has consecutive NULs inside it which don't 
 368     // terminate the string, this exposed a bug in our conversion code which 
 369     // got confused in this case 
 371     conv
.cMB2WC("\x01\0\0B\0C" /* A macron BC */, 6, &len
); 
     // the 6 input bytes must yield 3 wide characters
 372     CPPUNIT_ASSERT_EQUAL( 3, len 
); 
 // Checks big-endian UTF-32 conversion with a wxCSConv built for
 // wxFONTENCODING_UTF32BE.
 375 void UnicodeTestCase::ConversionUTF32() 
     // the multibyte strings are UTF-32BE byte sequences ending in four NUL bytes
 377     static const StringConversionData utf32data
[] = 
 379 #ifdef wxHAVE_U_ESCAPE 
 380         StringConversionData( 
 381             "\0\0\x04\x1f\0\0\x04\x40\0\0\x04\x38\0\0\x04\x32\0\0\x04\x35\0\0\x04\x42\0\0\0\0", 
 382           L
"\u041f\u0440\u0438\u0432\u0435\u0442"), 
 384         StringConversionData("\0\0\0f\0\0\0o\0\0\0o\0\0\0\0", L
"foo"), 
 387     wxCSConv 
conv(wxFONTENCODING_UTF32BE
); 
 388     for ( size_t n 
= 0; n 
< WXSIZEOF(utf32data
); n
++ ) 
 390         const StringConversionData
& d 
= utf32data
[n
]; 
     // explicit-length conversion of input containing NUL bytes
 395     conv
.cMB2WC("\0\0\x01\0\0\0\0B\0\0\0C" /* A macron BC */, 12, &len
); 
     // the 12 input bytes must yield 3 wide characters
 396     CPPUNIT_ASSERT_EQUAL( 3, len 
); 
 // Checks that wxCSConv::IsOk() is true for known encodings and false for an
 // unknown encoding name.
 399 void UnicodeTestCase::IsConvOk() 
 401     CPPUNIT_ASSERT( wxCSConv(wxFONTENCODING_SYSTEM
).IsOk() ); 
 402     CPPUNIT_ASSERT( wxCSConv("US-ASCII").IsOk() ); 
 403     CPPUNIT_ASSERT( wxCSConv("UTF-8").IsOk() ); 
     // a name that matches no encoding must give a converter that is not ok
 404     CPPUNIT_ASSERT( !wxCSConv("NoSuchConversion").IsOk() ); 
 407     CPPUNIT_ASSERT( wxCSConv("WINDOWS-437").IsOk() ); 
 // Checks iteration over a wxString holding non-ASCII text with const and
 // non-const iterators, and that previously obtained end() iterators still
 // compare equal to text.end().
 412 void UnicodeTestCase::Iteration() 
 414     // "czech" in Czech ("cestina"): 
 415     static const char *textUTF8 
= "\304\215e\305\241tina"; 
 416     static const wchar_t textUTF16
[] = {0x10D, 0x65, 0x161, 0x74, 0x69, 0x6E, 0x61, 0}; 
 418     wxString 
text(wxString::FromUTF8(textUTF8
)); 
 419     CPPUNIT_ASSERT( wxStrcmp(text
.wc_str(), textUTF16
) == 0 ); 
 421     // verify the string was decoded correctly: 
 424         for ( wxString::const_iterator i 
= text
.begin(); i 
!= text
.end(); ++i
, ++idx 
) 
 426             CPPUNIT_ASSERT( *i 
== textUTF16
[idx
] ); 
 430     // overwrite the string with something that is shorter in UTF-8: 
 432         for ( wxString::iterator i 
= text
.begin(); i 
!= text
.end(); ++i 
) 
 436     // restore the original text now: 
         // these end iterators are compared with text.end() inside the loop
         // below and again after it
 438         wxString::iterator end1 
= text
.end(); 
 439         wxString::const_iterator end2 
= text
.end(); 
 442         for ( wxString::iterator i 
= text
.begin(); i 
!= text
.end(); ++i
, ++idx 
) 
 446             CPPUNIT_ASSERT( end1 
== text
.end() ); 
 447             CPPUNIT_ASSERT( end2 
== text
.end() ); 
 450         CPPUNIT_ASSERT( end1 
== text
.end() ); 
 451         CPPUNIT_ASSERT( end2 
== text
.end() ); 
 454     // and verify it again: 
 457         for ( wxString::const_iterator i 
= text
.begin(); i 
!= text
.end(); ++i
, ++idx 
) 
 459             CPPUNIT_ASSERT( *i 
== textUTF16
[idx
] ); 
 463 #endif // wxUSE_UNICODE