tests/strings/unicode.cpp

   1 ///////////////////////////////////////////////////////////////////////////////
   2 // Name:        tests/strings/unicode.cpp
   3 // Purpose:     Unicode unit test
   4 // Author:      Vadim Zeitlin, Wlodzimierz ABX Skiba
   5 // Created:     2004-04-28
   6 // RCS-ID:      $Id$
   7 // Copyright:   (c) 2004 Vadim Zeitlin, Wlodzimierz Skiba
   8 ///////////////////////////////////////////////////////////////////////////////
   9
  10 // ----------------------------------------------------------------------------
  11 // headers
  12 // ----------------------------------------------------------------------------
  13
  14 #include "testprec.h"
  15
  16 #ifdef __BORLANDC__
  17     #pragma hdrstop
  18 #endif
  19
  20 #ifndef WX_PRECOMP
  21     #include "wx/wx.h"
  22 #endif // WX_PRECOMP
  23
  24 // ----------------------------------------------------------------------------
  25 // test class
  26 // ----------------------------------------------------------------------------
  27
  28 class UnicodeTestCase : public CppUnit::TestCase
  29 {
  30 public:
  31     UnicodeTestCase();
  32
  33 private:
  34     CPPUNIT_TEST_SUITE( UnicodeTestCase );
  35         CPPUNIT_TEST( ToFromAscii );
  36         CPPUNIT_TEST( ConstructorsWithConversion );
  37         CPPUNIT_TEST( ConversionEmpty );
  38         CPPUNIT_TEST( ConversionWithNULs );
  39         CPPUNIT_TEST( ConversionUTF7 );
  40         CPPUNIT_TEST( ConversionUTF8 );
  41         CPPUNIT_TEST( ConversionUTF16 );
  42         CPPUNIT_TEST( ConversionUTF32 );
  43         CPPUNIT_TEST( IsConvOk );
  44 #if wxUSE_UNICODE
  45         CPPUNIT_TEST( Iteration );
  46 #endif
  47     CPPUNIT_TEST_SUITE_END();
  48
  49     void ToFromAscii();
  50     void ConstructorsWithConversion();
  51     void ConversionEmpty();
  52     void ConversionWithNULs();
  53     void ConversionUTF7();
  54     void ConversionUTF8();
  55     void ConversionUTF16();
  56     void ConversionUTF32();
  57     void IsConvOk();
  58 #if wxUSE_UNICODE
  59     void Iteration();
  60 #endif
  61
  62     // test if converting s using the given encoding gives ws and vice versa
  63     //
  64     // if either of the first 2 arguments is NULL, the conversion is supposed
  65     // to fail
  66     void DoTestConversion(const char *s, const wchar_t *w, wxMBConv& conv);
  67
  68
  69     DECLARE_NO_COPY_CLASS(UnicodeTestCase)
  70 };
  71
  72 // register in the unnamed registry so that these tests are run by default
  73 CPPUNIT_TEST_SUITE_REGISTRATION( UnicodeTestCase );
  74
  75 // also include in it's own registry so that these tests can be run alone
  76 CPPUNIT_TEST_SUITE_NAMED_REGISTRATION( UnicodeTestCase, "UnicodeTestCase" );
  77
  78 UnicodeTestCase::UnicodeTestCase()
  79 {
  80 }
  81
  82 void UnicodeTestCase::ToFromAscii()
  83 {
  84
  85 #define TEST_TO_FROM_ASCII(txt)                              \
  86     {                                                        \
  87         static const char *msg = txt;                        \
  88         wxString s = wxString::FromAscii(msg);               \
  89         CPPUNIT_ASSERT( strcmp( s.ToAscii() , msg ) == 0 );  \
  90     }
  91
  92     TEST_TO_FROM_ASCII( "Hello, world!" );
  93     TEST_TO_FROM_ASCII( "additional \" special \t test \\ component \n :-)" );
  94 }
  95
  96 void UnicodeTestCase::ConstructorsWithConversion()
  97 {
  98     // the string "Déjà" in UTF-8 and wchar_t:
  99     const unsigned char utf8Buf[] = {0x44,0xC3,0xA9,0x6A,0xC3,0xA0,0};
 100     const unsigned char utf8subBuf[] = {0x44,0xC3,0xA9,0x6A,0}; // just "Déj"
 101     const char *utf8 = (char *)utf8Buf;
 102     const char *utf8sub = (char *)utf8subBuf;
 103
 104     wxString s1(utf8, wxConvUTF8);
 105
 106 #if wxUSE_UNICODE
 107     const wchar_t wchar[] = {0x44,0xE9,0x6A,0xE0,0};
 108     WX_ASSERT_STR_EQUAL( wchar, s1 );
 109
 110     wxString s2(wchar);
 111     WX_ASSERT_STR_EQUAL( wchar, s2 );
 112     WX_ASSERT_STR_EQUAL( utf8, s2 );
 113 #else
 114     WX_ASSERT_STR_EQUAL( utf8, s1 );
 115 #endif
 116
 117     wxString sub(utf8sub, wxConvUTF8); // "Dej" substring
 118     wxString s3(utf8, wxConvUTF8, 4);
 119     CPPUNIT_ASSERT_EQUAL( sub, s3 );
 120
 121 #if wxUSE_UNICODE
 122     wxString s4(wchar, wxConvUTF8, 3);
 123     CPPUNIT_ASSERT_EQUAL( sub, s4 );
 124
 125     // conversion should stop with failure at pos 35
 126     wxString s("\t[pl]open.format.Sformatuj dyskietkê=gfloppy %f", wxConvUTF8);
 127     CPPUNIT_ASSERT( s.empty() );
 128 #endif // wxUSE_UNICODE
 129
 130
 131     // test using Unicode strings together with char* strings (this must work
 132     // in ANSI mode as well, of course):
 133     wxString s5("ascii");
 134     WX_ASSERT_STR_EQUAL( "ascii", s5 );
 135
 136     s5 += " value";
 137
 138     CPPUNIT_ASSERT( strcmp(s5.mb_str(), "ascii value") == 0 );
 139     WX_ASSERT_STR_EQUAL( "ascii value", s5 );
 140     CPPUNIT_ASSERT( s5 != "SomethingElse" );
 141 }
 142
 143 void UnicodeTestCase::ConversionEmpty()
 144 {
 145     size_t len;
 146
 147 #if wxUSE_UNICODE
 148     wxCharBuffer buf = wxConvLibc.cWC2MB(L"", 0, &len);
 149 #else // !wxUSE_UNICODE
 150     wxWCharBuffer wbuf = wxConvLibc.cMB2WC("", 0, &len);
 151 #endif // wxUSE_UNICODE/!wxUSE_UNICODE
 152
 153     CPPUNIT_ASSERT(len == 0);
 154 }
 155
 156 void UnicodeTestCase::ConversionWithNULs()
 157 {
 158 #if wxUSE_UNICODE
 159     static const size_t lenNulString = 10;
 160
 161     wxString szTheString(L"The\0String", wxConvLibc, lenNulString);
 162     wxCharBuffer theBuffer = szTheString.mb_str();
 163
 164     CPPUNIT_ASSERT( memcmp(theBuffer.data(), "The\0String",
 165                     lenNulString + 1) == 0 );
 166
 167     wxString szTheString2("The\0String", wxConvLocal, lenNulString);
 168     CPPUNIT_ASSERT_EQUAL( lenNulString, szTheString2.length() );
 169     CPPUNIT_ASSERT( wxTmemcmp(szTheString2.c_str(), L"The\0String",
 170                     lenNulString + 1) == 0 );
 171 #else // !wxUSE_UNICODE
 172     wxString szTheString("TheString");
 173     szTheString.insert(3, 1, '\0');
 174     wxWCharBuffer theBuffer = szTheString.wc_str(wxConvLibc);
 175
 176     CPPUNIT_ASSERT( memcmp(theBuffer.data(), L"The\0String", 11 * sizeof(wchar_t)) == 0 );
 177
 178     wxString szLocalTheString("TheString");
 179     szLocalTheString.insert(3, 1, '\0');
 180     wxWCharBuffer theLocalBuffer = szLocalTheString.wc_str(wxConvLocal);
 181
 182     CPPUNIT_ASSERT( memcmp(theLocalBuffer.data(), L"The\0String", 11 * sizeof(wchar_t)) == 0 );
 183 #endif // wxUSE_UNICODE/!wxUSE_UNICODE
 184 }
 185
 186 void
 187 UnicodeTestCase::DoTestConversion(const char *s,
 188                                   const wchar_t *ws,
 189                                   wxMBConv& conv)
 190 {
 191     if ( ws )
 192     {
 193         wxCharBuffer buf = conv.cWC2MB(ws, (size_t)-1, NULL);
 194
 195         CPPUNIT_ASSERT( strcmp(buf, s) == 0 );
 196     }
 197
 198     if ( s )
 199     {
 200         wxWCharBuffer wbuf = conv.cMB2WC(s, (size_t)-1, NULL);
 201
 202         if ( ws )
 203         {
 204             CPPUNIT_ASSERT( wbuf.data() );
 205             CPPUNIT_ASSERT( wxStrcmp(wbuf, ws) == 0 );
 206         }
 207         else // conversion is supposed to fail
 208         {
 209             CPPUNIT_ASSERT_EQUAL( (wchar_t *)NULL, wbuf.data() );
 210         }
 211     }
 212 }
 213
 214 struct StringConversionData
 215 {
 216     const char *str;
 217     const wchar_t *wcs;
 218 };
 219
 220 void UnicodeTestCase::ConversionUTF7()
 221 {
 222     static const StringConversionData utf7data[] =
 223     {
 224         // normal fragments
 225         { "+AKM-", L"\xa3" },
 226         { "+AOk-t+AOk-", L"\xe9t\xe9" },
 227
 228         // some special cases
 229         { "+-", L"+" },
 230         { "+--", L"+-" },
 231
 232         // the following are invalid UTF-7 sequences
 233         { "\xa3", NULL },
 234         { "+", NULL },
 235         { "+~", NULL },
 236         { "a+", NULL },
 237     };
 238
 239     for ( size_t n = 0; n < WXSIZEOF(utf7data); n++ )
 240     {
 241         const StringConversionData& d = utf7data[n];
 242
 243         // converting to/from UTF-7 using iconv() currently doesn't work
 244         // because of several problems:
 245         //  - GetMBNulLen() doesn't return correct result (iconv converts L'\0'
 246         //    to an incomplete and anyhow nonsensical "+AA" string)
 247         //  - iconv refuses to convert "+-" (although it converts "+-\n" just
 248         //    fine, go figure)
 249         //
 250         // I have no idea how to fix this so just disable the test for now
 251 #if 0
 252         DoTestConversion(d.str, d.wcs, wxCSConv("utf-7"));
 253 #endif
 254         DoTestConversion(d.str, d.wcs, wxConvUTF7);
 255     }
 256 }
 257
 258 void UnicodeTestCase::ConversionUTF8()
 259 {
 260     static const StringConversionData utf8data[] =
 261     {
 262 #ifdef wxHAVE_U_ESCAPE
 263         { "\xc2\xa3", L"\u00a3" },
 264 #endif
 265         { "\xc2", NULL },
 266     };
 267
 268     wxCSConv conv(_T("utf-8"));
 269     for ( size_t n = 0; n < WXSIZEOF(utf8data); n++ )
 270     {
 271         const StringConversionData& d = utf8data[n];
 272         DoTestConversion(d.str, d.wcs, conv);
 273         DoTestConversion(d.str, d.wcs, wxConvUTF8);
 274     }
 275 }
 276
 277 void UnicodeTestCase::ConversionUTF16()
 278 {
 279     static const StringConversionData utf16data[] =
 280     {
 281 #ifdef wxHAVE_U_ESCAPE
 282         { "\x04\x1f\x04\x40\x04\x38\x04\x32\x04\x35\x04\x42\0\0",
 283           L"\u041f\u0440\u0438\u0432\u0435\u0442" },
 284         { "\x01\0\0b\x01\0\0a\x01\0\0r\0\0", L"\u0100b\u0100a\u0100r" },
 285 #endif
 286         { "\0f\0o\0o\0\0", L"foo" },
 287     };
 288
 289     wxCSConv conv(wxFONTENCODING_UTF16BE);
 290     for ( size_t n = 0; n < WXSIZEOF(utf16data); n++ )
 291     {
 292         const StringConversionData& d = utf16data[n];
 293         DoTestConversion(d.str, d.wcs, conv);
 294     }
 295
 296     // special case: this string has consecutive NULs inside it which don't
 297     // terminate the string, this exposed a bug in our conversion code which
 298     // got confused in this case
 299     size_t len;
 300     wxWCharBuffer wbuf(conv.cMB2WC("\x01\0\0B\0C" /* A macron BC */, 6, &len));
 301     CPPUNIT_ASSERT_EQUAL( (size_t)3, len );
 302 }
 303
 304 void UnicodeTestCase::ConversionUTF32()
 305 {
 306     static const StringConversionData utf32data[] =
 307     {
 308 #ifdef wxHAVE_U_ESCAPE
 309         {
 310             "\0\0\x04\x1f\0\0\x04\x40\0\0\x04\x38\0\0\x04\x32\0\0\x04\x35\0\0\x04\x42\0\0\0\0",
 311           L"\u041f\u0440\u0438\u0432\u0435\u0442" },
 312 #endif
 313         { "\0\0\0f\0\0\0o\0\0\0o\0\0\0\0", L"foo" },
 314     };
 315
 316     wxCSConv conv(wxFONTENCODING_UTF32BE);
 317     for ( size_t n = 0; n < WXSIZEOF(utf32data); n++ )
 318     {
 319         const StringConversionData& d = utf32data[n];
 320         DoTestConversion(d.str, d.wcs, conv);
 321     }
 322
 323     size_t len;
 324     wxWCharBuffer wbuf(conv.cMB2WC("\0\0\x01\0\0\0\0B\0\0\0C" /* A macron BC */,
 325                                    12, &len));
 326     CPPUNIT_ASSERT_EQUAL( (size_t)3, len );
 327 }
 328
 329 void UnicodeTestCase::IsConvOk()
 330 {
 331     CPPUNIT_ASSERT( wxCSConv(wxFONTENCODING_SYSTEM).IsOk() );
 332     CPPUNIT_ASSERT( wxCSConv(_T("UTF-8")).IsOk() );
 333     CPPUNIT_ASSERT( !wxCSConv(_T("NoSuchConversion")).IsOk() );
 334
 335 #ifdef __WINDOWS__
 336     CPPUNIT_ASSERT( wxCSConv(_T("WINDOWS-437")).IsOk() );
 337 #endif
 338 }
 339
 340 #if wxUSE_UNICODE
 341 void UnicodeTestCase::Iteration()
 342 {
 343     // "czech" in Czech ("cestina"):
 344     static const char *textUTF8 = "\304\215e\305\241tina";
 345     static const wchar_t textUTF16[] = {0x10D, 0x65, 0x161, 0x74, 0x69, 0x6E, 0x61, 0};
 346
 347     wxString text(wxString::FromUTF8(textUTF8));
 348     CPPUNIT_ASSERT( wxStrcmp(text.wc_str(), textUTF16) == 0 );
 349
 350     // verify the string was decoded correctly:
 351     {
 352         size_t idx = 0;
 353         for ( wxString::const_iterator i = text.begin(); i != text.end(); ++i, ++idx )
 354         {
 355             CPPUNIT_ASSERT( *i == textUTF16[idx] );
 356         }
 357     }
 358
 359     // overwrite the string with something that is shorter in UTF-8:
 360     {
 361         for ( wxString::iterator i = text.begin(); i != text.end(); ++i )
 362             *i = 'x';
 363     }
 364
 365     // restore the original text now:
 366     {
 367         wxString::iterator end1 = text.end();
 368         wxString::const_iterator end2 = text.end();
 369
 370         size_t idx = 0;
 371         for ( wxString::iterator i = text.begin(); i != text.end(); ++i, ++idx )
 372         {
 373             *i = textUTF16[idx];
 374
 375             CPPUNIT_ASSERT( end1 == text.end() );
 376             CPPUNIT_ASSERT( end2 == text.end() );
 377         }
 378
 379         CPPUNIT_ASSERT( end1 == text.end() );
 380         CPPUNIT_ASSERT( end2 == text.end() );
 381     }
 382
 383     // and verify it again:
 384     {
 385         size_t idx = 0;
 386         for ( wxString::const_iterator i = text.begin(); i != text.end(); ++i, ++idx )
 387         {
 388             CPPUNIT_ASSERT( *i == textUTF16[idx] );
 389         }
 390     }
 391 }
 392 #endif // wxUSE_UNICODE