tests/strings/unicode.cpp

   1 ///////////////////////////////////////////////////////////////////////////////
   2 // Name:        tests/strings/unicode.cpp
   3 // Purpose:     Unicode unit test
   4 // Author:      Vadim Zeitlin, Wlodzimierz ABX Skiba
   5 // Created:     2004-04-28
   6 // RCS-ID:      $Id$
   7 // Copyright:   (c) 2004 Vadim Zeitlin, Wlodzimierz Skiba
   8 ///////////////////////////////////////////////////////////////////////////////
   9
  10 // ----------------------------------------------------------------------------
  11 // headers
  12 // ----------------------------------------------------------------------------
  13
  14 #include "testprec.h"
  15
  16 #ifdef __BORLANDC__
  17     #pragma hdrstop
  18 #endif
  19
  20 #ifndef WX_PRECOMP
  21     #include "wx/wx.h"
  22 #endif // WX_PRECOMP
  23
  24 #include "wx/encconv.h"
  25
  26 // ----------------------------------------------------------------------------
  27 // helper class holding the matching MB and WC strings
  28 // ----------------------------------------------------------------------------
  29
  30 struct StringConversionData
  31 {
  32     // either str or wcs (but not both) may be NULL, this means that the conversion
  33     // to it should fail
  34     StringConversionData(const char *str_, const wchar_t *wcs_, int flags_ = 0)
  35         : str(str_), wcs(wcs_), flags(flags_)
  36     {
  37     }
  38
  39     const char * const str;
  40     const wchar_t * const wcs;
  41
  42     enum
  43     {
  44         TEST_BOTH  = 0, // test both str -> wcs and wcs -> str
  45         ONLY_MB2WC = 1  // only test str -> wcs conversion
  46     };
  47
  48     const int flags;
  49
  50     // test that the conversion between str and wcs (subject to flags) succeeds
  51     //
  52     // the first argument is the index in the test array and is used solely for
  53     // diagnostics
  54     void Test(size_t n, wxMBConv& conv) const
  55     {
  56         if ( str )
  57         {
  58             wxWCharBuffer wbuf = conv.cMB2WC(str);
  59
  60             if ( wcs )
  61             {
  62                 CPPUNIT_ASSERT_MESSAGE
  63                 (
  64                     Message(n, "MB2WC failed"),
  65                     wbuf.data()
  66                 );
  67
  68                 CPPUNIT_ASSERT_MESSAGE
  69                 (
  70                     Message(n, "MB2WC", wbuf, wcs),
  71                     wxStrcmp(wbuf, wcs) == 0
  72                 );
  73             }
  74             else // conversion is supposed to fail
  75             {
  76                 CPPUNIT_ASSERT_MESSAGE
  77                 (
  78                     Message(n, "MB2WC succeeded"),
  79                     !wbuf.data()
  80                 );
  81             }
  82         }
  83
  84         if ( wcs && !(flags & ONLY_MB2WC) )
  85         {
  86             wxCharBuffer buf = conv.cWC2MB(wcs);
  87
  88             if ( str )
  89             {
  90                 CPPUNIT_ASSERT_MESSAGE
  91                 (
  92                     Message(n, "WC2MB failed"),
  93                     buf.data()
  94                 );
  95
  96                 CPPUNIT_ASSERT_MESSAGE
  97                 (
  98                     Message(n, "WC2MB", buf, str),
  99                     strcmp(buf, str) == 0
 100                 );
 101             }
 102             else
 103             {
 104                 CPPUNIT_ASSERT_MESSAGE
 105                 (
 106                     Message(n, "WC2MB succeeded"),
 107                     !buf.data()
 108                 );
 109             }
 110         }
 111     }
 112
 113 private:
 114     static std::string
 115     Message(size_t n, const wxString& msg)
 116     {
 117         return std::string(wxString::Format("#%lu: %s", (unsigned long)n, msg));
 118     }
 119
 120     template <typename T>
 121     static std::string
 122     Message(size_t n,
 123             const char *func,
 124             const wxCharTypeBuffer<T>& actual,
 125             const T *expected)
 126     {
 127         return Message(n,
 128                        wxString::Format("%s returned \"%s\", expected \"%s\"",
 129                                         func, actual.data(), expected));
 130     }
 131 };
 132
 133 // ----------------------------------------------------------------------------
 134 // test data for UnicodeTestCase::Utf8()
 135 // ----------------------------------------------------------------------------
 136
 137 static const unsigned char utf8koi8r[] =
 138 {
 139     208, 157, 208, 181, 209, 129, 208, 186, 208, 176, 208, 183, 208, 176,
 140     208, 189, 208, 189, 208, 190, 32, 208, 191, 208, 190, 209, 128, 208,
 141     176, 208, 180, 208, 190, 208, 178, 208, 176, 208, 187, 32, 208, 188,
 142     208, 181, 208, 189, 209, 143, 32, 209, 129, 208, 178, 208, 190, 208,
 143     181, 208, 185, 32, 208, 186, 209, 128, 209, 131, 209, 130, 208, 181,
 144     208, 185, 209, 136, 208, 181, 208, 185, 32, 208, 189, 208, 190, 208,
 145     178, 208, 190, 209, 129, 209, 130, 209, 140, 209, 142, 0
 146 };
 147
 148 static const unsigned char utf8iso8859_1[] =
 149 {
 150     0x53, 0x79, 0x73, 0x74, 0xc3, 0xa8, 0x6d, 0x65, 0x73, 0x20, 0x49, 0x6e,
 151     0x74, 0xc3, 0xa9, 0x67, 0x72, 0x61, 0x62, 0x6c, 0x65, 0x73, 0x20, 0x65,
 152     0x6e, 0x20, 0x4d, 0xc3, 0xa9, 0x63, 0x61, 0x6e, 0x69, 0x71, 0x75, 0x65,
 153     0x20, 0x43, 0x6c, 0x61, 0x73, 0x73, 0x69, 0x71, 0x75, 0x65, 0x20, 0x65,
 154     0x74, 0x20, 0x51, 0x75, 0x61, 0x6e, 0x74, 0x69, 0x71, 0x75, 0x65, 0
 155 };
 156
 157 static const unsigned char utf8Invalid[] =
 158 {
 159     0x3c, 0x64, 0x69, 0x73, 0x70, 0x6c, 0x61, 0x79, 0x3e, 0x32, 0x30, 0x30,
 160     0x32, 0xe5, 0xb9, 0xb4, 0x30, 0x39, 0xe6, 0x9c, 0x88, 0x32, 0x35, 0xe6,
 161     0x97, 0xa5, 0x20, 0x30, 0x37, 0xe6, 0x99, 0x82, 0x33, 0x39, 0xe5, 0x88,
 162     0x86, 0x35, 0x37, 0xe7, 0xa7, 0x92, 0x3c, 0x2f, 0x64, 0x69, 0x73, 0x70,
 163     0x6c, 0x61, 0x79, 0
 164 };
 165
 166 static const struct Utf8Data
 167 {
 168     const unsigned char *text;
 169     size_t len;
 170     const wxChar *charset;
 171     wxFontEncoding encoding;
 172 } utf8data[] =
 173 {
 174     { utf8Invalid, WXSIZEOF(utf8Invalid), wxT("iso8859-1"), wxFONTENCODING_ISO8859_1 },
 175     { utf8koi8r, WXSIZEOF(utf8koi8r), wxT("koi8-r"), wxFONTENCODING_KOI8 },
 176     { utf8iso8859_1, WXSIZEOF(utf8iso8859_1), wxT("iso8859-1"), wxFONTENCODING_ISO8859_1 },
 177 };
 178
 179
 180 // ----------------------------------------------------------------------------
 181 // test class
 182 // ----------------------------------------------------------------------------
 183
// CppUnit test case grouping the Unicode-related string and conversion tests
// implemented in this file.
class UnicodeTestCase : public CppUnit::TestCase
{
public:
    UnicodeTestCase();

private:
    // the test suite: each CPPUNIT_TEST() line names one of the member
    // functions declared below
    CPPUNIT_TEST_SUITE( UnicodeTestCase );
        CPPUNIT_TEST( ToFromAscii );
        CPPUNIT_TEST( ConstructorsWithConversion );
        CPPUNIT_TEST( ConversionFixed );
        CPPUNIT_TEST( ConversionWithNULs );
        CPPUNIT_TEST( ConversionUTF7 );
        CPPUNIT_TEST( ConversionUTF8 );
        CPPUNIT_TEST( ConversionUTF16 );
        CPPUNIT_TEST( ConversionUTF32 );
        CPPUNIT_TEST( IsConvOk );
#if wxUSE_UNICODE
        // this test is only declared and defined if wxUSE_UNICODE is set
        CPPUNIT_TEST( Iteration );
#endif
        CPPUNIT_TEST( Utf8 );
        CPPUNIT_TEST( EncodingConverter );
    CPPUNIT_TEST_SUITE_END();

    // the test functions themselves, in the same order as in the suite above
    void ToFromAscii();
    void ConstructorsWithConversion();
    void ConversionFixed();
    void ConversionWithNULs();
    void ConversionUTF7();
    void ConversionUTF8();
    void ConversionUTF16();
    void ConversionUTF32();
    void IsConvOk();
#if wxUSE_UNICODE
    void Iteration();
#endif
    void Utf8();
    void EncodingConverter();

    DECLARE_NO_COPY_CLASS(UnicodeTestCase)
};
 224
// register in the unnamed registry so that these tests are run by default
CPPUNIT_TEST_SUITE_REGISTRATION( UnicodeTestCase );

// also include in its own registry so that these tests can be run alone
CPPUNIT_TEST_SUITE_NAMED_REGISTRATION( UnicodeTestCase, "UnicodeTestCase" );
 230
 231 UnicodeTestCase::UnicodeTestCase()
 232 {
 233 }
 234
 235 void UnicodeTestCase::ToFromAscii()
 236 {
 237
 238 #define TEST_TO_FROM_ASCII(txt)                              \
 239     {                                                        \
 240         static const char *msg = txt;                        \
 241         wxString s = wxString::FromAscii(msg);               \
 242         CPPUNIT_ASSERT( strcmp( s.ToAscii() , msg ) == 0 );  \
 243     }
 244
 245     TEST_TO_FROM_ASCII( "Hello, world!" );
 246     TEST_TO_FROM_ASCII( "additional \" special \t test \\ component \n :-)" );
 247 }
 248
 249 void UnicodeTestCase::ConstructorsWithConversion()
 250 {
 251     // the string "Déjà" in UTF-8 and wchar_t:
 252     const unsigned char utf8Buf[] = {0x44,0xC3,0xA9,0x6A,0xC3,0xA0,0};
 253     const unsigned char utf8subBuf[] = {0x44,0xC3,0xA9,0x6A,0}; // just "Déj"
 254     const char *utf8 = (char *)utf8Buf;
 255     const char *utf8sub = (char *)utf8subBuf;
 256
 257     wxString s1(utf8, wxConvUTF8);
 258
 259 #if wxUSE_UNICODE
 260     const wchar_t wchar[] = {0x44,0xE9,0x6A,0xE0,0};
 261     CPPUNIT_ASSERT_EQUAL( wchar, s1 );
 262
 263     wxString s2(wchar);
 264     CPPUNIT_ASSERT_EQUAL( wchar, s2 );
 265     CPPUNIT_ASSERT_EQUAL( wxString::FromUTF8(utf8), s2 );
 266 #else
 267     CPPUNIT_ASSERT_EQUAL( utf8, s1 );
 268 #endif
 269
 270     wxString sub(utf8sub, wxConvUTF8); // "Dej" substring
 271     wxString s3(utf8, wxConvUTF8, 4);
 272     CPPUNIT_ASSERT_EQUAL( sub, s3 );
 273
 274 #if wxUSE_UNICODE
 275     wxString s4(wchar, wxConvUTF8, 3);
 276     CPPUNIT_ASSERT_EQUAL( sub, s4 );
 277
 278     // conversion should stop with failure at pos 35
 279     wxString s("\t[pl]open.format.Sformatuj dyskietk\xea=gfloppy %f", wxConvUTF8);
 280     CPPUNIT_ASSERT( s.empty() );
 281 #endif // wxUSE_UNICODE
 282
 283
 284     // test using Unicode strings together with char* strings (this must work
 285     // in ANSI mode as well, of course):
 286     wxString s5("ascii");
 287     CPPUNIT_ASSERT_EQUAL( "ascii", s5 );
 288
 289     s5 += " value";
 290
 291     CPPUNIT_ASSERT( strcmp(s5.mb_str(), "ascii value") == 0 );
 292     CPPUNIT_ASSERT_EQUAL( "ascii value", s5 );
 293     CPPUNIT_ASSERT( s5 != "SomethingElse" );
 294 }
 295
 296 void UnicodeTestCase::ConversionFixed()
 297 {
 298     size_t len;
 299
 300 #if wxUSE_UNICODE
 301     wxConvLibc.cWC2MB(L"", 0, &len);
 302 #else // !wxUSE_UNICODE
 303     wxConvLibc.cMB2WC("", 0, &len);
 304 #endif // wxUSE_UNICODE/!wxUSE_UNICODE
 305
 306     CPPUNIT_ASSERT_EQUAL( 0, len );
 307
 308 #if wxUSE_UNICODE
 309     // check that when we convert a fixed number of characters we obtain the
 310     // expected return value
 311     CPPUNIT_ASSERT_EQUAL( 0, wxConvLibc.ToWChar(NULL, 0, "", 0) );
 312     CPPUNIT_ASSERT_EQUAL( 1, wxConvLibc.ToWChar(NULL, 0, "x", 1) );
 313     CPPUNIT_ASSERT_EQUAL( 2, wxConvLibc.ToWChar(NULL, 0, "x", 2) );
 314     CPPUNIT_ASSERT_EQUAL( 2, wxConvLibc.ToWChar(NULL, 0, "xy", 2) );
 315 #endif // wxUSE_UNICODE
 316 }
 317
 318 void UnicodeTestCase::ConversionWithNULs()
 319 {
 320 #if wxUSE_UNICODE
 321     static const size_t lenNulString = 10;
 322
 323     wxString szTheString(L"The\0String", wxConvLibc, lenNulString);
 324     wxCharBuffer theBuffer = szTheString.mb_str();
 325
 326     CPPUNIT_ASSERT( memcmp(theBuffer.data(), "The\0String",
 327                     lenNulString + 1) == 0 );
 328
 329     wxString szTheString2("The\0String", wxConvLocal, lenNulString);
 330     CPPUNIT_ASSERT_EQUAL( lenNulString, szTheString2.length() );
 331     CPPUNIT_ASSERT( wxTmemcmp(szTheString2.c_str(), L"The\0String",
 332                     lenNulString + 1) == 0 );
 333 #else // !wxUSE_UNICODE
 334     wxString szTheString("TheString");
 335     szTheString.insert(3, 1, '\0');
 336     wxWCharBuffer theBuffer = szTheString.wc_str(wxConvLibc);
 337
 338     CPPUNIT_ASSERT( memcmp(theBuffer.data(), L"The\0String", 11 * sizeof(wchar_t)) == 0 );
 339
 340     wxString szLocalTheString("TheString");
 341     szLocalTheString.insert(3, 1, '\0');
 342     wxWCharBuffer theLocalBuffer = szLocalTheString.wc_str(wxConvLocal);
 343
 344     CPPUNIT_ASSERT( memcmp(theLocalBuffer.data(), L"The\0String", 11 * sizeof(wchar_t)) == 0 );
 345 #endif // wxUSE_UNICODE/!wxUSE_UNICODE
 346 }
 347
 348 void UnicodeTestCase::ConversionUTF7()
 349 {
 350     static const StringConversionData utf7data[] =
 351     {
 352         // normal fragments
 353         StringConversionData("+AKM-", L"\xa3"),
 354         StringConversionData("+AOk-t+AOk-", L"\xe9t\xe9"),
 355
 356         // this one is an alternative valid encoding of the same string
 357         StringConversionData("+AOk-t+AOk", L"\xe9t\xe9",
 358                              StringConversionData::ONLY_MB2WC),
 359
 360         // some special cases
 361         StringConversionData("+-", L"+"),
 362         StringConversionData("+--", L"+-"),
 363
 364         // the following are invalid UTF-7 sequences
 365         StringConversionData("\xa3", NULL),
 366         StringConversionData("+", NULL),
 367         StringConversionData("+~", NULL),
 368         StringConversionData("a+", NULL),
 369     };
 370
 371     for ( size_t n = 0; n < WXSIZEOF(utf7data); n++ )
 372     {
 373         const StringConversionData& d = utf7data[n];
 374
 375         // converting to/from UTF-7 using iconv() currently doesn't work
 376         // because of several problems:
 377         //  - GetMBNulLen() doesn't return correct result (iconv converts L'\0'
 378         //    to an incomplete and anyhow nonsensical "+AA" string)
 379         //  - iconv refuses to convert "+-" (although it converts "+-\n" just
 380         //    fine, go figure)
 381         //
 382         // I have no idea how to fix this so just disable the test for now
 383 #if 0
 384         d.Test(n, wxCSConv("utf-7"));
 385 #endif
 386         d.Test(n, wxConvUTF7);
 387     }
 388 }
 389
 390 void UnicodeTestCase::ConversionUTF8()
 391 {
 392     static const StringConversionData utf8data[] =
 393     {
 394 #ifdef wxHAVE_U_ESCAPE
 395         StringConversionData("\xc2\xa3", L"\u00a3"),
 396 #endif
 397         StringConversionData("\xc2", NULL),
 398     };
 399
 400     wxCSConv conv(wxT("utf-8"));
 401     for ( size_t n = 0; n < WXSIZEOF(utf8data); n++ )
 402     {
 403         const StringConversionData& d = utf8data[n];
 404         d.Test(n, conv);
 405         d.Test(n, wxConvUTF8);
 406     }
 407 }
 408
 409 void UnicodeTestCase::ConversionUTF16()
 410 {
 411     static const StringConversionData utf16data[] =
 412     {
 413 #ifdef wxHAVE_U_ESCAPE
 414         StringConversionData(
 415             "\x04\x1f\x04\x40\x04\x38\x04\x32\x04\x35\x04\x42\0\0",
 416             L"\u041f\u0440\u0438\u0432\u0435\u0442"),
 417         StringConversionData(
 418             "\x01\0\0b\x01\0\0a\x01\0\0r\0\0",
 419             L"\u0100b\u0100a\u0100r"),
 420 #endif
 421         StringConversionData("\0f\0o\0o\0\0", L"foo"),
 422     };
 423
 424     wxCSConv conv(wxFONTENCODING_UTF16BE);
 425     for ( size_t n = 0; n < WXSIZEOF(utf16data); n++ )
 426     {
 427         const StringConversionData& d = utf16data[n];
 428         d.Test(n, conv);
 429     }
 430
 431     // special case: this string has consecutive NULs inside it which don't
 432     // terminate the string, this exposed a bug in our conversion code which
 433     // got confused in this case
 434     size_t len;
 435     conv.cMB2WC("\x01\0\0B\0C" /* A macron BC */, 6, &len);
 436     CPPUNIT_ASSERT_EQUAL( 3, len );
 437 }
 438
 439 void UnicodeTestCase::ConversionUTF32()
 440 {
 441     static const StringConversionData utf32data[] =
 442     {
 443 #ifdef wxHAVE_U_ESCAPE
 444         StringConversionData(
 445             "\0\0\x04\x1f\0\0\x04\x40\0\0\x04\x38\0\0\x04\x32\0\0\x04\x35\0\0\x04\x42\0\0\0\0",
 446           L"\u041f\u0440\u0438\u0432\u0435\u0442"),
 447 #endif
 448         StringConversionData("\0\0\0f\0\0\0o\0\0\0o\0\0\0\0", L"foo"),
 449     };
 450
 451     wxCSConv conv(wxFONTENCODING_UTF32BE);
 452     for ( size_t n = 0; n < WXSIZEOF(utf32data); n++ )
 453     {
 454         const StringConversionData& d = utf32data[n];
 455         d.Test(n, conv);
 456     }
 457
 458     size_t len;
 459     conv.cMB2WC("\0\0\x01\0\0\0\0B\0\0\0C" /* A macron BC */, 12, &len);
 460     CPPUNIT_ASSERT_EQUAL( 3, len );
 461 }
 462
 463 void UnicodeTestCase::IsConvOk()
 464 {
 465     CPPUNIT_ASSERT( wxCSConv(wxFONTENCODING_SYSTEM).IsOk() );
 466     CPPUNIT_ASSERT( wxCSConv("US-ASCII").IsOk() );
 467     CPPUNIT_ASSERT( wxCSConv("UTF-8").IsOk() );
 468     CPPUNIT_ASSERT( !wxCSConv("NoSuchConversion").IsOk() );
 469
 470 #ifdef __WINDOWS__
 471     CPPUNIT_ASSERT( wxCSConv("WINDOWS-437").IsOk() );
 472 #endif
 473 }
 474
#if wxUSE_UNICODE
// Check reading and writing the characters of a wxString through its
// iterators, including that end() iterators obtained before the string is
// modified through another iterator still compare equal to end() afterwards.
void UnicodeTestCase::Iteration()
{
    // "czech" in Czech ("cestina"):
    static const char *textUTF8 = "\304\215e\305\241tina";
    static const wchar_t textUTF16[] = {0x10D, 0x65, 0x161, 0x74, 0x69, 0x6E, 0x61, 0};

    wxString text(wxString::FromUTF8(textUTF8));
    CPPUNIT_ASSERT( wxStrcmp(text.wc_str(), textUTF16) == 0 );

    // verify the string was decoded correctly:
    {
        size_t idx = 0;
        for ( wxString::const_iterator i = text.begin(); i != text.end(); ++i, ++idx )
        {
            CPPUNIT_ASSERT( *i == textUTF16[idx] );
        }
    }

    // overwrite the string with something that is shorter in UTF-8:
    {
        // every character is replaced with the ASCII letter 'x'
        for ( wxString::iterator i = text.begin(); i != text.end(); ++i )
            *i = 'x';
    }

    // restore the original text now:
    {
        // remember the end of the string using both kinds of iterators before
        // modifying the string
        wxString::iterator end1 = text.end();
        wxString::const_iterator end2 = text.end();

        size_t idx = 0;
        for ( wxString::iterator i = text.begin(); i != text.end(); ++i, ++idx )
        {
            *i = textUTF16[idx];

            // the saved iterators must still be equal to end() after each
            // assignment
            CPPUNIT_ASSERT( end1 == text.end() );
            CPPUNIT_ASSERT( end2 == text.end() );
        }

        // and also when the whole string has been rewritten
        CPPUNIT_ASSERT( end1 == text.end() );
        CPPUNIT_ASSERT( end2 == text.end() );
    }

    // and verify it again:
    {
        size_t idx = 0;
        for ( wxString::const_iterator i = text.begin(); i != text.end(); ++i, ++idx )
        {
            CPPUNIT_ASSERT( *i == textUTF16[idx] );
        }
    }
}
#endif // wxUSE_UNICODE
 528
 529 void UnicodeTestCase::Utf8()
 530 {
 531     // test code extracted from console sample r64320
 532
 533     char buf[1024];
 534     wchar_t wbuf[1024];
 535
 536     for ( size_t n = 0; n < WXSIZEOF(utf8data); n++ )
 537     {
 538         const Utf8Data& u8d = utf8data[n];
 539         CPPUNIT_ASSERT( wxConvUTF8.MB2WC(wbuf, (const char *)u8d.text, WXSIZEOF(wbuf)) != (size_t)-1 );
 540
 541 #if 0       // FIXME: this conversion seem not to work...
 542         wxCSConv conv(u8d.charset);
 543         CPPUNIT_ASSERT( conv.WC2MB(buf, wbuf, WXSIZEOF(buf)) != (size_t)-1 );
 544 #endif
 545         wxString s(wxConvUTF8.cMB2WC((const char *)u8d.text));
 546         CPPUNIT_ASSERT( !s.empty() );
 547     }
 548 }
 549
 550 void UnicodeTestCase::EncodingConverter()
 551 {
 552     // test code extracted from console sample r64320
 553
 554 #if 0
 555     char buf[1024];
 556     wchar_t wbuf[1024];
 557
 558     CPPUNIT_ASSERT( wxConvUTF8.MB2WC(wbuf, (const char *)utf8koi8r, WXSIZEOF(utf8koi8r)) != (size_t)-1 );
 559
 560     wxString s1(wxConvUTF8.cMB2WC((const char *)utf8koi8r));
 561     CPPUNIT_ASSERT( !s1.empty() );
 562
 563     wxEncodingConverter ec;
 564     ec.Init(wxFONTENCODING_UNICODE, wxFONTENCODING_KOI8);
 565     ec.Convert(wbuf, buf);
 566     wxString s2(buf);
 567
 568     CPPUNIT_ASSERT_EQUAL( s1, s2 );
 569 #endif
 570 }