tests/strings/unicode.cpp

   1 ///////////////////////////////////////////////////////////////////////////////
   2 // Name:        tests/strings/unicode.cpp
   3 // Purpose:     Unicode unit test
   4 // Author:      Vadim Zeitlin, Wlodzimierz ABX Skiba
   5 // Created:     2004-04-28
   6 // RCS-ID:      $Id$
   7 // Copyright:   (c) 2004 Vadim Zeitlin, Wlodzimierz Skiba
   8 ///////////////////////////////////////////////////////////////////////////////
   9
  10 // ----------------------------------------------------------------------------
  11 // headers
  12 // ----------------------------------------------------------------------------
  13
  14 #include "testprec.h"
  15
  16 #ifdef __BORLANDC__
  17     #pragma hdrstop
  18 #endif
  19
  20 #ifndef WX_PRECOMP
  21     #include "wx/wx.h"
  22 #endif // WX_PRECOMP
  23
  24 #include "wx/encconv.h"
  25
  26 // ----------------------------------------------------------------------------
  27 // helper class holding the matching MB and WC strings
  28 // ----------------------------------------------------------------------------
  29
  30 struct StringConversionData
  31 {
  32     // either str or wcs (but not both) may be NULL, this means that the conversion
  33     // to it should fail
  34     StringConversionData(const char *str_, const wchar_t *wcs_, int flags_ = 0)
  35         : str(str_), wcs(wcs_), flags(flags_)
  36     {
  37     }
  38
  39     const char * const str;
  40     const wchar_t * const wcs;
  41
  42     enum
  43     {
  44         TEST_BOTH  = 0, // test both str -> wcs and wcs -> str
  45         ONLY_MB2WC = 1  // only test str -> wcs conversion
  46     };
  47
  48     const int flags;
  49
  50     // test that the conversion between str and wcs (subject to flags) succeeds
  51     //
  52     // the first argument is the index in the test array and is used solely for
  53     // diagnostics
  54     void Test(size_t n, wxMBConv& conv) const
  55     {
  56         if ( str )
  57         {
  58             wxWCharBuffer wbuf = conv.cMB2WC(str);
  59
  60             if ( wcs )
  61             {
  62                 CPPUNIT_ASSERT_MESSAGE
  63                 (
  64                     Message(n, "MB2WC failed"),
  65                     wbuf.data()
  66                 );
  67
  68                 CPPUNIT_ASSERT_MESSAGE
  69                 (
  70                     Message(n, "MB2WC", wbuf, wcs),
  71                     wxStrcmp(wbuf, wcs) == 0
  72                 );
  73             }
  74             else // conversion is supposed to fail
  75             {
  76                 CPPUNIT_ASSERT_MESSAGE
  77                 (
  78                     Message(n, "MB2WC succeeded"),
  79                     !wbuf.data()
  80                 );
  81             }
  82         }
  83
  84         if ( wcs && !(flags & ONLY_MB2WC) )
  85         {
  86             wxCharBuffer buf = conv.cWC2MB(wcs);
  87
  88             if ( str )
  89             {
  90                 CPPUNIT_ASSERT_MESSAGE
  91                 (
  92                     Message(n, "WC2MB failed"),
  93                     buf.data()
  94                 );
  95
  96                 CPPUNIT_ASSERT_MESSAGE
  97                 (
  98                     Message(n, "WC2MB", buf, str),
  99                     strcmp(buf, str) == 0
 100                 );
 101             }
 102             else
 103             {
 104                 CPPUNIT_ASSERT_MESSAGE
 105                 (
 106                     Message(n, "WC2MB succeeded"),
 107                     !buf.data()
 108                 );
 109             }
 110         }
 111     }
 112
 113 private:
 114     static std::string
 115     Message(size_t n, const wxString& msg)
 116     {
 117         return std::string(wxString::Format("#%lu: %s", (unsigned long)n, msg));
 118     }
 119
 120     template <typename T>
 121     static std::string
 122     Message(size_t n,
 123             const char *func,
 124             const wxCharTypeBuffer<T>& actual,
 125             const T *expected)
 126     {
 127         return Message(n,
 128                        wxString::Format("%s returned \"%s\", expected \"%s\"",
 129                                         func, actual.data(), expected));
 130     }
 131 };
 132
 133 // ----------------------------------------------------------------------------
 134 // test class
 135 // ----------------------------------------------------------------------------
 136
 137 class UnicodeTestCase : public CppUnit::TestCase
 138 {
 139 public:
 140     UnicodeTestCase();
 141
 142 private:
 143     CPPUNIT_TEST_SUITE( UnicodeTestCase );
 144         CPPUNIT_TEST( ToFromAscii );
 145         CPPUNIT_TEST( ConstructorsWithConversion );
 146         CPPUNIT_TEST( ConversionFixed );
 147         CPPUNIT_TEST( ConversionWithNULs );
 148         CPPUNIT_TEST( ConversionUTF7 );
 149         CPPUNIT_TEST( ConversionUTF8 );
 150         CPPUNIT_TEST( ConversionUTF16 );
 151         CPPUNIT_TEST( ConversionUTF32 );
 152         CPPUNIT_TEST( IsConvOk );
 153 #if wxUSE_UNICODE
 154         CPPUNIT_TEST( Iteration );
 155 #endif
 156     CPPUNIT_TEST_SUITE_END();
 157
 158     void ToFromAscii();
 159     void ConstructorsWithConversion();
 160     void ConversionFixed();
 161     void ConversionWithNULs();
 162     void ConversionUTF7();
 163     void ConversionUTF8();
 164     void ConversionUTF16();
 165     void ConversionUTF32();
 166     void IsConvOk();
 167 #if wxUSE_UNICODE
 168     void Iteration();
 169 #endif
 170
 171     DECLARE_NO_COPY_CLASS(UnicodeTestCase)
 172 };
 173
 174 // register in the unnamed registry so that these tests are run by default
 175 CPPUNIT_TEST_SUITE_REGISTRATION( UnicodeTestCase );
 176
 177 // also include in its own registry so that these tests can be run alone
 178 CPPUNIT_TEST_SUITE_NAMED_REGISTRATION( UnicodeTestCase, "UnicodeTestCase" );
 179
 180 UnicodeTestCase::UnicodeTestCase()
 181 {
 182 }
 183
 184 void UnicodeTestCase::ToFromAscii()
 185 {
 186
 187 #define TEST_TO_FROM_ASCII(txt)                              \
 188     {                                                        \
 189         static const char *msg = txt;                        \
 190         wxString s = wxString::FromAscii(msg);               \
 191         CPPUNIT_ASSERT( strcmp( s.ToAscii() , msg ) == 0 );  \
 192     }
 193
 194     TEST_TO_FROM_ASCII( "Hello, world!" );
 195     TEST_TO_FROM_ASCII( "additional \" special \t test \\ component \n :-)" );
 196 }
 197
 198 void UnicodeTestCase::ConstructorsWithConversion()
 199 {
 200     // the string "Déjà" in UTF-8 and wchar_t:
 201     const unsigned char utf8Buf[] = {0x44,0xC3,0xA9,0x6A,0xC3,0xA0,0};
 202     const unsigned char utf8subBuf[] = {0x44,0xC3,0xA9,0x6A,0}; // just "Déj"
 203     const char *utf8 = (char *)utf8Buf;
 204     const char *utf8sub = (char *)utf8subBuf;
 205
 206     wxString s1(utf8, wxConvUTF8);
 207
 208 #if wxUSE_UNICODE
 209     const wchar_t wchar[] = {0x44,0xE9,0x6A,0xE0,0};
 210     CPPUNIT_ASSERT_EQUAL( wchar, s1 );
 211
 212     wxString s2(wchar);
 213     CPPUNIT_ASSERT_EQUAL( wchar, s2 );
 214     CPPUNIT_ASSERT_EQUAL( wxString::FromUTF8(utf8), s2 );
 215 #else
 216     CPPUNIT_ASSERT_EQUAL( utf8, s1 );
 217 #endif
 218
 219     wxString sub(utf8sub, wxConvUTF8); // "Dej" substring
 220     wxString s3(utf8, wxConvUTF8, 4);
 221     CPPUNIT_ASSERT_EQUAL( sub, s3 );
 222
 223 #if wxUSE_UNICODE
 224     wxString s4(wchar, wxConvUTF8, 3);
 225     CPPUNIT_ASSERT_EQUAL( sub, s4 );
 226
 227     // conversion should stop with failure at pos 35
 228     wxString s("\t[pl]open.format.Sformatuj dyskietk\xea=gfloppy %f", wxConvUTF8);
 229     CPPUNIT_ASSERT( s.empty() );
 230 #endif // wxUSE_UNICODE
 231
 232
 233     // test using Unicode strings together with char* strings (this must work
 234     // in ANSI mode as well, of course):
 235     wxString s5("ascii");
 236     CPPUNIT_ASSERT_EQUAL( "ascii", s5 );
 237
 238     s5 += " value";
 239
 240     CPPUNIT_ASSERT( strcmp(s5.mb_str(), "ascii value") == 0 );
 241     CPPUNIT_ASSERT_EQUAL( "ascii value", s5 );
 242     CPPUNIT_ASSERT( s5 != "SomethingElse" );
 243 }
 244
 245 void UnicodeTestCase::ConversionFixed()
 246 {
 247     size_t len;
 248
 249 #if wxUSE_UNICODE
 250     wxConvLibc.cWC2MB(L"", 0, &len);
 251 #else // !wxUSE_UNICODE
 252     wxConvLibc.cMB2WC("", 0, &len);
 253 #endif // wxUSE_UNICODE/!wxUSE_UNICODE
 254
 255     CPPUNIT_ASSERT_EQUAL( 0, len );
 256
 257 #if wxUSE_UNICODE
 258     // check that when we convert a fixed number of characters we obtain the
 259     // expected return value
 260     CPPUNIT_ASSERT_EQUAL( 0, wxConvLibc.ToWChar(NULL, 0, "", 0) );
 261     CPPUNIT_ASSERT_EQUAL( 1, wxConvLibc.ToWChar(NULL, 0, "x", 1) );
 262     CPPUNIT_ASSERT_EQUAL( 2, wxConvLibc.ToWChar(NULL, 0, "x", 2) );
 263     CPPUNIT_ASSERT_EQUAL( 2, wxConvLibc.ToWChar(NULL, 0, "xy", 2) );
 264 #endif // wxUSE_UNICODE
 265 }
 266
 267 void UnicodeTestCase::ConversionWithNULs()
 268 {
 269 #if wxUSE_UNICODE
 270     static const size_t lenNulString = 10;
 271
 272     wxString szTheString(L"The\0String", wxConvLibc, lenNulString);
 273     wxCharBuffer theBuffer = szTheString.mb_str();
 274
 275     CPPUNIT_ASSERT( memcmp(theBuffer.data(), "The\0String",
 276                     lenNulString + 1) == 0 );
 277
 278     wxString szTheString2("The\0String", wxConvLocal, lenNulString);
 279     CPPUNIT_ASSERT_EQUAL( lenNulString, szTheString2.length() );
 280     CPPUNIT_ASSERT( wxTmemcmp(szTheString2.c_str(), L"The\0String",
 281                     lenNulString + 1) == 0 );
 282 #else // !wxUSE_UNICODE
 283     wxString szTheString("TheString");
 284     szTheString.insert(3, 1, '\0');
 285     wxWCharBuffer theBuffer = szTheString.wc_str(wxConvLibc);
 286
 287     CPPUNIT_ASSERT( memcmp(theBuffer.data(), L"The\0String", 11 * sizeof(wchar_t)) == 0 );
 288
 289     wxString szLocalTheString("TheString");
 290     szLocalTheString.insert(3, 1, '\0');
 291     wxWCharBuffer theLocalBuffer = szLocalTheString.wc_str(wxConvLocal);
 292
 293     CPPUNIT_ASSERT( memcmp(theLocalBuffer.data(), L"The\0String", 11 * sizeof(wchar_t)) == 0 );
 294 #endif // wxUSE_UNICODE/!wxUSE_UNICODE
 295 }
 296
 297 void UnicodeTestCase::ConversionUTF7()
 298 {
 299     static const StringConversionData utf7data[] =
 300     {
 301         // normal fragments
 302         StringConversionData("+AKM-", L"\xa3"),
 303         StringConversionData("+AOk-t+AOk-", L"\xe9t\xe9"),
 304
 305         // this one is an alternative valid encoding of the same string
 306         StringConversionData("+AOk-t+AOk", L"\xe9t\xe9",
 307                              StringConversionData::ONLY_MB2WC),
 308
 309         // some special cases
 310         StringConversionData("+-", L"+"),
 311         StringConversionData("+--", L"+-"),
 312
 313         // the following are invalid UTF-7 sequences
 314         StringConversionData("\xa3", NULL),
 315         StringConversionData("+", NULL),
 316         StringConversionData("+~", NULL),
 317         StringConversionData("a+", NULL),
 318     };
 319
 320     for ( size_t n = 0; n < WXSIZEOF(utf7data); n++ )
 321     {
 322         const StringConversionData& d = utf7data[n];
 323
 324         // converting to/from UTF-7 using iconv() currently doesn't work
 325         // because of several problems:
 326         //  - GetMBNulLen() doesn't return correct result (iconv converts L'\0'
 327         //    to an incomplete and anyhow nonsensical "+AA" string)
 328         //  - iconv refuses to convert "+-" (although it converts "+-\n" just
 329         //    fine, go figure)
 330         //
 331         // I have no idea how to fix this so just disable the test for now
 332 #if 0
 333         d.Test(n, wxCSConv("utf-7"));
 334 #endif
 335         d.Test(n, wxConvUTF7);
 336     }
 337 }
 338
 339 void UnicodeTestCase::ConversionUTF8()
 340 {
 341     static const StringConversionData utf8data[] =
 342     {
 343 #ifdef wxHAVE_U_ESCAPE
 344         StringConversionData("\xc2\xa3", L"\u00a3"),
 345 #endif
 346         StringConversionData("\xc2", NULL),
 347     };
 348
 349     wxCSConv conv(wxT("utf-8"));
 350     for ( size_t n = 0; n < WXSIZEOF(utf8data); n++ )
 351     {
 352         const StringConversionData& d = utf8data[n];
 353         d.Test(n, conv);
 354         d.Test(n, wxConvUTF8);
 355     }
 356 }
 357
 358 void UnicodeTestCase::ConversionUTF16()
 359 {
 360     static const StringConversionData utf16data[] =
 361     {
 362 #ifdef wxHAVE_U_ESCAPE
 363         StringConversionData(
 364             "\x04\x1f\x04\x40\x04\x38\x04\x32\x04\x35\x04\x42\0\0",
 365             L"\u041f\u0440\u0438\u0432\u0435\u0442"),
 366         StringConversionData(
 367             "\x01\0\0b\x01\0\0a\x01\0\0r\0\0",
 368             L"\u0100b\u0100a\u0100r"),
 369 #endif
 370         StringConversionData("\0f\0o\0o\0\0", L"foo"),
 371     };
 372
 373     wxCSConv conv(wxFONTENCODING_UTF16BE);
 374     for ( size_t n = 0; n < WXSIZEOF(utf16data); n++ )
 375     {
 376         const StringConversionData& d = utf16data[n];
 377         d.Test(n, conv);
 378     }
 379
 380     // special case: this string has consecutive NULs inside it which don't
 381     // terminate the string, this exposed a bug in our conversion code which
 382     // got confused in this case
 383     size_t len;
 384     conv.cMB2WC("\x01\0\0B\0C" /* A macron BC */, 6, &len);
 385     CPPUNIT_ASSERT_EQUAL( 3, len );
 386 }
 387
 388 void UnicodeTestCase::ConversionUTF32()
 389 {
 390     static const StringConversionData utf32data[] =
 391     {
 392 #ifdef wxHAVE_U_ESCAPE
 393         StringConversionData(
 394             "\0\0\x04\x1f\0\0\x04\x40\0\0\x04\x38\0\0\x04\x32\0\0\x04\x35\0\0\x04\x42\0\0\0\0",
 395           L"\u041f\u0440\u0438\u0432\u0435\u0442"),
 396 #endif
 397         StringConversionData("\0\0\0f\0\0\0o\0\0\0o\0\0\0\0", L"foo"),
 398     };
 399
 400     wxCSConv conv(wxFONTENCODING_UTF32BE);
 401     for ( size_t n = 0; n < WXSIZEOF(utf32data); n++ )
 402     {
 403         const StringConversionData& d = utf32data[n];
 404         d.Test(n, conv);
 405     }
 406
 407     size_t len;
 408     conv.cMB2WC("\0\0\x01\0\0\0\0B\0\0\0C" /* A macron BC */, 12, &len);
 409     CPPUNIT_ASSERT_EQUAL( 3, len );
 410 }
 411
 412 void UnicodeTestCase::IsConvOk()
 413 {
 414     CPPUNIT_ASSERT( wxCSConv(wxFONTENCODING_SYSTEM).IsOk() );
 415     CPPUNIT_ASSERT( wxCSConv("US-ASCII").IsOk() );
 416     CPPUNIT_ASSERT( wxCSConv("UTF-8").IsOk() );
 417     CPPUNIT_ASSERT( !wxCSConv("NoSuchConversion").IsOk() );
 418
 419 #ifdef __WINDOWS__
 420     CPPUNIT_ASSERT( wxCSConv("WINDOWS-437").IsOk() );
 421 #endif
 422 }
 423
 424 #if wxUSE_UNICODE
 425 void UnicodeTestCase::Iteration()
 426 {
 427     // "czech" in Czech ("cestina"):
 428     static const char *textUTF8 = "\304\215e\305\241tina";
 429     static const wchar_t textUTF16[] = {0x10D, 0x65, 0x161, 0x74, 0x69, 0x6E, 0x61, 0};
 430
 431     wxString text(wxString::FromUTF8(textUTF8));
 432     CPPUNIT_ASSERT( wxStrcmp(text.wc_str(), textUTF16) == 0 );
 433
 434     // verify the string was decoded correctly:
 435     {
 436         size_t idx = 0;
 437         for ( wxString::const_iterator i = text.begin(); i != text.end(); ++i, ++idx )
 438         {
 439             CPPUNIT_ASSERT( *i == textUTF16[idx] );
 440         }
 441     }
 442
 443     // overwrite the string with something that is shorter in UTF-8:
 444     {
 445         for ( wxString::iterator i = text.begin(); i != text.end(); ++i )
 446             *i = 'x';
 447     }
 448
 449     // restore the original text now:
 450     {
 451         wxString::iterator end1 = text.end();
 452         wxString::const_iterator end2 = text.end();
 453
 454         size_t idx = 0;
 455         for ( wxString::iterator i = text.begin(); i != text.end(); ++i, ++idx )
 456         {
 457             *i = textUTF16[idx];
 458
 459             CPPUNIT_ASSERT( end1 == text.end() );
 460             CPPUNIT_ASSERT( end2 == text.end() );
 461         }
 462
 463         CPPUNIT_ASSERT( end1 == text.end() );
 464         CPPUNIT_ASSERT( end2 == text.end() );
 465     }
 466
 467     // and verify it again:
 468     {
 469         size_t idx = 0;
 470         for ( wxString::const_iterator i = text.begin(); i != text.end(); ++i, ++idx )
 471         {
 472             CPPUNIT_ASSERT( *i == textUTF16[idx] );
 473         }
 474     }
 475 }
 476 #endif // wxUSE_UNICODE
 477