tests/strings/unicode.cpp

   1 ///////////////////////////////////////////////////////////////////////////////
   2 // Name:        tests/strings/unicode.cpp
   3 // Purpose:     Unicode unit test
   4 // Author:      Vadim Zeitlin, Wlodzimierz ABX Skiba
   5 // Created:     2004-04-28
   6 // RCS-ID:      $Id$
   7 // Copyright:   (c) 2004 Vadim Zeitlin, Wlodzimierz Skiba
   8 ///////////////////////////////////////////////////////////////////////////////
   9
  10 // ----------------------------------------------------------------------------
  11 // headers
  12 // ----------------------------------------------------------------------------
  13
  14 #include "testprec.h"
  15
  16 #ifdef __BORLANDC__
  17     #pragma hdrstop
  18 #endif
  19
  20 #ifndef WX_PRECOMP
  21 #endif // WX_PRECOMP
  22
  23 // ----------------------------------------------------------------------------
  24 // local functions
  25 // ----------------------------------------------------------------------------
  26
  27 #if wxUSE_WCHAR_T && !wxUSE_UNICODE
  28
  29 // in case wcscmp is missing
  30 static int wx_wcscmp(const wchar_t *s1, const wchar_t *s2)
  31 {
  32     while (*s1 == *s2 && *s1 != 0)
  33     {
  34         s1++;
  35         s2++;
  36     }
  37     return *s1 - *s2;
  38 }
  39
  40 #endif // wxUSE_WCHAR_T && !wxUSE_UNICODE
  41
  42 // ----------------------------------------------------------------------------
  43 // test class
  44 // ----------------------------------------------------------------------------
  45
  46 class UnicodeTestCase : public CppUnit::TestCase
  47 {
  48 public:
  49     UnicodeTestCase();
  50
  51 private:
  52     CPPUNIT_TEST_SUITE( UnicodeTestCase );
  53         CPPUNIT_TEST( ToFromAscii );
  54 #if wxUSE_WCHAR_T
  55         CPPUNIT_TEST( ConstructorsWithConversion );
  56         CPPUNIT_TEST( ConversionEmpty );
  57         CPPUNIT_TEST( ConversionWithNULs );
  58         CPPUNIT_TEST( ConversionUTF7 );
  59         CPPUNIT_TEST( ConversionUTF8 );
  60         CPPUNIT_TEST( ConversionUTF16 );
  61         CPPUNIT_TEST( ConversionUTF32 );
  62         CPPUNIT_TEST( IsConvOk );
  63 #endif // wxUSE_WCHAR_T
  64 #if wxUSE_UNICODE
  65         CPPUNIT_TEST( Iteration );
  66 #endif
  67     CPPUNIT_TEST_SUITE_END();
  68
  69     void ToFromAscii();
  70 #if wxUSE_WCHAR_T
  71     void ConstructorsWithConversion();
  72     void ConversionEmpty();
  73     void ConversionWithNULs();
  74     void ConversionUTF7();
  75     void ConversionUTF8();
  76     void ConversionUTF16();
  77     void ConversionUTF32();
  78     void IsConvOk();
  79 #if wxUSE_UNICODE
  80     void Iteration();
  81 #endif
  82
  83     // test if converting s using the given encoding gives ws and vice versa
  84     //
  85     // if either of the first 2 arguments is NULL, the conversion is supposed
  86     // to fail
  87     void DoTestConversion(const char *s, const wchar_t *w, wxMBConv& conv);
  88 #endif // wxUSE_WCHAR_T
  89
  90
  91     DECLARE_NO_COPY_CLASS(UnicodeTestCase)
  92 };
  93
  94 // register in the unnamed registry so that these tests are run by default
  95 CPPUNIT_TEST_SUITE_REGISTRATION( UnicodeTestCase );
  96
// also include in its own registry so that these tests can be run alone
  98 CPPUNIT_TEST_SUITE_NAMED_REGISTRATION( UnicodeTestCase, "UnicodeTestCase" );
  99
 100 UnicodeTestCase::UnicodeTestCase()
 101 {
 102 }
 103
 104 void UnicodeTestCase::ToFromAscii()
 105 {
 106
 107 #define TEST_TO_FROM_ASCII(txt)                              \
 108     {                                                        \
 109         static const char *msg = txt;                        \
 110         wxString s = wxString::FromAscii(msg);               \
 111         CPPUNIT_ASSERT( strcmp( s.ToAscii() , msg ) == 0 );  \
 112     }
 113
 114     TEST_TO_FROM_ASCII( "Hello, world!" );
 115     TEST_TO_FROM_ASCII( "additional \" special \t test \\ component \n :-)" );
 116 }
 117
 118 #if wxUSE_WCHAR_T
 119 void UnicodeTestCase::ConstructorsWithConversion()
 120 {
 121     // the string "Déjà" in UTF-8 and wchar_t:
 122     const unsigned char utf8Buf[] = {0x44,0xC3,0xA9,0x6A,0xC3,0xA0,0};
 123     const wchar_t wchar[] = {0x44,0xE9,0x6A,0xE0,0};
 124     const unsigned char utf8subBuf[] = {0x44,0xC3,0xA9,0x6A,0}; // just "Déj"
 125     const char *utf8 = (char *)utf8Buf;
 126     const char *utf8sub = (char *)utf8subBuf;
 127
 128     wxString s1(utf8, wxConvUTF8);
 129     wxString s2(wchar, wxConvUTF8);
 130
 131 #if wxUSE_UNICODE
 132     CPPUNIT_ASSERT( s1 == wchar );
 133     CPPUNIT_ASSERT( s2 == wchar );
 134 #else
 135     CPPUNIT_ASSERT( s1 == utf8 );
 136     CPPUNIT_ASSERT( s2 == utf8 );
 137 #endif
 138
 139     wxString sub(utf8sub, wxConvUTF8); // "Dej" substring
 140     wxString s3(utf8, wxConvUTF8, 4);
 141     wxString s4(wchar, wxConvUTF8, 3);
 142
 143     CPPUNIT_ASSERT( s3 == sub );
 144     CPPUNIT_ASSERT( s4 == sub );
 145
 146 #if wxUSE_UNICODE
 147     CPPUNIT_ASSERT ( wxString("\t[pl]open.format.Sformatuj dyskietkê=gfloppy %f",
 148                                wxConvUTF8) == wxT("") ); //should stop at pos 35
 149 #endif
 150
 151
 152     // test using Unicode strings together with char* strings (this must work
 153     // in ANSI mode as well, of course):
 154     wxString s5("ascii");
 155     CPPUNIT_ASSERT( s5 == "ascii" );
 156
 157     s5 += " value";
 158
 159     CPPUNIT_ASSERT( strcmp(s5.mb_str(), "ascii value") == 0 );
 160     CPPUNIT_ASSERT( s5 == "ascii value" );
 161     CPPUNIT_ASSERT( s5 != "SomethingElse" );
 162 }
 163
 164 void UnicodeTestCase::ConversionEmpty()
 165 {
 166     size_t len;
 167
 168 #if wxUSE_UNICODE
 169     wxCharBuffer buf = wxConvLibc.cWC2MB(L"", 0, &len);
 170 #else // !wxUSE_UNICODE
 171     wxWCharBuffer wbuf = wxConvLibc.cMB2WC("", 0, &len);
 172 #endif // wxUSE_UNICODE/!wxUSE_UNICODE
 173
 174     CPPUNIT_ASSERT(len == 0);
 175 }
 176
 177 void UnicodeTestCase::ConversionWithNULs()
 178 {
 179 #if wxUSE_UNICODE
 180     static const size_t lenNulString = 10;
 181
 182     wxString szTheString(L"The\0String", wxConvLibc, lenNulString);
 183     wxCharBuffer theBuffer = szTheString.mb_str();
 184
 185     CPPUNIT_ASSERT( memcmp(theBuffer.data(), "The\0String",
 186                     lenNulString + 1) == 0 );
 187
 188     wxString szTheString2("The\0String", wxConvLocal, lenNulString);
 189     CPPUNIT_ASSERT_EQUAL( lenNulString, szTheString2.length() );
 190     CPPUNIT_ASSERT( wxTmemcmp(szTheString2.c_str(), L"The\0String",
 191                     lenNulString + 1) == 0 );
 192 #else // !wxUSE_UNICODE
 193     wxString szTheString(wxT("TheString"));
 194     szTheString.insert(3, 1, '\0');
 195     wxWCharBuffer theBuffer = szTheString.wc_str(wxConvLibc);
 196
 197     CPPUNIT_ASSERT( memcmp(theBuffer.data(), L"The\0String", 11 * sizeof(wchar_t)) == 0 );
 198
 199     wxString szLocalTheString(wxT("TheString"));
 200     szLocalTheString.insert(3, 1, '\0');
 201     wxWCharBuffer theLocalBuffer = szLocalTheString.wc_str(wxConvLocal);
 202
 203     CPPUNIT_ASSERT( memcmp(theLocalBuffer.data(), L"The\0String", 11 * sizeof(wchar_t)) == 0 );
 204 #endif // wxUSE_UNICODE/!wxUSE_UNICODE
 205 }
 206
 207 void
 208 UnicodeTestCase::DoTestConversion(const char *s,
 209                                   const wchar_t *ws,
 210                                   wxMBConv& conv)
 211 {
 212 #if wxUSE_UNICODE
 213     if ( ws )
 214     {
 215         wxCharBuffer buf = conv.cWC2MB(ws, (size_t)-1, NULL);
 216
 217         CPPUNIT_ASSERT( strcmp(buf, s) == 0 );
 218     }
 219 #else // wxUSE_UNICODE
 220     if ( s )
 221     {
 222         wxWCharBuffer wbuf = conv.cMB2WC(s, (size_t)-1, NULL);
 223
 224         if ( ws )
 225         {
 226             CPPUNIT_ASSERT( wbuf.data() );
 227             CPPUNIT_ASSERT( wx_wcscmp(wbuf, ws) == 0 );
 228         }
 229         else // conversion is supposed to fail
 230         {
 231             CPPUNIT_ASSERT_EQUAL( (wchar_t *)NULL, wbuf.data() );
 232         }
 233     }
 234 #endif // wxUSE_UNICODE/!wxUSE_UNICODE
 235 }
 236
// An entry of the tables used by the conversion tests: a multibyte string and
// the wide string corresponding to it.
//
// The tables in this file use NULL wcs for the multibyte input which is not
// valid in the encoding being tested.
struct StringConversionData
{
    const char *str;        // the string in the tested multibyte encoding
    const wchar_t *wcs;     // the same string as wide characters or NULL
};
 242
 243 void UnicodeTestCase::ConversionUTF7()
 244 {
 245     static const StringConversionData utf7data[] =
 246     {
 247         { "+-", L"+" },
 248         { "+--", L"+-" },
 249
 250 #ifdef wxHAVE_U_ESCAPE
 251         { "+AKM-", L"\u00a3" },
 252 #endif // wxHAVE_U_ESCAPE
 253
 254         // the following are invalid UTF-7 sequences
 255         { "+", NULL },
 256         { "+~", NULL },
 257         { "a+", NULL },
 258     };
 259
 260     wxCSConv conv(_T("utf-7"));
 261     for ( size_t n = 0; n < WXSIZEOF(utf7data); n++ )
 262     {
 263         const StringConversionData& d = utf7data[n];
 264
 265         // converting to/from UTF-7 using iconv() currently doesn't work
 266         // because of several problems:
 267         //  - GetMBNulLen() doesn't return correct result (iconv converts L'\0'
 268         //    to an incomplete and anyhow nonsensical "+AA" string)
 269         //  - iconv refuses to convert "+-" (although it converts "+-\n" just
 270         //    fine, go figure)
 271         //
 272         // I have no idea how to fix this so just disable the test for now
 273 #if 0
 274         DoTestConversion(d.str, d.wcs, conv);
 275 #endif
 276         DoTestConversion(d.str, d.wcs, wxConvUTF7);
 277     }
 278 }
 279
 280 void UnicodeTestCase::ConversionUTF8()
 281 {
 282     static const StringConversionData utf8data[] =
 283     {
 284 #ifdef wxHAVE_U_ESCAPE
 285         { "\xc2\xa3", L"\u00a3" },
 286 #endif
 287         { "\xc2", NULL },
 288     };
 289
 290     wxCSConv conv(_T("utf-8"));
 291     for ( size_t n = 0; n < WXSIZEOF(utf8data); n++ )
 292     {
 293         const StringConversionData& d = utf8data[n];
 294         DoTestConversion(d.str, d.wcs, conv);
 295         DoTestConversion(d.str, d.wcs, wxConvUTF8);
 296     }
 297 }
 298
 299 void UnicodeTestCase::ConversionUTF16()
 300 {
 301     static const StringConversionData utf16data[] =
 302     {
 303 #ifdef wxHAVE_U_ESCAPE
 304         { "\x04\x1f\x04\x40\x04\x38\x04\x32\x04\x35\x04\x42\0\0",
 305           L"\u041f\u0440\u0438\u0432\u0435\u0442" },
 306         { "\x01\0\0b\x01\0\0a\x01\0\0r\0\0", L"\u0100b\u0100a\u0100r" },
 307 #endif
 308         { "\0f\0o\0o\0\0", L"foo" },
 309     };
 310
 311     wxCSConv conv(wxFONTENCODING_UTF16BE);
 312     for ( size_t n = 0; n < WXSIZEOF(utf16data); n++ )
 313     {
 314         const StringConversionData& d = utf16data[n];
 315         DoTestConversion(d.str, d.wcs, conv);
 316     }
 317
 318     // special case: this string has consecutive NULs inside it which don't
 319     // terminate the string, this exposed a bug in our conversion code which
 320     // got confused in this case
 321     size_t len;
 322     wxWCharBuffer wbuf(conv.cMB2WC("\x01\0\0B\0C" /* A macron BC */, 6, &len));
 323     CPPUNIT_ASSERT_EQUAL( (size_t)3, len );
 324 }
 325
 326 void UnicodeTestCase::ConversionUTF32()
 327 {
 328     static const StringConversionData utf32data[] =
 329     {
 330 #ifdef wxHAVE_U_ESCAPE
 331         {
 332             "\0\0\x04\x1f\0\0\x04\x40\0\0\x04\x38\0\0\x04\x32\0\0\x04\x35\0\0\x04\x42\0\0\0\0",
 333           L"\u041f\u0440\u0438\u0432\u0435\u0442" },
 334 #endif
 335         { "\0\0\0f\0\0\0o\0\0\0o\0\0\0\0", L"foo" },
 336     };
 337
 338     wxCSConv conv(wxFONTENCODING_UTF32BE);
 339     for ( size_t n = 0; n < WXSIZEOF(utf32data); n++ )
 340     {
 341         const StringConversionData& d = utf32data[n];
 342         DoTestConversion(d.str, d.wcs, conv);
 343     }
 344
 345     size_t len;
 346     wxWCharBuffer wbuf(conv.cMB2WC("\0\0\x01\0\0\0\0B\0\0\0C" /* A macron BC */,
 347                                    12, &len));
 348     CPPUNIT_ASSERT_EQUAL( (size_t)3, len );
 349 }
 350
 351 void UnicodeTestCase::IsConvOk()
 352 {
 353     CPPUNIT_ASSERT( wxCSConv(wxFONTENCODING_SYSTEM).IsOk() );
 354     CPPUNIT_ASSERT( wxCSConv(_T("UTF-8")).IsOk() );
 355     CPPUNIT_ASSERT( !wxCSConv(_T("NoSuchConversion")).IsOk() );
 356
 357 #ifdef __WINDOWS__
 358     CPPUNIT_ASSERT( wxCSConv(_T("WINDOWS-437")).IsOk() );
 359 #endif
 360 }
 361
 362 #endif // wxUSE_WCHAR_T
 363
 364 #if wxUSE_UNICODE
 365 void UnicodeTestCase::Iteration()
 366 {
 367     // "czech" in Czech ("cestina"):
 368     static const char *textUTF8 = "\304\215e\305\241tina";
 369     static const wchar_t textUTF16[] = {0x10D, 0x65, 0x161, 0x74, 0x69, 0x6E, 0x61, 0};
 370
 371     wxString text(wxString::FromUTF8(textUTF8));
 372     CPPUNIT_ASSERT( wxStrcmp(text.wc_str(), textUTF16) == 0 );
 373
 374     // verify the string was decoded correctly:
 375     {
 376         size_t idx = 0;
 377         for ( wxString::const_iterator i = text.begin(); i != text.end(); ++i, ++idx )
 378         {
 379             CPPUNIT_ASSERT( *i == textUTF16[idx] );
 380         }
 381     }
 382
 383     // overwrite the string with something that is shorter in UTF-8:
 384     {
 385         for ( wxString::iterator i = text.begin(); i != text.end(); ++i )
 386             *i = 'x';
 387     }
 388
 389     // restore the original text now:
 390     {
 391         wxString::iterator end1 = text.end();
 392         wxString::const_iterator end2 = text.end();
 393
 394         size_t idx = 0;
 395         for ( wxString::iterator i = text.begin(); i != text.end(); ++i, ++idx )
 396         {
 397             *i = textUTF16[idx];
 398
 399             CPPUNIT_ASSERT( end1 == text.end() );
 400             CPPUNIT_ASSERT( end2 == text.end() );
 401         }
 402
 403         CPPUNIT_ASSERT( end1 == text.end() );
 404         CPPUNIT_ASSERT( end2 == text.end() );
 405     }
 406
 407     // and verify it again:
 408     {
 409         size_t idx = 0;
 410         for ( wxString::const_iterator i = text.begin(); i != text.end(); ++i, ++idx )
 411         {
 412             CPPUNIT_ASSERT( *i == textUTF16[idx] );
 413         }
 414     }
 415 }
 416 #endif // wxUSE_UNICODE