tests/mbconv/mbconvtest.cpp

   1 ///////////////////////////////////////////////////////////////////////////////
// Name:        tests/mbconv/mbconvtest.cpp
   3 // Purpose:     wxMBConv unit test
   4 // Author:      Vadim Zeitlin, Mike Wetherell
   5 // Created:     14.02.04
   6 // RCS-ID:      $Id$
   7 // Copyright:   (c) 2003 TT-Solutions, (c) 2005 Mike Wetherell
   8 ///////////////////////////////////////////////////////////////////////////////
   9
  10 // ----------------------------------------------------------------------------
  11 // headers
  12 // ----------------------------------------------------------------------------
  13
  14 #include "testprec.h"
  15
  16 #ifdef __BORLANDC__
  17     #pragma hdrstop
  18 #endif
  19
  20 #ifndef WX_PRECOMP
  21     #include "wx/wx.h"
  22 #endif // WX_PRECOMP
  23
  24 #include "wx/strconv.h"
  25 #include "wx/string.h"
  26
  27 #if defined wxHAVE_TCHAR_SUPPORT && !defined HAVE_WCHAR_H
  28     #define HAVE_WCHAR_H
  29 #endif
  30
  31 // ----------------------------------------------------------------------------
  32 // Some wide character constants. "\uXXXX" escapes aren't supported by old
  33 // compilers such as VC++ 5 and g++ 2.95.
  34 // ----------------------------------------------------------------------------
  35
  36 wchar_t u41[] = { 0x41, 0 };
  37 wchar_t u7f[] = { 0x7f, 0 };
  38
  39 wchar_t u80[] = { 0x80, 0 };
  40 wchar_t u391[] = { 0x391, 0 };
  41 wchar_t u7ff[] = { 0x7ff, 0 };
  42
  43 wchar_t u800[] = { 0x800, 0 };
  44 wchar_t u2620[] = { 0x2620, 0 };
  45 wchar_t ufffd[] = { 0xfffd, 0 };
  46
  47 #if SIZEOF_WCHAR_T == 4
  48 wchar_t u10000[] = { 0x10000, 0 };
  49 wchar_t u1000a5[] = { 0x1000a5, 0 };
  50 wchar_t u10fffd[] = { 0x10fffd, 0 };
  51 #else
  52 wchar_t u10000[] = { 0xd800, 0xdc00, 0 };
  53 wchar_t u1000a5[] = { 0xdbc0, 0xdca5, 0 };
  54 wchar_t u10fffd[] = { 0xdbff, 0xdffd, 0 };
  55 #endif
  56
  57 // ----------------------------------------------------------------------------
  58 // test class
  59 // ----------------------------------------------------------------------------
  60
// CppUnit test case for wxMBConv: one wide-char to CP1250 check plus, when
// HAVE_WCHAR_H is defined, a set of UTF-8 conversion/roundtrip checks.
class MBConvTestCase : public CppUnit::TestCase
{
public:
    MBConvTestCase() { }

private:
    // List of the test methods making up the suite; every method named here
    // is declared further down. The UTF-8 tests are only listed (and only
    // declared) when HAVE_WCHAR_H is defined.
    CPPUNIT_TEST_SUITE( MBConvTestCase );
        CPPUNIT_TEST( WC2CP1250 );
#ifdef HAVE_WCHAR_H
        CPPUNIT_TEST( UTF8_41 );
        CPPUNIT_TEST( UTF8_7f );
        CPPUNIT_TEST( UTF8_80 );
        CPPUNIT_TEST( UTF8_c2_7f );
        CPPUNIT_TEST( UTF8_c2_80 );
        CPPUNIT_TEST( UTF8_ce_91 );
        CPPUNIT_TEST( UTF8_df_bf );
        CPPUNIT_TEST( UTF8_df_c0 );
        CPPUNIT_TEST( UTF8_e0_a0_7f );
        CPPUNIT_TEST( UTF8_e0_a0_80 );
        CPPUNIT_TEST( UTF8_e2_98_a0 );
        CPPUNIT_TEST( UTF8_ef_bf_bd );
        CPPUNIT_TEST( UTF8_ef_bf_c0 );
        CPPUNIT_TEST( UTF8_f0_90_80_7f );
        CPPUNIT_TEST( UTF8_f0_90_80_80 );
        CPPUNIT_TEST( UTF8_f4_8f_bf_bd );
        CPPUNIT_TEST( UTF8PUA_f4_80_82_a5 );
        CPPUNIT_TEST( UTF8Octal_backslash245 );
#endif // HAVE_WCHAR_H
    CPPUNIT_TEST_SUITE_END();

    // wide string -> CP1250 conversion through wxCSConv
    void WC2CP1250();

#ifdef HAVE_WCHAR_H
    // UTF-8 tests. Test the first, last and one in the middle for sequences
    // of each length. Each method is named after the bytes it feeds to the
    // converter; the second argument is the expected wide string, or NULL
    // when the byte sequence is expected to be rejected as invalid.

    // one-byte sequences; a lone "\x80" is expected to be invalid
    void UTF8_41() { UTF8("\x41", u41); }
    void UTF8_7f() { UTF8("\x7f", u7f); }
    void UTF8_80() { UTF8("\x80", NULL); }

    // two-byte sequences; the first and last ones have a second byte just
    // outside the 0x80..0xbf range and are expected to be invalid
    void UTF8_c2_7f() { UTF8("\xc2\x7f", NULL); }
    void UTF8_c2_80() { UTF8("\xc2\x80", u80); }
    void UTF8_ce_91() { UTF8("\xce\x91", u391); }
    void UTF8_df_bf() { UTF8("\xdf\xbf", u7ff); }
    void UTF8_df_c0() { UTF8("\xdf\xc0", NULL); }

    // three-byte sequences, with the same kind of out-of-range last byte in
    // the first and last tests
    void UTF8_e0_a0_7f() { UTF8("\xe0\xa0\x7f", NULL); }
    void UTF8_e0_a0_80() { UTF8("\xe0\xa0\x80", u800); }
    void UTF8_e2_98_a0() { UTF8("\xe2\x98\xa0", u2620); }
    void UTF8_ef_bf_bd() { UTF8("\xef\xbf\xbd", ufffd); }
    void UTF8_ef_bf_c0() { UTF8("\xef\xbf\xc0", NULL); }

    // four-byte sequences; the first one ends in 0x7f and is expected to be
    // invalid
    void UTF8_f0_90_80_7f() { UTF8("\xf0\x90\x80\x7f", NULL); }
    void UTF8_f0_90_80_80() { UTF8("\xf0\x90\x80\x80", u10000); }
    void UTF8_f4_8f_bf_bd() { UTF8("\xf4\x8f\xbf\xbd", u10fffd); }

    // test 'escaping the escape characters' for the two escaping schemes:
    // the input of the first test is the UTF-8 form of u1000a5, the input of
    // the second one is a backslash followed by the digits 245
    void UTF8PUA_f4_80_82_a5() { UTF8PUA("\xf4\x80\x82\xa5", u1000a5); }
    void UTF8Octal_backslash245() { UTF8Octal("\\245", L"\\245"); }

    // implementation for the utf-8 tests (see comments below): the two
    // argument functions call the three argument one once per
    // wxMBConvUTF8 option
    void UTF8(const char *charSequence, const wchar_t *wideSequence);
    void UTF8PUA(const char *charSequence, const wchar_t *wideSequence);
    void UTF8Octal(const char *charSequence, const wchar_t *wideSequence);
    void UTF8(const char *charSequence, const wchar_t *wideSequence, int option);
#endif // HAVE_WCHAR_H

    // NOTE(review): presumably disables copying of the test case, going by
    // the macro name -- its definition is not visible in this file
    DECLARE_NO_COPY_CLASS(MBConvTestCase)
};
 129
// register in the unnamed registry so that these tests are run by default
CPPUNIT_TEST_SUITE_REGISTRATION( MBConvTestCase );

// also include in its own registry, named "MBConvTestCase", so that these
// tests can be run alone
CPPUNIT_TEST_SUITE_NAMED_REGISTRATION( MBConvTestCase, "MBConvTestCase" );
 135
 136 void MBConvTestCase::WC2CP1250()
 137 {
 138     static const struct Data
 139     {
 140         const wchar_t *wc;
 141         const char *cp1250;
 142     } data[] =
 143     {
 144         { L"hello", "hello" },  // test that it works in simplest case
 145         { L"\xBD of \xBD is \xBC", NULL }, // this should fail as cp1250 doesn't have 1/2
 146     };
 147
 148     wxCSConv cs1250(wxFONTENCODING_CP1250);
 149     for ( size_t n = 0; n < WXSIZEOF(data); n++ )
 150     {
 151         const Data& d = data[n];
 152         if (d.cp1250)
 153         {
 154             CPPUNIT_ASSERT( strcmp(cs1250.cWC2MB(d.wc), d.cp1250) == 0 );
 155         }
 156         else
 157         {
 158             CPPUNIT_ASSERT( (const char*)cs1250.cWC2MB(d.wc) == NULL );
 159         }
 160     }
 161 }
 162
 163 // ----------------------------------------------------------------------------
 164 // UTF-8 tests
 165 // ----------------------------------------------------------------------------
 166
 167 #ifdef HAVE_WCHAR_H
 168
 169 // Check that 'charSequence' translates to 'wideSequence' and back.
 170 // Invalid sequences can be tested by giving NULL for 'wideSequence'. Even
 171 // invalid sequences should roundtrip when an option is given and this is
 172 // checked.
 173 //
 174 void MBConvTestCase::UTF8(const char *charSequence,
 175                           const wchar_t *wideSequence)
 176 {
 177     UTF8(charSequence, wideSequence, wxMBConvUTF8::MAP_INVALID_UTF8_NOT);
 178     UTF8(charSequence, wideSequence, wxMBConvUTF8::MAP_INVALID_UTF8_TO_PUA);
 179     UTF8(charSequence, wideSequence, wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL);
 180 }
 181
 182 // Use this alternative when 'charSequence' contains a PUA character. Such
 183 // sequences should still roundtrip ok, and this is checked.
 184 //
 185 void MBConvTestCase::UTF8PUA(const char *charSequence,
 186                              const wchar_t *wideSequence)
 187 {
 188     UTF8(charSequence, wideSequence, wxMBConvUTF8::MAP_INVALID_UTF8_NOT);
 189     UTF8(charSequence, NULL, wxMBConvUTF8::MAP_INVALID_UTF8_TO_PUA);
 190     UTF8(charSequence, wideSequence, wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL);
 191 }
 192
 193 // Use this alternative when 'charSequence' contains an octal escape sequence.
 194 // Such sequences should still roundtrip ok, and this is checked.
 195 //
 196 void MBConvTestCase::UTF8Octal(const char *charSequence,
 197                                const wchar_t *wideSequence)
 198 {
 199     UTF8(charSequence, wideSequence, wxMBConvUTF8::MAP_INVALID_UTF8_NOT);
 200     UTF8(charSequence, wideSequence, wxMBConvUTF8::MAP_INVALID_UTF8_TO_PUA);
 201     UTF8(charSequence, NULL, wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL);
 202 }
 203
 204 // in case wcscpy is missing
 205 //
 206 static wchar_t *wx_wcscpy(wchar_t *dest, const wchar_t *src)
 207 {
 208     wchar_t *d = dest;
 209     while ((*d++ = *src++) != 0)
 210         ;
 211     return dest;
 212 }
 213
 214 // in case wcscat is missing
 215 //
 216 static wchar_t *wx_wcscat(wchar_t *dest, const wchar_t *src)
 217 {
 218     wchar_t *d = dest;
 219     while (*d)
 220         d++;
 221     while ((*d++ = *src++) != 0)
 222         ;
 223     return dest;
 224 }
 225
// include the option in the error messages so it's possible to see which
// test failed: the message is the text of the asserted expression followed
// by 'errmsg', which must be a variable in scope wherever the macro is used
// (the test implementation below declares it as a std::string)
#define UTF8ASSERT(expr) CPPUNIT_ASSERT_MESSAGE(#expr + errmsg,  expr)
 229
// The test implementation
//
// Builds a multibyte string containing 'charSequence' three times, converts
// it to wide characters with a wxMBConvUTF8 created with 'option', compares
// the result with the same pattern built from 'wideSequence' and finally
// converts it back and compares with the original bytes.
//
// charSequence: the bytes to test; must be short enough for three copies
//               plus the separators to fit in the local buffers
// wideSequence: the expected wide form of 'charSequence', or NULL if there is
//               none to compare with (see the else branch below)
// option:       passed to the wxMBConvUTF8 constructor
//
void MBConvTestCase::UTF8(const char *charSequence,
                          const wchar_t *wideSequence,
                          int option)
{
    const size_t BUFSIZE = 128;
    // three copies of the sequence plus "ABC", "XYZ" and the terminator must
    // fit in 'bytes'
    wxASSERT(strlen(charSequence) * 3 + 10 < BUFSIZE);
    char bytes[BUFSIZE];

    // include the option in the error messages so it's possible to see
    // which test failed ('bytes' is only borrowed as scratch space here, it
    // is overwritten with the test data just below; 'errmsg' is picked up by
    // UTF8ASSERT)
    sprintf(bytes, " (with option == %d)", option);
    std::string errmsg(bytes);

    // put the charSequence at the start, middle and end of a string
    strcpy(bytes, charSequence);
    strcat(bytes, "ABC");
    strcat(bytes, charSequence);
    strcat(bytes, "XYZ");
    strcat(bytes, charSequence);

    // translate it into wide characters: once with a NULL destination and
    // zero size, once into the real buffer; both calls must return the same
    // value
    wxMBConvUTF8 utf8(option);
    wchar_t widechars[BUFSIZE];
    size_t lenResult = utf8.MB2WC(NULL, bytes, 0);
    size_t result = utf8.MB2WC(widechars, bytes, BUFSIZE);
    UTF8ASSERT(result == lenResult);

    // check we got the expected result
    if (wideSequence) {
        // (size_t)-1 is the value checked for as the sign of a failed
        // conversion throughout this function
        UTF8ASSERT(result != (size_t)-1);
        wxASSERT(result < BUFSIZE);

        // build the expected wide string with the same start/middle/end
        // pattern as 'bytes'
        wchar_t expected[BUFSIZE];
        wx_wcscpy(expected, wideSequence);
        wx_wcscat(expected, L"ABC");
        wx_wcscat(expected, wideSequence);
        wx_wcscat(expected, L"XYZ");
        wx_wcscat(expected, wideSequence);

        // same contents, and the returned value is the length of the result
        UTF8ASSERT(wcscmp(widechars, expected) == 0);
        UTF8ASSERT(wcslen(widechars) == result);
    }
    else {
        // If 'wideSequence' is NULL, then the result is expected to be
        // invalid.  Normally that is as far as we can go, but if there is an
        // option then the conversion should succeed anyway, and it should be
        // possible to translate back to the original
        //
        // NOTE(review): '!option' presumably relies on
        // wxMBConvUTF8::MAP_INVALID_UTF8_NOT being zero -- confirm against
        // the wxMBConvUTF8 declaration
        if (!option) {
            UTF8ASSERT(result == (size_t)-1);
            return;
        }
        else {
            UTF8ASSERT(result != (size_t)-1);
        }
    }

    // translate it back and check we get the original; as above the
    // conversion is done twice, first with a NULL destination and zero size
    char bytesAgain[BUFSIZE];
    size_t lenResultAgain = utf8.WC2MB(NULL, widechars, 0);
    size_t resultAgain = utf8.WC2MB(bytesAgain, widechars, BUFSIZE);
    UTF8ASSERT(resultAgain == lenResultAgain);
    UTF8ASSERT(resultAgain != (size_t)-1);
    wxASSERT(resultAgain < BUFSIZE);

    // same bytes as we started with, and the returned value is their length
    UTF8ASSERT(strcmp(bytes, bytesAgain) == 0);
    UTF8ASSERT(strlen(bytesAgain) == resultAgain);
}
 299
 300 #endif // HAVE_WCHAR_H