tests/mbconv/mbconvtest.cpp

   1 ///////////////////////////////////////////////////////////////////////////////
// Name:        tests/mbconv/mbconvtest.cpp
   3 // Purpose:     wxMBConv unit test
   4 // Author:      Vadim Zeitlin, Mike Wetherell
   5 // Created:     14.02.04
   6 // RCS-ID:      $Id$
   7 // Copyright:   (c) 2003 TT-Solutions, (c) 2005 Mike Wetherell
   8 ///////////////////////////////////////////////////////////////////////////////
   9
  10 // ----------------------------------------------------------------------------
  11 // headers
  12 // ----------------------------------------------------------------------------
  13
  14 #include "testprec.h"
  15
  16 #ifdef __BORLANDC__
  17     #pragma hdrstop
  18 #endif
  19
  20 #ifndef WX_PRECOMP
  21     #include "wx/wx.h"
  22 #endif // WX_PRECOMP
  23
  24 #include "wx/strconv.h"
  25 #include "wx/string.h"
  26
  27 #if defined wxHAVE_TCHAR_SUPPORT && !defined HAVE_WCHAR_H
  28     #define HAVE_WCHAR_H
  29 #endif
  30
  31 // ----------------------------------------------------------------------------
  32 // Some wide character constants. "\uXXXX" escapes aren't supported by old
  33 // compilers such as VC++ 5 and g++ 2.95.
  34 // ----------------------------------------------------------------------------
  35
// Code points that UTF-8 encodes in a single byte (see the UTF8_41 and
// UTF8_7f tests).
wchar_t u41[] = { 0x41, 0 };
wchar_t u7f[] = { 0x7f, 0 };

// Code points that UTF-8 encodes in two bytes: first, one in the middle and
// last of the range exercised by the UTF8_c2_80 .. UTF8_df_bf tests.
wchar_t u80[] = { 0x80, 0 };
wchar_t u391[] = { 0x391, 0 };
wchar_t u7ff[] = { 0x7ff, 0 };

// Code points that UTF-8 encodes in three bytes (UTF8_e0_a0_80 ..
// UTF8_ef_bf_bd tests).
wchar_t u800[] = { 0x800, 0 };
wchar_t u2620[] = { 0x2620, 0 };
wchar_t ufffd[] = { 0xfffd, 0 };

// Code points above 0xffff, used by the four byte UTF-8 tests. With a 4 byte
// wchar_t each one fits in a single element; otherwise it is spelt as a pair
// of 16 bit units (a UTF-16 surrogate pair).
#if SIZEOF_WCHAR_T == 4
wchar_t u10000[] = { 0x10000, 0 };
wchar_t u1000a5[] = { 0x1000a5, 0 };
wchar_t u10fffd[] = { 0x10fffd, 0 };
#else
wchar_t u10000[] = { 0xd800, 0xdc00, 0 };
wchar_t u1000a5[] = { 0xdbc0, 0xdca5, 0 };
wchar_t u10fffd[] = { 0xdbff, 0xdffd, 0 };
#endif
  56
  57 // ----------------------------------------------------------------------------
  58 // test class
  59 // ----------------------------------------------------------------------------
  60
// CppUnit test case exercising wxMBConv classes.
//
// It always runs the CP1250 conversion test and, when HAVE_WCHAR_H is
// defined, a set of UTF-8 tests. Each UTF-8 test is named after the byte
// sequence it feeds to the converter and is a one-line wrapper around one of
// the UTF8(), UTF8PUA() or UTF8Octal() helpers declared at the end of the
// class.
class MBConvTestCase : public CppUnit::TestCase
{
public:
    MBConvTestCase() { }

private:
    // the list of tests making up the suite; every name listed here must
    // have a matching member function below
    CPPUNIT_TEST_SUITE( MBConvTestCase );
        CPPUNIT_TEST( WC2CP1250 );
#ifdef HAVE_WCHAR_H
        CPPUNIT_TEST( UTF8_41 );
        CPPUNIT_TEST( UTF8_7f );
        CPPUNIT_TEST( UTF8_80 );
        CPPUNIT_TEST( UTF8_c2_7f );
        CPPUNIT_TEST( UTF8_c2_80 );
        CPPUNIT_TEST( UTF8_ce_91 );
        CPPUNIT_TEST( UTF8_df_bf );
        CPPUNIT_TEST( UTF8_df_c0 );
        CPPUNIT_TEST( UTF8_e0_a0_7f );
        CPPUNIT_TEST( UTF8_e0_a0_80 );
        CPPUNIT_TEST( UTF8_e2_98_a0 );
        CPPUNIT_TEST( UTF8_ef_bf_bd );
        CPPUNIT_TEST( UTF8_ef_bf_c0 );
        CPPUNIT_TEST( UTF8_f0_90_80_7f );
        CPPUNIT_TEST( UTF8_f0_90_80_80 );
        CPPUNIT_TEST( UTF8_f4_8f_bf_bd );
        CPPUNIT_TEST( UTF8PUA_f4_80_82_a5 );
        CPPUNIT_TEST( UTF8Octal_backslash245 );
#endif // HAVE_WCHAR_H
    CPPUNIT_TEST_SUITE_END();

    // wide characters to CP1250 conversion using wxCSConv
    void WC2CP1250();

#ifdef HAVE_WCHAR_H
    // UTF-8 tests. Test the first, last and one in the middle for sequences
    // of each length
    //
    // The second argument is the expected wide character string; NULL means
    // that the byte sequence is expected to be rejected as invalid UTF-8.

    // one byte sequences
    void UTF8_41() { UTF8("\x41", u41); }
    void UTF8_7f() { UTF8("\x7f", u7f); }
    void UTF8_80() { UTF8("\x80", NULL); }

    // two byte sequences, including ones with a bad second byte
    void UTF8_c2_7f() { UTF8("\xc2\x7f", NULL); }
    void UTF8_c2_80() { UTF8("\xc2\x80", u80); }
    void UTF8_ce_91() { UTF8("\xce\x91", u391); }
    void UTF8_df_bf() { UTF8("\xdf\xbf", u7ff); }
    void UTF8_df_c0() { UTF8("\xdf\xc0", NULL); }

    // three byte sequences, including ones with a bad last byte
    void UTF8_e0_a0_7f() { UTF8("\xe0\xa0\x7f", NULL); }
    void UTF8_e0_a0_80() { UTF8("\xe0\xa0\x80", u800); }
    void UTF8_e2_98_a0() { UTF8("\xe2\x98\xa0", u2620); }
    void UTF8_ef_bf_bd() { UTF8("\xef\xbf\xbd", ufffd); }
    void UTF8_ef_bf_c0() { UTF8("\xef\xbf\xc0", NULL); }

    // four byte sequences, including one with a bad last byte
    void UTF8_f0_90_80_7f() { UTF8("\xf0\x90\x80\x7f", NULL); }
    void UTF8_f0_90_80_80() { UTF8("\xf0\x90\x80\x80", u10000); }
    void UTF8_f4_8f_bf_bd() { UTF8("\xf4\x8f\xbf\xbd", u10fffd); }

    // test 'escaping the escape characters' for the two escaping schemes
    void UTF8PUA_f4_80_82_a5() { UTF8PUA("\xf4\x80\x82\xa5", u1000a5); }
    void UTF8Octal_backslash245() { UTF8Octal("\\245", L"\\245"); }

    // implementation for the utf-8 tests (see comments below)
    //
    // The two argument overloads run the three argument UTF8() once for each
    // of the wxMBConvUTF8 MAP_INVALID_UTF8_XXX options.
    void UTF8(const char *charSequence, const wchar_t *wideSequence);
    void UTF8PUA(const char *charSequence, const wchar_t *wideSequence);
    void UTF8Octal(const char *charSequence, const wchar_t *wideSequence);
    void UTF8(const char *charSequence, const wchar_t *wideSequence, int option);
#endif // HAVE_WCHAR_H

    // NOTE(review): presumably disables copying of the test case, as the
    // macro name suggests -- its definition is not visible here
    DECLARE_NO_COPY_CLASS(MBConvTestCase)
};
 129
// register in the unnamed registry so that these tests are run by default
CPPUNIT_TEST_SUITE_REGISTRATION( MBConvTestCase );

// also include in its own registry so that these tests can be run alone
CPPUNIT_TEST_SUITE_NAMED_REGISTRATION( MBConvTestCase, "MBConvTestCase" );
 135
 136 void MBConvTestCase::WC2CP1250()
 137 {
 138     static const struct Data
 139     {
 140         const wchar_t *wc;
 141         const char *cp1250;
 142     } data[] =
 143     {
 144         { L"hello", "hello" },  // test that it works in simplest case
 145         { L"\xBD of \xBD is \xBC", NULL }, // this should fail as cp1250 doesn't have 1/2
 146     };
 147
 148     wxCSConv cs1250(wxFONTENCODING_CP1250);
 149     for ( size_t n = 0; n < WXSIZEOF(data); n++ )
 150     {
 151         const Data& d = data[n];
 152         if (d.cp1250)
 153         {
 154             CPPUNIT_ASSERT( strcmp(cs1250.cWC2MB(d.wc), d.cp1250) == 0 );
 155         }
 156         else
 157         {
 158             CPPUNIT_ASSERT( (const char*)cs1250.cWC2MB(d.wc) == NULL );
 159         }
 160     }
 161 }
 162
 163 // ----------------------------------------------------------------------------
 164 // UTF-8 tests
 165 // ----------------------------------------------------------------------------
 166
 167 #ifdef HAVE_WCHAR_H
 168
 169 // Check that 'charSequence' translates to 'wideSequence' and back.
 170 // Invalid sequences can be tested by giving NULL for 'wideSequence'. Even
 171 // invalid sequences should roundtrip when an option is given and this is
 172 // checked.
 173 //
 174 void MBConvTestCase::UTF8(const char *charSequence,
 175                           const wchar_t *wideSequence)
 176 {
 177     UTF8(charSequence, wideSequence, wxMBConvUTF8::MAP_INVALID_UTF8_NOT);
 178     UTF8(charSequence, wideSequence, wxMBConvUTF8::MAP_INVALID_UTF8_TO_PUA);
 179     UTF8(charSequence, wideSequence, wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL);
 180 }
 181
 182 // Use this alternative when 'charSequence' contains a PUA character. Such
 183 // sequences should still roundtrip ok, and this is checked.
 184 //
 185 void MBConvTestCase::UTF8PUA(const char *charSequence,
 186                              const wchar_t *wideSequence)
 187 {
 188     UTF8(charSequence, wideSequence, wxMBConvUTF8::MAP_INVALID_UTF8_NOT);
 189     UTF8(charSequence, NULL, wxMBConvUTF8::MAP_INVALID_UTF8_TO_PUA);
 190     UTF8(charSequence, wideSequence, wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL);
 191 }
 192
 193 // Use this alternative when 'charSequence' contains an octal escape sequence.
 194 // Such sequences should still roundtrip ok, and this is checked.
 195 //
 196 void MBConvTestCase::UTF8Octal(const char *charSequence,
 197                                const wchar_t *wideSequence)
 198 {
 199     UTF8(charSequence, wideSequence, wxMBConvUTF8::MAP_INVALID_UTF8_NOT);
 200     UTF8(charSequence, wideSequence, wxMBConvUTF8::MAP_INVALID_UTF8_TO_PUA);
 201     UTF8(charSequence, NULL, wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL);
 202 }
 203
// include the option in the error messages so it's possible to see which
// test failed
//
// The message is the text of the asserted expression followed by 'errmsg',
// so this macro can only be used where a std::string called 'errmsg' is in
// scope. Because the expression is turned into a string, renaming a variable
// used in it also changes the failure message.
#define UTF8ASSERT(expr) CPPUNIT_ASSERT_MESSAGE(#expr + errmsg,  expr)
 207
// The test implementation
//
// Builds a byte string containing 'charSequence' three times (at the start,
// in the middle and at the end), converts it to wide characters using
// wxMBConvUTF8 constructed with 'option' and then converts the result back.
//
//  charSequence    the bytes to test
//  wideSequence    the wide characters 'charSequence' should convert to, or
//                  NULL if no particular result is expected (see below)
//  option          passed to the wxMBConvUTF8 constructor; the callers in
//                  this file pass one of the MAP_INVALID_UTF8_XXX values
//
// With a NULL 'wideSequence' and a zero 'option' the conversion is expected
// to fail and nothing else is checked. In all other cases the conversion
// must succeed and converting back must reproduce the original bytes.
//
void MBConvTestCase::UTF8(const char *charSequence,
                          const wchar_t *wideSequence,
                          int option)
{
    const size_t BUFSIZE = 128;
    // 'bytes' has to hold three copies of the sequence plus "ABC", "XYZ" and
    // the terminating NUL
    wxASSERT(strlen(charSequence) * 3 + 10 < BUFSIZE);
    char bytes[BUFSIZE];

    // include the option in the error messages so it's possible to see
    // which test failed
    //
    // 'bytes' is only used as scratch space here, it is overwritten with the
    // real test data just below
    sprintf(bytes, " (with option == %d)", option);
    std::string errmsg(bytes);

    // put the charSequence at the start, middle and end of a string
    strcpy(bytes, charSequence);
    strcat(bytes, "ABC");
    strcat(bytes, charSequence);
    strcat(bytes, "XYZ");
    strcat(bytes, charSequence);

    // translate it into wide characters
    //
    // MB2WC() is called twice, first with a NULL buffer (presumably just
    // computing the length) and then for real: both calls must return the
    // same value
    wxMBConvUTF8 utf8(option);
    wchar_t widechars[BUFSIZE];
    size_t lenResult = utf8.MB2WC(NULL, bytes, 0);
    size_t result = utf8.MB2WC(widechars, bytes, BUFSIZE);
    UTF8ASSERT(result == lenResult);

    // check we got the expected result
    if (wideSequence) {
        // (size_t)-1 is the value signalling a failed conversion
        UTF8ASSERT(result != (size_t)-1);
        wxASSERT(result < BUFSIZE);

        // build the expected string in the same way as 'bytes' above
        wchar_t expected[BUFSIZE];
        wcscpy(expected, wideSequence);
        wcscat(expected, L"ABC");
        wcscat(expected, wideSequence);
        wcscat(expected, L"XYZ");
        wcscat(expected, wideSequence);

        UTF8ASSERT(wcscmp(widechars, expected) == 0);
        // the returned length must not include the terminating NUL
        UTF8ASSERT(wcslen(widechars) == result);
    }
    else {
        // If 'wideSequence' is NULL, then the result is expected to be
        // invalid.  Normally that is as far as we can go, but if there is an
        // option then the conversion should succeed anyway, and it should be
        // possible to translate back to the original
        if (!option) {
            UTF8ASSERT(result == (size_t)-1);
            // 'widechars' holds nothing usable after a failed conversion
            return;
        }
        else {
            UTF8ASSERT(result != (size_t)-1);
        }
    }

    // translate it back and check we get the original
    //
    // as above, the call with a NULL buffer and the real one must agree
    char bytesAgain[BUFSIZE];
    size_t lenResultAgain = utf8.WC2MB(NULL, widechars, 0);
    size_t resultAgain = utf8.WC2MB(bytesAgain, widechars, BUFSIZE);
    UTF8ASSERT(resultAgain == lenResultAgain);
    UTF8ASSERT(resultAgain != (size_t)-1);
    wxASSERT(resultAgain < BUFSIZE);

    UTF8ASSERT(strcmp(bytes, bytesAgain) == 0);
    UTF8ASSERT(strlen(bytesAgain) == resultAgain);
}
 277
 278 #endif // HAVE_WCHAR_H