| 1 | /////////////////////////////////////////////////////////////////////////////// |
| 2 | // Name: tests/mbconv/main.cpp |
| 3 | // Purpose: wxMBConv unit test |
| 4 | // Author: Vadim Zeitlin, Mike Wetherell |
| 5 | // Created: 14.02.04 |
| 6 | // RCS-ID: $Id$ |
| 7 | // Copyright: (c) 2003 TT-Solutions, (c) 2005 Mike Wetherell |
| 8 | /////////////////////////////////////////////////////////////////////////////// |
| 9 | |
| 10 | // ---------------------------------------------------------------------------- |
| 11 | // headers |
| 12 | // ---------------------------------------------------------------------------- |
| 13 | |
| 14 | #include "testprec.h" |
| 15 | |
| 16 | #ifdef __BORLANDC__ |
| 17 | #pragma hdrstop |
| 18 | #endif |
| 19 | |
| 20 | #ifndef WX_PRECOMP |
| 21 | #include "wx/wx.h" |
| 22 | #endif // WX_PRECOMP |
| 23 | |
| 24 | #include "wx/strconv.h" |
| 25 | #include "wx/string.h" |
| 26 | |
| 27 | #if defined wxHAVE_TCHAR_SUPPORT && !defined HAVE_WCHAR_H |
| 28 | #define HAVE_WCHAR_H |
| 29 | #endif |
| 30 | |
| 31 | // ---------------------------------------------------------------------------- |
| 32 | // Some wide character constants. "\uXXXX" escapes aren't supported by old |
| 33 | // compilers such as VC++ 5 and g++ 2.95. |
| 34 | // ---------------------------------------------------------------------------- |
| 35 | |
// Code points below 0x80.
wchar_t u41[] = { 0x0041, 0 };
wchar_t u7f[] = { 0x007f, 0 };

// Code points in the range 0x80..0x7ff.
wchar_t u80[] = { 0x0080, 0 };
wchar_t u391[] = { 0x0391, 0 };
wchar_t u7ff[] = { 0x07ff, 0 };

// Code points in the range 0x800..0xffff.
wchar_t u800[] = { 0x0800, 0 };
wchar_t u2620[] = { 0x2620, 0 };
wchar_t ufffd[] = { 0xfffd, 0 };

// Code points above 0xffff. Unless wchar_t is known to be 4 bytes wide they
// are spelled as a UTF-16 surrogate pair; otherwise the code point is stored
// directly in a single element.
#if SIZEOF_WCHAR_T != 4
wchar_t u10000[] = { 0xd800, 0xdc00, 0 };
wchar_t u1000a5[] = { 0xdbc0, 0xdca5, 0 };
wchar_t u10fffd[] = { 0xdbff, 0xdffd, 0 };
#else
wchar_t u10000[] = { 0x010000, 0 };
wchar_t u1000a5[] = { 0x1000a5, 0 };
wchar_t u10fffd[] = { 0x10fffd, 0 };
#endif
| 56 | |
| 57 | // ---------------------------------------------------------------------------- |
| 58 | // test class |
| 59 | // ---------------------------------------------------------------------------- |
| 60 | |
| 61 | class MBConvTestCase : public CppUnit::TestCase |
| 62 | { |
| 63 | public: |
| 64 | MBConvTestCase() { } |
| 65 | |
| 66 | private: |
| 67 | CPPUNIT_TEST_SUITE( MBConvTestCase ); |
| 68 | CPPUNIT_TEST( WC2CP1250 ); |
| 69 | #ifdef HAVE_WCHAR_H |
| 70 | CPPUNIT_TEST( UTF8_41 ); |
| 71 | CPPUNIT_TEST( UTF8_7f ); |
| 72 | CPPUNIT_TEST( UTF8_80 ); |
| 73 | CPPUNIT_TEST( UTF8_c2_7f ); |
| 74 | CPPUNIT_TEST( UTF8_c2_80 ); |
| 75 | CPPUNIT_TEST( UTF8_ce_91 ); |
| 76 | CPPUNIT_TEST( UTF8_df_bf ); |
| 77 | CPPUNIT_TEST( UTF8_df_c0 ); |
| 78 | CPPUNIT_TEST( UTF8_e0_a0_7f ); |
| 79 | CPPUNIT_TEST( UTF8_e0_a0_80 ); |
| 80 | CPPUNIT_TEST( UTF8_e2_98_a0 ); |
| 81 | CPPUNIT_TEST( UTF8_ef_bf_bd ); |
| 82 | CPPUNIT_TEST( UTF8_ef_bf_c0 ); |
| 83 | CPPUNIT_TEST( UTF8_f0_90_80_7f ); |
| 84 | CPPUNIT_TEST( UTF8_f0_90_80_80 ); |
| 85 | CPPUNIT_TEST( UTF8_f4_8f_bf_bd ); |
| 86 | CPPUNIT_TEST( UTF8PUA_f4_80_82_a5 ); |
| 87 | CPPUNIT_TEST( UTF8Octal_backslash245 ); |
| 88 | #endif // HAVE_WCHAR_H |
| 89 | CPPUNIT_TEST_SUITE_END(); |
| 90 | |
| 91 | void WC2CP1250(); |
| 92 | |
| 93 | #ifdef HAVE_WCHAR_H |
| 94 | // UTF-8 tests. Test the first, last and one in the middle for sequences |
| 95 | // of each length |
| 96 | void UTF8_41() { UTF8("\x41", u41); } |
| 97 | void UTF8_7f() { UTF8("\x7f", u7f); } |
| 98 | void UTF8_80() { UTF8("\x80", NULL); } |
| 99 | |
| 100 | void UTF8_c2_7f() { UTF8("\xc2\x7f", NULL); } |
| 101 | void UTF8_c2_80() { UTF8("\xc2\x80", u80); } |
| 102 | void UTF8_ce_91() { UTF8("\xce\x91", u391); } |
| 103 | void UTF8_df_bf() { UTF8("\xdf\xbf", u7ff); } |
| 104 | void UTF8_df_c0() { UTF8("\xdf\xc0", NULL); } |
| 105 | |
| 106 | void UTF8_e0_a0_7f() { UTF8("\xe0\xa0\x7f", NULL); } |
| 107 | void UTF8_e0_a0_80() { UTF8("\xe0\xa0\x80", u800); } |
| 108 | void UTF8_e2_98_a0() { UTF8("\xe2\x98\xa0", u2620); } |
| 109 | void UTF8_ef_bf_bd() { UTF8("\xef\xbf\xbd", ufffd); } |
| 110 | void UTF8_ef_bf_c0() { UTF8("\xef\xbf\xc0", NULL); } |
| 111 | |
| 112 | void UTF8_f0_90_80_7f() { UTF8("\xf0\x90\x80\x7f", NULL); } |
| 113 | void UTF8_f0_90_80_80() { UTF8("\xf0\x90\x80\x80", u10000); } |
| 114 | void UTF8_f4_8f_bf_bd() { UTF8("\xf4\x8f\xbf\xbd", u10fffd); } |
| 115 | |
| 116 | // test 'escaping the escape characters' for the two escaping schemes |
| 117 | void UTF8PUA_f4_80_82_a5() { UTF8PUA("\xf4\x80\x82\xa5", u1000a5); } |
| 118 | void UTF8Octal_backslash245() { UTF8Octal("\\245", L"\\245"); } |
| 119 | |
| 120 | // implementation for the utf-8 tests (see comments below) |
| 121 | void UTF8(const char *charSequence, const wchar_t *wideSequence); |
| 122 | void UTF8PUA(const char *charSequence, const wchar_t *wideSequence); |
| 123 | void UTF8Octal(const char *charSequence, const wchar_t *wideSequence); |
| 124 | void UTF8(const char *charSequence, const wchar_t *wideSequence, int option); |
| 125 | #endif // HAVE_WCHAR_H |
| 126 | |
| 127 | DECLARE_NO_COPY_CLASS(MBConvTestCase) |
| 128 | }; |
| 129 | |
| 130 | // register in the unnamed registry so that these tests are run by default |
| 131 | CPPUNIT_TEST_SUITE_REGISTRATION( MBConvTestCase ); |
| 132 | |
| 133 | // also include in its own registry so that these tests can be run alone |
| 134 | CPPUNIT_TEST_SUITE_NAMED_REGISTRATION( MBConvTestCase, "MBConvTestCase" ); |
| 135 | |
| 136 | void MBConvTestCase::WC2CP1250() |
| 137 | { |
| 138 | static const struct Data |
| 139 | { |
| 140 | const wchar_t *wc; |
| 141 | const char *cp1250; |
| 142 | } data[] = |
| 143 | { |
| 144 | { L"hello", "hello" }, // test that it works in simplest case |
| 145 | { L"\xBD of \xBD is \xBC", NULL }, // this should fail as cp1250 doesn't have 1/2 |
| 146 | }; |
| 147 | |
| 148 | wxCSConv cs1250(wxFONTENCODING_CP1250); |
| 149 | for ( size_t n = 0; n < WXSIZEOF(data); n++ ) |
| 150 | { |
| 151 | const Data& d = data[n]; |
| 152 | if (d.cp1250) |
| 153 | { |
| 154 | CPPUNIT_ASSERT( strcmp(cs1250.cWC2MB(d.wc), d.cp1250) == 0 ); |
| 155 | } |
| 156 | else |
| 157 | { |
| 158 | CPPUNIT_ASSERT( (const char*)cs1250.cWC2MB(d.wc) == NULL ); |
| 159 | } |
| 160 | } |
| 161 | } |
| 162 | |
| 163 | // ---------------------------------------------------------------------------- |
| 164 | // UTF-8 tests |
| 165 | // ---------------------------------------------------------------------------- |
| 166 | |
| 167 | #ifdef HAVE_WCHAR_H |
| 168 | |
| 169 | // Check that 'charSequence' translates to 'wideSequence' and back. |
| 170 | // Invalid sequences can be tested by giving NULL for 'wideSequence'. Even |
| 171 | // invalid sequences should roundtrip when an option is given and this is |
| 172 | // checked. |
| 173 | // |
| 174 | void MBConvTestCase::UTF8(const char *charSequence, |
| 175 | const wchar_t *wideSequence) |
| 176 | { |
| 177 | UTF8(charSequence, wideSequence, wxMBConvUTF8::MAP_INVALID_UTF8_NOT); |
| 178 | UTF8(charSequence, wideSequence, wxMBConvUTF8::MAP_INVALID_UTF8_TO_PUA); |
| 179 | UTF8(charSequence, wideSequence, wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL); |
| 180 | } |
| 181 | |
| 182 | // Use this alternative when 'charSequence' contains a PUA character. Such |
| 183 | // sequences should still roundtrip ok, and this is checked. |
| 184 | // |
| 185 | void MBConvTestCase::UTF8PUA(const char *charSequence, |
| 186 | const wchar_t *wideSequence) |
| 187 | { |
| 188 | UTF8(charSequence, wideSequence, wxMBConvUTF8::MAP_INVALID_UTF8_NOT); |
| 189 | UTF8(charSequence, NULL, wxMBConvUTF8::MAP_INVALID_UTF8_TO_PUA); |
| 190 | UTF8(charSequence, wideSequence, wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL); |
| 191 | } |
| 192 | |
| 193 | // Use this alternative when 'charSequence' contains an octal escape sequence. |
| 194 | // Such sequences should still roundtrip ok, and this is checked. |
| 195 | // |
| 196 | void MBConvTestCase::UTF8Octal(const char *charSequence, |
| 197 | const wchar_t *wideSequence) |
| 198 | { |
| 199 | UTF8(charSequence, wideSequence, wxMBConvUTF8::MAP_INVALID_UTF8_NOT); |
| 200 | UTF8(charSequence, wideSequence, wxMBConvUTF8::MAP_INVALID_UTF8_TO_PUA); |
| 201 | UTF8(charSequence, NULL, wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL); |
| 202 | } |
| 203 | |
| 204 | // include the option in the error messages so it's possible to see which |
| 205 | // test failed |
| 206 | #define UTF8ASSERT(expr) CPPUNIT_ASSERT_MESSAGE(#expr + errmsg, expr) |
| 207 | |
| 208 | // The test implementation |
| 209 | // |
| 210 | void MBConvTestCase::UTF8(const char *charSequence, |
| 211 | const wchar_t *wideSequence, |
| 212 | int option) |
| 213 | { |
| 214 | const size_t BUFSIZE = 128; |
| 215 | wxASSERT(strlen(charSequence) * 3 + 10 < BUFSIZE); |
| 216 | char bytes[BUFSIZE]; |
| 217 | |
| 218 | // include the option in the error messages so it's possible to see |
| 219 | // which test failed |
| 220 | sprintf(bytes, " (with option == %d)", option); |
| 221 | std::string errmsg(bytes); |
| 222 | |
| 223 | // put the charSequence at the start, middle and end of a string |
| 224 | strcpy(bytes, charSequence); |
| 225 | strcat(bytes, "ABC"); |
| 226 | strcat(bytes, charSequence); |
| 227 | strcat(bytes, "XYZ"); |
| 228 | strcat(bytes, charSequence); |
| 229 | |
| 230 | // translate it into wide characters |
| 231 | wxMBConvUTF8 utf8(option); |
| 232 | wchar_t widechars[BUFSIZE]; |
| 233 | size_t lenResult = utf8.MB2WC(NULL, bytes, 0); |
| 234 | size_t result = utf8.MB2WC(widechars, bytes, BUFSIZE); |
| 235 | UTF8ASSERT(result == lenResult); |
| 236 | |
| 237 | // check we got the expected result |
| 238 | if (wideSequence) { |
| 239 | UTF8ASSERT(result != (size_t)-1); |
| 240 | wxASSERT(result < BUFSIZE); |
| 241 | |
| 242 | wchar_t expected[BUFSIZE]; |
| 243 | wcscpy(expected, wideSequence); |
| 244 | wcscat(expected, L"ABC"); |
| 245 | wcscat(expected, wideSequence); |
| 246 | wcscat(expected, L"XYZ"); |
| 247 | wcscat(expected, wideSequence); |
| 248 | |
| 249 | UTF8ASSERT(wcscmp(widechars, expected) == 0); |
| 250 | UTF8ASSERT(wcslen(widechars) == result); |
| 251 | } |
| 252 | else { |
| 253 | // If 'wideSequence' is NULL, then the result is expected to be |
| 254 | // invalid. Normally that is as far as we can go, but if there is an |
| 255 | // option then the conversion should succeed anyway, and it should be |
| 256 | // possible to translate back to the original |
| 257 | if (!option) { |
| 258 | UTF8ASSERT(result == (size_t)-1); |
| 259 | return; |
| 260 | } |
| 261 | else { |
| 262 | UTF8ASSERT(result != (size_t)-1); |
| 263 | } |
| 264 | } |
| 265 | |
| 266 | // translate it back and check we get the original |
| 267 | char bytesAgain[BUFSIZE]; |
| 268 | size_t lenResultAgain = utf8.WC2MB(NULL, widechars, 0); |
| 269 | size_t resultAgain = utf8.WC2MB(bytesAgain, widechars, BUFSIZE); |
| 270 | UTF8ASSERT(resultAgain == lenResultAgain); |
| 271 | UTF8ASSERT(resultAgain != (size_t)-1); |
| 272 | wxASSERT(resultAgain < BUFSIZE); |
| 273 | |
| 274 | UTF8ASSERT(strcmp(bytes, bytesAgain) == 0); |
| 275 | UTF8ASSERT(strlen(bytesAgain) == resultAgain); |
| 276 | } |
| 277 | |
| 278 | #endif // HAVE_WCHAR_H |