Tests for UTF-8 and PUA characters and octal escapes
[wxWidgets.git] / tests / mbconv / mbconvtest.cpp
1 ///////////////////////////////////////////////////////////////////////////////
2 // Name: tests/mbconv/main.cpp
3 // Purpose: wxMBConv unit test
4 // Author: Vadim Zeitlin, Mike Wetherell
5 // Created: 14.02.04
6 // RCS-ID: $Id$
7 // Copyright: (c) 2003 TT-Solutions, (c) 2005 Mike Wetherell
8 ///////////////////////////////////////////////////////////////////////////////
9
10 // ----------------------------------------------------------------------------
11 // headers
12 // ----------------------------------------------------------------------------
13
14 #include "testprec.h"
15
16 #ifdef __BORLANDC__
17 #pragma hdrstop
18 #endif
19
20 #ifndef WX_PRECOMP
21 #include "wx/wx.h"
22 #endif // WX_PRECOMP
23
24 #include "wx/strconv.h"
25 #include "wx/string.h"
26
27 #if defined wxHAVE_TCHAR_SUPPORT && !defined HAVE_WCHAR_H
28 #define HAVE_WCHAR_H
29 #endif
30
// ----------------------------------------------------------------------------
// Wide character test data. Each string is written as an array of numeric
// code units terminated by 0, because "\uXXXX" escapes aren't supported by
// old compilers such as VC++ 5 and g++ 2.95.
// ----------------------------------------------------------------------------

// U+0041 and U+007F
wchar_t u41[] = { 0x0041, 0x0000 };
wchar_t u7f[] = { 0x007f, 0x0000 };

// U+0080, U+0391 and U+07FF
wchar_t u80[] = { 0x0080, 0x0000 };
wchar_t u391[] = { 0x0391, 0x0000 };
wchar_t u7ff[] = { 0x07ff, 0x0000 };

// U+0800, U+2620 and U+FFFD
wchar_t u800[] = { 0x0800, 0x0000 };
wchar_t u2620[] = { 0x2620, 0x0000 };
wchar_t ufffd[] = { 0xfffd, 0x0000 };

// U+10000, U+1000A5 and U+10FFFD
#if SIZEOF_WCHAR_T == 4
    // a single wchar_t holds the code point itself
    wchar_t u10000[] = { 0x010000, 0x0000 };
    wchar_t u1000a5[] = { 0x1000a5, 0x0000 };
    wchar_t u10fffd[] = { 0x10fffd, 0x0000 };
#else
    // otherwise each code point is written as a pair of surrogates
    wchar_t u10000[] = { 0xd800, 0xdc00, 0x0000 };
    wchar_t u1000a5[] = { 0xdbc0, 0xdca5, 0x0000 };
    wchar_t u10fffd[] = { 0xdbff, 0xdffd, 0x0000 };
#endif
55 #endif
56
57 // ----------------------------------------------------------------------------
58 // test class
59 // ----------------------------------------------------------------------------
60
61 class MBConvTestCase : public CppUnit::TestCase
62 {
63 public:
64 MBConvTestCase() { }
65
66 private:
67 CPPUNIT_TEST_SUITE( MBConvTestCase );
68 CPPUNIT_TEST( WC2CP1250 );
69 #ifdef HAVE_WCHAR_H
70 CPPUNIT_TEST( UTF8_41 );
71 CPPUNIT_TEST( UTF8_7f );
72 CPPUNIT_TEST( UTF8_80 );
73 CPPUNIT_TEST( UTF8_c2_7f );
74 CPPUNIT_TEST( UTF8_c2_80 );
75 CPPUNIT_TEST( UTF8_ce_91 );
76 CPPUNIT_TEST( UTF8_df_bf );
77 CPPUNIT_TEST( UTF8_df_c0 );
78 CPPUNIT_TEST( UTF8_e0_a0_7f );
79 CPPUNIT_TEST( UTF8_e0_a0_80 );
80 CPPUNIT_TEST( UTF8_e2_98_a0 );
81 CPPUNIT_TEST( UTF8_ef_bf_bd );
82 CPPUNIT_TEST( UTF8_ef_bf_c0 );
83 CPPUNIT_TEST( UTF8_f0_90_80_7f );
84 CPPUNIT_TEST( UTF8_f0_90_80_80 );
85 CPPUNIT_TEST( UTF8_f4_8f_bf_bd );
86 CPPUNIT_TEST( UTF8PUA_f4_80_82_a5 );
87 CPPUNIT_TEST( UTF8Octal_backslash245 );
88 #endif // HAVE_WCHAR_H
89 CPPUNIT_TEST_SUITE_END();
90
91 void WC2CP1250();
92
93 #ifdef HAVE_WCHAR_H
94 // UTF-8 tests. Test the first, last and one in the middle for sequences
95 // of each length
96 void UTF8_41() { UTF8("\x41", u41); }
97 void UTF8_7f() { UTF8("\x7f", u7f); }
98 void UTF8_80() { UTF8("\x80", NULL); }
99
100 void UTF8_c2_7f() { UTF8("\xc2\x7f", NULL); }
101 void UTF8_c2_80() { UTF8("\xc2\x80", u80); }
102 void UTF8_ce_91() { UTF8("\xce\x91", u391); }
103 void UTF8_df_bf() { UTF8("\xdf\xbf", u7ff); }
104 void UTF8_df_c0() { UTF8("\xdf\xc0", NULL); }
105
106 void UTF8_e0_a0_7f() { UTF8("\xe0\xa0\x7f", NULL); }
107 void UTF8_e0_a0_80() { UTF8("\xe0\xa0\x80", u800); }
108 void UTF8_e2_98_a0() { UTF8("\xe2\x98\xa0", u2620); }
109 void UTF8_ef_bf_bd() { UTF8("\xef\xbf\xbd", ufffd); }
110 void UTF8_ef_bf_c0() { UTF8("\xef\xbf\xc0", NULL); }
111
112 void UTF8_f0_90_80_7f() { UTF8("\xf0\x90\x80\x7f", NULL); }
113 void UTF8_f0_90_80_80() { UTF8("\xf0\x90\x80\x80", u10000); }
114 void UTF8_f4_8f_bf_bd() { UTF8("\xf4\x8f\xbf\xbd", u10fffd); }
115
116 // test 'escaping the escape characters' for the two escaping schemes
117 void UTF8PUA_f4_80_82_a5() { UTF8PUA("\xf4\x80\x82\xa5", u1000a5); }
118 void UTF8Octal_backslash245() { UTF8Octal("\\245", L"\\245"); }
119
120 // implementation for the utf-8 tests (see comments below)
121 void UTF8(const char *charSequence, const wchar_t *wideSequence);
122 void UTF8PUA(const char *charSequence, const wchar_t *wideSequence);
123 void UTF8Octal(const char *charSequence, const wchar_t *wideSequence);
124 void UTF8(const char *charSequence, const wchar_t *wideSequence, int option);
125 #endif // HAVE_WCHAR_H
126
127 DECLARE_NO_COPY_CLASS(MBConvTestCase)
128 };
129
130 // register in the unnamed registry so that these tests are run by default
131 CPPUNIT_TEST_SUITE_REGISTRATION( MBConvTestCase );
132
133 // also include in it's own registry so that these tests can be run alone
134 CPPUNIT_TEST_SUITE_NAMED_REGISTRATION( MBConvTestCase, "MBConvTestCase" );
135
136 void MBConvTestCase::WC2CP1250()
137 {
138 static const struct Data
139 {
140 const wchar_t *wc;
141 const char *cp1250;
142 } data[] =
143 {
144 { L"hello", "hello" }, // test that it works in simplest case
145 { L"\xBD of \xBD is \xBC", NULL }, // this should fail as cp1250 doesn't have 1/2
146 };
147
148 wxCSConv cs1250(wxFONTENCODING_CP1250);
149 for ( size_t n = 0; n < WXSIZEOF(data); n++ )
150 {
151 const Data& d = data[n];
152 if (d.cp1250)
153 {
154 CPPUNIT_ASSERT( strcmp(cs1250.cWC2MB(d.wc), d.cp1250) == 0 );
155 }
156 else
157 {
158 CPPUNIT_ASSERT( (const char*)cs1250.cWC2MB(d.wc) == NULL );
159 }
160 }
161 }
162
163 // ----------------------------------------------------------------------------
164 // UTF-8 tests
165 // ----------------------------------------------------------------------------
166
167 #ifdef HAVE_WCHAR_H
168
169 // Check that 'charSequence' translates to 'wideSequence' and back.
170 // Invalid sequences can be tested by giving NULL or 'wideSequence'. Even
171 // invalid sequences should roundtrip when an option is given and this is
172 // checked.
173 //
174 void MBConvTestCase::UTF8(const char *charSequence,
175 const wchar_t *wideSequence)
176 {
177 UTF8(charSequence, wideSequence, wxMBConvUTF8::MAP_INVALID_UTF8_NOT);
178 UTF8(charSequence, wideSequence, wxMBConvUTF8::MAP_INVALID_UTF8_TO_PUA);
179 UTF8(charSequence, wideSequence, wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL);
180 }
181
182 // Use this alternative when 'charSequence' contains a PUA character. Such
183 // sequences should still roundtrip ok, and this is checked.
184 //
185 void MBConvTestCase::UTF8PUA(const char *charSequence,
186 const wchar_t *wideSequence)
187 {
188 UTF8(charSequence, wideSequence, wxMBConvUTF8::MAP_INVALID_UTF8_NOT);
189 UTF8(charSequence, NULL, wxMBConvUTF8::MAP_INVALID_UTF8_TO_PUA);
190 UTF8(charSequence, wideSequence, wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL);
191 }
192
193 // Use this alternative when 'charSequence' contains an octal escape sequence.
194 // Such sequences should still roundtrip ok, and this is checked.
195 //
196 void MBConvTestCase::UTF8Octal(const char *charSequence,
197 const wchar_t *wideSequence)
198 {
199 UTF8(charSequence, wideSequence, wxMBConvUTF8::MAP_INVALID_UTF8_NOT);
200 UTF8(charSequence, wideSequence, wxMBConvUTF8::MAP_INVALID_UTF8_TO_PUA);
201 UTF8(charSequence, NULL, wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL);
202 }
203
204 // include the option in the error messages so it's possible to see which
205 // test failed
206 #define UTF8ASSERT(expr) CPPUNIT_ASSERT_MESSAGE(#expr + errmsg, expr)
207
208 // The test implementation
209 //
210 void MBConvTestCase::UTF8(const char *charSequence,
211 const wchar_t *wideSequence,
212 int option)
213 {
214 const size_t BUFSIZE = 128;
215 wxASSERT(strlen(charSequence) * 3 + 10 < BUFSIZE);
216 char bytes[BUFSIZE];
217
218 // include the option in the error messages so it's possible to see
219 // which test failed
220 sprintf(bytes, " (with option == %d)", option);
221 std::string errmsg(bytes);
222
223 // put the charSequence at the start, middle and end of a string
224 strcpy(bytes, charSequence);
225 strcat(bytes, "ABC");
226 strcat(bytes, charSequence);
227 strcat(bytes, "XYZ");
228 strcat(bytes, charSequence);
229
230 // translate it into wide characters
231 wxMBConvUTF8 utf8(option);
232 wchar_t widechars[BUFSIZE];
233 size_t result = utf8.MB2WC(widechars, bytes, BUFSIZE);
234
235 // check we got the expected result
236 if (wideSequence) {
237 UTF8ASSERT(result != (size_t)-1);
238 wxASSERT(result < BUFSIZE);
239
240 wchar_t expected[BUFSIZE];
241 wcscpy(expected, wideSequence);
242 wcscat(expected, L"ABC");
243 wcscat(expected, wideSequence);
244 wcscat(expected, L"XYZ");
245 wcscat(expected, wideSequence);
246
247 UTF8ASSERT(wcscmp(widechars, expected) == 0);
248 }
249 else {
250 // If 'wideSequence' is NULL, then the result is expected to be
251 // invalid. Normally that is as far as we can go, but if there is an
252 // option then the conversion should succeed anyway, and it should be
253 // possible to translate back to the original
254 if (!option) {
255 UTF8ASSERT(result == (size_t)-1);
256 return;
257 }
258 else {
259 UTF8ASSERT(result != (size_t)-1);
260 }
261 }
262
263 // translate it back and check we get the original
264 char bytesAgain[BUFSIZE];
265 size_t resultAgain = utf8.WC2MB(bytesAgain, widechars, BUFSIZE);
266 UTF8ASSERT(resultAgain != (size_t)-1);
267 wxASSERT(resultAgain < BUFSIZE);
268
269 UTF8ASSERT(strcmp(bytes, bytesAgain) == 0);
270 }
271
272 #endif // HAVE_WCHAR_H