Fixes to allow compilation with no wchar_t (djgpp probably has a real wchar_t
[wxWidgets.git] / tests / mbconv / mbconvtest.cpp
index 6c509c838ddb354f85f3a1ef3258bbf87099f4d5..450e0ce183d3634a85a4fc1bad89b613ef0b099c 100644 (file)
@@ -1,17 +1,17 @@
 ///////////////////////////////////////////////////////////////////////////////
 // Name:        tests/mbconv/main.cpp
 // Purpose:     wxMBConv unit test
-// Author:      Vadim Zeitlin
+// Author:      Vadim Zeitlin, Mike Wetherell
 // Created:     14.02.04
 // RCS-ID:      $Id$
-// Copyright:   (c) 2003 TT-Solutions
+// Copyright:   (c) 2003 TT-Solutions, (c) 2005 Mike Wetherell
 ///////////////////////////////////////////////////////////////////////////////
 
 // ----------------------------------------------------------------------------
 // headers
 // ----------------------------------------------------------------------------
 
-#include "wx/wxprec.h"
+#include "testprec.h"
 
 #ifdef __BORLANDC__
     #pragma hdrstop
 #include "wx/strconv.h"
 #include "wx/string.h"
 
-#include "wx/cppunit.h"
+#if defined wxHAVE_TCHAR_SUPPORT && !defined HAVE_WCHAR_H
+    #define HAVE_WCHAR_H
+#endif
+
+// ----------------------------------------------------------------------------
+// Some wide character constants, written as explicit arrays because "\uXXXX"
+// escapes aren't supported by old compilers such as VC++ 5 and g++ 2.95.
+// ----------------------------------------------------------------------------
+
+wchar_t u41[] = { 0x41, 0 };        // expected result for "\x41"
+wchar_t u7f[] = { 0x7f, 0 };        // expected result for "\x7f"
+
+wchar_t u80[] = { 0x80, 0 };        // expected result for "\xc2\x80"
+wchar_t u391[] = { 0x391, 0 };      // expected result for "\xce\x91"
+wchar_t u7ff[] = { 0x7ff, 0 };      // expected result for "\xdf\xbf"
+
+wchar_t u800[] = { 0x800, 0 };      // expected result for "\xe0\xa0\x80"
+wchar_t u2620[] = { 0x2620, 0 };    // expected result for "\xe2\x98\xa0"
+wchar_t ufffd[] = { 0xfffd, 0 };    // expected result for "\xef\xbf\xbd"
+
+#if SIZEOF_WCHAR_T == 4 // one array element per code point
+wchar_t u10000[] = { 0x10000, 0 };      // expected result for "\xf0\x90\x80\x80"
+wchar_t u1000a5[] = { 0x1000a5, 0 };    // expected result for "\xf4\x80\x82\xa5"
+wchar_t u10fffd[] = { 0x10fffd, 0 };    // expected result for "\xf4\x8f\xbf\xbd"
+#else // otherwise: the same three code points as UTF-16 surrogate pairs
+wchar_t u10000[] = { 0xd800, 0xdc00, 0 };
+wchar_t u1000a5[] = { 0xdbc0, 0xdca5, 0 };
+wchar_t u10fffd[] = { 0xdbff, 0xdffd, 0 };
+#endif // SIZEOF_WCHAR_T == 4
 
 // ----------------------------------------------------------------------------
 // test class
@@ -38,10 +66,64 @@ public:
 private:
     CPPUNIT_TEST_SUITE( MBConvTestCase );
         CPPUNIT_TEST( WC2CP1250 );
+#ifdef HAVE_WCHAR_H // the UTF-8 test methods are only declared under this guard
+        CPPUNIT_TEST( UTF8_41 );
+        CPPUNIT_TEST( UTF8_7f );
+        CPPUNIT_TEST( UTF8_80 );
+        CPPUNIT_TEST( UTF8_c2_7f );
+        CPPUNIT_TEST( UTF8_c2_80 );
+        CPPUNIT_TEST( UTF8_ce_91 );
+        CPPUNIT_TEST( UTF8_df_bf );
+        CPPUNIT_TEST( UTF8_df_c0 );
+        CPPUNIT_TEST( UTF8_e0_a0_7f );
+        CPPUNIT_TEST( UTF8_e0_a0_80 );
+        CPPUNIT_TEST( UTF8_e2_98_a0 );
+        CPPUNIT_TEST( UTF8_ef_bf_bd );
+        CPPUNIT_TEST( UTF8_ef_bf_c0 );
+        CPPUNIT_TEST( UTF8_f0_90_80_7f );
+        CPPUNIT_TEST( UTF8_f0_90_80_80 );
+        CPPUNIT_TEST( UTF8_f4_8f_bf_bd );
+        CPPUNIT_TEST( UTF8PUA_f4_80_82_a5 );      // goes through UTF8PUA()
+        CPPUNIT_TEST( UTF8Octal_backslash245 );   // goes through UTF8Octal()
+#endif // HAVE_WCHAR_H
     CPPUNIT_TEST_SUITE_END();
 
     void WC2CP1250();
 
+#ifdef HAVE_WCHAR_H
+    // UTF-8 tests, each named after the bytes it converts. Test the first, last
+    // and one in the middle for sequences of each length; NULL = expect invalid
+    void UTF8_41() { UTF8("\x41", u41); }
+    void UTF8_7f() { UTF8("\x7f", u7f); }
+    void UTF8_80() { UTF8("\x80", NULL); }
+
+    void UTF8_c2_7f() { UTF8("\xc2\x7f", NULL); }
+    void UTF8_c2_80() { UTF8("\xc2\x80", u80); }
+    void UTF8_ce_91() { UTF8("\xce\x91", u391); }
+    void UTF8_df_bf() { UTF8("\xdf\xbf", u7ff); }
+    void UTF8_df_c0() { UTF8("\xdf\xc0", NULL); }
+
+    void UTF8_e0_a0_7f() { UTF8("\xe0\xa0\x7f", NULL); }
+    void UTF8_e0_a0_80() { UTF8("\xe0\xa0\x80", u800); }
+    void UTF8_e2_98_a0() { UTF8("\xe2\x98\xa0", u2620); }
+    void UTF8_ef_bf_bd() { UTF8("\xef\xbf\xbd", ufffd); }
+    void UTF8_ef_bf_c0() { UTF8("\xef\xbf\xc0", NULL); }
+
+    void UTF8_f0_90_80_7f() { UTF8("\xf0\x90\x80\x7f", NULL); }
+    void UTF8_f0_90_80_80() { UTF8("\xf0\x90\x80\x80", u10000); }
+    void UTF8_f4_8f_bf_bd() { UTF8("\xf4\x8f\xbf\xbd", u10fffd); }
+
+    // test 'escaping the escape characters' for the two escaping schemes
+    void UTF8PUA_f4_80_82_a5() { UTF8PUA("\xf4\x80\x82\xa5", u1000a5); }
+    void UTF8Octal_backslash245() { UTF8Octal("\\245", L"\\245"); }
+
+    // implementation for the utf-8 tests (documented at their definitions)
+    void UTF8(const char *charSequence, const wchar_t *wideSequence);
+    void UTF8PUA(const char *charSequence, const wchar_t *wideSequence);
+    void UTF8Octal(const char *charSequence, const wchar_t *wideSequence);
+    void UTF8(const char *charSequence, const wchar_t *wideSequence, int option);
+#endif // HAVE_WCHAR_H
+
     DECLARE_NO_COPY_CLASS(MBConvTestCase)
 };
 
@@ -77,3 +159,120 @@ void MBConvTestCase::WC2CP1250()
         }
     }
 }
+
+// ----------------------------------------------------------------------------
+// UTF-8 tests
+// ----------------------------------------------------------------------------
+
+#ifdef HAVE_WCHAR_H
+
+// Check that 'charSequence' translates to 'wideSequence' and back, running
+// the three-argument UTF8() once for each of the three options listed below.
+// Invalid sequences can be tested by giving NULL for 'wideSequence'. Even
+// invalid sequences should roundtrip when an option is given and this is
+// checked.
+void MBConvTestCase::UTF8(const char *charSequence,
+                          const wchar_t *wideSequence)
+{
+    UTF8(charSequence, wideSequence, wxMBConvUTF8::MAP_INVALID_UTF8_NOT);
+    UTF8(charSequence, wideSequence, wxMBConvUTF8::MAP_INVALID_UTF8_TO_PUA);
+    UTF8(charSequence, wideSequence, wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL);
+}
+
+// Use this alternative when 'charSequence' contains a PUA character. For the
+// MAP_INVALID_UTF8_TO_PUA run NULL is passed instead of 'wideSequence', so no
+// exact wide result is compared; it should still roundtrip, which is checked.
+void MBConvTestCase::UTF8PUA(const char *charSequence,
+                             const wchar_t *wideSequence)
+{
+    UTF8(charSequence, wideSequence, wxMBConvUTF8::MAP_INVALID_UTF8_NOT);
+    UTF8(charSequence, NULL, wxMBConvUTF8::MAP_INVALID_UTF8_TO_PUA);
+    UTF8(charSequence, wideSequence, wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL);
+}
+
+// Use this alternative when 'charSequence' contains an octal escape sequence.
+// For the MAP_INVALID_UTF8_TO_OCTAL run NULL is passed instead of
+// 'wideSequence'; it should still roundtrip ok, and this is checked.
+void MBConvTestCase::UTF8Octal(const char *charSequence,
+                               const wchar_t *wideSequence)
+{
+    UTF8(charSequence, wideSequence, wxMBConvUTF8::MAP_INVALID_UTF8_NOT);
+    UTF8(charSequence, wideSequence, wxMBConvUTF8::MAP_INVALID_UTF8_TO_PUA);
+    UTF8(charSequence, NULL, wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL);
+}
+
+// include the option in the error messages so it's possible to see which
+// test failed; expands to use a variable named 'errmsg' from the calling scope
+#define UTF8ASSERT(expr) CPPUNIT_ASSERT_MESSAGE(#expr + errmsg,  expr)
+
+// The test implementation: convert three copies of 'charSequence', joined by
+// "ABC" and "XYZ", with a wxMBConvUTF8 built from 'option', then convert back
+void MBConvTestCase::UTF8(const char *charSequence,
+                          const wchar_t *wideSequence,
+                          int option)
+{
+    const size_t BUFSIZE = 128;
+    wxASSERT(strlen(charSequence) * 3 + 10 < BUFSIZE);  // 3 copies + "ABCXYZ" must fit
+    char bytes[BUFSIZE];
+    
+    // include the option in the error messages so it's possible to see
+    // which test failed
+    sprintf(bytes, " (with option == %d)", option);  // 'bytes' is only scratch here
+    std::string errmsg(bytes);                       // read by UTF8ASSERT
+    
+    // put the charSequence at the start, middle and end of a string
+    strcpy(bytes, charSequence);
+    strcat(bytes, "ABC");
+    strcat(bytes, charSequence);
+    strcat(bytes, "XYZ");
+    strcat(bytes, charSequence);
+
+    // translate it into wide characters
+    wxMBConvUTF8 utf8(option);
+    wchar_t widechars[BUFSIZE];
+    size_t lenResult = utf8.MB2WC(NULL, bytes, 0);          // no output buffer given
+    size_t result = utf8.MB2WC(widechars, bytes, BUFSIZE);
+    UTF8ASSERT(result == lenResult);                        // both calls must agree
+
+    // check we got the expected result
+    if (wideSequence) {
+        UTF8ASSERT(result != (size_t)-1);
+        wxASSERT(result < BUFSIZE);
+
+        // build the expected wide string with the same layout as 'bytes'
+        wchar_t expected[BUFSIZE];
+        wcscpy(expected, wideSequence);
+        wcscat(expected, L"ABC");
+        wcscat(expected, wideSequence);
+        wcscat(expected, L"XYZ");
+        wcscat(expected, wideSequence);
+
+        UTF8ASSERT(wcscmp(widechars, expected) == 0);
+        UTF8ASSERT(wcslen(widechars) == result);
+    }
+    else {
+        // If 'wideSequence' is NULL, then the result is expected to be
+        // invalid.  Normally that is as far as we can go, but if there is an
+        // option then the conversion should succeed anyway, and it should be
+        // possible to translate back to the original
+        if (!option) {
+            UTF8ASSERT(result == (size_t)-1);
+            return;  // the translate-back check below is skipped in this case
+        }
+        else {
+            UTF8ASSERT(result != (size_t)-1);
+        }
+    }
+
+    // translate it back and check we get the original
+    char bytesAgain[BUFSIZE];
+    size_t lenResultAgain = utf8.WC2MB(NULL, widechars, 0);          // no output buffer given
+    size_t resultAgain = utf8.WC2MB(bytesAgain, widechars, BUFSIZE);
+    UTF8ASSERT(resultAgain == lenResultAgain);                       // both calls must agree
+    UTF8ASSERT(resultAgain != (size_t)-1);
+    wxASSERT(resultAgain < BUFSIZE);
+
+    UTF8ASSERT(strcmp(bytes, bytesAgain) == 0);
+    UTF8ASSERT(strlen(bytesAgain) == resultAgain);
+}
+
+#endif // HAVE_WCHAR_H