X-Git-Url: https://git.saurik.com/wxWidgets.git/blobdiff_plain/a65ca3e600a5147a1e75dec9b0d7357770faf155..e733c4ce1e24cf7e4b0b0d8362fc59aaa7a7641c:/tests/strings/unicode.cpp diff --git a/tests/strings/unicode.cpp b/tests/strings/unicode.cpp index 94f4c90e8c..29b7777a2a 100644 --- a/tests/strings/unicode.cpp +++ b/tests/strings/unicode.cpp @@ -3,7 +3,6 @@ // Purpose: Unicode unit test // Author: Vadim Zeitlin, Wlodzimierz ABX Skiba // Created: 2004-04-28 -// RCS-ID: $Id$ // Copyright: (c) 2004 Vadim Zeitlin, Wlodzimierz Skiba /////////////////////////////////////////////////////////////////////////////// @@ -18,8 +17,118 @@ #endif #ifndef WX_PRECOMP + #include "wx/wx.h" #endif // WX_PRECOMP +#include "wx/encconv.h" + +// ---------------------------------------------------------------------------- +// helper class holding the matching MB and WC strings +// ---------------------------------------------------------------------------- + +struct StringConversionData +{ + // either str or wcs (but not both) may be NULL, this means that the conversion + // to it should fail + StringConversionData(const char *str_, const wchar_t *wcs_, int flags_ = 0) + : str(str_), wcs(wcs_), flags(flags_) + { + } + + const char * const str; + const wchar_t * const wcs; + + enum + { + TEST_BOTH = 0, // test both str -> wcs and wcs -> str + ONLY_MB2WC = 1 // only test str -> wcs conversion + }; + + const int flags; + + // test that the conversion between str and wcs (subject to flags) succeeds + // + // the first argument is the index in the test array and is used solely for + // diagnostics + void Test(size_t n, wxMBConv& conv) const + { + if ( str ) + { + wxWCharBuffer wbuf = conv.cMB2WC(str); + + if ( wcs ) + { + CPPUNIT_ASSERT_MESSAGE + ( + Message(n, "MB2WC failed"), + wbuf.data() + ); + + CPPUNIT_ASSERT_MESSAGE + ( + Message(n, "MB2WC", wbuf, wcs), + wxStrcmp(wbuf, wcs) == 0 + ); + } + else // conversion is supposed to fail + { + CPPUNIT_ASSERT_MESSAGE + ( + Message(n, "MB2WC succeeded"), + !wbuf.data() + ); + } + } + + if ( wcs && !(flags & ONLY_MB2WC) ) + { + wxCharBuffer buf = conv.cWC2MB(wcs); + + if ( str ) + { + CPPUNIT_ASSERT_MESSAGE + ( + Message(n, "WC2MB failed"), + buf.data() + ); + + CPPUNIT_ASSERT_MESSAGE + ( + Message(n, "WC2MB", buf, str), + strcmp(buf, str) == 0 + ); + } + else + { + CPPUNIT_ASSERT_MESSAGE + ( + Message(n, "WC2MB succeeded"), + !buf.data() + ); + } + } + } + +private: + static std::string + Message(size_t n, const wxString& msg) + { + return std::string(wxString::Format("#%lu: %s", (unsigned long)n, msg)); + } + + template + static std::string + Message(size_t n, + const char *func, + const wxCharTypeBuffer& actual, + const T *expected) + { + return Message(n, + wxString::Format("%s returned \"%s\", expected \"%s\"", + func, actual.data(), expected)); + } +}; + // ---------------------------------------------------------------------------- // test class // ---------------------------------------------------------------------------- @@ -32,28 +141,31 @@ public: private: CPPUNIT_TEST_SUITE( UnicodeTestCase ); CPPUNIT_TEST( ToFromAscii ); -#if wxUSE_WCHAR_T CPPUNIT_TEST( ConstructorsWithConversion ); - CPPUNIT_TEST( Conversion ); + CPPUNIT_TEST( ConversionFixed ); + CPPUNIT_TEST( ConversionWithNULs ); CPPUNIT_TEST( ConversionUTF7 ); CPPUNIT_TEST( ConversionUTF8 ); -#endif // wxUSE_WCHAR_T + CPPUNIT_TEST( ConversionUTF16 ); + CPPUNIT_TEST( ConversionUTF32 ); + CPPUNIT_TEST( IsConvOk ); +#if wxUSE_UNICODE + CPPUNIT_TEST( Iteration ); +#endif CPPUNIT_TEST_SUITE_END(); void ToFromAscii(); -#if wxUSE_WCHAR_T void ConstructorsWithConversion(); - void Conversion(); + void ConversionFixed(); + void ConversionWithNULs(); void ConversionUTF7(); void ConversionUTF8(); - - // test if converting s using the given encoding gives ws and vice versa - // - // if either of the first 2 arguments is NULL, the conversion is supposed - // to fail - void DoTestConversion(const char *s, const wchar_t *w, wxCSConv& conv); -#endif // wxUSE_WCHAR_T - + void ConversionUTF16(); + void ConversionUTF32(); + void IsConvOk(); +#if wxUSE_UNICODE + void Iteration(); +#endif DECLARE_NO_COPY_CLASS(UnicodeTestCase) }; @@ -61,7 +173,7 @@ private: // register in the unnamed registry so that these tests are run by default CPPUNIT_TEST_SUITE_REGISTRATION( UnicodeTestCase ); -// also include in it's own registry so that these tests can be run alone +// also include in its own registry so that these tests can be run alone CPPUNIT_TEST_SUITE_NAMED_REGISTRATION( UnicodeTestCase, "UnicodeTestCase" ); UnicodeTestCase::UnicodeTestCase() @@ -82,162 +194,283 @@ void UnicodeTestCase::ToFromAscii() TEST_TO_FROM_ASCII( "additional \" special \t test \\ component \n :-)" ); } -#if wxUSE_WCHAR_T void UnicodeTestCase::ConstructorsWithConversion() { - // the string "Déjà" in UTF-8 and wchar_t: + // the string "Déjà" in UTF-8 and wchar_t: const unsigned char utf8Buf[] = {0x44,0xC3,0xA9,0x6A,0xC3,0xA0,0}; - const wchar_t wchar[] = {0x44,0xE9,0x6A,0xE0,0}; - const unsigned char utf8subBuf[] = {0x44,0xC3,0xA9,0x6A,0}; // just "Déj" + const unsigned char utf8subBuf[] = {0x44,0xC3,0xA9,0x6A,0}; // just "Déj" const char *utf8 = (char *)utf8Buf; const char *utf8sub = (char *)utf8subBuf; wxString s1(utf8, wxConvUTF8); - wxString s2(wchar, wxConvUTF8); #if wxUSE_UNICODE - CPPUNIT_ASSERT( s1 == wchar ); - CPPUNIT_ASSERT( s2 == wchar ); + const wchar_t wchar[] = {0x44,0xE9,0x6A,0xE0,0}; + CPPUNIT_ASSERT_EQUAL( wchar, s1 ); + + wxString s2(wchar); + CPPUNIT_ASSERT_EQUAL( wchar, s2 ); + CPPUNIT_ASSERT_EQUAL( wxString::FromUTF8(utf8), s2 ); #else - CPPUNIT_ASSERT( s1 == utf8 ); - CPPUNIT_ASSERT( s2 == utf8 ); + CPPUNIT_ASSERT_EQUAL( utf8, s1 ); #endif wxString sub(utf8sub, wxConvUTF8); // "Dej" substring wxString s3(utf8, wxConvUTF8, 4); + CPPUNIT_ASSERT_EQUAL( sub, s3 ); + +#if wxUSE_UNICODE wxString s4(wchar, wxConvUTF8, 3); + CPPUNIT_ASSERT_EQUAL( sub, s4 ); + + // conversion should stop with failure at pos 35 + wxString s("\t[pl]open.format.Sformatuj dyskietk\xea=gfloppy %f", wxConvUTF8); + CPPUNIT_ASSERT( s.empty() ); +#endif // wxUSE_UNICODE + + + // test using Unicode strings together with char* strings (this must work + // in ANSI mode as well, of course): + wxString s5("ascii"); + CPPUNIT_ASSERT_EQUAL( "ascii", s5 ); - CPPUNIT_ASSERT( s3 == sub ); - CPPUNIT_ASSERT( s4 == sub ); + s5 += " value"; + + CPPUNIT_ASSERT( strcmp(s5.mb_str(), "ascii value") == 0 ); + CPPUNIT_ASSERT_EQUAL( "ascii value", s5 ); + CPPUNIT_ASSERT( s5 != "SomethingElse" ); +} + +void UnicodeTestCase::ConversionFixed() +{ + size_t len; #if wxUSE_UNICODE - CPPUNIT_ASSERT ( wxString("\t[pl]open.format.Sformatuj dyskietkê=gfloppy %f", - wxConvUTF8) == wxT("") ); //should stop at pos 35 -#endif + wxConvLibc.cWC2MB(L"", 0, &len); +#else // !wxUSE_UNICODE + wxConvLibc.cMB2WC("", 0, &len); +#endif // wxUSE_UNICODE/!wxUSE_UNICODE + + CPPUNIT_ASSERT_EQUAL( 0, len ); + +#if wxUSE_UNICODE + // check that when we convert a fixed number of characters we obtain the + // expected return value + CPPUNIT_ASSERT_EQUAL( 0, wxConvLibc.ToWChar(NULL, 0, "", 0) ); + CPPUNIT_ASSERT_EQUAL( 1, wxConvLibc.ToWChar(NULL, 0, "x", 1) ); + CPPUNIT_ASSERT_EQUAL( 2, wxConvLibc.ToWChar(NULL, 0, "x", 2) ); + CPPUNIT_ASSERT_EQUAL( 2, wxConvLibc.ToWChar(NULL, 0, "xy", 2) ); +#endif // wxUSE_UNICODE } -void UnicodeTestCase::Conversion() +void UnicodeTestCase::ConversionWithNULs() { #if wxUSE_UNICODE - wxString szTheString(L"The\0String", wxConvLibc, 10); - wxCharBuffer theBuffer = szTheString.mb_str(); + static const size_t lenNulString = 10; - CPPUNIT_ASSERT( memcmp(theBuffer.data(), "The\0String", 11) == 0 ); + wxString szTheString(L"The\0String", wxConvLibc, lenNulString); + wxCharBuffer theBuffer = szTheString.mb_str(); - wxString szTheString2("The\0String", wxConvLocal, 10); - CPPUNIT_ASSERT( szTheString2.length() == 11 ); - CPPUNIT_ASSERT( wxTmemcmp(szTheString2.c_str(), L"The\0String", 11) == 0 ); -#else - wxString szTheString(wxT("TheString")); - szTheString.insert(3, 1, '\0'); - wxWCharBuffer theBuffer = szTheString.wc_str(wxConvLibc); + CPPUNIT_ASSERT( memcmp(theBuffer.data(), "The\0String", + lenNulString + 1) == 0 ); - CPPUNIT_ASSERT( memcmp(theBuffer.data(), L"The\0String", 11 * sizeof(wchar_t)) == 0 ); + wxString szTheString2("The\0String", wxConvLocal, lenNulString); + CPPUNIT_ASSERT_EQUAL( lenNulString, szTheString2.length() ); + CPPUNIT_ASSERT( wxTmemcmp(szTheString2.c_str(), L"The\0String", + lenNulString + 1) == 0 ); +#else // !wxUSE_UNICODE + wxString szTheString("TheString"); + szTheString.insert(3, 1, '\0'); + wxWCharBuffer theBuffer = szTheString.wc_str(wxConvLibc); - wxString szLocalTheString(wxT("TheString")); - szLocalTheString.insert(3, 1, '\0'); - wxWCharBuffer theLocalBuffer = szLocalTheString.wc_str(wxConvLocal); + CPPUNIT_ASSERT( memcmp(theBuffer.data(), L"The\0String", 11 * sizeof(wchar_t)) == 0 ); - CPPUNIT_ASSERT( memcmp(theLocalBuffer.data(), L"The\0String", 11 * sizeof(wchar_t)) == 0 ); -#endif + wxString szLocalTheString("TheString"); + szLocalTheString.insert(3, 1, '\0'); + wxWCharBuffer theLocalBuffer = szLocalTheString.wc_str(wxConvLocal); + + CPPUNIT_ASSERT( memcmp(theLocalBuffer.data(), L"The\0String", 11 * sizeof(wchar_t)) == 0 ); +#endif // wxUSE_UNICODE/!wxUSE_UNICODE } -#if !wxUSE_UNICODE -// in case wcscmp is missing -// -static int wx_wcscmp(const wchar_t *s1, const wchar_t *s2) +void UnicodeTestCase::ConversionUTF7() { - while (*s1 == *s2 && *s1 != 0) + static const StringConversionData utf7data[] = + { + // normal fragments + StringConversionData("+AKM-", L"\xa3"), + StringConversionData("+AOk-t+AOk-", L"\xe9t\xe9"), + + // this one is an alternative valid encoding of the same string + StringConversionData("+AOk-t+AOk", L"\xe9t\xe9", + StringConversionData::ONLY_MB2WC), + + // some special cases + StringConversionData("+-", L"+"), + StringConversionData("+--", L"+-"), + + // the following are invalid UTF-7 sequences + StringConversionData("\xa3", NULL), + StringConversionData("+", NULL), + StringConversionData("+~", NULL), + StringConversionData("a+", NULL), + }; + + for ( size_t n = 0; n < WXSIZEOF(utf7data); n++ ) { - s1++; - s2++; + const StringConversionData& d = utf7data[n]; + + // converting to/from UTF-7 using iconv() currently doesn't work + // because of several problems: + // - GetMBNulLen() doesn't return correct result (iconv converts L'\0' + // to an incomplete and anyhow nonsensical "+AA" string) + // - iconv refuses to convert "+-" (although it converts "+-\n" just + // fine, go figure) + // + // I have no idea how to fix this so just disable the test for now +#if 0 + d.Test(n, wxCSConv("utf-7")); +#endif + d.Test(n, wxConvUTF7); } - return *s1 - *s2; } -#endif -void -UnicodeTestCase::DoTestConversion(const char *s, - const wchar_t *ws, - wxCSConv& conv) +void UnicodeTestCase::ConversionUTF8() { -#if wxUSE_UNICODE - if ( ws ) + static const StringConversionData utf8data[] = { - wxCharBuffer buf(wxString(ws).mb_str(conv)); +#ifdef wxHAVE_U_ESCAPE + StringConversionData("\xc2\xa3", L"\u00a3"), +#endif + StringConversionData("\xc2", NULL), + }; - CPPUNIT_ASSERT( strcmp(buf, s) == 0 ); - } -#else // wxUSE_UNICODE - if ( s ) + wxCSConv conv(wxT("utf-8")); + for ( size_t n = 0; n < WXSIZEOF(utf8data); n++ ) { - wxWCharBuffer wbuf(wxString(s).wc_str(conv)); - - if ( ws ) - CPPUNIT_ASSERT( wx_wcscmp(wbuf, ws) == 0 ); - else - CPPUNIT_ASSERT( !*wbuf ); + const StringConversionData& d = utf8data[n]; + d.Test(n, conv); + d.Test(n, wxConvUTF8); } -#endif // wxUSE_UNICODE/!wxUSE_UNICODE } -struct StringConversionData -{ - const char *str; - const wchar_t *wcs; -}; - -void UnicodeTestCase::ConversionUTF7() +void UnicodeTestCase::ConversionUTF16() { - static const StringConversionData utf7data[] = + static const StringConversionData utf16data[] = { - { "+-", L"+" }, - { "+--", L"+-" }, - //\u isn't recognized on MSVC 6 -#if !defined(_MSC_VER) -#if !defined(__GNUC__) || (__GNUC__ >= 3) - { "+AKM-", L"\u00a3" }, -#endif +#ifdef wxHAVE_U_ESCAPE + StringConversionData( + "\x04\x1f\x04\x40\x04\x38\x04\x32\x04\x35\x04\x42\0\0", + L"\u041f\u0440\u0438\u0432\u0435\u0442"), + StringConversionData( + "\x01\0\0b\x01\0\0a\x01\0\0r\0\0", + L"\u0100b\u0100a\u0100r"), #endif - // Windows accepts invalid UTF-7 strings and so does our UTF-7 - // conversion code -- this is wrong IMO but the way it is for now - // - // notice that converting "+" still behaves as expected because the - // result is just an empty string, i.e. the same as if there were an - // error, but converting "a+" results in "a" while it really should - // fail - { "+", NULL }, - { "a+", L"a" }, + StringConversionData("\0f\0o\0o\0\0", L"foo"), }; - wxCSConv conv(_T("utf-7")); - for ( size_t n = 0; n < WXSIZEOF(utf7data); n++ ) + wxCSConv conv(wxFONTENCODING_UTF16BE); + for ( size_t n = 0; n < WXSIZEOF(utf16data); n++ ) { - const StringConversionData& d = utf7data[n]; - DoTestConversion(d.str, d.wcs, conv); + const StringConversionData& d = utf16data[n]; + d.Test(n, conv); } + + // special case: this string has consecutive NULs inside it which don't + // terminate the string, this exposed a bug in our conversion code which + // got confused in this case + size_t len; + conv.cMB2WC("\x01\0\0B\0C" /* A macron BC */, 6, &len); + CPPUNIT_ASSERT_EQUAL( 3, len ); } -void UnicodeTestCase::ConversionUTF8() +void UnicodeTestCase::ConversionUTF32() { - static const StringConversionData utf8data[] = + static const StringConversionData utf32data[] = { - //\u isn't recognized on MSVC 6 -#if !defined(_MSC_VER) -#if !defined(__GNUC__) || (__GNUC__ >= 3) - { "\xc2\xa3", L"\u00a3" }, -#endif +#ifdef wxHAVE_U_ESCAPE + StringConversionData( + "\0\0\x04\x1f\0\0\x04\x40\0\0\x04\x38\0\0\x04\x32\0\0\x04\x35\0\0\x04\x42\0\0\0\0", + L"\u041f\u0440\u0438\u0432\u0435\u0442"), #endif - { "\xc2", NULL }, + StringConversionData("\0\0\0f\0\0\0o\0\0\0o\0\0\0\0", L"foo"), }; - wxCSConv conv(_T("utf-8")); - for ( size_t n = 0; n < WXSIZEOF(utf8data); n++ ) + wxCSConv conv(wxFONTENCODING_UTF32BE); + for ( size_t n = 0; n < WXSIZEOF(utf32data); n++ ) { - const StringConversionData& d = utf8data[n]; - DoTestConversion(d.str, d.wcs, conv); + const StringConversionData& d = utf32data[n]; + d.Test(n, conv); } + + size_t len; + conv.cMB2WC("\0\0\x01\0\0\0\0B\0\0\0C" /* A macron BC */, 12, &len); + CPPUNIT_ASSERT_EQUAL( 3, len ); } -#endif // wxUSE_WCHAR_T +void UnicodeTestCase::IsConvOk() +{ + CPPUNIT_ASSERT( wxCSConv(wxFONTENCODING_SYSTEM).IsOk() ); + CPPUNIT_ASSERT( wxCSConv("US-ASCII").IsOk() ); + CPPUNIT_ASSERT( wxCSConv("UTF-8").IsOk() ); + CPPUNIT_ASSERT( !wxCSConv("NoSuchConversion").IsOk() ); + +#ifdef __WINDOWS__ + CPPUNIT_ASSERT( wxCSConv("WINDOWS-437").IsOk() ); +#endif +} + +#if wxUSE_UNICODE +void UnicodeTestCase::Iteration() +{ + // "czech" in Czech ("cestina"): + static const char *textUTF8 = "\304\215e\305\241tina"; + static const wchar_t textUTF16[] = {0x10D, 0x65, 0x161, 0x74, 0x69, 0x6E, 0x61, 0}; + + wxString text(wxString::FromUTF8(textUTF8)); + CPPUNIT_ASSERT( wxStrcmp(text.wc_str(), textUTF16) == 0 ); + + // verify the string was decoded correctly: + { + size_t idx = 0; + for ( wxString::const_iterator i = text.begin(); i != text.end(); ++i, ++idx ) + { + CPPUNIT_ASSERT( *i == textUTF16[idx] ); + } + } + + // overwrite the string with something that is shorter in UTF-8: + { + for ( wxString::iterator i = text.begin(); i != text.end(); ++i ) + *i = 'x'; + } + + // restore the original text now: + { + wxString::iterator end1 = text.end(); + wxString::const_iterator end2 = text.end(); + + size_t idx = 0; + for ( wxString::iterator i = text.begin(); i != text.end(); ++i, ++idx ) + { + *i = textUTF16[idx]; + + CPPUNIT_ASSERT( end1 == text.end() ); + CPPUNIT_ASSERT( end2 == text.end() ); + } + + CPPUNIT_ASSERT( end1 == text.end() ); + CPPUNIT_ASSERT( end2 == text.end() ); + } + + // and verify it again: + { + size_t idx = 0; + for ( wxString::const_iterator i = text.begin(); i != text.end(); ++i, ++idx ) + { + CPPUNIT_ASSERT( *i == textUTF16[idx] ); + } + } +} +#endif // wxUSE_UNICODE