X-Git-Url: https://git.saurik.com/wxWidgets.git/blobdiff_plain/335d31e0072cb86c917b59c22209242af6064123..f938a756a1c5944f7cc3ee6f2e51effbc06fd9c0:/src/common/strconv.cpp diff --git a/src/common/strconv.cpp b/src/common/strconv.cpp index fc2ad99177..7b39e4bb3f 100644 --- a/src/common/strconv.cpp +++ b/src/common/strconv.cpp @@ -42,6 +42,10 @@ #include "wx/msw/private.h" #endif +#ifdef __WINDOWS__ + #include "wx/msw/missing.h" +#endif + #ifndef __WXWINCE__ #include #endif @@ -68,11 +72,12 @@ #include "wx/encconv.h" #include "wx/fontmap.h" +#include "wx/utils.h" #ifdef __WXMAC__ -#include "ATSUnicode.h" -#include "TextCommon.h" -#include "TextEncodingConverter.h" +#include +#include +#include #include "wx/mac/private.h" // includes mac headers #endif @@ -176,9 +181,11 @@ const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const { // now do the actual conversion wxWCharBuffer buf(nLen); - MB2WC(buf.data(), psz, nLen + 1); // with the trailing NUL - - return buf; + nLen = MB2WC(buf.data(), psz, nLen + 1); // with the trailing NULL + if ( nLen != (size_t)-1 ) + { + return buf; + } } } @@ -195,9 +202,11 @@ const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const if ( nLen != (size_t)-1 ) { wxCharBuffer buf(nLen+3); // space for a wxUint32 trailing zero - WC2MB(buf.data(), pwz, nLen + 4); - - return buf; + nLen = WC2MB(buf.data(), pwz, nLen + 4); + if ( nLen != (size_t)-1 ) + { + return buf; + } } } @@ -950,7 +959,7 @@ wxMBConv_iconv::wxMBConv_iconv(const wxChar *name) { ms_wcCharsetName = NULL; - // VS: we must not output an error here, since wxWindows will safely + // VS: we must not output an error here, since wxWidgets will safely // fall back to using wxEncodingConverter. wxLogTrace(wxT("strconv"), wxT("Impossible to convert to/from charset '%s' with iconv, falling back to wxEncodingConverter."), name); //wxLogError( @@ -1112,8 +1121,10 @@ size_t wxMBConv_iconv::WC2MB(char *buf, const wchar_t *psz, size_t n) const #ifdef wxHAVE_WIN32_MB2WC // from utils.cpp +#if wxUSE_FONTMAP extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const wxChar *charset); extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding); +#endif class wxMBConv_win32 : public wxMBConv { @@ -1123,6 +1134,7 @@ public: m_CodePage = CP_ACP; } +#if wxUSE_FONTMAP wxMBConv_win32(const wxChar* name) { m_CodePage = wxCharsetToCodepage(name); @@ -1132,13 +1144,19 @@ public: { m_CodePage = wxEncodingToCodepage(encoding); } +#endif size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const { + // note that we have to use MB_ERR_INVALID_CHARS flag as it without it + // the behaviour is not compatible with the Unix version (using iconv) + // and break the library itself, e.g. wxTextInputStream::NextChar() + // wouldn't work if reading an incomplete MB char didn't result in an + // error const size_t len = ::MultiByteToWideChar ( m_CodePage, // code page - 0, // flags (none) + MB_ERR_INVALID_CHARS, // flags: fall on error psz, // input string -1, // its length (NUL-terminated) buf, // output string @@ -1151,28 +1169,118 @@ public: return len ? len - 1 : (size_t)-1; } - size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const - { + size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const + { + /* + we have a problem here: by default, WideCharToMultiByte() may + replace characters unrepresentable in the target code page with bad + quality approximations such as turning "1/2" symbol (U+00BD) into + "1" for the code pages which don't have it and we, obviously, want + to avoid this at any price + + the trouble is that this function does it _silently_, i.e. it won't + even tell us whether it did or not... Win98/2000 and higher provide + WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and + we have to resort to a round trip, i.e. check that converting back + results in the same string -- this is, of course, expensive but + otherwise we simply can't be sure to not garble the data. + */ + + // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN + // it doesn't work with CJK encodings (which we test for rather roughly + // here...) nor with UTF-7/8 nor, of course, with Windows versions not + // supporting it + BOOL usedDef wxDUMMY_INITIALIZE(false), + *pUsedDef; + int flags; + if ( CanUseNoBestFit() && m_CodePage < 50000 ) + { + // it's our lucky day + flags = WC_NO_BEST_FIT_CHARS; + pUsedDef = &usedDef; + } + else // old system or unsupported encoding + { + flags = 0; + pUsedDef = NULL; + } + const size_t len = ::WideCharToMultiByte ( m_CodePage, // code page - 0, // flags (none) - psz, // input string + flags, // either none or no best fit + pwz, // input string -1, // it is (wide) NUL-terminated buf, // output buffer buf ? n : 0, // and its size NULL, // default "replacement" char - NULL // [out] was it used? + pUsedDef // [out] was it used? ); + if ( !len ) + { + // function totally failed + return (size_t)-1; + } + + // if we were really converting, check if we succeeded + if ( buf ) + { + if ( flags ) + { + // check if the conversion failed, i.e. if any replacements + // were done + if ( usedDef ) + return (size_t)-1; + } + else // we must resort to double tripping... + { + wxWCharBuffer wcBuf(n); + if ( MB2WC(wcBuf.data(), buf, n) == (size_t)-1 || + wcscmp(wcBuf, pwz) != 0 ) + { + // we didn't obtain the same thing we started from, hence + // the conversion was lossy and we consider that it failed + return (size_t)-1; + } + } + } + // see the comment above for the reason of "len - 1" - return len ? len - 1 : (size_t)-1; + return len - 1; } - bool IsOk() const - { return m_CodePage != -1; } + bool IsOk() const { return m_CodePage != -1; } + +private: + static bool CanUseNoBestFit() + { + static int s_isWin98Or2k = -1; + + if ( s_isWin98Or2k == -1 ) + { + int verMaj, verMin; + switch ( wxGetOsVersion(&verMaj, &verMin) ) + { + case wxWIN95: + s_isWin98Or2k = verMaj >= 4 && verMin >= 10; + break; + + case wxWINDOWS_NT: + s_isWin98Or2k = verMaj >= 5; + break; + + default: + // unknown, be conseravtive by default + s_isWin98Or2k = 0; + } + + wxASSERT_MSG( s_isWin98Or2k != -1, _T("should be set above") ); + } + + return s_isWin98Or2k == 1; + } -public: long m_CodePage; }; @@ -1194,12 +1302,12 @@ public: wxMBConv_mac(const wxChar* name) { - Init( EncodingToSystem(wxFontMapper::Get()->CharsetToEncoding(name, FALSE) ) ) ; + Init( wxMacGetSystemEncFromFontEnc(wxFontMapper::Get()->CharsetToEncoding(name, FALSE) ) ) ; } wxMBConv_mac(wxFontEncoding encoding) { - Init( EncodingToSystem(encoding) ); + Init( wxMacGetSystemEncFromFontEnc(encoding) ); } ~wxMBConv_mac() @@ -1209,153 +1317,13 @@ public: status = TECDisposeConverter(m_WC2MB_converter); } - static TextEncodingBase EncodingToSystem(wxFontEncoding encoding) - { - TextEncodingBase enc = CFStringGetSystemEncoding() ; - - switch( encoding) - { - case wxFONTENCODING_ISO8859_1 : - enc = kTextEncodingISOLatin1 ; - break ; - case wxFONTENCODING_ISO8859_2 : - enc = kTextEncodingISOLatin2; - break ; - case wxFONTENCODING_ISO8859_3 : - enc = kTextEncodingISOLatin3 ; - break ; - case wxFONTENCODING_ISO8859_4 : - enc = kTextEncodingISOLatin4; - break ; - case wxFONTENCODING_ISO8859_5 : - enc = kTextEncodingISOLatinCyrillic; - break ; - case wxFONTENCODING_ISO8859_6 : - enc = kTextEncodingISOLatinArabic; - break ; - case wxFONTENCODING_ISO8859_7 : - enc = kTextEncodingISOLatinGreek; - break ; - case wxFONTENCODING_ISO8859_8 : - enc = kTextEncodingISOLatinHebrew; - break ; - case wxFONTENCODING_ISO8859_9 : - enc = kTextEncodingISOLatin5; - break ; - case wxFONTENCODING_ISO8859_10 : - enc = kTextEncodingISOLatin6; - break ; - case wxFONTENCODING_ISO8859_13 : - enc = kTextEncodingISOLatin7; - break ; - case wxFONTENCODING_ISO8859_14 : - enc = kTextEncodingISOLatin8; - break ; - case wxFONTENCODING_ISO8859_15 : - enc = kTextEncodingISOLatin9; - break ; - - case wxFONTENCODING_KOI8 : - enc = kTextEncodingKOI8_R; - break ; - case wxFONTENCODING_ALTERNATIVE : // MS-DOS CP866 - enc = kTextEncodingDOSRussian; - break ; -/* - case wxFONTENCODING_BULGARIAN : - enc = ; - break ; -*/ - case wxFONTENCODING_CP437 : - enc =kTextEncodingDOSLatinUS ; - break ; - case wxFONTENCODING_CP850 : - enc = kTextEncodingDOSLatin1; - break ; - case wxFONTENCODING_CP852 : - enc = kTextEncodingDOSLatin2; - break ; - case wxFONTENCODING_CP855 : - enc = kTextEncodingDOSCyrillic; - break ; - case wxFONTENCODING_CP866 : - enc =kTextEncodingDOSRussian ; - break ; - case wxFONTENCODING_CP874 : - enc = kTextEncodingDOSThai; - break ; - case wxFONTENCODING_CP932 : - enc = kTextEncodingDOSJapanese; - break ; - case wxFONTENCODING_CP936 : - enc =kTextEncodingDOSChineseSimplif ; - break ; - case wxFONTENCODING_CP949 : - enc = kTextEncodingDOSKorean; - break ; - case wxFONTENCODING_CP950 : - enc = kTextEncodingDOSChineseTrad; - break ; - - case wxFONTENCODING_CP1250 : - enc = kTextEncodingWindowsLatin2; - break ; - case wxFONTENCODING_CP1251 : - enc =kTextEncodingWindowsCyrillic ; - break ; - case wxFONTENCODING_CP1252 : - enc =kTextEncodingWindowsLatin1 ; - break ; - case wxFONTENCODING_CP1253 : - enc = kTextEncodingWindowsGreek; - break ; - case wxFONTENCODING_CP1254 : - enc = kTextEncodingWindowsLatin5; - break ; - case wxFONTENCODING_CP1255 : - enc =kTextEncodingWindowsHebrew ; - break ; - case wxFONTENCODING_CP1256 : - enc =kTextEncodingWindowsArabic ; - break ; - case wxFONTENCODING_CP1257 : - enc = kTextEncodingWindowsBalticRim; - break ; - - case wxFONTENCODING_UTF7 : - enc = CreateTextEncoding(kTextEncodingUnicodeDefault,0,kUnicodeUTF7Format) ; - break ; - case wxFONTENCODING_UTF8 : - enc = CreateTextEncoding(kTextEncodingUnicodeDefault,0,kUnicodeUTF8Format) ; - break ; - case wxFONTENCODING_EUC_JP : - enc = kTextEncodingEUC_JP; - break ; - case wxFONTENCODING_UTF16BE : - enc = CreateTextEncoding(kTextEncodingUnicodeDefault,0,kUnicode16BitFormat) ; - break ; - case wxFONTENCODING_UTF16LE : - enc = CreateTextEncoding(kTextEncodingUnicodeDefault,0,kUnicode16BitFormat) ; - break ; - case wxFONTENCODING_UTF32BE : - enc = CreateTextEncoding(kTextEncodingUnicodeDefault,0,kUnicode32BitFormat) ; - break ; - case wxFONTENCODING_UTF32LE : - enc = CreateTextEncoding(kTextEncodingUnicodeDefault,0,kUnicode32BitFormat) ; - break ; - } ; - return enc ; - } void Init( TextEncodingBase encoding) { OSStatus status = noErr ; m_char_encoding = encoding ; -#if SIZEOF_WCHAR_T == 4 - m_unicode_encoding = CreateTextEncoding(kTextEncodingUnicodeDefault,0,kUnicode32BitFormat) ; -#else m_unicode_encoding = CreateTextEncoding(kTextEncodingUnicodeDefault,0,kUnicode16BitFormat) ; -#endif + status = TECCreateConverter(&m_MB2WC_converter, m_char_encoding, m_unicode_encoding); @@ -1369,22 +1337,36 @@ public: OSStatus status = noErr ; ByteCount byteOutLen ; ByteCount byteInLen = strlen(psz) ; - ByteCount byteBufferLen = n ; wchar_t *tbuf = NULL ; + UniChar* ubuf = NULL ; + size_t res = 0 ; if (buf == NULL) { - n = byteInLen * SIZEOF_WCHAR_T ; - tbuf = (wchar_t*) malloc( n ) ; + n = byteInLen ; + tbuf = (wchar_t*) malloc( n * SIZEOF_WCHAR_T) ; } - + ByteCount byteBufferLen = n * sizeof( UniChar ) ; +#if SIZEOF_WCHAR_T == 4 + ubuf = (UniChar*) malloc( byteBufferLen + 2 ) ; +#else + ubuf = (UniChar*) (buf ? buf : tbuf) ; +#endif status = TECConvertText(m_MB2WC_converter, (ConstTextPtr) psz , byteInLen, &byteInLen, - (TextPtr) (buf ? buf : tbuf) , byteBufferLen, &byteOutLen); - + (TextPtr) ubuf , byteBufferLen, &byteOutLen); +#if SIZEOF_WCHAR_T == 4 + // we have to terminate here, because n might be larger for the trailing zero, and if UniChar + // is not properly terminated we get random characters at the end + ubuf[byteOutLen / sizeof( UniChar ) ] = 0 ; + wxMBConvUTF16BE converter ; + res = converter.MB2WC( (buf ? buf : tbuf) , (const char*)ubuf , n ) ; + free( ubuf ) ; +#else + res = byteOutLen / sizeof( UniChar ) ; +#endif if ( buf == NULL ) free(tbuf) ; - size_t res = byteOutLen / SIZEOF_WCHAR_T ; if ( buf && res < n) buf[res] = 0; @@ -1396,19 +1378,32 @@ public: OSStatus status = noErr ; ByteCount byteOutLen ; ByteCount byteInLen = wxWcslen(psz) * SIZEOF_WCHAR_T ; - ByteCount byteBufferLen = n ; char *tbuf = NULL ; if (buf == NULL) { - n = byteInLen ; + // worst case + n = byteInLen * 2 ; tbuf = (char*) malloc( n ) ; } - status = TECConvertText(m_WC2MB_converter, (ConstTextPtr) psz , byteInLen, &byteInLen, - (TextPtr) ( buf ? buf : tbuf ) , byteBufferLen, &byteOutLen); - + ByteCount byteBufferLen = n ; + UniChar* ubuf = NULL ; +#if SIZEOF_WCHAR_T == 4 + wxMBConvUTF16BE converter ; + size_t unicharlen = converter.WC2MB( NULL , psz , 0 ) ; + byteInLen = unicharlen ; + ubuf = (UniChar*) malloc( byteInLen + 2 ) ; + converter.WC2MB( (char*) ubuf , psz, unicharlen + 2 ) ; +#else + ubuf = (UniChar*) psz ; +#endif + status = TECConvertText(m_WC2MB_converter, (ConstTextPtr) ubuf , byteInLen, &byteInLen, + (TextPtr) (buf ? buf : tbuf) , byteBufferLen, &byteOutLen); +#if SIZEOF_WCHAR_T == 4 + free( ubuf ) ; +#endif if ( buf == NULL ) free(tbuf) ; @@ -1619,12 +1614,16 @@ wxMBConv *wxCSConv::DoCreate() const #ifdef wxHAVE_WIN32_MB2WC { +#if wxUSE_FONTMAP wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name) : new wxMBConv_win32(m_encoding); if ( conv->IsOk() ) return conv; delete conv; +#else + return NULL; +#endif } #endif // wxHAVE_WIN32_MB2WC #if defined(__WXMAC__) @@ -1791,6 +1790,8 @@ size_t wxCSConv::WC2MB(char *buf, const wchar_t *psz, size_t n) const #ifdef __WINDOWS__ static wxMBConv_win32 wxConvLibcObj; +#elif defined(__WXMAC__) && !defined(__MACH__) + static wxMBConv_mac wxConvLibcObj ; #else static wxMBConvLibc wxConvLibcObj; #endif