X-Git-Url: https://git.saurik.com/wxWidgets.git/blobdiff_plain/b5153fd845f0b5a0e2c9f3d95ca8dd57b3227e83..f938a756a1c5944f7cc3ee6f2e51effbc06fd9c0:/src/common/strconv.cpp diff --git a/src/common/strconv.cpp b/src/common/strconv.cpp index 63611c8657..7b39e4bb3f 100644 --- a/src/common/strconv.cpp +++ b/src/common/strconv.cpp @@ -42,6 +42,10 @@ #include "wx/msw/private.h" #endif +#ifdef __WINDOWS__ + #include "wx/msw/missing.h" +#endif + #ifndef __WXWINCE__ #include #endif @@ -68,7 +72,15 @@ #include "wx/encconv.h" #include "wx/fontmap.h" +#include "wx/utils.h" +#ifdef __WXMAC__ +#include +#include +#include + +#include "wx/mac/private.h" // includes mac headers +#endif // ---------------------------------------------------------------------------- // macros // ---------------------------------------------------------------------------- @@ -111,7 +123,8 @@ static size_t encode_utf16(wxUint32 input, wxUint16 *output) { if (input<=0xffff) { - if (output) *output++ = (wxUint16) input; + if (output) + *output = (wxUint16) input; return 1; } else if (input>=0x110000) @@ -123,7 +136,7 @@ static size_t encode_utf16(wxUint32 input, wxUint16 *output) if (output) { *output++ = (wxUint16) ((input >> 10)+0xd7c0); - *output++ = (wxUint16) ((input&0x3ff)+0xdc00); + *output = (wxUint16) ((input&0x3ff)+0xdc00); } return 2; } @@ -168,9 +181,11 @@ const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const { // now do the actual conversion wxWCharBuffer buf(nLen); - MB2WC(buf.data(), psz, nLen + 1); // with the trailing NUL - - return buf; + nLen = MB2WC(buf.data(), psz, nLen + 1); // with the trailing NULL + if ( nLen != (size_t)-1 ) + { + return buf; + } } } @@ -187,9 +202,11 @@ const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const if ( nLen != (size_t)-1 ) { wxCharBuffer buf(nLen+3); // space for a wxUint32 trailing zero - WC2MB(buf.data(), pwz, nLen + 4); - - return buf; + nLen = WC2MB(buf.data(), pwz, nLen + 4); + if ( nLen != (size_t)-1 ) + { + return buf; + } } } @@ -942,7 +959,7 @@ wxMBConv_iconv::wxMBConv_iconv(const wxChar *name) { ms_wcCharsetName = NULL; - // VS: we must not output an error here, since wxWindows will safely + // VS: we must not output an error here, since wxWidgets will safely // fall back to using wxEncodingConverter. wxLogTrace(wxT("strconv"), wxT("Impossible to convert to/from charset '%s' with iconv, falling back to wxEncodingConverter."), name); //wxLogError( @@ -1104,8 +1121,10 @@ size_t wxMBConv_iconv::WC2MB(char *buf, const wchar_t *psz, size_t n) const #ifdef wxHAVE_WIN32_MB2WC // from utils.cpp +#if wxUSE_FONTMAP extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const wxChar *charset); extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding); +#endif class wxMBConv_win32 : public wxMBConv { @@ -1115,6 +1134,7 @@ public: m_CodePage = CP_ACP; } +#if wxUSE_FONTMAP wxMBConv_win32(const wxChar* name) { m_CodePage = wxCharsetToCodepage(name); @@ -1124,51 +1144,288 @@ public: { m_CodePage = wxEncodingToCodepage(encoding); } +#endif size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const { + // note that we have to use MB_ERR_INVALID_CHARS flag as it without it + // the behaviour is not compatible with the Unix version (using iconv) + // and break the library itself, e.g. wxTextInputStream::NextChar() + // wouldn't work if reading an incomplete MB char didn't result in an + // error const size_t len = ::MultiByteToWideChar ( m_CodePage, // code page - 0, // flags (none) + MB_ERR_INVALID_CHARS, // flags: fall on error psz, // input string -1, // its length (NUL-terminated) buf, // output string buf ? n : 0 // size of output buffer ); - // note that it returns # of written chars for buf != NULL and *size* - // of the needed buffer for buf == NULL - return len ? (buf ? len : len - 1) : (size_t)-1; + // note that it returns count of written chars for buf != NULL and size + // of the needed buffer for buf == NULL so in either case the length of + // the string (which never includes the terminating NUL) is one less + return len ? len - 1 : (size_t)-1; } - size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const + size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const { + /* + we have a problem here: by default, WideCharToMultiByte() may + replace characters unrepresentable in the target code page with bad + quality approximations such as turning "1/2" symbol (U+00BD) into + "1" for the code pages which don't have it and we, obviously, want + to avoid this at any price + + the trouble is that this function does it _silently_, i.e. it won't + even tell us whether it did or not... Win98/2000 and higher provide + WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and + we have to resort to a round trip, i.e. check that converting back + results in the same string -- this is, of course, expensive but + otherwise we simply can't be sure to not garble the data. + */ + + // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN + // it doesn't work with CJK encodings (which we test for rather roughly + // here...) nor with UTF-7/8 nor, of course, with Windows versions not + // supporting it + BOOL usedDef wxDUMMY_INITIALIZE(false), + *pUsedDef; + int flags; + if ( CanUseNoBestFit() && m_CodePage < 50000 ) + { + // it's our lucky day + flags = WC_NO_BEST_FIT_CHARS; + pUsedDef = &usedDef; + } + else // old system or unsupported encoding + { + flags = 0; + pUsedDef = NULL; + } + const size_t len = ::WideCharToMultiByte ( m_CodePage, // code page - 0, // flags (none) - psz, // input string + flags, // either none or no best fit + pwz, // input string -1, // it is (wide) NUL-terminated buf, // output buffer buf ? n : 0, // and its size NULL, // default "replacement" char - NULL // [out] was it used? + pUsedDef // [out] was it used? ); - // see the comment above! - return len ? (buf ? len : len - 1) : (size_t)-1; + if ( !len ) + { + // function totally failed + return (size_t)-1; + } + + // if we were really converting, check if we succeeded + if ( buf ) + { + if ( flags ) + { + // check if the conversion failed, i.e. if any replacements + // were done + if ( usedDef ) + return (size_t)-1; + } + else // we must resort to double tripping... + { + wxWCharBuffer wcBuf(n); + if ( MB2WC(wcBuf.data(), buf, n) == (size_t)-1 || + wcscmp(wcBuf, pwz) != 0 ) + { + // we didn't obtain the same thing we started from, hence + // the conversion was lossy and we consider that it failed + return (size_t)-1; + } + } + } + + // see the comment above for the reason of "len - 1" + return len - 1; } - bool IsOk() const - { return m_CodePage != -1; } + bool IsOk() const { return m_CodePage != -1; } + +private: + static bool CanUseNoBestFit() + { + static int s_isWin98Or2k = -1; + + if ( s_isWin98Or2k == -1 ) + { + int verMaj, verMin; + switch ( wxGetOsVersion(&verMaj, &verMin) ) + { + case wxWIN95: + s_isWin98Or2k = verMaj >= 4 && verMin >= 10; + break; + + case wxWINDOWS_NT: + s_isWin98Or2k = verMaj >= 5; + break; + + default: + // unknown, be conseravtive by default + s_isWin98Or2k = 0; + } + + wxASSERT_MSG( s_isWin98Or2k != -1, _T("should be set above") ); + } + + return s_isWin98Or2k == 1; + } -public: long m_CodePage; }; #endif // wxHAVE_WIN32_MB2WC +// ============================================================================ +// Mac conversion classes +// ============================================================================ + +#if defined(__WXMAC__) && defined(TARGET_CARBON) + +class wxMBConv_mac : public wxMBConv +{ +public: + wxMBConv_mac() + { + Init(CFStringGetSystemEncoding()) ; + } + + wxMBConv_mac(const wxChar* name) + { + Init( wxMacGetSystemEncFromFontEnc(wxFontMapper::Get()->CharsetToEncoding(name, FALSE) ) ) ; + } + + wxMBConv_mac(wxFontEncoding encoding) + { + Init( wxMacGetSystemEncFromFontEnc(encoding) ); + } + + ~wxMBConv_mac() + { + OSStatus status = noErr ; + status = TECDisposeConverter(m_MB2WC_converter); + status = TECDisposeConverter(m_WC2MB_converter); + } + + + void Init( TextEncodingBase encoding) + { + OSStatus status = noErr ; + m_char_encoding = encoding ; + m_unicode_encoding = CreateTextEncoding(kTextEncodingUnicodeDefault,0,kUnicode16BitFormat) ; + + status = TECCreateConverter(&m_MB2WC_converter, + m_char_encoding, + m_unicode_encoding); + status = TECCreateConverter(&m_WC2MB_converter, + m_unicode_encoding, + m_char_encoding); + } + + size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const + { + OSStatus status = noErr ; + ByteCount byteOutLen ; + ByteCount byteInLen = strlen(psz) ; + wchar_t *tbuf = NULL ; + UniChar* ubuf = NULL ; + size_t res = 0 ; + + if (buf == NULL) + { + n = byteInLen ; + tbuf = (wchar_t*) malloc( n * SIZEOF_WCHAR_T) ; + } + ByteCount byteBufferLen = n * sizeof( UniChar ) ; +#if SIZEOF_WCHAR_T == 4 + ubuf = (UniChar*) malloc( byteBufferLen + 2 ) ; +#else + ubuf = (UniChar*) (buf ? buf : tbuf) ; +#endif + status = TECConvertText(m_MB2WC_converter, (ConstTextPtr) psz , byteInLen, &byteInLen, + (TextPtr) ubuf , byteBufferLen, &byteOutLen); +#if SIZEOF_WCHAR_T == 4 + // we have to terminate here, because n might be larger for the trailing zero, and if UniChar + // is not properly terminated we get random characters at the end + ubuf[byteOutLen / sizeof( UniChar ) ] = 0 ; + wxMBConvUTF16BE converter ; + res = converter.MB2WC( (buf ? buf : tbuf) , (const char*)ubuf , n ) ; + free( ubuf ) ; +#else + res = byteOutLen / sizeof( UniChar ) ; +#endif + if ( buf == NULL ) + free(tbuf) ; + + if ( buf && res < n) + buf[res] = 0; + + return res ; + } + + size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const + { + OSStatus status = noErr ; + ByteCount byteOutLen ; + ByteCount byteInLen = wxWcslen(psz) * SIZEOF_WCHAR_T ; + + char *tbuf = NULL ; + + if (buf == NULL) + { + // worst case + n = byteInLen * 2 ; + tbuf = (char*) malloc( n ) ; + } + + ByteCount byteBufferLen = n ; + UniChar* ubuf = NULL ; +#if SIZEOF_WCHAR_T == 4 + wxMBConvUTF16BE converter ; + size_t unicharlen = converter.WC2MB( NULL , psz , 0 ) ; + byteInLen = unicharlen ; + ubuf = (UniChar*) malloc( byteInLen + 2 ) ; + converter.WC2MB( (char*) ubuf , psz, unicharlen + 2 ) ; +#else + ubuf = (UniChar*) psz ; +#endif + status = TECConvertText(m_WC2MB_converter, (ConstTextPtr) ubuf , byteInLen, &byteInLen, + (TextPtr) (buf ? buf : tbuf) , byteBufferLen, &byteOutLen); +#if SIZEOF_WCHAR_T == 4 + free( ubuf ) ; +#endif + if ( buf == NULL ) + free(tbuf) ; + + size_t res = byteOutLen ; + if ( buf && res < n) + buf[res] = 0; + + return res ; + } + + bool IsOk() const + { return m_MB2WC_converter != NULL && m_WC2MB_converter != NULL ; } + +private: + TECObjectRef m_MB2WC_converter ; + TECObjectRef m_WC2MB_converter ; + + TextEncodingBase m_char_encoding ; + TextEncodingBase m_unicode_encoding ; +}; + +#endif // defined(__WXMAC__) && defined(TARGET_CARBON) // ============================================================================ // wxEncodingConverter based conversion classes @@ -1247,37 +1504,21 @@ void wxCSConv::Init() m_deferred = true; } -// find a valid value for the encoding -void wxCSConv::SetEncoding() -{ -#if wxUSE_INTL - m_encoding = wxLocale::GetSystemEncoding(); -#else - m_encoding = wxFONTENCODING_SYSTEM; -#endif -} - wxCSConv::wxCSConv(const wxChar *charset) { Init(); if ( charset ) { - // not used - m_encoding = wxFONTENCODING_SYSTEM; - SetName(charset); } - else // no charset specified - { - SetEncoding(); - } + + m_encoding = wxFONTENCODING_SYSTEM; } wxCSConv::wxCSConv(wxFontEncoding encoding) { - if ( encoding == wxFONTENCODING_MAX || - encoding == wxFONTENCODING_DEFAULT ) + if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT ) { wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") ); @@ -1286,14 +1527,7 @@ wxCSConv::wxCSConv(wxFontEncoding encoding) Init(); - if ( encoding == wxFONTENCODING_SYSTEM ) - { - SetEncoding(); - } - else // have valid encoding, use it - { - m_encoding = encoding; - } + m_encoding = encoding; } wxCSConv::~wxCSConv() @@ -1338,28 +1572,12 @@ void wxCSConv::SetName(const wxChar *charset) } } -static inline bool DoesntNeedConv(wxFontEncoding enc) -{ - return enc == wxFONTENCODING_DEFAULT || - enc == wxFONTENCODING_SYSTEM || - enc == wxFONTENCODING_ISO8859_1; -} - wxMBConv *wxCSConv::DoCreate() const { -#if wxUSE_FONTMAP - wxFontMapper * const fontMapper = wxFontMapper::Get(); - - wxFontEncoding encFromName = m_name ? fontMapper->CharsetToEncoding(m_name) - : wxFONTENCODING_SYSTEM; -#endif // wxUSE_FONTMAP - - // check for the special case of ASCII charset - if ( (!m_name && DoesntNeedConv(m_encoding)) -#if wxUSE_FONTMAP - || (m_name && DoesntNeedConv(encFromName)) -#endif // wxUSE_FONTMAP - ) + // check for the special case of ASCII or ISO8859-1 charset: as we have + // special knowledge of it anyhow, we don't need to create a special + // conversion object + if ( m_encoding == wxFONTENCODING_ISO8859_1 ) { // don't convert at all return NULL; @@ -1375,9 +1593,18 @@ wxMBConv *wxCSConv::DoCreate() const // step (1) #ifdef HAVE_ICONV +#if !wxUSE_FONTMAP if ( m_name ) +#endif // !wxUSE_FONTMAP { - wxMBConv_iconv *conv = new wxMBConv_iconv(m_name); + wxString name(m_name); + +#if wxUSE_FONTMAP + if ( name.empty() ) + name = wxFontMapper::Get()->GetEncodingName(m_encoding); +#endif // wxUSE_FONTMAP + + wxMBConv_iconv *conv = new wxMBConv_iconv(name); if ( conv->IsOk() ) return conv; @@ -1387,20 +1614,42 @@ wxMBConv *wxCSConv::DoCreate() const #ifdef wxHAVE_WIN32_MB2WC { +#if wxUSE_FONTMAP wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name) : new wxMBConv_win32(m_encoding); if ( conv->IsOk() ) return conv; delete conv; +#else + return NULL; +#endif } #endif // wxHAVE_WIN32_MB2WC - +#if defined(__WXMAC__) + { + if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ) ) + { + + wxMBConv_mac *conv = m_name ? new wxMBConv_mac(m_name) + : new wxMBConv_mac(m_encoding); + if ( conv->IsOk() ) + return conv; + + delete conv; + } + } +#endif // step (2) wxFontEncoding enc = m_encoding; #if wxUSE_FONTMAP - if ( enc == wxFONTENCODING_SYSTEM ) - enc = encFromName; + if ( enc == wxFONTENCODING_SYSTEM && m_name ) + { + // use "false" to suppress interactive dialogs -- we can be called from + // anywhere and popping up a dialog from here is the last thing we want to + // do + enc = wxFontMapper::Get()->CharsetToEncoding(m_name, false); + } #endif // wxUSE_FONTMAP switch ( enc ) @@ -1411,18 +1660,12 @@ wxMBConv *wxCSConv::DoCreate() const case wxFONTENCODING_UTF8: return new wxMBConvUTF8; - case wxFONTENCODING_UTF16: - return new wxMBConvUTF16; - case wxFONTENCODING_UTF16BE: return new wxMBConvUTF16BE; case wxFONTENCODING_UTF16LE: return new wxMBConvUTF16LE; - case wxFONTENCODING_UTF32: - return new wxMBConvUTF32; - case wxFONTENCODING_UTF32BE: return new wxMBConvUTF32BE; @@ -1446,8 +1689,19 @@ wxMBConv *wxCSConv::DoCreate() const } #endif // wxUSE_FONTMAP - wxLogError(_("Cannot convert from the charset '%s'!"), - m_name ? m_name + // NB: This is a hack to prevent deadlock. What could otherwise happen + // in Unicode build: wxConvLocal creation ends up being here + // because of some failure and logs the error. But wxLog will try to + // attach timestamp, for which it will need wxConvLocal (to convert + // time to char* and then wchar_t*), but that fails, tries to log + // error, but wxLog has a (already locked) critical section that + // guards static buffer. + static bool alreadyLoggingError = false; + if (!alreadyLoggingError) + { + alreadyLoggingError = true; + wxLogError(_("Cannot convert from the charset '%s'!"), + m_name ? m_name : #if wxUSE_FONTMAP wxFontMapper::GetEncodingDescription(m_encoding).c_str() @@ -1455,6 +1709,8 @@ wxMBConv *wxCSConv::DoCreate() const wxString::Format(_("encoding %s"), m_encoding).c_str() #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP ); + alreadyLoggingError = false; + } return NULL; } @@ -1464,6 +1720,16 @@ void wxCSConv::CreateConvIfNeeded() const if ( m_deferred ) { wxCSConv *self = (wxCSConv *)this; // const_cast + +#if wxUSE_INTL + // if we don't have neither the name nor the encoding, use the default + // encoding for this system + if ( !m_name && m_encoding == wxFONTENCODING_SYSTEM ) + { + self->m_name = wxStrdup(wxLocale::GetSystemEncodingName()); + } +#endif // wxUSE_INTL + self->m_convReal = DoCreate(); self->m_deferred = false; } @@ -1500,7 +1766,19 @@ size_t wxCSConv::WC2MB(char *buf, const wchar_t *psz, size_t n) const if (buf) { for (size_t c = 0; c <= len; c++) - buf[c] = (psz[c] > 0xff) ? '?' : psz[c]; + { + if (psz[c] > 0xFF) + return (size_t)-1; + buf[c] = psz[c]; + } + } + else + { + for (size_t c = 0; c <= len; c++) + { + if (psz[c] > 0xFF) + return (size_t)-1; + } } return len; @@ -1512,8 +1790,10 @@ size_t wxCSConv::WC2MB(char *buf, const wchar_t *psz, size_t n) const #ifdef __WINDOWS__ static wxMBConv_win32 wxConvLibcObj; +#elif defined(__WXMAC__) && !defined(__MACH__) + static wxMBConv_mac wxConvLibcObj ; #else - static wxMBConvSystem wxConvLibcObj; + static wxMBConvLibc wxConvLibcObj; #endif static wxCSConv wxConvLocalObj(wxFONTENCODING_SYSTEM);