X-Git-Url: https://git.saurik.com/wxWidgets.git/blobdiff_plain/e4dd1e19a281da5d1e23b12fd76c9fc242bcb1d6..9e50ed28d99a07bff5f63c9f7952c3ae6be6cab0:/src/common/strconv.cpp?ds=sidebyside diff --git a/src/common/strconv.cpp b/src/common/strconv.cpp index 4472ea8fb3..8acfe82ead 100644 --- a/src/common/strconv.cpp +++ b/src/common/strconv.cpp @@ -44,10 +44,6 @@ #define wxHAVE_WIN32_MB2WC #endif -#ifdef __SALFORDC__ - #include -#endif - #ifdef HAVE_ICONV #include #include "wx/thread.h" @@ -60,17 +56,6 @@ #include "wx/mac/corefoundation/private/strconv_cf.h" #endif //def __DARWIN__ -#ifdef __WXMAC__ -#ifndef __DARWIN__ -#include -#include -#include -#endif - -// includes Mac headers -#include "wx/mac/private.h" -#endif - #define TRACE_STRCONV _T("strconv") @@ -365,14 +350,14 @@ const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const if ( psz ) { // calculate the length of the buffer needed first - const size_t nLen = MB2WC(NULL, psz, 0); + const size_t nLen = ToWChar(NULL, 0, psz); if ( nLen != wxCONV_FAILED ) { // now do the actual conversion - wxWCharBuffer buf(nLen /* +1 added implicitly */); + wxWCharBuffer buf(nLen - 1 /* +1 added implicitly */); // +1 for the trailing NULL - if ( MB2WC(buf.data(), psz, nLen + 1) != wxCONV_FAILED ) + if ( ToWChar(buf.data(), nLen, psz) != wxCONV_FAILED ) return buf; } } @@ -384,14 +369,11 @@ const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const { if ( pwz ) { - const size_t nLen = WC2MB(NULL, pwz, 0); + const size_t nLen = FromWChar(NULL, 0, pwz); if ( nLen != wxCONV_FAILED ) { - // extra space for trailing NUL(s) - static const size_t extraLen = GetMaxMBNulLen(); - - wxCharBuffer buf(nLen + extraLen - 1); - if ( WC2MB(buf.data(), pwz, nLen + extraLen) != wxCONV_FAILED ) + wxCharBuffer buf(nLen - 1); + if ( FromWChar(buf.data(), nLen, pwz) != wxCONV_FAILED ) return buf; } } @@ -405,7 +387,11 @@ wxMBConv::cMB2WC(const char *inBuff, size_t inLen, size_t *outLen) const const size_t dstLen = ToWChar(NULL, 0, inBuff, inLen); if ( dstLen != wxCONV_FAILED ) { - wxWCharBuffer wbuf(dstLen - 1); + // notice that we allocate space for dstLen+1 wide characters here + // because we want the buffer to always be NUL-terminated, even if the + // input isn't (as otherwise the caller has no way to know its length) + wxWCharBuffer wbuf(dstLen); + wbuf.data()[dstLen - 1] = L'\0'; if ( ToWChar(wbuf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED ) { if ( outLen ) @@ -431,16 +417,18 @@ wxMBConv::cWC2MB(const wchar_t *inBuff, size_t inLen, size_t *outLen) const size_t dstLen = FromWChar(NULL, 0, inBuff, inLen); if ( dstLen != wxCONV_FAILED ) { - // special case of empty input: can't allocate 0 size buffer below as - // wxCharBuffer insists on NUL-terminating it - wxCharBuffer buf(dstLen ? dstLen - 1 : 1); + const size_t nulLen = GetMBNulLen(); + + // as above, ensure that the buffer is always NUL-terminated, even if + // the input is not + wxCharBuffer buf(dstLen + nulLen - 1); + memset(buf.data() + dstLen, 0, nulLen); if ( FromWChar(buf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED ) { if ( outLen ) { *outLen = dstLen; - const size_t nulLen = GetMBNulLen(); if ( dstLen >= nulLen && !NotAllNULs(buf.data() + dstLen - nulLen, nulLen) ) { @@ -717,7 +705,7 @@ size_t wxMBConvUTF7::WC2MB(char *buf, const wchar_t *psz, size_t n) const // UTF-8 // ---------------------------------------------------------------------------- -static wxUint32 utf8_max[]= +static const wxUint32 utf8_max[]= { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff }; // boundaries of the private use area we use to (temporarily) remap invalid @@ -725,11 +713,287 @@ static wxUint32 utf8_max[]= const wxUint32 wxUnicodePUA = 0x100000; const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256; -size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const +// this table gives the length of the UTF-8 encoding from its first character: +const unsigned char tableUtf8Lengths[256] = { + // single-byte sequences (ASCII): + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 00..0F + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 10..1F + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 20..2F + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 30..3F + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 40..4F + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 50..5F + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 60..6F + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 70..7F + + // these are invalid: + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 80..8F + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 90..9F + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // A0..AF + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // B0..BF + 0, 0, // C0,C1 + + // two-byte sequences: + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // C2..CF + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // D0..DF + + // three-byte sequences: + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // E0..EF + + // four-byte sequences: + 4, 4, 4, 4, 4, // F0..F4 + + // these are invalid again (5- or 6-byte + // sequences and sequences for code points + // above U+10FFFF, as restricted by RFC 3629): + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 // F5..FF +}; + +size_t +wxMBConvStrictUTF8::ToWChar(wchar_t *dst, size_t dstLen, + const char *src, size_t srcLen) const +{ + wchar_t *out = dstLen ? dst : NULL; + size_t written = 0; + + if ( srcLen == wxNO_LEN ) + srcLen = strlen(src) + 1; + + for ( const char *p = src; ; p++ ) + { + if ( !(srcLen == wxNO_LEN ? *p : srcLen) ) + { + // all done successfully, just add the trailing NULL if we are not + // using explicit length + if ( srcLen == wxNO_LEN ) + { + if ( out ) + { + if ( !dstLen ) + break; + + *out = L'\0'; + } + + written++; + } + + return written; + } + + if ( out && !dstLen-- ) + break; + + wxUint32 code; + unsigned char c = *p; + + if ( c < 0x80 ) + { + if ( srcLen == 0 ) // the test works for wxNO_LEN too + break; + + if ( srcLen != wxNO_LEN ) + srcLen--; + + code = c; + } + else + { + unsigned len = tableUtf8Lengths[c]; + if ( !len ) + break; + + if ( srcLen < len ) // the test works for wxNO_LEN too + break; + + if ( srcLen != wxNO_LEN ) + srcLen -= len; + + // Char. number range | UTF-8 octet sequence + // (hexadecimal) | (binary) + // ----------------------+---------------------------------------- + // 0000 0000 - 0000 007F | 0xxxxxxx + // 0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx + // 0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx + // 0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx + // + // Code point value is stored in bits marked with 'x', + // lowest-order bit of the value on the right side in the diagram + // above. (from RFC 3629) + + // mask to extract lead byte's value ('x' bits above), by sequence + // length: + static const unsigned char leadValueMask[] = { 0x7F, 0x1F, 0x0F, 0x07 }; + + // mask and value of lead byte's most significant bits, by length: + static const unsigned char leadMarkerMask[] = { 0x80, 0xE0, 0xF0, 0xF8 }; + static const unsigned char leadMarkerVal[] = { 0x00, 0xC0, 0xE0, 0xF0 }; + + len--; // it's more convenient to work with 0-based length here + + // extract the lead byte's value bits: + if ( (c & leadMarkerMask[len]) != leadMarkerVal[len] ) + break; + + code = c & leadValueMask[len]; + + // all remaining bytes, if any, are handled in the same way + // regardless of sequence's length: + for ( ; len; --len ) + { + c = *++p; + if ( (c & 0xC0) != 0x80 ) + return wxCONV_FAILED; + + code <<= 6; + code |= c & 0x3F; + } + } + +#ifdef WC_UTF16 + // cast is ok because wchar_t == wxUint16 if WC_UTF16 + if ( encode_utf16(code, (wxUint16 *)out) == 2 ) + { + if ( out ) + out++; + written++; + } +#else // !WC_UTF16 + if ( out ) + *out = code; +#endif // WC_UTF16/!WC_UTF16 + + if ( out ) + out++; + + written++; + } + + return wxCONV_FAILED; +} + +size_t +wxMBConvStrictUTF8::FromWChar(char *dst, size_t dstLen, + const wchar_t *src, size_t srcLen) const { + char *out = dstLen ? dst : NULL; + size_t written = 0; + + for ( const wchar_t *wp = src; ; wp++ ) + { + if ( !(srcLen == wxNO_LEN ? *wp : srcLen--) ) + { + // all done successfully, just add the trailing NULL if we are not + // using explicit length + if ( srcLen == wxNO_LEN ) + { + if ( out ) + { + if ( !dstLen ) + break; + + *out = '\0'; + } + + written++; + } + + return written; + } + + + wxUint32 code; +#ifdef WC_UTF16 + // cast is ok for WC_UTF16 + if ( decode_utf16((const wxUint16 *)wp, code) == 2 ) + { + // skip the next char too as we decoded a surrogate + wp++; + } +#else // wchar_t is UTF-32 + code = *wp & 0x7fffffff; +#endif + + unsigned len; + if ( code <= 0x7F ) + { + len = 1; + if ( out ) + { + if ( dstLen < len ) + break; + + out[0] = (char)code; + } + } + else if ( code <= 0x07FF ) + { + len = 2; + if ( out ) + { + if ( dstLen < len ) + break; + + // NB: this line takes 6 least significant bits, encodes them as + // 10xxxxxx and discards them so that the next byte can be encoded: + out[1] = 0x80 | (code & 0x3F); code >>= 6; + out[0] = 0xC0 | code; + } + } + else if ( code < 0xFFFF ) + { + len = 3; + if ( out ) + { + if ( dstLen < len ) + break; + + out[2] = 0x80 | (code & 0x3F); code >>= 6; + out[1] = 0x80 | (code & 0x3F); code >>= 6; + out[0] = 0xE0 | code; + } + } + else if ( code <= 0x10FFFF ) + { + len = 4; + if ( out ) + { + if ( dstLen < len ) + break; + + out[3] = 0x80 | (code & 0x3F); code >>= 6; + out[2] = 0x80 | (code & 0x3F); code >>= 6; + out[1] = 0x80 | (code & 0x3F); code >>= 6; + out[0] = 0xF0 | code; + } + } + else + { + wxFAIL_MSG( _T("trying to encode undefined Unicode character") ); + break; + } + + if ( out ) + { + out += len; + dstLen -= len; + } + + written += len; + } + + // we only get here if an error occurs during decoding + return wxCONV_FAILED; +} + +size_t wxMBConvUTF8::ToWChar(wchar_t *buf, size_t n, + const char *psz, size_t srcLen) const +{ + if ( m_options == MAP_INVALID_UTF8_NOT ) + return wxMBConvStrictUTF8::ToWChar(buf, n, psz, srcLen); + size_t len = 0; - while (*psz && ((!buf) || (len < n))) + while ((srcLen == wxNO_LEN ? *psz : srcLen--) && ((!buf) || (len < n))) { const char *opsz = psz; bool invalid = false; @@ -796,7 +1060,7 @@ size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const else { #ifdef WC_UTF16 - // cast is ok because wchar_t == wxUuint16 if WC_UTF16 + // cast is ok because wchar_t == wxUint16 if WC_UTF16 size_t pa = encode_utf16(res, (wxUint16 *)buf); if (pa == wxCONV_FAILED) { @@ -863,10 +1127,10 @@ size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const } } - if (buf && (len < n)) + if (srcLen == wxNO_LEN && buf && (len < n)) *buf = 0; - return len; + return len + 1; } static inline bool isoctal(wchar_t wch) @@ -874,11 +1138,15 @@ static inline bool isoctal(wchar_t wch) return L'0' <= wch && wch <= L'7'; } -size_t wxMBConvUTF8::WC2MB(char *buf, const wchar_t *psz, size_t n) const +size_t wxMBConvUTF8::FromWChar(char *buf, size_t n, + const wchar_t *psz, size_t srcLen) const { + if ( m_options == MAP_INVALID_UTF8_NOT ) + return wxMBConvStrictUTF8::FromWChar(buf, n, psz, srcLen); + size_t len = 0; - while (*psz && ((!buf) || (len < n))) + while ((srcLen == wxNO_LEN ? *psz : srcLen--) && ((!buf) || (len < n))) { wxUint32 cc; @@ -946,10 +1214,10 @@ size_t wxMBConvUTF8::WC2MB(char *buf, const wchar_t *psz, size_t n) const } } - if (buf && (len < n)) + if (srcLen == wxNO_LEN && buf && (len < n)) *buf = 0; - return len; + return len + 1; } // ============================================================================ @@ -1713,7 +1981,7 @@ wxMBConv_iconv::wxMBConv_iconv(const char *name) if ( m2w != ICONV_T_INVALID ) { char buf[2], *bufPtr; - wchar_t wbuf[2], *wbufPtr; + wchar_t wbuf[2]; size_t insz, outsz; size_t res; @@ -1722,12 +1990,12 @@ wxMBConv_iconv::wxMBConv_iconv(const char *name) wbuf[0] = 0; insz = 2; outsz = SIZEOF_WCHAR_T * 2; - wbufPtr = wbuf; + char* wbufPtr = (char*)wbuf; bufPtr = buf; res = iconv( m2w, ICONV_CHAR_CAST(&bufPtr), &insz, - (char**)&wbufPtr, &outsz); + &wbufPtr, &outsz); if (ICONV_FAILED(res, insz)) { @@ -1823,16 +2091,16 @@ size_t wxMBConv_iconv::MB2WC(wchar_t *buf, const char *psz, size_t n) const size_t outbuf = n * SIZEOF_WCHAR_T; size_t res, cres; - // VS: Use these instead of psz, buf because iconv() modifies its arguments: - wchar_t *bufPtr = buf; const char *pszPtr = psz; if (buf) { + char* bufPtr = (char*)buf; + // have destination buffer, convert there cres = iconv(m2w, ICONV_CHAR_CAST(&pszPtr), &inbuf, - (char**)&bufPtr, &outbuf); + &bufPtr, &outbuf); res = n - (outbuf / SIZEOF_WCHAR_T); if (ms_wcNeedsSwap) @@ -1855,12 +2123,12 @@ size_t wxMBConv_iconv::MB2WC(wchar_t *buf, const char *psz, size_t n) const do { - bufPtr = tbuf; + char* bufPtr = (char*)tbuf; outbuf = 8 * SIZEOF_WCHAR_T; cres = iconv(m2w, ICONV_CHAR_CAST(&pszPtr), &inbuf, - (char**)&bufPtr, &outbuf ); + &bufPtr, &outbuf ); res += 8 - (outbuf / SIZEOF_WCHAR_T); } @@ -1885,8 +2153,8 @@ size_t wxMBConv_iconv::WC2MB(char *buf, const wchar_t *psz, size_t n) const #endif size_t inlen = wxWcslen(psz); - size_t inbuf = inlen * SIZEOF_WCHAR_T; - size_t outbuf = n; + size_t inbuflen = inlen * SIZEOF_WCHAR_T; + size_t outbuflen = n; size_t res, cres; wchar_t *tmpbuf = 0; @@ -1896,7 +2164,7 @@ size_t wxMBConv_iconv::WC2MB(char *buf, const wchar_t *psz, size_t n) const // need to copy to temp buffer to switch endianness // (doing WC_BSWAP twice on the original buffer won't help, as it // could be in read-only memory, or be accessed in some other thread) - tmpbuf = (wchar_t *)malloc(inbuf + SIZEOF_WCHAR_T); + tmpbuf = (wchar_t *)malloc(inbuflen + SIZEOF_WCHAR_T); for ( size_t i = 0; i < inlen; i++ ) tmpbuf[n] = WC_BSWAP(psz[i]); @@ -1904,12 +2172,13 @@ size_t wxMBConv_iconv::WC2MB(char *buf, const wchar_t *psz, size_t n) const psz = tmpbuf; } + char* inbuf = (char*)psz; if (buf) { // have destination buffer, convert there - cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf ); + cres = iconv(w2m, ICONV_CHAR_CAST(&inbuf), &inbuflen, &buf, &outbuflen); - res = n - outbuf; + res = n - outbuflen; // NB: iconv was given only wcslen(psz) characters on input, and so // it couldn't convert the trailing zero. Let's do it ourselves @@ -1926,11 +2195,11 @@ size_t wxMBConv_iconv::WC2MB(char *buf, const wchar_t *psz, size_t n) const do { buf = tbuf; - outbuf = 16; + outbuflen = 16; - cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf ); + cres = iconv(w2m, ICONV_CHAR_CAST(&inbuf), &inbuflen, &buf, &outbuflen); - res += 16 - outbuf; + res += 16 - outbuflen; } while ((cres == (size_t)-1) && (errno == E2BIG)); } @@ -1940,7 +2209,7 @@ size_t wxMBConv_iconv::WC2MB(char *buf, const wchar_t *psz, size_t n) const free(tmpbuf); } - if (ICONV_FAILED(cres, inbuf)) + if (ICONV_FAILED(cres, inbuflen)) { wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode())); return wxCONV_FAILED; @@ -2162,26 +2431,38 @@ public: return wxCONV_FAILED; } - // if we were really converting, check if we succeeded - if ( buf ) + // we did something, check if we really succeeded + if ( flags ) { - if ( flags ) + // check if the conversion failed, i.e. if any replacements + // were done + if ( usedDef ) + return wxCONV_FAILED; + } + else // we must resort to double tripping... + { + // first we need to ensure that we really have the MB data: this is + // not the case if we're called with NULL buffer, in which case we + // need to do the conversion yet again + wxCharBuffer bufDef; + if ( !buf ) { - // check if the conversion failed, i.e. if any replacements - // were done - if ( usedDef ) + bufDef = wxCharBuffer(len); + buf = bufDef.data(); + if ( !::WideCharToMultiByte(m_CodePage, flags, pwz, -1, + buf, len, NULL, NULL) ) return wxCONV_FAILED; } - else // we must resort to double tripping... + + if ( !n ) + n = wcslen(pwz); + wxWCharBuffer wcBuf(n); + if ( MB2WC(wcBuf.data(), buf, n + 1) == wxCONV_FAILED || + wcscmp(wcBuf, pwz) != 0 ) { - wxWCharBuffer wcBuf(n); - if ( MB2WC(wcBuf.data(), buf, n) == wxCONV_FAILED || - wcscmp(wcBuf, pwz) != 0 ) - { - // we didn't obtain the same thing we started from, hence - // the conversion was lossy and we consider that it failed - return wxCONV_FAILED; - } + // we didn't obtain the same thing we started from, hence + // the conversion was lossy and we consider that it failed + return wxCONV_FAILED; } } @@ -2301,379 +2582,6 @@ private: #endif // wxHAVE_WIN32_MB2WC -// ============================================================================ -// Mac conversion classes -// ============================================================================ - -/* Although we are in the base library we currently have this wxMac - * conditional. This is not generally good but fortunately does not affect - * the ABI of the base library, only what encodings might work. - * It does mean that a wxBase built as part of wxMac has slightly more support - * than one built for wxCocoa or even wxGtk. - */ -#if defined(__WXMAC__) && defined(TARGET_CARBON) - -class wxMBConv_mac : public wxMBConv -{ -public: - wxMBConv_mac() - { - Init(CFStringGetSystemEncoding()) ; - } - - wxMBConv_mac(const wxMBConv_mac& conv) - { - Init(conv.m_char_encoding); - } - -#if wxUSE_FONTMAP - wxMBConv_mac(const char* name) - { - Init( wxMacGetSystemEncFromFontEnc( wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) ); - } -#endif - - wxMBConv_mac(wxFontEncoding encoding) - { - Init( wxMacGetSystemEncFromFontEnc(encoding) ); - } - - virtual ~wxMBConv_mac() - { - OSStatus status = noErr ; - if (m_MB2WC_converter) - status = TECDisposeConverter(m_MB2WC_converter); - if (m_WC2MB_converter) - status = TECDisposeConverter(m_WC2MB_converter); - } - - void Init( TextEncodingBase encoding,TextEncodingVariant encodingVariant = kTextEncodingDefaultVariant , - TextEncodingFormat encodingFormat = kTextEncodingDefaultFormat) - { - m_MB2WC_converter = NULL ; - m_WC2MB_converter = NULL ; - m_char_encoding = CreateTextEncoding(encoding, encodingVariant, encodingFormat) ; - m_unicode_encoding = CreateTextEncoding(kTextEncodingUnicodeDefault, 0, kUnicode16BitFormat) ; - } - - virtual void CreateIfNeeded() const - { - if ( m_MB2WC_converter == NULL && m_WC2MB_converter == NULL ) - { - OSStatus status = noErr ; - status = TECCreateConverter(&m_MB2WC_converter, - m_char_encoding, - m_unicode_encoding); - wxASSERT_MSG( status == noErr , _("Unable to create TextEncodingConverter")) ; - status = TECCreateConverter(&m_WC2MB_converter, - m_unicode_encoding, - m_char_encoding); - wxASSERT_MSG( status == noErr , _("Unable to create TextEncodingConverter")) ; - } - } - - size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const - { - CreateIfNeeded() ; - OSStatus status = noErr ; - ByteCount byteOutLen ; - ByteCount byteInLen = strlen(psz) + 1; - wchar_t *tbuf = NULL ; - UniChar* ubuf = NULL ; - size_t res = 0 ; - - if (buf == NULL) - { - // Apple specs say at least 32 - n = wxMax( 32, byteInLen ) ; - tbuf = (wchar_t*) malloc( n * SIZEOF_WCHAR_T ) ; - } - - ByteCount byteBufferLen = n * sizeof( UniChar ) ; - -#if SIZEOF_WCHAR_T == 4 - ubuf = (UniChar*) malloc( byteBufferLen + 2 ) ; -#else - ubuf = (UniChar*) (buf ? buf : tbuf) ; -#endif - - status = TECConvertText( - m_MB2WC_converter, (ConstTextPtr) psz, byteInLen, &byteInLen, - (TextPtr) ubuf, byteBufferLen, &byteOutLen); - -#if SIZEOF_WCHAR_T == 4 - // we have to terminate here, because n might be larger for the trailing zero, and if UniChar - // is not properly terminated we get random characters at the end - ubuf[byteOutLen / sizeof( UniChar ) ] = 0 ; - wxMBConvUTF16 converter ; - res = converter.MB2WC( (buf ? buf : tbuf), (const char*)ubuf, n ) ; - free( ubuf ) ; -#else - res = byteOutLen / sizeof( UniChar ) ; -#endif - - if ( buf == NULL ) - free(tbuf) ; - - if ( buf && res < n) - buf[res] = 0; - - return res ; - } - - size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const - { - CreateIfNeeded() ; - OSStatus status = noErr ; - ByteCount byteOutLen ; - ByteCount byteInLen = wxWcslen(psz) * SIZEOF_WCHAR_T ; - - char *tbuf = NULL ; - - if (buf == NULL) - { - // Apple specs say at least 32 - n = wxMax( 32, ((byteInLen / SIZEOF_WCHAR_T) * 8) + SIZEOF_WCHAR_T ); - tbuf = (char*) malloc( n ) ; - } - - ByteCount byteBufferLen = n ; - UniChar* ubuf = NULL ; - -#if SIZEOF_WCHAR_T == 4 - wxMBConvUTF16 converter ; - size_t unicharlen = converter.WC2MB( NULL, psz, 0 ) ; - byteInLen = unicharlen ; - ubuf = (UniChar*) malloc( byteInLen + 2 ) ; - converter.WC2MB( (char*) ubuf, psz, unicharlen + 2 ) ; -#else - ubuf = (UniChar*) psz ; -#endif - - status = TECConvertText( - m_WC2MB_converter, (ConstTextPtr) ubuf, byteInLen, &byteInLen, - (TextPtr) (buf ? buf : tbuf), byteBufferLen, &byteOutLen); - -#if SIZEOF_WCHAR_T == 4 - free( ubuf ) ; -#endif - - if ( buf == NULL ) - free(tbuf) ; - - size_t res = byteOutLen ; - if ( buf && res < n) - { - buf[res] = 0; - - //we need to double-trip to verify it didn't insert any ? in place - //of bogus characters - wxWCharBuffer wcBuf(n); - size_t pszlen = wxWcslen(psz); - if ( MB2WC(wcBuf.data(), buf, n) == wxCONV_FAILED || - wxWcslen(wcBuf) != pszlen || - memcmp(wcBuf, psz, pszlen * sizeof(wchar_t)) != 0 ) - { - // we didn't obtain the same thing we started from, hence - // the conversion was lossy and we consider that it failed - return wxCONV_FAILED; - } - } - - return res ; - } - - virtual wxMBConv *Clone() const { return new wxMBConv_mac(*this); } - - bool IsOk() const - { - CreateIfNeeded() ; - return m_MB2WC_converter != NULL && m_WC2MB_converter != NULL; - } - -protected : - mutable TECObjectRef m_MB2WC_converter; - mutable TECObjectRef m_WC2MB_converter; - - TextEncodingBase m_char_encoding; - TextEncodingBase m_unicode_encoding; -}; - -// MB is decomposed (D) normalized UTF8 - -class wxMBConv_macUTF8D : public wxMBConv_mac -{ -public : - wxMBConv_macUTF8D() - { - Init( kTextEncodingUnicodeDefault , kUnicodeNoSubset , kUnicodeUTF8Format ) ; - m_uni = NULL; - m_uniBack = NULL ; - } - - virtual ~wxMBConv_macUTF8D() - { - if (m_uni!=NULL) - DisposeUnicodeToTextInfo(&m_uni); - if (m_uniBack!=NULL) - DisposeUnicodeToTextInfo(&m_uniBack); - } - - size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const - { - CreateIfNeeded() ; - OSStatus status = noErr ; - ByteCount byteOutLen ; - ByteCount byteInLen = wxWcslen(psz) * SIZEOF_WCHAR_T ; - - char *tbuf = NULL ; - - if (buf == NULL) - { - // Apple specs say at least 32 - n = wxMax( 32, ((byteInLen / SIZEOF_WCHAR_T) * 8) + SIZEOF_WCHAR_T ); - tbuf = (char*) malloc( n ) ; - } - - ByteCount byteBufferLen = n ; - UniChar* ubuf = NULL ; - -#if SIZEOF_WCHAR_T == 4 - wxMBConvUTF16 converter ; - size_t unicharlen = converter.WC2MB( NULL, psz, 0 ) ; - byteInLen = unicharlen ; - ubuf = (UniChar*) malloc( byteInLen + 2 ) ; - converter.WC2MB( (char*) ubuf, psz, unicharlen + 2 ) ; -#else - ubuf = (UniChar*) psz ; -#endif - - // ubuf is a non-decomposed UniChar buffer - - ByteCount dcubuflen = byteInLen * 2 + 2 ; - ByteCount dcubufread , dcubufwritten ; - UniChar *dcubuf = (UniChar*) malloc( dcubuflen ) ; - - ConvertFromUnicodeToText( m_uni , byteInLen , ubuf , - kUnicodeDefaultDirectionMask, 0, NULL, NULL, NULL, dcubuflen , &dcubufread , &dcubufwritten , dcubuf ) ; - - // we now convert that decomposed buffer into UTF8 - - status = TECConvertText( - m_WC2MB_converter, (ConstTextPtr) dcubuf, dcubufwritten, &dcubufread, - (TextPtr) (buf ? buf : tbuf), byteBufferLen, &byteOutLen); - - free( dcubuf ); - -#if SIZEOF_WCHAR_T == 4 - free( ubuf ) ; -#endif - - if ( buf == NULL ) - free(tbuf) ; - - size_t res = byteOutLen ; - if ( buf && res < n) - { - buf[res] = 0; - // don't test for round-trip fidelity yet, we cannot guarantee it yet - } - - return res ; - } - - size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const - { - CreateIfNeeded() ; - OSStatus status = noErr ; - ByteCount byteOutLen ; - ByteCount byteInLen = strlen(psz) + 1; - wchar_t *tbuf = NULL ; - UniChar* ubuf = NULL ; - size_t res = 0 ; - - if (buf == NULL) - { - // Apple specs say at least 32 - n = wxMax( 32, byteInLen ) ; - tbuf = (wchar_t*) malloc( n * SIZEOF_WCHAR_T ) ; - } - - ByteCount byteBufferLen = n * sizeof( UniChar ) ; - -#if SIZEOF_WCHAR_T == 4 - ubuf = (UniChar*) malloc( byteBufferLen + 2 ) ; -#else - ubuf = (UniChar*) (buf ? buf : tbuf) ; -#endif - - ByteCount dcubuflen = byteBufferLen * 2 + 2 ; - ByteCount dcubufread , dcubufwritten ; - UniChar *dcubuf = (UniChar*) malloc( dcubuflen ) ; - - status = TECConvertText( - m_MB2WC_converter, (ConstTextPtr) psz, byteInLen, &byteInLen, - (TextPtr) dcubuf, dcubuflen, &byteOutLen); - // we have to terminate here, because n might be larger for the trailing zero, and if UniChar - // is not properly terminated we get random characters at the end - dcubuf[byteOutLen / sizeof( UniChar ) ] = 0 ; - - // now from the decomposed UniChar to properly composed uniChar - ConvertFromUnicodeToText( m_uniBack , byteOutLen , dcubuf , - kUnicodeDefaultDirectionMask, 0, NULL, NULL, NULL, dcubuflen , &dcubufread , &dcubufwritten , ubuf ) ; - - free( dcubuf ); - byteOutLen = dcubufwritten ; - ubuf[byteOutLen / sizeof( UniChar ) ] = 0 ; - - -#if SIZEOF_WCHAR_T == 4 - wxMBConvUTF16 converter ; - res = converter.MB2WC( (buf ? buf : tbuf), (const char*)ubuf, n ) ; - free( ubuf ) ; -#else - res = byteOutLen / sizeof( UniChar ) ; -#endif - - if ( buf == NULL ) - free(tbuf) ; - - if ( buf && res < n) - buf[res] = 0; - - return res ; - } - - virtual void CreateIfNeeded() const - { - wxMBConv_mac::CreateIfNeeded() ; - if ( m_uni == NULL ) - { - m_map.unicodeEncoding = CreateTextEncoding(kTextEncodingUnicodeDefault, - kUnicodeNoSubset, kTextEncodingDefaultFormat); - m_map.otherEncoding = CreateTextEncoding(kTextEncodingUnicodeDefault, - kUnicodeCanonicalDecompVariant, kTextEncodingDefaultFormat); - m_map.mappingVersion = kUnicodeUseLatestMapping; - - OSStatus err = CreateUnicodeToTextInfo(&m_map, &m_uni); - wxASSERT_MSG( err == noErr , _(" Couldn't create the UnicodeConverter")) ; - - m_map.unicodeEncoding = CreateTextEncoding(kTextEncodingUnicodeDefault, - kUnicodeNoSubset, kTextEncodingDefaultFormat); - m_map.otherEncoding = CreateTextEncoding(kTextEncodingUnicodeDefault, - kUnicodeCanonicalCompVariant, kTextEncodingDefaultFormat); - m_map.mappingVersion = kUnicodeUseLatestMapping; - err = CreateUnicodeToTextInfo(&m_map, &m_uniBack); - wxASSERT_MSG( err == noErr , _(" Couldn't create the UnicodeConverter")) ; - } - } -protected : - mutable UnicodeToTextInfo m_uni; - mutable UnicodeToTextInfo m_uniBack; - mutable UnicodeMapping m_map; -}; -#endif // defined(__WXMAC__) && defined(TARGET_CARBON) - // ============================================================================ // wxEncodingConverter based conversion classes // ============================================================================ @@ -2685,7 +2593,10 @@ class wxMBConv_wxwin : public wxMBConv private: void Init() { - m_ok = m2w.Init(m_enc, wxFONTENCODING_UNICODE) && + // Refuse to use broken wxEncodingConverter code for Mac-specific encodings. + // The wxMBConv_cf class does a better job. + m_ok = (m_enc < wxFONTENCODING_MACMIN || m_enc > wxFONTENCODING_MACMAX) && + m2w.Init(m_enc, wxFONTENCODING_UNICODE) && w2m.Init(wxFONTENCODING_UNICODE, m_enc); } @@ -2857,7 +2768,7 @@ void wxCSConv::SetName(const char *charset) { if (charset) { - m_name = strdup(charset); + m_name = wxStrdup(charset); m_deferred = true; } } @@ -2980,26 +2891,6 @@ wxMBConv *wxCSConv::DoCreate() const } #endif // wxHAVE_WIN32_MB2WC -#if defined(__WXMAC__) - { - // leave UTF16 and UTF32 to the built-ins of wx - if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE || - ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) ) - { -#if wxUSE_FONTMAP - wxMBConv_mac *conv = m_name ? new wxMBConv_mac(m_name) - : new wxMBConv_mac(m_encoding); -#else - wxMBConv_mac *conv = new wxMBConv_mac(m_encoding); -#endif - if ( conv->IsOk() ) - return conv; - - delete conv; - } - } -#endif - #ifdef __DARWIN__ { // leave UTF16 and UTF32 to the built-ins of wx @@ -3300,14 +3191,18 @@ wxCharBuffer wxSafeConvertWX2MB(const wchar_t *ws) #ifdef __WINDOWS__ WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConv_win32, wxConvLibc, wxEMPTY_PARAMETER_VALUE); -#elif defined(__WXMAC__) && !defined(__MACH__) - WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConv_mac, wxConvLibc, wxEMPTY_PARAMETER_VALUE); #else WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConvLibc, wxConvLibc, wxEMPTY_PARAMETER_VALUE); #endif -WX_DEFINE_GLOBAL_CONV(wxMBConvUTF8, wxConvUTF8, wxEMPTY_PARAMETER_VALUE); -WX_DEFINE_GLOBAL_CONV(wxMBConvUTF7, wxConvUTF7, wxEMPTY_PARAMETER_VALUE); +// NB: we can't use wxEMPTY_PARAMETER_VALUE as final argument here because it's +// passed to WX_DEFINE_GLOBAL_CONV2 after a macro expansion and so still +// provokes an error message about "not enough macro parameters"; and we +// can't use "()" here as the name##Obj declaration would be parsed as a +// function declaration then, so use a semicolon and live with an extra +// empty statement (and hope that no compilers warns about this) +WX_DEFINE_GLOBAL_CONV(wxMBConvStrictUTF8, wxConvUTF8, ;); +WX_DEFINE_GLOBAL_CONV(wxMBConvUTF7, wxConvUTF7, ;); WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvLocal, (wxFONTENCODING_SYSTEM)); WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvISO8859_1, (wxFONTENCODING_ISO8859_1)); @@ -3315,19 +3210,18 @@ WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvISO8859_1, (wxFONTENCODING_ISO8859_1)); WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = wxGet_wxConvLibcPtr(); WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvUI = wxGet_wxConvLocalPtr(); -#if defined(__WXMAC__) && defined(TARGET_CARBON) -static wxMBConv_macUTF8D wxConvMacUTF8DObj; +#ifdef __DARWIN__ +// The xnu kernel always communicates file paths in decomposed UTF-8. +// WARNING: Are we sure that CFString's conversion will cause decomposition? +static wxMBConv_cf wxConvMacUTF8DObj(wxFONTENCODING_UTF8); #endif + WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName = -#ifdef __WXOSX__ -#if defined(__WXMAC__) && defined(TARGET_CARBON) +#ifdef __DARWIN__ &wxConvMacUTF8DObj; -#else - wxGet_wxConvUTF8Ptr(); -#endif -#else // !__WXOSX__ +#else // !__DARWIN__ wxGet_wxConvLibcPtr(); -#endif // __WXOSX__/!__WXOSX__ +#endif // __DARWIN__/!__DARWIN__ #else // !wxUSE_WCHAR_T