X-Git-Url: https://git.saurik.com/wxWidgets.git/blobdiff_plain/d32a507d0268fe9af0aa317acd980167fe8d69c8..507abc7b3af8d73e7caf05261d1f40b51775ae7a:/src/common/strconv.cpp diff --git a/src/common/strconv.cpp b/src/common/strconv.cpp index 91483e70e0..a603d02f18 100644 --- a/src/common/strconv.cpp +++ b/src/common/strconv.cpp @@ -12,25 +12,15 @@ // Licence: wxWindows licence ///////////////////////////////////////////////////////////////////////////// -// ============================================================================ -// declarations -// ============================================================================ - -// ---------------------------------------------------------------------------- -// headers -// ---------------------------------------------------------------------------- - // For compilers that support precompilation, includes "wx.h". #include "wx/wxprec.h" -#ifdef __BORLANDC__ - #pragma hdrstop -#endif - #ifndef WX_PRECOMP #include "wx/intl.h" #include "wx/log.h" -#endif // WX_PRECOMP + #include "wx/utils.h" + #include "wx/hashmap.h" +#endif #include "wx/strconv.h" @@ -51,7 +41,7 @@ #if defined(__WIN32__) && !defined(__WXMICROWIN__) #define wxHAVE_WIN32_MB2WC -#endif // __WIN32__ but !__WXMICROWIN__ +#endif #ifdef __SALFORDC__ #include @@ -64,7 +54,6 @@ #include "wx/encconv.h" #include "wx/fontmap.h" -#include "wx/utils.h" #ifdef __WXMAC__ #ifndef __DARWIN__ @@ -73,15 +62,20 @@ #include #endif -#include "wx/mac/private.h" // includes mac headers +// includes Mac headers +#include "wx/mac/private.h" #endif + #define TRACE_STRCONV _T("strconv") +// WC_UTF16 is defined only if sizeof(wchar_t) == 2, otherwise it's supposed to +// be 4 bytes #if SIZEOF_WCHAR_T == 2 #define WC_UTF16 #endif + // ============================================================================ // implementation // ============================================================================ @@ -96,44 +90,45 @@ static bool NotAllNULs(const char *p, size_t n) } // ---------------------------------------------------------------------------- -// UTF-16 en/decoding to/from UCS-4 +// UTF-16 en/decoding to/from UCS-4 with surrogates handling // ---------------------------------------------------------------------------- - static size_t encode_utf16(wxUint32 input, wxUint16 *output) { - if (input<=0xffff) + if (input <= 0xffff) { if (output) *output = (wxUint16) input; + return 1; } - else if (input>=0x110000) + else if (input >= 0x110000) { - return (size_t)-1; + return wxCONV_FAILED; } else { if (output) { - *output++ = (wxUint16) ((input >> 10)+0xd7c0); - *output = (wxUint16) ((input&0x3ff)+0xdc00); + *output++ = (wxUint16) ((input >> 10) + 0xd7c0); + *output = (wxUint16) ((input & 0x3ff) + 0xdc00); } + return 2; } } static size_t decode_utf16(const wxUint16* input, wxUint32& output) { - if ((*input<0xd800) || (*input>0xdfff)) + if ((*input < 0xd800) || (*input > 0xdfff)) { output = *input; return 1; } - else if ((input[1]<0xdc00) || (input[1]>0xdfff)) + else if ((input[1] < 0xdc00) || (input[1] > 0xdfff)) { output = *input; - return (size_t)-1; + return wxCONV_FAILED; } else { @@ -142,6 +137,29 @@ static size_t decode_utf16(const wxUint16* input, wxUint32& output) } } +#ifdef WC_UTF16 + typedef wchar_t wxDecodeSurrogate_t; +#else // !WC_UTF16 + typedef wxUint16 wxDecodeSurrogate_t; +#endif // WC_UTF16/!WC_UTF16 + +// returns the next UTF-32 character from the wchar_t buffer and advances the +// pointer to the character after this one +// +// if an invalid character is found, *pSrc is set to NULL, the caller must +// check for this +static wxUint32 wxDecodeSurrogate(const wxDecodeSurrogate_t **pSrc) +{ + wxUint32 out; + const size_t + n = decode_utf16(wx_reinterpret_cast(const wxUint16 *, *pSrc), out); + if ( n == wxCONV_FAILED ) + *pSrc = NULL; + else + *pSrc += n; + + return out; +} // ---------------------------------------------------------------------------- // wxMBConv @@ -162,7 +180,7 @@ wxMBConv::ToWChar(wchar_t *dst, size_t dstLen, size_t dstWritten = 0; // the number of NULs terminating this string - size_t nulLen wxDUMMY_INITIALIZE(0); + size_t nulLen = 0; // not really needed, but just to avoid warnings // if we were not given the input size we just have to assume that the // string is properly terminated as we have no way of knowing how long it @@ -170,7 +188,7 @@ wxMBConv::ToWChar(wchar_t *dst, size_t dstLen, // NULs at the end wxCharBuffer bufTmp; const char *srcEnd; - if ( srcLen != (size_t)-1 ) + if ( srcLen != wxNO_LEN ) { // we need to know how to find the end of this string nulLen = GetMBNulLen(); @@ -201,21 +219,19 @@ wxMBConv::ToWChar(wchar_t *dst, size_t dstLen, { // try to convert the current chunk size_t lenChunk = MB2WC(NULL, src, 0); - if ( lenChunk == 0 ) - { - // nothing left in the input string, conversion succeeded; - // but still account for the trailing NULL - dstWritten++; - break; - } - if ( lenChunk == wxCONV_FAILED ) return wxCONV_FAILED; - lenChunk++; // for trailing NUL + lenChunk++; // for the L'\0' at the end of this chunk dstWritten += lenChunk; + if ( lenChunk == 1 ) + { + // nothing left in the input string, conversion succeeded + break; + } + if ( dst ) { if ( dstWritten > dstLen ) @@ -229,8 +245,8 @@ wxMBConv::ToWChar(wchar_t *dst, size_t dstLen, if ( !srcEnd ) { - // we convert the entire string in this case, as we suppose that the - // string is NUL-terminated and so srcEnd is not used at all + // we convert just one chunk in this case as this is the entire + // string anyhow break; } @@ -269,7 +285,7 @@ wxMBConv::FromWChar(char *dst, size_t dstLen, // if we don't know its length we have no choice but to assume that it is, // indeed, properly terminated wxWCharBuffer bufTmp; - if ( srcLen == (size_t)-1 ) + if ( srcLen == wxNO_LEN ) { srcLen = wxWcslen(src) + 1; } @@ -313,7 +329,7 @@ wxMBConv::FromWChar(char *dst, size_t dstLen, size_t wxMBConv::MB2WC(wchar_t *outBuff, const char *inBuff, size_t outLen) const { size_t rc = ToWChar(outBuff, outLen, inBuff); - if ( rc != (size_t)wxCONV_FAILED ) + if ( rc != wxCONV_FAILED ) { // ToWChar() returns the buffer length, i.e. including the trailing // NUL, while this method doesn't take it into account @@ -326,7 +342,7 @@ size_t wxMBConv::MB2WC(wchar_t *outBuff, const char *inBuff, size_t outLen) cons size_t wxMBConv::WC2MB(char *outBuff, const wchar_t *inBuff, size_t outLen) const { size_t rc = FromWChar(outBuff, outLen, inBuff); - if ( rc != (size_t)wxCONV_FAILED ) + if ( rc != wxCONV_FAILED ) { rc -= GetMBNulLen(); } @@ -345,7 +361,7 @@ const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const { // calculate the length of the buffer needed first const size_t nLen = MB2WC(NULL, psz, 0); - if ( nLen != (size_t)wxCONV_FAILED ) + if ( nLen != wxCONV_FAILED ) { // now do the actual conversion wxWCharBuffer buf(nLen /* +1 added implicitly */); @@ -364,7 +380,7 @@ const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const if ( pwz ) { const size_t nLen = WC2MB(NULL, pwz, 0); - if ( nLen != (size_t)wxCONV_FAILED ) + if ( nLen != wxCONV_FAILED ) { // extra space for trailing NUL(s) static const size_t extraLen = GetMaxMBNulLen(); @@ -382,13 +398,18 @@ const wxWCharBuffer wxMBConv::cMB2WC(const char *inBuff, size_t inLen, size_t *outLen) const { const size_t dstLen = ToWChar(NULL, 0, inBuff, inLen); - if ( dstLen != (size_t)wxCONV_FAILED ) + if ( dstLen != wxCONV_FAILED ) { wxWCharBuffer wbuf(dstLen - 1); - if ( ToWChar(wbuf.data(), dstLen, inBuff, inLen) ) + if ( ToWChar(wbuf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED ) { if ( outLen ) - *outLen = dstLen - 1; + { + *outLen = dstLen; + if ( wbuf[dstLen - 1] == L'\0' ) + (*outLen)--; + } + return wbuf; } } @@ -402,14 +423,27 @@ wxMBConv::cMB2WC(const char *inBuff, size_t inLen, size_t *outLen) const const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *inBuff, size_t inLen, size_t *outLen) const { - const size_t dstLen = FromWChar(NULL, 0, inBuff, inLen); - if ( dstLen != (size_t)wxCONV_FAILED ) + size_t dstLen = FromWChar(NULL, 0, inBuff, inLen); + if ( dstLen != wxCONV_FAILED ) { - wxCharBuffer buf(dstLen - 1); - if ( FromWChar(buf.data(), dstLen, inBuff, inLen) ) + // special case of empty input: can't allocate 0 size buffer below as + // wxCharBuffer insists on NUL-terminating it + wxCharBuffer buf(dstLen ? dstLen - 1 : 1); + if ( FromWChar(buf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED ) { if ( outLen ) - *outLen = dstLen - 1; + { + *outLen = dstLen; + + const size_t nulLen = GetMBNulLen(); + if ( dstLen >= nulLen && + !NotAllNULs(buf.data() + dstLen - nulLen, nulLen) ) + { + // in this case the output is NUL-terminated and we're not + // supposed to count NUL + *outLen -= nulLen; + } + } return buf; } @@ -551,7 +585,7 @@ size_t wxMBConvUTF7::MB2WC(wchar_t *buf, const char *psz, size_t n) const if ( !ok ) { // in valid UTF7 we should have valid characters after '+' - return (size_t)-1; + return wxCONV_FAILED; } if (*psz == '-') @@ -619,13 +653,14 @@ size_t wxMBConvUTF7::WC2MB(char *buf, const wchar_t *psz, size_t n) const else if (((wxUint32)cc) > 0xffff) { // no surrogate pair generation (yet?) - return (size_t)-1; + return wxCONV_FAILED; } #endif else { if (buf) *buf++ = '+'; + len++; if (cc != '+') { @@ -646,14 +681,17 @@ size_t wxMBConvUTF7::WC2MB(char *buf, const wchar_t *psz, size_t n) const len++; } } + cc = *psz; if (!(cc) || (cc < 0x80 && utf7encode[cc] < 1)) break; } + if (l != 0) { if (buf) *buf++ = utf7enb64[((d % 16) << (6 - l)) % 64]; + len++; } } @@ -736,6 +774,7 @@ size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const psz++; res = (res << 6) | (cc & 0x3f); } + if (invalid || res <= utf8_max[ocnt]) { // illegal UTF-8 encoding @@ -754,7 +793,7 @@ size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const #ifdef WC_UTF16 // cast is ok because wchar_t == wxUuint16 if WC_UTF16 size_t pa = encode_utf16(res, (wxUint16 *)buf); - if (pa == (size_t)-1) + if (pa == wxCONV_FAILED) { invalid = true; } @@ -771,6 +810,7 @@ size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const #endif // WC_UTF16/!WC_UTF16 } } + if (invalid) { if (m_options & MAP_INVALID_UTF8_TO_PUA) @@ -780,7 +820,7 @@ size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const #ifdef WC_UTF16 // cast is ok because wchar_t == wxUuint16 if WC_UTF16 size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf); - wxASSERT(pa != (size_t)-1); + wxASSERT(pa != wxCONV_FAILED); if (buf) buf += pa; opsz++; @@ -812,7 +852,7 @@ size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const } else // MAP_INVALID_UTF8_NOT { - return (size_t)-1; + return wxCONV_FAILED; } } } @@ -840,7 +880,7 @@ size_t wxMBConvUTF8::WC2MB(char *buf, const wchar_t *psz, size_t n) const #ifdef WC_UTF16 // cast is ok for WC_UTF16 size_t pa = decode_utf16((const wxUint16 *)psz, cc); - psz += (pa == (size_t)-1) ? 1 : pa; + psz += (pa == wxCONV_FAILED) ? 1 : pa; #else cc = (*psz++) & 0x7fffffff; #endif @@ -888,7 +928,6 @@ size_t wxMBConvUTF8::WC2MB(char *buf, const wchar_t *psz, size_t n) const *buf++ = (char) cc; len++; } - else { len += cnt + 1; @@ -908,9 +947,9 @@ size_t wxMBConvUTF8::WC2MB(char *buf, const wchar_t *psz, size_t n) const return len; } -// ---------------------------------------------------------------------------- +// ============================================================================ // UTF-16 -// ---------------------------------------------------------------------------- +// ============================================================================ #ifdef WORDS_BIGENDIAN #define wxMBConvUTF16straight wxMBConvUTF16BE @@ -920,491 +959,589 @@ size_t wxMBConvUTF8::WC2MB(char *buf, const wchar_t *psz, size_t n) const #define wxMBConvUTF16straight wxMBConvUTF16LE #endif +/* static */ +size_t wxMBConvUTF16Base::GetLength(const char *src, size_t srcLen) +{ + if ( srcLen == wxNO_LEN ) + { + // count the number of bytes in input, including the trailing NULs + const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src); + for ( srcLen = 1; *inBuff++; srcLen++ ) + ; + srcLen *= BYTES_PER_CHAR; + } + else // we already have the length + { + // we can only convert an entire number of UTF-16 characters + if ( srcLen % BYTES_PER_CHAR ) + return wxCONV_FAILED; + } + + return srcLen; +} + +// case when in-memory representation is UTF-16 too #ifdef WC_UTF16 -// copy 16bit MB to 16bit String -size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const +// ---------------------------------------------------------------------------- +// conversions without endianness change +// ---------------------------------------------------------------------------- + +size_t +wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen, + const char *src, size_t srcLen) const { - size_t len = 0; + // set up the scene for using memcpy() (which is presumably more efficient + // than copying the bytes one by one) + srcLen = GetLength(src, srcLen); + if ( srcLen == wxNO_LEN ) + return wxCONV_FAILED; - while (*(wxUint16*)psz && (!buf || len < n)) + const size_t inLen = srcLen / BYTES_PER_CHAR; + if ( dst ) { - if (buf) - *buf++ = *(wxUint16*)psz; - len++; + if ( dstLen < inLen ) + return wxCONV_FAILED; - psz += sizeof(wxUint16); + memcpy(dst, src, srcLen); } - if (buf && len < n) - *buf = 0; - - return len; + return inLen; } - -// copy 16bit String to 16bit MB -size_t wxMBConvUTF16straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const +size_t +wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen, + const wchar_t *src, size_t srcLen) const { - size_t len = 0; + if ( srcLen == wxNO_LEN ) + srcLen = wxWcslen(src) + 1; + + srcLen *= BYTES_PER_CHAR; - while (*psz && (!buf || len < n)) + if ( dst ) { - if (buf) - { - *(wxUint16*)buf = *psz; - buf += sizeof(wxUint16); - } + if ( dstLen < srcLen ) + return wxCONV_FAILED; - len += sizeof(wxUint16); - psz++; + memcpy(dst, src, srcLen); } - if (buf && len <= n - sizeof(wxUint16)) - *(wxUint16*)buf = 0; - - return len; + return srcLen; } +// ---------------------------------------------------------------------------- +// endian-reversing conversions +// ---------------------------------------------------------------------------- -// swap 16bit MB to 16bit String -size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const +size_t +wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen, + const char *src, size_t srcLen) const { - size_t len = 0; + srcLen = GetLength(src, srcLen); + if ( srcLen == wxNO_LEN ) + return wxCONV_FAILED; - // UTF16 string must be terminated by 2 NULs as single NULs may occur - // inside the string - while ( (psz[0] || psz[1]) && (!buf || len < n) ) + srcLen /= BYTES_PER_CHAR; + + if ( dst ) { - if ( buf ) + if ( dstLen < srcLen ) + return wxCONV_FAILED; + + const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src); + for ( size_t n = 0; n < srcLen; n++, inBuff++ ) { - ((char *)buf)[0] = psz[1]; - ((char *)buf)[1] = psz[0]; - buf++; + *dst++ = wxUINT16_SWAP_ALWAYS(*inBuff); } - len++; - psz += 2; } - if ( buf && len < n ) - *buf = L'\0'; - - return len; + return srcLen; } - -// swap 16bit MB to 16bit String -size_t wxMBConvUTF16swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const +size_t +wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen, + const wchar_t *src, size_t srcLen) const { - size_t len = 0; + if ( srcLen == wxNO_LEN ) + srcLen = wxWcslen(src) + 1; + + srcLen *= BYTES_PER_CHAR; - while ( *psz && (!buf || len < n) ) + if ( dst ) { - if ( buf ) + if ( dstLen < srcLen ) + return wxCONV_FAILED; + + wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst); + for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ ) { - *buf++ = ((char*)psz)[1]; - *buf++ = ((char*)psz)[0]; + *outBuff++ = wxUINT16_SWAP_ALWAYS(*src); } - - len += 2; - psz++; } - if ( buf && len < n - 1 ) - { - buf[0] = - buf[1] = '\0'; - } - - return len; + return srcLen; } +#else // !WC_UTF16: wchar_t is UTF-32 -#else // WC_UTF16 - +// ---------------------------------------------------------------------------- +// conversions without endianness change +// ---------------------------------------------------------------------------- -// copy 16bit MB to 32bit String -size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const +size_t +wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen, + const char *src, size_t srcLen) const { - size_t len = 0; + srcLen = GetLength(src, srcLen); + if ( srcLen == wxNO_LEN ) + return wxCONV_FAILED; - while (*(wxUint16*)psz && (!buf || len < n)) + const size_t inLen = srcLen / BYTES_PER_CHAR; + if ( !dst ) { - wxUint32 cc; - size_t pa = decode_utf16((wxUint16*)psz, cc); - if (pa == (size_t)-1) - return pa; + // optimization: return maximal space which could be needed for this + // string even if the real size could be smaller if the buffer contains + // any surrogates + return inLen; + } - if (buf) - *buf++ = (wchar_t)cc; - len++; - psz += pa * sizeof(wxUint16); + size_t outLen = 0; + const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src); + for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; ) + { + const wxUint32 ch = wxDecodeSurrogate(&inBuff); + if ( !inBuff ) + return wxCONV_FAILED; + + if ( ++outLen > dstLen ) + return wxCONV_FAILED; + + *dst++ = ch; } - if (buf && len < n) - *buf = 0; - return len; + return outLen; } - -// copy 32bit String to 16bit MB -size_t wxMBConvUTF16straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const +size_t +wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen, + const wchar_t *src, size_t srcLen) const { - size_t len=0; + if ( srcLen == wxNO_LEN ) + srcLen = wxWcslen(src) + 1; - while (*psz && (!buf || len < n)) + size_t outLen = 0; + wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst); + for ( size_t n = 0; n < srcLen; n++ ) { wxUint16 cc[2]; - size_t pa = encode_utf16(*psz, cc); - - if (pa == (size_t)-1) - return pa; + const size_t numChars = encode_utf16(*src++, cc); + if ( numChars == wxCONV_FAILED ) + return wxCONV_FAILED; - if (buf) + outLen += numChars * BYTES_PER_CHAR; + if ( outBuff ) { - *(wxUint16*)buf = cc[0]; - buf += sizeof(wxUint16); - if (pa > 1) + if ( outLen > dstLen ) + return wxCONV_FAILED; + + *outBuff++ = cc[0]; + if ( numChars == 2 ) { - *(wxUint16*)buf = cc[1]; - buf += sizeof(wxUint16); + // second character of a surrogate + *outBuff++ = cc[1]; } } - - len += pa*sizeof(wxUint16); - psz++; } - if (buf && len <= n - sizeof(wxUint16)) - *(wxUint16*)buf = 0; - - return len; + return outLen; } +// ---------------------------------------------------------------------------- +// endian-reversing conversions +// ---------------------------------------------------------------------------- -// swap 16bit MB to 32bit String -size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const +size_t +wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen, + const char *src, size_t srcLen) const { - size_t len=0; + srcLen = GetLength(src, srcLen); + if ( srcLen == wxNO_LEN ) + return wxCONV_FAILED; - while (*(wxUint16*)psz && (!buf || len < n)) + const size_t inLen = srcLen / BYTES_PER_CHAR; + if ( !dst ) { - wxUint32 cc; - char tmp[4]; + // optimization: return maximal space which could be needed for this + // string even if the real size could be smaller if the buffer contains + // any surrogates + return inLen; + } + + size_t outLen = 0; + const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src); + for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; ) + { + wxUint32 ch; + wxUint16 tmp[2]; - tmp[0] = psz[1]; - tmp[1] = psz[0]; - tmp[2] = psz[3]; - tmp[3] = psz[2]; + tmp[0] = wxUINT16_SWAP_ALWAYS(*inBuff); + inBuff++; + tmp[1] = wxUINT16_SWAP_ALWAYS(*inBuff); - size_t pa = decode_utf16((wxUint16*)tmp, cc); - if (pa == (size_t)-1) - return pa; + const size_t numChars = decode_utf16(tmp, ch); + if ( numChars == wxCONV_FAILED ) + return wxCONV_FAILED; - if (buf) - *buf++ = (wchar_t)cc; + if ( numChars == 2 ) + inBuff++; - len++; - psz += pa * sizeof(wxUint16); + if ( ++outLen > dstLen ) + return wxCONV_FAILED; + + *dst++ = ch; } - if (buf && len < n) - *buf = 0; - return len; + return outLen; } - -// swap 32bit String to 16bit MB -size_t wxMBConvUTF16swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const +size_t +wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen, + const wchar_t *src, size_t srcLen) const { - size_t len = 0; + if ( srcLen == wxNO_LEN ) + srcLen = wxWcslen(src) + 1; - while (*psz && (!buf || len < n)) + size_t outLen = 0; + wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst); + for ( const wchar_t *srcEnd = src + srcLen; src < srcEnd; src++ ) { wxUint16 cc[2]; - size_t pa = encode_utf16(*psz, cc); - - if (pa == (size_t)-1) - return pa; + const size_t numChars = encode_utf16(*src, cc); + if ( numChars == wxCONV_FAILED ) + return wxCONV_FAILED; - if (buf) + outLen += numChars * BYTES_PER_CHAR; + if ( outBuff ) { - *buf++ = ((char*)cc)[1]; - *buf++ = ((char*)cc)[0]; - if (pa > 1) + if ( outLen > dstLen ) + return wxCONV_FAILED; + + *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[0]); + if ( numChars == 2 ) { - *buf++ = ((char*)cc)[3]; - *buf++ = ((char*)cc)[2]; + // second character of a surrogate + *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[1]); } } - - len += pa * sizeof(wxUint16); - psz++; } - if (buf && len <= n - sizeof(wxUint16)) - *(wxUint16*)buf = 0; - - return len; + return outLen; } -#endif // WC_UTF16 +#endif // WC_UTF16/!WC_UTF16 -// ---------------------------------------------------------------------------- +// ============================================================================ // UTF-32 -// ---------------------------------------------------------------------------- +// ============================================================================ #ifdef WORDS_BIGENDIAN -#define wxMBConvUTF32straight wxMBConvUTF32BE -#define wxMBConvUTF32swap wxMBConvUTF32LE + #define wxMBConvUTF32straight wxMBConvUTF32BE + #define wxMBConvUTF32swap wxMBConvUTF32LE #else -#define wxMBConvUTF32swap wxMBConvUTF32BE -#define wxMBConvUTF32straight wxMBConvUTF32LE + #define wxMBConvUTF32swap wxMBConvUTF32BE + #define wxMBConvUTF32straight wxMBConvUTF32LE #endif WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE; WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE; +/* static */ +size_t wxMBConvUTF32Base::GetLength(const char *src, size_t srcLen) +{ + if ( srcLen == wxNO_LEN ) + { + // count the number of bytes in input, including the trailing NULs + const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src); + for ( srcLen = 1; *inBuff++; srcLen++ ) + ; + srcLen *= BYTES_PER_CHAR; + } + else // we already have the length + { + // we can only convert an entire number of UTF-32 characters + if ( srcLen % BYTES_PER_CHAR ) + return wxCONV_FAILED; + } + + return srcLen; +} + +// case when in-memory representation is UTF-16 #ifdef WC_UTF16 -// copy 32bit MB to 16bit String -size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const +// ---------------------------------------------------------------------------- +// conversions without endianness change +// ---------------------------------------------------------------------------- + +size_t +wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen, + const char *src, size_t srcLen) const { - size_t len = 0; + srcLen = GetLength(src, srcLen); + if ( srcLen == wxNO_LEN ) + return wxCONV_FAILED; - while (*(wxUint32*)psz && (!buf || len < n)) + const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src); + const size_t inLen = srcLen / BYTES_PER_CHAR; + size_t outLen = 0; + for ( size_t n = 0; n < inLen; n++ ) { wxUint16 cc[2]; + const size_t numChars = encode_utf16(*inBuff++, cc); + if ( numChars == wxCONV_FAILED ) + return wxCONV_FAILED; - size_t pa = encode_utf16(*(wxUint32*)psz, cc); - if (pa == (size_t)-1) - return pa; - - if (buf) + outLen += numChars; + if ( dst ) { - *buf++ = cc[0]; - if (pa > 1) - *buf++ = cc[1]; - } + if ( outLen > dstLen ) + return wxCONV_FAILED; - len += pa; - psz += sizeof(wxUint32); + *dst++ = cc[0]; + if ( numChars == 2 ) + { + // second character of a surrogate + *dst++ = cc[1]; + } + } } - if (buf && len < n) - *buf = 0; - - return len; + return outLen; } - -// copy 16bit String to 32bit MB -size_t wxMBConvUTF32straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const +size_t +wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen, + const wchar_t *src, size_t srcLen) const { - size_t len = 0; + if ( srcLen == wxNO_LEN ) + srcLen = wxWcslen(src) + 1; - while (*psz && (!buf || len < n)) + if ( !dst ) { - wxUint32 cc; + // optimization: return maximal space which could be needed for this + // string instead of the exact amount which could be less if there are + // any surrogates in the input + // + // we consider that surrogates are rare enough to make it worthwhile to + // avoid running the loop below at the cost of slightly extra memory + // consumption + return srcLen * BYTES_PER_CHAR; + } - // cast is ok for WC_UTF16 - size_t pa = decode_utf16((const wxUint16 *)psz, cc); - if (pa == (size_t)-1) - return pa; + wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst); + size_t outLen = 0; + for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; ) + { + const wxUint32 ch = wxDecodeSurrogate(&src); + if ( !src ) + return wxCONV_FAILED; - if (buf) - { - *(wxUint32*)buf = cc; - buf += sizeof(wxUint32); - } + outLen += BYTES_PER_CHAR; - len += sizeof(wxUint32); - psz += pa; - } + if ( outLen > dstLen ) + return wxCONV_FAILED; - if (buf && len <= n - sizeof(wxUint32)) - *(wxUint32*)buf = 0; + *outBuff++ = ch; + } - return len; + return outLen; } +// ---------------------------------------------------------------------------- +// endian-reversing conversions +// ---------------------------------------------------------------------------- -// swap 32bit MB to 16bit String -size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const +size_t +wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen, + const char *src, size_t srcLen) const { - size_t len = 0; + srcLen = GetLength(src, srcLen); + if ( srcLen == wxNO_LEN ) + return wxCONV_FAILED; - while (*(wxUint32*)psz && (!buf || len < n)) + const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src); + const size_t inLen = srcLen / BYTES_PER_CHAR; + size_t outLen = 0; + for ( size_t n = 0; n < inLen; n++, inBuff++ ) { - char tmp[4]; - tmp[0] = psz[3]; - tmp[1] = psz[2]; - tmp[2] = psz[1]; - tmp[3] = psz[0]; - wxUint16 cc[2]; + const size_t numChars = encode_utf16(wxUINT32_SWAP_ALWAYS(*inBuff), cc); + if ( numChars == wxCONV_FAILED ) + return wxCONV_FAILED; - size_t pa = encode_utf16(*(wxUint32*)tmp, cc); - if (pa == (size_t)-1) - return pa; - - if (buf) + outLen += numChars; + if ( dst ) { - *buf++ = cc[0]; - if (pa > 1) - *buf++ = cc[1]; - } + if ( outLen > dstLen ) + return wxCONV_FAILED; - len += pa; - psz += sizeof(wxUint32); + *dst++ = cc[0]; + if ( numChars == 2 ) + { + // second character of a surrogate + *dst++ = cc[1]; + } + } } - if (buf && len < n) - *buf = 0; - - return len; + return outLen; } - -// swap 16bit String to 32bit MB -size_t wxMBConvUTF32swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const +size_t +wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen, + const wchar_t *src, size_t srcLen) const { - size_t len = 0; + if ( srcLen == wxNO_LEN ) + srcLen = wxWcslen(src) + 1; + + if ( !dst ) + { + // optimization: return maximal space which could be needed for this + // string instead of the exact amount which could be less if there are + // any surrogates in the input + // + // we consider that surrogates are rare enough to make it worthwhile to + // avoid running the loop below at the cost of slightly extra memory + // consumption + return srcLen*BYTES_PER_CHAR; + } - while (*psz && (!buf || len < n)) + wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst); + size_t outLen = 0; + for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; ) { - char cc[4]; + const wxUint32 ch = wxDecodeSurrogate(&src); + if ( !src ) + return wxCONV_FAILED; - // cast is ok for WC_UTF16 - size_t pa = decode_utf16((const wxUint16 *)psz, *(wxUint32*)cc); - if (pa == (size_t)-1) - return pa; + outLen += BYTES_PER_CHAR; - if (buf) - { - *buf++ = cc[3]; - *buf++ = cc[2]; - *buf++ = cc[1]; - *buf++ = cc[0]; - } + if ( outLen > dstLen ) + return wxCONV_FAILED; - len += sizeof(wxUint32); - psz += pa; + *outBuff++ = wxUINT32_SWAP_ALWAYS(ch); } - if (buf && len <= n - sizeof(wxUint32)) - *(wxUint32*)buf = 0; - - return len; + return outLen; } -#else // WC_UTF16 +#else // !WC_UTF16: wchar_t is UTF-32 +// ---------------------------------------------------------------------------- +// conversions without endianness change +// ---------------------------------------------------------------------------- -// copy 32bit MB to 32bit String -size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const +size_t +wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen, + const char *src, size_t srcLen) const { - size_t len=0; + // use memcpy() as it should be much faster than hand-written loop + srcLen = GetLength(src, srcLen); + if ( srcLen == wxNO_LEN ) + return wxCONV_FAILED; - while (*(wxUint32*)psz && (!buf || len < n)) + const size_t inLen = srcLen/BYTES_PER_CHAR; + if ( dst ) { - if (buf) - *buf++ = (wchar_t)(*(wxUint32*)psz); - len++; - psz += sizeof(wxUint32); - } + if ( dstLen < inLen ) + return wxCONV_FAILED; - if (buf && len < n) - *buf = 0; + memcpy(dst, src, srcLen); + } - return len; + return inLen; } - -// copy 32bit String to 32bit MB -size_t wxMBConvUTF32straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const +size_t +wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen, + const wchar_t *src, size_t srcLen) const { - size_t len = 0; + if ( srcLen == wxNO_LEN ) + srcLen = wxWcslen(src) + 1; + + srcLen *= BYTES_PER_CHAR; - while (*psz && (!buf || len < n)) + if ( dst ) { - if (buf) - { - *(wxUint32*)buf = *psz; - buf += sizeof(wxUint32); - } + if ( dstLen < srcLen ) + return wxCONV_FAILED; - len += sizeof(wxUint32); - psz++; + memcpy(dst, src, srcLen); } - if (buf && len <= n - sizeof(wxUint32)) - *(wxUint32*)buf = 0; - - return len; + return srcLen; } +// ---------------------------------------------------------------------------- +// endian-reversing conversions +// ---------------------------------------------------------------------------- -// swap 32bit MB to 32bit String -size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const +size_t +wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen, + const char *src, size_t srcLen) const { - size_t len = 0; + srcLen = GetLength(src, srcLen); + if ( srcLen == wxNO_LEN ) + return wxCONV_FAILED; + + srcLen /= BYTES_PER_CHAR; - while (*(wxUint32*)psz && (!buf || len < n)) + if ( dst ) { - if (buf) + if ( dstLen < srcLen ) + return wxCONV_FAILED; + + const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src); + for ( size_t n = 0; n < srcLen; n++, inBuff++ ) { - ((char *)buf)[0] = psz[3]; - ((char *)buf)[1] = psz[2]; - ((char *)buf)[2] = psz[1]; - ((char *)buf)[3] = psz[0]; - buf++; + *dst++ = wxUINT32_SWAP_ALWAYS(*inBuff); } - - len++; - psz += sizeof(wxUint32); } - if (buf && len < n) - *buf = 0; - - return len; + return srcLen; } - -// swap 32bit String to 32bit MB -size_t wxMBConvUTF32swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const +size_t +wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen, + const wchar_t *src, size_t srcLen) const { - size_t len = 0; + if ( srcLen == wxNO_LEN ) + srcLen = wxWcslen(src) + 1; + + srcLen *= BYTES_PER_CHAR; - while (*psz && (!buf || len < n)) + if ( dst ) { - if (buf) + if ( dstLen < srcLen ) + return wxCONV_FAILED; + + wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst); + for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ ) { - *buf++ = ((char *)psz)[3]; - *buf++ = ((char *)psz)[2]; - *buf++ = ((char *)psz)[1]; - *buf++ = ((char *)psz)[0]; + *outBuff++ = wxUINT32_SWAP_ALWAYS(*src); } - - len += sizeof(wxUint32); - psz++; } - if (buf && len <= n - sizeof(wxUint32)) - *(wxUint32*)buf = 0; - - return len; + return srcLen; } - -#endif // WC_UTF16 +#endif // WC_UTF16/!WC_UTF16 // ============================================================================ @@ -1470,8 +1607,8 @@ public: { return (m2w != ICONV_T_INVALID) && (w2m != ICONV_T_INVALID); } protected: - // the iconv handlers used to translate from multibyte to wide char and in - // the other direction + // the iconv handlers used to translate from multibyte + // to wide char and in the other direction iconv_t m2w, w2m; @@ -1548,11 +1685,12 @@ wxMBConv_iconv::wxMBConv_iconv(const wxChar *name) // first try charset with explicit bytesex info (e.g. "UCS-4LE"): wxString nameXE(nameCS); - #ifdef WORDS_BIGENDIAN + +#ifdef WORDS_BIGENDIAN nameXE += _T("BE"); - #else // little endian +#else // little endian nameXE += _T("LE"); - #endif +#endif wxLogTrace(TRACE_STRCONV, _T(" trying charset \"%s\""), nameXE.c_str()); @@ -1581,8 +1719,9 @@ wxMBConv_iconv::wxMBConv_iconv(const wxChar *name) wbufPtr = wbuf; bufPtr = buf; - res = iconv(m2w, ICONV_CHAR_CAST(&bufPtr), &insz, - (char**)&wbufPtr, &outsz); + res = iconv( + m2w, ICONV_CHAR_CAST(&bufPtr), &insz, + (char**)&wbufPtr, &outsz); if (ICONV_FAILED(res, insz)) { @@ -1648,7 +1787,7 @@ size_t wxMBConv_iconv::MB2WC(wchar_t *buf, const char *psz, size_t n) const switch ( nulLen ) { default: - return (size_t)-1; + return wxCONV_FAILED; case 1: inbuf = strlen(psz); // arguably more optimized than our version @@ -1667,8 +1806,8 @@ size_t wxMBConv_iconv::MB2WC(wchar_t *buf, const char *psz, size_t n) const } #if wxUSE_THREADS - // NB: iconv() is MT-safe, but each thread must use it's own iconv_t handle. - // Unfortunately there is a couple of global wxCSConv objects such as + // NB: iconv() is MT-safe, but each thread must use its own iconv_t handle. + // Unfortunately there are a couple of global wxCSConv objects such as // wxConvLocal that are used all over wx code, so we have to make sure // the handle is used by at most one thread at the time. Otherwise // only a few wx classes would be safe to use from non-main threads @@ -1726,7 +1865,7 @@ size_t wxMBConv_iconv::MB2WC(wchar_t *buf, const char *psz, size_t n) const { //VS: it is ok if iconv fails, hence trace only wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode())); - return (size_t)-1; + return wxCONV_FAILED; } return res; @@ -1774,7 +1913,7 @@ size_t wxMBConv_iconv::WC2MB(char *buf, const wchar_t *psz, size_t n) const } else { - // no destination buffer... convert using temp buffer + // no destination buffer: convert using temp buffer // to calculate destination buffer requirement char tbuf[16]; res = 0; @@ -1798,7 +1937,7 @@ size_t wxMBConv_iconv::WC2MB(char *buf, const wchar_t *psz, size_t n) const if (ICONV_FAILED(cres, inbuf)) { wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode())); - return (size_t)-1; + return wxCONV_FAILED; } return res; @@ -1859,6 +1998,7 @@ public: } wxMBConv_win32(const wxMBConv_win32& conv) + : wxMBConv() { m_CodePage = conv.m_CodePage; m_minMBCharWidth = conv.m_minMBCharWidth; @@ -1920,7 +2060,7 @@ public: if ( !len ) { // function totally failed - return (size_t)-1; + return wxCONV_FAILED; } // if we were really converting and didn't use MB_ERR_INVALID_CHARS, @@ -1944,7 +2084,7 @@ public: { // we didn't obtain the same thing we started from, hence // the conversion was lossy and we consider that it failed - return (size_t)-1; + return wxCONV_FAILED; } } @@ -2005,7 +2145,7 @@ public: if ( !len ) { // function totally failed - return (size_t)-1; + return wxCONV_FAILED; } // if we were really converting, check if we succeeded @@ -2016,17 +2156,17 @@ public: // check if the conversion failed, i.e. if any replacements // were done if ( usedDef ) - return (size_t)-1; + return wxCONV_FAILED; } else // we must resort to double tripping... { wxWCharBuffer wcBuf(n); - if ( MB2WC(wcBuf.data(), buf, n) == (size_t)-1 || + if ( MB2WC(wcBuf.data(), buf, n) == wxCONV_FAILED || wcscmp(wcBuf, pwz) != 0 ) { // we didn't obtain the same thing we started from, hence // the conversion was lossy and we consider that it failed - return (size_t)-1; + return wxCONV_FAILED; } } } @@ -2097,7 +2237,7 @@ private: break; default: - // unknown, be conservative by default + // unknown: be conservative by default s_isWin98Or2k = 0; break; } @@ -2154,7 +2294,7 @@ private: // RN: There is no UTF-32 support in either Core Foundation or Cocoa. // Strangely enough, internally Core Foundation uses -// UTF 32 internally quite a bit - its just not public (yet). +// UTF-32 internally quite a bit - its just not public (yet). #include #include @@ -2485,12 +2625,12 @@ public: CFRelease(theString); - szUniCharBuffer[nOutLength] = '\0' ; + szUniCharBuffer[nOutLength] = '\0'; #if SIZEOF_WCHAR_T == 4 - wxMBConvUTF16 converter ; - converter.MB2WC(szOut, (const char*)szUniCharBuffer , nOutSize ) ; - delete[] szUniCharBuffer; + wxMBConvUTF16 converter; + converter.MB2WC( szOut, (const char*)szUniCharBuffer, nOutSize ); + delete [] szUniCharBuffer; #endif return nOutLength; @@ -2506,9 +2646,9 @@ public: #if SIZEOF_WCHAR_T == 4 wxMBConvUTF16 converter ; - nBufSize = converter.WC2MB( NULL , szUnConv , 0 ); - szUniBuffer = new UniChar[ (nBufSize / sizeof(UniChar)) + 1] ; - converter.WC2MB( (char*) szUniBuffer , szUnConv, nBufSize + sizeof(UniChar)) ; + nBufSize = converter.WC2MB( NULL, szUnConv, 0 ); + szUniBuffer = new UniChar[ (nBufSize / sizeof(UniChar)) + 1]; + converter.WC2MB( (char*) szUniBuffer, szUnConv, nBufSize + sizeof(UniChar)); nBufSize /= sizeof(UniChar); #endif @@ -2590,7 +2730,7 @@ public: #if wxUSE_FONTMAP wxMBConv_mac(const wxChar* name) { - Init( wxMacGetSystemEncFromFontEnc( wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) ) ; + Init( wxMacGetSystemEncFromFontEnc( wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) ); } #endif @@ -2602,39 +2742,52 @@ public: ~wxMBConv_mac() { OSStatus status = noErr ; - status = TECDisposeConverter(m_MB2WC_converter); - status = TECDisposeConverter(m_WC2MB_converter); + if (m_MB2WC_converter) + status = TECDisposeConverter(m_MB2WC_converter); + if (m_WC2MB_converter) + status = TECDisposeConverter(m_WC2MB_converter); } - - void Init( TextEncodingBase encoding) + void Init( TextEncodingBase encoding,TextEncodingVariant encodingVariant = kTextEncodingDefaultVariant , + TextEncodingFormat encodingFormat = kTextEncodingDefaultFormat) { - OSStatus status = noErr ; - m_char_encoding = encoding ; + m_MB2WC_converter = NULL ; + m_WC2MB_converter = NULL ; + m_char_encoding = CreateTextEncoding(encoding, encodingVariant, encodingFormat) ; m_unicode_encoding = CreateTextEncoding(kTextEncodingUnicodeDefault, 0, kUnicode16BitFormat) ; + } - status = TECCreateConverter(&m_MB2WC_converter, + virtual void CreateIfNeeded() const + { + if ( m_MB2WC_converter == NULL && m_WC2MB_converter == NULL ) + { + OSStatus status = noErr ; + status = TECCreateConverter(&m_MB2WC_converter, m_char_encoding, m_unicode_encoding); - status = TECCreateConverter(&m_WC2MB_converter, + wxASSERT_MSG( status == noErr , _("Unable to create TextEncodingConverter")) ; + status = TECCreateConverter(&m_WC2MB_converter, m_unicode_encoding, m_char_encoding); + wxASSERT_MSG( status == noErr , _("Unable to create TextEncodingConverter")) ; + } } - + size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const { + CreateIfNeeded() ; OSStatus status = noErr ; ByteCount byteOutLen ; - ByteCount byteInLen = strlen(psz) ; + ByteCount byteInLen = strlen(psz) + 1; wchar_t *tbuf = NULL ; UniChar* ubuf = NULL ; size_t res = 0 ; if (buf == NULL) { - //apple specs say at least 32 - n = wxMax( 32 , byteInLen ) ; - tbuf = (wchar_t*) malloc( n * SIZEOF_WCHAR_T) ; + // Apple specs say at least 32 + n = wxMax( 32, byteInLen ) ; + tbuf = (wchar_t*) malloc( n * SIZEOF_WCHAR_T ) ; } ByteCount byteBufferLen = n * sizeof( UniChar ) ; @@ -2644,14 +2797,17 @@ public: #else ubuf = (UniChar*) (buf ? buf : tbuf) ; #endif - status = TECConvertText(m_MB2WC_converter, (ConstTextPtr) psz , byteInLen, &byteInLen, - (TextPtr) ubuf , byteBufferLen, &byteOutLen); + + status = TECConvertText( + m_MB2WC_converter, (ConstTextPtr) psz, byteInLen, &byteInLen, + (TextPtr) ubuf, byteBufferLen, &byteOutLen); + #if SIZEOF_WCHAR_T == 4 // we have to terminate here, because n might be larger for the trailing zero, and if UniChar // is not properly terminated we get random characters at the end ubuf[byteOutLen / sizeof( UniChar ) ] = 0 ; wxMBConvUTF16 converter ; - res = converter.MB2WC( (buf ? buf : tbuf) , (const char*)ubuf , n ) ; + res = converter.MB2WC( (buf ? buf : tbuf), (const char*)ubuf, n ) ; free( ubuf ) ; #else res = byteOutLen / sizeof( UniChar ) ; @@ -2668,6 +2824,7 @@ public: size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const { + CreateIfNeeded() ; OSStatus status = noErr ; ByteCount byteOutLen ; ByteCount byteInLen = wxWcslen(psz) * SIZEOF_WCHAR_T ; @@ -2676,8 +2833,8 @@ public: if (buf == NULL) { - //apple specs say at least 32 - n = wxMax( 32 , ((byteInLen / SIZEOF_WCHAR_T) * 8) + SIZEOF_WCHAR_T ); + // Apple specs say at least 32 + n = wxMax( 32, ((byteInLen / SIZEOF_WCHAR_T) * 8) + SIZEOF_WCHAR_T ); tbuf = (char*) malloc( n ) ; } @@ -2686,10 +2843,10 @@ public: #if SIZEOF_WCHAR_T == 4 wxMBConvUTF16 converter ; - size_t unicharlen = converter.WC2MB( NULL , psz , 0 ) ; + size_t unicharlen = converter.WC2MB( NULL, psz, 0 ) ; byteInLen = unicharlen ; ubuf = (UniChar*) malloc( byteInLen + 2 ) ; - converter.WC2MB( (char*) ubuf , psz, unicharlen + 2 ) ; + converter.WC2MB( (char*) ubuf, psz, unicharlen + 2 ) ; #else ubuf = (UniChar*) psz ; #endif @@ -2714,13 +2871,13 @@ public: //of bogus characters wxWCharBuffer wcBuf(n); size_t pszlen = wxWcslen(psz); - if ( MB2WC(wcBuf.data(), buf, n) == (size_t)-1 || + if ( MB2WC(wcBuf.data(), buf, n) == wxCONV_FAILED || wxWcslen(wcBuf) != pszlen || memcmp(wcBuf, psz, pszlen * sizeof(wchar_t)) != 0 ) { // we didn't obtain the same thing we started from, hence // the conversion was lossy and we consider that it failed - return (size_t)-1; + return wxCONV_FAILED; } } @@ -2730,16 +2887,192 @@ public: virtual wxMBConv *Clone() const { return new wxMBConv_mac(*this); } bool IsOk() const - { return m_MB2WC_converter != NULL && m_WC2MB_converter != NULL; } + { + CreateIfNeeded() ; + return m_MB2WC_converter != NULL && m_WC2MB_converter != NULL; + } -private: - TECObjectRef m_MB2WC_converter; - TECObjectRef m_WC2MB_converter; +protected : + mutable TECObjectRef m_MB2WC_converter; + mutable TECObjectRef m_WC2MB_converter; TextEncodingBase m_char_encoding; TextEncodingBase m_unicode_encoding; }; +// MB is decomposed (D) normalized UTF8 + +class wxMBConv_macUTF8D : public wxMBConv_mac +{ +public : + wxMBConv_macUTF8D() + { + Init( kTextEncodingUnicodeDefault , kUnicodeNoSubset , kUnicodeUTF8Format ) ; + m_uni = NULL; + m_uniBack = NULL ; + } + + ~wxMBConv_macUTF8D() + { + if (m_uni!=NULL) + DisposeUnicodeToTextInfo(&m_uni); + if (m_uniBack!=NULL) + DisposeUnicodeToTextInfo(&m_uniBack); + } + + size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const + { + CreateIfNeeded() ; + OSStatus status = noErr ; + ByteCount byteOutLen ; + ByteCount byteInLen = wxWcslen(psz) * SIZEOF_WCHAR_T ; + + char *tbuf = NULL ; + + if (buf == NULL) + { + // Apple specs say at least 32 + n = wxMax( 32, ((byteInLen / SIZEOF_WCHAR_T) * 8) + SIZEOF_WCHAR_T ); + tbuf = (char*) malloc( n ) ; + } + + ByteCount byteBufferLen = n ; + UniChar* ubuf = NULL ; + +#if SIZEOF_WCHAR_T == 4 + wxMBConvUTF16 converter ; + size_t unicharlen = converter.WC2MB( NULL, psz, 0 ) ; + byteInLen = unicharlen ; + ubuf = (UniChar*) malloc( byteInLen + 2 ) ; + converter.WC2MB( (char*) ubuf, psz, unicharlen + 2 ) ; +#else + ubuf = (UniChar*) psz ; +#endif + + // ubuf is a non-decomposed UniChar buffer + + ByteCount dcubuflen = byteInLen * 2 + 2 ; + ByteCount dcubufread , dcubufwritten ; + UniChar *dcubuf = (UniChar*) malloc( dcubuflen ) ; + + ConvertFromUnicodeToText( m_uni , byteInLen , ubuf , + kUnicodeDefaultDirectionMask, 0, NULL, NULL, NULL, dcubuflen , &dcubufread , &dcubufwritten , dcubuf ) ; + + // we now convert that decomposed buffer into UTF8 + + status = TECConvertText( + m_WC2MB_converter, (ConstTextPtr) dcubuf, dcubufwritten, &dcubufread, + (TextPtr) (buf ? buf : tbuf), byteBufferLen, &byteOutLen); + + free( dcubuf ); + +#if SIZEOF_WCHAR_T == 4 + free( ubuf ) ; +#endif + + if ( buf == NULL ) + free(tbuf) ; + + size_t res = byteOutLen ; + if ( buf && res < n) + { + buf[res] = 0; + // don't test for round-trip fidelity yet, we cannot guarantee it yet + } + + return res ; + } + + size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const + { + CreateIfNeeded() ; + OSStatus status = noErr ; + ByteCount byteOutLen ; + ByteCount byteInLen = strlen(psz) + 1; + wchar_t *tbuf = NULL ; + UniChar* ubuf = NULL ; + size_t res = 0 ; + + if (buf == NULL) + { + // Apple specs say at least 32 + n = wxMax( 32, byteInLen ) ; + tbuf = (wchar_t*) malloc( n * SIZEOF_WCHAR_T ) ; + } + + ByteCount byteBufferLen = n * sizeof( UniChar ) ; + +#if SIZEOF_WCHAR_T == 4 + ubuf = (UniChar*) malloc( byteBufferLen + 2 ) ; +#else + ubuf = (UniChar*) (buf ? buf : tbuf) ; +#endif + + ByteCount dcubuflen = byteBufferLen * 2 + 2 ; + ByteCount dcubufread , dcubufwritten ; + UniChar *dcubuf = (UniChar*) malloc( dcubuflen ) ; + + status = TECConvertText( + m_MB2WC_converter, (ConstTextPtr) psz, byteInLen, &byteInLen, + (TextPtr) dcubuf, dcubuflen, &byteOutLen); + // we have to terminate here, because n might be larger for the trailing zero, and if UniChar + // is not properly terminated we get random characters at the end + dcubuf[byteOutLen / sizeof( UniChar ) ] = 0 ; + + // now from the decomposed UniChar to properly composed uniChar + ConvertFromUnicodeToText( m_uniBack , byteOutLen , dcubuf , + kUnicodeDefaultDirectionMask, 0, NULL, NULL, NULL, dcubuflen , &dcubufread , &dcubufwritten , ubuf ) ; + + free( dcubuf ); + byteOutLen = dcubufwritten ; + ubuf[byteOutLen / sizeof( UniChar ) ] = 0 ; + + +#if SIZEOF_WCHAR_T == 4 + wxMBConvUTF16 converter ; + res = converter.MB2WC( (buf ? buf : tbuf), (const char*)ubuf, n ) ; + free( ubuf ) ; +#else + res = byteOutLen / sizeof( UniChar ) ; +#endif + + if ( buf == NULL ) + free(tbuf) ; + + if ( buf && res < n) + buf[res] = 0; + + return res ; + } + + virtual void CreateIfNeeded() const + { + wxMBConv_mac::CreateIfNeeded() ; + if ( m_uni == NULL ) + { + m_map.unicodeEncoding = CreateTextEncoding(kTextEncodingUnicodeDefault, + kUnicodeNoSubset, kTextEncodingDefaultFormat); + m_map.otherEncoding = CreateTextEncoding(kTextEncodingUnicodeDefault, + kUnicodeCanonicalDecompVariant, kTextEncodingDefaultFormat); + m_map.mappingVersion = kUnicodeUseLatestMapping; + + OSStatus err = CreateUnicodeToTextInfo(&m_map, &m_uni); + wxASSERT_MSG( err == noErr , _(" Couldn't create the UnicodeConverter")) ; + + m_map.unicodeEncoding = CreateTextEncoding(kTextEncodingUnicodeDefault, + kUnicodeNoSubset, kTextEncodingDefaultFormat); + m_map.otherEncoding = CreateTextEncoding(kTextEncodingUnicodeDefault, + kUnicodeCanonicalCompVariant, kTextEncodingDefaultFormat); + m_map.mappingVersion = kUnicodeUseLatestMapping; + err = CreateUnicodeToTextInfo(&m_map, &m_uniBack); + wxASSERT_MSG( err == noErr , _(" Couldn't create the UnicodeConverter")) ; + } + } +protected : + mutable UnicodeToTextInfo m_uni; + mutable UnicodeToTextInfo m_uniBack; + mutable UnicodeMapping m_map; +}; #endif // defined(__WXMAC__) && defined(TARGET_CARBON) // ============================================================================ @@ -2783,7 +3116,7 @@ public: if (buf) { if (!m2w.Convert(psz, buf)) - return (size_t)-1; + return wxCONV_FAILED; } return inbuf; } @@ -2794,7 +3127,7 @@ public: if (buf) { if (!w2m.Convert(psz, buf)) - return (size_t)-1; + return wxCONV_FAILED; } return inbuf; @@ -2931,7 +3264,6 @@ void wxCSConv::SetName(const wxChar *charset) } #if wxUSE_FONTMAP -#include "wx/hashmap.h" WX_DECLARE_HASH_MAP( wxFontEncoding, wxString, wxIntegerHash, wxIntegerEqual, wxEncodingNameCache ); @@ -3128,10 +3460,10 @@ wxMBConv *wxCSConv::DoCreate() const // NB: This is a hack to prevent deadlock. What could otherwise happen // in Unicode build: wxConvLocal creation ends up being here // because of some failure and logs the error. But wxLog will try to - // attach timestamp, for which it will need wxConvLocal (to convert - // time to char* and then wchar_t*), but that fails, tries to log - // error, but wxLog has a (already locked) critical section that - // guards static buffer. + // attach a timestamp, for which it will need wxConvLocal (to convert + // time to char* and then wchar_t*), but that fails, tries to log the + // error, but wxLog has an (already locked) critical section that + // guards the static buffer. static bool alreadyLoggingError = false; if (!alreadyLoggingError) { @@ -3205,7 +3537,7 @@ size_t wxCSConv::WC2MB(char *buf, const wchar_t *psz, size_t n) const for (size_t c = 0; c <= len; c++) { if (psz[c] > 0xFF) - return (size_t)-1; + return wxCONV_FAILED; buf[c] = (char)psz[c]; } @@ -3215,7 +3547,7 @@ size_t wxCSConv::WC2MB(char *buf, const wchar_t *psz, size_t n) const for (size_t c = 0; c <= len; c++) { if (psz[c] > 0xFF) - return (size_t)-1; + return wxCONV_FAILED; } } @@ -3250,21 +3582,27 @@ static wxCSConv wxConvLocalObj(wxFONTENCODING_SYSTEM); static wxCSConv wxConvISO8859_1Obj(wxFONTENCODING_ISO8859_1); static wxMBConvUTF7 wxConvUTF7Obj; static wxMBConvUTF8 wxConvUTF8Obj; - +#if defined(__WXMAC__) && defined(TARGET_CARBON) +static wxMBConv_macUTF8D wxConvMacUTF8DObj; +#endif WXDLLIMPEXP_DATA_BASE(wxMBConv&) wxConvLibc = wxConvLibcObj; WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvLocal = wxConvLocalObj; WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvISO8859_1 = wxConvISO8859_1Obj; WXDLLIMPEXP_DATA_BASE(wxMBConvUTF7&) wxConvUTF7 = wxConvUTF7Obj; WXDLLIMPEXP_DATA_BASE(wxMBConvUTF8&) wxConvUTF8 = wxConvUTF8Obj; WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = &wxConvLibcObj; +WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvUI = &wxConvLocal; WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName = & #ifdef __WXOSX__ +#if defined(__WXMAC__) && defined(TARGET_CARBON) + wxConvMacUTF8DObj; +#else wxConvUTF8Obj; +#endif #else wxConvLibcObj; #endif - #else // !wxUSE_WCHAR_T // stand-ins in absence of wchar_t