X-Git-Url: https://git.saurik.com/wxWidgets.git/blobdiff_plain/8907154c1a8a6882c6797d1f16393ddfb23e7f3a..137c8bde085d6d5b7c459902d2ea1a198ab48765:/src/common/strconv.cpp?ds=inline diff --git a/src/common/strconv.cpp b/src/common/strconv.cpp index 1136bde236..ef07724797 100644 --- a/src/common/strconv.cpp +++ b/src/common/strconv.cpp @@ -1,5 +1,5 @@ ///////////////////////////////////////////////////////////////////////////// -// Name: strconv.cpp +// Name: src/common/strconv.cpp // Purpose: Unicode conversion classes // Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik, // Ryan Norton, Fredrik Roubert (UTF7) @@ -12,25 +12,15 @@ // Licence: wxWindows licence ///////////////////////////////////////////////////////////////////////////// -// ============================================================================ -// declarations -// ============================================================================ - -// ---------------------------------------------------------------------------- -// headers -// ---------------------------------------------------------------------------- - // For compilers that support precompilation, includes "wx.h". 
#include "wx/wxprec.h" -#ifdef __BORLANDC__ - #pragma hdrstop -#endif - #ifndef WX_PRECOMP #include "wx/intl.h" #include "wx/log.h" -#endif // WX_PRECOMP + #include "wx/utils.h" + #include "wx/hashmap.h" +#endif #include "wx/strconv.h" @@ -51,7 +41,7 @@ #if defined(__WIN32__) && !defined(__WXMICROWIN__) #define wxHAVE_WIN32_MB2WC -#endif // __WIN32__ but !__WXMICROWIN__ +#endif #ifdef __SALFORDC__ #include @@ -64,7 +54,6 @@ #include "wx/encconv.h" #include "wx/fontmap.h" -#include "wx/utils.h" #ifdef __WXMAC__ #ifndef __DARWIN__ @@ -73,54 +62,73 @@ #include #endif -#include "wx/mac/private.h" // includes mac headers +// includes Mac headers +#include "wx/mac/private.h" #endif + #define TRACE_STRCONV _T("strconv") +// WC_UTF16 is defined only if sizeof(wchar_t) == 2, otherwise it's supposed to +// be 4 bytes +#if SIZEOF_WCHAR_T == 2 + #define WC_UTF16 +#endif + + // ============================================================================ // implementation // ============================================================================ +// helper function of cMB2WC(): check if n bytes at this location are all NUL +static bool NotAllNULs(const char *p, size_t n) +{ + while ( n && *p++ == '\0' ) + n--; + + return n != 0; +} + // ---------------------------------------------------------------------------- -// UTF-16 en/decoding to/from UCS-4 +// UTF-16 en/decoding to/from UCS-4 with surrogates handling // ---------------------------------------------------------------------------- - static size_t encode_utf16(wxUint32 input, wxUint16 *output) { - if (input<=0xffff) + if (input <= 0xffff) { if (output) *output = (wxUint16) input; + return 1; } - else if (input>=0x110000) + else if (input >= 0x110000) { - return (size_t)-1; + return wxCONV_FAILED; } else { if (output) { - *output++ = (wxUint16) ((input >> 10)+0xd7c0); - *output = (wxUint16) ((input&0x3ff)+0xdc00); + *output++ = (wxUint16) ((input >> 10) + 0xd7c0); + *output = (wxUint16) ((input & 0x3ff) + 0xdc00); } + 
return 2; } } static size_t decode_utf16(const wxUint16* input, wxUint32& output) { - if ((*input<0xd800) || (*input>0xdfff)) + if ((*input < 0xd800) || (*input > 0xdfff)) { output = *input; return 1; } - else if ((input[1]<0xdc00) || (input[1]>0xdfff)) + else if ((input[1] < 0xdc00) || (input[1] > 0xdfff)) { output = *input; - return (size_t)-1; + return wxCONV_FAILED; } else { @@ -129,183 +137,322 @@ static size_t decode_utf16(const wxUint16* input, wxUint32& output) } } +#ifdef WC_UTF16 + typedef wchar_t wxDecodeSurrogate_t; +#else // !WC_UTF16 + typedef wxUint16 wxDecodeSurrogate_t; +#endif // WC_UTF16/!WC_UTF16 + +// returns the next UTF-32 character from the wchar_t buffer and advances the +// pointer to the character after this one +// +// if an invalid character is found, *pSrc is set to NULL, the caller must +// check for this +static wxUint32 wxDecodeSurrogate(const wxDecodeSurrogate_t **pSrc) +{ + wxUint32 out; + const size_t + n = decode_utf16(wx_reinterpret_cast(const wxUint16 *, *pSrc), out); + if ( n == wxCONV_FAILED ) + *pSrc = NULL; + else + *pSrc += n; + + return out; +} // ---------------------------------------------------------------------------- // wxMBConv // ---------------------------------------------------------------------------- -wxMBConv::~wxMBConv() +size_t +wxMBConv::ToWChar(wchar_t *dst, size_t dstLen, + const char *src, size_t srcLen) const { - // nothing to do here (necessary for Darwin linking probably) -} + // although new conversion classes are supposed to implement this function + // directly, the existing ones only implement the old MB2WC() and so, to + // avoid having to rewrite all conversion classes at once, we provide a + // default (but not efficient) implementation of this one in terms of the + // old function by copying the input to ensure that it's NUL-terminated and + // then using MB2WC() to convert it + + // the number of chars [which would be] written to dst [if it were not NULL] + size_t dstWritten = 0; + + // 
the number of NULs terminating this string + size_t nulLen = 0; // not really needed, but just to avoid warnings + + // if we were not given the input size we just have to assume that the + // string is properly terminated as we have no way of knowing how long it + // is anyhow, but if we do have the size check whether there are enough + // NULs at the end + wxCharBuffer bufTmp; + const char *srcEnd; + if ( srcLen != wxNO_LEN ) + { + // we need to know how to find the end of this string + nulLen = GetMBNulLen(); + if ( nulLen == wxCONV_FAILED ) + return wxCONV_FAILED; + + // if there are enough NULs we can avoid the copy + if ( srcLen < nulLen || NotAllNULs(src + srcLen - nulLen, nulLen) ) + { + // make a copy in order to properly NUL-terminate the string + bufTmp = wxCharBuffer(srcLen + nulLen - 1 /* 1 will be added */); + char * const p = bufTmp.data(); + memcpy(p, src, srcLen); + for ( char *s = p + srcLen; s < p + srcLen + nulLen; s++ ) + *s = '\0'; + + src = bufTmp; + } -const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const -{ - if ( psz ) + srcEnd = src + srcLen; + } + else // quit after the first loop iteration { - // calculate the length of the buffer needed first - size_t nLen = MB2WC(NULL, psz, 0); - if ( nLen != (size_t)-1 ) + srcEnd = NULL; + } + + for ( ;; ) + { + // try to convert the current chunk + size_t lenChunk = MB2WC(NULL, src, 0); + if ( lenChunk == wxCONV_FAILED ) + return wxCONV_FAILED; + + lenChunk++; // for the L'\0' at the end of this chunk + + dstWritten += lenChunk; + + if ( lenChunk == 1 ) { - // now do the actual conversion - wxWCharBuffer buf(nLen); - nLen = MB2WC(buf.data(), psz, nLen + 1); // with the trailing NULL - if ( nLen != (size_t)-1 ) - { - return buf; - } + // nothing left in the input string, conversion succeeded + break; } - } - wxWCharBuffer buf((wchar_t *)NULL); + if ( dst ) + { + if ( dstWritten > dstLen ) + return wxCONV_FAILED; - return buf; -} + if ( MB2WC(dst, src, lenChunk) == wxCONV_FAILED ) + return 
wxCONV_FAILED; -const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const -{ - if ( pwz ) - { - size_t nLen = WC2MB(NULL, pwz, 0); - if ( nLen != (size_t)-1 ) + dst += lenChunk; + } + + if ( !srcEnd ) { - wxCharBuffer buf(nLen+3); // space for a wxUint32 trailing zero - nLen = WC2MB(buf.data(), pwz, nLen + 4); - if ( nLen != (size_t)-1 ) - { - return buf; - } + // we convert just one chunk in this case as this is the entire + // string anyhow + break; } - } - wxCharBuffer buf((char *)NULL); + // advance the input pointer past the end of this chunk + while ( NotAllNULs(src, nulLen) ) + { + // notice that we must skip over multiple bytes here as we suppose + // that if NUL takes 2 or 4 bytes, then all the other characters do + // too and so if we advanced by a single byte we might erroneously + // detect sequences of NUL bytes in the middle of the input + src += nulLen; + } - return buf; -} + src += nulLen; // skipping over its terminator as well -const wxWCharBuffer wxMBConv::cMB2WC(const char *szString, size_t nStringLen, size_t* pOutSize) const -{ - wxASSERT(pOutSize != NULL); + // note that ">=" (and not just "==") is needed here as the terminator + // we skipped just above could be inside or just after the buffer + // delimited by srcEnd + if ( src >= srcEnd ) + break; + } - const char* szEnd = szString + nStringLen + 1; - const char* szPos = szString; - const char* szStart = szPos; + return dstWritten; +} - size_t nActualLength = 0; - size_t nCurrentSize = nStringLen; //try normal size first (should never resize?) 
+size_t +wxMBConv::FromWChar(char *dst, size_t dstLen, + const wchar_t *src, size_t srcLen) const +{ + // the number of chars [which would be] written to dst [if it were not NULL] + size_t dstWritten = 0; - wxWCharBuffer theBuffer(nCurrentSize); + // make a copy of the input string unless it is already properly + // NUL-terminated + // + // if we don't know its length we have no choice but to assume that it is, + // indeed, properly terminated + wxWCharBuffer bufTmp; + if ( srcLen == wxNO_LEN ) + { + srcLen = wxWcslen(src) + 1; + } + else if ( srcLen != 0 && src[srcLen - 1] != L'\0' ) + { + // make a copy in order to properly NUL-terminate the string + bufTmp = wxWCharBuffer(srcLen); + memcpy(bufTmp.data(), src, srcLen * sizeof(wchar_t)); + src = bufTmp; + } - //Convert the string until the length() is reached, continuing the - //loop every time a null character is reached - while(szPos != szEnd) + const size_t lenNul = GetMBNulLen(); + for ( const wchar_t * const srcEnd = src + srcLen; + src < srcEnd; + src += wxWcslen(src) + 1 /* skip L'\0' too */ ) { - wxASSERT(szPos < szEnd); //something is _really_ screwed up if this rings true + // try to convert the current chunk + size_t lenChunk = WC2MB(NULL, src, 0); - //Get the length of the current (sub)string - size_t nLen = MB2WC(NULL, szPos, 0); + if ( lenChunk == wxCONV_FAILED ) + return wxCONV_FAILED; - //Invalid conversion? 
- if( nLen == (size_t)-1 ) - { - *pOutSize = 0; - theBuffer.data()[0u] = wxT('\0'); - return theBuffer; - } + lenChunk += lenNul; + dstWritten += lenChunk; + if ( dst ) + { + if ( dstWritten > dstLen ) + return wxCONV_FAILED; - //Increase the actual length (+1 for current null character) - nActualLength += nLen + 1; + if ( WC2MB(dst, src, lenChunk) == wxCONV_FAILED ) + return wxCONV_FAILED; - //if buffer too big, realloc the buffer - if (nActualLength > (nCurrentSize+1)) - { - wxWCharBuffer theNewBuffer(nCurrentSize << 1); - memcpy(theNewBuffer.data(), theBuffer.data(), nCurrentSize * sizeof(wchar_t)); - theBuffer = theNewBuffer; - nCurrentSize <<= 1; + dst += lenChunk; } + } - //Convert the current (sub)string - if ( MB2WC(&theBuffer.data()[szPos - szStart], szPos, nLen + 1) == (size_t)-1 ) - { - *pOutSize = 0; - theBuffer.data()[0u] = wxT('\0'); - return theBuffer; - } + return dstWritten; +} - //Increment to next (sub)string - //Note that we have to use strlen instead of nLen here - //because XX2XX gives us the size of the output buffer, - //which is not necessarily the length of the string - szPos += strlen(szPos) + 1; +size_t wxMBConv::MB2WC(wchar_t *outBuff, const char *inBuff, size_t outLen) const +{ + size_t rc = ToWChar(outBuff, outLen, inBuff); + if ( rc != wxCONV_FAILED ) + { + // ToWChar() returns the buffer length, i.e. 
including the trailing + // NUL, while this method doesn't take it into account + rc--; } - //success - return actual length and the buffer - *pOutSize = nActualLength; - return theBuffer; + return rc; } -const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *szString, size_t nStringLen, size_t* pOutSize) const +size_t wxMBConv::WC2MB(char *outBuff, const wchar_t *inBuff, size_t outLen) const { - wxASSERT(pOutSize != NULL); - - const wchar_t* szEnd = szString + nStringLen + 1; - const wchar_t* szPos = szString; - const wchar_t* szStart = szPos; + size_t rc = FromWChar(outBuff, outLen, inBuff); + if ( rc != wxCONV_FAILED ) + { + rc -= GetMBNulLen(); + } - size_t nActualLength = 0; - size_t nCurrentSize = nStringLen << 2; //try * 4 first + return rc; +} - wxCharBuffer theBuffer(nCurrentSize); +wxMBConv::~wxMBConv() +{ + // nothing to do here (necessary for Darwin linking probably) +} - //Convert the string until the length() is reached, continuing the - //loop every time a null character is reached - while(szPos != szEnd) +const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const +{ + if ( psz ) { - wxASSERT(szPos < szEnd); //something is _really_ screwed up if this rings true + // calculate the length of the buffer needed first + const size_t nLen = MB2WC(NULL, psz, 0); + if ( nLen != wxCONV_FAILED ) + { + // now do the actual conversion + wxWCharBuffer buf(nLen /* +1 added implicitly */); + + // +1 for the trailing NULL + if ( MB2WC(buf.data(), psz, nLen + 1) != wxCONV_FAILED ) + return buf; + } + } - //Get the length of the current (sub)string - size_t nLen = WC2MB(NULL, szPos, 0); + return wxWCharBuffer(); +} - //Invalid conversion? 
- if( nLen == (size_t)-1 ) +const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const +{ + if ( pwz ) + { + const size_t nLen = WC2MB(NULL, pwz, 0); + if ( nLen != wxCONV_FAILED ) { - *pOutSize = 0; - theBuffer.data()[0u] = wxT('\0'); - return theBuffer; + // extra space for trailing NUL(s) + static const size_t extraLen = GetMaxMBNulLen(); + + wxCharBuffer buf(nLen + extraLen - 1); + if ( WC2MB(buf.data(), pwz, nLen + extraLen) != wxCONV_FAILED ) + return buf; } + } - //Increase the actual length (+1 for current null character) - nActualLength += nLen + 1; + return wxCharBuffer(); +} - //if buffer too big, realloc the buffer - if (nActualLength > (nCurrentSize+1)) +const wxWCharBuffer +wxMBConv::cMB2WC(const char *inBuff, size_t inLen, size_t *outLen) const +{ + const size_t dstLen = ToWChar(NULL, 0, inBuff, inLen); + if ( dstLen != wxCONV_FAILED ) + { + wxWCharBuffer wbuf(dstLen - 1); + if ( ToWChar(wbuf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED ) { - wxCharBuffer theNewBuffer(nCurrentSize << 1); - memcpy(theNewBuffer.data(), theBuffer.data(), nCurrentSize); - theBuffer = theNewBuffer; - nCurrentSize <<= 1; + if ( outLen ) + { + *outLen = dstLen; + if ( wbuf[dstLen - 1] == L'\0' ) + (*outLen)--; + } + + return wbuf; } + } + + if ( outLen ) + *outLen = 0; - //Convert the current (sub)string - if(WC2MB(&theBuffer.data()[szPos - szStart], szPos, nLen + 1) == (size_t)-1 ) + return wxWCharBuffer(); +} + +const wxCharBuffer +wxMBConv::cWC2MB(const wchar_t *inBuff, size_t inLen, size_t *outLen) const +{ + size_t dstLen = FromWChar(NULL, 0, inBuff, inLen); + if ( dstLen != wxCONV_FAILED ) + { + // special case of empty input: can't allocate 0 size buffer below as + // wxCharBuffer insists on NUL-terminating it + wxCharBuffer buf(dstLen ? 
dstLen - 1 : 1); + if ( FromWChar(buf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED ) { - *pOutSize = 0; - theBuffer.data()[0u] = wxT('\0'); - return theBuffer; - } + if ( outLen ) + { + *outLen = dstLen; + + const size_t nulLen = GetMBNulLen(); + if ( dstLen >= nulLen && + !NotAllNULs(buf.data() + dstLen - nulLen, nulLen) ) + { + // in this case the output is NUL-terminated and we're not + // supposed to count NUL + *outLen -= nulLen; + } + } - //Increment to next (sub)string - //Note that we have to use wxWcslen instead of nLen here - //because XX2XX gives us the size of the output buffer, - //which is not necessarily the length of the string - szPos += wxWcslen(szPos) + 1; + return buf; + } } - //success - return actual length and the buffer - *pOutSize = nActualLength; - return theBuffer; + if ( outLen ) + *outLen = 0; + + return wxCharBuffer(); } // ---------------------------------------------------------------------------- @@ -322,12 +469,12 @@ size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const return wxWC2MB(buf, psz, n); } -#ifdef __UNIX__ - // ---------------------------------------------------------------------------- // wxConvBrokenFileNames // ---------------------------------------------------------------------------- +#ifdef __UNIX__ + wxConvBrokenFileNames::wxConvBrokenFileNames(const wxChar *charset) { if ( !charset || wxStricmp(charset, _T("UTF-8")) == 0 @@ -337,23 +484,7 @@ wxConvBrokenFileNames::wxConvBrokenFileNames(const wxChar *charset) m_conv = new wxCSConv(charset); } -size_t -wxConvBrokenFileNames::MB2WC(wchar_t *outputBuf, - const char *psz, - size_t outputSize) const -{ - return m_conv->MB2WC( outputBuf, psz, outputSize ); -} - -size_t -wxConvBrokenFileNames::WC2MB(char *outputBuf, - const wchar_t *psz, - size_t outputSize) const -{ - return m_conv->WC2MB( outputBuf, psz, outputSize ); -} - -#endif +#endif // __UNIX__ // ---------------------------------------------------------------------------- // UTF-7 @@ 
-404,7 +535,7 @@ size_t wxMBConvUTF7::MB2WC(wchar_t *buf, const char *psz, size_t n) const { size_t len = 0; - while (*psz && ((!buf) || (len < n))) + while ( *psz && (!buf || (len < n)) ) { unsigned char cc = *psz++; if (cc != '+') @@ -422,20 +553,19 @@ size_t wxMBConvUTF7::MB2WC(wchar_t *buf, const char *psz, size_t n) const len++; psz++; } - else + else // start of BASE64 encoded string { - // BASE64 encoded string - bool lsb; - unsigned char c; + bool lsb, ok; unsigned int d, l; - for (lsb = false, d = 0, l = 0; - (cc = utf7unb64[(unsigned char)*psz]) != 0xff; psz++) + for ( ok = lsb = false, d = 0, l = 0; + (cc = utf7unb64[(unsigned char)*psz]) != 0xff; + psz++ ) { d <<= 6; d += cc; for (l += 6; l >= 8; lsb = !lsb) { - c = (unsigned char)((d >> (l -= 8)) % 256); + unsigned char c = (unsigned char)((d >> (l -= 8)) % 256); if (lsb) { if (buf) @@ -443,16 +573,29 @@ size_t wxMBConvUTF7::MB2WC(wchar_t *buf, const char *psz, size_t n) const len ++; } else + { if (buf) *buf = (wchar_t)(c << 8); + } + + ok = true; } } + + if ( !ok ) + { + // in valid UTF7 we should have valid characters after '+' + return wxCONV_FAILED; + } + if (*psz == '-') psz++; } } - if (buf && (len < n)) - *buf = 0; + + if ( buf && (len < n) ) + *buf = '\0'; + return len; } @@ -493,8 +636,6 @@ static const unsigned char utf7encode[128] = size_t wxMBConvUTF7::WC2MB(char *buf, const wchar_t *psz, size_t n) const { - - size_t len = 0; while (*psz && ((!buf) || (len < n))) @@ -505,25 +646,27 @@ size_t wxMBConvUTF7::WC2MB(char *buf, const wchar_t *psz, size_t n) const // plain ASCII char if (buf) *buf++ = (char)cc; + len++; } #ifndef WC_UTF16 else if (((wxUint32)cc) > 0xffff) { // no surrogate pair generation (yet?) 
- return (size_t)-1; + return wxCONV_FAILED; } #endif else { if (buf) *buf++ = '+'; + len++; if (cc != '+') { // BASE64 encode string unsigned int lsb, d, l; - for (d = 0, l = 0;; psz++) + for (d = 0, l = 0; /*nothing*/; psz++) { for (lsb = 0; lsb < 2; lsb ++) { @@ -538,24 +681,30 @@ size_t wxMBConvUTF7::WC2MB(char *buf, const wchar_t *psz, size_t n) const len++; } } + cc = *psz; if (!(cc) || (cc < 0x80 && utf7encode[cc] < 1)) break; } + if (l != 0) { if (buf) *buf++ = utf7enb64[((d % 16) << (6 - l)) % 64]; + len++; } } + if (buf) *buf++ = '-'; len++; } } + if (buf && (len < n)) *buf = 0; + return len; } @@ -583,6 +732,7 @@ size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const unsigned cnt; for (cnt = 0; fc & 0x80; cnt++) fc <<= 1; + if (!cnt) { // plain ASCII char @@ -620,9 +770,11 @@ size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const invalid = true; break; } + psz++; res = (res << 6) | (cc & 0x3f); } + if (invalid || res <= utf8_max[ocnt]) { // illegal UTF-8 encoding @@ -641,7 +793,7 @@ size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const #ifdef WC_UTF16 // cast is ok because wchar_t == wxUuint16 if WC_UTF16 size_t pa = encode_utf16(res, (wxUint16 *)buf); - if (pa == (size_t)-1) + if (pa == wxCONV_FAILED) { invalid = true; } @@ -653,11 +805,12 @@ size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const } #else // !WC_UTF16 if (buf) - *buf++ = res; + *buf++ = (wchar_t)res; len++; #endif // WC_UTF16/!WC_UTF16 } } + if (invalid) { if (m_options & MAP_INVALID_UTF8_TO_PUA) @@ -667,14 +820,14 @@ size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const #ifdef WC_UTF16 // cast is ok because wchar_t == wxUuint16 if WC_UTF16 size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf); - wxASSERT(pa != (size_t)-1); + wxASSERT(pa != wxCONV_FAILED); if (buf) buf += pa; opsz++; len += pa; #else if (buf) - *buf++ = wxUnicodePUA + (unsigned char)*opsz; + *buf++ = 
(wchar_t)(wxUnicodePUA + (unsigned char)*opsz); opsz++; len++; #endif @@ -686,25 +839,28 @@ size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const { if ( buf && len + 3 < n ) { - unsigned char n = *opsz; + unsigned char on = *opsz; *buf++ = L'\\'; - *buf++ = (wchar_t)( L'0' + n / 0100 ); - *buf++ = (wchar_t)( L'0' + (n % 0100) / 010 ); - *buf++ = (wchar_t)( L'0' + n % 010 ); + *buf++ = (wchar_t)( L'0' + on / 0100 ); + *buf++ = (wchar_t)( L'0' + (on % 0100) / 010 ); + *buf++ = (wchar_t)( L'0' + on % 010 ); } + opsz++; len += 4; } } else // MAP_INVALID_UTF8_NOT { - return (size_t)-1; + return wxCONV_FAILED; } } } } + if (buf && (len < n)) *buf = 0; + return len; } @@ -720,12 +876,13 @@ size_t wxMBConvUTF8::WC2MB(char *buf, const wchar_t *psz, size_t n) const while (*psz && ((!buf) || (len < n))) { wxUint32 cc; + #ifdef WC_UTF16 // cast is ok for WC_UTF16 size_t pa = decode_utf16((const wxUint16 *)psz, cc); - psz += (pa == (size_t)-1) ? 1 : pa; + psz += (pa == wxCONV_FAILED) ? 
1 : pa; #else - cc=(*psz++) & 0x7fffffff; + cc = (*psz++) & 0x7fffffff; #endif if ( (m_options & MAP_INVALID_UTF8_TO_PUA) @@ -749,8 +906,8 @@ size_t wxMBConvUTF8::WC2MB(char *buf, const wchar_t *psz, size_t n) const { if (buf) { - *buf++ = (char) ((psz[0] - L'0')*0100 + - (psz[1] - L'0')*010 + + *buf++ = (char) ((psz[0] - L'0') * 0100 + + (psz[1] - L'0') * 010 + (psz[2] - L'0')); } @@ -760,7 +917,10 @@ size_t wxMBConvUTF8::WC2MB(char *buf, const wchar_t *psz, size_t n) const else { unsigned cnt; - for (cnt = 0; cc > utf8_max[cnt]; cnt++) {} + for (cnt = 0; cc > utf8_max[cnt]; cnt++) + { + } + if (!cnt) { // plain ASCII char @@ -768,7 +928,6 @@ size_t wxMBConvUTF8::WC2MB(char *buf, const wchar_t *psz, size_t n) const *buf++ = (char) cc; len++; } - else { len += cnt + 1; @@ -782,15 +941,15 @@ size_t wxMBConvUTF8::WC2MB(char *buf, const wchar_t *psz, size_t n) const } } - if (buf && (len dstLen ) + return wxCONV_FAILED; + + *dst++ = ch; } - if (buf && len 1) + if ( outLen > dstLen ) + return wxCONV_FAILED; + + *outBuff++ = cc[0]; + if ( numChars == 2 ) { - *(wxUint16*)buf = cc[1]; - buf += sizeof(wxUint16); + // second character of a surrogate + *outBuff++ = cc[1]; } } - - len += pa*sizeof(wxUint16); - psz++; } - if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0; - return len; + return outLen; } +// ---------------------------------------------------------------------------- +// endian-reversing conversions +// ---------------------------------------------------------------------------- -// swap 16bit MB to 32bit String -size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const +size_t +wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen, + const char *src, size_t srcLen) const { - size_t len=0; + srcLen = GetLength(src, srcLen); + if ( srcLen == wxNO_LEN ) + return wxCONV_FAILED; - while (*(wxUint16*)psz && (!buf || len < n)) + const size_t inLen = srcLen / BYTES_PER_CHAR; + if ( !dst ) { - wxUint32 cc; - char tmp[4]; - tmp[0]=psz[1]; 
tmp[1]=psz[0]; - tmp[2]=psz[3]; tmp[3]=psz[2]; + // optimization: return maximal space which could be needed for this + // string even if the real size could be smaller if the buffer contains + // any surrogates + return inLen; + } + + size_t outLen = 0; + const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src); + for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; ) + { + wxUint32 ch; + wxUint16 tmp[2]; - size_t pa=decode_utf16((wxUint16*)tmp, cc); - if (pa == (size_t)-1) - return pa; + tmp[0] = wxUINT16_SWAP_ALWAYS(*inBuff); + inBuff++; + tmp[1] = wxUINT16_SWAP_ALWAYS(*inBuff); - if (buf) - *buf++ = cc; + const size_t numChars = decode_utf16(tmp, ch); + if ( numChars == wxCONV_FAILED ) + return wxCONV_FAILED; + + if ( numChars == 2 ) + inBuff++; + + if ( ++outLen > dstLen ) + return wxCONV_FAILED; - len++; - psz += pa * sizeof(wxUint16); + *dst++ = ch; } - if (buf && len 1) + if ( outLen > dstLen ) + return wxCONV_FAILED; + + *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[0]); + if ( numChars == 2 ) { - *buf++ = ((char*)cc)[3]; - *buf++ = ((char*)cc)[2]; + // second character of a surrogate + *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[1]); } } - - len += pa*sizeof(wxUint16); - psz++; } - if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0; - return len; + return outLen; } -#endif // WC_UTF16 +#endif // WC_UTF16/!WC_UTF16 -// ---------------------------------------------------------------------------- +// ============================================================================ // UTF-32 -// ---------------------------------------------------------------------------- +// ============================================================================ #ifdef WORDS_BIGENDIAN -#define wxMBConvUTF32straight wxMBConvUTF32BE -#define wxMBConvUTF32swap wxMBConvUTF32LE + #define wxMBConvUTF32straight wxMBConvUTF32BE + #define wxMBConvUTF32swap wxMBConvUTF32LE #else -#define wxMBConvUTF32swap wxMBConvUTF32BE -#define wxMBConvUTF32straight wxMBConvUTF32LE + 
#define wxMBConvUTF32swap wxMBConvUTF32BE + #define wxMBConvUTF32straight wxMBConvUTF32LE #endif WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE; WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE; +/* static */ +size_t wxMBConvUTF32Base::GetLength(const char *src, size_t srcLen) +{ + if ( srcLen == wxNO_LEN ) + { + // count the number of bytes in input, including the trailing NULs + const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src); + for ( srcLen = 1; *inBuff++; srcLen++ ) + ; + + srcLen *= BYTES_PER_CHAR; + } + else // we already have the length + { + // we can only convert an entire number of UTF-32 characters + if ( srcLen % BYTES_PER_CHAR ) + return wxCONV_FAILED; + } + + return srcLen; +} +// case when in-memory representation is UTF-16 #ifdef WC_UTF16 -// copy 32bit MB to 16bit String -size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const +// ---------------------------------------------------------------------------- +// conversions without endianness change +// ---------------------------------------------------------------------------- + +size_t +wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen, + const char *src, size_t srcLen) const { - size_t len=0; + srcLen = GetLength(src, srcLen); + if ( srcLen == wxNO_LEN ) + return wxCONV_FAILED; - while (*(wxUint32*)psz && (!buf || len < n)) + const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src); + const size_t inLen = srcLen / BYTES_PER_CHAR; + size_t outLen = 0; + for ( size_t n = 0; n < inLen; n++ ) { wxUint16 cc[2]; + const size_t numChars = encode_utf16(*inBuff++, cc); + if ( numChars == wxCONV_FAILED ) + return wxCONV_FAILED; - size_t pa=encode_utf16(*(wxUint32*)psz, cc); - if (pa == (size_t)-1) - return pa; - - if (buf) + outLen += numChars; + if ( dst ) { - *buf++ = cc[0]; - if (pa > 1) - *buf++ = cc[1]; + if ( outLen > dstLen ) + return wxCONV_FAILED; + + *dst++ = cc[0]; + if ( numChars == 2 ) + { + // second character of a 
surrogate + *dst++ = cc[1]; + } } - len += pa; - psz += sizeof(wxUint32); } - if (buf && len dstLen ) + return wxCONV_FAILED; - return len; -} + *outBuff++ = ch; + } + return outLen; +} +// ---------------------------------------------------------------------------- +// endian-reversing conversions +// ---------------------------------------------------------------------------- -// swap 32bit MB to 16bit String -size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const +size_t +wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen, + const char *src, size_t srcLen) const { - size_t len=0; + srcLen = GetLength(src, srcLen); + if ( srcLen == wxNO_LEN ) + return wxCONV_FAILED; - while (*(wxUint32*)psz && (!buf || len < n)) + const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src); + const size_t inLen = srcLen / BYTES_PER_CHAR; + size_t outLen = 0; + for ( size_t n = 0; n < inLen; n++, inBuff++ ) { - char tmp[4]; - tmp[0] = psz[3]; tmp[1] = psz[2]; - tmp[2] = psz[1]; tmp[3] = psz[0]; - - wxUint16 cc[2]; + const size_t numChars = encode_utf16(wxUINT32_SWAP_ALWAYS(*inBuff), cc); + if ( numChars == wxCONV_FAILED ) + return wxCONV_FAILED; - size_t pa=encode_utf16(*(wxUint32*)tmp, cc); - if (pa == (size_t)-1) - return pa; - - if (buf) + outLen += numChars; + if ( dst ) { - *buf++ = cc[0]; - if (pa > 1) - *buf++ = cc[1]; + if ( outLen > dstLen ) + return wxCONV_FAILED; + + *dst++ = cc[0]; + if ( numChars == 2 ) + { + // second character of a surrogate + *dst++ = cc[1]; + } } - len += pa; - psz += sizeof(wxUint32); } - if (buf && len dstLen ) + return wxCONV_FAILED; - return len; + *outBuff++ = wxUINT32_SWAP_ALWAYS(ch); + } + + return outLen; } -#else // WC_UTF16 +#else // !WC_UTF16: wchar_t is UTF-32 +// ---------------------------------------------------------------------------- +// conversions without endianness change +// ---------------------------------------------------------------------------- -// copy 32bit MB to 32bit String 
-size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const +size_t +wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen, + const char *src, size_t srcLen) const { - size_t len=0; + // use memcpy() as it should be much faster than hand-written loop + srcLen = GetLength(src, srcLen); + if ( srcLen == wxNO_LEN ) + return wxCONV_FAILED; - while (*(wxUint32*)psz && (!buf || len < n)) + const size_t inLen = srcLen/BYTES_PER_CHAR; + if ( dst ) { - if (buf) - *buf++ = *(wxUint32*)psz; - len++; - psz += sizeof(wxUint32); - } + if ( dstLen < inLen ) + return wxCONV_FAILED; - if (buf && lenm_minMBCharWidth = m_minMBCharWidth; + return p; + } + bool IsOk() const { return (m2w != ICONV_T_INVALID) && (w2m != ICONV_T_INVALID); } protected: - // the iconv handlers used to translate from multibyte to wide char and in - // the other direction + // the iconv handlers used to translate from multibyte + // to wide char and in the other direction iconv_t m2w, w2m; + #if wxUSE_THREADS // guards access to m2w and w2m objects wxMutex m_iconvMutex; @@ -1323,6 +1625,14 @@ private: // true if the wide char encoding we use (i.e. 
ms_wcCharsetName) has // different endian-ness than the native one static bool ms_wcNeedsSwap; + + + // name of the encoding handled by this conversion + wxString m_name; + + // cached result of GetMBNulLen(); set to 0 meaning "unknown" + // initially + size_t m_minMBCharWidth; }; // make the constructor available for unit testing @@ -1334,6 +1644,7 @@ WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_iconv( const wxChar* name ) delete result; return 0; } + return result; } @@ -1341,7 +1652,10 @@ wxString wxMBConv_iconv::ms_wcCharsetName; bool wxMBConv_iconv::ms_wcNeedsSwap = false; wxMBConv_iconv::wxMBConv_iconv(const wxChar *name) + : m_name(name) { + m_minMBCharWidth = 0; + // iconv operates with chars, not wxChars, but luckily it uses only ASCII // names for the charsets const wxCharBuffer cname(wxString(name).ToAscii()); @@ -1349,6 +1663,8 @@ wxMBConv_iconv::wxMBConv_iconv(const wxChar *name) // check for charset that represents wchar_t: if ( ms_wcCharsetName.empty() ) { + wxLogTrace(TRACE_STRCONV, _T("Looking for wide char codeset:")); + #if wxUSE_FONTMAP const wxChar **names = wxFontMapperBase::GetAllEncodingNames(WC_ENC); #else // !wxUSE_FONTMAP @@ -1363,23 +1679,29 @@ wxMBConv_iconv::wxMBConv_iconv(const wxChar *name) }; #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP - for ( ; *names; ++names ) + for ( ; *names && ms_wcCharsetName.empty(); ++names ) { - const wxString name(*names); + const wxString nameCS(*names); // first try charset with explicit bytesex info (e.g. "UCS-4LE"): - wxString nameXE(name); - #ifdef WORDS_BIGENDIAN + wxString nameXE(nameCS); + +#ifdef WORDS_BIGENDIAN nameXE += _T("BE"); - #else // little endian +#else // little endian nameXE += _T("LE"); - #endif +#endif + + wxLogTrace(TRACE_STRCONV, _T(" trying charset \"%s\""), + nameXE.c_str()); m2w = iconv_open(nameXE.ToAscii(), cname); if ( m2w == ICONV_T_INVALID ) { // try charset w/o bytesex info (e.g. 
"UCS4") - m2w = iconv_open(name.ToAscii(), cname); + wxLogTrace(TRACE_STRCONV, _T(" trying charset \"%s\""), + nameCS.c_str()); + m2w = iconv_open(nameCS.ToAscii(), cname); // and check for bytesex ourselves: if ( m2w != ICONV_T_INVALID ) @@ -1397,18 +1719,19 @@ wxMBConv_iconv::wxMBConv_iconv(const wxChar *name) wbufPtr = wbuf; bufPtr = buf; - res = iconv(m2w, ICONV_CHAR_CAST(&bufPtr), &insz, - (char**)&wbufPtr, &outsz); + res = iconv( + m2w, ICONV_CHAR_CAST(&bufPtr), &insz, + (char**)&wbufPtr, &outsz); if (ICONV_FAILED(res, insz)) { wxLogLastError(wxT("iconv")); wxLogError(_("Conversion to charset '%s' doesn't work."), - name.c_str()); + nameCS.c_str()); } else // ok, can convert to this encoding, remember it { - ms_wcCharsetName = name; + ms_wcCharsetName = nameCS; ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0]; } } @@ -1457,6 +1780,31 @@ wxMBConv_iconv::~wxMBConv_iconv() size_t wxMBConv_iconv::MB2WC(wchar_t *buf, const char *psz, size_t n) const { + // find the string length: notice that must be done differently for + // NUL-terminated strings and UTF-16/32 which are terminated with 2/4 NULs + size_t inbuf; + const size_t nulLen = GetMBNulLen(); + switch ( nulLen ) + { + default: + return wxCONV_FAILED; + + case 1: + inbuf = strlen(psz); // arguably more optimized than our version + break; + + case 2: + case 4: + // for UTF-16/32 not only we need to have 2/4 consecutive NULs but + // they also have to start at character boundary and not span two + // adjacent characters + const char *p; + for ( p = psz; NotAllNULs(p, nulLen); p += nulLen ) + ; + inbuf = p - psz; + break; + } + #if wxUSE_THREADS // NB: iconv() is MT-safe, but each thread must use it's own iconv_t handle. // Unfortunately there is a couple of global wxCSConv objects such as @@ -1465,9 +1813,8 @@ size_t wxMBConv_iconv::MB2WC(wchar_t *buf, const char *psz, size_t n) const // only a few wx classes would be safe to use from non-main threads // as MB<->WC conversion would fail "randomly". 
wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex); -#endif +#endif // wxUSE_THREADS - size_t inbuf = strlen(psz); size_t outbuf = n * SIZEOF_WCHAR_T; size_t res, cres; // VS: Use these instead of psz, buf because iconv() modifies its arguments: @@ -1485,13 +1832,11 @@ size_t wxMBConv_iconv::MB2WC(wchar_t *buf, const char *psz, size_t n) const if (ms_wcNeedsSwap) { // convert to native endianness - for ( unsigned n = 0; n < res; n++ ) - buf[n] = WC_BSWAP(buf[n]); + for ( unsigned i = 0; i < res; i++ ) + buf[n] = WC_BSWAP(buf[i]); } - // NB: iconv was given only strlen(psz) characters on input, and so - // it couldn't convert the trailing zero. Let's do it ourselves - // if there's some room left for it in the output buffer. + // NUL-terminate the string if there is any space left if (res < n) buf[res] = 0; } @@ -1501,23 +1846,26 @@ size_t wxMBConv_iconv::MB2WC(wchar_t *buf, const char *psz, size_t n) const // to calculate destination buffer requirement wchar_t tbuf[8]; res = 0; - do { + + do + { bufPtr = tbuf; - outbuf = 8*SIZEOF_WCHAR_T; + outbuf = 8 * SIZEOF_WCHAR_T; cres = iconv(m2w, ICONV_CHAR_CAST(&pszPtr), &inbuf, (char**)&bufPtr, &outbuf ); - res += 8-(outbuf/SIZEOF_WCHAR_T); - } while ((cres==(size_t)-1) && (errno==E2BIG)); + res += 8 - (outbuf / SIZEOF_WCHAR_T); + } + while ((cres == (size_t)-1) && (errno == E2BIG)); } if (ICONV_FAILED(cres, inbuf)) { //VS: it is ok if iconv fails, hence trace only wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode())); - return (size_t)-1; + return wxCONV_FAILED; } return res; @@ -1530,7 +1878,8 @@ size_t wxMBConv_iconv::WC2MB(char *buf, const wchar_t *psz, size_t n) const wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex); #endif - size_t inbuf = wxWcslen(psz) * SIZEOF_WCHAR_T; + size_t inlen = wxWcslen(psz); + size_t inbuf = inlen * SIZEOF_WCHAR_T; size_t outbuf = n; size_t res, cres; @@ -1542,9 +1891,10 @@ size_t wxMBConv_iconv::WC2MB(char *buf, const 
wchar_t *psz, size_t n) const // (doing WC_BSWAP twice on the original buffer won't help, as it // could be in read-only memory, or be accessed in some other thread) tmpbuf = (wchar_t *)malloc(inbuf + SIZEOF_WCHAR_T); - for ( size_t n = 0; n < inbuf; n++ ) - tmpbuf[n] = WC_BSWAP(psz[n]); - tmpbuf[inbuf] = L'\0'; + for ( size_t i = 0; i < inlen; i++ ) + tmpbuf[n] = WC_BSWAP(psz[i]); + + tmpbuf[inlen] = L'\0'; psz = tmpbuf; } @@ -1553,7 +1903,7 @@ size_t wxMBConv_iconv::WC2MB(char *buf, const wchar_t *psz, size_t n) const // have destination buffer, convert there cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf ); - res = n-outbuf; + res = n - outbuf; // NB: iconv was given only wcslen(psz) characters on input, and so // it couldn't convert the trailing zero. Let's do it ourselves @@ -1563,17 +1913,20 @@ size_t wxMBConv_iconv::WC2MB(char *buf, const wchar_t *psz, size_t n) const } else { - // no destination buffer... convert using temp buffer + // no destination buffer: convert using temp buffer // to calculate destination buffer requirement char tbuf[16]; res = 0; - do { - buf = tbuf; outbuf = 16; + do + { + buf = tbuf; + outbuf = 16; cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf ); res += 16 - outbuf; - } while ((cres==(size_t)-1) && (errno==E2BIG)); + } + while ((cres == (size_t)-1) && (errno == E2BIG)); } if (ms_wcNeedsSwap) @@ -1583,14 +1936,43 @@ size_t wxMBConv_iconv::WC2MB(char *buf, const wchar_t *psz, size_t n) const if (ICONV_FAILED(cres, inbuf)) { - //VS: it is ok if iconv fails, hence trace only wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode())); - return (size_t)-1; + return wxCONV_FAILED; } return res; } +size_t wxMBConv_iconv::GetMBNulLen() const +{ + if ( m_minMBCharWidth == 0 ) + { + wxMBConv_iconv * const self = wxConstCast(this, wxMBConv_iconv); + +#if wxUSE_THREADS + // NB: explained in MB2WC + wxMutexLocker lock(self->m_iconvMutex); +#endif + + wchar_t *wnul = L""; + char buf[8]; 
// should be enough for NUL in any encoding + size_t inLen = sizeof(wchar_t), + outLen = WXSIZEOF(buf); + char *inBuff = (char *)wnul; + char *outBuff = buf; + if ( iconv(w2m, ICONV_CHAR_CAST(&inBuff), &inLen, &outBuff, &outLen) == (size_t)-1 ) + { + self->m_minMBCharWidth = (size_t)-1; + } + else // ok + { + self->m_minMBCharWidth = outBuff - buf; + } + } + + return m_minMBCharWidth; +} + #endif // HAVE_ICONV @@ -1612,21 +1994,31 @@ public: wxMBConv_win32() { m_CodePage = CP_ACP; + m_minMBCharWidth = 0; + } + + wxMBConv_win32(const wxMBConv_win32& conv) + : wxMBConv() + { + m_CodePage = conv.m_CodePage; + m_minMBCharWidth = conv.m_minMBCharWidth; } #if wxUSE_FONTMAP wxMBConv_win32(const wxChar* name) { m_CodePage = wxCharsetToCodepage(name); + m_minMBCharWidth = 0; } wxMBConv_win32(wxFontEncoding encoding) { m_CodePage = wxEncodingToCodepage(encoding); + m_minMBCharWidth = 0; } -#endif +#endif // wxUSE_FONTMAP - size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const + virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const { // note that we have to use MB_ERR_INVALID_CHARS flag as it without it // the behaviour is not compatible with the Unix version (using iconv) @@ -1634,14 +2026,27 @@ public: // wouldn't work if reading an incomplete MB char didn't result in an // error // - // note however that using MB_ERR_INVALID_CHARS with CP_UTF7 results in - // an error (tested under Windows Server 2003) and apparently it is - // done on purpose, i.e. the function accepts any input in this case - // and although I'd prefer to return error on ill-formed output, our - // own wxMBConvUTF7 doesn't detect errors (e.g. lone "+" which is - // explicitly ill-formed according to RFC 2152) neither so we don't - // even have any fallback here... - int flags = m_CodePage == CP_UTF7 ? 
0 : MB_ERR_INVALID_CHARS; + // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or + // Win XP or newer and it is not supported for UTF-[78] so we always + // use our own conversions in this case. See + // http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx + // http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp + if ( m_CodePage == CP_UTF8 ) + { + return wxConvUTF8.MB2WC(buf, psz, n); + } + + if ( m_CodePage == CP_UTF7 ) + { + return wxConvUTF7.MB2WC(buf, psz, n); + } + + int flags = 0; + if ( (m_CodePage < 50000 && m_CodePage != CP_SYMBOL) && + IsAtLeastWin2kSP4() ) + { + flags = MB_ERR_INVALID_CHARS; + } const size_t len = ::MultiByteToWideChar ( @@ -1652,14 +2057,44 @@ public: buf, // output string buf ? n : 0 // size of output buffer ); + if ( !len ) + { + // function totally failed + return wxCONV_FAILED; + } + + // if we were really converting and didn't use MB_ERR_INVALID_CHARS, + // check if we succeeded, by doing a double trip: + if ( !flags && buf ) + { + const size_t mbLen = strlen(psz); + wxCharBuffer mbBuf(mbLen); + if ( ::WideCharToMultiByte + ( + m_CodePage, + 0, + buf, + -1, + mbBuf.data(), + mbLen + 1, // size in bytes, not length + NULL, + NULL + ) == 0 || + strcmp(mbBuf, psz) != 0 ) + { + // we didn't obtain the same thing we started from, hence + // the conversion was lossy and we consider that it failed + return wxCONV_FAILED; + } + } // note that it returns count of written chars for buf != NULL and size // of the needed buffer for buf == NULL so in either case the length of // the string (which never includes the terminating NUL) is one less - return len ? 
len - 1 : (size_t)-1; + return len - 1; } - size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const + virtual size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const { /* we have a problem here: by default, WideCharToMultiByte() may @@ -1710,7 +2145,7 @@ public: if ( !len ) { // function totally failed - return (size_t)-1; + return wxCONV_FAILED; } // if we were really converting, check if we succeeded @@ -1721,17 +2156,17 @@ public: // check if the conversion failed, i.e. if any replacements // were done if ( usedDef ) - return (size_t)-1; + return wxCONV_FAILED; } else // we must resort to double tripping... { wxWCharBuffer wcBuf(n); - if ( MB2WC(wcBuf.data(), buf, n) == (size_t)-1 || + if ( MB2WC(wcBuf.data(), buf, n) == wxCONV_FAILED || wcscmp(wcBuf, pwz) != 0 ) { // we didn't obtain the same thing we started from, hence // the conversion was lossy and we consider that it failed - return (size_t)-1; + return wxCONV_FAILED; } } } @@ -1740,6 +2175,47 @@ public: return len - 1; } + virtual size_t GetMBNulLen() const + { + if ( m_minMBCharWidth == 0 ) + { + int len = ::WideCharToMultiByte + ( + m_CodePage, // code page + 0, // no flags + L"", // input string + 1, // translate just the NUL + NULL, // output buffer + 0, // and its size + NULL, // no replacement char + NULL // [out] don't care if it was used + ); + + wxMBConv_win32 * const self = wxConstCast(this, wxMBConv_win32); + switch ( len ) + { + default: + wxLogDebug(_T("Unexpected NUL length %d"), len); + self->m_minMBCharWidth = (size_t)-1; + break; + + case 0: + self->m_minMBCharWidth = (size_t)-1; + break; + + case 1: + case 2: + case 4: + self->m_minMBCharWidth = len; + break; + } + } + + return m_minMBCharWidth; + } + + virtual wxMBConv *Clone() const { return new wxMBConv_win32(*this); } + bool IsOk() const { return m_CodePage != -1; } private: @@ -1761,8 +2237,9 @@ private: break; default: - // unknown, be conseravtive by default + // unknown: be conservative by default s_isWin98Or2k = 0; + 
break; } wxASSERT_MSG( s_isWin98Or2k != -1, _T("should be set above") ); @@ -1771,7 +2248,40 @@ private: return s_isWin98Or2k == 1; } + static bool IsAtLeastWin2kSP4() + { +#ifdef __WXWINCE__ + return false; +#else + static int s_isAtLeastWin2kSP4 = -1; + + if ( s_isAtLeastWin2kSP4 == -1 ) + { + OSVERSIONINFOEX ver; + + memset(&ver, 0, sizeof(ver)); + ver.dwOSVersionInfoSize = sizeof(ver); + GetVersionEx((OSVERSIONINFO*)&ver); + + s_isAtLeastWin2kSP4 = + ((ver.dwMajorVersion > 5) || // Vista+ + (ver.dwMajorVersion == 5 && ver.dwMinorVersion > 0) || // XP/2003 + (ver.dwMajorVersion == 5 && ver.dwMinorVersion == 0 && + ver.wServicePackMajor >= 4)) // 2000 SP4+ + ? 1 : 0; + } + + return s_isAtLeastWin2kSP4 == 1; +#endif + } + + + // the code page we're working with long m_CodePage; + + // cached result of GetMBNulLen(), set to 0 initially meaning + // "unknown" + size_t m_minMBCharWidth; }; #endif // wxHAVE_WIN32_MB2WC @@ -1782,9 +2292,9 @@ private: #if defined(__WXCOCOA__) -// RN: There is no UTF-32 support in either Core Foundation or -// Cocoa. Strangely enough, internally Core Foundation uses -// UTF 32 internally quite a bit - its just not public (yet). +// RN: There is no UTF-32 support in either Core Foundation or Cocoa. +// Strangely enough, internally Core Foundation uses +// UTF-32 internally quite a bit - its just not public (yet). 
#include #include @@ -1792,12 +2302,13 @@ private: CFStringEncoding wxCFStringEncFromFontEnc(wxFontEncoding encoding) { CFStringEncoding enc = kCFStringEncodingInvalidId ; - if ( encoding == wxFONTENCODING_DEFAULT ) - { - enc = CFStringGetSystemEncoding(); - } - else switch( encoding) + + switch (encoding) { + case wxFONTENCODING_DEFAULT : + enc = CFStringGetSystemEncoding(); + break ; + case wxFONTENCODING_ISO8859_1 : enc = kCFStringEncodingISOLatin1 ; break ; @@ -1853,7 +2364,7 @@ CFStringEncoding wxCFStringEncFromFontEnc(wxFontEncoding encoding) // break ; case wxFONTENCODING_CP437 : - enc =kCFStringEncodingDOSLatinUS ; + enc = kCFStringEncodingDOSLatinUS ; break ; case wxFONTENCODING_CP850 : enc = kCFStringEncodingDOSLatin1; @@ -1865,7 +2376,7 @@ CFStringEncoding wxCFStringEncFromFontEnc(wxFontEncoding encoding) enc = kCFStringEncodingDOSCyrillic; break ; case wxFONTENCODING_CP866 : - enc =kCFStringEncodingDOSRussian ; + enc = kCFStringEncodingDOSRussian ; break ; case wxFONTENCODING_CP874 : enc = kCFStringEncodingDOSThai; @@ -1874,7 +2385,7 @@ CFStringEncoding wxCFStringEncFromFontEnc(wxFontEncoding encoding) enc = kCFStringEncodingDOSJapanese; break ; case wxFONTENCODING_CP936 : - enc =kCFStringEncodingDOSChineseSimplif ; + enc = kCFStringEncodingDOSChineseSimplif ; break ; case wxFONTENCODING_CP949 : enc = kCFStringEncodingDOSKorean; @@ -1886,10 +2397,10 @@ CFStringEncoding wxCFStringEncFromFontEnc(wxFontEncoding encoding) enc = kCFStringEncodingWindowsLatin2; break ; case wxFONTENCODING_CP1251 : - enc =kCFStringEncodingWindowsCyrillic ; + enc = kCFStringEncodingWindowsCyrillic ; break ; case wxFONTENCODING_CP1252 : - enc =kCFStringEncodingWindowsLatin1 ; + enc = kCFStringEncodingWindowsLatin1 ; break ; case wxFONTENCODING_CP1253 : enc = kCFStringEncodingWindowsGreek; @@ -1898,10 +2409,10 @@ CFStringEncoding wxCFStringEncFromFontEnc(wxFontEncoding encoding) enc = kCFStringEncodingWindowsLatin5; break ; case wxFONTENCODING_CP1255 : - enc 
=kCFStringEncodingWindowsHebrew ; + enc = kCFStringEncodingWindowsHebrew ; break ; case wxFONTENCODING_CP1256 : - enc =kCFStringEncodingWindowsArabic ; + enc = kCFStringEncodingWindowsArabic ; break ; case wxFONTENCODING_CP1257 : enc = kCFStringEncodingWindowsBalticRim; @@ -2039,10 +2550,12 @@ CFStringEncoding wxCFStringEncFromFontEnc(wxFontEncoding encoding) // case wxFONTENCODING_MACKEYBOARD : // enc = kCFStringEncodingMacKeyboardGlyphs ; // break ; + default : // because gcc is picky break ; - } ; + } + return enc ; } @@ -2054,6 +2567,11 @@ public: Init(CFStringGetSystemEncoding()) ; } + wxMBConv_cocoa(const wxMBConv_cocoa& conv) + { + m_encoding = conv.m_encoding; + } + #if wxUSE_FONTMAP wxMBConv_cocoa(const wxChar* name) { @@ -2107,12 +2625,12 @@ public: CFRelease(theString); - szUniCharBuffer[nOutLength] = '\0' ; + szUniCharBuffer[nOutLength] = '\0'; #if SIZEOF_WCHAR_T == 4 - wxMBConvUTF16 converter ; - converter.MB2WC(szOut, (const char*)szUniCharBuffer , nOutSize ) ; - delete[] szUniCharBuffer; + wxMBConvUTF16 converter; + converter.MB2WC( szOut, (const char*)szUniCharBuffer, nOutSize ); + delete [] szUniCharBuffer; #endif return nOutLength; @@ -2128,9 +2646,9 @@ public: #if SIZEOF_WCHAR_T == 4 wxMBConvUTF16 converter ; - nBufSize = converter.WC2MB( NULL , szUnConv , 0 ); - szUniBuffer = new UniChar[ (nBufSize / sizeof(UniChar)) + 1] ; - converter.WC2MB( (char*) szUniBuffer , szUnConv, nBufSize + sizeof(UniChar)) ; + nBufSize = converter.WC2MB( NULL, szUnConv, 0 ); + szUniBuffer = new UniChar[ (nBufSize / sizeof(UniChar)) + 1]; + converter.WC2MB( (char*) szUniBuffer, szUnConv, nBufSize + sizeof(UniChar)); nBufSize /= sizeof(UniChar); #endif @@ -2176,6 +2694,8 @@ public: return nRealOutSize - 1; } + virtual wxMBConv *Clone() const { return new wxMBConv_cocoa(*this); } + bool IsOk() const { return m_encoding != kCFStringEncodingInvalidId && @@ -2202,10 +2722,15 @@ public: Init(CFStringGetSystemEncoding()) ; } + wxMBConv_mac(const wxMBConv_mac& conv) + { + 
Init(conv.m_char_encoding); + } + #if wxUSE_FONTMAP wxMBConv_mac(const wxChar* name) { - Init( wxMacGetSystemEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) ) ; + Init( wxMacGetSystemEncFromFontEnc( wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) ); } #endif @@ -2226,7 +2751,7 @@ public: { OSStatus status = noErr ; m_char_encoding = encoding ; - m_unicode_encoding = CreateTextEncoding(kTextEncodingUnicodeDefault,0,kUnicode16BitFormat) ; + m_unicode_encoding = CreateTextEncoding(kTextEncodingUnicodeDefault, 0, kUnicode16BitFormat) ; status = TECCreateConverter(&m_MB2WC_converter, m_char_encoding, @@ -2240,35 +2765,41 @@ public: { OSStatus status = noErr ; ByteCount byteOutLen ; - ByteCount byteInLen = strlen(psz) ; + ByteCount byteInLen = strlen(psz) + 1; wchar_t *tbuf = NULL ; UniChar* ubuf = NULL ; size_t res = 0 ; if (buf == NULL) { - //apple specs say at least 32 - n = wxMax( 32 , byteInLen ) ; - tbuf = (wchar_t*) malloc( n * SIZEOF_WCHAR_T) ; + // Apple specs say at least 32 + n = wxMax( 32, byteInLen ) ; + tbuf = (wchar_t*) malloc( n * SIZEOF_WCHAR_T ) ; } + ByteCount byteBufferLen = n * sizeof( UniChar ) ; + #if SIZEOF_WCHAR_T == 4 ubuf = (UniChar*) malloc( byteBufferLen + 2 ) ; #else ubuf = (UniChar*) (buf ? buf : tbuf) ; #endif - status = TECConvertText(m_MB2WC_converter, (ConstTextPtr) psz , byteInLen, &byteInLen, - (TextPtr) ubuf , byteBufferLen, &byteOutLen); + + status = TECConvertText( + m_MB2WC_converter, (ConstTextPtr) psz, byteInLen, &byteInLen, + (TextPtr) ubuf, byteBufferLen, &byteOutLen); + #if SIZEOF_WCHAR_T == 4 // we have to terminate here, because n might be larger for the trailing zero, and if UniChar // is not properly terminated we get random characters at the end ubuf[byteOutLen / sizeof( UniChar ) ] = 0 ; wxMBConvUTF16 converter ; - res = converter.MB2WC( (buf ? buf : tbuf) , (const char*)ubuf , n ) ; + res = converter.MB2WC( (buf ? 
buf : tbuf), (const char*)ubuf, n ) ; free( ubuf ) ; #else res = byteOutLen / sizeof( UniChar ) ; #endif + if ( buf == NULL ) free(tbuf) ; @@ -2288,27 +2819,32 @@ public: if (buf == NULL) { - //apple specs say at least 32 - n = wxMax( 32 , ((byteInLen / SIZEOF_WCHAR_T) * 8) + SIZEOF_WCHAR_T ); + // Apple specs say at least 32 + n = wxMax( 32, ((byteInLen / SIZEOF_WCHAR_T) * 8) + SIZEOF_WCHAR_T ); tbuf = (char*) malloc( n ) ; } ByteCount byteBufferLen = n ; UniChar* ubuf = NULL ; + #if SIZEOF_WCHAR_T == 4 wxMBConvUTF16 converter ; - size_t unicharlen = converter.WC2MB( NULL , psz , 0 ) ; + size_t unicharlen = converter.WC2MB( NULL, psz, 0 ) ; byteInLen = unicharlen ; ubuf = (UniChar*) malloc( byteInLen + 2 ) ; - converter.WC2MB( (char*) ubuf , psz, unicharlen + 2 ) ; + converter.WC2MB( (char*) ubuf, psz, unicharlen + 2 ) ; #else ubuf = (UniChar*) psz ; #endif - status = TECConvertText(m_WC2MB_converter, (ConstTextPtr) ubuf , byteInLen, &byteInLen, - (TextPtr) (buf ? buf : tbuf) , byteBufferLen, &byteOutLen); + + status = TECConvertText( + m_WC2MB_converter, (ConstTextPtr) ubuf, byteInLen, &byteInLen, + (TextPtr) (buf ? 
buf : tbuf), byteBufferLen, &byteOutLen); + #if SIZEOF_WCHAR_T == 4 free( ubuf ) ; #endif + if ( buf == NULL ) free(tbuf) ; @@ -2321,28 +2857,30 @@ public: //of bogus characters wxWCharBuffer wcBuf(n); size_t pszlen = wxWcslen(psz); - if ( MB2WC(wcBuf.data(), buf, n) == (size_t)-1 || + if ( MB2WC(wcBuf.data(), buf, n) == wxCONV_FAILED || wxWcslen(wcBuf) != pszlen || memcmp(wcBuf, psz, pszlen * sizeof(wchar_t)) != 0 ) { // we didn't obtain the same thing we started from, hence // the conversion was lossy and we consider that it failed - return (size_t)-1; + return wxCONV_FAILED; } } return res ; } + virtual wxMBConv *Clone() const { return new wxMBConv_mac(*this); } + bool IsOk() const - { return m_MB2WC_converter != NULL && m_WC2MB_converter != NULL ; } + { return m_MB2WC_converter != NULL && m_WC2MB_converter != NULL; } private: - TECObjectRef m_MB2WC_converter ; - TECObjectRef m_WC2MB_converter ; + TECObjectRef m_MB2WC_converter; + TECObjectRef m_WC2MB_converter; - TextEncodingBase m_char_encoding ; - TextEncodingBase m_unicode_encoding ; + TextEncodingBase m_char_encoding; + TextEncodingBase m_unicode_encoding; }; #endif // defined(__WXMAC__) && defined(TARGET_CARBON) @@ -2387,8 +2925,8 @@ public: size_t inbuf = strlen(psz); if (buf) { - if (!m2w.Convert(psz,buf)) - return (size_t)-1; + if (!m2w.Convert(psz, buf)) + return wxCONV_FAILED; } return inbuf; } @@ -2398,19 +2936,39 @@ public: const size_t inbuf = wxWcslen(psz); if (buf) { - if (!w2m.Convert(psz,buf)) - return (size_t)-1; + if (!w2m.Convert(psz, buf)) + return wxCONV_FAILED; } return inbuf; } + virtual size_t GetMBNulLen() const + { + switch ( m_enc ) + { + case wxFONTENCODING_UTF16BE: + case wxFONTENCODING_UTF16LE: + return 2; + + case wxFONTENCODING_UTF32BE: + case wxFONTENCODING_UTF32LE: + return 4; + + default: + return 1; + } + } + + virtual wxMBConv *Clone() const { return new wxMBConv_wxwin(m_enc); } + bool IsOk() const { return m_ok; } public: wxFontEncoding m_enc; wxEncodingConverter m2w, w2m; 
+private: // were we initialized successfully? bool m_ok; @@ -2426,6 +2984,7 @@ WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_wxwin( const wxChar* name ) delete result; return 0; } + return result; } @@ -2451,7 +3010,11 @@ wxCSConv::wxCSConv(const wxChar *charset) SetName(charset); } +#if wxUSE_FONTMAP + m_encoding = wxFontMapperBase::GetEncodingFromName(charset); +#else m_encoding = wxFONTENCODING_SYSTEM; +#endif } wxCSConv::wxCSConv(wxFontEncoding encoding) @@ -2511,7 +3074,6 @@ void wxCSConv::SetName(const wxChar *charset) } #if wxUSE_FONTMAP -#include "wx/hashmap.h" WX_DECLARE_HASH_MAP( wxFontEncoding, wxString, wxIntegerHash, wxIntegerEqual, wxEncodingNameCache ); @@ -2531,7 +3093,8 @@ wxMBConv *wxCSConv::DoCreate() const // check for the special case of ASCII or ISO8859-1 charset: as we have // special knowledge of it anyhow, we don't need to create a special // conversion object - if ( m_encoding == wxFONTENCODING_ISO8859_1 ) + if ( m_encoding == wxFONTENCODING_ISO8859_1 || + m_encoding == wxFONTENCODING_DEFAULT ) { // don't convert at all return NULL; @@ -2616,13 +3179,13 @@ wxMBConv *wxCSConv::DoCreate() const #endif } #endif // wxHAVE_WIN32_MB2WC + #if defined(__WXMAC__) { // leave UTF16 and UTF32 to the built-ins of wx if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE || ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) ) { - #if wxUSE_FONTMAP wxMBConv_mac *conv = m_name ? new wxMBConv_mac(m_name) : new wxMBConv_mac(m_encoding); @@ -2636,17 +3199,18 @@ wxMBConv *wxCSConv::DoCreate() const } } #endif + #if defined(__WXCOCOA__) { if ( m_name || ( m_encoding <= wxFONTENCODING_UTF16 ) ) { - #if wxUSE_FONTMAP wxMBConv_cocoa *conv = m_name ? 
new wxMBConv_cocoa(m_name) : new wxMBConv_cocoa(m_encoding); #else wxMBConv_cocoa *conv = new wxMBConv_cocoa(m_encoding); #endif + if ( conv->IsOk() ) return conv; @@ -2688,7 +3252,7 @@ wxMBConv *wxCSConv::DoCreate() const default: // nothing to do but put here to suppress gcc warnings - ; + break; } // step (3) @@ -2723,6 +3287,7 @@ wxMBConv *wxCSConv::DoCreate() const wxString::Format(_("encoding %s"), m_encoding).c_str() #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP ); + alreadyLoggingError = false; } @@ -2782,7 +3347,8 @@ size_t wxCSConv::WC2MB(char *buf, const wchar_t *psz, size_t n) const for (size_t c = 0; c <= len; c++) { if (psz[c] > 0xFF) - return (size_t)-1; + return wxCONV_FAILED; + buf[c] = (char)psz[c]; } } @@ -2791,13 +3357,25 @@ size_t wxCSConv::WC2MB(char *buf, const wchar_t *psz, size_t n) const for (size_t c = 0; c <= len; c++) { if (psz[c] > 0xFF) - return (size_t)-1; + return wxCONV_FAILED; } } return len; } +size_t wxCSConv::GetMBNulLen() const +{ + CreateConvIfNeeded(); + + if ( m_convReal ) + { + return m_convReal->GetMBNulLen(); + } + + return 1; +} + // ---------------------------------------------------------------------------- // globals // ---------------------------------------------------------------------------- @@ -2821,6 +3399,7 @@ WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvISO8859_1 = wxConvISO8859_1Obj; WXDLLIMPEXP_DATA_BASE(wxMBConvUTF7&) wxConvUTF7 = wxConvUTF7Obj; WXDLLIMPEXP_DATA_BASE(wxMBConvUTF8&) wxConvUTF8 = wxConvUTF8Obj; WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = &wxConvLibcObj; +WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvUI = &wxConvLocal; WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName = & #ifdef __WXOSX__ wxConvUTF8Obj; @@ -2828,7 +3407,6 @@ WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName = & wxConvLibcObj; #endif - #else // !wxUSE_WCHAR_T // stand-ins in absence of wchar_t @@ -2838,5 +3416,3 @@ WXDLLIMPEXP_DATA_BASE(wxMBConv) wxConvLibc, wxConvUTF8; #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T - -