X-Git-Url: https://git.saurik.com/wxWidgets.git/blobdiff_plain/bfab25d4c41327d6db170be1f832821f36cd425f..2454dc8a6480824859269fd316b9ca780ac7689e:/src/common/strconv.cpp diff --git a/src/common/strconv.cpp b/src/common/strconv.cpp index 778d8dbb05..01e0dc358c 100644 --- a/src/common/strconv.cpp +++ b/src/common/strconv.cpp @@ -86,6 +86,15 @@ // implementation // ============================================================================ +// helper function of cMB2WC(): check if n bytes at this location are all NUL +static bool NotAllNULs(const char *p, size_t n) +{ + while ( n && *p++ == '\0' ) + n--; + + return n != 0; +} + // ---------------------------------------------------------------------------- // UTF-16 en/decoding to/from UCS-4 // ---------------------------------------------------------------------------- @@ -138,178 +147,283 @@ static size_t decode_utf16(const wxUint16* input, wxUint32& output) // wxMBConv // ---------------------------------------------------------------------------- -wxMBConv::~wxMBConv() +size_t +wxMBConv::ToWChar(wchar_t *dst, size_t dstLen, + const char *src, size_t srcLen) const { - // nothing to do here (necessary for Darwin linking probably) -} + // although new conversion classes are supposed to implement this function + // directly, the existins ones only implement the old MB2WC() and so, to + // avoid to have to rewrite all conversion classes at once, we provide a + // default (but not efficient) implementation of this one in terms of the + // old function by copying the input to ensure that it's NUL-terminated and + // then using MB2WC() to convert it + + // the number of chars [which would be] written to dst [if it were not NULL] + size_t dstWritten = 0; + + // the number of NULs terminating this string + size_t nulLen wxDUMMY_INITIALIZE(0); + + // if we were not given the input size we just have to assume that the + // string is properly terminated as we have no way of knowing how long it + // is anyhow, but if we do have the size check whether there are enough + // NULs at the end + wxCharBuffer bufTmp; + const char *srcEnd; + if ( srcLen != (size_t)-1 ) + { + // we need to know how to find the end of this string + nulLen = GetMBNulLen(); + if ( nulLen == wxCONV_FAILED ) + return wxCONV_FAILED; + + // if there are enough NULs we can avoid the copy + if ( srcLen < nulLen || NotAllNULs(src + srcLen - nulLen, nulLen) ) + { + // make a copy in order to properly NUL-terminate the string + bufTmp = wxCharBuffer(srcLen + nulLen - 1 /* 1 will be added */); + char * const p = bufTmp.data(); + memcpy(p, src, srcLen); + for ( char *s = p + srcLen; s < p + srcLen + nulLen; s++ ) + *s = '\0'; + + src = bufTmp; + } -const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const -{ - if ( psz ) + srcEnd = src + srcLen; + } + else // quit after the first loop iteration { - // calculate the length of the buffer needed first - size_t nLen = MB2WC(NULL, psz, 0); - if ( nLen != (size_t)-1 ) + srcEnd = NULL; + } + + for ( ;; ) + { + // try to convert the current chunk + size_t lenChunk = MB2WC(NULL, src, 0); + if ( lenChunk == 0 ) { - // now do the actual conversion - wxWCharBuffer buf(nLen); - nLen = MB2WC(buf.data(), psz, nLen + 1); // with the trailing NULL - if ( nLen != (size_t)-1 ) - { - return buf; - } + // nothing left in the input string, conversion succeeded + break; } - } - wxWCharBuffer buf((wchar_t *)NULL); + if ( lenChunk == wxCONV_FAILED ) + return wxCONV_FAILED; - return buf; -} + // if we already have a previous chunk, leave the NUL separating it + // from this one + if ( dstWritten ) + { + dstWritten++; + if ( dst ) + dst++; + } -const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const -{ - if ( pwz ) - { - size_t nLen = WC2MB(NULL, pwz, 0); - if ( nLen != (size_t)-1 ) + dstWritten += lenChunk; + + if ( dst ) { - wxCharBuffer buf(nLen+3); // space for a wxUint32 trailing zero - nLen = WC2MB(buf.data(), pwz, nLen + 4); - if ( nLen != (size_t)-1 ) - { - return buf; - } + if ( dstWritten > dstLen ) + return wxCONV_FAILED; + + lenChunk = MB2WC(dst, src, lenChunk + 1 /* for NUL */); + if ( lenChunk == wxCONV_FAILED ) + return wxCONV_FAILED; + + dst += lenChunk; } - } - wxCharBuffer buf((char *)NULL); + if ( !srcEnd ) + { + // we convert the entire string in this cas, as we suppose that the + // string is NUL-terminated and so srcEnd is not used at all + break; + } - return buf; -} + // advance the input pointer past the end of this chunk + while ( NotAllNULs(src, nulLen) ) + { + // notice that we must skip over multiple bytes here as we suppose + // that if NUL takes 2 or 4 bytes, then all the other characters do + // too and so if advanced by a single byte we might erroneously + // detect sequences of NUL bytes in the middle of the input + src += nulLen; + } -const wxWCharBuffer wxMBConv::cMB2WC(const char *szString, size_t nStringLen, size_t* pOutSize) const -{ - wxASSERT(pOutSize != NULL); + src += nulLen; // skipping over its terminator as well - const char* szEnd = szString + nStringLen + 1; - const char* szPos = szString; - const char* szStart = szPos; + // note that ">=" (and not just "==") is needed here as the terminator + // we skipped just above could be inside or just after the buffer + // delimited by inEnd + if ( src >= srcEnd ) + break; + } - size_t nActualLength = 0; - size_t nCurrentSize = nStringLen; //try normal size first (should never resize?) + return dstWritten; +} - wxWCharBuffer theBuffer(nCurrentSize); +size_t +wxMBConv::FromWChar(char *dst, size_t dstLen, + const wchar_t *src, size_t srcLen) const +{ + // the number of chars [which would be] written to dst [if it were not NULL] + size_t dstWritten = 0; - //Convert the string until the length() is reached, continuing the - //loop every time a null character is reached - while(szPos != szEnd) + // make a copy of the input string unless it is already properly + // NUL-terminated + // + // if we don't know its length we have no choice but to assume that it is, + // indeed, properly terminated + wxWCharBuffer bufTmp; + if ( srcLen == (size_t)-1 ) { - wxASSERT(szPos < szEnd); //something is _really_ screwed up if this rings true - - //Get the length of the current (sub)string - size_t nLen = MB2WC(NULL, szPos, 0); + srcLen = wxWcslen(src) + 1; + } + else if ( srcLen != 0 && src[srcLen - 1] != L'\0' ) + { + // make a copy in order to properly NUL-terminate the string + bufTmp = wxWCharBuffer(srcLen); + memcpy(bufTmp.data(), src, srcLen*sizeof(wchar_t)); + src = bufTmp; + } - //Invalid conversion? - if( nLen == (size_t)-1 ) - { - *pOutSize = 0; - theBuffer.data()[0u] = wxT('\0'); - return theBuffer; - } + const size_t lenNul = GetMBNulLen(); + for ( const wchar_t * const srcEnd = src + srcLen; + src < srcEnd; + src += wxWcslen(src) + 1 /* skip L'\0' too */ ) + { + // try to convert the current chunk + size_t lenChunk = WC2MB(NULL, src, 0); + if ( lenChunk == wxCONV_FAILED ) + return wxCONV_FAILED; - //Increase the actual length (+1 for current null character) - nActualLength += nLen + 1; + lenChunk += lenNul; + dstWritten += lenChunk; - //if buffer too big, realloc the buffer - if (nActualLength > (nCurrentSize+1)) + if ( dst ) { - wxWCharBuffer theNewBuffer(nCurrentSize << 1); - memcpy(theNewBuffer.data(), theBuffer.data(), nCurrentSize * sizeof(wchar_t)); - theBuffer = theNewBuffer; - nCurrentSize <<= 1; - } + if ( dstWritten > dstLen ) + return wxCONV_FAILED; - //Convert the current (sub)string - if ( MB2WC(&theBuffer.data()[szPos - szStart], szPos, nLen + 1) == (size_t)-1 ) - { - *pOutSize = 0; - theBuffer.data()[0u] = wxT('\0'); - return theBuffer; - } + if ( WC2MB(dst, src, lenChunk) == wxCONV_FAILED ) + return wxCONV_FAILED; - //Increment to next (sub)string - //Note that we have to use strlen instead of nLen here - //because XX2XX gives us the size of the output buffer, - //which is not necessarily the length of the string - szPos += strlen(szPos) + 1; + dst += lenChunk; + } } - //success - return actual length and the buffer - *pOutSize = nActualLength; - return theBuffer; + return dstWritten; } -const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *szString, size_t nStringLen, size_t* pOutSize) const +size_t wxMBConv::MB2WC(wchar_t *out, const char *in, size_t outLen) const { - wxASSERT(pOutSize != NULL); + size_t rc = ToWChar(out, outLen, in); + if ( rc != wxCONV_FAILED ) + { + // ToWChar() returns the buffer length, i.e. including the trailing + // NUL, while this method doesn't take it into account + rc--; + } + + return rc; +} - const wchar_t* szEnd = szString + nStringLen + 1; - const wchar_t* szPos = szString; - const wchar_t* szStart = szPos; +size_t wxMBConv::WC2MB(char *out, const wchar_t *in, size_t outLen) const +{ + size_t rc = FromWChar(out, outLen, in); + if ( rc != wxCONV_FAILED ) + { + rc -= GetMBNulLen(); + } - size_t nActualLength = 0; - size_t nCurrentSize = nStringLen << 2; //try * 4 first + return rc; +} - wxCharBuffer theBuffer(nCurrentSize); +wxMBConv::~wxMBConv() +{ + // nothing to do here (necessary for Darwin linking probably) +} - //Convert the string until the length() is reached, continuing the - //loop every time a null character is reached - while(szPos != szEnd) +const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const +{ + if ( psz ) { - wxASSERT(szPos < szEnd); //something is _really_ screwed up if this rings true + // calculate the length of the buffer needed first + const size_t nLen = MB2WC(NULL, psz, 0); + if ( nLen != wxCONV_FAILED ) + { + // now do the actual conversion + wxWCharBuffer buf(nLen /* +1 added implicitly */); + + // +1 for the trailing NULL + if ( MB2WC(buf.data(), psz, nLen + 1) != wxCONV_FAILED ) + return buf; + } + } - //Get the length of the current (sub)string - size_t nLen = WC2MB(NULL, szPos, 0); + return wxWCharBuffer(); +} - //Invalid conversion? - if( nLen == (size_t)-1 ) +const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const +{ + if ( pwz ) + { + const size_t nLen = WC2MB(NULL, pwz, 0); + if ( nLen != wxCONV_FAILED ) { - *pOutSize = 0; - theBuffer.data()[0u] = wxT('\0'); - return theBuffer; + // extra space for trailing NUL(s) + static const size_t extraLen = GetMaxMBNulLen(); + + wxCharBuffer buf(nLen + extraLen - 1); + if ( WC2MB(buf.data(), pwz, nLen + extraLen) != wxCONV_FAILED ) + return buf; } + } - //Increase the actual length (+1 for current null character) - nActualLength += nLen + 1; + return wxCharBuffer(); +} - //if buffer too big, realloc the buffer - if (nActualLength > (nCurrentSize+1)) +const wxWCharBuffer +wxMBConv::cMB2WC(const char *in, size_t inLen, size_t *outLen) const +{ + const size_t dstLen = ToWChar(NULL, 0, in, inLen); + if ( dstLen != wxCONV_FAILED ) + { + wxWCharBuffer wbuf(dstLen); + if ( ToWChar(wbuf.data(), dstLen, in, inLen) ) { - wxCharBuffer theNewBuffer(nCurrentSize << 1); - memcpy(theNewBuffer.data(), theBuffer.data(), nCurrentSize); - theBuffer = theNewBuffer; - nCurrentSize <<= 1; + if ( outLen ) + *outLen = dstLen; + return wbuf; } + } + + if ( outLen ) + *outLen = 0; + + return wxWCharBuffer(); +} - //Convert the current (sub)string - if(WC2MB(&theBuffer.data()[szPos - szStart], szPos, nLen + 1) == (size_t)-1 ) +const wxCharBuffer +wxMBConv::cWC2MB(const wchar_t *in, size_t inLen, size_t *outLen) const +{ + const size_t dstLen = FromWChar(NULL, 0, in, inLen); + if ( dstLen != wxCONV_FAILED ) + { + wxCharBuffer buf(dstLen); + if ( FromWChar(buf.data(), dstLen, in, inLen) ) { - *pOutSize = 0; - theBuffer.data()[0u] = wxT('\0'); - return theBuffer; + if ( outLen ) + *outLen = dstLen; + return buf; } - - //Increment to next (sub)string - //Note that we have to use wxWcslen instead of nLen here - //because XX2XX gives us the size of the output buffer, - //which is not necessarily the length of the string - szPos += wxWcslen(szPos) + 1; } - //success - return actual length and the buffer - *pOutSize = nActualLength; - return theBuffer; + if ( outLen ) + *outLen = 0; + + return wxCharBuffer(); } // ---------------------------------------------------------------------------- @@ -326,12 +440,12 @@ size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const return wxWC2MB(buf, psz, n); } -#ifdef __UNIX__ - // ---------------------------------------------------------------------------- // wxConvBrokenFileNames // ---------------------------------------------------------------------------- +#ifdef __UNIX__ + wxConvBrokenFileNames::wxConvBrokenFileNames(const wxChar *charset) { if ( !charset || wxStricmp(charset, _T("UTF-8")) == 0 @@ -341,23 +455,7 @@ wxConvBrokenFileNames::wxConvBrokenFileNames(const wxChar *charset) m_conv = new wxCSConv(charset); } -size_t -wxConvBrokenFileNames::MB2WC(wchar_t *outputBuf, - const char *psz, - size_t outputSize) const -{ - return m_conv->MB2WC( outputBuf, psz, outputSize ); -} - -size_t -wxConvBrokenFileNames::WC2MB(char *outputBuf, - const wchar_t *psz, - size_t outputSize) const -{ - return m_conv->WC2MB( outputBuf, psz, outputSize ); -} - -#endif +#endif // __UNIX__ // ---------------------------------------------------------------------------- // UTF-7 @@ -509,8 +607,6 @@ static const unsigned char utf7encode[128] = size_t wxMBConvUTF7::WC2MB(char *buf, const wchar_t *psz, size_t n) const { - - size_t len = 0; while (*psz && ((!buf) || (len < n))) @@ -864,7 +960,9 @@ size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const { size_t len = 0; - while ( *psz && (!buf || len < n) ) + // UTF16 string must be terminated by 2 NULs as single NULs may occur + // inside the string + while ( (psz[0] || psz[1]) && (!buf || len < n) ) { if ( buf ) { @@ -886,19 +984,21 @@ size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const // swap 16bit MB to 16bit String size_t wxMBConvUTF16swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const { - size_t len=0; + size_t len = 0; - while (*psz && (!buf || len < n)) + while ( *psz && (!buf || len < n) ) { - if (buf) + if ( buf ) { *buf++ = ((char*)psz)[1]; *buf++ = ((char*)psz)[0]; } - len += sizeof(wxUint16); + len += 2; psz++; } - if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0; + + if ( buf && len < n ) + *buf = '\0'; return len; } @@ -1320,6 +1420,10 @@ public: virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const; virtual size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const; + // classify this encoding as explained in wxMBConv::GetMBNulLen() + // comment + virtual size_t GetMBNulLen() const; + bool IsOk() const { return (m2w != ICONV_T_INVALID) && (w2m != ICONV_T_INVALID); } @@ -1341,6 +1445,10 @@ private: // true if the wide char encoding we use (i.e. ms_wcCharsetName) has // different endian-ness than the native one static bool ms_wcNeedsSwap; + + // cached result of GetMBNulLen(); set to 0 meaning "unknown" + // initially + size_t m_minMBCharWidth; }; // make the constructor available for unit testing @@ -1360,6 +1468,8 @@ bool wxMBConv_iconv::ms_wcNeedsSwap = false; wxMBConv_iconv::wxMBConv_iconv(const wxChar *name) { + m_minMBCharWidth = 0; + // iconv operates with chars, not wxChars, but luckily it uses only ASCII // names for the charsets const wxCharBuffer cname(wxString(name).ToAscii()); @@ -1482,6 +1592,31 @@ wxMBConv_iconv::~wxMBConv_iconv() size_t wxMBConv_iconv::MB2WC(wchar_t *buf, const char *psz, size_t n) const { + // find the string length: notice that must be done differently for + // NUL-terminated strings and UTF-16/32 which are terminated with 2/4 NULs + size_t inbuf; + const size_t nulLen = GetMBNulLen(); + switch ( nulLen ) + { + default: + return (size_t)-1; + + case 1: + inbuf = strlen(psz); // arguably more optimized than our version + break; + + case 2: + case 4: + // for UTF-16/32 not only we need to have 2/4 consecutive NULs but + // they also have to start at character boundary and not span two + // adjacent characters + const char *p; + for ( p = psz; NotAllNULs(p, nulLen); p += nulLen ) + ; + inbuf = p - psz; + break; + } + #if wxUSE_THREADS // NB: iconv() is MT-safe, but each thread must use it's own iconv_t handle. // Unfortunately there is a couple of global wxCSConv objects such as @@ -1490,9 +1625,9 @@ size_t wxMBConv_iconv::MB2WC(wchar_t *buf, const char *psz, size_t n) const // only a few wx classes would be safe to use from non-main threads // as MB<->WC conversion would fail "randomly". wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex); -#endif +#endif // wxUSE_THREADS + - size_t inbuf = strlen(psz); size_t outbuf = n * SIZEOF_WCHAR_T; size_t res, cres; // VS: Use these instead of psz, buf because iconv() modifies its arguments: @@ -1514,9 +1649,7 @@ size_t wxMBConv_iconv::MB2WC(wchar_t *buf, const char *psz, size_t n) const buf[n] = WC_BSWAP(buf[i]); } - // NB: iconv was given only strlen(psz) characters on input, and so - // it couldn't convert the trailing zero. Let's do it ourselves - // if there's some room left for it in the output buffer. + // NUL-terminate the string if there is any space left if (res < n) buf[res] = 0; } @@ -1616,6 +1749,36 @@ size_t wxMBConv_iconv::WC2MB(char *buf, const wchar_t *psz, size_t n) const return res; } +size_t wxMBConv_iconv::GetMBNulLen() const +{ + if ( m_minMBCharWidth == 0 ) + { + wxMBConv_iconv * const self = wxConstCast(this, wxMBConv_iconv); + +#if wxUSE_THREADS + // NB: explained in MB2WC + wxMutexLocker lock(self->m_iconvMutex); +#endif + + wchar_t *wnul = L""; + char buf[8]; // should be enough for NUL in any encoding + size_t inLen = sizeof(wchar_t), + outLen = WXSIZEOF(buf); + char *in = (char *)wnul; + char *out = buf; + if ( iconv(w2m, ICONV_CHAR_CAST(&in), &inLen, &out, &outLen) == (size_t)-1 ) + { + self->m_minMBCharWidth = (size_t)-1; + } + else // ok + { + self->m_minMBCharWidth = out - buf; + } + } + + return m_minMBCharWidth; +} + #endif // HAVE_ICONV @@ -1637,19 +1800,22 @@ public: wxMBConv_win32() { m_CodePage = CP_ACP; + m_minMBCharWidth = 0; } #if wxUSE_FONTMAP wxMBConv_win32(const wxChar* name) { m_CodePage = wxCharsetToCodepage(name); + m_minMBCharWidth = 0; } wxMBConv_win32(wxFontEncoding encoding) { m_CodePage = wxEncodingToCodepage(encoding); + m_minMBCharWidth = 0; } -#endif +#endif // wxUSE_FONTMAP size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const { @@ -1816,6 +1982,44 @@ public: return len - 1; } + virtual size_t GetMBNulLen() const + { + if ( m_minMBCharWidth == 0 ) + { + int len = ::WideCharToMultiByte + ( + m_CodePage, // code page + 0, // no flags + L"", // input string + 1, // translate just the NUL + NULL, // output buffer + 0, // and its size + NULL, // no replacement char + NULL // [out] don't care if it was used + ); + + wxMBConv_win32 * const self = wxConstCast(this, wxMBConv_win32); + switch ( len ) + { + default: + wxLogDebug(_T("Unexpected NUL length %d"), len); + // fall through + + case 0: + self->m_minMBCharWidth = (size_t)-1; + break; + + case 1: + case 2: + case 4: + self->m_minMBCharWidth = len; + break; + } + } + + return m_minMBCharWidth; + } + bool IsOk() const { return m_CodePage != -1; } private: @@ -1874,7 +2078,13 @@ private: #endif } + + // the code page we're working with long m_CodePage; + + // cached result of GetMBNulLen(), set to 0 initially meaning + // "unknown" + size_t m_minMBCharWidth; }; #endif // wxHAVE_WIN32_MB2WC @@ -2508,12 +2718,30 @@ public: return inbuf; } + virtual size_t GetMBNulLen() const + { + switch ( m_enc ) + { + case wxFONTENCODING_UTF16BE: + case wxFONTENCODING_UTF16LE: + return 2; + + case wxFONTENCODING_UTF32BE: + case wxFONTENCODING_UTF32LE: + return 4; + + default: + return 1; + } + } + bool IsOk() const { return m_ok; } public: wxFontEncoding m_enc; wxEncodingConverter m2w, w2m; +private: // were we initialized successfully? bool m_ok; @@ -2906,6 +3134,18 @@ size_t wxCSConv::WC2MB(char *buf, const wchar_t *psz, size_t n) const return len; } +size_t wxCSConv::GetMBNulLen() const +{ + CreateConvIfNeeded(); + + if ( m_convReal ) + { + return m_convReal->GetMBNulLen(); + } + + return 1; +} + // ---------------------------------------------------------------------------- // globals // ----------------------------------------------------------------------------