X-Git-Url: https://git.saurik.com/wxWidgets.git/blobdiff_plain/eec47cc6c45a2d9a474cae9da897ebfdb0b7be21..5e4bf05abdcf56390db5ebd1609326ecda47d64f:/src/common/strconv.cpp diff --git a/src/common/strconv.cpp b/src/common/strconv.cpp index dfc8a40dac..01e0dc358c 100644 --- a/src/common/strconv.cpp +++ b/src/common/strconv.cpp @@ -86,6 +86,15 @@ // implementation // ============================================================================ +// helper function of cMB2WC(): check if n bytes at this location are all NUL +static bool NotAllNULs(const char *p, size_t n) +{ + while ( n && *p++ == '\0' ) + n--; + + return n != 0; +} + // ---------------------------------------------------------------------------- // UTF-16 en/decoding to/from UCS-4 // ---------------------------------------------------------------------------- @@ -138,143 +147,258 @@ static size_t decode_utf16(const wxUint16* input, wxUint32& output) // wxMBConv // ---------------------------------------------------------------------------- -wxMBConv::~wxMBConv() -{ - // nothing to do here (necessary for Darwin linking probably) -} - -const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const +size_t +wxMBConv::ToWChar(wchar_t *dst, size_t dstLen, + const char *src, size_t srcLen) const { - if ( psz ) + // although new conversion classes are supposed to implement this function + // directly, the existins ones only implement the old MB2WC() and so, to + // avoid to have to rewrite all conversion classes at once, we provide a + // default (but not efficient) implementation of this one in terms of the + // old function by copying the input to ensure that it's NUL-terminated and + // then using MB2WC() to convert it + + // the number of chars [which would be] written to dst [if it were not NULL] + size_t dstWritten = 0; + + // the number of NULs terminating this string + size_t nulLen wxDUMMY_INITIALIZE(0); + + // if we were not given the input size we just have to assume that the + // string is properly terminated as we have no way of knowing how long it + // is anyhow, but if we do have the size check whether there are enough + // NULs at the end + wxCharBuffer bufTmp; + const char *srcEnd; + if ( srcLen != (size_t)-1 ) { - // calculate the length of the buffer needed first - size_t nLen = MB2WC(NULL, psz, 0); - if ( nLen != (size_t)-1 ) + // we need to know how to find the end of this string + nulLen = GetMBNulLen(); + if ( nulLen == wxCONV_FAILED ) + return wxCONV_FAILED; + + // if there are enough NULs we can avoid the copy + if ( srcLen < nulLen || NotAllNULs(src + srcLen - nulLen, nulLen) ) { - // now do the actual conversion - wxWCharBuffer buf(nLen); - nLen = MB2WC(buf.data(), psz, nLen + 1); // with the trailing NULL - if ( nLen != (size_t)-1 ) - { - return buf; - } + // make a copy in order to properly NUL-terminate the string + bufTmp = wxCharBuffer(srcLen + nulLen - 1 /* 1 will be added */); + char * const p = bufTmp.data(); + memcpy(p, src, srcLen); + for ( char *s = p + srcLen; s < p + srcLen + nulLen; s++ ) + *s = '\0'; + + src = bufTmp; } + + srcEnd = src + srcLen; + } + else // quit after the first loop iteration + { + srcEnd = NULL; } - wxWCharBuffer buf((wchar_t *)NULL); + for ( ;; ) + { + // try to convert the current chunk + size_t lenChunk = MB2WC(NULL, src, 0); + if ( lenChunk == 0 ) + { + // nothing left in the input string, conversion succeeded + break; + } - return buf; -} + if ( lenChunk == wxCONV_FAILED ) + return wxCONV_FAILED; -const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const -{ - if ( pwz ) - { - size_t nLen = WC2MB(NULL, pwz, 0); - if ( nLen != (size_t)-1 ) + // if we already have a previous chunk, leave the NUL separating it + // from this one + if ( dstWritten ) { - wxCharBuffer buf(nLen+3); // space for a wxUint32 trailing zero - nLen = WC2MB(buf.data(), pwz, nLen + 4); - if ( nLen != (size_t)-1 ) - { - return buf; - } + dstWritten++; + if ( dst ) + dst++; } - } - wxCharBuffer buf((char *)NULL); + dstWritten += lenChunk; - return buf; -} + if ( dst ) + { + if ( dstWritten > dstLen ) + return wxCONV_FAILED; -const wxWCharBuffer -wxMBConv::cMB2WC(const char *in, size_t inLen, size_t *outLen) const -{ - // the currently accumulated wide characters - wxWCharBuffer wbuf; + lenChunk = MB2WC(dst, src, lenChunk + 1 /* for NUL */); + if ( lenChunk == wxCONV_FAILED ) + return wxCONV_FAILED; - // the current length of wbuf - size_t lenBuf = 0; + dst += lenChunk; + } - // we need to know the representation of L'\0' for this conversion - size_t nulLen; - const char * const nul = GetMBNul(&nulLen); - if ( nulLen == (size_t)-1 || nulLen == 0 ) - return wxWCharBuffer(); + if ( !srcEnd ) + { + // we convert the entire string in this cas, as we suppose that the + // string is NUL-terminated and so srcEnd is not used at all + break; + } + + // advance the input pointer past the end of this chunk + while ( NotAllNULs(src, nulLen) ) + { + // notice that we must skip over multiple bytes here as we suppose + // that if NUL takes 2 or 4 bytes, then all the other characters do + // too and so if advanced by a single byte we might erroneously + // detect sequences of NUL bytes in the middle of the input + src += nulLen; + } + + src += nulLen; // skipping over its terminator as well + + // note that ">=" (and not just "==") is needed here as the terminator + // we skipped just above could be inside or just after the buffer + // delimited by inEnd + if ( src >= srcEnd ) + break; + } + + return dstWritten; +} + +size_t +wxMBConv::FromWChar(char *dst, size_t dstLen, + const wchar_t *src, size_t srcLen) const +{ + // the number of chars [which would be] written to dst [if it were not NULL] + size_t dstWritten = 0; // make a copy of the input string unless it is already properly // NUL-terminated - wxCharBuffer bufTmp; + // + // if we don't know its length we have no choice but to assume that it is, + // indeed, properly terminated + wxWCharBuffer bufTmp; + if ( srcLen == (size_t)-1 ) + { + srcLen = wxWcslen(src) + 1; + } + else if ( srcLen != 0 && src[srcLen - 1] != L'\0' ) + { + // make a copy in order to properly NUL-terminate the string + bufTmp = wxWCharBuffer(srcLen); + memcpy(bufTmp.data(), src, srcLen*sizeof(wchar_t)); + src = bufTmp; + } - // now we can compute the input size if we were not given it: notice that - // in this case the string must be properly NUL-terminated, of course, as - // otherwise we have no way of knowing how long it is - if ( inLen == (size_t)-1 ) + const size_t lenNul = GetMBNulLen(); + for ( const wchar_t * const srcEnd = src + srcLen; + src < srcEnd; + src += wxWcslen(src) + 1 /* skip L'\0' too */ ) { - // not the most efficient algorithm but it shouldn't matter as normally - // there are not many NULs in the string and so normally memcmp() - // should stop on the first character - for ( const char *p = in; ; p++ ) + // try to convert the current chunk + size_t lenChunk = WC2MB(NULL, src, 0); + + if ( lenChunk == wxCONV_FAILED ) + return wxCONV_FAILED; + + lenChunk += lenNul; + dstWritten += lenChunk; + + if ( dst ) { - if ( memcmp(p, nul, nulLen) == 0 ) - break; + if ( dstWritten > dstLen ) + return wxCONV_FAILED; + + if ( WC2MB(dst, src, lenChunk) == wxCONV_FAILED ) + return wxCONV_FAILED; + + dst += lenChunk; } + } - inLen = p - in + nulLen; + return dstWritten; +} + +size_t wxMBConv::MB2WC(wchar_t *out, const char *in, size_t outLen) const +{ + size_t rc = ToWChar(out, outLen, in); + if ( rc != wxCONV_FAILED ) + { + // ToWChar() returns the buffer length, i.e. including the trailing + // NUL, while this method doesn't take it into account + rc--; } - else // we already have the size + + return rc; +} + +size_t wxMBConv::WC2MB(char *out, const wchar_t *in, size_t outLen) const +{ + size_t rc = FromWChar(out, outLen, in); + if ( rc != wxCONV_FAILED ) { - // check if it's not already NUL-terminated too to avoid the copy - if ( inLen < nulLen || memcmp(in + inLen - nulLen, nul, nulLen) != 0 ) - { - // make a copy in order to properly NUL-terminate the string - bufTmp = wxCharBuffer(inLen + nulLen - 1 /* 1 will be added */); - memcpy(bufTmp.data(), in, inLen); - memcpy(bufTmp.data() + inLen, nul, nulLen); - } + rc -= GetMBNulLen(); } - if ( bufTmp ) - in = bufTmp; + return rc; +} - for ( const char * const inEnd = in + inLen;; ) +wxMBConv::~wxMBConv() +{ + // nothing to do here (necessary for Darwin linking probably) +} + +const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const +{ + if ( psz ) { - // try to convert the current chunk if anything left - size_t lenChunk = in < inEnd ? MB2WC(NULL, in, 0) : 0; - if ( lenChunk == 0 ) + // calculate the length of the buffer needed first + const size_t nLen = MB2WC(NULL, psz, 0); + if ( nLen != wxCONV_FAILED ) { - // nothing left in the input string, conversion succeeded - if ( outLen ) - { - // we shouldn't include the last NUL in the result length - *outLen = lenBuf ? lenBuf - 1 : 0; - } + // now do the actual conversion + wxWCharBuffer buf(nLen /* +1 added implicitly */); - return wbuf; + // +1 for the trailing NULL + if ( MB2WC(buf.data(), psz, nLen + 1) != wxCONV_FAILED ) + return buf; } + } - if ( lenChunk == (size_t)-1 ) - break; - - const size_t lenBufNew = lenBuf + lenChunk; - if ( !wbuf.extend(lenBufNew) ) - break; + return wxWCharBuffer(); +} - lenChunk = MB2WC(wbuf.data() + lenBuf, in, lenChunk + 1 /* for NUL */); - if ( lenChunk == (size_t)-1 ) - break; +const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const +{ + if ( pwz ) + { + const size_t nLen = WC2MB(NULL, pwz, 0); + if ( nLen != wxCONV_FAILED ) + { + // extra space for trailing NUL(s) + static const size_t extraLen = GetMaxMBNulLen(); - // +! for the embedded NUL (if something follows) - lenBuf = lenBufNew + 1; + wxCharBuffer buf(nLen + extraLen - 1); + if ( WC2MB(buf.data(), pwz, nLen + extraLen) != wxCONV_FAILED ) + return buf; + } + } - // advance the input pointer past the end of this chunk - while ( memcmp(in, nul, nulLen) != 0 ) - in++; + return wxCharBuffer(); +} - in += nulLen; // skipping over its terminator as well +const wxWCharBuffer +wxMBConv::cMB2WC(const char *in, size_t inLen, size_t *outLen) const +{ + const size_t dstLen = ToWChar(NULL, 0, in, inLen); + if ( dstLen != wxCONV_FAILED ) + { + wxWCharBuffer wbuf(dstLen); + if ( ToWChar(wbuf.data(), dstLen, in, inLen) ) + { + if ( outLen ) + *outLen = dstLen; + return wbuf; + } } - // conversion failed if ( outLen ) *outLen = 0; @@ -284,62 +408,18 @@ wxMBConv::cMB2WC(const char *in, size_t inLen, size_t *outLen) const const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *in, size_t inLen, size_t *outLen) const { - // the currently accumulated multibyte characters - wxCharBuffer buf; - - // the current length of buf - size_t lenBuf = 0; - - // make a copy of the input string unless it is already properly - // NUL-terminated - // - // if we don't know its length we have no choice but to assume that it is, - // indeed, properly terminated - wxWCharBuffer bufTmp; - if ( inLen == (size_t)-1 ) + const size_t dstLen = FromWChar(NULL, 0, in, inLen); + if ( dstLen != wxCONV_FAILED ) { - inLen = wxWcslen(in) + 1; - } - else if ( inLen != 0 && in[inLen - 1] != L'\0' ) - { - // make a copy in order to properly NUL-terminate the string - bufTmp = wxWCharBuffer(inLen); - memcpy(bufTmp.data(), in, inLen*sizeof(wchar_t)); - } - - if ( bufTmp ) - in = bufTmp; - - for ( const wchar_t * const inEnd = in + inLen;; ) - { - // try to convert the current chunk, if anything left - size_t lenChunk = in < inEnd ? WC2MB(NULL, in, 0) : 0; - if ( lenChunk == 0 ) + wxCharBuffer buf(dstLen); + if ( FromWChar(buf.data(), dstLen, in, inLen) ) { - // nothing left in the input string, conversion succeeded if ( outLen ) - *outLen = lenBuf ? lenBuf - 1 : lenBuf; - + *outLen = dstLen; return buf; } - - if ( lenChunk == (size_t)-1 ) - break; - - const size_t lenBufNew = lenBuf + lenChunk; - if ( !buf.extend(lenBufNew) ) - break; - - lenChunk = WC2MB(buf.data() + lenBuf, in, lenChunk + 1 /* for NUL */); - if ( lenChunk == (size_t)-1 ) - break; - - // chunk successfully converted, go to the next one - in += wxWcslen(in) + 1 /* skip NUL too */; - lenBuf = lenBufNew + 1; } - // conversion failed if ( outLen ) *outLen = 0; @@ -1340,6 +1420,10 @@ public: virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const; virtual size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const; + // classify this encoding as explained in wxMBConv::GetMBNulLen() + // comment + virtual size_t GetMBNulLen() const; + bool IsOk() const { return (m2w != ICONV_T_INVALID) && (w2m != ICONV_T_INVALID); } @@ -1354,8 +1438,6 @@ protected: #endif private: - virtual const char *GetMBNul(size_t *nulLen) const; - // the name (for iconv_open()) of a wide char charset -- if none is // available on this machine, it will remain NULL static wxString ms_wcCharsetName; @@ -1364,9 +1446,9 @@ private: // different endian-ness than the native one static bool ms_wcNeedsSwap; - // NUL representation - size_t m_nulLen; - char m_nulBuf[8]; + // cached result of GetMBNulLen(); set to 0 meaning "unknown" + // initially + size_t m_minMBCharWidth; }; // make the constructor available for unit testing @@ -1386,7 +1468,7 @@ bool wxMBConv_iconv::ms_wcNeedsSwap = false; wxMBConv_iconv::wxMBConv_iconv(const wxChar *name) { - m_nulLen = (size_t)-2; + m_minMBCharWidth = 0; // iconv operates with chars, not wxChars, but luckily it uses only ASCII // names for the charsets @@ -1510,6 +1592,31 @@ wxMBConv_iconv::~wxMBConv_iconv() size_t wxMBConv_iconv::MB2WC(wchar_t *buf, const char *psz, size_t n) const { + // find the string length: notice that must be done differently for + // NUL-terminated strings and UTF-16/32 which are terminated with 2/4 NULs + size_t inbuf; + const size_t nulLen = GetMBNulLen(); + switch ( nulLen ) + { + default: + return (size_t)-1; + + case 1: + inbuf = strlen(psz); // arguably more optimized than our version + break; + + case 2: + case 4: + // for UTF-16/32 not only we need to have 2/4 consecutive NULs but + // they also have to start at character boundary and not span two + // adjacent characters + const char *p; + for ( p = psz; NotAllNULs(p, nulLen); p += nulLen ) + ; + inbuf = p - psz; + break; + } + #if wxUSE_THREADS // NB: iconv() is MT-safe, but each thread must use it's own iconv_t handle. // Unfortunately there is a couple of global wxCSConv objects such as @@ -1518,9 +1625,9 @@ size_t wxMBConv_iconv::MB2WC(wchar_t *buf, const char *psz, size_t n) const // only a few wx classes would be safe to use from non-main threads // as MB<->WC conversion would fail "randomly". wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex); -#endif +#endif // wxUSE_THREADS + - size_t inbuf = strlen(psz); size_t outbuf = n * SIZEOF_WCHAR_T; size_t res, cres; // VS: Use these instead of psz, buf because iconv() modifies its arguments: @@ -1542,9 +1649,7 @@ size_t wxMBConv_iconv::MB2WC(wchar_t *buf, const char *psz, size_t n) const buf[n] = WC_BSWAP(buf[i]); } - // NB: iconv was given only strlen(psz) characters on input, and so - // it couldn't convert the trailing zero. Let's do it ourselves - // if there's some room left for it in the output buffer. + // NUL-terminate the string if there is any space left if (res < n) buf[res] = 0; } @@ -1644,9 +1749,9 @@ size_t wxMBConv_iconv::WC2MB(char *buf, const wchar_t *psz, size_t n) const return res; } -const char *wxMBConv_iconv::GetMBNul(size_t *nulLen) const +size_t wxMBConv_iconv::GetMBNulLen() const { - if ( m_nulLen == (size_t)-2 ) + if ( m_minMBCharWidth == 0 ) { wxMBConv_iconv * const self = wxConstCast(this, wxMBConv_iconv); @@ -1655,14 +1760,23 @@ const char *wxMBConv_iconv::GetMBNul(size_t *nulLen) const wxMutexLocker lock(self->m_iconvMutex); #endif - size_t inLen = 1, - outLen = WXSIZEOF(m_nulBuf); - self->m_nulLen = iconv(w2m, ICONV_CHAR_CAST(L""), &inLen, - &self->m_nulBuf, &outLen); + wchar_t *wnul = L""; + char buf[8]; // should be enough for NUL in any encoding + size_t inLen = sizeof(wchar_t), + outLen = WXSIZEOF(buf); + char *in = (char *)wnul; + char *out = buf; + if ( iconv(w2m, ICONV_CHAR_CAST(&in), &inLen, &out, &outLen) == (size_t)-1 ) + { + self->m_minMBCharWidth = (size_t)-1; + } + else // ok + { + self->m_minMBCharWidth = out - buf; + } } - *nulLen = m_nulLen; - return m_nulBuf; + return m_minMBCharWidth; } #endif // HAVE_ICONV @@ -1686,20 +1800,20 @@ public: wxMBConv_win32() { m_CodePage = CP_ACP; - m_nulLen = (size_t)-2; + m_minMBCharWidth = 0; } #if wxUSE_FONTMAP wxMBConv_win32(const wxChar* name) { m_CodePage = wxCharsetToCodepage(name); - m_nulLen = (size_t)-2; + m_minMBCharWidth = 0; } wxMBConv_win32(wxFontEncoding encoding) { m_CodePage = wxEncodingToCodepage(encoding); - m_nulLen = (size_t)-2; + m_minMBCharWidth = 0; } #endif // wxUSE_FONTMAP @@ -1868,6 +1982,44 @@ public: return len - 1; } + virtual size_t GetMBNulLen() const + { + if ( m_minMBCharWidth == 0 ) + { + int len = ::WideCharToMultiByte + ( + m_CodePage, // code page + 0, // no flags + L"", // input string + 1, // translate just the NUL + NULL, // output buffer + 0, // and its size + NULL, // no replacement char + NULL // [out] don't care if it was used + ); + + wxMBConv_win32 * const self = wxConstCast(this, wxMBConv_win32); + switch ( len ) + { + default: + wxLogDebug(_T("Unexpected NUL length %d"), len); + // fall through + + case 0: + self->m_minMBCharWidth = (size_t)-1; + break; + + case 1: + case 2: + case 4: + self->m_minMBCharWidth = len; + break; + } + } + + return m_minMBCharWidth; + } + bool IsOk() const { return m_CodePage != -1; } private: @@ -1926,35 +2078,13 @@ private: #endif } - virtual const char *GetMBNul(size_t *nulLen) const - { - if ( m_nulLen == (size_t)-2 ) - { - wxMBConv_win32 * const self = wxConstCast(this, wxMBConv_win32); - - self->m_nulLen = ::WideCharToMultiByte - ( - m_CodePage, // code page - 0, // no flags - L"", // input string - 1, // translate just NUL - self->m_nulBuf, // output buffer - WXSIZEOF(m_nulBuf), // and its size - NULL, // "replacement" char - NULL // [out] was it used? - ); - - if ( m_nulLen == 0 ) - self->m_nulLen = (size_t)-1; - } - - *nulLen = m_nulLen; - return m_nulBuf; - } + // the code page we're working with long m_CodePage; - size_t m_nulLen; - char m_nulBuf[8]; + + // cached result of GetMBNulLen(), set to 0 initially meaning + // "unknown" + size_t m_minMBCharWidth; }; #endif // wxHAVE_WIN32_MB2WC @@ -2588,33 +2718,30 @@ public: return inbuf; } - bool IsOk() const { return m_ok; } - -public: - wxFontEncoding m_enc; - wxEncodingConverter m2w, w2m; - -private: - virtual const char *GetMBNul(size_t *nulLen) const + virtual size_t GetMBNulLen() const { switch ( m_enc ) { case wxFONTENCODING_UTF16BE: case wxFONTENCODING_UTF16LE: - *nulLen = 2; - return "\0"; + return 2; case wxFONTENCODING_UTF32BE: case wxFONTENCODING_UTF32LE: - *nulLen = 4; - return "\0\0\0"; + return 4; default: - *nulLen = 1; - return ""; + return 1; } } + bool IsOk() const { return m_ok; } + +public: + wxFontEncoding m_enc; + wxEncodingConverter m2w, w2m; + +private: // were we initialized successfully? bool m_ok; @@ -3007,18 +3134,16 @@ size_t wxCSConv::WC2MB(char *buf, const wchar_t *psz, size_t n) const return len; } -const char *wxCSConv::GetMBNul(size_t *nulLen) const +size_t wxCSConv::GetMBNulLen() const { CreateConvIfNeeded(); if ( m_convReal ) { - // cast needed just to call private function of m_convReal - return ((wxCSConv *)m_convReal)->GetMBNul(nulLen); + return m_convReal->GetMBNulLen(); } - *nulLen = 1; - return ""; + return 1; } // ----------------------------------------------------------------------------