From 35d11700a5c88842e01e426a53d25ce1a3ecdffd Mon Sep 17 00:00:00 2001 From: Vadim Zeitlin Date: Wed, 5 Apr 2006 23:04:10 +0000 Subject: [PATCH 1/1] implemented UTF-16/32 using To/FromWChar() instead of MB2WC/WC2MB for sizeof(wchar_t)==4 platforms too git-svn-id: https://svn.wxwidgets.org/svn/wx/wxWidgets/trunk@38586 c3d73ce0-8a6f-49c7-b76d-6d57e0e08775 --- include/wx/strconv.h | 20 --- src/common/strconv.cpp | 310 +++++++++++++++++++++++------------------ 2 files changed, 174 insertions(+), 156 deletions(-) diff --git a/include/wx/strconv.h b/include/wx/strconv.h index eaecd92657..134434405a 100644 --- a/include/wx/strconv.h +++ b/include/wx/strconv.h @@ -286,15 +286,10 @@ protected: class WXDLLIMPEXP_BASE wxMBConvUTF16LE : public wxMBConvUTF16Base { public: -#if SIZEOF_WCHAR_T == 2 virtual size_t ToWChar(wchar_t *dst, size_t dstLen, const char *src, size_t srcLen = wxNO_LEN) const; virtual size_t FromWChar(char *dst, size_t dstLen, const wchar_t *src, size_t srcLen = wxNO_LEN) const; -#else - virtual size_t MB2WC(wchar_t *outputBuf, const char *psz, size_t outputSize) const; - virtual size_t WC2MB(char *outputBuf, const wchar_t *psz, size_t outputSize) const; -#endif virtual wxMBConv *Clone() const { return new wxMBConvUTF16LE; } }; @@ -305,15 +300,10 @@ public: class WXDLLIMPEXP_BASE wxMBConvUTF16BE : public wxMBConvUTF16Base { public: -#if SIZEOF_WCHAR_T == 2 virtual size_t ToWChar(wchar_t *dst, size_t dstLen, const char *src, size_t srcLen = wxNO_LEN) const; virtual size_t FromWChar(char *dst, size_t dstLen, const wchar_t *src, size_t srcLen = wxNO_LEN) const; -#else - virtual size_t MB2WC(wchar_t *outputBuf, const char *psz, size_t outputSize) const; - virtual size_t WC2MB(char *outputBuf, const wchar_t *psz, size_t outputSize) const; -#endif virtual wxMBConv *Clone() const { return new wxMBConvUTF16BE; } }; @@ -342,15 +332,10 @@ protected: class WXDLLIMPEXP_BASE wxMBConvUTF32LE : public wxMBConvUTF32Base { public: -#if SIZEOF_WCHAR_T == 2 virtual size_t ToWChar(wchar_t *dst, size_t dstLen, const char *src, size_t srcLen = wxNO_LEN) const; virtual size_t FromWChar(char *dst, size_t dstLen, const wchar_t *src, size_t srcLen = wxNO_LEN) const; -#else - virtual size_t MB2WC(wchar_t *outputBuf, const char *psz, size_t outputSize) const; - virtual size_t WC2MB(char *outputBuf, const wchar_t *psz, size_t outputSize) const; -#endif virtual wxMBConv *Clone() const { return new wxMBConvUTF32LE; } }; @@ -361,15 +346,10 @@ public: class WXDLLIMPEXP_BASE wxMBConvUTF32BE : public wxMBConvUTF32Base { public: -#if SIZEOF_WCHAR_T == 2 virtual size_t ToWChar(wchar_t *dst, size_t dstLen, const char *src, size_t srcLen = wxNO_LEN) const; virtual size_t FromWChar(char *dst, size_t dstLen, const wchar_t *src, size_t srcLen = wxNO_LEN) const; -#else - virtual size_t MB2WC(wchar_t *outputBuf, const char *psz, size_t outputSize) const; - virtual size_t WC2MB(char *outputBuf, const wchar_t *psz, size_t outputSize) const; -#endif virtual wxMBConv *Clone() const { return new wxMBConvUTF32BE; } }; diff --git a/src/common/strconv.cpp b/src/common/strconv.cpp index 7b28a13b10..f4636aaefb 100644 --- a/src/common/strconv.cpp +++ b/src/common/strconv.cpp @@ -144,13 +144,17 @@ static size_t decode_utf16(const wxUint16* input, wxUint32& output) } #ifdef WC_UTF16 + typedef wchar_t wxDecodeSurrogate_t; +#else // !WC_UTF16 + typedef wxUint16 wxDecodeSurrogate_t; +#endif // WC_UTF16/!WC_UTF16 // returns the next UTF-32 character from the wchar_t buffer and advances the // pointer to the character after this one // // if an invalid character is found, *pSrc is set to NULL, the caller must // check for this -static wxUint32 wxDecodeSurrogate(const wchar_t **pSrc) +static wxUint32 wxDecodeSurrogate(const wxDecodeSurrogate_t **pSrc) { wxUint32 out; const size_t n = decode_utf16(*pSrc, out); @@ -162,8 +166,6 @@ static wxUint32 wxDecodeSurrogate(const wchar_t **pSrc) return out; } -#endif // WC_UTF16 - // ---------------------------------------------------------------------------- // wxMBConv // ---------------------------------------------------------------------------- @@ -1071,130 +1073,163 @@ wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen, // conversions without endianness change // ---------------------------------------------------------------------------- -size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const +size_t +wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen, + const char *src, size_t srcLen) const { - size_t len=0; + srcLen = GetLength(src, srcLen); + if ( srcLen == wxNO_LEN ) + return wxCONV_FAILED; - while (*(wxUint16*)psz && (!buf || len < n)) + const size_t inLen = srcLen/BYTES_PER_CHAR; + if ( !dst ) { - wxUint32 cc; - size_t pa=decode_utf16((wxUint16*)psz, cc); - if (pa == wxCONV_FAILED) - return pa; + // optimization: return maximal space which could be needed for this + // string even if the real size could be smaller if the buffer contains + // any surrogates + return inLen; + } - if (buf) - *buf++ = (wchar_t)cc; - len++; - psz += pa * sizeof(wxUint16); + size_t outLen = 0; + const wxUint16 *in = wx_reinterpret_cast(const wxUint16 *, src); + for ( const wxUint16 * const inEnd = in + inLen; in < inEnd; ) + { + const wxUint32 ch = wxDecodeSurrogate(&in); + if ( !in ) + return wxCONV_FAILED; + + if ( ++outLen > dstLen ) + return wxCONV_FAILED; + + *dst++ = ch; } - if (buf && len 1) + if ( outLen > dstLen ) + return wxCONV_FAILED; + + *out++ = cc[0]; + if ( numChars == 2 ) { - *(wxUint16*)buf = cc[1]; - buf += sizeof(wxUint16); + // second character of a surrogate + *out++ = cc[1]; } } - - len += pa*sizeof(wxUint16); - psz++; } - if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0; - return len; + return outLen; } // ---------------------------------------------------------------------------- // endian-reversing conversions // ---------------------------------------------------------------------------- -// swap 16bit MB to 32bit String -size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const +size_t +wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen, + const char *src, size_t srcLen) const { - size_t len=0; + srcLen = GetLength(src, srcLen); + if ( srcLen == wxNO_LEN ) + return wxCONV_FAILED; - while (*(wxUint16*)psz && (!buf || len < n)) + const size_t inLen = srcLen/BYTES_PER_CHAR; + if ( !dst ) { - wxUint32 cc; - char tmp[4]; - tmp[0]=psz[1]; tmp[1]=psz[0]; - tmp[2]=psz[3]; tmp[3]=psz[2]; + // optimization: return maximal space which could be needed for this + // string even if the real size could be smaller if the buffer contains + // any surrogates + return inLen; + } - size_t pa=decode_utf16((wxUint16*)tmp, cc); - if (pa == wxCONV_FAILED) - return pa; + size_t outLen = 0; + const wxUint16 *in = wx_reinterpret_cast(const wxUint16 *, src); + for ( const wxUint16 * const inEnd = in + inLen; in < inEnd; ) + { + wxUint32 ch; + wxUint16 tmp[2]; + tmp[0] = wxUINT16_SWAP_ALWAYS(*in); + in++; + tmp[1] = wxUINT16_SWAP_ALWAYS(*in); - if (buf) - *buf++ = (wchar_t)cc; + const size_t numChars = decode_utf16(tmp, ch); + if ( numChars == wxCONV_FAILED ) + return wxCONV_FAILED; + + if ( numChars == 2 ) + in++; - len++; - psz += pa * sizeof(wxUint16); + if ( ++outLen > dstLen ) + return wxCONV_FAILED; + + *dst++ = ch; } - if (buf && len 1) + if ( outLen > dstLen ) + return wxCONV_FAILED; + + *out++ = wxUINT16_SWAP_ALWAYS(cc[0]); + if ( numChars == 2 ) { - *buf++ = ((char*)cc)[3]; - *buf++ = ((char*)cc)[2]; + // second character of a surrogate + *out++ = wxUINT16_SWAP_ALWAYS(cc[1]); } } - - len += pa*sizeof(wxUint16); - psz++; } - if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0; - return len; + return outLen; } #endif // WC_UTF16/!WC_UTF16 -// ---------------------------------------------------------------------------- +// ============================================================================ // UTF-32 -// ---------------------------------------------------------------------------- +// ============================================================================ #ifdef WORDS_BIGENDIAN #define wxMBConvUTF32straight wxMBConvUTF32BE @@ -1391,101 +1426,104 @@ wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen, #else // !WC_UTF16: wchar_t is UTF-32 -// copy 32bit MB to 32bit String -size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const +// ---------------------------------------------------------------------------- +// conversions without endianness change +// ---------------------------------------------------------------------------- + +size_t +wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen, + const char *src, size_t srcLen) const { - size_t len=0; + // use memcpy() as it should be much faster than hand-written loop + srcLen = GetLength(src, srcLen); + if ( srcLen == wxNO_LEN ) + return wxCONV_FAILED; - while (*(wxUint32*)psz && (!buf || len < n)) + const size_t inLen = srcLen/BYTES_PER_CHAR; + if ( dst ) { - if (buf) - *buf++ = (wchar_t)(*(wxUint32*)psz); - len++; - psz += sizeof(wxUint32); - } + if ( dstLen < inLen ) + return wxCONV_FAILED; - if (buf && len