From: Vadim Zeitlin Date: Mon, 22 Sep 2003 00:12:10 +0000 (+0000) Subject: added conversions to/from UTF 16/32 LE/BE (patch 809685) X-Git-Url: https://git.saurik.com/wxWidgets.git/commitdiff_plain/c91830cb4b924451c1ccd8835d64bd71f8e1df54 added conversions to/from UTF 16/32 LE/BE (patch 809685) git-svn-id: https://svn.wxwidgets.org/svn/wx/wxWidgets/trunk@23792 c3d73ce0-8a6f-49c7-b76d-6d57e0e08775 --- diff --git a/include/wx/fontenc.h b/include/wx/fontenc.h index deb83fd070..d507f4ef3c 100644 --- a/include/wx/fontenc.h +++ b/include/wx/fontenc.h @@ -68,6 +68,12 @@ enum wxFontEncoding wxFONTENCODING_UTF7, // UTF-7 Unicode encoding wxFONTENCODING_UTF8, // UTF-8 Unicode encoding + wxFONTENCODING_UTF16, // UTF-16 Unicode encoding + wxFONTENCODING_UTF16BE, // UTF-16 Big Endian Unicode encoding + wxFONTENCODING_UTF16LE, // UTF-16 Little Endian Unicode encoding + wxFONTENCODING_UTF32, // UTF-32 Unicode encoding + wxFONTENCODING_UTF32BE, // UTF-32 Big Endian Unicode encoding + wxFONTENCODING_UTF32LE, // UTF-32 Little Endian Unicode encoding // Far Eastern encodings // Chinese diff --git a/include/wx/strconv.h b/include/wx/strconv.h index aea07bb35c..f129770ec1 100644 --- a/include/wx/strconv.h +++ b/include/wx/strconv.h @@ -91,6 +91,50 @@ public: WXDLLIMPEXP_DATA_BASE(extern wxMBConvUTF8) wxConvUTF8; +// ---------------------------------------------------------------------------- +// wxMBConvUTF16LE (for conversion using UTF16 Little Endian encoding) +// ---------------------------------------------------------------------------- + +class WXDLLIMPEXP_BASE wxMBConvUTF16LE : public wxMBConv +{ +public: + virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const; + virtual size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const; +}; + +// ---------------------------------------------------------------------------- +// wxMBConvUTF16BE (for conversion using UTF16 Big Endian encoding) +// ---------------------------------------------------------------------------- + 
+class WXDLLIMPEXP_BASE wxMBConvUTF16BE : public wxMBConv +{ +public: + virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const; + virtual size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const; +}; + +// ---------------------------------------------------------------------------- +// wxMBConvUTF32LE (for conversion using UTF32 Little Endian encoding) +// ---------------------------------------------------------------------------- + +class WXDLLIMPEXP_BASE wxMBConvUTF32LE : public wxMBConv +{ +public: + virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const; + virtual size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const; +}; + +// ---------------------------------------------------------------------------- +// wxMBConvUTF32BE (for conversion using UTF32 Big Endian encoding) +// ---------------------------------------------------------------------------- + +class WXDLLIMPEXP_BASE wxMBConvUTF32BE : public wxMBConv +{ +public: + virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const; + virtual size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const; +}; + // ---------------------------------------------------------------------------- // wxCSConv (for conversion based on loadable char sets) // ---------------------------------------------------------------------------- diff --git a/src/common/fmapbase.cpp b/src/common/fmapbase.cpp index b49754d7f9..e9b303613c 100644 --- a/src/common/fmapbase.cpp +++ b/src/common/fmapbase.cpp @@ -87,6 +87,12 @@ static wxFontEncoding gs_encodings[] = wxFONTENCODING_CP437, wxFONTENCODING_UTF7, wxFONTENCODING_UTF8, + wxFONTENCODING_UTF16, + wxFONTENCODING_UTF16BE, + wxFONTENCODING_UTF16LE, + wxFONTENCODING_UTF32, + wxFONTENCODING_UTF32BE, + wxFONTENCODING_UTF32LE, wxFONTENCODING_EUC_JP, }; @@ -124,6 +130,12 @@ static const wxChar* gs_encodingDescs[] = wxTRANSLATE( "Windows/DOS OEM (CP 437)" ), wxTRANSLATE( "Unicode 7 bit (UTF-7)" ), wxTRANSLATE( "Unicode 8 bit (UTF-8)" ), + wxTRANSLATE( 
"Unicode 16 bit (UTF-16)" ), + wxTRANSLATE( "Unicode 16 bit Big Endian (UTF-16BE)" ), + wxTRANSLATE( "Unicode 16 bit Little Endian (UTF-16LE)" ), + wxTRANSLATE( "Unicode 32 bit (UTF-32)" ), + wxTRANSLATE( "Unicode 32 bit Big Endian (UTF-32BE)" ), + wxTRANSLATE( "Unicode 32 bit Little Endian (UTF-32LE)" ), wxTRANSLATE( "Extended Unix Codepage for Japanese (EUC-JP)" ), }; @@ -161,6 +173,12 @@ static const wxChar* gs_encodingNames[] = wxT( "windows-437" ), wxT( "utf-7" ), wxT( "utf-8" ), + wxT( "utf-16" ), + wxT( "utf-16be" ), + wxT( "utf-16le" ), + wxT( "utf-32" ), + wxT( "utf-32be" ), + wxT( "utf-32le" ), wxT( "euc-jp" ), }; @@ -455,6 +473,30 @@ wxFontMapperBase::NonInteractiveCharsetToEncoding(const wxString& charset) { encoding = wxFONTENCODING_UTF8; } + else if ( cs == wxT("UTF-16") ) + { + encoding = wxFONTENCODING_UTF16; + } + else if ( cs == wxT("UTF-16BE") ) + { + encoding = wxFONTENCODING_UTF16BE; + } + else if ( cs == wxT("UTF-16LE") ) + { + encoding = wxFONTENCODING_UTF16LE; + } + else if ( cs == wxT("UTF-32") || cs == wxT("UCS-4") ) + { + encoding = wxFONTENCODING_UTF32; + } + else if ( cs == wxT("UTF-32BE") || cs == wxT("UCS-4BE") ) + { + encoding = wxFONTENCODING_UTF32BE; + } + else if ( cs == wxT("UTF-32LE") || cs == wxT("UCS-4LE") ) + { + encoding = wxFONTENCODING_UTF32LE; + } else if ( cs == wxT("GB2312") ) { encoding = wxFONTENCODING_GB2312; diff --git a/src/common/strconv.cpp b/src/common/strconv.cpp index cdbaabf73e..113c02acfd 100644 --- a/src/common/strconv.cpp +++ b/src/common/strconv.cpp @@ -143,16 +143,15 @@ IMPLEMENT_DYNAMIC_CLASS(wxStrConvModule, wxModule) // ============================================================================ // ---------------------------------------------------------------------------- -// UTF-16 en/decoding +// UTF-16 en/decoding to/from UCS-4 // ---------------------------------------------------------------------------- -#ifdef WC_UTF16 -static size_t encode_utf16(wxUint32 input, wchar_t *output) +static 
size_t encode_utf16(wxUint32 input, wxUint16 *output) { if (input<=0xffff) { - if (output) *output++ = (wchar_t) input; + if (output) *output++ = (wxUint16) input; return 1; } else if (input>=0x110000) @@ -163,14 +162,14 @@ static size_t encode_utf16(wxUint32 input, wchar_t *output) { if (output) { - *output++ = (wchar_t) ((input >> 10)+0xd7c0); - *output++ = (wchar_t) ((input&0x3ff)+0xdc00); + *output++ = (wxUint16) ((input >> 10)+0xd7c0); + *output++ = (wxUint16) ((input&0x3ff)+0xdc00); } return 2; } } -static size_t decode_utf16(const wchar_t* input, wxUint32& output) +static size_t decode_utf16(const wxUint16* input, wxUint32& output) { if ((*input<0xd800) || (*input>0xdfff)) { @@ -189,7 +188,6 @@ static size_t decode_utf16(const wchar_t* input, wxUint32& output) } } -#endif // WC_UTF16 // ---------------------------------------------------------------------------- // wxMBConv @@ -266,8 +264,8 @@ const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const size_t nLen = WC2MB(NULL, pwz, 0); if ( nLen != (size_t)-1 ) { - wxCharBuffer buf(nLen); - WC2MB(buf.data(), pwz, nLen + 1); + wxCharBuffer buf(nLen+3); // space for a wxUint32 trailing zero + WC2MB(buf.data(), pwz, nLen + 4); return buf; } @@ -422,6 +420,463 @@ size_t wxMBConvUTF8::WC2MB(char *buf, const wchar_t *psz, size_t n) const return len; } + + + +// ---------------------------------------------------------------------------- +// UTF-16 +// ---------------------------------------------------------------------------- + +#ifdef WORDS_BIGENDIAN +#define wxMBConvUTF16straight wxMBConvUTF16BE +#define wxMBConvUTF16swap wxMBConvUTF16LE +#else +#define wxMBConvUTF16swap wxMBConvUTF16BE +#define wxMBConvUTF16straight wxMBConvUTF16LE +#endif + + +WXDLLIMPEXP_DATA_BASE(wxMBConvUTF16LE) wxConvUTF16LE; +WXDLLIMPEXP_DATA_BASE(wxMBConvUTF16BE) wxConvUTF16BE; + + + + + +#ifdef WC_UTF16 + + +// copy 16bit MB to 16bit String +size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const +{ + 
size_t len=0; + + while (*(wxUint16*)psz && (!buf || len < n)) + { + if (buf) + *buf++ = *(wxUint16*)psz; + len++; + + psz += sizeof(wxUint16); + } + if (buf && len 1) + *((wxUint16*)buf)++ = cc[1]; + } + + len += pa*sizeof(wxUint16); + psz++; + } + if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0; + + return len; +} + + +// swap 16bit MB to 32bit String +size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const +{ + size_t len=0; + + while (*(wxUint16*)psz && (!buf || len < n)) + { + wxUint32 cc; + char tmp[4]; + tmp[0]=psz[1]; tmp[1]=psz[0]; + tmp[2]=psz[3]; tmp[3]=psz[2]; + + size_t pa=decode_utf16((wxUint16*)tmp, cc); + if (pa == (size_t)-1) + return pa; + + if (buf) + *buf++ = cc; + + len++; + psz += pa * sizeof(wxUint16); + } + if (buf && len 1) + { + *buf++ = ((char*)cc)[3]; + *buf++ = ((char*)cc)[2]; + } + } + + len += pa*sizeof(wxUint16); + psz++; + } + if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0; + + return len; +} + +#endif // WC_UTF16 + + +// ---------------------------------------------------------------------------- +// UTF-32 +// ---------------------------------------------------------------------------- + +#ifdef WORDS_BIGENDIAN +#define wxMBConvUTF32straight wxMBConvUTF32BE +#define wxMBConvUTF32swap wxMBConvUTF32LE +#else +#define wxMBConvUTF32swap wxMBConvUTF32BE +#define wxMBConvUTF32straight wxMBConvUTF32LE +#endif + + +WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE; +WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE; + + +#ifdef WC_UTF16 + +// copy 32bit MB to 16bit String +size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const +{ + size_t len=0; + + while (*(wxUint32*)psz && (!buf || len < n)) + { + wxUint16 cc[2]; + + size_t pa=encode_utf16(*(wxUint32*)psz, cc); + if (pa == (size_t)-1) + return pa; + + if (buf) + { + *buf++ = cc[0]; + if (pa > 1) + *buf++ = cc[1]; + } + len += pa; + psz += sizeof(wxUint32); + } + if (buf && len 1) + *buf++ = cc[1]; + } + len += pa; + 
psz += sizeof(wxUint32); + } + if (buf && len