From 0286d08d1453506f9ff9a830d58b3b35817d0b14 Mon Sep 17 00:00:00 2001
From: Vadim Zeitlin
Date: Tue, 24 Jul 2007 15:01:10 +0000
Subject: [PATCH] add wxMBConvStrictUTF8 class implementing just UTF-8
 conversion, without support for PUA/octal mappings and use it for wxConvUTF8
 as it's simpler and more efficient (~20% faster)

git-svn-id: https://svn.wxwidgets.org/svn/wx/wxWidgets/trunk@47703 c3d73ce0-8a6f-49c7-b76d-6d57e0e08775
---
 include/wx/strconv.h   |  28 ++++-
 src/common/strconv.cpp | 289 ++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 311 insertions(+), 6 deletions(-)

diff --git a/include/wx/strconv.h b/include/wx/strconv.h
index 2cd26bd30d..1edf4f5082 100644
--- a/include/wx/strconv.h
+++ b/include/wx/strconv.h
@@ -257,11 +257,31 @@ public:
 // wxMBConvUTF8 (for conversion using UTF8 encoding)
 // ----------------------------------------------------------------------------
 
-class WXDLLIMPEXP_BASE wxMBConvUTF8 : public wxMBConv
+// this is the real UTF-8 conversion class, it has to be called "strict UTF-8"
+// for compatibility reasons: the wxMBConvUTF8 class below also supports lossy
+// conversions if it is created with non default options
+class WXDLLIMPEXP_BASE wxMBConvStrictUTF8 : public wxMBConv
+{
+public:
+    // compiler-generated default ctor and other methods are ok
+
+    virtual size_t ToWChar(wchar_t *dst, size_t dstLen,
+                           const char *src, size_t srcLen = wxNO_LEN) const;
+    virtual size_t FromWChar(char *dst, size_t dstLen,
+                             const wchar_t *src, size_t srcLen = wxNO_LEN) const;
+
+    virtual wxMBConv *Clone() const { return new wxMBConvStrictUTF8(); }
+
+#if wxUSE_UNICODE_UTF8
+    // NB: other mapping modes are not, strictly speaking, UTF-8, so we can't
+    //     take the shortcut in that case
+    virtual bool IsUTF8() const { return true; }
+#endif
+};
+
+class WXDLLIMPEXP_BASE wxMBConvUTF8 : public wxMBConvStrictUTF8
 {
 public:
-    // FIXME-UTF8: split this class into multiple classes, one strict and
-    //             other lossy (PUA, OCTAL mappings)
     enum
     {
         MAP_INVALID_UTF8_NOT = 0,
@@ -470,7 +490,7 @@ WX_DECLARE_GLOBAL_CONV(wxMBConv, wxConvLibc)
 WX_DECLARE_GLOBAL_CONV(wxCSConv, wxConvISO8859_1)
 #define wxConvISO8859_1 wxGet_wxConvISO8859_1()
 
-WX_DECLARE_GLOBAL_CONV(wxMBConvUTF8, wxConvUTF8)
+WX_DECLARE_GLOBAL_CONV(wxMBConvStrictUTF8, wxConvUTF8)
 #define wxConvUTF8 wxGet_wxConvUTF8()
 
 WX_DECLARE_GLOBAL_CONV(wxMBConvUTF7, wxConvUTF7)
diff --git a/src/common/strconv.cpp b/src/common/strconv.cpp
index 59ea721b71..4d672fedfb 100644
--- a/src/common/strconv.cpp
+++ b/src/common/strconv.cpp
@@ -714,8 +714,290 @@ static wxUint32 utf8_max[]=
 const wxUint32 wxUnicodePUA = 0x100000;
 const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
 
+// this table gives the length of the UTF-8 encoding from its first character:
+unsigned char tableUtf8Lengths[256] = {
+    // single-byte sequences (ASCII):
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 00..0F
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 10..1F
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 20..2F
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 30..3F
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 40..4F
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 50..5F
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 60..6F
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 70..7F
+
+    // these are invalid:
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 80..8F
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 90..9F
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // A0..AF
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // B0..BF
+    0, 0,                                            // C0,C1
+
+    // two-byte sequences:
+          2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // C2..CF
+    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // D0..DF
+
+    // three-byte sequences:
+    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,  // E0..EF
+
+    // four-byte sequences:
+    4, 4, 4, 4, 4,                                   // F0..F4
+
+    // these are invalid again (5- or 6-byte
+    // sequences and sequences for code points
+    // above U+10FFFF, as restricted by RFC 3629):
+                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0   // F5..FF
+};
+
+// Decode the UTF-8 text in src to wide characters: if srcLen is wxNO_LEN the
+// input is NUL-terminated and the terminator is converted too, otherwise
+// exactly srcLen bytes are used. If dstLen is 0 nothing is written and only
+// the required length is computed. Returns the number of wide characters
+// produced or wxCONV_FAILED if the input is invalid or dst is too small.
+size_t
+wxMBConvStrictUTF8::ToWChar(wchar_t *dst, size_t dstLen,
+                            const char *src, size_t srcLen) const
+{
+    wchar_t *out = dstLen ? dst : NULL;
+    size_t written = 0;
+
+    if ( srcLen == wxNO_LEN )
+        srcLen = strlen(src) + 1;
+
+    for ( const char *p = src; ; p++ )
+    {
+        if ( !(srcLen == wxNO_LEN ? *p : srcLen) )
+        {
+            // all done successfully, just add the trailing NULL if we are not
+            // using explicit length
+            if ( srcLen == wxNO_LEN )
+            {
+                if ( out )
+                {
+                    if ( !dstLen )
+                        break;
+
+                    *out = L'\0';
+                }
+
+                written++;
+            }
+
+            return written;
+        }
+
+        unsigned char c = *p;
+        unsigned len = tableUtf8Lengths[c];
+        if ( !len )
+            break;
+
+        if ( srcLen < len ) // the test works for wxNO_LEN too
+            break;
+
+        if ( srcLen != wxNO_LEN )
+            srcLen -= len;
+
+        if ( out && !dstLen-- )
+            break;
+
+
+        // Char. number range   |        UTF-8 octet sequence
+        //    (hexadecimal)     |              (binary)
+        //  ----------------------+---------------------------------------------
+        //  0000 0000 - 0000 007F | 0xxxxxxx
+        //  0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
+        //  0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
+        //  0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+        //
+        //  Code point value is stored in bits marked with 'x', lowest-order bit
+        //  of the value on the right side in the diagram above.
+        //                                                      (from RFC 3629)
+
+        // mask to extract lead byte's value ('x' bits above), by sequence length:
+        static const unsigned char leadValueMask[] = { 0x7F, 0x1F, 0x0F, 0x07 };
+
+        // mask and value of lead byte's most significant bits, by length:
+        static const unsigned char leadMarkerMask[] = { 0x80, 0xE0, 0xF0, 0xF8 };
+        static const unsigned char leadMarkerVal[] = { 0x00, 0xC0, 0xE0, 0xF0 };
+
+        len--; // it's more convenient to work with 0-based length here
+
+        // extract the lead byte's value bits:
+        if ( (c & leadMarkerMask[len]) != leadMarkerVal[len] )
+            break;
+
+        wxUint32 code = c & leadValueMask[len];
+
+        // all remaining bytes, if any, are handled in the same way regardless of
+        // sequence's length:
+        for ( ; len; --len )
+        {
+            c = *++p;
+            if ( (c & 0xC0) != 0x80 )
+                return wxCONV_FAILED;
+
+            code <<= 6;
+            code |= c & 0x3F;
+        }
+
+#ifdef WC_UTF16
+        // cast is ok because wchar_t == wxUint16 if WC_UTF16
+        if ( encode_utf16(code, (wxUint16 *)out) == 2 )
+        {
+            if ( out )
+                out++;
+            written++;
+        }
+#else // !WC_UTF16
+        if ( out )
+            *out = code;
+#endif // WC_UTF16/!WC_UTF16
+
+        if ( out )
+            out++;
+
+        written++;
+    }
+
+    return wxCONV_FAILED;
+}
+
+// Encode the wide characters in src as UTF-8: if srcLen is wxNO_LEN the input
+// is NUL-terminated and a trailing NUL is written too, otherwise exactly
+// srcLen wide characters are converted and no terminator is added. If dstLen
+// is 0 nothing is written and only the required length is computed. Returns
+// the number of bytes produced or wxCONV_FAILED on error.
+size_t
+wxMBConvStrictUTF8::FromWChar(char *dst, size_t dstLen,
+                              const wchar_t *src, size_t srcLen) const
+{
+    char *out = dstLen ? dst : NULL;
+    size_t written = 0;
+
+    // remember how the input is delimited before srcLen starts being counted
+    // down: an explicit length reaching 0 must not be mistaken for wxNO_LEN
+    const bool isNulTerminated = srcLen == wxNO_LEN;
+
+    for ( const wchar_t *wp = src; ; wp++ )
+    {
+        if ( !(isNulTerminated ? *wp : srcLen) )
+        {
+            // all done successfully, just add the trailing NULL if we are not
+            // using explicit length
+            if ( isNulTerminated )
+            {
+                if ( out )
+                {
+                    if ( !dstLen )
+                        break;
+
+                    *out = '\0';
+                }
+
+                written++;
+            }
+
+            return written;
+        }
+
+        if ( !isNulTerminated )
+            srcLen--;
+
+        wxUint32 code;
+#ifdef WC_UTF16
+        // cast is ok for WC_UTF16
+        if ( decode_utf16((const wxUint16 *)wp, code) == 2 )
+        {
+            // skip the next char too as we decoded a surrogate
+            wp++;
+
+            // and account for it when counting down an explicit length
+            if ( !isNulTerminated && srcLen )
+                srcLen--;
+        }
+#else // wchar_t is UTF-32
+        code = *wp & 0x7fffffff;
+#endif
+
+        unsigned len;
+        if ( code <= 0x7F )
+        {
+            len = 1;
+            if ( out )
+            {
+                if ( dstLen < len )
+                    break;
+
+                out[0] = (char)code;
+            }
+        }
+        else if ( code <= 0x07FF )
+        {
+            len = 2;
+            if ( out )
+            {
+                if ( dstLen < len )
+                    break;
+
+                // NB: this line takes 6 least significant bits, encodes them as
+                // 10xxxxxx and discards them so that the next byte can be encoded:
+                out[1] = 0x80 | (code & 0x3F);  code >>= 6;
+                out[0] = 0xC0 | code;
+            }
+        }
+        else if ( code <= 0xFFFF )
+        {
+            // NB: U+FFFF must be included here, encoding it using 4 bytes
+            //     would produce an overlong (and hence invalid) sequence
+            len = 3;
+            if ( out )
+            {
+                if ( dstLen < len )
+                    break;
+
+                out[2] = 0x80 | (code & 0x3F);  code >>= 6;
+                out[1] = 0x80 | (code & 0x3F);  code >>= 6;
+                out[0] = 0xE0 | code;
+            }
+        }
+        else if ( code <= 0x10FFFF )
+        {
+            len = 4;
+            if ( out )
+            {
+                if ( dstLen < len )
+                    break;
+
+                out[3] = 0x80 | (code & 0x3F);  code >>= 6;
+                out[2] = 0x80 | (code & 0x3F);  code >>= 6;
+                out[1] = 0x80 | (code & 0x3F);  code >>= 6;
+                out[0] = 0xF0 | code;
+            }
+        }
+        else
+        {
+            wxFAIL_MSG( _T("trying to encode undefined Unicode character") );
+            break;
+        }
+
+        if ( out )
+        {
+            out += len;
+            dstLen -= len;
+        }
+
+        written += len;
+    }
+
+    // we only get here if an error occurs during encoding
+    return wxCONV_FAILED;
+}
+
 size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 {
+    if ( m_options == MAP_INVALID_UTF8_NOT )
+        return wxMBConvStrictUTF8::MB2WC(buf, psz, n);
+
     size_t len = 0;
 
     while (*psz && ((!buf) || (len < n)))
@@ -785,7 +1067,7 @@ size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const
                 else
                 {
 #ifdef WC_UTF16
-                    // cast is ok because wchar_t == wxUuint16 if WC_UTF16
+                    // cast is ok because wchar_t == wxUint16 if WC_UTF16
                     size_t pa = encode_utf16(res, (wxUint16 *)buf);
                     if (pa == wxCONV_FAILED)
                     {
@@ -865,6 +1147,9 @@ static inline bool isoctal(wchar_t wch)
 
 size_t wxMBConvUTF8::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 {
+    if ( m_options == MAP_INVALID_UTF8_NOT )
+        return wxMBConvStrictUTF8::WC2MB(buf, psz, n);
+
     size_t len = 0;
 
     while (*psz && ((!buf) || (len < n)))
@@ -2903,7 +3188,7 @@ wxCharBuffer wxSafeConvertWX2MB(const wchar_t *ws)
     WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConvLibc, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
 #endif
 
-WX_DEFINE_GLOBAL_CONV(wxMBConvUTF8, wxConvUTF8, wxEMPTY_PARAMETER_VALUE);
+WX_DEFINE_GLOBAL_CONV(wxMBConvStrictUTF8, wxConvUTF8, wxEMPTY_PARAMETER_VALUE);
 WX_DEFINE_GLOBAL_CONV(wxMBConvUTF7, wxConvUTF7, wxEMPTY_PARAMETER_VALUE);
 
 WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvLocal, (wxFONTENCODING_SYSTEM));
-- 
2.45.2