X-Git-Url: https://git.saurik.com/wxWidgets.git/blobdiff_plain/6ac84a787253ecedb262c739ec04e753e11c3697..48271822ef3d56c8f91af882b68fd1b674a8e8e6:/src/common/strconv.cpp

diff --git a/src/common/strconv.cpp b/src/common/strconv.cpp
index 59ea721b71..f2364b7fc3 100644
--- a/src/common/strconv.cpp
+++ b/src/common/strconv.cpp
@@ -354,14 +354,14 @@ const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
     if ( psz )
     {
         // calculate the length of the buffer needed first
-        const size_t nLen = MB2WC(NULL, psz, 0);
+        const size_t nLen = ToWChar(NULL, 0, psz);
         if ( nLen != wxCONV_FAILED )
         {
             // now do the actual conversion
-            wxWCharBuffer buf(nLen /* +1 added implicitly */);
+            wxWCharBuffer buf(nLen - 1 /* +1 added implicitly */);
 
-            // +1 for the trailing NULL
-            if ( MB2WC(buf.data(), psz, nLen + 1) != wxCONV_FAILED )
+            // nLen already counts the trailing NUL
+            if ( ToWChar(buf.data(), nLen, psz) != wxCONV_FAILED )
                 return buf;
         }
     }
@@ -373,14 +373,11 @@ const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
 {
     if ( pwz )
     {
-        const size_t nLen = WC2MB(NULL, pwz, 0);
+        const size_t nLen = FromWChar(NULL, 0, pwz);
         if ( nLen != wxCONV_FAILED )
         {
-            // extra space for trailing NUL(s)
-            static const size_t extraLen = GetMaxMBNulLen();
-
-            wxCharBuffer buf(nLen + extraLen - 1);
-            if ( WC2MB(buf.data(), pwz, nLen + extraLen) != wxCONV_FAILED )
+            wxCharBuffer buf(nLen - 1);
+            if ( FromWChar(buf.data(), nLen, pwz) != wxCONV_FAILED )
                 return buf;
         }
     }
@@ -714,8 +711,286 @@ static wxUint32 utf8_max[]=
 const wxUint32 wxUnicodePUA = 0x100000;
 const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
 
+// this table gives the length of the UTF-8 encoding from its first character:
+unsigned char tableUtf8Lengths[256] = {
+    // single-byte sequences (ASCII):
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 00..0F
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 10..1F
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 20..2F
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 30..3F
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 40..4F
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 50..5F
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 60..6F
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 70..7F
+
+    // these are invalid:
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 80..8F
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 90..9F
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // A0..AF
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // B0..BF
+    0, 0,                                            // C0,C1
+
+    // two-byte sequences:
+          2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // C2..CF
+    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // D0..DF
+
+    // three-byte sequences:
+    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,  // E0..EF
+
+    // four-byte sequences:
+    4, 4, 4, 4, 4,  // F0..F4
+
+    // these are invalid again (5- or 6-byte
+    // sequences and sequences for code points
+    // above U+10FFFF, as restricted by RFC 3629):
+                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0   // F5..FF
+};
+
+// Strict UTF-8 -> wchar_t decoder.
+//
+// Decodes srcLen bytes starting at src (or, if srcLen is wxNO_LEN, the
+// NUL-terminated string including its terminator) and stores the result in dst
+// when dstLen is non-zero; with dstLen == 0 nothing is stored and only the
+// required length is computed. Returns the number of wchar_t units produced
+// or wxCONV_FAILED if the input is malformed or dst is too small.
+size_t
+wxMBConvStrictUTF8::ToWChar(wchar_t *dst, size_t dstLen,
+                            const char *src, size_t srcLen) const
+{
+    wchar_t *out = dstLen ? dst : NULL;
+    size_t written = 0;
+
+    if ( srcLen == wxNO_LEN )
+        srcLen = strlen(src) + 1;
+
+    for ( const char *p = src; ; p++ )
+    {
+        if ( !(srcLen == wxNO_LEN ? *p : srcLen) )
+        {
+            // all done successfully, just add the trailing NULL if we are not
+            // using explicit length
+            if ( srcLen == wxNO_LEN )
+            {
+                if ( out )
+                {
+                    if ( !dstLen )
+                        break;
+
+                    *out = L'\0';
+                }
+
+                written++;
+            }
+
+            return written;
+        }
+
+        unsigned char c = *p;
+        unsigned len = tableUtf8Lengths[c];
+        if ( !len )
+            break;
+
+        if ( srcLen < len ) // the test works for wxNO_LEN too
+            break;
+
+        if ( srcLen != wxNO_LEN )
+            srcLen -= len;
+
+        if ( out && !dstLen-- )
+            break;
+
+
+        //    Char. number range  |        UTF-8 octet sequence
+        //       (hexadecimal)    |              (binary)
+        //  ----------------------+---------------------------------------------
+        //  0000 0000 - 0000 007F | 0xxxxxxx
+        //  0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
+        //  0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
+        //  0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+        //
+        //  Code point value is stored in bits marked with 'x', lowest-order bit
+        //  of the value on the right side in the diagram above.
+        //                                                      (from RFC 3629)
+
+        // mask to extract lead byte's value ('x' bits above), by sequence length:
+        static const unsigned char leadValueMask[] = { 0x7F, 0x1F, 0x0F, 0x07 };
+
+        // mask and value of lead byte's most significant bits, by length:
+        static const unsigned char leadMarkerMask[] = { 0x80, 0xE0, 0xF0, 0xF8 };
+        static const unsigned char leadMarkerVal[] = { 0x00, 0xC0, 0xE0, 0xF0 };
+
+        len--; // it's more convenient to work with 0-based length here
+
+        // extract the lead byte's value bits:
+        if ( (c & leadMarkerMask[len]) != leadMarkerVal[len] )
+            break;
+
+        wxUint32 code = c & leadValueMask[len];
+
+        // all remaining bytes, if any, are handled in the same way regardless of
+        // sequence's length:
+        for ( ; len; --len )
+        {
+            c = *++p;
+            if ( (c & 0xC0) != 0x80 )
+                return wxCONV_FAILED;
+
+            code <<= 6;
+            code |= c & 0x3F;
+        }
+
+#ifdef WC_UTF16
+        // cast is ok because wchar_t == wxUint16 if WC_UTF16
+        // NOTE(review): a surrogate pair stores two units but dstLen was only
+        // decremented once above -- confirm dst cannot be overrun here
+        if ( encode_utf16(code, (wxUint16 *)out) == 2 )
+        {
+            if ( out )
+                out++;
+            written++;
+        }
+#else // !WC_UTF16
+        if ( out )
+            *out = code;
+#endif // WC_UTF16/!WC_UTF16
+
+        if ( out )
+            out++;
+
+        written++;
+    }
+
+    return wxCONV_FAILED;
+}
+
+// wchar_t -> strict UTF-8 encoder.
+//
+// Encodes srcLen wide characters starting at src (or, if srcLen is wxNO_LEN,
+// everything up to and including the terminating NUL) into dst when dstLen is
+// non-zero; with dstLen == 0 only the required size is computed. Returns the
+// number of bytes produced or wxCONV_FAILED if a code point above U+10FFFF is
+// met or dst is too small.
+size_t
+wxMBConvStrictUTF8::FromWChar(char *dst, size_t dstLen,
+                              const wchar_t *src, size_t srcLen) const
+{
+    char *out = dstLen ? dst : NULL;
+    size_t written = 0;
+
+    for ( const wchar_t *wp = src; ; wp++ )
+    {
+        if ( !(srcLen == wxNO_LEN ? *wp : srcLen--) )
+        {
+            // all done successfully, just add the trailing NULL if we are not
+            // using explicit length
+            if ( srcLen == wxNO_LEN )
+            {
+                if ( out )
+                {
+                    if ( !dstLen )
+                        break;
+
+                    *out = '\0';
+                }
+
+                written++;
+            }
+
+            return written;
+        }
+
+
+        wxUint32 code;
+#ifdef WC_UTF16
+        // cast is ok for WC_UTF16
+        if ( decode_utf16((const wxUint16 *)wp, code) == 2 )
+        {
+            // skip the next char too as we decoded a surrogate
+            // NOTE(review): srcLen is not decremented for this extra unit --
+            // confirm the explicit-length case cannot read past the input
+            wp++;
+        }
+#else // wchar_t is UTF-32
+        code = *wp & 0x7fffffff;
+#endif
+
+        unsigned len;
+        if ( code <= 0x7F )
+        {
+            len = 1;
+            if ( out )
+            {
+                if ( dstLen < len )
+                    break;
+
+                out[0] = (char)code;
+            }
+        }
+        else if ( code <= 0x07FF )
+        {
+            len = 2;
+            if ( out )
+            {
+                if ( dstLen < len )
+                    break;
+
+                // NB: this line takes 6 least significant bits, encodes them as
+                // 10xxxxxx and discards them so that the next byte can be encoded:
+                out[1] = 0x80 | (code & 0x3F);  code >>= 6;
+                out[0] = 0xC0 | code;
+            }
+        }
+        else if ( code <= 0xFFFF ) // 3 bytes up to and including U+FFFF
+        {
+            len = 3;
+            if ( out )
+            {
+                if ( dstLen < len )
+                    break;
+
+                out[2] = 0x80 | (code & 0x3F);  code >>= 6;
+                out[1] = 0x80 | (code & 0x3F);  code >>= 6;
+                out[0] = 0xE0 | code;
+            }
+        }
+        else if ( code <= 0x10FFFF )
+        {
+            len = 4;
+            if ( out )
+            {
+                if ( dstLen < len )
+                    break;
+
+                out[3] = 0x80 | (code & 0x3F);  code >>= 6;
+                out[2] = 0x80 | (code & 0x3F);  code >>= 6;
+                out[1] = 0x80 | (code & 0x3F);  code >>= 6;
+                out[0] = 0xF0 | code;
+            }
+        }
+        else
+        {
+            wxFAIL_MSG( _T("trying to encode undefined Unicode character") );
+            break;
+        }
+
+        if ( out )
+        {
+            out += len;
+            dstLen -= len;
+        }
+
+        written += len;
+    }
+
+    // we only get here if an error occurs during decoding
+    return wxCONV_FAILED;
+}
+
 size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 {
+    if ( m_options == MAP_INVALID_UTF8_NOT )
+        return wxMBConvStrictUTF8::MB2WC(buf, psz, n);
+
     size_t len = 0;
 
     while (*psz && ((!buf) || (len < n)))
@@ -785,7 +1060,7 @@ size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const
                 else
                 {
 #ifdef WC_UTF16
-                    // cast is ok because wchar_t == wxUuint16 if WC_UTF16
+                    // cast is ok because wchar_t == wxUint16 if WC_UTF16
                     size_t pa = encode_utf16(res, (wxUint16 *)buf);
                     if (pa == wxCONV_FAILED)
                     {
@@ -865,6 +1140,9 @@ static inline bool isoctal(wchar_t wch)
 
 size_t wxMBConvUTF8::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 {
+    if ( m_options == MAP_INVALID_UTF8_NOT )
+        return wxMBConvStrictUTF8::WC2MB(buf, psz, n);
+
     size_t len = 0;
 
     while (*psz && ((!buf) || (len < n)))
@@ -2476,7 +2754,7 @@ void wxCSConv::SetName(const char *charset)
 {
     if (charset)
     {
-        m_name = strdup(charset);
+        m_name = wxStrdup(charset);
         m_deferred = true;
     }
 }
@@ -2903,7 +3181,7 @@ wxCharBuffer wxSafeConvertWX2MB(const wchar_t *ws)
     WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConvLibc, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
 #endif
 
-WX_DEFINE_GLOBAL_CONV(wxMBConvUTF8, wxConvUTF8, wxEMPTY_PARAMETER_VALUE);
+WX_DEFINE_GLOBAL_CONV(wxMBConvStrictUTF8, wxConvUTF8, wxEMPTY_PARAMETER_VALUE);
 WX_DEFINE_GLOBAL_CONV(wxMBConvUTF7, wxConvUTF7, wxEMPTY_PARAMETER_VALUE);
 WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvLocal, (wxFONTENCODING_SYSTEM));
 