X-Git-Url: https://git.saurik.com/wxWidgets.git/blobdiff_plain/d6f2a8911e509fd9e61f881cc881a97f5aa05ae8..562e60a0f2ef165ab75c999eb8002d547c3585aa:/src/common/strconv.cpp diff --git a/src/common/strconv.cpp b/src/common/strconv.cpp index f2364b7fc3..3e354f466b 100644 --- a/src/common/strconv.cpp +++ b/src/common/strconv.cpp @@ -391,7 +391,11 @@ wxMBConv::cMB2WC(const char *inBuff, size_t inLen, size_t *outLen) const const size_t dstLen = ToWChar(NULL, 0, inBuff, inLen); if ( dstLen != wxCONV_FAILED ) { - wxWCharBuffer wbuf(dstLen - 1); + // notice that we allocate space for dstLen+1 wide characters here + // because we want the buffer to always be NUL-terminated, even if the + // input isn't (as otherwise the caller has no way to know its length) + wxWCharBuffer wbuf(dstLen); + wbuf.data()[dstLen - 1] = L'\0'; if ( ToWChar(wbuf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED ) { if ( outLen ) @@ -417,16 +421,18 @@ wxMBConv::cWC2MB(const wchar_t *inBuff, size_t inLen, size_t *outLen) const size_t dstLen = FromWChar(NULL, 0, inBuff, inLen); if ( dstLen != wxCONV_FAILED ) { - // special case of empty input: can't allocate 0 size buffer below as - // wxCharBuffer insists on NUL-terminating it - wxCharBuffer buf(dstLen ? dstLen - 1 : 1); + const size_t nulLen = GetMBNulLen(); + + // as above, ensure that the buffer is always NUL-terminated, even if + // the input is not + wxCharBuffer buf(dstLen + nulLen - 1); + memset(buf.data() + dstLen, 0, nulLen); if ( FromWChar(buf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED ) { if ( outLen ) { *outLen = dstLen; - const size_t nulLen = GetMBNulLen(); if ( dstLen >= nulLen && !NotAllNULs(buf.data() + dstLen - nulLen, nulLen) ) { @@ -703,7 +709,7 @@ size_t wxMBConvUTF7::WC2MB(char *buf, const wchar_t *psz, size_t n) const // UTF-8 // ---------------------------------------------------------------------------- -static wxUint32 utf8_max[]= +static const wxUint32 utf8_max[]= { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff }; // boundaries of the private use area we use to (temporarily) remap invalid @@ -712,7 +718,7 @@ const wxUint32 wxUnicodePUA = 0x100000; const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256; // this table gives the length of the UTF-8 encoding from its first character: -unsigned char tableUtf8Lengths[256] = { +const unsigned char tableUtf8Lengths[256] = { // single-byte sequences (ASCII): 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 00..0F 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 10..1F @@ -778,58 +784,73 @@ wxMBConvStrictUTF8::ToWChar(wchar_t *dst, size_t dstLen, return written; } - unsigned char c = *p; - unsigned len = tableUtf8Lengths[c]; - if ( !len ) + if ( out && !dstLen-- ) break; - if ( srcLen < len ) // the test works for wxNO_LEN too - break; + wxUint32 code; + unsigned char c = *p; - if ( srcLen != wxNO_LEN ) - srcLen -= len; + if ( c < 0x80 ) + { + if ( srcLen == 0 ) // the test works for wxNO_LEN too + break; - if ( out && !dstLen-- ) - break; + if ( srcLen != wxNO_LEN ) + srcLen--; + code = c; + } + else + { + unsigned len = tableUtf8Lengths[c]; + if ( !len ) + break; - // Char. number range | UTF-8 octet sequence - // (hexadecimal) | (binary) - // ----------------------+--------------------------------------------- - // 0000 0000 - 0000 007F | 0xxxxxxx - // 0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx - // 0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx - // 0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx - // - // Code point value is stored in bits marked with 'x', lowest-order bit - // of the value on the right side in the diagram above. - // (from RFC 3629) + if ( srcLen < len ) // the test works for wxNO_LEN too + break; - // mask to extract lead byte's value ('x' bits above), by sequence length: - static const unsigned char leadValueMask[] = { 0x7F, 0x1F, 0x0F, 0x07 }; + if ( srcLen != wxNO_LEN ) + srcLen -= len; - // mask and value of lead byte's most significant bits, by length: - static const unsigned char leadMarkerMask[] = { 0x80, 0xE0, 0xF0, 0xF8 }; - static const unsigned char leadMarkerVal[] = { 0x00, 0xC0, 0xE0, 0xF0 }; + // Char. number range | UTF-8 octet sequence + // (hexadecimal) | (binary) + // ----------------------+---------------------------------------- + // 0000 0000 - 0000 007F | 0xxxxxxx + // 0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx + // 0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx + // 0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx + // + // Code point value is stored in bits marked with 'x', + // lowest-order bit of the value on the right side in the diagram + // above. (from RFC 3629) - len--; // it's more convenient to work with 0-based length here + // mask to extract lead byte's value ('x' bits above), by sequence + // length: + static const unsigned char leadValueMask[] = { 0x7F, 0x1F, 0x0F, 0x07 }; - // extract the lead byte's value bits: - if ( (c & leadMarkerMask[len]) != leadMarkerVal[len] ) - break; + // mask and value of lead byte's most significant bits, by length: + static const unsigned char leadMarkerMask[] = { 0x80, 0xE0, 0xF0, 0xF8 }; + static const unsigned char leadMarkerVal[] = { 0x00, 0xC0, 0xE0, 0xF0 }; - wxUint32 code = c & leadValueMask[len]; + len--; // it's more convenient to work with 0-based length here - // all remaining bytes, if any, are handled in the same way regardless of - // sequence's length: - for ( ; len; --len ) - { - c = *++p; - if ( (c & 0xC0) != 0x80 ) - return wxCONV_FAILED; + // extract the lead byte's value bits: + if ( (c & leadMarkerMask[len]) != leadMarkerVal[len] ) + break; + + code = c & leadValueMask[len]; + + // all remaining bytes, if any, are handled in the same way + // regardless of sequence's length: + for ( ; len; --len ) + { + c = *++p; + if ( (c & 0xC0) != 0x80 ) + return wxCONV_FAILED; - code <<= 6; - code |= c & 0x3F; + code <<= 6; + code |= c & 0x3F; + } } #ifdef WC_UTF16 @@ -968,14 +989,15 @@ wxMBConvStrictUTF8::FromWChar(char *dst, size_t dstLen, return wxCONV_FAILED; } -size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const +size_t wxMBConvUTF8::ToWChar(wchar_t *buf, size_t n, + const char *psz, size_t srcLen) const { if ( m_options == MAP_INVALID_UTF8_NOT ) - return wxMBConvStrictUTF8::MB2WC(buf, psz, n); + return wxMBConvStrictUTF8::ToWChar(buf, n, psz, srcLen); size_t len = 0; - while (*psz && ((!buf) || (len < n))) + while ((srcLen == wxNO_LEN ? *psz : srcLen--) && ((!buf) || (len < n))) { const char *opsz = psz; bool invalid = false; @@ -1109,10 +1131,10 @@ size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const } } - if (buf && (len < n)) + if (srcLen == wxNO_LEN && buf && (len < n)) *buf = 0; - return len; + return len + 1; } static inline bool isoctal(wchar_t wch) @@ -1120,14 +1142,15 @@ static inline bool isoctal(wchar_t wch) return L'0' <= wch && wch <= L'7'; } -size_t wxMBConvUTF8::WC2MB(char *buf, const wchar_t *psz, size_t n) const +size_t wxMBConvUTF8::FromWChar(char *buf, size_t n, + const wchar_t *psz, size_t srcLen) const { if ( m_options == MAP_INVALID_UTF8_NOT ) - return wxMBConvStrictUTF8::WC2MB(buf, psz, n); + return wxMBConvStrictUTF8::FromWChar(buf, n, psz, srcLen); size_t len = 0; - while (*psz && ((!buf) || (len < n))) + while ((srcLen == wxNO_LEN ? *psz : srcLen--) && ((!buf) || (len < n))) { wxUint32 cc; @@ -1195,10 +1218,10 @@ size_t wxMBConvUTF8::WC2MB(char *buf, const wchar_t *psz, size_t n) const } } - if (buf && (len < n)) + if (srcLen == wxNO_LEN && buf && (len < n)) *buf = 0; - return len; + return len + 1; } // ============================================================================ @@ -2411,26 +2434,38 @@ public: return wxCONV_FAILED; } - // if we were really converting, check if we succeeded - if ( buf ) + // we did something, check if we really succeeded + if ( flags ) { - if ( flags ) + // check if the conversion failed, i.e. if any replacements + // were done + if ( usedDef ) + return wxCONV_FAILED; + } + else // we must resort to double tripping... + { + // first we need to ensure that we really have the MB data: this is + // not the case if we're called with NULL buffer, in which case we + // need to do the conversion yet again + wxCharBuffer bufDef; + if ( !buf ) { - // check if the conversion failed, i.e. if any replacements - // were done - if ( usedDef ) + bufDef = wxCharBuffer(len); + buf = bufDef.data(); + if ( !::WideCharToMultiByte(m_CodePage, flags, pwz, -1, + buf, len, NULL, NULL) ) return wxCONV_FAILED; } - else // we must resort to double tripping... + + if ( !n ) + n = wcslen(pwz); + wxWCharBuffer wcBuf(n); + if ( MB2WC(wcBuf.data(), buf, n + 1) == wxCONV_FAILED || + wcscmp(wcBuf, pwz) != 0 ) { - wxWCharBuffer wcBuf(n); - if ( MB2WC(wcBuf.data(), buf, n) == wxCONV_FAILED || - wcscmp(wcBuf, pwz) != 0 ) - { - // we didn't obtain the same thing we started from, hence - // the conversion was lossy and we consider that it failed - return wxCONV_FAILED; - } + // we didn't obtain the same thing we started from, hence + // the conversion was lossy and we consider that it failed + return wxCONV_FAILED; } } @@ -3163,8 +3198,14 @@ wxCharBuffer wxSafeConvertWX2MB(const wchar_t *ws) WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConvLibc, wxConvLibc, wxEMPTY_PARAMETER_VALUE); #endif -WX_DEFINE_GLOBAL_CONV(wxMBConvStrictUTF8, wxConvUTF8, wxEMPTY_PARAMETER_VALUE); -WX_DEFINE_GLOBAL_CONV(wxMBConvUTF7, wxConvUTF7, wxEMPTY_PARAMETER_VALUE); +// NB: we can't use wxEMPTY_PARAMETER_VALUE as final argument here because it's +// passed to WX_DEFINE_GLOBAL_CONV2 after a macro expansion and so still +// provokes an error message about "not enough macro parameters"; and we +// can't use "()" here as the name##Obj declaration would be parsed as a +// function declaration then, so use a semicolon and live with an extra +// empty statement (and hope that no compilers warns about this) +WX_DEFINE_GLOBAL_CONV(wxMBConvStrictUTF8, wxConvUTF8, ;); +WX_DEFINE_GLOBAL_CONV(wxMBConvUTF7, wxConvUTF7, ;); WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvLocal, (wxFONTENCODING_SYSTEM)); WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvISO8859_1, (wxFONTENCODING_ISO8859_1));