X-Git-Url: https://git.saurik.com/wxWidgets.git/blobdiff_plain/467a2982d277060eced5160b4dc88daff442eb87..d79c9c67919f4428a8be5ffd6f5eba7dba35daa2:/src/common/strconv.cpp diff --git a/src/common/strconv.cpp b/src/common/strconv.cpp index ba9af8782c..54ac2c8489 100644 --- a/src/common/strconv.cpp +++ b/src/common/strconv.cpp @@ -53,7 +53,7 @@ #include "wx/fontmap.h" #ifdef __DARWIN__ -#include "wx/mac/corefoundation/private/strconv_cf.h" +#include "wx/osx/core/private/strconv_cf.h" #endif //def __DARWIN__ @@ -212,11 +212,11 @@ wxMBConv::ToWChar(wchar_t *dst, size_t dstLen, if ( lenChunk == wxCONV_FAILED ) return wxCONV_FAILED; - lenChunk++; // for the L'\0' at the end of this chunk - dstWritten += lenChunk; + if ( !srcEnd ) + dstWritten++; - if ( lenChunk == 1 ) + if ( !lenChunk ) { // nothing left in the input string, conversion succeeded break; @@ -227,10 +227,13 @@ wxMBConv::ToWChar(wchar_t *dst, size_t dstLen, if ( dstWritten > dstLen ) return wxCONV_FAILED; - if ( MB2WC(dst, src, lenChunk) == wxCONV_FAILED ) + // +1 is for trailing NUL + if ( MB2WC(dst, src, lenChunk + 1) == wxCONV_FAILED ) return wxCONV_FAILED; dst += lenChunk; + if ( !srcEnd ) + dst++; } if ( !srcEnd ) @@ -269,13 +272,15 @@ wxMBConv::FromWChar(char *dst, size_t dstLen, // the number of chars [which would be] written to dst [if it were not NULL] size_t dstWritten = 0; + // if we don't know its length we have no choice but to assume that it is + // NUL-terminated (notice that it can still be NUL-terminated even if + // explicit length is given but it doesn't change our return value) + const bool isNulTerminated = srcLen == wxNO_LEN; + // make a copy of the input string unless it is already properly // NUL-terminated - // - // if we don't know its length we have no choice but to assume that it is, - // indeed, properly terminated wxWCharBuffer bufTmp; - if ( srcLen == wxNO_LEN ) + if ( isNulTerminated ) { srcLen = wxWcslen(src) + 1; } @@ -298,18 +303,21 @@ wxMBConv::FromWChar(char *dst, size_t dstLen, if ( lenChunk == wxCONV_FAILED ) return wxCONV_FAILED; - lenChunk += lenNul; dstWritten += lenChunk; + if ( isNulTerminated ) + dstWritten += lenNul; if ( dst ) { if ( dstWritten > dstLen ) return wxCONV_FAILED; - if ( WC2MB(dst, src, lenChunk) == wxCONV_FAILED ) + if ( WC2MB(dst, src, lenChunk + lenNul) == wxCONV_FAILED ) return wxCONV_FAILED; dst += lenChunk; + if ( isNulTerminated ) + dst += lenNul; } } @@ -391,13 +399,19 @@ wxMBConv::cMB2WC(const char *inBuff, size_t inLen, size_t *outLen) const // because we want the buffer to always be NUL-terminated, even if the // input isn't (as otherwise the caller has no way to know its length) wxWCharBuffer wbuf(dstLen); - wbuf.data()[dstLen - 1] = L'\0'; + wbuf.data()[dstLen] = L'\0'; if ( ToWChar(wbuf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED ) { if ( outLen ) { *outLen = dstLen; - if ( wbuf[dstLen - 1] == L'\0' ) + + // we also need to handle NUL-terminated input strings + // specially: for them the output is the length of the string + // excluding the trailing NUL, however if we're asked to + // convert a specific number of characters we return the length + // of the resulting output even if it's NUL-terminated + if ( inLen == wxNO_LEN ) (*outLen)--; } @@ -429,11 +443,10 @@ wxMBConv::cWC2MB(const wchar_t *inBuff, size_t inLen, size_t *outLen) const { *outLen = dstLen; - if ( dstLen >= nulLen && - !NotAllNULs(buf.data() + dstLen - nulLen, nulLen) ) + if ( inLen == wxNO_LEN ) { - // in this case the output is NUL-terminated and we're not - // supposed to count NUL + // in this case both input and output are NUL-terminated + // and we're not supposed to count NUL *outLen -= nulLen; } } @@ -484,6 +497,8 @@ wxConvBrokenFileNames::wxConvBrokenFileNames(const wxString& charset) // ---------------------------------------------------------------------------- // Implementation (C) 2004 Fredrik Roubert +// +// Changes to work in streaming mode (C) 2008 Vadim Zeitlin // // BASE64 decoding table @@ -521,72 +536,134 @@ static const unsigned char utf7unb64[] = 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }; -size_t wxMBConvUTF7::MB2WC(wchar_t *buf, const char *psz, size_t n) const +size_t wxMBConvUTF7::ToWChar(wchar_t *dst, size_t dstLen, + const char *src, size_t srcLen) const { + DecoderState stateOrig, + *statePtr; + if ( srcLen == wxNO_LEN ) + { + // convert the entire string, up to and including the trailing NUL + srcLen = strlen(src) + 1; + + // when working on the entire strings we don't update nor use the shift + // state from the previous call + statePtr = &stateOrig; + } + else // when working with partial strings we do use the shift state + { + statePtr = wx_const_cast(DecoderState *, &m_stateDecoder); + + // also save the old state to be able to rollback to it on error + stateOrig = m_stateDecoder; + } + + // but to simplify the code below we use this variable in both cases + DecoderState& state = *statePtr; + + + // number of characters [which would have been] written to dst [if it were + // not NULL] size_t len = 0; - while ( *psz && (!buf || (len < n)) ) + const char * const srcEnd = src + srcLen; + + while ( (src < srcEnd) && (!dst || (len < dstLen)) ) { - unsigned char cc = *psz++; - if (cc != '+') - { - // plain ASCII char - if (buf) - *buf++ = cc; - len++; - } - else if (*psz == '-') - { - // encoded plus sign - if (buf) - *buf++ = cc; - len++; - psz++; - } - else // start of BASE64 encoded string + const unsigned char cc = *src++; + + if ( state.IsShifted() ) { - bool lsb, ok; - unsigned int d, l; - for ( ok = lsb = false, d = 0, l = 0; - (cc = utf7unb64[(unsigned char)*psz]) != 0xff; - psz++ ) + const unsigned char dc = utf7unb64[cc]; + if ( dc == 0xff ) + { + // end of encoded part + state.ToDirect(); + + // re-parse this character normally below unless it's '-' which + // is consumed by the decoder + if ( cc == '-' ) + continue; + } + else // valid encoded character { - d <<= 6; - d += cc; - for (l += 6; l >= 8; lsb = !lsb) + // mini base64 decoder: each character is 6 bits + state.bit += 6; + state.accum <<= 6; + state.accum += dc; + + if ( state.bit >= 8 ) { - unsigned char c = (unsigned char)((d >> (l -= 8)) % 256); - if (lsb) + // got the full byte, consume it + state.bit -= 8; + unsigned char b = (state.accum >> state.bit) & 0x00ff; + + if ( state.isLSB ) { - if (buf) - *buf++ |= c; - len ++; - ok = true; + // we've got the full word, output it + if ( dst ) + *dst++ = (state.msb << 8) | b; + len++; + state.isLSB = false; } - else + else // MSB { - if (buf) - *buf = (wchar_t)(c << 8); + // just store it while we wait for LSB + state.msb = b; + state.isLSB = true; } } } + } - if ( !ok ) + if ( state.IsDirect() ) + { + // start of an encoded segment? + if ( cc == '+' ) { - // in valid UTF7 we should have valid characters after '+' - return wxCONV_FAILED; + if ( src == srcEnd ) + return wxCONV_FAILED; // can't have '+' at the end + + if ( *src == '-' ) + { + // just the encoded plus sign, don't switch to shifted mode + if ( dst ) + *dst++ = '+'; + len++; + src++; + } + else + { + state.ToShifted(); + } } + else // not '+' + { + // only printable 7 bit ASCII characters (with the exception of + // NUL, TAB, CR and LF) can be used directly + if ( cc >= 0x7f || (cc < ' ' && + !(cc == '\0' || cc == '\t' || cc == '\r' || cc == '\n')) ) + return wxCONV_FAILED; - if (*psz == '-') - psz++; + if ( dst ) + *dst++ = cc; + len++; + } } } - if ( buf && (len < n) ) - *buf = '\0'; + if ( !len ) + { + // as we didn't read any characters we should be called with the same + // data (followed by some more new data) again later so don't save our + // state + state = stateOrig; + + return wxCONV_FAILED; + } return len; } @@ -616,7 +693,7 @@ static const unsigned char utf7enb64[] = // static const unsigned char utf7encode[128] = { - 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3, + 0, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, @@ -626,21 +703,72 @@ static const unsigned char utf7encode[128] = 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3 }; -size_t wxMBConvUTF7::WC2MB(char *buf, const wchar_t *psz, size_t n) const +static inline bool wxIsUTF7Direct(wchar_t wc) { + return wc < 0x80 && utf7encode[wc] < 1; +} + +size_t wxMBConvUTF7::FromWChar(char *dst, size_t dstLen, + const wchar_t *src, size_t srcLen) const +{ + EncoderState stateOrig, + *statePtr; + if ( srcLen == wxNO_LEN ) + { + // we don't apply the stored state when operating on entire strings at + // once + statePtr = &stateOrig; + + srcLen = wxWcslen(src) + 1; + } + else // do use the mode we left the output in previously + { + stateOrig = m_stateEncoder; + statePtr = wx_const_cast(EncoderState *, &m_stateEncoder); + } + + EncoderState& state = *statePtr; + + size_t len = 0; - while (*psz && ((!buf) || (len < n))) + const wchar_t * const srcEnd = src + srcLen; + while ( src < srcEnd && (!dst || len < dstLen) ) { - wchar_t cc = *psz++; - if (cc < 0x80 && utf7encode[cc] < 1) + wchar_t cc = *src++; + if ( wxIsUTF7Direct(cc) ) { - // plain ASCII char - if (buf) - *buf++ = (char)cc; + if ( state.IsShifted() ) + { + // pad with zeros the last encoded block if necessary + if ( state.bit ) + { + if ( dst ) + *dst++ = utf7enb64[((state.accum % 16) << (6 - state.bit)) % 64]; + len++; + } + + state.ToDirect(); + + if ( dst ) + *dst++ = '-'; + len++; + } + if ( dst ) + *dst++ = (char)cc; len++; } + else if ( cc == '+' && state.IsDirect() ) + { + if ( dst ) + { + *dst++ = '+'; + *dst++ = '-'; + } + + len += 2; + } #ifndef WC_UTF16 else if (((wxUint32)cc) > 0xffff) { @@ -650,52 +778,45 @@ size_t wxMBConvUTF7::WC2MB(char *buf, const wchar_t *psz, size_t n) const #endif else { - if (buf) - *buf++ = '+'; + if ( state.IsDirect() ) + { + state.ToShifted(); - len++; - if (cc != '+') + if ( dst ) + *dst++ = '+'; + len++; + } + + // BASE64 encode string + for ( ;; ) { - // BASE64 encode string - unsigned int lsb, d, l; - for (d = 0, l = 0; /*nothing*/; psz++) + for ( unsigned lsb = 0; lsb < 2; lsb++ ) { - for (lsb = 0; lsb < 2; lsb ++) - { - d <<= 8; - d += lsb ? cc & 0xff : (cc & 0xff00) >> 8; + state.accum <<= 8; + state.accum += lsb ? cc & 0xff : (cc & 0xff00) >> 8; - for (l += 8; l >= 6; ) - { - l -= 6; - if (buf) - *buf++ = utf7enb64[(d >> l) % 64]; - len++; - } + for (state.bit += 8; state.bit >= 6; ) + { + state.bit -= 6; + if ( dst ) + *dst++ = utf7enb64[(state.accum >> state.bit) % 64]; + len++; } - - cc = *psz; - if (!(cc) || (cc < 0x80 && utf7encode[cc] < 1)) - break; } - if (l != 0) - { - if (buf) - *buf++ = utf7enb64[((d % 16) << (6 - l)) % 64]; + if ( src == srcEnd || wxIsUTF7Direct(cc = *src) ) + break; - len++; - } + src++; } - - if (buf) - *buf++ = '-'; - len++; } } - if (buf && (len < n)) - *buf = 0; + // we need to restore the original encoder state if we were called just to + // calculate the amount of space needed as we will presumably be called + // again to really convert the data now + if ( !dst ) + state = stateOrig; return len; } @@ -879,7 +1000,7 @@ wxMBConvStrictUTF8::FromWChar(char *dst, size_t dstLen, for ( const wchar_t *wp = src; ; wp++ ) { - if ( !(srcLen == wxNO_LEN ? *wp : srcLen--) ) + if ( !(srcLen == wxNO_LEN ? *wp : srcLen) ) { // all done successfully, just add the trailing NULL if we are not // using explicit length @@ -899,6 +1020,8 @@ wxMBConvStrictUTF8::FromWChar(char *dst, size_t dstLen, return written; } + if ( srcLen != wxNO_LEN ) + srcLen--; wxUint32 code; #ifdef WC_UTF16 @@ -2082,6 +2205,11 @@ wxMBConv_iconv::ToWChar(wchar_t *dst, size_t dstLen, srcLen = p - src; break; } + + // when we're determining the length of the string ourselves we count + // the terminating NUL(s) as part of it and always NUL-terminate the + // output + srcLen += nulLen; } // we express length in the number of (wide) characters but iconv always @@ -2106,10 +2234,14 @@ wxMBConv_iconv::ToWChar(wchar_t *dst, size_t dstLen, char* bufPtr = (char*)dst; // have destination buffer, convert there + size_t dstLenOrig = dstLen; cres = iconv(m2w, ICONV_CHAR_CAST(&pszPtr), &srcLen, &bufPtr, &dstLen); - res = dstLen - (dstLen / SIZEOF_WCHAR_T); + + // convert the number of bytes converted as returned by iconv to the + // number of (wide) characters converted that we need + res = (dstLenOrig - dstLen) / SIZEOF_WCHAR_T; if (ms_wcNeedsSwap) { @@ -2117,10 +2249,6 @@ wxMBConv_iconv::ToWChar(wchar_t *dst, size_t dstLen, for ( unsigned i = 0; i < res; i++ ) dst[i] = WC_BSWAP(dst[i]); } - - // NUL-terminate the string if there is any space left - if (res < dstLen) - dst[res] = 0; } else // no destination buffer { @@ -2161,7 +2289,7 @@ size_t wxMBConv_iconv::FromWChar(char *dst, size_t dstLen, #endif if ( srcLen == wxNO_LEN ) - srcLen = wxWcslen(src); + srcLen = wxWcslen(src) + 1; size_t inbuflen = srcLen * SIZEOF_WCHAR_T; size_t outbuflen = dstLen; @@ -2189,12 +2317,6 @@ size_t wxMBConv_iconv::FromWChar(char *dst, size_t dstLen, cres = iconv(w2m, ICONV_CHAR_CAST(&inbuf), &inbuflen, &dst, &outbuflen); res = dstLen - outbuflen; - - // NB: iconv was given only wcslen(src) characters on input, and so - // it couldn't convert the trailing zero. Let's do it ourselves - // if there's some room left for it in the output buffer. - if (res < dstLen) - dst[0] = 0; } else // no destination buffer { @@ -3042,69 +3164,57 @@ size_t wxCSConv::ToWChar(wchar_t *dst, size_t dstLen, return m_convReal->ToWChar(dst, dstLen, src, srcLen); // latin-1 (direct) - return wxMBConv::ToWChar(dst, dstLen, src, srcLen); -} + if ( srcLen == wxNO_LEN ) + srcLen = strlen(src) + 1; // take trailing NUL too -size_t wxCSConv::FromWChar(char *dst, size_t dstLen, - const wchar_t *src, size_t srcLen) const -{ - CreateConvIfNeeded(); + if ( dst ) + { + if ( dstLen < srcLen ) + return wxCONV_FAILED; - if (m_convReal) - return m_convReal->FromWChar(dst, dstLen, src, srcLen); + for ( size_t n = 0; n < srcLen; n++ ) + dst[n] = (unsigned char)(src[n]); + } - // latin-1 (direct) - return wxMBConv::FromWChar(dst, dstLen, src, srcLen); + return srcLen; } -size_t wxCSConv::MB2WC(wchar_t *buf, const char *psz, size_t n) const +size_t wxCSConv::FromWChar(char *dst, size_t dstLen, + const wchar_t *src, size_t srcLen) const { CreateConvIfNeeded(); if (m_convReal) - return m_convReal->MB2WC(buf, psz, n); + return m_convReal->FromWChar(dst, dstLen, src, srcLen); // latin-1 (direct) - size_t len = strlen(psz); + if ( srcLen == wxNO_LEN ) + srcLen = wxWcslen(src) + 1; - if (buf) + if ( dst ) { - for (size_t c = 0; c <= len; c++) - buf[c] = (unsigned char)(psz[c]); - } - - return len; -} - -size_t wxCSConv::WC2MB(char *buf, const wchar_t *psz, size_t n) const -{ - CreateConvIfNeeded(); - - if (m_convReal) - return m_convReal->WC2MB(buf, psz, n); + if ( dstLen < srcLen ) + return wxCONV_FAILED; - // latin-1 (direct) - const size_t len = wxWcslen(psz); - if (buf) - { - for (size_t c = 0; c <= len; c++) + for ( size_t n = 0; n < srcLen; n++ ) { - if (psz[c] > 0xFF) + if ( src[n] > 0xFF ) return wxCONV_FAILED; - buf[c] = (char)psz[c]; + dst[n] = (char)src[n]; } + } - else + else // still need to check the input validity { - for (size_t c = 0; c <= len; c++) + for ( size_t n = 0; n < srcLen; n++ ) { - if (psz[c] > 0xFF) + if ( src[n] > 0xFF ) return wxCONV_FAILED; } } - return len; + return srcLen; } size_t wxCSConv::GetMBNulLen() const