X-Git-Url: https://git.saurik.com/wxWidgets.git/blobdiff_plain/e8769ed11cb9315fbf414d121490982df714b68a..e5ef1cae0f424be0bffbd651cb10052dfb208396:/src/common/strconv.cpp diff --git a/src/common/strconv.cpp b/src/common/strconv.cpp index 8acfe82ead..507bf1307b 100644 --- a/src/common/strconv.cpp +++ b/src/common/strconv.cpp @@ -53,7 +53,7 @@ #include "wx/fontmap.h" #ifdef __DARWIN__ -#include "wx/mac/corefoundation/private/strconv_cf.h" +#include "wx/osx/core/private/strconv_cf.h" #endif //def __DARWIN__ @@ -484,6 +484,8 @@ wxConvBrokenFileNames::wxConvBrokenFileNames(const wxString& charset) // ---------------------------------------------------------------------------- // Implementation (C) 2004 Fredrik Roubert +// +// Changes to work in streaming mode (C) 2008 Vadim Zeitlin // // BASE64 decoding table @@ -521,73 +523,134 @@ static const unsigned char utf7unb64[] = 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }; -size_t wxMBConvUTF7::MB2WC(wchar_t *buf, const char *psz, size_t n) const +size_t wxMBConvUTF7::ToWChar(wchar_t *dst, size_t dstLen, + const char *src, size_t srcLen) const { + DecoderState stateOrig, + *statePtr; + if ( srcLen == wxNO_LEN ) + { + // convert the entire string, up to and including the trailing NUL + srcLen = strlen(src) + 1; + + // when working on the entire strings we don't update nor use the shift + // state from the previous call + statePtr = &stateOrig; + } + else // when working with partial strings we do use the shift state + { + statePtr = wx_const_cast(DecoderState *, &m_stateDecoder); + + // also save the old state to be able to rollback to it on error + stateOrig = m_stateDecoder; + } + + // but to simplify the code below we use this variable in both cases + DecoderState& state = *statePtr; + + + // number of characters [which would have been] written to dst [if it were + // not NULL] size_t len = 0; - while ( *psz && (!buf || (len < n)) ) + const char * const srcEnd = src + srcLen; + + while ( (src < srcEnd) && (!dst || (len < dstLen)) ) { - unsigned char cc = *psz++; - if (cc != '+') - { - // plain ASCII char - if (buf) - *buf++ = cc; - len++; - } - else if (*psz == '-') - { - // encoded plus sign - if (buf) - *buf++ = cc; - len++; - psz++; - } - else // start of BASE64 encoded string + const unsigned char cc = *src++; + + if ( state.IsShifted() ) { - bool lsb, ok; - unsigned int d, l; - for ( ok = lsb = false, d = 0, l = 0; - (cc = utf7unb64[(unsigned char)*psz]) != 0xff; - psz++ ) + const unsigned char dc = utf7unb64[cc]; + if ( dc == 0xff ) { - d <<= 6; - d += cc; - for (l += 6; l >= 8; lsb = !lsb) + // end of encoded part + state.ToDirect(); + + // re-parse this character normally below unless it's '-' which + // is consumed by the decoder + if ( cc == '-' ) + continue; + } + else // valid encoded character + { + // mini base64 decoder: each character is 6 bits + state.bit += 6; + state.accum <<= 6; + state.accum += dc; + + if ( state.bit >= 8 ) { - unsigned char c = (unsigned char)((d >> (l -= 8)) % 256); - if (lsb) + // got the full byte, consume it + state.bit -= 8; + unsigned char b = (state.accum >> state.bit) & 0x00ff; + + if ( state.isLSB ) { - if (buf) - *buf++ |= c; - len ++; + // we've got the full word, output it + if ( dst ) + *dst++ = (state.msb << 8) | b; + len++; + state.isLSB = false; } - else + else // MSB { - if (buf) - *buf = (wchar_t)(c << 8); + // just store it while we wait for LSB + state.msb = b; + state.isLSB = true; } - - ok = true; } } + } - if ( !ok ) + if ( state.IsDirect() ) + { + // start of an encoded segment? + if ( cc == '+' ) { - // in valid UTF7 we should have valid characters after '+' - return wxCONV_FAILED; + if ( src == srcEnd ) + return wxCONV_FAILED; // can't have '+' at the end + + if ( *src == '-' ) + { + // just the encoded plus sign, don't switch to shifted mode + if ( dst ) + *dst++ = '+'; + len++; + src++; + } + else + { + state.ToShifted(); + } } + else // not '+' + { + // only printable 7 bit ASCII characters (with the exception of + // NUL, TAB, CR and LF) can be used directly + if ( cc >= 0x7f || (cc < ' ' && + !(cc == '\0' || cc == '\t' || cc == '\r' || cc == '\n')) ) + return wxCONV_FAILED; - if (*psz == '-') - psz++; + if ( dst ) + *dst++ = cc; + len++; + } } } - if ( buf && (len < n) ) - *buf = '\0'; + if ( !len ) + { + // as we didn't read any characters we should be called with the same + // data (followed by some more new data) again later so don't save our + // state + state = stateOrig; + + return wxCONV_FAILED; + } return len; } @@ -617,7 +680,7 @@ static const unsigned char utf7enb64[] = // static const unsigned char utf7encode[128] = { - 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3, + 0, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, @@ -627,21 +690,72 @@ static const unsigned char utf7encode[128] = 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3 }; -size_t wxMBConvUTF7::WC2MB(char *buf, const wchar_t *psz, size_t n) const +static inline bool wxIsUTF7Direct(wchar_t wc) { + return wc < 0x80 && utf7encode[wc] < 1; +} + +size_t wxMBConvUTF7::FromWChar(char *dst, size_t dstLen, + const wchar_t *src, size_t srcLen) const +{ + EncoderState stateOrig, + *statePtr; + if ( srcLen == wxNO_LEN ) + { + // we don't apply the stored state when operating on entire strings at + // once + statePtr = &stateOrig; + + srcLen = wxWcslen(src) + 1; + } + else // do use the mode we left the output in previously + { + stateOrig = m_stateEncoder; + statePtr = wx_const_cast(EncoderState *, &m_stateEncoder); + } + + EncoderState& state = *statePtr; + + size_t len = 0; - while (*psz && ((!buf) || (len < n))) + const wchar_t * const srcEnd = src + srcLen; + while ( src < srcEnd && (!dst || len < dstLen) ) { - wchar_t cc = *psz++; - if (cc < 0x80 && utf7encode[cc] < 1) + wchar_t cc = *src++; + if ( wxIsUTF7Direct(cc) ) { - // plain ASCII char - if (buf) - *buf++ = (char)cc; + if ( state.IsShifted() ) + { + // pad with zeros the last encoded block if necessary + if ( state.bit ) + { + if ( dst ) + *dst++ = utf7enb64[((state.accum % 16) << (6 - state.bit)) % 64]; + len++; + } + + state.ToDirect(); + if ( dst ) + *dst++ = '-'; + len++; + } + + if ( dst ) + *dst++ = (char)cc; len++; } + else if ( cc == '+' && state.IsDirect() ) + { + if ( dst ) + { + *dst++ = '+'; + *dst++ = '-'; + } + + len += 2; + } #ifndef WC_UTF16 else if (((wxUint32)cc) > 0xffff) { @@ -651,52 +765,45 @@ size_t wxMBConvUTF7::WC2MB(char *buf, const wchar_t *psz, size_t n) const #endif else { - if (buf) - *buf++ = '+'; + if ( state.IsDirect() ) + { + state.ToShifted(); - len++; - if (cc != '+') + if ( dst ) + *dst++ = '+'; + len++; + } + + // BASE64 encode string + for ( ;; ) { - // BASE64 encode string - unsigned int lsb, d, l; - for (d = 0, l = 0; /*nothing*/; psz++) + for ( unsigned lsb = 0; lsb < 2; lsb++ ) { - for (lsb = 0; lsb < 2; lsb ++) - { - d <<= 8; - d += lsb ? cc & 0xff : (cc & 0xff00) >> 8; + state.accum <<= 8; + state.accum += lsb ? cc & 0xff : (cc & 0xff00) >> 8; - for (l += 8; l >= 6; ) - { - l -= 6; - if (buf) - *buf++ = utf7enb64[(d >> l) % 64]; - len++; - } + for (state.bit += 8; state.bit >= 6; ) + { + state.bit -= 6; + if ( dst ) + *dst++ = utf7enb64[(state.accum >> state.bit) % 64]; + len++; } - - cc = *psz; - if (!(cc) || (cc < 0x80 && utf7encode[cc] < 1)) - break; } - if (l != 0) - { - if (buf) - *buf++ = utf7enb64[((d % 16) << (6 - l)) % 64]; + if ( src == srcEnd || wxIsUTF7Direct(cc = *src) ) + break; - len++; - } + src++; } - - if (buf) - *buf++ = '-'; - len++; } } - if (buf && (len < n)) - *buf = 0; + // we need to restore the original encoder state if we were called just to + // calculate the amount of space needed as we will presumably be called + // again to really convert the data now + if ( !dst ) + state = stateOrig; return len; } @@ -880,7 +987,7 @@ wxMBConvStrictUTF8::FromWChar(char *dst, size_t dstLen, for ( const wchar_t *wp = src; ; wp++ ) { - if ( !(srcLen == wxNO_LEN ? *wp : srcLen--) ) + if ( !(srcLen == wxNO_LEN ? *wp : srcLen) ) { // all done successfully, just add the trailing NULL if we are not // using explicit length @@ -900,6 +1007,8 @@ wxMBConvStrictUTF8::FromWChar(char *dst, size_t dstLen, return written; } + if ( srcLen != wxNO_LEN ) + srcLen--; wxUint32 code; #ifdef WC_UTF16 @@ -1863,10 +1972,11 @@ public: wxMBConv_iconv(const char *name); virtual ~wxMBConv_iconv(); - virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const; - virtual size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const; - - // classify this encoding as explained in wxMBConv::GetMBNulLen() comment + // implement base class virtual methods + virtual size_t ToWChar(wchar_t *dst, size_t dstLen, + const char *src, size_t srcLen = wxNO_LEN) const; + virtual size_t FromWChar(char *dst, size_t dstLen, + const wchar_t *src, size_t srcLen = wxNO_LEN) const; virtual size_t GetMBNulLen() const; #if wxUSE_UNICODE_UTF8 @@ -2052,33 +2162,47 @@ wxMBConv_iconv::~wxMBConv_iconv() iconv_close(w2m); } -size_t wxMBConv_iconv::MB2WC(wchar_t *buf, const char *psz, size_t n) const +size_t +wxMBConv_iconv::ToWChar(wchar_t *dst, size_t dstLen, + const char *src, size_t srcLen) const { - // find the string length: notice that must be done differently for - // NUL-terminated strings and UTF-16/32 which are terminated with 2/4 NULs - size_t inbuf; - const size_t nulLen = GetMBNulLen(); - switch ( nulLen ) + if ( srcLen == wxNO_LEN ) { - default: - return wxCONV_FAILED; + // find the string length: notice that must be done differently for + // NUL-terminated strings and UTF-16/32 which are terminated with 2/4 + // consecutive NULs + const size_t nulLen = GetMBNulLen(); + switch ( nulLen ) + { + default: + return wxCONV_FAILED; - case 1: - inbuf = strlen(psz); // arguably more optimized than our version - break; + case 1: + srcLen = strlen(src); // arguably more optimized than our version + break; - case 2: - case 4: - // for UTF-16/32 not only we need to have 2/4 consecutive NULs but - // they also have to start at character boundary and not span two - // adjacent characters - const char *p; - for ( p = psz; NotAllNULs(p, nulLen); p += nulLen ) - ; - inbuf = p - psz; - break; + case 2: + case 4: + // for UTF-16/32 not only we need to have 2/4 consecutive NULs + // but they also have to start at character boundary and not + // span two adjacent characters + const char *p; + for ( p = src; NotAllNULs(p, nulLen); p += nulLen ) + ; + srcLen = p - src; + break; + } + + // when we're determining the length of the string ourselves we count + // the terminating NUL(s) as part of it and always NUL-terminate the + // output + srcLen += nulLen; } + // we express length in the number of (wide) characters but iconv always + // counts buffer sizes it in bytes + dstLen *= SIZEOF_WCHAR_T; + #if wxUSE_THREADS // NB: iconv() is MT-safe, but each thread must use its own iconv_t handle. // Unfortunately there are a couple of global wxCSConv objects such as @@ -2089,53 +2213,51 @@ size_t wxMBConv_iconv::MB2WC(wchar_t *buf, const char *psz, size_t n) const wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex); #endif // wxUSE_THREADS - size_t outbuf = n * SIZEOF_WCHAR_T; size_t res, cres; - const char *pszPtr = psz; + const char *pszPtr = src; - if (buf) + if ( dst ) { - char* bufPtr = (char*)buf; + char* bufPtr = (char*)dst; // have destination buffer, convert there + size_t dstLenOrig = dstLen; cres = iconv(m2w, - ICONV_CHAR_CAST(&pszPtr), &inbuf, - &bufPtr, &outbuf); - res = n - (outbuf / SIZEOF_WCHAR_T); + ICONV_CHAR_CAST(&pszPtr), &srcLen, + &bufPtr, &dstLen); + + // convert the number of bytes converted as returned by iconv to the + // number of (wide) characters converted that we need + res = (dstLenOrig - dstLen) / SIZEOF_WCHAR_T; if (ms_wcNeedsSwap) { // convert to native endianness for ( unsigned i = 0; i < res; i++ ) - buf[n] = WC_BSWAP(buf[i]); + dst[i] = WC_BSWAP(dst[i]); } - - // NUL-terminate the string if there is any space left - if (res < n) - buf[res] = 0; } - else + else // no destination buffer { - // no destination buffer... convert using temp buffer - // to calculate destination buffer requirement + // convert using temp buffer to calculate the size of the buffer needed wchar_t tbuf[8]; res = 0; do { char* bufPtr = (char*)tbuf; - outbuf = 8 * SIZEOF_WCHAR_T; + dstLen = 8 * SIZEOF_WCHAR_T; cres = iconv(m2w, - ICONV_CHAR_CAST(&pszPtr), &inbuf, - &bufPtr, &outbuf ); + ICONV_CHAR_CAST(&pszPtr), &srcLen, + &bufPtr, &dstLen ); - res += 8 - (outbuf / SIZEOF_WCHAR_T); + res += 8 - (dstLen / SIZEOF_WCHAR_T); } while ((cres == (size_t)-1) && (errno == E2BIG)); } - if (ICONV_FAILED(cres, inbuf)) + if (ICONV_FAILED(cres, srcLen)) { //VS: it is ok if iconv fails, hence trace only wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode())); @@ -2145,16 +2267,19 @@ size_t wxMBConv_iconv::MB2WC(wchar_t *buf, const char *psz, size_t n) const return res; } -size_t wxMBConv_iconv::WC2MB(char *buf, const wchar_t *psz, size_t n) const +size_t wxMBConv_iconv::FromWChar(char *dst, size_t dstLen, + const wchar_t *src, size_t srcLen) const { #if wxUSE_THREADS // NB: explained in MB2WC wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex); #endif - size_t inlen = wxWcslen(psz); - size_t inbuflen = inlen * SIZEOF_WCHAR_T; - size_t outbuflen = n; + if ( srcLen == wxNO_LEN ) + srcLen = wxWcslen(src) + 1; + + size_t inbuflen = srcLen * SIZEOF_WCHAR_T; + size_t outbuflen = dstLen; size_t res, cres; wchar_t *tmpbuf = 0; @@ -2165,39 +2290,32 @@ size_t wxMBConv_iconv::WC2MB(char *buf, const wchar_t *psz, size_t n) const // (doing WC_BSWAP twice on the original buffer won't help, as it // could be in read-only memory, or be accessed in some other thread) tmpbuf = (wchar_t *)malloc(inbuflen + SIZEOF_WCHAR_T); - for ( size_t i = 0; i < inlen; i++ ) - tmpbuf[n] = WC_BSWAP(psz[i]); + for ( size_t i = 0; i < srcLen; i++ ) + tmpbuf[i] = WC_BSWAP(src[i]); - tmpbuf[inlen] = L'\0'; - psz = tmpbuf; + tmpbuf[srcLen] = L'\0'; + src = tmpbuf; } - char* inbuf = (char*)psz; - if (buf) + char* inbuf = (char*)src; + if ( dst ) { // have destination buffer, convert there - cres = iconv(w2m, ICONV_CHAR_CAST(&inbuf), &inbuflen, &buf, &outbuflen); + cres = iconv(w2m, ICONV_CHAR_CAST(&inbuf), &inbuflen, &dst, &outbuflen); - res = n - outbuflen; - - // NB: iconv was given only wcslen(psz) characters on input, and so - // it couldn't convert the trailing zero. Let's do it ourselves - // if there's some room left for it in the output buffer. - if (res < n) - buf[0] = 0; + res = dstLen - outbuflen; } - else + else // no destination buffer { - // no destination buffer: convert using temp buffer - // to calculate destination buffer requirement + // convert using temp buffer to calculate the size of the buffer needed char tbuf[16]; res = 0; do { - buf = tbuf; + dst = tbuf; outbuflen = 16; - cres = iconv(w2m, ICONV_CHAR_CAST(&inbuf), &inbuflen, &buf, &outbuflen); + cres = iconv(w2m, ICONV_CHAR_CAST(&inbuf), &inbuflen, &dst, &outbuflen); res += 16 - outbuflen; }