+
+// Return true if wch is one of the octal digit characters L'0' to L'7'.
+static inline bool isoctal(wchar_t wch)
+{
+    if ( wch < L'0' )
+        return false;
+
+    return !(wch > L'7');
+}
+
+// Convert a wide character string to UTF-8, honouring the mapping selected
+// by m_options.
+//
+// Parameters:
+//  buf     output buffer; if NULL nothing is written and only the required
+//          length is computed
+//  n       size of buf in bytes (not consulted when buf is NULL)
+//  psz     the wide character input
+//  srcLen  number of wchar_t elements to convert, or wxNO_LEN if psz is
+//          NUL-terminated
+//
+// Returns the number of bytes produced. The trailing NUL is counted (and
+// written if there is room for it) only for NUL-terminated input, i.e. when
+// srcLen is wxNO_LEN.
+//
+// With MAP_INVALID_UTF8_NOT the conversion is delegated to the strict
+// converter. Otherwise, depending on the bits set in m_options:
+//  - MAP_INVALID_UTF8_TO_PUA: a character in [wxUnicodePUA, wxUnicodePUAEnd)
+//    is output as the single byte (cc - wxUnicodePUA);
+//  - MAP_INVALID_UTF8_TO_OCTAL: two consecutive backslashes are output as
+//    one backslash and a backslash followed by 3 octal digits is output as
+//    the single byte with this octal value;
+//  - anything else is encoded as UTF-8.
+//
+// If buf is too small the output is truncated: conversion stops as soon as
+// the next character doesn't fit.
+size_t wxMBConvUTF8::FromWChar(char *buf, size_t n,
+                               const wchar_t *psz, size_t srcLen) const
+{
+    if ( m_options == MAP_INVALID_UTF8_NOT )
+        return wxMBConvStrictUTF8::FromWChar(buf, n, psz, srcLen);
+
+    // Remember how the length was specified before the loop below starts
+    // decrementing srcLen: testing srcLen against wxNO_LEN afterwards is not
+    // reliable as the counter may have been decremented past zero by then.
+    const bool isNulTerminated = srcLen == wxNO_LEN;
+
+    size_t len = 0;
+
+    while ((isNulTerminated ? *psz : srcLen--) && ((!buf) || (len < n)))
+    {
+        wxUint32 cc;
+
+#ifdef WC_UTF16
+        // cast is ok for WC_UTF16
+        size_t pa = decode_utf16((const wxUint16 *)psz, cc);
+        psz += (pa == wxCONV_FAILED) ? 1 : pa;
+
+        // a surrogate pair consumes two input elements but the loop
+        // condition only accounted for one of them
+        if ( pa == 2 && !isNulTerminated && srcLen > 0 )
+            srcLen--;
+#else
+        cc = (*psz++) & 0x7fffffff;
+#endif
+
+        // When an explicit length is given, srcLen is now the number of
+        // elements remaining after the current one and we must not look
+        // beyond them. For NUL-terminated input the look-ahead is safe
+        // because the comparisons below stop at the terminating NUL.
+        const bool canPeek1 = isNulTerminated || srcLen >= 1;
+        const bool canPeek3 = isNulTerminated || srcLen >= 3;
+
+        if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
+                && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
+        {
+            if (buf)
+                *buf++ = (char)(cc - wxUnicodePUA);
+            len++;
+        }
+        else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL)
+                    && cc == L'\\' && canPeek1 && psz[0] == L'\\' )
+        {
+            // escaped backslash: output just one of them
+            if (buf)
+                *buf++ = (char)cc;
+            psz++;
+            if ( !isNulTerminated )
+                srcLen--;
+            len++;
+        }
+        else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
+                    cc == L'\\' && canPeek3 &&
+                        isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
+        {
+            // backslash followed by 3 octal digits: output the byte with
+            // this value
+            if (buf)
+            {
+                *buf++ = (char) ((psz[0] - L'0') * 0100 +
+                                 (psz[1] - L'0') * 010 +
+                                 (psz[2] - L'0'));
+            }
+
+            psz += 3;
+            if ( !isNulTerminated )
+                srcLen -= 3;
+            len++;
+        }
+        else
+        {
+            // cnt is the number of continuation bytes needed for cc
+            unsigned cnt;
+            for (cnt = 0; cc > utf8_max[cnt]; cnt++)
+            {
+            }
+
+            if (!cnt)
+            {
+                // plain ASCII char
+                if (buf)
+                    *buf++ = (char) cc;
+                len++;
+            }
+            else
+            {
+                // the loop condition only guarantees room for a single
+                // byte, so check that the entire sequence fits before
+                // writing it (len < n holds here whenever buf is not NULL)
+                if ( buf && n - len < cnt + 1 )
+                    break;
+
+                len += cnt + 1;
+                if (buf)
+                {
+                    // lead byte followed by cnt continuation bytes
+                    *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
+                    while (cnt--)
+                        *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
+                }
+            }
+        }
+    }
+
+    if ( isNulTerminated )
+    {
+        // add the trailing NUL if there is enough space for it
+        if ( buf && (len < n) )
+            *buf = 0;
+
+        // and count it in any case
+        len++;
+    }
+
+    return len;
+}
+
+// ============================================================================
+// UTF-16
+// ============================================================================
+
+#ifdef WORDS_BIGENDIAN // "straight" methods below copy without byte swapping, "swap" ones reverse the bytes
+ #define wxMBConvUTF16straight wxMBConvUTF16BE
+ #define wxMBConvUTF16swap wxMBConvUTF16LE
+#else // WORDS_BIGENDIAN not defined: the roles of the BE and LE classes are exchanged
+ #define wxMBConvUTF16swap wxMBConvUTF16BE
+ #define wxMBConvUTF16straight wxMBConvUTF16LE
+#endif
+
+// Compute the length in bytes of the UTF-16 input.
+//
+// If srcLen is wxNO_LEN, src is scanned as a sequence of 16 bit units up to
+// and including the first zero unit. Otherwise srcLen is returned unchanged
+// if it is a multiple of BYTES_PER_CHAR and wxCONV_FAILED is returned if it
+// is not.
+/* static */
+size_t wxMBConvUTF16Base::GetLength(const char *src, size_t srcLen)
+{
+    if ( srcLen != wxNO_LEN )
+    {
+        // an explicit length must describe a whole number of code units
+        if ( srcLen % BYTES_PER_CHAR != 0 )
+            return wxCONV_FAILED;
+
+        return srcLen;
+    }
+
+    // NUL-terminated input: count the units, starting from 1 in order to
+    // include the terminating one
+    const wxUint16 *unit = wx_reinterpret_cast(const wxUint16 *, src);
+    size_t numUnits = 1;
+    while ( *unit++ )
+        numUnits++;
+
+    return numUnits * BYTES_PER_CHAR;
+}
+
+// case when in-memory representation is UTF-16 too
+#ifdef WC_UTF16
+
+// ----------------------------------------------------------------------------
+// conversions without endianness change
+// ----------------------------------------------------------------------------
+
+// Copy UTF-16 input to a UTF-16 wchar_t buffer without byte swapping.
+//
+// Returns the number of wchar_t elements (as computed from GetLength()) or
+// wxCONV_FAILED if the input length is invalid or dst is too small. When dst
+// is NULL only the length is returned.
+size_t
+wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
+                               const char *src, size_t srcLen) const
+{
+    const size_t numBytes = GetLength(src, srcLen);
+    if ( numBytes == wxNO_LEN )
+        return wxCONV_FAILED;
+
+    const size_t numUnits = numBytes / BYTES_PER_CHAR;
+
+    // nothing to copy if we are only asked for the length
+    if ( !dst )
+        return numUnits;
+
+    if ( numUnits > dstLen )
+        return wxCONV_FAILED;
+
+    // the representations are identical so a bulk copy is all that's needed
+    memcpy(dst, src, numBytes);
+
+    return numUnits;
+}
+
+// Copy a UTF-16 wchar_t string to the output buffer without byte swapping.
+//
+// If srcLen is wxNO_LEN the string length is taken from wxWcslen() and the
+// terminating NUL is included. Returns the output size in bytes or
+// wxCONV_FAILED if dst is not NULL and dstLen is smaller than that.
+size_t
+wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
+                                 const wchar_t *src, size_t srcLen) const
+{
+    const size_t numUnits = srcLen == wxNO_LEN ? wxWcslen(src) + 1 : srcLen;
+    const size_t numBytes = numUnits * BYTES_PER_CHAR;
+
+    // only the required size was requested
+    if ( !dst )
+        return numBytes;
+
+    if ( numBytes > dstLen )
+        return wxCONV_FAILED;
+
+    memcpy(dst, src, numBytes);
+
+    return numBytes;
+}
+
+// ----------------------------------------------------------------------------
+// endian-reversing conversions
+// ----------------------------------------------------------------------------
+
+// Convert UTF-16 input to a UTF-16 wchar_t buffer, swapping the bytes of
+// every 16 bit unit.
+//
+// Returns the number of units or wxCONV_FAILED if the input length is
+// invalid or dst is too small. When dst is NULL only the length is returned.
+size_t
+wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
+                           const char *src, size_t srcLen) const
+{
+    const size_t numBytes = GetLength(src, srcLen);
+    if ( numBytes == wxNO_LEN )
+        return wxCONV_FAILED;
+
+    const size_t numUnits = numBytes / BYTES_PER_CHAR;
+    if ( !dst )
+        return numUnits;
+
+    if ( numUnits > dstLen )
+        return wxCONV_FAILED;
+
+    const wxUint16 *in = wx_reinterpret_cast(const wxUint16 *, src);
+    for ( const wxUint16 * const inEnd = in + numUnits; in != inEnd; ++in )
+    {
+        *dst++ = wxUINT16_SWAP_ALWAYS(*in);
+    }
+
+    return numUnits;
+}
+
+// Write a UTF-16 wchar_t string to the output buffer, swapping the bytes of
+// every 16 bit unit.
+//
+// If srcLen is wxNO_LEN the string length is taken from wxWcslen() and the
+// terminating NUL is included. Returns the output size in bytes or
+// wxCONV_FAILED if dst is not NULL and dstLen is smaller than that.
+size_t
+wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
+                             const wchar_t *src, size_t srcLen) const
+{
+    const size_t numUnits = srcLen == wxNO_LEN ? wxWcslen(src) + 1 : srcLen;
+    const size_t numBytes = numUnits * BYTES_PER_CHAR;
+
+    if ( !dst )
+        return numBytes;
+
+    if ( numBytes > dstLen )
+        return wxCONV_FAILED;
+
+    wxUint16 *out = wx_reinterpret_cast(wxUint16 *, dst);
+    size_t done = 0;
+    while ( done < numBytes )
+    {
+        *out++ = wxUINT16_SWAP_ALWAYS(*src);
+        src++;
+        done += BYTES_PER_CHAR;
+    }
+
+    return numBytes;
+}
+
+#else // !WC_UTF16: wchar_t is UTF-32
+
+// ----------------------------------------------------------------------------
+// conversions without endianness change
+// ----------------------------------------------------------------------------
+
+// Decode UTF-16 input (no byte swapping) to wchar_t using
+// wxDecodeSurrogate() for each output character.
+//
+// When dst is NULL the number of 16 bit input units is returned, which is
+// an upper bound for the required output length. Otherwise returns the
+// number of wchar_t written or wxCONV_FAILED if the input length is invalid,
+// decoding fails or dst is too small.
+size_t
+wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
+                               const char *src, size_t srcLen) const
+{
+    const size_t numBytes = GetLength(src, srcLen);
+    if ( numBytes == wxNO_LEN )
+        return wxCONV_FAILED;
+
+    const size_t numUnits = numBytes / BYTES_PER_CHAR;
+
+    // shortcut: the output can never be longer than the input, so don't
+    // bother decoding just to compute the exact length
+    if ( !dst )
+        return numUnits;
+
+    const wxUint16 *in = wx_reinterpret_cast(const wxUint16 *, src);
+    const wxUint16 * const inEnd = in + numUnits;
+
+    size_t written = 0;
+    while ( in < inEnd )
+    {
+        // the input pointer is updated by the call and is NULL afterwards
+        // if decoding failed
+        const wxUint32 ch = wxDecodeSurrogate(&in);
+        if ( !in )
+            return wxCONV_FAILED;
+
+        if ( ++written > dstLen )
+            return wxCONV_FAILED;
+
+        *dst++ = ch;
+    }
+
+    return written;
+}
+
+// Encode a UTF-32 wchar_t string as UTF-16 (no byte swapping) using
+// encode_utf16() for every input character.
+//
+// If srcLen is wxNO_LEN the string length is taken from wxWcslen() and the
+// terminating NUL is included. Returns the output size in bytes or
+// wxCONV_FAILED if a character can't be encoded or if dst is not NULL and
+// too small. When dst is NULL nothing is written.
+size_t
+wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
+                                 const wchar_t *src, size_t srcLen) const
+{
+    if ( srcLen == wxNO_LEN )
+        srcLen = wxWcslen(src) + 1;
+
+    wxUint16 *out = wx_reinterpret_cast(wxUint16 *, dst);
+    size_t numBytes = 0;
+    for ( size_t remaining = srcLen; remaining > 0; remaining-- )
+    {
+        wxUint16 units[2];
+        const size_t numUnits = encode_utf16(*src++, units);
+        if ( numUnits == wxCONV_FAILED )
+            return wxCONV_FAILED;
+
+        numBytes += numUnits * BYTES_PER_CHAR;
+
+        // just keep counting if there is no output buffer
+        if ( !out )
+            continue;
+
+        if ( numBytes > dstLen )
+            return wxCONV_FAILED;
+
+        *out++ = units[0];
+        if ( numUnits == 2 )
+        {
+            // trailing half of a surrogate pair
+            *out++ = units[1];
+        }
+    }
+
+    return numBytes;
+}
+
+// ----------------------------------------------------------------------------
+// endian-reversing conversions
+// ----------------------------------------------------------------------------
+
+// Decode byte-swapped UTF-16 input to wchar_t.
+//
+// Each 16 bit unit is byte-swapped before being passed to decode_utf16().
+//
+// When dst is NULL the number of 16 bit input units is returned, which is
+// an upper bound for the required output length. Otherwise returns the
+// number of wchar_t written or wxCONV_FAILED if the input length is invalid,
+// decoding fails or dst is too small.
+size_t
+wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
+                           const char *src, size_t srcLen) const
+{
+    srcLen = GetLength(src, srcLen);
+    if ( srcLen == wxNO_LEN )
+        return wxCONV_FAILED;
+
+    const size_t inLen = srcLen / BYTES_PER_CHAR;
+    if ( !dst )
+    {
+        // optimization: return maximal space which could be needed for this
+        // string even if the real size could be smaller if the buffer contains
+        // any surrogates
+        return inLen;
+    }
+
+    size_t outLen = 0;
+    const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
+    for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
+    {
+        wxUint32 ch;
+        wxUint16 tmp[2];
+
+        tmp[0] = wxUINT16_SWAP_ALWAYS(*inBuff);
+        inBuff++;
+
+        // The second unit is only needed for surrogate pairs and must not be
+        // read if the first one was the last unit of the input, as this
+        // would access memory beyond the end of the source buffer.
+        //
+        // NOTE(review): 0 is used as the substitute; presumably
+        // decode_utf16() rejects a leading surrogate not followed by a
+        // trailing one, so truncated pairs are reported as errors -- confirm
+        // against its definition.
+        if ( inBuff < inEnd )
+            tmp[1] = wxUINT16_SWAP_ALWAYS(*inBuff);
+        else
+            tmp[1] = 0;
+
+        const size_t numChars = decode_utf16(tmp, ch);
+        if ( numChars == wxCONV_FAILED )
+            return wxCONV_FAILED;
+
+        // skip the second unit too if it was part of this character
+        if ( numChars == 2 )
+            inBuff++;
+
+        if ( ++outLen > dstLen )
+            return wxCONV_FAILED;
+
+        *dst++ = ch;
+    }
+
+
+    return outLen;
+}
+
+// Encode a UTF-32 wchar_t string as UTF-16 using encode_utf16() and write
+// the resulting 16 bit units with their bytes swapped.
+//
+// If srcLen is wxNO_LEN the string length is taken from wxWcslen() and the
+// terminating NUL is included. Returns the output size in bytes or
+// wxCONV_FAILED if a character can't be encoded or if dst is not NULL and
+// too small. When dst is NULL nothing is written.
+size_t
+wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
+                             const wchar_t *src, size_t srcLen) const
+{
+    if ( srcLen == wxNO_LEN )
+        srcLen = wxWcslen(src) + 1;
+
+    wxUint16 *out = wx_reinterpret_cast(wxUint16 *, dst);
+    const wchar_t * const srcEnd = src + srcLen;
+
+    size_t numBytes = 0;
+    while ( src < srcEnd )
+    {
+        wxUint16 units[2];
+        const size_t numUnits = encode_utf16(*src, units);
+        if ( numUnits == wxCONV_FAILED )
+            return wxCONV_FAILED;
+
+        numBytes += numUnits * BYTES_PER_CHAR;
+        if ( out )
+        {
+            if ( numBytes > dstLen )
+                return wxCONV_FAILED;
+
+            *out++ = wxUINT16_SWAP_ALWAYS(units[0]);
+            if ( numUnits == 2 )
+            {
+                // trailing half of a surrogate pair
+                *out++ = wxUINT16_SWAP_ALWAYS(units[1]);
+            }
+        }
+
+        src++;
+    }
+
+    return numBytes;
+}
+
+#endif // WC_UTF16/!WC_UTF16
+
+
+// ============================================================================
+// UTF-32
+// ============================================================================
+
+#ifdef WORDS_BIGENDIAN // "straight" methods below copy without byte swapping, "swap" ones reverse the bytes
+ #define wxMBConvUTF32straight wxMBConvUTF32BE
+ #define wxMBConvUTF32swap wxMBConvUTF32LE
+#else // WORDS_BIGENDIAN not defined: the roles of the BE and LE classes are exchanged
+ #define wxMBConvUTF32swap wxMBConvUTF32BE
+ #define wxMBConvUTF32straight wxMBConvUTF32LE
+#endif
+
+
+WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE; // global object of the UTF-32LE converter class
+WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE; // global object of the UTF-32BE converter class
+
+// Compute the length in bytes of the UTF-32 input.
+//
+// If srcLen is wxNO_LEN, src is scanned as a sequence of 32 bit units up to
+// and including the first zero unit. Otherwise srcLen is returned unchanged
+// if it is a multiple of BYTES_PER_CHAR and wxCONV_FAILED is returned if it
+// is not.
+/* static */
+size_t wxMBConvUTF32Base::GetLength(const char *src, size_t srcLen)
+{
+    if ( srcLen != wxNO_LEN )
+    {
+        // an explicit length must describe a whole number of characters
+        if ( srcLen % BYTES_PER_CHAR != 0 )
+            return wxCONV_FAILED;
+
+        return srcLen;
+    }
+
+    // NUL-terminated input: count the units, starting from 1 in order to
+    // include the terminating one
+    const wxUint32 *unit = wx_reinterpret_cast(const wxUint32 *, src);
+    size_t numUnits = 1;
+    while ( *unit++ )
+        numUnits++;
+
+    return numUnits * BYTES_PER_CHAR;
+}
+
+// case when in-memory representation is UTF-16
+#ifdef WC_UTF16
+
+// ----------------------------------------------------------------------------
+// conversions without endianness change
+// ----------------------------------------------------------------------------
+
+// Convert UTF-32 input (no byte swapping) to UTF-16 wchar_t using
+// encode_utf16() for every input character.
+//
+// Returns the number of wchar_t elements produced or wxCONV_FAILED if the
+// input length is invalid, a character can't be encoded or dst is not NULL
+// and too small. When dst is NULL nothing is written.
+size_t
+wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
+                               const char *src, size_t srcLen) const
+{
+    const size_t numBytes = GetLength(src, srcLen);
+    if ( numBytes == wxNO_LEN )
+        return wxCONV_FAILED;
+
+    const wxUint32 *in = wx_reinterpret_cast(const wxUint32 *, src);
+    const wxUint32 * const inEnd = in + numBytes / BYTES_PER_CHAR;
+
+    size_t written = 0;
+    for ( ; in != inEnd; ++in )
+    {
+        wxUint16 units[2];
+        const size_t numUnits = encode_utf16(*in, units);
+        if ( numUnits == wxCONV_FAILED )
+            return wxCONV_FAILED;
+
+        written += numUnits;
+
+        // just keep counting if there is no output buffer
+        if ( !dst )
+            continue;
+
+        if ( written > dstLen )
+            return wxCONV_FAILED;
+
+        *dst++ = units[0];
+        if ( numUnits == 2 )
+        {
+            // trailing half of a surrogate pair
+            *dst++ = units[1];
+        }
+    }
+
+    return written;
+}
+
+// Convert a UTF-16 wchar_t string to UTF-32 (no byte swapping) using
+// wxDecodeSurrogate() for each output character.
+//
+// If srcLen is wxNO_LEN the string length is taken from wxWcslen() and the
+// terminating NUL is included. When dst is NULL, srcLen * BYTES_PER_CHAR is
+// returned, which is an upper bound for the required size. Otherwise
+// returns the number of bytes written or wxCONV_FAILED if decoding fails or
+// dst is too small.
+size_t
+wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
+                                 const wchar_t *src, size_t srcLen) const
+{
+    if ( srcLen == wxNO_LEN )
+        srcLen = wxWcslen(src) + 1;
+
+    // Without an output buffer, simply report the largest size which could
+    // be needed: this may overestimate if the input contains surrogates but
+    // avoids decoding the entire string just to compute its length.
+    if ( !dst )
+        return srcLen * BYTES_PER_CHAR;
+
+    wxUint32 *out = wx_reinterpret_cast(wxUint32 *, dst);
+    const wchar_t * const srcEnd = src + srcLen;
+
+    size_t numBytes = 0;
+    while ( src < srcEnd )
+    {
+        // the input pointer is updated by the call and is NULL afterwards
+        // if decoding failed
+        const wxUint32 ch = wxDecodeSurrogate(&src);
+        if ( !src )
+            return wxCONV_FAILED;
+
+        numBytes += BYTES_PER_CHAR;
+        if ( numBytes > dstLen )
+            return wxCONV_FAILED;
+
+        *out++ = ch;
+    }
+
+    return numBytes;
+}
+
+// ----------------------------------------------------------------------------
+// endian-reversing conversions
+// ----------------------------------------------------------------------------
+
+// Convert byte-swapped UTF-32 input to UTF-16 wchar_t: every 32 bit value
+// is byte-swapped and then passed to encode_utf16().
+//
+// Returns the number of wchar_t elements produced or wxCONV_FAILED if the
+// input length is invalid, a character can't be encoded or dst is not NULL
+// and too small. When dst is NULL nothing is written.
+size_t
+wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
+                           const char *src, size_t srcLen) const
+{
+    const size_t numBytes = GetLength(src, srcLen);
+    if ( numBytes == wxNO_LEN )
+        return wxCONV_FAILED;
+
+    const wxUint32 *in = wx_reinterpret_cast(const wxUint32 *, src);
+    const wxUint32 * const inEnd = in + numBytes / BYTES_PER_CHAR;
+
+    size_t written = 0;
+    for ( ; in != inEnd; ++in )
+    {
+        wxUint16 units[2];
+        const size_t numUnits = encode_utf16(wxUINT32_SWAP_ALWAYS(*in), units);
+        if ( numUnits == wxCONV_FAILED )
+            return wxCONV_FAILED;
+
+        written += numUnits;
+
+        // just keep counting if there is no output buffer
+        if ( !dst )
+            continue;
+
+        if ( written > dstLen )
+            return wxCONV_FAILED;
+
+        *dst++ = units[0];
+        if ( numUnits == 2 )
+        {
+            // trailing half of a surrogate pair
+            *dst++ = units[1];
+        }
+    }
+
+    return written;
+}
+
+// Convert a UTF-16 wchar_t string to UTF-32, byte-swapping every output
+// value; wxDecodeSurrogate() is used for each output character.
+//
+// If srcLen is wxNO_LEN the string length is taken from wxWcslen() and the
+// terminating NUL is included. When dst is NULL, srcLen * BYTES_PER_CHAR is
+// returned, which is an upper bound for the required size. Otherwise
+// returns the number of bytes written or wxCONV_FAILED if decoding fails or
+// dst is too small.
+size_t
+wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
+                             const wchar_t *src, size_t srcLen) const
+{
+    if ( srcLen == wxNO_LEN )
+        srcLen = wxWcslen(src) + 1;
+
+    // Without an output buffer, simply report the largest size which could
+    // be needed: this may overestimate if the input contains surrogates but
+    // avoids decoding the entire string just to compute its length.
+    if ( !dst )
+        return srcLen*BYTES_PER_CHAR;
+
+    wxUint32 *out = wx_reinterpret_cast(wxUint32 *, dst);
+    const wchar_t * const srcEnd = src + srcLen;
+
+    size_t numBytes = 0;
+    while ( src < srcEnd )
+    {
+        // the input pointer is updated by the call and is NULL afterwards
+        // if decoding failed
+        const wxUint32 ch = wxDecodeSurrogate(&src);
+        if ( !src )
+            return wxCONV_FAILED;
+
+        numBytes += BYTES_PER_CHAR;
+        if ( numBytes > dstLen )
+            return wxCONV_FAILED;
+
+        *out++ = wxUINT32_SWAP_ALWAYS(ch);
+    }
+
+    return numBytes;
+}
+
+#else // !WC_UTF16: wchar_t is UTF-32
+
+// ----------------------------------------------------------------------------
+// conversions without endianness change
+// ----------------------------------------------------------------------------
+
+// Copy UTF-32 input to a UTF-32 wchar_t buffer without byte swapping.
+//
+// Returns the number of wchar_t elements (as computed from GetLength()) or
+// wxCONV_FAILED if the input length is invalid or dst is too small. When dst
+// is NULL only the length is returned.
+size_t
+wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
+                               const char *src, size_t srcLen) const
+{
+    const size_t numBytes = GetLength(src, srcLen);
+    if ( numBytes == wxNO_LEN )
+        return wxCONV_FAILED;
+
+    const size_t numChars = numBytes/BYTES_PER_CHAR;
+
+    // nothing to copy if we are only asked for the length
+    if ( !dst )
+        return numChars;
+
+    if ( numChars > dstLen )
+        return wxCONV_FAILED;
+
+    // the representations are identical so a bulk copy is all that's needed
+    memcpy(dst, src, numBytes);
+
+    return numChars;
+}
+
+// Copy a UTF-32 wchar_t string to the output buffer without byte swapping.
+//
+// If srcLen is wxNO_LEN the string length is taken from wxWcslen() and the
+// terminating NUL is included. Returns the output size in bytes or
+// wxCONV_FAILED if dst is not NULL and dstLen is smaller than that.
+size_t
+wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
+                                 const wchar_t *src, size_t srcLen) const
+{
+    const size_t numChars = srcLen == wxNO_LEN ? wxWcslen(src) + 1 : srcLen;
+    const size_t numBytes = numChars * BYTES_PER_CHAR;
+
+    // only the required size was requested
+    if ( !dst )
+        return numBytes;
+
+    if ( numBytes > dstLen )
+        return wxCONV_FAILED;
+
+    memcpy(dst, src, numBytes);
+
+    return numBytes;
+}
+
+// ----------------------------------------------------------------------------
+// endian-reversing conversions
+// ----------------------------------------------------------------------------
+
+// Convert UTF-32 input to a UTF-32 wchar_t buffer, swapping the bytes of
+// every 32 bit value.
+//
+// Returns the number of characters or wxCONV_FAILED if the input length is
+// invalid or dst is too small. When dst is NULL only the length is returned.
+size_t
+wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
+                           const char *src, size_t srcLen) const
+{
+    const size_t numBytes = GetLength(src, srcLen);
+    if ( numBytes == wxNO_LEN )
+        return wxCONV_FAILED;
+
+    const size_t numChars = numBytes / BYTES_PER_CHAR;
+    if ( !dst )
+        return numChars;
+
+    if ( numChars > dstLen )
+        return wxCONV_FAILED;
+
+    const wxUint32 *in = wx_reinterpret_cast(const wxUint32 *, src);
+    for ( const wxUint32 * const inEnd = in + numChars; in != inEnd; ++in )
+    {
+        *dst++ = wxUINT32_SWAP_ALWAYS(*in);
+    }
+
+    return numChars;
+}
+
+// Write a UTF-32 wchar_t string to the output buffer, swapping the bytes of
+// every 32 bit value.
+//
+// If srcLen is wxNO_LEN the string length is taken from wxWcslen() and the
+// terminating NUL is included. Returns the output size in bytes or
+// wxCONV_FAILED if dst is not NULL and dstLen is smaller than that.
+size_t
+wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
+                             const wchar_t *src, size_t srcLen) const
+{
+    const size_t numChars = srcLen == wxNO_LEN ? wxWcslen(src) + 1 : srcLen;
+    const size_t numBytes = numChars * BYTES_PER_CHAR;
+
+    if ( !dst )
+        return numBytes;
+
+    if ( numBytes > dstLen )
+        return wxCONV_FAILED;
+
+    wxUint32 *out = wx_reinterpret_cast(wxUint32 *, dst);
+    size_t done = 0;
+    while ( done < numBytes )
+    {
+        *out++ = wxUINT32_SWAP_ALWAYS(*src);
+        src++;
+        done += BYTES_PER_CHAR;
+    }
+
+    return numBytes;
+}
+
+#endif // WC_UTF16/!WC_UTF16
+
+
+// ============================================================================
+// The classes doing conversion using the iconv_xxx() functions
+// ============================================================================
+
+#ifdef HAVE_ICONV
+
+// VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
+// E2BIG if output buffer is _exactly_ as big as needed. Such case is
+// (unless there's yet another bug in glibc) the only case when iconv()
+// returns with (size_t)-1 (which means error) and says there are 0 bytes
+// left in the input buffer -- when _real_ error occurs,
+// bytes-left-in-input buffer is non-zero. Hence, this alternative test for
+// iconv() failure.
+// [This bug does not appear in glibc 2.2.]
+#if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1 // glibc 2.0 and 2.1 only: use the alternative failure test
+#define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
+ (errno != E2BIG || bufLeft != 0))
+#else // any other library: a (size_t)-1 result alone means failure, bufLeft is not used
+#define ICONV_FAILED(cres, bufLeft) (cres == (size_t)-1)
+#endif
+
+#define ICONV_CHAR_CAST(x) ((ICONV_CONST char **)(x)) // cast x to "ICONV_CONST char **"; presumably for iconv() buffer arguments -- confirm at the call sites
+
+#define ICONV_T_INVALID ((iconv_t)-1) // NOTE(review): presumably the value denoting an unusable conversion descriptor -- confirm at the use sites
+
+#if SIZEOF_WCHAR_T == 4
+ #define WC_BSWAP wxUINT32_SWAP_ALWAYS
+ #define WC_ENC wxFONTENCODING_UTF32
+#elif SIZEOF_WCHAR_T == 2
+ #define WC_BSWAP wxUINT16_SWAP_ALWAYS
+ #define WC_ENC wxFONTENCODING_UTF16
+#else // sizeof(wchar_t) != 2 nor 4
+ // does this ever happen?
+ #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"