#include <ctype.h>
#include <string.h>
#include <stdlib.h>
+#ifdef HAVE_LANGINFO_H
+ #include <langinfo.h>
+#endif
#if defined(__WIN32__) && !defined(__WXMICROWIN__)
#define wxHAVE_WIN32_MB2WC
#ifdef HAVE_ICONV
#include <iconv.h>
+ #include "wx/thread.h"
#endif
#include "wx/encconv.h"
return buf;
}
-size_t wxMBConv::MB2WC(wchar_t* szBuffer, const char* szString,
- size_t outsize, size_t nStringLen) const
+const wxWCharBuffer wxMBConv::cMB2WC(const char *szString, size_t nStringLen, size_t* pOutSize) const
{
+ wxASSERT(pOutSize != NULL);
+
const char* szEnd = szString + nStringLen + 1;
const char* szPos = szString;
const char* szStart = szPos;
size_t nActualLength = 0;
+ size_t nCurrentSize = nStringLen; //try normal size first (should never resize?)
+
+ wxWCharBuffer theBuffer(nCurrentSize);
//Convert the string until the length() is reached, continuing the
//loop every time a null character is reached
//Invalid conversion?
if( nLen == (size_t)-1 )
- return nLen;
+ {
+ *pOutSize = 0;
+ theBuffer.data()[0u] = wxT('\0');
+ return theBuffer;
+ }
+
//Increase the actual length (+1 for current null character)
nActualLength += nLen + 1;
- //Only copy data in if buffer size is big enough
- if (szBuffer != NULL &&
- nActualLength <= outsize)
+ //if buffer too big, realloc the buffer
+ if (nActualLength > (nCurrentSize+1))
{
- //Convert the current (sub)string
- if ( MB2WC(&szBuffer[szPos - szStart], szPos, nLen + 1) == (size_t)-1 )
- return (size_t)-1;
+ wxWCharBuffer theNewBuffer(nCurrentSize << 1);
+ memcpy(theNewBuffer.data(), theBuffer.data(), nCurrentSize * sizeof(wchar_t));
+ theBuffer = theNewBuffer;
+ nCurrentSize <<= 1;
+ }
+
+ //Convert the current (sub)string
+ if ( MB2WC(&theBuffer.data()[szPos - szStart], szPos, nLen + 1) == (size_t)-1 )
+ {
+ *pOutSize = 0;
+ theBuffer.data()[0u] = wxT('\0');
+ return theBuffer;
}
//Increment to next (sub)string
szPos += strlen(szPos) + 1;
}
- return nActualLength - 1; //success - return actual length
+ //success - return actual length and the buffer
+ *pOutSize = nActualLength;
+ return theBuffer;
}
-size_t wxMBConv::WC2MB(char* szBuffer, const wchar_t* szString,
- size_t outsize, size_t nStringLen) const
+const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *szString, size_t nStringLen, size_t* pOutSize) const
{
+ wxASSERT(pOutSize != NULL);
+
const wchar_t* szEnd = szString + nStringLen + 1;
const wchar_t* szPos = szString;
const wchar_t* szStart = szPos;
size_t nActualLength = 0;
+ size_t nCurrentSize = nStringLen << 2; //try * 4 first
+
+ wxCharBuffer theBuffer(nCurrentSize);
//Convert the string until the length() is reached, continuing the
//loop every time a null character is reached
//Invalid conversion?
if( nLen == (size_t)-1 )
- return nLen;
+ {
+ *pOutSize = 0;
+ theBuffer.data()[0u] = wxT('\0');
+ return theBuffer;
+ }
//Increase the actual length (+1 for current null character)
nActualLength += nLen + 1;
-
- //Only copy data in if buffer size is big enough
- if (szBuffer != NULL &&
- nActualLength <= outsize)
+
+ //if buffer too big, realloc the buffer
+ if (nActualLength > (nCurrentSize+1))
{
- //Convert the current (sub)string
- if(WC2MB(&szBuffer[szPos - szStart], szPos, nLen + 1) == (size_t)-1 )
- return (size_t)-1;
+ wxCharBuffer theNewBuffer(nCurrentSize << 1);
+ memcpy(theNewBuffer.data(), theBuffer.data(), nCurrentSize);
+ theBuffer = theNewBuffer;
+ nCurrentSize <<= 1;
+ }
+
+ //Convert the current (sub)string
+ if(WC2MB(&theBuffer.data()[szPos - szStart], szPos, nLen + 1) == (size_t)-1 )
+ {
+ *pOutSize = 0;
+ theBuffer.data()[0u] = wxT('\0');
+ return theBuffer;
}
//Increment to next (sub)string
szPos += wxWcslen(szPos) + 1;
}
- return nActualLength - 1; //success - return actual length
+ //success - return actual length and the buffer
+ *pOutSize = nActualLength;
+ return theBuffer;
}
// ----------------------------------------------------------------------------
{
return wxWC2MB(buf, psz, n);
}
+
+#ifdef __UNIX__
+
// ----------------------------------------------------------------------------
-// UTF-7
+// wxConvBrokenFileNames
+// ----------------------------------------------------------------------------
+
+wxConvBrokenFileNames::wxConvBrokenFileNames()
+{
+ // decide which conversion to use for the file names
+
+ // (1) this variable exists for the sole purpose of specifying the encoding
+ // of the filenames for GTK+ programs, so use it if it is set
+ const wxChar *encName = wxGetenv(_T("G_FILENAME_ENCODING"));
+ if ( encName )
+ {
+ m_conv = new wxCSConv(encName);
+ }
+ else // no G_FILENAME_ENCODING
+ {
+ // (2) if a non default locale is set, assume that the user wants his
+ // filenames in this locale too
+ switch ( wxLocale::GetSystemEncoding() )
+ {
+ default:
+ m_conv = new wxMBConvLibc;
+ break;
+
+ // (3) finally use UTF-8 by default
+ case wxFONTENCODING_SYSTEM:
+ case wxFONTENCODING_UTF8:
+ m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL);
+ break;
+ }
+ }
+}
+
+size_t
+wxConvBrokenFileNames::MB2WC(wchar_t *outputBuf,
+ const char *psz,
+ size_t outputSize) const
+{
+ return m_conv->MB2WC( outputBuf, psz, outputSize );
+}
+
+size_t
+wxConvBrokenFileNames::WC2MB(char *outputBuf,
+ const wchar_t *psz,
+ size_t outputSize) const
+{
+ return m_conv->WC2MB( outputBuf, psz, outputSize );
+}
+
+#endif
+
+// ----------------------------------------------------------------------------
+// UTF-7
// ----------------------------------------------------------------------------
// Implementation (C) 2004 Fredrik Roubert
size_t wxMBConvUTF7::MB2WC(wchar_t *buf, const char *psz, size_t n) const
{
-
size_t len = 0;
while (*psz && ((!buf) || (len < n)))
d += cc;
for (l += 6; l >= 8; lsb = !lsb)
{
- c = (d >> (l -= 8)) % 256;
+ c = (unsigned char)((d >> (l -= 8)) % 256);
if (lsb)
{
if (buf)
}
else
if (buf)
- *buf = c << 8;
+ *buf = (wchar_t)(c << 8);
}
}
if (*psz == '-')
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
};
-size_t wxMBConvUTF7::WC2MB(char *buf, const wchar_t
-*psz, size_t n) const
+size_t wxMBConvUTF7::WC2MB(char *buf, const wchar_t *psz, size_t n) const
{
len++;
}
#ifndef WC_UTF16
- else if (((wxUint16)cc) > 0xffff)
- {
+ else if (((wxUint32)cc) > 0xffff)
+ {
// no surrogate pair generation (yet?)
return (size_t)-1;
}
static wxUint32 utf8_max[]=
{ 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
+// boundaries of the private use area we use to (temporarily) remap invalid
+// characters invalid in a UTF-8 encoded string
+const wxUint32 wxUnicodePUA = 0x100000;
+const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
+
size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const
{
size_t len = 0;
while (*psz && ((!buf) || (len < n)))
{
+ const char *opsz = psz;
+ bool invalid = false;
unsigned char cc = *psz++, fc = cc;
unsigned cnt;
for (cnt = 0; fc & 0x80; cnt++)
if (!cnt)
{
// invalid UTF-8 sequence
- return (size_t)-1;
+ invalid = true;
}
else
{
wxUint32 res = cc & (0x3f >> cnt);
while (cnt--)
{
- cc = *psz++;
+ cc = *psz;
if ((cc & 0xC0) != 0x80)
{
// invalid UTF-8 sequence
- return (size_t)-1;
+ invalid = true;
+ break;
}
+ psz++;
res = (res << 6) | (cc & 0x3f);
}
- if (res <= utf8_max[ocnt])
+ if (invalid || res <= utf8_max[ocnt])
{
// illegal UTF-8 encoding
- return (size_t)-1;
+ invalid = true;
+ }
+ else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
+ res >= wxUnicodePUA && res < wxUnicodePUAEnd)
+ {
+ // if one of our PUA characters turns up externally
+ // it must also be treated as an illegal sequence
+ // (a bit like you have to escape an escape character)
+ invalid = true;
}
+ else
+ {
#ifdef WC_UTF16
- // cast is ok because wchar_t == wxUuint16 if WC_UTF16
- size_t pa = encode_utf16(res, (wxUint16 *)buf);
- if (pa == (size_t)-1)
- return (size_t)-1;
- if (buf)
- buf += pa;
- len += pa;
+ // cast is ok because wchar_t == wxUuint16 if WC_UTF16
+ size_t pa = encode_utf16(res, (wxUint16 *)buf);
+ if (pa == (size_t)-1)
+ {
+ invalid = true;
+ }
+ else
+ {
+ if (buf)
+ buf += pa;
+ len += pa;
+ }
#else // !WC_UTF16
- if (buf)
- *buf++ = res;
- len++;
+ if (buf)
+ *buf++ = res;
+ len++;
#endif // WC_UTF16/!WC_UTF16
+ }
+ }
+ if (invalid)
+ {
+ if (m_options & MAP_INVALID_UTF8_TO_PUA)
+ {
+ while (opsz < psz && (!buf || len < n))
+ {
+#ifdef WC_UTF16
+ // cast is ok because wchar_t == wxUuint16 if WC_UTF16
+ size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
+ wxASSERT(pa != (size_t)-1);
+ if (buf)
+ buf += pa;
+ opsz++;
+ len += pa;
+#else
+ if (buf)
+ *buf++ = wxUnicodePUA + (unsigned char)*opsz;
+ opsz++;
+ len++;
+#endif
+ }
+ }
+ else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
+ {
+ while (opsz < psz && (!buf || len < n))
+ {
+ if ( buf && len + 3 < n )
+ {
+ unsigned char n = *opsz;
+ *buf++ = L'\\';
+ *buf++ = (wchar_t)( L'0' + n / 0100 );
+ *buf++ = (wchar_t)( L'0' + (n % 0100) / 010 );
+ *buf++ = (wchar_t)( L'0' + n % 010 );
+ }
+ opsz++;
+ len += 4;
+ }
+ }
+ else // MAP_INVALID_UTF8_NOT
+ {
+ return (size_t)-1;
+ }
}
}
}
return len;
}
+static inline bool isoctal(wchar_t wch)
+{
+ return L'0' <= wch && wch <= L'7';
+}
+
size_t wxMBConvUTF8::WC2MB(char *buf, const wchar_t *psz, size_t n) const
{
size_t len = 0;
#else
cc=(*psz++) & 0x7fffffff;
#endif
- unsigned cnt;
- for (cnt = 0; cc > utf8_max[cnt]; cnt++) {}
- if (!cnt)
+
+ if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
+ && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
{
- // plain ASCII char
if (buf)
- *buf++ = (char) cc;
+ *buf++ = (char)(cc - wxUnicodePUA);
len++;
}
+ else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
+ cc == L'\\' &&
+ isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
+ {
+ if (buf)
+ {
+ *buf++ = (char) ((psz[0] - L'0')*0100 +
+ (psz[1] - L'0')*010 +
+ (psz[2] - L'0'));
+ }
+ psz += 3;
+ len++;
+ }
else
{
- len += cnt + 1;
- if (buf)
+ unsigned cnt;
+ for (cnt = 0; cc > utf8_max[cnt]; cnt++) {}
+ if (!cnt)
{
- *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
- while (cnt--)
- *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
+ // plain ASCII char
+ if (buf)
+ *buf++ = (char) cc;
+ len++;
+ }
+
+ else
+ {
+ len += cnt + 1;
+ if (buf)
+ {
+ *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
+ while (cnt--)
+ *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
+ }
}
}
}
- if (buf && (len<n)) *buf = 0;
+ if (buf && (len<n))
+ *buf = 0;
return len;
}
-
-
-
// ----------------------------------------------------------------------------
// UTF-16
// ----------------------------------------------------------------------------
#ifdef HAVE_ICONV
-// VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with E2BIG
-// if output buffer is _exactly_ as big as needed. Such case is (unless there's
-// yet another bug in glibc) the only case when iconv() returns with (size_t)-1
-// (which means error) and says there are 0 bytes left in the input buffer --
-// when _real_ error occurs, bytes-left-in-input buffer is non-zero. Hence,
-// this alternative test for iconv() failure.
+// VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
+// E2BIG if output buffer is _exactly_ as big as needed. Such case is
+// (unless there's yet another bug in glibc) the only case when iconv()
+// returns with (size_t)-1 (which means error) and says there are 0 bytes
+// left in the input buffer -- when _real_ error occurs,
+// bytes-left-in-input buffer is non-zero. Hence, this alternative test for
+// iconv() failure.
// [This bug does not appear in glibc 2.2.]
#if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
#define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
// the other direction
iconv_t m2w,
w2m;
+#if wxUSE_THREADS
+ // guards access to m2w and w2m objects
+ wxMutex m_iconvMutex;
+#endif
private:
// the name (for iconv_open()) of a wide char charset -- if none is
size_t wxMBConv_iconv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
{
+#if wxUSE_THREADS
+ // NB: iconv() is MT-safe, but each thread must use it's own iconv_t handle.
+ // Unfortunately there is a couple of global wxCSConv objects such as
+ // wxConvLocal that are used all over wx code, so we have to make sure
+ // the handle is used by at most one thread at the time. Otherwise
+ // only a few wx classes would be safe to use from non-main threads
+ // as MB<->WC conversion would fail "randomly".
+ wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
+#endif
+
size_t inbuf = strlen(psz);
size_t outbuf = n * SIZEOF_WCHAR_T;
size_t res, cres;
size_t wxMBConv_iconv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
{
+#if wxUSE_THREADS
+ // NB: explained in MB2WC
+ wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
+#endif
+
size_t inbuf = wxWcslen(psz) * SIZEOF_WCHAR_T;
size_t outbuf = n;
size_t res, cres;
// and break the library itself, e.g. wxTextInputStream::NextChar()
// wouldn't work if reading an incomplete MB char didn't result in an
// error
+ //
+ // note however that using MB_ERR_INVALID_CHARS with CP_UTF7 results in
+ // an error (tested under Windows Server 2003) and apparently it is
+ // done on purpose, i.e. the function accepts any input in this case
+ // and although I'd prefer to return error on ill-formed output, our
+ // own wxMBConvUTF7 doesn't detect errors (e.g. lone "+" which is
+ // explicitly ill-formed according to RFC 2152) neither so we don't
+ // even have any fallback here...
+ int flags = m_CodePage == CP_UTF7 ? 0 : MB_ERR_INVALID_CHARS;
+
const size_t len = ::MultiByteToWideChar
(
m_CodePage, // code page
- MB_ERR_INVALID_CHARS, // flags: fall on error
+ flags, // flags: fall on error
psz, // input string
-1, // its length (NUL-terminated)
buf, // output string
Init(CFStringGetSystemEncoding()) ;
}
+#if wxUSE_FONTMAP
wxMBConv_cocoa(const wxChar* name)
{
- Init( wxCFStringEncFromFontEnc(wxFontMapper::Get()->CharsetToEncoding(name, false) ) ) ;
+ Init( wxCFStringEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) ) ;
}
+#endif
wxMBConv_cocoa(wxFontEncoding encoding)
{
#if SIZEOF_WCHAR_T == 4
UniChar* szUniCharBuffer = new UniChar[nOutSize];
#endif
-
+
CFStringGetCharacters(theString, theRange, szUniCharBuffer);
-
+
CFRelease(theString);
szUniCharBuffer[nOutLength] = '\0' ;
converter.MB2WC(szOut, (const char*)szUniCharBuffer , nOutSize ) ;
delete[] szUniCharBuffer;
#endif
-
+
return nOutLength;
}
size_t WC2MB(char *szOut, const wchar_t *szUnConv, size_t nOutSize) const
{
wxASSERT(szUnConv);
-
+
size_t nRealOutSize;
size_t nBufSize = wxWcslen(szUnConv);
UniChar* szUniBuffer = (UniChar*) szUnConv;
{
if (szOut != NULL)
CFStringGetCharacters(theString, CFRangeMake(0, nOutSize - 1), (UniChar*) szOut);
-
+
nRealOutSize = CFStringGetLength(theString) + 1;
}
else
//0 tells CFString to return NULL if it meets such a character
false, //not an external representation
(UInt8*) szOut,
- nOutSize,
+ nOutSize,
(CFIndex*) &nRealOutSize
);
}
bool IsOk() const
{
- return m_encoding != kCFStringEncodingInvalidId &&
+ return m_encoding != kCFStringEncodingInvalidId &&
CFStringIsEncodingAvailable(m_encoding);
}
Init(CFStringGetSystemEncoding()) ;
}
+#if wxUSE_FONTMAP
wxMBConv_mac(const wxChar* name)
{
- Init( wxMacGetSystemEncFromFontEnc(wxFontMapper::Get()->CharsetToEncoding(name, false) ) ) ;
+ Init( wxMacGetSystemEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) ) ;
}
+#endif
wxMBConv_mac(wxFontEncoding encoding)
{
if ( buf && res < n)
{
buf[res] = 0;
-
+
//we need to double-trip to verify it didn't insert any ? in place
//of bogus characters
wxWCharBuffer wcBuf(n);
wxMBConv_wxwin(const wxChar* name)
{
if (name)
- m_enc = wxFontMapper::Get()->CharsetToEncoding(name, false);
+ m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
else
m_enc = wxFONTENCODING_SYSTEM;
{
size_t inbuf = strlen(psz);
if (buf)
- m2w.Convert(psz,buf);
+ {
+ if (!m2w.Convert(psz,buf))
+ return (size_t)-1;
+ }
return inbuf;
}
{
const size_t inbuf = wxWcslen(psz);
if (buf)
- w2m.Convert(psz,buf);
+ {
+ if (!w2m.Convert(psz,buf))
+ return (size_t)-1;
+ }
return inbuf;
}
#if wxUSE_FONTMAP
if ( name.empty() )
- name = wxFontMapper::Get()->GetEncodingName(m_encoding);
+ name = wxFontMapperBase::Get()->GetEncodingName(m_encoding);
#endif // wxUSE_FONTMAP
wxMBConv_iconv *conv = new wxMBConv_iconv(name);
#endif // wxHAVE_WIN32_MB2WC
#if defined(__WXMAC__)
{
- if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ) )
+ // leave UTF16 and UTF32 to the built-ins of wx
+ if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
+ ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
{
+#if wxUSE_FONTMAP
wxMBConv_mac *conv = m_name ? new wxMBConv_mac(m_name)
: new wxMBConv_mac(m_encoding);
+#else
+ wxMBConv_mac *conv = new wxMBConv_mac(m_encoding);
+#endif
if ( conv->IsOk() )
return conv;
if ( m_name || ( m_encoding <= wxFONTENCODING_UTF16 ) )
{
+#if wxUSE_FONTMAP
wxMBConv_cocoa *conv = m_name ? new wxMBConv_cocoa(m_name)
: new wxMBConv_cocoa(m_encoding);
+#else
+ wxMBConv_cocoa *conv = new wxMBConv_cocoa(m_encoding);
+#endif
if ( conv->IsOk() )
return conv;
// use "false" to suppress interactive dialogs -- we can be called from
// anywhere and popping up a dialog from here is the last thing we want to
// do
- enc = wxFontMapper::Get()->CharsetToEncoding(m_name, false);
+ enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
}
#endif // wxUSE_FONTMAP
m_name ? m_name
:
#if wxUSE_FONTMAP
- wxFontMapper::GetEncodingDescription(m_encoding).c_str()
+ wxFontMapperBase::GetEncodingDescription(m_encoding).c_str()
#else // !wxUSE_FONTMAP
wxString::Format(_("encoding %s"), m_encoding).c_str()
#endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
static wxMBConvUTF7 wxConvUTF7Obj;
static wxMBConvUTF8 wxConvUTF8Obj;
-
WXDLLIMPEXP_DATA_BASE(wxMBConv&) wxConvLibc = wxConvLibcObj;
WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvLocal = wxConvLocalObj;
WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvISO8859_1 = wxConvISO8859_1Obj;
WXDLLIMPEXP_DATA_BASE(wxMBConvUTF7&) wxConvUTF7 = wxConvUTF7Obj;
WXDLLIMPEXP_DATA_BASE(wxMBConvUTF8&) wxConvUTF8 = wxConvUTF8Obj;
WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = &wxConvLibcObj;
+WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName = &
+#ifdef __WXOSX__
+ wxConvUTF8Obj;
+#else
+ wxConvLibcObj;
+#endif
+
#else // !wxUSE_WCHAR_T