summary |
shortlog |
log |
commit | commitdiff |
tree
raw |
patch |
inline | side by side (from parent 1:
894d74d)
to PUA patch. I added UTF8_TO_OCTAL and made
that the default for filename conversion under
GTK2. More adaptation, e.g. to G_FILENAME_ENCODING,
needs to be done.
git-svn-id: https://svn.wxwidgets.org/svn/wx/wxWidgets/trunk@33099
c3d73ce0-8a6f-49c7-b76d-
6d57e0e08775
class WXDLLIMPEXP_BASE wxMBConvUTF8 : public wxMBConv
{
public:
class WXDLLIMPEXP_BASE wxMBConvUTF8 : public wxMBConv
{
public:
+ enum { // option flags passed to the constructor; the converter tests them with bitwise AND against m_options
+ MAP_INVALID_UTF8_NOT = 0, // no mapping: MB2WC returns (size_t)-1 when it meets an invalid sequence
+ MAP_INVALID_UTF8_TO_PUA = 1, // each byte of an invalid sequence is emitted as wxUnicodePUA + byte value
+ MAP_INVALID_UTF8_TO_OCTAL = 2 // each byte of an invalid sequence is emitted as a backslash followed by its octal value
+ };
+
+ wxMBConvUTF8(int options = MAP_INVALID_UTF8_NOT) : m_options(options) { }
virtual size_t MB2WC(wchar_t *outputBuf, const char *psz, size_t outputSize) const;
virtual size_t WC2MB(char *outputBuf, const wchar_t *psz, size_t outputSize) const;
virtual size_t MB2WC(wchar_t *outputBuf, const char *psz, size_t outputSize) const;
virtual size_t WC2MB(char *outputBuf, const wchar_t *psz, size_t outputSize) const;
+
+private:
+ int m_options;
};
// ----------------------------------------------------------------------------
};
// ----------------------------------------------------------------------------
#include <ctype.h>
#include <string.h>
#include <stdlib.h>
#include <ctype.h>
#include <string.h>
#include <stdlib.h>
+#ifdef HAVE_LANGINFO_H
+ #include <langinfo.h>
+#endif
#if defined(__WIN32__) && !defined(__WXMICROWIN__)
#define wxHAVE_WIN32_MB2WC
#if defined(__WIN32__) && !defined(__WXMICROWIN__)
#define wxHAVE_WIN32_MB2WC
// wxConvBrokenFileNames is made for GTK2 in Unicode mode when
// files are accidentally written in an encoding which is not
// the system encoding. Typically, the system encoding will be
// wxConvBrokenFileNames is made for GTK2 in Unicode mode when
// files are accidentally written in an encoding which is not
// the system encoding. Typically, the system encoding will be
-// UTF8 but there might be files stored in ISO8859-1 in disk.
+// UTF8 but there might be files stored in ISO8859-1 on disk.
// ----------------------------------------------------------------------------
class wxConvBrokenFileNames: public wxMBConvLibc
{
public:
// ----------------------------------------------------------------------------
class wxConvBrokenFileNames: public wxMBConvLibc
{
public:
+ wxConvBrokenFileNames() : m_utf8conv(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL) { }
virtual size_t MB2WC(wchar_t *outputBuf, const char *psz, size_t outputSize) const;
virtual size_t WC2MB(char *outputBuf, const wchar_t *psz, size_t outputSize) const;
virtual size_t MB2WC(wchar_t *outputBuf, const char *psz, size_t outputSize) const;
virtual size_t WC2MB(char *outputBuf, const wchar_t *psz, size_t outputSize) const;
+ inline bool UseUTF8() const;
+private:
+ wxMBConvUTF8 m_utf8conv;
+bool wxConvBrokenFileNames::UseUTF8() const // returns true only when nl_langinfo(CODESET) reports exactly "UTF-8"
+{
+#if defined HAVE_LANGINFO_H && defined CODESET
+ char *codeset = nl_langinfo(CODESET); // queried on every call; the result is not cached here
+ return strcmp(codeset, "UTF-8") == 0; // exact, case-sensitive comparison; NOTE(review): spellings such as "utf8" would not match
+#else
+ return false; // langinfo support or CODESET not available at build time: always report non-UTF-8
+#endif
+}
+
size_t wxConvBrokenFileNames::MB2WC(wchar_t *outputBuf, const char *psz, size_t outputSize) const
{
size_t wxConvBrokenFileNames::MB2WC(wchar_t *outputBuf, const char *psz, size_t outputSize) const
{
-#if 0
- if (we find some invalid characters)
- {
- Convert to Unicode range.
- }
+ if (UseUTF8())
+ return m_utf8conv.MB2WC( outputBuf, psz, outputSize );
-#endif
- return wxMBConvLibc::MB2WC( outputBuf, psz, outputSize );
+ return wxMBConvLibc::MB2WC( outputBuf, psz, outputSize );
}
size_t wxConvBrokenFileNames::WC2MB(char *outputBuf, const wchar_t *psz, size_t outputSize) const
{
}
size_t wxConvBrokenFileNames::WC2MB(char *outputBuf, const wchar_t *psz, size_t outputSize) const
{
-#if 0
- Convert back from Unicode range.
-#endif
- return wxMBConvLibc::WC2MB( outputBuf, psz, outputSize );
+ if (UseUTF8())
+ return m_utf8conv.WC2MB( outputBuf, psz, outputSize );
+ else
+ return wxMBConvLibc::WC2MB( outputBuf, psz, outputSize );
}
// ----------------------------------------------------------------------------
}
// ----------------------------------------------------------------------------
static wxUint32 utf8_max[]=
{ 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
static wxUint32 utf8_max[]=
{ 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
+const wxUint32 wxUnicodePUA = 0x100000; // base value: with MAP_INVALID_UTF8_TO_PUA set, MB2WC emits an invalid input byte b as wxUnicodePUA + b
+const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256; // exclusive upper bound of that range (one slot per possible byte value)
+
size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const
{
size_t len = 0;
while (*psz && ((!buf) || (len < n)))
{
size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const
{
size_t len = 0;
while (*psz && ((!buf) || (len < n)))
{
+ const char *opsz = psz;
+ bool invalid = false;
unsigned char cc = *psz++, fc = cc;
unsigned cnt;
for (cnt = 0; fc & 0x80; cnt++)
unsigned char cc = *psz++, fc = cc;
unsigned cnt;
for (cnt = 0; fc & 0x80; cnt++)
if (!cnt)
{
// invalid UTF-8 sequence
if (!cnt)
{
// invalid UTF-8 sequence
wxUint32 res = cc & (0x3f >> cnt);
while (cnt--)
{
wxUint32 res = cc & (0x3f >> cnt);
while (cnt--)
{
if ((cc & 0xC0) != 0x80)
{
// invalid UTF-8 sequence
if ((cc & 0xC0) != 0x80)
{
// invalid UTF-8 sequence
+ invalid = true;
+ break;
res = (res << 6) | (cc & 0x3f);
}
res = (res << 6) | (cc & 0x3f);
}
- if (res <= utf8_max[ocnt])
+ if (invalid || res <= utf8_max[ocnt])
{
// illegal UTF-8 encoding
{
// illegal UTF-8 encoding
+ else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
+ res >= wxUnicodePUA && res < wxUnicodePUAEnd)
+ {
+ // if one of our PUA characters turns up externally
+ // it must also be treated as an illegal sequence
+ // (a bit like you have to escape an escape character)
+ invalid = true;
+ }
+ else
+ {
- // cast is ok because wchar_t == wxUuint16 if WC_UTF16
- size_t pa = encode_utf16(res, (wxUint16 *)buf);
- if (pa == (size_t)-1)
- return (size_t)-1;
- if (buf)
- buf += pa;
- len += pa;
+ // cast is ok because wchar_t == wxUint16 if WC_UTF16
+ size_t pa = encode_utf16(res, (wxUint16 *)buf);
+ if (pa == (size_t)-1)
+ {
+ invalid = true;
+ }
+ else
+ {
+ if (buf)
+ buf += pa;
+ len += pa;
+ }
- if (buf)
- *buf++ = res;
- len++;
+ if (buf)
+ *buf++ = res;
+ len++;
#endif // WC_UTF16/!WC_UTF16
#endif // WC_UTF16/!WC_UTF16
+ }
+ }
+ if (invalid)
+ {
+ if (m_options & MAP_INVALID_UTF8_TO_PUA)
+ {
+ while (opsz < psz && (!buf || len < n))
+ {
+#ifdef WC_UTF16
+ // cast is ok because wchar_t == wxUint16 if WC_UTF16
+ size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
+ wxASSERT(pa != (size_t)-1);
+ if (buf)
+ buf += pa;
+ opsz++;
+ len += pa;
+#else
+ if (buf)
+ *buf++ = wxUnicodePUA + (unsigned char)*opsz;
+ opsz++;
+ len++;
+#endif
+ }
+ }
+ else
+ if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
+ {
+ while (opsz < psz && (!buf || len < n))
+ {
+ wchar_t str[6];
+ wxSnprintf( str, 5, L"\\%o", (int) (unsigned char) *opsz );
+ if (buf)
+ *buf++ = str[0];
+ if (buf)
+ *buf++ = str[1];
+ if (buf)
+ *buf++ = str[2];
+ if (buf)
+ *buf++ = str[3];
+ opsz++;
+ len += 4;
+ }
+ }
+ else
+ {
+ return (size_t)-1;
+ }
#else
cc=(*psz++) & 0x7fffffff;
#endif
#else
cc=(*psz++) & 0x7fffffff;
#endif
- unsigned cnt;
- for (cnt = 0; cc > utf8_max[cnt]; cnt++) {}
- if (!cnt)
+ if ((m_options & MAP_INVALID_UTF8_TO_PUA)
+ && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd)
+ *buf++ = (char)(cc - wxUnicodePUA);
+ if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
+ && cc == L'\\')
+ wchar_t str[4];
+ str[0] = *psz; psz++;
+ str[1] = *psz; psz++;
+ str[2] = *psz; psz++;
+ str[3] = 0;
+ int octal;
+ wxSscanf( str, L"%o", &octal );
+ *buf++ = (char) octal;
+ len++;
+ }
+ else
+ {
+ unsigned cnt;
+ for (cnt = 0; cc > utf8_max[cnt]; cnt++) {}
+ if (!cnt)
- *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
- while (cnt--)
- *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
+ // plain ASCII char
+ if (buf)
+ *buf++ = (char) cc;
+ len++;
+ }
+
+ else
+ {
+ len += cnt + 1;
+ if (buf)
+ {
+ *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
+ while (cnt--)
+ *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
+ }
// ----------------------------------------------------------------------------
// UTF-16
// ----------------------------------------------------------------------------
// ----------------------------------------------------------------------------
// UTF-16
// ----------------------------------------------------------------------------
static wxCSConv wxConvISO8859_1Obj(wxFONTENCODING_ISO8859_1);
static wxMBConvUTF7 wxConvUTF7Obj;
static wxMBConvUTF8 wxConvUTF8Obj;
static wxCSConv wxConvISO8859_1Obj(wxFONTENCODING_ISO8859_1);
static wxMBConvUTF7 wxConvUTF7Obj;
static wxMBConvUTF8 wxConvUTF8Obj;
+static wxConvBrokenFileNames wxConvBrokenFileNamesObj;
WXDLLIMPEXP_DATA_BASE(wxMBConv&) wxConvLibc = wxConvLibcObj;
WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvLocal = wxConvLocalObj;
WXDLLIMPEXP_DATA_BASE(wxMBConv&) wxConvLibc = wxConvLibcObj;
WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvLocal = wxConvLocalObj;
WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = &wxConvLibcObj;
WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName = &
#ifdef __WXOSX__
WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = &wxConvLibcObj;
WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName = &
#ifdef __WXOSX__
+ wxConvUTF8Obj;
+#elif __WXGTK20__
+ wxConvBrokenFileNamesObj;