]> git.saurik.com Git - wxWidgets.git/blobdiff - src/common/strconv.cpp
fixed compilation in non-Unicode build; fixed bug with buffer overrun in wxMBConvUTF8...
[wxWidgets.git] / src / common / strconv.cpp
index 76282821dbe1bd887e5d727add6bee9f62c2d382..db0147d6bc7706eaff14e6e4a6c3a1bb92ef2f95 100644 (file)
@@ -55,6 +55,9 @@
 #include <ctype.h>
 #include <string.h>
 #include <stdlib.h>
+#ifdef HAVE_LANGINFO_H
+  #include <langinfo.h>
+#endif
 
 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
     #define wxHAVE_WIN32_MB2WC
@@ -278,7 +281,7 @@ const wxWCharBuffer wxMBConv::cMB2WC(const char *szString, size_t nStringLen, si
 
     //success - return actual length and the buffer
     *pOutSize = nActualLength;
-    return theBuffer;  
+    return theBuffer;
 }
 
 const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *szString, size_t nStringLen, size_t* pOutSize) const
@@ -313,7 +316,7 @@ const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *szString, size_t nStringLen,
 
         //Increase the actual length (+1 for current null character)
         nActualLength += nLen + 1;
-        
+
         //if buffer too big, realloc the buffer
         if (nActualLength > (nCurrentSize+1))
         {
@@ -340,7 +343,7 @@ const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *szString, size_t nStringLen,
 
     //success - return actual length and the buffer
     *pOutSize = nActualLength;
-    return theBuffer;  
+    return theBuffer;
 }
 
 // ----------------------------------------------------------------------------
@@ -356,8 +359,53 @@ size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 {
     return wxWC2MB(buf, psz, n);
 }
+
 // ----------------------------------------------------------------------------
-// UTF-7 
+// wxConvBrokenFileNames is made for GTK2 in Unicode mode when
+// files are accidentally written in an encoding which is not
+// the system encoding. Typically, the system encoding will be
+// UTF8 but there might be files stored in ISO8859-1 on disk.
+// ----------------------------------------------------------------------------
+
+class wxConvBrokenFileNames: public wxMBConvLibc // converter that delegates either to a UTF-8 converter or to the wxMBConvLibc base class, depending on UseUTF8()
+{
+public:
+    wxConvBrokenFileNames() : m_utf8conv(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL) { } // the embedded UTF-8 converter is created with the MAP_INVALID_UTF8_TO_OCTAL option
+    virtual size_t MB2WC(wchar_t *outputBuf, const char *psz, size_t outputSize) const; // multibyte -> wide character conversion
+    virtual size_t WC2MB(char *outputBuf, const wchar_t *psz, size_t outputSize) const; // wide character -> multibyte conversion
+    inline bool UseUTF8() const; // true if m_utf8conv should be used instead of the libc conversion
+private:
+    wxMBConvUTF8 m_utf8conv; // converter both conversion methods forward to when UseUTF8() returns true
+};
+
+bool wxConvBrokenFileNames::UseUTF8() const // reports whether nl_langinfo(CODESET) names the codeset "UTF-8"
+{
+#if defined HAVE_LANGINFO_H && defined CODESET // the codeset is only queried when both macros are defined
+    char *codeset = nl_langinfo(CODESET); // queried on every call, the result is not cached
+    return strcmp(codeset, "UTF-8") == 0; // exact, case-sensitive comparison; NOTE(review): other spellings such as "utf8" would not match -- confirm this is intended
+#else
+    return false; // the codeset cannot be queried here, so the UTF-8 converter is never selected
+#endif
+}
+
+size_t wxConvBrokenFileNames::MB2WC(wchar_t *outputBuf, const char *psz, size_t outputSize) const // multibyte -> wide conversion; all arguments and the result are passed through unchanged
+{
+    if (UseUTF8()) // decided anew on each call
+        return m_utf8conv.MB2WC( outputBuf, psz, outputSize ); // use the embedded UTF-8 converter
+    else
+        return wxMBConvLibc::MB2WC( outputBuf, psz, outputSize ); // otherwise fall back to the base class conversion
+}
+
+size_t wxConvBrokenFileNames::WC2MB(char *outputBuf, const wchar_t *psz, size_t outputSize) const // wide -> multibyte conversion; all arguments and the result are passed through unchanged
+{
+    if (UseUTF8()) // decided anew on each call
+        return m_utf8conv.WC2MB( outputBuf, psz, outputSize ); // use the embedded UTF-8 converter
+    else
+        return wxMBConvLibc::WC2MB( outputBuf, psz, outputSize ); // otherwise fall back to the base class conversion
+}
+
+// ----------------------------------------------------------------------------
+// UTF-7
 // ----------------------------------------------------------------------------
 
 // Implementation (C) 2004 Fredrik Roubert
@@ -567,12 +615,19 @@ size_t wxMBConvUTF7::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 static wxUint32 utf8_max[]=
     { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
 
+// boundaries of the private use area range we use to (temporarily) remap
+// bytes which are invalid in a UTF-8 encoded string
+const wxUint32 wxUnicodePUA = 0x100000; // first code point of the range; an invalid byte b is stored as wxUnicodePUA + b
+const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256; // exclusive upper bound: one code point per possible byte value
+
 size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 {
     size_t len = 0;
 
     while (*psz && ((!buf) || (len < n)))
     {
+        const char *opsz = psz;
+        bool invalid = false;
         unsigned char cc = *psz++, fc = cc;
         unsigned cnt;
         for (cnt = 0; fc & 0x80; cnt++)
@@ -590,7 +645,7 @@ size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const
             if (!cnt)
             {
                 // invalid UTF-8 sequence
-                return (size_t)-1;
+                invalid = true;
             }
             else
             {
@@ -598,32 +653,93 @@ size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const
                 wxUint32 res = cc & (0x3f >> cnt);
                 while (cnt--)
                 {
-                    cc = *psz++;
+                    cc = *psz;
                     if ((cc & 0xC0) != 0x80)
                     {
                         // invalid UTF-8 sequence
-                        return (size_t)-1;
+                        invalid = true;
+                        break;
                     }
+                    psz++;
                     res = (res << 6) | (cc & 0x3f);
                 }
-                if (res <= utf8_max[ocnt])
+                if (invalid || res <= utf8_max[ocnt])
                 {
                     // illegal UTF-8 encoding
-                    return (size_t)-1;
+                    invalid = true;
+                }
+                else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
+                        res >= wxUnicodePUA && res < wxUnicodePUAEnd)
+                {
+                    // if one of our PUA characters turns up externally
+                    // it must also be treated as an illegal sequence
+                    // (a bit like you have to escape an escape character)
+                    invalid = true;
                 }
+                else
+                {
 #ifdef WC_UTF16
-                // cast is ok because wchar_t == wxUuint16 if WC_UTF16
-                size_t pa = encode_utf16(res, (wxUint16 *)buf);
-                if (pa == (size_t)-1)
-                  return (size_t)-1;
-                if (buf)
-                    buf += pa;
-                len += pa;
+                    // cast is ok because wchar_t == wxUint16 if WC_UTF16
+                    size_t pa = encode_utf16(res, (wxUint16 *)buf);
+                    if (pa == (size_t)-1)
+                    {
+                        invalid = true;
+                    }
+                    else
+                    {
+                        if (buf)
+                            buf += pa;
+                        len += pa;
+                    }
 #else // !WC_UTF16
-                if (buf)
-                    *buf++ = res;
-                len++;
+                    if (buf)
+                        *buf++ = res;
+                    len++;
 #endif // WC_UTF16/!WC_UTF16
+                }
+            }
+            if (invalid)
+            {
+                if (m_options & MAP_INVALID_UTF8_TO_PUA)
+                {
+                    while (opsz < psz && (!buf || len < n))
+                    {
+#ifdef WC_UTF16
+                        // cast is ok because wchar_t == wxUint16 if WC_UTF16
+                        size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
+                        wxASSERT(pa != (size_t)-1);
+                        if (buf)
+                            buf += pa;
+                        opsz++;
+                        len += pa;
+#else
+                        if (buf)
+                            *buf++ = wxUnicodePUA + (unsigned char)*opsz;
+                        opsz++;
+                        len++;
+#endif
+                    }
+                }
+                else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
+                {
+                    while (opsz < psz && (!buf || len < n))
+                    {
+                        if ( buf && len + 3 < n )
+                        {
+                            unsigned char n = *opsz;
+                            *buf++ = L'\\';
+                            *buf++ = L'0' + n / 0100;
+                            *buf++ = L'0' + (n % 0100) / 010;
+                            *buf++ = L'0' + n % 010;
+                        }
+                        opsz++;
+                        len += 4;
+                    }
+                }
+                else // MAP_INVALID_UTF8_NOT
+                {
+                    return (size_t)-1;
+                }
             }
         }
     }
@@ -632,6 +748,11 @@ size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const
     return len;
 }
 
+static inline bool isoctal(wchar_t wch) // returns true if wch is one of the octal digits L'0' .. L'7'
+{
+    return L'0' <= wch && wch <= L'7'; // inclusive range check on both ends
+}
+
 size_t wxMBConvUTF8::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 {
     size_t len = 0;
@@ -646,36 +767,59 @@ size_t wxMBConvUTF8::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 #else
         cc=(*psz++) & 0x7fffffff;
 #endif
-        unsigned cnt;
-        for (cnt = 0; cc > utf8_max[cnt]; cnt++) {}
-        if (!cnt)
+
+        if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
+                && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
         {
-            // plain ASCII char
             if (buf)
-                *buf++ = (char) cc;
+                *buf++ = (char)(cc - wxUnicodePUA);
             len++;
         }
+        else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
+                    cc == L'\\' &&
+                        isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
+        {
+            if (buf)
+            {
+                *buf++ = (char) (psz[0] - L'0')*0100 +
+                                (psz[1] - L'0')*010 +
+                                (psz[2] - L'0');
+            }
 
+            psz += 3;
+            len++;
+        }
         else
         {
-            len += cnt + 1;
-            if (buf)
+            unsigned cnt;
+            for (cnt = 0; cc > utf8_max[cnt]; cnt++) {}
+            if (!cnt)
             {
-                *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
-                while (cnt--)
-                    *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
+                // plain ASCII char
+                if (buf)
+                    *buf++ = (char) cc;
+                len++;
+            }
+
+            else
+            {
+                len += cnt + 1;
+                if (buf)
+                {
+                    *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
+                    while (cnt--)
+                        *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
+                }
             }
         }
     }
 
-    if (buf && (len<n)) *buf = 0;
+    if (buf && (len<n))
+        *buf = 0;
 
     return len;
 }
 
-
-
-
 // ----------------------------------------------------------------------------
 // UTF-16
 // ----------------------------------------------------------------------------
@@ -1309,7 +1453,7 @@ size_t wxMBConv_iconv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
     //     as MB<->WC conversion would fail "randomly".
     wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
 #endif
+
     size_t inbuf = strlen(psz);
     size_t outbuf = n * SIZEOF_WCHAR_T;
     size_t res, cres;
@@ -1371,7 +1515,7 @@ size_t wxMBConv_iconv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
     // NB: explained in MB2WC
     wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
 #endif
-    
+
     size_t inbuf = wxWcslen(psz) * SIZEOF_WCHAR_T;
     size_t outbuf = n;
     size_t res, cres;
@@ -1899,7 +2043,7 @@ public:
 #if wxUSE_FONTMAP
     wxMBConv_cocoa(const wxChar* name)
     {
-        Init( wxCFStringEncFromFontEnc(wxFontMapper::Get()->CharsetToEncoding(name, false) ) ) ;
+        Init( wxCFStringEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) ) ;
     }
 #endif
 
@@ -1944,9 +2088,9 @@ public:
 #if SIZEOF_WCHAR_T == 4
         UniChar* szUniCharBuffer = new UniChar[nOutSize];
 #endif
+
         CFStringGetCharacters(theString, theRange, szUniCharBuffer);
-        
+
         CFRelease(theString);
 
         szUniCharBuffer[nOutLength] = '\0' ;
@@ -1956,14 +2100,14 @@ public:
         converter.MB2WC(szOut, (const char*)szUniCharBuffer , nOutSize ) ;
         delete[] szUniCharBuffer;
 #endif
-    
+
         return nOutLength;
     }
 
     size_t WC2MB(char *szOut, const wchar_t *szUnConv, size_t nOutSize) const
     {
         wxASSERT(szUnConv);
-        
+
         size_t nRealOutSize;
         size_t nBufSize = wxWcslen(szUnConv);
         UniChar* szUniBuffer = (UniChar*) szUnConv;
@@ -1991,7 +2135,7 @@ public:
         {
             if (szOut != NULL)
                 CFStringGetCharacters(theString, CFRangeMake(0, nOutSize - 1), (UniChar*) szOut);
-            
+
             nRealOutSize = CFStringGetLength(theString) + 1;
         }
         else
@@ -2004,7 +2148,7 @@ public:
                     //0 tells CFString to return NULL if it meets such a character
                 false, //not an external representation
                 (UInt8*) szOut,
-                nOutSize, 
+                nOutSize,
                 (CFIndex*) &nRealOutSize
                         );
         }
@@ -2020,7 +2164,7 @@ public:
 
     bool IsOk() const
     {
-        return m_encoding != kCFStringEncodingInvalidId && 
+        return m_encoding != kCFStringEncodingInvalidId &&
               CFStringIsEncodingAvailable(m_encoding);
     }
 
@@ -2047,7 +2191,7 @@ public:
 #if wxUSE_FONTMAP
     wxMBConv_mac(const wxChar* name)
     {
-        Init( wxMacGetSystemEncFromFontEnc(wxFontMapper::Get()->CharsetToEncoding(name, false) ) ) ;
+        Init( wxMacGetSystemEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) ) ;
     }
 #endif
 
@@ -2158,7 +2302,7 @@ public:
         if ( buf  && res < n)
         {
             buf[res] = 0;
-            
+
             //we need to double-trip to verify it didn't insert any ? in place
             //of bogus characters
             wxWCharBuffer wcBuf(n);
@@ -2210,7 +2354,7 @@ public:
     wxMBConv_wxwin(const wxChar* name)
     {
         if (name)
-            m_enc = wxFontMapper::Get()->CharsetToEncoding(name, false);
+            m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
         else
             m_enc = wxFONTENCODING_SYSTEM;
 
@@ -2369,7 +2513,7 @@ wxMBConv *wxCSConv::DoCreate() const
 
 #if wxUSE_FONTMAP
         if ( name.empty() )
-            name = wxFontMapper::Get()->GetEncodingName(m_encoding);
+            name = wxFontMapperBase::Get()->GetEncodingName(m_encoding);
 #endif // wxUSE_FONTMAP
 
         wxMBConv_iconv *conv = new wxMBConv_iconv(name);
@@ -2396,7 +2540,9 @@ wxMBConv *wxCSConv::DoCreate() const
 #endif // wxHAVE_WIN32_MB2WC
 #if defined(__WXMAC__)
     {
-        if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ) )
+        // leave UTF16 and UTF32 to the built-ins of wx
+        if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
+            ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
         {
 
 #if wxUSE_FONTMAP
@@ -2438,7 +2584,7 @@ wxMBConv *wxCSConv::DoCreate() const
         // use "false" to suppress interactive dialogs -- we can be called from
         // anywhere and popping up a dialog from here is the last thing we want to
         // do
-        enc = wxFontMapper::Get()->CharsetToEncoding(m_name, false);
+        enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
     }
 #endif // wxUSE_FONTMAP
 
@@ -2494,7 +2640,7 @@ wxMBConv *wxCSConv::DoCreate() const
                    m_name ? m_name
                       :
 #if wxUSE_FONTMAP
-                         wxFontMapper::GetEncodingDescription(m_encoding).c_str()
+                         wxFontMapperBase::GetEncodingDescription(m_encoding).c_str()
 #else // !wxUSE_FONTMAP
                          wxString::Format(_("encoding %s"), m_encoding).c_str()
 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
@@ -2590,7 +2736,7 @@ static wxCSConv wxConvLocalObj(wxFONTENCODING_SYSTEM);
 static wxCSConv wxConvISO8859_1Obj(wxFONTENCODING_ISO8859_1);
 static wxMBConvUTF7 wxConvUTF7Obj;
 static wxMBConvUTF8 wxConvUTF8Obj;
-
+static wxConvBrokenFileNames wxConvBrokenFileNamesObj;
 
 WXDLLIMPEXP_DATA_BASE(wxMBConv&) wxConvLibc = wxConvLibcObj;
 WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvLocal = wxConvLocalObj;
@@ -2598,6 +2744,15 @@ WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvISO8859_1 = wxConvISO8859_1Obj;
 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF7&) wxConvUTF7 = wxConvUTF7Obj;
 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF8&) wxConvUTF8 = wxConvUTF8Obj;
 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = &wxConvLibcObj;
+WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName = & // converter pointer for file names; the object it points to is chosen at compile time below
+#ifdef __WXOSX__
+                                    wxConvUTF8Obj; // __WXOSX__: the plain UTF-8 converter object
+#elif __WXGTK20__ // NOTE(review): tests the macro's value, unlike the #ifdef above; "#elif defined(__WXGTK20__)" may have been intended -- confirm
+                                    wxConvBrokenFileNamesObj; // __WXGTK20__: the wxConvBrokenFileNames object
+#else
+                                    wxConvLibcObj; // all other builds: the libc converter object
+#endif
+
 
 #else // !wxUSE_WCHAR_T