X-Git-Url: https://git.saurik.com/wxWidgets.git/blobdiff_plain/830f8f11bca5c0892ae767ba14790c8b5b59011f..000c2be40d8e0231f6565ba2931f4c88add433d9:/src/common/convauto.cpp

diff --git a/src/common/convauto.cpp b/src/common/convauto.cpp
index d43bb6d1ba..756e81c233 100644
--- a/src/common/convauto.cpp
+++ b/src/common/convauto.cpp
@@ -23,69 +23,116 @@
     #pragma hdrstop
 #endif
 
-#if wxUSE_WCHAR_T
-
 #ifndef WX_PRECOMP
+    #include "wx/wx.h"
 #endif //WX_PRECOMP
 
 #include "wx/convauto.h"
 
+// we use latin1 by default as it seems the least bad choice: the files whose
+// encoding we need to detect don't always come from the user's system (they
+// are often received from other machines), so using wxFONTENCODING_SYSTEM
+// doesn't seem to be a good idea and there is no other reasonable alternative
+wxFontEncoding wxConvAuto::ms_defaultMBEncoding = wxFONTENCODING_ISO8859_1;
+
 // ============================================================================
 // implementation
 // ============================================================================
 
 /* static */
-wxConvAuto::BOMType wxConvAuto::DetectBOM(const char *src, size_t srcLen)
+void wxConvAuto::SetFallbackEncoding(wxFontEncoding enc)
 {
-    if ( srcLen < 2 )
-    {
-        // minimal BOM is 2 bytes so bail out immediately and simplify the code
-        // below which wouldn't need to check for length for UTF-16 cases
-        return BOM_None;
-    }
+    wxASSERT_MSG( enc != wxFONTENCODING_DEFAULT,
+                  wxT("wxFONTENCODING_DEFAULT doesn't make sense here") );
+
+    ms_defaultMBEncoding = enc;
+}
 
+/* static */
+wxConvAuto::BOMType wxConvAuto::DetectBOM(const char *src, size_t srcLen)
+{
     // examine the buffer for BOM presence
     //
-    // see http://www.unicode.org/faq/utf_bom.html#BOM
-    switch ( *src++ )
+    // quoting from http://www.unicode.org/faq/utf_bom.html#BOM:
+    //
+    //  Bytes           Encoding Form
+    //
+    //  00 00 FE FF     UTF-32, big-endian
+    //  FF FE 00 00     UTF-32, little-endian
+    //  FE FF           UTF-16, big-endian
+    //  FF FE           UTF-16, little-endian
+    //  EF BB BF        UTF-8
+    //
+    // as some BOMs are prefixes of other ones we may need to read more bytes
+    // to disambiguate them
+
+    switch ( srcLen )
     {
-        case '\0':
-            // could only be big endian UTF-32 (00 00 FE FF)
-            if ( srcLen >= 4 &&
-                    src[0] == '\0' &&
-                        src[1] == '\xfe' &&
-                            src[2] == '\xff' )
+        case 0:
+            return BOM_Unknown;
+
+        case 1:
+            if ( src[0] == '\x00' || src[0] == '\xFF' ||
+                 src[0] == '\xFE' || src[0] == '\xEF')
             {
-                return BOM_UTF32BE;
+                // this could be a BOM but we don't know yet
+                return BOM_Unknown;
             }
             break;
 
-        case '\xfe':
-            // could only be big endian UTF-16 (FE FF)
-            if ( *src++ == '\xff' )
+        case 2:
+        case 3:
+            if ( src[0] == '\xEF' && src[1] == '\xBB' )
             {
-                return BOM_UTF16BE;
+                if ( srcLen == 3 )
+                    return src[2] == '\xBF' ? BOM_UTF8 : BOM_None;
+
+                return BOM_Unknown;
             }
-            break;
 
-        case '\xff':
-            // could be either little endian UTF-16 or UTF-32, both start
-            // with FF FE
-            if ( *src++ == '\xfe' )
+            if ( src[0] == '\xFE' && src[1] == '\xFF' )
+                return BOM_UTF16BE;
+
+            if ( src[0] == '\xFF' && src[1] == '\xFE' )
             {
-                return srcLen >= 4 && src[0] == '\0' && src[1] == '\0'
-                            ? BOM_UTF32LE
-                            : BOM_UTF16LE;
+                // if the next byte is 0, it could be a UTF-32LE BOM but if it
+                // isn't we can be sure it's UTF-16LE
+                if ( srcLen == 3 && src[2] != '\x00' )
+                    return BOM_UTF16LE;
+
+                return BOM_Unknown;
             }
-            break;
 
-        case '\xef':
-            // is this UTF-8 BOM (EF BB BF)?
-            if ( srcLen >= 3 && src[0] == '\xbb' && src[1] == '\xbf' )
+            if ( src[0] == '\x00' && src[1] == '\x00' )
             {
-                return BOM_UTF8;
+                // this could only be UTF-32BE, check that the data we have so
+                // far allows for it
+                if ( srcLen == 3 && src[2] != '\xFE' )
+                    return BOM_None;
+
+                return BOM_Unknown;
             }
             break;
+
+        default:
+            // we have at least 4 characters so we may finally decide whether
+            // we have a BOM or not
+            if ( src[0] == '\xEF' && src[1] == '\xBB' && src[2] == '\xBF' )
+                return BOM_UTF8;
+
+            if ( src[0] == '\x00' && src[1] == '\x00' &&
+                 src[2] == '\xFE' && src[3] == '\xFF' )
+                return BOM_UTF32BE;
+
+            if ( src[0] == '\xFF' && src[1] == '\xFE' &&
+                 src[2] == '\x00' && src[3] == '\x00' )
+                return BOM_UTF32LE;
+
+            if ( src[0] == '\xFE' && src[1] == '\xFF' )
+                return BOM_UTF16BE;
+
+            if ( src[0] == '\xFF' && src[1] == '\xFE' )
+                return BOM_UTF16LE;
     }
 
     return BOM_None;
@@ -97,6 +144,14 @@ void wxConvAuto::InitFromBOM(BOMType bomType)
 
     switch ( bomType )
     {
+        case BOM_Unknown:
+            wxFAIL_MSG( "shouldn't be called for this BOM type" );
+            break;
+
+        case BOM_None:
+            // use the default
+            break;
+
         case BOM_UTF32BE:
             m_conv = new wxMBConvUTF32BE;
             m_ownsConv = true;
@@ -118,17 +173,20 @@ void wxConvAuto::InitFromBOM(BOMType bomType)
             break;
 
         case BOM_UTF8:
-            m_conv = &wxConvUTF8;
-            m_ownsConv = false;
+            InitWithUTF8();
             break;
 
         default:
-            wxFAIL_MSG( _T("unexpected BOM type") );
-            // fall through: still need to create something
+            wxFAIL_MSG( "unknown BOM type" );
+    }
 
-        case BOM_None:
-            InitWithDefault();
-            m_consumedBOM = true; // as there is nothing to consume
+    if ( !m_conv )
+    {
+        // we end up here if there is no BOM or we didn't recognize it somehow
+        // (this shouldn't happen but still don't crash if it does), so use
+        // UTF-8 by default
+        InitWithUTF8();
+        m_consumedBOM = true; // as there is nothing to consume
     }
 }
 
@@ -137,6 +195,14 @@ void wxConvAuto::SkipBOM(const char **src, size_t *len) const
     int ofs;
     switch ( m_bomType )
     {
+        case BOM_Unknown:
+            wxFAIL_MSG( "shouldn't be called for this BOM type" );
+            return;
+
+        case BOM_None:
+            ofs = 0;
+            break;
+
         case BOM_UTF32BE:
         case BOM_UTF32LE:
             ofs = 4;
@@ -152,11 +218,8 @@ void wxConvAuto::SkipBOM(const char **src, size_t *len) const
             break;
 
         default:
-            wxFAIL_MSG( _T("unexpected BOM type") );
-            // fall through: still need to create something
-
-        case BOM_None:
-            ofs = 0;
+            wxFAIL_MSG( "unknown BOM type" );
+            return;
     }
 
     *src += ofs;
@@ -164,11 +227,15 @@ void wxConvAuto::SkipBOM(const char **src, size_t *len) const
         *len -= ofs;
 }
 
-void wxConvAuto::InitFromInput(const char **src, size_t *len)
+bool wxConvAuto::InitFromInput(const char *src, size_t len)
 {
-    m_bomType = DetectBOM(*src, *len);
+    m_bomType = DetectBOM(src, len == wxNO_LEN ? strlen(src) : len);
+    if ( m_bomType == BOM_Unknown )
+        return false;
+
     InitFromBOM(m_bomType);
-    SkipBOM(src, len);
+
+    return true;
 }
 
 size_t
@@ -180,21 +247,61 @@ wxConvAuto::ToWChar(wchar_t *dst, size_t dstLen,
     // during this initial call but also during the first call with non-NULL
     // dst as typically we're first called with NULL dst to calculate the
     // needed buffer size
-    wxConvAuto *self = wx_const_cast(wxConvAuto *, this);
+    wxConvAuto *self = const_cast<wxConvAuto *>(this);
+
+
     if ( !m_conv )
     {
-        self->InitFromInput(&src, &srcLen);
-        if ( dst )
-            self->m_consumedBOM = true;
+        if ( !self->InitFromInput(src, srcLen) )
+        {
+            // there is not enough data to determine whether we have a BOM or
+            // not, so fail for now -- the caller is supposed to call us again
+            // with more data
+            return wxCONV_FAILED;
+        }
     }
 
-    if ( !m_consumedBOM && dst )
+    if ( !m_consumedBOM )
     {
-        self->m_consumedBOM = true;
         SkipBOM(&src, &srcLen);
+        if ( srcLen == 0 )
+        {
+            // there is nothing left except the BOM so we'd return 0 below but
+            // this is unexpected: decoding a non-empty string must either fail
+            // or return something non-empty, in particular this would break
+            // the code in wxTextInputStream::NextChar()
+            //
+            // so still return an error as we need some more data to be able to
+            // decode it
+            return wxCONV_FAILED;
+        }
     }
 
-    return m_conv->ToWChar(dst, dstLen, src, srcLen);
+    // try to convert using the auto-detected encoding
+    size_t rc = m_conv->ToWChar(dst, dstLen, src, srcLen);
+    if ( rc == wxCONV_FAILED && m_bomType == BOM_None )
+    {
+        // if the conversion failed but we didn't really detect anything and
+        // simply tried UTF-8 by default, retry it using the fall-back
+        if ( m_encDefault != wxFONTENCODING_MAX )
+        {
+            if ( m_ownsConv )
+                delete m_conv;
+
+            self->m_conv = new wxCSConv(m_encDefault == wxFONTENCODING_DEFAULT
+                                            ? GetFallbackEncoding()
+                                            : m_encDefault);
+            self->m_ownsConv = true;
+
+            rc = m_conv->ToWChar(dst, dstLen, src, srcLen);
+        }
+    }
+
+    // don't skip the BOM again the next time if we really consumed it
+    if ( rc != wxCONV_FAILED && dst && !m_consumedBOM )
+        self->m_consumedBOM = true;
+
+    return rc;
 }
 
 size_t
@@ -204,11 +311,8 @@ wxConvAuto::FromWChar(char *dst, size_t dstLen,
     if ( !m_conv )
     {
         // default to UTF-8 for the multibyte output
-        wx_const_cast(wxConvAuto *, this)->InitWithDefault();
+        const_cast<wxConvAuto *>(this)->InitWithUTF8();
     }
 
     return m_conv->FromWChar(dst, dstLen, src, srcLen);
 }
-
-#endif // wxUSE_WCHAR_T
-