From 4cb0e8d05cadea6be3a7bd93f1fea9a9e0df95f0 Mon Sep 17 00:00:00 2001
From: Vadim Zeitlin <vadim@wxwidgets.org>
Date: Mon, 4 Jan 2010 12:22:49 +0000
Subject: [PATCH] Fix wxConvAuto behaviour when it is used by
 wxTextInputStream.

wxConvAuto implicitly assumed that the chunk of data passed to it for
conversion was big enough to allow it to at least detect the BOM. However,
this isn't necessarily the case, and never is with wxTextInputStream, which
reads the bytes one by one.

Fix this by waiting until we have enough data to be able to detect the BOM.
This still doesn't fix the problem with streams without BOM and the
corresponding unit test still fails -- it will need to be fixed at the level
of wxTextInputStream itself later but handling correctly the cases when a BOM
is present is already better than before.

See #11570.

git-svn-id: https://svn.wxwidgets.org/svn/wx/wxWidgets/trunk@63064 c3d73ce0-8a6f-49c7-b76d-6d57e0e08775
---
 docs/changes.txt              |   1 +
 include/wx/convauto.h         |   6 +-
 src/common/convauto.cpp       | 161 +++++++++++++++++++++++-----------
 src/common/txtstrm.cpp        |   2 +-
 tests/mbconv/convautotest.cpp |  96 +++++++++++++++++++-
 5 files changed, 210 insertions(+), 56 deletions(-)

diff --git a/docs/changes.txt b/docs/changes.txt
index bc778c5463..6099ceade0 100644
--- a/docs/changes.txt
+++ b/docs/changes.txt
@@ -435,6 +435,7 @@ All:
 - wxDateTime timezone functions now dynamic (no caching).
 - Added wxHttp::GetCookie and wxHttp::HasCookies (dodge).
 - Added support for unique volume names to wxFileName (Neno Ganchev).
+- Correct bugs when using wxTextInputStream with wxConvAuto (Leon Buikstra).
 
 Unix:
 
diff --git a/include/wx/convauto.h b/include/wx/convauto.h
index b3dde4c73a..3a2e2e4809 100644
--- a/include/wx/convauto.h
+++ b/include/wx/convauto.h
@@ -75,6 +75,7 @@ private:
     // all currently recognized BOM values
     enum BOMType
     {
+        BOM_Unknown = -1,
         BOM_None,
         BOM_UTF32BE,
         BOM_UTF32LE,
@@ -107,7 +108,10 @@ private:
 
     // create the correct conversion object for the BOM present in the
     // beginning of the buffer; adjust the buffer to skip the BOM if found
-    void InitFromInput(const char **src, size_t *len);
+    //
+    // return false if the buffer is too short to allow us to determine if we
+    // have BOM or not
+    bool InitFromInput(const char **src, size_t *len);
 
     // adjust src and len to skip over the BOM (identified by m_bomType) at the
     // start of the buffer
diff --git a/src/common/convauto.cpp b/src/common/convauto.cpp
index f4e394d04d..c9ff7df9f6 100644
--- a/src/common/convauto.cpp
+++ b/src/common/convauto.cpp
@@ -26,6 +26,7 @@
 #if wxUSE_WCHAR_T
 
 #ifndef WX_PRECOMP
+    #include "wx/wx.h"
 #endif //WX_PRECOMP
 
 #include "wx/convauto.h"
@@ -52,55 +53,86 @@ void wxConvAuto::SetFallbackEncoding(wxFontEncoding enc)
 /* static */
 wxConvAuto::BOMType wxConvAuto::DetectBOM(const char *src, size_t srcLen)
 {
-    if ( srcLen < 2 )
-    {
-        // minimal BOM is 2 bytes so bail out immediately and simplify the code
-        // below which wouldn't need to check for length for UTF-16 cases
-        return BOM_None;
-    }
-
     // examine the buffer for BOM presence
     //
-    // see http://www.unicode.org/faq/utf_bom.html#BOM
-    switch ( *src++ )
+    // quoting from http://www.unicode.org/faq/utf_bom.html#BOM:
+    //
+    //  Bytes           Encoding Form
+    //
+    //  00 00 FE FF     UTF-32, big-endian
+    //  FF FE 00 00     UTF-32, little-endian
+    //  FE FF           UTF-16, big-endian
+    //  FF FE           UTF-16, little-endian
+    //  EF BB BF        UTF-8
+    //
+    // as some BOMs are prefixes of other ones we may need to read more bytes
+    // to disambiguate them
+
+    switch ( srcLen )
     {
-        case '\0':
-            // could only be big endian UTF-32 (00 00 FE FF)
-            if ( srcLen >= 4 &&
-                    src[0] == '\0' &&
-                        src[1] == '\xfe' &&
-                            src[2] == '\xff' )
+        case 0:
+            return BOM_Unknown;
+
+        case 1:
+            if ( src[0] == '\x00' || src[0] == '\xFF' ||
+                 src[0] == '\xFE' || src[0] == '\xEF')
             {
-                return BOM_UTF32BE;
+                // this could be a BOM but we don't know yet
+                return BOM_Unknown;
             }
             break;
 
-        case '\xfe':
-            // could only be big endian UTF-16 (FE FF)
-            if ( *src++ == '\xff' )
+        case 2:
+        case 3:
+            if ( src[0] == '\xEF' && src[1] == '\xBB' )
             {
-                return BOM_UTF16BE;
+                if ( srcLen == 3 )
+                    return src[2] == '\xBF' ? BOM_UTF8 : BOM_None;
+
+                return BOM_Unknown;
             }
-            break;
 
-        case '\xff':
-            // could be either little endian UTF-16 or UTF-32, both start
-            // with FF FE
-            if ( *src++ == '\xfe' )
+            if ( src[0] == '\xFE' && src[1] == '\xFF' )
+                return BOM_UTF16BE;
+
+            if ( src[0] == '\xFF' && src[1] == '\xFE' )
             {
-                return srcLen >= 4 && src[0] == '\0' && src[1] == '\0'
-                            ? BOM_UTF32LE
-                            : BOM_UTF16LE;
+                // if the next byte is 0, it could be a UTF-32LE BOM but if it
+                // isn't we can be sure it's UTF-16LE
+                if ( srcLen == 3 && src[2] != '\x00' )
+                    return BOM_UTF16LE;
+
+                return BOM_Unknown;
             }
-            break;
 
-        case '\xef':
-            // is this UTF-8 BOM (EF BB BF)?
-            if ( srcLen >= 3 && src[0] == '\xbb' && src[1] == '\xbf' )
+            if ( src[0] == '\x00' && src[1] == '\x00' )
             {
-                return BOM_UTF8;
+                // only UTF-32BE is possible: wait unless byte 3 rules it out
+                if ( srcLen == 2 || src[2] == '\xFE' )
+                    return BOM_Unknown;
             }
+
             break;
+
+        default:
+            // we have at least 4 bytes so we may finally decide whether
+            // we have a BOM or not
+            if ( src[0] == '\xEF' && src[1] == '\xBB' && src[2] == '\xBF' )
+                return BOM_UTF8;
+
+            if ( src[0] == '\x00' && src[1] == '\x00' &&
+                 src[2] == '\xFE' && src[3] == '\xFF' )
+                return BOM_UTF32BE;
+
+            if ( src[0] == '\xFF' && src[1] == '\xFE' &&
+                 src[2] == '\x00' && src[3] == '\x00' )
+                return BOM_UTF32LE;
+
+            if ( src[0] == '\xFE' && src[1] == '\xFF' )
+                return BOM_UTF16BE;
+
+            if ( src[0] == '\xFF' && src[1] == '\xFE' )
+                return BOM_UTF16LE;
     }
 
     return BOM_None;
@@ -112,6 +144,14 @@ void wxConvAuto::InitFromBOM(BOMType bomType)
 
     switch ( bomType )
     {
+        case BOM_Unknown:
+            wxFAIL_MSG( "shouldn't be called for this BOM type" );
+            break;
+
+        case BOM_None:
+            // use the default
+            break;
+
         case BOM_UTF32BE:
             m_conv = new wxMBConvUTF32BE;
             m_ownsConv = true;
@@ -137,12 +177,16 @@ void wxConvAuto::InitFromBOM(BOMType bomType)
             break;
 
         default:
-            wxFAIL_MSG( wxT("unexpected BOM type") );
-            // fall through: still need to create something
+            wxFAIL_MSG( "unknown BOM type" );
+    }
 
-        case BOM_None:
-            InitWithUTF8();
-            m_consumedBOM = true; // as there is nothing to consume
+    if ( !m_conv )
+    {
+        // we end up here if there is no BOM or we didn't recognize it somehow
+        // (this shouldn't happen but still don't crash if it does), so use the
+        // default encoding
+        InitWithUTF8();
+        m_consumedBOM = true; // as there is nothing to consume
     }
 }
 
@@ -151,6 +195,14 @@ void wxConvAuto::SkipBOM(const char **src, size_t *len) const
     int ofs;
     switch ( m_bomType )
     {
+        case BOM_Unknown:
+            wxFAIL_MSG( "shouldn't be called for this BOM type" );
+            return;
+
+        case BOM_None:
+            ofs = 0;
+            break;
+
         case BOM_UTF32BE:
         case BOM_UTF32LE:
             ofs = 4;
@@ -166,11 +218,8 @@ void wxConvAuto::SkipBOM(const char **src, size_t *len) const
             break;
 
         default:
-            wxFAIL_MSG( wxT("unexpected BOM type") );
-            // fall through: still need to create something
-
-        case BOM_None:
-            ofs = 0;
+            wxFAIL_MSG( "unknown BOM type" );
+            return;
     }
 
     *src += ofs;
@@ -178,11 +227,16 @@ void wxConvAuto::SkipBOM(const char **src, size_t *len) const
         *len -= ofs;
 }
 
-void wxConvAuto::InitFromInput(const char **src, size_t *len)
+bool wxConvAuto::InitFromInput(const char **src, size_t *len)
 {
     m_bomType = DetectBOM(*src, *len);
+    if ( m_bomType == BOM_Unknown )
+        return false;
+
     InitFromBOM(m_bomType);
     SkipBOM(src, len);
+
+    return true;
 }
 
 size_t
@@ -195,16 +249,20 @@ wxConvAuto::ToWChar(wchar_t *dst, size_t dstLen,
     // dst as typically we're first called with NULL dst to calculate the
     // needed buffer size
     wxConvAuto *self = const_cast<wxConvAuto *>(this);
+
+
     if ( !m_conv )
     {
-        self->InitFromInput(&src, &srcLen);
-        if ( dst )
-            self->m_consumedBOM = true;
+        if ( !self->InitFromInput(&src, &srcLen) )
+        {
+            // there is not enough data to determine whether we have a BOM or
+            // not, so fail for now -- the caller is supposed to call us again
+            // with more data
+            return wxCONV_FAILED;
+        }
     }
-
-    if ( !m_consumedBOM && dst )
+    else if ( !m_consumedBOM && dst )
     {
-        self->m_consumedBOM = true;
         SkipBOM(&src, &srcLen);
     }
 
@@ -228,6 +286,8 @@ wxConvAuto::ToWChar(wchar_t *dst, size_t dstLen,
         }
     }
 
+    if (rc != wxCONV_FAILED && dst && !m_consumedBOM)
+        self->m_consumedBOM = true;
     return rc;
 }
 
@@ -245,4 +305,3 @@ wxConvAuto::FromWChar(char *dst, size_t dstLen,
 }
 
 #endif // wxUSE_WCHAR_T
-
diff --git a/src/common/txtstrm.cpp b/src/common/txtstrm.cpp
index 830bc985ba..9f9669faa1 100644
--- a/src/common/txtstrm.cpp
+++ b/src/common/txtstrm.cpp
@@ -76,7 +76,7 @@ wxChar wxTextInputStream::NextChar()
             return wxEOT;
 
         if ( m_conv->ToWChar(wbuf, WXSIZEOF(wbuf), m_lastBytes, inlen + 1)
-                != wxCONV_FAILED )
+                == 1 )
             return wbuf[0];
     }
     // there should be no encoding which requires more than nine bytes for one character...
diff --git a/tests/mbconv/convautotest.cpp b/tests/mbconv/convautotest.cpp
index fbd7042012..0c6292f625 100644
--- a/tests/mbconv/convautotest.cpp
+++ b/tests/mbconv/convautotest.cpp
@@ -19,11 +19,11 @@
 
 #if wxUSE_WCHAR_T
 
-#ifndef WX_PRECOMP
-#endif // WX_PRECOMP
-
 #include "wx/convauto.h"
 
+#include "wx/mstream.h"
+#include "wx/txtstrm.h"
+
 // ----------------------------------------------------------------------------
 // test class
 // ----------------------------------------------------------------------------
@@ -43,6 +43,12 @@ private:
         CPPUNIT_TEST( UTF16LE );
         CPPUNIT_TEST( UTF16BE );
         CPPUNIT_TEST( UTF8 );
+        CPPUNIT_TEST( StreamUTF8NoBOM );
+        CPPUNIT_TEST( StreamUTF8 );
+        CPPUNIT_TEST( StreamUTF16LE );
+        CPPUNIT_TEST( StreamUTF16BE );
+        CPPUNIT_TEST( StreamUTF32LE );
+        CPPUNIT_TEST( StreamUTF32BE );
     CPPUNIT_TEST_SUITE_END();
 
     // real test function: check that converting the src multibyte string to
@@ -57,6 +63,19 @@ private:
     void UTF16LE();
     void UTF16BE();
     void UTF8();
+
+    // test whether two lines of text are converted properly from a stream
+    void TestTextStream(const char *src,
+                        size_t srclength,
+                        const wxString& line1,
+                        const wxString& line2);
+
+    void StreamUTF8NoBOM();
+    void StreamUTF8();
+    void StreamUTF16LE();
+    void StreamUTF16BE();
+    void StreamUTF32LE();
+    void StreamUTF32BE();
 };
 
 // register in the unnamed registry so that these tests are run by default
@@ -118,5 +137,76 @@ void ConvAutoTestCase::UTF8()
 #endif
 }
 
+void ConvAutoTestCase::TestTextStream(const char *src,
+                                      size_t srclength,
+                                      const wxString& line1,
+                                      const wxString& line2)
+{
+    wxMemoryInputStream instream(src, srclength);
+    wxTextInputStream text(instream);
+
+    CPPUNIT_ASSERT_EQUAL( line1, text.ReadLine() );
+    CPPUNIT_ASSERT_EQUAL( line2, text.ReadLine() );
+}
+
+// The first line of the test string used in the following functions is an
+// 'a' followed by a Japanese hiragana A (U+3042).
+// The second line is a single Greek beta (U+03B2). There is no trailing
+// newline at the end.
+
+namespace
+{
+
+const wxString line1 = wxString::FromUTF8("a\xe3\x81\x82");
+const wxString line2 = wxString::FromUTF8("\xce\xb2");
+
+} // anonymous namespace
+
+void ConvAutoTestCase::StreamUTF8NoBOM()
+{
+    // currently this test doesn't work because without the BOM wxConvAuto
+    // decides that the string is in Latin-1 after finding the first (but not
+    // the two subsequent ones which are part of the same UTF-8 sequence!)
+    // 8-bit character
+    //
+    // FIXME: we need to fix this at wxTextInputStream level, see #11570
+#if 0
+    TestTextStream("\x61\xE3\x81\x82\x0A\xCE\xB2",
+                   7, line1, line2);
+#endif
+}
+
+void ConvAutoTestCase::StreamUTF8()
+{
+    TestTextStream("\xEF\xBB\xBF\x61\xE3\x81\x82\x0A\xCE\xB2",
+                   10, line1, line2);
+}
+
+void ConvAutoTestCase::StreamUTF16LE()
+{
+    TestTextStream("\xFF\xFE\x61\x00\x42\x30\x0A\x00\xB2\x03",
+                   10, line1, line2);
+}
+
+void ConvAutoTestCase::StreamUTF16BE()
+{
+    TestTextStream("\xFE\xFF\x00\x61\x30\x42\x00\x0A\x03\xB2",
+                   10, line1, line2);
+}
+
+void ConvAutoTestCase::StreamUTF32LE()
+{
+    TestTextStream("\xFF\xFE\0\0\x61\x00\0\0\x42\x30\0\0\x0A"
+                   "\x00\0\0\xB2\x03\0\0",
+                   20, line1, line2);
+}
+
+void ConvAutoTestCase::StreamUTF32BE()
+{
+    TestTextStream("\0\0\xFE\xFF\0\0\x00\x61\0\0\x30\x42\0\0\x00\x0A"
+                   "\0\0\x03\xB2",
+                   20, line1, line2);
+}
+
 #endif // wxUSE_WCHAR_T
 
-- 
2.50.0