Fix wxConvAuto behaviour when it is used by wxTextInputStream.

author Vadim Zeitlin <vadim@wxwidgets.org>
Mon, 4 Jan 2010 12:22:49 +0000 (12:22 +0000)

committer Vadim Zeitlin <vadim@wxwidgets.org>
Mon, 4 Jan 2010 12:22:49 +0000 (12:22 +0000)
diff --git a/docs/changes.txt b/docs/changes.txt

index bc778c5463422ab428aa97a141c7dfeef892250b..6099ceade00aef7e2d1973a4b64e630dd2bd85f5 100644 (file)
--- a/docs/changes.txt
+++ b/docs/changes.txt
@@ -435,6 +435,7 @@ All:
  - wxDateTime timezone functions now dynamic (no caching).
  - Added wxHttp::GetCookie and wxHttp::HasCookies (dodge).
  - Added support for unique volume names to wxFileName (Neno Ganchev).
+- Correct bugs when using wxTextInputStream with wxConvAuto (Leon Buikstra).
  
  Unix:
  
diff --git a/include/wx/convauto.h b/include/wx/convauto.h

index b3dde4c73ad16f88a8baf7c6c65eb9d1c7969e51..3a2e2e480923eb55bcf50e6e9fdb4f8c40a33686 100644 (file)
--- a/include/wx/convauto.h
+++ b/include/wx/convauto.h
@@ -75,6 +75,7 @@ private:
      // all currently recognized BOM values
      enum BOMType
      {
+        BOM_Unknown = -1,
          BOM_None,
          BOM_UTF32BE,
          BOM_UTF32LE,
@@ -107,7 +108,10 @@ private:
  
      // create the correct conversion object for the BOM present in the
      // beginning of the buffer; adjust the buffer to skip the BOM if found
-    void InitFromInput(const char **src, size_t *len);
+    //
+    // return false if the buffer is too short to allow us to determine if we
+    // have BOM or not
+    bool InitFromInput(const char **src, size_t *len);
  
      // adjust src and len to skip over the BOM (identified by m_bomType) at the
      // start of the buffer
diff --git a/src/common/convauto.cpp b/src/common/convauto.cpp

index f4e394d04dc46ccbda9aa4ee70e121a9bcac3c6f..c9ff7df9f68b3285948321bbeeda366d3a034993 100644 (file)
--- a/src/common/convauto.cpp
+++ b/src/common/convauto.cpp
@@ -26,6 +26,7 @@
  #if wxUSE_WCHAR_T
  
  #ifndef WX_PRECOMP
+    #include "wx/wx.h"
  #endif //WX_PRECOMP
  
  #include "wx/convauto.h"
@@ -52,55 +53,86 @@ void wxConvAuto::SetFallbackEncoding(wxFontEncoding enc)
  /* static */
  wxConvAuto::BOMType wxConvAuto::DetectBOM(const char *src, size_t srcLen)
  {
-    if ( srcLen < 2 )
-    {
-        // minimal BOM is 2 bytes so bail out immediately and simplify the code
-        // below which wouldn't need to check for length for UTF-16 cases
-        return BOM_None;
-    }
-
      // examine the buffer for BOM presence
      //
-    // see http://www.unicode.org/faq/utf_bom.html#BOM
-    switch ( *src++ )
+    // quoting from http://www.unicode.org/faq/utf_bom.html#BOM:
+    //
+    //  Bytes           Encoding Form
+    //
+    //  00 00 FE FF     UTF-32, big-endian
+    //  FF FE 00 00     UTF-32, little-endian
+    //  FE FF           UTF-16, big-endian
+    //  FF FE           UTF-16, little-endian
+    //  EF BB BF        UTF-8
+    //
+    // as some BOMs are prefixes of other ones we may need to read more bytes
+    // to disambiguate them
+
+    switch ( srcLen )
      {
-        case '\0':
-            // could only be big endian UTF-32 (00 00 FE FF)
-            if ( srcLen >= 4 &&
-                    src[0] == '\0' &&
-                        src[1] == '\xfe' &&
-                            src[2] == '\xff' )
+        case 0:
+            return BOM_Unknown;
+
+        case 1:
+            if ( src[0] == '\x00' || src[0] == '\xFF' ||
+                 src[0] == '\xFE' || src[0] == '\xEF')
              {
-                return BOM_UTF32BE;
+                // this could be a BOM but we don't know yet
+                return BOM_Unknown;
              }
              break;
  
-        case '\xfe':
-            // could only be big endian UTF-16 (FE FF)
-            if ( *src++ == '\xff' )
+        case 2:
+        case 3:
+            if ( src[0] == '\xEF' && src[1] == '\xBB' )
              {
-                return BOM_UTF16BE;
+                if ( srcLen == 3 )
+                    return src[2] == '\xBF' ? BOM_UTF8 : BOM_None;
+
+                return BOM_Unknown;
              }
-            break;
  
-        case '\xff':
-            // could be either little endian UTF-16 or UTF-32, both start
-            // with FF FE
-            if ( *src++ == '\xfe' )
+            if ( src[0] == '\xFE' && src[1] == '\xFF' )
+                return BOM_UTF16BE;
+
+            if ( src[0] == '\xFF' && src[1] == '\xFE' )
              {
-                return srcLen >= 4 && src[0] == '\0' && src[1] == '\0'
-                            ? BOM_UTF32LE
-                            : BOM_UTF16LE;
+                // if the next byte is 0, it could be an UTF-32LE BOM but if it
+                // isn't we can be sure it's UTF-16LE
+                if ( srcLen == 3 && src[2] != '\x00' )
+                    return BOM_UTF16LE;
+
+                return BOM_Unknown;
              }
-            break;
  
-        case '\xef':
-            // is this UTF-8 BOM (EF BB BF)?
-            if ( srcLen >= 3 && src[0] == '\xbb' && src[1] == '\xbf' )
+            if ( src[0] == '\x00' && src[1] == '\x00' )
              {
-                return BOM_UTF8;
+                // this could only be UTF-32BE
+                if ( srcLen == 3 && src[2] == '\xFE' )
+                    return BOM_Unknown;
              }
+
              break;
+
+        default:
+            // we have at least 4 characters so we may finally decide whether
+            // we have a BOM or not
+            if ( src[0] == '\xEF' && src[1] == '\xBB' && src[2] == '\xBF' )
+                return BOM_UTF8;
+
+            if ( src[0] == '\x00' && src[1] == '\x00' &&
+                 src[2] == '\xFE' && src[3] == '\xFF' )
+                return BOM_UTF32BE;
+
+            if ( src[0] == '\xFF' && src[1] == '\xFE' &&
+                 src[2] == '\x00' && src[3] == '\x00' )
+                return BOM_UTF32LE;
+
+            if ( src[0] == '\xFE' && src[1] == '\xFF' )
+                return BOM_UTF16BE;
+
+            if ( src[0] == '\xFF' && src[1] == '\xFE' )
+                return BOM_UTF16LE;
      }
  
      return BOM_None;
@@ -112,6 +144,14 @@ void wxConvAuto::InitFromBOM(BOMType bomType)
  
      switch ( bomType )
      {
+        case BOM_Unknown:
+            wxFAIL_MSG( "shouldn't be called for this BOM type" );
+            break;
+
+        case BOM_None:
+            // use the default
+            break;
+
          case BOM_UTF32BE:
              m_conv = new wxMBConvUTF32BE;
              m_ownsConv = true;
@@ -137,12 +177,16 @@ void wxConvAuto::InitFromBOM(BOMType bomType)
              break;
  
          default:
-            wxFAIL_MSG( wxT("unexpected BOM type") );
-            // fall through: still need to create something
+            wxFAIL_MSG( "unknown BOM type" );
+    }
  
-        case BOM_None:
-            InitWithUTF8();
-            m_consumedBOM = true; // as there is nothing to consume
+    if ( !m_conv )
+    {
+        // we end up here if there is no BOM or we didn't recognize it somehow
+        // (this shouldn't happen but still don't crash if it does), so use the
+        // default encoding
+        InitWithUTF8();
+        m_consumedBOM = true; // as there is nothing to consume
      }
  }
  
@@ -151,6 +195,14 @@ void wxConvAuto::SkipBOM(const char **src, size_t *len) const
      int ofs;
      switch ( m_bomType )
      {
+        case BOM_Unknown:
+            wxFAIL_MSG( "shouldn't be called for this BOM type" );
+            return;
+
+        case BOM_None:
+            ofs = 0;
+            break;
+
          case BOM_UTF32BE:
          case BOM_UTF32LE:
              ofs = 4;
@@ -166,11 +218,8 @@ void wxConvAuto::SkipBOM(const char **src, size_t *len) const
              break;
  
          default:
-            wxFAIL_MSG( wxT("unexpected BOM type") );
-            // fall through: still need to create something
-
-        case BOM_None:
-            ofs = 0;
+            wxFAIL_MSG( "unknown BOM type" );
+            return;
      }
  
      *src += ofs;
@@ -178,11 +227,16 @@ void wxConvAuto::SkipBOM(const char **src, size_t *len) const
          *len -= ofs;
  }
  
-void wxConvAuto::InitFromInput(const char **src, size_t *len)
+bool wxConvAuto::InitFromInput(const char **src, size_t *len)
  {
      m_bomType = DetectBOM(*src, *len);
+    if ( m_bomType == BOM_Unknown )
+        return false;
+
      InitFromBOM(m_bomType);
      SkipBOM(src, len);
+
+    return true;
  }
  
  size_t
@@ -195,16 +249,20 @@ wxConvAuto::ToWChar(wchar_t *dst, size_t dstLen,
      // dst as typically we're first called with NULL dst to calculate the
      // needed buffer size
      wxConvAuto *self = const_cast<wxConvAuto *>(this);
+
+
      if ( !m_conv )
      {
-        self->InitFromInput(&src, &srcLen);
-        if ( dst )
-            self->m_consumedBOM = true;
+        if ( !self->InitFromInput(&src, &srcLen) )
+        {
+            // there is not enough data to determine whether we have a BOM or
+            // not, so fail for now -- the caller is supposed to call us again
+            // with more data
+            return wxCONV_FAILED;
+        }
      }
-
-    if ( !m_consumedBOM && dst )
+    else if ( !m_consumedBOM && dst )
      {
-        self->m_consumedBOM = true;
          SkipBOM(&src, &srcLen);
      }
  
@@ -228,6 +286,8 @@ wxConvAuto::ToWChar(wchar_t *dst, size_t dstLen,
          }
      }
  
+    if (rc != wxCONV_FAILED && dst && !m_consumedBOM)
+        self->m_consumedBOM = true;
      return rc;
  }
  
@@ -245,4 +305,3 @@ wxConvAuto::FromWChar(char *dst, size_t dstLen,
  }
  
  #endif // wxUSE_WCHAR_T
-
diff --git a/src/common/txtstrm.cpp b/src/common/txtstrm.cpp

index 830bc985ba2f83b869e463466d57498cfaeb31df..9f9669faa184ec5e3b0ee34b33cec0bd67241ae5 100644 (file)
--- a/src/common/txtstrm.cpp
+++ b/src/common/txtstrm.cpp
@@ -76,7 +76,7 @@ wxChar wxTextInputStream::NextChar()
              return wxEOT;
  
          if ( m_conv->ToWChar(wbuf, WXSIZEOF(wbuf), m_lastBytes, inlen + 1)
-                != wxCONV_FAILED )
+                == 1 )
              return wbuf[0];
      }
      // there should be no encoding which requires more than nine bytes for one character...
diff --git a/tests/mbconv/convautotest.cpp b/tests/mbconv/convautotest.cpp

index fbd70420121c38f12669052bd8387d5fe2a1048b..0c6292f625c83c9741a6943e2aaff5df904c607c 100644 (file)
--- a/tests/mbconv/convautotest.cpp
+++ b/tests/mbconv/convautotest.cpp
@@ -19,11 +19,11 @@
  
  #if wxUSE_WCHAR_T
  
-#ifndef WX_PRECOMP
-#endif // WX_PRECOMP
-
  #include "wx/convauto.h"
  
+#include "wx/mstream.h"
+#include "wx/txtstrm.h"
+
  // ----------------------------------------------------------------------------
  // test class
  // ----------------------------------------------------------------------------
@@ -43,6 +43,12 @@ private:
          CPPUNIT_TEST( UTF16LE );
          CPPUNIT_TEST( UTF16BE );
          CPPUNIT_TEST( UTF8 );
+        CPPUNIT_TEST( StreamUTF8NoBOM );
+        CPPUNIT_TEST( StreamUTF8 );
+        CPPUNIT_TEST( StreamUTF16LE );
+        CPPUNIT_TEST( StreamUTF16BE );
+        CPPUNIT_TEST( StreamUTF32LE );
+        CPPUNIT_TEST( StreamUTF32BE );
      CPPUNIT_TEST_SUITE_END();
  
      // real test function: check that converting the src multibyte string to
@@ -57,6 +63,19 @@ private:
      void UTF16LE();
      void UTF16BE();
      void UTF8();
+
+    // test whether two lines of text are converted properly from a stream
+    void TestTextStream(const char *src,
+                        size_t srclength,
+                        const wxString& line1,
+                        const wxString& line2);
+
+    void StreamUTF8NoBOM();
+    void StreamUTF8();
+    void StreamUTF16LE();
+    void StreamUTF16BE();
+    void StreamUTF32LE();
+    void StreamUTF32BE();
  };
  
  // register in the unnamed registry so that these tests are run by default
@@ -118,5 +137,76 @@ void ConvAutoTestCase::UTF8()
  #endif
  }
  
+void ConvAutoTestCase::TestTextStream(const char *src,
+                                      size_t srclength,
+                                      const wxString& line1,
+                                      const wxString& line2)
+{
+    wxMemoryInputStream instream(src, srclength);
+    wxTextInputStream text(instream);
+
+    CPPUNIT_ASSERT_EQUAL( line1, text.ReadLine() );
+    CPPUNIT_ASSERT_EQUAL( line2, text.ReadLine() );
+}
+
+// the first line of the teststring used in the following functions is an
+// 'a' followed by a Japanese hiragana A (u+3042).
+// The second line is a single Greek beta (u+03B2). There is no blank line
+// at the end.
+
+namespace
+{
+
+const wxString line1 = wxString::FromUTF8("a\xe3\x81\x82");
+const wxString line2 = wxString::FromUTF8("\xce\xb2");
+
+} // anonymous namespace
+
+void ConvAutoTestCase::StreamUTF8NoBOM()
+{
+    // currently this test doesn't work because without the BOM wxConvAuto
+    // decides that the string is in Latin-1 after finding the first (but not
+    // the two subsequent ones which are part of the same UTF-8 sequence!)
+    // 8-bit character
+    //
+    // FIXME: we need to fix this at wxTextInputStream level, see #11570
+#if 0
+    TestTextStream("\x61\xE3\x81\x82\x0A\xCE\xB2",
+                   7, line1, line2);
+#endif
+}
+
+void ConvAutoTestCase::StreamUTF8()
+{
+    TestTextStream("\xEF\xBB\xBF\x61\xE3\x81\x82\x0A\xCE\xB2",
+                   10, line1, line2);
+}
+
+void ConvAutoTestCase::StreamUTF16LE()
+{
+    TestTextStream("\xFF\xFE\x61\x00\x42\x30\x0A\x00\xB2\x03",
+                   10, line1, line2);
+}
+
+void ConvAutoTestCase::StreamUTF16BE()
+{
+    TestTextStream("\xFE\xFF\x00\x61\x30\x42\x00\x0A\x03\xB2",
+                   10, line1, line2);
+}
+
+void ConvAutoTestCase::StreamUTF32LE()
+{
+    TestTextStream("\xFF\xFE\0\0\x61\x00\0\0\x42\x30\0\0\x0A"
+                   "\x00\0\0\xB2\x03\0\0",
+                   20, line1, line2);
+}
+
+void ConvAutoTestCase::StreamUTF32BE()
+{
+    TestTextStream("\0\0\xFE\xFF\0\0\x00\x61\0\0\x30\x42\0\0\x00\x0A"
+                   "\0\0\x03\xB2",
+                   20, line1, line2);
+}
+
  #endif // wxUSE_WCHAR_T
author	Vadim Zeitlin <vadim@wxwidgets.org>
	Mon, 4 Jan 2010 12:22:49 +0000 (12:22 +0000)
committer	Vadim Zeitlin <vadim@wxwidgets.org>
	Mon, 4 Jan 2010 12:22:49 +0000 (12:22 +0000)
docs/changes.txt		patch | blob | blame | history
include/wx/convauto.h		patch | blob | blame | history
src/common/convauto.cpp		patch | blob | blame | history
src/common/txtstrm.cpp		patch | blob | blame | history
tests/mbconv/convautotest.cpp		patch | blob | blame | history