#if wxUSE_WCHAR_T
#ifndef WX_PRECOMP
+ #include "wx/wx.h"
#endif //WX_PRECOMP
#include "wx/convauto.h"
+// Check whether the first srcLen bytes of src start with a Unicode BOM.
+//
+// Returns the BOM_XXX value corresponding to the BOM found, BOM_None if the
+// data definitely doesn't start with a BOM and BOM_Unknown if srcLen bytes
+// are not enough to decide which BOM, if any, is present: in this case the
+// function should be called again when more data is available.
/* static */
wxConvAuto::BOMType wxConvAuto::DetectBOM(const char *src, size_t srcLen)
{
- if ( srcLen < 2 )
- {
- // minimal BOM is 2 bytes so bail out immediately and simplify the code
- // below which wouldn't need to check for length for UTF-16 cases
- return BOM_None;
- }
-
// examine the buffer for BOM presence
//
- // see http://www.unicode.org/faq/utf_bom.html#BOM
- switch ( *src++ )
+ // quoting from http://www.unicode.org/faq/utf_bom.html#BOM:
+ //
+ // Bytes Encoding Form
+ //
+ // 00 00 FE FF UTF-32, big-endian
+ // FF FE 00 00 UTF-32, little-endian
+ // FE FF UTF-16, big-endian
+ // FF FE UTF-16, little-endian
+ // EF BB BF UTF-8
+ //
+ // as some BOMs are prefixes of other ones we may need to read more bytes
+ // to disambiguate them
+
+ switch ( srcLen )
{
- case '\0':
- // could only be big endian UTF-32 (00 00 FE FF)
- if ( srcLen >= 4 &&
- src[0] == '\0' &&
- src[1] == '\xfe' &&
- src[2] == '\xff' )
+ case 0:
+ // nothing to examine yet
+ return BOM_Unknown;
+
+ case 1:
+ if ( src[0] == '\x00' || src[0] == '\xFF' ||
+ src[0] == '\xFE' || src[0] == '\xEF')
{
- return BOM_UTF32BE;
+ // this could be a BOM but we don't know yet
+ return BOM_Unknown;
}
break;
- case '\xfe':
- // could only be big endian UTF-16 (FE FF)
- if ( *src++ == '\xff' )
+ case 2:
+ case 3:
+ if ( src[0] == '\xEF' && src[1] == '\xBB' )
{
- return BOM_UTF16BE;
+ if ( srcLen == 3 )
+ return src[2] == '\xBF' ? BOM_UTF8 : BOM_None;
+
+ return BOM_Unknown;
}
- break;
- case '\xff':
- // could be either little endian UTF-16 or UTF-32, both start
- // with FF FE
- if ( *src++ == '\xfe' )
+ if ( src[0] == '\xFE' && src[1] == '\xFF' )
+ return BOM_UTF16BE;
+
+ if ( src[0] == '\xFF' && src[1] == '\xFE' )
{
- return srcLen >= 4 && src[0] == '\0' && src[1] == '\0'
- ? BOM_UTF32LE
- : BOM_UTF16LE;
+ // if the next byte is 0, it could be a UTF-32LE BOM but if it
+ // isn't we can be sure it's UTF-16LE
+ if ( srcLen == 3 && src[2] != '\x00' )
+ return BOM_UTF16LE;
+
+ return BOM_Unknown;
}
- break;
- case '\xef':
- // is this UTF-8 BOM (EF BB BF)?
- if ( srcLen >= 3 && src[0] == '\xbb' && src[1] == '\xbf' )
+ if ( src[0] == '\x00' && src[1] == '\x00' )
{
- return BOM_UTF8;
+ // this could only be UTF-32BE: the third byte, if we already
+ // have it, must be FE for this to remain possible, otherwise
+ // there is definitely no BOM
+ if ( srcLen == 3 && src[2] != '\xFE' )
+ return BOM_None;
+
+ // either we have just the 2 leading zero bytes or the data so
+ // far is compatible with UTF-32BE BOM, in both cases we need
+ // more bytes to decide, just as for a single leading 0 above
+ return BOM_Unknown;
}
+
break;
+
+ default:
+ // we have at least 4 characters so we may finally decide whether
+ // we have a BOM or not
+ if ( src[0] == '\xEF' && src[1] == '\xBB' && src[2] == '\xBF' )
+ return BOM_UTF8;
+
+ if ( src[0] == '\x00' && src[1] == '\x00' &&
+ src[2] == '\xFE' && src[3] == '\xFF' )
+ return BOM_UTF32BE;
+
+ if ( src[0] == '\xFF' && src[1] == '\xFE' &&
+ src[2] == '\x00' && src[3] == '\x00' )
+ return BOM_UTF32LE;
+
+ if ( src[0] == '\xFE' && src[1] == '\xFF' )
+ return BOM_UTF16BE;
+
+ if ( src[0] == '\xFF' && src[1] == '\xFE' )
+ return BOM_UTF16LE;
}
return BOM_None;
switch ( bomType )
{
+ case BOM_Unknown:
+ wxFAIL_MSG( "shouldn't be called for this BOM type" );
+ break;
+
+ case BOM_None:
+ // use the default
+ break;
+
case BOM_UTF32BE:
m_conv = new wxMBConvUTF32BE;
m_ownsConv = true;
break;
default:
- wxFAIL_MSG( wxT("unexpected BOM type") );
- // fall through: still need to create something
+ wxFAIL_MSG( "unknown BOM type" );
+ }
- case BOM_None:
- InitWithUTF8();
- m_consumedBOM = true; // as there is nothing to consume
+ if ( !m_conv )
+ {
+ // we end up here if there is no BOM or we didn't recognize it somehow
+ // (this shouldn't happen but still don't crash if it does), so use the
+ // default encoding
+ InitWithUTF8();
+ m_consumedBOM = true; // as there is nothing to consume
}
}
int ofs;
switch ( m_bomType )
{
+ case BOM_Unknown:
+ wxFAIL_MSG( "shouldn't be called for this BOM type" );
+ return;
+
+ case BOM_None:
+ ofs = 0;
+ break;
+
case BOM_UTF32BE:
case BOM_UTF32LE:
ofs = 4;
break;
default:
- wxFAIL_MSG( wxT("unexpected BOM type") );
- // fall through: still need to create something
-
- case BOM_None:
- ofs = 0;
+ wxFAIL_MSG( "unknown BOM type" );
+ return;
}
*src += ofs;
*len -= ofs;
}
-void wxConvAuto::InitFromInput(const char **src, size_t *len)
+// Detect the BOM at the start of the input and remember it in m_bomType.
+//
+// If DetectBOM() returns BOM_Unknown, nothing else is done, *src and *len
+// are not changed by this function and false is returned. Otherwise
+// InitFromBOM() is called for the detected type, SkipBOM() is called with
+// src and len and true is returned.
+bool wxConvAuto::InitFromInput(const char **src, size_t *len)
{
m_bomType = DetectBOM(*src, *len);
+ if ( m_bomType == BOM_Unknown )
+ return false;
+
InitFromBOM(m_bomType);
SkipBOM(src, len);
+
+ return true;
}
size_t
// dst as typically we're first called with NULL dst to calculate the
// needed buffer size
wxConvAuto *self = const_cast<wxConvAuto *>(this);
+
+
if ( !m_conv )
{
- self->InitFromInput(&src, &srcLen);
- if ( dst )
- self->m_consumedBOM = true;
+ if ( !self->InitFromInput(&src, &srcLen) )
+ {
+ // there is not enough data to determine whether we have a BOM or
+ // not, so fail for now -- the caller is supposed to call us again
+ // with more data
+ return wxCONV_FAILED;
+ }
}
-
- if ( !m_consumedBOM && dst )
+ else if ( !m_consumedBOM && dst )
{
- self->m_consumedBOM = true;
SkipBOM(&src, &srcLen);
}
}
}
+ if (rc != wxCONV_FAILED && dst && !m_consumedBOM)
+ self->m_consumedBOM = true;
return rc;
}
}
#endif // wxUSE_WCHAR_T
-
#if wxUSE_WCHAR_T
-#ifndef WX_PRECOMP
-#endif // WX_PRECOMP
-
#include "wx/convauto.h"
+#include "wx/mstream.h"
+#include "wx/txtstrm.h"
+
// ----------------------------------------------------------------------------
// test class
// ----------------------------------------------------------------------------
CPPUNIT_TEST( UTF16LE );
CPPUNIT_TEST( UTF16BE );
CPPUNIT_TEST( UTF8 );
+ CPPUNIT_TEST( StreamUTF8NoBOM );
+ CPPUNIT_TEST( StreamUTF8 );
+ CPPUNIT_TEST( StreamUTF16LE );
+ CPPUNIT_TEST( StreamUTF16BE );
+ CPPUNIT_TEST( StreamUTF32LE );
+ CPPUNIT_TEST( StreamUTF32BE );
CPPUNIT_TEST_SUITE_END();
// real test function: check that converting the src multibyte string to
void UTF16LE();
void UTF16BE();
void UTF8();
+
+ // test whether two lines of text are converted properly from a stream
+ void TestTextStream(const char *src,
+ size_t srclength,
+ const wxString& line1,
+ const wxString& line2);
+
+ void StreamUTF8NoBOM();
+ void StreamUTF8();
+ void StreamUTF16LE();
+ void StreamUTF16BE();
+ void StreamUTF32LE();
+ void StreamUTF32BE();
};
// register in the unnamed registry so that these tests are run by default
#endif
}
+void ConvAutoTestCase::TestTextStream(const char *src,
+                                      size_t srclength,
+                                      const wxString& line1,
+                                      const wxString& line2)
+{
+    // expose the srclength raw bytes as an input stream and read them back
+    // as text, no conversion is specified explicitly for the text stream
+    wxMemoryInputStream bytes(src, srclength);
+    wxTextInputStream reader(bytes);
+
+    // the first line is checked before the second one is read
+    const wxString firstRead = reader.ReadLine();
+    CPPUNIT_ASSERT_EQUAL( line1, firstRead );
+
+    const wxString secondRead = reader.ReadLine();
+    CPPUNIT_ASSERT_EQUAL( line2, secondRead );
+}
+
+// The first line of the test string used in the following functions is an
+// 'a' followed by a Japanese hiragana A (U+3042).
+// The second line is a single Greek beta (U+03B2). There is no trailing
+// newline at the end.
+
+namespace
+{
+
+// "a" followed by U+3042, given as UTF-8 bytes
+const wxString line1(wxString::FromUTF8("a\xe3\x81\x82"));
+
+// U+03B2 alone, given as UTF-8 bytes
+const wxString line2(wxString::FromUTF8("\xce\xb2"));
+
+} // anonymous namespace
+
+void ConvAutoTestCase::StreamUTF8NoBOM()
+{
+ // This is meant to check reading UTF-8 text which doesn't start with a
+ // BOM but the check itself is compiled out below, so this test currently
+ // does nothing.
+ //
+ // currently this test doesn't work because without the BOM wxConvAuto
+ // decides that the string is in Latin-1 after finding the first (but not
+ // the two subsequent ones which are part of the same UTF-8 sequence!)
+ // 8-bit character
+ //
+ // FIXME: we need to fix this at wxTextInputStream level, see #11570
+#if 0
+ TestTextStream("\x61\xE3\x81\x82\x0A\xCE\xB2",
+ 7, line1, line2);
+#endif
+}
+
+void ConvAutoTestCase::StreamUTF8()
+{
+    // UTF-8 BOM (EF BB BF) followed by the two lines separated by LF
+    static const char input[] = "\xEF\xBB\xBF\x61\xE3\x81\x82\x0A\xCE\xB2";
+
+    // don't count the NUL terminating the array
+    TestTextStream(input, sizeof(input) - 1, line1, line2);
+}
+
+void ConvAutoTestCase::StreamUTF16LE()
+{
+    // UTF-16LE BOM (FF FE) followed by the two lines separated by LF, all
+    // as 16 bit units with the low byte first
+    static const char input[] = "\xFF\xFE\x61\x00\x42\x30\x0A\x00\xB2\x03";
+
+    // the data contains NUL bytes, so take the length from the array size,
+    // without counting its terminating NUL
+    TestTextStream(input, sizeof(input) - 1, line1, line2);
+}
+
+void ConvAutoTestCase::StreamUTF16BE()
+{
+    // UTF-16BE BOM (FE FF) followed by the two lines separated by LF, all
+    // as 16 bit units with the high byte first
+    static const char input[] = "\xFE\xFF\x00\x61\x30\x42\x00\x0A\x03\xB2";
+
+    // the data contains NUL bytes, so take the length from the array size,
+    // without counting its terminating NUL
+    TestTextStream(input, sizeof(input) - 1, line1, line2);
+}
+
+void ConvAutoTestCase::StreamUTF32LE()
+{
+    // UTF-32LE BOM (FF FE 00 00) followed by the two lines separated by LF,
+    // all as 32 bit units with the lowest byte first
+    static const char input[] =
+        "\xFF\xFE\0\0\x61\x00\0\0\x42\x30\0\0\x0A"
+        "\x00\0\0\xB2\x03\0\0";
+
+    // the data contains NUL bytes, so take the length from the array size,
+    // without counting its terminating NUL
+    TestTextStream(input, sizeof(input) - 1, line1, line2);
+}
+
+void ConvAutoTestCase::StreamUTF32BE()
+{
+    // UTF-32BE BOM (00 00 FE FF) followed by the two lines separated by LF,
+    // all as 32 bit units with the highest byte first
+    static const char input[] =
+        "\0\0\xFE\xFF\0\0\x00\x61\0\0\x30\x42\0\0\x00\x0A"
+        "\0\0\x03\xB2";
+
+    // the data starts with NUL bytes, so take the length from the array
+    // size, without counting its terminating NUL
+    TestTextStream(input, sizeof(input) - 1, line1, line2);
+}
+
#endif // wxUSE_WCHAR_T