// Purpose: implementation of wxConvAuto
// Author: Vadim Zeitlin
// Created: 2006-04-04
-// RCS-ID: $Id$
// Copyright: (c) 2006 Vadim Zeitlin <vadim@wxwindows.org>
// Licence: wxWindows licence
///////////////////////////////////////////////////////////////////////////////
#pragma hdrstop
#endif
-#if wxUSE_WCHAR_T
-
-#ifndef WX_PRECOMP
- #include "wx/wx.h"
-#endif //WX_PRECOMP
-
#include "wx/convauto.h"
// we use latin1 by default as it seems the least bad choice: no other default
// seems to be a good idea and there is no other reasonable alternative
wxFontEncoding wxConvAuto::ms_defaultMBEncoding = wxFONTENCODING_ISO8859_1;
// Raw byte sequences of the byte order marks handled by wxConvAuto, private
// to this file. The arrays are brace-initialized without a terminating NUL,
// so they must always be used together with their size.
+namespace
+{
+
+const char BOM_UTF32BE[] = { '\x00', '\x00', '\xFE', '\xFF' };
+const char BOM_UTF32LE[] = { '\xFF', '\xFE', '\x00', '\x00' }; // starts with the same 2 bytes as BOM_UTF16LE
+const char BOM_UTF16BE[] = { '\xFE', '\xFF' };
+const char BOM_UTF16LE[] = { '\xFF', '\xFE' };
+const char BOM_UTF8[] = { '\xEF', '\xBB', '\xBF' };
+
+} // anonymous namespace
+
// ============================================================================
// implementation
// ============================================================================
}
/* static */
-wxConvAuto::BOMType wxConvAuto::DetectBOM(const char *src, size_t srcLen)
// Returns a pointer to the bytes of the given BOM and stores their number in
// *count. The returned pointer refers to one of the file-local BOM_XXX arrays,
// which are not NUL-terminated, so the length must be taken from *count.
//
// count must not be NULL. For wxBOM_Unknown, wxBOM_None and any value not
// handled by the switch, a failure is reported with wxFAIL_MSG and NULL is
// returned; *count is not written on these paths.
+const char* wxConvAuto::GetBOMChars(wxBOM bom, size_t* count)
+{
+ wxCHECK_MSG( count , NULL, wxS("count pointer must be provided") );
+
+ switch ( bom )
+ {
+ case wxBOM_UTF32BE: *count = WXSIZEOF(BOM_UTF32BE); return BOM_UTF32BE;
+ case wxBOM_UTF32LE: *count = WXSIZEOF(BOM_UTF32LE); return BOM_UTF32LE;
+ case wxBOM_UTF16BE: *count = WXSIZEOF(BOM_UTF16BE); return BOM_UTF16BE;
+ case wxBOM_UTF16LE: *count = WXSIZEOF(BOM_UTF16LE); return BOM_UTF16LE;
+ case wxBOM_UTF8 : *count = WXSIZEOF(BOM_UTF8 ); return BOM_UTF8;
+ case wxBOM_Unknown: // these two values don't correspond to any byte sequence
+ case wxBOM_None:
+ wxFAIL_MSG( wxS("Invalid BOM type") );
+ return NULL;
+ }
+
+ wxFAIL_MSG( wxS("Unknown BOM type") ); // only reached if bom matched none of the cases above
+ return NULL;
+}
+
// Examines the first bytes of src, of which srcLen are available, and returns
// the kind of BOM they start with.
//
// The result is wxBOM_Unknown when the buffer is empty or when the bytes seen
// so far could still be the beginning of a BOM but are too few to decide, and
// wxBOM_None when the buffer definitely doesn't start with any of the BOMs
// checked here. At most the first 4 bytes of src are read.
+/* static */
+wxBOM wxConvAuto::DetectBOM(const char *src, size_t srcLen)
{
// examine the buffer for BOM presence
//
// with fewer than 4 bytes available the answer may be "can't tell yet"
switch ( srcLen )
{
case 0:
- return BOM_Unknown;
+ return wxBOM_Unknown;
case 1:
if ( src[0] == '\x00' || src[0] == '\xFF' ||
src[0] == '\xFE' || src[0] == '\xEF')
{
// this could be a BOM but we don't know yet
- return BOM_Unknown;
+ return wxBOM_Unknown;
}
break;
// NOTE(review): the case label(s) introducing the code below are not
// visible in this excerpt; as it reads src[1] and compares srcLen with 3,
// it presumably handles inputs of 2 and 3 bytes -- confirm
if ( src[0] == '\xEF' && src[1] == '\xBB' )
{
// first 2 bytes of the UTF-8 BOM: the third one, if present, decides
if ( srcLen == 3 )
- return src[2] == '\xBF' ? BOM_UTF8 : BOM_None;
+ return src[2] == '\xBF' ? wxBOM_UTF8 : wxBOM_None;
- return BOM_Unknown;
+ return wxBOM_Unknown;
}
// FE FF is not a prefix of any longer BOM checked in this function, so it
// can be reported immediately
if ( src[0] == '\xFE' && src[1] == '\xFF' )
- return BOM_UTF16BE;
+ return wxBOM_UTF16BE;
if ( src[0] == '\xFF' && src[1] == '\xFE' )
{
// if the next byte is 0, it could be an UTF-32LE BOM but if it
// isn't we can be sure it's UTF-16LE
if ( srcLen == 3 && src[2] != '\x00' )
- return BOM_UTF16LE;
+ return wxBOM_UTF16LE;
- return BOM_Unknown;
+ return wxBOM_Unknown;
}
if ( src[0] == '\x00' && src[1] == '\x00' )
{
- // this could only be UTF-32BE
- if ( srcLen == 3 && src[2] == '\xFE' )
- return BOM_Unknown;
- }
+ // this could only be UTF-32BE, check that the data we have so
+ // far allows for it
+ if ( srcLen == 3 && src[2] != '\xFE' )
+ return wxBOM_None;
+ return wxBOM_Unknown;
+ }
break;
default:
// we have at least 4 characters so we may finally decide whether
// we have a BOM or not
//
// the UTF-32LE test must come before the UTF-16LE one as both BOMs
// start with the same FF FE bytes
if ( src[0] == '\xEF' && src[1] == '\xBB' && src[2] == '\xBF' )
- return BOM_UTF8;
+ return wxBOM_UTF8;
if ( src[0] == '\x00' && src[1] == '\x00' &&
src[2] == '\xFE' && src[3] == '\xFF' )
- return BOM_UTF32BE;
+ return wxBOM_UTF32BE;
if ( src[0] == '\xFF' && src[1] == '\xFE' &&
src[2] == '\x00' && src[3] == '\x00' )
- return BOM_UTF32LE;
+ return wxBOM_UTF32LE;
if ( src[0] == '\xFE' && src[1] == '\xFF' )
- return BOM_UTF16BE;
+ return wxBOM_UTF16BE;
if ( src[0] == '\xFF' && src[1] == '\xFE' )
- return BOM_UTF16LE;
+ return wxBOM_UTF16LE;
}
// reached when a case above ended with "break" or nothing matched in the
// default case: the buffer doesn't start with a BOM
- return BOM_None;
+ return wxBOM_None;
}
-void wxConvAuto::InitFromBOM(BOMType bomType)
+void wxConvAuto::InitFromBOM(wxBOM bomType)
{
m_consumedBOM = false;
switch ( bomType )
{
- case BOM_Unknown:
+ case wxBOM_Unknown:
wxFAIL_MSG( "shouldn't be called for this BOM type" );
break;
- case BOM_None:
+ case wxBOM_None:
// use the default
break;
- case BOM_UTF32BE:
+ case wxBOM_UTF32BE:
m_conv = new wxMBConvUTF32BE;
m_ownsConv = true;
break;
- case BOM_UTF32LE:
+ case wxBOM_UTF32LE:
m_conv = new wxMBConvUTF32LE;
m_ownsConv = true;
break;
- case BOM_UTF16BE:
+ case wxBOM_UTF16BE:
m_conv = new wxMBConvUTF16BE;
m_ownsConv = true;
break;
- case BOM_UTF16LE:
+ case wxBOM_UTF16LE:
m_conv = new wxMBConvUTF16LE;
m_ownsConv = true;
break;
- case BOM_UTF8:
+ case wxBOM_UTF8:
InitWithUTF8();
break;
int ofs;
switch ( m_bomType )
{
- case BOM_Unknown:
+ case wxBOM_Unknown:
wxFAIL_MSG( "shouldn't be called for this BOM type" );
return;
- case BOM_None:
+ case wxBOM_None:
ofs = 0;
break;
- case BOM_UTF32BE:
- case BOM_UTF32LE:
+ case wxBOM_UTF32BE:
+ case wxBOM_UTF32LE:
ofs = 4;
break;
- case BOM_UTF16BE:
- case BOM_UTF16LE:
+ case wxBOM_UTF16BE:
+ case wxBOM_UTF16LE:
ofs = 2;
break;
- case BOM_UTF8:
+ case wxBOM_UTF8:
ofs = 3;
break;
*len -= ofs;
}
// Detects the BOM at the start of src and initializes the object from it.
//
// The result of DetectBOM() is stored in m_bomType. If it is the "unknown"
// value, false is returned and InitFromBOM() is not called; otherwise
// InitFromBOM() is called with the detected type and true is returned.
//
// In the new version src and len are taken by value, a len equal to wxNO_LEN
// is replaced with strlen(src) for the detection, and SkipBOM() is no longer
// called from here.
-bool wxConvAuto::InitFromInput(const char **src, size_t *len)
+bool wxConvAuto::InitFromInput(const char *src, size_t len)
{
- m_bomType = DetectBOM(*src, *len);
- if ( m_bomType == BOM_Unknown )
+ m_bomType = DetectBOM(src, len == wxNO_LEN ? strlen(src) : len);
+ if ( m_bomType == wxBOM_Unknown )
return false;
InitFromBOM(m_bomType);
- SkipBOM(src, len);
return true;
}
if ( !m_conv )
{
- if ( !self->InitFromInput(&src, &srcLen) )
+ if ( !self->InitFromInput(src, srcLen) )
{
// there is not enough data to determine whether we have a BOM or
// not, so fail for now -- the caller is supposed to call us again
return wxCONV_FAILED;
}
}
- else if ( !m_consumedBOM && dst )
+
+ if ( !m_consumedBOM )
{
SkipBOM(&src, &srcLen);
+ if ( srcLen == 0 )
+ {
+ // there is nothing left except the BOM so we'd return 0 below but
+ // this is unexpected: decoding a non-empty string must either fail
+ // or return something non-empty, in particular this would break
+ // the code in wxTextInputStream::NextChar()
+ //
+ // so still return an error as we need some more data to be able to
+ // decode it
+ return wxCONV_FAILED;
+ }
}
// try to convert using the auto-detected encoding
size_t rc = m_conv->ToWChar(dst, dstLen, src, srcLen);
- if ( rc == wxCONV_FAILED && m_bomType == BOM_None )
+ if ( rc == wxCONV_FAILED && m_bomType == wxBOM_None )
{
// if the conversion failed but we didn't really detect anything and
// simply tried UTF-8 by default, retry it using the fall-back
}
}
- if (rc != wxCONV_FAILED && dst && !m_consumedBOM)
+ // don't skip the BOM again the next time if we really consumed it
+ if ( rc != wxCONV_FAILED && dst && !m_consumedBOM )
self->m_consumedBOM = true;
+
return rc;
}
return m_conv->FromWChar(dst, dstLen, src, srcLen);
}
-
-#endif // wxUSE_WCHAR_T