// wxConvAuto: uses BOM to automatically detect input encoding
// ----------------------------------------------------------------------------
+// All currently recognized BOM (Byte Order Mark) values.
+enum wxBOM
+{
+ wxBOM_Unknown = -1, // BOM presence couldn't be determined (yet)
+ wxBOM_None, // no BOM at all
+ wxBOM_UTF32BE, // UTF-32 big endian BOM
+ wxBOM_UTF32LE, // UTF-32 little endian BOM
+ wxBOM_UTF16BE, // UTF-16 big endian BOM
+ wxBOM_UTF16LE, // UTF-16 little endian BOM
+ wxBOM_UTF8 // UTF-8 BOM
+};
+
class WXDLLIMPEXP_BASE wxConvAuto : public wxMBConv
{
public:
virtual wxMBConv *Clone() const { return new wxConvAuto(*this); }
-private:
- // all currently recognized BOM values
- enum BOMType
- {
- BOM_Unknown = -1,
- BOM_None,
- BOM_UTF32BE,
- BOM_UTF32LE,
- BOM_UTF16BE,
- BOM_UTF16LE,
- BOM_UTF8
- };
-
// return the BOM type of this buffer
- static BOMType DetectBOM(const char *src, size_t srcLen);
+ static wxBOM DetectBOM(const char *src, size_t srcLen);
+ wxBOM GetBOM() const
+ {
+ return m_bomType;
+ }
+
+private:
// common part of all ctors
void Init()
{
- // no need to initialize m_bomType and m_consumedBOM here, this will be
- // done when m_conv is created
+ // We don't initialize m_encDefault here as different ctors do it
+ // differently.
m_conv = NULL;
+ m_bomType = wxBOM_Unknown;
m_ownsConv = false;
+ m_consumedBOM = false;
}
// initialize m_conv with the UTF-8 conversion
}
// create the correct conversion object for the given BOM type
- void InitFromBOM(BOMType bomType);
+ void InitFromBOM(wxBOM bomType);
// create the correct conversion object for the BOM present in the
// beginning of the buffer
wxFontEncoding m_encDefault;
// our BOM type
- BOMType m_bomType;
+ wxBOM m_bomType;
// true if we allocated m_conv ourselves, false if we just use an existing
// global conversion
// Licence: wxWindows licence
/////////////////////////////////////////////////////////////////////////////
+/**
+ Constants representing various BOM types.
+
+ BOM is an abbreviation for "Byte Order Mark", a special Unicode character
+ which may be inserted into the beginning of a text stream to indicate its
+ encoding.
+
+ @since 2.9.3
+ */
+enum wxBOM
+{
+ /**
+ Unknown BOM.
+
+ This is returned if BOM presence couldn't be determined and normally
+ happens because not enough bytes of input have been analysed.
+ */
+ wxBOM_Unknown = -1,
+
+ /**
+ No BOM.
+
+ The stream doesn't contain a BOM character at all.
+ */
+ wxBOM_None,
+
+ /**
+ UTF-32 big endian BOM.
+
+ The stream is encoded in big endian variant of UTF-32.
+ */
+ wxBOM_UTF32BE,
+
+ /**
+ UTF-32 little endian BOM.
+
+ The stream is encoded in little endian variant of UTF-32.
+ */
+ wxBOM_UTF32LE,
+
+ /**
+ UTF-16 big endian BOM.
+
+ The stream is encoded in big endian variant of UTF-16.
+ */
+ wxBOM_UTF16BE,
+
+ /**
+ UTF-16 little endian BOM.
+
+ The stream is encoded in little endian variant of UTF-16.
+ */
+ wxBOM_UTF16LE,
+
+ /**
+ UTF-8 BOM.
+
+ The stream is encoded in UTF-8.
+
+ Notice that, contrary to popular belief, it's perfectly possible and,
+ in fact, common under Microsoft Windows systems, to have a BOM in a
+ UTF-8 stream: while it's not used to indicate the endianness of a UTF-8
+ stream (as it's byte-oriented), the BOM can still be useful just as an
+ unambiguous indicator of UTF-8 being used.
+ */
+ wxBOM_UTF8
+};
+
/**
@class wxConvAuto
*/
wxConvAuto(wxFontEncoding enc = wxFONTENCODING_DEFAULT);
+
+ /**
+ Return the detected BOM type.
+
+ The BOM type is detected after sufficiently many initial bytes have
+ passed through this conversion object, so this method always returns
+ wxBOM_Unknown immediately after the object's creation but may return
+ a different value later.
+
+ @since 2.9.3
+ */
+ wxBOM GetBOM() const;
+
/**
Disable the use of the fall back encoding: if the input doesn't have a
BOM and is not valid UTF-8, the conversion will fail.
@c wxFONTENCODING_DEFAULT can't be used here.
*/
static void SetFallbackEncoding(wxFontEncoding enc);
-};
+ /**
+ Return the BOM type of this buffer.
+
+ This is a helper function which is normally only used internally by
+ wxConvAuto but is provided for the convenience of code that wants to
+ detect the encoding of a stream by checking it for BOM presence on its
+ own.
+
+ @since 2.9.3
+ */
+ static wxBOM DetectBOM(const char *src, size_t srcLen);
+};
}
/* static */
-wxConvAuto::BOMType wxConvAuto::DetectBOM(const char *src, size_t srcLen)
+wxBOM wxConvAuto::DetectBOM(const char *src, size_t srcLen)
{
// examine the buffer for BOM presence
//
switch ( srcLen )
{
case 0:
- return BOM_Unknown;
+ return wxBOM_Unknown;
case 1:
if ( src[0] == '\x00' || src[0] == '\xFF' ||
src[0] == '\xFE' || src[0] == '\xEF')
{
// this could be a BOM but we don't know yet
- return BOM_Unknown;
+ return wxBOM_Unknown;
}
break;
if ( src[0] == '\xEF' && src[1] == '\xBB' )
{
if ( srcLen == 3 )
- return src[2] == '\xBF' ? BOM_UTF8 : BOM_None;
+ return src[2] == '\xBF' ? wxBOM_UTF8 : wxBOM_None;
- return BOM_Unknown;
+ return wxBOM_Unknown;
}
if ( src[0] == '\xFE' && src[1] == '\xFF' )
- return BOM_UTF16BE;
+ return wxBOM_UTF16BE;
if ( src[0] == '\xFF' && src[1] == '\xFE' )
{
// if the next byte is 0, it could be a UTF-32LE BOM but if it
// isn't we can be sure it's UTF-16LE
if ( srcLen == 3 && src[2] != '\x00' )
- return BOM_UTF16LE;
+ return wxBOM_UTF16LE;
- return BOM_Unknown;
+ return wxBOM_Unknown;
}
if ( src[0] == '\x00' && src[1] == '\x00' )
// this could only be UTF-32BE, check that the data we have so
// far allows for it
if ( srcLen == 3 && src[2] != '\xFE' )
- return BOM_None;
+ return wxBOM_None;
- return BOM_Unknown;
+ return wxBOM_Unknown;
}
break;
// we have at least 4 characters so we may finally decide whether
// we have a BOM or not
if ( src[0] == '\xEF' && src[1] == '\xBB' && src[2] == '\xBF' )
- return BOM_UTF8;
+ return wxBOM_UTF8;
if ( src[0] == '\x00' && src[1] == '\x00' &&
src[2] == '\xFE' && src[3] == '\xFF' )
- return BOM_UTF32BE;
+ return wxBOM_UTF32BE;
if ( src[0] == '\xFF' && src[1] == '\xFE' &&
src[2] == '\x00' && src[3] == '\x00' )
- return BOM_UTF32LE;
+ return wxBOM_UTF32LE;
if ( src[0] == '\xFE' && src[1] == '\xFF' )
- return BOM_UTF16BE;
+ return wxBOM_UTF16BE;
if ( src[0] == '\xFF' && src[1] == '\xFE' )
- return BOM_UTF16LE;
+ return wxBOM_UTF16LE;
}
- return BOM_None;
+ return wxBOM_None;
}
-void wxConvAuto::InitFromBOM(BOMType bomType)
+void wxConvAuto::InitFromBOM(wxBOM bomType)
{
m_consumedBOM = false;
switch ( bomType )
{
- case BOM_Unknown:
+ case wxBOM_Unknown:
wxFAIL_MSG( "shouldn't be called for this BOM type" );
break;
- case BOM_None:
+ case wxBOM_None:
// use the default
break;
- case BOM_UTF32BE:
+ case wxBOM_UTF32BE:
m_conv = new wxMBConvUTF32BE;
m_ownsConv = true;
break;
- case BOM_UTF32LE:
+ case wxBOM_UTF32LE:
m_conv = new wxMBConvUTF32LE;
m_ownsConv = true;
break;
- case BOM_UTF16BE:
+ case wxBOM_UTF16BE:
m_conv = new wxMBConvUTF16BE;
m_ownsConv = true;
break;
- case BOM_UTF16LE:
+ case wxBOM_UTF16LE:
m_conv = new wxMBConvUTF16LE;
m_ownsConv = true;
break;
- case BOM_UTF8:
+ case wxBOM_UTF8:
InitWithUTF8();
break;
int ofs;
switch ( m_bomType )
{
- case BOM_Unknown:
+ case wxBOM_Unknown:
wxFAIL_MSG( "shouldn't be called for this BOM type" );
return;
- case BOM_None:
+ case wxBOM_None:
ofs = 0;
break;
- case BOM_UTF32BE:
- case BOM_UTF32LE:
+ case wxBOM_UTF32BE:
+ case wxBOM_UTF32LE:
ofs = 4;
break;
- case BOM_UTF16BE:
- case BOM_UTF16LE:
+ case wxBOM_UTF16BE:
+ case wxBOM_UTF16LE:
ofs = 2;
break;
- case BOM_UTF8:
+ case wxBOM_UTF8:
ofs = 3;
break;
bool wxConvAuto::InitFromInput(const char *src, size_t len)
{
m_bomType = DetectBOM(src, len == wxNO_LEN ? strlen(src) : len);
- if ( m_bomType == BOM_Unknown )
+ if ( m_bomType == wxBOM_Unknown )
return false;
InitFromBOM(m_bomType);
// try to convert using the auto-detected encoding
size_t rc = m_conv->ToWChar(dst, dstLen, src, srcLen);
- if ( rc == wxCONV_FAILED && m_bomType == BOM_None )
+ if ( rc == wxCONV_FAILED && m_bomType == wxBOM_None )
{
// if the conversion failed but we didn't really detect anything and
// simply tried UTF-8 by default, retry it using the fall-back