Make BOM-detection code in wxConvAuto public.

author Vadim Zeitlin <vadim@wxwidgets.org>

Thu, 27 Oct 2011 22:48:54 +0000 (22:48 +0000)

committer Vadim Zeitlin <vadim@wxwidgets.org>

Thu, 27 Oct 2011 22:48:54 +0000 (22:48 +0000)
author Vadim Zeitlin <vadim@wxwidgets.org>
Thu, 27 Oct 2011 22:48:54 +0000 (22:48 +0000)
committer Vadim Zeitlin <vadim@wxwidgets.org>
Thu, 27 Oct 2011 22:48:54 +0000 (22:48 +0000)
diff --git a/include/wx/convauto.h b/include/wx/convauto.h

index bc514cea3b0d8675c4221bf35569302723316ce0..4c18dba601da5a275e12e4ef7f394e7a7795a60e 100644 (file)
--- a/include/wx/convauto.h
+++ b/include/wx/convauto.h
@@ -18,6 +18,18 @@
  // wxConvAuto: uses BOM to automatically detect input encoding
  // ----------------------------------------------------------------------------
  
  // wxConvAuto: uses BOM to automatically detect input encoding
  // ----------------------------------------------------------------------------
  
+// All currently recognized BOM values.
+enum wxBOM
+{
+    wxBOM_Unknown = -1,
+    wxBOM_None,
+    wxBOM_UTF32BE,
+    wxBOM_UTF32LE,
+    wxBOM_UTF16BE,
+    wxBOM_UTF16LE,
+    wxBOM_UTF8
+};
+
  class WXDLLIMPEXP_BASE wxConvAuto : public wxMBConv
  {
  public:
  class WXDLLIMPEXP_BASE wxConvAuto : public wxMBConv
  {
  public:
@@ -69,29 +81,24 @@ public:
  
      virtual wxMBConv *Clone() const { return new wxConvAuto(*this); }
  
  
      virtual wxMBConv *Clone() const { return new wxConvAuto(*this); }
  
-private:
-    // all currently recognized BOM values
-    enum BOMType
-    {
-        BOM_Unknown = -1,
-        BOM_None,
-        BOM_UTF32BE,
-        BOM_UTF32LE,
-        BOM_UTF16BE,
-        BOM_UTF16LE,
-        BOM_UTF8
-    };
-
      // return the BOM type of this buffer
      // return the BOM type of this buffer
-    static BOMType DetectBOM(const char *src, size_t srcLen);
+    static wxBOM DetectBOM(const char *src, size_t srcLen);
  
  
+    wxBOM GetBOM() const
+    {
+        return m_bomType;
+    }
+
+private:
      // common part of all ctors
      void Init()
      {
      // common part of all ctors
      void Init()
      {
-        // no need to initialize m_bomType and m_consumedBOM here, this will be
-        // done when m_conv is created
+        // We don't initialize m_encDefault here as different ctors do it
+        // differently.
          m_conv = NULL;
          m_conv = NULL;
+        m_bomType = wxBOM_Unknown;
          m_ownsConv = false;
          m_ownsConv = false;
+        m_consumedBOM = false;
      }
  
      // initialize m_conv with the UTF-8 conversion
      }
  
      // initialize m_conv with the UTF-8 conversion
@@ -102,7 +109,7 @@ private:
      }
  
      // create the correct conversion object for the given BOM type
      }
  
      // create the correct conversion object for the given BOM type
-    void InitFromBOM(BOMType bomType);
+    void InitFromBOM(wxBOM bomType);
  
      // create the correct conversion object for the BOM present in the
      // beginning of the buffer
  
      // create the correct conversion object for the BOM present in the
      // beginning of the buffer
@@ -128,7 +135,7 @@ private:
      wxFontEncoding m_encDefault;
  
      // our BOM type
      wxFontEncoding m_encDefault;
  
      // our BOM type
-    BOMType m_bomType;
+    wxBOM m_bomType;
  
      // true if we allocated m_conv ourselves, false if we just use an existing
      // global conversion
  
      // true if we allocated m_conv ourselves, false if we just use an existing
      // global conversion
diff --git a/interface/wx/convauto.h b/interface/wx/convauto.h

index fc8f1987a2ad3c4f5dc94223f0413dc956a9b2e1..715d06c279e09daff2b8518c7f38917eb1ef9259 100644 (file)
--- a/interface/wx/convauto.h
+++ b/interface/wx/convauto.h
@@ -6,6 +6,74 @@
  // Licence:     wxWindows licence
  /////////////////////////////////////////////////////////////////////////////
  
  // Licence:     wxWindows licence
  /////////////////////////////////////////////////////////////////////////////
  
+/**
+    Constants representing various BOM types.
+
+    BOM is an abbreviation for "Byte Order Mark", a special Unicode character
+    which may be inserted into the beginning of a text stream to indicate its
+    encoding.
+
+    @since 2.9.3
+ */
+enum wxBOM
+{
+    /**
+        Unknown BOM.
+
+        This is returned if BOM presence couldn't be determined and normally
+        happens because not enough bytes of input have been analysed.
+     */
+    wxBOM_Unknown = -1,
+
+    /**
+        No BOM.
+
+        The stream doesn't contain a BOM character at all.
+     */
+    wxBOM_None,
+
+    /**
+        UTF-32 big endian BOM.
+
+        The stream is encoded in big endian variant of UTF-32.
+     */
+    wxBOM_UTF32BE,
+
+    /**
+        UTF-32 little endian BOM.
+
+        The stream is encoded in little endian variant of UTF-32.
+     */
+    wxBOM_UTF32LE,
+
+    /**
+        UTF-16 big endian BOM.
+
+        The stream is encoded in big endian variant of UTF-16.
+     */
+    wxBOM_UTF16BE,
+
+    /**
+        UTF-16 little endian BOM.
+
+        The stream is encoded in little endian variant of UTF-16.
+     */
+    wxBOM_UTF16LE,
+
+    /**
+        UTF-8 BOM.
+
+        The stream is encoded in UTF-8.
+
+        Notice that contrary to a popular belief, it's perfectly possible and,
+        in fact, common under Microsoft Windows systems, to have a BOM in an
+        UTF-8 stream: while it's not used to indicate the endianness of UTF-8
+        stream (as it's byte-oriented), the BOM can still be useful just as an
+        unambiguous indicator of UTF-8 being used.
+     */
+    wxBOM_UTF8
+};
+
  /**
      @class wxConvAuto
  
  /**
      @class wxConvAuto
  
@@ -66,6 +134,19 @@ public:
      */
      wxConvAuto(wxFontEncoding enc = wxFONTENCODING_DEFAULT);
  
      */
      wxConvAuto(wxFontEncoding enc = wxFONTENCODING_DEFAULT);
  
+
+    /**
+        Return the detected BOM type.
+
+        The BOM type is detected after sufficiently many initial bytes have
+        passed through this conversion object so it will always return
+        wxBOM_Unknown immediately after the object creation but may return a
+        different value later.
+
+        @since 2.9.3
+    */
+    wxBOM GetBOM() const;
+
      /**
          Disable the use of the fall back encoding: if the input doesn't have a
          BOM and is not valid UTF-8, the conversion will fail.
      /**
          Disable the use of the fall back encoding: if the input doesn't have a
          BOM and is not valid UTF-8, the conversion will fail.
@@ -92,5 +173,16 @@ public:
          @c wxFONTENCODING_DEFAULT can't be used here.
      */
      static void SetFallbackEncoding(wxFontEncoding enc);
          @c wxFONTENCODING_DEFAULT can't be used here.
      */
      static void SetFallbackEncoding(wxFontEncoding enc);
-};
  
  
+    /**
+        Return the BOM type of this buffer.
+
+        This is a helper function which is normally only used internally by
+        wxConvAuto but provided for convenience of the code that wants to
+        detect the encoding of a stream by checking it for BOM presence on its
+        own.
+
+        @since 2.9.3
+    */
+    static wxBOM DetectBOM(const char *src, size_t srcLen);
+};
diff --git a/src/common/convauto.cpp b/src/common/convauto.cpp

index 8620d4e02efa5f533d09a78f4bb3a144144202d6..7480754bb627eda76f5ea0ea49985fac1312d95f 100644 (file)
--- a/src/common/convauto.cpp
+++ b/src/common/convauto.cpp
@@ -45,7 +45,7 @@ void wxConvAuto::SetFallbackEncoding(wxFontEncoding enc)
  }
  
  /* static */
  }
  
  /* static */
-wxConvAuto::BOMType wxConvAuto::DetectBOM(const char *src, size_t srcLen)
+wxBOM wxConvAuto::DetectBOM(const char *src, size_t srcLen)
  {
      // examine the buffer for BOM presence
      //
  {
      // examine the buffer for BOM presence
      //
@@ -65,14 +65,14 @@ wxConvAuto::BOMType wxConvAuto::DetectBOM(const char *src, size_t srcLen)
      switch ( srcLen )
      {
          case 0:
      switch ( srcLen )
      {
          case 0:
-            return BOM_Unknown;
+            return wxBOM_Unknown;
  
          case 1:
              if ( src[0] == '\x00' || src[0] == '\xFF' ||
                   src[0] == '\xFE' || src[0] == '\xEF')
              {
                  // this could be a BOM but we don't know yet
  
          case 1:
              if ( src[0] == '\x00' || src[0] == '\xFF' ||
                   src[0] == '\xFE' || src[0] == '\xEF')
              {
                  // this could be a BOM but we don't know yet
-                return BOM_Unknown;
+                return wxBOM_Unknown;
              }
              break;
  
              }
              break;
  
@@ -81,22 +81,22 @@ wxConvAuto::BOMType wxConvAuto::DetectBOM(const char *src, size_t srcLen)
              if ( src[0] == '\xEF' && src[1] == '\xBB' )
              {
                  if ( srcLen == 3 )
              if ( src[0] == '\xEF' && src[1] == '\xBB' )
              {
                  if ( srcLen == 3 )
-                    return src[2] == '\xBF' ? BOM_UTF8 : BOM_None;
+                    return src[2] == '\xBF' ? wxBOM_UTF8 : wxBOM_None;
  
  
-                return BOM_Unknown;
+                return wxBOM_Unknown;
              }
  
              if ( src[0] == '\xFE' && src[1] == '\xFF' )
              }
  
              if ( src[0] == '\xFE' && src[1] == '\xFF' )
-                return BOM_UTF16BE;
+                return wxBOM_UTF16BE;
  
              if ( src[0] == '\xFF' && src[1] == '\xFE' )
              {
                  // if the next byte is 0, it could be an UTF-32LE BOM but if it
                  // isn't we can be sure it's UTF-16LE
                  if ( srcLen == 3 && src[2] != '\x00' )
  
              if ( src[0] == '\xFF' && src[1] == '\xFE' )
              {
                  // if the next byte is 0, it could be an UTF-32LE BOM but if it
                  // isn't we can be sure it's UTF-16LE
                  if ( srcLen == 3 && src[2] != '\x00' )
-                    return BOM_UTF16LE;
+                    return wxBOM_UTF16LE;
  
  
-                return BOM_Unknown;
+                return wxBOM_Unknown;
              }
  
              if ( src[0] == '\x00' && src[1] == '\x00' )
              }
  
              if ( src[0] == '\x00' && src[1] == '\x00' )
@@ -104,9 +104,9 @@ wxConvAuto::BOMType wxConvAuto::DetectBOM(const char *src, size_t srcLen)
                  // this could only be UTF-32BE, check that the data we have so
                  // far allows for it
                  if ( srcLen == 3 && src[2] != '\xFE' )
                  // this could only be UTF-32BE, check that the data we have so
                  // far allows for it
                  if ( srcLen == 3 && src[2] != '\xFE' )
-                    return BOM_None;
+                    return wxBOM_None;
  
  
-                return BOM_Unknown;
+                return wxBOM_Unknown;
              }
              break;
  
              }
              break;
  
@@ -114,61 +114,61 @@ wxConvAuto::BOMType wxConvAuto::DetectBOM(const char *src, size_t srcLen)
              // we have at least 4 characters so we may finally decide whether
              // we have a BOM or not
              if ( src[0] == '\xEF' && src[1] == '\xBB' && src[2] == '\xBF' )
              // we have at least 4 characters so we may finally decide whether
              // we have a BOM or not
              if ( src[0] == '\xEF' && src[1] == '\xBB' && src[2] == '\xBF' )
-                return BOM_UTF8;
+                return wxBOM_UTF8;
  
              if ( src[0] == '\x00' && src[1] == '\x00' &&
                   src[2] == '\xFE' && src[3] == '\xFF' )
  
              if ( src[0] == '\x00' && src[1] == '\x00' &&
                   src[2] == '\xFE' && src[3] == '\xFF' )
-                return BOM_UTF32BE;
+                return wxBOM_UTF32BE;
  
              if ( src[0] == '\xFF' && src[1] == '\xFE' &&
                   src[2] == '\x00' && src[3] == '\x00' )
  
              if ( src[0] == '\xFF' && src[1] == '\xFE' &&
                   src[2] == '\x00' && src[3] == '\x00' )
-                return BOM_UTF32LE;
+                return wxBOM_UTF32LE;
  
              if ( src[0] == '\xFE' && src[1] == '\xFF' )
  
              if ( src[0] == '\xFE' && src[1] == '\xFF' )
-                return BOM_UTF16BE;
+                return wxBOM_UTF16BE;
  
              if ( src[0] == '\xFF' && src[1] == '\xFE' )
  
              if ( src[0] == '\xFF' && src[1] == '\xFE' )
-                return BOM_UTF16LE;
+                return wxBOM_UTF16LE;
      }
  
      }
  
-    return BOM_None;
+    return wxBOM_None;
  }
  
  }
  
-void wxConvAuto::InitFromBOM(BOMType bomType)
+void wxConvAuto::InitFromBOM(wxBOM bomType)
  {
      m_consumedBOM = false;
  
      switch ( bomType )
      {
  {
      m_consumedBOM = false;
  
      switch ( bomType )
      {
-        case BOM_Unknown:
+        case wxBOM_Unknown:
              wxFAIL_MSG( "shouldn't be called for this BOM type" );
              break;
  
              wxFAIL_MSG( "shouldn't be called for this BOM type" );
              break;
  
-        case BOM_None:
+        case wxBOM_None:
              // use the default
              break;
  
              // use the default
              break;
  
-        case BOM_UTF32BE:
+        case wxBOM_UTF32BE:
              m_conv = new wxMBConvUTF32BE;
              m_ownsConv = true;
              break;
  
              m_conv = new wxMBConvUTF32BE;
              m_ownsConv = true;
              break;
  
-        case BOM_UTF32LE:
+        case wxBOM_UTF32LE:
              m_conv = new wxMBConvUTF32LE;
              m_ownsConv = true;
              break;
  
              m_conv = new wxMBConvUTF32LE;
              m_ownsConv = true;
              break;
  
-        case BOM_UTF16BE:
+        case wxBOM_UTF16BE:
              m_conv = new wxMBConvUTF16BE;
              m_ownsConv = true;
              break;
  
              m_conv = new wxMBConvUTF16BE;
              m_ownsConv = true;
              break;
  
-        case BOM_UTF16LE:
+        case wxBOM_UTF16LE:
              m_conv = new wxMBConvUTF16LE;
              m_ownsConv = true;
              break;
  
              m_conv = new wxMBConvUTF16LE;
              m_ownsConv = true;
              break;
  
-        case BOM_UTF8:
+        case wxBOM_UTF8:
              InitWithUTF8();
              break;
  
              InitWithUTF8();
              break;
  
@@ -191,25 +191,25 @@ void wxConvAuto::SkipBOM(const char **src, size_t *len) const
      int ofs;
      switch ( m_bomType )
      {
      int ofs;
      switch ( m_bomType )
      {
-        case BOM_Unknown:
+        case wxBOM_Unknown:
              wxFAIL_MSG( "shouldn't be called for this BOM type" );
              return;
  
              wxFAIL_MSG( "shouldn't be called for this BOM type" );
              return;
  
-        case BOM_None:
+        case wxBOM_None:
              ofs = 0;
              break;
  
              ofs = 0;
              break;
  
-        case BOM_UTF32BE:
-        case BOM_UTF32LE:
+        case wxBOM_UTF32BE:
+        case wxBOM_UTF32LE:
              ofs = 4;
              break;
  
              ofs = 4;
              break;
  
-        case BOM_UTF16BE:
-        case BOM_UTF16LE:
+        case wxBOM_UTF16BE:
+        case wxBOM_UTF16LE:
              ofs = 2;
              break;
  
              ofs = 2;
              break;
  
-        case BOM_UTF8:
+        case wxBOM_UTF8:
              ofs = 3;
              break;
  
              ofs = 3;
              break;
  
@@ -226,7 +226,7 @@ void wxConvAuto::SkipBOM(const char **src, size_t *len) const
  bool wxConvAuto::InitFromInput(const char *src, size_t len)
  {
      m_bomType = DetectBOM(src, len == wxNO_LEN ? strlen(src) : len);
  bool wxConvAuto::InitFromInput(const char *src, size_t len)
  {
      m_bomType = DetectBOM(src, len == wxNO_LEN ? strlen(src) : len);
-    if ( m_bomType == BOM_Unknown )
+    if ( m_bomType == wxBOM_Unknown )
          return false;
  
      InitFromBOM(m_bomType);
          return false;
  
      InitFromBOM(m_bomType);
@@ -275,7 +275,7 @@ wxConvAuto::ToWChar(wchar_t *dst, size_t dstLen,
  
      // try to convert using the auto-detected encoding
      size_t rc = m_conv->ToWChar(dst, dstLen, src, srcLen);
  
      // try to convert using the auto-detected encoding
      size_t rc = m_conv->ToWChar(dst, dstLen, src, srcLen);
-    if ( rc == wxCONV_FAILED && m_bomType == BOM_None )
+    if ( rc == wxCONV_FAILED && m_bomType == wxBOM_None )
      {
          // if the conversion failed but we didn't really detect anything and
          // simply tried UTF-8 by default, retry it using the fall-back
      {
          // if the conversion failed but we didn't really detect anything and
          // simply tried UTF-8 by default, retry it using the fall-back
author	Vadim Zeitlin <vadim@wxwidgets.org>
	Thu, 27 Oct 2011 22:48:54 +0000 (22:48 +0000)
committer	Vadim Zeitlin <vadim@wxwidgets.org>
	Thu, 27 Oct 2011 22:48:54 +0000 (22:48 +0000)
include/wx/convauto.h		patch \| blob \| blame \| history
interface/wx/convauto.h		patch \| blob \| blame \| history
src/common/convauto.cpp		patch \| blob \| blame \| history