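replaced recently added wxMBConv::GetMBNul() with a less clever but better [...] GetMinMBCharWidth() [subject line truncated in this capture; replacement name taken from the diff]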

author Vadim Zeitlin <vadim@wxwidgets.org>

Sun, 2 Apr 2006 14:59:53 +0000 (14:59 +0000)

committer Vadim Zeitlin <vadim@wxwidgets.org>

Sun, 2 Apr 2006 14:59:53 +0000 (14:59 +0000)
author Vadim Zeitlin <vadim@wxwidgets.org>
Sun, 2 Apr 2006 14:59:53 +0000 (14:59 +0000)
committer Vadim Zeitlin <vadim@wxwidgets.org>
Sun, 2 Apr 2006 14:59:53 +0000 (14:59 +0000)
diff --git a/include/wx/strconv.h b/include/wx/strconv.h

index 53b74ca6ccd2c5faad32db8b14d49386d3d03a38..32a96111758f9d5d3c17212c1ef03eaacb949222 100644 (file)
--- a/include/wx/strconv.h
+++ b/include/wx/strconv.h
@@ -85,15 +85,18 @@ public:
      virtual ~wxMBConv();
  
  private:
-    // this function must return the multibyte representation of L'\0'
+    // this function is used in the implementation of cMB2WC() to distinguish
+    // between the following cases:
      //
-    // on error, nulLen should be set to -1
-    virtual const char *GetMBNul(size_t *nulLen) const
-    {
-        *nulLen = 1;
-
-        return "";
-    }
+    //      a) var width encoding with strings terminated by a single NUL
+    //         (usual multibyte encodings): return 1 in this case
+    //      b) fixed width encoding with 2 bytes/char and so terminated by
+    //         2 NULs (UTF-16/UCS-2 and variants): return 2 in this case
+    //      c) fixed width encoding with 4 bytes/char and so terminated by
+    //         4 NULs (UTF-32/UCS-4 and variants): return 4 in this case
+    //
+    // anything else is not supported currently and -1 should be returned
+    virtual size_t GetMinMBCharWidth() const { return 1; }
  };
  
  // ----------------------------------------------------------------------------
@@ -134,10 +137,10 @@ public:
      }
  
  private:
-    virtual const char *GetMBNul(size_t *nulLen) const
+    virtual size_t GetMinMBCharWidth() const
      {
          // cast needed to call a private function
-        return ((wxConvBrokenFileNames *)m_conv)->GetMBNul(nulLen);
+        return ((wxConvBrokenFileNames *)m_conv)->GetMinMBCharWidth();
      }
  
  
@@ -186,11 +189,7 @@ private:
  class WXDLLIMPEXP_BASE wxMBConvUTF16Base : public wxMBConv
  {
  private:
-    virtual const char *GetMBNul(size_t *nulLen) const
-    {
-        *nulLen = 2;
-        return "\0";
-    }
+    virtual size_t GetMinMBCharWidth() const { return 2; }
  };
  
  // ----------------------------------------------------------------------------
@@ -222,11 +221,7 @@ public:
  class WXDLLIMPEXP_BASE wxMBConvUTF32Base : public wxMBConv
  {
  private:
-    virtual const char *GetMBNul(size_t *nulLen) const
-    {
-        *nulLen = 4;
-        return "\0\0\0";
-    }
+    virtual size_t GetMinMBCharWidth() const { return 4; }
  };
  
  // ----------------------------------------------------------------------------
@@ -289,7 +284,7 @@ private:
      // charset string
      void SetName(const wxChar *charset);
  
-    virtual const char *GetMBNul(size_t *nulLen) const;
+    virtual size_t GetMinMBCharWidth() const;
  
  
      // note that we can't use wxString here because of compilation
diff --git a/src/common/strconv.cpp b/src/common/strconv.cpp

index e50eddf3f93b5f796aa7bc6185b2a623229a52e4..58e73178d68f5528f09c90b1a85331efc3a16427 100644 (file)
--- a/src/common/strconv.cpp
+++ b/src/common/strconv.cpp
@@ -187,6 +187,15 @@ const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
      return buf;
  }
  
+// helper of cMB2WC(): check if n bytes at this location are all NUL
+static bool NotAllNULs(const char *p, size_t n)
+{
+    while ( n && *p++ == '\0' )
+        n--;
+
+    return n != 0;
+}
+
  const wxWCharBuffer
  wxMBConv::cMB2WC(const char *in, size_t inLen, size_t *outLen) const
  {
@@ -196,87 +205,108 @@ wxMBConv::cMB2WC(const char *in, size_t inLen, size_t *outLen) const
      // the current length of wbuf
      size_t lenBuf = 0;
  
-    // we need to know the representation of L'\0' for this conversion
-    size_t nulLen;
-    const char * const nul = GetMBNul(&nulLen);
-    if ( nulLen == (size_t)-1 || nulLen == 0 )
-        return wxWCharBuffer();
+    // the number of NULs terminating this string
+    size_t nulLen   wxDUMMY_INITIALIZE(0);
  
      // make a copy of the input string unless it is already properly
      // NUL-terminated
      wxCharBuffer bufTmp;
  
-    // now we can compute the input size if we were not given it: notice that
-    // in this case the string must be properly NUL-terminated, of course, as
-    // otherwise we have no way of knowing how long it is
-    if ( inLen == (size_t)-1 )
+    // if we were not given the input size we just have to assume that the
+    // string is properly terminated as we have no way of knowing how long it
+    // is anyhow, but if we do have the size check whether there are enough
+    // NULs at the end
+    if ( inLen != (size_t)-1 )
      {
-        // not the most efficient algorithm but it shouldn't matter as normally
-        // there are not many NULs in the string and so normally memcmp()
-        // should stop on the first character
-        const char *p = in;
-        while ( memcmp(p, nul, nulLen) != 0 )
-            p++;
+        // we need to know how to find the end of this string
+        nulLen = GetMinMBCharWidth();
+        if ( nulLen == (size_t)-1 )
+            return wbuf;
  
-        inLen = p - in + nulLen;
-    }
-    else // we already have the size
-    {
-        // check if it's not already NUL-terminated too to avoid the copy
-        if ( inLen < nulLen || memcmp(in + inLen - nulLen, nul, nulLen) != 0 )
+        // if there are enough NULs we can avoid the copy
+        if ( inLen < nulLen || NotAllNULs(in + inLen - nulLen, nulLen) )
          {
              // make a copy in order to properly NUL-terminate the string
              bufTmp = wxCharBuffer(inLen + nulLen - 1 /* 1 will be added */);
-            memcpy(bufTmp.data(), in, inLen);
-            memcpy(bufTmp.data() + inLen, nul, nulLen);
+            char * const p = bufTmp.data();
+            memcpy(p, in, inLen);
+            for ( char *s = p + inLen; s < p + inLen + nulLen; s++ )
+                *s = '\0';
          }
      }
  
      if ( bufTmp )
          in = bufTmp;
  
+    size_t lenChunk;
      for ( const char * const inEnd = in + inLen;; )
      {
-        // try to convert the current chunk if anything left
-        size_t lenChunk = in < inEnd ? MB2WC(NULL, in, 0) : 0;
+        // try to convert the current chunk
+        lenChunk = MB2WC(NULL, in, 0);
          if ( lenChunk == 0 )
          {
              // nothing left in the input string, conversion succeeded
-            if ( outLen )
-            {
-                // we shouldn't include the last NUL in the result length
-                *outLen = lenBuf ? lenBuf - 1 : 0;
-            }
-
-            return wbuf;
+            break;
          }
  
          if ( lenChunk == (size_t)-1 )
              break;
  
+        // if we already have a previous chunk, leave the NUL separating it
+        // from this one
+        if ( lenBuf )
+            lenBuf++;
+
          const size_t lenBufNew = lenBuf + lenChunk;
          if ( !wbuf.extend(lenBufNew) )
+        {
+            lenChunk = (size_t)-1;
              break;
+        }
  
          lenChunk = MB2WC(wbuf.data() + lenBuf, in, lenChunk + 1 /* for NUL */);
          if ( lenChunk == (size_t)-1 )
              break;
  
-        // +! for the embedded NUL (if something follows)
-        lenBuf = lenBufNew + 1;
+        lenBuf = lenBufNew;
+
+        if ( inLen == (size_t)-1 )
+        {
+            // convert only one chunk in this case, as we suppose that the
+            // string is NUL-terminated and so inEnd is not used at all
+            break;
+        }
  
          // advance the input pointer past the end of this chunk
-        while ( memcmp(in, nul, nulLen) != 0 )
-            in++;
+        while ( NotAllNULs(in, nulLen) )
+        {
+            // notice that we must skip over multiple bytes here as we suppose
+            // that if NUL takes 2 or 4 bytes, then all the other characters do
+            // too and so if advanced by a single byte we might erroneously
+            // detect sequences of NUL bytes in the middle of the input
+            in += nulLen;
+        }
  
          in += nulLen; // skipping over its terminator as well
+
+        // note that ">=" (and not just "==") is needed here as the terminator
+        // we skipped just above could be inside or just after the buffer
+        // delimited by inEnd
+        if ( in >= inEnd )
+            break;
+    }
+
+    if ( lenChunk == (size_t)-1 )
+    {
+        // conversion failed
+        lenBuf = 0;
+        wbuf.reset();
      }
  
-    // conversion failed
      if ( outLen )
-        *outLen = 0;
+        *outLen = lenBuf;
  
-    return wxWCharBuffer();
+    return wbuf;
  }
  
  const wxCharBuffer
@@ -1352,7 +1382,9 @@ protected:
  #endif
  
  private:
-    virtual const char *GetMBNul(size_t *nulLen) const;
+    // classify this encoding as explained in wxMBConv::GetMinMBCharWidth()
+    // comment
+    virtual size_t GetMinMBCharWidth() const;
  
      // the name (for iconv_open()) of a wide char charset -- if none is
      // available on this machine, it will remain NULL
@@ -1362,9 +1394,9 @@ private:
      // different endian-ness than the native one
      static bool ms_wcNeedsSwap;
  
-    // NUL representation
-    size_t m_nulLen;
-    char m_nulBuf[8];
+    // cached result of GetMinMBCharWidth(); set to 0 meaning "unknown"
+    // initially
+    size_t m_minMBCharWidth;
  };
  
  // make the constructor available for unit testing
@@ -1384,7 +1416,7 @@ bool wxMBConv_iconv::ms_wcNeedsSwap = false;
  
  wxMBConv_iconv::wxMBConv_iconv(const wxChar *name)
  {
-    m_nulLen = (size_t)-2;
+    m_minMBCharWidth = 0;
  
      // iconv operates with chars, not wxChars, but luckily it uses only ASCII
      // names for the charsets
@@ -1642,9 +1674,9 @@ size_t wxMBConv_iconv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
      return res;
  }
  
-const char *wxMBConv_iconv::GetMBNul(size_t *nulLen) const
+size_t wxMBConv_iconv::GetMinMBCharWidth() const
  {
-    if ( m_nulLen == (size_t)-2 )
+    if ( m_minMBCharWidth == 0 )
      {
          wxMBConv_iconv * const self = wxConstCast(this, wxMBConv_iconv);
  
@@ -1654,22 +1686,22 @@ const char *wxMBConv_iconv::GetMBNul(size_t *nulLen) const
  #endif
  
          wchar_t *wnul = L"";
+        char buf[8]; // should be enough for NUL in any encoding
          size_t inLen = sizeof(wchar_t),
-               outLen = WXSIZEOF(m_nulBuf);
+               outLen = WXSIZEOF(buf);
          const char *in = (char *)wnul;
-        char *out = self->m_nulBuf;
+        char *out = buf;
          if ( iconv(w2m, &in, &inLen, &out, &outLen) == (size_t)-1 )
          {
-            self->m_nulLen = (size_t)-1;
+            self->m_minMBCharWidth = (size_t)-1;
          }
          else // ok
          {
-            self->m_nulLen = out - m_nulBuf;
+            self->m_minMBCharWidth = out - buf;
          }
      }
  
-    *nulLen = m_nulLen;
-    return m_nulBuf;
+    return m_minMBCharWidth;
  }
  
  #endif // HAVE_ICONV
@@ -1693,20 +1725,20 @@ public:
      wxMBConv_win32()
      {
          m_CodePage = CP_ACP;
-        m_nulLen = (size_t)-2;
+        m_minMBCharWidth = 0;
      }
  
  #if wxUSE_FONTMAP
      wxMBConv_win32(const wxChar* name)
      {
          m_CodePage = wxCharsetToCodepage(name);
-        m_nulLen = (size_t)-2;
+        m_minMBCharWidth = 0;
      }
  
      wxMBConv_win32(wxFontEncoding encoding)
      {
          m_CodePage = wxEncodingToCodepage(encoding);
-        m_nulLen = (size_t)-2;
+        m_minMBCharWidth = 0;
      }
  #endif // wxUSE_FONTMAP
  
@@ -1933,35 +1965,50 @@ private:
  #endif
      }
  
-    virtual const char *GetMBNul(size_t *nulLen) const
+    virtual size_t GetMinMBCharWidth() const
      {
-        if ( m_nulLen == (size_t)-2 )
+        if ( m_minMBCharWidth == 0 )
          {
+            int len = ::WideCharToMultiByte
+                        (
+                            m_CodePage,     // code page
+                            0,              // no flags
+                            L"",            // input string
+                            1,              // translate just the NUL
+                            NULL,           // output buffer
+                            0,              // and its size
+                            NULL,           // no replacement char
+                            NULL            // [out] don't care if it was used
+                        );
+
              wxMBConv_win32 * const self = wxConstCast(this, wxMBConv_win32);
+            switch ( len )
+            {
+                default:
+                    wxLogDebug(_T("Unexpected NUL length %d"), len);
+                    // fall through
  
-            self->m_nulLen = ::WideCharToMultiByte
-                               (
-                                    m_CodePage,         // code page
-                                    0,                  // no flags
-                                    L"",                // input string
-                                    1,                  // translate just NUL
-                                    self->m_nulBuf,     // output buffer
-                                    WXSIZEOF(m_nulBuf), // and its size
-                                    NULL,               // "replacement" char
-                                    NULL                // [out] was it used?
-                               );
+                case 0:
+                    self->m_minMBCharWidth = (size_t)-1;
+                    break;
  
-            if ( m_nulLen == 0 )
-                self->m_nulLen = (size_t)-1;
+                case 1:
+                case 2:
+                case 4:
+                    self->m_minMBCharWidth = len;
+                    break;
+            }
          }
  
-        *nulLen = m_nulLen;
-        return m_nulBuf;
+        return m_minMBCharWidth;
      }
  
+    // the code page we're working with
      long m_CodePage;
-    size_t m_nulLen;
-    char m_nulBuf[8];
+
+    // cached result of GetMinMBCharWidth(), set to 0 initially meaning
+    // "unknown"
+    size_t m_minMBCharWidth;
  };
  
  #endif // wxHAVE_WIN32_MB2WC
@@ -2602,23 +2649,20 @@ public:
      wxEncodingConverter m2w, w2m;
  
  private:
-    virtual const char *GetMBNul(size_t *nulLen) const
+    virtual size_t GetMinMBCharWidth() const
      {
          switch ( m_enc )
          {
              case wxFONTENCODING_UTF16BE:
              case wxFONTENCODING_UTF16LE:
-                *nulLen = 2;
-                return "\0";
+                return 2;
  
              case wxFONTENCODING_UTF32BE:
              case wxFONTENCODING_UTF32LE:
-                *nulLen = 4;
-                return "\0\0\0";
+                return 4;
  
              default:
-                *nulLen = 1;
-                return "";
+                return 1;
          }
      }
  
@@ -3014,18 +3058,17 @@ size_t wxCSConv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
      return len;
  }
  
-const char *wxCSConv::GetMBNul(size_t *nulLen) const
+size_t wxCSConv::GetMinMBCharWidth() const
  {
      CreateConvIfNeeded();
  
      if ( m_convReal )
      {
          // cast needed just to call private function of m_convReal
-        return ((wxCSConv *)m_convReal)->GetMBNul(nulLen);
+        return ((wxCSConv *)m_convReal)->GetMinMBCharWidth();
      }
  
-    *nulLen = 1;
-    return "";
+    return 1;
  }
  
  // ----------------------------------------------------------------------------
author	Vadim Zeitlin <vadim@wxwidgets.org>
	Sun, 2 Apr 2006 14:59:53 +0000 (14:59 +0000)
committer	Vadim Zeitlin <vadim@wxwidgets.org>
	Sun, 2 Apr 2006 14:59:53 +0000 (14:59 +0000)
include/wx/strconv.h		patch | blob | blame | history
src/common/strconv.cpp		patch | blob | blame | history