64 bit clean implementation

[wxWidgets.git] / src / common / strconv.cpp
diff --git a/src/common/strconv.cpp b/src/common/strconv.cpp

index f2364b7fc3f59994061dcd95eb97e12a94f8640d..3e354f466b668ab41e01c581f0bea6af1df5ac2a 100644 (file)
--- a/src/common/strconv.cpp
+++ b/src/common/strconv.cpp
@@ -391,7 +391,11 @@ wxMBConv::cMB2WC(const char *inBuff, size_t inLen, size_t *outLen) const
      const size_t dstLen = ToWChar(NULL, 0, inBuff, inLen);
      if ( dstLen != wxCONV_FAILED )
      {
      const size_t dstLen = ToWChar(NULL, 0, inBuff, inLen);
      if ( dstLen != wxCONV_FAILED )
      {
-        wxWCharBuffer wbuf(dstLen - 1);
+        // notice that we allocate space for dstLen+1 wide characters here
+        // because we want the buffer to always be NUL-terminated, even if the
+        // input isn't (as otherwise the caller has no way to know its length)
+        wxWCharBuffer wbuf(dstLen);
+        wbuf.data()[dstLen - 1] = L'\0';
          if ( ToWChar(wbuf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
          {
              if ( outLen )
          if ( ToWChar(wbuf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
          {
              if ( outLen )
@@ -417,16 +421,18 @@ wxMBConv::cWC2MB(const wchar_t *inBuff, size_t inLen, size_t *outLen) const
      size_t dstLen = FromWChar(NULL, 0, inBuff, inLen);
      if ( dstLen != wxCONV_FAILED )
      {
      size_t dstLen = FromWChar(NULL, 0, inBuff, inLen);
      if ( dstLen != wxCONV_FAILED )
      {
-        // special case of empty input: can't allocate 0 size buffer below as
-        // wxCharBuffer insists on NUL-terminating it
-        wxCharBuffer buf(dstLen ? dstLen - 1 : 1);
+        const size_t nulLen = GetMBNulLen();
+
+        // as above, ensure that the buffer is always NUL-terminated, even if
+        // the input is not
+        wxCharBuffer buf(dstLen + nulLen - 1);
+        memset(buf.data() + dstLen, 0, nulLen);
          if ( FromWChar(buf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
          {
              if ( outLen )
              {
                  *outLen = dstLen;
  
          if ( FromWChar(buf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
          {
              if ( outLen )
              {
                  *outLen = dstLen;
  
-                const size_t nulLen = GetMBNulLen();
                  if ( dstLen >= nulLen &&
                          !NotAllNULs(buf.data() + dstLen - nulLen, nulLen) )
                  {
                  if ( dstLen >= nulLen &&
                          !NotAllNULs(buf.data() + dstLen - nulLen, nulLen) )
                  {
@@ -703,7 +709,7 @@ size_t wxMBConvUTF7::WC2MB(char *buf, const wchar_t *psz, size_t n) const
  // UTF-8
  // ----------------------------------------------------------------------------
  
  // UTF-8
  // ----------------------------------------------------------------------------
  
-static wxUint32 utf8_max[]=
+static const wxUint32 utf8_max[]=
      { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
  
  // boundaries of the private use area we use to (temporarily) remap invalid
      { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
  
  // boundaries of the private use area we use to (temporarily) remap invalid
@@ -712,7 +718,7 @@ const wxUint32 wxUnicodePUA = 0x100000;
  const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
  
  // this table gives the length of the UTF-8 encoding from its first character:
  const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
  
  // this table gives the length of the UTF-8 encoding from its first character:
-unsigned char tableUtf8Lengths[256] = {
+const unsigned char tableUtf8Lengths[256] = {
      // single-byte sequences (ASCII):
      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 00..0F
      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 10..1F
      // single-byte sequences (ASCII):
      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 00..0F
      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 10..1F
@@ -778,58 +784,73 @@ wxMBConvStrictUTF8::ToWChar(wchar_t *dst, size_t dstLen,
              return written;
          }
  
              return written;
          }
  
-        unsigned char c = *p;
-        unsigned len = tableUtf8Lengths[c];
-        if ( !len )
+        if ( out && !dstLen-- )
              break;
  
              break;
  
-        if ( srcLen < len ) // the test works for wxNO_LEN too
-            break;
+        wxUint32 code;
+        unsigned char c = *p;
  
  
-        if ( srcLen != wxNO_LEN )
-            srcLen -= len;
+        if ( c < 0x80 )
+        {
+            if ( srcLen == 0 ) // the test works for wxNO_LEN too
+                break;
  
  
-        if ( out && !dstLen-- )
-            break;
+            if ( srcLen != wxNO_LEN )
+                srcLen--;
  
  
+            code = c;
+        }
+        else
+        {
+            unsigned len = tableUtf8Lengths[c];
+            if ( !len )
+                break;
  
  
-        //   Char. number range   |        UTF-8 octet sequence
-        //      (hexadecimal)     |              (binary)
-        //  ----------------------+---------------------------------------------
-        //  0000 0000 - 0000 007F | 0xxxxxxx
-        //  0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
-        //  0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
-        //  0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
-        //
-        //  Code point value is stored in bits marked with 'x', lowest-order bit
-        //  of the value on the right side in the diagram above.
-        //                                                       (from RFC 3629)
+            if ( srcLen < len ) // the test works for wxNO_LEN too
+                break;
  
  
-        // mask to extract lead byte's value ('x' bits above), by sequence length:
-        static const unsigned char leadValueMask[] = { 0x7F, 0x1F, 0x0F, 0x07 };
+            if ( srcLen != wxNO_LEN )
+                srcLen -= len;
  
  
-        // mask and value of lead byte's most significant bits, by length:
-        static const unsigned char leadMarkerMask[] = { 0x80, 0xE0, 0xF0, 0xF8 };
-        static const unsigned char leadMarkerVal[] = { 0x00, 0xC0, 0xE0, 0xF0 };
+            //   Char. number range   |        UTF-8 octet sequence
+            //      (hexadecimal)     |              (binary)
+            //  ----------------------+----------------------------------------
+            //  0000 0000 - 0000 007F | 0xxxxxxx
+            //  0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
+            //  0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
+            //  0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+            //
+            //  Code point value is stored in bits marked with 'x',
+            //  lowest-order bit of the value on the right side in the diagram
+            //  above.                                         (from RFC 3629)
  
  
-        len--; // it's more convenient to work with 0-based length here
+            // mask to extract lead byte's value ('x' bits above), by sequence
+            // length:
+            static const unsigned char leadValueMask[] = { 0x7F, 0x1F, 0x0F, 0x07 };
  
  
-        // extract the lead byte's value bits:
-        if ( (c & leadMarkerMask[len]) != leadMarkerVal[len] )
-            break;
+            // mask and value of lead byte's most significant bits, by length:
+            static const unsigned char leadMarkerMask[] = { 0x80, 0xE0, 0xF0, 0xF8 };
+            static const unsigned char leadMarkerVal[] = { 0x00, 0xC0, 0xE0, 0xF0 };
  
  
-        wxUint32 code = c & leadValueMask[len];
+            len--; // it's more convenient to work with 0-based length here
  
  
-        // all remaining bytes, if any, are handled in the same way regardless of
-        // sequence's length:
-        for ( ; len; --len )
-        {
-            c = *++p;
-            if ( (c & 0xC0) != 0x80 )
-                return wxCONV_FAILED;
+            // extract the lead byte's value bits:
+            if ( (c & leadMarkerMask[len]) != leadMarkerVal[len] )
+                break;
+
+            code = c & leadValueMask[len];
+
+            // all remaining bytes, if any, are handled in the same way
+            // regardless of sequence's length:
+            for ( ; len; --len )
+            {
+                c = *++p;
+                if ( (c & 0xC0) != 0x80 )
+                    return wxCONV_FAILED;
  
  
-            code <<= 6;
-            code |= c & 0x3F;
+                code <<= 6;
+                code |= c & 0x3F;
+            }
          }
  
  #ifdef WC_UTF16
          }
  
  #ifdef WC_UTF16
@@ -968,14 +989,15 @@ wxMBConvStrictUTF8::FromWChar(char *dst, size_t dstLen,
      return wxCONV_FAILED;
  }
  
      return wxCONV_FAILED;
  }
  
-size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const
+size_t wxMBConvUTF8::ToWChar(wchar_t *buf, size_t n,
+                             const char *psz, size_t srcLen) const
  {
      if ( m_options == MAP_INVALID_UTF8_NOT )
  {
      if ( m_options == MAP_INVALID_UTF8_NOT )
-        return wxMBConvStrictUTF8::MB2WC(buf, psz, n);
+        return wxMBConvStrictUTF8::ToWChar(buf, n, psz, srcLen);
  
      size_t len = 0;
  
  
      size_t len = 0;
  
-    while (*psz && ((!buf) || (len < n)))
+    while ((srcLen == wxNO_LEN ? *psz : srcLen--) && ((!buf) || (len < n)))
      {
          const char *opsz = psz;
          bool invalid = false;
      {
          const char *opsz = psz;
          bool invalid = false;
@@ -1109,10 +1131,10 @@ size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const
          }
      }
  
          }
      }
  
-    if (buf && (len < n))
+    if (srcLen == wxNO_LEN && buf && (len < n))
          *buf = 0;
  
          *buf = 0;
  
-    return len;
+    return len + 1;
  }
  
  static inline bool isoctal(wchar_t wch)
  }
  
  static inline bool isoctal(wchar_t wch)
@@ -1120,14 +1142,15 @@ static inline bool isoctal(wchar_t wch)
      return L'0' <= wch && wch <= L'7';
  }
  
      return L'0' <= wch && wch <= L'7';
  }
  
-size_t wxMBConvUTF8::WC2MB(char *buf, const wchar_t *psz, size_t n) const
+size_t wxMBConvUTF8::FromWChar(char *buf, size_t n,
+                               const wchar_t *psz, size_t srcLen) const
  {
      if ( m_options == MAP_INVALID_UTF8_NOT )
  {
      if ( m_options == MAP_INVALID_UTF8_NOT )
-        return wxMBConvStrictUTF8::WC2MB(buf, psz, n);
+        return wxMBConvStrictUTF8::FromWChar(buf, n, psz, srcLen);
  
      size_t len = 0;
  
  
      size_t len = 0;
  
-    while (*psz && ((!buf) || (len < n)))
+    while ((srcLen == wxNO_LEN ? *psz : srcLen--) && ((!buf) || (len < n)))
      {
          wxUint32 cc;
  
      {
          wxUint32 cc;
  
@@ -1195,10 +1218,10 @@ size_t wxMBConvUTF8::WC2MB(char *buf, const wchar_t *psz, size_t n) const
          }
      }
  
          }
      }
  
-    if (buf && (len < n))
+    if (srcLen == wxNO_LEN && buf && (len < n))
          *buf = 0;
  
          *buf = 0;
  
-    return len;
+    return len + 1;
  }
  
  // ============================================================================
  }
  
  // ============================================================================
@@ -2411,26 +2434,38 @@ public:
              return wxCONV_FAILED;
          }
  
              return wxCONV_FAILED;
          }
  
-        // if we were really converting, check if we succeeded
-        if ( buf )
+        // we did something, check if we really succeeded
+        if ( flags )
          {
          {
-            if ( flags )
+            // check if the conversion failed, i.e. if any replacements
+            // were done
+            if ( usedDef )
+                return wxCONV_FAILED;
+        }
+        else // we must resort to double tripping...
+        {
+            // first we need to ensure that we really have the MB data: this is
+            // not the case if we're called with NULL buffer, in which case we
+            // need to do the conversion yet again
+            wxCharBuffer bufDef;
+            if ( !buf )
              {
              {
-                // check if the conversion failed, i.e. if any replacements
-                // were done
-                if ( usedDef )
+                bufDef = wxCharBuffer(len);
+                buf = bufDef.data();
+                if ( !::WideCharToMultiByte(m_CodePage, flags, pwz, -1,
+                                            buf, len, NULL, NULL) )
                      return wxCONV_FAILED;
              }
                      return wxCONV_FAILED;
              }
-            else // we must resort to double tripping...
+
+            if ( !n )
+                n = wcslen(pwz);
+            wxWCharBuffer wcBuf(n);
+            if ( MB2WC(wcBuf.data(), buf, n + 1) == wxCONV_FAILED ||
+                    wcscmp(wcBuf, pwz) != 0 )
              {
              {
-                wxWCharBuffer wcBuf(n);
-                if ( MB2WC(wcBuf.data(), buf, n) == wxCONV_FAILED ||
-                        wcscmp(wcBuf, pwz) != 0 )
-                {
-                    // we didn't obtain the same thing we started from, hence
-                    // the conversion was lossy and we consider that it failed
-                    return wxCONV_FAILED;
-                }
+                // we didn't obtain the same thing we started from, hence
+                // the conversion was lossy and we consider that it failed
+                return wxCONV_FAILED;
              }
          }
  
              }
          }
  
@@ -3163,8 +3198,14 @@ wxCharBuffer wxSafeConvertWX2MB(const wchar_t *ws)
      WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConvLibc, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
  #endif
  
      WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConvLibc, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
  #endif
  
-WX_DEFINE_GLOBAL_CONV(wxMBConvStrictUTF8, wxConvUTF8, wxEMPTY_PARAMETER_VALUE);
-WX_DEFINE_GLOBAL_CONV(wxMBConvUTF7, wxConvUTF7, wxEMPTY_PARAMETER_VALUE);
+// NB: we can't use wxEMPTY_PARAMETER_VALUE as final argument here because it's
+//     passed to WX_DEFINE_GLOBAL_CONV2 after a macro expansion and so still
+//     provokes an error message about "not enough macro parameters"; and we
+//     can't use "()" here as the name##Obj declaration would be parsed as a
+//     function declaration then, so use a semicolon and live with an extra
+//     empty statement (and hope that no compiler warns about this)
+WX_DEFINE_GLOBAL_CONV(wxMBConvStrictUTF8, wxConvUTF8, ;);
+WX_DEFINE_GLOBAL_CONV(wxMBConvUTF7, wxConvUTF7, ;);
  
  WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvLocal, (wxFONTENCODING_SYSTEM));
  WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvISO8859_1, (wxFONTENCODING_ISO8859_1));
  
  WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvLocal, (wxFONTENCODING_SYSTEM));
  WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvISO8859_1, (wxFONTENCODING_ISO8859_1));