X-Git-Url: https://git.saurik.com/wxWidgets.git/blobdiff_plain/6ac84a787253ecedb262c739ec04e753e11c3697..48271822ef3d56c8f91af882b68fd1b674a8e8e6:/src/common/strconv.cpp

diff --git a/src/common/strconv.cpp b/src/common/strconv.cpp
index 59ea721b71..f2364b7fc3 100644
--- a/src/common/strconv.cpp
+++ b/src/common/strconv.cpp
@@ -354,14 +354,14 @@ const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
     if ( psz )
     {
         // calculate the length of the buffer needed first
-        const size_t nLen = MB2WC(NULL, psz, 0);
+        const size_t nLen = ToWChar(NULL, 0, psz);
         if ( nLen != wxCONV_FAILED )
         {
             // now do the actual conversion
-            wxWCharBuffer buf(nLen /* +1 added implicitly */);
+            wxWCharBuffer buf(nLen - 1 /* +1 added implicitly */);
 
             // +1 for the trailing NULL
-            if ( MB2WC(buf.data(), psz, nLen + 1) != wxCONV_FAILED )
+            if ( ToWChar(buf.data(), nLen, psz) != wxCONV_FAILED )
                 return buf;
         }
     }
@@ -373,14 +373,11 @@ const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
 {
     if ( pwz )
     {
-        const size_t nLen = WC2MB(NULL, pwz, 0);
+        const size_t nLen = FromWChar(NULL, 0, pwz);
         if ( nLen != wxCONV_FAILED )
         {
-            // extra space for trailing NUL(s)
-            static const size_t extraLen = GetMaxMBNulLen();
-
-            wxCharBuffer buf(nLen + extraLen - 1);
-            if ( WC2MB(buf.data(), pwz, nLen + extraLen) != wxCONV_FAILED )
+            wxCharBuffer buf(nLen - 1);
+            if ( FromWChar(buf.data(), nLen, pwz) != wxCONV_FAILED )
                 return buf;
         }
     }
@@ -714,8 +711,268 @@ static wxUint32 utf8_max[]=
 const wxUint32 wxUnicodePUA = 0x100000;
 const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
 
+// UTF-8 sequence length indexed by lead byte (0 = byte can't start a sequence):
+unsigned char tableUtf8Lengths[256] = {
+    // single-byte sequences (ASCII):
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 00..0F
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 10..1F
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 20..2F
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 30..3F
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 40..4F
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 50..5F
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 60..6F
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 70..7F
+
+    // not lead bytes: 80..BF are continuation bytes, C0/C1 would be overlong:
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 80..8F
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 90..9F
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // A0..AF
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // B0..BF
+    0, 0,                                            // C0,C1
+
+    // lead bytes of two-byte sequences (110xxxxx):
+          2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // C2..CF
+    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // D0..DF
+
+    // lead bytes of three-byte sequences (1110xxxx):
+    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,  // E0..EF
+
+    // lead bytes of four-byte sequences (11110xxx):
+    4, 4, 4, 4, 4,                                   // F0..F4
+
+    // these are invalid again (they would start 5- or
+    // 6-byte sequences or sequences for code points
+    // above U+10FFFF, which RFC 3629 doesn't allow):
+                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0   // F5..FF
+};
+
+// Decode the UTF-8 text in src into wide characters.
+// dst/dstLen: output buffer and its capacity in wchar_t units; if dstLen is 0
+//     nothing is written and only the required length is computed.
+// src/srcLen: input bytes; if srcLen is wxNO_LEN the input is NUL-terminated
+//     and its terminating NUL is decoded (and counted) like any other byte.
+// Returns the number of wchar_t units produced, or wxCONV_FAILED if the input
+// is malformed or the output doesn't fit into dst.
+size_t
+wxMBConvStrictUTF8::ToWChar(wchar_t *dst, size_t dstLen,
+                            const char *src, size_t srcLen) const
+{
+    wchar_t *out = dstLen ? dst : NULL;
+    size_t written = 0;
+
+    // from now on srcLen is always the exact number of bytes left to decode
+    if ( srcLen == wxNO_LEN )
+        srcLen = strlen(src) + 1;
+
+    for ( const char *p = src; ; p++ )
+    {
+        if ( !srcLen )
+            return written; // the whole input was decoded successfully
+
+        unsigned char c = *p;
+        unsigned len = tableUtf8Lengths[c];
+        if ( !len )
+            break; // this byte can't start a sequence
+
+        if ( srcLen < len )
+            break; // the sequence is truncated
+        srcLen -= len;
+
+        if ( out && !dstLen-- )
+            break; // the output buffer is full
+
+        //   Char. number range   |        UTF-8 octet sequence
+        //      (hexadecimal)     |              (binary)
+        //  ----------------------+---------------------------------------------
+        //  0000 0000 - 0000 007F | 0xxxxxxx
+        //  0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
+        //  0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
+        //  0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+        //
+        //  Code point value is stored in bits marked with 'x', lowest-order bit
+        //  of the value on the right side in the diagram above.
+        //                                                       (from RFC 3629)
+
+        // mask to extract lead byte's value ('x' bits above), by sequence length:
+        static const unsigned char leadValueMask[] = { 0x7F, 0x1F, 0x0F, 0x07 };
+
+        // mask and value of lead byte's most significant bits, by length:
+        static const unsigned char leadMarkerMask[] = { 0x80, 0xE0, 0xF0, 0xF8 };
+        static const unsigned char leadMarkerVal[] = { 0x00, 0xC0, 0xE0, 0xF0 };
+
+        len--; // it's more convenient to work with 0-based length here
+
+        // check the lead byte's marker bits before extracting its value bits:
+        if ( (c & leadMarkerMask[len]) != leadMarkerVal[len] )
+            break;
+
+        wxUint32 code = c & leadValueMask[len];
+
+        // all remaining bytes, if any, are handled in the same way regardless of
+        // sequence's length:
+        for ( ; len; --len )
+        {
+            c = *++p;
+            if ( (c & 0xC0) != 0x80 )
+                return wxCONV_FAILED;
+
+            code <<= 6;
+            code |= c & 0x3F;
+        }
+
+        // a lead byte of F4 can still start a sequence encoding a value above
+        // U+10FFFF (e.g. F4 90 80 80), which RFC 3629 forbids, so reject it
+        // NOTE(review): overlong forms (E0 80 80) and surrogates (ED A0 80) are
+        // still accepted although RFC 3629 forbids them too -- confirm intent
+        if ( code > 0x10FFFF )
+            break;
+
+#ifdef WC_UTF16
+        // a code point outside of the BMP is stored as a surrogate pair, so
+        // make sure there is room for its second unit too before writing it
+        if ( out && code > 0xFFFF && !dstLen-- )
+            break;
+        // cast is ok because wchar_t == wxUint16 if WC_UTF16
+        if ( encode_utf16(code, (wxUint16 *)out) == 2 )
+        {
+            if ( out )
+                out++;
+            written++;
+        }
+#else // !WC_UTF16
+        if ( out )
+            *out = code;
+#endif // WC_UTF16/!WC_UTF16
+
+        if ( out )
+            out++;
+
+        written++;
+    }
+
+    // we only get here if the input is invalid or dst is too small
+    return wxCONV_FAILED;
+}
+
+// Encode the wide character string src as UTF-8.
+//
+// dst/dstLen: output buffer and its capacity in bytes; if dstLen is 0 nothing
+//     is written and only the required length is computed.
+// src/srcLen: input and its length in wchar_t units; if srcLen is wxNO_LEN
+//     the input is NUL-terminated and a trailing NUL is appended to the output
+//     and included in the returned length.
+//
+// Returns the number of bytes produced, or wxCONV_FAILED if a character can't
+// be encoded or the output doesn't fit into dst.
+size_t
+wxMBConvStrictUTF8::FromWChar(char *dst, size_t dstLen,
+                              const wchar_t *src, size_t srcLen) const
+{
+    char *out = dstLen ? dst : NULL;
+    size_t written = 0;
+
+    // remember how the input is delimited: srcLen itself can't be tested for
+    // this inside the loop as decrementing an exhausted explicit length wraps
+    // it around to (size_t)-1, i.e. the value wxNO_LEN is expected to have
+    const bool isNulTerminated = srcLen == wxNO_LEN;
+
+    for ( const wchar_t *wp = src; ; wp++ )
+    {
+        if ( !(isNulTerminated ? *wp : srcLen--) )
+        {
+            // all done successfully, just add the trailing NUL if we are not
+            // using explicit length
+            if ( isNulTerminated )
+            {
+                if ( out )
+                {
+                    if ( !dstLen )
+                        break;
+
+                    *out = '\0';
+                }
+
+                written++;
+            }
+
+            return written;
+        }
+
+        wxUint32 code;
+#ifdef WC_UTF16
+        // cast is ok for WC_UTF16
+        if ( decode_utf16((const wxUint16 *)wp, code) == 2 )
+        {
+            // skip the next char too as we decoded a surrogate
+            //
+            // NOTE(review): an explicit srcLen is not decremented for this
+            // second unit -- confirm whether it should be
+            wp++;
+        }
+#else // wchar_t is UTF-32
+        code = *wp & 0x7fffffff;
+#endif
+
+        // the number of bytes needed depends on the code point's range:
+        //   Char. number range   |        UTF-8 octet sequence
+        //      (hexadecimal)     |              (binary)
+        //  ----------------------+---------------------------------------------
+        //  0000 0000 - 0000 007F | 0xxxxxxx
+        //  0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
+        //  0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
+        //  0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+        //                                                       (from RFC 3629)
+        unsigned len;
+        if ( code <= 0x7F )
+            len = 1;
+        else if ( code <= 0x07FF )
+            len = 2;
+        else if ( code <= 0xFFFF ) // inclusive: U+FFFF still fits in 3 bytes
+            len = 3;
+        else if ( code <= 0x10FFFF )
+            len = 4;
+        else
+        {
+            wxFAIL_MSG( _T("trying to encode undefined Unicode character") );
+            break;
+        }
+
+        if ( out )
+        {
+            if ( dstLen < len )
+                break;
+
+            // marker bits of the lead byte, by sequence length:
+            static const unsigned char leadMarker[] = { 0x00, 0xC0, 0xE0, 0xF0 };
+
+            // fill the continuation bytes from the last one backwards: each
+            // takes the 6 least significant bits, encodes them as 10xxxxxx and
+            // discards them so that the next byte can be encoded
+            for ( unsigned n = len - 1; n; --n )
+            {
+                out[n] = (char)(0x80 | (code & 0x3F));
+                code >>= 6;
+            }
+
+            // whatever bits remain go into the lead byte after its marker
+            out[0] = (char)(leadMarker[len - 1] | code);
+
+            out += len;
+            dstLen -= len;
+        }
+
+        written += len;
+    }
+
+    // we only get here if an error occurs during encoding
+    return wxCONV_FAILED;
+}
+
 size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 {
+    if ( m_options == MAP_INVALID_UTF8_NOT )
+        return wxMBConvStrictUTF8::MB2WC(buf, psz, n);
+
     size_t len = 0;
 
     while (*psz && ((!buf) || (len < n)))
@@ -785,7 +1042,7 @@ size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const
                 else
                 {
 #ifdef WC_UTF16
-                    // cast is ok because wchar_t == wxUuint16 if WC_UTF16
+                    // cast is ok because wchar_t == wxUint16 if WC_UTF16
                     size_t pa = encode_utf16(res, (wxUint16 *)buf);
                     if (pa == wxCONV_FAILED)
                     {
@@ -865,6 +1122,9 @@ static inline bool isoctal(wchar_t wch)
 
 size_t wxMBConvUTF8::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 {
+    if ( m_options == MAP_INVALID_UTF8_NOT )
+        return wxMBConvStrictUTF8::WC2MB(buf, psz, n);
+
     size_t len = 0;
 
     while (*psz && ((!buf) || (len < n)))
@@ -2476,7 +2736,7 @@ void wxCSConv::SetName(const char *charset)
 {
     if (charset)
     {
-        m_name = strdup(charset);
+        m_name = wxStrdup(charset);
         m_deferred = true;
     }
 }
@@ -2903,7 +3163,7 @@ wxCharBuffer wxSafeConvertWX2MB(const wchar_t *ws)
     WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConvLibc, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
 #endif
 
-WX_DEFINE_GLOBAL_CONV(wxMBConvUTF8, wxConvUTF8, wxEMPTY_PARAMETER_VALUE);
+WX_DEFINE_GLOBAL_CONV(wxMBConvStrictUTF8, wxConvUTF8, wxEMPTY_PARAMETER_VALUE);
 WX_DEFINE_GLOBAL_CONV(wxMBConvUTF7, wxConvUTF7, wxEMPTY_PARAMETER_VALUE);
 
 WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvLocal, (wxFONTENCODING_SYSTEM));