From 0286d08d1453506f9ff9a830d58b3b35817d0b14 Mon Sep 17 00:00:00 2001
From: Vadim Zeitlin <vadim@wxwidgets.org>
Date: Tue, 24 Jul 2007 15:01:10 +0000
Subject: [PATCH] add wxMBConvStrictUTF8 class implementing just UTF-8
 conversion, without support for PUA/octal mappings and use it for wxConvUTF8
 as it's simpler and more efficient (~20% faster)

git-svn-id: https://svn.wxwidgets.org/svn/wx/wxWidgets/trunk@47703 c3d73ce0-8a6f-49c7-b76d-6d57e0e08775
---
 include/wx/strconv.h   |  28 ++++-
 src/common/strconv.cpp | 267 ++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 289 insertions(+), 6 deletions(-)

diff --git a/include/wx/strconv.h b/include/wx/strconv.h
index 2cd26bd30d..1edf4f5082 100644
--- a/include/wx/strconv.h
+++ b/include/wx/strconv.h
@@ -257,11 +257,31 @@ public:
 // wxMBConvUTF8 (for conversion using UTF8 encoding)
 // ----------------------------------------------------------------------------
 
-class WXDLLIMPEXP_BASE wxMBConvUTF8 : public wxMBConv
+// this is the real UTF-8 conversion class, it has to be called "strict UTF-8"
+// for compatibility reasons: the wxMBConvUTF8 class below also supports lossy
+// conversions if it is created with non-default options
+class WXDLLIMPEXP_BASE wxMBConvStrictUTF8 : public wxMBConv
+{
+public:
+    // compiler-generated default ctor and other methods are ok
+
+    virtual size_t ToWChar(wchar_t *dst, size_t dstLen,
+                           const char *src, size_t srcLen = wxNO_LEN) const;
+    virtual size_t FromWChar(char *dst, size_t dstLen,
+                             const wchar_t *src, size_t srcLen = wxNO_LEN) const;
+
+    virtual wxMBConv *Clone() const { return new wxMBConvStrictUTF8(); }
+
+#if wxUSE_UNICODE_UTF8
+    // NB: unlike wxMBConvUTF8 with its lossy conversion options, this class
+    //     only ever does the real UTF-8 conversion, hence the constant result
+    virtual bool IsUTF8() const { return true; }
+#endif
+};
+
+class WXDLLIMPEXP_BASE wxMBConvUTF8 : public wxMBConvStrictUTF8
 {
 public:
-    // FIXME-UTF8: split this class into multiple classes, one strict and
-    //             other lossy (PUA, OCTAL mappings)
     enum
     {
         MAP_INVALID_UTF8_NOT = 0,
@@ -470,7 +490,7 @@ WX_DECLARE_GLOBAL_CONV(wxMBConv, wxConvLibc)
 WX_DECLARE_GLOBAL_CONV(wxCSConv, wxConvISO8859_1)
 #define wxConvISO8859_1 wxGet_wxConvISO8859_1()
 
-WX_DECLARE_GLOBAL_CONV(wxMBConvUTF8, wxConvUTF8)
+WX_DECLARE_GLOBAL_CONV(wxMBConvStrictUTF8, wxConvUTF8)
 #define wxConvUTF8 wxGet_wxConvUTF8()
 
 WX_DECLARE_GLOBAL_CONV(wxMBConvUTF7, wxConvUTF7)
diff --git a/src/common/strconv.cpp b/src/common/strconv.cpp
index 59ea721b71..4d672fedfb 100644
--- a/src/common/strconv.cpp
+++ b/src/common/strconv.cpp
@@ -714,8 +714,268 @@ static wxUint32 utf8_max[]=
 const wxUint32 wxUnicodePUA = 0x100000;
 const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
 
+// length of the UTF-8 sequence by its first byte, 0 if it can't start one:
+unsigned char tableUtf8Lengths[256] = {
+    // single-byte sequences (ASCII):
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 00..0F
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 10..1F
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 20..2F
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 30..3F
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 40..4F
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 50..5F
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 60..6F
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 70..7F
+
+    // these are invalid as first bytes (10xxxxxx trailing bytes, then C0, C1):
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 80..8F
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 90..9F
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // A0..AF
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // B0..BF
+    0, 0,                                            // C0,C1
+
+    // two-byte sequences:
+          2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // C2..CF
+    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // D0..DF
+
+    // three-byte sequences:
+    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,  // E0..EF
+
+    // four-byte sequences:
+    4, 4, 4, 4, 4,                                   // F0..F4
+
+    // these are invalid again (5- or 6-byte
+    // sequences and sequences for code points
+    // above U+10FFFF, as restricted by RFC 3629):
+                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0   // F5..FF
+};
+
+// Decode the UTF-8 bytes in src (srcLen of them or, with wxNO_LEN, up to and
+// including the terminating NUL) into wchar_t stored in dst. dst is ignored
+// if it is NULL or dstLen is 0 and only the needed size is computed. Returns
+// the number of wchar_t produced or wxCONV_FAILED if dst is too small or src
+// is invalid (bad lead or trailing byte, truncated or overlong sequence, code
+// point above U+10FFFF).
+size_t
+wxMBConvStrictUTF8::ToWChar(wchar_t *dst, size_t dstLen,
+                            const char *src, size_t srcLen) const
+{
+    wchar_t *out = dstLen ? dst : NULL;
+    size_t written = 0;
+
+    // the trailing NUL is included and decoded as any other character
+    if ( srcLen == wxNO_LEN )
+        srcLen = strlen(src) + 1;
+
+    for ( const char *p = src; srcLen; p++ )
+    {
+        unsigned char c = *p;
+        unsigned len = tableUtf8Lengths[c];
+        if ( !len )
+            return wxCONV_FAILED; // not a valid lead byte
+
+        if ( srcLen < len )
+            return wxCONV_FAILED; // sequence truncated by the end of input
+
+        srcLen -= len;
+
+        if ( out && !dstLen-- )
+            return wxCONV_FAILED; // output buffer too small
+
+        //   Char. number range   |        UTF-8 octet sequence
+        //      (hexadecimal)     |              (binary)
+        //  ----------------------+---------------------------------------------
+        //  0000 0000 - 0000 007F | 0xxxxxxx
+        //  0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
+        //  0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
+        //  0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+        //
+        //  Code point value is stored in bits marked with 'x', lowest-order bit
+        //  of the value on the right side in the diagram above.
+        //                                                       (from RFC 3629)
+
+        // mask to extract lead byte's value ('x' bits above), by sequence length:
+        static const unsigned char leadValueMask[] = { 0x7F, 0x1F, 0x0F, 0x07 };
+
+        // mask and value of lead byte's most significant bits, by length:
+        static const unsigned char leadMarkerMask[] = { 0x80, 0xE0, 0xF0, 0xF8 };
+        static const unsigned char leadMarkerVal[] = { 0x00, 0xC0, 0xE0, 0xF0 };
+
+        // first code point of each range above, by length: anything smaller
+        // is an overlong encoding which is not valid UTF-8
+        static const wxUint32 minCodeByLen[] = { 0x00, 0x80, 0x800, 0x10000 };
+
+        len--; // it's more convenient to work with 0-based length here
+
+        // check that the lead byte's marker bits agree with the length:
+        if ( (c & leadMarkerMask[len]) != leadMarkerVal[len] )
+            return wxCONV_FAILED;
+
+        // extract the lead byte's value bits:
+        wxUint32 code = c & leadValueMask[len];
+        const wxUint32 minCode = minCodeByLen[len];
+
+        // all remaining bytes, if any, are handled in the same way regardless of
+        // sequence's length:
+        for ( ; len; --len )
+        {
+            c = *++p;
+            if ( (c & 0xC0) != 0x80 )
+                return wxCONV_FAILED;
+
+            code <<= 6;
+            code |= c & 0x3F;
+        }
+
+        // reject overlong forms and values outside of the Unicode range
+        if ( code < minCode || code > 0x10FFFF )
+            return wxCONV_FAILED;
+
+#ifdef WC_UTF16
+        // code points above U+FFFF become a surrogate pair whose second
+        // wchar_t must fit into the output buffer too
+        if ( code > 0xFFFF && out && !dstLen-- )
+            return wxCONV_FAILED;
+
+        // cast is ok because wchar_t == wxUint16 if WC_UTF16
+        if ( encode_utf16(code, (wxUint16 *)out) == 2 )
+        {
+            if ( out )
+                out++;
+            written++;
+        }
+#else // !WC_UTF16
+        if ( out )
+            *out = code;
+#endif // WC_UTF16/!WC_UTF16
+
+        if ( out )
+            out++;
+
+        written++;
+    }
+
+    return written;
+}
+
+// Encode the wchar_t in src (srcLen of them or, with wxNO_LEN, up to and
+// including the terminating NUL) as UTF-8 bytes stored in dst. dst is ignored
+// if it is NULL or dstLen is 0 and only the needed size is computed.
+//
+// src is interpreted as UTF-16 if WC_UTF16 is defined and as UTF-32, with the
+// most significant bit masked off, otherwise.
+//
+// Returns the number of bytes produced or wxCONV_FAILED if dst is too small or
+// src contains a value which can't be encoded.
+size_t
+wxMBConvStrictUTF8::FromWChar(char *dst, size_t dstLen,
+                              const wchar_t *src, size_t srcLen) const
+{
+    char *out = dstLen ? dst : NULL;
+    size_t written = 0;
+
+    for ( const wchar_t *wp = src; ; wp++ )
+    {
+        // NB: srcLen must not be decremented inside this test: doing it when
+        //     it is already 0 wraps it around and, presuming wxNO_LEN is the
+        //     maximal size_t value, makes the check below append a trailing
+        //     NUL to the input given with an explicit length as well
+        if ( srcLen == wxNO_LEN ? !*wp : !srcLen )
+        {
+            // all done successfully, just add the trailing NUL if we are not
+            // using explicit length
+            if ( srcLen == wxNO_LEN )
+            {
+                if ( out )
+                {
+                    if ( !dstLen )
+                        break;
+
+                    *out = '\0';
+                }
+
+                written++;
+            }
+
+            return written;
+        }
+
+        // account for the wchar_t consumed by this iteration
+        if ( srcLen != wxNO_LEN )
+            srcLen--;
+
+        wxUint32 code;
+#ifdef WC_UTF16
+        // cast is ok for WC_UTF16
+        if ( decode_utf16((const wxUint16 *)wp, code) == 2 )
+        {
+            // the second half of the surrogate pair counts towards the
+            // explicit length too, the input is truncated if it doesn't fit
+            if ( srcLen != wxNO_LEN )
+            {
+                if ( !srcLen )
+                    break;
+
+                srcLen--;
+            }
+
+            // skip the next char too as we decoded a surrogate
+            wp++;
+        }
+#else // wchar_t is UTF-32
+        code = *wp & 0x7fffffff;
+#endif
+
+        // number of bytes needed for this code point, see RFC 3629:
+        unsigned len;
+        if ( code <= 0x7F )
+            len = 1;
+        else if ( code <= 0x07FF )
+            len = 2;
+        else if ( code <= 0xFFFF ) // inclusive: U+FFFF still fits in 3 bytes
+            len = 3;
+        else if ( code <= 0x10FFFF )
+            len = 4;
+        else
+        {
+            wxFAIL_MSG( _T("trying to encode undefined Unicode character") );
+            break;
+        }
+
+        if ( out )
+        {
+            if ( dstLen < len )
+                break;
+
+            // value of the lead byte's most significant bits, by length:
+            static const unsigned char leadMarkerVal[] = { 0x00, 0xC0, 0xE0, 0xF0 };
+
+            // NB: each trailing byte takes 6 least significant bits, encodes
+            // them as 10xxxxxx and discards them so that the previous byte
+            // can be encoded, what is left in the end goes into the lead byte:
+            for ( unsigned n = len - 1; n; --n )
+            {
+                out[n] = (char)(0x80 | (code & 0x3F));
+                code >>= 6;
+            }
+
+            out[0] = (char)(leadMarkerVal[len - 1] | code);
+
+            out += len;
+            dstLen -= len;
+        }
+
+        written += len;
+    }
+
+    // we only get here if an error occurs during encoding
+    return wxCONV_FAILED;
+}
+
 size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 {
+    if ( m_options == MAP_INVALID_UTF8_NOT )
+        return wxMBConvStrictUTF8::MB2WC(buf, psz, n);
+
     size_t len = 0;
 
     while (*psz && ((!buf) || (len < n)))
@@ -785,7 +1045,7 @@ size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const
                 else
                 {
 #ifdef WC_UTF16
-                    // cast is ok because wchar_t == wxUuint16 if WC_UTF16
+                    // cast is ok because wchar_t == wxUint16 if WC_UTF16
                     size_t pa = encode_utf16(res, (wxUint16 *)buf);
                     if (pa == wxCONV_FAILED)
                     {
@@ -865,6 +1125,9 @@ static inline bool isoctal(wchar_t wch)
 
 size_t wxMBConvUTF8::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 {
+    if ( m_options == MAP_INVALID_UTF8_NOT )
+        return wxMBConvStrictUTF8::WC2MB(buf, psz, n);
+
     size_t len = 0;
 
     while (*psz && ((!buf) || (len < n)))
@@ -2903,7 +3166,7 @@ wxCharBuffer wxSafeConvertWX2MB(const wchar_t *ws)
     WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConvLibc, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
 #endif
 
-WX_DEFINE_GLOBAL_CONV(wxMBConvUTF8, wxConvUTF8, wxEMPTY_PARAMETER_VALUE);
+WX_DEFINE_GLOBAL_CONV(wxMBConvStrictUTF8, wxConvUTF8, wxEMPTY_PARAMETER_VALUE);
 WX_DEFINE_GLOBAL_CONV(wxMBConvUTF7, wxConvUTF7, wxEMPTY_PARAMETER_VALUE);
 
 WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvLocal, (wxFONTENCODING_SYSTEM));
-- 
2.50.0