src/common/stringops.cpp

   1 /////////////////////////////////////////////////////////////////////////////
   2 // Name:        src/common/stringops.cpp
   3 // Purpose:     implementation of wxString primitive operations
   4 // Author:      Vaclav Slavik
   5 // Modified by:
   6 // Created:     2007-04-16
   7 // RCS-ID:      $Id$
   8 // Copyright:   (c) 2007 REA Elektronik GmbH
   9 // Licence:     wxWindows licence
  10 /////////////////////////////////////////////////////////////////////////////
  11
  12 // ===========================================================================
  13 // headers
  14 // ===========================================================================
  15
  16 // For compilers that support precompilation, includes "wx.h".
  17 #include "wx/wxprec.h"
  18
  19 #ifdef __BORLANDC__
  20     #pragma hdrstop
  21 #endif
  22
  23 #ifndef WX_PRECOMP
  24     #include "wx/stringops.h"
  25 #endif
  26
  27 // ===========================================================================
  28 // implementation
  29 // ===========================================================================
  30
  31 #if wxUSE_UNICODE_UTF8
  32
  33 // ---------------------------------------------------------------------------
  34 // UTF-8 sequences lengths
  35 // ---------------------------------------------------------------------------
  36
  37 unsigned char wxStringOperationsUtf8::ms_utf8IterTable[256] = {
  38     // single-byte sequences (ASCII):
  39     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 00..0F
  40     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 10..1F
  41     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 20..2F
  42     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 30..3F
  43     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 40..4F
  44     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 50..5F
  45     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 60..6F
  46     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 70..7F
  47
  48     // these are invalid, we use step 1 to skip
  49     // over them (should never happen):
  50     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 80..8F
  51     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 90..9F
  52     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // A0..AF
  53     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // B0..BF
  54     1, 1,                                            // C0,C1
  55
  56     // two-byte sequences:
  57           2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // C2..CF
  58     2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // D0..DF
  59
  60     // three-byte sequences:
  61     3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,  // E0..EF
  62
  63     // four-byte sequences:
  64     4, 4, 4, 4, 4,                                   // F0..F4
  65
  66     // these are invalid again (5- or 6-byte
  67     // sequences and sequences for code points
  68     // above U+10FFFF, as restricted by RFC 3629):
  69                    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1   // F5..FF
  70 };
  71
  72 // ---------------------------------------------------------------------------
  73 // UTF-8 operations
  74 // ---------------------------------------------------------------------------
  75
  76 //
  77 // Table 3.1B from Unicode spec: Legal UTF-8 Byte Sequences
  78 //
  79 //     Code Points    | 1st Byte | 2nd Byte | 3rd Byte | 4th Byte |
  80 // -------------------+----------+----------+----------+----------+
  81 //   U+0000..U+007F   |  00..7F  |          |          |          |
  82 //   U+0080..U+07FF   |  C2..DF  |  80..BF  |          |          |
  83 //   U+0800..U+0FFF   |  E0      |  A0..BF  |  80..BF  |          |
  84 //   U+1000..U+FFFF   |  E1..EF  |  80..BF  |  80..BF  |          |
  85 //  U+10000..U+3FFFF  |  F0      |  90..BF  |  80..BF  |  80..BF  |
  86 //  U+40000..U+FFFFF  |  F1..F3  |  80..BF  |  80..BF  |  80..BF  |
  87 // U+100000..U+10FFFF |  F4      |  80..8F  |  80..BF  |  80..BF  |
  88 // -------------------+----------+----------+----------+----------+
  89
  90 bool wxStringOperationsUtf8::IsValidUtf8String(const char *str)
  91 {
  92     if ( !str )
  93         return true; // empty string is UTF8 string
  94
  95     const unsigned char *c = (const unsigned char*)str;
  96
  97     for ( ; *c; ++c )
  98     {
  99         unsigned char b = *c;
 100
 101         if ( b <= 0x7F ) // 00..7F
 102             continue;
 103
 104         else if ( b < 0xC2 ) // invalid lead bytes: 80..C1
 105             return false;
 106
 107         // two-byte sequences:
 108         else if ( b <= 0xDF ) // C2..DF
 109         {
 110             b = *(++c);
 111             if ( !(b >= 0x80 && b <= 0xBF ) )
 112                 return false;
 113         }
 114
 115         // three-byte sequences:
 116         else if ( b == 0xE0 )
 117         {
 118             b = *(++c);
 119             if ( !(b >= 0xA0 && b <= 0xBF ) )
 120                 return false;
 121             b = *(++c);
 122             if ( !(b >= 0x80 && b <= 0xBF ) )
 123                 return false;
 124         }
 125         else if ( b <= 0xEF ) // E1..EF
 126         {
 127             for ( int i = 0; i < 2; ++i )
 128             {
 129                 b = *(++c);
 130                 if ( !(b >= 0x80 && b <= 0xBF ) )
 131                     return false;
 132             }
 133         }
 134
 135         // four-byte sequences:
 136         else if ( b == 0xF0 )
 137         {
 138             b = *(++c);
 139             if ( !(b >= 0x90 && b <= 0xBF ) )
 140                 return false;
 141             for ( int i = 0; i < 2; ++i )
 142             {
 143                 b = *(++c);
 144                 if ( !(b >= 0x80 && b <= 0xBF ) )
 145                     return false;
 146             }
 147         }
 148         else if ( b <= 0xF3 ) // F1..F3
 149         {
 150             for ( int i = 0; i < 3; ++i )
 151             {
 152                 b = *(++c);
 153                 if ( !(b >= 0x80 && b <= 0xBF ) )
 154                     return false;
 155             }
 156         }
 157         else if ( b == 0xF4 )
 158         {
 159             b = *(++c);
 160             if ( !(b >= 0x80 && b <= 0x8F ) )
 161                 return false;
 162             for ( int i = 0; i < 2; ++i )
 163             {
 164                 b = *(++c);
 165                 if ( !(b >= 0x80 && b <= 0xBF ) )
 166                     return false;
 167             }
 168         }
 169         else // otherwise, it's invalid lead byte
 170             return false;
 171     }
 172
 173     return true;
 174 }
 175
 176 #ifdef __WXDEBUG__
 177 bool wxStringOperationsUtf8::IsValidUtf8LeadByte(unsigned char c)
 178 {
 179     return (c <= 0x7F) || (c >= 0xC2 && c <= 0xF4);
 180 }
 181 #endif
 182
 183
 184 wxStringOperationsUtf8::Utf8CharBuffer
 185 wxStringOperationsUtf8::EncodeChar(const wxUniChar& ch)
 186 {
 187     Utf8CharBuffer buf;
 188     char *out = buf.data;
 189
 190     wxUniChar::value_type code = ch.GetValue();
 191
 192     //    Char. number range   |        UTF-8 octet sequence
 193     //       (hexadecimal)     |              (binary)
 194     //   ----------------------+---------------------------------------------
 195     //   0000 0000 - 0000 007F | 0xxxxxxx
 196     //   0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
 197     //   0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
 198     //   0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 199     //
 200     //   Code point value is stored in bits marked with 'x', lowest-order bit
 201     //   of the value on the right side in the diagram above.
 202     //                                                        (from RFC 3629)
 203
 204     if ( code <= 0x7F )
 205     {
 206         out[1] = 0;
 207         out[0] = (char)code;
 208     }
 209     else if ( code <= 0x07FF )
 210     {
 211         out[2] = 0;
 212         // NB: this line takes 6 least significant bits, encodes them as
 213         // 10xxxxxx and discards them so that the next byte can be encoded:
 214         out[1] = 0x80 | (code & 0x3F);  code >>= 6;
 215         out[0] = 0xC0 | code;
 216     }
 217     else if ( code < 0xFFFF )
 218     {
 219         out[3] = 0;
 220         out[2] = 0x80 | (code & 0x3F);  code >>= 6;
 221         out[1] = 0x80 | (code & 0x3F);  code >>= 6;
 222         out[0] = 0xE0 | code;
 223     }
 224     else if ( code <= 0x10FFFF )
 225     {
 226         out[4] = 0;
 227         out[3] = 0x80 | (code & 0x3F);  code >>= 6;
 228         out[2] = 0x80 | (code & 0x3F);  code >>= 6;
 229         out[1] = 0x80 | (code & 0x3F);  code >>= 6;
 230         out[0] = 0xF0 | code;
 231     }
 232     else
 233     {
 234         wxFAIL_MSG( _T("trying to encode undefined Unicode character") );
 235         out[0] = 0;
 236     }
 237
 238     return buf;
 239 }
 240
 241 wxUniChar
 242 wxStringOperationsUtf8::DecodeChar(wxStringImpl::const_iterator i)
 243 {
 244     wxASSERT( IsValidUtf8LeadByte(*i) );
 245
 246     wxUniChar::value_type code = 0;
 247     size_t len = GetUtf8CharLength(*i);
 248     wxASSERT_MSG( len <= 4, _T("invalid UTF-8 sequence length") );
 249
 250     //    Char. number range   |        UTF-8 octet sequence
 251     //       (hexadecimal)     |              (binary)
 252     //   ----------------------+---------------------------------------------
 253     //   0000 0000 - 0000 007F | 0xxxxxxx
 254     //   0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
 255     //   0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
 256     //   0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 257     //
 258     //   Code point value is stored in bits marked with 'x', lowest-order bit
 259     //   of the value on the right side in the diagram above.
 260     //                                                        (from RFC 3629)
 261
 262     // mask to extract lead byte's value ('x' bits above), by sequence's length:
 263     static const unsigned char s_leadValueMask[4] =  { 0x7F, 0x1F, 0x0F, 0x07 };
 264 #ifdef __WXDEBUG__
 265     // mask and value of lead byte's most significant bits, by length:
 266     static const unsigned char s_leadMarkerMask[4] = { 0x80, 0xE0, 0xF0, 0xF8 };
 267     static const unsigned char s_leadMarkerVal[4] =  { 0x00, 0xC0, 0xE0, 0xF0 };
 268 #endif
 269
 270     // extract the lead byte's value bits:
 271     wxASSERT_MSG( ((unsigned char)*i & s_leadMarkerMask[len-1]) ==
 272                   s_leadMarkerVal[len-1],
 273                   _T("invalid UTF-8 lead byte") );
 274     code = (unsigned char)*i & s_leadValueMask[len-1];
 275
 276     // all remaining bytes, if any, are handled in the same way regardless of
 277     // sequence's length:
 278     for ( ++i ; len > 1; --len, ++i )
 279     {
 280         wxASSERT_MSG( ((unsigned char)*i & 0xC0) == 0x80,
 281                       _T("invalid UTF-8 byte") );
 282
 283         code <<= 6;
 284         code |= (unsigned char)*i & 0x3F;
 285     }
 286
 287     return wxUniChar(code);
 288 }
 289
 290 wxCharBuffer wxStringOperationsUtf8::EncodeNChars(size_t n, const wxUniChar& ch)
 291 {
 292     Utf8CharBuffer once(EncodeChar(ch));
 293     // the IncIter() table can be used to determine the length of ch's encoding:
 294     size_t len = ms_utf8IterTable[(unsigned char)once.data[0]];
 295
 296     wxCharBuffer buf(n * len);
 297     char *ptr = buf.data();
 298     for ( size_t i = 0; i < n; i++, ptr += len )
 299     {
 300         memcpy(ptr, once.data, len);
 301     }
 302
 303     return buf;
 304 }
 305
 306 #endif // wxUSE_UNICODE_UTF8