src/common/stringops.cpp

   1 /////////////////////////////////////////////////////////////////////////////
   2 // Name:        src/common/stringops.cpp
   3 // Purpose:     implementation of wxString primitive operations
   4 // Author:      Vaclav Slavik
   5 // Modified by:
   6 // Created:     2007-04-16
   7 // RCS-ID:      $Id$
   8 // Copyright:   (c) 2007 REA Elektronik GmbH
   9 // Licence:     wxWindows licence
  10 /////////////////////////////////////////////////////////////////////////////
  11
  12 // ===========================================================================
  13 // headers
  14 // ===========================================================================
  15
  16 // For compilers that support precompilation, includes "wx.h".
  17 #include "wx/wxprec.h"
  18
  19 #ifdef __BORLANDC__
  20     #pragma hdrstop
  21 #endif
  22
  23 #ifndef WX_PRECOMP
  24     #include "wx/stringops.h"
  25 #endif
  26
  27 // ===========================================================================
  28 // implementation
  29 // ===========================================================================
  30
  31 #if wxUSE_UNICODE_UTF8
  32
  33 // ---------------------------------------------------------------------------
  34 // UTF-8 sequences lengths
  35 // ---------------------------------------------------------------------------
  36
  37 const unsigned char wxStringOperationsUtf8::ms_utf8IterTable[256] = {
  38     // single-byte sequences (ASCII):
  39     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 00..0F
  40     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 10..1F
  41     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 20..2F
  42     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 30..3F
  43     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 40..4F
  44     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 50..5F
  45     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 60..6F
  46     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 70..7F
  47
  48     // these are invalid, we use step 1 to skip
  49     // over them (should never happen):
  50     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 80..8F
  51     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 90..9F
  52     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // A0..AF
  53     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // B0..BF
  54     1, 1,                                            // C0,C1
  55
  56     // two-byte sequences:
  57           2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // C2..CF
  58     2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // D0..DF
  59
  60     // three-byte sequences:
  61     3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,  // E0..EF
  62
  63     // four-byte sequences:
  64     4, 4, 4, 4, 4,                                   // F0..F4
  65
  66     // these are invalid again (5- or 6-byte
  67     // sequences and sequences for code points
  68     // above U+10FFFF, as restricted by RFC 3629):
  69                    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1   // F5..FF
  70 };
  71
  72 // ---------------------------------------------------------------------------
  73 // UTF-8 operations
  74 // ---------------------------------------------------------------------------
  75
  76 //
  77 // Table 3.1B from Unicode spec: Legal UTF-8 Byte Sequences
  78 //
  79 //     Code Points    | 1st Byte | 2nd Byte | 3rd Byte | 4th Byte |
  80 // -------------------+----------+----------+----------+----------+
  81 //   U+0000..U+007F   |  00..7F  |          |          |          |
  82 //   U+0080..U+07FF   |  C2..DF  |  80..BF  |          |          |
  83 //   U+0800..U+0FFF   |  E0      |  A0..BF  |  80..BF  |          |
   //   U+1000..U+CFFF   |  E1..EC  |  80..BF  |  80..BF  |          |
   //   U+D000..U+D7FF   |  ED      |  80..9F  |  80..BF  |          |
   //   U+E000..U+FFFF   |  EE..EF  |  80..BF  |  80..BF  |          |
  85 //  U+10000..U+3FFFF  |  F0      |  90..BF  |  80..BF  |  80..BF  |
  86 //  U+40000..U+FFFFF  |  F1..F3  |  80..BF  |  80..BF  |  80..BF  |
  87 // U+100000..U+10FFFF |  F4      |  80..8F  |  80..BF  |  80..BF  |
  88 // -------------------+----------+----------+----------+----------+
  89
  90 bool wxStringOperationsUtf8::IsValidUtf8String(const char *str, size_t len)
  91 {
  92     if ( !str )
  93         return true; // empty string is UTF8 string
  94
  95     const unsigned char *c = (const unsigned char*)str;
  96     const unsigned char * const end = (len == wxStringImpl::npos) ? NULL : c + len;
  97
  98     for ( ; c != end && *c; ++c )
  99     {
 100         unsigned char b = *c;
 101
 102         if ( end != NULL )
 103         {
 104             // if the string is not NULL-terminated, verify we have enough
 105             // bytes in it left for current character's encoding:
 106             if ( c + ms_utf8IterTable[*c] > end )
 107                 return false;
 108         }
 109
 110         if ( b <= 0x7F ) // 00..7F
 111             continue;
 112
 113         else if ( b < 0xC2 ) // invalid lead bytes: 80..C1
 114             return false;
 115
 116         // two-byte sequences:
 117         else if ( b <= 0xDF ) // C2..DF
 118         {
 119             b = *(++c);
 120             if ( !(b >= 0x80 && b <= 0xBF ) )
 121                 return false;
 122         }
 123
 124         // three-byte sequences:
 125         else if ( b == 0xE0 )
 126         {
 127             b = *(++c);
 128             if ( !(b >= 0xA0 && b <= 0xBF ) )
 129                 return false;
 130             b = *(++c);
 131             if ( !(b >= 0x80 && b <= 0xBF ) )
 132                 return false;
 133         }
 134         else if ( b == 0xED )
 135         {
 136             b = *(++c);
 137             if ( !(b >= 0x80 && b <= 0x9F ) )
 138                 return false;
 139             b = *(++c);
 140             if ( !(b >= 0x80 && b <= 0xBF ) )
 141                 return false;
 142         }
 143         else if ( b <= 0xEF ) // E1..EC EE..EF
 144         {
 145             for ( int i = 0; i < 2; ++i )
 146             {
 147                 b = *(++c);
 148                 if ( !(b >= 0x80 && b <= 0xBF ) )
 149                     return false;
 150             }
 151         }
 152
 153         // four-byte sequences:
 154         else if ( b == 0xF0 )
 155         {
 156             b = *(++c);
 157             if ( !(b >= 0x90 && b <= 0xBF ) )
 158                 return false;
 159             for ( int i = 0; i < 2; ++i )
 160             {
 161                 b = *(++c);
 162                 if ( !(b >= 0x80 && b <= 0xBF ) )
 163                     return false;
 164             }
 165         }
 166         else if ( b <= 0xF3 ) // F1..F3
 167         {
 168             for ( int i = 0; i < 3; ++i )
 169             {
 170                 b = *(++c);
 171                 if ( !(b >= 0x80 && b <= 0xBF ) )
 172                     return false;
 173             }
 174         }
 175         else if ( b == 0xF4 )
 176         {
 177             b = *(++c);
 178             if ( !(b >= 0x80 && b <= 0x8F ) )
 179                 return false;
 180             for ( int i = 0; i < 2; ++i )
 181             {
 182                 b = *(++c);
 183                 if ( !(b >= 0x80 && b <= 0xBF ) )
 184                     return false;
 185             }
 186         }
 187         else // otherwise, it's invalid lead byte
 188             return false;
 189     }
 190
 191     return true;
 192 }
 193
 194 #ifdef __WXDEBUG__
 195 bool wxStringOperationsUtf8::IsValidUtf8LeadByte(unsigned char c)
 196 {
 197     return (c <= 0x7F) || (c >= 0xC2 && c <= 0xF4);
 198 }
 199 #endif
 200
 201
 202 // NB: this is in this file and not unichar.cpp to keep all UTF-8 encoding
 203 //     code in single place
 204 wxUniChar::Utf8CharBuffer wxUniChar::AsUTF8() const
 205 {
 206     Utf8CharBuffer buf = { "" }; // init to avoid g++ 4.1 warning with -O2
 207     char *out = buf.data;
 208
 209     value_type code = GetValue();
 210
 211     //    Char. number range   |        UTF-8 octet sequence
 212     //       (hexadecimal)     |              (binary)
 213     //   ----------------------+---------------------------------------------
 214     //   0000 0000 - 0000 007F | 0xxxxxxx
 215     //   0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
 216     //   0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
 217     //   0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 218     //
 219     //   Code point value is stored in bits marked with 'x', lowest-order bit
 220     //   of the value on the right side in the diagram above.
 221     //                                                        (from RFC 3629)
 222
 223     if ( code <= 0x7F )
 224     {
 225         out[1] = 0;
 226         out[0] = (char)code;
 227     }
 228     else if ( code <= 0x07FF )
 229     {
 230         out[2] = 0;
 231         // NB: this line takes 6 least significant bits, encodes them as
 232         // 10xxxxxx and discards them so that the next byte can be encoded:
 233         out[1] = 0x80 | (code & 0x3F);  code >>= 6;
 234         out[0] = 0xC0 | code;
 235     }
 236     else if ( code < 0xFFFF )
 237     {
 238         out[3] = 0;
 239         out[2] = 0x80 | (code & 0x3F);  code >>= 6;
 240         out[1] = 0x80 | (code & 0x3F);  code >>= 6;
 241         out[0] = 0xE0 | code;
 242     }
 243     else if ( code <= 0x10FFFF )
 244     {
 245         out[4] = 0;
 246         out[3] = 0x80 | (code & 0x3F);  code >>= 6;
 247         out[2] = 0x80 | (code & 0x3F);  code >>= 6;
 248         out[1] = 0x80 | (code & 0x3F);  code >>= 6;
 249         out[0] = 0xF0 | code;
 250     }
 251     else
 252     {
 253         wxFAIL_MSG( _T("trying to encode undefined Unicode character") );
 254         out[0] = 0;
 255     }
 256
 257     return buf;
 258 }
 259
 260 wxUniChar
 261 wxStringOperationsUtf8::DecodeNonAsciiChar(wxStringImpl::const_iterator i)
 262 {
 263     wxASSERT( IsValidUtf8LeadByte(*i) );
 264
 265     wxUniChar::value_type code = 0;
 266     size_t len = GetUtf8CharLength(*i);
 267     wxASSERT_MSG( len <= 4, _T("invalid UTF-8 sequence length") );
 268
 269     //    Char. number range   |        UTF-8 octet sequence
 270     //       (hexadecimal)     |              (binary)
 271     //   ----------------------+---------------------------------------------
 272     //   0000 0000 - 0000 007F | 0xxxxxxx
 273     //   0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
 274     //   0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
 275     //   0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 276     //
 277     //   Code point value is stored in bits marked with 'x', lowest-order bit
 278     //   of the value on the right side in the diagram above.
 279     //                                                        (from RFC 3629)
 280
 281     // mask to extract lead byte's value ('x' bits above), by sequence's length:
 282     static const unsigned char s_leadValueMask[4] =  { 0x7F, 0x1F, 0x0F, 0x07 };
 283 #ifdef __WXDEBUG__
 284     // mask and value of lead byte's most significant bits, by length:
 285     static const unsigned char s_leadMarkerMask[4] = { 0x80, 0xE0, 0xF0, 0xF8 };
 286     static const unsigned char s_leadMarkerVal[4] =  { 0x00, 0xC0, 0xE0, 0xF0 };
 287 #endif
 288
 289     // extract the lead byte's value bits:
 290     wxASSERT_MSG( ((unsigned char)*i & s_leadMarkerMask[len-1]) ==
 291                   s_leadMarkerVal[len-1],
 292                   _T("invalid UTF-8 lead byte") );
 293     code = (unsigned char)*i & s_leadValueMask[len-1];
 294
 295     // all remaining bytes, if any, are handled in the same way regardless of
 296     // sequence's length:
 297     for ( ++i ; len > 1; --len, ++i )
 298     {
 299         wxASSERT_MSG( ((unsigned char)*i & 0xC0) == 0x80,
 300                       _T("invalid UTF-8 byte") );
 301
 302         code <<= 6;
 303         code |= (unsigned char)*i & 0x3F;
 304     }
 305
 306     return wxUniChar(code);
 307 }
 308
 309 wxCharBuffer wxStringOperationsUtf8::EncodeNChars(size_t n, const wxUniChar& ch)
 310 {
 311     Utf8CharBuffer once(EncodeChar(ch));
 312     // the IncIter() table can be used to determine the length of ch's encoding:
 313     size_t len = ms_utf8IterTable[(unsigned char)once.data[0]];
 314
 315     wxCharBuffer buf(n * len);
 316     char *ptr = buf.data();
 317     for ( size_t i = 0; i < n; i++, ptr += len )
 318     {
 319         memcpy(ptr, once.data, len);
 320     }
 321
 322     return buf;
 323 }
 324
 325 #endif // wxUSE_UNICODE_UTF8