src/common/stringops.cpp

   1 /////////////////////////////////////////////////////////////////////////////
   2 // Name:        src/common/stringops.cpp
   3 // Purpose:     implementation of wxString primitive operations
   4 // Author:      Vaclav Slavik
   5 // Modified by:
   6 // Created:     2007-04-16
   7 // RCS-ID:      $Id$
   8 // Copyright:   (c) 2007 REA Elektronik GmbH
   9 // Licence:     wxWindows licence
  10 /////////////////////////////////////////////////////////////////////////////
  11
  12 // ===========================================================================
  13 // headers
  14 // ===========================================================================
  15
  16 // For compilers that support precompilation, includes "wx.h".
  17 #include "wx/wxprec.h"
  18
  19 #ifdef __BORLANDC__
  20     #pragma hdrstop
  21 #endif
  22
  23 #ifndef WX_PRECOMP
  24     #include "wx/stringops.h"
  25 #endif
  26
  27 // ===========================================================================
  28 // implementation
  29 // ===========================================================================
  30
  31 #if wxUSE_UNICODE_UTF8
  32
  33 // ---------------------------------------------------------------------------
  34 // UTF-8 sequences lengths
  35 // ---------------------------------------------------------------------------
  36
  37 const unsigned char wxStringOperationsUtf8::ms_utf8IterTable[256] = {
  38     // single-byte sequences (ASCII):
  39     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 00..0F
  40     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 10..1F
  41     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 20..2F
  42     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 30..3F
  43     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 40..4F
  44     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 50..5F
  45     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 60..6F
  46     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 70..7F
  47
  48     // these are invalid, we use step 1 to skip
  49     // over them (should never happen):
  50     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 80..8F
  51     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 90..9F
  52     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // A0..AF
  53     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // B0..BF
  54     1, 1,                                            // C0,C1
  55
  56     // two-byte sequences:
  57           2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // C2..CF
  58     2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // D0..DF
  59
  60     // three-byte sequences:
  61     3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,  // E0..EF
  62
  63     // four-byte sequences:
  64     4, 4, 4, 4, 4,                                   // F0..F4
  65
  66     // these are invalid again (5- or 6-byte
  67     // sequences and sequences for code points
  68     // above U+10FFFF, as restricted by RFC 3629):
  69                    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1   // F5..FF
  70 };
  71
  72 // ---------------------------------------------------------------------------
  73 // UTF-8 operations
  74 // ---------------------------------------------------------------------------
  75
  76 //
  77 // Table 3.1B from Unicode spec: Legal UTF-8 Byte Sequences
  78 //
  79 //     Code Points    | 1st Byte | 2nd Byte | 3rd Byte | 4th Byte |
  80 // -------------------+----------+----------+----------+----------+
  81 //   U+0000..U+007F   |  00..7F  |          |          |          |
  82 //   U+0080..U+07FF   |  C2..DF  |  80..BF  |          |          |
  83 //   U+0800..U+0FFF   |  E0      |  A0..BF  |  80..BF  |          |
  84 //   U+1000..U+FFFF   |  E1..EF  |  80..BF  |  80..BF  |          |
  85 //  U+10000..U+3FFFF  |  F0      |  90..BF  |  80..BF  |  80..BF  |
  86 //  U+40000..U+FFFFF  |  F1..F3  |  80..BF  |  80..BF  |  80..BF  |
  87 // U+100000..U+10FFFF |  F4      |  80..8F  |  80..BF  |  80..BF  |
  88 // -------------------+----------+----------+----------+----------+
  89
  90 bool wxStringOperationsUtf8::IsValidUtf8String(const char *str, size_t len)
  91 {
  92     if ( !str )
  93         return true; // empty string is UTF8 string
  94
  95     const unsigned char *c = (const unsigned char*)str;
  96     const unsigned char * const end = (len == wxStringImpl::npos) ? NULL : c + len;
  97
  98     for ( ; c != end && *c; ++c )
  99     {
 100         unsigned char b = *c;
 101
 102         if ( end != NULL )
 103         {
 104             // if the string is not NULL-terminated, verify we have enough
 105             // bytes in it left for current character's encoding:
 106             if ( c + ms_utf8IterTable[*c] > end )
 107                 return false;
 108         }
 109
 110         if ( b <= 0x7F ) // 00..7F
 111             continue;
 112
 113         else if ( b < 0xC2 ) // invalid lead bytes: 80..C1
 114             return false;
 115
 116         // two-byte sequences:
 117         else if ( b <= 0xDF ) // C2..DF
 118         {
 119             b = *(++c);
 120             if ( !(b >= 0x80 && b <= 0xBF ) )
 121                 return false;
 122         }
 123
 124         // three-byte sequences:
 125         else if ( b == 0xE0 )
 126         {
 127             b = *(++c);
 128             if ( !(b >= 0xA0 && b <= 0xBF ) )
 129                 return false;
 130             b = *(++c);
 131             if ( !(b >= 0x80 && b <= 0xBF ) )
 132                 return false;
 133         }
 134         else if ( b <= 0xEF ) // E1..EF
 135         {
 136             for ( int i = 0; i < 2; ++i )
 137             {
 138                 b = *(++c);
 139                 if ( !(b >= 0x80 && b <= 0xBF ) )
 140                     return false;
 141             }
 142         }
 143
 144         // four-byte sequences:
 145         else if ( b == 0xF0 )
 146         {
 147             b = *(++c);
 148             if ( !(b >= 0x90 && b <= 0xBF ) )
 149                 return false;
 150             for ( int i = 0; i < 2; ++i )
 151             {
 152                 b = *(++c);
 153                 if ( !(b >= 0x80 && b <= 0xBF ) )
 154                     return false;
 155             }
 156         }
 157         else if ( b <= 0xF3 ) // F1..F3
 158         {
 159             for ( int i = 0; i < 3; ++i )
 160             {
 161                 b = *(++c);
 162                 if ( !(b >= 0x80 && b <= 0xBF ) )
 163                     return false;
 164             }
 165         }
 166         else if ( b == 0xF4 )
 167         {
 168             b = *(++c);
 169             if ( !(b >= 0x80 && b <= 0x8F ) )
 170                 return false;
 171             for ( int i = 0; i < 2; ++i )
 172             {
 173                 b = *(++c);
 174                 if ( !(b >= 0x80 && b <= 0xBF ) )
 175                     return false;
 176             }
 177         }
 178         else // otherwise, it's invalid lead byte
 179             return false;
 180     }
 181
 182     return true;
 183 }
 184
 185 #ifdef __WXDEBUG__
 186 bool wxStringOperationsUtf8::IsValidUtf8LeadByte(unsigned char c)
 187 {
 188     return (c <= 0x7F) || (c >= 0xC2 && c <= 0xF4);
 189 }
 190 #endif
 191
 192
 193 // NB: this is in this file and not unichar.cpp to keep all UTF-8 encoding
 194 //     code in single place
 195 wxUniChar::Utf8CharBuffer wxUniChar::AsUTF8() const
 196 {
 197     Utf8CharBuffer buf;
 198     char *out = buf.data;
 199
 200     value_type code = GetValue();
 201
 202     //    Char. number range   |        UTF-8 octet sequence
 203     //       (hexadecimal)     |              (binary)
 204     //   ----------------------+---------------------------------------------
 205     //   0000 0000 - 0000 007F | 0xxxxxxx
 206     //   0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
 207     //   0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
 208     //   0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 209     //
 210     //   Code point value is stored in bits marked with 'x', lowest-order bit
 211     //   of the value on the right side in the diagram above.
 212     //                                                        (from RFC 3629)
 213
 214     if ( code <= 0x7F )
 215     {
 216         out[1] = 0;
 217         out[0] = (char)code;
 218     }
 219     else if ( code <= 0x07FF )
 220     {
 221         out[2] = 0;
 222         // NB: this line takes 6 least significant bits, encodes them as
 223         // 10xxxxxx and discards them so that the next byte can be encoded:
 224         out[1] = 0x80 | (code & 0x3F);  code >>= 6;
 225         out[0] = 0xC0 | code;
 226     }
 227     else if ( code < 0xFFFF )
 228     {
 229         out[3] = 0;
 230         out[2] = 0x80 | (code & 0x3F);  code >>= 6;
 231         out[1] = 0x80 | (code & 0x3F);  code >>= 6;
 232         out[0] = 0xE0 | code;
 233     }
 234     else if ( code <= 0x10FFFF )
 235     {
 236         out[4] = 0;
 237         out[3] = 0x80 | (code & 0x3F);  code >>= 6;
 238         out[2] = 0x80 | (code & 0x3F);  code >>= 6;
 239         out[1] = 0x80 | (code & 0x3F);  code >>= 6;
 240         out[0] = 0xF0 | code;
 241     }
 242     else
 243     {
 244         wxFAIL_MSG( _T("trying to encode undefined Unicode character") );
 245         out[0] = 0;
 246     }
 247
 248     return buf;
 249 }
 250
 251 wxUniChar
 252 wxStringOperationsUtf8::DecodeNonAsciiChar(wxStringImpl::const_iterator i)
 253 {
 254     wxASSERT( IsValidUtf8LeadByte(*i) );
 255
 256     wxUniChar::value_type code = 0;
 257     size_t len = GetUtf8CharLength(*i);
 258     wxASSERT_MSG( len <= 4, _T("invalid UTF-8 sequence length") );
 259
 260     //    Char. number range   |        UTF-8 octet sequence
 261     //       (hexadecimal)     |              (binary)
 262     //   ----------------------+---------------------------------------------
 263     //   0000 0000 - 0000 007F | 0xxxxxxx
 264     //   0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
 265     //   0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
 266     //   0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 267     //
 268     //   Code point value is stored in bits marked with 'x', lowest-order bit
 269     //   of the value on the right side in the diagram above.
 270     //                                                        (from RFC 3629)
 271
 272     // mask to extract lead byte's value ('x' bits above), by sequence's length:
 273     static const unsigned char s_leadValueMask[4] =  { 0x7F, 0x1F, 0x0F, 0x07 };
 274 #ifdef __WXDEBUG__
 275     // mask and value of lead byte's most significant bits, by length:
 276     static const unsigned char s_leadMarkerMask[4] = { 0x80, 0xE0, 0xF0, 0xF8 };
 277     static const unsigned char s_leadMarkerVal[4] =  { 0x00, 0xC0, 0xE0, 0xF0 };
 278 #endif
 279
 280     // extract the lead byte's value bits:
 281     wxASSERT_MSG( ((unsigned char)*i & s_leadMarkerMask[len-1]) ==
 282                   s_leadMarkerVal[len-1],
 283                   _T("invalid UTF-8 lead byte") );
 284     code = (unsigned char)*i & s_leadValueMask[len-1];
 285
 286     // all remaining bytes, if any, are handled in the same way regardless of
 287     // sequence's length:
 288     for ( ++i ; len > 1; --len, ++i )
 289     {
 290         wxASSERT_MSG( ((unsigned char)*i & 0xC0) == 0x80,
 291                       _T("invalid UTF-8 byte") );
 292
 293         code <<= 6;
 294         code |= (unsigned char)*i & 0x3F;
 295     }
 296
 297     return wxUniChar(code);
 298 }
 299
 300 wxCharBuffer wxStringOperationsUtf8::EncodeNChars(size_t n, const wxUniChar& ch)
 301 {
 302     Utf8CharBuffer once(EncodeChar(ch));
 303     // the IncIter() table can be used to determine the length of ch's encoding:
 304     size_t len = ms_utf8IterTable[(unsigned char)once.data[0]];
 305
 306     wxCharBuffer buf(n * len);
 307     char *ptr = buf.data();
 308     for ( size_t i = 0; i < n; i++, ptr += len )
 309     {
 310         memcpy(ptr, once.data, len);
 311     }
 312
 313     return buf;
 314 }
 315
 316 #endif // wxUSE_UNICODE_UTF8