src/common/stringops.cpp

   1 /////////////////////////////////////////////////////////////////////////////
   2 // Name:        src/common/stringops.cpp
   3 // Purpose:     implementation of wxString primitive operations
   4 // Author:      Vaclav Slavik
   5 // Modified by:
   6 // Created:     2007-04-16
   7 // Copyright:   (c) 2007 REA Elektronik GmbH
   8 // Licence:     wxWindows licence
   9 /////////////////////////////////////////////////////////////////////////////
  10
  11 // ===========================================================================
  12 // headers
  13 // ===========================================================================
  14
  15 // For compilers that support precompilation, includes "wx.h".
  16 #include "wx/wxprec.h"
  17
  18 #ifdef __BORLANDC__
  19     #pragma hdrstop
  20 #endif
  21
  22 #ifndef WX_PRECOMP
  23     #include "wx/stringops.h"
  24 #endif
  25
  26 // ===========================================================================
  27 // implementation
  28 // ===========================================================================
  29
  30 #if wxUSE_UNICODE_UTF8
  31
  32 // ---------------------------------------------------------------------------
  33 // UTF-8 sequences lengths
  34 // ---------------------------------------------------------------------------
  35
  36 const unsigned char wxStringOperationsUtf8::ms_utf8IterTable[256] = {
  37     // single-byte sequences (ASCII):
  38     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 00..0F
  39     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 10..1F
  40     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 20..2F
  41     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 30..3F
  42     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 40..4F
  43     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 50..5F
  44     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 60..6F
  45     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 70..7F
  46
  47     // these are invalid, we use step 1 to skip
  48     // over them (should never happen):
  49     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 80..8F
  50     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 90..9F
  51     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // A0..AF
  52     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // B0..BF
  53     1, 1,                                            // C0,C1
  54
  55     // two-byte sequences:
  56           2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // C2..CF
  57     2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // D0..DF
  58
  59     // three-byte sequences:
  60     3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,  // E0..EF
  61
  62     // four-byte sequences:
  63     4, 4, 4, 4, 4,                                   // F0..F4
  64
  65     // these are invalid again (5- or 6-byte
  66     // sequences and sequences for code points
  67     // above U+10FFFF, as restricted by RFC 3629):
  68                    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1   // F5..FF
  69 };
  70
  71 // ---------------------------------------------------------------------------
  72 // UTF-8 operations
  73 // ---------------------------------------------------------------------------
  74
//
// Legal UTF-8 byte sequences, as checked by IsValidUtf8String() below (see
// the "Well-Formed UTF-8 Byte Sequences" table in the Unicode standard).
// Note that the lead byte ED only allows 80..9F as its second byte: this
// excludes the surrogate code points U+D800..U+DFFF.
//
//     Code Points    | 1st Byte | 2nd Byte | 3rd Byte | 4th Byte |
// -------------------+----------+----------+----------+----------+
//   U+0000..U+007F   |  00..7F  |          |          |          |
//   U+0080..U+07FF   |  C2..DF  |  80..BF  |          |          |
//   U+0800..U+0FFF   |  E0      |  A0..BF  |  80..BF  |          |
//   U+1000..U+CFFF   |  E1..EC  |  80..BF  |  80..BF  |          |
//   U+D000..U+D7FF   |  ED      |  80..9F  |  80..BF  |          |
//   U+E000..U+FFFF   |  EE..EF  |  80..BF  |  80..BF  |          |
//  U+10000..U+3FFFF  |  F0      |  90..BF  |  80..BF  |  80..BF  |
//  U+40000..U+FFFFF  |  F1..F3  |  80..BF  |  80..BF  |  80..BF  |
// U+100000..U+10FFFF |  F4      |  80..8F  |  80..BF  |  80..BF  |
// -------------------+----------+----------+----------+----------+
  88
  89 bool wxStringOperationsUtf8::IsValidUtf8String(const char *str, size_t len)
  90 {
  91     if ( !str )
  92         return true; // empty string is UTF8 string
  93
  94     const unsigned char *c = (const unsigned char*)str;
  95     const unsigned char * const end = (len == wxStringImpl::npos) ? NULL : c + len;
  96
  97     for ( ; c != end && *c; ++c )
  98     {
  99         unsigned char b = *c;
 100
 101         if ( end != NULL )
 102         {
 103             // if the string is not NULL-terminated, verify we have enough
 104             // bytes in it left for current character's encoding:
 105             if ( c + ms_utf8IterTable[*c] > end )
 106                 return false;
 107         }
 108
 109         if ( b <= 0x7F ) // 00..7F
 110             continue;
 111
 112         else if ( b < 0xC2 ) // invalid lead bytes: 80..C1
 113             return false;
 114
 115         // two-byte sequences:
 116         else if ( b <= 0xDF ) // C2..DF
 117         {
 118             b = *(++c);
 119             if ( !(b >= 0x80 && b <= 0xBF ) )
 120                 return false;
 121         }
 122
 123         // three-byte sequences:
 124         else if ( b == 0xE0 )
 125         {
 126             b = *(++c);
 127             if ( !(b >= 0xA0 && b <= 0xBF ) )
 128                 return false;
 129             b = *(++c);
 130             if ( !(b >= 0x80 && b <= 0xBF ) )
 131                 return false;
 132         }
 133         else if ( b == 0xED )
 134         {
 135             b = *(++c);
 136             if ( !(b >= 0x80 && b <= 0x9F ) )
 137                 return false;
 138             b = *(++c);
 139             if ( !(b >= 0x80 && b <= 0xBF ) )
 140                 return false;
 141         }
 142         else if ( b <= 0xEF ) // E1..EC EE..EF
 143         {
 144             for ( int i = 0; i < 2; ++i )
 145             {
 146                 b = *(++c);
 147                 if ( !(b >= 0x80 && b <= 0xBF ) )
 148                     return false;
 149             }
 150         }
 151
 152         // four-byte sequences:
 153         else if ( b == 0xF0 )
 154         {
 155             b = *(++c);
 156             if ( !(b >= 0x90 && b <= 0xBF ) )
 157                 return false;
 158             for ( int i = 0; i < 2; ++i )
 159             {
 160                 b = *(++c);
 161                 if ( !(b >= 0x80 && b <= 0xBF ) )
 162                     return false;
 163             }
 164         }
 165         else if ( b <= 0xF3 ) // F1..F3
 166         {
 167             for ( int i = 0; i < 3; ++i )
 168             {
 169                 b = *(++c);
 170                 if ( !(b >= 0x80 && b <= 0xBF ) )
 171                     return false;
 172             }
 173         }
 174         else if ( b == 0xF4 )
 175         {
 176             b = *(++c);
 177             if ( !(b >= 0x80 && b <= 0x8F ) )
 178                 return false;
 179             for ( int i = 0; i < 2; ++i )
 180             {
 181                 b = *(++c);
 182                 if ( !(b >= 0x80 && b <= 0xBF ) )
 183                     return false;
 184             }
 185         }
 186         else // otherwise, it's invalid lead byte
 187             return false;
 188     }
 189
 190     return true;
 191 }
 192
 193 // NB: this is in this file and not unichar.cpp to keep all UTF-8 encoding
 194 //     code in single place
 195 wxUniChar::Utf8CharBuffer wxUniChar::AsUTF8() const
 196 {
 197     Utf8CharBuffer buf = { "" }; // init to avoid g++ 4.1 warning with -O2
 198     char *out = buf.data;
 199
 200     value_type code = GetValue();
 201
 202     //    Char. number range   |        UTF-8 octet sequence
 203     //       (hexadecimal)     |              (binary)
 204     //   ----------------------+---------------------------------------------
 205     //   0000 0000 - 0000 007F | 0xxxxxxx
 206     //   0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
 207     //   0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
 208     //   0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 209     //
 210     //   Code point value is stored in bits marked with 'x', lowest-order bit
 211     //   of the value on the right side in the diagram above.
 212     //                                                        (from RFC 3629)
 213
 214     if ( code <= 0x7F )
 215     {
 216         out[1] = 0;
 217         out[0] = (char)code;
 218     }
 219     else if ( code <= 0x07FF )
 220     {
 221         out[2] = 0;
 222         // NB: this line takes 6 least significant bits, encodes them as
 223         // 10xxxxxx and discards them so that the next byte can be encoded:
 224         out[1] = 0x80 | (code & 0x3F);  code >>= 6;
 225         out[0] = 0xC0 | code;
 226     }
 227     else if ( code < 0xFFFF )
 228     {
 229         out[3] = 0;
 230         out[2] = 0x80 | (code & 0x3F);  code >>= 6;
 231         out[1] = 0x80 | (code & 0x3F);  code >>= 6;
 232         out[0] = 0xE0 | code;
 233     }
 234     else if ( code <= 0x10FFFF )
 235     {
 236         out[4] = 0;
 237         out[3] = 0x80 | (code & 0x3F);  code >>= 6;
 238         out[2] = 0x80 | (code & 0x3F);  code >>= 6;
 239         out[1] = 0x80 | (code & 0x3F);  code >>= 6;
 240         out[0] = 0xF0 | code;
 241     }
 242     else
 243     {
 244         wxFAIL_MSG( wxT("trying to encode undefined Unicode character") );
 245         out[0] = 0;
 246     }
 247
 248     return buf;
 249 }
 250
 251 wxUniChar
 252 wxStringOperationsUtf8::DecodeNonAsciiChar(wxStringImpl::const_iterator i)
 253 {
 254     wxASSERT( IsValidUtf8LeadByte(*i) );
 255
 256     size_t len = GetUtf8CharLength(*i);
 257     wxASSERT_MSG( len <= 4, wxT("invalid UTF-8 sequence length") );
 258
 259     //    Char. number range   |        UTF-8 octet sequence
 260     //       (hexadecimal)     |              (binary)
 261     //   ----------------------+---------------------------------------------
 262     //   0000 0000 - 0000 007F | 0xxxxxxx
 263     //   0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
 264     //   0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
 265     //   0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 266     //
 267     //   Code point value is stored in bits marked with 'x', lowest-order bit
 268     //   of the value on the right side in the diagram above.
 269     //                                                        (from RFC 3629)
 270
 271     // mask to extract lead byte's value ('x' bits above), by sequence's length:
 272     static const unsigned char s_leadValueMask[4] =  { 0x7F, 0x1F, 0x0F, 0x07 };
 273 #if wxDEBUG_LEVEL
 274     // mask and value of lead byte's most significant bits, by length:
 275     static const unsigned char s_leadMarkerMask[4] = { 0x80, 0xE0, 0xF0, 0xF8 };
 276     static const unsigned char s_leadMarkerVal[4] =  { 0x00, 0xC0, 0xE0, 0xF0 };
 277 #endif
 278
 279     // extract the lead byte's value bits:
 280     wxASSERT_MSG( ((unsigned char)*i & s_leadMarkerMask[len-1]) ==
 281                   s_leadMarkerVal[len-1],
 282                   wxT("invalid UTF-8 lead byte") );
 283     wxUniChar::value_type code = (unsigned char)*i & s_leadValueMask[len-1];
 284
 285     // all remaining bytes, if any, are handled in the same way regardless of
 286     // sequence's length:
 287     for ( ++i ; len > 1; --len, ++i )
 288     {
 289         wxASSERT_MSG( ((unsigned char)*i & 0xC0) == 0x80,
 290                       wxT("invalid UTF-8 byte") );
 291
 292         code <<= 6;
 293         code |= (unsigned char)*i & 0x3F;
 294     }
 295
 296     return wxUniChar(code);
 297 }
 298
 299 wxCharBuffer wxStringOperationsUtf8::EncodeNChars(size_t n, const wxUniChar& ch)
 300 {
 301     Utf8CharBuffer once(EncodeChar(ch));
 302     // the IncIter() table can be used to determine the length of ch's encoding:
 303     size_t len = ms_utf8IterTable[(unsigned char)once.data[0]];
 304
 305     wxCharBuffer buf(n * len);
 306     char *ptr = buf.data();
 307     for ( size_t i = 0; i < n; i++, ptr += len )
 308     {
 309         memcpy(ptr, once.data, len);
 310     }
 311
 312     return buf;
 313 }
 314
 315 #endif // wxUSE_UNICODE_UTF8