src/common/stringops.cpp

   1 /////////////////////////////////////////////////////////////////////////////
   2 // Name:        src/common/stringops.cpp
   3 // Purpose:     implementation of wxString primitive operations
   4 // Author:      Vaclav Slavik
   5 // Modified by:
   6 // Created:     2007-04-16
   7 // RCS-ID:      $Id$
   8 // Copyright:   (c) 2007 REA Elektronik GmbH
   9 // Licence:     wxWindows licence
  10 /////////////////////////////////////////////////////////////////////////////
  11
  12 // ===========================================================================
  13 // headers
  14 // ===========================================================================
  15
  16 // For compilers that support precompilation, includes "wx.h".
  17 #include "wx/wxprec.h"
  18
  19 #ifdef __BORLANDC__
  20     #pragma hdrstop
  21 #endif
  22
  23 #ifndef WX_PRECOMP
  24     #include "wx/stringops.h"
  25 #endif
  26
  27 // ===========================================================================
  28 // implementation
  29 // ===========================================================================
  30
  31 #if wxUSE_UNICODE_UTF8
  32
  33 // ---------------------------------------------------------------------------
  34 // UTF-8 sequences lengths
  35 // ---------------------------------------------------------------------------
  36
  37 const unsigned char wxStringOperationsUtf8::ms_utf8IterTable[256] = {
  38     // single-byte sequences (ASCII):
  39     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 00..0F
  40     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 10..1F
  41     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 20..2F
  42     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 30..3F
  43     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 40..4F
  44     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 50..5F
  45     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 60..6F
  46     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 70..7F
  47
  48     // these are invalid, we use step 1 to skip
  49     // over them (should never happen):
  50     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 80..8F
  51     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 90..9F
  52     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // A0..AF
  53     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // B0..BF
  54     1, 1,                                            // C0,C1
  55
  56     // two-byte sequences:
  57           2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // C2..CF
  58     2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // D0..DF
  59
  60     // three-byte sequences:
  61     3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,  // E0..EF
  62
  63     // four-byte sequences:
  64     4, 4, 4, 4, 4,                                   // F0..F4
  65
  66     // these are invalid again (5- or 6-byte
  67     // sequences and sequences for code points
  68     // above U+10FFFF, as restricted by RFC 3629):
  69                    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1   // F5..FF
  70 };
  71
  72 // ---------------------------------------------------------------------------
  73 // UTF-8 operations
  74 // ---------------------------------------------------------------------------
  75
  76 //
// Well-formed UTF-8 byte sequences (cf. Table 3-7 of the Unicode Standard;
// note the separate row for lead byte ED, which excludes the surrogate
// range U+D800..U+DFFF):
//
//     Code Points    | 1st Byte | 2nd Byte | 3rd Byte | 4th Byte |
// -------------------+----------+----------+----------+----------+
//   U+0000..U+007F   |  00..7F  |          |          |          |
//   U+0080..U+07FF   |  C2..DF  |  80..BF  |          |          |
//   U+0800..U+0FFF   |  E0      |  A0..BF  |  80..BF  |          |
//   U+1000..U+CFFF   |  E1..EC  |  80..BF  |  80..BF  |          |
//   U+D000..U+D7FF   |  ED      |  80..9F  |  80..BF  |          |
//   U+E000..U+FFFF   |  EE..EF  |  80..BF  |  80..BF  |          |
//  U+10000..U+3FFFF  |  F0      |  90..BF  |  80..BF  |  80..BF  |
//  U+40000..U+FFFFF  |  F1..F3  |  80..BF  |  80..BF  |  80..BF  |
// U+100000..U+10FFFF |  F4      |  80..8F  |  80..BF  |  80..BF  |
  88 // -------------------+----------+----------+----------+----------+
  89
  90 bool wxStringOperationsUtf8::IsValidUtf8String(const char *str, size_t len)
  91 {
  92     if ( !str )
  93         return true; // empty string is UTF8 string
  94
  95     const unsigned char *c = (const unsigned char*)str;
  96     const unsigned char * const end = (len == wxStringImpl::npos) ? NULL : c + len;
  97
  98     for ( ; c != end && *c; ++c )
  99     {
 100         unsigned char b = *c;
 101
 102         if ( end != NULL )
 103         {
 104             // if the string is not NULL-terminated, verify we have enough
 105             // bytes in it left for current character's encoding:
 106             if ( c + ms_utf8IterTable[*c] > end )
 107                 return false;
 108         }
 109
 110         if ( b <= 0x7F ) // 00..7F
 111             continue;
 112
 113         else if ( b < 0xC2 ) // invalid lead bytes: 80..C1
 114             return false;
 115
 116         // two-byte sequences:
 117         else if ( b <= 0xDF ) // C2..DF
 118         {
 119             b = *(++c);
 120             if ( !(b >= 0x80 && b <= 0xBF ) )
 121                 return false;
 122         }
 123
 124         // three-byte sequences:
 125         else if ( b == 0xE0 )
 126         {
 127             b = *(++c);
 128             if ( !(b >= 0xA0 && b <= 0xBF ) )
 129                 return false;
 130             b = *(++c);
 131             if ( !(b >= 0x80 && b <= 0xBF ) )
 132                 return false;
 133         }
 134         else if ( b == 0xED )
 135         {
 136             b = *(++c);
 137             if ( !(b >= 0x80 && b <= 0x9F ) )
 138                 return false;
 139             b = *(++c);
 140             if ( !(b >= 0x80 && b <= 0xBF ) )
 141                 return false;
 142         }
 143         else if ( b <= 0xEF ) // E1..EC EE..EF
 144         {
 145             for ( int i = 0; i < 2; ++i )
 146             {
 147                 b = *(++c);
 148                 if ( !(b >= 0x80 && b <= 0xBF ) )
 149                     return false;
 150             }
 151         }
 152
 153         // four-byte sequences:
 154         else if ( b == 0xF0 )
 155         {
 156             b = *(++c);
 157             if ( !(b >= 0x90 && b <= 0xBF ) )
 158                 return false;
 159             for ( int i = 0; i < 2; ++i )
 160             {
 161                 b = *(++c);
 162                 if ( !(b >= 0x80 && b <= 0xBF ) )
 163                     return false;
 164             }
 165         }
 166         else if ( b <= 0xF3 ) // F1..F3
 167         {
 168             for ( int i = 0; i < 3; ++i )
 169             {
 170                 b = *(++c);
 171                 if ( !(b >= 0x80 && b <= 0xBF ) )
 172                     return false;
 173             }
 174         }
 175         else if ( b == 0xF4 )
 176         {
 177             b = *(++c);
 178             if ( !(b >= 0x80 && b <= 0x8F ) )
 179                 return false;
 180             for ( int i = 0; i < 2; ++i )
 181             {
 182                 b = *(++c);
 183                 if ( !(b >= 0x80 && b <= 0xBF ) )
 184                     return false;
 185             }
 186         }
 187         else // otherwise, it's invalid lead byte
 188             return false;
 189     }
 190
 191     return true;
 192 }
 193
 194 // NB: this is in this file and not unichar.cpp to keep all UTF-8 encoding
 195 //     code in single place
 196 wxUniChar::Utf8CharBuffer wxUniChar::AsUTF8() const
 197 {
 198     Utf8CharBuffer buf = { "" }; // init to avoid g++ 4.1 warning with -O2
 199     char *out = buf.data;
 200
 201     value_type code = GetValue();
 202
 203     //    Char. number range   |        UTF-8 octet sequence
 204     //       (hexadecimal)     |              (binary)
 205     //   ----------------------+---------------------------------------------
 206     //   0000 0000 - 0000 007F | 0xxxxxxx
 207     //   0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
 208     //   0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
 209     //   0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 210     //
 211     //   Code point value is stored in bits marked with 'x', lowest-order bit
 212     //   of the value on the right side in the diagram above.
 213     //                                                        (from RFC 3629)
 214
 215     if ( code <= 0x7F )
 216     {
 217         out[1] = 0;
 218         out[0] = (char)code;
 219     }
 220     else if ( code <= 0x07FF )
 221     {
 222         out[2] = 0;
 223         // NB: this line takes 6 least significant bits, encodes them as
 224         // 10xxxxxx and discards them so that the next byte can be encoded:
 225         out[1] = 0x80 | (code & 0x3F);  code >>= 6;
 226         out[0] = 0xC0 | code;
 227     }
 228     else if ( code < 0xFFFF )
 229     {
 230         out[3] = 0;
 231         out[2] = 0x80 | (code & 0x3F);  code >>= 6;
 232         out[1] = 0x80 | (code & 0x3F);  code >>= 6;
 233         out[0] = 0xE0 | code;
 234     }
 235     else if ( code <= 0x10FFFF )
 236     {
 237         out[4] = 0;
 238         out[3] = 0x80 | (code & 0x3F);  code >>= 6;
 239         out[2] = 0x80 | (code & 0x3F);  code >>= 6;
 240         out[1] = 0x80 | (code & 0x3F);  code >>= 6;
 241         out[0] = 0xF0 | code;
 242     }
 243     else
 244     {
 245         wxFAIL_MSG( wxT("trying to encode undefined Unicode character") );
 246         out[0] = 0;
 247     }
 248
 249     return buf;
 250 }
 251
 252 wxUniChar
 253 wxStringOperationsUtf8::DecodeNonAsciiChar(wxStringImpl::const_iterator i)
 254 {
 255     wxASSERT( IsValidUtf8LeadByte(*i) );
 256
 257     size_t len = GetUtf8CharLength(*i);
 258     wxASSERT_MSG( len <= 4, wxT("invalid UTF-8 sequence length") );
 259
 260     //    Char. number range   |        UTF-8 octet sequence
 261     //       (hexadecimal)     |              (binary)
 262     //   ----------------------+---------------------------------------------
 263     //   0000 0000 - 0000 007F | 0xxxxxxx
 264     //   0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
 265     //   0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
 266     //   0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 267     //
 268     //   Code point value is stored in bits marked with 'x', lowest-order bit
 269     //   of the value on the right side in the diagram above.
 270     //                                                        (from RFC 3629)
 271
 272     // mask to extract lead byte's value ('x' bits above), by sequence's length:
 273     static const unsigned char s_leadValueMask[4] =  { 0x7F, 0x1F, 0x0F, 0x07 };
 274 #if wxDEBUG_LEVEL
 275     // mask and value of lead byte's most significant bits, by length:
 276     static const unsigned char s_leadMarkerMask[4] = { 0x80, 0xE0, 0xF0, 0xF8 };
 277     static const unsigned char s_leadMarkerVal[4] =  { 0x00, 0xC0, 0xE0, 0xF0 };
 278 #endif
 279
 280     // extract the lead byte's value bits:
 281     wxASSERT_MSG( ((unsigned char)*i & s_leadMarkerMask[len-1]) ==
 282                   s_leadMarkerVal[len-1],
 283                   wxT("invalid UTF-8 lead byte") );
 284     wxUniChar::value_type code = (unsigned char)*i & s_leadValueMask[len-1];
 285
 286     // all remaining bytes, if any, are handled in the same way regardless of
 287     // sequence's length:
 288     for ( ++i ; len > 1; --len, ++i )
 289     {
 290         wxASSERT_MSG( ((unsigned char)*i & 0xC0) == 0x80,
 291                       wxT("invalid UTF-8 byte") );
 292
 293         code <<= 6;
 294         code |= (unsigned char)*i & 0x3F;
 295     }
 296
 297     return wxUniChar(code);
 298 }
 299
 300 wxCharBuffer wxStringOperationsUtf8::EncodeNChars(size_t n, const wxUniChar& ch)
 301 {
 302     Utf8CharBuffer once(EncodeChar(ch));
 303     // the IncIter() table can be used to determine the length of ch's encoding:
 304     size_t len = ms_utf8IterTable[(unsigned char)once.data[0]];
 305
 306     wxCharBuffer buf(n * len);
 307     char *ptr = buf.data();
 308     for ( size_t i = 0; i < n; i++, ptr += len )
 309     {
 310         memcpy(ptr, once.data, len);
 311     }
 312
 313     return buf;
 314 }
 315
 316 #endif // wxUSE_UNICODE_UTF8