| 1 | ///////////////////////////////////////////////////////////////////////////// |
| 2 | // Name: src/common/stringops.cpp |
| 3 | // Purpose: implementation of wxString primitive operations |
| 4 | // Author: Vaclav Slavik |
| 5 | // Modified by: |
| 6 | // Created: 2007-04-16 |
| 7 | // RCS-ID: $Id$ |
| 8 | // Copyright: (c) 2007 REA Elektronik GmbH |
| 9 | // Licence: wxWindows licence |
| 10 | ///////////////////////////////////////////////////////////////////////////// |
| 11 | |
| 12 | // =========================================================================== |
| 13 | // headers |
| 14 | // =========================================================================== |
| 15 | |
| 16 | // For compilers that support precompilation, includes "wx.h". |
| 17 | #include "wx/wxprec.h" |
| 18 | |
| 19 | #ifdef __BORLANDC__ |
| 20 | #pragma hdrstop |
| 21 | #endif |
| 22 | |
| 23 | #ifndef WX_PRECOMP |
| 24 | #include "wx/stringops.h" |
| 25 | #endif |
| 26 | |
| 27 | // =========================================================================== |
| 28 | // implementation |
| 29 | // =========================================================================== |
| 30 | |
| 31 | #if wxUSE_UNICODE_UTF8 |
| 32 | |
| 33 | // --------------------------------------------------------------------------- |
| 34 | // UTF-8 sequences lengths |
| 35 | // --------------------------------------------------------------------------- |
| 36 | |
| 37 | const unsigned char wxStringOperationsUtf8::ms_utf8IterTable[256] = { |
| 38 | // single-byte sequences (ASCII): |
| 39 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 00..0F |
| 40 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 10..1F |
| 41 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 20..2F |
| 42 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 30..3F |
| 43 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 40..4F |
| 44 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 50..5F |
| 45 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 60..6F |
| 46 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 70..7F |
| 47 | |
| 48 | // these are invalid, we use step 1 to skip |
| 49 | // over them (should never happen): |
| 50 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 80..8F |
| 51 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 90..9F |
| 52 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // A0..AF |
| 53 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // B0..BF |
| 54 | 1, 1, // C0,C1 |
| 55 | |
| 56 | // two-byte sequences: |
| 57 | 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // C2..CF |
| 58 | 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // D0..DF |
| 59 | |
| 60 | // three-byte sequences: |
| 61 | 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // E0..EF |
| 62 | |
| 63 | // four-byte sequences: |
| 64 | 4, 4, 4, 4, 4, // F0..F4 |
| 65 | |
| 66 | // these are invalid again (5- or 6-byte |
| 67 | // sequences and sequences for code points |
| 68 | // above U+10FFFF, as restricted by RFC 3629): |
| 69 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 // F5..FF |
| 70 | }; |
| 71 | |
| 72 | // --------------------------------------------------------------------------- |
| 73 | // UTF-8 operations |
| 74 | // --------------------------------------------------------------------------- |
| 75 | |
| 76 | // |
| 77 | // Table 3.1B from Unicode spec: Legal UTF-8 Byte Sequences |
| 78 | // |
| 79 | // Code Points | 1st Byte | 2nd Byte | 3rd Byte | 4th Byte | |
| 80 | // -------------------+----------+----------+----------+----------+ |
| 81 | // U+0000..U+007F | 00..7F | | | | |
| 82 | // U+0080..U+07FF | C2..DF | 80..BF | | | |
| 83 | // U+0800..U+0FFF | E0 | A0..BF | 80..BF | | |
| 84 | // U+1000..U+FFFF | E1..EF | 80..BF | 80..BF | | |
| 85 | // U+10000..U+3FFFF | F0 | 90..BF | 80..BF | 80..BF | |
| 86 | // U+40000..U+FFFFF | F1..F3 | 80..BF | 80..BF | 80..BF | |
| 87 | // U+100000..U+10FFFF | F4 | 80..8F | 80..BF | 80..BF | |
| 88 | // -------------------+----------+----------+----------+----------+ |
| 89 | |
| 90 | bool wxStringOperationsUtf8::IsValidUtf8String(const char *str, size_t len) |
| 91 | { |
| 92 | if ( !str ) |
| 93 | return true; // empty string is UTF8 string |
| 94 | |
| 95 | const unsigned char *c = (const unsigned char*)str; |
| 96 | const unsigned char * const end = (len == wxStringImpl::npos) ? NULL : c + len; |
| 97 | |
| 98 | for ( ; c != end && *c; ++c ) |
| 99 | { |
| 100 | unsigned char b = *c; |
| 101 | |
| 102 | if ( end != NULL ) |
| 103 | { |
| 104 | // if the string is not NULL-terminated, verify we have enough |
| 105 | // bytes in it left for current character's encoding: |
| 106 | if ( c + ms_utf8IterTable[*c] > end ) |
| 107 | return false; |
| 108 | } |
| 109 | |
| 110 | if ( b <= 0x7F ) // 00..7F |
| 111 | continue; |
| 112 | |
| 113 | else if ( b < 0xC2 ) // invalid lead bytes: 80..C1 |
| 114 | return false; |
| 115 | |
| 116 | // two-byte sequences: |
| 117 | else if ( b <= 0xDF ) // C2..DF |
| 118 | { |
| 119 | b = *(++c); |
| 120 | if ( !(b >= 0x80 && b <= 0xBF ) ) |
| 121 | return false; |
| 122 | } |
| 123 | |
| 124 | // three-byte sequences: |
| 125 | else if ( b == 0xE0 ) |
| 126 | { |
| 127 | b = *(++c); |
| 128 | if ( !(b >= 0xA0 && b <= 0xBF ) ) |
| 129 | return false; |
| 130 | b = *(++c); |
| 131 | if ( !(b >= 0x80 && b <= 0xBF ) ) |
| 132 | return false; |
| 133 | } |
| 134 | else if ( b == 0xED ) |
| 135 | { |
| 136 | b = *(++c); |
| 137 | if ( !(b >= 0x80 && b <= 0x9F ) ) |
| 138 | return false; |
| 139 | b = *(++c); |
| 140 | if ( !(b >= 0x80 && b <= 0xBF ) ) |
| 141 | return false; |
| 142 | } |
| 143 | else if ( b <= 0xEF ) // E1..EC EE..EF |
| 144 | { |
| 145 | for ( int i = 0; i < 2; ++i ) |
| 146 | { |
| 147 | b = *(++c); |
| 148 | if ( !(b >= 0x80 && b <= 0xBF ) ) |
| 149 | return false; |
| 150 | } |
| 151 | } |
| 152 | |
| 153 | // four-byte sequences: |
| 154 | else if ( b == 0xF0 ) |
| 155 | { |
| 156 | b = *(++c); |
| 157 | if ( !(b >= 0x90 && b <= 0xBF ) ) |
| 158 | return false; |
| 159 | for ( int i = 0; i < 2; ++i ) |
| 160 | { |
| 161 | b = *(++c); |
| 162 | if ( !(b >= 0x80 && b <= 0xBF ) ) |
| 163 | return false; |
| 164 | } |
| 165 | } |
| 166 | else if ( b <= 0xF3 ) // F1..F3 |
| 167 | { |
| 168 | for ( int i = 0; i < 3; ++i ) |
| 169 | { |
| 170 | b = *(++c); |
| 171 | if ( !(b >= 0x80 && b <= 0xBF ) ) |
| 172 | return false; |
| 173 | } |
| 174 | } |
| 175 | else if ( b == 0xF4 ) |
| 176 | { |
| 177 | b = *(++c); |
| 178 | if ( !(b >= 0x80 && b <= 0x8F ) ) |
| 179 | return false; |
| 180 | for ( int i = 0; i < 2; ++i ) |
| 181 | { |
| 182 | b = *(++c); |
| 183 | if ( !(b >= 0x80 && b <= 0xBF ) ) |
| 184 | return false; |
| 185 | } |
| 186 | } |
| 187 | else // otherwise, it's invalid lead byte |
| 188 | return false; |
| 189 | } |
| 190 | |
| 191 | return true; |
| 192 | } |
| 193 | |
| 194 | // NB: this is in this file and not unichar.cpp to keep all UTF-8 encoding |
| 195 | // code in single place |
| 196 | wxUniChar::Utf8CharBuffer wxUniChar::AsUTF8() const |
| 197 | { |
| 198 | Utf8CharBuffer buf = { "" }; // init to avoid g++ 4.1 warning with -O2 |
| 199 | char *out = buf.data; |
| 200 | |
| 201 | value_type code = GetValue(); |
| 202 | |
| 203 | // Char. number range | UTF-8 octet sequence |
| 204 | // (hexadecimal) | (binary) |
| 205 | // ----------------------+--------------------------------------------- |
| 206 | // 0000 0000 - 0000 007F | 0xxxxxxx |
| 207 | // 0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx |
| 208 | // 0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx |
| 209 | // 0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx |
| 210 | // |
| 211 | // Code point value is stored in bits marked with 'x', lowest-order bit |
| 212 | // of the value on the right side in the diagram above. |
| 213 | // (from RFC 3629) |
| 214 | |
| 215 | if ( code <= 0x7F ) |
| 216 | { |
| 217 | out[1] = 0; |
| 218 | out[0] = (char)code; |
| 219 | } |
| 220 | else if ( code <= 0x07FF ) |
| 221 | { |
| 222 | out[2] = 0; |
| 223 | // NB: this line takes 6 least significant bits, encodes them as |
| 224 | // 10xxxxxx and discards them so that the next byte can be encoded: |
| 225 | out[1] = 0x80 | (code & 0x3F); code >>= 6; |
| 226 | out[0] = 0xC0 | code; |
| 227 | } |
| 228 | else if ( code < 0xFFFF ) |
| 229 | { |
| 230 | out[3] = 0; |
| 231 | out[2] = 0x80 | (code & 0x3F); code >>= 6; |
| 232 | out[1] = 0x80 | (code & 0x3F); code >>= 6; |
| 233 | out[0] = 0xE0 | code; |
| 234 | } |
| 235 | else if ( code <= 0x10FFFF ) |
| 236 | { |
| 237 | out[4] = 0; |
| 238 | out[3] = 0x80 | (code & 0x3F); code >>= 6; |
| 239 | out[2] = 0x80 | (code & 0x3F); code >>= 6; |
| 240 | out[1] = 0x80 | (code & 0x3F); code >>= 6; |
| 241 | out[0] = 0xF0 | code; |
| 242 | } |
| 243 | else |
| 244 | { |
| 245 | wxFAIL_MSG( wxT("trying to encode undefined Unicode character") ); |
| 246 | out[0] = 0; |
| 247 | } |
| 248 | |
| 249 | return buf; |
| 250 | } |
| 251 | |
| 252 | wxUniChar |
| 253 | wxStringOperationsUtf8::DecodeNonAsciiChar(wxStringImpl::const_iterator i) |
| 254 | { |
| 255 | wxASSERT( IsValidUtf8LeadByte(*i) ); |
| 256 | |
| 257 | size_t len = GetUtf8CharLength(*i); |
| 258 | wxASSERT_MSG( len <= 4, wxT("invalid UTF-8 sequence length") ); |
| 259 | |
| 260 | // Char. number range | UTF-8 octet sequence |
| 261 | // (hexadecimal) | (binary) |
| 262 | // ----------------------+--------------------------------------------- |
| 263 | // 0000 0000 - 0000 007F | 0xxxxxxx |
| 264 | // 0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx |
| 265 | // 0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx |
| 266 | // 0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx |
| 267 | // |
| 268 | // Code point value is stored in bits marked with 'x', lowest-order bit |
| 269 | // of the value on the right side in the diagram above. |
| 270 | // (from RFC 3629) |
| 271 | |
| 272 | // mask to extract lead byte's value ('x' bits above), by sequence's length: |
| 273 | static const unsigned char s_leadValueMask[4] = { 0x7F, 0x1F, 0x0F, 0x07 }; |
| 274 | #if wxDEBUG_LEVEL |
| 275 | // mask and value of lead byte's most significant bits, by length: |
| 276 | static const unsigned char s_leadMarkerMask[4] = { 0x80, 0xE0, 0xF0, 0xF8 }; |
| 277 | static const unsigned char s_leadMarkerVal[4] = { 0x00, 0xC0, 0xE0, 0xF0 }; |
| 278 | #endif |
| 279 | |
| 280 | // extract the lead byte's value bits: |
| 281 | wxASSERT_MSG( ((unsigned char)*i & s_leadMarkerMask[len-1]) == |
| 282 | s_leadMarkerVal[len-1], |
| 283 | wxT("invalid UTF-8 lead byte") ); |
| 284 | wxUniChar::value_type code = (unsigned char)*i & s_leadValueMask[len-1]; |
| 285 | |
| 286 | // all remaining bytes, if any, are handled in the same way regardless of |
| 287 | // sequence's length: |
| 288 | for ( ++i ; len > 1; --len, ++i ) |
| 289 | { |
| 290 | wxASSERT_MSG( ((unsigned char)*i & 0xC0) == 0x80, |
| 291 | wxT("invalid UTF-8 byte") ); |
| 292 | |
| 293 | code <<= 6; |
| 294 | code |= (unsigned char)*i & 0x3F; |
| 295 | } |
| 296 | |
| 297 | return wxUniChar(code); |
| 298 | } |
| 299 | |
| 300 | wxCharBuffer wxStringOperationsUtf8::EncodeNChars(size_t n, const wxUniChar& ch) |
| 301 | { |
| 302 | Utf8CharBuffer once(EncodeChar(ch)); |
| 303 | // the IncIter() table can be used to determine the length of ch's encoding: |
| 304 | size_t len = ms_utf8IterTable[(unsigned char)once.data[0]]; |
| 305 | |
| 306 | wxCharBuffer buf(n * len); |
| 307 | char *ptr = buf.data(); |
| 308 | for ( size_t i = 0; i < n; i++, ptr += len ) |
| 309 | { |
| 310 | memcpy(ptr, once.data, len); |
| 311 | } |
| 312 | |
| 313 | return buf; |
| 314 | } |
| 315 | |
| 316 | #endif // wxUSE_UNICODE_UTF8 |