]> git.saurik.com Git - wxWidgets.git/blame - src/common/ustring.cpp
Make code reading BMP files more robust.
[wxWidgets.git] / src / common / ustring.cpp
CommitLineData
9a6d1438
RR
1/////////////////////////////////////////////////////////////////////////////
2// Name: src/common/ustring.cpp
3// Purpose: wxUString class
4// Author: Robert Roebling
5// Created: 2008-07-25
2cce6635 6// RCS-ID: $Id$
9a6d1438
RR
7// Copyright: (c) 2008 Robert Roebling
8// Licence: wxWindows licence
9///////////////////////////////////////////////////////////////////////////////
10
11// For compilers that support precompilation, includes "wx.h".
12#include "wx/wxprec.h"
13
14#ifdef __BORLANDC__
15 #pragma hdrstop
16#endif
17
a99bcb5e
PC
18#include "wx/ustring.h"
19
9a6d1438 20#ifndef WX_PRECOMP
a99bcb5e 21 #include "wx/crt.h"
9a6d1438
RR
22 #include "wx/log.h"
23#endif
24
9a6d1438
RR
25wxUString &wxUString::assignFromAscii( const char *str )
26{
27 size_type len = wxStrlen( str );
5c69ef61 28
9a6d1438
RR
29 wxU32CharBuffer buffer( len );
30 wxChar32 *ptr = buffer.data();
5c69ef61 31
9a6d1438
RR
32 size_type i;
33 for (i = 0; i < len; i++)
34 {
35 *ptr = *str;
36 ptr++;
37 str++;
38 }
5c69ef61 39
9a6d1438
RR
40 return assign( buffer );
41}
42
43wxUString &wxUString::assignFromAscii( const char *str, size_type n )
44{
45 size_type len = 0;
46 const char *s = str;
47 while (len < n && *s)
48 {
49 len++;
50 s++;
51 }
5c69ef61 52
9a6d1438
RR
53 wxU32CharBuffer buffer( len );
54 wxChar32 *ptr = buffer.data();
5c69ef61 55
9a6d1438
RR
56 size_type i;
57 for (i = 0; i < len; i++)
58 {
59 *ptr = *str;
60 ptr++;
61 str++;
62 }
5c69ef61 63
9a6d1438
RR
64 return *this;
65}
66
67// ----------------------------------------------------------------------------
68// UTF-8
69// ----------------------------------------------------------------------------
70
9a6d1438
RR
// Length in bytes of a UTF-8 sequence, indexed by the value of its first
// byte. A zero entry marks a byte which can't start a sequence: continuation
// bytes (80..BF), the overlong lead bytes C0 and C1, and F5..FF, i.e. leads of
// 5- or 6-byte sequences and of code points above U+10FFFF (see RFC 3629).
const unsigned char tableUtf8Lengths[256] = {
    /* 00 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 10 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 20 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 30 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 40 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 50 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 60 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 70 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

    /* 80 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 90 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* A0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    /* C0 */ 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* D0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,

    /* E0 */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,

    /* F0 */ 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
105
106wxUString &wxUString::assignFromUTF8( const char *str )
107{
108 if (!str)
109 return assign( wxUString() );
5c69ef61 110
9a6d1438
RR
111 size_type ucs4_len = 0;
112 const char *p = str;
113 while (*p)
114 {
115 unsigned char c = *p;
116 size_type len = tableUtf8Lengths[c];
117 if (!len)
118 return assign( wxUString() ); // don't try to convert invalid UTF-8
119 ucs4_len++;
120 p += len;
121 }
122
123 wxU32CharBuffer buffer( ucs4_len );
124 wxChar32 *out = buffer.data();
5c69ef61 125
9a6d1438
RR
126 p = str;
127 while (*p)
128 {
129 unsigned char c = *p;
130 if (c < 0x80)
131 {
132 *out = c;
133 p++;
134 }
135 else
136 {
137 size_type len = tableUtf8Lengths[c]; // len == 0 is caught above
138
139 // Char. number range | UTF-8 octet sequence
140 // (hexadecimal) | (binary)
141 // ----------------------+----------------------------------------
142 // 0000 0000 - 0000 007F | 0xxxxxxx
143 // 0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
144 // 0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
145 // 0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
146 //
147 // Code point value is stored in bits marked with 'x',
148 // lowest-order bit of the value on the right side in the diagram
149 // above. (from RFC 3629)
150
151 // mask to extract lead byte's value ('x' bits above), by sequence
152 // length:
153 static const unsigned char leadValueMask[] = { 0x7F, 0x1F, 0x0F, 0x07 };
154
155 // mask and value of lead byte's most significant bits, by length:
156 static const unsigned char leadMarkerMask[] = { 0x80, 0xE0, 0xF0, 0xF8 };
157 static const unsigned char leadMarkerVal[] = { 0x00, 0xC0, 0xE0, 0xF0 };
158
159 len--; // it's more convenient to work with 0-based length here
160
161 // extract the lead byte's value bits:
162 if ( (c & leadMarkerMask[len]) != leadMarkerVal[len] )
163 break;
164
165 wxChar32 code = c & leadValueMask[len];
166
167 // all remaining bytes, if any, are handled in the same way
168 // regardless of sequence's length:
169 for ( ; len; --len )
170 {
171 c = *++p;
172 if ( (c & 0xC0) != 0x80 )
173 return assign( wxUString() ); // don't try to convert invalid UTF-8
174
175 code <<= 6;
176 code |= c & 0x3F;
177 }
5c69ef61 178
9a6d1438
RR
179 *out = code;
180 p++;
181 }
182 out++;
183 }
184
185 return assign( buffer.data() );
186}
187
188wxUString &wxUString::assignFromUTF8( const char *str, size_type n )
189{
190 if (!str)
191 return assign( wxUString() );
5c69ef61 192
9a6d1438
RR
193 size_type ucs4_len = 0;
194 size_type utf8_pos = 0;
195 const char *p = str;
196 while (*p)
197 {
198 unsigned char c = *p;
199 size_type len = tableUtf8Lengths[c];
200 if (!len)
201 return assign( wxUString() ); // don't try to convert invalid UTF-8
202 if (utf8_pos + len > n)
203 break;
204 utf8_pos += len;
205 ucs4_len ++;
206 p += len;
207 }
5c69ef61 208
9a6d1438
RR
209 wxU32CharBuffer buffer( ucs4_len );
210 wxChar32 *out = buffer.data();
5c69ef61 211
9a6d1438
RR
212 utf8_pos = 0;
213 p = str;
214 while (*p)
215 {
216 unsigned char c = *p;
217 if (c < 0x80)
218 {
219 if (utf8_pos + 1 > n)
220 break;
221 utf8_pos++;
5c69ef61 222
9a6d1438
RR
223 *out = c;
224 p++;
225 }
226 else
227 {
228 size_type len = tableUtf8Lengths[c]; // len == 0 is caught above
229 if (utf8_pos + len > n)
230 break;
231 utf8_pos += len;
232
233 // Char. number range | UTF-8 octet sequence
234 // (hexadecimal) | (binary)
235 // ----------------------+----------------------------------------
236 // 0000 0000 - 0000 007F | 0xxxxxxx
237 // 0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
238 // 0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
239 // 0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
240 //
241 // Code point value is stored in bits marked with 'x',
242 // lowest-order bit of the value on the right side in the diagram
243 // above. (from RFC 3629)
244
245 // mask to extract lead byte's value ('x' bits above), by sequence
246 // length:
247 static const unsigned char leadValueMask[] = { 0x7F, 0x1F, 0x0F, 0x07 };
248
249 // mask and value of lead byte's most significant bits, by length:
250 static const unsigned char leadMarkerMask[] = { 0x80, 0xE0, 0xF0, 0xF8 };
251 static const unsigned char leadMarkerVal[] = { 0x00, 0xC0, 0xE0, 0xF0 };
252
253 len--; // it's more convenient to work with 0-based length here
254
255 // extract the lead byte's value bits:
256 if ( (c & leadMarkerMask[len]) != leadMarkerVal[len] )
257 break;
258
259 wxChar32 code = c & leadValueMask[len];
260
261 // all remaining bytes, if any, are handled in the same way
262 // regardless of sequence's length:
263 for ( ; len; --len )
264 {
265 c = *++p;
266 if ( (c & 0xC0) != 0x80 )
267 return assign( wxUString() ); // don't try to convert invalid UTF-8
268
269 code <<= 6;
270 code |= c & 0x3F;
271 }
5c69ef61 272
9a6d1438
RR
273 *out = code;
274 p++;
275 }
276 out++;
277 }
5c69ef61 278
9a6d1438
RR
279 *out = 0;
280
281 return assign( buffer.data() );
282}
283
284wxUString &wxUString::assignFromUTF16( const wxChar16* str, size_type n )
285{
286 if (!str)
287 return assign( wxUString() );
5c69ef61 288
9a6d1438
RR
289 size_type ucs4_len = 0;
290 size_type utf16_pos = 0;
291 const wxChar16 *p = str;
292 while (*p)
293 {
294 size_type len;
295 if ((*p < 0xd800) || (*p > 0xdfff))
296 {
297 len = 1;
298 }
299 else if ((p[1] < 0xdc00) || (p[1] > 0xdfff))
300 {
301 return assign( wxUString() ); // don't try to convert invalid UTF-16
302 }
303 else
304 {
305 len = 2;
306 }
5c69ef61 307
9a6d1438
RR
308 if (utf16_pos + len > n)
309 break;
5c69ef61 310
9a6d1438
RR
311 ucs4_len++;
312 p += len;
313 utf16_pos += len;
314 }
315
316 wxU32CharBuffer buffer( ucs4_len );
317 wxChar32 *out = buffer.data();
318
319 utf16_pos = 0;
5c69ef61 320
9a6d1438
RR
321 p = str;
322 while (*p)
323 {
324 if ((*p < 0xd800) || (*p > 0xdfff))
325 {
326 if (utf16_pos + 1 > n)
327 break;
5c69ef61 328
9a6d1438
RR
329 *out = *p;
330 p++;
331 utf16_pos++;
332 }
333 else
334 {
335 if (utf16_pos + 2 > n)
336 break;
5c69ef61 337
9a6d1438
RR
338 *out = ((p[0] - 0xd7c0) << 10) + (p[1] - 0xdc00);
339 p += 2;
340 utf16_pos += 2;
341 }
342 out++;
343 }
5c69ef61 344
9a6d1438
RR
345 return assign( buffer.data() );
346}
347
348wxUString &wxUString::assignFromUTF16( const wxChar16* str )
349{
350 if (!str)
351 return assign( wxUString() );
5c69ef61 352
9a6d1438
RR
353 size_type ucs4_len = 0;
354 const wxChar16 *p = str;
355 while (*p)
356 {
357 size_type len;
358 if ((*p < 0xd800) || (*p > 0xdfff))
359 {
360 len = 1;
361 }
362 else if ((p[1] < 0xdc00) || (p[1] > 0xdfff))
363 {
364 return assign( wxUString() ); // don't try to convert invalid UTF-16
365 }
366 else
367 {
368 len = 2;
369 }
5c69ef61 370
9a6d1438
RR
371 ucs4_len++;
372 p += len;
373 }
374
375 wxU32CharBuffer buffer( ucs4_len );
376 wxChar32 *out = buffer.data();
5c69ef61 377
9a6d1438
RR
378 p = str;
379 while (*p)
380 {
381 if ((*p < 0xd800) || (*p > 0xdfff))
382 {
383 *out = *p;
384 p++;
385 }
386 else
387 {
388 *out = ((p[0] - 0xd7c0) << 10) + (p[1] - 0xdc00);
389 p += 2;
390 }
391 out++;
392 }
5c69ef61 393
9a6d1438
RR
394 return assign( buffer.data() );
395}
396
397wxUString &wxUString::assignFromCString( const char* str )
398{
399 if (!str)
400 return assign( wxUString() );
5c69ef61 401
de4983f3 402 wxScopedWCharBuffer buffer = wxConvLibc.cMB2WC( str );
5c69ef61 403
9a6d1438
RR
404 return assign( buffer );
405}
406
407wxUString &wxUString::assignFromCString( const char* str, const wxMBConv &conv )
408{
409 if (!str)
410 return assign( wxUString() );
5c69ef61 411
de4983f3 412 wxScopedWCharBuffer buffer = conv.cMB2WC( str );
5c69ef61 413
9a6d1438
RR
414 return assign( buffer );
415}
416
de4983f3 417wxScopedCharBuffer wxUString::utf8_str() const
9a6d1438
RR
418{
419 size_type utf8_length = 0;
420 const wxChar32 *ptr = data();
5c69ef61 421
9a6d1438
RR
422 while (*ptr)
423 {
424 wxChar32 code = *ptr;
425 ptr++;
5c69ef61 426
9a6d1438
RR
427 if ( code <= 0x7F )
428 {
429 utf8_length++;
430 }
431 else if ( code <= 0x07FF )
432 {
433 utf8_length += 2;
434 }
435 else if ( code < 0xFFFF )
436 {
437 utf8_length += 3;
438 }
439 else if ( code <= 0x10FFFF )
440 {
441 utf8_length += 4;
442 }
443 else
444 {
445 // invalid range, skip
446 }
447 }
5c69ef61 448
9a6d1438 449 wxCharBuffer result( utf8_length );
5c69ef61 450
9a6d1438 451 char *out = result.data();
5c69ef61 452
9a6d1438
RR
453 ptr = data();
454 while (*ptr)
455 {
456 wxChar32 code = *ptr;
457 ptr++;
5c69ef61 458
9a6d1438
RR
459 if ( code <= 0x7F )
460 {
461 out[0] = (char)code;
462 out++;
463 }
464 else if ( code <= 0x07FF )
465 {
466 out[1] = 0x80 | (code & 0x3F); code >>= 6;
467 out[0] = 0xC0 | code;
468 out += 2;
469 }
470 else if ( code < 0xFFFF )
471 {
472 out[2] = 0x80 | (code & 0x3F); code >>= 6;
473 out[1] = 0x80 | (code & 0x3F); code >>= 6;
474 out[0] = 0xE0 | code;
475 out += 3;
476 }
477 else if ( code <= 0x10FFFF )
478 {
479 out[3] = 0x80 | (code & 0x3F); code >>= 6;
480 out[2] = 0x80 | (code & 0x3F); code >>= 6;
481 out[1] = 0x80 | (code & 0x3F); code >>= 6;
482 out[0] = 0xF0 | code;
483 out += 4;
484 }
485 else
486 {
487 // invalid range, skip
488 }
489 }
490
9a6d1438
RR
491 return result;
492}
5c69ef61 493
de4983f3 494wxScopedU16CharBuffer wxUString::utf16_str() const
9a6d1438
RR
495{
496 size_type utf16_length = 0;
497 const wxChar32 *ptr = data();
5c69ef61 498
9a6d1438
RR
499 while (*ptr)
500 {
501 wxChar32 code = *ptr;
502 ptr++;
5c69ef61 503
9a6d1438 504 // TODO: error range checks
5c69ef61 505
9a6d1438
RR
506 if (code < 0x10000)
507 utf16_length++;
508 else
509 utf16_length += 2;
510 }
5c69ef61 511
9a6d1438
RR
512 wxU16CharBuffer result( utf16_length );
513 wxChar16 *out = result.data();
5c69ef61 514
9a6d1438 515 ptr = data();
5c69ef61 516
9a6d1438
RR
517 while (*ptr)
518 {
519 wxChar32 code = *ptr;
520 ptr++;
5c69ef61 521
9a6d1438 522 // TODO: error range checks
5c69ef61 523
9a6d1438
RR
524 if (code < 0x10000)
525 {
526 out[0] = code;
527 out++;
528 }
529 else
530 {
531 out[0] = (code - 0x10000) / 0x400 + 0xd800;
532 out[1] = (code - 0x10000) % 0x400 + 0xdc00;
533 out += 2;
534 }
535 }
5c69ef61
VZ
536
537 return result;
9a6d1438 538}