]> git.saurik.com Git - wxWidgets.git/blame - src/common/ustring.cpp
Fix assorted typos in comments and other non-code.
[wxWidgets.git] / src / common / ustring.cpp
CommitLineData
9a6d1438
RR
1/////////////////////////////////////////////////////////////////////////////
2// Name: src/common/ustring.cpp
3// Purpose: wxUString class
4// Author: Robert Roebling
5// Created: 2008-07-25
9a6d1438
RR
6// Copyright: (c) 2008 Robert Roebling
7// Licence: wxWindows licence
8///////////////////////////////////////////////////////////////////////////////
9
10// For compilers that support precompilation, includes "wx.h".
11#include "wx/wxprec.h"
12
13#ifdef __BORLANDC__
14 #pragma hdrstop
15#endif
16
a99bcb5e
PC
17#include "wx/ustring.h"
18
9a6d1438 19#ifndef WX_PRECOMP
a99bcb5e 20 #include "wx/crt.h"
9a6d1438
RR
21 #include "wx/log.h"
22#endif
23
9a6d1438
RR
24wxUString &wxUString::assignFromAscii( const char *str )
25{
26 size_type len = wxStrlen( str );
5c69ef61 27
9a6d1438
RR
28 wxU32CharBuffer buffer( len );
29 wxChar32 *ptr = buffer.data();
5c69ef61 30
9a6d1438
RR
31 size_type i;
32 for (i = 0; i < len; i++)
33 {
34 *ptr = *str;
35 ptr++;
36 str++;
37 }
5c69ef61 38
9a6d1438
RR
39 return assign( buffer );
40}
41
42wxUString &wxUString::assignFromAscii( const char *str, size_type n )
43{
44 size_type len = 0;
45 const char *s = str;
46 while (len < n && *s)
47 {
48 len++;
49 s++;
50 }
5c69ef61 51
9a6d1438
RR
52 wxU32CharBuffer buffer( len );
53 wxChar32 *ptr = buffer.data();
5c69ef61 54
9a6d1438
RR
55 size_type i;
56 for (i = 0; i < len; i++)
57 {
58 *ptr = *str;
59 ptr++;
60 str++;
61 }
5c69ef61 62
9a6d1438
RR
63 return *this;
64}
65
66// ----------------------------------------------------------------------------
67// UTF-8
68// ----------------------------------------------------------------------------
69
9a6d1438
RR
// Lookup table indexed by the first byte of a UTF-8 sequence: the entry is
// the total length of the sequence in bytes, or 0 if this byte can't start
// a sequence at all.
const unsigned char tableUtf8Lengths[256] = {
    // 00..7F: ASCII characters, encoded as a single byte
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

    // 80..BF: not valid as the first byte of a sequence
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    // C0, C1: invalid as well
    0, 0,

    // C2..DF: lead bytes of two-byte sequences
          2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,

    // E0..EF: lead bytes of three-byte sequences
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,

    // F0..F4: lead bytes of four-byte sequences
    4, 4, 4, 4, 4,

    // F5..FF: invalid again (5- or 6-byte sequences and sequences for code
    // points above U+10FFFF, as restricted by RFC 3629)
                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
104
105wxUString &wxUString::assignFromUTF8( const char *str )
106{
107 if (!str)
108 return assign( wxUString() );
5c69ef61 109
9a6d1438
RR
110 size_type ucs4_len = 0;
111 const char *p = str;
112 while (*p)
113 {
114 unsigned char c = *p;
115 size_type len = tableUtf8Lengths[c];
116 if (!len)
117 return assign( wxUString() ); // don't try to convert invalid UTF-8
118 ucs4_len++;
119 p += len;
120 }
121
122 wxU32CharBuffer buffer( ucs4_len );
123 wxChar32 *out = buffer.data();
5c69ef61 124
9a6d1438
RR
125 p = str;
126 while (*p)
127 {
128 unsigned char c = *p;
129 if (c < 0x80)
130 {
131 *out = c;
132 p++;
133 }
134 else
135 {
136 size_type len = tableUtf8Lengths[c]; // len == 0 is caught above
137
138 // Char. number range | UTF-8 octet sequence
139 // (hexadecimal) | (binary)
140 // ----------------------+----------------------------------------
141 // 0000 0000 - 0000 007F | 0xxxxxxx
142 // 0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
143 // 0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
144 // 0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
145 //
146 // Code point value is stored in bits marked with 'x',
147 // lowest-order bit of the value on the right side in the diagram
148 // above. (from RFC 3629)
149
150 // mask to extract lead byte's value ('x' bits above), by sequence
151 // length:
152 static const unsigned char leadValueMask[] = { 0x7F, 0x1F, 0x0F, 0x07 };
153
154 // mask and value of lead byte's most significant bits, by length:
155 static const unsigned char leadMarkerMask[] = { 0x80, 0xE0, 0xF0, 0xF8 };
156 static const unsigned char leadMarkerVal[] = { 0x00, 0xC0, 0xE0, 0xF0 };
157
158 len--; // it's more convenient to work with 0-based length here
159
160 // extract the lead byte's value bits:
161 if ( (c & leadMarkerMask[len]) != leadMarkerVal[len] )
162 break;
163
164 wxChar32 code = c & leadValueMask[len];
165
166 // all remaining bytes, if any, are handled in the same way
167 // regardless of sequence's length:
168 for ( ; len; --len )
169 {
170 c = *++p;
171 if ( (c & 0xC0) != 0x80 )
172 return assign( wxUString() ); // don't try to convert invalid UTF-8
173
174 code <<= 6;
175 code |= c & 0x3F;
176 }
5c69ef61 177
9a6d1438
RR
178 *out = code;
179 p++;
180 }
181 out++;
182 }
183
184 return assign( buffer.data() );
185}
186
187wxUString &wxUString::assignFromUTF8( const char *str, size_type n )
188{
189 if (!str)
190 return assign( wxUString() );
5c69ef61 191
9a6d1438
RR
192 size_type ucs4_len = 0;
193 size_type utf8_pos = 0;
194 const char *p = str;
195 while (*p)
196 {
197 unsigned char c = *p;
198 size_type len = tableUtf8Lengths[c];
199 if (!len)
200 return assign( wxUString() ); // don't try to convert invalid UTF-8
201 if (utf8_pos + len > n)
202 break;
203 utf8_pos += len;
204 ucs4_len ++;
205 p += len;
206 }
5c69ef61 207
9a6d1438
RR
208 wxU32CharBuffer buffer( ucs4_len );
209 wxChar32 *out = buffer.data();
5c69ef61 210
9a6d1438
RR
211 utf8_pos = 0;
212 p = str;
213 while (*p)
214 {
215 unsigned char c = *p;
216 if (c < 0x80)
217 {
218 if (utf8_pos + 1 > n)
219 break;
220 utf8_pos++;
5c69ef61 221
9a6d1438
RR
222 *out = c;
223 p++;
224 }
225 else
226 {
227 size_type len = tableUtf8Lengths[c]; // len == 0 is caught above
228 if (utf8_pos + len > n)
229 break;
230 utf8_pos += len;
231
232 // Char. number range | UTF-8 octet sequence
233 // (hexadecimal) | (binary)
234 // ----------------------+----------------------------------------
235 // 0000 0000 - 0000 007F | 0xxxxxxx
236 // 0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
237 // 0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
238 // 0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
239 //
240 // Code point value is stored in bits marked with 'x',
241 // lowest-order bit of the value on the right side in the diagram
242 // above. (from RFC 3629)
243
244 // mask to extract lead byte's value ('x' bits above), by sequence
245 // length:
246 static const unsigned char leadValueMask[] = { 0x7F, 0x1F, 0x0F, 0x07 };
247
248 // mask and value of lead byte's most significant bits, by length:
249 static const unsigned char leadMarkerMask[] = { 0x80, 0xE0, 0xF0, 0xF8 };
250 static const unsigned char leadMarkerVal[] = { 0x00, 0xC0, 0xE0, 0xF0 };
251
252 len--; // it's more convenient to work with 0-based length here
253
254 // extract the lead byte's value bits:
255 if ( (c & leadMarkerMask[len]) != leadMarkerVal[len] )
256 break;
257
258 wxChar32 code = c & leadValueMask[len];
259
260 // all remaining bytes, if any, are handled in the same way
261 // regardless of sequence's length:
262 for ( ; len; --len )
263 {
264 c = *++p;
265 if ( (c & 0xC0) != 0x80 )
266 return assign( wxUString() ); // don't try to convert invalid UTF-8
267
268 code <<= 6;
269 code |= c & 0x3F;
270 }
5c69ef61 271
9a6d1438
RR
272 *out = code;
273 p++;
274 }
275 out++;
276 }
5c69ef61 277
9a6d1438
RR
278 *out = 0;
279
280 return assign( buffer.data() );
281}
282
283wxUString &wxUString::assignFromUTF16( const wxChar16* str, size_type n )
284{
285 if (!str)
286 return assign( wxUString() );
5c69ef61 287
9a6d1438
RR
288 size_type ucs4_len = 0;
289 size_type utf16_pos = 0;
290 const wxChar16 *p = str;
291 while (*p)
292 {
293 size_type len;
294 if ((*p < 0xd800) || (*p > 0xdfff))
295 {
296 len = 1;
297 }
298 else if ((p[1] < 0xdc00) || (p[1] > 0xdfff))
299 {
300 return assign( wxUString() ); // don't try to convert invalid UTF-16
301 }
302 else
303 {
304 len = 2;
305 }
5c69ef61 306
9a6d1438
RR
307 if (utf16_pos + len > n)
308 break;
5c69ef61 309
9a6d1438
RR
310 ucs4_len++;
311 p += len;
312 utf16_pos += len;
313 }
314
315 wxU32CharBuffer buffer( ucs4_len );
316 wxChar32 *out = buffer.data();
317
318 utf16_pos = 0;
5c69ef61 319
9a6d1438
RR
320 p = str;
321 while (*p)
322 {
323 if ((*p < 0xd800) || (*p > 0xdfff))
324 {
325 if (utf16_pos + 1 > n)
326 break;
5c69ef61 327
9a6d1438
RR
328 *out = *p;
329 p++;
330 utf16_pos++;
331 }
332 else
333 {
334 if (utf16_pos + 2 > n)
335 break;
5c69ef61 336
9a6d1438
RR
337 *out = ((p[0] - 0xd7c0) << 10) + (p[1] - 0xdc00);
338 p += 2;
339 utf16_pos += 2;
340 }
341 out++;
342 }
5c69ef61 343
9a6d1438
RR
344 return assign( buffer.data() );
345}
346
347wxUString &wxUString::assignFromUTF16( const wxChar16* str )
348{
349 if (!str)
350 return assign( wxUString() );
5c69ef61 351
9a6d1438
RR
352 size_type ucs4_len = 0;
353 const wxChar16 *p = str;
354 while (*p)
355 {
356 size_type len;
357 if ((*p < 0xd800) || (*p > 0xdfff))
358 {
359 len = 1;
360 }
361 else if ((p[1] < 0xdc00) || (p[1] > 0xdfff))
362 {
363 return assign( wxUString() ); // don't try to convert invalid UTF-16
364 }
365 else
366 {
367 len = 2;
368 }
5c69ef61 369
9a6d1438
RR
370 ucs4_len++;
371 p += len;
372 }
373
374 wxU32CharBuffer buffer( ucs4_len );
375 wxChar32 *out = buffer.data();
5c69ef61 376
9a6d1438
RR
377 p = str;
378 while (*p)
379 {
380 if ((*p < 0xd800) || (*p > 0xdfff))
381 {
382 *out = *p;
383 p++;
384 }
385 else
386 {
387 *out = ((p[0] - 0xd7c0) << 10) + (p[1] - 0xdc00);
388 p += 2;
389 }
390 out++;
391 }
5c69ef61 392
9a6d1438
RR
393 return assign( buffer.data() );
394}
395
396wxUString &wxUString::assignFromCString( const char* str )
397{
398 if (!str)
399 return assign( wxUString() );
5c69ef61 400
de4983f3 401 wxScopedWCharBuffer buffer = wxConvLibc.cMB2WC( str );
5c69ef61 402
9a6d1438
RR
403 return assign( buffer );
404}
405
406wxUString &wxUString::assignFromCString( const char* str, const wxMBConv &conv )
407{
408 if (!str)
409 return assign( wxUString() );
5c69ef61 410
de4983f3 411 wxScopedWCharBuffer buffer = conv.cMB2WC( str );
5c69ef61 412
9a6d1438
RR
413 return assign( buffer );
414}
415
de4983f3 416wxScopedCharBuffer wxUString::utf8_str() const
9a6d1438
RR
417{
418 size_type utf8_length = 0;
419 const wxChar32 *ptr = data();
5c69ef61 420
9a6d1438
RR
421 while (*ptr)
422 {
423 wxChar32 code = *ptr;
424 ptr++;
5c69ef61 425
9a6d1438
RR
426 if ( code <= 0x7F )
427 {
428 utf8_length++;
429 }
430 else if ( code <= 0x07FF )
431 {
432 utf8_length += 2;
433 }
434 else if ( code < 0xFFFF )
435 {
436 utf8_length += 3;
437 }
438 else if ( code <= 0x10FFFF )
439 {
440 utf8_length += 4;
441 }
442 else
443 {
444 // invalid range, skip
445 }
446 }
5c69ef61 447
9a6d1438 448 wxCharBuffer result( utf8_length );
5c69ef61 449
9a6d1438 450 char *out = result.data();
5c69ef61 451
9a6d1438
RR
452 ptr = data();
453 while (*ptr)
454 {
455 wxChar32 code = *ptr;
456 ptr++;
5c69ef61 457
9a6d1438
RR
458 if ( code <= 0x7F )
459 {
460 out[0] = (char)code;
461 out++;
462 }
463 else if ( code <= 0x07FF )
464 {
465 out[1] = 0x80 | (code & 0x3F); code >>= 6;
466 out[0] = 0xC0 | code;
467 out += 2;
468 }
469 else if ( code < 0xFFFF )
470 {
471 out[2] = 0x80 | (code & 0x3F); code >>= 6;
472 out[1] = 0x80 | (code & 0x3F); code >>= 6;
473 out[0] = 0xE0 | code;
474 out += 3;
475 }
476 else if ( code <= 0x10FFFF )
477 {
478 out[3] = 0x80 | (code & 0x3F); code >>= 6;
479 out[2] = 0x80 | (code & 0x3F); code >>= 6;
480 out[1] = 0x80 | (code & 0x3F); code >>= 6;
481 out[0] = 0xF0 | code;
482 out += 4;
483 }
484 else
485 {
486 // invalid range, skip
487 }
488 }
489
9a6d1438
RR
490 return result;
491}
5c69ef61 492
de4983f3 493wxScopedU16CharBuffer wxUString::utf16_str() const
9a6d1438
RR
494{
495 size_type utf16_length = 0;
496 const wxChar32 *ptr = data();
5c69ef61 497
9a6d1438
RR
498 while (*ptr)
499 {
500 wxChar32 code = *ptr;
501 ptr++;
5c69ef61 502
9a6d1438 503 // TODO: error range checks
5c69ef61 504
9a6d1438
RR
505 if (code < 0x10000)
506 utf16_length++;
507 else
508 utf16_length += 2;
509 }
5c69ef61 510
9a6d1438
RR
511 wxU16CharBuffer result( utf16_length );
512 wxChar16 *out = result.data();
5c69ef61 513
9a6d1438 514 ptr = data();
5c69ef61 515
9a6d1438
RR
516 while (*ptr)
517 {
518 wxChar32 code = *ptr;
519 ptr++;
5c69ef61 520
9a6d1438 521 // TODO: error range checks
5c69ef61 522
9a6d1438
RR
523 if (code < 0x10000)
524 {
525 out[0] = code;
526 out++;
527 }
528 else
529 {
530 out[0] = (code - 0x10000) / 0x400 + 0xd800;
531 out[1] = (code - 0x10000) % 0x400 + 0xdc00;
532 out += 2;
533 }
534 }
5c69ef61
VZ
535
536 return result;
9a6d1438 537}