]> git.saurik.com Git - wxWidgets.git/blob - src/common/ustring.cpp
skip apple options
[wxWidgets.git] / src / common / ustring.cpp
1 /////////////////////////////////////////////////////////////////////////////
2 // Name: src/common/ustring.cpp
3 // Purpose: wxUString class
4 // Author: Robert Roebling
5 // Created: 2008-07-25
6 // Copyright: (c) 2008 Robert Roebling
7 // Licence: wxWindows licence
8 ///////////////////////////////////////////////////////////////////////////////
9
10 // For compilers that support precompilation, includes "wx.h".
11 #include "wx/wxprec.h"
12
13 #ifdef __BORLANDC__
14 #pragma hdrstop
15 #endif
16
17 #include "wx/ustring.h"
18
19 #ifndef WX_PRECOMP
20 #include "wx/crt.h"
21 #include "wx/log.h"
22 #endif
23
24 wxUString &wxUString::assignFromAscii( const char *str )
25 {
26 size_type len = wxStrlen( str );
27
28 wxU32CharBuffer buffer( len );
29 wxChar32 *ptr = buffer.data();
30
31 size_type i;
32 for (i = 0; i < len; i++)
33 {
34 *ptr = *str;
35 ptr++;
36 str++;
37 }
38
39 return assign( buffer );
40 }
41
42 wxUString &wxUString::assignFromAscii( const char *str, size_type n )
43 {
44 size_type len = 0;
45 const char *s = str;
46 while (len < n && *s)
47 {
48 len++;
49 s++;
50 }
51
52 wxU32CharBuffer buffer( len );
53 wxChar32 *ptr = buffer.data();
54
55 size_type i;
56 for (i = 0; i < len; i++)
57 {
58 *ptr = *str;
59 ptr++;
60 str++;
61 }
62
63 return *this;
64 }
65
66 // ----------------------------------------------------------------------------
67 // UTF-8
68 // ----------------------------------------------------------------------------
69
// Lookup table indexed by the first byte of a UTF-8 sequence: each entry is
// the total length in bytes of the sequence started by that byte, or 0 if
// the byte is not a valid lead byte.
const unsigned char tableUtf8Lengths[256] = {
    // 00..7F: single-byte sequences (ASCII)
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 00..0F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 10..1F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 20..2F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 30..3F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 40..4F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 50..5F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 60..6F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 70..7F

    // 80..BF: invalid as the first byte of a sequence
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 80..8F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 90..9F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // A0..AF
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // B0..BF

    // C0 and C1 are invalid too; C2..DF start two-byte sequences
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // C0..CF
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // D0..DF

    // E0..EF: three-byte sequences
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // E0..EF

    // F0..F4 start four-byte sequences; F5..FF are invalid (5- or 6-byte
    // sequences and sequences for code points above U+10FFFF, as restricted
    // by RFC 3629)
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  // F0..FF
};
104
105 wxUString &wxUString::assignFromUTF8( const char *str )
106 {
107 if (!str)
108 return assign( wxUString() );
109
110 size_type ucs4_len = 0;
111 const char *p = str;
112 while (*p)
113 {
114 unsigned char c = *p;
115 size_type len = tableUtf8Lengths[c];
116 if (!len)
117 return assign( wxUString() ); // don't try to convert invalid UTF-8
118 ucs4_len++;
119 p += len;
120 }
121
122 wxU32CharBuffer buffer( ucs4_len );
123 wxChar32 *out = buffer.data();
124
125 p = str;
126 while (*p)
127 {
128 unsigned char c = *p;
129 if (c < 0x80)
130 {
131 *out = c;
132 p++;
133 }
134 else
135 {
136 size_type len = tableUtf8Lengths[c]; // len == 0 is caught above
137
138 // Char. number range | UTF-8 octet sequence
139 // (hexadecimal) | (binary)
140 // ----------------------+----------------------------------------
141 // 0000 0000 - 0000 007F | 0xxxxxxx
142 // 0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
143 // 0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
144 // 0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
145 //
146 // Code point value is stored in bits marked with 'x',
147 // lowest-order bit of the value on the right side in the diagram
148 // above. (from RFC 3629)
149
150 // mask to extract lead byte's value ('x' bits above), by sequence
151 // length:
152 static const unsigned char leadValueMask[] = { 0x7F, 0x1F, 0x0F, 0x07 };
153
154 // mask and value of lead byte's most significant bits, by length:
155 static const unsigned char leadMarkerMask[] = { 0x80, 0xE0, 0xF0, 0xF8 };
156 static const unsigned char leadMarkerVal[] = { 0x00, 0xC0, 0xE0, 0xF0 };
157
158 len--; // it's more convenient to work with 0-based length here
159
160 // extract the lead byte's value bits:
161 if ( (c & leadMarkerMask[len]) != leadMarkerVal[len] )
162 break;
163
164 wxChar32 code = c & leadValueMask[len];
165
166 // all remaining bytes, if any, are handled in the same way
167 // regardless of sequence's length:
168 for ( ; len; --len )
169 {
170 c = *++p;
171 if ( (c & 0xC0) != 0x80 )
172 return assign( wxUString() ); // don't try to convert invalid UTF-8
173
174 code <<= 6;
175 code |= c & 0x3F;
176 }
177
178 *out = code;
179 p++;
180 }
181 out++;
182 }
183
184 return assign( buffer.data() );
185 }
186
187 wxUString &wxUString::assignFromUTF8( const char *str, size_type n )
188 {
189 if (!str)
190 return assign( wxUString() );
191
192 size_type ucs4_len = 0;
193 size_type utf8_pos = 0;
194 const char *p = str;
195 while (*p)
196 {
197 unsigned char c = *p;
198 size_type len = tableUtf8Lengths[c];
199 if (!len)
200 return assign( wxUString() ); // don't try to convert invalid UTF-8
201 if (utf8_pos + len > n)
202 break;
203 utf8_pos += len;
204 ucs4_len ++;
205 p += len;
206 }
207
208 wxU32CharBuffer buffer( ucs4_len );
209 wxChar32 *out = buffer.data();
210
211 utf8_pos = 0;
212 p = str;
213 while (*p)
214 {
215 unsigned char c = *p;
216 if (c < 0x80)
217 {
218 if (utf8_pos + 1 > n)
219 break;
220 utf8_pos++;
221
222 *out = c;
223 p++;
224 }
225 else
226 {
227 size_type len = tableUtf8Lengths[c]; // len == 0 is caught above
228 if (utf8_pos + len > n)
229 break;
230 utf8_pos += len;
231
232 // Char. number range | UTF-8 octet sequence
233 // (hexadecimal) | (binary)
234 // ----------------------+----------------------------------------
235 // 0000 0000 - 0000 007F | 0xxxxxxx
236 // 0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
237 // 0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
238 // 0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
239 //
240 // Code point value is stored in bits marked with 'x',
241 // lowest-order bit of the value on the right side in the diagram
242 // above. (from RFC 3629)
243
244 // mask to extract lead byte's value ('x' bits above), by sequence
245 // length:
246 static const unsigned char leadValueMask[] = { 0x7F, 0x1F, 0x0F, 0x07 };
247
248 // mask and value of lead byte's most significant bits, by length:
249 static const unsigned char leadMarkerMask[] = { 0x80, 0xE0, 0xF0, 0xF8 };
250 static const unsigned char leadMarkerVal[] = { 0x00, 0xC0, 0xE0, 0xF0 };
251
252 len--; // it's more convenient to work with 0-based length here
253
254 // extract the lead byte's value bits:
255 if ( (c & leadMarkerMask[len]) != leadMarkerVal[len] )
256 break;
257
258 wxChar32 code = c & leadValueMask[len];
259
260 // all remaining bytes, if any, are handled in the same way
261 // regardless of sequence's length:
262 for ( ; len; --len )
263 {
264 c = *++p;
265 if ( (c & 0xC0) != 0x80 )
266 return assign( wxUString() ); // don't try to convert invalid UTF-8
267
268 code <<= 6;
269 code |= c & 0x3F;
270 }
271
272 *out = code;
273 p++;
274 }
275 out++;
276 }
277
278 *out = 0;
279
280 return assign( buffer.data() );
281 }
282
283 wxUString &wxUString::assignFromUTF16( const wxChar16* str, size_type n )
284 {
285 if (!str)
286 return assign( wxUString() );
287
288 size_type ucs4_len = 0;
289 size_type utf16_pos = 0;
290 const wxChar16 *p = str;
291 while (*p)
292 {
293 size_type len;
294 if ((*p < 0xd800) || (*p > 0xdfff))
295 {
296 len = 1;
297 }
298 else if ((p[1] < 0xdc00) || (p[1] > 0xdfff))
299 {
300 return assign( wxUString() ); // don't try to convert invalid UTF-16
301 }
302 else
303 {
304 len = 2;
305 }
306
307 if (utf16_pos + len > n)
308 break;
309
310 ucs4_len++;
311 p += len;
312 utf16_pos += len;
313 }
314
315 wxU32CharBuffer buffer( ucs4_len );
316 wxChar32 *out = buffer.data();
317
318 utf16_pos = 0;
319
320 p = str;
321 while (*p)
322 {
323 if ((*p < 0xd800) || (*p > 0xdfff))
324 {
325 if (utf16_pos + 1 > n)
326 break;
327
328 *out = *p;
329 p++;
330 utf16_pos++;
331 }
332 else
333 {
334 if (utf16_pos + 2 > n)
335 break;
336
337 *out = ((p[0] - 0xd7c0) << 10) + (p[1] - 0xdc00);
338 p += 2;
339 utf16_pos += 2;
340 }
341 out++;
342 }
343
344 return assign( buffer.data() );
345 }
346
347 wxUString &wxUString::assignFromUTF16( const wxChar16* str )
348 {
349 if (!str)
350 return assign( wxUString() );
351
352 size_type ucs4_len = 0;
353 const wxChar16 *p = str;
354 while (*p)
355 {
356 size_type len;
357 if ((*p < 0xd800) || (*p > 0xdfff))
358 {
359 len = 1;
360 }
361 else if ((p[1] < 0xdc00) || (p[1] > 0xdfff))
362 {
363 return assign( wxUString() ); // don't try to convert invalid UTF-16
364 }
365 else
366 {
367 len = 2;
368 }
369
370 ucs4_len++;
371 p += len;
372 }
373
374 wxU32CharBuffer buffer( ucs4_len );
375 wxChar32 *out = buffer.data();
376
377 p = str;
378 while (*p)
379 {
380 if ((*p < 0xd800) || (*p > 0xdfff))
381 {
382 *out = *p;
383 p++;
384 }
385 else
386 {
387 *out = ((p[0] - 0xd7c0) << 10) + (p[1] - 0xdc00);
388 p += 2;
389 }
390 out++;
391 }
392
393 return assign( buffer.data() );
394 }
395
396 wxUString &wxUString::assignFromCString( const char* str )
397 {
398 if (!str)
399 return assign( wxUString() );
400
401 wxScopedWCharBuffer buffer = wxConvLibc.cMB2WC( str );
402
403 return assign( buffer );
404 }
405
406 wxUString &wxUString::assignFromCString( const char* str, const wxMBConv &conv )
407 {
408 if (!str)
409 return assign( wxUString() );
410
411 wxScopedWCharBuffer buffer = conv.cMB2WC( str );
412
413 return assign( buffer );
414 }
415
416 wxScopedCharBuffer wxUString::utf8_str() const
417 {
418 size_type utf8_length = 0;
419 const wxChar32 *ptr = data();
420
421 while (*ptr)
422 {
423 wxChar32 code = *ptr;
424 ptr++;
425
426 if ( code <= 0x7F )
427 {
428 utf8_length++;
429 }
430 else if ( code <= 0x07FF )
431 {
432 utf8_length += 2;
433 }
434 else if ( code < 0xFFFF )
435 {
436 utf8_length += 3;
437 }
438 else if ( code <= 0x10FFFF )
439 {
440 utf8_length += 4;
441 }
442 else
443 {
444 // invalid range, skip
445 }
446 }
447
448 wxCharBuffer result( utf8_length );
449
450 char *out = result.data();
451
452 ptr = data();
453 while (*ptr)
454 {
455 wxChar32 code = *ptr;
456 ptr++;
457
458 if ( code <= 0x7F )
459 {
460 out[0] = (char)code;
461 out++;
462 }
463 else if ( code <= 0x07FF )
464 {
465 out[1] = 0x80 | (code & 0x3F); code >>= 6;
466 out[0] = 0xC0 | code;
467 out += 2;
468 }
469 else if ( code < 0xFFFF )
470 {
471 out[2] = 0x80 | (code & 0x3F); code >>= 6;
472 out[1] = 0x80 | (code & 0x3F); code >>= 6;
473 out[0] = 0xE0 | code;
474 out += 3;
475 }
476 else if ( code <= 0x10FFFF )
477 {
478 out[3] = 0x80 | (code & 0x3F); code >>= 6;
479 out[2] = 0x80 | (code & 0x3F); code >>= 6;
480 out[1] = 0x80 | (code & 0x3F); code >>= 6;
481 out[0] = 0xF0 | code;
482 out += 4;
483 }
484 else
485 {
486 // invalid range, skip
487 }
488 }
489
490 return result;
491 }
492
493 wxScopedU16CharBuffer wxUString::utf16_str() const
494 {
495 size_type utf16_length = 0;
496 const wxChar32 *ptr = data();
497
498 while (*ptr)
499 {
500 wxChar32 code = *ptr;
501 ptr++;
502
503 // TODO: error range checks
504
505 if (code < 0x10000)
506 utf16_length++;
507 else
508 utf16_length += 2;
509 }
510
511 wxU16CharBuffer result( utf16_length );
512 wxChar16 *out = result.data();
513
514 ptr = data();
515
516 while (*ptr)
517 {
518 wxChar32 code = *ptr;
519 ptr++;
520
521 // TODO: error range checks
522
523 if (code < 0x10000)
524 {
525 out[0] = code;
526 out++;
527 }
528 else
529 {
530 out[0] = (code - 0x10000) / 0x400 + 0xd800;
531 out[1] = (code - 0x10000) % 0x400 + 0xdc00;
532 out += 2;
533 }
534 }
535
536 return result;
537 }