]> git.saurik.com Git - wxWidgets.git/blob - src/common/ustring.cpp
a9b9241eb475551e2e43486503eaf6a1a64f5fc3
[wxWidgets.git] / src / common / ustring.cpp
1 /////////////////////////////////////////////////////////////////////////////
2 // Name: src/common/ustring.cpp
3 // Purpose: wxUString class
4 // Author: Robert Roebling
5 // Created: 2008-07-25
6 // RCS-ID: $Id:$
7 // Copyright: (c) 2008 Robert Roebling
8 // Licence: wxWindows licence
9 ///////////////////////////////////////////////////////////////////////////////
10
11 // For compilers that support precompilation, includes "wx.h".
12 #include "wx/wxprec.h"
13
14 #ifdef __BORLANDC__
15 #pragma hdrstop
16 #endif
17
18 #include "wx/ustring.h"
19
20 #ifndef WX_PRECOMP
21 #include "wx/crt.h"
22 #include "wx/log.h"
23 #endif
24
25 wxUString &wxUString::assignFromAscii( const char *str )
26 {
27 size_type len = wxStrlen( str );
28
29 wxU32CharBuffer buffer( len );
30 wxChar32 *ptr = buffer.data();
31
32 size_type i;
33 for (i = 0; i < len; i++)
34 {
35 *ptr = *str;
36 ptr++;
37 str++;
38 }
39
40 return assign( buffer );
41 }
42
43 wxUString &wxUString::assignFromAscii( const char *str, size_type n )
44 {
45 size_type len = 0;
46 const char *s = str;
47 while (len < n && *s)
48 {
49 len++;
50 s++;
51 }
52
53 wxU32CharBuffer buffer( len );
54 wxChar32 *ptr = buffer.data();
55
56 size_type i;
57 for (i = 0; i < len; i++)
58 {
59 *ptr = *str;
60 ptr++;
61 str++;
62 }
63
64 return *this;
65 }
66
67 // ----------------------------------------------------------------------------
68 // UTF-8
69 // ----------------------------------------------------------------------------
70
71 static const wxUint32 utf8_max[]=
72 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
73
// Lookup table indexed by the first byte of a UTF-8 sequence: the entry is
// the total length in bytes of the sequence that byte starts, or 0 when the
// byte cannot start a sequence.
const unsigned char tableUtf8Lengths[256] = {
    // 0x00..0x7F: ASCII, always a single byte
    1, 1, 1, 1, 1, 1, 1, 1,   1, 1, 1, 1, 1, 1, 1, 1,   // 0x00
    1, 1, 1, 1, 1, 1, 1, 1,   1, 1, 1, 1, 1, 1, 1, 1,   // 0x10
    1, 1, 1, 1, 1, 1, 1, 1,   1, 1, 1, 1, 1, 1, 1, 1,   // 0x20
    1, 1, 1, 1, 1, 1, 1, 1,   1, 1, 1, 1, 1, 1, 1, 1,   // 0x30
    1, 1, 1, 1, 1, 1, 1, 1,   1, 1, 1, 1, 1, 1, 1, 1,   // 0x40
    1, 1, 1, 1, 1, 1, 1, 1,   1, 1, 1, 1, 1, 1, 1, 1,   // 0x50
    1, 1, 1, 1, 1, 1, 1, 1,   1, 1, 1, 1, 1, 1, 1, 1,   // 0x60
    1, 1, 1, 1, 1, 1, 1, 1,   1, 1, 1, 1, 1, 1, 1, 1,   // 0x70

    // 0x80..0xBF: not valid as a first byte
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,   // 0x80
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,   // 0x90
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,   // 0xA0
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,   // 0xB0

    // 0xC0 and 0xC1 are invalid too; 0xC2..0xDF start two-byte sequences
    0, 0, 2, 2, 2, 2, 2, 2,   2, 2, 2, 2, 2, 2, 2, 2,   // 0xC0
    2, 2, 2, 2, 2, 2, 2, 2,   2, 2, 2, 2, 2, 2, 2, 2,   // 0xD0

    // 0xE0..0xEF start three-byte sequences
    3, 3, 3, 3, 3, 3, 3, 3,   3, 3, 3, 3, 3, 3, 3, 3,   // 0xE0

    // 0xF0..0xF4 start four-byte sequences; 0xF5..0xFF are invalid (5- or
    // 6-byte sequences and sequences for code points above U+10FFFF, as
    // restricted by RFC 3629)
    4, 4, 4, 4, 4, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0    // 0xF0
};
108
109 wxUString &wxUString::assignFromUTF8( const char *str )
110 {
111 if (!str)
112 return assign( wxUString() );
113
114 size_type ucs4_len = 0;
115 const char *p = str;
116 while (*p)
117 {
118 unsigned char c = *p;
119 size_type len = tableUtf8Lengths[c];
120 if (!len)
121 return assign( wxUString() ); // don't try to convert invalid UTF-8
122 ucs4_len++;
123 p += len;
124 }
125
126 wxU32CharBuffer buffer( ucs4_len );
127 wxChar32 *out = buffer.data();
128
129 p = str;
130 while (*p)
131 {
132 unsigned char c = *p;
133 if (c < 0x80)
134 {
135 *out = c;
136 p++;
137 }
138 else
139 {
140 size_type len = tableUtf8Lengths[c]; // len == 0 is caught above
141
142 // Char. number range | UTF-8 octet sequence
143 // (hexadecimal) | (binary)
144 // ----------------------+----------------------------------------
145 // 0000 0000 - 0000 007F | 0xxxxxxx
146 // 0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
147 // 0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
148 // 0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
149 //
150 // Code point value is stored in bits marked with 'x',
151 // lowest-order bit of the value on the right side in the diagram
152 // above. (from RFC 3629)
153
154 // mask to extract lead byte's value ('x' bits above), by sequence
155 // length:
156 static const unsigned char leadValueMask[] = { 0x7F, 0x1F, 0x0F, 0x07 };
157
158 // mask and value of lead byte's most significant bits, by length:
159 static const unsigned char leadMarkerMask[] = { 0x80, 0xE0, 0xF0, 0xF8 };
160 static const unsigned char leadMarkerVal[] = { 0x00, 0xC0, 0xE0, 0xF0 };
161
162 len--; // it's more convenient to work with 0-based length here
163
164 // extract the lead byte's value bits:
165 if ( (c & leadMarkerMask[len]) != leadMarkerVal[len] )
166 break;
167
168 wxChar32 code = c & leadValueMask[len];
169
170 // all remaining bytes, if any, are handled in the same way
171 // regardless of sequence's length:
172 for ( ; len; --len )
173 {
174 c = *++p;
175 if ( (c & 0xC0) != 0x80 )
176 return assign( wxUString() ); // don't try to convert invalid UTF-8
177
178 code <<= 6;
179 code |= c & 0x3F;
180 }
181
182 *out = code;
183 p++;
184 }
185 out++;
186 }
187
188 return assign( buffer.data() );
189 }
190
191 wxUString &wxUString::assignFromUTF8( const char *str, size_type n )
192 {
193 if (!str)
194 return assign( wxUString() );
195
196 size_type ucs4_len = 0;
197 size_type utf8_pos = 0;
198 const char *p = str;
199 while (*p)
200 {
201 unsigned char c = *p;
202 size_type len = tableUtf8Lengths[c];
203 if (!len)
204 return assign( wxUString() ); // don't try to convert invalid UTF-8
205 if (utf8_pos + len > n)
206 break;
207 utf8_pos += len;
208 ucs4_len ++;
209 p += len;
210 }
211
212 wxU32CharBuffer buffer( ucs4_len );
213 wxChar32 *out = buffer.data();
214
215 utf8_pos = 0;
216 p = str;
217 while (*p)
218 {
219 unsigned char c = *p;
220 if (c < 0x80)
221 {
222 if (utf8_pos + 1 > n)
223 break;
224 utf8_pos++;
225
226 *out = c;
227 p++;
228 }
229 else
230 {
231 size_type len = tableUtf8Lengths[c]; // len == 0 is caught above
232 if (utf8_pos + len > n)
233 break;
234 utf8_pos += len;
235
236 // Char. number range | UTF-8 octet sequence
237 // (hexadecimal) | (binary)
238 // ----------------------+----------------------------------------
239 // 0000 0000 - 0000 007F | 0xxxxxxx
240 // 0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
241 // 0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
242 // 0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
243 //
244 // Code point value is stored in bits marked with 'x',
245 // lowest-order bit of the value on the right side in the diagram
246 // above. (from RFC 3629)
247
248 // mask to extract lead byte's value ('x' bits above), by sequence
249 // length:
250 static const unsigned char leadValueMask[] = { 0x7F, 0x1F, 0x0F, 0x07 };
251
252 // mask and value of lead byte's most significant bits, by length:
253 static const unsigned char leadMarkerMask[] = { 0x80, 0xE0, 0xF0, 0xF8 };
254 static const unsigned char leadMarkerVal[] = { 0x00, 0xC0, 0xE0, 0xF0 };
255
256 len--; // it's more convenient to work with 0-based length here
257
258 // extract the lead byte's value bits:
259 if ( (c & leadMarkerMask[len]) != leadMarkerVal[len] )
260 break;
261
262 wxChar32 code = c & leadValueMask[len];
263
264 // all remaining bytes, if any, are handled in the same way
265 // regardless of sequence's length:
266 for ( ; len; --len )
267 {
268 c = *++p;
269 if ( (c & 0xC0) != 0x80 )
270 return assign( wxUString() ); // don't try to convert invalid UTF-8
271
272 code <<= 6;
273 code |= c & 0x3F;
274 }
275
276 *out = code;
277 p++;
278 }
279 out++;
280 }
281
282 *out = 0;
283
284 return assign( buffer.data() );
285 }
286
287 wxUString &wxUString::assignFromUTF16( const wxChar16* str, size_type n )
288 {
289 if (!str)
290 return assign( wxUString() );
291
292 size_type ucs4_len = 0;
293 size_type utf16_pos = 0;
294 const wxChar16 *p = str;
295 while (*p)
296 {
297 size_type len;
298 if ((*p < 0xd800) || (*p > 0xdfff))
299 {
300 len = 1;
301 }
302 else if ((p[1] < 0xdc00) || (p[1] > 0xdfff))
303 {
304 return assign( wxUString() ); // don't try to convert invalid UTF-16
305 }
306 else
307 {
308 len = 2;
309 }
310
311 if (utf16_pos + len > n)
312 break;
313
314 ucs4_len++;
315 p += len;
316 utf16_pos += len;
317 }
318
319 wxU32CharBuffer buffer( ucs4_len );
320 wxChar32 *out = buffer.data();
321
322 utf16_pos = 0;
323
324 p = str;
325 while (*p)
326 {
327 if ((*p < 0xd800) || (*p > 0xdfff))
328 {
329 if (utf16_pos + 1 > n)
330 break;
331
332 *out = *p;
333 p++;
334 utf16_pos++;
335 }
336 else
337 {
338 if (utf16_pos + 2 > n)
339 break;
340
341 *out = ((p[0] - 0xd7c0) << 10) + (p[1] - 0xdc00);
342 p += 2;
343 utf16_pos += 2;
344 }
345 out++;
346 }
347
348 return assign( buffer.data() );
349 }
350
351 wxUString &wxUString::assignFromUTF16( const wxChar16* str )
352 {
353 if (!str)
354 return assign( wxUString() );
355
356 size_type ucs4_len = 0;
357 const wxChar16 *p = str;
358 while (*p)
359 {
360 size_type len;
361 if ((*p < 0xd800) || (*p > 0xdfff))
362 {
363 len = 1;
364 }
365 else if ((p[1] < 0xdc00) || (p[1] > 0xdfff))
366 {
367 return assign( wxUString() ); // don't try to convert invalid UTF-16
368 }
369 else
370 {
371 len = 2;
372 }
373
374 ucs4_len++;
375 p += len;
376 }
377
378 wxU32CharBuffer buffer( ucs4_len );
379 wxChar32 *out = buffer.data();
380
381 p = str;
382 while (*p)
383 {
384 if ((*p < 0xd800) || (*p > 0xdfff))
385 {
386 *out = *p;
387 p++;
388 }
389 else
390 {
391 *out = ((p[0] - 0xd7c0) << 10) + (p[1] - 0xdc00);
392 p += 2;
393 }
394 out++;
395 }
396
397 return assign( buffer.data() );
398 }
399
400 wxUString &wxUString::assignFromCString( const char* str )
401 {
402 if (!str)
403 return assign( wxUString() );
404
405 wxWCharBuffer buffer = wxConvLibc.cMB2WC( str );
406
407 return assign( buffer );
408 }
409
410 wxUString &wxUString::assignFromCString( const char* str, const wxMBConv &conv )
411 {
412 if (!str)
413 return assign( wxUString() );
414
415 wxWCharBuffer buffer = conv.cMB2WC( str );
416
417 return assign( buffer );
418 }
419
420 wxCharBuffer wxUString::utf8_str() const
421 {
422 size_type utf8_length = 0;
423 const wxChar32 *ptr = data();
424
425 while (*ptr)
426 {
427 wxChar32 code = *ptr;
428 ptr++;
429
430 if ( code <= 0x7F )
431 {
432 utf8_length++;
433 }
434 else if ( code <= 0x07FF )
435 {
436 utf8_length += 2;
437 }
438 else if ( code < 0xFFFF )
439 {
440 utf8_length += 3;
441 }
442 else if ( code <= 0x10FFFF )
443 {
444 utf8_length += 4;
445 }
446 else
447 {
448 // invalid range, skip
449 }
450 }
451
452 wxCharBuffer result( utf8_length );
453
454 char *out = result.data();
455
456 ptr = data();
457 while (*ptr)
458 {
459 wxChar32 code = *ptr;
460 ptr++;
461
462 if ( code <= 0x7F )
463 {
464 out[0] = (char)code;
465 out++;
466 }
467 else if ( code <= 0x07FF )
468 {
469 out[1] = 0x80 | (code & 0x3F); code >>= 6;
470 out[0] = 0xC0 | code;
471 out += 2;
472 }
473 else if ( code < 0xFFFF )
474 {
475 out[2] = 0x80 | (code & 0x3F); code >>= 6;
476 out[1] = 0x80 | (code & 0x3F); code >>= 6;
477 out[0] = 0xE0 | code;
478 out += 3;
479 }
480 else if ( code <= 0x10FFFF )
481 {
482 out[3] = 0x80 | (code & 0x3F); code >>= 6;
483 out[2] = 0x80 | (code & 0x3F); code >>= 6;
484 out[1] = 0x80 | (code & 0x3F); code >>= 6;
485 out[0] = 0xF0 | code;
486 out += 4;
487 }
488 else
489 {
490 // invalid range, skip
491 }
492 }
493
494 wxPrintf( "utf8_str %s len %d\n", result, wxStrlen( result.data() ) );
495 wxPrintf( "utf8_str %s len %d\n", result, wxStrlen( result.data() ) );
496
497 return result;
498 }
499
500 wxU16CharBuffer wxUString::utf16_str() const
501 {
502 size_type utf16_length = 0;
503 const wxChar32 *ptr = data();
504
505 while (*ptr)
506 {
507 wxChar32 code = *ptr;
508 ptr++;
509
510 // TODO: error range checks
511
512 if (code < 0x10000)
513 utf16_length++;
514 else
515 utf16_length += 2;
516 }
517
518 wxU16CharBuffer result( utf16_length );
519 wxChar16 *out = result.data();
520
521 ptr = data();
522
523 while (*ptr)
524 {
525 wxChar32 code = *ptr;
526 ptr++;
527
528 // TODO: error range checks
529
530 if (code < 0x10000)
531 {
532 out[0] = code;
533 out++;
534 }
535 else
536 {
537 out[0] = (code - 0x10000) / 0x400 + 0xd800;
538 out[1] = (code - 0x10000) % 0x400 + 0xdc00;
539 out += 2;
540 }
541 }
542
543 return result;
544 }