]> git.saurik.com Git - wxWidgets.git/blob - src/common/ustring.cpp
many document corrections by charles; partial commit of patch #10087
[wxWidgets.git] / src / common / ustring.cpp
1 /////////////////////////////////////////////////////////////////////////////
2 // Name: src/common/ustring.cpp
3 // Purpose: wxUString class
4 // Author: Robert Roebling
5 // Created: 2008-07-25
6 // RCS-ID: $Id:$
7 // Copyright: (c) 2008 Robert Roebling
8 // Licence: wxWindows licence
9 ///////////////////////////////////////////////////////////////////////////////
10
11 // For compilers that support precompilation, includes "wx.h".
12 #include "wx/wxprec.h"
13
14 #ifdef __BORLANDC__
15 #pragma hdrstop
16 #endif
17
18 #include "wx/ustring.h"
19
20 #ifndef WX_PRECOMP
21 #include "wx/crt.h"
22 #include "wx/log.h"
23 #endif
24
25 wxUString &wxUString::assignFromAscii( const char *str )
26 {
27 size_type len = wxStrlen( str );
28
29 wxU32CharBuffer buffer( len );
30 wxChar32 *ptr = buffer.data();
31
32 size_type i;
33 for (i = 0; i < len; i++)
34 {
35 *ptr = *str;
36 ptr++;
37 str++;
38 }
39
40 return assign( buffer );
41 }
42
43 wxUString &wxUString::assignFromAscii( const char *str, size_type n )
44 {
45 size_type len = 0;
46 const char *s = str;
47 while (len < n && *s)
48 {
49 len++;
50 s++;
51 }
52
53 wxU32CharBuffer buffer( len );
54 wxChar32 *ptr = buffer.data();
55
56 size_type i;
57 for (i = 0; i < len; i++)
58 {
59 *ptr = *str;
60 ptr++;
61 str++;
62 }
63
64 return *this;
65 }
66
67 // ----------------------------------------------------------------------------
68 // UTF-8
69 // ----------------------------------------------------------------------------
70
// Length in bytes of a UTF-8 sequence, indexed by the value of its first
// (lead) byte. An entry of 0 means that the byte can't start a sequence.
const unsigned char tableUtf8Lengths[256] = {
    // 00..7F: ASCII, encoded as a single byte
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 00..0F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 10..1F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 20..2F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 30..3F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 40..4F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 50..5F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 60..6F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 70..7F

    // 80..BF: continuation bytes (10xxxxxx), never valid as a lead byte
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 80..8F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 90..9F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // A0..AF
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // B0..BF

    // C0 and C1 could only start overlong encodings and are invalid;
    // C2..DF start two-byte sequences
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // C0..CF
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // D0..DF

    // E0..EF start three-byte sequences
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,  // E0..EF

    // F0..F4 start four-byte sequences; F5..FF are invalid (5- or 6-byte
    // sequences and sequences for code points above U+10FFFF, as
    // restricted by RFC 3629)
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0   // F0..FF
};
105
106 wxUString &wxUString::assignFromUTF8( const char *str )
107 {
108 if (!str)
109 return assign( wxUString() );
110
111 size_type ucs4_len = 0;
112 const char *p = str;
113 while (*p)
114 {
115 unsigned char c = *p;
116 size_type len = tableUtf8Lengths[c];
117 if (!len)
118 return assign( wxUString() ); // don't try to convert invalid UTF-8
119 ucs4_len++;
120 p += len;
121 }
122
123 wxU32CharBuffer buffer( ucs4_len );
124 wxChar32 *out = buffer.data();
125
126 p = str;
127 while (*p)
128 {
129 unsigned char c = *p;
130 if (c < 0x80)
131 {
132 *out = c;
133 p++;
134 }
135 else
136 {
137 size_type len = tableUtf8Lengths[c]; // len == 0 is caught above
138
139 // Char. number range | UTF-8 octet sequence
140 // (hexadecimal) | (binary)
141 // ----------------------+----------------------------------------
142 // 0000 0000 - 0000 007F | 0xxxxxxx
143 // 0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
144 // 0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
145 // 0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
146 //
147 // Code point value is stored in bits marked with 'x',
148 // lowest-order bit of the value on the right side in the diagram
149 // above. (from RFC 3629)
150
151 // mask to extract lead byte's value ('x' bits above), by sequence
152 // length:
153 static const unsigned char leadValueMask[] = { 0x7F, 0x1F, 0x0F, 0x07 };
154
155 // mask and value of lead byte's most significant bits, by length:
156 static const unsigned char leadMarkerMask[] = { 0x80, 0xE0, 0xF0, 0xF8 };
157 static const unsigned char leadMarkerVal[] = { 0x00, 0xC0, 0xE0, 0xF0 };
158
159 len--; // it's more convenient to work with 0-based length here
160
161 // extract the lead byte's value bits:
162 if ( (c & leadMarkerMask[len]) != leadMarkerVal[len] )
163 break;
164
165 wxChar32 code = c & leadValueMask[len];
166
167 // all remaining bytes, if any, are handled in the same way
168 // regardless of sequence's length:
169 for ( ; len; --len )
170 {
171 c = *++p;
172 if ( (c & 0xC0) != 0x80 )
173 return assign( wxUString() ); // don't try to convert invalid UTF-8
174
175 code <<= 6;
176 code |= c & 0x3F;
177 }
178
179 *out = code;
180 p++;
181 }
182 out++;
183 }
184
185 return assign( buffer.data() );
186 }
187
188 wxUString &wxUString::assignFromUTF8( const char *str, size_type n )
189 {
190 if (!str)
191 return assign( wxUString() );
192
193 size_type ucs4_len = 0;
194 size_type utf8_pos = 0;
195 const char *p = str;
196 while (*p)
197 {
198 unsigned char c = *p;
199 size_type len = tableUtf8Lengths[c];
200 if (!len)
201 return assign( wxUString() ); // don't try to convert invalid UTF-8
202 if (utf8_pos + len > n)
203 break;
204 utf8_pos += len;
205 ucs4_len ++;
206 p += len;
207 }
208
209 wxU32CharBuffer buffer( ucs4_len );
210 wxChar32 *out = buffer.data();
211
212 utf8_pos = 0;
213 p = str;
214 while (*p)
215 {
216 unsigned char c = *p;
217 if (c < 0x80)
218 {
219 if (utf8_pos + 1 > n)
220 break;
221 utf8_pos++;
222
223 *out = c;
224 p++;
225 }
226 else
227 {
228 size_type len = tableUtf8Lengths[c]; // len == 0 is caught above
229 if (utf8_pos + len > n)
230 break;
231 utf8_pos += len;
232
233 // Char. number range | UTF-8 octet sequence
234 // (hexadecimal) | (binary)
235 // ----------------------+----------------------------------------
236 // 0000 0000 - 0000 007F | 0xxxxxxx
237 // 0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
238 // 0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
239 // 0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
240 //
241 // Code point value is stored in bits marked with 'x',
242 // lowest-order bit of the value on the right side in the diagram
243 // above. (from RFC 3629)
244
245 // mask to extract lead byte's value ('x' bits above), by sequence
246 // length:
247 static const unsigned char leadValueMask[] = { 0x7F, 0x1F, 0x0F, 0x07 };
248
249 // mask and value of lead byte's most significant bits, by length:
250 static const unsigned char leadMarkerMask[] = { 0x80, 0xE0, 0xF0, 0xF8 };
251 static const unsigned char leadMarkerVal[] = { 0x00, 0xC0, 0xE0, 0xF0 };
252
253 len--; // it's more convenient to work with 0-based length here
254
255 // extract the lead byte's value bits:
256 if ( (c & leadMarkerMask[len]) != leadMarkerVal[len] )
257 break;
258
259 wxChar32 code = c & leadValueMask[len];
260
261 // all remaining bytes, if any, are handled in the same way
262 // regardless of sequence's length:
263 for ( ; len; --len )
264 {
265 c = *++p;
266 if ( (c & 0xC0) != 0x80 )
267 return assign( wxUString() ); // don't try to convert invalid UTF-8
268
269 code <<= 6;
270 code |= c & 0x3F;
271 }
272
273 *out = code;
274 p++;
275 }
276 out++;
277 }
278
279 *out = 0;
280
281 return assign( buffer.data() );
282 }
283
284 wxUString &wxUString::assignFromUTF16( const wxChar16* str, size_type n )
285 {
286 if (!str)
287 return assign( wxUString() );
288
289 size_type ucs4_len = 0;
290 size_type utf16_pos = 0;
291 const wxChar16 *p = str;
292 while (*p)
293 {
294 size_type len;
295 if ((*p < 0xd800) || (*p > 0xdfff))
296 {
297 len = 1;
298 }
299 else if ((p[1] < 0xdc00) || (p[1] > 0xdfff))
300 {
301 return assign( wxUString() ); // don't try to convert invalid UTF-16
302 }
303 else
304 {
305 len = 2;
306 }
307
308 if (utf16_pos + len > n)
309 break;
310
311 ucs4_len++;
312 p += len;
313 utf16_pos += len;
314 }
315
316 wxU32CharBuffer buffer( ucs4_len );
317 wxChar32 *out = buffer.data();
318
319 utf16_pos = 0;
320
321 p = str;
322 while (*p)
323 {
324 if ((*p < 0xd800) || (*p > 0xdfff))
325 {
326 if (utf16_pos + 1 > n)
327 break;
328
329 *out = *p;
330 p++;
331 utf16_pos++;
332 }
333 else
334 {
335 if (utf16_pos + 2 > n)
336 break;
337
338 *out = ((p[0] - 0xd7c0) << 10) + (p[1] - 0xdc00);
339 p += 2;
340 utf16_pos += 2;
341 }
342 out++;
343 }
344
345 return assign( buffer.data() );
346 }
347
348 wxUString &wxUString::assignFromUTF16( const wxChar16* str )
349 {
350 if (!str)
351 return assign( wxUString() );
352
353 size_type ucs4_len = 0;
354 const wxChar16 *p = str;
355 while (*p)
356 {
357 size_type len;
358 if ((*p < 0xd800) || (*p > 0xdfff))
359 {
360 len = 1;
361 }
362 else if ((p[1] < 0xdc00) || (p[1] > 0xdfff))
363 {
364 return assign( wxUString() ); // don't try to convert invalid UTF-16
365 }
366 else
367 {
368 len = 2;
369 }
370
371 ucs4_len++;
372 p += len;
373 }
374
375 wxU32CharBuffer buffer( ucs4_len );
376 wxChar32 *out = buffer.data();
377
378 p = str;
379 while (*p)
380 {
381 if ((*p < 0xd800) || (*p > 0xdfff))
382 {
383 *out = *p;
384 p++;
385 }
386 else
387 {
388 *out = ((p[0] - 0xd7c0) << 10) + (p[1] - 0xdc00);
389 p += 2;
390 }
391 out++;
392 }
393
394 return assign( buffer.data() );
395 }
396
397 wxUString &wxUString::assignFromCString( const char* str )
398 {
399 if (!str)
400 return assign( wxUString() );
401
402 wxWCharBuffer buffer = wxConvLibc.cMB2WC( str );
403
404 return assign( buffer );
405 }
406
407 wxUString &wxUString::assignFromCString( const char* str, const wxMBConv &conv )
408 {
409 if (!str)
410 return assign( wxUString() );
411
412 wxWCharBuffer buffer = conv.cMB2WC( str );
413
414 return assign( buffer );
415 }
416
417 wxCharBuffer wxUString::utf8_str() const
418 {
419 size_type utf8_length = 0;
420 const wxChar32 *ptr = data();
421
422 while (*ptr)
423 {
424 wxChar32 code = *ptr;
425 ptr++;
426
427 if ( code <= 0x7F )
428 {
429 utf8_length++;
430 }
431 else if ( code <= 0x07FF )
432 {
433 utf8_length += 2;
434 }
435 else if ( code < 0xFFFF )
436 {
437 utf8_length += 3;
438 }
439 else if ( code <= 0x10FFFF )
440 {
441 utf8_length += 4;
442 }
443 else
444 {
445 // invalid range, skip
446 }
447 }
448
449 wxCharBuffer result( utf8_length );
450
451 char *out = result.data();
452
453 ptr = data();
454 while (*ptr)
455 {
456 wxChar32 code = *ptr;
457 ptr++;
458
459 if ( code <= 0x7F )
460 {
461 out[0] = (char)code;
462 out++;
463 }
464 else if ( code <= 0x07FF )
465 {
466 out[1] = 0x80 | (code & 0x3F); code >>= 6;
467 out[0] = 0xC0 | code;
468 out += 2;
469 }
470 else if ( code < 0xFFFF )
471 {
472 out[2] = 0x80 | (code & 0x3F); code >>= 6;
473 out[1] = 0x80 | (code & 0x3F); code >>= 6;
474 out[0] = 0xE0 | code;
475 out += 3;
476 }
477 else if ( code <= 0x10FFFF )
478 {
479 out[3] = 0x80 | (code & 0x3F); code >>= 6;
480 out[2] = 0x80 | (code & 0x3F); code >>= 6;
481 out[1] = 0x80 | (code & 0x3F); code >>= 6;
482 out[0] = 0xF0 | code;
483 out += 4;
484 }
485 else
486 {
487 // invalid range, skip
488 }
489 }
490
491 wxPrintf( "utf8_str %s len %d\n", result, wxStrlen( result.data() ) );
492 wxPrintf( "utf8_str %s len %d\n", result, wxStrlen( result.data() ) );
493
494 return result;
495 }
496
497 wxU16CharBuffer wxUString::utf16_str() const
498 {
499 size_type utf16_length = 0;
500 const wxChar32 *ptr = data();
501
502 while (*ptr)
503 {
504 wxChar32 code = *ptr;
505 ptr++;
506
507 // TODO: error range checks
508
509 if (code < 0x10000)
510 utf16_length++;
511 else
512 utf16_length += 2;
513 }
514
515 wxU16CharBuffer result( utf16_length );
516 wxChar16 *out = result.data();
517
518 ptr = data();
519
520 while (*ptr)
521 {
522 wxChar32 code = *ptr;
523 ptr++;
524
525 // TODO: error range checks
526
527 if (code < 0x10000)
528 {
529 out[0] = code;
530 out++;
531 }
532 else
533 {
534 out[0] = (code - 0x10000) / 0x400 + 0xd800;
535 out[1] = (code - 0x10000) % 0x400 + 0xdc00;
536 out += 2;
537 }
538 }
539
540 return result;
541 }