OSX regrouping
[wxWidgets.git] / src / common / ustring.cpp
1 /////////////////////////////////////////////////////////////////////////////
2 // Name: src/common/ustring.cpp
3 // Purpose: wxUString class
4 // Author: Robert Roebling
5 // Created: 2008-07-25
6 // RCS-ID: $Id:$
7 // Copyright: (c) 2008 Robert Roebling
8 // Licence: wxWindows licence
9 ///////////////////////////////////////////////////////////////////////////////
10
11 // For compilers that support precompilation, includes "wx.h".
12 #include "wx/wxprec.h"
13
14 #ifdef __BORLANDC__
15 #pragma hdrstop
16 #endif
17
18 #ifndef WX_PRECOMP
19 #include "wx/strconv.h" // wxConvLibc
20 #include "wx/log.h"
21 #endif
22
23 #include "wx/ustring.h"
24 #include "wx/unichar.h"
25 #include "wx/string.h"
26
27
28 wxUString &wxUString::assignFromAscii( const char *str )
29 {
30 size_type len = wxStrlen( str );
31
32 wxU32CharBuffer buffer( len );
33 wxChar32 *ptr = buffer.data();
34
35 size_type i;
36 for (i = 0; i < len; i++)
37 {
38 *ptr = *str;
39 ptr++;
40 str++;
41 }
42
43 return assign( buffer );
44 }
45
46 wxUString &wxUString::assignFromAscii( const char *str, size_type n )
47 {
48 size_type len = 0;
49 const char *s = str;
50 while (len < n && *s)
51 {
52 len++;
53 s++;
54 }
55
56 wxU32CharBuffer buffer( len );
57 wxChar32 *ptr = buffer.data();
58
59 size_type i;
60 for (i = 0; i < len; i++)
61 {
62 *ptr = *str;
63 ptr++;
64 str++;
65 }
66
67 return *this;
68 }
69
70 // ----------------------------------------------------------------------------
71 // UTF-8
72 // ----------------------------------------------------------------------------
73
74 static const wxUint32 utf8_max[]=
75 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
76
// Length in bytes of the UTF-8 sequence introduced by a given first byte,
// indexed by the value of that byte. An entry of 0 marks a byte which can't
// start a sequence: continuation bytes (80..BF), the lead bytes C0 and C1
// (which could only start overlong encodings) and F5..FF (5- or 6-byte
// sequences and sequences for code points above U+10FFFF, as restricted by
// RFC 3629).
//
// One row per value of the high nibble of the byte.
const unsigned char tableUtf8Lengths[256] = {
    // 00..7F: single-byte sequences (ASCII)
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 0x
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 1x
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 2x
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 3x
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 4x
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 5x
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 6x
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 7x

    // 80..BF: continuation bytes, invalid as first byte
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 8x
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 9x
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // Ax
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // Bx

    // C0, C1: invalid; C2..DF: two-byte sequences
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // Cx
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // Dx

    // E0..EF: three-byte sequences
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,  // Ex

    // F0..F4: four-byte sequences; F5..FF: invalid
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0   // Fx
};
111
112 wxUString &wxUString::assignFromUTF8( const char *str )
113 {
114 if (!str)
115 return assign( wxUString() );
116
117 size_type ucs4_len = 0;
118 const char *p = str;
119 while (*p)
120 {
121 unsigned char c = *p;
122 size_type len = tableUtf8Lengths[c];
123 if (!len)
124 return assign( wxUString() ); // don't try to convert invalid UTF-8
125 ucs4_len++;
126 p += len;
127 }
128
129 wxU32CharBuffer buffer( ucs4_len );
130 wxChar32 *out = buffer.data();
131
132 p = str;
133 while (*p)
134 {
135 unsigned char c = *p;
136 if (c < 0x80)
137 {
138 *out = c;
139 p++;
140 }
141 else
142 {
143 size_type len = tableUtf8Lengths[c]; // len == 0 is caught above
144
145 // Char. number range | UTF-8 octet sequence
146 // (hexadecimal) | (binary)
147 // ----------------------+----------------------------------------
148 // 0000 0000 - 0000 007F | 0xxxxxxx
149 // 0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
150 // 0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
151 // 0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
152 //
153 // Code point value is stored in bits marked with 'x',
154 // lowest-order bit of the value on the right side in the diagram
155 // above. (from RFC 3629)
156
157 // mask to extract lead byte's value ('x' bits above), by sequence
158 // length:
159 static const unsigned char leadValueMask[] = { 0x7F, 0x1F, 0x0F, 0x07 };
160
161 // mask and value of lead byte's most significant bits, by length:
162 static const unsigned char leadMarkerMask[] = { 0x80, 0xE0, 0xF0, 0xF8 };
163 static const unsigned char leadMarkerVal[] = { 0x00, 0xC0, 0xE0, 0xF0 };
164
165 len--; // it's more convenient to work with 0-based length here
166
167 // extract the lead byte's value bits:
168 if ( (c & leadMarkerMask[len]) != leadMarkerVal[len] )
169 break;
170
171 wxChar32 code = c & leadValueMask[len];
172
173 // all remaining bytes, if any, are handled in the same way
174 // regardless of sequence's length:
175 for ( ; len; --len )
176 {
177 c = *++p;
178 if ( (c & 0xC0) != 0x80 )
179 return assign( wxUString() ); // don't try to convert invalid UTF-8
180
181 code <<= 6;
182 code |= c & 0x3F;
183 }
184
185 *out = code;
186 p++;
187 }
188 out++;
189 }
190
191 return assign( buffer.data() );
192 }
193
194 wxUString &wxUString::assignFromUTF8( const char *str, size_type n )
195 {
196 if (!str)
197 return assign( wxUString() );
198
199 size_type ucs4_len = 0;
200 size_type utf8_pos = 0;
201 const char *p = str;
202 while (*p)
203 {
204 unsigned char c = *p;
205 size_type len = tableUtf8Lengths[c];
206 if (!len)
207 return assign( wxUString() ); // don't try to convert invalid UTF-8
208 if (utf8_pos + len > n)
209 break;
210 utf8_pos += len;
211 ucs4_len ++;
212 p += len;
213 }
214
215 wxU32CharBuffer buffer( ucs4_len );
216 wxChar32 *out = buffer.data();
217
218 utf8_pos = 0;
219 p = str;
220 while (*p)
221 {
222 unsigned char c = *p;
223 if (c < 0x80)
224 {
225 if (utf8_pos + 1 > n)
226 break;
227 utf8_pos++;
228
229 *out = c;
230 p++;
231 }
232 else
233 {
234 size_type len = tableUtf8Lengths[c]; // len == 0 is caught above
235 if (utf8_pos + len > n)
236 break;
237 utf8_pos += len;
238
239 // Char. number range | UTF-8 octet sequence
240 // (hexadecimal) | (binary)
241 // ----------------------+----------------------------------------
242 // 0000 0000 - 0000 007F | 0xxxxxxx
243 // 0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
244 // 0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
245 // 0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
246 //
247 // Code point value is stored in bits marked with 'x',
248 // lowest-order bit of the value on the right side in the diagram
249 // above. (from RFC 3629)
250
251 // mask to extract lead byte's value ('x' bits above), by sequence
252 // length:
253 static const unsigned char leadValueMask[] = { 0x7F, 0x1F, 0x0F, 0x07 };
254
255 // mask and value of lead byte's most significant bits, by length:
256 static const unsigned char leadMarkerMask[] = { 0x80, 0xE0, 0xF0, 0xF8 };
257 static const unsigned char leadMarkerVal[] = { 0x00, 0xC0, 0xE0, 0xF0 };
258
259 len--; // it's more convenient to work with 0-based length here
260
261 // extract the lead byte's value bits:
262 if ( (c & leadMarkerMask[len]) != leadMarkerVal[len] )
263 break;
264
265 wxChar32 code = c & leadValueMask[len];
266
267 // all remaining bytes, if any, are handled in the same way
268 // regardless of sequence's length:
269 for ( ; len; --len )
270 {
271 c = *++p;
272 if ( (c & 0xC0) != 0x80 )
273 return assign( wxUString() ); // don't try to convert invalid UTF-8
274
275 code <<= 6;
276 code |= c & 0x3F;
277 }
278
279 *out = code;
280 p++;
281 }
282 out++;
283 }
284
285 *out = 0;
286
287 return assign( buffer.data() );
288 }
289
290 wxUString &wxUString::assignFromUTF16( const wxChar16* str, size_type n )
291 {
292 if (!str)
293 return assign( wxUString() );
294
295 size_type ucs4_len = 0;
296 size_type utf16_pos = 0;
297 const wxChar16 *p = str;
298 while (*p)
299 {
300 size_type len;
301 if ((*p < 0xd800) || (*p > 0xdfff))
302 {
303 len = 1;
304 }
305 else if ((p[1] < 0xdc00) || (p[1] > 0xdfff))
306 {
307 return assign( wxUString() ); // don't try to convert invalid UTF-16
308 }
309 else
310 {
311 len = 2;
312 }
313
314 if (utf16_pos + len > n)
315 break;
316
317 ucs4_len++;
318 p += len;
319 utf16_pos += len;
320 }
321
322 wxU32CharBuffer buffer( ucs4_len );
323 wxChar32 *out = buffer.data();
324
325 utf16_pos = 0;
326
327 p = str;
328 while (*p)
329 {
330 if ((*p < 0xd800) || (*p > 0xdfff))
331 {
332 if (utf16_pos + 1 > n)
333 break;
334
335 *out = *p;
336 p++;
337 utf16_pos++;
338 }
339 else
340 {
341 if (utf16_pos + 2 > n)
342 break;
343
344 *out = ((p[0] - 0xd7c0) << 10) + (p[1] - 0xdc00);
345 p += 2;
346 utf16_pos += 2;
347 }
348 out++;
349 }
350
351 return assign( buffer.data() );
352 }
353
354 wxUString &wxUString::assignFromUTF16( const wxChar16* str )
355 {
356 if (!str)
357 return assign( wxUString() );
358
359 size_type ucs4_len = 0;
360 const wxChar16 *p = str;
361 while (*p)
362 {
363 size_type len;
364 if ((*p < 0xd800) || (*p > 0xdfff))
365 {
366 len = 1;
367 }
368 else if ((p[1] < 0xdc00) || (p[1] > 0xdfff))
369 {
370 return assign( wxUString() ); // don't try to convert invalid UTF-16
371 }
372 else
373 {
374 len = 2;
375 }
376
377 ucs4_len++;
378 p += len;
379 }
380
381 wxU32CharBuffer buffer( ucs4_len );
382 wxChar32 *out = buffer.data();
383
384 p = str;
385 while (*p)
386 {
387 if ((*p < 0xd800) || (*p > 0xdfff))
388 {
389 *out = *p;
390 p++;
391 }
392 else
393 {
394 *out = ((p[0] - 0xd7c0) << 10) + (p[1] - 0xdc00);
395 p += 2;
396 }
397 out++;
398 }
399
400 return assign( buffer.data() );
401 }
402
403 wxUString &wxUString::assignFromCString( const char* str )
404 {
405 if (!str)
406 return assign( wxUString() );
407
408 wxWCharBuffer buffer = wxConvLibc.cMB2WC( str );
409
410 return assign( buffer );
411 }
412
413 wxUString &wxUString::assignFromCString( const char* str, const wxMBConv &conv )
414 {
415 if (!str)
416 return assign( wxUString() );
417
418 wxWCharBuffer buffer = conv.cMB2WC( str );
419
420 return assign( buffer );
421 }
422
423 wxCharBuffer wxUString::utf8_str() const
424 {
425 size_type utf8_length = 0;
426 const wxChar32 *ptr = data();
427
428 while (*ptr)
429 {
430 wxChar32 code = *ptr;
431 ptr++;
432
433 if ( code <= 0x7F )
434 {
435 utf8_length++;
436 }
437 else if ( code <= 0x07FF )
438 {
439 utf8_length += 2;
440 }
441 else if ( code < 0xFFFF )
442 {
443 utf8_length += 3;
444 }
445 else if ( code <= 0x10FFFF )
446 {
447 utf8_length += 4;
448 }
449 else
450 {
451 // invalid range, skip
452 }
453 }
454
455 wxCharBuffer result( utf8_length );
456
457 char *out = result.data();
458
459 ptr = data();
460 while (*ptr)
461 {
462 wxChar32 code = *ptr;
463 ptr++;
464
465 if ( code <= 0x7F )
466 {
467 out[0] = (char)code;
468 out++;
469 }
470 else if ( code <= 0x07FF )
471 {
472 out[1] = 0x80 | (code & 0x3F); code >>= 6;
473 out[0] = 0xC0 | code;
474 out += 2;
475 }
476 else if ( code < 0xFFFF )
477 {
478 out[2] = 0x80 | (code & 0x3F); code >>= 6;
479 out[1] = 0x80 | (code & 0x3F); code >>= 6;
480 out[0] = 0xE0 | code;
481 out += 3;
482 }
483 else if ( code <= 0x10FFFF )
484 {
485 out[3] = 0x80 | (code & 0x3F); code >>= 6;
486 out[2] = 0x80 | (code & 0x3F); code >>= 6;
487 out[1] = 0x80 | (code & 0x3F); code >>= 6;
488 out[0] = 0xF0 | code;
489 out += 4;
490 }
491 else
492 {
493 // invalid range, skip
494 }
495 }
496
497 wxPrintf( "utf8_str %s len %d\n", result, wxStrlen( result.data() ) );
498 wxPrintf( "utf8_str %s len %d\n", result, wxStrlen( result.data() ) );
499
500 return result;
501 }
502
503 wxU16CharBuffer wxUString::utf16_str() const
504 {
505 size_type utf16_length = 0;
506 const wxChar32 *ptr = data();
507
508 while (*ptr)
509 {
510 wxChar32 code = *ptr;
511 ptr++;
512
513 // TODO: error range checks
514
515 if (code < 0x10000)
516 utf16_length++;
517 else
518 utf16_length += 2;
519 }
520
521 wxU16CharBuffer result( utf16_length );
522 wxChar16 *out = result.data();
523
524 ptr = data();
525
526 while (*ptr)
527 {
528 wxChar32 code = *ptr;
529 ptr++;
530
531 // TODO: error range checks
532
533 if (code < 0x10000)
534 {
535 out[0] = code;
536 out++;
537 }
538 else
539 {
540 out[0] = (code - 0x10000) / 0x400 + 0xd800;
541 out[1] = (code - 0x10000) % 0x400 + 0xdc00;
542 out += 2;
543 }
544 }
545
546 return result;
547 }
548