1 /////////////////////////////////////////////////////////////////////////////
2 // Name: src/common/ustring.cpp
3 // Purpose: wxUString class
4 // Author: Robert Roebling
6 // Copyright: (c) 2008 Robert Roebling
7 // Licence: wxWindows licence
8 ///////////////////////////////////////////////////////////////////////////////
10 // For compilers that support precompilation, includes "wx.h".
11 #include "wx/wxprec.h"
17 #include "wx/ustring.h"
// Assigns the contents of the narrow string `str` to this string and returns
// the reference produced by assign().
// The length is taken from wxStrlen( str ); a wxU32CharBuffer constructed with
// that length is then filled through `ptr` by an index loop over [0, len).
// NOTE(review): the loop body is not visible in this excerpt -- presumably it
// widens each input char to one wxChar32; confirm. No check that the input
// bytes are actually 7-bit ASCII is visible here.
24 wxUString
&wxUString::assignFromAscii( const char *str
)
26 size_type len
= wxStrlen( str
);
// staging buffer of wxChar32 elements, sized from the measured length
28 wxU32CharBuffer
buffer( len
);
29 wxChar32
*ptr
= buffer
.data();
// one iteration per input character
32 for (i
= 0; i
< len
; i
++)
// hand the converted buffer to assign() and propagate its return value
39 return assign( buffer
);
// Overload of assignFromAscii() that additionally takes a count `n`.
// A wxU32CharBuffer constructed with `len` is filled through `ptr` by an index
// loop over [0, len).
// NOTE(review): the statements that derive `len` from `str` and `n`, the loop
// body and the final return are not visible in this excerpt -- presumably
// `len` is the input length capped at `n`; confirm against the full source.
42 wxUString
&wxUString::assignFromAscii( const char *str
, size_type n
)
// staging buffer of wxChar32 elements
52 wxU32CharBuffer
buffer( len
);
53 wxChar32
*ptr
= buffer
.data();
// one iteration per character to convert
56 for (i
= 0; i
< len
; i
++)
66 // ----------------------------------------------------------------------------
68 // ----------------------------------------------------------------------------
// This table gives the length, in bytes, of a UTF-8 sequence from its first
// byte (the byte value is the index). A length of 0 marks a byte that cannot
// start a valid sequence. The table must list all 256 entries explicitly:
// a missing initializer would silently shift every later entry.
const unsigned char tableUtf8Lengths[256] = {
    // single-byte sequences (ASCII):
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 00..0F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 10..1F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 20..2F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 30..3F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 40..4F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 50..5F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 60..6F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 70..7F

    // continuation bytes (10xxxxxx); invalid as the first byte:
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 80..8F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 90..9F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // A0..AF
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // B0..BF

    // C0 and C1 could only introduce overlong two-byte forms of ASCII
    // characters, so they are invalid too:
    0, 0,                                            // C0..C1

    // two-byte sequences:
          2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // C2..CF
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // D0..DF

    // three-byte sequences:
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,  // E0..EF

    // four-byte sequences:
    4, 4, 4, 4, 4,                                   // F0..F4

    // these are invalid again (5- or 6-byte
    // sequences and sequences for code points
    // above U+10FFFF, as restricted by RFC 3629):
                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0   // F5..FF
};
// Assigns this string from the UTF-8 encoded narrow string `str` and returns
// the reference produced by assign().
// The visible code works in two stages: a first scan looks up each lead byte
// in tableUtf8Lengths and bails out on invalid input, then a wxU32CharBuffer
// constructed with `ucs4_len` receives the decoded code points through `out`.
// Every failure path visible here assigns a default-constructed wxUString
// instead of a partial result.
// NOTE(review): the loop headers, the pointer advances and the statement that
// stores `code` into `out` are not visible in this excerpt.
105 wxUString
&wxUString::assignFromUTF8( const char *str
)
// NOTE(review): the guard for this early return is not visible here --
// presumably a null `str`; confirm.
108 return assign( wxUString() );
110 size_type ucs4_len
= 0;
// first stage: classify each lead byte through the length table
114 unsigned char c
= *p
;
115 size_type len
= tableUtf8Lengths
[c
];
117 return assign( wxUString() ); // don't try to convert invalid UTF-8
// second stage: buffer for the decoded wxChar32 values
122 wxU32CharBuffer
buffer( ucs4_len
);
123 wxChar32
*out
= buffer
.data();
128 unsigned char c
= *p
;
136 size_type len
= tableUtf8Lengths
[c
]; // len == 0 is caught above
138 // Char. number range | UTF-8 octet sequence
139 // (hexadecimal) | (binary)
140 // ----------------------+----------------------------------------
141 // 0000 0000 - 0000 007F | 0xxxxxxx
142 // 0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
143 // 0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
144 // 0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
146 // Code point value is stored in bits marked with 'x',
147 // lowest-order bit of the value on the right side in the diagram
148 // above. (from RFC 3629)
150 // mask to extract lead byte's value ('x' bits above), by sequence length:
152 static const unsigned char leadValueMask
[] = { 0x7F, 0x1F, 0x0F, 0x07 };
154 // mask and value of lead byte's most significant bits, by length:
155 static const unsigned char leadMarkerMask
[] = { 0x80, 0xE0, 0xF0, 0xF8 };
156 static const unsigned char leadMarkerVal
[] = { 0x00, 0xC0, 0xE0, 0xF0 };
// after the decrement `len` indexes the three four-entry tables above
158 len
--; // it's more convenient to work with 0-based length here
160 // check the lead byte's marker bits, then extract its value bits:
161 if ( (c
& leadMarkerMask
[len
]) != leadMarkerVal
[len
] )
164 wxChar32 code
= c
& leadValueMask
[len
];
166 // all remaining bytes, if any, are handled in the same way
167 // regardless of sequence's length:
// a continuation byte must have the form 10xxxxxx
171 if ( (c
& 0xC0) != 0x80 )
172 return assign( wxUString() ); // don't try to convert invalid UTF-8
184 return assign( buffer
.data() );
// Overload of assignFromUTF8() that additionally takes a count `n`, which the
// visible code compares against a running input position `utf8_pos`.
// As far as this excerpt shows, the work is done in two stages: a first scan
// looks up each lead byte in tableUtf8Lengths, bails out on invalid input and
// checks `utf8_pos + len` against `n`; then a wxU32CharBuffer constructed
// with `ucs4_len` receives the decoded code points through `out`.
// Every failure path visible here assigns a default-constructed wxUString.
// NOTE(review): the loop headers, the actions taken when a bound check
// against `n` fires, and the statement that stores `code` into `out` are not
// visible in this excerpt -- presumably the scan simply stops at `n`; confirm.
187 wxUString
&wxUString::assignFromUTF8( const char *str
, size_type n
)
// NOTE(review): the guard for this early return is not visible here --
// presumably a null `str`; confirm.
190 return assign( wxUString() );
192 size_type ucs4_len
= 0;
193 size_type utf8_pos
= 0;
// first stage: classify each lead byte through the length table
197 unsigned char c
= *p
;
198 size_type len
= tableUtf8Lengths
[c
];
200 return assign( wxUString() ); // don't try to convert invalid UTF-8
// would the whole sequence still fit inside the first n input bytes?
201 if (utf8_pos
+ len
> n
)
// second stage: buffer for the decoded wxChar32 values
208 wxU32CharBuffer
buffer( ucs4_len
);
209 wxChar32
*out
= buffer
.data();
215 unsigned char c
= *p
;
218 if (utf8_pos
+ 1 > n
)
227 size_type len
= tableUtf8Lengths
[c
]; // len == 0 is caught above
228 if (utf8_pos
+ len
> n
)
232 // Char. number range | UTF-8 octet sequence
233 // (hexadecimal) | (binary)
234 // ----------------------+----------------------------------------
235 // 0000 0000 - 0000 007F | 0xxxxxxx
236 // 0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
237 // 0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
238 // 0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
240 // Code point value is stored in bits marked with 'x',
241 // lowest-order bit of the value on the right side in the diagram
242 // above. (from RFC 3629)
244 // mask to extract lead byte's value ('x' bits above), by sequence length:
246 static const unsigned char leadValueMask
[] = { 0x7F, 0x1F, 0x0F, 0x07 };
248 // mask and value of lead byte's most significant bits, by length:
249 static const unsigned char leadMarkerMask
[] = { 0x80, 0xE0, 0xF0, 0xF8 };
250 static const unsigned char leadMarkerVal
[] = { 0x00, 0xC0, 0xE0, 0xF0 };
// after the decrement `len` indexes the three four-entry tables above
252 len
--; // it's more convenient to work with 0-based length here
254 // check the lead byte's marker bits, then extract its value bits:
255 if ( (c
& leadMarkerMask
[len
]) != leadMarkerVal
[len
] )
258 wxChar32 code
= c
& leadValueMask
[len
];
260 // all remaining bytes, if any, are handled in the same way
261 // regardless of sequence's length:
// a continuation byte must have the form 10xxxxxx
265 if ( (c
& 0xC0) != 0x80 )
266 return assign( wxUString() ); // don't try to convert invalid UTF-8
280 return assign( buffer
.data() );
// Assigns this string from the UTF-16 input `str`, taking a count `n` that
// the visible code compares against a running position `utf16_pos`, and
// returns the reference produced by assign().
// A first scan classifies each unit as either outside the surrogate range
// 0xd800..0xdfff or as the start of a surrogate pair; if the unit following
// a surrogate is not in 0xdc00..0xdfff, a default-constructed wxUString is
// assigned instead. A wxU32CharBuffer constructed with `ucs4_len` then
// receives the decoded values through `out`.
// NOTE(review): the lead test only checks for the whole range 0xd800..0xdfff,
// so a low surrogate (0xdc00..0xdfff) in lead position looks like it is
// accepted as the first half of a pair -- confirm against the full source.
// NOTE(review): in the first scan p[1] is read before the bound check against
// `n` that follows it; confirm this cannot read past the caller's data.
283 wxUString
&wxUString::assignFromUTF16( const wxChar16
* str
, size_type n
)
// NOTE(review): the guard for this early return is not visible here --
// presumably a null `str`; confirm.
286 return assign( wxUString() );
288 size_type ucs4_len
= 0;
289 size_type utf16_pos
= 0;
290 const wxChar16
*p
= str
;
// unit outside the surrogate range
294 if ((*p
< 0xd800) || (*p
> 0xdfff))
// surrogate not followed by a unit in the low-surrogate range
298 else if ((p
[1] < 0xdc00) || (p
[1] > 0xdfff))
300 return assign( wxUString() ); // don't try to convert invalid UTF-16
307 if (utf16_pos
+ len
> n
)
// buffer for the decoded wxChar32 values
315 wxU32CharBuffer
buffer( ucs4_len
);
316 wxChar32
*out
= buffer
.data();
323 if ((*p
< 0xd800) || (*p
> 0xdfff))
325 if (utf16_pos
+ 1 > n
)
334 if (utf16_pos
+ 2 > n
)
// 0xd7c0 is 0xd800 - 0x40, so the shift both removes the high-surrogate base
// and adds the 0x10000 offset (0x40 << 10) in a single step
337 *out
= ((p
[0] - 0xd7c0) << 10) + (p
[1] - 0xdc00);
344 return assign( buffer
.data() );
// Assigns this string from the UTF-16 input `str` and returns the reference
// produced by assign().
// A first scan classifies each unit as either outside the surrogate range
// 0xd800..0xdfff or as the start of a surrogate pair; if the unit following
// a surrogate is not in 0xdc00..0xdfff, a default-constructed wxUString is
// assigned instead. A wxU32CharBuffer constructed with `ucs4_len` then
// receives the decoded values through `out`.
// NOTE(review): the loop headers and pointer advances are not visible in this
// excerpt. The lead test only checks for the whole range 0xd800..0xdfff, so a
// low surrogate in lead position looks like it is accepted as the first half
// of a pair -- confirm against the full source.
347 wxUString
&wxUString::assignFromUTF16( const wxChar16
* str
)
// NOTE(review): the guard for this early return is not visible here --
// presumably a null `str`; confirm.
350 return assign( wxUString() );
352 size_type ucs4_len
= 0;
353 const wxChar16
*p
= str
;
// unit outside the surrogate range
357 if ((*p
< 0xd800) || (*p
> 0xdfff))
// surrogate not followed by a unit in the low-surrogate range
361 else if ((p
[1] < 0xdc00) || (p
[1] > 0xdfff))
363 return assign( wxUString() ); // don't try to convert invalid UTF-16
// buffer for the decoded wxChar32 values
374 wxU32CharBuffer
buffer( ucs4_len
);
375 wxChar32
*out
= buffer
.data();
380 if ((*p
< 0xd800) || (*p
> 0xdfff))
// 0xd7c0 is 0xd800 - 0x40, so the shift both removes the high-surrogate base
// and adds the 0x10000 offset (0x40 << 10) in a single step
387 *out
= ((p
[0] - 0xd7c0) << 10) + (p
[1] - 0xdc00);
393 return assign( buffer
.data() );
// Assigns this string from the narrow string `str`, converting it to wide
// characters with wxConvLibc.cMB2WC(), and returns the reference produced by
// assign().
// NOTE(review): no check of the conversion result is visible in this excerpt;
// how assign() treats a failed conversion cannot be told from here.
396 wxUString
&wxUString::assignFromCString( const char* str
)
// NOTE(review): the guard for this early return is not visible here --
// presumably a null `str`; confirm.
399 return assign( wxUString() );
401 wxScopedWCharBuffer buffer
= wxConvLibc
.cMB2WC( str
);
403 return assign( buffer
);
// Assigns this string from the narrow string `str`, converting it to wide
// characters with the caller-supplied converter `conv` (conv.cMB2WC()), and
// returns the reference produced by assign().
// NOTE(review): no check of the conversion result is visible in this excerpt;
// how assign() treats a failed conversion cannot be told from here.
406 wxUString
&wxUString::assignFromCString( const char* str
, const wxMBConv
&conv
)
// NOTE(review): the guard for this early return is not visible here --
// presumably a null `str`; confirm.
409 return assign( wxUString() );
411 wxScopedWCharBuffer buffer
= conv
.cMB2WC( str
);
413 return assign( buffer
);
// Returns the contents of this string encoded as UTF-8.
// The code walks data() twice with the same range tests: the first walk
// accumulates `utf8_length`, the second writes the bytes into a wxCharBuffer
// constructed with that length. Both walks must classify every code point
// identically, otherwise the buffer size and the bytes written disagree.
//
// Range boundaries follow RFC 3629:
//   0000 0080 - 0000 07FF  two bytes
//   0000 0800 - 0000 FFFF  three bytes
//   0001 0000 - 0010 FFFF  four bytes
// The three-byte test is therefore inclusive (<= 0xFFFF): with a strict '<'
// U+FFFF fell through to the four-byte branch and was emitted as the overlong
// sequence F0 8F BF BF.
// NOTE(review): the loop headers, the one-byte branch, the pointer advances
// and the return statement are not visible in this excerpt.
416 wxScopedCharBuffer
wxUString::utf8_str() const
418 size_type utf8_length
= 0;
419 const wxChar32
*ptr
= data();
// first walk: classify each code point by range
423 wxChar32 code
= *ptr
;
430 else if ( code
<= 0x07FF )
434 else if ( code
<= 0xFFFF )
438 else if ( code
<= 0x10FFFF )
444 // invalid range, skip
// second walk: output buffer sized by the first walk
448 wxCharBuffer
result( utf8_length
);
450 char *out
= result
.data();
455 wxChar32 code
= *ptr
;
// two bytes: 110xxxxx 10xxxxxx, filled from the last byte backwards
463 else if ( code
<= 0x07FF )
465 out
[1] = 0x80 | (code
& 0x3F); code
>>= 6;
466 out
[0] = 0xC0 | code
;
// three bytes: 1110xxxx 10xxxxxx 10xxxxxx (U+FFFF included)
469 else if ( code
<= 0xFFFF )
471 out
[2] = 0x80 | (code
& 0x3F); code
>>= 6;
472 out
[1] = 0x80 | (code
& 0x3F); code
>>= 6;
473 out
[0] = 0xE0 | code
;
// four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
476 else if ( code
<= 0x10FFFF )
478 out
[3] = 0x80 | (code
& 0x3F); code
>>= 6;
479 out
[2] = 0x80 | (code
& 0x3F); code
>>= 6;
480 out
[1] = 0x80 | (code
& 0x3F); code
>>= 6;
481 out
[0] = 0xF0 | code
;
486 // invalid range, skip
// Returns the contents of this string encoded as UTF-16.
// As far as this excerpt shows, the code reads data() in two stages: the
// first accumulates `utf16_length`, the second writes wxChar16 units into a
// wxU16CharBuffer constructed with that length.
// NOTE(review): the loop headers, the branch conditions choosing between one
// unit and a surrogate pair, and the return statement are not visible in this
// excerpt. The TODO comments below indicate that range validation of the
// code points is still missing.
493 wxScopedU16CharBuffer
wxUString::utf16_str() const
495 size_type utf16_length
= 0;
496 const wxChar32
*ptr
= data();
500 wxChar32 code
= *ptr
;
503 // TODO: error range checks
// output buffer for the UTF-16 units
511 wxU16CharBuffer
result( utf16_length
);
512 wxChar16
*out
= result
.data();
518 wxChar32 code
= *ptr
;
521 // TODO: error range checks
// surrogate pair: 0x400 is 1 << 10, so the quotient holds the upper ten bits
// of (code - 0x10000) and the remainder the lower ten bits
530 out
[0] = (code
- 0x10000) / 0x400 + 0xd800;
531 out
[1] = (code
- 0x10000) % 0x400 + 0xdc00;