1 /////////////////////////////////////////////////////////////////////////////
2 // Name: src/common/ustring.cpp
3 // Purpose: wxUString class
4 // Author: Robert Roebling
7 // Copyright: (c) 2008 Robert Roebling
8 // Licence: wxWindows licence
9 ///////////////////////////////////////////////////////////////////////////////
11 // For compilers that support precompilation, includes "wx.h".
12 #include "wx/wxprec.h"
18 #include "wx/ustring.h"
// Builds a wxU32CharBuffer from the NUL-terminated string `str` and passes
// it to assign(); the value returned is whatever assign() returns.
// NOTE(review): the name suggests the input is expected to be 7-bit ASCII;
// no range check on the input bytes is visible in this view -- confirm.
25 wxUString
&wxUString::assignFromAscii( const char *str
)
// Number of characters in the input, as reported by wxStrlen().
27 size_type len
= wxStrlen( str
);
// Destination buffer constructed from `len`; `ptr` starts at its first element.
29 wxU32CharBuffer
buffer( len
);
30 wxChar32
*ptr
= buffer
.data();
// Runs `len` times, once per input character.
// NOTE(review): the declaration of `i` and the loop body are not visible here.
33 for (i
= 0; i
< len
; i
++)
// Hand the buffer to assign() and return its result.
40 return assign( buffer
);
// Overload of assignFromAscii() that additionally takes a count `n`.
// NOTE(review): how `n` is used and where `len` is declared are not visible
// in this view; presumably `len` is the input length capped at `n` -- confirm.
43 wxUString
&wxUString::assignFromAscii( const char *str
, size_type n
)
// Destination buffer constructed from `len`; `ptr` starts at its first element.
53 wxU32CharBuffer
buffer( len
);
54 wxChar32
*ptr
= buffer
.data();
// Runs `len` times, once per input character (loop body not visible here).
// NOTE(review): no `return assign( buffer )` is visible for this overload,
// unlike the one-argument overload -- verify that the filled buffer is
// actually assigned to the string before returning.
57 for (i
= 0; i
< len
; i
++)
67 // ----------------------------------------------------------------------------
69 // ----------------------------------------------------------------------------
// This table gives the length in bytes of a UTF-8 sequence, indexed by the
// value of its first (lead) byte. A length of 0 marks a byte that cannot
// start a valid sequence. The table must contain exactly 256 entries so
// that every row lines up with the byte range named in its trailing comment.
const unsigned char tableUtf8Lengths[256] = {
    // single-byte sequences (ASCII):
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 00..0F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 10..1F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 20..2F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 30..3F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 40..4F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 50..5F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 60..6F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 70..7F

    // these are invalid as lead bytes (10xxxxxx is the continuation-byte
    // pattern; C0 and C1 could only start overlong two-byte encodings):
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 80..8F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 90..9F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // A0..AF
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // B0..BF
    0, 0,                                            // C0,C1

    // two-byte sequences:
          2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // C2..CF
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // D0..DF

    // three-byte sequences:
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,  // E0..EF

    // four-byte sequences:
    4, 4, 4, 4, 4,                                   // F0..F4

    // these are invalid again (5- or 6-byte
    // sequences and sequences for code points
    // above U+10FFFF, as restricted by RFC 3629):
                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0   // F5..FF
};
// Converts the UTF-8 input `str` to 32-bit characters and assigns the result.
// The visible code works in two stages: it first accumulates `ucs4_len`
// while looking up each lead byte in tableUtf8Lengths, then constructs a
// wxU32CharBuffer from that count and decodes into it. Input found to be
// invalid results in assign( wxUString() ), i.e. a default-constructed string.
// NOTE(review): the loops, the initialisation of `p` and several branch
// bodies are not visible in this view.
106 wxUString
&wxUString::assignFromUTF8( const char *str
)
// Early exit assigning a default-constructed wxUString.
// NOTE(review): the guarding condition is not visible; presumably a null
// `str` check -- confirm.
109 return assign( wxUString() );
// Counter used below to size the output buffer.
111 size_type ucs4_len
= 0;
// Stage 1: look up the sequence length for the current lead byte.
115 unsigned char c
= *p
;
116 size_type len
= tableUtf8Lengths
[c
];
118 return assign( wxUString() ); // don't try to convert invalid UTF-8
// Stage 2: output buffer constructed from the count gathered above;
// `out` starts at its first element.
123 wxU32CharBuffer
buffer( ucs4_len
);
124 wxChar32
*out
= buffer
.data();
// Current lead byte of the sequence being decoded.
129 unsigned char c
= *p
;
137 size_type len
= tableUtf8Lengths
[c
]; // len == 0 is caught above
139 // Char. number range | UTF-8 octet sequence
140 // (hexadecimal) | (binary)
141 // ----------------------+----------------------------------------
142 // 0000 0000 - 0000 007F | 0xxxxxxx
143 // 0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
144 // 0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
145 // 0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
147 // Code point value is stored in bits marked with 'x',
148 // lowest-order bit of the value on the right side in the diagram
149 // above. (from RFC 3629)
151 // mask to extract lead byte's value ('x' bits above), by sequence
// length; all three tables below are indexed by (length - 1), because
// `len` is decremented before they are used.
153 static const unsigned char leadValueMask
[] = { 0x7F, 0x1F, 0x0F, 0x07 };
155 // mask and value of lead byte's most significant bits, by length:
156 static const unsigned char leadMarkerMask
[] = { 0x80, 0xE0, 0xF0, 0xF8 };
157 static const unsigned char leadMarkerVal
[] = { 0x00, 0xC0, 0xE0, 0xF0 };
159 len
--; // it's more convenient to work with 0-based length here
161 // extract the lead byte's value bits:
// The condition below is true when the lead byte's marker bits do NOT match
// the pattern expected for this sequence length.
// NOTE(review): the statement executed in that case is not visible here.
162 if ( (c
& leadMarkerMask
[len
]) != leadMarkerVal
[len
] )
// Start the code point with the payload bits of the lead byte.
165 wxChar32 code
= c
& leadValueMask
[len
];
167 // all remaining bytes, if any, are handled in the same way
168 // regardless of sequence's length:
// Every continuation byte must match the bit pattern 10xxxxxx.
172 if ( (c
& 0xC0) != 0x80 )
173 return assign( wxUString() ); // don't try to convert invalid UTF-8
// Assign from the decoded buffer's data pointer.
185 return assign( buffer
.data() );
// Length-limited overload of assignFromUTF8(): `n` is compared against the
// running byte offset `utf8_pos` so that a sequence is only considered when
// `utf8_pos + len` does not exceed `n`. As in the unbounded overload, the
// visible code first accumulates `ucs4_len`, then constructs a buffer from
// it and decodes; invalid input results in assign( wxUString() ).
// NOTE(review): the loops, the initialisation of `p`, the updates of
// `utf8_pos` and the bodies of the bound checks are not visible in this view.
188 wxUString
&wxUString::assignFromUTF8( const char *str
, size_type n
)
// Early exit assigning a default-constructed wxUString.
// NOTE(review): the guarding condition is not visible; presumably a null
// `str` check -- confirm.
191 return assign( wxUString() );
// Output character counter and input byte offset.
193 size_type ucs4_len
= 0;
194 size_type utf8_pos
= 0;
// Stage 1: look up the sequence length for the current lead byte.
198 unsigned char c
= *p
;
199 size_type len
= tableUtf8Lengths
[c
];
201 return assign( wxUString() ); // don't try to convert invalid UTF-8
// True when the whole sequence would extend past the first `n` bytes.
202 if (utf8_pos
+ len
> n
)
// Stage 2: output buffer constructed from the count gathered above;
// `out` starts at its first element.
209 wxU32CharBuffer
buffer( ucs4_len
);
210 wxChar32
*out
= buffer
.data();
// Current lead byte of the sequence being decoded.
216 unsigned char c
= *p
;
// True when even a single further byte would exceed `n`.
219 if (utf8_pos
+ 1 > n
)
228 size_type len
= tableUtf8Lengths
[c
]; // len == 0 is caught above
// True when the multi-byte sequence would extend past the first `n` bytes.
229 if (utf8_pos
+ len
> n
)
233 // Char. number range | UTF-8 octet sequence
234 // (hexadecimal) | (binary)
235 // ----------------------+----------------------------------------
236 // 0000 0000 - 0000 007F | 0xxxxxxx
237 // 0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
238 // 0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
239 // 0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
241 // Code point value is stored in bits marked with 'x',
242 // lowest-order bit of the value on the right side in the diagram
243 // above. (from RFC 3629)
245 // mask to extract lead byte's value ('x' bits above), by sequence
// length; all three tables below are indexed by (length - 1), because
// `len` is decremented before they are used.
247 static const unsigned char leadValueMask
[] = { 0x7F, 0x1F, 0x0F, 0x07 };
249 // mask and value of lead byte's most significant bits, by length:
250 static const unsigned char leadMarkerMask
[] = { 0x80, 0xE0, 0xF0, 0xF8 };
251 static const unsigned char leadMarkerVal
[] = { 0x00, 0xC0, 0xE0, 0xF0 };
253 len
--; // it's more convenient to work with 0-based length here
255 // extract the lead byte's value bits:
// The condition below is true when the lead byte's marker bits do NOT match
// the pattern expected for this sequence length.
// NOTE(review): the statement executed in that case is not visible here.
256 if ( (c
& leadMarkerMask
[len
]) != leadMarkerVal
[len
] )
// Start the code point with the payload bits of the lead byte.
259 wxChar32 code
= c
& leadValueMask
[len
];
261 // all remaining bytes, if any, are handled in the same way
262 // regardless of sequence's length:
// Every continuation byte must match the bit pattern 10xxxxxx.
266 if ( (c
& 0xC0) != 0x80 )
267 return assign( wxUString() ); // don't try to convert invalid UTF-8
// Assign from the decoded buffer's data pointer.
281 return assign( buffer
.data() );
// Converts at most `n` UTF-16 code units from `str` to 32-bit characters
// and assigns the result. The visible code first accumulates `ucs4_len`
// while classifying each unit as inside or outside the surrogate range
// 0xD800..0xDFFF, then constructs a buffer from that count and decodes into
// it. Input found to be invalid results in assign( wxUString() ).
// NOTE(review): the loops, the declaration of `len`, the updates of
// `utf16_pos` and several branch bodies are not visible in this view.
284 wxUString
&wxUString::assignFromUTF16( const wxChar16
* str
, size_type n
)
// Early exit assigning a default-constructed wxUString.
// NOTE(review): the guarding condition is not visible; presumably a null
// `str` check -- confirm.
287 return assign( wxUString() );
// Output character counter, input unit offset and read cursor.
289 size_type ucs4_len
= 0;
290 size_type utf16_pos
= 0;
291 const wxChar16
*p
= str
;
// True for a unit outside the surrogate range 0xD800..0xDFFF.
295 if ((*p
< 0xd800) || (*p
> 0xdfff))
// Otherwise *p lies in 0xD800..0xDFFF and the following unit must lie in
// 0xDC00..0xDFFF, else the input is rejected.
// NOTE(review): only p[1] is range-checked here; no check that *p itself is
// at most 0xDBFF is visible, so a 0xDC00..0xDFFF unit in first position may
// be accepted. p[1] is also read before the visible `utf16_pos + len > n`
// test below -- verify this cannot read past the first `n` units.
299 else if ((p
[1] < 0xdc00) || (p
[1] > 0xdfff))
301 return assign( wxUString() ); // don't try to convert invalid UTF-16
// True when the current unit or pair would extend past the first `n` units.
308 if (utf16_pos
+ len
> n
)
// Output buffer constructed from the count gathered above; `out` starts at
// its first element.
316 wxU32CharBuffer
buffer( ucs4_len
);
317 wxChar32
*out
= buffer
.data();
// Decode stage: same classification as in the counting stage.
324 if ((*p
< 0xd800) || (*p
> 0xdfff))
// True when one more unit would exceed `n`.
326 if (utf16_pos
+ 1 > n
)
// True when a two-unit pair would exceed `n`.
335 if (utf16_pos
+ 2 > n
)
// Combine the pair: 0xd7c0 equals 0xd800 - (0x10000 >> 10), so the shift
// both removes the first unit's 0xD800 base and adds the 0x10000 offset.
338 *out
= ((p
[0] - 0xd7c0) << 10) + (p
[1] - 0xdc00);
// Assign from the decoded buffer's data pointer.
345 return assign( buffer
.data() );
// Converts the UTF-16 input `str` to 32-bit characters and assigns the
// result. The visible code first accumulates `ucs4_len` while classifying
// each unit as inside or outside the surrogate range 0xD800..0xDFFF, then
// constructs a buffer from that count and decodes into it. Input found to
// be invalid results in assign( wxUString() ).
// NOTE(review): the loops (and hence how the end of input is detected) and
// several branch bodies are not visible in this view.
348 wxUString
&wxUString::assignFromUTF16( const wxChar16
* str
)
// Early exit assigning a default-constructed wxUString.
// NOTE(review): the guarding condition is not visible; presumably a null
// `str` check -- confirm.
351 return assign( wxUString() );
// Output character counter and read cursor.
353 size_type ucs4_len
= 0;
354 const wxChar16
*p
= str
;
// True for a unit outside the surrogate range 0xD800..0xDFFF.
358 if ((*p
< 0xd800) || (*p
> 0xdfff))
// Otherwise *p lies in 0xD800..0xDFFF and the following unit must lie in
// 0xDC00..0xDFFF, else the input is rejected.
// NOTE(review): only p[1] is range-checked here; no check that *p itself is
// at most 0xDBFF is visible, so a 0xDC00..0xDFFF unit in first position may
// be accepted -- confirm.
362 else if ((p
[1] < 0xdc00) || (p
[1] > 0xdfff))
364 return assign( wxUString() ); // don't try to convert invalid UTF-16
// Output buffer constructed from the count gathered above; `out` starts at
// its first element.
375 wxU32CharBuffer
buffer( ucs4_len
);
376 wxChar32
*out
= buffer
.data();
// Decode stage: same classification as in the counting stage.
381 if ((*p
< 0xd800) || (*p
> 0xdfff))
// Combine the pair: 0xd7c0 equals 0xd800 - (0x10000 >> 10), so the shift
// both removes the first unit's 0xD800 base and adds the 0x10000 offset.
388 *out
= ((p
[0] - 0xd7c0) << 10) + (p
[1] - 0xdc00);
// Assign from the decoded buffer's data pointer.
394 return assign( buffer
.data() );
// Converts the multibyte string `str` to wide characters with
// wxConvLibc.cMB2WC() and assigns the resulting buffer.
397 wxUString
&wxUString::assignFromCString( const char* str
)
// Early exit assigning a default-constructed wxUString.
// NOTE(review): the guarding condition is not visible; presumably a null
// `str` check -- confirm.
400 return assign( wxUString() );
// Wide-character conversion of the input using the wxConvLibc converter.
402 wxScopedWCharBuffer buffer
= wxConvLibc
.cMB2WC( str
);
// Assign from the converted buffer and return the result of assign().
404 return assign( buffer
);
// Converts the multibyte string `str` to wide characters with the
// caller-supplied converter `conv` (conv.cMB2WC()) and assigns the
// resulting buffer.
407 wxUString
&wxUString::assignFromCString( const char* str
, const wxMBConv
&conv
)
// Early exit assigning a default-constructed wxUString.
// NOTE(review): the guarding condition is not visible; presumably a null
// `str` check -- confirm.
410 return assign( wxUString() );
// Wide-character conversion of the input using `conv`.
412 wxScopedWCharBuffer buffer
= conv
.cMB2WC( str
);
// Assign from the converted buffer and return the result of assign().
414 return assign( buffer
);
// Produces a UTF-8 encoding of this string's 32-bit characters.
// The visible code works in two stages: it first accumulates `utf8_length`
// by classifying each code point into a range, then constructs a
// wxCharBuffer from that count and writes the encoded bytes into it.
// NOTE(review): the loops, the leading `if` of each range chain, the
// updates of `utf8_length`/`out`/`ptr` and the return statement are not
// visible in this view.
417 wxScopedCharBuffer
wxUString::utf8_str() const
// Byte counter for the output and read cursor over the string's data().
419 size_type utf8_length
= 0;
420 const wxChar32
*ptr
= data();
// Stage 1: classify the current code point by range.
424 wxChar32 code
= *ptr
;
431 else if ( code
<= 0x07FF )
// NOTE(review): `<` rather than `<=` means code == 0xFFFF does not match
// this branch and falls through to the `<= 0x10FFFF` branch below, although
// RFC 3629 places 0800..FFFF in the three-byte form -- confirm whether `<=`
// was intended (the same comparison is repeated in the encoding stage).
435 else if ( code
< 0xFFFF )
439 else if ( code
<= 0x10FFFF )
445 // invalid range, skip
// Stage 2: output buffer constructed from the count gathered above;
// `out` starts at its first element.
449 wxCharBuffer
result( utf8_length
);
451 char *out
= result
.data();
456 wxChar32 code
= *ptr
;
// Two-byte form: the low 6 bits go into a 10xxxxxx continuation byte and the
// remaining bits into the 110xxxxx lead byte.
464 else if ( code
<= 0x07FF )
466 out
[1] = 0x80 | (code
& 0x3F); code
>>= 6;
467 out
[0] = 0xC0 | code
;
// Three-byte form: two continuation bytes written last-to-first, then the
// 1110xxxx lead byte. See the NOTE(review) on the matching `< 0xFFFF` test
// in stage 1.
470 else if ( code
< 0xFFFF )
472 out
[2] = 0x80 | (code
& 0x3F); code
>>= 6;
473 out
[1] = 0x80 | (code
& 0x3F); code
>>= 6;
474 out
[0] = 0xE0 | code
;
// Four-byte form: three continuation bytes written last-to-first, then the
// 11110xxx lead byte.
477 else if ( code
<= 0x10FFFF )
479 out
[3] = 0x80 | (code
& 0x3F); code
>>= 6;
480 out
[2] = 0x80 | (code
& 0x3F); code
>>= 6;
481 out
[1] = 0x80 | (code
& 0x3F); code
>>= 6;
482 out
[0] = 0xF0 | code
;
487 // invalid range, skip
494 wxScopedU16CharBuffer
wxUString::utf16_str() const
496 size_type utf16_length
= 0;
497 const wxChar32
*ptr
= data();
501 wxChar32 code
= *ptr
;
504 // TODO: error range checks
512 wxU16CharBuffer
result( utf16_length
);
513 wxChar16
*out
= result
.data();
519 wxChar32 code
= *ptr
;
522 // TODO: error range checks
531 out
[0] = (code
- 0x10000) / 0x400 + 0xd800;
532 out
[1] = (code
- 0x10000) % 0x400 + 0xdc00;