]>
git.saurik.com Git - wxWidgets.git/blob - src/common/ustring.cpp
/////////////////////////////////////////////////////////////////////////////
// Name:        src/common/ustring.cpp
// Purpose:     wxUString class
// Author:      Robert Roebling
// Copyright:   (c) 2008 Robert Roebling
// Licence:     wxWindows licence
/////////////////////////////////////////////////////////////////////////////
11 // For compilers that support precompilation, includes "wx.h".
12 #include "wx/wxprec.h"
19 #include "wx/strconv.h" // wxConvLibc
23 #include "wx/ustring.h"
24 #include "wx/unichar.h"
25 #include "wx/string.h"
28 wxUString
&wxUString::assignFromAscii( const char *str
)
30 size_type len
= wxStrlen( str
);
32 wxU32CharBuffer
buffer( len
);
33 wxChar32
*ptr
= buffer
.data();
36 for (i
= 0; i
< len
; i
++)
43 return assign( buffer
);
46 wxUString
&wxUString::assignFromAscii( const char *str
, size_type n
)
56 wxU32CharBuffer
buffer( len
);
57 wxChar32
*ptr
= buffer
.data();
60 for (i
= 0; i
< len
; i
++)
// ----------------------------------------------------------------------------
// UTF-8
// ----------------------------------------------------------------------------
74 static const wxUint32 utf8_max
[]=
75 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
// this table gives the length of the UTF-8 encoding from its first character;
// an entry of 0 means that the byte can't start a valid sequence:
const unsigned char tableUtf8Lengths[256] = {
    // single-byte sequences (ASCII):
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 00..0F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 10..1F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 20..2F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 30..3F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 40..4F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 50..5F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 60..6F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 70..7F

    // continuation bytes, never valid as the first byte of a sequence:
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 80..8F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 90..9F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // A0..AF
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // B0..BF

    // C0 and C1 could only start overlong encodings of ASCII characters
    // and so are invalid too:
    0, 0,                                            // C0,C1

    // two-byte sequences:
          2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // C2..CF
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // D0..DF

    // three-byte sequences:
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,  // E0..EF

    // four-byte sequences:
    4, 4, 4, 4, 4,                                   // F0..F4

    // these are invalid again (5- or 6-byte
    // sequences and sequences for code points
    // above U+10FFFF, as restricted by RFC 3629):
                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0   // F5..FF
};
112 wxUString
&wxUString::assignFromUTF8( const char *str
)
115 return assign( wxUString() );
117 size_type ucs4_len
= 0;
121 unsigned char c
= *p
;
122 size_type len
= tableUtf8Lengths
[c
];
124 return assign( wxUString() ); // don't try to convert invalid UTF-8
129 wxU32CharBuffer
buffer( ucs4_len
);
130 wxChar32
*out
= buffer
.data();
135 unsigned char c
= *p
;
143 size_type len
= tableUtf8Lengths
[c
]; // len == 0 is caught above
145 // Char. number range | UTF-8 octet sequence
146 // (hexadecimal) | (binary)
147 // ----------------------+----------------------------------------
148 // 0000 0000 - 0000 007F | 0xxxxxxx
149 // 0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
150 // 0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
151 // 0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
153 // Code point value is stored in bits marked with 'x',
154 // lowest-order bit of the value on the right side in the diagram
155 // above. (from RFC 3629)
157 // mask to extract lead byte's value ('x' bits above), by sequence
159 static const unsigned char leadValueMask
[] = { 0x7F, 0x1F, 0x0F, 0x07 };
161 // mask and value of lead byte's most significant bits, by length:
162 static const unsigned char leadMarkerMask
[] = { 0x80, 0xE0, 0xF0, 0xF8 };
163 static const unsigned char leadMarkerVal
[] = { 0x00, 0xC0, 0xE0, 0xF0 };
165 len
--; // it's more convenient to work with 0-based length here
167 // extract the lead byte's value bits:
168 if ( (c
& leadMarkerMask
[len
]) != leadMarkerVal
[len
] )
171 wxChar32 code
= c
& leadValueMask
[len
];
173 // all remaining bytes, if any, are handled in the same way
174 // regardless of sequence's length:
178 if ( (c
& 0xC0) != 0x80 )
179 return assign( wxUString() ); // don't try to convert invalid UTF-8
191 return assign( buffer
.data() );
194 wxUString
&wxUString::assignFromUTF8( const char *str
, size_type n
)
197 return assign( wxUString() );
199 size_type ucs4_len
= 0;
200 size_type utf8_pos
= 0;
204 unsigned char c
= *p
;
205 size_type len
= tableUtf8Lengths
[c
];
207 return assign( wxUString() ); // don't try to convert invalid UTF-8
208 if (utf8_pos
+ len
> n
)
215 wxU32CharBuffer
buffer( ucs4_len
);
216 wxChar32
*out
= buffer
.data();
222 unsigned char c
= *p
;
225 if (utf8_pos
+ 1 > n
)
234 size_type len
= tableUtf8Lengths
[c
]; // len == 0 is caught above
235 if (utf8_pos
+ len
> n
)
239 // Char. number range | UTF-8 octet sequence
240 // (hexadecimal) | (binary)
241 // ----------------------+----------------------------------------
242 // 0000 0000 - 0000 007F | 0xxxxxxx
243 // 0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
244 // 0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
245 // 0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
247 // Code point value is stored in bits marked with 'x',
248 // lowest-order bit of the value on the right side in the diagram
249 // above. (from RFC 3629)
251 // mask to extract lead byte's value ('x' bits above), by sequence
253 static const unsigned char leadValueMask
[] = { 0x7F, 0x1F, 0x0F, 0x07 };
255 // mask and value of lead byte's most significant bits, by length:
256 static const unsigned char leadMarkerMask
[] = { 0x80, 0xE0, 0xF0, 0xF8 };
257 static const unsigned char leadMarkerVal
[] = { 0x00, 0xC0, 0xE0, 0xF0 };
259 len
--; // it's more convenient to work with 0-based length here
261 // extract the lead byte's value bits:
262 if ( (c
& leadMarkerMask
[len
]) != leadMarkerVal
[len
] )
265 wxChar32 code
= c
& leadValueMask
[len
];
267 // all remaining bytes, if any, are handled in the same way
268 // regardless of sequence's length:
272 if ( (c
& 0xC0) != 0x80 )
273 return assign( wxUString() ); // don't try to convert invalid UTF-8
287 return assign( buffer
.data() );
290 wxUString
&wxUString::assignFromUTF16( const wxChar16
* str
, size_type n
)
293 return assign( wxUString() );
295 size_type ucs4_len
= 0;
296 size_type utf16_pos
= 0;
297 const wxChar16
*p
= str
;
301 if ((*p
< 0xd800) || (*p
> 0xdfff))
305 else if ((p
[1] < 0xdc00) || (p
[1] > 0xdfff))
307 return assign( wxUString() ); // don't try to convert invalid UTF-16
314 if (utf16_pos
+ len
> n
)
322 wxU32CharBuffer
buffer( ucs4_len
);
323 wxChar32
*out
= buffer
.data();
330 if ((*p
< 0xd800) || (*p
> 0xdfff))
332 if (utf16_pos
+ 1 > n
)
341 if (utf16_pos
+ 2 > n
)
344 *out
= ((p
[0] - 0xd7c0) << 10) + (p
[1] - 0xdc00);
351 return assign( buffer
.data() );
354 wxUString
&wxUString::assignFromUTF16( const wxChar16
* str
)
357 return assign( wxUString() );
359 size_type ucs4_len
= 0;
360 const wxChar16
*p
= str
;
364 if ((*p
< 0xd800) || (*p
> 0xdfff))
368 else if ((p
[1] < 0xdc00) || (p
[1] > 0xdfff))
370 return assign( wxUString() ); // don't try to convert invalid UTF-16
381 wxU32CharBuffer
buffer( ucs4_len
);
382 wxChar32
*out
= buffer
.data();
387 if ((*p
< 0xd800) || (*p
> 0xdfff))
394 *out
= ((p
[0] - 0xd7c0) << 10) + (p
[1] - 0xdc00);
400 return assign( buffer
.data() );
403 wxUString
&wxUString::assignFromCString( const char* str
)
406 return assign( wxUString() );
408 wxWCharBuffer buffer
= wxConvLibc
.cMB2WC( str
);
410 return assign( buffer
);
413 wxUString
&wxUString::assignFromCString( const char* str
, const wxMBConv
&conv
)
416 return assign( wxUString() );
418 wxWCharBuffer buffer
= conv
.cMB2WC( str
);
420 return assign( buffer
);
423 wxCharBuffer
wxUString::utf8_str() const
425 size_type utf8_length
= 0;
426 const wxChar32
*ptr
= data();
430 wxChar32 code
= *ptr
;
437 else if ( code
<= 0x07FF )
441 else if ( code
< 0xFFFF )
445 else if ( code
<= 0x10FFFF )
451 // invalid range, skip
455 wxCharBuffer
result( utf8_length
);
457 char *out
= result
.data();
462 wxChar32 code
= *ptr
;
470 else if ( code
<= 0x07FF )
472 out
[1] = 0x80 | (code
& 0x3F); code
>>= 6;
473 out
[0] = 0xC0 | code
;
476 else if ( code
< 0xFFFF )
478 out
[2] = 0x80 | (code
& 0x3F); code
>>= 6;
479 out
[1] = 0x80 | (code
& 0x3F); code
>>= 6;
480 out
[0] = 0xE0 | code
;
483 else if ( code
<= 0x10FFFF )
485 out
[3] = 0x80 | (code
& 0x3F); code
>>= 6;
486 out
[2] = 0x80 | (code
& 0x3F); code
>>= 6;
487 out
[1] = 0x80 | (code
& 0x3F); code
>>= 6;
488 out
[0] = 0xF0 | code
;
493 // invalid range, skip
497 wxPrintf( "utf8_str %s len %d\n", result
, wxStrlen( result
.data() ) );
498 wxPrintf( "utf8_str %s len %d\n", result
, wxStrlen( result
.data() ) );
503 wxU16CharBuffer
wxUString::utf16_str() const
505 size_type utf16_length
= 0;
506 const wxChar32
*ptr
= data();
510 wxChar32 code
= *ptr
;
513 // TODO: error range checks
521 wxU16CharBuffer
result( utf16_length
);
522 wxChar16
*out
= result
.data();
528 wxChar32 code
= *ptr
;
531 // TODO: error range checks
540 out
[0] = (code
- 0x10000) / 0x400 + 0xd800;
541 out
[1] = (code
- 0x10000) % 0x400 + 0xdc00;