]>
git.saurik.com Git - wxWidgets.git/blob - src/common/ustring.cpp
a9b9241eb475551e2e43486503eaf6a1a64f5fc3
1 /////////////////////////////////////////////////////////////////////////////
2 // Name: src/common/ustring.cpp
3 // Purpose: wxUString class
4 // Author: Robert Roebling
7 // Copyright: (c) 2008 Robert Roebling
8 // Licence: wxWindows licence
9 ///////////////////////////////////////////////////////////////////////////////
11 // For compilers that support precompilation, includes "wx.h".
12 #include "wx/wxprec.h"
18 #include "wx/ustring.h"
// Replaces the contents of this string with the characters of the
// NUL-terminated string `str`.
//
// The length is measured with wxStrlen(), a UTF-32 buffer with room for that
// many characters is created, a counted loop runs once per character, and the
// buffer is finally passed to assign(), whose result is returned.
//
// NOTE(review): the loop body is not visible in this extract; presumably it
// stores one input char per wxChar32 slot -- confirm, and check how bytes
// >= 0x80 are widened on platforms where plain char is signed.
25 wxUString
&wxUString::assignFromAscii( const char *str
)
// Number of characters in front of the terminating NUL.
27 size_type len
= wxStrlen( str
);
// Destination buffer sized from `len`; `ptr` is the write cursor into it.
29 wxU32CharBuffer
buffer( len
);
30 wxChar32
*ptr
= buffer
.data();
// One iteration per input character.
33 for (i
= 0; i
< len
; i
++)
// Hand the converted buffer over to assign().
40 return assign( buffer
);
// Length-limited variant of assignFromAscii(): takes the source string `str`
// together with a character count `n`.
//
// A UTF-32 buffer of `len` characters is created and a counted loop runs
// `len` times over it.
//
// NOTE(review): how `len` is derived from `n` is not visible in this extract
// (presumably min(n, length of str) -- confirm).
// NOTE(review): unlike the single-argument overload, no `assign( buffer )`
// call is visible at the end of this function. Confirm the converted buffer
// is actually assigned to the string and not just discarded.
43 wxUString
&wxUString::assignFromAscii( const char *str
, size_type n
)
// Destination buffer sized from `len`; `ptr` is the write cursor into it.
53 wxU32CharBuffer
buffer( len
);
54 wxChar32
*ptr
= buffer
.data();
// One iteration per character to convert.
57 for (i
= 0; i
< len
; i
++)
67 // ----------------------------------------------------------------------------
69 // ----------------------------------------------------------------------------
71 static const wxUint32 utf8_max
[]=
72 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
74 // this table gives the length of the UTF-8 encoding from its first character:
// The table is indexed by the value of a lead byte (0..255). An entry of 0
// marks a byte that cannot start a sequence; the decoders in this file treat
// a 0 length as invalid input and give up on the conversion.
// NOTE(review): the rows visible here add up to 254 initialisers and no row
// covers 0xC0/0xC1 (the row after B0..BF is labelled C2..CF). If a "0, 0,"
// row is really missing, every entry from 0xC0 upwards is shifted by two
// positions -- confirm against the upstream file.
75 const unsigned char tableUtf8Lengths
[256] = {
76 // single-byte sequences (ASCII):
77 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 00..0F
78 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 10..1F
79 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 20..2F
80 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 30..3F
81 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 40..4F
82 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 50..5F
83 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 60..6F
84 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 70..7F
// 80..BF are continuation bytes (10xxxxxx) and can never lead a sequence:
87 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 80..8F
88 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 90..9F
89 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // A0..AF
90 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // B0..BF
93 // two-byte sequences:
94 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // C2..CF
95 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // D0..DF
97 // three-byte sequences:
98 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // E0..EF
100 // four-byte sequences:
101 4, 4, 4, 4, 4, // F0..F4
103 // these are invalid again (5- or 6-byte
104 // sequences and sequences for code points
105 // above U+10FFFF, as restricted by RFC 3629):
106 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 // F5..FF
// Replaces the contents of this string with the code points decoded from the
// UTF-8 string `str`.
//
// The visible code works in two passes: a counting pass that looks up each
// lead byte in tableUtf8Lengths to size the output (ucs4_len), then a decoding
// pass that validates and decodes each sequence into a UTF-32 buffer. Any
// input recognised as invalid makes the function assign an empty string.
//
// NOTE(review): the loop headers and pointer increments are not visible in
// this extract. In particular, confirm that the counting pass cannot step
// past the terminating NUL when the input ends in a truncated multi-byte
// sequence (it advances by the table length before continuation bytes are
// validated, as far as can be seen here).
109 wxUString
&wxUString::assignFromUTF8( const char *str
)
// Early exit assigning an empty string (guard condition not visible here;
// presumably a null `str` -- confirm).
112 return assign( wxUString() );
// Pass 1: number of code points the input will decode to.
114 size_type ucs4_len
= 0;
118 unsigned char c
= *p
;
// Sequence length implied by the lead byte.
119 size_type len
= tableUtf8Lengths
[c
];
121 return assign( wxUString() ); // don't try to convert invalid UTF-8
// Pass 2: decode into a buffer of ucs4_len characters; `out` is the write
// cursor.
126 wxU32CharBuffer
buffer( ucs4_len
);
127 wxChar32
*out
= buffer
.data();
132 unsigned char c
= *p
;
140 size_type len
= tableUtf8Lengths
[c
]; // len == 0 is caught above
142 // Char. number range | UTF-8 octet sequence
143 // (hexadecimal) | (binary)
144 // ----------------------+----------------------------------------
145 // 0000 0000 - 0000 007F | 0xxxxxxx
146 // 0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
147 // 0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
148 // 0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
150 // Code point value is stored in bits marked with 'x',
151 // lowest-order bit of the value on the right side in the diagram
152 // above. (from RFC 3629)
154 // mask to extract lead byte's value ('x' bits above), by sequence
// length; all three tables below are indexed with (sequence length - 1):
156 static const unsigned char leadValueMask
[] = { 0x7F, 0x1F, 0x0F, 0x07 };
158 // mask and value of lead byte's most significant bits, by length:
159 static const unsigned char leadMarkerMask
[] = { 0x80, 0xE0, 0xF0, 0xF8 };
160 static const unsigned char leadMarkerVal
[] = { 0x00, 0xC0, 0xE0, 0xF0 };
162 len
--; // it's more convenient to work with 0-based length here
164 // extract the lead byte's value bits:
// First verify that the lead byte's marker bits match the pattern expected
// for this length (the action taken on mismatch is not visible here).
165 if ( (c
& leadMarkerMask
[len
]) != leadMarkerVal
[len
] )
// Payload bits of the lead byte start the code point value.
168 wxChar32 code
= c
& leadValueMask
[len
];
170 // all remaining bytes, if any, are handled in the same way
171 // regardless of sequence's length:
// Every continuation byte must have the form 10xxxxxx.
175 if ( (c
& 0xC0) != 0x80 )
176 return assign( wxUString() ); // don't try to convert invalid UTF-8
// Assign from the decoded UTF-32 data.
188 return assign( buffer
.data() );
// Length-limited variant of assignFromUTF8(): decodes the UTF-8 data in `str`
// while tracking the byte position (utf8_pos) against the limit `n`.
//
// As in the unbounded overload there is a counting pass that sizes the output
// (ucs4_len) from tableUtf8Lengths and a decoding pass that validates each
// sequence; input recognised as invalid makes the function assign an empty
// string.
//
// NOTE(review): the loop headers, pointer increments and the actions taken
// when a bounds check fires are not visible in this extract -- confirm that
// a sequence straddling the `n` limit is dropped rather than partially read.
191 wxUString
&wxUString::assignFromUTF8( const char *str
, size_type n
)
// Early exit assigning an empty string (guard condition not visible here;
// presumably a null `str` -- confirm).
194 return assign( wxUString() );
// Pass 1: count code points (ucs4_len) while tracking consumed bytes
// (utf8_pos).
196 size_type ucs4_len
= 0;
197 size_type utf8_pos
= 0;
201 unsigned char c
= *p
;
// Sequence length implied by the lead byte.
202 size_type len
= tableUtf8Lengths
[c
];
204 return assign( wxUString() ); // don't try to convert invalid UTF-8
// Would this whole sequence extend beyond the first n bytes?
205 if (utf8_pos
+ len
> n
)
// Pass 2: decode into a buffer of ucs4_len characters; `out` is the write
// cursor.
212 wxU32CharBuffer
buffer( ucs4_len
);
213 wxChar32
*out
= buffer
.data();
219 unsigned char c
= *p
;
// Bounds check for a one-byte step.
222 if (utf8_pos
+ 1 > n
)
231 size_type len
= tableUtf8Lengths
[c
]; // len == 0 is caught above
// Bounds check for a multi-byte sequence.
232 if (utf8_pos
+ len
> n
)
236 // Char. number range | UTF-8 octet sequence
237 // (hexadecimal) | (binary)
238 // ----------------------+----------------------------------------
239 // 0000 0000 - 0000 007F | 0xxxxxxx
240 // 0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
241 // 0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
242 // 0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
244 // Code point value is stored in bits marked with 'x',
245 // lowest-order bit of the value on the right side in the diagram
246 // above. (from RFC 3629)
248 // mask to extract lead byte's value ('x' bits above), by sequence
// length; all three tables below are indexed with (sequence length - 1):
250 static const unsigned char leadValueMask
[] = { 0x7F, 0x1F, 0x0F, 0x07 };
252 // mask and value of lead byte's most significant bits, by length:
253 static const unsigned char leadMarkerMask
[] = { 0x80, 0xE0, 0xF0, 0xF8 };
254 static const unsigned char leadMarkerVal
[] = { 0x00, 0xC0, 0xE0, 0xF0 };
256 len
--; // it's more convenient to work with 0-based length here
258 // extract the lead byte's value bits:
// First verify that the lead byte's marker bits match the pattern expected
// for this length (the action taken on mismatch is not visible here).
259 if ( (c
& leadMarkerMask
[len
]) != leadMarkerVal
[len
] )
// Payload bits of the lead byte start the code point value.
262 wxChar32 code
= c
& leadValueMask
[len
];
264 // all remaining bytes, if any, are handled in the same way
265 // regardless of sequence's length:
// Every continuation byte must have the form 10xxxxxx.
269 if ( (c
& 0xC0) != 0x80 )
270 return assign( wxUString() ); // don't try to convert invalid UTF-8
// Assign from the decoded UTF-32 data.
284 return assign( buffer
.data() );
// Length-limited UTF-16 import: decodes the UTF-16 data in `str` while
// tracking the code-unit position (utf16_pos) against the limit `n`.
//
// The visible code has a counting pass that classifies each unit as either a
// plain BMP value (outside 0xd800..0xdfff) or the start of a surrogate pair,
// and a decoding pass that writes one wxChar32 per code point. Input
// recognised as invalid makes the function assign an empty string.
//
// NOTE(review): in the counting pass p[1] is inspected before the visible
// `utf16_pos + len > n` check, so a surrogate in the last permitted unit may
// read one unit past the limit -- confirm.
// NOTE(review): the visible test only requires p[1] to be a low surrogate;
// no check that *p itself is a high surrogate (< 0xdc00) can be seen here.
287 wxUString
&wxUString::assignFromUTF16( const wxChar16
* str
, size_type n
)
// Early exit assigning an empty string (guard condition not visible here;
// presumably a null `str` -- confirm).
290 return assign( wxUString() );
// Pass 1: count code points (ucs4_len) while tracking consumed units
// (utf16_pos).
292 size_type ucs4_len
= 0;
293 size_type utf16_pos
= 0;
294 const wxChar16
*p
= str
;
// Not a surrogate: a single unit is a whole code point.
298 if ((*p
< 0xd800) || (*p
> 0xdfff))
// Surrogate whose following unit is not in the low-surrogate range.
302 else if ((p
[1] < 0xdc00) || (p
[1] > 0xdfff))
304 return assign( wxUString() ); // don't try to convert invalid UTF-16
// Would this code point extend beyond the first n units?
311 if (utf16_pos
+ len
> n
)
// Pass 2: decode into a buffer of ucs4_len characters; `out` is the write
// cursor.
319 wxU32CharBuffer
buffer( ucs4_len
);
320 wxChar32
*out
= buffer
.data();
327 if ((*p
< 0xd800) || (*p
> 0xdfff))
// Bounds check for a single-unit code point.
329 if (utf16_pos
+ 1 > n
)
// Bounds check for a surrogate pair.
338 if (utf16_pos
+ 2 > n
)
// Combine the pair: 0xd7c0 == 0xd800 - (0x10000 >> 10), so subtracting it
// from the high surrogate folds in the 0x10000 offset of the supplementary
// planes before the shift.
341 *out
= ((p
[0] - 0xd7c0) << 10) + (p
[1] - 0xdc00);
// Assign from the decoded UTF-32 data.
348 return assign( buffer
.data() );
// Replaces the contents of this string with the code points decoded from the
// UTF-16 string `str`.
//
// The visible code has a counting pass that classifies each unit as either a
// plain BMP value (outside 0xd800..0xdfff) or the start of a surrogate pair,
// and a decoding pass that writes one wxChar32 per code point. Input
// recognised as invalid makes the function assign an empty string.
//
// NOTE(review): the loop headers and pointer increments are not visible in
// this extract (presumably the loops stop at a terminating 0 unit -- confirm).
// NOTE(review): the visible test only requires p[1] to be a low surrogate;
// no check that *p itself is a high surrogate (< 0xdc00) can be seen here.
351 wxUString
&wxUString::assignFromUTF16( const wxChar16
* str
)
// Early exit assigning an empty string (guard condition not visible here;
// presumably a null `str` -- confirm).
354 return assign( wxUString() );
// Pass 1: count the code points the input will decode to.
356 size_type ucs4_len
= 0;
357 const wxChar16
*p
= str
;
// Not a surrogate: a single unit is a whole code point.
361 if ((*p
< 0xd800) || (*p
> 0xdfff))
// Surrogate whose following unit is not in the low-surrogate range.
365 else if ((p
[1] < 0xdc00) || (p
[1] > 0xdfff))
367 return assign( wxUString() ); // don't try to convert invalid UTF-16
// Pass 2: decode into a buffer of ucs4_len characters; `out` is the write
// cursor.
378 wxU32CharBuffer
buffer( ucs4_len
);
379 wxChar32
*out
= buffer
.data();
384 if ((*p
< 0xd800) || (*p
> 0xdfff))
// Combine the pair: 0xd7c0 == 0xd800 - (0x10000 >> 10), so subtracting it
// from the high surrogate folds in the 0x10000 offset of the supplementary
// planes before the shift.
391 *out
= ((p
[0] - 0xd7c0) << 10) + (p
[1] - 0xdc00);
// Assign from the decoded UTF-32 data.
397 return assign( buffer
.data() );
// Replaces the contents of this string with `str` converted to wide
// characters by wxConvLibc.cMB2WC(); the resulting buffer is passed to
// assign(), whose result is returned.
400 wxUString
&wxUString::assignFromCString( const char* str
)
// Early exit assigning an empty string (guard condition not visible here;
// presumably a null `str` -- confirm).
403 return assign( wxUString() );
// Multibyte -> wide character conversion through wxConvLibc.
// NOTE(review): no check of the conversion result is visible -- confirm how a
// failed conversion is handled by assign().
405 wxWCharBuffer buffer
= wxConvLibc
.cMB2WC( str
);
407 return assign( buffer
);
// Replaces the contents of this string with `str` converted to wide
// characters by the caller-supplied converter `conv` (conv.cMB2WC()); the
// resulting buffer is passed to assign(), whose result is returned.
410 wxUString
&wxUString::assignFromCString( const char* str
, const wxMBConv
&conv
)
// Early exit assigning an empty string (guard condition not visible here;
// presumably a null `str` -- confirm).
413 return assign( wxUString() );
// Multibyte -> wide character conversion through `conv`.
// NOTE(review): no check of the conversion result is visible -- confirm how a
// failed conversion is handled by assign().
415 wxWCharBuffer buffer
= conv
.cMB2WC( str
);
417 return assign( buffer
);
420 wxCharBuffer
wxUString::utf8_str() const
422 size_type utf8_length
= 0;
423 const wxChar32
*ptr
= data();
427 wxChar32 code
= *ptr
;
434 else if ( code
<= 0x07FF )
438 else if ( code
< 0xFFFF )
442 else if ( code
<= 0x10FFFF )
448 // invalid range, skip
452 wxCharBuffer
result( utf8_length
);
454 char *out
= result
.data();
459 wxChar32 code
= *ptr
;
467 else if ( code
<= 0x07FF )
469 out
[1] = 0x80 | (code
& 0x3F); code
>>= 6;
470 out
[0] = 0xC0 | code
;
473 else if ( code
< 0xFFFF )
475 out
[2] = 0x80 | (code
& 0x3F); code
>>= 6;
476 out
[1] = 0x80 | (code
& 0x3F); code
>>= 6;
477 out
[0] = 0xE0 | code
;
480 else if ( code
<= 0x10FFFF )
482 out
[3] = 0x80 | (code
& 0x3F); code
>>= 6;
483 out
[2] = 0x80 | (code
& 0x3F); code
>>= 6;
484 out
[1] = 0x80 | (code
& 0x3F); code
>>= 6;
485 out
[0] = 0xF0 | code
;
490 // invalid range, skip
494 wxPrintf( "utf8_str %s len %d\n", result
, wxStrlen( result
.data() ) );
495 wxPrintf( "utf8_str %s len %d\n", result
, wxStrlen( result
.data() ) );
500 wxU16CharBuffer
wxUString::utf16_str() const
502 size_type utf16_length
= 0;
503 const wxChar32
*ptr
= data();
507 wxChar32 code
= *ptr
;
510 // TODO: error range checks
518 wxU16CharBuffer
result( utf16_length
);
519 wxChar16
*out
= result
.data();
525 wxChar32 code
= *ptr
;
528 // TODO: error range checks
537 out
[0] = (code
- 0x10000) / 0x400 + 0xd800;
538 out
[1] = (code
- 0x10000) % 0x400 + 0xdc00;