]>
git.saurik.com Git - wxWidgets.git/blob - src/common/ustring.cpp
a9b9241eb475551e2e43486503eaf6a1a64f5fc3
   1 ///////////////////////////////////////////////////////////////////////////// 
   2 // Name:        src/common/ustring.cpp 
   3 // Purpose:     wxUString class 
   4 // Author:      Robert Roebling 
   7 // Copyright:   (c) 2008 Robert Roebling 
   8 // Licence:     wxWindows licence 
   9 /////////////////////////////////////////////////////////////////////////////// 
  11 // For compilers that support precompilation, includes "wx.h". 
  12 #include "wx/wxprec.h" 
  18 #include "wx/ustring.h" 
  // assignFromAscii( str ): measures `str` with wxStrlen(), allocates a
  // wxU32CharBuffer with that many elements, runs a counted loop of `len`
  // iterations and finally hands the buffer to assign(), whose result is
  // returned.
  // NOTE(review): the declaration of `i` and the body of the for loop are not
  // part of this view; presumably each char is widened into *ptr -- confirm
  // against the full file.
  25 wxUString 
&wxUString::assignFromAscii( const char *str 
) 
  27    size_type len 
= wxStrlen( str 
); 
  // destination storage, one wxChar32 per input char
  29    wxU32CharBuffer 
buffer( len 
); 
  30    wxChar32 
*ptr 
= buffer
.data(); 
  33    for (i 
= 0; i 
< len
; i
++) 
  40    return assign( buffer 
); 
  // assignFromAscii( str, n ): length-limited overload. A wxU32CharBuffer of
  // `len` elements is allocated and a counted loop of `len` iterations runs
  // over it.
  // NOTE(review): how `len` is derived from `n`, the loop body and the final
  // return are not visible in this view; presumably `len` is the smaller of
  // `n` and the string length -- confirm against the full file.
  43 wxUString 
&wxUString::assignFromAscii( const char *str
, size_type n 
) 
  53    wxU32CharBuffer 
buffer( len 
); 
  54    wxChar32 
*ptr 
= buffer
.data(); 
  57    for (i 
= 0; i 
< len
; i
++) 
  67 // ---------------------------------------------------------------------------- 
  69 // ---------------------------------------------------------------------------- 
  // NOTE(review): presumably the largest value encodable by a UTF-8 style
  // sequence of 1, 2, 3, 4, 5 and 6 bytes (7, 11, 16, 21, 26 and 31 payload
  // bits), followed by 0xffffffff as a catch-all. No use of this table is
  // visible in this view -- confirm it is still referenced.
  71 static const wxUint32 utf8_max
[]= 
  72     { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff }; 
  74 // this table gives the length of the UTF-8 encoding from its first character: 
  // The table is indexed by the byte value (0..255). An entry of 0 marks a
  // byte that cannot start a sequence; the decoders in this file return an
  // empty string right after looking such a byte up.
  // NOTE(review): the row for C0..C1 and the closing of the initializer are
  // not visible in this view.
  75 const unsigned char tableUtf8Lengths
[256] = { 
  76     // single-byte sequences (ASCII): 
  77     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 00..0F 
  78     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 10..1F 
  79     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 20..2F 
  80     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 30..3F 
  81     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 40..4F 
  82     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 50..5F 
  83     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 60..6F 
  84     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 70..7F 
  // 80..BF: never valid as the first byte of a sequence (length 0)
  87     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 80..8F 
  88     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 90..9F 
  89     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // A0..AF 
  90     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // B0..BF 
  93     // two-byte sequences: 
  94           2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // C2..CF 
  95     2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // D0..DF 
  97     // three-byte sequences: 
  98     3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,  // E0..EF 
 100     // four-byte sequences: 
 101     4, 4, 4, 4, 4,                                   // F0..F4 
 103     // these are invalid again (5- or 6-byte 
 104     // sequences and sequences for code points 
 105     // above U+10FFFF, as restricted by RFC 3629): 
 106                    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0   // F5..FF 
 // assignFromUTF8( str ): decodes UTF-8 input into a wxU32CharBuffer and
 // assigns it to this string. The visible code has two sections: the first
 // looks up each lead byte in tableUtf8Lengths and gives up (assigning an
 // empty wxUString) on invalid input; the second, after the buffer of
 // `ucs4_len` elements is allocated, rebuilds each code point from the lead
 // byte and its continuation bytes.
 // NOTE(review): the loop headers, the guard in front of the first return,
 // the statements that advance `p`/`out` and that count `ucs4_len` are not
 // visible in this view.
 109 wxUString 
&wxUString::assignFromUTF8( const char *str 
) 
 112         return assign( wxUString() ); 
 114     size_type ucs4_len 
= 0; 
 // first section: validate lead bytes
 118         unsigned char c 
= *p
; 
 119         size_type len 
= tableUtf8Lengths
[c
]; 
 121            return assign( wxUString() );  // don't try to convert invalid UTF-8 
 126     wxU32CharBuffer 
buffer( ucs4_len 
); 
 127     wxChar32 
*out 
= buffer
.data(); 
 // second section: decode into `out`
 132         unsigned char c 
= *p
; 
 140             size_type len 
= tableUtf8Lengths
[c
];  // len == 0 is caught above 
 142             //   Char. number range   |        UTF-8 octet sequence 
 143             //      (hexadecimal)     |              (binary) 
 144             //  ----------------------+---------------------------------------- 
 145             //  0000 0000 - 0000 007F | 0xxxxxxx 
 146             //  0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx 
 147             //  0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx 
 148             //  0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx 
 150             //  Code point value is stored in bits marked with 'x', 
 151             //  lowest-order bit of the value on the right side in the diagram 
 152             //  above.                                         (from RFC 3629) 
 154             // mask to extract lead byte's value ('x' bits above), by sequence 
 156             static const unsigned char leadValueMask
[] = { 0x7F, 0x1F, 0x0F, 0x07 }; 
 158             // mask and value of lead byte's most significant bits, by length: 
 159             static const unsigned char leadMarkerMask
[] = { 0x80, 0xE0, 0xF0, 0xF8 }; 
 160             static const unsigned char leadMarkerVal
[] = { 0x00, 0xC0, 0xE0, 0xF0 }; 
 162             len
--; // it's more convenient to work with 0-based length here 
 164             // extract the lead byte's value bits: 
 // the lead byte's marker bits must match the pattern for its length
 165             if ( (c 
& leadMarkerMask
[len
]) != leadMarkerVal
[len
] ) 
 168             wxChar32 code 
= c 
& leadValueMask
[len
]; 
 170             // all remaining bytes, if any, are handled in the same way 
 171             // regardless of sequence's length: 
 // every continuation byte must have the form 10xxxxxx
 175                 if ( (c 
& 0xC0) != 0x80 ) 
 176                     return assign( wxUString() );  // don't try to convert invalid UTF-8 
 188     return assign( buffer
.data() ); 
 // assignFromUTF8( str, n ): length-limited UTF-8 decoder. It follows the
 // same two-section shape as the unbounded overload (validate lead bytes,
 // then decode into a wxU32CharBuffer of `ucs4_len` elements) and in addition
 // compares `utf8_pos` plus the sequence length against `n` before consuming
 // a sequence. Invalid input results in an empty wxUString being assigned.
 // NOTE(review): the loop headers, the guard in front of the first return,
 // the statements executed when a bound check fires and the updates of
 // `p`, `out`, `utf8_pos` and `ucs4_len` are not visible in this view.
 191 wxUString 
&wxUString::assignFromUTF8( const char *str
, size_type n 
) 
 194         return assign( wxUString() ); 
 196     size_type ucs4_len 
= 0; 
 197     size_type utf8_pos 
= 0; 
 // first section: validate lead bytes within the first n bytes
 201         unsigned char c 
= *p
; 
 202         size_type len 
= tableUtf8Lengths
[c
]; 
 204            return assign( wxUString() );  // don't try to convert invalid UTF-8 
 // a sequence that would extend past n bytes is not consumed
 205         if (utf8_pos 
+ len 
> n
) 
 212     wxU32CharBuffer 
buffer( ucs4_len 
); 
 213     wxChar32 
*out 
= buffer
.data(); 
 // second section: decode into `out`
 219         unsigned char c 
= *p
; 
 222             if (utf8_pos 
+ 1 > n
) 
 231             size_type len 
= tableUtf8Lengths
[c
];  // len == 0 is caught above 
 232             if (utf8_pos 
+ len 
> n
) 
 236             //   Char. number range   |        UTF-8 octet sequence 
 237             //      (hexadecimal)     |              (binary) 
 238             //  ----------------------+---------------------------------------- 
 239             //  0000 0000 - 0000 007F | 0xxxxxxx 
 240             //  0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx 
 241             //  0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx 
 242             //  0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx 
 244             //  Code point value is stored in bits marked with 'x', 
 245             //  lowest-order bit of the value on the right side in the diagram 
 246             //  above.                                         (from RFC 3629) 
 248             // mask to extract lead byte's value ('x' bits above), by sequence 
 250             static const unsigned char leadValueMask
[] = { 0x7F, 0x1F, 0x0F, 0x07 }; 
 252             // mask and value of lead byte's most significant bits, by length: 
 253             static const unsigned char leadMarkerMask
[] = { 0x80, 0xE0, 0xF0, 0xF8 }; 
 254             static const unsigned char leadMarkerVal
[] = { 0x00, 0xC0, 0xE0, 0xF0 }; 
 256             len
--; // it's more convenient to work with 0-based length here 
 258             // extract the lead byte's value bits: 
 // the lead byte's marker bits must match the pattern for its length
 259             if ( (c 
& leadMarkerMask
[len
]) != leadMarkerVal
[len
] ) 
 262             wxChar32 code 
= c 
& leadValueMask
[len
]; 
 264             // all remaining bytes, if any, are handled in the same way 
 265             // regardless of sequence's length: 
 // every continuation byte must have the form 10xxxxxx
 269                 if ( (c 
& 0xC0) != 0x80 ) 
 270                     return assign( wxUString() );  // don't try to convert invalid UTF-8 
 284     return assign( buffer
.data() ); 
 // assignFromUTF16( str, n ): length-limited UTF-16 decoder. The first
 // section walks the input, classifying each unit as a BMP character (one
 // unit) or the start of a surrogate pair (two units), and assigns an empty
 // wxUString on invalid input. The second section, after a wxU32CharBuffer of
 // `ucs4_len` elements is allocated, writes one wxChar32 per character,
 // combining surrogate pairs arithmetically. `utf16_pos` is compared against
 // `n` before a character is consumed.
 //
 // Fix: a unit in the surrogate range is now accepted as the start of a pair
 // only if it is a high surrogate (0xd800..0xdbff). Previously only p[1] was
 // checked, so a low surrogate followed by another low surrogate passed
 // validation and was combined into a value of 0x110000 or above.
 //
 // NOTE(review): the loop headers, the guard in front of the first return,
 // and the statements that advance `p`, `out`, `utf16_pos` and `ucs4_len` are
 // not visible in this view. p[1] is read before the visible bound check
 // against `n` -- confirm that reading one unit past `n` is acceptable.
 287 wxUString 
&wxUString::assignFromUTF16( const wxChar16
* str
, size_type n 
) 
 290         return assign( wxUString() ); 
 292     size_type ucs4_len 
= 0; 
 293     size_type utf16_pos 
= 0; 
 294     const wxChar16 
*p 
= str
; 
 // not a surrogate: single-unit character
 298         if ((*p 
< 0xd800) || (*p 
> 0xdfff)) 
 // a pair must be <high surrogate, low surrogate>; the lead is tested first
 // so that p[1] is not examined when the lead itself is already invalid
 302         else if ((*p > 0xdbff) || (p
[1] < 0xdc00) || (p
[1] > 0xdfff)) 
 304             return assign( wxUString() );  // don't try to convert invalid UTF-16 
 311         if (utf16_pos 
+ len 
> n
) 
 319     wxU32CharBuffer 
buffer( ucs4_len 
); 
 320     wxChar32 
*out 
= buffer
.data(); 
 327         if ((*p 
< 0xd800) || (*p 
> 0xdfff)) 
 329             if (utf16_pos 
+ 1 > n
) 
 338             if (utf16_pos 
+ 2 > n
) 
 // (hi - 0xd800 + 0x40) << 10 folds the +0x10000 offset into the high half
 341            *out 
= ((p
[0] - 0xd7c0) << 10) + (p
[1] - 0xdc00); 
 348     return assign( buffer
.data() ); 
 // assignFromUTF16( str ): UTF-16 decoder without a length limit. The first
 // section walks the input, classifying each unit as a BMP character (one
 // unit) or the start of a surrogate pair (two units), and assigns an empty
 // wxUString on invalid input. The second section, after a wxU32CharBuffer of
 // `ucs4_len` elements is allocated, writes one wxChar32 per character,
 // combining surrogate pairs arithmetically.
 //
 // Fix: a unit in the surrogate range is now accepted as the start of a pair
 // only if it is a high surrogate (0xd800..0xdbff). Previously only p[1] was
 // checked, so a low surrogate followed by another low surrogate passed
 // validation and was combined into a value of 0x110000 or above.
 //
 // NOTE(review): the loop headers, the guard in front of the first return,
 // and the statements that advance `p`, `out` and `ucs4_len` are not visible
 // in this view.
 351 wxUString 
&wxUString::assignFromUTF16( const wxChar16
* str 
) 
 354         return assign( wxUString() ); 
 356     size_type ucs4_len 
= 0; 
 357     const wxChar16 
*p 
= str
; 
 // not a surrogate: single-unit character
 361         if ((*p 
< 0xd800) || (*p 
> 0xdfff)) 
 // a pair must be <high surrogate, low surrogate>; the lead is tested first
 // so that p[1] is not examined when the lead itself is already invalid
 365         else if ((*p > 0xdbff) || (p
[1] < 0xdc00) || (p
[1] > 0xdfff)) 
 367             return assign( wxUString() );  // don't try to convert invalid UTF-16 
 378     wxU32CharBuffer 
buffer( ucs4_len 
); 
 379     wxChar32 
*out 
= buffer
.data(); 
 384         if ((*p 
< 0xd800) || (*p 
> 0xdfff)) 
 // (hi - 0xd800 + 0x40) << 10 folds the +0x10000 offset into the high half
 391            *out 
= ((p
[0] - 0xd7c0) << 10) + (p
[1] - 0xdc00); 
 397     return assign( buffer
.data() ); 
 // assignFromCString( str ): converts `str` to wide characters with
 // wxConvLibc.cMB2WC() and assigns the resulting wxWCharBuffer.
 // NOTE(review): the condition guarding the early
 // `return assign( wxUString() )` is not visible in this view; presumably a
 // null check on `str` -- confirm.
 400 wxUString 
&wxUString::assignFromCString( const char* str 
) 
 403         return assign( wxUString() ); 
 405     wxWCharBuffer buffer 
= wxConvLibc
.cMB2WC( str 
); 
 407     return assign( buffer 
); 
 // assignFromCString( str, conv ): same as the one-argument overload, but
 // the conversion to wide characters is done by the caller-supplied `conv`
 // (conv.cMB2WC()) instead of wxConvLibc.
 // NOTE(review): the condition guarding the early
 // `return assign( wxUString() )` is not visible in this view; presumably a
 // null check on `str` -- confirm.
 410 wxUString 
&wxUString::assignFromCString( const char* str
, const wxMBConv 
&conv 
) 
 413         return assign( wxUString() ); 
 415     wxWCharBuffer buffer 
= conv
.cMB2WC( str 
); 
 417     return assign( buffer 
); 
 // utf8_str(): encodes the string's wxChar32 contents as UTF-8 and returns
 // them in a wxCharBuffer. The visible code has two sections that must stay
 // in step: the first classifies every code point by range to size the
 // result (`utf8_length`), the second classifies it again the same way and
 // writes the bytes, filling each sequence from its last byte backwards while
 // shifting `code` right by 6 bits per byte. Values above 0x10FFFF are
 // skipped in both sections.
 //
 // Fix: the three-byte range is now tested with `<= 0xFFFF` in both
 // sections. With the previous `< 0xFFFF`, U+FFFF fell through to the
 // four-byte branch and was emitted as the overlong sequence F0 8F BF BF.
 //
 // NOTE(review): the loop headers, the single-byte branch, the statements
 // that add to `utf8_length` / advance `out`, and the final return are not
 // visible in this view.
 420 wxCharBuffer 
wxUString::utf8_str() const 
 422     size_type utf8_length 
= 0; 
 423     const wxChar32 
*ptr 
= data(); 
 // first section: compute the number of bytes needed
 427         wxChar32 code 
= *ptr
; 
 434         else if ( code 
<= 0x07FF ) 
 438         else if ( code 
<= 0xFFFF ) 
 442         else if ( code 
<= 0x10FFFF ) 
 448             // invalid range, skip 
 452     wxCharBuffer 
result( utf8_length 
); 
 454     char *out 
= result
.data(); 
 // second section: emit the bytes; range tests mirror the first section
 459         wxChar32 code 
= *ptr
; 
 467         else if ( code 
<= 0x07FF ) 
 469             out
[1] = 0x80 | (code 
& 0x3F);  code 
>>= 6; 
 470             out
[0] = 0xC0 | code
; 
 473         else if ( code 
<= 0xFFFF ) 
 475             out
[2] = 0x80 | (code 
& 0x3F);  code 
>>= 6; 
 476             out
[1] = 0x80 | (code 
& 0x3F);  code 
>>= 6; 
 477             out
[0] = 0xE0 | code
; 
 480         else if ( code 
<= 0x10FFFF ) 
 482             out
[3] = 0x80 | (code 
& 0x3F);  code 
>>= 6; 
 483             out
[2] = 0x80 | (code 
& 0x3F);  code 
>>= 6; 
 484             out
[1] = 0x80 | (code 
& 0x3F);  code 
>>= 6; 
 485             out
[0] = 0xF0 | code
; 
 490             // invalid range, skip 
 // NOTE(review): the same trace statement appears twice. If these lines are
 // not disabled by a preprocessor guard outside this view, every call prints
 // to stdout -- confirm and remove.
 494     wxPrintf( "utf8_str %s len %d\n", result
, wxStrlen( result
.data() ) ); 
 495     wxPrintf( "utf8_str %s len %d\n", result
, wxStrlen( result
.data() ) ); 
 500 wxU16CharBuffer 
wxUString::utf16_str() const 
 502     size_type utf16_length 
= 0; 
 503     const wxChar32 
*ptr 
= data(); 
 507         wxChar32 code 
= *ptr
; 
 510         // TODO: error range checks 
 518     wxU16CharBuffer 
result( utf16_length 
); 
 519     wxChar16 
*out 
= result
.data(); 
 525         wxChar32 code 
= *ptr
; 
 528         // TODO: error range checks 
 537            out
[0] = (code 
- 0x10000) / 0x400 + 0xd800; 
 538            out
[1] = (code 
- 0x10000) % 0x400 + 0xdc00;