]>
git.saurik.com Git - wxWidgets.git/blob - src/common/ustring.cpp
   1 ///////////////////////////////////////////////////////////////////////////// 
   2 // Name:        src/common/ustring.cpp 
   3 // Purpose:     wxUString class 
   4 // Author:      Robert Roebling 
   7 // Copyright:   (c) 2008 Robert Roebling 
   8 // Licence:     wxWindows licence 
   9 /////////////////////////////////////////////////////////////////////////////// 
  11 // For compilers that support precompilation, includes "wx.h". 
  12 #include "wx/wxprec.h" 
  18 #include "wx/ustring.h" 
  // Assigns this string from the char string `str`.
  //
  // The visible code measures `str` with wxStrlen(), allocates a wxChar32
  // buffer with one element per input char, runs a loop once per char and
  // finally hands the buffer to assign(), whose result is returned.
  //
  // NOTE(review): this listing is a web-page extraction -- the leading numbers
  // are the original file's line numbers, and the braces, the declaration of
  // `i` and the loop body are not visible here.  The loop presumably stores
  // one input char into *ptr per iteration; confirm against the real file.
  25 wxUString 
&wxUString::assignFromAscii( const char *str 
) 
  27    size_type len 
= wxStrlen( str 
); 
  // destination buffer: `len` wxChar32 elements, written through `ptr`
  29    wxU32CharBuffer 
buffer( len 
); 
  30    wxChar32 
*ptr 
= buffer
.data(); 
  // one iteration per input char (loop body not visible in this listing)
  33    for (i 
= 0; i 
< len
; i
++) 
  40    return assign( buffer 
); 
  // Length-limited variant of assignFromAscii(): takes the char string `str`
  // together with a count `n`.
  //
  // The visible code allocates a wxChar32 buffer of `len` elements and runs a
  // loop `len` times with a write pointer into that buffer.
  //
  // NOTE(review): the lines that compute `len` (presumably bounded by `n`),
  // the loop body and the return statement are not visible in this listing,
  // so how `n` limits the conversion and what is returned cannot be stated
  // from here -- check that the real file ends with assign( buffer ) like the
  // overload without `n` does.
  43 wxUString 
&wxUString::assignFromAscii( const char *str
, size_type n 
) 
  // destination buffer: `len` wxChar32 elements, written through `ptr`
  53    wxU32CharBuffer 
buffer( len 
); 
  54    wxChar32 
*ptr 
= buffer
.data(); 
  // one iteration per converted char (loop body not visible in this listing)
  57    for (i 
= 0; i 
< len
; i
++) 
  67 // ---------------------------------------------------------------------------- 
  69 // ---------------------------------------------------------------------------- 
  // This table gives the length in bytes of a UTF-8 sequence, indexed by the
  // sequence's first (lead) byte.  A value of 0 marks a byte that cannot start
  // a valid sequence.  All 256 entries are spelled out so that each value sits
  // at the index named in its row comment.
  const unsigned char tableUtf8Lengths[256] = {
      // single-byte sequences (ASCII):
      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 00..0F
      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 10..1F
      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 20..2F
      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 30..3F
      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 40..4F
      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 50..5F
      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 60..6F
      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 70..7F

      // continuation bytes (10xxxxxx) are invalid as the first byte:
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 80..8F
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 90..9F
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // A0..AF
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // B0..BF

      // C0 and C1 could only start overlong encodings of ASCII characters,
      // so they are invalid too (RFC 3629):
      0, 0,                                            // C0..C1

      // two-byte sequences:
            2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // C2..CF
      2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // D0..DF

      // three-byte sequences:
      3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,  // E0..EF

      // four-byte sequences:
      4, 4, 4, 4, 4,                                   // F0..F4

      // these are invalid again (5- or 6-byte
      // sequences and sequences for code points
      // above U+10FFFF, as restricted by RFC 3629):
                     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0   // F5..FF
  };
 // Assigns this string from the UTF-8 encoded char string `str`.
 //
 // The visible code works in two passes.  The first pass looks up the
 // sequence length of each lead byte in tableUtf8Lengths and counts code
 // points in `ucs4_len`; the second pass decodes each sequence into a
 // wxChar32 buffer of that size, which is finally passed to assign().  On
 // input recognised as invalid the function returns assign( wxUString() ),
 // i.e. this string is assigned a default-constructed wxUString.
 //
 // NOTE(review): this listing is a web-page extraction -- the leading numbers
 // are the original file's line numbers and several lines (braces, loop
 // headers, the declaration of `p`, some branches) are not visible here.
 106 wxUString 
&wxUString::assignFromUTF8( const char *str 
) 
 // early exit; its guard is not visible here (presumably a null `str` check)
 109         return assign( wxUString() ); 
 // pass 1: count the code points so the buffer can be sized exactly
 111     size_type ucs4_len 
= 0; 
 115         unsigned char c 
= *p
; 
 116         size_type len 
= tableUtf8Lengths
[c
]; 
 118            return assign( wxUString() );  // don't try to convert invalid UTF-8 
 // pass 2: decode into a buffer of ucs4_len wxChar32 elements
 123     wxU32CharBuffer 
buffer( ucs4_len 
); 
 124     wxChar32 
*out 
= buffer
.data(); 
 129         unsigned char c 
= *p
; 
 137             size_type len 
= tableUtf8Lengths
[c
];  // len == 0 is caught above 
 139             //   Char. number range   |        UTF-8 octet sequence 
 140             //      (hexadecimal)     |              (binary) 
 141             //  ----------------------+---------------------------------------- 
 142             //  0000 0000 - 0000 007F | 0xxxxxxx 
 143             //  0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx 
 144             //  0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx 
 145             //  0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx 
 147             //  Code point value is stored in bits marked with 'x', 
 148             //  lowest-order bit of the value on the right side in the diagram 
 149             //  above.                                         (from RFC 3629) 
 151             // mask to extract lead byte's value ('x' bits above), by sequence 
 153             static const unsigned char leadValueMask
[] = { 0x7F, 0x1F, 0x0F, 0x07 }; 
 155             // mask and value of lead byte's most significant bits, by length: 
 156             static const unsigned char leadMarkerMask
[] = { 0x80, 0xE0, 0xF0, 0xF8 }; 
 157             static const unsigned char leadMarkerVal
[] = { 0x00, 0xC0, 0xE0, 0xF0 }; 
 // from here on `len` indexes the three 4-entry tables above (0..3)
 159             len
--; // it's more convenient to work with 0-based length here 
 161             // extract the lead byte's value bits: 
 // NOTE(review): the statement run when the marker bits do not match is not
 // visible in this listing -- confirm it rejects the input like the
 // continuation-byte check below does.
 162             if ( (c 
& leadMarkerMask
[len
]) != leadMarkerVal
[len
] ) 
 165             wxChar32 code 
= c 
& leadValueMask
[len
]; 
 167             // all remaining bytes, if any, are handled in the same way 
 168             // regardless of sequence's length: 
 // every continuation byte must have the bit pattern 10xxxxxx
 172                 if ( (c 
& 0xC0) != 0x80 ) 
 173                     return assign( wxUString() );  // don't try to convert invalid UTF-8 
 185     return assign( buffer
.data() ); 
 // Length-limited variant of assignFromUTF8(): decodes the UTF-8 encoded char
 // string `str` while comparing a running position `utf8_pos` against `n`.
 //
 // Same two-pass structure as the overload without `n`: pass 1 looks up
 // sequence lengths in tableUtf8Lengths and counts code points in `ucs4_len`,
 // pass 2 decodes into a wxChar32 buffer of that size, which is passed to
 // assign().  Input recognised as invalid yields assign( wxUString() ).
 //
 // NOTE(review): this listing is a web-page extraction -- loop headers, the
 // declaration of `p`, the updates of `utf8_pos` and the statements executed
 // when a position check fails are not visible here, so the exact unit of `n`
 // and the truncation behaviour cannot be stated from this view (presumably
 // `n` is a byte count and conversion stops at the limit; verify).
 188 wxUString 
&wxUString::assignFromUTF8( const char *str
, size_type n 
) 
 // early exit; its guard is not visible here (presumably a null `str` check)
 191         return assign( wxUString() ); 
 // pass 1: count the code points that fit within the limit
 193     size_type ucs4_len 
= 0; 
 194     size_type utf8_pos 
= 0; 
 198         unsigned char c 
= *p
; 
 199         size_type len 
= tableUtf8Lengths
[c
]; 
 201            return assign( wxUString() );  // don't try to convert invalid UTF-8 
 // would this whole sequence extend beyond position n?
 202         if (utf8_pos 
+ len 
> n
) 
 // pass 2: decode into a buffer of ucs4_len wxChar32 elements
 209     wxU32CharBuffer 
buffer( ucs4_len 
); 
 210     wxChar32 
*out 
= buffer
.data(); 
 216         unsigned char c 
= *p
; 
 // limit check for a one-byte step
 219             if (utf8_pos 
+ 1 > n
) 
 228             size_type len 
= tableUtf8Lengths
[c
];  // len == 0 is caught above 
 // limit check for a multi-byte sequence
 229             if (utf8_pos 
+ len 
> n
) 
 233             //   Char. number range   |        UTF-8 octet sequence 
 234             //      (hexadecimal)     |              (binary) 
 235             //  ----------------------+---------------------------------------- 
 236             //  0000 0000 - 0000 007F | 0xxxxxxx 
 237             //  0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx 
 238             //  0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx 
 239             //  0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx 
 241             //  Code point value is stored in bits marked with 'x', 
 242             //  lowest-order bit of the value on the right side in the diagram 
 243             //  above.                                         (from RFC 3629) 
 245             // mask to extract lead byte's value ('x' bits above), by sequence 
 247             static const unsigned char leadValueMask
[] = { 0x7F, 0x1F, 0x0F, 0x07 }; 
 249             // mask and value of lead byte's most significant bits, by length: 
 250             static const unsigned char leadMarkerMask
[] = { 0x80, 0xE0, 0xF0, 0xF8 }; 
 251             static const unsigned char leadMarkerVal
[] = { 0x00, 0xC0, 0xE0, 0xF0 }; 
 // from here on `len` indexes the three 4-entry tables above (0..3)
 253             len
--; // it's more convenient to work with 0-based length here 
 255             // extract the lead byte's value bits: 
 // NOTE(review): the statement run when the marker bits do not match is not
 // visible in this listing -- confirm it rejects the input like the
 // continuation-byte check below does.
 256             if ( (c 
& leadMarkerMask
[len
]) != leadMarkerVal
[len
] ) 
 259             wxChar32 code 
= c 
& leadValueMask
[len
]; 
 261             // all remaining bytes, if any, are handled in the same way 
 262             // regardless of sequence's length: 
 // every continuation byte must have the bit pattern 10xxxxxx
 266                 if ( (c 
& 0xC0) != 0x80 ) 
 267                     return assign( wxUString() );  // don't try to convert invalid UTF-8 
 281     return assign( buffer
.data() ); 
 // Length-limited UTF-16 conversion: assigns this string from the wxChar16
 // sequence `str` while comparing a running position `utf16_pos` against `n`.
 //
 // The visible code works in two passes.  Pass 1 classifies each unit as a
 // non-surrogate or as the start of a surrogate pair and counts code points
 // in `ucs4_len`; pass 2 writes the decoded values into a wxChar32 buffer of
 // that size, which is passed to assign().  A surrogate that is not followed
 // by a unit in 0xdc00..0xdfff yields assign( wxUString() ).
 //
 // NOTE(review): this listing is a web-page extraction -- loop headers, the
 // declaration of `len`, the position updates and the statements executed
 // when a position check fails are not visible here.
 284 wxUString 
&wxUString::assignFromUTF16( const wxChar16
* str
, size_type n 
) 
 // early exit; its guard is not visible here (presumably a null `str` check)
 287         return assign( wxUString() ); 
 // pass 1: count the code points that fit within the limit
 289     size_type ucs4_len 
= 0; 
 290     size_type utf16_pos 
= 0; 
 291     const wxChar16 
*p 
= str
; 
 // units outside 0xd800..0xdfff are not surrogates
 295         if ((*p 
< 0xd800) || (*p 
> 0xdfff)) 
 // NOTE(review): p[1] is read here before the `utf16_pos + len > n` check
 // below; if the unit at p is the last one inside the limit this looks one
 // unit beyond it -- confirm callers guarantee that unit is readable.
 // NOTE(review): a unit in 0xdc00..0xdfff (low surrogate) in lead position
 // is not rejected by these visible checks -- confirm whether that is
 // intended.
 299         else if ((p
[1] < 0xdc00) || (p
[1] > 0xdfff)) 
 301             return assign( wxUString() );  // don't try to convert invalid UTF-16 
 // would this unit / pair extend beyond position n?
 308         if (utf16_pos 
+ len 
> n
) 
 // pass 2: decode into a buffer of ucs4_len wxChar32 elements
 316     wxU32CharBuffer 
buffer( ucs4_len 
); 
 317     wxChar32 
*out 
= buffer
.data(); 
 324         if ((*p 
< 0xd800) || (*p 
> 0xdfff)) 
 // limit check for a single unit
 326             if (utf16_pos 
+ 1 > n
) 
 // limit check for a surrogate pair
 335             if (utf16_pos 
+ 2 > n
) 
 // 0xd7c0 == 0xd800 - (0x10000 >> 10): subtracting it from the first unit
 // before the shift folds the +0x10000 offset of the supplementary planes
 // into the same expression
 338            *out 
= ((p
[0] - 0xd7c0) << 10) + (p
[1] - 0xdc00); 
 345     return assign( buffer
.data() ); 
 // Assigns this string from the wxChar16 (UTF-16) sequence `str`.
 //
 // The visible code works in two passes.  Pass 1 classifies each unit as a
 // non-surrogate or as the start of a surrogate pair and counts code points
 // in `ucs4_len`; pass 2 writes the decoded values into a wxChar32 buffer of
 // that size, which is passed to assign().  A surrogate that is not followed
 // by a unit in 0xdc00..0xdfff yields assign( wxUString() ).
 //
 // NOTE(review): this listing is a web-page extraction -- loop headers (so
 // also the termination condition, presumably a zero unit), the pointer
 // advances and the non-surrogate branch bodies are not visible here.
 348 wxUString 
&wxUString::assignFromUTF16( const wxChar16
* str 
) 
 // early exit; its guard is not visible here (presumably a null `str` check)
 351         return assign( wxUString() ); 
 // pass 1: count the code points so the buffer can be sized exactly
 353     size_type ucs4_len 
= 0; 
 354     const wxChar16 
*p 
= str
; 
 // units outside 0xd800..0xdfff are not surrogates
 358         if ((*p 
< 0xd800) || (*p 
> 0xdfff)) 
 // NOTE(review): a unit in 0xdc00..0xdfff (low surrogate) in lead position
 // is not rejected by these visible checks -- confirm whether that is
 // intended.
 362         else if ((p
[1] < 0xdc00) || (p
[1] > 0xdfff)) 
 364             return assign( wxUString() );  // don't try to convert invalid UTF-16 
 // pass 2: decode into a buffer of ucs4_len wxChar32 elements
 375     wxU32CharBuffer 
buffer( ucs4_len 
); 
 376     wxChar32 
*out 
= buffer
.data(); 
 381         if ((*p 
< 0xd800) || (*p 
> 0xdfff)) 
 // 0xd7c0 == 0xd800 - (0x10000 >> 10): subtracting it from the first unit
 // before the shift folds the +0x10000 offset of the supplementary planes
 // into the same expression
 388            *out 
= ((p
[0] - 0xd7c0) << 10) + (p
[1] - 0xdc00); 
 394     return assign( buffer
.data() ); 
 // Assigns this string from the multibyte char string `str`, converting it
 // to wide characters with wxConvLibc.cMB2WC() and passing the resulting
 // wxWCharBuffer to assign().
 //
 // NOTE(review): this listing is a web-page extraction -- the braces and the
 // guard in front of the first return are not visible here (presumably a
 // null `str` check that assigns a default-constructed wxUString).
 397 wxUString 
&wxUString::assignFromCString( const char* str 
) 
 400         return assign( wxUString() ); 
 // multibyte -> wide char conversion through the wxConvLibc converter
 402     wxWCharBuffer buffer 
= wxConvLibc
.cMB2WC( str 
); 
 404     return assign( buffer 
); 
 // Assigns this string from the multibyte char string `str`, converting it
 // to wide characters with the caller-supplied converter `conv` (via
 // conv.cMB2WC()) and passing the resulting wxWCharBuffer to assign().
 //
 // NOTE(review): this listing is a web-page extraction -- the braces and the
 // guard in front of the first return are not visible here (presumably a
 // null `str` check that assigns a default-constructed wxUString).
 407 wxUString 
&wxUString::assignFromCString( const char* str
, const wxMBConv 
&conv 
) 
 410         return assign( wxUString() ); 
 // multibyte -> wide char conversion through the given converter
 412     wxWCharBuffer buffer 
= conv
.cMB2WC( str 
); 
 414     return assign( buffer 
); 
 417 wxCharBuffer 
wxUString::utf8_str() const 
 419     size_type utf8_length 
= 0; 
 420     const wxChar32 
*ptr 
= data(); 
 424         wxChar32 code 
= *ptr
; 
 431         else if ( code 
<= 0x07FF ) 
 435         else if ( code 
< 0xFFFF ) 
 439         else if ( code 
<= 0x10FFFF ) 
 445             // invalid range, skip 
 449     wxCharBuffer 
result( utf8_length 
); 
 451     char *out 
= result
.data(); 
 456         wxChar32 code 
= *ptr
; 
 464         else if ( code 
<= 0x07FF ) 
 466             out
[1] = 0x80 | (code 
& 0x3F);  code 
>>= 6; 
 467             out
[0] = 0xC0 | code
; 
 470         else if ( code 
< 0xFFFF ) 
 472             out
[2] = 0x80 | (code 
& 0x3F);  code 
>>= 6; 
 473             out
[1] = 0x80 | (code 
& 0x3F);  code 
>>= 6; 
 474             out
[0] = 0xE0 | code
; 
 477         else if ( code 
<= 0x10FFFF ) 
 479             out
[3] = 0x80 | (code 
& 0x3F);  code 
>>= 6; 
 480             out
[2] = 0x80 | (code 
& 0x3F);  code 
>>= 6; 
 481             out
[1] = 0x80 | (code 
& 0x3F);  code 
>>= 6; 
 482             out
[0] = 0xF0 | code
; 
 487             // invalid range, skip 
 491     wxPrintf( "utf8_str %s len %d\n", result
, wxStrlen( result
.data() ) ); 
 492     wxPrintf( "utf8_str %s len %d\n", result
, wxStrlen( result
.data() ) ); 
 497 wxU16CharBuffer 
wxUString::utf16_str() const 
 499     size_type utf16_length 
= 0; 
 500     const wxChar32 
*ptr 
= data(); 
 504         wxChar32 code 
= *ptr
; 
 507         // TODO: error range checks 
 515     wxU16CharBuffer 
result( utf16_length 
); 
 516     wxChar16 
*out 
= result
.data(); 
 522         wxChar32 code 
= *ptr
; 
 525         // TODO: error range checks 
 534            out
[0] = (code 
- 0x10000) / 0x400 + 0xd800; 
 535            out
[1] = (code 
- 0x10000) % 0x400 + 0xdc00;