1 ///////////////////////////////////////////////////////////////////////////// 
   2 // Name:        src/common/strconv.cpp 
   3 // Purpose:     Unicode conversion classes 
   4 // Author:      Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik, 
   5 //              Ryan Norton, Fredrik Roubert (UTF7) 
   9 // Copyright:   (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik 
  10 //              (c) 2000-2003 Vadim Zeitlin 
  11 //              (c) 2004 Ryan Norton, Fredrik Roubert 
  12 // Licence:     wxWindows licence 
  13 ///////////////////////////////////////////////////////////////////////////// 
  15 // For compilers that support precompilation, includes "wx.h". 
  16 #include "wx/wxprec.h" 
  26     #include "wx/hashmap.h" 
  29 #include "wx/strconv.h" 
  41 #if defined(__WIN32__) && !defined(__WXMICROWIN__) 
  42     #include "wx/msw/private.h" 
  43     #include "wx/msw/missing.h" 
  44     #define wxHAVE_WIN32_MB2WC 
  53     #include "wx/thread.h" 
  56 #include "wx/encconv.h" 
  57 #include "wx/fontmap.h" 
  60 #include "wx/mac/corefoundation/private/strconv_cf.h" 
  61 #endif //def __DARWIN__ 
  64 #define TRACE_STRCONV _T("strconv") 
  66 // WC_UTF16 is defined only if sizeof(wchar_t) == 2, otherwise it's supposed to 
  68 #if SIZEOF_WCHAR_T == 2 
  73 // ============================================================================ 
  75 // ============================================================================ 
  77 // helper function of cMB2WC(): check if n bytes at this location are all NUL 
  78 static bool NotAllNULs(const char *p
, size_t n
) 
  80     while ( n 
&& *p
++ == '\0' ) 
  86 // ---------------------------------------------------------------------------- 
  87 // UTF-16 en/decoding to/from UCS-4 with surrogates handling 
  88 // ---------------------------------------------------------------------------- 
  90 static size_t encode_utf16(wxUint32 input
, wxUint16 
*output
) 
  95             *output 
= (wxUint16
) input
; 
  99     else if (input 
>= 0x110000) 
 101         return wxCONV_FAILED
; 
 107             *output
++ = (wxUint16
) ((input 
>> 10) + 0xd7c0); 
 108             *output 
= (wxUint16
) ((input 
& 0x3ff) + 0xdc00); 
 115 static size_t decode_utf16(const wxUint16
* input
, wxUint32
& output
) 
 117     if ((*input 
< 0xd800) || (*input 
> 0xdfff)) 
 122     else if ((input
[1] < 0xdc00) || (input
[1] > 0xdfff)) 
 125         return wxCONV_FAILED
; 
 129         output 
= ((input
[0] - 0xd7c0) << 10) + (input
[1] - 0xdc00); 
 135     typedef wchar_t wxDecodeSurrogate_t
; 
 137     typedef wxUint16 wxDecodeSurrogate_t
; 
 138 #endif // WC_UTF16/!WC_UTF16 
 140 // returns the next UTF-32 character from the wchar_t buffer and advances the 
 141 // pointer to the character after this one 
 143 // if an invalid character is found, *pSrc is set to NULL, the caller must 
 145 static wxUint32 
wxDecodeSurrogate(const wxDecodeSurrogate_t 
**pSrc
) 
 149         n 
= decode_utf16(wx_reinterpret_cast(const wxUint16 
*, *pSrc
), out
); 
 150     if ( n 
== wxCONV_FAILED 
) 
 158 // ---------------------------------------------------------------------------- 
 160 // ---------------------------------------------------------------------------- 
 163 wxMBConv::ToWChar(wchar_t *dst
, size_t dstLen
, 
 164                   const char *src
, size_t srcLen
) const 
 166     // although new conversion classes are supposed to implement this function 
 167     // directly, the existing ones only implement the old MB2WC() and so, to 
 168     // avoid having to rewrite all conversion classes at once, we provide a 
 169     // default (but not efficient) implementation of this one in terms of the 
 170     // old function by copying the input to ensure that it's NUL-terminated and 
 171     // then using MB2WC() to convert it 
 173     // the number of chars [which would be] written to dst [if it were not NULL] 
 174     size_t dstWritten 
= 0; 
 176     // the number of NULs terminating this string 
 177     size_t nulLen 
= 0;  // not really needed, but just to avoid warnings 
 179     // if we were not given the input size we just have to assume that the 
 180     // string is properly terminated as we have no way of knowing how long it 
 181     // is anyhow, but if we do have the size check whether there are enough 
 185     if ( srcLen 
!= wxNO_LEN 
) 
 187         // we need to know how to find the end of this string 
 188         nulLen 
= GetMBNulLen(); 
 189         if ( nulLen 
== wxCONV_FAILED 
) 
 190             return wxCONV_FAILED
; 
 192         // if there are enough NULs we can avoid the copy 
 193         if ( srcLen 
< nulLen 
|| NotAllNULs(src 
+ srcLen 
- nulLen
, nulLen
) ) 
 195             // make a copy in order to properly NUL-terminate the string 
 196             bufTmp 
= wxCharBuffer(srcLen 
+ nulLen 
- 1 /* 1 will be added */); 
 197             char * const p 
= bufTmp
.data(); 
 198             memcpy(p
, src
, srcLen
); 
 199             for ( char *s 
= p 
+ srcLen
; s 
< p 
+ srcLen 
+ nulLen
; s
++ ) 
 205         srcEnd 
= src 
+ srcLen
; 
 207     else // quit after the first loop iteration 
 214         // try to convert the current chunk 
 215         size_t lenChunk 
= MB2WC(NULL
, src
, 0); 
 216         if ( lenChunk 
== wxCONV_FAILED 
) 
 217             return wxCONV_FAILED
; 
 219         lenChunk
++; // for the L'\0' at the end of this chunk 
 221         dstWritten 
+= lenChunk
; 
 225             // nothing left in the input string, conversion succeeded 
 231             if ( dstWritten 
> dstLen 
) 
 232                 return wxCONV_FAILED
; 
 234             if ( MB2WC(dst
, src
, lenChunk
) == wxCONV_FAILED 
) 
 235                 return wxCONV_FAILED
; 
 242             // we convert just one chunk in this case as this is the entire 
 247         // advance the input pointer past the end of this chunk 
 248         while ( NotAllNULs(src
, nulLen
) ) 
 250             // notice that we must skip over multiple bytes here as we suppose 
 251             // that if NUL takes 2 or 4 bytes, then all the other characters do 
 252             // too and so if advanced by a single byte we might erroneously 
 253             // detect sequences of NUL bytes in the middle of the input 
 257         src 
+= nulLen
; // skipping over its terminator as well 
 259         // note that ">=" (and not just "==") is needed here as the terminator 
 260         // we skipped just above could be inside or just after the buffer 
 261         // delimited by srcEnd 
 270 wxMBConv::FromWChar(char *dst
, size_t dstLen
, 
 271                     const wchar_t *src
, size_t srcLen
) const 
 273     // the number of chars [which would be] written to dst [if it were not NULL] 
 274     size_t dstWritten 
= 0; 
 276     // make a copy of the input string unless it is already properly 
 279     // if we don't know its length we have no choice but to assume that it is, 
 280     // indeed, properly terminated 
 281     wxWCharBuffer bufTmp
; 
 282     if ( srcLen 
== wxNO_LEN 
) 
 284         srcLen 
= wxWcslen(src
) + 1; 
 286     else if ( srcLen 
!= 0 && src
[srcLen 
- 1] != L
'\0' ) 
 288         // make a copy in order to properly NUL-terminate the string 
 289         bufTmp 
= wxWCharBuffer(srcLen
); 
 290         memcpy(bufTmp
.data(), src
, srcLen 
* sizeof(wchar_t)); 
 294     const size_t lenNul 
= GetMBNulLen(); 
 295     for ( const wchar_t * const srcEnd 
= src 
+ srcLen
; 
 297           src 
+= wxWcslen(src
) + 1 /* skip L'\0' too */ ) 
 299         // try to convert the current chunk 
 300         size_t lenChunk 
= WC2MB(NULL
, src
, 0); 
 302         if ( lenChunk 
== wxCONV_FAILED 
) 
 303             return wxCONV_FAILED
; 
 306         dstWritten 
+= lenChunk
; 
 310             if ( dstWritten 
> dstLen 
) 
 311                 return wxCONV_FAILED
; 
 313             if ( WC2MB(dst
, src
, lenChunk
) == wxCONV_FAILED 
) 
 314                 return wxCONV_FAILED
; 
 323 size_t wxMBConv::MB2WC(wchar_t *outBuff
, const char *inBuff
, size_t outLen
) const 
 325     size_t rc 
= ToWChar(outBuff
, outLen
, inBuff
); 
 326     if ( rc 
!= wxCONV_FAILED 
) 
 328         // ToWChar() returns the buffer length, i.e. including the trailing 
 329         // NUL, while this method doesn't take it into account 
 336 size_t wxMBConv::WC2MB(char *outBuff
, const wchar_t *inBuff
, size_t outLen
) const 
 338     size_t rc 
= FromWChar(outBuff
, outLen
, inBuff
); 
 339     if ( rc 
!= wxCONV_FAILED 
) 
 347 wxMBConv::~wxMBConv() 
 349     // nothing to do here (necessary for Darwin linking probably) 
 352 const wxWCharBuffer 
wxMBConv::cMB2WC(const char *psz
) const 
 356         // calculate the length of the buffer needed first 
 357         const size_t nLen 
= ToWChar(NULL
, 0, psz
); 
 358         if ( nLen 
!= wxCONV_FAILED 
) 
 360             // now do the actual conversion 
 361             wxWCharBuffer 
buf(nLen 
- 1 /* +1 added implicitly */); 
 363             // +1 for the trailing NUL 
 364             if ( ToWChar(buf
.data(), nLen
, psz
) != wxCONV_FAILED 
) 
 369     return wxWCharBuffer(); 
 372 const wxCharBuffer 
wxMBConv::cWC2MB(const wchar_t *pwz
) const 
 376         const size_t nLen 
= FromWChar(NULL
, 0, pwz
); 
 377         if ( nLen 
!= wxCONV_FAILED 
) 
 379             wxCharBuffer 
buf(nLen 
- 1); 
 380             if ( FromWChar(buf
.data(), nLen
, pwz
) != wxCONV_FAILED 
) 
 385     return wxCharBuffer(); 
 389 wxMBConv::cMB2WC(const char *inBuff
, size_t inLen
, size_t *outLen
) const 
 391     const size_t dstLen 
= ToWChar(NULL
, 0, inBuff
, inLen
); 
 392     if ( dstLen 
!= wxCONV_FAILED 
) 
 394         wxWCharBuffer 
wbuf(dstLen 
- 1); 
 395         if ( ToWChar(wbuf
.data(), dstLen
, inBuff
, inLen
) != wxCONV_FAILED 
) 
 400                 if ( wbuf
[dstLen 
- 1] == L
'\0' ) 
 411     return wxWCharBuffer(); 
 415 wxMBConv::cWC2MB(const wchar_t *inBuff
, size_t inLen
, size_t *outLen
) const 
 417     size_t dstLen 
= FromWChar(NULL
, 0, inBuff
, inLen
); 
 418     if ( dstLen 
!= wxCONV_FAILED 
) 
 420         // special case of empty input: can't allocate 0 size buffer below as 
 421         // wxCharBuffer insists on NUL-terminating it 
 422         wxCharBuffer 
buf(dstLen 
? dstLen 
- 1 : 1); 
 423         if ( FromWChar(buf
.data(), dstLen
, inBuff
, inLen
) != wxCONV_FAILED 
) 
 429                 const size_t nulLen 
= GetMBNulLen(); 
 430                 if ( dstLen 
>= nulLen 
&& 
 431                         !NotAllNULs(buf
.data() + dstLen 
- nulLen
, nulLen
) ) 
 433                     // in this case the output is NUL-terminated and we're not 
 434                     // supposed to count NUL 
 446     return wxCharBuffer(); 
 449 // ---------------------------------------------------------------------------- 
 451 // ---------------------------------------------------------------------------- 
 453 size_t wxMBConvLibc::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const 
 455     return wxMB2WC(buf
, psz
, n
); 
 458 size_t wxMBConvLibc::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const 
 460     return wxWC2MB(buf
, psz
, n
); 
 463 // ---------------------------------------------------------------------------- 
 464 // wxConvBrokenFileNames 
 465 // ---------------------------------------------------------------------------- 
 469 wxConvBrokenFileNames::wxConvBrokenFileNames(const wxString
& charset
) 
 471     if ( wxStricmp(charset
, _T("UTF-8")) == 0 || 
 472          wxStricmp(charset
, _T("UTF8")) == 0  ) 
 473         m_conv 
= new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_PUA
); 
 475         m_conv 
= new wxCSConv(charset
); 
 480 // ---------------------------------------------------------------------------- 
 482 // ---------------------------------------------------------------------------- 
 484 // Implementation (C) 2004 Fredrik Roubert 
 487 // BASE64 decoding table 
 489 static const unsigned char utf7unb64
[] = 
 491     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 
 492     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 
 493     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 
 494     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 
 495     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 
 496     0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f, 
 497     0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 
 498     0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 
 499     0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 
 500     0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 
 501     0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 
 502     0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff, 
 503     0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20, 
 504     0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 
 505     0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30, 
 506     0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff, 
 507     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 
 508     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 
 509     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 
 510     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 
 511     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 
 512     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 
 513     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 
 514     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 
 515     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 
 516     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 
 517     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 
 518     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 
 519     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 
 520     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 
 521     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 
 522     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff 
 525 size_t wxMBConvUTF7::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const 
 529     while ( *psz 
&& (!buf 
|| (len 
< n
)) ) 
 531         unsigned char cc 
= *psz
++; 
 539         else if (*psz 
== '-') 
 547         else // start of BASE64 encoded string 
 551             for ( ok 
= lsb 
= false, d 
= 0, l 
= 0; 
 552                   (cc 
= utf7unb64
[(unsigned char)*psz
]) != 0xff; 
 557                 for (l 
+= 6; l 
>= 8; lsb 
= !lsb
) 
 559                     unsigned char c 
= (unsigned char)((d 
>> (l 
-= 8)) % 256); 
 569                             *buf 
= (wchar_t)(c 
<< 8); 
 578                 // in valid UTF7 we should have valid characters after '+' 
 579                 return wxCONV_FAILED
; 
 587     if ( buf 
&& (len 
< n
) ) 
 594 // BASE64 encoding table 
 596 static const unsigned char utf7enb64
[] = 
 598     'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 
 599     'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 
 600     'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 
 601     'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 
 602     'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 
 603     'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 
 604     'w', 'x', 'y', 'z', '0', '1', '2', '3', 
 605     '4', '5', '6', '7', '8', '9', '+', '/' 
 609 // UTF-7 encoding table 
 611 // 0 - Set D (directly encoded characters) 
 612 // 1 - Set O (optional direct characters) 
 613 // 2 - whitespace characters (optional) 
 614 // 3 - special characters 
 616 static const unsigned char utf7encode
[128] = 
 618     3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3, 
 619     3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 
 620     2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3, 
 621     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 
 622     1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
 623     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1, 
 624     1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
 625     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3 
 628 size_t wxMBConvUTF7::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const 
 632     while (*psz 
&& ((!buf
) || (len 
< n
))) 
 635         if (cc 
< 0x80 && utf7encode
[cc
] < 1) 
 644         else if (((wxUint32
)cc
) > 0xffff) 
 646             // no surrogate pair generation (yet?) 
 647             return wxCONV_FAILED
; 
 658                 // BASE64 encode string 
 659                 unsigned int lsb
, d
, l
; 
 660                 for (d 
= 0, l 
= 0; /*nothing*/; psz
++) 
 662                     for (lsb 
= 0; lsb 
< 2; lsb 
++) 
 665                         d 
+= lsb 
? cc 
& 0xff : (cc 
& 0xff00) >> 8; 
 667                         for (l 
+= 8; l 
>= 6; ) 
 671                                 *buf
++ = utf7enb64
[(d 
>> l
) % 64]; 
 677                     if (!(cc
) || (cc 
< 0x80 && utf7encode
[cc
] < 1)) 
 684                         *buf
++ = utf7enb64
[((d 
% 16) << (6 - l
)) % 64]; 
 696     if (buf 
&& (len 
< n
)) 
 702 // ---------------------------------------------------------------------------- 
 704 // ---------------------------------------------------------------------------- 
 706 static wxUint32 utf8_max
[]= 
 707     { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff }; 
 709 // boundaries of the private use area we use to (temporarily) remap bytes 
 710 // that are invalid in a UTF-8 encoded string 
 711 const wxUint32 wxUnicodePUA 
= 0x100000; 
 712 const wxUint32 wxUnicodePUAEnd 
= wxUnicodePUA 
+ 256; 
 714 // this table gives the length of the UTF-8 encoding from its first character: 
 715 unsigned char tableUtf8Lengths
[256] = { 
 716     // single-byte sequences (ASCII): 
 717     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 00..0F 
 718     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 10..1F 
 719     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 20..2F 
 720     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 30..3F 
 721     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 40..4F 
 722     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 50..5F 
 723     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 60..6F 
 724     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 70..7F 
 726     // these are invalid: 
 727     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 80..8F 
 728     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 90..9F 
 729     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // A0..AF 
 730     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // B0..BF 
 733     // two-byte sequences: 
 734           2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // C2..CF 
 735     2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // D0..DF 
 737     // three-byte sequences: 
 738     3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,  // E0..EF 
 740     // four-byte sequences: 
 741     4, 4, 4, 4, 4,                                   // F0..F4 
 743     // these are invalid again (5- or 6-byte 
 744     // sequences and sequences for code points 
 745     // above U+10FFFF, as restricted by RFC 3629): 
 746                    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0   // F5..FF 
 750 wxMBConvStrictUTF8::ToWChar(wchar_t *dst
, size_t dstLen
, 
 751                             const char *src
, size_t srcLen
) const 
 753     wchar_t *out 
= dstLen 
? dst 
: NULL
; 
 756     if ( srcLen 
== wxNO_LEN 
) 
 757         srcLen 
= strlen(src
) + 1; 
 759     for ( const char *p 
= src
; ; p
++ ) 
 761         if ( !(srcLen 
== wxNO_LEN 
? *p 
: srcLen
) ) 
 763             // all done successfully, just add the trailing NUL if we are not 
 764             // using explicit length 
 765             if ( srcLen 
== wxNO_LEN 
) 
 781         unsigned char c 
= *p
; 
 782         unsigned len 
= tableUtf8Lengths
[c
]; 
 786         if ( srcLen 
< len 
) // the test works for wxNO_LEN too 
 789         if ( srcLen 
!= wxNO_LEN 
) 
 792         if ( out 
&& !dstLen
-- ) 
 796         //   Char. number range   |        UTF-8 octet sequence 
 797         //      (hexadecimal)     |              (binary) 
 798         //  ----------------------+--------------------------------------------- 
 799         //  0000 0000 - 0000 007F | 0xxxxxxx 
 800         //  0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx 
 801         //  0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx 
 802         //  0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx 
 804         //  Code point value is stored in bits marked with 'x', lowest-order bit 
 805         //  of the value on the right side in the diagram above. 
 808         // mask to extract lead byte's value ('x' bits above), by sequence length: 
 809         static const unsigned char leadValueMask
[] = { 0x7F, 0x1F, 0x0F, 0x07 }; 
 811         // mask and value of lead byte's most significant bits, by length: 
 812         static const unsigned char leadMarkerMask
[] = { 0x80, 0xE0, 0xF0, 0xF8 }; 
 813         static const unsigned char leadMarkerVal
[] = { 0x00, 0xC0, 0xE0, 0xF0 }; 
 815         len
--; // it's more convenient to work with 0-based length here 
 817         // extract the lead byte's value bits: 
 818         if ( (c 
& leadMarkerMask
[len
]) != leadMarkerVal
[len
] ) 
 821         wxUint32 code 
= c 
& leadValueMask
[len
]; 
 823         // all remaining bytes, if any, are handled in the same way regardless of 
 824         // sequence's length: 
 828             if ( (c 
& 0xC0) != 0x80 ) 
 829                 return wxCONV_FAILED
; 
 836         // cast is ok because wchar_t == wxUint16 if WC_UTF16 
 837         if ( encode_utf16(code
, (wxUint16 
*)out
) == 2 ) 
 846 #endif // WC_UTF16/!WC_UTF16 
 854     return wxCONV_FAILED
; 
 858 wxMBConvStrictUTF8::FromWChar(char *dst
, size_t dstLen
, 
 859                               const wchar_t *src
, size_t srcLen
) const 
 861     char *out 
= dstLen 
? dst 
: NULL
; 
 864     for ( const wchar_t *wp 
= src
; ; wp
++ ) 
 866         if ( !(srcLen 
== wxNO_LEN 
? *wp 
: srcLen
--) ) 
 868             // all done successfully, just add the trailing NUL if we are not 
 869             // using explicit length 
 870             if ( srcLen 
== wxNO_LEN 
) 
 889         // cast is ok for WC_UTF16 
 890         if ( decode_utf16((const wxUint16 
*)wp
, code
) == 2 ) 
 892             // skip the next char too as we decoded a surrogate 
 895 #else // wchar_t is UTF-32 
 896         code 
= *wp 
& 0x7fffffff; 
 911         else if ( code 
<= 0x07FF ) 
 919                 // NB: this line takes 6 least significant bits, encodes them as 
 920                 // 10xxxxxx and discards them so that the next byte can be encoded: 
 921                 out
[1] = 0x80 | (code 
& 0x3F);  code 
>>= 6; 
 922                 out
[0] = 0xC0 | code
; 
 925         else if ( code 
< 0xFFFF ) 
 933                 out
[2] = 0x80 | (code 
& 0x3F);  code 
>>= 6; 
 934                 out
[1] = 0x80 | (code 
& 0x3F);  code 
>>= 6; 
 935                 out
[0] = 0xE0 | code
; 
 938         else if ( code 
<= 0x10FFFF ) 
 946                 out
[3] = 0x80 | (code 
& 0x3F);  code 
>>= 6; 
 947                 out
[2] = 0x80 | (code 
& 0x3F);  code 
>>= 6; 
 948                 out
[1] = 0x80 | (code 
& 0x3F);  code 
>>= 6; 
 949                 out
[0] = 0xF0 | code
; 
 954             wxFAIL_MSG( _T("trying to encode undefined Unicode character") ); 
 967     // we only get here if an error occurs during decoding 
 968     return wxCONV_FAILED
; 
 971 size_t wxMBConvUTF8::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const 
 973     if ( m_options 
== MAP_INVALID_UTF8_NOT 
) 
 974         return wxMBConvStrictUTF8::MB2WC(buf
, psz
, n
); 
 978     while (*psz 
&& ((!buf
) || (len 
< n
))) 
 980         const char *opsz 
= psz
; 
 981         bool invalid 
= false; 
 982         unsigned char cc 
= *psz
++, fc 
= cc
; 
 984         for (cnt 
= 0; fc 
& 0x80; cnt
++) 
 994             // escape the escape character for octal escapes 
 995             if ((m_options 
& MAP_INVALID_UTF8_TO_OCTAL
) 
 996                     && cc 
== '\\' && (!buf 
|| len 
< n
)) 
1008                 // invalid UTF-8 sequence 
1013                 unsigned ocnt 
= cnt 
- 1; 
1014                 wxUint32 res 
= cc 
& (0x3f >> cnt
); 
1018                     if ((cc 
& 0xC0) != 0x80) 
1020                         // invalid UTF-8 sequence 
1026                     res 
= (res 
<< 6) | (cc 
& 0x3f); 
1029                 if (invalid 
|| res 
<= utf8_max
[ocnt
]) 
1031                     // illegal UTF-8 encoding 
1034                 else if ((m_options 
& MAP_INVALID_UTF8_TO_PUA
) && 
1035                         res 
>= wxUnicodePUA 
&& res 
< wxUnicodePUAEnd
) 
1037                     // if one of our PUA characters turns up externally 
1038                     // it must also be treated as an illegal sequence 
1039                     // (a bit like you have to escape an escape character) 
1045                     // cast is ok because wchar_t == wxUint16 if WC_UTF16 
1046                     size_t pa 
= encode_utf16(res
, (wxUint16 
*)buf
); 
1047                     if (pa 
== wxCONV_FAILED
) 
1059                         *buf
++ = (wchar_t)res
; 
1061 #endif // WC_UTF16/!WC_UTF16 
1067                 if (m_options 
& MAP_INVALID_UTF8_TO_PUA
) 
1069                     while (opsz 
< psz 
&& (!buf 
|| len 
< n
)) 
1072                         // cast is ok because wchar_t == wxUint16 if WC_UTF16 
1073                         size_t pa 
= encode_utf16((unsigned char)*opsz 
+ wxUnicodePUA
, (wxUint16 
*)buf
); 
1074                         wxASSERT(pa 
!= wxCONV_FAILED
); 
1081                             *buf
++ = (wchar_t)(wxUnicodePUA 
+ (unsigned char)*opsz
); 
1087                 else if (m_options 
& MAP_INVALID_UTF8_TO_OCTAL
) 
1089                     while (opsz 
< psz 
&& (!buf 
|| len 
< n
)) 
1091                         if ( buf 
&& len 
+ 3 < n 
) 
1093                             unsigned char on 
= *opsz
; 
1095                             *buf
++ = (wchar_t)( L
'0' + on 
/ 0100 ); 
1096                             *buf
++ = (wchar_t)( L
'0' + (on 
% 0100) / 010 ); 
1097                             *buf
++ = (wchar_t)( L
'0' + on 
% 010 ); 
1104                 else // MAP_INVALID_UTF8_NOT 
1106                     return wxCONV_FAILED
; 
1112     if (buf 
&& (len 
< n
)) 
1118 static inline bool isoctal(wchar_t wch
) 
1120     return L
'0' <= wch 
&& wch 
<= L
'7'; 
1123 size_t wxMBConvUTF8::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const 
1125     if ( m_options 
== MAP_INVALID_UTF8_NOT 
) 
1126         return wxMBConvStrictUTF8::WC2MB(buf
, psz
, n
); 
1130     while (*psz 
&& ((!buf
) || (len 
< n
))) 
1135         // cast is ok for WC_UTF16 
1136         size_t pa 
= decode_utf16((const wxUint16 
*)psz
, cc
); 
1137         psz 
+= (pa 
== wxCONV_FAILED
) ? 1 : pa
; 
1139         cc 
= (*psz
++) & 0x7fffffff; 
1142         if ( (m_options 
& MAP_INVALID_UTF8_TO_PUA
) 
1143                 && cc 
>= wxUnicodePUA 
&& cc 
< wxUnicodePUAEnd 
) 
1146                 *buf
++ = (char)(cc 
- wxUnicodePUA
); 
1149         else if ( (m_options 
& MAP_INVALID_UTF8_TO_OCTAL
) 
1150                     && cc 
== L
'\\' && psz
[0] == L
'\\' ) 
1157         else if ( (m_options 
& MAP_INVALID_UTF8_TO_OCTAL
) && 
1159                         isoctal(psz
[0]) && isoctal(psz
[1]) && isoctal(psz
[2]) ) 
1163                 *buf
++ = (char) ((psz
[0] - L
'0') * 0100 + 
1164                                  (psz
[1] - L
'0') * 010 + 
1174             for (cnt 
= 0; cc 
> utf8_max
[cnt
]; cnt
++) 
1190                     *buf
++ = (char) ((-128 >> cnt
) | ((cc 
>> (cnt 
* 6)) & (0x3f >> cnt
))); 
1192                         *buf
++ = (char) (0x80 | ((cc 
>> (cnt 
* 6)) & 0x3f)); 
1198     if (buf 
&& (len 
< n
)) 
1204 // ============================================================================ 
1206 // ============================================================================ 
1208 #ifdef WORDS_BIGENDIAN 
1209     #define wxMBConvUTF16straight wxMBConvUTF16BE 
1210     #define wxMBConvUTF16swap     wxMBConvUTF16LE 
1212     #define wxMBConvUTF16swap     wxMBConvUTF16BE 
1213     #define wxMBConvUTF16straight wxMBConvUTF16LE 
1217 size_t wxMBConvUTF16Base::GetLength(const char *src
, size_t srcLen
) 
1219     if ( srcLen 
== wxNO_LEN 
) 
1221         // count the number of bytes in input, including the trailing NULs 
1222         const wxUint16 
*inBuff 
= wx_reinterpret_cast(const wxUint16 
*, src
); 
1223         for ( srcLen 
= 1; *inBuff
++; srcLen
++ ) 
1226         srcLen 
*= BYTES_PER_CHAR
; 
1228     else // we already have the length 
1230         // we can only convert an entire number of UTF-16 characters 
1231         if ( srcLen 
% BYTES_PER_CHAR 
) 
1232             return wxCONV_FAILED
; 
1238 // case when in-memory representation is UTF-16 too 
1241 // ---------------------------------------------------------------------------- 
1242 // conversions without endianness change 
1243 // ---------------------------------------------------------------------------- 
1246 wxMBConvUTF16straight::ToWChar(wchar_t *dst
, size_t dstLen
, 
1247                                const char *src
, size_t srcLen
) const 
1249     // set up the scene for using memcpy() (which is presumably more efficient 
1250     // than copying the bytes one by one) 
1251     srcLen 
= GetLength(src
, srcLen
); 
1252     if ( srcLen 
== wxNO_LEN 
) 
1253         return wxCONV_FAILED
; 
1255     const size_t inLen 
= srcLen 
/ BYTES_PER_CHAR
; 
1258         if ( dstLen 
< inLen 
) 
1259             return wxCONV_FAILED
; 
1261         memcpy(dst
, src
, srcLen
); 
1268 wxMBConvUTF16straight::FromWChar(char *dst
, size_t dstLen
, 
1269                                  const wchar_t *src
, size_t srcLen
) const 
1271     if ( srcLen 
== wxNO_LEN 
) 
1272         srcLen 
= wxWcslen(src
) + 1; 
1274     srcLen 
*= BYTES_PER_CHAR
; 
1278         if ( dstLen 
< srcLen 
) 
1279             return wxCONV_FAILED
; 
1281         memcpy(dst
, src
, srcLen
); 
1287 // ---------------------------------------------------------------------------- 
1288 // endian-reversing conversions 
1289 // ---------------------------------------------------------------------------- 
1292 wxMBConvUTF16swap::ToWChar(wchar_t *dst
, size_t dstLen
, 
1293                            const char *src
, size_t srcLen
) const 
1295     srcLen 
= GetLength(src
, srcLen
); 
1296     if ( srcLen 
== wxNO_LEN 
) 
1297         return wxCONV_FAILED
; 
1299     srcLen 
/= BYTES_PER_CHAR
; 
1303         if ( dstLen 
< srcLen 
) 
1304             return wxCONV_FAILED
; 
1306         const wxUint16 
*inBuff 
= wx_reinterpret_cast(const wxUint16 
*, src
); 
1307         for ( size_t n 
= 0; n 
< srcLen
; n
++, inBuff
++ ) 
1309             *dst
++ = wxUINT16_SWAP_ALWAYS(*inBuff
); 
1317 wxMBConvUTF16swap::FromWChar(char *dst
, size_t dstLen
, 
1318                              const wchar_t *src
, size_t srcLen
) const 
1320     if ( srcLen 
== wxNO_LEN 
) 
1321         srcLen 
= wxWcslen(src
) + 1; 
1323     srcLen 
*= BYTES_PER_CHAR
; 
1327         if ( dstLen 
< srcLen 
) 
1328             return wxCONV_FAILED
; 
1330         wxUint16 
*outBuff 
= wx_reinterpret_cast(wxUint16 
*, dst
); 
1331         for ( size_t n 
= 0; n 
< srcLen
; n 
+= BYTES_PER_CHAR
, src
++ ) 
1333             *outBuff
++ = wxUINT16_SWAP_ALWAYS(*src
); 
1340 #else // !WC_UTF16: wchar_t is UTF-32 
1342 // ---------------------------------------------------------------------------- 
1343 // conversions without endianness change 
1344 // ---------------------------------------------------------------------------- 
1347 wxMBConvUTF16straight::ToWChar(wchar_t *dst
, size_t dstLen
, 
1348                                const char *src
, size_t srcLen
) const 
1350     srcLen 
= GetLength(src
, srcLen
); 
1351     if ( srcLen 
== wxNO_LEN 
) 
1352         return wxCONV_FAILED
; 
1354     const size_t inLen 
= srcLen 
/ BYTES_PER_CHAR
; 
1357         // optimization: return maximal space which could be needed for this 
1358         // string even if the real size could be smaller if the buffer contains 
1364     const wxUint16 
*inBuff 
= wx_reinterpret_cast(const wxUint16 
*, src
); 
1365     for ( const wxUint16 
* const inEnd 
= inBuff 
+ inLen
; inBuff 
< inEnd
; ) 
1367         const wxUint32 ch 
= wxDecodeSurrogate(&inBuff
); 
1369             return wxCONV_FAILED
; 
1371         if ( ++outLen 
> dstLen 
) 
1372             return wxCONV_FAILED
; 
1382 wxMBConvUTF16straight::FromWChar(char *dst
, size_t dstLen
, 
1383                                  const wchar_t *src
, size_t srcLen
) const 
1385     if ( srcLen 
== wxNO_LEN 
) 
1386         srcLen 
= wxWcslen(src
) + 1; 
1389     wxUint16 
*outBuff 
= wx_reinterpret_cast(wxUint16 
*, dst
); 
1390     for ( size_t n 
= 0; n 
< srcLen
; n
++ ) 
1393         const size_t numChars 
= encode_utf16(*src
++, cc
); 
1394         if ( numChars 
== wxCONV_FAILED 
) 
1395             return wxCONV_FAILED
; 
1397         outLen 
+= numChars 
* BYTES_PER_CHAR
; 
1400             if ( outLen 
> dstLen 
) 
1401                 return wxCONV_FAILED
; 
1404             if ( numChars 
== 2 ) 
1406                 // second character of a surrogate 
1415 // ---------------------------------------------------------------------------- 
1416 // endian-reversing conversions 
1417 // ---------------------------------------------------------------------------- 
1420 wxMBConvUTF16swap::ToWChar(wchar_t *dst
, size_t dstLen
, 
1421                            const char *src
, size_t srcLen
) const 
1423     srcLen 
= GetLength(src
, srcLen
); 
1424     if ( srcLen 
== wxNO_LEN 
) 
1425         return wxCONV_FAILED
; 
1427     const size_t inLen 
= srcLen 
/ BYTES_PER_CHAR
; 
1430         // optimization: return maximal space which could be needed for this 
1431         // string even if the real size could be smaller if the buffer contains 
1437     const wxUint16 
*inBuff 
= wx_reinterpret_cast(const wxUint16 
*, src
); 
1438     for ( const wxUint16 
* const inEnd 
= inBuff 
+ inLen
; inBuff 
< inEnd
; ) 
1443         tmp
[0] = wxUINT16_SWAP_ALWAYS(*inBuff
); 
1445         tmp
[1] = wxUINT16_SWAP_ALWAYS(*inBuff
); 
1447         const size_t numChars 
= decode_utf16(tmp
, ch
); 
1448         if ( numChars 
== wxCONV_FAILED 
) 
1449             return wxCONV_FAILED
; 
1451         if ( numChars 
== 2 ) 
1454         if ( ++outLen 
> dstLen 
) 
1455             return wxCONV_FAILED
; 
1465 wxMBConvUTF16swap::FromWChar(char *dst
, size_t dstLen
, 
1466                              const wchar_t *src
, size_t srcLen
) const 
1468     if ( srcLen 
== wxNO_LEN 
) 
1469         srcLen 
= wxWcslen(src
) + 1; 
1472     wxUint16 
*outBuff 
= wx_reinterpret_cast(wxUint16 
*, dst
); 
1473     for ( const wchar_t *srcEnd 
= src 
+ srcLen
; src 
< srcEnd
; src
++ ) 
1476         const size_t numChars 
= encode_utf16(*src
, cc
); 
1477         if ( numChars 
== wxCONV_FAILED 
) 
1478             return wxCONV_FAILED
; 
1480         outLen 
+= numChars 
* BYTES_PER_CHAR
; 
1483             if ( outLen 
> dstLen 
) 
1484                 return wxCONV_FAILED
; 
1486             *outBuff
++ = wxUINT16_SWAP_ALWAYS(cc
[0]); 
1487             if ( numChars 
== 2 ) 
1489                 // second character of a surrogate 
1490                 *outBuff
++ = wxUINT16_SWAP_ALWAYS(cc
[1]); 
1498 #endif // WC_UTF16/!WC_UTF16 
1501 // ============================================================================ 
1503 // ============================================================================ 
1505 #ifdef WORDS_BIGENDIAN 
1506     #define wxMBConvUTF32straight  wxMBConvUTF32BE 
1507     #define wxMBConvUTF32swap      wxMBConvUTF32LE 
1509     #define wxMBConvUTF32swap      wxMBConvUTF32BE 
1510     #define wxMBConvUTF32straight  wxMBConvUTF32LE 
1514 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE
) wxConvUTF32LE
; 
1515 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE
) wxConvUTF32BE
; 
1518 size_t wxMBConvUTF32Base::GetLength(const char *src
, size_t srcLen
) 
1520     if ( srcLen 
== wxNO_LEN 
) 
1522         // count the number of bytes in input, including the trailing NULs 
1523         const wxUint32 
*inBuff 
= wx_reinterpret_cast(const wxUint32 
*, src
); 
1524         for ( srcLen 
= 1; *inBuff
++; srcLen
++ ) 
1527         srcLen 
*= BYTES_PER_CHAR
; 
1529     else // we already have the length 
1531         // we can only convert an entire number of UTF-32 characters 
1532         if ( srcLen 
% BYTES_PER_CHAR 
) 
1533             return wxCONV_FAILED
; 
1539 // case when in-memory representation is UTF-16 
1542 // ---------------------------------------------------------------------------- 
1543 // conversions without endianness change 
1544 // ---------------------------------------------------------------------------- 
1547 wxMBConvUTF32straight::ToWChar(wchar_t *dst
, size_t dstLen
, 
1548                                const char *src
, size_t srcLen
) const 
1550     srcLen 
= GetLength(src
, srcLen
); 
1551     if ( srcLen 
== wxNO_LEN 
) 
1552         return wxCONV_FAILED
; 
1554     const wxUint32 
*inBuff 
= wx_reinterpret_cast(const wxUint32 
*, src
); 
1555     const size_t inLen 
= srcLen 
/ BYTES_PER_CHAR
; 
1557     for ( size_t n 
= 0; n 
< inLen
; n
++ ) 
1560         const size_t numChars 
= encode_utf16(*inBuff
++, cc
); 
1561         if ( numChars 
== wxCONV_FAILED 
) 
1562             return wxCONV_FAILED
; 
1567             if ( outLen 
> dstLen 
) 
1568                 return wxCONV_FAILED
; 
1571             if ( numChars 
== 2 ) 
1573                 // second character of a surrogate 
1583 wxMBConvUTF32straight::FromWChar(char *dst
, size_t dstLen
, 
1584                                  const wchar_t *src
, size_t srcLen
) const 
1586     if ( srcLen 
== wxNO_LEN 
) 
1587         srcLen 
= wxWcslen(src
) + 1; 
1591         // optimization: return maximal space which could be needed for this 
1592         // string instead of the exact amount which could be less if there are 
1593         // any surrogates in the input 
1595         // we consider that surrogates are rare enough to make it worthwhile to 
1596         // avoid running the loop below at the cost of slightly extra memory 
1598         return srcLen 
* BYTES_PER_CHAR
; 
1601     wxUint32 
*outBuff 
= wx_reinterpret_cast(wxUint32 
*, dst
); 
1603     for ( const wchar_t * const srcEnd 
= src 
+ srcLen
; src 
< srcEnd
; ) 
1605         const wxUint32 ch 
= wxDecodeSurrogate(&src
); 
1607             return wxCONV_FAILED
; 
1609         outLen 
+= BYTES_PER_CHAR
; 
1611         if ( outLen 
> dstLen 
) 
1612             return wxCONV_FAILED
; 
1620 // ---------------------------------------------------------------------------- 
1621 // endian-reversing conversions 
1622 // ---------------------------------------------------------------------------- 
1625 wxMBConvUTF32swap::ToWChar(wchar_t *dst
, size_t dstLen
, 
1626                            const char *src
, size_t srcLen
) const 
1628     srcLen 
= GetLength(src
, srcLen
); 
1629     if ( srcLen 
== wxNO_LEN 
) 
1630         return wxCONV_FAILED
; 
1632     const wxUint32 
*inBuff 
= wx_reinterpret_cast(const wxUint32 
*, src
); 
1633     const size_t inLen 
= srcLen 
/ BYTES_PER_CHAR
; 
1635     for ( size_t n 
= 0; n 
< inLen
; n
++, inBuff
++ ) 
1638         const size_t numChars 
= encode_utf16(wxUINT32_SWAP_ALWAYS(*inBuff
), cc
); 
1639         if ( numChars 
== wxCONV_FAILED 
) 
1640             return wxCONV_FAILED
; 
1645             if ( outLen 
> dstLen 
) 
1646                 return wxCONV_FAILED
; 
1649             if ( numChars 
== 2 ) 
1651                 // second character of a surrogate 
1661 wxMBConvUTF32swap::FromWChar(char *dst
, size_t dstLen
, 
1662                              const wchar_t *src
, size_t srcLen
) const 
1664     if ( srcLen 
== wxNO_LEN 
) 
1665         srcLen 
= wxWcslen(src
) + 1; 
1669         // optimization: return maximal space which could be needed for this 
1670         // string instead of the exact amount which could be less if there are 
1671         // any surrogates in the input 
1673         // we consider that surrogates are rare enough to make it worthwhile to 
1674         // avoid running the loop below at the cost of slightly extra memory 
1676         return srcLen
*BYTES_PER_CHAR
; 
1679     wxUint32 
*outBuff 
= wx_reinterpret_cast(wxUint32 
*, dst
); 
1681     for ( const wchar_t * const srcEnd 
= src 
+ srcLen
; src 
< srcEnd
; ) 
1683         const wxUint32 ch 
= wxDecodeSurrogate(&src
); 
1685             return wxCONV_FAILED
; 
1687         outLen 
+= BYTES_PER_CHAR
; 
1689         if ( outLen 
> dstLen 
) 
1690             return wxCONV_FAILED
; 
1692         *outBuff
++ = wxUINT32_SWAP_ALWAYS(ch
); 
1698 #else // !WC_UTF16: wchar_t is UTF-32 
1700 // ---------------------------------------------------------------------------- 
1701 // conversions without endianness change 
1702 // ---------------------------------------------------------------------------- 
1705 wxMBConvUTF32straight::ToWChar(wchar_t *dst
, size_t dstLen
, 
1706                                const char *src
, size_t srcLen
) const 
1708     // use memcpy() as it should be much faster than hand-written loop 
1709     srcLen 
= GetLength(src
, srcLen
); 
1710     if ( srcLen 
== wxNO_LEN 
) 
1711         return wxCONV_FAILED
; 
1713     const size_t inLen 
= srcLen
/BYTES_PER_CHAR
; 
1716         if ( dstLen 
< inLen 
) 
1717             return wxCONV_FAILED
; 
1719         memcpy(dst
, src
, srcLen
); 
1726 wxMBConvUTF32straight::FromWChar(char *dst
, size_t dstLen
, 
1727                                  const wchar_t *src
, size_t srcLen
) const 
1729     if ( srcLen 
== wxNO_LEN 
) 
1730         srcLen 
= wxWcslen(src
) + 1; 
1732     srcLen 
*= BYTES_PER_CHAR
; 
1736         if ( dstLen 
< srcLen 
) 
1737             return wxCONV_FAILED
; 
1739         memcpy(dst
, src
, srcLen
); 
1745 // ---------------------------------------------------------------------------- 
1746 // endian-reversing conversions 
1747 // ---------------------------------------------------------------------------- 
1750 wxMBConvUTF32swap::ToWChar(wchar_t *dst
, size_t dstLen
, 
1751                            const char *src
, size_t srcLen
) const 
1753     srcLen 
= GetLength(src
, srcLen
); 
1754     if ( srcLen 
== wxNO_LEN 
) 
1755         return wxCONV_FAILED
; 
1757     srcLen 
/= BYTES_PER_CHAR
; 
1761         if ( dstLen 
< srcLen 
) 
1762             return wxCONV_FAILED
; 
1764         const wxUint32 
*inBuff 
= wx_reinterpret_cast(const wxUint32 
*, src
); 
1765         for ( size_t n 
= 0; n 
< srcLen
; n
++, inBuff
++ ) 
1767             *dst
++ = wxUINT32_SWAP_ALWAYS(*inBuff
); 
1775 wxMBConvUTF32swap::FromWChar(char *dst
, size_t dstLen
, 
1776                              const wchar_t *src
, size_t srcLen
) const 
1778     if ( srcLen 
== wxNO_LEN 
) 
1779         srcLen 
= wxWcslen(src
) + 1; 
1781     srcLen 
*= BYTES_PER_CHAR
; 
1785         if ( dstLen 
< srcLen 
) 
1786             return wxCONV_FAILED
; 
1788         wxUint32 
*outBuff 
= wx_reinterpret_cast(wxUint32 
*, dst
); 
1789         for ( size_t n 
= 0; n 
< srcLen
; n 
+= BYTES_PER_CHAR
, src
++ ) 
1791             *outBuff
++ = wxUINT32_SWAP_ALWAYS(*src
); 
1798 #endif // WC_UTF16/!WC_UTF16 
1801 // ============================================================================ 
1802 // The classes doing conversion using the iconv_xxx() functions 
1803 // ============================================================================ 
1807 // VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with 
1808 //     E2BIG if output buffer is _exactly_ as big as needed. Such case is 
1809 //     (unless there's yet another bug in glibc) the only case when iconv() 
1810 //     returns with (size_t)-1 (which means error) and says there are 0 bytes 
1811 //     left in the input buffer -- when _real_ error occurs, 
1812 //     bytes-left-in-input buffer is non-zero. Hence, this alternative test for 
1814 //     [This bug does not appear in glibc 2.2.] 
1815 #if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1 
1816 #define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \ 
1817                                      (errno != E2BIG || bufLeft != 0)) 
1819 #define ICONV_FAILED(cres, bufLeft)  (cres == (size_t)-1) 
1822 #define ICONV_CHAR_CAST(x)  ((ICONV_CONST char **)(x)) 
1824 #define ICONV_T_INVALID ((iconv_t)-1) 
1826 #if SIZEOF_WCHAR_T == 4 
1827     #define WC_BSWAP    wxUINT32_SWAP_ALWAYS 
1828     #define WC_ENC      wxFONTENCODING_UTF32 
1829 #elif SIZEOF_WCHAR_T == 2 
1830     #define WC_BSWAP    wxUINT16_SWAP_ALWAYS 
1831     #define WC_ENC      wxFONTENCODING_UTF16 
1832 #else // sizeof(wchar_t) != 2 nor 4 
1833     // does this ever happen? 
1834     #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org" 
1837 // ---------------------------------------------------------------------------- 
1838 // wxMBConv_iconv: encapsulates an iconv character set 
1839 // ---------------------------------------------------------------------------- 
1841 class wxMBConv_iconv 
: public wxMBConv
 
1844     wxMBConv_iconv(const char *name
); 
1845     virtual ~wxMBConv_iconv(); 
1847     virtual size_t MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const; 
1848     virtual size_t WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const; 
1850     // classify this encoding as explained in wxMBConv::GetMBNulLen() comment 
1851     virtual size_t GetMBNulLen() const; 
1853 #if wxUSE_UNICODE_UTF8 
1854     virtual bool IsUTF8() const; 
1857     virtual wxMBConv 
*Clone() const 
1859         wxMBConv_iconv 
*p 
= new wxMBConv_iconv(m_name
.ToAscii()); 
1860         p
->m_minMBCharWidth 
= m_minMBCharWidth
; 
1865         { return (m2w 
!= ICONV_T_INVALID
) && (w2m 
!= ICONV_T_INVALID
); } 
1868     // the iconv handlers used to translate from multibyte 
1869     // to wide char and in the other direction 
1874     // guards access to m2w and w2m objects 
1875     wxMutex m_iconvMutex
; 
1879     // the name (for iconv_open()) of a wide char charset -- if none is 
1880     // available on this machine, it will remain NULL 
1881     static wxString ms_wcCharsetName
; 
1883     // true if the wide char encoding we use (i.e. ms_wcCharsetName) has 
1884     // different endian-ness than the native one 
1885     static bool ms_wcNeedsSwap
; 
1888     // name of the encoding handled by this conversion 
1891     // cached result of GetMBNulLen(); set to 0 meaning "unknown" 
1893     size_t m_minMBCharWidth
; 
1896 // make the constructor available for unit testing 
1897 WXDLLIMPEXP_BASE wxMBConv
* new_wxMBConv_iconv( const char* name 
) 
1899     wxMBConv_iconv
* result 
= new wxMBConv_iconv( name 
); 
1900     if ( !result
->IsOk() ) 
1909 wxString 
wxMBConv_iconv::ms_wcCharsetName
; 
1910 bool wxMBConv_iconv::ms_wcNeedsSwap 
= false; 
1912 wxMBConv_iconv::wxMBConv_iconv(const char *name
) 
1915     m_minMBCharWidth 
= 0; 
1917     // check for charset that represents wchar_t: 
1918     if ( ms_wcCharsetName
.empty() ) 
1920         wxLogTrace(TRACE_STRCONV
, _T("Looking for wide char codeset:")); 
1923         const wxChar 
**names 
= wxFontMapperBase::GetAllEncodingNames(WC_ENC
); 
1924 #else // !wxUSE_FONTMAP 
1925         static const wxChar 
*names_static
[] = 
1927 #if SIZEOF_WCHAR_T == 4 
1929 #elif SIZEOF_WCHAR_T = 2 
1934         const wxChar 
**names 
= names_static
; 
1935 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP 
1937         for ( ; *names 
&& ms_wcCharsetName
.empty(); ++names 
) 
1939             const wxString 
nameCS(*names
); 
1941             // first try charset with explicit bytesex info (e.g. "UCS-4LE"): 
1942             wxString 
nameXE(nameCS
); 
1944 #ifdef WORDS_BIGENDIAN 
1946 #else // little endian 
1950             wxLogTrace(TRACE_STRCONV
, _T("  trying charset \"%s\""), 
1953             m2w 
= iconv_open(nameXE
.ToAscii(), name
); 
1954             if ( m2w 
== ICONV_T_INVALID 
) 
1956                 // try charset w/o bytesex info (e.g. "UCS4") 
1957                 wxLogTrace(TRACE_STRCONV
, _T("  trying charset \"%s\""), 
1959                 m2w 
= iconv_open(nameCS
.ToAscii(), name
); 
1961                 // and check for bytesex ourselves: 
1962                 if ( m2w 
!= ICONV_T_INVALID 
) 
1964                     char    buf
[2], *bufPtr
; 
1965                     wchar_t wbuf
[2], *wbufPtr
; 
1973                     outsz 
= SIZEOF_WCHAR_T 
* 2; 
1978                         m2w
, ICONV_CHAR_CAST(&bufPtr
), &insz
, 
1979                         (char**)&wbufPtr
, &outsz
); 
1981                     if (ICONV_FAILED(res
, insz
)) 
1983                         wxLogLastError(wxT("iconv")); 
1984                         wxLogError(_("Conversion to charset '%s' doesn't work."), 
1987                     else // ok, can convert to this encoding, remember it 
1989                         ms_wcCharsetName 
= nameCS
; 
1990                         ms_wcNeedsSwap 
= wbuf
[0] != (wchar_t)buf
[0]; 
1994             else // use charset not requiring byte swapping 
1996                 ms_wcCharsetName 
= nameXE
; 
2000         wxLogTrace(TRACE_STRCONV
, 
2001                    wxT("iconv wchar_t charset is \"%s\"%s"), 
2002                    ms_wcCharsetName
.empty() ? wxString("<none>") 
2004                    ms_wcNeedsSwap 
? _T(" (needs swap)") 
2007     else // we already have ms_wcCharsetName 
2009         m2w 
= iconv_open(ms_wcCharsetName
.ToAscii(), name
); 
2012     if ( ms_wcCharsetName
.empty() ) 
2014         w2m 
= ICONV_T_INVALID
; 
2018         w2m 
= iconv_open(name
, ms_wcCharsetName
.ToAscii()); 
2019         if ( w2m 
== ICONV_T_INVALID 
) 
2021             wxLogTrace(TRACE_STRCONV
, 
2022                        wxT("\"%s\" -> \"%s\" works but not the converse!?"), 
2023                        ms_wcCharsetName
.c_str(), name
); 
2028 wxMBConv_iconv::~wxMBConv_iconv() 
2030     if ( m2w 
!= ICONV_T_INVALID 
) 
2032     if ( w2m 
!= ICONV_T_INVALID 
) 
2036 size_t wxMBConv_iconv::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const 
2038     // find the string length: notice that must be done differently for 
2039     // NUL-terminated strings and UTF-16/32 which are terminated with 2/4 NULs 
2041     const size_t nulLen 
= GetMBNulLen(); 
2045             return wxCONV_FAILED
; 
2048             inbuf 
= strlen(psz
); // arguably more optimized than our version 
2053             // for UTF-16/32 not only we need to have 2/4 consecutive NULs but 
2054             // they also have to start at character boundary and not span two 
2055             // adjacent characters 
2057             for ( p 
= psz
; NotAllNULs(p
, nulLen
); p 
+= nulLen 
) 
2064     // NB: iconv() is MT-safe, but each thread must use its own iconv_t handle. 
2065     //     Unfortunately there are a couple of global wxCSConv objects such as 
2066     //     wxConvLocal that are used all over wx code, so we have to make sure 
2067     //     the handle is used by at most one thread at the time. Otherwise 
2068     //     only a few wx classes would be safe to use from non-main threads 
2069     //     as MB<->WC conversion would fail "randomly". 
2070     wxMutexLocker 
lock(wxConstCast(this, wxMBConv_iconv
)->m_iconvMutex
); 
2071 #endif // wxUSE_THREADS 
2073     size_t outbuf 
= n 
* SIZEOF_WCHAR_T
; 
2075     // VS: Use these instead of psz, buf because iconv() modifies its arguments: 
2076     wchar_t *bufPtr 
= buf
; 
2077     const char *pszPtr 
= psz
; 
2081         // have destination buffer, convert there 
2083                      ICONV_CHAR_CAST(&pszPtr
), &inbuf
, 
2084                      (char**)&bufPtr
, &outbuf
); 
2085         res 
= n 
- (outbuf 
/ SIZEOF_WCHAR_T
); 
2089             // convert to native endianness 
2090             for ( unsigned i 
= 0; i 
< res
; i
++ ) 
2091                 buf
[n
] = WC_BSWAP(buf
[i
]); 
2094         // NUL-terminate the string if there is any space left 
2100         // no destination buffer... convert using temp buffer 
2101         // to calculate destination buffer requirement 
2108             outbuf 
= 8 * SIZEOF_WCHAR_T
; 
2111                          ICONV_CHAR_CAST(&pszPtr
), &inbuf
, 
2112                          (char**)&bufPtr
, &outbuf 
); 
2114             res 
+= 8 - (outbuf 
/ SIZEOF_WCHAR_T
); 
2116         while ((cres 
== (size_t)-1) && (errno 
== E2BIG
)); 
2119     if (ICONV_FAILED(cres
, inbuf
)) 
2121         //VS: it is ok if iconv fails, hence trace only 
2122         wxLogTrace(TRACE_STRCONV
, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode())); 
2123         return wxCONV_FAILED
; 
2129 size_t wxMBConv_iconv::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const 
2132     // NB: explained in MB2WC 
2133     wxMutexLocker 
lock(wxConstCast(this, wxMBConv_iconv
)->m_iconvMutex
); 
2136     size_t inlen 
= wxWcslen(psz
); 
2137     size_t inbuf 
= inlen 
* SIZEOF_WCHAR_T
; 
2141     wchar_t *tmpbuf 
= 0; 
2145         // need to copy to temp buffer to switch endianness 
2146         // (doing WC_BSWAP twice on the original buffer won't help, as it 
2147         //  could be in read-only memory, or be accessed in some other thread) 
2148         tmpbuf 
= (wchar_t *)malloc(inbuf 
+ SIZEOF_WCHAR_T
); 
2149         for ( size_t i 
= 0; i 
< inlen
; i
++ ) 
2150             tmpbuf
[n
] = WC_BSWAP(psz
[i
]); 
2152         tmpbuf
[inlen
] = L
'\0'; 
2158         // have destination buffer, convert there 
2159         cres 
= iconv( w2m
, ICONV_CHAR_CAST(&psz
), &inbuf
, &buf
, &outbuf 
); 
2163         // NB: iconv was given only wcslen(psz) characters on input, and so 
2164         //     it couldn't convert the trailing zero. Let's do it ourselves 
2165         //     if there's some room left for it in the output buffer. 
2171         // no destination buffer: convert using temp buffer 
2172         // to calculate destination buffer requirement 
2180             cres 
= iconv( w2m
, ICONV_CHAR_CAST(&psz
), &inbuf
, &buf
, &outbuf 
); 
2184         while ((cres 
== (size_t)-1) && (errno 
== E2BIG
)); 
2192     if (ICONV_FAILED(cres
, inbuf
)) 
2194         wxLogTrace(TRACE_STRCONV
, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode())); 
2195         return wxCONV_FAILED
; 
2201 size_t wxMBConv_iconv::GetMBNulLen() const 
2203     if ( m_minMBCharWidth 
== 0 ) 
2205         wxMBConv_iconv 
* const self 
= wxConstCast(this, wxMBConv_iconv
); 
2208         // NB: explained in MB2WC 
2209         wxMutexLocker 
lock(self
->m_iconvMutex
); 
2212         const wchar_t *wnul 
= L
""; 
2213         char buf
[8]; // should be enough for NUL in any encoding 
2214         size_t inLen 
= sizeof(wchar_t), 
2215                outLen 
= WXSIZEOF(buf
); 
2216         char *inBuff 
= (char *)wnul
; 
2217         char *outBuff 
= buf
; 
2218         if ( iconv(w2m
, ICONV_CHAR_CAST(&inBuff
), &inLen
, &outBuff
, &outLen
) == (size_t)-1 ) 
2220             self
->m_minMBCharWidth 
= (size_t)-1; 
2224             self
->m_minMBCharWidth 
= outBuff 
- buf
; 
2228     return m_minMBCharWidth
; 
2231 #if wxUSE_UNICODE_UTF8 
2232 bool wxMBConv_iconv::IsUTF8() const 
2234     return wxStricmp(m_name
, "UTF-8") == 0 || 
2235            wxStricmp(m_name
, "UTF8") == 0; 
2239 #endif // HAVE_ICONV 
2242 // ============================================================================ 
2243 // Win32 conversion classes 
2244 // ============================================================================ 
2246 #ifdef wxHAVE_WIN32_MB2WC 
2250 extern WXDLLIMPEXP_BASE 
long wxCharsetToCodepage(const char *charset
); 
2251 extern WXDLLIMPEXP_BASE 
long wxEncodingToCodepage(wxFontEncoding encoding
); 
2254 class wxMBConv_win32 
: public wxMBConv
 
2259         m_CodePage 
= CP_ACP
; 
2260         m_minMBCharWidth 
= 0; 
2263     wxMBConv_win32(const wxMBConv_win32
& conv
) 
2266         m_CodePage 
= conv
.m_CodePage
; 
2267         m_minMBCharWidth 
= conv
.m_minMBCharWidth
; 
2271     wxMBConv_win32(const char* name
) 
2273         m_CodePage 
= wxCharsetToCodepage(name
); 
2274         m_minMBCharWidth 
= 0; 
2277     wxMBConv_win32(wxFontEncoding encoding
) 
2279         m_CodePage 
= wxEncodingToCodepage(encoding
); 
2280         m_minMBCharWidth 
= 0; 
2282 #endif // wxUSE_FONTMAP 
2284     virtual size_t MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const 
2286         // note that we have to use MB_ERR_INVALID_CHARS flag as without it
2287         // the behaviour is not compatible with the Unix version (using iconv)
2288         // and breaks the library itself, e.g. wxTextInputStream::NextChar()
2289         // wouldn't work if reading an incomplete MB char didn't result in an 
2292         // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or 
2293         // Win XP or newer and it is not supported for UTF-[78] so we always 
2294         // use our own conversions in this case. See 
2295         //     http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx 
2296         //     http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp 
2297         if ( m_CodePage 
== CP_UTF8 
) 
2299             return wxMBConvUTF8().MB2WC(buf
, psz
, n
); 
2302         if ( m_CodePage 
== CP_UTF7 
) 
2304             return wxMBConvUTF7().MB2WC(buf
, psz
, n
); 
2308         if ( (m_CodePage 
< 50000 && m_CodePage 
!= CP_SYMBOL
) && 
2309                 IsAtLeastWin2kSP4() ) 
2311             flags 
= MB_ERR_INVALID_CHARS
; 
2314         const size_t len 
= ::MultiByteToWideChar
 
2316                                 m_CodePage
,     // code page 
2317                                 flags
,          // flags: fail on error
2318                                 psz
,            // input string 
2319                                 -1,             // its length (NUL-terminated) 
2320                                 buf
,            // output string 
2321                                 buf 
? n 
: 0     // size of output buffer 
2325             // function totally failed 
2326             return wxCONV_FAILED
; 
2329         // if we were really converting and didn't use MB_ERR_INVALID_CHARS, 
2330         // check if we succeeded, by doing a double trip: 
2331         if ( !flags 
&& buf 
) 
2333             const size_t mbLen 
= strlen(psz
); 
2334             wxCharBuffer 
mbBuf(mbLen
); 
2335             if ( ::WideCharToMultiByte
 
2342                       mbLen 
+ 1,        // size in bytes, not length 
2346                   strcmp(mbBuf
, psz
) != 0 ) 
2348                 // we didn't obtain the same thing we started from, hence 
2349                 // the conversion was lossy and we consider that it failed 
2350                 return wxCONV_FAILED
; 
2354         // note that it returns count of written chars for buf != NULL and size 
2355         // of the needed buffer for buf == NULL so in either case the length of 
2356         // the string (which never includes the terminating NUL) is one less 
2360     virtual size_t WC2MB(char *buf
, const wchar_t *pwz
, size_t n
) const 
2363             we have a problem here: by default, WideCharToMultiByte() may 
2364             replace characters unrepresentable in the target code page with bad 
2365             quality approximations such as turning "1/2" symbol (U+00BD) into 
2366             "1" for the code pages which don't have it and we, obviously, want 
2367             to avoid this at any price 
2369             the trouble is that this function does it _silently_, i.e. it won't 
2370             even tell us whether it did or not... Win98/2000 and higher provide 
2371             WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and 
2372             we have to resort to a round trip, i.e. check that converting back 
2373             results in the same string -- this is, of course, expensive but 
2374             otherwise we simply can't be sure to not garble the data. 
2377         // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN 
2378         // it doesn't work with CJK encodings (which we test for rather roughly 
2379         // here...) nor with UTF-7/8 nor, of course, with Windows versions not 
2381         BOOL usedDef 
wxDUMMY_INITIALIZE(false); 
2384         if ( CanUseNoBestFit() && m_CodePage 
< 50000 ) 
2386             // it's our lucky day 
2387             flags 
= WC_NO_BEST_FIT_CHARS
; 
2388             pUsedDef 
= &usedDef
; 
2390         else // old system or unsupported encoding 
2396         const size_t len 
= ::WideCharToMultiByte
 
2398                                 m_CodePage
,     // code page 
2399                                 flags
,          // either none or no best fit 
2400                                 pwz
,            // input string 
2401                                 -1,             // it is (wide) NUL-terminated 
2402                                 buf
,            // output buffer 
2403                                 buf 
? n 
: 0,    // and its size 
2404                                 NULL
,           // default "replacement" char 
2405                                 pUsedDef        
// [out] was it used? 
2410             // function totally failed 
2411             return wxCONV_FAILED
; 
2414         // if we were really converting, check if we succeeded 
2419                 // check if the conversion failed, i.e. if any replacements 
2422                     return wxCONV_FAILED
; 
2424             else // we must resort to double tripping... 
2426                 wxWCharBuffer 
wcBuf(n
); 
2427                 if ( MB2WC(wcBuf
.data(), buf
, n
) == wxCONV_FAILED 
|| 
2428                         wcscmp(wcBuf
, pwz
) != 0 ) 
2430                     // we didn't obtain the same thing we started from, hence 
2431                     // the conversion was lossy and we consider that it failed 
2432                     return wxCONV_FAILED
; 
2437         // see the comment above for the reason of "len - 1" 
// Returns the length of the terminating NUL in this code page.
//
// The value is computed lazily: while m_minMBCharWidth is still 0,
// ::WideCharToMultiByte() is asked (with a NULL output buffer) for the
// size of the conversion of an empty wide string, one character long, i.e.
// just its NUL. The outcome is then cached in m_minMBCharWidth; the visible
// error paths store (size_t)-1 instead of a length.
2441     virtual size_t GetMBNulLen() const 
2443         if ( m_minMBCharWidth 
== 0 ) 
2445             int len 
= ::WideCharToMultiByte
 
2447                             m_CodePage
,     // code page 
2449                             L
"",            // input string 
2450                             1,              // translate just the NUL 
2451                             NULL
,           // output buffer 
2453                             NULL
,           // no replacement char 
2454                             NULL            
// [out] don't care if it was used 
            // this method is const, so the cache member is written through a
            // non-const alias of "this"
2457             wxMBConv_win32 
* const self 
= wxConstCast(this, wxMBConv_win32
); 
2461                     wxLogDebug(_T("Unexpected NUL length %d"), len
); 
2462                     self
->m_minMBCharWidth 
= (size_t)-1; 
2466                     self
->m_minMBCharWidth 
= (size_t)-1; 
2472                     self
->m_minMBCharWidth 
= len
; 
2477         return m_minMBCharWidth
; 
// Returns a new heap-allocated converter copy-constructed from this one.
// NOTE(review): presumably the caller takes ownership of the result --
// confirm against the wxMBConv::Clone() contract.
2480     virtual wxMBConv 
*Clone() const { return new wxMBConv_win32(*this); } 
// The converter is usable unless m_CodePage holds the value -1.
2482     bool IsOk() const { return m_CodePage 
!= -1; } 
// Returns true when the running OS, as reported by wxGetOsVersion(), is
// either Windows 9x with major >= 4 and minor >= 10, or Windows NT with
// major >= 5.
//
// The answer is computed on the first call only and kept in a function-local
// static (-1 means "not determined yet").
// NOTE(review): judging by the name, the result decides whether the
// "no best fit" flag may be passed to WideCharToMultiByte() -- confirm at
// the call site.
2485     static bool CanUseNoBestFit() 
2487         static int s_isWin98Or2k 
= -1; 
2489         if ( s_isWin98Or2k 
== -1 ) 
2492             switch ( wxGetOsVersion(&verMaj
, &verMin
) ) 
2494                 case wxOS_WINDOWS_9X
: 
2495                     s_isWin98Or2k 
= verMaj 
>= 4 && verMin 
>= 10; 
2498                 case wxOS_WINDOWS_NT
: 
2499                     s_isWin98Or2k 
= verMaj 
>= 5; 
2503                     // unknown: be conservative by default 
            // every switch branch is expected to have replaced the -1 marker
2508             wxASSERT_MSG( s_isWin98Or2k 
!= -1, _T("should be set above") ); 
2511         return s_isWin98Or2k 
== 1; 
// Returns true when the OS version obtained from GetVersionEx() is newer
// than 5.0, or is exactly 5.0 with service pack major version >= 4.
//
// The check runs once; its result is cached in a function-local static
// (-1 means "not determined yet").
// NOTE(review): the return value of GetVersionEx() is not checked here; the
// structure is zero-filled beforehand, so a failed call leaves the version
// fields at 0.
2514     static bool IsAtLeastWin2kSP4() 
2519         static int s_isAtLeastWin2kSP4 
= -1; 
2521         if ( s_isAtLeastWin2kSP4 
== -1 ) 
2523             OSVERSIONINFOEX ver
; 
            // zero the structure and record its size before the query
2525             memset(&ver
, 0, sizeof(ver
)); 
2526             ver
.dwOSVersionInfoSize 
= sizeof(ver
); 
2527             GetVersionEx((OSVERSIONINFO
*)&ver
); 
2529             s_isAtLeastWin2kSP4 
= 
2530               ((ver
.dwMajorVersion 
> 5) || // Vista+ 
2531                (ver
.dwMajorVersion 
== 5 && ver
.dwMinorVersion 
> 0) || // XP/2003 
2532                (ver
.dwMajorVersion 
== 5 && ver
.dwMinorVersion 
== 0 && 
2533                ver
.wServicePackMajor 
>= 4)) // 2000 SP4+ 
2537         return s_isAtLeastWin2kSP4 
== 1; 
2542     // the code page we're working with 
2545     // cached result of GetMBNulLen(), set to 0 initially meaning it has not been computed yet 
2547     size_t m_minMBCharWidth
; 
2550 #endif // wxHAVE_WIN32_MB2WC 
2553 // ============================================================================ 
2554 // wxEncodingConverter based conversion classes 
2555 // ============================================================================ 
// wxMBConv implementation built on a pair of wxEncodingConverter objects:
// m2w converts from the encoding m_enc to wxFONTENCODING_UNICODE and w2m
// converts back. The converter reports itself usable (IsOk()) only if m_enc
// lies outside the [wxFONTENCODING_MACMIN, wxFONTENCODING_MACMAX] range and
// both wxEncodingConverter::Init() calls succeeded.
2559 class wxMBConv_wxwin 
: public wxMBConv
 
2564         // Refuse to use broken wxEncodingConverter code for Mac-specific encodings. 
2565         // The wxMBConv_cf class does a better job. 
2566         m_ok 
= (m_enc 
< wxFONTENCODING_MACMIN 
|| m_enc 
> wxFONTENCODING_MACMAX
) && 
2567                m2w
.Init(m_enc
, wxFONTENCODING_UNICODE
) && 
2568                w2m
.Init(wxFONTENCODING_UNICODE
, m_enc
); 
2572     // temporarily just use wxEncodingConverter stuff, 
2573     // so that it works while a better implementation is built 
    // Construct from a charset name: the name is mapped to an encoding with
    // wxFontMapperBase::CharsetToEncoding() (non-interactively, hence the
    // "false"); wxFONTENCODING_SYSTEM is the alternative assignment.
2574     wxMBConv_wxwin(const char* name
) 
2577             m_enc 
= wxFontMapperBase::Get()->CharsetToEncoding(name
, false); 
2579             m_enc 
= wxFONTENCODING_SYSTEM
; 
    // Construct directly from an encoding value.
2584     wxMBConv_wxwin(wxFontEncoding enc
) 
    // Multibyte -> wide conversion through m2w; wxCONV_FAILED is returned
    // if wxEncodingConverter::Convert() fails.
    // NOTE(review): the size argument is marked WXUNUSED, so nothing visible
    // here bounds the write into buf -- callers must size buf themselves.
2591     size_t MB2WC(wchar_t *buf
, const char *psz
, size_t WXUNUSED(n
)) const 
2593         size_t inbuf 
= strlen(psz
); 
2596             if (!m2w
.Convert(psz
, buf
)) 
2597                 return wxCONV_FAILED
; 
    // Wide -> multibyte conversion through w2m; same remarks as for MB2WC()
    // apply (failure yields wxCONV_FAILED, the size argument is unused).
2602     size_t WC2MB(char *buf
, const wchar_t *psz
, size_t WXUNUSED(n
)) const 
2604         const size_t inbuf 
= wxWcslen(psz
); 
2607             if (!w2m
.Convert(psz
, buf
)) 
2608                 return wxCONV_FAILED
; 
    // Length of the terminating NUL: the UTF-16 and UTF-32 encodings are
    // singled out in two groups (the values returned for each group are not
    // visible in this listing).
2614     virtual size_t GetMBNulLen() const 
2618             case wxFONTENCODING_UTF16BE
: 
2619             case wxFONTENCODING_UTF16LE
: 
2622             case wxFONTENCODING_UTF32BE
: 
2623             case wxFONTENCODING_UTF32LE
: 
    // A clone is a fresh converter created for the same encoding rather than
    // a member-wise copy (the class is declared non-copyable below).
2631     virtual wxMBConv 
*Clone() const { return new wxMBConv_wxwin(m_enc
); } 
2633     bool IsOk() const { return m_ok
; } 
    // the encoding this object converts to and from
2636     wxFontEncoding m_enc
; 
    // m2w: m_enc -> Unicode, w2m: Unicode -> m_enc
2637     wxEncodingConverter m2w
, w2m
; 
2640     // were we initialized successfully? 
2643     DECLARE_NO_COPY_CLASS(wxMBConv_wxwin
) 
2646 // make the constructors available for unit testing 
// Exported factory: creates a wxMBConv_wxwin for the given charset name on
// the heap and checks it with IsOk().
// NOTE(review): what happens when IsOk() fails is not visible in this
// listing -- confirm that the object is released on that path.
2647 WXDLLIMPEXP_BASE wxMBConv
* new_wxMBConv_wxwin( const char* name 
) 
2649     wxMBConv_wxwin
* result 
= new wxMBConv_wxwin( name 
); 
2650     if ( !result
->IsOk() ) 
2659 #endif // wxUSE_FONTMAP 
2661 // ============================================================================ 
2662 // wxCSConv implementation 
2663 // ============================================================================ 
// NOTE(review): only the signature of this function is visible in this
// listing; presumably it puts the members into their initial state --
// confirm against the full source.
2665 void wxCSConv::Init() 
// Constructs a converter from a charset name.
//
// A non-empty name is stored (in its ASCII form) with SetName(). m_encoding
// is either looked up from the name via
// wxFontMapperBase::GetEncodingFromName() or set to wxFONTENCODING_SYSTEM.
// NOTE(review): the two m_encoding assignments look like alternative
// branches of a wxUSE_FONTMAP conditional whose directives are not visible.
2672 wxCSConv::wxCSConv(const wxString
& charset
) 
2676     if ( !charset
.empty() ) 
2678         SetName(charset
.ToAscii()); 
2682     m_encoding 
= wxFontMapperBase::GetEncodingFromName(charset
); 
2684     m_encoding 
= wxFONTENCODING_SYSTEM
; 
// Constructs a converter for the given encoding.
//
// wxFONTENCODING_MAX and wxFONTENCODING_DEFAULT are not acceptable here:
// they trigger a wxFAIL_MSG() and are replaced with wxFONTENCODING_SYSTEM
// before the value is stored in m_encoding.
2688 wxCSConv::wxCSConv(wxFontEncoding encoding
) 
2690     if ( encoding 
== wxFONTENCODING_MAX 
|| encoding 
== wxFONTENCODING_DEFAULT 
) 
2692         wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") ); 
2694         encoding 
= wxFONTENCODING_SYSTEM
; 
2699     m_encoding 
= encoding
; 
// NOTE(review): only the signature of the destructor is visible in this
// listing; presumably it releases m_name and the real converter -- confirm
// against the full source.
2702 wxCSConv::~wxCSConv() 
// Copy constructor: takes over the name (through SetName(), which stores
// its own duplicate of the string) and the encoding of the source object.
// The real converter itself is not copied by the visible lines.
2707 wxCSConv::wxCSConv(const wxCSConv
& conv
) 
2712     SetName(conv
.m_name
); 
2713     m_encoding 
= conv
.m_encoding
; 
// Assignment: takes over the name (via SetName()) and the encoding of conv.
// NOTE(review): no self-assignment guard is visible. If the current name is
// freed before SetName(conv.m_name) duplicates it, "x = x" would read freed
// memory -- confirm against the full source.
2716 wxCSConv
& wxCSConv::operator=(const wxCSConv
& conv
) 
2720     SetName(conv
.m_name
); 
2721     m_encoding 
= conv
.m_encoding
; 
// NOTE(review): only the signature of this function is visible in this
// listing; presumably it frees the stored name and the real converter --
// confirm against the full source.
2726 void wxCSConv::Clear() 
// Stores a private duplicate of the charset name, made with wxStrdup(), in
// m_name.
// NOTE(review): the condition guarding the assignment is not visible;
// presumably a NULL charset is ignored -- confirm.
2735 void wxCSConv::SetName(const char *charset
) 
2739         m_name 
= wxStrdup(charset
); 
// Hash map from wxFontEncoding to a wxString, plus the file-level instance
// used by wxCSConv::DoCreate() to remember which encoding name worked with
// iconv for a given encoding (an empty string records a failed attempt).
// NOTE(review): access to this global is not synchronized in the visible
// code.
2746 WX_DECLARE_HASH_MAP( wxFontEncoding
, wxString
, wxIntegerHash
, wxIntegerEqual
, 
2747                      wxEncodingNameCache 
); 
2749 static wxEncodingNameCache gs_nameCache
; 
// Creates the concrete converter object for this charset (m_name and/or
// m_encoding).
//
// The candidates visible below, in order of appearance, are:
//   - no converter object at all for ISO8859-1 / wxFONTENCODING_DEFAULT,
//   - wxMBConv_iconv, created from m_name or from one of the names known for
//     the encoding (working names and failures are cached in gs_nameCache),
//   - wxMBConv_win32 when wxHAVE_WIN32_MB2WC is defined,
//   - wxMBConv_cf on Darwin,
//   - the built-in UTF-7/8/16/32 converters,
//   - wxMBConv_wxwin as the last candidate.
// If the end of the function is reached, an error naming the charset is
// logged.
2752 wxMBConv 
*wxCSConv::DoCreate() const 
2755     wxLogTrace(TRACE_STRCONV
, 
2756                wxT("creating conversion for %s"), 
2758                        : (const char*)wxFontMapperBase::GetEncodingName(m_encoding
).mb_str())); 
2759 #endif // wxUSE_FONTMAP 
2761     // check for the special case of ASCII or ISO8859-1 charset: as we have 
2762     // special knowledge of it anyhow, we don't need to create a special 
2763     // conversion object 
2764     if ( m_encoding 
== wxFONTENCODING_ISO8859_1 
|| 
2765             m_encoding 
== wxFONTENCODING_DEFAULT 
) 
2767         // don't convert at all 
2771     // we trust OS to do conversion better than we can so try external 
2772     // conversion methods first 
2774     // the full order is: 
2775     //      1. OS conversion (iconv() under Unix or Win32 API) 
2776     //      2. hard coded conversions for UTF 
2777     //      3. wxEncodingConverter as fall back 
2783 #endif // !wxUSE_FONTMAP 
2786         wxFontEncoding 
encoding(m_encoding
); 
            // first iconv attempt: use the charset name as given
2791             wxMBConv_iconv 
*conv 
= new wxMBConv_iconv(m_name
); 
2799                 wxFontMapperBase::Get()->CharsetToEncoding(m_name
, false); 
2800 #endif // wxUSE_FONTMAP 
            // consult the cache of earlier attempts for this encoding; an
            // empty cached name marks a previous failure
2804             const wxEncodingNameCache::iterator it 
= gs_nameCache
.find(encoding
); 
2805             if ( it 
!= gs_nameCache
.end() ) 
2807                 if ( it
->second
.empty() ) 
2810                 wxMBConv_iconv 
*conv 
= new wxMBConv_iconv(it
->second
.ToAscii()); 
            // NOTE(review): names is dereferenced below without a NULL
            // check -- confirm GetAllEncodingNames() never returns NULL
2817             const wxChar
** names 
= wxFontMapperBase::GetAllEncodingNames(encoding
); 
2818             // CS : in case this does not return valid names (eg for MacRoman) 
2819             // encoding got a 'failure' entry in the cache all the same, 
2820             // although it just has to be created using a different method, so 
2821             // only store failed iconv creation attempts (or perhaps we 
2822             // shouldn't do this at all ?) 
2823             if ( names
[0] != NULL 
) 
2825                 for ( ; *names
; ++names 
) 
2827                     // FIXME-UTF8: wxFontMapperBase::GetAllEncodingNames() 
2828                     //             will need changes that will obsolete this 
2829                     wxString 
name(*names
); 
2830                     wxMBConv_iconv 
*conv 
= new wxMBConv_iconv(name
.ToAscii()); 
2833                         gs_nameCache
[encoding
] = *names
; 
2840                 gs_nameCache
[encoding
] = _T(""); // cache the failure 
2843 #endif // wxUSE_FONTMAP 
2845 #endif // HAVE_ICONV 
2847 #ifdef wxHAVE_WIN32_MB2WC 
        // Win32 converter, created from the name if there is one and from
        // the encoding otherwise
2850         wxMBConv_win32 
*conv 
= m_name 
? new wxMBConv_win32(m_name
) 
2851                                       : new wxMBConv_win32(m_encoding
); 
2860 #endif // wxHAVE_WIN32_MB2WC 
2864         // leave UTF16 and UTF32 to the built-ins of wx 
2865         if ( m_name 
|| ( m_encoding 
< wxFONTENCODING_UTF16BE 
|| 
2866             ( m_encoding 
>= wxFONTENCODING_MACMIN 
&& m_encoding 
<= wxFONTENCODING_MACMAX 
) ) ) 
2869             wxMBConv_cf 
*conv 
= m_name 
? new wxMBConv_cf(m_name
) 
2870                                           : new wxMBConv_cf(m_encoding
); 
2872             wxMBConv_cf 
*conv 
= new wxMBConv_cf(m_encoding
); 
2881 #endif // __DARWIN__ 
    // built-in converters: work on a local copy of the encoding, which may
    // be refined from the charset name below
2884     wxFontEncoding enc 
= m_encoding
; 
2886     if ( enc 
== wxFONTENCODING_SYSTEM 
&& m_name 
) 
2888         // use "false" to suppress interactive dialogs -- we can be called from 
2889         // anywhere and popping up a dialog from here is the last thing we want to 
2891         enc 
= wxFontMapperBase::Get()->CharsetToEncoding(m_name
, false); 
2893 #endif // wxUSE_FONTMAP 
2897         case wxFONTENCODING_UTF7
: 
2898              return new wxMBConvUTF7
; 
2900         case wxFONTENCODING_UTF8
: 
2901              return new wxMBConvUTF8
; 
2903         case wxFONTENCODING_UTF16BE
: 
2904              return new wxMBConvUTF16BE
; 
2906         case wxFONTENCODING_UTF16LE
: 
2907              return new wxMBConvUTF16LE
; 
2909         case wxFONTENCODING_UTF32BE
: 
2910              return new wxMBConvUTF32BE
; 
2912         case wxFONTENCODING_UTF32LE
: 
2913              return new wxMBConvUTF32LE
; 
2916              // nothing to do but put here to suppress gcc warnings 
        // last candidate: the wxEncodingConverter-based implementation
2923         wxMBConv_wxwin 
*conv 
= m_name 
? new wxMBConv_wxwin(m_name
) 
2924                                       : new wxMBConv_wxwin(m_encoding
); 
2930 #endif // wxUSE_FONTMAP 
2932     // NB: This is a hack to prevent deadlock. What could otherwise happen 
2933     //     in Unicode build: wxConvLocal creation ends up being here 
2934     //     because of some failure and logs the error. But wxLog will try to 
2935     //     attach a timestamp, for which it will need wxConvLocal (to convert 
2936     //     time to char* and then wchar_t*), but that fails, tries to log the 
2937     //     error, but wxLog has an (already locked) critical section that 
2938     //     guards the static buffer. 
2939     static bool alreadyLoggingError 
= false; 
2940     if (!alreadyLoggingError
) 
2942         alreadyLoggingError 
= true; 
2943         wxLogError(_("Cannot convert from the charset '%s'!"), 
2947                          (const char*)wxFontMapperBase::GetEncodingDescription(m_encoding
).ToAscii() 
2948 #else // !wxUSE_FONTMAP 
2949                          (const char*)wxString::Format(_("encoding %i"), m_encoding
).ToAscii() 
2950 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP 
2953         alreadyLoggingError 
= false; 
// Performs the deferred creation of the real converter.
//
// If neither a name nor a specific encoding was given, the encoding is first
// resolved (from wxLocale::GetSystemEncoding(), or to ISO8859-1 as the
// alternative branch). Then DoCreate() is called, its result is stored in
// m_convReal and m_deferred is cleared. The method is const, so the members
// are written through a non-const alias of "this".
2959 void wxCSConv::CreateConvIfNeeded() const 
2963         wxCSConv 
*self 
= (wxCSConv 
*)this; // const_cast 
2965         // if we have neither the name nor the encoding, use the default 
2966         // encoding for this system 
2967         if ( !m_name 
&& m_encoding 
== wxFONTENCODING_SYSTEM 
) 
2970             self
->m_encoding 
= wxLocale::GetSystemEncoding(); 
2972             // fallback to some reasonable default: 
2973             self
->m_encoding 
= wxFONTENCODING_ISO8859_1
; 
2974 #endif // wxUSE_INTL 
2977         self
->m_convReal 
= DoCreate(); 
2978         self
->m_deferred 
= false; 
// Tells whether this converter can be used. The real converter is created
// first if necessary; ISO8859-1 is always reported as usable, any other
// encoding is usable only if a real converter object exists.
2982 bool wxCSConv::IsOk() const 
2984     CreateConvIfNeeded(); 
2986     // special case: no convReal created for wxFONTENCODING_ISO8859_1 
2987     if ( m_encoding 
== wxFONTENCODING_ISO8859_1 
) 
2988         return true; // always ok as we do it ourselves 
2990     // m_convReal->IsOk() is called at its own creation, so we know it must 
2991     // be ok if m_convReal is non-NULL 
2992     return m_convReal 
!= NULL
; 
// Multibyte -> wide conversion. After making sure the real converter has
// been created, the call is forwarded to m_convReal->ToWChar(); the other
// visible path uses the base class wxMBConv::ToWChar().
// NOTE(review): the test choosing between the two returns is not visible;
// presumably it is whether m_convReal is non-NULL -- confirm.
2995 size_t wxCSConv::ToWChar(wchar_t *dst
, size_t dstLen
, 
2996                          const char *src
, size_t srcLen
) const 
2998     CreateConvIfNeeded(); 
3001         return m_convReal
->ToWChar(dst
, dstLen
, src
, srcLen
); 
3004     return wxMBConv::ToWChar(dst
, dstLen
, src
, srcLen
); 
// Wide -> multibyte conversion. After making sure the real converter has
// been created, the call is forwarded to m_convReal->FromWChar(); the other
// visible path uses the base class wxMBConv::FromWChar().
// NOTE(review): the test choosing between the two returns is not visible;
// presumably it is whether m_convReal is non-NULL -- confirm.
3007 size_t wxCSConv::FromWChar(char *dst
, size_t dstLen
, 
3008                            const wchar_t *src
, size_t srcLen
) const 
3010     CreateConvIfNeeded(); 
3013         return m_convReal
->FromWChar(dst
, dstLen
, src
, srcLen
); 
3016     return wxMBConv::FromWChar(dst
, dstLen
, src
, srcLen
); 
// Multibyte -> wide conversion of a NUL-terminated string. The call is
// forwarded to the real converter where one is used; the remaining lines
// widen every byte of psz, including the terminating NUL (note the "<="),
// going through unsigned char so that bytes above 0x7F are not
// sign-extended.
// NOTE(review): n is not referenced in the visible byte-copy loop, so the
// loop itself does not limit the write into buf to n elements.
3019 size_t wxCSConv::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const 
3021     CreateConvIfNeeded(); 
3024         return m_convReal
->MB2WC(buf
, psz
, n
); 
3027     size_t len 
= strlen(psz
); 
3031         for (size_t c 
= 0; c 
<= len
; c
++) 
3032             buf
[c
] = (unsigned char)(psz
[c
]); 
// Wide -> multibyte conversion of a NUL-terminated string. The call is
// forwarded to the real converter where one is used. The remaining lines
// contain two loops over all characters of psz including the terminating
// NUL: the first one stores each character narrowed to char in buf, the
// second one only has the failure return visible. Both loops can abort
// with wxCONV_FAILED.
// NOTE(review): the conditions leading to wxCONV_FAILED are not visible;
// presumably they reject characters that do not fit into a single byte, and
// the second loop is the "compute the length only" case -- confirm.
3038 size_t wxCSConv::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const 
3040     CreateConvIfNeeded(); 
3043         return m_convReal
->WC2MB(buf
, psz
, n
); 
3046     const size_t len 
= wxWcslen(psz
); 
3049         for (size_t c 
= 0; c 
<= len
; c
++) 
3052                 return wxCONV_FAILED
; 
3054             buf
[c
] = (char)psz
[c
]; 
3059         for (size_t c 
= 0; c 
<= len
; c
++) 
3062                 return wxCONV_FAILED
; 
// Length of the terminating NUL for this charset: the question is passed on
// to the real converter where one is used; otherwise the charset is
// ISO-8859-1 (the value returned for it is not visible in this listing).
3069 size_t wxCSConv::GetMBNulLen() const 
3071     CreateConvIfNeeded(); 
3075         return m_convReal
->GetMBNulLen(); 
3078     // otherwise, we are ISO-8859-1 
3082 #if wxUSE_UNICODE_UTF8 
// Tells whether this converter works with UTF-8: the question is passed on
// to the real converter where one is used; otherwise the charset is
// ISO-8859-1 (the value returned for it is not visible in this listing).
3083 bool wxCSConv::IsUTF8() const 
3085     CreateConvIfNeeded(); 
3089         return m_convReal
->IsUTF8(); 
3092     // otherwise, we are ISO-8859-1 
// Converts a multibyte string to a wide character buffer using up to three
// converters: wxConvLibc first, then a wxMBConvUTF8 object, then
// wxConvISO8859_1. An empty wxWCharBuffer is returned on the early-exit
// path.
// NOTE(review): the conditions are not visible; presumably the early exit
// is taken for a NULL input and each further converter is tried only if the
// previous one produced nothing -- confirm.
3100 wxWCharBuffer 
wxSafeConvertMB2WX(const char *s
) 
3103         return wxWCharBuffer(); 
3105     wxWCharBuffer 
wbuf(wxConvLibc
.cMB2WX(s
)); 
3107         wbuf 
= wxMBConvUTF8().cMB2WX(s
); 
3109         wbuf 
= wxConvISO8859_1
.cMB2WX(s
); 
// Converts a wide string to a multibyte buffer: wxConvLibc is used first and
// a wxMBConvUTF8 object created with MAP_INVALID_UTF8_TO_OCTAL is the
// second choice. An empty wxCharBuffer is returned on the early-exit path.
// NOTE(review): the conditions are not visible; presumably the early exit
// is taken for a NULL input and the UTF-8 converter is used only if
// wxConvLibc produced nothing -- confirm.
3114 wxCharBuffer 
wxSafeConvertWX2MB(const wchar_t *ws
) 
3117         return wxCharBuffer(); 
3119     wxCharBuffer 
buf(wxConvLibc
.cWX2MB(ws
)); 
3121         buf 
= wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL
).cWX2MB(ws
); 
3126 #endif // wxUSE_UNICODE 
3128 // ---------------------------------------------------------------------------- 
3130 // ---------------------------------------------------------------------------- 
3132 // NB: The reason why we create converter objects in this convoluted way, 
3133 //     using a factory function instead of global variable, is that they 
3134 //     may be used at static initialization time (some of them are used by 
3135 //     wxString ctors and there may be a global wxString object). In other 
3136 //     words, possibly _before_ the converter global object would be 
3143 #undef wxConvISO8859_1 
// WX_DEFINE_GLOBAL_CONV2(klass, impl_klass, name, ctor_args) defines, for a
// converter called "name":
//   - an exported pointer variable name##Ptr, initially NULL,
//   - an exported accessor wxGet_##name##Ptr() returning the address of a
//     function-local static impl_klass object constructed with ctor_args
//     (so the object is created on the first call of the accessor),
//   - a file-level static initialized by calling that accessor.
// WX_DEFINE_GLOBAL_CONV is the shorthand for the case where the interface
// class and the implementation class are the same.
3145 #define WX_DEFINE_GLOBAL_CONV2(klass, impl_klass, name, ctor_args)      \ 
3146     WXDLLIMPEXP_DATA_BASE(klass*) name##Ptr = NULL;                     \ 
3147     WXDLLIMPEXP_BASE klass* wxGet_##name##Ptr()                         \ 
3149         static impl_klass name##Obj ctor_args;                          \ 
3150         return &name##Obj;                                              \ 
3152     /* this ensures that all global converter objects are created */    \ 
3153     /* by the time static initialization is done, i.e. before any */    \ 
3154     /* thread is launched: */                                           \ 
3155     static klass* gs_##name##instance = wxGet_##name##Ptr() 
3157 #define WX_DEFINE_GLOBAL_CONV(klass, name, ctor_args) \ 
3158     WX_DEFINE_GLOBAL_CONV2(klass, klass, name, ctor_args) 
// Definitions of the global converter objects.
//
// wxConvLibc is implemented either by wxMBConv_win32 or by wxMBConvLibc.
// NOTE(review): the preprocessor test selecting between these two
// definitions is not visible in this listing.
3161     WX_DEFINE_GLOBAL_CONV2(wxMBConv
, wxMBConv_win32
, wxConvLibc
, wxEMPTY_PARAMETER_VALUE
); 
3163     WX_DEFINE_GLOBAL_CONV2(wxMBConv
, wxMBConvLibc
, wxConvLibc
, wxEMPTY_PARAMETER_VALUE
); 
// UTF-8 and UTF-7 converters, default-constructed
3166 WX_DEFINE_GLOBAL_CONV(wxMBConvStrictUTF8
, wxConvUTF8
, wxEMPTY_PARAMETER_VALUE
); 
3167 WX_DEFINE_GLOBAL_CONV(wxMBConvUTF7
, wxConvUTF7
, wxEMPTY_PARAMETER_VALUE
); 
// wxCSConv-based converters for the system encoding and for ISO8859-1
3169 WX_DEFINE_GLOBAL_CONV(wxCSConv
, wxConvLocal
, (wxFONTENCODING_SYSTEM
)); 
3170 WX_DEFINE_GLOBAL_CONV(wxCSConv
, wxConvISO8859_1
, (wxFONTENCODING_ISO8859_1
)); 
// wxConvCurrent starts out as the libc converter and wxConvUI as the
// system-encoding one
3172 WXDLLIMPEXP_DATA_BASE(wxMBConv 
*) wxConvCurrent 
= wxGet_wxConvLibcPtr(); 
3173 WXDLLIMPEXP_DATA_BASE(wxMBConv 
*) wxConvUI 
= wxGet_wxConvLocalPtr(); 
3176 // The xnu kernel always communicates file paths in decomposed UTF-8. 
3177 // WARNING: Are we sure that CFString's conversion will cause decomposition? 
3178 static wxMBConv_cf 
wxConvMacUTF8DObj(wxFONTENCODING_UTF8
); 
// converter used for file names: the libc converter everywhere except on
// Darwin (the Darwin initializer is not visible in this listing)
3181 WXDLLIMPEXP_DATA_BASE(wxMBConv 
*) wxConvFileName 
= 
3184 #else // !__DARWIN__ 
3185                                     wxGet_wxConvLibcPtr(); 
3188 #else // !wxUSE_WCHAR_T 
3190 // FIXME-UTF8: remove this, wxUSE_WCHAR_T is required now 
3191 // stand-ins in absence of wchar_t 
3192 WXDLLIMPEXP_DATA_BASE(wxMBConv
) wxConvLibc
, 
3197 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T