src/common/strconv.cpp

   1 /////////////////////////////////////////////////////////////////////////////
   2 // Name:        src/common/strconv.cpp
   3 // Purpose:     Unicode conversion classes
   4 // Author:      Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
   5 //              Ryan Norton, Fredrik Roubert (UTF7)
   6 // Modified by:
   7 // Created:     29/01/98
   8 // RCS-ID:      $Id$
   9 // Copyright:   (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
  10 //              (c) 2000-2003 Vadim Zeitlin
  11 //              (c) 2004 Ryan Norton, Fredrik Roubert
  12 // Licence:     wxWindows licence
  13 /////////////////////////////////////////////////////////////////////////////
  14
  15 // For compilers that support precompilation, includes "wx.h".
  16 #include "wx/wxprec.h"
  17
  18 #ifdef __BORLANDC__
  19     #pragma hdrstop
  20 #endif  //__BORLANDC__
  21
  22 #ifndef WX_PRECOMP
  23     #include "wx/intl.h"
  24     #include "wx/log.h"
  25     #include "wx/utils.h"
  26     #include "wx/hashmap.h"
  27 #endif
  28
  29 #include "wx/strconv.h"
  30
  31 #if wxUSE_WCHAR_T
  32
  33 #ifndef __WXWINCE__
  34 #include <errno.h>
  35 #endif
  36
  37 #include <ctype.h>
  38 #include <string.h>
  39 #include <stdlib.h>
  40
  41 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
  42     #include "wx/msw/private.h"
  43     #include "wx/msw/missing.h"
  44     #define wxHAVE_WIN32_MB2WC
  45 #endif
  46
  47 #ifdef __SALFORDC__
  48     #include <clib.h>
  49 #endif
  50
  51 #ifdef HAVE_ICONV
  52     #include <iconv.h>
  53     #include "wx/thread.h"
  54 #endif
  55
  56 #include "wx/encconv.h"
  57 #include "wx/fontmap.h"
  58
  59 #ifdef __DARWIN__
  60 #include <CoreFoundation/CFString.h>
  61 #include <CoreFoundation/CFStringEncodingExt.h>
  62
  63 #include "wx/mac/corefoundation/cfref.h"
  64 #endif //def __DARWIN__
  65
  66 #ifdef __WXMAC__
  67 #ifndef __DARWIN__
  68 #include <ATSUnicode.h>
  69 #include <TextCommon.h>
  70 #include <TextEncodingConverter.h>
  71 #endif
  72
  73 // includes Mac headers
  74 #include "wx/mac/private.h"
  75 #endif
  76
  77
  78 #define TRACE_STRCONV _T("strconv")
  79
  80 // WC_UTF16 is defined only if sizeof(wchar_t) == 2, otherwise it's supposed to
  81 // be 4 bytes
  82 #if SIZEOF_WCHAR_T == 2
  83     #define WC_UTF16
  84 #endif
  85
  86
  87 // ============================================================================
  88 // implementation
  89 // ============================================================================
  90
  91 // helper function of cMB2WC(): check if n bytes at this location are all NUL
  92 static bool NotAllNULs(const char *p, size_t n)
  93 {
  94     while ( n && *p++ == '\0' )
  95         n--;
  96
  97     return n != 0;
  98 }
  99
 100 // ----------------------------------------------------------------------------
 101 // UTF-16 en/decoding to/from UCS-4 with surrogates handling
 102 // ----------------------------------------------------------------------------
 103
 104 static size_t encode_utf16(wxUint32 input, wxUint16 *output)
 105 {
 106     if (input <= 0xffff)
 107     {
 108         if (output)
 109             *output = (wxUint16) input;
 110
 111         return 1;
 112     }
 113     else if (input >= 0x110000)
 114     {
 115         return wxCONV_FAILED;
 116     }
 117     else
 118     {
 119         if (output)
 120         {
 121             *output++ = (wxUint16) ((input >> 10) + 0xd7c0);
 122             *output = (wxUint16) ((input & 0x3ff) + 0xdc00);
 123         }
 124
 125         return 2;
 126     }
 127 }
 128
 129 static size_t decode_utf16(const wxUint16* input, wxUint32& output)
 130 {
 131     if ((*input < 0xd800) || (*input > 0xdfff))
 132     {
 133         output = *input;
 134         return 1;
 135     }
 136     else if ((input[1] < 0xdc00) || (input[1] > 0xdfff))
 137     {
 138         output = *input;
 139         return wxCONV_FAILED;
 140     }
 141     else
 142     {
 143         output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
 144         return 2;
 145     }
 146 }
 147
 148 #ifdef WC_UTF16
 149     typedef wchar_t wxDecodeSurrogate_t;
 150 #else // !WC_UTF16
 151     typedef wxUint16 wxDecodeSurrogate_t;
 152 #endif // WC_UTF16/!WC_UTF16
 153
 154 // returns the next UTF-32 character from the wchar_t buffer and advances the
 155 // pointer to the character after this one
 156 //
 157 // if an invalid character is found, *pSrc is set to NULL, the caller must
 158 // check for this
 159 static wxUint32 wxDecodeSurrogate(const wxDecodeSurrogate_t **pSrc)
 160 {
 161     wxUint32 out;
 162     const size_t
 163         n = decode_utf16(wx_reinterpret_cast(const wxUint16 *, *pSrc), out);
 164     if ( n == wxCONV_FAILED )
 165         *pSrc = NULL;
 166     else
 167         *pSrc += n;
 168
 169     return out;
 170 }
 171
 172 // ----------------------------------------------------------------------------
 173 // wxMBConv
 174 // ----------------------------------------------------------------------------
 175
 176 size_t
 177 wxMBConv::ToWChar(wchar_t *dst, size_t dstLen,
 178                   const char *src, size_t srcLen) const
 179 {
 180     // although new conversion classes are supposed to implement this function
 181     // directly, the existins ones only implement the old MB2WC() and so, to
 182     // avoid to have to rewrite all conversion classes at once, we provide a
 183     // default (but not efficient) implementation of this one in terms of the
 184     // old function by copying the input to ensure that it's NUL-terminated and
 185     // then using MB2WC() to convert it
 186
 187     // the number of chars [which would be] written to dst [if it were not NULL]
 188     size_t dstWritten = 0;
 189
 190     // the number of NULs terminating this string
 191     size_t nulLen = 0;  // not really needed, but just to avoid warnings
 192
 193     // if we were not given the input size we just have to assume that the
 194     // string is properly terminated as we have no way of knowing how long it
 195     // is anyhow, but if we do have the size check whether there are enough
 196     // NULs at the end
 197     wxCharBuffer bufTmp;
 198     const char *srcEnd;
 199     if ( srcLen != wxNO_LEN )
 200     {
 201         // we need to know how to find the end of this string
 202         nulLen = GetMBNulLen();
 203         if ( nulLen == wxCONV_FAILED )
 204             return wxCONV_FAILED;
 205
 206         // if there are enough NULs we can avoid the copy
 207         if ( srcLen < nulLen || NotAllNULs(src + srcLen - nulLen, nulLen) )
 208         {
 209             // make a copy in order to properly NUL-terminate the string
 210             bufTmp = wxCharBuffer(srcLen + nulLen - 1 /* 1 will be added */);
 211             char * const p = bufTmp.data();
 212             memcpy(p, src, srcLen);
 213             for ( char *s = p + srcLen; s < p + srcLen + nulLen; s++ )
 214                 *s = '\0';
 215
 216             src = bufTmp;
 217         }
 218
 219         srcEnd = src + srcLen;
 220     }
 221     else // quit after the first loop iteration
 222     {
 223         srcEnd = NULL;
 224     }
 225
 226     for ( ;; )
 227     {
 228         // try to convert the current chunk
 229         size_t lenChunk = MB2WC(NULL, src, 0);
 230         if ( lenChunk == wxCONV_FAILED )
 231             return wxCONV_FAILED;
 232
 233         lenChunk++; // for the L'\0' at the end of this chunk
 234
 235         dstWritten += lenChunk;
 236
 237         if ( lenChunk == 1 )
 238         {
 239             // nothing left in the input string, conversion succeeded
 240             break;
 241         }
 242
 243         if ( dst )
 244         {
 245             if ( dstWritten > dstLen )
 246                 return wxCONV_FAILED;
 247
 248             if ( MB2WC(dst, src, lenChunk) == wxCONV_FAILED )
 249                 return wxCONV_FAILED;
 250
 251             dst += lenChunk;
 252         }
 253
 254         if ( !srcEnd )
 255         {
 256             // we convert just one chunk in this case as this is the entire
 257             // string anyhow
 258             break;
 259         }
 260
 261         // advance the input pointer past the end of this chunk
 262         while ( NotAllNULs(src, nulLen) )
 263         {
 264             // notice that we must skip over multiple bytes here as we suppose
 265             // that if NUL takes 2 or 4 bytes, then all the other characters do
 266             // too and so if advanced by a single byte we might erroneously
 267             // detect sequences of NUL bytes in the middle of the input
 268             src += nulLen;
 269         }
 270
 271         src += nulLen; // skipping over its terminator as well
 272
 273         // note that ">=" (and not just "==") is needed here as the terminator
 274         // we skipped just above could be inside or just after the buffer
 275         // delimited by inEnd
 276         if ( src >= srcEnd )
 277             break;
 278     }
 279
 280     return dstWritten;
 281 }
 282
 283 size_t
 284 wxMBConv::FromWChar(char *dst, size_t dstLen,
 285                     const wchar_t *src, size_t srcLen) const
 286 {
 287     // the number of chars [which would be] written to dst [if it were not NULL]
 288     size_t dstWritten = 0;
 289
 290     // make a copy of the input string unless it is already properly
 291     // NUL-terminated
 292     //
 293     // if we don't know its length we have no choice but to assume that it is,
 294     // indeed, properly terminated
 295     wxWCharBuffer bufTmp;
 296     if ( srcLen == wxNO_LEN )
 297     {
 298         srcLen = wxWcslen(src) + 1;
 299     }
 300     else if ( srcLen != 0 && src[srcLen - 1] != L'\0' )
 301     {
 302         // make a copy in order to properly NUL-terminate the string
 303         bufTmp = wxWCharBuffer(srcLen);
 304         memcpy(bufTmp.data(), src, srcLen * sizeof(wchar_t));
 305         src = bufTmp;
 306     }
 307
 308     const size_t lenNul = GetMBNulLen();
 309     for ( const wchar_t * const srcEnd = src + srcLen;
 310           src < srcEnd;
 311           src += wxWcslen(src) + 1 /* skip L'\0' too */ )
 312     {
 313         // try to convert the current chunk
 314         size_t lenChunk = WC2MB(NULL, src, 0);
 315
 316         if ( lenChunk == wxCONV_FAILED )
 317             return wxCONV_FAILED;
 318
 319         lenChunk += lenNul;
 320         dstWritten += lenChunk;
 321
 322         if ( dst )
 323         {
 324             if ( dstWritten > dstLen )
 325                 return wxCONV_FAILED;
 326
 327             if ( WC2MB(dst, src, lenChunk) == wxCONV_FAILED )
 328                 return wxCONV_FAILED;
 329
 330             dst += lenChunk;
 331         }
 332     }
 333
 334     return dstWritten;
 335 }
 336
 337 size_t wxMBConv::MB2WC(wchar_t *outBuff, const char *inBuff, size_t outLen) const
 338 {
 339     size_t rc = ToWChar(outBuff, outLen, inBuff);
 340     if ( rc != wxCONV_FAILED )
 341     {
 342         // ToWChar() returns the buffer length, i.e. including the trailing
 343         // NUL, while this method doesn't take it into account
 344         rc--;
 345     }
 346
 347     return rc;
 348 }
 349
 350 size_t wxMBConv::WC2MB(char *outBuff, const wchar_t *inBuff, size_t outLen) const
 351 {
 352     size_t rc = FromWChar(outBuff, outLen, inBuff);
 353     if ( rc != wxCONV_FAILED )
 354     {
 355         rc -= GetMBNulLen();
 356     }
 357
 358     return rc;
 359 }
 360
 361 wxMBConv::~wxMBConv()
 362 {
 363     // nothing to do here (necessary for Darwin linking probably)
 364 }
 365
 366 const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
 367 {
 368     if ( psz )
 369     {
 370         // calculate the length of the buffer needed first
 371         const size_t nLen = MB2WC(NULL, psz, 0);
 372         if ( nLen != wxCONV_FAILED )
 373         {
 374             // now do the actual conversion
 375             wxWCharBuffer buf(nLen /* +1 added implicitly */);
 376
 377             // +1 for the trailing NULL
 378             if ( MB2WC(buf.data(), psz, nLen + 1) != wxCONV_FAILED )
 379                 return buf;
 380         }
 381     }
 382
 383     return wxWCharBuffer();
 384 }
 385
 386 const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
 387 {
 388     if ( pwz )
 389     {
 390         const size_t nLen = WC2MB(NULL, pwz, 0);
 391         if ( nLen != wxCONV_FAILED )
 392         {
 393             // extra space for trailing NUL(s)
 394             static const size_t extraLen = GetMaxMBNulLen();
 395
 396             wxCharBuffer buf(nLen + extraLen - 1);
 397             if ( WC2MB(buf.data(), pwz, nLen + extraLen) != wxCONV_FAILED )
 398                 return buf;
 399         }
 400     }
 401
 402     return wxCharBuffer();
 403 }
 404
 405 const wxWCharBuffer
 406 wxMBConv::cMB2WC(const char *inBuff, size_t inLen, size_t *outLen) const
 407 {
 408     const size_t dstLen = ToWChar(NULL, 0, inBuff, inLen);
 409     if ( dstLen != wxCONV_FAILED )
 410     {
 411         wxWCharBuffer wbuf(dstLen - 1);
 412         if ( ToWChar(wbuf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
 413         {
 414             if ( outLen )
 415             {
 416                 *outLen = dstLen;
 417                 if ( wbuf[dstLen - 1] == L'\0' )
 418                     (*outLen)--;
 419             }
 420
 421             return wbuf;
 422         }
 423     }
 424
 425     if ( outLen )
 426         *outLen = 0;
 427
 428     return wxWCharBuffer();
 429 }
 430
 431 const wxCharBuffer
 432 wxMBConv::cWC2MB(const wchar_t *inBuff, size_t inLen, size_t *outLen) const
 433 {
 434     size_t dstLen = FromWChar(NULL, 0, inBuff, inLen);
 435     if ( dstLen != wxCONV_FAILED )
 436     {
 437         // special case of empty input: can't allocate 0 size buffer below as
 438         // wxCharBuffer insists on NUL-terminating it
 439         wxCharBuffer buf(dstLen ? dstLen - 1 : 1);
 440         if ( FromWChar(buf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
 441         {
 442             if ( outLen )
 443             {
 444                 *outLen = dstLen;
 445
 446                 const size_t nulLen = GetMBNulLen();
 447                 if ( dstLen >= nulLen &&
 448                         !NotAllNULs(buf.data() + dstLen - nulLen, nulLen) )
 449                 {
 450                     // in this case the output is NUL-terminated and we're not
 451                     // supposed to count NUL
 452                     *outLen -= nulLen;
 453                 }
 454             }
 455
 456             return buf;
 457         }
 458     }
 459
 460     if ( outLen )
 461         *outLen = 0;
 462
 463     return wxCharBuffer();
 464 }
 465
 466 // ----------------------------------------------------------------------------
 467 // wxMBConvLibc
 468 // ----------------------------------------------------------------------------
 469
 470 size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 471 {
 472     return wxMB2WC(buf, psz, n);
 473 }
 474
 475 size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 476 {
 477     return wxWC2MB(buf, psz, n);
 478 }
 479
 480 // ----------------------------------------------------------------------------
 481 // wxConvBrokenFileNames
 482 // ----------------------------------------------------------------------------
 483
 484 #ifdef __UNIX__
 485
 486 wxConvBrokenFileNames::wxConvBrokenFileNames(const wxString& charset)
 487 {
 488     if ( wxStricmp(charset, _T("UTF-8")) == 0 ||
 489          wxStricmp(charset, _T("UTF8")) == 0  )
 490         m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_PUA);
 491     else
 492         m_conv = new wxCSConv(charset);
 493 }
 494
 495 #endif // __UNIX__
 496
 497 // ----------------------------------------------------------------------------
 498 // UTF-7
 499 // ----------------------------------------------------------------------------
 500
 501 // Implementation (C) 2004 Fredrik Roubert
 502
//
// BASE64 decoding table
//
// Indexed by the byte value of an input character: the entry is the 6 bit
// value (0x00..0x3f) of the character if it belongs to the base64 alphabet
// ('A'-'Z', 'a'-'z', '0'-'9', '+' and '/') and 0xff otherwise. The table has
// 256 entries so it can be indexed by any unsigned char.
//
static const unsigned char utf7unb64[] =
{
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
    0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b,
    0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
    0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
    0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
    0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
    0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
    0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30,
    0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
};
 541
 542 size_t wxMBConvUTF7::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 543 {
 544     size_t len = 0;
 545
 546     while ( *psz && (!buf || (len < n)) )
 547     {
 548         unsigned char cc = *psz++;
 549         if (cc != '+')
 550         {
 551             // plain ASCII char
 552             if (buf)
 553                 *buf++ = cc;
 554             len++;
 555         }
 556         else if (*psz == '-')
 557         {
 558             // encoded plus sign
 559             if (buf)
 560                 *buf++ = cc;
 561             len++;
 562             psz++;
 563         }
 564         else // start of BASE64 encoded string
 565         {
 566             bool lsb, ok;
 567             unsigned int d, l;
 568             for ( ok = lsb = false, d = 0, l = 0;
 569                   (cc = utf7unb64[(unsigned char)*psz]) != 0xff;
 570                   psz++ )
 571             {
 572                 d <<= 6;
 573                 d += cc;
 574                 for (l += 6; l >= 8; lsb = !lsb)
 575                 {
 576                     unsigned char c = (unsigned char)((d >> (l -= 8)) % 256);
 577                     if (lsb)
 578                     {
 579                         if (buf)
 580                             *buf++ |= c;
 581                         len ++;
 582                     }
 583                     else
 584                     {
 585                         if (buf)
 586                             *buf = (wchar_t)(c << 8);
 587                     }
 588
 589                     ok = true;
 590                 }
 591             }
 592
 593             if ( !ok )
 594             {
 595                 // in valid UTF7 we should have valid characters after '+'
 596                 return wxCONV_FAILED;
 597             }
 598
 599             if (*psz == '-')
 600                 psz++;
 601         }
 602     }
 603
 604     if ( buf && (len < n) )
 605         *buf = '\0';
 606
 607     return len;
 608 }
 609
//
// BASE64 encoding table
//
// Maps a 6 bit value (0..63) to the corresponding character of the base64
// alphabet.
//
static const unsigned char utf7enb64[] =
{
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
    'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
    'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
    'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
    'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
    'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
    'w', 'x', 'y', 'z', '0', '1', '2', '3',
    '4', '5', '6', '7', '8', '9', '+', '/'
};
 624
//
// UTF-7 encoding table
//
// Gives the class of each of the 128 ASCII characters:
//
// 0 - Set D (directly encoded characters)
// 1 - Set O (optional direct characters)
// 2 - whitespace characters (optional)
// 3 - special characters
//
// Only the characters of class 0 are written directly by
// wxMBConvUTF7::WC2MB(), all the others are base64-encoded.
//
static const unsigned char utf7encode[128] =
{
    3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
};
 644
 645 size_t wxMBConvUTF7::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 646 {
 647     size_t len = 0;
 648
 649     while (*psz && ((!buf) || (len < n)))
 650     {
 651         wchar_t cc = *psz++;
 652         if (cc < 0x80 && utf7encode[cc] < 1)
 653         {
 654             // plain ASCII char
 655             if (buf)
 656                 *buf++ = (char)cc;
 657
 658             len++;
 659         }
 660 #ifndef WC_UTF16
 661         else if (((wxUint32)cc) > 0xffff)
 662         {
 663             // no surrogate pair generation (yet?)
 664             return wxCONV_FAILED;
 665         }
 666 #endif
 667         else
 668         {
 669             if (buf)
 670                 *buf++ = '+';
 671
 672             len++;
 673             if (cc != '+')
 674             {
 675                 // BASE64 encode string
 676                 unsigned int lsb, d, l;
 677                 for (d = 0, l = 0; /*nothing*/; psz++)
 678                 {
 679                     for (lsb = 0; lsb < 2; lsb ++)
 680                     {
 681                         d <<= 8;
 682                         d += lsb ? cc & 0xff : (cc & 0xff00) >> 8;
 683
 684                         for (l += 8; l >= 6; )
 685                         {
 686                             l -= 6;
 687                             if (buf)
 688                                 *buf++ = utf7enb64[(d >> l) % 64];
 689                             len++;
 690                         }
 691                     }
 692
 693                     cc = *psz;
 694                     if (!(cc) || (cc < 0x80 && utf7encode[cc] < 1))
 695                         break;
 696                 }
 697
 698                 if (l != 0)
 699                 {
 700                     if (buf)
 701                         *buf++ = utf7enb64[((d % 16) << (6 - l)) % 64];
 702
 703                     len++;
 704                 }
 705             }
 706
 707             if (buf)
 708                 *buf++ = '-';
 709             len++;
 710         }
 711     }
 712
 713     if (buf && (len < n))
 714         *buf = 0;
 715
 716     return len;
 717 }
 718
 719 // ----------------------------------------------------------------------------
 720 // UTF-8
 721 // ----------------------------------------------------------------------------
 722
// utf8_max[i] is the largest value which is encoded using i + 1 bytes: it is
// used by wxMBConvUTF8::WC2MB() to choose the length of the sequence and by
// wxMBConvUTF8::MB2WC() to reject sequences longer than necessary for the
// value they encode
static wxUint32 utf8_max[]=
    { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
 725
// boundaries of the part of the private use area we use to (temporarily)
// remap the bytes which are invalid in a UTF-8 encoded string
 728 const wxUint32 wxUnicodePUA = 0x100000;
 729 const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
 730
// Decode the NUL-terminated UTF-8 string psz.
//
// If buf is not NULL the result is stored in it, the loops stop once n wide
// characters have been produced and the output is NUL-terminated if there is
// still room for the terminator; if buf is NULL only the length is computed.
//
// Invalid input is handled according to m_options: each byte of an invalid
// sequence is either mapped to the range starting at wxUnicodePUA
// (MAP_INVALID_UTF8_TO_PUA) or written as a backslash followed by 3 octal
// digits (MAP_INVALID_UTF8_TO_OCTAL, in which case a valid backslash is
// doubled), otherwise the conversion fails.
//
// Returns the number of wide characters, not counting the trailing NUL, or
// wxCONV_FAILED.
//
// NOTE(review): n is only compared with len before decoding a character, so
// the paths writing more than one wchar_t at once (surrogate pairs under
// WC_UTF16) look like they could write one element past n -- to be confirmed.
size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const
{
    size_t len = 0;

    while (*psz && ((!buf) || (len < n)))
    {
        // remember the start of the sequence to be able to re-read its bytes
        // if it turns out to be invalid
        const char *opsz = psz;
        bool invalid = false;
        unsigned char cc = *psz++, fc = cc;

        // count the leading 1 bits of the first byte
        unsigned cnt;
        for (cnt = 0; fc & 0x80; cnt++)
            fc <<= 1;

        if (!cnt)
        {
            // plain ASCII char
            if (buf)
                *buf++ = cc;
            len++;

            // escape the escape character for octal escapes
            if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
                    && cc == '\\' && (!buf || len < n))
            {
                if (buf)
                    *buf++ = cc;
                len++;
            }
        }
        else
        {
            // from now on cnt is the number of continuation bytes expected
            cnt--;
            if (!cnt)
            {
                // invalid UTF-8 sequence
                // (a continuation byte can't start a sequence)
                invalid = true;
            }
            else
            {
                // index in utf8_max of the largest value which would fit in
                // a sequence one byte shorter than this one
                unsigned ocnt = cnt - 1;

                // the payload bits of the first byte
                wxUint32 res = cc & (0x3f >> cnt);
                while (cnt--)
                {
                    cc = *psz;
                    if ((cc & 0xC0) != 0x80)
                    {
                        // invalid UTF-8 sequence
                        // (the offending byte is not consumed, this also
                        // stops at the terminating NUL)
                        invalid = true;
                        break;
                    }

                    psz++;
                    res = (res << 6) | (cc & 0x3f);
                }

                if (invalid || res <= utf8_max[ocnt])
                {
                    // illegal UTF-8 encoding
                    // (the value would have fit in a shorter sequence)
                    invalid = true;
                }
                else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
                        res >= wxUnicodePUA && res < wxUnicodePUAEnd)
                {
                    // if one of our PUA characters turns up externally
                    // it must also be treated as an illegal sequence
                    // (a bit like you have to escape an escape character)
                    invalid = true;
                }
                else
                {
#ifdef WC_UTF16
                    // cast is ok because wchar_t == wxUint16 if WC_UTF16
                    size_t pa = encode_utf16(res, (wxUint16 *)buf);
                    if (pa == wxCONV_FAILED)
                    {
                        invalid = true;
                    }
                    else
                    {
                        if (buf)
                            buf += pa;
                        len += pa;
                    }
#else // !WC_UTF16
                    if (buf)
                        *buf++ = (wchar_t)res;
                    len++;
#endif // WC_UTF16/!WC_UTF16
                }
            }

            if (invalid)
            {
                // process the bytes from opsz up to (but not including) psz
                // individually
                if (m_options & MAP_INVALID_UTF8_TO_PUA)
                {
                    while (opsz < psz && (!buf || len < n))
                    {
#ifdef WC_UTF16
                        // cast is ok because wchar_t == wxUint16 if WC_UTF16
                        size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
                        wxASSERT(pa != wxCONV_FAILED);
                        if (buf)
                            buf += pa;
                        opsz++;
                        len += pa;
#else
                        if (buf)
                            *buf++ = (wchar_t)(wxUnicodePUA + (unsigned char)*opsz);
                        opsz++;
                        len++;
#endif
                    }
                }
                else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
                {
                    while (opsz < psz && (!buf || len < n))
                    {
                        // the escape is only written if all 4 characters
                        // fit but len is increased by 4 in any case
                        if ( buf && len + 3 < n )
                        {
                            unsigned char on = *opsz;
                            *buf++ = L'\\';
                            *buf++ = (wchar_t)( L'0' + on / 0100 );
                            *buf++ = (wchar_t)( L'0' + (on % 0100) / 010 );
                            *buf++ = (wchar_t)( L'0' + on % 010 );
                        }

                        opsz++;
                        len += 4;
                    }
                }
                else // MAP_INVALID_UTF8_NOT
                {
                    return wxCONV_FAILED;
                }
            }
        }
    }

    if (buf && (len < n))
        *buf = 0;

    return len;
}
 874
 875 static inline bool isoctal(wchar_t wch)
 876 {
 877     return L'0' <= wch && wch <= L'7';
 878 }
 879
 880 size_t wxMBConvUTF8::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 881 {
 882     size_t len = 0;
 883
 884     while (*psz && ((!buf) || (len < n)))
 885     {
 886         wxUint32 cc;
 887
 888 #ifdef WC_UTF16
 889         // cast is ok for WC_UTF16
 890         size_t pa = decode_utf16((const wxUint16 *)psz, cc);
 891         psz += (pa == wxCONV_FAILED) ? 1 : pa;
 892 #else
 893         cc = (*psz++) & 0x7fffffff;
 894 #endif
 895
 896         if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
 897                 && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
 898         {
 899             if (buf)
 900                 *buf++ = (char)(cc - wxUnicodePUA);
 901             len++;
 902         }
 903         else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL)
 904                     && cc == L'\\' && psz[0] == L'\\' )
 905         {
 906             if (buf)
 907                 *buf++ = (char)cc;
 908             psz++;
 909             len++;
 910         }
 911         else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
 912                     cc == L'\\' &&
 913                         isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
 914         {
 915             if (buf)
 916             {
 917                 *buf++ = (char) ((psz[0] - L'0') * 0100 +
 918                                  (psz[1] - L'0') * 010 +
 919                                  (psz[2] - L'0'));
 920             }
 921
 922             psz += 3;
 923             len++;
 924         }
 925         else
 926         {
 927             unsigned cnt;
 928             for (cnt = 0; cc > utf8_max[cnt]; cnt++)
 929             {
 930             }
 931
 932             if (!cnt)
 933             {
 934                 // plain ASCII char
 935                 if (buf)
 936                     *buf++ = (char) cc;
 937                 len++;
 938             }
 939             else
 940             {
 941                 len += cnt + 1;
 942                 if (buf)
 943                 {
 944                     *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
 945                     while (cnt--)
 946                         *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
 947                 }
 948             }
 949         }
 950     }
 951
 952     if (buf && (len < n))
 953         *buf = 0;
 954
 955     return len;
 956 }
 957
 958 // ============================================================================
 959 // UTF-16
 960 // ============================================================================
 961
 962 #ifdef WORDS_BIGENDIAN
 963     #define wxMBConvUTF16straight wxMBConvUTF16BE
 964     #define wxMBConvUTF16swap     wxMBConvUTF16LE
 965 #else
 966     #define wxMBConvUTF16swap     wxMBConvUTF16BE
 967     #define wxMBConvUTF16straight wxMBConvUTF16LE
 968 #endif
 969
 970 /* static */
 971 size_t wxMBConvUTF16Base::GetLength(const char *src, size_t srcLen)
 972 {
 973     if ( srcLen == wxNO_LEN )
 974     {
 975         // count the number of bytes in input, including the trailing NULs
 976         const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
 977         for ( srcLen = 1; *inBuff++; srcLen++ )
 978             ;
 979
 980         srcLen *= BYTES_PER_CHAR;
 981     }
 982     else // we already have the length
 983     {
 984         // we can only convert an entire number of UTF-16 characters
 985         if ( srcLen % BYTES_PER_CHAR )
 986             return wxCONV_FAILED;
 987     }
 988
 989     return srcLen;
 990 }
 991
 992 // case when in-memory representation is UTF-16 too
 993 #ifdef WC_UTF16
 994
 995 // ----------------------------------------------------------------------------
 996 // conversions without endianness change
 997 // ----------------------------------------------------------------------------
 998
 999 size_t
1000 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1001                                const char *src, size_t srcLen) const
1002 {
1003     // set up the scene for using memcpy() (which is presumably more efficient
1004     // than copying the bytes one by one)
1005     srcLen = GetLength(src, srcLen);
1006     if ( srcLen == wxNO_LEN )
1007         return wxCONV_FAILED;
1008
1009     const size_t inLen = srcLen / BYTES_PER_CHAR;
1010     if ( dst )
1011     {
1012         if ( dstLen < inLen )
1013             return wxCONV_FAILED;
1014
1015         memcpy(dst, src, srcLen);
1016     }
1017
1018     return inLen;
1019 }
1020
1021 size_t
1022 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1023                                  const wchar_t *src, size_t srcLen) const
1024 {
1025     if ( srcLen == wxNO_LEN )
1026         srcLen = wxWcslen(src) + 1;
1027
1028     srcLen *= BYTES_PER_CHAR;
1029
1030     if ( dst )
1031     {
1032         if ( dstLen < srcLen )
1033             return wxCONV_FAILED;
1034
1035         memcpy(dst, src, srcLen);
1036     }
1037
1038     return srcLen;
1039 }
1040
1041 // ----------------------------------------------------------------------------
1042 // endian-reversing conversions
1043 // ----------------------------------------------------------------------------
1044
1045 size_t
1046 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1047                            const char *src, size_t srcLen) const
1048 {
1049     srcLen = GetLength(src, srcLen);
1050     if ( srcLen == wxNO_LEN )
1051         return wxCONV_FAILED;
1052
1053     srcLen /= BYTES_PER_CHAR;
1054
1055     if ( dst )
1056     {
1057         if ( dstLen < srcLen )
1058             return wxCONV_FAILED;
1059
1060         const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1061         for ( size_t n = 0; n < srcLen; n++, inBuff++ )
1062         {
1063             *dst++ = wxUINT16_SWAP_ALWAYS(*inBuff);
1064         }
1065     }
1066
1067     return srcLen;
1068 }
1069
1070 size_t
1071 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1072                              const wchar_t *src, size_t srcLen) const
1073 {
1074     if ( srcLen == wxNO_LEN )
1075         srcLen = wxWcslen(src) + 1;
1076
1077     srcLen *= BYTES_PER_CHAR;
1078
1079     if ( dst )
1080     {
1081         if ( dstLen < srcLen )
1082             return wxCONV_FAILED;
1083
1084         wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1085         for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
1086         {
1087             *outBuff++ = wxUINT16_SWAP_ALWAYS(*src);
1088         }
1089     }
1090
1091     return srcLen;
1092 }
1093
1094 #else // !WC_UTF16: wchar_t is UTF-32
1095
1096 // ----------------------------------------------------------------------------
1097 // conversions without endianness change
1098 // ----------------------------------------------------------------------------
1099
1100 size_t
1101 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1102                                const char *src, size_t srcLen) const
1103 {
1104     srcLen = GetLength(src, srcLen);
1105     if ( srcLen == wxNO_LEN )
1106         return wxCONV_FAILED;
1107
1108     const size_t inLen = srcLen / BYTES_PER_CHAR;
1109     if ( !dst )
1110     {
1111         // optimization: return maximal space which could be needed for this
1112         // string even if the real size could be smaller if the buffer contains
1113         // any surrogates
1114         return inLen;
1115     }
1116
1117     size_t outLen = 0;
1118     const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1119     for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1120     {
1121         const wxUint32 ch = wxDecodeSurrogate(&inBuff);
1122         if ( !inBuff )
1123             return wxCONV_FAILED;
1124
1125         if ( ++outLen > dstLen )
1126             return wxCONV_FAILED;
1127
1128         *dst++ = ch;
1129     }
1130
1131
1132     return outLen;
1133 }
1134
1135 size_t
1136 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1137                                  const wchar_t *src, size_t srcLen) const
1138 {
1139     if ( srcLen == wxNO_LEN )
1140         srcLen = wxWcslen(src) + 1;
1141
1142     size_t outLen = 0;
1143     wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1144     for ( size_t n = 0; n < srcLen; n++ )
1145     {
1146         wxUint16 cc[2];
1147         const size_t numChars = encode_utf16(*src++, cc);
1148         if ( numChars == wxCONV_FAILED )
1149             return wxCONV_FAILED;
1150
1151         outLen += numChars * BYTES_PER_CHAR;
1152         if ( outBuff )
1153         {
1154             if ( outLen > dstLen )
1155                 return wxCONV_FAILED;
1156
1157             *outBuff++ = cc[0];
1158             if ( numChars == 2 )
1159             {
1160                 // second character of a surrogate
1161                 *outBuff++ = cc[1];
1162             }
1163         }
1164     }
1165
1166     return outLen;
1167 }
1168
1169 // ----------------------------------------------------------------------------
1170 // endian-reversing conversions
1171 // ----------------------------------------------------------------------------
1172
1173 size_t
1174 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1175                            const char *src, size_t srcLen) const
1176 {
1177     srcLen = GetLength(src, srcLen);
1178     if ( srcLen == wxNO_LEN )
1179         return wxCONV_FAILED;
1180
1181     const size_t inLen = srcLen / BYTES_PER_CHAR;
1182     if ( !dst )
1183     {
1184         // optimization: return maximal space which could be needed for this
1185         // string even if the real size could be smaller if the buffer contains
1186         // any surrogates
1187         return inLen;
1188     }
1189
1190     size_t outLen = 0;
1191     const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1192     for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1193     {
1194         wxUint32 ch;
1195         wxUint16 tmp[2];
1196
1197         tmp[0] = wxUINT16_SWAP_ALWAYS(*inBuff);
1198         inBuff++;
1199         tmp[1] = wxUINT16_SWAP_ALWAYS(*inBuff);
1200
1201         const size_t numChars = decode_utf16(tmp, ch);
1202         if ( numChars == wxCONV_FAILED )
1203             return wxCONV_FAILED;
1204
1205         if ( numChars == 2 )
1206             inBuff++;
1207
1208         if ( ++outLen > dstLen )
1209             return wxCONV_FAILED;
1210
1211         *dst++ = ch;
1212     }
1213
1214
1215     return outLen;
1216 }
1217
1218 size_t
1219 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1220                              const wchar_t *src, size_t srcLen) const
1221 {
1222     if ( srcLen == wxNO_LEN )
1223         srcLen = wxWcslen(src) + 1;
1224
1225     size_t outLen = 0;
1226     wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1227     for ( const wchar_t *srcEnd = src + srcLen; src < srcEnd; src++ )
1228     {
1229         wxUint16 cc[2];
1230         const size_t numChars = encode_utf16(*src, cc);
1231         if ( numChars == wxCONV_FAILED )
1232             return wxCONV_FAILED;
1233
1234         outLen += numChars * BYTES_PER_CHAR;
1235         if ( outBuff )
1236         {
1237             if ( outLen > dstLen )
1238                 return wxCONV_FAILED;
1239
1240             *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[0]);
1241             if ( numChars == 2 )
1242             {
1243                 // second character of a surrogate
1244                 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[1]);
1245             }
1246         }
1247     }
1248
1249     return outLen;
1250 }
1251
1252 #endif // WC_UTF16/!WC_UTF16
1253
1254
1255 // ============================================================================
1256 // UTF-32
1257 // ============================================================================
1258
1259 #ifdef WORDS_BIGENDIAN
1260     #define wxMBConvUTF32straight  wxMBConvUTF32BE
1261     #define wxMBConvUTF32swap      wxMBConvUTF32LE
1262 #else
1263     #define wxMBConvUTF32swap      wxMBConvUTF32BE
1264     #define wxMBConvUTF32straight  wxMBConvUTF32LE
1265 #endif
1266
1267
1268 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
1269 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
1270
1271 /* static */
1272 size_t wxMBConvUTF32Base::GetLength(const char *src, size_t srcLen)
1273 {
1274     if ( srcLen == wxNO_LEN )
1275     {
1276         // count the number of bytes in input, including the trailing NULs
1277         const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1278         for ( srcLen = 1; *inBuff++; srcLen++ )
1279             ;
1280
1281         srcLen *= BYTES_PER_CHAR;
1282     }
1283     else // we already have the length
1284     {
1285         // we can only convert an entire number of UTF-32 characters
1286         if ( srcLen % BYTES_PER_CHAR )
1287             return wxCONV_FAILED;
1288     }
1289
1290     return srcLen;
1291 }
1292
1293 // case when in-memory representation is UTF-16
1294 #ifdef WC_UTF16
1295
1296 // ----------------------------------------------------------------------------
1297 // conversions without endianness change
1298 // ----------------------------------------------------------------------------
1299
1300 size_t
1301 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1302                                const char *src, size_t srcLen) const
1303 {
1304     srcLen = GetLength(src, srcLen);
1305     if ( srcLen == wxNO_LEN )
1306         return wxCONV_FAILED;
1307
1308     const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1309     const size_t inLen = srcLen / BYTES_PER_CHAR;
1310     size_t outLen = 0;
1311     for ( size_t n = 0; n < inLen; n++ )
1312     {
1313         wxUint16 cc[2];
1314         const size_t numChars = encode_utf16(*inBuff++, cc);
1315         if ( numChars == wxCONV_FAILED )
1316             return wxCONV_FAILED;
1317
1318         outLen += numChars;
1319         if ( dst )
1320         {
1321             if ( outLen > dstLen )
1322                 return wxCONV_FAILED;
1323
1324             *dst++ = cc[0];
1325             if ( numChars == 2 )
1326             {
1327                 // second character of a surrogate
1328                 *dst++ = cc[1];
1329             }
1330         }
1331     }
1332
1333     return outLen;
1334 }
1335
1336 size_t
1337 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1338                                  const wchar_t *src, size_t srcLen) const
1339 {
1340     if ( srcLen == wxNO_LEN )
1341         srcLen = wxWcslen(src) + 1;
1342
1343     if ( !dst )
1344     {
1345         // optimization: return maximal space which could be needed for this
1346         // string instead of the exact amount which could be less if there are
1347         // any surrogates in the input
1348         //
1349         // we consider that surrogates are rare enough to make it worthwhile to
1350         // avoid running the loop below at the cost of slightly extra memory
1351         // consumption
1352         return srcLen * BYTES_PER_CHAR;
1353     }
1354
1355     wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1356     size_t outLen = 0;
1357     for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1358     {
1359         const wxUint32 ch = wxDecodeSurrogate(&src);
1360         if ( !src )
1361             return wxCONV_FAILED;
1362
1363         outLen += BYTES_PER_CHAR;
1364
1365         if ( outLen > dstLen )
1366             return wxCONV_FAILED;
1367
1368         *outBuff++ = ch;
1369     }
1370
1371     return outLen;
1372 }
1373
1374 // ----------------------------------------------------------------------------
1375 // endian-reversing conversions
1376 // ----------------------------------------------------------------------------
1377
1378 size_t
1379 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1380                            const char *src, size_t srcLen) const
1381 {
1382     srcLen = GetLength(src, srcLen);
1383     if ( srcLen == wxNO_LEN )
1384         return wxCONV_FAILED;
1385
1386     const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1387     const size_t inLen = srcLen / BYTES_PER_CHAR;
1388     size_t outLen = 0;
1389     for ( size_t n = 0; n < inLen; n++, inBuff++ )
1390     {
1391         wxUint16 cc[2];
1392         const size_t numChars = encode_utf16(wxUINT32_SWAP_ALWAYS(*inBuff), cc);
1393         if ( numChars == wxCONV_FAILED )
1394             return wxCONV_FAILED;
1395
1396         outLen += numChars;
1397         if ( dst )
1398         {
1399             if ( outLen > dstLen )
1400                 return wxCONV_FAILED;
1401
1402             *dst++ = cc[0];
1403             if ( numChars == 2 )
1404             {
1405                 // second character of a surrogate
1406                 *dst++ = cc[1];
1407             }
1408         }
1409     }
1410
1411     return outLen;
1412 }
1413
1414 size_t
1415 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1416                              const wchar_t *src, size_t srcLen) const
1417 {
1418     if ( srcLen == wxNO_LEN )
1419         srcLen = wxWcslen(src) + 1;
1420
1421     if ( !dst )
1422     {
1423         // optimization: return maximal space which could be needed for this
1424         // string instead of the exact amount which could be less if there are
1425         // any surrogates in the input
1426         //
1427         // we consider that surrogates are rare enough to make it worthwhile to
1428         // avoid running the loop below at the cost of slightly extra memory
1429         // consumption
1430         return srcLen*BYTES_PER_CHAR;
1431     }
1432
1433     wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1434     size_t outLen = 0;
1435     for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1436     {
1437         const wxUint32 ch = wxDecodeSurrogate(&src);
1438         if ( !src )
1439             return wxCONV_FAILED;
1440
1441         outLen += BYTES_PER_CHAR;
1442
1443         if ( outLen > dstLen )
1444             return wxCONV_FAILED;
1445
1446         *outBuff++ = wxUINT32_SWAP_ALWAYS(ch);
1447     }
1448
1449     return outLen;
1450 }
1451
1452 #else // !WC_UTF16: wchar_t is UTF-32
1453
1454 // ----------------------------------------------------------------------------
1455 // conversions without endianness change
1456 // ----------------------------------------------------------------------------
1457
1458 size_t
1459 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1460                                const char *src, size_t srcLen) const
1461 {
1462     // use memcpy() as it should be much faster than hand-written loop
1463     srcLen = GetLength(src, srcLen);
1464     if ( srcLen == wxNO_LEN )
1465         return wxCONV_FAILED;
1466
1467     const size_t inLen = srcLen/BYTES_PER_CHAR;
1468     if ( dst )
1469     {
1470         if ( dstLen < inLen )
1471             return wxCONV_FAILED;
1472
1473         memcpy(dst, src, srcLen);
1474     }
1475
1476     return inLen;
1477 }
1478
1479 size_t
1480 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1481                                  const wchar_t *src, size_t srcLen) const
1482 {
1483     if ( srcLen == wxNO_LEN )
1484         srcLen = wxWcslen(src) + 1;
1485
1486     srcLen *= BYTES_PER_CHAR;
1487
1488     if ( dst )
1489     {
1490         if ( dstLen < srcLen )
1491             return wxCONV_FAILED;
1492
1493         memcpy(dst, src, srcLen);
1494     }
1495
1496     return srcLen;
1497 }
1498
1499 // ----------------------------------------------------------------------------
1500 // endian-reversing conversions
1501 // ----------------------------------------------------------------------------
1502
1503 size_t
1504 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1505                            const char *src, size_t srcLen) const
1506 {
1507     srcLen = GetLength(src, srcLen);
1508     if ( srcLen == wxNO_LEN )
1509         return wxCONV_FAILED;
1510
1511     srcLen /= BYTES_PER_CHAR;
1512
1513     if ( dst )
1514     {
1515         if ( dstLen < srcLen )
1516             return wxCONV_FAILED;
1517
1518         const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1519         for ( size_t n = 0; n < srcLen; n++, inBuff++ )
1520         {
1521             *dst++ = wxUINT32_SWAP_ALWAYS(*inBuff);
1522         }
1523     }
1524
1525     return srcLen;
1526 }
1527
1528 size_t
1529 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1530                              const wchar_t *src, size_t srcLen) const
1531 {
1532     if ( srcLen == wxNO_LEN )
1533         srcLen = wxWcslen(src) + 1;
1534
1535     srcLen *= BYTES_PER_CHAR;
1536
1537     if ( dst )
1538     {
1539         if ( dstLen < srcLen )
1540             return wxCONV_FAILED;
1541
1542         wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1543         for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
1544         {
1545             *outBuff++ = wxUINT32_SWAP_ALWAYS(*src);
1546         }
1547     }
1548
1549     return srcLen;
1550 }
1551
1552 #endif // WC_UTF16/!WC_UTF16
1553
1554
1555 // ============================================================================
1556 // The classes doing conversion using the iconv_xxx() functions
1557 // ============================================================================
1558
1559 #ifdef HAVE_ICONV
1560
1561 // VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
1562 //     E2BIG if output buffer is _exactly_ as big as needed. Such case is
1563 //     (unless there's yet another bug in glibc) the only case when iconv()
1564 //     returns with (size_t)-1 (which means error) and says there are 0 bytes
1565 //     left in the input buffer -- when _real_ error occurs,
1566 //     bytes-left-in-input buffer is non-zero. Hence, this alternative test for
1567 //     iconv() failure.
1568 //     [This bug does not appear in glibc 2.2.]
1569 #if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
1570 #define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
1571                                      (errno != E2BIG || bufLeft != 0))
1572 #else
1573 #define ICONV_FAILED(cres, bufLeft)  (cres == (size_t)-1)
1574 #endif
1575
1576 #define ICONV_CHAR_CAST(x)  ((ICONV_CONST char **)(x))
1577
1578 #define ICONV_T_INVALID ((iconv_t)-1)
1579
1580 #if SIZEOF_WCHAR_T == 4
1581     #define WC_BSWAP    wxUINT32_SWAP_ALWAYS
1582     #define WC_ENC      wxFONTENCODING_UTF32
1583 #elif SIZEOF_WCHAR_T == 2
1584     #define WC_BSWAP    wxUINT16_SWAP_ALWAYS
1585     #define WC_ENC      wxFONTENCODING_UTF16
1586 #else // sizeof(wchar_t) != 2 nor 4
1587     // does this ever happen?
1588     #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
1589 #endif
1590
1591 // ----------------------------------------------------------------------------
1592 // wxMBConv_iconv: encapsulates an iconv character set
1593 // ----------------------------------------------------------------------------
1594
1595 class wxMBConv_iconv : public wxMBConv
1596 {
1597 public:
1598     wxMBConv_iconv(const char *name);
1599     virtual ~wxMBConv_iconv();
1600
1601     virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const;
1602     virtual size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const;
1603
1604     // classify this encoding as explained in wxMBConv::GetMBNulLen() comment
1605     virtual size_t GetMBNulLen() const;
1606
1607 #if wxUSE_UNICODE_UTF8
1608     virtual bool IsUTF8() const;
1609 #endif
1610
1611     virtual wxMBConv *Clone() const
1612     {
1613         wxMBConv_iconv *p = new wxMBConv_iconv(m_name.ToAscii());
1614         p->m_minMBCharWidth = m_minMBCharWidth;
1615         return p;
1616     }
1617
1618     bool IsOk() const
1619         { return (m2w != ICONV_T_INVALID) && (w2m != ICONV_T_INVALID); }
1620
1621 protected:
1622     // the iconv handlers used to translate from multibyte
1623     // to wide char and in the other direction
1624     iconv_t m2w,
1625             w2m;
1626
1627 #if wxUSE_THREADS
1628     // guards access to m2w and w2m objects
1629     wxMutex m_iconvMutex;
1630 #endif
1631
1632 private:
1633     // the name (for iconv_open()) of a wide char charset -- if none is
1634     // available on this machine, it will remain NULL
1635     static wxString ms_wcCharsetName;
1636
1637     // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
1638     // different endian-ness than the native one
1639     static bool ms_wcNeedsSwap;
1640
1641
1642     // name of the encoding handled by this conversion
1643     wxString m_name;
1644
1645     // cached result of GetMBNulLen(); set to 0 meaning "unknown"
1646     // initially
1647     size_t m_minMBCharWidth;
1648 };
1649
1650 // make the constructor available for unit testing
1651 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_iconv( const char* name )
1652 {
1653     wxMBConv_iconv* result = new wxMBConv_iconv( name );
1654     if ( !result->IsOk() )
1655     {
1656         delete result;
1657         return 0;
1658     }
1659
1660     return result;
1661 }
1662
1663 wxString wxMBConv_iconv::ms_wcCharsetName;
1664 bool wxMBConv_iconv::ms_wcNeedsSwap = false;
1665
1666 wxMBConv_iconv::wxMBConv_iconv(const char *name)
1667               : m_name(name)
1668 {
1669     m_minMBCharWidth = 0;
1670
1671     // check for charset that represents wchar_t:
1672     if ( ms_wcCharsetName.empty() )
1673     {
1674         wxLogTrace(TRACE_STRCONV, _T("Looking for wide char codeset:"));
1675
1676 #if wxUSE_FONTMAP
1677         const wxChar **names = wxFontMapperBase::GetAllEncodingNames(WC_ENC);
1678 #else // !wxUSE_FONTMAP
1679         static const wxChar *names_static[] =
1680         {
1681 #if SIZEOF_WCHAR_T == 4
1682             _T("UCS-4"),
1683 #elif SIZEOF_WCHAR_T = 2
1684             _T("UCS-2"),
1685 #endif
1686             NULL
1687         };
1688         const wxChar **names = names_static;
1689 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
1690
1691         for ( ; *names && ms_wcCharsetName.empty(); ++names )
1692         {
1693             const wxString nameCS(*names);
1694
1695             // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
1696             wxString nameXE(nameCS);
1697
1698 #ifdef WORDS_BIGENDIAN
1699                 nameXE += _T("BE");
1700 #else // little endian
1701                 nameXE += _T("LE");
1702 #endif
1703
1704             wxLogTrace(TRACE_STRCONV, _T("  trying charset \"%s\""),
1705                        nameXE.c_str());
1706
1707             m2w = iconv_open(nameXE.ToAscii(), name);
1708             if ( m2w == ICONV_T_INVALID )
1709             {
1710                 // try charset w/o bytesex info (e.g. "UCS4")
1711                 wxLogTrace(TRACE_STRCONV, _T("  trying charset \"%s\""),
1712                            nameCS.c_str());
1713                 m2w = iconv_open(nameCS.ToAscii(), name);
1714
1715                 // and check for bytesex ourselves:
1716                 if ( m2w != ICONV_T_INVALID )
1717                 {
1718                     char    buf[2], *bufPtr;
1719                     wchar_t wbuf[2], *wbufPtr;
1720                     size_t  insz, outsz;
1721                     size_t  res;
1722
1723                     buf[0] = 'A';
1724                     buf[1] = 0;
1725                     wbuf[0] = 0;
1726                     insz = 2;
1727                     outsz = SIZEOF_WCHAR_T * 2;
1728                     wbufPtr = wbuf;
1729                     bufPtr = buf;
1730
1731                     res = iconv(
1732                         m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
1733                         (char**)&wbufPtr, &outsz);
1734
1735                     if (ICONV_FAILED(res, insz))
1736                     {
1737                         wxLogLastError(wxT("iconv"));
1738                         wxLogError(_("Conversion to charset '%s' doesn't work."),
1739                                    nameCS.c_str());
1740                     }
1741                     else // ok, can convert to this encoding, remember it
1742                     {
1743                         ms_wcCharsetName = nameCS;
1744                         ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
1745                     }
1746                 }
1747             }
1748             else // use charset not requiring byte swapping
1749             {
1750                 ms_wcCharsetName = nameXE;
1751             }
1752         }
1753
1754         wxLogTrace(TRACE_STRCONV,
1755                    wxT("iconv wchar_t charset is \"%s\"%s"),
1756                    ms_wcCharsetName.empty() ? wxString("<none>")
1757                                             : ms_wcCharsetName,
1758                    ms_wcNeedsSwap ? _T(" (needs swap)")
1759                                   : _T(""));
1760     }
1761     else // we already have ms_wcCharsetName
1762     {
1763         m2w = iconv_open(ms_wcCharsetName.ToAscii(), name);
1764     }
1765
1766     if ( ms_wcCharsetName.empty() )
1767     {
1768         w2m = ICONV_T_INVALID;
1769     }
1770     else
1771     {
1772         w2m = iconv_open(name, ms_wcCharsetName.ToAscii());
1773         if ( w2m == ICONV_T_INVALID )
1774         {
1775             wxLogTrace(TRACE_STRCONV,
1776                        wxT("\"%s\" -> \"%s\" works but not the converse!?"),
1777                        ms_wcCharsetName.c_str(), name);
1778         }
1779     }
1780 }
1781
1782 wxMBConv_iconv::~wxMBConv_iconv()
1783 {
1784     if ( m2w != ICONV_T_INVALID )
1785         iconv_close(m2w);
1786     if ( w2m != ICONV_T_INVALID )
1787         iconv_close(w2m);
1788 }
1789
1790 size_t wxMBConv_iconv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1791 {
1792     // find the string length: notice that must be done differently for
1793     // NUL-terminated strings and UTF-16/32 which are terminated with 2/4 NULs
1794     size_t inbuf;
1795     const size_t nulLen = GetMBNulLen();
1796     switch ( nulLen )
1797     {
1798         default:
1799             return wxCONV_FAILED;
1800
1801         case 1:
1802             inbuf = strlen(psz); // arguably more optimized than our version
1803             break;
1804
1805         case 2:
1806         case 4:
1807             // for UTF-16/32 not only we need to have 2/4 consecutive NULs but
1808             // they also have to start at character boundary and not span two
1809             // adjacent characters
1810             const char *p;
1811             for ( p = psz; NotAllNULs(p, nulLen); p += nulLen )
1812                 ;
1813             inbuf = p - psz;
1814             break;
1815     }
1816
1817 #if wxUSE_THREADS
1818     // NB: iconv() is MT-safe, but each thread must use its own iconv_t handle.
1819     //     Unfortunately there are a couple of global wxCSConv objects such as
1820     //     wxConvLocal that are used all over wx code, so we have to make sure
1821     //     the handle is used by at most one thread at the time. Otherwise
1822     //     only a few wx classes would be safe to use from non-main threads
1823     //     as MB<->WC conversion would fail "randomly".
1824     wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1825 #endif // wxUSE_THREADS
1826
1827     size_t outbuf = n * SIZEOF_WCHAR_T;
1828     size_t res, cres;
1829     // VS: Use these instead of psz, buf because iconv() modifies its arguments:
1830     wchar_t *bufPtr = buf;
1831     const char *pszPtr = psz;
1832
1833     if (buf)
1834     {
1835         // have destination buffer, convert there
1836         cres = iconv(m2w,
1837                      ICONV_CHAR_CAST(&pszPtr), &inbuf,
1838                      (char**)&bufPtr, &outbuf);
1839         res = n - (outbuf / SIZEOF_WCHAR_T);
1840
1841         if (ms_wcNeedsSwap)
1842         {
1843             // convert to native endianness
1844             for ( unsigned i = 0; i < res; i++ )
1845                 buf[n] = WC_BSWAP(buf[i]);
1846         }
1847
1848         // NUL-terminate the string if there is any space left
1849         if (res < n)
1850             buf[res] = 0;
1851     }
1852     else
1853     {
1854         // no destination buffer... convert using temp buffer
1855         // to calculate destination buffer requirement
1856         wchar_t tbuf[8];
1857         res = 0;
1858
1859         do
1860         {
1861             bufPtr = tbuf;
1862             outbuf = 8 * SIZEOF_WCHAR_T;
1863
1864             cres = iconv(m2w,
1865                          ICONV_CHAR_CAST(&pszPtr), &inbuf,
1866                          (char**)&bufPtr, &outbuf );
1867
1868             res += 8 - (outbuf / SIZEOF_WCHAR_T);
1869         }
1870         while ((cres == (size_t)-1) && (errno == E2BIG));
1871     }
1872
1873     if (ICONV_FAILED(cres, inbuf))
1874     {
1875         //VS: it is ok if iconv fails, hence trace only
1876         wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1877         return wxCONV_FAILED;
1878     }
1879
1880     return res;
1881 }
1882
1883 size_t wxMBConv_iconv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1884 {
1885 #if wxUSE_THREADS
1886     // NB: explained in MB2WC
1887     wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1888 #endif
1889
1890     size_t inlen = wxWcslen(psz);
1891     size_t inbuf = inlen * SIZEOF_WCHAR_T;
1892     size_t outbuf = n;
1893     size_t res, cres;
1894
1895     wchar_t *tmpbuf = 0;
1896
1897     if (ms_wcNeedsSwap)
1898     {
1899         // need to copy to temp buffer to switch endianness
1900         // (doing WC_BSWAP twice on the original buffer won't help, as it
1901         //  could be in read-only memory, or be accessed in some other thread)
1902         tmpbuf = (wchar_t *)malloc(inbuf + SIZEOF_WCHAR_T);
1903         for ( size_t i = 0; i < inlen; i++ )
1904             tmpbuf[n] = WC_BSWAP(psz[i]);
1905
1906         tmpbuf[inlen] = L'\0';
1907         psz = tmpbuf;
1908     }
1909
1910     if (buf)
1911     {
1912         // have destination buffer, convert there
1913         cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1914
1915         res = n - outbuf;
1916
1917         // NB: iconv was given only wcslen(psz) characters on input, and so
1918         //     it couldn't convert the trailing zero. Let's do it ourselves
1919         //     if there's some room left for it in the output buffer.
1920         if (res < n)
1921             buf[0] = 0;
1922     }
1923     else
1924     {
1925         // no destination buffer: convert using temp buffer
1926         // to calculate destination buffer requirement
1927         char tbuf[16];
1928         res = 0;
1929         do
1930         {
1931             buf = tbuf;
1932             outbuf = 16;
1933
1934             cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1935
1936             res += 16 - outbuf;
1937         }
1938         while ((cres == (size_t)-1) && (errno == E2BIG));
1939     }
1940
1941     if (ms_wcNeedsSwap)
1942     {
1943         free(tmpbuf);
1944     }
1945
1946     if (ICONV_FAILED(cres, inbuf))
1947     {
1948         wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1949         return wxCONV_FAILED;
1950     }
1951
1952     return res;
1953 }
1954
1955 size_t wxMBConv_iconv::GetMBNulLen() const
1956 {
1957     if ( m_minMBCharWidth == 0 )
1958     {
1959         wxMBConv_iconv * const self = wxConstCast(this, wxMBConv_iconv);
1960
1961 #if wxUSE_THREADS
1962         // NB: explained in MB2WC
1963         wxMutexLocker lock(self->m_iconvMutex);
1964 #endif
1965
1966         const wchar_t *wnul = L"";
1967         char buf[8]; // should be enough for NUL in any encoding
1968         size_t inLen = sizeof(wchar_t),
1969                outLen = WXSIZEOF(buf);
1970         char *inBuff = (char *)wnul;
1971         char *outBuff = buf;
1972         if ( iconv(w2m, ICONV_CHAR_CAST(&inBuff), &inLen, &outBuff, &outLen) == (size_t)-1 )
1973         {
1974             self->m_minMBCharWidth = (size_t)-1;
1975         }
1976         else // ok
1977         {
1978             self->m_minMBCharWidth = outBuff - buf;
1979         }
1980     }
1981
1982     return m_minMBCharWidth;
1983 }
1984
1985 #if wxUSE_UNICODE_UTF8
1986 bool wxMBConv_iconv::IsUTF8() const
1987 {
1988     return wxStricmp(m_name, "UTF-8") == 0 ||
1989            wxStricmp(m_name, "UTF8") == 0;
1990 }
1991 #endif
1992
1993 #endif // HAVE_ICONV
1994
1995
1996 // ============================================================================
1997 // Win32 conversion classes
1998 // ============================================================================
1999
2000 #ifdef wxHAVE_WIN32_MB2WC
2001
2002 // from utils.cpp
2003 #if wxUSE_FONTMAP
2004 extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const char *charset);
2005 extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
2006 #endif
2007
2008 class wxMBConv_win32 : public wxMBConv
2009 {
2010 public:
2011     wxMBConv_win32()
2012     {
2013         m_CodePage = CP_ACP;
2014         m_minMBCharWidth = 0;
2015     }
2016
2017     wxMBConv_win32(const wxMBConv_win32& conv)
2018         : wxMBConv()
2019     {
2020         m_CodePage = conv.m_CodePage;
2021         m_minMBCharWidth = conv.m_minMBCharWidth;
2022     }
2023
2024 #if wxUSE_FONTMAP
2025     wxMBConv_win32(const char* name)
2026     {
2027         m_CodePage = wxCharsetToCodepage(name);
2028         m_minMBCharWidth = 0;
2029     }
2030
2031     wxMBConv_win32(wxFontEncoding encoding)
2032     {
2033         m_CodePage = wxEncodingToCodepage(encoding);
2034         m_minMBCharWidth = 0;
2035     }
2036 #endif // wxUSE_FONTMAP
2037
2038     virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2039     {
2040         // note that we have to use MB_ERR_INVALID_CHARS flag as it without it
2041         // the behaviour is not compatible with the Unix version (using iconv)
2042         // and break the library itself, e.g. wxTextInputStream::NextChar()
2043         // wouldn't work if reading an incomplete MB char didn't result in an
2044         // error
2045         //
2046         // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
2047         // Win XP or newer and it is not supported for UTF-[78] so we always
2048         // use our own conversions in this case. See
2049         //     http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
2050         //     http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
2051         if ( m_CodePage == CP_UTF8 )
2052         {
2053             return wxMBConvUTF8().MB2WC(buf, psz, n);
2054         }
2055
2056         if ( m_CodePage == CP_UTF7 )
2057         {
2058             return wxMBConvUTF7().MB2WC(buf, psz, n);
2059         }
2060
2061         int flags = 0;
2062         if ( (m_CodePage < 50000 && m_CodePage != CP_SYMBOL) &&
2063                 IsAtLeastWin2kSP4() )
2064         {
2065             flags = MB_ERR_INVALID_CHARS;
2066         }
2067
2068         const size_t len = ::MultiByteToWideChar
2069                              (
2070                                 m_CodePage,     // code page
2071                                 flags,          // flags: fall on error
2072                                 psz,            // input string
2073                                 -1,             // its length (NUL-terminated)
2074                                 buf,            // output string
2075                                 buf ? n : 0     // size of output buffer
2076                              );
2077         if ( !len )
2078         {
2079             // function totally failed
2080             return wxCONV_FAILED;
2081         }
2082
2083         // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
2084         // check if we succeeded, by doing a double trip:
2085         if ( !flags && buf )
2086         {
2087             const size_t mbLen = strlen(psz);
2088             wxCharBuffer mbBuf(mbLen);
2089             if ( ::WideCharToMultiByte
2090                    (
2091                       m_CodePage,
2092                       0,
2093                       buf,
2094                       -1,
2095                       mbBuf.data(),
2096                       mbLen + 1,        // size in bytes, not length
2097                       NULL,
2098                       NULL
2099                    ) == 0 ||
2100                   strcmp(mbBuf, psz) != 0 )
2101             {
2102                 // we didn't obtain the same thing we started from, hence
2103                 // the conversion was lossy and we consider that it failed
2104                 return wxCONV_FAILED;
2105             }
2106         }
2107
2108         // note that it returns count of written chars for buf != NULL and size
2109         // of the needed buffer for buf == NULL so in either case the length of
2110         // the string (which never includes the terminating NUL) is one less
2111         return len - 1;
2112     }
2113
2114     virtual size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
2115     {
2116         /*
2117             we have a problem here: by default, WideCharToMultiByte() may
2118             replace characters unrepresentable in the target code page with bad
2119             quality approximations such as turning "1/2" symbol (U+00BD) into
2120             "1" for the code pages which don't have it and we, obviously, want
2121             to avoid this at any price
2122
2123             the trouble is that this function does it _silently_, i.e. it won't
2124             even tell us whether it did or not... Win98/2000 and higher provide
2125             WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
2126             we have to resort to a round trip, i.e. check that converting back
2127             results in the same string -- this is, of course, expensive but
2128             otherwise we simply can't be sure to not garble the data.
2129          */
2130
2131         // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
2132         // it doesn't work with CJK encodings (which we test for rather roughly
2133         // here...) nor with UTF-7/8 nor, of course, with Windows versions not
2134         // supporting it
2135         BOOL usedDef wxDUMMY_INITIALIZE(false);
2136         BOOL *pUsedDef;
2137         int flags;
2138         if ( CanUseNoBestFit() && m_CodePage < 50000 )
2139         {
2140             // it's our lucky day
2141             flags = WC_NO_BEST_FIT_CHARS;
2142             pUsedDef = &usedDef;
2143         }
2144         else // old system or unsupported encoding
2145         {
2146             flags = 0;
2147             pUsedDef = NULL;
2148         }
2149
2150         const size_t len = ::WideCharToMultiByte
2151                              (
2152                                 m_CodePage,     // code page
2153                                 flags,          // either none or no best fit
2154                                 pwz,            // input string
2155                                 -1,             // it is (wide) NUL-terminated
2156                                 buf,            // output buffer
2157                                 buf ? n : 0,    // and its size
2158                                 NULL,           // default "replacement" char
2159                                 pUsedDef        // [out] was it used?
2160                              );
2161
2162         if ( !len )
2163         {
2164             // function totally failed
2165             return wxCONV_FAILED;
2166         }
2167
2168         // if we were really converting, check if we succeeded
2169         if ( buf )
2170         {
2171             if ( flags )
2172             {
2173                 // check if the conversion failed, i.e. if any replacements
2174                 // were done
2175                 if ( usedDef )
2176                     return wxCONV_FAILED;
2177             }
2178             else // we must resort to double tripping...
2179             {
2180                 wxWCharBuffer wcBuf(n);
2181                 if ( MB2WC(wcBuf.data(), buf, n) == wxCONV_FAILED ||
2182                         wcscmp(wcBuf, pwz) != 0 )
2183                 {
2184                     // we didn't obtain the same thing we started from, hence
2185                     // the conversion was lossy and we consider that it failed
2186                     return wxCONV_FAILED;
2187                 }
2188             }
2189         }
2190
2191         // see the comment above for the reason of "len - 1"
2192         return len - 1;
2193     }
2194
2195     virtual size_t GetMBNulLen() const
2196     {
2197         if ( m_minMBCharWidth == 0 )
2198         {
2199             int len = ::WideCharToMultiByte
2200                         (
2201                             m_CodePage,     // code page
2202                             0,              // no flags
2203                             L"",            // input string
2204                             1,              // translate just the NUL
2205                             NULL,           // output buffer
2206                             0,              // and its size
2207                             NULL,           // no replacement char
2208                             NULL            // [out] don't care if it was used
2209                         );
2210
2211             wxMBConv_win32 * const self = wxConstCast(this, wxMBConv_win32);
2212             switch ( len )
2213             {
2214                 default:
2215                     wxLogDebug(_T("Unexpected NUL length %d"), len);
2216                     self->m_minMBCharWidth = (size_t)-1;
2217                     break;
2218
2219                 case 0:
2220                     self->m_minMBCharWidth = (size_t)-1;
2221                     break;
2222
2223                 case 1:
2224                 case 2:
2225                 case 4:
2226                     self->m_minMBCharWidth = len;
2227                     break;
2228             }
2229         }
2230
2231         return m_minMBCharWidth;
2232     }
2233
2234     virtual wxMBConv *Clone() const { return new wxMBConv_win32(*this); }
2235
2236     bool IsOk() const { return m_CodePage != -1; }
2237
2238 private:
2239     static bool CanUseNoBestFit()
2240     {
2241         static int s_isWin98Or2k = -1;
2242
2243         if ( s_isWin98Or2k == -1 )
2244         {
2245             int verMaj, verMin;
2246             switch ( wxGetOsVersion(&verMaj, &verMin) )
2247             {
2248                 case wxOS_WINDOWS_9X:
2249                     s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
2250                     break;
2251
2252                 case wxOS_WINDOWS_NT:
2253                     s_isWin98Or2k = verMaj >= 5;
2254                     break;
2255
2256                 default:
2257                     // unknown: be conservative by default
2258                     s_isWin98Or2k = 0;
2259                     break;
2260             }
2261
2262             wxASSERT_MSG( s_isWin98Or2k != -1, _T("should be set above") );
2263         }
2264
2265         return s_isWin98Or2k == 1;
2266     }
2267
2268     static bool IsAtLeastWin2kSP4()
2269     {
2270 #ifdef __WXWINCE__
2271         return false;
2272 #else
2273         static int s_isAtLeastWin2kSP4 = -1;
2274
2275         if ( s_isAtLeastWin2kSP4 == -1 )
2276         {
2277             OSVERSIONINFOEX ver;
2278
2279             memset(&ver, 0, sizeof(ver));
2280             ver.dwOSVersionInfoSize = sizeof(ver);
2281             GetVersionEx((OSVERSIONINFO*)&ver);
2282
2283             s_isAtLeastWin2kSP4 =
2284               ((ver.dwMajorVersion > 5) || // Vista+
2285                (ver.dwMajorVersion == 5 && ver.dwMinorVersion > 0) || // XP/2003
2286                (ver.dwMajorVersion == 5 && ver.dwMinorVersion == 0 &&
2287                ver.wServicePackMajor >= 4)) // 2000 SP4+
2288               ? 1 : 0;
2289         }
2290
2291         return s_isAtLeastWin2kSP4 == 1;
2292 #endif
2293     }
2294
2295
2296     // the code page we're working with
2297     long m_CodePage;
2298
2299     // cached result of GetMBNulLen(), set to 0 initially meaning
2300     // "unknown"
2301     size_t m_minMBCharWidth;
2302 };
2303
2304 #endif // wxHAVE_WIN32_MB2WC
2305
2306 // ============================================================================
2307 // CoreFoundation conversion classes
2308 // ============================================================================
2309
2310 #ifdef __DARWIN__
2311
2312 CFStringEncoding wxCFStringEncFromFontEnc(wxFontEncoding encoding)
2313 {
2314     CFStringEncoding enc = kCFStringEncodingInvalidId ;
2315
2316     switch (encoding)
2317     {
2318         case wxFONTENCODING_DEFAULT :
2319             enc = CFStringGetSystemEncoding();
2320             break ;
2321
2322         case wxFONTENCODING_ISO8859_1 :
2323             enc = kCFStringEncodingISOLatin1 ;
2324             break ;
2325         case wxFONTENCODING_ISO8859_2 :
2326             enc = kCFStringEncodingISOLatin2;
2327             break ;
2328         case wxFONTENCODING_ISO8859_3 :
2329             enc = kCFStringEncodingISOLatin3 ;
2330             break ;
2331         case wxFONTENCODING_ISO8859_4 :
2332             enc = kCFStringEncodingISOLatin4;
2333             break ;
2334         case wxFONTENCODING_ISO8859_5 :
2335             enc = kCFStringEncodingISOLatinCyrillic;
2336             break ;
2337         case wxFONTENCODING_ISO8859_6 :
2338             enc = kCFStringEncodingISOLatinArabic;
2339             break ;
2340         case wxFONTENCODING_ISO8859_7 :
2341             enc = kCFStringEncodingISOLatinGreek;
2342             break ;
2343         case wxFONTENCODING_ISO8859_8 :
2344             enc = kCFStringEncodingISOLatinHebrew;
2345             break ;
2346         case wxFONTENCODING_ISO8859_9 :
2347             enc = kCFStringEncodingISOLatin5;
2348             break ;
2349         case wxFONTENCODING_ISO8859_10 :
2350             enc = kCFStringEncodingISOLatin6;
2351             break ;
2352         case wxFONTENCODING_ISO8859_11 :
2353             enc = kCFStringEncodingISOLatinThai;
2354             break ;
2355         case wxFONTENCODING_ISO8859_13 :
2356             enc = kCFStringEncodingISOLatin7;
2357             break ;
2358         case wxFONTENCODING_ISO8859_14 :
2359             enc = kCFStringEncodingISOLatin8;
2360             break ;
2361         case wxFONTENCODING_ISO8859_15 :
2362             enc = kCFStringEncodingISOLatin9;
2363             break ;
2364
2365         case wxFONTENCODING_KOI8 :
2366             enc = kCFStringEncodingKOI8_R;
2367             break ;
2368         case wxFONTENCODING_ALTERNATIVE : // MS-DOS CP866
2369             enc = kCFStringEncodingDOSRussian;
2370             break ;
2371
2372 //      case wxFONTENCODING_BULGARIAN :
2373 //          enc = ;
2374 //          break ;
2375
2376         case wxFONTENCODING_CP437 :
2377             enc = kCFStringEncodingDOSLatinUS ;
2378             break ;
2379         case wxFONTENCODING_CP850 :
2380             enc = kCFStringEncodingDOSLatin1;
2381             break ;
2382         case wxFONTENCODING_CP852 :
2383             enc = kCFStringEncodingDOSLatin2;
2384             break ;
2385         case wxFONTENCODING_CP855 :
2386             enc = kCFStringEncodingDOSCyrillic;
2387             break ;
2388         case wxFONTENCODING_CP866 :
2389             enc = kCFStringEncodingDOSRussian ;
2390             break ;
2391         case wxFONTENCODING_CP874 :
2392             enc = kCFStringEncodingDOSThai;
2393             break ;
2394         case wxFONTENCODING_CP932 :
2395             enc = kCFStringEncodingDOSJapanese;
2396             break ;
2397         case wxFONTENCODING_CP936 :
2398             enc = kCFStringEncodingDOSChineseSimplif ;
2399             break ;
2400         case wxFONTENCODING_CP949 :
2401             enc = kCFStringEncodingDOSKorean;
2402             break ;
2403         case wxFONTENCODING_CP950 :
2404             enc = kCFStringEncodingDOSChineseTrad;
2405             break ;
2406         case wxFONTENCODING_CP1250 :
2407             enc = kCFStringEncodingWindowsLatin2;
2408             break ;
2409         case wxFONTENCODING_CP1251 :
2410             enc = kCFStringEncodingWindowsCyrillic ;
2411             break ;
2412         case wxFONTENCODING_CP1252 :
2413             enc = kCFStringEncodingWindowsLatin1 ;
2414             break ;
2415         case wxFONTENCODING_CP1253 :
2416             enc = kCFStringEncodingWindowsGreek;
2417             break ;
2418         case wxFONTENCODING_CP1254 :
2419             enc = kCFStringEncodingWindowsLatin5;
2420             break ;
2421         case wxFONTENCODING_CP1255 :
2422             enc = kCFStringEncodingWindowsHebrew ;
2423             break ;
2424         case wxFONTENCODING_CP1256 :
2425             enc = kCFStringEncodingWindowsArabic ;
2426             break ;
2427         case wxFONTENCODING_CP1257 :
2428             enc = kCFStringEncodingWindowsBalticRim;
2429             break ;
2430 //   This only really encodes to UTF7 (if that) evidently
2431 //        case wxFONTENCODING_UTF7 :
2432 //            enc = kCFStringEncodingNonLossyASCII ;
2433 //            break ;
2434         case wxFONTENCODING_UTF8 :
2435             enc = kCFStringEncodingUTF8 ;
2436             break ;
2437         case wxFONTENCODING_EUC_JP :
2438             enc = kCFStringEncodingEUC_JP;
2439             break ;
2440 /* Don't support conversion to/from UTF16 as wxWidgets can do this better.
2441  * In particular, ToWChar would fail miserably using strlen on an input UTF16.
2442         case wxFONTENCODING_UTF16 :
2443             enc = kCFStringEncodingUnicode ;
2444             break ;
2445 */
2446         case wxFONTENCODING_MACROMAN :
2447             enc = kCFStringEncodingMacRoman ;
2448             break ;
2449         case wxFONTENCODING_MACJAPANESE :
2450             enc = kCFStringEncodingMacJapanese ;
2451             break ;
2452         case wxFONTENCODING_MACCHINESETRAD :
2453             enc = kCFStringEncodingMacChineseTrad ;
2454             break ;
2455         case wxFONTENCODING_MACKOREAN :
2456             enc = kCFStringEncodingMacKorean ;
2457             break ;
2458         case wxFONTENCODING_MACARABIC :
2459             enc = kCFStringEncodingMacArabic ;
2460             break ;
2461         case wxFONTENCODING_MACHEBREW :
2462             enc = kCFStringEncodingMacHebrew ;
2463             break ;
2464         case wxFONTENCODING_MACGREEK :
2465             enc = kCFStringEncodingMacGreek ;
2466             break ;
2467         case wxFONTENCODING_MACCYRILLIC :
2468             enc = kCFStringEncodingMacCyrillic ;
2469             break ;
2470         case wxFONTENCODING_MACDEVANAGARI :
2471             enc = kCFStringEncodingMacDevanagari ;
2472             break ;
2473         case wxFONTENCODING_MACGURMUKHI :
2474             enc = kCFStringEncodingMacGurmukhi ;
2475             break ;
2476         case wxFONTENCODING_MACGUJARATI :
2477             enc = kCFStringEncodingMacGujarati ;
2478             break ;
2479         case wxFONTENCODING_MACORIYA :
2480             enc = kCFStringEncodingMacOriya ;
2481             break ;
2482         case wxFONTENCODING_MACBENGALI :
2483             enc = kCFStringEncodingMacBengali ;
2484             break ;
2485         case wxFONTENCODING_MACTAMIL :
2486             enc = kCFStringEncodingMacTamil ;
2487             break ;
2488         case wxFONTENCODING_MACTELUGU :
2489             enc = kCFStringEncodingMacTelugu ;
2490             break ;
2491         case wxFONTENCODING_MACKANNADA :
2492             enc = kCFStringEncodingMacKannada ;
2493             break ;
2494         case wxFONTENCODING_MACMALAJALAM :
2495             enc = kCFStringEncodingMacMalayalam ;
2496             break ;
2497         case wxFONTENCODING_MACSINHALESE :
2498             enc = kCFStringEncodingMacSinhalese ;
2499             break ;
2500         case wxFONTENCODING_MACBURMESE :
2501             enc = kCFStringEncodingMacBurmese ;
2502             break ;
2503         case wxFONTENCODING_MACKHMER :
2504             enc = kCFStringEncodingMacKhmer ;
2505             break ;
2506         case wxFONTENCODING_MACTHAI :
2507             enc = kCFStringEncodingMacThai ;
2508             break ;
2509         case wxFONTENCODING_MACLAOTIAN :
2510             enc = kCFStringEncodingMacLaotian ;
2511             break ;
2512         case wxFONTENCODING_MACGEORGIAN :
2513             enc = kCFStringEncodingMacGeorgian ;
2514             break ;
2515         case wxFONTENCODING_MACARMENIAN :
2516             enc = kCFStringEncodingMacArmenian ;
2517             break ;
2518         case wxFONTENCODING_MACCHINESESIMP :
2519             enc = kCFStringEncodingMacChineseSimp ;
2520             break ;
2521         case wxFONTENCODING_MACTIBETAN :
2522             enc = kCFStringEncodingMacTibetan ;
2523             break ;
2524         case wxFONTENCODING_MACMONGOLIAN :
2525             enc = kCFStringEncodingMacMongolian ;
2526             break ;
2527         case wxFONTENCODING_MACETHIOPIC :
2528             enc = kCFStringEncodingMacEthiopic ;
2529             break ;
2530         case wxFONTENCODING_MACCENTRALEUR :
2531             enc = kCFStringEncodingMacCentralEurRoman ;
2532             break ;
2533         case wxFONTENCODING_MACVIATNAMESE :
2534             enc = kCFStringEncodingMacVietnamese ;
2535             break ;
2536         case wxFONTENCODING_MACARABICEXT :
2537             enc = kCFStringEncodingMacExtArabic ;
2538             break ;
2539         case wxFONTENCODING_MACSYMBOL :
2540             enc = kCFStringEncodingMacSymbol ;
2541             break ;
2542         case wxFONTENCODING_MACDINGBATS :
2543             enc = kCFStringEncodingMacDingbats ;
2544             break ;
2545         case wxFONTENCODING_MACTURKISH :
2546             enc = kCFStringEncodingMacTurkish ;
2547             break ;
2548         case wxFONTENCODING_MACCROATIAN :
2549             enc = kCFStringEncodingMacCroatian ;
2550             break ;
2551         case wxFONTENCODING_MACICELANDIC :
2552             enc = kCFStringEncodingMacIcelandic ;
2553             break ;
2554         case wxFONTENCODING_MACROMANIAN :
2555             enc = kCFStringEncodingMacRomanian ;
2556             break ;
2557         case wxFONTENCODING_MACCELTIC :
2558             enc = kCFStringEncodingMacCeltic ;
2559             break ;
2560         case wxFONTENCODING_MACGAELIC :
2561             enc = kCFStringEncodingMacGaelic ;
2562             break ;
2563 //      case wxFONTENCODING_MACKEYBOARD :
2564 //          enc = kCFStringEncodingMacKeyboardGlyphs ;
2565 //          break ;
2566
2567         default :
2568             // because gcc is picky
2569             break ;
2570     }
2571
2572     return enc ;
2573 }
2574
2575 #if MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_4
2576 // Provide a constant for the wchar_t encoding used by the host platform.
2577 #ifdef WORDS_BIGENDIAN
2578     static const CFStringEncoding wxCFStringEncodingWcharT = kCFStringEncodingUTF32BE;
2579 #else
2580     static const CFStringEncoding wxCFStringEncodingWcharT = kCFStringEncodingUTF32LE;
2581 #endif
2582
2583 #endif /* MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_4 */
2584
2585 class wxMBConv_cf : public wxMBConv
2586 {
2587 public:
2588     wxMBConv_cf()
2589     {
2590         Init(CFStringGetSystemEncoding()) ;
2591     }
2592
2593     wxMBConv_cf(const wxMBConv_cf& conv)
2594     {
2595         m_encoding = conv.m_encoding;
2596     }
2597
2598 #if wxUSE_FONTMAP
2599     wxMBConv_cf(const char* name)
2600     {
2601         Init( wxCFStringEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) ) ;
2602     }
2603 #endif
2604
2605     wxMBConv_cf(wxFontEncoding encoding)
2606     {
2607         Init( wxCFStringEncFromFontEnc(encoding) );
2608     }
2609
2610     virtual ~wxMBConv_cf()
2611     {
2612     }
2613
2614     void Init( CFStringEncoding encoding)
2615     {
2616         m_encoding = encoding ;
2617     }
2618
2619     virtual size_t ToWChar(wchar_t * dst, size_t dstSize, const char * src, size_t srcSize = wxNO_LEN) const
2620     {
2621         wxCHECK(src, wxCONV_FAILED);
2622
2623         /* NOTE: This is wrong if the source encoding has an element size
2624          * other than char (e.g. it's kCFStringEncodingUnicode)
2625          * If the user specifies it, it's presumably right though.
2626          * Right now we don't support UTF-16 in anyway since wx can do a better job.
2627          */
2628         if(srcSize == wxNO_LEN)
2629             srcSize = strlen(src) + 1;
2630
2631         // First create the temporary CFString
2632         wxCFRef<CFStringRef> theString( CFStringCreateWithBytes (
2633                                                 NULL, //the allocator
2634                                                 (const UInt8*)src,
2635                                                 srcSize,
2636                                                 m_encoding,
2637                                                 false //no BOM/external representation
2638                                                 ));
2639
2640         wxCHECK(theString != NULL, wxCONV_FAILED);
2641
2642         /* NOTE: The string content includes the NULL element if the source string did
2643          * That means we have to do nothing special because the destination will have
2644          * the NULL element iff the source did and the NULL element will be included
2645          * in the count iff it was included in the source count.
2646          */
2647
2648
2649 /* If we're compiling against Tiger headers we can support direct conversion
2650  * to UTF32.  If we are then run against a pre-Tiger system, the encoding
2651  * won't be available so we'll defer to the string->UTF-16->UTF-32 conversion.
2652  */
2653 #if MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_4
2654         if(CFStringIsEncodingAvailable(wxCFStringEncodingWcharT))
2655         {
2656             CFRange fullStringRange = CFRangeMake(0, CFStringGetLength(theString));
2657             CFIndex usedBufLen;
2658
2659             CFIndex charsConverted = CFStringGetBytes(
2660                     theString,
2661                     fullStringRange,
2662                     wxCFStringEncodingWcharT,
2663                     0,
2664                     false,
2665                     // if dstSize is 0 then pass NULL to get required length in usedBufLen
2666                     dstSize != 0?(UInt8*)dst:NULL,
2667                     dstSize * sizeof(wchar_t),
2668                     &usedBufLen);
2669
2670             // charsConverted is > 0 iff conversion succeeded
2671             if(charsConverted <= 0)
2672                 return wxCONV_FAILED;
2673
2674             /* usedBufLen is the number of bytes written, so we divide by
2675              * sizeof(wchar_t) to get the number of elements written.
2676              */
2677             wxASSERT( (usedBufLen % sizeof(wchar_t)) == 0 );
2678
2679             // CFStringGetBytes does exactly the right thing when buffer
2680             // pointer is NULL and returns the number of bytes required
2681             return usedBufLen / sizeof(wchar_t);
2682         }
2683         else
2684 #endif /* MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_4 */
2685         {
2686             // NOTE: Includes NULL iff source did
2687             /* NOTE: This is an approximation.  The eventual UTF-32 will
2688              * possibly have less elements but certainly not more.
2689              */
2690             size_t returnSize = CFStringGetLength(theString);
2691
2692             if (dstSize == 0 || dst == NULL)
2693             {
2694                 return returnSize;
2695             }
2696
2697             // Convert the entire string.. too hard to figure out how many UTF-16 we'd need
2698             // for an undersized UTF-32 destination buffer.
2699             CFRange fullStringRange = CFRangeMake(0, CFStringGetLength(theString));
2700             UniChar *szUniCharBuffer = new UniChar[fullStringRange.length];
2701
2702             CFStringGetCharacters(theString, fullStringRange, szUniCharBuffer);
2703
2704             wxMBConvUTF16 converter;
2705             returnSize = converter.ToWChar( dst, dstSize, (const char*)szUniCharBuffer, fullStringRange.length );
2706             delete [] szUniCharBuffer;
2707
2708             return returnSize;
2709         }
2710         // NOTREACHED
2711     }
2712
2713     virtual size_t FromWChar(char *dst, size_t dstSize, const wchar_t *src, size_t srcSize) const
2714     {
2715         wxCHECK(src, wxCONV_FAILED);
2716
2717         if(srcSize == wxNO_LEN)
2718             srcSize = wxStrlen(src) + 1;
2719
2720         // Temporary CFString
2721         wxCFRef<CFStringRef> theString;
2722
2723 /* If we're compiling against Tiger headers we can support direct conversion
2724  * from UTF32.  If we are then run against a pre-Tiger system, the encoding
2725  * won't be available so we'll defer to the UTF-32->UTF-16->string conversion.
2726  */
2727 #if MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_4
2728         if(CFStringIsEncodingAvailable(wxCFStringEncodingWcharT))
2729         {
2730             theString = wxCFRef<CFStringRef>(CFStringCreateWithBytes(
2731                     kCFAllocatorDefault,
2732                     (UInt8*)src,
2733                     srcSize * sizeof(wchar_t),
2734                     wxCFStringEncodingWcharT,
2735                     false));
2736         }
2737         else
2738 #endif /* MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_4 */
2739         {
2740             wxMBConvUTF16 converter;
2741             size_t cbUniBuffer = converter.FromWChar( NULL, 0, src, srcSize );
2742             wxASSERT(cbUniBuffer % sizeof(UniChar));
2743
2744             // Will be free'd by kCFAllocatorMalloc when CFString is released
2745             UniChar *tmpUniBuffer = (UniChar*)malloc(cbUniBuffer);
2746
2747             cbUniBuffer = converter.FromWChar( (char*) tmpUniBuffer, cbUniBuffer, src, srcSize );
2748             wxASSERT(cbUniBuffer % sizeof(UniChar));
2749
2750             theString = wxCFRef<CFStringRef>(CFStringCreateWithCharactersNoCopy(
2751                         kCFAllocatorDefault,
2752                         tmpUniBuffer,
2753                         cbUniBuffer / sizeof(UniChar),
2754                         kCFAllocatorMalloc
2755                     ));
2756
2757         }
2758
2759         wxCHECK(theString != NULL, wxCONV_FAILED);
2760
2761         CFIndex usedBufLen;
2762
2763         CFIndex charsConverted = CFStringGetBytes(
2764                 theString,
2765                 CFRangeMake(0, CFStringGetLength(theString)),
2766                 m_encoding,
2767                 0, // FAIL on unconvertible characters
2768                 false, // not an external representation
2769                 // if dstSize is 0 then pass NULL to get required length in usedBufLen
2770                 (dstSize != 0)?(UInt8*)dst:NULL,
2771                 dstSize,
2772                 &usedBufLen
2773             );
2774
2775         // charsConverted is > 0 iff conversion succeeded
2776         if(charsConverted <= 0)
2777             return wxCONV_FAILED;
2778
2779         return usedBufLen;
2780     }
2781
2782     virtual wxMBConv *Clone() const { return new wxMBConv_cf(*this); }
2783
2784     bool IsOk() const
2785     {
2786         return m_encoding != kCFStringEncodingInvalidId &&
2787               CFStringIsEncodingAvailable(m_encoding);
2788     }
2789
2790 private:
2791     CFStringEncoding m_encoding ;
2792 };
2793
2794 #endif // __DARWIN__
2795
2796 // ============================================================================
2797 // Mac conversion classes
2798 // ============================================================================
2799
2800 /* Although we are in the base library we currently have this wxMac
2801  * conditional.  This is not generally good but fortunately does not affect
2802  * the ABI of the base library, only what encodings might work.
2803  * It does mean that a wxBase built as part of wxMac has slightly more support
2804  * than one built for wxCocoa or even wxGtk.
2805  */
2806 #if defined(__WXMAC__) && defined(TARGET_CARBON)
2807
2808 class wxMBConv_mac : public wxMBConv
2809 {
2810 public:
2811     wxMBConv_mac()
2812     {
2813         Init(CFStringGetSystemEncoding()) ;
2814     }
2815
2816     wxMBConv_mac(const wxMBConv_mac& conv)
2817     {
2818         Init(conv.m_char_encoding);
2819     }
2820
2821 #if wxUSE_FONTMAP
2822     wxMBConv_mac(const char* name)
2823     {
2824         Init( wxMacGetSystemEncFromFontEnc( wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) );
2825     }
2826 #endif
2827
2828     wxMBConv_mac(wxFontEncoding encoding)
2829     {
2830         Init( wxMacGetSystemEncFromFontEnc(encoding) );
2831     }
2832
2833     virtual ~wxMBConv_mac()
2834     {
2835         OSStatus status = noErr ;
2836         if (m_MB2WC_converter)
2837             status = TECDisposeConverter(m_MB2WC_converter);
2838         if (m_WC2MB_converter)
2839             status = TECDisposeConverter(m_WC2MB_converter);
2840     }
2841
2842     void Init( TextEncodingBase encoding,TextEncodingVariant encodingVariant = kTextEncodingDefaultVariant ,
2843             TextEncodingFormat encodingFormat = kTextEncodingDefaultFormat)
2844     {
2845         m_MB2WC_converter = NULL ;
2846         m_WC2MB_converter = NULL ;
2847         m_char_encoding = CreateTextEncoding(encoding, encodingVariant, encodingFormat) ;
2848         m_unicode_encoding = CreateTextEncoding(kTextEncodingUnicodeDefault, 0, kUnicode16BitFormat) ;
2849     }
2850
2851     virtual void CreateIfNeeded() const
2852     {
2853         if ( m_MB2WC_converter == NULL && m_WC2MB_converter == NULL )
2854         {
2855             OSStatus status = noErr ;
2856             status = TECCreateConverter(&m_MB2WC_converter,
2857                                     m_char_encoding,
2858                                     m_unicode_encoding);
2859             wxASSERT_MSG( status == noErr , _("Unable to create TextEncodingConverter")) ;
2860             status = TECCreateConverter(&m_WC2MB_converter,
2861                                     m_unicode_encoding,
2862                                     m_char_encoding);
2863             wxASSERT_MSG( status == noErr , _("Unable to create TextEncodingConverter")) ;
2864         }
2865     }
2866
2867     size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2868     {
2869         CreateIfNeeded() ;
2870         OSStatus status = noErr ;
2871         ByteCount byteOutLen ;
2872         ByteCount byteInLen = strlen(psz) + 1;
2873         wchar_t *tbuf = NULL ;
2874         UniChar* ubuf = NULL ;
2875         size_t res = 0 ;
2876
2877         if (buf == NULL)
2878         {
2879             // Apple specs say at least 32
2880             n = wxMax( 32, byteInLen ) ;
2881             tbuf = (wchar_t*) malloc( n * SIZEOF_WCHAR_T ) ;
2882         }
2883
2884         ByteCount byteBufferLen = n * sizeof( UniChar ) ;
2885
2886 #if SIZEOF_WCHAR_T == 4
2887         ubuf = (UniChar*) malloc( byteBufferLen + 2 ) ;
2888 #else
2889         ubuf = (UniChar*) (buf ? buf : tbuf) ;
2890 #endif
2891
2892         status = TECConvertText(
2893             m_MB2WC_converter, (ConstTextPtr) psz, byteInLen, &byteInLen,
2894             (TextPtr) ubuf, byteBufferLen, &byteOutLen);
2895
2896 #if SIZEOF_WCHAR_T == 4
2897         // we have to terminate here, because n might be larger for the trailing zero, and if UniChar
2898         // is not properly terminated we get random characters at the end
2899         ubuf[byteOutLen / sizeof( UniChar ) ] = 0 ;
2900         wxMBConvUTF16 converter ;
2901         res = converter.MB2WC( (buf ? buf : tbuf), (const char*)ubuf, n ) ;
2902         free( ubuf ) ;
2903 #else
2904         res = byteOutLen / sizeof( UniChar ) ;
2905 #endif
2906
2907         if ( buf == NULL )
2908              free(tbuf) ;
2909
2910         if ( buf  && res < n)
2911             buf[res] = 0;
2912
2913         return res ;
2914     }
2915
2916     size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const
2917     {
2918         CreateIfNeeded() ;
2919         OSStatus status = noErr ;
2920         ByteCount byteOutLen ;
2921         ByteCount byteInLen = wxWcslen(psz) * SIZEOF_WCHAR_T ;
2922
2923         char *tbuf = NULL ;
2924
2925         if (buf == NULL)
2926         {
2927             // Apple specs say at least 32
2928             n = wxMax( 32, ((byteInLen / SIZEOF_WCHAR_T) * 8) + SIZEOF_WCHAR_T );
2929             tbuf = (char*) malloc( n ) ;
2930         }
2931
2932         ByteCount byteBufferLen = n ;
2933         UniChar* ubuf = NULL ;
2934
2935 #if SIZEOF_WCHAR_T == 4
2936         wxMBConvUTF16 converter ;
2937         size_t unicharlen = converter.WC2MB( NULL, psz, 0 ) ;
2938         byteInLen = unicharlen ;
2939         ubuf = (UniChar*) malloc( byteInLen + 2 ) ;
2940         converter.WC2MB( (char*) ubuf, psz, unicharlen + 2 ) ;
2941 #else
2942         ubuf = (UniChar*) psz ;
2943 #endif
2944
2945         status = TECConvertText(
2946             m_WC2MB_converter, (ConstTextPtr) ubuf, byteInLen, &byteInLen,
2947             (TextPtr) (buf ? buf : tbuf), byteBufferLen, &byteOutLen);
2948
2949 #if SIZEOF_WCHAR_T == 4
2950         free( ubuf ) ;
2951 #endif
2952
2953         if ( buf == NULL )
2954             free(tbuf) ;
2955
2956         size_t res = byteOutLen ;
2957         if ( buf  && res < n)
2958         {
2959             buf[res] = 0;
2960
2961             //we need to double-trip to verify it didn't insert any ? in place
2962             //of bogus characters
2963             wxWCharBuffer wcBuf(n);
2964             size_t pszlen = wxWcslen(psz);
2965             if ( MB2WC(wcBuf.data(), buf, n) == wxCONV_FAILED ||
2966                         wxWcslen(wcBuf) != pszlen ||
2967                         memcmp(wcBuf, psz, pszlen * sizeof(wchar_t)) != 0 )
2968             {
2969                 // we didn't obtain the same thing we started from, hence
2970                 // the conversion was lossy and we consider that it failed
2971                 return wxCONV_FAILED;
2972             }
2973         }
2974
2975         return res ;
2976     }
2977
2978     virtual wxMBConv *Clone() const { return new wxMBConv_mac(*this); }
2979
2980     bool IsOk() const
2981     {
2982         CreateIfNeeded() ;
2983         return m_MB2WC_converter != NULL && m_WC2MB_converter != NULL;
2984     }
2985
2986 protected :
2987     mutable TECObjectRef m_MB2WC_converter;
2988     mutable TECObjectRef m_WC2MB_converter;
2989
2990     TextEncodingBase m_char_encoding;
2991     TextEncodingBase m_unicode_encoding;
2992 };
2993
2994 // MB is decomposed (D) normalized UTF8
2995
2996 class wxMBConv_macUTF8D : public wxMBConv_mac
2997 {
2998 public :
2999     wxMBConv_macUTF8D()
3000     {
3001         Init( kTextEncodingUnicodeDefault , kUnicodeNoSubset , kUnicodeUTF8Format ) ;
3002         m_uni = NULL;
3003         m_uniBack = NULL ;
3004     }
3005
3006     virtual ~wxMBConv_macUTF8D()
3007     {
3008         if (m_uni!=NULL)
3009             DisposeUnicodeToTextInfo(&m_uni);
3010         if (m_uniBack!=NULL)
3011             DisposeUnicodeToTextInfo(&m_uniBack);
3012     }
3013
3014     size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const
3015     {
3016         CreateIfNeeded() ;
3017         OSStatus status = noErr ;
3018         ByteCount byteOutLen ;
3019         ByteCount byteInLen = wxWcslen(psz) * SIZEOF_WCHAR_T ;
3020
3021         char *tbuf = NULL ;
3022
3023         if (buf == NULL)
3024         {
3025             // Apple specs say at least 32
3026             n = wxMax( 32, ((byteInLen / SIZEOF_WCHAR_T) * 8) + SIZEOF_WCHAR_T );
3027             tbuf = (char*) malloc( n ) ;
3028         }
3029
3030         ByteCount byteBufferLen = n ;
3031         UniChar* ubuf = NULL ;
3032
3033 #if SIZEOF_WCHAR_T == 4
3034         wxMBConvUTF16 converter ;
3035         size_t unicharlen = converter.WC2MB( NULL, psz, 0 ) ;
3036         byteInLen = unicharlen ;
3037         ubuf = (UniChar*) malloc( byteInLen + 2 ) ;
3038         converter.WC2MB( (char*) ubuf, psz, unicharlen + 2 ) ;
3039 #else
3040         ubuf = (UniChar*) psz ;
3041 #endif
3042
3043         // ubuf is a non-decomposed UniChar buffer
3044
3045         ByteCount dcubuflen = byteInLen * 2 + 2 ;
3046         ByteCount dcubufread , dcubufwritten ;
3047         UniChar *dcubuf = (UniChar*) malloc( dcubuflen ) ;
3048
3049         ConvertFromUnicodeToText( m_uni , byteInLen , ubuf ,
3050             kUnicodeDefaultDirectionMask, 0, NULL, NULL, NULL, dcubuflen  , &dcubufread , &dcubufwritten , dcubuf ) ;
3051
3052         // we now convert that decomposed buffer into UTF8
3053
3054         status = TECConvertText(
3055             m_WC2MB_converter, (ConstTextPtr) dcubuf, dcubufwritten, &dcubufread,
3056             (TextPtr) (buf ? buf : tbuf), byteBufferLen, &byteOutLen);
3057
3058         free( dcubuf );
3059
3060 #if SIZEOF_WCHAR_T == 4
3061         free( ubuf ) ;
3062 #endif
3063
3064         if ( buf == NULL )
3065             free(tbuf) ;
3066
3067         size_t res = byteOutLen ;
3068         if ( buf  && res < n)
3069         {
3070             buf[res] = 0;
3071             // don't test for round-trip fidelity yet, we cannot guarantee it yet
3072         }
3073
3074         return res ;
3075     }
3076
3077     size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
3078     {
3079         CreateIfNeeded() ;
3080         OSStatus status = noErr ;
3081         ByteCount byteOutLen ;
3082         ByteCount byteInLen = strlen(psz) + 1;
3083         wchar_t *tbuf = NULL ;
3084         UniChar* ubuf = NULL ;
3085         size_t res = 0 ;
3086
3087         if (buf == NULL)
3088         {
3089             // Apple specs say at least 32
3090             n = wxMax( 32, byteInLen ) ;
3091             tbuf = (wchar_t*) malloc( n * SIZEOF_WCHAR_T ) ;
3092         }
3093
3094         ByteCount byteBufferLen = n * sizeof( UniChar ) ;
3095
3096 #if SIZEOF_WCHAR_T == 4
3097         ubuf = (UniChar*) malloc( byteBufferLen + 2 ) ;
3098 #else
3099         ubuf = (UniChar*) (buf ? buf : tbuf) ;
3100 #endif
3101
3102         ByteCount dcubuflen = byteBufferLen * 2 + 2 ;
3103         ByteCount dcubufread , dcubufwritten ;
3104         UniChar *dcubuf = (UniChar*) malloc( dcubuflen ) ;
3105
3106         status = TECConvertText(
3107                                 m_MB2WC_converter, (ConstTextPtr) psz, byteInLen, &byteInLen,
3108                                 (TextPtr) dcubuf, dcubuflen, &byteOutLen);
3109         // we have to terminate here, because n might be larger for the trailing zero, and if UniChar
3110         // is not properly terminated we get random characters at the end
3111         dcubuf[byteOutLen / sizeof( UniChar ) ] = 0 ;
3112
3113         // now from the decomposed UniChar to properly composed uniChar
3114         ConvertFromUnicodeToText( m_uniBack , byteOutLen , dcubuf ,
3115                                   kUnicodeDefaultDirectionMask, 0, NULL, NULL, NULL, dcubuflen  , &dcubufread , &dcubufwritten , ubuf ) ;
3116
3117         free( dcubuf );
3118         byteOutLen = dcubufwritten ;
3119         ubuf[byteOutLen / sizeof( UniChar ) ] = 0 ;
3120
3121
3122 #if SIZEOF_WCHAR_T == 4
3123         wxMBConvUTF16 converter ;
3124         res = converter.MB2WC( (buf ? buf : tbuf), (const char*)ubuf, n ) ;
3125         free( ubuf ) ;
3126 #else
3127         res = byteOutLen / sizeof( UniChar ) ;
3128 #endif
3129
3130         if ( buf == NULL )
3131             free(tbuf) ;
3132
3133         if ( buf  && res < n)
3134             buf[res] = 0;
3135
3136         return res ;
3137     }
3138
3139     virtual void CreateIfNeeded() const
3140     {
3141         wxMBConv_mac::CreateIfNeeded() ;
3142         if ( m_uni == NULL )
3143         {
3144             m_map.unicodeEncoding = CreateTextEncoding(kTextEncodingUnicodeDefault,
3145                 kUnicodeNoSubset, kTextEncodingDefaultFormat);
3146             m_map.otherEncoding = CreateTextEncoding(kTextEncodingUnicodeDefault,
3147                 kUnicodeCanonicalDecompVariant, kTextEncodingDefaultFormat);
3148             m_map.mappingVersion = kUnicodeUseLatestMapping;
3149
3150             OSStatus err = CreateUnicodeToTextInfo(&m_map, &m_uni);
3151             wxASSERT_MSG( err == noErr , _(" Couldn't create the UnicodeConverter")) ;
3152
3153             m_map.unicodeEncoding = CreateTextEncoding(kTextEncodingUnicodeDefault,
3154                                                        kUnicodeNoSubset, kTextEncodingDefaultFormat);
3155             m_map.otherEncoding = CreateTextEncoding(kTextEncodingUnicodeDefault,
3156                                                      kUnicodeCanonicalCompVariant, kTextEncodingDefaultFormat);
3157             m_map.mappingVersion = kUnicodeUseLatestMapping;
3158             err = CreateUnicodeToTextInfo(&m_map, &m_uniBack);
3159             wxASSERT_MSG( err == noErr , _(" Couldn't create the UnicodeConverter")) ;
3160         }
3161     }
3162 protected :
3163     mutable UnicodeToTextInfo   m_uni;
3164     mutable UnicodeToTextInfo   m_uniBack;
3165     mutable UnicodeMapping      m_map;
3166 };
3167 #endif // defined(__WXMAC__) && defined(TARGET_CARBON)
3168
3169 // ============================================================================
3170 // wxEncodingConverter based conversion classes
3171 // ============================================================================
3172
3173 #if wxUSE_FONTMAP
3174
3175 class wxMBConv_wxwin : public wxMBConv
3176 {
3177 private:
3178     void Init()
3179     {
3180         m_ok = m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
3181                w2m.Init(wxFONTENCODING_UNICODE, m_enc);
3182     }
3183
3184 public:
3185     // temporarily just use wxEncodingConverter stuff,
3186     // so that it works while a better implementation is built
3187     wxMBConv_wxwin(const char* name)
3188     {
3189         if (name)
3190             m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
3191         else
3192             m_enc = wxFONTENCODING_SYSTEM;
3193
3194         Init();
3195     }
3196
3197     wxMBConv_wxwin(wxFontEncoding enc)
3198     {
3199         m_enc = enc;
3200
3201         Init();
3202     }
3203
3204     size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
3205     {
3206         size_t inbuf = strlen(psz);
3207         if (buf)
3208         {
3209             if (!m2w.Convert(psz, buf))
3210                 return wxCONV_FAILED;
3211         }
3212         return inbuf;
3213     }
3214
3215     size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
3216     {
3217         const size_t inbuf = wxWcslen(psz);
3218         if (buf)
3219         {
3220             if (!w2m.Convert(psz, buf))
3221                 return wxCONV_FAILED;
3222         }
3223
3224         return inbuf;
3225     }
3226
3227     virtual size_t GetMBNulLen() const
3228     {
3229         switch ( m_enc )
3230         {
3231             case wxFONTENCODING_UTF16BE:
3232             case wxFONTENCODING_UTF16LE:
3233                 return 2;
3234
3235             case wxFONTENCODING_UTF32BE:
3236             case wxFONTENCODING_UTF32LE:
3237                 return 4;
3238
3239             default:
3240                 return 1;
3241         }
3242     }
3243
3244     virtual wxMBConv *Clone() const { return new wxMBConv_wxwin(m_enc); }
3245
3246     bool IsOk() const { return m_ok; }
3247
3248 public:
3249     wxFontEncoding m_enc;
3250     wxEncodingConverter m2w, w2m;
3251
3252 private:
3253     // were we initialized successfully?
3254     bool m_ok;
3255
3256     DECLARE_NO_COPY_CLASS(wxMBConv_wxwin)
3257 };
3258
3259 // make the constructors available for unit testing
3260 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_wxwin( const char* name )
3261 {
3262     wxMBConv_wxwin* result = new wxMBConv_wxwin( name );
3263     if ( !result->IsOk() )
3264     {
3265         delete result;
3266         return 0;
3267     }
3268
3269     return result;
3270 }
3271
3272 #endif // wxUSE_FONTMAP
3273
3274 // ============================================================================
3275 // wxCSConv implementation
3276 // ============================================================================
3277
3278 void wxCSConv::Init()
3279 {
3280     m_name = NULL;
3281     m_convReal =  NULL;
3282     m_deferred = true;
3283 }
3284
3285 wxCSConv::wxCSConv(const wxString& charset)
3286 {
3287     Init();
3288
3289     if ( !charset.empty() )
3290     {
3291         SetName(charset.ToAscii());
3292     }
3293
3294 #if wxUSE_FONTMAP
3295     m_encoding = wxFontMapperBase::GetEncodingFromName(charset);
3296 #else
3297     m_encoding = wxFONTENCODING_SYSTEM;
3298 #endif
3299 }
3300
3301 wxCSConv::wxCSConv(wxFontEncoding encoding)
3302 {
3303     if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
3304     {
3305         wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
3306
3307         encoding = wxFONTENCODING_SYSTEM;
3308     }
3309
3310     Init();
3311
3312     m_encoding = encoding;
3313 }
3314
// Frees the charset name and the real converter object, if any.
wxCSConv::~wxCSConv()
{
    Clear();
}
3319
3320 wxCSConv::wxCSConv(const wxCSConv& conv)
3321         : wxMBConv()
3322 {
3323     Init();
3324
3325     SetName(conv.m_name);
3326     m_encoding = conv.m_encoding;
3327 }
3328
3329 wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
3330 {
3331     Clear();
3332
3333     SetName(conv.m_name);
3334     m_encoding = conv.m_encoding;
3335
3336     return *this;
3337 }
3338
3339 void wxCSConv::Clear()
3340 {
3341     free(m_name);
3342     delete m_convReal;
3343
3344     m_name = NULL;
3345     m_convReal = NULL;
3346 }
3347
3348 void wxCSConv::SetName(const char *charset)
3349 {
3350     if (charset)
3351     {
3352         m_name = strdup(charset);
3353         m_deferred = true;
3354     }
3355 }
3356
3357 #if wxUSE_FONTMAP
3358
// Cache used by wxCSConv::DoCreate(): maps an encoding to the charset name
// which iconv was found to accept for it, or to an empty string if none of
// the names of this encoding worked.
WX_DECLARE_HASH_MAP( wxFontEncoding, wxString, wxIntegerHash, wxIntegerEqual,
                     wxEncodingNameCache );

static wxEncodingNameCache gs_nameCache;
3363 #endif
3364
// Creates the conversion object really doing the work for the charset
// described by m_name and/or m_encoding.
//
// Returns a new converter or NULL. NULL is returned both when no conversion
// object is needed (ISO8859-1) and when none could be created; in the latter
// case an error is logged, except when returning early because of a cached
// iconv failure or, under Win32 without wxUSE_FONTMAP, unconditionally.
wxMBConv *wxCSConv::DoCreate() const
{
#if wxUSE_FONTMAP
    wxLogTrace(TRACE_STRCONV,
               wxT("creating conversion for %s"),
               (m_name ? m_name
                       : (const char*)wxFontMapperBase::GetEncodingName(m_encoding).mb_str()));
#endif // wxUSE_FONTMAP

    // check for the special case of ASCII or ISO8859-1 charset: as we have
    // special knowledge of it anyhow, we don't need to create a special
    // conversion object
    if ( m_encoding == wxFONTENCODING_ISO8859_1 ||
            m_encoding == wxFONTENCODING_DEFAULT )
    {
        // don't convert at all
        return NULL;
    }

    // we trust OS to do conversion better than we can so try external
    // conversion methods first
    //
    // the full order is:
    //      1. OS conversion (iconv() under Unix or Win32 API)
    //      2. hard coded conversions for UTF
    //      3. wxEncodingConverter as fall back

    // step (1)
#ifdef HAVE_ICONV
#if !wxUSE_FONTMAP
    if ( m_name )
#endif // !wxUSE_FONTMAP
    {
#if wxUSE_FONTMAP
        wxFontEncoding encoding(m_encoding);
#endif

        if ( m_name )
        {
            // try the name exactly as it was given to us first
            wxMBConv_iconv *conv = new wxMBConv_iconv(m_name);
            if ( conv->IsOk() )
                return conv;

            delete conv;

#if wxUSE_FONTMAP
            encoding =
                wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
#endif // wxUSE_FONTMAP
        }
#if wxUSE_FONTMAP
        {
            // check if we had already tried this encoding before
            const wxEncodingNameCache::iterator it = gs_nameCache.find(encoding);
            if ( it != gs_nameCache.end() )
            {
                // an empty name means that iconv had failed for all names of
                // this encoding; notice that steps (2) and (3) are not tried
                // in this case
                if ( it->second.empty() )
                    return NULL;

                wxMBConv_iconv *conv = new wxMBConv_iconv(it->second.ToAscii());
                if ( conv->IsOk() )
                    return conv;

                delete conv;
            }

            const wxChar** names = wxFontMapperBase::GetAllEncodingNames(encoding);
            // CS : in case this does not return valid names (eg for MacRoman)
            // encoding got a 'failure' entry in the cache all the same,
            // although it just has to be created using a different method, so
            // only store failed iconv creation attempts (or perhaps we
            // shouldn't do this at all ?)
            if ( names[0] != NULL )
            {
                for ( ; *names; ++names )
                {
                    // FIXME-UTF8: wxFontMapperBase::GetAllEncodingNames()
                    //             will need changes that will obsolete this
                    wxString name(*names);
                    wxMBConv_iconv *conv = new wxMBConv_iconv(name.ToAscii());
                    if ( conv->IsOk() )
                    {
                        // remember the name which worked for the next time
                        gs_nameCache[encoding] = *names;
                        return conv;
                    }

                    delete conv;
                }

                gs_nameCache[encoding] = _T(""); // cache the failure
            }
        }
#endif // wxUSE_FONTMAP
    }
#endif // HAVE_ICONV

#ifdef wxHAVE_WIN32_MB2WC
    {
#if wxUSE_FONTMAP
        wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
                                      : new wxMBConv_win32(m_encoding);
        if ( conv->IsOk() )
            return conv;

        delete conv;
#else
        return NULL;
#endif
    }
#endif // wxHAVE_WIN32_MB2WC

#if defined(__WXMAC__)
    {
        // leave UTF16 and UTF32 to the built-ins of wx
        if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
            ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
        {
#if wxUSE_FONTMAP
            wxMBConv_mac *conv = m_name ? new wxMBConv_mac(m_name)
                                        : new wxMBConv_mac(m_encoding);
#else
            wxMBConv_mac *conv = new wxMBConv_mac(m_encoding);
#endif
            if ( conv->IsOk() )
                 return conv;

            delete conv;
        }
    }
#endif

#ifdef __DARWIN__
    {
        // leave UTF16 and UTF32 to the built-ins of wx
        if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
            ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
        {
#if wxUSE_FONTMAP
            wxMBConv_cf *conv = m_name ? new wxMBConv_cf(m_name)
                                          : new wxMBConv_cf(m_encoding);
#else
            wxMBConv_cf *conv = new wxMBConv_cf(m_encoding);
#endif

            if ( conv->IsOk() )
                 return conv;

            delete conv;
        }
    }
#endif // __DARWIN__

    // step (2)
    wxFontEncoding enc = m_encoding;
#if wxUSE_FONTMAP
    if ( enc == wxFONTENCODING_SYSTEM && m_name )
    {
        // use "false" to suppress interactive dialogs -- we can be called from
        // anywhere and popping up a dialog from here is the last thing we want to
        // do
        enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
    }
#endif // wxUSE_FONTMAP

    switch ( enc )
    {
        case wxFONTENCODING_UTF7:
             return new wxMBConvUTF7;

        case wxFONTENCODING_UTF8:
             return new wxMBConvUTF8;

        case wxFONTENCODING_UTF16BE:
             return new wxMBConvUTF16BE;

        case wxFONTENCODING_UTF16LE:
             return new wxMBConvUTF16LE;

        case wxFONTENCODING_UTF32BE:
             return new wxMBConvUTF32BE;

        case wxFONTENCODING_UTF32LE:
             return new wxMBConvUTF32LE;

        default:
             // nothing to do but put here to suppress gcc warnings
             break;
    }

    // step (3)
#if wxUSE_FONTMAP
    {
        wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
                                      : new wxMBConv_wxwin(m_encoding);
        if ( conv->IsOk() )
            return conv;

        delete conv;
    }
#endif // wxUSE_FONTMAP

    // NB: This is a hack to prevent deadlock. What could otherwise happen
    //     in Unicode build: wxConvLocal creation ends up being here
    //     because of some failure and logs the error. But wxLog will try to
    //     attach a timestamp, for which it will need wxConvLocal (to convert
    //     time to char* and then wchar_t*), but that fails, tries to log the
    //     error, but wxLog has an (already locked) critical section that
    //     guards the static buffer.
    static bool alreadyLoggingError = false;
    if (!alreadyLoggingError)
    {
        alreadyLoggingError = true;
        wxLogError(_("Cannot convert from the charset '%s'!"),
                   m_name ? m_name
                      :
#if wxUSE_FONTMAP
                         (const char*)wxFontMapperBase::GetEncodingDescription(m_encoding).ToAscii()
#else // !wxUSE_FONTMAP
                         (const char*)wxString::Format(_("encoding %i"), m_encoding).ToAscii()
#endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
              );

        alreadyLoggingError = false;
    }

    return NULL;
}
3591
3592 void wxCSConv::CreateConvIfNeeded() const
3593 {
3594     if ( m_deferred )
3595     {
3596         wxCSConv *self = (wxCSConv *)this; // const_cast
3597
3598         // if we don't have neither the name nor the encoding, use the default
3599         // encoding for this system
3600         if ( !m_name && m_encoding == wxFONTENCODING_SYSTEM )
3601         {
3602 #if wxUSE_INTL
3603             self->m_encoding = wxLocale::GetSystemEncoding();
3604 #else
3605             // fallback to some reasonable default:
3606             self->m_encoding = wxFONTENCODING_ISO8859_1;
3607 #endif // wxUSE_INTL
3608         }
3609
3610         self->m_convReal = DoCreate();
3611         self->m_deferred = false;
3612     }
3613 }
3614
3615 bool wxCSConv::IsOk() const
3616 {
3617     CreateConvIfNeeded();
3618
3619     // special case: no convReal created for wxFONTENCODING_ISO8859_1
3620     if ( m_encoding == wxFONTENCODING_ISO8859_1 )
3621         return true; // always ok as we do it ourselves
3622
3623     // m_convReal->IsOk() is called at its own creation, so we know it must
3624     // be ok if m_convReal is non-NULL
3625     return m_convReal != NULL;
3626 }
3627
3628 size_t wxCSConv::ToWChar(wchar_t *dst, size_t dstLen,
3629                          const char *src, size_t srcLen) const
3630 {
3631     CreateConvIfNeeded();
3632
3633     if (m_convReal)
3634         return m_convReal->ToWChar(dst, dstLen, src, srcLen);
3635
3636     // latin-1 (direct)
3637     return wxMBConv::ToWChar(dst, dstLen, src, srcLen);
3638 }
3639
3640 size_t wxCSConv::FromWChar(char *dst, size_t dstLen,
3641                            const wchar_t *src, size_t srcLen) const
3642 {
3643     CreateConvIfNeeded();
3644
3645     if (m_convReal)
3646         return m_convReal->FromWChar(dst, dstLen, src, srcLen);
3647
3648     // latin-1 (direct)
3649     return wxMBConv::FromWChar(dst, dstLen, src, srcLen);
3650 }
3651
3652 size_t wxCSConv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
3653 {
3654     CreateConvIfNeeded();
3655
3656     if (m_convReal)
3657         return m_convReal->MB2WC(buf, psz, n);
3658
3659     // latin-1 (direct)
3660     size_t len = strlen(psz);
3661
3662     if (buf)
3663     {
3664         for (size_t c = 0; c <= len; c++)
3665             buf[c] = (unsigned char)(psz[c]);
3666     }
3667
3668     return len;
3669 }
3670
3671 size_t wxCSConv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
3672 {
3673     CreateConvIfNeeded();
3674
3675     if (m_convReal)
3676         return m_convReal->WC2MB(buf, psz, n);
3677
3678     // latin-1 (direct)
3679     const size_t len = wxWcslen(psz);
3680     if (buf)
3681     {
3682         for (size_t c = 0; c <= len; c++)
3683         {
3684             if (psz[c] > 0xFF)
3685                 return wxCONV_FAILED;
3686
3687             buf[c] = (char)psz[c];
3688         }
3689     }
3690     else
3691     {
3692         for (size_t c = 0; c <= len; c++)
3693         {
3694             if (psz[c] > 0xFF)
3695                 return wxCONV_FAILED;
3696         }
3697     }
3698
3699     return len;
3700 }
3701
3702 size_t wxCSConv::GetMBNulLen() const
3703 {
3704     CreateConvIfNeeded();
3705
3706     if ( m_convReal )
3707     {
3708         return m_convReal->GetMBNulLen();
3709     }
3710
3711     // otherwise, we are ISO-8859-1
3712     return 1;
3713 }
3714
3715 #if wxUSE_UNICODE_UTF8
3716 bool wxCSConv::IsUTF8() const
3717 {
3718     CreateConvIfNeeded();
3719
3720     if ( m_convReal )
3721     {
3722         return m_convReal->IsUTF8();
3723     }
3724
3725     // otherwise, we are ISO-8859-1
3726     return false;
3727 }
3728 #endif
3729
3730
3731 #if wxUSE_UNICODE
3732
3733 wxWCharBuffer wxSafeConvertMB2WX(const char *s)
3734 {
3735     if ( !s )
3736         return wxWCharBuffer();
3737
3738     wxWCharBuffer wbuf(wxConvLibc.cMB2WX(s));
3739     if ( !wbuf )
3740         wbuf = wxMBConvUTF8().cMB2WX(s);
3741     if ( !wbuf )
3742         wbuf = wxConvISO8859_1.cMB2WX(s);
3743
3744     return wbuf;
3745 }
3746
3747 wxCharBuffer wxSafeConvertWX2MB(const wchar_t *ws)
3748 {
3749     if ( !ws )
3750         return wxCharBuffer();
3751
3752     wxCharBuffer buf(wxConvLibc.cWX2MB(ws));
3753     if ( !buf )
3754         buf = wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL).cWX2MB(ws);
3755
3756     return buf;
3757 }
3758
3759 #endif // wxUSE_UNICODE
3760
3761 // ----------------------------------------------------------------------------
3762 // globals
3763 // ----------------------------------------------------------------------------
3764
3765 // NB: The reason why we create converter objects in this convoluted way,
3766 //     using a factory function instead of global variable, is that they
3767 //     may be used at static initialization time (some of them are used by
3768 //     wxString ctors and there may be a global wxString object). In other
3769 //     words, possibly _before_ the converter global object would be
3770 //     initialized.
3771
3772 #undef wxConvLibc
3773 #undef wxConvUTF8
3774 #undef wxConvUTF7
3775 #undef wxConvLocal
3776 #undef wxConvISO8859_1
3777
// Defines a global converter called convName:
//
//  - a convName##Ptr pointer, initially NULL
//  - a wxGet_##convName##Ptr() factory function returning the address of a
//    function-local static object of type implType constructed with ctorArgs
//    (which must include the parentheses, if any), typed as ifaceType*
//  - a file-static gs_##convName##instance pointer initialized by calling the
//    factory: this ensures that all global converter objects are created by
//    the time static initialization is done, i.e. before any thread is
//    launched
//
// The invocation must be followed by a semicolon.
#define WX_DEFINE_GLOBAL_CONV2(ifaceType, implType, convName, ctorArgs)  \
    WXDLLIMPEXP_DATA_BASE(ifaceType*) convName##Ptr = NULL;              \
    WXDLLIMPEXP_BASE ifaceType* wxGet_##convName##Ptr()                  \
    {                                                                    \
        static implType convName##Obj ctorArgs;                          \
        return &convName##Obj;                                           \
    }                                                                    \
    static ifaceType* gs_##convName##instance = wxGet_##convName##Ptr()

// Same as WX_DEFINE_GLOBAL_CONV2() for the common case when the interface and
// the implementation classes are the same.
#define WX_DEFINE_GLOBAL_CONV(convType, convName, ctorArgs)              \
    WX_DEFINE_GLOBAL_CONV2(convType, convType, convName, ctorArgs)
3792
3793 #ifdef __WINDOWS__
3794     WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConv_win32, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
3795 #elif defined(__WXMAC__) && !defined(__MACH__)
3796     WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConv_mac, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
3797 #else
3798     WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConvLibc, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
3799 #endif
3800
3801 WX_DEFINE_GLOBAL_CONV(wxMBConvUTF8, wxConvUTF8, wxEMPTY_PARAMETER_VALUE);
3802 WX_DEFINE_GLOBAL_CONV(wxMBConvUTF7, wxConvUTF7, wxEMPTY_PARAMETER_VALUE);
3803
3804 WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvLocal, (wxFONTENCODING_SYSTEM));
3805 WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvISO8859_1, (wxFONTENCODING_ISO8859_1));
3806
3807 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = wxGet_wxConvLibcPtr();
3808 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvUI = wxGet_wxConvLocalPtr();
3809
3810 #if defined(__WXMAC__) && defined(TARGET_CARBON)
3811 static wxMBConv_macUTF8D wxConvMacUTF8DObj;
3812 #endif
3813 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName =
3814 #ifdef __WXOSX__
3815 #if defined(__WXMAC__) && defined(TARGET_CARBON)
3816                                     &wxConvMacUTF8DObj;
3817 #else
3818                                     wxGet_wxConvUTF8Ptr();
3819 #endif
3820 #else // !__WXOSX__
3821                                     wxGet_wxConvLibcPtr();
3822 #endif // __WXOSX__/!__WXOSX__
3823
3824 #else // !wxUSE_WCHAR_T
3825
3826 // FIXME-UTF8: remove this, wxUSE_WCHAR_T is required now
3827 // stand-ins in absence of wchar_t
3828 WXDLLIMPEXP_DATA_BASE(wxMBConv) wxConvLibc,
3829                                 wxConvISO8859_1,
3830                                 wxConvLocal,
3831                                 wxConvUTF8;
3832
3833 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T