src/common/strconv.cpp

   1 /////////////////////////////////////////////////////////////////////////////
   2 // Name:        src/common/strconv.cpp
   3 // Purpose:     Unicode conversion classes
   4 // Author:      Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
   5 //              Ryan Norton, Fredrik Roubert (UTF7)
   6 // Modified by:
   7 // Created:     29/01/98
   8 // RCS-ID:      $Id$
   9 // Copyright:   (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
  10 //              (c) 2000-2003 Vadim Zeitlin
  11 //              (c) 2004 Ryan Norton, Fredrik Roubert
  12 // Licence:     wxWindows licence
  13 /////////////////////////////////////////////////////////////////////////////
  14
  15 // For compilers that support precompilation, includes "wx.h".
  16 #include "wx/wxprec.h"
  17
  18 #ifndef WX_PRECOMP
  19     #include "wx/intl.h"
  20     #include "wx/log.h"
  21     #include "wx/utils.h"
  22     #include "wx/hashmap.h"
  23 #endif
  24
  25 #include "wx/strconv.h"
  26
  27 #if wxUSE_WCHAR_T
  28
  29 #ifndef __WXWINCE__
  30 #include <errno.h>
  31 #endif
  32
  33 #include <ctype.h>
  34 #include <string.h>
  35 #include <stdlib.h>
  36
  37 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
  38     #include "wx/msw/private.h"
  39     #include "wx/msw/missing.h"
  40     #define wxHAVE_WIN32_MB2WC
  41 #endif
  42
  43 #ifdef __SALFORDC__
  44     #include <clib.h>
  45 #endif
  46
  47 #ifdef HAVE_ICONV
  48     #include <iconv.h>
  49     #include "wx/thread.h"
  50 #endif
  51
  52 #include "wx/encconv.h"
  53 #include "wx/fontmap.h"
  54
  55 #ifdef __WXMAC__
  56 #ifndef __DARWIN__
  57 #include <ATSUnicode.h>
  58 #include <TextCommon.h>
  59 #include <TextEncodingConverter.h>
  60 #endif
  61
  62 // includes Mac headers
  63 #include "wx/mac/private.h"
  64 #endif
  65
  66
  67 #define TRACE_STRCONV _T("strconv")
  68
  69 // WC_UTF16 is defined only if sizeof(wchar_t) == 2, otherwise it's supposed to
  70 // be 4 bytes
  71 #if SIZEOF_WCHAR_T == 2
  72     #define WC_UTF16
  73 #endif
  74
  75
  76 // ============================================================================
  77 // implementation
  78 // ============================================================================
  79
  80 // helper function of cMB2WC(): check if n bytes at this location are all NUL
  81 static bool NotAllNULs(const char *p, size_t n)
  82 {
  83     while ( n && *p++ == '\0' )
  84         n--;
  85
  86     return n != 0;
  87 }
  88
  89 // ----------------------------------------------------------------------------
  90 // UTF-16 en/decoding to/from UCS-4 with surrogates handling
  91 // ----------------------------------------------------------------------------
  92
  93 static size_t encode_utf16(wxUint32 input, wxUint16 *output)
  94 {
  95     if (input <= 0xffff)
  96     {
  97         if (output)
  98             *output = (wxUint16) input;
  99
 100         return 1;
 101     }
 102     else if (input >= 0x110000)
 103     {
 104         return wxCONV_FAILED;
 105     }
 106     else
 107     {
 108         if (output)
 109         {
 110             *output++ = (wxUint16) ((input >> 10) + 0xd7c0);
 111             *output = (wxUint16) ((input & 0x3ff) + 0xdc00);
 112         }
 113
 114         return 2;
 115     }
 116 }
 117
 118 static size_t decode_utf16(const wxUint16* input, wxUint32& output)
 119 {
 120     if ((*input < 0xd800) || (*input > 0xdfff))
 121     {
 122         output = *input;
 123         return 1;
 124     }
 125     else if ((input[1] < 0xdc00) || (input[1] > 0xdfff))
 126     {
 127         output = *input;
 128         return wxCONV_FAILED;
 129     }
 130     else
 131     {
 132         output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
 133         return 2;
 134     }
 135 }
 136
 137 #ifdef WC_UTF16
 138     typedef wchar_t wxDecodeSurrogate_t;
 139 #else // !WC_UTF16
 140     typedef wxUint16 wxDecodeSurrogate_t;
 141 #endif // WC_UTF16/!WC_UTF16
 142
 143 // returns the next UTF-32 character from the wchar_t buffer and advances the
 144 // pointer to the character after this one
 145 //
 146 // if an invalid character is found, *pSrc is set to NULL, the caller must
 147 // check for this
 148 static wxUint32 wxDecodeSurrogate(const wxDecodeSurrogate_t **pSrc)
 149 {
 150     wxUint32 out;
 151     const size_t
 152         n = decode_utf16(wx_reinterpret_cast(const wxUint16 *, *pSrc), out);
 153     if ( n == wxCONV_FAILED )
 154         *pSrc = NULL;
 155     else
 156         *pSrc += n;
 157
 158     return out;
 159 }
 160
 161 // ----------------------------------------------------------------------------
 162 // wxMBConv
 163 // ----------------------------------------------------------------------------
 164
 165 size_t
 166 wxMBConv::ToWChar(wchar_t *dst, size_t dstLen,
 167                   const char *src, size_t srcLen) const
 168 {
 169     // although new conversion classes are supposed to implement this function
 170     // directly, the existins ones only implement the old MB2WC() and so, to
 171     // avoid to have to rewrite all conversion classes at once, we provide a
 172     // default (but not efficient) implementation of this one in terms of the
 173     // old function by copying the input to ensure that it's NUL-terminated and
 174     // then using MB2WC() to convert it
 175
 176     // the number of chars [which would be] written to dst [if it were not NULL]
 177     size_t dstWritten = 0;
 178
 179     // the number of NULs terminating this string
 180     size_t nulLen = 0;  // not really needed, but just to avoid warnings
 181
 182     // if we were not given the input size we just have to assume that the
 183     // string is properly terminated as we have no way of knowing how long it
 184     // is anyhow, but if we do have the size check whether there are enough
 185     // NULs at the end
 186     wxCharBuffer bufTmp;
 187     const char *srcEnd;
 188     if ( srcLen != wxNO_LEN )
 189     {
 190         // we need to know how to find the end of this string
 191         nulLen = GetMBNulLen();
 192         if ( nulLen == wxCONV_FAILED )
 193             return wxCONV_FAILED;
 194
 195         // if there are enough NULs we can avoid the copy
 196         if ( srcLen < nulLen || NotAllNULs(src + srcLen - nulLen, nulLen) )
 197         {
 198             // make a copy in order to properly NUL-terminate the string
 199             bufTmp = wxCharBuffer(srcLen + nulLen - 1 /* 1 will be added */);
 200             char * const p = bufTmp.data();
 201             memcpy(p, src, srcLen);
 202             for ( char *s = p + srcLen; s < p + srcLen + nulLen; s++ )
 203                 *s = '\0';
 204
 205             src = bufTmp;
 206         }
 207
 208         srcEnd = src + srcLen;
 209     }
 210     else // quit after the first loop iteration
 211     {
 212         srcEnd = NULL;
 213     }
 214
 215     for ( ;; )
 216     {
 217         // try to convert the current chunk
 218         size_t lenChunk = MB2WC(NULL, src, 0);
 219         if ( lenChunk == wxCONV_FAILED )
 220             return wxCONV_FAILED;
 221
 222         lenChunk++; // for the L'\0' at the end of this chunk
 223
 224         dstWritten += lenChunk;
 225
 226         if ( lenChunk == 1 )
 227         {
 228             // nothing left in the input string, conversion succeeded
 229             break;
 230         }
 231
 232         if ( dst )
 233         {
 234             if ( dstWritten > dstLen )
 235                 return wxCONV_FAILED;
 236
 237             if ( MB2WC(dst, src, lenChunk) == wxCONV_FAILED )
 238                 return wxCONV_FAILED;
 239
 240             dst += lenChunk;
 241         }
 242
 243         if ( !srcEnd )
 244         {
 245             // we convert just one chunk in this case as this is the entire
 246             // string anyhow
 247             break;
 248         }
 249
 250         // advance the input pointer past the end of this chunk
 251         while ( NotAllNULs(src, nulLen) )
 252         {
 253             // notice that we must skip over multiple bytes here as we suppose
 254             // that if NUL takes 2 or 4 bytes, then all the other characters do
 255             // too and so if advanced by a single byte we might erroneously
 256             // detect sequences of NUL bytes in the middle of the input
 257             src += nulLen;
 258         }
 259
 260         src += nulLen; // skipping over its terminator as well
 261
 262         // note that ">=" (and not just "==") is needed here as the terminator
 263         // we skipped just above could be inside or just after the buffer
 264         // delimited by inEnd
 265         if ( src >= srcEnd )
 266             break;
 267     }
 268
 269     return dstWritten;
 270 }
 271
 272 size_t
 273 wxMBConv::FromWChar(char *dst, size_t dstLen,
 274                     const wchar_t *src, size_t srcLen) const
 275 {
 276     // the number of chars [which would be] written to dst [if it were not NULL]
 277     size_t dstWritten = 0;
 278
 279     // make a copy of the input string unless it is already properly
 280     // NUL-terminated
 281     //
 282     // if we don't know its length we have no choice but to assume that it is,
 283     // indeed, properly terminated
 284     wxWCharBuffer bufTmp;
 285     if ( srcLen == wxNO_LEN )
 286     {
 287         srcLen = wxWcslen(src) + 1;
 288     }
 289     else if ( srcLen != 0 && src[srcLen - 1] != L'\0' )
 290     {
 291         // make a copy in order to properly NUL-terminate the string
 292         bufTmp = wxWCharBuffer(srcLen);
 293         memcpy(bufTmp.data(), src, srcLen * sizeof(wchar_t));
 294         src = bufTmp;
 295     }
 296
 297     const size_t lenNul = GetMBNulLen();
 298     for ( const wchar_t * const srcEnd = src + srcLen;
 299           src < srcEnd;
 300           src += wxWcslen(src) + 1 /* skip L'\0' too */ )
 301     {
 302         // try to convert the current chunk
 303         size_t lenChunk = WC2MB(NULL, src, 0);
 304
 305         if ( lenChunk == wxCONV_FAILED )
 306             return wxCONV_FAILED;
 307
 308         lenChunk += lenNul;
 309         dstWritten += lenChunk;
 310
 311         if ( dst )
 312         {
 313             if ( dstWritten > dstLen )
 314                 return wxCONV_FAILED;
 315
 316             if ( WC2MB(dst, src, lenChunk) == wxCONV_FAILED )
 317                 return wxCONV_FAILED;
 318
 319             dst += lenChunk;
 320         }
 321     }
 322
 323     return dstWritten;
 324 }
 325
 326 size_t wxMBConv::MB2WC(wchar_t *outBuff, const char *inBuff, size_t outLen) const
 327 {
 328     size_t rc = ToWChar(outBuff, outLen, inBuff);
 329     if ( rc != wxCONV_FAILED )
 330     {
 331         // ToWChar() returns the buffer length, i.e. including the trailing
 332         // NUL, while this method doesn't take it into account
 333         rc--;
 334     }
 335
 336     return rc;
 337 }
 338
 339 size_t wxMBConv::WC2MB(char *outBuff, const wchar_t *inBuff, size_t outLen) const
 340 {
 341     size_t rc = FromWChar(outBuff, outLen, inBuff);
 342     if ( rc != wxCONV_FAILED )
 343     {
 344         rc -= GetMBNulLen();
 345     }
 346
 347     return rc;
 348 }
 349
wxMBConv::~wxMBConv()
{
    // intentionally empty: there is nothing to clean up here, the destructor
    // is only defined out of line (probably necessary for linking on Darwin)
}
 354
 355 const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
 356 {
 357     if ( psz )
 358     {
 359         // calculate the length of the buffer needed first
 360         const size_t nLen = MB2WC(NULL, psz, 0);
 361         if ( nLen != wxCONV_FAILED )
 362         {
 363             // now do the actual conversion
 364             wxWCharBuffer buf(nLen /* +1 added implicitly */);
 365
 366             // +1 for the trailing NULL
 367             if ( MB2WC(buf.data(), psz, nLen + 1) != wxCONV_FAILED )
 368                 return buf;
 369         }
 370     }
 371
 372     return wxWCharBuffer();
 373 }
 374
 375 const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
 376 {
 377     if ( pwz )
 378     {
 379         const size_t nLen = WC2MB(NULL, pwz, 0);
 380         if ( nLen != wxCONV_FAILED )
 381         {
 382             // extra space for trailing NUL(s)
 383             static const size_t extraLen = GetMaxMBNulLen();
 384
 385             wxCharBuffer buf(nLen + extraLen - 1);
 386             if ( WC2MB(buf.data(), pwz, nLen + extraLen) != wxCONV_FAILED )
 387                 return buf;
 388         }
 389     }
 390
 391     return wxCharBuffer();
 392 }
 393
 394 const wxWCharBuffer
 395 wxMBConv::cMB2WC(const char *inBuff, size_t inLen, size_t *outLen) const
 396 {
 397     const size_t dstLen = ToWChar(NULL, 0, inBuff, inLen);
 398     if ( dstLen != wxCONV_FAILED )
 399     {
 400         wxWCharBuffer wbuf(dstLen - 1);
 401         if ( ToWChar(wbuf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
 402         {
 403             if ( outLen )
 404             {
 405                 *outLen = dstLen;
 406                 if ( wbuf[dstLen - 1] == L'\0' )
 407                     (*outLen)--;
 408             }
 409
 410             return wbuf;
 411         }
 412     }
 413
 414     if ( outLen )
 415         *outLen = 0;
 416
 417     return wxWCharBuffer();
 418 }
 419
 420 const wxCharBuffer
 421 wxMBConv::cWC2MB(const wchar_t *inBuff, size_t inLen, size_t *outLen) const
 422 {
 423     size_t dstLen = FromWChar(NULL, 0, inBuff, inLen);
 424     if ( dstLen != wxCONV_FAILED )
 425     {
 426         // special case of empty input: can't allocate 0 size buffer below as
 427         // wxCharBuffer insists on NUL-terminating it
 428         wxCharBuffer buf(dstLen ? dstLen - 1 : 1);
 429         if ( FromWChar(buf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
 430         {
 431             if ( outLen )
 432             {
 433                 *outLen = dstLen;
 434
 435                 const size_t nulLen = GetMBNulLen();
 436                 if ( dstLen >= nulLen &&
 437                         !NotAllNULs(buf.data() + dstLen - nulLen, nulLen) )
 438                 {
 439                     // in this case the output is NUL-terminated and we're not
 440                     // supposed to count NUL
 441                     *outLen -= nulLen;
 442                 }
 443             }
 444
 445             return buf;
 446         }
 447     }
 448
 449     if ( outLen )
 450         *outLen = 0;
 451
 452     return wxCharBuffer();
 453 }
 454
 455 // ----------------------------------------------------------------------------
 456 // wxMBConvLibc
 457 // ----------------------------------------------------------------------------
 458
 459 size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 460 {
 461     return wxMB2WC(buf, psz, n);
 462 }
 463
 464 size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 465 {
 466     return wxWC2MB(buf, psz, n);
 467 }
 468
 469 // ----------------------------------------------------------------------------
 470 // wxConvBrokenFileNames
 471 // ----------------------------------------------------------------------------
 472
 473 #ifdef __UNIX__
 474
 475 wxConvBrokenFileNames::wxConvBrokenFileNames(const wxChar *charset)
 476 {
 477     if ( !charset || wxStricmp(charset, _T("UTF-8")) == 0
 478                   || wxStricmp(charset, _T("UTF8")) == 0  )
 479         m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL);
 480     else
 481         m_conv = new wxCSConv(charset);
 482 }
 483
 484 #endif // __UNIX__
 485
 486 // ----------------------------------------------------------------------------
 487 // UTF-7
 488 // ----------------------------------------------------------------------------
 489
 490 // Implementation (C) 2004 Fredrik Roubert
 491
 492 //
 493 // BASE64 decoding table
 494 //
 495 static const unsigned char utf7unb64[] =
 496 {
 497     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 498     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 499     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 500     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 501     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 502     0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
 503     0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b,
 504     0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 505     0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
 506     0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
 507     0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
 508     0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
 509     0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
 510     0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
 511     0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30,
 512     0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
 513     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 514     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 515     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 516     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 517     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 518     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 519     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 520     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 521     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 522     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 523     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 524     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 525     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 526     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 527     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 528     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
 529 };
 530
 531 size_t wxMBConvUTF7::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 532 {
 533     size_t len = 0;
 534
 535     while ( *psz && (!buf || (len < n)) )
 536     {
 537         unsigned char cc = *psz++;
 538         if (cc != '+')
 539         {
 540             // plain ASCII char
 541             if (buf)
 542                 *buf++ = cc;
 543             len++;
 544         }
 545         else if (*psz == '-')
 546         {
 547             // encoded plus sign
 548             if (buf)
 549                 *buf++ = cc;
 550             len++;
 551             psz++;
 552         }
 553         else // start of BASE64 encoded string
 554         {
 555             bool lsb, ok;
 556             unsigned int d, l;
 557             for ( ok = lsb = false, d = 0, l = 0;
 558                   (cc = utf7unb64[(unsigned char)*psz]) != 0xff;
 559                   psz++ )
 560             {
 561                 d <<= 6;
 562                 d += cc;
 563                 for (l += 6; l >= 8; lsb = !lsb)
 564                 {
 565                     unsigned char c = (unsigned char)((d >> (l -= 8)) % 256);
 566                     if (lsb)
 567                     {
 568                         if (buf)
 569                             *buf++ |= c;
 570                         len ++;
 571                     }
 572                     else
 573                     {
 574                         if (buf)
 575                             *buf = (wchar_t)(c << 8);
 576                     }
 577
 578                     ok = true;
 579                 }
 580             }
 581
 582             if ( !ok )
 583             {
 584                 // in valid UTF7 we should have valid characters after '+'
 585                 return wxCONV_FAILED;
 586             }
 587
 588             if (*psz == '-')
 589                 psz++;
 590         }
 591     }
 592
 593     if ( buf && (len < n) )
 594         *buf = '\0';
 595
 596     return len;
 597 }
 598
 599 //
 600 // BASE64 encoding table
 601 //
 602 static const unsigned char utf7enb64[] =
 603 {
 604     'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
 605     'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
 606     'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
 607     'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
 608     'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
 609     'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
 610     'w', 'x', 'y', 'z', '0', '1', '2', '3',
 611     '4', '5', '6', '7', '8', '9', '+', '/'
 612 };
 613
 614 //
 615 // UTF-7 encoding table
 616 //
 617 // 0 - Set D (directly encoded characters)
 618 // 1 - Set O (optional direct characters)
 619 // 2 - whitespace characters (optional)
 620 // 3 - special characters
 621 //
 622 static const unsigned char utf7encode[128] =
 623 {
 624     3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
 625     3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
 626     2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
 627     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
 628     1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 629     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
 630     1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 631     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
 632 };
 633
 634 size_t wxMBConvUTF7::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 635 {
 636     size_t len = 0;
 637
 638     while (*psz && ((!buf) || (len < n)))
 639     {
 640         wchar_t cc = *psz++;
 641         if (cc < 0x80 && utf7encode[cc] < 1)
 642         {
 643             // plain ASCII char
 644             if (buf)
 645                 *buf++ = (char)cc;
 646
 647             len++;
 648         }
 649 #ifndef WC_UTF16
 650         else if (((wxUint32)cc) > 0xffff)
 651         {
 652             // no surrogate pair generation (yet?)
 653             return wxCONV_FAILED;
 654         }
 655 #endif
 656         else
 657         {
 658             if (buf)
 659                 *buf++ = '+';
 660
 661             len++;
 662             if (cc != '+')
 663             {
 664                 // BASE64 encode string
 665                 unsigned int lsb, d, l;
 666                 for (d = 0, l = 0; /*nothing*/; psz++)
 667                 {
 668                     for (lsb = 0; lsb < 2; lsb ++)
 669                     {
 670                         d <<= 8;
 671                         d += lsb ? cc & 0xff : (cc & 0xff00) >> 8;
 672
 673                         for (l += 8; l >= 6; )
 674                         {
 675                             l -= 6;
 676                             if (buf)
 677                                 *buf++ = utf7enb64[(d >> l) % 64];
 678                             len++;
 679                         }
 680                     }
 681
 682                     cc = *psz;
 683                     if (!(cc) || (cc < 0x80 && utf7encode[cc] < 1))
 684                         break;
 685                 }
 686
 687                 if (l != 0)
 688                 {
 689                     if (buf)
 690                         *buf++ = utf7enb64[((d % 16) << (6 - l)) % 64];
 691
 692                     len++;
 693                 }
 694             }
 695
 696             if (buf)
 697                 *buf++ = '-';
 698             len++;
 699         }
 700     }
 701
 702     if (buf && (len < n))
 703         *buf = 0;
 704
 705     return len;
 706 }
 707
 708 // ----------------------------------------------------------------------------
 709 // UTF-8
 710 // ----------------------------------------------------------------------------
 711
 712 static wxUint32 utf8_max[]=
 713     { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
 714
 715 // boundaries of the private use area we use to (temporarily) remap invalid
 716 // characters invalid in a UTF-8 encoded string
 717 const wxUint32 wxUnicodePUA = 0x100000;
 718 const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
 719
 720 size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 721 {
 722     size_t len = 0;
 723
 724     while (*psz && ((!buf) || (len < n)))
 725     {
 726         const char *opsz = psz;
 727         bool invalid = false;
 728         unsigned char cc = *psz++, fc = cc;
 729         unsigned cnt;
 730         for (cnt = 0; fc & 0x80; cnt++)
 731             fc <<= 1;
 732
 733         if (!cnt)
 734         {
 735             // plain ASCII char
 736             if (buf)
 737                 *buf++ = cc;
 738             len++;
 739
 740             // escape the escape character for octal escapes
 741             if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
 742                     && cc == '\\' && (!buf || len < n))
 743             {
 744                 if (buf)
 745                     *buf++ = cc;
 746                 len++;
 747             }
 748         }
 749         else
 750         {
 751             cnt--;
 752             if (!cnt)
 753             {
 754                 // invalid UTF-8 sequence
 755                 invalid = true;
 756             }
 757             else
 758             {
 759                 unsigned ocnt = cnt - 1;
 760                 wxUint32 res = cc & (0x3f >> cnt);
 761                 while (cnt--)
 762                 {
 763                     cc = *psz;
 764                     if ((cc & 0xC0) != 0x80)
 765                     {
 766                         // invalid UTF-8 sequence
 767                         invalid = true;
 768                         break;
 769                     }
 770
 771                     psz++;
 772                     res = (res << 6) | (cc & 0x3f);
 773                 }
 774
 775                 if (invalid || res <= utf8_max[ocnt])
 776                 {
 777                     // illegal UTF-8 encoding
 778                     invalid = true;
 779                 }
 780                 else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
 781                         res >= wxUnicodePUA && res < wxUnicodePUAEnd)
 782                 {
 783                     // if one of our PUA characters turns up externally
 784                     // it must also be treated as an illegal sequence
 785                     // (a bit like you have to escape an escape character)
 786                     invalid = true;
 787                 }
 788                 else
 789                 {
 790 #ifdef WC_UTF16
 791                     // cast is ok because wchar_t == wxUuint16 if WC_UTF16
 792                     size_t pa = encode_utf16(res, (wxUint16 *)buf);
 793                     if (pa == wxCONV_FAILED)
 794                     {
 795                         invalid = true;
 796                     }
 797                     else
 798                     {
 799                         if (buf)
 800                             buf += pa;
 801                         len += pa;
 802                     }
 803 #else // !WC_UTF16
 804                     if (buf)
 805                         *buf++ = (wchar_t)res;
 806                     len++;
 807 #endif // WC_UTF16/!WC_UTF16
 808                 }
 809             }
 810
 811             if (invalid)
 812             {
 813                 if (m_options & MAP_INVALID_UTF8_TO_PUA)
 814                 {
 815                     while (opsz < psz && (!buf || len < n))
 816                     {
 817 #ifdef WC_UTF16
 818                         // cast is ok because wchar_t == wxUuint16 if WC_UTF16
 819                         size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
 820                         wxASSERT(pa != wxCONV_FAILED);
 821                         if (buf)
 822                             buf += pa;
 823                         opsz++;
 824                         len += pa;
 825 #else
 826                         if (buf)
 827                             *buf++ = (wchar_t)(wxUnicodePUA + (unsigned char)*opsz);
 828                         opsz++;
 829                         len++;
 830 #endif
 831                     }
 832                 }
 833                 else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
 834                 {
 835                     while (opsz < psz && (!buf || len < n))
 836                     {
 837                         if ( buf && len + 3 < n )
 838                         {
 839                             unsigned char on = *opsz;
 840                             *buf++ = L'\\';
 841                             *buf++ = (wchar_t)( L'0' + on / 0100 );
 842                             *buf++ = (wchar_t)( L'0' + (on % 0100) / 010 );
 843                             *buf++ = (wchar_t)( L'0' + on % 010 );
 844                         }
 845
 846                         opsz++;
 847                         len += 4;
 848                     }
 849                 }
 850                 else // MAP_INVALID_UTF8_NOT
 851                 {
 852                     return wxCONV_FAILED;
 853                 }
 854             }
 855         }
 856     }
 857
 858     if (buf && (len < n))
 859         *buf = 0;
 860
 861     return len;
 862 }
 863
 864 static inline bool isoctal(wchar_t wch)
 865 {
 866     return L'0' <= wch && wch <= L'7';
 867 }
 868
 869 size_t wxMBConvUTF8::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 870 {
 871     size_t len = 0;
 872
 873     while (*psz && ((!buf) || (len < n)))
 874     {
 875         wxUint32 cc;
 876
 877 #ifdef WC_UTF16
 878         // cast is ok for WC_UTF16
 879         size_t pa = decode_utf16((const wxUint16 *)psz, cc);
 880         psz += (pa == wxCONV_FAILED) ? 1 : pa;
 881 #else
 882         cc = (*psz++) & 0x7fffffff;
 883 #endif
 884
 885         if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
 886                 && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
 887         {
 888             if (buf)
 889                 *buf++ = (char)(cc - wxUnicodePUA);
 890             len++;
 891         }
 892         else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL)
 893                     && cc == L'\\' && psz[0] == L'\\' )
 894         {
 895             if (buf)
 896                 *buf++ = (char)cc;
 897             psz++;
 898             len++;
 899         }
 900         else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
 901                     cc == L'\\' &&
 902                         isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
 903         {
 904             if (buf)
 905             {
 906                 *buf++ = (char) ((psz[0] - L'0') * 0100 +
 907                                  (psz[1] - L'0') * 010 +
 908                                  (psz[2] - L'0'));
 909             }
 910
 911             psz += 3;
 912             len++;
 913         }
 914         else
 915         {
 916             unsigned cnt;
 917             for (cnt = 0; cc > utf8_max[cnt]; cnt++)
 918             {
 919             }
 920
 921             if (!cnt)
 922             {
 923                 // plain ASCII char
 924                 if (buf)
 925                     *buf++ = (char) cc;
 926                 len++;
 927             }
 928             else
 929             {
 930                 len += cnt + 1;
 931                 if (buf)
 932                 {
 933                     *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
 934                     while (cnt--)
 935                         *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
 936                 }
 937             }
 938         }
 939     }
 940
 941     if (buf && (len < n))
 942         *buf = 0;
 943
 944     return len;
 945 }
 946
 947 // ============================================================================
 948 // UTF-16
 949 // ============================================================================
 950
 951 #ifdef WORDS_BIGENDIAN
 952     #define wxMBConvUTF16straight wxMBConvUTF16BE
 953     #define wxMBConvUTF16swap     wxMBConvUTF16LE
 954 #else
 955     #define wxMBConvUTF16swap     wxMBConvUTF16BE
 956     #define wxMBConvUTF16straight wxMBConvUTF16LE
 957 #endif
 958
 959 /* static */
 960 size_t wxMBConvUTF16Base::GetLength(const char *src, size_t srcLen)
 961 {
 962     if ( srcLen == wxNO_LEN )
 963     {
 964         // count the number of bytes in input, including the trailing NULs
 965         const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
 966         for ( srcLen = 1; *inBuff++; srcLen++ )
 967             ;
 968
 969         srcLen *= BYTES_PER_CHAR;
 970     }
 971     else // we already have the length
 972     {
 973         // we can only convert an entire number of UTF-16 characters
 974         if ( srcLen % BYTES_PER_CHAR )
 975             return wxCONV_FAILED;
 976     }
 977
 978     return srcLen;
 979 }
 980
 981 // case when in-memory representation is UTF-16 too
 982 #ifdef WC_UTF16
 983
 984 // ----------------------------------------------------------------------------
 985 // conversions without endianness change
 986 // ----------------------------------------------------------------------------
 987
 988 size_t
 989 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
 990                                const char *src, size_t srcLen) const
 991 {
 992     // set up the scene for using memcpy() (which is presumably more efficient
 993     // than copying the bytes one by one)
 994     srcLen = GetLength(src, srcLen);
 995     if ( srcLen == wxNO_LEN )
 996         return wxCONV_FAILED;
 997
 998     const size_t inLen = srcLen / BYTES_PER_CHAR;
 999     if ( dst )
1000     {
1001         if ( dstLen < inLen )
1002             return wxCONV_FAILED;
1003
1004         memcpy(dst, src, srcLen);
1005     }
1006
1007     return inLen;
1008 }
1009
1010 size_t
1011 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1012                                  const wchar_t *src, size_t srcLen) const
1013 {
1014     if ( srcLen == wxNO_LEN )
1015         srcLen = wxWcslen(src) + 1;
1016
1017     srcLen *= BYTES_PER_CHAR;
1018
1019     if ( dst )
1020     {
1021         if ( dstLen < srcLen )
1022             return wxCONV_FAILED;
1023
1024         memcpy(dst, src, srcLen);
1025     }
1026
1027     return srcLen;
1028 }
1029
1030 // ----------------------------------------------------------------------------
1031 // endian-reversing conversions
1032 // ----------------------------------------------------------------------------
1033
1034 size_t
1035 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1036                            const char *src, size_t srcLen) const
1037 {
1038     srcLen = GetLength(src, srcLen);
1039     if ( srcLen == wxNO_LEN )
1040         return wxCONV_FAILED;
1041
1042     srcLen /= BYTES_PER_CHAR;
1043
1044     if ( dst )
1045     {
1046         if ( dstLen < srcLen )
1047             return wxCONV_FAILED;
1048
1049         const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1050         for ( size_t n = 0; n < srcLen; n++, inBuff++ )
1051         {
1052             *dst++ = wxUINT16_SWAP_ALWAYS(*inBuff);
1053         }
1054     }
1055
1056     return srcLen;
1057 }
1058
1059 size_t
1060 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1061                              const wchar_t *src, size_t srcLen) const
1062 {
1063     if ( srcLen == wxNO_LEN )
1064         srcLen = wxWcslen(src) + 1;
1065
1066     srcLen *= BYTES_PER_CHAR;
1067
1068     if ( dst )
1069     {
1070         if ( dstLen < srcLen )
1071             return wxCONV_FAILED;
1072
1073         wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1074         for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
1075         {
1076             *outBuff++ = wxUINT16_SWAP_ALWAYS(*src);
1077         }
1078     }
1079
1080     return srcLen;
1081 }
1082
1083 #else // !WC_UTF16: wchar_t is UTF-32
1084
1085 // ----------------------------------------------------------------------------
1086 // conversions without endianness change
1087 // ----------------------------------------------------------------------------
1088
1089 size_t
1090 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1091                                const char *src, size_t srcLen) const
1092 {
1093     srcLen = GetLength(src, srcLen);
1094     if ( srcLen == wxNO_LEN )
1095         return wxCONV_FAILED;
1096
1097     const size_t inLen = srcLen / BYTES_PER_CHAR;
1098     if ( !dst )
1099     {
1100         // optimization: return maximal space which could be needed for this
1101         // string even if the real size could be smaller if the buffer contains
1102         // any surrogates
1103         return inLen;
1104     }
1105
1106     size_t outLen = 0;
1107     const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1108     for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1109     {
1110         const wxUint32 ch = wxDecodeSurrogate(&inBuff);
1111         if ( !inBuff )
1112             return wxCONV_FAILED;
1113
1114         if ( ++outLen > dstLen )
1115             return wxCONV_FAILED;
1116
1117         *dst++ = ch;
1118     }
1119
1120
1121     return outLen;
1122 }
1123
1124 size_t
1125 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1126                                  const wchar_t *src, size_t srcLen) const
1127 {
1128     if ( srcLen == wxNO_LEN )
1129         srcLen = wxWcslen(src) + 1;
1130
1131     size_t outLen = 0;
1132     wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1133     for ( size_t n = 0; n < srcLen; n++ )
1134     {
1135         wxUint16 cc[2];
1136         const size_t numChars = encode_utf16(*src++, cc);
1137         if ( numChars == wxCONV_FAILED )
1138             return wxCONV_FAILED;
1139
1140         outLen += numChars * BYTES_PER_CHAR;
1141         if ( outBuff )
1142         {
1143             if ( outLen > dstLen )
1144                 return wxCONV_FAILED;
1145
1146             *outBuff++ = cc[0];
1147             if ( numChars == 2 )
1148             {
1149                 // second character of a surrogate
1150                 *outBuff++ = cc[1];
1151             }
1152         }
1153     }
1154
1155     return outLen;
1156 }
1157
1158 // ----------------------------------------------------------------------------
1159 // endian-reversing conversions
1160 // ----------------------------------------------------------------------------
1161
1162 size_t
1163 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1164                            const char *src, size_t srcLen) const
1165 {
1166     srcLen = GetLength(src, srcLen);
1167     if ( srcLen == wxNO_LEN )
1168         return wxCONV_FAILED;
1169
1170     const size_t inLen = srcLen / BYTES_PER_CHAR;
1171     if ( !dst )
1172     {
1173         // optimization: return maximal space which could be needed for this
1174         // string even if the real size could be smaller if the buffer contains
1175         // any surrogates
1176         return inLen;
1177     }
1178
1179     size_t outLen = 0;
1180     const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1181     for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1182     {
1183         wxUint32 ch;
1184         wxUint16 tmp[2];
1185
1186         tmp[0] = wxUINT16_SWAP_ALWAYS(*inBuff);
1187         inBuff++;
1188         tmp[1] = wxUINT16_SWAP_ALWAYS(*inBuff);
1189
1190         const size_t numChars = decode_utf16(tmp, ch);
1191         if ( numChars == wxCONV_FAILED )
1192             return wxCONV_FAILED;
1193
1194         if ( numChars == 2 )
1195             inBuff++;
1196
1197         if ( ++outLen > dstLen )
1198             return wxCONV_FAILED;
1199
1200         *dst++ = ch;
1201     }
1202
1203
1204     return outLen;
1205 }
1206
1207 size_t
1208 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1209                              const wchar_t *src, size_t srcLen) const
1210 {
1211     if ( srcLen == wxNO_LEN )
1212         srcLen = wxWcslen(src) + 1;
1213
1214     size_t outLen = 0;
1215     wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1216     for ( const wchar_t *srcEnd = src + srcLen; src < srcEnd; src++ )
1217     {
1218         wxUint16 cc[2];
1219         const size_t numChars = encode_utf16(*src, cc);
1220         if ( numChars == wxCONV_FAILED )
1221             return wxCONV_FAILED;
1222
1223         outLen += numChars * BYTES_PER_CHAR;
1224         if ( outBuff )
1225         {
1226             if ( outLen > dstLen )
1227                 return wxCONV_FAILED;
1228
1229             *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[0]);
1230             if ( numChars == 2 )
1231             {
1232                 // second character of a surrogate
1233                 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[1]);
1234             }
1235         }
1236     }
1237
1238     return outLen;
1239 }
1240
1241 #endif // WC_UTF16/!WC_UTF16
1242
1243
1244 // ============================================================================
1245 // UTF-32
1246 // ============================================================================
1247
// As for UTF-16: "straight" is the converter for UTF-32 in the byte order of
// this machine and "swap" the one for the opposite byte order; map these
// names to the public BE/LE classes.
#ifdef WORDS_BIGENDIAN
    #define wxMBConvUTF32straight  wxMBConvUTF32BE
    #define wxMBConvUTF32swap      wxMBConvUTF32LE
#else
    #define wxMBConvUTF32swap      wxMBConvUTF32BE
    #define wxMBConvUTF32straight  wxMBConvUTF32LE
#endif


// global converter objects, one per UTF-32 byte order
WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
1259
1260 /* static */
1261 size_t wxMBConvUTF32Base::GetLength(const char *src, size_t srcLen)
1262 {
1263     if ( srcLen == wxNO_LEN )
1264     {
1265         // count the number of bytes in input, including the trailing NULs
1266         const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1267         for ( srcLen = 1; *inBuff++; srcLen++ )
1268             ;
1269
1270         srcLen *= BYTES_PER_CHAR;
1271     }
1272     else // we already have the length
1273     {
1274         // we can only convert an entire number of UTF-32 characters
1275         if ( srcLen % BYTES_PER_CHAR )
1276             return wxCONV_FAILED;
1277     }
1278
1279     return srcLen;
1280 }
1281
1282 // case when in-memory representation is UTF-16
1283 #ifdef WC_UTF16
1284
1285 // ----------------------------------------------------------------------------
1286 // conversions without endianness change
1287 // ----------------------------------------------------------------------------
1288
1289 size_t
1290 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1291                                const char *src, size_t srcLen) const
1292 {
1293     srcLen = GetLength(src, srcLen);
1294     if ( srcLen == wxNO_LEN )
1295         return wxCONV_FAILED;
1296
1297     const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1298     const size_t inLen = srcLen / BYTES_PER_CHAR;
1299     size_t outLen = 0;
1300     for ( size_t n = 0; n < inLen; n++ )
1301     {
1302         wxUint16 cc[2];
1303         const size_t numChars = encode_utf16(*inBuff++, cc);
1304         if ( numChars == wxCONV_FAILED )
1305             return wxCONV_FAILED;
1306
1307         outLen += numChars;
1308         if ( dst )
1309         {
1310             if ( outLen > dstLen )
1311                 return wxCONV_FAILED;
1312
1313             *dst++ = cc[0];
1314             if ( numChars == 2 )
1315             {
1316                 // second character of a surrogate
1317                 *dst++ = cc[1];
1318             }
1319         }
1320     }
1321
1322     return outLen;
1323 }
1324
1325 size_t
1326 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1327                                  const wchar_t *src, size_t srcLen) const
1328 {
1329     if ( srcLen == wxNO_LEN )
1330         srcLen = wxWcslen(src) + 1;
1331
1332     if ( !dst )
1333     {
1334         // optimization: return maximal space which could be needed for this
1335         // string instead of the exact amount which could be less if there are
1336         // any surrogates in the input
1337         //
1338         // we consider that surrogates are rare enough to make it worthwhile to
1339         // avoid running the loop below at the cost of slightly extra memory
1340         // consumption
1341         return srcLen * BYTES_PER_CHAR;
1342     }
1343
1344     wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1345     size_t outLen = 0;
1346     for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1347     {
1348         const wxUint32 ch = wxDecodeSurrogate(&src);
1349         if ( !src )
1350             return wxCONV_FAILED;
1351
1352         outLen += BYTES_PER_CHAR;
1353
1354         if ( outLen > dstLen )
1355             return wxCONV_FAILED;
1356
1357         *outBuff++ = ch;
1358     }
1359
1360     return outLen;
1361 }
1362
1363 // ----------------------------------------------------------------------------
1364 // endian-reversing conversions
1365 // ----------------------------------------------------------------------------
1366
1367 size_t
1368 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1369                            const char *src, size_t srcLen) const
1370 {
1371     srcLen = GetLength(src, srcLen);
1372     if ( srcLen == wxNO_LEN )
1373         return wxCONV_FAILED;
1374
1375     const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1376     const size_t inLen = srcLen / BYTES_PER_CHAR;
1377     size_t outLen = 0;
1378     for ( size_t n = 0; n < inLen; n++, inBuff++ )
1379     {
1380         wxUint16 cc[2];
1381         const size_t numChars = encode_utf16(wxUINT32_SWAP_ALWAYS(*inBuff), cc);
1382         if ( numChars == wxCONV_FAILED )
1383             return wxCONV_FAILED;
1384
1385         outLen += numChars;
1386         if ( dst )
1387         {
1388             if ( outLen > dstLen )
1389                 return wxCONV_FAILED;
1390
1391             *dst++ = cc[0];
1392             if ( numChars == 2 )
1393             {
1394                 // second character of a surrogate
1395                 *dst++ = cc[1];
1396             }
1397         }
1398     }
1399
1400     return outLen;
1401 }
1402
1403 size_t
1404 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1405                              const wchar_t *src, size_t srcLen) const
1406 {
1407     if ( srcLen == wxNO_LEN )
1408         srcLen = wxWcslen(src) + 1;
1409
1410     if ( !dst )
1411     {
1412         // optimization: return maximal space which could be needed for this
1413         // string instead of the exact amount which could be less if there are
1414         // any surrogates in the input
1415         //
1416         // we consider that surrogates are rare enough to make it worthwhile to
1417         // avoid running the loop below at the cost of slightly extra memory
1418         // consumption
1419         return srcLen*BYTES_PER_CHAR;
1420     }
1421
1422     wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1423     size_t outLen = 0;
1424     for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1425     {
1426         const wxUint32 ch = wxDecodeSurrogate(&src);
1427         if ( !src )
1428             return wxCONV_FAILED;
1429
1430         outLen += BYTES_PER_CHAR;
1431
1432         if ( outLen > dstLen )
1433             return wxCONV_FAILED;
1434
1435         *outBuff++ = wxUINT32_SWAP_ALWAYS(ch);
1436     }
1437
1438     return outLen;
1439 }
1440
1441 #else // !WC_UTF16: wchar_t is UTF-32
1442
1443 // ----------------------------------------------------------------------------
1444 // conversions without endianness change
1445 // ----------------------------------------------------------------------------
1446
1447 size_t
1448 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1449                                const char *src, size_t srcLen) const
1450 {
1451     // use memcpy() as it should be much faster than hand-written loop
1452     srcLen = GetLength(src, srcLen);
1453     if ( srcLen == wxNO_LEN )
1454         return wxCONV_FAILED;
1455
1456     const size_t inLen = srcLen/BYTES_PER_CHAR;
1457     if ( dst )
1458     {
1459         if ( dstLen < inLen )
1460             return wxCONV_FAILED;
1461
1462         memcpy(dst, src, srcLen);
1463     }
1464
1465     return inLen;
1466 }
1467
1468 size_t
1469 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1470                                  const wchar_t *src, size_t srcLen) const
1471 {
1472     if ( srcLen == wxNO_LEN )
1473         srcLen = wxWcslen(src) + 1;
1474
1475     srcLen *= BYTES_PER_CHAR;
1476
1477     if ( dst )
1478     {
1479         if ( dstLen < srcLen )
1480             return wxCONV_FAILED;
1481
1482         memcpy(dst, src, srcLen);
1483     }
1484
1485     return srcLen;
1486 }
1487
1488 // ----------------------------------------------------------------------------
1489 // endian-reversing conversions
1490 // ----------------------------------------------------------------------------
1491
1492 size_t
1493 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1494                            const char *src, size_t srcLen) const
1495 {
1496     srcLen = GetLength(src, srcLen);
1497     if ( srcLen == wxNO_LEN )
1498         return wxCONV_FAILED;
1499
1500     srcLen /= BYTES_PER_CHAR;
1501
1502     if ( dst )
1503     {
1504         if ( dstLen < srcLen )
1505             return wxCONV_FAILED;
1506
1507         const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1508         for ( size_t n = 0; n < srcLen; n++, inBuff++ )
1509         {
1510             *dst++ = wxUINT32_SWAP_ALWAYS(*inBuff);
1511         }
1512     }
1513
1514     return srcLen;
1515 }
1516
1517 size_t
1518 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1519                              const wchar_t *src, size_t srcLen) const
1520 {
1521     if ( srcLen == wxNO_LEN )
1522         srcLen = wxWcslen(src) + 1;
1523
1524     srcLen *= BYTES_PER_CHAR;
1525
1526     if ( dst )
1527     {
1528         if ( dstLen < srcLen )
1529             return wxCONV_FAILED;
1530
1531         wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1532         for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
1533         {
1534             *outBuff++ = wxUINT32_SWAP_ALWAYS(*src);
1535         }
1536     }
1537
1538     return srcLen;
1539 }
1540
1541 #endif // WC_UTF16/!WC_UTF16
1542
1543
1544 // ============================================================================
1545 // The classes doing conversion using the iconv_xxx() functions
1546 // ============================================================================
1547
1548 #ifdef HAVE_ICONV
1549
1550 // VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
1551 //     E2BIG if output buffer is _exactly_ as big as needed. Such case is
1552 //     (unless there's yet another bug in glibc) the only case when iconv()
1553 //     returns with (size_t)-1 (which means error) and says there are 0 bytes
1554 //     left in the input buffer -- when _real_ error occurs,
1555 //     bytes-left-in-input buffer is non-zero. Hence, this alternative test for
1556 //     iconv() failure.
1557 //     [This bug does not appear in glibc 2.2.]
// ICONV_FAILED(cres, bufLeft): true if an iconv() call which returned cres
// and left bufLeft bytes of input unconverted should be treated as failed.
// NB: the arguments are not parenthesized in the expansion, so only pass
//     simple expressions to this macro.
#if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
#define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
                                     (errno != E2BIG || bufLeft != 0))
#else
#define ICONV_FAILED(cres, bufLeft)  (cres == (size_t)-1)
#endif

// casts the address of an input pointer to the type used for the input
// buffer argument of iconv() in this file (ICONV_CONST is not defined in
// this file -- presumably it comes from the build configuration)
#define ICONV_CHAR_CAST(x)  ((ICONV_CONST char **)(x))

// the value iconv_open() results are compared against to detect failure
#define ICONV_T_INVALID ((iconv_t)-1)

// WC_BSWAP reverses the bytes of one wchar_t and WC_ENC is the font encoding
// constant matching the size of wchar_t
#if SIZEOF_WCHAR_T == 4
    #define WC_BSWAP    wxUINT32_SWAP_ALWAYS
    #define WC_ENC      wxFONTENCODING_UTF32
#elif SIZEOF_WCHAR_T == 2
    #define WC_BSWAP    wxUINT16_SWAP_ALWAYS
    #define WC_ENC      wxFONTENCODING_UTF16
#else // sizeof(wchar_t) != 2 nor 4
    // does this ever happen?
    #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
#endif
1579
1580 // ----------------------------------------------------------------------------
1581 // wxMBConv_iconv: encapsulates an iconv character set
1582 // ----------------------------------------------------------------------------
1583
// Conversion between the multibyte charset with the given iconv name and
// wchar_t, implemented using a pair of iconv conversion descriptors, one for
// each direction.
class wxMBConv_iconv : public wxMBConv
{
public:
    // tries to open the conversion descriptors for the charset "name", use
    // IsOk() to check if it succeeded
    wxMBConv_iconv(const wxChar *name);

    // closes the descriptors which had been successfully opened
    virtual ~wxMBConv_iconv();

    // convert between NUL-terminated multibyte and wide strings: n is the
    // size of the output buffer buf, which may be NULL to only compute the
    // length of the result; both return wxCONV_FAILED on error
    virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const;
    virtual size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const;

    // classify this encoding as explained in wxMBConv::GetMBNulLen() comment
    virtual size_t GetMBNulLen() const;

    // creates a new converter for the same charset, reusing the already
    // computed NUL length (if any) of this one
    virtual wxMBConv *Clone() const
    {
        wxMBConv_iconv *p = new wxMBConv_iconv(m_name);
        p->m_minMBCharWidth = m_minMBCharWidth;
        return p;
    }

    // true only if the descriptors for both directions could be opened
    bool IsOk() const
        { return (m2w != ICONV_T_INVALID) && (w2m != ICONV_T_INVALID); }

protected:
    // the iconv handlers used to translate from multibyte
    // to wide char and in the other direction
    iconv_t m2w,
            w2m;

#if wxUSE_THREADS
    // guards access to m2w and w2m objects
    wxMutex m_iconvMutex;
#endif

private:
    // the name (for iconv_open()) of a wide char charset -- if none is
    // available on this machine, it will remain empty
    static wxString ms_wcCharsetName;

    // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
    // different endian-ness than the native one
    static bool ms_wcNeedsSwap;


    // name of the encoding handled by this conversion
    wxString m_name;

    // cached result of GetMBNulLen(); set to 0 meaning "unknown"
    // initially
    size_t m_minMBCharWidth;
};
1634
1635 // make the constructor available for unit testing
1636 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_iconv( const wxChar* name )
1637 {
1638     wxMBConv_iconv* result = new wxMBConv_iconv( name );
1639     if ( !result->IsOk() )
1640     {
1641         delete result;
1642         return 0;
1643     }
1644
1645     return result;
1646 }
1647
// definitions of the static members: the name stays empty until a
// wxMBConv_iconv constructor finds a working wide char charset
wxString wxMBConv_iconv::ms_wcCharsetName;
bool wxMBConv_iconv::ms_wcNeedsSwap = false;
1650
1651 wxMBConv_iconv::wxMBConv_iconv(const wxChar *name)
1652               : m_name(name)
1653 {
1654     m_minMBCharWidth = 0;
1655
1656     // iconv operates with chars, not wxChars, but luckily it uses only ASCII
1657     // names for the charsets
1658     const wxCharBuffer cname(wxString(name).ToAscii());
1659
1660     // check for charset that represents wchar_t:
1661     if ( ms_wcCharsetName.empty() )
1662     {
1663         wxLogTrace(TRACE_STRCONV, _T("Looking for wide char codeset:"));
1664
1665 #if wxUSE_FONTMAP
1666         const wxChar **names = wxFontMapperBase::GetAllEncodingNames(WC_ENC);
1667 #else // !wxUSE_FONTMAP
1668         static const wxChar *names_static[] =
1669         {
1670 #if SIZEOF_WCHAR_T == 4
1671             _T("UCS-4"),
1672 #elif SIZEOF_WCHAR_T = 2
1673             _T("UCS-2"),
1674 #endif
1675             NULL
1676         };
1677         const wxChar **names = names_static;
1678 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
1679
1680         for ( ; *names && ms_wcCharsetName.empty(); ++names )
1681         {
1682             const wxString nameCS(*names);
1683
1684             // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
1685             wxString nameXE(nameCS);
1686
1687 #ifdef WORDS_BIGENDIAN
1688                 nameXE += _T("BE");
1689 #else // little endian
1690                 nameXE += _T("LE");
1691 #endif
1692
1693             wxLogTrace(TRACE_STRCONV, _T("  trying charset \"%s\""),
1694                        nameXE.c_str());
1695
1696             m2w = iconv_open(nameXE.ToAscii(), cname);
1697             if ( m2w == ICONV_T_INVALID )
1698             {
1699                 // try charset w/o bytesex info (e.g. "UCS4")
1700                 wxLogTrace(TRACE_STRCONV, _T("  trying charset \"%s\""),
1701                            nameCS.c_str());
1702                 m2w = iconv_open(nameCS.ToAscii(), cname);
1703
1704                 // and check for bytesex ourselves:
1705                 if ( m2w != ICONV_T_INVALID )
1706                 {
1707                     char    buf[2], *bufPtr;
1708                     wchar_t wbuf[2], *wbufPtr;
1709                     size_t  insz, outsz;
1710                     size_t  res;
1711
1712                     buf[0] = 'A';
1713                     buf[1] = 0;
1714                     wbuf[0] = 0;
1715                     insz = 2;
1716                     outsz = SIZEOF_WCHAR_T * 2;
1717                     wbufPtr = wbuf;
1718                     bufPtr = buf;
1719
1720                     res = iconv(
1721                         m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
1722                         (char**)&wbufPtr, &outsz);
1723
1724                     if (ICONV_FAILED(res, insz))
1725                     {
1726                         wxLogLastError(wxT("iconv"));
1727                         wxLogError(_("Conversion to charset '%s' doesn't work."),
1728                                    nameCS.c_str());
1729                     }
1730                     else // ok, can convert to this encoding, remember it
1731                     {
1732                         ms_wcCharsetName = nameCS;
1733                         ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
1734                     }
1735                 }
1736             }
1737             else // use charset not requiring byte swapping
1738             {
1739                 ms_wcCharsetName = nameXE;
1740             }
1741         }
1742
1743         wxLogTrace(TRACE_STRCONV,
1744                    wxT("iconv wchar_t charset is \"%s\"%s"),
1745                    ms_wcCharsetName.empty() ? _T("<none>")
1746                                             : ms_wcCharsetName.c_str(),
1747                    ms_wcNeedsSwap ? _T(" (needs swap)")
1748                                   : _T(""));
1749     }
1750     else // we already have ms_wcCharsetName
1751     {
1752         m2w = iconv_open(ms_wcCharsetName.ToAscii(), cname);
1753     }
1754
1755     if ( ms_wcCharsetName.empty() )
1756     {
1757         w2m = ICONV_T_INVALID;
1758     }
1759     else
1760     {
1761         w2m = iconv_open(cname, ms_wcCharsetName.ToAscii());
1762         if ( w2m == ICONV_T_INVALID )
1763         {
1764             wxLogTrace(TRACE_STRCONV,
1765                        wxT("\"%s\" -> \"%s\" works but not the converse!?"),
1766                        ms_wcCharsetName.c_str(), cname.data());
1767         }
1768     }
1769 }
1770
1771 wxMBConv_iconv::~wxMBConv_iconv()
1772 {
1773     if ( m2w != ICONV_T_INVALID )
1774         iconv_close(m2w);
1775     if ( w2m != ICONV_T_INVALID )
1776         iconv_close(w2m);
1777 }
1778
1779 size_t wxMBConv_iconv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1780 {
1781     // find the string length: notice that must be done differently for
1782     // NUL-terminated strings and UTF-16/32 which are terminated with 2/4 NULs
1783     size_t inbuf;
1784     const size_t nulLen = GetMBNulLen();
1785     switch ( nulLen )
1786     {
1787         default:
1788             return wxCONV_FAILED;
1789
1790         case 1:
1791             inbuf = strlen(psz); // arguably more optimized than our version
1792             break;
1793
1794         case 2:
1795         case 4:
1796             // for UTF-16/32 not only we need to have 2/4 consecutive NULs but
1797             // they also have to start at character boundary and not span two
1798             // adjacent characters
1799             const char *p;
1800             for ( p = psz; NotAllNULs(p, nulLen); p += nulLen )
1801                 ;
1802             inbuf = p - psz;
1803             break;
1804     }
1805
1806 #if wxUSE_THREADS
1807     // NB: iconv() is MT-safe, but each thread must use its own iconv_t handle.
1808     //     Unfortunately there are a couple of global wxCSConv objects such as
1809     //     wxConvLocal that are used all over wx code, so we have to make sure
1810     //     the handle is used by at most one thread at the time. Otherwise
1811     //     only a few wx classes would be safe to use from non-main threads
1812     //     as MB<->WC conversion would fail "randomly".
1813     wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1814 #endif // wxUSE_THREADS
1815
1816     size_t outbuf = n * SIZEOF_WCHAR_T;
1817     size_t res, cres;
1818     // VS: Use these instead of psz, buf because iconv() modifies its arguments:
1819     wchar_t *bufPtr = buf;
1820     const char *pszPtr = psz;
1821
1822     if (buf)
1823     {
1824         // have destination buffer, convert there
1825         cres = iconv(m2w,
1826                      ICONV_CHAR_CAST(&pszPtr), &inbuf,
1827                      (char**)&bufPtr, &outbuf);
1828         res = n - (outbuf / SIZEOF_WCHAR_T);
1829
1830         if (ms_wcNeedsSwap)
1831         {
1832             // convert to native endianness
1833             for ( unsigned i = 0; i < res; i++ )
1834                 buf[n] = WC_BSWAP(buf[i]);
1835         }
1836
1837         // NUL-terminate the string if there is any space left
1838         if (res < n)
1839             buf[res] = 0;
1840     }
1841     else
1842     {
1843         // no destination buffer... convert using temp buffer
1844         // to calculate destination buffer requirement
1845         wchar_t tbuf[8];
1846         res = 0;
1847
1848         do
1849         {
1850             bufPtr = tbuf;
1851             outbuf = 8 * SIZEOF_WCHAR_T;
1852
1853             cres = iconv(m2w,
1854                          ICONV_CHAR_CAST(&pszPtr), &inbuf,
1855                          (char**)&bufPtr, &outbuf );
1856
1857             res += 8 - (outbuf / SIZEOF_WCHAR_T);
1858         }
1859         while ((cres == (size_t)-1) && (errno == E2BIG));
1860     }
1861
1862     if (ICONV_FAILED(cres, inbuf))
1863     {
1864         //VS: it is ok if iconv fails, hence trace only
1865         wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1866         return wxCONV_FAILED;
1867     }
1868
1869     return res;
1870 }
1871
1872 size_t wxMBConv_iconv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1873 {
1874 #if wxUSE_THREADS
1875     // NB: explained in MB2WC
1876     wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1877 #endif
1878
1879     size_t inlen = wxWcslen(psz);
1880     size_t inbuf = inlen * SIZEOF_WCHAR_T;
1881     size_t outbuf = n;
1882     size_t res, cres;
1883
1884     wchar_t *tmpbuf = 0;
1885
1886     if (ms_wcNeedsSwap)
1887     {
1888         // need to copy to temp buffer to switch endianness
1889         // (doing WC_BSWAP twice on the original buffer won't help, as it
1890         //  could be in read-only memory, or be accessed in some other thread)
1891         tmpbuf = (wchar_t *)malloc(inbuf + SIZEOF_WCHAR_T);
1892         for ( size_t i = 0; i < inlen; i++ )
1893             tmpbuf[n] = WC_BSWAP(psz[i]);
1894
1895         tmpbuf[inlen] = L'\0';
1896         psz = tmpbuf;
1897     }
1898
1899     if (buf)
1900     {
1901         // have destination buffer, convert there
1902         cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1903
1904         res = n - outbuf;
1905
1906         // NB: iconv was given only wcslen(psz) characters on input, and so
1907         //     it couldn't convert the trailing zero. Let's do it ourselves
1908         //     if there's some room left for it in the output buffer.
1909         if (res < n)
1910             buf[0] = 0;
1911     }
1912     else
1913     {
1914         // no destination buffer: convert using temp buffer
1915         // to calculate destination buffer requirement
1916         char tbuf[16];
1917         res = 0;
1918         do
1919         {
1920             buf = tbuf;
1921             outbuf = 16;
1922
1923             cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1924
1925             res += 16 - outbuf;
1926         }
1927         while ((cres == (size_t)-1) && (errno == E2BIG));
1928     }
1929
1930     if (ms_wcNeedsSwap)
1931     {
1932         free(tmpbuf);
1933     }
1934
1935     if (ICONV_FAILED(cres, inbuf))
1936     {
1937         wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1938         return wxCONV_FAILED;
1939     }
1940
1941     return res;
1942 }
1943
1944 size_t wxMBConv_iconv::GetMBNulLen() const
1945 {
1946     if ( m_minMBCharWidth == 0 )
1947     {
1948         wxMBConv_iconv * const self = wxConstCast(this, wxMBConv_iconv);
1949
1950 #if wxUSE_THREADS
1951         // NB: explained in MB2WC
1952         wxMutexLocker lock(self->m_iconvMutex);
1953 #endif
1954
1955         wchar_t *wnul = L"";
1956         char buf[8]; // should be enough for NUL in any encoding
1957         size_t inLen = sizeof(wchar_t),
1958                outLen = WXSIZEOF(buf);
1959         char *inBuff = (char *)wnul;
1960         char *outBuff = buf;
1961         if ( iconv(w2m, ICONV_CHAR_CAST(&inBuff), &inLen, &outBuff, &outLen) == (size_t)-1 )
1962         {
1963             self->m_minMBCharWidth = (size_t)-1;
1964         }
1965         else // ok
1966         {
1967             self->m_minMBCharWidth = outBuff - buf;
1968         }
1969     }
1970
1971     return m_minMBCharWidth;
1972 }
1973
1974 #endif // HAVE_ICONV
1975
1976
1977 // ============================================================================
1978 // Win32 conversion classes
1979 // ============================================================================
1980
1981 #ifdef wxHAVE_WIN32_MB2WC
1982
1983 // from utils.cpp
1984 #if wxUSE_FONTMAP
1985 extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const wxChar *charset);
1986 extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
1987 #endif
1988
1989 class wxMBConv_win32 : public wxMBConv
1990 {
1991 public:
1992     wxMBConv_win32()
1993     {
1994         m_CodePage = CP_ACP;
1995         m_minMBCharWidth = 0;
1996     }
1997
1998     wxMBConv_win32(const wxMBConv_win32& conv)
1999         : wxMBConv()
2000     {
2001         m_CodePage = conv.m_CodePage;
2002         m_minMBCharWidth = conv.m_minMBCharWidth;
2003     }
2004
2005 #if wxUSE_FONTMAP
2006     wxMBConv_win32(const wxChar* name)
2007     {
2008         m_CodePage = wxCharsetToCodepage(name);
2009         m_minMBCharWidth = 0;
2010     }
2011
2012     wxMBConv_win32(wxFontEncoding encoding)
2013     {
2014         m_CodePage = wxEncodingToCodepage(encoding);
2015         m_minMBCharWidth = 0;
2016     }
2017 #endif // wxUSE_FONTMAP
2018
2019     virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2020     {
2021         // note that we have to use MB_ERR_INVALID_CHARS flag as it without it
2022         // the behaviour is not compatible with the Unix version (using iconv)
2023         // and break the library itself, e.g. wxTextInputStream::NextChar()
2024         // wouldn't work if reading an incomplete MB char didn't result in an
2025         // error
2026         //
2027         // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
2028         // Win XP or newer and it is not supported for UTF-[78] so we always
2029         // use our own conversions in this case. See
2030         //     http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
2031         //     http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
2032         if ( m_CodePage == CP_UTF8 )
2033         {
2034             return wxConvUTF8.MB2WC(buf, psz, n);
2035         }
2036
2037         if ( m_CodePage == CP_UTF7 )
2038         {
2039             return wxConvUTF7.MB2WC(buf, psz, n);
2040         }
2041
2042         int flags = 0;
2043         if ( (m_CodePage < 50000 && m_CodePage != CP_SYMBOL) &&
2044                 IsAtLeastWin2kSP4() )
2045         {
2046             flags = MB_ERR_INVALID_CHARS;
2047         }
2048
2049         const size_t len = ::MultiByteToWideChar
2050                              (
2051                                 m_CodePage,     // code page
2052                                 flags,          // flags: fall on error
2053                                 psz,            // input string
2054                                 -1,             // its length (NUL-terminated)
2055                                 buf,            // output string
2056                                 buf ? n : 0     // size of output buffer
2057                              );
2058         if ( !len )
2059         {
2060             // function totally failed
2061             return wxCONV_FAILED;
2062         }
2063
2064         // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
2065         // check if we succeeded, by doing a double trip:
2066         if ( !flags && buf )
2067         {
2068             const size_t mbLen = strlen(psz);
2069             wxCharBuffer mbBuf(mbLen);
2070             if ( ::WideCharToMultiByte
2071                    (
2072                       m_CodePage,
2073                       0,
2074                       buf,
2075                       -1,
2076                       mbBuf.data(),
2077                       mbLen + 1,        // size in bytes, not length
2078                       NULL,
2079                       NULL
2080                    ) == 0 ||
2081                   strcmp(mbBuf, psz) != 0 )
2082             {
2083                 // we didn't obtain the same thing we started from, hence
2084                 // the conversion was lossy and we consider that it failed
2085                 return wxCONV_FAILED;
2086             }
2087         }
2088
2089         // note that it returns count of written chars for buf != NULL and size
2090         // of the needed buffer for buf == NULL so in either case the length of
2091         // the string (which never includes the terminating NUL) is one less
2092         return len - 1;
2093     }
2094
2095     virtual size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
2096     {
2097         /*
2098             we have a problem here: by default, WideCharToMultiByte() may
2099             replace characters unrepresentable in the target code page with bad
2100             quality approximations such as turning "1/2" symbol (U+00BD) into
2101             "1" for the code pages which don't have it and we, obviously, want
2102             to avoid this at any price
2103
2104             the trouble is that this function does it _silently_, i.e. it won't
2105             even tell us whether it did or not... Win98/2000 and higher provide
2106             WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
2107             we have to resort to a round trip, i.e. check that converting back
2108             results in the same string -- this is, of course, expensive but
2109             otherwise we simply can't be sure to not garble the data.
2110          */
2111
2112         // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
2113         // it doesn't work with CJK encodings (which we test for rather roughly
2114         // here...) nor with UTF-7/8 nor, of course, with Windows versions not
2115         // supporting it
2116         BOOL usedDef wxDUMMY_INITIALIZE(false);
2117         BOOL *pUsedDef;
2118         int flags;
2119         if ( CanUseNoBestFit() && m_CodePage < 50000 )
2120         {
2121             // it's our lucky day
2122             flags = WC_NO_BEST_FIT_CHARS;
2123             pUsedDef = &usedDef;
2124         }
2125         else // old system or unsupported encoding
2126         {
2127             flags = 0;
2128             pUsedDef = NULL;
2129         }
2130
2131         const size_t len = ::WideCharToMultiByte
2132                              (
2133                                 m_CodePage,     // code page
2134                                 flags,          // either none or no best fit
2135                                 pwz,            // input string
2136                                 -1,             // it is (wide) NUL-terminated
2137                                 buf,            // output buffer
2138                                 buf ? n : 0,    // and its size
2139                                 NULL,           // default "replacement" char
2140                                 pUsedDef        // [out] was it used?
2141                              );
2142
2143         if ( !len )
2144         {
2145             // function totally failed
2146             return wxCONV_FAILED;
2147         }
2148
2149         // if we were really converting, check if we succeeded
2150         if ( buf )
2151         {
2152             if ( flags )
2153             {
2154                 // check if the conversion failed, i.e. if any replacements
2155                 // were done
2156                 if ( usedDef )
2157                     return wxCONV_FAILED;
2158             }
2159             else // we must resort to double tripping...
2160             {
2161                 wxWCharBuffer wcBuf(n);
2162                 if ( MB2WC(wcBuf.data(), buf, n) == wxCONV_FAILED ||
2163                         wcscmp(wcBuf, pwz) != 0 )
2164                 {
2165                     // we didn't obtain the same thing we started from, hence
2166                     // the conversion was lossy and we consider that it failed
2167                     return wxCONV_FAILED;
2168                 }
2169             }
2170         }
2171
2172         // see the comment above for the reason of "len - 1"
2173         return len - 1;
2174     }
2175
2176     virtual size_t GetMBNulLen() const
2177     {
2178         if ( m_minMBCharWidth == 0 )
2179         {
2180             int len = ::WideCharToMultiByte
2181                         (
2182                             m_CodePage,     // code page
2183                             0,              // no flags
2184                             L"",            // input string
2185                             1,              // translate just the NUL
2186                             NULL,           // output buffer
2187                             0,              // and its size
2188                             NULL,           // no replacement char
2189                             NULL            // [out] don't care if it was used
2190                         );
2191
2192             wxMBConv_win32 * const self = wxConstCast(this, wxMBConv_win32);
2193             switch ( len )
2194             {
2195                 default:
2196                     wxLogDebug(_T("Unexpected NUL length %d"), len);
2197                     self->m_minMBCharWidth = (size_t)-1;
2198                     break;
2199
2200                 case 0:
2201                     self->m_minMBCharWidth = (size_t)-1;
2202                     break;
2203
2204                 case 1:
2205                 case 2:
2206                 case 4:
2207                     self->m_minMBCharWidth = len;
2208                     break;
2209             }
2210         }
2211
2212         return m_minMBCharWidth;
2213     }
2214
2215     virtual wxMBConv *Clone() const { return new wxMBConv_win32(*this); }
2216
2217     bool IsOk() const { return m_CodePage != -1; }
2218
2219 private:
2220     static bool CanUseNoBestFit()
2221     {
2222         static int s_isWin98Or2k = -1;
2223
2224         if ( s_isWin98Or2k == -1 )
2225         {
2226             int verMaj, verMin;
2227             switch ( wxGetOsVersion(&verMaj, &verMin) )
2228             {
2229                 case wxOS_WINDOWS_9X:
2230                     s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
2231                     break;
2232
2233                 case wxOS_WINDOWS_NT:
2234                     s_isWin98Or2k = verMaj >= 5;
2235                     break;
2236
2237                 default:
2238                     // unknown: be conservative by default
2239                     s_isWin98Or2k = 0;
2240                     break;
2241             }
2242
2243             wxASSERT_MSG( s_isWin98Or2k != -1, _T("should be set above") );
2244         }
2245
2246         return s_isWin98Or2k == 1;
2247     }
2248
2249     static bool IsAtLeastWin2kSP4()
2250     {
2251 #ifdef __WXWINCE__
2252         return false;
2253 #else
2254         static int s_isAtLeastWin2kSP4 = -1;
2255
2256         if ( s_isAtLeastWin2kSP4 == -1 )
2257         {
2258             OSVERSIONINFOEX ver;
2259
2260             memset(&ver, 0, sizeof(ver));
2261             ver.dwOSVersionInfoSize = sizeof(ver);
2262             GetVersionEx((OSVERSIONINFO*)&ver);
2263
2264             s_isAtLeastWin2kSP4 =
2265               ((ver.dwMajorVersion > 5) || // Vista+
2266                (ver.dwMajorVersion == 5 && ver.dwMinorVersion > 0) || // XP/2003
2267                (ver.dwMajorVersion == 5 && ver.dwMinorVersion == 0 &&
2268                ver.wServicePackMajor >= 4)) // 2000 SP4+
2269               ? 1 : 0;
2270         }
2271
2272         return s_isAtLeastWin2kSP4 == 1;
2273 #endif
2274     }
2275
2276
2277     // the code page we're working with
2278     long m_CodePage;
2279
2280     // cached result of GetMBNulLen(), set to 0 initially meaning
2281     // "unknown"
2282     size_t m_minMBCharWidth;
2283 };
2284
2285 #endif // wxHAVE_WIN32_MB2WC
2286
2287 // ============================================================================
2288 // Cocoa conversion classes
2289 // ============================================================================
2290
2291 #if defined(__WXCOCOA__)
2292
// RN: There is no UTF-32 support in either Core Foundation or Cocoa.
// Strangely enough, Core Foundation uses UTF-32 internally quite a
// bit - it's just not public (yet).
2296
2297 #include <CoreFoundation/CFString.h>
2298 #include <CoreFoundation/CFStringEncodingExt.h>
2299
2300 CFStringEncoding wxCFStringEncFromFontEnc(wxFontEncoding encoding)
2301 {
2302     CFStringEncoding enc = kCFStringEncodingInvalidId ;
2303
2304     switch (encoding)
2305     {
2306         case wxFONTENCODING_DEFAULT :
2307             enc = CFStringGetSystemEncoding();
2308             break ;
2309
2310         case wxFONTENCODING_ISO8859_1 :
2311             enc = kCFStringEncodingISOLatin1 ;
2312             break ;
2313         case wxFONTENCODING_ISO8859_2 :
2314             enc = kCFStringEncodingISOLatin2;
2315             break ;
2316         case wxFONTENCODING_ISO8859_3 :
2317             enc = kCFStringEncodingISOLatin3 ;
2318             break ;
2319         case wxFONTENCODING_ISO8859_4 :
2320             enc = kCFStringEncodingISOLatin4;
2321             break ;
2322         case wxFONTENCODING_ISO8859_5 :
2323             enc = kCFStringEncodingISOLatinCyrillic;
2324             break ;
2325         case wxFONTENCODING_ISO8859_6 :
2326             enc = kCFStringEncodingISOLatinArabic;
2327             break ;
2328         case wxFONTENCODING_ISO8859_7 :
2329             enc = kCFStringEncodingISOLatinGreek;
2330             break ;
2331         case wxFONTENCODING_ISO8859_8 :
2332             enc = kCFStringEncodingISOLatinHebrew;
2333             break ;
2334         case wxFONTENCODING_ISO8859_9 :
2335             enc = kCFStringEncodingISOLatin5;
2336             break ;
2337         case wxFONTENCODING_ISO8859_10 :
2338             enc = kCFStringEncodingISOLatin6;
2339             break ;
2340         case wxFONTENCODING_ISO8859_11 :
2341             enc = kCFStringEncodingISOLatinThai;
2342             break ;
2343         case wxFONTENCODING_ISO8859_13 :
2344             enc = kCFStringEncodingISOLatin7;
2345             break ;
2346         case wxFONTENCODING_ISO8859_14 :
2347             enc = kCFStringEncodingISOLatin8;
2348             break ;
2349         case wxFONTENCODING_ISO8859_15 :
2350             enc = kCFStringEncodingISOLatin9;
2351             break ;
2352
2353         case wxFONTENCODING_KOI8 :
2354             enc = kCFStringEncodingKOI8_R;
2355             break ;
2356         case wxFONTENCODING_ALTERNATIVE : // MS-DOS CP866
2357             enc = kCFStringEncodingDOSRussian;
2358             break ;
2359
2360 //      case wxFONTENCODING_BULGARIAN :
2361 //          enc = ;
2362 //          break ;
2363
2364         case wxFONTENCODING_CP437 :
2365             enc = kCFStringEncodingDOSLatinUS ;
2366             break ;
2367         case wxFONTENCODING_CP850 :
2368             enc = kCFStringEncodingDOSLatin1;
2369             break ;
2370         case wxFONTENCODING_CP852 :
2371             enc = kCFStringEncodingDOSLatin2;
2372             break ;
2373         case wxFONTENCODING_CP855 :
2374             enc = kCFStringEncodingDOSCyrillic;
2375             break ;
2376         case wxFONTENCODING_CP866 :
2377             enc = kCFStringEncodingDOSRussian ;
2378             break ;
2379         case wxFONTENCODING_CP874 :
2380             enc = kCFStringEncodingDOSThai;
2381             break ;
2382         case wxFONTENCODING_CP932 :
2383             enc = kCFStringEncodingDOSJapanese;
2384             break ;
2385         case wxFONTENCODING_CP936 :
2386             enc = kCFStringEncodingDOSChineseSimplif ;
2387             break ;
2388         case wxFONTENCODING_CP949 :
2389             enc = kCFStringEncodingDOSKorean;
2390             break ;
2391         case wxFONTENCODING_CP950 :
2392             enc = kCFStringEncodingDOSChineseTrad;
2393             break ;
2394         case wxFONTENCODING_CP1250 :
2395             enc = kCFStringEncodingWindowsLatin2;
2396             break ;
2397         case wxFONTENCODING_CP1251 :
2398             enc = kCFStringEncodingWindowsCyrillic ;
2399             break ;
2400         case wxFONTENCODING_CP1252 :
2401             enc = kCFStringEncodingWindowsLatin1 ;
2402             break ;
2403         case wxFONTENCODING_CP1253 :
2404             enc = kCFStringEncodingWindowsGreek;
2405             break ;
2406         case wxFONTENCODING_CP1254 :
2407             enc = kCFStringEncodingWindowsLatin5;
2408             break ;
2409         case wxFONTENCODING_CP1255 :
2410             enc = kCFStringEncodingWindowsHebrew ;
2411             break ;
2412         case wxFONTENCODING_CP1256 :
2413             enc = kCFStringEncodingWindowsArabic ;
2414             break ;
2415         case wxFONTENCODING_CP1257 :
2416             enc = kCFStringEncodingWindowsBalticRim;
2417             break ;
2418 //   This only really encodes to UTF7 (if that) evidently
2419 //        case wxFONTENCODING_UTF7 :
2420 //            enc = kCFStringEncodingNonLossyASCII ;
2421 //            break ;
2422         case wxFONTENCODING_UTF8 :
2423             enc = kCFStringEncodingUTF8 ;
2424             break ;
2425         case wxFONTENCODING_EUC_JP :
2426             enc = kCFStringEncodingEUC_JP;
2427             break ;
2428         case wxFONTENCODING_UTF16 :
2429             enc = kCFStringEncodingUnicode ;
2430             break ;
2431         case wxFONTENCODING_MACROMAN :
2432             enc = kCFStringEncodingMacRoman ;
2433             break ;
2434         case wxFONTENCODING_MACJAPANESE :
2435             enc = kCFStringEncodingMacJapanese ;
2436             break ;
2437         case wxFONTENCODING_MACCHINESETRAD :
2438             enc = kCFStringEncodingMacChineseTrad ;
2439             break ;
2440         case wxFONTENCODING_MACKOREAN :
2441             enc = kCFStringEncodingMacKorean ;
2442             break ;
2443         case wxFONTENCODING_MACARABIC :
2444             enc = kCFStringEncodingMacArabic ;
2445             break ;
2446         case wxFONTENCODING_MACHEBREW :
2447             enc = kCFStringEncodingMacHebrew ;
2448             break ;
2449         case wxFONTENCODING_MACGREEK :
2450             enc = kCFStringEncodingMacGreek ;
2451             break ;
2452         case wxFONTENCODING_MACCYRILLIC :
2453             enc = kCFStringEncodingMacCyrillic ;
2454             break ;
2455         case wxFONTENCODING_MACDEVANAGARI :
2456             enc = kCFStringEncodingMacDevanagari ;
2457             break ;
2458         case wxFONTENCODING_MACGURMUKHI :
2459             enc = kCFStringEncodingMacGurmukhi ;
2460             break ;
2461         case wxFONTENCODING_MACGUJARATI :
2462             enc = kCFStringEncodingMacGujarati ;
2463             break ;
2464         case wxFONTENCODING_MACORIYA :
2465             enc = kCFStringEncodingMacOriya ;
2466             break ;
2467         case wxFONTENCODING_MACBENGALI :
2468             enc = kCFStringEncodingMacBengali ;
2469             break ;
2470         case wxFONTENCODING_MACTAMIL :
2471             enc = kCFStringEncodingMacTamil ;
2472             break ;
2473         case wxFONTENCODING_MACTELUGU :
2474             enc = kCFStringEncodingMacTelugu ;
2475             break ;
2476         case wxFONTENCODING_MACKANNADA :
2477             enc = kCFStringEncodingMacKannada ;
2478             break ;
2479         case wxFONTENCODING_MACMALAJALAM :
2480             enc = kCFStringEncodingMacMalayalam ;
2481             break ;
2482         case wxFONTENCODING_MACSINHALESE :
2483             enc = kCFStringEncodingMacSinhalese ;
2484             break ;
2485         case wxFONTENCODING_MACBURMESE :
2486             enc = kCFStringEncodingMacBurmese ;
2487             break ;
2488         case wxFONTENCODING_MACKHMER :
2489             enc = kCFStringEncodingMacKhmer ;
2490             break ;
2491         case wxFONTENCODING_MACTHAI :
2492             enc = kCFStringEncodingMacThai ;
2493             break ;
2494         case wxFONTENCODING_MACLAOTIAN :
2495             enc = kCFStringEncodingMacLaotian ;
2496             break ;
2497         case wxFONTENCODING_MACGEORGIAN :
2498             enc = kCFStringEncodingMacGeorgian ;
2499             break ;
2500         case wxFONTENCODING_MACARMENIAN :
2501             enc = kCFStringEncodingMacArmenian ;
2502             break ;
2503         case wxFONTENCODING_MACCHINESESIMP :
2504             enc = kCFStringEncodingMacChineseSimp ;
2505             break ;
2506         case wxFONTENCODING_MACTIBETAN :
2507             enc = kCFStringEncodingMacTibetan ;
2508             break ;
2509         case wxFONTENCODING_MACMONGOLIAN :
2510             enc = kCFStringEncodingMacMongolian ;
2511             break ;
2512         case wxFONTENCODING_MACETHIOPIC :
2513             enc = kCFStringEncodingMacEthiopic ;
2514             break ;
2515         case wxFONTENCODING_MACCENTRALEUR :
2516             enc = kCFStringEncodingMacCentralEurRoman ;
2517             break ;
2518         case wxFONTENCODING_MACVIATNAMESE :
2519             enc = kCFStringEncodingMacVietnamese ;
2520             break ;
2521         case wxFONTENCODING_MACARABICEXT :
2522             enc = kCFStringEncodingMacExtArabic ;
2523             break ;
2524         case wxFONTENCODING_MACSYMBOL :
2525             enc = kCFStringEncodingMacSymbol ;
2526             break ;
2527         case wxFONTENCODING_MACDINGBATS :
2528             enc = kCFStringEncodingMacDingbats ;
2529             break ;
2530         case wxFONTENCODING_MACTURKISH :
2531             enc = kCFStringEncodingMacTurkish ;
2532             break ;
2533         case wxFONTENCODING_MACCROATIAN :
2534             enc = kCFStringEncodingMacCroatian ;
2535             break ;
2536         case wxFONTENCODING_MACICELANDIC :
2537             enc = kCFStringEncodingMacIcelandic ;
2538             break ;
2539         case wxFONTENCODING_MACROMANIAN :
2540             enc = kCFStringEncodingMacRomanian ;
2541             break ;
2542         case wxFONTENCODING_MACCELTIC :
2543             enc = kCFStringEncodingMacCeltic ;
2544             break ;
2545         case wxFONTENCODING_MACGAELIC :
2546             enc = kCFStringEncodingMacGaelic ;
2547             break ;
2548 //      case wxFONTENCODING_MACKEYBOARD :
2549 //          enc = kCFStringEncodingMacKeyboardGlyphs ;
2550 //          break ;
2551
2552         default :
2553             // because gcc is picky
2554             break ;
2555     }
2556
2557     return enc ;
2558 }
2559
2560 class wxMBConv_cocoa : public wxMBConv
2561 {
2562 public:
2563     wxMBConv_cocoa()
2564     {
2565         Init(CFStringGetSystemEncoding()) ;
2566     }
2567
2568     wxMBConv_cocoa(const wxMBConv_cocoa& conv)
2569     {
2570         m_encoding = conv.m_encoding;
2571     }
2572
2573 #if wxUSE_FONTMAP
2574     wxMBConv_cocoa(const wxChar* name)
2575     {
2576         Init( wxCFStringEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) ) ;
2577     }
2578 #endif
2579
2580     wxMBConv_cocoa(wxFontEncoding encoding)
2581     {
2582         Init( wxCFStringEncFromFontEnc(encoding) );
2583     }
2584
2585     virtual ~wxMBConv_cocoa()
2586     {
2587     }
2588
2589     void Init( CFStringEncoding encoding)
2590     {
2591         m_encoding = encoding ;
2592     }
2593
2594     size_t MB2WC(wchar_t * szOut, const char * szUnConv, size_t nOutSize) const
2595     {
2596         wxASSERT(szUnConv);
2597
2598         CFStringRef theString = CFStringCreateWithBytes (
2599                                                 NULL, //the allocator
2600                                                 (const UInt8*)szUnConv,
2601                                                 strlen(szUnConv),
2602                                                 m_encoding,
2603                                                 false //no BOM/external representation
2604                                                 );
2605
2606         wxASSERT(theString);
2607
2608         size_t nOutLength = CFStringGetLength(theString);
2609
2610         if (szOut == NULL)
2611         {
2612             CFRelease(theString);
2613             return nOutLength;
2614         }
2615
2616         CFRange theRange = { 0, nOutSize };
2617
2618 #if SIZEOF_WCHAR_T == 4
2619         UniChar* szUniCharBuffer = new UniChar[nOutSize];
2620 #endif
2621
2622         CFStringGetCharacters(theString, theRange, szUniCharBuffer);
2623
2624         CFRelease(theString);
2625
2626         szUniCharBuffer[nOutLength] = '\0';
2627
2628 #if SIZEOF_WCHAR_T == 4
2629         wxMBConvUTF16 converter;
2630         converter.MB2WC( szOut, (const char*)szUniCharBuffer, nOutSize );
2631         delete [] szUniCharBuffer;
2632 #endif
2633
2634         return nOutLength;
2635     }
2636
2637     size_t WC2MB(char *szOut, const wchar_t *szUnConv, size_t nOutSize) const
2638     {
2639         wxASSERT(szUnConv);
2640
2641         size_t nRealOutSize;
2642         size_t nBufSize = wxWcslen(szUnConv);
2643         UniChar* szUniBuffer = (UniChar*) szUnConv;
2644
2645 #if SIZEOF_WCHAR_T == 4
2646         wxMBConvUTF16 converter ;
2647         nBufSize = converter.WC2MB( NULL, szUnConv, 0 );
2648         szUniBuffer = new UniChar[ (nBufSize / sizeof(UniChar)) + 1];
2649         converter.WC2MB( (char*) szUniBuffer, szUnConv, nBufSize + sizeof(UniChar));
2650         nBufSize /= sizeof(UniChar);
2651 #endif
2652
2653         CFStringRef theString = CFStringCreateWithCharactersNoCopy(
2654                                 NULL, //allocator
2655                                 szUniBuffer,
2656                                 nBufSize,
2657                                 kCFAllocatorNull //deallocator - we want to deallocate it ourselves
2658                             );
2659
2660         wxASSERT(theString);
2661
2662         //Note that CER puts a BOM when converting to unicode
2663         //so we  check and use getchars instead in that case
2664         if (m_encoding == kCFStringEncodingUnicode)
2665         {
2666             if (szOut != NULL)
2667                 CFStringGetCharacters(theString, CFRangeMake(0, nOutSize - 1), (UniChar*) szOut);
2668
2669             nRealOutSize = CFStringGetLength(theString) + 1;
2670         }
2671         else
2672         {
2673             CFStringGetBytes(
2674                 theString,
2675                 CFRangeMake(0, CFStringGetLength(theString)),
2676                 m_encoding,
2677                 0, //what to put in characters that can't be converted -
2678                     //0 tells CFString to return NULL if it meets such a character
2679                 false, //not an external representation
2680                 (UInt8*) szOut,
2681                 nOutSize,
2682                 (CFIndex*) &nRealOutSize
2683                         );
2684         }
2685
2686         CFRelease(theString);
2687
2688 #if SIZEOF_WCHAR_T == 4
2689         delete[] szUniBuffer;
2690 #endif
2691
2692         return  nRealOutSize - 1;
2693     }
2694
2695     virtual wxMBConv *Clone() const { return new wxMBConv_cocoa(*this); }
2696
2697     bool IsOk() const
2698     {
2699         return m_encoding != kCFStringEncodingInvalidId &&
2700               CFStringIsEncodingAvailable(m_encoding);
2701     }
2702
2703 private:
2704     CFStringEncoding m_encoding ;
2705 };
2706
2707 #endif // defined(__WXCOCOA__)
2708
2709 // ============================================================================
2710 // Mac conversion classes
2711 // ============================================================================
2712
2713 #if defined(__WXMAC__) && defined(TARGET_CARBON)
2714
2715 class wxMBConv_mac : public wxMBConv
2716 {
2717 public:
2718     wxMBConv_mac()
2719     {
2720         Init(CFStringGetSystemEncoding()) ;
2721     }
2722
2723     wxMBConv_mac(const wxMBConv_mac& conv)
2724     {
2725         Init(conv.m_char_encoding);
2726     }
2727
2728 #if wxUSE_FONTMAP
2729     wxMBConv_mac(const wxChar* name)
2730     {
2731         Init( wxMacGetSystemEncFromFontEnc( wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) );
2732     }
2733 #endif
2734
2735     wxMBConv_mac(wxFontEncoding encoding)
2736     {
2737         Init( wxMacGetSystemEncFromFontEnc(encoding) );
2738     }
2739
2740     virtual ~wxMBConv_mac()
2741     {
2742         OSStatus status = noErr ;
2743         if (m_MB2WC_converter)
2744             status = TECDisposeConverter(m_MB2WC_converter);
2745         if (m_WC2MB_converter)
2746             status = TECDisposeConverter(m_WC2MB_converter);
2747     }
2748
2749     void Init( TextEncodingBase encoding,TextEncodingVariant encodingVariant = kTextEncodingDefaultVariant ,
2750             TextEncodingFormat encodingFormat = kTextEncodingDefaultFormat)
2751     {
2752         m_MB2WC_converter = NULL ;
2753         m_WC2MB_converter = NULL ;
2754         m_char_encoding = CreateTextEncoding(encoding, encodingVariant, encodingFormat) ;
2755         m_unicode_encoding = CreateTextEncoding(kTextEncodingUnicodeDefault, 0, kUnicode16BitFormat) ;
2756     }
2757
2758     virtual void CreateIfNeeded() const
2759     {
2760         if ( m_MB2WC_converter == NULL && m_WC2MB_converter == NULL )
2761         {
2762             OSStatus status = noErr ;
2763             status = TECCreateConverter(&m_MB2WC_converter,
2764                                     m_char_encoding,
2765                                     m_unicode_encoding);
2766             wxASSERT_MSG( status == noErr , _("Unable to create TextEncodingConverter")) ;
2767             status = TECCreateConverter(&m_WC2MB_converter,
2768                                     m_unicode_encoding,
2769                                     m_char_encoding);
2770             wxASSERT_MSG( status == noErr , _("Unable to create TextEncodingConverter")) ;
2771         }
2772     }
2773
2774     size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2775     {
2776         CreateIfNeeded() ;
2777         OSStatus status = noErr ;
2778         ByteCount byteOutLen ;
2779         ByteCount byteInLen = strlen(psz) + 1;
2780         wchar_t *tbuf = NULL ;
2781         UniChar* ubuf = NULL ;
2782         size_t res = 0 ;
2783
2784         if (buf == NULL)
2785         {
2786             // Apple specs say at least 32
2787             n = wxMax( 32, byteInLen ) ;
2788             tbuf = (wchar_t*) malloc( n * SIZEOF_WCHAR_T ) ;
2789         }
2790
2791         ByteCount byteBufferLen = n * sizeof( UniChar ) ;
2792
2793 #if SIZEOF_WCHAR_T == 4
2794         ubuf = (UniChar*) malloc( byteBufferLen + 2 ) ;
2795 #else
2796         ubuf = (UniChar*) (buf ? buf : tbuf) ;
2797 #endif
2798
2799         status = TECConvertText(
2800             m_MB2WC_converter, (ConstTextPtr) psz, byteInLen, &byteInLen,
2801             (TextPtr) ubuf, byteBufferLen, &byteOutLen);
2802
2803 #if SIZEOF_WCHAR_T == 4
2804         // we have to terminate here, because n might be larger for the trailing zero, and if UniChar
2805         // is not properly terminated we get random characters at the end
2806         ubuf[byteOutLen / sizeof( UniChar ) ] = 0 ;
2807         wxMBConvUTF16 converter ;
2808         res = converter.MB2WC( (buf ? buf : tbuf), (const char*)ubuf, n ) ;
2809         free( ubuf ) ;
2810 #else
2811         res = byteOutLen / sizeof( UniChar ) ;
2812 #endif
2813
2814         if ( buf == NULL )
2815              free(tbuf) ;
2816
2817         if ( buf  && res < n)
2818             buf[res] = 0;
2819
2820         return res ;
2821     }
2822
2823     size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const
2824     {
2825         CreateIfNeeded() ;
2826         OSStatus status = noErr ;
2827         ByteCount byteOutLen ;
2828         ByteCount byteInLen = wxWcslen(psz) * SIZEOF_WCHAR_T ;
2829
2830         char *tbuf = NULL ;
2831
2832         if (buf == NULL)
2833         {
2834             // Apple specs say at least 32
2835             n = wxMax( 32, ((byteInLen / SIZEOF_WCHAR_T) * 8) + SIZEOF_WCHAR_T );
2836             tbuf = (char*) malloc( n ) ;
2837         }
2838
2839         ByteCount byteBufferLen = n ;
2840         UniChar* ubuf = NULL ;
2841
2842 #if SIZEOF_WCHAR_T == 4
2843         wxMBConvUTF16 converter ;
2844         size_t unicharlen = converter.WC2MB( NULL, psz, 0 ) ;
2845         byteInLen = unicharlen ;
2846         ubuf = (UniChar*) malloc( byteInLen + 2 ) ;
2847         converter.WC2MB( (char*) ubuf, psz, unicharlen + 2 ) ;
2848 #else
2849         ubuf = (UniChar*) psz ;
2850 #endif
2851
2852         status = TECConvertText(
2853             m_WC2MB_converter, (ConstTextPtr) ubuf, byteInLen, &byteInLen,
2854             (TextPtr) (buf ? buf : tbuf), byteBufferLen, &byteOutLen);
2855
2856 #if SIZEOF_WCHAR_T == 4
2857         free( ubuf ) ;
2858 #endif
2859
2860         if ( buf == NULL )
2861             free(tbuf) ;
2862
2863         size_t res = byteOutLen ;
2864         if ( buf  && res < n)
2865         {
2866             buf[res] = 0;
2867
2868             //we need to double-trip to verify it didn't insert any ? in place
2869             //of bogus characters
2870             wxWCharBuffer wcBuf(n);
2871             size_t pszlen = wxWcslen(psz);
2872             if ( MB2WC(wcBuf.data(), buf, n) == wxCONV_FAILED ||
2873                         wxWcslen(wcBuf) != pszlen ||
2874                         memcmp(wcBuf, psz, pszlen * sizeof(wchar_t)) != 0 )
2875             {
2876                 // we didn't obtain the same thing we started from, hence
2877                 // the conversion was lossy and we consider that it failed
2878                 return wxCONV_FAILED;
2879             }
2880         }
2881
2882         return res ;
2883     }
2884
    // Return a new heap-allocated converter made with the copy constructor.
    // NOTE(review): the copy constructor is not visible in this part of the
    // file -- confirm that it does not share the TECObjectRef handles with
    // the original, since the destructor disposes of them.
    virtual wxMBConv *Clone() const { return new wxMBConv_mac(*this); }
2886
2887     bool IsOk() const
2888     {
2889         CreateIfNeeded() ;
2890         return m_MB2WC_converter != NULL && m_WC2MB_converter != NULL;
2891     }
2892
protected :
    // TEC converter objects for the two directions.  Init() sets both to
    // NULL and the const method CreateIfNeeded() creates them, hence mutable.
    mutable TECObjectRef m_MB2WC_converter;
    mutable TECObjectRef m_WC2MB_converter;

    // encodings of the multibyte and of the Unicode side, both set by Init()
    TextEncodingBase m_char_encoding;
    TextEncodingBase m_unicode_encoding;
2899 };
2900
2901 // MB is decomposed (D) normalized UTF8
2902
2903 class wxMBConv_macUTF8D : public wxMBConv_mac
2904 {
2905 public :
2906     wxMBConv_macUTF8D()
2907     {
2908         Init( kTextEncodingUnicodeDefault , kUnicodeNoSubset , kUnicodeUTF8Format ) ;
2909         m_uni = NULL;
2910         m_uniBack = NULL ;
2911     }
2912
2913     virtual ~wxMBConv_macUTF8D()
2914     {
2915         if (m_uni!=NULL)
2916             DisposeUnicodeToTextInfo(&m_uni);
2917         if (m_uniBack!=NULL)
2918             DisposeUnicodeToTextInfo(&m_uniBack);
2919     }
2920
2921     size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const
2922     {
2923         CreateIfNeeded() ;
2924         OSStatus status = noErr ;
2925         ByteCount byteOutLen ;
2926         ByteCount byteInLen = wxWcslen(psz) * SIZEOF_WCHAR_T ;
2927
2928         char *tbuf = NULL ;
2929
2930         if (buf == NULL)
2931         {
2932             // Apple specs say at least 32
2933             n = wxMax( 32, ((byteInLen / SIZEOF_WCHAR_T) * 8) + SIZEOF_WCHAR_T );
2934             tbuf = (char*) malloc( n ) ;
2935         }
2936
2937         ByteCount byteBufferLen = n ;
2938         UniChar* ubuf = NULL ;
2939
2940 #if SIZEOF_WCHAR_T == 4
2941         wxMBConvUTF16 converter ;
2942         size_t unicharlen = converter.WC2MB( NULL, psz, 0 ) ;
2943         byteInLen = unicharlen ;
2944         ubuf = (UniChar*) malloc( byteInLen + 2 ) ;
2945         converter.WC2MB( (char*) ubuf, psz, unicharlen + 2 ) ;
2946 #else
2947         ubuf = (UniChar*) psz ;
2948 #endif
2949
2950         // ubuf is a non-decomposed UniChar buffer
2951
2952         ByteCount dcubuflen = byteInLen * 2 + 2 ;
2953         ByteCount dcubufread , dcubufwritten ;
2954         UniChar *dcubuf = (UniChar*) malloc( dcubuflen ) ;
2955
2956         ConvertFromUnicodeToText( m_uni , byteInLen , ubuf ,
2957             kUnicodeDefaultDirectionMask, 0, NULL, NULL, NULL, dcubuflen  , &dcubufread , &dcubufwritten , dcubuf ) ;
2958
2959         // we now convert that decomposed buffer into UTF8
2960
2961         status = TECConvertText(
2962             m_WC2MB_converter, (ConstTextPtr) dcubuf, dcubufwritten, &dcubufread,
2963             (TextPtr) (buf ? buf : tbuf), byteBufferLen, &byteOutLen);
2964
2965         free( dcubuf );
2966
2967 #if SIZEOF_WCHAR_T == 4
2968         free( ubuf ) ;
2969 #endif
2970
2971         if ( buf == NULL )
2972             free(tbuf) ;
2973
2974         size_t res = byteOutLen ;
2975         if ( buf  && res < n)
2976         {
2977             buf[res] = 0;
2978             // don't test for round-trip fidelity yet, we cannot guarantee it yet
2979         }
2980
2981         return res ;
2982     }
2983
2984     size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2985     {
2986         CreateIfNeeded() ;
2987         OSStatus status = noErr ;
2988         ByteCount byteOutLen ;
2989         ByteCount byteInLen = strlen(psz) + 1;
2990         wchar_t *tbuf = NULL ;
2991         UniChar* ubuf = NULL ;
2992         size_t res = 0 ;
2993
2994         if (buf == NULL)
2995         {
2996             // Apple specs say at least 32
2997             n = wxMax( 32, byteInLen ) ;
2998             tbuf = (wchar_t*) malloc( n * SIZEOF_WCHAR_T ) ;
2999         }
3000
3001         ByteCount byteBufferLen = n * sizeof( UniChar ) ;
3002
3003 #if SIZEOF_WCHAR_T == 4
3004         ubuf = (UniChar*) malloc( byteBufferLen + 2 ) ;
3005 #else
3006         ubuf = (UniChar*) (buf ? buf : tbuf) ;
3007 #endif
3008
3009         ByteCount dcubuflen = byteBufferLen * 2 + 2 ;
3010         ByteCount dcubufread , dcubufwritten ;
3011         UniChar *dcubuf = (UniChar*) malloc( dcubuflen ) ;
3012
3013         status = TECConvertText(
3014                                 m_MB2WC_converter, (ConstTextPtr) psz, byteInLen, &byteInLen,
3015                                 (TextPtr) dcubuf, dcubuflen, &byteOutLen);
3016         // we have to terminate here, because n might be larger for the trailing zero, and if UniChar
3017         // is not properly terminated we get random characters at the end
3018         dcubuf[byteOutLen / sizeof( UniChar ) ] = 0 ;
3019
3020         // now from the decomposed UniChar to properly composed uniChar
3021         ConvertFromUnicodeToText( m_uniBack , byteOutLen , dcubuf ,
3022                                   kUnicodeDefaultDirectionMask, 0, NULL, NULL, NULL, dcubuflen  , &dcubufread , &dcubufwritten , ubuf ) ;
3023
3024         free( dcubuf );
3025         byteOutLen = dcubufwritten ;
3026         ubuf[byteOutLen / sizeof( UniChar ) ] = 0 ;
3027
3028
3029 #if SIZEOF_WCHAR_T == 4
3030         wxMBConvUTF16 converter ;
3031         res = converter.MB2WC( (buf ? buf : tbuf), (const char*)ubuf, n ) ;
3032         free( ubuf ) ;
3033 #else
3034         res = byteOutLen / sizeof( UniChar ) ;
3035 #endif
3036
3037         if ( buf == NULL )
3038             free(tbuf) ;
3039
3040         if ( buf  && res < n)
3041             buf[res] = 0;
3042
3043         return res ;
3044     }
3045
3046     virtual void CreateIfNeeded() const
3047     {
3048         wxMBConv_mac::CreateIfNeeded() ;
3049         if ( m_uni == NULL )
3050         {
3051             m_map.unicodeEncoding = CreateTextEncoding(kTextEncodingUnicodeDefault,
3052                 kUnicodeNoSubset, kTextEncodingDefaultFormat);
3053             m_map.otherEncoding = CreateTextEncoding(kTextEncodingUnicodeDefault,
3054                 kUnicodeCanonicalDecompVariant, kTextEncodingDefaultFormat);
3055             m_map.mappingVersion = kUnicodeUseLatestMapping;
3056
3057             OSStatus err = CreateUnicodeToTextInfo(&m_map, &m_uni);
3058             wxASSERT_MSG( err == noErr , _(" Couldn't create the UnicodeConverter")) ;
3059
3060             m_map.unicodeEncoding = CreateTextEncoding(kTextEncodingUnicodeDefault,
3061                                                        kUnicodeNoSubset, kTextEncodingDefaultFormat);
3062             m_map.otherEncoding = CreateTextEncoding(kTextEncodingUnicodeDefault,
3063                                                      kUnicodeCanonicalCompVariant, kTextEncodingDefaultFormat);
3064             m_map.mappingVersion = kUnicodeUseLatestMapping;
3065             err = CreateUnicodeToTextInfo(&m_map, &m_uniBack);
3066             wxASSERT_MSG( err == noErr , _(" Couldn't create the UnicodeConverter")) ;
3067         }
3068     }
3069 protected :
3070     mutable UnicodeToTextInfo   m_uni;
3071     mutable UnicodeToTextInfo   m_uniBack;
3072     mutable UnicodeMapping      m_map;
3073 };
3074 #endif // defined(__WXMAC__) && defined(TARGET_CARBON)
3075
3076 // ============================================================================
3077 // wxEncodingConverter based conversion classes
3078 // ============================================================================
3079
3080 #if wxUSE_FONTMAP
3081
3082 class wxMBConv_wxwin : public wxMBConv
3083 {
3084 private:
3085     void Init()
3086     {
3087         m_ok = m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
3088                w2m.Init(wxFONTENCODING_UNICODE, m_enc);
3089     }
3090
3091 public:
3092     // temporarily just use wxEncodingConverter stuff,
3093     // so that it works while a better implementation is built
3094     wxMBConv_wxwin(const wxChar* name)
3095     {
3096         if (name)
3097             m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
3098         else
3099             m_enc = wxFONTENCODING_SYSTEM;
3100
3101         Init();
3102     }
3103
3104     wxMBConv_wxwin(wxFontEncoding enc)
3105     {
3106         m_enc = enc;
3107
3108         Init();
3109     }
3110
3111     size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
3112     {
3113         size_t inbuf = strlen(psz);
3114         if (buf)
3115         {
3116             if (!m2w.Convert(psz, buf))
3117                 return wxCONV_FAILED;
3118         }
3119         return inbuf;
3120     }
3121
3122     size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
3123     {
3124         const size_t inbuf = wxWcslen(psz);
3125         if (buf)
3126         {
3127             if (!w2m.Convert(psz, buf))
3128                 return wxCONV_FAILED;
3129         }
3130
3131         return inbuf;
3132     }
3133
3134     virtual size_t GetMBNulLen() const
3135     {
3136         switch ( m_enc )
3137         {
3138             case wxFONTENCODING_UTF16BE:
3139             case wxFONTENCODING_UTF16LE:
3140                 return 2;
3141
3142             case wxFONTENCODING_UTF32BE:
3143             case wxFONTENCODING_UTF32LE:
3144                 return 4;
3145
3146             default:
3147                 return 1;
3148         }
3149     }
3150
3151     virtual wxMBConv *Clone() const { return new wxMBConv_wxwin(m_enc); }
3152
3153     bool IsOk() const { return m_ok; }
3154
3155 public:
3156     wxFontEncoding m_enc;
3157     wxEncodingConverter m2w, w2m;
3158
3159 private:
3160     // were we initialized successfully?
3161     bool m_ok;
3162
3163     DECLARE_NO_COPY_CLASS(wxMBConv_wxwin)
3164 };
3165
3166 // make the constructors available for unit testing
3167 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_wxwin( const wxChar* name )
3168 {
3169     wxMBConv_wxwin* result = new wxMBConv_wxwin( name );
3170     if ( !result->IsOk() )
3171     {
3172         delete result;
3173         return 0;
3174     }
3175
3176     return result;
3177 }
3178
3179 #endif // wxUSE_FONTMAP
3180
3181 // ============================================================================
3182 // wxCSConv implementation
3183 // ============================================================================
3184
3185 void wxCSConv::Init()
3186 {
3187     m_name = NULL;
3188     m_convReal =  NULL;
3189     m_deferred = true;
3190 }
3191
3192 wxCSConv::wxCSConv(const wxChar *charset)
3193 {
3194     Init();
3195
3196     if ( charset )
3197     {
3198         SetName(charset);
3199     }
3200
3201 #if wxUSE_FONTMAP
3202     m_encoding = wxFontMapperBase::GetEncodingFromName(charset);
3203 #else
3204     m_encoding = wxFONTENCODING_SYSTEM;
3205 #endif
3206 }
3207
3208 wxCSConv::wxCSConv(wxFontEncoding encoding)
3209 {
3210     if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
3211     {
3212         wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
3213
3214         encoding = wxFONTENCODING_SYSTEM;
3215     }
3216
3217     Init();
3218
3219     m_encoding = encoding;
3220 }
3221
3222 wxCSConv::~wxCSConv()
3223 {
3224     Clear();
3225 }
3226
3227 wxCSConv::wxCSConv(const wxCSConv& conv)
3228         : wxMBConv()
3229 {
3230     Init();
3231
3232     SetName(conv.m_name);
3233     m_encoding = conv.m_encoding;
3234 }
3235
3236 wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
3237 {
3238     Clear();
3239
3240     SetName(conv.m_name);
3241     m_encoding = conv.m_encoding;
3242
3243     return *this;
3244 }
3245
3246 void wxCSConv::Clear()
3247 {
3248     free(m_name);
3249     delete m_convReal;
3250
3251     m_name = NULL;
3252     m_convReal = NULL;
3253 }
3254
3255 void wxCSConv::SetName(const wxChar *charset)
3256 {
3257     if (charset)
3258     {
3259         m_name = wxStrdup(charset);
3260         m_deferred = true;
3261     }
3262 }
3263
#if wxUSE_FONTMAP

// Hash map from a font encoding to a charset name.  wxCSConv::DoCreate() uses
// it to remember the name iconv accepted for an encoding; an empty string
// records that none of the known names worked.
WX_DECLARE_HASH_MAP( wxFontEncoding, wxString, wxIntegerHash, wxIntegerEqual,
                     wxEncodingNameCache );

// the cache itself, local to this file
static wxEncodingNameCache gs_nameCache;
#endif
3271
// Create the object doing the real work for the current name/encoding.
//
// Returns a new converter which the caller stores in m_convReal (see
// CreateConvIfNeeded()), or NULL.  NULL is returned both when no converter
// object is needed (wxFONTENCODING_ISO8859_1 and wxFONTENCODING_DEFAULT) and
// when none could be created; in the latter case an error is logged unless
// one of the early "return NULL" statements below was taken.
wxMBConv *wxCSConv::DoCreate() const
{
#if wxUSE_FONTMAP
    wxLogTrace(TRACE_STRCONV,
               wxT("creating conversion for %s"),
               (m_name ? m_name
                       : wxFontMapperBase::GetEncodingName(m_encoding).c_str()));
#endif // wxUSE_FONTMAP

    // check for the special case of ASCII or ISO8859-1 charset: as we have
    // special knowledge of it anyhow, we don't need to create a special
    // conversion object
    if ( m_encoding == wxFONTENCODING_ISO8859_1 ||
            m_encoding == wxFONTENCODING_DEFAULT )
    {
        // don't convert at all
        return NULL;
    }

    // we trust OS to do conversion better than we can so try external
    // conversion methods first
    //
    // the full order is:
    //      1. OS conversion (iconv() under Unix or Win32 API)
    //      2. hard coded conversions for UTF
    //      3. wxEncodingConverter as fall back

    // step (1)
#ifdef HAVE_ICONV
#if !wxUSE_FONTMAP
    // without the font mapper only the name can be used with iconv
    if ( m_name )
#endif // !wxUSE_FONTMAP
    {
        wxString name(m_name);
#if wxUSE_FONTMAP
        wxFontEncoding encoding(m_encoding);
#endif

        if ( !name.empty() )
        {
            // first try the name exactly as it was given to us
            wxMBConv_iconv *conv = new wxMBConv_iconv(name);
            if ( conv->IsOk() )
                return conv;

            delete conv;

#if wxUSE_FONTMAP
            // it didn't work: map the name to an encoding so that the other
            // names of this encoding can be tried below
            encoding =
                wxFontMapperBase::Get()->CharsetToEncoding(name, false);
#endif // wxUSE_FONTMAP
        }
#if wxUSE_FONTMAP
        {
            // did we already find out which name (if any) works with iconv?
            const wxEncodingNameCache::iterator it = gs_nameCache.find(encoding);
            if ( it != gs_nameCache.end() )
            {
                // NOTE(review): a cached failure returns NULL right here,
                // without trying steps (2) and (3) and without logging,
                // unlike the first attempt for the same encoding which falls
                // through to them -- confirm that this is intended
                if ( it->second.empty() )
                    return NULL;

                wxMBConv_iconv *conv = new wxMBConv_iconv(it->second);
                if ( conv->IsOk() )
                    return conv;

                delete conv;
            }

            const wxChar** names = wxFontMapperBase::GetAllEncodingNames(encoding);
            // CS : in case this does not return valid names (e.g. for MacRoman)
            // the encoding got a 'failure' entry in the cache all the same,
            // although it just has to be created using a different method, so
            // only store failed iconv creation attempts (or perhaps we
            // shouldn't do this at all?)
            if ( names[0] != NULL )
            {
                // try all the names in turn and remember the first working one
                for ( ; *names; ++names )
                {
                    wxMBConv_iconv *conv = new wxMBConv_iconv(*names);
                    if ( conv->IsOk() )
                    {
                        gs_nameCache[encoding] = *names;
                        return conv;
                    }

                    delete conv;
                }

                gs_nameCache[encoding] = _T(""); // cache the failure
            }
        }
#endif // wxUSE_FONTMAP
    }
#endif // HAVE_ICONV

#ifdef wxHAVE_WIN32_MB2WC
    {
#if wxUSE_FONTMAP
        wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
                                      : new wxMBConv_win32(m_encoding);
        if ( conv->IsOk() )
            return conv;

        delete conv;
#else
        // without the font mapper nothing else is tried in this configuration
        return NULL;
#endif
    }
#endif // wxHAVE_WIN32_MB2WC

#if defined(__WXMAC__)
    {
        // leave UTF16 and UTF32 to the built-ins of wx
        if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
            ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
        {
#if wxUSE_FONTMAP
            wxMBConv_mac *conv = m_name ? new wxMBConv_mac(m_name)
                                        : new wxMBConv_mac(m_encoding);
#else
            wxMBConv_mac *conv = new wxMBConv_mac(m_encoding);
#endif
            if ( conv->IsOk() )
                 return conv;

            delete conv;
        }
    }
#endif

#if defined(__WXCOCOA__)
    {
        if ( m_name || ( m_encoding <= wxFONTENCODING_UTF16 ) )
        {
#if wxUSE_FONTMAP
            wxMBConv_cocoa *conv = m_name ? new wxMBConv_cocoa(m_name)
                                          : new wxMBConv_cocoa(m_encoding);
#else
            wxMBConv_cocoa *conv = new wxMBConv_cocoa(m_encoding);
#endif

            if ( conv->IsOk() )
                 return conv;

            delete conv;
        }
    }
#endif
    // step (2)
    wxFontEncoding enc = m_encoding;
#if wxUSE_FONTMAP
    if ( enc == wxFONTENCODING_SYSTEM && m_name )
    {
        // use "false" to suppress interactive dialogs -- we can be called from
        // anywhere and popping up a dialog from here is the last thing we want to
        // do
        enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
    }
#endif // wxUSE_FONTMAP

    // the UTF encodings are handled by the classes defined in this file
    switch ( enc )
    {
        case wxFONTENCODING_UTF7:
             return new wxMBConvUTF7;

        case wxFONTENCODING_UTF8:
             return new wxMBConvUTF8;

        case wxFONTENCODING_UTF16BE:
             return new wxMBConvUTF16BE;

        case wxFONTENCODING_UTF16LE:
             return new wxMBConvUTF16LE;

        case wxFONTENCODING_UTF32BE:
             return new wxMBConvUTF32BE;

        case wxFONTENCODING_UTF32LE:
             return new wxMBConvUTF32LE;

        default:
             // nothing to do but put here to suppress gcc warnings
             break;
    }

    // step (3)
#if wxUSE_FONTMAP
    {
        wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
                                      : new wxMBConv_wxwin(m_encoding);
        if ( conv->IsOk() )
            return conv;

        delete conv;
    }
#endif // wxUSE_FONTMAP

    // NB: This is a hack to prevent deadlock. What could otherwise happen
    //     in Unicode build: wxConvLocal creation ends up being here
    //     because of some failure and logs the error. But wxLog will try to
    //     attach a timestamp, for which it will need wxConvLocal (to convert
    //     time to char* and then wchar_t*), but that fails, tries to log the
    //     error, but wxLog has an (already locked) critical section that
    //     guards the static buffer.
    static bool alreadyLoggingError = false;
    if (!alreadyLoggingError)
    {
        alreadyLoggingError = true;
        wxLogError(_("Cannot convert from the charset '%s'!"),
                   m_name ? m_name
                      :
#if wxUSE_FONTMAP
                         wxFontMapperBase::GetEncodingDescription(m_encoding).c_str()
#else // !wxUSE_FONTMAP
                         wxString::Format(_("encoding %i"), m_encoding).c_str()
#endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
              );

        alreadyLoggingError = false;
    }

    return NULL;
}
3492
3493 void wxCSConv::CreateConvIfNeeded() const
3494 {
3495     if ( m_deferred )
3496     {
3497         wxCSConv *self = (wxCSConv *)this; // const_cast
3498
3499         // if we don't have neither the name nor the encoding, use the default
3500         // encoding for this system
3501         if ( !m_name && m_encoding == wxFONTENCODING_SYSTEM )
3502         {
3503 #if wxUSE_INTL
3504             self->m_name = wxStrdup(wxLocale::GetSystemEncodingName());
3505 #else
3506             // fallback to some reasonable default:
3507             self->m_encoding = wxFONTENCODING_ISO8859_1;
3508 #endif // wxUSE_INTL
3509         }
3510
3511         self->m_convReal = DoCreate();
3512         self->m_deferred = false;
3513     }
3514 }
3515
3516 bool wxCSConv::IsOk() const
3517 {
3518     CreateConvIfNeeded();
3519
3520     // special case: no convReal created for wxFONTENCODING_ISO8859_1
3521     if ( m_encoding == wxFONTENCODING_ISO8859_1 )
3522         return true; // always ok as we do it ourselves
3523
3524     // m_convReal->IsOk() is called at its own creation, so we know it must
3525     // be ok if m_convReal is non-NULL
3526     return m_convReal != NULL;
3527 }
3528
3529 size_t wxCSConv::ToWChar(wchar_t *dst, size_t dstLen,
3530                          const char *src, size_t srcLen) const
3531 {
3532     CreateConvIfNeeded();
3533
3534     if (m_convReal)
3535         return m_convReal->ToWChar(dst, dstLen, src, srcLen);
3536
3537     // latin-1 (direct)
3538     return wxMBConv::ToWChar(dst, dstLen, src, srcLen);
3539 }
3540
3541 size_t wxCSConv::FromWChar(char *dst, size_t dstLen,
3542                            const wchar_t *src, size_t srcLen) const
3543 {
3544     CreateConvIfNeeded();
3545
3546     if (m_convReal)
3547         return m_convReal->FromWChar(dst, dstLen, src, srcLen);
3548
3549     // latin-1 (direct)
3550     return wxMBConv::FromWChar(dst, dstLen, src, srcLen);
3551 }
3552
3553 size_t wxCSConv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
3554 {
3555     CreateConvIfNeeded();
3556
3557     if (m_convReal)
3558         return m_convReal->MB2WC(buf, psz, n);
3559
3560     // latin-1 (direct)
3561     size_t len = strlen(psz);
3562
3563     if (buf)
3564     {
3565         for (size_t c = 0; c <= len; c++)
3566             buf[c] = (unsigned char)(psz[c]);
3567     }
3568
3569     return len;
3570 }
3571
3572 size_t wxCSConv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
3573 {
3574     CreateConvIfNeeded();
3575
3576     if (m_convReal)
3577         return m_convReal->WC2MB(buf, psz, n);
3578
3579     // latin-1 (direct)
3580     const size_t len = wxWcslen(psz);
3581     if (buf)
3582     {
3583         for (size_t c = 0; c <= len; c++)
3584         {
3585             if (psz[c] > 0xFF)
3586                 return wxCONV_FAILED;
3587
3588             buf[c] = (char)psz[c];
3589         }
3590     }
3591     else
3592     {
3593         for (size_t c = 0; c <= len; c++)
3594         {
3595             if (psz[c] > 0xFF)
3596                 return wxCONV_FAILED;
3597         }
3598     }
3599
3600     return len;
3601 }
3602
3603 size_t wxCSConv::GetMBNulLen() const
3604 {
3605     CreateConvIfNeeded();
3606
3607     if ( m_convReal )
3608     {
3609         return m_convReal->GetMBNulLen();
3610     }
3611
3612     return 1;
3613 }
3614
3615 // ----------------------------------------------------------------------------
3616 // globals
3617 // ----------------------------------------------------------------------------
3618
// the object behind wxConvLibc: its class depends on the platform
#ifdef __WINDOWS__
    static wxMBConv_win32 wxConvLibcObj;
#elif defined(__WXMAC__) && !defined(__MACH__)
    static wxMBConv_mac wxConvLibcObj ;
#else
    static wxMBConvLibc wxConvLibcObj;
#endif

// the objects behind the other predefined converters
static wxCSConv wxConvLocalObj(wxFONTENCODING_SYSTEM);
static wxCSConv wxConvISO8859_1Obj(wxFONTENCODING_ISO8859_1);
static wxMBConvUTF7 wxConvUTF7Obj;
static wxMBConvUTF8 wxConvUTF8Obj;
#if defined(__WXMAC__) && defined(TARGET_CARBON)
static wxMBConv_macUTF8D wxConvMacUTF8DObj;
#endif

// the exported names are references (or pointers) to the static objects above
WXDLLIMPEXP_DATA_BASE(wxMBConv&) wxConvLibc = wxConvLibcObj;
WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvLocal = wxConvLocalObj;
WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvISO8859_1 = wxConvISO8859_1Obj;
WXDLLIMPEXP_DATA_BASE(wxMBConvUTF7&) wxConvUTF7 = wxConvUTF7Obj;
WXDLLIMPEXP_DATA_BASE(wxMBConvUTF8&) wxConvUTF8 = wxConvUTF8Obj;
WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = &wxConvLibcObj;
WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvUI = &wxConvLocal;
// file names: decomposed UTF-8 under Carbon, plain UTF-8 in the other
// __WXOSX__ builds and the libc converter everywhere else
WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName = &
#ifdef __WXOSX__
#if defined(__WXMAC__) && defined(TARGET_CARBON)
                                    wxConvMacUTF8DObj;
#else
                                    wxConvUTF8Obj;
#endif
#else // !__WXOSX__
                                    wxConvLibcObj;
#endif // __WXOSX__/!__WXOSX__
3651
3652 #if wxUSE_UNICODE
3653
3654 wxWCharBuffer wxSafeConvertMB2WX(const char *s)
3655 {
3656     if ( !s )
3657         return wxWCharBuffer();
3658
3659     wxWCharBuffer wbuf(wxConvLibc.cMB2WX(s));
3660     if ( !wbuf )
3661         wbuf = wxConvUTF8.cMB2WX(s);
3662     if ( !wbuf )
3663         wbuf = wxConvISO8859_1.cMB2WX(s);
3664
3665     return wbuf;
3666 }
3667
3668 wxCharBuffer wxSafeConvertWX2MB(const wchar_t *ws)
3669 {
3670     if ( !ws )
3671         return wxCharBuffer();
3672
3673     wxCharBuffer buf(wxConvLibc.cWX2MB(ws));
3674     if ( !buf )
3675         buf = wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL).cWX2MB(ws);
3676
3677     return buf;
3678 }
3679
3680 #endif // wxUSE_UNICODE
3681
3682 #else // !wxUSE_WCHAR_T
3683
// stand-ins in absence of wchar_t: the predefined converters still exist,
// but they are all plain wxMBConv objects in this configuration
WXDLLIMPEXP_DATA_BASE(wxMBConv) wxConvLibc,
                                wxConvISO8859_1,
                                wxConvLocal,
                                wxConvUTF8;
3689
3690 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T