src/common/strconv.cpp

   1 /////////////////////////////////////////////////////////////////////////////
   2 // Name:        src/common/strconv.cpp
   3 // Purpose:     Unicode conversion classes
   4 // Author:      Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
   5 //              Ryan Norton, Fredrik Roubert (UTF7)
   6 // Modified by:
   7 // Created:     29/01/98
   8 // RCS-ID:      $Id$
   9 // Copyright:   (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
  10 //              (c) 2000-2003 Vadim Zeitlin
  11 //              (c) 2004 Ryan Norton, Fredrik Roubert
  12 // Licence:     wxWindows licence
  13 /////////////////////////////////////////////////////////////////////////////
  14
  15 // For compilers that support precompilation, includes "wx.h".
  16 #include "wx/wxprec.h"
  17
  18 #ifdef __BORLANDC__
  19     #pragma hdrstop
  20 #endif  //__BORLANDC__
  21
  22 #ifndef WX_PRECOMP
  23     #include "wx/intl.h"
  24     #include "wx/log.h"
  25     #include "wx/utils.h"
  26     #include "wx/hashmap.h"
  27 #endif
  28
  29 #include "wx/strconv.h"
  30
  31 #if wxUSE_WCHAR_T
  32
  33 #ifndef __WXWINCE__
  34 #include <errno.h>
  35 #endif
  36
  37 #include <ctype.h>
  38 #include <string.h>
  39 #include <stdlib.h>
  40
  41 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
  42     #include "wx/msw/private.h"
  43     #include "wx/msw/missing.h"
  44     #define wxHAVE_WIN32_MB2WC
  45 #endif
  46
  47 #ifdef __SALFORDC__
  48     #include <clib.h>
  49 #endif
  50
  51 #ifdef HAVE_ICONV
  52     #include <iconv.h>
  53     #include "wx/thread.h"
  54 #endif
  55
  56 #include "wx/encconv.h"
  57 #include "wx/fontmap.h"
  58
  59 #ifdef __DARWIN__
  60 #include "wx/mac/corefoundation/private/strconv_cf.h"
  61 #endif //def __DARWIN__
  62
  63
  64 #define TRACE_STRCONV _T("strconv")
  65
  66 // WC_UTF16 is defined only if sizeof(wchar_t) == 2, otherwise it's supposed to
  67 // be 4 bytes
  68 #if SIZEOF_WCHAR_T == 2
  69     #define WC_UTF16
  70 #endif
  71
  72
  73 // ============================================================================
  74 // implementation
  75 // ============================================================================
  76
  77 // helper function of cMB2WC(): check if n bytes at this location are all NUL
  78 static bool NotAllNULs(const char *p, size_t n)
  79 {
  80     while ( n && *p++ == '\0' )
  81         n--;
  82
  83     return n != 0;
  84 }
  85
  86 // ----------------------------------------------------------------------------
  87 // UTF-16 en/decoding to/from UCS-4 with surrogates handling
  88 // ----------------------------------------------------------------------------
  89
  90 static size_t encode_utf16(wxUint32 input, wxUint16 *output)
  91 {
  92     if (input <= 0xffff)
  93     {
  94         if (output)
  95             *output = (wxUint16) input;
  96
  97         return 1;
  98     }
  99     else if (input >= 0x110000)
 100     {
 101         return wxCONV_FAILED;
 102     }
 103     else
 104     {
 105         if (output)
 106         {
 107             *output++ = (wxUint16) ((input >> 10) + 0xd7c0);
 108             *output = (wxUint16) ((input & 0x3ff) + 0xdc00);
 109         }
 110
 111         return 2;
 112     }
 113 }
 114
 115 static size_t decode_utf16(const wxUint16* input, wxUint32& output)
 116 {
 117     if ((*input < 0xd800) || (*input > 0xdfff))
 118     {
 119         output = *input;
 120         return 1;
 121     }
 122     else if ((input[1] < 0xdc00) || (input[1] > 0xdfff))
 123     {
 124         output = *input;
 125         return wxCONV_FAILED;
 126     }
 127     else
 128     {
 129         output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
 130         return 2;
 131     }
 132 }
 133
 134 #ifdef WC_UTF16
 135     typedef wchar_t wxDecodeSurrogate_t;
 136 #else // !WC_UTF16
 137     typedef wxUint16 wxDecodeSurrogate_t;
 138 #endif // WC_UTF16/!WC_UTF16
 139
 140 // returns the next UTF-32 character from the wchar_t buffer and advances the
 141 // pointer to the character after this one
 142 //
 143 // if an invalid character is found, *pSrc is set to NULL, the caller must
 144 // check for this
 145 static wxUint32 wxDecodeSurrogate(const wxDecodeSurrogate_t **pSrc)
 146 {
 147     wxUint32 out;
 148     const size_t
 149         n = decode_utf16(wx_reinterpret_cast(const wxUint16 *, *pSrc), out);
 150     if ( n == wxCONV_FAILED )
 151         *pSrc = NULL;
 152     else
 153         *pSrc += n;
 154
 155     return out;
 156 }
 157
 158 // ----------------------------------------------------------------------------
 159 // wxMBConv
 160 // ----------------------------------------------------------------------------
 161
 162 size_t
 163 wxMBConv::ToWChar(wchar_t *dst, size_t dstLen,
 164                   const char *src, size_t srcLen) const
 165 {
 166     // although new conversion classes are supposed to implement this function
 167     // directly, the existins ones only implement the old MB2WC() and so, to
 168     // avoid to have to rewrite all conversion classes at once, we provide a
 169     // default (but not efficient) implementation of this one in terms of the
 170     // old function by copying the input to ensure that it's NUL-terminated and
 171     // then using MB2WC() to convert it
 172
 173     // the number of chars [which would be] written to dst [if it were not NULL]
 174     size_t dstWritten = 0;
 175
 176     // the number of NULs terminating this string
 177     size_t nulLen = 0;  // not really needed, but just to avoid warnings
 178
 179     // if we were not given the input size we just have to assume that the
 180     // string is properly terminated as we have no way of knowing how long it
 181     // is anyhow, but if we do have the size check whether there are enough
 182     // NULs at the end
 183     wxCharBuffer bufTmp;
 184     const char *srcEnd;
 185     if ( srcLen != wxNO_LEN )
 186     {
 187         // we need to know how to find the end of this string
 188         nulLen = GetMBNulLen();
 189         if ( nulLen == wxCONV_FAILED )
 190             return wxCONV_FAILED;
 191
 192         // if there are enough NULs we can avoid the copy
 193         if ( srcLen < nulLen || NotAllNULs(src + srcLen - nulLen, nulLen) )
 194         {
 195             // make a copy in order to properly NUL-terminate the string
 196             bufTmp = wxCharBuffer(srcLen + nulLen - 1 /* 1 will be added */);
 197             char * const p = bufTmp.data();
 198             memcpy(p, src, srcLen);
 199             for ( char *s = p + srcLen; s < p + srcLen + nulLen; s++ )
 200                 *s = '\0';
 201
 202             src = bufTmp;
 203         }
 204
 205         srcEnd = src + srcLen;
 206     }
 207     else // quit after the first loop iteration
 208     {
 209         srcEnd = NULL;
 210     }
 211
 212     for ( ;; )
 213     {
 214         // try to convert the current chunk
 215         size_t lenChunk = MB2WC(NULL, src, 0);
 216         if ( lenChunk == wxCONV_FAILED )
 217             return wxCONV_FAILED;
 218
 219         lenChunk++; // for the L'\0' at the end of this chunk
 220
 221         dstWritten += lenChunk;
 222
 223         if ( lenChunk == 1 )
 224         {
 225             // nothing left in the input string, conversion succeeded
 226             break;
 227         }
 228
 229         if ( dst )
 230         {
 231             if ( dstWritten > dstLen )
 232                 return wxCONV_FAILED;
 233
 234             if ( MB2WC(dst, src, lenChunk) == wxCONV_FAILED )
 235                 return wxCONV_FAILED;
 236
 237             dst += lenChunk;
 238         }
 239
 240         if ( !srcEnd )
 241         {
 242             // we convert just one chunk in this case as this is the entire
 243             // string anyhow
 244             break;
 245         }
 246
 247         // advance the input pointer past the end of this chunk
 248         while ( NotAllNULs(src, nulLen) )
 249         {
 250             // notice that we must skip over multiple bytes here as we suppose
 251             // that if NUL takes 2 or 4 bytes, then all the other characters do
 252             // too and so if advanced by a single byte we might erroneously
 253             // detect sequences of NUL bytes in the middle of the input
 254             src += nulLen;
 255         }
 256
 257         src += nulLen; // skipping over its terminator as well
 258
 259         // note that ">=" (and not just "==") is needed here as the terminator
 260         // we skipped just above could be inside or just after the buffer
 261         // delimited by inEnd
 262         if ( src >= srcEnd )
 263             break;
 264     }
 265
 266     return dstWritten;
 267 }
 268
 269 size_t
 270 wxMBConv::FromWChar(char *dst, size_t dstLen,
 271                     const wchar_t *src, size_t srcLen) const
 272 {
 273     // the number of chars [which would be] written to dst [if it were not NULL]
 274     size_t dstWritten = 0;
 275
 276     // make a copy of the input string unless it is already properly
 277     // NUL-terminated
 278     //
 279     // if we don't know its length we have no choice but to assume that it is,
 280     // indeed, properly terminated
 281     wxWCharBuffer bufTmp;
 282     if ( srcLen == wxNO_LEN )
 283     {
 284         srcLen = wxWcslen(src) + 1;
 285     }
 286     else if ( srcLen != 0 && src[srcLen - 1] != L'\0' )
 287     {
 288         // make a copy in order to properly NUL-terminate the string
 289         bufTmp = wxWCharBuffer(srcLen);
 290         memcpy(bufTmp.data(), src, srcLen * sizeof(wchar_t));
 291         src = bufTmp;
 292     }
 293
 294     const size_t lenNul = GetMBNulLen();
 295     for ( const wchar_t * const srcEnd = src + srcLen;
 296           src < srcEnd;
 297           src += wxWcslen(src) + 1 /* skip L'\0' too */ )
 298     {
 299         // try to convert the current chunk
 300         size_t lenChunk = WC2MB(NULL, src, 0);
 301
 302         if ( lenChunk == wxCONV_FAILED )
 303             return wxCONV_FAILED;
 304
 305         lenChunk += lenNul;
 306         dstWritten += lenChunk;
 307
 308         if ( dst )
 309         {
 310             if ( dstWritten > dstLen )
 311                 return wxCONV_FAILED;
 312
 313             if ( WC2MB(dst, src, lenChunk) == wxCONV_FAILED )
 314                 return wxCONV_FAILED;
 315
 316             dst += lenChunk;
 317         }
 318     }
 319
 320     return dstWritten;
 321 }
 322
 323 size_t wxMBConv::MB2WC(wchar_t *outBuff, const char *inBuff, size_t outLen) const
 324 {
 325     size_t rc = ToWChar(outBuff, outLen, inBuff);
 326     if ( rc != wxCONV_FAILED )
 327     {
 328         // ToWChar() returns the buffer length, i.e. including the trailing
 329         // NUL, while this method doesn't take it into account
 330         rc--;
 331     }
 332
 333     return rc;
 334 }
 335
 336 size_t wxMBConv::WC2MB(char *outBuff, const wchar_t *inBuff, size_t outLen) const
 337 {
 338     size_t rc = FromWChar(outBuff, outLen, inBuff);
 339     if ( rc != wxCONV_FAILED )
 340     {
 341         rc -= GetMBNulLen();
 342     }
 343
 344     return rc;
 345 }
 346
 347 wxMBConv::~wxMBConv()
 348 {
 349     // nothing to do here (necessary for Darwin linking probably)
 350 }
 351
 352 const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
 353 {
 354     if ( psz )
 355     {
 356         // calculate the length of the buffer needed first
 357         const size_t nLen = MB2WC(NULL, psz, 0);
 358         if ( nLen != wxCONV_FAILED )
 359         {
 360             // now do the actual conversion
 361             wxWCharBuffer buf(nLen /* +1 added implicitly */);
 362
 363             // +1 for the trailing NULL
 364             if ( MB2WC(buf.data(), psz, nLen + 1) != wxCONV_FAILED )
 365                 return buf;
 366         }
 367     }
 368
 369     return wxWCharBuffer();
 370 }
 371
 372 const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
 373 {
 374     if ( pwz )
 375     {
 376         const size_t nLen = WC2MB(NULL, pwz, 0);
 377         if ( nLen != wxCONV_FAILED )
 378         {
 379             // extra space for trailing NUL(s)
 380             static const size_t extraLen = GetMaxMBNulLen();
 381
 382             wxCharBuffer buf(nLen + extraLen - 1);
 383             if ( WC2MB(buf.data(), pwz, nLen + extraLen) != wxCONV_FAILED )
 384                 return buf;
 385         }
 386     }
 387
 388     return wxCharBuffer();
 389 }
 390
 391 const wxWCharBuffer
 392 wxMBConv::cMB2WC(const char *inBuff, size_t inLen, size_t *outLen) const
 393 {
 394     const size_t dstLen = ToWChar(NULL, 0, inBuff, inLen);
 395     if ( dstLen != wxCONV_FAILED )
 396     {
 397         wxWCharBuffer wbuf(dstLen - 1);
 398         if ( ToWChar(wbuf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
 399         {
 400             if ( outLen )
 401             {
 402                 *outLen = dstLen;
 403                 if ( wbuf[dstLen - 1] == L'\0' )
 404                     (*outLen)--;
 405             }
 406
 407             return wbuf;
 408         }
 409     }
 410
 411     if ( outLen )
 412         *outLen = 0;
 413
 414     return wxWCharBuffer();
 415 }
 416
 417 const wxCharBuffer
 418 wxMBConv::cWC2MB(const wchar_t *inBuff, size_t inLen, size_t *outLen) const
 419 {
 420     size_t dstLen = FromWChar(NULL, 0, inBuff, inLen);
 421     if ( dstLen != wxCONV_FAILED )
 422     {
 423         // special case of empty input: can't allocate 0 size buffer below as
 424         // wxCharBuffer insists on NUL-terminating it
 425         wxCharBuffer buf(dstLen ? dstLen - 1 : 1);
 426         if ( FromWChar(buf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
 427         {
 428             if ( outLen )
 429             {
 430                 *outLen = dstLen;
 431
 432                 const size_t nulLen = GetMBNulLen();
 433                 if ( dstLen >= nulLen &&
 434                         !NotAllNULs(buf.data() + dstLen - nulLen, nulLen) )
 435                 {
 436                     // in this case the output is NUL-terminated and we're not
 437                     // supposed to count NUL
 438                     *outLen -= nulLen;
 439                 }
 440             }
 441
 442             return buf;
 443         }
 444     }
 445
 446     if ( outLen )
 447         *outLen = 0;
 448
 449     return wxCharBuffer();
 450 }
 451
 452 // ----------------------------------------------------------------------------
 453 // wxMBConvLibc
 454 // ----------------------------------------------------------------------------
 455
 456 size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 457 {
 458     return wxMB2WC(buf, psz, n);
 459 }
 460
 461 size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 462 {
 463     return wxWC2MB(buf, psz, n);
 464 }
 465
 466 // ----------------------------------------------------------------------------
 467 // wxConvBrokenFileNames
 468 // ----------------------------------------------------------------------------
 469
 470 #ifdef __UNIX__
 471
 472 wxConvBrokenFileNames::wxConvBrokenFileNames(const wxString& charset)
 473 {
 474     if ( wxStricmp(charset, _T("UTF-8")) == 0 ||
 475          wxStricmp(charset, _T("UTF8")) == 0  )
 476         m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_PUA);
 477     else
 478         m_conv = new wxCSConv(charset);
 479 }
 480
 481 #endif // __UNIX__
 482
 483 // ----------------------------------------------------------------------------
 484 // UTF-7
 485 // ----------------------------------------------------------------------------
 486
 487 // Implementation (C) 2004 Fredrik Roubert
 488
 489 //
 490 // BASE64 decoding table
 491 //
 492 static const unsigned char utf7unb64[] =
 493 {
 494     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 495     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 496     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 497     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 498     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 499     0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
 500     0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b,
 501     0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 502     0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
 503     0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
 504     0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
 505     0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
 506     0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
 507     0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
 508     0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30,
 509     0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
 510     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 511     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 512     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 513     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 514     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 515     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 516     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 517     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 518     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 519     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 520     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 521     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 522     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 523     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 524     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 525     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
 526 };
 527
 528 size_t wxMBConvUTF7::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 529 {
 530     size_t len = 0;
 531
 532     while ( *psz && (!buf || (len < n)) )
 533     {
 534         unsigned char cc = *psz++;
 535         if (cc != '+')
 536         {
 537             // plain ASCII char
 538             if (buf)
 539                 *buf++ = cc;
 540             len++;
 541         }
 542         else if (*psz == '-')
 543         {
 544             // encoded plus sign
 545             if (buf)
 546                 *buf++ = cc;
 547             len++;
 548             psz++;
 549         }
 550         else // start of BASE64 encoded string
 551         {
 552             bool lsb, ok;
 553             unsigned int d, l;
 554             for ( ok = lsb = false, d = 0, l = 0;
 555                   (cc = utf7unb64[(unsigned char)*psz]) != 0xff;
 556                   psz++ )
 557             {
 558                 d <<= 6;
 559                 d += cc;
 560                 for (l += 6; l >= 8; lsb = !lsb)
 561                 {
 562                     unsigned char c = (unsigned char)((d >> (l -= 8)) % 256);
 563                     if (lsb)
 564                     {
 565                         if (buf)
 566                             *buf++ |= c;
 567                         len ++;
 568                     }
 569                     else
 570                     {
 571                         if (buf)
 572                             *buf = (wchar_t)(c << 8);
 573                     }
 574
 575                     ok = true;
 576                 }
 577             }
 578
 579             if ( !ok )
 580             {
 581                 // in valid UTF7 we should have valid characters after '+'
 582                 return wxCONV_FAILED;
 583             }
 584
 585             if (*psz == '-')
 586                 psz++;
 587         }
 588     }
 589
 590     if ( buf && (len < n) )
 591         *buf = '\0';
 592
 593     return len;
 594 }
 595
 596 //
 597 // BASE64 encoding table
 598 //
 599 static const unsigned char utf7enb64[] =
 600 {
 601     'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
 602     'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
 603     'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
 604     'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
 605     'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
 606     'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
 607     'w', 'x', 'y', 'z', '0', '1', '2', '3',
 608     '4', '5', '6', '7', '8', '9', '+', '/'
 609 };
 610
 611 //
 612 // UTF-7 encoding table
 613 //
 614 // 0 - Set D (directly encoded characters)
 615 // 1 - Set O (optional direct characters)
 616 // 2 - whitespace characters (optional)
 617 // 3 - special characters
 618 //
 619 static const unsigned char utf7encode[128] =
 620 {
 621     3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
 622     3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
 623     2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
 624     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
 625     1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 626     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
 627     1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 628     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
 629 };
 630
 631 size_t wxMBConvUTF7::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 632 {
 633     size_t len = 0;
 634
 635     while (*psz && ((!buf) || (len < n)))
 636     {
 637         wchar_t cc = *psz++;
 638         if (cc < 0x80 && utf7encode[cc] < 1)
 639         {
 640             // plain ASCII char
 641             if (buf)
 642                 *buf++ = (char)cc;
 643
 644             len++;
 645         }
 646 #ifndef WC_UTF16
 647         else if (((wxUint32)cc) > 0xffff)
 648         {
 649             // no surrogate pair generation (yet?)
 650             return wxCONV_FAILED;
 651         }
 652 #endif
 653         else
 654         {
 655             if (buf)
 656                 *buf++ = '+';
 657
 658             len++;
 659             if (cc != '+')
 660             {
 661                 // BASE64 encode string
 662                 unsigned int lsb, d, l;
 663                 for (d = 0, l = 0; /*nothing*/; psz++)
 664                 {
 665                     for (lsb = 0; lsb < 2; lsb ++)
 666                     {
 667                         d <<= 8;
 668                         d += lsb ? cc & 0xff : (cc & 0xff00) >> 8;
 669
 670                         for (l += 8; l >= 6; )
 671                         {
 672                             l -= 6;
 673                             if (buf)
 674                                 *buf++ = utf7enb64[(d >> l) % 64];
 675                             len++;
 676                         }
 677                     }
 678
 679                     cc = *psz;
 680                     if (!(cc) || (cc < 0x80 && utf7encode[cc] < 1))
 681                         break;
 682                 }
 683
 684                 if (l != 0)
 685                 {
 686                     if (buf)
 687                         *buf++ = utf7enb64[((d % 16) << (6 - l)) % 64];
 688
 689                     len++;
 690                 }
 691             }
 692
 693             if (buf)
 694                 *buf++ = '-';
 695             len++;
 696         }
 697     }
 698
 699     if (buf && (len < n))
 700         *buf = 0;
 701
 702     return len;
 703 }
 704
 705 // ----------------------------------------------------------------------------
 706 // UTF-8
 707 // ----------------------------------------------------------------------------
 708
 709 static wxUint32 utf8_max[]=
 710     { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
 711
 712 // boundaries of the private use area we use to (temporarily) remap invalid
 713 // characters invalid in a UTF-8 encoded string
 714 const wxUint32 wxUnicodePUA = 0x100000;
 715 const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
 716
 717 size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 718 {
 719     size_t len = 0;
 720
 721     while (*psz && ((!buf) || (len < n)))
 722     {
 723         const char *opsz = psz;
 724         bool invalid = false;
 725         unsigned char cc = *psz++, fc = cc;
 726         unsigned cnt;
 727         for (cnt = 0; fc & 0x80; cnt++)
 728             fc <<= 1;
 729
 730         if (!cnt)
 731         {
 732             // plain ASCII char
 733             if (buf)
 734                 *buf++ = cc;
 735             len++;
 736
 737             // escape the escape character for octal escapes
 738             if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
 739                     && cc == '\\' && (!buf || len < n))
 740             {
 741                 if (buf)
 742                     *buf++ = cc;
 743                 len++;
 744             }
 745         }
 746         else
 747         {
 748             cnt--;
 749             if (!cnt)
 750             {
 751                 // invalid UTF-8 sequence
 752                 invalid = true;
 753             }
 754             else
 755             {
 756                 unsigned ocnt = cnt - 1;
 757                 wxUint32 res = cc & (0x3f >> cnt);
 758                 while (cnt--)
 759                 {
 760                     cc = *psz;
 761                     if ((cc & 0xC0) != 0x80)
 762                     {
 763                         // invalid UTF-8 sequence
 764                         invalid = true;
 765                         break;
 766                     }
 767
 768                     psz++;
 769                     res = (res << 6) | (cc & 0x3f);
 770                 }
 771
 772                 if (invalid || res <= utf8_max[ocnt])
 773                 {
 774                     // illegal UTF-8 encoding
 775                     invalid = true;
 776                 }
 777                 else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
 778                         res >= wxUnicodePUA && res < wxUnicodePUAEnd)
 779                 {
 780                     // if one of our PUA characters turns up externally
 781                     // it must also be treated as an illegal sequence
 782                     // (a bit like you have to escape an escape character)
 783                     invalid = true;
 784                 }
 785                 else
 786                 {
 787 #ifdef WC_UTF16
 788                     // cast is ok because wchar_t == wxUuint16 if WC_UTF16
 789                     size_t pa = encode_utf16(res, (wxUint16 *)buf);
 790                     if (pa == wxCONV_FAILED)
 791                     {
 792                         invalid = true;
 793                     }
 794                     else
 795                     {
 796                         if (buf)
 797                             buf += pa;
 798                         len += pa;
 799                     }
 800 #else // !WC_UTF16
 801                     if (buf)
 802                         *buf++ = (wchar_t)res;
 803                     len++;
 804 #endif // WC_UTF16/!WC_UTF16
 805                 }
 806             }
 807
 808             if (invalid)
 809             {
 810                 if (m_options & MAP_INVALID_UTF8_TO_PUA)
 811                 {
 812                     while (opsz < psz && (!buf || len < n))
 813                     {
 814 #ifdef WC_UTF16
 815                         // cast is ok because wchar_t == wxUuint16 if WC_UTF16
 816                         size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
 817                         wxASSERT(pa != wxCONV_FAILED);
 818                         if (buf)
 819                             buf += pa;
 820                         opsz++;
 821                         len += pa;
 822 #else
 823                         if (buf)
 824                             *buf++ = (wchar_t)(wxUnicodePUA + (unsigned char)*opsz);
 825                         opsz++;
 826                         len++;
 827 #endif
 828                     }
 829                 }
 830                 else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
 831                 {
 832                     while (opsz < psz && (!buf || len < n))
 833                     {
 834                         if ( buf && len + 3 < n )
 835                         {
 836                             unsigned char on = *opsz;
 837                             *buf++ = L'\\';
 838                             *buf++ = (wchar_t)( L'0' + on / 0100 );
 839                             *buf++ = (wchar_t)( L'0' + (on % 0100) / 010 );
 840                             *buf++ = (wchar_t)( L'0' + on % 010 );
 841                         }
 842
 843                         opsz++;
 844                         len += 4;
 845                     }
 846                 }
 847                 else // MAP_INVALID_UTF8_NOT
 848                 {
 849                     return wxCONV_FAILED;
 850                 }
 851             }
 852         }
 853     }
 854
 855     if (buf && (len < n))
 856         *buf = 0;
 857
 858     return len;
 859 }
 860
 861 static inline bool isoctal(wchar_t wch)
 862 {
 863     return L'0' <= wch && wch <= L'7';
 864 }
 865
 866 size_t wxMBConvUTF8::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 867 {
 868     size_t len = 0;
 869
 870     while (*psz && ((!buf) || (len < n)))
 871     {
 872         wxUint32 cc;
 873
 874 #ifdef WC_UTF16
 875         // cast is ok for WC_UTF16
 876         size_t pa = decode_utf16((const wxUint16 *)psz, cc);
 877         psz += (pa == wxCONV_FAILED) ? 1 : pa;
 878 #else
 879         cc = (*psz++) & 0x7fffffff;
 880 #endif
 881
 882         if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
 883                 && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
 884         {
 885             if (buf)
 886                 *buf++ = (char)(cc - wxUnicodePUA);
 887             len++;
 888         }
 889         else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL)
 890                     && cc == L'\\' && psz[0] == L'\\' )
 891         {
 892             if (buf)
 893                 *buf++ = (char)cc;
 894             psz++;
 895             len++;
 896         }
 897         else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
 898                     cc == L'\\' &&
 899                         isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
 900         {
 901             if (buf)
 902             {
 903                 *buf++ = (char) ((psz[0] - L'0') * 0100 +
 904                                  (psz[1] - L'0') * 010 +
 905                                  (psz[2] - L'0'));
 906             }
 907
 908             psz += 3;
 909             len++;
 910         }
 911         else
 912         {
 913             unsigned cnt;
 914             for (cnt = 0; cc > utf8_max[cnt]; cnt++)
 915             {
 916             }
 917
 918             if (!cnt)
 919             {
 920                 // plain ASCII char
 921                 if (buf)
 922                     *buf++ = (char) cc;
 923                 len++;
 924             }
 925             else
 926             {
 927                 len += cnt + 1;
 928                 if (buf)
 929                 {
 930                     *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
 931                     while (cnt--)
 932                         *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
 933                 }
 934             }
 935         }
 936     }
 937
 938     if (buf && (len < n))
 939         *buf = 0;
 940
 941     return len;
 942 }
 943
 944 // ============================================================================
 945 // UTF-16
 946 // ============================================================================
 947
 948 #ifdef WORDS_BIGENDIAN
 949     #define wxMBConvUTF16straight wxMBConvUTF16BE
 950     #define wxMBConvUTF16swap     wxMBConvUTF16LE
 951 #else
 952     #define wxMBConvUTF16swap     wxMBConvUTF16BE
 953     #define wxMBConvUTF16straight wxMBConvUTF16LE
 954 #endif
 955
 956 /* static */
 957 size_t wxMBConvUTF16Base::GetLength(const char *src, size_t srcLen)
 958 {
 959     if ( srcLen == wxNO_LEN )
 960     {
 961         // count the number of bytes in input, including the trailing NULs
 962         const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
 963         for ( srcLen = 1; *inBuff++; srcLen++ )
 964             ;
 965
 966         srcLen *= BYTES_PER_CHAR;
 967     }
 968     else // we already have the length
 969     {
 970         // we can only convert an entire number of UTF-16 characters
 971         if ( srcLen % BYTES_PER_CHAR )
 972             return wxCONV_FAILED;
 973     }
 974
 975     return srcLen;
 976 }
 977
 978 // case when in-memory representation is UTF-16 too
 979 #ifdef WC_UTF16
 980
 981 // ----------------------------------------------------------------------------
 982 // conversions without endianness change
 983 // ----------------------------------------------------------------------------
 984
 985 size_t
 986 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
 987                                const char *src, size_t srcLen) const
 988 {
 989     // set up the scene for using memcpy() (which is presumably more efficient
 990     // than copying the bytes one by one)
 991     srcLen = GetLength(src, srcLen);
 992     if ( srcLen == wxNO_LEN )
 993         return wxCONV_FAILED;
 994
 995     const size_t inLen = srcLen / BYTES_PER_CHAR;
 996     if ( dst )
 997     {
 998         if ( dstLen < inLen )
 999             return wxCONV_FAILED;
1000
1001         memcpy(dst, src, srcLen);
1002     }
1003
1004     return inLen;
1005 }
1006
1007 size_t
1008 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1009                                  const wchar_t *src, size_t srcLen) const
1010 {
1011     if ( srcLen == wxNO_LEN )
1012         srcLen = wxWcslen(src) + 1;
1013
1014     srcLen *= BYTES_PER_CHAR;
1015
1016     if ( dst )
1017     {
1018         if ( dstLen < srcLen )
1019             return wxCONV_FAILED;
1020
1021         memcpy(dst, src, srcLen);
1022     }
1023
1024     return srcLen;
1025 }
1026
1027 // ----------------------------------------------------------------------------
1028 // endian-reversing conversions
1029 // ----------------------------------------------------------------------------
1030
1031 size_t
1032 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1033                            const char *src, size_t srcLen) const
1034 {
1035     srcLen = GetLength(src, srcLen);
1036     if ( srcLen == wxNO_LEN )
1037         return wxCONV_FAILED;
1038
1039     srcLen /= BYTES_PER_CHAR;
1040
1041     if ( dst )
1042     {
1043         if ( dstLen < srcLen )
1044             return wxCONV_FAILED;
1045
1046         const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1047         for ( size_t n = 0; n < srcLen; n++, inBuff++ )
1048         {
1049             *dst++ = wxUINT16_SWAP_ALWAYS(*inBuff);
1050         }
1051     }
1052
1053     return srcLen;
1054 }
1055
1056 size_t
1057 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1058                              const wchar_t *src, size_t srcLen) const
1059 {
1060     if ( srcLen == wxNO_LEN )
1061         srcLen = wxWcslen(src) + 1;
1062
1063     srcLen *= BYTES_PER_CHAR;
1064
1065     if ( dst )
1066     {
1067         if ( dstLen < srcLen )
1068             return wxCONV_FAILED;
1069
1070         wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1071         for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
1072         {
1073             *outBuff++ = wxUINT16_SWAP_ALWAYS(*src);
1074         }
1075     }
1076
1077     return srcLen;
1078 }
1079
1080 #else // !WC_UTF16: wchar_t is UTF-32
1081
1082 // ----------------------------------------------------------------------------
1083 // conversions without endianness change
1084 // ----------------------------------------------------------------------------
1085
1086 size_t
1087 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1088                                const char *src, size_t srcLen) const
1089 {
1090     srcLen = GetLength(src, srcLen);
1091     if ( srcLen == wxNO_LEN )
1092         return wxCONV_FAILED;
1093
1094     const size_t inLen = srcLen / BYTES_PER_CHAR;
1095     if ( !dst )
1096     {
1097         // optimization: return maximal space which could be needed for this
1098         // string even if the real size could be smaller if the buffer contains
1099         // any surrogates
1100         return inLen;
1101     }
1102
1103     size_t outLen = 0;
1104     const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1105     for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1106     {
1107         const wxUint32 ch = wxDecodeSurrogate(&inBuff);
1108         if ( !inBuff )
1109             return wxCONV_FAILED;
1110
1111         if ( ++outLen > dstLen )
1112             return wxCONV_FAILED;
1113
1114         *dst++ = ch;
1115     }
1116
1117
1118     return outLen;
1119 }
1120
1121 size_t
1122 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1123                                  const wchar_t *src, size_t srcLen) const
1124 {
1125     if ( srcLen == wxNO_LEN )
1126         srcLen = wxWcslen(src) + 1;
1127
1128     size_t outLen = 0;
1129     wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1130     for ( size_t n = 0; n < srcLen; n++ )
1131     {
1132         wxUint16 cc[2];
1133         const size_t numChars = encode_utf16(*src++, cc);
1134         if ( numChars == wxCONV_FAILED )
1135             return wxCONV_FAILED;
1136
1137         outLen += numChars * BYTES_PER_CHAR;
1138         if ( outBuff )
1139         {
1140             if ( outLen > dstLen )
1141                 return wxCONV_FAILED;
1142
1143             *outBuff++ = cc[0];
1144             if ( numChars == 2 )
1145             {
1146                 // second character of a surrogate
1147                 *outBuff++ = cc[1];
1148             }
1149         }
1150     }
1151
1152     return outLen;
1153 }
1154
1155 // ----------------------------------------------------------------------------
1156 // endian-reversing conversions
1157 // ----------------------------------------------------------------------------
1158
1159 size_t
1160 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1161                            const char *src, size_t srcLen) const
1162 {
1163     srcLen = GetLength(src, srcLen);
1164     if ( srcLen == wxNO_LEN )
1165         return wxCONV_FAILED;
1166
1167     const size_t inLen = srcLen / BYTES_PER_CHAR;
1168     if ( !dst )
1169     {
1170         // optimization: return maximal space which could be needed for this
1171         // string even if the real size could be smaller if the buffer contains
1172         // any surrogates
1173         return inLen;
1174     }
1175
1176     size_t outLen = 0;
1177     const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1178     for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1179     {
1180         wxUint32 ch;
1181         wxUint16 tmp[2];
1182
1183         tmp[0] = wxUINT16_SWAP_ALWAYS(*inBuff);
1184         inBuff++;
1185         tmp[1] = wxUINT16_SWAP_ALWAYS(*inBuff);
1186
1187         const size_t numChars = decode_utf16(tmp, ch);
1188         if ( numChars == wxCONV_FAILED )
1189             return wxCONV_FAILED;
1190
1191         if ( numChars == 2 )
1192             inBuff++;
1193
1194         if ( ++outLen > dstLen )
1195             return wxCONV_FAILED;
1196
1197         *dst++ = ch;
1198     }
1199
1200
1201     return outLen;
1202 }
1203
1204 size_t
1205 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1206                              const wchar_t *src, size_t srcLen) const
1207 {
1208     if ( srcLen == wxNO_LEN )
1209         srcLen = wxWcslen(src) + 1;
1210
1211     size_t outLen = 0;
1212     wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1213     for ( const wchar_t *srcEnd = src + srcLen; src < srcEnd; src++ )
1214     {
1215         wxUint16 cc[2];
1216         const size_t numChars = encode_utf16(*src, cc);
1217         if ( numChars == wxCONV_FAILED )
1218             return wxCONV_FAILED;
1219
1220         outLen += numChars * BYTES_PER_CHAR;
1221         if ( outBuff )
1222         {
1223             if ( outLen > dstLen )
1224                 return wxCONV_FAILED;
1225
1226             *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[0]);
1227             if ( numChars == 2 )
1228             {
1229                 // second character of a surrogate
1230                 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[1]);
1231             }
1232         }
1233     }
1234
1235     return outLen;
1236 }
1237
1238 #endif // WC_UTF16/!WC_UTF16
1239
1240
1241 // ============================================================================
1242 // UTF-32
1243 // ============================================================================
1244
1245 #ifdef WORDS_BIGENDIAN
1246     #define wxMBConvUTF32straight  wxMBConvUTF32BE
1247     #define wxMBConvUTF32swap      wxMBConvUTF32LE
1248 #else
1249     #define wxMBConvUTF32swap      wxMBConvUTF32BE
1250     #define wxMBConvUTF32straight  wxMBConvUTF32LE
1251 #endif
1252
1253
1254 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
1255 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
1256
1257 /* static */
1258 size_t wxMBConvUTF32Base::GetLength(const char *src, size_t srcLen)
1259 {
1260     if ( srcLen == wxNO_LEN )
1261     {
1262         // count the number of bytes in input, including the trailing NULs
1263         const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1264         for ( srcLen = 1; *inBuff++; srcLen++ )
1265             ;
1266
1267         srcLen *= BYTES_PER_CHAR;
1268     }
1269     else // we already have the length
1270     {
1271         // we can only convert an entire number of UTF-32 characters
1272         if ( srcLen % BYTES_PER_CHAR )
1273             return wxCONV_FAILED;
1274     }
1275
1276     return srcLen;
1277 }
1278
1279 // case when in-memory representation is UTF-16
1280 #ifdef WC_UTF16
1281
1282 // ----------------------------------------------------------------------------
1283 // conversions without endianness change
1284 // ----------------------------------------------------------------------------
1285
1286 size_t
1287 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1288                                const char *src, size_t srcLen) const
1289 {
1290     srcLen = GetLength(src, srcLen);
1291     if ( srcLen == wxNO_LEN )
1292         return wxCONV_FAILED;
1293
1294     const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1295     const size_t inLen = srcLen / BYTES_PER_CHAR;
1296     size_t outLen = 0;
1297     for ( size_t n = 0; n < inLen; n++ )
1298     {
1299         wxUint16 cc[2];
1300         const size_t numChars = encode_utf16(*inBuff++, cc);
1301         if ( numChars == wxCONV_FAILED )
1302             return wxCONV_FAILED;
1303
1304         outLen += numChars;
1305         if ( dst )
1306         {
1307             if ( outLen > dstLen )
1308                 return wxCONV_FAILED;
1309
1310             *dst++ = cc[0];
1311             if ( numChars == 2 )
1312             {
1313                 // second character of a surrogate
1314                 *dst++ = cc[1];
1315             }
1316         }
1317     }
1318
1319     return outLen;
1320 }
1321
1322 size_t
1323 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1324                                  const wchar_t *src, size_t srcLen) const
1325 {
1326     if ( srcLen == wxNO_LEN )
1327         srcLen = wxWcslen(src) + 1;
1328
1329     if ( !dst )
1330     {
1331         // optimization: return maximal space which could be needed for this
1332         // string instead of the exact amount which could be less if there are
1333         // any surrogates in the input
1334         //
1335         // we consider that surrogates are rare enough to make it worthwhile to
1336         // avoid running the loop below at the cost of slightly extra memory
1337         // consumption
1338         return srcLen * BYTES_PER_CHAR;
1339     }
1340
1341     wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1342     size_t outLen = 0;
1343     for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1344     {
1345         const wxUint32 ch = wxDecodeSurrogate(&src);
1346         if ( !src )
1347             return wxCONV_FAILED;
1348
1349         outLen += BYTES_PER_CHAR;
1350
1351         if ( outLen > dstLen )
1352             return wxCONV_FAILED;
1353
1354         *outBuff++ = ch;
1355     }
1356
1357     return outLen;
1358 }
1359
1360 // ----------------------------------------------------------------------------
1361 // endian-reversing conversions
1362 // ----------------------------------------------------------------------------
1363
1364 size_t
1365 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1366                            const char *src, size_t srcLen) const
1367 {
1368     srcLen = GetLength(src, srcLen);
1369     if ( srcLen == wxNO_LEN )
1370         return wxCONV_FAILED;
1371
1372     const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1373     const size_t inLen = srcLen / BYTES_PER_CHAR;
1374     size_t outLen = 0;
1375     for ( size_t n = 0; n < inLen; n++, inBuff++ )
1376     {
1377         wxUint16 cc[2];
1378         const size_t numChars = encode_utf16(wxUINT32_SWAP_ALWAYS(*inBuff), cc);
1379         if ( numChars == wxCONV_FAILED )
1380             return wxCONV_FAILED;
1381
1382         outLen += numChars;
1383         if ( dst )
1384         {
1385             if ( outLen > dstLen )
1386                 return wxCONV_FAILED;
1387
1388             *dst++ = cc[0];
1389             if ( numChars == 2 )
1390             {
1391                 // second character of a surrogate
1392                 *dst++ = cc[1];
1393             }
1394         }
1395     }
1396
1397     return outLen;
1398 }
1399
1400 size_t
1401 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1402                              const wchar_t *src, size_t srcLen) const
1403 {
1404     if ( srcLen == wxNO_LEN )
1405         srcLen = wxWcslen(src) + 1;
1406
1407     if ( !dst )
1408     {
1409         // optimization: return maximal space which could be needed for this
1410         // string instead of the exact amount which could be less if there are
1411         // any surrogates in the input
1412         //
1413         // we consider that surrogates are rare enough to make it worthwhile to
1414         // avoid running the loop below at the cost of slightly extra memory
1415         // consumption
1416         return srcLen*BYTES_PER_CHAR;
1417     }
1418
1419     wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1420     size_t outLen = 0;
1421     for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1422     {
1423         const wxUint32 ch = wxDecodeSurrogate(&src);
1424         if ( !src )
1425             return wxCONV_FAILED;
1426
1427         outLen += BYTES_PER_CHAR;
1428
1429         if ( outLen > dstLen )
1430             return wxCONV_FAILED;
1431
1432         *outBuff++ = wxUINT32_SWAP_ALWAYS(ch);
1433     }
1434
1435     return outLen;
1436 }
1437
1438 #else // !WC_UTF16: wchar_t is UTF-32
1439
1440 // ----------------------------------------------------------------------------
1441 // conversions without endianness change
1442 // ----------------------------------------------------------------------------
1443
1444 size_t
1445 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1446                                const char *src, size_t srcLen) const
1447 {
1448     // use memcpy() as it should be much faster than hand-written loop
1449     srcLen = GetLength(src, srcLen);
1450     if ( srcLen == wxNO_LEN )
1451         return wxCONV_FAILED;
1452
1453     const size_t inLen = srcLen/BYTES_PER_CHAR;
1454     if ( dst )
1455     {
1456         if ( dstLen < inLen )
1457             return wxCONV_FAILED;
1458
1459         memcpy(dst, src, srcLen);
1460     }
1461
1462     return inLen;
1463 }
1464
1465 size_t
1466 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1467                                  const wchar_t *src, size_t srcLen) const
1468 {
1469     if ( srcLen == wxNO_LEN )
1470         srcLen = wxWcslen(src) + 1;
1471
1472     srcLen *= BYTES_PER_CHAR;
1473
1474     if ( dst )
1475     {
1476         if ( dstLen < srcLen )
1477             return wxCONV_FAILED;
1478
1479         memcpy(dst, src, srcLen);
1480     }
1481
1482     return srcLen;
1483 }
1484
1485 // ----------------------------------------------------------------------------
1486 // endian-reversing conversions
1487 // ----------------------------------------------------------------------------
1488
1489 size_t
1490 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1491                            const char *src, size_t srcLen) const
1492 {
1493     srcLen = GetLength(src, srcLen);
1494     if ( srcLen == wxNO_LEN )
1495         return wxCONV_FAILED;
1496
1497     srcLen /= BYTES_PER_CHAR;
1498
1499     if ( dst )
1500     {
1501         if ( dstLen < srcLen )
1502             return wxCONV_FAILED;
1503
1504         const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1505         for ( size_t n = 0; n < srcLen; n++, inBuff++ )
1506         {
1507             *dst++ = wxUINT32_SWAP_ALWAYS(*inBuff);
1508         }
1509     }
1510
1511     return srcLen;
1512 }
1513
1514 size_t
1515 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1516                              const wchar_t *src, size_t srcLen) const
1517 {
1518     if ( srcLen == wxNO_LEN )
1519         srcLen = wxWcslen(src) + 1;
1520
1521     srcLen *= BYTES_PER_CHAR;
1522
1523     if ( dst )
1524     {
1525         if ( dstLen < srcLen )
1526             return wxCONV_FAILED;
1527
1528         wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1529         for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
1530         {
1531             *outBuff++ = wxUINT32_SWAP_ALWAYS(*src);
1532         }
1533     }
1534
1535     return srcLen;
1536 }
1537
1538 #endif // WC_UTF16/!WC_UTF16
1539
1540
1541 // ============================================================================
1542 // The classes doing conversion using the iconv_xxx() functions
1543 // ============================================================================
1544
1545 #ifdef HAVE_ICONV
1546
1547 // VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
1548 //     E2BIG if output buffer is _exactly_ as big as needed. Such case is
1549 //     (unless there's yet another bug in glibc) the only case when iconv()
1550 //     returns with (size_t)-1 (which means error) and says there are 0 bytes
1551 //     left in the input buffer -- when _real_ error occurs,
1552 //     bytes-left-in-input buffer is non-zero. Hence, this alternative test for
1553 //     iconv() failure.
1554 //     [This bug does not appear in glibc 2.2.]
1555 #if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
1556 #define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
1557                                      (errno != E2BIG || bufLeft != 0))
1558 #else
1559 #define ICONV_FAILED(cres, bufLeft)  (cres == (size_t)-1)
1560 #endif
1561
1562 #define ICONV_CHAR_CAST(x)  ((ICONV_CONST char **)(x))
1563
1564 #define ICONV_T_INVALID ((iconv_t)-1)
1565
1566 #if SIZEOF_WCHAR_T == 4
1567     #define WC_BSWAP    wxUINT32_SWAP_ALWAYS
1568     #define WC_ENC      wxFONTENCODING_UTF32
1569 #elif SIZEOF_WCHAR_T == 2
1570     #define WC_BSWAP    wxUINT16_SWAP_ALWAYS
1571     #define WC_ENC      wxFONTENCODING_UTF16
1572 #else // sizeof(wchar_t) != 2 nor 4
1573     // does this ever happen?
1574     #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
1575 #endif
1576
1577 // ----------------------------------------------------------------------------
1578 // wxMBConv_iconv: encapsulates an iconv character set
1579 // ----------------------------------------------------------------------------
1580
1581 class wxMBConv_iconv : public wxMBConv
1582 {
1583 public:
1584     wxMBConv_iconv(const char *name);
1585     virtual ~wxMBConv_iconv();
1586
1587     virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const;
1588     virtual size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const;
1589
1590     // classify this encoding as explained in wxMBConv::GetMBNulLen() comment
1591     virtual size_t GetMBNulLen() const;
1592
1593 #if wxUSE_UNICODE_UTF8
1594     virtual bool IsUTF8() const;
1595 #endif
1596
1597     virtual wxMBConv *Clone() const
1598     {
1599         wxMBConv_iconv *p = new wxMBConv_iconv(m_name.ToAscii());
1600         p->m_minMBCharWidth = m_minMBCharWidth;
1601         return p;
1602     }
1603
1604     bool IsOk() const
1605         { return (m2w != ICONV_T_INVALID) && (w2m != ICONV_T_INVALID); }
1606
1607 protected:
1608     // the iconv handlers used to translate from multibyte
1609     // to wide char and in the other direction
1610     iconv_t m2w,
1611             w2m;
1612
1613 #if wxUSE_THREADS
1614     // guards access to m2w and w2m objects
1615     wxMutex m_iconvMutex;
1616 #endif
1617
1618 private:
1619     // the name (for iconv_open()) of a wide char charset -- if none is
1620     // available on this machine, it will remain NULL
1621     static wxString ms_wcCharsetName;
1622
1623     // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
1624     // different endian-ness than the native one
1625     static bool ms_wcNeedsSwap;
1626
1627
1628     // name of the encoding handled by this conversion
1629     wxString m_name;
1630
1631     // cached result of GetMBNulLen(); set to 0 meaning "unknown"
1632     // initially
1633     size_t m_minMBCharWidth;
1634 };
1635
1636 // make the constructor available for unit testing
1637 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_iconv( const char* name )
1638 {
1639     wxMBConv_iconv* result = new wxMBConv_iconv( name );
1640     if ( !result->IsOk() )
1641     {
1642         delete result;
1643         return 0;
1644     }
1645
1646     return result;
1647 }
1648
1649 wxString wxMBConv_iconv::ms_wcCharsetName;
1650 bool wxMBConv_iconv::ms_wcNeedsSwap = false;
1651
1652 wxMBConv_iconv::wxMBConv_iconv(const char *name)
1653               : m_name(name)
1654 {
1655     m_minMBCharWidth = 0;
1656
1657     // check for charset that represents wchar_t:
1658     if ( ms_wcCharsetName.empty() )
1659     {
1660         wxLogTrace(TRACE_STRCONV, _T("Looking for wide char codeset:"));
1661
1662 #if wxUSE_FONTMAP
1663         const wxChar **names = wxFontMapperBase::GetAllEncodingNames(WC_ENC);
1664 #else // !wxUSE_FONTMAP
1665         static const wxChar *names_static[] =
1666         {
1667 #if SIZEOF_WCHAR_T == 4
1668             _T("UCS-4"),
1669 #elif SIZEOF_WCHAR_T = 2
1670             _T("UCS-2"),
1671 #endif
1672             NULL
1673         };
1674         const wxChar **names = names_static;
1675 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
1676
1677         for ( ; *names && ms_wcCharsetName.empty(); ++names )
1678         {
1679             const wxString nameCS(*names);
1680
1681             // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
1682             wxString nameXE(nameCS);
1683
1684 #ifdef WORDS_BIGENDIAN
1685                 nameXE += _T("BE");
1686 #else // little endian
1687                 nameXE += _T("LE");
1688 #endif
1689
1690             wxLogTrace(TRACE_STRCONV, _T("  trying charset \"%s\""),
1691                        nameXE.c_str());
1692
1693             m2w = iconv_open(nameXE.ToAscii(), name);
1694             if ( m2w == ICONV_T_INVALID )
1695             {
1696                 // try charset w/o bytesex info (e.g. "UCS4")
1697                 wxLogTrace(TRACE_STRCONV, _T("  trying charset \"%s\""),
1698                            nameCS.c_str());
1699                 m2w = iconv_open(nameCS.ToAscii(), name);
1700
1701                 // and check for bytesex ourselves:
1702                 if ( m2w != ICONV_T_INVALID )
1703                 {
1704                     char    buf[2], *bufPtr;
1705                     wchar_t wbuf[2], *wbufPtr;
1706                     size_t  insz, outsz;
1707                     size_t  res;
1708
1709                     buf[0] = 'A';
1710                     buf[1] = 0;
1711                     wbuf[0] = 0;
1712                     insz = 2;
1713                     outsz = SIZEOF_WCHAR_T * 2;
1714                     wbufPtr = wbuf;
1715                     bufPtr = buf;
1716
1717                     res = iconv(
1718                         m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
1719                         (char**)&wbufPtr, &outsz);
1720
1721                     if (ICONV_FAILED(res, insz))
1722                     {
1723                         wxLogLastError(wxT("iconv"));
1724                         wxLogError(_("Conversion to charset '%s' doesn't work."),
1725                                    nameCS.c_str());
1726                     }
1727                     else // ok, can convert to this encoding, remember it
1728                     {
1729                         ms_wcCharsetName = nameCS;
1730                         ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
1731                     }
1732                 }
1733             }
1734             else // use charset not requiring byte swapping
1735             {
1736                 ms_wcCharsetName = nameXE;
1737             }
1738         }
1739
1740         wxLogTrace(TRACE_STRCONV,
1741                    wxT("iconv wchar_t charset is \"%s\"%s"),
1742                    ms_wcCharsetName.empty() ? wxString("<none>")
1743                                             : ms_wcCharsetName,
1744                    ms_wcNeedsSwap ? _T(" (needs swap)")
1745                                   : _T(""));
1746     }
1747     else // we already have ms_wcCharsetName
1748     {
1749         m2w = iconv_open(ms_wcCharsetName.ToAscii(), name);
1750     }
1751
1752     if ( ms_wcCharsetName.empty() )
1753     {
1754         w2m = ICONV_T_INVALID;
1755     }
1756     else
1757     {
1758         w2m = iconv_open(name, ms_wcCharsetName.ToAscii());
1759         if ( w2m == ICONV_T_INVALID )
1760         {
1761             wxLogTrace(TRACE_STRCONV,
1762                        wxT("\"%s\" -> \"%s\" works but not the converse!?"),
1763                        ms_wcCharsetName.c_str(), name);
1764         }
1765     }
1766 }
1767
1768 wxMBConv_iconv::~wxMBConv_iconv()
1769 {
1770     if ( m2w != ICONV_T_INVALID )
1771         iconv_close(m2w);
1772     if ( w2m != ICONV_T_INVALID )
1773         iconv_close(w2m);
1774 }
1775
1776 size_t wxMBConv_iconv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1777 {
1778     // find the string length: notice that must be done differently for
1779     // NUL-terminated strings and UTF-16/32 which are terminated with 2/4 NULs
1780     size_t inbuf;
1781     const size_t nulLen = GetMBNulLen();
1782     switch ( nulLen )
1783     {
1784         default:
1785             return wxCONV_FAILED;
1786
1787         case 1:
1788             inbuf = strlen(psz); // arguably more optimized than our version
1789             break;
1790
1791         case 2:
1792         case 4:
1793             // for UTF-16/32 not only we need to have 2/4 consecutive NULs but
1794             // they also have to start at character boundary and not span two
1795             // adjacent characters
1796             const char *p;
1797             for ( p = psz; NotAllNULs(p, nulLen); p += nulLen )
1798                 ;
1799             inbuf = p - psz;
1800             break;
1801     }
1802
1803 #if wxUSE_THREADS
1804     // NB: iconv() is MT-safe, but each thread must use its own iconv_t handle.
1805     //     Unfortunately there are a couple of global wxCSConv objects such as
1806     //     wxConvLocal that are used all over wx code, so we have to make sure
1807     //     the handle is used by at most one thread at the time. Otherwise
1808     //     only a few wx classes would be safe to use from non-main threads
1809     //     as MB<->WC conversion would fail "randomly".
1810     wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1811 #endif // wxUSE_THREADS
1812
1813     size_t outbuf = n * SIZEOF_WCHAR_T;
1814     size_t res, cres;
1815     // VS: Use these instead of psz, buf because iconv() modifies its arguments:
1816     wchar_t *bufPtr = buf;
1817     const char *pszPtr = psz;
1818
1819     if (buf)
1820     {
1821         // have destination buffer, convert there
1822         cres = iconv(m2w,
1823                      ICONV_CHAR_CAST(&pszPtr), &inbuf,
1824                      (char**)&bufPtr, &outbuf);
1825         res = n - (outbuf / SIZEOF_WCHAR_T);
1826
1827         if (ms_wcNeedsSwap)
1828         {
1829             // convert to native endianness
1830             for ( unsigned i = 0; i < res; i++ )
1831                 buf[n] = WC_BSWAP(buf[i]);
1832         }
1833
1834         // NUL-terminate the string if there is any space left
1835         if (res < n)
1836             buf[res] = 0;
1837     }
1838     else
1839     {
1840         // no destination buffer... convert using temp buffer
1841         // to calculate destination buffer requirement
1842         wchar_t tbuf[8];
1843         res = 0;
1844
1845         do
1846         {
1847             bufPtr = tbuf;
1848             outbuf = 8 * SIZEOF_WCHAR_T;
1849
1850             cres = iconv(m2w,
1851                          ICONV_CHAR_CAST(&pszPtr), &inbuf,
1852                          (char**)&bufPtr, &outbuf );
1853
1854             res += 8 - (outbuf / SIZEOF_WCHAR_T);
1855         }
1856         while ((cres == (size_t)-1) && (errno == E2BIG));
1857     }
1858
1859     if (ICONV_FAILED(cres, inbuf))
1860     {
1861         //VS: it is ok if iconv fails, hence trace only
1862         wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1863         return wxCONV_FAILED;
1864     }
1865
1866     return res;
1867 }
1868
1869 size_t wxMBConv_iconv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1870 {
1871 #if wxUSE_THREADS
1872     // NB: explained in MB2WC
1873     wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1874 #endif
1875
1876     size_t inlen = wxWcslen(psz);
1877     size_t inbuf = inlen * SIZEOF_WCHAR_T;
1878     size_t outbuf = n;
1879     size_t res, cres;
1880
1881     wchar_t *tmpbuf = 0;
1882
1883     if (ms_wcNeedsSwap)
1884     {
1885         // need to copy to temp buffer to switch endianness
1886         // (doing WC_BSWAP twice on the original buffer won't help, as it
1887         //  could be in read-only memory, or be accessed in some other thread)
1888         tmpbuf = (wchar_t *)malloc(inbuf + SIZEOF_WCHAR_T);
1889         for ( size_t i = 0; i < inlen; i++ )
1890             tmpbuf[n] = WC_BSWAP(psz[i]);
1891
1892         tmpbuf[inlen] = L'\0';
1893         psz = tmpbuf;
1894     }
1895
1896     if (buf)
1897     {
1898         // have destination buffer, convert there
1899         cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1900
1901         res = n - outbuf;
1902
1903         // NB: iconv was given only wcslen(psz) characters on input, and so
1904         //     it couldn't convert the trailing zero. Let's do it ourselves
1905         //     if there's some room left for it in the output buffer.
1906         if (res < n)
1907             buf[0] = 0;
1908     }
1909     else
1910     {
1911         // no destination buffer: convert using temp buffer
1912         // to calculate destination buffer requirement
1913         char tbuf[16];
1914         res = 0;
1915         do
1916         {
1917             buf = tbuf;
1918             outbuf = 16;
1919
1920             cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1921
1922             res += 16 - outbuf;
1923         }
1924         while ((cres == (size_t)-1) && (errno == E2BIG));
1925     }
1926
1927     if (ms_wcNeedsSwap)
1928     {
1929         free(tmpbuf);
1930     }
1931
1932     if (ICONV_FAILED(cres, inbuf))
1933     {
1934         wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1935         return wxCONV_FAILED;
1936     }
1937
1938     return res;
1939 }
1940
1941 size_t wxMBConv_iconv::GetMBNulLen() const
1942 {
1943     if ( m_minMBCharWidth == 0 )
1944     {
1945         wxMBConv_iconv * const self = wxConstCast(this, wxMBConv_iconv);
1946
1947 #if wxUSE_THREADS
1948         // NB: explained in MB2WC
1949         wxMutexLocker lock(self->m_iconvMutex);
1950 #endif
1951
1952         const wchar_t *wnul = L"";
1953         char buf[8]; // should be enough for NUL in any encoding
1954         size_t inLen = sizeof(wchar_t),
1955                outLen = WXSIZEOF(buf);
1956         char *inBuff = (char *)wnul;
1957         char *outBuff = buf;
1958         if ( iconv(w2m, ICONV_CHAR_CAST(&inBuff), &inLen, &outBuff, &outLen) == (size_t)-1 )
1959         {
1960             self->m_minMBCharWidth = (size_t)-1;
1961         }
1962         else // ok
1963         {
1964             self->m_minMBCharWidth = outBuff - buf;
1965         }
1966     }
1967
1968     return m_minMBCharWidth;
1969 }
1970
1971 #if wxUSE_UNICODE_UTF8
1972 bool wxMBConv_iconv::IsUTF8() const
1973 {
1974     return wxStricmp(m_name, "UTF-8") == 0 ||
1975            wxStricmp(m_name, "UTF8") == 0;
1976 }
1977 #endif
1978
1979 #endif // HAVE_ICONV
1980
1981
1982 // ============================================================================
1983 // Win32 conversion classes
1984 // ============================================================================
1985
1986 #ifdef wxHAVE_WIN32_MB2WC
1987
1988 // from utils.cpp
1989 #if wxUSE_FONTMAP
1990 extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const char *charset);
1991 extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
1992 #endif
1993
1994 class wxMBConv_win32 : public wxMBConv
1995 {
1996 public:
1997     wxMBConv_win32()
1998     {
1999         m_CodePage = CP_ACP;
2000         m_minMBCharWidth = 0;
2001     }
2002
2003     wxMBConv_win32(const wxMBConv_win32& conv)
2004         : wxMBConv()
2005     {
2006         m_CodePage = conv.m_CodePage;
2007         m_minMBCharWidth = conv.m_minMBCharWidth;
2008     }
2009
2010 #if wxUSE_FONTMAP
2011     wxMBConv_win32(const char* name)
2012     {
2013         m_CodePage = wxCharsetToCodepage(name);
2014         m_minMBCharWidth = 0;
2015     }
2016
2017     wxMBConv_win32(wxFontEncoding encoding)
2018     {
2019         m_CodePage = wxEncodingToCodepage(encoding);
2020         m_minMBCharWidth = 0;
2021     }
2022 #endif // wxUSE_FONTMAP
2023
2024     virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2025     {
2026         // note that we have to use MB_ERR_INVALID_CHARS flag as it without it
2027         // the behaviour is not compatible with the Unix version (using iconv)
2028         // and break the library itself, e.g. wxTextInputStream::NextChar()
2029         // wouldn't work if reading an incomplete MB char didn't result in an
2030         // error
2031         //
2032         // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
2033         // Win XP or newer and it is not supported for UTF-[78] so we always
2034         // use our own conversions in this case. See
2035         //     http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
2036         //     http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
2037         if ( m_CodePage == CP_UTF8 )
2038         {
2039             return wxMBConvUTF8().MB2WC(buf, psz, n);
2040         }
2041
2042         if ( m_CodePage == CP_UTF7 )
2043         {
2044             return wxMBConvUTF7().MB2WC(buf, psz, n);
2045         }
2046
2047         int flags = 0;
2048         if ( (m_CodePage < 50000 && m_CodePage != CP_SYMBOL) &&
2049                 IsAtLeastWin2kSP4() )
2050         {
2051             flags = MB_ERR_INVALID_CHARS;
2052         }
2053
2054         const size_t len = ::MultiByteToWideChar
2055                              (
2056                                 m_CodePage,     // code page
2057                                 flags,          // flags: fall on error
2058                                 psz,            // input string
2059                                 -1,             // its length (NUL-terminated)
2060                                 buf,            // output string
2061                                 buf ? n : 0     // size of output buffer
2062                              );
2063         if ( !len )
2064         {
2065             // function totally failed
2066             return wxCONV_FAILED;
2067         }
2068
2069         // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
2070         // check if we succeeded, by doing a double trip:
2071         if ( !flags && buf )
2072         {
2073             const size_t mbLen = strlen(psz);
2074             wxCharBuffer mbBuf(mbLen);
2075             if ( ::WideCharToMultiByte
2076                    (
2077                       m_CodePage,
2078                       0,
2079                       buf,
2080                       -1,
2081                       mbBuf.data(),
2082                       mbLen + 1,        // size in bytes, not length
2083                       NULL,
2084                       NULL
2085                    ) == 0 ||
2086                   strcmp(mbBuf, psz) != 0 )
2087             {
2088                 // we didn't obtain the same thing we started from, hence
2089                 // the conversion was lossy and we consider that it failed
2090                 return wxCONV_FAILED;
2091             }
2092         }
2093
2094         // note that it returns count of written chars for buf != NULL and size
2095         // of the needed buffer for buf == NULL so in either case the length of
2096         // the string (which never includes the terminating NUL) is one less
2097         return len - 1;
2098     }
2099
2100     virtual size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
2101     {
2102         /*
2103             we have a problem here: by default, WideCharToMultiByte() may
2104             replace characters unrepresentable in the target code page with bad
2105             quality approximations such as turning "1/2" symbol (U+00BD) into
2106             "1" for the code pages which don't have it and we, obviously, want
2107             to avoid this at any price
2108
2109             the trouble is that this function does it _silently_, i.e. it won't
2110             even tell us whether it did or not... Win98/2000 and higher provide
2111             WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
2112             we have to resort to a round trip, i.e. check that converting back
2113             results in the same string -- this is, of course, expensive but
2114             otherwise we simply can't be sure to not garble the data.
2115          */
2116
2117         // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
2118         // it doesn't work with CJK encodings (which we test for rather roughly
2119         // here...) nor with UTF-7/8 nor, of course, with Windows versions not
2120         // supporting it
2121         BOOL usedDef wxDUMMY_INITIALIZE(false);
2122         BOOL *pUsedDef;
2123         int flags;
2124         if ( CanUseNoBestFit() && m_CodePage < 50000 )
2125         {
2126             // it's our lucky day
2127             flags = WC_NO_BEST_FIT_CHARS;
2128             pUsedDef = &usedDef;
2129         }
2130         else // old system or unsupported encoding
2131         {
2132             flags = 0;
2133             pUsedDef = NULL;
2134         }
2135
2136         const size_t len = ::WideCharToMultiByte
2137                              (
2138                                 m_CodePage,     // code page
2139                                 flags,          // either none or no best fit
2140                                 pwz,            // input string
2141                                 -1,             // it is (wide) NUL-terminated
2142                                 buf,            // output buffer
2143                                 buf ? n : 0,    // and its size
2144                                 NULL,           // default "replacement" char
2145                                 pUsedDef        // [out] was it used?
2146                              );
2147
2148         if ( !len )
2149         {
2150             // function totally failed
2151             return wxCONV_FAILED;
2152         }
2153
2154         // if we were really converting, check if we succeeded
2155         if ( buf )
2156         {
2157             if ( flags )
2158             {
2159                 // check if the conversion failed, i.e. if any replacements
2160                 // were done
2161                 if ( usedDef )
2162                     return wxCONV_FAILED;
2163             }
2164             else // we must resort to double tripping...
2165             {
2166                 wxWCharBuffer wcBuf(n);
2167                 if ( MB2WC(wcBuf.data(), buf, n) == wxCONV_FAILED ||
2168                         wcscmp(wcBuf, pwz) != 0 )
2169                 {
2170                     // we didn't obtain the same thing we started from, hence
2171                     // the conversion was lossy and we consider that it failed
2172                     return wxCONV_FAILED;
2173                 }
2174             }
2175         }
2176
2177         // see the comment above for the reason of "len - 1"
2178         return len - 1;
2179     }
2180
2181     virtual size_t GetMBNulLen() const
2182     {
2183         if ( m_minMBCharWidth == 0 )
2184         {
2185             int len = ::WideCharToMultiByte
2186                         (
2187                             m_CodePage,     // code page
2188                             0,              // no flags
2189                             L"",            // input string
2190                             1,              // translate just the NUL
2191                             NULL,           // output buffer
2192                             0,              // and its size
2193                             NULL,           // no replacement char
2194                             NULL            // [out] don't care if it was used
2195                         );
2196
2197             wxMBConv_win32 * const self = wxConstCast(this, wxMBConv_win32);
2198             switch ( len )
2199             {
2200                 default:
2201                     wxLogDebug(_T("Unexpected NUL length %d"), len);
2202                     self->m_minMBCharWidth = (size_t)-1;
2203                     break;
2204
2205                 case 0:
2206                     self->m_minMBCharWidth = (size_t)-1;
2207                     break;
2208
2209                 case 1:
2210                 case 2:
2211                 case 4:
2212                     self->m_minMBCharWidth = len;
2213                     break;
2214             }
2215         }
2216
2217         return m_minMBCharWidth;
2218     }
2219
2220     virtual wxMBConv *Clone() const { return new wxMBConv_win32(*this); }
2221
2222     bool IsOk() const { return m_CodePage != -1; }
2223
2224 private:
2225     static bool CanUseNoBestFit()
2226     {
2227         static int s_isWin98Or2k = -1;
2228
2229         if ( s_isWin98Or2k == -1 )
2230         {
2231             int verMaj, verMin;
2232             switch ( wxGetOsVersion(&verMaj, &verMin) )
2233             {
2234                 case wxOS_WINDOWS_9X:
2235                     s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
2236                     break;
2237
2238                 case wxOS_WINDOWS_NT:
2239                     s_isWin98Or2k = verMaj >= 5;
2240                     break;
2241
2242                 default:
2243                     // unknown: be conservative by default
2244                     s_isWin98Or2k = 0;
2245                     break;
2246             }
2247
2248             wxASSERT_MSG( s_isWin98Or2k != -1, _T("should be set above") );
2249         }
2250
2251         return s_isWin98Or2k == 1;
2252     }
2253
2254     static bool IsAtLeastWin2kSP4()
2255     {
2256 #ifdef __WXWINCE__
2257         return false;
2258 #else
2259         static int s_isAtLeastWin2kSP4 = -1;
2260
2261         if ( s_isAtLeastWin2kSP4 == -1 )
2262         {
2263             OSVERSIONINFOEX ver;
2264
2265             memset(&ver, 0, sizeof(ver));
2266             ver.dwOSVersionInfoSize = sizeof(ver);
2267             GetVersionEx((OSVERSIONINFO*)&ver);
2268
2269             s_isAtLeastWin2kSP4 =
2270               ((ver.dwMajorVersion > 5) || // Vista+
2271                (ver.dwMajorVersion == 5 && ver.dwMinorVersion > 0) || // XP/2003
2272                (ver.dwMajorVersion == 5 && ver.dwMinorVersion == 0 &&
2273                ver.wServicePackMajor >= 4)) // 2000 SP4+
2274               ? 1 : 0;
2275         }
2276
2277         return s_isAtLeastWin2kSP4 == 1;
2278 #endif
2279     }
2280
2281
2282     // the code page we're working with
2283     long m_CodePage;
2284
2285     // cached result of GetMBNulLen(), set to 0 initially meaning
2286     // "unknown"
2287     size_t m_minMBCharWidth;
2288 };
2289
2290 #endif // wxHAVE_WIN32_MB2WC
2291
2292
2293 // ============================================================================
2294 // wxEncodingConverter based conversion classes
2295 // ============================================================================
2296
2297 #if wxUSE_FONTMAP
2298
2299 class wxMBConv_wxwin : public wxMBConv
2300 {
2301 private:
2302     void Init()
2303     {
2304         // Refuse to use broken wxEncodingConverter code for Mac-specific encodings.
2305         // The wxMBConv_cf class does a better job.
2306         m_ok = (m_enc < wxFONTENCODING_MACMIN || m_enc > wxFONTENCODING_MACMAX) &&
2307                m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
2308                w2m.Init(wxFONTENCODING_UNICODE, m_enc);
2309     }
2310
2311 public:
2312     // temporarily just use wxEncodingConverter stuff,
2313     // so that it works while a better implementation is built
2314     wxMBConv_wxwin(const char* name)
2315     {
2316         if (name)
2317             m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
2318         else
2319             m_enc = wxFONTENCODING_SYSTEM;
2320
2321         Init();
2322     }
2323
2324     wxMBConv_wxwin(wxFontEncoding enc)
2325     {
2326         m_enc = enc;
2327
2328         Init();
2329     }
2330
2331     size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
2332     {
2333         size_t inbuf = strlen(psz);
2334         if (buf)
2335         {
2336             if (!m2w.Convert(psz, buf))
2337                 return wxCONV_FAILED;
2338         }
2339         return inbuf;
2340     }
2341
2342     size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
2343     {
2344         const size_t inbuf = wxWcslen(psz);
2345         if (buf)
2346         {
2347             if (!w2m.Convert(psz, buf))
2348                 return wxCONV_FAILED;
2349         }
2350
2351         return inbuf;
2352     }
2353
2354     virtual size_t GetMBNulLen() const
2355     {
2356         switch ( m_enc )
2357         {
2358             case wxFONTENCODING_UTF16BE:
2359             case wxFONTENCODING_UTF16LE:
2360                 return 2;
2361
2362             case wxFONTENCODING_UTF32BE:
2363             case wxFONTENCODING_UTF32LE:
2364                 return 4;
2365
2366             default:
2367                 return 1;
2368         }
2369     }
2370
2371     virtual wxMBConv *Clone() const { return new wxMBConv_wxwin(m_enc); }
2372
2373     bool IsOk() const { return m_ok; }
2374
2375 public:
2376     wxFontEncoding m_enc;
2377     wxEncodingConverter m2w, w2m;
2378
2379 private:
2380     // were we initialized successfully?
2381     bool m_ok;
2382
2383     DECLARE_NO_COPY_CLASS(wxMBConv_wxwin)
2384 };
2385
2386 // make the constructors available for unit testing
2387 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_wxwin( const char* name )
2388 {
2389     wxMBConv_wxwin* result = new wxMBConv_wxwin( name );
2390     if ( !result->IsOk() )
2391     {
2392         delete result;
2393         return 0;
2394     }
2395
2396     return result;
2397 }
2398
2399 #endif // wxUSE_FONTMAP
2400
2401 // ============================================================================
2402 // wxCSConv implementation
2403 // ============================================================================
2404
2405 void wxCSConv::Init()
2406 {
2407     m_name = NULL;
2408     m_convReal =  NULL;
2409     m_deferred = true;
2410 }
2411
2412 wxCSConv::wxCSConv(const wxString& charset)
2413 {
2414     Init();
2415
2416     if ( !charset.empty() )
2417     {
2418         SetName(charset.ToAscii());
2419     }
2420
2421 #if wxUSE_FONTMAP
2422     m_encoding = wxFontMapperBase::GetEncodingFromName(charset);
2423 #else
2424     m_encoding = wxFONTENCODING_SYSTEM;
2425 #endif
2426 }
2427
2428 wxCSConv::wxCSConv(wxFontEncoding encoding)
2429 {
2430     if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
2431     {
2432         wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
2433
2434         encoding = wxFONTENCODING_SYSTEM;
2435     }
2436
2437     Init();
2438
2439     m_encoding = encoding;
2440 }
2441
2442 wxCSConv::~wxCSConv()
2443 {
2444     Clear();
2445 }
2446
2447 wxCSConv::wxCSConv(const wxCSConv& conv)
2448         : wxMBConv()
2449 {
2450     Init();
2451
2452     SetName(conv.m_name);
2453     m_encoding = conv.m_encoding;
2454 }
2455
2456 wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
2457 {
2458     Clear();
2459
2460     SetName(conv.m_name);
2461     m_encoding = conv.m_encoding;
2462
2463     return *this;
2464 }
2465
2466 void wxCSConv::Clear()
2467 {
2468     free(m_name);
2469     delete m_convReal;
2470
2471     m_name = NULL;
2472     m_convReal = NULL;
2473 }
2474
2475 void wxCSConv::SetName(const char *charset)
2476 {
2477     if (charset)
2478     {
2479         m_name = strdup(charset);
2480         m_deferred = true;
2481     }
2482 }
2483
#if wxUSE_FONTMAP

// hash map type associating a charset name with a font encoding
WX_DECLARE_HASH_MAP( wxFontEncoding, wxString, wxIntegerHash, wxIntegerEqual,
                     wxEncodingNameCache );

// cache used by wxCSConv::DoCreate(): for each encoding it remembers the name
// for which a working wxMBConv_iconv could be created, or an empty string if
// none of the names of this encoding worked
static wxEncodingNameCache gs_nameCache;
#endif
2491
// Create the converter object doing the real work for this wxCSConv.
//
// The possible implementations are tried in the order described below and the
// first one for which IsOk() returns true is returned; it is allocated with
// new and CreateConvIfNeeded() stores it in m_convReal, which Clear() deletes.
//
// NULL is returned if no conversion object is needed (ISO8859-1 or default
// encoding), if a previous failure to find an iconv name for the encoding was
// cached and, after logging an error, if none of the methods worked.
wxMBConv *wxCSConv::DoCreate() const
{
#if wxUSE_FONTMAP
    wxLogTrace(TRACE_STRCONV,
               wxT("creating conversion for %s"),
               (m_name ? m_name
                       : (const char*)wxFontMapperBase::GetEncodingName(m_encoding).mb_str()));
#endif // wxUSE_FONTMAP

    // check for the special case of ASCII or ISO8859-1 charset: as we have
    // special knowledge of it anyhow, we don't need to create a special
    // conversion object
    if ( m_encoding == wxFONTENCODING_ISO8859_1 ||
            m_encoding == wxFONTENCODING_DEFAULT )
    {
        // don't convert at all
        return NULL;
    }

    // we trust OS to do conversion better than we can so try external
    // conversion methods first
    //
    // the full order is:
    //      1. OS conversion (iconv() under Unix or Win32 API)
    //      2. hard coded conversions for UTF
    //      3. wxEncodingConverter as fall back

    // step (1)
#ifdef HAVE_ICONV
#if !wxUSE_FONTMAP
    // without the font mapper we can only use iconv if we have a name
    if ( m_name )
#endif // !wxUSE_FONTMAP
    {
#if wxUSE_FONTMAP
        wxFontEncoding encoding(m_encoding);
#endif

        if ( m_name )
        {
            // first try the name exactly as it was given to us
            wxMBConv_iconv *conv = new wxMBConv_iconv(m_name);
            if ( conv->IsOk() )
                return conv;

            delete conv;

#if wxUSE_FONTMAP
            // it didn't work: find the encoding corresponding to this name to
            // be able to try the other names of the same encoding below
            encoding =
                wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
#endif // wxUSE_FONTMAP
        }
#if wxUSE_FONTMAP
        {
            // check if we already know which name works for this encoding
            const wxEncodingNameCache::iterator it = gs_nameCache.find(encoding);
            if ( it != gs_nameCache.end() )
            {
                // an empty name means that all the names of this encoding
                // have already been tried without success: give up without
                // trying the other conversion methods below
                if ( it->second.empty() )
                    return NULL;

                wxMBConv_iconv *conv = new wxMBConv_iconv(it->second.ToAscii());
                if ( conv->IsOk() )
                    return conv;

                delete conv;
            }

            const wxChar** names = wxFontMapperBase::GetAllEncodingNames(encoding);
            // CS : in case this does not return valid names (eg for MacRoman)
            // encoding got a 'failure' entry in the cache all the same,
            // although it just has to be created using a different method, so
            // only store failed iconv creation attempts (or perhaps we
            // shouldn't do this at all ?)
            if ( names[0] != NULL )
            {
                // try all the names in turn and remember the first one which
                // iconv accepts
                for ( ; *names; ++names )
                {
                    // FIXME-UTF8: wxFontMapperBase::GetAllEncodingNames()
                    //             will need changes that will obsolete this
                    wxString name(*names);
                    wxMBConv_iconv *conv = new wxMBConv_iconv(name.ToAscii());
                    if ( conv->IsOk() )
                    {
                        gs_nameCache[encoding] = *names;
                        return conv;
                    }

                    delete conv;
                }

                gs_nameCache[encoding] = _T(""); // cache the failure
            }
        }
#endif // wxUSE_FONTMAP
    }
#endif // HAVE_ICONV

#ifdef wxHAVE_WIN32_MB2WC
    {
#if wxUSE_FONTMAP
        wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
                                      : new wxMBConv_win32(m_encoding);
        if ( conv->IsOk() )
            return conv;

        delete conv;
#else
        // the wxMBConv_win32 ctors taking a name or an encoding are only
        // available when wxUSE_FONTMAP is on
        return NULL;
#endif
    }
#endif // wxHAVE_WIN32_MB2WC

#ifdef __DARWIN__
    {
        // leave UTF16 and UTF32 to the built-ins of wx
        if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
            ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
        {
#if wxUSE_FONTMAP
            wxMBConv_cf *conv = m_name ? new wxMBConv_cf(m_name)
                                          : new wxMBConv_cf(m_encoding);
#else
            wxMBConv_cf *conv = new wxMBConv_cf(m_encoding);
#endif

            if ( conv->IsOk() )
                 return conv;

            delete conv;
        }
    }
#endif // __DARWIN__

    // step (2)
    wxFontEncoding enc = m_encoding;
#if wxUSE_FONTMAP
    if ( enc == wxFONTENCODING_SYSTEM && m_name )
    {
        // use "false" to suppress interactive dialogs -- we can be called from
        // anywhere and popping up a dialog from here is the last thing we want to
        // do
        enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
    }
#endif // wxUSE_FONTMAP

    // the UTF encodings are handled by our own dedicated classes
    switch ( enc )
    {
        case wxFONTENCODING_UTF7:
             return new wxMBConvUTF7;

        case wxFONTENCODING_UTF8:
             return new wxMBConvUTF8;

        case wxFONTENCODING_UTF16BE:
             return new wxMBConvUTF16BE;

        case wxFONTENCODING_UTF16LE:
             return new wxMBConvUTF16LE;

        case wxFONTENCODING_UTF32BE:
             return new wxMBConvUTF32BE;

        case wxFONTENCODING_UTF32LE:
             return new wxMBConvUTF32LE;

        default:
             // nothing to do but put here to suppress gcc warnings
             break;
    }

    // step (3)
#if wxUSE_FONTMAP
    {
        wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
                                      : new wxMBConv_wxwin(m_encoding);
        if ( conv->IsOk() )
            return conv;

        delete conv;
    }
#endif // wxUSE_FONTMAP

    // NB: This is a hack to prevent deadlock. What could otherwise happen
    //     in Unicode build: wxConvLocal creation ends up being here
    //     because of some failure and logs the error. But wxLog will try to
    //     attach a timestamp, for which it will need wxConvLocal (to convert
    //     time to char* and then wchar_t*), but that fails, tries to log the
    //     error, but wxLog has an (already locked) critical section that
    //     guards the static buffer.
    static bool alreadyLoggingError = false;
    if (!alreadyLoggingError)
    {
        alreadyLoggingError = true;
        wxLogError(_("Cannot convert from the charset '%s'!"),
                   m_name ? m_name
                      :
#if wxUSE_FONTMAP
                         (const char*)wxFontMapperBase::GetEncodingDescription(m_encoding).ToAscii()
#else // !wxUSE_FONTMAP
                         (const char*)wxString::Format(_("encoding %i"), m_encoding).ToAscii()
#endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
              );

        alreadyLoggingError = false;
    }

    return NULL;
}
2698
2699 void wxCSConv::CreateConvIfNeeded() const
2700 {
2701     if ( m_deferred )
2702     {
2703         wxCSConv *self = (wxCSConv *)this; // const_cast
2704
2705         // if we don't have neither the name nor the encoding, use the default
2706         // encoding for this system
2707         if ( !m_name && m_encoding == wxFONTENCODING_SYSTEM )
2708         {
2709 #if wxUSE_INTL
2710             self->m_encoding = wxLocale::GetSystemEncoding();
2711 #else
2712             // fallback to some reasonable default:
2713             self->m_encoding = wxFONTENCODING_ISO8859_1;
2714 #endif // wxUSE_INTL
2715         }
2716
2717         self->m_convReal = DoCreate();
2718         self->m_deferred = false;
2719     }
2720 }
2721
2722 bool wxCSConv::IsOk() const
2723 {
2724     CreateConvIfNeeded();
2725
2726     // special case: no convReal created for wxFONTENCODING_ISO8859_1
2727     if ( m_encoding == wxFONTENCODING_ISO8859_1 )
2728         return true; // always ok as we do it ourselves
2729
2730     // m_convReal->IsOk() is called at its own creation, so we know it must
2731     // be ok if m_convReal is non-NULL
2732     return m_convReal != NULL;
2733 }
2734
2735 size_t wxCSConv::ToWChar(wchar_t *dst, size_t dstLen,
2736                          const char *src, size_t srcLen) const
2737 {
2738     CreateConvIfNeeded();
2739
2740     if (m_convReal)
2741         return m_convReal->ToWChar(dst, dstLen, src, srcLen);
2742
2743     // latin-1 (direct)
2744     return wxMBConv::ToWChar(dst, dstLen, src, srcLen);
2745 }
2746
2747 size_t wxCSConv::FromWChar(char *dst, size_t dstLen,
2748                            const wchar_t *src, size_t srcLen) const
2749 {
2750     CreateConvIfNeeded();
2751
2752     if (m_convReal)
2753         return m_convReal->FromWChar(dst, dstLen, src, srcLen);
2754
2755     // latin-1 (direct)
2756     return wxMBConv::FromWChar(dst, dstLen, src, srcLen);
2757 }
2758
2759 size_t wxCSConv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
2760 {
2761     CreateConvIfNeeded();
2762
2763     if (m_convReal)
2764         return m_convReal->MB2WC(buf, psz, n);
2765
2766     // latin-1 (direct)
2767     size_t len = strlen(psz);
2768
2769     if (buf)
2770     {
2771         for (size_t c = 0; c <= len; c++)
2772             buf[c] = (unsigned char)(psz[c]);
2773     }
2774
2775     return len;
2776 }
2777
2778 size_t wxCSConv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
2779 {
2780     CreateConvIfNeeded();
2781
2782     if (m_convReal)
2783         return m_convReal->WC2MB(buf, psz, n);
2784
2785     // latin-1 (direct)
2786     const size_t len = wxWcslen(psz);
2787     if (buf)
2788     {
2789         for (size_t c = 0; c <= len; c++)
2790         {
2791             if (psz[c] > 0xFF)
2792                 return wxCONV_FAILED;
2793
2794             buf[c] = (char)psz[c];
2795         }
2796     }
2797     else
2798     {
2799         for (size_t c = 0; c <= len; c++)
2800         {
2801             if (psz[c] > 0xFF)
2802                 return wxCONV_FAILED;
2803         }
2804     }
2805
2806     return len;
2807 }
2808
2809 size_t wxCSConv::GetMBNulLen() const
2810 {
2811     CreateConvIfNeeded();
2812
2813     if ( m_convReal )
2814     {
2815         return m_convReal->GetMBNulLen();
2816     }
2817
2818     // otherwise, we are ISO-8859-1
2819     return 1;
2820 }
2821
2822 #if wxUSE_UNICODE_UTF8
2823 bool wxCSConv::IsUTF8() const
2824 {
2825     CreateConvIfNeeded();
2826
2827     if ( m_convReal )
2828     {
2829         return m_convReal->IsUTF8();
2830     }
2831
2832     // otherwise, we are ISO-8859-1
2833     return false;
2834 }
2835 #endif
2836
2837
2838 #if wxUSE_UNICODE
2839
2840 wxWCharBuffer wxSafeConvertMB2WX(const char *s)
2841 {
2842     if ( !s )
2843         return wxWCharBuffer();
2844
2845     wxWCharBuffer wbuf(wxConvLibc.cMB2WX(s));
2846     if ( !wbuf )
2847         wbuf = wxMBConvUTF8().cMB2WX(s);
2848     if ( !wbuf )
2849         wbuf = wxConvISO8859_1.cMB2WX(s);
2850
2851     return wbuf;
2852 }
2853
2854 wxCharBuffer wxSafeConvertWX2MB(const wchar_t *ws)
2855 {
2856     if ( !ws )
2857         return wxCharBuffer();
2858
2859     wxCharBuffer buf(wxConvLibc.cWX2MB(ws));
2860     if ( !buf )
2861         buf = wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL).cWX2MB(ws);
2862
2863     return buf;
2864 }
2865
2866 #endif // wxUSE_UNICODE
2867
2868 // ----------------------------------------------------------------------------
2869 // globals
2870 // ----------------------------------------------------------------------------
2871
// NB: The reason why we create the converter objects in this convoluted way,
//     using a factory function instead of a global variable, is that they
//     may be used at static initialization time (some of them are used by
//     wxString ctors and there may be a global wxString object). In other
//     words, possibly _before_ the global converter object would be
//     initialized.
2878
2879 #undef wxConvLibc
2880 #undef wxConvUTF8
2881 #undef wxConvUTF7
2882 #undef wxConvLocal
2883 #undef wxConvISO8859_1
2884
// Define the global converter called "name":
//
//  - name##Ptr is a global pointer to klass, defined here as NULL
//  - wxGet_##name##Ptr() returns the address of a function-local static
//    object of type impl_klass constructed with ctor_args (which must include
//    the parentheses or be wxEMPTY_PARAMETER_VALUE)
//  - gs_##name##instance is a file-static pointer initialized by calling this
//    function
//
// The expansion doesn't end with a semicolon, it must follow the macro use.
#define WX_DEFINE_GLOBAL_CONV2(klass, impl_klass, name, ctor_args)      \
    WXDLLIMPEXP_DATA_BASE(klass*) name##Ptr = NULL;                     \
    WXDLLIMPEXP_BASE klass* wxGet_##name##Ptr()                         \
    {                                                                   \
        static impl_klass name##Obj ctor_args;                          \
        return &name##Obj;                                              \
    }                                                                   \
    /* this ensures that all global converter objects are created */    \
    /* by the time static initialization is done, i.e. before any */    \
    /* thread is launched: */                                           \
    static klass* gs_##name##instance = wxGet_##name##Ptr()

// Same as WX_DEFINE_GLOBAL_CONV2() for the common case when the class of the
// object is the same as the class used for the pointer and the accessor.
#define WX_DEFINE_GLOBAL_CONV(klass, name, ctor_args) \
    WX_DEFINE_GLOBAL_CONV2(klass, klass, name, ctor_args)
2899
// wxConvLibc is implemented by wxMBConv_win32 (default-constructed) under
// Windows and by wxMBConvLibc on the other platforms
#ifdef __WINDOWS__
    WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConv_win32, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
#else
    WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConvLibc, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
#endif

// the UTF converters are default-constructed objects of their own classes
WX_DEFINE_GLOBAL_CONV(wxMBConvUTF8, wxConvUTF8, wxEMPTY_PARAMETER_VALUE);
WX_DEFINE_GLOBAL_CONV(wxMBConvUTF7, wxConvUTF7, wxEMPTY_PARAMETER_VALUE);

// and these ones are wxCSConv objects created for the given encodings
WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvLocal, (wxFONTENCODING_SYSTEM));
WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvISO8859_1, (wxFONTENCODING_ISO8859_1));

// wxConvCurrent initially points to wxConvLibc and wxConvUI to wxConvLocal
WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = wxGet_wxConvLibcPtr();
WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvUI = wxGet_wxConvLocalPtr();

#ifdef __DARWIN__
// The xnu kernel always communicates file paths in decomposed UTF-8.
// WARNING: Are we sure that CFString's conversion will cause decomposition?
static wxMBConv_cf wxConvMacUTF8DObj(wxFONTENCODING_UTF8);
#endif

// wxConvFileName initially points to the UTF-8 wxMBConv_cf object defined
// above under Darwin and to wxConvLibc everywhere else
WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName =
#ifdef __DARWIN__
                                    &wxConvMacUTF8DObj;
#else // !__DARWIN__
                                    wxGet_wxConvLibcPtr();
#endif // __DARWIN__/!__DARWIN__
2927
2928 #else // !wxUSE_WCHAR_T
2929
// FIXME-UTF8: remove this, wxUSE_WCHAR_T is required now
// stand-ins in absence of wchar_t: unlike in the wchar_t build, the
// converters are plain global wxMBConv objects here
WXDLLIMPEXP_DATA_BASE(wxMBConv) wxConvLibc,
                                wxConvISO8859_1,
                                wxConvLocal,
                                wxConvUTF8;
2936
2937 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T