src/common/strconv.cpp

   1 /////////////////////////////////////////////////////////////////////////////
   2 // Name:        src/common/strconv.cpp
   3 // Purpose:     Unicode conversion classes
   4 // Author:      Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
   5 //              Ryan Norton, Fredrik Roubert (UTF7)
   6 // Modified by:
   7 // Created:     29/01/98
   8 // RCS-ID:      $Id$
   9 // Copyright:   (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
  10 //              (c) 2000-2003 Vadim Zeitlin
  11 //              (c) 2004 Ryan Norton, Fredrik Roubert
  12 // Licence:     wxWindows licence
  13 /////////////////////////////////////////////////////////////////////////////
  14
  15 // For compilers that support precompilation, includes "wx.h".
  16 #include "wx/wxprec.h"
  17
  18 #ifdef __BORLANDC__
  19     #pragma hdrstop
  20 #endif  //__BORLANDC__
  21
  22 #ifndef WX_PRECOMP
  23     #include "wx/intl.h"
  24     #include "wx/log.h"
  25     #include "wx/utils.h"
  26     #include "wx/hashmap.h"
  27 #endif
  28
  29 #include "wx/strconv.h"
  30
  31 #if wxUSE_WCHAR_T
  32
  33 #ifndef __WXWINCE__
  34 #include <errno.h>
  35 #endif
  36
  37 #include <ctype.h>
  38 #include <string.h>
  39 #include <stdlib.h>
  40
  41 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
  42     #include "wx/msw/private.h"
  43     #include "wx/msw/missing.h"
  44     #define wxHAVE_WIN32_MB2WC
  45 #endif
  46
  47 #ifdef HAVE_ICONV
  48     #include <iconv.h>
  49     #include "wx/thread.h"
  50 #endif
  51
  52 #include "wx/encconv.h"
  53 #include "wx/fontmap.h"
  54
  55 #ifdef __DARWIN__
  56 #include "wx/mac/corefoundation/private/strconv_cf.h"
  57 #endif //def __DARWIN__
  58
  59
  60 #define TRACE_STRCONV _T("strconv")
  61
  62 // WC_UTF16 is defined only if sizeof(wchar_t) == 2, otherwise it's supposed to
  63 // be 4 bytes
  64 #if SIZEOF_WCHAR_T == 2
  65     #define WC_UTF16
  66 #endif
  67
  68
  69 // ============================================================================
  70 // implementation
  71 // ============================================================================
  72
  73 // helper function of cMB2WC(): check if n bytes at this location are all NUL
  74 static bool NotAllNULs(const char *p, size_t n)
  75 {
  76     while ( n && *p++ == '\0' )
  77         n--;
  78
  79     return n != 0;
  80 }
  81
  82 // ----------------------------------------------------------------------------
  83 // UTF-16 en/decoding to/from UCS-4 with surrogates handling
  84 // ----------------------------------------------------------------------------
  85
  86 static size_t encode_utf16(wxUint32 input, wxUint16 *output)
  87 {
  88     if (input <= 0xffff)
  89     {
  90         if (output)
  91             *output = (wxUint16) input;
  92
  93         return 1;
  94     }
  95     else if (input >= 0x110000)
  96     {
  97         return wxCONV_FAILED;
  98     }
  99     else
 100     {
 101         if (output)
 102         {
 103             *output++ = (wxUint16) ((input >> 10) + 0xd7c0);
 104             *output = (wxUint16) ((input & 0x3ff) + 0xdc00);
 105         }
 106
 107         return 2;
 108     }
 109 }
 110
 111 static size_t decode_utf16(const wxUint16* input, wxUint32& output)
 112 {
 113     if ((*input < 0xd800) || (*input > 0xdfff))
 114     {
 115         output = *input;
 116         return 1;
 117     }
 118     else if ((input[1] < 0xdc00) || (input[1] > 0xdfff))
 119     {
 120         output = *input;
 121         return wxCONV_FAILED;
 122     }
 123     else
 124     {
 125         output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
 126         return 2;
 127     }
 128 }
 129
 130 #ifdef WC_UTF16
 131     typedef wchar_t wxDecodeSurrogate_t;
 132 #else // !WC_UTF16
 133     typedef wxUint16 wxDecodeSurrogate_t;
 134 #endif // WC_UTF16/!WC_UTF16
 135
 136 // returns the next UTF-32 character from the wchar_t buffer and advances the
 137 // pointer to the character after this one
 138 //
 139 // if an invalid character is found, *pSrc is set to NULL, the caller must
 140 // check for this
 141 static wxUint32 wxDecodeSurrogate(const wxDecodeSurrogate_t **pSrc)
 142 {
 143     wxUint32 out;
 144     const size_t
 145         n = decode_utf16(wx_reinterpret_cast(const wxUint16 *, *pSrc), out);
 146     if ( n == wxCONV_FAILED )
 147         *pSrc = NULL;
 148     else
 149         *pSrc += n;
 150
 151     return out;
 152 }
 153
 154 // ----------------------------------------------------------------------------
 155 // wxMBConv
 156 // ----------------------------------------------------------------------------
 157
 158 size_t
 159 wxMBConv::ToWChar(wchar_t *dst, size_t dstLen,
 160                   const char *src, size_t srcLen) const
 161 {
 162     // although new conversion classes are supposed to implement this function
 163     // directly, the existins ones only implement the old MB2WC() and so, to
 164     // avoid to have to rewrite all conversion classes at once, we provide a
 165     // default (but not efficient) implementation of this one in terms of the
 166     // old function by copying the input to ensure that it's NUL-terminated and
 167     // then using MB2WC() to convert it
 168
 169     // the number of chars [which would be] written to dst [if it were not NULL]
 170     size_t dstWritten = 0;
 171
 172     // the number of NULs terminating this string
 173     size_t nulLen = 0;  // not really needed, but just to avoid warnings
 174
 175     // if we were not given the input size we just have to assume that the
 176     // string is properly terminated as we have no way of knowing how long it
 177     // is anyhow, but if we do have the size check whether there are enough
 178     // NULs at the end
 179     wxCharBuffer bufTmp;
 180     const char *srcEnd;
 181     if ( srcLen != wxNO_LEN )
 182     {
 183         // we need to know how to find the end of this string
 184         nulLen = GetMBNulLen();
 185         if ( nulLen == wxCONV_FAILED )
 186             return wxCONV_FAILED;
 187
 188         // if there are enough NULs we can avoid the copy
 189         if ( srcLen < nulLen || NotAllNULs(src + srcLen - nulLen, nulLen) )
 190         {
 191             // make a copy in order to properly NUL-terminate the string
 192             bufTmp = wxCharBuffer(srcLen + nulLen - 1 /* 1 will be added */);
 193             char * const p = bufTmp.data();
 194             memcpy(p, src, srcLen);
 195             for ( char *s = p + srcLen; s < p + srcLen + nulLen; s++ )
 196                 *s = '\0';
 197
 198             src = bufTmp;
 199         }
 200
 201         srcEnd = src + srcLen;
 202     }
 203     else // quit after the first loop iteration
 204     {
 205         srcEnd = NULL;
 206     }
 207
 208     for ( ;; )
 209     {
 210         // try to convert the current chunk
 211         size_t lenChunk = MB2WC(NULL, src, 0);
 212         if ( lenChunk == wxCONV_FAILED )
 213             return wxCONV_FAILED;
 214
 215         lenChunk++; // for the L'\0' at the end of this chunk
 216
 217         dstWritten += lenChunk;
 218
 219         if ( lenChunk == 1 )
 220         {
 221             // nothing left in the input string, conversion succeeded
 222             break;
 223         }
 224
 225         if ( dst )
 226         {
 227             if ( dstWritten > dstLen )
 228                 return wxCONV_FAILED;
 229
 230             if ( MB2WC(dst, src, lenChunk) == wxCONV_FAILED )
 231                 return wxCONV_FAILED;
 232
 233             dst += lenChunk;
 234         }
 235
 236         if ( !srcEnd )
 237         {
 238             // we convert just one chunk in this case as this is the entire
 239             // string anyhow
 240             break;
 241         }
 242
 243         // advance the input pointer past the end of this chunk
 244         while ( NotAllNULs(src, nulLen) )
 245         {
 246             // notice that we must skip over multiple bytes here as we suppose
 247             // that if NUL takes 2 or 4 bytes, then all the other characters do
 248             // too and so if advanced by a single byte we might erroneously
 249             // detect sequences of NUL bytes in the middle of the input
 250             src += nulLen;
 251         }
 252
 253         src += nulLen; // skipping over its terminator as well
 254
 255         // note that ">=" (and not just "==") is needed here as the terminator
 256         // we skipped just above could be inside or just after the buffer
 257         // delimited by inEnd
 258         if ( src >= srcEnd )
 259             break;
 260     }
 261
 262     return dstWritten;
 263 }
 264
 265 size_t
 266 wxMBConv::FromWChar(char *dst, size_t dstLen,
 267                     const wchar_t *src, size_t srcLen) const
 268 {
 269     // the number of chars [which would be] written to dst [if it were not NULL]
 270     size_t dstWritten = 0;
 271
 272     // make a copy of the input string unless it is already properly
 273     // NUL-terminated
 274     //
 275     // if we don't know its length we have no choice but to assume that it is,
 276     // indeed, properly terminated
 277     wxWCharBuffer bufTmp;
 278     if ( srcLen == wxNO_LEN )
 279     {
 280         srcLen = wxWcslen(src) + 1;
 281     }
 282     else if ( srcLen != 0 && src[srcLen - 1] != L'\0' )
 283     {
 284         // make a copy in order to properly NUL-terminate the string
 285         bufTmp = wxWCharBuffer(srcLen);
 286         memcpy(bufTmp.data(), src, srcLen * sizeof(wchar_t));
 287         src = bufTmp;
 288     }
 289
 290     const size_t lenNul = GetMBNulLen();
 291     for ( const wchar_t * const srcEnd = src + srcLen;
 292           src < srcEnd;
 293           src += wxWcslen(src) + 1 /* skip L'\0' too */ )
 294     {
 295         // try to convert the current chunk
 296         size_t lenChunk = WC2MB(NULL, src, 0);
 297
 298         if ( lenChunk == wxCONV_FAILED )
 299             return wxCONV_FAILED;
 300
 301         lenChunk += lenNul;
 302         dstWritten += lenChunk;
 303
 304         if ( dst )
 305         {
 306             if ( dstWritten > dstLen )
 307                 return wxCONV_FAILED;
 308
 309             if ( WC2MB(dst, src, lenChunk) == wxCONV_FAILED )
 310                 return wxCONV_FAILED;
 311
 312             dst += lenChunk;
 313         }
 314     }
 315
 316     return dstWritten;
 317 }
 318
 319 size_t wxMBConv::MB2WC(wchar_t *outBuff, const char *inBuff, size_t outLen) const
 320 {
 321     size_t rc = ToWChar(outBuff, outLen, inBuff);
 322     if ( rc != wxCONV_FAILED )
 323     {
 324         // ToWChar() returns the buffer length, i.e. including the trailing
 325         // NUL, while this method doesn't take it into account
 326         rc--;
 327     }
 328
 329     return rc;
 330 }
 331
 332 size_t wxMBConv::WC2MB(char *outBuff, const wchar_t *inBuff, size_t outLen) const
 333 {
 334     size_t rc = FromWChar(outBuff, outLen, inBuff);
 335     if ( rc != wxCONV_FAILED )
 336     {
 337         rc -= GetMBNulLen();
 338     }
 339
 340     return rc;
 341 }
 342
 343 wxMBConv::~wxMBConv()
 344 {
 345     // nothing to do here (necessary for Darwin linking probably)
 346 }
 347
 348 const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
 349 {
 350     if ( psz )
 351     {
 352         // calculate the length of the buffer needed first
 353         const size_t nLen = ToWChar(NULL, 0, psz);
 354         if ( nLen != wxCONV_FAILED )
 355         {
 356             // now do the actual conversion
 357             wxWCharBuffer buf(nLen - 1 /* +1 added implicitly */);
 358
 359             // +1 for the trailing NULL
 360             if ( ToWChar(buf.data(), nLen, psz) != wxCONV_FAILED )
 361                 return buf;
 362         }
 363     }
 364
 365     return wxWCharBuffer();
 366 }
 367
 368 const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
 369 {
 370     if ( pwz )
 371     {
 372         const size_t nLen = FromWChar(NULL, 0, pwz);
 373         if ( nLen != wxCONV_FAILED )
 374         {
 375             wxCharBuffer buf(nLen - 1);
 376             if ( FromWChar(buf.data(), nLen, pwz) != wxCONV_FAILED )
 377                 return buf;
 378         }
 379     }
 380
 381     return wxCharBuffer();
 382 }
 383
 384 const wxWCharBuffer
 385 wxMBConv::cMB2WC(const char *inBuff, size_t inLen, size_t *outLen) const
 386 {
 387     const size_t dstLen = ToWChar(NULL, 0, inBuff, inLen);
 388     if ( dstLen != wxCONV_FAILED )
 389     {
 390         // notice that we allocate space for dstLen+1 wide characters here
 391         // because we want the buffer to always be NUL-terminated, even if the
 392         // input isn't (as otherwise the caller has no way to know its length)
 393         wxWCharBuffer wbuf(dstLen);
 394         wbuf.data()[dstLen - 1] = L'\0';
 395         if ( ToWChar(wbuf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
 396         {
 397             if ( outLen )
 398             {
 399                 *outLen = dstLen;
 400                 if ( wbuf[dstLen - 1] == L'\0' )
 401                     (*outLen)--;
 402             }
 403
 404             return wbuf;
 405         }
 406     }
 407
 408     if ( outLen )
 409         *outLen = 0;
 410
 411     return wxWCharBuffer();
 412 }
 413
 414 const wxCharBuffer
 415 wxMBConv::cWC2MB(const wchar_t *inBuff, size_t inLen, size_t *outLen) const
 416 {
 417     size_t dstLen = FromWChar(NULL, 0, inBuff, inLen);
 418     if ( dstLen != wxCONV_FAILED )
 419     {
 420         const size_t nulLen = GetMBNulLen();
 421
 422         // as above, ensure that the buffer is always NUL-terminated, even if
 423         // the input is not
 424         wxCharBuffer buf(dstLen + nulLen - 1);
 425         memset(buf.data() + dstLen, 0, nulLen);
 426         if ( FromWChar(buf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
 427         {
 428             if ( outLen )
 429             {
 430                 *outLen = dstLen;
 431
 432                 if ( dstLen >= nulLen &&
 433                         !NotAllNULs(buf.data() + dstLen - nulLen, nulLen) )
 434                 {
 435                     // in this case the output is NUL-terminated and we're not
 436                     // supposed to count NUL
 437                     *outLen -= nulLen;
 438                 }
 439             }
 440
 441             return buf;
 442         }
 443     }
 444
 445     if ( outLen )
 446         *outLen = 0;
 447
 448     return wxCharBuffer();
 449 }
 450
 451 // ----------------------------------------------------------------------------
 452 // wxMBConvLibc
 453 // ----------------------------------------------------------------------------
 454
 455 size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 456 {
 457     return wxMB2WC(buf, psz, n);
 458 }
 459
 460 size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 461 {
 462     return wxWC2MB(buf, psz, n);
 463 }
 464
 465 // ----------------------------------------------------------------------------
 466 // wxConvBrokenFileNames
 467 // ----------------------------------------------------------------------------
 468
 469 #ifdef __UNIX__
 470
 471 wxConvBrokenFileNames::wxConvBrokenFileNames(const wxString& charset)
 472 {
 473     if ( wxStricmp(charset, _T("UTF-8")) == 0 ||
 474          wxStricmp(charset, _T("UTF8")) == 0  )
 475         m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_PUA);
 476     else
 477         m_conv = new wxCSConv(charset);
 478 }
 479
 480 #endif // __UNIX__
 481
 482 // ----------------------------------------------------------------------------
 483 // UTF-7
 484 // ----------------------------------------------------------------------------
 485
 486 // Implementation (C) 2004 Fredrik Roubert
 487
 488 //
 489 // BASE64 decoding table
 490 //
 491 static const unsigned char utf7unb64[] =
 492 {
 493     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 494     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 495     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 496     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 497     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 498     0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
 499     0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b,
 500     0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 501     0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
 502     0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
 503     0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
 504     0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
 505     0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
 506     0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
 507     0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30,
 508     0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
 509     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 510     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 511     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 512     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 513     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 514     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 515     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 516     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 517     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 518     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 519     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 520     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 521     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 522     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 523     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 524     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
 525 };
 526
 527 size_t wxMBConvUTF7::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 528 {
 529     size_t len = 0;
 530
 531     while ( *psz && (!buf || (len < n)) )
 532     {
 533         unsigned char cc = *psz++;
 534         if (cc != '+')
 535         {
 536             // plain ASCII char
 537             if (buf)
 538                 *buf++ = cc;
 539             len++;
 540         }
 541         else if (*psz == '-')
 542         {
 543             // encoded plus sign
 544             if (buf)
 545                 *buf++ = cc;
 546             len++;
 547             psz++;
 548         }
 549         else // start of BASE64 encoded string
 550         {
 551             bool lsb, ok;
 552             unsigned int d, l;
 553             for ( ok = lsb = false, d = 0, l = 0;
 554                   (cc = utf7unb64[(unsigned char)*psz]) != 0xff;
 555                   psz++ )
 556             {
 557                 d <<= 6;
 558                 d += cc;
 559                 for (l += 6; l >= 8; lsb = !lsb)
 560                 {
 561                     unsigned char c = (unsigned char)((d >> (l -= 8)) % 256);
 562                     if (lsb)
 563                     {
 564                         if (buf)
 565                             *buf++ |= c;
 566                         len ++;
 567                         ok = true;
 568                     }
 569                     else
 570                     {
 571                         if (buf)
 572                             *buf = (wchar_t)(c << 8);
 573                     }
 574                 }
 575             }
 576
 577             if ( !ok )
 578             {
 579                 // in valid UTF7 we should have valid characters after '+'
 580                 return wxCONV_FAILED;
 581             }
 582
 583             if (*psz == '-')
 584                 psz++;
 585         }
 586     }
 587
 588     if ( buf && (len < n) )
 589         *buf = '\0';
 590
 591     return len;
 592 }
 593
 594 //
 595 // BASE64 encoding table
 596 //
 597 static const unsigned char utf7enb64[] =
 598 {
 599     'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
 600     'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
 601     'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
 602     'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
 603     'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
 604     'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
 605     'w', 'x', 'y', 'z', '0', '1', '2', '3',
 606     '4', '5', '6', '7', '8', '9', '+', '/'
 607 };
 608
 609 //
 610 // UTF-7 encoding table
 611 //
 612 // 0 - Set D (directly encoded characters)
 613 // 1 - Set O (optional direct characters)
 614 // 2 - whitespace characters (optional)
 615 // 3 - special characters
 616 //
 617 static const unsigned char utf7encode[128] =
 618 {
 619     3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
 620     3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
 621     2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
 622     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
 623     1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 624     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
 625     1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 626     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
 627 };
 628
 629 size_t wxMBConvUTF7::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 630 {
 631     size_t len = 0;
 632
 633     while (*psz && ((!buf) || (len < n)))
 634     {
 635         wchar_t cc = *psz++;
 636         if (cc < 0x80 && utf7encode[cc] < 1)
 637         {
 638             // plain ASCII char
 639             if (buf)
 640                 *buf++ = (char)cc;
 641
 642             len++;
 643         }
 644 #ifndef WC_UTF16
 645         else if (((wxUint32)cc) > 0xffff)
 646         {
 647             // no surrogate pair generation (yet?)
 648             return wxCONV_FAILED;
 649         }
 650 #endif
 651         else
 652         {
 653             if (buf)
 654                 *buf++ = '+';
 655
 656             len++;
 657             if (cc != '+')
 658             {
 659                 // BASE64 encode string
 660                 unsigned int lsb, d, l;
 661                 for (d = 0, l = 0; /*nothing*/; psz++)
 662                 {
 663                     for (lsb = 0; lsb < 2; lsb ++)
 664                     {
 665                         d <<= 8;
 666                         d += lsb ? cc & 0xff : (cc & 0xff00) >> 8;
 667
 668                         for (l += 8; l >= 6; )
 669                         {
 670                             l -= 6;
 671                             if (buf)
 672                                 *buf++ = utf7enb64[(d >> l) % 64];
 673                             len++;
 674                         }
 675                     }
 676
 677                     cc = *psz;
 678                     if (!(cc) || (cc < 0x80 && utf7encode[cc] < 1))
 679                         break;
 680                 }
 681
 682                 if (l != 0)
 683                 {
 684                     if (buf)
 685                         *buf++ = utf7enb64[((d % 16) << (6 - l)) % 64];
 686
 687                     len++;
 688                 }
 689             }
 690
 691             if (buf)
 692                 *buf++ = '-';
 693             len++;
 694         }
 695     }
 696
 697     if (buf && (len < n))
 698         *buf = 0;
 699
 700     return len;
 701 }
 702
 703 // ----------------------------------------------------------------------------
 704 // UTF-8
 705 // ----------------------------------------------------------------------------
 706
 707 static const wxUint32 utf8_max[]=
 708     { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
 709
 710 // boundaries of the private use area we use to (temporarily) remap invalid
 711 // characters invalid in a UTF-8 encoded string
 712 const wxUint32 wxUnicodePUA = 0x100000;
 713 const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
 714
 715 // this table gives the length of the UTF-8 encoding from its first character:
 716 const unsigned char tableUtf8Lengths[256] = {
 717     // single-byte sequences (ASCII):
 718     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 00..0F
 719     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 10..1F
 720     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 20..2F
 721     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 30..3F
 722     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 40..4F
 723     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 50..5F
 724     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 60..6F
 725     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 70..7F
 726
 727     // these are invalid:
 728     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 80..8F
 729     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 90..9F
 730     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // A0..AF
 731     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // B0..BF
 732     0, 0,                                            // C0,C1
 733
 734     // two-byte sequences:
 735           2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // C2..CF
 736     2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // D0..DF
 737
 738     // three-byte sequences:
 739     3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,  // E0..EF
 740
 741     // four-byte sequences:
 742     4, 4, 4, 4, 4,                                   // F0..F4
 743
 744     // these are invalid again (5- or 6-byte
 745     // sequences and sequences for code points
 746     // above U+10FFFF, as restricted by RFC 3629):
 747                    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0   // F5..FF
 748 };
 749
 750 size_t
 751 wxMBConvStrictUTF8::ToWChar(wchar_t *dst, size_t dstLen,
 752                             const char *src, size_t srcLen) const
 753 {
 754     wchar_t *out = dstLen ? dst : NULL;
 755     size_t written = 0;
 756
 757     if ( srcLen == wxNO_LEN )
 758         srcLen = strlen(src) + 1;
 759
 760     for ( const char *p = src; ; p++ )
 761     {
 762         if ( !(srcLen == wxNO_LEN ? *p : srcLen) )
 763         {
 764             // all done successfully, just add the trailing NULL if we are not
 765             // using explicit length
 766             if ( srcLen == wxNO_LEN )
 767             {
 768                 if ( out )
 769                 {
 770                     if ( !dstLen )
 771                         break;
 772
 773                     *out = L'\0';
 774                 }
 775
 776                 written++;
 777             }
 778
 779             return written;
 780         }
 781
 782         if ( out && !dstLen-- )
 783             break;
 784
 785         wxUint32 code;
 786         unsigned char c = *p;
 787
 788         if ( c < 0x80 )
 789         {
 790             if ( srcLen == 0 ) // the test works for wxNO_LEN too
 791                 break;
 792
 793             if ( srcLen != wxNO_LEN )
 794                 srcLen--;
 795
 796             code = c;
 797         }
 798         else
 799         {
 800             unsigned len = tableUtf8Lengths[c];
 801             if ( !len )
 802                 break;
 803
 804             if ( srcLen < len ) // the test works for wxNO_LEN too
 805                 break;
 806
 807             if ( srcLen != wxNO_LEN )
 808                 srcLen -= len;
 809
 810             //   Char. number range   |        UTF-8 octet sequence
 811             //      (hexadecimal)     |              (binary)
 812             //  ----------------------+----------------------------------------
 813             //  0000 0000 - 0000 007F | 0xxxxxxx
 814             //  0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
 815             //  0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
 816             //  0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 817             //
 818             //  Code point value is stored in bits marked with 'x',
 819             //  lowest-order bit of the value on the right side in the diagram
 820             //  above.                                         (from RFC 3629)
 821
 822             // mask to extract lead byte's value ('x' bits above), by sequence
 823             // length:
 824             static const unsigned char leadValueMask[] = { 0x7F, 0x1F, 0x0F, 0x07 };
 825
 826             // mask and value of lead byte's most significant bits, by length:
 827             static const unsigned char leadMarkerMask[] = { 0x80, 0xE0, 0xF0, 0xF8 };
 828             static const unsigned char leadMarkerVal[] = { 0x00, 0xC0, 0xE0, 0xF0 };
 829
 830             len--; // it's more convenient to work with 0-based length here
 831
 832             // extract the lead byte's value bits:
 833             if ( (c & leadMarkerMask[len]) != leadMarkerVal[len] )
 834                 break;
 835
 836             code = c & leadValueMask[len];
 837
 838             // all remaining bytes, if any, are handled in the same way
 839             // regardless of sequence's length:
 840             for ( ; len; --len )
 841             {
 842                 c = *++p;
 843                 if ( (c & 0xC0) != 0x80 )
 844                     return wxCONV_FAILED;
 845
 846                 code <<= 6;
 847                 code |= c & 0x3F;
 848             }
 849         }
 850
 851 #ifdef WC_UTF16
 852         // cast is ok because wchar_t == wxUint16 if WC_UTF16
 853         if ( encode_utf16(code, (wxUint16 *)out) == 2 )
 854         {
 855             if ( out )
 856                 out++;
 857             written++;
 858         }
 859 #else // !WC_UTF16
 860         if ( out )
 861             *out = code;
 862 #endif // WC_UTF16/!WC_UTF16
 863
 864         if ( out )
 865             out++;
 866
 867         written++;
 868     }
 869
 870     return wxCONV_FAILED;
 871 }
 872
 873 size_t
 874 wxMBConvStrictUTF8::FromWChar(char *dst, size_t dstLen,
 875                               const wchar_t *src, size_t srcLen) const
 876 {
 877     char *out = dstLen ? dst : NULL;
 878     size_t written = 0;
 879
 880     for ( const wchar_t *wp = src; ; wp++ )
 881     {
 882         if ( !(srcLen == wxNO_LEN ? *wp : srcLen--) )
 883         {
 884             // all done successfully, just add the trailing NULL if we are not
 885             // using explicit length
 886             if ( srcLen == wxNO_LEN )
 887             {
 888                 if ( out )
 889                 {
 890                     if ( !dstLen )
 891                         break;
 892
 893                     *out = '\0';
 894                 }
 895
 896                 written++;
 897             }
 898
 899             return written;
 900         }
 901
 902
 903         wxUint32 code;
 904 #ifdef WC_UTF16
 905         // cast is ok for WC_UTF16
 906         if ( decode_utf16((const wxUint16 *)wp, code) == 2 )
 907         {
 908             // skip the next char too as we decoded a surrogate
 909             wp++;
 910         }
 911 #else // wchar_t is UTF-32
 912         code = *wp & 0x7fffffff;
 913 #endif
 914
 915         unsigned len;
 916         if ( code <= 0x7F )
 917         {
 918             len = 1;
 919             if ( out )
 920             {
 921                 if ( dstLen < len )
 922                     break;
 923
 924                 out[0] = (char)code;
 925             }
 926         }
 927         else if ( code <= 0x07FF )
 928         {
 929             len = 2;
 930             if ( out )
 931             {
 932                 if ( dstLen < len )
 933                     break;
 934
 935                 // NB: this line takes 6 least significant bits, encodes them as
 936                 // 10xxxxxx and discards them so that the next byte can be encoded:
 937                 out[1] = 0x80 | (code & 0x3F);  code >>= 6;
 938                 out[0] = 0xC0 | code;
 939             }
 940         }
 941         else if ( code < 0xFFFF )
 942         {
 943             len = 3;
 944             if ( out )
 945             {
 946                 if ( dstLen < len )
 947                     break;
 948
 949                 out[2] = 0x80 | (code & 0x3F);  code >>= 6;
 950                 out[1] = 0x80 | (code & 0x3F);  code >>= 6;
 951                 out[0] = 0xE0 | code;
 952             }
 953         }
 954         else if ( code <= 0x10FFFF )
 955         {
 956             len = 4;
 957             if ( out )
 958             {
 959                 if ( dstLen < len )
 960                     break;
 961
 962                 out[3] = 0x80 | (code & 0x3F);  code >>= 6;
 963                 out[2] = 0x80 | (code & 0x3F);  code >>= 6;
 964                 out[1] = 0x80 | (code & 0x3F);  code >>= 6;
 965                 out[0] = 0xF0 | code;
 966             }
 967         }
 968         else
 969         {
 970             wxFAIL_MSG( _T("trying to encode undefined Unicode character") );
 971             break;
 972         }
 973
 974         if ( out )
 975         {
 976             out += len;
 977             dstLen -= len;
 978         }
 979
 980         written += len;
 981     }
 982
 983     // we only get here if an error occurs during decoding
 984     return wxCONV_FAILED;
 985 }
 986
 987 size_t wxMBConvUTF8::ToWChar(wchar_t *buf, size_t n,
 988                              const char *psz, size_t srcLen) const
 989 {
 990     if ( m_options == MAP_INVALID_UTF8_NOT )
 991         return wxMBConvStrictUTF8::ToWChar(buf, n, psz, srcLen);
 992
 993     size_t len = 0;
 994
 995     while ((srcLen == wxNO_LEN ? *psz : srcLen--) && ((!buf) || (len < n)))
 996     {
 997         const char *opsz = psz;
 998         bool invalid = false;
 999         unsigned char cc = *psz++, fc = cc;
1000         unsigned cnt;
1001         for (cnt = 0; fc & 0x80; cnt++)
1002             fc <<= 1;
1003
1004         if (!cnt)
1005         {
1006             // plain ASCII char
1007             if (buf)
1008                 *buf++ = cc;
1009             len++;
1010
1011             // escape the escape character for octal escapes
1012             if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
1013                     && cc == '\\' && (!buf || len < n))
1014             {
1015                 if (buf)
1016                     *buf++ = cc;
1017                 len++;
1018             }
1019         }
1020         else
1021         {
1022             cnt--;
1023             if (!cnt)
1024             {
1025                 // invalid UTF-8 sequence
1026                 invalid = true;
1027             }
1028             else
1029             {
1030                 unsigned ocnt = cnt - 1;
1031                 wxUint32 res = cc & (0x3f >> cnt);
1032                 while (cnt--)
1033                 {
1034                     cc = *psz;
1035                     if ((cc & 0xC0) != 0x80)
1036                     {
1037                         // invalid UTF-8 sequence
1038                         invalid = true;
1039                         break;
1040                     }
1041
1042                     psz++;
1043                     res = (res << 6) | (cc & 0x3f);
1044                 }
1045
1046                 if (invalid || res <= utf8_max[ocnt])
1047                 {
1048                     // illegal UTF-8 encoding
1049                     invalid = true;
1050                 }
1051                 else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
1052                         res >= wxUnicodePUA && res < wxUnicodePUAEnd)
1053                 {
1054                     // if one of our PUA characters turns up externally
1055                     // it must also be treated as an illegal sequence
1056                     // (a bit like you have to escape an escape character)
1057                     invalid = true;
1058                 }
1059                 else
1060                 {
1061 #ifdef WC_UTF16
1062                     // cast is ok because wchar_t == wxUint16 if WC_UTF16
1063                     size_t pa = encode_utf16(res, (wxUint16 *)buf);
1064                     if (pa == wxCONV_FAILED)
1065                     {
1066                         invalid = true;
1067                     }
1068                     else
1069                     {
1070                         if (buf)
1071                             buf += pa;
1072                         len += pa;
1073                     }
1074 #else // !WC_UTF16
1075                     if (buf)
1076                         *buf++ = (wchar_t)res;
1077                     len++;
1078 #endif // WC_UTF16/!WC_UTF16
1079                 }
1080             }
1081
1082             if (invalid)
1083             {
1084                 if (m_options & MAP_INVALID_UTF8_TO_PUA)
1085                 {
1086                     while (opsz < psz && (!buf || len < n))
1087                     {
1088 #ifdef WC_UTF16
1089                         // cast is ok because wchar_t == wxUuint16 if WC_UTF16
1090                         size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
1091                         wxASSERT(pa != wxCONV_FAILED);
1092                         if (buf)
1093                             buf += pa;
1094                         opsz++;
1095                         len += pa;
1096 #else
1097                         if (buf)
1098                             *buf++ = (wchar_t)(wxUnicodePUA + (unsigned char)*opsz);
1099                         opsz++;
1100                         len++;
1101 #endif
1102                     }
1103                 }
1104                 else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
1105                 {
1106                     while (opsz < psz && (!buf || len < n))
1107                     {
1108                         if ( buf && len + 3 < n )
1109                         {
1110                             unsigned char on = *opsz;
1111                             *buf++ = L'\\';
1112                             *buf++ = (wchar_t)( L'0' + on / 0100 );
1113                             *buf++ = (wchar_t)( L'0' + (on % 0100) / 010 );
1114                             *buf++ = (wchar_t)( L'0' + on % 010 );
1115                         }
1116
1117                         opsz++;
1118                         len += 4;
1119                     }
1120                 }
1121                 else // MAP_INVALID_UTF8_NOT
1122                 {
1123                     return wxCONV_FAILED;
1124                 }
1125             }
1126         }
1127     }
1128
1129     if (srcLen == wxNO_LEN && buf && (len < n))
1130         *buf = 0;
1131
1132     return len + 1;
1133 }
1134
// Check whether the given wide character is one of the octal digits.
static inline bool isoctal(wchar_t wch)
{
    if ( wch < L'0' )
        return false;

    return wch <= L'7';
}
1139
1140 size_t wxMBConvUTF8::FromWChar(char *buf, size_t n,
1141                                const wchar_t *psz, size_t srcLen) const
1142 {
1143     if ( m_options == MAP_INVALID_UTF8_NOT )
1144         return wxMBConvStrictUTF8::FromWChar(buf, n, psz, srcLen);
1145
1146     size_t len = 0;
1147
1148     while ((srcLen == wxNO_LEN ? *psz : srcLen--) && ((!buf) || (len < n)))
1149     {
1150         wxUint32 cc;
1151
1152 #ifdef WC_UTF16
1153         // cast is ok for WC_UTF16
1154         size_t pa = decode_utf16((const wxUint16 *)psz, cc);
1155         psz += (pa == wxCONV_FAILED) ? 1 : pa;
1156 #else
1157         cc = (*psz++) & 0x7fffffff;
1158 #endif
1159
1160         if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
1161                 && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
1162         {
1163             if (buf)
1164                 *buf++ = (char)(cc - wxUnicodePUA);
1165             len++;
1166         }
1167         else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL)
1168                     && cc == L'\\' && psz[0] == L'\\' )
1169         {
1170             if (buf)
1171                 *buf++ = (char)cc;
1172             psz++;
1173             len++;
1174         }
1175         else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
1176                     cc == L'\\' &&
1177                         isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
1178         {
1179             if (buf)
1180             {
1181                 *buf++ = (char) ((psz[0] - L'0') * 0100 +
1182                                  (psz[1] - L'0') * 010 +
1183                                  (psz[2] - L'0'));
1184             }
1185
1186             psz += 3;
1187             len++;
1188         }
1189         else
1190         {
1191             unsigned cnt;
1192             for (cnt = 0; cc > utf8_max[cnt]; cnt++)
1193             {
1194             }
1195
1196             if (!cnt)
1197             {
1198                 // plain ASCII char
1199                 if (buf)
1200                     *buf++ = (char) cc;
1201                 len++;
1202             }
1203             else
1204             {
1205                 len += cnt + 1;
1206                 if (buf)
1207                 {
1208                     *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
1209                     while (cnt--)
1210                         *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
1211                 }
1212             }
1213         }
1214     }
1215
1216     if (srcLen == wxNO_LEN && buf && (len < n))
1217         *buf = 0;
1218
1219     return len + 1;
1220 }
1221
1222 // ============================================================================
1223 // UTF-16
1224 // ============================================================================
1225
1226 #ifdef WORDS_BIGENDIAN
     // "straight" is the name used below for the converter which leaves the
     // bytes of each code unit alone and "swap" for the one which exchanges
     // them: when WORDS_BIGENDIAN is defined these are the BE and LE classes
1227     #define wxMBConvUTF16straight wxMBConvUTF16BE
1228     #define wxMBConvUTF16swap     wxMBConvUTF16LE
1229 #else
     // otherwise the roles of the BE and LE classes are exchanged
1230     #define wxMBConvUTF16swap     wxMBConvUTF16BE
1231     #define wxMBConvUTF16straight wxMBConvUTF16LE
1232 #endif
1233
1234 /* static */
1235 size_t wxMBConvUTF16Base::GetLength(const char *src, size_t srcLen)
1236 {
1237     if ( srcLen == wxNO_LEN )
1238     {
1239         // count the number of bytes in input, including the trailing NULs
1240         const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1241         for ( srcLen = 1; *inBuff++; srcLen++ )
1242             ;
1243
1244         srcLen *= BYTES_PER_CHAR;
1245     }
1246     else // we already have the length
1247     {
1248         // we can only convert an entire number of UTF-16 characters
1249         if ( srcLen % BYTES_PER_CHAR )
1250             return wxCONV_FAILED;
1251     }
1252
1253     return srcLen;
1254 }
1255
1256 // case when in-memory representation is UTF-16 too
1257 #ifdef WC_UTF16
1258
1259 // ----------------------------------------------------------------------------
1260 // conversions without endianness change
1261 // ----------------------------------------------------------------------------
1262
1263 size_t
1264 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1265                                const char *src, size_t srcLen) const
1266 {
1267     // set up the scene for using memcpy() (which is presumably more efficient
1268     // than copying the bytes one by one)
1269     srcLen = GetLength(src, srcLen);
1270     if ( srcLen == wxNO_LEN )
1271         return wxCONV_FAILED;
1272
1273     const size_t inLen = srcLen / BYTES_PER_CHAR;
1274     if ( dst )
1275     {
1276         if ( dstLen < inLen )
1277             return wxCONV_FAILED;
1278
1279         memcpy(dst, src, srcLen);
1280     }
1281
1282     return inLen;
1283 }
1284
1285 size_t
1286 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1287                                  const wchar_t *src, size_t srcLen) const
1288 {
1289     if ( srcLen == wxNO_LEN )
1290         srcLen = wxWcslen(src) + 1;
1291
1292     srcLen *= BYTES_PER_CHAR;
1293
1294     if ( dst )
1295     {
1296         if ( dstLen < srcLen )
1297             return wxCONV_FAILED;
1298
1299         memcpy(dst, src, srcLen);
1300     }
1301
1302     return srcLen;
1303 }
1304
1305 // ----------------------------------------------------------------------------
1306 // endian-reversing conversions
1307 // ----------------------------------------------------------------------------
1308
1309 size_t
1310 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1311                            const char *src, size_t srcLen) const
1312 {
1313     srcLen = GetLength(src, srcLen);
1314     if ( srcLen == wxNO_LEN )
1315         return wxCONV_FAILED;
1316
1317     srcLen /= BYTES_PER_CHAR;
1318
1319     if ( dst )
1320     {
1321         if ( dstLen < srcLen )
1322             return wxCONV_FAILED;
1323
1324         const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1325         for ( size_t n = 0; n < srcLen; n++, inBuff++ )
1326         {
1327             *dst++ = wxUINT16_SWAP_ALWAYS(*inBuff);
1328         }
1329     }
1330
1331     return srcLen;
1332 }
1333
1334 size_t
1335 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1336                              const wchar_t *src, size_t srcLen) const
1337 {
1338     if ( srcLen == wxNO_LEN )
1339         srcLen = wxWcslen(src) + 1;
1340
1341     srcLen *= BYTES_PER_CHAR;
1342
1343     if ( dst )
1344     {
1345         if ( dstLen < srcLen )
1346             return wxCONV_FAILED;
1347
1348         wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1349         for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
1350         {
1351             *outBuff++ = wxUINT16_SWAP_ALWAYS(*src);
1352         }
1353     }
1354
1355     return srcLen;
1356 }
1357
1358 #else // !WC_UTF16: wchar_t is UTF-32
1359
1360 // ----------------------------------------------------------------------------
1361 // conversions without endianness change
1362 // ----------------------------------------------------------------------------
1363
1364 size_t
1365 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1366                                const char *src, size_t srcLen) const
1367 {
1368     srcLen = GetLength(src, srcLen);
1369     if ( srcLen == wxNO_LEN )
1370         return wxCONV_FAILED;
1371
1372     const size_t inLen = srcLen / BYTES_PER_CHAR;
1373     if ( !dst )
1374     {
1375         // optimization: return maximal space which could be needed for this
1376         // string even if the real size could be smaller if the buffer contains
1377         // any surrogates
1378         return inLen;
1379     }
1380
1381     size_t outLen = 0;
1382     const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1383     for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1384     {
1385         const wxUint32 ch = wxDecodeSurrogate(&inBuff);
1386         if ( !inBuff )
1387             return wxCONV_FAILED;
1388
1389         if ( ++outLen > dstLen )
1390             return wxCONV_FAILED;
1391
1392         *dst++ = ch;
1393     }
1394
1395
1396     return outLen;
1397 }
1398
1399 size_t
1400 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1401                                  const wchar_t *src, size_t srcLen) const
1402 {
1403     if ( srcLen == wxNO_LEN )
1404         srcLen = wxWcslen(src) + 1;
1405
1406     size_t outLen = 0;
1407     wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1408     for ( size_t n = 0; n < srcLen; n++ )
1409     {
1410         wxUint16 cc[2];
1411         const size_t numChars = encode_utf16(*src++, cc);
1412         if ( numChars == wxCONV_FAILED )
1413             return wxCONV_FAILED;
1414
1415         outLen += numChars * BYTES_PER_CHAR;
1416         if ( outBuff )
1417         {
1418             if ( outLen > dstLen )
1419                 return wxCONV_FAILED;
1420
1421             *outBuff++ = cc[0];
1422             if ( numChars == 2 )
1423             {
1424                 // second character of a surrogate
1425                 *outBuff++ = cc[1];
1426             }
1427         }
1428     }
1429
1430     return outLen;
1431 }
1432
1433 // ----------------------------------------------------------------------------
1434 // endian-reversing conversions
1435 // ----------------------------------------------------------------------------
1436
1437 size_t
1438 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1439                            const char *src, size_t srcLen) const
1440 {
1441     srcLen = GetLength(src, srcLen);
1442     if ( srcLen == wxNO_LEN )
1443         return wxCONV_FAILED;
1444
1445     const size_t inLen = srcLen / BYTES_PER_CHAR;
1446     if ( !dst )
1447     {
1448         // optimization: return maximal space which could be needed for this
1449         // string even if the real size could be smaller if the buffer contains
1450         // any surrogates
1451         return inLen;
1452     }
1453
1454     size_t outLen = 0;
1455     const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1456     for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1457     {
1458         wxUint32 ch;
1459         wxUint16 tmp[2];
1460
1461         tmp[0] = wxUINT16_SWAP_ALWAYS(*inBuff);
1462         inBuff++;
1463         tmp[1] = wxUINT16_SWAP_ALWAYS(*inBuff);
1464
1465         const size_t numChars = decode_utf16(tmp, ch);
1466         if ( numChars == wxCONV_FAILED )
1467             return wxCONV_FAILED;
1468
1469         if ( numChars == 2 )
1470             inBuff++;
1471
1472         if ( ++outLen > dstLen )
1473             return wxCONV_FAILED;
1474
1475         *dst++ = ch;
1476     }
1477
1478
1479     return outLen;
1480 }
1481
1482 size_t
1483 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1484                              const wchar_t *src, size_t srcLen) const
1485 {
1486     if ( srcLen == wxNO_LEN )
1487         srcLen = wxWcslen(src) + 1;
1488
1489     size_t outLen = 0;
1490     wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1491     for ( const wchar_t *srcEnd = src + srcLen; src < srcEnd; src++ )
1492     {
1493         wxUint16 cc[2];
1494         const size_t numChars = encode_utf16(*src, cc);
1495         if ( numChars == wxCONV_FAILED )
1496             return wxCONV_FAILED;
1497
1498         outLen += numChars * BYTES_PER_CHAR;
1499         if ( outBuff )
1500         {
1501             if ( outLen > dstLen )
1502                 return wxCONV_FAILED;
1503
1504             *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[0]);
1505             if ( numChars == 2 )
1506             {
1507                 // second character of a surrogate
1508                 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[1]);
1509             }
1510         }
1511     }
1512
1513     return outLen;
1514 }
1515
1516 #endif // WC_UTF16/!WC_UTF16
1517
1518
1519 // ============================================================================
1520 // UTF-32
1521 // ============================================================================
1522
1523 #ifdef WORDS_BIGENDIAN
     // "straight" is the name used below for the converter which leaves the
     // bytes of each character alone and "swap" for the one which reverses
     // them: when WORDS_BIGENDIAN is defined these are the BE and LE classes
1524     #define wxMBConvUTF32straight  wxMBConvUTF32BE
1525     #define wxMBConvUTF32swap      wxMBConvUTF32LE
1526 #else
     // otherwise the roles of the BE and LE classes are exchanged
1527     #define wxMBConvUTF32swap      wxMBConvUTF32BE
1528     #define wxMBConvUTF32straight  wxMBConvUTF32LE
1529 #endif
1530
1531
1532 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE; // global UTF-32LE converter object
1533 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE; // global UTF-32BE converter object
1534
1535 /* static */
1536 size_t wxMBConvUTF32Base::GetLength(const char *src, size_t srcLen)
1537 {
1538     if ( srcLen == wxNO_LEN )
1539     {
1540         // count the number of bytes in input, including the trailing NULs
1541         const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1542         for ( srcLen = 1; *inBuff++; srcLen++ )
1543             ;
1544
1545         srcLen *= BYTES_PER_CHAR;
1546     }
1547     else // we already have the length
1548     {
1549         // we can only convert an entire number of UTF-32 characters
1550         if ( srcLen % BYTES_PER_CHAR )
1551             return wxCONV_FAILED;
1552     }
1553
1554     return srcLen;
1555 }
1556
1557 // case when in-memory representation is UTF-16
1558 #ifdef WC_UTF16
1559
1560 // ----------------------------------------------------------------------------
1561 // conversions without endianness change
1562 // ----------------------------------------------------------------------------
1563
1564 size_t
1565 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1566                                const char *src, size_t srcLen) const
1567 {
1568     srcLen = GetLength(src, srcLen);
1569     if ( srcLen == wxNO_LEN )
1570         return wxCONV_FAILED;
1571
1572     const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1573     const size_t inLen = srcLen / BYTES_PER_CHAR;
1574     size_t outLen = 0;
1575     for ( size_t n = 0; n < inLen; n++ )
1576     {
1577         wxUint16 cc[2];
1578         const size_t numChars = encode_utf16(*inBuff++, cc);
1579         if ( numChars == wxCONV_FAILED )
1580             return wxCONV_FAILED;
1581
1582         outLen += numChars;
1583         if ( dst )
1584         {
1585             if ( outLen > dstLen )
1586                 return wxCONV_FAILED;
1587
1588             *dst++ = cc[0];
1589             if ( numChars == 2 )
1590             {
1591                 // second character of a surrogate
1592                 *dst++ = cc[1];
1593             }
1594         }
1595     }
1596
1597     return outLen;
1598 }
1599
1600 size_t
1601 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1602                                  const wchar_t *src, size_t srcLen) const
1603 {
1604     if ( srcLen == wxNO_LEN )
1605         srcLen = wxWcslen(src) + 1;
1606
1607     if ( !dst )
1608     {
1609         // optimization: return maximal space which could be needed for this
1610         // string instead of the exact amount which could be less if there are
1611         // any surrogates in the input
1612         //
1613         // we consider that surrogates are rare enough to make it worthwhile to
1614         // avoid running the loop below at the cost of slightly extra memory
1615         // consumption
1616         return srcLen * BYTES_PER_CHAR;
1617     }
1618
1619     wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1620     size_t outLen = 0;
1621     for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1622     {
1623         const wxUint32 ch = wxDecodeSurrogate(&src);
1624         if ( !src )
1625             return wxCONV_FAILED;
1626
1627         outLen += BYTES_PER_CHAR;
1628
1629         if ( outLen > dstLen )
1630             return wxCONV_FAILED;
1631
1632         *outBuff++ = ch;
1633     }
1634
1635     return outLen;
1636 }
1637
1638 // ----------------------------------------------------------------------------
1639 // endian-reversing conversions
1640 // ----------------------------------------------------------------------------
1641
1642 size_t
1643 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1644                            const char *src, size_t srcLen) const
1645 {
1646     srcLen = GetLength(src, srcLen);
1647     if ( srcLen == wxNO_LEN )
1648         return wxCONV_FAILED;
1649
1650     const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1651     const size_t inLen = srcLen / BYTES_PER_CHAR;
1652     size_t outLen = 0;
1653     for ( size_t n = 0; n < inLen; n++, inBuff++ )
1654     {
1655         wxUint16 cc[2];
1656         const size_t numChars = encode_utf16(wxUINT32_SWAP_ALWAYS(*inBuff), cc);
1657         if ( numChars == wxCONV_FAILED )
1658             return wxCONV_FAILED;
1659
1660         outLen += numChars;
1661         if ( dst )
1662         {
1663             if ( outLen > dstLen )
1664                 return wxCONV_FAILED;
1665
1666             *dst++ = cc[0];
1667             if ( numChars == 2 )
1668             {
1669                 // second character of a surrogate
1670                 *dst++ = cc[1];
1671             }
1672         }
1673     }
1674
1675     return outLen;
1676 }
1677
1678 size_t
1679 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1680                              const wchar_t *src, size_t srcLen) const
1681 {
1682     if ( srcLen == wxNO_LEN )
1683         srcLen = wxWcslen(src) + 1;
1684
1685     if ( !dst )
1686     {
1687         // optimization: return maximal space which could be needed for this
1688         // string instead of the exact amount which could be less if there are
1689         // any surrogates in the input
1690         //
1691         // we consider that surrogates are rare enough to make it worthwhile to
1692         // avoid running the loop below at the cost of slightly extra memory
1693         // consumption
1694         return srcLen*BYTES_PER_CHAR;
1695     }
1696
1697     wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1698     size_t outLen = 0;
1699     for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1700     {
1701         const wxUint32 ch = wxDecodeSurrogate(&src);
1702         if ( !src )
1703             return wxCONV_FAILED;
1704
1705         outLen += BYTES_PER_CHAR;
1706
1707         if ( outLen > dstLen )
1708             return wxCONV_FAILED;
1709
1710         *outBuff++ = wxUINT32_SWAP_ALWAYS(ch);
1711     }
1712
1713     return outLen;
1714 }
1715
1716 #else // !WC_UTF16: wchar_t is UTF-32
1717
1718 // ----------------------------------------------------------------------------
1719 // conversions without endianness change
1720 // ----------------------------------------------------------------------------
1721
1722 size_t
1723 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1724                                const char *src, size_t srcLen) const
1725 {
1726     // use memcpy() as it should be much faster than hand-written loop
1727     srcLen = GetLength(src, srcLen);
1728     if ( srcLen == wxNO_LEN )
1729         return wxCONV_FAILED;
1730
1731     const size_t inLen = srcLen/BYTES_PER_CHAR;
1732     if ( dst )
1733     {
1734         if ( dstLen < inLen )
1735             return wxCONV_FAILED;
1736
1737         memcpy(dst, src, srcLen);
1738     }
1739
1740     return inLen;
1741 }
1742
1743 size_t
1744 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1745                                  const wchar_t *src, size_t srcLen) const
1746 {
1747     if ( srcLen == wxNO_LEN )
1748         srcLen = wxWcslen(src) + 1;
1749
1750     srcLen *= BYTES_PER_CHAR;
1751
1752     if ( dst )
1753     {
1754         if ( dstLen < srcLen )
1755             return wxCONV_FAILED;
1756
1757         memcpy(dst, src, srcLen);
1758     }
1759
1760     return srcLen;
1761 }
1762
1763 // ----------------------------------------------------------------------------
1764 // endian-reversing conversions
1765 // ----------------------------------------------------------------------------
1766
1767 size_t
1768 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1769                            const char *src, size_t srcLen) const
1770 {
1771     srcLen = GetLength(src, srcLen);
1772     if ( srcLen == wxNO_LEN )
1773         return wxCONV_FAILED;
1774
1775     srcLen /= BYTES_PER_CHAR;
1776
1777     if ( dst )
1778     {
1779         if ( dstLen < srcLen )
1780             return wxCONV_FAILED;
1781
1782         const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1783         for ( size_t n = 0; n < srcLen; n++, inBuff++ )
1784         {
1785             *dst++ = wxUINT32_SWAP_ALWAYS(*inBuff);
1786         }
1787     }
1788
1789     return srcLen;
1790 }
1791
1792 size_t
1793 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1794                              const wchar_t *src, size_t srcLen) const
1795 {
1796     if ( srcLen == wxNO_LEN )
1797         srcLen = wxWcslen(src) + 1;
1798
1799     srcLen *= BYTES_PER_CHAR;
1800
1801     if ( dst )
1802     {
1803         if ( dstLen < srcLen )
1804             return wxCONV_FAILED;
1805
1806         wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1807         for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
1808         {
1809             *outBuff++ = wxUINT32_SWAP_ALWAYS(*src);
1810         }
1811     }
1812
1813     return srcLen;
1814 }
1815
1816 #endif // WC_UTF16/!WC_UTF16
1817
1818
1819 // ============================================================================
1820 // The classes doing conversion using the iconv_xxx() functions
1821 // ============================================================================
1822
1823 #ifdef HAVE_ICONV
1824
1825 // VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
1826 //     E2BIG if output buffer is _exactly_ as big as needed. Such case is
1827 //     (unless there's yet another bug in glibc) the only case when iconv()
1828 //     returns with (size_t)-1 (which means error) and says there are 0 bytes
1829 //     left in the input buffer -- when _real_ error occurs,
1830 //     bytes-left-in-input buffer is non-zero. Hence, this alternative test for
1831 //     iconv() failure.
1832 //     [This bug does not appear in glibc 2.2.]
1833 #if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
1834 #define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
1835                                      (errno != E2BIG || bufLeft != 0))
     // (glibc 2.1 and earlier: an E2BIG error with nothing left in the input
     // buffer is not counted as a failure)
1836 #else
     // everywhere else any (size_t)-1 result is a failure
1837 #define ICONV_FAILED(cres, bufLeft)  (cres == (size_t)-1)
1838 #endif
1839
     // cast x to a pointer to char pointer with the ICONV_CONST qualifier
     // (ICONV_CONST is not defined in this file and is expected to be
     // provided by the build configuration)
1840 #define ICONV_CHAR_CAST(x)  ((ICONV_CONST char **)(x))
1841
     // the value the iconv_t handles are compared with to detect that
     // iconv_open() failed
1842 #define ICONV_T_INVALID ((iconv_t)-1)
1843
     // WC_BSWAP is the byte swapping macro and WC_ENC the font encoding
     // constant corresponding to the size of wchar_t
1844 #if SIZEOF_WCHAR_T == 4
1845     #define WC_BSWAP    wxUINT32_SWAP_ALWAYS
1846     #define WC_ENC      wxFONTENCODING_UTF32
1847 #elif SIZEOF_WCHAR_T == 2
1848     #define WC_BSWAP    wxUINT16_SWAP_ALWAYS
1849     #define WC_ENC      wxFONTENCODING_UTF16
1850 #else // sizeof(wchar_t) != 2 nor 4
1851     // does this ever happen?
1852     #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
1853 #endif
1854
1855 // ----------------------------------------------------------------------------
1856 // wxMBConv_iconv: encapsulates an iconv character set
1857 // ----------------------------------------------------------------------------
1858
1859 class wxMBConv_iconv : public wxMBConv
1860 {
     // wxMBConv implementation which keeps one iconv handle for each
     // conversion direction between a named charset and wide characters
1861 public:
     // name is the name of the multibyte charset as passed to iconv_open();
     // use IsOk() to check whether the conversions could be set up
1862     wxMBConv_iconv(const char *name);
1863     virtual ~wxMBConv_iconv();
1864
1865     // implement base class virtual methods
1866     virtual size_t ToWChar(wchar_t *dst, size_t dstLen,
1867                            const char *src, size_t srcLen = wxNO_LEN) const;
1868     virtual size_t FromWChar(char *dst, size_t dstLen,
1869                              const wchar_t *src, size_t srcLen = wxNO_LEN) const;
1870     virtual size_t GetMBNulLen() const;
1871
1872 #if wxUSE_UNICODE_UTF8
1873     virtual bool IsUTF8() const;
1874 #endif
1875
     // create a new heap-allocated converter for the same charset name and
     // copy the cached GetMBNulLen() result to it
1876     virtual wxMBConv *Clone() const
1877     {
1878         wxMBConv_iconv *p = new wxMBConv_iconv(m_name.ToAscii());
1879         p->m_minMBCharWidth = m_minMBCharWidth;
1880         return p;
1881     }
1882
     // return true only if the handles for both directions are valid
1883     bool IsOk() const
1884         { return (m2w != ICONV_T_INVALID) && (w2m != ICONV_T_INVALID); }
1885
1886 protected:
1887     // the iconv handlers used to translate from multibyte
1888     // to wide char and in the other direction
1889     iconv_t m2w,
1890             w2m;
1891
1892 #if wxUSE_THREADS
1893     // guards access to m2w and w2m objects
1894     wxMutex m_iconvMutex;
1895 #endif
1896
1897 private:
1898     // the name (for iconv_open()) of a wide char charset -- if none is
1899     // available on this machine, it will remain empty
1900     static wxString ms_wcCharsetName;
1901
1902     // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
1903     // different endian-ness than the native one
1904     static bool ms_wcNeedsSwap;
1905
1906
1907     // name of the encoding handled by this conversion
1908     wxString m_name;
1909
1910     // cached result of GetMBNulLen(); set to 0 meaning "unknown"
1911     // initially
1912     size_t m_minMBCharWidth;
1913 };
1914
1915 // make the constructor available for unit testing
1916 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_iconv( const char* name )
1917 {
1918     wxMBConv_iconv* result = new wxMBConv_iconv( name );
1919     if ( !result->IsOk() )
1920     {
1921         delete result;
1922         return 0;
1923     }
1924
1925     return result;
1926 }
1927
// definitions of the static members shared by all wxMBConv_iconv objects:
// the charset name starts out empty and is filled in by the first constructor
// call which finds a working wide char charset
wxString wxMBConv_iconv::ms_wcCharsetName;
// set by the constructor when the charset above has non-native byte order
bool wxMBConv_iconv::ms_wcNeedsSwap = false;
1930
1931 wxMBConv_iconv::wxMBConv_iconv(const char *name)
1932               : m_name(name)
1933 {
1934     m_minMBCharWidth = 0;
1935
1936     // check for charset that represents wchar_t:
1937     if ( ms_wcCharsetName.empty() )
1938     {
1939         wxLogTrace(TRACE_STRCONV, _T("Looking for wide char codeset:"));
1940
1941 #if wxUSE_FONTMAP
1942         const wxChar **names = wxFontMapperBase::GetAllEncodingNames(WC_ENC);
1943 #else // !wxUSE_FONTMAP
1944         static const wxChar *names_static[] =
1945         {
1946 #if SIZEOF_WCHAR_T == 4
1947             _T("UCS-4"),
1948 #elif SIZEOF_WCHAR_T = 2
1949             _T("UCS-2"),
1950 #endif
1951             NULL
1952         };
1953         const wxChar **names = names_static;
1954 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
1955
1956         for ( ; *names && ms_wcCharsetName.empty(); ++names )
1957         {
1958             const wxString nameCS(*names);
1959
1960             // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
1961             wxString nameXE(nameCS);
1962
1963 #ifdef WORDS_BIGENDIAN
1964                 nameXE += _T("BE");
1965 #else // little endian
1966                 nameXE += _T("LE");
1967 #endif
1968
1969             wxLogTrace(TRACE_STRCONV, _T("  trying charset \"%s\""),
1970                        nameXE.c_str());
1971
1972             m2w = iconv_open(nameXE.ToAscii(), name);
1973             if ( m2w == ICONV_T_INVALID )
1974             {
1975                 // try charset w/o bytesex info (e.g. "UCS4")
1976                 wxLogTrace(TRACE_STRCONV, _T("  trying charset \"%s\""),
1977                            nameCS.c_str());
1978                 m2w = iconv_open(nameCS.ToAscii(), name);
1979
1980                 // and check for bytesex ourselves:
1981                 if ( m2w != ICONV_T_INVALID )
1982                 {
1983                     char    buf[2], *bufPtr;
1984                     wchar_t wbuf[2];
1985                     size_t  insz, outsz;
1986                     size_t  res;
1987
1988                     buf[0] = 'A';
1989                     buf[1] = 0;
1990                     wbuf[0] = 0;
1991                     insz = 2;
1992                     outsz = SIZEOF_WCHAR_T * 2;
1993                     char* wbufPtr = (char*)wbuf;
1994                     bufPtr = buf;
1995
1996                     res = iconv(
1997                         m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
1998                         &wbufPtr, &outsz);
1999
2000                     if (ICONV_FAILED(res, insz))
2001                     {
2002                         wxLogLastError(wxT("iconv"));
2003                         wxLogError(_("Conversion to charset '%s' doesn't work."),
2004                                    nameCS.c_str());
2005                     }
2006                     else // ok, can convert to this encoding, remember it
2007                     {
2008                         ms_wcCharsetName = nameCS;
2009                         ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
2010                     }
2011                 }
2012             }
2013             else // use charset not requiring byte swapping
2014             {
2015                 ms_wcCharsetName = nameXE;
2016             }
2017         }
2018
2019         wxLogTrace(TRACE_STRCONV,
2020                    wxT("iconv wchar_t charset is \"%s\"%s"),
2021                    ms_wcCharsetName.empty() ? wxString("<none>")
2022                                             : ms_wcCharsetName,
2023                    ms_wcNeedsSwap ? _T(" (needs swap)")
2024                                   : _T(""));
2025     }
2026     else // we already have ms_wcCharsetName
2027     {
2028         m2w = iconv_open(ms_wcCharsetName.ToAscii(), name);
2029     }
2030
2031     if ( ms_wcCharsetName.empty() )
2032     {
2033         w2m = ICONV_T_INVALID;
2034     }
2035     else
2036     {
2037         w2m = iconv_open(name, ms_wcCharsetName.ToAscii());
2038         if ( w2m == ICONV_T_INVALID )
2039         {
2040             wxLogTrace(TRACE_STRCONV,
2041                        wxT("\"%s\" -> \"%s\" works but not the converse!?"),
2042                        ms_wcCharsetName.c_str(), name);
2043         }
2044     }
2045 }
2046
2047 wxMBConv_iconv::~wxMBConv_iconv()
2048 {
2049     if ( m2w != ICONV_T_INVALID )
2050         iconv_close(m2w);
2051     if ( w2m != ICONV_T_INVALID )
2052         iconv_close(w2m);
2053 }
2054
2055 size_t
2056 wxMBConv_iconv::ToWChar(wchar_t *dst, size_t dstLen,
2057                         const char *src, size_t srcLen) const
2058 {
2059     if ( srcLen == wxNO_LEN )
2060     {
2061         // find the string length: notice that must be done differently for
2062         // NUL-terminated strings and UTF-16/32 which are terminated with 2/4
2063         // consecutive NULs
2064         const size_t nulLen = GetMBNulLen();
2065         switch ( nulLen )
2066         {
2067             default:
2068                 return wxCONV_FAILED;
2069
2070             case 1:
2071                 srcLen = strlen(src); // arguably more optimized than our version
2072                 break;
2073
2074             case 2:
2075             case 4:
2076                 // for UTF-16/32 not only we need to have 2/4 consecutive NULs
2077                 // but they also have to start at character boundary and not
2078                 // span two adjacent characters
2079                 const char *p;
2080                 for ( p = src; NotAllNULs(p, nulLen); p += nulLen )
2081                     ;
2082                 srcLen = p - src;
2083                 break;
2084         }
2085
2086         // when we're determining the length of the string ourselves we count
2087         // the terminating NUL(s) as part of it and always NUL-terminate the
2088         // output
2089         srcLen += nulLen;
2090     }
2091
2092     // we express length in the number of (wide) characters but iconv always
2093     // counts buffer sizes it in bytes
2094     dstLen *= SIZEOF_WCHAR_T;
2095
2096 #if wxUSE_THREADS
2097     // NB: iconv() is MT-safe, but each thread must use its own iconv_t handle.
2098     //     Unfortunately there are a couple of global wxCSConv objects such as
2099     //     wxConvLocal that are used all over wx code, so we have to make sure
2100     //     the handle is used by at most one thread at the time. Otherwise
2101     //     only a few wx classes would be safe to use from non-main threads
2102     //     as MB<->WC conversion would fail "randomly".
2103     wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
2104 #endif // wxUSE_THREADS
2105
2106     size_t res, cres;
2107     const char *pszPtr = src;
2108
2109     if ( dst )
2110     {
2111         char* bufPtr = (char*)dst;
2112
2113         // have destination buffer, convert there
2114         cres = iconv(m2w,
2115                      ICONV_CHAR_CAST(&pszPtr), &srcLen,
2116                      &bufPtr, &dstLen);
2117         res = dstLen - (dstLen / SIZEOF_WCHAR_T);
2118
2119         if (ms_wcNeedsSwap)
2120         {
2121             // convert to native endianness
2122             for ( unsigned i = 0; i < res; i++ )
2123                 dst[i] = WC_BSWAP(dst[i]);
2124         }
2125
2126         // NUL-terminate the string if there is any space left
2127         if (res < dstLen)
2128             dst[res] = 0;
2129     }
2130     else // no destination buffer
2131     {
2132         // convert using temp buffer to calculate the size of the buffer needed
2133         wchar_t tbuf[8];
2134         res = 0;
2135
2136         do
2137         {
2138             char* bufPtr = (char*)tbuf;
2139             dstLen = 8 * SIZEOF_WCHAR_T;
2140
2141             cres = iconv(m2w,
2142                          ICONV_CHAR_CAST(&pszPtr), &srcLen,
2143                          &bufPtr, &dstLen );
2144
2145             res += 8 - (dstLen / SIZEOF_WCHAR_T);
2146         }
2147         while ((cres == (size_t)-1) && (errno == E2BIG));
2148     }
2149
2150     if (ICONV_FAILED(cres, srcLen))
2151     {
2152         //VS: it is ok if iconv fails, hence trace only
2153         wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
2154         return wxCONV_FAILED;
2155     }
2156
2157     return res;
2158 }
2159
2160 size_t wxMBConv_iconv::FromWChar(char *dst, size_t dstLen,
2161                                  const wchar_t *src, size_t srcLen) const
2162 {
2163 #if wxUSE_THREADS
2164     // NB: explained in MB2WC
2165     wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
2166 #endif
2167
2168     if ( srcLen == wxNO_LEN )
2169         srcLen = wxWcslen(src);
2170
2171     size_t inbuflen = srcLen * SIZEOF_WCHAR_T;
2172     size_t outbuflen = dstLen;
2173     size_t res, cres;
2174
2175     wchar_t *tmpbuf = 0;
2176
2177     if (ms_wcNeedsSwap)
2178     {
2179         // need to copy to temp buffer to switch endianness
2180         // (doing WC_BSWAP twice on the original buffer won't help, as it
2181         //  could be in read-only memory, or be accessed in some other thread)
2182         tmpbuf = (wchar_t *)malloc(inbuflen + SIZEOF_WCHAR_T);
2183         for ( size_t i = 0; i < srcLen; i++ )
2184             tmpbuf[i] = WC_BSWAP(src[i]);
2185
2186         tmpbuf[srcLen] = L'\0';
2187         src = tmpbuf;
2188     }
2189
2190     char* inbuf = (char*)src;
2191     if ( dst )
2192     {
2193         // have destination buffer, convert there
2194         cres = iconv(w2m, ICONV_CHAR_CAST(&inbuf), &inbuflen, &dst, &outbuflen);
2195
2196         res = dstLen - outbuflen;
2197
2198         // NB: iconv was given only wcslen(src) characters on input, and so
2199         //     it couldn't convert the trailing zero. Let's do it ourselves
2200         //     if there's some room left for it in the output buffer.
2201         if (res < dstLen)
2202             dst[0] = 0;
2203     }
2204     else // no destination buffer
2205     {
2206         // convert using temp buffer to calculate the size of the buffer needed
2207         char tbuf[16];
2208         res = 0;
2209         do
2210         {
2211             dst = tbuf;
2212             outbuflen = 16;
2213
2214             cres = iconv(w2m, ICONV_CHAR_CAST(&inbuf), &inbuflen, &dst, &outbuflen);
2215
2216             res += 16 - outbuflen;
2217         }
2218         while ((cres == (size_t)-1) && (errno == E2BIG));
2219     }
2220
2221     if (ms_wcNeedsSwap)
2222     {
2223         free(tmpbuf);
2224     }
2225
2226     if (ICONV_FAILED(cres, inbuflen))
2227     {
2228         wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
2229         return wxCONV_FAILED;
2230     }
2231
2232     return res;
2233 }
2234
2235 size_t wxMBConv_iconv::GetMBNulLen() const
2236 {
2237     if ( m_minMBCharWidth == 0 )
2238     {
2239         wxMBConv_iconv * const self = wxConstCast(this, wxMBConv_iconv);
2240
2241 #if wxUSE_THREADS
2242         // NB: explained in MB2WC
2243         wxMutexLocker lock(self->m_iconvMutex);
2244 #endif
2245
2246         const wchar_t *wnul = L"";
2247         char buf[8]; // should be enough for NUL in any encoding
2248         size_t inLen = sizeof(wchar_t),
2249                outLen = WXSIZEOF(buf);
2250         char *inBuff = (char *)wnul;
2251         char *outBuff = buf;
2252         if ( iconv(w2m, ICONV_CHAR_CAST(&inBuff), &inLen, &outBuff, &outLen) == (size_t)-1 )
2253         {
2254             self->m_minMBCharWidth = (size_t)-1;
2255         }
2256         else // ok
2257         {
2258             self->m_minMBCharWidth = outBuff - buf;
2259         }
2260     }
2261
2262     return m_minMBCharWidth;
2263 }
2264
2265 #if wxUSE_UNICODE_UTF8
2266 bool wxMBConv_iconv::IsUTF8() const
2267 {
2268     return wxStricmp(m_name, "UTF-8") == 0 ||
2269            wxStricmp(m_name, "UTF8") == 0;
2270 }
2271 #endif
2272
2273 #endif // HAVE_ICONV
2274
2275
2276 // ============================================================================
2277 // Win32 conversion classes
2278 // ============================================================================
2279
2280 #ifdef wxHAVE_WIN32_MB2WC
2281
2282 // from utils.cpp
2283 #if wxUSE_FONTMAP
2284 extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const char *charset);
2285 extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
2286 #endif
2287
// Conversion between a Windows code page and wchar_t implemented using the
// Win32 ::MultiByteToWideChar() and ::WideCharToMultiByte() functions.
//
// As these functions may silently produce approximate results, both
// directions either use the flags making them fail on invalid input, when
// this is possible, or verify the result by converting it back and comparing
// it with the original string.
class wxMBConv_win32 : public wxMBConv
{
public:
    // default ctor uses the ANSI code page (CP_ACP)
    wxMBConv_win32()
    {
        m_CodePage = CP_ACP;
        m_minMBCharWidth = 0;
    }

    // copy ctor: copies the code page and the cached NUL length
    wxMBConv_win32(const wxMBConv_win32& conv)
        : wxMBConv()
    {
        m_CodePage = conv.m_CodePage;
        m_minMBCharWidth = conv.m_minMBCharWidth;
    }

#if wxUSE_FONTMAP
    // create the conversion for the charset with the given name, use IsOk()
    // to check if the code page for it was found
    wxMBConv_win32(const char* name)
    {
        m_CodePage = wxCharsetToCodepage(name);
        m_minMBCharWidth = 0;
    }

    // create the conversion for the given encoding, use IsOk() to check if
    // the code page for it was found
    wxMBConv_win32(wxFontEncoding encoding)
    {
        m_CodePage = wxEncodingToCodepage(encoding);
        m_minMBCharWidth = 0;
    }
#endif // wxUSE_FONTMAP

    // Convert the NUL-terminated multibyte string psz to wide characters.
    //
    // If buf is NULL, only the length of the result is computed, otherwise at
    // most n wide characters are written to buf. Returns the length of the
    // converted string, not counting the terminating NUL, or wxCONV_FAILED.
    virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
    {
        // note that we have to use MB_ERR_INVALID_CHARS flag as it without it
        // the behaviour is not compatible with the Unix version (using iconv)
        // and break the library itself, e.g. wxTextInputStream::NextChar()
        // wouldn't work if reading an incomplete MB char didn't result in an
        // error
        //
        // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
        // Win XP or newer and it is not supported for UTF-[78] so we always
        // use our own conversions in this case. See
        //     http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
        //     http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
        if ( m_CodePage == CP_UTF8 )
        {
            return wxMBConvUTF8().MB2WC(buf, psz, n);
        }

        if ( m_CodePage == CP_UTF7 )
        {
            return wxMBConvUTF7().MB2WC(buf, psz, n);
        }

        // only ask for the strict error checking when the code page and the
        // system are among those for which we use it
        int flags = 0;
        if ( (m_CodePage < 50000 && m_CodePage != CP_SYMBOL) &&
                IsAtLeastWin2kSP4() )
        {
            flags = MB_ERR_INVALID_CHARS;
        }

        const size_t len = ::MultiByteToWideChar
                             (
                                m_CodePage,     // code page
                                flags,          // flags: fall on error
                                psz,            // input string
                                -1,             // its length (NUL-terminated)
                                buf,            // output string
                                buf ? n : 0     // size of output buffer
                             );
        if ( !len )
        {
            // function totally failed
            return wxCONV_FAILED;
        }

        // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
        // check if we succeeded, by doing a double trip:
        if ( !flags && buf )
        {
            const size_t mbLen = strlen(psz);
            wxCharBuffer mbBuf(mbLen);
            if ( ::WideCharToMultiByte
                   (
                      m_CodePage,
                      0,
                      buf,
                      -1,
                      mbBuf.data(),
                      mbLen + 1,        // size in bytes, not length
                      NULL,
                      NULL
                   ) == 0 ||
                  strcmp(mbBuf, psz) != 0 )
            {
                // we didn't obtain the same thing we started from, hence
                // the conversion was lossy and we consider that it failed
                return wxCONV_FAILED;
            }
        }

        // note that it returns count of written chars for buf != NULL and size
        // of the needed buffer for buf == NULL so in either case the length of
        // the string (which never includes the terminating NUL) is one less
        return len - 1;
    }

    // Convert the NUL-terminated wide string pwz to this code page.
    //
    // If buf is NULL, only the length of the result is computed, otherwise at
    // most n bytes are written to buf. Returns the length of the converted
    // string, not counting the terminating NUL, or wxCONV_FAILED if the
    // conversion failed or would have been lossy.
    virtual size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
    {
        /*
            we have a problem here: by default, WideCharToMultiByte() may
            replace characters unrepresentable in the target code page with bad
            quality approximations such as turning "1/2" symbol (U+00BD) into
            "1" for the code pages which don't have it and we, obviously, want
            to avoid this at any price

            the trouble is that this function does it _silently_, i.e. it won't
            even tell us whether it did or not... Win98/2000 and higher provide
            WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
            we have to resort to a round trip, i.e. check that converting back
            results in the same string -- this is, of course, expensive but
            otherwise we simply can't be sure to not garble the data.
         */

        // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
        // it doesn't work with CJK encodings (which we test for rather roughly
        // here...) nor with UTF-7/8 nor, of course, with Windows versions not
        // supporting it
        BOOL usedDef wxDUMMY_INITIALIZE(false);
        BOOL *pUsedDef;
        int flags;
        if ( CanUseNoBestFit() && m_CodePage < 50000 )
        {
            // it's our lucky day
            flags = WC_NO_BEST_FIT_CHARS;
            pUsedDef = &usedDef;
        }
        else // old system or unsupported encoding
        {
            flags = 0;
            pUsedDef = NULL;
        }

        const size_t len = ::WideCharToMultiByte
                             (
                                m_CodePage,     // code page
                                flags,          // either none or no best fit
                                pwz,            // input string
                                -1,             // it is (wide) NUL-terminated
                                buf,            // output buffer
                                buf ? n : 0,    // and its size
                                NULL,           // default "replacement" char
                                pUsedDef        // [out] was it used?
                             );

        if ( !len )
        {
            // function totally failed
            return wxCONV_FAILED;
        }

        // we did something, check if we really succeeded
        if ( flags )
        {
            // check if the conversion failed, i.e. if any replacements
            // were done
            if ( usedDef )
                return wxCONV_FAILED;
        }
        else // we must resort to double tripping...
        {
            // first we need to ensure that we really have the MB data: this is
            // not the case if we're called with NULL buffer, in which case we
            // need to do the conversion yet again
            wxCharBuffer bufDef;
            if ( !buf )
            {
                bufDef = wxCharBuffer(len);
                buf = bufDef.data();
                if ( !::WideCharToMultiByte(m_CodePage, flags, pwz, -1,
                                            buf, len, NULL, NULL) )
                    return wxCONV_FAILED;
            }

            // convert the multibyte data back and compare the result with
            // the string we started from
            if ( !n )
                n = wcslen(pwz);
            wxWCharBuffer wcBuf(n);
            if ( MB2WC(wcBuf.data(), buf, n + 1) == wxCONV_FAILED ||
                    wcscmp(wcBuf, pwz) != 0 )
            {
                // we didn't obtain the same thing we started from, hence
                // the conversion was lossy and we consider that it failed
                return wxCONV_FAILED;
            }
        }

        // see the comment above for the reason of "len - 1"
        return len - 1;
    }

    // Return the number of bytes used for NUL in this code page (1, 2 or 4)
    // or (size_t)-1 if it couldn't be determined; the value is computed on
    // the first call and cached in m_minMBCharWidth.
    virtual size_t GetMBNulLen() const
    {
        if ( m_minMBCharWidth == 0 )
        {
            int len = ::WideCharToMultiByte
                        (
                            m_CodePage,     // code page
                            0,              // no flags
                            L"",            // input string
                            1,              // translate just the NUL
                            NULL,           // output buffer
                            0,              // and its size
                            NULL,           // no replacement char
                            NULL            // [out] don't care if it was used
                        );

            // this method is const but we need to update the cached value
            wxMBConv_win32 * const self = wxConstCast(this, wxMBConv_win32);
            switch ( len )
            {
                default:
                    wxLogDebug(_T("Unexpected NUL length %d"), len);
                    self->m_minMBCharWidth = (size_t)-1;
                    break;

                case 0:
                    // the conversion failed
                    self->m_minMBCharWidth = (size_t)-1;
                    break;

                case 1:
                case 2:
                case 4:
                    self->m_minMBCharWidth = len;
                    break;
            }
        }

        return m_minMBCharWidth;
    }

    virtual wxMBConv *Clone() const { return new wxMBConv_win32(*this); }

    // the code page is -1 if it couldn't be determined
    bool IsOk() const { return m_CodePage != -1; }

private:
    // Return true if the system version is one of those for which we use
    // WC_NO_BEST_FIT_CHARS in WC2MB(); the result of the version check is
    // computed once and cached in a static variable.
    static bool CanUseNoBestFit()
    {
        // -1 means "not checked yet", otherwise it is 0 or 1
        static int s_isWin98Or2k = -1;

        if ( s_isWin98Or2k == -1 )
        {
            int verMaj, verMin;
            switch ( wxGetOsVersion(&verMaj, &verMin) )
            {
                case wxOS_WINDOWS_9X:
                    s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
                    break;

                case wxOS_WINDOWS_NT:
                    s_isWin98Or2k = verMaj >= 5;
                    break;

                default:
                    // unknown: be conservative by default
                    s_isWin98Or2k = 0;
                    break;
            }

            wxASSERT_MSG( s_isWin98Or2k != -1, _T("should be set above") );
        }

        return s_isWin98Or2k == 1;
    }

    // Return true if the system version is one of those for which we use
    // MB_ERR_INVALID_CHARS in MB2WC() (always false under Windows CE); the
    // result of the version check is computed once and cached in a static
    // variable.
    static bool IsAtLeastWin2kSP4()
    {
#ifdef __WXWINCE__
        return false;
#else
        // -1 means "not checked yet", otherwise it is 0 or 1
        static int s_isAtLeastWin2kSP4 = -1;

        if ( s_isAtLeastWin2kSP4 == -1 )
        {
            OSVERSIONINFOEX ver;

            memset(&ver, 0, sizeof(ver));
            ver.dwOSVersionInfoSize = sizeof(ver);
            GetVersionEx((OSVERSIONINFO*)&ver);

            s_isAtLeastWin2kSP4 =
              ((ver.dwMajorVersion > 5) || // Vista+
               (ver.dwMajorVersion == 5 && ver.dwMinorVersion > 0) || // XP/2003
               (ver.dwMajorVersion == 5 && ver.dwMinorVersion == 0 &&
               ver.wServicePackMajor >= 4)) // 2000 SP4+
              ? 1 : 0;
        }

        return s_isAtLeastWin2kSP4 == 1;
#endif
    }


    // the code page we're working with
    long m_CodePage;

    // cached result of GetMBNulLen(), set to 0 initially meaning
    // "unknown"
    size_t m_minMBCharWidth;
};
2595
2596 #endif // wxHAVE_WIN32_MB2WC
2597
2598
2599 // ============================================================================
2600 // wxEncodingConverter based conversion classes
2601 // ============================================================================
2602
2603 #if wxUSE_FONTMAP
2604
2605 class wxMBConv_wxwin : public wxMBConv
2606 {
2607 private:
2608     void Init()
2609     {
2610         // Refuse to use broken wxEncodingConverter code for Mac-specific encodings.
2611         // The wxMBConv_cf class does a better job.
2612         m_ok = (m_enc < wxFONTENCODING_MACMIN || m_enc > wxFONTENCODING_MACMAX) &&
2613                m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
2614                w2m.Init(wxFONTENCODING_UNICODE, m_enc);
2615     }
2616
2617 public:
2618     // temporarily just use wxEncodingConverter stuff,
2619     // so that it works while a better implementation is built
2620     wxMBConv_wxwin(const char* name)
2621     {
2622         if (name)
2623             m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
2624         else
2625             m_enc = wxFONTENCODING_SYSTEM;
2626
2627         Init();
2628     }
2629
2630     wxMBConv_wxwin(wxFontEncoding enc)
2631     {
2632         m_enc = enc;
2633
2634         Init();
2635     }
2636
2637     size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
2638     {
2639         size_t inbuf = strlen(psz);
2640         if (buf)
2641         {
2642             if (!m2w.Convert(psz, buf))
2643                 return wxCONV_FAILED;
2644         }
2645         return inbuf;
2646     }
2647
2648     size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
2649     {
2650         const size_t inbuf = wxWcslen(psz);
2651         if (buf)
2652         {
2653             if (!w2m.Convert(psz, buf))
2654                 return wxCONV_FAILED;
2655         }
2656
2657         return inbuf;
2658     }
2659
2660     virtual size_t GetMBNulLen() const
2661     {
2662         switch ( m_enc )
2663         {
2664             case wxFONTENCODING_UTF16BE:
2665             case wxFONTENCODING_UTF16LE:
2666                 return 2;
2667
2668             case wxFONTENCODING_UTF32BE:
2669             case wxFONTENCODING_UTF32LE:
2670                 return 4;
2671
2672             default:
2673                 return 1;
2674         }
2675     }
2676
2677     virtual wxMBConv *Clone() const { return new wxMBConv_wxwin(m_enc); }
2678
2679     bool IsOk() const { return m_ok; }
2680
2681 public:
2682     wxFontEncoding m_enc;
2683     wxEncodingConverter m2w, w2m;
2684
2685 private:
2686     // were we initialized successfully?
2687     bool m_ok;
2688
2689     DECLARE_NO_COPY_CLASS(wxMBConv_wxwin)
2690 };
2691
2692 // make the constructors available for unit testing
2693 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_wxwin( const char* name )
2694 {
2695     wxMBConv_wxwin* result = new wxMBConv_wxwin( name );
2696     if ( !result->IsOk() )
2697     {
2698         delete result;
2699         return 0;
2700     }
2701
2702     return result;
2703 }
2704
2705 #endif // wxUSE_FONTMAP
2706
2707 // ============================================================================
2708 // wxCSConv implementation
2709 // ============================================================================
2710
2711 void wxCSConv::Init()
2712 {
2713     m_name = NULL;
2714     m_convReal =  NULL;
2715     m_deferred = true;
2716 }
2717
2718 wxCSConv::wxCSConv(const wxString& charset)
2719 {
2720     Init();
2721
2722     if ( !charset.empty() )
2723     {
2724         SetName(charset.ToAscii());
2725     }
2726
2727 #if wxUSE_FONTMAP
2728     m_encoding = wxFontMapperBase::GetEncodingFromName(charset);
2729 #else
2730     m_encoding = wxFONTENCODING_SYSTEM;
2731 #endif
2732 }
2733
2734 wxCSConv::wxCSConv(wxFontEncoding encoding)
2735 {
2736     if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
2737     {
2738         wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
2739
2740         encoding = wxFONTENCODING_SYSTEM;
2741     }
2742
2743     Init();
2744
2745     m_encoding = encoding;
2746 }
2747
2748 wxCSConv::~wxCSConv()
2749 {
2750     Clear();
2751 }
2752
2753 wxCSConv::wxCSConv(const wxCSConv& conv)
2754         : wxMBConv()
2755 {
2756     Init();
2757
2758     SetName(conv.m_name);
2759     m_encoding = conv.m_encoding;
2760 }
2761
2762 wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
2763 {
2764     Clear();
2765
2766     SetName(conv.m_name);
2767     m_encoding = conv.m_encoding;
2768
2769     return *this;
2770 }
2771
2772 void wxCSConv::Clear()
2773 {
2774     free(m_name);
2775     delete m_convReal;
2776
2777     m_name = NULL;
2778     m_convReal = NULL;
2779 }
2780
2781 void wxCSConv::SetName(const char *charset)
2782 {
2783     if (charset)
2784     {
2785         m_name = wxStrdup(charset);
2786         m_deferred = true;
2787     }
2788 }
2789
2790 #if wxUSE_FONTMAP
2791
// hash map type associating a charset name with a font encoding
WX_DECLARE_HASH_MAP( wxFontEncoding, wxString, wxIntegerHash, wxIntegerEqual,
                     wxEncodingNameCache );

// cache used by wxCSConv::DoCreate(): for each encoding already tried it
// contains the name of the charset for which wxMBConv_iconv could be created
// or an empty string if none of the names of this encoding worked
static wxEncodingNameCache gs_nameCache;
2796 #endif
2797
2798 wxMBConv *wxCSConv::DoCreate() const
2799 {
2800 #if wxUSE_FONTMAP
2801     wxLogTrace(TRACE_STRCONV,
2802                wxT("creating conversion for %s"),
2803                (m_name ? m_name
2804                        : (const char*)wxFontMapperBase::GetEncodingName(m_encoding).mb_str()));
2805 #endif // wxUSE_FONTMAP
2806
2807     // check for the special case of ASCII or ISO8859-1 charset: as we have
2808     // special knowledge of it anyhow, we don't need to create a special
2809     // conversion object
2810     if ( m_encoding == wxFONTENCODING_ISO8859_1 ||
2811             m_encoding == wxFONTENCODING_DEFAULT )
2812     {
2813         // don't convert at all
2814         return NULL;
2815     }
2816
2817     // we trust OS to do conversion better than we can so try external
2818     // conversion methods first
2819     //
2820     // the full order is:
2821     //      1. OS conversion (iconv() under Unix or Win32 API)
2822     //      2. hard coded conversions for UTF
2823     //      3. wxEncodingConverter as fall back
2824
2825     // step (1)
2826 #ifdef HAVE_ICONV
2827 #if !wxUSE_FONTMAP
2828     if ( m_name )
2829 #endif // !wxUSE_FONTMAP
2830     {
2831 #if wxUSE_FONTMAP
2832         wxFontEncoding encoding(m_encoding);
2833 #endif
2834
2835         if ( m_name )
2836         {
2837             wxMBConv_iconv *conv = new wxMBConv_iconv(m_name);
2838             if ( conv->IsOk() )
2839                 return conv;
2840
2841             delete conv;
2842
2843 #if wxUSE_FONTMAP
2844             encoding =
2845                 wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
2846 #endif // wxUSE_FONTMAP
2847         }
2848 #if wxUSE_FONTMAP
2849         {
2850             const wxEncodingNameCache::iterator it = gs_nameCache.find(encoding);
2851             if ( it != gs_nameCache.end() )
2852             {
2853                 if ( it->second.empty() )
2854                     return NULL;
2855
2856                 wxMBConv_iconv *conv = new wxMBConv_iconv(it->second.ToAscii());
2857                 if ( conv->IsOk() )
2858                     return conv;
2859
2860                 delete conv;
2861             }
2862
2863             const wxChar** names = wxFontMapperBase::GetAllEncodingNames(encoding);
2864             // CS : in case this does not return valid names (eg for MacRoman)
2865             // encoding got a 'failure' entry in the cache all the same,
2866             // although it just has to be created using a different method, so
2867             // only store failed iconv creation attempts (or perhaps we
2868             // shouldn't do this at all ?)
2869             if ( names[0] != NULL )
2870             {
2871                 for ( ; *names; ++names )
2872                 {
2873                     // FIXME-UTF8: wxFontMapperBase::GetAllEncodingNames()
2874                     //             will need changes that will obsolete this
2875                     wxString name(*names);
2876                     wxMBConv_iconv *conv = new wxMBConv_iconv(name.ToAscii());
2877                     if ( conv->IsOk() )
2878                     {
2879                         gs_nameCache[encoding] = *names;
2880                         return conv;
2881                     }
2882
2883                     delete conv;
2884                 }
2885
2886                 gs_nameCache[encoding] = _T(""); // cache the failure
2887             }
2888         }
2889 #endif // wxUSE_FONTMAP
2890     }
2891 #endif // HAVE_ICONV
2892
2893 #ifdef wxHAVE_WIN32_MB2WC
2894     {
2895 #if wxUSE_FONTMAP
2896         wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
2897                                       : new wxMBConv_win32(m_encoding);
2898         if ( conv->IsOk() )
2899             return conv;
2900
2901         delete conv;
2902 #else
2903         return NULL;
2904 #endif
2905     }
2906 #endif // wxHAVE_WIN32_MB2WC
2907
2908 #ifdef __DARWIN__
2909     {
2910         // leave UTF16 and UTF32 to the built-ins of wx
2911         if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
2912             ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
2913         {
2914 #if wxUSE_FONTMAP
2915             wxMBConv_cf *conv = m_name ? new wxMBConv_cf(m_name)
2916                                           : new wxMBConv_cf(m_encoding);
2917 #else
2918             wxMBConv_cf *conv = new wxMBConv_cf(m_encoding);
2919 #endif
2920
2921             if ( conv->IsOk() )
2922                  return conv;
2923
2924             delete conv;
2925         }
2926     }
2927 #endif // __DARWIN__
2928
2929     // step (2)
2930     wxFontEncoding enc = m_encoding;
2931 #if wxUSE_FONTMAP
2932     if ( enc == wxFONTENCODING_SYSTEM && m_name )
2933     {
2934         // use "false" to suppress interactive dialogs -- we can be called from
2935         // anywhere and popping up a dialog from here is the last thing we want to
2936         // do
2937         enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
2938     }
2939 #endif // wxUSE_FONTMAP
2940
2941     switch ( enc )
2942     {
2943         case wxFONTENCODING_UTF7:
2944              return new wxMBConvUTF7;
2945
2946         case wxFONTENCODING_UTF8:
2947              return new wxMBConvUTF8;
2948
2949         case wxFONTENCODING_UTF16BE:
2950              return new wxMBConvUTF16BE;
2951
2952         case wxFONTENCODING_UTF16LE:
2953              return new wxMBConvUTF16LE;
2954
2955         case wxFONTENCODING_UTF32BE:
2956              return new wxMBConvUTF32BE;
2957
2958         case wxFONTENCODING_UTF32LE:
2959              return new wxMBConvUTF32LE;
2960
2961         default:
2962              // nothing to do but put here to suppress gcc warnings
2963              break;
2964     }
2965
2966     // step (3)
2967 #if wxUSE_FONTMAP
2968     {
2969         wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
2970                                       : new wxMBConv_wxwin(m_encoding);
2971         if ( conv->IsOk() )
2972             return conv;
2973
2974         delete conv;
2975     }
2976 #endif // wxUSE_FONTMAP
2977
2978     // NB: This is a hack to prevent deadlock. What could otherwise happen
2979     //     in Unicode build: wxConvLocal creation ends up being here
2980     //     because of some failure and logs the error. But wxLog will try to
2981     //     attach a timestamp, for which it will need wxConvLocal (to convert
2982     //     time to char* and then wchar_t*), but that fails, tries to log the
2983     //     error, but wxLog has an (already locked) critical section that
2984     //     guards the static buffer.
2985     static bool alreadyLoggingError = false;
2986     if (!alreadyLoggingError)
2987     {
2988         alreadyLoggingError = true;
2989         wxLogError(_("Cannot convert from the charset '%s'!"),
2990                    m_name ? m_name
2991                       :
2992 #if wxUSE_FONTMAP
2993                          (const char*)wxFontMapperBase::GetEncodingDescription(m_encoding).ToAscii()
2994 #else // !wxUSE_FONTMAP
2995                          (const char*)wxString::Format(_("encoding %i"), m_encoding).ToAscii()
2996 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
2997               );
2998
2999         alreadyLoggingError = false;
3000     }
3001
3002     return NULL;
3003 }
3004
3005 void wxCSConv::CreateConvIfNeeded() const
3006 {
3007     // the real converter is only set up on first use, later calls do nothing
3008     if ( !m_deferred )
3009         return;
3010
3011     // we're const but must update our members, so cast constness away
3012     wxCSConv * const mutableThis = const_cast<wxCSConv *>(this);
3013
3014     // when neither a charset name nor an explicit encoding was given, fall
3015     // back to the default encoding of this system
3016     if ( !m_name && m_encoding == wxFONTENCODING_SYSTEM )
3017     {
3018 #if wxUSE_INTL
3019         mutableThis->m_encoding = wxLocale::GetSystemEncoding();
3020 #else // !wxUSE_INTL: can't ask the locale, so use a reasonable default
3021         mutableThis->m_encoding = wxFONTENCODING_ISO8859_1;
3022 #endif // wxUSE_INTL/!wxUSE_INTL
3023     }
3024     mutableThis->m_convReal = DoCreate();
3025     mutableThis->m_deferred = false;
3026 }
3027
3028 bool wxCSConv::IsOk() const
3029 {
3030     // the answer depends on the real converter, so set it up if needed first
3031     CreateConvIfNeeded();
3032
3033     // ISO8859-1 is special: no m_convReal is created for it but it is always
3034     // ok as we do it ourselves; otherwise a non-NULL m_convReal is enough
3035     // because its own IsOk() was already checked when it was created
3036     const bool isLatin1 = m_encoding == wxFONTENCODING_ISO8859_1;
3037
3038     return isLatin1 || m_convReal != NULL;
3039 }
3040
3041 size_t wxCSConv::ToWChar(wchar_t *dst, size_t dstLen,
3042                          const char *src, size_t srcLen) const
3043 {
3044     // use the real converter if we have one and the latin-1 (direct)
3045     // conversion, done by the base class version, otherwise
3046     CreateConvIfNeeded();
3047
3048     return m_convReal
3049             ? m_convReal->ToWChar(dst, dstLen, src, srcLen)
3050             : wxMBConv::ToWChar(dst, dstLen, src, srcLen);
3051 }
3052
3053 size_t wxCSConv::FromWChar(char *dst, size_t dstLen,
3054                            const wchar_t *src, size_t srcLen) const
3055 {
3056     // use the real converter if we have one and the latin-1 (direct)
3057     // conversion, done by the base class version, otherwise
3058     CreateConvIfNeeded();
3059
3060     return m_convReal
3061             ? m_convReal->FromWChar(dst, dstLen, src, srcLen)
3062             : wxMBConv::FromWChar(dst, dstLen, src, srcLen);
3063 }
3064
3065 size_t wxCSConv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
3066 {
3067     // Converts psz to wide chars, storing at most n of them in buf (which may
3068     // be NULL to only compute the length); returns their count without NUL.
3069     CreateConvIfNeeded();
3070     if (m_convReal)
3071         return m_convReal->MB2WC(buf, psz, n);
3072
3073     // latin-1 (direct): each byte maps to the code point with the same value
3074     const size_t len = strlen(psz);
3075     // buf only has room for n elements, so never store more than that in it
3076     const size_t count = (buf && n < len) ? n : len;
3077     for (size_t c = 0; buf && c < count; c++)
3078         buf[c] = (unsigned char)(psz[c]);
3079     if (buf && count < n)
3080         buf[count] = 0; // NUL-terminate only if there is room for it
3081     return count;
3082 }
3083
3084 size_t wxCSConv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
3085 {
3086     // Converts the wide string psz to multibyte, storing at most n bytes in
3087     // buf; if buf is NULL, only checks that psz can be converted and computes
3088     // the length needed. Returns the number of bytes, not counting the
3089     // trailing NUL, or wxCONV_FAILED if a character can't be converted.
3090     CreateConvIfNeeded();
3091
3092     if (m_convReal)
3093         return m_convReal->WC2MB(buf, psz, n);
3094
3095     // latin-1 (direct): only the characters up to 0xFF can be converted and
3096     // each of them maps to the byte with the same value
3097     const size_t len = wxWcslen(psz);
3098
3099     // buf only has room for n bytes, so never store more than that in it (n
3100     // is not used at all without an output buffer)
3101     const size_t count = (buf && n < len) ? n : len;
3102     for (size_t c = 0; c < count; c++)
3103     {
3104         if (psz[c] > 0xFF)
3105             return wxCONV_FAILED;
3106         if (buf)
3107             buf[c] = (char)psz[c];
3108     }
3109
3110     if (buf && count < n)
3111         buf[count] = '\0'; // NUL-terminate only if there is room for it
3112     return count;
3113 }
3114
3115 size_t wxCSConv::GetMBNulLen() const
3116 {
3117     // the length of the terminating NUL is determined by the real converter
3118     // whenever we have one
3119     CreateConvIfNeeded();
3120
3121     // without it we are ISO-8859-1
3122     if ( !m_convReal )
3123         return 1;
3124
3125     return m_convReal->GetMBNulLen();
3126 }
3127
3128 #if wxUSE_UNICODE_UTF8
3129 bool wxCSConv::IsUTF8() const
3130 {
3131     // whether this is UTF-8 is determined by the real converter whenever we
3132     // have one
3133     CreateConvIfNeeded();
3134
3135     // without it we are ISO-8859-1
3136     if ( !m_convReal )
3137         return false;
3138
3139     return m_convReal->IsUTF8();
3140 }
3141 #endif // wxUSE_UNICODE_UTF8
3142
3143
3144 #if wxUSE_UNICODE
3145
3146 wxWCharBuffer wxSafeConvertMB2WX(const char *s)
3147 {
3148     if ( s == NULL )
3149         return wxWCharBuffer(); // nothing to convert
3150
3151     // try the C library first, falling back to UTF-8 and then ISO8859-1
3152     wxWCharBuffer converted(wxConvLibc.cMB2WX(s));
3153     if ( !converted )
3154         converted = wxMBConvUTF8().cMB2WX(s);
3155     if ( !converted )
3156         converted = wxConvISO8859_1.cMB2WX(s);
3157     return converted;
3158 }
3159
3160 wxCharBuffer wxSafeConvertWX2MB(const wchar_t *ws)
3161 {
3162     if ( ws == NULL )
3163         return wxCharBuffer(); // nothing to convert
3164
3165     // try the C library first, falling back to UTF-8 if it fails
3166     wxCharBuffer converted(wxConvLibc.cWX2MB(ws));
3167     if ( !converted )
3168         converted = wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL).cWX2MB(ws);
3169     return converted;
3170 }
3171
3172 #endif // wxUSE_UNICODE
3173
3174 // ----------------------------------------------------------------------------
3175 // globals
3176 // ----------------------------------------------------------------------------
3177
3178 // NB: The reason why we create converter objects in this convoluted way,
3179 //     using a factory function instead of a global variable, is that they
3180 //     may be used at static initialization time (some of them are used by
3181 //     wxString ctors and there may be a global wxString object). In other
3182 //     words, possibly _before_ the converter global object would be
3183 //     initialized.
3184
3185 #undef wxConvLibc       // NOTE(review): presumably defined as macros in
3186 #undef wxConvUTF8       // the header (confirm); they are used as plain
3187 #undef wxConvUTF7       // names in the macro arguments below
3188 #undef wxConvLocal
3189 #undef wxConvISO8859_1
3190 // define name##Ptr and wxGet_##name##Ptr() returning a local static object
3191 #define WX_DEFINE_GLOBAL_CONV2(klass, impl_klass, name, ctor_args)      \
3192     WXDLLIMPEXP_DATA_BASE(klass*) name##Ptr = NULL;                     \
3193     WXDLLIMPEXP_BASE klass* wxGet_##name##Ptr()                         \
3194     {                                                                   \
3195         static impl_klass name##Obj ctor_args;                          \
3196         return &name##Obj;                                              \
3197     }                                                                   \
3198     /* this ensures that all global converter objects are created */    \
3199     /* by the time static initialization is done, i.e. before any */    \
3200     /* thread is launched: */                                           \
3201     static klass* gs_##name##instance = wxGet_##name##Ptr()
3202 // same as above but using klass as the implementation class too
3203 #define WX_DEFINE_GLOBAL_CONV(klass, name, ctor_args) \
3204     WX_DEFINE_GLOBAL_CONV2(klass, klass, name, ctor_args)
3205
3206 #ifdef __WINDOWS__ // wxConvLibc is implemented by wxMBConv_win32 here
3207     WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConv_win32, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
3208 #else // !__WINDOWS__: wxConvLibc is implemented by wxMBConvLibc
3209     WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConvLibc, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
3210 #endif // __WINDOWS__/!__WINDOWS__
3211
3212 // NB: we can't use wxEMPTY_PARAMETER_VALUE as the final argument here because
3213 //     it's passed to WX_DEFINE_GLOBAL_CONV2 after a macro expansion and so
3214 //     still provokes an error message about "not enough macro parameters"; and
3215 //     we can't use "()" here as the name##Obj declaration would be parsed as a
3216 //     function declaration then, so use a semicolon and live with an extra
3217 //     empty statement (and hope that no compiler warns about this)
3218 WX_DEFINE_GLOBAL_CONV(wxMBConvStrictUTF8, wxConvUTF8, ;);
3219 WX_DEFINE_GLOBAL_CONV(wxMBConvUTF7, wxConvUTF7, ;);
3220 // wxCSConv-based converters for the system encoding and for ISO8859-1
3221 WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvLocal, (wxFONTENCODING_SYSTEM));
3222 WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvISO8859_1, (wxFONTENCODING_ISO8859_1));
3223 // wxConvCurrent initially is the libc converter and wxConvUI the local one
3224 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = wxGet_wxConvLibcPtr();
3225 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvUI = wxGet_wxConvLocalPtr();
3226
3227 #ifdef __DARWIN__
3228 // The xnu kernel always communicates file paths in decomposed UTF-8.
3229 // WARNING: Are we sure that CFString's conversion will cause decomposition?
3230 static wxMBConv_cf wxConvMacUTF8DObj(wxFONTENCODING_UTF8);
3231 #endif // __DARWIN__
3232 // initially the object above under Darwin and the libc converter elsewhere
3233 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName =
3234 #ifdef __DARWIN__
3235                                     &wxConvMacUTF8DObj;
3236 #else // !__DARWIN__
3237                                     wxGet_wxConvLibcPtr();
3238 #endif // __DARWIN__/!__DARWIN__
3239
3240 #else // !wxUSE_WCHAR_T
3241
3242 // FIXME-UTF8: remove this, wxUSE_WCHAR_T is required now
3243 // stand-ins in the absence of wchar_t: all of them are plain wxMBConv objects
3244 WXDLLIMPEXP_DATA_BASE(wxMBConv) wxConvLibc,
3245                                 wxConvISO8859_1,
3246                                 wxConvLocal,
3247                                 wxConvUTF8;
3248
3249 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T