src/common/strconv.cpp

   1 /////////////////////////////////////////////////////////////////////////////
   2 // Name:        src/common/strconv.cpp
   3 // Purpose:     Unicode conversion classes
   4 // Author:      Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
   5 //              Ryan Norton, Fredrik Roubert (UTF7)
   6 // Modified by:
   7 // Created:     29/01/98
   8 // RCS-ID:      $Id$
   9 // Copyright:   (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
  10 //              (c) 2000-2003 Vadim Zeitlin
  11 //              (c) 2004 Ryan Norton, Fredrik Roubert
  12 // Licence:     wxWindows licence
  13 /////////////////////////////////////////////////////////////////////////////
  14
  15 // For compilers that support precompilation, includes "wx.h".
  16 #include "wx/wxprec.h"
  17
  18 #ifdef __BORLANDC__
  19     #pragma hdrstop
  20 #endif  //__BORLANDC__
  21
  22 #ifndef WX_PRECOMP
  23     #include "wx/intl.h"
  24     #include "wx/log.h"
  25     #include "wx/utils.h"
  26     #include "wx/hashmap.h"
  27 #endif
  28
  29 #include "wx/strconv.h"
  30
  31 #if wxUSE_WCHAR_T
  32
  33 #ifndef __WXWINCE__
  34 #include <errno.h>
  35 #endif
  36
  37 #include <ctype.h>
  38 #include <string.h>
  39 #include <stdlib.h>
  40
  41 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
  42     #include "wx/msw/private.h"
  43     #include "wx/msw/missing.h"
  44     #define wxHAVE_WIN32_MB2WC
  45 #endif
  46
  47 #ifdef HAVE_ICONV
  48     #include <iconv.h>
  49     #include "wx/thread.h"
  50 #endif
  51
  52 #include "wx/encconv.h"
  53 #include "wx/fontmap.h"
  54
  55 #ifdef __DARWIN__
  56 #include "wx/mac/corefoundation/private/strconv_cf.h"
  57 #endif //def __DARWIN__
  58
  59
  60 #define TRACE_STRCONV _T("strconv")
  61
  62 // WC_UTF16 is defined only if sizeof(wchar_t) == 2, otherwise it's supposed to
  63 // be 4 bytes
  64 #if SIZEOF_WCHAR_T == 2
  65     #define WC_UTF16
  66 #endif
  67
  68
  69 // ============================================================================
  70 // implementation
  71 // ============================================================================
  72
  73 // helper function of cMB2WC(): check if n bytes at this location are all NUL
  74 static bool NotAllNULs(const char *p, size_t n)
  75 {
  76     while ( n && *p++ == '\0' )
  77         n--;
  78
  79     return n != 0;
  80 }
  81
  82 // ----------------------------------------------------------------------------
  83 // UTF-16 en/decoding to/from UCS-4 with surrogates handling
  84 // ----------------------------------------------------------------------------
  85
  86 static size_t encode_utf16(wxUint32 input, wxUint16 *output)
  87 {
  88     if (input <= 0xffff)
  89     {
  90         if (output)
  91             *output = (wxUint16) input;
  92
  93         return 1;
  94     }
  95     else if (input >= 0x110000)
  96     {
  97         return wxCONV_FAILED;
  98     }
  99     else
 100     {
 101         if (output)
 102         {
 103             *output++ = (wxUint16) ((input >> 10) + 0xd7c0);
 104             *output = (wxUint16) ((input & 0x3ff) + 0xdc00);
 105         }
 106
 107         return 2;
 108     }
 109 }
 110
 111 static size_t decode_utf16(const wxUint16* input, wxUint32& output)
 112 {
 113     if ((*input < 0xd800) || (*input > 0xdfff))
 114     {
 115         output = *input;
 116         return 1;
 117     }
 118     else if ((input[1] < 0xdc00) || (input[1] > 0xdfff))
 119     {
 120         output = *input;
 121         return wxCONV_FAILED;
 122     }
 123     else
 124     {
 125         output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
 126         return 2;
 127     }
 128 }
 129
 130 #ifdef WC_UTF16
 131     typedef wchar_t wxDecodeSurrogate_t;
 132 #else // !WC_UTF16
 133     typedef wxUint16 wxDecodeSurrogate_t;
 134 #endif // WC_UTF16/!WC_UTF16
 135
 136 // returns the next UTF-32 character from the wchar_t buffer and advances the
 137 // pointer to the character after this one
 138 //
 139 // if an invalid character is found, *pSrc is set to NULL, the caller must
 140 // check for this
 141 static wxUint32 wxDecodeSurrogate(const wxDecodeSurrogate_t **pSrc)
 142 {
 143     wxUint32 out;
 144     const size_t
 145         n = decode_utf16(wx_reinterpret_cast(const wxUint16 *, *pSrc), out);
 146     if ( n == wxCONV_FAILED )
 147         *pSrc = NULL;
 148     else
 149         *pSrc += n;
 150
 151     return out;
 152 }
 153
 154 // ----------------------------------------------------------------------------
 155 // wxMBConv
 156 // ----------------------------------------------------------------------------
 157
// Default implementation of the multibyte to wide character conversion in
// terms of MB2WC().
//
// dst:    output buffer or NULL to only compute the required size
// dstLen: size of dst in wide characters (only checked if dst is not NULL)
// src:    input string, which may consist of several NUL-separated chunks if
//         srcLen is given
// srcLen: input length in bytes or wxNO_LEN if src is NUL-terminated, in which
//         case only one chunk is converted
//
// Returns the number of wide characters [which would be] written, counting
// one L'\0' per chunk, or wxCONV_FAILED if GetMBNulLen() or MB2WC() fails or
// if dst is too small.
size_t
wxMBConv::ToWChar(wchar_t *dst, size_t dstLen,
                  const char *src, size_t srcLen) const
{
    // although new conversion classes are supposed to implement this function
    // directly, the existing ones only implement the old MB2WC() and so, to
    // avoid having to rewrite all conversion classes at once, we provide a
    // default (but not efficient) implementation of this one in terms of the
    // old function by copying the input to ensure that it's NUL-terminated and
    // then using MB2WC() to convert it

    // the number of chars [which would be] written to dst [if it were not NULL]
    size_t dstWritten = 0;

    // the number of NULs terminating this string
    size_t nulLen = 0;  // not really needed, but just to avoid warnings

    // if we were not given the input size we just have to assume that the
    // string is properly terminated as we have no way of knowing how long it
    // is anyhow, but if we do have the size check whether there are enough
    // NULs at the end
    wxCharBuffer bufTmp;
    const char *srcEnd;
    if ( srcLen != wxNO_LEN )
    {
        // we need to know how to find the end of this string
        nulLen = GetMBNulLen();
        if ( nulLen == wxCONV_FAILED )
            return wxCONV_FAILED;

        // if there are enough NULs we can avoid the copy
        if ( srcLen < nulLen || NotAllNULs(src + srcLen - nulLen, nulLen) )
        {
            // make a copy in order to properly NUL-terminate the string: the
            // buffer holds the srcLen input bytes followed by nulLen NUL bytes
            bufTmp = wxCharBuffer(srcLen + nulLen - 1 /* 1 will be added */);
            char * const p = bufTmp.data();
            memcpy(p, src, srcLen);
            for ( char *s = p + srcLen; s < p + srcLen + nulLen; s++ )
                *s = '\0';

            src = bufTmp;
        }

        srcEnd = src + srcLen;
    }
    else // quit after the first loop iteration
    {
        srcEnd = NULL;
    }

    for ( ;; )
    {
        // try to convert the current chunk: with NULL output MB2WC() only
        // computes the length
        size_t lenChunk = MB2WC(NULL, src, 0);
        if ( lenChunk == wxCONV_FAILED )
            return wxCONV_FAILED;

        lenChunk++; // for the L'\0' at the end of this chunk

        dstWritten += lenChunk;

        if ( lenChunk == 1 )
        {
            // nothing left in the input string, conversion succeeded
            //
            // notice that we leave the loop before the "if ( dst )" block
            // below, so the L'\0' of this empty chunk is counted in
            // dstWritten but is not stored in dst
            break;
        }

        if ( dst )
        {
            if ( dstWritten > dstLen )
                return wxCONV_FAILED;

            if ( MB2WC(dst, src, lenChunk) == wxCONV_FAILED )
                return wxCONV_FAILED;

            dst += lenChunk;
        }

        if ( !srcEnd )
        {
            // we convert just one chunk in this case as this is the entire
            // string anyhow
            break;
        }

        // advance the input pointer past the end of this chunk
        while ( NotAllNULs(src, nulLen) )
        {
            // notice that we must skip over multiple bytes here as we suppose
            // that if NUL takes 2 or 4 bytes, then all the other characters do
            // too and so if advanced by a single byte we might erroneously
            // detect sequences of NUL bytes in the middle of the input
            src += nulLen;
        }

        src += nulLen; // skipping over its terminator as well

        // note that ">=" (and not just "==") is needed here as the terminator
        // we skipped just above could be inside or just after the buffer
        // delimited by srcEnd
        if ( src >= srcEnd )
            break;
    }

    return dstWritten;
}
 264
 265 size_t
 266 wxMBConv::FromWChar(char *dst, size_t dstLen,
 267                     const wchar_t *src, size_t srcLen) const
 268 {
 269     // the number of chars [which would be] written to dst [if it were not NULL]
 270     size_t dstWritten = 0;
 271
 272     // make a copy of the input string unless it is already properly
 273     // NUL-terminated
 274     //
 275     // if we don't know its length we have no choice but to assume that it is,
 276     // indeed, properly terminated
 277     wxWCharBuffer bufTmp;
 278     if ( srcLen == wxNO_LEN )
 279     {
 280         srcLen = wxWcslen(src) + 1;
 281     }
 282     else if ( srcLen != 0 && src[srcLen - 1] != L'\0' )
 283     {
 284         // make a copy in order to properly NUL-terminate the string
 285         bufTmp = wxWCharBuffer(srcLen);
 286         memcpy(bufTmp.data(), src, srcLen * sizeof(wchar_t));
 287         src = bufTmp;
 288     }
 289
 290     const size_t lenNul = GetMBNulLen();
 291     for ( const wchar_t * const srcEnd = src + srcLen;
 292           src < srcEnd;
 293           src += wxWcslen(src) + 1 /* skip L'\0' too */ )
 294     {
 295         // try to convert the current chunk
 296         size_t lenChunk = WC2MB(NULL, src, 0);
 297
 298         if ( lenChunk == wxCONV_FAILED )
 299             return wxCONV_FAILED;
 300
 301         lenChunk += lenNul;
 302         dstWritten += lenChunk;
 303
 304         if ( dst )
 305         {
 306             if ( dstWritten > dstLen )
 307                 return wxCONV_FAILED;
 308
 309             if ( WC2MB(dst, src, lenChunk) == wxCONV_FAILED )
 310                 return wxCONV_FAILED;
 311
 312             dst += lenChunk;
 313         }
 314     }
 315
 316     return dstWritten;
 317 }
 318
 319 size_t wxMBConv::MB2WC(wchar_t *outBuff, const char *inBuff, size_t outLen) const
 320 {
 321     size_t rc = ToWChar(outBuff, outLen, inBuff);
 322     if ( rc != wxCONV_FAILED )
 323     {
 324         // ToWChar() returns the buffer length, i.e. including the trailing
 325         // NUL, while this method doesn't take it into account
 326         rc--;
 327     }
 328
 329     return rc;
 330 }
 331
 332 size_t wxMBConv::WC2MB(char *outBuff, const wchar_t *inBuff, size_t outLen) const
 333 {
 334     size_t rc = FromWChar(outBuff, outLen, inBuff);
 335     if ( rc != wxCONV_FAILED )
 336     {
 337         rc -= GetMBNulLen();
 338     }
 339
 340     return rc;
 341 }
 342
 343 wxMBConv::~wxMBConv()
 344 {
 345     // nothing to do here (necessary for Darwin linking probably)
 346 }
 347
 348 const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
 349 {
 350     if ( psz )
 351     {
 352         // calculate the length of the buffer needed first
 353         const size_t nLen = ToWChar(NULL, 0, psz);
 354         if ( nLen != wxCONV_FAILED )
 355         {
 356             // now do the actual conversion
 357             wxWCharBuffer buf(nLen - 1 /* +1 added implicitly */);
 358
 359             // +1 for the trailing NULL
 360             if ( ToWChar(buf.data(), nLen, psz) != wxCONV_FAILED )
 361                 return buf;
 362         }
 363     }
 364
 365     return wxWCharBuffer();
 366 }
 367
 368 const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
 369 {
 370     if ( pwz )
 371     {
 372         const size_t nLen = FromWChar(NULL, 0, pwz);
 373         if ( nLen != wxCONV_FAILED )
 374         {
 375             wxCharBuffer buf(nLen - 1);
 376             if ( FromWChar(buf.data(), nLen, pwz) != wxCONV_FAILED )
 377                 return buf;
 378         }
 379     }
 380
 381     return wxCharBuffer();
 382 }
 383
 384 const wxWCharBuffer
 385 wxMBConv::cMB2WC(const char *inBuff, size_t inLen, size_t *outLen) const
 386 {
 387     const size_t dstLen = ToWChar(NULL, 0, inBuff, inLen);
 388     if ( dstLen != wxCONV_FAILED )
 389     {
 390         // notice that we allocate space for dstLen+1 wide characters here
 391         // because we want the buffer to always be NUL-terminated, even if the
 392         // input isn't (as otherwise the caller has no way to know its length)
 393         wxWCharBuffer wbuf(dstLen);
 394         wbuf.data()[dstLen - 1] = L'\0';
 395         if ( ToWChar(wbuf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
 396         {
 397             if ( outLen )
 398             {
 399                 *outLen = dstLen;
 400                 if ( wbuf[dstLen - 1] == L'\0' )
 401                     (*outLen)--;
 402             }
 403
 404             return wbuf;
 405         }
 406     }
 407
 408     if ( outLen )
 409         *outLen = 0;
 410
 411     return wxWCharBuffer();
 412 }
 413
 414 const wxCharBuffer
 415 wxMBConv::cWC2MB(const wchar_t *inBuff, size_t inLen, size_t *outLen) const
 416 {
 417     size_t dstLen = FromWChar(NULL, 0, inBuff, inLen);
 418     if ( dstLen != wxCONV_FAILED )
 419     {
 420         const size_t nulLen = GetMBNulLen();
 421
 422         // as above, ensure that the buffer is always NUL-terminated, even if
 423         // the input is not
 424         wxCharBuffer buf(dstLen + nulLen - 1);
 425         memset(buf.data() + dstLen, 0, nulLen);
 426         if ( FromWChar(buf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
 427         {
 428             if ( outLen )
 429             {
 430                 *outLen = dstLen;
 431
 432                 if ( dstLen >= nulLen &&
 433                         !NotAllNULs(buf.data() + dstLen - nulLen, nulLen) )
 434                 {
 435                     // in this case the output is NUL-terminated and we're not
 436                     // supposed to count NUL
 437                     *outLen -= nulLen;
 438                 }
 439             }
 440
 441             return buf;
 442         }
 443     }
 444
 445     if ( outLen )
 446         *outLen = 0;
 447
 448     return wxCharBuffer();
 449 }
 450
 451 // ----------------------------------------------------------------------------
 452 // wxMBConvLibc
 453 // ----------------------------------------------------------------------------
 454
 455 size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 456 {
 457     return wxMB2WC(buf, psz, n);
 458 }
 459
 460 size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 461 {
 462     return wxWC2MB(buf, psz, n);
 463 }
 464
 465 // ----------------------------------------------------------------------------
 466 // wxConvBrokenFileNames
 467 // ----------------------------------------------------------------------------
 468
 469 #ifdef __UNIX__
 470
 471 wxConvBrokenFileNames::wxConvBrokenFileNames(const wxString& charset)
 472 {
 473     if ( wxStricmp(charset, _T("UTF-8")) == 0 ||
 474          wxStricmp(charset, _T("UTF8")) == 0  )
 475         m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_PUA);
 476     else
 477         m_conv = new wxCSConv(charset);
 478 }
 479
 480 #endif // __UNIX__
 481
 482 // ----------------------------------------------------------------------------
 483 // UTF-7
 484 // ----------------------------------------------------------------------------
 485
 486 // Implementation (C) 2004 Fredrik Roubert
 487
 488 //
 489 // BASE64 decoding table
 490 //
 491 static const unsigned char utf7unb64[] =
 492 {
 493     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 494     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 495     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 496     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 497     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 498     0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
 499     0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b,
 500     0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 501     0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
 502     0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
 503     0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
 504     0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
 505     0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
 506     0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
 507     0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30,
 508     0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
 509     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 510     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 511     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 512     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 513     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 514     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 515     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 516     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 517     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 518     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 519     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 520     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 521     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 522     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 523     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 524     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
 525 };
 526
// Converts the NUL-terminated UTF-7 string psz to wide characters.
//
// buf: output buffer or NULL to only compute the length of the result
// psz: the string to convert
// n:   the number of wide characters which can be stored in buf
//
// Returns the number of wide characters [which would be] written, not
// counting the trailing NUL, or wxCONV_FAILED if a '+' is followed neither by
// '-' nor by enough BASE64 digits to make up at least one 16 bit unit.
//
// NOTE(review): n is only tested by the outer loop, the BASE64 decoding loop
// stores into buf without looking at it, so a long encoded run can apparently
// write past the end of a too small buffer -- confirm that all callers pass a
// buffer big enough for the entire result.
size_t wxMBConvUTF7::MB2WC(wchar_t *buf, const char *psz, size_t n) const
{
    size_t len = 0;

    while ( *psz && (!buf || (len < n)) )
    {
        unsigned char cc = *psz++;
        if (cc != '+')
        {
            // directly encoded character: any byte other than '+' is copied
            // to the output as is
            if (buf)
                *buf++ = cc;
            len++;
        }
        else if (*psz == '-')
        {
            // encoded plus sign: "+-" stands for a single '+'
            if (buf)
                *buf++ = cc;
            len++;
            psz++;
        }
        else // start of BASE64 encoded string
        {
            // d accumulates the decoded bits, l is the number of bits in it
            // not extracted yet, lsb tells whether the next byte extracted is
            // the low (true) or the high (false) byte of a 16 bit unit and ok
            // becomes true once at least one full unit was decoded
            bool lsb, ok;
            unsigned int d, l;
            for ( ok = lsb = false, d = 0, l = 0;
                  (cc = utf7unb64[(unsigned char)*psz]) != 0xff;
                  psz++ )
            {
                // append the 6 bits of this BASE64 digit
                d <<= 6;
                d += cc;
                for (l += 6; l >= 8; lsb = !lsb)
                {
                    // extract the most significant complete byte
                    unsigned char c = (unsigned char)((d >> (l -= 8)) % 256);
                    if (lsb)
                    {
                        // low byte: completes the unit started below
                        if (buf)
                            *buf++ |= c;
                        len ++;
                        ok = true;
                    }
                    else
                    {
                        // high byte: starts a new unit, buf is not advanced
                        // until its low byte is seen
                        if (buf)
                            *buf = (wchar_t)(c << 8);
                    }
                }
            }

            if ( !ok )
            {
                // in valid UTF7 we should have valid characters after '+'
                return wxCONV_FAILED;
            }

            // the '-' terminating the encoded run is optional, skip it if
            // present
            if (*psz == '-')
                psz++;
        }
    }

    // NUL-terminate the output but only if there is still room for it
    if ( buf && (len < n) )
        *buf = '\0';

    return len;
}
 593
 594 //
 595 // BASE64 encoding table
 596 //
 597 static const unsigned char utf7enb64[] =
 598 {
 599     'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
 600     'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
 601     'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
 602     'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
 603     'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
 604     'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
 605     'w', 'x', 'y', 'z', '0', '1', '2', '3',
 606     '4', '5', '6', '7', '8', '9', '+', '/'
 607 };
 608
 609 //
 610 // UTF-7 encoding table
 611 //
 612 // 0 - Set D (directly encoded characters)
 613 // 1 - Set O (optional direct characters)
 614 // 2 - whitespace characters (optional)
 615 // 3 - special characters
 616 //
 617 static const unsigned char utf7encode[128] =
 618 {
 619     3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
 620     3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
 621     2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
 622     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
 623     1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 624     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
 625     1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 626     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
 627 };
 628
// Converts the NUL-terminated wide string psz to UTF-7.
//
// buf: output buffer or NULL to only compute the length of the result
// psz: the string to convert
// n:   the number of bytes which can be stored in buf
//
// Returns the number of bytes [which would be] written, not counting the
// trailing NUL, or wxCONV_FAILED if, in the builds not using UTF-16 for
// wchar_t, a character greater than 0xffff starts an encoded run.
//
// NOTE(review): n is only tested by the outer loop, the BASE64 encoding code
// stores into buf without looking at it, so it can apparently write past the
// end of a too small buffer -- confirm that all callers pass a buffer big
// enough for the entire result.
//
// NOTE(review): the check for characters greater than 0xffff is only done for
// the character starting an encoded run, those following it inside the same
// run are reduced to their 16 lower bits -- confirm whether this is intended.
size_t wxMBConvUTF7::WC2MB(char *buf, const wchar_t *psz, size_t n) const
{
    size_t len = 0;

    while (*psz && ((!buf) || (len < n)))
    {
        wchar_t cc = *psz++;
        if (cc < 0x80 && utf7encode[cc] < 1)
        {
            // plain ASCII char: only the characters of class 0 (Set D) are
            // output directly
            if (buf)
                *buf++ = (char)cc;

            len++;
        }
#ifndef WC_UTF16
        else if (((wxUint32)cc) > 0xffff)
        {
            // no surrogate pair generation (yet?)
            return wxCONV_FAILED;
        }
#endif
        else
        {
            // everything else is output as '+', followed by the BASE64
            // encoded data except for the plus sign itself, followed by '-'
            if (buf)
                *buf++ = '+';

            len++;
            if (cc != '+')
            {
                // BASE64 encode string: d accumulates the bits to encode and
                // l is the number of bits in it not output yet
                unsigned int lsb, d, l;
                for (d = 0, l = 0; /*nothing*/; psz++)
                {
                    // add the character as 2 bytes, the high one first
                    for (lsb = 0; lsb < 2; lsb ++)
                    {
                        d <<= 8;
                        d += lsb ? cc & 0xff : (cc & 0xff00) >> 8;

                        // output all the complete groups of 6 bits
                        for (l += 8; l >= 6; )
                        {
                            l -= 6;
                            if (buf)
                                *buf++ = utf7enb64[(d >> l) % 64];
                            len++;
                        }
                    }

                    // the run continues until the end of the string or the
                    // next character which can be output directly; notice
                    // that psz is only advanced by the loop statement if the
                    // character is made part of the run
                    cc = *psz;
                    if (!(cc) || (cc < 0x80 && utf7encode[cc] < 1))
                        break;
                }

                // output the remaining bits, if any, as the most significant
                // bits of a last BASE64 digit
                if (l != 0)
                {
                    if (buf)
                        *buf++ = utf7enb64[((d % 16) << (6 - l)) % 64];

                    len++;
                }
            }

            if (buf)
                *buf++ = '-';
            len++;
        }
    }

    // NUL-terminate the output but only if there is still room for it
    if (buf && (len < n))
        *buf = 0;

    return len;
}
 702
 703 // ----------------------------------------------------------------------------
 704 // UTF-8
 705 // ----------------------------------------------------------------------------
 706
// NOTE(review): this table is not used in the part of the file reviewed here;
// its values look like the greatest code which can be represented by a UTF-8
// sequence of (index + 1) bytes, followed by a 0xffffffff sentinel -- confirm
// against the code using it
static const wxUint32 utf8_max[]=
    { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };

// boundaries of the private use area we use to (temporarily) remap the
// characters which are invalid in a UTF-8 encoded string: the range starts at
// wxUnicodePUA and contains 256 values, wxUnicodePUAEnd being the first value
// after it
const wxUint32 wxUnicodePUA = 0x100000;
const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
 714
 715 // this table gives the length of the UTF-8 encoding from its first character:
 716 const unsigned char tableUtf8Lengths[256] = {
 717     // single-byte sequences (ASCII):
 718     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 00..0F
 719     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 10..1F
 720     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 20..2F
 721     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 30..3F
 722     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 40..4F
 723     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 50..5F
 724     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 60..6F
 725     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 70..7F
 726
 727     // these are invalid:
 728     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 80..8F
 729     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 90..9F
 730     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // A0..AF
 731     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // B0..BF
 732     0, 0,                                            // C0,C1
 733
 734     // two-byte sequences:
 735           2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // C2..CF
 736     2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // D0..DF
 737
 738     // three-byte sequences:
 739     3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,  // E0..EF
 740
 741     // four-byte sequences:
 742     4, 4, 4, 4, 4,                                   // F0..F4
 743
 744     // these are invalid again (5- or 6-byte
 745     // sequences and sequences for code points
 746     // above U+10FFFF, as restricted by RFC 3629):
 747                    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0   // F5..FF
 748 };
 749
 750 size_t
 751 wxMBConvStrictUTF8::ToWChar(wchar_t *dst, size_t dstLen,
 752                             const char *src, size_t srcLen) const
 753 {
 754     wchar_t *out = dstLen ? dst : NULL;
 755     size_t written = 0;
 756
 757     if ( srcLen == wxNO_LEN )
 758         srcLen = strlen(src) + 1;
 759
 760     for ( const char *p = src; ; p++ )
 761     {
 762         if ( !(srcLen == wxNO_LEN ? *p : srcLen) )
 763         {
 764             // all done successfully, just add the trailing NULL if we are not
 765             // using explicit length
 766             if ( srcLen == wxNO_LEN )
 767             {
 768                 if ( out )
 769                 {
 770                     if ( !dstLen )
 771                         break;
 772
 773                     *out = L'\0';
 774                 }
 775
 776                 written++;
 777             }
 778
 779             return written;
 780         }
 781
 782         if ( out && !dstLen-- )
 783             break;
 784
 785         wxUint32 code;
 786         unsigned char c = *p;
 787
 788         if ( c < 0x80 )
 789         {
 790             if ( srcLen == 0 ) // the test works for wxNO_LEN too
 791                 break;
 792
 793             if ( srcLen != wxNO_LEN )
 794                 srcLen--;
 795
 796             code = c;
 797         }
 798         else
 799         {
 800             unsigned len = tableUtf8Lengths[c];
 801             if ( !len )
 802                 break;
 803
 804             if ( srcLen < len ) // the test works for wxNO_LEN too
 805                 break;
 806
 807             if ( srcLen != wxNO_LEN )
 808                 srcLen -= len;
 809
 810             //   Char. number range   |        UTF-8 octet sequence
 811             //      (hexadecimal)     |              (binary)
 812             //  ----------------------+----------------------------------------
 813             //  0000 0000 - 0000 007F | 0xxxxxxx
 814             //  0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
 815             //  0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
 816             //  0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 817             //
 818             //  Code point value is stored in bits marked with 'x',
 819             //  lowest-order bit of the value on the right side in the diagram
 820             //  above.                                         (from RFC 3629)
 821
 822             // mask to extract lead byte's value ('x' bits above), by sequence
 823             // length:
 824             static const unsigned char leadValueMask[] = { 0x7F, 0x1F, 0x0F, 0x07 };
 825
 826             // mask and value of lead byte's most significant bits, by length:
 827             static const unsigned char leadMarkerMask[] = { 0x80, 0xE0, 0xF0, 0xF8 };
 828             static const unsigned char leadMarkerVal[] = { 0x00, 0xC0, 0xE0, 0xF0 };
 829
 830             len--; // it's more convenient to work with 0-based length here
 831
 832             // extract the lead byte's value bits:
 833             if ( (c & leadMarkerMask[len]) != leadMarkerVal[len] )
 834                 break;
 835
 836             code = c & leadValueMask[len];
 837
 838             // all remaining bytes, if any, are handled in the same way
 839             // regardless of sequence's length:
 840             for ( ; len; --len )
 841             {
 842                 c = *++p;
 843                 if ( (c & 0xC0) != 0x80 )
 844                     return wxCONV_FAILED;
 845
 846                 code <<= 6;
 847                 code |= c & 0x3F;
 848             }
 849         }
 850
 851 #ifdef WC_UTF16
 852         // cast is ok because wchar_t == wxUint16 if WC_UTF16
 853         if ( encode_utf16(code, (wxUint16 *)out) == 2 )
 854         {
 855             if ( out )
 856                 out++;
 857             written++;
 858         }
 859 #else // !WC_UTF16
 860         if ( out )
 861             *out = code;
 862 #endif // WC_UTF16/!WC_UTF16
 863
 864         if ( out )
 865             out++;
 866
 867         written++;
 868     }
 869
 870     return wxCONV_FAILED;
 871 }
 872
 873 size_t
 874 wxMBConvStrictUTF8::FromWChar(char *dst, size_t dstLen,
 875                               const wchar_t *src, size_t srcLen) const
 876 {
 877     char *out = dstLen ? dst : NULL;
 878     size_t written = 0;
 879
 880     for ( const wchar_t *wp = src; ; wp++ )
 881     {
 882         if ( !(srcLen == wxNO_LEN ? *wp : srcLen--) )
 883         {
 884             // all done successfully, just add the trailing NULL if we are not
 885             // using explicit length
 886             if ( srcLen == wxNO_LEN )
 887             {
 888                 if ( out )
 889                 {
 890                     if ( !dstLen )
 891                         break;
 892
 893                     *out = '\0';
 894                 }
 895
 896                 written++;
 897             }
 898
 899             return written;
 900         }
 901
 902
 903         wxUint32 code;
 904 #ifdef WC_UTF16
 905         // cast is ok for WC_UTF16
 906         if ( decode_utf16((const wxUint16 *)wp, code) == 2 )
 907         {
 908             // skip the next char too as we decoded a surrogate
 909             wp++;
 910         }
 911 #else // wchar_t is UTF-32
 912         code = *wp & 0x7fffffff;
 913 #endif
 914
 915         unsigned len;
 916         if ( code <= 0x7F )
 917         {
 918             len = 1;
 919             if ( out )
 920             {
 921                 if ( dstLen < len )
 922                     break;
 923
 924                 out[0] = (char)code;
 925             }
 926         }
 927         else if ( code <= 0x07FF )
 928         {
 929             len = 2;
 930             if ( out )
 931             {
 932                 if ( dstLen < len )
 933                     break;
 934
 935                 // NB: this line takes 6 least significant bits, encodes them as
 936                 // 10xxxxxx and discards them so that the next byte can be encoded:
 937                 out[1] = 0x80 | (code & 0x3F);  code >>= 6;
 938                 out[0] = 0xC0 | code;
 939             }
 940         }
 941         else if ( code < 0xFFFF )
 942         {
 943             len = 3;
 944             if ( out )
 945             {
 946                 if ( dstLen < len )
 947                     break;
 948
 949                 out[2] = 0x80 | (code & 0x3F);  code >>= 6;
 950                 out[1] = 0x80 | (code & 0x3F);  code >>= 6;
 951                 out[0] = 0xE0 | code;
 952             }
 953         }
 954         else if ( code <= 0x10FFFF )
 955         {
 956             len = 4;
 957             if ( out )
 958             {
 959                 if ( dstLen < len )
 960                     break;
 961
 962                 out[3] = 0x80 | (code & 0x3F);  code >>= 6;
 963                 out[2] = 0x80 | (code & 0x3F);  code >>= 6;
 964                 out[1] = 0x80 | (code & 0x3F);  code >>= 6;
 965                 out[0] = 0xF0 | code;
 966             }
 967         }
 968         else
 969         {
 970             wxFAIL_MSG( _T("trying to encode undefined Unicode character") );
 971             break;
 972         }
 973
 974         if ( out )
 975         {
 976             out += len;
 977             dstLen -= len;
 978         }
 979
 980         written += len;
 981     }
 982
 983     // we only get here if an error occurs during decoding
 984     return wxCONV_FAILED;
 985 }
 986
 987 size_t wxMBConvUTF8::ToWChar(wchar_t *buf, size_t n,
 988                              const char *psz, size_t srcLen) const
 989 {
 990     if ( m_options == MAP_INVALID_UTF8_NOT )
 991         return wxMBConvStrictUTF8::ToWChar(buf, n, psz, srcLen);
 992
 993     size_t len = 0;
 994
 995     while ((srcLen == wxNO_LEN ? *psz : srcLen--) && ((!buf) || (len < n)))
 996     {
 997         const char *opsz = psz;
 998         bool invalid = false;
 999         unsigned char cc = *psz++, fc = cc;
1000         unsigned cnt;
1001         for (cnt = 0; fc & 0x80; cnt++)
1002             fc <<= 1;
1003
1004         if (!cnt)
1005         {
1006             // plain ASCII char
1007             if (buf)
1008                 *buf++ = cc;
1009             len++;
1010
1011             // escape the escape character for octal escapes
1012             if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
1013                     && cc == '\\' && (!buf || len < n))
1014             {
1015                 if (buf)
1016                     *buf++ = cc;
1017                 len++;
1018             }
1019         }
1020         else
1021         {
1022             cnt--;
1023             if (!cnt)
1024             {
1025                 // invalid UTF-8 sequence
1026                 invalid = true;
1027             }
1028             else
1029             {
1030                 unsigned ocnt = cnt - 1;
1031                 wxUint32 res = cc & (0x3f >> cnt);
1032                 while (cnt--)
1033                 {
1034                     cc = *psz;
1035                     if ((cc & 0xC0) != 0x80)
1036                     {
1037                         // invalid UTF-8 sequence
1038                         invalid = true;
1039                         break;
1040                     }
1041
1042                     psz++;
1043                     res = (res << 6) | (cc & 0x3f);
1044                 }
1045
1046                 if (invalid || res <= utf8_max[ocnt])
1047                 {
1048                     // illegal UTF-8 encoding
1049                     invalid = true;
1050                 }
1051                 else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
1052                         res >= wxUnicodePUA && res < wxUnicodePUAEnd)
1053                 {
1054                     // if one of our PUA characters turns up externally
1055                     // it must also be treated as an illegal sequence
1056                     // (a bit like you have to escape an escape character)
1057                     invalid = true;
1058                 }
1059                 else
1060                 {
1061 #ifdef WC_UTF16
1062                     // cast is ok because wchar_t == wxUint16 if WC_UTF16
1063                     size_t pa = encode_utf16(res, (wxUint16 *)buf);
1064                     if (pa == wxCONV_FAILED)
1065                     {
1066                         invalid = true;
1067                     }
1068                     else
1069                     {
1070                         if (buf)
1071                             buf += pa;
1072                         len += pa;
1073                     }
1074 #else // !WC_UTF16
1075                     if (buf)
1076                         *buf++ = (wchar_t)res;
1077                     len++;
1078 #endif // WC_UTF16/!WC_UTF16
1079                 }
1080             }
1081
1082             if (invalid)
1083             {
1084                 if (m_options & MAP_INVALID_UTF8_TO_PUA)
1085                 {
1086                     while (opsz < psz && (!buf || len < n))
1087                     {
1088 #ifdef WC_UTF16
1089                         // cast is ok because wchar_t == wxUuint16 if WC_UTF16
1090                         size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
1091                         wxASSERT(pa != wxCONV_FAILED);
1092                         if (buf)
1093                             buf += pa;
1094                         opsz++;
1095                         len += pa;
1096 #else
1097                         if (buf)
1098                             *buf++ = (wchar_t)(wxUnicodePUA + (unsigned char)*opsz);
1099                         opsz++;
1100                         len++;
1101 #endif
1102                     }
1103                 }
1104                 else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
1105                 {
1106                     while (opsz < psz && (!buf || len < n))
1107                     {
1108                         if ( buf && len + 3 < n )
1109                         {
1110                             unsigned char on = *opsz;
1111                             *buf++ = L'\\';
1112                             *buf++ = (wchar_t)( L'0' + on / 0100 );
1113                             *buf++ = (wchar_t)( L'0' + (on % 0100) / 010 );
1114                             *buf++ = (wchar_t)( L'0' + on % 010 );
1115                         }
1116
1117                         opsz++;
1118                         len += 4;
1119                     }
1120                 }
1121                 else // MAP_INVALID_UTF8_NOT
1122                 {
1123                     return wxCONV_FAILED;
1124                 }
1125             }
1126         }
1127     }
1128
1129     if (srcLen == wxNO_LEN && buf && (len < n))
1130         *buf = 0;
1131
1132     return len + 1;
1133 }
1134
// Check whether the given wide character is one of the octal digits '0'..'7'.
static inline bool isoctal(wchar_t wch)
{
    if ( wch < L'0' )
        return false;

    return wch <= L'7';
}
1139
1140 size_t wxMBConvUTF8::FromWChar(char *buf, size_t n,
1141                                const wchar_t *psz, size_t srcLen) const
1142 {
1143     if ( m_options == MAP_INVALID_UTF8_NOT )
1144         return wxMBConvStrictUTF8::FromWChar(buf, n, psz, srcLen);
1145
1146     size_t len = 0;
1147
1148     while ((srcLen == wxNO_LEN ? *psz : srcLen--) && ((!buf) || (len < n)))
1149     {
1150         wxUint32 cc;
1151
1152 #ifdef WC_UTF16
1153         // cast is ok for WC_UTF16
1154         size_t pa = decode_utf16((const wxUint16 *)psz, cc);
1155         psz += (pa == wxCONV_FAILED) ? 1 : pa;
1156 #else
1157         cc = (*psz++) & 0x7fffffff;
1158 #endif
1159
1160         if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
1161                 && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
1162         {
1163             if (buf)
1164                 *buf++ = (char)(cc - wxUnicodePUA);
1165             len++;
1166         }
1167         else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL)
1168                     && cc == L'\\' && psz[0] == L'\\' )
1169         {
1170             if (buf)
1171                 *buf++ = (char)cc;
1172             psz++;
1173             len++;
1174         }
1175         else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
1176                     cc == L'\\' &&
1177                         isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
1178         {
1179             if (buf)
1180             {
1181                 *buf++ = (char) ((psz[0] - L'0') * 0100 +
1182                                  (psz[1] - L'0') * 010 +
1183                                  (psz[2] - L'0'));
1184             }
1185
1186             psz += 3;
1187             len++;
1188         }
1189         else
1190         {
1191             unsigned cnt;
1192             for (cnt = 0; cc > utf8_max[cnt]; cnt++)
1193             {
1194             }
1195
1196             if (!cnt)
1197             {
1198                 // plain ASCII char
1199                 if (buf)
1200                     *buf++ = (char) cc;
1201                 len++;
1202             }
1203             else
1204             {
1205                 len += cnt + 1;
1206                 if (buf)
1207                 {
1208                     *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
1209                     while (cnt--)
1210                         *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
1211                 }
1212             }
1213         }
1214     }
1215
1216     if (srcLen == wxNO_LEN && buf && (len < n))
1217         *buf = 0;
1218
1219     return len + 1;
1220 }
1221
1222 // ============================================================================
1223 // UTF-16
1224 // ============================================================================
1225
1226 #ifdef WORDS_BIGENDIAN
1227     #define wxMBConvUTF16straight wxMBConvUTF16BE
1228     #define wxMBConvUTF16swap     wxMBConvUTF16LE
1229 #else
1230     #define wxMBConvUTF16swap     wxMBConvUTF16BE
1231     #define wxMBConvUTF16straight wxMBConvUTF16LE
1232 #endif
1233
1234 /* static */
1235 size_t wxMBConvUTF16Base::GetLength(const char *src, size_t srcLen)
1236 {
1237     if ( srcLen == wxNO_LEN )
1238     {
1239         // count the number of bytes in input, including the trailing NULs
1240         const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1241         for ( srcLen = 1; *inBuff++; srcLen++ )
1242             ;
1243
1244         srcLen *= BYTES_PER_CHAR;
1245     }
1246     else // we already have the length
1247     {
1248         // we can only convert an entire number of UTF-16 characters
1249         if ( srcLen % BYTES_PER_CHAR )
1250             return wxCONV_FAILED;
1251     }
1252
1253     return srcLen;
1254 }
1255
1256 // case when in-memory representation is UTF-16 too
1257 #ifdef WC_UTF16
1258
1259 // ----------------------------------------------------------------------------
1260 // conversions without endianness change
1261 // ----------------------------------------------------------------------------
1262
1263 size_t
1264 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1265                                const char *src, size_t srcLen) const
1266 {
1267     // set up the scene for using memcpy() (which is presumably more efficient
1268     // than copying the bytes one by one)
1269     srcLen = GetLength(src, srcLen);
1270     if ( srcLen == wxNO_LEN )
1271         return wxCONV_FAILED;
1272
1273     const size_t inLen = srcLen / BYTES_PER_CHAR;
1274     if ( dst )
1275     {
1276         if ( dstLen < inLen )
1277             return wxCONV_FAILED;
1278
1279         memcpy(dst, src, srcLen);
1280     }
1281
1282     return inLen;
1283 }
1284
1285 size_t
1286 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1287                                  const wchar_t *src, size_t srcLen) const
1288 {
1289     if ( srcLen == wxNO_LEN )
1290         srcLen = wxWcslen(src) + 1;
1291
1292     srcLen *= BYTES_PER_CHAR;
1293
1294     if ( dst )
1295     {
1296         if ( dstLen < srcLen )
1297             return wxCONV_FAILED;
1298
1299         memcpy(dst, src, srcLen);
1300     }
1301
1302     return srcLen;
1303 }
1304
1305 // ----------------------------------------------------------------------------
1306 // endian-reversing conversions
1307 // ----------------------------------------------------------------------------
1308
1309 size_t
1310 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1311                            const char *src, size_t srcLen) const
1312 {
1313     srcLen = GetLength(src, srcLen);
1314     if ( srcLen == wxNO_LEN )
1315         return wxCONV_FAILED;
1316
1317     srcLen /= BYTES_PER_CHAR;
1318
1319     if ( dst )
1320     {
1321         if ( dstLen < srcLen )
1322             return wxCONV_FAILED;
1323
1324         const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1325         for ( size_t n = 0; n < srcLen; n++, inBuff++ )
1326         {
1327             *dst++ = wxUINT16_SWAP_ALWAYS(*inBuff);
1328         }
1329     }
1330
1331     return srcLen;
1332 }
1333
1334 size_t
1335 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1336                              const wchar_t *src, size_t srcLen) const
1337 {
1338     if ( srcLen == wxNO_LEN )
1339         srcLen = wxWcslen(src) + 1;
1340
1341     srcLen *= BYTES_PER_CHAR;
1342
1343     if ( dst )
1344     {
1345         if ( dstLen < srcLen )
1346             return wxCONV_FAILED;
1347
1348         wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1349         for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
1350         {
1351             *outBuff++ = wxUINT16_SWAP_ALWAYS(*src);
1352         }
1353     }
1354
1355     return srcLen;
1356 }
1357
1358 #else // !WC_UTF16: wchar_t is UTF-32
1359
1360 // ----------------------------------------------------------------------------
1361 // conversions without endianness change
1362 // ----------------------------------------------------------------------------
1363
1364 size_t
1365 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1366                                const char *src, size_t srcLen) const
1367 {
1368     srcLen = GetLength(src, srcLen);
1369     if ( srcLen == wxNO_LEN )
1370         return wxCONV_FAILED;
1371
1372     const size_t inLen = srcLen / BYTES_PER_CHAR;
1373     if ( !dst )
1374     {
1375         // optimization: return maximal space which could be needed for this
1376         // string even if the real size could be smaller if the buffer contains
1377         // any surrogates
1378         return inLen;
1379     }
1380
1381     size_t outLen = 0;
1382     const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1383     for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1384     {
1385         const wxUint32 ch = wxDecodeSurrogate(&inBuff);
1386         if ( !inBuff )
1387             return wxCONV_FAILED;
1388
1389         if ( ++outLen > dstLen )
1390             return wxCONV_FAILED;
1391
1392         *dst++ = ch;
1393     }
1394
1395
1396     return outLen;
1397 }
1398
1399 size_t
1400 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1401                                  const wchar_t *src, size_t srcLen) const
1402 {
1403     if ( srcLen == wxNO_LEN )
1404         srcLen = wxWcslen(src) + 1;
1405
1406     size_t outLen = 0;
1407     wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1408     for ( size_t n = 0; n < srcLen; n++ )
1409     {
1410         wxUint16 cc[2];
1411         const size_t numChars = encode_utf16(*src++, cc);
1412         if ( numChars == wxCONV_FAILED )
1413             return wxCONV_FAILED;
1414
1415         outLen += numChars * BYTES_PER_CHAR;
1416         if ( outBuff )
1417         {
1418             if ( outLen > dstLen )
1419                 return wxCONV_FAILED;
1420
1421             *outBuff++ = cc[0];
1422             if ( numChars == 2 )
1423             {
1424                 // second character of a surrogate
1425                 *outBuff++ = cc[1];
1426             }
1427         }
1428     }
1429
1430     return outLen;
1431 }
1432
1433 // ----------------------------------------------------------------------------
1434 // endian-reversing conversions
1435 // ----------------------------------------------------------------------------
1436
1437 size_t
1438 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1439                            const char *src, size_t srcLen) const
1440 {
1441     srcLen = GetLength(src, srcLen);
1442     if ( srcLen == wxNO_LEN )
1443         return wxCONV_FAILED;
1444
1445     const size_t inLen = srcLen / BYTES_PER_CHAR;
1446     if ( !dst )
1447     {
1448         // optimization: return maximal space which could be needed for this
1449         // string even if the real size could be smaller if the buffer contains
1450         // any surrogates
1451         return inLen;
1452     }
1453
1454     size_t outLen = 0;
1455     const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1456     for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1457     {
1458         wxUint32 ch;
1459         wxUint16 tmp[2];
1460
1461         tmp[0] = wxUINT16_SWAP_ALWAYS(*inBuff);
1462         inBuff++;
1463         tmp[1] = wxUINT16_SWAP_ALWAYS(*inBuff);
1464
1465         const size_t numChars = decode_utf16(tmp, ch);
1466         if ( numChars == wxCONV_FAILED )
1467             return wxCONV_FAILED;
1468
1469         if ( numChars == 2 )
1470             inBuff++;
1471
1472         if ( ++outLen > dstLen )
1473             return wxCONV_FAILED;
1474
1475         *dst++ = ch;
1476     }
1477
1478
1479     return outLen;
1480 }
1481
1482 size_t
1483 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1484                              const wchar_t *src, size_t srcLen) const
1485 {
1486     if ( srcLen == wxNO_LEN )
1487         srcLen = wxWcslen(src) + 1;
1488
1489     size_t outLen = 0;
1490     wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1491     for ( const wchar_t *srcEnd = src + srcLen; src < srcEnd; src++ )
1492     {
1493         wxUint16 cc[2];
1494         const size_t numChars = encode_utf16(*src, cc);
1495         if ( numChars == wxCONV_FAILED )
1496             return wxCONV_FAILED;
1497
1498         outLen += numChars * BYTES_PER_CHAR;
1499         if ( outBuff )
1500         {
1501             if ( outLen > dstLen )
1502                 return wxCONV_FAILED;
1503
1504             *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[0]);
1505             if ( numChars == 2 )
1506             {
1507                 // second character of a surrogate
1508                 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[1]);
1509             }
1510         }
1511     }
1512
1513     return outLen;
1514 }
1515
1516 #endif // WC_UTF16/!WC_UTF16
1517
1518
1519 // ============================================================================
1520 // UTF-32
1521 // ============================================================================
1522
1523 #ifdef WORDS_BIGENDIAN
1524     #define wxMBConvUTF32straight  wxMBConvUTF32BE
1525     #define wxMBConvUTF32swap      wxMBConvUTF32LE
1526 #else
1527     #define wxMBConvUTF32swap      wxMBConvUTF32BE
1528     #define wxMBConvUTF32straight  wxMBConvUTF32LE
1529 #endif
1530
1531
1532 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
1533 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
1534
1535 /* static */
1536 size_t wxMBConvUTF32Base::GetLength(const char *src, size_t srcLen)
1537 {
1538     if ( srcLen == wxNO_LEN )
1539     {
1540         // count the number of bytes in input, including the trailing NULs
1541         const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1542         for ( srcLen = 1; *inBuff++; srcLen++ )
1543             ;
1544
1545         srcLen *= BYTES_PER_CHAR;
1546     }
1547     else // we already have the length
1548     {
1549         // we can only convert an entire number of UTF-32 characters
1550         if ( srcLen % BYTES_PER_CHAR )
1551             return wxCONV_FAILED;
1552     }
1553
1554     return srcLen;
1555 }
1556
1557 // case when in-memory representation is UTF-16
1558 #ifdef WC_UTF16
1559
1560 // ----------------------------------------------------------------------------
1561 // conversions without endianness change
1562 // ----------------------------------------------------------------------------
1563
1564 size_t
1565 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1566                                const char *src, size_t srcLen) const
1567 {
1568     srcLen = GetLength(src, srcLen);
1569     if ( srcLen == wxNO_LEN )
1570         return wxCONV_FAILED;
1571
1572     const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1573     const size_t inLen = srcLen / BYTES_PER_CHAR;
1574     size_t outLen = 0;
1575     for ( size_t n = 0; n < inLen; n++ )
1576     {
1577         wxUint16 cc[2];
1578         const size_t numChars = encode_utf16(*inBuff++, cc);
1579         if ( numChars == wxCONV_FAILED )
1580             return wxCONV_FAILED;
1581
1582         outLen += numChars;
1583         if ( dst )
1584         {
1585             if ( outLen > dstLen )
1586                 return wxCONV_FAILED;
1587
1588             *dst++ = cc[0];
1589             if ( numChars == 2 )
1590             {
1591                 // second character of a surrogate
1592                 *dst++ = cc[1];
1593             }
1594         }
1595     }
1596
1597     return outLen;
1598 }
1599
1600 size_t
1601 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1602                                  const wchar_t *src, size_t srcLen) const
1603 {
1604     if ( srcLen == wxNO_LEN )
1605         srcLen = wxWcslen(src) + 1;
1606
1607     if ( !dst )
1608     {
1609         // optimization: return maximal space which could be needed for this
1610         // string instead of the exact amount which could be less if there are
1611         // any surrogates in the input
1612         //
1613         // we consider that surrogates are rare enough to make it worthwhile to
1614         // avoid running the loop below at the cost of slightly extra memory
1615         // consumption
1616         return srcLen * BYTES_PER_CHAR;
1617     }
1618
1619     wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1620     size_t outLen = 0;
1621     for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1622     {
1623         const wxUint32 ch = wxDecodeSurrogate(&src);
1624         if ( !src )
1625             return wxCONV_FAILED;
1626
1627         outLen += BYTES_PER_CHAR;
1628
1629         if ( outLen > dstLen )
1630             return wxCONV_FAILED;
1631
1632         *outBuff++ = ch;
1633     }
1634
1635     return outLen;
1636 }
1637
1638 // ----------------------------------------------------------------------------
1639 // endian-reversing conversions
1640 // ----------------------------------------------------------------------------
1641
1642 size_t
1643 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1644                            const char *src, size_t srcLen) const
1645 {
1646     srcLen = GetLength(src, srcLen);
1647     if ( srcLen == wxNO_LEN )
1648         return wxCONV_FAILED;
1649
1650     const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1651     const size_t inLen = srcLen / BYTES_PER_CHAR;
1652     size_t outLen = 0;
1653     for ( size_t n = 0; n < inLen; n++, inBuff++ )
1654     {
1655         wxUint16 cc[2];
1656         const size_t numChars = encode_utf16(wxUINT32_SWAP_ALWAYS(*inBuff), cc);
1657         if ( numChars == wxCONV_FAILED )
1658             return wxCONV_FAILED;
1659
1660         outLen += numChars;
1661         if ( dst )
1662         {
1663             if ( outLen > dstLen )
1664                 return wxCONV_FAILED;
1665
1666             *dst++ = cc[0];
1667             if ( numChars == 2 )
1668             {
1669                 // second character of a surrogate
1670                 *dst++ = cc[1];
1671             }
1672         }
1673     }
1674
1675     return outLen;
1676 }
1677
1678 size_t
1679 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1680                              const wchar_t *src, size_t srcLen) const
1681 {
1682     if ( srcLen == wxNO_LEN )
1683         srcLen = wxWcslen(src) + 1;
1684
1685     if ( !dst )
1686     {
1687         // optimization: return maximal space which could be needed for this
1688         // string instead of the exact amount which could be less if there are
1689         // any surrogates in the input
1690         //
1691         // we consider that surrogates are rare enough to make it worthwhile to
1692         // avoid running the loop below at the cost of slightly extra memory
1693         // consumption
1694         return srcLen*BYTES_PER_CHAR;
1695     }
1696
1697     wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1698     size_t outLen = 0;
1699     for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1700     {
1701         const wxUint32 ch = wxDecodeSurrogate(&src);
1702         if ( !src )
1703             return wxCONV_FAILED;
1704
1705         outLen += BYTES_PER_CHAR;
1706
1707         if ( outLen > dstLen )
1708             return wxCONV_FAILED;
1709
1710         *outBuff++ = wxUINT32_SWAP_ALWAYS(ch);
1711     }
1712
1713     return outLen;
1714 }
1715
1716 #else // !WC_UTF16: wchar_t is UTF-32
1717
1718 // ----------------------------------------------------------------------------
1719 // conversions without endianness change
1720 // ----------------------------------------------------------------------------
1721
1722 size_t
1723 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1724                                const char *src, size_t srcLen) const
1725 {
1726     // use memcpy() as it should be much faster than hand-written loop
1727     srcLen = GetLength(src, srcLen);
1728     if ( srcLen == wxNO_LEN )
1729         return wxCONV_FAILED;
1730
1731     const size_t inLen = srcLen/BYTES_PER_CHAR;
1732     if ( dst )
1733     {
1734         if ( dstLen < inLen )
1735             return wxCONV_FAILED;
1736
1737         memcpy(dst, src, srcLen);
1738     }
1739
1740     return inLen;
1741 }
1742
1743 size_t
1744 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1745                                  const wchar_t *src, size_t srcLen) const
1746 {
1747     if ( srcLen == wxNO_LEN )
1748         srcLen = wxWcslen(src) + 1;
1749
1750     srcLen *= BYTES_PER_CHAR;
1751
1752     if ( dst )
1753     {
1754         if ( dstLen < srcLen )
1755             return wxCONV_FAILED;
1756
1757         memcpy(dst, src, srcLen);
1758     }
1759
1760     return srcLen;
1761 }
1762
1763 // ----------------------------------------------------------------------------
1764 // endian-reversing conversions
1765 // ----------------------------------------------------------------------------
1766
1767 size_t
1768 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1769                            const char *src, size_t srcLen) const
1770 {
1771     srcLen = GetLength(src, srcLen);
1772     if ( srcLen == wxNO_LEN )
1773         return wxCONV_FAILED;
1774
1775     srcLen /= BYTES_PER_CHAR;
1776
1777     if ( dst )
1778     {
1779         if ( dstLen < srcLen )
1780             return wxCONV_FAILED;
1781
1782         const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1783         for ( size_t n = 0; n < srcLen; n++, inBuff++ )
1784         {
1785             *dst++ = wxUINT32_SWAP_ALWAYS(*inBuff);
1786         }
1787     }
1788
1789     return srcLen;
1790 }
1791
1792 size_t
1793 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1794                              const wchar_t *src, size_t srcLen) const
1795 {
1796     if ( srcLen == wxNO_LEN )
1797         srcLen = wxWcslen(src) + 1;
1798
1799     srcLen *= BYTES_PER_CHAR;
1800
1801     if ( dst )
1802     {
1803         if ( dstLen < srcLen )
1804             return wxCONV_FAILED;
1805
1806         wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1807         for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
1808         {
1809             *outBuff++ = wxUINT32_SWAP_ALWAYS(*src);
1810         }
1811     }
1812
1813     return srcLen;
1814 }
1815
1816 #endif // WC_UTF16/!WC_UTF16
1817
1818
1819 // ============================================================================
1820 // The classes doing conversion using the iconv_xxx() functions
1821 // ============================================================================
1822
1823 #ifdef HAVE_ICONV
1824
1825 // VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
1826 //     E2BIG if output buffer is _exactly_ as big as needed. Such case is
1827 //     (unless there's yet another bug in glibc) the only case when iconv()
1828 //     returns with (size_t)-1 (which means error) and says there are 0 bytes
1829 //     left in the input buffer -- when _real_ error occurs,
1830 //     bytes-left-in-input buffer is non-zero. Hence, this alternative test for
1831 //     iconv() failure.
1832 //     [This bug does not appear in glibc 2.2.]
1833 #if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
1834 #define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
1835                                      (errno != E2BIG || bufLeft != 0))
1836 #else
1837 #define ICONV_FAILED(cres, bufLeft)  (cres == (size_t)-1)
1838 #endif
1839
1840 #define ICONV_CHAR_CAST(x)  ((ICONV_CONST char **)(x))
1841
1842 #define ICONV_T_INVALID ((iconv_t)-1)
1843
1844 #if SIZEOF_WCHAR_T == 4
1845     #define WC_BSWAP    wxUINT32_SWAP_ALWAYS
1846     #define WC_ENC      wxFONTENCODING_UTF32
1847 #elif SIZEOF_WCHAR_T == 2
1848     #define WC_BSWAP    wxUINT16_SWAP_ALWAYS
1849     #define WC_ENC      wxFONTENCODING_UTF16
1850 #else // sizeof(wchar_t) != 2 nor 4
1851     // does this ever happen?
1852     #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
1853 #endif
1854
1855 // ----------------------------------------------------------------------------
1856 // wxMBConv_iconv: encapsulates an iconv character set
1857 // ----------------------------------------------------------------------------
1858
1859 class wxMBConv_iconv : public wxMBConv
1860 {
1861 public:
1862     wxMBConv_iconv(const char *name);
1863     virtual ~wxMBConv_iconv();
1864
1865     // implement base class virtual methods
1866     virtual size_t ToWChar(wchar_t *dst, size_t dstLen,
1867                            const char *src, size_t srcLen = wxNO_LEN) const;
1868     virtual size_t FromWChar(char *dst, size_t dstLen,
1869                              const wchar_t *src, size_t srcLen = wxNO_LEN) const;
1870     virtual size_t GetMBNulLen() const;
1871
1872 #if wxUSE_UNICODE_UTF8
1873     virtual bool IsUTF8() const;
1874 #endif
1875
1876     virtual wxMBConv *Clone() const
1877     {
1878         wxMBConv_iconv *p = new wxMBConv_iconv(m_name.ToAscii());
1879         p->m_minMBCharWidth = m_minMBCharWidth;
1880         return p;
1881     }
1882
1883     bool IsOk() const
1884         { return (m2w != ICONV_T_INVALID) && (w2m != ICONV_T_INVALID); }
1885
1886 protected:
1887     // the iconv handlers used to translate from multibyte
1888     // to wide char and in the other direction
1889     iconv_t m2w,
1890             w2m;
1891
1892 #if wxUSE_THREADS
1893     // guards access to m2w and w2m objects
1894     wxMutex m_iconvMutex;
1895 #endif
1896
1897 private:
1898     // the name (for iconv_open()) of a wide char charset -- if none is
1899     // available on this machine, it will remain NULL
1900     static wxString ms_wcCharsetName;
1901
1902     // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
1903     // different endian-ness than the native one
1904     static bool ms_wcNeedsSwap;
1905
1906
1907     // name of the encoding handled by this conversion
1908     wxString m_name;
1909
1910     // cached result of GetMBNulLen(); set to 0 meaning "unknown"
1911     // initially
1912     size_t m_minMBCharWidth;
1913 };
1914
1915 // make the constructor available for unit testing
1916 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_iconv( const char* name )
1917 {
1918     wxMBConv_iconv* result = new wxMBConv_iconv( name );
1919     if ( !result->IsOk() )
1920     {
1921         delete result;
1922         return 0;
1923     }
1924
1925     return result;
1926 }
1927
1928 wxString wxMBConv_iconv::ms_wcCharsetName;
1929 bool wxMBConv_iconv::ms_wcNeedsSwap = false;
1930
1931 wxMBConv_iconv::wxMBConv_iconv(const char *name)
1932               : m_name(name)
1933 {
1934     m_minMBCharWidth = 0;
1935
1936     // check for charset that represents wchar_t:
1937     if ( ms_wcCharsetName.empty() )
1938     {
1939         wxLogTrace(TRACE_STRCONV, _T("Looking for wide char codeset:"));
1940
1941 #if wxUSE_FONTMAP
1942         const wxChar **names = wxFontMapperBase::GetAllEncodingNames(WC_ENC);
1943 #else // !wxUSE_FONTMAP
1944         static const wxChar *names_static[] =
1945         {
1946 #if SIZEOF_WCHAR_T == 4
1947             _T("UCS-4"),
1948 #elif SIZEOF_WCHAR_T = 2
1949             _T("UCS-2"),
1950 #endif
1951             NULL
1952         };
1953         const wxChar **names = names_static;
1954 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
1955
1956         for ( ; *names && ms_wcCharsetName.empty(); ++names )
1957         {
1958             const wxString nameCS(*names);
1959
1960             // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
1961             wxString nameXE(nameCS);
1962
1963 #ifdef WORDS_BIGENDIAN
1964                 nameXE += _T("BE");
1965 #else // little endian
1966                 nameXE += _T("LE");
1967 #endif
1968
1969             wxLogTrace(TRACE_STRCONV, _T("  trying charset \"%s\""),
1970                        nameXE.c_str());
1971
1972             m2w = iconv_open(nameXE.ToAscii(), name);
1973             if ( m2w == ICONV_T_INVALID )
1974             {
1975                 // try charset w/o bytesex info (e.g. "UCS4")
1976                 wxLogTrace(TRACE_STRCONV, _T("  trying charset \"%s\""),
1977                            nameCS.c_str());
1978                 m2w = iconv_open(nameCS.ToAscii(), name);
1979
1980                 // and check for bytesex ourselves:
1981                 if ( m2w != ICONV_T_INVALID )
1982                 {
1983                     char    buf[2], *bufPtr;
1984                     wchar_t wbuf[2];
1985                     size_t  insz, outsz;
1986                     size_t  res;
1987
1988                     buf[0] = 'A';
1989                     buf[1] = 0;
1990                     wbuf[0] = 0;
1991                     insz = 2;
1992                     outsz = SIZEOF_WCHAR_T * 2;
1993                     char* wbufPtr = (char*)wbuf;
1994                     bufPtr = buf;
1995
1996                     res = iconv(
1997                         m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
1998                         &wbufPtr, &outsz);
1999
2000                     if (ICONV_FAILED(res, insz))
2001                     {
2002                         wxLogLastError(wxT("iconv"));
2003                         wxLogError(_("Conversion to charset '%s' doesn't work."),
2004                                    nameCS.c_str());
2005                     }
2006                     else // ok, can convert to this encoding, remember it
2007                     {
2008                         ms_wcCharsetName = nameCS;
2009                         ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
2010                     }
2011                 }
2012             }
2013             else // use charset not requiring byte swapping
2014             {
2015                 ms_wcCharsetName = nameXE;
2016             }
2017         }
2018
2019         wxLogTrace(TRACE_STRCONV,
2020                    wxT("iconv wchar_t charset is \"%s\"%s"),
2021                    ms_wcCharsetName.empty() ? wxString("<none>")
2022                                             : ms_wcCharsetName,
2023                    ms_wcNeedsSwap ? _T(" (needs swap)")
2024                                   : _T(""));
2025     }
2026     else // we already have ms_wcCharsetName
2027     {
2028         m2w = iconv_open(ms_wcCharsetName.ToAscii(), name);
2029     }
2030
2031     if ( ms_wcCharsetName.empty() )
2032     {
2033         w2m = ICONV_T_INVALID;
2034     }
2035     else
2036     {
2037         w2m = iconv_open(name, ms_wcCharsetName.ToAscii());
2038         if ( w2m == ICONV_T_INVALID )
2039         {
2040             wxLogTrace(TRACE_STRCONV,
2041                        wxT("\"%s\" -> \"%s\" works but not the converse!?"),
2042                        ms_wcCharsetName.c_str(), name);
2043         }
2044     }
2045 }
2046
2047 wxMBConv_iconv::~wxMBConv_iconv()
2048 {
2049     if ( m2w != ICONV_T_INVALID )
2050         iconv_close(m2w);
2051     if ( w2m != ICONV_T_INVALID )
2052         iconv_close(w2m);
2053 }
2054
2055 size_t
2056 wxMBConv_iconv::ToWChar(wchar_t *dst, size_t dstLen,
2057                         const char *src, size_t srcLen) const
2058 {
2059     if ( srcLen == wxNO_LEN )
2060     {
2061         // find the string length: notice that must be done differently for
2062         // NUL-terminated strings and UTF-16/32 which are terminated with 2/4
2063         // consecutive NULs
2064         const size_t nulLen = GetMBNulLen();
2065         switch ( nulLen )
2066         {
2067             default:
2068                 return wxCONV_FAILED;
2069
2070             case 1:
2071                 srcLen = strlen(src); // arguably more optimized than our version
2072                 break;
2073
2074             case 2:
2075             case 4:
2076                 // for UTF-16/32 not only we need to have 2/4 consecutive NULs
2077                 // but they also have to start at character boundary and not
2078                 // span two adjacent characters
2079                 const char *p;
2080                 for ( p = src; NotAllNULs(p, nulLen); p += nulLen )
2081                     ;
2082                 srcLen = p - src;
2083                 break;
2084         }
2085     }
2086
2087     // we express length in the number of (wide) characters but iconv always
2088     // counts buffer sizes it in bytes
2089     dstLen *= SIZEOF_WCHAR_T;
2090
2091 #if wxUSE_THREADS
2092     // NB: iconv() is MT-safe, but each thread must use its own iconv_t handle.
2093     //     Unfortunately there are a couple of global wxCSConv objects such as
2094     //     wxConvLocal that are used all over wx code, so we have to make sure
2095     //     the handle is used by at most one thread at the time. Otherwise
2096     //     only a few wx classes would be safe to use from non-main threads
2097     //     as MB<->WC conversion would fail "randomly".
2098     wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
2099 #endif // wxUSE_THREADS
2100
2101     size_t res, cres;
2102     const char *pszPtr = src;
2103
2104     if ( dst )
2105     {
2106         char* bufPtr = (char*)dst;
2107
2108         // have destination buffer, convert there
2109         cres = iconv(m2w,
2110                      ICONV_CHAR_CAST(&pszPtr), &srcLen,
2111                      &bufPtr, &dstLen);
2112         res = dstLen - (dstLen / SIZEOF_WCHAR_T);
2113
2114         if (ms_wcNeedsSwap)
2115         {
2116             // convert to native endianness
2117             for ( unsigned i = 0; i < res; i++ )
2118                 dst[dstLen] = WC_BSWAP(dst[i]);
2119         }
2120
2121         // NUL-terminate the string if there is any space left
2122         if (res < dstLen)
2123             dst[res] = 0;
2124     }
2125     else // no destination buffer
2126     {
2127         // convert using temp buffer to calculate the size of the buffer needed
2128         wchar_t tbuf[8];
2129         res = 0;
2130
2131         do
2132         {
2133             char* bufPtr = (char*)tbuf;
2134             dstLen = 8 * SIZEOF_WCHAR_T;
2135
2136             cres = iconv(m2w,
2137                          ICONV_CHAR_CAST(&pszPtr), &srcLen,
2138                          &bufPtr, &dstLen );
2139
2140             res += 8 - (dstLen / SIZEOF_WCHAR_T);
2141         }
2142         while ((cres == (size_t)-1) && (errno == E2BIG));
2143     }
2144
2145     if (ICONV_FAILED(cres, srcLen))
2146     {
2147         //VS: it is ok if iconv fails, hence trace only
2148         wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
2149         return wxCONV_FAILED;
2150     }
2151
2152     return res;
2153 }
2154
2155 size_t wxMBConv_iconv::FromWChar(char *dst, size_t dstLen,
2156                                  const wchar_t *src, size_t srcLen) const
2157 {
2158 #if wxUSE_THREADS
2159     // NB: explained in MB2WC
2160     wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
2161 #endif
2162
2163     if ( srcLen == wxNO_LEN )
2164         srcLen = wxWcslen(src);
2165
2166     size_t inbuflen = srcLen * SIZEOF_WCHAR_T;
2167     size_t outbuflen = dstLen;
2168     size_t res, cres;
2169
2170     wchar_t *tmpbuf = 0;
2171
2172     if (ms_wcNeedsSwap)
2173     {
2174         // need to copy to temp buffer to switch endianness
2175         // (doing WC_BSWAP twice on the original buffer won't help, as it
2176         //  could be in read-only memory, or be accessed in some other thread)
2177         tmpbuf = (wchar_t *)malloc(inbuflen + SIZEOF_WCHAR_T);
2178         for ( size_t i = 0; i < srcLen; i++ )
2179             tmpbuf[i] = WC_BSWAP(src[i]);
2180
2181         tmpbuf[srcLen] = L'\0';
2182         src = tmpbuf;
2183     }
2184
2185     char* inbuf = (char*)src;
2186     if ( dst )
2187     {
2188         // have destination buffer, convert there
2189         cres = iconv(w2m, ICONV_CHAR_CAST(&inbuf), &inbuflen, &dst, &outbuflen);
2190
2191         res = dstLen - outbuflen;
2192
2193         // NB: iconv was given only wcslen(src) characters on input, and so
2194         //     it couldn't convert the trailing zero. Let's do it ourselves
2195         //     if there's some room left for it in the output buffer.
2196         if (res < dstLen)
2197             dst[0] = 0;
2198     }
2199     else // no destination buffer
2200     {
2201         // convert using temp buffer to calculate the size of the buffer needed
2202         char tbuf[16];
2203         res = 0;
2204         do
2205         {
2206             dst = tbuf;
2207             outbuflen = 16;
2208
2209             cres = iconv(w2m, ICONV_CHAR_CAST(&inbuf), &inbuflen, &dst, &outbuflen);
2210
2211             res += 16 - outbuflen;
2212         }
2213         while ((cres == (size_t)-1) && (errno == E2BIG));
2214     }
2215
2216     if (ms_wcNeedsSwap)
2217     {
2218         free(tmpbuf);
2219     }
2220
2221     if (ICONV_FAILED(cres, inbuflen))
2222     {
2223         wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
2224         return wxCONV_FAILED;
2225     }
2226
2227     return res;
2228 }
2229
2230 size_t wxMBConv_iconv::GetMBNulLen() const
2231 {
2232     if ( m_minMBCharWidth == 0 )
2233     {
2234         wxMBConv_iconv * const self = wxConstCast(this, wxMBConv_iconv);
2235
2236 #if wxUSE_THREADS
2237         // NB: explained in MB2WC
2238         wxMutexLocker lock(self->m_iconvMutex);
2239 #endif
2240
2241         const wchar_t *wnul = L"";
2242         char buf[8]; // should be enough for NUL in any encoding
2243         size_t inLen = sizeof(wchar_t),
2244                outLen = WXSIZEOF(buf);
2245         char *inBuff = (char *)wnul;
2246         char *outBuff = buf;
2247         if ( iconv(w2m, ICONV_CHAR_CAST(&inBuff), &inLen, &outBuff, &outLen) == (size_t)-1 )
2248         {
2249             self->m_minMBCharWidth = (size_t)-1;
2250         }
2251         else // ok
2252         {
2253             self->m_minMBCharWidth = outBuff - buf;
2254         }
2255     }
2256
2257     return m_minMBCharWidth;
2258 }
2259
2260 #if wxUSE_UNICODE_UTF8
2261 bool wxMBConv_iconv::IsUTF8() const
2262 {
2263     return wxStricmp(m_name, "UTF-8") == 0 ||
2264            wxStricmp(m_name, "UTF8") == 0;
2265 }
2266 #endif
2267
2268 #endif // HAVE_ICONV
2269
2270
2271 // ============================================================================
2272 // Win32 conversion classes
2273 // ============================================================================
2274
2275 #ifdef wxHAVE_WIN32_MB2WC
2276
2277 // from utils.cpp
2278 #if wxUSE_FONTMAP
2279 extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const char *charset);
2280 extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
2281 #endif
2282
2283 class wxMBConv_win32 : public wxMBConv
2284 {
2285 public:
2286     wxMBConv_win32()
2287     {
2288         m_CodePage = CP_ACP;
2289         m_minMBCharWidth = 0;
2290     }
2291
2292     wxMBConv_win32(const wxMBConv_win32& conv)
2293         : wxMBConv()
2294     {
2295         m_CodePage = conv.m_CodePage;
2296         m_minMBCharWidth = conv.m_minMBCharWidth;
2297     }
2298
2299 #if wxUSE_FONTMAP
2300     wxMBConv_win32(const char* name)
2301     {
2302         m_CodePage = wxCharsetToCodepage(name);
2303         m_minMBCharWidth = 0;
2304     }
2305
2306     wxMBConv_win32(wxFontEncoding encoding)
2307     {
2308         m_CodePage = wxEncodingToCodepage(encoding);
2309         m_minMBCharWidth = 0;
2310     }
2311 #endif // wxUSE_FONTMAP
2312
2313     virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2314     {
2315         // note that we have to use MB_ERR_INVALID_CHARS flag as it without it
2316         // the behaviour is not compatible with the Unix version (using iconv)
2317         // and break the library itself, e.g. wxTextInputStream::NextChar()
2318         // wouldn't work if reading an incomplete MB char didn't result in an
2319         // error
2320         //
2321         // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
2322         // Win XP or newer and it is not supported for UTF-[78] so we always
2323         // use our own conversions in this case. See
2324         //     http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
2325         //     http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
2326         if ( m_CodePage == CP_UTF8 )
2327         {
2328             return wxMBConvUTF8().MB2WC(buf, psz, n);
2329         }
2330
2331         if ( m_CodePage == CP_UTF7 )
2332         {
2333             return wxMBConvUTF7().MB2WC(buf, psz, n);
2334         }
2335
2336         int flags = 0;
2337         if ( (m_CodePage < 50000 && m_CodePage != CP_SYMBOL) &&
2338                 IsAtLeastWin2kSP4() )
2339         {
2340             flags = MB_ERR_INVALID_CHARS;
2341         }
2342
2343         const size_t len = ::MultiByteToWideChar
2344                              (
2345                                 m_CodePage,     // code page
2346                                 flags,          // flags: fall on error
2347                                 psz,            // input string
2348                                 -1,             // its length (NUL-terminated)
2349                                 buf,            // output string
2350                                 buf ? n : 0     // size of output buffer
2351                              );
2352         if ( !len )
2353         {
2354             // function totally failed
2355             return wxCONV_FAILED;
2356         }
2357
2358         // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
2359         // check if we succeeded, by doing a double trip:
2360         if ( !flags && buf )
2361         {
2362             const size_t mbLen = strlen(psz);
2363             wxCharBuffer mbBuf(mbLen);
2364             if ( ::WideCharToMultiByte
2365                    (
2366                       m_CodePage,
2367                       0,
2368                       buf,
2369                       -1,
2370                       mbBuf.data(),
2371                       mbLen + 1,        // size in bytes, not length
2372                       NULL,
2373                       NULL
2374                    ) == 0 ||
2375                   strcmp(mbBuf, psz) != 0 )
2376             {
2377                 // we didn't obtain the same thing we started from, hence
2378                 // the conversion was lossy and we consider that it failed
2379                 return wxCONV_FAILED;
2380             }
2381         }
2382
2383         // note that it returns count of written chars for buf != NULL and size
2384         // of the needed buffer for buf == NULL so in either case the length of
2385         // the string (which never includes the terminating NUL) is one less
2386         return len - 1;
2387     }
2388
2389     virtual size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
2390     {
2391         /*
2392             we have a problem here: by default, WideCharToMultiByte() may
2393             replace characters unrepresentable in the target code page with bad
2394             quality approximations such as turning "1/2" symbol (U+00BD) into
2395             "1" for the code pages which don't have it and we, obviously, want
2396             to avoid this at any price
2397
2398             the trouble is that this function does it _silently_, i.e. it won't
2399             even tell us whether it did or not... Win98/2000 and higher provide
2400             WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
2401             we have to resort to a round trip, i.e. check that converting back
2402             results in the same string -- this is, of course, expensive but
2403             otherwise we simply can't be sure to not garble the data.
2404          */
2405
2406         // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
2407         // it doesn't work with CJK encodings (which we test for rather roughly
2408         // here...) nor with UTF-7/8 nor, of course, with Windows versions not
2409         // supporting it
2410         BOOL usedDef wxDUMMY_INITIALIZE(false);
2411         BOOL *pUsedDef;
2412         int flags;
2413         if ( CanUseNoBestFit() && m_CodePage < 50000 )
2414         {
2415             // it's our lucky day
2416             flags = WC_NO_BEST_FIT_CHARS;
2417             pUsedDef = &usedDef;
2418         }
2419         else // old system or unsupported encoding
2420         {
2421             flags = 0;
2422             pUsedDef = NULL;
2423         }
2424
2425         const size_t len = ::WideCharToMultiByte
2426                              (
2427                                 m_CodePage,     // code page
2428                                 flags,          // either none or no best fit
2429                                 pwz,            // input string
2430                                 -1,             // it is (wide) NUL-terminated
2431                                 buf,            // output buffer
2432                                 buf ? n : 0,    // and its size
2433                                 NULL,           // default "replacement" char
2434                                 pUsedDef        // [out] was it used?
2435                              );
2436
2437         if ( !len )
2438         {
2439             // function totally failed
2440             return wxCONV_FAILED;
2441         }
2442
2443         // we did something, check if we really succeeded
2444         if ( flags )
2445         {
2446             // check if the conversion failed, i.e. if any replacements
2447             // were done
2448             if ( usedDef )
2449                 return wxCONV_FAILED;
2450         }
2451         else // we must resort to double tripping...
2452         {
2453             // first we need to ensure that we really have the MB data: this is
2454             // not the case if we're called with NULL buffer, in which case we
2455             // need to do the conversion yet again
2456             wxCharBuffer bufDef;
2457             if ( !buf )
2458             {
2459                 bufDef = wxCharBuffer(len);
2460                 buf = bufDef.data();
2461                 if ( !::WideCharToMultiByte(m_CodePage, flags, pwz, -1,
2462                                             buf, len, NULL, NULL) )
2463                     return wxCONV_FAILED;
2464             }
2465
2466             if ( !n )
2467                 n = wcslen(pwz);
2468             wxWCharBuffer wcBuf(n);
2469             if ( MB2WC(wcBuf.data(), buf, n + 1) == wxCONV_FAILED ||
2470                     wcscmp(wcBuf, pwz) != 0 )
2471             {
2472                 // we didn't obtain the same thing we started from, hence
2473                 // the conversion was lossy and we consider that it failed
2474                 return wxCONV_FAILED;
2475             }
2476         }
2477
2478         // see the comment above for the reason of "len - 1"
2479         return len - 1;
2480     }
2481
2482     virtual size_t GetMBNulLen() const
2483     {
2484         if ( m_minMBCharWidth == 0 )
2485         {
2486             int len = ::WideCharToMultiByte
2487                         (
2488                             m_CodePage,     // code page
2489                             0,              // no flags
2490                             L"",            // input string
2491                             1,              // translate just the NUL
2492                             NULL,           // output buffer
2493                             0,              // and its size
2494                             NULL,           // no replacement char
2495                             NULL            // [out] don't care if it was used
2496                         );
2497
2498             wxMBConv_win32 * const self = wxConstCast(this, wxMBConv_win32);
2499             switch ( len )
2500             {
2501                 default:
2502                     wxLogDebug(_T("Unexpected NUL length %d"), len);
2503                     self->m_minMBCharWidth = (size_t)-1;
2504                     break;
2505
2506                 case 0:
2507                     self->m_minMBCharWidth = (size_t)-1;
2508                     break;
2509
2510                 case 1:
2511                 case 2:
2512                 case 4:
2513                     self->m_minMBCharWidth = len;
2514                     break;
2515             }
2516         }
2517
2518         return m_minMBCharWidth;
2519     }
2520
2521     virtual wxMBConv *Clone() const { return new wxMBConv_win32(*this); }
2522
2523     bool IsOk() const { return m_CodePage != -1; }
2524
2525 private:
2526     static bool CanUseNoBestFit()
2527     {
2528         static int s_isWin98Or2k = -1;
2529
2530         if ( s_isWin98Or2k == -1 )
2531         {
2532             int verMaj, verMin;
2533             switch ( wxGetOsVersion(&verMaj, &verMin) )
2534             {
2535                 case wxOS_WINDOWS_9X:
2536                     s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
2537                     break;
2538
2539                 case wxOS_WINDOWS_NT:
2540                     s_isWin98Or2k = verMaj >= 5;
2541                     break;
2542
2543                 default:
2544                     // unknown: be conservative by default
2545                     s_isWin98Or2k = 0;
2546                     break;
2547             }
2548
2549             wxASSERT_MSG( s_isWin98Or2k != -1, _T("should be set above") );
2550         }
2551
2552         return s_isWin98Or2k == 1;
2553     }
2554
2555     static bool IsAtLeastWin2kSP4()
2556     {
2557 #ifdef __WXWINCE__
2558         return false;
2559 #else
2560         static int s_isAtLeastWin2kSP4 = -1;
2561
2562         if ( s_isAtLeastWin2kSP4 == -1 )
2563         {
2564             OSVERSIONINFOEX ver;
2565
2566             memset(&ver, 0, sizeof(ver));
2567             ver.dwOSVersionInfoSize = sizeof(ver);
2568             GetVersionEx((OSVERSIONINFO*)&ver);
2569
2570             s_isAtLeastWin2kSP4 =
2571               ((ver.dwMajorVersion > 5) || // Vista+
2572                (ver.dwMajorVersion == 5 && ver.dwMinorVersion > 0) || // XP/2003
2573                (ver.dwMajorVersion == 5 && ver.dwMinorVersion == 0 &&
2574                ver.wServicePackMajor >= 4)) // 2000 SP4+
2575               ? 1 : 0;
2576         }
2577
2578         return s_isAtLeastWin2kSP4 == 1;
2579 #endif
2580     }
2581
2582
2583     // the code page we're working with
2584     long m_CodePage;
2585
2586     // cached result of GetMBNulLen(), set to 0 initially meaning
2587     // "unknown"
2588     size_t m_minMBCharWidth;
2589 };
2590
2591 #endif // wxHAVE_WIN32_MB2WC
2592
2593
2594 // ============================================================================
2595 // wxEncodingConverter based conversion classes
2596 // ============================================================================
2597
2598 #if wxUSE_FONTMAP
2599
2600 class wxMBConv_wxwin : public wxMBConv
2601 {
2602 private:
2603     void Init()
2604     {
2605         // Refuse to use broken wxEncodingConverter code for Mac-specific encodings.
2606         // The wxMBConv_cf class does a better job.
2607         m_ok = (m_enc < wxFONTENCODING_MACMIN || m_enc > wxFONTENCODING_MACMAX) &&
2608                m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
2609                w2m.Init(wxFONTENCODING_UNICODE, m_enc);
2610     }
2611
2612 public:
2613     // temporarily just use wxEncodingConverter stuff,
2614     // so that it works while a better implementation is built
2615     wxMBConv_wxwin(const char* name)
2616     {
2617         if (name)
2618             m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
2619         else
2620             m_enc = wxFONTENCODING_SYSTEM;
2621
2622         Init();
2623     }
2624
2625     wxMBConv_wxwin(wxFontEncoding enc)
2626     {
2627         m_enc = enc;
2628
2629         Init();
2630     }
2631
2632     size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
2633     {
2634         size_t inbuf = strlen(psz);
2635         if (buf)
2636         {
2637             if (!m2w.Convert(psz, buf))
2638                 return wxCONV_FAILED;
2639         }
2640         return inbuf;
2641     }
2642
2643     size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
2644     {
2645         const size_t inbuf = wxWcslen(psz);
2646         if (buf)
2647         {
2648             if (!w2m.Convert(psz, buf))
2649                 return wxCONV_FAILED;
2650         }
2651
2652         return inbuf;
2653     }
2654
2655     virtual size_t GetMBNulLen() const
2656     {
2657         switch ( m_enc )
2658         {
2659             case wxFONTENCODING_UTF16BE:
2660             case wxFONTENCODING_UTF16LE:
2661                 return 2;
2662
2663             case wxFONTENCODING_UTF32BE:
2664             case wxFONTENCODING_UTF32LE:
2665                 return 4;
2666
2667             default:
2668                 return 1;
2669         }
2670     }
2671
2672     virtual wxMBConv *Clone() const { return new wxMBConv_wxwin(m_enc); }
2673
2674     bool IsOk() const { return m_ok; }
2675
2676 public:
2677     wxFontEncoding m_enc;
2678     wxEncodingConverter m2w, w2m;
2679
2680 private:
2681     // were we initialized successfully?
2682     bool m_ok;
2683
2684     DECLARE_NO_COPY_CLASS(wxMBConv_wxwin)
2685 };
2686
2687 // make the constructors available for unit testing
2688 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_wxwin( const char* name )
2689 {
2690     wxMBConv_wxwin* result = new wxMBConv_wxwin( name );
2691     if ( !result->IsOk() )
2692     {
2693         delete result;
2694         return 0;
2695     }
2696
2697     return result;
2698 }
2699
2700 #endif // wxUSE_FONTMAP
2701
2702 // ============================================================================
2703 // wxCSConv implementation
2704 // ============================================================================
2705
2706 void wxCSConv::Init()
2707 {
2708     m_name = NULL;
2709     m_convReal =  NULL;
2710     m_deferred = true;
2711 }
2712
2713 wxCSConv::wxCSConv(const wxString& charset)
2714 {
2715     Init();
2716
2717     if ( !charset.empty() )
2718     {
2719         SetName(charset.ToAscii());
2720     }
2721
2722 #if wxUSE_FONTMAP
2723     m_encoding = wxFontMapperBase::GetEncodingFromName(charset);
2724 #else
2725     m_encoding = wxFONTENCODING_SYSTEM;
2726 #endif
2727 }
2728
2729 wxCSConv::wxCSConv(wxFontEncoding encoding)
2730 {
2731     if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
2732     {
2733         wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
2734
2735         encoding = wxFONTENCODING_SYSTEM;
2736     }
2737
2738     Init();
2739
2740     m_encoding = encoding;
2741 }
2742
2743 wxCSConv::~wxCSConv()
2744 {
2745     Clear();
2746 }
2747
2748 wxCSConv::wxCSConv(const wxCSConv& conv)
2749         : wxMBConv()
2750 {
2751     Init();
2752
2753     SetName(conv.m_name);
2754     m_encoding = conv.m_encoding;
2755 }
2756
2757 wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
2758 {
2759     Clear();
2760
2761     SetName(conv.m_name);
2762     m_encoding = conv.m_encoding;
2763
2764     return *this;
2765 }
2766
2767 void wxCSConv::Clear()
2768 {
2769     free(m_name);
2770     delete m_convReal;
2771
2772     m_name = NULL;
2773     m_convReal = NULL;
2774 }
2775
2776 void wxCSConv::SetName(const char *charset)
2777 {
2778     if (charset)
2779     {
2780         m_name = wxStrdup(charset);
2781         m_deferred = true;
2782     }
2783 }
2784
2785 #if wxUSE_FONTMAP
2786
2787 WX_DECLARE_HASH_MAP( wxFontEncoding, wxString, wxIntegerHash, wxIntegerEqual,
2788                      wxEncodingNameCache );
2789
2790 static wxEncodingNameCache gs_nameCache;
2791 #endif
2792
2793 wxMBConv *wxCSConv::DoCreate() const
2794 {
2795 #if wxUSE_FONTMAP
2796     wxLogTrace(TRACE_STRCONV,
2797                wxT("creating conversion for %s"),
2798                (m_name ? m_name
2799                        : (const char*)wxFontMapperBase::GetEncodingName(m_encoding).mb_str()));
2800 #endif // wxUSE_FONTMAP
2801
2802     // check for the special case of ASCII or ISO8859-1 charset: as we have
2803     // special knowledge of it anyhow, we don't need to create a special
2804     // conversion object
2805     if ( m_encoding == wxFONTENCODING_ISO8859_1 ||
2806             m_encoding == wxFONTENCODING_DEFAULT )
2807     {
2808         // don't convert at all
2809         return NULL;
2810     }
2811
2812     // we trust OS to do conversion better than we can so try external
2813     // conversion methods first
2814     //
2815     // the full order is:
2816     //      1. OS conversion (iconv() under Unix or Win32 API)
2817     //      2. hard coded conversions for UTF
2818     //      3. wxEncodingConverter as fall back
2819
2820     // step (1)
2821 #ifdef HAVE_ICONV
2822 #if !wxUSE_FONTMAP
2823     if ( m_name )
2824 #endif // !wxUSE_FONTMAP
2825     {
2826 #if wxUSE_FONTMAP
2827         wxFontEncoding encoding(m_encoding);
2828 #endif
2829
2830         if ( m_name )
2831         {
2832             wxMBConv_iconv *conv = new wxMBConv_iconv(m_name);
2833             if ( conv->IsOk() )
2834                 return conv;
2835
2836             delete conv;
2837
2838 #if wxUSE_FONTMAP
2839             encoding =
2840                 wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
2841 #endif // wxUSE_FONTMAP
2842         }
2843 #if wxUSE_FONTMAP
2844         {
2845             const wxEncodingNameCache::iterator it = gs_nameCache.find(encoding);
2846             if ( it != gs_nameCache.end() )
2847             {
2848                 if ( it->second.empty() )
2849                     return NULL;
2850
2851                 wxMBConv_iconv *conv = new wxMBConv_iconv(it->second.ToAscii());
2852                 if ( conv->IsOk() )
2853                     return conv;
2854
2855                 delete conv;
2856             }
2857
2858             const wxChar** names = wxFontMapperBase::GetAllEncodingNames(encoding);
2859             // CS : in case this does not return valid names (eg for MacRoman)
2860             // encoding got a 'failure' entry in the cache all the same,
2861             // although it just has to be created using a different method, so
2862             // only store failed iconv creation attempts (or perhaps we
2863             // shoulnd't do this at all ?)
2864             if ( names[0] != NULL )
2865             {
2866                 for ( ; *names; ++names )
2867                 {
2868                     // FIXME-UTF8: wxFontMapperBase::GetAllEncodingNames()
2869                     //             will need changes that will obsolete this
2870                     wxString name(*names);
2871                     wxMBConv_iconv *conv = new wxMBConv_iconv(name.ToAscii());
2872                     if ( conv->IsOk() )
2873                     {
2874                         gs_nameCache[encoding] = *names;
2875                         return conv;
2876                     }
2877
2878                     delete conv;
2879                 }
2880
2881                 gs_nameCache[encoding] = _T(""); // cache the failure
2882             }
2883         }
2884 #endif // wxUSE_FONTMAP
2885     }
2886 #endif // HAVE_ICONV
2887
2888 #ifdef wxHAVE_WIN32_MB2WC
2889     {
2890 #if wxUSE_FONTMAP
2891         wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
2892                                       : new wxMBConv_win32(m_encoding);
2893         if ( conv->IsOk() )
2894             return conv;
2895
2896         delete conv;
2897 #else
2898         return NULL;
2899 #endif
2900     }
2901 #endif // wxHAVE_WIN32_MB2WC
2902
2903 #ifdef __DARWIN__
2904     {
2905         // leave UTF16 and UTF32 to the built-ins of wx
2906         if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
2907             ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
2908         {
2909 #if wxUSE_FONTMAP
2910             wxMBConv_cf *conv = m_name ? new wxMBConv_cf(m_name)
2911                                           : new wxMBConv_cf(m_encoding);
2912 #else
2913             wxMBConv_cf *conv = new wxMBConv_cf(m_encoding);
2914 #endif
2915
2916             if ( conv->IsOk() )
2917                  return conv;
2918
2919             delete conv;
2920         }
2921     }
2922 #endif // __DARWIN__
2923
2924     // step (2)
2925     wxFontEncoding enc = m_encoding;
2926 #if wxUSE_FONTMAP
2927     if ( enc == wxFONTENCODING_SYSTEM && m_name )
2928     {
2929         // use "false" to suppress interactive dialogs -- we can be called from
2930         // anywhere and popping up a dialog from here is the last thing we want to
2931         // do
2932         enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
2933     }
2934 #endif // wxUSE_FONTMAP
2935
2936     switch ( enc )
2937     {
2938         case wxFONTENCODING_UTF7:
2939              return new wxMBConvUTF7;
2940
2941         case wxFONTENCODING_UTF8:
2942              return new wxMBConvUTF8;
2943
2944         case wxFONTENCODING_UTF16BE:
2945              return new wxMBConvUTF16BE;
2946
2947         case wxFONTENCODING_UTF16LE:
2948              return new wxMBConvUTF16LE;
2949
2950         case wxFONTENCODING_UTF32BE:
2951              return new wxMBConvUTF32BE;
2952
2953         case wxFONTENCODING_UTF32LE:
2954              return new wxMBConvUTF32LE;
2955
2956         default:
2957              // nothing to do but put here to suppress gcc warnings
2958              break;
2959     }
2960
2961     // step (3)
2962 #if wxUSE_FONTMAP
2963     {
2964         wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
2965                                       : new wxMBConv_wxwin(m_encoding);
2966         if ( conv->IsOk() )
2967             return conv;
2968
2969         delete conv;
2970     }
2971 #endif // wxUSE_FONTMAP
2972
2973     // NB: This is a hack to prevent deadlock. What could otherwise happen
2974     //     in Unicode build: wxConvLocal creation ends up being here
2975     //     because of some failure and logs the error. But wxLog will try to
2976     //     attach a timestamp, for which it will need wxConvLocal (to convert
2977     //     time to char* and then wchar_t*), but that fails, tries to log the
2978     //     error, but wxLog has an (already locked) critical section that
2979     //     guards the static buffer.
2980     static bool alreadyLoggingError = false;
2981     if (!alreadyLoggingError)
2982     {
2983         alreadyLoggingError = true;
2984         wxLogError(_("Cannot convert from the charset '%s'!"),
2985                    m_name ? m_name
2986                       :
2987 #if wxUSE_FONTMAP
2988                          (const char*)wxFontMapperBase::GetEncodingDescription(m_encoding).ToAscii()
2989 #else // !wxUSE_FONTMAP
2990                          (const char*)wxString::Format(_("encoding %i"), m_encoding).ToAscii()
2991 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
2992               );
2993
2994         alreadyLoggingError = false;
2995     }
2996
2997     return NULL;
2998 }
2999
3000 void wxCSConv::CreateConvIfNeeded() const
3001 {
3002     if ( m_deferred )
3003     {
3004         wxCSConv *self = (wxCSConv *)this; // const_cast
3005
3006         // if we don't have neither the name nor the encoding, use the default
3007         // encoding for this system
3008         if ( !m_name && m_encoding == wxFONTENCODING_SYSTEM )
3009         {
3010 #if wxUSE_INTL
3011             self->m_encoding = wxLocale::GetSystemEncoding();
3012 #else
3013             // fallback to some reasonable default:
3014             self->m_encoding = wxFONTENCODING_ISO8859_1;
3015 #endif // wxUSE_INTL
3016         }
3017
3018         self->m_convReal = DoCreate();
3019         self->m_deferred = false;
3020     }
3021 }
3022
3023 bool wxCSConv::IsOk() const
3024 {
3025     CreateConvIfNeeded();
3026
3027     // special case: no convReal created for wxFONTENCODING_ISO8859_1
3028     if ( m_encoding == wxFONTENCODING_ISO8859_1 )
3029         return true; // always ok as we do it ourselves
3030
3031     // m_convReal->IsOk() is called at its own creation, so we know it must
3032     // be ok if m_convReal is non-NULL
3033     return m_convReal != NULL;
3034 }
3035
3036 size_t wxCSConv::ToWChar(wchar_t *dst, size_t dstLen,
3037                          const char *src, size_t srcLen) const
3038 {
3039     CreateConvIfNeeded();
3040
3041     if (m_convReal)
3042         return m_convReal->ToWChar(dst, dstLen, src, srcLen);
3043
3044     // latin-1 (direct)
3045     return wxMBConv::ToWChar(dst, dstLen, src, srcLen);
3046 }
3047
3048 size_t wxCSConv::FromWChar(char *dst, size_t dstLen,
3049                            const wchar_t *src, size_t srcLen) const
3050 {
3051     CreateConvIfNeeded();
3052
3053     if (m_convReal)
3054         return m_convReal->FromWChar(dst, dstLen, src, srcLen);
3055
3056     // latin-1 (direct)
3057     return wxMBConv::FromWChar(dst, dstLen, src, srcLen);
3058 }
3059
3060 size_t wxCSConv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
3061 {
3062     CreateConvIfNeeded();
3063
3064     if (m_convReal)
3065         return m_convReal->MB2WC(buf, psz, n);
3066
3067     // latin-1 (direct)
3068     size_t len = strlen(psz);
3069
3070     if (buf)
3071     {
3072         for (size_t c = 0; c <= len; c++)
3073             buf[c] = (unsigned char)(psz[c]);
3074     }
3075
3076     return len;
3077 }
3078
3079 size_t wxCSConv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
3080 {
3081     CreateConvIfNeeded();
3082
3083     if (m_convReal)
3084         return m_convReal->WC2MB(buf, psz, n);
3085
3086     // latin-1 (direct)
3087     const size_t len = wxWcslen(psz);
3088     if (buf)
3089     {
3090         for (size_t c = 0; c <= len; c++)
3091         {
3092             if (psz[c] > 0xFF)
3093                 return wxCONV_FAILED;
3094
3095             buf[c] = (char)psz[c];
3096         }
3097     }
3098     else
3099     {
3100         for (size_t c = 0; c <= len; c++)
3101         {
3102             if (psz[c] > 0xFF)
3103                 return wxCONV_FAILED;
3104         }
3105     }
3106
3107     return len;
3108 }
3109
3110 size_t wxCSConv::GetMBNulLen() const
3111 {
3112     CreateConvIfNeeded();
3113
3114     if ( m_convReal )
3115     {
3116         return m_convReal->GetMBNulLen();
3117     }
3118
3119     // otherwise, we are ISO-8859-1
3120     return 1;
3121 }
3122
#if wxUSE_UNICODE_UTF8
// Return whatever the real converter's IsUTF8() returns if there is one, or
// false otherwise.
bool wxCSConv::IsUTF8() const
{
    CreateConvIfNeeded();

    // no real converter means that we are ISO-8859-1, i.e. not UTF-8
    return m_convReal && m_convReal->IsUTF8();
}
#endif // wxUSE_UNICODE_UTF8
3137
3138
3139 #if wxUSE_UNICODE
3140
3141 wxWCharBuffer wxSafeConvertMB2WX(const char *s)
3142 {
3143     if ( !s )
3144         return wxWCharBuffer();
3145
3146     wxWCharBuffer wbuf(wxConvLibc.cMB2WX(s));
3147     if ( !wbuf )
3148         wbuf = wxMBConvUTF8().cMB2WX(s);
3149     if ( !wbuf )
3150         wbuf = wxConvISO8859_1.cMB2WX(s);
3151
3152     return wbuf;
3153 }
3154
3155 wxCharBuffer wxSafeConvertWX2MB(const wchar_t *ws)
3156 {
3157     if ( !ws )
3158         return wxCharBuffer();
3159
3160     wxCharBuffer buf(wxConvLibc.cWX2MB(ws));
3161     if ( !buf )
3162         buf = wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL).cWX2MB(ws);
3163
3164     return buf;
3165 }
3166
3167 #endif // wxUSE_UNICODE
3168
3169 // ----------------------------------------------------------------------------
3170 // globals
3171 // ----------------------------------------------------------------------------
3172
3173 // NB: The reason why we create converter objects in this convoluted way,
3174 //     using a factory function instead of a global variable, is that they
3175 //     may be used at static initialization time (some of them are used by
3176 //     wxString ctors and there may be a global wxString object). In other
3177 //     words, possibly _before_ the converter global object would be
3178 //     initialized.
3179
3180 #undef wxConvLibc
3181 #undef wxConvUTF8
3182 #undef wxConvUTF7
3183 #undef wxConvLocal
3184 #undef wxConvISO8859_1
3185
// Define everything needed for the global converter called "conv":
//
//  - the exported pointer conv##Ptr of type iface*, initially NULL
//  - the exported function wxGet_##conv##Ptr() returning the address of its
//    local static object of type impl, which is declared with "init"
//    following its name
//  - the file-static pointer gs_##conv##instance initialized by calling this
//    function: this ensures that all global converter objects are created by
//    the time static initialization is done, i.e. before any thread is
//    launched
//
// The macro expansion doesn't end with a semicolon, it must be provided by
// the user of the macro.
#define WX_DEFINE_GLOBAL_CONV2(iface, impl, conv, init)                 \
    WXDLLIMPEXP_DATA_BASE(iface*) conv##Ptr = NULL;                     \
    WXDLLIMPEXP_BASE iface* wxGet_##conv##Ptr()                         \
    {                                                                   \
        static impl conv##Obj init;                                     \
        return &conv##Obj;                                              \
    }                                                                   \
    static iface* gs_##conv##instance = wxGet_##conv##Ptr()

// Simplified version of the macro above for the common case when the class
// of the object is the same as the type of the pointers to it.
#define WX_DEFINE_GLOBAL_CONV(type, conv, init) \
    WX_DEFINE_GLOBAL_CONV2(type, type, conv, init)
3200
3201 #ifdef __WINDOWS__
3202     WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConv_win32, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
3203 #else
3204     WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConvLibc, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
3205 #endif
3206
3207 // NB: we can't use wxEMPTY_PARAMETER_VALUE as final argument here because it's
3208 //     passed to WX_DEFINE_GLOBAL_CONV2 after a macro expansion and so still
3209 //     provokes an error message about "not enough macro parameters"; and we
3210 //     can't use "()" here as the name##Obj declaration would be parsed as a
3211 //     function declaration then, so use a semicolon and live with an extra
3212 //     empty statement (and hope that no compilers warns about this)
3213 WX_DEFINE_GLOBAL_CONV(wxMBConvStrictUTF8, wxConvUTF8, ;);
3214 WX_DEFINE_GLOBAL_CONV(wxMBConvUTF7, wxConvUTF7, ;);
3215
3216 WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvLocal, (wxFONTENCODING_SYSTEM));
3217 WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvISO8859_1, (wxFONTENCODING_ISO8859_1));
3218
3219 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = wxGet_wxConvLibcPtr();
3220 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvUI = wxGet_wxConvLocalPtr();
3221
3222 #ifdef __DARWIN__
3223 // The xnu kernel always communicates file paths in decomposed UTF-8.
3224 // WARNING: Are we sure that CFString's conversion will cause decomposition?
3225 static wxMBConv_cf wxConvMacUTF8DObj(wxFONTENCODING_UTF8);
3226 #endif
3227
3228 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName =
3229 #ifdef __DARWIN__
3230                                     &wxConvMacUTF8DObj;
3231 #else // !__DARWIN__
3232                                     wxGet_wxConvLibcPtr();
3233 #endif // __DARWIN__/!__DARWIN__
3234
3235 #else // !wxUSE_WCHAR_T
3236
3237 // FIXME-UTF8: remove this, wxUSE_WCHAR_T is required now
3238 // stand-ins in absence of wchar_t
3239 WXDLLIMPEXP_DATA_BASE(wxMBConv) wxConvLibc,
3240                                 wxConvISO8859_1,
3241                                 wxConvLocal,
3242                                 wxConvUTF8;
3243
3244 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T