src/common/strconv.cpp

   1 /////////////////////////////////////////////////////////////////////////////
   2 // Name:        src/common/strconv.cpp
   3 // Purpose:     Unicode conversion classes
   4 // Author:      Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
   5 //              Ryan Norton, Fredrik Roubert (UTF7)
   6 // Modified by:
   7 // Created:     29/01/98
   8 // RCS-ID:      $Id$
   9 // Copyright:   (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
  10 //              (c) 2000-2003 Vadim Zeitlin
  11 //              (c) 2004 Ryan Norton, Fredrik Roubert
  12 // Licence:     wxWindows licence
  13 /////////////////////////////////////////////////////////////////////////////
  14
  15 // For compilers that support precompilation, includes "wx.h".
  16 #include "wx/wxprec.h"
  17
  18 #ifdef __BORLANDC__
  19     #pragma hdrstop
  20 #endif  //__BORLANDC__
  21
  22 #ifndef WX_PRECOMP
  23     #include "wx/intl.h"
  24     #include "wx/log.h"
  25     #include "wx/utils.h"
  26     #include "wx/hashmap.h"
  27 #endif
  28
  29 #include "wx/strconv.h"
  30
  31 #if wxUSE_WCHAR_T
  32
  33 #ifndef __WXWINCE__
  34 #include <errno.h>
  35 #endif
  36
  37 #include <ctype.h>
  38 #include <string.h>
  39 #include <stdlib.h>
  40
  41 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
  42     #include "wx/msw/private.h"
  43     #include "wx/msw/missing.h"
  44     #define wxHAVE_WIN32_MB2WC
  45 #endif
  46
  47 #ifdef HAVE_ICONV
  48     #include <iconv.h>
  49     #include "wx/thread.h"
  50 #endif
  51
  52 #include "wx/encconv.h"
  53 #include "wx/fontmap.h"
  54
  55 #ifdef __DARWIN__
  56 #include "wx/mac/corefoundation/private/strconv_cf.h"
  57 #endif //def __DARWIN__
  58
  59
  60 #define TRACE_STRCONV _T("strconv")
  61
  62 // WC_UTF16 is defined only if sizeof(wchar_t) == 2, otherwise it's supposed to
  63 // be 4 bytes
  64 #if SIZEOF_WCHAR_T == 2
  65     #define WC_UTF16
  66 #endif
  67
  68
  69 // ============================================================================
  70 // implementation
  71 // ============================================================================
  72
  73 // helper function of cMB2WC(): check if n bytes at this location are all NUL
  74 static bool NotAllNULs(const char *p, size_t n)
  75 {
  76     while ( n && *p++ == '\0' )
  77         n--;
  78
  79     return n != 0;
  80 }
  81
  82 // ----------------------------------------------------------------------------
  83 // UTF-16 en/decoding to/from UCS-4 with surrogates handling
  84 // ----------------------------------------------------------------------------
  85
  86 static size_t encode_utf16(wxUint32 input, wxUint16 *output)
  87 {
  88     if (input <= 0xffff)
  89     {
  90         if (output)
  91             *output = (wxUint16) input;
  92
  93         return 1;
  94     }
  95     else if (input >= 0x110000)
  96     {
  97         return wxCONV_FAILED;
  98     }
  99     else
 100     {
 101         if (output)
 102         {
 103             *output++ = (wxUint16) ((input >> 10) + 0xd7c0);
 104             *output = (wxUint16) ((input & 0x3ff) + 0xdc00);
 105         }
 106
 107         return 2;
 108     }
 109 }
 110
 111 static size_t decode_utf16(const wxUint16* input, wxUint32& output)
 112 {
 113     if ((*input < 0xd800) || (*input > 0xdfff))
 114     {
 115         output = *input;
 116         return 1;
 117     }
 118     else if ((input[1] < 0xdc00) || (input[1] > 0xdfff))
 119     {
 120         output = *input;
 121         return wxCONV_FAILED;
 122     }
 123     else
 124     {
 125         output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
 126         return 2;
 127     }
 128 }
 129
 130 #ifdef WC_UTF16
 131     typedef wchar_t wxDecodeSurrogate_t;
 132 #else // !WC_UTF16
 133     typedef wxUint16 wxDecodeSurrogate_t;
 134 #endif // WC_UTF16/!WC_UTF16
 135
 136 // returns the next UTF-32 character from the wchar_t buffer and advances the
 137 // pointer to the character after this one
 138 //
 139 // if an invalid character is found, *pSrc is set to NULL, the caller must
 140 // check for this
 141 static wxUint32 wxDecodeSurrogate(const wxDecodeSurrogate_t **pSrc)
 142 {
 143     wxUint32 out;
 144     const size_t
 145         n = decode_utf16(wx_reinterpret_cast(const wxUint16 *, *pSrc), out);
 146     if ( n == wxCONV_FAILED )
 147         *pSrc = NULL;
 148     else
 149         *pSrc += n;
 150
 151     return out;
 152 }
 153
 154 // ----------------------------------------------------------------------------
 155 // wxMBConv
 156 // ----------------------------------------------------------------------------
 157
 158 size_t
 159 wxMBConv::ToWChar(wchar_t *dst, size_t dstLen,
 160                   const char *src, size_t srcLen) const
 161 {
 162     // although new conversion classes are supposed to implement this function
 163     // directly, the existins ones only implement the old MB2WC() and so, to
 164     // avoid to have to rewrite all conversion classes at once, we provide a
 165     // default (but not efficient) implementation of this one in terms of the
 166     // old function by copying the input to ensure that it's NUL-terminated and
 167     // then using MB2WC() to convert it
 168
 169     // the number of chars [which would be] written to dst [if it were not NULL]
 170     size_t dstWritten = 0;
 171
 172     // the number of NULs terminating this string
 173     size_t nulLen = 0;  // not really needed, but just to avoid warnings
 174
 175     // if we were not given the input size we just have to assume that the
 176     // string is properly terminated as we have no way of knowing how long it
 177     // is anyhow, but if we do have the size check whether there are enough
 178     // NULs at the end
 179     wxCharBuffer bufTmp;
 180     const char *srcEnd;
 181     if ( srcLen != wxNO_LEN )
 182     {
 183         // we need to know how to find the end of this string
 184         nulLen = GetMBNulLen();
 185         if ( nulLen == wxCONV_FAILED )
 186             return wxCONV_FAILED;
 187
 188         // if there are enough NULs we can avoid the copy
 189         if ( srcLen < nulLen || NotAllNULs(src + srcLen - nulLen, nulLen) )
 190         {
 191             // make a copy in order to properly NUL-terminate the string
 192             bufTmp = wxCharBuffer(srcLen + nulLen - 1 /* 1 will be added */);
 193             char * const p = bufTmp.data();
 194             memcpy(p, src, srcLen);
 195             for ( char *s = p + srcLen; s < p + srcLen + nulLen; s++ )
 196                 *s = '\0';
 197
 198             src = bufTmp;
 199         }
 200
 201         srcEnd = src + srcLen;
 202     }
 203     else // quit after the first loop iteration
 204     {
 205         srcEnd = NULL;
 206     }
 207
 208     for ( ;; )
 209     {
 210         // try to convert the current chunk
 211         size_t lenChunk = MB2WC(NULL, src, 0);
 212         if ( lenChunk == wxCONV_FAILED )
 213             return wxCONV_FAILED;
 214
 215         lenChunk++; // for the L'\0' at the end of this chunk
 216
 217         dstWritten += lenChunk;
 218
 219         if ( lenChunk == 1 )
 220         {
 221             // nothing left in the input string, conversion succeeded
 222             break;
 223         }
 224
 225         if ( dst )
 226         {
 227             if ( dstWritten > dstLen )
 228                 return wxCONV_FAILED;
 229
 230             if ( MB2WC(dst, src, lenChunk) == wxCONV_FAILED )
 231                 return wxCONV_FAILED;
 232
 233             dst += lenChunk;
 234         }
 235
 236         if ( !srcEnd )
 237         {
 238             // we convert just one chunk in this case as this is the entire
 239             // string anyhow
 240             break;
 241         }
 242
 243         // advance the input pointer past the end of this chunk
 244         while ( NotAllNULs(src, nulLen) )
 245         {
 246             // notice that we must skip over multiple bytes here as we suppose
 247             // that if NUL takes 2 or 4 bytes, then all the other characters do
 248             // too and so if advanced by a single byte we might erroneously
 249             // detect sequences of NUL bytes in the middle of the input
 250             src += nulLen;
 251         }
 252
 253         src += nulLen; // skipping over its terminator as well
 254
 255         // note that ">=" (and not just "==") is needed here as the terminator
 256         // we skipped just above could be inside or just after the buffer
 257         // delimited by inEnd
 258         if ( src >= srcEnd )
 259             break;
 260     }
 261
 262     return dstWritten;
 263 }
 264
 265 size_t
 266 wxMBConv::FromWChar(char *dst, size_t dstLen,
 267                     const wchar_t *src, size_t srcLen) const
 268 {
 269     // the number of chars [which would be] written to dst [if it were not NULL]
 270     size_t dstWritten = 0;
 271
 272     // make a copy of the input string unless it is already properly
 273     // NUL-terminated
 274     //
 275     // if we don't know its length we have no choice but to assume that it is,
 276     // indeed, properly terminated
 277     wxWCharBuffer bufTmp;
 278     if ( srcLen == wxNO_LEN )
 279     {
 280         srcLen = wxWcslen(src) + 1;
 281     }
 282     else if ( srcLen != 0 && src[srcLen - 1] != L'\0' )
 283     {
 284         // make a copy in order to properly NUL-terminate the string
 285         bufTmp = wxWCharBuffer(srcLen);
 286         memcpy(bufTmp.data(), src, srcLen * sizeof(wchar_t));
 287         src = bufTmp;
 288     }
 289
 290     const size_t lenNul = GetMBNulLen();
 291     for ( const wchar_t * const srcEnd = src + srcLen;
 292           src < srcEnd;
 293           src += wxWcslen(src) + 1 /* skip L'\0' too */ )
 294     {
 295         // try to convert the current chunk
 296         size_t lenChunk = WC2MB(NULL, src, 0);
 297
 298         if ( lenChunk == wxCONV_FAILED )
 299             return wxCONV_FAILED;
 300
 301         lenChunk += lenNul;
 302         dstWritten += lenChunk;
 303
 304         if ( dst )
 305         {
 306             if ( dstWritten > dstLen )
 307                 return wxCONV_FAILED;
 308
 309             if ( WC2MB(dst, src, lenChunk) == wxCONV_FAILED )
 310                 return wxCONV_FAILED;
 311
 312             dst += lenChunk;
 313         }
 314     }
 315
 316     return dstWritten;
 317 }
 318
 319 size_t wxMBConv::MB2WC(wchar_t *outBuff, const char *inBuff, size_t outLen) const
 320 {
 321     size_t rc = ToWChar(outBuff, outLen, inBuff);
 322     if ( rc != wxCONV_FAILED )
 323     {
 324         // ToWChar() returns the buffer length, i.e. including the trailing
 325         // NUL, while this method doesn't take it into account
 326         rc--;
 327     }
 328
 329     return rc;
 330 }
 331
 332 size_t wxMBConv::WC2MB(char *outBuff, const wchar_t *inBuff, size_t outLen) const
 333 {
 334     size_t rc = FromWChar(outBuff, outLen, inBuff);
 335     if ( rc != wxCONV_FAILED )
 336     {
 337         rc -= GetMBNulLen();
 338     }
 339
 340     return rc;
 341 }
 342
 343 wxMBConv::~wxMBConv()
 344 {
 345     // nothing to do here (necessary for Darwin linking probably)
 346 }
 347
 348 const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
 349 {
 350     if ( psz )
 351     {
 352         // calculate the length of the buffer needed first
 353         const size_t nLen = ToWChar(NULL, 0, psz);
 354         if ( nLen != wxCONV_FAILED )
 355         {
 356             // now do the actual conversion
 357             wxWCharBuffer buf(nLen - 1 /* +1 added implicitly */);
 358
 359             // +1 for the trailing NULL
 360             if ( ToWChar(buf.data(), nLen, psz) != wxCONV_FAILED )
 361                 return buf;
 362         }
 363     }
 364
 365     return wxWCharBuffer();
 366 }
 367
 368 const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
 369 {
 370     if ( pwz )
 371     {
 372         const size_t nLen = FromWChar(NULL, 0, pwz);
 373         if ( nLen != wxCONV_FAILED )
 374         {
 375             wxCharBuffer buf(nLen - 1);
 376             if ( FromWChar(buf.data(), nLen, pwz) != wxCONV_FAILED )
 377                 return buf;
 378         }
 379     }
 380
 381     return wxCharBuffer();
 382 }
 383
 384 const wxWCharBuffer
 385 wxMBConv::cMB2WC(const char *inBuff, size_t inLen, size_t *outLen) const
 386 {
 387     const size_t dstLen = ToWChar(NULL, 0, inBuff, inLen);
 388     if ( dstLen != wxCONV_FAILED )
 389     {
 390         // notice that we allocate space for dstLen+1 wide characters here
 391         // because we want the buffer to always be NUL-terminated, even if the
 392         // input isn't (as otherwise the caller has no way to know its length)
 393         wxWCharBuffer wbuf(dstLen);
 394         wbuf.data()[dstLen - 1] = L'\0';
 395         if ( ToWChar(wbuf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
 396         {
 397             if ( outLen )
 398             {
 399                 *outLen = dstLen;
 400                 if ( wbuf[dstLen - 1] == L'\0' )
 401                     (*outLen)--;
 402             }
 403
 404             return wbuf;
 405         }
 406     }
 407
 408     if ( outLen )
 409         *outLen = 0;
 410
 411     return wxWCharBuffer();
 412 }
 413
 414 const wxCharBuffer
 415 wxMBConv::cWC2MB(const wchar_t *inBuff, size_t inLen, size_t *outLen) const
 416 {
 417     size_t dstLen = FromWChar(NULL, 0, inBuff, inLen);
 418     if ( dstLen != wxCONV_FAILED )
 419     {
 420         const size_t nulLen = GetMBNulLen();
 421
 422         // as above, ensure that the buffer is always NUL-terminated, even if
 423         // the input is not
 424         wxCharBuffer buf(dstLen + nulLen - 1);
 425         memset(buf.data() + dstLen, 0, nulLen);
 426         if ( FromWChar(buf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
 427         {
 428             if ( outLen )
 429             {
 430                 *outLen = dstLen;
 431
 432                 if ( dstLen >= nulLen &&
 433                         !NotAllNULs(buf.data() + dstLen - nulLen, nulLen) )
 434                 {
 435                     // in this case the output is NUL-terminated and we're not
 436                     // supposed to count NUL
 437                     *outLen -= nulLen;
 438                 }
 439             }
 440
 441             return buf;
 442         }
 443     }
 444
 445     if ( outLen )
 446         *outLen = 0;
 447
 448     return wxCharBuffer();
 449 }
 450
 451 // ----------------------------------------------------------------------------
 452 // wxMBConvLibc
 453 // ----------------------------------------------------------------------------
 454
 455 size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 456 {
 457     return wxMB2WC(buf, psz, n);
 458 }
 459
 460 size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 461 {
 462     return wxWC2MB(buf, psz, n);
 463 }
 464
 465 // ----------------------------------------------------------------------------
 466 // wxConvBrokenFileNames
 467 // ----------------------------------------------------------------------------
 468
 469 #ifdef __UNIX__
 470
 471 wxConvBrokenFileNames::wxConvBrokenFileNames(const wxString& charset)
 472 {
 473     if ( wxStricmp(charset, _T("UTF-8")) == 0 ||
 474          wxStricmp(charset, _T("UTF8")) == 0  )
 475         m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_PUA);
 476     else
 477         m_conv = new wxCSConv(charset);
 478 }
 479
 480 #endif // __UNIX__
 481
 482 // ----------------------------------------------------------------------------
 483 // UTF-7
 484 // ----------------------------------------------------------------------------
 485
 486 // Implementation (C) 2004 Fredrik Roubert
 487
 488 //
 489 // BASE64 decoding table
 490 //
 491 static const unsigned char utf7unb64[] =
 492 {
 493     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 494     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 495     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 496     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 497     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 498     0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
 499     0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b,
 500     0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 501     0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
 502     0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
 503     0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
 504     0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
 505     0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
 506     0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
 507     0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30,
 508     0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
 509     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 510     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 511     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 512     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 513     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 514     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 515     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 516     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 517     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 518     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 519     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 520     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 521     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 522     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 523     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 524     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
 525 };
 526
 527 size_t wxMBConvUTF7::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 528 {
 529     size_t len = 0;
 530
 531     while ( *psz && (!buf || (len < n)) )
 532     {
 533         unsigned char cc = *psz++;
 534         if (cc != '+')
 535         {
 536             // plain ASCII char
 537             if (buf)
 538                 *buf++ = cc;
 539             len++;
 540         }
 541         else if (*psz == '-')
 542         {
 543             // encoded plus sign
 544             if (buf)
 545                 *buf++ = cc;
 546             len++;
 547             psz++;
 548         }
 549         else // start of BASE64 encoded string
 550         {
 551             bool lsb, ok;
 552             unsigned int d, l;
 553             for ( ok = lsb = false, d = 0, l = 0;
 554                   (cc = utf7unb64[(unsigned char)*psz]) != 0xff;
 555                   psz++ )
 556             {
 557                 d <<= 6;
 558                 d += cc;
 559                 for (l += 6; l >= 8; lsb = !lsb)
 560                 {
 561                     unsigned char c = (unsigned char)((d >> (l -= 8)) % 256);
 562                     if (lsb)
 563                     {
 564                         if (buf)
 565                             *buf++ |= c;
 566                         len ++;
 567                         ok = true;
 568                     }
 569                     else
 570                     {
 571                         if (buf)
 572                             *buf = (wchar_t)(c << 8);
 573                     }
 574                 }
 575             }
 576
 577             if ( !ok )
 578             {
 579                 // in valid UTF7 we should have valid characters after '+'
 580                 return wxCONV_FAILED;
 581             }
 582
 583             if (*psz == '-')
 584                 psz++;
 585         }
 586     }
 587
 588     if ( buf && (len < n) )
 589         *buf = '\0';
 590
 591     return len;
 592 }
 593
 594 //
 595 // BASE64 encoding table
 596 //
 597 static const unsigned char utf7enb64[] =
 598 {
 599     'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
 600     'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
 601     'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
 602     'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
 603     'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
 604     'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
 605     'w', 'x', 'y', 'z', '0', '1', '2', '3',
 606     '4', '5', '6', '7', '8', '9', '+', '/'
 607 };
 608
 609 //
 610 // UTF-7 encoding table
 611 //
 612 // 0 - Set D (directly encoded characters)
 613 // 1 - Set O (optional direct characters)
 614 // 2 - whitespace characters (optional)
 615 // 3 - special characters
 616 //
 617 static const unsigned char utf7encode[128] =
 618 {
 619     3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
 620     3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
 621     2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
 622     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
 623     1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 624     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
 625     1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 626     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
 627 };
 628
 629 size_t wxMBConvUTF7::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 630 {
 631     size_t len = 0;
 632
 633     while (*psz && ((!buf) || (len < n)))
 634     {
 635         wchar_t cc = *psz++;
 636         if (cc < 0x80 && utf7encode[cc] < 1)
 637         {
 638             // plain ASCII char
 639             if (buf)
 640                 *buf++ = (char)cc;
 641
 642             len++;
 643         }
 644 #ifndef WC_UTF16
 645         else if (((wxUint32)cc) > 0xffff)
 646         {
 647             // no surrogate pair generation (yet?)
 648             return wxCONV_FAILED;
 649         }
 650 #endif
 651         else
 652         {
 653             if (buf)
 654                 *buf++ = '+';
 655
 656             len++;
 657             if (cc != '+')
 658             {
 659                 // BASE64 encode string
 660                 unsigned int lsb, d, l;
 661                 for (d = 0, l = 0; /*nothing*/; psz++)
 662                 {
 663                     for (lsb = 0; lsb < 2; lsb ++)
 664                     {
 665                         d <<= 8;
 666                         d += lsb ? cc & 0xff : (cc & 0xff00) >> 8;
 667
 668                         for (l += 8; l >= 6; )
 669                         {
 670                             l -= 6;
 671                             if (buf)
 672                                 *buf++ = utf7enb64[(d >> l) % 64];
 673                             len++;
 674                         }
 675                     }
 676
 677                     cc = *psz;
 678                     if (!(cc) || (cc < 0x80 && utf7encode[cc] < 1))
 679                         break;
 680                 }
 681
 682                 if (l != 0)
 683                 {
 684                     if (buf)
 685                         *buf++ = utf7enb64[((d % 16) << (6 - l)) % 64];
 686
 687                     len++;
 688                 }
 689             }
 690
 691             if (buf)
 692                 *buf++ = '-';
 693             len++;
 694         }
 695     }
 696
 697     if (buf && (len < n))
 698         *buf = 0;
 699
 700     return len;
 701 }
 702
 703 // ----------------------------------------------------------------------------
 704 // UTF-8
 705 // ----------------------------------------------------------------------------
 706
 707 static const wxUint32 utf8_max[]=
 708     { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
 709
 710 // boundaries of the private use area we use to (temporarily) remap invalid
 711 // characters invalid in a UTF-8 encoded string
 712 const wxUint32 wxUnicodePUA = 0x100000;
 713 const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
 714
 715 // this table gives the length of the UTF-8 encoding from its first character:
 716 const unsigned char tableUtf8Lengths[256] = {
 717     // single-byte sequences (ASCII):
 718     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 00..0F
 719     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 10..1F
 720     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 20..2F
 721     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 30..3F
 722     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 40..4F
 723     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 50..5F
 724     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 60..6F
 725     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 70..7F
 726
 727     // these are invalid:
 728     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 80..8F
 729     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 90..9F
 730     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // A0..AF
 731     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // B0..BF
 732     0, 0,                                            // C0,C1
 733
 734     // two-byte sequences:
 735           2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // C2..CF
 736     2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // D0..DF
 737
 738     // three-byte sequences:
 739     3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,  // E0..EF
 740
 741     // four-byte sequences:
 742     4, 4, 4, 4, 4,                                   // F0..F4
 743
 744     // these are invalid again (5- or 6-byte
 745     // sequences and sequences for code points
 746     // above U+10FFFF, as restricted by RFC 3629):
 747                    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0   // F5..FF
 748 };
 749
 750 size_t
 751 wxMBConvStrictUTF8::ToWChar(wchar_t *dst, size_t dstLen,
 752                             const char *src, size_t srcLen) const
 753 {
 754     wchar_t *out = dstLen ? dst : NULL;
 755     size_t written = 0;
 756
 757     if ( srcLen == wxNO_LEN )
 758         srcLen = strlen(src) + 1;
 759
 760     for ( const char *p = src; ; p++ )
 761     {
 762         if ( !(srcLen == wxNO_LEN ? *p : srcLen) )
 763         {
 764             // all done successfully, just add the trailing NULL if we are not
 765             // using explicit length
 766             if ( srcLen == wxNO_LEN )
 767             {
 768                 if ( out )
 769                 {
 770                     if ( !dstLen )
 771                         break;
 772
 773                     *out = L'\0';
 774                 }
 775
 776                 written++;
 777             }
 778
 779             return written;
 780         }
 781
 782         if ( out && !dstLen-- )
 783             break;
 784
 785         wxUint32 code;
 786         unsigned char c = *p;
 787
 788         if ( c < 0x80 )
 789         {
 790             if ( srcLen == 0 ) // the test works for wxNO_LEN too
 791                 break;
 792
 793             if ( srcLen != wxNO_LEN )
 794                 srcLen--;
 795
 796             code = c;
 797         }
 798         else
 799         {
 800             unsigned len = tableUtf8Lengths[c];
 801             if ( !len )
 802                 break;
 803
 804             if ( srcLen < len ) // the test works for wxNO_LEN too
 805                 break;
 806
 807             if ( srcLen != wxNO_LEN )
 808                 srcLen -= len;
 809
 810             //   Char. number range   |        UTF-8 octet sequence
 811             //      (hexadecimal)     |              (binary)
 812             //  ----------------------+----------------------------------------
 813             //  0000 0000 - 0000 007F | 0xxxxxxx
 814             //  0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
 815             //  0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
 816             //  0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 817             //
 818             //  Code point value is stored in bits marked with 'x',
 819             //  lowest-order bit of the value on the right side in the diagram
 820             //  above.                                         (from RFC 3629)
 821
 822             // mask to extract lead byte's value ('x' bits above), by sequence
 823             // length:
 824             static const unsigned char leadValueMask[] = { 0x7F, 0x1F, 0x0F, 0x07 };
 825
 826             // mask and value of lead byte's most significant bits, by length:
 827             static const unsigned char leadMarkerMask[] = { 0x80, 0xE0, 0xF0, 0xF8 };
 828             static const unsigned char leadMarkerVal[] = { 0x00, 0xC0, 0xE0, 0xF0 };
 829
 830             len--; // it's more convenient to work with 0-based length here
 831
 832             // extract the lead byte's value bits:
 833             if ( (c & leadMarkerMask[len]) != leadMarkerVal[len] )
 834                 break;
 835
 836             code = c & leadValueMask[len];
 837
 838             // all remaining bytes, if any, are handled in the same way
 839             // regardless of sequence's length:
 840             for ( ; len; --len )
 841             {
 842                 c = *++p;
 843                 if ( (c & 0xC0) != 0x80 )
 844                     return wxCONV_FAILED;
 845
 846                 code <<= 6;
 847                 code |= c & 0x3F;
 848             }
 849         }
 850
 851 #ifdef WC_UTF16
 852         // cast is ok because wchar_t == wxUint16 if WC_UTF16
 853         if ( encode_utf16(code, (wxUint16 *)out) == 2 )
 854         {
 855             if ( out )
 856                 out++;
 857             written++;
 858         }
 859 #else // !WC_UTF16
 860         if ( out )
 861             *out = code;
 862 #endif // WC_UTF16/!WC_UTF16
 863
 864         if ( out )
 865             out++;
 866
 867         written++;
 868     }
 869
 870     return wxCONV_FAILED;
 871 }
 872
 873 size_t
 874 wxMBConvStrictUTF8::FromWChar(char *dst, size_t dstLen,
 875                               const wchar_t *src, size_t srcLen) const
 876 {
 877     char *out = dstLen ? dst : NULL;
 878     size_t written = 0;
 879
 880     for ( const wchar_t *wp = src; ; wp++ )
 881     {
 882         if ( !(srcLen == wxNO_LEN ? *wp : srcLen--) )
 883         {
 884             // all done successfully, just add the trailing NULL if we are not
 885             // using explicit length
 886             if ( srcLen == wxNO_LEN )
 887             {
 888                 if ( out )
 889                 {
 890                     if ( !dstLen )
 891                         break;
 892
 893                     *out = '\0';
 894                 }
 895
 896                 written++;
 897             }
 898
 899             return written;
 900         }
 901
 902
 903         wxUint32 code;
 904 #ifdef WC_UTF16
 905         // cast is ok for WC_UTF16
 906         if ( decode_utf16((const wxUint16 *)wp, code) == 2 )
 907         {
 908             // skip the next char too as we decoded a surrogate
 909             wp++;
 910         }
 911 #else // wchar_t is UTF-32
 912         code = *wp & 0x7fffffff;
 913 #endif
 914
 915         unsigned len;
 916         if ( code <= 0x7F )
 917         {
 918             len = 1;
 919             if ( out )
 920             {
 921                 if ( dstLen < len )
 922                     break;
 923
 924                 out[0] = (char)code;
 925             }
 926         }
 927         else if ( code <= 0x07FF )
 928         {
 929             len = 2;
 930             if ( out )
 931             {
 932                 if ( dstLen < len )
 933                     break;
 934
 935                 // NB: this line takes 6 least significant bits, encodes them as
 936                 // 10xxxxxx and discards them so that the next byte can be encoded:
 937                 out[1] = 0x80 | (code & 0x3F);  code >>= 6;
 938                 out[0] = 0xC0 | code;
 939             }
 940         }
 941         else if ( code < 0xFFFF )
 942         {
 943             len = 3;
 944             if ( out )
 945             {
 946                 if ( dstLen < len )
 947                     break;
 948
 949                 out[2] = 0x80 | (code & 0x3F);  code >>= 6;
 950                 out[1] = 0x80 | (code & 0x3F);  code >>= 6;
 951                 out[0] = 0xE0 | code;
 952             }
 953         }
 954         else if ( code <= 0x10FFFF )
 955         {
 956             len = 4;
 957             if ( out )
 958             {
 959                 if ( dstLen < len )
 960                     break;
 961
 962                 out[3] = 0x80 | (code & 0x3F);  code >>= 6;
 963                 out[2] = 0x80 | (code & 0x3F);  code >>= 6;
 964                 out[1] = 0x80 | (code & 0x3F);  code >>= 6;
 965                 out[0] = 0xF0 | code;
 966             }
 967         }
 968         else
 969         {
 970             wxFAIL_MSG( _T("trying to encode undefined Unicode character") );
 971             break;
 972         }
 973
 974         if ( out )
 975         {
 976             out += len;
 977             dstLen -= len;
 978         }
 979
 980         written += len;
 981     }
 982
 983     // we only get here if an error occurs during decoding
 984     return wxCONV_FAILED;
 985 }
 986
 987 size_t wxMBConvUTF8::ToWChar(wchar_t *buf, size_t n,
 988                              const char *psz, size_t srcLen) const
 989 {
 990     if ( m_options == MAP_INVALID_UTF8_NOT )
 991         return wxMBConvStrictUTF8::ToWChar(buf, n, psz, srcLen);
 992
 993     size_t len = 0;
 994
 995     while ((srcLen == wxNO_LEN ? *psz : srcLen--) && ((!buf) || (len < n)))
 996     {
 997         const char *opsz = psz;
 998         bool invalid = false;
 999         unsigned char cc = *psz++, fc = cc;
1000         unsigned cnt;
1001         for (cnt = 0; fc & 0x80; cnt++)
1002             fc <<= 1;
1003
1004         if (!cnt)
1005         {
1006             // plain ASCII char
1007             if (buf)
1008                 *buf++ = cc;
1009             len++;
1010
1011             // escape the escape character for octal escapes
1012             if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
1013                     && cc == '\\' && (!buf || len < n))
1014             {
1015                 if (buf)
1016                     *buf++ = cc;
1017                 len++;
1018             }
1019         }
1020         else
1021         {
1022             cnt--;
1023             if (!cnt)
1024             {
1025                 // invalid UTF-8 sequence
1026                 invalid = true;
1027             }
1028             else
1029             {
1030                 unsigned ocnt = cnt - 1;
1031                 wxUint32 res = cc & (0x3f >> cnt);
1032                 while (cnt--)
1033                 {
1034                     cc = *psz;
1035                     if ((cc & 0xC0) != 0x80)
1036                     {
1037                         // invalid UTF-8 sequence
1038                         invalid = true;
1039                         break;
1040                     }
1041
1042                     psz++;
1043                     res = (res << 6) | (cc & 0x3f);
1044                 }
1045
1046                 if (invalid || res <= utf8_max[ocnt])
1047                 {
1048                     // illegal UTF-8 encoding
1049                     invalid = true;
1050                 }
1051                 else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
1052                         res >= wxUnicodePUA && res < wxUnicodePUAEnd)
1053                 {
1054                     // if one of our PUA characters turns up externally
1055                     // it must also be treated as an illegal sequence
1056                     // (a bit like you have to escape an escape character)
1057                     invalid = true;
1058                 }
1059                 else
1060                 {
1061 #ifdef WC_UTF16
1062                     // cast is ok because wchar_t == wxUint16 if WC_UTF16
1063                     size_t pa = encode_utf16(res, (wxUint16 *)buf);
1064                     if (pa == wxCONV_FAILED)
1065                     {
1066                         invalid = true;
1067                     }
1068                     else
1069                     {
1070                         if (buf)
1071                             buf += pa;
1072                         len += pa;
1073                     }
1074 #else // !WC_UTF16
1075                     if (buf)
1076                         *buf++ = (wchar_t)res;
1077                     len++;
1078 #endif // WC_UTF16/!WC_UTF16
1079                 }
1080             }
1081
1082             if (invalid)
1083             {
1084                 if (m_options & MAP_INVALID_UTF8_TO_PUA)
1085                 {
1086                     while (opsz < psz && (!buf || len < n))
1087                     {
1088 #ifdef WC_UTF16
1089                         // cast is ok because wchar_t == wxUuint16 if WC_UTF16
1090                         size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
1091                         wxASSERT(pa != wxCONV_FAILED);
1092                         if (buf)
1093                             buf += pa;
1094                         opsz++;
1095                         len += pa;
1096 #else
1097                         if (buf)
1098                             *buf++ = (wchar_t)(wxUnicodePUA + (unsigned char)*opsz);
1099                         opsz++;
1100                         len++;
1101 #endif
1102                     }
1103                 }
1104                 else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
1105                 {
1106                     while (opsz < psz && (!buf || len < n))
1107                     {
1108                         if ( buf && len + 3 < n )
1109                         {
1110                             unsigned char on = *opsz;
1111                             *buf++ = L'\\';
1112                             *buf++ = (wchar_t)( L'0' + on / 0100 );
1113                             *buf++ = (wchar_t)( L'0' + (on % 0100) / 010 );
1114                             *buf++ = (wchar_t)( L'0' + on % 010 );
1115                         }
1116
1117                         opsz++;
1118                         len += 4;
1119                     }
1120                 }
1121                 else // MAP_INVALID_UTF8_NOT
1122                 {
1123                     return wxCONV_FAILED;
1124                 }
1125             }
1126         }
1127     }
1128
1129     if (srcLen == wxNO_LEN && buf && (len < n))
1130         *buf = 0;
1131
1132     return len + 1;
1133 }
1134
// Check whether the given wide character is one of the octal digits, i.e. is
// in the range from L'0' to L'7' inclusive.
static inline bool isoctal(wchar_t wch)
{
    if ( wch < L'0' )
        return false;

    return !(wch > L'7');
}
1139
1140 size_t wxMBConvUTF8::FromWChar(char *buf, size_t n,
1141                                const wchar_t *psz, size_t srcLen) const
1142 {
1143     if ( m_options == MAP_INVALID_UTF8_NOT )
1144         return wxMBConvStrictUTF8::FromWChar(buf, n, psz, srcLen);
1145
1146     size_t len = 0;
1147
1148     while ((srcLen == wxNO_LEN ? *psz : srcLen--) && ((!buf) || (len < n)))
1149     {
1150         wxUint32 cc;
1151
1152 #ifdef WC_UTF16
1153         // cast is ok for WC_UTF16
1154         size_t pa = decode_utf16((const wxUint16 *)psz, cc);
1155         psz += (pa == wxCONV_FAILED) ? 1 : pa;
1156 #else
1157         cc = (*psz++) & 0x7fffffff;
1158 #endif
1159
1160         if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
1161                 && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
1162         {
1163             if (buf)
1164                 *buf++ = (char)(cc - wxUnicodePUA);
1165             len++;
1166         }
1167         else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL)
1168                     && cc == L'\\' && psz[0] == L'\\' )
1169         {
1170             if (buf)
1171                 *buf++ = (char)cc;
1172             psz++;
1173             len++;
1174         }
1175         else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
1176                     cc == L'\\' &&
1177                         isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
1178         {
1179             if (buf)
1180             {
1181                 *buf++ = (char) ((psz[0] - L'0') * 0100 +
1182                                  (psz[1] - L'0') * 010 +
1183                                  (psz[2] - L'0'));
1184             }
1185
1186             psz += 3;
1187             len++;
1188         }
1189         else
1190         {
1191             unsigned cnt;
1192             for (cnt = 0; cc > utf8_max[cnt]; cnt++)
1193             {
1194             }
1195
1196             if (!cnt)
1197             {
1198                 // plain ASCII char
1199                 if (buf)
1200                     *buf++ = (char) cc;
1201                 len++;
1202             }
1203             else
1204             {
1205                 len += cnt + 1;
1206                 if (buf)
1207                 {
1208                     *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
1209                     while (cnt--)
1210                         *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
1211                 }
1212             }
1213         }
1214     }
1215
1216     if (srcLen == wxNO_LEN && buf && (len < n))
1217         *buf = 0;
1218
1219     return len + 1;
1220 }
1221
1222 // ============================================================================
1223 // UTF-16
1224 // ============================================================================
1225
1226 #ifdef WORDS_BIGENDIAN
1227     #define wxMBConvUTF16straight wxMBConvUTF16BE
1228     #define wxMBConvUTF16swap     wxMBConvUTF16LE
1229 #else
1230     #define wxMBConvUTF16swap     wxMBConvUTF16BE
1231     #define wxMBConvUTF16straight wxMBConvUTF16LE
1232 #endif
1233
1234 /* static */
1235 size_t wxMBConvUTF16Base::GetLength(const char *src, size_t srcLen)
1236 {
1237     if ( srcLen == wxNO_LEN )
1238     {
1239         // count the number of bytes in input, including the trailing NULs
1240         const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1241         for ( srcLen = 1; *inBuff++; srcLen++ )
1242             ;
1243
1244         srcLen *= BYTES_PER_CHAR;
1245     }
1246     else // we already have the length
1247     {
1248         // we can only convert an entire number of UTF-16 characters
1249         if ( srcLen % BYTES_PER_CHAR )
1250             return wxCONV_FAILED;
1251     }
1252
1253     return srcLen;
1254 }
1255
1256 // case when in-memory representation is UTF-16 too
1257 #ifdef WC_UTF16
1258
1259 // ----------------------------------------------------------------------------
1260 // conversions without endianness change
1261 // ----------------------------------------------------------------------------
1262
1263 size_t
1264 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1265                                const char *src, size_t srcLen) const
1266 {
1267     // set up the scene for using memcpy() (which is presumably more efficient
1268     // than copying the bytes one by one)
1269     srcLen = GetLength(src, srcLen);
1270     if ( srcLen == wxNO_LEN )
1271         return wxCONV_FAILED;
1272
1273     const size_t inLen = srcLen / BYTES_PER_CHAR;
1274     if ( dst )
1275     {
1276         if ( dstLen < inLen )
1277             return wxCONV_FAILED;
1278
1279         memcpy(dst, src, srcLen);
1280     }
1281
1282     return inLen;
1283 }
1284
1285 size_t
1286 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1287                                  const wchar_t *src, size_t srcLen) const
1288 {
1289     if ( srcLen == wxNO_LEN )
1290         srcLen = wxWcslen(src) + 1;
1291
1292     srcLen *= BYTES_PER_CHAR;
1293
1294     if ( dst )
1295     {
1296         if ( dstLen < srcLen )
1297             return wxCONV_FAILED;
1298
1299         memcpy(dst, src, srcLen);
1300     }
1301
1302     return srcLen;
1303 }
1304
1305 // ----------------------------------------------------------------------------
1306 // endian-reversing conversions
1307 // ----------------------------------------------------------------------------
1308
1309 size_t
1310 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1311                            const char *src, size_t srcLen) const
1312 {
1313     srcLen = GetLength(src, srcLen);
1314     if ( srcLen == wxNO_LEN )
1315         return wxCONV_FAILED;
1316
1317     srcLen /= BYTES_PER_CHAR;
1318
1319     if ( dst )
1320     {
1321         if ( dstLen < srcLen )
1322             return wxCONV_FAILED;
1323
1324         const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1325         for ( size_t n = 0; n < srcLen; n++, inBuff++ )
1326         {
1327             *dst++ = wxUINT16_SWAP_ALWAYS(*inBuff);
1328         }
1329     }
1330
1331     return srcLen;
1332 }
1333
1334 size_t
1335 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1336                              const wchar_t *src, size_t srcLen) const
1337 {
1338     if ( srcLen == wxNO_LEN )
1339         srcLen = wxWcslen(src) + 1;
1340
1341     srcLen *= BYTES_PER_CHAR;
1342
1343     if ( dst )
1344     {
1345         if ( dstLen < srcLen )
1346             return wxCONV_FAILED;
1347
1348         wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1349         for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
1350         {
1351             *outBuff++ = wxUINT16_SWAP_ALWAYS(*src);
1352         }
1353     }
1354
1355     return srcLen;
1356 }
1357
1358 #else // !WC_UTF16: wchar_t is UTF-32
1359
1360 // ----------------------------------------------------------------------------
1361 // conversions without endianness change
1362 // ----------------------------------------------------------------------------
1363
1364 size_t
1365 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1366                                const char *src, size_t srcLen) const
1367 {
1368     srcLen = GetLength(src, srcLen);
1369     if ( srcLen == wxNO_LEN )
1370         return wxCONV_FAILED;
1371
1372     const size_t inLen = srcLen / BYTES_PER_CHAR;
1373     if ( !dst )
1374     {
1375         // optimization: return maximal space which could be needed for this
1376         // string even if the real size could be smaller if the buffer contains
1377         // any surrogates
1378         return inLen;
1379     }
1380
1381     size_t outLen = 0;
1382     const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1383     for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1384     {
1385         const wxUint32 ch = wxDecodeSurrogate(&inBuff);
1386         if ( !inBuff )
1387             return wxCONV_FAILED;
1388
1389         if ( ++outLen > dstLen )
1390             return wxCONV_FAILED;
1391
1392         *dst++ = ch;
1393     }
1394
1395
1396     return outLen;
1397 }
1398
1399 size_t
1400 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1401                                  const wchar_t *src, size_t srcLen) const
1402 {
1403     if ( srcLen == wxNO_LEN )
1404         srcLen = wxWcslen(src) + 1;
1405
1406     size_t outLen = 0;
1407     wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1408     for ( size_t n = 0; n < srcLen; n++ )
1409     {
1410         wxUint16 cc[2];
1411         const size_t numChars = encode_utf16(*src++, cc);
1412         if ( numChars == wxCONV_FAILED )
1413             return wxCONV_FAILED;
1414
1415         outLen += numChars * BYTES_PER_CHAR;
1416         if ( outBuff )
1417         {
1418             if ( outLen > dstLen )
1419                 return wxCONV_FAILED;
1420
1421             *outBuff++ = cc[0];
1422             if ( numChars == 2 )
1423             {
1424                 // second character of a surrogate
1425                 *outBuff++ = cc[1];
1426             }
1427         }
1428     }
1429
1430     return outLen;
1431 }
1432
1433 // ----------------------------------------------------------------------------
1434 // endian-reversing conversions
1435 // ----------------------------------------------------------------------------
1436
1437 size_t
1438 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1439                            const char *src, size_t srcLen) const
1440 {
1441     srcLen = GetLength(src, srcLen);
1442     if ( srcLen == wxNO_LEN )
1443         return wxCONV_FAILED;
1444
1445     const size_t inLen = srcLen / BYTES_PER_CHAR;
1446     if ( !dst )
1447     {
1448         // optimization: return maximal space which could be needed for this
1449         // string even if the real size could be smaller if the buffer contains
1450         // any surrogates
1451         return inLen;
1452     }
1453
1454     size_t outLen = 0;
1455     const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1456     for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1457     {
1458         wxUint32 ch;
1459         wxUint16 tmp[2];
1460
1461         tmp[0] = wxUINT16_SWAP_ALWAYS(*inBuff);
1462         inBuff++;
1463         tmp[1] = wxUINT16_SWAP_ALWAYS(*inBuff);
1464
1465         const size_t numChars = decode_utf16(tmp, ch);
1466         if ( numChars == wxCONV_FAILED )
1467             return wxCONV_FAILED;
1468
1469         if ( numChars == 2 )
1470             inBuff++;
1471
1472         if ( ++outLen > dstLen )
1473             return wxCONV_FAILED;
1474
1475         *dst++ = ch;
1476     }
1477
1478
1479     return outLen;
1480 }
1481
1482 size_t
1483 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1484                              const wchar_t *src, size_t srcLen) const
1485 {
1486     if ( srcLen == wxNO_LEN )
1487         srcLen = wxWcslen(src) + 1;
1488
1489     size_t outLen = 0;
1490     wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1491     for ( const wchar_t *srcEnd = src + srcLen; src < srcEnd; src++ )
1492     {
1493         wxUint16 cc[2];
1494         const size_t numChars = encode_utf16(*src, cc);
1495         if ( numChars == wxCONV_FAILED )
1496             return wxCONV_FAILED;
1497
1498         outLen += numChars * BYTES_PER_CHAR;
1499         if ( outBuff )
1500         {
1501             if ( outLen > dstLen )
1502                 return wxCONV_FAILED;
1503
1504             *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[0]);
1505             if ( numChars == 2 )
1506             {
1507                 // second character of a surrogate
1508                 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[1]);
1509             }
1510         }
1511     }
1512
1513     return outLen;
1514 }
1515
1516 #endif // WC_UTF16/!WC_UTF16
1517
1518
1519 // ============================================================================
1520 // UTF-32
1521 // ============================================================================
1522
1523 #ifdef WORDS_BIGENDIAN
1524     #define wxMBConvUTF32straight  wxMBConvUTF32BE
1525     #define wxMBConvUTF32swap      wxMBConvUTF32LE
1526 #else
1527     #define wxMBConvUTF32swap      wxMBConvUTF32BE
1528     #define wxMBConvUTF32straight  wxMBConvUTF32LE
1529 #endif
1530
1531
1532 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
1533 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
1534
1535 /* static */
1536 size_t wxMBConvUTF32Base::GetLength(const char *src, size_t srcLen)
1537 {
1538     if ( srcLen == wxNO_LEN )
1539     {
1540         // count the number of bytes in input, including the trailing NULs
1541         const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1542         for ( srcLen = 1; *inBuff++; srcLen++ )
1543             ;
1544
1545         srcLen *= BYTES_PER_CHAR;
1546     }
1547     else // we already have the length
1548     {
1549         // we can only convert an entire number of UTF-32 characters
1550         if ( srcLen % BYTES_PER_CHAR )
1551             return wxCONV_FAILED;
1552     }
1553
1554     return srcLen;
1555 }
1556
1557 // case when in-memory representation is UTF-16
1558 #ifdef WC_UTF16
1559
1560 // ----------------------------------------------------------------------------
1561 // conversions without endianness change
1562 // ----------------------------------------------------------------------------
1563
1564 size_t
1565 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1566                                const char *src, size_t srcLen) const
1567 {
1568     srcLen = GetLength(src, srcLen);
1569     if ( srcLen == wxNO_LEN )
1570         return wxCONV_FAILED;
1571
1572     const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1573     const size_t inLen = srcLen / BYTES_PER_CHAR;
1574     size_t outLen = 0;
1575     for ( size_t n = 0; n < inLen; n++ )
1576     {
1577         wxUint16 cc[2];
1578         const size_t numChars = encode_utf16(*inBuff++, cc);
1579         if ( numChars == wxCONV_FAILED )
1580             return wxCONV_FAILED;
1581
1582         outLen += numChars;
1583         if ( dst )
1584         {
1585             if ( outLen > dstLen )
1586                 return wxCONV_FAILED;
1587
1588             *dst++ = cc[0];
1589             if ( numChars == 2 )
1590             {
1591                 // second character of a surrogate
1592                 *dst++ = cc[1];
1593             }
1594         }
1595     }
1596
1597     return outLen;
1598 }
1599
1600 size_t
1601 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1602                                  const wchar_t *src, size_t srcLen) const
1603 {
1604     if ( srcLen == wxNO_LEN )
1605         srcLen = wxWcslen(src) + 1;
1606
1607     if ( !dst )
1608     {
1609         // optimization: return maximal space which could be needed for this
1610         // string instead of the exact amount which could be less if there are
1611         // any surrogates in the input
1612         //
1613         // we consider that surrogates are rare enough to make it worthwhile to
1614         // avoid running the loop below at the cost of slightly extra memory
1615         // consumption
1616         return srcLen * BYTES_PER_CHAR;
1617     }
1618
1619     wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1620     size_t outLen = 0;
1621     for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1622     {
1623         const wxUint32 ch = wxDecodeSurrogate(&src);
1624         if ( !src )
1625             return wxCONV_FAILED;
1626
1627         outLen += BYTES_PER_CHAR;
1628
1629         if ( outLen > dstLen )
1630             return wxCONV_FAILED;
1631
1632         *outBuff++ = ch;
1633     }
1634
1635     return outLen;
1636 }
1637
1638 // ----------------------------------------------------------------------------
1639 // endian-reversing conversions
1640 // ----------------------------------------------------------------------------
1641
1642 size_t
1643 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1644                            const char *src, size_t srcLen) const
1645 {
1646     srcLen = GetLength(src, srcLen);
1647     if ( srcLen == wxNO_LEN )
1648         return wxCONV_FAILED;
1649
1650     const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1651     const size_t inLen = srcLen / BYTES_PER_CHAR;
1652     size_t outLen = 0;
1653     for ( size_t n = 0; n < inLen; n++, inBuff++ )
1654     {
1655         wxUint16 cc[2];
1656         const size_t numChars = encode_utf16(wxUINT32_SWAP_ALWAYS(*inBuff), cc);
1657         if ( numChars == wxCONV_FAILED )
1658             return wxCONV_FAILED;
1659
1660         outLen += numChars;
1661         if ( dst )
1662         {
1663             if ( outLen > dstLen )
1664                 return wxCONV_FAILED;
1665
1666             *dst++ = cc[0];
1667             if ( numChars == 2 )
1668             {
1669                 // second character of a surrogate
1670                 *dst++ = cc[1];
1671             }
1672         }
1673     }
1674
1675     return outLen;
1676 }
1677
1678 size_t
1679 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1680                              const wchar_t *src, size_t srcLen) const
1681 {
1682     if ( srcLen == wxNO_LEN )
1683         srcLen = wxWcslen(src) + 1;
1684
1685     if ( !dst )
1686     {
1687         // optimization: return maximal space which could be needed for this
1688         // string instead of the exact amount which could be less if there are
1689         // any surrogates in the input
1690         //
1691         // we consider that surrogates are rare enough to make it worthwhile to
1692         // avoid running the loop below at the cost of slightly extra memory
1693         // consumption
1694         return srcLen*BYTES_PER_CHAR;
1695     }
1696
1697     wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1698     size_t outLen = 0;
1699     for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1700     {
1701         const wxUint32 ch = wxDecodeSurrogate(&src);
1702         if ( !src )
1703             return wxCONV_FAILED;
1704
1705         outLen += BYTES_PER_CHAR;
1706
1707         if ( outLen > dstLen )
1708             return wxCONV_FAILED;
1709
1710         *outBuff++ = wxUINT32_SWAP_ALWAYS(ch);
1711     }
1712
1713     return outLen;
1714 }
1715
1716 #else // !WC_UTF16: wchar_t is UTF-32
1717
1718 // ----------------------------------------------------------------------------
1719 // conversions without endianness change
1720 // ----------------------------------------------------------------------------
1721
1722 size_t
1723 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1724                                const char *src, size_t srcLen) const
1725 {
1726     // use memcpy() as it should be much faster than hand-written loop
1727     srcLen = GetLength(src, srcLen);
1728     if ( srcLen == wxNO_LEN )
1729         return wxCONV_FAILED;
1730
1731     const size_t inLen = srcLen/BYTES_PER_CHAR;
1732     if ( dst )
1733     {
1734         if ( dstLen < inLen )
1735             return wxCONV_FAILED;
1736
1737         memcpy(dst, src, srcLen);
1738     }
1739
1740     return inLen;
1741 }
1742
1743 size_t
1744 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1745                                  const wchar_t *src, size_t srcLen) const
1746 {
1747     if ( srcLen == wxNO_LEN )
1748         srcLen = wxWcslen(src) + 1;
1749
1750     srcLen *= BYTES_PER_CHAR;
1751
1752     if ( dst )
1753     {
1754         if ( dstLen < srcLen )
1755             return wxCONV_FAILED;
1756
1757         memcpy(dst, src, srcLen);
1758     }
1759
1760     return srcLen;
1761 }
1762
1763 // ----------------------------------------------------------------------------
1764 // endian-reversing conversions
1765 // ----------------------------------------------------------------------------
1766
1767 size_t
1768 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1769                            const char *src, size_t srcLen) const
1770 {
1771     srcLen = GetLength(src, srcLen);
1772     if ( srcLen == wxNO_LEN )
1773         return wxCONV_FAILED;
1774
1775     srcLen /= BYTES_PER_CHAR;
1776
1777     if ( dst )
1778     {
1779         if ( dstLen < srcLen )
1780             return wxCONV_FAILED;
1781
1782         const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1783         for ( size_t n = 0; n < srcLen; n++, inBuff++ )
1784         {
1785             *dst++ = wxUINT32_SWAP_ALWAYS(*inBuff);
1786         }
1787     }
1788
1789     return srcLen;
1790 }
1791
1792 size_t
1793 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1794                              const wchar_t *src, size_t srcLen) const
1795 {
1796     if ( srcLen == wxNO_LEN )
1797         srcLen = wxWcslen(src) + 1;
1798
1799     srcLen *= BYTES_PER_CHAR;
1800
1801     if ( dst )
1802     {
1803         if ( dstLen < srcLen )
1804             return wxCONV_FAILED;
1805
1806         wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1807         for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
1808         {
1809             *outBuff++ = wxUINT32_SWAP_ALWAYS(*src);
1810         }
1811     }
1812
1813     return srcLen;
1814 }
1815
1816 #endif // WC_UTF16/!WC_UTF16
1817
1818
1819 // ============================================================================
1820 // The classes doing conversion using the iconv_xxx() functions
1821 // ============================================================================
1822
1823 #ifdef HAVE_ICONV
1824
1825 // VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
1826 //     E2BIG if output buffer is _exactly_ as big as needed. Such case is
1827 //     (unless there's yet another bug in glibc) the only case when iconv()
1828 //     returns with (size_t)-1 (which means error) and says there are 0 bytes
1829 //     left in the input buffer -- when _real_ error occurs,
1830 //     bytes-left-in-input buffer is non-zero. Hence, this alternative test for
1831 //     iconv() failure.
1832 //     [This bug does not appear in glibc 2.2.]
1833 #if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
1834 #define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
1835                                      (errno != E2BIG || bufLeft != 0))
1836 #else
1837 #define ICONV_FAILED(cres, bufLeft)  (cres == (size_t)-1)
1838 #endif
1839
1840 #define ICONV_CHAR_CAST(x)  ((ICONV_CONST char **)(x))
1841
1842 #define ICONV_T_INVALID ((iconv_t)-1)
1843
1844 #if SIZEOF_WCHAR_T == 4
1845     #define WC_BSWAP    wxUINT32_SWAP_ALWAYS
1846     #define WC_ENC      wxFONTENCODING_UTF32
1847 #elif SIZEOF_WCHAR_T == 2
1848     #define WC_BSWAP    wxUINT16_SWAP_ALWAYS
1849     #define WC_ENC      wxFONTENCODING_UTF16
1850 #else // sizeof(wchar_t) != 2 nor 4
1851     // does this ever happen?
1852     #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
1853 #endif
1854
1855 // ----------------------------------------------------------------------------
1856 // wxMBConv_iconv: encapsulates an iconv character set
1857 // ----------------------------------------------------------------------------
1858
1859 class wxMBConv_iconv : public wxMBConv
1860 {
1861 public:
1862     wxMBConv_iconv(const char *name);
1863     virtual ~wxMBConv_iconv();
1864
1865     // implement base class virtual methods
1866     virtual size_t ToWChar(wchar_t *dst, size_t dstLen,
1867                            const char *src, size_t srcLen = wxNO_LEN) const;
1868     virtual size_t FromWChar(char *dst, size_t dstLen,
1869                              const wchar_t *src, size_t srcLen = wxNO_LEN) const;
1870     virtual size_t GetMBNulLen() const;
1871
1872 #if wxUSE_UNICODE_UTF8
1873     virtual bool IsUTF8() const;
1874 #endif
1875
1876     virtual wxMBConv *Clone() const
1877     {
1878         wxMBConv_iconv *p = new wxMBConv_iconv(m_name.ToAscii());
1879         p->m_minMBCharWidth = m_minMBCharWidth;
1880         return p;
1881     }
1882
1883     bool IsOk() const
1884         { return (m2w != ICONV_T_INVALID) && (w2m != ICONV_T_INVALID); }
1885
1886 protected:
1887     // the iconv handlers used to translate from multibyte
1888     // to wide char and in the other direction
1889     iconv_t m2w,
1890             w2m;
1891
1892 #if wxUSE_THREADS
1893     // guards access to m2w and w2m objects
1894     wxMutex m_iconvMutex;
1895 #endif
1896
1897 private:
1898     // the name (for iconv_open()) of a wide char charset -- if none is
1899     // available on this machine, it will remain NULL
1900     static wxString ms_wcCharsetName;
1901
1902     // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
1903     // different endian-ness than the native one
1904     static bool ms_wcNeedsSwap;
1905
1906
1907     // name of the encoding handled by this conversion
1908     wxString m_name;
1909
1910     // cached result of GetMBNulLen(); set to 0 meaning "unknown"
1911     // initially
1912     size_t m_minMBCharWidth;
1913 };
1914
1915 // make the constructor available for unit testing
1916 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_iconv( const char* name )
1917 {
1918     wxMBConv_iconv* result = new wxMBConv_iconv( name );
1919     if ( !result->IsOk() )
1920     {
1921         delete result;
1922         return 0;
1923     }
1924
1925     return result;
1926 }
1927
1928 wxString wxMBConv_iconv::ms_wcCharsetName;
1929 bool wxMBConv_iconv::ms_wcNeedsSwap = false;
1930
1931 wxMBConv_iconv::wxMBConv_iconv(const char *name)
1932               : m_name(name)
1933 {
1934     m_minMBCharWidth = 0;
1935
1936     // check for charset that represents wchar_t:
1937     if ( ms_wcCharsetName.empty() )
1938     {
1939         wxLogTrace(TRACE_STRCONV, _T("Looking for wide char codeset:"));
1940
1941 #if wxUSE_FONTMAP
1942         const wxChar **names = wxFontMapperBase::GetAllEncodingNames(WC_ENC);
1943 #else // !wxUSE_FONTMAP
1944         static const wxChar *names_static[] =
1945         {
1946 #if SIZEOF_WCHAR_T == 4
1947             _T("UCS-4"),
1948 #elif SIZEOF_WCHAR_T = 2
1949             _T("UCS-2"),
1950 #endif
1951             NULL
1952         };
1953         const wxChar **names = names_static;
1954 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
1955
1956         for ( ; *names && ms_wcCharsetName.empty(); ++names )
1957         {
1958             const wxString nameCS(*names);
1959
1960             // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
1961             wxString nameXE(nameCS);
1962
1963 #ifdef WORDS_BIGENDIAN
1964                 nameXE += _T("BE");
1965 #else // little endian
1966                 nameXE += _T("LE");
1967 #endif
1968
1969             wxLogTrace(TRACE_STRCONV, _T("  trying charset \"%s\""),
1970                        nameXE.c_str());
1971
1972             m2w = iconv_open(nameXE.ToAscii(), name);
1973             if ( m2w == ICONV_T_INVALID )
1974             {
1975                 // try charset w/o bytesex info (e.g. "UCS4")
1976                 wxLogTrace(TRACE_STRCONV, _T("  trying charset \"%s\""),
1977                            nameCS.c_str());
1978                 m2w = iconv_open(nameCS.ToAscii(), name);
1979
1980                 // and check for bytesex ourselves:
1981                 if ( m2w != ICONV_T_INVALID )
1982                 {
1983                     char    buf[2], *bufPtr;
1984                     wchar_t wbuf[2];
1985                     size_t  insz, outsz;
1986                     size_t  res;
1987
1988                     buf[0] = 'A';
1989                     buf[1] = 0;
1990                     wbuf[0] = 0;
1991                     insz = 2;
1992                     outsz = SIZEOF_WCHAR_T * 2;
1993                     char* wbufPtr = (char*)wbuf;
1994                     bufPtr = buf;
1995
1996                     res = iconv(
1997                         m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
1998                         &wbufPtr, &outsz);
1999
2000                     if (ICONV_FAILED(res, insz))
2001                     {
2002                         wxLogLastError(wxT("iconv"));
2003                         wxLogError(_("Conversion to charset '%s' doesn't work."),
2004                                    nameCS.c_str());
2005                     }
2006                     else // ok, can convert to this encoding, remember it
2007                     {
2008                         ms_wcCharsetName = nameCS;
2009                         ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
2010                     }
2011                 }
2012             }
2013             else // use charset not requiring byte swapping
2014             {
2015                 ms_wcCharsetName = nameXE;
2016             }
2017         }
2018
2019         wxLogTrace(TRACE_STRCONV,
2020                    wxT("iconv wchar_t charset is \"%s\"%s"),
2021                    ms_wcCharsetName.empty() ? wxString("<none>")
2022                                             : ms_wcCharsetName,
2023                    ms_wcNeedsSwap ? _T(" (needs swap)")
2024                                   : _T(""));
2025     }
2026     else // we already have ms_wcCharsetName
2027     {
2028         m2w = iconv_open(ms_wcCharsetName.ToAscii(), name);
2029     }
2030
2031     if ( ms_wcCharsetName.empty() )
2032     {
2033         w2m = ICONV_T_INVALID;
2034     }
2035     else
2036     {
2037         w2m = iconv_open(name, ms_wcCharsetName.ToAscii());
2038         if ( w2m == ICONV_T_INVALID )
2039         {
2040             wxLogTrace(TRACE_STRCONV,
2041                        wxT("\"%s\" -> \"%s\" works but not the converse!?"),
2042                        ms_wcCharsetName.c_str(), name);
2043         }
2044     }
2045 }
2046
2047 wxMBConv_iconv::~wxMBConv_iconv()
2048 {
2049     if ( m2w != ICONV_T_INVALID )
2050         iconv_close(m2w);
2051     if ( w2m != ICONV_T_INVALID )
2052         iconv_close(w2m);
2053 }
2054
2055 size_t
2056 wxMBConv_iconv::ToWChar(wchar_t *dst, size_t dstLen,
2057                         const char *src, size_t srcLen) const
2058 {
2059     if ( srcLen == wxNO_LEN )
2060     {
2061         // find the string length: notice that must be done differently for
2062         // NUL-terminated strings and UTF-16/32 which are terminated with 2/4
2063         // consecutive NULs
2064         const size_t nulLen = GetMBNulLen();
2065         switch ( nulLen )
2066         {
2067             default:
2068                 return wxCONV_FAILED;
2069
2070             case 1:
2071                 srcLen = strlen(src); // arguably more optimized than our version
2072                 break;
2073
2074             case 2:
2075             case 4:
2076                 // for UTF-16/32 not only we need to have 2/4 consecutive NULs
2077                 // but they also have to start at character boundary and not
2078                 // span two adjacent characters
2079                 const char *p;
2080                 for ( p = src; NotAllNULs(p, nulLen); p += nulLen )
2081                     ;
2082                 srcLen = p - src;
2083                 break;
2084         }
2085
2086         // when we're determining the length of the string ourselves we count
2087         // the terminating NUL(s) as part of it and always NUL-terminate the
2088         // output
2089         srcLen += nulLen;
2090     }
2091
2092     // we express length in the number of (wide) characters but iconv always
2093     // counts buffer sizes it in bytes
2094     dstLen *= SIZEOF_WCHAR_T;
2095
2096 #if wxUSE_THREADS
2097     // NB: iconv() is MT-safe, but each thread must use its own iconv_t handle.
2098     //     Unfortunately there are a couple of global wxCSConv objects such as
2099     //     wxConvLocal that are used all over wx code, so we have to make sure
2100     //     the handle is used by at most one thread at the time. Otherwise
2101     //     only a few wx classes would be safe to use from non-main threads
2102     //     as MB<->WC conversion would fail "randomly".
2103     wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
2104 #endif // wxUSE_THREADS
2105
2106     size_t res, cres;
2107     const char *pszPtr = src;
2108
2109     if ( dst )
2110     {
2111         char* bufPtr = (char*)dst;
2112
2113         // have destination buffer, convert there
2114         size_t dstLenOrig = dstLen;
2115         cres = iconv(m2w,
2116                      ICONV_CHAR_CAST(&pszPtr), &srcLen,
2117                      &bufPtr, &dstLen);
2118
2119         // convert the number of bytes converted as returned by iconv to the
2120         // number of (wide) characters converted that we need
2121         res = (dstLenOrig - dstLen) / SIZEOF_WCHAR_T;
2122
2123         if (ms_wcNeedsSwap)
2124         {
2125             // convert to native endianness
2126             for ( unsigned i = 0; i < res; i++ )
2127                 dst[i] = WC_BSWAP(dst[i]);
2128         }
2129     }
2130     else // no destination buffer
2131     {
2132         // convert using temp buffer to calculate the size of the buffer needed
2133         wchar_t tbuf[8];
2134         res = 0;
2135
2136         do
2137         {
2138             char* bufPtr = (char*)tbuf;
2139             dstLen = 8 * SIZEOF_WCHAR_T;
2140
2141             cres = iconv(m2w,
2142                          ICONV_CHAR_CAST(&pszPtr), &srcLen,
2143                          &bufPtr, &dstLen );
2144
2145             res += 8 - (dstLen / SIZEOF_WCHAR_T);
2146         }
2147         while ((cres == (size_t)-1) && (errno == E2BIG));
2148     }
2149
2150     if (ICONV_FAILED(cres, srcLen))
2151     {
2152         //VS: it is ok if iconv fails, hence trace only
2153         wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
2154         return wxCONV_FAILED;
2155     }
2156
2157     return res;
2158 }
2159
2160 size_t wxMBConv_iconv::FromWChar(char *dst, size_t dstLen,
2161                                  const wchar_t *src, size_t srcLen) const
2162 {
2163 #if wxUSE_THREADS
2164     // NB: explained in MB2WC
2165     wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
2166 #endif
2167
2168     if ( srcLen == wxNO_LEN )
2169         srcLen = wxWcslen(src) + 1;
2170
2171     size_t inbuflen = srcLen * SIZEOF_WCHAR_T;
2172     size_t outbuflen = dstLen;
2173     size_t res, cres;
2174
2175     wchar_t *tmpbuf = 0;
2176
2177     if (ms_wcNeedsSwap)
2178     {
2179         // need to copy to temp buffer to switch endianness
2180         // (doing WC_BSWAP twice on the original buffer won't help, as it
2181         //  could be in read-only memory, or be accessed in some other thread)
2182         tmpbuf = (wchar_t *)malloc(inbuflen + SIZEOF_WCHAR_T);
2183         for ( size_t i = 0; i < srcLen; i++ )
2184             tmpbuf[i] = WC_BSWAP(src[i]);
2185
2186         tmpbuf[srcLen] = L'\0';
2187         src = tmpbuf;
2188     }
2189
2190     char* inbuf = (char*)src;
2191     if ( dst )
2192     {
2193         // have destination buffer, convert there
2194         cres = iconv(w2m, ICONV_CHAR_CAST(&inbuf), &inbuflen, &dst, &outbuflen);
2195
2196         res = dstLen - outbuflen;
2197     }
2198     else // no destination buffer
2199     {
2200         // convert using temp buffer to calculate the size of the buffer needed
2201         char tbuf[16];
2202         res = 0;
2203         do
2204         {
2205             dst = tbuf;
2206             outbuflen = 16;
2207
2208             cres = iconv(w2m, ICONV_CHAR_CAST(&inbuf), &inbuflen, &dst, &outbuflen);
2209
2210             res += 16 - outbuflen;
2211         }
2212         while ((cres == (size_t)-1) && (errno == E2BIG));
2213     }
2214
2215     if (ms_wcNeedsSwap)
2216     {
2217         free(tmpbuf);
2218     }
2219
2220     if (ICONV_FAILED(cres, inbuflen))
2221     {
2222         wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
2223         return wxCONV_FAILED;
2224     }
2225
2226     return res;
2227 }
2228
2229 size_t wxMBConv_iconv::GetMBNulLen() const
2230 {
2231     if ( m_minMBCharWidth == 0 )
2232     {
2233         wxMBConv_iconv * const self = wxConstCast(this, wxMBConv_iconv);
2234
2235 #if wxUSE_THREADS
2236         // NB: explained in MB2WC
2237         wxMutexLocker lock(self->m_iconvMutex);
2238 #endif
2239
2240         const wchar_t *wnul = L"";
2241         char buf[8]; // should be enough for NUL in any encoding
2242         size_t inLen = sizeof(wchar_t),
2243                outLen = WXSIZEOF(buf);
2244         char *inBuff = (char *)wnul;
2245         char *outBuff = buf;
2246         if ( iconv(w2m, ICONV_CHAR_CAST(&inBuff), &inLen, &outBuff, &outLen) == (size_t)-1 )
2247         {
2248             self->m_minMBCharWidth = (size_t)-1;
2249         }
2250         else // ok
2251         {
2252             self->m_minMBCharWidth = outBuff - buf;
2253         }
2254     }
2255
2256     return m_minMBCharWidth;
2257 }
2258
#if wxUSE_UNICODE_UTF8
// check, ignoring case, whether the encoding name given to the constructor is
// one of the two spellings of UTF-8 we recognize
bool wxMBConv_iconv::IsUTF8() const
{
    if ( wxStricmp(m_name, "UTF-8") == 0 )
        return true;

    return wxStricmp(m_name, "UTF8") == 0;
}
#endif
2266
2267 #endif // HAVE_ICONV
2268
2269
2270 // ============================================================================
2271 // Win32 conversion classes
2272 // ============================================================================
2273
2274 #ifdef wxHAVE_WIN32_MB2WC
2275
2276 // from utils.cpp
2277 #if wxUSE_FONTMAP
2278 extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const char *charset);
2279 extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
2280 #endif
2281
2282 class wxMBConv_win32 : public wxMBConv
2283 {
2284 public:
2285     wxMBConv_win32()
2286     {
2287         m_CodePage = CP_ACP;
2288         m_minMBCharWidth = 0;
2289     }
2290
2291     wxMBConv_win32(const wxMBConv_win32& conv)
2292         : wxMBConv()
2293     {
2294         m_CodePage = conv.m_CodePage;
2295         m_minMBCharWidth = conv.m_minMBCharWidth;
2296     }
2297
2298 #if wxUSE_FONTMAP
2299     wxMBConv_win32(const char* name)
2300     {
2301         m_CodePage = wxCharsetToCodepage(name);
2302         m_minMBCharWidth = 0;
2303     }
2304
2305     wxMBConv_win32(wxFontEncoding encoding)
2306     {
2307         m_CodePage = wxEncodingToCodepage(encoding);
2308         m_minMBCharWidth = 0;
2309     }
2310 #endif // wxUSE_FONTMAP
2311
2312     virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2313     {
2314         // note that we have to use MB_ERR_INVALID_CHARS flag as it without it
2315         // the behaviour is not compatible with the Unix version (using iconv)
2316         // and break the library itself, e.g. wxTextInputStream::NextChar()
2317         // wouldn't work if reading an incomplete MB char didn't result in an
2318         // error
2319         //
2320         // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
2321         // Win XP or newer and it is not supported for UTF-[78] so we always
2322         // use our own conversions in this case. See
2323         //     http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
2324         //     http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
2325         if ( m_CodePage == CP_UTF8 )
2326         {
2327             return wxMBConvUTF8().MB2WC(buf, psz, n);
2328         }
2329
2330         if ( m_CodePage == CP_UTF7 )
2331         {
2332             return wxMBConvUTF7().MB2WC(buf, psz, n);
2333         }
2334
2335         int flags = 0;
2336         if ( (m_CodePage < 50000 && m_CodePage != CP_SYMBOL) &&
2337                 IsAtLeastWin2kSP4() )
2338         {
2339             flags = MB_ERR_INVALID_CHARS;
2340         }
2341
2342         const size_t len = ::MultiByteToWideChar
2343                              (
2344                                 m_CodePage,     // code page
2345                                 flags,          // flags: fall on error
2346                                 psz,            // input string
2347                                 -1,             // its length (NUL-terminated)
2348                                 buf,            // output string
2349                                 buf ? n : 0     // size of output buffer
2350                              );
2351         if ( !len )
2352         {
2353             // function totally failed
2354             return wxCONV_FAILED;
2355         }
2356
2357         // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
2358         // check if we succeeded, by doing a double trip:
2359         if ( !flags && buf )
2360         {
2361             const size_t mbLen = strlen(psz);
2362             wxCharBuffer mbBuf(mbLen);
2363             if ( ::WideCharToMultiByte
2364                    (
2365                       m_CodePage,
2366                       0,
2367                       buf,
2368                       -1,
2369                       mbBuf.data(),
2370                       mbLen + 1,        // size in bytes, not length
2371                       NULL,
2372                       NULL
2373                    ) == 0 ||
2374                   strcmp(mbBuf, psz) != 0 )
2375             {
2376                 // we didn't obtain the same thing we started from, hence
2377                 // the conversion was lossy and we consider that it failed
2378                 return wxCONV_FAILED;
2379             }
2380         }
2381
2382         // note that it returns count of written chars for buf != NULL and size
2383         // of the needed buffer for buf == NULL so in either case the length of
2384         // the string (which never includes the terminating NUL) is one less
2385         return len - 1;
2386     }
2387
2388     virtual size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
2389     {
2390         /*
2391             we have a problem here: by default, WideCharToMultiByte() may
2392             replace characters unrepresentable in the target code page with bad
2393             quality approximations such as turning "1/2" symbol (U+00BD) into
2394             "1" for the code pages which don't have it and we, obviously, want
2395             to avoid this at any price
2396
2397             the trouble is that this function does it _silently_, i.e. it won't
2398             even tell us whether it did or not... Win98/2000 and higher provide
2399             WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
2400             we have to resort to a round trip, i.e. check that converting back
2401             results in the same string -- this is, of course, expensive but
2402             otherwise we simply can't be sure to not garble the data.
2403          */
2404
2405         // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
2406         // it doesn't work with CJK encodings (which we test for rather roughly
2407         // here...) nor with UTF-7/8 nor, of course, with Windows versions not
2408         // supporting it
2409         BOOL usedDef wxDUMMY_INITIALIZE(false);
2410         BOOL *pUsedDef;
2411         int flags;
2412         if ( CanUseNoBestFit() && m_CodePage < 50000 )
2413         {
2414             // it's our lucky day
2415             flags = WC_NO_BEST_FIT_CHARS;
2416             pUsedDef = &usedDef;
2417         }
2418         else // old system or unsupported encoding
2419         {
2420             flags = 0;
2421             pUsedDef = NULL;
2422         }
2423
2424         const size_t len = ::WideCharToMultiByte
2425                              (
2426                                 m_CodePage,     // code page
2427                                 flags,          // either none or no best fit
2428                                 pwz,            // input string
2429                                 -1,             // it is (wide) NUL-terminated
2430                                 buf,            // output buffer
2431                                 buf ? n : 0,    // and its size
2432                                 NULL,           // default "replacement" char
2433                                 pUsedDef        // [out] was it used?
2434                              );
2435
2436         if ( !len )
2437         {
2438             // function totally failed
2439             return wxCONV_FAILED;
2440         }
2441
2442         // we did something, check if we really succeeded
2443         if ( flags )
2444         {
2445             // check if the conversion failed, i.e. if any replacements
2446             // were done
2447             if ( usedDef )
2448                 return wxCONV_FAILED;
2449         }
2450         else // we must resort to double tripping...
2451         {
2452             // first we need to ensure that we really have the MB data: this is
2453             // not the case if we're called with NULL buffer, in which case we
2454             // need to do the conversion yet again
2455             wxCharBuffer bufDef;
2456             if ( !buf )
2457             {
2458                 bufDef = wxCharBuffer(len);
2459                 buf = bufDef.data();
2460                 if ( !::WideCharToMultiByte(m_CodePage, flags, pwz, -1,
2461                                             buf, len, NULL, NULL) )
2462                     return wxCONV_FAILED;
2463             }
2464
2465             if ( !n )
2466                 n = wcslen(pwz);
2467             wxWCharBuffer wcBuf(n);
2468             if ( MB2WC(wcBuf.data(), buf, n + 1) == wxCONV_FAILED ||
2469                     wcscmp(wcBuf, pwz) != 0 )
2470             {
2471                 // we didn't obtain the same thing we started from, hence
2472                 // the conversion was lossy and we consider that it failed
2473                 return wxCONV_FAILED;
2474             }
2475         }
2476
2477         // see the comment above for the reason of "len - 1"
2478         return len - 1;
2479     }
2480
2481     virtual size_t GetMBNulLen() const
2482     {
2483         if ( m_minMBCharWidth == 0 )
2484         {
2485             int len = ::WideCharToMultiByte
2486                         (
2487                             m_CodePage,     // code page
2488                             0,              // no flags
2489                             L"",            // input string
2490                             1,              // translate just the NUL
2491                             NULL,           // output buffer
2492                             0,              // and its size
2493                             NULL,           // no replacement char
2494                             NULL            // [out] don't care if it was used
2495                         );
2496
2497             wxMBConv_win32 * const self = wxConstCast(this, wxMBConv_win32);
2498             switch ( len )
2499             {
2500                 default:
2501                     wxLogDebug(_T("Unexpected NUL length %d"), len);
2502                     self->m_minMBCharWidth = (size_t)-1;
2503                     break;
2504
2505                 case 0:
2506                     self->m_minMBCharWidth = (size_t)-1;
2507                     break;
2508
2509                 case 1:
2510                 case 2:
2511                 case 4:
2512                     self->m_minMBCharWidth = len;
2513                     break;
2514             }
2515         }
2516
2517         return m_minMBCharWidth;
2518     }
2519
2520     virtual wxMBConv *Clone() const { return new wxMBConv_win32(*this); }
2521
2522     bool IsOk() const { return m_CodePage != -1; }
2523
2524 private:
2525     static bool CanUseNoBestFit()
2526     {
2527         static int s_isWin98Or2k = -1;
2528
2529         if ( s_isWin98Or2k == -1 )
2530         {
2531             int verMaj, verMin;
2532             switch ( wxGetOsVersion(&verMaj, &verMin) )
2533             {
2534                 case wxOS_WINDOWS_9X:
2535                     s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
2536                     break;
2537
2538                 case wxOS_WINDOWS_NT:
2539                     s_isWin98Or2k = verMaj >= 5;
2540                     break;
2541
2542                 default:
2543                     // unknown: be conservative by default
2544                     s_isWin98Or2k = 0;
2545                     break;
2546             }
2547
2548             wxASSERT_MSG( s_isWin98Or2k != -1, _T("should be set above") );
2549         }
2550
2551         return s_isWin98Or2k == 1;
2552     }
2553
2554     static bool IsAtLeastWin2kSP4()
2555     {
2556 #ifdef __WXWINCE__
2557         return false;
2558 #else
2559         static int s_isAtLeastWin2kSP4 = -1;
2560
2561         if ( s_isAtLeastWin2kSP4 == -1 )
2562         {
2563             OSVERSIONINFOEX ver;
2564
2565             memset(&ver, 0, sizeof(ver));
2566             ver.dwOSVersionInfoSize = sizeof(ver);
2567             GetVersionEx((OSVERSIONINFO*)&ver);
2568
2569             s_isAtLeastWin2kSP4 =
2570               ((ver.dwMajorVersion > 5) || // Vista+
2571                (ver.dwMajorVersion == 5 && ver.dwMinorVersion > 0) || // XP/2003
2572                (ver.dwMajorVersion == 5 && ver.dwMinorVersion == 0 &&
2573                ver.wServicePackMajor >= 4)) // 2000 SP4+
2574               ? 1 : 0;
2575         }
2576
2577         return s_isAtLeastWin2kSP4 == 1;
2578 #endif
2579     }
2580
2581
2582     // the code page we're working with
2583     long m_CodePage;
2584
2585     // cached result of GetMBNulLen(), set to 0 initially meaning
2586     // "unknown"
2587     size_t m_minMBCharWidth;
2588 };
2589
2590 #endif // wxHAVE_WIN32_MB2WC
2591
2592
2593 // ============================================================================
2594 // wxEncodingConverter based conversion classes
2595 // ============================================================================
2596
2597 #if wxUSE_FONTMAP
2598
2599 class wxMBConv_wxwin : public wxMBConv
2600 {
2601 private:
2602     void Init()
2603     {
2604         // Refuse to use broken wxEncodingConverter code for Mac-specific encodings.
2605         // The wxMBConv_cf class does a better job.
2606         m_ok = (m_enc < wxFONTENCODING_MACMIN || m_enc > wxFONTENCODING_MACMAX) &&
2607                m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
2608                w2m.Init(wxFONTENCODING_UNICODE, m_enc);
2609     }
2610
2611 public:
2612     // temporarily just use wxEncodingConverter stuff,
2613     // so that it works while a better implementation is built
2614     wxMBConv_wxwin(const char* name)
2615     {
2616         if (name)
2617             m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
2618         else
2619             m_enc = wxFONTENCODING_SYSTEM;
2620
2621         Init();
2622     }
2623
2624     wxMBConv_wxwin(wxFontEncoding enc)
2625     {
2626         m_enc = enc;
2627
2628         Init();
2629     }
2630
2631     size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
2632     {
2633         size_t inbuf = strlen(psz);
2634         if (buf)
2635         {
2636             if (!m2w.Convert(psz, buf))
2637                 return wxCONV_FAILED;
2638         }
2639         return inbuf;
2640     }
2641
2642     size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
2643     {
2644         const size_t inbuf = wxWcslen(psz);
2645         if (buf)
2646         {
2647             if (!w2m.Convert(psz, buf))
2648                 return wxCONV_FAILED;
2649         }
2650
2651         return inbuf;
2652     }
2653
2654     virtual size_t GetMBNulLen() const
2655     {
2656         switch ( m_enc )
2657         {
2658             case wxFONTENCODING_UTF16BE:
2659             case wxFONTENCODING_UTF16LE:
2660                 return 2;
2661
2662             case wxFONTENCODING_UTF32BE:
2663             case wxFONTENCODING_UTF32LE:
2664                 return 4;
2665
2666             default:
2667                 return 1;
2668         }
2669     }
2670
2671     virtual wxMBConv *Clone() const { return new wxMBConv_wxwin(m_enc); }
2672
2673     bool IsOk() const { return m_ok; }
2674
2675 public:
2676     wxFontEncoding m_enc;
2677     wxEncodingConverter m2w, w2m;
2678
2679 private:
2680     // were we initialized successfully?
2681     bool m_ok;
2682
2683     DECLARE_NO_COPY_CLASS(wxMBConv_wxwin)
2684 };
2685
2686 // make the constructors available for unit testing
2687 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_wxwin( const char* name )
2688 {
2689     wxMBConv_wxwin* result = new wxMBConv_wxwin( name );
2690     if ( !result->IsOk() )
2691     {
2692         delete result;
2693         return 0;
2694     }
2695
2696     return result;
2697 }
2698
2699 #endif // wxUSE_FONTMAP
2700
2701 // ============================================================================
2702 // wxCSConv implementation
2703 // ============================================================================
2704
2705 void wxCSConv::Init()
2706 {
2707     m_name = NULL;
2708     m_convReal =  NULL;
2709     m_deferred = true;
2710 }
2711
2712 wxCSConv::wxCSConv(const wxString& charset)
2713 {
2714     Init();
2715
2716     if ( !charset.empty() )
2717     {
2718         SetName(charset.ToAscii());
2719     }
2720
2721 #if wxUSE_FONTMAP
2722     m_encoding = wxFontMapperBase::GetEncodingFromName(charset);
2723 #else
2724     m_encoding = wxFONTENCODING_SYSTEM;
2725 #endif
2726 }
2727
2728 wxCSConv::wxCSConv(wxFontEncoding encoding)
2729 {
2730     if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
2731     {
2732         wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
2733
2734         encoding = wxFONTENCODING_SYSTEM;
2735     }
2736
2737     Init();
2738
2739     m_encoding = encoding;
2740 }
2741
2742 wxCSConv::~wxCSConv()
2743 {
2744     Clear();
2745 }
2746
2747 wxCSConv::wxCSConv(const wxCSConv& conv)
2748         : wxMBConv()
2749 {
2750     Init();
2751
2752     SetName(conv.m_name);
2753     m_encoding = conv.m_encoding;
2754 }
2755
2756 wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
2757 {
2758     Clear();
2759
2760     SetName(conv.m_name);
2761     m_encoding = conv.m_encoding;
2762
2763     return *this;
2764 }
2765
2766 void wxCSConv::Clear()
2767 {
2768     free(m_name);
2769     delete m_convReal;
2770
2771     m_name = NULL;
2772     m_convReal = NULL;
2773 }
2774
2775 void wxCSConv::SetName(const char *charset)
2776 {
2777     if (charset)
2778     {
2779         m_name = wxStrdup(charset);
2780         m_deferred = true;
2781     }
2782 }
2783
#if wxUSE_FONTMAP

// map from the font encoding to the name of the charset that wxMBConv_iconv
// could be created with for it; an empty name records that none was found
WX_DECLARE_HASH_MAP( wxFontEncoding,
                     wxString,
                     wxIntegerHash,
                     wxIntegerEqual,
                     wxEncodingNameCache );

static wxEncodingNameCache gs_nameCache;
#endif
2791
2792 wxMBConv *wxCSConv::DoCreate() const
2793 {
2794 #if wxUSE_FONTMAP
2795     wxLogTrace(TRACE_STRCONV,
2796                wxT("creating conversion for %s"),
2797                (m_name ? m_name
2798                        : (const char*)wxFontMapperBase::GetEncodingName(m_encoding).mb_str()));
2799 #endif // wxUSE_FONTMAP
2800
2801     // check for the special case of ASCII or ISO8859-1 charset: as we have
2802     // special knowledge of it anyhow, we don't need to create a special
2803     // conversion object
2804     if ( m_encoding == wxFONTENCODING_ISO8859_1 ||
2805             m_encoding == wxFONTENCODING_DEFAULT )
2806     {
2807         // don't convert at all
2808         return NULL;
2809     }
2810
2811     // we trust OS to do conversion better than we can so try external
2812     // conversion methods first
2813     //
2814     // the full order is:
2815     //      1. OS conversion (iconv() under Unix or Win32 API)
2816     //      2. hard coded conversions for UTF
2817     //      3. wxEncodingConverter as fall back
2818
2819     // step (1)
2820 #ifdef HAVE_ICONV
2821 #if !wxUSE_FONTMAP
2822     if ( m_name )
2823 #endif // !wxUSE_FONTMAP
2824     {
2825 #if wxUSE_FONTMAP
2826         wxFontEncoding encoding(m_encoding);
2827 #endif
2828
2829         if ( m_name )
2830         {
2831             wxMBConv_iconv *conv = new wxMBConv_iconv(m_name);
2832             if ( conv->IsOk() )
2833                 return conv;
2834
2835             delete conv;
2836
2837 #if wxUSE_FONTMAP
2838             encoding =
2839                 wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
2840 #endif // wxUSE_FONTMAP
2841         }
2842 #if wxUSE_FONTMAP
2843         {
2844             const wxEncodingNameCache::iterator it = gs_nameCache.find(encoding);
2845             if ( it != gs_nameCache.end() )
2846             {
2847                 if ( it->second.empty() )
2848                     return NULL;
2849
2850                 wxMBConv_iconv *conv = new wxMBConv_iconv(it->second.ToAscii());
2851                 if ( conv->IsOk() )
2852                     return conv;
2853
2854                 delete conv;
2855             }
2856
2857             const wxChar** names = wxFontMapperBase::GetAllEncodingNames(encoding);
2858             // CS : in case this does not return valid names (eg for MacRoman)
2859             // encoding got a 'failure' entry in the cache all the same,
2860             // although it just has to be created using a different method, so
2861             // only store failed iconv creation attempts (or perhaps we
2862             // shoulnd't do this at all ?)
2863             if ( names[0] != NULL )
2864             {
2865                 for ( ; *names; ++names )
2866                 {
2867                     // FIXME-UTF8: wxFontMapperBase::GetAllEncodingNames()
2868                     //             will need changes that will obsolete this
2869                     wxString name(*names);
2870                     wxMBConv_iconv *conv = new wxMBConv_iconv(name.ToAscii());
2871                     if ( conv->IsOk() )
2872                     {
2873                         gs_nameCache[encoding] = *names;
2874                         return conv;
2875                     }
2876
2877                     delete conv;
2878                 }
2879
2880                 gs_nameCache[encoding] = _T(""); // cache the failure
2881             }
2882         }
2883 #endif // wxUSE_FONTMAP
2884     }
2885 #endif // HAVE_ICONV
2886
2887 #ifdef wxHAVE_WIN32_MB2WC
2888     {
2889 #if wxUSE_FONTMAP
2890         wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
2891                                       : new wxMBConv_win32(m_encoding);
2892         if ( conv->IsOk() )
2893             return conv;
2894
2895         delete conv;
2896 #else
2897         return NULL;
2898 #endif
2899     }
2900 #endif // wxHAVE_WIN32_MB2WC
2901
2902 #ifdef __DARWIN__
2903     {
2904         // leave UTF16 and UTF32 to the built-ins of wx
2905         if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
2906             ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
2907         {
2908 #if wxUSE_FONTMAP
2909             wxMBConv_cf *conv = m_name ? new wxMBConv_cf(m_name)
2910                                           : new wxMBConv_cf(m_encoding);
2911 #else
2912             wxMBConv_cf *conv = new wxMBConv_cf(m_encoding);
2913 #endif
2914
2915             if ( conv->IsOk() )
2916                  return conv;
2917
2918             delete conv;
2919         }
2920     }
2921 #endif // __DARWIN__
2922
2923     // step (2)
2924     wxFontEncoding enc = m_encoding;
2925 #if wxUSE_FONTMAP
2926     if ( enc == wxFONTENCODING_SYSTEM && m_name )
2927     {
2928         // use "false" to suppress interactive dialogs -- we can be called from
2929         // anywhere and popping up a dialog from here is the last thing we want to
2930         // do
2931         enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
2932     }
2933 #endif // wxUSE_FONTMAP
2934
2935     switch ( enc )
2936     {
2937         case wxFONTENCODING_UTF7:
2938              return new wxMBConvUTF7;
2939
2940         case wxFONTENCODING_UTF8:
2941              return new wxMBConvUTF8;
2942
2943         case wxFONTENCODING_UTF16BE:
2944              return new wxMBConvUTF16BE;
2945
2946         case wxFONTENCODING_UTF16LE:
2947              return new wxMBConvUTF16LE;
2948
2949         case wxFONTENCODING_UTF32BE:
2950              return new wxMBConvUTF32BE;
2951
2952         case wxFONTENCODING_UTF32LE:
2953              return new wxMBConvUTF32LE;
2954
2955         default:
2956              // nothing to do but put here to suppress gcc warnings
2957              break;
2958     }
2959
2960     // step (3)
2961 #if wxUSE_FONTMAP
2962     {
2963         wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
2964                                       : new wxMBConv_wxwin(m_encoding);
2965         if ( conv->IsOk() )
2966             return conv;
2967
2968         delete conv;
2969     }
2970 #endif // wxUSE_FONTMAP
2971
2972     // NB: This is a hack to prevent deadlock. What could otherwise happen
2973     //     in Unicode build: wxConvLocal creation ends up being here
2974     //     because of some failure and logs the error. But wxLog will try to
2975     //     attach a timestamp, for which it will need wxConvLocal (to convert
2976     //     time to char* and then wchar_t*), but that fails, tries to log the
2977     //     error, but wxLog has an (already locked) critical section that
2978     //     guards the static buffer.
2979     static bool alreadyLoggingError = false;
2980     if (!alreadyLoggingError)
2981     {
2982         alreadyLoggingError = true;
2983         wxLogError(_("Cannot convert from the charset '%s'!"),
2984                    m_name ? m_name
2985                       :
2986 #if wxUSE_FONTMAP
2987                          (const char*)wxFontMapperBase::GetEncodingDescription(m_encoding).ToAscii()
2988 #else // !wxUSE_FONTMAP
2989                          (const char*)wxString::Format(_("encoding %i"), m_encoding).ToAscii()
2990 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
2991               );
2992
2993         alreadyLoggingError = false;
2994     }
2995
2996     return NULL;
2997 }
2998
2999 void wxCSConv::CreateConvIfNeeded() const
3000 {
3001     if ( m_deferred )
3002     {
3003         wxCSConv *self = (wxCSConv *)this; // const_cast
3004
3005         // if we don't have neither the name nor the encoding, use the default
3006         // encoding for this system
3007         if ( !m_name && m_encoding == wxFONTENCODING_SYSTEM )
3008         {
3009 #if wxUSE_INTL
3010             self->m_encoding = wxLocale::GetSystemEncoding();
3011 #else
3012             // fallback to some reasonable default:
3013             self->m_encoding = wxFONTENCODING_ISO8859_1;
3014 #endif // wxUSE_INTL
3015         }
3016
3017         self->m_convReal = DoCreate();
3018         self->m_deferred = false;
3019     }
3020 }
3021
3022 bool wxCSConv::IsOk() const
3023 {
3024     CreateConvIfNeeded();
3025
3026     // special case: no convReal created for wxFONTENCODING_ISO8859_1
3027     if ( m_encoding == wxFONTENCODING_ISO8859_1 )
3028         return true; // always ok as we do it ourselves
3029
3030     // m_convReal->IsOk() is called at its own creation, so we know it must
3031     // be ok if m_convReal is non-NULL
3032     return m_convReal != NULL;
3033 }
3034
3035 size_t wxCSConv::ToWChar(wchar_t *dst, size_t dstLen,
3036                          const char *src, size_t srcLen) const
3037 {
3038     CreateConvIfNeeded();
3039
3040     if (m_convReal)
3041         return m_convReal->ToWChar(dst, dstLen, src, srcLen);
3042
3043     // latin-1 (direct)
3044     return wxMBConv::ToWChar(dst, dstLen, src, srcLen);
3045 }
3046
3047 size_t wxCSConv::FromWChar(char *dst, size_t dstLen,
3048                            const wchar_t *src, size_t srcLen) const
3049 {
3050     CreateConvIfNeeded();
3051
3052     if (m_convReal)
3053         return m_convReal->FromWChar(dst, dstLen, src, srcLen);
3054
3055     // latin-1 (direct)
3056     return wxMBConv::FromWChar(dst, dstLen, src, srcLen);
3057 }
3058
3059 size_t wxCSConv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
3060 {
3061     CreateConvIfNeeded();
3062
3063     if (m_convReal)
3064         return m_convReal->MB2WC(buf, psz, n);
3065
3066     // latin-1 (direct)
3067     size_t len = strlen(psz);
3068
3069     if (buf)
3070     {
3071         for (size_t c = 0; c <= len; c++)
3072             buf[c] = (unsigned char)(psz[c]);
3073     }
3074
3075     return len;
3076 }
3077
3078 size_t wxCSConv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
3079 {
3080     CreateConvIfNeeded();
3081
3082     if (m_convReal)
3083         return m_convReal->WC2MB(buf, psz, n);
3084
3085     // latin-1 (direct)
3086     const size_t len = wxWcslen(psz);
3087     if (buf)
3088     {
3089         for (size_t c = 0; c <= len; c++)
3090         {
3091             if (psz[c] > 0xFF)
3092                 return wxCONV_FAILED;
3093
3094             buf[c] = (char)psz[c];
3095         }
3096     }
3097     else
3098     {
3099         for (size_t c = 0; c <= len; c++)
3100         {
3101             if (psz[c] > 0xFF)
3102                 return wxCONV_FAILED;
3103         }
3104     }
3105
3106     return len;
3107 }
3108
3109 size_t wxCSConv::GetMBNulLen() const
3110 {
3111     CreateConvIfNeeded();
3112
3113     if ( m_convReal )
3114     {
3115         return m_convReal->GetMBNulLen();
3116     }
3117
3118     // otherwise, we are ISO-8859-1
3119     return 1;
3120 }
3121
3122 #if wxUSE_UNICODE_UTF8
3123 bool wxCSConv::IsUTF8() const
3124 {
3125     CreateConvIfNeeded();
3126
3127     if ( m_convReal )
3128     {
3129         return m_convReal->IsUTF8();
3130     }
3131
3132     // otherwise, we are ISO-8859-1
3133     return false;
3134 }
3135 #endif
3136
3137
3138 #if wxUSE_UNICODE
3139
3140 wxWCharBuffer wxSafeConvertMB2WX(const char *s)
3141 {
3142     if ( !s )
3143         return wxWCharBuffer();
3144
3145     wxWCharBuffer wbuf(wxConvLibc.cMB2WX(s));
3146     if ( !wbuf )
3147         wbuf = wxMBConvUTF8().cMB2WX(s);
3148     if ( !wbuf )
3149         wbuf = wxConvISO8859_1.cMB2WX(s);
3150
3151     return wbuf;
3152 }
3153
3154 wxCharBuffer wxSafeConvertWX2MB(const wchar_t *ws)
3155 {
3156     if ( !ws )
3157         return wxCharBuffer();
3158
3159     wxCharBuffer buf(wxConvLibc.cWX2MB(ws));
3160     if ( !buf )
3161         buf = wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL).cWX2MB(ws);
3162
3163     return buf;
3164 }
3165
3166 #endif // wxUSE_UNICODE
3167
3168 // ----------------------------------------------------------------------------
3169 // globals
3170 // ----------------------------------------------------------------------------
3171
3172 // NB: The reason why we create converter objects in this convoluted way,
3173 //     using a factory function instead of a global variable, is that they
3174 //     may be used at static initialization time (some of them are used by
3175 //     wxString ctors and there may be a global wxString object). In other
3176 //     words, possibly _before_ the converter global object would be
3177 //     initialized.
3178
// NOTE(review): these names are presumably macros coming from the headers;
//               they are undefined here so that the plain identifiers can be
//               passed as the "name" argument of the macros below -- confirm
3179 #undef wxConvLibc
3180 #undef wxConvUTF8
3181 #undef wxConvUTF7
3182 #undef wxConvLocal
3183 #undef wxConvISO8859_1
3184
// WX_DEFINE_GLOBAL_CONV2(klass, impl_klass, name, ctor_args) expands to:
//  - an exported pointer variable name##Ptr of type klass*, set to NULL here
//  - an exported accessor wxGet_##name##Ptr() returning the address of a
//    function-local static object of type impl_klass which is constructed
//    with ctor_args
//  - a file-static pointer gs_##name##instance initialized by calling this
//    accessor (the comment inside the macro explains why)
//
// The macro body doesn't end with a semicolon: the uses below provide it.
3185 #define WX_DEFINE_GLOBAL_CONV2(klass, impl_klass, name, ctor_args)      \
3186     WXDLLIMPEXP_DATA_BASE(klass*) name##Ptr = NULL;                     \
3187     WXDLLIMPEXP_BASE klass* wxGet_##name##Ptr()                         \
3188     {                                                                   \
3189         static impl_klass name##Obj ctor_args;                          \
3190         return &name##Obj;                                              \
3191     }                                                                   \
3192     /* this ensures that all global converter objects are created */    \
3193     /* by the time static initialization is done, i.e. before any */    \
3194     /* thread is launched: */                                           \
3195     static klass* gs_##name##instance = wxGet_##name##Ptr()
3196
// shorthand for the case when the class of the object is the same as the
// class used for the pointer and the accessor
3197 #define WX_DEFINE_GLOBAL_CONV(klass, name, ctor_args) \
3198     WX_DEFINE_GLOBAL_CONV2(klass, klass, name, ctor_args)
3199
// wxConvLibc is accessed as a plain wxMBConv but the object behind it is a
// wxMBConv_win32 under Windows and a wxMBConvLibc elsewhere
3200 #ifdef __WINDOWS__
3201     WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConv_win32, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
3202 #else
3203     WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConvLibc, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
3204 #endif
3205
3206 // NB: we can't use wxEMPTY_PARAMETER_VALUE as final argument here because it's
3207 //     passed to WX_DEFINE_GLOBAL_CONV2 after a macro expansion and so still
3208 //     provokes an error message about "not enough macro parameters"; and we
3209 //     can't use "()" here as the name##Obj declaration would be parsed as a
3210 //     function declaration then, so use a semicolon and live with an extra
3211 //     empty statement (and hope that no compiler warns about this)
3212 WX_DEFINE_GLOBAL_CONV(wxMBConvStrictUTF8, wxConvUTF8, ;);
3213 WX_DEFINE_GLOBAL_CONV(wxMBConvUTF7, wxConvUTF7, ;);
3214
// both of these are wxCSConv objects, created for the system encoding and for
// ISO8859-1 respectively
3215 WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvLocal, (wxFONTENCODING_SYSTEM));
3216 WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvISO8859_1, (wxFONTENCODING_ISO8859_1));
3217
// wxConvCurrent initially points to the wxConvLibc object and wxConvUI to the
// wxConvLocal one
3218 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = wxGet_wxConvLibcPtr();
3219 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvUI = wxGet_wxConvLocalPtr();
3220
3221 #ifdef __DARWIN__
3222 // The xnu kernel always communicates file paths in decomposed UTF-8.
3223 // WARNING: Are we sure that CFString's conversion will cause decomposition?
3224 static wxMBConv_cf wxConvMacUTF8DObj(wxFONTENCODING_UTF8);
3225 #endif
3226
// wxConvFileName points to the UTF-8 wxMBConv_cf object defined above under
// Darwin and to the wxConvLibc object everywhere else
3227 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName =
3228 #ifdef __DARWIN__
3229                                     &wxConvMacUTF8DObj;
3230 #else // !__DARWIN__
3231                                     wxGet_wxConvLibcPtr();
3232 #endif // __DARWIN__/!__DARWIN__
3233
3234 #else // !wxUSE_WCHAR_T
3235
3236 // FIXME-UTF8: remove this, wxUSE_WCHAR_T is required now
3237 // stand-ins in absence of wchar_t
3238 WXDLLIMPEXP_DATA_BASE(wxMBConv) wxConvLibc,
3239                                 wxConvISO8859_1,
3240                                 wxConvLocal,
3241                                 wxConvUTF8;
3242
3243 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T