src/common/strconv.cpp

   1 /////////////////////////////////////////////////////////////////////////////
   2 // Name:        src/common/strconv.cpp
   3 // Purpose:     Unicode conversion classes
   4 // Author:      Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
   5 //              Ryan Norton, Fredrik Roubert (UTF7)
   6 // Modified by:
   7 // Created:     29/01/98
   8 // RCS-ID:      $Id$
   9 // Copyright:   (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
  10 //              (c) 2000-2003 Vadim Zeitlin
  11 //              (c) 2004 Ryan Norton, Fredrik Roubert
  12 // Licence:     wxWindows licence
  13 /////////////////////////////////////////////////////////////////////////////
  14
  15 // For compilers that support precompilation, includes "wx.h".
  16 #include "wx/wxprec.h"
  17
  18 #ifdef __BORLANDC__
  19     #pragma hdrstop
  20 #endif  //__BORLANDC__
  21
  22 #ifndef WX_PRECOMP
  23     #include "wx/intl.h"
  24     #include "wx/log.h"
  25     #include "wx/utils.h"
  26     #include "wx/hashmap.h"
  27 #endif
  28
  29 #include "wx/strconv.h"
  30
  31 #if wxUSE_WCHAR_T
  32
  33 #ifndef __WXWINCE__
  34 #include <errno.h>
  35 #endif
  36
  37 #include <ctype.h>
  38 #include <string.h>
  39 #include <stdlib.h>
  40
  41 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
  42     #include "wx/msw/private.h"
  43     #include "wx/msw/missing.h"
  44     #define wxHAVE_WIN32_MB2WC
  45 #endif
  46
  47 #ifdef HAVE_ICONV
  48     #include <iconv.h>
  49     #include "wx/thread.h"
  50 #endif
  51
  52 #include "wx/encconv.h"
  53 #include "wx/fontmap.h"
  54
  55 #ifdef __DARWIN__
  56 #include "wx/mac/corefoundation/private/strconv_cf.h"
  57 #endif //def __DARWIN__
  58
  59
  60 #define TRACE_STRCONV _T("strconv")
  61
  62 // WC_UTF16 is defined only if sizeof(wchar_t) == 2, otherwise it's supposed to
  63 // be 4 bytes
  64 #if SIZEOF_WCHAR_T == 2
  65     #define WC_UTF16
  66 #endif
  67
  68
  69 // ============================================================================
  70 // implementation
  71 // ============================================================================
  72
  73 // helper function of cMB2WC(): check if n bytes at this location are all NUL
  74 static bool NotAllNULs(const char *p, size_t n)
  75 {
  76     while ( n && *p++ == '\0' )
  77         n--;
  78
  79     return n != 0;
  80 }
  81
  82 // ----------------------------------------------------------------------------
  83 // UTF-16 en/decoding to/from UCS-4 with surrogates handling
  84 // ----------------------------------------------------------------------------
  85
  86 static size_t encode_utf16(wxUint32 input, wxUint16 *output)
  87 {
  88     if (input <= 0xffff)
  89     {
  90         if (output)
  91             *output = (wxUint16) input;
  92
  93         return 1;
  94     }
  95     else if (input >= 0x110000)
  96     {
  97         return wxCONV_FAILED;
  98     }
  99     else
 100     {
 101         if (output)
 102         {
 103             *output++ = (wxUint16) ((input >> 10) + 0xd7c0);
 104             *output = (wxUint16) ((input & 0x3ff) + 0xdc00);
 105         }
 106
 107         return 2;
 108     }
 109 }
 110
 111 static size_t decode_utf16(const wxUint16* input, wxUint32& output)
 112 {
 113     if ((*input < 0xd800) || (*input > 0xdfff))
 114     {
 115         output = *input;
 116         return 1;
 117     }
 118     else if ((input[1] < 0xdc00) || (input[1] > 0xdfff))
 119     {
 120         output = *input;
 121         return wxCONV_FAILED;
 122     }
 123     else
 124     {
 125         output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
 126         return 2;
 127     }
 128 }
 129
 130 #ifdef WC_UTF16
 131     typedef wchar_t wxDecodeSurrogate_t;
 132 #else // !WC_UTF16
 133     typedef wxUint16 wxDecodeSurrogate_t;
 134 #endif // WC_UTF16/!WC_UTF16
 135
 136 // returns the next UTF-32 character from the wchar_t buffer and advances the
 137 // pointer to the character after this one
 138 //
 139 // if an invalid character is found, *pSrc is set to NULL, the caller must
 140 // check for this
 141 static wxUint32 wxDecodeSurrogate(const wxDecodeSurrogate_t **pSrc)
 142 {
 143     wxUint32 out;
 144     const size_t
 145         n = decode_utf16(wx_reinterpret_cast(const wxUint16 *, *pSrc), out);
 146     if ( n == wxCONV_FAILED )
 147         *pSrc = NULL;
 148     else
 149         *pSrc += n;
 150
 151     return out;
 152 }
 153
 154 // ----------------------------------------------------------------------------
 155 // wxMBConv
 156 // ----------------------------------------------------------------------------
 157
 158 size_t
 159 wxMBConv::ToWChar(wchar_t *dst, size_t dstLen,
 160                   const char *src, size_t srcLen) const
 161 {
 162     // although new conversion classes are supposed to implement this function
 163     // directly, the existins ones only implement the old MB2WC() and so, to
 164     // avoid to have to rewrite all conversion classes at once, we provide a
 165     // default (but not efficient) implementation of this one in terms of the
 166     // old function by copying the input to ensure that it's NUL-terminated and
 167     // then using MB2WC() to convert it
 168
 169     // the number of chars [which would be] written to dst [if it were not NULL]
 170     size_t dstWritten = 0;
 171
 172     // the number of NULs terminating this string
 173     size_t nulLen = 0;  // not really needed, but just to avoid warnings
 174
 175     // if we were not given the input size we just have to assume that the
 176     // string is properly terminated as we have no way of knowing how long it
 177     // is anyhow, but if we do have the size check whether there are enough
 178     // NULs at the end
 179     wxCharBuffer bufTmp;
 180     const char *srcEnd;
 181     if ( srcLen != wxNO_LEN )
 182     {
 183         // we need to know how to find the end of this string
 184         nulLen = GetMBNulLen();
 185         if ( nulLen == wxCONV_FAILED )
 186             return wxCONV_FAILED;
 187
 188         // if there are enough NULs we can avoid the copy
 189         if ( srcLen < nulLen || NotAllNULs(src + srcLen - nulLen, nulLen) )
 190         {
 191             // make a copy in order to properly NUL-terminate the string
 192             bufTmp = wxCharBuffer(srcLen + nulLen - 1 /* 1 will be added */);
 193             char * const p = bufTmp.data();
 194             memcpy(p, src, srcLen);
 195             for ( char *s = p + srcLen; s < p + srcLen + nulLen; s++ )
 196                 *s = '\0';
 197
 198             src = bufTmp;
 199         }
 200
 201         srcEnd = src + srcLen;
 202     }
 203     else // quit after the first loop iteration
 204     {
 205         srcEnd = NULL;
 206     }
 207
 208     for ( ;; )
 209     {
 210         // try to convert the current chunk
 211         size_t lenChunk = MB2WC(NULL, src, 0);
 212         if ( lenChunk == wxCONV_FAILED )
 213             return wxCONV_FAILED;
 214
 215         lenChunk++; // for the L'\0' at the end of this chunk
 216
 217         dstWritten += lenChunk;
 218
 219         if ( lenChunk == 1 )
 220         {
 221             // nothing left in the input string, conversion succeeded
 222             break;
 223         }
 224
 225         if ( dst )
 226         {
 227             if ( dstWritten > dstLen )
 228                 return wxCONV_FAILED;
 229
 230             if ( MB2WC(dst, src, lenChunk) == wxCONV_FAILED )
 231                 return wxCONV_FAILED;
 232
 233             dst += lenChunk;
 234         }
 235
 236         if ( !srcEnd )
 237         {
 238             // we convert just one chunk in this case as this is the entire
 239             // string anyhow
 240             break;
 241         }
 242
 243         // advance the input pointer past the end of this chunk
 244         while ( NotAllNULs(src, nulLen) )
 245         {
 246             // notice that we must skip over multiple bytes here as we suppose
 247             // that if NUL takes 2 or 4 bytes, then all the other characters do
 248             // too and so if advanced by a single byte we might erroneously
 249             // detect sequences of NUL bytes in the middle of the input
 250             src += nulLen;
 251         }
 252
 253         src += nulLen; // skipping over its terminator as well
 254
 255         // note that ">=" (and not just "==") is needed here as the terminator
 256         // we skipped just above could be inside or just after the buffer
 257         // delimited by inEnd
 258         if ( src >= srcEnd )
 259             break;
 260     }
 261
 262     return dstWritten;
 263 }
 264
 265 size_t
 266 wxMBConv::FromWChar(char *dst, size_t dstLen,
 267                     const wchar_t *src, size_t srcLen) const
 268 {
 269     // the number of chars [which would be] written to dst [if it were not NULL]
 270     size_t dstWritten = 0;
 271
 272     // make a copy of the input string unless it is already properly
 273     // NUL-terminated
 274     //
 275     // if we don't know its length we have no choice but to assume that it is,
 276     // indeed, properly terminated
 277     wxWCharBuffer bufTmp;
 278     if ( srcLen == wxNO_LEN )
 279     {
 280         srcLen = wxWcslen(src) + 1;
 281     }
 282     else if ( srcLen != 0 && src[srcLen - 1] != L'\0' )
 283     {
 284         // make a copy in order to properly NUL-terminate the string
 285         bufTmp = wxWCharBuffer(srcLen);
 286         memcpy(bufTmp.data(), src, srcLen * sizeof(wchar_t));
 287         src = bufTmp;
 288     }
 289
 290     const size_t lenNul = GetMBNulLen();
 291     for ( const wchar_t * const srcEnd = src + srcLen;
 292           src < srcEnd;
 293           src += wxWcslen(src) + 1 /* skip L'\0' too */ )
 294     {
 295         // try to convert the current chunk
 296         size_t lenChunk = WC2MB(NULL, src, 0);
 297
 298         if ( lenChunk == wxCONV_FAILED )
 299             return wxCONV_FAILED;
 300
 301         lenChunk += lenNul;
 302         dstWritten += lenChunk;
 303
 304         if ( dst )
 305         {
 306             if ( dstWritten > dstLen )
 307                 return wxCONV_FAILED;
 308
 309             if ( WC2MB(dst, src, lenChunk) == wxCONV_FAILED )
 310                 return wxCONV_FAILED;
 311
 312             dst += lenChunk;
 313         }
 314     }
 315
 316     return dstWritten;
 317 }
 318
 319 size_t wxMBConv::MB2WC(wchar_t *outBuff, const char *inBuff, size_t outLen) const
 320 {
 321     size_t rc = ToWChar(outBuff, outLen, inBuff);
 322     if ( rc != wxCONV_FAILED )
 323     {
 324         // ToWChar() returns the buffer length, i.e. including the trailing
 325         // NUL, while this method doesn't take it into account
 326         rc--;
 327     }
 328
 329     return rc;
 330 }
 331
 332 size_t wxMBConv::WC2MB(char *outBuff, const wchar_t *inBuff, size_t outLen) const
 333 {
 334     size_t rc = FromWChar(outBuff, outLen, inBuff);
 335     if ( rc != wxCONV_FAILED )
 336     {
 337         rc -= GetMBNulLen();
 338     }
 339
 340     return rc;
 341 }
 342
 343 wxMBConv::~wxMBConv()
 344 {
 345     // nothing to do here (necessary for Darwin linking probably)
 346 }
 347
 348 const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
 349 {
 350     if ( psz )
 351     {
 352         // calculate the length of the buffer needed first
 353         const size_t nLen = ToWChar(NULL, 0, psz);
 354         if ( nLen != wxCONV_FAILED )
 355         {
 356             // now do the actual conversion
 357             wxWCharBuffer buf(nLen - 1 /* +1 added implicitly */);
 358
 359             // +1 for the trailing NULL
 360             if ( ToWChar(buf.data(), nLen, psz) != wxCONV_FAILED )
 361                 return buf;
 362         }
 363     }
 364
 365     return wxWCharBuffer();
 366 }
 367
 368 const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
 369 {
 370     if ( pwz )
 371     {
 372         const size_t nLen = FromWChar(NULL, 0, pwz);
 373         if ( nLen != wxCONV_FAILED )
 374         {
 375             wxCharBuffer buf(nLen - 1);
 376             if ( FromWChar(buf.data(), nLen, pwz) != wxCONV_FAILED )
 377                 return buf;
 378         }
 379     }
 380
 381     return wxCharBuffer();
 382 }
 383
 384 const wxWCharBuffer
 385 wxMBConv::cMB2WC(const char *inBuff, size_t inLen, size_t *outLen) const
 386 {
 387     const size_t dstLen = ToWChar(NULL, 0, inBuff, inLen);
 388     if ( dstLen != wxCONV_FAILED )
 389     {
 390         // notice that we allocate space for dstLen+1 wide characters here
 391         // because we want the buffer to always be NUL-terminated, even if the
 392         // input isn't (as otherwise the caller has no way to know its length)
 393         wxWCharBuffer wbuf(dstLen);
 394         wbuf.data()[dstLen - 1] = L'\0';
 395         if ( ToWChar(wbuf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
 396         {
 397             if ( outLen )
 398             {
 399                 *outLen = dstLen;
 400                 if ( wbuf[dstLen - 1] == L'\0' )
 401                     (*outLen)--;
 402             }
 403
 404             return wbuf;
 405         }
 406     }
 407
 408     if ( outLen )
 409         *outLen = 0;
 410
 411     return wxWCharBuffer();
 412 }
 413
 414 const wxCharBuffer
 415 wxMBConv::cWC2MB(const wchar_t *inBuff, size_t inLen, size_t *outLen) const
 416 {
 417     size_t dstLen = FromWChar(NULL, 0, inBuff, inLen);
 418     if ( dstLen != wxCONV_FAILED )
 419     {
 420         const size_t nulLen = GetMBNulLen();
 421
 422         // as above, ensure that the buffer is always NUL-terminated, even if
 423         // the input is not
 424         wxCharBuffer buf(dstLen + nulLen - 1);
 425         memset(buf.data() + dstLen, 0, nulLen);
 426         if ( FromWChar(buf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
 427         {
 428             if ( outLen )
 429             {
 430                 *outLen = dstLen;
 431
 432                 if ( dstLen >= nulLen &&
 433                         !NotAllNULs(buf.data() + dstLen - nulLen, nulLen) )
 434                 {
 435                     // in this case the output is NUL-terminated and we're not
 436                     // supposed to count NUL
 437                     *outLen -= nulLen;
 438                 }
 439             }
 440
 441             return buf;
 442         }
 443     }
 444
 445     if ( outLen )
 446         *outLen = 0;
 447
 448     return wxCharBuffer();
 449 }
 450
 451 // ----------------------------------------------------------------------------
 452 // wxMBConvLibc
 453 // ----------------------------------------------------------------------------
 454
 455 size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 456 {
 457     return wxMB2WC(buf, psz, n);
 458 }
 459
 460 size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 461 {
 462     return wxWC2MB(buf, psz, n);
 463 }
 464
 465 // ----------------------------------------------------------------------------
 466 // wxConvBrokenFileNames
 467 // ----------------------------------------------------------------------------
 468
 469 #ifdef __UNIX__
 470
 471 wxConvBrokenFileNames::wxConvBrokenFileNames(const wxString& charset)
 472 {
 473     if ( wxStricmp(charset, _T("UTF-8")) == 0 ||
 474          wxStricmp(charset, _T("UTF8")) == 0  )
 475         m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_PUA);
 476     else
 477         m_conv = new wxCSConv(charset);
 478 }
 479
 480 #endif // __UNIX__
 481
 482 // ----------------------------------------------------------------------------
 483 // UTF-7
 484 // ----------------------------------------------------------------------------
 485
 486 // Implementation (C) 2004 Fredrik Roubert
 487
 488 //
 489 // BASE64 decoding table
 490 //
 491 static const unsigned char utf7unb64[] =
 492 {
 493     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 494     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 495     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 496     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 497     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 498     0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
 499     0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b,
 500     0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 501     0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
 502     0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
 503     0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
 504     0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
 505     0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
 506     0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
 507     0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30,
 508     0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
 509     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 510     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 511     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 512     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 513     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 514     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 515     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 516     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 517     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 518     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 519     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 520     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 521     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 522     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 523     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 524     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
 525 };
 526
 527 size_t wxMBConvUTF7::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 528 {
 529     size_t len = 0;
 530
 531     while ( *psz && (!buf || (len < n)) )
 532     {
 533         unsigned char cc = *psz++;
 534         if (cc != '+')
 535         {
 536             // plain ASCII char
 537             if (buf)
 538                 *buf++ = cc;
 539             len++;
 540         }
 541         else if (*psz == '-')
 542         {
 543             // encoded plus sign
 544             if (buf)
 545                 *buf++ = cc;
 546             len++;
 547             psz++;
 548         }
 549         else // start of BASE64 encoded string
 550         {
 551             bool lsb, ok;
 552             unsigned int d, l;
 553             for ( ok = lsb = false, d = 0, l = 0;
 554                   (cc = utf7unb64[(unsigned char)*psz]) != 0xff;
 555                   psz++ )
 556             {
 557                 d <<= 6;
 558                 d += cc;
 559                 for (l += 6; l >= 8; lsb = !lsb)
 560                 {
 561                     unsigned char c = (unsigned char)((d >> (l -= 8)) % 256);
 562                     if (lsb)
 563                     {
 564                         if (buf)
 565                             *buf++ |= c;
 566                         len ++;
 567                     }
 568                     else
 569                     {
 570                         if (buf)
 571                             *buf = (wchar_t)(c << 8);
 572                     }
 573
 574                     ok = true;
 575                 }
 576             }
 577
 578             if ( !ok )
 579             {
 580                 // in valid UTF7 we should have valid characters after '+'
 581                 return wxCONV_FAILED;
 582             }
 583
 584             if (*psz == '-')
 585                 psz++;
 586         }
 587     }
 588
 589     if ( buf && (len < n) )
 590         *buf = '\0';
 591
 592     return len;
 593 }
 594
 595 //
 596 // BASE64 encoding table
 597 //
 598 static const unsigned char utf7enb64[] =
 599 {
 600     'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
 601     'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
 602     'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
 603     'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
 604     'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
 605     'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
 606     'w', 'x', 'y', 'z', '0', '1', '2', '3',
 607     '4', '5', '6', '7', '8', '9', '+', '/'
 608 };
 609
 610 //
 611 // UTF-7 encoding table
 612 //
 613 // 0 - Set D (directly encoded characters)
 614 // 1 - Set O (optional direct characters)
 615 // 2 - whitespace characters (optional)
 616 // 3 - special characters
 617 //
 618 static const unsigned char utf7encode[128] =
 619 {
 620     3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
 621     3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
 622     2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
 623     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
 624     1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 625     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
 626     1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 627     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
 628 };
 629
 630 size_t wxMBConvUTF7::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 631 {
 632     size_t len = 0;
 633
 634     while (*psz && ((!buf) || (len < n)))
 635     {
 636         wchar_t cc = *psz++;
 637         if (cc < 0x80 && utf7encode[cc] < 1)
 638         {
 639             // plain ASCII char
 640             if (buf)
 641                 *buf++ = (char)cc;
 642
 643             len++;
 644         }
 645 #ifndef WC_UTF16
 646         else if (((wxUint32)cc) > 0xffff)
 647         {
 648             // no surrogate pair generation (yet?)
 649             return wxCONV_FAILED;
 650         }
 651 #endif
 652         else
 653         {
 654             if (buf)
 655                 *buf++ = '+';
 656
 657             len++;
 658             if (cc != '+')
 659             {
 660                 // BASE64 encode string
 661                 unsigned int lsb, d, l;
 662                 for (d = 0, l = 0; /*nothing*/; psz++)
 663                 {
 664                     for (lsb = 0; lsb < 2; lsb ++)
 665                     {
 666                         d <<= 8;
 667                         d += lsb ? cc & 0xff : (cc & 0xff00) >> 8;
 668
 669                         for (l += 8; l >= 6; )
 670                         {
 671                             l -= 6;
 672                             if (buf)
 673                                 *buf++ = utf7enb64[(d >> l) % 64];
 674                             len++;
 675                         }
 676                     }
 677
 678                     cc = *psz;
 679                     if (!(cc) || (cc < 0x80 && utf7encode[cc] < 1))
 680                         break;
 681                 }
 682
 683                 if (l != 0)
 684                 {
 685                     if (buf)
 686                         *buf++ = utf7enb64[((d % 16) << (6 - l)) % 64];
 687
 688                     len++;
 689                 }
 690             }
 691
 692             if (buf)
 693                 *buf++ = '-';
 694             len++;
 695         }
 696     }
 697
 698     if (buf && (len < n))
 699         *buf = 0;
 700
 701     return len;
 702 }
 703
 704 // ----------------------------------------------------------------------------
 705 // UTF-8
 706 // ----------------------------------------------------------------------------
 707
 708 static const wxUint32 utf8_max[]=
 709     { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
 710
 711 // boundaries of the private use area we use to (temporarily) remap invalid
 712 // characters invalid in a UTF-8 encoded string
 713 const wxUint32 wxUnicodePUA = 0x100000;
 714 const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
 715
 716 // this table gives the length of the UTF-8 encoding from its first character:
 717 const unsigned char tableUtf8Lengths[256] = {
 718     // single-byte sequences (ASCII):
 719     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 00..0F
 720     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 10..1F
 721     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 20..2F
 722     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 30..3F
 723     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 40..4F
 724     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 50..5F
 725     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 60..6F
 726     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 70..7F
 727
 728     // these are invalid:
 729     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 80..8F
 730     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 90..9F
 731     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // A0..AF
 732     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // B0..BF
 733     0, 0,                                            // C0,C1
 734
 735     // two-byte sequences:
 736           2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // C2..CF
 737     2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // D0..DF
 738
 739     // three-byte sequences:
 740     3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,  // E0..EF
 741
 742     // four-byte sequences:
 743     4, 4, 4, 4, 4,                                   // F0..F4
 744
 745     // these are invalid again (5- or 6-byte
 746     // sequences and sequences for code points
 747     // above U+10FFFF, as restricted by RFC 3629):
 748                    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0   // F5..FF
 749 };
 750
 751 size_t
 752 wxMBConvStrictUTF8::ToWChar(wchar_t *dst, size_t dstLen,
 753                             const char *src, size_t srcLen) const
 754 {
 755     wchar_t *out = dstLen ? dst : NULL;
 756     size_t written = 0;
 757
 758     if ( srcLen == wxNO_LEN )
 759         srcLen = strlen(src) + 1;
 760
 761     for ( const char *p = src; ; p++ )
 762     {
 763         if ( !(srcLen == wxNO_LEN ? *p : srcLen) )
 764         {
 765             // all done successfully, just add the trailing NULL if we are not
 766             // using explicit length
 767             if ( srcLen == wxNO_LEN )
 768             {
 769                 if ( out )
 770                 {
 771                     if ( !dstLen )
 772                         break;
 773
 774                     *out = L'\0';
 775                 }
 776
 777                 written++;
 778             }
 779
 780             return written;
 781         }
 782
 783         if ( out && !dstLen-- )
 784             break;
 785
 786         wxUint32 code;
 787         unsigned char c = *p;
 788
 789         if ( c < 0x80 )
 790         {
 791             if ( srcLen == 0 ) // the test works for wxNO_LEN too
 792                 break;
 793
 794             if ( srcLen != wxNO_LEN )
 795                 srcLen--;
 796
 797             code = c;
 798         }
 799         else
 800         {
 801             unsigned len = tableUtf8Lengths[c];
 802             if ( !len )
 803                 break;
 804
 805             if ( srcLen < len ) // the test works for wxNO_LEN too
 806                 break;
 807
 808             if ( srcLen != wxNO_LEN )
 809                 srcLen -= len;
 810
 811             //   Char. number range   |        UTF-8 octet sequence
 812             //      (hexadecimal)     |              (binary)
 813             //  ----------------------+----------------------------------------
 814             //  0000 0000 - 0000 007F | 0xxxxxxx
 815             //  0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
 816             //  0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
 817             //  0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 818             //
 819             //  Code point value is stored in bits marked with 'x',
 820             //  lowest-order bit of the value on the right side in the diagram
 821             //  above.                                         (from RFC 3629)
 822
 823             // mask to extract lead byte's value ('x' bits above), by sequence
 824             // length:
 825             static const unsigned char leadValueMask[] = { 0x7F, 0x1F, 0x0F, 0x07 };
 826
 827             // mask and value of lead byte's most significant bits, by length:
 828             static const unsigned char leadMarkerMask[] = { 0x80, 0xE0, 0xF0, 0xF8 };
 829             static const unsigned char leadMarkerVal[] = { 0x00, 0xC0, 0xE0, 0xF0 };
 830
 831             len--; // it's more convenient to work with 0-based length here
 832
 833             // extract the lead byte's value bits:
 834             if ( (c & leadMarkerMask[len]) != leadMarkerVal[len] )
 835                 break;
 836
 837             code = c & leadValueMask[len];
 838
 839             // all remaining bytes, if any, are handled in the same way
 840             // regardless of sequence's length:
 841             for ( ; len; --len )
 842             {
 843                 c = *++p;
 844                 if ( (c & 0xC0) != 0x80 )
 845                     return wxCONV_FAILED;
 846
 847                 code <<= 6;
 848                 code |= c & 0x3F;
 849             }
 850         }
 851
 852 #ifdef WC_UTF16
 853         // cast is ok because wchar_t == wxUint16 if WC_UTF16
 854         if ( encode_utf16(code, (wxUint16 *)out) == 2 )
 855         {
 856             if ( out )
 857                 out++;
 858             written++;
 859         }
 860 #else // !WC_UTF16
 861         if ( out )
 862             *out = code;
 863 #endif // WC_UTF16/!WC_UTF16
 864
 865         if ( out )
 866             out++;
 867
 868         written++;
 869     }
 870
 871     return wxCONV_FAILED;
 872 }
 873
 874 size_t
 875 wxMBConvStrictUTF8::FromWChar(char *dst, size_t dstLen,
 876                               const wchar_t *src, size_t srcLen) const
 877 {
 878     char *out = dstLen ? dst : NULL;
 879     size_t written = 0;
 880
 881     for ( const wchar_t *wp = src; ; wp++ )
 882     {
 883         if ( !(srcLen == wxNO_LEN ? *wp : srcLen--) )
 884         {
 885             // all done successfully, just add the trailing NULL if we are not
 886             // using explicit length
 887             if ( srcLen == wxNO_LEN )
 888             {
 889                 if ( out )
 890                 {
 891                     if ( !dstLen )
 892                         break;
 893
 894                     *out = '\0';
 895                 }
 896
 897                 written++;
 898             }
 899
 900             return written;
 901         }
 902
 903
 904         wxUint32 code;
 905 #ifdef WC_UTF16
 906         // cast is ok for WC_UTF16
 907         if ( decode_utf16((const wxUint16 *)wp, code) == 2 )
 908         {
 909             // skip the next char too as we decoded a surrogate
 910             wp++;
 911         }
 912 #else // wchar_t is UTF-32
 913         code = *wp & 0x7fffffff;
 914 #endif
 915
 916         unsigned len;
 917         if ( code <= 0x7F )
 918         {
 919             len = 1;
 920             if ( out )
 921             {
 922                 if ( dstLen < len )
 923                     break;
 924
 925                 out[0] = (char)code;
 926             }
 927         }
 928         else if ( code <= 0x07FF )
 929         {
 930             len = 2;
 931             if ( out )
 932             {
 933                 if ( dstLen < len )
 934                     break;
 935
 936                 // NB: this line takes 6 least significant bits, encodes them as
 937                 // 10xxxxxx and discards them so that the next byte can be encoded:
 938                 out[1] = 0x80 | (code & 0x3F);  code >>= 6;
 939                 out[0] = 0xC0 | code;
 940             }
 941         }
 942         else if ( code < 0xFFFF )
 943         {
 944             len = 3;
 945             if ( out )
 946             {
 947                 if ( dstLen < len )
 948                     break;
 949
 950                 out[2] = 0x80 | (code & 0x3F);  code >>= 6;
 951                 out[1] = 0x80 | (code & 0x3F);  code >>= 6;
 952                 out[0] = 0xE0 | code;
 953             }
 954         }
 955         else if ( code <= 0x10FFFF )
 956         {
 957             len = 4;
 958             if ( out )
 959             {
 960                 if ( dstLen < len )
 961                     break;
 962
 963                 out[3] = 0x80 | (code & 0x3F);  code >>= 6;
 964                 out[2] = 0x80 | (code & 0x3F);  code >>= 6;
 965                 out[1] = 0x80 | (code & 0x3F);  code >>= 6;
 966                 out[0] = 0xF0 | code;
 967             }
 968         }
 969         else
 970         {
 971             wxFAIL_MSG( _T("trying to encode undefined Unicode character") );
 972             break;
 973         }
 974
 975         if ( out )
 976         {
 977             out += len;
 978             dstLen -= len;
 979         }
 980
 981         written += len;
 982     }
 983
 984     // we only get here if an error occurs during decoding
 985     return wxCONV_FAILED;
 986 }
 987
 988 size_t wxMBConvUTF8::ToWChar(wchar_t *buf, size_t n,
 989                              const char *psz, size_t srcLen) const
 990 {
 991     if ( m_options == MAP_INVALID_UTF8_NOT )
 992         return wxMBConvStrictUTF8::ToWChar(buf, n, psz, srcLen);
 993
 994     size_t len = 0;
 995
 996     while ((srcLen == wxNO_LEN ? *psz : srcLen--) && ((!buf) || (len < n)))
 997     {
 998         const char *opsz = psz;
 999         bool invalid = false;
1000         unsigned char cc = *psz++, fc = cc;
1001         unsigned cnt;
1002         for (cnt = 0; fc & 0x80; cnt++)
1003             fc <<= 1;
1004
1005         if (!cnt)
1006         {
1007             // plain ASCII char
1008             if (buf)
1009                 *buf++ = cc;
1010             len++;
1011
1012             // escape the escape character for octal escapes
1013             if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
1014                     && cc == '\\' && (!buf || len < n))
1015             {
1016                 if (buf)
1017                     *buf++ = cc;
1018                 len++;
1019             }
1020         }
1021         else
1022         {
1023             cnt--;
1024             if (!cnt)
1025             {
1026                 // invalid UTF-8 sequence
1027                 invalid = true;
1028             }
1029             else
1030             {
1031                 unsigned ocnt = cnt - 1;
1032                 wxUint32 res = cc & (0x3f >> cnt);
1033                 while (cnt--)
1034                 {
1035                     cc = *psz;
1036                     if ((cc & 0xC0) != 0x80)
1037                     {
1038                         // invalid UTF-8 sequence
1039                         invalid = true;
1040                         break;
1041                     }
1042
1043                     psz++;
1044                     res = (res << 6) | (cc & 0x3f);
1045                 }
1046
1047                 if (invalid || res <= utf8_max[ocnt])
1048                 {
1049                     // illegal UTF-8 encoding
1050                     invalid = true;
1051                 }
1052                 else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
1053                         res >= wxUnicodePUA && res < wxUnicodePUAEnd)
1054                 {
1055                     // if one of our PUA characters turns up externally
1056                     // it must also be treated as an illegal sequence
1057                     // (a bit like you have to escape an escape character)
1058                     invalid = true;
1059                 }
1060                 else
1061                 {
1062 #ifdef WC_UTF16
1063                     // cast is ok because wchar_t == wxUint16 if WC_UTF16
1064                     size_t pa = encode_utf16(res, (wxUint16 *)buf);
1065                     if (pa == wxCONV_FAILED)
1066                     {
1067                         invalid = true;
1068                     }
1069                     else
1070                     {
1071                         if (buf)
1072                             buf += pa;
1073                         len += pa;
1074                     }
1075 #else // !WC_UTF16
1076                     if (buf)
1077                         *buf++ = (wchar_t)res;
1078                     len++;
1079 #endif // WC_UTF16/!WC_UTF16
1080                 }
1081             }
1082
1083             if (invalid)
1084             {
1085                 if (m_options & MAP_INVALID_UTF8_TO_PUA)
1086                 {
1087                     while (opsz < psz && (!buf || len < n))
1088                     {
1089 #ifdef WC_UTF16
1090                         // cast is ok because wchar_t == wxUuint16 if WC_UTF16
1091                         size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
1092                         wxASSERT(pa != wxCONV_FAILED);
1093                         if (buf)
1094                             buf += pa;
1095                         opsz++;
1096                         len += pa;
1097 #else
1098                         if (buf)
1099                             *buf++ = (wchar_t)(wxUnicodePUA + (unsigned char)*opsz);
1100                         opsz++;
1101                         len++;
1102 #endif
1103                     }
1104                 }
1105                 else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
1106                 {
1107                     while (opsz < psz && (!buf || len < n))
1108                     {
1109                         if ( buf && len + 3 < n )
1110                         {
1111                             unsigned char on = *opsz;
1112                             *buf++ = L'\\';
1113                             *buf++ = (wchar_t)( L'0' + on / 0100 );
1114                             *buf++ = (wchar_t)( L'0' + (on % 0100) / 010 );
1115                             *buf++ = (wchar_t)( L'0' + on % 010 );
1116                         }
1117
1118                         opsz++;
1119                         len += 4;
1120                     }
1121                 }
1122                 else // MAP_INVALID_UTF8_NOT
1123                 {
1124                     return wxCONV_FAILED;
1125                 }
1126             }
1127         }
1128     }
1129
1130     if (srcLen == wxNO_LEN && buf && (len < n))
1131         *buf = 0;
1132
1133     return len + 1;
1134 }
1135
// Returns true if the given wide character is one of the octal digits 0..7.
static inline bool isoctal(wchar_t wch)
{
    if ( wch < L'0' )
        return false;

    return !(wch > L'7');
}
1140
1141 size_t wxMBConvUTF8::FromWChar(char *buf, size_t n,
1142                                const wchar_t *psz, size_t srcLen) const
1143 {
1144     if ( m_options == MAP_INVALID_UTF8_NOT )
1145         return wxMBConvStrictUTF8::FromWChar(buf, n, psz, srcLen);
1146
1147     size_t len = 0;
1148
1149     while ((srcLen == wxNO_LEN ? *psz : srcLen--) && ((!buf) || (len < n)))
1150     {
1151         wxUint32 cc;
1152
1153 #ifdef WC_UTF16
1154         // cast is ok for WC_UTF16
1155         size_t pa = decode_utf16((const wxUint16 *)psz, cc);
1156         psz += (pa == wxCONV_FAILED) ? 1 : pa;
1157 #else
1158         cc = (*psz++) & 0x7fffffff;
1159 #endif
1160
1161         if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
1162                 && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
1163         {
1164             if (buf)
1165                 *buf++ = (char)(cc - wxUnicodePUA);
1166             len++;
1167         }
1168         else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL)
1169                     && cc == L'\\' && psz[0] == L'\\' )
1170         {
1171             if (buf)
1172                 *buf++ = (char)cc;
1173             psz++;
1174             len++;
1175         }
1176         else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
1177                     cc == L'\\' &&
1178                         isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
1179         {
1180             if (buf)
1181             {
1182                 *buf++ = (char) ((psz[0] - L'0') * 0100 +
1183                                  (psz[1] - L'0') * 010 +
1184                                  (psz[2] - L'0'));
1185             }
1186
1187             psz += 3;
1188             len++;
1189         }
1190         else
1191         {
1192             unsigned cnt;
1193             for (cnt = 0; cc > utf8_max[cnt]; cnt++)
1194             {
1195             }
1196
1197             if (!cnt)
1198             {
1199                 // plain ASCII char
1200                 if (buf)
1201                     *buf++ = (char) cc;
1202                 len++;
1203             }
1204             else
1205             {
1206                 len += cnt + 1;
1207                 if (buf)
1208                 {
1209                     *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
1210                     while (cnt--)
1211                         *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
1212                 }
1213             }
1214         }
1215     }
1216
1217     if (srcLen == wxNO_LEN && buf && (len < n))
1218         *buf = 0;
1219
1220     return len + 1;
1221 }
1222
1223 // ============================================================================
1224 // UTF-16
1225 // ============================================================================
1226
1227 #ifdef WORDS_BIGENDIAN
1228     #define wxMBConvUTF16straight wxMBConvUTF16BE
1229     #define wxMBConvUTF16swap     wxMBConvUTF16LE
1230 #else
1231     #define wxMBConvUTF16swap     wxMBConvUTF16BE
1232     #define wxMBConvUTF16straight wxMBConvUTF16LE
1233 #endif
1234
1235 /* static */
1236 size_t wxMBConvUTF16Base::GetLength(const char *src, size_t srcLen)
1237 {
1238     if ( srcLen == wxNO_LEN )
1239     {
1240         // count the number of bytes in input, including the trailing NULs
1241         const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1242         for ( srcLen = 1; *inBuff++; srcLen++ )
1243             ;
1244
1245         srcLen *= BYTES_PER_CHAR;
1246     }
1247     else // we already have the length
1248     {
1249         // we can only convert an entire number of UTF-16 characters
1250         if ( srcLen % BYTES_PER_CHAR )
1251             return wxCONV_FAILED;
1252     }
1253
1254     return srcLen;
1255 }
1256
1257 // case when in-memory representation is UTF-16 too
1258 #ifdef WC_UTF16
1259
1260 // ----------------------------------------------------------------------------
1261 // conversions without endianness change
1262 // ----------------------------------------------------------------------------
1263
1264 size_t
1265 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1266                                const char *src, size_t srcLen) const
1267 {
1268     // set up the scene for using memcpy() (which is presumably more efficient
1269     // than copying the bytes one by one)
1270     srcLen = GetLength(src, srcLen);
1271     if ( srcLen == wxNO_LEN )
1272         return wxCONV_FAILED;
1273
1274     const size_t inLen = srcLen / BYTES_PER_CHAR;
1275     if ( dst )
1276     {
1277         if ( dstLen < inLen )
1278             return wxCONV_FAILED;
1279
1280         memcpy(dst, src, srcLen);
1281     }
1282
1283     return inLen;
1284 }
1285
1286 size_t
1287 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1288                                  const wchar_t *src, size_t srcLen) const
1289 {
1290     if ( srcLen == wxNO_LEN )
1291         srcLen = wxWcslen(src) + 1;
1292
1293     srcLen *= BYTES_PER_CHAR;
1294
1295     if ( dst )
1296     {
1297         if ( dstLen < srcLen )
1298             return wxCONV_FAILED;
1299
1300         memcpy(dst, src, srcLen);
1301     }
1302
1303     return srcLen;
1304 }
1305
1306 // ----------------------------------------------------------------------------
1307 // endian-reversing conversions
1308 // ----------------------------------------------------------------------------
1309
1310 size_t
1311 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1312                            const char *src, size_t srcLen) const
1313 {
1314     srcLen = GetLength(src, srcLen);
1315     if ( srcLen == wxNO_LEN )
1316         return wxCONV_FAILED;
1317
1318     srcLen /= BYTES_PER_CHAR;
1319
1320     if ( dst )
1321     {
1322         if ( dstLen < srcLen )
1323             return wxCONV_FAILED;
1324
1325         const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1326         for ( size_t n = 0; n < srcLen; n++, inBuff++ )
1327         {
1328             *dst++ = wxUINT16_SWAP_ALWAYS(*inBuff);
1329         }
1330     }
1331
1332     return srcLen;
1333 }
1334
1335 size_t
1336 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1337                              const wchar_t *src, size_t srcLen) const
1338 {
1339     if ( srcLen == wxNO_LEN )
1340         srcLen = wxWcslen(src) + 1;
1341
1342     srcLen *= BYTES_PER_CHAR;
1343
1344     if ( dst )
1345     {
1346         if ( dstLen < srcLen )
1347             return wxCONV_FAILED;
1348
1349         wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1350         for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
1351         {
1352             *outBuff++ = wxUINT16_SWAP_ALWAYS(*src);
1353         }
1354     }
1355
1356     return srcLen;
1357 }
1358
1359 #else // !WC_UTF16: wchar_t is UTF-32
1360
1361 // ----------------------------------------------------------------------------
1362 // conversions without endianness change
1363 // ----------------------------------------------------------------------------
1364
1365 size_t
1366 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1367                                const char *src, size_t srcLen) const
1368 {
1369     srcLen = GetLength(src, srcLen);
1370     if ( srcLen == wxNO_LEN )
1371         return wxCONV_FAILED;
1372
1373     const size_t inLen = srcLen / BYTES_PER_CHAR;
1374     if ( !dst )
1375     {
1376         // optimization: return maximal space which could be needed for this
1377         // string even if the real size could be smaller if the buffer contains
1378         // any surrogates
1379         return inLen;
1380     }
1381
1382     size_t outLen = 0;
1383     const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1384     for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1385     {
1386         const wxUint32 ch = wxDecodeSurrogate(&inBuff);
1387         if ( !inBuff )
1388             return wxCONV_FAILED;
1389
1390         if ( ++outLen > dstLen )
1391             return wxCONV_FAILED;
1392
1393         *dst++ = ch;
1394     }
1395
1396
1397     return outLen;
1398 }
1399
1400 size_t
1401 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1402                                  const wchar_t *src, size_t srcLen) const
1403 {
1404     if ( srcLen == wxNO_LEN )
1405         srcLen = wxWcslen(src) + 1;
1406
1407     size_t outLen = 0;
1408     wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1409     for ( size_t n = 0; n < srcLen; n++ )
1410     {
1411         wxUint16 cc[2];
1412         const size_t numChars = encode_utf16(*src++, cc);
1413         if ( numChars == wxCONV_FAILED )
1414             return wxCONV_FAILED;
1415
1416         outLen += numChars * BYTES_PER_CHAR;
1417         if ( outBuff )
1418         {
1419             if ( outLen > dstLen )
1420                 return wxCONV_FAILED;
1421
1422             *outBuff++ = cc[0];
1423             if ( numChars == 2 )
1424             {
1425                 // second character of a surrogate
1426                 *outBuff++ = cc[1];
1427             }
1428         }
1429     }
1430
1431     return outLen;
1432 }
1433
1434 // ----------------------------------------------------------------------------
1435 // endian-reversing conversions
1436 // ----------------------------------------------------------------------------
1437
1438 size_t
1439 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1440                            const char *src, size_t srcLen) const
1441 {
1442     srcLen = GetLength(src, srcLen);
1443     if ( srcLen == wxNO_LEN )
1444         return wxCONV_FAILED;
1445
1446     const size_t inLen = srcLen / BYTES_PER_CHAR;
1447     if ( !dst )
1448     {
1449         // optimization: return maximal space which could be needed for this
1450         // string even if the real size could be smaller if the buffer contains
1451         // any surrogates
1452         return inLen;
1453     }
1454
1455     size_t outLen = 0;
1456     const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1457     for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1458     {
1459         wxUint32 ch;
1460         wxUint16 tmp[2];
1461
1462         tmp[0] = wxUINT16_SWAP_ALWAYS(*inBuff);
1463         inBuff++;
1464         tmp[1] = wxUINT16_SWAP_ALWAYS(*inBuff);
1465
1466         const size_t numChars = decode_utf16(tmp, ch);
1467         if ( numChars == wxCONV_FAILED )
1468             return wxCONV_FAILED;
1469
1470         if ( numChars == 2 )
1471             inBuff++;
1472
1473         if ( ++outLen > dstLen )
1474             return wxCONV_FAILED;
1475
1476         *dst++ = ch;
1477     }
1478
1479
1480     return outLen;
1481 }
1482
1483 size_t
1484 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1485                              const wchar_t *src, size_t srcLen) const
1486 {
1487     if ( srcLen == wxNO_LEN )
1488         srcLen = wxWcslen(src) + 1;
1489
1490     size_t outLen = 0;
1491     wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1492     for ( const wchar_t *srcEnd = src + srcLen; src < srcEnd; src++ )
1493     {
1494         wxUint16 cc[2];
1495         const size_t numChars = encode_utf16(*src, cc);
1496         if ( numChars == wxCONV_FAILED )
1497             return wxCONV_FAILED;
1498
1499         outLen += numChars * BYTES_PER_CHAR;
1500         if ( outBuff )
1501         {
1502             if ( outLen > dstLen )
1503                 return wxCONV_FAILED;
1504
1505             *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[0]);
1506             if ( numChars == 2 )
1507             {
1508                 // second character of a surrogate
1509                 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[1]);
1510             }
1511         }
1512     }
1513
1514     return outLen;
1515 }
1516
1517 #endif // WC_UTF16/!WC_UTF16
1518
1519
1520 // ============================================================================
1521 // UTF-32
1522 // ============================================================================
1523
1524 #ifdef WORDS_BIGENDIAN
1525     #define wxMBConvUTF32straight  wxMBConvUTF32BE
1526     #define wxMBConvUTF32swap      wxMBConvUTF32LE
1527 #else
1528     #define wxMBConvUTF32swap      wxMBConvUTF32BE
1529     #define wxMBConvUTF32straight  wxMBConvUTF32LE
1530 #endif
1531
1532
// definitions of the global UTF-32 converter objects, one for each byte order
WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
1535
1536 /* static */
1537 size_t wxMBConvUTF32Base::GetLength(const char *src, size_t srcLen)
1538 {
1539     if ( srcLen == wxNO_LEN )
1540     {
1541         // count the number of bytes in input, including the trailing NULs
1542         const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1543         for ( srcLen = 1; *inBuff++; srcLen++ )
1544             ;
1545
1546         srcLen *= BYTES_PER_CHAR;
1547     }
1548     else // we already have the length
1549     {
1550         // we can only convert an entire number of UTF-32 characters
1551         if ( srcLen % BYTES_PER_CHAR )
1552             return wxCONV_FAILED;
1553     }
1554
1555     return srcLen;
1556 }
1557
1558 // case when in-memory representation is UTF-16
1559 #ifdef WC_UTF16
1560
1561 // ----------------------------------------------------------------------------
1562 // conversions without endianness change
1563 // ----------------------------------------------------------------------------
1564
1565 size_t
1566 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1567                                const char *src, size_t srcLen) const
1568 {
1569     srcLen = GetLength(src, srcLen);
1570     if ( srcLen == wxNO_LEN )
1571         return wxCONV_FAILED;
1572
1573     const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1574     const size_t inLen = srcLen / BYTES_PER_CHAR;
1575     size_t outLen = 0;
1576     for ( size_t n = 0; n < inLen; n++ )
1577     {
1578         wxUint16 cc[2];
1579         const size_t numChars = encode_utf16(*inBuff++, cc);
1580         if ( numChars == wxCONV_FAILED )
1581             return wxCONV_FAILED;
1582
1583         outLen += numChars;
1584         if ( dst )
1585         {
1586             if ( outLen > dstLen )
1587                 return wxCONV_FAILED;
1588
1589             *dst++ = cc[0];
1590             if ( numChars == 2 )
1591             {
1592                 // second character of a surrogate
1593                 *dst++ = cc[1];
1594             }
1595         }
1596     }
1597
1598     return outLen;
1599 }
1600
1601 size_t
1602 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1603                                  const wchar_t *src, size_t srcLen) const
1604 {
1605     if ( srcLen == wxNO_LEN )
1606         srcLen = wxWcslen(src) + 1;
1607
1608     if ( !dst )
1609     {
1610         // optimization: return maximal space which could be needed for this
1611         // string instead of the exact amount which could be less if there are
1612         // any surrogates in the input
1613         //
1614         // we consider that surrogates are rare enough to make it worthwhile to
1615         // avoid running the loop below at the cost of slightly extra memory
1616         // consumption
1617         return srcLen * BYTES_PER_CHAR;
1618     }
1619
1620     wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1621     size_t outLen = 0;
1622     for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1623     {
1624         const wxUint32 ch = wxDecodeSurrogate(&src);
1625         if ( !src )
1626             return wxCONV_FAILED;
1627
1628         outLen += BYTES_PER_CHAR;
1629
1630         if ( outLen > dstLen )
1631             return wxCONV_FAILED;
1632
1633         *outBuff++ = ch;
1634     }
1635
1636     return outLen;
1637 }
1638
1639 // ----------------------------------------------------------------------------
1640 // endian-reversing conversions
1641 // ----------------------------------------------------------------------------
1642
1643 size_t
1644 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1645                            const char *src, size_t srcLen) const
1646 {
1647     srcLen = GetLength(src, srcLen);
1648     if ( srcLen == wxNO_LEN )
1649         return wxCONV_FAILED;
1650
1651     const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1652     const size_t inLen = srcLen / BYTES_PER_CHAR;
1653     size_t outLen = 0;
1654     for ( size_t n = 0; n < inLen; n++, inBuff++ )
1655     {
1656         wxUint16 cc[2];
1657         const size_t numChars = encode_utf16(wxUINT32_SWAP_ALWAYS(*inBuff), cc);
1658         if ( numChars == wxCONV_FAILED )
1659             return wxCONV_FAILED;
1660
1661         outLen += numChars;
1662         if ( dst )
1663         {
1664             if ( outLen > dstLen )
1665                 return wxCONV_FAILED;
1666
1667             *dst++ = cc[0];
1668             if ( numChars == 2 )
1669             {
1670                 // second character of a surrogate
1671                 *dst++ = cc[1];
1672             }
1673         }
1674     }
1675
1676     return outLen;
1677 }
1678
1679 size_t
1680 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1681                              const wchar_t *src, size_t srcLen) const
1682 {
1683     if ( srcLen == wxNO_LEN )
1684         srcLen = wxWcslen(src) + 1;
1685
1686     if ( !dst )
1687     {
1688         // optimization: return maximal space which could be needed for this
1689         // string instead of the exact amount which could be less if there are
1690         // any surrogates in the input
1691         //
1692         // we consider that surrogates are rare enough to make it worthwhile to
1693         // avoid running the loop below at the cost of slightly extra memory
1694         // consumption
1695         return srcLen*BYTES_PER_CHAR;
1696     }
1697
1698     wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1699     size_t outLen = 0;
1700     for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1701     {
1702         const wxUint32 ch = wxDecodeSurrogate(&src);
1703         if ( !src )
1704             return wxCONV_FAILED;
1705
1706         outLen += BYTES_PER_CHAR;
1707
1708         if ( outLen > dstLen )
1709             return wxCONV_FAILED;
1710
1711         *outBuff++ = wxUINT32_SWAP_ALWAYS(ch);
1712     }
1713
1714     return outLen;
1715 }
1716
1717 #else // !WC_UTF16: wchar_t is UTF-32
1718
1719 // ----------------------------------------------------------------------------
1720 // conversions without endianness change
1721 // ----------------------------------------------------------------------------
1722
1723 size_t
1724 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1725                                const char *src, size_t srcLen) const
1726 {
1727     // use memcpy() as it should be much faster than hand-written loop
1728     srcLen = GetLength(src, srcLen);
1729     if ( srcLen == wxNO_LEN )
1730         return wxCONV_FAILED;
1731
1732     const size_t inLen = srcLen/BYTES_PER_CHAR;
1733     if ( dst )
1734     {
1735         if ( dstLen < inLen )
1736             return wxCONV_FAILED;
1737
1738         memcpy(dst, src, srcLen);
1739     }
1740
1741     return inLen;
1742 }
1743
1744 size_t
1745 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1746                                  const wchar_t *src, size_t srcLen) const
1747 {
1748     if ( srcLen == wxNO_LEN )
1749         srcLen = wxWcslen(src) + 1;
1750
1751     srcLen *= BYTES_PER_CHAR;
1752
1753     if ( dst )
1754     {
1755         if ( dstLen < srcLen )
1756             return wxCONV_FAILED;
1757
1758         memcpy(dst, src, srcLen);
1759     }
1760
1761     return srcLen;
1762 }
1763
1764 // ----------------------------------------------------------------------------
1765 // endian-reversing conversions
1766 // ----------------------------------------------------------------------------
1767
1768 size_t
1769 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1770                            const char *src, size_t srcLen) const
1771 {
1772     srcLen = GetLength(src, srcLen);
1773     if ( srcLen == wxNO_LEN )
1774         return wxCONV_FAILED;
1775
1776     srcLen /= BYTES_PER_CHAR;
1777
1778     if ( dst )
1779     {
1780         if ( dstLen < srcLen )
1781             return wxCONV_FAILED;
1782
1783         const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1784         for ( size_t n = 0; n < srcLen; n++, inBuff++ )
1785         {
1786             *dst++ = wxUINT32_SWAP_ALWAYS(*inBuff);
1787         }
1788     }
1789
1790     return srcLen;
1791 }
1792
1793 size_t
1794 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1795                              const wchar_t *src, size_t srcLen) const
1796 {
1797     if ( srcLen == wxNO_LEN )
1798         srcLen = wxWcslen(src) + 1;
1799
1800     srcLen *= BYTES_PER_CHAR;
1801
1802     if ( dst )
1803     {
1804         if ( dstLen < srcLen )
1805             return wxCONV_FAILED;
1806
1807         wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1808         for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
1809         {
1810             *outBuff++ = wxUINT32_SWAP_ALWAYS(*src);
1811         }
1812     }
1813
1814     return srcLen;
1815 }
1816
1817 #endif // WC_UTF16/!WC_UTF16
1818
1819
1820 // ============================================================================
1821 // The classes doing conversion using the iconv_xxx() functions
1822 // ============================================================================
1823
1824 #ifdef HAVE_ICONV
1825
1826 // VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
1827 //     E2BIG if output buffer is _exactly_ as big as needed. Such case is
1828 //     (unless there's yet another bug in glibc) the only case when iconv()
1829 //     returns with (size_t)-1 (which means error) and says there are 0 bytes
1830 //     left in the input buffer -- when _real_ error occurs,
1831 //     bytes-left-in-input buffer is non-zero. Hence, this alternative test for
1832 //     iconv() failure.
1833 //     [This bug does not appear in glibc 2.2.]
1834 #if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
1835 #define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
1836                                      (errno != E2BIG || bufLeft != 0))
1837 #else
1838 #define ICONV_FAILED(cres, bufLeft)  (cres == (size_t)-1)
1839 #endif
1840
1841 #define ICONV_CHAR_CAST(x)  ((ICONV_CONST char **)(x))
1842
1843 #define ICONV_T_INVALID ((iconv_t)-1)
1844
1845 #if SIZEOF_WCHAR_T == 4
1846     #define WC_BSWAP    wxUINT32_SWAP_ALWAYS
1847     #define WC_ENC      wxFONTENCODING_UTF32
1848 #elif SIZEOF_WCHAR_T == 2
1849     #define WC_BSWAP    wxUINT16_SWAP_ALWAYS
1850     #define WC_ENC      wxFONTENCODING_UTF16
1851 #else // sizeof(wchar_t) != 2 nor 4
1852     // does this ever happen?
1853     #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
1854 #endif
1855
1856 // ----------------------------------------------------------------------------
1857 // wxMBConv_iconv: encapsulates an iconv character set
1858 // ----------------------------------------------------------------------------
1859
// wxMBConv implementation converting between the multibyte encoding with the
// given name and wchar_t using a pair of iconv conversion descriptors, one
// for each direction.
class wxMBConv_iconv : public wxMBConv
{
public:
    // name is the name of the multibyte encoding, as passed to iconv_open();
    // use IsOk() to check whether the conversions could be set up
    wxMBConv_iconv(const char *name);
    virtual ~wxMBConv_iconv();

    virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const;
    virtual size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const;

    // classify this encoding as explained in wxMBConv::GetMBNulLen() comment
    virtual size_t GetMBNulLen() const;

#if wxUSE_UNICODE_UTF8
    virtual bool IsUTF8() const;
#endif

    // create a new converter for the same encoding name, copying the cached
    // m_minMBCharWidth value to it
    virtual wxMBConv *Clone() const
    {
        wxMBConv_iconv *p = new wxMBConv_iconv(m_name.ToAscii());
        p->m_minMBCharWidth = m_minMBCharWidth;
        return p;
    }

    // true only if the descriptors for both conversion directions are valid
    bool IsOk() const
        { return (m2w != ICONV_T_INVALID) && (w2m != ICONV_T_INVALID); }

protected:
    // the iconv handlers used to translate from multibyte
    // to wide char and in the other direction
    iconv_t m2w,
            w2m;

#if wxUSE_THREADS
    // guards access to m2w and w2m objects
    wxMutex m_iconvMutex;
#endif

private:
    // the name (for iconv_open()) of a wide char charset -- if none is
    // available on this machine, it will remain empty
    static wxString ms_wcCharsetName;

    // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
    // different endian-ness than the native one
    static bool ms_wcNeedsSwap;


    // name of the encoding handled by this conversion
    wxString m_name;

    // cached result of GetMBNulLen(); set to 0 meaning "unknown"
    // initially
    size_t m_minMBCharWidth;
};
1914
1915 // make the constructor available for unit testing
1916 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_iconv( const char* name )
1917 {
1918     wxMBConv_iconv* result = new wxMBConv_iconv( name );
1919     if ( !result->IsOk() )
1920     {
1921         delete result;
1922         return 0;
1923     }
1924
1925     return result;
1926 }
1927
1928 wxString wxMBConv_iconv::ms_wcCharsetName;
1929 bool wxMBConv_iconv::ms_wcNeedsSwap = false;
1930
1931 wxMBConv_iconv::wxMBConv_iconv(const char *name)
1932               : m_name(name)
1933 {
1934     m_minMBCharWidth = 0;
1935
1936     // check for charset that represents wchar_t:
1937     if ( ms_wcCharsetName.empty() )
1938     {
1939         wxLogTrace(TRACE_STRCONV, _T("Looking for wide char codeset:"));
1940
1941 #if wxUSE_FONTMAP
1942         const wxChar **names = wxFontMapperBase::GetAllEncodingNames(WC_ENC);
1943 #else // !wxUSE_FONTMAP
1944         static const wxChar *names_static[] =
1945         {
1946 #if SIZEOF_WCHAR_T == 4
1947             _T("UCS-4"),
1948 #elif SIZEOF_WCHAR_T = 2
1949             _T("UCS-2"),
1950 #endif
1951             NULL
1952         };
1953         const wxChar **names = names_static;
1954 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
1955
1956         for ( ; *names && ms_wcCharsetName.empty(); ++names )
1957         {
1958             const wxString nameCS(*names);
1959
1960             // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
1961             wxString nameXE(nameCS);
1962
1963 #ifdef WORDS_BIGENDIAN
1964                 nameXE += _T("BE");
1965 #else // little endian
1966                 nameXE += _T("LE");
1967 #endif
1968
1969             wxLogTrace(TRACE_STRCONV, _T("  trying charset \"%s\""),
1970                        nameXE.c_str());
1971
1972             m2w = iconv_open(nameXE.ToAscii(), name);
1973             if ( m2w == ICONV_T_INVALID )
1974             {
1975                 // try charset w/o bytesex info (e.g. "UCS4")
1976                 wxLogTrace(TRACE_STRCONV, _T("  trying charset \"%s\""),
1977                            nameCS.c_str());
1978                 m2w = iconv_open(nameCS.ToAscii(), name);
1979
1980                 // and check for bytesex ourselves:
1981                 if ( m2w != ICONV_T_INVALID )
1982                 {
1983                     char    buf[2], *bufPtr;
1984                     wchar_t wbuf[2], *wbufPtr;
1985                     size_t  insz, outsz;
1986                     size_t  res;
1987
1988                     buf[0] = 'A';
1989                     buf[1] = 0;
1990                     wbuf[0] = 0;
1991                     insz = 2;
1992                     outsz = SIZEOF_WCHAR_T * 2;
1993                     wbufPtr = wbuf;
1994                     bufPtr = buf;
1995
1996                     res = iconv(
1997                         m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
1998                         (char**)&wbufPtr, &outsz);
1999
2000                     if (ICONV_FAILED(res, insz))
2001                     {
2002                         wxLogLastError(wxT("iconv"));
2003                         wxLogError(_("Conversion to charset '%s' doesn't work."),
2004                                    nameCS.c_str());
2005                     }
2006                     else // ok, can convert to this encoding, remember it
2007                     {
2008                         ms_wcCharsetName = nameCS;
2009                         ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
2010                     }
2011                 }
2012             }
2013             else // use charset not requiring byte swapping
2014             {
2015                 ms_wcCharsetName = nameXE;
2016             }
2017         }
2018
2019         wxLogTrace(TRACE_STRCONV,
2020                    wxT("iconv wchar_t charset is \"%s\"%s"),
2021                    ms_wcCharsetName.empty() ? wxString("<none>")
2022                                             : ms_wcCharsetName,
2023                    ms_wcNeedsSwap ? _T(" (needs swap)")
2024                                   : _T(""));
2025     }
2026     else // we already have ms_wcCharsetName
2027     {
2028         m2w = iconv_open(ms_wcCharsetName.ToAscii(), name);
2029     }
2030
2031     if ( ms_wcCharsetName.empty() )
2032     {
2033         w2m = ICONV_T_INVALID;
2034     }
2035     else
2036     {
2037         w2m = iconv_open(name, ms_wcCharsetName.ToAscii());
2038         if ( w2m == ICONV_T_INVALID )
2039         {
2040             wxLogTrace(TRACE_STRCONV,
2041                        wxT("\"%s\" -> \"%s\" works but not the converse!?"),
2042                        ms_wcCharsetName.c_str(), name);
2043         }
2044     }
2045 }
2046
2047 wxMBConv_iconv::~wxMBConv_iconv()
2048 {
2049     if ( m2w != ICONV_T_INVALID )
2050         iconv_close(m2w);
2051     if ( w2m != ICONV_T_INVALID )
2052         iconv_close(w2m);
2053 }
2054
2055 size_t wxMBConv_iconv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
2056 {
2057     // find the string length: notice that must be done differently for
2058     // NUL-terminated strings and UTF-16/32 which are terminated with 2/4 NULs
2059     size_t inbuf;
2060     const size_t nulLen = GetMBNulLen();
2061     switch ( nulLen )
2062     {
2063         default:
2064             return wxCONV_FAILED;
2065
2066         case 1:
2067             inbuf = strlen(psz); // arguably more optimized than our version
2068             break;
2069
2070         case 2:
2071         case 4:
2072             // for UTF-16/32 not only we need to have 2/4 consecutive NULs but
2073             // they also have to start at character boundary and not span two
2074             // adjacent characters
2075             const char *p;
2076             for ( p = psz; NotAllNULs(p, nulLen); p += nulLen )
2077                 ;
2078             inbuf = p - psz;
2079             break;
2080     }
2081
2082 #if wxUSE_THREADS
2083     // NB: iconv() is MT-safe, but each thread must use its own iconv_t handle.
2084     //     Unfortunately there are a couple of global wxCSConv objects such as
2085     //     wxConvLocal that are used all over wx code, so we have to make sure
2086     //     the handle is used by at most one thread at the time. Otherwise
2087     //     only a few wx classes would be safe to use from non-main threads
2088     //     as MB<->WC conversion would fail "randomly".
2089     wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
2090 #endif // wxUSE_THREADS
2091
2092     size_t outbuf = n * SIZEOF_WCHAR_T;
2093     size_t res, cres;
2094     // VS: Use these instead of psz, buf because iconv() modifies its arguments:
2095     wchar_t *bufPtr = buf;
2096     const char *pszPtr = psz;
2097
2098     if (buf)
2099     {
2100         // have destination buffer, convert there
2101         cres = iconv(m2w,
2102                      ICONV_CHAR_CAST(&pszPtr), &inbuf,
2103                      (char**)&bufPtr, &outbuf);
2104         res = n - (outbuf / SIZEOF_WCHAR_T);
2105
2106         if (ms_wcNeedsSwap)
2107         {
2108             // convert to native endianness
2109             for ( unsigned i = 0; i < res; i++ )
2110                 buf[n] = WC_BSWAP(buf[i]);
2111         }
2112
2113         // NUL-terminate the string if there is any space left
2114         if (res < n)
2115             buf[res] = 0;
2116     }
2117     else
2118     {
2119         // no destination buffer... convert using temp buffer
2120         // to calculate destination buffer requirement
2121         wchar_t tbuf[8];
2122         res = 0;
2123
2124         do
2125         {
2126             bufPtr = tbuf;
2127             outbuf = 8 * SIZEOF_WCHAR_T;
2128
2129             cres = iconv(m2w,
2130                          ICONV_CHAR_CAST(&pszPtr), &inbuf,
2131                          (char**)&bufPtr, &outbuf );
2132
2133             res += 8 - (outbuf / SIZEOF_WCHAR_T);
2134         }
2135         while ((cres == (size_t)-1) && (errno == E2BIG));
2136     }
2137
2138     if (ICONV_FAILED(cres, inbuf))
2139     {
2140         //VS: it is ok if iconv fails, hence trace only
2141         wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
2142         return wxCONV_FAILED;
2143     }
2144
2145     return res;
2146 }
2147
2148 size_t wxMBConv_iconv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
2149 {
2150 #if wxUSE_THREADS
2151     // NB: explained in MB2WC
2152     wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
2153 #endif
2154
2155     size_t inlen = wxWcslen(psz);
2156     size_t inbuf = inlen * SIZEOF_WCHAR_T;
2157     size_t outbuf = n;
2158     size_t res, cres;
2159
2160     wchar_t *tmpbuf = 0;
2161
2162     if (ms_wcNeedsSwap)
2163     {
2164         // need to copy to temp buffer to switch endianness
2165         // (doing WC_BSWAP twice on the original buffer won't help, as it
2166         //  could be in read-only memory, or be accessed in some other thread)
2167         tmpbuf = (wchar_t *)malloc(inbuf + SIZEOF_WCHAR_T);
2168         for ( size_t i = 0; i < inlen; i++ )
2169             tmpbuf[n] = WC_BSWAP(psz[i]);
2170
2171         tmpbuf[inlen] = L'\0';
2172         psz = tmpbuf;
2173     }
2174
2175     if (buf)
2176     {
2177         // have destination buffer, convert there
2178         cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
2179
2180         res = n - outbuf;
2181
2182         // NB: iconv was given only wcslen(psz) characters on input, and so
2183         //     it couldn't convert the trailing zero. Let's do it ourselves
2184         //     if there's some room left for it in the output buffer.
2185         if (res < n)
2186             buf[0] = 0;
2187     }
2188     else
2189     {
2190         // no destination buffer: convert using temp buffer
2191         // to calculate destination buffer requirement
2192         char tbuf[16];
2193         res = 0;
2194         do
2195         {
2196             buf = tbuf;
2197             outbuf = 16;
2198
2199             cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
2200
2201             res += 16 - outbuf;
2202         }
2203         while ((cres == (size_t)-1) && (errno == E2BIG));
2204     }
2205
2206     if (ms_wcNeedsSwap)
2207     {
2208         free(tmpbuf);
2209     }
2210
2211     if (ICONV_FAILED(cres, inbuf))
2212     {
2213         wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
2214         return wxCONV_FAILED;
2215     }
2216
2217     return res;
2218 }
2219
2220 size_t wxMBConv_iconv::GetMBNulLen() const
2221 {
2222     if ( m_minMBCharWidth == 0 )
2223     {
2224         wxMBConv_iconv * const self = wxConstCast(this, wxMBConv_iconv);
2225
2226 #if wxUSE_THREADS
2227         // NB: explained in MB2WC
2228         wxMutexLocker lock(self->m_iconvMutex);
2229 #endif
2230
2231         const wchar_t *wnul = L"";
2232         char buf[8]; // should be enough for NUL in any encoding
2233         size_t inLen = sizeof(wchar_t),
2234                outLen = WXSIZEOF(buf);
2235         char *inBuff = (char *)wnul;
2236         char *outBuff = buf;
2237         if ( iconv(w2m, ICONV_CHAR_CAST(&inBuff), &inLen, &outBuff, &outLen) == (size_t)-1 )
2238         {
2239             self->m_minMBCharWidth = (size_t)-1;
2240         }
2241         else // ok
2242         {
2243             self->m_minMBCharWidth = outBuff - buf;
2244         }
2245     }
2246
2247     return m_minMBCharWidth;
2248 }
2249
#if wxUSE_UNICODE_UTF8
// Check whether the charset name this converter was created with is one of
// the two spellings of UTF-8 recognized here, ignoring the case.
bool wxMBConv_iconv::IsUTF8() const
{
    if ( wxStricmp(m_name, "UTF-8") == 0 )
        return true;

    return wxStricmp(m_name, "UTF8") == 0;
}
#endif
2257
2258 #endif // HAVE_ICONV
2259
2260
2261 // ============================================================================
2262 // Win32 conversion classes
2263 // ============================================================================
2264
2265 #ifdef wxHAVE_WIN32_MB2WC
2266
2267 // from utils.cpp
2268 #if wxUSE_FONTMAP
2269 extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const char *charset);
2270 extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
2271 #endif
2272
2273 class wxMBConv_win32 : public wxMBConv
2274 {
2275 public:
2276     wxMBConv_win32()
2277     {
2278         m_CodePage = CP_ACP;
2279         m_minMBCharWidth = 0;
2280     }
2281
2282     wxMBConv_win32(const wxMBConv_win32& conv)
2283         : wxMBConv()
2284     {
2285         m_CodePage = conv.m_CodePage;
2286         m_minMBCharWidth = conv.m_minMBCharWidth;
2287     }
2288
2289 #if wxUSE_FONTMAP
2290     wxMBConv_win32(const char* name)
2291     {
2292         m_CodePage = wxCharsetToCodepage(name);
2293         m_minMBCharWidth = 0;
2294     }
2295
2296     wxMBConv_win32(wxFontEncoding encoding)
2297     {
2298         m_CodePage = wxEncodingToCodepage(encoding);
2299         m_minMBCharWidth = 0;
2300     }
2301 #endif // wxUSE_FONTMAP
2302
2303     virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2304     {
2305         // note that we have to use MB_ERR_INVALID_CHARS flag as it without it
2306         // the behaviour is not compatible with the Unix version (using iconv)
2307         // and break the library itself, e.g. wxTextInputStream::NextChar()
2308         // wouldn't work if reading an incomplete MB char didn't result in an
2309         // error
2310         //
2311         // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
2312         // Win XP or newer and it is not supported for UTF-[78] so we always
2313         // use our own conversions in this case. See
2314         //     http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
2315         //     http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
2316         if ( m_CodePage == CP_UTF8 )
2317         {
2318             return wxMBConvUTF8().MB2WC(buf, psz, n);
2319         }
2320
2321         if ( m_CodePage == CP_UTF7 )
2322         {
2323             return wxMBConvUTF7().MB2WC(buf, psz, n);
2324         }
2325
2326         int flags = 0;
2327         if ( (m_CodePage < 50000 && m_CodePage != CP_SYMBOL) &&
2328                 IsAtLeastWin2kSP4() )
2329         {
2330             flags = MB_ERR_INVALID_CHARS;
2331         }
2332
2333         const size_t len = ::MultiByteToWideChar
2334                              (
2335                                 m_CodePage,     // code page
2336                                 flags,          // flags: fall on error
2337                                 psz,            // input string
2338                                 -1,             // its length (NUL-terminated)
2339                                 buf,            // output string
2340                                 buf ? n : 0     // size of output buffer
2341                              );
2342         if ( !len )
2343         {
2344             // function totally failed
2345             return wxCONV_FAILED;
2346         }
2347
2348         // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
2349         // check if we succeeded, by doing a double trip:
2350         if ( !flags && buf )
2351         {
2352             const size_t mbLen = strlen(psz);
2353             wxCharBuffer mbBuf(mbLen);
2354             if ( ::WideCharToMultiByte
2355                    (
2356                       m_CodePage,
2357                       0,
2358                       buf,
2359                       -1,
2360                       mbBuf.data(),
2361                       mbLen + 1,        // size in bytes, not length
2362                       NULL,
2363                       NULL
2364                    ) == 0 ||
2365                   strcmp(mbBuf, psz) != 0 )
2366             {
2367                 // we didn't obtain the same thing we started from, hence
2368                 // the conversion was lossy and we consider that it failed
2369                 return wxCONV_FAILED;
2370             }
2371         }
2372
2373         // note that it returns count of written chars for buf != NULL and size
2374         // of the needed buffer for buf == NULL so in either case the length of
2375         // the string (which never includes the terminating NUL) is one less
2376         return len - 1;
2377     }
2378
2379     virtual size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
2380     {
2381         /*
2382             we have a problem here: by default, WideCharToMultiByte() may
2383             replace characters unrepresentable in the target code page with bad
2384             quality approximations such as turning "1/2" symbol (U+00BD) into
2385             "1" for the code pages which don't have it and we, obviously, want
2386             to avoid this at any price
2387
2388             the trouble is that this function does it _silently_, i.e. it won't
2389             even tell us whether it did or not... Win98/2000 and higher provide
2390             WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
2391             we have to resort to a round trip, i.e. check that converting back
2392             results in the same string -- this is, of course, expensive but
2393             otherwise we simply can't be sure to not garble the data.
2394          */
2395
2396         // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
2397         // it doesn't work with CJK encodings (which we test for rather roughly
2398         // here...) nor with UTF-7/8 nor, of course, with Windows versions not
2399         // supporting it
2400         BOOL usedDef wxDUMMY_INITIALIZE(false);
2401         BOOL *pUsedDef;
2402         int flags;
2403         if ( CanUseNoBestFit() && m_CodePage < 50000 )
2404         {
2405             // it's our lucky day
2406             flags = WC_NO_BEST_FIT_CHARS;
2407             pUsedDef = &usedDef;
2408         }
2409         else // old system or unsupported encoding
2410         {
2411             flags = 0;
2412             pUsedDef = NULL;
2413         }
2414
2415         const size_t len = ::WideCharToMultiByte
2416                              (
2417                                 m_CodePage,     // code page
2418                                 flags,          // either none or no best fit
2419                                 pwz,            // input string
2420                                 -1,             // it is (wide) NUL-terminated
2421                                 buf,            // output buffer
2422                                 buf ? n : 0,    // and its size
2423                                 NULL,           // default "replacement" char
2424                                 pUsedDef        // [out] was it used?
2425                              );
2426
2427         if ( !len )
2428         {
2429             // function totally failed
2430             return wxCONV_FAILED;
2431         }
2432
2433         // we did something, check if we really succeeded
2434         if ( flags )
2435         {
2436             // check if the conversion failed, i.e. if any replacements
2437             // were done
2438             if ( usedDef )
2439                 return wxCONV_FAILED;
2440         }
2441         else // we must resort to double tripping...
2442         {
2443             // first we need to ensure that we really have the MB data: this is
2444             // not the case if we're called with NULL buffer, in which case we
2445             // need to do the conversion yet again
2446             wxCharBuffer bufDef;
2447             if ( !buf )
2448             {
2449                 bufDef = wxCharBuffer(len);
2450                 buf = bufDef.data();
2451                 if ( !::WideCharToMultiByte(m_CodePage, flags, pwz, -1,
2452                                             buf, len, NULL, NULL) )
2453                     return wxCONV_FAILED;
2454             }
2455
2456             if ( !n )
2457                 n = wcslen(pwz);
2458             wxWCharBuffer wcBuf(n);
2459             if ( MB2WC(wcBuf.data(), buf, n + 1) == wxCONV_FAILED ||
2460                     wcscmp(wcBuf, pwz) != 0 )
2461             {
2462                 // we didn't obtain the same thing we started from, hence
2463                 // the conversion was lossy and we consider that it failed
2464                 return wxCONV_FAILED;
2465             }
2466         }
2467
2468         // see the comment above for the reason of "len - 1"
2469         return len - 1;
2470     }
2471
2472     virtual size_t GetMBNulLen() const
2473     {
2474         if ( m_minMBCharWidth == 0 )
2475         {
2476             int len = ::WideCharToMultiByte
2477                         (
2478                             m_CodePage,     // code page
2479                             0,              // no flags
2480                             L"",            // input string
2481                             1,              // translate just the NUL
2482                             NULL,           // output buffer
2483                             0,              // and its size
2484                             NULL,           // no replacement char
2485                             NULL            // [out] don't care if it was used
2486                         );
2487
2488             wxMBConv_win32 * const self = wxConstCast(this, wxMBConv_win32);
2489             switch ( len )
2490             {
2491                 default:
2492                     wxLogDebug(_T("Unexpected NUL length %d"), len);
2493                     self->m_minMBCharWidth = (size_t)-1;
2494                     break;
2495
2496                 case 0:
2497                     self->m_minMBCharWidth = (size_t)-1;
2498                     break;
2499
2500                 case 1:
2501                 case 2:
2502                 case 4:
2503                     self->m_minMBCharWidth = len;
2504                     break;
2505             }
2506         }
2507
2508         return m_minMBCharWidth;
2509     }
2510
2511     virtual wxMBConv *Clone() const { return new wxMBConv_win32(*this); }
2512
2513     bool IsOk() const { return m_CodePage != -1; }
2514
2515 private:
2516     static bool CanUseNoBestFit()
2517     {
2518         static int s_isWin98Or2k = -1;
2519
2520         if ( s_isWin98Or2k == -1 )
2521         {
2522             int verMaj, verMin;
2523             switch ( wxGetOsVersion(&verMaj, &verMin) )
2524             {
2525                 case wxOS_WINDOWS_9X:
2526                     s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
2527                     break;
2528
2529                 case wxOS_WINDOWS_NT:
2530                     s_isWin98Or2k = verMaj >= 5;
2531                     break;
2532
2533                 default:
2534                     // unknown: be conservative by default
2535                     s_isWin98Or2k = 0;
2536                     break;
2537             }
2538
2539             wxASSERT_MSG( s_isWin98Or2k != -1, _T("should be set above") );
2540         }
2541
2542         return s_isWin98Or2k == 1;
2543     }
2544
2545     static bool IsAtLeastWin2kSP4()
2546     {
2547 #ifdef __WXWINCE__
2548         return false;
2549 #else
2550         static int s_isAtLeastWin2kSP4 = -1;
2551
2552         if ( s_isAtLeastWin2kSP4 == -1 )
2553         {
2554             OSVERSIONINFOEX ver;
2555
2556             memset(&ver, 0, sizeof(ver));
2557             ver.dwOSVersionInfoSize = sizeof(ver);
2558             GetVersionEx((OSVERSIONINFO*)&ver);
2559
2560             s_isAtLeastWin2kSP4 =
2561               ((ver.dwMajorVersion > 5) || // Vista+
2562                (ver.dwMajorVersion == 5 && ver.dwMinorVersion > 0) || // XP/2003
2563                (ver.dwMajorVersion == 5 && ver.dwMinorVersion == 0 &&
2564                ver.wServicePackMajor >= 4)) // 2000 SP4+
2565               ? 1 : 0;
2566         }
2567
2568         return s_isAtLeastWin2kSP4 == 1;
2569 #endif
2570     }
2571
2572
2573     // the code page we're working with
2574     long m_CodePage;
2575
2576     // cached result of GetMBNulLen(), set to 0 initially meaning
2577     // "unknown"
2578     size_t m_minMBCharWidth;
2579 };
2580
2581 #endif // wxHAVE_WIN32_MB2WC
2582
2583
2584 // ============================================================================
2585 // wxEncodingConverter based conversion classes
2586 // ============================================================================
2587
2588 #if wxUSE_FONTMAP
2589
2590 class wxMBConv_wxwin : public wxMBConv
2591 {
2592 private:
2593     void Init()
2594     {
2595         // Refuse to use broken wxEncodingConverter code for Mac-specific encodings.
2596         // The wxMBConv_cf class does a better job.
2597         m_ok = (m_enc < wxFONTENCODING_MACMIN || m_enc > wxFONTENCODING_MACMAX) &&
2598                m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
2599                w2m.Init(wxFONTENCODING_UNICODE, m_enc);
2600     }
2601
2602 public:
2603     // temporarily just use wxEncodingConverter stuff,
2604     // so that it works while a better implementation is built
2605     wxMBConv_wxwin(const char* name)
2606     {
2607         if (name)
2608             m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
2609         else
2610             m_enc = wxFONTENCODING_SYSTEM;
2611
2612         Init();
2613     }
2614
2615     wxMBConv_wxwin(wxFontEncoding enc)
2616     {
2617         m_enc = enc;
2618
2619         Init();
2620     }
2621
2622     size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
2623     {
2624         size_t inbuf = strlen(psz);
2625         if (buf)
2626         {
2627             if (!m2w.Convert(psz, buf))
2628                 return wxCONV_FAILED;
2629         }
2630         return inbuf;
2631     }
2632
2633     size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
2634     {
2635         const size_t inbuf = wxWcslen(psz);
2636         if (buf)
2637         {
2638             if (!w2m.Convert(psz, buf))
2639                 return wxCONV_FAILED;
2640         }
2641
2642         return inbuf;
2643     }
2644
2645     virtual size_t GetMBNulLen() const
2646     {
2647         switch ( m_enc )
2648         {
2649             case wxFONTENCODING_UTF16BE:
2650             case wxFONTENCODING_UTF16LE:
2651                 return 2;
2652
2653             case wxFONTENCODING_UTF32BE:
2654             case wxFONTENCODING_UTF32LE:
2655                 return 4;
2656
2657             default:
2658                 return 1;
2659         }
2660     }
2661
2662     virtual wxMBConv *Clone() const { return new wxMBConv_wxwin(m_enc); }
2663
2664     bool IsOk() const { return m_ok; }
2665
2666 public:
2667     wxFontEncoding m_enc;
2668     wxEncodingConverter m2w, w2m;
2669
2670 private:
2671     // were we initialized successfully?
2672     bool m_ok;
2673
2674     DECLARE_NO_COPY_CLASS(wxMBConv_wxwin)
2675 };
2676
2677 // make the constructors available for unit testing
2678 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_wxwin( const char* name )
2679 {
2680     wxMBConv_wxwin* result = new wxMBConv_wxwin( name );
2681     if ( !result->IsOk() )
2682     {
2683         delete result;
2684         return 0;
2685     }
2686
2687     return result;
2688 }
2689
2690 #endif // wxUSE_FONTMAP
2691
2692 // ============================================================================
2693 // wxCSConv implementation
2694 // ============================================================================
2695
2696 void wxCSConv::Init()
2697 {
2698     m_name = NULL;
2699     m_convReal =  NULL;
2700     m_deferred = true;
2701 }
2702
2703 wxCSConv::wxCSConv(const wxString& charset)
2704 {
2705     Init();
2706
2707     if ( !charset.empty() )
2708     {
2709         SetName(charset.ToAscii());
2710     }
2711
2712 #if wxUSE_FONTMAP
2713     m_encoding = wxFontMapperBase::GetEncodingFromName(charset);
2714 #else
2715     m_encoding = wxFONTENCODING_SYSTEM;
2716 #endif
2717 }
2718
2719 wxCSConv::wxCSConv(wxFontEncoding encoding)
2720 {
2721     if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
2722     {
2723         wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
2724
2725         encoding = wxFONTENCODING_SYSTEM;
2726     }
2727
2728     Init();
2729
2730     m_encoding = encoding;
2731 }
2732
2733 wxCSConv::~wxCSConv()
2734 {
2735     Clear();
2736 }
2737
2738 wxCSConv::wxCSConv(const wxCSConv& conv)
2739         : wxMBConv()
2740 {
2741     Init();
2742
2743     SetName(conv.m_name);
2744     m_encoding = conv.m_encoding;
2745 }
2746
2747 wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
2748 {
2749     Clear();
2750
2751     SetName(conv.m_name);
2752     m_encoding = conv.m_encoding;
2753
2754     return *this;
2755 }
2756
2757 void wxCSConv::Clear()
2758 {
2759     free(m_name);
2760     delete m_convReal;
2761
2762     m_name = NULL;
2763     m_convReal = NULL;
2764 }
2765
2766 void wxCSConv::SetName(const char *charset)
2767 {
2768     if (charset)
2769     {
2770         m_name = wxStrdup(charset);
2771         m_deferred = true;
2772     }
2773 }
2774
#if wxUSE_FONTMAP

// hash map type associating a string with each font encoding
WX_DECLARE_HASH_MAP( wxFontEncoding, wxString, wxIntegerHash, wxIntegerEqual,
                     wxEncodingNameCache );

// file-wide cache of charset names indexed by encoding; used by
// wxCSConv::DoCreate() to remember the name which worked with iconv for an
// encoding (an empty string is stored to remember a failure)
static wxEncodingNameCache gs_nameCache;
#endif
2782
2783 wxMBConv *wxCSConv::DoCreate() const
2784 {
2785 #if wxUSE_FONTMAP
2786     wxLogTrace(TRACE_STRCONV,
2787                wxT("creating conversion for %s"),
2788                (m_name ? m_name
2789                        : (const char*)wxFontMapperBase::GetEncodingName(m_encoding).mb_str()));
2790 #endif // wxUSE_FONTMAP
2791
2792     // check for the special case of ASCII or ISO8859-1 charset: as we have
2793     // special knowledge of it anyhow, we don't need to create a special
2794     // conversion object
2795     if ( m_encoding == wxFONTENCODING_ISO8859_1 ||
2796             m_encoding == wxFONTENCODING_DEFAULT )
2797     {
2798         // don't convert at all
2799         return NULL;
2800     }
2801
2802     // we trust OS to do conversion better than we can so try external
2803     // conversion methods first
2804     //
2805     // the full order is:
2806     //      1. OS conversion (iconv() under Unix or Win32 API)
2807     //      2. hard coded conversions for UTF
2808     //      3. wxEncodingConverter as fall back
2809
2810     // step (1)
2811 #ifdef HAVE_ICONV
2812 #if !wxUSE_FONTMAP
2813     if ( m_name )
2814 #endif // !wxUSE_FONTMAP
2815     {
2816 #if wxUSE_FONTMAP
2817         wxFontEncoding encoding(m_encoding);
2818 #endif
2819
2820         if ( m_name )
2821         {
2822             wxMBConv_iconv *conv = new wxMBConv_iconv(m_name);
2823             if ( conv->IsOk() )
2824                 return conv;
2825
2826             delete conv;
2827
2828 #if wxUSE_FONTMAP
2829             encoding =
2830                 wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
2831 #endif // wxUSE_FONTMAP
2832         }
2833 #if wxUSE_FONTMAP
2834         {
2835             const wxEncodingNameCache::iterator it = gs_nameCache.find(encoding);
2836             if ( it != gs_nameCache.end() )
2837             {
2838                 if ( it->second.empty() )
2839                     return NULL;
2840
2841                 wxMBConv_iconv *conv = new wxMBConv_iconv(it->second.ToAscii());
2842                 if ( conv->IsOk() )
2843                     return conv;
2844
2845                 delete conv;
2846             }
2847
2848             const wxChar** names = wxFontMapperBase::GetAllEncodingNames(encoding);
2849             // CS : in case this does not return valid names (eg for MacRoman)
2850             // encoding got a 'failure' entry in the cache all the same,
2851             // although it just has to be created using a different method, so
2852             // only store failed iconv creation attempts (or perhaps we
2853             // shoulnd't do this at all ?)
2854             if ( names[0] != NULL )
2855             {
2856                 for ( ; *names; ++names )
2857                 {
2858                     // FIXME-UTF8: wxFontMapperBase::GetAllEncodingNames()
2859                     //             will need changes that will obsolete this
2860                     wxString name(*names);
2861                     wxMBConv_iconv *conv = new wxMBConv_iconv(name.ToAscii());
2862                     if ( conv->IsOk() )
2863                     {
2864                         gs_nameCache[encoding] = *names;
2865                         return conv;
2866                     }
2867
2868                     delete conv;
2869                 }
2870
2871                 gs_nameCache[encoding] = _T(""); // cache the failure
2872             }
2873         }
2874 #endif // wxUSE_FONTMAP
2875     }
2876 #endif // HAVE_ICONV
2877
2878 #ifdef wxHAVE_WIN32_MB2WC
2879     {
2880 #if wxUSE_FONTMAP
2881         wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
2882                                       : new wxMBConv_win32(m_encoding);
2883         if ( conv->IsOk() )
2884             return conv;
2885
2886         delete conv;
2887 #else
2888         return NULL;
2889 #endif
2890     }
2891 #endif // wxHAVE_WIN32_MB2WC
2892
2893 #ifdef __DARWIN__
2894     {
2895         // leave UTF16 and UTF32 to the built-ins of wx
2896         if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
2897             ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
2898         {
2899 #if wxUSE_FONTMAP
2900             wxMBConv_cf *conv = m_name ? new wxMBConv_cf(m_name)
2901                                           : new wxMBConv_cf(m_encoding);
2902 #else
2903             wxMBConv_cf *conv = new wxMBConv_cf(m_encoding);
2904 #endif
2905
2906             if ( conv->IsOk() )
2907                  return conv;
2908
2909             delete conv;
2910         }
2911     }
2912 #endif // __DARWIN__
2913
2914     // step (2)
2915     wxFontEncoding enc = m_encoding;
2916 #if wxUSE_FONTMAP
2917     if ( enc == wxFONTENCODING_SYSTEM && m_name )
2918     {
2919         // use "false" to suppress interactive dialogs -- we can be called from
2920         // anywhere and popping up a dialog from here is the last thing we want to
2921         // do
2922         enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
2923     }
2924 #endif // wxUSE_FONTMAP
2925
2926     switch ( enc )
2927     {
2928         case wxFONTENCODING_UTF7:
2929              return new wxMBConvUTF7;
2930
2931         case wxFONTENCODING_UTF8:
2932              return new wxMBConvUTF8;
2933
2934         case wxFONTENCODING_UTF16BE:
2935              return new wxMBConvUTF16BE;
2936
2937         case wxFONTENCODING_UTF16LE:
2938              return new wxMBConvUTF16LE;
2939
2940         case wxFONTENCODING_UTF32BE:
2941              return new wxMBConvUTF32BE;
2942
2943         case wxFONTENCODING_UTF32LE:
2944              return new wxMBConvUTF32LE;
2945
2946         default:
2947              // nothing to do but put here to suppress gcc warnings
2948              break;
2949     }
2950
2951     // step (3)
2952 #if wxUSE_FONTMAP
2953     {
2954         wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
2955                                       : new wxMBConv_wxwin(m_encoding);
2956         if ( conv->IsOk() )
2957             return conv;
2958
2959         delete conv;
2960     }
2961 #endif // wxUSE_FONTMAP
2962
2963     // NB: This is a hack to prevent deadlock. What could otherwise happen
2964     //     in Unicode build: wxConvLocal creation ends up being here
2965     //     because of some failure and logs the error. But wxLog will try to
2966     //     attach a timestamp, for which it will need wxConvLocal (to convert
2967     //     time to char* and then wchar_t*), but that fails, tries to log the
2968     //     error, but wxLog has an (already locked) critical section that
2969     //     guards the static buffer.
2970     static bool alreadyLoggingError = false;
2971     if (!alreadyLoggingError)
2972     {
2973         alreadyLoggingError = true;
2974         wxLogError(_("Cannot convert from the charset '%s'!"),
2975                    m_name ? m_name
2976                       :
2977 #if wxUSE_FONTMAP
2978                          (const char*)wxFontMapperBase::GetEncodingDescription(m_encoding).ToAscii()
2979 #else // !wxUSE_FONTMAP
2980                          (const char*)wxString::Format(_("encoding %i"), m_encoding).ToAscii()
2981 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
2982               );
2983
2984         alreadyLoggingError = false;
2985     }
2986
2987     return NULL;
2988 }
2989
2990 void wxCSConv::CreateConvIfNeeded() const
2991 {
2992     if ( m_deferred )
2993     {
2994         wxCSConv *self = (wxCSConv *)this; // const_cast
2995
2996         // if we don't have neither the name nor the encoding, use the default
2997         // encoding for this system
2998         if ( !m_name && m_encoding == wxFONTENCODING_SYSTEM )
2999         {
3000 #if wxUSE_INTL
3001             self->m_encoding = wxLocale::GetSystemEncoding();
3002 #else
3003             // fallback to some reasonable default:
3004             self->m_encoding = wxFONTENCODING_ISO8859_1;
3005 #endif // wxUSE_INTL
3006         }
3007
3008         self->m_convReal = DoCreate();
3009         self->m_deferred = false;
3010     }
3011 }
3012
3013 bool wxCSConv::IsOk() const
3014 {
3015     CreateConvIfNeeded();
3016
3017     // special case: no convReal created for wxFONTENCODING_ISO8859_1
3018     if ( m_encoding == wxFONTENCODING_ISO8859_1 )
3019         return true; // always ok as we do it ourselves
3020
3021     // m_convReal->IsOk() is called at its own creation, so we know it must
3022     // be ok if m_convReal is non-NULL
3023     return m_convReal != NULL;
3024 }
3025
3026 size_t wxCSConv::ToWChar(wchar_t *dst, size_t dstLen,
3027                          const char *src, size_t srcLen) const
3028 {
3029     CreateConvIfNeeded();
3030
3031     if (m_convReal)
3032         return m_convReal->ToWChar(dst, dstLen, src, srcLen);
3033
3034     // latin-1 (direct)
3035     return wxMBConv::ToWChar(dst, dstLen, src, srcLen);
3036 }
3037
3038 size_t wxCSConv::FromWChar(char *dst, size_t dstLen,
3039                            const wchar_t *src, size_t srcLen) const
3040 {
3041     CreateConvIfNeeded();
3042
3043     if (m_convReal)
3044         return m_convReal->FromWChar(dst, dstLen, src, srcLen);
3045
3046     // latin-1 (direct)
3047     return wxMBConv::FromWChar(dst, dstLen, src, srcLen);
3048 }
3049
3050 size_t wxCSConv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
3051 {
3052     CreateConvIfNeeded();
3053
3054     if (m_convReal)
3055         return m_convReal->MB2WC(buf, psz, n);
3056
3057     // latin-1 (direct)
3058     size_t len = strlen(psz);
3059
3060     if (buf)
3061     {
3062         for (size_t c = 0; c <= len; c++)
3063             buf[c] = (unsigned char)(psz[c]);
3064     }
3065
3066     return len;
3067 }
3068
3069 size_t wxCSConv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
3070 {
3071     CreateConvIfNeeded();
3072
3073     if (m_convReal)
3074         return m_convReal->WC2MB(buf, psz, n);
3075
3076     // latin-1 (direct)
3077     const size_t len = wxWcslen(psz);
3078     if (buf)
3079     {
3080         for (size_t c = 0; c <= len; c++)
3081         {
3082             if (psz[c] > 0xFF)
3083                 return wxCONV_FAILED;
3084
3085             buf[c] = (char)psz[c];
3086         }
3087     }
3088     else
3089     {
3090         for (size_t c = 0; c <= len; c++)
3091         {
3092             if (psz[c] > 0xFF)
3093                 return wxCONV_FAILED;
3094         }
3095     }
3096
3097     return len;
3098 }
3099
3100 size_t wxCSConv::GetMBNulLen() const
3101 {
3102     CreateConvIfNeeded();
3103
3104     if ( m_convReal )
3105     {
3106         return m_convReal->GetMBNulLen();
3107     }
3108
3109     // otherwise, we are ISO-8859-1
3110     return 1;
3111 }
3112
#if wxUSE_UNICODE_UTF8
// Returns the result of IsUTF8() of the real converter if we have one and
// false otherwise.
bool wxCSConv::IsUTF8() const
{
    CreateConvIfNeeded();

    // without the real converter we are ISO-8859-1 and so not UTF-8
    return m_convReal != NULL && m_convReal->IsUTF8();
}
#endif // wxUSE_UNICODE_UTF8
3127
3128
3129 #if wxUSE_UNICODE
3130
3131 wxWCharBuffer wxSafeConvertMB2WX(const char *s)
3132 {
3133     if ( !s )
3134         return wxWCharBuffer();
3135
3136     wxWCharBuffer wbuf(wxConvLibc.cMB2WX(s));
3137     if ( !wbuf )
3138         wbuf = wxMBConvUTF8().cMB2WX(s);
3139     if ( !wbuf )
3140         wbuf = wxConvISO8859_1.cMB2WX(s);
3141
3142     return wbuf;
3143 }
3144
3145 wxCharBuffer wxSafeConvertWX2MB(const wchar_t *ws)
3146 {
3147     if ( !ws )
3148         return wxCharBuffer();
3149
3150     wxCharBuffer buf(wxConvLibc.cWX2MB(ws));
3151     if ( !buf )
3152         buf = wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL).cWX2MB(ws);
3153
3154     return buf;
3155 }
3156
3157 #endif // wxUSE_UNICODE
3158
3159 // ----------------------------------------------------------------------------
3160 // globals
3161 // ----------------------------------------------------------------------------
3162
// NB: The global converter objects are not defined as simple global
//     variables but are created by factory functions instead. The reason
//     for doing it in this convoluted way is that they may be used at static
//     initialization time (some of them are used by wxString ctors and there
//     may be a global wxString object), in other words possibly _before_ a
//     global converter object would be initialized.

#undef wxConvLibc
#undef wxConvUTF8
#undef wxConvUTF7
#undef wxConvLocal
#undef wxConvISO8859_1

// Defines everything needed for the global converter convName:
//
//  - the exported pointer convName##Ptr, initially NULL
//  - the exported function wxGet_##convName##Ptr() returning the address of
//    its function-local static object of type implClass, constructed with
//    ctorArgs, as a pointer to convClass
//  - the file-local variable gs_##convName##instance initialized by calling
//    this function
//
// The macro must be followed by a semicolon.
#define WX_DEFINE_GLOBAL_CONV2(convClass, implClass, convName, ctorArgs)  \
    WXDLLIMPEXP_DATA_BASE(convClass*) convName##Ptr = NULL;               \
    WXDLLIMPEXP_BASE convClass* wxGet_##convName##Ptr()                   \
    {                                                                     \
        static implClass convName##Obj ctorArgs;                          \
        return &convName##Obj;                                            \
    }                                                                     \
    /* calling the function here ensures that all global converter */     \
    /* objects are created by the time static initialization is */        \
    /* done, i.e. before any thread is launched: */                       \
    static convClass* gs_##convName##instance = wxGet_##convName##Ptr()

// Same as above for the common case when the object is of the same class as
// the pointer returned for it.
#define WX_DEFINE_GLOBAL_CONV(convClass, convName, ctorArgs) \
    WX_DEFINE_GLOBAL_CONV2(convClass, convClass, convName, ctorArgs)
3190
3191 #ifdef __WINDOWS__
3192     WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConv_win32, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
3193 #else
3194     WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConvLibc, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
3195 #endif
3196
3197 // NB: we can't use wxEMPTY_PARAMETER_VALUE as final argument here because it's
3198 //     passed to WX_DEFINE_GLOBAL_CONV2 after a macro expansion and so still
3199 //     provokes an error message about "not enough macro parameters"; and we
3200 //     can't use "()" here as the name##Obj declaration would be parsed as a
3201 //     function declaration then, so use a semicolon and live with an extra
3202 //     empty statement (and hope that no compilers warns about this)
3203 WX_DEFINE_GLOBAL_CONV(wxMBConvStrictUTF8, wxConvUTF8, ;);
3204 WX_DEFINE_GLOBAL_CONV(wxMBConvUTF7, wxConvUTF7, ;);
3205
3206 WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvLocal, (wxFONTENCODING_SYSTEM));
3207 WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvISO8859_1, (wxFONTENCODING_ISO8859_1));
3208
3209 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = wxGet_wxConvLibcPtr();
3210 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvUI = wxGet_wxConvLocalPtr();
3211
3212 #ifdef __DARWIN__
3213 // The xnu kernel always communicates file paths in decomposed UTF-8.
3214 // WARNING: Are we sure that CFString's conversion will cause decomposition?
3215 static wxMBConv_cf wxConvMacUTF8DObj(wxFONTENCODING_UTF8);
3216 #endif
3217
3218 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName =
3219 #ifdef __DARWIN__
3220                                     &wxConvMacUTF8DObj;
3221 #else // !__DARWIN__
3222                                     wxGet_wxConvLibcPtr();
3223 #endif // __DARWIN__/!__DARWIN__
3224
3225 #else // !wxUSE_WCHAR_T
3226
3227 // FIXME-UTF8: remove this, wxUSE_WCHAR_T is required now
3228 // stand-ins in absence of wchar_t
3229 WXDLLIMPEXP_DATA_BASE(wxMBConv) wxConvLibc,
3230                                 wxConvISO8859_1,
3231                                 wxConvLocal,
3232                                 wxConvUTF8;
3233
3234 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T