src/common/strconv.cpp

   1 /////////////////////////////////////////////////////////////////////////////
   2 // Name:        src/common/strconv.cpp
   3 // Purpose:     Unicode conversion classes
   4 // Author:      Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
   5 //              Ryan Norton, Fredrik Roubert (UTF7)
   6 // Modified by:
   7 // Created:     29/01/98
   8 // RCS-ID:      $Id$
   9 // Copyright:   (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
  10 //              (c) 2000-2003 Vadim Zeitlin
  11 //              (c) 2004 Ryan Norton, Fredrik Roubert
  12 // Licence:     wxWindows licence
  13 /////////////////////////////////////////////////////////////////////////////
  14
  15 // For compilers that support precompilation, includes "wx.h".
  16 #include "wx/wxprec.h"
  17
  18 #ifdef __BORLANDC__
  19     #pragma hdrstop
  20 #endif  //__BORLANDC__
  21
  22 #ifndef WX_PRECOMP
  23     #include "wx/intl.h"
  24     #include "wx/log.h"
  25     #include "wx/utils.h"
  26     #include "wx/hashmap.h"
  27 #endif
  28
  29 #include "wx/strconv.h"
  30
  31 #if wxUSE_WCHAR_T
  32
  33 #ifndef __WXWINCE__
  34 #include <errno.h>
  35 #endif
  36
  37 #include <ctype.h>
  38 #include <string.h>
  39 #include <stdlib.h>
  40
  41 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
  42     #include "wx/msw/private.h"
  43     #include "wx/msw/missing.h"
  44     #define wxHAVE_WIN32_MB2WC
  45 #endif
  46
  47 #ifdef HAVE_ICONV
  48     #include <iconv.h>
  49     #include "wx/thread.h"
  50 #endif
  51
  52 #include "wx/encconv.h"
  53 #include "wx/fontmap.h"
  54
  55 #ifdef __DARWIN__
  56 #include "wx/mac/corefoundation/private/strconv_cf.h"
  57 #endif //def __DARWIN__
  58
  59
// name of the trace mask for this file
// NOTE(review): presumably meant to be passed to wxLogTrace(); it is not used
// in the part of the file visible here -- confirm against the rest of the file
#define TRACE_STRCONV _T("strconv")
  61
// WC_UTF16 is defined only if sizeof(wchar_t) == 2, i.e. when a wchar_t holds
// a single UTF-16 code unit and characters outside of the BMP need surrogate
// pairs; when it is not defined, wchar_t is supposed to be 4 bytes wide
#if SIZEOF_WCHAR_T == 2
    #define WC_UTF16
#endif
  67
  68
  69 // ============================================================================
  70 // implementation
  71 // ============================================================================
  72
  73 // helper function of cMB2WC(): check if n bytes at this location are all NUL
  74 static bool NotAllNULs(const char *p, size_t n)
  75 {
  76     while ( n && *p++ == '\0' )
  77         n--;
  78
  79     return n != 0;
  80 }
  81
  82 // ----------------------------------------------------------------------------
  83 // UTF-16 en/decoding to/from UCS-4 with surrogates handling
  84 // ----------------------------------------------------------------------------
  85
  86 static size_t encode_utf16(wxUint32 input, wxUint16 *output)
  87 {
  88     if (input <= 0xffff)
  89     {
  90         if (output)
  91             *output = (wxUint16) input;
  92
  93         return 1;
  94     }
  95     else if (input >= 0x110000)
  96     {
  97         return wxCONV_FAILED;
  98     }
  99     else
 100     {
 101         if (output)
 102         {
 103             *output++ = (wxUint16) ((input >> 10) + 0xd7c0);
 104             *output = (wxUint16) ((input & 0x3ff) + 0xdc00);
 105         }
 106
 107         return 2;
 108     }
 109 }
 110
 111 static size_t decode_utf16(const wxUint16* input, wxUint32& output)
 112 {
 113     if ((*input < 0xd800) || (*input > 0xdfff))
 114     {
 115         output = *input;
 116         return 1;
 117     }
 118     else if ((input[1] < 0xdc00) || (input[1] > 0xdfff))
 119     {
 120         output = *input;
 121         return wxCONV_FAILED;
 122     }
 123     else
 124     {
 125         output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
 126         return 2;
 127     }
 128 }
 129
// type of the code units accepted by wxDecodeSurrogate() below: wchar_t itself
// if it is 16 bits wide (WC_UTF16 defined) and wxUint16 otherwise
#ifdef WC_UTF16
    typedef wchar_t wxDecodeSurrogate_t;
#else // !WC_UTF16
    typedef wxUint16 wxDecodeSurrogate_t;
#endif // WC_UTF16/!WC_UTF16
 135
 136 // returns the next UTF-32 character from the wchar_t buffer and advances the
 137 // pointer to the character after this one
 138 //
 139 // if an invalid character is found, *pSrc is set to NULL, the caller must
 140 // check for this
 141 static wxUint32 wxDecodeSurrogate(const wxDecodeSurrogate_t **pSrc)
 142 {
 143     wxUint32 out;
 144     const size_t
 145         n = decode_utf16(wx_reinterpret_cast(const wxUint16 *, *pSrc), out);
 146     if ( n == wxCONV_FAILED )
 147         *pSrc = NULL;
 148     else
 149         *pSrc += n;
 150
 151     return out;
 152 }
 153
 154 // ----------------------------------------------------------------------------
 155 // wxMBConv
 156 // ----------------------------------------------------------------------------
 157
// Default implementation of ToWChar() in terms of the old MB2WC().
//
// Converts the multibyte string src to wide characters. If srcLen is
// wxNO_LEN, src is assumed to be NUL-terminated and only this single string
// is converted; otherwise srcLen bytes are processed as a sequence of
// NUL-separated chunks, each of them being converted by a separate MB2WC()
// call.
//
// If dst is NULL nothing is written and only the required length is computed,
// otherwise wxCONV_FAILED is returned if more than dstLen wide characters
// would need to be written.
//
// Returns the number of wide characters, including the L'\0' after each
// chunk, written to (or needed for) dst or wxCONV_FAILED on error.
size_t
wxMBConv::ToWChar(wchar_t *dst, size_t dstLen,
                  const char *src, size_t srcLen) const
{
    // although new conversion classes are supposed to implement this function
    // directly, the existing ones only implement the old MB2WC() and so, to
    // avoid having to rewrite all conversion classes at once, we provide a
    // default (but not efficient) implementation of this one in terms of the
    // old function by copying the input to ensure that it's NUL-terminated and
    // then using MB2WC() to convert it

    // the number of chars [which would be] written to dst [if it were not NULL]
    size_t dstWritten = 0;

    // the number of NULs terminating this string
    size_t nulLen = 0;  // not really needed, but just to avoid warnings

    // if we were not given the input size we just have to assume that the
    // string is properly terminated as we have no way of knowing how long it
    // is anyhow, but if we do have the size check whether there are enough
    // NULs at the end
    wxCharBuffer bufTmp;
    const char *srcEnd;
    if ( srcLen != wxNO_LEN )
    {
        // we need to know how to find the end of this string
        nulLen = GetMBNulLen();
        if ( nulLen == wxCONV_FAILED )
            return wxCONV_FAILED;

        // if there are enough NULs we can avoid the copy
        if ( srcLen < nulLen || NotAllNULs(src + srcLen - nulLen, nulLen) )
        {
            // make a copy in order to properly NUL-terminate the string
            bufTmp = wxCharBuffer(srcLen + nulLen - 1 /* 1 will be added */);
            char * const p = bufTmp.data();
            memcpy(p, src, srcLen);
            for ( char *s = p + srcLen; s < p + srcLen + nulLen; s++ )
                *s = '\0';

            // from now on work with the terminated copy (bufTmp keeps it alive
            // until we return)
            src = bufTmp;
        }

        srcEnd = src + srcLen;
    }
    else // quit after the first loop iteration
    {
        srcEnd = NULL;
    }

    for ( ;; )
    {
        // try to convert the current chunk: the first call only computes the
        // length of the result, not counting the terminating NUL
        size_t lenChunk = MB2WC(NULL, src, 0);
        if ( lenChunk == wxCONV_FAILED )
            return wxCONV_FAILED;

        lenChunk++; // for the L'\0' at the end of this chunk

        dstWritten += lenChunk;

        if ( lenChunk == 1 )
        {
            // nothing left in the input string, conversion succeeded
            //
            // NOTE(review): the terminating L'\0' of this empty chunk is
            // counted in dstWritten but nothing is stored in dst for it as we
            // leave the loop before the code writing to dst below
            break;
        }

        if ( dst )
        {
            if ( dstWritten > dstLen )
                return wxCONV_FAILED;

            // do the real conversion now that we know that there is enough
            // space for this chunk and its terminator
            if ( MB2WC(dst, src, lenChunk) == wxCONV_FAILED )
                return wxCONV_FAILED;

            dst += lenChunk;
        }

        if ( !srcEnd )
        {
            // we convert just one chunk in this case as this is the entire
            // string anyhow
            break;
        }

        // advance the input pointer past the end of this chunk
        while ( NotAllNULs(src, nulLen) )
        {
            // notice that we must skip over multiple bytes here as we suppose
            // that if NUL takes 2 or 4 bytes, then all the other characters do
            // too and so if advanced by a single byte we might erroneously
            // detect sequences of NUL bytes in the middle of the input
            src += nulLen;
        }

        src += nulLen; // skipping over its terminator as well

        // note that ">=" (and not just "==") is needed here as the terminator
        // we skipped just above could be inside or just after the buffer
        // delimited by srcEnd
        if ( src >= srcEnd )
            break;
    }

    return dstWritten;
}
 264
// Default implementation of FromWChar() in terms of the old WC2MB().
//
// Converts the wide string src to multibyte. If srcLen is wxNO_LEN, src is
// assumed to be NUL-terminated and its length, including the terminating
// L'\0', is computed here; otherwise srcLen wide characters are processed as
// a sequence of L'\0'-separated chunks, each of them being converted by a
// separate WC2MB() call.
//
// If dst is NULL nothing is written and only the required length is computed,
// otherwise wxCONV_FAILED is returned if more than dstLen bytes would need to
// be written.
//
// Returns the number of bytes, including GetMBNulLen() bytes for the
// terminator of each chunk, written to (or needed for) dst or wxCONV_FAILED
// on error.
size_t
wxMBConv::FromWChar(char *dst, size_t dstLen,
                    const wchar_t *src, size_t srcLen) const
{
    // the number of chars [which would be] written to dst [if it were not NULL]
    size_t dstWritten = 0;

    // make a copy of the input string unless it is already properly
    // NUL-terminated
    //
    // if we don't know its length we have no choice but to assume that it is,
    // indeed, properly terminated
    wxWCharBuffer bufTmp;
    if ( srcLen == wxNO_LEN )
    {
        srcLen = wxWcslen(src) + 1;
    }
    else if ( srcLen != 0 && src[srcLen - 1] != L'\0' )
    {
        // make a copy in order to properly NUL-terminate the string
        //
        // NOTE(review): this relies on wxWCharBuffer(len) allocating len + 1
        // elements and zeroing the last one -- confirm in wx/buffer.h
        bufTmp = wxWCharBuffer(srcLen);
        memcpy(bufTmp.data(), src, srcLen * sizeof(wchar_t));
        src = bufTmp;
    }

    // NOTE(review): unlike in ToWChar(), the result of GetMBNulLen() is not
    // checked for wxCONV_FAILED here
    const size_t lenNul = GetMBNulLen();
    for ( const wchar_t * const srcEnd = src + srcLen;
          src < srcEnd;
          src += wxWcslen(src) + 1 /* skip L'\0' too */ )
    {
        // try to convert the current chunk: the first call only computes the
        // length of the result, not counting the terminating NUL(s)
        size_t lenChunk = WC2MB(NULL, src, 0);

        if ( lenChunk == wxCONV_FAILED )
            return wxCONV_FAILED;

        lenChunk += lenNul;
        dstWritten += lenChunk;

        if ( dst )
        {
            if ( dstWritten > dstLen )
                return wxCONV_FAILED;

            // now really convert this chunk, including its terminator
            if ( WC2MB(dst, src, lenChunk) == wxCONV_FAILED )
                return wxCONV_FAILED;

            dst += lenChunk;
        }
    }

    return dstWritten;
}
 318
 319 size_t wxMBConv::MB2WC(wchar_t *outBuff, const char *inBuff, size_t outLen) const
 320 {
 321     size_t rc = ToWChar(outBuff, outLen, inBuff);
 322     if ( rc != wxCONV_FAILED )
 323     {
 324         // ToWChar() returns the buffer length, i.e. including the trailing
 325         // NUL, while this method doesn't take it into account
 326         rc--;
 327     }
 328
 329     return rc;
 330 }
 331
 332 size_t wxMBConv::WC2MB(char *outBuff, const wchar_t *inBuff, size_t outLen) const
 333 {
 334     size_t rc = FromWChar(outBuff, outLen, inBuff);
 335     if ( rc != wxCONV_FAILED )
 336     {
 337         rc -= GetMBNulLen();
 338     }
 339
 340     return rc;
 341 }
 342
// the destructor is intentionally empty but is defined out of line here
wxMBConv::~wxMBConv()
{
    // nothing to do here (necessary for Darwin linking probably)
}
 347
 348 const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
 349 {
 350     if ( psz )
 351     {
 352         // calculate the length of the buffer needed first
 353         const size_t nLen = ToWChar(NULL, 0, psz);
 354         if ( nLen != wxCONV_FAILED )
 355         {
 356             // now do the actual conversion
 357             wxWCharBuffer buf(nLen - 1 /* +1 added implicitly */);
 358
 359             // +1 for the trailing NULL
 360             if ( ToWChar(buf.data(), nLen, psz) != wxCONV_FAILED )
 361                 return buf;
 362         }
 363     }
 364
 365     return wxWCharBuffer();
 366 }
 367
 368 const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
 369 {
 370     if ( pwz )
 371     {
 372         const size_t nLen = FromWChar(NULL, 0, pwz);
 373         if ( nLen != wxCONV_FAILED )
 374         {
 375             wxCharBuffer buf(nLen - 1);
 376             if ( FromWChar(buf.data(), nLen, pwz) != wxCONV_FAILED )
 377                 return buf;
 378         }
 379     }
 380
 381     return wxCharBuffer();
 382 }
 383
 384 const wxWCharBuffer
 385 wxMBConv::cMB2WC(const char *inBuff, size_t inLen, size_t *outLen) const
 386 {
 387     const size_t dstLen = ToWChar(NULL, 0, inBuff, inLen);
 388     if ( dstLen != wxCONV_FAILED )
 389     {
 390         // notice that we allocate space for dstLen+1 wide characters here
 391         // because we want the buffer to always be NUL-terminated, even if the
 392         // input isn't (as otherwise the caller has no way to know its length)
 393         wxWCharBuffer wbuf(dstLen);
 394         wbuf.data()[dstLen - 1] = L'\0';
 395         if ( ToWChar(wbuf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
 396         {
 397             if ( outLen )
 398             {
 399                 *outLen = dstLen;
 400                 if ( wbuf[dstLen - 1] == L'\0' )
 401                     (*outLen)--;
 402             }
 403
 404             return wbuf;
 405         }
 406     }
 407
 408     if ( outLen )
 409         *outLen = 0;
 410
 411     return wxWCharBuffer();
 412 }
 413
 414 const wxCharBuffer
 415 wxMBConv::cWC2MB(const wchar_t *inBuff, size_t inLen, size_t *outLen) const
 416 {
 417     size_t dstLen = FromWChar(NULL, 0, inBuff, inLen);
 418     if ( dstLen != wxCONV_FAILED )
 419     {
 420         const size_t nulLen = GetMBNulLen();
 421
 422         // as above, ensure that the buffer is always NUL-terminated, even if
 423         // the input is not
 424         wxCharBuffer buf(dstLen + nulLen - 1);
 425         memset(buf.data() + dstLen, 0, nulLen);
 426         if ( FromWChar(buf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
 427         {
 428             if ( outLen )
 429             {
 430                 *outLen = dstLen;
 431
 432                 if ( dstLen >= nulLen &&
 433                         !NotAllNULs(buf.data() + dstLen - nulLen, nulLen) )
 434                 {
 435                     // in this case the output is NUL-terminated and we're not
 436                     // supposed to count NUL
 437                     *outLen -= nulLen;
 438                 }
 439             }
 440
 441             return buf;
 442         }
 443     }
 444
 445     if ( outLen )
 446         *outLen = 0;
 447
 448     return wxCharBuffer();
 449 }
 450
 451 // ----------------------------------------------------------------------------
 452 // wxMBConvLibc
 453 // ----------------------------------------------------------------------------
 454
// multibyte to wide char conversion: simply forwards all its arguments to
// wxMB2WC() and returns its result
size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
{
    return wxMB2WC(buf, psz, n);
}
 459
// wide char to multibyte conversion: simply forwards all its arguments to
// wxWC2MB() and returns its result
size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
{
    return wxWC2MB(buf, psz, n);
}
 464
 465 // ----------------------------------------------------------------------------
 466 // wxConvBrokenFileNames
 467 // ----------------------------------------------------------------------------
 468
 469 #ifdef __UNIX__
 470
 471 wxConvBrokenFileNames::wxConvBrokenFileNames(const wxString& charset)
 472 {
 473     if ( wxStricmp(charset, _T("UTF-8")) == 0 ||
 474          wxStricmp(charset, _T("UTF8")) == 0  )
 475         m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_PUA);
 476     else
 477         m_conv = new wxCSConv(charset);
 478 }
 479
 480 #endif // __UNIX__
 481
 482 // ----------------------------------------------------------------------------
 483 // UTF-7
 484 // ----------------------------------------------------------------------------
 485
 486 // Implementation (C) 2004 Fredrik Roubert
 487
//
// BASE64 decoding table
//
// It is indexed by the byte value and gives the 6 bit value (0x00..0x3f) of
// the BASE64 digits 'A'..'Z', 'a'..'z', '0'..'9', '+' and '/', or 0xff for all
// the other bytes, which are not valid BASE64 digits. The table has 256
// entries and so can be safely indexed by any unsigned char.
//
static const unsigned char utf7unb64[] =
{
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
    0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b,
    0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
    0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
    0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
    0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
    0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
    0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30,
    0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
};
 526
// Decode the NUL-terminated UTF-7 string psz.
//
// If buf is not NULL, the decoded characters are stored in it and the
// conversion stops once n of them have been produced (but see the note in the
// BASE64 loop below); a terminating NUL is added only if there is still space
// for it. If buf is NULL, only the length of the result is computed.
//
// Returns the number of characters decoded, not counting the terminating NUL,
// or wxCONV_FAILED if a '+' is followed neither by '-' nor by enough valid
// BASE64 digits to decode at least one byte.
size_t wxMBConvUTF7::MB2WC(wchar_t *buf, const char *psz, size_t n) const
{
    size_t len = 0;

    while ( *psz && (!buf || (len < n)) )
    {
        unsigned char cc = *psz++;
        if (cc != '+')
        {
            // plain ASCII char
            //
            // (no check that cc is really < 0x80 is done here, any byte other
            // than '+' is copied to the output as is)
            if (buf)
                *buf++ = cc;
            len++;
        }
        else if (*psz == '-')
        {
            // encoded plus sign
            if (buf)
                *buf++ = cc;
            len++;
            psz++;
        }
        else // start of BASE64 encoded string
        {
            // d accumulates the decoded bits, l is the number of bits in it
            // not extracted yet, lsb tells whether the next byte extracted is
            // the low (second) byte of a 16 bit unit and ok becomes true as
            // soon as at least one byte has been extracted
            bool lsb, ok;
            unsigned int d, l;
            for ( ok = lsb = false, d = 0, l = 0;
                  (cc = utf7unb64[(unsigned char)*psz]) != 0xff;
                  psz++ )
            {
                // each BASE64 digit contributes 6 more bits
                d <<= 6;
                d += cc;
                for (l += 6; l >= 8; lsb = !lsb)
                {
                    // extract the next full byte from the accumulator
                    unsigned char c = (unsigned char)((d >> (l -= 8)) % 256);
                    if (lsb)
                    {
                        // low byte: it completes the current 16 bit unit
                        //
                        // NOTE(review): len is not compared with n inside
                        // this loop, so more than n units can be stored in
                        // buf if the BASE64 run is long enough
                        if (buf)
                            *buf++ |= c;
                        len ++;
                    }
                    else
                    {
                        // high byte: it starts a new 16 bit unit
                        if (buf)
                            *buf = (wchar_t)(c << 8);
                    }

                    ok = true;
                }
            }

            if ( !ok )
            {
                // in valid UTF7 we should have valid characters after '+'
                return wxCONV_FAILED;
            }

            // the '-' terminating the BASE64 run is optional and is not part
            // of the output
            if (*psz == '-')
                psz++;
        }
    }

    if ( buf && (len < n) )
        *buf = '\0';

    return len;
}
 594
//
// BASE64 encoding table
//
// It maps each 6 bit value (0..63) to the corresponding BASE64 digit.
//
static const unsigned char utf7enb64[] =
{
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
    'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
    'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
    'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
    'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
    'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
    'w', 'x', 'y', 'z', '0', '1', '2', '3',
    '4', '5', '6', '7', '8', '9', '+', '/'
};
 609
//
// UTF-7 encoding table
//
// It is indexed by the ASCII character code (0..127) and gives the class of
// the character:
//
// 0 - Set D (directly encoded characters)
// 1 - Set O (optional direct characters)
// 2 - whitespace characters (optional)
// 3 - special characters
//
// Only the characters of class 0 are output directly by
// wxMBConvUTF7::WC2MB(), all the others are BASE64 encoded.
//
static const unsigned char utf7encode[128] =
{
    3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
};
 629
// Encode the NUL-terminated wide string psz as UTF-7.
//
// If buf is not NULL, the result is stored in it and the conversion stops
// once n bytes have been produced (but see the note below); a terminating NUL
// is added only if there is still space for it. If buf is NULL, only the
// length of the result is computed.
//
// Returns the number of bytes produced, not counting the terminating NUL, or
// wxCONV_FAILED if wchar_t is not 16 bit and a character greater than 0xffff
// is found at the start of a run.
//
// NOTE(review): len is only compared with n at the start of each iteration of
// the outer loop, so a long BASE64 run can store more than n bytes in buf.
size_t wxMBConvUTF7::WC2MB(char *buf, const wchar_t *psz, size_t n) const
{
    size_t len = 0;

    while (*psz && ((!buf) || (len < n)))
    {
        wchar_t cc = *psz++;
        if (cc < 0x80 && utf7encode[cc] < 1)
        {
            // plain ASCII char
            if (buf)
                *buf++ = (char)cc;

            len++;
        }
#ifndef WC_UTF16
        else if (((wxUint32)cc) > 0xffff)
        {
            // no surrogate pair generation (yet?)
            return wxCONV_FAILED;
        }
#endif
        else
        {
            // all the other characters are written as "+<BASE64 digits>-",
            // except for '+' itself which is written as "+-"
            if (buf)
                *buf++ = '+';

            len++;
            if (cc != '+')
            {
                // BASE64 encode string
                //
                // d accumulates the bits to encode and l is the number of
                // bits in it not output yet
                unsigned int lsb, d, l;
                for (d = 0, l = 0; /*nothing*/; psz++)
                {
                    // each character contributes 2 bytes, high byte first
                    for (lsb = 0; lsb < 2; lsb ++)
                    {
                        d <<= 8;
                        d += lsb ? cc & 0xff : (cc & 0xff00) >> 8;

                        // output as many complete 6 bit groups as we have
                        for (l += 8; l >= 6; )
                        {
                            l -= 6;
                            if (buf)
                                *buf++ = utf7enb64[(d >> l) % 64];
                            len++;
                        }
                    }

                    // the run continues until the end of the string or the
                    // next character which can be output directly; notice
                    // that psz is not advanced past this character here, so
                    // it is processed by the outer loop after we break
                    //
                    // NOTE(review): the characters after the first one of the
                    // run are not checked for being > 0xffff, only their low
                    // 16 bits are encoded
                    cc = *psz;
                    if (!(cc) || (cc < 0x80 && utf7encode[cc] < 1))
                        break;
                }

                if (l != 0)
                {
                    // output the remaining bits padded with zeroes on the
                    // right to form the last BASE64 digit
                    if (buf)
                        *buf++ = utf7enb64[((d % 16) << (6 - l)) % 64];

                    len++;
                }
            }

            if (buf)
                *buf++ = '-';
            len++;
        }
    }

    if (buf && (len < n))
        *buf = 0;

    return len;
}
 703
 704 // ----------------------------------------------------------------------------
 705 // UTF-8
 706 // ----------------------------------------------------------------------------
 707
// NOTE(review): presumably the maximal values which can be encoded using
// UTF-8 sequences of 1, 2, ..., 6 bytes (plus a sentinel entry); the table is
// not used in the part of the file visible here -- confirm against its users
static const wxUint32 utf8_max[]=
    { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };

// boundaries of the private use area we use to (temporarily) remap the bytes
// which are invalid in a UTF-8 encoded string: the area has 256 code points
const wxUint32 wxUnicodePUA = 0x100000;
const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
 715
// this table gives the length of the UTF-8 encoding from its first character:
// it is indexed by the value of the lead byte and contains the total number
// of bytes in the sequence (1 to 4) or 0 if this byte can't start a valid
// UTF-8 sequence
const unsigned char tableUtf8Lengths[256] = {
    // single-byte sequences (ASCII):
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 00..0F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 10..1F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 20..2F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 30..3F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 40..4F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 50..5F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 60..6F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 70..7F

    // these are invalid (80..BF can only be continuation bytes and C0 and C1
    // could only start overlong encodings of ASCII characters):
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 80..8F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 90..9F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // A0..AF
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // B0..BF
    0, 0,                                            // C0,C1

    // two-byte sequences:
          2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // C2..CF
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // D0..DF

    // three-byte sequences:
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,  // E0..EF

    // four-byte sequences:
    4, 4, 4, 4, 4,                                   // F0..F4

    // these are invalid again (5- or 6-byte
    // sequences and sequences for code points
    // above U+10FFFF, as restricted by RFC 3629):
                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0   // F5..FF
};
 750
// Decode the UTF-8 string src, rejecting the invalid sequences.
//
// If srcLen is wxNO_LEN, src must be NUL-terminated and its terminating NUL
// is converted as well, otherwise exactly srcLen bytes are converted.
//
// If dstLen is 0, nothing is written to dst and only the length of the result
// is computed, otherwise the result is stored in dst and the conversion fails
// if dstLen elements are not enough for it.
//
// Returns the number of wchar_t elements written (or needed) or wxCONV_FAILED
// if the input is invalid or truncated or if the output buffer is too small.
size_t
wxMBConvStrictUTF8::ToWChar(wchar_t *dst, size_t dstLen,
                            const char *src, size_t srcLen) const
{
    // out remains NULL if we only compute the length of the result
    wchar_t *out = dstLen ? dst : NULL;
    size_t written = 0;

    // after this srcLen always holds an explicit length, including the
    // trailing NUL which is then converted as any other character
    //
    // NOTE(review): this means that the branches testing for
    // srcLen == wxNO_LEN below are apparently never taken any more
    if ( srcLen == wxNO_LEN )
        srcLen = strlen(src) + 1;

    for ( const char *p = src; ; p++ )
    {
        if ( !(srcLen == wxNO_LEN ? *p : srcLen) )
        {
            // all done successfully, just add the trailing NULL if we are not
            // using explicit length
            if ( srcLen == wxNO_LEN )
            {
                if ( out )
                {
                    if ( !dstLen )
                        break;

                    *out = L'\0';
                }

                written++;
            }

            return written;
        }

        // fail if there is no more space in the output buffer
        //
        // NOTE(review): dstLen is decremented only once per character here
        // but, if WC_UTF16 is defined, 2 elements are written below for the
        // characters needing a surrogate pair
        if ( out && !dstLen-- )
            break;

        wxUint32 code;
        unsigned char c = *p;

        if ( c < 0x80 )
        {
            if ( srcLen == 0 ) // the test works for wxNO_LEN too
                break;

            if ( srcLen != wxNO_LEN )
                srcLen--;

            code = c;
        }
        else
        {
            // bytes which can't start a sequence have 0 length in the table
            unsigned len = tableUtf8Lengths[c];
            if ( !len )
                break;

            // fail if the input ends in the middle of the sequence
            if ( srcLen < len ) // the test works for wxNO_LEN too
                break;

            if ( srcLen != wxNO_LEN )
                srcLen -= len;

            //   Char. number range   |        UTF-8 octet sequence
            //      (hexadecimal)     |              (binary)
            //  ----------------------+----------------------------------------
            //  0000 0000 - 0000 007F | 0xxxxxxx
            //  0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
            //  0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
            //  0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
            //
            //  Code point value is stored in bits marked with 'x',
            //  lowest-order bit of the value on the right side in the diagram
            //  above.                                         (from RFC 3629)

            // mask to extract lead byte's value ('x' bits above), by sequence
            // length:
            static const unsigned char leadValueMask[] = { 0x7F, 0x1F, 0x0F, 0x07 };

            // mask and value of lead byte's most significant bits, by length:
            static const unsigned char leadMarkerMask[] = { 0x80, 0xE0, 0xF0, 0xF8 };
            static const unsigned char leadMarkerVal[] = { 0x00, 0xC0, 0xE0, 0xF0 };

            len--; // it's more convenient to work with 0-based length here

            // extract the lead byte's value bits:
            if ( (c & leadMarkerMask[len]) != leadMarkerVal[len] )
                break;

            code = c & leadValueMask[len];

            // all remaining bytes, if any, are handled in the same way
            // regardless of sequence's length:
            //
            // NOTE(review): only the 10xxxxxx form of the continuation bytes
            // is checked, 3 and 4 byte overlong sequences and encoded
            // surrogates are not rejected here
            for ( ; len; --len )
            {
                c = *++p;
                if ( (c & 0xC0) != 0x80 )
                    return wxCONV_FAILED;

                code <<= 6;
                code |= c & 0x3F;
            }
        }

#ifdef WC_UTF16
        // cast is ok because wchar_t == wxUint16 if WC_UTF16
        if ( encode_utf16(code, (wxUint16 *)out) == 2 )
        {
            // a surrogate pair was needed: account for its first element
            // here, the second one is dealt with by the common code below
            if ( out )
                out++;
            written++;
        }
#else // !WC_UTF16
        if ( out )
            *out = code;
#endif // WC_UTF16/!WC_UTF16

        if ( out )
            out++;

        written++;
    }

    // we only get here (by breaking out of the loop) if an error occurred
    return wxCONV_FAILED;
}
 873
 874 size_t
 875 wxMBConvStrictUTF8::FromWChar(char *dst, size_t dstLen,
 876                               const wchar_t *src, size_t srcLen) const
 877 {
 878     char *out = dstLen ? dst : NULL;
 879     size_t written = 0;
 880
 881     for ( const wchar_t *wp = src; ; wp++ )
 882     {
 883         if ( !(srcLen == wxNO_LEN ? *wp : srcLen--) )
 884         {
 885             // all done successfully, just add the trailing NULL if we are not
 886             // using explicit length
 887             if ( srcLen == wxNO_LEN )
 888             {
 889                 if ( out )
 890                 {
 891                     if ( !dstLen )
 892                         break;
 893
 894                     *out = '\0';
 895                 }
 896
 897                 written++;
 898             }
 899
 900             return written;
 901         }
 902
 903
 904         wxUint32 code;
 905 #ifdef WC_UTF16
 906         // cast is ok for WC_UTF16
 907         if ( decode_utf16((const wxUint16 *)wp, code) == 2 )
 908         {
 909             // skip the next char too as we decoded a surrogate
 910             wp++;
 911         }
 912 #else // wchar_t is UTF-32
 913         code = *wp & 0x7fffffff;
 914 #endif
 915
 916         unsigned len;
 917         if ( code <= 0x7F )
 918         {
 919             len = 1;
 920             if ( out )
 921             {
 922                 if ( dstLen < len )
 923                     break;
 924
 925                 out[0] = (char)code;
 926             }
 927         }
 928         else if ( code <= 0x07FF )
 929         {
 930             len = 2;
 931             if ( out )
 932             {
 933                 if ( dstLen < len )
 934                     break;
 935
 936                 // NB: this line takes 6 least significant bits, encodes them as
 937                 // 10xxxxxx and discards them so that the next byte can be encoded:
 938                 out[1] = 0x80 | (code & 0x3F);  code >>= 6;
 939                 out[0] = 0xC0 | code;
 940             }
 941         }
 942         else if ( code < 0xFFFF )
 943         {
 944             len = 3;
 945             if ( out )
 946             {
 947                 if ( dstLen < len )
 948                     break;
 949
 950                 out[2] = 0x80 | (code & 0x3F);  code >>= 6;
 951                 out[1] = 0x80 | (code & 0x3F);  code >>= 6;
 952                 out[0] = 0xE0 | code;
 953             }
 954         }
 955         else if ( code <= 0x10FFFF )
 956         {
 957             len = 4;
 958             if ( out )
 959             {
 960                 if ( dstLen < len )
 961                     break;
 962
 963                 out[3] = 0x80 | (code & 0x3F);  code >>= 6;
 964                 out[2] = 0x80 | (code & 0x3F);  code >>= 6;
 965                 out[1] = 0x80 | (code & 0x3F);  code >>= 6;
 966                 out[0] = 0xF0 | code;
 967             }
 968         }
 969         else
 970         {
 971             wxFAIL_MSG( _T("trying to encode undefined Unicode character") );
 972             break;
 973         }
 974
 975         if ( out )
 976         {
 977             out += len;
 978             dstLen -= len;
 979         }
 980
 981         written += len;
 982     }
 983
 984     // we only get here if an error occurs during decoding
 985     return wxCONV_FAILED;
 986 }
 987
 988 size_t wxMBConvUTF8::ToWChar(wchar_t *buf, size_t n,
 989                              const char *psz, size_t srcLen) const
 990 {
 991     if ( m_options == MAP_INVALID_UTF8_NOT )
 992         return wxMBConvStrictUTF8::ToWChar(buf, n, psz, srcLen);
 993
 994     size_t len = 0;
 995
 996     while ((srcLen == wxNO_LEN ? *psz : srcLen--) && ((!buf) || (len < n)))
 997     {
 998         const char *opsz = psz;
 999         bool invalid = false;
1000         unsigned char cc = *psz++, fc = cc;
1001         unsigned cnt;
1002         for (cnt = 0; fc & 0x80; cnt++)
1003             fc <<= 1;
1004
1005         if (!cnt)
1006         {
1007             // plain ASCII char
1008             if (buf)
1009                 *buf++ = cc;
1010             len++;
1011
1012             // escape the escape character for octal escapes
1013             if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
1014                     && cc == '\\' && (!buf || len < n))
1015             {
1016                 if (buf)
1017                     *buf++ = cc;
1018                 len++;
1019             }
1020         }
1021         else
1022         {
1023             cnt--;
1024             if (!cnt)
1025             {
1026                 // invalid UTF-8 sequence
1027                 invalid = true;
1028             }
1029             else
1030             {
1031                 unsigned ocnt = cnt - 1;
1032                 wxUint32 res = cc & (0x3f >> cnt);
1033                 while (cnt--)
1034                 {
1035                     cc = *psz;
1036                     if ((cc & 0xC0) != 0x80)
1037                     {
1038                         // invalid UTF-8 sequence
1039                         invalid = true;
1040                         break;
1041                     }
1042
1043                     psz++;
1044                     res = (res << 6) | (cc & 0x3f);
1045                 }
1046
1047                 if (invalid || res <= utf8_max[ocnt])
1048                 {
1049                     // illegal UTF-8 encoding
1050                     invalid = true;
1051                 }
1052                 else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
1053                         res >= wxUnicodePUA && res < wxUnicodePUAEnd)
1054                 {
1055                     // if one of our PUA characters turns up externally
1056                     // it must also be treated as an illegal sequence
1057                     // (a bit like you have to escape an escape character)
1058                     invalid = true;
1059                 }
1060                 else
1061                 {
1062 #ifdef WC_UTF16
1063                     // cast is ok because wchar_t == wxUint16 if WC_UTF16
1064                     size_t pa = encode_utf16(res, (wxUint16 *)buf);
1065                     if (pa == wxCONV_FAILED)
1066                     {
1067                         invalid = true;
1068                     }
1069                     else
1070                     {
1071                         if (buf)
1072                             buf += pa;
1073                         len += pa;
1074                     }
1075 #else // !WC_UTF16
1076                     if (buf)
1077                         *buf++ = (wchar_t)res;
1078                     len++;
1079 #endif // WC_UTF16/!WC_UTF16
1080                 }
1081             }
1082
1083             if (invalid)
1084             {
1085                 if (m_options & MAP_INVALID_UTF8_TO_PUA)
1086                 {
1087                     while (opsz < psz && (!buf || len < n))
1088                     {
1089 #ifdef WC_UTF16
1090                         // cast is ok because wchar_t == wxUuint16 if WC_UTF16
1091                         size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
1092                         wxASSERT(pa != wxCONV_FAILED);
1093                         if (buf)
1094                             buf += pa;
1095                         opsz++;
1096                         len += pa;
1097 #else
1098                         if (buf)
1099                             *buf++ = (wchar_t)(wxUnicodePUA + (unsigned char)*opsz);
1100                         opsz++;
1101                         len++;
1102 #endif
1103                     }
1104                 }
1105                 else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
1106                 {
1107                     while (opsz < psz && (!buf || len < n))
1108                     {
1109                         if ( buf && len + 3 < n )
1110                         {
1111                             unsigned char on = *opsz;
1112                             *buf++ = L'\\';
1113                             *buf++ = (wchar_t)( L'0' + on / 0100 );
1114                             *buf++ = (wchar_t)( L'0' + (on % 0100) / 010 );
1115                             *buf++ = (wchar_t)( L'0' + on % 010 );
1116                         }
1117
1118                         opsz++;
1119                         len += 4;
1120                     }
1121                 }
1122                 else // MAP_INVALID_UTF8_NOT
1123                 {
1124                     return wxCONV_FAILED;
1125                 }
1126             }
1127         }
1128     }
1129
1130     if (srcLen == wxNO_LEN && buf && (len < n))
1131         *buf = 0;
1132
1133     return len + 1;
1134 }
1135
// Check whether the given wide character is one of the octal digits '0'..'7'.
static inline bool isoctal(wchar_t wch)
{
    if ( wch < L'0' )
        return false;

    return !(wch > L'7');
}
1140
1141 size_t wxMBConvUTF8::FromWChar(char *buf, size_t n,
1142                                const wchar_t *psz, size_t srcLen) const
1143 {
1144     if ( m_options == MAP_INVALID_UTF8_NOT )
1145         return wxMBConvStrictUTF8::FromWChar(buf, n, psz, srcLen);
1146
1147     size_t len = 0;
1148
1149     while ((srcLen == wxNO_LEN ? *psz : srcLen--) && ((!buf) || (len < n)))
1150     {
1151         wxUint32 cc;
1152
1153 #ifdef WC_UTF16
1154         // cast is ok for WC_UTF16
1155         size_t pa = decode_utf16((const wxUint16 *)psz, cc);
1156         psz += (pa == wxCONV_FAILED) ? 1 : pa;
1157 #else
1158         cc = (*psz++) & 0x7fffffff;
1159 #endif
1160
1161         if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
1162                 && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
1163         {
1164             if (buf)
1165                 *buf++ = (char)(cc - wxUnicodePUA);
1166             len++;
1167         }
1168         else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL)
1169                     && cc == L'\\' && psz[0] == L'\\' )
1170         {
1171             if (buf)
1172                 *buf++ = (char)cc;
1173             psz++;
1174             len++;
1175         }
1176         else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
1177                     cc == L'\\' &&
1178                         isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
1179         {
1180             if (buf)
1181             {
1182                 *buf++ = (char) ((psz[0] - L'0') * 0100 +
1183                                  (psz[1] - L'0') * 010 +
1184                                  (psz[2] - L'0'));
1185             }
1186
1187             psz += 3;
1188             len++;
1189         }
1190         else
1191         {
1192             unsigned cnt;
1193             for (cnt = 0; cc > utf8_max[cnt]; cnt++)
1194             {
1195             }
1196
1197             if (!cnt)
1198             {
1199                 // plain ASCII char
1200                 if (buf)
1201                     *buf++ = (char) cc;
1202                 len++;
1203             }
1204             else
1205             {
1206                 len += cnt + 1;
1207                 if (buf)
1208                 {
1209                     *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
1210                     while (cnt--)
1211                         *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
1212                 }
1213             }
1214         }
1215     }
1216
1217     if (srcLen == wxNO_LEN && buf && (len < n))
1218         *buf = 0;
1219
1220     return len + 1;
1221 }
1222
1223 // ============================================================================
1224 // UTF-16
1225 // ============================================================================
1226
// The code below is written in terms of "straight" and "swap" converters;
// these macros map the two names onto the concrete classes: "straight" is
// wxMBConvUTF16BE if WORDS_BIGENDIAN is defined and wxMBConvUTF16LE
// otherwise, and "swap" is the other class of the pair.
#ifdef WORDS_BIGENDIAN
    #define wxMBConvUTF16straight wxMBConvUTF16BE
    #define wxMBConvUTF16swap     wxMBConvUTF16LE
#else
    #define wxMBConvUTF16swap     wxMBConvUTF16BE
    #define wxMBConvUTF16straight wxMBConvUTF16LE
#endif
1234
1235 /* static */
1236 size_t wxMBConvUTF16Base::GetLength(const char *src, size_t srcLen)
1237 {
1238     if ( srcLen == wxNO_LEN )
1239     {
1240         // count the number of bytes in input, including the trailing NULs
1241         const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1242         for ( srcLen = 1; *inBuff++; srcLen++ )
1243             ;
1244
1245         srcLen *= BYTES_PER_CHAR;
1246     }
1247     else // we already have the length
1248     {
1249         // we can only convert an entire number of UTF-16 characters
1250         if ( srcLen % BYTES_PER_CHAR )
1251             return wxCONV_FAILED;
1252     }
1253
1254     return srcLen;
1255 }
1256
1257 // case when in-memory representation is UTF-16 too
1258 #ifdef WC_UTF16
1259
1260 // ----------------------------------------------------------------------------
1261 // conversions without endianness change
1262 // ----------------------------------------------------------------------------
1263
1264 size_t
1265 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1266                                const char *src, size_t srcLen) const
1267 {
1268     // set up the scene for using memcpy() (which is presumably more efficient
1269     // than copying the bytes one by one)
1270     srcLen = GetLength(src, srcLen);
1271     if ( srcLen == wxNO_LEN )
1272         return wxCONV_FAILED;
1273
1274     const size_t inLen = srcLen / BYTES_PER_CHAR;
1275     if ( dst )
1276     {
1277         if ( dstLen < inLen )
1278             return wxCONV_FAILED;
1279
1280         memcpy(dst, src, srcLen);
1281     }
1282
1283     return inLen;
1284 }
1285
1286 size_t
1287 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1288                                  const wchar_t *src, size_t srcLen) const
1289 {
1290     if ( srcLen == wxNO_LEN )
1291         srcLen = wxWcslen(src) + 1;
1292
1293     srcLen *= BYTES_PER_CHAR;
1294
1295     if ( dst )
1296     {
1297         if ( dstLen < srcLen )
1298             return wxCONV_FAILED;
1299
1300         memcpy(dst, src, srcLen);
1301     }
1302
1303     return srcLen;
1304 }
1305
1306 // ----------------------------------------------------------------------------
1307 // endian-reversing conversions
1308 // ----------------------------------------------------------------------------
1309
1310 size_t
1311 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1312                            const char *src, size_t srcLen) const
1313 {
1314     srcLen = GetLength(src, srcLen);
1315     if ( srcLen == wxNO_LEN )
1316         return wxCONV_FAILED;
1317
1318     srcLen /= BYTES_PER_CHAR;
1319
1320     if ( dst )
1321     {
1322         if ( dstLen < srcLen )
1323             return wxCONV_FAILED;
1324
1325         const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1326         for ( size_t n = 0; n < srcLen; n++, inBuff++ )
1327         {
1328             *dst++ = wxUINT16_SWAP_ALWAYS(*inBuff);
1329         }
1330     }
1331
1332     return srcLen;
1333 }
1334
1335 size_t
1336 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1337                              const wchar_t *src, size_t srcLen) const
1338 {
1339     if ( srcLen == wxNO_LEN )
1340         srcLen = wxWcslen(src) + 1;
1341
1342     srcLen *= BYTES_PER_CHAR;
1343
1344     if ( dst )
1345     {
1346         if ( dstLen < srcLen )
1347             return wxCONV_FAILED;
1348
1349         wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1350         for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
1351         {
1352             *outBuff++ = wxUINT16_SWAP_ALWAYS(*src);
1353         }
1354     }
1355
1356     return srcLen;
1357 }
1358
1359 #else // !WC_UTF16: wchar_t is UTF-32
1360
1361 // ----------------------------------------------------------------------------
1362 // conversions without endianness change
1363 // ----------------------------------------------------------------------------
1364
1365 size_t
1366 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1367                                const char *src, size_t srcLen) const
1368 {
1369     srcLen = GetLength(src, srcLen);
1370     if ( srcLen == wxNO_LEN )
1371         return wxCONV_FAILED;
1372
1373     const size_t inLen = srcLen / BYTES_PER_CHAR;
1374     if ( !dst )
1375     {
1376         // optimization: return maximal space which could be needed for this
1377         // string even if the real size could be smaller if the buffer contains
1378         // any surrogates
1379         return inLen;
1380     }
1381
1382     size_t outLen = 0;
1383     const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1384     for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1385     {
1386         const wxUint32 ch = wxDecodeSurrogate(&inBuff);
1387         if ( !inBuff )
1388             return wxCONV_FAILED;
1389
1390         if ( ++outLen > dstLen )
1391             return wxCONV_FAILED;
1392
1393         *dst++ = ch;
1394     }
1395
1396
1397     return outLen;
1398 }
1399
1400 size_t
1401 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1402                                  const wchar_t *src, size_t srcLen) const
1403 {
1404     if ( srcLen == wxNO_LEN )
1405         srcLen = wxWcslen(src) + 1;
1406
1407     size_t outLen = 0;
1408     wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1409     for ( size_t n = 0; n < srcLen; n++ )
1410     {
1411         wxUint16 cc[2];
1412         const size_t numChars = encode_utf16(*src++, cc);
1413         if ( numChars == wxCONV_FAILED )
1414             return wxCONV_FAILED;
1415
1416         outLen += numChars * BYTES_PER_CHAR;
1417         if ( outBuff )
1418         {
1419             if ( outLen > dstLen )
1420                 return wxCONV_FAILED;
1421
1422             *outBuff++ = cc[0];
1423             if ( numChars == 2 )
1424             {
1425                 // second character of a surrogate
1426                 *outBuff++ = cc[1];
1427             }
1428         }
1429     }
1430
1431     return outLen;
1432 }
1433
1434 // ----------------------------------------------------------------------------
1435 // endian-reversing conversions
1436 // ----------------------------------------------------------------------------
1437
1438 size_t
1439 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1440                            const char *src, size_t srcLen) const
1441 {
1442     srcLen = GetLength(src, srcLen);
1443     if ( srcLen == wxNO_LEN )
1444         return wxCONV_FAILED;
1445
1446     const size_t inLen = srcLen / BYTES_PER_CHAR;
1447     if ( !dst )
1448     {
1449         // optimization: return maximal space which could be needed for this
1450         // string even if the real size could be smaller if the buffer contains
1451         // any surrogates
1452         return inLen;
1453     }
1454
1455     size_t outLen = 0;
1456     const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1457     for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1458     {
1459         wxUint32 ch;
1460         wxUint16 tmp[2];
1461
1462         tmp[0] = wxUINT16_SWAP_ALWAYS(*inBuff);
1463         inBuff++;
1464         tmp[1] = wxUINT16_SWAP_ALWAYS(*inBuff);
1465
1466         const size_t numChars = decode_utf16(tmp, ch);
1467         if ( numChars == wxCONV_FAILED )
1468             return wxCONV_FAILED;
1469
1470         if ( numChars == 2 )
1471             inBuff++;
1472
1473         if ( ++outLen > dstLen )
1474             return wxCONV_FAILED;
1475
1476         *dst++ = ch;
1477     }
1478
1479
1480     return outLen;
1481 }
1482
1483 size_t
1484 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1485                              const wchar_t *src, size_t srcLen) const
1486 {
1487     if ( srcLen == wxNO_LEN )
1488         srcLen = wxWcslen(src) + 1;
1489
1490     size_t outLen = 0;
1491     wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1492     for ( const wchar_t *srcEnd = src + srcLen; src < srcEnd; src++ )
1493     {
1494         wxUint16 cc[2];
1495         const size_t numChars = encode_utf16(*src, cc);
1496         if ( numChars == wxCONV_FAILED )
1497             return wxCONV_FAILED;
1498
1499         outLen += numChars * BYTES_PER_CHAR;
1500         if ( outBuff )
1501         {
1502             if ( outLen > dstLen )
1503                 return wxCONV_FAILED;
1504
1505             *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[0]);
1506             if ( numChars == 2 )
1507             {
1508                 // second character of a surrogate
1509                 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[1]);
1510             }
1511         }
1512     }
1513
1514     return outLen;
1515 }
1516
1517 #endif // WC_UTF16/!WC_UTF16
1518
1519
1520 // ============================================================================
1521 // UTF-32
1522 // ============================================================================
1523
// The code below is written in terms of "straight" and "swap" converters;
// these macros map the two names onto the concrete classes: "straight" is
// wxMBConvUTF32BE if WORDS_BIGENDIAN is defined and wxMBConvUTF32LE
// otherwise, and "swap" is the other class of the pair.
#ifdef WORDS_BIGENDIAN
    #define wxMBConvUTF32straight  wxMBConvUTF32BE
    #define wxMBConvUTF32swap      wxMBConvUTF32LE
#else
    #define wxMBConvUTF32swap      wxMBConvUTF32BE
    #define wxMBConvUTF32straight  wxMBConvUTF32LE
#endif


// the global UTF-32 converter objects, one for each byte order
WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
1535
1536 /* static */
1537 size_t wxMBConvUTF32Base::GetLength(const char *src, size_t srcLen)
1538 {
1539     if ( srcLen == wxNO_LEN )
1540     {
1541         // count the number of bytes in input, including the trailing NULs
1542         const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1543         for ( srcLen = 1; *inBuff++; srcLen++ )
1544             ;
1545
1546         srcLen *= BYTES_PER_CHAR;
1547     }
1548     else // we already have the length
1549     {
1550         // we can only convert an entire number of UTF-32 characters
1551         if ( srcLen % BYTES_PER_CHAR )
1552             return wxCONV_FAILED;
1553     }
1554
1555     return srcLen;
1556 }
1557
1558 // case when in-memory representation is UTF-16
1559 #ifdef WC_UTF16
1560
1561 // ----------------------------------------------------------------------------
1562 // conversions without endianness change
1563 // ----------------------------------------------------------------------------
1564
1565 size_t
1566 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1567                                const char *src, size_t srcLen) const
1568 {
1569     srcLen = GetLength(src, srcLen);
1570     if ( srcLen == wxNO_LEN )
1571         return wxCONV_FAILED;
1572
1573     const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1574     const size_t inLen = srcLen / BYTES_PER_CHAR;
1575     size_t outLen = 0;
1576     for ( size_t n = 0; n < inLen; n++ )
1577     {
1578         wxUint16 cc[2];
1579         const size_t numChars = encode_utf16(*inBuff++, cc);
1580         if ( numChars == wxCONV_FAILED )
1581             return wxCONV_FAILED;
1582
1583         outLen += numChars;
1584         if ( dst )
1585         {
1586             if ( outLen > dstLen )
1587                 return wxCONV_FAILED;
1588
1589             *dst++ = cc[0];
1590             if ( numChars == 2 )
1591             {
1592                 // second character of a surrogate
1593                 *dst++ = cc[1];
1594             }
1595         }
1596     }
1597
1598     return outLen;
1599 }
1600
1601 size_t
1602 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1603                                  const wchar_t *src, size_t srcLen) const
1604 {
1605     if ( srcLen == wxNO_LEN )
1606         srcLen = wxWcslen(src) + 1;
1607
1608     if ( !dst )
1609     {
1610         // optimization: return maximal space which could be needed for this
1611         // string instead of the exact amount which could be less if there are
1612         // any surrogates in the input
1613         //
1614         // we consider that surrogates are rare enough to make it worthwhile to
1615         // avoid running the loop below at the cost of slightly extra memory
1616         // consumption
1617         return srcLen * BYTES_PER_CHAR;
1618     }
1619
1620     wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1621     size_t outLen = 0;
1622     for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1623     {
1624         const wxUint32 ch = wxDecodeSurrogate(&src);
1625         if ( !src )
1626             return wxCONV_FAILED;
1627
1628         outLen += BYTES_PER_CHAR;
1629
1630         if ( outLen > dstLen )
1631             return wxCONV_FAILED;
1632
1633         *outBuff++ = ch;
1634     }
1635
1636     return outLen;
1637 }
1638
1639 // ----------------------------------------------------------------------------
1640 // endian-reversing conversions
1641 // ----------------------------------------------------------------------------
1642
1643 size_t
1644 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1645                            const char *src, size_t srcLen) const
1646 {
1647     srcLen = GetLength(src, srcLen);
1648     if ( srcLen == wxNO_LEN )
1649         return wxCONV_FAILED;
1650
1651     const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1652     const size_t inLen = srcLen / BYTES_PER_CHAR;
1653     size_t outLen = 0;
1654     for ( size_t n = 0; n < inLen; n++, inBuff++ )
1655     {
1656         wxUint16 cc[2];
1657         const size_t numChars = encode_utf16(wxUINT32_SWAP_ALWAYS(*inBuff), cc);
1658         if ( numChars == wxCONV_FAILED )
1659             return wxCONV_FAILED;
1660
1661         outLen += numChars;
1662         if ( dst )
1663         {
1664             if ( outLen > dstLen )
1665                 return wxCONV_FAILED;
1666
1667             *dst++ = cc[0];
1668             if ( numChars == 2 )
1669             {
1670                 // second character of a surrogate
1671                 *dst++ = cc[1];
1672             }
1673         }
1674     }
1675
1676     return outLen;
1677 }
1678
1679 size_t
1680 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1681                              const wchar_t *src, size_t srcLen) const
1682 {
1683     if ( srcLen == wxNO_LEN )
1684         srcLen = wxWcslen(src) + 1;
1685
1686     if ( !dst )
1687     {
1688         // optimization: return maximal space which could be needed for this
1689         // string instead of the exact amount which could be less if there are
1690         // any surrogates in the input
1691         //
1692         // we consider that surrogates are rare enough to make it worthwhile to
1693         // avoid running the loop below at the cost of slightly extra memory
1694         // consumption
1695         return srcLen*BYTES_PER_CHAR;
1696     }
1697
1698     wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1699     size_t outLen = 0;
1700     for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1701     {
1702         const wxUint32 ch = wxDecodeSurrogate(&src);
1703         if ( !src )
1704             return wxCONV_FAILED;
1705
1706         outLen += BYTES_PER_CHAR;
1707
1708         if ( outLen > dstLen )
1709             return wxCONV_FAILED;
1710
1711         *outBuff++ = wxUINT32_SWAP_ALWAYS(ch);
1712     }
1713
1714     return outLen;
1715 }
1716
1717 #else // !WC_UTF16: wchar_t is UTF-32
1718
1719 // ----------------------------------------------------------------------------
1720 // conversions without endianness change
1721 // ----------------------------------------------------------------------------
1722
1723 size_t
1724 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1725                                const char *src, size_t srcLen) const
1726 {
1727     // use memcpy() as it should be much faster than hand-written loop
1728     srcLen = GetLength(src, srcLen);
1729     if ( srcLen == wxNO_LEN )
1730         return wxCONV_FAILED;
1731
1732     const size_t inLen = srcLen/BYTES_PER_CHAR;
1733     if ( dst )
1734     {
1735         if ( dstLen < inLen )
1736             return wxCONV_FAILED;
1737
1738         memcpy(dst, src, srcLen);
1739     }
1740
1741     return inLen;
1742 }
1743
1744 size_t
1745 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1746                                  const wchar_t *src, size_t srcLen) const
1747 {
1748     if ( srcLen == wxNO_LEN )
1749         srcLen = wxWcslen(src) + 1;
1750
1751     srcLen *= BYTES_PER_CHAR;
1752
1753     if ( dst )
1754     {
1755         if ( dstLen < srcLen )
1756             return wxCONV_FAILED;
1757
1758         memcpy(dst, src, srcLen);
1759     }
1760
1761     return srcLen;
1762 }
1763
1764 // ----------------------------------------------------------------------------
1765 // endian-reversing conversions
1766 // ----------------------------------------------------------------------------
1767
1768 size_t
1769 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1770                            const char *src, size_t srcLen) const
1771 {
1772     srcLen = GetLength(src, srcLen);
1773     if ( srcLen == wxNO_LEN )
1774         return wxCONV_FAILED;
1775
1776     srcLen /= BYTES_PER_CHAR;
1777
1778     if ( dst )
1779     {
1780         if ( dstLen < srcLen )
1781             return wxCONV_FAILED;
1782
1783         const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1784         for ( size_t n = 0; n < srcLen; n++, inBuff++ )
1785         {
1786             *dst++ = wxUINT32_SWAP_ALWAYS(*inBuff);
1787         }
1788     }
1789
1790     return srcLen;
1791 }
1792
1793 size_t
1794 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1795                              const wchar_t *src, size_t srcLen) const
1796 {
1797     if ( srcLen == wxNO_LEN )
1798         srcLen = wxWcslen(src) + 1;
1799
1800     srcLen *= BYTES_PER_CHAR;
1801
1802     if ( dst )
1803     {
1804         if ( dstLen < srcLen )
1805             return wxCONV_FAILED;
1806
1807         wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1808         for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
1809         {
1810             *outBuff++ = wxUINT32_SWAP_ALWAYS(*src);
1811         }
1812     }
1813
1814     return srcLen;
1815 }
1816
1817 #endif // WC_UTF16/!WC_UTF16
1818
1819
1820 // ============================================================================
1821 // The classes doing conversion using the iconv_xxx() functions
1822 // ============================================================================
1823
1824 #ifdef HAVE_ICONV
1825
1826 // VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
1827 //     E2BIG if output buffer is _exactly_ as big as needed. Such case is
1828 //     (unless there's yet another bug in glibc) the only case when iconv()
1829 //     returns with (size_t)-1 (which means error) and says there are 0 bytes
1830 //     left in the input buffer -- when _real_ error occurs,
1831 //     bytes-left-in-input buffer is non-zero. Hence, this alternative test for
1832 //     iconv() failure.
1833 //     [This bug does not appear in glibc 2.2.]
// ICONV_FAILED(cres, bufLeft) tests the result cres of an iconv() call: with
// glibc 2.0 and 2.1 a (size_t)-1 result is not considered to be a failure if
// errno is E2BIG and bufLeft is 0, everywhere else any (size_t)-1 result is
// a failure and bufLeft is not used
#if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
#define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
                                     (errno != E2BIG || bufLeft != 0))
#else
#define ICONV_FAILED(cres, bufLeft)  (cres == (size_t)-1)
#endif

// cast a pointer to the type spelled using ICONV_CONST (which is not defined
// in this file)
#define ICONV_CHAR_CAST(x)  ((ICONV_CONST char **)(x))

// the value compared with the conversion descriptors to detect that
// iconv_open() didn't succeed
#define ICONV_T_INVALID ((iconv_t)-1)

// select the byte swapping macro and the font encoding constant matching the
// size of wchar_t
#if SIZEOF_WCHAR_T == 4
    #define WC_BSWAP    wxUINT32_SWAP_ALWAYS
    #define WC_ENC      wxFONTENCODING_UTF32
#elif SIZEOF_WCHAR_T == 2
    #define WC_BSWAP    wxUINT16_SWAP_ALWAYS
    #define WC_ENC      wxFONTENCODING_UTF16
#else // sizeof(wchar_t) != 2 nor 4
    // does this ever happen?
    #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
#endif
1855
1856 // ----------------------------------------------------------------------------
1857 // wxMBConv_iconv: encapsulates an iconv character set
1858 // ----------------------------------------------------------------------------
1859
// Conversion between wchar_t and a multibyte encoding implemented using a
// pair of iconv conversion descriptors, one for each direction.
class wxMBConv_iconv : public wxMBConv
{
public:
    // name is the name of the multibyte encoding: it is passed to
    // iconv_open() and also remembered to be reused by Clone()
    wxMBConv_iconv(const char *name);
    virtual ~wxMBConv_iconv();

    // the conversion functions themselves, implemented elsewhere in this file
    virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const;
    virtual size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const;

    // classify this encoding as explained in wxMBConv::GetMBNulLen() comment
    virtual size_t GetMBNulLen() const;

#if wxUSE_UNICODE_UTF8
    virtual bool IsUTF8() const;
#endif

    // create a new converter for the same encoding name, copying the cached
    // m_minMBCharWidth value to it
    virtual wxMBConv *Clone() const
    {
        wxMBConv_iconv *p = new wxMBConv_iconv(m_name.ToAscii());
        p->m_minMBCharWidth = m_minMBCharWidth;
        return p;
    }

    // return true only if both conversion descriptors are valid
    bool IsOk() const
        { return (m2w != ICONV_T_INVALID) && (w2m != ICONV_T_INVALID); }

protected:
    // the iconv handlers used to translate from multibyte
    // to wide char and in the other direction
    iconv_t m2w,
            w2m;

#if wxUSE_THREADS
    // guards access to m2w and w2m objects
    wxMutex m_iconvMutex;
#endif

private:
    // the name (for iconv_open()) of a wide char charset -- if none is
    // available on this machine, it will remain empty
    static wxString ms_wcCharsetName;

    // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
    // different endian-ness than the native one
    static bool ms_wcNeedsSwap;


    // name of the encoding handled by this conversion
    wxString m_name;

    // cached result of GetMBNulLen(); set to 0 meaning "unknown"
    // initially
    size_t m_minMBCharWidth;
};
1914
1915 // make the constructor available for unit testing
1916 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_iconv( const char* name )
1917 {
1918     wxMBConv_iconv* result = new wxMBConv_iconv( name );
1919     if ( !result->IsOk() )
1920     {
1921         delete result;
1922         return 0;
1923     }
1924
1925     return result;
1926 }
1927
// Definitions of the static members shared by all wxMBConv_iconv objects.
//
// The charset name starts out empty: the wxMBConv_iconv constructor looks for
// a wide char charset accepted by iconv_open() while it is empty and stores
// the first one that works here, together with the byte swapping flag.
wxString wxMBConv_iconv::ms_wcCharsetName;
bool wxMBConv_iconv::ms_wcNeedsSwap = false;
1930
1931 wxMBConv_iconv::wxMBConv_iconv(const char *name)
1932               : m_name(name)
1933 {
1934     m_minMBCharWidth = 0;
1935
1936     // check for charset that represents wchar_t:
1937     if ( ms_wcCharsetName.empty() )
1938     {
1939         wxLogTrace(TRACE_STRCONV, _T("Looking for wide char codeset:"));
1940
1941 #if wxUSE_FONTMAP
1942         const wxChar **names = wxFontMapperBase::GetAllEncodingNames(WC_ENC);
1943 #else // !wxUSE_FONTMAP
1944         static const wxChar *names_static[] =
1945         {
1946 #if SIZEOF_WCHAR_T == 4
1947             _T("UCS-4"),
1948 #elif SIZEOF_WCHAR_T = 2
1949             _T("UCS-2"),
1950 #endif
1951             NULL
1952         };
1953         const wxChar **names = names_static;
1954 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
1955
1956         for ( ; *names && ms_wcCharsetName.empty(); ++names )
1957         {
1958             const wxString nameCS(*names);
1959
1960             // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
1961             wxString nameXE(nameCS);
1962
1963 #ifdef WORDS_BIGENDIAN
1964                 nameXE += _T("BE");
1965 #else // little endian
1966                 nameXE += _T("LE");
1967 #endif
1968
1969             wxLogTrace(TRACE_STRCONV, _T("  trying charset \"%s\""),
1970                        nameXE.c_str());
1971
1972             m2w = iconv_open(nameXE.ToAscii(), name);
1973             if ( m2w == ICONV_T_INVALID )
1974             {
1975                 // try charset w/o bytesex info (e.g. "UCS4")
1976                 wxLogTrace(TRACE_STRCONV, _T("  trying charset \"%s\""),
1977                            nameCS.c_str());
1978                 m2w = iconv_open(nameCS.ToAscii(), name);
1979
1980                 // and check for bytesex ourselves:
1981                 if ( m2w != ICONV_T_INVALID )
1982                 {
1983                     char    buf[2], *bufPtr;
1984                     wchar_t wbuf[2];
1985                     size_t  insz, outsz;
1986                     size_t  res;
1987
1988                     buf[0] = 'A';
1989                     buf[1] = 0;
1990                     wbuf[0] = 0;
1991                     insz = 2;
1992                     outsz = SIZEOF_WCHAR_T * 2;
1993                     char* wbufPtr = (char*)wbuf;
1994                     bufPtr = buf;
1995
1996                     res = iconv(
1997                         m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
1998                         &wbufPtr, &outsz);
1999
2000                     if (ICONV_FAILED(res, insz))
2001                     {
2002                         wxLogLastError(wxT("iconv"));
2003                         wxLogError(_("Conversion to charset '%s' doesn't work."),
2004                                    nameCS.c_str());
2005                     }
2006                     else // ok, can convert to this encoding, remember it
2007                     {
2008                         ms_wcCharsetName = nameCS;
2009                         ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
2010                     }
2011                 }
2012             }
2013             else // use charset not requiring byte swapping
2014             {
2015                 ms_wcCharsetName = nameXE;
2016             }
2017         }
2018
2019         wxLogTrace(TRACE_STRCONV,
2020                    wxT("iconv wchar_t charset is \"%s\"%s"),
2021                    ms_wcCharsetName.empty() ? wxString("<none>")
2022                                             : ms_wcCharsetName,
2023                    ms_wcNeedsSwap ? _T(" (needs swap)")
2024                                   : _T(""));
2025     }
2026     else // we already have ms_wcCharsetName
2027     {
2028         m2w = iconv_open(ms_wcCharsetName.ToAscii(), name);
2029     }
2030
2031     if ( ms_wcCharsetName.empty() )
2032     {
2033         w2m = ICONV_T_INVALID;
2034     }
2035     else
2036     {
2037         w2m = iconv_open(name, ms_wcCharsetName.ToAscii());
2038         if ( w2m == ICONV_T_INVALID )
2039         {
2040             wxLogTrace(TRACE_STRCONV,
2041                        wxT("\"%s\" -> \"%s\" works but not the converse!?"),
2042                        ms_wcCharsetName.c_str(), name);
2043         }
2044     }
2045 }
2046
2047 wxMBConv_iconv::~wxMBConv_iconv()
2048 {
2049     if ( m2w != ICONV_T_INVALID )
2050         iconv_close(m2w);
2051     if ( w2m != ICONV_T_INVALID )
2052         iconv_close(w2m);
2053 }
2054
2055 size_t wxMBConv_iconv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
2056 {
2057     // find the string length: notice that must be done differently for
2058     // NUL-terminated strings and UTF-16/32 which are terminated with 2/4 NULs
2059     size_t inbuf;
2060     const size_t nulLen = GetMBNulLen();
2061     switch ( nulLen )
2062     {
2063         default:
2064             return wxCONV_FAILED;
2065
2066         case 1:
2067             inbuf = strlen(psz); // arguably more optimized than our version
2068             break;
2069
2070         case 2:
2071         case 4:
2072             // for UTF-16/32 not only we need to have 2/4 consecutive NULs but
2073             // they also have to start at character boundary and not span two
2074             // adjacent characters
2075             const char *p;
2076             for ( p = psz; NotAllNULs(p, nulLen); p += nulLen )
2077                 ;
2078             inbuf = p - psz;
2079             break;
2080     }
2081
2082 #if wxUSE_THREADS
2083     // NB: iconv() is MT-safe, but each thread must use its own iconv_t handle.
2084     //     Unfortunately there are a couple of global wxCSConv objects such as
2085     //     wxConvLocal that are used all over wx code, so we have to make sure
2086     //     the handle is used by at most one thread at the time. Otherwise
2087     //     only a few wx classes would be safe to use from non-main threads
2088     //     as MB<->WC conversion would fail "randomly".
2089     wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
2090 #endif // wxUSE_THREADS
2091
2092     size_t outbuf = n * SIZEOF_WCHAR_T;
2093     size_t res, cres;
2094     const char *pszPtr = psz;
2095
2096     if (buf)
2097     {
2098         char* bufPtr = (char*)buf;
2099
2100         // have destination buffer, convert there
2101         cres = iconv(m2w,
2102                      ICONV_CHAR_CAST(&pszPtr), &inbuf,
2103                      &bufPtr, &outbuf);
2104         res = n - (outbuf / SIZEOF_WCHAR_T);
2105
2106         if (ms_wcNeedsSwap)
2107         {
2108             // convert to native endianness
2109             for ( unsigned i = 0; i < res; i++ )
2110                 buf[n] = WC_BSWAP(buf[i]);
2111         }
2112
2113         // NUL-terminate the string if there is any space left
2114         if (res < n)
2115             buf[res] = 0;
2116     }
2117     else
2118     {
2119         // no destination buffer... convert using temp buffer
2120         // to calculate destination buffer requirement
2121         wchar_t tbuf[8];
2122         res = 0;
2123
2124         do
2125         {
2126             char* bufPtr = (char*)tbuf;
2127             outbuf = 8 * SIZEOF_WCHAR_T;
2128
2129             cres = iconv(m2w,
2130                          ICONV_CHAR_CAST(&pszPtr), &inbuf,
2131                          &bufPtr, &outbuf );
2132
2133             res += 8 - (outbuf / SIZEOF_WCHAR_T);
2134         }
2135         while ((cres == (size_t)-1) && (errno == E2BIG));
2136     }
2137
2138     if (ICONV_FAILED(cres, inbuf))
2139     {
2140         //VS: it is ok if iconv fails, hence trace only
2141         wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
2142         return wxCONV_FAILED;
2143     }
2144
2145     return res;
2146 }
2147
2148 size_t wxMBConv_iconv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
2149 {
2150 #if wxUSE_THREADS
2151     // NB: explained in MB2WC
2152     wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
2153 #endif
2154
2155     size_t inlen = wxWcslen(psz);
2156     size_t inbuflen = inlen * SIZEOF_WCHAR_T;
2157     size_t outbuflen = n;
2158     size_t res, cres;
2159
2160     wchar_t *tmpbuf = 0;
2161
2162     if (ms_wcNeedsSwap)
2163     {
2164         // need to copy to temp buffer to switch endianness
2165         // (doing WC_BSWAP twice on the original buffer won't help, as it
2166         //  could be in read-only memory, or be accessed in some other thread)
2167         tmpbuf = (wchar_t *)malloc(inbuflen + SIZEOF_WCHAR_T);
2168         for ( size_t i = 0; i < inlen; i++ )
2169             tmpbuf[n] = WC_BSWAP(psz[i]);
2170
2171         tmpbuf[inlen] = L'\0';
2172         psz = tmpbuf;
2173     }
2174
2175     char* inbuf = (char*)psz;
2176     if (buf)
2177     {
2178         // have destination buffer, convert there
2179         cres = iconv(w2m, ICONV_CHAR_CAST(&inbuf), &inbuflen, &buf, &outbuflen);
2180
2181         res = n - outbuflen;
2182
2183         // NB: iconv was given only wcslen(psz) characters on input, and so
2184         //     it couldn't convert the trailing zero. Let's do it ourselves
2185         //     if there's some room left for it in the output buffer.
2186         if (res < n)
2187             buf[0] = 0;
2188     }
2189     else
2190     {
2191         // no destination buffer: convert using temp buffer
2192         // to calculate destination buffer requirement
2193         char tbuf[16];
2194         res = 0;
2195         do
2196         {
2197             buf = tbuf;
2198             outbuflen = 16;
2199
2200             cres = iconv(w2m, ICONV_CHAR_CAST(&inbuf), &inbuflen, &buf, &outbuflen);
2201
2202             res += 16 - outbuflen;
2203         }
2204         while ((cres == (size_t)-1) && (errno == E2BIG));
2205     }
2206
2207     if (ms_wcNeedsSwap)
2208     {
2209         free(tmpbuf);
2210     }
2211
2212     if (ICONV_FAILED(cres, inbuflen))
2213     {
2214         wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
2215         return wxCONV_FAILED;
2216     }
2217
2218     return res;
2219 }
2220
2221 size_t wxMBConv_iconv::GetMBNulLen() const
2222 {
2223     if ( m_minMBCharWidth == 0 )
2224     {
2225         wxMBConv_iconv * const self = wxConstCast(this, wxMBConv_iconv);
2226
2227 #if wxUSE_THREADS
2228         // NB: explained in MB2WC
2229         wxMutexLocker lock(self->m_iconvMutex);
2230 #endif
2231
2232         const wchar_t *wnul = L"";
2233         char buf[8]; // should be enough for NUL in any encoding
2234         size_t inLen = sizeof(wchar_t),
2235                outLen = WXSIZEOF(buf);
2236         char *inBuff = (char *)wnul;
2237         char *outBuff = buf;
2238         if ( iconv(w2m, ICONV_CHAR_CAST(&inBuff), &inLen, &outBuff, &outLen) == (size_t)-1 )
2239         {
2240             self->m_minMBCharWidth = (size_t)-1;
2241         }
2242         else // ok
2243         {
2244             self->m_minMBCharWidth = outBuff - buf;
2245         }
2246     }
2247
2248     return m_minMBCharWidth;
2249 }
2250
#if wxUSE_UNICODE_UTF8
// Check whether the charset name this converter was created with is one of
// the two spellings of UTF-8 recognized here, ignoring case.
bool wxMBConv_iconv::IsUTF8() const
{
    if ( wxStricmp(m_name, "UTF-8") == 0 )
        return true;

    return wxStricmp(m_name, "UTF8") == 0;
}
#endif
2258
2259 #endif // HAVE_ICONV
2260
2261
2262 // ============================================================================
2263 // Win32 conversion classes
2264 // ============================================================================
2265
2266 #ifdef wxHAVE_WIN32_MB2WC
2267
// from utils.cpp
//
// These functions are used by the wxMBConv_win32 constructors to obtain the
// code page value from a charset name or a wxFontEncoding respectively.
// NOTE(review): wxMBConv_win32::IsOk() treats -1 as "no code page", so they
// presumably return -1 on failure -- confirm against utils.cpp.
#if wxUSE_FONTMAP
extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const char *charset);
extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
#endif
2273
// Conversion class using the Win32 MultiByteToWideChar() and
// WideCharToMultiByte() functions for the code page stored in m_CodePage.
//
// Conversions which can't be done exactly are reported as failures: this is
// ensured either by passing the appropriate flags to the Win32 functions or,
// when the flags can't be used, by converting the result back and comparing
// it with the input.
class wxMBConv_win32 : public wxMBConv
{
public:
    // default ctor uses the CP_ACP code page
    wxMBConv_win32()
    {
        m_CodePage = CP_ACP;
        m_minMBCharWidth = 0;
    }

    // copy ctor: copies the code page and the cached NUL length
    wxMBConv_win32(const wxMBConv_win32& conv)
        : wxMBConv()
    {
        m_CodePage = conv.m_CodePage;
        m_minMBCharWidth = conv.m_minMBCharWidth;
    }

#if wxUSE_FONTMAP
    // create the conversion for the given charset name, use IsOk() to check
    // if a code page was found for it
    wxMBConv_win32(const char* name)
    {
        m_CodePage = wxCharsetToCodepage(name);
        m_minMBCharWidth = 0;
    }

    // create the conversion for the given encoding, use IsOk() to check if a
    // code page was found for it
    wxMBConv_win32(wxFontEncoding encoding)
    {
        m_CodePage = wxEncodingToCodepage(encoding);
        m_minMBCharWidth = 0;
    }
#endif // wxUSE_FONTMAP

    // Convert the NUL-terminated multibyte string psz to wide characters.
    //
    // If buf is NULL only the length is computed, otherwise the result is
    // stored in buf, n being its size. Returns the length of the result not
    // counting the terminating NUL or wxCONV_FAILED if the conversion failed
    // or didn't round trip.
    virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
    {
        // note that we have to use MB_ERR_INVALID_CHARS flag as without it
        // the behaviour is not compatible with the Unix version (using iconv)
        // and break the library itself, e.g. wxTextInputStream::NextChar()
        // wouldn't work if reading an incomplete MB char didn't result in an
        // error
        //
        // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
        // Win XP or newer and it is not supported for UTF-[78] so we always
        // use our own conversions in this case. See
        //     http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
        //     http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
        if ( m_CodePage == CP_UTF8 )
        {
            return wxMBConvUTF8().MB2WC(buf, psz, n);
        }

        if ( m_CodePage == CP_UTF7 )
        {
            return wxMBConvUTF7().MB2WC(buf, psz, n);
        }

        // only ask for strict error checking for the code pages and the
        // system versions for which we believe it to be supported
        int flags = 0;
        if ( (m_CodePage < 50000 && m_CodePage != CP_SYMBOL) &&
                IsAtLeastWin2kSP4() )
        {
            flags = MB_ERR_INVALID_CHARS;
        }

        const size_t len = ::MultiByteToWideChar
                             (
                                m_CodePage,     // code page
                                flags,          // flags: fail on error
                                psz,            // input string
                                -1,             // its length (NUL-terminated)
                                buf,            // output string
                                buf ? n : 0     // size of output buffer
                             );
        if ( !len )
        {
            // function totally failed
            return wxCONV_FAILED;
        }

        // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
        // check if we succeeded, by doing a double trip:
        if ( !flags && buf )
        {
            const size_t mbLen = strlen(psz);
            wxCharBuffer mbBuf(mbLen);
            if ( ::WideCharToMultiByte
                   (
                      m_CodePage,
                      0,
                      buf,
                      -1,
                      mbBuf.data(),
                      mbLen + 1,        // size in bytes, not length
                      NULL,
                      NULL
                   ) == 0 ||
                  strcmp(mbBuf, psz) != 0 )
            {
                // we didn't obtain the same thing we started from, hence
                // the conversion was lossy and we consider that it failed
                return wxCONV_FAILED;
            }
        }

        // note that it returns count of written chars for buf != NULL and size
        // of the needed buffer for buf == NULL so in either case the length of
        // the string (which never includes the terminating NUL) is one less
        return len - 1;
    }

    // Convert the NUL-terminated wide string pwz to multibyte.
    //
    // If buf is NULL only the length is computed, otherwise the result is
    // stored in buf, n being its size. Returns the length of the result not
    // counting the terminating NUL or wxCONV_FAILED if the conversion failed
    // or was lossy.
    virtual size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
    {
        /*
            we have a problem here: by default, WideCharToMultiByte() may
            replace characters unrepresentable in the target code page with bad
            quality approximations such as turning "1/2" symbol (U+00BD) into
            "1" for the code pages which don't have it and we, obviously, want
            to avoid this at any price

            the trouble is that this function does it _silently_, i.e. it won't
            even tell us whether it did or not... Win98/2000 and higher provide
            WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
            we have to resort to a round trip, i.e. check that converting back
            results in the same string -- this is, of course, expensive but
            otherwise we simply can't be sure to not garble the data.
         */

        // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
        // it doesn't work with CJK encodings (which we test for rather roughly
        // here...) nor with UTF-7/8 nor, of course, with Windows versions not
        // supporting it
        BOOL usedDef wxDUMMY_INITIALIZE(false);
        BOOL *pUsedDef;
        int flags;
        if ( CanUseNoBestFit() && m_CodePage < 50000 )
        {
            // it's our lucky day
            flags = WC_NO_BEST_FIT_CHARS;
            pUsedDef = &usedDef;
        }
        else // old system or unsupported encoding
        {
            flags = 0;
            pUsedDef = NULL;
        }

        const size_t len = ::WideCharToMultiByte
                             (
                                m_CodePage,     // code page
                                flags,          // either none or no best fit
                                pwz,            // input string
                                -1,             // it is (wide) NUL-terminated
                                buf,            // output buffer
                                buf ? n : 0,    // and its size
                                NULL,           // default "replacement" char
                                pUsedDef        // [out] was it used?
                             );

        if ( !len )
        {
            // function totally failed
            return wxCONV_FAILED;
        }

        // we did something, check if we really succeeded
        if ( flags )
        {
            // check if the conversion failed, i.e. if any replacements
            // were done
            if ( usedDef )
                return wxCONV_FAILED;
        }
        else // we must resort to double tripping...
        {
            // first we need to ensure that we really have the MB data: this is
            // not the case if we're called with NULL buffer, in which case we
            // need to do the conversion yet again
            wxCharBuffer bufDef;
            if ( !buf )
            {
                bufDef = wxCharBuffer(len);
                buf = bufDef.data();
                if ( !::WideCharToMultiByte(m_CodePage, flags, pwz, -1,
                                            buf, len, NULL, NULL) )
                    return wxCONV_FAILED;
            }

            // convert the multibyte data back to wide chars and compare the
            // result with the original string
            if ( !n )
                n = wcslen(pwz);
            wxWCharBuffer wcBuf(n);
            if ( MB2WC(wcBuf.data(), buf, n + 1) == wxCONV_FAILED ||
                    wcscmp(wcBuf, pwz) != 0 )
            {
                // we didn't obtain the same thing we started from, hence
                // the conversion was lossy and we consider that it failed
                return wxCONV_FAILED;
            }
        }

        // see the comment above for the reason of "len - 1"
        return len - 1;
    }

    // Return the number of bytes used for NUL in this code page (1, 2 or 4)
    // or (size_t)-1 if it couldn't be determined. The result is cached in
    // m_minMBCharWidth after the first call.
    virtual size_t GetMBNulLen() const
    {
        if ( m_minMBCharWidth == 0 )
        {
            // ask for the size of the buffer needed to convert a single NUL
            int len = ::WideCharToMultiByte
                        (
                            m_CodePage,     // code page
                            0,              // no flags
                            L"",            // input string
                            1,              // translate just the NUL
                            NULL,           // output buffer
                            0,              // and its size
                            NULL,           // no replacement char
                            NULL            // [out] don't care if it was used
                        );

            wxMBConv_win32 * const self = wxConstCast(this, wxMBConv_win32);
            switch ( len )
            {
                default:
                    wxLogDebug(_T("Unexpected NUL length %d"), len);
                    self->m_minMBCharWidth = (size_t)-1;
                    break;

                case 0:
                    // the conversion failed, nothing to log in this case
                    self->m_minMBCharWidth = (size_t)-1;
                    break;

                case 1:
                case 2:
                case 4:
                    self->m_minMBCharWidth = len;
                    break;
            }
        }

        return m_minMBCharWidth;
    }

    virtual wxMBConv *Clone() const { return new wxMBConv_win32(*this); }

    // the conversion can be used only if we have a code page, -1 meaning
    // that we don't
    bool IsOk() const { return m_CodePage != -1; }

private:
    // Return true if WC_NO_BEST_FIT_CHARS can be used with the running
    // system version; the result of the version check is computed once and
    // kept in a static variable.
    static bool CanUseNoBestFit()
    {
        static int s_isWin98Or2k = -1;

        if ( s_isWin98Or2k == -1 )
        {
            int verMaj, verMin;
            switch ( wxGetOsVersion(&verMaj, &verMin) )
            {
                case wxOS_WINDOWS_9X:
                    s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
                    break;

                case wxOS_WINDOWS_NT:
                    s_isWin98Or2k = verMaj >= 5;
                    break;

                default:
                    // unknown: be conservative by default
                    s_isWin98Or2k = 0;
                    break;
            }

            wxASSERT_MSG( s_isWin98Or2k != -1, _T("should be set above") );
        }

        return s_isWin98Or2k == 1;
    }

    // Return true if the running system is Windows 2000 SP4 or later; this
    // is always false under Windows CE. The result of the version check is
    // computed once and kept in a static variable.
    static bool IsAtLeastWin2kSP4()
    {
#ifdef __WXWINCE__
        return false;
#else
        static int s_isAtLeastWin2kSP4 = -1;

        if ( s_isAtLeastWin2kSP4 == -1 )
        {
            OSVERSIONINFOEX ver;

            memset(&ver, 0, sizeof(ver));
            ver.dwOSVersionInfoSize = sizeof(ver);
            GetVersionEx((OSVERSIONINFO*)&ver);

            s_isAtLeastWin2kSP4 =
              ((ver.dwMajorVersion > 5) || // Vista+
               (ver.dwMajorVersion == 5 && ver.dwMinorVersion > 0) || // XP/2003
               (ver.dwMajorVersion == 5 && ver.dwMinorVersion == 0 &&
               ver.wServicePackMajor >= 4)) // 2000 SP4+
              ? 1 : 0;
        }

        return s_isAtLeastWin2kSP4 == 1;
#endif
    }


    // the code page we're working with
    long m_CodePage;

    // cached result of GetMBNulLen(), set to 0 initially meaning
    // "unknown"
    size_t m_minMBCharWidth;
};
2581
2582 #endif // wxHAVE_WIN32_MB2WC
2583
2584
2585 // ============================================================================
2586 // wxEncodingConverter based conversion classes
2587 // ============================================================================
2588
2589 #if wxUSE_FONTMAP
2590
2591 class wxMBConv_wxwin : public wxMBConv
2592 {
2593 private:
2594     void Init()
2595     {
2596         // Refuse to use broken wxEncodingConverter code for Mac-specific encodings.
2597         // The wxMBConv_cf class does a better job.
2598         m_ok = (m_enc < wxFONTENCODING_MACMIN || m_enc > wxFONTENCODING_MACMAX) &&
2599                m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
2600                w2m.Init(wxFONTENCODING_UNICODE, m_enc);
2601     }
2602
2603 public:
2604     // temporarily just use wxEncodingConverter stuff,
2605     // so that it works while a better implementation is built
2606     wxMBConv_wxwin(const char* name)
2607     {
2608         if (name)
2609             m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
2610         else
2611             m_enc = wxFONTENCODING_SYSTEM;
2612
2613         Init();
2614     }
2615
2616     wxMBConv_wxwin(wxFontEncoding enc)
2617     {
2618         m_enc = enc;
2619
2620         Init();
2621     }
2622
2623     size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
2624     {
2625         size_t inbuf = strlen(psz);
2626         if (buf)
2627         {
2628             if (!m2w.Convert(psz, buf))
2629                 return wxCONV_FAILED;
2630         }
2631         return inbuf;
2632     }
2633
2634     size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
2635     {
2636         const size_t inbuf = wxWcslen(psz);
2637         if (buf)
2638         {
2639             if (!w2m.Convert(psz, buf))
2640                 return wxCONV_FAILED;
2641         }
2642
2643         return inbuf;
2644     }
2645
2646     virtual size_t GetMBNulLen() const
2647     {
2648         switch ( m_enc )
2649         {
2650             case wxFONTENCODING_UTF16BE:
2651             case wxFONTENCODING_UTF16LE:
2652                 return 2;
2653
2654             case wxFONTENCODING_UTF32BE:
2655             case wxFONTENCODING_UTF32LE:
2656                 return 4;
2657
2658             default:
2659                 return 1;
2660         }
2661     }
2662
2663     virtual wxMBConv *Clone() const { return new wxMBConv_wxwin(m_enc); }
2664
2665     bool IsOk() const { return m_ok; }
2666
2667 public:
2668     wxFontEncoding m_enc;
2669     wxEncodingConverter m2w, w2m;
2670
2671 private:
2672     // were we initialized successfully?
2673     bool m_ok;
2674
2675     DECLARE_NO_COPY_CLASS(wxMBConv_wxwin)
2676 };
2677
2678 // make the constructors available for unit testing
2679 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_wxwin( const char* name )
2680 {
2681     wxMBConv_wxwin* result = new wxMBConv_wxwin( name );
2682     if ( !result->IsOk() )
2683     {
2684         delete result;
2685         return 0;
2686     }
2687
2688     return result;
2689 }
2690
2691 #endif // wxUSE_FONTMAP
2692
2693 // ============================================================================
2694 // wxCSConv implementation
2695 // ============================================================================
2696
2697 void wxCSConv::Init()
2698 {
2699     m_name = NULL;
2700     m_convReal =  NULL;
2701     m_deferred = true;
2702 }
2703
2704 wxCSConv::wxCSConv(const wxString& charset)
2705 {
2706     Init();
2707
2708     if ( !charset.empty() )
2709     {
2710         SetName(charset.ToAscii());
2711     }
2712
2713 #if wxUSE_FONTMAP
2714     m_encoding = wxFontMapperBase::GetEncodingFromName(charset);
2715 #else
2716     m_encoding = wxFONTENCODING_SYSTEM;
2717 #endif
2718 }
2719
2720 wxCSConv::wxCSConv(wxFontEncoding encoding)
2721 {
2722     if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
2723     {
2724         wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
2725
2726         encoding = wxFONTENCODING_SYSTEM;
2727     }
2728
2729     Init();
2730
2731     m_encoding = encoding;
2732 }
2733
2734 wxCSConv::~wxCSConv()
2735 {
2736     Clear();
2737 }
2738
2739 wxCSConv::wxCSConv(const wxCSConv& conv)
2740         : wxMBConv()
2741 {
2742     Init();
2743
2744     SetName(conv.m_name);
2745     m_encoding = conv.m_encoding;
2746 }
2747
2748 wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
2749 {
2750     Clear();
2751
2752     SetName(conv.m_name);
2753     m_encoding = conv.m_encoding;
2754
2755     return *this;
2756 }
2757
2758 void wxCSConv::Clear()
2759 {
2760     free(m_name);
2761     delete m_convReal;
2762
2763     m_name = NULL;
2764     m_convReal = NULL;
2765 }
2766
2767 void wxCSConv::SetName(const char *charset)
2768 {
2769     if (charset)
2770     {
2771         m_name = wxStrdup(charset);
2772         m_deferred = true;
2773     }
2774 }
2775
#if wxUSE_FONTMAP

// hash map type associating a string with a font encoding
WX_DECLARE_HASH_MAP( wxFontEncoding, wxString, wxIntegerHash, wxIntegerEqual,
                     wxEncodingNameCache );

// cache used by wxCSConv::DoCreate(): for each encoding it stores the charset
// name for which a working wxMBConv_iconv object could be created, or an empty
// string if none of the names known for this encoding worked
static wxEncodingNameCache gs_nameCache;
#endif
2783
2784 wxMBConv *wxCSConv::DoCreate() const
2785 {
2786 #if wxUSE_FONTMAP
2787     wxLogTrace(TRACE_STRCONV,
2788                wxT("creating conversion for %s"),
2789                (m_name ? m_name
2790                        : (const char*)wxFontMapperBase::GetEncodingName(m_encoding).mb_str()));
2791 #endif // wxUSE_FONTMAP
2792
2793     // check for the special case of ASCII or ISO8859-1 charset: as we have
2794     // special knowledge of it anyhow, we don't need to create a special
2795     // conversion object
2796     if ( m_encoding == wxFONTENCODING_ISO8859_1 ||
2797             m_encoding == wxFONTENCODING_DEFAULT )
2798     {
2799         // don't convert at all
2800         return NULL;
2801     }
2802
2803     // we trust OS to do conversion better than we can so try external
2804     // conversion methods first
2805     //
2806     // the full order is:
2807     //      1. OS conversion (iconv() under Unix or Win32 API)
2808     //      2. hard coded conversions for UTF
2809     //      3. wxEncodingConverter as fall back
2810
2811     // step (1)
2812 #ifdef HAVE_ICONV
2813 #if !wxUSE_FONTMAP
2814     if ( m_name )
2815 #endif // !wxUSE_FONTMAP
2816     {
2817 #if wxUSE_FONTMAP
2818         wxFontEncoding encoding(m_encoding);
2819 #endif
2820
2821         if ( m_name )
2822         {
2823             wxMBConv_iconv *conv = new wxMBConv_iconv(m_name);
2824             if ( conv->IsOk() )
2825                 return conv;
2826
2827             delete conv;
2828
2829 #if wxUSE_FONTMAP
2830             encoding =
2831                 wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
2832 #endif // wxUSE_FONTMAP
2833         }
2834 #if wxUSE_FONTMAP
2835         {
2836             const wxEncodingNameCache::iterator it = gs_nameCache.find(encoding);
2837             if ( it != gs_nameCache.end() )
2838             {
2839                 if ( it->second.empty() )
2840                     return NULL;
2841
2842                 wxMBConv_iconv *conv = new wxMBConv_iconv(it->second.ToAscii());
2843                 if ( conv->IsOk() )
2844                     return conv;
2845
2846                 delete conv;
2847             }
2848
2849             const wxChar** names = wxFontMapperBase::GetAllEncodingNames(encoding);
2850             // CS : in case this does not return valid names (eg for MacRoman)
2851             // encoding got a 'failure' entry in the cache all the same,
2852             // although it just has to be created using a different method, so
2853             // only store failed iconv creation attempts (or perhaps we
2854             // shoulnd't do this at all ?)
2855             if ( names[0] != NULL )
2856             {
2857                 for ( ; *names; ++names )
2858                 {
2859                     // FIXME-UTF8: wxFontMapperBase::GetAllEncodingNames()
2860                     //             will need changes that will obsolete this
2861                     wxString name(*names);
2862                     wxMBConv_iconv *conv = new wxMBConv_iconv(name.ToAscii());
2863                     if ( conv->IsOk() )
2864                     {
2865                         gs_nameCache[encoding] = *names;
2866                         return conv;
2867                     }
2868
2869                     delete conv;
2870                 }
2871
2872                 gs_nameCache[encoding] = _T(""); // cache the failure
2873             }
2874         }
2875 #endif // wxUSE_FONTMAP
2876     }
2877 #endif // HAVE_ICONV
2878
2879 #ifdef wxHAVE_WIN32_MB2WC
2880     {
2881 #if wxUSE_FONTMAP
2882         wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
2883                                       : new wxMBConv_win32(m_encoding);
2884         if ( conv->IsOk() )
2885             return conv;
2886
2887         delete conv;
2888 #else
2889         return NULL;
2890 #endif
2891     }
2892 #endif // wxHAVE_WIN32_MB2WC
2893
2894 #ifdef __DARWIN__
2895     {
2896         // leave UTF16 and UTF32 to the built-ins of wx
2897         if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
2898             ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
2899         {
2900 #if wxUSE_FONTMAP
2901             wxMBConv_cf *conv = m_name ? new wxMBConv_cf(m_name)
2902                                           : new wxMBConv_cf(m_encoding);
2903 #else
2904             wxMBConv_cf *conv = new wxMBConv_cf(m_encoding);
2905 #endif
2906
2907             if ( conv->IsOk() )
2908                  return conv;
2909
2910             delete conv;
2911         }
2912     }
2913 #endif // __DARWIN__
2914
2915     // step (2)
2916     wxFontEncoding enc = m_encoding;
2917 #if wxUSE_FONTMAP
2918     if ( enc == wxFONTENCODING_SYSTEM && m_name )
2919     {
2920         // use "false" to suppress interactive dialogs -- we can be called from
2921         // anywhere and popping up a dialog from here is the last thing we want to
2922         // do
2923         enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
2924     }
2925 #endif // wxUSE_FONTMAP
2926
2927     switch ( enc )
2928     {
2929         case wxFONTENCODING_UTF7:
2930              return new wxMBConvUTF7;
2931
2932         case wxFONTENCODING_UTF8:
2933              return new wxMBConvUTF8;
2934
2935         case wxFONTENCODING_UTF16BE:
2936              return new wxMBConvUTF16BE;
2937
2938         case wxFONTENCODING_UTF16LE:
2939              return new wxMBConvUTF16LE;
2940
2941         case wxFONTENCODING_UTF32BE:
2942              return new wxMBConvUTF32BE;
2943
2944         case wxFONTENCODING_UTF32LE:
2945              return new wxMBConvUTF32LE;
2946
2947         default:
2948              // nothing to do but put here to suppress gcc warnings
2949              break;
2950     }
2951
2952     // step (3)
2953 #if wxUSE_FONTMAP
2954     {
2955         wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
2956                                       : new wxMBConv_wxwin(m_encoding);
2957         if ( conv->IsOk() )
2958             return conv;
2959
2960         delete conv;
2961     }
2962 #endif // wxUSE_FONTMAP
2963
2964     // NB: This is a hack to prevent deadlock. What could otherwise happen
2965     //     in Unicode build: wxConvLocal creation ends up being here
2966     //     because of some failure and logs the error. But wxLog will try to
2967     //     attach a timestamp, for which it will need wxConvLocal (to convert
2968     //     time to char* and then wchar_t*), but that fails, tries to log the
2969     //     error, but wxLog has an (already locked) critical section that
2970     //     guards the static buffer.
2971     static bool alreadyLoggingError = false;
2972     if (!alreadyLoggingError)
2973     {
2974         alreadyLoggingError = true;
2975         wxLogError(_("Cannot convert from the charset '%s'!"),
2976                    m_name ? m_name
2977                       :
2978 #if wxUSE_FONTMAP
2979                          (const char*)wxFontMapperBase::GetEncodingDescription(m_encoding).ToAscii()
2980 #else // !wxUSE_FONTMAP
2981                          (const char*)wxString::Format(_("encoding %i"), m_encoding).ToAscii()
2982 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
2983               );
2984
2985         alreadyLoggingError = false;
2986     }
2987
2988     return NULL;
2989 }
2990
2991 void wxCSConv::CreateConvIfNeeded() const
2992 {
2993     if ( m_deferred )
2994     {
2995         wxCSConv *self = (wxCSConv *)this; // const_cast
2996
2997         // if we don't have neither the name nor the encoding, use the default
2998         // encoding for this system
2999         if ( !m_name && m_encoding == wxFONTENCODING_SYSTEM )
3000         {
3001 #if wxUSE_INTL
3002             self->m_encoding = wxLocale::GetSystemEncoding();
3003 #else
3004             // fallback to some reasonable default:
3005             self->m_encoding = wxFONTENCODING_ISO8859_1;
3006 #endif // wxUSE_INTL
3007         }
3008
3009         self->m_convReal = DoCreate();
3010         self->m_deferred = false;
3011     }
3012 }
3013
3014 bool wxCSConv::IsOk() const
3015 {
3016     CreateConvIfNeeded();
3017
3018     // special case: no convReal created for wxFONTENCODING_ISO8859_1
3019     if ( m_encoding == wxFONTENCODING_ISO8859_1 )
3020         return true; // always ok as we do it ourselves
3021
3022     // m_convReal->IsOk() is called at its own creation, so we know it must
3023     // be ok if m_convReal is non-NULL
3024     return m_convReal != NULL;
3025 }
3026
3027 size_t wxCSConv::ToWChar(wchar_t *dst, size_t dstLen,
3028                          const char *src, size_t srcLen) const
3029 {
3030     CreateConvIfNeeded();
3031
3032     if (m_convReal)
3033         return m_convReal->ToWChar(dst, dstLen, src, srcLen);
3034
3035     // latin-1 (direct)
3036     return wxMBConv::ToWChar(dst, dstLen, src, srcLen);
3037 }
3038
3039 size_t wxCSConv::FromWChar(char *dst, size_t dstLen,
3040                            const wchar_t *src, size_t srcLen) const
3041 {
3042     CreateConvIfNeeded();
3043
3044     if (m_convReal)
3045         return m_convReal->FromWChar(dst, dstLen, src, srcLen);
3046
3047     // latin-1 (direct)
3048     return wxMBConv::FromWChar(dst, dstLen, src, srcLen);
3049 }
3050
3051 size_t wxCSConv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
3052 {
3053     CreateConvIfNeeded();
3054
3055     if (m_convReal)
3056         return m_convReal->MB2WC(buf, psz, n);
3057
3058     // latin-1 (direct)
3059     size_t len = strlen(psz);
3060
3061     if (buf)
3062     {
3063         for (size_t c = 0; c <= len; c++)
3064             buf[c] = (unsigned char)(psz[c]);
3065     }
3066
3067     return len;
3068 }
3069
3070 size_t wxCSConv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
3071 {
3072     CreateConvIfNeeded();
3073
3074     if (m_convReal)
3075         return m_convReal->WC2MB(buf, psz, n);
3076
3077     // latin-1 (direct)
3078     const size_t len = wxWcslen(psz);
3079     if (buf)
3080     {
3081         for (size_t c = 0; c <= len; c++)
3082         {
3083             if (psz[c] > 0xFF)
3084                 return wxCONV_FAILED;
3085
3086             buf[c] = (char)psz[c];
3087         }
3088     }
3089     else
3090     {
3091         for (size_t c = 0; c <= len; c++)
3092         {
3093             if (psz[c] > 0xFF)
3094                 return wxCONV_FAILED;
3095         }
3096     }
3097
3098     return len;
3099 }
3100
3101 size_t wxCSConv::GetMBNulLen() const
3102 {
3103     CreateConvIfNeeded();
3104
3105     if ( m_convReal )
3106     {
3107         return m_convReal->GetMBNulLen();
3108     }
3109
3110     // otherwise, we are ISO-8859-1
3111     return 1;
3112 }
3113
#if wxUSE_UNICODE_UTF8
// Tell whether this converter converts to/from UTF-8.
//
// The real converter is asked if there is one; without it we are
// ISO-8859-1 and so the answer is false.
bool wxCSConv::IsUTF8() const
{
    CreateConvIfNeeded();

    return m_convReal && m_convReal->IsUTF8();
}
#endif // wxUSE_UNICODE_UTF8
3128
3129
3130 #if wxUSE_UNICODE
3131
3132 wxWCharBuffer wxSafeConvertMB2WX(const char *s)
3133 {
3134     if ( !s )
3135         return wxWCharBuffer();
3136
3137     wxWCharBuffer wbuf(wxConvLibc.cMB2WX(s));
3138     if ( !wbuf )
3139         wbuf = wxMBConvUTF8().cMB2WX(s);
3140     if ( !wbuf )
3141         wbuf = wxConvISO8859_1.cMB2WX(s);
3142
3143     return wbuf;
3144 }
3145
3146 wxCharBuffer wxSafeConvertWX2MB(const wchar_t *ws)
3147 {
3148     if ( !ws )
3149         return wxCharBuffer();
3150
3151     wxCharBuffer buf(wxConvLibc.cWX2MB(ws));
3152     if ( !buf )
3153         buf = wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL).cWX2MB(ws);
3154
3155     return buf;
3156 }
3157
3158 #endif // wxUSE_UNICODE
3159
3160 // ----------------------------------------------------------------------------
3161 // globals
3162 // ----------------------------------------------------------------------------
3163
// NB: The reason why we create converter objects in this convoluted way,
//     using a factory function instead of global variable, is that they
//     may be used at static initialization time (some of them are used by
//     wxString ctors and there may be a global wxString object). In other
//     words, possibly _before_ the converter global object would be
//     initialized.

// The converter names are used as plain tokens by the macros below (they are
// pasted into the generated identifiers), so any macro definitions of these
// names must be removed first (presumably they come from wx/strconv.h --
// TODO confirm).
#undef wxConvLibc
#undef wxConvUTF8
#undef wxConvUTF7
#undef wxConvLocal
#undef wxConvISO8859_1

// Define the global converter called "name".
//
// Parameters:
//  - klass:      type pointed to by name##Ptr and by the return value of the
//                accessor function
//  - impl_klass: concrete type of the object which is really created
//  - name:       base name used to build the generated identifiers
//  - ctor_args:  text placed right after the object name in its declaration,
//                i.e. its constructor arguments
//
// The expansion contains:
//  - the exported name##Ptr pointer, initialized to NULL
//  - the exported wxGet_##name##Ptr() function returning the address of a
//    function-local static impl_klass object
//  - the file-static gs_##name##instance pointer whose initializer calls
//    this function
//
// The expansion doesn't end with a semicolon, it must be supplied by the
// code using the macro.
#define WX_DEFINE_GLOBAL_CONV2(klass, impl_klass, name, ctor_args)      \
    WXDLLIMPEXP_DATA_BASE(klass*) name##Ptr = NULL;                     \
    WXDLLIMPEXP_BASE klass* wxGet_##name##Ptr()                         \
    {                                                                   \
        static impl_klass name##Obj ctor_args;                          \
        return &name##Obj;                                              \
    }                                                                   \
    /* this ensures that all global converter objects are created */    \
    /* by the time static initialization is done, i.e. before any */    \
    /* thread is launched: */                                           \
    static klass* gs_##name##instance = wxGet_##name##Ptr()

// Same as WX_DEFINE_GLOBAL_CONV2() for the common case when the exposed and
// the implementation classes are the same.
#define WX_DEFINE_GLOBAL_CONV(klass, name, ctor_args) \
    WX_DEFINE_GLOBAL_CONV2(klass, klass, name, ctor_args)
3191
// wxConvLibc is exposed as a plain wxMBConv but implemented by
// wxMBConv_win32 under Windows and by wxMBConvLibc elsewhere; no constructor
// arguments are passed (assuming wxEMPTY_PARAMETER_VALUE expands to nothing
// -- TODO confirm)
#ifdef __WINDOWS__
    WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConv_win32, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
#else
    WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConvLibc, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
#endif

// NB: we can't use wxEMPTY_PARAMETER_VALUE as final argument here because it's
//     passed to WX_DEFINE_GLOBAL_CONV2 after a macro expansion and so still
//     provokes an error message about "not enough macro parameters"; and we
//     can't use "()" here as the name##Obj declaration would be parsed as a
//     function declaration then, so use a semicolon and live with an extra
//     empty statement (and hope that no compilers warns about this)
WX_DEFINE_GLOBAL_CONV(wxMBConvStrictUTF8, wxConvUTF8, ;);
WX_DEFINE_GLOBAL_CONV(wxMBConvUTF7, wxConvUTF7, ;);

// these two are both wxCSConv objects differing only by the encoding they
// are constructed with
WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvLocal, (wxFONTENCODING_SYSTEM));
WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvISO8859_1, (wxFONTENCODING_ISO8859_1));

// the pointers below are initialized using the accessor functions generated
// above and not the converter objects themselves: wxConvCurrent starts as
// the libc converter and wxConvUI as the local one
WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = wxGet_wxConvLibcPtr();
WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvUI = wxGet_wxConvLocalPtr();

#ifdef __DARWIN__
// The xnu kernel always communicates file paths in decomposed UTF-8.
// WARNING: Are we sure that CFString's conversion will cause decomposition?
//
// NOTE(review): unlike the converters defined with the macros above, this is
//               an ordinary file-scope static object and not a function-local
//               one -- check whether wxConvFileName may be used during static
//               initialization of other translation units.
static wxMBConv_cf wxConvMacUTF8DObj(wxFONTENCODING_UTF8);
#endif

// converter used for file names: the UTF-8 wxMBConv_cf object defined above
// under Darwin, the libc converter everywhere else
WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName =
#ifdef __DARWIN__
                                    &wxConvMacUTF8DObj;
#else // !__DARWIN__
                                    wxGet_wxConvLibcPtr();
#endif // __DARWIN__/!__DARWIN__
3225
3226 #else // !wxUSE_WCHAR_T
3227
3228 // FIXME-UTF8: remove this, wxUSE_WCHAR_T is required now
3229 // stand-ins in absence of wchar_t
3230 WXDLLIMPEXP_DATA_BASE(wxMBConv) wxConvLibc,
3231                                 wxConvISO8859_1,
3232                                 wxConvLocal,
3233                                 wxConvUTF8;
3234
3235 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T