src/common/strconv.cpp

   1 /////////////////////////////////////////////////////////////////////////////
   2 // Name:        src/common/strconv.cpp
   3 // Purpose:     Unicode conversion classes
   4 // Author:      Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
   5 //              Ryan Norton, Fredrik Roubert (UTF7)
   6 // Modified by:
   7 // Created:     29/01/98
   8 // RCS-ID:      $Id$
   9 // Copyright:   (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
  10 //              (c) 2000-2003 Vadim Zeitlin
  11 //              (c) 2004 Ryan Norton, Fredrik Roubert
  12 // Licence:     wxWindows licence
  13 /////////////////////////////////////////////////////////////////////////////
  14
  15 // For compilers that support precompilation, includes "wx.h".
  16 #include "wx/wxprec.h"
  17
  18 #ifdef __BORLANDC__
  19     #pragma hdrstop
  20 #endif  //__BORLANDC__
  21
  22 #ifndef WX_PRECOMP
  23     #include "wx/intl.h"
  24     #include "wx/log.h"
  25     #include "wx/utils.h"
  26     #include "wx/hashmap.h"
  27 #endif
  28
  29 #include "wx/strconv.h"
  30
  31 #if wxUSE_WCHAR_T
  32
  33 #ifndef __WXWINCE__
  34 #include <errno.h>
  35 #endif
  36
  37 #include <ctype.h>
  38 #include <string.h>
  39 #include <stdlib.h>
  40
  41 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
  42     #include "wx/msw/private.h"
  43     #include "wx/msw/missing.h"
  44     #define wxHAVE_WIN32_MB2WC
  45 #endif
  46
  47 #ifdef __SALFORDC__
  48     #include <clib.h>
  49 #endif
  50
  51 #ifdef HAVE_ICONV
  52     #include <iconv.h>
  53     #include "wx/thread.h"
  54 #endif
  55
  56 #include "wx/encconv.h"
  57 #include "wx/fontmap.h"
  58
  59 #ifdef __DARWIN__
  60 #include "wx/mac/corefoundation/private/strconv_cf.h"
  61 #endif //def __DARWIN__
  62
  63
  64 #define TRACE_STRCONV _T("strconv")
  65
  66 // WC_UTF16 is defined only if sizeof(wchar_t) == 2, otherwise it's supposed to
  67 // be 4 bytes
  68 #if SIZEOF_WCHAR_T == 2
  69     #define WC_UTF16
  70 #endif
  71
  72
  73 // ============================================================================
  74 // implementation
  75 // ============================================================================
  76
  77 // helper function of cMB2WC(): check if n bytes at this location are all NUL
  78 static bool NotAllNULs(const char *p, size_t n)
  79 {
  80     while ( n && *p++ == '\0' )
  81         n--;
  82
  83     return n != 0;
  84 }
  85
  86 // ----------------------------------------------------------------------------
  87 // UTF-16 en/decoding to/from UCS-4 with surrogates handling
  88 // ----------------------------------------------------------------------------
  89
  90 static size_t encode_utf16(wxUint32 input, wxUint16 *output)
  91 {
  92     if (input <= 0xffff)
  93     {
  94         if (output)
  95             *output = (wxUint16) input;
  96
  97         return 1;
  98     }
  99     else if (input >= 0x110000)
 100     {
 101         return wxCONV_FAILED;
 102     }
 103     else
 104     {
 105         if (output)
 106         {
 107             *output++ = (wxUint16) ((input >> 10) + 0xd7c0);
 108             *output = (wxUint16) ((input & 0x3ff) + 0xdc00);
 109         }
 110
 111         return 2;
 112     }
 113 }
 114
 115 static size_t decode_utf16(const wxUint16* input, wxUint32& output)
 116 {
 117     if ((*input < 0xd800) || (*input > 0xdfff))
 118     {
 119         output = *input;
 120         return 1;
 121     }
 122     else if ((input[1] < 0xdc00) || (input[1] > 0xdfff))
 123     {
 124         output = *input;
 125         return wxCONV_FAILED;
 126     }
 127     else
 128     {
 129         output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
 130         return 2;
 131     }
 132 }
 133
 134 #ifdef WC_UTF16
 135     typedef wchar_t wxDecodeSurrogate_t;
 136 #else // !WC_UTF16
 137     typedef wxUint16 wxDecodeSurrogate_t;
 138 #endif // WC_UTF16/!WC_UTF16
 139
 140 // returns the next UTF-32 character from the wchar_t buffer and advances the
 141 // pointer to the character after this one
 142 //
 143 // if an invalid character is found, *pSrc is set to NULL, the caller must
 144 // check for this
 145 static wxUint32 wxDecodeSurrogate(const wxDecodeSurrogate_t **pSrc)
 146 {
 147     wxUint32 out;
 148     const size_t
 149         n = decode_utf16(wx_reinterpret_cast(const wxUint16 *, *pSrc), out);
 150     if ( n == wxCONV_FAILED )
 151         *pSrc = NULL;
 152     else
 153         *pSrc += n;
 154
 155     return out;
 156 }
 157
 158 // ----------------------------------------------------------------------------
 159 // wxMBConv
 160 // ----------------------------------------------------------------------------
 161
 162 size_t
 163 wxMBConv::ToWChar(wchar_t *dst, size_t dstLen,
 164                   const char *src, size_t srcLen) const
 165 {
 166     // although new conversion classes are supposed to implement this function
 167     // directly, the existins ones only implement the old MB2WC() and so, to
 168     // avoid to have to rewrite all conversion classes at once, we provide a
 169     // default (but not efficient) implementation of this one in terms of the
 170     // old function by copying the input to ensure that it's NUL-terminated and
 171     // then using MB2WC() to convert it
 172
 173     // the number of chars [which would be] written to dst [if it were not NULL]
 174     size_t dstWritten = 0;
 175
 176     // the number of NULs terminating this string
 177     size_t nulLen = 0;  // not really needed, but just to avoid warnings
 178
 179     // if we were not given the input size we just have to assume that the
 180     // string is properly terminated as we have no way of knowing how long it
 181     // is anyhow, but if we do have the size check whether there are enough
 182     // NULs at the end
 183     wxCharBuffer bufTmp;
 184     const char *srcEnd;
 185     if ( srcLen != wxNO_LEN )
 186     {
 187         // we need to know how to find the end of this string
 188         nulLen = GetMBNulLen();
 189         if ( nulLen == wxCONV_FAILED )
 190             return wxCONV_FAILED;
 191
 192         // if there are enough NULs we can avoid the copy
 193         if ( srcLen < nulLen || NotAllNULs(src + srcLen - nulLen, nulLen) )
 194         {
 195             // make a copy in order to properly NUL-terminate the string
 196             bufTmp = wxCharBuffer(srcLen + nulLen - 1 /* 1 will be added */);
 197             char * const p = bufTmp.data();
 198             memcpy(p, src, srcLen);
 199             for ( char *s = p + srcLen; s < p + srcLen + nulLen; s++ )
 200                 *s = '\0';
 201
 202             src = bufTmp;
 203         }
 204
 205         srcEnd = src + srcLen;
 206     }
 207     else // quit after the first loop iteration
 208     {
 209         srcEnd = NULL;
 210     }
 211
 212     for ( ;; )
 213     {
 214         // try to convert the current chunk
 215         size_t lenChunk = MB2WC(NULL, src, 0);
 216         if ( lenChunk == wxCONV_FAILED )
 217             return wxCONV_FAILED;
 218
 219         lenChunk++; // for the L'\0' at the end of this chunk
 220
 221         dstWritten += lenChunk;
 222
 223         if ( lenChunk == 1 )
 224         {
 225             // nothing left in the input string, conversion succeeded
 226             break;
 227         }
 228
 229         if ( dst )
 230         {
 231             if ( dstWritten > dstLen )
 232                 return wxCONV_FAILED;
 233
 234             if ( MB2WC(dst, src, lenChunk) == wxCONV_FAILED )
 235                 return wxCONV_FAILED;
 236
 237             dst += lenChunk;
 238         }
 239
 240         if ( !srcEnd )
 241         {
 242             // we convert just one chunk in this case as this is the entire
 243             // string anyhow
 244             break;
 245         }
 246
 247         // advance the input pointer past the end of this chunk
 248         while ( NotAllNULs(src, nulLen) )
 249         {
 250             // notice that we must skip over multiple bytes here as we suppose
 251             // that if NUL takes 2 or 4 bytes, then all the other characters do
 252             // too and so if advanced by a single byte we might erroneously
 253             // detect sequences of NUL bytes in the middle of the input
 254             src += nulLen;
 255         }
 256
 257         src += nulLen; // skipping over its terminator as well
 258
 259         // note that ">=" (and not just "==") is needed here as the terminator
 260         // we skipped just above could be inside or just after the buffer
 261         // delimited by inEnd
 262         if ( src >= srcEnd )
 263             break;
 264     }
 265
 266     return dstWritten;
 267 }
 268
 269 size_t
 270 wxMBConv::FromWChar(char *dst, size_t dstLen,
 271                     const wchar_t *src, size_t srcLen) const
 272 {
 273     // the number of chars [which would be] written to dst [if it were not NULL]
 274     size_t dstWritten = 0;
 275
 276     // make a copy of the input string unless it is already properly
 277     // NUL-terminated
 278     //
 279     // if we don't know its length we have no choice but to assume that it is,
 280     // indeed, properly terminated
 281     wxWCharBuffer bufTmp;
 282     if ( srcLen == wxNO_LEN )
 283     {
 284         srcLen = wxWcslen(src) + 1;
 285     }
 286     else if ( srcLen != 0 && src[srcLen - 1] != L'\0' )
 287     {
 288         // make a copy in order to properly NUL-terminate the string
 289         bufTmp = wxWCharBuffer(srcLen);
 290         memcpy(bufTmp.data(), src, srcLen * sizeof(wchar_t));
 291         src = bufTmp;
 292     }
 293
 294     const size_t lenNul = GetMBNulLen();
 295     for ( const wchar_t * const srcEnd = src + srcLen;
 296           src < srcEnd;
 297           src += wxWcslen(src) + 1 /* skip L'\0' too */ )
 298     {
 299         // try to convert the current chunk
 300         size_t lenChunk = WC2MB(NULL, src, 0);
 301
 302         if ( lenChunk == wxCONV_FAILED )
 303             return wxCONV_FAILED;
 304
 305         lenChunk += lenNul;
 306         dstWritten += lenChunk;
 307
 308         if ( dst )
 309         {
 310             if ( dstWritten > dstLen )
 311                 return wxCONV_FAILED;
 312
 313             if ( WC2MB(dst, src, lenChunk) == wxCONV_FAILED )
 314                 return wxCONV_FAILED;
 315
 316             dst += lenChunk;
 317         }
 318     }
 319
 320     return dstWritten;
 321 }
 322
 323 size_t wxMBConv::MB2WC(wchar_t *outBuff, const char *inBuff, size_t outLen) const
 324 {
 325     size_t rc = ToWChar(outBuff, outLen, inBuff);
 326     if ( rc != wxCONV_FAILED )
 327     {
 328         // ToWChar() returns the buffer length, i.e. including the trailing
 329         // NUL, while this method doesn't take it into account
 330         rc--;
 331     }
 332
 333     return rc;
 334 }
 335
 336 size_t wxMBConv::WC2MB(char *outBuff, const wchar_t *inBuff, size_t outLen) const
 337 {
 338     size_t rc = FromWChar(outBuff, outLen, inBuff);
 339     if ( rc != wxCONV_FAILED )
 340     {
 341         rc -= GetMBNulLen();
 342     }
 343
 344     return rc;
 345 }
 346
 347 wxMBConv::~wxMBConv()
 348 {
 349     // nothing to do here (necessary for Darwin linking probably)
 350 }
 351
 352 const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
 353 {
 354     if ( psz )
 355     {
 356         // calculate the length of the buffer needed first
 357         const size_t nLen = ToWChar(NULL, 0, psz);
 358         if ( nLen != wxCONV_FAILED )
 359         {
 360             // now do the actual conversion
 361             wxWCharBuffer buf(nLen - 1 /* +1 added implicitly */);
 362
 363             // +1 for the trailing NULL
 364             if ( ToWChar(buf.data(), nLen, psz) != wxCONV_FAILED )
 365                 return buf;
 366         }
 367     }
 368
 369     return wxWCharBuffer();
 370 }
 371
 372 const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
 373 {
 374     if ( pwz )
 375     {
 376         const size_t nLen = FromWChar(NULL, 0, pwz);
 377         if ( nLen != wxCONV_FAILED )
 378         {
 379             wxCharBuffer buf(nLen - 1);
 380             if ( FromWChar(buf.data(), nLen, pwz) != wxCONV_FAILED )
 381                 return buf;
 382         }
 383     }
 384
 385     return wxCharBuffer();
 386 }
 387
 388 const wxWCharBuffer
 389 wxMBConv::cMB2WC(const char *inBuff, size_t inLen, size_t *outLen) const
 390 {
 391     const size_t dstLen = ToWChar(NULL, 0, inBuff, inLen);
 392     if ( dstLen != wxCONV_FAILED )
 393     {
 394         wxWCharBuffer wbuf(dstLen - 1);
 395         if ( ToWChar(wbuf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
 396         {
 397             if ( outLen )
 398             {
 399                 *outLen = dstLen;
 400                 if ( wbuf[dstLen - 1] == L'\0' )
 401                     (*outLen)--;
 402             }
 403
 404             return wbuf;
 405         }
 406     }
 407
 408     if ( outLen )
 409         *outLen = 0;
 410
 411     return wxWCharBuffer();
 412 }
 413
 414 const wxCharBuffer
 415 wxMBConv::cWC2MB(const wchar_t *inBuff, size_t inLen, size_t *outLen) const
 416 {
 417     size_t dstLen = FromWChar(NULL, 0, inBuff, inLen);
 418     if ( dstLen != wxCONV_FAILED )
 419     {
 420         // special case of empty input: can't allocate 0 size buffer below as
 421         // wxCharBuffer insists on NUL-terminating it
 422         wxCharBuffer buf(dstLen ? dstLen - 1 : 1);
 423         if ( FromWChar(buf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
 424         {
 425             if ( outLen )
 426             {
 427                 *outLen = dstLen;
 428
 429                 const size_t nulLen = GetMBNulLen();
 430                 if ( dstLen >= nulLen &&
 431                         !NotAllNULs(buf.data() + dstLen - nulLen, nulLen) )
 432                 {
 433                     // in this case the output is NUL-terminated and we're not
 434                     // supposed to count NUL
 435                     *outLen -= nulLen;
 436                 }
 437             }
 438
 439             return buf;
 440         }
 441     }
 442
 443     if ( outLen )
 444         *outLen = 0;
 445
 446     return wxCharBuffer();
 447 }
 448
 449 // ----------------------------------------------------------------------------
 450 // wxMBConvLibc
 451 // ----------------------------------------------------------------------------
 452
 453 size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 454 {
 455     return wxMB2WC(buf, psz, n);
 456 }
 457
 458 size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 459 {
 460     return wxWC2MB(buf, psz, n);
 461 }
 462
 463 // ----------------------------------------------------------------------------
 464 // wxConvBrokenFileNames
 465 // ----------------------------------------------------------------------------
 466
 467 #ifdef __UNIX__
 468
 469 wxConvBrokenFileNames::wxConvBrokenFileNames(const wxString& charset)
 470 {
 471     if ( wxStricmp(charset, _T("UTF-8")) == 0 ||
 472          wxStricmp(charset, _T("UTF8")) == 0  )
 473         m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_PUA);
 474     else
 475         m_conv = new wxCSConv(charset);
 476 }
 477
 478 #endif // __UNIX__
 479
 480 // ----------------------------------------------------------------------------
 481 // UTF-7
 482 // ----------------------------------------------------------------------------
 483
 484 // Implementation (C) 2004 Fredrik Roubert
 485
 486 //
 487 // BASE64 decoding table
 488 //
 489 static const unsigned char utf7unb64[] =
 490 {
 491     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 492     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 493     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 494     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 495     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 496     0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
 497     0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b,
 498     0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 499     0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
 500     0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
 501     0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
 502     0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
 503     0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
 504     0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
 505     0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30,
 506     0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
 507     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 508     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 509     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 510     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 511     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 512     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 513     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 514     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 515     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 516     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 517     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 518     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 519     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 520     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 521     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 522     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
 523 };
 524
 525 size_t wxMBConvUTF7::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 526 {
 527     size_t len = 0;
 528
 529     while ( *psz && (!buf || (len < n)) )
 530     {
 531         unsigned char cc = *psz++;
 532         if (cc != '+')
 533         {
 534             // plain ASCII char
 535             if (buf)
 536                 *buf++ = cc;
 537             len++;
 538         }
 539         else if (*psz == '-')
 540         {
 541             // encoded plus sign
 542             if (buf)
 543                 *buf++ = cc;
 544             len++;
 545             psz++;
 546         }
 547         else // start of BASE64 encoded string
 548         {
 549             bool lsb, ok;
 550             unsigned int d, l;
 551             for ( ok = lsb = false, d = 0, l = 0;
 552                   (cc = utf7unb64[(unsigned char)*psz]) != 0xff;
 553                   psz++ )
 554             {
 555                 d <<= 6;
 556                 d += cc;
 557                 for (l += 6; l >= 8; lsb = !lsb)
 558                 {
 559                     unsigned char c = (unsigned char)((d >> (l -= 8)) % 256);
 560                     if (lsb)
 561                     {
 562                         if (buf)
 563                             *buf++ |= c;
 564                         len ++;
 565                     }
 566                     else
 567                     {
 568                         if (buf)
 569                             *buf = (wchar_t)(c << 8);
 570                     }
 571
 572                     ok = true;
 573                 }
 574             }
 575
 576             if ( !ok )
 577             {
 578                 // in valid UTF7 we should have valid characters after '+'
 579                 return wxCONV_FAILED;
 580             }
 581
 582             if (*psz == '-')
 583                 psz++;
 584         }
 585     }
 586
 587     if ( buf && (len < n) )
 588         *buf = '\0';
 589
 590     return len;
 591 }
 592
 593 //
 594 // BASE64 encoding table
 595 //
 596 static const unsigned char utf7enb64[] =
 597 {
 598     'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
 599     'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
 600     'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
 601     'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
 602     'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
 603     'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
 604     'w', 'x', 'y', 'z', '0', '1', '2', '3',
 605     '4', '5', '6', '7', '8', '9', '+', '/'
 606 };
 607
 608 //
 609 // UTF-7 encoding table
 610 //
 611 // 0 - Set D (directly encoded characters)
 612 // 1 - Set O (optional direct characters)
 613 // 2 - whitespace characters (optional)
 614 // 3 - special characters
 615 //
 616 static const unsigned char utf7encode[128] =
 617 {
 618     3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
 619     3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
 620     2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
 621     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
 622     1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 623     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
 624     1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 625     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
 626 };
 627
 628 size_t wxMBConvUTF7::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 629 {
 630     size_t len = 0;
 631
 632     while (*psz && ((!buf) || (len < n)))
 633     {
 634         wchar_t cc = *psz++;
 635         if (cc < 0x80 && utf7encode[cc] < 1)
 636         {
 637             // plain ASCII char
 638             if (buf)
 639                 *buf++ = (char)cc;
 640
 641             len++;
 642         }
 643 #ifndef WC_UTF16
 644         else if (((wxUint32)cc) > 0xffff)
 645         {
 646             // no surrogate pair generation (yet?)
 647             return wxCONV_FAILED;
 648         }
 649 #endif
 650         else
 651         {
 652             if (buf)
 653                 *buf++ = '+';
 654
 655             len++;
 656             if (cc != '+')
 657             {
 658                 // BASE64 encode string
 659                 unsigned int lsb, d, l;
 660                 for (d = 0, l = 0; /*nothing*/; psz++)
 661                 {
 662                     for (lsb = 0; lsb < 2; lsb ++)
 663                     {
 664                         d <<= 8;
 665                         d += lsb ? cc & 0xff : (cc & 0xff00) >> 8;
 666
 667                         for (l += 8; l >= 6; )
 668                         {
 669                             l -= 6;
 670                             if (buf)
 671                                 *buf++ = utf7enb64[(d >> l) % 64];
 672                             len++;
 673                         }
 674                     }
 675
 676                     cc = *psz;
 677                     if (!(cc) || (cc < 0x80 && utf7encode[cc] < 1))
 678                         break;
 679                 }
 680
 681                 if (l != 0)
 682                 {
 683                     if (buf)
 684                         *buf++ = utf7enb64[((d % 16) << (6 - l)) % 64];
 685
 686                     len++;
 687                 }
 688             }
 689
 690             if (buf)
 691                 *buf++ = '-';
 692             len++;
 693         }
 694     }
 695
 696     if (buf && (len < n))
 697         *buf = 0;
 698
 699     return len;
 700 }
 701
 702 // ----------------------------------------------------------------------------
 703 // UTF-8
 704 // ----------------------------------------------------------------------------
 705
 706 static const wxUint32 utf8_max[]=
 707     { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
 708
 709 // boundaries of the private use area we use to (temporarily) remap invalid
 710 // characters invalid in a UTF-8 encoded string
 711 const wxUint32 wxUnicodePUA = 0x100000;
 712 const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
 713
 714 // this table gives the length of the UTF-8 encoding from its first character:
 715 const unsigned char tableUtf8Lengths[256] = {
 716     // single-byte sequences (ASCII):
 717     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 00..0F
 718     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 10..1F
 719     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 20..2F
 720     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 30..3F
 721     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 40..4F
 722     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 50..5F
 723     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 60..6F
 724     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 70..7F
 725
 726     // these are invalid:
 727     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 80..8F
 728     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 90..9F
 729     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // A0..AF
 730     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // B0..BF
 731     0, 0,                                            // C0,C1
 732
 733     // two-byte sequences:
 734           2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // C2..CF
 735     2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // D0..DF
 736
 737     // three-byte sequences:
 738     3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,  // E0..EF
 739
 740     // four-byte sequences:
 741     4, 4, 4, 4, 4,                                   // F0..F4
 742
 743     // these are invalid again (5- or 6-byte
 744     // sequences and sequences for code points
 745     // above U+10FFFF, as restricted by RFC 3629):
 746                    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0   // F5..FF
 747 };
 748
 749 size_t
 750 wxMBConvStrictUTF8::ToWChar(wchar_t *dst, size_t dstLen,
 751                             const char *src, size_t srcLen) const
 752 {
 753     wchar_t *out = dstLen ? dst : NULL;
 754     size_t written = 0;
 755
 756     if ( srcLen == wxNO_LEN )
 757         srcLen = strlen(src) + 1;
 758
 759     for ( const char *p = src; ; p++ )
 760     {
 761         if ( !(srcLen == wxNO_LEN ? *p : srcLen) )
 762         {
 763             // all done successfully, just add the trailing NULL if we are not
 764             // using explicit length
 765             if ( srcLen == wxNO_LEN )
 766             {
 767                 if ( out )
 768                 {
 769                     if ( !dstLen )
 770                         break;
 771
 772                     *out = L'\0';
 773                 }
 774
 775                 written++;
 776             }
 777
 778             return written;
 779         }
 780
 781         if ( out && !dstLen-- )
 782             break;
 783
 784         wxUint32 code;
 785         unsigned char c = *p;
 786
 787         if ( c < 0x80 )
 788         {
 789             if ( srcLen == 0 ) // the test works for wxNO_LEN too
 790                 break;
 791
 792             if ( srcLen != wxNO_LEN )
 793                 srcLen--;
 794
 795             code = c;
 796         }
 797         else
 798         {
 799             unsigned len = tableUtf8Lengths[c];
 800             if ( !len )
 801                 break;
 802
 803             if ( srcLen < len ) // the test works for wxNO_LEN too
 804                 break;
 805
 806             if ( srcLen != wxNO_LEN )
 807                 srcLen -= len;
 808
 809             //   Char. number range   |        UTF-8 octet sequence
 810             //      (hexadecimal)     |              (binary)
 811             //  ----------------------+----------------------------------------
 812             //  0000 0000 - 0000 007F | 0xxxxxxx
 813             //  0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
 814             //  0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
 815             //  0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 816             //
 817             //  Code point value is stored in bits marked with 'x',
 818             //  lowest-order bit of the value on the right side in the diagram
 819             //  above.                                         (from RFC 3629)
 820
 821             // mask to extract lead byte's value ('x' bits above), by sequence
 822             // length:
 823             static const unsigned char leadValueMask[] = { 0x7F, 0x1F, 0x0F, 0x07 };
 824
 825             // mask and value of lead byte's most significant bits, by length:
 826             static const unsigned char leadMarkerMask[] = { 0x80, 0xE0, 0xF0, 0xF8 };
 827             static const unsigned char leadMarkerVal[] = { 0x00, 0xC0, 0xE0, 0xF0 };
 828
 829             len--; // it's more convenient to work with 0-based length here
 830
 831             // extract the lead byte's value bits:
 832             if ( (c & leadMarkerMask[len]) != leadMarkerVal[len] )
 833                 break;
 834
 835             code = c & leadValueMask[len];
 836
 837             // all remaining bytes, if any, are handled in the same way
 838             // regardless of sequence's length:
 839             for ( ; len; --len )
 840             {
 841                 c = *++p;
 842                 if ( (c & 0xC0) != 0x80 )
 843                     return wxCONV_FAILED;
 844
 845                 code <<= 6;
 846                 code |= c & 0x3F;
 847             }
 848         }
 849
 850 #ifdef WC_UTF16
 851         // cast is ok because wchar_t == wxUint16 if WC_UTF16
 852         if ( encode_utf16(code, (wxUint16 *)out) == 2 )
 853         {
 854             if ( out )
 855                 out++;
 856             written++;
 857         }
 858 #else // !WC_UTF16
 859         if ( out )
 860             *out = code;
 861 #endif // WC_UTF16/!WC_UTF16
 862
 863         if ( out )
 864             out++;
 865
 866         written++;
 867     }
 868
 869     return wxCONV_FAILED;
 870 }
 871
 872 size_t
 873 wxMBConvStrictUTF8::FromWChar(char *dst, size_t dstLen,
 874                               const wchar_t *src, size_t srcLen) const
 875 {
 876     char *out = dstLen ? dst : NULL;
 877     size_t written = 0;
 878
 879     for ( const wchar_t *wp = src; ; wp++ )
 880     {
 881         if ( !(srcLen == wxNO_LEN ? *wp : srcLen--) )
 882         {
 883             // all done successfully, just add the trailing NULL if we are not
 884             // using explicit length
 885             if ( srcLen == wxNO_LEN )
 886             {
 887                 if ( out )
 888                 {
 889                     if ( !dstLen )
 890                         break;
 891
 892                     *out = '\0';
 893                 }
 894
 895                 written++;
 896             }
 897
 898             return written;
 899         }
 900
 901
 902         wxUint32 code;
 903 #ifdef WC_UTF16
 904         // cast is ok for WC_UTF16
 905         if ( decode_utf16((const wxUint16 *)wp, code) == 2 )
 906         {
 907             // skip the next char too as we decoded a surrogate
 908             wp++;
 909         }
 910 #else // wchar_t is UTF-32
 911         code = *wp & 0x7fffffff;
 912 #endif
 913
 914         unsigned len;
 915         if ( code <= 0x7F )
 916         {
 917             len = 1;
 918             if ( out )
 919             {
 920                 if ( dstLen < len )
 921                     break;
 922
 923                 out[0] = (char)code;
 924             }
 925         }
 926         else if ( code <= 0x07FF )
 927         {
 928             len = 2;
 929             if ( out )
 930             {
 931                 if ( dstLen < len )
 932                     break;
 933
 934                 // NB: this line takes 6 least significant bits, encodes them as
 935                 // 10xxxxxx and discards them so that the next byte can be encoded:
 936                 out[1] = 0x80 | (code & 0x3F);  code >>= 6;
 937                 out[0] = 0xC0 | code;
 938             }
 939         }
 940         else if ( code < 0xFFFF )
 941         {
 942             len = 3;
 943             if ( out )
 944             {
 945                 if ( dstLen < len )
 946                     break;
 947
 948                 out[2] = 0x80 | (code & 0x3F);  code >>= 6;
 949                 out[1] = 0x80 | (code & 0x3F);  code >>= 6;
 950                 out[0] = 0xE0 | code;
 951             }
 952         }
 953         else if ( code <= 0x10FFFF )
 954         {
 955             len = 4;
 956             if ( out )
 957             {
 958                 if ( dstLen < len )
 959                     break;
 960
 961                 out[3] = 0x80 | (code & 0x3F);  code >>= 6;
 962                 out[2] = 0x80 | (code & 0x3F);  code >>= 6;
 963                 out[1] = 0x80 | (code & 0x3F);  code >>= 6;
 964                 out[0] = 0xF0 | code;
 965             }
 966         }
 967         else
 968         {
 969             wxFAIL_MSG( _T("trying to encode undefined Unicode character") );
 970             break;
 971         }
 972
 973         if ( out )
 974         {
 975             out += len;
 976             dstLen -= len;
 977         }
 978
 979         written += len;
 980     }
 981
 982     // we only get here if an error occurs during decoding
 983     return wxCONV_FAILED;
 984 }
 985
 986 size_t wxMBConvUTF8::ToWChar(wchar_t *buf, size_t n,
 987                              const char *psz, size_t srcLen) const
 988 {
 989     if ( m_options == MAP_INVALID_UTF8_NOT )
 990         return wxMBConvStrictUTF8::ToWChar(buf, n, psz, srcLen);
 991
 992     size_t len = 0;
 993
 994     while ((srcLen == wxNO_LEN ? *psz : srcLen--) && ((!buf) || (len < n)))
 995     {
 996         const char *opsz = psz;
 997         bool invalid = false;
 998         unsigned char cc = *psz++, fc = cc;
 999         unsigned cnt;
1000         for (cnt = 0; fc & 0x80; cnt++)
1001             fc <<= 1;
1002
1003         if (!cnt)
1004         {
1005             // plain ASCII char
1006             if (buf)
1007                 *buf++ = cc;
1008             len++;
1009
1010             // escape the escape character for octal escapes
1011             if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
1012                     && cc == '\\' && (!buf || len < n))
1013             {
1014                 if (buf)
1015                     *buf++ = cc;
1016                 len++;
1017             }
1018         }
1019         else
1020         {
1021             cnt--;
1022             if (!cnt)
1023             {
1024                 // invalid UTF-8 sequence
1025                 invalid = true;
1026             }
1027             else
1028             {
1029                 unsigned ocnt = cnt - 1;
1030                 wxUint32 res = cc & (0x3f >> cnt);
1031                 while (cnt--)
1032                 {
1033                     cc = *psz;
1034                     if ((cc & 0xC0) != 0x80)
1035                     {
1036                         // invalid UTF-8 sequence
1037                         invalid = true;
1038                         break;
1039                     }
1040
1041                     psz++;
1042                     res = (res << 6) | (cc & 0x3f);
1043                 }
1044
1045                 if (invalid || res <= utf8_max[ocnt])
1046                 {
1047                     // illegal UTF-8 encoding
1048                     invalid = true;
1049                 }
1050                 else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
1051                         res >= wxUnicodePUA && res < wxUnicodePUAEnd)
1052                 {
1053                     // if one of our PUA characters turns up externally
1054                     // it must also be treated as an illegal sequence
1055                     // (a bit like you have to escape an escape character)
1056                     invalid = true;
1057                 }
1058                 else
1059                 {
1060 #ifdef WC_UTF16
1061                     // cast is ok because wchar_t == wxUint16 if WC_UTF16
1062                     size_t pa = encode_utf16(res, (wxUint16 *)buf);
1063                     if (pa == wxCONV_FAILED)
1064                     {
1065                         invalid = true;
1066                     }
1067                     else
1068                     {
1069                         if (buf)
1070                             buf += pa;
1071                         len += pa;
1072                     }
1073 #else // !WC_UTF16
1074                     if (buf)
1075                         *buf++ = (wchar_t)res;
1076                     len++;
1077 #endif // WC_UTF16/!WC_UTF16
1078                 }
1079             }
1080
1081             if (invalid)
1082             {
1083                 if (m_options & MAP_INVALID_UTF8_TO_PUA)
1084                 {
1085                     while (opsz < psz && (!buf || len < n))
1086                     {
1087 #ifdef WC_UTF16
1088                         // cast is ok because wchar_t == wxUuint16 if WC_UTF16
1089                         size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
1090                         wxASSERT(pa != wxCONV_FAILED);
1091                         if (buf)
1092                             buf += pa;
1093                         opsz++;
1094                         len += pa;
1095 #else
1096                         if (buf)
1097                             *buf++ = (wchar_t)(wxUnicodePUA + (unsigned char)*opsz);
1098                         opsz++;
1099                         len++;
1100 #endif
1101                     }
1102                 }
1103                 else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
1104                 {
1105                     while (opsz < psz && (!buf || len < n))
1106                     {
1107                         if ( buf && len + 3 < n )
1108                         {
1109                             unsigned char on = *opsz;
1110                             *buf++ = L'\\';
1111                             *buf++ = (wchar_t)( L'0' + on / 0100 );
1112                             *buf++ = (wchar_t)( L'0' + (on % 0100) / 010 );
1113                             *buf++ = (wchar_t)( L'0' + on % 010 );
1114                         }
1115
1116                         opsz++;
1117                         len += 4;
1118                     }
1119                 }
1120                 else // MAP_INVALID_UTF8_NOT
1121                 {
1122                     return wxCONV_FAILED;
1123                 }
1124             }
1125         }
1126     }
1127
1128     if (srcLen == wxNO_LEN && buf && (len < n))
1129         *buf = 0;
1130
1131     return len + 1;
1132 }
1133
// Return true if wch is one of the octal digits L'0'..L'7'.
static inline bool isoctal(wchar_t wch)
{
    if ( wch < L'0' )
        return false;

    return !(wch > L'7');
}
1138
1139 size_t wxMBConvUTF8::FromWChar(char *buf, size_t n,
1140                                const wchar_t *psz, size_t srcLen) const
1141 {
1142     if ( m_options == MAP_INVALID_UTF8_NOT )
1143         return wxMBConvStrictUTF8::FromWChar(buf, n, psz, srcLen);
1144
1145     size_t len = 0;
1146
1147     while ((srcLen == wxNO_LEN ? *psz : srcLen--) && ((!buf) || (len < n)))
1148     {
1149         wxUint32 cc;
1150
1151 #ifdef WC_UTF16
1152         // cast is ok for WC_UTF16
1153         size_t pa = decode_utf16((const wxUint16 *)psz, cc);
1154         psz += (pa == wxCONV_FAILED) ? 1 : pa;
1155 #else
1156         cc = (*psz++) & 0x7fffffff;
1157 #endif
1158
1159         if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
1160                 && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
1161         {
1162             if (buf)
1163                 *buf++ = (char)(cc - wxUnicodePUA);
1164             len++;
1165         }
1166         else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL)
1167                     && cc == L'\\' && psz[0] == L'\\' )
1168         {
1169             if (buf)
1170                 *buf++ = (char)cc;
1171             psz++;
1172             len++;
1173         }
1174         else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
1175                     cc == L'\\' &&
1176                         isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
1177         {
1178             if (buf)
1179             {
1180                 *buf++ = (char) ((psz[0] - L'0') * 0100 +
1181                                  (psz[1] - L'0') * 010 +
1182                                  (psz[2] - L'0'));
1183             }
1184
1185             psz += 3;
1186             len++;
1187         }
1188         else
1189         {
1190             unsigned cnt;
1191             for (cnt = 0; cc > utf8_max[cnt]; cnt++)
1192             {
1193             }
1194
1195             if (!cnt)
1196             {
1197                 // plain ASCII char
1198                 if (buf)
1199                     *buf++ = (char) cc;
1200                 len++;
1201             }
1202             else
1203             {
1204                 len += cnt + 1;
1205                 if (buf)
1206                 {
1207                     *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
1208                     while (cnt--)
1209                         *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
1210                 }
1211             }
1212         }
1213     }
1214
1215     if (srcLen == wxNO_LEN && buf && (len < n))
1216         *buf = 0;
1217
1218     return len + 1;
1219 }
1220
1221 // ============================================================================
1222 // UTF-16
1223 // ============================================================================
1224
// Map the endianness-neutral names used by the implementations below to the
// concrete classes: the "straight" methods copy the UTF-16 code units
// unchanged while the "swap" ones byte-swap each of them. Which of the BE and
// LE classes plays which role depends on WORDS_BIGENDIAN.
#ifdef WORDS_BIGENDIAN
    #define wxMBConvUTF16straight wxMBConvUTF16BE
    #define wxMBConvUTF16swap     wxMBConvUTF16LE
#else
    #define wxMBConvUTF16swap     wxMBConvUTF16BE
    #define wxMBConvUTF16straight wxMBConvUTF16LE
#endif
1232
1233 /* static */
1234 size_t wxMBConvUTF16Base::GetLength(const char *src, size_t srcLen)
1235 {
1236     if ( srcLen == wxNO_LEN )
1237     {
1238         // count the number of bytes in input, including the trailing NULs
1239         const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1240         for ( srcLen = 1; *inBuff++; srcLen++ )
1241             ;
1242
1243         srcLen *= BYTES_PER_CHAR;
1244     }
1245     else // we already have the length
1246     {
1247         // we can only convert an entire number of UTF-16 characters
1248         if ( srcLen % BYTES_PER_CHAR )
1249             return wxCONV_FAILED;
1250     }
1251
1252     return srcLen;
1253 }
1254
1255 // case when in-memory representation is UTF-16 too
1256 #ifdef WC_UTF16
1257
1258 // ----------------------------------------------------------------------------
1259 // conversions without endianness change
1260 // ----------------------------------------------------------------------------
1261
1262 size_t
1263 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1264                                const char *src, size_t srcLen) const
1265 {
1266     // set up the scene for using memcpy() (which is presumably more efficient
1267     // than copying the bytes one by one)
1268     srcLen = GetLength(src, srcLen);
1269     if ( srcLen == wxNO_LEN )
1270         return wxCONV_FAILED;
1271
1272     const size_t inLen = srcLen / BYTES_PER_CHAR;
1273     if ( dst )
1274     {
1275         if ( dstLen < inLen )
1276             return wxCONV_FAILED;
1277
1278         memcpy(dst, src, srcLen);
1279     }
1280
1281     return inLen;
1282 }
1283
1284 size_t
1285 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1286                                  const wchar_t *src, size_t srcLen) const
1287 {
1288     if ( srcLen == wxNO_LEN )
1289         srcLen = wxWcslen(src) + 1;
1290
1291     srcLen *= BYTES_PER_CHAR;
1292
1293     if ( dst )
1294     {
1295         if ( dstLen < srcLen )
1296             return wxCONV_FAILED;
1297
1298         memcpy(dst, src, srcLen);
1299     }
1300
1301     return srcLen;
1302 }
1303
1304 // ----------------------------------------------------------------------------
1305 // endian-reversing conversions
1306 // ----------------------------------------------------------------------------
1307
1308 size_t
1309 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1310                            const char *src, size_t srcLen) const
1311 {
1312     srcLen = GetLength(src, srcLen);
1313     if ( srcLen == wxNO_LEN )
1314         return wxCONV_FAILED;
1315
1316     srcLen /= BYTES_PER_CHAR;
1317
1318     if ( dst )
1319     {
1320         if ( dstLen < srcLen )
1321             return wxCONV_FAILED;
1322
1323         const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1324         for ( size_t n = 0; n < srcLen; n++, inBuff++ )
1325         {
1326             *dst++ = wxUINT16_SWAP_ALWAYS(*inBuff);
1327         }
1328     }
1329
1330     return srcLen;
1331 }
1332
1333 size_t
1334 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1335                              const wchar_t *src, size_t srcLen) const
1336 {
1337     if ( srcLen == wxNO_LEN )
1338         srcLen = wxWcslen(src) + 1;
1339
1340     srcLen *= BYTES_PER_CHAR;
1341
1342     if ( dst )
1343     {
1344         if ( dstLen < srcLen )
1345             return wxCONV_FAILED;
1346
1347         wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1348         for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
1349         {
1350             *outBuff++ = wxUINT16_SWAP_ALWAYS(*src);
1351         }
1352     }
1353
1354     return srcLen;
1355 }
1356
1357 #else // !WC_UTF16: wchar_t is UTF-32
1358
1359 // ----------------------------------------------------------------------------
1360 // conversions without endianness change
1361 // ----------------------------------------------------------------------------
1362
1363 size_t
1364 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1365                                const char *src, size_t srcLen) const
1366 {
1367     srcLen = GetLength(src, srcLen);
1368     if ( srcLen == wxNO_LEN )
1369         return wxCONV_FAILED;
1370
1371     const size_t inLen = srcLen / BYTES_PER_CHAR;
1372     if ( !dst )
1373     {
1374         // optimization: return maximal space which could be needed for this
1375         // string even if the real size could be smaller if the buffer contains
1376         // any surrogates
1377         return inLen;
1378     }
1379
1380     size_t outLen = 0;
1381     const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1382     for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1383     {
1384         const wxUint32 ch = wxDecodeSurrogate(&inBuff);
1385         if ( !inBuff )
1386             return wxCONV_FAILED;
1387
1388         if ( ++outLen > dstLen )
1389             return wxCONV_FAILED;
1390
1391         *dst++ = ch;
1392     }
1393
1394
1395     return outLen;
1396 }
1397
1398 size_t
1399 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1400                                  const wchar_t *src, size_t srcLen) const
1401 {
1402     if ( srcLen == wxNO_LEN )
1403         srcLen = wxWcslen(src) + 1;
1404
1405     size_t outLen = 0;
1406     wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1407     for ( size_t n = 0; n < srcLen; n++ )
1408     {
1409         wxUint16 cc[2];
1410         const size_t numChars = encode_utf16(*src++, cc);
1411         if ( numChars == wxCONV_FAILED )
1412             return wxCONV_FAILED;
1413
1414         outLen += numChars * BYTES_PER_CHAR;
1415         if ( outBuff )
1416         {
1417             if ( outLen > dstLen )
1418                 return wxCONV_FAILED;
1419
1420             *outBuff++ = cc[0];
1421             if ( numChars == 2 )
1422             {
1423                 // second character of a surrogate
1424                 *outBuff++ = cc[1];
1425             }
1426         }
1427     }
1428
1429     return outLen;
1430 }
1431
1432 // ----------------------------------------------------------------------------
1433 // endian-reversing conversions
1434 // ----------------------------------------------------------------------------
1435
1436 size_t
1437 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1438                            const char *src, size_t srcLen) const
1439 {
1440     srcLen = GetLength(src, srcLen);
1441     if ( srcLen == wxNO_LEN )
1442         return wxCONV_FAILED;
1443
1444     const size_t inLen = srcLen / BYTES_PER_CHAR;
1445     if ( !dst )
1446     {
1447         // optimization: return maximal space which could be needed for this
1448         // string even if the real size could be smaller if the buffer contains
1449         // any surrogates
1450         return inLen;
1451     }
1452
1453     size_t outLen = 0;
1454     const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1455     for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1456     {
1457         wxUint32 ch;
1458         wxUint16 tmp[2];
1459
1460         tmp[0] = wxUINT16_SWAP_ALWAYS(*inBuff);
1461         inBuff++;
1462         tmp[1] = wxUINT16_SWAP_ALWAYS(*inBuff);
1463
1464         const size_t numChars = decode_utf16(tmp, ch);
1465         if ( numChars == wxCONV_FAILED )
1466             return wxCONV_FAILED;
1467
1468         if ( numChars == 2 )
1469             inBuff++;
1470
1471         if ( ++outLen > dstLen )
1472             return wxCONV_FAILED;
1473
1474         *dst++ = ch;
1475     }
1476
1477
1478     return outLen;
1479 }
1480
1481 size_t
1482 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1483                              const wchar_t *src, size_t srcLen) const
1484 {
1485     if ( srcLen == wxNO_LEN )
1486         srcLen = wxWcslen(src) + 1;
1487
1488     size_t outLen = 0;
1489     wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1490     for ( const wchar_t *srcEnd = src + srcLen; src < srcEnd; src++ )
1491     {
1492         wxUint16 cc[2];
1493         const size_t numChars = encode_utf16(*src, cc);
1494         if ( numChars == wxCONV_FAILED )
1495             return wxCONV_FAILED;
1496
1497         outLen += numChars * BYTES_PER_CHAR;
1498         if ( outBuff )
1499         {
1500             if ( outLen > dstLen )
1501                 return wxCONV_FAILED;
1502
1503             *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[0]);
1504             if ( numChars == 2 )
1505             {
1506                 // second character of a surrogate
1507                 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[1]);
1508             }
1509         }
1510     }
1511
1512     return outLen;
1513 }
1514
1515 #endif // WC_UTF16/!WC_UTF16
1516
1517
1518 // ============================================================================
1519 // UTF-32
1520 // ============================================================================
1521
// Map the endianness-neutral names used by the implementations below to the
// concrete classes: the "straight" methods use the UTF-32 values unchanged
// while the "swap" ones byte-swap each of them. Which of the BE and LE
// classes plays which role depends on WORDS_BIGENDIAN.
#ifdef WORDS_BIGENDIAN
    #define wxMBConvUTF32straight  wxMBConvUTF32BE
    #define wxMBConvUTF32swap      wxMBConvUTF32LE
#else
    #define wxMBConvUTF32swap      wxMBConvUTF32BE
    #define wxMBConvUTF32straight  wxMBConvUTF32LE
#endif


// the global UTF-32 converter objects, one per byte order
WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
1533
1534 /* static */
1535 size_t wxMBConvUTF32Base::GetLength(const char *src, size_t srcLen)
1536 {
1537     if ( srcLen == wxNO_LEN )
1538     {
1539         // count the number of bytes in input, including the trailing NULs
1540         const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1541         for ( srcLen = 1; *inBuff++; srcLen++ )
1542             ;
1543
1544         srcLen *= BYTES_PER_CHAR;
1545     }
1546     else // we already have the length
1547     {
1548         // we can only convert an entire number of UTF-32 characters
1549         if ( srcLen % BYTES_PER_CHAR )
1550             return wxCONV_FAILED;
1551     }
1552
1553     return srcLen;
1554 }
1555
1556 // case when in-memory representation is UTF-16
1557 #ifdef WC_UTF16
1558
1559 // ----------------------------------------------------------------------------
1560 // conversions without endianness change
1561 // ----------------------------------------------------------------------------
1562
1563 size_t
1564 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1565                                const char *src, size_t srcLen) const
1566 {
1567     srcLen = GetLength(src, srcLen);
1568     if ( srcLen == wxNO_LEN )
1569         return wxCONV_FAILED;
1570
1571     const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1572     const size_t inLen = srcLen / BYTES_PER_CHAR;
1573     size_t outLen = 0;
1574     for ( size_t n = 0; n < inLen; n++ )
1575     {
1576         wxUint16 cc[2];
1577         const size_t numChars = encode_utf16(*inBuff++, cc);
1578         if ( numChars == wxCONV_FAILED )
1579             return wxCONV_FAILED;
1580
1581         outLen += numChars;
1582         if ( dst )
1583         {
1584             if ( outLen > dstLen )
1585                 return wxCONV_FAILED;
1586
1587             *dst++ = cc[0];
1588             if ( numChars == 2 )
1589             {
1590                 // second character of a surrogate
1591                 *dst++ = cc[1];
1592             }
1593         }
1594     }
1595
1596     return outLen;
1597 }
1598
1599 size_t
1600 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1601                                  const wchar_t *src, size_t srcLen) const
1602 {
1603     if ( srcLen == wxNO_LEN )
1604         srcLen = wxWcslen(src) + 1;
1605
1606     if ( !dst )
1607     {
1608         // optimization: return maximal space which could be needed for this
1609         // string instead of the exact amount which could be less if there are
1610         // any surrogates in the input
1611         //
1612         // we consider that surrogates are rare enough to make it worthwhile to
1613         // avoid running the loop below at the cost of slightly extra memory
1614         // consumption
1615         return srcLen * BYTES_PER_CHAR;
1616     }
1617
1618     wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1619     size_t outLen = 0;
1620     for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1621     {
1622         const wxUint32 ch = wxDecodeSurrogate(&src);
1623         if ( !src )
1624             return wxCONV_FAILED;
1625
1626         outLen += BYTES_PER_CHAR;
1627
1628         if ( outLen > dstLen )
1629             return wxCONV_FAILED;
1630
1631         *outBuff++ = ch;
1632     }
1633
1634     return outLen;
1635 }
1636
1637 // ----------------------------------------------------------------------------
1638 // endian-reversing conversions
1639 // ----------------------------------------------------------------------------
1640
1641 size_t
1642 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1643                            const char *src, size_t srcLen) const
1644 {
1645     srcLen = GetLength(src, srcLen);
1646     if ( srcLen == wxNO_LEN )
1647         return wxCONV_FAILED;
1648
1649     const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1650     const size_t inLen = srcLen / BYTES_PER_CHAR;
1651     size_t outLen = 0;
1652     for ( size_t n = 0; n < inLen; n++, inBuff++ )
1653     {
1654         wxUint16 cc[2];
1655         const size_t numChars = encode_utf16(wxUINT32_SWAP_ALWAYS(*inBuff), cc);
1656         if ( numChars == wxCONV_FAILED )
1657             return wxCONV_FAILED;
1658
1659         outLen += numChars;
1660         if ( dst )
1661         {
1662             if ( outLen > dstLen )
1663                 return wxCONV_FAILED;
1664
1665             *dst++ = cc[0];
1666             if ( numChars == 2 )
1667             {
1668                 // second character of a surrogate
1669                 *dst++ = cc[1];
1670             }
1671         }
1672     }
1673
1674     return outLen;
1675 }
1676
1677 size_t
1678 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1679                              const wchar_t *src, size_t srcLen) const
1680 {
1681     if ( srcLen == wxNO_LEN )
1682         srcLen = wxWcslen(src) + 1;
1683
1684     if ( !dst )
1685     {
1686         // optimization: return maximal space which could be needed for this
1687         // string instead of the exact amount which could be less if there are
1688         // any surrogates in the input
1689         //
1690         // we consider that surrogates are rare enough to make it worthwhile to
1691         // avoid running the loop below at the cost of slightly extra memory
1692         // consumption
1693         return srcLen*BYTES_PER_CHAR;
1694     }
1695
1696     wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1697     size_t outLen = 0;
1698     for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1699     {
1700         const wxUint32 ch = wxDecodeSurrogate(&src);
1701         if ( !src )
1702             return wxCONV_FAILED;
1703
1704         outLen += BYTES_PER_CHAR;
1705
1706         if ( outLen > dstLen )
1707             return wxCONV_FAILED;
1708
1709         *outBuff++ = wxUINT32_SWAP_ALWAYS(ch);
1710     }
1711
1712     return outLen;
1713 }
1714
1715 #else // !WC_UTF16: wchar_t is UTF-32
1716
1717 // ----------------------------------------------------------------------------
1718 // conversions without endianness change
1719 // ----------------------------------------------------------------------------
1720
1721 size_t
1722 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1723                                const char *src, size_t srcLen) const
1724 {
1725     // use memcpy() as it should be much faster than hand-written loop
1726     srcLen = GetLength(src, srcLen);
1727     if ( srcLen == wxNO_LEN )
1728         return wxCONV_FAILED;
1729
1730     const size_t inLen = srcLen/BYTES_PER_CHAR;
1731     if ( dst )
1732     {
1733         if ( dstLen < inLen )
1734             return wxCONV_FAILED;
1735
1736         memcpy(dst, src, srcLen);
1737     }
1738
1739     return inLen;
1740 }
1741
1742 size_t
1743 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1744                                  const wchar_t *src, size_t srcLen) const
1745 {
1746     if ( srcLen == wxNO_LEN )
1747         srcLen = wxWcslen(src) + 1;
1748
1749     srcLen *= BYTES_PER_CHAR;
1750
1751     if ( dst )
1752     {
1753         if ( dstLen < srcLen )
1754             return wxCONV_FAILED;
1755
1756         memcpy(dst, src, srcLen);
1757     }
1758
1759     return srcLen;
1760 }
1761
1762 // ----------------------------------------------------------------------------
1763 // endian-reversing conversions
1764 // ----------------------------------------------------------------------------
1765
1766 size_t
1767 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1768                            const char *src, size_t srcLen) const
1769 {
1770     srcLen = GetLength(src, srcLen);
1771     if ( srcLen == wxNO_LEN )
1772         return wxCONV_FAILED;
1773
1774     srcLen /= BYTES_PER_CHAR;
1775
1776     if ( dst )
1777     {
1778         if ( dstLen < srcLen )
1779             return wxCONV_FAILED;
1780
1781         const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1782         for ( size_t n = 0; n < srcLen; n++, inBuff++ )
1783         {
1784             *dst++ = wxUINT32_SWAP_ALWAYS(*inBuff);
1785         }
1786     }
1787
1788     return srcLen;
1789 }
1790
1791 size_t
1792 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1793                              const wchar_t *src, size_t srcLen) const
1794 {
1795     if ( srcLen == wxNO_LEN )
1796         srcLen = wxWcslen(src) + 1;
1797
1798     srcLen *= BYTES_PER_CHAR;
1799
1800     if ( dst )
1801     {
1802         if ( dstLen < srcLen )
1803             return wxCONV_FAILED;
1804
1805         wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1806         for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
1807         {
1808             *outBuff++ = wxUINT32_SWAP_ALWAYS(*src);
1809         }
1810     }
1811
1812     return srcLen;
1813 }
1814
1815 #endif // WC_UTF16/!WC_UTF16
1816
1817
1818 // ============================================================================
1819 // The classes doing conversion using the iconv_xxx() functions
1820 // ============================================================================
1821
1822 #ifdef HAVE_ICONV
1823
1824 // VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
1825 //     E2BIG if output buffer is _exactly_ as big as needed. Such case is
1826 //     (unless there's yet another bug in glibc) the only case when iconv()
1827 //     returns with (size_t)-1 (which means error) and says there are 0 bytes
1828 //     left in the input buffer -- when _real_ error occurs,
1829 //     bytes-left-in-input buffer is non-zero. Hence, this alternative test for
1830 //     iconv() failure.
1831 //     [This bug does not appear in glibc 2.2.]
// ICONV_FAILED(cres, bufLeft) is true if the iconv() call which returned cres
// and left bufLeft bytes in its input buffer has failed.
//
// NB: the macro arguments are parenthesized so that arbitrary expressions
//     can be passed to it without changing the meaning of the comparisons.
#if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
#define ICONV_FAILED(cres, bufLeft) (((cres) == (size_t)-1) && \
                                     (errno != E2BIG || (bufLeft) != 0))
#else
#define ICONV_FAILED(cres, bufLeft)  ((cres) == (size_t)-1)
#endif
1838
// cast to the (possibly const-qualified) char ** type using ICONV_CONST,
// which is not defined in this part of the file -- presumably it comes from
// the build configuration, TODO confirm
#define ICONV_CHAR_CAST(x)  ((ICONV_CONST char **)(x))

// value of an invalid iconv_t, compared with m2w and w2m in
// wxMBConv_iconv::IsOk()
#define ICONV_T_INVALID ((iconv_t)-1)

// WC_BSWAP is the byte-swapping macro matching the size of wchar_t and
// WC_ENC the wxFontEncoding value used to look up the names of the wide
// character charset
#if SIZEOF_WCHAR_T == 4
    #define WC_BSWAP    wxUINT32_SWAP_ALWAYS
    #define WC_ENC      wxFONTENCODING_UTF32
#elif SIZEOF_WCHAR_T == 2
    #define WC_BSWAP    wxUINT16_SWAP_ALWAYS
    #define WC_ENC      wxFONTENCODING_UTF16
#else // sizeof(wchar_t) != 2 nor 4
    // does this ever happen?
    #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
#endif
1853
1854 // ----------------------------------------------------------------------------
1855 // wxMBConv_iconv: encapsulates an iconv character set
1856 // ----------------------------------------------------------------------------
1857
1858 class wxMBConv_iconv : public wxMBConv
1859 {
1860 public:
1861     wxMBConv_iconv(const char *name);
1862     virtual ~wxMBConv_iconv();
1863
1864     virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const;
1865     virtual size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const;
1866
1867     // classify this encoding as explained in wxMBConv::GetMBNulLen() comment
1868     virtual size_t GetMBNulLen() const;
1869
1870 #if wxUSE_UNICODE_UTF8
1871     virtual bool IsUTF8() const;
1872 #endif
1873
1874     virtual wxMBConv *Clone() const
1875     {
1876         wxMBConv_iconv *p = new wxMBConv_iconv(m_name.ToAscii());
1877         p->m_minMBCharWidth = m_minMBCharWidth;
1878         return p;
1879     }
1880
1881     bool IsOk() const
1882         { return (m2w != ICONV_T_INVALID) && (w2m != ICONV_T_INVALID); }
1883
1884 protected:
1885     // the iconv handlers used to translate from multibyte
1886     // to wide char and in the other direction
1887     iconv_t m2w,
1888             w2m;
1889
1890 #if wxUSE_THREADS
1891     // guards access to m2w and w2m objects
1892     wxMutex m_iconvMutex;
1893 #endif
1894
1895 private:
1896     // the name (for iconv_open()) of a wide char charset -- if none is
1897     // available on this machine, it will remain NULL
1898     static wxString ms_wcCharsetName;
1899
1900     // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
1901     // different endian-ness than the native one
1902     static bool ms_wcNeedsSwap;
1903
1904
1905     // name of the encoding handled by this conversion
1906     wxString m_name;
1907
1908     // cached result of GetMBNulLen(); set to 0 meaning "unknown"
1909     // initially
1910     size_t m_minMBCharWidth;
1911 };
1912
1913 // make the constructor available for unit testing
1914 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_iconv( const char* name )
1915 {
1916     wxMBConv_iconv* result = new wxMBConv_iconv( name );
1917     if ( !result->IsOk() )
1918     {
1919         delete result;
1920         return 0;
1921     }
1922
1923     return result;
1924 }
1925
// definitions of the static members: the wide char charset name starts out
// empty and the constructor looks for it while it remains so
wxString wxMBConv_iconv::ms_wcCharsetName;
bool wxMBConv_iconv::ms_wcNeedsSwap = false;
1928
1929 wxMBConv_iconv::wxMBConv_iconv(const char *name)
1930               : m_name(name)
1931 {
1932     m_minMBCharWidth = 0;
1933
1934     // check for charset that represents wchar_t:
1935     if ( ms_wcCharsetName.empty() )
1936     {
1937         wxLogTrace(TRACE_STRCONV, _T("Looking for wide char codeset:"));
1938
1939 #if wxUSE_FONTMAP
1940         const wxChar **names = wxFontMapperBase::GetAllEncodingNames(WC_ENC);
1941 #else // !wxUSE_FONTMAP
1942         static const wxChar *names_static[] =
1943         {
1944 #if SIZEOF_WCHAR_T == 4
1945             _T("UCS-4"),
1946 #elif SIZEOF_WCHAR_T = 2
1947             _T("UCS-2"),
1948 #endif
1949             NULL
1950         };
1951         const wxChar **names = names_static;
1952 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
1953
1954         for ( ; *names && ms_wcCharsetName.empty(); ++names )
1955         {
1956             const wxString nameCS(*names);
1957
1958             // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
1959             wxString nameXE(nameCS);
1960
1961 #ifdef WORDS_BIGENDIAN
1962                 nameXE += _T("BE");
1963 #else // little endian
1964                 nameXE += _T("LE");
1965 #endif
1966
1967             wxLogTrace(TRACE_STRCONV, _T("  trying charset \"%s\""),
1968                        nameXE.c_str());
1969
1970             m2w = iconv_open(nameXE.ToAscii(), name);
1971             if ( m2w == ICONV_T_INVALID )
1972             {
1973                 // try charset w/o bytesex info (e.g. "UCS4")
1974                 wxLogTrace(TRACE_STRCONV, _T("  trying charset \"%s\""),
1975                            nameCS.c_str());
1976                 m2w = iconv_open(nameCS.ToAscii(), name);
1977
1978                 // and check for bytesex ourselves:
1979                 if ( m2w != ICONV_T_INVALID )
1980                 {
1981                     char    buf[2], *bufPtr;
1982                     wchar_t wbuf[2], *wbufPtr;
1983                     size_t  insz, outsz;
1984                     size_t  res;
1985
1986                     buf[0] = 'A';
1987                     buf[1] = 0;
1988                     wbuf[0] = 0;
1989                     insz = 2;
1990                     outsz = SIZEOF_WCHAR_T * 2;
1991                     wbufPtr = wbuf;
1992                     bufPtr = buf;
1993
1994                     res = iconv(
1995                         m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
1996                         (char**)&wbufPtr, &outsz);
1997
1998                     if (ICONV_FAILED(res, insz))
1999                     {
2000                         wxLogLastError(wxT("iconv"));
2001                         wxLogError(_("Conversion to charset '%s' doesn't work."),
2002                                    nameCS.c_str());
2003                     }
2004                     else // ok, can convert to this encoding, remember it
2005                     {
2006                         ms_wcCharsetName = nameCS;
2007                         ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
2008                     }
2009                 }
2010             }
2011             else // use charset not requiring byte swapping
2012             {
2013                 ms_wcCharsetName = nameXE;
2014             }
2015         }
2016
2017         wxLogTrace(TRACE_STRCONV,
2018                    wxT("iconv wchar_t charset is \"%s\"%s"),
2019                    ms_wcCharsetName.empty() ? wxString("<none>")
2020                                             : ms_wcCharsetName,
2021                    ms_wcNeedsSwap ? _T(" (needs swap)")
2022                                   : _T(""));
2023     }
2024     else // we already have ms_wcCharsetName
2025     {
2026         m2w = iconv_open(ms_wcCharsetName.ToAscii(), name);
2027     }
2028
2029     if ( ms_wcCharsetName.empty() )
2030     {
2031         w2m = ICONV_T_INVALID;
2032     }
2033     else
2034     {
2035         w2m = iconv_open(name, ms_wcCharsetName.ToAscii());
2036         if ( w2m == ICONV_T_INVALID )
2037         {
2038             wxLogTrace(TRACE_STRCONV,
2039                        wxT("\"%s\" -> \"%s\" works but not the converse!?"),
2040                        ms_wcCharsetName.c_str(), name);
2041         }
2042     }
2043 }
2044
2045 wxMBConv_iconv::~wxMBConv_iconv()
2046 {
2047     if ( m2w != ICONV_T_INVALID )
2048         iconv_close(m2w);
2049     if ( w2m != ICONV_T_INVALID )
2050         iconv_close(w2m);
2051 }
2052
2053 size_t wxMBConv_iconv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
2054 {
2055     // find the string length: notice that must be done differently for
2056     // NUL-terminated strings and UTF-16/32 which are terminated with 2/4 NULs
2057     size_t inbuf;
2058     const size_t nulLen = GetMBNulLen();
2059     switch ( nulLen )
2060     {
2061         default:
2062             return wxCONV_FAILED;
2063
2064         case 1:
2065             inbuf = strlen(psz); // arguably more optimized than our version
2066             break;
2067
2068         case 2:
2069         case 4:
2070             // for UTF-16/32 not only we need to have 2/4 consecutive NULs but
2071             // they also have to start at character boundary and not span two
2072             // adjacent characters
2073             const char *p;
2074             for ( p = psz; NotAllNULs(p, nulLen); p += nulLen )
2075                 ;
2076             inbuf = p - psz;
2077             break;
2078     }
2079
2080 #if wxUSE_THREADS
2081     // NB: iconv() is MT-safe, but each thread must use its own iconv_t handle.
2082     //     Unfortunately there are a couple of global wxCSConv objects such as
2083     //     wxConvLocal that are used all over wx code, so we have to make sure
2084     //     the handle is used by at most one thread at the time. Otherwise
2085     //     only a few wx classes would be safe to use from non-main threads
2086     //     as MB<->WC conversion would fail "randomly".
2087     wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
2088 #endif // wxUSE_THREADS
2089
2090     size_t outbuf = n * SIZEOF_WCHAR_T;
2091     size_t res, cres;
2092     // VS: Use these instead of psz, buf because iconv() modifies its arguments:
2093     wchar_t *bufPtr = buf;
2094     const char *pszPtr = psz;
2095
2096     if (buf)
2097     {
2098         // have destination buffer, convert there
2099         cres = iconv(m2w,
2100                      ICONV_CHAR_CAST(&pszPtr), &inbuf,
2101                      (char**)&bufPtr, &outbuf);
2102         res = n - (outbuf / SIZEOF_WCHAR_T);
2103
2104         if (ms_wcNeedsSwap)
2105         {
2106             // convert to native endianness
2107             for ( unsigned i = 0; i < res; i++ )
2108                 buf[n] = WC_BSWAP(buf[i]);
2109         }
2110
2111         // NUL-terminate the string if there is any space left
2112         if (res < n)
2113             buf[res] = 0;
2114     }
2115     else
2116     {
2117         // no destination buffer... convert using temp buffer
2118         // to calculate destination buffer requirement
2119         wchar_t tbuf[8];
2120         res = 0;
2121
2122         do
2123         {
2124             bufPtr = tbuf;
2125             outbuf = 8 * SIZEOF_WCHAR_T;
2126
2127             cres = iconv(m2w,
2128                          ICONV_CHAR_CAST(&pszPtr), &inbuf,
2129                          (char**)&bufPtr, &outbuf );
2130
2131             res += 8 - (outbuf / SIZEOF_WCHAR_T);
2132         }
2133         while ((cres == (size_t)-1) && (errno == E2BIG));
2134     }
2135
2136     if (ICONV_FAILED(cres, inbuf))
2137     {
2138         //VS: it is ok if iconv fails, hence trace only
2139         wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
2140         return wxCONV_FAILED;
2141     }
2142
2143     return res;
2144 }
2145
2146 size_t wxMBConv_iconv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
2147 {
2148 #if wxUSE_THREADS
2149     // NB: explained in MB2WC
2150     wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
2151 #endif
2152
2153     size_t inlen = wxWcslen(psz);
2154     size_t inbuf = inlen * SIZEOF_WCHAR_T;
2155     size_t outbuf = n;
2156     size_t res, cres;
2157
2158     wchar_t *tmpbuf = 0;
2159
2160     if (ms_wcNeedsSwap)
2161     {
2162         // need to copy to temp buffer to switch endianness
2163         // (doing WC_BSWAP twice on the original buffer won't help, as it
2164         //  could be in read-only memory, or be accessed in some other thread)
2165         tmpbuf = (wchar_t *)malloc(inbuf + SIZEOF_WCHAR_T);
2166         for ( size_t i = 0; i < inlen; i++ )
2167             tmpbuf[n] = WC_BSWAP(psz[i]);
2168
2169         tmpbuf[inlen] = L'\0';
2170         psz = tmpbuf;
2171     }
2172
2173     if (buf)
2174     {
2175         // have destination buffer, convert there
2176         cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
2177
2178         res = n - outbuf;
2179
2180         // NB: iconv was given only wcslen(psz) characters on input, and so
2181         //     it couldn't convert the trailing zero. Let's do it ourselves
2182         //     if there's some room left for it in the output buffer.
2183         if (res < n)
2184             buf[0] = 0;
2185     }
2186     else
2187     {
2188         // no destination buffer: convert using temp buffer
2189         // to calculate destination buffer requirement
2190         char tbuf[16];
2191         res = 0;
2192         do
2193         {
2194             buf = tbuf;
2195             outbuf = 16;
2196
2197             cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
2198
2199             res += 16 - outbuf;
2200         }
2201         while ((cres == (size_t)-1) && (errno == E2BIG));
2202     }
2203
2204     if (ms_wcNeedsSwap)
2205     {
2206         free(tmpbuf);
2207     }
2208
2209     if (ICONV_FAILED(cres, inbuf))
2210     {
2211         wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
2212         return wxCONV_FAILED;
2213     }
2214
2215     return res;
2216 }
2217
2218 size_t wxMBConv_iconv::GetMBNulLen() const
2219 {
2220     if ( m_minMBCharWidth == 0 )
2221     {
2222         wxMBConv_iconv * const self = wxConstCast(this, wxMBConv_iconv);
2223
2224 #if wxUSE_THREADS
2225         // NB: explained in MB2WC
2226         wxMutexLocker lock(self->m_iconvMutex);
2227 #endif
2228
2229         const wchar_t *wnul = L"";
2230         char buf[8]; // should be enough for NUL in any encoding
2231         size_t inLen = sizeof(wchar_t),
2232                outLen = WXSIZEOF(buf);
2233         char *inBuff = (char *)wnul;
2234         char *outBuff = buf;
2235         if ( iconv(w2m, ICONV_CHAR_CAST(&inBuff), &inLen, &outBuff, &outLen) == (size_t)-1 )
2236         {
2237             self->m_minMBCharWidth = (size_t)-1;
2238         }
2239         else // ok
2240         {
2241             self->m_minMBCharWidth = outBuff - buf;
2242         }
2243     }
2244
2245     return m_minMBCharWidth;
2246 }
2247
2248 #if wxUSE_UNICODE_UTF8
2249 bool wxMBConv_iconv::IsUTF8() const
2250 {
2251     return wxStricmp(m_name, "UTF-8") == 0 ||
2252            wxStricmp(m_name, "UTF8") == 0;
2253 }
2254 #endif
2255
2256 #endif // HAVE_ICONV
2257
2258
2259 // ============================================================================
2260 // Win32 conversion classes
2261 // ============================================================================
2262
2263 #ifdef wxHAVE_WIN32_MB2WC
2264
2265 // from utils.cpp
2266 #if wxUSE_FONTMAP
2267 extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const char *charset);
2268 extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
2269 #endif
2270
2271 class wxMBConv_win32 : public wxMBConv
2272 {
2273 public:
2274     wxMBConv_win32()
2275     {
2276         m_CodePage = CP_ACP;
2277         m_minMBCharWidth = 0;
2278     }
2279
2280     wxMBConv_win32(const wxMBConv_win32& conv)
2281         : wxMBConv()
2282     {
2283         m_CodePage = conv.m_CodePage;
2284         m_minMBCharWidth = conv.m_minMBCharWidth;
2285     }
2286
2287 #if wxUSE_FONTMAP
2288     wxMBConv_win32(const char* name)
2289     {
2290         m_CodePage = wxCharsetToCodepage(name);
2291         m_minMBCharWidth = 0;
2292     }
2293
2294     wxMBConv_win32(wxFontEncoding encoding)
2295     {
2296         m_CodePage = wxEncodingToCodepage(encoding);
2297         m_minMBCharWidth = 0;
2298     }
2299 #endif // wxUSE_FONTMAP
2300
2301     virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2302     {
2303         // note that we have to use MB_ERR_INVALID_CHARS flag as it without it
2304         // the behaviour is not compatible with the Unix version (using iconv)
2305         // and break the library itself, e.g. wxTextInputStream::NextChar()
2306         // wouldn't work if reading an incomplete MB char didn't result in an
2307         // error
2308         //
2309         // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
2310         // Win XP or newer and it is not supported for UTF-[78] so we always
2311         // use our own conversions in this case. See
2312         //     http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
2313         //     http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
2314         if ( m_CodePage == CP_UTF8 )
2315         {
2316             return wxMBConvUTF8().MB2WC(buf, psz, n);
2317         }
2318
2319         if ( m_CodePage == CP_UTF7 )
2320         {
2321             return wxMBConvUTF7().MB2WC(buf, psz, n);
2322         }
2323
2324         int flags = 0;
2325         if ( (m_CodePage < 50000 && m_CodePage != CP_SYMBOL) &&
2326                 IsAtLeastWin2kSP4() )
2327         {
2328             flags = MB_ERR_INVALID_CHARS;
2329         }
2330
2331         const size_t len = ::MultiByteToWideChar
2332                              (
2333                                 m_CodePage,     // code page
2334                                 flags,          // flags: fall on error
2335                                 psz,            // input string
2336                                 -1,             // its length (NUL-terminated)
2337                                 buf,            // output string
2338                                 buf ? n : 0     // size of output buffer
2339                              );
2340         if ( !len )
2341         {
2342             // function totally failed
2343             return wxCONV_FAILED;
2344         }
2345
2346         // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
2347         // check if we succeeded, by doing a double trip:
2348         if ( !flags && buf )
2349         {
2350             const size_t mbLen = strlen(psz);
2351             wxCharBuffer mbBuf(mbLen);
2352             if ( ::WideCharToMultiByte
2353                    (
2354                       m_CodePage,
2355                       0,
2356                       buf,
2357                       -1,
2358                       mbBuf.data(),
2359                       mbLen + 1,        // size in bytes, not length
2360                       NULL,
2361                       NULL
2362                    ) == 0 ||
2363                   strcmp(mbBuf, psz) != 0 )
2364             {
2365                 // we didn't obtain the same thing we started from, hence
2366                 // the conversion was lossy and we consider that it failed
2367                 return wxCONV_FAILED;
2368             }
2369         }
2370
2371         // note that it returns count of written chars for buf != NULL and size
2372         // of the needed buffer for buf == NULL so in either case the length of
2373         // the string (which never includes the terminating NUL) is one less
2374         return len - 1;
2375     }
2376
2377     virtual size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
2378     {
2379         /*
2380             we have a problem here: by default, WideCharToMultiByte() may
2381             replace characters unrepresentable in the target code page with bad
2382             quality approximations such as turning "1/2" symbol (U+00BD) into
2383             "1" for the code pages which don't have it and we, obviously, want
2384             to avoid this at any price
2385
2386             the trouble is that this function does it _silently_, i.e. it won't
2387             even tell us whether it did or not... Win98/2000 and higher provide
2388             WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
2389             we have to resort to a round trip, i.e. check that converting back
2390             results in the same string -- this is, of course, expensive but
2391             otherwise we simply can't be sure to not garble the data.
2392          */
2393
2394         // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
2395         // it doesn't work with CJK encodings (which we test for rather roughly
2396         // here...) nor with UTF-7/8 nor, of course, with Windows versions not
2397         // supporting it
2398         BOOL usedDef wxDUMMY_INITIALIZE(false);
2399         BOOL *pUsedDef;
2400         int flags;
2401         if ( CanUseNoBestFit() && m_CodePage < 50000 )
2402         {
2403             // it's our lucky day
2404             flags = WC_NO_BEST_FIT_CHARS;
2405             pUsedDef = &usedDef;
2406         }
2407         else // old system or unsupported encoding
2408         {
2409             flags = 0;
2410             pUsedDef = NULL;
2411         }
2412
2413         const size_t len = ::WideCharToMultiByte
2414                              (
2415                                 m_CodePage,     // code page
2416                                 flags,          // either none or no best fit
2417                                 pwz,            // input string
2418                                 -1,             // it is (wide) NUL-terminated
2419                                 buf,            // output buffer
2420                                 buf ? n : 0,    // and its size
2421                                 NULL,           // default "replacement" char
2422                                 pUsedDef        // [out] was it used?
2423                              );
2424
2425         if ( !len )
2426         {
2427             // function totally failed
2428             return wxCONV_FAILED;
2429         }
2430
2431         // we did something, check if we really succeeded
2432         if ( flags )
2433         {
2434             // check if the conversion failed, i.e. if any replacements
2435             // were done
2436             if ( usedDef )
2437                 return wxCONV_FAILED;
2438         }
2439         else // we must resort to double tripping...
2440         {
2441             // first we need to ensure that we really have the MB data: this is
2442             // not the case if we're called with NULL buffer, in which case we
2443             // need to do the conversion yet again
2444             wxCharBuffer bufDef;
2445             if ( !buf )
2446             {
2447                 bufDef = wxCharBuffer(len);
2448                 buf = bufDef.data();
2449                 if ( !::WideCharToMultiByte(m_CodePage, flags, pwz, -1,
2450                                             buf, len, NULL, NULL) )
2451                     return wxCONV_FAILED;
2452             }
2453
2454             wxWCharBuffer wcBuf(n);
2455             if ( MB2WC(wcBuf.data(), buf, n) == wxCONV_FAILED ||
2456                     wcscmp(wcBuf, pwz) != 0 )
2457             {
2458                 // we didn't obtain the same thing we started from, hence
2459                 // the conversion was lossy and we consider that it failed
2460                 return wxCONV_FAILED;
2461             }
2462         }
2463
2464         // see the comment above for the reason of "len - 1"
2465         return len - 1;
2466     }
2467
2468     virtual size_t GetMBNulLen() const
2469     {
2470         if ( m_minMBCharWidth == 0 )
2471         {
2472             int len = ::WideCharToMultiByte
2473                         (
2474                             m_CodePage,     // code page
2475                             0,              // no flags
2476                             L"",            // input string
2477                             1,              // translate just the NUL
2478                             NULL,           // output buffer
2479                             0,              // and its size
2480                             NULL,           // no replacement char
2481                             NULL            // [out] don't care if it was used
2482                         );
2483
2484             wxMBConv_win32 * const self = wxConstCast(this, wxMBConv_win32);
2485             switch ( len )
2486             {
2487                 default:
2488                     wxLogDebug(_T("Unexpected NUL length %d"), len);
2489                     self->m_minMBCharWidth = (size_t)-1;
2490                     break;
2491
2492                 case 0:
2493                     self->m_minMBCharWidth = (size_t)-1;
2494                     break;
2495
2496                 case 1:
2497                 case 2:
2498                 case 4:
2499                     self->m_minMBCharWidth = len;
2500                     break;
2501             }
2502         }
2503
2504         return m_minMBCharWidth;
2505     }
2506
2507     virtual wxMBConv *Clone() const { return new wxMBConv_win32(*this); }
2508
2509     bool IsOk() const { return m_CodePage != -1; }
2510
2511 private:
2512     static bool CanUseNoBestFit()
2513     {
2514         static int s_isWin98Or2k = -1;
2515
2516         if ( s_isWin98Or2k == -1 )
2517         {
2518             int verMaj, verMin;
2519             switch ( wxGetOsVersion(&verMaj, &verMin) )
2520             {
2521                 case wxOS_WINDOWS_9X:
2522                     s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
2523                     break;
2524
2525                 case wxOS_WINDOWS_NT:
2526                     s_isWin98Or2k = verMaj >= 5;
2527                     break;
2528
2529                 default:
2530                     // unknown: be conservative by default
2531                     s_isWin98Or2k = 0;
2532                     break;
2533             }
2534
2535             wxASSERT_MSG( s_isWin98Or2k != -1, _T("should be set above") );
2536         }
2537
2538         return s_isWin98Or2k == 1;
2539     }
2540
2541     static bool IsAtLeastWin2kSP4()
2542     {
2543 #ifdef __WXWINCE__
2544         return false;
2545 #else
2546         static int s_isAtLeastWin2kSP4 = -1;
2547
2548         if ( s_isAtLeastWin2kSP4 == -1 )
2549         {
2550             OSVERSIONINFOEX ver;
2551
2552             memset(&ver, 0, sizeof(ver));
2553             ver.dwOSVersionInfoSize = sizeof(ver);
2554             GetVersionEx((OSVERSIONINFO*)&ver);
2555
2556             s_isAtLeastWin2kSP4 =
2557               ((ver.dwMajorVersion > 5) || // Vista+
2558                (ver.dwMajorVersion == 5 && ver.dwMinorVersion > 0) || // XP/2003
2559                (ver.dwMajorVersion == 5 && ver.dwMinorVersion == 0 &&
2560                ver.wServicePackMajor >= 4)) // 2000 SP4+
2561               ? 1 : 0;
2562         }
2563
2564         return s_isAtLeastWin2kSP4 == 1;
2565 #endif
2566     }
2567
2568
2569     // the code page we're working with
2570     long m_CodePage;
2571
2572     // cached result of GetMBNulLen(), set to 0 initially meaning
2573     // "unknown"
2574     size_t m_minMBCharWidth;
2575 };
2576
2577 #endif // wxHAVE_WIN32_MB2WC
2578
2579
2580 // ============================================================================
2581 // wxEncodingConverter based conversion classes
2582 // ============================================================================
2583
2584 #if wxUSE_FONTMAP
2585
2586 class wxMBConv_wxwin : public wxMBConv
2587 {
2588 private:
2589     void Init()
2590     {
2591         // Refuse to use broken wxEncodingConverter code for Mac-specific encodings.
2592         // The wxMBConv_cf class does a better job.
2593         m_ok = (m_enc < wxFONTENCODING_MACMIN || m_enc > wxFONTENCODING_MACMAX) &&
2594                m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
2595                w2m.Init(wxFONTENCODING_UNICODE, m_enc);
2596     }
2597
2598 public:
2599     // temporarily just use wxEncodingConverter stuff,
2600     // so that it works while a better implementation is built
2601     wxMBConv_wxwin(const char* name)
2602     {
2603         if (name)
2604             m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
2605         else
2606             m_enc = wxFONTENCODING_SYSTEM;
2607
2608         Init();
2609     }
2610
2611     wxMBConv_wxwin(wxFontEncoding enc)
2612     {
2613         m_enc = enc;
2614
2615         Init();
2616     }
2617
2618     size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
2619     {
2620         size_t inbuf = strlen(psz);
2621         if (buf)
2622         {
2623             if (!m2w.Convert(psz, buf))
2624                 return wxCONV_FAILED;
2625         }
2626         return inbuf;
2627     }
2628
2629     size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
2630     {
2631         const size_t inbuf = wxWcslen(psz);
2632         if (buf)
2633         {
2634             if (!w2m.Convert(psz, buf))
2635                 return wxCONV_FAILED;
2636         }
2637
2638         return inbuf;
2639     }
2640
2641     virtual size_t GetMBNulLen() const
2642     {
2643         switch ( m_enc )
2644         {
2645             case wxFONTENCODING_UTF16BE:
2646             case wxFONTENCODING_UTF16LE:
2647                 return 2;
2648
2649             case wxFONTENCODING_UTF32BE:
2650             case wxFONTENCODING_UTF32LE:
2651                 return 4;
2652
2653             default:
2654                 return 1;
2655         }
2656     }
2657
2658     virtual wxMBConv *Clone() const { return new wxMBConv_wxwin(m_enc); }
2659
2660     bool IsOk() const { return m_ok; }
2661
2662 public:
2663     wxFontEncoding m_enc;
2664     wxEncodingConverter m2w, w2m;
2665
2666 private:
2667     // were we initialized successfully?
2668     bool m_ok;
2669
2670     DECLARE_NO_COPY_CLASS(wxMBConv_wxwin)
2671 };
2672
2673 // make the constructors available for unit testing
2674 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_wxwin( const char* name )
2675 {
2676     wxMBConv_wxwin* result = new wxMBConv_wxwin( name );
2677     if ( !result->IsOk() )
2678     {
2679         delete result;
2680         return 0;
2681     }
2682
2683     return result;
2684 }
2685
2686 #endif // wxUSE_FONTMAP
2687
2688 // ============================================================================
2689 // wxCSConv implementation
2690 // ============================================================================
2691
2692 void wxCSConv::Init()
2693 {
2694     m_name = NULL;
2695     m_convReal =  NULL;
2696     m_deferred = true;
2697 }
2698
2699 wxCSConv::wxCSConv(const wxString& charset)
2700 {
2701     Init();
2702
2703     if ( !charset.empty() )
2704     {
2705         SetName(charset.ToAscii());
2706     }
2707
2708 #if wxUSE_FONTMAP
2709     m_encoding = wxFontMapperBase::GetEncodingFromName(charset);
2710 #else
2711     m_encoding = wxFONTENCODING_SYSTEM;
2712 #endif
2713 }
2714
2715 wxCSConv::wxCSConv(wxFontEncoding encoding)
2716 {
2717     if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
2718     {
2719         wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
2720
2721         encoding = wxFONTENCODING_SYSTEM;
2722     }
2723
2724     Init();
2725
2726     m_encoding = encoding;
2727 }
2728
2729 wxCSConv::~wxCSConv()
2730 {
2731     Clear();
2732 }
2733
2734 wxCSConv::wxCSConv(const wxCSConv& conv)
2735         : wxMBConv()
2736 {
2737     Init();
2738
2739     SetName(conv.m_name);
2740     m_encoding = conv.m_encoding;
2741 }
2742
2743 wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
2744 {
2745     Clear();
2746
2747     SetName(conv.m_name);
2748     m_encoding = conv.m_encoding;
2749
2750     return *this;
2751 }
2752
2753 void wxCSConv::Clear()
2754 {
2755     free(m_name);
2756     delete m_convReal;
2757
2758     m_name = NULL;
2759     m_convReal = NULL;
2760 }
2761
2762 void wxCSConv::SetName(const char *charset)
2763 {
2764     if (charset)
2765     {
2766         m_name = wxStrdup(charset);
2767         m_deferred = true;
2768     }
2769 }
2770
2771 #if wxUSE_FONTMAP
2772
// Map from an encoding to a charset name, keyed by the integer value of
// the encoding.
WX_DECLARE_HASH_MAP( wxFontEncoding, wxString, wxIntegerHash, wxIntegerEqual,
                     wxEncodingNameCache );

// Cache used by wxCSConv::DoCreate(): it stores the name which was
// successfully used to create an iconv converter for an encoding, or an
// empty string if all the names tried for this encoding failed.
static wxEncodingNameCache gs_nameCache;
2777 #endif
2778
2779 wxMBConv *wxCSConv::DoCreate() const
2780 {
2781 #if wxUSE_FONTMAP
2782     wxLogTrace(TRACE_STRCONV,
2783                wxT("creating conversion for %s"),
2784                (m_name ? m_name
2785                        : (const char*)wxFontMapperBase::GetEncodingName(m_encoding).mb_str()));
2786 #endif // wxUSE_FONTMAP
2787
2788     // check for the special case of ASCII or ISO8859-1 charset: as we have
2789     // special knowledge of it anyhow, we don't need to create a special
2790     // conversion object
2791     if ( m_encoding == wxFONTENCODING_ISO8859_1 ||
2792             m_encoding == wxFONTENCODING_DEFAULT )
2793     {
2794         // don't convert at all
2795         return NULL;
2796     }
2797
2798     // we trust OS to do conversion better than we can so try external
2799     // conversion methods first
2800     //
2801     // the full order is:
2802     //      1. OS conversion (iconv() under Unix or Win32 API)
2803     //      2. hard coded conversions for UTF
2804     //      3. wxEncodingConverter as fall back
2805
2806     // step (1)
2807 #ifdef HAVE_ICONV
2808 #if !wxUSE_FONTMAP
2809     if ( m_name )
2810 #endif // !wxUSE_FONTMAP
2811     {
2812 #if wxUSE_FONTMAP
2813         wxFontEncoding encoding(m_encoding);
2814 #endif
2815
2816         if ( m_name )
2817         {
2818             wxMBConv_iconv *conv = new wxMBConv_iconv(m_name);
2819             if ( conv->IsOk() )
2820                 return conv;
2821
2822             delete conv;
2823
2824 #if wxUSE_FONTMAP
2825             encoding =
2826                 wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
2827 #endif // wxUSE_FONTMAP
2828         }
2829 #if wxUSE_FONTMAP
2830         {
2831             const wxEncodingNameCache::iterator it = gs_nameCache.find(encoding);
2832             if ( it != gs_nameCache.end() )
2833             {
2834                 if ( it->second.empty() )
2835                     return NULL;
2836
2837                 wxMBConv_iconv *conv = new wxMBConv_iconv(it->second.ToAscii());
2838                 if ( conv->IsOk() )
2839                     return conv;
2840
2841                 delete conv;
2842             }
2843
2844             const wxChar** names = wxFontMapperBase::GetAllEncodingNames(encoding);
2845             // CS : in case this does not return valid names (eg for MacRoman)
2846             // encoding got a 'failure' entry in the cache all the same,
2847             // although it just has to be created using a different method, so
2848             // only store failed iconv creation attempts (or perhaps we
2849             // shoulnd't do this at all ?)
2850             if ( names[0] != NULL )
2851             {
2852                 for ( ; *names; ++names )
2853                 {
2854                     // FIXME-UTF8: wxFontMapperBase::GetAllEncodingNames()
2855                     //             will need changes that will obsolete this
2856                     wxString name(*names);
2857                     wxMBConv_iconv *conv = new wxMBConv_iconv(name.ToAscii());
2858                     if ( conv->IsOk() )
2859                     {
2860                         gs_nameCache[encoding] = *names;
2861                         return conv;
2862                     }
2863
2864                     delete conv;
2865                 }
2866
2867                 gs_nameCache[encoding] = _T(""); // cache the failure
2868             }
2869         }
2870 #endif // wxUSE_FONTMAP
2871     }
2872 #endif // HAVE_ICONV
2873
2874 #ifdef wxHAVE_WIN32_MB2WC
2875     {
2876 #if wxUSE_FONTMAP
2877         wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
2878                                       : new wxMBConv_win32(m_encoding);
2879         if ( conv->IsOk() )
2880             return conv;
2881
2882         delete conv;
2883 #else
2884         return NULL;
2885 #endif
2886     }
2887 #endif // wxHAVE_WIN32_MB2WC
2888
2889 #ifdef __DARWIN__
2890     {
2891         // leave UTF16 and UTF32 to the built-ins of wx
2892         if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
2893             ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
2894         {
2895 #if wxUSE_FONTMAP
2896             wxMBConv_cf *conv = m_name ? new wxMBConv_cf(m_name)
2897                                           : new wxMBConv_cf(m_encoding);
2898 #else
2899             wxMBConv_cf *conv = new wxMBConv_cf(m_encoding);
2900 #endif
2901
2902             if ( conv->IsOk() )
2903                  return conv;
2904
2905             delete conv;
2906         }
2907     }
2908 #endif // __DARWIN__
2909
2910     // step (2)
2911     wxFontEncoding enc = m_encoding;
2912 #if wxUSE_FONTMAP
2913     if ( enc == wxFONTENCODING_SYSTEM && m_name )
2914     {
2915         // use "false" to suppress interactive dialogs -- we can be called from
2916         // anywhere and popping up a dialog from here is the last thing we want to
2917         // do
2918         enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
2919     }
2920 #endif // wxUSE_FONTMAP
2921
2922     switch ( enc )
2923     {
2924         case wxFONTENCODING_UTF7:
2925              return new wxMBConvUTF7;
2926
2927         case wxFONTENCODING_UTF8:
2928              return new wxMBConvUTF8;
2929
2930         case wxFONTENCODING_UTF16BE:
2931              return new wxMBConvUTF16BE;
2932
2933         case wxFONTENCODING_UTF16LE:
2934              return new wxMBConvUTF16LE;
2935
2936         case wxFONTENCODING_UTF32BE:
2937              return new wxMBConvUTF32BE;
2938
2939         case wxFONTENCODING_UTF32LE:
2940              return new wxMBConvUTF32LE;
2941
2942         default:
2943              // nothing to do but put here to suppress gcc warnings
2944              break;
2945     }
2946
2947     // step (3)
2948 #if wxUSE_FONTMAP
2949     {
2950         wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
2951                                       : new wxMBConv_wxwin(m_encoding);
2952         if ( conv->IsOk() )
2953             return conv;
2954
2955         delete conv;
2956     }
2957 #endif // wxUSE_FONTMAP
2958
2959     // NB: This is a hack to prevent deadlock. What could otherwise happen
2960     //     in Unicode build: wxConvLocal creation ends up being here
2961     //     because of some failure and logs the error. But wxLog will try to
2962     //     attach a timestamp, for which it will need wxConvLocal (to convert
2963     //     time to char* and then wchar_t*), but that fails, tries to log the
2964     //     error, but wxLog has an (already locked) critical section that
2965     //     guards the static buffer.
2966     static bool alreadyLoggingError = false;
2967     if (!alreadyLoggingError)
2968     {
2969         alreadyLoggingError = true;
2970         wxLogError(_("Cannot convert from the charset '%s'!"),
2971                    m_name ? m_name
2972                       :
2973 #if wxUSE_FONTMAP
2974                          (const char*)wxFontMapperBase::GetEncodingDescription(m_encoding).ToAscii()
2975 #else // !wxUSE_FONTMAP
2976                          (const char*)wxString::Format(_("encoding %i"), m_encoding).ToAscii()
2977 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
2978               );
2979
2980         alreadyLoggingError = false;
2981     }
2982
2983     return NULL;
2984 }
2985
2986 void wxCSConv::CreateConvIfNeeded() const
2987 {
2988     if ( m_deferred )
2989     {
2990         wxCSConv *self = (wxCSConv *)this; // const_cast
2991
2992         // if we don't have neither the name nor the encoding, use the default
2993         // encoding for this system
2994         if ( !m_name && m_encoding == wxFONTENCODING_SYSTEM )
2995         {
2996 #if wxUSE_INTL
2997             self->m_encoding = wxLocale::GetSystemEncoding();
2998 #else
2999             // fallback to some reasonable default:
3000             self->m_encoding = wxFONTENCODING_ISO8859_1;
3001 #endif // wxUSE_INTL
3002         }
3003
3004         self->m_convReal = DoCreate();
3005         self->m_deferred = false;
3006     }
3007 }
3008
3009 bool wxCSConv::IsOk() const
3010 {
3011     CreateConvIfNeeded();
3012
3013     // special case: no convReal created for wxFONTENCODING_ISO8859_1
3014     if ( m_encoding == wxFONTENCODING_ISO8859_1 )
3015         return true; // always ok as we do it ourselves
3016
3017     // m_convReal->IsOk() is called at its own creation, so we know it must
3018     // be ok if m_convReal is non-NULL
3019     return m_convReal != NULL;
3020 }
3021
3022 size_t wxCSConv::ToWChar(wchar_t *dst, size_t dstLen,
3023                          const char *src, size_t srcLen) const
3024 {
3025     CreateConvIfNeeded();
3026
3027     if (m_convReal)
3028         return m_convReal->ToWChar(dst, dstLen, src, srcLen);
3029
3030     // latin-1 (direct)
3031     return wxMBConv::ToWChar(dst, dstLen, src, srcLen);
3032 }
3033
3034 size_t wxCSConv::FromWChar(char *dst, size_t dstLen,
3035                            const wchar_t *src, size_t srcLen) const
3036 {
3037     CreateConvIfNeeded();
3038
3039     if (m_convReal)
3040         return m_convReal->FromWChar(dst, dstLen, src, srcLen);
3041
3042     // latin-1 (direct)
3043     return wxMBConv::FromWChar(dst, dstLen, src, srcLen);
3044 }
3045
3046 size_t wxCSConv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
3047 {
3048     CreateConvIfNeeded();
3049
3050     if (m_convReal)
3051         return m_convReal->MB2WC(buf, psz, n);
3052
3053     // latin-1 (direct)
3054     size_t len = strlen(psz);
3055
3056     if (buf)
3057     {
3058         for (size_t c = 0; c <= len; c++)
3059             buf[c] = (unsigned char)(psz[c]);
3060     }
3061
3062     return len;
3063 }
3064
3065 size_t wxCSConv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
3066 {
3067     CreateConvIfNeeded();
3068
3069     if (m_convReal)
3070         return m_convReal->WC2MB(buf, psz, n);
3071
3072     // latin-1 (direct)
3073     const size_t len = wxWcslen(psz);
3074     if (buf)
3075     {
3076         for (size_t c = 0; c <= len; c++)
3077         {
3078             if (psz[c] > 0xFF)
3079                 return wxCONV_FAILED;
3080
3081             buf[c] = (char)psz[c];
3082         }
3083     }
3084     else
3085     {
3086         for (size_t c = 0; c <= len; c++)
3087         {
3088             if (psz[c] > 0xFF)
3089                 return wxCONV_FAILED;
3090         }
3091     }
3092
3093     return len;
3094 }
3095
3096 size_t wxCSConv::GetMBNulLen() const
3097 {
3098     CreateConvIfNeeded();
3099
3100     if ( m_convReal )
3101     {
3102         return m_convReal->GetMBNulLen();
3103     }
3104
3105     // otherwise, we are ISO-8859-1
3106     return 1;
3107 }
3108
#if wxUSE_UNICODE_UTF8
// Returns whatever the real converter reports or false if there is none, as
// we then behave as an ISO-8859-1 converter.
bool wxCSConv::IsUTF8() const
{
    CreateConvIfNeeded();

    if ( !m_convReal )
    {
        // no real converter, so we are ISO-8859-1
        return false;
    }

    return m_convReal->IsUTF8();
}
#endif // wxUSE_UNICODE_UTF8
3123
3124
3125 #if wxUSE_UNICODE
3126
3127 wxWCharBuffer wxSafeConvertMB2WX(const char *s)
3128 {
3129     if ( !s )
3130         return wxWCharBuffer();
3131
3132     wxWCharBuffer wbuf(wxConvLibc.cMB2WX(s));
3133     if ( !wbuf )
3134         wbuf = wxMBConvUTF8().cMB2WX(s);
3135     if ( !wbuf )
3136         wbuf = wxConvISO8859_1.cMB2WX(s);
3137
3138     return wbuf;
3139 }
3140
3141 wxCharBuffer wxSafeConvertWX2MB(const wchar_t *ws)
3142 {
3143     if ( !ws )
3144         return wxCharBuffer();
3145
3146     wxCharBuffer buf(wxConvLibc.cWX2MB(ws));
3147     if ( !buf )
3148         buf = wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL).cWX2MB(ws);
3149
3150     return buf;
3151 }
3152
3153 #endif // wxUSE_UNICODE
3154
3155 // ----------------------------------------------------------------------------
3156 // globals
3157 // ----------------------------------------------------------------------------
3158
// NB: The reason why we create the converter objects in this convoluted way,
//     using a factory function instead of a global variable, is that they
//     may be used at static initialization time (some of them are used by
//     wxString ctors and there may be a global wxString object). In other
//     words, possibly _before_ the converter global object would be
//     initialized.
3165
3166 #undef wxConvLibc
3167 #undef wxConvUTF8
3168 #undef wxConvUTF7
3169 #undef wxConvLocal
3170 #undef wxConvISO8859_1
3171
// Defines everything needed for one global converter called "conv":
//
//  - the exported variable convPtr of type "iface*", initially NULL
//  - the exported function wxGet_convPtr() returning the address of a
//    function-local static object of type "impl" constructed with "init"
//  - a file-static pointer initialized by calling this function
//
// No semicolon follows the last definition, it must be provided at the point
// of use.
#define WX_DEFINE_GLOBAL_CONV2(iface, impl, conv, init)                 \
    WXDLLIMPEXP_DATA_BASE(iface*) conv##Ptr = NULL;                     \
    WXDLLIMPEXP_BASE iface* wxGet_##conv##Ptr()                         \
    {                                                                   \
        static impl conv##Obj init;                                     \
        return &conv##Obj;                                              \
    }                                                                   \
    /* calling the accessor here ensures that all the global */         \
    /* converter objects are created by the time the static */          \
    /* initialization is done, i.e. before any thread is launched: */   \
    static iface* gs_##conv##instance = wxGet_##conv##Ptr()

// Same as above for the common case when the object is accessed using its
// own type.
#define WX_DEFINE_GLOBAL_CONV(type, conv, init) \
    WX_DEFINE_GLOBAL_CONV2(type, type, conv, init)
3186
3187 #ifdef __WINDOWS__
3188     WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConv_win32, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
3189 #else
3190     WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConvLibc, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
3191 #endif
3192
3193 // NB: we can't use wxEMPTY_PARAMETER_VALUE as final argument here because it's
3194 //     passed to WX_DEFINE_GLOBAL_CONV2 after a macro expansion and so still
3195 //     provokes an error message about "not enough macro parameters"; and we
3196 //     can't use "()" here as the name##Obj declaration would be parsed as a
3197 //     function declaration then, so use a semicolon and live with an extra
3198 //     empty statement (and hope that no compilers warns about this)
3199 WX_DEFINE_GLOBAL_CONV(wxMBConvStrictUTF8, wxConvUTF8, ;);
3200 WX_DEFINE_GLOBAL_CONV(wxMBConvUTF7, wxConvUTF7, ;);
3201
3202 WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvLocal, (wxFONTENCODING_SYSTEM));
3203 WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvISO8859_1, (wxFONTENCODING_ISO8859_1));
3204
3205 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = wxGet_wxConvLibcPtr();
3206 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvUI = wxGet_wxConvLocalPtr();
3207
3208 #ifdef __DARWIN__
3209 // The xnu kernel always communicates file paths in decomposed UTF-8.
3210 // WARNING: Are we sure that CFString's conversion will cause decomposition?
3211 static wxMBConv_cf wxConvMacUTF8DObj(wxFONTENCODING_UTF8);
3212 #endif
3213
3214 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName =
3215 #ifdef __DARWIN__
3216                                     &wxConvMacUTF8DObj;
3217 #else // !__DARWIN__
3218                                     wxGet_wxConvLibcPtr();
3219 #endif // __DARWIN__/!__DARWIN__
3220
3221 #else // !wxUSE_WCHAR_T
3222
3223 // FIXME-UTF8: remove this, wxUSE_WCHAR_T is required now
3224 // stand-ins in absence of wchar_t
3225 WXDLLIMPEXP_DATA_BASE(wxMBConv) wxConvLibc,
3226                                 wxConvISO8859_1,
3227                                 wxConvLocal,
3228                                 wxConvUTF8;
3229
3230 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T