src/common/strconv.cpp

   1 /////////////////////////////////////////////////////////////////////////////
   2 // Name:        src/common/strconv.cpp
   3 // Purpose:     Unicode conversion classes
   4 // Author:      Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
   5 //              Ryan Norton, Fredrik Roubert (UTF7)
   6 // Modified by:
   7 // Created:     29/01/98
   8 // RCS-ID:      $Id$
   9 // Copyright:   (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
  10 //              (c) 2000-2003 Vadim Zeitlin
  11 //              (c) 2004 Ryan Norton, Fredrik Roubert
  12 // Licence:     wxWindows licence
  13 /////////////////////////////////////////////////////////////////////////////
  14
  15 // For compilers that support precompilation, includes "wx.h".
  16 #include "wx/wxprec.h"
  17
  18 #ifdef __BORLANDC__
  19     #pragma hdrstop
  20 #endif  //__BORLANDC__
  21
  22 #ifndef WX_PRECOMP
  23     #include "wx/intl.h"
  24     #include "wx/log.h"
  25     #include "wx/utils.h"
  26     #include "wx/hashmap.h"
  27 #endif
  28
  29 #include "wx/strconv.h"
  30
  31 #if wxUSE_WCHAR_T
  32
  33 #ifndef __WXWINCE__
  34 #include <errno.h>
  35 #endif
  36
  37 #include <ctype.h>
  38 #include <string.h>
  39 #include <stdlib.h>
  40
  41 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
  42     #include "wx/msw/private.h"
  43     #include "wx/msw/missing.h"
  44     #define wxHAVE_WIN32_MB2WC
  45 #endif
  46
  47 #ifdef __SALFORDC__
  48     #include <clib.h>
  49 #endif
  50
  51 #ifdef HAVE_ICONV
  52     #include <iconv.h>
  53     #include "wx/thread.h"
  54 #endif
  55
  56 #include "wx/encconv.h"
  57 #include "wx/fontmap.h"
  58
  59 #ifdef __DARWIN__
  60 #include "wx/mac/corefoundation/private/strconv_cf.h"
  61 #endif //def __DARWIN__
  62
  63
  64 #define TRACE_STRCONV _T("strconv")
  65
  66 // WC_UTF16 is defined only if sizeof(wchar_t) == 2, otherwise it's supposed to
  67 // be 4 bytes
  68 #if SIZEOF_WCHAR_T == 2
  69     #define WC_UTF16
  70 #endif
  71
  72
  73 // ============================================================================
  74 // implementation
  75 // ============================================================================
  76
  77 // helper function of cMB2WC(): check if n bytes at this location are all NUL
  78 static bool NotAllNULs(const char *p, size_t n)
  79 {
  80     while ( n && *p++ == '\0' )
  81         n--;
  82
  83     return n != 0;
  84 }
  85
  86 // ----------------------------------------------------------------------------
  87 // UTF-16 en/decoding to/from UCS-4 with surrogates handling
  88 // ----------------------------------------------------------------------------
  89
  90 static size_t encode_utf16(wxUint32 input, wxUint16 *output)
  91 {
  92     if (input <= 0xffff)
  93     {
  94         if (output)
  95             *output = (wxUint16) input;
  96
  97         return 1;
  98     }
  99     else if (input >= 0x110000)
 100     {
 101         return wxCONV_FAILED;
 102     }
 103     else
 104     {
 105         if (output)
 106         {
 107             *output++ = (wxUint16) ((input >> 10) + 0xd7c0);
 108             *output = (wxUint16) ((input & 0x3ff) + 0xdc00);
 109         }
 110
 111         return 2;
 112     }
 113 }
 114
 115 static size_t decode_utf16(const wxUint16* input, wxUint32& output)
 116 {
 117     if ((*input < 0xd800) || (*input > 0xdfff))
 118     {
 119         output = *input;
 120         return 1;
 121     }
 122     else if ((input[1] < 0xdc00) || (input[1] > 0xdfff))
 123     {
 124         output = *input;
 125         return wxCONV_FAILED;
 126     }
 127     else
 128     {
 129         output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
 130         return 2;
 131     }
 132 }
 133
 134 #ifdef WC_UTF16
 135     typedef wchar_t wxDecodeSurrogate_t;
 136 #else // !WC_UTF16
 137     typedef wxUint16 wxDecodeSurrogate_t;
 138 #endif // WC_UTF16/!WC_UTF16
 139
 140 // returns the next UTF-32 character from the wchar_t buffer and advances the
 141 // pointer to the character after this one
 142 //
 143 // if an invalid character is found, *pSrc is set to NULL, the caller must
 144 // check for this
 145 static wxUint32 wxDecodeSurrogate(const wxDecodeSurrogate_t **pSrc)
 146 {
 147     wxUint32 out;
 148     const size_t
 149         n = decode_utf16(wx_reinterpret_cast(const wxUint16 *, *pSrc), out);
 150     if ( n == wxCONV_FAILED )
 151         *pSrc = NULL;
 152     else
 153         *pSrc += n;
 154
 155     return out;
 156 }
 157
 158 // ----------------------------------------------------------------------------
 159 // wxMBConv
 160 // ----------------------------------------------------------------------------
 161
 162 size_t
 163 wxMBConv::ToWChar(wchar_t *dst, size_t dstLen,
 164                   const char *src, size_t srcLen) const
 165 {
 166     // although new conversion classes are supposed to implement this function
 167     // directly, the existins ones only implement the old MB2WC() and so, to
 168     // avoid to have to rewrite all conversion classes at once, we provide a
 169     // default (but not efficient) implementation of this one in terms of the
 170     // old function by copying the input to ensure that it's NUL-terminated and
 171     // then using MB2WC() to convert it
 172
 173     // the number of chars [which would be] written to dst [if it were not NULL]
 174     size_t dstWritten = 0;
 175
 176     // the number of NULs terminating this string
 177     size_t nulLen = 0;  // not really needed, but just to avoid warnings
 178
 179     // if we were not given the input size we just have to assume that the
 180     // string is properly terminated as we have no way of knowing how long it
 181     // is anyhow, but if we do have the size check whether there are enough
 182     // NULs at the end
 183     wxCharBuffer bufTmp;
 184     const char *srcEnd;
 185     if ( srcLen != wxNO_LEN )
 186     {
 187         // we need to know how to find the end of this string
 188         nulLen = GetMBNulLen();
 189         if ( nulLen == wxCONV_FAILED )
 190             return wxCONV_FAILED;
 191
 192         // if there are enough NULs we can avoid the copy
 193         if ( srcLen < nulLen || NotAllNULs(src + srcLen - nulLen, nulLen) )
 194         {
 195             // make a copy in order to properly NUL-terminate the string
 196             bufTmp = wxCharBuffer(srcLen + nulLen - 1 /* 1 will be added */);
 197             char * const p = bufTmp.data();
 198             memcpy(p, src, srcLen);
 199             for ( char *s = p + srcLen; s < p + srcLen + nulLen; s++ )
 200                 *s = '\0';
 201
 202             src = bufTmp;
 203         }
 204
 205         srcEnd = src + srcLen;
 206     }
 207     else // quit after the first loop iteration
 208     {
 209         srcEnd = NULL;
 210     }
 211
 212     for ( ;; )
 213     {
 214         // try to convert the current chunk
 215         size_t lenChunk = MB2WC(NULL, src, 0);
 216         if ( lenChunk == wxCONV_FAILED )
 217             return wxCONV_FAILED;
 218
 219         lenChunk++; // for the L'\0' at the end of this chunk
 220
 221         dstWritten += lenChunk;
 222
 223         if ( lenChunk == 1 )
 224         {
 225             // nothing left in the input string, conversion succeeded
 226             break;
 227         }
 228
 229         if ( dst )
 230         {
 231             if ( dstWritten > dstLen )
 232                 return wxCONV_FAILED;
 233
 234             if ( MB2WC(dst, src, lenChunk) == wxCONV_FAILED )
 235                 return wxCONV_FAILED;
 236
 237             dst += lenChunk;
 238         }
 239
 240         if ( !srcEnd )
 241         {
 242             // we convert just one chunk in this case as this is the entire
 243             // string anyhow
 244             break;
 245         }
 246
 247         // advance the input pointer past the end of this chunk
 248         while ( NotAllNULs(src, nulLen) )
 249         {
 250             // notice that we must skip over multiple bytes here as we suppose
 251             // that if NUL takes 2 or 4 bytes, then all the other characters do
 252             // too and so if advanced by a single byte we might erroneously
 253             // detect sequences of NUL bytes in the middle of the input
 254             src += nulLen;
 255         }
 256
 257         src += nulLen; // skipping over its terminator as well
 258
 259         // note that ">=" (and not just "==") is needed here as the terminator
 260         // we skipped just above could be inside or just after the buffer
 261         // delimited by inEnd
 262         if ( src >= srcEnd )
 263             break;
 264     }
 265
 266     return dstWritten;
 267 }
 268
 269 size_t
 270 wxMBConv::FromWChar(char *dst, size_t dstLen,
 271                     const wchar_t *src, size_t srcLen) const
 272 {
 273     // the number of chars [which would be] written to dst [if it were not NULL]
 274     size_t dstWritten = 0;
 275
 276     // make a copy of the input string unless it is already properly
 277     // NUL-terminated
 278     //
 279     // if we don't know its length we have no choice but to assume that it is,
 280     // indeed, properly terminated
 281     wxWCharBuffer bufTmp;
 282     if ( srcLen == wxNO_LEN )
 283     {
 284         srcLen = wxWcslen(src) + 1;
 285     }
 286     else if ( srcLen != 0 && src[srcLen - 1] != L'\0' )
 287     {
 288         // make a copy in order to properly NUL-terminate the string
 289         bufTmp = wxWCharBuffer(srcLen);
 290         memcpy(bufTmp.data(), src, srcLen * sizeof(wchar_t));
 291         src = bufTmp;
 292     }
 293
 294     const size_t lenNul = GetMBNulLen();
 295     for ( const wchar_t * const srcEnd = src + srcLen;
 296           src < srcEnd;
 297           src += wxWcslen(src) + 1 /* skip L'\0' too */ )
 298     {
 299         // try to convert the current chunk
 300         size_t lenChunk = WC2MB(NULL, src, 0);
 301
 302         if ( lenChunk == wxCONV_FAILED )
 303             return wxCONV_FAILED;
 304
 305         lenChunk += lenNul;
 306         dstWritten += lenChunk;
 307
 308         if ( dst )
 309         {
 310             if ( dstWritten > dstLen )
 311                 return wxCONV_FAILED;
 312
 313             if ( WC2MB(dst, src, lenChunk) == wxCONV_FAILED )
 314                 return wxCONV_FAILED;
 315
 316             dst += lenChunk;
 317         }
 318     }
 319
 320     return dstWritten;
 321 }
 322
 323 size_t wxMBConv::MB2WC(wchar_t *outBuff, const char *inBuff, size_t outLen) const
 324 {
 325     size_t rc = ToWChar(outBuff, outLen, inBuff);
 326     if ( rc != wxCONV_FAILED )
 327     {
 328         // ToWChar() returns the buffer length, i.e. including the trailing
 329         // NUL, while this method doesn't take it into account
 330         rc--;
 331     }
 332
 333     return rc;
 334 }
 335
 336 size_t wxMBConv::WC2MB(char *outBuff, const wchar_t *inBuff, size_t outLen) const
 337 {
 338     size_t rc = FromWChar(outBuff, outLen, inBuff);
 339     if ( rc != wxCONV_FAILED )
 340     {
 341         rc -= GetMBNulLen();
 342     }
 343
 344     return rc;
 345 }
 346
 347 wxMBConv::~wxMBConv()
 348 {
 349     // nothing to do here (necessary for Darwin linking probably)
 350 }
 351
 352 const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
 353 {
 354     if ( psz )
 355     {
 356         // calculate the length of the buffer needed first
 357         const size_t nLen = ToWChar(NULL, 0, psz);
 358         if ( nLen != wxCONV_FAILED )
 359         {
 360             // now do the actual conversion
 361             wxWCharBuffer buf(nLen - 1 /* +1 added implicitly */);
 362
 363             // +1 for the trailing NULL
 364             if ( ToWChar(buf.data(), nLen, psz) != wxCONV_FAILED )
 365                 return buf;
 366         }
 367     }
 368
 369     return wxWCharBuffer();
 370 }
 371
 372 const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
 373 {
 374     if ( pwz )
 375     {
 376         const size_t nLen = FromWChar(NULL, 0, pwz);
 377         if ( nLen != wxCONV_FAILED )
 378         {
 379             wxCharBuffer buf(nLen - 1);
 380             if ( FromWChar(buf.data(), nLen, pwz) != wxCONV_FAILED )
 381                 return buf;
 382         }
 383     }
 384
 385     return wxCharBuffer();
 386 }
 387
 388 const wxWCharBuffer
 389 wxMBConv::cMB2WC(const char *inBuff, size_t inLen, size_t *outLen) const
 390 {
 391     const size_t dstLen = ToWChar(NULL, 0, inBuff, inLen);
 392     if ( dstLen != wxCONV_FAILED )
 393     {
 394         // notice that we allocate space for dstLen+1 wide characters here
 395         // because we want the buffer to always be NUL-terminated, even if the
 396         // input isn't (as otherwise the caller has no way to know its length)
 397         wxWCharBuffer wbuf(dstLen);
 398         if ( ToWChar(wbuf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
 399         {
 400             if ( outLen )
 401             {
 402                 *outLen = dstLen;
 403                 if ( wbuf[dstLen - 1] == L'\0' )
 404                     (*outLen)--;
 405             }
 406
 407             return wbuf;
 408         }
 409     }
 410
 411     if ( outLen )
 412         *outLen = 0;
 413
 414     return wxWCharBuffer();
 415 }
 416
 417 const wxCharBuffer
 418 wxMBConv::cWC2MB(const wchar_t *inBuff, size_t inLen, size_t *outLen) const
 419 {
 420     size_t dstLen = FromWChar(NULL, 0, inBuff, inLen);
 421     if ( dstLen != wxCONV_FAILED )
 422     {
 423         const size_t nulLen = GetMBNulLen();
 424
 425         // as above, ensure that the buffer is always NUL-terminated, even if
 426         // the input is not
 427         wxCharBuffer buf(dstLen + nulLen - 1);
 428         memset(buf.data() + dstLen, 0, nulLen);
 429         if ( FromWChar(buf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
 430         {
 431             if ( outLen )
 432             {
 433                 *outLen = dstLen;
 434
 435                 if ( dstLen >= nulLen &&
 436                         !NotAllNULs(buf.data() + dstLen - nulLen, nulLen) )
 437                 {
 438                     // in this case the output is NUL-terminated and we're not
 439                     // supposed to count NUL
 440                     *outLen -= nulLen;
 441                 }
 442             }
 443
 444             return buf;
 445         }
 446     }
 447
 448     if ( outLen )
 449         *outLen = 0;
 450
 451     return wxCharBuffer();
 452 }
 453
 454 // ----------------------------------------------------------------------------
 455 // wxMBConvLibc
 456 // ----------------------------------------------------------------------------
 457
 458 size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 459 {
 460     return wxMB2WC(buf, psz, n);
 461 }
 462
 463 size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 464 {
 465     return wxWC2MB(buf, psz, n);
 466 }
 467
 468 // ----------------------------------------------------------------------------
 469 // wxConvBrokenFileNames
 470 // ----------------------------------------------------------------------------
 471
 472 #ifdef __UNIX__
 473
 474 wxConvBrokenFileNames::wxConvBrokenFileNames(const wxString& charset)
 475 {
 476     if ( wxStricmp(charset, _T("UTF-8")) == 0 ||
 477          wxStricmp(charset, _T("UTF8")) == 0  )
 478         m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_PUA);
 479     else
 480         m_conv = new wxCSConv(charset);
 481 }
 482
 483 #endif // __UNIX__
 484
 485 // ----------------------------------------------------------------------------
 486 // UTF-7
 487 // ----------------------------------------------------------------------------
 488
 489 // Implementation (C) 2004 Fredrik Roubert
 490
 491 //
 492 // BASE64 decoding table
 493 //
 494 static const unsigned char utf7unb64[] =
 495 {
 496     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 497     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 498     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 499     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 500     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 501     0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
 502     0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b,
 503     0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 504     0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
 505     0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
 506     0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
 507     0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
 508     0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
 509     0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
 510     0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30,
 511     0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
 512     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 513     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 514     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 515     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 516     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 517     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 518     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 519     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 520     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 521     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 522     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 523     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 524     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 525     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 526     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 527     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
 528 };
 529
 530 size_t wxMBConvUTF7::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 531 {
 532     size_t len = 0;
 533
 534     while ( *psz && (!buf || (len < n)) )
 535     {
 536         unsigned char cc = *psz++;
 537         if (cc != '+')
 538         {
 539             // plain ASCII char
 540             if (buf)
 541                 *buf++ = cc;
 542             len++;
 543         }
 544         else if (*psz == '-')
 545         {
 546             // encoded plus sign
 547             if (buf)
 548                 *buf++ = cc;
 549             len++;
 550             psz++;
 551         }
 552         else // start of BASE64 encoded string
 553         {
 554             bool lsb, ok;
 555             unsigned int d, l;
 556             for ( ok = lsb = false, d = 0, l = 0;
 557                   (cc = utf7unb64[(unsigned char)*psz]) != 0xff;
 558                   psz++ )
 559             {
 560                 d <<= 6;
 561                 d += cc;
 562                 for (l += 6; l >= 8; lsb = !lsb)
 563                 {
 564                     unsigned char c = (unsigned char)((d >> (l -= 8)) % 256);
 565                     if (lsb)
 566                     {
 567                         if (buf)
 568                             *buf++ |= c;
 569                         len ++;
 570                     }
 571                     else
 572                     {
 573                         if (buf)
 574                             *buf = (wchar_t)(c << 8);
 575                     }
 576
 577                     ok = true;
 578                 }
 579             }
 580
 581             if ( !ok )
 582             {
 583                 // in valid UTF7 we should have valid characters after '+'
 584                 return wxCONV_FAILED;
 585             }
 586
 587             if (*psz == '-')
 588                 psz++;
 589         }
 590     }
 591
 592     if ( buf && (len < n) )
 593         *buf = '\0';
 594
 595     return len;
 596 }
 597
 598 //
 599 // BASE64 encoding table
 600 //
 601 static const unsigned char utf7enb64[] =
 602 {
 603     'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
 604     'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
 605     'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
 606     'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
 607     'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
 608     'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
 609     'w', 'x', 'y', 'z', '0', '1', '2', '3',
 610     '4', '5', '6', '7', '8', '9', '+', '/'
 611 };
 612
 613 //
 614 // UTF-7 encoding table
 615 //
 616 // 0 - Set D (directly encoded characters)
 617 // 1 - Set O (optional direct characters)
 618 // 2 - whitespace characters (optional)
 619 // 3 - special characters
 620 //
 621 static const unsigned char utf7encode[128] =
 622 {
 623     3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
 624     3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
 625     2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
 626     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
 627     1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 628     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
 629     1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 630     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
 631 };
 632
 633 size_t wxMBConvUTF7::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 634 {
 635     size_t len = 0;
 636
 637     while (*psz && ((!buf) || (len < n)))
 638     {
 639         wchar_t cc = *psz++;
 640         if (cc < 0x80 && utf7encode[cc] < 1)
 641         {
 642             // plain ASCII char
 643             if (buf)
 644                 *buf++ = (char)cc;
 645
 646             len++;
 647         }
 648 #ifndef WC_UTF16
 649         else if (((wxUint32)cc) > 0xffff)
 650         {
 651             // no surrogate pair generation (yet?)
 652             return wxCONV_FAILED;
 653         }
 654 #endif
 655         else
 656         {
 657             if (buf)
 658                 *buf++ = '+';
 659
 660             len++;
 661             if (cc != '+')
 662             {
 663                 // BASE64 encode string
 664                 unsigned int lsb, d, l;
 665                 for (d = 0, l = 0; /*nothing*/; psz++)
 666                 {
 667                     for (lsb = 0; lsb < 2; lsb ++)
 668                     {
 669                         d <<= 8;
 670                         d += lsb ? cc & 0xff : (cc & 0xff00) >> 8;
 671
 672                         for (l += 8; l >= 6; )
 673                         {
 674                             l -= 6;
 675                             if (buf)
 676                                 *buf++ = utf7enb64[(d >> l) % 64];
 677                             len++;
 678                         }
 679                     }
 680
 681                     cc = *psz;
 682                     if (!(cc) || (cc < 0x80 && utf7encode[cc] < 1))
 683                         break;
 684                 }
 685
 686                 if (l != 0)
 687                 {
 688                     if (buf)
 689                         *buf++ = utf7enb64[((d % 16) << (6 - l)) % 64];
 690
 691                     len++;
 692                 }
 693             }
 694
 695             if (buf)
 696                 *buf++ = '-';
 697             len++;
 698         }
 699     }
 700
 701     if (buf && (len < n))
 702         *buf = 0;
 703
 704     return len;
 705 }
 706
 707 // ----------------------------------------------------------------------------
 708 // UTF-8
 709 // ----------------------------------------------------------------------------
 710
 711 static const wxUint32 utf8_max[]=
 712     { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
 713
 714 // boundaries of the private use area we use to (temporarily) remap invalid
 715 // characters invalid in a UTF-8 encoded string
 716 const wxUint32 wxUnicodePUA = 0x100000;
 717 const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
 718
 719 // this table gives the length of the UTF-8 encoding from its first character:
 720 const unsigned char tableUtf8Lengths[256] = {
 721     // single-byte sequences (ASCII):
 722     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 00..0F
 723     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 10..1F
 724     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 20..2F
 725     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 30..3F
 726     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 40..4F
 727     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 50..5F
 728     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 60..6F
 729     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 70..7F
 730
 731     // these are invalid:
 732     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 80..8F
 733     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 90..9F
 734     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // A0..AF
 735     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // B0..BF
 736     0, 0,                                            // C0,C1
 737
 738     // two-byte sequences:
 739           2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // C2..CF
 740     2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // D0..DF
 741
 742     // three-byte sequences:
 743     3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,  // E0..EF
 744
 745     // four-byte sequences:
 746     4, 4, 4, 4, 4,                                   // F0..F4
 747
 748     // these are invalid again (5- or 6-byte
 749     // sequences and sequences for code points
 750     // above U+10FFFF, as restricted by RFC 3629):
 751                    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0   // F5..FF
 752 };
 753
 754 size_t
 755 wxMBConvStrictUTF8::ToWChar(wchar_t *dst, size_t dstLen,
 756                             const char *src, size_t srcLen) const
 757 {
 758     wchar_t *out = dstLen ? dst : NULL;
 759     size_t written = 0;
 760
 761     if ( srcLen == wxNO_LEN )
 762         srcLen = strlen(src) + 1;
 763
 764     for ( const char *p = src; ; p++ )
 765     {
 766         if ( !(srcLen == wxNO_LEN ? *p : srcLen) )
 767         {
 768             // all done successfully, just add the trailing NULL if we are not
 769             // using explicit length
 770             if ( srcLen == wxNO_LEN )
 771             {
 772                 if ( out )
 773                 {
 774                     if ( !dstLen )
 775                         break;
 776
 777                     *out = L'\0';
 778                 }
 779
 780                 written++;
 781             }
 782
 783             return written;
 784         }
 785
 786         if ( out && !dstLen-- )
 787             break;
 788
 789         wxUint32 code;
 790         unsigned char c = *p;
 791
 792         if ( c < 0x80 )
 793         {
 794             if ( srcLen == 0 ) // the test works for wxNO_LEN too
 795                 break;
 796
 797             if ( srcLen != wxNO_LEN )
 798                 srcLen--;
 799
 800             code = c;
 801         }
 802         else
 803         {
 804             unsigned len = tableUtf8Lengths[c];
 805             if ( !len )
 806                 break;
 807
 808             if ( srcLen < len ) // the test works for wxNO_LEN too
 809                 break;
 810
 811             if ( srcLen != wxNO_LEN )
 812                 srcLen -= len;
 813
 814             //   Char. number range   |        UTF-8 octet sequence
 815             //      (hexadecimal)     |              (binary)
 816             //  ----------------------+----------------------------------------
 817             //  0000 0000 - 0000 007F | 0xxxxxxx
 818             //  0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
 819             //  0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
 820             //  0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 821             //
 822             //  Code point value is stored in bits marked with 'x',
 823             //  lowest-order bit of the value on the right side in the diagram
 824             //  above.                                         (from RFC 3629)
 825
 826             // mask to extract lead byte's value ('x' bits above), by sequence
 827             // length:
 828             static const unsigned char leadValueMask[] = { 0x7F, 0x1F, 0x0F, 0x07 };
 829
 830             // mask and value of lead byte's most significant bits, by length:
 831             static const unsigned char leadMarkerMask[] = { 0x80, 0xE0, 0xF0, 0xF8 };
 832             static const unsigned char leadMarkerVal[] = { 0x00, 0xC0, 0xE0, 0xF0 };
 833
 834             len--; // it's more convenient to work with 0-based length here
 835
 836             // extract the lead byte's value bits:
 837             if ( (c & leadMarkerMask[len]) != leadMarkerVal[len] )
 838                 break;
 839
 840             code = c & leadValueMask[len];
 841
 842             // all remaining bytes, if any, are handled in the same way
 843             // regardless of sequence's length:
 844             for ( ; len; --len )
 845             {
 846                 c = *++p;
 847                 if ( (c & 0xC0) != 0x80 )
 848                     return wxCONV_FAILED;
 849
 850                 code <<= 6;
 851                 code |= c & 0x3F;
 852             }
 853         }
 854
 855 #ifdef WC_UTF16
 856         // cast is ok because wchar_t == wxUint16 if WC_UTF16
 857         if ( encode_utf16(code, (wxUint16 *)out) == 2 )
 858         {
 859             if ( out )
 860                 out++;
 861             written++;
 862         }
 863 #else // !WC_UTF16
 864         if ( out )
 865             *out = code;
 866 #endif // WC_UTF16/!WC_UTF16
 867
 868         if ( out )
 869             out++;
 870
 871         written++;
 872     }
 873
 874     return wxCONV_FAILED;
 875 }
 876
 877 size_t
 878 wxMBConvStrictUTF8::FromWChar(char *dst, size_t dstLen,
 879                               const wchar_t *src, size_t srcLen) const
 880 {
 881     char *out = dstLen ? dst : NULL;
 882     size_t written = 0;
 883
 884     for ( const wchar_t *wp = src; ; wp++ )
 885     {
 886         if ( !(srcLen == wxNO_LEN ? *wp : srcLen--) )
 887         {
 888             // all done successfully, just add the trailing NULL if we are not
 889             // using explicit length
 890             if ( srcLen == wxNO_LEN )
 891             {
 892                 if ( out )
 893                 {
 894                     if ( !dstLen )
 895                         break;
 896
 897                     *out = '\0';
 898                 }
 899
 900                 written++;
 901             }
 902
 903             return written;
 904         }
 905
 906
 907         wxUint32 code;
 908 #ifdef WC_UTF16
 909         // cast is ok for WC_UTF16
 910         if ( decode_utf16((const wxUint16 *)wp, code) == 2 )
 911         {
 912             // skip the next char too as we decoded a surrogate
 913             wp++;
 914         }
 915 #else // wchar_t is UTF-32
 916         code = *wp & 0x7fffffff;
 917 #endif
 918
 919         unsigned len;
 920         if ( code <= 0x7F )
 921         {
 922             len = 1;
 923             if ( out )
 924             {
 925                 if ( dstLen < len )
 926                     break;
 927
 928                 out[0] = (char)code;
 929             }
 930         }
 931         else if ( code <= 0x07FF )
 932         {
 933             len = 2;
 934             if ( out )
 935             {
 936                 if ( dstLen < len )
 937                     break;
 938
 939                 // NB: this line takes 6 least significant bits, encodes them as
 940                 // 10xxxxxx and discards them so that the next byte can be encoded:
 941                 out[1] = 0x80 | (code & 0x3F);  code >>= 6;
 942                 out[0] = 0xC0 | code;
 943             }
 944         }
 945         else if ( code < 0xFFFF )
 946         {
 947             len = 3;
 948             if ( out )
 949             {
 950                 if ( dstLen < len )
 951                     break;
 952
 953                 out[2] = 0x80 | (code & 0x3F);  code >>= 6;
 954                 out[1] = 0x80 | (code & 0x3F);  code >>= 6;
 955                 out[0] = 0xE0 | code;
 956             }
 957         }
 958         else if ( code <= 0x10FFFF )
 959         {
 960             len = 4;
 961             if ( out )
 962             {
 963                 if ( dstLen < len )
 964                     break;
 965
 966                 out[3] = 0x80 | (code & 0x3F);  code >>= 6;
 967                 out[2] = 0x80 | (code & 0x3F);  code >>= 6;
 968                 out[1] = 0x80 | (code & 0x3F);  code >>= 6;
 969                 out[0] = 0xF0 | code;
 970             }
 971         }
 972         else
 973         {
 974             wxFAIL_MSG( _T("trying to encode undefined Unicode character") );
 975             break;
 976         }
 977
 978         if ( out )
 979         {
 980             out += len;
 981             dstLen -= len;
 982         }
 983
 984         written += len;
 985     }
 986
 987     // we only get here if an error occurs during decoding
 988     return wxCONV_FAILED;
 989 }
 990
 991 size_t wxMBConvUTF8::ToWChar(wchar_t *buf, size_t n,
 992                              const char *psz, size_t srcLen) const
 993 {
 994     if ( m_options == MAP_INVALID_UTF8_NOT )
 995         return wxMBConvStrictUTF8::ToWChar(buf, n, psz, srcLen);
 996
 997     size_t len = 0;
 998
 999     while ((srcLen == wxNO_LEN ? *psz : srcLen--) && ((!buf) || (len < n)))
1000     {
1001         const char *opsz = psz;
1002         bool invalid = false;
1003         unsigned char cc = *psz++, fc = cc;
1004         unsigned cnt;
1005         for (cnt = 0; fc & 0x80; cnt++)
1006             fc <<= 1;
1007
1008         if (!cnt)
1009         {
1010             // plain ASCII char
1011             if (buf)
1012                 *buf++ = cc;
1013             len++;
1014
1015             // escape the escape character for octal escapes
1016             if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
1017                     && cc == '\\' && (!buf || len < n))
1018             {
1019                 if (buf)
1020                     *buf++ = cc;
1021                 len++;
1022             }
1023         }
1024         else
1025         {
1026             cnt--;
1027             if (!cnt)
1028             {
1029                 // invalid UTF-8 sequence
1030                 invalid = true;
1031             }
1032             else
1033             {
1034                 unsigned ocnt = cnt - 1;
1035                 wxUint32 res = cc & (0x3f >> cnt);
1036                 while (cnt--)
1037                 {
1038                     cc = *psz;
1039                     if ((cc & 0xC0) != 0x80)
1040                     {
1041                         // invalid UTF-8 sequence
1042                         invalid = true;
1043                         break;
1044                     }
1045
1046                     psz++;
1047                     res = (res << 6) | (cc & 0x3f);
1048                 }
1049
1050                 if (invalid || res <= utf8_max[ocnt])
1051                 {
1052                     // illegal UTF-8 encoding
1053                     invalid = true;
1054                 }
1055                 else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
1056                         res >= wxUnicodePUA && res < wxUnicodePUAEnd)
1057                 {
1058                     // if one of our PUA characters turns up externally
1059                     // it must also be treated as an illegal sequence
1060                     // (a bit like you have to escape an escape character)
1061                     invalid = true;
1062                 }
1063                 else
1064                 {
1065 #ifdef WC_UTF16
1066                     // cast is ok because wchar_t == wxUint16 if WC_UTF16
1067                     size_t pa = encode_utf16(res, (wxUint16 *)buf);
1068                     if (pa == wxCONV_FAILED)
1069                     {
1070                         invalid = true;
1071                     }
1072                     else
1073                     {
1074                         if (buf)
1075                             buf += pa;
1076                         len += pa;
1077                     }
1078 #else // !WC_UTF16
1079                     if (buf)
1080                         *buf++ = (wchar_t)res;
1081                     len++;
1082 #endif // WC_UTF16/!WC_UTF16
1083                 }
1084             }
1085
1086             if (invalid)
1087             {
1088                 if (m_options & MAP_INVALID_UTF8_TO_PUA)
1089                 {
1090                     while (opsz < psz && (!buf || len < n))
1091                     {
1092 #ifdef WC_UTF16
1093                         // cast is ok because wchar_t == wxUuint16 if WC_UTF16
1094                         size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
1095                         wxASSERT(pa != wxCONV_FAILED);
1096                         if (buf)
1097                             buf += pa;
1098                         opsz++;
1099                         len += pa;
1100 #else
1101                         if (buf)
1102                             *buf++ = (wchar_t)(wxUnicodePUA + (unsigned char)*opsz);
1103                         opsz++;
1104                         len++;
1105 #endif
1106                     }
1107                 }
1108                 else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
1109                 {
1110                     while (opsz < psz && (!buf || len < n))
1111                     {
1112                         if ( buf && len + 3 < n )
1113                         {
1114                             unsigned char on = *opsz;
1115                             *buf++ = L'\\';
1116                             *buf++ = (wchar_t)( L'0' + on / 0100 );
1117                             *buf++ = (wchar_t)( L'0' + (on % 0100) / 010 );
1118                             *buf++ = (wchar_t)( L'0' + on % 010 );
1119                         }
1120
1121                         opsz++;
1122                         len += 4;
1123                     }
1124                 }
1125                 else // MAP_INVALID_UTF8_NOT
1126                 {
1127                     return wxCONV_FAILED;
1128                 }
1129             }
1130         }
1131     }
1132
1133     if (srcLen == wxNO_LEN && buf && (len < n))
1134         *buf = 0;
1135
1136     return len + 1;
1137 }
1138
// Return true if the given wide character is one of the digits '0' to '7'.
static inline bool isoctal(wchar_t wch)
{
    if ( wch < L'0' || wch > L'7' )
        return false;

    return true;
}
1143
1144 size_t wxMBConvUTF8::FromWChar(char *buf, size_t n,
1145                                const wchar_t *psz, size_t srcLen) const
1146 {
1147     if ( m_options == MAP_INVALID_UTF8_NOT )
1148         return wxMBConvStrictUTF8::FromWChar(buf, n, psz, srcLen);
1149
1150     size_t len = 0;
1151
1152     while ((srcLen == wxNO_LEN ? *psz : srcLen--) && ((!buf) || (len < n)))
1153     {
1154         wxUint32 cc;
1155
1156 #ifdef WC_UTF16
1157         // cast is ok for WC_UTF16
1158         size_t pa = decode_utf16((const wxUint16 *)psz, cc);
1159         psz += (pa == wxCONV_FAILED) ? 1 : pa;
1160 #else
1161         cc = (*psz++) & 0x7fffffff;
1162 #endif
1163
1164         if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
1165                 && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
1166         {
1167             if (buf)
1168                 *buf++ = (char)(cc - wxUnicodePUA);
1169             len++;
1170         }
1171         else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL)
1172                     && cc == L'\\' && psz[0] == L'\\' )
1173         {
1174             if (buf)
1175                 *buf++ = (char)cc;
1176             psz++;
1177             len++;
1178         }
1179         else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
1180                     cc == L'\\' &&
1181                         isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
1182         {
1183             if (buf)
1184             {
1185                 *buf++ = (char) ((psz[0] - L'0') * 0100 +
1186                                  (psz[1] - L'0') * 010 +
1187                                  (psz[2] - L'0'));
1188             }
1189
1190             psz += 3;
1191             len++;
1192         }
1193         else
1194         {
1195             unsigned cnt;
1196             for (cnt = 0; cc > utf8_max[cnt]; cnt++)
1197             {
1198             }
1199
1200             if (!cnt)
1201             {
1202                 // plain ASCII char
1203                 if (buf)
1204                     *buf++ = (char) cc;
1205                 len++;
1206             }
1207             else
1208             {
1209                 len += cnt + 1;
1210                 if (buf)
1211                 {
1212                     *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
1213                     while (cnt--)
1214                         *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
1215                 }
1216             }
1217         }
1218     }
1219
1220     if (srcLen == wxNO_LEN && buf && (len < n))
1221         *buf = 0;
1222
1223     return len + 1;
1224 }
1225
1226 // ============================================================================
1227 // UTF-16
1228 // ============================================================================
1229
// wxMBConvUTF16straight is defined as the UTF-16 converter class whose byte
// order matches WORDS_BIGENDIAN and wxMBConvUTF16swap as the other one, this
// allows writing the "straight" and "swap" implementations below only once
#ifdef WORDS_BIGENDIAN
    #define wxMBConvUTF16straight wxMBConvUTF16BE
    #define wxMBConvUTF16swap     wxMBConvUTF16LE
#else
    #define wxMBConvUTF16swap     wxMBConvUTF16BE
    #define wxMBConvUTF16straight wxMBConvUTF16LE
#endif
1237
1238 /* static */
1239 size_t wxMBConvUTF16Base::GetLength(const char *src, size_t srcLen)
1240 {
1241     if ( srcLen == wxNO_LEN )
1242     {
1243         // count the number of bytes in input, including the trailing NULs
1244         const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1245         for ( srcLen = 1; *inBuff++; srcLen++ )
1246             ;
1247
1248         srcLen *= BYTES_PER_CHAR;
1249     }
1250     else // we already have the length
1251     {
1252         // we can only convert an entire number of UTF-16 characters
1253         if ( srcLen % BYTES_PER_CHAR )
1254             return wxCONV_FAILED;
1255     }
1256
1257     return srcLen;
1258 }
1259
1260 // case when in-memory representation is UTF-16 too
1261 #ifdef WC_UTF16
1262
1263 // ----------------------------------------------------------------------------
1264 // conversions without endianness change
1265 // ----------------------------------------------------------------------------
1266
1267 size_t
1268 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1269                                const char *src, size_t srcLen) const
1270 {
1271     // set up the scene for using memcpy() (which is presumably more efficient
1272     // than copying the bytes one by one)
1273     srcLen = GetLength(src, srcLen);
1274     if ( srcLen == wxNO_LEN )
1275         return wxCONV_FAILED;
1276
1277     const size_t inLen = srcLen / BYTES_PER_CHAR;
1278     if ( dst )
1279     {
1280         if ( dstLen < inLen )
1281             return wxCONV_FAILED;
1282
1283         memcpy(dst, src, srcLen);
1284     }
1285
1286     return inLen;
1287 }
1288
1289 size_t
1290 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1291                                  const wchar_t *src, size_t srcLen) const
1292 {
1293     if ( srcLen == wxNO_LEN )
1294         srcLen = wxWcslen(src) + 1;
1295
1296     srcLen *= BYTES_PER_CHAR;
1297
1298     if ( dst )
1299     {
1300         if ( dstLen < srcLen )
1301             return wxCONV_FAILED;
1302
1303         memcpy(dst, src, srcLen);
1304     }
1305
1306     return srcLen;
1307 }
1308
1309 // ----------------------------------------------------------------------------
1310 // endian-reversing conversions
1311 // ----------------------------------------------------------------------------
1312
1313 size_t
1314 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1315                            const char *src, size_t srcLen) const
1316 {
1317     srcLen = GetLength(src, srcLen);
1318     if ( srcLen == wxNO_LEN )
1319         return wxCONV_FAILED;
1320
1321     srcLen /= BYTES_PER_CHAR;
1322
1323     if ( dst )
1324     {
1325         if ( dstLen < srcLen )
1326             return wxCONV_FAILED;
1327
1328         const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1329         for ( size_t n = 0; n < srcLen; n++, inBuff++ )
1330         {
1331             *dst++ = wxUINT16_SWAP_ALWAYS(*inBuff);
1332         }
1333     }
1334
1335     return srcLen;
1336 }
1337
1338 size_t
1339 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1340                              const wchar_t *src, size_t srcLen) const
1341 {
1342     if ( srcLen == wxNO_LEN )
1343         srcLen = wxWcslen(src) + 1;
1344
1345     srcLen *= BYTES_PER_CHAR;
1346
1347     if ( dst )
1348     {
1349         if ( dstLen < srcLen )
1350             return wxCONV_FAILED;
1351
1352         wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1353         for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
1354         {
1355             *outBuff++ = wxUINT16_SWAP_ALWAYS(*src);
1356         }
1357     }
1358
1359     return srcLen;
1360 }
1361
1362 #else // !WC_UTF16: wchar_t is UTF-32
1363
1364 // ----------------------------------------------------------------------------
1365 // conversions without endianness change
1366 // ----------------------------------------------------------------------------
1367
1368 size_t
1369 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1370                                const char *src, size_t srcLen) const
1371 {
1372     srcLen = GetLength(src, srcLen);
1373     if ( srcLen == wxNO_LEN )
1374         return wxCONV_FAILED;
1375
1376     const size_t inLen = srcLen / BYTES_PER_CHAR;
1377     if ( !dst )
1378     {
1379         // optimization: return maximal space which could be needed for this
1380         // string even if the real size could be smaller if the buffer contains
1381         // any surrogates
1382         return inLen;
1383     }
1384
1385     size_t outLen = 0;
1386     const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1387     for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1388     {
1389         const wxUint32 ch = wxDecodeSurrogate(&inBuff);
1390         if ( !inBuff )
1391             return wxCONV_FAILED;
1392
1393         if ( ++outLen > dstLen )
1394             return wxCONV_FAILED;
1395
1396         *dst++ = ch;
1397     }
1398
1399
1400     return outLen;
1401 }
1402
1403 size_t
1404 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1405                                  const wchar_t *src, size_t srcLen) const
1406 {
1407     if ( srcLen == wxNO_LEN )
1408         srcLen = wxWcslen(src) + 1;
1409
1410     size_t outLen = 0;
1411     wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1412     for ( size_t n = 0; n < srcLen; n++ )
1413     {
1414         wxUint16 cc[2];
1415         const size_t numChars = encode_utf16(*src++, cc);
1416         if ( numChars == wxCONV_FAILED )
1417             return wxCONV_FAILED;
1418
1419         outLen += numChars * BYTES_PER_CHAR;
1420         if ( outBuff )
1421         {
1422             if ( outLen > dstLen )
1423                 return wxCONV_FAILED;
1424
1425             *outBuff++ = cc[0];
1426             if ( numChars == 2 )
1427             {
1428                 // second character of a surrogate
1429                 *outBuff++ = cc[1];
1430             }
1431         }
1432     }
1433
1434     return outLen;
1435 }
1436
1437 // ----------------------------------------------------------------------------
1438 // endian-reversing conversions
1439 // ----------------------------------------------------------------------------
1440
1441 size_t
1442 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1443                            const char *src, size_t srcLen) const
1444 {
1445     srcLen = GetLength(src, srcLen);
1446     if ( srcLen == wxNO_LEN )
1447         return wxCONV_FAILED;
1448
1449     const size_t inLen = srcLen / BYTES_PER_CHAR;
1450     if ( !dst )
1451     {
1452         // optimization: return maximal space which could be needed for this
1453         // string even if the real size could be smaller if the buffer contains
1454         // any surrogates
1455         return inLen;
1456     }
1457
1458     size_t outLen = 0;
1459     const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1460     for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1461     {
1462         wxUint32 ch;
1463         wxUint16 tmp[2];
1464
1465         tmp[0] = wxUINT16_SWAP_ALWAYS(*inBuff);
1466         inBuff++;
1467         tmp[1] = wxUINT16_SWAP_ALWAYS(*inBuff);
1468
1469         const size_t numChars = decode_utf16(tmp, ch);
1470         if ( numChars == wxCONV_FAILED )
1471             return wxCONV_FAILED;
1472
1473         if ( numChars == 2 )
1474             inBuff++;
1475
1476         if ( ++outLen > dstLen )
1477             return wxCONV_FAILED;
1478
1479         *dst++ = ch;
1480     }
1481
1482
1483     return outLen;
1484 }
1485
1486 size_t
1487 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1488                              const wchar_t *src, size_t srcLen) const
1489 {
1490     if ( srcLen == wxNO_LEN )
1491         srcLen = wxWcslen(src) + 1;
1492
1493     size_t outLen = 0;
1494     wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1495     for ( const wchar_t *srcEnd = src + srcLen; src < srcEnd; src++ )
1496     {
1497         wxUint16 cc[2];
1498         const size_t numChars = encode_utf16(*src, cc);
1499         if ( numChars == wxCONV_FAILED )
1500             return wxCONV_FAILED;
1501
1502         outLen += numChars * BYTES_PER_CHAR;
1503         if ( outBuff )
1504         {
1505             if ( outLen > dstLen )
1506                 return wxCONV_FAILED;
1507
1508             *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[0]);
1509             if ( numChars == 2 )
1510             {
1511                 // second character of a surrogate
1512                 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[1]);
1513             }
1514         }
1515     }
1516
1517     return outLen;
1518 }
1519
1520 #endif // WC_UTF16/!WC_UTF16
1521
1522
1523 // ============================================================================
1524 // UTF-32
1525 // ============================================================================
1526
// wxMBConvUTF32straight is defined as the UTF-32 converter class whose byte
// order matches WORDS_BIGENDIAN and wxMBConvUTF32swap as the other one, this
// allows writing the "straight" and "swap" implementations below only once
#ifdef WORDS_BIGENDIAN
    #define wxMBConvUTF32straight  wxMBConvUTF32BE
    #define wxMBConvUTF32swap      wxMBConvUTF32LE
#else
    #define wxMBConvUTF32swap      wxMBConvUTF32BE
    #define wxMBConvUTF32straight  wxMBConvUTF32LE
#endif


// the global objects of the two UTF-32 converter classes
WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
1538
1539 /* static */
1540 size_t wxMBConvUTF32Base::GetLength(const char *src, size_t srcLen)
1541 {
1542     if ( srcLen == wxNO_LEN )
1543     {
1544         // count the number of bytes in input, including the trailing NULs
1545         const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1546         for ( srcLen = 1; *inBuff++; srcLen++ )
1547             ;
1548
1549         srcLen *= BYTES_PER_CHAR;
1550     }
1551     else // we already have the length
1552     {
1553         // we can only convert an entire number of UTF-32 characters
1554         if ( srcLen % BYTES_PER_CHAR )
1555             return wxCONV_FAILED;
1556     }
1557
1558     return srcLen;
1559 }
1560
1561 // case when in-memory representation is UTF-16
1562 #ifdef WC_UTF16
1563
1564 // ----------------------------------------------------------------------------
1565 // conversions without endianness change
1566 // ----------------------------------------------------------------------------
1567
1568 size_t
1569 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1570                                const char *src, size_t srcLen) const
1571 {
1572     srcLen = GetLength(src, srcLen);
1573     if ( srcLen == wxNO_LEN )
1574         return wxCONV_FAILED;
1575
1576     const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1577     const size_t inLen = srcLen / BYTES_PER_CHAR;
1578     size_t outLen = 0;
1579     for ( size_t n = 0; n < inLen; n++ )
1580     {
1581         wxUint16 cc[2];
1582         const size_t numChars = encode_utf16(*inBuff++, cc);
1583         if ( numChars == wxCONV_FAILED )
1584             return wxCONV_FAILED;
1585
1586         outLen += numChars;
1587         if ( dst )
1588         {
1589             if ( outLen > dstLen )
1590                 return wxCONV_FAILED;
1591
1592             *dst++ = cc[0];
1593             if ( numChars == 2 )
1594             {
1595                 // second character of a surrogate
1596                 *dst++ = cc[1];
1597             }
1598         }
1599     }
1600
1601     return outLen;
1602 }
1603
1604 size_t
1605 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1606                                  const wchar_t *src, size_t srcLen) const
1607 {
1608     if ( srcLen == wxNO_LEN )
1609         srcLen = wxWcslen(src) + 1;
1610
1611     if ( !dst )
1612     {
1613         // optimization: return maximal space which could be needed for this
1614         // string instead of the exact amount which could be less if there are
1615         // any surrogates in the input
1616         //
1617         // we consider that surrogates are rare enough to make it worthwhile to
1618         // avoid running the loop below at the cost of slightly extra memory
1619         // consumption
1620         return srcLen * BYTES_PER_CHAR;
1621     }
1622
1623     wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1624     size_t outLen = 0;
1625     for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1626     {
1627         const wxUint32 ch = wxDecodeSurrogate(&src);
1628         if ( !src )
1629             return wxCONV_FAILED;
1630
1631         outLen += BYTES_PER_CHAR;
1632
1633         if ( outLen > dstLen )
1634             return wxCONV_FAILED;
1635
1636         *outBuff++ = ch;
1637     }
1638
1639     return outLen;
1640 }
1641
1642 // ----------------------------------------------------------------------------
1643 // endian-reversing conversions
1644 // ----------------------------------------------------------------------------
1645
1646 size_t
1647 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1648                            const char *src, size_t srcLen) const
1649 {
1650     srcLen = GetLength(src, srcLen);
1651     if ( srcLen == wxNO_LEN )
1652         return wxCONV_FAILED;
1653
1654     const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1655     const size_t inLen = srcLen / BYTES_PER_CHAR;
1656     size_t outLen = 0;
1657     for ( size_t n = 0; n < inLen; n++, inBuff++ )
1658     {
1659         wxUint16 cc[2];
1660         const size_t numChars = encode_utf16(wxUINT32_SWAP_ALWAYS(*inBuff), cc);
1661         if ( numChars == wxCONV_FAILED )
1662             return wxCONV_FAILED;
1663
1664         outLen += numChars;
1665         if ( dst )
1666         {
1667             if ( outLen > dstLen )
1668                 return wxCONV_FAILED;
1669
1670             *dst++ = cc[0];
1671             if ( numChars == 2 )
1672             {
1673                 // second character of a surrogate
1674                 *dst++ = cc[1];
1675             }
1676         }
1677     }
1678
1679     return outLen;
1680 }
1681
1682 size_t
1683 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1684                              const wchar_t *src, size_t srcLen) const
1685 {
1686     if ( srcLen == wxNO_LEN )
1687         srcLen = wxWcslen(src) + 1;
1688
1689     if ( !dst )
1690     {
1691         // optimization: return maximal space which could be needed for this
1692         // string instead of the exact amount which could be less if there are
1693         // any surrogates in the input
1694         //
1695         // we consider that surrogates are rare enough to make it worthwhile to
1696         // avoid running the loop below at the cost of slightly extra memory
1697         // consumption
1698         return srcLen*BYTES_PER_CHAR;
1699     }
1700
1701     wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1702     size_t outLen = 0;
1703     for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1704     {
1705         const wxUint32 ch = wxDecodeSurrogate(&src);
1706         if ( !src )
1707             return wxCONV_FAILED;
1708
1709         outLen += BYTES_PER_CHAR;
1710
1711         if ( outLen > dstLen )
1712             return wxCONV_FAILED;
1713
1714         *outBuff++ = wxUINT32_SWAP_ALWAYS(ch);
1715     }
1716
1717     return outLen;
1718 }
1719
1720 #else // !WC_UTF16: wchar_t is UTF-32
1721
1722 // ----------------------------------------------------------------------------
1723 // conversions without endianness change
1724 // ----------------------------------------------------------------------------
1725
1726 size_t
1727 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1728                                const char *src, size_t srcLen) const
1729 {
1730     // use memcpy() as it should be much faster than hand-written loop
1731     srcLen = GetLength(src, srcLen);
1732     if ( srcLen == wxNO_LEN )
1733         return wxCONV_FAILED;
1734
1735     const size_t inLen = srcLen/BYTES_PER_CHAR;
1736     if ( dst )
1737     {
1738         if ( dstLen < inLen )
1739             return wxCONV_FAILED;
1740
1741         memcpy(dst, src, srcLen);
1742     }
1743
1744     return inLen;
1745 }
1746
1747 size_t
1748 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1749                                  const wchar_t *src, size_t srcLen) const
1750 {
1751     if ( srcLen == wxNO_LEN )
1752         srcLen = wxWcslen(src) + 1;
1753
1754     srcLen *= BYTES_PER_CHAR;
1755
1756     if ( dst )
1757     {
1758         if ( dstLen < srcLen )
1759             return wxCONV_FAILED;
1760
1761         memcpy(dst, src, srcLen);
1762     }
1763
1764     return srcLen;
1765 }
1766
1767 // ----------------------------------------------------------------------------
1768 // endian-reversing conversions
1769 // ----------------------------------------------------------------------------
1770
1771 size_t
1772 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1773                            const char *src, size_t srcLen) const
1774 {
1775     srcLen = GetLength(src, srcLen);
1776     if ( srcLen == wxNO_LEN )
1777         return wxCONV_FAILED;
1778
1779     srcLen /= BYTES_PER_CHAR;
1780
1781     if ( dst )
1782     {
1783         if ( dstLen < srcLen )
1784             return wxCONV_FAILED;
1785
1786         const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1787         for ( size_t n = 0; n < srcLen; n++, inBuff++ )
1788         {
1789             *dst++ = wxUINT32_SWAP_ALWAYS(*inBuff);
1790         }
1791     }
1792
1793     return srcLen;
1794 }
1795
1796 size_t
1797 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1798                              const wchar_t *src, size_t srcLen) const
1799 {
1800     if ( srcLen == wxNO_LEN )
1801         srcLen = wxWcslen(src) + 1;
1802
1803     srcLen *= BYTES_PER_CHAR;
1804
1805     if ( dst )
1806     {
1807         if ( dstLen < srcLen )
1808             return wxCONV_FAILED;
1809
1810         wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1811         for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
1812         {
1813             *outBuff++ = wxUINT32_SWAP_ALWAYS(*src);
1814         }
1815     }
1816
1817     return srcLen;
1818 }
1819
1820 #endif // WC_UTF16/!WC_UTF16
1821
1822
1823 // ============================================================================
1824 // The classes doing conversion using the iconv_xxx() functions
1825 // ============================================================================
1826
1827 #ifdef HAVE_ICONV
1828
1829 // VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
1830 //     E2BIG if output buffer is _exactly_ as big as needed. Such case is
1831 //     (unless there's yet another bug in glibc) the only case when iconv()
1832 //     returns with (size_t)-1 (which means error) and says there are 0 bytes
1833 //     left in the input buffer -- when _real_ error occurs,
1834 //     bytes-left-in-input buffer is non-zero. Hence, this alternative test for
1835 //     iconv() failure.
1836 //     [This bug does not appear in glibc 2.2.]
#if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
// for glibc 2.0 and 2.1 an E2BIG error with nothing left in the input buffer
// is not considered to be a failure
#define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
                                     (errno != E2BIG || bufLeft != 0))
#else
// otherwise only the return value matters, bufLeft is not used at all
#define ICONV_FAILED(cres, bufLeft)  (cres == (size_t)-1)
#endif

// cast the argument to a pointer to "ICONV_CONST char *"
#define ICONV_CHAR_CAST(x)  ((ICONV_CONST char **)(x))

// the value with which iconv_t handles are compared to check their validity
#define ICONV_T_INVALID ((iconv_t)-1)

// WC_BSWAP is the byte swapping macro for the values of the same size as
// wchar_t and WC_ENC is the wxFONTENCODING_XXX constant chosen for this size
#if SIZEOF_WCHAR_T == 4
    #define WC_BSWAP    wxUINT32_SWAP_ALWAYS
    #define WC_ENC      wxFONTENCODING_UTF32
#elif SIZEOF_WCHAR_T == 2
    #define WC_BSWAP    wxUINT16_SWAP_ALWAYS
    #define WC_ENC      wxFONTENCODING_UTF16
#else // sizeof(wchar_t) != 2 nor 4
    // does this ever happen?
    #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
#endif
1858
1859 // ----------------------------------------------------------------------------
1860 // wxMBConv_iconv: encapsulates an iconv character set
1861 // ----------------------------------------------------------------------------
1862
1863 class wxMBConv_iconv : public wxMBConv
1864 {
1865 public:
1866     wxMBConv_iconv(const char *name);
1867     virtual ~wxMBConv_iconv();
1868
1869     virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const;
1870     virtual size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const;
1871
1872     // classify this encoding as explained in wxMBConv::GetMBNulLen() comment
1873     virtual size_t GetMBNulLen() const;
1874
1875 #if wxUSE_UNICODE_UTF8
1876     virtual bool IsUTF8() const;
1877 #endif
1878
1879     virtual wxMBConv *Clone() const
1880     {
1881         wxMBConv_iconv *p = new wxMBConv_iconv(m_name.ToAscii());
1882         p->m_minMBCharWidth = m_minMBCharWidth;
1883         return p;
1884     }
1885
1886     bool IsOk() const
1887         { return (m2w != ICONV_T_INVALID) && (w2m != ICONV_T_INVALID); }
1888
1889 protected:
1890     // the iconv handlers used to translate from multibyte
1891     // to wide char and in the other direction
1892     iconv_t m2w,
1893             w2m;
1894
1895 #if wxUSE_THREADS
1896     // guards access to m2w and w2m objects
1897     wxMutex m_iconvMutex;
1898 #endif
1899
1900 private:
1901     // the name (for iconv_open()) of a wide char charset -- if none is
1902     // available on this machine, it will remain NULL
1903     static wxString ms_wcCharsetName;
1904
1905     // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
1906     // different endian-ness than the native one
1907     static bool ms_wcNeedsSwap;
1908
1909
1910     // name of the encoding handled by this conversion
1911     wxString m_name;
1912
1913     // cached result of GetMBNulLen(); set to 0 meaning "unknown"
1914     // initially
1915     size_t m_minMBCharWidth;
1916 };
1917
1918 // make the constructor available for unit testing
1919 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_iconv( const char* name )
1920 {
1921     wxMBConv_iconv* result = new wxMBConv_iconv( name );
1922     if ( !result->IsOk() )
1923     {
1924         delete result;
1925         return 0;
1926     }
1927
1928     return result;
1929 }
1930
1931 wxString wxMBConv_iconv::ms_wcCharsetName;
1932 bool wxMBConv_iconv::ms_wcNeedsSwap = false;
1933
1934 wxMBConv_iconv::wxMBConv_iconv(const char *name)
1935               : m_name(name)
1936 {
1937     m_minMBCharWidth = 0;
1938
1939     // check for charset that represents wchar_t:
1940     if ( ms_wcCharsetName.empty() )
1941     {
1942         wxLogTrace(TRACE_STRCONV, _T("Looking for wide char codeset:"));
1943
1944 #if wxUSE_FONTMAP
1945         const wxChar **names = wxFontMapperBase::GetAllEncodingNames(WC_ENC);
1946 #else // !wxUSE_FONTMAP
1947         static const wxChar *names_static[] =
1948         {
1949 #if SIZEOF_WCHAR_T == 4
1950             _T("UCS-4"),
1951 #elif SIZEOF_WCHAR_T = 2
1952             _T("UCS-2"),
1953 #endif
1954             NULL
1955         };
1956         const wxChar **names = names_static;
1957 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
1958
1959         for ( ; *names && ms_wcCharsetName.empty(); ++names )
1960         {
1961             const wxString nameCS(*names);
1962
1963             // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
1964             wxString nameXE(nameCS);
1965
1966 #ifdef WORDS_BIGENDIAN
1967                 nameXE += _T("BE");
1968 #else // little endian
1969                 nameXE += _T("LE");
1970 #endif
1971
1972             wxLogTrace(TRACE_STRCONV, _T("  trying charset \"%s\""),
1973                        nameXE.c_str());
1974
1975             m2w = iconv_open(nameXE.ToAscii(), name);
1976             if ( m2w == ICONV_T_INVALID )
1977             {
1978                 // try charset w/o bytesex info (e.g. "UCS4")
1979                 wxLogTrace(TRACE_STRCONV, _T("  trying charset \"%s\""),
1980                            nameCS.c_str());
1981                 m2w = iconv_open(nameCS.ToAscii(), name);
1982
1983                 // and check for bytesex ourselves:
1984                 if ( m2w != ICONV_T_INVALID )
1985                 {
1986                     char    buf[2], *bufPtr;
1987                     wchar_t wbuf[2], *wbufPtr;
1988                     size_t  insz, outsz;
1989                     size_t  res;
1990
1991                     buf[0] = 'A';
1992                     buf[1] = 0;
1993                     wbuf[0] = 0;
1994                     insz = 2;
1995                     outsz = SIZEOF_WCHAR_T * 2;
1996                     wbufPtr = wbuf;
1997                     bufPtr = buf;
1998
1999                     res = iconv(
2000                         m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
2001                         (char**)&wbufPtr, &outsz);
2002
2003                     if (ICONV_FAILED(res, insz))
2004                     {
2005                         wxLogLastError(wxT("iconv"));
2006                         wxLogError(_("Conversion to charset '%s' doesn't work."),
2007                                    nameCS.c_str());
2008                     }
2009                     else // ok, can convert to this encoding, remember it
2010                     {
2011                         ms_wcCharsetName = nameCS;
2012                         ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
2013                     }
2014                 }
2015             }
2016             else // use charset not requiring byte swapping
2017             {
2018                 ms_wcCharsetName = nameXE;
2019             }
2020         }
2021
2022         wxLogTrace(TRACE_STRCONV,
2023                    wxT("iconv wchar_t charset is \"%s\"%s"),
2024                    ms_wcCharsetName.empty() ? wxString("<none>")
2025                                             : ms_wcCharsetName,
2026                    ms_wcNeedsSwap ? _T(" (needs swap)")
2027                                   : _T(""));
2028     }
2029     else // we already have ms_wcCharsetName
2030     {
2031         m2w = iconv_open(ms_wcCharsetName.ToAscii(), name);
2032     }
2033
2034     if ( ms_wcCharsetName.empty() )
2035     {
2036         w2m = ICONV_T_INVALID;
2037     }
2038     else
2039     {
2040         w2m = iconv_open(name, ms_wcCharsetName.ToAscii());
2041         if ( w2m == ICONV_T_INVALID )
2042         {
2043             wxLogTrace(TRACE_STRCONV,
2044                        wxT("\"%s\" -> \"%s\" works but not the converse!?"),
2045                        ms_wcCharsetName.c_str(), name);
2046         }
2047     }
2048 }
2049
2050 wxMBConv_iconv::~wxMBConv_iconv()
2051 {
2052     if ( m2w != ICONV_T_INVALID )
2053         iconv_close(m2w);
2054     if ( w2m != ICONV_T_INVALID )
2055         iconv_close(w2m);
2056 }
2057
2058 size_t wxMBConv_iconv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
2059 {
2060     // find the string length: notice that must be done differently for
2061     // NUL-terminated strings and UTF-16/32 which are terminated with 2/4 NULs
2062     size_t inbuf;
2063     const size_t nulLen = GetMBNulLen();
2064     switch ( nulLen )
2065     {
2066         default:
2067             return wxCONV_FAILED;
2068
2069         case 1:
2070             inbuf = strlen(psz); // arguably more optimized than our version
2071             break;
2072
2073         case 2:
2074         case 4:
2075             // for UTF-16/32 not only we need to have 2/4 consecutive NULs but
2076             // they also have to start at character boundary and not span two
2077             // adjacent characters
2078             const char *p;
2079             for ( p = psz; NotAllNULs(p, nulLen); p += nulLen )
2080                 ;
2081             inbuf = p - psz;
2082             break;
2083     }
2084
2085 #if wxUSE_THREADS
2086     // NB: iconv() is MT-safe, but each thread must use its own iconv_t handle.
2087     //     Unfortunately there are a couple of global wxCSConv objects such as
2088     //     wxConvLocal that are used all over wx code, so we have to make sure
2089     //     the handle is used by at most one thread at the time. Otherwise
2090     //     only a few wx classes would be safe to use from non-main threads
2091     //     as MB<->WC conversion would fail "randomly".
2092     wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
2093 #endif // wxUSE_THREADS
2094
2095     size_t outbuf = n * SIZEOF_WCHAR_T;
2096     size_t res, cres;
2097     // VS: Use these instead of psz, buf because iconv() modifies its arguments:
2098     wchar_t *bufPtr = buf;
2099     const char *pszPtr = psz;
2100
2101     if (buf)
2102     {
2103         // have destination buffer, convert there
2104         cres = iconv(m2w,
2105                      ICONV_CHAR_CAST(&pszPtr), &inbuf,
2106                      (char**)&bufPtr, &outbuf);
2107         res = n - (outbuf / SIZEOF_WCHAR_T);
2108
2109         if (ms_wcNeedsSwap)
2110         {
2111             // convert to native endianness
2112             for ( unsigned i = 0; i < res; i++ )
2113                 buf[n] = WC_BSWAP(buf[i]);
2114         }
2115
2116         // NUL-terminate the string if there is any space left
2117         if (res < n)
2118             buf[res] = 0;
2119     }
2120     else
2121     {
2122         // no destination buffer... convert using temp buffer
2123         // to calculate destination buffer requirement
2124         wchar_t tbuf[8];
2125         res = 0;
2126
2127         do
2128         {
2129             bufPtr = tbuf;
2130             outbuf = 8 * SIZEOF_WCHAR_T;
2131
2132             cres = iconv(m2w,
2133                          ICONV_CHAR_CAST(&pszPtr), &inbuf,
2134                          (char**)&bufPtr, &outbuf );
2135
2136             res += 8 - (outbuf / SIZEOF_WCHAR_T);
2137         }
2138         while ((cres == (size_t)-1) && (errno == E2BIG));
2139     }
2140
2141     if (ICONV_FAILED(cres, inbuf))
2142     {
2143         //VS: it is ok if iconv fails, hence trace only
2144         wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
2145         return wxCONV_FAILED;
2146     }
2147
2148     return res;
2149 }
2150
2151 size_t wxMBConv_iconv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
2152 {
2153 #if wxUSE_THREADS
2154     // NB: explained in MB2WC
2155     wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
2156 #endif
2157
2158     size_t inlen = wxWcslen(psz);
2159     size_t inbuf = inlen * SIZEOF_WCHAR_T;
2160     size_t outbuf = n;
2161     size_t res, cres;
2162
2163     wchar_t *tmpbuf = 0;
2164
2165     if (ms_wcNeedsSwap)
2166     {
2167         // need to copy to temp buffer to switch endianness
2168         // (doing WC_BSWAP twice on the original buffer won't help, as it
2169         //  could be in read-only memory, or be accessed in some other thread)
2170         tmpbuf = (wchar_t *)malloc(inbuf + SIZEOF_WCHAR_T);
2171         for ( size_t i = 0; i < inlen; i++ )
2172             tmpbuf[n] = WC_BSWAP(psz[i]);
2173
2174         tmpbuf[inlen] = L'\0';
2175         psz = tmpbuf;
2176     }
2177
2178     if (buf)
2179     {
2180         // have destination buffer, convert there
2181         cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
2182
2183         res = n - outbuf;
2184
2185         // NB: iconv was given only wcslen(psz) characters on input, and so
2186         //     it couldn't convert the trailing zero. Let's do it ourselves
2187         //     if there's some room left for it in the output buffer.
2188         if (res < n)
2189             buf[0] = 0;
2190     }
2191     else
2192     {
2193         // no destination buffer: convert using temp buffer
2194         // to calculate destination buffer requirement
2195         char tbuf[16];
2196         res = 0;
2197         do
2198         {
2199             buf = tbuf;
2200             outbuf = 16;
2201
2202             cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
2203
2204             res += 16 - outbuf;
2205         }
2206         while ((cres == (size_t)-1) && (errno == E2BIG));
2207     }
2208
2209     if (ms_wcNeedsSwap)
2210     {
2211         free(tmpbuf);
2212     }
2213
2214     if (ICONV_FAILED(cres, inbuf))
2215     {
2216         wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
2217         return wxCONV_FAILED;
2218     }
2219
2220     return res;
2221 }
2222
2223 size_t wxMBConv_iconv::GetMBNulLen() const
2224 {
2225     if ( m_minMBCharWidth == 0 )
2226     {
2227         wxMBConv_iconv * const self = wxConstCast(this, wxMBConv_iconv);
2228
2229 #if wxUSE_THREADS
2230         // NB: explained in MB2WC
2231         wxMutexLocker lock(self->m_iconvMutex);
2232 #endif
2233
2234         const wchar_t *wnul = L"";
2235         char buf[8]; // should be enough for NUL in any encoding
2236         size_t inLen = sizeof(wchar_t),
2237                outLen = WXSIZEOF(buf);
2238         char *inBuff = (char *)wnul;
2239         char *outBuff = buf;
2240         if ( iconv(w2m, ICONV_CHAR_CAST(&inBuff), &inLen, &outBuff, &outLen) == (size_t)-1 )
2241         {
2242             self->m_minMBCharWidth = (size_t)-1;
2243         }
2244         else // ok
2245         {
2246             self->m_minMBCharWidth = outBuff - buf;
2247         }
2248     }
2249
2250     return m_minMBCharWidth;
2251 }
2252
2253 #if wxUSE_UNICODE_UTF8
2254 bool wxMBConv_iconv::IsUTF8() const
2255 {
2256     return wxStricmp(m_name, "UTF-8") == 0 ||
2257            wxStricmp(m_name, "UTF8") == 0;
2258 }
2259 #endif
2260
2261 #endif // HAVE_ICONV
2262
2263
2264 // ============================================================================
2265 // Win32 conversion classes
2266 // ============================================================================
2267
2268 #ifdef wxHAVE_WIN32_MB2WC
2269
2270 // from utils.cpp
2271 #if wxUSE_FONTMAP
2272 extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const char *charset);
2273 extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
2274 #endif
2275
2276 class wxMBConv_win32 : public wxMBConv
2277 {
2278 public:
2279     wxMBConv_win32()
2280     {
2281         m_CodePage = CP_ACP;
2282         m_minMBCharWidth = 0;
2283     }
2284
2285     wxMBConv_win32(const wxMBConv_win32& conv)
2286         : wxMBConv()
2287     {
2288         m_CodePage = conv.m_CodePage;
2289         m_minMBCharWidth = conv.m_minMBCharWidth;
2290     }
2291
2292 #if wxUSE_FONTMAP
2293     wxMBConv_win32(const char* name)
2294     {
2295         m_CodePage = wxCharsetToCodepage(name);
2296         m_minMBCharWidth = 0;
2297     }
2298
2299     wxMBConv_win32(wxFontEncoding encoding)
2300     {
2301         m_CodePage = wxEncodingToCodepage(encoding);
2302         m_minMBCharWidth = 0;
2303     }
2304 #endif // wxUSE_FONTMAP
2305
2306     virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2307     {
2308         // note that we have to use MB_ERR_INVALID_CHARS flag as it without it
2309         // the behaviour is not compatible with the Unix version (using iconv)
2310         // and break the library itself, e.g. wxTextInputStream::NextChar()
2311         // wouldn't work if reading an incomplete MB char didn't result in an
2312         // error
2313         //
2314         // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
2315         // Win XP or newer and it is not supported for UTF-[78] so we always
2316         // use our own conversions in this case. See
2317         //     http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
2318         //     http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
2319         if ( m_CodePage == CP_UTF8 )
2320         {
2321             return wxMBConvUTF8().MB2WC(buf, psz, n);
2322         }
2323
2324         if ( m_CodePage == CP_UTF7 )
2325         {
2326             return wxMBConvUTF7().MB2WC(buf, psz, n);
2327         }
2328
2329         int flags = 0;
2330         if ( (m_CodePage < 50000 && m_CodePage != CP_SYMBOL) &&
2331                 IsAtLeastWin2kSP4() )
2332         {
2333             flags = MB_ERR_INVALID_CHARS;
2334         }
2335
2336         const size_t len = ::MultiByteToWideChar
2337                              (
2338                                 m_CodePage,     // code page
2339                                 flags,          // flags: fall on error
2340                                 psz,            // input string
2341                                 -1,             // its length (NUL-terminated)
2342                                 buf,            // output string
2343                                 buf ? n : 0     // size of output buffer
2344                              );
2345         if ( !len )
2346         {
2347             // function totally failed
2348             return wxCONV_FAILED;
2349         }
2350
2351         // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
2352         // check if we succeeded, by doing a double trip:
2353         if ( !flags && buf )
2354         {
2355             const size_t mbLen = strlen(psz);
2356             wxCharBuffer mbBuf(mbLen);
2357             if ( ::WideCharToMultiByte
2358                    (
2359                       m_CodePage,
2360                       0,
2361                       buf,
2362                       -1,
2363                       mbBuf.data(),
2364                       mbLen + 1,        // size in bytes, not length
2365                       NULL,
2366                       NULL
2367                    ) == 0 ||
2368                   strcmp(mbBuf, psz) != 0 )
2369             {
2370                 // we didn't obtain the same thing we started from, hence
2371                 // the conversion was lossy and we consider that it failed
2372                 return wxCONV_FAILED;
2373             }
2374         }
2375
2376         // note that it returns count of written chars for buf != NULL and size
2377         // of the needed buffer for buf == NULL so in either case the length of
2378         // the string (which never includes the terminating NUL) is one less
2379         return len - 1;
2380     }
2381
2382     virtual size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
2383     {
2384         /*
2385             we have a problem here: by default, WideCharToMultiByte() may
2386             replace characters unrepresentable in the target code page with bad
2387             quality approximations such as turning "1/2" symbol (U+00BD) into
2388             "1" for the code pages which don't have it and we, obviously, want
2389             to avoid this at any price
2390
2391             the trouble is that this function does it _silently_, i.e. it won't
2392             even tell us whether it did or not... Win98/2000 and higher provide
2393             WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
2394             we have to resort to a round trip, i.e. check that converting back
2395             results in the same string -- this is, of course, expensive but
2396             otherwise we simply can't be sure to not garble the data.
2397          */
2398
2399         // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
2400         // it doesn't work with CJK encodings (which we test for rather roughly
2401         // here...) nor with UTF-7/8 nor, of course, with Windows versions not
2402         // supporting it
2403         BOOL usedDef wxDUMMY_INITIALIZE(false);
2404         BOOL *pUsedDef;
2405         int flags;
2406         if ( CanUseNoBestFit() && m_CodePage < 50000 )
2407         {
2408             // it's our lucky day
2409             flags = WC_NO_BEST_FIT_CHARS;
2410             pUsedDef = &usedDef;
2411         }
2412         else // old system or unsupported encoding
2413         {
2414             flags = 0;
2415             pUsedDef = NULL;
2416         }
2417
2418         const size_t len = ::WideCharToMultiByte
2419                              (
2420                                 m_CodePage,     // code page
2421                                 flags,          // either none or no best fit
2422                                 pwz,            // input string
2423                                 -1,             // it is (wide) NUL-terminated
2424                                 buf,            // output buffer
2425                                 buf ? n : 0,    // and its size
2426                                 NULL,           // default "replacement" char
2427                                 pUsedDef        // [out] was it used?
2428                              );
2429
2430         if ( !len )
2431         {
2432             // function totally failed
2433             return wxCONV_FAILED;
2434         }
2435
2436         // we did something, check if we really succeeded
2437         if ( flags )
2438         {
2439             // check if the conversion failed, i.e. if any replacements
2440             // were done
2441             if ( usedDef )
2442                 return wxCONV_FAILED;
2443         }
2444         else // we must resort to double tripping...
2445         {
2446             // first we need to ensure that we really have the MB data: this is
2447             // not the case if we're called with NULL buffer, in which case we
2448             // need to do the conversion yet again
2449             wxCharBuffer bufDef;
2450             if ( !buf )
2451             {
2452                 bufDef = wxCharBuffer(len);
2453                 buf = bufDef.data();
2454                 if ( !::WideCharToMultiByte(m_CodePage, flags, pwz, -1,
2455                                             buf, len, NULL, NULL) )
2456                     return wxCONV_FAILED;
2457             }
2458
2459             wxWCharBuffer wcBuf(n);
2460             if ( MB2WC(wcBuf.data(), buf, n) == wxCONV_FAILED ||
2461                     wcscmp(wcBuf, pwz) != 0 )
2462             {
2463                 // we didn't obtain the same thing we started from, hence
2464                 // the conversion was lossy and we consider that it failed
2465                 return wxCONV_FAILED;
2466             }
2467         }
2468
2469         // see the comment above for the reason of "len - 1"
2470         return len - 1;
2471     }
2472
2473     virtual size_t GetMBNulLen() const
2474     {
2475         if ( m_minMBCharWidth == 0 )
2476         {
2477             int len = ::WideCharToMultiByte
2478                         (
2479                             m_CodePage,     // code page
2480                             0,              // no flags
2481                             L"",            // input string
2482                             1,              // translate just the NUL
2483                             NULL,           // output buffer
2484                             0,              // and its size
2485                             NULL,           // no replacement char
2486                             NULL            // [out] don't care if it was used
2487                         );
2488
2489             wxMBConv_win32 * const self = wxConstCast(this, wxMBConv_win32);
2490             switch ( len )
2491             {
2492                 default:
2493                     wxLogDebug(_T("Unexpected NUL length %d"), len);
2494                     self->m_minMBCharWidth = (size_t)-1;
2495                     break;
2496
2497                 case 0:
2498                     self->m_minMBCharWidth = (size_t)-1;
2499                     break;
2500
2501                 case 1:
2502                 case 2:
2503                 case 4:
2504                     self->m_minMBCharWidth = len;
2505                     break;
2506             }
2507         }
2508
2509         return m_minMBCharWidth;
2510     }
2511
2512     virtual wxMBConv *Clone() const { return new wxMBConv_win32(*this); }
2513
2514     bool IsOk() const { return m_CodePage != -1; }
2515
2516 private:
2517     static bool CanUseNoBestFit()
2518     {
2519         static int s_isWin98Or2k = -1;
2520
2521         if ( s_isWin98Or2k == -1 )
2522         {
2523             int verMaj, verMin;
2524             switch ( wxGetOsVersion(&verMaj, &verMin) )
2525             {
2526                 case wxOS_WINDOWS_9X:
2527                     s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
2528                     break;
2529
2530                 case wxOS_WINDOWS_NT:
2531                     s_isWin98Or2k = verMaj >= 5;
2532                     break;
2533
2534                 default:
2535                     // unknown: be conservative by default
2536                     s_isWin98Or2k = 0;
2537                     break;
2538             }
2539
2540             wxASSERT_MSG( s_isWin98Or2k != -1, _T("should be set above") );
2541         }
2542
2543         return s_isWin98Or2k == 1;
2544     }
2545
2546     static bool IsAtLeastWin2kSP4()
2547     {
2548 #ifdef __WXWINCE__
2549         return false;
2550 #else
2551         static int s_isAtLeastWin2kSP4 = -1;
2552
2553         if ( s_isAtLeastWin2kSP4 == -1 )
2554         {
2555             OSVERSIONINFOEX ver;
2556
2557             memset(&ver, 0, sizeof(ver));
2558             ver.dwOSVersionInfoSize = sizeof(ver);
2559             GetVersionEx((OSVERSIONINFO*)&ver);
2560
2561             s_isAtLeastWin2kSP4 =
2562               ((ver.dwMajorVersion > 5) || // Vista+
2563                (ver.dwMajorVersion == 5 && ver.dwMinorVersion > 0) || // XP/2003
2564                (ver.dwMajorVersion == 5 && ver.dwMinorVersion == 0 &&
2565                ver.wServicePackMajor >= 4)) // 2000 SP4+
2566               ? 1 : 0;
2567         }
2568
2569         return s_isAtLeastWin2kSP4 == 1;
2570 #endif
2571     }
2572
2573
2574     // the code page we're working with
2575     long m_CodePage;
2576
2577     // cached result of GetMBNulLen(), set to 0 initially meaning
2578     // "unknown"
2579     size_t m_minMBCharWidth;
2580 };
2581
2582 #endif // wxHAVE_WIN32_MB2WC
2583
2584
2585 // ============================================================================
2586 // wxEncodingConverter based conversion classes
2587 // ============================================================================
2588
2589 #if wxUSE_FONTMAP
2590
2591 class wxMBConv_wxwin : public wxMBConv
2592 {
2593 private:
2594     void Init()
2595     {
2596         // Refuse to use broken wxEncodingConverter code for Mac-specific encodings.
2597         // The wxMBConv_cf class does a better job.
2598         m_ok = (m_enc < wxFONTENCODING_MACMIN || m_enc > wxFONTENCODING_MACMAX) &&
2599                m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
2600                w2m.Init(wxFONTENCODING_UNICODE, m_enc);
2601     }
2602
2603 public:
2604     // temporarily just use wxEncodingConverter stuff,
2605     // so that it works while a better implementation is built
2606     wxMBConv_wxwin(const char* name)
2607     {
2608         if (name)
2609             m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
2610         else
2611             m_enc = wxFONTENCODING_SYSTEM;
2612
2613         Init();
2614     }
2615
2616     wxMBConv_wxwin(wxFontEncoding enc)
2617     {
2618         m_enc = enc;
2619
2620         Init();
2621     }
2622
2623     size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
2624     {
2625         size_t inbuf = strlen(psz);
2626         if (buf)
2627         {
2628             if (!m2w.Convert(psz, buf))
2629                 return wxCONV_FAILED;
2630         }
2631         return inbuf;
2632     }
2633
2634     size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
2635     {
2636         const size_t inbuf = wxWcslen(psz);
2637         if (buf)
2638         {
2639             if (!w2m.Convert(psz, buf))
2640                 return wxCONV_FAILED;
2641         }
2642
2643         return inbuf;
2644     }
2645
2646     virtual size_t GetMBNulLen() const
2647     {
2648         switch ( m_enc )
2649         {
2650             case wxFONTENCODING_UTF16BE:
2651             case wxFONTENCODING_UTF16LE:
2652                 return 2;
2653
2654             case wxFONTENCODING_UTF32BE:
2655             case wxFONTENCODING_UTF32LE:
2656                 return 4;
2657
2658             default:
2659                 return 1;
2660         }
2661     }
2662
2663     virtual wxMBConv *Clone() const { return new wxMBConv_wxwin(m_enc); }
2664
2665     bool IsOk() const { return m_ok; }
2666
2667 public:
2668     wxFontEncoding m_enc;
2669     wxEncodingConverter m2w, w2m;
2670
2671 private:
2672     // were we initialized successfully?
2673     bool m_ok;
2674
2675     DECLARE_NO_COPY_CLASS(wxMBConv_wxwin)
2676 };
2677
2678 // make the constructors available for unit testing
2679 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_wxwin( const char* name )
2680 {
2681     wxMBConv_wxwin* result = new wxMBConv_wxwin( name );
2682     if ( !result->IsOk() )
2683     {
2684         delete result;
2685         return 0;
2686     }
2687
2688     return result;
2689 }
2690
2691 #endif // wxUSE_FONTMAP
2692
2693 // ============================================================================
2694 // wxCSConv implementation
2695 // ============================================================================
2696
2697 void wxCSConv::Init()
2698 {
2699     m_name = NULL;
2700     m_convReal =  NULL;
2701     m_deferred = true;
2702 }
2703
2704 wxCSConv::wxCSConv(const wxString& charset)
2705 {
2706     Init();
2707
2708     if ( !charset.empty() )
2709     {
2710         SetName(charset.ToAscii());
2711     }
2712
2713 #if wxUSE_FONTMAP
2714     m_encoding = wxFontMapperBase::GetEncodingFromName(charset);
2715 #else
2716     m_encoding = wxFONTENCODING_SYSTEM;
2717 #endif
2718 }
2719
2720 wxCSConv::wxCSConv(wxFontEncoding encoding)
2721 {
2722     if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
2723     {
2724         wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
2725
2726         encoding = wxFONTENCODING_SYSTEM;
2727     }
2728
2729     Init();
2730
2731     m_encoding = encoding;
2732 }
2733
2734 wxCSConv::~wxCSConv()
2735 {
2736     Clear();
2737 }
2738
2739 wxCSConv::wxCSConv(const wxCSConv& conv)
2740         : wxMBConv()
2741 {
2742     Init();
2743
2744     SetName(conv.m_name);
2745     m_encoding = conv.m_encoding;
2746 }
2747
2748 wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
2749 {
2750     Clear();
2751
2752     SetName(conv.m_name);
2753     m_encoding = conv.m_encoding;
2754
2755     return *this;
2756 }
2757
2758 void wxCSConv::Clear()
2759 {
2760     free(m_name);
2761     delete m_convReal;
2762
2763     m_name = NULL;
2764     m_convReal = NULL;
2765 }
2766
2767 void wxCSConv::SetName(const char *charset)
2768 {
2769     if (charset)
2770     {
2771         m_name = wxStrdup(charset);
2772         m_deferred = true;
2773     }
2774 }
2775
2776 #if wxUSE_FONTMAP
2777
2778 WX_DECLARE_HASH_MAP( wxFontEncoding, wxString, wxIntegerHash, wxIntegerEqual,
2779                      wxEncodingNameCache );
2780
2781 static wxEncodingNameCache gs_nameCache;
2782 #endif
2783
2784 wxMBConv *wxCSConv::DoCreate() const
2785 {
2786 #if wxUSE_FONTMAP
2787     wxLogTrace(TRACE_STRCONV,
2788                wxT("creating conversion for %s"),
2789                (m_name ? m_name
2790                        : (const char*)wxFontMapperBase::GetEncodingName(m_encoding).mb_str()));
2791 #endif // wxUSE_FONTMAP
2792
2793     // check for the special case of ASCII or ISO8859-1 charset: as we have
2794     // special knowledge of it anyhow, we don't need to create a special
2795     // conversion object
2796     if ( m_encoding == wxFONTENCODING_ISO8859_1 ||
2797             m_encoding == wxFONTENCODING_DEFAULT )
2798     {
2799         // don't convert at all
2800         return NULL;
2801     }
2802
2803     // we trust OS to do conversion better than we can so try external
2804     // conversion methods first
2805     //
2806     // the full order is:
2807     //      1. OS conversion (iconv() under Unix or Win32 API)
2808     //      2. hard coded conversions for UTF
2809     //      3. wxEncodingConverter as fall back
2810
2811     // step (1)
2812 #ifdef HAVE_ICONV
2813 #if !wxUSE_FONTMAP
2814     if ( m_name )
2815 #endif // !wxUSE_FONTMAP
2816     {
2817 #if wxUSE_FONTMAP
2818         wxFontEncoding encoding(m_encoding);
2819 #endif
2820
2821         if ( m_name )
2822         {
2823             wxMBConv_iconv *conv = new wxMBConv_iconv(m_name);
2824             if ( conv->IsOk() )
2825                 return conv;
2826
2827             delete conv;
2828
2829 #if wxUSE_FONTMAP
2830             encoding =
2831                 wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
2832 #endif // wxUSE_FONTMAP
2833         }
2834 #if wxUSE_FONTMAP
2835         {
2836             const wxEncodingNameCache::iterator it = gs_nameCache.find(encoding);
2837             if ( it != gs_nameCache.end() )
2838             {
2839                 if ( it->second.empty() )
2840                     return NULL;
2841
2842                 wxMBConv_iconv *conv = new wxMBConv_iconv(it->second.ToAscii());
2843                 if ( conv->IsOk() )
2844                     return conv;
2845
2846                 delete conv;
2847             }
2848
2849             const wxChar** names = wxFontMapperBase::GetAllEncodingNames(encoding);
2850             // CS : in case this does not return valid names (eg for MacRoman)
2851             // encoding got a 'failure' entry in the cache all the same,
2852             // although it just has to be created using a different method, so
2853             // only store failed iconv creation attempts (or perhaps we
2854             // shoulnd't do this at all ?)
2855             if ( names[0] != NULL )
2856             {
2857                 for ( ; *names; ++names )
2858                 {
2859                     // FIXME-UTF8: wxFontMapperBase::GetAllEncodingNames()
2860                     //             will need changes that will obsolete this
2861                     wxString name(*names);
2862                     wxMBConv_iconv *conv = new wxMBConv_iconv(name.ToAscii());
2863                     if ( conv->IsOk() )
2864                     {
2865                         gs_nameCache[encoding] = *names;
2866                         return conv;
2867                     }
2868
2869                     delete conv;
2870                 }
2871
2872                 gs_nameCache[encoding] = _T(""); // cache the failure
2873             }
2874         }
2875 #endif // wxUSE_FONTMAP
2876     }
2877 #endif // HAVE_ICONV
2878
2879 #ifdef wxHAVE_WIN32_MB2WC
2880     {
2881 #if wxUSE_FONTMAP
2882         wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
2883                                       : new wxMBConv_win32(m_encoding);
2884         if ( conv->IsOk() )
2885             return conv;
2886
2887         delete conv;
2888 #else
2889         return NULL;
2890 #endif
2891     }
2892 #endif // wxHAVE_WIN32_MB2WC
2893
2894 #ifdef __DARWIN__
2895     {
2896         // leave UTF16 and UTF32 to the built-ins of wx
2897         if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
2898             ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
2899         {
2900 #if wxUSE_FONTMAP
2901             wxMBConv_cf *conv = m_name ? new wxMBConv_cf(m_name)
2902                                           : new wxMBConv_cf(m_encoding);
2903 #else
2904             wxMBConv_cf *conv = new wxMBConv_cf(m_encoding);
2905 #endif
2906
2907             if ( conv->IsOk() )
2908                  return conv;
2909
2910             delete conv;
2911         }
2912     }
2913 #endif // __DARWIN__
2914
2915     // step (2)
2916     wxFontEncoding enc = m_encoding;
2917 #if wxUSE_FONTMAP
2918     if ( enc == wxFONTENCODING_SYSTEM && m_name )
2919     {
2920         // use "false" to suppress interactive dialogs -- we can be called from
2921         // anywhere and popping up a dialog from here is the last thing we want to
2922         // do
2923         enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
2924     }
2925 #endif // wxUSE_FONTMAP
2926
2927     switch ( enc )
2928     {
2929         case wxFONTENCODING_UTF7:
2930              return new wxMBConvUTF7;
2931
2932         case wxFONTENCODING_UTF8:
2933              return new wxMBConvUTF8;
2934
2935         case wxFONTENCODING_UTF16BE:
2936              return new wxMBConvUTF16BE;
2937
2938         case wxFONTENCODING_UTF16LE:
2939              return new wxMBConvUTF16LE;
2940
2941         case wxFONTENCODING_UTF32BE:
2942              return new wxMBConvUTF32BE;
2943
2944         case wxFONTENCODING_UTF32LE:
2945              return new wxMBConvUTF32LE;
2946
2947         default:
2948              // nothing to do but put here to suppress gcc warnings
2949              break;
2950     }
2951
2952     // step (3)
2953 #if wxUSE_FONTMAP
2954     {
2955         wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
2956                                       : new wxMBConv_wxwin(m_encoding);
2957         if ( conv->IsOk() )
2958             return conv;
2959
2960         delete conv;
2961     }
2962 #endif // wxUSE_FONTMAP
2963
2964     // NB: This is a hack to prevent deadlock. What could otherwise happen
2965     //     in Unicode build: wxConvLocal creation ends up being here
2966     //     because of some failure and logs the error. But wxLog will try to
2967     //     attach a timestamp, for which it will need wxConvLocal (to convert
2968     //     time to char* and then wchar_t*), but that fails, tries to log the
2969     //     error, but wxLog has an (already locked) critical section that
2970     //     guards the static buffer.
2971     static bool alreadyLoggingError = false;
2972     if (!alreadyLoggingError)
2973     {
2974         alreadyLoggingError = true;
2975         wxLogError(_("Cannot convert from the charset '%s'!"),
2976                    m_name ? m_name
2977                       :
2978 #if wxUSE_FONTMAP
2979                          (const char*)wxFontMapperBase::GetEncodingDescription(m_encoding).ToAscii()
2980 #else // !wxUSE_FONTMAP
2981                          (const char*)wxString::Format(_("encoding %i"), m_encoding).ToAscii()
2982 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
2983               );
2984
2985         alreadyLoggingError = false;
2986     }
2987
2988     return NULL;
2989 }
2990
2991 void wxCSConv::CreateConvIfNeeded() const
2992 {
2993     if ( m_deferred )
2994     {
2995         wxCSConv *self = (wxCSConv *)this; // const_cast
2996
2997         // if we don't have neither the name nor the encoding, use the default
2998         // encoding for this system
2999         if ( !m_name && m_encoding == wxFONTENCODING_SYSTEM )
3000         {
3001 #if wxUSE_INTL
3002             self->m_encoding = wxLocale::GetSystemEncoding();
3003 #else
3004             // fallback to some reasonable default:
3005             self->m_encoding = wxFONTENCODING_ISO8859_1;
3006 #endif // wxUSE_INTL
3007         }
3008
3009         self->m_convReal = DoCreate();
3010         self->m_deferred = false;
3011     }
3012 }
3013
3014 bool wxCSConv::IsOk() const
3015 {
3016     CreateConvIfNeeded();
3017
3018     // special case: no convReal created for wxFONTENCODING_ISO8859_1
3019     if ( m_encoding == wxFONTENCODING_ISO8859_1 )
3020         return true; // always ok as we do it ourselves
3021
3022     // m_convReal->IsOk() is called at its own creation, so we know it must
3023     // be ok if m_convReal is non-NULL
3024     return m_convReal != NULL;
3025 }
3026
3027 size_t wxCSConv::ToWChar(wchar_t *dst, size_t dstLen,
3028                          const char *src, size_t srcLen) const
3029 {
3030     CreateConvIfNeeded();
3031
3032     if (m_convReal)
3033         return m_convReal->ToWChar(dst, dstLen, src, srcLen);
3034
3035     // latin-1 (direct)
3036     return wxMBConv::ToWChar(dst, dstLen, src, srcLen);
3037 }
3038
3039 size_t wxCSConv::FromWChar(char *dst, size_t dstLen,
3040                            const wchar_t *src, size_t srcLen) const
3041 {
3042     CreateConvIfNeeded();
3043
3044     if (m_convReal)
3045         return m_convReal->FromWChar(dst, dstLen, src, srcLen);
3046
3047     // latin-1 (direct)
3048     return wxMBConv::FromWChar(dst, dstLen, src, srcLen);
3049 }
3050
3051 size_t wxCSConv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
3052 {
3053     CreateConvIfNeeded();
3054
3055     if (m_convReal)
3056         return m_convReal->MB2WC(buf, psz, n);
3057
3058     // latin-1 (direct)
3059     size_t len = strlen(psz);
3060
3061     if (buf)
3062     {
3063         for (size_t c = 0; c <= len; c++)
3064             buf[c] = (unsigned char)(psz[c]);
3065     }
3066
3067     return len;
3068 }
3069
3070 size_t wxCSConv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
3071 {
3072     CreateConvIfNeeded();
3073
3074     if (m_convReal)
3075         return m_convReal->WC2MB(buf, psz, n);
3076
3077     // latin-1 (direct)
3078     const size_t len = wxWcslen(psz);
3079     if (buf)
3080     {
3081         for (size_t c = 0; c <= len; c++)
3082         {
3083             if (psz[c] > 0xFF)
3084                 return wxCONV_FAILED;
3085
3086             buf[c] = (char)psz[c];
3087         }
3088     }
3089     else
3090     {
3091         for (size_t c = 0; c <= len; c++)
3092         {
3093             if (psz[c] > 0xFF)
3094                 return wxCONV_FAILED;
3095         }
3096     }
3097
3098     return len;
3099 }
3100
3101 size_t wxCSConv::GetMBNulLen() const
3102 {
3103     CreateConvIfNeeded();
3104
3105     if ( m_convReal )
3106     {
3107         return m_convReal->GetMBNulLen();
3108     }
3109
3110     // otherwise, we are ISO-8859-1
3111     return 1;
3112 }
3113
3114 #if wxUSE_UNICODE_UTF8
3115 bool wxCSConv::IsUTF8() const
3116 {
3117     CreateConvIfNeeded();
3118
3119     if ( m_convReal )
3120     {
3121         return m_convReal->IsUTF8();
3122     }
3123
3124     // otherwise, we are ISO-8859-1
3125     return false;
3126 }
3127 #endif
3128
3129
3130 #if wxUSE_UNICODE
3131
3132 wxWCharBuffer wxSafeConvertMB2WX(const char *s)
3133 {
3134     if ( !s )
3135         return wxWCharBuffer();
3136
3137     wxWCharBuffer wbuf(wxConvLibc.cMB2WX(s));
3138     if ( !wbuf )
3139         wbuf = wxMBConvUTF8().cMB2WX(s);
3140     if ( !wbuf )
3141         wbuf = wxConvISO8859_1.cMB2WX(s);
3142
3143     return wbuf;
3144 }
3145
3146 wxCharBuffer wxSafeConvertWX2MB(const wchar_t *ws)
3147 {
3148     if ( !ws )
3149         return wxCharBuffer();
3150
3151     wxCharBuffer buf(wxConvLibc.cWX2MB(ws));
3152     if ( !buf )
3153         buf = wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL).cWX2MB(ws);
3154
3155     return buf;
3156 }
3157
3158 #endif // wxUSE_UNICODE
3159
3160 // ----------------------------------------------------------------------------
3161 // globals
3162 // ----------------------------------------------------------------------------
3163
3164 // NB: The reason why we create converter objects in this convoluted way,
3165 //     using a factory function instead of global variable, is that they
3166 //     may be used at static initialization time (some of them are used by
3167 //     wxString ctors and there may be a global wxString object). In other
3168 //     words, possibly _before_ the converter global object would be
3169 //     initialized.
3170
3171 #undef wxConvLibc
3172 #undef wxConvUTF8
3173 #undef wxConvUTF7
3174 #undef wxConvLocal
3175 #undef wxConvISO8859_1
3176
// Defines everything needed for the global converter called "name":
//  - name##Ptr, an exported pointer to klass which is only initialized to
//    NULL here
//  - wxGet_##name##Ptr(), an exported function returning the address of a
//    function-local static impl_klass object constructed with ctor_args, so
//    that the object is created by the first call to this function
//  - gs_##name##instance, a file-static pointer whose initializer calls
//    wxGet_##name##Ptr() during the static initialization of this file
// The expansion doesn't end with a semicolon, it must be provided when the
// macro is used.
3177 #define WX_DEFINE_GLOBAL_CONV2(klass, impl_klass, name, ctor_args)      \
3178     WXDLLIMPEXP_DATA_BASE(klass*) name##Ptr = NULL;                     \
3179     WXDLLIMPEXP_BASE klass* wxGet_##name##Ptr()                         \
3180     {                                                                   \
3181         static impl_klass name##Obj ctor_args;                          \
3182         return &name##Obj;                                              \
3183     }                                                                   \
3184     /* this ensures that all global converter objects are created */    \
3185     /* by the time static initialization is done, i.e. before any */    \
3186     /* thread is launched: */                                           \
3187     static klass* gs_##name##instance = wxGet_##name##Ptr()
3188
// Shorthand for WX_DEFINE_GLOBAL_CONV2() for the case when the class
// implementing the converter is the same as the class it is exposed as.
3189 #define WX_DEFINE_GLOBAL_CONV(klass, name, ctor_args) \
3190     WX_DEFINE_GLOBAL_CONV2(klass, klass, name, ctor_args)
3191
// wxConvLibc is exposed as a plain wxMBConv but implemented by wxMBConv_win32
// if __WINDOWS__ is defined and by wxMBConvLibc otherwise; in both cases no
// constructor arguments are passed to the implementation object.
3192 #ifdef __WINDOWS__
3193     WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConv_win32, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
3194 #else
3195     WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConvLibc, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
3196 #endif
3197
// wxConvUTF8 is a wxMBConvStrictUTF8 object and wxConvUTF7 a wxMBConvUTF7 one,
// neither of them takes any constructor arguments.
//
3198 // NB: we can't use wxEMPTY_PARAMETER_VALUE as final argument here because it's
3199 //     passed to WX_DEFINE_GLOBAL_CONV2 after a macro expansion and so still
3200 //     provokes an error message about "not enough macro parameters"; and we
3201 //     can't use "()" here as the name##Obj declaration would be parsed as a
3202 //     function declaration then, so use a semicolon and live with an extra
3203 //     empty statement (and hope that no compiler warns about this)
3204 WX_DEFINE_GLOBAL_CONV(wxMBConvStrictUTF8, wxConvUTF8, ;);
3205 WX_DEFINE_GLOBAL_CONV(wxMBConvUTF7, wxConvUTF7, ;);
3206
// Both of these are wxCSConv objects: wxConvLocal is constructed with
// wxFONTENCODING_SYSTEM and wxConvISO8859_1 with wxFONTENCODING_ISO8859_1.
3207 WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvLocal, (wxFONTENCODING_SYSTEM));
3208 WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvISO8859_1, (wxFONTENCODING_ISO8859_1));
3209
// Initial values of the exported converter pointers: wxConvCurrent starts as
// the wxConvLibc object and wxConvUI as the wxConvLocal one. The objects are
// obtained via their wxGet_xxxPtr() functions, which create them on first
// call.
3210 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = wxGet_wxConvLibcPtr();
3211 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvUI = wxGet_wxConvLocalPtr();
3212
3213 #ifdef __DARWIN__
3214 // The xnu kernel always communicates file paths in decomposed UTF-8.
3215 // WARNING: Are we sure that CFString's conversion will cause decomposition?
3216 static wxMBConv_cf wxConvMacUTF8DObj(wxFONTENCODING_UTF8);
3217 #endif
3218
// Converter used for the file names: it initially points to the file-static
// wxMBConv_cf object created for wxFONTENCODING_UTF8 if __DARWIN__ is defined
// and to the wxConvLibc object otherwise.
3219 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName =
3220 #ifdef __DARWIN__
3221                                     &wxConvMacUTF8DObj;
3222 #else // !__DARWIN__
3223                                     wxGet_wxConvLibcPtr();
3224 #endif // __DARWIN__/!__DARWIN__
3225
3226 #else // !wxUSE_WCHAR_T
3227
3228 // FIXME-UTF8: remove this, wxUSE_WCHAR_T is required now
3229 // stand-ins in absence of wchar_t
// In this configuration the four converters are plain exported wxMBConv
// objects defined directly, not objects created by factory functions.
3230 WXDLLIMPEXP_DATA_BASE(wxMBConv) wxConvLibc,
3231                                 wxConvISO8859_1,
3232                                 wxConvLocal,
3233                                 wxConvUTF8;
3234
3235 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T