src/common/strconv.cpp

   1 /////////////////////////////////////////////////////////////////////////////
   2 // Name:        src/common/strconv.cpp
   3 // Purpose:     Unicode conversion classes
   4 // Author:      Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
   5 //              Ryan Norton, Fredrik Roubert (UTF7)
   6 // Modified by:
   7 // Created:     29/01/98
   8 // RCS-ID:      $Id$
   9 // Copyright:   (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
  10 //              (c) 2000-2003 Vadim Zeitlin
  11 //              (c) 2004 Ryan Norton, Fredrik Roubert
  12 // Licence:     wxWindows licence
  13 /////////////////////////////////////////////////////////////////////////////
  14
  15 // For compilers that support precompilation, includes "wx.h".
  16 #include "wx/wxprec.h"
  17
  18 #ifdef __BORLANDC__
  19     #pragma hdrstop
  20 #endif  //__BORLANDC__
  21
  22 #ifndef WX_PRECOMP
  23     #include "wx/intl.h"
  24     #include "wx/log.h"
  25     #include "wx/utils.h"
  26     #include "wx/hashmap.h"
  27 #endif
  28
  29 #include "wx/strconv.h"
  30
  31 #if wxUSE_WCHAR_T
  32
  33 #ifndef __WXWINCE__
  34 #include <errno.h>
  35 #endif
  36
  37 #include <ctype.h>
  38 #include <string.h>
  39 #include <stdlib.h>
  40
  41 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
  42     #include "wx/msw/private.h"
  43     #include "wx/msw/missing.h"
  44     #define wxHAVE_WIN32_MB2WC
  45 #endif
  46
  47 #ifdef __SALFORDC__
  48     #include <clib.h>
  49 #endif
  50
  51 #ifdef HAVE_ICONV
  52     #include <iconv.h>
  53     #include "wx/thread.h"
  54 #endif
  55
  56 #include "wx/encconv.h"
  57 #include "wx/fontmap.h"
  58
  59 #ifdef __DARWIN__
  60 #include "wx/mac/corefoundation/private/strconv_cf.h"
  61 #endif //def __DARWIN__
  62
  63
  64 #define TRACE_STRCONV _T("strconv")
  65
  66 // WC_UTF16 is defined only if sizeof(wchar_t) == 2, otherwise it's supposed to
  67 // be 4 bytes
  68 #if SIZEOF_WCHAR_T == 2
  69     #define WC_UTF16
  70 #endif
  71
  72
  73 // ============================================================================
  74 // implementation
  75 // ============================================================================
  76
  77 // helper function of cMB2WC(): check if n bytes at this location are all NUL
  78 static bool NotAllNULs(const char *p, size_t n)
  79 {
  80     while ( n && *p++ == '\0' )
  81         n--;
  82
  83     return n != 0;
  84 }
  85
  86 // ----------------------------------------------------------------------------
  87 // UTF-16 en/decoding to/from UCS-4 with surrogates handling
  88 // ----------------------------------------------------------------------------
  89
  90 static size_t encode_utf16(wxUint32 input, wxUint16 *output)
  91 {
  92     if (input <= 0xffff)
  93     {
  94         if (output)
  95             *output = (wxUint16) input;
  96
  97         return 1;
  98     }
  99     else if (input >= 0x110000)
 100     {
 101         return wxCONV_FAILED;
 102     }
 103     else
 104     {
 105         if (output)
 106         {
 107             *output++ = (wxUint16) ((input >> 10) + 0xd7c0);
 108             *output = (wxUint16) ((input & 0x3ff) + 0xdc00);
 109         }
 110
 111         return 2;
 112     }
 113 }
 114
 115 static size_t decode_utf16(const wxUint16* input, wxUint32& output)
 116 {
 117     if ((*input < 0xd800) || (*input > 0xdfff))
 118     {
 119         output = *input;
 120         return 1;
 121     }
 122     else if ((input[1] < 0xdc00) || (input[1] > 0xdfff))
 123     {
 124         output = *input;
 125         return wxCONV_FAILED;
 126     }
 127     else
 128     {
 129         output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
 130         return 2;
 131     }
 132 }
 133
 134 #ifdef WC_UTF16
 135     typedef wchar_t wxDecodeSurrogate_t;
 136 #else // !WC_UTF16
 137     typedef wxUint16 wxDecodeSurrogate_t;
 138 #endif // WC_UTF16/!WC_UTF16
 139
 140 // returns the next UTF-32 character from the wchar_t buffer and advances the
 141 // pointer to the character after this one
 142 //
 143 // if an invalid character is found, *pSrc is set to NULL, the caller must
 144 // check for this
 145 static wxUint32 wxDecodeSurrogate(const wxDecodeSurrogate_t **pSrc)
 146 {
 147     wxUint32 out;
 148     const size_t
 149         n = decode_utf16(wx_reinterpret_cast(const wxUint16 *, *pSrc), out);
 150     if ( n == wxCONV_FAILED )
 151         *pSrc = NULL;
 152     else
 153         *pSrc += n;
 154
 155     return out;
 156 }
 157
 158 // ----------------------------------------------------------------------------
 159 // wxMBConv
 160 // ----------------------------------------------------------------------------
 161
// Default implementation of ToWChar() in terms of the old MB2WC().
//
// Converts the multibyte string src to wide characters. If srcLen is
// wxNO_LEN, src is assumed to be NUL-terminated and is converted as a single
// chunk; otherwise the srcLen bytes are converted chunk by chunk, the chunks
// being separated by NULs of GetMBNulLen() bytes each. If dst is NULL nothing
// is stored and only the length is computed.
//
// Returns the number of wide characters written (or which would be written),
// counting one L'\0' for each chunk, or wxCONV_FAILED if GetMBNulLen() or
// MB2WC() fails or if dstLen is too small.
size_t
wxMBConv::ToWChar(wchar_t *dst, size_t dstLen,
                  const char *src, size_t srcLen) const
{
    // although new conversion classes are supposed to implement this function
    // directly, the existing ones only implement the old MB2WC() and so, to
    // avoid having to rewrite all conversion classes at once, we provide a
    // default (but not efficient) implementation of this one in terms of the
    // old function by copying the input to ensure that it's NUL-terminated and
    // then using MB2WC() to convert it

    // the number of chars [which would be] written to dst [if it were not NULL]
    size_t dstWritten = 0;

    // the number of NULs terminating this string
    size_t nulLen = 0;  // not really needed, but just to avoid warnings

    // if we were not given the input size we just have to assume that the
    // string is properly terminated as we have no way of knowing how long it
    // is anyhow, but if we do have the size check whether there are enough
    // NULs at the end
    wxCharBuffer bufTmp;
    const char *srcEnd;
    if ( srcLen != wxNO_LEN )
    {
        // we need to know how to find the end of this string
        nulLen = GetMBNulLen();
        if ( nulLen == wxCONV_FAILED )
            return wxCONV_FAILED;

        // if there are enough NULs we can avoid the copy
        if ( srcLen < nulLen || NotAllNULs(src + srcLen - nulLen, nulLen) )
        {
            // make a copy in order to properly NUL-terminate the string
            bufTmp = wxCharBuffer(srcLen + nulLen - 1 /* 1 will be added */);
            char * const p = bufTmp.data();
            memcpy(p, src, srcLen);
            for ( char *s = p + srcLen; s < p + srcLen + nulLen; s++ )
                *s = '\0';

            // from now on we read from the terminated copy
            src = bufTmp;
        }

        srcEnd = src + srcLen;
    }
    else // quit after the first loop iteration
    {
        srcEnd = NULL;
    }

    for ( ;; )
    {
        // try to convert the current chunk
        size_t lenChunk = MB2WC(NULL, src, 0);
        if ( lenChunk == wxCONV_FAILED )
            return wxCONV_FAILED;

        lenChunk++; // for the L'\0' at the end of this chunk

        dstWritten += lenChunk;

        if ( lenChunk == 1 )
        {
            // nothing left in the input string, conversion succeeded
            //
            // NOTE(review): the terminator of this empty chunk has been
            // counted in dstWritten above but we leave the loop before the
            // "if ( dst )" block below, so nothing is stored in dst for it
            // here
            break;
        }

        if ( dst )
        {
            if ( dstWritten > dstLen )
                return wxCONV_FAILED;

            if ( MB2WC(dst, src, lenChunk) == wxCONV_FAILED )
                return wxCONV_FAILED;

            dst += lenChunk;
        }

        if ( !srcEnd )
        {
            // we convert just one chunk in this case as this is the entire
            // string anyhow
            break;
        }

        // advance the input pointer past the end of this chunk
        while ( NotAllNULs(src, nulLen) )
        {
            // notice that we must skip over multiple bytes here as we suppose
            // that if NUL takes 2 or 4 bytes, then all the other characters do
            // too and so if advanced by a single byte we might erroneously
            // detect sequences of NUL bytes in the middle of the input
            src += nulLen;
        }

        src += nulLen; // skipping over its terminator as well

        // note that ">=" (and not just "==") is needed here as the terminator
        // we skipped just above could be inside or just after the buffer
        // delimited by srcEnd
        if ( src >= srcEnd )
            break;
    }

    return dstWritten;
}
 268
 269 size_t
 270 wxMBConv::FromWChar(char *dst, size_t dstLen,
 271                     const wchar_t *src, size_t srcLen) const
 272 {
 273     // the number of chars [which would be] written to dst [if it were not NULL]
 274     size_t dstWritten = 0;
 275
 276     // make a copy of the input string unless it is already properly
 277     // NUL-terminated
 278     //
 279     // if we don't know its length we have no choice but to assume that it is,
 280     // indeed, properly terminated
 281     wxWCharBuffer bufTmp;
 282     if ( srcLen == wxNO_LEN )
 283     {
 284         srcLen = wxWcslen(src) + 1;
 285     }
 286     else if ( srcLen != 0 && src[srcLen - 1] != L'\0' )
 287     {
 288         // make a copy in order to properly NUL-terminate the string
 289         bufTmp = wxWCharBuffer(srcLen);
 290         memcpy(bufTmp.data(), src, srcLen * sizeof(wchar_t));
 291         src = bufTmp;
 292     }
 293
 294     const size_t lenNul = GetMBNulLen();
 295     for ( const wchar_t * const srcEnd = src + srcLen;
 296           src < srcEnd;
 297           src += wxWcslen(src) + 1 /* skip L'\0' too */ )
 298     {
 299         // try to convert the current chunk
 300         size_t lenChunk = WC2MB(NULL, src, 0);
 301
 302         if ( lenChunk == wxCONV_FAILED )
 303             return wxCONV_FAILED;
 304
 305         lenChunk += lenNul;
 306         dstWritten += lenChunk;
 307
 308         if ( dst )
 309         {
 310             if ( dstWritten > dstLen )
 311                 return wxCONV_FAILED;
 312
 313             if ( WC2MB(dst, src, lenChunk) == wxCONV_FAILED )
 314                 return wxCONV_FAILED;
 315
 316             dst += lenChunk;
 317         }
 318     }
 319
 320     return dstWritten;
 321 }
 322
 323 size_t wxMBConv::MB2WC(wchar_t *outBuff, const char *inBuff, size_t outLen) const
 324 {
 325     size_t rc = ToWChar(outBuff, outLen, inBuff);
 326     if ( rc != wxCONV_FAILED )
 327     {
 328         // ToWChar() returns the buffer length, i.e. including the trailing
 329         // NUL, while this method doesn't take it into account
 330         rc--;
 331     }
 332
 333     return rc;
 334 }
 335
 336 size_t wxMBConv::WC2MB(char *outBuff, const wchar_t *inBuff, size_t outLen) const
 337 {
 338     size_t rc = FromWChar(outBuff, outLen, inBuff);
 339     if ( rc != wxCONV_FAILED )
 340     {
 341         rc -= GetMBNulLen();
 342     }
 343
 344     return rc;
 345 }
 346
// Out-of-line destructor with an intentionally empty body.
wxMBConv::~wxMBConv()
{
    // nothing to do here (necessary for Darwin linking probably)
}
 351
 352 const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
 353 {
 354     if ( psz )
 355     {
 356         // calculate the length of the buffer needed first
 357         const size_t nLen = ToWChar(NULL, 0, psz);
 358         if ( nLen != wxCONV_FAILED )
 359         {
 360             // now do the actual conversion
 361             wxWCharBuffer buf(nLen - 1 /* +1 added implicitly */);
 362
 363             // +1 for the trailing NULL
 364             if ( ToWChar(buf.data(), nLen, psz) != wxCONV_FAILED )
 365                 return buf;
 366         }
 367     }
 368
 369     return wxWCharBuffer();
 370 }
 371
 372 const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
 373 {
 374     if ( pwz )
 375     {
 376         const size_t nLen = FromWChar(NULL, 0, pwz);
 377         if ( nLen != wxCONV_FAILED )
 378         {
 379             wxCharBuffer buf(nLen - 1);
 380             if ( FromWChar(buf.data(), nLen, pwz) != wxCONV_FAILED )
 381                 return buf;
 382         }
 383     }
 384
 385     return wxCharBuffer();
 386 }
 387
 388 const wxWCharBuffer
 389 wxMBConv::cMB2WC(const char *inBuff, size_t inLen, size_t *outLen) const
 390 {
 391     const size_t dstLen = ToWChar(NULL, 0, inBuff, inLen);
 392     if ( dstLen != wxCONV_FAILED )
 393     {
 394         wxWCharBuffer wbuf(dstLen - 1);
 395         if ( ToWChar(wbuf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
 396         {
 397             if ( outLen )
 398             {
 399                 *outLen = dstLen;
 400                 if ( wbuf[dstLen - 1] == L'\0' )
 401                     (*outLen)--;
 402             }
 403
 404             return wbuf;
 405         }
 406     }
 407
 408     if ( outLen )
 409         *outLen = 0;
 410
 411     return wxWCharBuffer();
 412 }
 413
 414 const wxCharBuffer
 415 wxMBConv::cWC2MB(const wchar_t *inBuff, size_t inLen, size_t *outLen) const
 416 {
 417     size_t dstLen = FromWChar(NULL, 0, inBuff, inLen);
 418     if ( dstLen != wxCONV_FAILED )
 419     {
 420         // special case of empty input: can't allocate 0 size buffer below as
 421         // wxCharBuffer insists on NUL-terminating it
 422         wxCharBuffer buf(dstLen ? dstLen - 1 : 1);
 423         if ( FromWChar(buf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
 424         {
 425             if ( outLen )
 426             {
 427                 *outLen = dstLen;
 428
 429                 const size_t nulLen = GetMBNulLen();
 430                 if ( dstLen >= nulLen &&
 431                         !NotAllNULs(buf.data() + dstLen - nulLen, nulLen) )
 432                 {
 433                     // in this case the output is NUL-terminated and we're not
 434                     // supposed to count NUL
 435                     *outLen -= nulLen;
 436                 }
 437             }
 438
 439             return buf;
 440         }
 441     }
 442
 443     if ( outLen )
 444         *outLen = 0;
 445
 446     return wxCharBuffer();
 447 }
 448
 449 // ----------------------------------------------------------------------------
 450 // wxMBConvLibc
 451 // ----------------------------------------------------------------------------
 452
// Multibyte to wide char conversion: simply forwards all its arguments to
// wxMB2WC() and returns its result unchanged.
size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
{
    return wxMB2WC(buf, psz, n);
}
 457
// Wide char to multibyte conversion: simply forwards all its arguments to
// wxWC2MB() and returns its result unchanged.
size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
{
    return wxWC2MB(buf, psz, n);
}
 462
 463 // ----------------------------------------------------------------------------
 464 // wxConvBrokenFileNames
 465 // ----------------------------------------------------------------------------
 466
 467 #ifdef __UNIX__
 468
 469 wxConvBrokenFileNames::wxConvBrokenFileNames(const wxString& charset)
 470 {
 471     if ( wxStricmp(charset, _T("UTF-8")) == 0 ||
 472          wxStricmp(charset, _T("UTF8")) == 0  )
 473         m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_PUA);
 474     else
 475         m_conv = new wxCSConv(charset);
 476 }
 477
 478 #endif // __UNIX__
 479
 480 // ----------------------------------------------------------------------------
 481 // UTF-7
 482 // ----------------------------------------------------------------------------
 483
 484 // Implementation (C) 2004 Fredrik Roubert
 485
//
// BASE64 decoding table
//
// Indexed by the value of an input byte (256 entries, 8 per row): gives the
// 6 bit value of this byte as a BASE64 digit ('A'-'Z' => 0x00-0x19, 'a'-'z'
// => 0x1a-0x33, '0'-'9' => 0x34-0x3d, '+' => 0x3e, '/' => 0x3f) or 0xff if
// the byte is not a BASE64 digit.
//
static const unsigned char utf7unb64[] =
{
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
    0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b,
    0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
    0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
    0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
    0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
    0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
    0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30,
    0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
};
 524
// Decodes the NUL-terminated UTF-7 string psz.
//
// If buf is not NULL, the decoded characters are stored in it and decoding
// stops when n of them have been produced (see the note about BASE64 runs
// below); a terminating NUL is appended only if fewer than n characters were
// stored. If buf is NULL, nothing is stored and only the length is computed.
//
// Returns the number of characters decoded, not counting any terminating
// NUL, or wxCONV_FAILED if a '+' is followed neither by '-' nor by enough
// BASE64 digits to make up at least one byte.
size_t wxMBConvUTF7::MB2WC(wchar_t *buf, const char *psz, size_t n) const
{
    size_t len = 0;

    while ( *psz && (!buf || (len < n)) )
    {
        unsigned char cc = *psz++;
        if (cc != '+')
        {
            // plain ASCII char (in fact any byte other than '+' is copied to
            // the output unchanged here)
            if (buf)
                *buf++ = cc;
            len++;
        }
        else if (*psz == '-')
        {
            // encoded plus sign
            if (buf)
                *buf++ = cc;
            len++;
            psz++;
        }
        else // start of BASE64 encoded string
        {
            // d accumulates the decoded bits and l is the number of bits in
            // it not consumed yet; lsb tells whether the next byte extracted
            // is the low (true) or high (false) byte of a 16 bit unit; ok is
            // set as soon as at least one byte could be extracted
            //
            // NOTE(review): n is only checked by the outer loop, so a BASE64
            // run can store more than n characters in buf -- confirm that
            // the callers size the buffer accordingly
            bool lsb, ok;
            unsigned int d, l;
            for ( ok = lsb = false, d = 0, l = 0;
                  (cc = utf7unb64[(unsigned char)*psz]) != 0xff;
                  psz++ )
            {
                // each BASE64 digit contributes 6 more bits
                d <<= 6;
                d += cc;
                for (l += 6; l >= 8; lsb = !lsb)
                {
                    // take the 8 oldest bits which were not consumed yet
                    unsigned char c = (unsigned char)((d >> (l -= 8)) % 256);
                    if (lsb)
                    {
                        // low byte: completes the current character
                        if (buf)
                            *buf++ |= c;
                        len ++;
                    }
                    else
                    {
                        // high byte: starts a new character
                        if (buf)
                            *buf = (wchar_t)(c << 8);
                    }

                    ok = true;
                }
            }

            if ( !ok )
            {
                // in valid UTF7 we should have valid characters after '+'
                return wxCONV_FAILED;
            }

            // the '-' terminating the BASE64 run is not part of the output
            if (*psz == '-')
                psz++;
        }
    }

    if ( buf && (len < n) )
        *buf = '\0';

    return len;
}
 592
//
// BASE64 encoding table
//
// Maps a 6 bit value (0..63) to the character used to represent it.
//
static const unsigned char utf7enb64[] =
{
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
    'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
    'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
    'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
    'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
    'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
    'w', 'x', 'y', 'z', '0', '1', '2', '3',
    '4', '5', '6', '7', '8', '9', '+', '/'
};
 607
//
// UTF-7 encoding table
//
// Indexed by the character code (0..127, 16 entries per row), gives the
// class of the character:
//
// 0 - Set D (directly encoded characters)
// 1 - Set O (optional direct characters)
// 2 - whitespace characters (optional)
// 3 - special characters
//
// The encoder in this file copies only the characters of class 0 directly
// to its output.
//
static const unsigned char utf7encode[128] =
{
    3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
};
 627
// Encodes the NUL-terminated wide string psz as UTF-7.
//
// If buf is not NULL, the output is stored in it and a new input character
// is only started while fewer than n bytes have been produced; a terminating
// NUL is appended only if fewer than n bytes were stored. If buf is NULL,
// nothing is stored and only the length is computed.
//
// Returns the number of bytes produced, not counting any terminating NUL,
// or, if WC_UTF16 is not defined, wxCONV_FAILED when a character greater
// than 0xffff is found where a new plain character or BASE64 run would start.
size_t wxMBConvUTF7::WC2MB(char *buf, const wchar_t *psz, size_t n) const
{
    size_t len = 0;

    while (*psz && ((!buf) || (len < n)))
    {
        wchar_t cc = *psz++;
        if (cc < 0x80 && utf7encode[cc] < 1)
        {
            // plain ASCII char
            if (buf)
                *buf++ = (char)cc;

            len++;
        }
#ifndef WC_UTF16
        else if (((wxUint32)cc) > 0xffff)
        {
            // no surrogate pair generation (yet?)
            return wxCONV_FAILED;
        }
#endif
        else
        {
            // everything else is written as "+<BASE64 digits>-", a literal
            // plus sign becoming just "+-"
            if (buf)
                *buf++ = '+';

            len++;
            if (cc != '+')
            {
                // BASE64 encode string
                //
                // d accumulates the bits to encode and l is the number of
                // bits in it which were not output yet
                unsigned int lsb, d, l;
                for (d = 0, l = 0; /*nothing*/; psz++)
                {
                    // feed the 16 bits of the character, high byte first
                    for (lsb = 0; lsb < 2; lsb ++)
                    {
                        d <<= 8;
                        d += lsb ? cc & 0xff : (cc & 0xff00) >> 8;

                        // output a digit for each complete group of 6 bits
                        for (l += 8; l >= 6; )
                        {
                            l -= 6;
                            if (buf)
                                *buf++ = utf7enb64[(d >> l) % 64];
                            len++;
                        }
                    }

                    // the run goes on until the end of the string or the
                    // next directly encoded character
                    cc = *psz;
                    if (!(cc) || (cc < 0x80 && utf7encode[cc] < 1))
                        break;
                }

                if (l != 0)
                {
                    // output the remaining bits, padded on the right with
                    // zeroes to make a full digit
                    if (buf)
                        *buf++ = utf7enb64[((d % 16) << (6 - l)) % 64];

                    len++;
                }
            }

            if (buf)
                *buf++ = '-';
            len++;
        }
    }

    if (buf && (len < n))
        *buf = 0;

    return len;
}
 701
 702 // ----------------------------------------------------------------------------
 703 // UTF-8
 704 // ----------------------------------------------------------------------------
 705
// NOTE(review): not used in the part of the file reviewed here; the values
// look like the greatest code points which can be encoded using 1, 2, ... 6
// bytes in the original (pre RFC 3629) UTF-8 scheme, followed by a catch-all
// maximum -- confirm against the code using this table
static wxUint32 utf8_max[]=
    { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };

// boundaries of the private use area we use to (temporarily) remap the
// characters which are invalid in a UTF-8 encoded string: the range has 256
// elements, presumably one for each possible byte value
const wxUint32 wxUnicodePUA = 0x100000;
const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
 713
// this table gives the length of the UTF-8 encoding from its first character:
// it is indexed by the value of the first (lead) byte of a sequence and
// contains the total length of the sequence in bytes or 0 if this byte can't
// start a valid sequence
//
// NOTE(review): this is declared neither static nor const although the code
// in the reviewed part of the file only reads it -- check whether other files
// refer to it before changing this
unsigned char tableUtf8Lengths[256] = {
    // single-byte sequences (ASCII):
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 00..0F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 10..1F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 20..2F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 30..3F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 40..4F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 50..5F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 60..6F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 70..7F

    // these are invalid (continuation bytes 80..BF can't start a sequence
    // and C0, C1 could only start overlong encodings):
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 80..8F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 90..9F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // A0..AF
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // B0..BF
    0, 0,                                            // C0,C1

    // two-byte sequences:
          2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // C2..CF
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // D0..DF

    // three-byte sequences:
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,  // E0..EF

    // four-byte sequences:
    4, 4, 4, 4, 4,                                   // F0..F4

    // these are invalid again (5- or 6-byte
    // sequences and sequences for code points
    // above U+10FFFF, as restricted by RFC 3629):
                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0   // F5..FF
};
 748
 749 size_t
 750 wxMBConvStrictUTF8::ToWChar(wchar_t *dst, size_t dstLen,
 751                             const char *src, size_t srcLen) const
 752 {
 753     wchar_t *out = dstLen ? dst : NULL;
 754     size_t written = 0;
 755
 756     if ( srcLen == wxNO_LEN )
 757         srcLen = strlen(src) + 1;
 758
 759     for ( const char *p = src; ; p++ )
 760     {
 761         if ( !(srcLen == wxNO_LEN ? *p : srcLen) )
 762         {
 763             // all done successfully, just add the trailing NULL if we are not
 764             // using explicit length
 765             if ( srcLen == wxNO_LEN )
 766             {
 767                 if ( out )
 768                 {
 769                     if ( !dstLen )
 770                         break;
 771
 772                     *out = L'\0';
 773                 }
 774
 775                 written++;
 776             }
 777
 778             return written;
 779         }
 780
 781         unsigned char c = *p;
 782         unsigned len = tableUtf8Lengths[c];
 783         if ( !len )
 784             break;
 785
 786         if ( srcLen < len ) // the test works for wxNO_LEN too
 787             break;
 788
 789         if ( srcLen != wxNO_LEN )
 790             srcLen -= len;
 791
 792         if ( out && !dstLen-- )
 793             break;
 794
 795
 796         //   Char. number range   |        UTF-8 octet sequence
 797         //      (hexadecimal)     |              (binary)
 798         //  ----------------------+---------------------------------------------
 799         //  0000 0000 - 0000 007F | 0xxxxxxx
 800         //  0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
 801         //  0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
 802         //  0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 803         //
 804         //  Code point value is stored in bits marked with 'x', lowest-order bit
 805         //  of the value on the right side in the diagram above.
 806         //                                                       (from RFC 3629)
 807
 808         // mask to extract lead byte's value ('x' bits above), by sequence length:
 809         static const unsigned char leadValueMask[] = { 0x7F, 0x1F, 0x0F, 0x07 };
 810
 811         // mask and value of lead byte's most significant bits, by length:
 812         static const unsigned char leadMarkerMask[] = { 0x80, 0xE0, 0xF0, 0xF8 };
 813         static const unsigned char leadMarkerVal[] = { 0x00, 0xC0, 0xE0, 0xF0 };
 814
 815         len--; // it's more convenient to work with 0-based length here
 816
 817         // extract the lead byte's value bits:
 818         if ( (c & leadMarkerMask[len]) != leadMarkerVal[len] )
 819             break;
 820
 821         wxUint32 code = c & leadValueMask[len];
 822
 823         // all remaining bytes, if any, are handled in the same way regardless of
 824         // sequence's length:
 825         for ( ; len; --len )
 826         {
 827             c = *++p;
 828             if ( (c & 0xC0) != 0x80 )
 829                 return wxCONV_FAILED;
 830
 831             code <<= 6;
 832             code |= c & 0x3F;
 833         }
 834
 835 #ifdef WC_UTF16
 836         // cast is ok because wchar_t == wxUint16 if WC_UTF16
 837         if ( encode_utf16(code, (wxUint16 *)out) == 2 )
 838         {
 839             if ( out )
 840                 out++;
 841             written++;
 842         }
 843 #else // !WC_UTF16
 844         if ( out )
 845             *out = code;
 846 #endif // WC_UTF16/!WC_UTF16
 847
 848         if ( out )
 849             out++;
 850
 851         written++;
 852     }
 853
 854     return wxCONV_FAILED;
 855 }
 856
 857 size_t
 858 wxMBConvStrictUTF8::FromWChar(char *dst, size_t dstLen,
 859                               const wchar_t *src, size_t srcLen) const
 860 {
 861     char *out = dstLen ? dst : NULL;
 862     size_t written = 0;
 863
 864     for ( const wchar_t *wp = src; ; wp++ )
 865     {
 866         if ( !(srcLen == wxNO_LEN ? *wp : srcLen--) )
 867         {
 868             // all done successfully, just add the trailing NULL if we are not
 869             // using explicit length
 870             if ( srcLen == wxNO_LEN )
 871             {
 872                 if ( out )
 873                 {
 874                     if ( !dstLen )
 875                         break;
 876
 877                     *out = '\0';
 878                 }
 879
 880                 written++;
 881             }
 882
 883             return written;
 884         }
 885
 886
 887         wxUint32 code;
 888 #ifdef WC_UTF16
 889         // cast is ok for WC_UTF16
 890         if ( decode_utf16((const wxUint16 *)wp, code) == 2 )
 891         {
 892             // skip the next char too as we decoded a surrogate
 893             wp++;
 894         }
 895 #else // wchar_t is UTF-32
 896         code = *wp & 0x7fffffff;
 897 #endif
 898
 899         unsigned len;
 900         if ( code <= 0x7F )
 901         {
 902             len = 1;
 903             if ( out )
 904             {
 905                 if ( dstLen < len )
 906                     break;
 907
 908                 out[0] = (char)code;
 909             }
 910         }
 911         else if ( code <= 0x07FF )
 912         {
 913             len = 2;
 914             if ( out )
 915             {
 916                 if ( dstLen < len )
 917                     break;
 918
 919                 // NB: this line takes 6 least significant bits, encodes them as
 920                 // 10xxxxxx and discards them so that the next byte can be encoded:
 921                 out[1] = 0x80 | (code & 0x3F);  code >>= 6;
 922                 out[0] = 0xC0 | code;
 923             }
 924         }
 925         else if ( code < 0xFFFF )
 926         {
 927             len = 3;
 928             if ( out )
 929             {
 930                 if ( dstLen < len )
 931                     break;
 932
 933                 out[2] = 0x80 | (code & 0x3F);  code >>= 6;
 934                 out[1] = 0x80 | (code & 0x3F);  code >>= 6;
 935                 out[0] = 0xE0 | code;
 936             }
 937         }
 938         else if ( code <= 0x10FFFF )
 939         {
 940             len = 4;
 941             if ( out )
 942             {
 943                 if ( dstLen < len )
 944                     break;
 945
 946                 out[3] = 0x80 | (code & 0x3F);  code >>= 6;
 947                 out[2] = 0x80 | (code & 0x3F);  code >>= 6;
 948                 out[1] = 0x80 | (code & 0x3F);  code >>= 6;
 949                 out[0] = 0xF0 | code;
 950             }
 951         }
 952         else
 953         {
 954             wxFAIL_MSG( _T("trying to encode undefined Unicode character") );
 955             break;
 956         }
 957
 958         if ( out )
 959         {
 960             out += len;
 961             dstLen -= len;
 962         }
 963
 964         written += len;
 965     }
 966
 967     // we only get here if an error occurs during decoding
 968     return wxCONV_FAILED;
 969 }
 970
// Convert the NUL-terminated UTF-8 string psz to wide characters, tolerating
// invalid input as directed by m_options.
//
// If m_options is exactly MAP_INVALID_UTF8_NOT the strict conversion of
// wxMBConvStrictUTF8 is used. Otherwise every byte of a sequence which is not
// valid UTF-8 is either mapped to the character wxUnicodePUA + byte
// (MAP_INVALID_UTF8_TO_PUA) or written out as a backslash followed by 3 octal
// digits (MAP_INVALID_UTF8_TO_OCTAL); in the latter mode a literal backslash
// in the input is doubled so that it can be told apart from an escape.
//
// buf may be NULL, in which case nothing is stored and only the length is
// computed; otherwise the loop stops as soon as n wide characters have been
// produced. The return value is the number of wide characters produced or
// wxCONV_FAILED; a trailing NUL is stored only if len < n at the end.
size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const
{
    if ( m_options == MAP_INVALID_UTF8_NOT )
        return wxMBConvStrictUTF8::MB2WC(buf, psz, n);

    size_t len = 0;

    while (*psz && ((!buf) || (len < n)))
    {
        // remember where this sequence starts: if it turns out to be invalid,
        // all bytes from opsz up to psz are escaped one by one below
        const char *opsz = psz;
        bool invalid = false;
        unsigned char cc = *psz++, fc = cc;

        // count the leading 1 bits of the first byte
        unsigned cnt;
        for (cnt = 0; fc & 0x80; cnt++)
            fc <<= 1;

        if (!cnt)
        {
            // plain ASCII char
            if (buf)
                *buf++ = cc;
            len++;

            // escape the escape character for octal escapes
            if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
                    && cc == '\\' && (!buf || len < n))
            {
                if (buf)
                    *buf++ = cc;
                len++;
            }
        }
        else
        {
            // from now on cnt is the number of continuation bytes which must
            // follow the lead byte; 0 means that the "lead" byte was itself of
            // the form 10xxxxxx, i.e. a continuation byte without a lead byte
            cnt--;
            if (!cnt)
            {
                // invalid UTF-8 sequence
                invalid = true;
            }
            else
            {
                unsigned ocnt = cnt - 1;

                // payload bits of the lead byte
                wxUint32 res = cc & (0x3f >> cnt);
                while (cnt--)
                {
                    // a NUL byte fails this test too, so the loop can't run
                    // past the end of the string
                    cc = *psz;
                    if ((cc & 0xC0) != 0x80)
                    {
                        // invalid UTF-8 sequence
                        invalid = true;
                        break;
                    }

                    psz++;
                    res = (res << 6) | (cc & 0x3f);
                }

                // NOTE(review): utf8_max is defined outside of this block;
                // presumably utf8_max[ocnt] is the biggest value representable
                // with one byte less than was used here, so that this test
                // rejects overlong encodings -- confirm
                if (invalid || res <= utf8_max[ocnt])
                {
                    // illegal UTF-8 encoding
                    invalid = true;
                }
                else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
                        res >= wxUnicodePUA && res < wxUnicodePUAEnd)
                {
                    // if one of our PUA characters turns up externally
                    // it must also be treated as an illegal sequence
                    // (a bit like you have to escape an escape character)
                    invalid = true;
                }
                else
                {
#ifdef WC_UTF16
                    // cast is ok because wchar_t == wxUint16 if WC_UTF16
                    //
                    // NOTE(review): nothing here checks that buf has room for
                    // 2 units if encode_utf16() produces a surrogate pair,
                    // only len < n was tested at the top of the loop --
                    // confirm that callers size the buffer accordingly
                    size_t pa = encode_utf16(res, (wxUint16 *)buf);
                    if (pa == wxCONV_FAILED)
                    {
                        invalid = true;
                    }
                    else
                    {
                        if (buf)
                            buf += pa;
                        len += pa;
                    }
#else // !WC_UTF16
                    if (buf)
                        *buf++ = (wchar_t)res;
                    len++;
#endif // WC_UTF16/!WC_UTF16
                }
            }

            if (invalid)
            {
                if (m_options & MAP_INVALID_UTF8_TO_PUA)
                {
                    // map each byte consumed so far to wxUnicodePUA + byte
                    while (opsz < psz && (!buf || len < n))
                    {
#ifdef WC_UTF16
                        // cast is ok because wchar_t == wxUint16 if WC_UTF16
                        size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
                        wxASSERT(pa != wxCONV_FAILED);
                        if (buf)
                            buf += pa;
                        opsz++;
                        len += pa;
#else
                        if (buf)
                            *buf++ = (wchar_t)(wxUnicodePUA + (unsigned char)*opsz);
                        opsz++;
                        len++;
#endif
                    }
                }
                else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
                {
                    // write each byte consumed so far as "\ooo"
                    while (opsz < psz && (!buf || len < n))
                    {
                        // the 4 characters are only stored if all of them
                        // fit, but len is advanced by 4 in any case
                        if ( buf && len + 3 < n )
                        {
                            unsigned char on = *opsz;
                            *buf++ = L'\\';
                            *buf++ = (wchar_t)( L'0' + on / 0100 );
                            *buf++ = (wchar_t)( L'0' + (on % 0100) / 010 );
                            *buf++ = (wchar_t)( L'0' + on % 010 );
                        }

                        opsz++;
                        len += 4;
                    }
                }
                else // MAP_INVALID_UTF8_NOT
                {
                    return wxCONV_FAILED;
                }
            }
        }
    }

    // NUL-terminate the output only if there is still room for it
    if (buf && (len < n))
        *buf = 0;

    return len;
}
1117
// return true if wch is one of the octal digits L'0' .. L'7'
static inline bool isoctal(wchar_t wch)
{
    if ( wch < L'0' )
        return false;

    return wch <= L'7';
}
1122
// Convert the NUL-terminated wide string psz to UTF-8, undoing the escapes
// selected by m_options.
//
// If m_options is exactly MAP_INVALID_UTF8_NOT the strict conversion of
// wxMBConvStrictUTF8 is used. Otherwise, with MAP_INVALID_UTF8_TO_PUA, a
// character in the range [wxUnicodePUA, wxUnicodePUAEnd) is output as the
// single byte (character - wxUnicodePUA) and, with MAP_INVALID_UTF8_TO_OCTAL,
// a double backslash becomes a single one and a backslash followed by 3 octal
// digits becomes the byte with this octal value. Everything else is encoded
// as UTF-8.
//
// buf may be NULL, in which case nothing is stored and only the length is
// computed; otherwise the loop stops as soon as n bytes have been produced.
// Returns the number of bytes produced; a trailing NUL is stored only if
// len < n at the end.
size_t wxMBConvUTF8::WC2MB(char *buf, const wchar_t *psz, size_t n) const
{
    if ( m_options == MAP_INVALID_UTF8_NOT )
        return wxMBConvStrictUTF8::WC2MB(buf, psz, n);

    size_t len = 0;

    while (*psz && ((!buf) || (len < n)))
    {
        wxUint32 cc;

#ifdef WC_UTF16
        // cast is ok for WC_UTF16
        //
        // if decoding fails, a single unit is skipped and conversion goes on
        // with whatever decode_utf16() left in cc
        size_t pa = decode_utf16((const wxUint16 *)psz, cc);
        psz += (pa == wxCONV_FAILED) ? 1 : pa;
#else
        // the top bit is masked off
        cc = (*psz++) & 0x7fffffff;
#endif

        if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
                && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
        {
            // turn the PUA character back into the raw byte
            if (buf)
                *buf++ = (char)(cc - wxUnicodePUA);
            len++;
        }
        else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL)
                    && cc == L'\\' && psz[0] == L'\\' )
        {
            // escaped backslash: output one and skip the second one
            if (buf)
                *buf++ = (char)cc;
            psz++;
            len++;
        }
        else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
                    cc == L'\\' &&
                        isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
        {
            // octal escape: thanks to the short-circuit evaluation above no
            // character beyond the terminating NUL is ever examined
            if (buf)
            {
                *buf++ = (char) ((psz[0] - L'0') * 0100 +
                                 (psz[1] - L'0') * 010 +
                                 (psz[2] - L'0'));
            }

            psz += 3;
            len++;
        }
        else
        {
            // find the number of continuation bytes needed for cc: the
            // sequence is cnt + 1 bytes long
            unsigned cnt;
            for (cnt = 0; cc > utf8_max[cnt]; cnt++)
            {
            }

            if (!cnt)
            {
                // plain ASCII char
                if (buf)
                    *buf++ = (char) cc;
                len++;
            }
            else
            {
                // NOTE(review): only len < n was checked at the top of the
                // loop, the cnt + 1 bytes below are stored without checking
                // that all of them fit into buf -- confirm that callers size
                // the buffer accordingly
                len += cnt + 1;
                if (buf)
                {
                    // lead byte: cnt + 1 leading 1 bits followed by the most
                    // significant payload bits; note that -128 >> cnt relies
                    // on the right shift of a negative value being an
                    // arithmetic one
                    *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));

                    // continuation bytes, 6 payload bits each, most
                    // significant first
                    while (cnt--)
                        *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
                }
            }
        }
    }

    // NUL-terminate the output only if there is still room for it
    if (buf && (len < n))
        *buf = 0;

    return len;
}
1203
1204 // ============================================================================
1205 // UTF-16
1206 // ============================================================================
1207
// "straight" stands for the UTF-16 converter using the byte order selected by
// WORDS_BIGENDIAN and "swap" for the one using the opposite byte order
#ifndef WORDS_BIGENDIAN
    #define wxMBConvUTF16straight wxMBConvUTF16LE
    #define wxMBConvUTF16swap     wxMBConvUTF16BE
#else // big endian
    #define wxMBConvUTF16straight wxMBConvUTF16BE
    #define wxMBConvUTF16swap     wxMBConvUTF16LE
#endif
1215
1216 /* static */
1217 size_t wxMBConvUTF16Base::GetLength(const char *src, size_t srcLen)
1218 {
1219     if ( srcLen == wxNO_LEN )
1220     {
1221         // count the number of bytes in input, including the trailing NULs
1222         const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1223         for ( srcLen = 1; *inBuff++; srcLen++ )
1224             ;
1225
1226         srcLen *= BYTES_PER_CHAR;
1227     }
1228     else // we already have the length
1229     {
1230         // we can only convert an entire number of UTF-16 characters
1231         if ( srcLen % BYTES_PER_CHAR )
1232             return wxCONV_FAILED;
1233     }
1234
1235     return srcLen;
1236 }
1237
1238 // case when in-memory representation is UTF-16 too
1239 #ifdef WC_UTF16
1240
1241 // ----------------------------------------------------------------------------
1242 // conversions without endianness change
1243 // ----------------------------------------------------------------------------
1244
1245 size_t
1246 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1247                                const char *src, size_t srcLen) const
1248 {
1249     // set up the scene for using memcpy() (which is presumably more efficient
1250     // than copying the bytes one by one)
1251     srcLen = GetLength(src, srcLen);
1252     if ( srcLen == wxNO_LEN )
1253         return wxCONV_FAILED;
1254
1255     const size_t inLen = srcLen / BYTES_PER_CHAR;
1256     if ( dst )
1257     {
1258         if ( dstLen < inLen )
1259             return wxCONV_FAILED;
1260
1261         memcpy(dst, src, srcLen);
1262     }
1263
1264     return inLen;
1265 }
1266
1267 size_t
1268 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1269                                  const wchar_t *src, size_t srcLen) const
1270 {
1271     if ( srcLen == wxNO_LEN )
1272         srcLen = wxWcslen(src) + 1;
1273
1274     srcLen *= BYTES_PER_CHAR;
1275
1276     if ( dst )
1277     {
1278         if ( dstLen < srcLen )
1279             return wxCONV_FAILED;
1280
1281         memcpy(dst, src, srcLen);
1282     }
1283
1284     return srcLen;
1285 }
1286
1287 // ----------------------------------------------------------------------------
1288 // endian-reversing conversions
1289 // ----------------------------------------------------------------------------
1290
1291 size_t
1292 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1293                            const char *src, size_t srcLen) const
1294 {
1295     srcLen = GetLength(src, srcLen);
1296     if ( srcLen == wxNO_LEN )
1297         return wxCONV_FAILED;
1298
1299     srcLen /= BYTES_PER_CHAR;
1300
1301     if ( dst )
1302     {
1303         if ( dstLen < srcLen )
1304             return wxCONV_FAILED;
1305
1306         const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1307         for ( size_t n = 0; n < srcLen; n++, inBuff++ )
1308         {
1309             *dst++ = wxUINT16_SWAP_ALWAYS(*inBuff);
1310         }
1311     }
1312
1313     return srcLen;
1314 }
1315
1316 size_t
1317 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1318                              const wchar_t *src, size_t srcLen) const
1319 {
1320     if ( srcLen == wxNO_LEN )
1321         srcLen = wxWcslen(src) + 1;
1322
1323     srcLen *= BYTES_PER_CHAR;
1324
1325     if ( dst )
1326     {
1327         if ( dstLen < srcLen )
1328             return wxCONV_FAILED;
1329
1330         wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1331         for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
1332         {
1333             *outBuff++ = wxUINT16_SWAP_ALWAYS(*src);
1334         }
1335     }
1336
1337     return srcLen;
1338 }
1339
1340 #else // !WC_UTF16: wchar_t is UTF-32
1341
1342 // ----------------------------------------------------------------------------
1343 // conversions without endianness change
1344 // ----------------------------------------------------------------------------
1345
1346 size_t
1347 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1348                                const char *src, size_t srcLen) const
1349 {
1350     srcLen = GetLength(src, srcLen);
1351     if ( srcLen == wxNO_LEN )
1352         return wxCONV_FAILED;
1353
1354     const size_t inLen = srcLen / BYTES_PER_CHAR;
1355     if ( !dst )
1356     {
1357         // optimization: return maximal space which could be needed for this
1358         // string even if the real size could be smaller if the buffer contains
1359         // any surrogates
1360         return inLen;
1361     }
1362
1363     size_t outLen = 0;
1364     const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1365     for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1366     {
1367         const wxUint32 ch = wxDecodeSurrogate(&inBuff);
1368         if ( !inBuff )
1369             return wxCONV_FAILED;
1370
1371         if ( ++outLen > dstLen )
1372             return wxCONV_FAILED;
1373
1374         *dst++ = ch;
1375     }
1376
1377
1378     return outLen;
1379 }
1380
1381 size_t
1382 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1383                                  const wchar_t *src, size_t srcLen) const
1384 {
1385     if ( srcLen == wxNO_LEN )
1386         srcLen = wxWcslen(src) + 1;
1387
1388     size_t outLen = 0;
1389     wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1390     for ( size_t n = 0; n < srcLen; n++ )
1391     {
1392         wxUint16 cc[2];
1393         const size_t numChars = encode_utf16(*src++, cc);
1394         if ( numChars == wxCONV_FAILED )
1395             return wxCONV_FAILED;
1396
1397         outLen += numChars * BYTES_PER_CHAR;
1398         if ( outBuff )
1399         {
1400             if ( outLen > dstLen )
1401                 return wxCONV_FAILED;
1402
1403             *outBuff++ = cc[0];
1404             if ( numChars == 2 )
1405             {
1406                 // second character of a surrogate
1407                 *outBuff++ = cc[1];
1408             }
1409         }
1410     }
1411
1412     return outLen;
1413 }
1414
1415 // ----------------------------------------------------------------------------
1416 // endian-reversing conversions
1417 // ----------------------------------------------------------------------------
1418
1419 size_t
1420 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1421                            const char *src, size_t srcLen) const
1422 {
1423     srcLen = GetLength(src, srcLen);
1424     if ( srcLen == wxNO_LEN )
1425         return wxCONV_FAILED;
1426
1427     const size_t inLen = srcLen / BYTES_PER_CHAR;
1428     if ( !dst )
1429     {
1430         // optimization: return maximal space which could be needed for this
1431         // string even if the real size could be smaller if the buffer contains
1432         // any surrogates
1433         return inLen;
1434     }
1435
1436     size_t outLen = 0;
1437     const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1438     for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1439     {
1440         wxUint32 ch;
1441         wxUint16 tmp[2];
1442
1443         tmp[0] = wxUINT16_SWAP_ALWAYS(*inBuff);
1444         inBuff++;
1445         tmp[1] = wxUINT16_SWAP_ALWAYS(*inBuff);
1446
1447         const size_t numChars = decode_utf16(tmp, ch);
1448         if ( numChars == wxCONV_FAILED )
1449             return wxCONV_FAILED;
1450
1451         if ( numChars == 2 )
1452             inBuff++;
1453
1454         if ( ++outLen > dstLen )
1455             return wxCONV_FAILED;
1456
1457         *dst++ = ch;
1458     }
1459
1460
1461     return outLen;
1462 }
1463
1464 size_t
1465 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1466                              const wchar_t *src, size_t srcLen) const
1467 {
1468     if ( srcLen == wxNO_LEN )
1469         srcLen = wxWcslen(src) + 1;
1470
1471     size_t outLen = 0;
1472     wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1473     for ( const wchar_t *srcEnd = src + srcLen; src < srcEnd; src++ )
1474     {
1475         wxUint16 cc[2];
1476         const size_t numChars = encode_utf16(*src, cc);
1477         if ( numChars == wxCONV_FAILED )
1478             return wxCONV_FAILED;
1479
1480         outLen += numChars * BYTES_PER_CHAR;
1481         if ( outBuff )
1482         {
1483             if ( outLen > dstLen )
1484                 return wxCONV_FAILED;
1485
1486             *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[0]);
1487             if ( numChars == 2 )
1488             {
1489                 // second character of a surrogate
1490                 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[1]);
1491             }
1492         }
1493     }
1494
1495     return outLen;
1496 }
1497
1498 #endif // WC_UTF16/!WC_UTF16
1499
1500
1501 // ============================================================================
1502 // UTF-32
1503 // ============================================================================
1504
// "straight" stands for the UTF-32 converter using the byte order selected by
// WORDS_BIGENDIAN and "swap" for the one using the opposite byte order
#ifndef WORDS_BIGENDIAN
    #define wxMBConvUTF32straight  wxMBConvUTF32LE
    #define wxMBConvUTF32swap      wxMBConvUTF32BE
#else // big endian
    #define wxMBConvUTF32straight  wxMBConvUTF32BE
    #define wxMBConvUTF32swap      wxMBConvUTF32LE
#endif
1512
1513
// the UTF-32 converter objects, one for each byte order
//
// NOTE(review): WXDLLIMPEXP_DATA_BASE() is defined outside of this file and
// presumably only adds the DLL import/export decoration to the type --
// confirm
WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
1516
1517 /* static */
1518 size_t wxMBConvUTF32Base::GetLength(const char *src, size_t srcLen)
1519 {
1520     if ( srcLen == wxNO_LEN )
1521     {
1522         // count the number of bytes in input, including the trailing NULs
1523         const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1524         for ( srcLen = 1; *inBuff++; srcLen++ )
1525             ;
1526
1527         srcLen *= BYTES_PER_CHAR;
1528     }
1529     else // we already have the length
1530     {
1531         // we can only convert an entire number of UTF-32 characters
1532         if ( srcLen % BYTES_PER_CHAR )
1533             return wxCONV_FAILED;
1534     }
1535
1536     return srcLen;
1537 }
1538
1539 // case when in-memory representation is UTF-16
1540 #ifdef WC_UTF16
1541
1542 // ----------------------------------------------------------------------------
1543 // conversions without endianness change
1544 // ----------------------------------------------------------------------------
1545
1546 size_t
1547 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1548                                const char *src, size_t srcLen) const
1549 {
1550     srcLen = GetLength(src, srcLen);
1551     if ( srcLen == wxNO_LEN )
1552         return wxCONV_FAILED;
1553
1554     const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1555     const size_t inLen = srcLen / BYTES_PER_CHAR;
1556     size_t outLen = 0;
1557     for ( size_t n = 0; n < inLen; n++ )
1558     {
1559         wxUint16 cc[2];
1560         const size_t numChars = encode_utf16(*inBuff++, cc);
1561         if ( numChars == wxCONV_FAILED )
1562             return wxCONV_FAILED;
1563
1564         outLen += numChars;
1565         if ( dst )
1566         {
1567             if ( outLen > dstLen )
1568                 return wxCONV_FAILED;
1569
1570             *dst++ = cc[0];
1571             if ( numChars == 2 )
1572             {
1573                 // second character of a surrogate
1574                 *dst++ = cc[1];
1575             }
1576         }
1577     }
1578
1579     return outLen;
1580 }
1581
1582 size_t
1583 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1584                                  const wchar_t *src, size_t srcLen) const
1585 {
1586     if ( srcLen == wxNO_LEN )
1587         srcLen = wxWcslen(src) + 1;
1588
1589     if ( !dst )
1590     {
1591         // optimization: return maximal space which could be needed for this
1592         // string instead of the exact amount which could be less if there are
1593         // any surrogates in the input
1594         //
1595         // we consider that surrogates are rare enough to make it worthwhile to
1596         // avoid running the loop below at the cost of slightly extra memory
1597         // consumption
1598         return srcLen * BYTES_PER_CHAR;
1599     }
1600
1601     wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1602     size_t outLen = 0;
1603     for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1604     {
1605         const wxUint32 ch = wxDecodeSurrogate(&src);
1606         if ( !src )
1607             return wxCONV_FAILED;
1608
1609         outLen += BYTES_PER_CHAR;
1610
1611         if ( outLen > dstLen )
1612             return wxCONV_FAILED;
1613
1614         *outBuff++ = ch;
1615     }
1616
1617     return outLen;
1618 }
1619
1620 // ----------------------------------------------------------------------------
1621 // endian-reversing conversions
1622 // ----------------------------------------------------------------------------
1623
1624 size_t
1625 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1626                            const char *src, size_t srcLen) const
1627 {
1628     srcLen = GetLength(src, srcLen);
1629     if ( srcLen == wxNO_LEN )
1630         return wxCONV_FAILED;
1631
1632     const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1633     const size_t inLen = srcLen / BYTES_PER_CHAR;
1634     size_t outLen = 0;
1635     for ( size_t n = 0; n < inLen; n++, inBuff++ )
1636     {
1637         wxUint16 cc[2];
1638         const size_t numChars = encode_utf16(wxUINT32_SWAP_ALWAYS(*inBuff), cc);
1639         if ( numChars == wxCONV_FAILED )
1640             return wxCONV_FAILED;
1641
1642         outLen += numChars;
1643         if ( dst )
1644         {
1645             if ( outLen > dstLen )
1646                 return wxCONV_FAILED;
1647
1648             *dst++ = cc[0];
1649             if ( numChars == 2 )
1650             {
1651                 // second character of a surrogate
1652                 *dst++ = cc[1];
1653             }
1654         }
1655     }
1656
1657     return outLen;
1658 }
1659
1660 size_t
1661 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1662                              const wchar_t *src, size_t srcLen) const
1663 {
1664     if ( srcLen == wxNO_LEN )
1665         srcLen = wxWcslen(src) + 1;
1666
1667     if ( !dst )
1668     {
1669         // optimization: return maximal space which could be needed for this
1670         // string instead of the exact amount which could be less if there are
1671         // any surrogates in the input
1672         //
1673         // we consider that surrogates are rare enough to make it worthwhile to
1674         // avoid running the loop below at the cost of slightly extra memory
1675         // consumption
1676         return srcLen*BYTES_PER_CHAR;
1677     }
1678
1679     wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1680     size_t outLen = 0;
1681     for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1682     {
1683         const wxUint32 ch = wxDecodeSurrogate(&src);
1684         if ( !src )
1685             return wxCONV_FAILED;
1686
1687         outLen += BYTES_PER_CHAR;
1688
1689         if ( outLen > dstLen )
1690             return wxCONV_FAILED;
1691
1692         *outBuff++ = wxUINT32_SWAP_ALWAYS(ch);
1693     }
1694
1695     return outLen;
1696 }
1697
1698 #else // !WC_UTF16: wchar_t is UTF-32
1699
1700 // ----------------------------------------------------------------------------
1701 // conversions without endianness change
1702 // ----------------------------------------------------------------------------
1703
1704 size_t
1705 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1706                                const char *src, size_t srcLen) const
1707 {
1708     // use memcpy() as it should be much faster than hand-written loop
1709     srcLen = GetLength(src, srcLen);
1710     if ( srcLen == wxNO_LEN )
1711         return wxCONV_FAILED;
1712
1713     const size_t inLen = srcLen/BYTES_PER_CHAR;
1714     if ( dst )
1715     {
1716         if ( dstLen < inLen )
1717             return wxCONV_FAILED;
1718
1719         memcpy(dst, src, srcLen);
1720     }
1721
1722     return inLen;
1723 }
1724
1725 size_t
1726 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1727                                  const wchar_t *src, size_t srcLen) const
1728 {
1729     if ( srcLen == wxNO_LEN )
1730         srcLen = wxWcslen(src) + 1;
1731
1732     srcLen *= BYTES_PER_CHAR;
1733
1734     if ( dst )
1735     {
1736         if ( dstLen < srcLen )
1737             return wxCONV_FAILED;
1738
1739         memcpy(dst, src, srcLen);
1740     }
1741
1742     return srcLen;
1743 }
1744
1745 // ----------------------------------------------------------------------------
1746 // endian-reversing conversions
1747 // ----------------------------------------------------------------------------
1748
1749 size_t
1750 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1751                            const char *src, size_t srcLen) const
1752 {
1753     srcLen = GetLength(src, srcLen);
1754     if ( srcLen == wxNO_LEN )
1755         return wxCONV_FAILED;
1756
1757     srcLen /= BYTES_PER_CHAR;
1758
1759     if ( dst )
1760     {
1761         if ( dstLen < srcLen )
1762             return wxCONV_FAILED;
1763
1764         const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1765         for ( size_t n = 0; n < srcLen; n++, inBuff++ )
1766         {
1767             *dst++ = wxUINT32_SWAP_ALWAYS(*inBuff);
1768         }
1769     }
1770
1771     return srcLen;
1772 }
1773
1774 size_t
1775 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1776                              const wchar_t *src, size_t srcLen) const
1777 {
1778     if ( srcLen == wxNO_LEN )
1779         srcLen = wxWcslen(src) + 1;
1780
1781     srcLen *= BYTES_PER_CHAR;
1782
1783     if ( dst )
1784     {
1785         if ( dstLen < srcLen )
1786             return wxCONV_FAILED;
1787
1788         wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1789         for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
1790         {
1791             *outBuff++ = wxUINT32_SWAP_ALWAYS(*src);
1792         }
1793     }
1794
1795     return srcLen;
1796 }
1797
1798 #endif // WC_UTF16/!WC_UTF16
1799
1800
1801 // ============================================================================
1802 // The classes doing conversion using the iconv_xxx() functions
1803 // ============================================================================
1804
1805 #ifdef HAVE_ICONV
1806
// ICONV_FAILED(cres, bufLeft) tests whether an iconv() call failed: cres is
// its return value and bufLeft the number of bytes left in the input buffer
// after the call.
//
// VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
//     E2BIG if output buffer is _exactly_ as big as needed. Such case is
//     (unless there's yet another bug in glibc) the only case when iconv()
//     returns with (size_t)-1 (which means error) and says there are 0 bytes
//     left in the input buffer -- when _real_ error occurs,
//     bytes-left-in-input buffer is non-zero. Hence, this alternative test for
//     iconv() failure.
//     [This bug does not appear in glibc 2.2.]
//
// The macro arguments are parenthesized so that expressions containing
// operators of lower precedence than == or != can be safely passed to it.
#if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
#define ICONV_FAILED(cres, bufLeft) (((cres) == (size_t)-1) && \
                                     (errno != E2BIG || (bufLeft) != 0))
#else
#define ICONV_FAILED(cres, bufLeft)  ((cres) == (size_t)-1)
#endif

// cast the address of the input pointer to the type used here for the second
// argument of iconv()
#define ICONV_CHAR_CAST(x)  ((ICONV_CONST char **)(x))

// value which the iconv_t handles are compared against to detect an invalid
// conversion descriptor
#define ICONV_T_INVALID ((iconv_t)-1)
1825
1826 #if SIZEOF_WCHAR_T == 4
1827     #define WC_BSWAP    wxUINT32_SWAP_ALWAYS
1828     #define WC_ENC      wxFONTENCODING_UTF32
1829 #elif SIZEOF_WCHAR_T == 2
1830     #define WC_BSWAP    wxUINT16_SWAP_ALWAYS
1831     #define WC_ENC      wxFONTENCODING_UTF16
1832 #else // sizeof(wchar_t) != 2 nor 4
1833     // does this ever happen?
1834     #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
1835 #endif
1836
1837 // ----------------------------------------------------------------------------
1838 // wxMBConv_iconv: encapsulates an iconv character set
1839 // ----------------------------------------------------------------------------
1840
1841 class wxMBConv_iconv : public wxMBConv
1842 {
1843 public:
1844     wxMBConv_iconv(const char *name);
1845     virtual ~wxMBConv_iconv();
1846
1847     virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const;
1848     virtual size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const;
1849
1850     // classify this encoding as explained in wxMBConv::GetMBNulLen() comment
1851     virtual size_t GetMBNulLen() const;
1852
1853 #if wxUSE_UNICODE_UTF8
1854     virtual bool IsUTF8() const;
1855 #endif
1856
1857     virtual wxMBConv *Clone() const
1858     {
1859         wxMBConv_iconv *p = new wxMBConv_iconv(m_name.ToAscii());
1860         p->m_minMBCharWidth = m_minMBCharWidth;
1861         return p;
1862     }
1863
1864     bool IsOk() const
1865         { return (m2w != ICONV_T_INVALID) && (w2m != ICONV_T_INVALID); }
1866
1867 protected:
1868     // the iconv handlers used to translate from multibyte
1869     // to wide char and in the other direction
1870     iconv_t m2w,
1871             w2m;
1872
1873 #if wxUSE_THREADS
1874     // guards access to m2w and w2m objects
1875     wxMutex m_iconvMutex;
1876 #endif
1877
1878 private:
1879     // the name (for iconv_open()) of a wide char charset -- if none is
1880     // available on this machine, it will remain NULL
1881     static wxString ms_wcCharsetName;
1882
1883     // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
1884     // different endian-ness than the native one
1885     static bool ms_wcNeedsSwap;
1886
1887
1888     // name of the encoding handled by this conversion
1889     wxString m_name;
1890
1891     // cached result of GetMBNulLen(); set to 0 meaning "unknown"
1892     // initially
1893     size_t m_minMBCharWidth;
1894 };
1895
1896 // make the constructor available for unit testing
1897 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_iconv( const char* name )
1898 {
1899     wxMBConv_iconv* result = new wxMBConv_iconv( name );
1900     if ( !result->IsOk() )
1901     {
1902         delete result;
1903         return 0;
1904     }
1905
1906     return result;
1907 }
1908
// definitions of the static members of wxMBConv_iconv: the wide char charset
// name is default-constructed here and byte swapping is off by default
wxString wxMBConv_iconv::ms_wcCharsetName;
bool wxMBConv_iconv::ms_wcNeedsSwap = false;
1911
1912 wxMBConv_iconv::wxMBConv_iconv(const char *name)
1913               : m_name(name)
1914 {
1915     m_minMBCharWidth = 0;
1916
1917     // check for charset that represents wchar_t:
1918     if ( ms_wcCharsetName.empty() )
1919     {
1920         wxLogTrace(TRACE_STRCONV, _T("Looking for wide char codeset:"));
1921
1922 #if wxUSE_FONTMAP
1923         const wxChar **names = wxFontMapperBase::GetAllEncodingNames(WC_ENC);
1924 #else // !wxUSE_FONTMAP
1925         static const wxChar *names_static[] =
1926         {
1927 #if SIZEOF_WCHAR_T == 4
1928             _T("UCS-4"),
1929 #elif SIZEOF_WCHAR_T = 2
1930             _T("UCS-2"),
1931 #endif
1932             NULL
1933         };
1934         const wxChar **names = names_static;
1935 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
1936
1937         for ( ; *names && ms_wcCharsetName.empty(); ++names )
1938         {
1939             const wxString nameCS(*names);
1940
1941             // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
1942             wxString nameXE(nameCS);
1943
1944 #ifdef WORDS_BIGENDIAN
1945                 nameXE += _T("BE");
1946 #else // little endian
1947                 nameXE += _T("LE");
1948 #endif
1949
1950             wxLogTrace(TRACE_STRCONV, _T("  trying charset \"%s\""),
1951                        nameXE.c_str());
1952
1953             m2w = iconv_open(nameXE.ToAscii(), name);
1954             if ( m2w == ICONV_T_INVALID )
1955             {
1956                 // try charset w/o bytesex info (e.g. "UCS4")
1957                 wxLogTrace(TRACE_STRCONV, _T("  trying charset \"%s\""),
1958                            nameCS.c_str());
1959                 m2w = iconv_open(nameCS.ToAscii(), name);
1960
1961                 // and check for bytesex ourselves:
1962                 if ( m2w != ICONV_T_INVALID )
1963                 {
1964                     char    buf[2], *bufPtr;
1965                     wchar_t wbuf[2], *wbufPtr;
1966                     size_t  insz, outsz;
1967                     size_t  res;
1968
1969                     buf[0] = 'A';
1970                     buf[1] = 0;
1971                     wbuf[0] = 0;
1972                     insz = 2;
1973                     outsz = SIZEOF_WCHAR_T * 2;
1974                     wbufPtr = wbuf;
1975                     bufPtr = buf;
1976
1977                     res = iconv(
1978                         m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
1979                         (char**)&wbufPtr, &outsz);
1980
1981                     if (ICONV_FAILED(res, insz))
1982                     {
1983                         wxLogLastError(wxT("iconv"));
1984                         wxLogError(_("Conversion to charset '%s' doesn't work."),
1985                                    nameCS.c_str());
1986                     }
1987                     else // ok, can convert to this encoding, remember it
1988                     {
1989                         ms_wcCharsetName = nameCS;
1990                         ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
1991                     }
1992                 }
1993             }
1994             else // use charset not requiring byte swapping
1995             {
1996                 ms_wcCharsetName = nameXE;
1997             }
1998         }
1999
2000         wxLogTrace(TRACE_STRCONV,
2001                    wxT("iconv wchar_t charset is \"%s\"%s"),
2002                    ms_wcCharsetName.empty() ? wxString("<none>")
2003                                             : ms_wcCharsetName,
2004                    ms_wcNeedsSwap ? _T(" (needs swap)")
2005                                   : _T(""));
2006     }
2007     else // we already have ms_wcCharsetName
2008     {
2009         m2w = iconv_open(ms_wcCharsetName.ToAscii(), name);
2010     }
2011
2012     if ( ms_wcCharsetName.empty() )
2013     {
2014         w2m = ICONV_T_INVALID;
2015     }
2016     else
2017     {
2018         w2m = iconv_open(name, ms_wcCharsetName.ToAscii());
2019         if ( w2m == ICONV_T_INVALID )
2020         {
2021             wxLogTrace(TRACE_STRCONV,
2022                        wxT("\"%s\" -> \"%s\" works but not the converse!?"),
2023                        ms_wcCharsetName.c_str(), name);
2024         }
2025     }
2026 }
2027
2028 wxMBConv_iconv::~wxMBConv_iconv()
2029 {
2030     if ( m2w != ICONV_T_INVALID )
2031         iconv_close(m2w);
2032     if ( w2m != ICONV_T_INVALID )
2033         iconv_close(w2m);
2034 }
2035
2036 size_t wxMBConv_iconv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
2037 {
2038     // find the string length: notice that must be done differently for
2039     // NUL-terminated strings and UTF-16/32 which are terminated with 2/4 NULs
2040     size_t inbuf;
2041     const size_t nulLen = GetMBNulLen();
2042     switch ( nulLen )
2043     {
2044         default:
2045             return wxCONV_FAILED;
2046
2047         case 1:
2048             inbuf = strlen(psz); // arguably more optimized than our version
2049             break;
2050
2051         case 2:
2052         case 4:
2053             // for UTF-16/32 not only we need to have 2/4 consecutive NULs but
2054             // they also have to start at character boundary and not span two
2055             // adjacent characters
2056             const char *p;
2057             for ( p = psz; NotAllNULs(p, nulLen); p += nulLen )
2058                 ;
2059             inbuf = p - psz;
2060             break;
2061     }
2062
2063 #if wxUSE_THREADS
2064     // NB: iconv() is MT-safe, but each thread must use its own iconv_t handle.
2065     //     Unfortunately there are a couple of global wxCSConv objects such as
2066     //     wxConvLocal that are used all over wx code, so we have to make sure
2067     //     the handle is used by at most one thread at the time. Otherwise
2068     //     only a few wx classes would be safe to use from non-main threads
2069     //     as MB<->WC conversion would fail "randomly".
2070     wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
2071 #endif // wxUSE_THREADS
2072
2073     size_t outbuf = n * SIZEOF_WCHAR_T;
2074     size_t res, cres;
2075     // VS: Use these instead of psz, buf because iconv() modifies its arguments:
2076     wchar_t *bufPtr = buf;
2077     const char *pszPtr = psz;
2078
2079     if (buf)
2080     {
2081         // have destination buffer, convert there
2082         cres = iconv(m2w,
2083                      ICONV_CHAR_CAST(&pszPtr), &inbuf,
2084                      (char**)&bufPtr, &outbuf);
2085         res = n - (outbuf / SIZEOF_WCHAR_T);
2086
2087         if (ms_wcNeedsSwap)
2088         {
2089             // convert to native endianness
2090             for ( unsigned i = 0; i < res; i++ )
2091                 buf[n] = WC_BSWAP(buf[i]);
2092         }
2093
2094         // NUL-terminate the string if there is any space left
2095         if (res < n)
2096             buf[res] = 0;
2097     }
2098     else
2099     {
2100         // no destination buffer... convert using temp buffer
2101         // to calculate destination buffer requirement
2102         wchar_t tbuf[8];
2103         res = 0;
2104
2105         do
2106         {
2107             bufPtr = tbuf;
2108             outbuf = 8 * SIZEOF_WCHAR_T;
2109
2110             cres = iconv(m2w,
2111                          ICONV_CHAR_CAST(&pszPtr), &inbuf,
2112                          (char**)&bufPtr, &outbuf );
2113
2114             res += 8 - (outbuf / SIZEOF_WCHAR_T);
2115         }
2116         while ((cres == (size_t)-1) && (errno == E2BIG));
2117     }
2118
2119     if (ICONV_FAILED(cres, inbuf))
2120     {
2121         //VS: it is ok if iconv fails, hence trace only
2122         wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
2123         return wxCONV_FAILED;
2124     }
2125
2126     return res;
2127 }
2128
2129 size_t wxMBConv_iconv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
2130 {
2131 #if wxUSE_THREADS
2132     // NB: explained in MB2WC
2133     wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
2134 #endif
2135
2136     size_t inlen = wxWcslen(psz);
2137     size_t inbuf = inlen * SIZEOF_WCHAR_T;
2138     size_t outbuf = n;
2139     size_t res, cres;
2140
2141     wchar_t *tmpbuf = 0;
2142
2143     if (ms_wcNeedsSwap)
2144     {
2145         // need to copy to temp buffer to switch endianness
2146         // (doing WC_BSWAP twice on the original buffer won't help, as it
2147         //  could be in read-only memory, or be accessed in some other thread)
2148         tmpbuf = (wchar_t *)malloc(inbuf + SIZEOF_WCHAR_T);
2149         for ( size_t i = 0; i < inlen; i++ )
2150             tmpbuf[n] = WC_BSWAP(psz[i]);
2151
2152         tmpbuf[inlen] = L'\0';
2153         psz = tmpbuf;
2154     }
2155
2156     if (buf)
2157     {
2158         // have destination buffer, convert there
2159         cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
2160
2161         res = n - outbuf;
2162
2163         // NB: iconv was given only wcslen(psz) characters on input, and so
2164         //     it couldn't convert the trailing zero. Let's do it ourselves
2165         //     if there's some room left for it in the output buffer.
2166         if (res < n)
2167             buf[0] = 0;
2168     }
2169     else
2170     {
2171         // no destination buffer: convert using temp buffer
2172         // to calculate destination buffer requirement
2173         char tbuf[16];
2174         res = 0;
2175         do
2176         {
2177             buf = tbuf;
2178             outbuf = 16;
2179
2180             cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
2181
2182             res += 16 - outbuf;
2183         }
2184         while ((cres == (size_t)-1) && (errno == E2BIG));
2185     }
2186
2187     if (ms_wcNeedsSwap)
2188     {
2189         free(tmpbuf);
2190     }
2191
2192     if (ICONV_FAILED(cres, inbuf))
2193     {
2194         wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
2195         return wxCONV_FAILED;
2196     }
2197
2198     return res;
2199 }
2200
2201 size_t wxMBConv_iconv::GetMBNulLen() const
2202 {
2203     if ( m_minMBCharWidth == 0 )
2204     {
2205         wxMBConv_iconv * const self = wxConstCast(this, wxMBConv_iconv);
2206
2207 #if wxUSE_THREADS
2208         // NB: explained in MB2WC
2209         wxMutexLocker lock(self->m_iconvMutex);
2210 #endif
2211
2212         const wchar_t *wnul = L"";
2213         char buf[8]; // should be enough for NUL in any encoding
2214         size_t inLen = sizeof(wchar_t),
2215                outLen = WXSIZEOF(buf);
2216         char *inBuff = (char *)wnul;
2217         char *outBuff = buf;
2218         if ( iconv(w2m, ICONV_CHAR_CAST(&inBuff), &inLen, &outBuff, &outLen) == (size_t)-1 )
2219         {
2220             self->m_minMBCharWidth = (size_t)-1;
2221         }
2222         else // ok
2223         {
2224             self->m_minMBCharWidth = outBuff - buf;
2225         }
2226     }
2227
2228     return m_minMBCharWidth;
2229 }
2230
2231 #if wxUSE_UNICODE_UTF8
2232 bool wxMBConv_iconv::IsUTF8() const
2233 {
2234     return wxStricmp(m_name, "UTF-8") == 0 ||
2235            wxStricmp(m_name, "UTF8") == 0;
2236 }
2237 #endif
2238
2239 #endif // HAVE_ICONV
2240
2241
2242 // ============================================================================
2243 // Win32 conversion classes
2244 // ============================================================================
2245
2246 #ifdef wxHAVE_WIN32_MB2WC
2247
// from utils.cpp
//
// These functions are used by the wxMBConv_win32 constructors below to find
// the code page to use from either a charset name or a wxFontEncoding value.
// NOTE(review): wxMBConv_win32::IsOk() treats a code page of -1 as invalid,
// so presumably -1 is what they return on failure -- confirm in utils.cpp.
#if wxUSE_FONTMAP
extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const char *charset);
extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
#endif
2253
2254 class wxMBConv_win32 : public wxMBConv
2255 {
2256 public:
2257     wxMBConv_win32()
2258     {
2259         m_CodePage = CP_ACP;
2260         m_minMBCharWidth = 0;
2261     }
2262
2263     wxMBConv_win32(const wxMBConv_win32& conv)
2264         : wxMBConv()
2265     {
2266         m_CodePage = conv.m_CodePage;
2267         m_minMBCharWidth = conv.m_minMBCharWidth;
2268     }
2269
2270 #if wxUSE_FONTMAP
2271     wxMBConv_win32(const char* name)
2272     {
2273         m_CodePage = wxCharsetToCodepage(name);
2274         m_minMBCharWidth = 0;
2275     }
2276
2277     wxMBConv_win32(wxFontEncoding encoding)
2278     {
2279         m_CodePage = wxEncodingToCodepage(encoding);
2280         m_minMBCharWidth = 0;
2281     }
2282 #endif // wxUSE_FONTMAP
2283
2284     virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2285     {
2286         // note that we have to use MB_ERR_INVALID_CHARS flag as it without it
2287         // the behaviour is not compatible with the Unix version (using iconv)
2288         // and break the library itself, e.g. wxTextInputStream::NextChar()
2289         // wouldn't work if reading an incomplete MB char didn't result in an
2290         // error
2291         //
2292         // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
2293         // Win XP or newer and it is not supported for UTF-[78] so we always
2294         // use our own conversions in this case. See
2295         //     http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
2296         //     http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
2297         if ( m_CodePage == CP_UTF8 )
2298         {
2299             return wxMBConvUTF8().MB2WC(buf, psz, n);
2300         }
2301
2302         if ( m_CodePage == CP_UTF7 )
2303         {
2304             return wxMBConvUTF7().MB2WC(buf, psz, n);
2305         }
2306
2307         int flags = 0;
2308         if ( (m_CodePage < 50000 && m_CodePage != CP_SYMBOL) &&
2309                 IsAtLeastWin2kSP4() )
2310         {
2311             flags = MB_ERR_INVALID_CHARS;
2312         }
2313
2314         const size_t len = ::MultiByteToWideChar
2315                              (
2316                                 m_CodePage,     // code page
2317                                 flags,          // flags: fall on error
2318                                 psz,            // input string
2319                                 -1,             // its length (NUL-terminated)
2320                                 buf,            // output string
2321                                 buf ? n : 0     // size of output buffer
2322                              );
2323         if ( !len )
2324         {
2325             // function totally failed
2326             return wxCONV_FAILED;
2327         }
2328
2329         // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
2330         // check if we succeeded, by doing a double trip:
2331         if ( !flags && buf )
2332         {
2333             const size_t mbLen = strlen(psz);
2334             wxCharBuffer mbBuf(mbLen);
2335             if ( ::WideCharToMultiByte
2336                    (
2337                       m_CodePage,
2338                       0,
2339                       buf,
2340                       -1,
2341                       mbBuf.data(),
2342                       mbLen + 1,        // size in bytes, not length
2343                       NULL,
2344                       NULL
2345                    ) == 0 ||
2346                   strcmp(mbBuf, psz) != 0 )
2347             {
2348                 // we didn't obtain the same thing we started from, hence
2349                 // the conversion was lossy and we consider that it failed
2350                 return wxCONV_FAILED;
2351             }
2352         }
2353
2354         // note that it returns count of written chars for buf != NULL and size
2355         // of the needed buffer for buf == NULL so in either case the length of
2356         // the string (which never includes the terminating NUL) is one less
2357         return len - 1;
2358     }
2359
2360     virtual size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
2361     {
2362         /*
2363             we have a problem here: by default, WideCharToMultiByte() may
2364             replace characters unrepresentable in the target code page with bad
2365             quality approximations such as turning "1/2" symbol (U+00BD) into
2366             "1" for the code pages which don't have it and we, obviously, want
2367             to avoid this at any price
2368
2369             the trouble is that this function does it _silently_, i.e. it won't
2370             even tell us whether it did or not... Win98/2000 and higher provide
2371             WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
2372             we have to resort to a round trip, i.e. check that converting back
2373             results in the same string -- this is, of course, expensive but
2374             otherwise we simply can't be sure to not garble the data.
2375          */
2376
2377         // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
2378         // it doesn't work with CJK encodings (which we test for rather roughly
2379         // here...) nor with UTF-7/8 nor, of course, with Windows versions not
2380         // supporting it
2381         BOOL usedDef wxDUMMY_INITIALIZE(false);
2382         BOOL *pUsedDef;
2383         int flags;
2384         if ( CanUseNoBestFit() && m_CodePage < 50000 )
2385         {
2386             // it's our lucky day
2387             flags = WC_NO_BEST_FIT_CHARS;
2388             pUsedDef = &usedDef;
2389         }
2390         else // old system or unsupported encoding
2391         {
2392             flags = 0;
2393             pUsedDef = NULL;
2394         }
2395
2396         const size_t len = ::WideCharToMultiByte
2397                              (
2398                                 m_CodePage,     // code page
2399                                 flags,          // either none or no best fit
2400                                 pwz,            // input string
2401                                 -1,             // it is (wide) NUL-terminated
2402                                 buf,            // output buffer
2403                                 buf ? n : 0,    // and its size
2404                                 NULL,           // default "replacement" char
2405                                 pUsedDef        // [out] was it used?
2406                              );
2407
2408         if ( !len )
2409         {
2410             // function totally failed
2411             return wxCONV_FAILED;
2412         }
2413
2414         // if we were really converting, check if we succeeded
2415         if ( buf )
2416         {
2417             if ( flags )
2418             {
2419                 // check if the conversion failed, i.e. if any replacements
2420                 // were done
2421                 if ( usedDef )
2422                     return wxCONV_FAILED;
2423             }
2424             else // we must resort to double tripping...
2425             {
2426                 wxWCharBuffer wcBuf(n);
2427                 if ( MB2WC(wcBuf.data(), buf, n) == wxCONV_FAILED ||
2428                         wcscmp(wcBuf, pwz) != 0 )
2429                 {
2430                     // we didn't obtain the same thing we started from, hence
2431                     // the conversion was lossy and we consider that it failed
2432                     return wxCONV_FAILED;
2433                 }
2434             }
2435         }
2436
2437         // see the comment above for the reason of "len - 1"
2438         return len - 1;
2439     }
2440
2441     virtual size_t GetMBNulLen() const
2442     {
2443         if ( m_minMBCharWidth == 0 )
2444         {
2445             int len = ::WideCharToMultiByte
2446                         (
2447                             m_CodePage,     // code page
2448                             0,              // no flags
2449                             L"",            // input string
2450                             1,              // translate just the NUL
2451                             NULL,           // output buffer
2452                             0,              // and its size
2453                             NULL,           // no replacement char
2454                             NULL            // [out] don't care if it was used
2455                         );
2456
2457             wxMBConv_win32 * const self = wxConstCast(this, wxMBConv_win32);
2458             switch ( len )
2459             {
2460                 default:
2461                     wxLogDebug(_T("Unexpected NUL length %d"), len);
2462                     self->m_minMBCharWidth = (size_t)-1;
2463                     break;
2464
2465                 case 0:
2466                     self->m_minMBCharWidth = (size_t)-1;
2467                     break;
2468
2469                 case 1:
2470                 case 2:
2471                 case 4:
2472                     self->m_minMBCharWidth = len;
2473                     break;
2474             }
2475         }
2476
2477         return m_minMBCharWidth;
2478     }
2479
2480     virtual wxMBConv *Clone() const { return new wxMBConv_win32(*this); }
2481
2482     bool IsOk() const { return m_CodePage != -1; }
2483
2484 private:
2485     static bool CanUseNoBestFit()
2486     {
2487         static int s_isWin98Or2k = -1;
2488
2489         if ( s_isWin98Or2k == -1 )
2490         {
2491             int verMaj, verMin;
2492             switch ( wxGetOsVersion(&verMaj, &verMin) )
2493             {
2494                 case wxOS_WINDOWS_9X:
2495                     s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
2496                     break;
2497
2498                 case wxOS_WINDOWS_NT:
2499                     s_isWin98Or2k = verMaj >= 5;
2500                     break;
2501
2502                 default:
2503                     // unknown: be conservative by default
2504                     s_isWin98Or2k = 0;
2505                     break;
2506             }
2507
2508             wxASSERT_MSG( s_isWin98Or2k != -1, _T("should be set above") );
2509         }
2510
2511         return s_isWin98Or2k == 1;
2512     }
2513
2514     static bool IsAtLeastWin2kSP4()
2515     {
2516 #ifdef __WXWINCE__
2517         return false;
2518 #else
2519         static int s_isAtLeastWin2kSP4 = -1;
2520
2521         if ( s_isAtLeastWin2kSP4 == -1 )
2522         {
2523             OSVERSIONINFOEX ver;
2524
2525             memset(&ver, 0, sizeof(ver));
2526             ver.dwOSVersionInfoSize = sizeof(ver);
2527             GetVersionEx((OSVERSIONINFO*)&ver);
2528
2529             s_isAtLeastWin2kSP4 =
2530               ((ver.dwMajorVersion > 5) || // Vista+
2531                (ver.dwMajorVersion == 5 && ver.dwMinorVersion > 0) || // XP/2003
2532                (ver.dwMajorVersion == 5 && ver.dwMinorVersion == 0 &&
2533                ver.wServicePackMajor >= 4)) // 2000 SP4+
2534               ? 1 : 0;
2535         }
2536
2537         return s_isAtLeastWin2kSP4 == 1;
2538 #endif
2539     }
2540
2541
2542     // the code page we're working with
2543     long m_CodePage;
2544
2545     // cached result of GetMBNulLen(), set to 0 initially meaning
2546     // "unknown"
2547     size_t m_minMBCharWidth;
2548 };
2549
2550 #endif // wxHAVE_WIN32_MB2WC
2551
2552
2553 // ============================================================================
2554 // wxEncodingConverter based conversion classes
2555 // ============================================================================
2556
2557 #if wxUSE_FONTMAP
2558
2559 class wxMBConv_wxwin : public wxMBConv
2560 {
2561 private:
2562     void Init()
2563     {
2564         // Refuse to use broken wxEncodingConverter code for Mac-specific encodings.
2565         // The wxMBConv_cf class does a better job.
2566         m_ok = (m_enc < wxFONTENCODING_MACMIN || m_enc > wxFONTENCODING_MACMAX) &&
2567                m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
2568                w2m.Init(wxFONTENCODING_UNICODE, m_enc);
2569     }
2570
2571 public:
2572     // temporarily just use wxEncodingConverter stuff,
2573     // so that it works while a better implementation is built
2574     wxMBConv_wxwin(const char* name)
2575     {
2576         if (name)
2577             m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
2578         else
2579             m_enc = wxFONTENCODING_SYSTEM;
2580
2581         Init();
2582     }
2583
2584     wxMBConv_wxwin(wxFontEncoding enc)
2585     {
2586         m_enc = enc;
2587
2588         Init();
2589     }
2590
2591     size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
2592     {
2593         size_t inbuf = strlen(psz);
2594         if (buf)
2595         {
2596             if (!m2w.Convert(psz, buf))
2597                 return wxCONV_FAILED;
2598         }
2599         return inbuf;
2600     }
2601
2602     size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
2603     {
2604         const size_t inbuf = wxWcslen(psz);
2605         if (buf)
2606         {
2607             if (!w2m.Convert(psz, buf))
2608                 return wxCONV_FAILED;
2609         }
2610
2611         return inbuf;
2612     }
2613
2614     virtual size_t GetMBNulLen() const
2615     {
2616         switch ( m_enc )
2617         {
2618             case wxFONTENCODING_UTF16BE:
2619             case wxFONTENCODING_UTF16LE:
2620                 return 2;
2621
2622             case wxFONTENCODING_UTF32BE:
2623             case wxFONTENCODING_UTF32LE:
2624                 return 4;
2625
2626             default:
2627                 return 1;
2628         }
2629     }
2630
2631     virtual wxMBConv *Clone() const { return new wxMBConv_wxwin(m_enc); }
2632
2633     bool IsOk() const { return m_ok; }
2634
2635 public:
2636     wxFontEncoding m_enc;
2637     wxEncodingConverter m2w, w2m;
2638
2639 private:
2640     // were we initialized successfully?
2641     bool m_ok;
2642
2643     DECLARE_NO_COPY_CLASS(wxMBConv_wxwin)
2644 };
2645
2646 // make the constructors available for unit testing
2647 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_wxwin( const char* name )
2648 {
2649     wxMBConv_wxwin* result = new wxMBConv_wxwin( name );
2650     if ( !result->IsOk() )
2651     {
2652         delete result;
2653         return 0;
2654     }
2655
2656     return result;
2657 }
2658
2659 #endif // wxUSE_FONTMAP
2660
2661 // ============================================================================
2662 // wxCSConv implementation
2663 // ============================================================================
2664
2665 void wxCSConv::Init()
2666 {
2667     m_name = NULL;
2668     m_convReal =  NULL;
2669     m_deferred = true;
2670 }
2671
2672 wxCSConv::wxCSConv(const wxString& charset)
2673 {
2674     Init();
2675
2676     if ( !charset.empty() )
2677     {
2678         SetName(charset.ToAscii());
2679     }
2680
2681 #if wxUSE_FONTMAP
2682     m_encoding = wxFontMapperBase::GetEncodingFromName(charset);
2683 #else
2684     m_encoding = wxFONTENCODING_SYSTEM;
2685 #endif
2686 }
2687
2688 wxCSConv::wxCSConv(wxFontEncoding encoding)
2689 {
2690     if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
2691     {
2692         wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
2693
2694         encoding = wxFONTENCODING_SYSTEM;
2695     }
2696
2697     Init();
2698
2699     m_encoding = encoding;
2700 }
2701
2702 wxCSConv::~wxCSConv()
2703 {
2704     Clear();
2705 }
2706
2707 wxCSConv::wxCSConv(const wxCSConv& conv)
2708         : wxMBConv()
2709 {
2710     Init();
2711
2712     SetName(conv.m_name);
2713     m_encoding = conv.m_encoding;
2714 }
2715
2716 wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
2717 {
2718     Clear();
2719
2720     SetName(conv.m_name);
2721     m_encoding = conv.m_encoding;
2722
2723     return *this;
2724 }
2725
2726 void wxCSConv::Clear()
2727 {
2728     free(m_name);
2729     delete m_convReal;
2730
2731     m_name = NULL;
2732     m_convReal = NULL;
2733 }
2734
2735 void wxCSConv::SetName(const char *charset)
2736 {
2737     if (charset)
2738     {
2739         m_name = wxStrdup(charset);
2740         m_deferred = true;
2741     }
2742 }
2743
2744 #if wxUSE_FONTMAP
2745
// Hash map type associating a charset name with a font encoding.
WX_DECLARE_HASH_MAP( wxFontEncoding, wxString, wxIntegerHash, wxIntegerEqual,
                     wxEncodingNameCache );

// Cache used by wxCSConv::DoCreate() when iconv is available: for each
// encoding it stores the name for which a wxMBConv_iconv object could be
// successfully created, or an empty string if none of the names of this
// encoding worked, so that the failed attempts are not repeated.
static wxEncodingNameCache gs_nameCache;
2750 #endif
2751
2752 wxMBConv *wxCSConv::DoCreate() const
2753 {
2754 #if wxUSE_FONTMAP
2755     wxLogTrace(TRACE_STRCONV,
2756                wxT("creating conversion for %s"),
2757                (m_name ? m_name
2758                        : (const char*)wxFontMapperBase::GetEncodingName(m_encoding).mb_str()));
2759 #endif // wxUSE_FONTMAP
2760
2761     // check for the special case of ASCII or ISO8859-1 charset: as we have
2762     // special knowledge of it anyhow, we don't need to create a special
2763     // conversion object
2764     if ( m_encoding == wxFONTENCODING_ISO8859_1 ||
2765             m_encoding == wxFONTENCODING_DEFAULT )
2766     {
2767         // don't convert at all
2768         return NULL;
2769     }
2770
2771     // we trust OS to do conversion better than we can so try external
2772     // conversion methods first
2773     //
2774     // the full order is:
2775     //      1. OS conversion (iconv() under Unix or Win32 API)
2776     //      2. hard coded conversions for UTF
2777     //      3. wxEncodingConverter as fall back
2778
2779     // step (1)
2780 #ifdef HAVE_ICONV
2781 #if !wxUSE_FONTMAP
2782     if ( m_name )
2783 #endif // !wxUSE_FONTMAP
2784     {
2785 #if wxUSE_FONTMAP
2786         wxFontEncoding encoding(m_encoding);
2787 #endif
2788
2789         if ( m_name )
2790         {
2791             wxMBConv_iconv *conv = new wxMBConv_iconv(m_name);
2792             if ( conv->IsOk() )
2793                 return conv;
2794
2795             delete conv;
2796
2797 #if wxUSE_FONTMAP
2798             encoding =
2799                 wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
2800 #endif // wxUSE_FONTMAP
2801         }
2802 #if wxUSE_FONTMAP
2803         {
2804             const wxEncodingNameCache::iterator it = gs_nameCache.find(encoding);
2805             if ( it != gs_nameCache.end() )
2806             {
2807                 if ( it->second.empty() )
2808                     return NULL;
2809
2810                 wxMBConv_iconv *conv = new wxMBConv_iconv(it->second.ToAscii());
2811                 if ( conv->IsOk() )
2812                     return conv;
2813
2814                 delete conv;
2815             }
2816
2817             const wxChar** names = wxFontMapperBase::GetAllEncodingNames(encoding);
2818             // CS : in case this does not return valid names (eg for MacRoman)
2819             // encoding got a 'failure' entry in the cache all the same,
2820             // although it just has to be created using a different method, so
2821             // only store failed iconv creation attempts (or perhaps we
2822             // shoulnd't do this at all ?)
2823             if ( names[0] != NULL )
2824             {
2825                 for ( ; *names; ++names )
2826                 {
2827                     // FIXME-UTF8: wxFontMapperBase::GetAllEncodingNames()
2828                     //             will need changes that will obsolete this
2829                     wxString name(*names);
2830                     wxMBConv_iconv *conv = new wxMBConv_iconv(name.ToAscii());
2831                     if ( conv->IsOk() )
2832                     {
2833                         gs_nameCache[encoding] = *names;
2834                         return conv;
2835                     }
2836
2837                     delete conv;
2838                 }
2839
2840                 gs_nameCache[encoding] = _T(""); // cache the failure
2841             }
2842         }
2843 #endif // wxUSE_FONTMAP
2844     }
2845 #endif // HAVE_ICONV
2846
2847 #ifdef wxHAVE_WIN32_MB2WC
2848     {
2849 #if wxUSE_FONTMAP
2850         wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
2851                                       : new wxMBConv_win32(m_encoding);
2852         if ( conv->IsOk() )
2853             return conv;
2854
2855         delete conv;
2856 #else
2857         return NULL;
2858 #endif
2859     }
2860 #endif // wxHAVE_WIN32_MB2WC
2861
2862 #ifdef __DARWIN__
2863     {
2864         // leave UTF16 and UTF32 to the built-ins of wx
2865         if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
2866             ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
2867         {
2868 #if wxUSE_FONTMAP
2869             wxMBConv_cf *conv = m_name ? new wxMBConv_cf(m_name)
2870                                           : new wxMBConv_cf(m_encoding);
2871 #else
2872             wxMBConv_cf *conv = new wxMBConv_cf(m_encoding);
2873 #endif
2874
2875             if ( conv->IsOk() )
2876                  return conv;
2877
2878             delete conv;
2879         }
2880     }
2881 #endif // __DARWIN__
2882
2883     // step (2)
2884     wxFontEncoding enc = m_encoding;
2885 #if wxUSE_FONTMAP
2886     if ( enc == wxFONTENCODING_SYSTEM && m_name )
2887     {
2888         // use "false" to suppress interactive dialogs -- we can be called from
2889         // anywhere and popping up a dialog from here is the last thing we want to
2890         // do
2891         enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
2892     }
2893 #endif // wxUSE_FONTMAP
2894
2895     switch ( enc )
2896     {
2897         case wxFONTENCODING_UTF7:
2898              return new wxMBConvUTF7;
2899
2900         case wxFONTENCODING_UTF8:
2901              return new wxMBConvUTF8;
2902
2903         case wxFONTENCODING_UTF16BE:
2904              return new wxMBConvUTF16BE;
2905
2906         case wxFONTENCODING_UTF16LE:
2907              return new wxMBConvUTF16LE;
2908
2909         case wxFONTENCODING_UTF32BE:
2910              return new wxMBConvUTF32BE;
2911
2912         case wxFONTENCODING_UTF32LE:
2913              return new wxMBConvUTF32LE;
2914
2915         default:
2916              // nothing to do but put here to suppress gcc warnings
2917              break;
2918     }
2919
2920     // step (3)
2921 #if wxUSE_FONTMAP
2922     {
2923         wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
2924                                       : new wxMBConv_wxwin(m_encoding);
2925         if ( conv->IsOk() )
2926             return conv;
2927
2928         delete conv;
2929     }
2930 #endif // wxUSE_FONTMAP
2931
2932     // NB: This is a hack to prevent deadlock. What could otherwise happen
2933     //     in Unicode build: wxConvLocal creation ends up being here
2934     //     because of some failure and logs the error. But wxLog will try to
2935     //     attach a timestamp, for which it will need wxConvLocal (to convert
2936     //     time to char* and then wchar_t*), but that fails, tries to log the
2937     //     error, but wxLog has an (already locked) critical section that
2938     //     guards the static buffer.
2939     static bool alreadyLoggingError = false;
2940     if (!alreadyLoggingError)
2941     {
2942         alreadyLoggingError = true;
2943         wxLogError(_("Cannot convert from the charset '%s'!"),
2944                    m_name ? m_name
2945                       :
2946 #if wxUSE_FONTMAP
2947                          (const char*)wxFontMapperBase::GetEncodingDescription(m_encoding).ToAscii()
2948 #else // !wxUSE_FONTMAP
2949                          (const char*)wxString::Format(_("encoding %i"), m_encoding).ToAscii()
2950 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
2951               );
2952
2953         alreadyLoggingError = false;
2954     }
2955
2956     return NULL;
2957 }
2958
2959 void wxCSConv::CreateConvIfNeeded() const
2960 {
2961     if ( m_deferred )
2962     {
2963         wxCSConv *self = (wxCSConv *)this; // const_cast
2964
2965         // if we don't have neither the name nor the encoding, use the default
2966         // encoding for this system
2967         if ( !m_name && m_encoding == wxFONTENCODING_SYSTEM )
2968         {
2969 #if wxUSE_INTL
2970             self->m_encoding = wxLocale::GetSystemEncoding();
2971 #else
2972             // fallback to some reasonable default:
2973             self->m_encoding = wxFONTENCODING_ISO8859_1;
2974 #endif // wxUSE_INTL
2975         }
2976
2977         self->m_convReal = DoCreate();
2978         self->m_deferred = false;
2979     }
2980 }
2981
2982 bool wxCSConv::IsOk() const
2983 {
2984     CreateConvIfNeeded();
2985
2986     // special case: no convReal created for wxFONTENCODING_ISO8859_1
2987     if ( m_encoding == wxFONTENCODING_ISO8859_1 )
2988         return true; // always ok as we do it ourselves
2989
2990     // m_convReal->IsOk() is called at its own creation, so we know it must
2991     // be ok if m_convReal is non-NULL
2992     return m_convReal != NULL;
2993 }
2994
2995 size_t wxCSConv::ToWChar(wchar_t *dst, size_t dstLen,
2996                          const char *src, size_t srcLen) const
2997 {
2998     CreateConvIfNeeded();
2999
3000     if (m_convReal)
3001         return m_convReal->ToWChar(dst, dstLen, src, srcLen);
3002
3003     // latin-1 (direct)
3004     return wxMBConv::ToWChar(dst, dstLen, src, srcLen);
3005 }
3006
3007 size_t wxCSConv::FromWChar(char *dst, size_t dstLen,
3008                            const wchar_t *src, size_t srcLen) const
3009 {
3010     CreateConvIfNeeded();
3011
3012     if (m_convReal)
3013         return m_convReal->FromWChar(dst, dstLen, src, srcLen);
3014
3015     // latin-1 (direct)
3016     return wxMBConv::FromWChar(dst, dstLen, src, srcLen);
3017 }
3018
3019 size_t wxCSConv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
3020 {
3021     CreateConvIfNeeded();
3022
3023     if (m_convReal)
3024         return m_convReal->MB2WC(buf, psz, n);
3025
3026     // latin-1 (direct)
3027     size_t len = strlen(psz);
3028
3029     if (buf)
3030     {
3031         for (size_t c = 0; c <= len; c++)
3032             buf[c] = (unsigned char)(psz[c]);
3033     }
3034
3035     return len;
3036 }
3037
3038 size_t wxCSConv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
3039 {
3040     CreateConvIfNeeded();
3041
3042     if (m_convReal)
3043         return m_convReal->WC2MB(buf, psz, n);
3044
3045     // latin-1 (direct)
3046     const size_t len = wxWcslen(psz);
3047     if (buf)
3048     {
3049         for (size_t c = 0; c <= len; c++)
3050         {
3051             if (psz[c] > 0xFF)
3052                 return wxCONV_FAILED;
3053
3054             buf[c] = (char)psz[c];
3055         }
3056     }
3057     else
3058     {
3059         for (size_t c = 0; c <= len; c++)
3060         {
3061             if (psz[c] > 0xFF)
3062                 return wxCONV_FAILED;
3063         }
3064     }
3065
3066     return len;
3067 }
3068
3069 size_t wxCSConv::GetMBNulLen() const
3070 {
3071     CreateConvIfNeeded();
3072
3073     if ( m_convReal )
3074     {
3075         return m_convReal->GetMBNulLen();
3076     }
3077
3078     // otherwise, we are ISO-8859-1
3079     return 1;
3080 }
3081
3082 #if wxUSE_UNICODE_UTF8
3083 bool wxCSConv::IsUTF8() const
3084 {
3085     CreateConvIfNeeded();
3086
3087     if ( m_convReal )
3088     {
3089         return m_convReal->IsUTF8();
3090     }
3091
3092     // otherwise, we are ISO-8859-1
3093     return false;
3094 }
3095 #endif
3096
3097
3098 #if wxUSE_UNICODE
3099
3100 wxWCharBuffer wxSafeConvertMB2WX(const char *s)
3101 {
3102     if ( !s )
3103         return wxWCharBuffer();
3104
3105     wxWCharBuffer wbuf(wxConvLibc.cMB2WX(s));
3106     if ( !wbuf )
3107         wbuf = wxMBConvUTF8().cMB2WX(s);
3108     if ( !wbuf )
3109         wbuf = wxConvISO8859_1.cMB2WX(s);
3110
3111     return wbuf;
3112 }
3113
3114 wxCharBuffer wxSafeConvertWX2MB(const wchar_t *ws)
3115 {
3116     if ( !ws )
3117         return wxCharBuffer();
3118
3119     wxCharBuffer buf(wxConvLibc.cWX2MB(ws));
3120     if ( !buf )
3121         buf = wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL).cWX2MB(ws);
3122
3123     return buf;
3124 }
3125
3126 #endif // wxUSE_UNICODE
3127
3128 // ----------------------------------------------------------------------------
3129 // globals
3130 // ----------------------------------------------------------------------------
3131
3132 // NB: The reason why we create converter objects in this convoluted way,
3133 //     using a factory function instead of global variable, is that they
3134 //     may be used at static initialization time (some of them are used by
3135 //     wxString ctors and there may be a global wxString object). In other
3136 //     words, possibly _before_ the converter global object would be
3137 //     initialized.
3138
     // Remove any macro definitions of the converter names before they are
     // passed as the "name" argument of the macros below (presumably they are
     // defined as macros by wx/strconv.h -- TODO confirm).
3139 #undef wxConvLibc
3140 #undef wxConvUTF8
3141 #undef wxConvUTF7
3142 #undef wxConvLocal
3143 #undef wxConvISO8859_1
3144
     // WX_DEFINE_GLOBAL_CONV2(klass, impl_klass, name, ctor_args) defines, for
     // the converter called "name":
     //
     //  - the pointer namePtr of type klass*, initialized to NULL
     //  - the function wxGet_namePtr() returning the address of its local
     //    static object nameObj of type impl_klass, which is constructed with
     //    ctor_args the first time the function is called
     //  - the file-static pointer gs_nameinstance whose initializer calls
     //    wxGet_namePtr(), see the comment inside the macro for the reason
     //
     // ctor_args is pasted verbatim after the object name, so it must be
     // either a parenthesized argument list or something expanding to nothing.
3145 #define WX_DEFINE_GLOBAL_CONV2(klass, impl_klass, name, ctor_args)      \
3146     WXDLLIMPEXP_DATA_BASE(klass*) name##Ptr = NULL;                     \
3147     WXDLLIMPEXP_BASE klass* wxGet_##name##Ptr()                         \
3148     {                                                                   \
3149         static impl_klass name##Obj ctor_args;                          \
3150         return &name##Obj;                                              \
3151     }                                                                   \
3152     /* this ensures that all global converter objects are created */    \
3153     /* by the time static initialization is done, i.e. before any */    \
3154     /* thread is launched: */                                           \
3155     static klass* gs_##name##instance = wxGet_##name##Ptr()
3156
     // Same as WX_DEFINE_GLOBAL_CONV2() for the common case when the class of
     // the object is the same as the class used for the pointers to it.
3157 #define WX_DEFINE_GLOBAL_CONV(klass, name, ctor_args) \
3158     WX_DEFINE_GLOBAL_CONV2(klass, klass, name, ctor_args)
3159
     // wxConvLibc is accessed as a plain wxMBConv but is implemented by
     // wxMBConv_win32 under Windows and by wxMBConvLibc elsewhere; no
     // constructor arguments are given in either case (wxEMPTY_PARAMETER_VALUE
     // presumably expands to nothing -- TODO confirm).
3160 #ifdef __WINDOWS__
3161     WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConv_win32, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
3162 #else
3163     WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConvLibc, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
3164 #endif
3165
     // the UTF-8 and UTF-7 converters don't take any constructor arguments
     // either
3166 WX_DEFINE_GLOBAL_CONV(wxMBConvStrictUTF8, wxConvUTF8, wxEMPTY_PARAMETER_VALUE);
3167 WX_DEFINE_GLOBAL_CONV(wxMBConvUTF7, wxConvUTF7, wxEMPTY_PARAMETER_VALUE);
3168
     // these two are wxCSConv objects constructed from a font encoding
3169 WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvLocal, (wxFONTENCODING_SYSTEM));
3170 WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvISO8859_1, (wxFONTENCODING_ISO8859_1));
3171
     // these pointers are initialized by calling the accessor functions
     // defined by the macros above, which construct the corresponding object
     // on first call if it doesn't exist yet
3172 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = wxGet_wxConvLibcPtr();
3173 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvUI = wxGet_wxConvLocalPtr();
3174
3175 #ifdef __DARWIN__
3176 // The xnu kernel always communicates file paths in decomposed UTF-8.
3177 // WARNING: Are we sure that CFString's conversion will cause decomposition?
3178 static wxMBConv_cf wxConvMacUTF8DObj(wxFONTENCODING_UTF8);
3179 #endif
3180
     // the converter used for file names: the wxMBConv_cf object defined above
     // under Darwin, the same converter as wxConvLibc everywhere else
3181 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName =
3182 #ifdef __DARWIN__
3183                                     &wxConvMacUTF8DObj;
3184 #else // !__DARWIN__
3185                                     wxGet_wxConvLibcPtr();
3186 #endif // __DARWIN__/!__DARWIN__
3187
3188 #else // !wxUSE_WCHAR_T
3189
3190 // FIXME-UTF8: remove this, wxUSE_WCHAR_T is required now
3191 // stand-ins in absence of wchar_t
     //
     // Unlike in the wxUSE_WCHAR_T branch, where factory functions and
     // pointers are used, the names are declared directly with the type
     // wxMBConv here and no wxConvUTF7 is provided.
3192 WXDLLIMPEXP_DATA_BASE(wxMBConv) wxConvLibc,
3193                                 wxConvISO8859_1,
3194                                 wxConvLocal,
3195                                 wxConvUTF8;
3196
3197 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T