src/common/strconv.cpp

   1 /////////////////////////////////////////////////////////////////////////////
   2 // Name:        src/common/strconv.cpp
   3 // Purpose:     Unicode conversion classes
   4 // Author:      Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
   5 //              Ryan Norton, Fredrik Roubert (UTF7)
   6 // Modified by:
   7 // Created:     29/01/98
   8 // RCS-ID:      $Id$
   9 // Copyright:   (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
  10 //              (c) 2000-2003 Vadim Zeitlin
  11 //              (c) 2004 Ryan Norton, Fredrik Roubert
  12 // Licence:     wxWindows licence
  13 /////////////////////////////////////////////////////////////////////////////
  14
  15 // For compilers that support precompilation, includes "wx.h".
  16 #include "wx/wxprec.h"
  17
  18 #ifdef __BORLANDC__
  19     #pragma hdrstop
  20 #endif  //__BORLANDC__
  21
  22 #ifndef WX_PRECOMP
  23     #include "wx/intl.h"
  24     #include "wx/log.h"
  25     #include "wx/utils.h"
  26     #include "wx/hashmap.h"
  27 #endif
  28
  29 #include "wx/strconv.h"
  30
  31 #if wxUSE_WCHAR_T
  32
  33 #ifndef __WXWINCE__
  34 #include <errno.h>
  35 #endif
  36
  37 #include <ctype.h>
  38 #include <string.h>
  39 #include <stdlib.h>
  40
  41 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
  42     #include "wx/msw/private.h"
  43     #include "wx/msw/missing.h"
  44     #define wxHAVE_WIN32_MB2WC
  45 #endif
  46
  47 #ifdef __SALFORDC__
  48     #include <clib.h>
  49 #endif
  50
  51 #ifdef HAVE_ICONV
  52     #include <iconv.h>
  53     #include "wx/thread.h"
  54 #endif
  55
  56 #include "wx/encconv.h"
  57 #include "wx/fontmap.h"
  58
  59 #ifdef __DARWIN__
  60 #include "wx/mac/corefoundation/private/strconv_cf.h"
  61 #endif //def __DARWIN__
  62
  63
  64 #define TRACE_STRCONV _T("strconv")
  65
  66 // WC_UTF16 is defined only if sizeof(wchar_t) == 2, otherwise it's supposed to
  67 // be 4 bytes
  68 #if SIZEOF_WCHAR_T == 2
  69     #define WC_UTF16
  70 #endif
  71
  72
  73 // ============================================================================
  74 // implementation
  75 // ============================================================================
  76
  77 // helper function of cMB2WC(): check if n bytes at this location are all NUL
  78 static bool NotAllNULs(const char *p, size_t n)
  79 {
  80     while ( n && *p++ == '\0' )
  81         n--;
  82
  83     return n != 0;
  84 }
  85
  86 // ----------------------------------------------------------------------------
  87 // UTF-16 en/decoding to/from UCS-4 with surrogates handling
  88 // ----------------------------------------------------------------------------
  89
  90 static size_t encode_utf16(wxUint32 input, wxUint16 *output)
  91 {
  92     if (input <= 0xffff)
  93     {
  94         if (output)
  95             *output = (wxUint16) input;
  96
  97         return 1;
  98     }
  99     else if (input >= 0x110000)
 100     {
 101         return wxCONV_FAILED;
 102     }
 103     else
 104     {
 105         if (output)
 106         {
 107             *output++ = (wxUint16) ((input >> 10) + 0xd7c0);
 108             *output = (wxUint16) ((input & 0x3ff) + 0xdc00);
 109         }
 110
 111         return 2;
 112     }
 113 }
 114
 115 static size_t decode_utf16(const wxUint16* input, wxUint32& output)
 116 {
 117     if ((*input < 0xd800) || (*input > 0xdfff))
 118     {
 119         output = *input;
 120         return 1;
 121     }
 122     else if ((input[1] < 0xdc00) || (input[1] > 0xdfff))
 123     {
 124         output = *input;
 125         return wxCONV_FAILED;
 126     }
 127     else
 128     {
 129         output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
 130         return 2;
 131     }
 132 }
 133
// element type of the buffers decoded by wxDecodeSurrogate(): wchar_t itself
// when WC_UTF16 is defined and wxUint16 otherwise
#ifdef WC_UTF16
    typedef wchar_t wxDecodeSurrogate_t;
#else // !WC_UTF16
    typedef wxUint16 wxDecodeSurrogate_t;
#endif // WC_UTF16/!WC_UTF16
 139
 140 // returns the next UTF-32 character from the wchar_t buffer and advances the
 141 // pointer to the character after this one
 142 //
 143 // if an invalid character is found, *pSrc is set to NULL, the caller must
 144 // check for this
 145 static wxUint32 wxDecodeSurrogate(const wxDecodeSurrogate_t **pSrc)
 146 {
 147     wxUint32 out;
 148     const size_t
 149         n = decode_utf16(wx_reinterpret_cast(const wxUint16 *, *pSrc), out);
 150     if ( n == wxCONV_FAILED )
 151         *pSrc = NULL;
 152     else
 153         *pSrc += n;
 154
 155     return out;
 156 }
 157
 158 // ----------------------------------------------------------------------------
 159 // wxMBConv
 160 // ----------------------------------------------------------------------------
 161
// Default implementation of ToWChar() in terms of the older MB2WC().
//
// Converts srcLen bytes of src (or the NUL-terminated string src if srcLen is
// wxNO_LEN) to wide characters. If dst is NULL nothing is written and only
// the length is computed, otherwise the function fails if more than dstLen
// wide characters would be needed.
//
// Returns the number of wide characters [which would be] written, counting
// one L'\0' for each converted chunk, or wxCONV_FAILED if GetMBNulLen() or
// MB2WC() fails or if dst is too small.
size_t
wxMBConv::ToWChar(wchar_t *dst, size_t dstLen,
                  const char *src, size_t srcLen) const
{
    // although new conversion classes are supposed to implement this function
    // directly, the existing ones only implement the old MB2WC() and so, to
    // avoid having to rewrite all conversion classes at once, we provide a
    // default (but not efficient) implementation of this one in terms of the
    // old function by copying the input to ensure that it's NUL-terminated and
    // then using MB2WC() to convert it

    // the number of chars [which would be] written to dst [if it were not NULL]
    size_t dstWritten = 0;

    // the number of NULs terminating this string
    size_t nulLen = 0;  // not really needed, but just to avoid warnings

    // if we were not given the input size we just have to assume that the
    // string is properly terminated as we have no way of knowing how long it
    // is anyhow, but if we do have the size check whether there are enough
    // NULs at the end
    wxCharBuffer bufTmp;
    const char *srcEnd;
    if ( srcLen != wxNO_LEN )
    {
        // we need to know how to find the end of this string
        nulLen = GetMBNulLen();
        if ( nulLen == wxCONV_FAILED )
            return wxCONV_FAILED;

        // if there are enough NULs we can avoid the copy
        if ( srcLen < nulLen || NotAllNULs(src + srcLen - nulLen, nulLen) )
        {
            // make a copy in order to properly NUL-terminate the string
            bufTmp = wxCharBuffer(srcLen + nulLen - 1 /* 1 will be added */);
            char * const p = bufTmp.data();
            memcpy(p, src, srcLen);

            // append nulLen NUL bytes after the copied data
            for ( char *s = p + srcLen; s < p + srcLen + nulLen; s++ )
                *s = '\0';

            src = bufTmp;
        }

        // computed after src may have been redirected to the copy above
        srcEnd = src + srcLen;
    }
    else // quit after the first loop iteration
    {
        srcEnd = NULL;
    }

    // each iteration converts one NUL-terminated chunk of the input
    for ( ;; )
    {
        // try to convert the current chunk
        size_t lenChunk = MB2WC(NULL, src, 0);
        if ( lenChunk == wxCONV_FAILED )
            return wxCONV_FAILED;

        lenChunk++; // for the L'\0' at the end of this chunk

        dstWritten += lenChunk;

        if ( lenChunk == 1 )
        {
            // nothing left in the input string, conversion succeeded
            //
            // notice that the terminator of this empty chunk was counted in
            // dstWritten above but nothing is stored in dst for it
            break;
        }

        if ( dst )
        {
            if ( dstWritten > dstLen )
                return wxCONV_FAILED;

            if ( MB2WC(dst, src, lenChunk) == wxCONV_FAILED )
                return wxCONV_FAILED;

            dst += lenChunk;
        }

        if ( !srcEnd )
        {
            // we convert just one chunk in this case as this is the entire
            // string anyhow
            break;
        }

        // advance the input pointer past the end of this chunk
        while ( NotAllNULs(src, nulLen) )
        {
            // notice that we must skip over multiple bytes here as we suppose
            // that if NUL takes 2 or 4 bytes, then all the other characters do
            // too and so if advanced by a single byte we might erroneously
            // detect sequences of NUL bytes in the middle of the input
            src += nulLen;
        }

        src += nulLen; // skipping over its terminator as well

        // note that ">=" (and not just "==") is needed here as the terminator
        // we skipped just above could be inside or just after the buffer
        // delimited by srcEnd
        if ( src >= srcEnd )
            break;
    }

    return dstWritten;
}
 268
 269 size_t
 270 wxMBConv::FromWChar(char *dst, size_t dstLen,
 271                     const wchar_t *src, size_t srcLen) const
 272 {
 273     // the number of chars [which would be] written to dst [if it were not NULL]
 274     size_t dstWritten = 0;
 275
 276     // make a copy of the input string unless it is already properly
 277     // NUL-terminated
 278     //
 279     // if we don't know its length we have no choice but to assume that it is,
 280     // indeed, properly terminated
 281     wxWCharBuffer bufTmp;
 282     if ( srcLen == wxNO_LEN )
 283     {
 284         srcLen = wxWcslen(src) + 1;
 285     }
 286     else if ( srcLen != 0 && src[srcLen - 1] != L'\0' )
 287     {
 288         // make a copy in order to properly NUL-terminate the string
 289         bufTmp = wxWCharBuffer(srcLen);
 290         memcpy(bufTmp.data(), src, srcLen * sizeof(wchar_t));
 291         src = bufTmp;
 292     }
 293
 294     const size_t lenNul = GetMBNulLen();
 295     for ( const wchar_t * const srcEnd = src + srcLen;
 296           src < srcEnd;
 297           src += wxWcslen(src) + 1 /* skip L'\0' too */ )
 298     {
 299         // try to convert the current chunk
 300         size_t lenChunk = WC2MB(NULL, src, 0);
 301
 302         if ( lenChunk == wxCONV_FAILED )
 303             return wxCONV_FAILED;
 304
 305         lenChunk += lenNul;
 306         dstWritten += lenChunk;
 307
 308         if ( dst )
 309         {
 310             if ( dstWritten > dstLen )
 311                 return wxCONV_FAILED;
 312
 313             if ( WC2MB(dst, src, lenChunk) == wxCONV_FAILED )
 314                 return wxCONV_FAILED;
 315
 316             dst += lenChunk;
 317         }
 318     }
 319
 320     return dstWritten;
 321 }
 322
 323 size_t wxMBConv::MB2WC(wchar_t *outBuff, const char *inBuff, size_t outLen) const
 324 {
 325     size_t rc = ToWChar(outBuff, outLen, inBuff);
 326     if ( rc != wxCONV_FAILED )
 327     {
 328         // ToWChar() returns the buffer length, i.e. including the trailing
 329         // NUL, while this method doesn't take it into account
 330         rc--;
 331     }
 332
 333     return rc;
 334 }
 335
 336 size_t wxMBConv::WC2MB(char *outBuff, const wchar_t *inBuff, size_t outLen) const
 337 {
 338     size_t rc = FromWChar(outBuff, outLen, inBuff);
 339     if ( rc != wxCONV_FAILED )
 340     {
 341         rc -= GetMBNulLen();
 342     }
 343
 344     return rc;
 345 }
 346
// Out-of-line destructor: intentionally empty, it is only defined here so
// that the definition lives in this translation unit.
wxMBConv::~wxMBConv()
{
    // nothing to do here (necessary for Darwin linking probably)
}
 351
 352 const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
 353 {
 354     if ( psz )
 355     {
 356         // calculate the length of the buffer needed first
 357         const size_t nLen = MB2WC(NULL, psz, 0);
 358         if ( nLen != wxCONV_FAILED )
 359         {
 360             // now do the actual conversion
 361             wxWCharBuffer buf(nLen /* +1 added implicitly */);
 362
 363             // +1 for the trailing NULL
 364             if ( MB2WC(buf.data(), psz, nLen + 1) != wxCONV_FAILED )
 365                 return buf;
 366         }
 367     }
 368
 369     return wxWCharBuffer();
 370 }
 371
 372 const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
 373 {
 374     if ( pwz )
 375     {
 376         const size_t nLen = WC2MB(NULL, pwz, 0);
 377         if ( nLen != wxCONV_FAILED )
 378         {
 379             // extra space for trailing NUL(s)
 380             static const size_t extraLen = GetMaxMBNulLen();
 381
 382             wxCharBuffer buf(nLen + extraLen - 1);
 383             if ( WC2MB(buf.data(), pwz, nLen + extraLen) != wxCONV_FAILED )
 384                 return buf;
 385         }
 386     }
 387
 388     return wxCharBuffer();
 389 }
 390
 391 const wxWCharBuffer
 392 wxMBConv::cMB2WC(const char *inBuff, size_t inLen, size_t *outLen) const
 393 {
 394     const size_t dstLen = ToWChar(NULL, 0, inBuff, inLen);
 395     if ( dstLen != wxCONV_FAILED )
 396     {
 397         wxWCharBuffer wbuf(dstLen - 1);
 398         if ( ToWChar(wbuf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
 399         {
 400             if ( outLen )
 401             {
 402                 *outLen = dstLen;
 403                 if ( wbuf[dstLen - 1] == L'\0' )
 404                     (*outLen)--;
 405             }
 406
 407             return wbuf;
 408         }
 409     }
 410
 411     if ( outLen )
 412         *outLen = 0;
 413
 414     return wxWCharBuffer();
 415 }
 416
 417 const wxCharBuffer
 418 wxMBConv::cWC2MB(const wchar_t *inBuff, size_t inLen, size_t *outLen) const
 419 {
 420     size_t dstLen = FromWChar(NULL, 0, inBuff, inLen);
 421     if ( dstLen != wxCONV_FAILED )
 422     {
 423         // special case of empty input: can't allocate 0 size buffer below as
 424         // wxCharBuffer insists on NUL-terminating it
 425         wxCharBuffer buf(dstLen ? dstLen - 1 : 1);
 426         if ( FromWChar(buf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
 427         {
 428             if ( outLen )
 429             {
 430                 *outLen = dstLen;
 431
 432                 const size_t nulLen = GetMBNulLen();
 433                 if ( dstLen >= nulLen &&
 434                         !NotAllNULs(buf.data() + dstLen - nulLen, nulLen) )
 435                 {
 436                     // in this case the output is NUL-terminated and we're not
 437                     // supposed to count NUL
 438                     *outLen -= nulLen;
 439                 }
 440             }
 441
 442             return buf;
 443         }
 444     }
 445
 446     if ( outLen )
 447         *outLen = 0;
 448
 449     return wxCharBuffer();
 450 }
 451
 452 // ----------------------------------------------------------------------------
 453 // wxMBConvLibc
 454 // ----------------------------------------------------------------------------
 455
// Multibyte to wide char conversion: simply forwards all its arguments to
// wxMB2WC() and returns its result.
size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
{
    return wxMB2WC(buf, psz, n);
}
 460
// Wide char to multibyte conversion: simply forwards all its arguments to
// wxWC2MB() and returns its result.
size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
{
    return wxWC2MB(buf, psz, n);
}
 465
 466 // ----------------------------------------------------------------------------
 467 // wxConvBrokenFileNames
 468 // ----------------------------------------------------------------------------
 469
 470 #ifdef __UNIX__
 471
 472 wxConvBrokenFileNames::wxConvBrokenFileNames(const wxString& charset)
 473 {
 474     if ( wxStricmp(charset, _T("UTF-8")) == 0 ||
 475          wxStricmp(charset, _T("UTF8")) == 0  )
 476         m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_PUA);
 477     else
 478         m_conv = new wxCSConv(charset);
 479 }
 480
 481 #endif // __UNIX__
 482
 483 // ----------------------------------------------------------------------------
 484 // UTF-7
 485 // ----------------------------------------------------------------------------
 486
 487 // Implementation (C) 2004 Fredrik Roubert
 488
//
// BASE64 decoding table
//
// Indexed by the byte value: gives the 6 bit value (0x00..0x3f) of the bytes
// 'A'..'Z', 'a'..'z', '0'..'9', '+' and '/' and 0xff for all the other ones,
// including NUL.
//
static const unsigned char utf7unb64[] =
{
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
    0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b,
    0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
    0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
    0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
    0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
    0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
    0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30,
    0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
};
 527
// Decodes the NUL-terminated UTF-7 string psz.
//
// If buf is NULL, nothing is written and only the length is computed.
// Otherwise the decoded characters are stored in buf and decoding stops at
// the end of the input or when the outer loop sees that n characters have
// been produced; the output is NUL-terminated only if fewer than n
// characters were produced.
//
// Returns the number of characters produced (not counting the terminating
// NUL) or wxCONV_FAILED if a '+' starting a BASE64 run is not followed by
// enough BASE64 characters to decode at least one byte.
size_t wxMBConvUTF7::MB2WC(wchar_t *buf, const char *psz, size_t n) const
{
    size_t len = 0;

    while ( *psz && (!buf || (len < n)) )
    {
        unsigned char cc = *psz++;
        if (cc != '+')
        {
            // plain ASCII char
            if (buf)
                *buf++ = cc;
            len++;
        }
        else if (*psz == '-')
        {
            // encoded plus sign
            if (buf)
                *buf++ = cc;
            len++;
            psz++;
        }
        else // start of BASE64 encoded string
        {
            // d accumulates the decoded bits, l is the number of bits in d
            // not yet consumed, lsb tells whether the next byte extracted is
            // the low (true) or the high (false) byte of the output
            // character and ok becomes true once at least one byte has been
            // extracted
            bool lsb, ok;
            unsigned int d, l;

            // the loop stops at the first byte which is not a BASE64
            // character, this includes NUL as its table entry is 0xff too
            //
            // NOTE(review): unlike the outer loop, this one doesn't compare
            // len with n, so a long BASE64 run looks like it can write more
            // than n characters to buf -- confirm that the callers always
            // size the buffer using a previous call with NULL buf
            for ( ok = lsb = false, d = 0, l = 0;
                  (cc = utf7unb64[(unsigned char)*psz]) != 0xff;
                  psz++ )
            {
                // append the 6 new bits
                d <<= 6;
                d += cc;

                // and extract whole bytes while we have enough bits for them
                for (l += 6; l >= 8; lsb = !lsb)
                {
                    unsigned char c = (unsigned char)((d >> (l -= 8)) % 256);
                    if (lsb)
                    {
                        // low byte: completes the current character
                        if (buf)
                            *buf++ |= c;
                        len ++;
                    }
                    else
                    {
                        // high byte: starts a new character, which is only
                        // counted once its low byte is seen
                        if (buf)
                            *buf = (wchar_t)(c << 8);
                    }

                    ok = true;
                }
            }

            if ( !ok )
            {
                // in valid UTF7 we should have valid characters after '+'
                return wxCONV_FAILED;
            }

            // skip the optional '-' terminating the BASE64 run
            if (*psz == '-')
                psz++;
        }
    }

    if ( buf && (len < n) )
        *buf = '\0';

    return len;
}
 595
//
// BASE64 encoding table
//
// Maps a 6 bit value (0..63) to the character used to represent it.
//
static const unsigned char utf7enb64[] =
{
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
    'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
    'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
    'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
    'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
    'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
    'w', 'x', 'y', 'z', '0', '1', '2', '3',
    '4', '5', '6', '7', '8', '9', '+', '/'
};
 610
//
// UTF-7 encoding table
//
// Indexed by the ASCII code (0..127) of the character, each entry gives the
// class of this character:
//
// 0 - Set D (directly encoded characters)
// 1 - Set O (optional direct characters)
// 2 - whitespace characters (optional)
// 3 - special characters
//
// NOTE(review): '/' (0x2f) is in class 3 here although it looks like it
// belongs to Set D in RFC 2152 -- confirm whether this is intentional.
//
static const unsigned char utf7encode[128] =
{
    3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
};
 630
// Encodes the NUL-terminated wide string psz as UTF-7.
//
// If buf is NULL, nothing is written and only the length is computed.
// Otherwise the encoded bytes are stored in buf and encoding stops at the
// end of the input or when the outer loop sees that n bytes have been
// produced; the output is NUL-terminated only if fewer than n bytes were
// produced.
//
// Only the characters of class 0 in utf7encode table are output directly,
// all the others are BASE64-encoded, except for '+' itself which is output
// as "+-".
//
// Returns the number of bytes produced (not counting the terminating NUL)
// or, if WC_UTF16 is not defined, wxCONV_FAILED if a character greater than
// 0xffff is encountered by the outer loop.
size_t wxMBConvUTF7::WC2MB(char *buf, const wchar_t *psz, size_t n) const
{
    size_t len = 0;

    while (*psz && ((!buf) || (len < n)))
    {
        wchar_t cc = *psz++;
        if (cc < 0x80 && utf7encode[cc] < 1)
        {
            // plain ASCII char
            if (buf)
                *buf++ = (char)cc;

            len++;
        }
#ifndef WC_UTF16
        else if (((wxUint32)cc) > 0xffff)
        {
            // no surrogate pair generation (yet?)
            return wxCONV_FAILED;
        }
#endif
        else
        {
            // start of the encoded sequence
            if (buf)
                *buf++ = '+';

            len++;
            if (cc != '+')
            {
                // BASE64 encode string
                //
                // d accumulates the bits to encode, l is the number of bits
                // in d not yet output and lsb selects the low (1) or high (0)
                // byte of the current character
                //
                // NOTE(review): neither len nor, in non-WC_UTF16 case, the
                // range of the characters following the first one are
                // checked inside this loop, unlike in the outer one
                unsigned int lsb, d, l;
                for (d = 0, l = 0; /*nothing*/; psz++)
                {
                    // the character is encoded high byte first
                    for (lsb = 0; lsb < 2; lsb ++)
                    {
                        d <<= 8;
                        d += lsb ? cc & 0xff : (cc & 0xff00) >> 8;

                        // output as many 6 bit groups as we have
                        for (l += 8; l >= 6; )
                        {
                            l -= 6;
                            if (buf)
                                *buf++ = utf7enb64[(d >> l) % 64];
                            len++;
                        }
                    }

                    // continue with the next character unless it is the end
                    // of the string or a directly encoded character
                    cc = *psz;
                    if (!(cc) || (cc < 0x80 && utf7encode[cc] < 1))
                        break;
                }

                // flush the remaining bits, if any, padding them with zeroes
                // on the right to make a full 6 bit group
                if (l != 0)
                {
                    if (buf)
                        *buf++ = utf7enb64[((d % 16) << (6 - l)) % 64];

                    len++;
                }
            }

            // end of the encoded sequence
            if (buf)
                *buf++ = '-';
            len++;
        }
    }

    if (buf && (len < n))
        *buf = 0;

    return len;
}
 704
 705 // ----------------------------------------------------------------------------
 706 // UTF-8
 707 // ----------------------------------------------------------------------------
 708
// upper bounds of the ranges of values grouped by encoding length: the first
// 6 entries are the values with the low 7, 11, 16, 21, 26 and 31 bits set and
// the last one has all 32 bits set
//
// NOTE(review): presumably entry N is the maximal value encodable in N+1
// UTF-8 bytes, confirm against the code using this table
static wxUint32 utf8_max[]=
    { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
 711
// boundaries of the private use area we use to (temporarily) remap the
// characters which are invalid in a UTF-8 encoded string: the range has 256
// elements, wxUnicodePUAEnd being one past its last one
const wxUint32 wxUnicodePUA = 0x100000;
const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
 716
// this table gives the length of the UTF-8 encoding from its first character:
// it is indexed by the value of the first byte of the sequence and contains
// 0 for the bytes which can't start a valid sequence
unsigned char tableUtf8Lengths[256] = {
    // single-byte sequences (ASCII):
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 00..0F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 10..1F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 20..2F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 30..3F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 40..4F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 50..5F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 60..6F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 70..7F

    // these are invalid (continuation bytes 80..BF can't start a sequence
    // and C0, C1 could only start overlong encodings of ASCII characters):
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 80..8F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 90..9F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // A0..AF
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // B0..BF
    0, 0,                                            // C0,C1

    // two-byte sequences:
          2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // C2..CF
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // D0..DF

    // three-byte sequences:
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,  // E0..EF

    // four-byte sequences:
    4, 4, 4, 4, 4,                                   // F0..F4

    // these are invalid again (5- or 6-byte
    // sequences and sequences for code points
    // above U+10FFFF, as restricted by RFC 3629):
                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0   // F5..FF
};
 751
// Decodes srcLen bytes of the UTF-8 string src (or the entire NUL-terminated
// string, including its terminator, if srcLen is wxNO_LEN).
//
// If dstLen is 0, nothing is written and only the length is computed.
// Otherwise the decoded characters are stored in dst.
//
// Returns the number of wide characters [which would be] written or
// wxCONV_FAILED if the input contains a byte which can't start a sequence, a
// truncated sequence or an invalid continuation byte or if dst is too small.
size_t
wxMBConvStrictUTF8::ToWChar(wchar_t *dst, size_t dstLen,
                            const char *src, size_t srcLen) const
{
    // out is NULL if we only compute the length
    wchar_t *out = dstLen ? dst : NULL;
    size_t written = 0;

    // include the terminating NUL in the length so that it is converted, and
    // counted, as any other character
    if ( srcLen == wxNO_LEN )
        srcLen = strlen(src) + 1;

    // NOTE(review): as srcLen was replaced with the real length just above,
    // the checks for wxNO_LEN below look unreachable
    for ( const char *p = src; ; p++ )
    {
        if ( !(srcLen == wxNO_LEN ? *p : srcLen) )
        {
            // all done successfully, just add the trailing NULL if we are not
            // using explicit length
            if ( srcLen == wxNO_LEN )
            {
                if ( out )
                {
                    if ( !dstLen )
                        break;

                    *out = L'\0';
                }

                written++;
            }

            return written;
        }

        // determine the length of the sequence from its first byte
        unsigned char c = *p;
        unsigned len = tableUtf8Lengths[c];
        if ( !len )
            break;

        // fail if the input ends in the middle of the sequence
        if ( srcLen < len ) // the test works for wxNO_LEN too
            break;

        if ( srcLen != wxNO_LEN )
            srcLen -= len;

        // fail if there is no more space left in the output buffer
        //
        // NOTE(review): only one output element is accounted for here even
        // though two are written below for a surrogate pair in WC_UTF16 case
        if ( out && !dstLen-- )
            break;


        //   Char. number range   |        UTF-8 octet sequence
        //      (hexadecimal)     |              (binary)
        //  ----------------------+---------------------------------------------
        //  0000 0000 - 0000 007F | 0xxxxxxx
        //  0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
        //  0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
        //  0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        //
        //  Code point value is stored in bits marked with 'x', lowest-order bit
        //  of the value on the right side in the diagram above.
        //                                                       (from RFC 3629)

        // mask to extract lead byte's value ('x' bits above), by sequence length:
        static const unsigned char leadValueMask[] = { 0x7F, 0x1F, 0x0F, 0x07 };

        // mask and value of lead byte's most significant bits, by length:
        static const unsigned char leadMarkerMask[] = { 0x80, 0xE0, 0xF0, 0xF8 };
        static const unsigned char leadMarkerVal[] = { 0x00, 0xC0, 0xE0, 0xF0 };

        len--; // it's more convenient to work with 0-based length here

        // check that the lead byte's marker bits are as expected for this
        // length before extracting its value bits:
        if ( (c & leadMarkerMask[len]) != leadMarkerVal[len] )
            break;

        wxUint32 code = c & leadValueMask[len];

        // all remaining bytes, if any, are handled in the same way regardless of
        // sequence's length:
        for ( ; len; --len )
        {
            // each continuation byte must be of the form 10xxxxxx and
            // contributes its 6 low bits to the value
            c = *++p;
            if ( (c & 0xC0) != 0x80 )
                return wxCONV_FAILED;

            code <<= 6;
            code |= c & 0x3F;
        }

#ifdef WC_UTF16
        // cast is ok because wchar_t == wxUint16 if WC_UTF16
        if ( encode_utf16(code, (wxUint16 *)out) == 2 )
        {
            // account for the first element of the surrogate pair here, the
            // second one is dealt with as any other character below
            if ( out )
                out++;
            written++;
        }
#else // !WC_UTF16
        if ( out )
            *out = code;
#endif // WC_UTF16/!WC_UTF16

        if ( out )
            out++;

        written++;
    }

    // we only get here if an error occurred (see "break" statements above)
    return wxCONV_FAILED;
}
 859
 860 size_t
 861 wxMBConvStrictUTF8::FromWChar(char *dst, size_t dstLen,
 862                               const wchar_t *src, size_t srcLen) const
 863 {
 864     char *out = dstLen ? dst : NULL;
 865     size_t written = 0;
 866
 867     for ( const wchar_t *wp = src; ; wp++ )
 868     {
 869         if ( !(srcLen == wxNO_LEN ? *wp : srcLen--) )
 870         {
 871             // all done successfully, just add the trailing NULL if we are not
 872             // using explicit length
 873             if ( srcLen == wxNO_LEN )
 874             {
 875                 if ( out )
 876                 {
 877                     if ( !dstLen )
 878                         break;
 879
 880                     *out = '\0';
 881                 }
 882
 883                 written++;
 884             }
 885
 886             return written;
 887         }
 888
 889
 890         wxUint32 code;
 891 #ifdef WC_UTF16
 892         // cast is ok for WC_UTF16
 893         if ( decode_utf16((const wxUint16 *)wp, code) == 2 )
 894         {
 895             // skip the next char too as we decoded a surrogate
 896             wp++;
 897         }
 898 #else // wchar_t is UTF-32
 899         code = *wp & 0x7fffffff;
 900 #endif
 901
 902         unsigned len;
 903         if ( code <= 0x7F )
 904         {
 905             len = 1;
 906             if ( out )
 907             {
 908                 if ( dstLen < len )
 909                     break;
 910
 911                 out[0] = (char)code;
 912             }
 913         }
 914         else if ( code <= 0x07FF )
 915         {
 916             len = 2;
 917             if ( out )
 918             {
 919                 if ( dstLen < len )
 920                     break;
 921
 922                 // NB: this line takes 6 least significant bits, encodes them as
 923                 // 10xxxxxx and discards them so that the next byte can be encoded:
 924                 out[1] = 0x80 | (code & 0x3F);  code >>= 6;
 925                 out[0] = 0xC0 | code;
 926             }
 927         }
 928         else if ( code < 0xFFFF )
 929         {
 930             len = 3;
 931             if ( out )
 932             {
 933                 if ( dstLen < len )
 934                     break;
 935
 936                 out[2] = 0x80 | (code & 0x3F);  code >>= 6;
 937                 out[1] = 0x80 | (code & 0x3F);  code >>= 6;
 938                 out[0] = 0xE0 | code;
 939             }
 940         }
 941         else if ( code <= 0x10FFFF )
 942         {
 943             len = 4;
 944             if ( out )
 945             {
 946                 if ( dstLen < len )
 947                     break;
 948
 949                 out[3] = 0x80 | (code & 0x3F);  code >>= 6;
 950                 out[2] = 0x80 | (code & 0x3F);  code >>= 6;
 951                 out[1] = 0x80 | (code & 0x3F);  code >>= 6;
 952                 out[0] = 0xF0 | code;
 953             }
 954         }
 955         else
 956         {
 957             wxFAIL_MSG( _T("trying to encode undefined Unicode character") );
 958             break;
 959         }
 960
 961         if ( out )
 962         {
 963             out += len;
 964             dstLen -= len;
 965         }
 966
 967         written += len;
 968     }
 969
 970     // we only get here if an error occurs during decoding
 971     return wxCONV_FAILED;
 972 }
 973
 974 size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 975 {
 976     if ( m_options == MAP_INVALID_UTF8_NOT )
 977         return wxMBConvStrictUTF8::MB2WC(buf, psz, n);
 978
 979     size_t len = 0;
 980
 981     while (*psz && ((!buf) || (len < n)))
 982     {
 983         const char *opsz = psz;
 984         bool invalid = false;
 985         unsigned char cc = *psz++, fc = cc;
 986         unsigned cnt;
 987         for (cnt = 0; fc & 0x80; cnt++)
 988             fc <<= 1;
 989
 990         if (!cnt)
 991         {
 992             // plain ASCII char
 993             if (buf)
 994                 *buf++ = cc;
 995             len++;
 996
 997             // escape the escape character for octal escapes
 998             if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
 999                     && cc == '\\' && (!buf || len < n))
1000             {
1001                 if (buf)
1002                     *buf++ = cc;
1003                 len++;
1004             }
1005         }
1006         else
1007         {
1008             cnt--;
1009             if (!cnt)
1010             {
1011                 // invalid UTF-8 sequence
1012                 invalid = true;
1013             }
1014             else
1015             {
1016                 unsigned ocnt = cnt - 1;
1017                 wxUint32 res = cc & (0x3f >> cnt);
1018                 while (cnt--)
1019                 {
1020                     cc = *psz;
1021                     if ((cc & 0xC0) != 0x80)
1022                     {
1023                         // invalid UTF-8 sequence
1024                         invalid = true;
1025                         break;
1026                     }
1027
1028                     psz++;
1029                     res = (res << 6) | (cc & 0x3f);
1030                 }
1031
1032                 if (invalid || res <= utf8_max[ocnt])
1033                 {
1034                     // illegal UTF-8 encoding
1035                     invalid = true;
1036                 }
1037                 else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
1038                         res >= wxUnicodePUA && res < wxUnicodePUAEnd)
1039                 {
1040                     // if one of our PUA characters turns up externally
1041                     // it must also be treated as an illegal sequence
1042                     // (a bit like you have to escape an escape character)
1043                     invalid = true;
1044                 }
1045                 else
1046                 {
1047 #ifdef WC_UTF16
1048                     // cast is ok because wchar_t == wxUint16 if WC_UTF16
1049                     size_t pa = encode_utf16(res, (wxUint16 *)buf);
1050                     if (pa == wxCONV_FAILED)
1051                     {
1052                         invalid = true;
1053                     }
1054                     else
1055                     {
1056                         if (buf)
1057                             buf += pa;
1058                         len += pa;
1059                     }
1060 #else // !WC_UTF16
1061                     if (buf)
1062                         *buf++ = (wchar_t)res;
1063                     len++;
1064 #endif // WC_UTF16/!WC_UTF16
1065                 }
1066             }
1067
1068             if (invalid)
1069             {
1070                 if (m_options & MAP_INVALID_UTF8_TO_PUA)
1071                 {
1072                     while (opsz < psz && (!buf || len < n))
1073                     {
1074 #ifdef WC_UTF16
1075                         // cast is ok because wchar_t == wxUuint16 if WC_UTF16
1076                         size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
1077                         wxASSERT(pa != wxCONV_FAILED);
1078                         if (buf)
1079                             buf += pa;
1080                         opsz++;
1081                         len += pa;
1082 #else
1083                         if (buf)
1084                             *buf++ = (wchar_t)(wxUnicodePUA + (unsigned char)*opsz);
1085                         opsz++;
1086                         len++;
1087 #endif
1088                     }
1089                 }
1090                 else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
1091                 {
1092                     while (opsz < psz && (!buf || len < n))
1093                     {
1094                         if ( buf && len + 3 < n )
1095                         {
1096                             unsigned char on = *opsz;
1097                             *buf++ = L'\\';
1098                             *buf++ = (wchar_t)( L'0' + on / 0100 );
1099                             *buf++ = (wchar_t)( L'0' + (on % 0100) / 010 );
1100                             *buf++ = (wchar_t)( L'0' + on % 010 );
1101                         }
1102
1103                         opsz++;
1104                         len += 4;
1105                     }
1106                 }
1107                 else // MAP_INVALID_UTF8_NOT
1108                 {
1109                     return wxCONV_FAILED;
1110                 }
1111             }
1112         }
1113     }
1114
1115     if (buf && (len < n))
1116         *buf = 0;
1117
1118     return len;
1119 }
1120
// Return true if the given wide character is one of the octal digits 0..7.
static inline bool isoctal(wchar_t wch)
{
    if ( wch < L'0' || wch > L'7' )
        return false;

    return true;
}
1125
1126 size_t wxMBConvUTF8::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1127 {
1128     if ( m_options == MAP_INVALID_UTF8_NOT )
1129         return wxMBConvStrictUTF8::WC2MB(buf, psz, n);
1130
1131     size_t len = 0;
1132
1133     while (*psz && ((!buf) || (len < n)))
1134     {
1135         wxUint32 cc;
1136
1137 #ifdef WC_UTF16
1138         // cast is ok for WC_UTF16
1139         size_t pa = decode_utf16((const wxUint16 *)psz, cc);
1140         psz += (pa == wxCONV_FAILED) ? 1 : pa;
1141 #else
1142         cc = (*psz++) & 0x7fffffff;
1143 #endif
1144
1145         if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
1146                 && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
1147         {
1148             if (buf)
1149                 *buf++ = (char)(cc - wxUnicodePUA);
1150             len++;
1151         }
1152         else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL)
1153                     && cc == L'\\' && psz[0] == L'\\' )
1154         {
1155             if (buf)
1156                 *buf++ = (char)cc;
1157             psz++;
1158             len++;
1159         }
1160         else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
1161                     cc == L'\\' &&
1162                         isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
1163         {
1164             if (buf)
1165             {
1166                 *buf++ = (char) ((psz[0] - L'0') * 0100 +
1167                                  (psz[1] - L'0') * 010 +
1168                                  (psz[2] - L'0'));
1169             }
1170
1171             psz += 3;
1172             len++;
1173         }
1174         else
1175         {
1176             unsigned cnt;
1177             for (cnt = 0; cc > utf8_max[cnt]; cnt++)
1178             {
1179             }
1180
1181             if (!cnt)
1182             {
1183                 // plain ASCII char
1184                 if (buf)
1185                     *buf++ = (char) cc;
1186                 len++;
1187             }
1188             else
1189             {
1190                 len += cnt + 1;
1191                 if (buf)
1192                 {
1193                     *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
1194                     while (cnt--)
1195                         *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
1196                 }
1197             }
1198         }
1199     }
1200
1201     if (buf && (len < n))
1202         *buf = 0;
1203
1204     return len;
1205 }
1206
1207 // ============================================================================
1208 // UTF-16
1209 // ============================================================================
1210
// "straight" is the converter for the byte order selected by
// WORDS_BIGENDIAN and "swap" is the one for the opposite byte order
#ifndef WORDS_BIGENDIAN
    #define wxMBConvUTF16straight wxMBConvUTF16LE
    #define wxMBConvUTF16swap     wxMBConvUTF16BE
#else // big endian
    #define wxMBConvUTF16straight wxMBConvUTF16BE
    #define wxMBConvUTF16swap     wxMBConvUTF16LE
#endif
1218
1219 /* static */
1220 size_t wxMBConvUTF16Base::GetLength(const char *src, size_t srcLen)
1221 {
1222     if ( srcLen == wxNO_LEN )
1223     {
1224         // count the number of bytes in input, including the trailing NULs
1225         const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1226         for ( srcLen = 1; *inBuff++; srcLen++ )
1227             ;
1228
1229         srcLen *= BYTES_PER_CHAR;
1230     }
1231     else // we already have the length
1232     {
1233         // we can only convert an entire number of UTF-16 characters
1234         if ( srcLen % BYTES_PER_CHAR )
1235             return wxCONV_FAILED;
1236     }
1237
1238     return srcLen;
1239 }
1240
1241 // case when in-memory representation is UTF-16 too
1242 #ifdef WC_UTF16
1243
1244 // ----------------------------------------------------------------------------
1245 // conversions without endianness change
1246 // ----------------------------------------------------------------------------
1247
1248 size_t
1249 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1250                                const char *src, size_t srcLen) const
1251 {
1252     // set up the scene for using memcpy() (which is presumably more efficient
1253     // than copying the bytes one by one)
1254     srcLen = GetLength(src, srcLen);
1255     if ( srcLen == wxNO_LEN )
1256         return wxCONV_FAILED;
1257
1258     const size_t inLen = srcLen / BYTES_PER_CHAR;
1259     if ( dst )
1260     {
1261         if ( dstLen < inLen )
1262             return wxCONV_FAILED;
1263
1264         memcpy(dst, src, srcLen);
1265     }
1266
1267     return inLen;
1268 }
1269
1270 size_t
1271 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1272                                  const wchar_t *src, size_t srcLen) const
1273 {
1274     if ( srcLen == wxNO_LEN )
1275         srcLen = wxWcslen(src) + 1;
1276
1277     srcLen *= BYTES_PER_CHAR;
1278
1279     if ( dst )
1280     {
1281         if ( dstLen < srcLen )
1282             return wxCONV_FAILED;
1283
1284         memcpy(dst, src, srcLen);
1285     }
1286
1287     return srcLen;
1288 }
1289
1290 // ----------------------------------------------------------------------------
1291 // endian-reversing conversions
1292 // ----------------------------------------------------------------------------
1293
1294 size_t
1295 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1296                            const char *src, size_t srcLen) const
1297 {
1298     srcLen = GetLength(src, srcLen);
1299     if ( srcLen == wxNO_LEN )
1300         return wxCONV_FAILED;
1301
1302     srcLen /= BYTES_PER_CHAR;
1303
1304     if ( dst )
1305     {
1306         if ( dstLen < srcLen )
1307             return wxCONV_FAILED;
1308
1309         const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1310         for ( size_t n = 0; n < srcLen; n++, inBuff++ )
1311         {
1312             *dst++ = wxUINT16_SWAP_ALWAYS(*inBuff);
1313         }
1314     }
1315
1316     return srcLen;
1317 }
1318
1319 size_t
1320 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1321                              const wchar_t *src, size_t srcLen) const
1322 {
1323     if ( srcLen == wxNO_LEN )
1324         srcLen = wxWcslen(src) + 1;
1325
1326     srcLen *= BYTES_PER_CHAR;
1327
1328     if ( dst )
1329     {
1330         if ( dstLen < srcLen )
1331             return wxCONV_FAILED;
1332
1333         wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1334         for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
1335         {
1336             *outBuff++ = wxUINT16_SWAP_ALWAYS(*src);
1337         }
1338     }
1339
1340     return srcLen;
1341 }
1342
1343 #else // !WC_UTF16: wchar_t is UTF-32
1344
1345 // ----------------------------------------------------------------------------
1346 // conversions without endianness change
1347 // ----------------------------------------------------------------------------
1348
1349 size_t
1350 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1351                                const char *src, size_t srcLen) const
1352 {
1353     srcLen = GetLength(src, srcLen);
1354     if ( srcLen == wxNO_LEN )
1355         return wxCONV_FAILED;
1356
1357     const size_t inLen = srcLen / BYTES_PER_CHAR;
1358     if ( !dst )
1359     {
1360         // optimization: return maximal space which could be needed for this
1361         // string even if the real size could be smaller if the buffer contains
1362         // any surrogates
1363         return inLen;
1364     }
1365
1366     size_t outLen = 0;
1367     const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1368     for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1369     {
1370         const wxUint32 ch = wxDecodeSurrogate(&inBuff);
1371         if ( !inBuff )
1372             return wxCONV_FAILED;
1373
1374         if ( ++outLen > dstLen )
1375             return wxCONV_FAILED;
1376
1377         *dst++ = ch;
1378     }
1379
1380
1381     return outLen;
1382 }
1383
1384 size_t
1385 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1386                                  const wchar_t *src, size_t srcLen) const
1387 {
1388     if ( srcLen == wxNO_LEN )
1389         srcLen = wxWcslen(src) + 1;
1390
1391     size_t outLen = 0;
1392     wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1393     for ( size_t n = 0; n < srcLen; n++ )
1394     {
1395         wxUint16 cc[2];
1396         const size_t numChars = encode_utf16(*src++, cc);
1397         if ( numChars == wxCONV_FAILED )
1398             return wxCONV_FAILED;
1399
1400         outLen += numChars * BYTES_PER_CHAR;
1401         if ( outBuff )
1402         {
1403             if ( outLen > dstLen )
1404                 return wxCONV_FAILED;
1405
1406             *outBuff++ = cc[0];
1407             if ( numChars == 2 )
1408             {
1409                 // second character of a surrogate
1410                 *outBuff++ = cc[1];
1411             }
1412         }
1413     }
1414
1415     return outLen;
1416 }
1417
1418 // ----------------------------------------------------------------------------
1419 // endian-reversing conversions
1420 // ----------------------------------------------------------------------------
1421
1422 size_t
1423 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1424                            const char *src, size_t srcLen) const
1425 {
1426     srcLen = GetLength(src, srcLen);
1427     if ( srcLen == wxNO_LEN )
1428         return wxCONV_FAILED;
1429
1430     const size_t inLen = srcLen / BYTES_PER_CHAR;
1431     if ( !dst )
1432     {
1433         // optimization: return maximal space which could be needed for this
1434         // string even if the real size could be smaller if the buffer contains
1435         // any surrogates
1436         return inLen;
1437     }
1438
1439     size_t outLen = 0;
1440     const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1441     for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1442     {
1443         wxUint32 ch;
1444         wxUint16 tmp[2];
1445
1446         tmp[0] = wxUINT16_SWAP_ALWAYS(*inBuff);
1447         inBuff++;
1448         tmp[1] = wxUINT16_SWAP_ALWAYS(*inBuff);
1449
1450         const size_t numChars = decode_utf16(tmp, ch);
1451         if ( numChars == wxCONV_FAILED )
1452             return wxCONV_FAILED;
1453
1454         if ( numChars == 2 )
1455             inBuff++;
1456
1457         if ( ++outLen > dstLen )
1458             return wxCONV_FAILED;
1459
1460         *dst++ = ch;
1461     }
1462
1463
1464     return outLen;
1465 }
1466
1467 size_t
1468 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1469                              const wchar_t *src, size_t srcLen) const
1470 {
1471     if ( srcLen == wxNO_LEN )
1472         srcLen = wxWcslen(src) + 1;
1473
1474     size_t outLen = 0;
1475     wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1476     for ( const wchar_t *srcEnd = src + srcLen; src < srcEnd; src++ )
1477     {
1478         wxUint16 cc[2];
1479         const size_t numChars = encode_utf16(*src, cc);
1480         if ( numChars == wxCONV_FAILED )
1481             return wxCONV_FAILED;
1482
1483         outLen += numChars * BYTES_PER_CHAR;
1484         if ( outBuff )
1485         {
1486             if ( outLen > dstLen )
1487                 return wxCONV_FAILED;
1488
1489             *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[0]);
1490             if ( numChars == 2 )
1491             {
1492                 // second character of a surrogate
1493                 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[1]);
1494             }
1495         }
1496     }
1497
1498     return outLen;
1499 }
1500
1501 #endif // WC_UTF16/!WC_UTF16
1502
1503
1504 // ============================================================================
1505 // UTF-32
1506 // ============================================================================
1507
1508 #ifdef WORDS_BIGENDIAN
1509     #define wxMBConvUTF32straight  wxMBConvUTF32BE
1510     #define wxMBConvUTF32swap      wxMBConvUTF32LE
1511 #else
1512     #define wxMBConvUTF32swap      wxMBConvUTF32BE
1513     #define wxMBConvUTF32straight  wxMBConvUTF32LE
1514 #endif
1515
1516
1517 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
1518 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
1519
1520 /* static */
1521 size_t wxMBConvUTF32Base::GetLength(const char *src, size_t srcLen)
1522 {
1523     if ( srcLen == wxNO_LEN )
1524     {
1525         // count the number of bytes in input, including the trailing NULs
1526         const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1527         for ( srcLen = 1; *inBuff++; srcLen++ )
1528             ;
1529
1530         srcLen *= BYTES_PER_CHAR;
1531     }
1532     else // we already have the length
1533     {
1534         // we can only convert an entire number of UTF-32 characters
1535         if ( srcLen % BYTES_PER_CHAR )
1536             return wxCONV_FAILED;
1537     }
1538
1539     return srcLen;
1540 }
1541
1542 // case when in-memory representation is UTF-16
1543 #ifdef WC_UTF16
1544
1545 // ----------------------------------------------------------------------------
1546 // conversions without endianness change
1547 // ----------------------------------------------------------------------------
1548
1549 size_t
1550 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1551                                const char *src, size_t srcLen) const
1552 {
1553     srcLen = GetLength(src, srcLen);
1554     if ( srcLen == wxNO_LEN )
1555         return wxCONV_FAILED;
1556
1557     const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1558     const size_t inLen = srcLen / BYTES_PER_CHAR;
1559     size_t outLen = 0;
1560     for ( size_t n = 0; n < inLen; n++ )
1561     {
1562         wxUint16 cc[2];
1563         const size_t numChars = encode_utf16(*inBuff++, cc);
1564         if ( numChars == wxCONV_FAILED )
1565             return wxCONV_FAILED;
1566
1567         outLen += numChars;
1568         if ( dst )
1569         {
1570             if ( outLen > dstLen )
1571                 return wxCONV_FAILED;
1572
1573             *dst++ = cc[0];
1574             if ( numChars == 2 )
1575             {
1576                 // second character of a surrogate
1577                 *dst++ = cc[1];
1578             }
1579         }
1580     }
1581
1582     return outLen;
1583 }
1584
1585 size_t
1586 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1587                                  const wchar_t *src, size_t srcLen) const
1588 {
1589     if ( srcLen == wxNO_LEN )
1590         srcLen = wxWcslen(src) + 1;
1591
1592     if ( !dst )
1593     {
1594         // optimization: return maximal space which could be needed for this
1595         // string instead of the exact amount which could be less if there are
1596         // any surrogates in the input
1597         //
1598         // we consider that surrogates are rare enough to make it worthwhile to
1599         // avoid running the loop below at the cost of slightly extra memory
1600         // consumption
1601         return srcLen * BYTES_PER_CHAR;
1602     }
1603
1604     wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1605     size_t outLen = 0;
1606     for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1607     {
1608         const wxUint32 ch = wxDecodeSurrogate(&src);
1609         if ( !src )
1610             return wxCONV_FAILED;
1611
1612         outLen += BYTES_PER_CHAR;
1613
1614         if ( outLen > dstLen )
1615             return wxCONV_FAILED;
1616
1617         *outBuff++ = ch;
1618     }
1619
1620     return outLen;
1621 }
1622
1623 // ----------------------------------------------------------------------------
1624 // endian-reversing conversions
1625 // ----------------------------------------------------------------------------
1626
1627 size_t
1628 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1629                            const char *src, size_t srcLen) const
1630 {
1631     srcLen = GetLength(src, srcLen);
1632     if ( srcLen == wxNO_LEN )
1633         return wxCONV_FAILED;
1634
1635     const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1636     const size_t inLen = srcLen / BYTES_PER_CHAR;
1637     size_t outLen = 0;
1638     for ( size_t n = 0; n < inLen; n++, inBuff++ )
1639     {
1640         wxUint16 cc[2];
1641         const size_t numChars = encode_utf16(wxUINT32_SWAP_ALWAYS(*inBuff), cc);
1642         if ( numChars == wxCONV_FAILED )
1643             return wxCONV_FAILED;
1644
1645         outLen += numChars;
1646         if ( dst )
1647         {
1648             if ( outLen > dstLen )
1649                 return wxCONV_FAILED;
1650
1651             *dst++ = cc[0];
1652             if ( numChars == 2 )
1653             {
1654                 // second character of a surrogate
1655                 *dst++ = cc[1];
1656             }
1657         }
1658     }
1659
1660     return outLen;
1661 }
1662
1663 size_t
1664 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1665                              const wchar_t *src, size_t srcLen) const
1666 {
1667     if ( srcLen == wxNO_LEN )
1668         srcLen = wxWcslen(src) + 1;
1669
1670     if ( !dst )
1671     {
1672         // optimization: return maximal space which could be needed for this
1673         // string instead of the exact amount which could be less if there are
1674         // any surrogates in the input
1675         //
1676         // we consider that surrogates are rare enough to make it worthwhile to
1677         // avoid running the loop below at the cost of slightly extra memory
1678         // consumption
1679         return srcLen*BYTES_PER_CHAR;
1680     }
1681
1682     wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1683     size_t outLen = 0;
1684     for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1685     {
1686         const wxUint32 ch = wxDecodeSurrogate(&src);
1687         if ( !src )
1688             return wxCONV_FAILED;
1689
1690         outLen += BYTES_PER_CHAR;
1691
1692         if ( outLen > dstLen )
1693             return wxCONV_FAILED;
1694
1695         *outBuff++ = wxUINT32_SWAP_ALWAYS(ch);
1696     }
1697
1698     return outLen;
1699 }
1700
1701 #else // !WC_UTF16: wchar_t is UTF-32
1702
1703 // ----------------------------------------------------------------------------
1704 // conversions without endianness change
1705 // ----------------------------------------------------------------------------
1706
1707 size_t
1708 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1709                                const char *src, size_t srcLen) const
1710 {
1711     // use memcpy() as it should be much faster than hand-written loop
1712     srcLen = GetLength(src, srcLen);
1713     if ( srcLen == wxNO_LEN )
1714         return wxCONV_FAILED;
1715
1716     const size_t inLen = srcLen/BYTES_PER_CHAR;
1717     if ( dst )
1718     {
1719         if ( dstLen < inLen )
1720             return wxCONV_FAILED;
1721
1722         memcpy(dst, src, srcLen);
1723     }
1724
1725     return inLen;
1726 }
1727
1728 size_t
1729 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1730                                  const wchar_t *src, size_t srcLen) const
1731 {
1732     if ( srcLen == wxNO_LEN )
1733         srcLen = wxWcslen(src) + 1;
1734
1735     srcLen *= BYTES_PER_CHAR;
1736
1737     if ( dst )
1738     {
1739         if ( dstLen < srcLen )
1740             return wxCONV_FAILED;
1741
1742         memcpy(dst, src, srcLen);
1743     }
1744
1745     return srcLen;
1746 }
1747
1748 // ----------------------------------------------------------------------------
1749 // endian-reversing conversions
1750 // ----------------------------------------------------------------------------
1751
1752 size_t
1753 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1754                            const char *src, size_t srcLen) const
1755 {
1756     srcLen = GetLength(src, srcLen);
1757     if ( srcLen == wxNO_LEN )
1758         return wxCONV_FAILED;
1759
1760     srcLen /= BYTES_PER_CHAR;
1761
1762     if ( dst )
1763     {
1764         if ( dstLen < srcLen )
1765             return wxCONV_FAILED;
1766
1767         const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1768         for ( size_t n = 0; n < srcLen; n++, inBuff++ )
1769         {
1770             *dst++ = wxUINT32_SWAP_ALWAYS(*inBuff);
1771         }
1772     }
1773
1774     return srcLen;
1775 }
1776
1777 size_t
1778 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1779                              const wchar_t *src, size_t srcLen) const
1780 {
1781     if ( srcLen == wxNO_LEN )
1782         srcLen = wxWcslen(src) + 1;
1783
1784     srcLen *= BYTES_PER_CHAR;
1785
1786     if ( dst )
1787     {
1788         if ( dstLen < srcLen )
1789             return wxCONV_FAILED;
1790
1791         wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1792         for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
1793         {
1794             *outBuff++ = wxUINT32_SWAP_ALWAYS(*src);
1795         }
1796     }
1797
1798     return srcLen;
1799 }
1800
1801 #endif // WC_UTF16/!WC_UTF16
1802
1803
1804 // ============================================================================
1805 // The classes doing conversion using the iconv_xxx() functions
1806 // ============================================================================
1807
1808 #ifdef HAVE_ICONV
1809
1810 // VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
1811 //     E2BIG if output buffer is _exactly_ as big as needed. Such case is
1812 //     (unless there's yet another bug in glibc) the only case when iconv()
1813 //     returns with (size_t)-1 (which means error) and says there are 0 bytes
1814 //     left in the input buffer -- when _real_ error occurs,
1815 //     bytes-left-in-input buffer is non-zero. Hence, this alternative test for
1816 //     iconv() failure.
1817 //     [This bug does not appear in glibc 2.2.]
1818 #if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
1819 #define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
1820                                      (errno != E2BIG || bufLeft != 0))
1821 #else
1822 #define ICONV_FAILED(cres, bufLeft)  (cres == (size_t)-1)
1823 #endif
1824
1825 #define ICONV_CHAR_CAST(x)  ((ICONV_CONST char **)(x))
1826
1827 #define ICONV_T_INVALID ((iconv_t)-1)
1828
1829 #if SIZEOF_WCHAR_T == 4
1830     #define WC_BSWAP    wxUINT32_SWAP_ALWAYS
1831     #define WC_ENC      wxFONTENCODING_UTF32
1832 #elif SIZEOF_WCHAR_T == 2
1833     #define WC_BSWAP    wxUINT16_SWAP_ALWAYS
1834     #define WC_ENC      wxFONTENCODING_UTF16
1835 #else // sizeof(wchar_t) != 2 nor 4
1836     // does this ever happen?
1837     #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
1838 #endif
1839
1840 // ----------------------------------------------------------------------------
1841 // wxMBConv_iconv: encapsulates an iconv character set
1842 // ----------------------------------------------------------------------------
1843
1844 class wxMBConv_iconv : public wxMBConv
1845 {
1846 public:
1847     wxMBConv_iconv(const char *name);
1848     virtual ~wxMBConv_iconv();
1849
1850     virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const;
1851     virtual size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const;
1852
1853     // classify this encoding as explained in wxMBConv::GetMBNulLen() comment
1854     virtual size_t GetMBNulLen() const;
1855
1856 #if wxUSE_UNICODE_UTF8
1857     virtual bool IsUTF8() const;
1858 #endif
1859
1860     virtual wxMBConv *Clone() const
1861     {
1862         wxMBConv_iconv *p = new wxMBConv_iconv(m_name.ToAscii());
1863         p->m_minMBCharWidth = m_minMBCharWidth;
1864         return p;
1865     }
1866
1867     bool IsOk() const
1868         { return (m2w != ICONV_T_INVALID) && (w2m != ICONV_T_INVALID); }
1869
1870 protected:
1871     // the iconv handlers used to translate from multibyte
1872     // to wide char and in the other direction
1873     iconv_t m2w,
1874             w2m;
1875
1876 #if wxUSE_THREADS
1877     // guards access to m2w and w2m objects
1878     wxMutex m_iconvMutex;
1879 #endif
1880
1881 private:
1882     // the name (for iconv_open()) of a wide char charset -- if none is
1883     // available on this machine, it will remain NULL
1884     static wxString ms_wcCharsetName;
1885
1886     // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
1887     // different endian-ness than the native one
1888     static bool ms_wcNeedsSwap;
1889
1890
1891     // name of the encoding handled by this conversion
1892     wxString m_name;
1893
1894     // cached result of GetMBNulLen(); set to 0 meaning "unknown"
1895     // initially
1896     size_t m_minMBCharWidth;
1897 };
1898
1899 // make the constructor available for unit testing
1900 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_iconv( const char* name )
1901 {
1902     wxMBConv_iconv* result = new wxMBConv_iconv( name );
1903     if ( !result->IsOk() )
1904     {
1905         delete result;
1906         return 0;
1907     }
1908
1909     return result;
1910 }
1911
1912 wxString wxMBConv_iconv::ms_wcCharsetName;
1913 bool wxMBConv_iconv::ms_wcNeedsSwap = false;
1914
1915 wxMBConv_iconv::wxMBConv_iconv(const char *name)
1916               : m_name(name)
1917 {
1918     m_minMBCharWidth = 0;
1919
1920     // check for charset that represents wchar_t:
1921     if ( ms_wcCharsetName.empty() )
1922     {
1923         wxLogTrace(TRACE_STRCONV, _T("Looking for wide char codeset:"));
1924
1925 #if wxUSE_FONTMAP
1926         const wxChar **names = wxFontMapperBase::GetAllEncodingNames(WC_ENC);
1927 #else // !wxUSE_FONTMAP
1928         static const wxChar *names_static[] =
1929         {
1930 #if SIZEOF_WCHAR_T == 4
1931             _T("UCS-4"),
1932 #elif SIZEOF_WCHAR_T = 2
1933             _T("UCS-2"),
1934 #endif
1935             NULL
1936         };
1937         const wxChar **names = names_static;
1938 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
1939
1940         for ( ; *names && ms_wcCharsetName.empty(); ++names )
1941         {
1942             const wxString nameCS(*names);
1943
1944             // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
1945             wxString nameXE(nameCS);
1946
1947 #ifdef WORDS_BIGENDIAN
1948                 nameXE += _T("BE");
1949 #else // little endian
1950                 nameXE += _T("LE");
1951 #endif
1952
1953             wxLogTrace(TRACE_STRCONV, _T("  trying charset \"%s\""),
1954                        nameXE.c_str());
1955
1956             m2w = iconv_open(nameXE.ToAscii(), name);
1957             if ( m2w == ICONV_T_INVALID )
1958             {
1959                 // try charset w/o bytesex info (e.g. "UCS4")
1960                 wxLogTrace(TRACE_STRCONV, _T("  trying charset \"%s\""),
1961                            nameCS.c_str());
1962                 m2w = iconv_open(nameCS.ToAscii(), name);
1963
1964                 // and check for bytesex ourselves:
1965                 if ( m2w != ICONV_T_INVALID )
1966                 {
1967                     char    buf[2], *bufPtr;
1968                     wchar_t wbuf[2], *wbufPtr;
1969                     size_t  insz, outsz;
1970                     size_t  res;
1971
1972                     buf[0] = 'A';
1973                     buf[1] = 0;
1974                     wbuf[0] = 0;
1975                     insz = 2;
1976                     outsz = SIZEOF_WCHAR_T * 2;
1977                     wbufPtr = wbuf;
1978                     bufPtr = buf;
1979
1980                     res = iconv(
1981                         m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
1982                         (char**)&wbufPtr, &outsz);
1983
1984                     if (ICONV_FAILED(res, insz))
1985                     {
1986                         wxLogLastError(wxT("iconv"));
1987                         wxLogError(_("Conversion to charset '%s' doesn't work."),
1988                                    nameCS.c_str());
1989                     }
1990                     else // ok, can convert to this encoding, remember it
1991                     {
1992                         ms_wcCharsetName = nameCS;
1993                         ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
1994                     }
1995                 }
1996             }
1997             else // use charset not requiring byte swapping
1998             {
1999                 ms_wcCharsetName = nameXE;
2000             }
2001         }
2002
2003         wxLogTrace(TRACE_STRCONV,
2004                    wxT("iconv wchar_t charset is \"%s\"%s"),
2005                    ms_wcCharsetName.empty() ? wxString("<none>")
2006                                             : ms_wcCharsetName,
2007                    ms_wcNeedsSwap ? _T(" (needs swap)")
2008                                   : _T(""));
2009     }
2010     else // we already have ms_wcCharsetName
2011     {
2012         m2w = iconv_open(ms_wcCharsetName.ToAscii(), name);
2013     }
2014
2015     if ( ms_wcCharsetName.empty() )
2016     {
2017         w2m = ICONV_T_INVALID;
2018     }
2019     else
2020     {
2021         w2m = iconv_open(name, ms_wcCharsetName.ToAscii());
2022         if ( w2m == ICONV_T_INVALID )
2023         {
2024             wxLogTrace(TRACE_STRCONV,
2025                        wxT("\"%s\" -> \"%s\" works but not the converse!?"),
2026                        ms_wcCharsetName.c_str(), name);
2027         }
2028     }
2029 }
2030
2031 wxMBConv_iconv::~wxMBConv_iconv()
2032 {
2033     if ( m2w != ICONV_T_INVALID )
2034         iconv_close(m2w);
2035     if ( w2m != ICONV_T_INVALID )
2036         iconv_close(w2m);
2037 }
2038
2039 size_t wxMBConv_iconv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
2040 {
2041     // find the string length: notice that must be done differently for
2042     // NUL-terminated strings and UTF-16/32 which are terminated with 2/4 NULs
2043     size_t inbuf;
2044     const size_t nulLen = GetMBNulLen();
2045     switch ( nulLen )
2046     {
2047         default:
2048             return wxCONV_FAILED;
2049
2050         case 1:
2051             inbuf = strlen(psz); // arguably more optimized than our version
2052             break;
2053
2054         case 2:
2055         case 4:
2056             // for UTF-16/32 not only we need to have 2/4 consecutive NULs but
2057             // they also have to start at character boundary and not span two
2058             // adjacent characters
2059             const char *p;
2060             for ( p = psz; NotAllNULs(p, nulLen); p += nulLen )
2061                 ;
2062             inbuf = p - psz;
2063             break;
2064     }
2065
2066 #if wxUSE_THREADS
2067     // NB: iconv() is MT-safe, but each thread must use its own iconv_t handle.
2068     //     Unfortunately there are a couple of global wxCSConv objects such as
2069     //     wxConvLocal that are used all over wx code, so we have to make sure
2070     //     the handle is used by at most one thread at the time. Otherwise
2071     //     only a few wx classes would be safe to use from non-main threads
2072     //     as MB<->WC conversion would fail "randomly".
2073     wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
2074 #endif // wxUSE_THREADS
2075
2076     size_t outbuf = n * SIZEOF_WCHAR_T;
2077     size_t res, cres;
2078     // VS: Use these instead of psz, buf because iconv() modifies its arguments:
2079     wchar_t *bufPtr = buf;
2080     const char *pszPtr = psz;
2081
2082     if (buf)
2083     {
2084         // have destination buffer, convert there
2085         cres = iconv(m2w,
2086                      ICONV_CHAR_CAST(&pszPtr), &inbuf,
2087                      (char**)&bufPtr, &outbuf);
2088         res = n - (outbuf / SIZEOF_WCHAR_T);
2089
2090         if (ms_wcNeedsSwap)
2091         {
2092             // convert to native endianness
2093             for ( unsigned i = 0; i < res; i++ )
2094                 buf[n] = WC_BSWAP(buf[i]);
2095         }
2096
2097         // NUL-terminate the string if there is any space left
2098         if (res < n)
2099             buf[res] = 0;
2100     }
2101     else
2102     {
2103         // no destination buffer... convert using temp buffer
2104         // to calculate destination buffer requirement
2105         wchar_t tbuf[8];
2106         res = 0;
2107
2108         do
2109         {
2110             bufPtr = tbuf;
2111             outbuf = 8 * SIZEOF_WCHAR_T;
2112
2113             cres = iconv(m2w,
2114                          ICONV_CHAR_CAST(&pszPtr), &inbuf,
2115                          (char**)&bufPtr, &outbuf );
2116
2117             res += 8 - (outbuf / SIZEOF_WCHAR_T);
2118         }
2119         while ((cres == (size_t)-1) && (errno == E2BIG));
2120     }
2121
2122     if (ICONV_FAILED(cres, inbuf))
2123     {
2124         //VS: it is ok if iconv fails, hence trace only
2125         wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
2126         return wxCONV_FAILED;
2127     }
2128
2129     return res;
2130 }
2131
2132 size_t wxMBConv_iconv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
2133 {
2134 #if wxUSE_THREADS
2135     // NB: explained in MB2WC
2136     wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
2137 #endif
2138
2139     size_t inlen = wxWcslen(psz);
2140     size_t inbuf = inlen * SIZEOF_WCHAR_T;
2141     size_t outbuf = n;
2142     size_t res, cres;
2143
2144     wchar_t *tmpbuf = 0;
2145
2146     if (ms_wcNeedsSwap)
2147     {
2148         // need to copy to temp buffer to switch endianness
2149         // (doing WC_BSWAP twice on the original buffer won't help, as it
2150         //  could be in read-only memory, or be accessed in some other thread)
2151         tmpbuf = (wchar_t *)malloc(inbuf + SIZEOF_WCHAR_T);
2152         for ( size_t i = 0; i < inlen; i++ )
2153             tmpbuf[n] = WC_BSWAP(psz[i]);
2154
2155         tmpbuf[inlen] = L'\0';
2156         psz = tmpbuf;
2157     }
2158
2159     if (buf)
2160     {
2161         // have destination buffer, convert there
2162         cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
2163
2164         res = n - outbuf;
2165
2166         // NB: iconv was given only wcslen(psz) characters on input, and so
2167         //     it couldn't convert the trailing zero. Let's do it ourselves
2168         //     if there's some room left for it in the output buffer.
2169         if (res < n)
2170             buf[0] = 0;
2171     }
2172     else
2173     {
2174         // no destination buffer: convert using temp buffer
2175         // to calculate destination buffer requirement
2176         char tbuf[16];
2177         res = 0;
2178         do
2179         {
2180             buf = tbuf;
2181             outbuf = 16;
2182
2183             cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
2184
2185             res += 16 - outbuf;
2186         }
2187         while ((cres == (size_t)-1) && (errno == E2BIG));
2188     }
2189
2190     if (ms_wcNeedsSwap)
2191     {
2192         free(tmpbuf);
2193     }
2194
2195     if (ICONV_FAILED(cres, inbuf))
2196     {
2197         wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
2198         return wxCONV_FAILED;
2199     }
2200
2201     return res;
2202 }
2203
2204 size_t wxMBConv_iconv::GetMBNulLen() const
2205 {
2206     if ( m_minMBCharWidth == 0 )
2207     {
2208         wxMBConv_iconv * const self = wxConstCast(this, wxMBConv_iconv);
2209
2210 #if wxUSE_THREADS
2211         // NB: explained in MB2WC
2212         wxMutexLocker lock(self->m_iconvMutex);
2213 #endif
2214
2215         const wchar_t *wnul = L"";
2216         char buf[8]; // should be enough for NUL in any encoding
2217         size_t inLen = sizeof(wchar_t),
2218                outLen = WXSIZEOF(buf);
2219         char *inBuff = (char *)wnul;
2220         char *outBuff = buf;
2221         if ( iconv(w2m, ICONV_CHAR_CAST(&inBuff), &inLen, &outBuff, &outLen) == (size_t)-1 )
2222         {
2223             self->m_minMBCharWidth = (size_t)-1;
2224         }
2225         else // ok
2226         {
2227             self->m_minMBCharWidth = outBuff - buf;
2228         }
2229     }
2230
2231     return m_minMBCharWidth;
2232 }
2233
2234 #if wxUSE_UNICODE_UTF8
2235 bool wxMBConv_iconv::IsUTF8() const
2236 {
2237     return wxStricmp(m_name, "UTF-8") == 0 ||
2238            wxStricmp(m_name, "UTF8") == 0;
2239 }
2240 #endif
2241
2242 #endif // HAVE_ICONV
2243
2244
2245 // ============================================================================
2246 // Win32 conversion classes
2247 // ============================================================================
2248
2249 #ifdef wxHAVE_WIN32_MB2WC
2250
2251 // from utils.cpp
2252 #if wxUSE_FONTMAP
2253 extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const char *charset);
2254 extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
2255 #endif
2256
2257 class wxMBConv_win32 : public wxMBConv
2258 {
2259 public:
2260     wxMBConv_win32()
2261     {
2262         m_CodePage = CP_ACP;
2263         m_minMBCharWidth = 0;
2264     }
2265
2266     wxMBConv_win32(const wxMBConv_win32& conv)
2267         : wxMBConv()
2268     {
2269         m_CodePage = conv.m_CodePage;
2270         m_minMBCharWidth = conv.m_minMBCharWidth;
2271     }
2272
2273 #if wxUSE_FONTMAP
2274     wxMBConv_win32(const char* name)
2275     {
2276         m_CodePage = wxCharsetToCodepage(name);
2277         m_minMBCharWidth = 0;
2278     }
2279
2280     wxMBConv_win32(wxFontEncoding encoding)
2281     {
2282         m_CodePage = wxEncodingToCodepage(encoding);
2283         m_minMBCharWidth = 0;
2284     }
2285 #endif // wxUSE_FONTMAP
2286
2287     virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2288     {
2289         // note that we have to use MB_ERR_INVALID_CHARS flag as it without it
2290         // the behaviour is not compatible with the Unix version (using iconv)
2291         // and break the library itself, e.g. wxTextInputStream::NextChar()
2292         // wouldn't work if reading an incomplete MB char didn't result in an
2293         // error
2294         //
2295         // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
2296         // Win XP or newer and it is not supported for UTF-[78] so we always
2297         // use our own conversions in this case. See
2298         //     http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
2299         //     http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
2300         if ( m_CodePage == CP_UTF8 )
2301         {
2302             return wxMBConvUTF8().MB2WC(buf, psz, n);
2303         }
2304
2305         if ( m_CodePage == CP_UTF7 )
2306         {
2307             return wxMBConvUTF7().MB2WC(buf, psz, n);
2308         }
2309
2310         int flags = 0;
2311         if ( (m_CodePage < 50000 && m_CodePage != CP_SYMBOL) &&
2312                 IsAtLeastWin2kSP4() )
2313         {
2314             flags = MB_ERR_INVALID_CHARS;
2315         }
2316
2317         const size_t len = ::MultiByteToWideChar
2318                              (
2319                                 m_CodePage,     // code page
2320                                 flags,          // flags: fall on error
2321                                 psz,            // input string
2322                                 -1,             // its length (NUL-terminated)
2323                                 buf,            // output string
2324                                 buf ? n : 0     // size of output buffer
2325                              );
2326         if ( !len )
2327         {
2328             // function totally failed
2329             return wxCONV_FAILED;
2330         }
2331
2332         // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
2333         // check if we succeeded, by doing a double trip:
2334         if ( !flags && buf )
2335         {
2336             const size_t mbLen = strlen(psz);
2337             wxCharBuffer mbBuf(mbLen);
2338             if ( ::WideCharToMultiByte
2339                    (
2340                       m_CodePage,
2341                       0,
2342                       buf,
2343                       -1,
2344                       mbBuf.data(),
2345                       mbLen + 1,        // size in bytes, not length
2346                       NULL,
2347                       NULL
2348                    ) == 0 ||
2349                   strcmp(mbBuf, psz) != 0 )
2350             {
2351                 // we didn't obtain the same thing we started from, hence
2352                 // the conversion was lossy and we consider that it failed
2353                 return wxCONV_FAILED;
2354             }
2355         }
2356
2357         // note that it returns count of written chars for buf != NULL and size
2358         // of the needed buffer for buf == NULL so in either case the length of
2359         // the string (which never includes the terminating NUL) is one less
2360         return len - 1;
2361     }
2362
2363     virtual size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
2364     {
2365         /*
2366             we have a problem here: by default, WideCharToMultiByte() may
2367             replace characters unrepresentable in the target code page with bad
2368             quality approximations such as turning "1/2" symbol (U+00BD) into
2369             "1" for the code pages which don't have it and we, obviously, want
2370             to avoid this at any price
2371
2372             the trouble is that this function does it _silently_, i.e. it won't
2373             even tell us whether it did or not... Win98/2000 and higher provide
2374             WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
2375             we have to resort to a round trip, i.e. check that converting back
2376             results in the same string -- this is, of course, expensive but
2377             otherwise we simply can't be sure to not garble the data.
2378          */
2379
2380         // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
2381         // it doesn't work with CJK encodings (which we test for rather roughly
2382         // here...) nor with UTF-7/8 nor, of course, with Windows versions not
2383         // supporting it
2384         BOOL usedDef wxDUMMY_INITIALIZE(false);
2385         BOOL *pUsedDef;
2386         int flags;
2387         if ( CanUseNoBestFit() && m_CodePage < 50000 )
2388         {
2389             // it's our lucky day
2390             flags = WC_NO_BEST_FIT_CHARS;
2391             pUsedDef = &usedDef;
2392         }
2393         else // old system or unsupported encoding
2394         {
2395             flags = 0;
2396             pUsedDef = NULL;
2397         }
2398
2399         const size_t len = ::WideCharToMultiByte
2400                              (
2401                                 m_CodePage,     // code page
2402                                 flags,          // either none or no best fit
2403                                 pwz,            // input string
2404                                 -1,             // it is (wide) NUL-terminated
2405                                 buf,            // output buffer
2406                                 buf ? n : 0,    // and its size
2407                                 NULL,           // default "replacement" char
2408                                 pUsedDef        // [out] was it used?
2409                              );
2410
2411         if ( !len )
2412         {
2413             // function totally failed
2414             return wxCONV_FAILED;
2415         }
2416
2417         // if we were really converting, check if we succeeded
2418         if ( buf )
2419         {
2420             if ( flags )
2421             {
2422                 // check if the conversion failed, i.e. if any replacements
2423                 // were done
2424                 if ( usedDef )
2425                     return wxCONV_FAILED;
2426             }
2427             else // we must resort to double tripping...
2428             {
2429                 wxWCharBuffer wcBuf(n);
2430                 if ( MB2WC(wcBuf.data(), buf, n) == wxCONV_FAILED ||
2431                         wcscmp(wcBuf, pwz) != 0 )
2432                 {
2433                     // we didn't obtain the same thing we started from, hence
2434                     // the conversion was lossy and we consider that it failed
2435                     return wxCONV_FAILED;
2436                 }
2437             }
2438         }
2439
2440         // see the comment above for the reason of "len - 1"
2441         return len - 1;
2442     }
2443
2444     virtual size_t GetMBNulLen() const
2445     {
2446         if ( m_minMBCharWidth == 0 )
2447         {
2448             int len = ::WideCharToMultiByte
2449                         (
2450                             m_CodePage,     // code page
2451                             0,              // no flags
2452                             L"",            // input string
2453                             1,              // translate just the NUL
2454                             NULL,           // output buffer
2455                             0,              // and its size
2456                             NULL,           // no replacement char
2457                             NULL            // [out] don't care if it was used
2458                         );
2459
2460             wxMBConv_win32 * const self = wxConstCast(this, wxMBConv_win32);
2461             switch ( len )
2462             {
2463                 default:
2464                     wxLogDebug(_T("Unexpected NUL length %d"), len);
2465                     self->m_minMBCharWidth = (size_t)-1;
2466                     break;
2467
2468                 case 0:
2469                     self->m_minMBCharWidth = (size_t)-1;
2470                     break;
2471
2472                 case 1:
2473                 case 2:
2474                 case 4:
2475                     self->m_minMBCharWidth = len;
2476                     break;
2477             }
2478         }
2479
2480         return m_minMBCharWidth;
2481     }
2482
2483     virtual wxMBConv *Clone() const { return new wxMBConv_win32(*this); }
2484
2485     bool IsOk() const { return m_CodePage != -1; }
2486
2487 private:
2488     static bool CanUseNoBestFit()
2489     {
2490         static int s_isWin98Or2k = -1;
2491
2492         if ( s_isWin98Or2k == -1 )
2493         {
2494             int verMaj, verMin;
2495             switch ( wxGetOsVersion(&verMaj, &verMin) )
2496             {
2497                 case wxOS_WINDOWS_9X:
2498                     s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
2499                     break;
2500
2501                 case wxOS_WINDOWS_NT:
2502                     s_isWin98Or2k = verMaj >= 5;
2503                     break;
2504
2505                 default:
2506                     // unknown: be conservative by default
2507                     s_isWin98Or2k = 0;
2508                     break;
2509             }
2510
2511             wxASSERT_MSG( s_isWin98Or2k != -1, _T("should be set above") );
2512         }
2513
2514         return s_isWin98Or2k == 1;
2515     }
2516
2517     static bool IsAtLeastWin2kSP4()
2518     {
2519 #ifdef __WXWINCE__
2520         return false;
2521 #else
2522         static int s_isAtLeastWin2kSP4 = -1;
2523
2524         if ( s_isAtLeastWin2kSP4 == -1 )
2525         {
2526             OSVERSIONINFOEX ver;
2527
2528             memset(&ver, 0, sizeof(ver));
2529             ver.dwOSVersionInfoSize = sizeof(ver);
2530             GetVersionEx((OSVERSIONINFO*)&ver);
2531
2532             s_isAtLeastWin2kSP4 =
2533               ((ver.dwMajorVersion > 5) || // Vista+
2534                (ver.dwMajorVersion == 5 && ver.dwMinorVersion > 0) || // XP/2003
2535                (ver.dwMajorVersion == 5 && ver.dwMinorVersion == 0 &&
2536                ver.wServicePackMajor >= 4)) // 2000 SP4+
2537               ? 1 : 0;
2538         }
2539
2540         return s_isAtLeastWin2kSP4 == 1;
2541 #endif
2542     }
2543
2544
2545     // the code page we're working with
2546     long m_CodePage;
2547
2548     // cached result of GetMBNulLen(), set to 0 initially meaning
2549     // "unknown"
2550     size_t m_minMBCharWidth;
2551 };
2552
2553 #endif // wxHAVE_WIN32_MB2WC
2554
2555
2556 // ============================================================================
2557 // wxEncodingConverter based conversion classes
2558 // ============================================================================
2559
2560 #if wxUSE_FONTMAP
2561
2562 class wxMBConv_wxwin : public wxMBConv
2563 {
2564 private:
2565     void Init()
2566     {
2567         // Refuse to use broken wxEncodingConverter code for Mac-specific encodings.
2568         // The wxMBConv_cf class does a better job.
2569         m_ok = (m_enc < wxFONTENCODING_MACMIN || m_enc > wxFONTENCODING_MACMAX) &&
2570                m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
2571                w2m.Init(wxFONTENCODING_UNICODE, m_enc);
2572     }
2573
2574 public:
2575     // temporarily just use wxEncodingConverter stuff,
2576     // so that it works while a better implementation is built
2577     wxMBConv_wxwin(const char* name)
2578     {
2579         if (name)
2580             m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
2581         else
2582             m_enc = wxFONTENCODING_SYSTEM;
2583
2584         Init();
2585     }
2586
2587     wxMBConv_wxwin(wxFontEncoding enc)
2588     {
2589         m_enc = enc;
2590
2591         Init();
2592     }
2593
2594     size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
2595     {
2596         size_t inbuf = strlen(psz);
2597         if (buf)
2598         {
2599             if (!m2w.Convert(psz, buf))
2600                 return wxCONV_FAILED;
2601         }
2602         return inbuf;
2603     }
2604
2605     size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
2606     {
2607         const size_t inbuf = wxWcslen(psz);
2608         if (buf)
2609         {
2610             if (!w2m.Convert(psz, buf))
2611                 return wxCONV_FAILED;
2612         }
2613
2614         return inbuf;
2615     }
2616
2617     virtual size_t GetMBNulLen() const
2618     {
2619         switch ( m_enc )
2620         {
2621             case wxFONTENCODING_UTF16BE:
2622             case wxFONTENCODING_UTF16LE:
2623                 return 2;
2624
2625             case wxFONTENCODING_UTF32BE:
2626             case wxFONTENCODING_UTF32LE:
2627                 return 4;
2628
2629             default:
2630                 return 1;
2631         }
2632     }
2633
2634     virtual wxMBConv *Clone() const { return new wxMBConv_wxwin(m_enc); }
2635
2636     bool IsOk() const { return m_ok; }
2637
2638 public:
2639     wxFontEncoding m_enc;
2640     wxEncodingConverter m2w, w2m;
2641
2642 private:
2643     // were we initialized successfully?
2644     bool m_ok;
2645
2646     DECLARE_NO_COPY_CLASS(wxMBConv_wxwin)
2647 };
2648
2649 // make the constructors available for unit testing
2650 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_wxwin( const char* name )
2651 {
2652     wxMBConv_wxwin* result = new wxMBConv_wxwin( name );
2653     if ( !result->IsOk() )
2654     {
2655         delete result;
2656         return 0;
2657     }
2658
2659     return result;
2660 }
2661
2662 #endif // wxUSE_FONTMAP
2663
2664 // ============================================================================
2665 // wxCSConv implementation
2666 // ============================================================================
2667
2668 void wxCSConv::Init()
2669 {
2670     m_name = NULL;
2671     m_convReal =  NULL;
2672     m_deferred = true;
2673 }
2674
2675 wxCSConv::wxCSConv(const wxString& charset)
2676 {
2677     Init();
2678
2679     if ( !charset.empty() )
2680     {
2681         SetName(charset.ToAscii());
2682     }
2683
2684 #if wxUSE_FONTMAP
2685     m_encoding = wxFontMapperBase::GetEncodingFromName(charset);
2686 #else
2687     m_encoding = wxFONTENCODING_SYSTEM;
2688 #endif
2689 }
2690
2691 wxCSConv::wxCSConv(wxFontEncoding encoding)
2692 {
2693     if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
2694     {
2695         wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
2696
2697         encoding = wxFONTENCODING_SYSTEM;
2698     }
2699
2700     Init();
2701
2702     m_encoding = encoding;
2703 }
2704
2705 wxCSConv::~wxCSConv()
2706 {
2707     Clear();
2708 }
2709
2710 wxCSConv::wxCSConv(const wxCSConv& conv)
2711         : wxMBConv()
2712 {
2713     Init();
2714
2715     SetName(conv.m_name);
2716     m_encoding = conv.m_encoding;
2717 }
2718
2719 wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
2720 {
2721     Clear();
2722
2723     SetName(conv.m_name);
2724     m_encoding = conv.m_encoding;
2725
2726     return *this;
2727 }
2728
2729 void wxCSConv::Clear()
2730 {
2731     free(m_name);
2732     delete m_convReal;
2733
2734     m_name = NULL;
2735     m_convReal = NULL;
2736 }
2737
2738 void wxCSConv::SetName(const char *charset)
2739 {
2740     if (charset)
2741     {
2742         m_name = strdup(charset);
2743         m_deferred = true;
2744     }
2745 }
2746
#if wxUSE_FONTMAP

// hash map associating a charset name with a font encoding
WX_DECLARE_HASH_MAP( wxFontEncoding,
                     wxString,
                     wxIntegerHash,
                     wxIntegerEqual,
                     wxEncodingNameCache );

// cache used by wxCSConv::DoCreate(): for each encoding it stores the name
// with which an iconv conversion was successfully created or an empty string
// if none of the names worked
static wxEncodingNameCache gs_nameCache;

#endif // wxUSE_FONTMAP
2754
2755 wxMBConv *wxCSConv::DoCreate() const
2756 {
2757 #if wxUSE_FONTMAP
2758     wxLogTrace(TRACE_STRCONV,
2759                wxT("creating conversion for %s"),
2760                (m_name ? m_name
2761                        : (const char*)wxFontMapperBase::GetEncodingName(m_encoding).mb_str()));
2762 #endif // wxUSE_FONTMAP
2763
2764     // check for the special case of ASCII or ISO8859-1 charset: as we have
2765     // special knowledge of it anyhow, we don't need to create a special
2766     // conversion object
2767     if ( m_encoding == wxFONTENCODING_ISO8859_1 ||
2768             m_encoding == wxFONTENCODING_DEFAULT )
2769     {
2770         // don't convert at all
2771         return NULL;
2772     }
2773
2774     // we trust OS to do conversion better than we can so try external
2775     // conversion methods first
2776     //
2777     // the full order is:
2778     //      1. OS conversion (iconv() under Unix or Win32 API)
2779     //      2. hard coded conversions for UTF
2780     //      3. wxEncodingConverter as fall back
2781
2782     // step (1)
2783 #ifdef HAVE_ICONV
2784 #if !wxUSE_FONTMAP
2785     if ( m_name )
2786 #endif // !wxUSE_FONTMAP
2787     {
2788 #if wxUSE_FONTMAP
2789         wxFontEncoding encoding(m_encoding);
2790 #endif
2791
2792         if ( m_name )
2793         {
2794             wxMBConv_iconv *conv = new wxMBConv_iconv(m_name);
2795             if ( conv->IsOk() )
2796                 return conv;
2797
2798             delete conv;
2799
2800 #if wxUSE_FONTMAP
2801             encoding =
2802                 wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
2803 #endif // wxUSE_FONTMAP
2804         }
2805 #if wxUSE_FONTMAP
2806         {
2807             const wxEncodingNameCache::iterator it = gs_nameCache.find(encoding);
2808             if ( it != gs_nameCache.end() )
2809             {
2810                 if ( it->second.empty() )
2811                     return NULL;
2812
2813                 wxMBConv_iconv *conv = new wxMBConv_iconv(it->second.ToAscii());
2814                 if ( conv->IsOk() )
2815                     return conv;
2816
2817                 delete conv;
2818             }
2819
2820             const wxChar** names = wxFontMapperBase::GetAllEncodingNames(encoding);
2821             // CS : in case this does not return valid names (eg for MacRoman)
2822             // encoding got a 'failure' entry in the cache all the same,
2823             // although it just has to be created using a different method, so
2824             // only store failed iconv creation attempts (or perhaps we
2825             // shoulnd't do this at all ?)
2826             if ( names[0] != NULL )
2827             {
2828                 for ( ; *names; ++names )
2829                 {
2830                     // FIXME-UTF8: wxFontMapperBase::GetAllEncodingNames()
2831                     //             will need changes that will obsolete this
2832                     wxString name(*names);
2833                     wxMBConv_iconv *conv = new wxMBConv_iconv(name.ToAscii());
2834                     if ( conv->IsOk() )
2835                     {
2836                         gs_nameCache[encoding] = *names;
2837                         return conv;
2838                     }
2839
2840                     delete conv;
2841                 }
2842
2843                 gs_nameCache[encoding] = _T(""); // cache the failure
2844             }
2845         }
2846 #endif // wxUSE_FONTMAP
2847     }
2848 #endif // HAVE_ICONV
2849
2850 #ifdef wxHAVE_WIN32_MB2WC
2851     {
2852 #if wxUSE_FONTMAP
2853         wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
2854                                       : new wxMBConv_win32(m_encoding);
2855         if ( conv->IsOk() )
2856             return conv;
2857
2858         delete conv;
2859 #else
2860         return NULL;
2861 #endif
2862     }
2863 #endif // wxHAVE_WIN32_MB2WC
2864
2865 #ifdef __DARWIN__
2866     {
2867         // leave UTF16 and UTF32 to the built-ins of wx
2868         if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
2869             ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
2870         {
2871 #if wxUSE_FONTMAP
2872             wxMBConv_cf *conv = m_name ? new wxMBConv_cf(m_name)
2873                                           : new wxMBConv_cf(m_encoding);
2874 #else
2875             wxMBConv_cf *conv = new wxMBConv_cf(m_encoding);
2876 #endif
2877
2878             if ( conv->IsOk() )
2879                  return conv;
2880
2881             delete conv;
2882         }
2883     }
2884 #endif // __DARWIN__
2885
2886     // step (2)
2887     wxFontEncoding enc = m_encoding;
2888 #if wxUSE_FONTMAP
2889     if ( enc == wxFONTENCODING_SYSTEM && m_name )
2890     {
2891         // use "false" to suppress interactive dialogs -- we can be called from
2892         // anywhere and popping up a dialog from here is the last thing we want to
2893         // do
2894         enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
2895     }
2896 #endif // wxUSE_FONTMAP
2897
2898     switch ( enc )
2899     {
2900         case wxFONTENCODING_UTF7:
2901              return new wxMBConvUTF7;
2902
2903         case wxFONTENCODING_UTF8:
2904              return new wxMBConvUTF8;
2905
2906         case wxFONTENCODING_UTF16BE:
2907              return new wxMBConvUTF16BE;
2908
2909         case wxFONTENCODING_UTF16LE:
2910              return new wxMBConvUTF16LE;
2911
2912         case wxFONTENCODING_UTF32BE:
2913              return new wxMBConvUTF32BE;
2914
2915         case wxFONTENCODING_UTF32LE:
2916              return new wxMBConvUTF32LE;
2917
2918         default:
2919              // nothing to do but put here to suppress gcc warnings
2920              break;
2921     }
2922
2923     // step (3)
2924 #if wxUSE_FONTMAP
2925     {
2926         wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
2927                                       : new wxMBConv_wxwin(m_encoding);
2928         if ( conv->IsOk() )
2929             return conv;
2930
2931         delete conv;
2932     }
2933 #endif // wxUSE_FONTMAP
2934
2935     // NB: This is a hack to prevent deadlock. What could otherwise happen
2936     //     in Unicode build: wxConvLocal creation ends up being here
2937     //     because of some failure and logs the error. But wxLog will try to
2938     //     attach a timestamp, for which it will need wxConvLocal (to convert
2939     //     time to char* and then wchar_t*), but that fails, tries to log the
2940     //     error, but wxLog has an (already locked) critical section that
2941     //     guards the static buffer.
2942     static bool alreadyLoggingError = false;
2943     if (!alreadyLoggingError)
2944     {
2945         alreadyLoggingError = true;
2946         wxLogError(_("Cannot convert from the charset '%s'!"),
2947                    m_name ? m_name
2948                       :
2949 #if wxUSE_FONTMAP
2950                          (const char*)wxFontMapperBase::GetEncodingDescription(m_encoding).ToAscii()
2951 #else // !wxUSE_FONTMAP
2952                          (const char*)wxString::Format(_("encoding %i"), m_encoding).ToAscii()
2953 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
2954               );
2955
2956         alreadyLoggingError = false;
2957     }
2958
2959     return NULL;
2960 }
2961
2962 void wxCSConv::CreateConvIfNeeded() const
2963 {
2964     if ( m_deferred )
2965     {
2966         wxCSConv *self = (wxCSConv *)this; // const_cast
2967
2968         // if we don't have neither the name nor the encoding, use the default
2969         // encoding for this system
2970         if ( !m_name && m_encoding == wxFONTENCODING_SYSTEM )
2971         {
2972 #if wxUSE_INTL
2973             self->m_encoding = wxLocale::GetSystemEncoding();
2974 #else
2975             // fallback to some reasonable default:
2976             self->m_encoding = wxFONTENCODING_ISO8859_1;
2977 #endif // wxUSE_INTL
2978         }
2979
2980         self->m_convReal = DoCreate();
2981         self->m_deferred = false;
2982     }
2983 }
2984
2985 bool wxCSConv::IsOk() const
2986 {
2987     CreateConvIfNeeded();
2988
2989     // special case: no convReal created for wxFONTENCODING_ISO8859_1
2990     if ( m_encoding == wxFONTENCODING_ISO8859_1 )
2991         return true; // always ok as we do it ourselves
2992
2993     // m_convReal->IsOk() is called at its own creation, so we know it must
2994     // be ok if m_convReal is non-NULL
2995     return m_convReal != NULL;
2996 }
2997
2998 size_t wxCSConv::ToWChar(wchar_t *dst, size_t dstLen,
2999                          const char *src, size_t srcLen) const
3000 {
3001     CreateConvIfNeeded();
3002
3003     if (m_convReal)
3004         return m_convReal->ToWChar(dst, dstLen, src, srcLen);
3005
3006     // latin-1 (direct)
3007     return wxMBConv::ToWChar(dst, dstLen, src, srcLen);
3008 }
3009
3010 size_t wxCSConv::FromWChar(char *dst, size_t dstLen,
3011                            const wchar_t *src, size_t srcLen) const
3012 {
3013     CreateConvIfNeeded();
3014
3015     if (m_convReal)
3016         return m_convReal->FromWChar(dst, dstLen, src, srcLen);
3017
3018     // latin-1 (direct)
3019     return wxMBConv::FromWChar(dst, dstLen, src, srcLen);
3020 }
3021
3022 size_t wxCSConv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
3023 {
3024     CreateConvIfNeeded();
3025
3026     if (m_convReal)
3027         return m_convReal->MB2WC(buf, psz, n);
3028
3029     // latin-1 (direct)
3030     size_t len = strlen(psz);
3031
3032     if (buf)
3033     {
3034         for (size_t c = 0; c <= len; c++)
3035             buf[c] = (unsigned char)(psz[c]);
3036     }
3037
3038     return len;
3039 }
3040
3041 size_t wxCSConv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
3042 {
3043     CreateConvIfNeeded();
3044
3045     if (m_convReal)
3046         return m_convReal->WC2MB(buf, psz, n);
3047
3048     // latin-1 (direct)
3049     const size_t len = wxWcslen(psz);
3050     if (buf)
3051     {
3052         for (size_t c = 0; c <= len; c++)
3053         {
3054             if (psz[c] > 0xFF)
3055                 return wxCONV_FAILED;
3056
3057             buf[c] = (char)psz[c];
3058         }
3059     }
3060     else
3061     {
3062         for (size_t c = 0; c <= len; c++)
3063         {
3064             if (psz[c] > 0xFF)
3065                 return wxCONV_FAILED;
3066         }
3067     }
3068
3069     return len;
3070 }
3071
3072 size_t wxCSConv::GetMBNulLen() const
3073 {
3074     CreateConvIfNeeded();
3075
3076     if ( m_convReal )
3077     {
3078         return m_convReal->GetMBNulLen();
3079     }
3080
3081     // otherwise, we are ISO-8859-1
3082     return 1;
3083 }
3084
3085 #if wxUSE_UNICODE_UTF8
3086 bool wxCSConv::IsUTF8() const
3087 {
3088     CreateConvIfNeeded();
3089
3090     if ( m_convReal )
3091     {
3092         return m_convReal->IsUTF8();
3093     }
3094
3095     // otherwise, we are ISO-8859-1
3096     return false;
3097 }
3098 #endif
3099
3100
3101 #if wxUSE_UNICODE
3102
3103 wxWCharBuffer wxSafeConvertMB2WX(const char *s)
3104 {
3105     if ( !s )
3106         return wxWCharBuffer();
3107
3108     wxWCharBuffer wbuf(wxConvLibc.cMB2WX(s));
3109     if ( !wbuf )
3110         wbuf = wxMBConvUTF8().cMB2WX(s);
3111     if ( !wbuf )
3112         wbuf = wxConvISO8859_1.cMB2WX(s);
3113
3114     return wbuf;
3115 }
3116
3117 wxCharBuffer wxSafeConvertWX2MB(const wchar_t *ws)
3118 {
3119     if ( !ws )
3120         return wxCharBuffer();
3121
3122     wxCharBuffer buf(wxConvLibc.cWX2MB(ws));
3123     if ( !buf )
3124         buf = wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL).cWX2MB(ws);
3125
3126     return buf;
3127 }
3128
3129 #endif // wxUSE_UNICODE
3130
3131 // ----------------------------------------------------------------------------
3132 // globals
3133 // ----------------------------------------------------------------------------
3134
3135 // NB: The reason why we create converter objects in this convoluted way,
3136 //     using a factory function instead of a global variable, is that they
3137 //     may be used at static initialization time (some of them are used by
3138 //     wxString ctors and there may be a global wxString object). In other
3139 //     words, possibly _before_ the converter global object would be
3140 //     initialized.
3141
     // NOTE(review): these names are presumably defined as macros in
     //               wx/strconv.h; they are undefined here so that the bare
     //               names can be passed to the macros below -- confirm.
3142 #undef wxConvLibc
3143 #undef wxConvUTF8
3144 #undef wxConvUTF7
3145 #undef wxConvLocal
3146 #undef wxConvISO8859_1
3147
     // WX_DEFINE_GLOBAL_CONV2(klass, impl_klass, name, ctor_args) expands to:
     //  - an exported pointer variable <name>Ptr of type klass*, which is
     //    initialized to NULL here
     //  - an exported function wxGet_<name>Ptr() returning the address of a
     //    function-local static object of type impl_klass; ctor_args is
     //    pasted verbatim after the object name in its definition
     //  - a file-static pointer gs_<name>instance initialized by calling this
     //    function
     // The expansion doesn't end with a semicolon, the macro user supplies it.
3148 #define WX_DEFINE_GLOBAL_CONV2(klass, impl_klass, name, ctor_args)      \
3149     WXDLLIMPEXP_DATA_BASE(klass*) name##Ptr = NULL;                     \
3150     WXDLLIMPEXP_BASE klass* wxGet_##name##Ptr()                         \
3151     {                                                                   \
3152         static impl_klass name##Obj ctor_args;                          \
3153         return &name##Obj;                                              \
3154     }                                                                   \
3155     /* this ensures that all global converter objects are created */    \
3156     /* by the time static initialization is done, i.e. before any */    \
3157     /* thread is launched: */                                           \
3158     static klass* gs_##name##instance = wxGet_##name##Ptr()
3159
     // Same as WX_DEFINE_GLOBAL_CONV2() but using klass both as the type of
     // the pointers and as the type of the object itself.
3160 #define WX_DEFINE_GLOBAL_CONV(klass, name, ctor_args) \
3161     WX_DEFINE_GLOBAL_CONV2(klass, klass, name, ctor_args)
3162
3163 #ifdef __WINDOWS__
         // wxConvLibc is exposed as a wxMBConv in both cases, only the class
         // of the object behind it differs: wxMBConv_win32 under Windows and
         // wxMBConvLibc elsewhere
         //
         // NOTE(review): wxEMPTY_PARAMETER_VALUE presumably expands to
         //               nothing, i.e. selects the default ctor -- confirm
3164     WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConv_win32, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
3165 #else
3166     WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConvLibc, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
3167 #endif
3168
3169 WX_DEFINE_GLOBAL_CONV(wxMBConvStrictUTF8, wxConvUTF8, wxEMPTY_PARAMETER_VALUE);
3170 WX_DEFINE_GLOBAL_CONV(wxMBConvUTF7, wxConvUTF7, wxEMPTY_PARAMETER_VALUE);
3171
     // these two are wxCSConv objects constructed from a font encoding
3172 WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvLocal, (wxFONTENCODING_SYSTEM));
3173 WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvISO8859_1, (wxFONTENCODING_ISO8859_1));
3174
     // the initial values of these pointers are obtained from the factory
     // functions generated by the macros above: the libc converter for
     // wxConvCurrent and the wxConvLocal object for wxConvUI
3175 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = wxGet_wxConvLibcPtr();
3176 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvUI = wxGet_wxConvLocalPtr();
3177
3178 #ifdef __DARWIN__
3179 // The xnu kernel always communicates file paths in decomposed UTF-8.
3180 // WARNING: Are we sure that CFString's conversion will cause decomposition?
     //
     // Unlike the converters above, this one is an ordinary file-scope static
     // object and not a function-local static returned by a factory function.
3181 static wxMBConv_cf wxConvMacUTF8DObj(wxFONTENCODING_UTF8);
3182 #endif
3183
     // The converter used for file names initially points to the UTF-8 object
     // defined above under Darwin and to the libc converter everywhere else.
3184 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName =
3185 #ifdef __DARWIN__
3186                                     &wxConvMacUTF8DObj;
3187 #else // !__DARWIN__
3188                                     wxGet_wxConvLibcPtr();
3189 #endif // __DARWIN__/!__DARWIN__
3190
3191 #else // !wxUSE_WCHAR_T
3192
3193 // FIXME-UTF8: remove this, wxUSE_WCHAR_T is required now
3194 // stand-ins in absence of wchar_t: plain wxMBConv objects (and not pointers
     // or factory functions as in the wxUSE_WCHAR_T branch); only these four
     // names are defined in this case
3195 WXDLLIMPEXP_DATA_BASE(wxMBConv) wxConvLibc,
3196                                 wxConvISO8859_1,
3197                                 wxConvLocal,
3198                                 wxConvUTF8;
3199
3200 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T