src/common/strconv.cpp

   1 /////////////////////////////////////////////////////////////////////////////
   2 // Name:        src/common/strconv.cpp
   3 // Purpose:     Unicode conversion classes
   4 // Author:      Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
   5 //              Ryan Norton, Fredrik Roubert (UTF7)
   6 // Modified by:
   7 // Created:     29/01/98
   8 // RCS-ID:      $Id$
   9 // Copyright:   (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
  10 //              (c) 2000-2003 Vadim Zeitlin
  11 //              (c) 2004 Ryan Norton, Fredrik Roubert
  12 // Licence:     wxWindows licence
  13 /////////////////////////////////////////////////////////////////////////////
  14
  15 // For compilers that support precompilation, includes "wx.h".
  16 #include "wx/wxprec.h"
  17
  18 #ifdef __BORLANDC__
  19     #pragma hdrstop
  20 #endif  //__BORLANDC__
  21
  22 #ifndef WX_PRECOMP
  23     #include "wx/intl.h"
  24     #include "wx/log.h"
  25     #include "wx/utils.h"
  26     #include "wx/hashmap.h"
  27 #endif
  28
  29 #include "wx/strconv.h"
  30
  31 #if wxUSE_WCHAR_T
  32
  33 #ifndef __WXWINCE__
  34 #include <errno.h>
  35 #endif
  36
  37 #include <ctype.h>
  38 #include <string.h>
  39 #include <stdlib.h>
  40
  41 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
  42     #include "wx/msw/private.h"
  43     #include "wx/msw/missing.h"
  44     #define wxHAVE_WIN32_MB2WC
  45 #endif
  46
  47 #ifdef __SALFORDC__
  48     #include <clib.h>
  49 #endif
  50
  51 #ifdef HAVE_ICONV
  52     #include <iconv.h>
  53     #include "wx/thread.h"
  54 #endif
  55
  56 #include "wx/encconv.h"
  57 #include "wx/fontmap.h"
  58
  59 #ifdef __DARWIN__
  60 #include "wx/mac/corefoundation/private/strconv_cf.h"
  61 #endif //def __DARWIN__
  62
  63
  64 #define TRACE_STRCONV _T("strconv")
  65
  66 // WC_UTF16 is defined only if sizeof(wchar_t) == 2, otherwise it's supposed to
  67 // be 4 bytes
  68 #if SIZEOF_WCHAR_T == 2
  69     #define WC_UTF16
  70 #endif
  71
  72
  73 // ============================================================================
  74 // implementation
  75 // ============================================================================
  76
  77 // helper function of cMB2WC(): check if n bytes at this location are all NUL
  78 static bool NotAllNULs(const char *p, size_t n)
  79 {
  80     while ( n && *p++ == '\0' )
  81         n--;
  82
  83     return n != 0;
  84 }
  85
  86 // ----------------------------------------------------------------------------
  87 // UTF-16 en/decoding to/from UCS-4 with surrogates handling
  88 // ----------------------------------------------------------------------------
  89
  90 static size_t encode_utf16(wxUint32 input, wxUint16 *output)
  91 {
  92     if (input <= 0xffff)
  93     {
  94         if (output)
  95             *output = (wxUint16) input;
  96
  97         return 1;
  98     }
  99     else if (input >= 0x110000)
 100     {
 101         return wxCONV_FAILED;
 102     }
 103     else
 104     {
 105         if (output)
 106         {
 107             *output++ = (wxUint16) ((input >> 10) + 0xd7c0);
 108             *output = (wxUint16) ((input & 0x3ff) + 0xdc00);
 109         }
 110
 111         return 2;
 112     }
 113 }
 114
 115 static size_t decode_utf16(const wxUint16* input, wxUint32& output)
 116 {
 117     if ((*input < 0xd800) || (*input > 0xdfff))
 118     {
 119         output = *input;
 120         return 1;
 121     }
 122     else if ((input[1] < 0xdc00) || (input[1] > 0xdfff))
 123     {
 124         output = *input;
 125         return wxCONV_FAILED;
 126     }
 127     else
 128     {
 129         output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
 130         return 2;
 131     }
 132 }
 133
 134 #ifdef WC_UTF16
 135     typedef wchar_t wxDecodeSurrogate_t;
 136 #else // !WC_UTF16
 137     typedef wxUint16 wxDecodeSurrogate_t;
 138 #endif // WC_UTF16/!WC_UTF16
 139
 140 // returns the next UTF-32 character from the wchar_t buffer and advances the
 141 // pointer to the character after this one
 142 //
 143 // if an invalid character is found, *pSrc is set to NULL, the caller must
 144 // check for this
 145 static wxUint32 wxDecodeSurrogate(const wxDecodeSurrogate_t **pSrc)
 146 {
 147     wxUint32 out;
 148     const size_t
 149         n = decode_utf16(wx_reinterpret_cast(const wxUint16 *, *pSrc), out);
 150     if ( n == wxCONV_FAILED )
 151         *pSrc = NULL;
 152     else
 153         *pSrc += n;
 154
 155     return out;
 156 }
 157
 158 // ----------------------------------------------------------------------------
 159 // wxMBConv
 160 // ----------------------------------------------------------------------------
 161
 162 size_t
 163 wxMBConv::ToWChar(wchar_t *dst, size_t dstLen,
 164                   const char *src, size_t srcLen) const
 165 {
 166     // although new conversion classes are supposed to implement this function
 167     // directly, the existins ones only implement the old MB2WC() and so, to
 168     // avoid to have to rewrite all conversion classes at once, we provide a
 169     // default (but not efficient) implementation of this one in terms of the
 170     // old function by copying the input to ensure that it's NUL-terminated and
 171     // then using MB2WC() to convert it
 172
 173     // the number of chars [which would be] written to dst [if it were not NULL]
 174     size_t dstWritten = 0;
 175
 176     // the number of NULs terminating this string
 177     size_t nulLen = 0;  // not really needed, but just to avoid warnings
 178
 179     // if we were not given the input size we just have to assume that the
 180     // string is properly terminated as we have no way of knowing how long it
 181     // is anyhow, but if we do have the size check whether there are enough
 182     // NULs at the end
 183     wxCharBuffer bufTmp;
 184     const char *srcEnd;
 185     if ( srcLen != wxNO_LEN )
 186     {
 187         // we need to know how to find the end of this string
 188         nulLen = GetMBNulLen();
 189         if ( nulLen == wxCONV_FAILED )
 190             return wxCONV_FAILED;
 191
 192         // if there are enough NULs we can avoid the copy
 193         if ( srcLen < nulLen || NotAllNULs(src + srcLen - nulLen, nulLen) )
 194         {
 195             // make a copy in order to properly NUL-terminate the string
 196             bufTmp = wxCharBuffer(srcLen + nulLen - 1 /* 1 will be added */);
 197             char * const p = bufTmp.data();
 198             memcpy(p, src, srcLen);
 199             for ( char *s = p + srcLen; s < p + srcLen + nulLen; s++ )
 200                 *s = '\0';
 201
 202             src = bufTmp;
 203         }
 204
 205         srcEnd = src + srcLen;
 206     }
 207     else // quit after the first loop iteration
 208     {
 209         srcEnd = NULL;
 210     }
 211
 212     for ( ;; )
 213     {
 214         // try to convert the current chunk
 215         size_t lenChunk = MB2WC(NULL, src, 0);
 216         if ( lenChunk == wxCONV_FAILED )
 217             return wxCONV_FAILED;
 218
 219         lenChunk++; // for the L'\0' at the end of this chunk
 220
 221         dstWritten += lenChunk;
 222
 223         if ( lenChunk == 1 )
 224         {
 225             // nothing left in the input string, conversion succeeded
 226             break;
 227         }
 228
 229         if ( dst )
 230         {
 231             if ( dstWritten > dstLen )
 232                 return wxCONV_FAILED;
 233
 234             if ( MB2WC(dst, src, lenChunk) == wxCONV_FAILED )
 235                 return wxCONV_FAILED;
 236
 237             dst += lenChunk;
 238         }
 239
 240         if ( !srcEnd )
 241         {
 242             // we convert just one chunk in this case as this is the entire
 243             // string anyhow
 244             break;
 245         }
 246
 247         // advance the input pointer past the end of this chunk
 248         while ( NotAllNULs(src, nulLen) )
 249         {
 250             // notice that we must skip over multiple bytes here as we suppose
 251             // that if NUL takes 2 or 4 bytes, then all the other characters do
 252             // too and so if advanced by a single byte we might erroneously
 253             // detect sequences of NUL bytes in the middle of the input
 254             src += nulLen;
 255         }
 256
 257         src += nulLen; // skipping over its terminator as well
 258
 259         // note that ">=" (and not just "==") is needed here as the terminator
 260         // we skipped just above could be inside or just after the buffer
 261         // delimited by inEnd
 262         if ( src >= srcEnd )
 263             break;
 264     }
 265
 266     return dstWritten;
 267 }
 268
 269 size_t
 270 wxMBConv::FromWChar(char *dst, size_t dstLen,
 271                     const wchar_t *src, size_t srcLen) const
 272 {
 273     // the number of chars [which would be] written to dst [if it were not NULL]
 274     size_t dstWritten = 0;
 275
 276     // make a copy of the input string unless it is already properly
 277     // NUL-terminated
 278     //
 279     // if we don't know its length we have no choice but to assume that it is,
 280     // indeed, properly terminated
 281     wxWCharBuffer bufTmp;
 282     if ( srcLen == wxNO_LEN )
 283     {
 284         srcLen = wxWcslen(src) + 1;
 285     }
 286     else if ( srcLen != 0 && src[srcLen - 1] != L'\0' )
 287     {
 288         // make a copy in order to properly NUL-terminate the string
 289         bufTmp = wxWCharBuffer(srcLen);
 290         memcpy(bufTmp.data(), src, srcLen * sizeof(wchar_t));
 291         src = bufTmp;
 292     }
 293
 294     const size_t lenNul = GetMBNulLen();
 295     for ( const wchar_t * const srcEnd = src + srcLen;
 296           src < srcEnd;
 297           src += wxWcslen(src) + 1 /* skip L'\0' too */ )
 298     {
 299         // try to convert the current chunk
 300         size_t lenChunk = WC2MB(NULL, src, 0);
 301
 302         if ( lenChunk == wxCONV_FAILED )
 303             return wxCONV_FAILED;
 304
 305         lenChunk += lenNul;
 306         dstWritten += lenChunk;
 307
 308         if ( dst )
 309         {
 310             if ( dstWritten > dstLen )
 311                 return wxCONV_FAILED;
 312
 313             if ( WC2MB(dst, src, lenChunk) == wxCONV_FAILED )
 314                 return wxCONV_FAILED;
 315
 316             dst += lenChunk;
 317         }
 318     }
 319
 320     return dstWritten;
 321 }
 322
 323 size_t wxMBConv::MB2WC(wchar_t *outBuff, const char *inBuff, size_t outLen) const
 324 {
 325     size_t rc = ToWChar(outBuff, outLen, inBuff);
 326     if ( rc != wxCONV_FAILED )
 327     {
 328         // ToWChar() returns the buffer length, i.e. including the trailing
 329         // NUL, while this method doesn't take it into account
 330         rc--;
 331     }
 332
 333     return rc;
 334 }
 335
 336 size_t wxMBConv::WC2MB(char *outBuff, const wchar_t *inBuff, size_t outLen) const
 337 {
 338     size_t rc = FromWChar(outBuff, outLen, inBuff);
 339     if ( rc != wxCONV_FAILED )
 340     {
 341         rc -= GetMBNulLen();
 342     }
 343
 344     return rc;
 345 }
 346
 347 wxMBConv::~wxMBConv()
 348 {
 349     // nothing to do here (necessary for Darwin linking probably)
 350 }
 351
 352 const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
 353 {
 354     if ( psz )
 355     {
 356         // calculate the length of the buffer needed first
 357         const size_t nLen = ToWChar(NULL, 0, psz);
 358         if ( nLen != wxCONV_FAILED )
 359         {
 360             // now do the actual conversion
 361             wxWCharBuffer buf(nLen - 1 /* +1 added implicitly */);
 362
 363             // +1 for the trailing NULL
 364             if ( ToWChar(buf.data(), nLen, psz) != wxCONV_FAILED )
 365                 return buf;
 366         }
 367     }
 368
 369     return wxWCharBuffer();
 370 }
 371
 372 const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
 373 {
 374     if ( pwz )
 375     {
 376         const size_t nLen = FromWChar(NULL, 0, pwz);
 377         if ( nLen != wxCONV_FAILED )
 378         {
 379             wxCharBuffer buf(nLen - 1);
 380             if ( FromWChar(buf.data(), nLen, pwz) != wxCONV_FAILED )
 381                 return buf;
 382         }
 383     }
 384
 385     return wxCharBuffer();
 386 }
 387
 388 const wxWCharBuffer
 389 wxMBConv::cMB2WC(const char *inBuff, size_t inLen, size_t *outLen) const
 390 {
 391     const size_t dstLen = ToWChar(NULL, 0, inBuff, inLen);
 392     if ( dstLen != wxCONV_FAILED )
 393     {
 394         wxWCharBuffer wbuf(dstLen - 1);
 395         if ( ToWChar(wbuf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
 396         {
 397             if ( outLen )
 398             {
 399                 *outLen = dstLen;
 400                 if ( wbuf[dstLen - 1] == L'\0' )
 401                     (*outLen)--;
 402             }
 403
 404             return wbuf;
 405         }
 406     }
 407
 408     if ( outLen )
 409         *outLen = 0;
 410
 411     return wxWCharBuffer();
 412 }
 413
 414 const wxCharBuffer
 415 wxMBConv::cWC2MB(const wchar_t *inBuff, size_t inLen, size_t *outLen) const
 416 {
 417     size_t dstLen = FromWChar(NULL, 0, inBuff, inLen);
 418     if ( dstLen != wxCONV_FAILED )
 419     {
 420         // special case of empty input: can't allocate 0 size buffer below as
 421         // wxCharBuffer insists on NUL-terminating it
 422         wxCharBuffer buf(dstLen ? dstLen - 1 : 1);
 423         if ( FromWChar(buf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
 424         {
 425             if ( outLen )
 426             {
 427                 *outLen = dstLen;
 428
 429                 const size_t nulLen = GetMBNulLen();
 430                 if ( dstLen >= nulLen &&
 431                         !NotAllNULs(buf.data() + dstLen - nulLen, nulLen) )
 432                 {
 433                     // in this case the output is NUL-terminated and we're not
 434                     // supposed to count NUL
 435                     *outLen -= nulLen;
 436                 }
 437             }
 438
 439             return buf;
 440         }
 441     }
 442
 443     if ( outLen )
 444         *outLen = 0;
 445
 446     return wxCharBuffer();
 447 }
 448
 449 // ----------------------------------------------------------------------------
 450 // wxMBConvLibc
 451 // ----------------------------------------------------------------------------
 452
 453 size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 454 {
 455     return wxMB2WC(buf, psz, n);
 456 }
 457
 458 size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 459 {
 460     return wxWC2MB(buf, psz, n);
 461 }
 462
 463 // ----------------------------------------------------------------------------
 464 // wxConvBrokenFileNames
 465 // ----------------------------------------------------------------------------
 466
 467 #ifdef __UNIX__
 468
 469 wxConvBrokenFileNames::wxConvBrokenFileNames(const wxString& charset)
 470 {
 471     if ( wxStricmp(charset, _T("UTF-8")) == 0 ||
 472          wxStricmp(charset, _T("UTF8")) == 0  )
 473         m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_PUA);
 474     else
 475         m_conv = new wxCSConv(charset);
 476 }
 477
 478 #endif // __UNIX__
 479
 480 // ----------------------------------------------------------------------------
 481 // UTF-7
 482 // ----------------------------------------------------------------------------
 483
 484 // Implementation (C) 2004 Fredrik Roubert
 485
 486 //
 487 // BASE64 decoding table
 488 //
 489 static const unsigned char utf7unb64[] =
 490 {
 491     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 492     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 493     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 494     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 495     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 496     0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
 497     0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b,
 498     0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 499     0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
 500     0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
 501     0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
 502     0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
 503     0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
 504     0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
 505     0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30,
 506     0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
 507     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 508     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 509     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 510     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 511     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 512     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 513     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 514     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 515     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 516     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 517     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 518     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 519     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 520     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 521     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 522     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
 523 };
 524
 525 size_t wxMBConvUTF7::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 526 {
 527     size_t len = 0;
 528
 529     while ( *psz && (!buf || (len < n)) )
 530     {
 531         unsigned char cc = *psz++;
 532         if (cc != '+')
 533         {
 534             // plain ASCII char
 535             if (buf)
 536                 *buf++ = cc;
 537             len++;
 538         }
 539         else if (*psz == '-')
 540         {
 541             // encoded plus sign
 542             if (buf)
 543                 *buf++ = cc;
 544             len++;
 545             psz++;
 546         }
 547         else // start of BASE64 encoded string
 548         {
 549             bool lsb, ok;
 550             unsigned int d, l;
 551             for ( ok = lsb = false, d = 0, l = 0;
 552                   (cc = utf7unb64[(unsigned char)*psz]) != 0xff;
 553                   psz++ )
 554             {
 555                 d <<= 6;
 556                 d += cc;
 557                 for (l += 6; l >= 8; lsb = !lsb)
 558                 {
 559                     unsigned char c = (unsigned char)((d >> (l -= 8)) % 256);
 560                     if (lsb)
 561                     {
 562                         if (buf)
 563                             *buf++ |= c;
 564                         len ++;
 565                     }
 566                     else
 567                     {
 568                         if (buf)
 569                             *buf = (wchar_t)(c << 8);
 570                     }
 571
 572                     ok = true;
 573                 }
 574             }
 575
 576             if ( !ok )
 577             {
 578                 // in valid UTF7 we should have valid characters after '+'
 579                 return wxCONV_FAILED;
 580             }
 581
 582             if (*psz == '-')
 583                 psz++;
 584         }
 585     }
 586
 587     if ( buf && (len < n) )
 588         *buf = '\0';
 589
 590     return len;
 591 }
 592
 593 //
 594 // BASE64 encoding table
 595 //
 596 static const unsigned char utf7enb64[] =
 597 {
 598     'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
 599     'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
 600     'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
 601     'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
 602     'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
 603     'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
 604     'w', 'x', 'y', 'z', '0', '1', '2', '3',
 605     '4', '5', '6', '7', '8', '9', '+', '/'
 606 };
 607
 608 //
 609 // UTF-7 encoding table
 610 //
 611 // 0 - Set D (directly encoded characters)
 612 // 1 - Set O (optional direct characters)
 613 // 2 - whitespace characters (optional)
 614 // 3 - special characters
 615 //
 616 static const unsigned char utf7encode[128] =
 617 {
 618     3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
 619     3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
 620     2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
 621     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
 622     1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 623     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
 624     1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 625     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
 626 };
 627
 628 size_t wxMBConvUTF7::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 629 {
 630     size_t len = 0;
 631
 632     while (*psz && ((!buf) || (len < n)))
 633     {
 634         wchar_t cc = *psz++;
 635         if (cc < 0x80 && utf7encode[cc] < 1)
 636         {
 637             // plain ASCII char
 638             if (buf)
 639                 *buf++ = (char)cc;
 640
 641             len++;
 642         }
 643 #ifndef WC_UTF16
 644         else if (((wxUint32)cc) > 0xffff)
 645         {
 646             // no surrogate pair generation (yet?)
 647             return wxCONV_FAILED;
 648         }
 649 #endif
 650         else
 651         {
 652             if (buf)
 653                 *buf++ = '+';
 654
 655             len++;
 656             if (cc != '+')
 657             {
 658                 // BASE64 encode string
 659                 unsigned int lsb, d, l;
 660                 for (d = 0, l = 0; /*nothing*/; psz++)
 661                 {
 662                     for (lsb = 0; lsb < 2; lsb ++)
 663                     {
 664                         d <<= 8;
 665                         d += lsb ? cc & 0xff : (cc & 0xff00) >> 8;
 666
 667                         for (l += 8; l >= 6; )
 668                         {
 669                             l -= 6;
 670                             if (buf)
 671                                 *buf++ = utf7enb64[(d >> l) % 64];
 672                             len++;
 673                         }
 674                     }
 675
 676                     cc = *psz;
 677                     if (!(cc) || (cc < 0x80 && utf7encode[cc] < 1))
 678                         break;
 679                 }
 680
 681                 if (l != 0)
 682                 {
 683                     if (buf)
 684                         *buf++ = utf7enb64[((d % 16) << (6 - l)) % 64];
 685
 686                     len++;
 687                 }
 688             }
 689
 690             if (buf)
 691                 *buf++ = '-';
 692             len++;
 693         }
 694     }
 695
 696     if (buf && (len < n))
 697         *buf = 0;
 698
 699     return len;
 700 }
 701
 702 // ----------------------------------------------------------------------------
 703 // UTF-8
 704 // ----------------------------------------------------------------------------
 705
 706 static const wxUint32 utf8_max[]=
 707     { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
 708
 709 // boundaries of the private use area we use to (temporarily) remap invalid
 710 // characters invalid in a UTF-8 encoded string
 711 const wxUint32 wxUnicodePUA = 0x100000;
 712 const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
 713
 714 // this table gives the length of the UTF-8 encoding from its first character:
 715 const unsigned char tableUtf8Lengths[256] = {
 716     // single-byte sequences (ASCII):
 717     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 00..0F
 718     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 10..1F
 719     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 20..2F
 720     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 30..3F
 721     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 40..4F
 722     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 50..5F
 723     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 60..6F
 724     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 70..7F
 725
 726     // these are invalid:
 727     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 80..8F
 728     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 90..9F
 729     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // A0..AF
 730     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // B0..BF
 731     0, 0,                                            // C0,C1
 732
 733     // two-byte sequences:
 734           2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // C2..CF
 735     2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // D0..DF
 736
 737     // three-byte sequences:
 738     3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,  // E0..EF
 739
 740     // four-byte sequences:
 741     4, 4, 4, 4, 4,                                   // F0..F4
 742
 743     // these are invalid again (5- or 6-byte
 744     // sequences and sequences for code points
 745     // above U+10FFFF, as restricted by RFC 3629):
 746                    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0   // F5..FF
 747 };
 748
 749 size_t
 750 wxMBConvStrictUTF8::ToWChar(wchar_t *dst, size_t dstLen,
 751                             const char *src, size_t srcLen) const
 752 {
 753     wchar_t *out = dstLen ? dst : NULL;
 754     size_t written = 0;
 755
 756     if ( srcLen == wxNO_LEN )
 757         srcLen = strlen(src) + 1;
 758
 759     for ( const char *p = src; ; p++ )
 760     {
 761         if ( !(srcLen == wxNO_LEN ? *p : srcLen) )
 762         {
 763             // all done successfully, just add the trailing NULL if we are not
 764             // using explicit length
 765             if ( srcLen == wxNO_LEN )
 766             {
 767                 if ( out )
 768                 {
 769                     if ( !dstLen )
 770                         break;
 771
 772                     *out = L'\0';
 773                 }
 774
 775                 written++;
 776             }
 777
 778             return written;
 779         }
 780
 781         if ( out && !dstLen-- )
 782             break;
 783
 784         wxUint32 code;
 785         unsigned char c = *p;
 786
 787         if ( c < 0x80 )
 788         {
 789             if ( srcLen == 0 ) // the test works for wxNO_LEN too
 790                 break;
 791
 792             if ( srcLen != wxNO_LEN )
 793                 srcLen--;
 794
 795             code = c;
 796         }
 797         else
 798         {
 799             unsigned len = tableUtf8Lengths[c];
 800             if ( !len )
 801                 break;
 802
 803             if ( srcLen < len ) // the test works for wxNO_LEN too
 804                 break;
 805
 806             if ( srcLen != wxNO_LEN )
 807                 srcLen -= len;
 808
 809             //   Char. number range   |        UTF-8 octet sequence
 810             //      (hexadecimal)     |              (binary)
 811             //  ----------------------+----------------------------------------
 812             //  0000 0000 - 0000 007F | 0xxxxxxx
 813             //  0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
 814             //  0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
 815             //  0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 816             //
 817             //  Code point value is stored in bits marked with 'x',
 818             //  lowest-order bit of the value on the right side in the diagram
 819             //  above.                                         (from RFC 3629)
 820
 821             // mask to extract lead byte's value ('x' bits above), by sequence
 822             // length:
 823             static const unsigned char leadValueMask[] = { 0x7F, 0x1F, 0x0F, 0x07 };
 824
 825             // mask and value of lead byte's most significant bits, by length:
 826             static const unsigned char leadMarkerMask[] = { 0x80, 0xE0, 0xF0, 0xF8 };
 827             static const unsigned char leadMarkerVal[] = { 0x00, 0xC0, 0xE0, 0xF0 };
 828
 829             len--; // it's more convenient to work with 0-based length here
 830
 831             // extract the lead byte's value bits:
 832             if ( (c & leadMarkerMask[len]) != leadMarkerVal[len] )
 833                 break;
 834
 835             code = c & leadValueMask[len];
 836
 837             // all remaining bytes, if any, are handled in the same way
 838             // regardless of sequence's length:
 839             for ( ; len; --len )
 840             {
 841                 c = *++p;
 842                 if ( (c & 0xC0) != 0x80 )
 843                     return wxCONV_FAILED;
 844
 845                 code <<= 6;
 846                 code |= c & 0x3F;
 847             }
 848         }
 849
 850 #ifdef WC_UTF16
 851         // cast is ok because wchar_t == wxUint16 if WC_UTF16
 852         if ( encode_utf16(code, (wxUint16 *)out) == 2 )
 853         {
 854             if ( out )
 855                 out++;
 856             written++;
 857         }
 858 #else // !WC_UTF16
 859         if ( out )
 860             *out = code;
 861 #endif // WC_UTF16/!WC_UTF16
 862
 863         if ( out )
 864             out++;
 865
 866         written++;
 867     }
 868
 869     return wxCONV_FAILED;
 870 }
 871
 872 size_t
 873 wxMBConvStrictUTF8::FromWChar(char *dst, size_t dstLen,
 874                               const wchar_t *src, size_t srcLen) const
 875 {
 876     char *out = dstLen ? dst : NULL;
 877     size_t written = 0;
 878
 879     for ( const wchar_t *wp = src; ; wp++ )
 880     {
 881         if ( !(srcLen == wxNO_LEN ? *wp : srcLen--) )
 882         {
 883             // all done successfully, just add the trailing NULL if we are not
 884             // using explicit length
 885             if ( srcLen == wxNO_LEN )
 886             {
 887                 if ( out )
 888                 {
 889                     if ( !dstLen )
 890                         break;
 891
 892                     *out = '\0';
 893                 }
 894
 895                 written++;
 896             }
 897
 898             return written;
 899         }
 900
 901
 902         wxUint32 code;
 903 #ifdef WC_UTF16
 904         // cast is ok for WC_UTF16
 905         if ( decode_utf16((const wxUint16 *)wp, code) == 2 )
 906         {
 907             // skip the next char too as we decoded a surrogate
 908             wp++;
 909         }
 910 #else // wchar_t is UTF-32
 911         code = *wp & 0x7fffffff;
 912 #endif
 913
 914         unsigned len;
 915         if ( code <= 0x7F )
 916         {
 917             len = 1;
 918             if ( out )
 919             {
 920                 if ( dstLen < len )
 921                     break;
 922
 923                 out[0] = (char)code;
 924             }
 925         }
 926         else if ( code <= 0x07FF )
 927         {
 928             len = 2;
 929             if ( out )
 930             {
 931                 if ( dstLen < len )
 932                     break;
 933
 934                 // NB: this line takes 6 least significant bits, encodes them as
 935                 // 10xxxxxx and discards them so that the next byte can be encoded:
 936                 out[1] = 0x80 | (code & 0x3F);  code >>= 6;
 937                 out[0] = 0xC0 | code;
 938             }
 939         }
 940         else if ( code < 0xFFFF )
 941         {
 942             len = 3;
 943             if ( out )
 944             {
 945                 if ( dstLen < len )
 946                     break;
 947
 948                 out[2] = 0x80 | (code & 0x3F);  code >>= 6;
 949                 out[1] = 0x80 | (code & 0x3F);  code >>= 6;
 950                 out[0] = 0xE0 | code;
 951             }
 952         }
 953         else if ( code <= 0x10FFFF )
 954         {
 955             len = 4;
 956             if ( out )
 957             {
 958                 if ( dstLen < len )
 959                     break;
 960
 961                 out[3] = 0x80 | (code & 0x3F);  code >>= 6;
 962                 out[2] = 0x80 | (code & 0x3F);  code >>= 6;
 963                 out[1] = 0x80 | (code & 0x3F);  code >>= 6;
 964                 out[0] = 0xF0 | code;
 965             }
 966         }
 967         else
 968         {
 969             wxFAIL_MSG( _T("trying to encode undefined Unicode character") );
 970             break;
 971         }
 972
 973         if ( out )
 974         {
 975             out += len;
 976             dstLen -= len;
 977         }
 978
 979         written += len;
 980     }
 981
 982     // we only get here if an error occurs during decoding
 983     return wxCONV_FAILED;
 984 }
 985
 986 size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 987 {
 988     if ( m_options == MAP_INVALID_UTF8_NOT )
 989         return wxMBConvStrictUTF8::MB2WC(buf, psz, n);
 990
 991     size_t len = 0;
 992
 993     while (*psz && ((!buf) || (len < n)))
 994     {
 995         const char *opsz = psz;
 996         bool invalid = false;
 997         unsigned char cc = *psz++, fc = cc;
 998         unsigned cnt;
 999         for (cnt = 0; fc & 0x80; cnt++)
1000             fc <<= 1;
1001
1002         if (!cnt)
1003         {
1004             // plain ASCII char
1005             if (buf)
1006                 *buf++ = cc;
1007             len++;
1008
1009             // escape the escape character for octal escapes
1010             if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
1011                     && cc == '\\' && (!buf || len < n))
1012             {
1013                 if (buf)
1014                     *buf++ = cc;
1015                 len++;
1016             }
1017         }
1018         else
1019         {
1020             cnt--;
1021             if (!cnt)
1022             {
1023                 // invalid UTF-8 sequence
1024                 invalid = true;
1025             }
1026             else
1027             {
1028                 unsigned ocnt = cnt - 1;
1029                 wxUint32 res = cc & (0x3f >> cnt);
1030                 while (cnt--)
1031                 {
1032                     cc = *psz;
1033                     if ((cc & 0xC0) != 0x80)
1034                     {
1035                         // invalid UTF-8 sequence
1036                         invalid = true;
1037                         break;
1038                     }
1039
1040                     psz++;
1041                     res = (res << 6) | (cc & 0x3f);
1042                 }
1043
1044                 if (invalid || res <= utf8_max[ocnt])
1045                 {
1046                     // illegal UTF-8 encoding
1047                     invalid = true;
1048                 }
1049                 else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
1050                         res >= wxUnicodePUA && res < wxUnicodePUAEnd)
1051                 {
1052                     // if one of our PUA characters turns up externally
1053                     // it must also be treated as an illegal sequence
1054                     // (a bit like you have to escape an escape character)
1055                     invalid = true;
1056                 }
1057                 else
1058                 {
1059 #ifdef WC_UTF16
1060                     // cast is ok because wchar_t == wxUint16 if WC_UTF16
1061                     size_t pa = encode_utf16(res, (wxUint16 *)buf);
1062                     if (pa == wxCONV_FAILED)
1063                     {
1064                         invalid = true;
1065                     }
1066                     else
1067                     {
1068                         if (buf)
1069                             buf += pa;
1070                         len += pa;
1071                     }
1072 #else // !WC_UTF16
1073                     if (buf)
1074                         *buf++ = (wchar_t)res;
1075                     len++;
1076 #endif // WC_UTF16/!WC_UTF16
1077                 }
1078             }
1079
1080             if (invalid)
1081             {
1082                 if (m_options & MAP_INVALID_UTF8_TO_PUA)
1083                 {
1084                     while (opsz < psz && (!buf || len < n))
1085                     {
1086 #ifdef WC_UTF16
1087                         // cast is ok because wchar_t == wxUuint16 if WC_UTF16
1088                         size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
1089                         wxASSERT(pa != wxCONV_FAILED);
1090                         if (buf)
1091                             buf += pa;
1092                         opsz++;
1093                         len += pa;
1094 #else
1095                         if (buf)
1096                             *buf++ = (wchar_t)(wxUnicodePUA + (unsigned char)*opsz);
1097                         opsz++;
1098                         len++;
1099 #endif
1100                     }
1101                 }
1102                 else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
1103                 {
1104                     while (opsz < psz && (!buf || len < n))
1105                     {
1106                         if ( buf && len + 3 < n )
1107                         {
1108                             unsigned char on = *opsz;
1109                             *buf++ = L'\\';
1110                             *buf++ = (wchar_t)( L'0' + on / 0100 );
1111                             *buf++ = (wchar_t)( L'0' + (on % 0100) / 010 );
1112                             *buf++ = (wchar_t)( L'0' + on % 010 );
1113                         }
1114
1115                         opsz++;
1116                         len += 4;
1117                     }
1118                 }
1119                 else // MAP_INVALID_UTF8_NOT
1120                 {
1121                     return wxCONV_FAILED;
1122                 }
1123             }
1124         }
1125     }
1126
1127     if (buf && (len < n))
1128         *buf = 0;
1129
1130     return len;
1131 }
1132
// Returns true if wch is one of the octal digits '0'..'7'.
static inline bool isoctal(wchar_t wch)
{
    return !(wch < L'0' || wch > L'7');
}
1137
1138 size_t wxMBConvUTF8::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1139 {
1140     if ( m_options == MAP_INVALID_UTF8_NOT )
1141         return wxMBConvStrictUTF8::WC2MB(buf, psz, n);
1142
1143     size_t len = 0;
1144
1145     while (*psz && ((!buf) || (len < n)))
1146     {
1147         wxUint32 cc;
1148
1149 #ifdef WC_UTF16
1150         // cast is ok for WC_UTF16
1151         size_t pa = decode_utf16((const wxUint16 *)psz, cc);
1152         psz += (pa == wxCONV_FAILED) ? 1 : pa;
1153 #else
1154         cc = (*psz++) & 0x7fffffff;
1155 #endif
1156
1157         if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
1158                 && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
1159         {
1160             if (buf)
1161                 *buf++ = (char)(cc - wxUnicodePUA);
1162             len++;
1163         }
1164         else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL)
1165                     && cc == L'\\' && psz[0] == L'\\' )
1166         {
1167             if (buf)
1168                 *buf++ = (char)cc;
1169             psz++;
1170             len++;
1171         }
1172         else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
1173                     cc == L'\\' &&
1174                         isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
1175         {
1176             if (buf)
1177             {
1178                 *buf++ = (char) ((psz[0] - L'0') * 0100 +
1179                                  (psz[1] - L'0') * 010 +
1180                                  (psz[2] - L'0'));
1181             }
1182
1183             psz += 3;
1184             len++;
1185         }
1186         else
1187         {
1188             unsigned cnt;
1189             for (cnt = 0; cc > utf8_max[cnt]; cnt++)
1190             {
1191             }
1192
1193             if (!cnt)
1194             {
1195                 // plain ASCII char
1196                 if (buf)
1197                     *buf++ = (char) cc;
1198                 len++;
1199             }
1200             else
1201             {
1202                 len += cnt + 1;
1203                 if (buf)
1204                 {
1205                     *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
1206                     while (cnt--)
1207                         *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
1208                 }
1209             }
1210         }
1211     }
1212
1213     if (buf && (len < n))
1214         *buf = 0;
1215
1216     return len;
1217 }
1218
1219 // ============================================================================
1220 // UTF-16
1221 // ============================================================================
1222
// "straight" is the UTF-16 variant selected for the native byte order (BE if
// WORDS_BIGENDIAN is defined, LE otherwise), "swap" is the other one
#ifndef WORDS_BIGENDIAN
    #define wxMBConvUTF16straight wxMBConvUTF16LE
    #define wxMBConvUTF16swap     wxMBConvUTF16BE
#else // big endian
    #define wxMBConvUTF16straight wxMBConvUTF16BE
    #define wxMBConvUTF16swap     wxMBConvUTF16LE
#endif
1230
1231 /* static */
1232 size_t wxMBConvUTF16Base::GetLength(const char *src, size_t srcLen)
1233 {
1234     if ( srcLen == wxNO_LEN )
1235     {
1236         // count the number of bytes in input, including the trailing NULs
1237         const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1238         for ( srcLen = 1; *inBuff++; srcLen++ )
1239             ;
1240
1241         srcLen *= BYTES_PER_CHAR;
1242     }
1243     else // we already have the length
1244     {
1245         // we can only convert an entire number of UTF-16 characters
1246         if ( srcLen % BYTES_PER_CHAR )
1247             return wxCONV_FAILED;
1248     }
1249
1250     return srcLen;
1251 }
1252
1253 // case when in-memory representation is UTF-16 too
1254 #ifdef WC_UTF16
1255
1256 // ----------------------------------------------------------------------------
1257 // conversions without endianness change
1258 // ----------------------------------------------------------------------------
1259
1260 size_t
1261 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1262                                const char *src, size_t srcLen) const
1263 {
1264     // set up the scene for using memcpy() (which is presumably more efficient
1265     // than copying the bytes one by one)
1266     srcLen = GetLength(src, srcLen);
1267     if ( srcLen == wxNO_LEN )
1268         return wxCONV_FAILED;
1269
1270     const size_t inLen = srcLen / BYTES_PER_CHAR;
1271     if ( dst )
1272     {
1273         if ( dstLen < inLen )
1274             return wxCONV_FAILED;
1275
1276         memcpy(dst, src, srcLen);
1277     }
1278
1279     return inLen;
1280 }
1281
1282 size_t
1283 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1284                                  const wchar_t *src, size_t srcLen) const
1285 {
1286     if ( srcLen == wxNO_LEN )
1287         srcLen = wxWcslen(src) + 1;
1288
1289     srcLen *= BYTES_PER_CHAR;
1290
1291     if ( dst )
1292     {
1293         if ( dstLen < srcLen )
1294             return wxCONV_FAILED;
1295
1296         memcpy(dst, src, srcLen);
1297     }
1298
1299     return srcLen;
1300 }
1301
1302 // ----------------------------------------------------------------------------
1303 // endian-reversing conversions
1304 // ----------------------------------------------------------------------------
1305
1306 size_t
1307 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1308                            const char *src, size_t srcLen) const
1309 {
1310     srcLen = GetLength(src, srcLen);
1311     if ( srcLen == wxNO_LEN )
1312         return wxCONV_FAILED;
1313
1314     srcLen /= BYTES_PER_CHAR;
1315
1316     if ( dst )
1317     {
1318         if ( dstLen < srcLen )
1319             return wxCONV_FAILED;
1320
1321         const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1322         for ( size_t n = 0; n < srcLen; n++, inBuff++ )
1323         {
1324             *dst++ = wxUINT16_SWAP_ALWAYS(*inBuff);
1325         }
1326     }
1327
1328     return srcLen;
1329 }
1330
1331 size_t
1332 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1333                              const wchar_t *src, size_t srcLen) const
1334 {
1335     if ( srcLen == wxNO_LEN )
1336         srcLen = wxWcslen(src) + 1;
1337
1338     srcLen *= BYTES_PER_CHAR;
1339
1340     if ( dst )
1341     {
1342         if ( dstLen < srcLen )
1343             return wxCONV_FAILED;
1344
1345         wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1346         for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
1347         {
1348             *outBuff++ = wxUINT16_SWAP_ALWAYS(*src);
1349         }
1350     }
1351
1352     return srcLen;
1353 }
1354
1355 #else // !WC_UTF16: wchar_t is UTF-32
1356
1357 // ----------------------------------------------------------------------------
1358 // conversions without endianness change
1359 // ----------------------------------------------------------------------------
1360
1361 size_t
1362 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1363                                const char *src, size_t srcLen) const
1364 {
1365     srcLen = GetLength(src, srcLen);
1366     if ( srcLen == wxNO_LEN )
1367         return wxCONV_FAILED;
1368
1369     const size_t inLen = srcLen / BYTES_PER_CHAR;
1370     if ( !dst )
1371     {
1372         // optimization: return maximal space which could be needed for this
1373         // string even if the real size could be smaller if the buffer contains
1374         // any surrogates
1375         return inLen;
1376     }
1377
1378     size_t outLen = 0;
1379     const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1380     for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1381     {
1382         const wxUint32 ch = wxDecodeSurrogate(&inBuff);
1383         if ( !inBuff )
1384             return wxCONV_FAILED;
1385
1386         if ( ++outLen > dstLen )
1387             return wxCONV_FAILED;
1388
1389         *dst++ = ch;
1390     }
1391
1392
1393     return outLen;
1394 }
1395
1396 size_t
1397 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1398                                  const wchar_t *src, size_t srcLen) const
1399 {
1400     if ( srcLen == wxNO_LEN )
1401         srcLen = wxWcslen(src) + 1;
1402
1403     size_t outLen = 0;
1404     wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1405     for ( size_t n = 0; n < srcLen; n++ )
1406     {
1407         wxUint16 cc[2];
1408         const size_t numChars = encode_utf16(*src++, cc);
1409         if ( numChars == wxCONV_FAILED )
1410             return wxCONV_FAILED;
1411
1412         outLen += numChars * BYTES_PER_CHAR;
1413         if ( outBuff )
1414         {
1415             if ( outLen > dstLen )
1416                 return wxCONV_FAILED;
1417
1418             *outBuff++ = cc[0];
1419             if ( numChars == 2 )
1420             {
1421                 // second character of a surrogate
1422                 *outBuff++ = cc[1];
1423             }
1424         }
1425     }
1426
1427     return outLen;
1428 }
1429
1430 // ----------------------------------------------------------------------------
1431 // endian-reversing conversions
1432 // ----------------------------------------------------------------------------
1433
1434 size_t
1435 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1436                            const char *src, size_t srcLen) const
1437 {
1438     srcLen = GetLength(src, srcLen);
1439     if ( srcLen == wxNO_LEN )
1440         return wxCONV_FAILED;
1441
1442     const size_t inLen = srcLen / BYTES_PER_CHAR;
1443     if ( !dst )
1444     {
1445         // optimization: return maximal space which could be needed for this
1446         // string even if the real size could be smaller if the buffer contains
1447         // any surrogates
1448         return inLen;
1449     }
1450
1451     size_t outLen = 0;
1452     const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1453     for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1454     {
1455         wxUint32 ch;
1456         wxUint16 tmp[2];
1457
1458         tmp[0] = wxUINT16_SWAP_ALWAYS(*inBuff);
1459         inBuff++;
1460         tmp[1] = wxUINT16_SWAP_ALWAYS(*inBuff);
1461
1462         const size_t numChars = decode_utf16(tmp, ch);
1463         if ( numChars == wxCONV_FAILED )
1464             return wxCONV_FAILED;
1465
1466         if ( numChars == 2 )
1467             inBuff++;
1468
1469         if ( ++outLen > dstLen )
1470             return wxCONV_FAILED;
1471
1472         *dst++ = ch;
1473     }
1474
1475
1476     return outLen;
1477 }
1478
1479 size_t
1480 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1481                              const wchar_t *src, size_t srcLen) const
1482 {
1483     if ( srcLen == wxNO_LEN )
1484         srcLen = wxWcslen(src) + 1;
1485
1486     size_t outLen = 0;
1487     wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1488     for ( const wchar_t *srcEnd = src + srcLen; src < srcEnd; src++ )
1489     {
1490         wxUint16 cc[2];
1491         const size_t numChars = encode_utf16(*src, cc);
1492         if ( numChars == wxCONV_FAILED )
1493             return wxCONV_FAILED;
1494
1495         outLen += numChars * BYTES_PER_CHAR;
1496         if ( outBuff )
1497         {
1498             if ( outLen > dstLen )
1499                 return wxCONV_FAILED;
1500
1501             *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[0]);
1502             if ( numChars == 2 )
1503             {
1504                 // second character of a surrogate
1505                 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[1]);
1506             }
1507         }
1508     }
1509
1510     return outLen;
1511 }
1512
1513 #endif // WC_UTF16/!WC_UTF16
1514
1515
1516 // ============================================================================
1517 // UTF-32
1518 // ============================================================================
1519
1520 #ifdef WORDS_BIGENDIAN
1521     #define wxMBConvUTF32straight  wxMBConvUTF32BE
1522     #define wxMBConvUTF32swap      wxMBConvUTF32LE
1523 #else
1524     #define wxMBConvUTF32swap      wxMBConvUTF32BE
1525     #define wxMBConvUTF32straight  wxMBConvUTF32LE
1526 #endif
1527
1528
1529 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
1530 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
1531
1532 /* static */
1533 size_t wxMBConvUTF32Base::GetLength(const char *src, size_t srcLen)
1534 {
1535     if ( srcLen == wxNO_LEN )
1536     {
1537         // count the number of bytes in input, including the trailing NULs
1538         const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1539         for ( srcLen = 1; *inBuff++; srcLen++ )
1540             ;
1541
1542         srcLen *= BYTES_PER_CHAR;
1543     }
1544     else // we already have the length
1545     {
1546         // we can only convert an entire number of UTF-32 characters
1547         if ( srcLen % BYTES_PER_CHAR )
1548             return wxCONV_FAILED;
1549     }
1550
1551     return srcLen;
1552 }
1553
1554 // case when in-memory representation is UTF-16
1555 #ifdef WC_UTF16
1556
1557 // ----------------------------------------------------------------------------
1558 // conversions without endianness change
1559 // ----------------------------------------------------------------------------
1560
1561 size_t
1562 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1563                                const char *src, size_t srcLen) const
1564 {
1565     srcLen = GetLength(src, srcLen);
1566     if ( srcLen == wxNO_LEN )
1567         return wxCONV_FAILED;
1568
1569     const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1570     const size_t inLen = srcLen / BYTES_PER_CHAR;
1571     size_t outLen = 0;
1572     for ( size_t n = 0; n < inLen; n++ )
1573     {
1574         wxUint16 cc[2];
1575         const size_t numChars = encode_utf16(*inBuff++, cc);
1576         if ( numChars == wxCONV_FAILED )
1577             return wxCONV_FAILED;
1578
1579         outLen += numChars;
1580         if ( dst )
1581         {
1582             if ( outLen > dstLen )
1583                 return wxCONV_FAILED;
1584
1585             *dst++ = cc[0];
1586             if ( numChars == 2 )
1587             {
1588                 // second character of a surrogate
1589                 *dst++ = cc[1];
1590             }
1591         }
1592     }
1593
1594     return outLen;
1595 }
1596
1597 size_t
1598 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1599                                  const wchar_t *src, size_t srcLen) const
1600 {
1601     if ( srcLen == wxNO_LEN )
1602         srcLen = wxWcslen(src) + 1;
1603
1604     if ( !dst )
1605     {
1606         // optimization: return maximal space which could be needed for this
1607         // string instead of the exact amount which could be less if there are
1608         // any surrogates in the input
1609         //
1610         // we consider that surrogates are rare enough to make it worthwhile to
1611         // avoid running the loop below at the cost of slightly extra memory
1612         // consumption
1613         return srcLen * BYTES_PER_CHAR;
1614     }
1615
1616     wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1617     size_t outLen = 0;
1618     for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1619     {
1620         const wxUint32 ch = wxDecodeSurrogate(&src);
1621         if ( !src )
1622             return wxCONV_FAILED;
1623
1624         outLen += BYTES_PER_CHAR;
1625
1626         if ( outLen > dstLen )
1627             return wxCONV_FAILED;
1628
1629         *outBuff++ = ch;
1630     }
1631
1632     return outLen;
1633 }
1634
1635 // ----------------------------------------------------------------------------
1636 // endian-reversing conversions
1637 // ----------------------------------------------------------------------------
1638
1639 size_t
1640 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1641                            const char *src, size_t srcLen) const
1642 {
1643     srcLen = GetLength(src, srcLen);
1644     if ( srcLen == wxNO_LEN )
1645         return wxCONV_FAILED;
1646
1647     const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1648     const size_t inLen = srcLen / BYTES_PER_CHAR;
1649     size_t outLen = 0;
1650     for ( size_t n = 0; n < inLen; n++, inBuff++ )
1651     {
1652         wxUint16 cc[2];
1653         const size_t numChars = encode_utf16(wxUINT32_SWAP_ALWAYS(*inBuff), cc);
1654         if ( numChars == wxCONV_FAILED )
1655             return wxCONV_FAILED;
1656
1657         outLen += numChars;
1658         if ( dst )
1659         {
1660             if ( outLen > dstLen )
1661                 return wxCONV_FAILED;
1662
1663             *dst++ = cc[0];
1664             if ( numChars == 2 )
1665             {
1666                 // second character of a surrogate
1667                 *dst++ = cc[1];
1668             }
1669         }
1670     }
1671
1672     return outLen;
1673 }
1674
1675 size_t
1676 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1677                              const wchar_t *src, size_t srcLen) const
1678 {
1679     if ( srcLen == wxNO_LEN )
1680         srcLen = wxWcslen(src) + 1;
1681
1682     if ( !dst )
1683     {
1684         // optimization: return maximal space which could be needed for this
1685         // string instead of the exact amount which could be less if there are
1686         // any surrogates in the input
1687         //
1688         // we consider that surrogates are rare enough to make it worthwhile to
1689         // avoid running the loop below at the cost of slightly extra memory
1690         // consumption
1691         return srcLen*BYTES_PER_CHAR;
1692     }
1693
1694     wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1695     size_t outLen = 0;
1696     for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1697     {
1698         const wxUint32 ch = wxDecodeSurrogate(&src);
1699         if ( !src )
1700             return wxCONV_FAILED;
1701
1702         outLen += BYTES_PER_CHAR;
1703
1704         if ( outLen > dstLen )
1705             return wxCONV_FAILED;
1706
1707         *outBuff++ = wxUINT32_SWAP_ALWAYS(ch);
1708     }
1709
1710     return outLen;
1711 }
1712
1713 #else // !WC_UTF16: wchar_t is UTF-32
1714
1715 // ----------------------------------------------------------------------------
1716 // conversions without endianness change
1717 // ----------------------------------------------------------------------------
1718
1719 size_t
1720 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1721                                const char *src, size_t srcLen) const
1722 {
1723     // use memcpy() as it should be much faster than hand-written loop
1724     srcLen = GetLength(src, srcLen);
1725     if ( srcLen == wxNO_LEN )
1726         return wxCONV_FAILED;
1727
1728     const size_t inLen = srcLen/BYTES_PER_CHAR;
1729     if ( dst )
1730     {
1731         if ( dstLen < inLen )
1732             return wxCONV_FAILED;
1733
1734         memcpy(dst, src, srcLen);
1735     }
1736
1737     return inLen;
1738 }
1739
1740 size_t
1741 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1742                                  const wchar_t *src, size_t srcLen) const
1743 {
1744     if ( srcLen == wxNO_LEN )
1745         srcLen = wxWcslen(src) + 1;
1746
1747     srcLen *= BYTES_PER_CHAR;
1748
1749     if ( dst )
1750     {
1751         if ( dstLen < srcLen )
1752             return wxCONV_FAILED;
1753
1754         memcpy(dst, src, srcLen);
1755     }
1756
1757     return srcLen;
1758 }
1759
1760 // ----------------------------------------------------------------------------
1761 // endian-reversing conversions
1762 // ----------------------------------------------------------------------------
1763
1764 size_t
1765 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1766                            const char *src, size_t srcLen) const
1767 {
1768     srcLen = GetLength(src, srcLen);
1769     if ( srcLen == wxNO_LEN )
1770         return wxCONV_FAILED;
1771
1772     srcLen /= BYTES_PER_CHAR;
1773
1774     if ( dst )
1775     {
1776         if ( dstLen < srcLen )
1777             return wxCONV_FAILED;
1778
1779         const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1780         for ( size_t n = 0; n < srcLen; n++, inBuff++ )
1781         {
1782             *dst++ = wxUINT32_SWAP_ALWAYS(*inBuff);
1783         }
1784     }
1785
1786     return srcLen;
1787 }
1788
1789 size_t
1790 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1791                              const wchar_t *src, size_t srcLen) const
1792 {
1793     if ( srcLen == wxNO_LEN )
1794         srcLen = wxWcslen(src) + 1;
1795
1796     srcLen *= BYTES_PER_CHAR;
1797
1798     if ( dst )
1799     {
1800         if ( dstLen < srcLen )
1801             return wxCONV_FAILED;
1802
1803         wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1804         for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
1805         {
1806             *outBuff++ = wxUINT32_SWAP_ALWAYS(*src);
1807         }
1808     }
1809
1810     return srcLen;
1811 }
1812
1813 #endif // WC_UTF16/!WC_UTF16
1814
1815
1816 // ============================================================================
1817 // The classes doing conversion using the iconv_xxx() functions
1818 // ============================================================================
1819
1820 #ifdef HAVE_ICONV
1821
1822 // VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
1823 //     E2BIG if output buffer is _exactly_ as big as needed. Such case is
1824 //     (unless there's yet another bug in glibc) the only case when iconv()
1825 //     returns with (size_t)-1 (which means error) and says there are 0 bytes
1826 //     left in the input buffer -- when _real_ error occurs,
1827 //     bytes-left-in-input buffer is non-zero. Hence, this alternative test for
1828 //     iconv() failure.
1829 //     [This bug does not appear in glibc 2.2.]
1830 #if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
1831 #define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
1832                                      (errno != E2BIG || bufLeft != 0))
1833 #else
1834 #define ICONV_FAILED(cres, bufLeft)  (cres == (size_t)-1)
1835 #endif
1836
1837 #define ICONV_CHAR_CAST(x)  ((ICONV_CONST char **)(x))
1838
1839 #define ICONV_T_INVALID ((iconv_t)-1)
1840
1841 #if SIZEOF_WCHAR_T == 4
1842     #define WC_BSWAP    wxUINT32_SWAP_ALWAYS
1843     #define WC_ENC      wxFONTENCODING_UTF32
1844 #elif SIZEOF_WCHAR_T == 2
1845     #define WC_BSWAP    wxUINT16_SWAP_ALWAYS
1846     #define WC_ENC      wxFONTENCODING_UTF16
1847 #else // sizeof(wchar_t) != 2 nor 4
1848     // does this ever happen?
1849     #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
1850 #endif
1851
1852 // ----------------------------------------------------------------------------
1853 // wxMBConv_iconv: encapsulates an iconv character set
1854 // ----------------------------------------------------------------------------
1855
1856 class wxMBConv_iconv : public wxMBConv
1857 {
1858 public:
1859     wxMBConv_iconv(const char *name);
1860     virtual ~wxMBConv_iconv();
1861
1862     virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const;
1863     virtual size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const;
1864
1865     // classify this encoding as explained in wxMBConv::GetMBNulLen() comment
1866     virtual size_t GetMBNulLen() const;
1867
1868 #if wxUSE_UNICODE_UTF8
1869     virtual bool IsUTF8() const;
1870 #endif
1871
1872     virtual wxMBConv *Clone() const
1873     {
1874         wxMBConv_iconv *p = new wxMBConv_iconv(m_name.ToAscii());
1875         p->m_minMBCharWidth = m_minMBCharWidth;
1876         return p;
1877     }
1878
1879     bool IsOk() const
1880         { return (m2w != ICONV_T_INVALID) && (w2m != ICONV_T_INVALID); }
1881
1882 protected:
1883     // the iconv handlers used to translate from multibyte
1884     // to wide char and in the other direction
1885     iconv_t m2w,
1886             w2m;
1887
1888 #if wxUSE_THREADS
1889     // guards access to m2w and w2m objects
1890     wxMutex m_iconvMutex;
1891 #endif
1892
1893 private:
1894     // the name (for iconv_open()) of a wide char charset -- if none is
1895     // available on this machine, it will remain NULL
1896     static wxString ms_wcCharsetName;
1897
1898     // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
1899     // different endian-ness than the native one
1900     static bool ms_wcNeedsSwap;
1901
1902
1903     // name of the encoding handled by this conversion
1904     wxString m_name;
1905
1906     // cached result of GetMBNulLen(); set to 0 meaning "unknown"
1907     // initially
1908     size_t m_minMBCharWidth;
1909 };
1910
1911 // make the constructor available for unit testing
1912 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_iconv( const char* name )
1913 {
1914     wxMBConv_iconv* result = new wxMBConv_iconv( name );
1915     if ( !result->IsOk() )
1916     {
1917         delete result;
1918         return 0;
1919     }
1920
1921     return result;
1922 }
1923
// definitions of the static members of wxMBConv_iconv: the constructor looks
// for the wide char charset name for as long as ms_wcCharsetName is empty
wxString wxMBConv_iconv::ms_wcCharsetName;
bool wxMBConv_iconv::ms_wcNeedsSwap = false;
1926
1927 wxMBConv_iconv::wxMBConv_iconv(const char *name)
1928               : m_name(name)
1929 {
1930     m_minMBCharWidth = 0;
1931
1932     // check for charset that represents wchar_t:
1933     if ( ms_wcCharsetName.empty() )
1934     {
1935         wxLogTrace(TRACE_STRCONV, _T("Looking for wide char codeset:"));
1936
1937 #if wxUSE_FONTMAP
1938         const wxChar **names = wxFontMapperBase::GetAllEncodingNames(WC_ENC);
1939 #else // !wxUSE_FONTMAP
1940         static const wxChar *names_static[] =
1941         {
1942 #if SIZEOF_WCHAR_T == 4
1943             _T("UCS-4"),
1944 #elif SIZEOF_WCHAR_T = 2
1945             _T("UCS-2"),
1946 #endif
1947             NULL
1948         };
1949         const wxChar **names = names_static;
1950 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
1951
1952         for ( ; *names && ms_wcCharsetName.empty(); ++names )
1953         {
1954             const wxString nameCS(*names);
1955
1956             // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
1957             wxString nameXE(nameCS);
1958
1959 #ifdef WORDS_BIGENDIAN
1960                 nameXE += _T("BE");
1961 #else // little endian
1962                 nameXE += _T("LE");
1963 #endif
1964
1965             wxLogTrace(TRACE_STRCONV, _T("  trying charset \"%s\""),
1966                        nameXE.c_str());
1967
1968             m2w = iconv_open(nameXE.ToAscii(), name);
1969             if ( m2w == ICONV_T_INVALID )
1970             {
1971                 // try charset w/o bytesex info (e.g. "UCS4")
1972                 wxLogTrace(TRACE_STRCONV, _T("  trying charset \"%s\""),
1973                            nameCS.c_str());
1974                 m2w = iconv_open(nameCS.ToAscii(), name);
1975
1976                 // and check for bytesex ourselves:
1977                 if ( m2w != ICONV_T_INVALID )
1978                 {
1979                     char    buf[2], *bufPtr;
1980                     wchar_t wbuf[2], *wbufPtr;
1981                     size_t  insz, outsz;
1982                     size_t  res;
1983
1984                     buf[0] = 'A';
1985                     buf[1] = 0;
1986                     wbuf[0] = 0;
1987                     insz = 2;
1988                     outsz = SIZEOF_WCHAR_T * 2;
1989                     wbufPtr = wbuf;
1990                     bufPtr = buf;
1991
1992                     res = iconv(
1993                         m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
1994                         (char**)&wbufPtr, &outsz);
1995
1996                     if (ICONV_FAILED(res, insz))
1997                     {
1998                         wxLogLastError(wxT("iconv"));
1999                         wxLogError(_("Conversion to charset '%s' doesn't work."),
2000                                    nameCS.c_str());
2001                     }
2002                     else // ok, can convert to this encoding, remember it
2003                     {
2004                         ms_wcCharsetName = nameCS;
2005                         ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
2006                     }
2007                 }
2008             }
2009             else // use charset not requiring byte swapping
2010             {
2011                 ms_wcCharsetName = nameXE;
2012             }
2013         }
2014
2015         wxLogTrace(TRACE_STRCONV,
2016                    wxT("iconv wchar_t charset is \"%s\"%s"),
2017                    ms_wcCharsetName.empty() ? wxString("<none>")
2018                                             : ms_wcCharsetName,
2019                    ms_wcNeedsSwap ? _T(" (needs swap)")
2020                                   : _T(""));
2021     }
2022     else // we already have ms_wcCharsetName
2023     {
2024         m2w = iconv_open(ms_wcCharsetName.ToAscii(), name);
2025     }
2026
2027     if ( ms_wcCharsetName.empty() )
2028     {
2029         w2m = ICONV_T_INVALID;
2030     }
2031     else
2032     {
2033         w2m = iconv_open(name, ms_wcCharsetName.ToAscii());
2034         if ( w2m == ICONV_T_INVALID )
2035         {
2036             wxLogTrace(TRACE_STRCONV,
2037                        wxT("\"%s\" -> \"%s\" works but not the converse!?"),
2038                        ms_wcCharsetName.c_str(), name);
2039         }
2040     }
2041 }
2042
2043 wxMBConv_iconv::~wxMBConv_iconv()
2044 {
2045     if ( m2w != ICONV_T_INVALID )
2046         iconv_close(m2w);
2047     if ( w2m != ICONV_T_INVALID )
2048         iconv_close(w2m);
2049 }
2050
2051 size_t wxMBConv_iconv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
2052 {
2053     // find the string length: notice that must be done differently for
2054     // NUL-terminated strings and UTF-16/32 which are terminated with 2/4 NULs
2055     size_t inbuf;
2056     const size_t nulLen = GetMBNulLen();
2057     switch ( nulLen )
2058     {
2059         default:
2060             return wxCONV_FAILED;
2061
2062         case 1:
2063             inbuf = strlen(psz); // arguably more optimized than our version
2064             break;
2065
2066         case 2:
2067         case 4:
2068             // for UTF-16/32 not only we need to have 2/4 consecutive NULs but
2069             // they also have to start at character boundary and not span two
2070             // adjacent characters
2071             const char *p;
2072             for ( p = psz; NotAllNULs(p, nulLen); p += nulLen )
2073                 ;
2074             inbuf = p - psz;
2075             break;
2076     }
2077
2078 #if wxUSE_THREADS
2079     // NB: iconv() is MT-safe, but each thread must use its own iconv_t handle.
2080     //     Unfortunately there are a couple of global wxCSConv objects such as
2081     //     wxConvLocal that are used all over wx code, so we have to make sure
2082     //     the handle is used by at most one thread at the time. Otherwise
2083     //     only a few wx classes would be safe to use from non-main threads
2084     //     as MB<->WC conversion would fail "randomly".
2085     wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
2086 #endif // wxUSE_THREADS
2087
2088     size_t outbuf = n * SIZEOF_WCHAR_T;
2089     size_t res, cres;
2090     // VS: Use these instead of psz, buf because iconv() modifies its arguments:
2091     wchar_t *bufPtr = buf;
2092     const char *pszPtr = psz;
2093
2094     if (buf)
2095     {
2096         // have destination buffer, convert there
2097         cres = iconv(m2w,
2098                      ICONV_CHAR_CAST(&pszPtr), &inbuf,
2099                      (char**)&bufPtr, &outbuf);
2100         res = n - (outbuf / SIZEOF_WCHAR_T);
2101
2102         if (ms_wcNeedsSwap)
2103         {
2104             // convert to native endianness
2105             for ( unsigned i = 0; i < res; i++ )
2106                 buf[n] = WC_BSWAP(buf[i]);
2107         }
2108
2109         // NUL-terminate the string if there is any space left
2110         if (res < n)
2111             buf[res] = 0;
2112     }
2113     else
2114     {
2115         // no destination buffer... convert using temp buffer
2116         // to calculate destination buffer requirement
2117         wchar_t tbuf[8];
2118         res = 0;
2119
2120         do
2121         {
2122             bufPtr = tbuf;
2123             outbuf = 8 * SIZEOF_WCHAR_T;
2124
2125             cres = iconv(m2w,
2126                          ICONV_CHAR_CAST(&pszPtr), &inbuf,
2127                          (char**)&bufPtr, &outbuf );
2128
2129             res += 8 - (outbuf / SIZEOF_WCHAR_T);
2130         }
2131         while ((cres == (size_t)-1) && (errno == E2BIG));
2132     }
2133
2134     if (ICONV_FAILED(cres, inbuf))
2135     {
2136         //VS: it is ok if iconv fails, hence trace only
2137         wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
2138         return wxCONV_FAILED;
2139     }
2140
2141     return res;
2142 }
2143
2144 size_t wxMBConv_iconv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
2145 {
2146 #if wxUSE_THREADS
2147     // NB: explained in MB2WC
2148     wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
2149 #endif
2150
2151     size_t inlen = wxWcslen(psz);
2152     size_t inbuf = inlen * SIZEOF_WCHAR_T;
2153     size_t outbuf = n;
2154     size_t res, cres;
2155
2156     wchar_t *tmpbuf = 0;
2157
2158     if (ms_wcNeedsSwap)
2159     {
2160         // need to copy to temp buffer to switch endianness
2161         // (doing WC_BSWAP twice on the original buffer won't help, as it
2162         //  could be in read-only memory, or be accessed in some other thread)
2163         tmpbuf = (wchar_t *)malloc(inbuf + SIZEOF_WCHAR_T);
2164         for ( size_t i = 0; i < inlen; i++ )
2165             tmpbuf[n] = WC_BSWAP(psz[i]);
2166
2167         tmpbuf[inlen] = L'\0';
2168         psz = tmpbuf;
2169     }
2170
2171     if (buf)
2172     {
2173         // have destination buffer, convert there
2174         cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
2175
2176         res = n - outbuf;
2177
2178         // NB: iconv was given only wcslen(psz) characters on input, and so
2179         //     it couldn't convert the trailing zero. Let's do it ourselves
2180         //     if there's some room left for it in the output buffer.
2181         if (res < n)
2182             buf[0] = 0;
2183     }
2184     else
2185     {
2186         // no destination buffer: convert using temp buffer
2187         // to calculate destination buffer requirement
2188         char tbuf[16];
2189         res = 0;
2190         do
2191         {
2192             buf = tbuf;
2193             outbuf = 16;
2194
2195             cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
2196
2197             res += 16 - outbuf;
2198         }
2199         while ((cres == (size_t)-1) && (errno == E2BIG));
2200     }
2201
2202     if (ms_wcNeedsSwap)
2203     {
2204         free(tmpbuf);
2205     }
2206
2207     if (ICONV_FAILED(cres, inbuf))
2208     {
2209         wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
2210         return wxCONV_FAILED;
2211     }
2212
2213     return res;
2214 }
2215
2216 size_t wxMBConv_iconv::GetMBNulLen() const
2217 {
2218     if ( m_minMBCharWidth == 0 )
2219     {
2220         wxMBConv_iconv * const self = wxConstCast(this, wxMBConv_iconv);
2221
2222 #if wxUSE_THREADS
2223         // NB: explained in MB2WC
2224         wxMutexLocker lock(self->m_iconvMutex);
2225 #endif
2226
2227         const wchar_t *wnul = L"";
2228         char buf[8]; // should be enough for NUL in any encoding
2229         size_t inLen = sizeof(wchar_t),
2230                outLen = WXSIZEOF(buf);
2231         char *inBuff = (char *)wnul;
2232         char *outBuff = buf;
2233         if ( iconv(w2m, ICONV_CHAR_CAST(&inBuff), &inLen, &outBuff, &outLen) == (size_t)-1 )
2234         {
2235             self->m_minMBCharWidth = (size_t)-1;
2236         }
2237         else // ok
2238         {
2239             self->m_minMBCharWidth = outBuff - buf;
2240         }
2241     }
2242
2243     return m_minMBCharWidth;
2244 }
2245
2246 #if wxUSE_UNICODE_UTF8
2247 bool wxMBConv_iconv::IsUTF8() const
2248 {
2249     return wxStricmp(m_name, "UTF-8") == 0 ||
2250            wxStricmp(m_name, "UTF8") == 0;
2251 }
2252 #endif
2253
2254 #endif // HAVE_ICONV
2255
2256
2257 // ============================================================================
2258 // Win32 conversion classes
2259 // ============================================================================
2260
2261 #ifdef wxHAVE_WIN32_MB2WC
2262
2263 // from utils.cpp
2264 #if wxUSE_FONTMAP
2265 extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const char *charset);
2266 extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
2267 #endif
2268
2269 class wxMBConv_win32 : public wxMBConv
2270 {
2271 public:
2272     wxMBConv_win32()
2273     {
2274         m_CodePage = CP_ACP;
2275         m_minMBCharWidth = 0;
2276     }
2277
2278     wxMBConv_win32(const wxMBConv_win32& conv)
2279         : wxMBConv()
2280     {
2281         m_CodePage = conv.m_CodePage;
2282         m_minMBCharWidth = conv.m_minMBCharWidth;
2283     }
2284
2285 #if wxUSE_FONTMAP
2286     wxMBConv_win32(const char* name)
2287     {
2288         m_CodePage = wxCharsetToCodepage(name);
2289         m_minMBCharWidth = 0;
2290     }
2291
2292     wxMBConv_win32(wxFontEncoding encoding)
2293     {
2294         m_CodePage = wxEncodingToCodepage(encoding);
2295         m_minMBCharWidth = 0;
2296     }
2297 #endif // wxUSE_FONTMAP
2298
2299     virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2300     {
2301         // note that we have to use MB_ERR_INVALID_CHARS flag as it without it
2302         // the behaviour is not compatible with the Unix version (using iconv)
2303         // and break the library itself, e.g. wxTextInputStream::NextChar()
2304         // wouldn't work if reading an incomplete MB char didn't result in an
2305         // error
2306         //
2307         // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
2308         // Win XP or newer and it is not supported for UTF-[78] so we always
2309         // use our own conversions in this case. See
2310         //     http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
2311         //     http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
2312         if ( m_CodePage == CP_UTF8 )
2313         {
2314             return wxMBConvUTF8().MB2WC(buf, psz, n);
2315         }
2316
2317         if ( m_CodePage == CP_UTF7 )
2318         {
2319             return wxMBConvUTF7().MB2WC(buf, psz, n);
2320         }
2321
2322         int flags = 0;
2323         if ( (m_CodePage < 50000 && m_CodePage != CP_SYMBOL) &&
2324                 IsAtLeastWin2kSP4() )
2325         {
2326             flags = MB_ERR_INVALID_CHARS;
2327         }
2328
2329         const size_t len = ::MultiByteToWideChar
2330                              (
2331                                 m_CodePage,     // code page
2332                                 flags,          // flags: fall on error
2333                                 psz,            // input string
2334                                 -1,             // its length (NUL-terminated)
2335                                 buf,            // output string
2336                                 buf ? n : 0     // size of output buffer
2337                              );
2338         if ( !len )
2339         {
2340             // function totally failed
2341             return wxCONV_FAILED;
2342         }
2343
2344         // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
2345         // check if we succeeded, by doing a double trip:
2346         if ( !flags && buf )
2347         {
2348             const size_t mbLen = strlen(psz);
2349             wxCharBuffer mbBuf(mbLen);
2350             if ( ::WideCharToMultiByte
2351                    (
2352                       m_CodePage,
2353                       0,
2354                       buf,
2355                       -1,
2356                       mbBuf.data(),
2357                       mbLen + 1,        // size in bytes, not length
2358                       NULL,
2359                       NULL
2360                    ) == 0 ||
2361                   strcmp(mbBuf, psz) != 0 )
2362             {
2363                 // we didn't obtain the same thing we started from, hence
2364                 // the conversion was lossy and we consider that it failed
2365                 return wxCONV_FAILED;
2366             }
2367         }
2368
2369         // note that it returns count of written chars for buf != NULL and size
2370         // of the needed buffer for buf == NULL so in either case the length of
2371         // the string (which never includes the terminating NUL) is one less
2372         return len - 1;
2373     }
2374
2375     virtual size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
2376     {
2377         /*
2378             we have a problem here: by default, WideCharToMultiByte() may
2379             replace characters unrepresentable in the target code page with bad
2380             quality approximations such as turning "1/2" symbol (U+00BD) into
2381             "1" for the code pages which don't have it and we, obviously, want
2382             to avoid this at any price
2383
2384             the trouble is that this function does it _silently_, i.e. it won't
2385             even tell us whether it did or not... Win98/2000 and higher provide
2386             WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
2387             we have to resort to a round trip, i.e. check that converting back
2388             results in the same string -- this is, of course, expensive but
2389             otherwise we simply can't be sure to not garble the data.
2390          */
2391
2392         // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
2393         // it doesn't work with CJK encodings (which we test for rather roughly
2394         // here...) nor with UTF-7/8 nor, of course, with Windows versions not
2395         // supporting it
2396         BOOL usedDef wxDUMMY_INITIALIZE(false);
2397         BOOL *pUsedDef;
2398         int flags;
2399         if ( CanUseNoBestFit() && m_CodePage < 50000 )
2400         {
2401             // it's our lucky day
2402             flags = WC_NO_BEST_FIT_CHARS;
2403             pUsedDef = &usedDef;
2404         }
2405         else // old system or unsupported encoding
2406         {
2407             flags = 0;
2408             pUsedDef = NULL;
2409         }
2410
2411         const size_t len = ::WideCharToMultiByte
2412                              (
2413                                 m_CodePage,     // code page
2414                                 flags,          // either none or no best fit
2415                                 pwz,            // input string
2416                                 -1,             // it is (wide) NUL-terminated
2417                                 buf,            // output buffer
2418                                 buf ? n : 0,    // and its size
2419                                 NULL,           // default "replacement" char
2420                                 pUsedDef        // [out] was it used?
2421                              );
2422
2423         if ( !len )
2424         {
2425             // function totally failed
2426             return wxCONV_FAILED;
2427         }
2428
2429         // we did something, check if we really succeeded
2430         if ( flags )
2431         {
2432             // check if the conversion failed, i.e. if any replacements
2433             // were done
2434             if ( usedDef )
2435                 return wxCONV_FAILED;
2436         }
2437         else // we must resort to double tripping...
2438         {
2439             // first we need to ensure that we really have the MB data: this is
2440             // not the case if we're called with NULL buffer, in which case we
2441             // need to do the conversion yet again
2442             wxCharBuffer bufDef;
2443             if ( !buf )
2444             {
2445                 bufDef = wxCharBuffer(len);
2446                 buf = bufDef.data();
2447                 if ( !::WideCharToMultiByte(m_CodePage, flags, pwz, -1,
2448                                             buf, len, NULL, NULL) )
2449                     return wxCONV_FAILED;
2450             }
2451
2452             wxWCharBuffer wcBuf(n);
2453             if ( MB2WC(wcBuf.data(), buf, n) == wxCONV_FAILED ||
2454                     wcscmp(wcBuf, pwz) != 0 )
2455             {
2456                 // we didn't obtain the same thing we started from, hence
2457                 // the conversion was lossy and we consider that it failed
2458                 return wxCONV_FAILED;
2459             }
2460         }
2461
2462         // see the comment above for the reason of "len - 1"
2463         return len - 1;
2464     }
2465
2466     virtual size_t GetMBNulLen() const
2467     {
2468         if ( m_minMBCharWidth == 0 )
2469         {
2470             int len = ::WideCharToMultiByte
2471                         (
2472                             m_CodePage,     // code page
2473                             0,              // no flags
2474                             L"",            // input string
2475                             1,              // translate just the NUL
2476                             NULL,           // output buffer
2477                             0,              // and its size
2478                             NULL,           // no replacement char
2479                             NULL            // [out] don't care if it was used
2480                         );
2481
2482             wxMBConv_win32 * const self = wxConstCast(this, wxMBConv_win32);
2483             switch ( len )
2484             {
2485                 default:
2486                     wxLogDebug(_T("Unexpected NUL length %d"), len);
2487                     self->m_minMBCharWidth = (size_t)-1;
2488                     break;
2489
2490                 case 0:
2491                     self->m_minMBCharWidth = (size_t)-1;
2492                     break;
2493
2494                 case 1:
2495                 case 2:
2496                 case 4:
2497                     self->m_minMBCharWidth = len;
2498                     break;
2499             }
2500         }
2501
2502         return m_minMBCharWidth;
2503     }
2504
2505     virtual wxMBConv *Clone() const { return new wxMBConv_win32(*this); }
2506
2507     bool IsOk() const { return m_CodePage != -1; }
2508
2509 private:
2510     static bool CanUseNoBestFit()
2511     {
2512         static int s_isWin98Or2k = -1;
2513
2514         if ( s_isWin98Or2k == -1 )
2515         {
2516             int verMaj, verMin;
2517             switch ( wxGetOsVersion(&verMaj, &verMin) )
2518             {
2519                 case wxOS_WINDOWS_9X:
2520                     s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
2521                     break;
2522
2523                 case wxOS_WINDOWS_NT:
2524                     s_isWin98Or2k = verMaj >= 5;
2525                     break;
2526
2527                 default:
2528                     // unknown: be conservative by default
2529                     s_isWin98Or2k = 0;
2530                     break;
2531             }
2532
2533             wxASSERT_MSG( s_isWin98Or2k != -1, _T("should be set above") );
2534         }
2535
2536         return s_isWin98Or2k == 1;
2537     }
2538
2539     static bool IsAtLeastWin2kSP4()
2540     {
2541 #ifdef __WXWINCE__
2542         return false;
2543 #else
2544         static int s_isAtLeastWin2kSP4 = -1;
2545
2546         if ( s_isAtLeastWin2kSP4 == -1 )
2547         {
2548             OSVERSIONINFOEX ver;
2549
2550             memset(&ver, 0, sizeof(ver));
2551             ver.dwOSVersionInfoSize = sizeof(ver);
2552             GetVersionEx((OSVERSIONINFO*)&ver);
2553
2554             s_isAtLeastWin2kSP4 =
2555               ((ver.dwMajorVersion > 5) || // Vista+
2556                (ver.dwMajorVersion == 5 && ver.dwMinorVersion > 0) || // XP/2003
2557                (ver.dwMajorVersion == 5 && ver.dwMinorVersion == 0 &&
2558                ver.wServicePackMajor >= 4)) // 2000 SP4+
2559               ? 1 : 0;
2560         }
2561
2562         return s_isAtLeastWin2kSP4 == 1;
2563 #endif
2564     }
2565
2566
2567     // the code page we're working with
2568     long m_CodePage;
2569
2570     // cached result of GetMBNulLen(), set to 0 initially meaning
2571     // "unknown"
2572     size_t m_minMBCharWidth;
2573 };
2574
2575 #endif // wxHAVE_WIN32_MB2WC
2576
2577
2578 // ============================================================================
2579 // wxEncodingConverter based conversion classes
2580 // ============================================================================
2581
2582 #if wxUSE_FONTMAP
2583
2584 class wxMBConv_wxwin : public wxMBConv
2585 {
2586 private:
2587     void Init()
2588     {
2589         // Refuse to use broken wxEncodingConverter code for Mac-specific encodings.
2590         // The wxMBConv_cf class does a better job.
2591         m_ok = (m_enc < wxFONTENCODING_MACMIN || m_enc > wxFONTENCODING_MACMAX) &&
2592                m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
2593                w2m.Init(wxFONTENCODING_UNICODE, m_enc);
2594     }
2595
2596 public:
2597     // temporarily just use wxEncodingConverter stuff,
2598     // so that it works while a better implementation is built
2599     wxMBConv_wxwin(const char* name)
2600     {
2601         if (name)
2602             m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
2603         else
2604             m_enc = wxFONTENCODING_SYSTEM;
2605
2606         Init();
2607     }
2608
2609     wxMBConv_wxwin(wxFontEncoding enc)
2610     {
2611         m_enc = enc;
2612
2613         Init();
2614     }
2615
2616     size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
2617     {
2618         size_t inbuf = strlen(psz);
2619         if (buf)
2620         {
2621             if (!m2w.Convert(psz, buf))
2622                 return wxCONV_FAILED;
2623         }
2624         return inbuf;
2625     }
2626
2627     size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
2628     {
2629         const size_t inbuf = wxWcslen(psz);
2630         if (buf)
2631         {
2632             if (!w2m.Convert(psz, buf))
2633                 return wxCONV_FAILED;
2634         }
2635
2636         return inbuf;
2637     }
2638
2639     virtual size_t GetMBNulLen() const
2640     {
2641         switch ( m_enc )
2642         {
2643             case wxFONTENCODING_UTF16BE:
2644             case wxFONTENCODING_UTF16LE:
2645                 return 2;
2646
2647             case wxFONTENCODING_UTF32BE:
2648             case wxFONTENCODING_UTF32LE:
2649                 return 4;
2650
2651             default:
2652                 return 1;
2653         }
2654     }
2655
2656     virtual wxMBConv *Clone() const { return new wxMBConv_wxwin(m_enc); }
2657
2658     bool IsOk() const { return m_ok; }
2659
2660 public:
2661     wxFontEncoding m_enc;
2662     wxEncodingConverter m2w, w2m;
2663
2664 private:
2665     // were we initialized successfully?
2666     bool m_ok;
2667
2668     DECLARE_NO_COPY_CLASS(wxMBConv_wxwin)
2669 };
2670
2671 // make the constructors available for unit testing
2672 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_wxwin( const char* name )
2673 {
2674     wxMBConv_wxwin* result = new wxMBConv_wxwin( name );
2675     if ( !result->IsOk() )
2676     {
2677         delete result;
2678         return 0;
2679     }
2680
2681     return result;
2682 }
2683
2684 #endif // wxUSE_FONTMAP
2685
2686 // ============================================================================
2687 // wxCSConv implementation
2688 // ============================================================================
2689
2690 void wxCSConv::Init()
2691 {
2692     m_name = NULL;
2693     m_convReal =  NULL;
2694     m_deferred = true;
2695 }
2696
2697 wxCSConv::wxCSConv(const wxString& charset)
2698 {
2699     Init();
2700
2701     if ( !charset.empty() )
2702     {
2703         SetName(charset.ToAscii());
2704     }
2705
2706 #if wxUSE_FONTMAP
2707     m_encoding = wxFontMapperBase::GetEncodingFromName(charset);
2708 #else
2709     m_encoding = wxFONTENCODING_SYSTEM;
2710 #endif
2711 }
2712
2713 wxCSConv::wxCSConv(wxFontEncoding encoding)
2714 {
2715     if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
2716     {
2717         wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
2718
2719         encoding = wxFONTENCODING_SYSTEM;
2720     }
2721
2722     Init();
2723
2724     m_encoding = encoding;
2725 }
2726
2727 wxCSConv::~wxCSConv()
2728 {
2729     Clear();
2730 }
2731
2732 wxCSConv::wxCSConv(const wxCSConv& conv)
2733         : wxMBConv()
2734 {
2735     Init();
2736
2737     SetName(conv.m_name);
2738     m_encoding = conv.m_encoding;
2739 }
2740
2741 wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
2742 {
2743     Clear();
2744
2745     SetName(conv.m_name);
2746     m_encoding = conv.m_encoding;
2747
2748     return *this;
2749 }
2750
2751 void wxCSConv::Clear()
2752 {
2753     free(m_name);
2754     delete m_convReal;
2755
2756     m_name = NULL;
2757     m_convReal = NULL;
2758 }
2759
2760 void wxCSConv::SetName(const char *charset)
2761 {
2762     if (charset)
2763     {
2764         m_name = wxStrdup(charset);
2765         m_deferred = true;
2766     }
2767 }
2768
2769 #if wxUSE_FONTMAP
2770
2771 WX_DECLARE_HASH_MAP( wxFontEncoding, wxString, wxIntegerHash, wxIntegerEqual,
2772                      wxEncodingNameCache );
2773
2774 static wxEncodingNameCache gs_nameCache;
2775 #endif
2776
2777 wxMBConv *wxCSConv::DoCreate() const
2778 {
2779 #if wxUSE_FONTMAP
2780     wxLogTrace(TRACE_STRCONV,
2781                wxT("creating conversion for %s"),
2782                (m_name ? m_name
2783                        : (const char*)wxFontMapperBase::GetEncodingName(m_encoding).mb_str()));
2784 #endif // wxUSE_FONTMAP
2785
2786     // check for the special case of ASCII or ISO8859-1 charset: as we have
2787     // special knowledge of it anyhow, we don't need to create a special
2788     // conversion object
2789     if ( m_encoding == wxFONTENCODING_ISO8859_1 ||
2790             m_encoding == wxFONTENCODING_DEFAULT )
2791     {
2792         // don't convert at all
2793         return NULL;
2794     }
2795
2796     // we trust OS to do conversion better than we can so try external
2797     // conversion methods first
2798     //
2799     // the full order is:
2800     //      1. OS conversion (iconv() under Unix or Win32 API)
2801     //      2. hard coded conversions for UTF
2802     //      3. wxEncodingConverter as fall back
2803
2804     // step (1)
2805 #ifdef HAVE_ICONV
2806 #if !wxUSE_FONTMAP
2807     if ( m_name )
2808 #endif // !wxUSE_FONTMAP
2809     {
2810 #if wxUSE_FONTMAP
2811         wxFontEncoding encoding(m_encoding);
2812 #endif
2813
2814         if ( m_name )
2815         {
2816             wxMBConv_iconv *conv = new wxMBConv_iconv(m_name);
2817             if ( conv->IsOk() )
2818                 return conv;
2819
2820             delete conv;
2821
2822 #if wxUSE_FONTMAP
2823             encoding =
2824                 wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
2825 #endif // wxUSE_FONTMAP
2826         }
2827 #if wxUSE_FONTMAP
2828         {
2829             const wxEncodingNameCache::iterator it = gs_nameCache.find(encoding);
2830             if ( it != gs_nameCache.end() )
2831             {
2832                 if ( it->second.empty() )
2833                     return NULL;
2834
2835                 wxMBConv_iconv *conv = new wxMBConv_iconv(it->second.ToAscii());
2836                 if ( conv->IsOk() )
2837                     return conv;
2838
2839                 delete conv;
2840             }
2841
2842             const wxChar** names = wxFontMapperBase::GetAllEncodingNames(encoding);
2843             // CS : in case this does not return valid names (eg for MacRoman)
2844             // encoding got a 'failure' entry in the cache all the same,
2845             // although it just has to be created using a different method, so
2846             // only store failed iconv creation attempts (or perhaps we
2847             // shoulnd't do this at all ?)
2848             if ( names[0] != NULL )
2849             {
2850                 for ( ; *names; ++names )
2851                 {
2852                     // FIXME-UTF8: wxFontMapperBase::GetAllEncodingNames()
2853                     //             will need changes that will obsolete this
2854                     wxString name(*names);
2855                     wxMBConv_iconv *conv = new wxMBConv_iconv(name.ToAscii());
2856                     if ( conv->IsOk() )
2857                     {
2858                         gs_nameCache[encoding] = *names;
2859                         return conv;
2860                     }
2861
2862                     delete conv;
2863                 }
2864
2865                 gs_nameCache[encoding] = _T(""); // cache the failure
2866             }
2867         }
2868 #endif // wxUSE_FONTMAP
2869     }
2870 #endif // HAVE_ICONV
2871
2872 #ifdef wxHAVE_WIN32_MB2WC
2873     {
2874 #if wxUSE_FONTMAP
2875         wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
2876                                       : new wxMBConv_win32(m_encoding);
2877         if ( conv->IsOk() )
2878             return conv;
2879
2880         delete conv;
2881 #else
2882         return NULL;
2883 #endif
2884     }
2885 #endif // wxHAVE_WIN32_MB2WC
2886
2887 #ifdef __DARWIN__
2888     {
2889         // leave UTF16 and UTF32 to the built-ins of wx
2890         if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
2891             ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
2892         {
2893 #if wxUSE_FONTMAP
2894             wxMBConv_cf *conv = m_name ? new wxMBConv_cf(m_name)
2895                                           : new wxMBConv_cf(m_encoding);
2896 #else
2897             wxMBConv_cf *conv = new wxMBConv_cf(m_encoding);
2898 #endif
2899
2900             if ( conv->IsOk() )
2901                  return conv;
2902
2903             delete conv;
2904         }
2905     }
2906 #endif // __DARWIN__
2907
2908     // step (2)
2909     wxFontEncoding enc = m_encoding;
2910 #if wxUSE_FONTMAP
2911     if ( enc == wxFONTENCODING_SYSTEM && m_name )
2912     {
2913         // use "false" to suppress interactive dialogs -- we can be called from
2914         // anywhere and popping up a dialog from here is the last thing we want to
2915         // do
2916         enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
2917     }
2918 #endif // wxUSE_FONTMAP
2919
2920     switch ( enc )
2921     {
2922         case wxFONTENCODING_UTF7:
2923              return new wxMBConvUTF7;
2924
2925         case wxFONTENCODING_UTF8:
2926              return new wxMBConvUTF8;
2927
2928         case wxFONTENCODING_UTF16BE:
2929              return new wxMBConvUTF16BE;
2930
2931         case wxFONTENCODING_UTF16LE:
2932              return new wxMBConvUTF16LE;
2933
2934         case wxFONTENCODING_UTF32BE:
2935              return new wxMBConvUTF32BE;
2936
2937         case wxFONTENCODING_UTF32LE:
2938              return new wxMBConvUTF32LE;
2939
2940         default:
2941              // nothing to do but put here to suppress gcc warnings
2942              break;
2943     }
2944
2945     // step (3)
2946 #if wxUSE_FONTMAP
2947     {
2948         wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
2949                                       : new wxMBConv_wxwin(m_encoding);
2950         if ( conv->IsOk() )
2951             return conv;
2952
2953         delete conv;
2954     }
2955 #endif // wxUSE_FONTMAP
2956
2957     // NB: This is a hack to prevent deadlock. What could otherwise happen
2958     //     in Unicode build: wxConvLocal creation ends up being here
2959     //     because of some failure and logs the error. But wxLog will try to
2960     //     attach a timestamp, for which it will need wxConvLocal (to convert
2961     //     time to char* and then wchar_t*), but that fails, tries to log the
2962     //     error, but wxLog has an (already locked) critical section that
2963     //     guards the static buffer.
2964     static bool alreadyLoggingError = false;
2965     if (!alreadyLoggingError)
2966     {
2967         alreadyLoggingError = true;
2968         wxLogError(_("Cannot convert from the charset '%s'!"),
2969                    m_name ? m_name
2970                       :
2971 #if wxUSE_FONTMAP
2972                          (const char*)wxFontMapperBase::GetEncodingDescription(m_encoding).ToAscii()
2973 #else // !wxUSE_FONTMAP
2974                          (const char*)wxString::Format(_("encoding %i"), m_encoding).ToAscii()
2975 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
2976               );
2977
2978         alreadyLoggingError = false;
2979     }
2980
2981     return NULL;
2982 }
2983
2984 void wxCSConv::CreateConvIfNeeded() const
2985 {
2986     if ( m_deferred )
2987     {
2988         wxCSConv *self = (wxCSConv *)this; // const_cast
2989
2990         // if we don't have neither the name nor the encoding, use the default
2991         // encoding for this system
2992         if ( !m_name && m_encoding == wxFONTENCODING_SYSTEM )
2993         {
2994 #if wxUSE_INTL
2995             self->m_encoding = wxLocale::GetSystemEncoding();
2996 #else
2997             // fallback to some reasonable default:
2998             self->m_encoding = wxFONTENCODING_ISO8859_1;
2999 #endif // wxUSE_INTL
3000         }
3001
3002         self->m_convReal = DoCreate();
3003         self->m_deferred = false;
3004     }
3005 }
3006
3007 bool wxCSConv::IsOk() const
3008 {
3009     CreateConvIfNeeded();
3010
3011     // special case: no convReal created for wxFONTENCODING_ISO8859_1
3012     if ( m_encoding == wxFONTENCODING_ISO8859_1 )
3013         return true; // always ok as we do it ourselves
3014
3015     // m_convReal->IsOk() is called at its own creation, so we know it must
3016     // be ok if m_convReal is non-NULL
3017     return m_convReal != NULL;
3018 }
3019
3020 size_t wxCSConv::ToWChar(wchar_t *dst, size_t dstLen,
3021                          const char *src, size_t srcLen) const
3022 {
3023     CreateConvIfNeeded();
3024
3025     if (m_convReal)
3026         return m_convReal->ToWChar(dst, dstLen, src, srcLen);
3027
3028     // latin-1 (direct)
3029     return wxMBConv::ToWChar(dst, dstLen, src, srcLen);
3030 }
3031
3032 size_t wxCSConv::FromWChar(char *dst, size_t dstLen,
3033                            const wchar_t *src, size_t srcLen) const
3034 {
3035     CreateConvIfNeeded();
3036
3037     if (m_convReal)
3038         return m_convReal->FromWChar(dst, dstLen, src, srcLen);
3039
3040     // latin-1 (direct)
3041     return wxMBConv::FromWChar(dst, dstLen, src, srcLen);
3042 }
3043
3044 size_t wxCSConv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
3045 {
3046     CreateConvIfNeeded();
3047
3048     if (m_convReal)
3049         return m_convReal->MB2WC(buf, psz, n);
3050
3051     // latin-1 (direct)
3052     size_t len = strlen(psz);
3053
3054     if (buf)
3055     {
3056         for (size_t c = 0; c <= len; c++)
3057             buf[c] = (unsigned char)(psz[c]);
3058     }
3059
3060     return len;
3061 }
3062
3063 size_t wxCSConv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
3064 {
3065     CreateConvIfNeeded();
3066
3067     if (m_convReal)
3068         return m_convReal->WC2MB(buf, psz, n);
3069
3070     // latin-1 (direct)
3071     const size_t len = wxWcslen(psz);
3072     if (buf)
3073     {
3074         for (size_t c = 0; c <= len; c++)
3075         {
3076             if (psz[c] > 0xFF)
3077                 return wxCONV_FAILED;
3078
3079             buf[c] = (char)psz[c];
3080         }
3081     }
3082     else
3083     {
3084         for (size_t c = 0; c <= len; c++)
3085         {
3086             if (psz[c] > 0xFF)
3087                 return wxCONV_FAILED;
3088         }
3089     }
3090
3091     return len;
3092 }
3093
3094 size_t wxCSConv::GetMBNulLen() const
3095 {
3096     CreateConvIfNeeded();
3097
3098     if ( m_convReal )
3099     {
3100         return m_convReal->GetMBNulLen();
3101     }
3102
3103     // otherwise, we are ISO-8859-1
3104     return 1;
3105 }
3106
3107 #if wxUSE_UNICODE_UTF8
3108 bool wxCSConv::IsUTF8() const
3109 {
3110     CreateConvIfNeeded();
3111
3112     if ( m_convReal )
3113     {
3114         return m_convReal->IsUTF8();
3115     }
3116
3117     // otherwise, we are ISO-8859-1
3118     return false;
3119 }
3120 #endif
3121
3122
3123 #if wxUSE_UNICODE
3124
3125 wxWCharBuffer wxSafeConvertMB2WX(const char *s)
3126 {
3127     if ( !s )
3128         return wxWCharBuffer();
3129
3130     wxWCharBuffer wbuf(wxConvLibc.cMB2WX(s));
3131     if ( !wbuf )
3132         wbuf = wxMBConvUTF8().cMB2WX(s);
3133     if ( !wbuf )
3134         wbuf = wxConvISO8859_1.cMB2WX(s);
3135
3136     return wbuf;
3137 }
3138
3139 wxCharBuffer wxSafeConvertWX2MB(const wchar_t *ws)
3140 {
3141     if ( !ws )
3142         return wxCharBuffer();
3143
3144     wxCharBuffer buf(wxConvLibc.cWX2MB(ws));
3145     if ( !buf )
3146         buf = wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL).cWX2MB(ws);
3147
3148     return buf;
3149 }
3150
3151 #endif // wxUSE_UNICODE
3152
3153 // ----------------------------------------------------------------------------
3154 // globals
3155 // ----------------------------------------------------------------------------
3156
3157 // NB: The reason why we create converter objects in this convoluted way,
3158 //     using a factory function instead of global variable, is that they
3159 //     may be used at static initialization time (some of them are used by
3160 //     wxString ctors and there may be a global wxString object). In other
3161 //     words, possibly _before_ the converter global object would be
3162 //     initialized.
3163
3164 #undef wxConvLibc
3165 #undef wxConvUTF8
3166 #undef wxConvUTF7
3167 #undef wxConvLocal
3168 #undef wxConvISO8859_1
3169
// Define the global converter called "name".
//
// The expansion of this macro consists of:
//  - the definition of the pointer name##Ptr of type klass*, initialized to
//    NULL (it is not assigned anything else by this macro itself);
//  - the definition of the function wxGet_##name##Ptr() which returns, as
//    klass*, the address of its local static object of type impl_klass
//    constructed using ctor_args;
//  - the definition of a static pointer gs_##name##instance initialized by
//    calling this function.
//
// Notice that the expansion doesn't end with a semicolon, it must be
// provided where the macro is used.
3170 #define WX_DEFINE_GLOBAL_CONV2(klass, impl_klass, name, ctor_args)      \
3171     WXDLLIMPEXP_DATA_BASE(klass*) name##Ptr = NULL;                     \
3172     WXDLLIMPEXP_BASE klass* wxGet_##name##Ptr()                         \
3173     {                                                                   \
3174         static impl_klass name##Obj ctor_args;                          \
3175         return &name##Obj;                                              \
3176     }                                                                   \
3177     /* this ensures that all global converter objects are created */    \
3178     /* by the time static initialization is done, i.e. before any */    \
3179     /* thread is launched: */                                           \
3180     static klass* gs_##name##instance = wxGet_##name##Ptr()
3181
// Same as WX_DEFINE_GLOBAL_CONV2 when the object is of the same class klass
// as the pointers to it.
3182 #define WX_DEFINE_GLOBAL_CONV(klass, name, ctor_args) \
3183     WX_DEFINE_GLOBAL_CONV2(klass, klass, name, ctor_args)
3184
// Definitions of the global converters themselves.
//
// wxConvLibc is accessed as wxMBConv in both cases but the object itself is
// a wxMBConv_win32 if __WINDOWS__ is defined and a wxMBConvLibc otherwise.
3185 #ifdef __WINDOWS__
3186     WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConv_win32, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
3187 #else
3188     WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConvLibc, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
3189 #endif
3190
// notice that wxConvUTF8 is a wxMBConvStrictUTF8 object; both it and
// wxConvUTF7 are constructed without any arguments
3191 WX_DEFINE_GLOBAL_CONV(wxMBConvStrictUTF8, wxConvUTF8, wxEMPTY_PARAMETER_VALUE);
3192 WX_DEFINE_GLOBAL_CONV(wxMBConvUTF7, wxConvUTF7, wxEMPTY_PARAMETER_VALUE);
3193
// these two are wxCSConv objects constructed from the given encodings
3194 WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvLocal, (wxFONTENCODING_SYSTEM));
3195 WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvISO8859_1, (wxFONTENCODING_ISO8859_1));
3196
// wxConvCurrent initially points to the wxConvLibc object and wxConvUI to
// the wxConvLocal one
3197 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = wxGet_wxConvLibcPtr();
3198 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvUI = wxGet_wxConvLocalPtr();
3199
3200 #ifdef __DARWIN__
3201 // The xnu kernel always communicates file paths in decomposed UTF-8.
3202 // WARNING: Are we sure that CFString's conversion will cause decomposition?
3203 static wxMBConv_cf wxConvMacUTF8DObj(wxFONTENCODING_UTF8);
3204 #endif
3205
3206 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName =
3207 #ifdef __DARWIN__
3208                                     &wxConvMacUTF8DObj;
3209 #else // !__DARWIN__
3210                                     wxGet_wxConvLibcPtr();
3211 #endif // __DARWIN__/!__DARWIN__
3212
3213 #else // !wxUSE_WCHAR_T
3214
3215 // FIXME-UTF8: remove this, wxUSE_WCHAR_T is required now
3216 // stand-ins in absence of wchar_t
3217 WXDLLIMPEXP_DATA_BASE(wxMBConv) wxConvLibc,
3218                                 wxConvISO8859_1,
3219                                 wxConvLocal,
3220                                 wxConvUTF8;
3221
3222 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T