src/common/strconv.cpp

   1 /////////////////////////////////////////////////////////////////////////////
   2 // Name:        src/common/strconv.cpp
   3 // Purpose:     Unicode conversion classes
   4 // Author:      Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
   5 //              Ryan Norton, Fredrik Roubert (UTF7)
   6 // Modified by:
   7 // Created:     29/01/98
   8 // RCS-ID:      $Id$
   9 // Copyright:   (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
  10 //              (c) 2000-2003 Vadim Zeitlin
  11 //              (c) 2004 Ryan Norton, Fredrik Roubert
  12 // Licence:     wxWindows licence
  13 /////////////////////////////////////////////////////////////////////////////
  14
  15 // For compilers that support precompilation, includes "wx.h".
  16 #include "wx/wxprec.h"
  17
  18 #ifdef __BORLANDC__
  19     #pragma hdrstop
  20 #endif  //__BORLANDC__
  21
  22 #ifndef WX_PRECOMP
  23     #include "wx/intl.h"
  24     #include "wx/log.h"
  25     #include "wx/utils.h"
  26     #include "wx/hashmap.h"
  27 #endif
  28
  29 #include "wx/strconv.h"
  30
  31 #if wxUSE_WCHAR_T
  32
  33 #ifndef __WXWINCE__
  34 #include <errno.h>
  35 #endif
  36
  37 #include <ctype.h>
  38 #include <string.h>
  39 #include <stdlib.h>
  40
  41 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
  42     #include "wx/msw/private.h"
  43     #include "wx/msw/missing.h"
  44     #define wxHAVE_WIN32_MB2WC
  45 #endif
  46
  47 #ifdef __SALFORDC__
  48     #include <clib.h>
  49 #endif
  50
  51 #ifdef HAVE_ICONV
  52     #include <iconv.h>
  53     #include "wx/thread.h"
  54 #endif
  55
  56 #include "wx/encconv.h"
  57 #include "wx/fontmap.h"
  58
  59 #ifdef __DARWIN__
  60 #include "wx/mac/corefoundation/private/strconv_cf.h"
  61 #endif //def __DARWIN__
  62
  63
  64 #define TRACE_STRCONV _T("strconv")
  65
  66 // WC_UTF16 is defined only if sizeof(wchar_t) == 2, otherwise it's supposed to
  67 // be 4 bytes
  68 #if SIZEOF_WCHAR_T == 2
  69     #define WC_UTF16
  70 #endif
  71
  72
  73 // ============================================================================
  74 // implementation
  75 // ============================================================================
  76
  77 // helper function of cMB2WC(): check if n bytes at this location are all NUL
  78 static bool NotAllNULs(const char *p, size_t n)
  79 {
  80     while ( n && *p++ == '\0' )
  81         n--;
  82
  83     return n != 0;
  84 }
  85
  86 // ----------------------------------------------------------------------------
  87 // UTF-16 en/decoding to/from UCS-4 with surrogates handling
  88 // ----------------------------------------------------------------------------
  89
  90 static size_t encode_utf16(wxUint32 input, wxUint16 *output)
  91 {
  92     if (input <= 0xffff)
  93     {
  94         if (output)
  95             *output = (wxUint16) input;
  96
  97         return 1;
  98     }
  99     else if (input >= 0x110000)
 100     {
 101         return wxCONV_FAILED;
 102     }
 103     else
 104     {
 105         if (output)
 106         {
 107             *output++ = (wxUint16) ((input >> 10) + 0xd7c0);
 108             *output = (wxUint16) ((input & 0x3ff) + 0xdc00);
 109         }
 110
 111         return 2;
 112     }
 113 }
 114
 115 static size_t decode_utf16(const wxUint16* input, wxUint32& output)
 116 {
 117     if ((*input < 0xd800) || (*input > 0xdfff))
 118     {
 119         output = *input;
 120         return 1;
 121     }
 122     else if ((input[1] < 0xdc00) || (input[1] > 0xdfff))
 123     {
 124         output = *input;
 125         return wxCONV_FAILED;
 126     }
 127     else
 128     {
 129         output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
 130         return 2;
 131     }
 132 }
 133
// type of the UTF-16 code units read by wxDecodeSurrogate(): wchar_t itself
// when WC_UTF16 is defined (i.e. wchar_t is 2 bytes wide), wxUint16 otherwise
#ifdef WC_UTF16
    typedef wchar_t wxDecodeSurrogate_t;
#else // !WC_UTF16
    typedef wxUint16 wxDecodeSurrogate_t;
#endif // WC_UTF16/!WC_UTF16
 139
 140 // returns the next UTF-32 character from the wchar_t buffer and advances the
 141 // pointer to the character after this one
 142 //
 143 // if an invalid character is found, *pSrc is set to NULL, the caller must
 144 // check for this
 145 static wxUint32 wxDecodeSurrogate(const wxDecodeSurrogate_t **pSrc)
 146 {
 147     wxUint32 out;
 148     const size_t
 149         n = decode_utf16(wx_reinterpret_cast(const wxUint16 *, *pSrc), out);
 150     if ( n == wxCONV_FAILED )
 151         *pSrc = NULL;
 152     else
 153         *pSrc += n;
 154
 155     return out;
 156 }
 157
 158 // ----------------------------------------------------------------------------
 159 // wxMBConv
 160 // ----------------------------------------------------------------------------
 161
// Default implementation of multibyte -> wide char conversion in terms of the
// old-style MB2WC().
//
// Parameters:
//  dst     output buffer or NULL to only compute the required size
//  dstLen  size of dst in wide characters (not used if dst is NULL)
//  src     input multibyte string
//  srcLen  input length in bytes or wxNO_LEN if src is NUL-terminated
//
// Returns the number of wide characters written (or which would be written),
// counting the L'\0' terminating each converted chunk, or wxCONV_FAILED if
// the conversion failed or the output doesn't fit into dstLen.
size_t
wxMBConv::ToWChar(wchar_t *dst, size_t dstLen,
                  const char *src, size_t srcLen) const
{
    // although new conversion classes are supposed to implement this function
    // directly, the existing ones only implement the old MB2WC() and so, to
    // avoid having to rewrite all conversion classes at once, we provide a
    // default (but not efficient) implementation of this one in terms of the
    // old function by copying the input to ensure that it's NUL-terminated and
    // then using MB2WC() to convert it

    // the number of chars [which would be] written to dst [if it were not NULL]
    size_t dstWritten = 0;

    // the number of NULs terminating this string
    size_t nulLen = 0;  // not really needed, but just to avoid warnings

    // if we were not given the input size we just have to assume that the
    // string is properly terminated as we have no way of knowing how long it
    // is anyhow, but if we do have the size check whether there are enough
    // NULs at the end
    wxCharBuffer bufTmp;
    const char *srcEnd;
    if ( srcLen != wxNO_LEN )
    {
        // we need to know how to find the end of this string
        nulLen = GetMBNulLen();
        if ( nulLen == wxCONV_FAILED )
            return wxCONV_FAILED;

        // if there are enough NULs we can avoid the copy
        if ( srcLen < nulLen || NotAllNULs(src + srcLen - nulLen, nulLen) )
        {
            // make a copy in order to properly NUL-terminate the string: the
            // copy holds the srcLen input bytes followed by nulLen NUL bytes
            bufTmp = wxCharBuffer(srcLen + nulLen - 1 /* 1 will be added */);
            char * const p = bufTmp.data();
            memcpy(p, src, srcLen);
            for ( char *s = p + srcLen; s < p + srcLen + nulLen; s++ )
                *s = '\0';

            src = bufTmp;
        }

        srcEnd = src + srcLen;
    }
    else // quit after the first loop iteration
    {
        srcEnd = NULL;
    }

    // each iteration converts one NUL-terminated chunk of the input
    for ( ;; )
    {
        // try to convert the current chunk: the first call only computes the
        // length of the result
        size_t lenChunk = MB2WC(NULL, src, 0);
        if ( lenChunk == wxCONV_FAILED )
            return wxCONV_FAILED;

        lenChunk++; // for the L'\0' at the end of this chunk

        dstWritten += lenChunk;

        if ( lenChunk == 1 )
        {
            // nothing left in the input string, conversion succeeded
            //
            // NOTE(review): the L'\0' of this empty chunk is counted in
            // dstWritten but nothing is stored in dst for it here; also, an
            // empty chunk ends the conversion even if srcEnd has not been
            // reached yet -- confirm that the callers expect this
            break;
        }

        if ( dst )
        {
            if ( dstWritten > dstLen )
                return wxCONV_FAILED;

            if ( MB2WC(dst, src, lenChunk) == wxCONV_FAILED )
                return wxCONV_FAILED;

            dst += lenChunk;
        }

        if ( !srcEnd )
        {
            // we convert just one chunk in this case as this is the entire
            // string anyhow
            break;
        }

        // advance the input pointer past the end of this chunk
        while ( NotAllNULs(src, nulLen) )
        {
            // notice that we must skip over multiple bytes here as we suppose
            // that if NUL takes 2 or 4 bytes, then all the other characters do
            // too and so if advanced by a single byte we might erroneously
            // detect sequences of NUL bytes in the middle of the input
            src += nulLen;
        }

        src += nulLen; // skipping over its terminator as well

        // note that ">=" (and not just "==") is needed here as the terminator
        // we skipped just above could be inside or just after the buffer
        // delimited by srcEnd
        if ( src >= srcEnd )
            break;
    }

    return dstWritten;
}
 268
 269 size_t
 270 wxMBConv::FromWChar(char *dst, size_t dstLen,
 271                     const wchar_t *src, size_t srcLen) const
 272 {
 273     // the number of chars [which would be] written to dst [if it were not NULL]
 274     size_t dstWritten = 0;
 275
 276     // make a copy of the input string unless it is already properly
 277     // NUL-terminated
 278     //
 279     // if we don't know its length we have no choice but to assume that it is,
 280     // indeed, properly terminated
 281     wxWCharBuffer bufTmp;
 282     if ( srcLen == wxNO_LEN )
 283     {
 284         srcLen = wxWcslen(src) + 1;
 285     }
 286     else if ( srcLen != 0 && src[srcLen - 1] != L'\0' )
 287     {
 288         // make a copy in order to properly NUL-terminate the string
 289         bufTmp = wxWCharBuffer(srcLen);
 290         memcpy(bufTmp.data(), src, srcLen * sizeof(wchar_t));
 291         src = bufTmp;
 292     }
 293
 294     const size_t lenNul = GetMBNulLen();
 295     for ( const wchar_t * const srcEnd = src + srcLen;
 296           src < srcEnd;
 297           src += wxWcslen(src) + 1 /* skip L'\0' too */ )
 298     {
 299         // try to convert the current chunk
 300         size_t lenChunk = WC2MB(NULL, src, 0);
 301
 302         if ( lenChunk == wxCONV_FAILED )
 303             return wxCONV_FAILED;
 304
 305         lenChunk += lenNul;
 306         dstWritten += lenChunk;
 307
 308         if ( dst )
 309         {
 310             if ( dstWritten > dstLen )
 311                 return wxCONV_FAILED;
 312
 313             if ( WC2MB(dst, src, lenChunk) == wxCONV_FAILED )
 314                 return wxCONV_FAILED;
 315
 316             dst += lenChunk;
 317         }
 318     }
 319
 320     return dstWritten;
 321 }
 322
 323 size_t wxMBConv::MB2WC(wchar_t *outBuff, const char *inBuff, size_t outLen) const
 324 {
 325     size_t rc = ToWChar(outBuff, outLen, inBuff);
 326     if ( rc != wxCONV_FAILED )
 327     {
 328         // ToWChar() returns the buffer length, i.e. including the trailing
 329         // NUL, while this method doesn't take it into account
 330         rc--;
 331     }
 332
 333     return rc;
 334 }
 335
 336 size_t wxMBConv::WC2MB(char *outBuff, const wchar_t *inBuff, size_t outLen) const
 337 {
 338     size_t rc = FromWChar(outBuff, outLen, inBuff);
 339     if ( rc != wxCONV_FAILED )
 340     {
 341         rc -= GetMBNulLen();
 342     }
 343
 344     return rc;
 345 }
 346
// out-of-line destructor: there is nothing to clean up here
wxMBConv::~wxMBConv()
{
    // nothing to do here (necessary for Darwin linking probably)
}
 351
 352 const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
 353 {
 354     if ( psz )
 355     {
 356         // calculate the length of the buffer needed first
 357         const size_t nLen = ToWChar(NULL, 0, psz);
 358         if ( nLen != wxCONV_FAILED )
 359         {
 360             // now do the actual conversion
 361             wxWCharBuffer buf(nLen - 1 /* +1 added implicitly */);
 362
 363             // +1 for the trailing NULL
 364             if ( ToWChar(buf.data(), nLen, psz) != wxCONV_FAILED )
 365                 return buf;
 366         }
 367     }
 368
 369     return wxWCharBuffer();
 370 }
 371
 372 const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
 373 {
 374     if ( pwz )
 375     {
 376         const size_t nLen = FromWChar(NULL, 0, pwz);
 377         if ( nLen != wxCONV_FAILED )
 378         {
 379             wxCharBuffer buf(nLen - 1);
 380             if ( FromWChar(buf.data(), nLen, pwz) != wxCONV_FAILED )
 381                 return buf;
 382         }
 383     }
 384
 385     return wxCharBuffer();
 386 }
 387
 388 const wxWCharBuffer
 389 wxMBConv::cMB2WC(const char *inBuff, size_t inLen, size_t *outLen) const
 390 {
 391     const size_t dstLen = ToWChar(NULL, 0, inBuff, inLen);
 392     if ( dstLen != wxCONV_FAILED )
 393     {
 394         // notice that we allocate space for dstLen+1 wide characters here
 395         // because we want the buffer to always be NUL-terminated, even if the
 396         // input isn't (as otherwise the caller has no way to know its length)
 397         wxWCharBuffer wbuf(dstLen);
 398         wbuf.data()[dstLen - 1] = L'\0';
 399         if ( ToWChar(wbuf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
 400         {
 401             if ( outLen )
 402             {
 403                 *outLen = dstLen;
 404                 if ( wbuf[dstLen - 1] == L'\0' )
 405                     (*outLen)--;
 406             }
 407
 408             return wbuf;
 409         }
 410     }
 411
 412     if ( outLen )
 413         *outLen = 0;
 414
 415     return wxWCharBuffer();
 416 }
 417
 418 const wxCharBuffer
 419 wxMBConv::cWC2MB(const wchar_t *inBuff, size_t inLen, size_t *outLen) const
 420 {
 421     size_t dstLen = FromWChar(NULL, 0, inBuff, inLen);
 422     if ( dstLen != wxCONV_FAILED )
 423     {
 424         const size_t nulLen = GetMBNulLen();
 425
 426         // as above, ensure that the buffer is always NUL-terminated, even if
 427         // the input is not
 428         wxCharBuffer buf(dstLen + nulLen - 1);
 429         memset(buf.data() + dstLen, 0, nulLen);
 430         if ( FromWChar(buf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
 431         {
 432             if ( outLen )
 433             {
 434                 *outLen = dstLen;
 435
 436                 if ( dstLen >= nulLen &&
 437                         !NotAllNULs(buf.data() + dstLen - nulLen, nulLen) )
 438                 {
 439                     // in this case the output is NUL-terminated and we're not
 440                     // supposed to count NUL
 441                     *outLen -= nulLen;
 442                 }
 443             }
 444
 445             return buf;
 446         }
 447     }
 448
 449     if ( outLen )
 450         *outLen = 0;
 451
 452     return wxCharBuffer();
 453 }
 454
 455 // ----------------------------------------------------------------------------
 456 // wxMBConvLibc
 457 // ----------------------------------------------------------------------------
 458
 459 size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 460 {
 461     return wxMB2WC(buf, psz, n);
 462 }
 463
 464 size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 465 {
 466     return wxWC2MB(buf, psz, n);
 467 }
 468
 469 // ----------------------------------------------------------------------------
 470 // wxConvBrokenFileNames
 471 // ----------------------------------------------------------------------------
 472
 473 #ifdef __UNIX__
 474
 475 wxConvBrokenFileNames::wxConvBrokenFileNames(const wxString& charset)
 476 {
 477     if ( wxStricmp(charset, _T("UTF-8")) == 0 ||
 478          wxStricmp(charset, _T("UTF8")) == 0  )
 479         m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_PUA);
 480     else
 481         m_conv = new wxCSConv(charset);
 482 }
 483
 484 #endif // __UNIX__
 485
 486 // ----------------------------------------------------------------------------
 487 // UTF-7
 488 // ----------------------------------------------------------------------------
 489
 490 // Implementation (C) 2004 Fredrik Roubert
 491
 492 //
 493 // BASE64 decoding table
 494 //
 495 static const unsigned char utf7unb64[] =
 496 {
 497     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 498     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 499     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 500     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 501     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 502     0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
 503     0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b,
 504     0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 505     0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
 506     0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
 507     0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
 508     0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
 509     0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
 510     0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
 511     0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30,
 512     0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
 513     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 514     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 515     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 516     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 517     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 518     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 519     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 520     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 521     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 522     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 523     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 524     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 525     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 526     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 527     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 528     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
 529 };
 530
 531 size_t wxMBConvUTF7::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 532 {
 533     size_t len = 0;
 534
 535     while ( *psz && (!buf || (len < n)) )
 536     {
 537         unsigned char cc = *psz++;
 538         if (cc != '+')
 539         {
 540             // plain ASCII char
 541             if (buf)
 542                 *buf++ = cc;
 543             len++;
 544         }
 545         else if (*psz == '-')
 546         {
 547             // encoded plus sign
 548             if (buf)
 549                 *buf++ = cc;
 550             len++;
 551             psz++;
 552         }
 553         else // start of BASE64 encoded string
 554         {
 555             bool lsb, ok;
 556             unsigned int d, l;
 557             for ( ok = lsb = false, d = 0, l = 0;
 558                   (cc = utf7unb64[(unsigned char)*psz]) != 0xff;
 559                   psz++ )
 560             {
 561                 d <<= 6;
 562                 d += cc;
 563                 for (l += 6; l >= 8; lsb = !lsb)
 564                 {
 565                     unsigned char c = (unsigned char)((d >> (l -= 8)) % 256);
 566                     if (lsb)
 567                     {
 568                         if (buf)
 569                             *buf++ |= c;
 570                         len ++;
 571                     }
 572                     else
 573                     {
 574                         if (buf)
 575                             *buf = (wchar_t)(c << 8);
 576                     }
 577
 578                     ok = true;
 579                 }
 580             }
 581
 582             if ( !ok )
 583             {
 584                 // in valid UTF7 we should have valid characters after '+'
 585                 return wxCONV_FAILED;
 586             }
 587
 588             if (*psz == '-')
 589                 psz++;
 590         }
 591     }
 592
 593     if ( buf && (len < n) )
 594         *buf = '\0';
 595
 596     return len;
 597 }
 598
 599 //
 600 // BASE64 encoding table
 601 //
 602 static const unsigned char utf7enb64[] =
 603 {
 604     'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
 605     'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
 606     'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
 607     'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
 608     'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
 609     'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
 610     'w', 'x', 'y', 'z', '0', '1', '2', '3',
 611     '4', '5', '6', '7', '8', '9', '+', '/'
 612 };
 613
 614 //
 615 // UTF-7 encoding table
 616 //
 617 // 0 - Set D (directly encoded characters)
 618 // 1 - Set O (optional direct characters)
 619 // 2 - whitespace characters (optional)
 620 // 3 - special characters
 621 //
 622 static const unsigned char utf7encode[128] =
 623 {
 624     3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
 625     3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
 626     2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
 627     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
 628     1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 629     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
 630     1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 631     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
 632 };
 633
 634 size_t wxMBConvUTF7::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 635 {
 636     size_t len = 0;
 637
 638     while (*psz && ((!buf) || (len < n)))
 639     {
 640         wchar_t cc = *psz++;
 641         if (cc < 0x80 && utf7encode[cc] < 1)
 642         {
 643             // plain ASCII char
 644             if (buf)
 645                 *buf++ = (char)cc;
 646
 647             len++;
 648         }
 649 #ifndef WC_UTF16
 650         else if (((wxUint32)cc) > 0xffff)
 651         {
 652             // no surrogate pair generation (yet?)
 653             return wxCONV_FAILED;
 654         }
 655 #endif
 656         else
 657         {
 658             if (buf)
 659                 *buf++ = '+';
 660
 661             len++;
 662             if (cc != '+')
 663             {
 664                 // BASE64 encode string
 665                 unsigned int lsb, d, l;
 666                 for (d = 0, l = 0; /*nothing*/; psz++)
 667                 {
 668                     for (lsb = 0; lsb < 2; lsb ++)
 669                     {
 670                         d <<= 8;
 671                         d += lsb ? cc & 0xff : (cc & 0xff00) >> 8;
 672
 673                         for (l += 8; l >= 6; )
 674                         {
 675                             l -= 6;
 676                             if (buf)
 677                                 *buf++ = utf7enb64[(d >> l) % 64];
 678                             len++;
 679                         }
 680                     }
 681
 682                     cc = *psz;
 683                     if (!(cc) || (cc < 0x80 && utf7encode[cc] < 1))
 684                         break;
 685                 }
 686
 687                 if (l != 0)
 688                 {
 689                     if (buf)
 690                         *buf++ = utf7enb64[((d % 16) << (6 - l)) % 64];
 691
 692                     len++;
 693                 }
 694             }
 695
 696             if (buf)
 697                 *buf++ = '-';
 698             len++;
 699         }
 700     }
 701
 702     if (buf && (len < n))
 703         *buf = 0;
 704
 705     return len;
 706 }
 707
 708 // ----------------------------------------------------------------------------
 709 // UTF-8
 710 // ----------------------------------------------------------------------------
 711
// NOTE(review): these look like the largest values representable by UTF-8
// sequences of 1 to 6 bytes, followed by the maximal 32 bit value; the code
// using this table is not in this part of the file -- confirm
static const wxUint32 utf8_max[]=
    { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
 714
// boundaries of the private use area range we use to (temporarily) remap the
// characters which are invalid in a UTF-8 encoded string: it consists of 256
// code points starting at wxUnicodePUA
const wxUint32 wxUnicodePUA = 0x100000;
const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
 719
 720 // this table gives the length of the UTF-8 encoding from its first character:
 721 const unsigned char tableUtf8Lengths[256] = {
 722     // single-byte sequences (ASCII):
 723     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 00..0F
 724     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 10..1F
 725     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 20..2F
 726     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 30..3F
 727     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 40..4F
 728     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 50..5F
 729     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 60..6F
 730     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 70..7F
 731
 732     // these are invalid:
 733     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 80..8F
 734     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 90..9F
 735     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // A0..AF
 736     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // B0..BF
 737     0, 0,                                            // C0,C1
 738
 739     // two-byte sequences:
 740           2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // C2..CF
 741     2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // D0..DF
 742
 743     // three-byte sequences:
 744     3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,  // E0..EF
 745
 746     // four-byte sequences:
 747     4, 4, 4, 4, 4,                                   // F0..F4
 748
 749     // these are invalid again (5- or 6-byte
 750     // sequences and sequences for code points
 751     // above U+10FFFF, as restricted by RFC 3629):
 752                    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0   // F5..FF
 753 };
 754
 755 size_t
 756 wxMBConvStrictUTF8::ToWChar(wchar_t *dst, size_t dstLen,
 757                             const char *src, size_t srcLen) const
 758 {
 759     wchar_t *out = dstLen ? dst : NULL;
 760     size_t written = 0;
 761
 762     if ( srcLen == wxNO_LEN )
 763         srcLen = strlen(src) + 1;
 764
 765     for ( const char *p = src; ; p++ )
 766     {
 767         if ( !(srcLen == wxNO_LEN ? *p : srcLen) )
 768         {
 769             // all done successfully, just add the trailing NULL if we are not
 770             // using explicit length
 771             if ( srcLen == wxNO_LEN )
 772             {
 773                 if ( out )
 774                 {
 775                     if ( !dstLen )
 776                         break;
 777
 778                     *out = L'\0';
 779                 }
 780
 781                 written++;
 782             }
 783
 784             return written;
 785         }
 786
 787         if ( out && !dstLen-- )
 788             break;
 789
 790         wxUint32 code;
 791         unsigned char c = *p;
 792
 793         if ( c < 0x80 )
 794         {
 795             if ( srcLen == 0 ) // the test works for wxNO_LEN too
 796                 break;
 797
 798             if ( srcLen != wxNO_LEN )
 799                 srcLen--;
 800
 801             code = c;
 802         }
 803         else
 804         {
 805             unsigned len = tableUtf8Lengths[c];
 806             if ( !len )
 807                 break;
 808
 809             if ( srcLen < len ) // the test works for wxNO_LEN too
 810                 break;
 811
 812             if ( srcLen != wxNO_LEN )
 813                 srcLen -= len;
 814
 815             //   Char. number range   |        UTF-8 octet sequence
 816             //      (hexadecimal)     |              (binary)
 817             //  ----------------------+----------------------------------------
 818             //  0000 0000 - 0000 007F | 0xxxxxxx
 819             //  0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
 820             //  0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
 821             //  0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 822             //
 823             //  Code point value is stored in bits marked with 'x',
 824             //  lowest-order bit of the value on the right side in the diagram
 825             //  above.                                         (from RFC 3629)
 826
 827             // mask to extract lead byte's value ('x' bits above), by sequence
 828             // length:
 829             static const unsigned char leadValueMask[] = { 0x7F, 0x1F, 0x0F, 0x07 };
 830
 831             // mask and value of lead byte's most significant bits, by length:
 832             static const unsigned char leadMarkerMask[] = { 0x80, 0xE0, 0xF0, 0xF8 };
 833             static const unsigned char leadMarkerVal[] = { 0x00, 0xC0, 0xE0, 0xF0 };
 834
 835             len--; // it's more convenient to work with 0-based length here
 836
 837             // extract the lead byte's value bits:
 838             if ( (c & leadMarkerMask[len]) != leadMarkerVal[len] )
 839                 break;
 840
 841             code = c & leadValueMask[len];
 842
 843             // all remaining bytes, if any, are handled in the same way
 844             // regardless of sequence's length:
 845             for ( ; len; --len )
 846             {
 847                 c = *++p;
 848                 if ( (c & 0xC0) != 0x80 )
 849                     return wxCONV_FAILED;
 850
 851                 code <<= 6;
 852                 code |= c & 0x3F;
 853             }
 854         }
 855
 856 #ifdef WC_UTF16
 857         // cast is ok because wchar_t == wxUint16 if WC_UTF16
 858         if ( encode_utf16(code, (wxUint16 *)out) == 2 )
 859         {
 860             if ( out )
 861                 out++;
 862             written++;
 863         }
 864 #else // !WC_UTF16
 865         if ( out )
 866             *out = code;
 867 #endif // WC_UTF16/!WC_UTF16
 868
 869         if ( out )
 870             out++;
 871
 872         written++;
 873     }
 874
 875     return wxCONV_FAILED;
 876 }
 877
 878 size_t
 879 wxMBConvStrictUTF8::FromWChar(char *dst, size_t dstLen,
 880                               const wchar_t *src, size_t srcLen) const
 881 {
 882     char *out = dstLen ? dst : NULL;
 883     size_t written = 0;
 884
 885     for ( const wchar_t *wp = src; ; wp++ )
 886     {
 887         if ( !(srcLen == wxNO_LEN ? *wp : srcLen--) )
 888         {
 889             // all done successfully, just add the trailing NULL if we are not
 890             // using explicit length
 891             if ( srcLen == wxNO_LEN )
 892             {
 893                 if ( out )
 894                 {
 895                     if ( !dstLen )
 896                         break;
 897
 898                     *out = '\0';
 899                 }
 900
 901                 written++;
 902             }
 903
 904             return written;
 905         }
 906
 907
 908         wxUint32 code;
 909 #ifdef WC_UTF16
 910         // cast is ok for WC_UTF16
 911         if ( decode_utf16((const wxUint16 *)wp, code) == 2 )
 912         {
 913             // skip the next char too as we decoded a surrogate
 914             wp++;
 915         }
 916 #else // wchar_t is UTF-32
 917         code = *wp & 0x7fffffff;
 918 #endif
 919
 920         unsigned len;
 921         if ( code <= 0x7F )
 922         {
 923             len = 1;
 924             if ( out )
 925             {
 926                 if ( dstLen < len )
 927                     break;
 928
 929                 out[0] = (char)code;
 930             }
 931         }
 932         else if ( code <= 0x07FF )
 933         {
 934             len = 2;
 935             if ( out )
 936             {
 937                 if ( dstLen < len )
 938                     break;
 939
 940                 // NB: this line takes 6 least significant bits, encodes them as
 941                 // 10xxxxxx and discards them so that the next byte can be encoded:
 942                 out[1] = 0x80 | (code & 0x3F);  code >>= 6;
 943                 out[0] = 0xC0 | code;
 944             }
 945         }
 946         else if ( code < 0xFFFF )
 947         {
 948             len = 3;
 949             if ( out )
 950             {
 951                 if ( dstLen < len )
 952                     break;
 953
 954                 out[2] = 0x80 | (code & 0x3F);  code >>= 6;
 955                 out[1] = 0x80 | (code & 0x3F);  code >>= 6;
 956                 out[0] = 0xE0 | code;
 957             }
 958         }
 959         else if ( code <= 0x10FFFF )
 960         {
 961             len = 4;
 962             if ( out )
 963             {
 964                 if ( dstLen < len )
 965                     break;
 966
 967                 out[3] = 0x80 | (code & 0x3F);  code >>= 6;
 968                 out[2] = 0x80 | (code & 0x3F);  code >>= 6;
 969                 out[1] = 0x80 | (code & 0x3F);  code >>= 6;
 970                 out[0] = 0xF0 | code;
 971             }
 972         }
 973         else
 974         {
 975             wxFAIL_MSG( _T("trying to encode undefined Unicode character") );
 976             break;
 977         }
 978
 979         if ( out )
 980         {
 981             out += len;
 982             dstLen -= len;
 983         }
 984
 985         written += len;
 986     }
 987
 988     // we only get here if an error occurs during decoding
 989     return wxCONV_FAILED;
 990 }
 991
 992 size_t wxMBConvUTF8::ToWChar(wchar_t *buf, size_t n,
 993                              const char *psz, size_t srcLen) const
 994 {
 995     if ( m_options == MAP_INVALID_UTF8_NOT )
 996         return wxMBConvStrictUTF8::ToWChar(buf, n, psz, srcLen);
 997
 998     size_t len = 0;
 999
1000     while ((srcLen == wxNO_LEN ? *psz : srcLen--) && ((!buf) || (len < n)))
1001     {
1002         const char *opsz = psz;
1003         bool invalid = false;
1004         unsigned char cc = *psz++, fc = cc;
1005         unsigned cnt;
1006         for (cnt = 0; fc & 0x80; cnt++)
1007             fc <<= 1;
1008
1009         if (!cnt)
1010         {
1011             // plain ASCII char
1012             if (buf)
1013                 *buf++ = cc;
1014             len++;
1015
1016             // escape the escape character for octal escapes
1017             if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
1018                     && cc == '\\' && (!buf || len < n))
1019             {
1020                 if (buf)
1021                     *buf++ = cc;
1022                 len++;
1023             }
1024         }
1025         else
1026         {
1027             cnt--;
1028             if (!cnt)
1029             {
1030                 // invalid UTF-8 sequence
1031                 invalid = true;
1032             }
1033             else
1034             {
1035                 unsigned ocnt = cnt - 1;
1036                 wxUint32 res = cc & (0x3f >> cnt);
1037                 while (cnt--)
1038                 {
1039                     cc = *psz;
1040                     if ((cc & 0xC0) != 0x80)
1041                     {
1042                         // invalid UTF-8 sequence
1043                         invalid = true;
1044                         break;
1045                     }
1046
1047                     psz++;
1048                     res = (res << 6) | (cc & 0x3f);
1049                 }
1050
1051                 if (invalid || res <= utf8_max[ocnt])
1052                 {
1053                     // illegal UTF-8 encoding
1054                     invalid = true;
1055                 }
1056                 else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
1057                         res >= wxUnicodePUA && res < wxUnicodePUAEnd)
1058                 {
1059                     // if one of our PUA characters turns up externally
1060                     // it must also be treated as an illegal sequence
1061                     // (a bit like you have to escape an escape character)
1062                     invalid = true;
1063                 }
1064                 else
1065                 {
1066 #ifdef WC_UTF16
1067                     // cast is ok because wchar_t == wxUint16 if WC_UTF16
1068                     size_t pa = encode_utf16(res, (wxUint16 *)buf);
1069                     if (pa == wxCONV_FAILED)
1070                     {
1071                         invalid = true;
1072                     }
1073                     else
1074                     {
1075                         if (buf)
1076                             buf += pa;
1077                         len += pa;
1078                     }
1079 #else // !WC_UTF16
1080                     if (buf)
1081                         *buf++ = (wchar_t)res;
1082                     len++;
1083 #endif // WC_UTF16/!WC_UTF16
1084                 }
1085             }
1086
1087             if (invalid)
1088             {
1089                 if (m_options & MAP_INVALID_UTF8_TO_PUA)
1090                 {
1091                     while (opsz < psz && (!buf || len < n))
1092                     {
1093 #ifdef WC_UTF16
1094                         // cast is ok because wchar_t == wxUuint16 if WC_UTF16
1095                         size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
1096                         wxASSERT(pa != wxCONV_FAILED);
1097                         if (buf)
1098                             buf += pa;
1099                         opsz++;
1100                         len += pa;
1101 #else
1102                         if (buf)
1103                             *buf++ = (wchar_t)(wxUnicodePUA + (unsigned char)*opsz);
1104                         opsz++;
1105                         len++;
1106 #endif
1107                     }
1108                 }
1109                 else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
1110                 {
1111                     while (opsz < psz && (!buf || len < n))
1112                     {
1113                         if ( buf && len + 3 < n )
1114                         {
1115                             unsigned char on = *opsz;
1116                             *buf++ = L'\\';
1117                             *buf++ = (wchar_t)( L'0' + on / 0100 );
1118                             *buf++ = (wchar_t)( L'0' + (on % 0100) / 010 );
1119                             *buf++ = (wchar_t)( L'0' + on % 010 );
1120                         }
1121
1122                         opsz++;
1123                         len += 4;
1124                     }
1125                 }
1126                 else // MAP_INVALID_UTF8_NOT
1127                 {
1128                     return wxCONV_FAILED;
1129                 }
1130             }
1131         }
1132     }
1133
1134     if (srcLen == wxNO_LEN && buf && (len < n))
1135         *buf = 0;
1136
1137     return len + 1;
1138 }
1139
// Check whether the given wide character is one of the octal digits '0'..'7'.
static inline bool isoctal(wchar_t wch)
{
    if ( wch < L'0' )
        return false;

    return wch <= L'7';
}
1144
1145 size_t wxMBConvUTF8::FromWChar(char *buf, size_t n,
1146                                const wchar_t *psz, size_t srcLen) const
1147 {
1148     if ( m_options == MAP_INVALID_UTF8_NOT )
1149         return wxMBConvStrictUTF8::FromWChar(buf, n, psz, srcLen);
1150
1151     size_t len = 0;
1152
1153     while ((srcLen == wxNO_LEN ? *psz : srcLen--) && ((!buf) || (len < n)))
1154     {
1155         wxUint32 cc;
1156
1157 #ifdef WC_UTF16
1158         // cast is ok for WC_UTF16
1159         size_t pa = decode_utf16((const wxUint16 *)psz, cc);
1160         psz += (pa == wxCONV_FAILED) ? 1 : pa;
1161 #else
1162         cc = (*psz++) & 0x7fffffff;
1163 #endif
1164
1165         if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
1166                 && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
1167         {
1168             if (buf)
1169                 *buf++ = (char)(cc - wxUnicodePUA);
1170             len++;
1171         }
1172         else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL)
1173                     && cc == L'\\' && psz[0] == L'\\' )
1174         {
1175             if (buf)
1176                 *buf++ = (char)cc;
1177             psz++;
1178             len++;
1179         }
1180         else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
1181                     cc == L'\\' &&
1182                         isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
1183         {
1184             if (buf)
1185             {
1186                 *buf++ = (char) ((psz[0] - L'0') * 0100 +
1187                                  (psz[1] - L'0') * 010 +
1188                                  (psz[2] - L'0'));
1189             }
1190
1191             psz += 3;
1192             len++;
1193         }
1194         else
1195         {
1196             unsigned cnt;
1197             for (cnt = 0; cc > utf8_max[cnt]; cnt++)
1198             {
1199             }
1200
1201             if (!cnt)
1202             {
1203                 // plain ASCII char
1204                 if (buf)
1205                     *buf++ = (char) cc;
1206                 len++;
1207             }
1208             else
1209             {
1210                 len += cnt + 1;
1211                 if (buf)
1212                 {
1213                     *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
1214                     while (cnt--)
1215                         *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
1216                 }
1217             }
1218         }
1219     }
1220
1221     if (srcLen == wxNO_LEN && buf && (len < n))
1222         *buf = 0;
1223
1224     return len + 1;
1225 }
1226
1227 // ============================================================================
1228 // UTF-16
1229 // ============================================================================
1230
// Map the byte-order-neutral names used by the implementations below to the
// concrete converter classes: depending on WORDS_BIGENDIAN, "straight" names
// the class whose functions convert without endianness change and "swap" the
// one whose functions reverse the byte order.
1231 #ifdef WORDS_BIGENDIAN
1232     #define wxMBConvUTF16straight wxMBConvUTF16BE
1233     #define wxMBConvUTF16swap     wxMBConvUTF16LE
1234 #else
1235     #define wxMBConvUTF16swap     wxMBConvUTF16BE
1236     #define wxMBConvUTF16straight wxMBConvUTF16LE
1237 #endif
1238
1239 /* static */
1240 size_t wxMBConvUTF16Base::GetLength(const char *src, size_t srcLen)
1241 {
1242     if ( srcLen == wxNO_LEN )
1243     {
1244         // count the number of bytes in input, including the trailing NULs
1245         const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1246         for ( srcLen = 1; *inBuff++; srcLen++ )
1247             ;
1248
1249         srcLen *= BYTES_PER_CHAR;
1250     }
1251     else // we already have the length
1252     {
1253         // we can only convert an entire number of UTF-16 characters
1254         if ( srcLen % BYTES_PER_CHAR )
1255             return wxCONV_FAILED;
1256     }
1257
1258     return srcLen;
1259 }
1260
1261 // case when in-memory representation is UTF-16 too
1262 #ifdef WC_UTF16
1263
1264 // ----------------------------------------------------------------------------
1265 // conversions without endianness change
1266 // ----------------------------------------------------------------------------
1267
1268 size_t
1269 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1270                                const char *src, size_t srcLen) const
1271 {
1272     // set up the scene for using memcpy() (which is presumably more efficient
1273     // than copying the bytes one by one)
1274     srcLen = GetLength(src, srcLen);
1275     if ( srcLen == wxNO_LEN )
1276         return wxCONV_FAILED;
1277
1278     const size_t inLen = srcLen / BYTES_PER_CHAR;
1279     if ( dst )
1280     {
1281         if ( dstLen < inLen )
1282             return wxCONV_FAILED;
1283
1284         memcpy(dst, src, srcLen);
1285     }
1286
1287     return inLen;
1288 }
1289
1290 size_t
1291 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1292                                  const wchar_t *src, size_t srcLen) const
1293 {
1294     if ( srcLen == wxNO_LEN )
1295         srcLen = wxWcslen(src) + 1;
1296
1297     srcLen *= BYTES_PER_CHAR;
1298
1299     if ( dst )
1300     {
1301         if ( dstLen < srcLen )
1302             return wxCONV_FAILED;
1303
1304         memcpy(dst, src, srcLen);
1305     }
1306
1307     return srcLen;
1308 }
1309
1310 // ----------------------------------------------------------------------------
1311 // endian-reversing conversions
1312 // ----------------------------------------------------------------------------
1313
1314 size_t
1315 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1316                            const char *src, size_t srcLen) const
1317 {
1318     srcLen = GetLength(src, srcLen);
1319     if ( srcLen == wxNO_LEN )
1320         return wxCONV_FAILED;
1321
1322     srcLen /= BYTES_PER_CHAR;
1323
1324     if ( dst )
1325     {
1326         if ( dstLen < srcLen )
1327             return wxCONV_FAILED;
1328
1329         const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1330         for ( size_t n = 0; n < srcLen; n++, inBuff++ )
1331         {
1332             *dst++ = wxUINT16_SWAP_ALWAYS(*inBuff);
1333         }
1334     }
1335
1336     return srcLen;
1337 }
1338
1339 size_t
1340 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1341                              const wchar_t *src, size_t srcLen) const
1342 {
1343     if ( srcLen == wxNO_LEN )
1344         srcLen = wxWcslen(src) + 1;
1345
1346     srcLen *= BYTES_PER_CHAR;
1347
1348     if ( dst )
1349     {
1350         if ( dstLen < srcLen )
1351             return wxCONV_FAILED;
1352
1353         wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1354         for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
1355         {
1356             *outBuff++ = wxUINT16_SWAP_ALWAYS(*src);
1357         }
1358     }
1359
1360     return srcLen;
1361 }
1362
1363 #else // !WC_UTF16: wchar_t is UTF-32
1364
1365 // ----------------------------------------------------------------------------
1366 // conversions without endianness change
1367 // ----------------------------------------------------------------------------
1368
1369 size_t
1370 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1371                                const char *src, size_t srcLen) const
1372 {
1373     srcLen = GetLength(src, srcLen);
1374     if ( srcLen == wxNO_LEN )
1375         return wxCONV_FAILED;
1376
1377     const size_t inLen = srcLen / BYTES_PER_CHAR;
1378     if ( !dst )
1379     {
1380         // optimization: return maximal space which could be needed for this
1381         // string even if the real size could be smaller if the buffer contains
1382         // any surrogates
1383         return inLen;
1384     }
1385
1386     size_t outLen = 0;
1387     const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1388     for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1389     {
1390         const wxUint32 ch = wxDecodeSurrogate(&inBuff);
1391         if ( !inBuff )
1392             return wxCONV_FAILED;
1393
1394         if ( ++outLen > dstLen )
1395             return wxCONV_FAILED;
1396
1397         *dst++ = ch;
1398     }
1399
1400
1401     return outLen;
1402 }
1403
1404 size_t
1405 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1406                                  const wchar_t *src, size_t srcLen) const
1407 {
1408     if ( srcLen == wxNO_LEN )
1409         srcLen = wxWcslen(src) + 1;
1410
1411     size_t outLen = 0;
1412     wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1413     for ( size_t n = 0; n < srcLen; n++ )
1414     {
1415         wxUint16 cc[2];
1416         const size_t numChars = encode_utf16(*src++, cc);
1417         if ( numChars == wxCONV_FAILED )
1418             return wxCONV_FAILED;
1419
1420         outLen += numChars * BYTES_PER_CHAR;
1421         if ( outBuff )
1422         {
1423             if ( outLen > dstLen )
1424                 return wxCONV_FAILED;
1425
1426             *outBuff++ = cc[0];
1427             if ( numChars == 2 )
1428             {
1429                 // second character of a surrogate
1430                 *outBuff++ = cc[1];
1431             }
1432         }
1433     }
1434
1435     return outLen;
1436 }
1437
1438 // ----------------------------------------------------------------------------
1439 // endian-reversing conversions
1440 // ----------------------------------------------------------------------------
1441
1442 size_t
1443 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1444                            const char *src, size_t srcLen) const
1445 {
1446     srcLen = GetLength(src, srcLen);
1447     if ( srcLen == wxNO_LEN )
1448         return wxCONV_FAILED;
1449
1450     const size_t inLen = srcLen / BYTES_PER_CHAR;
1451     if ( !dst )
1452     {
1453         // optimization: return maximal space which could be needed for this
1454         // string even if the real size could be smaller if the buffer contains
1455         // any surrogates
1456         return inLen;
1457     }
1458
1459     size_t outLen = 0;
1460     const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1461     for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1462     {
1463         wxUint32 ch;
1464         wxUint16 tmp[2];
1465
1466         tmp[0] = wxUINT16_SWAP_ALWAYS(*inBuff);
1467         inBuff++;
1468         tmp[1] = wxUINT16_SWAP_ALWAYS(*inBuff);
1469
1470         const size_t numChars = decode_utf16(tmp, ch);
1471         if ( numChars == wxCONV_FAILED )
1472             return wxCONV_FAILED;
1473
1474         if ( numChars == 2 )
1475             inBuff++;
1476
1477         if ( ++outLen > dstLen )
1478             return wxCONV_FAILED;
1479
1480         *dst++ = ch;
1481     }
1482
1483
1484     return outLen;
1485 }
1486
1487 size_t
1488 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1489                              const wchar_t *src, size_t srcLen) const
1490 {
1491     if ( srcLen == wxNO_LEN )
1492         srcLen = wxWcslen(src) + 1;
1493
1494     size_t outLen = 0;
1495     wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1496     for ( const wchar_t *srcEnd = src + srcLen; src < srcEnd; src++ )
1497     {
1498         wxUint16 cc[2];
1499         const size_t numChars = encode_utf16(*src, cc);
1500         if ( numChars == wxCONV_FAILED )
1501             return wxCONV_FAILED;
1502
1503         outLen += numChars * BYTES_PER_CHAR;
1504         if ( outBuff )
1505         {
1506             if ( outLen > dstLen )
1507                 return wxCONV_FAILED;
1508
1509             *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[0]);
1510             if ( numChars == 2 )
1511             {
1512                 // second character of a surrogate
1513                 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[1]);
1514             }
1515         }
1516     }
1517
1518     return outLen;
1519 }
1520
1521 #endif // WC_UTF16/!WC_UTF16
1522
1523
1524 // ============================================================================
1525 // UTF-32
1526 // ============================================================================
1527
// Map the byte-order-neutral names used by the implementations below to the
// concrete converter classes: depending on WORDS_BIGENDIAN, "straight" names
// the class whose functions convert without endianness change and "swap" the
// one whose functions reverse the byte order.
1528 #ifdef WORDS_BIGENDIAN
1529     #define wxMBConvUTF32straight  wxMBConvUTF32BE
1530     #define wxMBConvUTF32swap      wxMBConvUTF32LE
1531 #else
1532     #define wxMBConvUTF32swap      wxMBConvUTF32BE
1533     #define wxMBConvUTF32straight  wxMBConvUTF32LE
1534 #endif
1535
1536
// definitions of the global UTF-32 converter objects, one per byte order
1537 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
1538 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
1539
1540 /* static */
1541 size_t wxMBConvUTF32Base::GetLength(const char *src, size_t srcLen)
1542 {
1543     if ( srcLen == wxNO_LEN )
1544     {
1545         // count the number of bytes in input, including the trailing NULs
1546         const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1547         for ( srcLen = 1; *inBuff++; srcLen++ )
1548             ;
1549
1550         srcLen *= BYTES_PER_CHAR;
1551     }
1552     else // we already have the length
1553     {
1554         // we can only convert an entire number of UTF-32 characters
1555         if ( srcLen % BYTES_PER_CHAR )
1556             return wxCONV_FAILED;
1557     }
1558
1559     return srcLen;
1560 }
1561
1562 // case when in-memory representation is UTF-16
1563 #ifdef WC_UTF16
1564
1565 // ----------------------------------------------------------------------------
1566 // conversions without endianness change
1567 // ----------------------------------------------------------------------------
1568
1569 size_t
1570 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1571                                const char *src, size_t srcLen) const
1572 {
1573     srcLen = GetLength(src, srcLen);
1574     if ( srcLen == wxNO_LEN )
1575         return wxCONV_FAILED;
1576
1577     const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1578     const size_t inLen = srcLen / BYTES_PER_CHAR;
1579     size_t outLen = 0;
1580     for ( size_t n = 0; n < inLen; n++ )
1581     {
1582         wxUint16 cc[2];
1583         const size_t numChars = encode_utf16(*inBuff++, cc);
1584         if ( numChars == wxCONV_FAILED )
1585             return wxCONV_FAILED;
1586
1587         outLen += numChars;
1588         if ( dst )
1589         {
1590             if ( outLen > dstLen )
1591                 return wxCONV_FAILED;
1592
1593             *dst++ = cc[0];
1594             if ( numChars == 2 )
1595             {
1596                 // second character of a surrogate
1597                 *dst++ = cc[1];
1598             }
1599         }
1600     }
1601
1602     return outLen;
1603 }
1604
1605 size_t
1606 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1607                                  const wchar_t *src, size_t srcLen) const
1608 {
1609     if ( srcLen == wxNO_LEN )
1610         srcLen = wxWcslen(src) + 1;
1611
1612     if ( !dst )
1613     {
1614         // optimization: return maximal space which could be needed for this
1615         // string instead of the exact amount which could be less if there are
1616         // any surrogates in the input
1617         //
1618         // we consider that surrogates are rare enough to make it worthwhile to
1619         // avoid running the loop below at the cost of slightly extra memory
1620         // consumption
1621         return srcLen * BYTES_PER_CHAR;
1622     }
1623
1624     wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1625     size_t outLen = 0;
1626     for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1627     {
1628         const wxUint32 ch = wxDecodeSurrogate(&src);
1629         if ( !src )
1630             return wxCONV_FAILED;
1631
1632         outLen += BYTES_PER_CHAR;
1633
1634         if ( outLen > dstLen )
1635             return wxCONV_FAILED;
1636
1637         *outBuff++ = ch;
1638     }
1639
1640     return outLen;
1641 }
1642
1643 // ----------------------------------------------------------------------------
1644 // endian-reversing conversions
1645 // ----------------------------------------------------------------------------
1646
1647 size_t
1648 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1649                            const char *src, size_t srcLen) const
1650 {
1651     srcLen = GetLength(src, srcLen);
1652     if ( srcLen == wxNO_LEN )
1653         return wxCONV_FAILED;
1654
1655     const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1656     const size_t inLen = srcLen / BYTES_PER_CHAR;
1657     size_t outLen = 0;
1658     for ( size_t n = 0; n < inLen; n++, inBuff++ )
1659     {
1660         wxUint16 cc[2];
1661         const size_t numChars = encode_utf16(wxUINT32_SWAP_ALWAYS(*inBuff), cc);
1662         if ( numChars == wxCONV_FAILED )
1663             return wxCONV_FAILED;
1664
1665         outLen += numChars;
1666         if ( dst )
1667         {
1668             if ( outLen > dstLen )
1669                 return wxCONV_FAILED;
1670
1671             *dst++ = cc[0];
1672             if ( numChars == 2 )
1673             {
1674                 // second character of a surrogate
1675                 *dst++ = cc[1];
1676             }
1677         }
1678     }
1679
1680     return outLen;
1681 }
1682
1683 size_t
1684 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1685                              const wchar_t *src, size_t srcLen) const
1686 {
1687     if ( srcLen == wxNO_LEN )
1688         srcLen = wxWcslen(src) + 1;
1689
1690     if ( !dst )
1691     {
1692         // optimization: return maximal space which could be needed for this
1693         // string instead of the exact amount which could be less if there are
1694         // any surrogates in the input
1695         //
1696         // we consider that surrogates are rare enough to make it worthwhile to
1697         // avoid running the loop below at the cost of slightly extra memory
1698         // consumption
1699         return srcLen*BYTES_PER_CHAR;
1700     }
1701
1702     wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1703     size_t outLen = 0;
1704     for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1705     {
1706         const wxUint32 ch = wxDecodeSurrogate(&src);
1707         if ( !src )
1708             return wxCONV_FAILED;
1709
1710         outLen += BYTES_PER_CHAR;
1711
1712         if ( outLen > dstLen )
1713             return wxCONV_FAILED;
1714
1715         *outBuff++ = wxUINT32_SWAP_ALWAYS(ch);
1716     }
1717
1718     return outLen;
1719 }
1720
1721 #else // !WC_UTF16: wchar_t is UTF-32
1722
1723 // ----------------------------------------------------------------------------
1724 // conversions without endianness change
1725 // ----------------------------------------------------------------------------
1726
1727 size_t
1728 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1729                                const char *src, size_t srcLen) const
1730 {
1731     // use memcpy() as it should be much faster than hand-written loop
1732     srcLen = GetLength(src, srcLen);
1733     if ( srcLen == wxNO_LEN )
1734         return wxCONV_FAILED;
1735
1736     const size_t inLen = srcLen/BYTES_PER_CHAR;
1737     if ( dst )
1738     {
1739         if ( dstLen < inLen )
1740             return wxCONV_FAILED;
1741
1742         memcpy(dst, src, srcLen);
1743     }
1744
1745     return inLen;
1746 }
1747
1748 size_t
1749 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1750                                  const wchar_t *src, size_t srcLen) const
1751 {
1752     if ( srcLen == wxNO_LEN )
1753         srcLen = wxWcslen(src) + 1;
1754
1755     srcLen *= BYTES_PER_CHAR;
1756
1757     if ( dst )
1758     {
1759         if ( dstLen < srcLen )
1760             return wxCONV_FAILED;
1761
1762         memcpy(dst, src, srcLen);
1763     }
1764
1765     return srcLen;
1766 }
1767
1768 // ----------------------------------------------------------------------------
1769 // endian-reversing conversions
1770 // ----------------------------------------------------------------------------
1771
1772 size_t
1773 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1774                            const char *src, size_t srcLen) const
1775 {
1776     srcLen = GetLength(src, srcLen);
1777     if ( srcLen == wxNO_LEN )
1778         return wxCONV_FAILED;
1779
1780     srcLen /= BYTES_PER_CHAR;
1781
1782     if ( dst )
1783     {
1784         if ( dstLen < srcLen )
1785             return wxCONV_FAILED;
1786
1787         const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1788         for ( size_t n = 0; n < srcLen; n++, inBuff++ )
1789         {
1790             *dst++ = wxUINT32_SWAP_ALWAYS(*inBuff);
1791         }
1792     }
1793
1794     return srcLen;
1795 }
1796
1797 size_t
1798 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1799                              const wchar_t *src, size_t srcLen) const
1800 {
1801     if ( srcLen == wxNO_LEN )
1802         srcLen = wxWcslen(src) + 1;
1803
1804     srcLen *= BYTES_PER_CHAR;
1805
1806     if ( dst )
1807     {
1808         if ( dstLen < srcLen )
1809             return wxCONV_FAILED;
1810
1811         wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1812         for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
1813         {
1814             *outBuff++ = wxUINT32_SWAP_ALWAYS(*src);
1815         }
1816     }
1817
1818     return srcLen;
1819 }
1820
1821 #endif // WC_UTF16/!WC_UTF16
1822
1823
1824 // ============================================================================
1825 // The classes doing conversion using the iconv_xxx() functions
1826 // ============================================================================
1827
1828 #ifdef HAVE_ICONV
1829
1830 // VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
1831 //     E2BIG if output buffer is _exactly_ as big as needed. Such case is
1832 //     (unless there's yet another bug in glibc) the only case when iconv()
1833 //     returns with (size_t)-1 (which means error) and says there are 0 bytes
1834 //     left in the input buffer -- when _real_ error occurs,
1835 //     bytes-left-in-input buffer is non-zero. Hence, this alternative test for
1836 //     iconv() failure.
1837 //     [This bug does not appear in glibc 2.2.]
// ICONV_FAILED(cres, bufLeft): test whether a conversion failed, given the
// value cres and the number bufLeft of bytes left in the input buffer; the
// first variant additionally ignores the E2BIG error with nothing left
1838 #if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
1839 #define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
1840                                      (errno != E2BIG || bufLeft != 0))
1841 #else
1842 #define ICONV_FAILED(cres, bufLeft)  (cres == (size_t)-1)
1843 #endif
1844
// cast the argument to the "ICONV_CONST char **" type
// NOTE(review): presumably the type of the input buffer parameter of iconv(),
// ICONV_CONST is defined elsewhere -- confirm
1845 #define ICONV_CHAR_CAST(x)  ((ICONV_CONST char **)(x))
1846
// the value the conversion descriptors are compared with to detect that
// iconv_open() failed
1847 #define ICONV_T_INVALID ((iconv_t)-1)
1848
// select the byte swapping macro and the font encoding matching the size of
// wchar_t: WC_ENC is used for looking up the names of the wide char charset
// NOTE(review): WC_BSWAP is not used in the visible part of the file,
// presumably applied when the wide char charset byte order is not the native
// one -- confirm
1849 #if SIZEOF_WCHAR_T == 4
1850     #define WC_BSWAP    wxUINT32_SWAP_ALWAYS
1851     #define WC_ENC      wxFONTENCODING_UTF32
1852 #elif SIZEOF_WCHAR_T == 2
1853     #define WC_BSWAP    wxUINT16_SWAP_ALWAYS
1854     #define WC_ENC      wxFONTENCODING_UTF16
1855 #else // sizeof(wchar_t) != 2 nor 4
1856     // does this ever happen?
1857     #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
1858 #endif
1859
1860 // ----------------------------------------------------------------------------
1861 // wxMBConv_iconv: encapsulates an iconv character set
1862 // ----------------------------------------------------------------------------
1863
1864 class wxMBConv_iconv : public wxMBConv
1865 {
1866 public:
1867     wxMBConv_iconv(const char *name);
1868     virtual ~wxMBConv_iconv();
1869
1870     virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const;
1871     virtual size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const;
1872
1873     // classify this encoding as explained in wxMBConv::GetMBNulLen() comment
1874     virtual size_t GetMBNulLen() const;
1875
1876 #if wxUSE_UNICODE_UTF8
1877     virtual bool IsUTF8() const;
1878 #endif
1879
1880     virtual wxMBConv *Clone() const
1881     {
1882         wxMBConv_iconv *p = new wxMBConv_iconv(m_name.ToAscii());
1883         p->m_minMBCharWidth = m_minMBCharWidth;
1884         return p;
1885     }
1886
1887     bool IsOk() const
1888         { return (m2w != ICONV_T_INVALID) && (w2m != ICONV_T_INVALID); }
1889
1890 protected:
1891     // the iconv handlers used to translate from multibyte
1892     // to wide char and in the other direction
1893     iconv_t m2w,
1894             w2m;
1895
1896 #if wxUSE_THREADS
1897     // guards access to m2w and w2m objects
1898     wxMutex m_iconvMutex;
1899 #endif
1900
1901 private:
1902     // the name (for iconv_open()) of a wide char charset -- if none is
1903     // available on this machine, it will remain NULL
1904     static wxString ms_wcCharsetName;
1905
1906     // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
1907     // different endian-ness than the native one
1908     static bool ms_wcNeedsSwap;
1909
1910
1911     // name of the encoding handled by this conversion
1912     wxString m_name;
1913
1914     // cached result of GetMBNulLen(); set to 0 meaning "unknown"
1915     // initially
1916     size_t m_minMBCharWidth;
1917 };
1918
1919 // make the constructor available for unit testing
1920 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_iconv( const char* name )
1921 {
1922     wxMBConv_iconv* result = new wxMBConv_iconv( name );
1923     if ( !result->IsOk() )
1924     {
1925         delete result;
1926         return 0;
1927     }
1928
1929     return result;
1930 }
1931
1932 wxString wxMBConv_iconv::ms_wcCharsetName;
1933 bool wxMBConv_iconv::ms_wcNeedsSwap = false;
1934
1935 wxMBConv_iconv::wxMBConv_iconv(const char *name)
1936               : m_name(name)
1937 {
1938     m_minMBCharWidth = 0;
1939
1940     // check for charset that represents wchar_t:
1941     if ( ms_wcCharsetName.empty() )
1942     {
1943         wxLogTrace(TRACE_STRCONV, _T("Looking for wide char codeset:"));
1944
1945 #if wxUSE_FONTMAP
1946         const wxChar **names = wxFontMapperBase::GetAllEncodingNames(WC_ENC);
1947 #else // !wxUSE_FONTMAP
1948         static const wxChar *names_static[] =
1949         {
1950 #if SIZEOF_WCHAR_T == 4
1951             _T("UCS-4"),
1952 #elif SIZEOF_WCHAR_T = 2
1953             _T("UCS-2"),
1954 #endif
1955             NULL
1956         };
1957         const wxChar **names = names_static;
1958 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
1959
1960         for ( ; *names && ms_wcCharsetName.empty(); ++names )
1961         {
1962             const wxString nameCS(*names);
1963
1964             // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
1965             wxString nameXE(nameCS);
1966
1967 #ifdef WORDS_BIGENDIAN
1968                 nameXE += _T("BE");
1969 #else // little endian
1970                 nameXE += _T("LE");
1971 #endif
1972
1973             wxLogTrace(TRACE_STRCONV, _T("  trying charset \"%s\""),
1974                        nameXE.c_str());
1975
1976             m2w = iconv_open(nameXE.ToAscii(), name);
1977             if ( m2w == ICONV_T_INVALID )
1978             {
1979                 // try charset w/o bytesex info (e.g. "UCS4")
1980                 wxLogTrace(TRACE_STRCONV, _T("  trying charset \"%s\""),
1981                            nameCS.c_str());
1982                 m2w = iconv_open(nameCS.ToAscii(), name);
1983
1984                 // and check for bytesex ourselves:
1985                 if ( m2w != ICONV_T_INVALID )
1986                 {
1987                     char    buf[2], *bufPtr;
1988                     wchar_t wbuf[2], *wbufPtr;
1989                     size_t  insz, outsz;
1990                     size_t  res;
1991
1992                     buf[0] = 'A';
1993                     buf[1] = 0;
1994                     wbuf[0] = 0;
1995                     insz = 2;
1996                     outsz = SIZEOF_WCHAR_T * 2;
1997                     wbufPtr = wbuf;
1998                     bufPtr = buf;
1999
2000                     res = iconv(
2001                         m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
2002                         (char**)&wbufPtr, &outsz);
2003
2004                     if (ICONV_FAILED(res, insz))
2005                     {
2006                         wxLogLastError(wxT("iconv"));
2007                         wxLogError(_("Conversion to charset '%s' doesn't work."),
2008                                    nameCS.c_str());
2009                     }
2010                     else // ok, can convert to this encoding, remember it
2011                     {
2012                         ms_wcCharsetName = nameCS;
2013                         ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
2014                     }
2015                 }
2016             }
2017             else // use charset not requiring byte swapping
2018             {
2019                 ms_wcCharsetName = nameXE;
2020             }
2021         }
2022
2023         wxLogTrace(TRACE_STRCONV,
2024                    wxT("iconv wchar_t charset is \"%s\"%s"),
2025                    ms_wcCharsetName.empty() ? wxString("<none>")
2026                                             : ms_wcCharsetName,
2027                    ms_wcNeedsSwap ? _T(" (needs swap)")
2028                                   : _T(""));
2029     }
2030     else // we already have ms_wcCharsetName
2031     {
2032         m2w = iconv_open(ms_wcCharsetName.ToAscii(), name);
2033     }
2034
2035     if ( ms_wcCharsetName.empty() )
2036     {
2037         w2m = ICONV_T_INVALID;
2038     }
2039     else
2040     {
2041         w2m = iconv_open(name, ms_wcCharsetName.ToAscii());
2042         if ( w2m == ICONV_T_INVALID )
2043         {
2044             wxLogTrace(TRACE_STRCONV,
2045                        wxT("\"%s\" -> \"%s\" works but not the converse!?"),
2046                        ms_wcCharsetName.c_str(), name);
2047         }
2048     }
2049 }
2050
2051 wxMBConv_iconv::~wxMBConv_iconv()
2052 {
2053     if ( m2w != ICONV_T_INVALID )
2054         iconv_close(m2w);
2055     if ( w2m != ICONV_T_INVALID )
2056         iconv_close(w2m);
2057 }
2058
2059 size_t wxMBConv_iconv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
2060 {
2061     // find the string length: notice that must be done differently for
2062     // NUL-terminated strings and UTF-16/32 which are terminated with 2/4 NULs
2063     size_t inbuf;
2064     const size_t nulLen = GetMBNulLen();
2065     switch ( nulLen )
2066     {
2067         default:
2068             return wxCONV_FAILED;
2069
2070         case 1:
2071             inbuf = strlen(psz); // arguably more optimized than our version
2072             break;
2073
2074         case 2:
2075         case 4:
2076             // for UTF-16/32 not only we need to have 2/4 consecutive NULs but
2077             // they also have to start at character boundary and not span two
2078             // adjacent characters
2079             const char *p;
2080             for ( p = psz; NotAllNULs(p, nulLen); p += nulLen )
2081                 ;
2082             inbuf = p - psz;
2083             break;
2084     }
2085
2086 #if wxUSE_THREADS
2087     // NB: iconv() is MT-safe, but each thread must use its own iconv_t handle.
2088     //     Unfortunately there are a couple of global wxCSConv objects such as
2089     //     wxConvLocal that are used all over wx code, so we have to make sure
2090     //     the handle is used by at most one thread at the time. Otherwise
2091     //     only a few wx classes would be safe to use from non-main threads
2092     //     as MB<->WC conversion would fail "randomly".
2093     wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
2094 #endif // wxUSE_THREADS
2095
2096     size_t outbuf = n * SIZEOF_WCHAR_T;
2097     size_t res, cres;
2098     // VS: Use these instead of psz, buf because iconv() modifies its arguments:
2099     wchar_t *bufPtr = buf;
2100     const char *pszPtr = psz;
2101
2102     if (buf)
2103     {
2104         // have destination buffer, convert there
2105         cres = iconv(m2w,
2106                      ICONV_CHAR_CAST(&pszPtr), &inbuf,
2107                      (char**)&bufPtr, &outbuf);
2108         res = n - (outbuf / SIZEOF_WCHAR_T);
2109
2110         if (ms_wcNeedsSwap)
2111         {
2112             // convert to native endianness
2113             for ( unsigned i = 0; i < res; i++ )
2114                 buf[n] = WC_BSWAP(buf[i]);
2115         }
2116
2117         // NUL-terminate the string if there is any space left
2118         if (res < n)
2119             buf[res] = 0;
2120     }
2121     else
2122     {
2123         // no destination buffer... convert using temp buffer
2124         // to calculate destination buffer requirement
2125         wchar_t tbuf[8];
2126         res = 0;
2127
2128         do
2129         {
2130             bufPtr = tbuf;
2131             outbuf = 8 * SIZEOF_WCHAR_T;
2132
2133             cres = iconv(m2w,
2134                          ICONV_CHAR_CAST(&pszPtr), &inbuf,
2135                          (char**)&bufPtr, &outbuf );
2136
2137             res += 8 - (outbuf / SIZEOF_WCHAR_T);
2138         }
2139         while ((cres == (size_t)-1) && (errno == E2BIG));
2140     }
2141
2142     if (ICONV_FAILED(cres, inbuf))
2143     {
2144         //VS: it is ok if iconv fails, hence trace only
2145         wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
2146         return wxCONV_FAILED;
2147     }
2148
2149     return res;
2150 }
2151
2152 size_t wxMBConv_iconv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
2153 {
2154 #if wxUSE_THREADS
2155     // NB: explained in MB2WC
2156     wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
2157 #endif
2158
2159     size_t inlen = wxWcslen(psz);
2160     size_t inbuf = inlen * SIZEOF_WCHAR_T;
2161     size_t outbuf = n;
2162     size_t res, cres;
2163
2164     wchar_t *tmpbuf = 0;
2165
2166     if (ms_wcNeedsSwap)
2167     {
2168         // need to copy to temp buffer to switch endianness
2169         // (doing WC_BSWAP twice on the original buffer won't help, as it
2170         //  could be in read-only memory, or be accessed in some other thread)
2171         tmpbuf = (wchar_t *)malloc(inbuf + SIZEOF_WCHAR_T);
2172         for ( size_t i = 0; i < inlen; i++ )
2173             tmpbuf[n] = WC_BSWAP(psz[i]);
2174
2175         tmpbuf[inlen] = L'\0';
2176         psz = tmpbuf;
2177     }
2178
2179     if (buf)
2180     {
2181         // have destination buffer, convert there
2182         cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
2183
2184         res = n - outbuf;
2185
2186         // NB: iconv was given only wcslen(psz) characters on input, and so
2187         //     it couldn't convert the trailing zero. Let's do it ourselves
2188         //     if there's some room left for it in the output buffer.
2189         if (res < n)
2190             buf[0] = 0;
2191     }
2192     else
2193     {
2194         // no destination buffer: convert using temp buffer
2195         // to calculate destination buffer requirement
2196         char tbuf[16];
2197         res = 0;
2198         do
2199         {
2200             buf = tbuf;
2201             outbuf = 16;
2202
2203             cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
2204
2205             res += 16 - outbuf;
2206         }
2207         while ((cres == (size_t)-1) && (errno == E2BIG));
2208     }
2209
2210     if (ms_wcNeedsSwap)
2211     {
2212         free(tmpbuf);
2213     }
2214
2215     if (ICONV_FAILED(cres, inbuf))
2216     {
2217         wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
2218         return wxCONV_FAILED;
2219     }
2220
2221     return res;
2222 }
2223
2224 size_t wxMBConv_iconv::GetMBNulLen() const
2225 {
2226     if ( m_minMBCharWidth == 0 )
2227     {
2228         wxMBConv_iconv * const self = wxConstCast(this, wxMBConv_iconv);
2229
2230 #if wxUSE_THREADS
2231         // NB: explained in MB2WC
2232         wxMutexLocker lock(self->m_iconvMutex);
2233 #endif
2234
2235         const wchar_t *wnul = L"";
2236         char buf[8]; // should be enough for NUL in any encoding
2237         size_t inLen = sizeof(wchar_t),
2238                outLen = WXSIZEOF(buf);
2239         char *inBuff = (char *)wnul;
2240         char *outBuff = buf;
2241         if ( iconv(w2m, ICONV_CHAR_CAST(&inBuff), &inLen, &outBuff, &outLen) == (size_t)-1 )
2242         {
2243             self->m_minMBCharWidth = (size_t)-1;
2244         }
2245         else // ok
2246         {
2247             self->m_minMBCharWidth = outBuff - buf;
2248         }
2249     }
2250
2251     return m_minMBCharWidth;
2252 }
2253
2254 #if wxUSE_UNICODE_UTF8
2255 bool wxMBConv_iconv::IsUTF8() const
2256 {
2257     return wxStricmp(m_name, "UTF-8") == 0 ||
2258            wxStricmp(m_name, "UTF8") == 0;
2259 }
2260 #endif
2261
2262 #endif // HAVE_ICONV
2263
2264
2265 // ============================================================================
2266 // Win32 conversion classes
2267 // ============================================================================
2268
2269 #ifdef wxHAVE_WIN32_MB2WC
2270
2271 // from utils.cpp
2272 #if wxUSE_FONTMAP
2273 extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const char *charset);
2274 extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
2275 #endif
2276
2277 class wxMBConv_win32 : public wxMBConv
2278 {
2279 public:
2280     wxMBConv_win32()
2281     {
2282         m_CodePage = CP_ACP;
2283         m_minMBCharWidth = 0;
2284     }
2285
2286     wxMBConv_win32(const wxMBConv_win32& conv)
2287         : wxMBConv()
2288     {
2289         m_CodePage = conv.m_CodePage;
2290         m_minMBCharWidth = conv.m_minMBCharWidth;
2291     }
2292
2293 #if wxUSE_FONTMAP
2294     wxMBConv_win32(const char* name)
2295     {
2296         m_CodePage = wxCharsetToCodepage(name);
2297         m_minMBCharWidth = 0;
2298     }
2299
2300     wxMBConv_win32(wxFontEncoding encoding)
2301     {
2302         m_CodePage = wxEncodingToCodepage(encoding);
2303         m_minMBCharWidth = 0;
2304     }
2305 #endif // wxUSE_FONTMAP
2306
2307     virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2308     {
2309         // note that we have to use MB_ERR_INVALID_CHARS flag as it without it
2310         // the behaviour is not compatible with the Unix version (using iconv)
2311         // and break the library itself, e.g. wxTextInputStream::NextChar()
2312         // wouldn't work if reading an incomplete MB char didn't result in an
2313         // error
2314         //
2315         // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
2316         // Win XP or newer and it is not supported for UTF-[78] so we always
2317         // use our own conversions in this case. See
2318         //     http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
2319         //     http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
2320         if ( m_CodePage == CP_UTF8 )
2321         {
2322             return wxMBConvUTF8().MB2WC(buf, psz, n);
2323         }
2324
2325         if ( m_CodePage == CP_UTF7 )
2326         {
2327             return wxMBConvUTF7().MB2WC(buf, psz, n);
2328         }
2329
2330         int flags = 0;
2331         if ( (m_CodePage < 50000 && m_CodePage != CP_SYMBOL) &&
2332                 IsAtLeastWin2kSP4() )
2333         {
2334             flags = MB_ERR_INVALID_CHARS;
2335         }
2336
2337         const size_t len = ::MultiByteToWideChar
2338                              (
2339                                 m_CodePage,     // code page
2340                                 flags,          // flags: fall on error
2341                                 psz,            // input string
2342                                 -1,             // its length (NUL-terminated)
2343                                 buf,            // output string
2344                                 buf ? n : 0     // size of output buffer
2345                              );
2346         if ( !len )
2347         {
2348             // function totally failed
2349             return wxCONV_FAILED;
2350         }
2351
2352         // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
2353         // check if we succeeded, by doing a double trip:
2354         if ( !flags && buf )
2355         {
2356             const size_t mbLen = strlen(psz);
2357             wxCharBuffer mbBuf(mbLen);
2358             if ( ::WideCharToMultiByte
2359                    (
2360                       m_CodePage,
2361                       0,
2362                       buf,
2363                       -1,
2364                       mbBuf.data(),
2365                       mbLen + 1,        // size in bytes, not length
2366                       NULL,
2367                       NULL
2368                    ) == 0 ||
2369                   strcmp(mbBuf, psz) != 0 )
2370             {
2371                 // we didn't obtain the same thing we started from, hence
2372                 // the conversion was lossy and we consider that it failed
2373                 return wxCONV_FAILED;
2374             }
2375         }
2376
2377         // note that it returns count of written chars for buf != NULL and size
2378         // of the needed buffer for buf == NULL so in either case the length of
2379         // the string (which never includes the terminating NUL) is one less
2380         return len - 1;
2381     }
2382
2383     virtual size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
2384     {
2385         /*
2386             we have a problem here: by default, WideCharToMultiByte() may
2387             replace characters unrepresentable in the target code page with bad
2388             quality approximations such as turning "1/2" symbol (U+00BD) into
2389             "1" for the code pages which don't have it and we, obviously, want
2390             to avoid this at any price
2391
2392             the trouble is that this function does it _silently_, i.e. it won't
2393             even tell us whether it did or not... Win98/2000 and higher provide
2394             WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
2395             we have to resort to a round trip, i.e. check that converting back
2396             results in the same string -- this is, of course, expensive but
2397             otherwise we simply can't be sure to not garble the data.
2398          */
2399
2400         // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
2401         // it doesn't work with CJK encodings (which we test for rather roughly
2402         // here...) nor with UTF-7/8 nor, of course, with Windows versions not
2403         // supporting it
2404         BOOL usedDef wxDUMMY_INITIALIZE(false);
2405         BOOL *pUsedDef;
2406         int flags;
2407         if ( CanUseNoBestFit() && m_CodePage < 50000 )
2408         {
2409             // it's our lucky day
2410             flags = WC_NO_BEST_FIT_CHARS;
2411             pUsedDef = &usedDef;
2412         }
2413         else // old system or unsupported encoding
2414         {
2415             flags = 0;
2416             pUsedDef = NULL;
2417         }
2418
2419         const size_t len = ::WideCharToMultiByte
2420                              (
2421                                 m_CodePage,     // code page
2422                                 flags,          // either none or no best fit
2423                                 pwz,            // input string
2424                                 -1,             // it is (wide) NUL-terminated
2425                                 buf,            // output buffer
2426                                 buf ? n : 0,    // and its size
2427                                 NULL,           // default "replacement" char
2428                                 pUsedDef        // [out] was it used?
2429                              );
2430
2431         if ( !len )
2432         {
2433             // function totally failed
2434             return wxCONV_FAILED;
2435         }
2436
2437         // we did something, check if we really succeeded
2438         if ( flags )
2439         {
2440             // check if the conversion failed, i.e. if any replacements
2441             // were done
2442             if ( usedDef )
2443                 return wxCONV_FAILED;
2444         }
2445         else // we must resort to double tripping...
2446         {
2447             // first we need to ensure that we really have the MB data: this is
2448             // not the case if we're called with NULL buffer, in which case we
2449             // need to do the conversion yet again
2450             wxCharBuffer bufDef;
2451             if ( !buf )
2452             {
2453                 bufDef = wxCharBuffer(len);
2454                 buf = bufDef.data();
2455                 if ( !::WideCharToMultiByte(m_CodePage, flags, pwz, -1,
2456                                             buf, len, NULL, NULL) )
2457                     return wxCONV_FAILED;
2458             }
2459
2460             if ( !n )
2461                 n = wcslen(pwz);
2462             wxWCharBuffer wcBuf(n);
2463             if ( MB2WC(wcBuf.data(), buf, n + 1) == wxCONV_FAILED ||
2464                     wcscmp(wcBuf, pwz) != 0 )
2465             {
2466                 // we didn't obtain the same thing we started from, hence
2467                 // the conversion was lossy and we consider that it failed
2468                 return wxCONV_FAILED;
2469             }
2470         }
2471
2472         // see the comment above for the reason of "len - 1"
2473         return len - 1;
2474     }
2475
2476     virtual size_t GetMBNulLen() const
2477     {
2478         if ( m_minMBCharWidth == 0 )
2479         {
2480             int len = ::WideCharToMultiByte
2481                         (
2482                             m_CodePage,     // code page
2483                             0,              // no flags
2484                             L"",            // input string
2485                             1,              // translate just the NUL
2486                             NULL,           // output buffer
2487                             0,              // and its size
2488                             NULL,           // no replacement char
2489                             NULL            // [out] don't care if it was used
2490                         );
2491
2492             wxMBConv_win32 * const self = wxConstCast(this, wxMBConv_win32);
2493             switch ( len )
2494             {
2495                 default:
2496                     wxLogDebug(_T("Unexpected NUL length %d"), len);
2497                     self->m_minMBCharWidth = (size_t)-1;
2498                     break;
2499
2500                 case 0:
2501                     self->m_minMBCharWidth = (size_t)-1;
2502                     break;
2503
2504                 case 1:
2505                 case 2:
2506                 case 4:
2507                     self->m_minMBCharWidth = len;
2508                     break;
2509             }
2510         }
2511
2512         return m_minMBCharWidth;
2513     }
2514
2515     virtual wxMBConv *Clone() const { return new wxMBConv_win32(*this); }
2516
2517     bool IsOk() const { return m_CodePage != -1; }
2518
2519 private:
2520     static bool CanUseNoBestFit()
2521     {
2522         static int s_isWin98Or2k = -1;
2523
2524         if ( s_isWin98Or2k == -1 )
2525         {
2526             int verMaj, verMin;
2527             switch ( wxGetOsVersion(&verMaj, &verMin) )
2528             {
2529                 case wxOS_WINDOWS_9X:
2530                     s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
2531                     break;
2532
2533                 case wxOS_WINDOWS_NT:
2534                     s_isWin98Or2k = verMaj >= 5;
2535                     break;
2536
2537                 default:
2538                     // unknown: be conservative by default
2539                     s_isWin98Or2k = 0;
2540                     break;
2541             }
2542
2543             wxASSERT_MSG( s_isWin98Or2k != -1, _T("should be set above") );
2544         }
2545
2546         return s_isWin98Or2k == 1;
2547     }
2548
2549     static bool IsAtLeastWin2kSP4()
2550     {
2551 #ifdef __WXWINCE__
2552         return false;
2553 #else
2554         static int s_isAtLeastWin2kSP4 = -1;
2555
2556         if ( s_isAtLeastWin2kSP4 == -1 )
2557         {
2558             OSVERSIONINFOEX ver;
2559
2560             memset(&ver, 0, sizeof(ver));
2561             ver.dwOSVersionInfoSize = sizeof(ver);
2562             GetVersionEx((OSVERSIONINFO*)&ver);
2563
2564             s_isAtLeastWin2kSP4 =
2565               ((ver.dwMajorVersion > 5) || // Vista+
2566                (ver.dwMajorVersion == 5 && ver.dwMinorVersion > 0) || // XP/2003
2567                (ver.dwMajorVersion == 5 && ver.dwMinorVersion == 0 &&
2568                ver.wServicePackMajor >= 4)) // 2000 SP4+
2569               ? 1 : 0;
2570         }
2571
2572         return s_isAtLeastWin2kSP4 == 1;
2573 #endif
2574     }
2575
2576
2577     // the code page we're working with
2578     long m_CodePage;
2579
2580     // cached result of GetMBNulLen(), set to 0 initially meaning
2581     // "unknown"
2582     size_t m_minMBCharWidth;
2583 };
2584
2585 #endif // wxHAVE_WIN32_MB2WC
2586
2587
2588 // ============================================================================
2589 // wxEncodingConverter based conversion classes
2590 // ============================================================================
2591
2592 #if wxUSE_FONTMAP
2593
2594 class wxMBConv_wxwin : public wxMBConv
2595 {
2596 private:
2597     void Init()
2598     {
2599         // Refuse to use broken wxEncodingConverter code for Mac-specific encodings.
2600         // The wxMBConv_cf class does a better job.
2601         m_ok = (m_enc < wxFONTENCODING_MACMIN || m_enc > wxFONTENCODING_MACMAX) &&
2602                m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
2603                w2m.Init(wxFONTENCODING_UNICODE, m_enc);
2604     }
2605
2606 public:
2607     // temporarily just use wxEncodingConverter stuff,
2608     // so that it works while a better implementation is built
2609     wxMBConv_wxwin(const char* name)
2610     {
2611         if (name)
2612             m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
2613         else
2614             m_enc = wxFONTENCODING_SYSTEM;
2615
2616         Init();
2617     }
2618
2619     wxMBConv_wxwin(wxFontEncoding enc)
2620     {
2621         m_enc = enc;
2622
2623         Init();
2624     }
2625
2626     size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
2627     {
2628         size_t inbuf = strlen(psz);
2629         if (buf)
2630         {
2631             if (!m2w.Convert(psz, buf))
2632                 return wxCONV_FAILED;
2633         }
2634         return inbuf;
2635     }
2636
2637     size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
2638     {
2639         const size_t inbuf = wxWcslen(psz);
2640         if (buf)
2641         {
2642             if (!w2m.Convert(psz, buf))
2643                 return wxCONV_FAILED;
2644         }
2645
2646         return inbuf;
2647     }
2648
2649     virtual size_t GetMBNulLen() const
2650     {
2651         switch ( m_enc )
2652         {
2653             case wxFONTENCODING_UTF16BE:
2654             case wxFONTENCODING_UTF16LE:
2655                 return 2;
2656
2657             case wxFONTENCODING_UTF32BE:
2658             case wxFONTENCODING_UTF32LE:
2659                 return 4;
2660
2661             default:
2662                 return 1;
2663         }
2664     }
2665
2666     virtual wxMBConv *Clone() const { return new wxMBConv_wxwin(m_enc); }
2667
2668     bool IsOk() const { return m_ok; }
2669
2670 public:
2671     wxFontEncoding m_enc;
2672     wxEncodingConverter m2w, w2m;
2673
2674 private:
2675     // were we initialized successfully?
2676     bool m_ok;
2677
2678     DECLARE_NO_COPY_CLASS(wxMBConv_wxwin)
2679 };
2680
2681 // make the constructors available for unit testing
2682 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_wxwin( const char* name )
2683 {
2684     wxMBConv_wxwin* result = new wxMBConv_wxwin( name );
2685     if ( !result->IsOk() )
2686     {
2687         delete result;
2688         return 0;
2689     }
2690
2691     return result;
2692 }
2693
2694 #endif // wxUSE_FONTMAP
2695
2696 // ============================================================================
2697 // wxCSConv implementation
2698 // ============================================================================
2699
2700 void wxCSConv::Init()
2701 {
2702     m_name = NULL;
2703     m_convReal =  NULL;
2704     m_deferred = true;
2705 }
2706
2707 wxCSConv::wxCSConv(const wxString& charset)
2708 {
2709     Init();
2710
2711     if ( !charset.empty() )
2712     {
2713         SetName(charset.ToAscii());
2714     }
2715
2716 #if wxUSE_FONTMAP
2717     m_encoding = wxFontMapperBase::GetEncodingFromName(charset);
2718 #else
2719     m_encoding = wxFONTENCODING_SYSTEM;
2720 #endif
2721 }
2722
2723 wxCSConv::wxCSConv(wxFontEncoding encoding)
2724 {
2725     if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
2726     {
2727         wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
2728
2729         encoding = wxFONTENCODING_SYSTEM;
2730     }
2731
2732     Init();
2733
2734     m_encoding = encoding;
2735 }
2736
2737 wxCSConv::~wxCSConv()
2738 {
2739     Clear();
2740 }
2741
2742 wxCSConv::wxCSConv(const wxCSConv& conv)
2743         : wxMBConv()
2744 {
2745     Init();
2746
2747     SetName(conv.m_name);
2748     m_encoding = conv.m_encoding;
2749 }
2750
2751 wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
2752 {
2753     Clear();
2754
2755     SetName(conv.m_name);
2756     m_encoding = conv.m_encoding;
2757
2758     return *this;
2759 }
2760
2761 void wxCSConv::Clear()
2762 {
2763     free(m_name);
2764     delete m_convReal;
2765
2766     m_name = NULL;
2767     m_convReal = NULL;
2768 }
2769
2770 void wxCSConv::SetName(const char *charset)
2771 {
2772     if (charset)
2773     {
2774         m_name = wxStrdup(charset);
2775         m_deferred = true;
2776     }
2777 }
2778
2779 #if wxUSE_FONTMAP
2780
2781 WX_DECLARE_HASH_MAP( wxFontEncoding, wxString, wxIntegerHash, wxIntegerEqual,
2782                      wxEncodingNameCache );
2783
2784 static wxEncodingNameCache gs_nameCache;
2785 #endif
2786
2787 wxMBConv *wxCSConv::DoCreate() const
2788 {
2789 #if wxUSE_FONTMAP
2790     wxLogTrace(TRACE_STRCONV,
2791                wxT("creating conversion for %s"),
2792                (m_name ? m_name
2793                        : (const char*)wxFontMapperBase::GetEncodingName(m_encoding).mb_str()));
2794 #endif // wxUSE_FONTMAP
2795
2796     // check for the special case of ASCII or ISO8859-1 charset: as we have
2797     // special knowledge of it anyhow, we don't need to create a special
2798     // conversion object
2799     if ( m_encoding == wxFONTENCODING_ISO8859_1 ||
2800             m_encoding == wxFONTENCODING_DEFAULT )
2801     {
2802         // don't convert at all
2803         return NULL;
2804     }
2805
2806     // we trust OS to do conversion better than we can so try external
2807     // conversion methods first
2808     //
2809     // the full order is:
2810     //      1. OS conversion (iconv() under Unix or Win32 API)
2811     //      2. hard coded conversions for UTF
2812     //      3. wxEncodingConverter as fall back
2813
2814     // step (1)
2815 #ifdef HAVE_ICONV
2816 #if !wxUSE_FONTMAP
2817     if ( m_name )
2818 #endif // !wxUSE_FONTMAP
2819     {
2820 #if wxUSE_FONTMAP
2821         wxFontEncoding encoding(m_encoding);
2822 #endif
2823
2824         if ( m_name )
2825         {
2826             wxMBConv_iconv *conv = new wxMBConv_iconv(m_name);
2827             if ( conv->IsOk() )
2828                 return conv;
2829
2830             delete conv;
2831
2832 #if wxUSE_FONTMAP
2833             encoding =
2834                 wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
2835 #endif // wxUSE_FONTMAP
2836         }
2837 #if wxUSE_FONTMAP
2838         {
2839             const wxEncodingNameCache::iterator it = gs_nameCache.find(encoding);
2840             if ( it != gs_nameCache.end() )
2841             {
2842                 if ( it->second.empty() )
2843                     return NULL;
2844
2845                 wxMBConv_iconv *conv = new wxMBConv_iconv(it->second.ToAscii());
2846                 if ( conv->IsOk() )
2847                     return conv;
2848
2849                 delete conv;
2850             }
2851
2852             const wxChar** names = wxFontMapperBase::GetAllEncodingNames(encoding);
2853             // CS : in case this does not return valid names (eg for MacRoman)
2854             // encoding got a 'failure' entry in the cache all the same,
2855             // although it just has to be created using a different method, so
2856             // only store failed iconv creation attempts (or perhaps we
2857             // shoulnd't do this at all ?)
2858             if ( names[0] != NULL )
2859             {
2860                 for ( ; *names; ++names )
2861                 {
2862                     // FIXME-UTF8: wxFontMapperBase::GetAllEncodingNames()
2863                     //             will need changes that will obsolete this
2864                     wxString name(*names);
2865                     wxMBConv_iconv *conv = new wxMBConv_iconv(name.ToAscii());
2866                     if ( conv->IsOk() )
2867                     {
2868                         gs_nameCache[encoding] = *names;
2869                         return conv;
2870                     }
2871
2872                     delete conv;
2873                 }
2874
2875                 gs_nameCache[encoding] = _T(""); // cache the failure
2876             }
2877         }
2878 #endif // wxUSE_FONTMAP
2879     }
2880 #endif // HAVE_ICONV
2881
2882 #ifdef wxHAVE_WIN32_MB2WC
2883     {
2884 #if wxUSE_FONTMAP
2885         wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
2886                                       : new wxMBConv_win32(m_encoding);
2887         if ( conv->IsOk() )
2888             return conv;
2889
2890         delete conv;
2891 #else
2892         return NULL;
2893 #endif
2894     }
2895 #endif // wxHAVE_WIN32_MB2WC
2896
2897 #ifdef __DARWIN__
2898     {
2899         // leave UTF16 and UTF32 to the built-ins of wx
2900         if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
2901             ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
2902         {
2903 #if wxUSE_FONTMAP
2904             wxMBConv_cf *conv = m_name ? new wxMBConv_cf(m_name)
2905                                           : new wxMBConv_cf(m_encoding);
2906 #else
2907             wxMBConv_cf *conv = new wxMBConv_cf(m_encoding);
2908 #endif
2909
2910             if ( conv->IsOk() )
2911                  return conv;
2912
2913             delete conv;
2914         }
2915     }
2916 #endif // __DARWIN__
2917
2918     // step (2)
2919     wxFontEncoding enc = m_encoding;
2920 #if wxUSE_FONTMAP
2921     if ( enc == wxFONTENCODING_SYSTEM && m_name )
2922     {
2923         // use "false" to suppress interactive dialogs -- we can be called from
2924         // anywhere and popping up a dialog from here is the last thing we want to
2925         // do
2926         enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
2927     }
2928 #endif // wxUSE_FONTMAP
2929
2930     switch ( enc )
2931     {
2932         case wxFONTENCODING_UTF7:
2933              return new wxMBConvUTF7;
2934
2935         case wxFONTENCODING_UTF8:
2936              return new wxMBConvUTF8;
2937
2938         case wxFONTENCODING_UTF16BE:
2939              return new wxMBConvUTF16BE;
2940
2941         case wxFONTENCODING_UTF16LE:
2942              return new wxMBConvUTF16LE;
2943
2944         case wxFONTENCODING_UTF32BE:
2945              return new wxMBConvUTF32BE;
2946
2947         case wxFONTENCODING_UTF32LE:
2948              return new wxMBConvUTF32LE;
2949
2950         default:
2951              // nothing to do but put here to suppress gcc warnings
2952              break;
2953     }
2954
2955     // step (3)
2956 #if wxUSE_FONTMAP
2957     {
2958         wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
2959                                       : new wxMBConv_wxwin(m_encoding);
2960         if ( conv->IsOk() )
2961             return conv;
2962
2963         delete conv;
2964     }
2965 #endif // wxUSE_FONTMAP
2966
2967     // NB: This is a hack to prevent deadlock. What could otherwise happen
2968     //     in Unicode build: wxConvLocal creation ends up being here
2969     //     because of some failure and logs the error. But wxLog will try to
2970     //     attach a timestamp, for which it will need wxConvLocal (to convert
2971     //     time to char* and then wchar_t*), but that fails, tries to log the
2972     //     error, but wxLog has an (already locked) critical section that
2973     //     guards the static buffer.
2974     static bool alreadyLoggingError = false;
2975     if (!alreadyLoggingError)
2976     {
2977         alreadyLoggingError = true;
2978         wxLogError(_("Cannot convert from the charset '%s'!"),
2979                    m_name ? m_name
2980                       :
2981 #if wxUSE_FONTMAP
2982                          (const char*)wxFontMapperBase::GetEncodingDescription(m_encoding).ToAscii()
2983 #else // !wxUSE_FONTMAP
2984                          (const char*)wxString::Format(_("encoding %i"), m_encoding).ToAscii()
2985 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
2986               );
2987
2988         alreadyLoggingError = false;
2989     }
2990
2991     return NULL;
2992 }
2993
2994 void wxCSConv::CreateConvIfNeeded() const
2995 {
2996     if ( m_deferred )
2997     {
2998         wxCSConv *self = (wxCSConv *)this; // const_cast
2999
3000         // if we don't have neither the name nor the encoding, use the default
3001         // encoding for this system
3002         if ( !m_name && m_encoding == wxFONTENCODING_SYSTEM )
3003         {
3004 #if wxUSE_INTL
3005             self->m_encoding = wxLocale::GetSystemEncoding();
3006 #else
3007             // fallback to some reasonable default:
3008             self->m_encoding = wxFONTENCODING_ISO8859_1;
3009 #endif // wxUSE_INTL
3010         }
3011
3012         self->m_convReal = DoCreate();
3013         self->m_deferred = false;
3014     }
3015 }
3016
3017 bool wxCSConv::IsOk() const
3018 {
3019     CreateConvIfNeeded();
3020
3021     // special case: no convReal created for wxFONTENCODING_ISO8859_1
3022     if ( m_encoding == wxFONTENCODING_ISO8859_1 )
3023         return true; // always ok as we do it ourselves
3024
3025     // m_convReal->IsOk() is called at its own creation, so we know it must
3026     // be ok if m_convReal is non-NULL
3027     return m_convReal != NULL;
3028 }
3029
3030 size_t wxCSConv::ToWChar(wchar_t *dst, size_t dstLen,
3031                          const char *src, size_t srcLen) const
3032 {
3033     CreateConvIfNeeded();
3034
3035     if (m_convReal)
3036         return m_convReal->ToWChar(dst, dstLen, src, srcLen);
3037
3038     // latin-1 (direct)
3039     return wxMBConv::ToWChar(dst, dstLen, src, srcLen);
3040 }
3041
3042 size_t wxCSConv::FromWChar(char *dst, size_t dstLen,
3043                            const wchar_t *src, size_t srcLen) const
3044 {
3045     CreateConvIfNeeded();
3046
3047     if (m_convReal)
3048         return m_convReal->FromWChar(dst, dstLen, src, srcLen);
3049
3050     // latin-1 (direct)
3051     return wxMBConv::FromWChar(dst, dstLen, src, srcLen);
3052 }
3053
3054 size_t wxCSConv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
3055 {
3056     CreateConvIfNeeded();
3057
3058     if (m_convReal)
3059         return m_convReal->MB2WC(buf, psz, n);
3060
3061     // latin-1 (direct)
3062     size_t len = strlen(psz);
3063
3064     if (buf)
3065     {
3066         for (size_t c = 0; c <= len; c++)
3067             buf[c] = (unsigned char)(psz[c]);
3068     }
3069
3070     return len;
3071 }
3072
3073 size_t wxCSConv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
3074 {
3075     CreateConvIfNeeded();
3076
3077     if (m_convReal)
3078         return m_convReal->WC2MB(buf, psz, n);
3079
3080     // latin-1 (direct)
3081     const size_t len = wxWcslen(psz);
3082     if (buf)
3083     {
3084         for (size_t c = 0; c <= len; c++)
3085         {
3086             if (psz[c] > 0xFF)
3087                 return wxCONV_FAILED;
3088
3089             buf[c] = (char)psz[c];
3090         }
3091     }
3092     else
3093     {
3094         for (size_t c = 0; c <= len; c++)
3095         {
3096             if (psz[c] > 0xFF)
3097                 return wxCONV_FAILED;
3098         }
3099     }
3100
3101     return len;
3102 }
3103
3104 size_t wxCSConv::GetMBNulLen() const
3105 {
3106     CreateConvIfNeeded();
3107
3108     if ( m_convReal )
3109     {
3110         return m_convReal->GetMBNulLen();
3111     }
3112
3113     // otherwise, we are ISO-8859-1
3114     return 1;
3115 }
3116
3117 #if wxUSE_UNICODE_UTF8
3118 bool wxCSConv::IsUTF8() const
3119 {
3120     CreateConvIfNeeded();
3121
3122     if ( m_convReal )
3123     {
3124         return m_convReal->IsUTF8();
3125     }
3126
3127     // otherwise, we are ISO-8859-1
3128     return false;
3129 }
3130 #endif
3131
3132
3133 #if wxUSE_UNICODE
3134
3135 wxWCharBuffer wxSafeConvertMB2WX(const char *s)
3136 {
3137     if ( !s )
3138         return wxWCharBuffer();
3139
3140     wxWCharBuffer wbuf(wxConvLibc.cMB2WX(s));
3141     if ( !wbuf )
3142         wbuf = wxMBConvUTF8().cMB2WX(s);
3143     if ( !wbuf )
3144         wbuf = wxConvISO8859_1.cMB2WX(s);
3145
3146     return wbuf;
3147 }
3148
3149 wxCharBuffer wxSafeConvertWX2MB(const wchar_t *ws)
3150 {
3151     if ( !ws )
3152         return wxCharBuffer();
3153
3154     wxCharBuffer buf(wxConvLibc.cWX2MB(ws));
3155     if ( !buf )
3156         buf = wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL).cWX2MB(ws);
3157
3158     return buf;
3159 }
3160
3161 #endif // wxUSE_UNICODE
3162
3163 // ----------------------------------------------------------------------------
3164 // globals
3165 // ----------------------------------------------------------------------------
3166
3167 // NB: The reason why we create converted objects in this convoluted way,
3168 //     using a factory function instead of global variable, is that they
3169 //     may be used at static initialization time (some of them are used by
3170 //     wxString ctors and there may be a global wxString object). In other
3171 //     words, possibly _before_ the converter global object would be
3172 //     initialized.
3173
3174 #undef wxConvLibc
3175 #undef wxConvUTF8
3176 #undef wxConvUTF7
3177 #undef wxConvLocal
3178 #undef wxConvISO8859_1
3179
3180 #define WX_DEFINE_GLOBAL_CONV2(klass, impl_klass, name, ctor_args)      \
3181     WXDLLIMPEXP_DATA_BASE(klass*) name##Ptr = NULL;                     \
3182     WXDLLIMPEXP_BASE klass* wxGet_##name##Ptr()                         \
3183     {                                                                   \
3184         static impl_klass name##Obj ctor_args;                          \
3185         return &name##Obj;                                              \
3186     }                                                                   \
3187     /* this ensures that all global converter objects are created */    \
3188     /* by the time static initialization is done, i.e. before any */    \
3189     /* thread is launched: */                                           \
3190     static klass* gs_##name##instance = wxGet_##name##Ptr()
3191
3192 #define WX_DEFINE_GLOBAL_CONV(klass, name, ctor_args) \
3193     WX_DEFINE_GLOBAL_CONV2(klass, klass, name, ctor_args)
3194
3195 #ifdef __WINDOWS__
3196     WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConv_win32, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
3197 #else
3198     WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConvLibc, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
3199 #endif
3200
3201 // NB: we can't use wxEMPTY_PARAMETER_VALUE as final argument here because it's
3202 //     passed to WX_DEFINE_GLOBAL_CONV2 after a macro expansion and so still
3203 //     provokes an error message about "not enough macro parameters"; and we
3204 //     can't use "()" here as the name##Obj declaration would be parsed as a
3205 //     function declaration then, so use a semicolon and live with an extra
3206 //     empty statement (and hope that no compilers warns about this)
3207 WX_DEFINE_GLOBAL_CONV(wxMBConvStrictUTF8, wxConvUTF8, ;);
3208 WX_DEFINE_GLOBAL_CONV(wxMBConvUTF7, wxConvUTF7, ;);
3209
3210 WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvLocal, (wxFONTENCODING_SYSTEM));
3211 WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvISO8859_1, (wxFONTENCODING_ISO8859_1));
3212
3213 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = wxGet_wxConvLibcPtr();
3214 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvUI = wxGet_wxConvLocalPtr();
3215
3216 #ifdef __DARWIN__
3217 // The xnu kernel always communicates file paths in decomposed UTF-8.
3218 // WARNING: Are we sure that CFString's conversion will cause decomposition?
3219 static wxMBConv_cf wxConvMacUTF8DObj(wxFONTENCODING_UTF8);
3220 #endif
3221
3222 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName =
3223 #ifdef __DARWIN__
3224                                     &wxConvMacUTF8DObj;
3225 #else // !__DARWIN__
3226                                     wxGet_wxConvLibcPtr();
3227 #endif // __DARWIN__/!__DARWIN__
3228
3229 #else // !wxUSE_WCHAR_T
3230
3231 // FIXME-UTF8: remove this, wxUSE_WCHAR_T is required now
3232 // stand-ins in absence of wchar_t
3233 WXDLLIMPEXP_DATA_BASE(wxMBConv) wxConvLibc,
3234                                 wxConvISO8859_1,
3235                                 wxConvLocal,
3236                                 wxConvUTF8;
3237
3238 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T