src/common/strconv.cpp

   1 /////////////////////////////////////////////////////////////////////////////
   2 // Name:        src/common/strconv.cpp
   3 // Purpose:     Unicode conversion classes
   4 // Author:      Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
   5 //              Ryan Norton, Fredrik Roubert (UTF7)
   6 // Modified by:
   7 // Created:     29/01/98
   8 // RCS-ID:      $Id$
   9 // Copyright:   (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
  10 //              (c) 2000-2003 Vadim Zeitlin
  11 //              (c) 2004 Ryan Norton, Fredrik Roubert
  12 // Licence:     wxWindows licence
  13 /////////////////////////////////////////////////////////////////////////////
  14
  15 // For compilers that support precompilation, includes "wx.h".
  16 #include "wx/wxprec.h"
  17
  18 #ifdef __BORLANDC__
  19     #pragma hdrstop
  20 #endif  //__BORLANDC__
  21
  22 #ifndef WX_PRECOMP
  23     #include "wx/intl.h"
  24     #include "wx/log.h"
  25     #include "wx/utils.h"
  26     #include "wx/hashmap.h"
  27 #endif
  28
  29 #include "wx/strconv.h"
  30
  31 #if wxUSE_WCHAR_T
  32
  33 #ifndef __WXWINCE__
  34 #include <errno.h>
  35 #endif
  36
  37 #include <ctype.h>
  38 #include <string.h>
  39 #include <stdlib.h>
  40
  41 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
  42     #include "wx/msw/private.h"
  43     #include "wx/msw/missing.h"
  44     #define wxHAVE_WIN32_MB2WC
  45 #endif
  46
  47 #ifdef HAVE_ICONV
  48     #include <iconv.h>
  49     #include "wx/thread.h"
  50 #endif
  51
  52 #include "wx/encconv.h"
  53 #include "wx/fontmap.h"
  54
  55 #ifdef __DARWIN__
  56 #include "wx/mac/corefoundation/private/strconv_cf.h"
  57 #endif //def __DARWIN__
  58
  59
  60 #define TRACE_STRCONV _T("strconv")
  61
  62 // WC_UTF16 is defined only if sizeof(wchar_t) == 2, otherwise it's supposed to
  63 // be 4 bytes
  64 #if SIZEOF_WCHAR_T == 2
  65     #define WC_UTF16
  66 #endif
  67
  68
  69 // ============================================================================
  70 // implementation
  71 // ============================================================================
  72
  73 // helper function of cMB2WC(): check if n bytes at this location are all NUL
  74 static bool NotAllNULs(const char *p, size_t n)
  75 {
  76     while ( n && *p++ == '\0' )
  77         n--;
  78
  79     return n != 0;
  80 }
  81
  82 // ----------------------------------------------------------------------------
  83 // UTF-16 en/decoding to/from UCS-4 with surrogates handling
  84 // ----------------------------------------------------------------------------
  85
  86 static size_t encode_utf16(wxUint32 input, wxUint16 *output)
  87 {
  88     if (input <= 0xffff)
  89     {
  90         if (output)
  91             *output = (wxUint16) input;
  92
  93         return 1;
  94     }
  95     else if (input >= 0x110000)
  96     {
  97         return wxCONV_FAILED;
  98     }
  99     else
 100     {
 101         if (output)
 102         {
 103             *output++ = (wxUint16) ((input >> 10) + 0xd7c0);
 104             *output = (wxUint16) ((input & 0x3ff) + 0xdc00);
 105         }
 106
 107         return 2;
 108     }
 109 }
 110
 111 static size_t decode_utf16(const wxUint16* input, wxUint32& output)
 112 {
 113     if ((*input < 0xd800) || (*input > 0xdfff))
 114     {
 115         output = *input;
 116         return 1;
 117     }
 118     else if ((input[1] < 0xdc00) || (input[1] > 0xdfff))
 119     {
 120         output = *input;
 121         return wxCONV_FAILED;
 122     }
 123     else
 124     {
 125         output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
 126         return 2;
 127     }
 128 }
 129
// type of the UTF-16 code units processed by wxDecodeSurrogate(): wchar_t
// itself when WC_UTF16 is defined (i.e. when SIZEOF_WCHAR_T is 2) and wxUint16
// otherwise
#ifdef WC_UTF16
    typedef wchar_t wxDecodeSurrogate_t;
#else // !WC_UTF16
    typedef wxUint16 wxDecodeSurrogate_t;
#endif // WC_UTF16/!WC_UTF16
 135
 136 // returns the next UTF-32 character from the wchar_t buffer and advances the
 137 // pointer to the character after this one
 138 //
 139 // if an invalid character is found, *pSrc is set to NULL, the caller must
 140 // check for this
 141 static wxUint32 wxDecodeSurrogate(const wxDecodeSurrogate_t **pSrc)
 142 {
 143     wxUint32 out;
 144     const size_t
 145         n = decode_utf16(wx_reinterpret_cast(const wxUint16 *, *pSrc), out);
 146     if ( n == wxCONV_FAILED )
 147         *pSrc = NULL;
 148     else
 149         *pSrc += n;
 150
 151     return out;
 152 }
 153
 154 // ----------------------------------------------------------------------------
 155 // wxMBConv
 156 // ----------------------------------------------------------------------------
 157
// Default implementation of ToWChar() written in terms of MB2WC().
//
// Converts srcLen bytes starting at src, or just the first NUL-terminated
// string found there if srcLen is wxNO_LEN, to wide characters. The input is
// processed in NUL-terminated chunks, each of them being passed to MB2WC().
//
// dst may be NULL, in which case nothing is stored and only the length is
// computed, otherwise the output is written to it and must not exceed dstLen
// wide characters.
//
// Returns the number of wide characters [which would be] written to dst,
// counting one L'\0' per chunk, or wxCONV_FAILED if MB2WC() or GetMBNulLen()
// failed or if dst is too small.
size_t
wxMBConv::ToWChar(wchar_t *dst, size_t dstLen,
                  const char *src, size_t srcLen) const
{
    // although new conversion classes are supposed to implement this function
    // directly, the existing ones only implement the old MB2WC() and so, to
    // avoid having to rewrite all conversion classes at once, we provide a
    // default (but not efficient) implementation of this one in terms of the
    // old function by copying the input to ensure that it's NUL-terminated and
    // then using MB2WC() to convert it

    // the number of chars [which would be] written to dst [if it were not NULL]
    size_t dstWritten = 0;

    // the number of NULs terminating this string
    size_t nulLen = 0;  // not really needed, but just to avoid warnings

    // if we were not given the input size we just have to assume that the
    // string is properly terminated as we have no way of knowing how long it
    // is anyhow, but if we do have the size check whether there are enough
    // NULs at the end
    wxCharBuffer bufTmp;
    const char *srcEnd;
    if ( srcLen != wxNO_LEN )
    {
        // we need to know how to find the end of this string
        nulLen = GetMBNulLen();
        if ( nulLen == wxCONV_FAILED )
            return wxCONV_FAILED;

        // if there are enough NULs we can avoid the copy
        if ( srcLen < nulLen || NotAllNULs(src + srcLen - nulLen, nulLen) )
        {
            // make a copy in order to properly NUL-terminate the string
            bufTmp = wxCharBuffer(srcLen + nulLen - 1 /* 1 will be added */);
            char * const p = bufTmp.data();
            memcpy(p, src, srcLen);

            // append the nulLen bytes of the terminator after the copied data
            for ( char *s = p + srcLen; s < p + srcLen + nulLen; s++ )
                *s = '\0';

            src = bufTmp;
        }

        srcEnd = src + srcLen;
    }
    else // quit after the first loop iteration
    {
        srcEnd = NULL;
    }

    for ( ;; )
    {
        // try to convert the current chunk
        size_t lenChunk = MB2WC(NULL, src, 0);
        if ( lenChunk == wxCONV_FAILED )
            return wxCONV_FAILED;

        lenChunk++; // for the L'\0' at the end of this chunk

        dstWritten += lenChunk;

        if ( lenChunk == 1 )
        {
            // nothing left in the input string, conversion succeeded
            //
            // NOTE(review): the L'\0' counted in dstWritten just above is not
            // stored in dst here as we exit before the copy below -- callers
            // presumably rely on the buffer being already zeroed, confirm
            break;
        }

        if ( dst )
        {
            if ( dstWritten > dstLen )
                return wxCONV_FAILED;

            if ( MB2WC(dst, src, lenChunk) == wxCONV_FAILED )
                return wxCONV_FAILED;

            dst += lenChunk;
        }

        if ( !srcEnd )
        {
            // we convert just one chunk in this case as this is the entire
            // string anyhow
            break;
        }

        // advance the input pointer past the end of this chunk
        while ( NotAllNULs(src, nulLen) )
        {
            // notice that we must skip over multiple bytes here as we suppose
            // that if NUL takes 2 or 4 bytes, then all the other characters do
            // too and so if advanced by a single byte we might erroneously
            // detect sequences of NUL bytes in the middle of the input
            src += nulLen;
        }

        src += nulLen; // skipping over its terminator as well

        // note that ">=" (and not just "==") is needed here as the terminator
        // we skipped just above could be inside or just after the buffer
        // delimited by srcEnd
        if ( src >= srcEnd )
            break;
    }

    return dstWritten;
}
 264
 265 size_t
 266 wxMBConv::FromWChar(char *dst, size_t dstLen,
 267                     const wchar_t *src, size_t srcLen) const
 268 {
 269     // the number of chars [which would be] written to dst [if it were not NULL]
 270     size_t dstWritten = 0;
 271
 272     // make a copy of the input string unless it is already properly
 273     // NUL-terminated
 274     //
 275     // if we don't know its length we have no choice but to assume that it is,
 276     // indeed, properly terminated
 277     wxWCharBuffer bufTmp;
 278     if ( srcLen == wxNO_LEN )
 279     {
 280         srcLen = wxWcslen(src) + 1;
 281     }
 282     else if ( srcLen != 0 && src[srcLen - 1] != L'\0' )
 283     {
 284         // make a copy in order to properly NUL-terminate the string
 285         bufTmp = wxWCharBuffer(srcLen);
 286         memcpy(bufTmp.data(), src, srcLen * sizeof(wchar_t));
 287         src = bufTmp;
 288     }
 289
 290     const size_t lenNul = GetMBNulLen();
 291     for ( const wchar_t * const srcEnd = src + srcLen;
 292           src < srcEnd;
 293           src += wxWcslen(src) + 1 /* skip L'\0' too */ )
 294     {
 295         // try to convert the current chunk
 296         size_t lenChunk = WC2MB(NULL, src, 0);
 297
 298         if ( lenChunk == wxCONV_FAILED )
 299             return wxCONV_FAILED;
 300
 301         lenChunk += lenNul;
 302         dstWritten += lenChunk;
 303
 304         if ( dst )
 305         {
 306             if ( dstWritten > dstLen )
 307                 return wxCONV_FAILED;
 308
 309             if ( WC2MB(dst, src, lenChunk) == wxCONV_FAILED )
 310                 return wxCONV_FAILED;
 311
 312             dst += lenChunk;
 313         }
 314     }
 315
 316     return dstWritten;
 317 }
 318
 319 size_t wxMBConv::MB2WC(wchar_t *outBuff, const char *inBuff, size_t outLen) const
 320 {
 321     size_t rc = ToWChar(outBuff, outLen, inBuff);
 322     if ( rc != wxCONV_FAILED )
 323     {
 324         // ToWChar() returns the buffer length, i.e. including the trailing
 325         // NUL, while this method doesn't take it into account
 326         rc--;
 327     }
 328
 329     return rc;
 330 }
 331
 332 size_t wxMBConv::WC2MB(char *outBuff, const wchar_t *inBuff, size_t outLen) const
 333 {
 334     size_t rc = FromWChar(outBuff, outLen, inBuff);
 335     if ( rc != wxCONV_FAILED )
 336     {
 337         rc -= GetMBNulLen();
 338     }
 339
 340     return rc;
 341 }
 342
 343 wxMBConv::~wxMBConv()
 344 {
 345     // nothing to do here (necessary for Darwin linking probably)
 346 }
 347
 348 const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
 349 {
 350     if ( psz )
 351     {
 352         // calculate the length of the buffer needed first
 353         const size_t nLen = ToWChar(NULL, 0, psz);
 354         if ( nLen != wxCONV_FAILED )
 355         {
 356             // now do the actual conversion
 357             wxWCharBuffer buf(nLen - 1 /* +1 added implicitly */);
 358
 359             // +1 for the trailing NULL
 360             if ( ToWChar(buf.data(), nLen, psz) != wxCONV_FAILED )
 361                 return buf;
 362         }
 363     }
 364
 365     return wxWCharBuffer();
 366 }
 367
 368 const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
 369 {
 370     if ( pwz )
 371     {
 372         const size_t nLen = FromWChar(NULL, 0, pwz);
 373         if ( nLen != wxCONV_FAILED )
 374         {
 375             wxCharBuffer buf(nLen - 1);
 376             if ( FromWChar(buf.data(), nLen, pwz) != wxCONV_FAILED )
 377                 return buf;
 378         }
 379     }
 380
 381     return wxCharBuffer();
 382 }
 383
 384 const wxWCharBuffer
 385 wxMBConv::cMB2WC(const char *inBuff, size_t inLen, size_t *outLen) const
 386 {
 387     const size_t dstLen = ToWChar(NULL, 0, inBuff, inLen);
 388     if ( dstLen != wxCONV_FAILED )
 389     {
 390         // notice that we allocate space for dstLen+1 wide characters here
 391         // because we want the buffer to always be NUL-terminated, even if the
 392         // input isn't (as otherwise the caller has no way to know its length)
 393         wxWCharBuffer wbuf(dstLen);
 394         wbuf.data()[dstLen - 1] = L'\0';
 395         if ( ToWChar(wbuf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
 396         {
 397             if ( outLen )
 398             {
 399                 *outLen = dstLen;
 400                 if ( wbuf[dstLen - 1] == L'\0' )
 401                     (*outLen)--;
 402             }
 403
 404             return wbuf;
 405         }
 406     }
 407
 408     if ( outLen )
 409         *outLen = 0;
 410
 411     return wxWCharBuffer();
 412 }
 413
 414 const wxCharBuffer
 415 wxMBConv::cWC2MB(const wchar_t *inBuff, size_t inLen, size_t *outLen) const
 416 {
 417     size_t dstLen = FromWChar(NULL, 0, inBuff, inLen);
 418     if ( dstLen != wxCONV_FAILED )
 419     {
 420         const size_t nulLen = GetMBNulLen();
 421
 422         // as above, ensure that the buffer is always NUL-terminated, even if
 423         // the input is not
 424         wxCharBuffer buf(dstLen + nulLen - 1);
 425         memset(buf.data() + dstLen, 0, nulLen);
 426         if ( FromWChar(buf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
 427         {
 428             if ( outLen )
 429             {
 430                 *outLen = dstLen;
 431
 432                 if ( dstLen >= nulLen &&
 433                         !NotAllNULs(buf.data() + dstLen - nulLen, nulLen) )
 434                 {
 435                     // in this case the output is NUL-terminated and we're not
 436                     // supposed to count NUL
 437                     *outLen -= nulLen;
 438                 }
 439             }
 440
 441             return buf;
 442         }
 443     }
 444
 445     if ( outLen )
 446         *outLen = 0;
 447
 448     return wxCharBuffer();
 449 }
 450
 451 // ----------------------------------------------------------------------------
 452 // wxMBConvLibc
 453 // ----------------------------------------------------------------------------
 454
 455 size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 456 {
 457     return wxMB2WC(buf, psz, n);
 458 }
 459
 460 size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 461 {
 462     return wxWC2MB(buf, psz, n);
 463 }
 464
 465 // ----------------------------------------------------------------------------
 466 // wxConvBrokenFileNames
 467 // ----------------------------------------------------------------------------
 468
 469 #ifdef __UNIX__
 470
 471 wxConvBrokenFileNames::wxConvBrokenFileNames(const wxString& charset)
 472 {
 473     if ( wxStricmp(charset, _T("UTF-8")) == 0 ||
 474          wxStricmp(charset, _T("UTF8")) == 0  )
 475         m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_PUA);
 476     else
 477         m_conv = new wxCSConv(charset);
 478 }
 479
 480 #endif // __UNIX__
 481
 482 // ----------------------------------------------------------------------------
 483 // UTF-7
 484 // ----------------------------------------------------------------------------
 485
 486 // Implementation (C) 2004 Fredrik Roubert
 487 //
 488 // Changes to work in streaming mode (C) 2008 Vadim Zeitlin
 489
//
// BASE64 decoding table
//
// It is indexed by the byte value and gives the 6 bit value of the
// corresponding base64 character ('A'-'Z', 'a'-'z', '0'-'9', '+' and '/') or
// 0xff for all the bytes which are not base64 characters.
//
static const unsigned char utf7unb64[] =
{
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
    0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b,
    0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
    0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
    0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
    0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
    0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
    0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30,
    0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
};
 528
// Decodes UTF-7 encoded input to wide characters.
//
// If srcLen is wxNO_LEN, the entire NUL-terminated string src, including its
// terminator, is converted using a fresh local decoder state. Otherwise
// exactly srcLen bytes are examined and the shift state stored in
// m_stateDecoder is both used and updated, so that the input can be decoded
// piece by piece.
//
// dst may be NULL, in which case nothing is stored and only the length is
// computed, otherwise decoding stops once dstLen characters were written.
//
// Returns the number of wide characters [which would be] written to dst or
// wxCONV_FAILED if the input is invalid or if no characters at all were
// produced.
size_t wxMBConvUTF7::ToWChar(wchar_t *dst, size_t dstLen,
                             const char *src, size_t srcLen) const
{
    DecoderState stateOrig,
         *statePtr;
    if ( srcLen == wxNO_LEN )
    {
        // convert the entire string, up to and including the trailing NUL
        srcLen = strlen(src) + 1;

        // when working on the entire strings we don't update nor use the shift
        // state from the previous call
        statePtr = &stateOrig;
    }
    else // when working with partial strings we do use the shift state
    {
        // this method is const, so cast constness away to update the state
        statePtr = wx_const_cast(DecoderState *, &m_stateDecoder);

        // also save the old state to be able to rollback to it on error
        //
        // NOTE(review): only the "no characters decoded" case at the end of
        // this function restores it, the early error returns below leave the
        // state as modified so far -- confirm whether this is intended
        stateOrig = m_stateDecoder;
    }

    // but to simplify the code below we use this variable in both cases
    DecoderState& state = *statePtr;


    // number of characters [which would have been] written to dst [if it were
    // not NULL]
    size_t len = 0;

    const char * const srcEnd = src + srcLen;

    while ( (src < srcEnd) && (!dst || (len < dstLen)) )
    {
        const unsigned char cc = *src++;

        if ( state.IsShifted() )
        {
            // 0xff in the decoding table means "not a base64 character"
            const unsigned char dc = utf7unb64[cc];
            if ( dc == 0xff )
            {
                // end of encoded part
                state.ToDirect();

                // re-parse this character normally below unless it's '-' which
                // is consumed by the decoder
                if ( cc == '-' )
                    continue;
            }
            else // valid encoded character
            {
                // mini base64 decoder: each character is 6 bits
                state.bit += 6;
                state.accum <<= 6;
                state.accum += dc;

                if ( state.bit >= 8 )
                {
                    // got the full byte, consume it
                    state.bit -= 8;
                    unsigned char b = (state.accum >> state.bit) & 0x00ff;

                    if ( state.isLSB )
                    {
                        // we've got the full word, output it
                        if ( dst )
                            *dst++ = (state.msb << 8) | b;
                        len++;
                        state.isLSB = false;
                    }
                    else // MSB
                    {
                        // just store it while we wait for LSB
                        state.msb = b;
                        state.isLSB = true;
                    }
                }
            }
        }

        // this is deliberately not an "else": a character ending the encoded
        // part (other than '-') is handled here in the same iteration
        if ( state.IsDirect() )
        {
            // start of an encoded segment?
            if ( cc == '+' )
            {
                if ( src == srcEnd )
                    return wxCONV_FAILED; // can't have '+' at the end

                if ( *src == '-' )
                {
                    // just the encoded plus sign, don't switch to shifted mode
                    if ( dst )
                        *dst++ = '+';
                    len++;
                    src++;
                }
                else
                {
                    state.ToShifted();
                }
            }
            else // not '+'
            {
                // only printable 7 bit ASCII characters (with the exception of
                // NUL, TAB, CR and LF) can be used directly
                if ( cc >= 0x7f || (cc < ' ' &&
                      !(cc == '\0' || cc == '\t' || cc == '\r' || cc == '\n')) )
                    return wxCONV_FAILED;

                if ( dst )
                    *dst++ = cc;
                len++;
            }
        }
    }

    if ( !len )
    {
        // as we didn't read any characters we should be called with the same
        // data (followed by some more new data) again later so don't save our
        // state
        state = stateOrig;

        return wxCONV_FAILED;
    }

    return len;
}
 657
//
// BASE64 encoding table
//
// It is indexed by a 6 bit value (0..63) and gives the base64 character used
// to represent it.
//
static const unsigned char utf7enb64[] =
{
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
    'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
    'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
    'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
    'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
    'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
    'w', 'x', 'y', 'z', '0', '1', '2', '3',
    '4', '5', '6', '7', '8', '9', '+', '/'
};
 672
//
// UTF-7 encoding table
//
// It is indexed by the 7 bit ASCII code of the character and gives its class:
//
// 0 - Set D (directly encoded characters)
// 1 - Set O (optional direct characters)
// 2 - whitespace characters (optional)
// 3 - special characters
//
// Only the characters of class 0 are treated as direct by wxIsUTF7Direct().
//
static const unsigned char utf7encode[128] =
{
    0, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
};
 692
 693 static inline bool wxIsUTF7Direct(wchar_t wc)
 694 {
 695     return wc < 0x80 && utf7encode[wc] < 1;
 696 }
 697
// Encodes wide characters as UTF-7.
//
// If srcLen is wxNO_LEN, the entire NUL-terminated string src, including its
// terminator, is converted using a fresh local encoder state. Otherwise
// exactly srcLen characters are converted and the state stored in
// m_stateEncoder is both used and, unless dst is NULL, updated.
//
// dst may be NULL, in which case nothing is stored and only the length is
// computed.
//
// Returns the number of bytes [which would be] written to dst or
// wxCONV_FAILED if a character greater than 0xffff is encountered in the
// builds where WC_UTF16 is not defined.
//
// NOTE(review): dstLen is only checked at the top of the main loop while a
// single iteration may write several bytes, so dst can apparently be written
// past dstLen -- confirm that callers always size dst using a first pass with
// NULL dst.
size_t wxMBConvUTF7::FromWChar(char *dst, size_t dstLen,
                               const wchar_t *src, size_t srcLen) const
{
    EncoderState stateOrig,
                *statePtr;
    if ( srcLen == wxNO_LEN )
    {
        // we don't apply the stored state when operating on entire strings at
        // once
        statePtr = &stateOrig;

        srcLen = wxWcslen(src) + 1;
    }
    else // do use the mode we left the output in previously
    {
        stateOrig = m_stateEncoder;

        // this method is const, so cast constness away to update the state
        statePtr = wx_const_cast(EncoderState *, &m_stateEncoder);
    }

    EncoderState& state = *statePtr;


    // number of bytes [which would have been] written to dst
    size_t len = 0;

    const wchar_t * const srcEnd = src + srcLen;
    while ( src < srcEnd && (!dst || len < dstLen) )
    {
        wchar_t cc = *src++;
        if ( wxIsUTF7Direct(cc) )
        {
            if ( state.IsShifted() )
            {
                // pad with zeros the last encoded block if necessary
                if ( state.bit )
                {
                    if ( dst )
                        *dst++ = utf7enb64[((state.accum % 16) << (6 - state.bit)) % 64];
                    len++;
                }

                state.ToDirect();

                // explicitly terminate the encoded part
                if ( dst )
                    *dst++ = '-';
                len++;
            }

            if ( dst )
                *dst++ = (char)cc;
            len++;
        }
        else if ( cc == '+' && state.IsDirect() )
        {
            // plus sign itself is encoded as "+-"
            if ( dst )
            {
                *dst++ = '+';
                *dst++ = '-';
            }

            len += 2;
        }
#ifndef WC_UTF16
        else if (((wxUint32)cc) > 0xffff)
        {
            // no surrogate pair generation (yet?)
            return wxCONV_FAILED;
        }
#endif
        else
        {
            if ( state.IsDirect() )
            {
                state.ToShifted();

                // start of the encoded part
                if ( dst )
                    *dst++ = '+';
                len++;
            }

            // BASE64 encode string
            for ( ;; )
            {
                // each character is encoded as 2 bytes, most significant first
                for ( unsigned lsb = 0; lsb < 2; lsb++ )
                {
                    state.accum <<= 8;
                    state.accum += lsb ? cc & 0xff : (cc & 0xff00) >> 8;

                    // output all the complete 6 bit groups accumulated so far
                    for (state.bit += 8; state.bit >= 6; )
                    {
                        state.bit -= 6;
                        if ( dst )
                            *dst++ = utf7enb64[(state.accum >> state.bit) % 64];
                        len++;
                    }
                }

                // continue encoding until the end of input or the next direct
                // character, which is left for the outer loop to handle
                if ( src == srcEnd || wxIsUTF7Direct(cc = *src) )
                    break;

                src++;
            }
        }
    }

    // we need to restore the original encoder state if we were called just to
    // calculate the amount of space needed as we will presumably be called
    // again to really convert the data now
    if ( !dst )
        state = stateOrig;

    return len;
}
 810
 811 // ----------------------------------------------------------------------------
 812 // UTF-8
 813 // ----------------------------------------------------------------------------
 814
// presumably the maximal values which can be encoded using UTF-8 sequences of
// increasing length; not used in the part of the file seen here -- TODO
// confirm against its users
static const wxUint32 utf8_max[]=
    { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };

// boundaries of the private use area we use to (temporarily) remap the
// characters which are invalid in a UTF-8 encoded string: the range is 256
// values long
const wxUint32 wxUnicodePUA = 0x100000;
const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
 822
// this table gives the length of the UTF-8 encoding from its first character:
// it is indexed by the value of the first byte of the sequence and contains 0
// for the bytes which can't start a valid sequence
const unsigned char tableUtf8Lengths[256] = {
    // single-byte sequences (ASCII):
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 00..0F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 10..1F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 20..2F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 30..3F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 40..4F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 50..5F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 60..6F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 70..7F

    // these are invalid (80..BF are only used as continuation bytes and C0,
    // C1 are not followed by a length here either):
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 80..8F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 90..9F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // A0..AF
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // B0..BF
    0, 0,                                            // C0,C1

    // two-byte sequences:
          2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // C2..CF
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // D0..DF

    // three-byte sequences:
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,  // E0..EF

    // four-byte sequences:
    4, 4, 4, 4, 4,                                   // F0..F4

    // these are invalid again (5- or 6-byte
    // sequences and sequences for code points
    // above U+10FFFF, as restricted by RFC 3629):
                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0   // F5..FF
};
 857
// Decodes UTF-8 input to wide characters.
//
// Converts srcLen bytes of src or, if srcLen is wxNO_LEN, the entire
// NUL-terminated string including its terminator.
//
// If dstLen is 0, nothing is stored (dst is not used) and only the length is
// computed, otherwise the output is written to dst which has room for dstLen
// wide characters.
//
// Returns the number of wide characters [which would be] written or
// wxCONV_FAILED if the input is not valid or if dst is too small.
size_t
wxMBConvStrictUTF8::ToWChar(wchar_t *dst, size_t dstLen,
                            const char *src, size_t srcLen) const
{
    // output pointer, NULL if we're only computing the length
    wchar_t *out = dstLen ? dst : NULL;
    size_t written = 0;

    // NOTE(review): after this srcLen doesn't seem to ever be equal to
    // wxNO_LEN, so the branches testing for it below look unreachable and
    // the trailing NUL is converted as any other character -- confirm
    if ( srcLen == wxNO_LEN )
        srcLen = strlen(src) + 1;

    // all the "break" statements in this loop lead to the failure return at
    // the end of the function, success is only returned from inside the loop
    for ( const char *p = src; ; p++ )
    {
        if ( !(srcLen == wxNO_LEN ? *p : srcLen) )
        {
            // all done successfully, just add the trailing NULL if we are not
            // using explicit length
            if ( srcLen == wxNO_LEN )
            {
                if ( out )
                {
                    if ( !dstLen )
                        break;

                    *out = L'\0';
                }

                written++;
            }

            return written;
        }

        // check that there is still space left in the output buffer
        //
        // NOTE(review): only one wide character is accounted for here even
        // though two are written for a surrogate pair if WC_UTF16 is defined
        if ( out && !dstLen-- )
            break;

        wxUint32 code;
        unsigned char c = *p;

        if ( c < 0x80 )
        {
            if ( srcLen == 0 ) // the test works for wxNO_LEN too
                break;

            if ( srcLen != wxNO_LEN )
                srcLen--;

            code = c;
        }
        else
        {
            // 0 length indicates a byte which can't start a sequence
            unsigned len = tableUtf8Lengths[c];
            if ( !len )
                break;

            if ( srcLen < len ) // the test works for wxNO_LEN too
                break;

            if ( srcLen != wxNO_LEN )
                srcLen -= len;

            //   Char. number range   |        UTF-8 octet sequence
            //      (hexadecimal)     |              (binary)
            //  ----------------------+----------------------------------------
            //  0000 0000 - 0000 007F | 0xxxxxxx
            //  0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
            //  0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
            //  0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
            //
            //  Code point value is stored in bits marked with 'x',
            //  lowest-order bit of the value on the right side in the diagram
            //  above.                                         (from RFC 3629)

            // mask to extract lead byte's value ('x' bits above), by sequence
            // length:
            static const unsigned char leadValueMask[] = { 0x7F, 0x1F, 0x0F, 0x07 };

            // mask and value of lead byte's most significant bits, by length:
            static const unsigned char leadMarkerMask[] = { 0x80, 0xE0, 0xF0, 0xF8 };
            static const unsigned char leadMarkerVal[] = { 0x00, 0xC0, 0xE0, 0xF0 };

            len--; // it's more convenient to work with 0-based length here

            // extract the lead byte's value bits:
            if ( (c & leadMarkerMask[len]) != leadMarkerVal[len] )
                break;

            code = c & leadValueMask[len];

            // all remaining bytes, if any, are handled in the same way
            // regardless of sequence's length:
            for ( ; len; --len )
            {
                c = *++p;
                if ( (c & 0xC0) != 0x80 )
                    return wxCONV_FAILED;

                code <<= 6;
                code |= c & 0x3F;
            }

            // NOTE(review): the decoded value is not checked for being an
            // overlong encoding or a surrogate here -- confirm if this is
            // acceptable for a "strict" decoder
        }

#ifdef WC_UTF16
        // cast is ok because wchar_t == wxUint16 if WC_UTF16
        //
        // NOTE(review): wxCONV_FAILED returned by encode_utf16() for too big
        // values is not handled here
        if ( encode_utf16(code, (wxUint16 *)out) == 2 )
        {
            // skip the extra unit used by the surrogate pair
            if ( out )
                out++;
            written++;
        }
#else // !WC_UTF16
        if ( out )
            *out = code;
#endif // WC_UTF16/!WC_UTF16

        if ( out )
            out++;

        written++;
    }

    return wxCONV_FAILED;
}
 980
 981 size_t
 982 wxMBConvStrictUTF8::FromWChar(char *dst, size_t dstLen,
 983                               const wchar_t *src, size_t srcLen) const
 984 {
 985     char *out = dstLen ? dst : NULL;
 986     size_t written = 0;
 987
 988     for ( const wchar_t *wp = src; ; wp++ )
 989     {
 990         if ( !(srcLen == wxNO_LEN ? *wp : srcLen--) )
 991         {
 992             // all done successfully, just add the trailing NULL if we are not
 993             // using explicit length
 994             if ( srcLen == wxNO_LEN )
 995             {
 996                 if ( out )
 997                 {
 998                     if ( !dstLen )
 999                         break;
1000
1001                     *out = '\0';
1002                 }
1003
1004                 written++;
1005             }
1006
1007             return written;
1008         }
1009
1010
1011         wxUint32 code;
1012 #ifdef WC_UTF16
1013         // cast is ok for WC_UTF16
1014         if ( decode_utf16((const wxUint16 *)wp, code) == 2 )
1015         {
1016             // skip the next char too as we decoded a surrogate
1017             wp++;
1018         }
1019 #else // wchar_t is UTF-32
1020         code = *wp & 0x7fffffff;
1021 #endif
1022
1023         unsigned len;
1024         if ( code <= 0x7F )
1025         {
1026             len = 1;
1027             if ( out )
1028             {
1029                 if ( dstLen < len )
1030                     break;
1031
1032                 out[0] = (char)code;
1033             }
1034         }
1035         else if ( code <= 0x07FF )
1036         {
1037             len = 2;
1038             if ( out )
1039             {
1040                 if ( dstLen < len )
1041                     break;
1042
1043                 // NB: this line takes 6 least significant bits, encodes them as
1044                 // 10xxxxxx and discards them so that the next byte can be encoded:
1045                 out[1] = 0x80 | (code & 0x3F);  code >>= 6;
1046                 out[0] = 0xC0 | code;
1047             }
1048         }
1049         else if ( code < 0xFFFF )
1050         {
1051             len = 3;
1052             if ( out )
1053             {
1054                 if ( dstLen < len )
1055                     break;
1056
1057                 out[2] = 0x80 | (code & 0x3F);  code >>= 6;
1058                 out[1] = 0x80 | (code & 0x3F);  code >>= 6;
1059                 out[0] = 0xE0 | code;
1060             }
1061         }
1062         else if ( code <= 0x10FFFF )
1063         {
1064             len = 4;
1065             if ( out )
1066             {
1067                 if ( dstLen < len )
1068                     break;
1069
1070                 out[3] = 0x80 | (code & 0x3F);  code >>= 6;
1071                 out[2] = 0x80 | (code & 0x3F);  code >>= 6;
1072                 out[1] = 0x80 | (code & 0x3F);  code >>= 6;
1073                 out[0] = 0xF0 | code;
1074             }
1075         }
1076         else
1077         {
1078             wxFAIL_MSG( _T("trying to encode undefined Unicode character") );
1079             break;
1080         }
1081
1082         if ( out )
1083         {
1084             out += len;
1085             dstLen -= len;
1086         }
1087
1088         written += len;
1089     }
1090
1091     // we only get here if an error occurs during decoding
1092     return wxCONV_FAILED;
1093 }
1094
1095 size_t wxMBConvUTF8::ToWChar(wchar_t *buf, size_t n,
1096                              const char *psz, size_t srcLen) const
1097 {
1098     if ( m_options == MAP_INVALID_UTF8_NOT )
1099         return wxMBConvStrictUTF8::ToWChar(buf, n, psz, srcLen);
1100
1101     size_t len = 0;
1102
1103     while ((srcLen == wxNO_LEN ? *psz : srcLen--) && ((!buf) || (len < n)))
1104     {
1105         const char *opsz = psz;
1106         bool invalid = false;
1107         unsigned char cc = *psz++, fc = cc;
1108         unsigned cnt;
1109         for (cnt = 0; fc & 0x80; cnt++)
1110             fc <<= 1;
1111
1112         if (!cnt)
1113         {
1114             // plain ASCII char
1115             if (buf)
1116                 *buf++ = cc;
1117             len++;
1118
1119             // escape the escape character for octal escapes
1120             if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
1121                     && cc == '\\' && (!buf || len < n))
1122             {
1123                 if (buf)
1124                     *buf++ = cc;
1125                 len++;
1126             }
1127         }
1128         else
1129         {
1130             cnt--;
1131             if (!cnt)
1132             {
1133                 // invalid UTF-8 sequence
1134                 invalid = true;
1135             }
1136             else
1137             {
1138                 unsigned ocnt = cnt - 1;
1139                 wxUint32 res = cc & (0x3f >> cnt);
1140                 while (cnt--)
1141                 {
1142                     cc = *psz;
1143                     if ((cc & 0xC0) != 0x80)
1144                     {
1145                         // invalid UTF-8 sequence
1146                         invalid = true;
1147                         break;
1148                     }
1149
1150                     psz++;
1151                     res = (res << 6) | (cc & 0x3f);
1152                 }
1153
1154                 if (invalid || res <= utf8_max[ocnt])
1155                 {
1156                     // illegal UTF-8 encoding
1157                     invalid = true;
1158                 }
1159                 else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
1160                         res >= wxUnicodePUA && res < wxUnicodePUAEnd)
1161                 {
1162                     // if one of our PUA characters turns up externally
1163                     // it must also be treated as an illegal sequence
1164                     // (a bit like you have to escape an escape character)
1165                     invalid = true;
1166                 }
1167                 else
1168                 {
1169 #ifdef WC_UTF16
1170                     // cast is ok because wchar_t == wxUint16 if WC_UTF16
1171                     size_t pa = encode_utf16(res, (wxUint16 *)buf);
1172                     if (pa == wxCONV_FAILED)
1173                     {
1174                         invalid = true;
1175                     }
1176                     else
1177                     {
1178                         if (buf)
1179                             buf += pa;
1180                         len += pa;
1181                     }
1182 #else // !WC_UTF16
1183                     if (buf)
1184                         *buf++ = (wchar_t)res;
1185                     len++;
1186 #endif // WC_UTF16/!WC_UTF16
1187                 }
1188             }
1189
1190             if (invalid)
1191             {
1192                 if (m_options & MAP_INVALID_UTF8_TO_PUA)
1193                 {
1194                     while (opsz < psz && (!buf || len < n))
1195                     {
1196 #ifdef WC_UTF16
1197                         // cast is ok because wchar_t == wxUuint16 if WC_UTF16
1198                         size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
1199                         wxASSERT(pa != wxCONV_FAILED);
1200                         if (buf)
1201                             buf += pa;
1202                         opsz++;
1203                         len += pa;
1204 #else
1205                         if (buf)
1206                             *buf++ = (wchar_t)(wxUnicodePUA + (unsigned char)*opsz);
1207                         opsz++;
1208                         len++;
1209 #endif
1210                     }
1211                 }
1212                 else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
1213                 {
1214                     while (opsz < psz && (!buf || len < n))
1215                     {
1216                         if ( buf && len + 3 < n )
1217                         {
1218                             unsigned char on = *opsz;
1219                             *buf++ = L'\\';
1220                             *buf++ = (wchar_t)( L'0' + on / 0100 );
1221                             *buf++ = (wchar_t)( L'0' + (on % 0100) / 010 );
1222                             *buf++ = (wchar_t)( L'0' + on % 010 );
1223                         }
1224
1225                         opsz++;
1226                         len += 4;
1227                     }
1228                 }
1229                 else // MAP_INVALID_UTF8_NOT
1230                 {
1231                     return wxCONV_FAILED;
1232                 }
1233             }
1234         }
1235     }
1236
1237     if (srcLen == wxNO_LEN && buf && (len < n))
1238         *buf = 0;
1239
1240     return len + 1;
1241 }
1242
// Return true if wch is one of the octal digits L'0' to L'7'.
static inline bool isoctal(wchar_t wch)
{
    if ( wch < L'0' )
        return false;

    return !(wch > L'7');
}
1247
1248 size_t wxMBConvUTF8::FromWChar(char *buf, size_t n,
1249                                const wchar_t *psz, size_t srcLen) const
1250 {
1251     if ( m_options == MAP_INVALID_UTF8_NOT )
1252         return wxMBConvStrictUTF8::FromWChar(buf, n, psz, srcLen);
1253
1254     size_t len = 0;
1255
1256     while ((srcLen == wxNO_LEN ? *psz : srcLen--) && ((!buf) || (len < n)))
1257     {
1258         wxUint32 cc;
1259
1260 #ifdef WC_UTF16
1261         // cast is ok for WC_UTF16
1262         size_t pa = decode_utf16((const wxUint16 *)psz, cc);
1263         psz += (pa == wxCONV_FAILED) ? 1 : pa;
1264 #else
1265         cc = (*psz++) & 0x7fffffff;
1266 #endif
1267
1268         if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
1269                 && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
1270         {
1271             if (buf)
1272                 *buf++ = (char)(cc - wxUnicodePUA);
1273             len++;
1274         }
1275         else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL)
1276                     && cc == L'\\' && psz[0] == L'\\' )
1277         {
1278             if (buf)
1279                 *buf++ = (char)cc;
1280             psz++;
1281             len++;
1282         }
1283         else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
1284                     cc == L'\\' &&
1285                         isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
1286         {
1287             if (buf)
1288             {
1289                 *buf++ = (char) ((psz[0] - L'0') * 0100 +
1290                                  (psz[1] - L'0') * 010 +
1291                                  (psz[2] - L'0'));
1292             }
1293
1294             psz += 3;
1295             len++;
1296         }
1297         else
1298         {
1299             unsigned cnt;
1300             for (cnt = 0; cc > utf8_max[cnt]; cnt++)
1301             {
1302             }
1303
1304             if (!cnt)
1305             {
1306                 // plain ASCII char
1307                 if (buf)
1308                     *buf++ = (char) cc;
1309                 len++;
1310             }
1311             else
1312             {
1313                 len += cnt + 1;
1314                 if (buf)
1315                 {
1316                     *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
1317                     while (cnt--)
1318                         *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
1319                 }
1320             }
1321         }
1322     }
1323
1324     if (srcLen == wxNO_LEN && buf && (len < n))
1325         *buf = 0;
1326
1327     return len + 1;
1328 }
1329
1330 // ============================================================================
1331 // UTF-16
1332 // ============================================================================
1333
// The conversions below are implemented for a "straight" converter, which
// copies the code units unchanged, and a "swap" one, which byte-swaps each of
// them: map these names to the LE and BE classes depending on whether
// WORDS_BIGENDIAN is defined for this build.
#ifdef WORDS_BIGENDIAN
    #define wxMBConvUTF16straight wxMBConvUTF16BE
    #define wxMBConvUTF16swap     wxMBConvUTF16LE
#else
    #define wxMBConvUTF16swap     wxMBConvUTF16BE
    #define wxMBConvUTF16straight wxMBConvUTF16LE
#endif
1341
1342 /* static */
1343 size_t wxMBConvUTF16Base::GetLength(const char *src, size_t srcLen)
1344 {
1345     if ( srcLen == wxNO_LEN )
1346     {
1347         // count the number of bytes in input, including the trailing NULs
1348         const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1349         for ( srcLen = 1; *inBuff++; srcLen++ )
1350             ;
1351
1352         srcLen *= BYTES_PER_CHAR;
1353     }
1354     else // we already have the length
1355     {
1356         // we can only convert an entire number of UTF-16 characters
1357         if ( srcLen % BYTES_PER_CHAR )
1358             return wxCONV_FAILED;
1359     }
1360
1361     return srcLen;
1362 }
1363
1364 // case when in-memory representation is UTF-16 too
1365 #ifdef WC_UTF16
1366
1367 // ----------------------------------------------------------------------------
1368 // conversions without endianness change
1369 // ----------------------------------------------------------------------------
1370
1371 size_t
1372 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1373                                const char *src, size_t srcLen) const
1374 {
1375     // set up the scene for using memcpy() (which is presumably more efficient
1376     // than copying the bytes one by one)
1377     srcLen = GetLength(src, srcLen);
1378     if ( srcLen == wxNO_LEN )
1379         return wxCONV_FAILED;
1380
1381     const size_t inLen = srcLen / BYTES_PER_CHAR;
1382     if ( dst )
1383     {
1384         if ( dstLen < inLen )
1385             return wxCONV_FAILED;
1386
1387         memcpy(dst, src, srcLen);
1388     }
1389
1390     return inLen;
1391 }
1392
1393 size_t
1394 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1395                                  const wchar_t *src, size_t srcLen) const
1396 {
1397     if ( srcLen == wxNO_LEN )
1398         srcLen = wxWcslen(src) + 1;
1399
1400     srcLen *= BYTES_PER_CHAR;
1401
1402     if ( dst )
1403     {
1404         if ( dstLen < srcLen )
1405             return wxCONV_FAILED;
1406
1407         memcpy(dst, src, srcLen);
1408     }
1409
1410     return srcLen;
1411 }
1412
1413 // ----------------------------------------------------------------------------
1414 // endian-reversing conversions
1415 // ----------------------------------------------------------------------------
1416
1417 size_t
1418 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1419                            const char *src, size_t srcLen) const
1420 {
1421     srcLen = GetLength(src, srcLen);
1422     if ( srcLen == wxNO_LEN )
1423         return wxCONV_FAILED;
1424
1425     srcLen /= BYTES_PER_CHAR;
1426
1427     if ( dst )
1428     {
1429         if ( dstLen < srcLen )
1430             return wxCONV_FAILED;
1431
1432         const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1433         for ( size_t n = 0; n < srcLen; n++, inBuff++ )
1434         {
1435             *dst++ = wxUINT16_SWAP_ALWAYS(*inBuff);
1436         }
1437     }
1438
1439     return srcLen;
1440 }
1441
1442 size_t
1443 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1444                              const wchar_t *src, size_t srcLen) const
1445 {
1446     if ( srcLen == wxNO_LEN )
1447         srcLen = wxWcslen(src) + 1;
1448
1449     srcLen *= BYTES_PER_CHAR;
1450
1451     if ( dst )
1452     {
1453         if ( dstLen < srcLen )
1454             return wxCONV_FAILED;
1455
1456         wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1457         for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
1458         {
1459             *outBuff++ = wxUINT16_SWAP_ALWAYS(*src);
1460         }
1461     }
1462
1463     return srcLen;
1464 }
1465
1466 #else // !WC_UTF16: wchar_t is UTF-32
1467
1468 // ----------------------------------------------------------------------------
1469 // conversions without endianness change
1470 // ----------------------------------------------------------------------------
1471
1472 size_t
1473 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1474                                const char *src, size_t srcLen) const
1475 {
1476     srcLen = GetLength(src, srcLen);
1477     if ( srcLen == wxNO_LEN )
1478         return wxCONV_FAILED;
1479
1480     const size_t inLen = srcLen / BYTES_PER_CHAR;
1481     if ( !dst )
1482     {
1483         // optimization: return maximal space which could be needed for this
1484         // string even if the real size could be smaller if the buffer contains
1485         // any surrogates
1486         return inLen;
1487     }
1488
1489     size_t outLen = 0;
1490     const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1491     for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1492     {
1493         const wxUint32 ch = wxDecodeSurrogate(&inBuff);
1494         if ( !inBuff )
1495             return wxCONV_FAILED;
1496
1497         if ( ++outLen > dstLen )
1498             return wxCONV_FAILED;
1499
1500         *dst++ = ch;
1501     }
1502
1503
1504     return outLen;
1505 }
1506
1507 size_t
1508 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1509                                  const wchar_t *src, size_t srcLen) const
1510 {
1511     if ( srcLen == wxNO_LEN )
1512         srcLen = wxWcslen(src) + 1;
1513
1514     size_t outLen = 0;
1515     wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1516     for ( size_t n = 0; n < srcLen; n++ )
1517     {
1518         wxUint16 cc[2];
1519         const size_t numChars = encode_utf16(*src++, cc);
1520         if ( numChars == wxCONV_FAILED )
1521             return wxCONV_FAILED;
1522
1523         outLen += numChars * BYTES_PER_CHAR;
1524         if ( outBuff )
1525         {
1526             if ( outLen > dstLen )
1527                 return wxCONV_FAILED;
1528
1529             *outBuff++ = cc[0];
1530             if ( numChars == 2 )
1531             {
1532                 // second character of a surrogate
1533                 *outBuff++ = cc[1];
1534             }
1535         }
1536     }
1537
1538     return outLen;
1539 }
1540
1541 // ----------------------------------------------------------------------------
1542 // endian-reversing conversions
1543 // ----------------------------------------------------------------------------
1544
1545 size_t
1546 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1547                            const char *src, size_t srcLen) const
1548 {
1549     srcLen = GetLength(src, srcLen);
1550     if ( srcLen == wxNO_LEN )
1551         return wxCONV_FAILED;
1552
1553     const size_t inLen = srcLen / BYTES_PER_CHAR;
1554     if ( !dst )
1555     {
1556         // optimization: return maximal space which could be needed for this
1557         // string even if the real size could be smaller if the buffer contains
1558         // any surrogates
1559         return inLen;
1560     }
1561
1562     size_t outLen = 0;
1563     const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1564     for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1565     {
1566         wxUint32 ch;
1567         wxUint16 tmp[2];
1568
1569         tmp[0] = wxUINT16_SWAP_ALWAYS(*inBuff);
1570         inBuff++;
1571         tmp[1] = wxUINT16_SWAP_ALWAYS(*inBuff);
1572
1573         const size_t numChars = decode_utf16(tmp, ch);
1574         if ( numChars == wxCONV_FAILED )
1575             return wxCONV_FAILED;
1576
1577         if ( numChars == 2 )
1578             inBuff++;
1579
1580         if ( ++outLen > dstLen )
1581             return wxCONV_FAILED;
1582
1583         *dst++ = ch;
1584     }
1585
1586
1587     return outLen;
1588 }
1589
1590 size_t
1591 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1592                              const wchar_t *src, size_t srcLen) const
1593 {
1594     if ( srcLen == wxNO_LEN )
1595         srcLen = wxWcslen(src) + 1;
1596
1597     size_t outLen = 0;
1598     wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1599     for ( const wchar_t *srcEnd = src + srcLen; src < srcEnd; src++ )
1600     {
1601         wxUint16 cc[2];
1602         const size_t numChars = encode_utf16(*src, cc);
1603         if ( numChars == wxCONV_FAILED )
1604             return wxCONV_FAILED;
1605
1606         outLen += numChars * BYTES_PER_CHAR;
1607         if ( outBuff )
1608         {
1609             if ( outLen > dstLen )
1610                 return wxCONV_FAILED;
1611
1612             *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[0]);
1613             if ( numChars == 2 )
1614             {
1615                 // second character of a surrogate
1616                 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[1]);
1617             }
1618         }
1619     }
1620
1621     return outLen;
1622 }
1623
1624 #endif // WC_UTF16/!WC_UTF16
1625
1626
1627 // ============================================================================
1628 // UTF-32
1629 // ============================================================================
1630
// The conversions below are implemented for a "straight" converter, which
// uses the 32 bit values unchanged, and a "swap" one, which byte-swaps each of
// them: map these names to the LE and BE classes depending on whether
// WORDS_BIGENDIAN is defined for this build.
#ifdef WORDS_BIGENDIAN
    #define wxMBConvUTF32straight  wxMBConvUTF32BE
    #define wxMBConvUTF32swap      wxMBConvUTF32LE
#else
    #define wxMBConvUTF32swap      wxMBConvUTF32BE
    #define wxMBConvUTF32straight  wxMBConvUTF32LE
#endif


// definitions of the global UTF-32 converter objects
WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
1642
1643 /* static */
1644 size_t wxMBConvUTF32Base::GetLength(const char *src, size_t srcLen)
1645 {
1646     if ( srcLen == wxNO_LEN )
1647     {
1648         // count the number of bytes in input, including the trailing NULs
1649         const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1650         for ( srcLen = 1; *inBuff++; srcLen++ )
1651             ;
1652
1653         srcLen *= BYTES_PER_CHAR;
1654     }
1655     else // we already have the length
1656     {
1657         // we can only convert an entire number of UTF-32 characters
1658         if ( srcLen % BYTES_PER_CHAR )
1659             return wxCONV_FAILED;
1660     }
1661
1662     return srcLen;
1663 }
1664
1665 // case when in-memory representation is UTF-16
1666 #ifdef WC_UTF16
1667
1668 // ----------------------------------------------------------------------------
1669 // conversions without endianness change
1670 // ----------------------------------------------------------------------------
1671
1672 size_t
1673 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1674                                const char *src, size_t srcLen) const
1675 {
1676     srcLen = GetLength(src, srcLen);
1677     if ( srcLen == wxNO_LEN )
1678         return wxCONV_FAILED;
1679
1680     const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1681     const size_t inLen = srcLen / BYTES_PER_CHAR;
1682     size_t outLen = 0;
1683     for ( size_t n = 0; n < inLen; n++ )
1684     {
1685         wxUint16 cc[2];
1686         const size_t numChars = encode_utf16(*inBuff++, cc);
1687         if ( numChars == wxCONV_FAILED )
1688             return wxCONV_FAILED;
1689
1690         outLen += numChars;
1691         if ( dst )
1692         {
1693             if ( outLen > dstLen )
1694                 return wxCONV_FAILED;
1695
1696             *dst++ = cc[0];
1697             if ( numChars == 2 )
1698             {
1699                 // second character of a surrogate
1700                 *dst++ = cc[1];
1701             }
1702         }
1703     }
1704
1705     return outLen;
1706 }
1707
1708 size_t
1709 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1710                                  const wchar_t *src, size_t srcLen) const
1711 {
1712     if ( srcLen == wxNO_LEN )
1713         srcLen = wxWcslen(src) + 1;
1714
1715     if ( !dst )
1716     {
1717         // optimization: return maximal space which could be needed for this
1718         // string instead of the exact amount which could be less if there are
1719         // any surrogates in the input
1720         //
1721         // we consider that surrogates are rare enough to make it worthwhile to
1722         // avoid running the loop below at the cost of slightly extra memory
1723         // consumption
1724         return srcLen * BYTES_PER_CHAR;
1725     }
1726
1727     wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1728     size_t outLen = 0;
1729     for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1730     {
1731         const wxUint32 ch = wxDecodeSurrogate(&src);
1732         if ( !src )
1733             return wxCONV_FAILED;
1734
1735         outLen += BYTES_PER_CHAR;
1736
1737         if ( outLen > dstLen )
1738             return wxCONV_FAILED;
1739
1740         *outBuff++ = ch;
1741     }
1742
1743     return outLen;
1744 }
1745
1746 // ----------------------------------------------------------------------------
1747 // endian-reversing conversions
1748 // ----------------------------------------------------------------------------
1749
1750 size_t
1751 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1752                            const char *src, size_t srcLen) const
1753 {
1754     srcLen = GetLength(src, srcLen);
1755     if ( srcLen == wxNO_LEN )
1756         return wxCONV_FAILED;
1757
1758     const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1759     const size_t inLen = srcLen / BYTES_PER_CHAR;
1760     size_t outLen = 0;
1761     for ( size_t n = 0; n < inLen; n++, inBuff++ )
1762     {
1763         wxUint16 cc[2];
1764         const size_t numChars = encode_utf16(wxUINT32_SWAP_ALWAYS(*inBuff), cc);
1765         if ( numChars == wxCONV_FAILED )
1766             return wxCONV_FAILED;
1767
1768         outLen += numChars;
1769         if ( dst )
1770         {
1771             if ( outLen > dstLen )
1772                 return wxCONV_FAILED;
1773
1774             *dst++ = cc[0];
1775             if ( numChars == 2 )
1776             {
1777                 // second character of a surrogate
1778                 *dst++ = cc[1];
1779             }
1780         }
1781     }
1782
1783     return outLen;
1784 }
1785
1786 size_t
1787 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1788                              const wchar_t *src, size_t srcLen) const
1789 {
1790     if ( srcLen == wxNO_LEN )
1791         srcLen = wxWcslen(src) + 1;
1792
1793     if ( !dst )
1794     {
1795         // optimization: return maximal space which could be needed for this
1796         // string instead of the exact amount which could be less if there are
1797         // any surrogates in the input
1798         //
1799         // we consider that surrogates are rare enough to make it worthwhile to
1800         // avoid running the loop below at the cost of slightly extra memory
1801         // consumption
1802         return srcLen*BYTES_PER_CHAR;
1803     }
1804
1805     wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1806     size_t outLen = 0;
1807     for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1808     {
1809         const wxUint32 ch = wxDecodeSurrogate(&src);
1810         if ( !src )
1811             return wxCONV_FAILED;
1812
1813         outLen += BYTES_PER_CHAR;
1814
1815         if ( outLen > dstLen )
1816             return wxCONV_FAILED;
1817
1818         *outBuff++ = wxUINT32_SWAP_ALWAYS(ch);
1819     }
1820
1821     return outLen;
1822 }
1823
1824 #else // !WC_UTF16: wchar_t is UTF-32
1825
1826 // ----------------------------------------------------------------------------
1827 // conversions without endianness change
1828 // ----------------------------------------------------------------------------
1829
1830 size_t
1831 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1832                                const char *src, size_t srcLen) const
1833 {
1834     // use memcpy() as it should be much faster than hand-written loop
1835     srcLen = GetLength(src, srcLen);
1836     if ( srcLen == wxNO_LEN )
1837         return wxCONV_FAILED;
1838
1839     const size_t inLen = srcLen/BYTES_PER_CHAR;
1840     if ( dst )
1841     {
1842         if ( dstLen < inLen )
1843             return wxCONV_FAILED;
1844
1845         memcpy(dst, src, srcLen);
1846     }
1847
1848     return inLen;
1849 }
1850
1851 size_t
1852 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1853                                  const wchar_t *src, size_t srcLen) const
1854 {
1855     if ( srcLen == wxNO_LEN )
1856         srcLen = wxWcslen(src) + 1;
1857
1858     srcLen *= BYTES_PER_CHAR;
1859
1860     if ( dst )
1861     {
1862         if ( dstLen < srcLen )
1863             return wxCONV_FAILED;
1864
1865         memcpy(dst, src, srcLen);
1866     }
1867
1868     return srcLen;
1869 }
1870
1871 // ----------------------------------------------------------------------------
1872 // endian-reversing conversions
1873 // ----------------------------------------------------------------------------
1874
1875 size_t
1876 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1877                            const char *src, size_t srcLen) const
1878 {
1879     srcLen = GetLength(src, srcLen);
1880     if ( srcLen == wxNO_LEN )
1881         return wxCONV_FAILED;
1882
1883     srcLen /= BYTES_PER_CHAR;
1884
1885     if ( dst )
1886     {
1887         if ( dstLen < srcLen )
1888             return wxCONV_FAILED;
1889
1890         const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1891         for ( size_t n = 0; n < srcLen; n++, inBuff++ )
1892         {
1893             *dst++ = wxUINT32_SWAP_ALWAYS(*inBuff);
1894         }
1895     }
1896
1897     return srcLen;
1898 }
1899
1900 size_t
1901 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1902                              const wchar_t *src, size_t srcLen) const
1903 {
1904     if ( srcLen == wxNO_LEN )
1905         srcLen = wxWcslen(src) + 1;
1906
1907     srcLen *= BYTES_PER_CHAR;
1908
1909     if ( dst )
1910     {
1911         if ( dstLen < srcLen )
1912             return wxCONV_FAILED;
1913
1914         wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1915         for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
1916         {
1917             *outBuff++ = wxUINT32_SWAP_ALWAYS(*src);
1918         }
1919     }
1920
1921     return srcLen;
1922 }
1923
1924 #endif // WC_UTF16/!WC_UTF16
1925
1926
1927 // ============================================================================
1928 // The classes doing conversion using the iconv_xxx() functions
1929 // ============================================================================
1930
1931 #ifdef HAVE_ICONV
1932
// VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
//     E2BIG if the output buffer is _exactly_ as big as needed. Such a case is
//     (unless there's yet another bug in glibc) the only case when iconv()
//     returns (size_t)-1 (which means error) and says there are 0 bytes
//     left in the input buffer -- when a _real_ error occurs, the number of
//     bytes left in the input buffer is non-zero. Hence, this alternative
//     test for iconv() failure.
//     [This bug does not appear in glibc 2.2.]
//
// ICONV_FAILED(cres, bufLeft): cres is the value returned by iconv() and
// bufLeft the number of input bytes it left unconverted. Both arguments are
// parenthesised so that arbitrary expressions can be passed safely.
#if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
#define ICONV_FAILED(cres, bufLeft) (((cres) == (size_t)-1) && \
                                     (errno != E2BIG || (bufLeft) != 0))
#else
#define ICONV_FAILED(cres, bufLeft)  ((cres) == (size_t)-1)
#endif

// cast the input buffer pointer to whatever constness this platform's iconv()
// prototype expects
#define ICONV_CHAR_CAST(x)  ((ICONV_CONST char **)(x))

// the value iconv_open() returns on failure
#define ICONV_T_INVALID ((iconv_t)-1)
1951
1952 #if SIZEOF_WCHAR_T == 4
1953     #define WC_BSWAP    wxUINT32_SWAP_ALWAYS
1954     #define WC_ENC      wxFONTENCODING_UTF32
1955 #elif SIZEOF_WCHAR_T == 2
1956     #define WC_BSWAP    wxUINT16_SWAP_ALWAYS
1957     #define WC_ENC      wxFONTENCODING_UTF16
1958 #else // sizeof(wchar_t) != 2 nor 4
1959     // does this ever happen?
1960     #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
1961 #endif
1962
1963 // ----------------------------------------------------------------------------
1964 // wxMBConv_iconv: encapsulates an iconv character set
1965 // ----------------------------------------------------------------------------
1966
1967 class wxMBConv_iconv : public wxMBConv
1968 {
1969 public:
1970     wxMBConv_iconv(const char *name);
1971     virtual ~wxMBConv_iconv();
1972
1973     // implement base class virtual methods
1974     virtual size_t ToWChar(wchar_t *dst, size_t dstLen,
1975                            const char *src, size_t srcLen = wxNO_LEN) const;
1976     virtual size_t FromWChar(char *dst, size_t dstLen,
1977                              const wchar_t *src, size_t srcLen = wxNO_LEN) const;
1978     virtual size_t GetMBNulLen() const;
1979
1980 #if wxUSE_UNICODE_UTF8
1981     virtual bool IsUTF8() const;
1982 #endif
1983
1984     virtual wxMBConv *Clone() const
1985     {
1986         wxMBConv_iconv *p = new wxMBConv_iconv(m_name.ToAscii());
1987         p->m_minMBCharWidth = m_minMBCharWidth;
1988         return p;
1989     }
1990
1991     bool IsOk() const
1992         { return (m2w != ICONV_T_INVALID) && (w2m != ICONV_T_INVALID); }
1993
1994 protected:
1995     // the iconv handlers used to translate from multibyte
1996     // to wide char and in the other direction
1997     iconv_t m2w,
1998             w2m;
1999
2000 #if wxUSE_THREADS
2001     // guards access to m2w and w2m objects
2002     wxMutex m_iconvMutex;
2003 #endif
2004
2005 private:
2006     // the name (for iconv_open()) of a wide char charset -- if none is
2007     // available on this machine, it will remain NULL
2008     static wxString ms_wcCharsetName;
2009
2010     // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
2011     // different endian-ness than the native one
2012     static bool ms_wcNeedsSwap;
2013
2014
2015     // name of the encoding handled by this conversion
2016     wxString m_name;
2017
2018     // cached result of GetMBNulLen(); set to 0 meaning "unknown"
2019     // initially
2020     size_t m_minMBCharWidth;
2021 };
2022
2023 // make the constructor available for unit testing
2024 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_iconv( const char* name )
2025 {
2026     wxMBConv_iconv* result = new wxMBConv_iconv( name );
2027     if ( !result->IsOk() )
2028     {
2029         delete result;
2030         return 0;
2031     }
2032
2033     return result;
2034 }
2035
2036 wxString wxMBConv_iconv::ms_wcCharsetName;
2037 bool wxMBConv_iconv::ms_wcNeedsSwap = false;
2038
2039 wxMBConv_iconv::wxMBConv_iconv(const char *name)
2040               : m_name(name)
2041 {
2042     m_minMBCharWidth = 0;
2043
2044     // check for charset that represents wchar_t:
2045     if ( ms_wcCharsetName.empty() )
2046     {
2047         wxLogTrace(TRACE_STRCONV, _T("Looking for wide char codeset:"));
2048
2049 #if wxUSE_FONTMAP
2050         const wxChar **names = wxFontMapperBase::GetAllEncodingNames(WC_ENC);
2051 #else // !wxUSE_FONTMAP
2052         static const wxChar *names_static[] =
2053         {
2054 #if SIZEOF_WCHAR_T == 4
2055             _T("UCS-4"),
2056 #elif SIZEOF_WCHAR_T = 2
2057             _T("UCS-2"),
2058 #endif
2059             NULL
2060         };
2061         const wxChar **names = names_static;
2062 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
2063
2064         for ( ; *names && ms_wcCharsetName.empty(); ++names )
2065         {
2066             const wxString nameCS(*names);
2067
2068             // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
2069             wxString nameXE(nameCS);
2070
2071 #ifdef WORDS_BIGENDIAN
2072                 nameXE += _T("BE");
2073 #else // little endian
2074                 nameXE += _T("LE");
2075 #endif
2076
2077             wxLogTrace(TRACE_STRCONV, _T("  trying charset \"%s\""),
2078                        nameXE.c_str());
2079
2080             m2w = iconv_open(nameXE.ToAscii(), name);
2081             if ( m2w == ICONV_T_INVALID )
2082             {
2083                 // try charset w/o bytesex info (e.g. "UCS4")
2084                 wxLogTrace(TRACE_STRCONV, _T("  trying charset \"%s\""),
2085                            nameCS.c_str());
2086                 m2w = iconv_open(nameCS.ToAscii(), name);
2087
2088                 // and check for bytesex ourselves:
2089                 if ( m2w != ICONV_T_INVALID )
2090                 {
2091                     char    buf[2], *bufPtr;
2092                     wchar_t wbuf[2];
2093                     size_t  insz, outsz;
2094                     size_t  res;
2095
2096                     buf[0] = 'A';
2097                     buf[1] = 0;
2098                     wbuf[0] = 0;
2099                     insz = 2;
2100                     outsz = SIZEOF_WCHAR_T * 2;
2101                     char* wbufPtr = (char*)wbuf;
2102                     bufPtr = buf;
2103
2104                     res = iconv(
2105                         m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
2106                         &wbufPtr, &outsz);
2107
2108                     if (ICONV_FAILED(res, insz))
2109                     {
2110                         wxLogLastError(wxT("iconv"));
2111                         wxLogError(_("Conversion to charset '%s' doesn't work."),
2112                                    nameCS.c_str());
2113                     }
2114                     else // ok, can convert to this encoding, remember it
2115                     {
2116                         ms_wcCharsetName = nameCS;
2117                         ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
2118                     }
2119                 }
2120             }
2121             else // use charset not requiring byte swapping
2122             {
2123                 ms_wcCharsetName = nameXE;
2124             }
2125         }
2126
2127         wxLogTrace(TRACE_STRCONV,
2128                    wxT("iconv wchar_t charset is \"%s\"%s"),
2129                    ms_wcCharsetName.empty() ? wxString("<none>")
2130                                             : ms_wcCharsetName,
2131                    ms_wcNeedsSwap ? _T(" (needs swap)")
2132                                   : _T(""));
2133     }
2134     else // we already have ms_wcCharsetName
2135     {
2136         m2w = iconv_open(ms_wcCharsetName.ToAscii(), name);
2137     }
2138
2139     if ( ms_wcCharsetName.empty() )
2140     {
2141         w2m = ICONV_T_INVALID;
2142     }
2143     else
2144     {
2145         w2m = iconv_open(name, ms_wcCharsetName.ToAscii());
2146         if ( w2m == ICONV_T_INVALID )
2147         {
2148             wxLogTrace(TRACE_STRCONV,
2149                        wxT("\"%s\" -> \"%s\" works but not the converse!?"),
2150                        ms_wcCharsetName.c_str(), name);
2151         }
2152     }
2153 }
2154
2155 wxMBConv_iconv::~wxMBConv_iconv()
2156 {
2157     if ( m2w != ICONV_T_INVALID )
2158         iconv_close(m2w);
2159     if ( w2m != ICONV_T_INVALID )
2160         iconv_close(w2m);
2161 }
2162
2163 size_t
2164 wxMBConv_iconv::ToWChar(wchar_t *dst, size_t dstLen,
2165                         const char *src, size_t srcLen) const
2166 {
2167     if ( srcLen == wxNO_LEN )
2168     {
2169         // find the string length: notice that must be done differently for
2170         // NUL-terminated strings and UTF-16/32 which are terminated with 2/4
2171         // consecutive NULs
2172         const size_t nulLen = GetMBNulLen();
2173         switch ( nulLen )
2174         {
2175             default:
2176                 return wxCONV_FAILED;
2177
2178             case 1:
2179                 srcLen = strlen(src); // arguably more optimized than our version
2180                 break;
2181
2182             case 2:
2183             case 4:
2184                 // for UTF-16/32 not only we need to have 2/4 consecutive NULs
2185                 // but they also have to start at character boundary and not
2186                 // span two adjacent characters
2187                 const char *p;
2188                 for ( p = src; NotAllNULs(p, nulLen); p += nulLen )
2189                     ;
2190                 srcLen = p - src;
2191                 break;
2192         }
2193
2194         // when we're determining the length of the string ourselves we count
2195         // the terminating NUL(s) as part of it and always NUL-terminate the
2196         // output
2197         srcLen += nulLen;
2198     }
2199
2200     // we express length in the number of (wide) characters but iconv always
2201     // counts buffer sizes it in bytes
2202     dstLen *= SIZEOF_WCHAR_T;
2203
2204 #if wxUSE_THREADS
2205     // NB: iconv() is MT-safe, but each thread must use its own iconv_t handle.
2206     //     Unfortunately there are a couple of global wxCSConv objects such as
2207     //     wxConvLocal that are used all over wx code, so we have to make sure
2208     //     the handle is used by at most one thread at the time. Otherwise
2209     //     only a few wx classes would be safe to use from non-main threads
2210     //     as MB<->WC conversion would fail "randomly".
2211     wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
2212 #endif // wxUSE_THREADS
2213
2214     size_t res, cres;
2215     const char *pszPtr = src;
2216
2217     if ( dst )
2218     {
2219         char* bufPtr = (char*)dst;
2220
2221         // have destination buffer, convert there
2222         size_t dstLenOrig = dstLen;
2223         cres = iconv(m2w,
2224                      ICONV_CHAR_CAST(&pszPtr), &srcLen,
2225                      &bufPtr, &dstLen);
2226
2227         // convert the number of bytes converted as returned by iconv to the
2228         // number of (wide) characters converted that we need
2229         res = (dstLenOrig - dstLen) / SIZEOF_WCHAR_T;
2230
2231         if (ms_wcNeedsSwap)
2232         {
2233             // convert to native endianness
2234             for ( unsigned i = 0; i < res; i++ )
2235                 dst[i] = WC_BSWAP(dst[i]);
2236         }
2237     }
2238     else // no destination buffer
2239     {
2240         // convert using temp buffer to calculate the size of the buffer needed
2241         wchar_t tbuf[8];
2242         res = 0;
2243
2244         do
2245         {
2246             char* bufPtr = (char*)tbuf;
2247             dstLen = 8 * SIZEOF_WCHAR_T;
2248
2249             cres = iconv(m2w,
2250                          ICONV_CHAR_CAST(&pszPtr), &srcLen,
2251                          &bufPtr, &dstLen );
2252
2253             res += 8 - (dstLen / SIZEOF_WCHAR_T);
2254         }
2255         while ((cres == (size_t)-1) && (errno == E2BIG));
2256     }
2257
2258     if (ICONV_FAILED(cres, srcLen))
2259     {
2260         //VS: it is ok if iconv fails, hence trace only
2261         wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
2262         return wxCONV_FAILED;
2263     }
2264
2265     return res;
2266 }
2267
2268 size_t wxMBConv_iconv::FromWChar(char *dst, size_t dstLen,
2269                                  const wchar_t *src, size_t srcLen) const
2270 {
2271 #if wxUSE_THREADS
2272     // NB: explained in MB2WC
2273     wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
2274 #endif
2275
2276     if ( srcLen == wxNO_LEN )
2277         srcLen = wxWcslen(src) + 1;
2278
2279     size_t inbuflen = srcLen * SIZEOF_WCHAR_T;
2280     size_t outbuflen = dstLen;
2281     size_t res, cres;
2282
2283     wchar_t *tmpbuf = 0;
2284
2285     if (ms_wcNeedsSwap)
2286     {
2287         // need to copy to temp buffer to switch endianness
2288         // (doing WC_BSWAP twice on the original buffer won't help, as it
2289         //  could be in read-only memory, or be accessed in some other thread)
2290         tmpbuf = (wchar_t *)malloc(inbuflen + SIZEOF_WCHAR_T);
2291         for ( size_t i = 0; i < srcLen; i++ )
2292             tmpbuf[i] = WC_BSWAP(src[i]);
2293
2294         tmpbuf[srcLen] = L'\0';
2295         src = tmpbuf;
2296     }
2297
2298     char* inbuf = (char*)src;
2299     if ( dst )
2300     {
2301         // have destination buffer, convert there
2302         cres = iconv(w2m, ICONV_CHAR_CAST(&inbuf), &inbuflen, &dst, &outbuflen);
2303
2304         res = dstLen - outbuflen;
2305     }
2306     else // no destination buffer
2307     {
2308         // convert using temp buffer to calculate the size of the buffer needed
2309         char tbuf[16];
2310         res = 0;
2311         do
2312         {
2313             dst = tbuf;
2314             outbuflen = 16;
2315
2316             cres = iconv(w2m, ICONV_CHAR_CAST(&inbuf), &inbuflen, &dst, &outbuflen);
2317
2318             res += 16 - outbuflen;
2319         }
2320         while ((cres == (size_t)-1) && (errno == E2BIG));
2321     }
2322
2323     if (ms_wcNeedsSwap)
2324     {
2325         free(tmpbuf);
2326     }
2327
2328     if (ICONV_FAILED(cres, inbuflen))
2329     {
2330         wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
2331         return wxCONV_FAILED;
2332     }
2333
2334     return res;
2335 }
2336
2337 size_t wxMBConv_iconv::GetMBNulLen() const
2338 {
2339     if ( m_minMBCharWidth == 0 )
2340     {
2341         wxMBConv_iconv * const self = wxConstCast(this, wxMBConv_iconv);
2342
2343 #if wxUSE_THREADS
2344         // NB: explained in MB2WC
2345         wxMutexLocker lock(self->m_iconvMutex);
2346 #endif
2347
2348         const wchar_t *wnul = L"";
2349         char buf[8]; // should be enough for NUL in any encoding
2350         size_t inLen = sizeof(wchar_t),
2351                outLen = WXSIZEOF(buf);
2352         char *inBuff = (char *)wnul;
2353         char *outBuff = buf;
2354         if ( iconv(w2m, ICONV_CHAR_CAST(&inBuff), &inLen, &outBuff, &outLen) == (size_t)-1 )
2355         {
2356             self->m_minMBCharWidth = (size_t)-1;
2357         }
2358         else // ok
2359         {
2360             self->m_minMBCharWidth = outBuff - buf;
2361         }
2362     }
2363
2364     return m_minMBCharWidth;
2365 }
2366
2367 #if wxUSE_UNICODE_UTF8
2368 bool wxMBConv_iconv::IsUTF8() const
2369 {
2370     return wxStricmp(m_name, "UTF-8") == 0 ||
2371            wxStricmp(m_name, "UTF8") == 0;
2372 }
2373 #endif
2374
2375 #endif // HAVE_ICONV
2376
2377
2378 // ============================================================================
2379 // Win32 conversion classes
2380 // ============================================================================
2381
2382 #ifdef wxHAVE_WIN32_MB2WC
2383
2384 // from utils.cpp
2385 #if wxUSE_FONTMAP
2386 extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const char *charset);
2387 extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
2388 #endif
2389
2390 class wxMBConv_win32 : public wxMBConv
2391 {
2392 public:
2393     wxMBConv_win32()
2394     {
2395         m_CodePage = CP_ACP;
2396         m_minMBCharWidth = 0;
2397     }
2398
2399     wxMBConv_win32(const wxMBConv_win32& conv)
2400         : wxMBConv()
2401     {
2402         m_CodePage = conv.m_CodePage;
2403         m_minMBCharWidth = conv.m_minMBCharWidth;
2404     }
2405
2406 #if wxUSE_FONTMAP
2407     wxMBConv_win32(const char* name)
2408     {
2409         m_CodePage = wxCharsetToCodepage(name);
2410         m_minMBCharWidth = 0;
2411     }
2412
2413     wxMBConv_win32(wxFontEncoding encoding)
2414     {
2415         m_CodePage = wxEncodingToCodepage(encoding);
2416         m_minMBCharWidth = 0;
2417     }
2418 #endif // wxUSE_FONTMAP
2419
2420     virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2421     {
2422         // note that we have to use MB_ERR_INVALID_CHARS flag as it without it
2423         // the behaviour is not compatible with the Unix version (using iconv)
2424         // and break the library itself, e.g. wxTextInputStream::NextChar()
2425         // wouldn't work if reading an incomplete MB char didn't result in an
2426         // error
2427         //
2428         // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
2429         // Win XP or newer and it is not supported for UTF-[78] so we always
2430         // use our own conversions in this case. See
2431         //     http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
2432         //     http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
2433         if ( m_CodePage == CP_UTF8 )
2434         {
2435             return wxMBConvUTF8().MB2WC(buf, psz, n);
2436         }
2437
2438         if ( m_CodePage == CP_UTF7 )
2439         {
2440             return wxMBConvUTF7().MB2WC(buf, psz, n);
2441         }
2442
2443         int flags = 0;
2444         if ( (m_CodePage < 50000 && m_CodePage != CP_SYMBOL) &&
2445                 IsAtLeastWin2kSP4() )
2446         {
2447             flags = MB_ERR_INVALID_CHARS;
2448         }
2449
2450         const size_t len = ::MultiByteToWideChar
2451                              (
2452                                 m_CodePage,     // code page
2453                                 flags,          // flags: fall on error
2454                                 psz,            // input string
2455                                 -1,             // its length (NUL-terminated)
2456                                 buf,            // output string
2457                                 buf ? n : 0     // size of output buffer
2458                              );
2459         if ( !len )
2460         {
2461             // function totally failed
2462             return wxCONV_FAILED;
2463         }
2464
2465         // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
2466         // check if we succeeded, by doing a double trip:
2467         if ( !flags && buf )
2468         {
2469             const size_t mbLen = strlen(psz);
2470             wxCharBuffer mbBuf(mbLen);
2471             if ( ::WideCharToMultiByte
2472                    (
2473                       m_CodePage,
2474                       0,
2475                       buf,
2476                       -1,
2477                       mbBuf.data(),
2478                       mbLen + 1,        // size in bytes, not length
2479                       NULL,
2480                       NULL
2481                    ) == 0 ||
2482                   strcmp(mbBuf, psz) != 0 )
2483             {
2484                 // we didn't obtain the same thing we started from, hence
2485                 // the conversion was lossy and we consider that it failed
2486                 return wxCONV_FAILED;
2487             }
2488         }
2489
2490         // note that it returns count of written chars for buf != NULL and size
2491         // of the needed buffer for buf == NULL so in either case the length of
2492         // the string (which never includes the terminating NUL) is one less
2493         return len - 1;
2494     }
2495
2496     virtual size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
2497     {
2498         /*
2499             we have a problem here: by default, WideCharToMultiByte() may
2500             replace characters unrepresentable in the target code page with bad
2501             quality approximations such as turning "1/2" symbol (U+00BD) into
2502             "1" for the code pages which don't have it and we, obviously, want
2503             to avoid this at any price
2504
2505             the trouble is that this function does it _silently_, i.e. it won't
2506             even tell us whether it did or not... Win98/2000 and higher provide
2507             WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
2508             we have to resort to a round trip, i.e. check that converting back
2509             results in the same string -- this is, of course, expensive but
2510             otherwise we simply can't be sure to not garble the data.
2511          */
2512
2513         // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
2514         // it doesn't work with CJK encodings (which we test for rather roughly
2515         // here...) nor with UTF-7/8 nor, of course, with Windows versions not
2516         // supporting it
2517         BOOL usedDef wxDUMMY_INITIALIZE(false);
2518         BOOL *pUsedDef;
2519         int flags;
2520         if ( CanUseNoBestFit() && m_CodePage < 50000 )
2521         {
2522             // it's our lucky day
2523             flags = WC_NO_BEST_FIT_CHARS;
2524             pUsedDef = &usedDef;
2525         }
2526         else // old system or unsupported encoding
2527         {
2528             flags = 0;
2529             pUsedDef = NULL;
2530         }
2531
2532         const size_t len = ::WideCharToMultiByte
2533                              (
2534                                 m_CodePage,     // code page
2535                                 flags,          // either none or no best fit
2536                                 pwz,            // input string
2537                                 -1,             // it is (wide) NUL-terminated
2538                                 buf,            // output buffer
2539                                 buf ? n : 0,    // and its size
2540                                 NULL,           // default "replacement" char
2541                                 pUsedDef        // [out] was it used?
2542                              );
2543
2544         if ( !len )
2545         {
2546             // function totally failed
2547             return wxCONV_FAILED;
2548         }
2549
2550         // we did something, check if we really succeeded
2551         if ( flags )
2552         {
2553             // check if the conversion failed, i.e. if any replacements
2554             // were done
2555             if ( usedDef )
2556                 return wxCONV_FAILED;
2557         }
2558         else // we must resort to double tripping...
2559         {
2560             // first we need to ensure that we really have the MB data: this is
2561             // not the case if we're called with NULL buffer, in which case we
2562             // need to do the conversion yet again
2563             wxCharBuffer bufDef;
2564             if ( !buf )
2565             {
2566                 bufDef = wxCharBuffer(len);
2567                 buf = bufDef.data();
2568                 if ( !::WideCharToMultiByte(m_CodePage, flags, pwz, -1,
2569                                             buf, len, NULL, NULL) )
2570                     return wxCONV_FAILED;
2571             }
2572
2573             if ( !n )
2574                 n = wcslen(pwz);
2575             wxWCharBuffer wcBuf(n);
2576             if ( MB2WC(wcBuf.data(), buf, n + 1) == wxCONV_FAILED ||
2577                     wcscmp(wcBuf, pwz) != 0 )
2578             {
2579                 // we didn't obtain the same thing we started from, hence
2580                 // the conversion was lossy and we consider that it failed
2581                 return wxCONV_FAILED;
2582             }
2583         }
2584
2585         // see the comment above for the reason of "len - 1"
2586         return len - 1;
2587     }
2588
2589     virtual size_t GetMBNulLen() const
2590     {
2591         if ( m_minMBCharWidth == 0 )
2592         {
2593             int len = ::WideCharToMultiByte
2594                         (
2595                             m_CodePage,     // code page
2596                             0,              // no flags
2597                             L"",            // input string
2598                             1,              // translate just the NUL
2599                             NULL,           // output buffer
2600                             0,              // and its size
2601                             NULL,           // no replacement char
2602                             NULL            // [out] don't care if it was used
2603                         );
2604
2605             wxMBConv_win32 * const self = wxConstCast(this, wxMBConv_win32);
2606             switch ( len )
2607             {
2608                 default:
2609                     wxLogDebug(_T("Unexpected NUL length %d"), len);
2610                     self->m_minMBCharWidth = (size_t)-1;
2611                     break;
2612
2613                 case 0:
2614                     self->m_minMBCharWidth = (size_t)-1;
2615                     break;
2616
2617                 case 1:
2618                 case 2:
2619                 case 4:
2620                     self->m_minMBCharWidth = len;
2621                     break;
2622             }
2623         }
2624
2625         return m_minMBCharWidth;
2626     }
2627
2628     virtual wxMBConv *Clone() const { return new wxMBConv_win32(*this); }
2629
2630     bool IsOk() const { return m_CodePage != -1; }
2631
2632 private:
2633     static bool CanUseNoBestFit()
2634     {
2635         static int s_isWin98Or2k = -1;
2636
2637         if ( s_isWin98Or2k == -1 )
2638         {
2639             int verMaj, verMin;
2640             switch ( wxGetOsVersion(&verMaj, &verMin) )
2641             {
2642                 case wxOS_WINDOWS_9X:
2643                     s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
2644                     break;
2645
2646                 case wxOS_WINDOWS_NT:
2647                     s_isWin98Or2k = verMaj >= 5;
2648                     break;
2649
2650                 default:
2651                     // unknown: be conservative by default
2652                     s_isWin98Or2k = 0;
2653                     break;
2654             }
2655
2656             wxASSERT_MSG( s_isWin98Or2k != -1, _T("should be set above") );
2657         }
2658
2659         return s_isWin98Or2k == 1;
2660     }
2661
2662     static bool IsAtLeastWin2kSP4()
2663     {
2664 #ifdef __WXWINCE__
2665         return false;
2666 #else
2667         static int s_isAtLeastWin2kSP4 = -1;
2668
2669         if ( s_isAtLeastWin2kSP4 == -1 )
2670         {
2671             OSVERSIONINFOEX ver;
2672
2673             memset(&ver, 0, sizeof(ver));
2674             ver.dwOSVersionInfoSize = sizeof(ver);
2675             GetVersionEx((OSVERSIONINFO*)&ver);
2676
2677             s_isAtLeastWin2kSP4 =
2678               ((ver.dwMajorVersion > 5) || // Vista+
2679                (ver.dwMajorVersion == 5 && ver.dwMinorVersion > 0) || // XP/2003
2680                (ver.dwMajorVersion == 5 && ver.dwMinorVersion == 0 &&
2681                ver.wServicePackMajor >= 4)) // 2000 SP4+
2682               ? 1 : 0;
2683         }
2684
2685         return s_isAtLeastWin2kSP4 == 1;
2686 #endif
2687     }
2688
2689
2690     // the code page we're working with
2691     long m_CodePage;
2692
2693     // cached result of GetMBNulLen(), set to 0 initially meaning
2694     // "unknown"
2695     size_t m_minMBCharWidth;
2696 };
2697
2698 #endif // wxHAVE_WIN32_MB2WC
2699
2700
2701 // ============================================================================
2702 // wxEncodingConverter based conversion classes
2703 // ============================================================================
2704
2705 #if wxUSE_FONTMAP
2706
2707 class wxMBConv_wxwin : public wxMBConv
2708 {
2709 private:
2710     void Init()
2711     {
2712         // Refuse to use broken wxEncodingConverter code for Mac-specific encodings.
2713         // The wxMBConv_cf class does a better job.
2714         m_ok = (m_enc < wxFONTENCODING_MACMIN || m_enc > wxFONTENCODING_MACMAX) &&
2715                m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
2716                w2m.Init(wxFONTENCODING_UNICODE, m_enc);
2717     }
2718
2719 public:
2720     // temporarily just use wxEncodingConverter stuff,
2721     // so that it works while a better implementation is built
2722     wxMBConv_wxwin(const char* name)
2723     {
2724         if (name)
2725             m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
2726         else
2727             m_enc = wxFONTENCODING_SYSTEM;
2728
2729         Init();
2730     }
2731
2732     wxMBConv_wxwin(wxFontEncoding enc)
2733     {
2734         m_enc = enc;
2735
2736         Init();
2737     }
2738
2739     size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
2740     {
2741         size_t inbuf = strlen(psz);
2742         if (buf)
2743         {
2744             if (!m2w.Convert(psz, buf))
2745                 return wxCONV_FAILED;
2746         }
2747         return inbuf;
2748     }
2749
2750     size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
2751     {
2752         const size_t inbuf = wxWcslen(psz);
2753         if (buf)
2754         {
2755             if (!w2m.Convert(psz, buf))
2756                 return wxCONV_FAILED;
2757         }
2758
2759         return inbuf;
2760     }
2761
2762     virtual size_t GetMBNulLen() const
2763     {
2764         switch ( m_enc )
2765         {
2766             case wxFONTENCODING_UTF16BE:
2767             case wxFONTENCODING_UTF16LE:
2768                 return 2;
2769
2770             case wxFONTENCODING_UTF32BE:
2771             case wxFONTENCODING_UTF32LE:
2772                 return 4;
2773
2774             default:
2775                 return 1;
2776         }
2777     }
2778
2779     virtual wxMBConv *Clone() const { return new wxMBConv_wxwin(m_enc); }
2780
2781     bool IsOk() const { return m_ok; }
2782
2783 public:
2784     wxFontEncoding m_enc;
2785     wxEncodingConverter m2w, w2m;
2786
2787 private:
2788     // were we initialized successfully?
2789     bool m_ok;
2790
2791     DECLARE_NO_COPY_CLASS(wxMBConv_wxwin)
2792 };
2793
2794 // make the constructors available for unit testing
2795 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_wxwin( const char* name )
2796 {
2797     wxMBConv_wxwin* result = new wxMBConv_wxwin( name );
2798     if ( !result->IsOk() )
2799     {
2800         delete result;
2801         return 0;
2802     }
2803
2804     return result;
2805 }
2806
2807 #endif // wxUSE_FONTMAP
2808
2809 // ============================================================================
2810 // wxCSConv implementation
2811 // ============================================================================
2812
2813 void wxCSConv::Init()
2814 {
2815     m_name = NULL;
2816     m_convReal =  NULL;
2817     m_deferred = true;
2818 }
2819
2820 wxCSConv::wxCSConv(const wxString& charset)
2821 {
2822     Init();
2823
2824     if ( !charset.empty() )
2825     {
2826         SetName(charset.ToAscii());
2827     }
2828
2829 #if wxUSE_FONTMAP
2830     m_encoding = wxFontMapperBase::GetEncodingFromName(charset);
2831 #else
2832     m_encoding = wxFONTENCODING_SYSTEM;
2833 #endif
2834 }
2835
2836 wxCSConv::wxCSConv(wxFontEncoding encoding)
2837 {
2838     if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
2839     {
2840         wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
2841
2842         encoding = wxFONTENCODING_SYSTEM;
2843     }
2844
2845     Init();
2846
2847     m_encoding = encoding;
2848 }
2849
2850 wxCSConv::~wxCSConv()
2851 {
2852     Clear();
2853 }
2854
2855 wxCSConv::wxCSConv(const wxCSConv& conv)
2856         : wxMBConv()
2857 {
2858     Init();
2859
2860     SetName(conv.m_name);
2861     m_encoding = conv.m_encoding;
2862 }
2863
2864 wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
2865 {
2866     Clear();
2867
2868     SetName(conv.m_name);
2869     m_encoding = conv.m_encoding;
2870
2871     return *this;
2872 }
2873
2874 void wxCSConv::Clear()
2875 {
2876     free(m_name);
2877     delete m_convReal;
2878
2879     m_name = NULL;
2880     m_convReal = NULL;
2881 }
2882
2883 void wxCSConv::SetName(const char *charset)
2884 {
2885     if (charset)
2886     {
2887         m_name = wxStrdup(charset);
2888         m_deferred = true;
2889     }
2890 }
2891
#if wxUSE_FONTMAP

// Map from an encoding to the name under which an iconv converter could be
// created for it; an empty name records that all the names have failed.
WX_DECLARE_HASH_MAP( wxFontEncoding,
                     wxString,
                     wxIntegerHash,
                     wxIntegerEqual,
                     wxEncodingNameCache );

// the cache consulted and filled by wxCSConv::DoCreate()
static wxEncodingNameCache gs_nameCache;

#endif // wxUSE_FONTMAP
2899
2900 wxMBConv *wxCSConv::DoCreate() const
2901 {
2902 #if wxUSE_FONTMAP
2903     wxLogTrace(TRACE_STRCONV,
2904                wxT("creating conversion for %s"),
2905                (m_name ? m_name
2906                        : (const char*)wxFontMapperBase::GetEncodingName(m_encoding).mb_str()));
2907 #endif // wxUSE_FONTMAP
2908
2909     // check for the special case of ASCII or ISO8859-1 charset: as we have
2910     // special knowledge of it anyhow, we don't need to create a special
2911     // conversion object
2912     if ( m_encoding == wxFONTENCODING_ISO8859_1 ||
2913             m_encoding == wxFONTENCODING_DEFAULT )
2914     {
2915         // don't convert at all
2916         return NULL;
2917     }
2918
2919     // we trust OS to do conversion better than we can so try external
2920     // conversion methods first
2921     //
2922     // the full order is:
2923     //      1. OS conversion (iconv() under Unix or Win32 API)
2924     //      2. hard coded conversions for UTF
2925     //      3. wxEncodingConverter as fall back
2926
2927     // step (1)
2928 #ifdef HAVE_ICONV
2929 #if !wxUSE_FONTMAP
2930     if ( m_name )
2931 #endif // !wxUSE_FONTMAP
2932     {
2933 #if wxUSE_FONTMAP
2934         wxFontEncoding encoding(m_encoding);
2935 #endif
2936
2937         if ( m_name )
2938         {
2939             wxMBConv_iconv *conv = new wxMBConv_iconv(m_name);
2940             if ( conv->IsOk() )
2941                 return conv;
2942
2943             delete conv;
2944
2945 #if wxUSE_FONTMAP
2946             encoding =
2947                 wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
2948 #endif // wxUSE_FONTMAP
2949         }
2950 #if wxUSE_FONTMAP
2951         {
2952             const wxEncodingNameCache::iterator it = gs_nameCache.find(encoding);
2953             if ( it != gs_nameCache.end() )
2954             {
2955                 if ( it->second.empty() )
2956                     return NULL;
2957
2958                 wxMBConv_iconv *conv = new wxMBConv_iconv(it->second.ToAscii());
2959                 if ( conv->IsOk() )
2960                     return conv;
2961
2962                 delete conv;
2963             }
2964
2965             const wxChar** names = wxFontMapperBase::GetAllEncodingNames(encoding);
2966             // CS : in case this does not return valid names (eg for MacRoman)
2967             // encoding got a 'failure' entry in the cache all the same,
2968             // although it just has to be created using a different method, so
2969             // only store failed iconv creation attempts (or perhaps we
2970             // shoulnd't do this at all ?)
2971             if ( names[0] != NULL )
2972             {
2973                 for ( ; *names; ++names )
2974                 {
2975                     // FIXME-UTF8: wxFontMapperBase::GetAllEncodingNames()
2976                     //             will need changes that will obsolete this
2977                     wxString name(*names);
2978                     wxMBConv_iconv *conv = new wxMBConv_iconv(name.ToAscii());
2979                     if ( conv->IsOk() )
2980                     {
2981                         gs_nameCache[encoding] = *names;
2982                         return conv;
2983                     }
2984
2985                     delete conv;
2986                 }
2987
2988                 gs_nameCache[encoding] = _T(""); // cache the failure
2989             }
2990         }
2991 #endif // wxUSE_FONTMAP
2992     }
2993 #endif // HAVE_ICONV
2994
2995 #ifdef wxHAVE_WIN32_MB2WC
2996     {
2997 #if wxUSE_FONTMAP
2998         wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
2999                                       : new wxMBConv_win32(m_encoding);
3000         if ( conv->IsOk() )
3001             return conv;
3002
3003         delete conv;
3004 #else
3005         return NULL;
3006 #endif
3007     }
3008 #endif // wxHAVE_WIN32_MB2WC
3009
3010 #ifdef __DARWIN__
3011     {
3012         // leave UTF16 and UTF32 to the built-ins of wx
3013         if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
3014             ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
3015         {
3016 #if wxUSE_FONTMAP
3017             wxMBConv_cf *conv = m_name ? new wxMBConv_cf(m_name)
3018                                           : new wxMBConv_cf(m_encoding);
3019 #else
3020             wxMBConv_cf *conv = new wxMBConv_cf(m_encoding);
3021 #endif
3022
3023             if ( conv->IsOk() )
3024                  return conv;
3025
3026             delete conv;
3027         }
3028     }
3029 #endif // __DARWIN__
3030
3031     // step (2)
3032     wxFontEncoding enc = m_encoding;
3033 #if wxUSE_FONTMAP
3034     if ( enc == wxFONTENCODING_SYSTEM && m_name )
3035     {
3036         // use "false" to suppress interactive dialogs -- we can be called from
3037         // anywhere and popping up a dialog from here is the last thing we want to
3038         // do
3039         enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
3040     }
3041 #endif // wxUSE_FONTMAP
3042
3043     switch ( enc )
3044     {
3045         case wxFONTENCODING_UTF7:
3046              return new wxMBConvUTF7;
3047
3048         case wxFONTENCODING_UTF8:
3049              return new wxMBConvUTF8;
3050
3051         case wxFONTENCODING_UTF16BE:
3052              return new wxMBConvUTF16BE;
3053
3054         case wxFONTENCODING_UTF16LE:
3055              return new wxMBConvUTF16LE;
3056
3057         case wxFONTENCODING_UTF32BE:
3058              return new wxMBConvUTF32BE;
3059
3060         case wxFONTENCODING_UTF32LE:
3061              return new wxMBConvUTF32LE;
3062
3063         default:
3064              // nothing to do but put here to suppress gcc warnings
3065              break;
3066     }
3067
3068     // step (3)
3069 #if wxUSE_FONTMAP
3070     {
3071         wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
3072                                       : new wxMBConv_wxwin(m_encoding);
3073         if ( conv->IsOk() )
3074             return conv;
3075
3076         delete conv;
3077     }
3078 #endif // wxUSE_FONTMAP
3079
3080     // NB: This is a hack to prevent deadlock. What could otherwise happen
3081     //     in Unicode build: wxConvLocal creation ends up being here
3082     //     because of some failure and logs the error. But wxLog will try to
3083     //     attach a timestamp, for which it will need wxConvLocal (to convert
3084     //     time to char* and then wchar_t*), but that fails, tries to log the
3085     //     error, but wxLog has an (already locked) critical section that
3086     //     guards the static buffer.
3087     static bool alreadyLoggingError = false;
3088     if (!alreadyLoggingError)
3089     {
3090         alreadyLoggingError = true;
3091         wxLogError(_("Cannot convert from the charset '%s'!"),
3092                    m_name ? m_name
3093                       :
3094 #if wxUSE_FONTMAP
3095                          (const char*)wxFontMapperBase::GetEncodingDescription(m_encoding).ToAscii()
3096 #else // !wxUSE_FONTMAP
3097                          (const char*)wxString::Format(_("encoding %i"), m_encoding).ToAscii()
3098 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
3099               );
3100
3101         alreadyLoggingError = false;
3102     }
3103
3104     return NULL;
3105 }
3106
3107 void wxCSConv::CreateConvIfNeeded() const
3108 {
3109     if ( m_deferred )
3110     {
3111         wxCSConv *self = (wxCSConv *)this; // const_cast
3112
3113         // if we don't have neither the name nor the encoding, use the default
3114         // encoding for this system
3115         if ( !m_name && m_encoding == wxFONTENCODING_SYSTEM )
3116         {
3117 #if wxUSE_INTL
3118             self->m_encoding = wxLocale::GetSystemEncoding();
3119 #else
3120             // fallback to some reasonable default:
3121             self->m_encoding = wxFONTENCODING_ISO8859_1;
3122 #endif // wxUSE_INTL
3123         }
3124
3125         self->m_convReal = DoCreate();
3126         self->m_deferred = false;
3127     }
3128 }
3129
3130 bool wxCSConv::IsOk() const
3131 {
3132     CreateConvIfNeeded();
3133
3134     // special case: no convReal created for wxFONTENCODING_ISO8859_1
3135     if ( m_encoding == wxFONTENCODING_ISO8859_1 )
3136         return true; // always ok as we do it ourselves
3137
3138     // m_convReal->IsOk() is called at its own creation, so we know it must
3139     // be ok if m_convReal is non-NULL
3140     return m_convReal != NULL;
3141 }
3142
3143 size_t wxCSConv::ToWChar(wchar_t *dst, size_t dstLen,
3144                          const char *src, size_t srcLen) const
3145 {
3146     CreateConvIfNeeded();
3147
3148     if (m_convReal)
3149         return m_convReal->ToWChar(dst, dstLen, src, srcLen);
3150
3151     // latin-1 (direct)
3152     return wxMBConv::ToWChar(dst, dstLen, src, srcLen);
3153 }
3154
3155 size_t wxCSConv::FromWChar(char *dst, size_t dstLen,
3156                            const wchar_t *src, size_t srcLen) const
3157 {
3158     CreateConvIfNeeded();
3159
3160     if (m_convReal)
3161         return m_convReal->FromWChar(dst, dstLen, src, srcLen);
3162
3163     // latin-1 (direct)
3164     return wxMBConv::FromWChar(dst, dstLen, src, srcLen);
3165 }
3166
3167 size_t wxCSConv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
3168 {
3169     CreateConvIfNeeded();
3170
3171     if (m_convReal)
3172         return m_convReal->MB2WC(buf, psz, n);
3173
3174     // latin-1 (direct)
3175     size_t len = strlen(psz);
3176
3177     if (buf)
3178     {
3179         for (size_t c = 0; c <= len; c++)
3180             buf[c] = (unsigned char)(psz[c]);
3181     }
3182
3183     return len;
3184 }
3185
3186 size_t wxCSConv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
3187 {
3188     CreateConvIfNeeded();
3189
3190     if (m_convReal)
3191         return m_convReal->WC2MB(buf, psz, n);
3192
3193     // latin-1 (direct)
3194     const size_t len = wxWcslen(psz);
3195     if (buf)
3196     {
3197         for (size_t c = 0; c <= len; c++)
3198         {
3199             if (psz[c] > 0xFF)
3200                 return wxCONV_FAILED;
3201
3202             buf[c] = (char)psz[c];
3203         }
3204     }
3205     else
3206     {
3207         for (size_t c = 0; c <= len; c++)
3208         {
3209             if (psz[c] > 0xFF)
3210                 return wxCONV_FAILED;
3211         }
3212     }
3213
3214     return len;
3215 }
3216
3217 size_t wxCSConv::GetMBNulLen() const
3218 {
3219     CreateConvIfNeeded();
3220
3221     if ( m_convReal )
3222     {
3223         return m_convReal->GetMBNulLen();
3224     }
3225
3226     // otherwise, we are ISO-8859-1
3227     return 1;
3228 }
3229
3230 #if wxUSE_UNICODE_UTF8
3231 bool wxCSConv::IsUTF8() const
3232 {
3233     CreateConvIfNeeded();
3234
3235     if ( m_convReal )
3236     {
3237         return m_convReal->IsUTF8();
3238     }
3239
3240     // otherwise, we are ISO-8859-1
3241     return false;
3242 }
3243 #endif
3244
3245
3246 #if wxUSE_UNICODE
3247
3248 wxWCharBuffer wxSafeConvertMB2WX(const char *s)
3249 {
3250     if ( !s )
3251         return wxWCharBuffer();
3252
3253     wxWCharBuffer wbuf(wxConvLibc.cMB2WX(s));
3254     if ( !wbuf )
3255         wbuf = wxMBConvUTF8().cMB2WX(s);
3256     if ( !wbuf )
3257         wbuf = wxConvISO8859_1.cMB2WX(s);
3258
3259     return wbuf;
3260 }
3261
3262 wxCharBuffer wxSafeConvertWX2MB(const wchar_t *ws)
3263 {
3264     if ( !ws )
3265         return wxCharBuffer();
3266
3267     wxCharBuffer buf(wxConvLibc.cWX2MB(ws));
3268     if ( !buf )
3269         buf = wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL).cWX2MB(ws);
3270
3271     return buf;
3272 }
3273
3274 #endif // wxUSE_UNICODE
3275
3276 // ----------------------------------------------------------------------------
3277 // globals
3278 // ----------------------------------------------------------------------------
3279
// NB: The reason why we create the converter objects in this convoluted way,
//     using a factory function instead of a global variable, is that they
//     may be used at static initialization time (some of them are used by
//     wxString ctors and there may be a global wxString object), in other
//     words possibly _before_ the global converter object itself would be
//     initialized.
3286
3287 #undef wxConvLibc
3288 #undef wxConvUTF8
3289 #undef wxConvUTF7
3290 #undef wxConvLocal
3291 #undef wxConvISO8859_1
3292
// Define the global converter called "name": this defines the name##Ptr
// pointer (initially NULL) and the wxGet_##name##Ptr() accessor returning
// the address of a function-local static object of type real_klass
// constructed with init_args, as a pointer to base_klass.
#define WX_DEFINE_GLOBAL_CONV2(base_klass, real_klass, name, init_args)     \
    WXDLLIMPEXP_DATA_BASE(base_klass*) name##Ptr = NULL;                    \
    WXDLLIMPEXP_BASE base_klass* wxGet_##name##Ptr()                        \
    {                                                                       \
        static real_klass name##Obj init_args;                              \
        return &name##Obj;                                                  \
    }                                                                       \
    /* calling the accessor from a static initializer ensures that all */  \
    /* global converter objects are created by the time static */          \
    /* initialization is done, i.e. before any thread is launched: */      \
    static base_klass* gs_##name##instance = wxGet_##name##Ptr()

// Same as above for the common case when the object is accessed via a
// pointer to its own class.
#define WX_DEFINE_GLOBAL_CONV(conv_klass, name, init_args) \
    WX_DEFINE_GLOBAL_CONV2(conv_klass, conv_klass, name, init_args)
3307
3308 #ifdef __WINDOWS__
3309     WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConv_win32, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
3310 #else
3311     WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConvLibc, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
3312 #endif
3313
3314 // NB: we can't use wxEMPTY_PARAMETER_VALUE as final argument here because it's
3315 //     passed to WX_DEFINE_GLOBAL_CONV2 after a macro expansion and so still
3316 //     provokes an error message about "not enough macro parameters"; and we
3317 //     can't use "()" here as the name##Obj declaration would be parsed as a
3318 //     function declaration then, so use a semicolon and live with an extra
3319 //     empty statement (and hope that no compilers warns about this)
3320 WX_DEFINE_GLOBAL_CONV(wxMBConvStrictUTF8, wxConvUTF8, ;);
3321 WX_DEFINE_GLOBAL_CONV(wxMBConvUTF7, wxConvUTF7, ;);
3322
3323 WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvLocal, (wxFONTENCODING_SYSTEM));
3324 WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvISO8859_1, (wxFONTENCODING_ISO8859_1));
3325
3326 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = wxGet_wxConvLibcPtr();
3327 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvUI = wxGet_wxConvLocalPtr();
3328
3329 #ifdef __DARWIN__
3330 // The xnu kernel always communicates file paths in decomposed UTF-8.
3331 // WARNING: Are we sure that CFString's conversion will cause decomposition?
3332 static wxMBConv_cf wxConvMacUTF8DObj(wxFONTENCODING_UTF8);
3333 #endif
3334
3335 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName =
3336 #ifdef __DARWIN__
3337                                     &wxConvMacUTF8DObj;
3338 #else // !__DARWIN__
3339                                     wxGet_wxConvLibcPtr();
3340 #endif // __DARWIN__/!__DARWIN__
3341
3342 #else // !wxUSE_WCHAR_T
3343
3344 // FIXME-UTF8: remove this, wxUSE_WCHAR_T is required now
3345 // stand-ins in absence of wchar_t
3346 WXDLLIMPEXP_DATA_BASE(wxMBConv) wxConvLibc,
3347                                 wxConvISO8859_1,
3348                                 wxConvLocal,
3349                                 wxConvUTF8;
3350
3351 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T