src/common/strconv.cpp

   1 /////////////////////////////////////////////////////////////////////////////
   2 // Name:        src/common/strconv.cpp
   3 // Purpose:     Unicode conversion classes
   4 // Author:      Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
   5 //              Ryan Norton, Fredrik Roubert (UTF7)
   6 // Modified by:
   7 // Created:     29/01/98
   8 // RCS-ID:      $Id$
   9 // Copyright:   (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
  10 //              (c) 2000-2003 Vadim Zeitlin
  11 //              (c) 2004 Ryan Norton, Fredrik Roubert
  12 // Licence:     wxWindows licence
  13 /////////////////////////////////////////////////////////////////////////////
  14
  15 // For compilers that support precompilation, includes "wx.h".
  16 #include "wx/wxprec.h"
  17
  18 #ifdef __BORLANDC__
  19     #pragma hdrstop
  20 #endif  //__BORLANDC__
  21
  22 #ifndef WX_PRECOMP
  23     #include "wx/intl.h"
  24     #include "wx/log.h"
  25     #include "wx/utils.h"
  26     #include "wx/hashmap.h"
  27 #endif
  28
  29 #include "wx/strconv.h"
  30
  31 #if wxUSE_WCHAR_T
  32
  33 #ifndef __WXWINCE__
  34 #include <errno.h>
  35 #endif
  36
  37 #include <ctype.h>
  38 #include <string.h>
  39 #include <stdlib.h>
  40
  41 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
  42     #include "wx/msw/private.h"
  43     #include "wx/msw/missing.h"
  44     #define wxHAVE_WIN32_MB2WC
  45 #endif
  46
  47 #ifdef HAVE_ICONV
  48     #include <iconv.h>
  49     #include "wx/thread.h"
  50 #endif
  51
  52 #include "wx/encconv.h"
  53 #include "wx/fontmap.h"
  54
  55 #ifdef __DARWIN__
  56 #include "wx/osx/core/private/strconv_cf.h"
  57 #endif //def __DARWIN__
  58
  59
  60 #define TRACE_STRCONV _T("strconv")
  61
  62 // WC_UTF16 is defined only if sizeof(wchar_t) == 2, otherwise it's supposed to
  63 // be 4 bytes
  64 #if SIZEOF_WCHAR_T == 2
  65     #define WC_UTF16
  66 #endif
  67
  68
  69 // ============================================================================
  70 // implementation
  71 // ============================================================================
  72
  73 // helper function of cMB2WC(): check if n bytes at this location are all NUL
  74 static bool NotAllNULs(const char *p, size_t n)
  75 {
  76     while ( n && *p++ == '\0' )
  77         n--;
  78
  79     return n != 0;
  80 }
  81
  82 // ----------------------------------------------------------------------------
  83 // UTF-16 en/decoding to/from UCS-4 with surrogates handling
  84 // ----------------------------------------------------------------------------
  85
  86 static size_t encode_utf16(wxUint32 input, wxUint16 *output)
  87 {
  88     if (input <= 0xffff)
  89     {
  90         if (output)
  91             *output = (wxUint16) input;
  92
  93         return 1;
  94     }
  95     else if (input >= 0x110000)
  96     {
  97         return wxCONV_FAILED;
  98     }
  99     else
 100     {
 101         if (output)
 102         {
 103             *output++ = (wxUint16) ((input >> 10) + 0xd7c0);
 104             *output = (wxUint16) ((input & 0x3ff) + 0xdc00);
 105         }
 106
 107         return 2;
 108     }
 109 }
 110
 111 static size_t decode_utf16(const wxUint16* input, wxUint32& output)
 112 {
 113     if ((*input < 0xd800) || (*input > 0xdfff))
 114     {
 115         output = *input;
 116         return 1;
 117     }
 118     else if ((input[1] < 0xdc00) || (input[1] > 0xdfff))
 119     {
 120         output = *input;
 121         return wxCONV_FAILED;
 122     }
 123     else
 124     {
 125         output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
 126         return 2;
 127     }
 128 }
 129
 130 #ifdef WC_UTF16
 131     typedef wchar_t wxDecodeSurrogate_t;
 132 #else // !WC_UTF16
 133     typedef wxUint16 wxDecodeSurrogate_t;
 134 #endif // WC_UTF16/!WC_UTF16
 135
 136 // returns the next UTF-32 character from the wchar_t buffer and advances the
 137 // pointer to the character after this one
 138 //
 139 // if an invalid character is found, *pSrc is set to NULL, the caller must
 140 // check for this
 141 static wxUint32 wxDecodeSurrogate(const wxDecodeSurrogate_t **pSrc)
 142 {
 143     wxUint32 out;
 144     const size_t
 145         n = decode_utf16(wx_reinterpret_cast(const wxUint16 *, *pSrc), out);
 146     if ( n == wxCONV_FAILED )
 147         *pSrc = NULL;
 148     else
 149         *pSrc += n;
 150
 151     return out;
 152 }
 153
 154 // ----------------------------------------------------------------------------
 155 // wxMBConv
 156 // ----------------------------------------------------------------------------
 157
// Default implementation of the multibyte to wide character conversion in
// terms of the old MB2WC().
//
// src is the input, either of length srcLen bytes or NUL-terminated if srcLen
// is wxNO_LEN. If dst is NULL nothing is stored and only the required length
// is computed, otherwise dstLen is the size of dst in wide characters.
//
// Returns the number of wide characters [which would be] written, which
// accounts for the trailing NUL only if srcLen is wxNO_LEN, or wxCONV_FAILED
// if the conversion failed or if dstLen is too small.
size_t
wxMBConv::ToWChar(wchar_t *dst, size_t dstLen,
                  const char *src, size_t srcLen) const
{
    // although new conversion classes are supposed to implement this function
    // directly, the existing ones only implement the old MB2WC() and so, to
    // avoid having to rewrite all conversion classes at once, we provide a
    // default (but not efficient) implementation of this one in terms of the
    // old function by copying the input to ensure that it's NUL-terminated and
    // then using MB2WC() to convert it

    // the number of chars [which would be] written to dst [if it were not NULL]
    size_t dstWritten = 0;

    // the number of NULs terminating this string
    size_t nulLen = 0;  // not really needed, but just to avoid warnings

    // if we were not given the input size we just have to assume that the
    // string is properly terminated as we have no way of knowing how long it
    // is anyhow, but if we do have the size check whether there are enough
    // NULs at the end
    wxCharBuffer bufTmp;
    const char *srcEnd;
    if ( srcLen != wxNO_LEN )
    {
        // we need to know how to find the end of this string
        nulLen = GetMBNulLen();
        if ( nulLen == wxCONV_FAILED )
            return wxCONV_FAILED;

        // if there are enough NULs we can avoid the copy
        if ( srcLen < nulLen || NotAllNULs(src + srcLen - nulLen, nulLen) )
        {
            // make a copy in order to properly NUL-terminate the string
            bufTmp = wxCharBuffer(srcLen + nulLen - 1 /* 1 will be added */);
            char * const p = bufTmp.data();
            memcpy(p, src, srcLen);

            // append nulLen NUL bytes after the copied data
            for ( char *s = p + srcLen; s < p + srcLen + nulLen; s++ )
                *s = '\0';

            src = bufTmp;
        }

        srcEnd = src + srcLen;
    }
    else // quit after the first loop iteration
    {
        // NULL srcEnd is used below as the "input is NUL-terminated" flag
        srcEnd = NULL;
    }

    // convert the input chunk by chunk, where a chunk is everything up to the
    // next terminator: there may be more than one of them if the length was
    // explicitly specified
    for ( ;; )
    {
        // try to convert the current chunk
        size_t lenChunk = MB2WC(NULL, src, 0);
        if ( lenChunk == wxCONV_FAILED )
            return wxCONV_FAILED;

        // the trailing NUL is only counted for NUL-terminated input
        dstWritten += lenChunk;
        if ( !srcEnd )
            dstWritten++;

        if ( !lenChunk )
        {
            // nothing left in the input string, conversion succeeded
            break;
        }

        if ( dst )
        {
            if ( dstWritten > dstLen )
                return wxCONV_FAILED;

            // +1 is for trailing NUL
            //
            // NOTE(review): when srcLen was given, only lenChunk characters
            // were checked against dstLen above but MB2WC() is allowed to
            // use lenChunk + 1 here -- confirm that dst always has room for
            // the extra NUL
            if ( MB2WC(dst, src, lenChunk + 1) == wxCONV_FAILED )
                return wxCONV_FAILED;

            dst += lenChunk;
            if ( !srcEnd )
                dst++;
        }

        if ( !srcEnd )
        {
            // we convert just one chunk in this case as this is the entire
            // string anyhow
            break;
        }

        // advance the input pointer past the end of this chunk
        while ( NotAllNULs(src, nulLen) )
        {
            // notice that we must skip over multiple bytes here as we suppose
            // that if NUL takes 2 or 4 bytes, then all the other characters do
            // too and so if advanced by a single byte we might erroneously
            // detect sequences of NUL bytes in the middle of the input
            src += nulLen;
        }

        src += nulLen; // skipping over its terminator as well

        // note that ">=" (and not just "==") is needed here as the terminator
        // we skipped just above could be inside or just after the buffer
        // delimited by srcEnd
        if ( src >= srcEnd )
            break;
    }

    return dstWritten;
}
 267
 268 size_t
 269 wxMBConv::FromWChar(char *dst, size_t dstLen,
 270                     const wchar_t *src, size_t srcLen) const
 271 {
 272     // the number of chars [which would be] written to dst [if it were not NULL]
 273     size_t dstWritten = 0;
 274
 275     // if we don't know its length we have no choice but to assume that it is
 276     // NUL-terminated (notice that it can still be NUL-terminated even if
 277     // explicit length is given but it doesn't change our return value)
 278     const bool isNulTerminated = srcLen == wxNO_LEN;
 279
 280     // make a copy of the input string unless it is already properly
 281     // NUL-terminated
 282     wxWCharBuffer bufTmp;
 283     if ( isNulTerminated )
 284     {
 285         srcLen = wxWcslen(src) + 1;
 286     }
 287     else if ( srcLen != 0 && src[srcLen - 1] != L'\0' )
 288     {
 289         // make a copy in order to properly NUL-terminate the string
 290         bufTmp = wxWCharBuffer(srcLen);
 291         memcpy(bufTmp.data(), src, srcLen * sizeof(wchar_t));
 292         src = bufTmp;
 293     }
 294
 295     const size_t lenNul = GetMBNulLen();
 296     for ( const wchar_t * const srcEnd = src + srcLen;
 297           src < srcEnd;
 298           src += wxWcslen(src) + 1 /* skip L'\0' too */ )
 299     {
 300         // try to convert the current chunk
 301         size_t lenChunk = WC2MB(NULL, src, 0);
 302
 303         if ( lenChunk == wxCONV_FAILED )
 304             return wxCONV_FAILED;
 305
 306         dstWritten += lenChunk;
 307         if ( isNulTerminated )
 308             dstWritten += lenNul;
 309
 310         if ( dst )
 311         {
 312             if ( dstWritten > dstLen )
 313                 return wxCONV_FAILED;
 314
 315             if ( WC2MB(dst, src, lenChunk + lenNul) == wxCONV_FAILED )
 316                 return wxCONV_FAILED;
 317
 318             dst += lenChunk;
 319             if ( isNulTerminated )
 320                 dst += lenNul;
 321         }
 322     }
 323
 324     return dstWritten;
 325 }
 326
 327 size_t wxMBConv::MB2WC(wchar_t *outBuff, const char *inBuff, size_t outLen) const
 328 {
 329     size_t rc = ToWChar(outBuff, outLen, inBuff);
 330     if ( rc != wxCONV_FAILED )
 331     {
 332         // ToWChar() returns the buffer length, i.e. including the trailing
 333         // NUL, while this method doesn't take it into account
 334         rc--;
 335     }
 336
 337     return rc;
 338 }
 339
 340 size_t wxMBConv::WC2MB(char *outBuff, const wchar_t *inBuff, size_t outLen) const
 341 {
 342     size_t rc = FromWChar(outBuff, outLen, inBuff);
 343     if ( rc != wxCONV_FAILED )
 344     {
 345         rc -= GetMBNulLen();
 346     }
 347
 348     return rc;
 349 }
 350
 351 wxMBConv::~wxMBConv()
 352 {
 353     // nothing to do here (necessary for Darwin linking probably)
 354 }
 355
 356 const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
 357 {
 358     if ( psz )
 359     {
 360         // calculate the length of the buffer needed first
 361         const size_t nLen = ToWChar(NULL, 0, psz);
 362         if ( nLen != wxCONV_FAILED )
 363         {
 364             // now do the actual conversion
 365             wxWCharBuffer buf(nLen - 1 /* +1 added implicitly */);
 366
 367             // +1 for the trailing NULL
 368             if ( ToWChar(buf.data(), nLen, psz) != wxCONV_FAILED )
 369                 return buf;
 370         }
 371     }
 372
 373     return wxWCharBuffer();
 374 }
 375
 376 const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
 377 {
 378     if ( pwz )
 379     {
 380         const size_t nLen = FromWChar(NULL, 0, pwz);
 381         if ( nLen != wxCONV_FAILED )
 382         {
 383             wxCharBuffer buf(nLen - 1);
 384             if ( FromWChar(buf.data(), nLen, pwz) != wxCONV_FAILED )
 385                 return buf;
 386         }
 387     }
 388
 389     return wxCharBuffer();
 390 }
 391
 392 const wxWCharBuffer
 393 wxMBConv::cMB2WC(const char *inBuff, size_t inLen, size_t *outLen) const
 394 {
 395     const size_t dstLen = ToWChar(NULL, 0, inBuff, inLen);
 396     if ( dstLen != wxCONV_FAILED )
 397     {
 398         // notice that we allocate space for dstLen+1 wide characters here
 399         // because we want the buffer to always be NUL-terminated, even if the
 400         // input isn't (as otherwise the caller has no way to know its length)
 401         wxWCharBuffer wbuf(dstLen);
 402         wbuf.data()[dstLen] = L'\0';
 403         if ( ToWChar(wbuf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
 404         {
 405             if ( outLen )
 406             {
 407                 *outLen = dstLen;
 408
 409                 // we also need to handle NUL-terminated input strings
 410                 // specially: for them the output is the length of the string
 411                 // excluding the trailing NUL, however if we're asked to
 412                 // convert a specific number of characters we return the length
 413                 // of the resulting output even if it's NUL-terminated
 414                 if ( inLen == wxNO_LEN )
 415                     (*outLen)--;
 416             }
 417
 418             return wbuf;
 419         }
 420     }
 421
 422     if ( outLen )
 423         *outLen = 0;
 424
 425     return wxWCharBuffer();
 426 }
 427
 428 const wxCharBuffer
 429 wxMBConv::cWC2MB(const wchar_t *inBuff, size_t inLen, size_t *outLen) const
 430 {
 431     size_t dstLen = FromWChar(NULL, 0, inBuff, inLen);
 432     if ( dstLen != wxCONV_FAILED )
 433     {
 434         const size_t nulLen = GetMBNulLen();
 435
 436         // as above, ensure that the buffer is always NUL-terminated, even if
 437         // the input is not
 438         wxCharBuffer buf(dstLen + nulLen - 1);
 439         memset(buf.data() + dstLen, 0, nulLen);
 440         if ( FromWChar(buf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
 441         {
 442             if ( outLen )
 443             {
 444                 *outLen = dstLen;
 445
 446                 if ( inLen == wxNO_LEN )
 447                 {
 448                     // in this case both input and output are NUL-terminated
 449                     // and we're not supposed to count NUL
 450                     *outLen -= nulLen;
 451                 }
 452             }
 453
 454             return buf;
 455         }
 456     }
 457
 458     if ( outLen )
 459         *outLen = 0;
 460
 461     return wxCharBuffer();
 462 }
 463
 464 // ----------------------------------------------------------------------------
 465 // wxMBConvLibc
 466 // ----------------------------------------------------------------------------
 467
 468 size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 469 {
 470     return wxMB2WC(buf, psz, n);
 471 }
 472
 473 size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 474 {
 475     return wxWC2MB(buf, psz, n);
 476 }
 477
 478 // ----------------------------------------------------------------------------
 479 // wxConvBrokenFileNames
 480 // ----------------------------------------------------------------------------
 481
 482 #ifdef __UNIX__
 483
 484 wxConvBrokenFileNames::wxConvBrokenFileNames(const wxString& charset)
 485 {
 486     if ( wxStricmp(charset, _T("UTF-8")) == 0 ||
 487          wxStricmp(charset, _T("UTF8")) == 0  )
 488         m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_PUA);
 489     else
 490         m_conv = new wxCSConv(charset);
 491 }
 492
 493 #endif // __UNIX__
 494
 495 // ----------------------------------------------------------------------------
 496 // UTF-7
 497 // ----------------------------------------------------------------------------
 498
 499 // Implementation (C) 2004 Fredrik Roubert
 500 //
 501 // Changes to work in streaming mode (C) 2008 Vadim Zeitlin
 502
 503 //
 504 // BASE64 decoding table
 505 //
 506 static const unsigned char utf7unb64[] =
 507 {
 508     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 509     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 510     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 511     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 512     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 513     0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
 514     0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b,
 515     0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 516     0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
 517     0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
 518     0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
 519     0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
 520     0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
 521     0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
 522     0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30,
 523     0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
 524     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 525     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 526     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 527     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 528     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 529     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 530     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 531     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 532     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 533     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 534     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 535     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 536     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 537     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 538     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 539     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
 540 };
 541
// Decodes UTF-7 input into wide characters.
//
// If srcLen is wxNO_LEN, the entire NUL-terminated string src, including its
// trailing NUL, is converted using a local decoder state. Otherwise exactly
// srcLen bytes are processed using, and updating, the shift state kept in
// m_stateDecoder, so that the input can be decoded in several chunks.
//
// If dst is NULL nothing is stored and the input is only scanned, otherwise
// at most dstLen wide characters are stored in dst.
//
// Returns the number of wide characters [which would be] written or
// wxCONV_FAILED if the input is invalid or if no characters were produced.
size_t wxMBConvUTF7::ToWChar(wchar_t *dst, size_t dstLen,
                             const char *src, size_t srcLen) const
{
    DecoderState stateOrig,
         *statePtr;
    if ( srcLen == wxNO_LEN )
    {
        // convert the entire string, up to and including the trailing NUL
        srcLen = strlen(src) + 1;

        // when working on the entire strings we don't update nor use the shift
        // state from the previous call
        statePtr = &stateOrig;
    }
    else // when working with partial strings we do use the shift state
    {
        // this method is const, so the cast is needed to update the member
        statePtr = wx_const_cast(DecoderState *, &m_stateDecoder);

        // also save the old state to be able to rollback to it on error
        //
        // NOTE(review): only the "no output" case at the end of this
        // function restores it, the early returns on invalid input below
        // leave the stored state modified -- confirm that this is intended
        stateOrig = m_stateDecoder;
    }

    // but to simplify the code below we use this variable in both cases
    DecoderState& state = *statePtr;


    // number of characters [which would have been] written to dst [if it were
    // not NULL]
    size_t len = 0;

    const char * const srcEnd = src + srcLen;

    // process the input byte by byte until we run out of either the input or
    // the space in the output buffer (if we have any)
    while ( (src < srcEnd) && (!dst || (len < dstLen)) )
    {
        const unsigned char cc = *src++;

        if ( state.IsShifted() )
        {
            // 0xff in the decoding table means "not a BASE64 character"
            const unsigned char dc = utf7unb64[cc];
            if ( dc == 0xff )
            {
                // end of encoded part
                state.ToDirect();

                // re-parse this character normally below unless it's '-' which
                // is consumed by the decoder
                if ( cc == '-' )
                    continue;
            }
            else // valid encoded character
            {
                // mini base64 decoder: each character is 6 bits
                state.bit += 6;
                state.accum <<= 6;
                state.accum += dc;

                if ( state.bit >= 8 )
                {
                    // got the full byte, consume it
                    state.bit -= 8;
                    unsigned char b = (state.accum >> state.bit) & 0x00ff;

                    // the bytes come in pairs, most significant one first
                    if ( state.isLSB )
                    {
                        // we've got the full word, output it
                        if ( dst )
                            *dst++ = (state.msb << 8) | b;
                        len++;
                        state.isLSB = false;
                    }
                    else // MSB
                    {
                        // just store it while we wait for LSB
                        state.msb = b;
                        state.isLSB = true;
                    }
                }
            }
        }

        // notice that this is not an "else" of the test above: we can get
        // here after switching back to the direct mode in the same iteration
        if ( state.IsDirect() )
        {
            // start of an encoded segment?
            if ( cc == '+' )
            {
                if ( src == srcEnd )
                    return wxCONV_FAILED; // can't have '+' at the end

                if ( *src == '-' )
                {
                    // just the encoded plus sign, don't switch to shifted mode
                    if ( dst )
                        *dst++ = '+';
                    len++;
                    src++;
                }
                else
                {
                    state.ToShifted();
                }
            }
            else // not '+'
            {
                // only printable 7 bit ASCII characters (with the exception of
                // NUL, TAB, CR and LF) can be used directly
                if ( cc >= 0x7f || (cc < ' ' &&
                      !(cc == '\0' || cc == '\t' || cc == '\r' || cc == '\n')) )
                    return wxCONV_FAILED;

                if ( dst )
                    *dst++ = cc;
                len++;
            }
        }
    }

    if ( !len )
    {
        // as we didn't read any characters we should be called with the same
        // data (followed by some more new data) again later so don't save our
        // state
        state = stateOrig;

        return wxCONV_FAILED;
    }

    return len;
}
 670
 671 //
 672 // BASE64 encoding table
 673 //
 674 static const unsigned char utf7enb64[] =
 675 {
 676     'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
 677     'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
 678     'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
 679     'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
 680     'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
 681     'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
 682     'w', 'x', 'y', 'z', '0', '1', '2', '3',
 683     '4', '5', '6', '7', '8', '9', '+', '/'
 684 };
 685
 686 //
 687 // UTF-7 encoding table
 688 //
 689 // 0 - Set D (directly encoded characters)
 690 // 1 - Set O (optional direct characters)
 691 // 2 - whitespace characters (optional)
 692 // 3 - special characters
 693 //
 694 static const unsigned char utf7encode[128] =
 695 {
 696     0, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
 697     3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
 698     2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
 699     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
 700     1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 701     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
 702     1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 703     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
 704 };
 705
 706 static inline bool wxIsUTF7Direct(wchar_t wc)
 707 {
 708     return wc < 0x80 && utf7encode[wc] < 1;
 709 }
 710
// Encodes wide characters as UTF-7.
//
// If srcLen is wxNO_LEN, the entire NUL-terminated string src, including its
// trailing NUL, is converted using a local encoder state. Otherwise exactly
// srcLen wide characters are processed using, and updating, the state kept in
// m_stateEncoder, so that the output can be produced in several chunks.
//
// If dst is NULL nothing is stored and only the length of the output is
// computed (the stored state is left unchanged in this case), otherwise dstLen
// is the size of dst in bytes.
//
// Returns the number of bytes [which would be] written or wxCONV_FAILED if
// the input contains a character which can't be encoded.
size_t wxMBConvUTF7::FromWChar(char *dst, size_t dstLen,
                               const wchar_t *src, size_t srcLen) const
{
    EncoderState stateOrig,
                *statePtr;
    if ( srcLen == wxNO_LEN )
    {
        // we don't apply the stored state when operating on entire strings at
        // once
        statePtr = &stateOrig;

        srcLen = wxWcslen(src) + 1;
    }
    else // do use the mode we left the output in previously
    {
        // remember the original state to be able to restore it below and
        // cast away the constness of this method to be able to update it
        stateOrig = m_stateEncoder;
        statePtr = wx_const_cast(EncoderState *, &m_stateEncoder);
    }

    EncoderState& state = *statePtr;


    // number of bytes [which would have been] written to dst
    size_t len = 0;

    // NOTE(review): the space left in dst is only checked before starting to
    // encode each input character while several bytes may be written for it
    // -- confirm that the callers always size dst using the result of a
    // previous call with NULL dst
    const wchar_t * const srcEnd = src + srcLen;
    while ( src < srcEnd && (!dst || len < dstLen) )
    {
        wchar_t cc = *src++;
        if ( wxIsUTF7Direct(cc) )
        {
            if ( state.IsShifted() )
            {
                // pad with zeros the last encoded block if necessary
                if ( state.bit )
                {
                    if ( dst )
                        *dst++ = utf7enb64[((state.accum % 16) << (6 - state.bit)) % 64];
                    len++;
                }

                state.ToDirect();

                // explicitly terminate the encoded part
                if ( dst )
                    *dst++ = '-';
                len++;
            }

            // the character itself is output unchanged
            if ( dst )
                *dst++ = (char)cc;
            len++;
        }
        else if ( cc == '+' && state.IsDirect() )
        {
            // the plus sign starts the encoded part and so must be escaped
            // as "+-" in the direct mode
            if ( dst )
            {
                *dst++ = '+';
                *dst++ = '-';
            }

            len += 2;
        }
#ifndef WC_UTF16
        else if (((wxUint32)cc) > 0xffff)
        {
            // no surrogate pair generation (yet?)
            return wxCONV_FAILED;
        }
#endif
        else
        {
            if ( state.IsDirect() )
            {
                state.ToShifted();

                // mark the start of the encoded part
                if ( dst )
                    *dst++ = '+';
                len++;
            }

            // BASE64 encode string
            for ( ;; )
            {
                // feed the 2 bytes of this character, most significant one
                // first, to the encoder
                for ( unsigned lsb = 0; lsb < 2; lsb++ )
                {
                    state.accum <<= 8;
                    state.accum += lsb ? cc & 0xff : (cc & 0xff00) >> 8;

                    // and output as many complete groups of 6 bits as we have
                    for (state.bit += 8; state.bit >= 6; )
                    {
                        state.bit -= 6;
                        if ( dst )
                            *dst++ = utf7enb64[(state.accum >> state.bit) % 64];
                        len++;
                    }
                }

                // continue encoding until the end of the input or the next
                // character which can be output directly: it is not consumed
                // here but handled by the next iteration of the outer loop
                if ( src == srcEnd || wxIsUTF7Direct(cc = *src) )
                    break;

                src++;
            }
        }
    }

    // we need to restore the original encoder state if we were called just to
    // calculate the amount of space needed as we will presumably be called
    // again to really convert the data now
    if ( !dst )
        state = stateOrig;

    return len;
}
 823
 824 // ----------------------------------------------------------------------------
 825 // UTF-8
 826 // ----------------------------------------------------------------------------
 827
 828 static const wxUint32 utf8_max[]=
 829     { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
 830
 831 // boundaries of the private use area we use to (temporarily) remap invalid
 832 // characters invalid in a UTF-8 encoded string
 833 const wxUint32 wxUnicodePUA = 0x100000;
 834 const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
 835
 836 // this table gives the length of the UTF-8 encoding from its first character:
 837 const unsigned char tableUtf8Lengths[256] = {
 838     // single-byte sequences (ASCII):
 839     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 00..0F
 840     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 10..1F
 841     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 20..2F
 842     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 30..3F
 843     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 40..4F
 844     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 50..5F
 845     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 60..6F
 846     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 70..7F
 847
 848     // these are invalid:
 849     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 80..8F
 850     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 90..9F
 851     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // A0..AF
 852     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // B0..BF
 853     0, 0,                                            // C0,C1
 854
 855     // two-byte sequences:
 856           2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // C2..CF
 857     2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // D0..DF
 858
 859     // three-byte sequences:
 860     3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,  // E0..EF
 861
 862     // four-byte sequences:
 863     4, 4, 4, 4, 4,                                   // F0..F4
 864
 865     // these are invalid again (5- or 6-byte
 866     // sequences and sequences for code points
 867     // above U+10FFFF, as restricted by RFC 3629):
 868                    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0   // F5..FF
 869 };
 870
 871 size_t
 872 wxMBConvStrictUTF8::ToWChar(wchar_t *dst, size_t dstLen,
 873                             const char *src, size_t srcLen) const
 874 {
 875     wchar_t *out = dstLen ? dst : NULL;
 876     size_t written = 0;
 877
 878     if ( srcLen == wxNO_LEN )
 879         srcLen = strlen(src) + 1;
 880
 881     for ( const char *p = src; ; p++ )
 882     {
 883         if ( !(srcLen == wxNO_LEN ? *p : srcLen) )
 884         {
 885             // all done successfully, just add the trailing NULL if we are not
 886             // using explicit length
 887             if ( srcLen == wxNO_LEN )
 888             {
 889                 if ( out )
 890                 {
 891                     if ( !dstLen )
 892                         break;
 893
 894                     *out = L'\0';
 895                 }
 896
 897                 written++;
 898             }
 899
 900             return written;
 901         }
 902
 903         if ( out && !dstLen-- )
 904             break;
 905
 906         wxUint32 code;
 907         unsigned char c = *p;
 908
 909         if ( c < 0x80 )
 910         {
 911             if ( srcLen == 0 ) // the test works for wxNO_LEN too
 912                 break;
 913
 914             if ( srcLen != wxNO_LEN )
 915                 srcLen--;
 916
 917             code = c;
 918         }
 919         else
 920         {
 921             unsigned len = tableUtf8Lengths[c];
 922             if ( !len )
 923                 break;
 924
 925             if ( srcLen < len ) // the test works for wxNO_LEN too
 926                 break;
 927
 928             if ( srcLen != wxNO_LEN )
 929                 srcLen -= len;
 930
 931             //   Char. number range   |        UTF-8 octet sequence
 932             //      (hexadecimal)     |              (binary)
 933             //  ----------------------+----------------------------------------
 934             //  0000 0000 - 0000 007F | 0xxxxxxx
 935             //  0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
 936             //  0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
 937             //  0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 938             //
 939             //  Code point value is stored in bits marked with 'x',
 940             //  lowest-order bit of the value on the right side in the diagram
 941             //  above.                                         (from RFC 3629)
 942
 943             // mask to extract lead byte's value ('x' bits above), by sequence
 944             // length:
 945             static const unsigned char leadValueMask[] = { 0x7F, 0x1F, 0x0F, 0x07 };
 946
 947             // mask and value of lead byte's most significant bits, by length:
 948             static const unsigned char leadMarkerMask[] = { 0x80, 0xE0, 0xF0, 0xF8 };
 949             static const unsigned char leadMarkerVal[] = { 0x00, 0xC0, 0xE0, 0xF0 };
 950
 951             len--; // it's more convenient to work with 0-based length here
 952
 953             // extract the lead byte's value bits:
 954             if ( (c & leadMarkerMask[len]) != leadMarkerVal[len] )
 955                 break;
 956
 957             code = c & leadValueMask[len];
 958
 959             // all remaining bytes, if any, are handled in the same way
 960             // regardless of sequence's length:
 961             for ( ; len; --len )
 962             {
 963                 c = *++p;
 964                 if ( (c & 0xC0) != 0x80 )
 965                     return wxCONV_FAILED;
 966
 967                 code <<= 6;
 968                 code |= c & 0x3F;
 969             }
 970         }
 971
 972 #ifdef WC_UTF16
 973         // cast is ok because wchar_t == wxUint16 if WC_UTF16
 974         if ( encode_utf16(code, (wxUint16 *)out) == 2 )
 975         {
 976             if ( out )
 977                 out++;
 978             written++;
 979         }
 980 #else // !WC_UTF16
 981         if ( out )
 982             *out = code;
 983 #endif // WC_UTF16/!WC_UTF16
 984
 985         if ( out )
 986             out++;
 987
 988         written++;
 989     }
 990
 991     return wxCONV_FAILED;
 992 }
 993
 994 size_t
 995 wxMBConvStrictUTF8::FromWChar(char *dst, size_t dstLen,
 996                               const wchar_t *src, size_t srcLen) const
 997 {
 998     char *out = dstLen ? dst : NULL;
 999     size_t written = 0;
1000
1001     for ( const wchar_t *wp = src; ; wp++ )
1002     {
1003         if ( !(srcLen == wxNO_LEN ? *wp : srcLen) )
1004         {
1005             // all done successfully, just add the trailing NULL if we are not
1006             // using explicit length
1007             if ( srcLen == wxNO_LEN )
1008             {
1009                 if ( out )
1010                 {
1011                     if ( !dstLen )
1012                         break;
1013
1014                     *out = '\0';
1015                 }
1016
1017                 written++;
1018             }
1019
1020             return written;
1021         }
1022
1023         if ( srcLen != wxNO_LEN )
1024             srcLen--;
1025
1026         wxUint32 code;
1027 #ifdef WC_UTF16
1028         // cast is ok for WC_UTF16
1029         if ( decode_utf16((const wxUint16 *)wp, code) == 2 )
1030         {
1031             // skip the next char too as we decoded a surrogate
1032             wp++;
1033         }
1034 #else // wchar_t is UTF-32
1035         code = *wp & 0x7fffffff;
1036 #endif
1037
1038         unsigned len;
1039         if ( code <= 0x7F )
1040         {
1041             len = 1;
1042             if ( out )
1043             {
1044                 if ( dstLen < len )
1045                     break;
1046
1047                 out[0] = (char)code;
1048             }
1049         }
1050         else if ( code <= 0x07FF )
1051         {
1052             len = 2;
1053             if ( out )
1054             {
1055                 if ( dstLen < len )
1056                     break;
1057
1058                 // NB: this line takes 6 least significant bits, encodes them as
1059                 // 10xxxxxx and discards them so that the next byte can be encoded:
1060                 out[1] = 0x80 | (code & 0x3F);  code >>= 6;
1061                 out[0] = 0xC0 | code;
1062             }
1063         }
1064         else if ( code < 0xFFFF )
1065         {
1066             len = 3;
1067             if ( out )
1068             {
1069                 if ( dstLen < len )
1070                     break;
1071
1072                 out[2] = 0x80 | (code & 0x3F);  code >>= 6;
1073                 out[1] = 0x80 | (code & 0x3F);  code >>= 6;
1074                 out[0] = 0xE0 | code;
1075             }
1076         }
1077         else if ( code <= 0x10FFFF )
1078         {
1079             len = 4;
1080             if ( out )
1081             {
1082                 if ( dstLen < len )
1083                     break;
1084
1085                 out[3] = 0x80 | (code & 0x3F);  code >>= 6;
1086                 out[2] = 0x80 | (code & 0x3F);  code >>= 6;
1087                 out[1] = 0x80 | (code & 0x3F);  code >>= 6;
1088                 out[0] = 0xF0 | code;
1089             }
1090         }
1091         else
1092         {
1093             wxFAIL_MSG( _T("trying to encode undefined Unicode character") );
1094             break;
1095         }
1096
1097         if ( out )
1098         {
1099             out += len;
1100             dstLen -= len;
1101         }
1102
1103         written += len;
1104     }
1105
1106     // we only get here if an error occurs during decoding
1107     return wxCONV_FAILED;
1108 }
1109
1110 size_t wxMBConvUTF8::ToWChar(wchar_t *buf, size_t n,
1111                              const char *psz, size_t srcLen) const
1112 {
1113     if ( m_options == MAP_INVALID_UTF8_NOT )
1114         return wxMBConvStrictUTF8::ToWChar(buf, n, psz, srcLen);
1115
1116     size_t len = 0;
1117
1118     while ((srcLen == wxNO_LEN ? *psz : srcLen--) && ((!buf) || (len < n)))
1119     {
1120         const char *opsz = psz;
1121         bool invalid = false;
1122         unsigned char cc = *psz++, fc = cc;
1123         unsigned cnt;
1124         for (cnt = 0; fc & 0x80; cnt++)
1125             fc <<= 1;
1126
1127         if (!cnt)
1128         {
1129             // plain ASCII char
1130             if (buf)
1131                 *buf++ = cc;
1132             len++;
1133
1134             // escape the escape character for octal escapes
1135             if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
1136                     && cc == '\\' && (!buf || len < n))
1137             {
1138                 if (buf)
1139                     *buf++ = cc;
1140                 len++;
1141             }
1142         }
1143         else
1144         {
1145             cnt--;
1146             if (!cnt)
1147             {
1148                 // invalid UTF-8 sequence
1149                 invalid = true;
1150             }
1151             else
1152             {
1153                 unsigned ocnt = cnt - 1;
1154                 wxUint32 res = cc & (0x3f >> cnt);
1155                 while (cnt--)
1156                 {
1157                     cc = *psz;
1158                     if ((cc & 0xC0) != 0x80)
1159                     {
1160                         // invalid UTF-8 sequence
1161                         invalid = true;
1162                         break;
1163                     }
1164
1165                     psz++;
1166                     res = (res << 6) | (cc & 0x3f);
1167                 }
1168
1169                 if (invalid || res <= utf8_max[ocnt])
1170                 {
1171                     // illegal UTF-8 encoding
1172                     invalid = true;
1173                 }
1174                 else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
1175                         res >= wxUnicodePUA && res < wxUnicodePUAEnd)
1176                 {
1177                     // if one of our PUA characters turns up externally
1178                     // it must also be treated as an illegal sequence
1179                     // (a bit like you have to escape an escape character)
1180                     invalid = true;
1181                 }
1182                 else
1183                 {
1184 #ifdef WC_UTF16
1185                     // cast is ok because wchar_t == wxUint16 if WC_UTF16
1186                     size_t pa = encode_utf16(res, (wxUint16 *)buf);
1187                     if (pa == wxCONV_FAILED)
1188                     {
1189                         invalid = true;
1190                     }
1191                     else
1192                     {
1193                         if (buf)
1194                             buf += pa;
1195                         len += pa;
1196                     }
1197 #else // !WC_UTF16
1198                     if (buf)
1199                         *buf++ = (wchar_t)res;
1200                     len++;
1201 #endif // WC_UTF16/!WC_UTF16
1202                 }
1203             }
1204
1205             if (invalid)
1206             {
1207                 if (m_options & MAP_INVALID_UTF8_TO_PUA)
1208                 {
1209                     while (opsz < psz && (!buf || len < n))
1210                     {
1211 #ifdef WC_UTF16
1212                         // cast is ok because wchar_t == wxUuint16 if WC_UTF16
1213                         size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
1214                         wxASSERT(pa != wxCONV_FAILED);
1215                         if (buf)
1216                             buf += pa;
1217                         opsz++;
1218                         len += pa;
1219 #else
1220                         if (buf)
1221                             *buf++ = (wchar_t)(wxUnicodePUA + (unsigned char)*opsz);
1222                         opsz++;
1223                         len++;
1224 #endif
1225                     }
1226                 }
1227                 else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
1228                 {
1229                     while (opsz < psz && (!buf || len < n))
1230                     {
1231                         if ( buf && len + 3 < n )
1232                         {
1233                             unsigned char on = *opsz;
1234                             *buf++ = L'\\';
1235                             *buf++ = (wchar_t)( L'0' + on / 0100 );
1236                             *buf++ = (wchar_t)( L'0' + (on % 0100) / 010 );
1237                             *buf++ = (wchar_t)( L'0' + on % 010 );
1238                         }
1239
1240                         opsz++;
1241                         len += 4;
1242                     }
1243                 }
1244                 else // MAP_INVALID_UTF8_NOT
1245                 {
1246                     return wxCONV_FAILED;
1247                 }
1248             }
1249         }
1250     }
1251
1252     if (srcLen == wxNO_LEN && buf && (len < n))
1253         *buf = 0;
1254
1255     return len + 1;
1256 }
1257
// Return true if the given character is one of the octal digits '0'..'7'.
static inline bool isoctal(wchar_t wch)
{
    if ( wch < L'0' )
        return false;

    return wch <= L'7';
}
1262
1263 size_t wxMBConvUTF8::FromWChar(char *buf, size_t n,
1264                                const wchar_t *psz, size_t srcLen) const
1265 {
1266     if ( m_options == MAP_INVALID_UTF8_NOT )
1267         return wxMBConvStrictUTF8::FromWChar(buf, n, psz, srcLen);
1268
1269     size_t len = 0;
1270
1271     while ((srcLen == wxNO_LEN ? *psz : srcLen--) && ((!buf) || (len < n)))
1272     {
1273         wxUint32 cc;
1274
1275 #ifdef WC_UTF16
1276         // cast is ok for WC_UTF16
1277         size_t pa = decode_utf16((const wxUint16 *)psz, cc);
1278         psz += (pa == wxCONV_FAILED) ? 1 : pa;
1279 #else
1280         cc = (*psz++) & 0x7fffffff;
1281 #endif
1282
1283         if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
1284                 && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
1285         {
1286             if (buf)
1287                 *buf++ = (char)(cc - wxUnicodePUA);
1288             len++;
1289         }
1290         else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL)
1291                     && cc == L'\\' && psz[0] == L'\\' )
1292         {
1293             if (buf)
1294                 *buf++ = (char)cc;
1295             psz++;
1296             len++;
1297         }
1298         else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
1299                     cc == L'\\' &&
1300                         isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
1301         {
1302             if (buf)
1303             {
1304                 *buf++ = (char) ((psz[0] - L'0') * 0100 +
1305                                  (psz[1] - L'0') * 010 +
1306                                  (psz[2] - L'0'));
1307             }
1308
1309             psz += 3;
1310             len++;
1311         }
1312         else
1313         {
1314             unsigned cnt;
1315             for (cnt = 0; cc > utf8_max[cnt]; cnt++)
1316             {
1317             }
1318
1319             if (!cnt)
1320             {
1321                 // plain ASCII char
1322                 if (buf)
1323                     *buf++ = (char) cc;
1324                 len++;
1325             }
1326             else
1327             {
1328                 len += cnt + 1;
1329                 if (buf)
1330                 {
1331                     *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
1332                     while (cnt--)
1333                         *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
1334                 }
1335             }
1336         }
1337     }
1338
1339     if (srcLen == wxNO_LEN && buf && (len < n))
1340         *buf = 0;
1341
1342     return len + 1;
1343 }
1344
1345 // ============================================================================
1346 // UTF-16
1347 // ============================================================================
1348
// The code below is written in terms of "straight" and "swap" converters and
// these macros map them to the real classes: if WORDS_BIGENDIAN is defined,
// "straight" is the BE class and "swap" the LE one, otherwise it is the other
// way round.
#ifdef WORDS_BIGENDIAN
    #define wxMBConvUTF16straight wxMBConvUTF16BE
    #define wxMBConvUTF16swap     wxMBConvUTF16LE
#else
    #define wxMBConvUTF16swap     wxMBConvUTF16BE
    #define wxMBConvUTF16straight wxMBConvUTF16LE
#endif
1356
1357 /* static */
1358 size_t wxMBConvUTF16Base::GetLength(const char *src, size_t srcLen)
1359 {
1360     if ( srcLen == wxNO_LEN )
1361     {
1362         // count the number of bytes in input, including the trailing NULs
1363         const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1364         for ( srcLen = 1; *inBuff++; srcLen++ )
1365             ;
1366
1367         srcLen *= BYTES_PER_CHAR;
1368     }
1369     else // we already have the length
1370     {
1371         // we can only convert an entire number of UTF-16 characters
1372         if ( srcLen % BYTES_PER_CHAR )
1373             return wxCONV_FAILED;
1374     }
1375
1376     return srcLen;
1377 }
1378
1379 // case when in-memory representation is UTF-16 too
1380 #ifdef WC_UTF16
1381
1382 // ----------------------------------------------------------------------------
1383 // conversions without endianness change
1384 // ----------------------------------------------------------------------------
1385
1386 size_t
1387 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1388                                const char *src, size_t srcLen) const
1389 {
1390     // set up the scene for using memcpy() (which is presumably more efficient
1391     // than copying the bytes one by one)
1392     srcLen = GetLength(src, srcLen);
1393     if ( srcLen == wxNO_LEN )
1394         return wxCONV_FAILED;
1395
1396     const size_t inLen = srcLen / BYTES_PER_CHAR;
1397     if ( dst )
1398     {
1399         if ( dstLen < inLen )
1400             return wxCONV_FAILED;
1401
1402         memcpy(dst, src, srcLen);
1403     }
1404
1405     return inLen;
1406 }
1407
1408 size_t
1409 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1410                                  const wchar_t *src, size_t srcLen) const
1411 {
1412     if ( srcLen == wxNO_LEN )
1413         srcLen = wxWcslen(src) + 1;
1414
1415     srcLen *= BYTES_PER_CHAR;
1416
1417     if ( dst )
1418     {
1419         if ( dstLen < srcLen )
1420             return wxCONV_FAILED;
1421
1422         memcpy(dst, src, srcLen);
1423     }
1424
1425     return srcLen;
1426 }
1427
1428 // ----------------------------------------------------------------------------
1429 // endian-reversing conversions
1430 // ----------------------------------------------------------------------------
1431
1432 size_t
1433 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1434                            const char *src, size_t srcLen) const
1435 {
1436     srcLen = GetLength(src, srcLen);
1437     if ( srcLen == wxNO_LEN )
1438         return wxCONV_FAILED;
1439
1440     srcLen /= BYTES_PER_CHAR;
1441
1442     if ( dst )
1443     {
1444         if ( dstLen < srcLen )
1445             return wxCONV_FAILED;
1446
1447         const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1448         for ( size_t n = 0; n < srcLen; n++, inBuff++ )
1449         {
1450             *dst++ = wxUINT16_SWAP_ALWAYS(*inBuff);
1451         }
1452     }
1453
1454     return srcLen;
1455 }
1456
1457 size_t
1458 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1459                              const wchar_t *src, size_t srcLen) const
1460 {
1461     if ( srcLen == wxNO_LEN )
1462         srcLen = wxWcslen(src) + 1;
1463
1464     srcLen *= BYTES_PER_CHAR;
1465
1466     if ( dst )
1467     {
1468         if ( dstLen < srcLen )
1469             return wxCONV_FAILED;
1470
1471         wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1472         for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
1473         {
1474             *outBuff++ = wxUINT16_SWAP_ALWAYS(*src);
1475         }
1476     }
1477
1478     return srcLen;
1479 }
1480
1481 #else // !WC_UTF16: wchar_t is UTF-32
1482
1483 // ----------------------------------------------------------------------------
1484 // conversions without endianness change
1485 // ----------------------------------------------------------------------------
1486
1487 size_t
1488 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1489                                const char *src, size_t srcLen) const
1490 {
1491     srcLen = GetLength(src, srcLen);
1492     if ( srcLen == wxNO_LEN )
1493         return wxCONV_FAILED;
1494
1495     const size_t inLen = srcLen / BYTES_PER_CHAR;
1496     if ( !dst )
1497     {
1498         // optimization: return maximal space which could be needed for this
1499         // string even if the real size could be smaller if the buffer contains
1500         // any surrogates
1501         return inLen;
1502     }
1503
1504     size_t outLen = 0;
1505     const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1506     for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1507     {
1508         const wxUint32 ch = wxDecodeSurrogate(&inBuff);
1509         if ( !inBuff )
1510             return wxCONV_FAILED;
1511
1512         if ( ++outLen > dstLen )
1513             return wxCONV_FAILED;
1514
1515         *dst++ = ch;
1516     }
1517
1518
1519     return outLen;
1520 }
1521
1522 size_t
1523 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1524                                  const wchar_t *src, size_t srcLen) const
1525 {
1526     if ( srcLen == wxNO_LEN )
1527         srcLen = wxWcslen(src) + 1;
1528
1529     size_t outLen = 0;
1530     wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1531     for ( size_t n = 0; n < srcLen; n++ )
1532     {
1533         wxUint16 cc[2];
1534         const size_t numChars = encode_utf16(*src++, cc);
1535         if ( numChars == wxCONV_FAILED )
1536             return wxCONV_FAILED;
1537
1538         outLen += numChars * BYTES_PER_CHAR;
1539         if ( outBuff )
1540         {
1541             if ( outLen > dstLen )
1542                 return wxCONV_FAILED;
1543
1544             *outBuff++ = cc[0];
1545             if ( numChars == 2 )
1546             {
1547                 // second character of a surrogate
1548                 *outBuff++ = cc[1];
1549             }
1550         }
1551     }
1552
1553     return outLen;
1554 }
1555
1556 // ----------------------------------------------------------------------------
1557 // endian-reversing conversions
1558 // ----------------------------------------------------------------------------
1559
1560 size_t
1561 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1562                            const char *src, size_t srcLen) const
1563 {
1564     srcLen = GetLength(src, srcLen);
1565     if ( srcLen == wxNO_LEN )
1566         return wxCONV_FAILED;
1567
1568     const size_t inLen = srcLen / BYTES_PER_CHAR;
1569     if ( !dst )
1570     {
1571         // optimization: return maximal space which could be needed for this
1572         // string even if the real size could be smaller if the buffer contains
1573         // any surrogates
1574         return inLen;
1575     }
1576
1577     size_t outLen = 0;
1578     const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1579     for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1580     {
1581         wxUint32 ch;
1582         wxUint16 tmp[2];
1583
1584         tmp[0] = wxUINT16_SWAP_ALWAYS(*inBuff);
1585         inBuff++;
1586         tmp[1] = wxUINT16_SWAP_ALWAYS(*inBuff);
1587
1588         const size_t numChars = decode_utf16(tmp, ch);
1589         if ( numChars == wxCONV_FAILED )
1590             return wxCONV_FAILED;
1591
1592         if ( numChars == 2 )
1593             inBuff++;
1594
1595         if ( ++outLen > dstLen )
1596             return wxCONV_FAILED;
1597
1598         *dst++ = ch;
1599     }
1600
1601
1602     return outLen;
1603 }
1604
1605 size_t
1606 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1607                              const wchar_t *src, size_t srcLen) const
1608 {
1609     if ( srcLen == wxNO_LEN )
1610         srcLen = wxWcslen(src) + 1;
1611
1612     size_t outLen = 0;
1613     wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1614     for ( const wchar_t *srcEnd = src + srcLen; src < srcEnd; src++ )
1615     {
1616         wxUint16 cc[2];
1617         const size_t numChars = encode_utf16(*src, cc);
1618         if ( numChars == wxCONV_FAILED )
1619             return wxCONV_FAILED;
1620
1621         outLen += numChars * BYTES_PER_CHAR;
1622         if ( outBuff )
1623         {
1624             if ( outLen > dstLen )
1625                 return wxCONV_FAILED;
1626
1627             *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[0]);
1628             if ( numChars == 2 )
1629             {
1630                 // second character of a surrogate
1631                 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[1]);
1632             }
1633         }
1634     }
1635
1636     return outLen;
1637 }
1638
1639 #endif // WC_UTF16/!WC_UTF16
1640
1641
1642 // ============================================================================
1643 // UTF-32
1644 // ============================================================================
1645
// The code below is written in terms of "straight" and "swap" converters and
// these macros map them to the real classes: if WORDS_BIGENDIAN is defined,
// "straight" is the BE class and "swap" the LE one, otherwise it is the other
// way round.
#ifdef WORDS_BIGENDIAN
    #define wxMBConvUTF32straight  wxMBConvUTF32BE
    #define wxMBConvUTF32swap      wxMBConvUTF32LE
#else
    #define wxMBConvUTF32swap      wxMBConvUTF32BE
    #define wxMBConvUTF32straight  wxMBConvUTF32LE
#endif
1653
1654
// global objects of the little and big endian UTF-32 converter classes
// (presumably exported from the library by WXDLLIMPEXP_DATA_BASE -- confirm)
WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
1657
1658 /* static */
1659 size_t wxMBConvUTF32Base::GetLength(const char *src, size_t srcLen)
1660 {
1661     if ( srcLen == wxNO_LEN )
1662     {
1663         // count the number of bytes in input, including the trailing NULs
1664         const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1665         for ( srcLen = 1; *inBuff++; srcLen++ )
1666             ;
1667
1668         srcLen *= BYTES_PER_CHAR;
1669     }
1670     else // we already have the length
1671     {
1672         // we can only convert an entire number of UTF-32 characters
1673         if ( srcLen % BYTES_PER_CHAR )
1674             return wxCONV_FAILED;
1675     }
1676
1677     return srcLen;
1678 }
1679
1680 // case when in-memory representation is UTF-16
1681 #ifdef WC_UTF16
1682
1683 // ----------------------------------------------------------------------------
1684 // conversions without endianness change
1685 // ----------------------------------------------------------------------------
1686
1687 size_t
1688 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1689                                const char *src, size_t srcLen) const
1690 {
1691     srcLen = GetLength(src, srcLen);
1692     if ( srcLen == wxNO_LEN )
1693         return wxCONV_FAILED;
1694
1695     const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1696     const size_t inLen = srcLen / BYTES_PER_CHAR;
1697     size_t outLen = 0;
1698     for ( size_t n = 0; n < inLen; n++ )
1699     {
1700         wxUint16 cc[2];
1701         const size_t numChars = encode_utf16(*inBuff++, cc);
1702         if ( numChars == wxCONV_FAILED )
1703             return wxCONV_FAILED;
1704
1705         outLen += numChars;
1706         if ( dst )
1707         {
1708             if ( outLen > dstLen )
1709                 return wxCONV_FAILED;
1710
1711             *dst++ = cc[0];
1712             if ( numChars == 2 )
1713             {
1714                 // second character of a surrogate
1715                 *dst++ = cc[1];
1716             }
1717         }
1718     }
1719
1720     return outLen;
1721 }
1722
1723 size_t
1724 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1725                                  const wchar_t *src, size_t srcLen) const
1726 {
1727     if ( srcLen == wxNO_LEN )
1728         srcLen = wxWcslen(src) + 1;
1729
1730     if ( !dst )
1731     {
1732         // optimization: return maximal space which could be needed for this
1733         // string instead of the exact amount which could be less if there are
1734         // any surrogates in the input
1735         //
1736         // we consider that surrogates are rare enough to make it worthwhile to
1737         // avoid running the loop below at the cost of slightly extra memory
1738         // consumption
1739         return srcLen * BYTES_PER_CHAR;
1740     }
1741
1742     wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1743     size_t outLen = 0;
1744     for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1745     {
1746         const wxUint32 ch = wxDecodeSurrogate(&src);
1747         if ( !src )
1748             return wxCONV_FAILED;
1749
1750         outLen += BYTES_PER_CHAR;
1751
1752         if ( outLen > dstLen )
1753             return wxCONV_FAILED;
1754
1755         *outBuff++ = ch;
1756     }
1757
1758     return outLen;
1759 }
1760
1761 // ----------------------------------------------------------------------------
1762 // endian-reversing conversions
1763 // ----------------------------------------------------------------------------
1764
1765 size_t
1766 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1767                            const char *src, size_t srcLen) const
1768 {
1769     srcLen = GetLength(src, srcLen);
1770     if ( srcLen == wxNO_LEN )
1771         return wxCONV_FAILED;
1772
1773     const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1774     const size_t inLen = srcLen / BYTES_PER_CHAR;
1775     size_t outLen = 0;
1776     for ( size_t n = 0; n < inLen; n++, inBuff++ )
1777     {
1778         wxUint16 cc[2];
1779         const size_t numChars = encode_utf16(wxUINT32_SWAP_ALWAYS(*inBuff), cc);
1780         if ( numChars == wxCONV_FAILED )
1781             return wxCONV_FAILED;
1782
1783         outLen += numChars;
1784         if ( dst )
1785         {
1786             if ( outLen > dstLen )
1787                 return wxCONV_FAILED;
1788
1789             *dst++ = cc[0];
1790             if ( numChars == 2 )
1791             {
1792                 // second character of a surrogate
1793                 *dst++ = cc[1];
1794             }
1795         }
1796     }
1797
1798     return outLen;
1799 }
1800
1801 size_t
1802 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1803                              const wchar_t *src, size_t srcLen) const
1804 {
1805     if ( srcLen == wxNO_LEN )
1806         srcLen = wxWcslen(src) + 1;
1807
1808     if ( !dst )
1809     {
1810         // optimization: return maximal space which could be needed for this
1811         // string instead of the exact amount which could be less if there are
1812         // any surrogates in the input
1813         //
1814         // we consider that surrogates are rare enough to make it worthwhile to
1815         // avoid running the loop below at the cost of slightly extra memory
1816         // consumption
1817         return srcLen*BYTES_PER_CHAR;
1818     }
1819
1820     wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1821     size_t outLen = 0;
1822     for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1823     {
1824         const wxUint32 ch = wxDecodeSurrogate(&src);
1825         if ( !src )
1826             return wxCONV_FAILED;
1827
1828         outLen += BYTES_PER_CHAR;
1829
1830         if ( outLen > dstLen )
1831             return wxCONV_FAILED;
1832
1833         *outBuff++ = wxUINT32_SWAP_ALWAYS(ch);
1834     }
1835
1836     return outLen;
1837 }
1838
1839 #else // !WC_UTF16: wchar_t is UTF-32
1840
1841 // ----------------------------------------------------------------------------
1842 // conversions without endianness change
1843 // ----------------------------------------------------------------------------
1844
1845 size_t
1846 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1847                                const char *src, size_t srcLen) const
1848 {
1849     // use memcpy() as it should be much faster than hand-written loop
1850     srcLen = GetLength(src, srcLen);
1851     if ( srcLen == wxNO_LEN )
1852         return wxCONV_FAILED;
1853
1854     const size_t inLen = srcLen/BYTES_PER_CHAR;
1855     if ( dst )
1856     {
1857         if ( dstLen < inLen )
1858             return wxCONV_FAILED;
1859
1860         memcpy(dst, src, srcLen);
1861     }
1862
1863     return inLen;
1864 }
1865
1866 size_t
1867 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1868                                  const wchar_t *src, size_t srcLen) const
1869 {
1870     if ( srcLen == wxNO_LEN )
1871         srcLen = wxWcslen(src) + 1;
1872
1873     srcLen *= BYTES_PER_CHAR;
1874
1875     if ( dst )
1876     {
1877         if ( dstLen < srcLen )
1878             return wxCONV_FAILED;
1879
1880         memcpy(dst, src, srcLen);
1881     }
1882
1883     return srcLen;
1884 }
1885
1886 // ----------------------------------------------------------------------------
1887 // endian-reversing conversions
1888 // ----------------------------------------------------------------------------
1889
1890 size_t
1891 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1892                            const char *src, size_t srcLen) const
1893 {
1894     srcLen = GetLength(src, srcLen);
1895     if ( srcLen == wxNO_LEN )
1896         return wxCONV_FAILED;
1897
1898     srcLen /= BYTES_PER_CHAR;
1899
1900     if ( dst )
1901     {
1902         if ( dstLen < srcLen )
1903             return wxCONV_FAILED;
1904
1905         const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1906         for ( size_t n = 0; n < srcLen; n++, inBuff++ )
1907         {
1908             *dst++ = wxUINT32_SWAP_ALWAYS(*inBuff);
1909         }
1910     }
1911
1912     return srcLen;
1913 }
1914
1915 size_t
1916 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1917                              const wchar_t *src, size_t srcLen) const
1918 {
1919     if ( srcLen == wxNO_LEN )
1920         srcLen = wxWcslen(src) + 1;
1921
1922     srcLen *= BYTES_PER_CHAR;
1923
1924     if ( dst )
1925     {
1926         if ( dstLen < srcLen )
1927             return wxCONV_FAILED;
1928
1929         wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1930         for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
1931         {
1932             *outBuff++ = wxUINT32_SWAP_ALWAYS(*src);
1933         }
1934     }
1935
1936     return srcLen;
1937 }
1938
1939 #endif // WC_UTF16/!WC_UTF16
1940
1941
1942 // ============================================================================
1943 // The classes doing conversion using the iconv_xxx() functions
1944 // ============================================================================
1945
1946 #ifdef HAVE_ICONV
1947
1948 // VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
1949 //     E2BIG if output buffer is _exactly_ as big as needed. Such case is
1950 //     (unless there's yet another bug in glibc) the only case when iconv()
1951 //     returns with (size_t)-1 (which means error) and says there are 0 bytes
1952 //     left in the input buffer -- when _real_ error occurs,
1953 //     bytes-left-in-input buffer is non-zero. Hence, this alternative test for
1954 //     iconv() failure.
1955 //     [This bug does not appear in glibc 2.2.]
// ICONV_FAILED(cres, bufLeft): true if the iconv() call which returned cres
// and left bufLeft bytes in its input buffer must be considered as failed.
//
// NB: the macro parameters are parenthesized so that arbitrary expressions
//     (e.g. using operators with lower precedence than "==") can be passed
#if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
#define ICONV_FAILED(cres, bufLeft) (((cres) == (size_t)-1) && \
                                     (errno != E2BIG || (bufLeft) != 0))
#else
#define ICONV_FAILED(cres, bufLeft)  ((cres) == (size_t)-1)
#endif

// cast the address of the input pointer to the type expected by iconv()
#define ICONV_CHAR_CAST(x)  ((ICONV_CONST char **)(x))

// the value used for the handles which are not (or couldn't be) opened
#define ICONV_T_INVALID ((iconv_t)-1)
1966
1967 #if SIZEOF_WCHAR_T == 4
1968     #define WC_BSWAP    wxUINT32_SWAP_ALWAYS
1969     #define WC_ENC      wxFONTENCODING_UTF32
1970 #elif SIZEOF_WCHAR_T == 2
1971     #define WC_BSWAP    wxUINT16_SWAP_ALWAYS
1972     #define WC_ENC      wxFONTENCODING_UTF16
1973 #else // sizeof(wchar_t) != 2 nor 4
1974     // does this ever happen?
1975     #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
1976 #endif
1977
1978 // ----------------------------------------------------------------------------
1979 // wxMBConv_iconv: encapsulates an iconv character set
1980 // ----------------------------------------------------------------------------
1981
1982 class wxMBConv_iconv : public wxMBConv
1983 {
1984 public:
1985     wxMBConv_iconv(const char *name);
1986     virtual ~wxMBConv_iconv();
1987
1988     // implement base class virtual methods
1989     virtual size_t ToWChar(wchar_t *dst, size_t dstLen,
1990                            const char *src, size_t srcLen = wxNO_LEN) const;
1991     virtual size_t FromWChar(char *dst, size_t dstLen,
1992                              const wchar_t *src, size_t srcLen = wxNO_LEN) const;
1993     virtual size_t GetMBNulLen() const;
1994
1995 #if wxUSE_UNICODE_UTF8
1996     virtual bool IsUTF8() const;
1997 #endif
1998
1999     virtual wxMBConv *Clone() const
2000     {
2001         wxMBConv_iconv *p = new wxMBConv_iconv(m_name.ToAscii());
2002         p->m_minMBCharWidth = m_minMBCharWidth;
2003         return p;
2004     }
2005
2006     bool IsOk() const
2007         { return (m2w != ICONV_T_INVALID) && (w2m != ICONV_T_INVALID); }
2008
2009 protected:
2010     // the iconv handlers used to translate from multibyte
2011     // to wide char and in the other direction
2012     iconv_t m2w,
2013             w2m;
2014
2015 #if wxUSE_THREADS
2016     // guards access to m2w and w2m objects
2017     wxMutex m_iconvMutex;
2018 #endif
2019
2020 private:
2021     // the name (for iconv_open()) of a wide char charset -- if none is
2022     // available on this machine, it will remain NULL
2023     static wxString ms_wcCharsetName;
2024
2025     // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
2026     // different endian-ness than the native one
2027     static bool ms_wcNeedsSwap;
2028
2029
2030     // name of the encoding handled by this conversion
2031     wxString m_name;
2032
2033     // cached result of GetMBNulLen(); set to 0 meaning "unknown"
2034     // initially
2035     size_t m_minMBCharWidth;
2036 };
2037
2038 // make the constructor available for unit testing
2039 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_iconv( const char* name )
2040 {
2041     wxMBConv_iconv* result = new wxMBConv_iconv( name );
2042     if ( !result->IsOk() )
2043     {
2044         delete result;
2045         return 0;
2046     }
2047
2048     return result;
2049 }
2050
2051 wxString wxMBConv_iconv::ms_wcCharsetName;
2052 bool wxMBConv_iconv::ms_wcNeedsSwap = false;
2053
2054 wxMBConv_iconv::wxMBConv_iconv(const char *name)
2055               : m_name(name)
2056 {
2057     m_minMBCharWidth = 0;
2058
2059     // check for charset that represents wchar_t:
2060     if ( ms_wcCharsetName.empty() )
2061     {
2062         wxLogTrace(TRACE_STRCONV, _T("Looking for wide char codeset:"));
2063
2064 #if wxUSE_FONTMAP
2065         const wxChar **names = wxFontMapperBase::GetAllEncodingNames(WC_ENC);
2066 #else // !wxUSE_FONTMAP
2067         static const wxChar *names_static[] =
2068         {
2069 #if SIZEOF_WCHAR_T == 4
2070             _T("UCS-4"),
2071 #elif SIZEOF_WCHAR_T = 2
2072             _T("UCS-2"),
2073 #endif
2074             NULL
2075         };
2076         const wxChar **names = names_static;
2077 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
2078
2079         for ( ; *names && ms_wcCharsetName.empty(); ++names )
2080         {
2081             const wxString nameCS(*names);
2082
2083             // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
2084             wxString nameXE(nameCS);
2085
2086 #ifdef WORDS_BIGENDIAN
2087                 nameXE += _T("BE");
2088 #else // little endian
2089                 nameXE += _T("LE");
2090 #endif
2091
2092             wxLogTrace(TRACE_STRCONV, _T("  trying charset \"%s\""),
2093                        nameXE.c_str());
2094
2095             m2w = iconv_open(nameXE.ToAscii(), name);
2096             if ( m2w == ICONV_T_INVALID )
2097             {
2098                 // try charset w/o bytesex info (e.g. "UCS4")
2099                 wxLogTrace(TRACE_STRCONV, _T("  trying charset \"%s\""),
2100                            nameCS.c_str());
2101                 m2w = iconv_open(nameCS.ToAscii(), name);
2102
2103                 // and check for bytesex ourselves:
2104                 if ( m2w != ICONV_T_INVALID )
2105                 {
2106                     char    buf[2], *bufPtr;
2107                     wchar_t wbuf[2];
2108                     size_t  insz, outsz;
2109                     size_t  res;
2110
2111                     buf[0] = 'A';
2112                     buf[1] = 0;
2113                     wbuf[0] = 0;
2114                     insz = 2;
2115                     outsz = SIZEOF_WCHAR_T * 2;
2116                     char* wbufPtr = (char*)wbuf;
2117                     bufPtr = buf;
2118
2119                     res = iconv(
2120                         m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
2121                         &wbufPtr, &outsz);
2122
2123                     if (ICONV_FAILED(res, insz))
2124                     {
2125                         wxLogLastError(wxT("iconv"));
2126                         wxLogError(_("Conversion to charset '%s' doesn't work."),
2127                                    nameCS.c_str());
2128                     }
2129                     else // ok, can convert to this encoding, remember it
2130                     {
2131                         ms_wcCharsetName = nameCS;
2132                         ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
2133                     }
2134                 }
2135             }
2136             else // use charset not requiring byte swapping
2137             {
2138                 ms_wcCharsetName = nameXE;
2139             }
2140         }
2141
2142         wxLogTrace(TRACE_STRCONV,
2143                    wxT("iconv wchar_t charset is \"%s\"%s"),
2144                    ms_wcCharsetName.empty() ? wxString("<none>")
2145                                             : ms_wcCharsetName,
2146                    ms_wcNeedsSwap ? _T(" (needs swap)")
2147                                   : _T(""));
2148     }
2149     else // we already have ms_wcCharsetName
2150     {
2151         m2w = iconv_open(ms_wcCharsetName.ToAscii(), name);
2152     }
2153
2154     if ( ms_wcCharsetName.empty() )
2155     {
2156         w2m = ICONV_T_INVALID;
2157     }
2158     else
2159     {
2160         w2m = iconv_open(name, ms_wcCharsetName.ToAscii());
2161         if ( w2m == ICONV_T_INVALID )
2162         {
2163             wxLogTrace(TRACE_STRCONV,
2164                        wxT("\"%s\" -> \"%s\" works but not the converse!?"),
2165                        ms_wcCharsetName.c_str(), name);
2166         }
2167     }
2168 }
2169
2170 wxMBConv_iconv::~wxMBConv_iconv()
2171 {
2172     if ( m2w != ICONV_T_INVALID )
2173         iconv_close(m2w);
2174     if ( w2m != ICONV_T_INVALID )
2175         iconv_close(w2m);
2176 }
2177
2178 size_t
2179 wxMBConv_iconv::ToWChar(wchar_t *dst, size_t dstLen,
2180                         const char *src, size_t srcLen) const
2181 {
2182     if ( srcLen == wxNO_LEN )
2183     {
2184         // find the string length: notice that must be done differently for
2185         // NUL-terminated strings and UTF-16/32 which are terminated with 2/4
2186         // consecutive NULs
2187         const size_t nulLen = GetMBNulLen();
2188         switch ( nulLen )
2189         {
2190             default:
2191                 return wxCONV_FAILED;
2192
2193             case 1:
2194                 srcLen = strlen(src); // arguably more optimized than our version
2195                 break;
2196
2197             case 2:
2198             case 4:
2199                 // for UTF-16/32 not only we need to have 2/4 consecutive NULs
2200                 // but they also have to start at character boundary and not
2201                 // span two adjacent characters
2202                 const char *p;
2203                 for ( p = src; NotAllNULs(p, nulLen); p += nulLen )
2204                     ;
2205                 srcLen = p - src;
2206                 break;
2207         }
2208
2209         // when we're determining the length of the string ourselves we count
2210         // the terminating NUL(s) as part of it and always NUL-terminate the
2211         // output
2212         srcLen += nulLen;
2213     }
2214
2215     // we express length in the number of (wide) characters but iconv always
2216     // counts buffer sizes it in bytes
2217     dstLen *= SIZEOF_WCHAR_T;
2218
2219 #if wxUSE_THREADS
2220     // NB: iconv() is MT-safe, but each thread must use its own iconv_t handle.
2221     //     Unfortunately there are a couple of global wxCSConv objects such as
2222     //     wxConvLocal that are used all over wx code, so we have to make sure
2223     //     the handle is used by at most one thread at the time. Otherwise
2224     //     only a few wx classes would be safe to use from non-main threads
2225     //     as MB<->WC conversion would fail "randomly".
2226     wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
2227 #endif // wxUSE_THREADS
2228
2229     size_t res, cres;
2230     const char *pszPtr = src;
2231
2232     if ( dst )
2233     {
2234         char* bufPtr = (char*)dst;
2235
2236         // have destination buffer, convert there
2237         size_t dstLenOrig = dstLen;
2238         cres = iconv(m2w,
2239                      ICONV_CHAR_CAST(&pszPtr), &srcLen,
2240                      &bufPtr, &dstLen);
2241
2242         // convert the number of bytes converted as returned by iconv to the
2243         // number of (wide) characters converted that we need
2244         res = (dstLenOrig - dstLen) / SIZEOF_WCHAR_T;
2245
2246         if (ms_wcNeedsSwap)
2247         {
2248             // convert to native endianness
2249             for ( unsigned i = 0; i < res; i++ )
2250                 dst[i] = WC_BSWAP(dst[i]);
2251         }
2252     }
2253     else // no destination buffer
2254     {
2255         // convert using temp buffer to calculate the size of the buffer needed
2256         wchar_t tbuf[8];
2257         res = 0;
2258
2259         do
2260         {
2261             char* bufPtr = (char*)tbuf;
2262             dstLen = 8 * SIZEOF_WCHAR_T;
2263
2264             cres = iconv(m2w,
2265                          ICONV_CHAR_CAST(&pszPtr), &srcLen,
2266                          &bufPtr, &dstLen );
2267
2268             res += 8 - (dstLen / SIZEOF_WCHAR_T);
2269         }
2270         while ((cres == (size_t)-1) && (errno == E2BIG));
2271     }
2272
2273     if (ICONV_FAILED(cres, srcLen))
2274     {
2275         //VS: it is ok if iconv fails, hence trace only
2276         wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
2277         return wxCONV_FAILED;
2278     }
2279
2280     return res;
2281 }
2282
2283 size_t wxMBConv_iconv::FromWChar(char *dst, size_t dstLen,
2284                                  const wchar_t *src, size_t srcLen) const
2285 {
2286 #if wxUSE_THREADS
2287     // NB: explained in MB2WC
2288     wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
2289 #endif
2290
2291     if ( srcLen == wxNO_LEN )
2292         srcLen = wxWcslen(src) + 1;
2293
2294     size_t inbuflen = srcLen * SIZEOF_WCHAR_T;
2295     size_t outbuflen = dstLen;
2296     size_t res, cres;
2297
2298     wchar_t *tmpbuf = 0;
2299
2300     if (ms_wcNeedsSwap)
2301     {
2302         // need to copy to temp buffer to switch endianness
2303         // (doing WC_BSWAP twice on the original buffer won't help, as it
2304         //  could be in read-only memory, or be accessed in some other thread)
2305         tmpbuf = (wchar_t *)malloc(inbuflen + SIZEOF_WCHAR_T);
2306         for ( size_t i = 0; i < srcLen; i++ )
2307             tmpbuf[i] = WC_BSWAP(src[i]);
2308
2309         tmpbuf[srcLen] = L'\0';
2310         src = tmpbuf;
2311     }
2312
2313     char* inbuf = (char*)src;
2314     if ( dst )
2315     {
2316         // have destination buffer, convert there
2317         cres = iconv(w2m, ICONV_CHAR_CAST(&inbuf), &inbuflen, &dst, &outbuflen);
2318
2319         res = dstLen - outbuflen;
2320     }
2321     else // no destination buffer
2322     {
2323         // convert using temp buffer to calculate the size of the buffer needed
2324         char tbuf[16];
2325         res = 0;
2326         do
2327         {
2328             dst = tbuf;
2329             outbuflen = 16;
2330
2331             cres = iconv(w2m, ICONV_CHAR_CAST(&inbuf), &inbuflen, &dst, &outbuflen);
2332
2333             res += 16 - outbuflen;
2334         }
2335         while ((cres == (size_t)-1) && (errno == E2BIG));
2336     }
2337
2338     if (ms_wcNeedsSwap)
2339     {
2340         free(tmpbuf);
2341     }
2342
2343     if (ICONV_FAILED(cres, inbuflen))
2344     {
2345         wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
2346         return wxCONV_FAILED;
2347     }
2348
2349     return res;
2350 }
2351
2352 size_t wxMBConv_iconv::GetMBNulLen() const
2353 {
2354     if ( m_minMBCharWidth == 0 )
2355     {
2356         wxMBConv_iconv * const self = wxConstCast(this, wxMBConv_iconv);
2357
2358 #if wxUSE_THREADS
2359         // NB: explained in MB2WC
2360         wxMutexLocker lock(self->m_iconvMutex);
2361 #endif
2362
2363         const wchar_t *wnul = L"";
2364         char buf[8]; // should be enough for NUL in any encoding
2365         size_t inLen = sizeof(wchar_t),
2366                outLen = WXSIZEOF(buf);
2367         char *inBuff = (char *)wnul;
2368         char *outBuff = buf;
2369         if ( iconv(w2m, ICONV_CHAR_CAST(&inBuff), &inLen, &outBuff, &outLen) == (size_t)-1 )
2370         {
2371             self->m_minMBCharWidth = (size_t)-1;
2372         }
2373         else // ok
2374         {
2375             self->m_minMBCharWidth = outBuff - buf;
2376         }
2377     }
2378
2379     return m_minMBCharWidth;
2380 }
2381
2382 #if wxUSE_UNICODE_UTF8
2383 bool wxMBConv_iconv::IsUTF8() const
2384 {
2385     return wxStricmp(m_name, "UTF-8") == 0 ||
2386            wxStricmp(m_name, "UTF8") == 0;
2387 }
2388 #endif
2389
2390 #endif // HAVE_ICONV
2391
2392
2393 // ============================================================================
2394 // Win32 conversion classes
2395 // ============================================================================
2396
2397 #ifdef wxHAVE_WIN32_MB2WC
2398
2399 // from utils.cpp
2400 #if wxUSE_FONTMAP
2401 extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const char *charset);
2402 extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
2403 #endif
2404
2405 class wxMBConv_win32 : public wxMBConv
2406 {
2407 public:
2408     wxMBConv_win32()
2409     {
2410         m_CodePage = CP_ACP;
2411         m_minMBCharWidth = 0;
2412     }
2413
2414     wxMBConv_win32(const wxMBConv_win32& conv)
2415         : wxMBConv()
2416     {
2417         m_CodePage = conv.m_CodePage;
2418         m_minMBCharWidth = conv.m_minMBCharWidth;
2419     }
2420
2421 #if wxUSE_FONTMAP
2422     wxMBConv_win32(const char* name)
2423     {
2424         m_CodePage = wxCharsetToCodepage(name);
2425         m_minMBCharWidth = 0;
2426     }
2427
2428     wxMBConv_win32(wxFontEncoding encoding)
2429     {
2430         m_CodePage = wxEncodingToCodepage(encoding);
2431         m_minMBCharWidth = 0;
2432     }
2433 #endif // wxUSE_FONTMAP
2434
2435     virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2436     {
2437         // note that we have to use MB_ERR_INVALID_CHARS flag as it without it
2438         // the behaviour is not compatible with the Unix version (using iconv)
2439         // and break the library itself, e.g. wxTextInputStream::NextChar()
2440         // wouldn't work if reading an incomplete MB char didn't result in an
2441         // error
2442         //
2443         // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
2444         // Win XP or newer and it is not supported for UTF-[78] so we always
2445         // use our own conversions in this case. See
2446         //     http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
2447         //     http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
2448         if ( m_CodePage == CP_UTF8 )
2449         {
2450             return wxMBConvUTF8().MB2WC(buf, psz, n);
2451         }
2452
2453         if ( m_CodePage == CP_UTF7 )
2454         {
2455             return wxMBConvUTF7().MB2WC(buf, psz, n);
2456         }
2457
2458         int flags = 0;
2459         if ( (m_CodePage < 50000 && m_CodePage != CP_SYMBOL) &&
2460                 IsAtLeastWin2kSP4() )
2461         {
2462             flags = MB_ERR_INVALID_CHARS;
2463         }
2464
2465         const size_t len = ::MultiByteToWideChar
2466                              (
2467                                 m_CodePage,     // code page
2468                                 flags,          // flags: fall on error
2469                                 psz,            // input string
2470                                 -1,             // its length (NUL-terminated)
2471                                 buf,            // output string
2472                                 buf ? n : 0     // size of output buffer
2473                              );
2474         if ( !len )
2475         {
2476             // function totally failed
2477             return wxCONV_FAILED;
2478         }
2479
2480         // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
2481         // check if we succeeded, by doing a double trip:
2482         if ( !flags && buf )
2483         {
2484             const size_t mbLen = strlen(psz);
2485             wxCharBuffer mbBuf(mbLen);
2486             if ( ::WideCharToMultiByte
2487                    (
2488                       m_CodePage,
2489                       0,
2490                       buf,
2491                       -1,
2492                       mbBuf.data(),
2493                       mbLen + 1,        // size in bytes, not length
2494                       NULL,
2495                       NULL
2496                    ) == 0 ||
2497                   strcmp(mbBuf, psz) != 0 )
2498             {
2499                 // we didn't obtain the same thing we started from, hence
2500                 // the conversion was lossy and we consider that it failed
2501                 return wxCONV_FAILED;
2502             }
2503         }
2504
2505         // note that it returns count of written chars for buf != NULL and size
2506         // of the needed buffer for buf == NULL so in either case the length of
2507         // the string (which never includes the terminating NUL) is one less
2508         return len - 1;
2509     }
2510
2511     virtual size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
2512     {
2513         /*
2514             we have a problem here: by default, WideCharToMultiByte() may
2515             replace characters unrepresentable in the target code page with bad
2516             quality approximations such as turning "1/2" symbol (U+00BD) into
2517             "1" for the code pages which don't have it and we, obviously, want
2518             to avoid this at any price
2519
2520             the trouble is that this function does it _silently_, i.e. it won't
2521             even tell us whether it did or not... Win98/2000 and higher provide
2522             WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
2523             we have to resort to a round trip, i.e. check that converting back
2524             results in the same string -- this is, of course, expensive but
2525             otherwise we simply can't be sure to not garble the data.
2526          */
2527
2528         // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
2529         // it doesn't work with CJK encodings (which we test for rather roughly
2530         // here...) nor with UTF-7/8 nor, of course, with Windows versions not
2531         // supporting it
2532         BOOL usedDef wxDUMMY_INITIALIZE(false);
2533         BOOL *pUsedDef;
2534         int flags;
2535         if ( CanUseNoBestFit() && m_CodePage < 50000 )
2536         {
2537             // it's our lucky day
2538             flags = WC_NO_BEST_FIT_CHARS;
2539             pUsedDef = &usedDef;
2540         }
2541         else // old system or unsupported encoding
2542         {
2543             flags = 0;
2544             pUsedDef = NULL;
2545         }
2546
2547         const size_t len = ::WideCharToMultiByte
2548                              (
2549                                 m_CodePage,     // code page
2550                                 flags,          // either none or no best fit
2551                                 pwz,            // input string
2552                                 -1,             // it is (wide) NUL-terminated
2553                                 buf,            // output buffer
2554                                 buf ? n : 0,    // and its size
2555                                 NULL,           // default "replacement" char
2556                                 pUsedDef        // [out] was it used?
2557                              );
2558
2559         if ( !len )
2560         {
2561             // function totally failed
2562             return wxCONV_FAILED;
2563         }
2564
2565         // we did something, check if we really succeeded
2566         if ( flags )
2567         {
2568             // check if the conversion failed, i.e. if any replacements
2569             // were done
2570             if ( usedDef )
2571                 return wxCONV_FAILED;
2572         }
2573         else // we must resort to double tripping...
2574         {
2575             // first we need to ensure that we really have the MB data: this is
2576             // not the case if we're called with NULL buffer, in which case we
2577             // need to do the conversion yet again
2578             wxCharBuffer bufDef;
2579             if ( !buf )
2580             {
2581                 bufDef = wxCharBuffer(len);
2582                 buf = bufDef.data();
2583                 if ( !::WideCharToMultiByte(m_CodePage, flags, pwz, -1,
2584                                             buf, len, NULL, NULL) )
2585                     return wxCONV_FAILED;
2586             }
2587
2588             if ( !n )
2589                 n = wcslen(pwz);
2590             wxWCharBuffer wcBuf(n);
2591             if ( MB2WC(wcBuf.data(), buf, n + 1) == wxCONV_FAILED ||
2592                     wcscmp(wcBuf, pwz) != 0 )
2593             {
2594                 // we didn't obtain the same thing we started from, hence
2595                 // the conversion was lossy and we consider that it failed
2596                 return wxCONV_FAILED;
2597             }
2598         }
2599
2600         // see the comment above for the reason of "len - 1"
2601         return len - 1;
2602     }
2603
2604     virtual size_t GetMBNulLen() const
2605     {
2606         if ( m_minMBCharWidth == 0 )
2607         {
2608             int len = ::WideCharToMultiByte
2609                         (
2610                             m_CodePage,     // code page
2611                             0,              // no flags
2612                             L"",            // input string
2613                             1,              // translate just the NUL
2614                             NULL,           // output buffer
2615                             0,              // and its size
2616                             NULL,           // no replacement char
2617                             NULL            // [out] don't care if it was used
2618                         );
2619
2620             wxMBConv_win32 * const self = wxConstCast(this, wxMBConv_win32);
2621             switch ( len )
2622             {
2623                 default:
2624                     wxLogDebug(_T("Unexpected NUL length %d"), len);
2625                     self->m_minMBCharWidth = (size_t)-1;
2626                     break;
2627
2628                 case 0:
2629                     self->m_minMBCharWidth = (size_t)-1;
2630                     break;
2631
2632                 case 1:
2633                 case 2:
2634                 case 4:
2635                     self->m_minMBCharWidth = len;
2636                     break;
2637             }
2638         }
2639
2640         return m_minMBCharWidth;
2641     }
2642
2643     virtual wxMBConv *Clone() const { return new wxMBConv_win32(*this); }
2644
2645     bool IsOk() const { return m_CodePage != -1; }
2646
2647 private:
2648     static bool CanUseNoBestFit()
2649     {
2650         static int s_isWin98Or2k = -1;
2651
2652         if ( s_isWin98Or2k == -1 )
2653         {
2654             int verMaj, verMin;
2655             switch ( wxGetOsVersion(&verMaj, &verMin) )
2656             {
2657                 case wxOS_WINDOWS_9X:
2658                     s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
2659                     break;
2660
2661                 case wxOS_WINDOWS_NT:
2662                     s_isWin98Or2k = verMaj >= 5;
2663                     break;
2664
2665                 default:
2666                     // unknown: be conservative by default
2667                     s_isWin98Or2k = 0;
2668                     break;
2669             }
2670
2671             wxASSERT_MSG( s_isWin98Or2k != -1, _T("should be set above") );
2672         }
2673
2674         return s_isWin98Or2k == 1;
2675     }
2676
2677     static bool IsAtLeastWin2kSP4()
2678     {
2679 #ifdef __WXWINCE__
2680         return false;
2681 #else
2682         static int s_isAtLeastWin2kSP4 = -1;
2683
2684         if ( s_isAtLeastWin2kSP4 == -1 )
2685         {
2686             OSVERSIONINFOEX ver;
2687
2688             memset(&ver, 0, sizeof(ver));
2689             ver.dwOSVersionInfoSize = sizeof(ver);
2690             GetVersionEx((OSVERSIONINFO*)&ver);
2691
2692             s_isAtLeastWin2kSP4 =
2693               ((ver.dwMajorVersion > 5) || // Vista+
2694                (ver.dwMajorVersion == 5 && ver.dwMinorVersion > 0) || // XP/2003
2695                (ver.dwMajorVersion == 5 && ver.dwMinorVersion == 0 &&
2696                ver.wServicePackMajor >= 4)) // 2000 SP4+
2697               ? 1 : 0;
2698         }
2699
2700         return s_isAtLeastWin2kSP4 == 1;
2701 #endif
2702     }
2703
2704
2705     // the code page we're working with
2706     long m_CodePage;
2707
2708     // cached result of GetMBNulLen(), set to 0 initially meaning
2709     // "unknown"
2710     size_t m_minMBCharWidth;
2711 };
2712
2713 #endif // wxHAVE_WIN32_MB2WC
2714
2715
2716 // ============================================================================
2717 // wxEncodingConverter based conversion classes
2718 // ============================================================================
2719
2720 #if wxUSE_FONTMAP
2721
2722 class wxMBConv_wxwin : public wxMBConv
2723 {
2724 private:
2725     void Init()
2726     {
2727         // Refuse to use broken wxEncodingConverter code for Mac-specific encodings.
2728         // The wxMBConv_cf class does a better job.
2729         m_ok = (m_enc < wxFONTENCODING_MACMIN || m_enc > wxFONTENCODING_MACMAX) &&
2730                m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
2731                w2m.Init(wxFONTENCODING_UNICODE, m_enc);
2732     }
2733
2734 public:
2735     // temporarily just use wxEncodingConverter stuff,
2736     // so that it works while a better implementation is built
2737     wxMBConv_wxwin(const char* name)
2738     {
2739         if (name)
2740             m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
2741         else
2742             m_enc = wxFONTENCODING_SYSTEM;
2743
2744         Init();
2745     }
2746
2747     wxMBConv_wxwin(wxFontEncoding enc)
2748     {
2749         m_enc = enc;
2750
2751         Init();
2752     }
2753
2754     size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
2755     {
2756         size_t inbuf = strlen(psz);
2757         if (buf)
2758         {
2759             if (!m2w.Convert(psz, buf))
2760                 return wxCONV_FAILED;
2761         }
2762         return inbuf;
2763     }
2764
2765     size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
2766     {
2767         const size_t inbuf = wxWcslen(psz);
2768         if (buf)
2769         {
2770             if (!w2m.Convert(psz, buf))
2771                 return wxCONV_FAILED;
2772         }
2773
2774         return inbuf;
2775     }
2776
2777     virtual size_t GetMBNulLen() const
2778     {
2779         switch ( m_enc )
2780         {
2781             case wxFONTENCODING_UTF16BE:
2782             case wxFONTENCODING_UTF16LE:
2783                 return 2;
2784
2785             case wxFONTENCODING_UTF32BE:
2786             case wxFONTENCODING_UTF32LE:
2787                 return 4;
2788
2789             default:
2790                 return 1;
2791         }
2792     }
2793
2794     virtual wxMBConv *Clone() const { return new wxMBConv_wxwin(m_enc); }
2795
2796     bool IsOk() const { return m_ok; }
2797
2798 public:
2799     wxFontEncoding m_enc;
2800     wxEncodingConverter m2w, w2m;
2801
2802 private:
2803     // were we initialized successfully?
2804     bool m_ok;
2805
2806     DECLARE_NO_COPY_CLASS(wxMBConv_wxwin)
2807 };
2808
2809 // make the constructors available for unit testing
2810 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_wxwin( const char* name )
2811 {
2812     wxMBConv_wxwin* result = new wxMBConv_wxwin( name );
2813     if ( !result->IsOk() )
2814     {
2815         delete result;
2816         return 0;
2817     }
2818
2819     return result;
2820 }
2821
2822 #endif // wxUSE_FONTMAP
2823
2824 // ============================================================================
2825 // wxCSConv implementation
2826 // ============================================================================
2827
2828 void wxCSConv::Init()
2829 {
2830     m_name = NULL;
2831     m_convReal =  NULL;
2832     m_deferred = true;
2833 }
2834
2835 wxCSConv::wxCSConv(const wxString& charset)
2836 {
2837     Init();
2838
2839     if ( !charset.empty() )
2840     {
2841         SetName(charset.ToAscii());
2842     }
2843
2844 #if wxUSE_FONTMAP
2845     m_encoding = wxFontMapperBase::GetEncodingFromName(charset);
2846 #else
2847     m_encoding = wxFONTENCODING_SYSTEM;
2848 #endif
2849 }
2850
2851 wxCSConv::wxCSConv(wxFontEncoding encoding)
2852 {
2853     if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
2854     {
2855         wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
2856
2857         encoding = wxFONTENCODING_SYSTEM;
2858     }
2859
2860     Init();
2861
2862     m_encoding = encoding;
2863 }
2864
2865 wxCSConv::~wxCSConv()
2866 {
2867     Clear();
2868 }
2869
2870 wxCSConv::wxCSConv(const wxCSConv& conv)
2871         : wxMBConv()
2872 {
2873     Init();
2874
2875     SetName(conv.m_name);
2876     m_encoding = conv.m_encoding;
2877 }
2878
2879 wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
2880 {
2881     Clear();
2882
2883     SetName(conv.m_name);
2884     m_encoding = conv.m_encoding;
2885
2886     return *this;
2887 }
2888
2889 void wxCSConv::Clear()
2890 {
2891     free(m_name);
2892     delete m_convReal;
2893
2894     m_name = NULL;
2895     m_convReal = NULL;
2896 }
2897
2898 void wxCSConv::SetName(const char *charset)
2899 {
2900     if (charset)
2901     {
2902         m_name = wxStrdup(charset);
2903         m_deferred = true;
2904     }
2905 }
2906
2907 #if wxUSE_FONTMAP
2908
// Cache used by wxCSConv::DoCreate(): maps an encoding to the charset name
// for which a wxMBConv_iconv could be successfully created or to an empty
// string if this failed for all known names of the encoding.
2909 WX_DECLARE_HASH_MAP( wxFontEncoding, wxString, wxIntegerHash, wxIntegerEqual,
2910                      wxEncodingNameCache );
2911
2912 static wxEncodingNameCache gs_nameCache;
2913 #endif
2914
2915 wxMBConv *wxCSConv::DoCreate() const
2916 {
2917 #if wxUSE_FONTMAP
2918     wxLogTrace(TRACE_STRCONV,
2919                wxT("creating conversion for %s"),
2920                (m_name ? m_name
2921                        : (const char*)wxFontMapperBase::GetEncodingName(m_encoding).mb_str()));
2922 #endif // wxUSE_FONTMAP
2923
2924     // check for the special case of ASCII or ISO8859-1 charset: as we have
2925     // special knowledge of it anyhow, we don't need to create a special
2926     // conversion object
2927     if ( m_encoding == wxFONTENCODING_ISO8859_1 ||
2928             m_encoding == wxFONTENCODING_DEFAULT )
2929     {
2930         // don't convert at all
2931         return NULL;
2932     }
2933
2934     // we trust OS to do conversion better than we can so try external
2935     // conversion methods first
2936     //
2937     // the full order is:
2938     //      1. OS conversion (iconv() under Unix or Win32 API)
2939     //      2. hard coded conversions for UTF
2940     //      3. wxEncodingConverter as fall back
2941
2942     // step (1)
2943 #ifdef HAVE_ICONV
2944 #if !wxUSE_FONTMAP
2945     if ( m_name )
2946 #endif // !wxUSE_FONTMAP
2947     {
2948 #if wxUSE_FONTMAP
2949         wxFontEncoding encoding(m_encoding);
2950 #endif
2951
2952         if ( m_name )
2953         {
2954             wxMBConv_iconv *conv = new wxMBConv_iconv(m_name);
2955             if ( conv->IsOk() )
2956                 return conv;
2957
2958             delete conv;
2959
2960 #if wxUSE_FONTMAP
2961             encoding =
2962                 wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
2963 #endif // wxUSE_FONTMAP
2964         }
2965 #if wxUSE_FONTMAP
2966         {
2967             const wxEncodingNameCache::iterator it = gs_nameCache.find(encoding);
2968             if ( it != gs_nameCache.end() )
2969             {
2970                 if ( it->second.empty() )
2971                     return NULL;
2972
2973                 wxMBConv_iconv *conv = new wxMBConv_iconv(it->second.ToAscii());
2974                 if ( conv->IsOk() )
2975                     return conv;
2976
2977                 delete conv;
2978             }
2979
2980             const wxChar** names = wxFontMapperBase::GetAllEncodingNames(encoding);
2981             // CS : in case this does not return valid names (eg for MacRoman)
2982             // encoding got a 'failure' entry in the cache all the same,
2983             // although it just has to be created using a different method, so
2984             // only store failed iconv creation attempts (or perhaps we
2985             // shoulnd't do this at all ?)
2986             if ( names[0] != NULL )
2987             {
2988                 for ( ; *names; ++names )
2989                 {
2990                     // FIXME-UTF8: wxFontMapperBase::GetAllEncodingNames()
2991                     //             will need changes that will obsolete this
2992                     wxString name(*names);
2993                     wxMBConv_iconv *conv = new wxMBConv_iconv(name.ToAscii());
2994                     if ( conv->IsOk() )
2995                     {
2996                         gs_nameCache[encoding] = *names;
2997                         return conv;
2998                     }
2999
3000                     delete conv;
3001                 }
3002
3003                 gs_nameCache[encoding] = _T(""); // cache the failure
3004             }
3005         }
3006 #endif // wxUSE_FONTMAP
3007     }
3008 #endif // HAVE_ICONV
3009
3010 #ifdef wxHAVE_WIN32_MB2WC
3011     {
3012 #if wxUSE_FONTMAP
3013         wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
3014                                       : new wxMBConv_win32(m_encoding);
3015         if ( conv->IsOk() )
3016             return conv;
3017
3018         delete conv;
3019 #else
3020         return NULL;
3021 #endif
3022     }
3023 #endif // wxHAVE_WIN32_MB2WC
3024
3025 #ifdef __DARWIN__
3026     {
3027         // leave UTF16 and UTF32 to the built-ins of wx
3028         if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
3029             ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
3030         {
3031 #if wxUSE_FONTMAP
3032             wxMBConv_cf *conv = m_name ? new wxMBConv_cf(m_name)
3033                                           : new wxMBConv_cf(m_encoding);
3034 #else
3035             wxMBConv_cf *conv = new wxMBConv_cf(m_encoding);
3036 #endif
3037
3038             if ( conv->IsOk() )
3039                  return conv;
3040
3041             delete conv;
3042         }
3043     }
3044 #endif // __DARWIN__
3045
3046     // step (2)
3047     wxFontEncoding enc = m_encoding;
3048 #if wxUSE_FONTMAP
3049     if ( enc == wxFONTENCODING_SYSTEM && m_name )
3050     {
3051         // use "false" to suppress interactive dialogs -- we can be called from
3052         // anywhere and popping up a dialog from here is the last thing we want to
3053         // do
3054         enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
3055     }
3056 #endif // wxUSE_FONTMAP
3057
3058     switch ( enc )
3059     {
3060         case wxFONTENCODING_UTF7:
3061              return new wxMBConvUTF7;
3062
3063         case wxFONTENCODING_UTF8:
3064              return new wxMBConvUTF8;
3065
3066         case wxFONTENCODING_UTF16BE:
3067              return new wxMBConvUTF16BE;
3068
3069         case wxFONTENCODING_UTF16LE:
3070              return new wxMBConvUTF16LE;
3071
3072         case wxFONTENCODING_UTF32BE:
3073              return new wxMBConvUTF32BE;
3074
3075         case wxFONTENCODING_UTF32LE:
3076              return new wxMBConvUTF32LE;
3077
3078         default:
3079              // nothing to do but put here to suppress gcc warnings
3080              break;
3081     }
3082
3083     // step (3)
3084 #if wxUSE_FONTMAP
3085     {
3086         wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
3087                                       : new wxMBConv_wxwin(m_encoding);
3088         if ( conv->IsOk() )
3089             return conv;
3090
3091         delete conv;
3092     }
3093 #endif // wxUSE_FONTMAP
3094
3095     // NB: This is a hack to prevent deadlock. What could otherwise happen
3096     //     in Unicode build: wxConvLocal creation ends up being here
3097     //     because of some failure and logs the error. But wxLog will try to
3098     //     attach a timestamp, for which it will need wxConvLocal (to convert
3099     //     time to char* and then wchar_t*), but that fails, tries to log the
3100     //     error, but wxLog has an (already locked) critical section that
3101     //     guards the static buffer.
3102     static bool alreadyLoggingError = false;
3103     if (!alreadyLoggingError)
3104     {
3105         alreadyLoggingError = true;
3106         wxLogError(_("Cannot convert from the charset '%s'!"),
3107                    m_name ? m_name
3108                       :
3109 #if wxUSE_FONTMAP
3110                          (const char*)wxFontMapperBase::GetEncodingDescription(m_encoding).ToAscii()
3111 #else // !wxUSE_FONTMAP
3112                          (const char*)wxString::Format(_("encoding %i"), m_encoding).ToAscii()
3113 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
3114               );
3115
3116         alreadyLoggingError = false;
3117     }
3118
3119     return NULL;
3120 }
3121
3122 void wxCSConv::CreateConvIfNeeded() const
3123 {
3124     if ( m_deferred )
3125     {
3126         wxCSConv *self = (wxCSConv *)this; // const_cast
3127
3128         // if we don't have neither the name nor the encoding, use the default
3129         // encoding for this system
3130         if ( !m_name && m_encoding == wxFONTENCODING_SYSTEM )
3131         {
3132 #if wxUSE_INTL
3133             self->m_encoding = wxLocale::GetSystemEncoding();
3134 #else
3135             // fallback to some reasonable default:
3136             self->m_encoding = wxFONTENCODING_ISO8859_1;
3137 #endif // wxUSE_INTL
3138         }
3139
3140         self->m_convReal = DoCreate();
3141         self->m_deferred = false;
3142     }
3143 }
3144
3145 bool wxCSConv::IsOk() const
3146 {
3147     CreateConvIfNeeded();
3148
3149     // special case: no convReal created for wxFONTENCODING_ISO8859_1
3150     if ( m_encoding == wxFONTENCODING_ISO8859_1 )
3151         return true; // always ok as we do it ourselves
3152
3153     // m_convReal->IsOk() is called at its own creation, so we know it must
3154     // be ok if m_convReal is non-NULL
3155     return m_convReal != NULL;
3156 }
3157
3158 size_t wxCSConv::ToWChar(wchar_t *dst, size_t dstLen,
3159                          const char *src, size_t srcLen) const
3160 {
3161     CreateConvIfNeeded();
3162
3163     if (m_convReal)
3164         return m_convReal->ToWChar(dst, dstLen, src, srcLen);
3165
3166     // latin-1 (direct)
3167     if ( srcLen == wxNO_LEN )
3168         srcLen = strlen(src) + 1; // take trailing NUL too
3169
3170     if ( dst )
3171     {
3172         if ( dstLen < srcLen )
3173             return wxCONV_FAILED;
3174
3175         for ( size_t n = 0; n < srcLen; n++ )
3176             dst[n] = (unsigned char)(src[n]);
3177     }
3178
3179     return srcLen;
3180 }
3181
3182 size_t wxCSConv::FromWChar(char *dst, size_t dstLen,
3183                            const wchar_t *src, size_t srcLen) const
3184 {
3185     CreateConvIfNeeded();
3186
3187     if (m_convReal)
3188         return m_convReal->FromWChar(dst, dstLen, src, srcLen);
3189
3190     // latin-1 (direct)
3191     if ( srcLen == wxNO_LEN )
3192         srcLen = wxWcslen(src) + 1;
3193
3194     if ( dst )
3195     {
3196         if ( dstLen < srcLen )
3197             return wxCONV_FAILED;
3198
3199         for ( size_t n = 0; n < srcLen; n++ )
3200         {
3201             if ( src[n] > 0xFF )
3202                 return wxCONV_FAILED;
3203
3204             dst[n] = (char)src[n];
3205         }
3206
3207     }
3208     else // still need to check the input validity
3209     {
3210         for ( size_t n = 0; n < srcLen; n++ )
3211         {
3212             if ( src[n] > 0xFF )
3213                 return wxCONV_FAILED;
3214         }
3215     }
3216
3217     return srcLen;
3218 }
3219
3220 size_t wxCSConv::GetMBNulLen() const
3221 {
3222     CreateConvIfNeeded();
3223
3224     if ( m_convReal )
3225     {
3226         return m_convReal->GetMBNulLen();
3227     }
3228
3229     // otherwise, we are ISO-8859-1
3230     return 1;
3231 }
3232
3233 #if wxUSE_UNICODE_UTF8
3234 bool wxCSConv::IsUTF8() const
3235 {
3236     CreateConvIfNeeded();
3237
3238     if ( m_convReal )
3239     {
3240         return m_convReal->IsUTF8();
3241     }
3242
3243     // otherwise, we are ISO-8859-1
3244     return false;
3245 }
3246 #endif
3247
3248
3249 #if wxUSE_UNICODE
3250
3251 wxWCharBuffer wxSafeConvertMB2WX(const char *s)
3252 {
3253     if ( !s )
3254         return wxWCharBuffer();
3255
3256     wxWCharBuffer wbuf(wxConvLibc.cMB2WX(s));
3257     if ( !wbuf )
3258         wbuf = wxMBConvUTF8().cMB2WX(s);
3259     if ( !wbuf )
3260         wbuf = wxConvISO8859_1.cMB2WX(s);
3261
3262     return wbuf;
3263 }
3264
3265 wxCharBuffer wxSafeConvertWX2MB(const wchar_t *ws)
3266 {
3267     if ( !ws )
3268         return wxCharBuffer();
3269
3270     wxCharBuffer buf(wxConvLibc.cWX2MB(ws));
3271     if ( !buf )
3272         buf = wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL).cWX2MB(ws);
3273
3274     return buf;
3275 }
3276
3277 #endif // wxUSE_UNICODE
3278
3279 // ----------------------------------------------------------------------------
3280 // globals
3281 // ----------------------------------------------------------------------------
3282
3283 // NB: The reason why we create converter objects in this convoluted way,
3284 //     using a factory function instead of global variable, is that they
3285 //     may be used at static initialization time (some of them are used by
3286 //     wxString ctors and there may be a global wxString object). In other
3287 //     words, possibly _before_ the converter global object would be
3288 //     initialized.
3289
3290 #undef wxConvLibc
3291 #undef wxConvUTF8
3292 #undef wxConvUTF7
3293 #undef wxConvLocal
3294 #undef wxConvISO8859_1
3295
// Define everything needed for the global converter called name:
//  - the exported pointer name##Ptr, initially NULL
//  - the exported accessor wxGet_##name##Ptr() returning the address of a
//    function-local static object of class impl_klass constructed with
//    ctor_args, as a pointer to klass
//  - a file-static pointer initialized by calling the accessor
3296 #define WX_DEFINE_GLOBAL_CONV2(klass, impl_klass, name, ctor_args)      \
3297     WXDLLIMPEXP_DATA_BASE(klass*) name##Ptr = NULL;                     \
3298     WXDLLIMPEXP_BASE klass* wxGet_##name##Ptr()                         \
3299     {                                                                   \
3300         static impl_klass name##Obj ctor_args;                          \
3301         return &name##Obj;                                              \
3302     }                                                                   \
3303     /* this ensures that all global converter objects are created */    \
3304     /* by the time static initialization is done, i.e. before any */    \
3305     /* thread is launched: */                                           \
3306     static klass* gs_##name##instance = wxGet_##name##Ptr()
3307
// Same as WX_DEFINE_GLOBAL_CONV2 when the implementation class is the same as
// the class of the pointer.
3308 #define WX_DEFINE_GLOBAL_CONV(klass, name, ctor_args) \
3309     WX_DEFINE_GLOBAL_CONV2(klass, klass, name, ctor_args)
3310
// wxConvLibc is a wxMBConv implemented by wxMBConv_win32 under Windows and by
// wxMBConvLibc elsewhere, default-constructed in both cases.
3311 #ifdef __WINDOWS__
3312     WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConv_win32, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
3313 #else
3314     WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConvLibc, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
3315 #endif
3316
3317 // NB: we can't use wxEMPTY_PARAMETER_VALUE as final argument here because it's
3318 //     passed to WX_DEFINE_GLOBAL_CONV2 after a macro expansion and so still
3319 //     provokes an error message about "not enough macro parameters"; and we
3320 //     can't use "()" here as the name##Obj declaration would be parsed as a
3321 //     function declaration then, so use a semicolon and live with an extra
3322 //     empty statement (and hope that no compiler warns about this)
3323 WX_DEFINE_GLOBAL_CONV(wxMBConvStrictUTF8, wxConvUTF8, ;);
3324 WX_DEFINE_GLOBAL_CONV(wxMBConvUTF7, wxConvUTF7, ;);
3325
// both of these are wxCSConv objects differing only in their encoding
3326 WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvLocal, (wxFONTENCODING_SYSTEM));
3327 WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvISO8859_1, (wxFONTENCODING_ISO8859_1));
3328
// wxConvCurrent initially points to the wxConvLibc object and wxConvUI to the
// wxConvLocal one
3329 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = wxGet_wxConvLibcPtr();
3330 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvUI = wxGet_wxConvLocalPtr();
3331
3332 #ifdef __DARWIN__
3333 // The xnu kernel always communicates file paths in decomposed UTF-8.
3334 // WARNING: Are we sure that CFString's conversion will cause decomposition?
3335 static wxMBConv_cf wxConvMacUTF8DObj(wxFONTENCODING_UTF8);
3336 #endif
3337
// wxConvFileName points to the UTF-8 wxMBConv_cf object defined above under
// Darwin and to the wxConvLibc object elsewhere
3338 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName =
3339 #ifdef __DARWIN__
3340                                     &wxConvMacUTF8DObj;
3341 #else // !__DARWIN__
3342                                     wxGet_wxConvLibcPtr();
3343 #endif // __DARWIN__/!__DARWIN__
3344
3345 #else // !wxUSE_WCHAR_T
3346
3347 // FIXME-UTF8: remove this, wxUSE_WCHAR_T is required now
3348 // stand-ins in absence of wchar_t: the converters are defined as plain
// wxMBConv objects in this case
3349 WXDLLIMPEXP_DATA_BASE(wxMBConv) wxConvLibc,
3350                                 wxConvISO8859_1,
3351                                 wxConvLocal,
3352                                 wxConvUTF8;
3353
3354 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T