src/common/strconv.cpp

   1 /////////////////////////////////////////////////////////////////////////////
   2 // Name:        src/common/strconv.cpp
   3 // Purpose:     Unicode conversion classes
   4 // Author:      Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
   5 //              Ryan Norton, Fredrik Roubert (UTF7)
   6 // Modified by:
   7 // Created:     29/01/98
   8 // RCS-ID:      $Id$
   9 // Copyright:   (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
  10 //              (c) 2000-2003 Vadim Zeitlin
  11 //              (c) 2004 Ryan Norton, Fredrik Roubert
  12 // Licence:     wxWindows licence
  13 /////////////////////////////////////////////////////////////////////////////
  14
  15 // For compilers that support precompilation, includes "wx.h".
  16 #include "wx/wxprec.h"
  17
  18 #ifdef __BORLANDC__
  19     #pragma hdrstop
  20 #endif  //__BORLANDC__
  21
  22 #ifndef WX_PRECOMP
  23     #include "wx/intl.h"
  24     #include "wx/log.h"
  25     #include "wx/utils.h"
  26     #include "wx/hashmap.h"
  27 #endif
  28
  29 #include "wx/strconv.h"
  30
  31 #if wxUSE_WCHAR_T
  32
  33 #ifndef __WXWINCE__
  34 #include <errno.h>
  35 #endif
  36
  37 #include <ctype.h>
  38 #include <string.h>
  39 #include <stdlib.h>
  40
  41 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
  42     #include "wx/msw/private.h"
  43     #include "wx/msw/missing.h"
  44     #define wxHAVE_WIN32_MB2WC
  45 #endif
  46
  47 #ifdef HAVE_ICONV
  48     #include <iconv.h>
  49     #include "wx/thread.h"
  50 #endif
  51
  52 #include "wx/encconv.h"
  53 #include "wx/fontmap.h"
  54
  55 #ifdef __DARWIN__
  56 #include "wx/osx/core/private/strconv_cf.h"
  57 #endif //def __DARWIN__
  58
  59
  60 #define TRACE_STRCONV _T("strconv")
  61
  62 // WC_UTF16 is defined only if sizeof(wchar_t) == 2, otherwise it's supposed to
  63 // be 4 bytes
  64 #if SIZEOF_WCHAR_T == 2
  65     #define WC_UTF16
  66 #endif
  67
  68
  69 // ============================================================================
  70 // implementation
  71 // ============================================================================
  72
  73 // helper function of cMB2WC(): check if n bytes at this location are all NUL
  74 static bool NotAllNULs(const char *p, size_t n)
  75 {
  76     while ( n && *p++ == '\0' )
  77         n--;
  78
  79     return n != 0;
  80 }
  81
  82 // ----------------------------------------------------------------------------
  83 // UTF-16 en/decoding to/from UCS-4 with surrogates handling
  84 // ----------------------------------------------------------------------------
  85
  86 static size_t encode_utf16(wxUint32 input, wxUint16 *output)
  87 {
  88     if (input <= 0xffff)
  89     {
  90         if (output)
  91             *output = (wxUint16) input;
  92
  93         return 1;
  94     }
  95     else if (input >= 0x110000)
  96     {
  97         return wxCONV_FAILED;
  98     }
  99     else
 100     {
 101         if (output)
 102         {
 103             *output++ = (wxUint16) ((input >> 10) + 0xd7c0);
 104             *output = (wxUint16) ((input & 0x3ff) + 0xdc00);
 105         }
 106
 107         return 2;
 108     }
 109 }
 110
 111 static size_t decode_utf16(const wxUint16* input, wxUint32& output)
 112 {
 113     if ((*input < 0xd800) || (*input > 0xdfff))
 114     {
 115         output = *input;
 116         return 1;
 117     }
 118     else if ((input[1] < 0xdc00) || (input[1] > 0xdfff))
 119     {
 120         output = *input;
 121         return wxCONV_FAILED;
 122     }
 123     else
 124     {
 125         output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
 126         return 2;
 127     }
 128 }
 129
 130 #ifdef WC_UTF16
 131     typedef wchar_t wxDecodeSurrogate_t;
 132 #else // !WC_UTF16
 133     typedef wxUint16 wxDecodeSurrogate_t;
 134 #endif // WC_UTF16/!WC_UTF16
 135
 136 // returns the next UTF-32 character from the wchar_t buffer and advances the
 137 // pointer to the character after this one
 138 //
 139 // if an invalid character is found, *pSrc is set to NULL, the caller must
 140 // check for this
 141 static wxUint32 wxDecodeSurrogate(const wxDecodeSurrogate_t **pSrc)
 142 {
 143     wxUint32 out;
 144     const size_t
 145         n = decode_utf16(wx_reinterpret_cast(const wxUint16 *, *pSrc), out);
 146     if ( n == wxCONV_FAILED )
 147         *pSrc = NULL;
 148     else
 149         *pSrc += n;
 150
 151     return out;
 152 }
 153
 154 // ----------------------------------------------------------------------------
 155 // wxMBConv
 156 // ----------------------------------------------------------------------------
 157
// Default implementation of the multibyte to wide char conversion, written in
// terms of the old MB2WC() function.
//
// Parameters:
//  dst     output buffer or NULL to only compute the length of the result
//  dstLen  size of dst in wide characters, only checked if dst is not NULL
//  src     the multibyte input
//  srcLen  length of src in bytes or wxNO_LEN if src is NUL-terminated
//
// Returns the number of wide characters [which would be] written to dst; when
// srcLen is wxNO_LEN this count includes the trailing NUL. wxCONV_FAILED is
// returned if GetMBNulLen() or MB2WC() fails or if dst is too small.
size_t
wxMBConv::ToWChar(wchar_t *dst, size_t dstLen,
                  const char *src, size_t srcLen) const
{
    // although new conversion classes are supposed to implement this function
    // directly, the existing ones only implement the old MB2WC() and so, to
    // avoid to have to rewrite all conversion classes at once, we provide a
    // default (but not efficient) implementation of this one in terms of the
    // old function by copying the input to ensure that it's NUL-terminated and
    // then using MB2WC() to convert it

    // the number of chars [which would be] written to dst [if it were not NULL]
    size_t dstWritten = 0;

    // the number of NULs terminating this string
    size_t nulLen = 0;  // not really needed, but just to avoid warnings

    // if we were not given the input size we just have to assume that the
    // string is properly terminated as we have no way of knowing how long it
    // is anyhow, but if we do have the size check whether there are enough
    // NULs at the end
    wxCharBuffer bufTmp;
    const char *srcEnd;
    if ( srcLen != wxNO_LEN )
    {
        // we need to know how to find the end of this string
        nulLen = GetMBNulLen();
        if ( nulLen == wxCONV_FAILED )
            return wxCONV_FAILED;

        // if there are enough NULs we can avoid the copy
        if ( srcLen < nulLen || NotAllNULs(src + srcLen - nulLen, nulLen) )
        {
            // make a copy in order to properly NUL-terminate the string
            bufTmp = wxCharBuffer(srcLen + nulLen - 1 /* 1 will be added */);
            char * const p = bufTmp.data();
            memcpy(p, src, srcLen);
            for ( char *s = p + srcLen; s < p + srcLen + nulLen; s++ )
                *s = '\0';

            src = bufTmp;
        }

        srcEnd = src + srcLen;
    }
    else // quit after the first loop iteration
    {
        srcEnd = NULL;
    }

    // each iteration converts one NUL-terminated chunk of the input
    for ( ;; )
    {
        // try to convert the current chunk
        size_t lenChunk = MB2WC(NULL, src, 0);
        if ( lenChunk == wxCONV_FAILED )
            return wxCONV_FAILED;

        // the terminating NUL is only counted for NUL-terminated input
        dstWritten += lenChunk;
        if ( !srcEnd )
            dstWritten++;

        // NOTE(review): an empty chunk ends the conversion even if srcEnd
        // hasn't been reached yet, so any input following two consecutive
        // terminators is not converted -- confirm that this is intended
        if ( !lenChunk )
        {
            // nothing left in the input string, conversion succeeded
            break;
        }

        if ( dst )
        {
            if ( dstWritten > dstLen )
                return wxCONV_FAILED;

            // +1 is for trailing NUL
            if ( MB2WC(dst, src, lenChunk + 1) == wxCONV_FAILED )
                return wxCONV_FAILED;

            dst += lenChunk;
            if ( !srcEnd )
                dst++;
        }

        if ( !srcEnd )
        {
            // we convert just one chunk in this case as this is the entire
            // string anyhow
            break;
        }

        // advance the input pointer past the end of this chunk
        while ( NotAllNULs(src, nulLen) )
        {
            // notice that we must skip over multiple bytes here as we suppose
            // that if NUL takes 2 or 4 bytes, then all the other characters do
            // too and so if advanced by a single byte we might erroneously
            // detect sequences of NUL bytes in the middle of the input
            src += nulLen;
        }

        src += nulLen; // skipping over its terminator as well

        // note that ">=" (and not just "==") is needed here as the terminator
        // we skipped just above could be inside or just after the buffer
        // delimited by srcEnd
        if ( src >= srcEnd )
            break;
    }

    return dstWritten;
}
 267
 268 size_t
 269 wxMBConv::FromWChar(char *dst, size_t dstLen,
 270                     const wchar_t *src, size_t srcLen) const
 271 {
 272     // the number of chars [which would be] written to dst [if it were not NULL]
 273     size_t dstWritten = 0;
 274
 275     // if we don't know its length we have no choice but to assume that it is
 276     // NUL-terminated (notice that it can still be NUL-terminated even if
 277     // explicit length is given but it doesn't change our return value)
 278     const bool isNulTerminated = srcLen == wxNO_LEN;
 279
 280     // make a copy of the input string unless it is already properly
 281     // NUL-terminated
 282     wxWCharBuffer bufTmp;
 283     if ( isNulTerminated )
 284     {
 285         srcLen = wxWcslen(src) + 1;
 286     }
 287     else if ( srcLen != 0 && src[srcLen - 1] != L'\0' )
 288     {
 289         // make a copy in order to properly NUL-terminate the string
 290         bufTmp = wxWCharBuffer(srcLen);
 291         memcpy(bufTmp.data(), src, srcLen * sizeof(wchar_t));
 292         src = bufTmp;
 293     }
 294
 295     const size_t lenNul = GetMBNulLen();
 296     for ( const wchar_t * const srcEnd = src + srcLen;
 297           src < srcEnd;
 298           src += wxWcslen(src) + 1 /* skip L'\0' too */ )
 299     {
 300         // try to convert the current chunk
 301         size_t lenChunk = WC2MB(NULL, src, 0);
 302
 303         if ( lenChunk == wxCONV_FAILED )
 304             return wxCONV_FAILED;
 305
 306         dstWritten += lenChunk;
 307         if ( isNulTerminated )
 308             dstWritten += lenNul;
 309
 310         if ( dst )
 311         {
 312             if ( dstWritten > dstLen )
 313                 return wxCONV_FAILED;
 314
 315             if ( WC2MB(dst, src, lenChunk + lenNul) == wxCONV_FAILED )
 316                 return wxCONV_FAILED;
 317
 318             dst += lenChunk;
 319             if ( isNulTerminated )
 320                 dst += lenNul;
 321         }
 322     }
 323
 324     return dstWritten;
 325 }
 326
 327 size_t wxMBConv::MB2WC(wchar_t *outBuff, const char *inBuff, size_t outLen) const
 328 {
 329     size_t rc = ToWChar(outBuff, outLen, inBuff);
 330     if ( rc != wxCONV_FAILED )
 331     {
 332         // ToWChar() returns the buffer length, i.e. including the trailing
 333         // NUL, while this method doesn't take it into account
 334         rc--;
 335     }
 336
 337     return rc;
 338 }
 339
 340 size_t wxMBConv::WC2MB(char *outBuff, const wchar_t *inBuff, size_t outLen) const
 341 {
 342     size_t rc = FromWChar(outBuff, outLen, inBuff);
 343     if ( rc != wxCONV_FAILED )
 344     {
 345         rc -= GetMBNulLen();
 346     }
 347
 348     return rc;
 349 }
 350
// Empty destructor, defined out of line in this file.
wxMBConv::~wxMBConv()
{
    // nothing to do here (necessary for Darwin linking probably)
}
 355
 356 const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
 357 {
 358     if ( psz )
 359     {
 360         // calculate the length of the buffer needed first
 361         const size_t nLen = ToWChar(NULL, 0, psz);
 362         if ( nLen != wxCONV_FAILED )
 363         {
 364             // now do the actual conversion
 365             wxWCharBuffer buf(nLen - 1 /* +1 added implicitly */);
 366
 367             // +1 for the trailing NULL
 368             if ( ToWChar(buf.data(), nLen, psz) != wxCONV_FAILED )
 369                 return buf;
 370         }
 371     }
 372
 373     return wxWCharBuffer();
 374 }
 375
 376 const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
 377 {
 378     if ( pwz )
 379     {
 380         const size_t nLen = FromWChar(NULL, 0, pwz);
 381         if ( nLen != wxCONV_FAILED )
 382         {
 383             wxCharBuffer buf(nLen - 1);
 384             if ( FromWChar(buf.data(), nLen, pwz) != wxCONV_FAILED )
 385                 return buf;
 386         }
 387     }
 388
 389     return wxCharBuffer();
 390 }
 391
 392 const wxWCharBuffer
 393 wxMBConv::cMB2WC(const char *inBuff, size_t inLen, size_t *outLen) const
 394 {
 395     const size_t dstLen = ToWChar(NULL, 0, inBuff, inLen);
 396     if ( dstLen != wxCONV_FAILED )
 397     {
 398         // notice that we allocate space for dstLen+1 wide characters here
 399         // because we want the buffer to always be NUL-terminated, even if the
 400         // input isn't (as otherwise the caller has no way to know its length)
 401         wxWCharBuffer wbuf(dstLen);
 402         wbuf.data()[dstLen] = L'\0';
 403         if ( ToWChar(wbuf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
 404         {
 405             if ( outLen )
 406             {
 407                 *outLen = dstLen;
 408
 409                 // we also need to handle NUL-terminated input strings
 410                 // specially: for them the output is the length of the string
 411                 // excluding the trailing NUL, however if we're asked to
 412                 // convert a specific number of characters we return the length
 413                 // of the resulting output even if it's NUL-terminated
 414                 if ( inLen == wxNO_LEN )
 415                     (*outLen)--;
 416             }
 417
 418             return wbuf;
 419         }
 420     }
 421
 422     if ( outLen )
 423         *outLen = 0;
 424
 425     return wxWCharBuffer();
 426 }
 427
 428 const wxCharBuffer
 429 wxMBConv::cWC2MB(const wchar_t *inBuff, size_t inLen, size_t *outLen) const
 430 {
 431     size_t dstLen = FromWChar(NULL, 0, inBuff, inLen);
 432     if ( dstLen != wxCONV_FAILED )
 433     {
 434         const size_t nulLen = GetMBNulLen();
 435
 436         // as above, ensure that the buffer is always NUL-terminated, even if
 437         // the input is not
 438         wxCharBuffer buf(dstLen + nulLen - 1);
 439         memset(buf.data() + dstLen, 0, nulLen);
 440         if ( FromWChar(buf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
 441         {
 442             if ( outLen )
 443             {
 444                 *outLen = dstLen;
 445
 446                 if ( inLen == wxNO_LEN )
 447                 {
 448                     // in this case both input and output are NUL-terminated
 449                     // and we're not supposed to count NUL
 450                     *outLen -= nulLen;
 451                 }
 452             }
 453
 454             return buf;
 455         }
 456     }
 457
 458     if ( outLen )
 459         *outLen = 0;
 460
 461     return wxCharBuffer();
 462 }
 463
 464 // ----------------------------------------------------------------------------
 465 // wxMBConvLibc
 466 // ----------------------------------------------------------------------------
 467
// Simply forwards all of its arguments to wxMB2WC() and returns its result.
// NOTE(review): wxMB2WC() is presumably the wrapper for the C library
// multibyte to wide char conversion -- confirm against its declaration.
size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
{
    return wxMB2WC(buf, psz, n);
}
 472
// Simply forwards all of its arguments to wxWC2MB() and returns its result.
// NOTE(review): wxWC2MB() is presumably the wrapper for the C library wide
// char to multibyte conversion -- confirm against its declaration.
size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
{
    return wxWC2MB(buf, psz, n);
}
 477
 478 // ----------------------------------------------------------------------------
 479 // wxConvBrokenFileNames
 480 // ----------------------------------------------------------------------------
 481
 482 #ifdef __UNIX__
 483
 484 wxConvBrokenFileNames::wxConvBrokenFileNames(const wxString& charset)
 485 {
 486     if ( wxStricmp(charset, _T("UTF-8")) == 0 ||
 487          wxStricmp(charset, _T("UTF8")) == 0  )
 488         m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_PUA);
 489     else
 490         m_conv = new wxCSConv(charset);
 491 }
 492
 493 #endif // __UNIX__
 494
 495 // ----------------------------------------------------------------------------
 496 // UTF-7
 497 // ----------------------------------------------------------------------------
 498
 499 // Implementation (C) 2004 Fredrik Roubert
 500 //
 501 // Changes to work in streaming mode (C) 2008 Vadim Zeitlin
 502
 503 //
 504 // BASE64 decoding table
 505 //
static const unsigned char utf7unb64[] =
{
    // this table is indexed by the byte value and has 256 entries: the
    // entries for the base64 alphabet characters ('A'-'Z', 'a'-'z', '0'-'9',
    // '+' and '/') contain their 6 bit value (0x00..0x3f) and all the other
    // ones are set to 0xff
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
    0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b,
    0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
    0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
    0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
    0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
    0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
    0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30,
    0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
};
 541
 542 size_t wxMBConvUTF7::ToWChar(wchar_t *dst, size_t dstLen,
 543                              const char *src, size_t srcLen) const
 544 {
 545     DecoderState stateOrig,
 546                 *statePtr;
 547     if ( srcLen == wxNO_LEN )
 548     {
 549         // convert the entire string, up to and including the trailing NUL
 550         srcLen = strlen(src) + 1;
 551
 552         // when working on the entire strings we don't update nor use the shift
 553         // state from the previous call
 554         statePtr = &stateOrig;
 555     }
 556     else // when working with partial strings we do use the shift state
 557     {
 558         statePtr = wx_const_cast(DecoderState *, &m_stateDecoder);
 559
 560         // also save the old state to be able to rollback to it on error
 561         stateOrig = m_stateDecoder;
 562     }
 563
 564     // but to simplify the code below we use this variable in both cases
 565     DecoderState& state = *statePtr;
 566
 567
 568     // number of characters [which would have been] written to dst [if it were
 569     // not NULL]
 570     size_t len = 0;
 571
 572     const char * const srcEnd = src + srcLen;
 573
 574     while ( (src < srcEnd) && (!dst || (len < dstLen)) )
 575     {
 576         const unsigned char cc = *src++;
 577
 578         if ( state.IsShifted() )
 579         {
 580             const unsigned char dc = utf7unb64[cc];
 581             if ( dc == 0xff )
 582             {
 583                 // end of encoded part, check that nothing was left: the bit
 584                 // field cycles through 0,6,4,2 sequence so check that we're at
 585                 // the end of it
 586                 if ( state.bit != 2 )
 587                     return wxCONV_FAILED;
 588
 589                 state.ToDirect();
 590
 591                 // re-parse this character normally below unless it's '-' which
 592                 // is consumed by the decoder
 593                 if ( cc == '-' )
 594                     continue;
 595             }
 596             else // valid encoded character
 597             {
 598                 // mini base64 decoder: each character is 6 bits
 599                 state.bit += 6;
 600                 state.accum <<= 6;
 601                 state.accum += dc;
 602
 603                 if ( state.bit >= 8 )
 604                 {
 605                     // got the full byte, consume it
 606                     state.bit -= 8;
 607                     unsigned char b = (state.accum >> state.bit) & 0x00ff;
 608
 609                     if ( state.isLSB )
 610                     {
 611                         // we've got the full word, output it
 612                         if ( dst )
 613                             *dst++ = (state.msb << 8) | b;
 614                         len++;
 615                         state.isLSB = false;
 616                     }
 617                     else // MSB
 618                     {
 619                         // just store it while we wait for LSB
 620                         state.msb = b;
 621                         state.isLSB = true;
 622                     }
 623                 }
 624             }
 625         }
 626
 627         if ( state.IsDirect() )
 628         {
 629             // start of an encoded segment?
 630             if ( cc == '+' )
 631             {
 632                 if ( *src == '-' )
 633                 {
 634                     // just the encoded plus sign, don't switch to shifted mode
 635                     if ( dst )
 636                         *dst++ = '+';
 637                     len++;
 638                     src++;
 639                 }
 640                 else
 641                 {
 642                     state.ToShifted();
 643                 }
 644             }
 645             else // not '+'
 646             {
 647                 // only printable 7 bit ASCII characters (with the exception of
 648                 // NUL, TAB, CR and LF) can be used directly
 649                 if ( cc >= 0x7f || (cc < ' ' &&
 650                       !(cc == '\0' || cc == '\t' || cc == '\r' || cc == '\n')) )
 651                     return wxCONV_FAILED;
 652
 653                 if ( dst )
 654                     *dst++ = cc;
 655                 len++;
 656             }
 657         }
 658     }
 659
 660     if ( !len )
 661     {
 662         // as we didn't read any characters we should be called with the same
 663         // data (followed by some more new data) again later so don't save our
 664         // state
 665         state = stateOrig;
 666
 667         return wxCONV_FAILED;
 668     }
 669
 670     return len;
 671 }
 672
 673 //
 674 // BASE64 encoding table
 675 //
static const unsigned char utf7enb64[] =
{
    // this table has 64 entries and maps a 6 bit value (0..63) to the
    // corresponding character of the base64 alphabet
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
    'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
    'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
    'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
    'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
    'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
    'w', 'x', 'y', 'z', '0', '1', '2', '3',
    '4', '5', '6', '7', '8', '9', '+', '/'
};
 687
 688 //
 689 // UTF-7 encoding table
 690 //
 691 // 0 - Set D (directly encoded characters)
 692 // 1 - Set O (optional direct characters)
 693 // 2 - whitespace characters (optional)
 694 // 3 - special characters
 695 //
 696 static const unsigned char utf7encode[128] =
 697 {
 698     0, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
 699     3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
 700     2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
 701     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
 702     1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 703     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
 704     1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 705     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
 706 };
 707
 708 static inline bool wxIsUTF7Direct(wchar_t wc)
 709 {
 710     return wc < 0x80 && utf7encode[wc] < 1;
 711 }
 712
// Encodes wide characters as UTF-7.
//
// Parameters:
//  dst     output buffer or NULL to only compute the length of the result
//  dstLen  size of dst in bytes, only used if dst is not NULL
//  src     the wide character input
//  srcLen  length of src in wide characters or wxNO_LEN if src is
//          NUL-terminated; in the latter case the trailing NUL is converted
//          too and the encoder state stored in this object is neither used
//          nor updated
//
// Returns the number of bytes [which would be] written to dst or
// wxCONV_FAILED if, when WC_UTF16 is not defined, a character greater than
// 0xffff is encountered.
size_t wxMBConvUTF7::FromWChar(char *dst, size_t dstLen,
                               const wchar_t *src, size_t srcLen) const
{
    EncoderState stateOrig,
                *statePtr;
    if ( srcLen == wxNO_LEN )
    {
        // we don't apply the stored state when operating on entire strings at
        // once
        statePtr = &stateOrig;

        srcLen = wxWcslen(src) + 1;
    }
    else // do use the mode we left the output in previously
    {
        stateOrig = m_stateEncoder;
        statePtr = wx_const_cast(EncoderState *, &m_stateEncoder);
    }

    EncoderState& state = *statePtr;


    // number of bytes [which would have been] written to dst [if it were not
    // NULL]
    size_t len = 0;

    const wchar_t * const srcEnd = src + srcLen;
    while ( src < srcEnd && (!dst || len < dstLen) )
    {
        wchar_t cc = *src++;
        if ( wxIsUTF7Direct(cc) )
        {
            // a directly encoded character terminates the current base64
            // encoded sequence, if any
            if ( state.IsShifted() )
            {
                // pad with zeros the last encoded block if necessary
                if ( state.bit )
                {
                    if ( dst )
                        *dst++ = utf7enb64[((state.accum % 16) << (6 - state.bit)) % 64];
                    len++;
                }

                state.ToDirect();

                if ( dst )
                    *dst++ = '-';
                len++;
            }

            if ( dst )
                *dst++ = (char)cc;
            len++;
        }
        else if ( cc == '+' && state.IsDirect() )
        {
            // the plus sign itself is written as "+-" in direct mode
            if ( dst )
            {
                *dst++ = '+';
                *dst++ = '-';
            }

            len += 2;
        }
#ifndef WC_UTF16
        else if (((wxUint32)cc) > 0xffff)
        {
            // no surrogate pair generation (yet?)
            return wxCONV_FAILED;
        }
#endif
        else
        {
            // this character must be base64 encoded, start a new encoded
            // sequence if we're not inside one already
            if ( state.IsDirect() )
            {
                state.ToShifted();

                if ( dst )
                    *dst++ = '+';
                len++;
            }

            // BASE64 encode string
            //
            // NOTE(review): dstLen is only checked in the outer loop
            // condition and not while this loop (or the code above writing
            // more than one byte per iteration) fills dst -- confirm that the
            // callers always provide a buffer of the size computed by a
            // previous call with NULL dst
            for ( ;; )
            {
                // feed the MSB and then the LSB of the character to the
                // encoder
                for ( unsigned lsb = 0; lsb < 2; lsb++ )
                {
                    state.accum <<= 8;
                    state.accum += lsb ? cc & 0xff : (cc & 0xff00) >> 8;

                    // output all the complete 6 bit groups we have so far
                    for (state.bit += 8; state.bit >= 6; )
                    {
                        state.bit -= 6;
                        if ( dst )
                            *dst++ = utf7enb64[(state.accum >> state.bit) % 64];
                        len++;
                    }
                }

                // continue with the following characters until the end of
                // the input or the first direct one, which is left to the
                // outer loop
                //
                // NOTE(review): the characters consumed here don't go through
                // the > 0xffff check above -- confirm whether this matters
                if ( src == srcEnd || wxIsUTF7Direct(cc = *src) )
                    break;

                src++;
            }
        }
    }

    // we need to restore the original encoder state if we were called just to
    // calculate the amount of space needed as we will presumably be called
    // again to really convert the data now
    if ( !dst )
        state = stateOrig;

    return len;
}
 825
 826 // ----------------------------------------------------------------------------
 827 // UTF-8
 828 // ----------------------------------------------------------------------------
 829
// the entries are the largest values fitting in 7, 11, 16, 21, 26, 31 and 32
// bits respectively
// NOTE(review): presumably entry N is the maximal code representable by a
// UTF-8 sequence of N+1 bytes -- confirm against the code using this table
static const wxUint32 utf8_max[]=
    { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };

// boundaries of the private use area we use to (temporarily) remap the
// characters which are invalid in a UTF-8 encoded string: it consists of the
// 256 values starting at wxUnicodePUA, wxUnicodePUAEnd being the first value
// after it
const wxUint32 wxUnicodePUA = 0x100000;
const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
 837
// this table gives the length of the UTF-8 encoding from its first character:
// it is indexed by the value of the first byte of the sequence and contains
// the total number of bytes in the sequence (1 to 4) or 0 if this byte can't
// start a sequence
const unsigned char tableUtf8Lengths[256] = {
    // single-byte sequences (ASCII):
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 00..0F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 10..1F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 20..2F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 30..3F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 40..4F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 50..5F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 60..6F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 70..7F

    // these are invalid:
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 80..8F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 90..9F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // A0..AF
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // B0..BF
    0, 0,                                            // C0,C1

    // two-byte sequences:
          2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // C2..CF
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // D0..DF

    // three-byte sequences:
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,  // E0..EF

    // four-byte sequences:
    4, 4, 4, 4, 4,                                   // F0..F4

    // these are invalid again (5- or 6-byte
    // sequences and sequences for code points
    // above U+10FFFF, as restricted by RFC 3629):
                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0   // F5..FF
};
 872
 873 size_t
 874 wxMBConvStrictUTF8::ToWChar(wchar_t *dst, size_t dstLen,
 875                             const char *src, size_t srcLen) const
 876 {
 877     wchar_t *out = dstLen ? dst : NULL;
 878     size_t written = 0;
 879
 880     if ( srcLen == wxNO_LEN )
 881         srcLen = strlen(src) + 1;
 882
 883     for ( const char *p = src; ; p++ )
 884     {
 885         if ( !(srcLen == wxNO_LEN ? *p : srcLen) )
 886         {
 887             // all done successfully, just add the trailing NULL if we are not
 888             // using explicit length
 889             if ( srcLen == wxNO_LEN )
 890             {
 891                 if ( out )
 892                 {
 893                     if ( !dstLen )
 894                         break;
 895
 896                     *out = L'\0';
 897                 }
 898
 899                 written++;
 900             }
 901
 902             return written;
 903         }
 904
 905         if ( out && !dstLen-- )
 906             break;
 907
 908         wxUint32 code;
 909         unsigned char c = *p;
 910
 911         if ( c < 0x80 )
 912         {
 913             if ( srcLen == 0 ) // the test works for wxNO_LEN too
 914                 break;
 915
 916             if ( srcLen != wxNO_LEN )
 917                 srcLen--;
 918
 919             code = c;
 920         }
 921         else
 922         {
 923             unsigned len = tableUtf8Lengths[c];
 924             if ( !len )
 925                 break;
 926
 927             if ( srcLen < len ) // the test works for wxNO_LEN too
 928                 break;
 929
 930             if ( srcLen != wxNO_LEN )
 931                 srcLen -= len;
 932
 933             //   Char. number range   |        UTF-8 octet sequence
 934             //      (hexadecimal)     |              (binary)
 935             //  ----------------------+----------------------------------------
 936             //  0000 0000 - 0000 007F | 0xxxxxxx
 937             //  0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
 938             //  0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
 939             //  0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 940             //
 941             //  Code point value is stored in bits marked with 'x',
 942             //  lowest-order bit of the value on the right side in the diagram
 943             //  above.                                         (from RFC 3629)
 944
 945             // mask to extract lead byte's value ('x' bits above), by sequence
 946             // length:
 947             static const unsigned char leadValueMask[] = { 0x7F, 0x1F, 0x0F, 0x07 };
 948
 949             // mask and value of lead byte's most significant bits, by length:
 950             static const unsigned char leadMarkerMask[] = { 0x80, 0xE0, 0xF0, 0xF8 };
 951             static const unsigned char leadMarkerVal[] = { 0x00, 0xC0, 0xE0, 0xF0 };
 952
 953             len--; // it's more convenient to work with 0-based length here
 954
 955             // extract the lead byte's value bits:
 956             if ( (c & leadMarkerMask[len]) != leadMarkerVal[len] )
 957                 break;
 958
 959             code = c & leadValueMask[len];
 960
 961             // all remaining bytes, if any, are handled in the same way
 962             // regardless of sequence's length:
 963             for ( ; len; --len )
 964             {
 965                 c = *++p;
 966                 if ( (c & 0xC0) != 0x80 )
 967                     return wxCONV_FAILED;
 968
 969                 code <<= 6;
 970                 code |= c & 0x3F;
 971             }
 972         }
 973
 974 #ifdef WC_UTF16
 975         // cast is ok because wchar_t == wxUint16 if WC_UTF16
 976         if ( encode_utf16(code, (wxUint16 *)out) == 2 )
 977         {
 978             if ( out )
 979                 out++;
 980             written++;
 981         }
 982 #else // !WC_UTF16
 983         if ( out )
 984             *out = code;
 985 #endif // WC_UTF16/!WC_UTF16
 986
 987         if ( out )
 988             out++;
 989
 990         written++;
 991     }
 992
 993     return wxCONV_FAILED;
 994 }
 995
 996 size_t
 997 wxMBConvStrictUTF8::FromWChar(char *dst, size_t dstLen,
 998                               const wchar_t *src, size_t srcLen) const
 999 {
1000     char *out = dstLen ? dst : NULL;
1001     size_t written = 0;
1002
1003     for ( const wchar_t *wp = src; ; wp++ )
1004     {
1005         if ( !(srcLen == wxNO_LEN ? *wp : srcLen) )
1006         {
1007             // all done successfully, just add the trailing NULL if we are not
1008             // using explicit length
1009             if ( srcLen == wxNO_LEN )
1010             {
1011                 if ( out )
1012                 {
1013                     if ( !dstLen )
1014                         break;
1015
1016                     *out = '\0';
1017                 }
1018
1019                 written++;
1020             }
1021
1022             return written;
1023         }
1024
1025         if ( srcLen != wxNO_LEN )
1026             srcLen--;
1027
1028         wxUint32 code;
1029 #ifdef WC_UTF16
1030         // cast is ok for WC_UTF16
1031         if ( decode_utf16((const wxUint16 *)wp, code) == 2 )
1032         {
1033             // skip the next char too as we decoded a surrogate
1034             wp++;
1035         }
1036 #else // wchar_t is UTF-32
1037         code = *wp & 0x7fffffff;
1038 #endif
1039
1040         unsigned len;
1041         if ( code <= 0x7F )
1042         {
1043             len = 1;
1044             if ( out )
1045             {
1046                 if ( dstLen < len )
1047                     break;
1048
1049                 out[0] = (char)code;
1050             }
1051         }
1052         else if ( code <= 0x07FF )
1053         {
1054             len = 2;
1055             if ( out )
1056             {
1057                 if ( dstLen < len )
1058                     break;
1059
1060                 // NB: this line takes 6 least significant bits, encodes them as
1061                 // 10xxxxxx and discards them so that the next byte can be encoded:
1062                 out[1] = 0x80 | (code & 0x3F);  code >>= 6;
1063                 out[0] = 0xC0 | code;
1064             }
1065         }
1066         else if ( code < 0xFFFF )
1067         {
1068             len = 3;
1069             if ( out )
1070             {
1071                 if ( dstLen < len )
1072                     break;
1073
1074                 out[2] = 0x80 | (code & 0x3F);  code >>= 6;
1075                 out[1] = 0x80 | (code & 0x3F);  code >>= 6;
1076                 out[0] = 0xE0 | code;
1077             }
1078         }
1079         else if ( code <= 0x10FFFF )
1080         {
1081             len = 4;
1082             if ( out )
1083             {
1084                 if ( dstLen < len )
1085                     break;
1086
1087                 out[3] = 0x80 | (code & 0x3F);  code >>= 6;
1088                 out[2] = 0x80 | (code & 0x3F);  code >>= 6;
1089                 out[1] = 0x80 | (code & 0x3F);  code >>= 6;
1090                 out[0] = 0xF0 | code;
1091             }
1092         }
1093         else
1094         {
1095             wxFAIL_MSG( _T("trying to encode undefined Unicode character") );
1096             break;
1097         }
1098
1099         if ( out )
1100         {
1101             out += len;
1102             dstLen -= len;
1103         }
1104
1105         written += len;
1106     }
1107
1108     // we only get here if an error occurs during decoding
1109     return wxCONV_FAILED;
1110 }
1111
1112 size_t wxMBConvUTF8::ToWChar(wchar_t *buf, size_t n,
1113                              const char *psz, size_t srcLen) const
1114 {
1115     if ( m_options == MAP_INVALID_UTF8_NOT )
1116         return wxMBConvStrictUTF8::ToWChar(buf, n, psz, srcLen);
1117
1118     size_t len = 0;
1119
1120     while ((srcLen == wxNO_LEN ? *psz : srcLen--) && ((!buf) || (len < n)))
1121     {
1122         const char *opsz = psz;
1123         bool invalid = false;
1124         unsigned char cc = *psz++, fc = cc;
1125         unsigned cnt;
1126         for (cnt = 0; fc & 0x80; cnt++)
1127             fc <<= 1;
1128
1129         if (!cnt)
1130         {
1131             // plain ASCII char
1132             if (buf)
1133                 *buf++ = cc;
1134             len++;
1135
1136             // escape the escape character for octal escapes
1137             if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
1138                     && cc == '\\' && (!buf || len < n))
1139             {
1140                 if (buf)
1141                     *buf++ = cc;
1142                 len++;
1143             }
1144         }
1145         else
1146         {
1147             cnt--;
1148             if (!cnt)
1149             {
1150                 // invalid UTF-8 sequence
1151                 invalid = true;
1152             }
1153             else
1154             {
1155                 unsigned ocnt = cnt - 1;
1156                 wxUint32 res = cc & (0x3f >> cnt);
1157                 while (cnt--)
1158                 {
1159                     cc = *psz;
1160                     if ((cc & 0xC0) != 0x80)
1161                     {
1162                         // invalid UTF-8 sequence
1163                         invalid = true;
1164                         break;
1165                     }
1166
1167                     psz++;
1168                     res = (res << 6) | (cc & 0x3f);
1169                 }
1170
1171                 if (invalid || res <= utf8_max[ocnt])
1172                 {
1173                     // illegal UTF-8 encoding
1174                     invalid = true;
1175                 }
1176                 else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
1177                         res >= wxUnicodePUA && res < wxUnicodePUAEnd)
1178                 {
1179                     // if one of our PUA characters turns up externally
1180                     // it must also be treated as an illegal sequence
1181                     // (a bit like you have to escape an escape character)
1182                     invalid = true;
1183                 }
1184                 else
1185                 {
1186 #ifdef WC_UTF16
1187                     // cast is ok because wchar_t == wxUint16 if WC_UTF16
1188                     size_t pa = encode_utf16(res, (wxUint16 *)buf);
1189                     if (pa == wxCONV_FAILED)
1190                     {
1191                         invalid = true;
1192                     }
1193                     else
1194                     {
1195                         if (buf)
1196                             buf += pa;
1197                         len += pa;
1198                     }
1199 #else // !WC_UTF16
1200                     if (buf)
1201                         *buf++ = (wchar_t)res;
1202                     len++;
1203 #endif // WC_UTF16/!WC_UTF16
1204                 }
1205             }
1206
1207             if (invalid)
1208             {
1209                 if (m_options & MAP_INVALID_UTF8_TO_PUA)
1210                 {
1211                     while (opsz < psz && (!buf || len < n))
1212                     {
1213 #ifdef WC_UTF16
1214                         // cast is ok because wchar_t == wxUuint16 if WC_UTF16
1215                         size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
1216                         wxASSERT(pa != wxCONV_FAILED);
1217                         if (buf)
1218                             buf += pa;
1219                         opsz++;
1220                         len += pa;
1221 #else
1222                         if (buf)
1223                             *buf++ = (wchar_t)(wxUnicodePUA + (unsigned char)*opsz);
1224                         opsz++;
1225                         len++;
1226 #endif
1227                     }
1228                 }
1229                 else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
1230                 {
1231                     while (opsz < psz && (!buf || len < n))
1232                     {
1233                         if ( buf && len + 3 < n )
1234                         {
1235                             unsigned char on = *opsz;
1236                             *buf++ = L'\\';
1237                             *buf++ = (wchar_t)( L'0' + on / 0100 );
1238                             *buf++ = (wchar_t)( L'0' + (on % 0100) / 010 );
1239                             *buf++ = (wchar_t)( L'0' + on % 010 );
1240                         }
1241
1242                         opsz++;
1243                         len += 4;
1244                     }
1245                 }
1246                 else // MAP_INVALID_UTF8_NOT
1247                 {
1248                     return wxCONV_FAILED;
1249                 }
1250             }
1251         }
1252     }
1253
1254     if (srcLen == wxNO_LEN && buf && (len < n))
1255         *buf = 0;
1256
1257     return len + 1;
1258 }
1259
// Return true if wch is one of the octal digits, i.e. L'0' to L'7'.
static inline bool isoctal(wchar_t wch)
{
    if ( wch < L'0' )
        return false;

    return !(wch > L'7');
}
1264
1265 size_t wxMBConvUTF8::FromWChar(char *buf, size_t n,
1266                                const wchar_t *psz, size_t srcLen) const
1267 {
1268     if ( m_options == MAP_INVALID_UTF8_NOT )
1269         return wxMBConvStrictUTF8::FromWChar(buf, n, psz, srcLen);
1270
1271     size_t len = 0;
1272
1273     while ((srcLen == wxNO_LEN ? *psz : srcLen--) && ((!buf) || (len < n)))
1274     {
1275         wxUint32 cc;
1276
1277 #ifdef WC_UTF16
1278         // cast is ok for WC_UTF16
1279         size_t pa = decode_utf16((const wxUint16 *)psz, cc);
1280         psz += (pa == wxCONV_FAILED) ? 1 : pa;
1281 #else
1282         cc = (*psz++) & 0x7fffffff;
1283 #endif
1284
1285         if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
1286                 && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
1287         {
1288             if (buf)
1289                 *buf++ = (char)(cc - wxUnicodePUA);
1290             len++;
1291         }
1292         else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL)
1293                     && cc == L'\\' && psz[0] == L'\\' )
1294         {
1295             if (buf)
1296                 *buf++ = (char)cc;
1297             psz++;
1298             len++;
1299         }
1300         else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
1301                     cc == L'\\' &&
1302                         isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
1303         {
1304             if (buf)
1305             {
1306                 *buf++ = (char) ((psz[0] - L'0') * 0100 +
1307                                  (psz[1] - L'0') * 010 +
1308                                  (psz[2] - L'0'));
1309             }
1310
1311             psz += 3;
1312             len++;
1313         }
1314         else
1315         {
1316             unsigned cnt;
1317             for (cnt = 0; cc > utf8_max[cnt]; cnt++)
1318             {
1319             }
1320
1321             if (!cnt)
1322             {
1323                 // plain ASCII char
1324                 if (buf)
1325                     *buf++ = (char) cc;
1326                 len++;
1327             }
1328             else
1329             {
1330                 len += cnt + 1;
1331                 if (buf)
1332                 {
1333                     *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
1334                     while (cnt--)
1335                         *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
1336                 }
1337             }
1338         }
1339     }
1340
1341     if (srcLen == wxNO_LEN && buf && (len < n))
1342         *buf = 0;
1343
1344     return len + 1;
1345 }
1346
1347 // ============================================================================
1348 // UTF-16
1349 // ============================================================================
1350
// The functions below are written in terms of "straight" and "swap"
// converters: when WORDS_BIGENDIAN is defined the "straight" name refers to
// the BE class and the "swap" one to the LE class, otherwise it is the other
// way round.
#ifdef WORDS_BIGENDIAN
    #define wxMBConvUTF16straight wxMBConvUTF16BE
    #define wxMBConvUTF16swap     wxMBConvUTF16LE
#else
    #define wxMBConvUTF16swap     wxMBConvUTF16BE
    #define wxMBConvUTF16straight wxMBConvUTF16LE
#endif
1358
1359 /* static */
1360 size_t wxMBConvUTF16Base::GetLength(const char *src, size_t srcLen)
1361 {
1362     if ( srcLen == wxNO_LEN )
1363     {
1364         // count the number of bytes in input, including the trailing NULs
1365         const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1366         for ( srcLen = 1; *inBuff++; srcLen++ )
1367             ;
1368
1369         srcLen *= BYTES_PER_CHAR;
1370     }
1371     else // we already have the length
1372     {
1373         // we can only convert an entire number of UTF-16 characters
1374         if ( srcLen % BYTES_PER_CHAR )
1375             return wxCONV_FAILED;
1376     }
1377
1378     return srcLen;
1379 }
1380
1381 // case when in-memory representation is UTF-16 too
1382 #ifdef WC_UTF16
1383
1384 // ----------------------------------------------------------------------------
1385 // conversions without endianness change
1386 // ----------------------------------------------------------------------------
1387
1388 size_t
1389 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1390                                const char *src, size_t srcLen) const
1391 {
1392     // set up the scene for using memcpy() (which is presumably more efficient
1393     // than copying the bytes one by one)
1394     srcLen = GetLength(src, srcLen);
1395     if ( srcLen == wxNO_LEN )
1396         return wxCONV_FAILED;
1397
1398     const size_t inLen = srcLen / BYTES_PER_CHAR;
1399     if ( dst )
1400     {
1401         if ( dstLen < inLen )
1402             return wxCONV_FAILED;
1403
1404         memcpy(dst, src, srcLen);
1405     }
1406
1407     return inLen;
1408 }
1409
1410 size_t
1411 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1412                                  const wchar_t *src, size_t srcLen) const
1413 {
1414     if ( srcLen == wxNO_LEN )
1415         srcLen = wxWcslen(src) + 1;
1416
1417     srcLen *= BYTES_PER_CHAR;
1418
1419     if ( dst )
1420     {
1421         if ( dstLen < srcLen )
1422             return wxCONV_FAILED;
1423
1424         memcpy(dst, src, srcLen);
1425     }
1426
1427     return srcLen;
1428 }
1429
1430 // ----------------------------------------------------------------------------
1431 // endian-reversing conversions
1432 // ----------------------------------------------------------------------------
1433
1434 size_t
1435 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1436                            const char *src, size_t srcLen) const
1437 {
1438     srcLen = GetLength(src, srcLen);
1439     if ( srcLen == wxNO_LEN )
1440         return wxCONV_FAILED;
1441
1442     srcLen /= BYTES_PER_CHAR;
1443
1444     if ( dst )
1445     {
1446         if ( dstLen < srcLen )
1447             return wxCONV_FAILED;
1448
1449         const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1450         for ( size_t n = 0; n < srcLen; n++, inBuff++ )
1451         {
1452             *dst++ = wxUINT16_SWAP_ALWAYS(*inBuff);
1453         }
1454     }
1455
1456     return srcLen;
1457 }
1458
1459 size_t
1460 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1461                              const wchar_t *src, size_t srcLen) const
1462 {
1463     if ( srcLen == wxNO_LEN )
1464         srcLen = wxWcslen(src) + 1;
1465
1466     srcLen *= BYTES_PER_CHAR;
1467
1468     if ( dst )
1469     {
1470         if ( dstLen < srcLen )
1471             return wxCONV_FAILED;
1472
1473         wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1474         for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
1475         {
1476             *outBuff++ = wxUINT16_SWAP_ALWAYS(*src);
1477         }
1478     }
1479
1480     return srcLen;
1481 }
1482
1483 #else // !WC_UTF16: wchar_t is UTF-32
1484
1485 // ----------------------------------------------------------------------------
1486 // conversions without endianness change
1487 // ----------------------------------------------------------------------------
1488
1489 size_t
1490 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1491                                const char *src, size_t srcLen) const
1492 {
1493     srcLen = GetLength(src, srcLen);
1494     if ( srcLen == wxNO_LEN )
1495         return wxCONV_FAILED;
1496
1497     const size_t inLen = srcLen / BYTES_PER_CHAR;
1498     if ( !dst )
1499     {
1500         // optimization: return maximal space which could be needed for this
1501         // string even if the real size could be smaller if the buffer contains
1502         // any surrogates
1503         return inLen;
1504     }
1505
1506     size_t outLen = 0;
1507     const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1508     for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1509     {
1510         const wxUint32 ch = wxDecodeSurrogate(&inBuff);
1511         if ( !inBuff )
1512             return wxCONV_FAILED;
1513
1514         if ( ++outLen > dstLen )
1515             return wxCONV_FAILED;
1516
1517         *dst++ = ch;
1518     }
1519
1520
1521     return outLen;
1522 }
1523
1524 size_t
1525 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1526                                  const wchar_t *src, size_t srcLen) const
1527 {
1528     if ( srcLen == wxNO_LEN )
1529         srcLen = wxWcslen(src) + 1;
1530
1531     size_t outLen = 0;
1532     wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1533     for ( size_t n = 0; n < srcLen; n++ )
1534     {
1535         wxUint16 cc[2];
1536         const size_t numChars = encode_utf16(*src++, cc);
1537         if ( numChars == wxCONV_FAILED )
1538             return wxCONV_FAILED;
1539
1540         outLen += numChars * BYTES_PER_CHAR;
1541         if ( outBuff )
1542         {
1543             if ( outLen > dstLen )
1544                 return wxCONV_FAILED;
1545
1546             *outBuff++ = cc[0];
1547             if ( numChars == 2 )
1548             {
1549                 // second character of a surrogate
1550                 *outBuff++ = cc[1];
1551             }
1552         }
1553     }
1554
1555     return outLen;
1556 }
1557
1558 // ----------------------------------------------------------------------------
1559 // endian-reversing conversions
1560 // ----------------------------------------------------------------------------
1561
1562 size_t
1563 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1564                            const char *src, size_t srcLen) const
1565 {
1566     srcLen = GetLength(src, srcLen);
1567     if ( srcLen == wxNO_LEN )
1568         return wxCONV_FAILED;
1569
1570     const size_t inLen = srcLen / BYTES_PER_CHAR;
1571     if ( !dst )
1572     {
1573         // optimization: return maximal space which could be needed for this
1574         // string even if the real size could be smaller if the buffer contains
1575         // any surrogates
1576         return inLen;
1577     }
1578
1579     size_t outLen = 0;
1580     const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1581     for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1582     {
1583         wxUint32 ch;
1584         wxUint16 tmp[2];
1585
1586         tmp[0] = wxUINT16_SWAP_ALWAYS(*inBuff);
1587         inBuff++;
1588         tmp[1] = wxUINT16_SWAP_ALWAYS(*inBuff);
1589
1590         const size_t numChars = decode_utf16(tmp, ch);
1591         if ( numChars == wxCONV_FAILED )
1592             return wxCONV_FAILED;
1593
1594         if ( numChars == 2 )
1595             inBuff++;
1596
1597         if ( ++outLen > dstLen )
1598             return wxCONV_FAILED;
1599
1600         *dst++ = ch;
1601     }
1602
1603
1604     return outLen;
1605 }
1606
1607 size_t
1608 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1609                              const wchar_t *src, size_t srcLen) const
1610 {
1611     if ( srcLen == wxNO_LEN )
1612         srcLen = wxWcslen(src) + 1;
1613
1614     size_t outLen = 0;
1615     wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1616     for ( const wchar_t *srcEnd = src + srcLen; src < srcEnd; src++ )
1617     {
1618         wxUint16 cc[2];
1619         const size_t numChars = encode_utf16(*src, cc);
1620         if ( numChars == wxCONV_FAILED )
1621             return wxCONV_FAILED;
1622
1623         outLen += numChars * BYTES_PER_CHAR;
1624         if ( outBuff )
1625         {
1626             if ( outLen > dstLen )
1627                 return wxCONV_FAILED;
1628
1629             *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[0]);
1630             if ( numChars == 2 )
1631             {
1632                 // second character of a surrogate
1633                 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[1]);
1634             }
1635         }
1636     }
1637
1638     return outLen;
1639 }
1640
1641 #endif // WC_UTF16/!WC_UTF16
1642
1643
1644 // ============================================================================
1645 // UTF-32
1646 // ============================================================================
1647
// The functions below are written in terms of "straight" and "swap"
// converters: when WORDS_BIGENDIAN is defined the "straight" name refers to
// the BE class and the "swap" one to the LE class, otherwise it is the other
// way round.
#ifdef WORDS_BIGENDIAN
    #define wxMBConvUTF32straight  wxMBConvUTF32BE
    #define wxMBConvUTF32swap      wxMBConvUTF32LE
#else
    #define wxMBConvUTF32swap      wxMBConvUTF32BE
    #define wxMBConvUTF32straight  wxMBConvUTF32LE
#endif


// global objects of the two UTF-32 converter classes
WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
1659
1660 /* static */
1661 size_t wxMBConvUTF32Base::GetLength(const char *src, size_t srcLen)
1662 {
1663     if ( srcLen == wxNO_LEN )
1664     {
1665         // count the number of bytes in input, including the trailing NULs
1666         const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1667         for ( srcLen = 1; *inBuff++; srcLen++ )
1668             ;
1669
1670         srcLen *= BYTES_PER_CHAR;
1671     }
1672     else // we already have the length
1673     {
1674         // we can only convert an entire number of UTF-32 characters
1675         if ( srcLen % BYTES_PER_CHAR )
1676             return wxCONV_FAILED;
1677     }
1678
1679     return srcLen;
1680 }
1681
1682 // case when in-memory representation is UTF-16
1683 #ifdef WC_UTF16
1684
1685 // ----------------------------------------------------------------------------
1686 // conversions without endianness change
1687 // ----------------------------------------------------------------------------
1688
1689 size_t
1690 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1691                                const char *src, size_t srcLen) const
1692 {
1693     srcLen = GetLength(src, srcLen);
1694     if ( srcLen == wxNO_LEN )
1695         return wxCONV_FAILED;
1696
1697     const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1698     const size_t inLen = srcLen / BYTES_PER_CHAR;
1699     size_t outLen = 0;
1700     for ( size_t n = 0; n < inLen; n++ )
1701     {
1702         wxUint16 cc[2];
1703         const size_t numChars = encode_utf16(*inBuff++, cc);
1704         if ( numChars == wxCONV_FAILED )
1705             return wxCONV_FAILED;
1706
1707         outLen += numChars;
1708         if ( dst )
1709         {
1710             if ( outLen > dstLen )
1711                 return wxCONV_FAILED;
1712
1713             *dst++ = cc[0];
1714             if ( numChars == 2 )
1715             {
1716                 // second character of a surrogate
1717                 *dst++ = cc[1];
1718             }
1719         }
1720     }
1721
1722     return outLen;
1723 }
1724
1725 size_t
1726 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1727                                  const wchar_t *src, size_t srcLen) const
1728 {
1729     if ( srcLen == wxNO_LEN )
1730         srcLen = wxWcslen(src) + 1;
1731
1732     if ( !dst )
1733     {
1734         // optimization: return maximal space which could be needed for this
1735         // string instead of the exact amount which could be less if there are
1736         // any surrogates in the input
1737         //
1738         // we consider that surrogates are rare enough to make it worthwhile to
1739         // avoid running the loop below at the cost of slightly extra memory
1740         // consumption
1741         return srcLen * BYTES_PER_CHAR;
1742     }
1743
1744     wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1745     size_t outLen = 0;
1746     for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1747     {
1748         const wxUint32 ch = wxDecodeSurrogate(&src);
1749         if ( !src )
1750             return wxCONV_FAILED;
1751
1752         outLen += BYTES_PER_CHAR;
1753
1754         if ( outLen > dstLen )
1755             return wxCONV_FAILED;
1756
1757         *outBuff++ = ch;
1758     }
1759
1760     return outLen;
1761 }
1762
1763 // ----------------------------------------------------------------------------
1764 // endian-reversing conversions
1765 // ----------------------------------------------------------------------------
1766
1767 size_t
1768 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1769                            const char *src, size_t srcLen) const
1770 {
1771     srcLen = GetLength(src, srcLen);
1772     if ( srcLen == wxNO_LEN )
1773         return wxCONV_FAILED;
1774
1775     const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1776     const size_t inLen = srcLen / BYTES_PER_CHAR;
1777     size_t outLen = 0;
1778     for ( size_t n = 0; n < inLen; n++, inBuff++ )
1779     {
1780         wxUint16 cc[2];
1781         const size_t numChars = encode_utf16(wxUINT32_SWAP_ALWAYS(*inBuff), cc);
1782         if ( numChars == wxCONV_FAILED )
1783             return wxCONV_FAILED;
1784
1785         outLen += numChars;
1786         if ( dst )
1787         {
1788             if ( outLen > dstLen )
1789                 return wxCONV_FAILED;
1790
1791             *dst++ = cc[0];
1792             if ( numChars == 2 )
1793             {
1794                 // second character of a surrogate
1795                 *dst++ = cc[1];
1796             }
1797         }
1798     }
1799
1800     return outLen;
1801 }
1802
1803 size_t
1804 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1805                              const wchar_t *src, size_t srcLen) const
1806 {
1807     if ( srcLen == wxNO_LEN )
1808         srcLen = wxWcslen(src) + 1;
1809
1810     if ( !dst )
1811     {
1812         // optimization: return maximal space which could be needed for this
1813         // string instead of the exact amount which could be less if there are
1814         // any surrogates in the input
1815         //
1816         // we consider that surrogates are rare enough to make it worthwhile to
1817         // avoid running the loop below at the cost of slightly extra memory
1818         // consumption
1819         return srcLen*BYTES_PER_CHAR;
1820     }
1821
1822     wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1823     size_t outLen = 0;
1824     for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1825     {
1826         const wxUint32 ch = wxDecodeSurrogate(&src);
1827         if ( !src )
1828             return wxCONV_FAILED;
1829
1830         outLen += BYTES_PER_CHAR;
1831
1832         if ( outLen > dstLen )
1833             return wxCONV_FAILED;
1834
1835         *outBuff++ = wxUINT32_SWAP_ALWAYS(ch);
1836     }
1837
1838     return outLen;
1839 }
1840
1841 #else // !WC_UTF16: wchar_t is UTF-32
1842
1843 // ----------------------------------------------------------------------------
1844 // conversions without endianness change
1845 // ----------------------------------------------------------------------------
1846
1847 size_t
1848 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1849                                const char *src, size_t srcLen) const
1850 {
1851     // use memcpy() as it should be much faster than hand-written loop
1852     srcLen = GetLength(src, srcLen);
1853     if ( srcLen == wxNO_LEN )
1854         return wxCONV_FAILED;
1855
1856     const size_t inLen = srcLen/BYTES_PER_CHAR;
1857     if ( dst )
1858     {
1859         if ( dstLen < inLen )
1860             return wxCONV_FAILED;
1861
1862         memcpy(dst, src, srcLen);
1863     }
1864
1865     return inLen;
1866 }
1867
1868 size_t
1869 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1870                                  const wchar_t *src, size_t srcLen) const
1871 {
1872     if ( srcLen == wxNO_LEN )
1873         srcLen = wxWcslen(src) + 1;
1874
1875     srcLen *= BYTES_PER_CHAR;
1876
1877     if ( dst )
1878     {
1879         if ( dstLen < srcLen )
1880             return wxCONV_FAILED;
1881
1882         memcpy(dst, src, srcLen);
1883     }
1884
1885     return srcLen;
1886 }
1887
1888 // ----------------------------------------------------------------------------
1889 // endian-reversing conversions
1890 // ----------------------------------------------------------------------------
1891
1892 size_t
1893 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1894                            const char *src, size_t srcLen) const
1895 {
1896     srcLen = GetLength(src, srcLen);
1897     if ( srcLen == wxNO_LEN )
1898         return wxCONV_FAILED;
1899
1900     srcLen /= BYTES_PER_CHAR;
1901
1902     if ( dst )
1903     {
1904         if ( dstLen < srcLen )
1905             return wxCONV_FAILED;
1906
1907         const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1908         for ( size_t n = 0; n < srcLen; n++, inBuff++ )
1909         {
1910             *dst++ = wxUINT32_SWAP_ALWAYS(*inBuff);
1911         }
1912     }
1913
1914     return srcLen;
1915 }
1916
1917 size_t
1918 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1919                              const wchar_t *src, size_t srcLen) const
1920 {
1921     if ( srcLen == wxNO_LEN )
1922         srcLen = wxWcslen(src) + 1;
1923
1924     srcLen *= BYTES_PER_CHAR;
1925
1926     if ( dst )
1927     {
1928         if ( dstLen < srcLen )
1929             return wxCONV_FAILED;
1930
1931         wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1932         for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
1933         {
1934             *outBuff++ = wxUINT32_SWAP_ALWAYS(*src);
1935         }
1936     }
1937
1938     return srcLen;
1939 }
1940
1941 #endif // WC_UTF16/!WC_UTF16
1942
1943
1944 // ============================================================================
1945 // The classes doing conversion using the iconv_xxx() functions
1946 // ============================================================================
1947
1948 #ifdef HAVE_ICONV
1949
// VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
//     E2BIG if output buffer is _exactly_ as big as needed. Such case is
//     (unless there's yet another bug in glibc) the only case when iconv()
//     returns with (size_t)-1 (which means error) and says there are 0 bytes
//     left in the input buffer -- when _real_ error occurs,
//     bytes-left-in-input buffer is non-zero. Hence, this alternative test for
//     iconv() failure.
//     [This bug does not appear in glibc 2.2.]
//
// ICONV_FAILED(cres, bufLeft) takes the value returned by iconv() and the
// number of input bytes it left unconverted and evaluates to true if the call
// really failed. The macro parameters are parenthesized in the expansion so
// that arbitrary expressions can be safely passed as arguments.
#if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
#define ICONV_FAILED(cres, bufLeft) (((cres) == (size_t)-1) && \
                                     (errno != E2BIG || (bufLeft) != 0))
#else
#define ICONV_FAILED(cres, bufLeft)  ((cres) == (size_t)-1)
#endif

// cast a pointer to the input buffer pointer to the type of the second
// iconv() parameter, which may or may not be const (see ICONV_CONST)
#define ICONV_CHAR_CAST(x)  ((ICONV_CONST char **)(x))

// the value returned by iconv_open() on failure
#define ICONV_T_INVALID ((iconv_t)-1)
1968
1969 #if SIZEOF_WCHAR_T == 4
1970     #define WC_BSWAP    wxUINT32_SWAP_ALWAYS
1971     #define WC_ENC      wxFONTENCODING_UTF32
1972 #elif SIZEOF_WCHAR_T == 2
1973     #define WC_BSWAP    wxUINT16_SWAP_ALWAYS
1974     #define WC_ENC      wxFONTENCODING_UTF16
1975 #else // sizeof(wchar_t) != 2 nor 4
1976     // does this ever happen?
1977     #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
1978 #endif
1979
1980 // ----------------------------------------------------------------------------
1981 // wxMBConv_iconv: encapsulates an iconv character set
1982 // ----------------------------------------------------------------------------
1983
1984 class wxMBConv_iconv : public wxMBConv
1985 {
1986 public:
1987     wxMBConv_iconv(const char *name);
1988     virtual ~wxMBConv_iconv();
1989
1990     // implement base class virtual methods
1991     virtual size_t ToWChar(wchar_t *dst, size_t dstLen,
1992                            const char *src, size_t srcLen = wxNO_LEN) const;
1993     virtual size_t FromWChar(char *dst, size_t dstLen,
1994                              const wchar_t *src, size_t srcLen = wxNO_LEN) const;
1995     virtual size_t GetMBNulLen() const;
1996
1997 #if wxUSE_UNICODE_UTF8
1998     virtual bool IsUTF8() const;
1999 #endif
2000
2001     virtual wxMBConv *Clone() const
2002     {
2003         wxMBConv_iconv *p = new wxMBConv_iconv(m_name.ToAscii());
2004         p->m_minMBCharWidth = m_minMBCharWidth;
2005         return p;
2006     }
2007
2008     bool IsOk() const
2009         { return (m2w != ICONV_T_INVALID) && (w2m != ICONV_T_INVALID); }
2010
2011 protected:
2012     // the iconv handlers used to translate from multibyte
2013     // to wide char and in the other direction
2014     iconv_t m2w,
2015             w2m;
2016
2017 #if wxUSE_THREADS
2018     // guards access to m2w and w2m objects
2019     wxMutex m_iconvMutex;
2020 #endif
2021
2022 private:
2023     // the name (for iconv_open()) of a wide char charset -- if none is
2024     // available on this machine, it will remain NULL
2025     static wxString ms_wcCharsetName;
2026
2027     // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
2028     // different endian-ness than the native one
2029     static bool ms_wcNeedsSwap;
2030
2031
2032     // name of the encoding handled by this conversion
2033     wxString m_name;
2034
2035     // cached result of GetMBNulLen(); set to 0 meaning "unknown"
2036     // initially
2037     size_t m_minMBCharWidth;
2038 };
2039
2040 // make the constructor available for unit testing
2041 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_iconv( const char* name )
2042 {
2043     wxMBConv_iconv* result = new wxMBConv_iconv( name );
2044     if ( !result->IsOk() )
2045     {
2046         delete result;
2047         return 0;
2048     }
2049
2050     return result;
2051 }
2052
2053 wxString wxMBConv_iconv::ms_wcCharsetName;
2054 bool wxMBConv_iconv::ms_wcNeedsSwap = false;
2055
2056 wxMBConv_iconv::wxMBConv_iconv(const char *name)
2057               : m_name(name)
2058 {
2059     m_minMBCharWidth = 0;
2060
2061     // check for charset that represents wchar_t:
2062     if ( ms_wcCharsetName.empty() )
2063     {
2064         wxLogTrace(TRACE_STRCONV, _T("Looking for wide char codeset:"));
2065
2066 #if wxUSE_FONTMAP
2067         const wxChar **names = wxFontMapperBase::GetAllEncodingNames(WC_ENC);
2068 #else // !wxUSE_FONTMAP
2069         static const wxChar *names_static[] =
2070         {
2071 #if SIZEOF_WCHAR_T == 4
2072             _T("UCS-4"),
2073 #elif SIZEOF_WCHAR_T = 2
2074             _T("UCS-2"),
2075 #endif
2076             NULL
2077         };
2078         const wxChar **names = names_static;
2079 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
2080
2081         for ( ; *names && ms_wcCharsetName.empty(); ++names )
2082         {
2083             const wxString nameCS(*names);
2084
2085             // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
2086             wxString nameXE(nameCS);
2087
2088 #ifdef WORDS_BIGENDIAN
2089                 nameXE += _T("BE");
2090 #else // little endian
2091                 nameXE += _T("LE");
2092 #endif
2093
2094             wxLogTrace(TRACE_STRCONV, _T("  trying charset \"%s\""),
2095                        nameXE.c_str());
2096
2097             m2w = iconv_open(nameXE.ToAscii(), name);
2098             if ( m2w == ICONV_T_INVALID )
2099             {
2100                 // try charset w/o bytesex info (e.g. "UCS4")
2101                 wxLogTrace(TRACE_STRCONV, _T("  trying charset \"%s\""),
2102                            nameCS.c_str());
2103                 m2w = iconv_open(nameCS.ToAscii(), name);
2104
2105                 // and check for bytesex ourselves:
2106                 if ( m2w != ICONV_T_INVALID )
2107                 {
2108                     char    buf[2], *bufPtr;
2109                     wchar_t wbuf[2];
2110                     size_t  insz, outsz;
2111                     size_t  res;
2112
2113                     buf[0] = 'A';
2114                     buf[1] = 0;
2115                     wbuf[0] = 0;
2116                     insz = 2;
2117                     outsz = SIZEOF_WCHAR_T * 2;
2118                     char* wbufPtr = (char*)wbuf;
2119                     bufPtr = buf;
2120
2121                     res = iconv(
2122                         m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
2123                         &wbufPtr, &outsz);
2124
2125                     if (ICONV_FAILED(res, insz))
2126                     {
2127                         wxLogLastError(wxT("iconv"));
2128                         wxLogError(_("Conversion to charset '%s' doesn't work."),
2129                                    nameCS.c_str());
2130                     }
2131                     else // ok, can convert to this encoding, remember it
2132                     {
2133                         ms_wcCharsetName = nameCS;
2134                         ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
2135                     }
2136                 }
2137             }
2138             else // use charset not requiring byte swapping
2139             {
2140                 ms_wcCharsetName = nameXE;
2141             }
2142         }
2143
2144         wxLogTrace(TRACE_STRCONV,
2145                    wxT("iconv wchar_t charset is \"%s\"%s"),
2146                    ms_wcCharsetName.empty() ? wxString("<none>")
2147                                             : ms_wcCharsetName,
2148                    ms_wcNeedsSwap ? _T(" (needs swap)")
2149                                   : _T(""));
2150     }
2151     else // we already have ms_wcCharsetName
2152     {
2153         m2w = iconv_open(ms_wcCharsetName.ToAscii(), name);
2154     }
2155
2156     if ( ms_wcCharsetName.empty() )
2157     {
2158         w2m = ICONV_T_INVALID;
2159     }
2160     else
2161     {
2162         w2m = iconv_open(name, ms_wcCharsetName.ToAscii());
2163         if ( w2m == ICONV_T_INVALID )
2164         {
2165             wxLogTrace(TRACE_STRCONV,
2166                        wxT("\"%s\" -> \"%s\" works but not the converse!?"),
2167                        ms_wcCharsetName.c_str(), name);
2168         }
2169     }
2170 }
2171
2172 wxMBConv_iconv::~wxMBConv_iconv()
2173 {
2174     if ( m2w != ICONV_T_INVALID )
2175         iconv_close(m2w);
2176     if ( w2m != ICONV_T_INVALID )
2177         iconv_close(w2m);
2178 }
2179
2180 size_t
2181 wxMBConv_iconv::ToWChar(wchar_t *dst, size_t dstLen,
2182                         const char *src, size_t srcLen) const
2183 {
2184     if ( srcLen == wxNO_LEN )
2185     {
2186         // find the string length: notice that must be done differently for
2187         // NUL-terminated strings and UTF-16/32 which are terminated with 2/4
2188         // consecutive NULs
2189         const size_t nulLen = GetMBNulLen();
2190         switch ( nulLen )
2191         {
2192             default:
2193                 return wxCONV_FAILED;
2194
2195             case 1:
2196                 srcLen = strlen(src); // arguably more optimized than our version
2197                 break;
2198
2199             case 2:
2200             case 4:
2201                 // for UTF-16/32 not only we need to have 2/4 consecutive NULs
2202                 // but they also have to start at character boundary and not
2203                 // span two adjacent characters
2204                 const char *p;
2205                 for ( p = src; NotAllNULs(p, nulLen); p += nulLen )
2206                     ;
2207                 srcLen = p - src;
2208                 break;
2209         }
2210
2211         // when we're determining the length of the string ourselves we count
2212         // the terminating NUL(s) as part of it and always NUL-terminate the
2213         // output
2214         srcLen += nulLen;
2215     }
2216
2217     // we express length in the number of (wide) characters but iconv always
2218     // counts buffer sizes it in bytes
2219     dstLen *= SIZEOF_WCHAR_T;
2220
2221 #if wxUSE_THREADS
2222     // NB: iconv() is MT-safe, but each thread must use its own iconv_t handle.
2223     //     Unfortunately there are a couple of global wxCSConv objects such as
2224     //     wxConvLocal that are used all over wx code, so we have to make sure
2225     //     the handle is used by at most one thread at the time. Otherwise
2226     //     only a few wx classes would be safe to use from non-main threads
2227     //     as MB<->WC conversion would fail "randomly".
2228     wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
2229 #endif // wxUSE_THREADS
2230
2231     size_t res, cres;
2232     const char *pszPtr = src;
2233
2234     if ( dst )
2235     {
2236         char* bufPtr = (char*)dst;
2237
2238         // have destination buffer, convert there
2239         size_t dstLenOrig = dstLen;
2240         cres = iconv(m2w,
2241                      ICONV_CHAR_CAST(&pszPtr), &srcLen,
2242                      &bufPtr, &dstLen);
2243
2244         // convert the number of bytes converted as returned by iconv to the
2245         // number of (wide) characters converted that we need
2246         res = (dstLenOrig - dstLen) / SIZEOF_WCHAR_T;
2247
2248         if (ms_wcNeedsSwap)
2249         {
2250             // convert to native endianness
2251             for ( unsigned i = 0; i < res; i++ )
2252                 dst[i] = WC_BSWAP(dst[i]);
2253         }
2254     }
2255     else // no destination buffer
2256     {
2257         // convert using temp buffer to calculate the size of the buffer needed
2258         wchar_t tbuf[8];
2259         res = 0;
2260
2261         do
2262         {
2263             char* bufPtr = (char*)tbuf;
2264             dstLen = 8 * SIZEOF_WCHAR_T;
2265
2266             cres = iconv(m2w,
2267                          ICONV_CHAR_CAST(&pszPtr), &srcLen,
2268                          &bufPtr, &dstLen );
2269
2270             res += 8 - (dstLen / SIZEOF_WCHAR_T);
2271         }
2272         while ((cres == (size_t)-1) && (errno == E2BIG));
2273     }
2274
2275     if (ICONV_FAILED(cres, srcLen))
2276     {
2277         //VS: it is ok if iconv fails, hence trace only
2278         wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
2279         return wxCONV_FAILED;
2280     }
2281
2282     return res;
2283 }
2284
2285 size_t wxMBConv_iconv::FromWChar(char *dst, size_t dstLen,
2286                                  const wchar_t *src, size_t srcLen) const
2287 {
2288 #if wxUSE_THREADS
2289     // NB: explained in MB2WC
2290     wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
2291 #endif
2292
2293     if ( srcLen == wxNO_LEN )
2294         srcLen = wxWcslen(src) + 1;
2295
2296     size_t inbuflen = srcLen * SIZEOF_WCHAR_T;
2297     size_t outbuflen = dstLen;
2298     size_t res, cres;
2299
2300     wchar_t *tmpbuf = 0;
2301
2302     if (ms_wcNeedsSwap)
2303     {
2304         // need to copy to temp buffer to switch endianness
2305         // (doing WC_BSWAP twice on the original buffer won't help, as it
2306         //  could be in read-only memory, or be accessed in some other thread)
2307         tmpbuf = (wchar_t *)malloc(inbuflen + SIZEOF_WCHAR_T);
2308         for ( size_t i = 0; i < srcLen; i++ )
2309             tmpbuf[i] = WC_BSWAP(src[i]);
2310
2311         tmpbuf[srcLen] = L'\0';
2312         src = tmpbuf;
2313     }
2314
2315     char* inbuf = (char*)src;
2316     if ( dst )
2317     {
2318         // have destination buffer, convert there
2319         cres = iconv(w2m, ICONV_CHAR_CAST(&inbuf), &inbuflen, &dst, &outbuflen);
2320
2321         res = dstLen - outbuflen;
2322     }
2323     else // no destination buffer
2324     {
2325         // convert using temp buffer to calculate the size of the buffer needed
2326         char tbuf[16];
2327         res = 0;
2328         do
2329         {
2330             dst = tbuf;
2331             outbuflen = 16;
2332
2333             cres = iconv(w2m, ICONV_CHAR_CAST(&inbuf), &inbuflen, &dst, &outbuflen);
2334
2335             res += 16 - outbuflen;
2336         }
2337         while ((cres == (size_t)-1) && (errno == E2BIG));
2338     }
2339
2340     if (ms_wcNeedsSwap)
2341     {
2342         free(tmpbuf);
2343     }
2344
2345     if (ICONV_FAILED(cres, inbuflen))
2346     {
2347         wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
2348         return wxCONV_FAILED;
2349     }
2350
2351     return res;
2352 }
2353
2354 size_t wxMBConv_iconv::GetMBNulLen() const
2355 {
2356     if ( m_minMBCharWidth == 0 )
2357     {
2358         wxMBConv_iconv * const self = wxConstCast(this, wxMBConv_iconv);
2359
2360 #if wxUSE_THREADS
2361         // NB: explained in MB2WC
2362         wxMutexLocker lock(self->m_iconvMutex);
2363 #endif
2364
2365         const wchar_t *wnul = L"";
2366         char buf[8]; // should be enough for NUL in any encoding
2367         size_t inLen = sizeof(wchar_t),
2368                outLen = WXSIZEOF(buf);
2369         char *inBuff = (char *)wnul;
2370         char *outBuff = buf;
2371         if ( iconv(w2m, ICONV_CHAR_CAST(&inBuff), &inLen, &outBuff, &outLen) == (size_t)-1 )
2372         {
2373             self->m_minMBCharWidth = (size_t)-1;
2374         }
2375         else // ok
2376         {
2377             self->m_minMBCharWidth = outBuff - buf;
2378         }
2379     }
2380
2381     return m_minMBCharWidth;
2382 }
2383
#if wxUSE_UNICODE_UTF8
// check, ignoring the case, if the name of the encoding of this converter is
// one of the two spellings of UTF-8 recognized here
bool wxMBConv_iconv::IsUTF8() const
{
    if ( wxStricmp(m_name, "UTF-8") == 0 )
        return true;

    return wxStricmp(m_name, "UTF8") == 0;
}
#endif
2391
2392 #endif // HAVE_ICONV
2393
2394
2395 // ============================================================================
2396 // Win32 conversion classes
2397 // ============================================================================
2398
2399 #ifdef wxHAVE_WIN32_MB2WC
2400
2401 // from utils.cpp
2402 #if wxUSE_FONTMAP
2403 extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const char *charset);
2404 extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
2405 #endif
2406
2407 class wxMBConv_win32 : public wxMBConv
2408 {
2409 public:
2410     wxMBConv_win32()
2411     {
2412         m_CodePage = CP_ACP;
2413         m_minMBCharWidth = 0;
2414     }
2415
2416     wxMBConv_win32(const wxMBConv_win32& conv)
2417         : wxMBConv()
2418     {
2419         m_CodePage = conv.m_CodePage;
2420         m_minMBCharWidth = conv.m_minMBCharWidth;
2421     }
2422
2423 #if wxUSE_FONTMAP
2424     wxMBConv_win32(const char* name)
2425     {
2426         m_CodePage = wxCharsetToCodepage(name);
2427         m_minMBCharWidth = 0;
2428     }
2429
2430     wxMBConv_win32(wxFontEncoding encoding)
2431     {
2432         m_CodePage = wxEncodingToCodepage(encoding);
2433         m_minMBCharWidth = 0;
2434     }
2435 #endif // wxUSE_FONTMAP
2436
2437     virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2438     {
2439         // note that we have to use MB_ERR_INVALID_CHARS flag as it without it
2440         // the behaviour is not compatible with the Unix version (using iconv)
2441         // and break the library itself, e.g. wxTextInputStream::NextChar()
2442         // wouldn't work if reading an incomplete MB char didn't result in an
2443         // error
2444         //
2445         // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
2446         // Win XP or newer and it is not supported for UTF-[78] so we always
2447         // use our own conversions in this case. See
2448         //     http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
2449         //     http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
2450         if ( m_CodePage == CP_UTF8 )
2451         {
2452             return wxMBConvUTF8().MB2WC(buf, psz, n);
2453         }
2454
2455         if ( m_CodePage == CP_UTF7 )
2456         {
2457             return wxMBConvUTF7().MB2WC(buf, psz, n);
2458         }
2459
2460         int flags = 0;
2461         if ( (m_CodePage < 50000 && m_CodePage != CP_SYMBOL) &&
2462                 IsAtLeastWin2kSP4() )
2463         {
2464             flags = MB_ERR_INVALID_CHARS;
2465         }
2466
2467         const size_t len = ::MultiByteToWideChar
2468                              (
2469                                 m_CodePage,     // code page
2470                                 flags,          // flags: fall on error
2471                                 psz,            // input string
2472                                 -1,             // its length (NUL-terminated)
2473                                 buf,            // output string
2474                                 buf ? n : 0     // size of output buffer
2475                              );
2476         if ( !len )
2477         {
2478             // function totally failed
2479             return wxCONV_FAILED;
2480         }
2481
2482         // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
2483         // check if we succeeded, by doing a double trip:
2484         if ( !flags && buf )
2485         {
2486             const size_t mbLen = strlen(psz);
2487             wxCharBuffer mbBuf(mbLen);
2488             if ( ::WideCharToMultiByte
2489                    (
2490                       m_CodePage,
2491                       0,
2492                       buf,
2493                       -1,
2494                       mbBuf.data(),
2495                       mbLen + 1,        // size in bytes, not length
2496                       NULL,
2497                       NULL
2498                    ) == 0 ||
2499                   strcmp(mbBuf, psz) != 0 )
2500             {
2501                 // we didn't obtain the same thing we started from, hence
2502                 // the conversion was lossy and we consider that it failed
2503                 return wxCONV_FAILED;
2504             }
2505         }
2506
2507         // note that it returns count of written chars for buf != NULL and size
2508         // of the needed buffer for buf == NULL so in either case the length of
2509         // the string (which never includes the terminating NUL) is one less
2510         return len - 1;
2511     }
2512
2513     virtual size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
2514     {
2515         /*
2516             we have a problem here: by default, WideCharToMultiByte() may
2517             replace characters unrepresentable in the target code page with bad
2518             quality approximations such as turning "1/2" symbol (U+00BD) into
2519             "1" for the code pages which don't have it and we, obviously, want
2520             to avoid this at any price
2521
2522             the trouble is that this function does it _silently_, i.e. it won't
2523             even tell us whether it did or not... Win98/2000 and higher provide
2524             WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
2525             we have to resort to a round trip, i.e. check that converting back
2526             results in the same string -- this is, of course, expensive but
2527             otherwise we simply can't be sure to not garble the data.
2528          */
2529
2530         // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
2531         // it doesn't work with CJK encodings (which we test for rather roughly
2532         // here...) nor with UTF-7/8 nor, of course, with Windows versions not
2533         // supporting it
2534         BOOL usedDef wxDUMMY_INITIALIZE(false);
2535         BOOL *pUsedDef;
2536         int flags;
2537         if ( CanUseNoBestFit() && m_CodePage < 50000 )
2538         {
2539             // it's our lucky day
2540             flags = WC_NO_BEST_FIT_CHARS;
2541             pUsedDef = &usedDef;
2542         }
2543         else // old system or unsupported encoding
2544         {
2545             flags = 0;
2546             pUsedDef = NULL;
2547         }
2548
2549         const size_t len = ::WideCharToMultiByte
2550                              (
2551                                 m_CodePage,     // code page
2552                                 flags,          // either none or no best fit
2553                                 pwz,            // input string
2554                                 -1,             // it is (wide) NUL-terminated
2555                                 buf,            // output buffer
2556                                 buf ? n : 0,    // and its size
2557                                 NULL,           // default "replacement" char
2558                                 pUsedDef        // [out] was it used?
2559                              );
2560
2561         if ( !len )
2562         {
2563             // function totally failed
2564             return wxCONV_FAILED;
2565         }
2566
2567         // we did something, check if we really succeeded
2568         if ( flags )
2569         {
2570             // check if the conversion failed, i.e. if any replacements
2571             // were done
2572             if ( usedDef )
2573                 return wxCONV_FAILED;
2574         }
2575         else // we must resort to double tripping...
2576         {
2577             // first we need to ensure that we really have the MB data: this is
2578             // not the case if we're called with NULL buffer, in which case we
2579             // need to do the conversion yet again
2580             wxCharBuffer bufDef;
2581             if ( !buf )
2582             {
2583                 bufDef = wxCharBuffer(len);
2584                 buf = bufDef.data();
2585                 if ( !::WideCharToMultiByte(m_CodePage, flags, pwz, -1,
2586                                             buf, len, NULL, NULL) )
2587                     return wxCONV_FAILED;
2588             }
2589
2590             if ( !n )
2591                 n = wcslen(pwz);
2592             wxWCharBuffer wcBuf(n);
2593             if ( MB2WC(wcBuf.data(), buf, n + 1) == wxCONV_FAILED ||
2594                     wcscmp(wcBuf, pwz) != 0 )
2595             {
2596                 // we didn't obtain the same thing we started from, hence
2597                 // the conversion was lossy and we consider that it failed
2598                 return wxCONV_FAILED;
2599             }
2600         }
2601
2602         // see the comment above for the reason of "len - 1"
2603         return len - 1;
2604     }
2605
2606     virtual size_t GetMBNulLen() const
2607     {
2608         if ( m_minMBCharWidth == 0 )
2609         {
2610             int len = ::WideCharToMultiByte
2611                         (
2612                             m_CodePage,     // code page
2613                             0,              // no flags
2614                             L"",            // input string
2615                             1,              // translate just the NUL
2616                             NULL,           // output buffer
2617                             0,              // and its size
2618                             NULL,           // no replacement char
2619                             NULL            // [out] don't care if it was used
2620                         );
2621
2622             wxMBConv_win32 * const self = wxConstCast(this, wxMBConv_win32);
2623             switch ( len )
2624             {
2625                 default:
2626                     wxLogDebug(_T("Unexpected NUL length %d"), len);
2627                     self->m_minMBCharWidth = (size_t)-1;
2628                     break;
2629
2630                 case 0:
2631                     self->m_minMBCharWidth = (size_t)-1;
2632                     break;
2633
2634                 case 1:
2635                 case 2:
2636                 case 4:
2637                     self->m_minMBCharWidth = len;
2638                     break;
2639             }
2640         }
2641
2642         return m_minMBCharWidth;
2643     }
2644
2645     virtual wxMBConv *Clone() const { return new wxMBConv_win32(*this); }
2646
2647     bool IsOk() const { return m_CodePage != -1; }
2648
2649 private:
2650     static bool CanUseNoBestFit()
2651     {
2652         static int s_isWin98Or2k = -1;
2653
2654         if ( s_isWin98Or2k == -1 )
2655         {
2656             int verMaj, verMin;
2657             switch ( wxGetOsVersion(&verMaj, &verMin) )
2658             {
2659                 case wxOS_WINDOWS_9X:
2660                     s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
2661                     break;
2662
2663                 case wxOS_WINDOWS_NT:
2664                     s_isWin98Or2k = verMaj >= 5;
2665                     break;
2666
2667                 default:
2668                     // unknown: be conservative by default
2669                     s_isWin98Or2k = 0;
2670                     break;
2671             }
2672
2673             wxASSERT_MSG( s_isWin98Or2k != -1, _T("should be set above") );
2674         }
2675
2676         return s_isWin98Or2k == 1;
2677     }
2678
2679     static bool IsAtLeastWin2kSP4()
2680     {
2681 #ifdef __WXWINCE__
2682         return false;
2683 #else
2684         static int s_isAtLeastWin2kSP4 = -1;
2685
2686         if ( s_isAtLeastWin2kSP4 == -1 )
2687         {
2688             OSVERSIONINFOEX ver;
2689
2690             memset(&ver, 0, sizeof(ver));
2691             ver.dwOSVersionInfoSize = sizeof(ver);
2692             GetVersionEx((OSVERSIONINFO*)&ver);
2693
2694             s_isAtLeastWin2kSP4 =
2695               ((ver.dwMajorVersion > 5) || // Vista+
2696                (ver.dwMajorVersion == 5 && ver.dwMinorVersion > 0) || // XP/2003
2697                (ver.dwMajorVersion == 5 && ver.dwMinorVersion == 0 &&
2698                ver.wServicePackMajor >= 4)) // 2000 SP4+
2699               ? 1 : 0;
2700         }
2701
2702         return s_isAtLeastWin2kSP4 == 1;
2703 #endif
2704     }
2705
2706
2707     // the code page we're working with
2708     long m_CodePage;
2709
2710     // cached result of GetMBNulLen(), set to 0 initially meaning
2711     // "unknown"
2712     size_t m_minMBCharWidth;
2713 };
2714
2715 #endif // wxHAVE_WIN32_MB2WC
2716
2717
2718 // ============================================================================
2719 // wxEncodingConverter based conversion classes
2720 // ============================================================================
2721
2722 #if wxUSE_FONTMAP
2723
2724 class wxMBConv_wxwin : public wxMBConv
2725 {
2726 private:
2727     void Init()
2728     {
2729         // Refuse to use broken wxEncodingConverter code for Mac-specific encodings.
2730         // The wxMBConv_cf class does a better job.
2731         m_ok = (m_enc < wxFONTENCODING_MACMIN || m_enc > wxFONTENCODING_MACMAX) &&
2732                m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
2733                w2m.Init(wxFONTENCODING_UNICODE, m_enc);
2734     }
2735
2736 public:
2737     // temporarily just use wxEncodingConverter stuff,
2738     // so that it works while a better implementation is built
2739     wxMBConv_wxwin(const char* name)
2740     {
2741         if (name)
2742             m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
2743         else
2744             m_enc = wxFONTENCODING_SYSTEM;
2745
2746         Init();
2747     }
2748
2749     wxMBConv_wxwin(wxFontEncoding enc)
2750     {
2751         m_enc = enc;
2752
2753         Init();
2754     }
2755
2756     size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
2757     {
2758         size_t inbuf = strlen(psz);
2759         if (buf)
2760         {
2761             if (!m2w.Convert(psz, buf))
2762                 return wxCONV_FAILED;
2763         }
2764         return inbuf;
2765     }
2766
2767     size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
2768     {
2769         const size_t inbuf = wxWcslen(psz);
2770         if (buf)
2771         {
2772             if (!w2m.Convert(psz, buf))
2773                 return wxCONV_FAILED;
2774         }
2775
2776         return inbuf;
2777     }
2778
2779     virtual size_t GetMBNulLen() const
2780     {
2781         switch ( m_enc )
2782         {
2783             case wxFONTENCODING_UTF16BE:
2784             case wxFONTENCODING_UTF16LE:
2785                 return 2;
2786
2787             case wxFONTENCODING_UTF32BE:
2788             case wxFONTENCODING_UTF32LE:
2789                 return 4;
2790
2791             default:
2792                 return 1;
2793         }
2794     }
2795
2796     virtual wxMBConv *Clone() const { return new wxMBConv_wxwin(m_enc); }
2797
2798     bool IsOk() const { return m_ok; }
2799
public:
    // the encoding this converter was created for
    wxFontEncoding m_enc;
    // m2w is initialized to convert from m_enc to wxFONTENCODING_UNICODE and
    // w2m to convert in the opposite direction
    wxEncodingConverter m2w, w2m;

private:
    // were we initialized successfully?
    bool m_ok;

    DECLARE_NO_COPY_CLASS(wxMBConv_wxwin)
2809 };
2810
2811 // make the constructors available for unit testing
2812 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_wxwin( const char* name )
2813 {
2814     wxMBConv_wxwin* result = new wxMBConv_wxwin( name );
2815     if ( !result->IsOk() )
2816     {
2817         delete result;
2818         return 0;
2819     }
2820
2821     return result;
2822 }
2823
2824 #endif // wxUSE_FONTMAP
2825
2826 // ============================================================================
2827 // wxCSConv implementation
2828 // ============================================================================
2829
2830 void wxCSConv::Init()
2831 {
2832     m_name = NULL;
2833     m_convReal =  NULL;
2834     m_deferred = true;
2835 }
2836
2837 wxCSConv::wxCSConv(const wxString& charset)
2838 {
2839     Init();
2840
2841     if ( !charset.empty() )
2842     {
2843         SetName(charset.ToAscii());
2844     }
2845
2846 #if wxUSE_FONTMAP
2847     m_encoding = wxFontMapperBase::GetEncodingFromName(charset);
2848 #else
2849     m_encoding = wxFONTENCODING_SYSTEM;
2850 #endif
2851 }
2852
2853 wxCSConv::wxCSConv(wxFontEncoding encoding)
2854 {
2855     if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
2856     {
2857         wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
2858
2859         encoding = wxFONTENCODING_SYSTEM;
2860     }
2861
2862     Init();
2863
2864     m_encoding = encoding;
2865 }
2866
2867 wxCSConv::~wxCSConv()
2868 {
2869     Clear();
2870 }
2871
2872 wxCSConv::wxCSConv(const wxCSConv& conv)
2873         : wxMBConv()
2874 {
2875     Init();
2876
2877     SetName(conv.m_name);
2878     m_encoding = conv.m_encoding;
2879 }
2880
2881 wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
2882 {
2883     Clear();
2884
2885     SetName(conv.m_name);
2886     m_encoding = conv.m_encoding;
2887
2888     return *this;
2889 }
2890
2891 void wxCSConv::Clear()
2892 {
2893     free(m_name);
2894     delete m_convReal;
2895
2896     m_name = NULL;
2897     m_convReal = NULL;
2898 }
2899
2900 void wxCSConv::SetName(const char *charset)
2901 {
2902     if (charset)
2903     {
2904         m_name = wxStrdup(charset);
2905         m_deferred = true;
2906     }
2907 }
2908
#if wxUSE_FONTMAP

// Cache used by wxCSConv::DoCreate() when looking for an iconv converter: it
// maps the encoding to the charset name for which wxMBConv_iconv could be
// created, or to an empty string if this failed for all the names tried.
WX_DECLARE_HASH_MAP( wxFontEncoding, wxString, wxIntegerHash, wxIntegerEqual,
                     wxEncodingNameCache );

static wxEncodingNameCache gs_nameCache;
#endif
2916
2917 wxMBConv *wxCSConv::DoCreate() const
2918 {
2919 #if wxUSE_FONTMAP
2920     wxLogTrace(TRACE_STRCONV,
2921                wxT("creating conversion for %s"),
2922                (m_name ? m_name
2923                        : (const char*)wxFontMapperBase::GetEncodingName(m_encoding).mb_str()));
2924 #endif // wxUSE_FONTMAP
2925
2926     // check for the special case of ASCII or ISO8859-1 charset: as we have
2927     // special knowledge of it anyhow, we don't need to create a special
2928     // conversion object
2929     if ( m_encoding == wxFONTENCODING_ISO8859_1 ||
2930             m_encoding == wxFONTENCODING_DEFAULT )
2931     {
2932         // don't convert at all
2933         return NULL;
2934     }
2935
2936     // we trust OS to do conversion better than we can so try external
2937     // conversion methods first
2938     //
2939     // the full order is:
2940     //      1. OS conversion (iconv() under Unix or Win32 API)
2941     //      2. hard coded conversions for UTF
2942     //      3. wxEncodingConverter as fall back
2943
2944     // step (1)
2945 #ifdef HAVE_ICONV
2946 #if !wxUSE_FONTMAP
2947     if ( m_name )
2948 #endif // !wxUSE_FONTMAP
2949     {
2950 #if wxUSE_FONTMAP
2951         wxFontEncoding encoding(m_encoding);
2952 #endif
2953
2954         if ( m_name )
2955         {
2956             wxMBConv_iconv *conv = new wxMBConv_iconv(m_name);
2957             if ( conv->IsOk() )
2958                 return conv;
2959
2960             delete conv;
2961
2962 #if wxUSE_FONTMAP
2963             encoding =
2964                 wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
2965 #endif // wxUSE_FONTMAP
2966         }
2967 #if wxUSE_FONTMAP
2968         {
2969             const wxEncodingNameCache::iterator it = gs_nameCache.find(encoding);
2970             if ( it != gs_nameCache.end() )
2971             {
2972                 if ( it->second.empty() )
2973                     return NULL;
2974
2975                 wxMBConv_iconv *conv = new wxMBConv_iconv(it->second.ToAscii());
2976                 if ( conv->IsOk() )
2977                     return conv;
2978
2979                 delete conv;
2980             }
2981
2982             const wxChar** names = wxFontMapperBase::GetAllEncodingNames(encoding);
2983             // CS : in case this does not return valid names (eg for MacRoman)
2984             // encoding got a 'failure' entry in the cache all the same,
2985             // although it just has to be created using a different method, so
2986             // only store failed iconv creation attempts (or perhaps we
2987             // shoulnd't do this at all ?)
2988             if ( names[0] != NULL )
2989             {
2990                 for ( ; *names; ++names )
2991                 {
2992                     // FIXME-UTF8: wxFontMapperBase::GetAllEncodingNames()
2993                     //             will need changes that will obsolete this
2994                     wxString name(*names);
2995                     wxMBConv_iconv *conv = new wxMBConv_iconv(name.ToAscii());
2996                     if ( conv->IsOk() )
2997                     {
2998                         gs_nameCache[encoding] = *names;
2999                         return conv;
3000                     }
3001
3002                     delete conv;
3003                 }
3004
3005                 gs_nameCache[encoding] = _T(""); // cache the failure
3006             }
3007         }
3008 #endif // wxUSE_FONTMAP
3009     }
3010 #endif // HAVE_ICONV
3011
3012 #ifdef wxHAVE_WIN32_MB2WC
3013     {
3014 #if wxUSE_FONTMAP
3015         wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
3016                                       : new wxMBConv_win32(m_encoding);
3017         if ( conv->IsOk() )
3018             return conv;
3019
3020         delete conv;
3021 #else
3022         return NULL;
3023 #endif
3024     }
3025 #endif // wxHAVE_WIN32_MB2WC
3026
3027 #ifdef __DARWIN__
3028     {
3029         // leave UTF16 and UTF32 to the built-ins of wx
3030         if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
3031             ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
3032         {
3033 #if wxUSE_FONTMAP
3034             wxMBConv_cf *conv = m_name ? new wxMBConv_cf(m_name)
3035                                           : new wxMBConv_cf(m_encoding);
3036 #else
3037             wxMBConv_cf *conv = new wxMBConv_cf(m_encoding);
3038 #endif
3039
3040             if ( conv->IsOk() )
3041                  return conv;
3042
3043             delete conv;
3044         }
3045     }
3046 #endif // __DARWIN__
3047
3048     // step (2)
3049     wxFontEncoding enc = m_encoding;
3050 #if wxUSE_FONTMAP
3051     if ( enc == wxFONTENCODING_SYSTEM && m_name )
3052     {
3053         // use "false" to suppress interactive dialogs -- we can be called from
3054         // anywhere and popping up a dialog from here is the last thing we want to
3055         // do
3056         enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
3057     }
3058 #endif // wxUSE_FONTMAP
3059
3060     switch ( enc )
3061     {
3062         case wxFONTENCODING_UTF7:
3063              return new wxMBConvUTF7;
3064
3065         case wxFONTENCODING_UTF8:
3066              return new wxMBConvUTF8;
3067
3068         case wxFONTENCODING_UTF16BE:
3069              return new wxMBConvUTF16BE;
3070
3071         case wxFONTENCODING_UTF16LE:
3072              return new wxMBConvUTF16LE;
3073
3074         case wxFONTENCODING_UTF32BE:
3075              return new wxMBConvUTF32BE;
3076
3077         case wxFONTENCODING_UTF32LE:
3078              return new wxMBConvUTF32LE;
3079
3080         default:
3081              // nothing to do but put here to suppress gcc warnings
3082              break;
3083     }
3084
3085     // step (3)
3086 #if wxUSE_FONTMAP
3087     {
3088         wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
3089                                       : new wxMBConv_wxwin(m_encoding);
3090         if ( conv->IsOk() )
3091             return conv;
3092
3093         delete conv;
3094     }
3095 #endif // wxUSE_FONTMAP
3096
3097     // NB: This is a hack to prevent deadlock. What could otherwise happen
3098     //     in Unicode build: wxConvLocal creation ends up being here
3099     //     because of some failure and logs the error. But wxLog will try to
3100     //     attach a timestamp, for which it will need wxConvLocal (to convert
3101     //     time to char* and then wchar_t*), but that fails, tries to log the
3102     //     error, but wxLog has an (already locked) critical section that
3103     //     guards the static buffer.
3104     static bool alreadyLoggingError = false;
3105     if (!alreadyLoggingError)
3106     {
3107         alreadyLoggingError = true;
3108         wxLogError(_("Cannot convert from the charset '%s'!"),
3109                    m_name ? m_name
3110                       :
3111 #if wxUSE_FONTMAP
3112                          (const char*)wxFontMapperBase::GetEncodingDescription(m_encoding).ToAscii()
3113 #else // !wxUSE_FONTMAP
3114                          (const char*)wxString::Format(_("encoding %i"), m_encoding).ToAscii()
3115 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
3116               );
3117
3118         alreadyLoggingError = false;
3119     }
3120
3121     return NULL;
3122 }
3123
3124 void wxCSConv::CreateConvIfNeeded() const
3125 {
3126     if ( m_deferred )
3127     {
3128         wxCSConv *self = (wxCSConv *)this; // const_cast
3129
3130         // if we don't have neither the name nor the encoding, use the default
3131         // encoding for this system
3132         if ( !m_name && m_encoding == wxFONTENCODING_SYSTEM )
3133         {
3134 #if wxUSE_INTL
3135             self->m_encoding = wxLocale::GetSystemEncoding();
3136 #else
3137             // fallback to some reasonable default:
3138             self->m_encoding = wxFONTENCODING_ISO8859_1;
3139 #endif // wxUSE_INTL
3140         }
3141
3142         self->m_convReal = DoCreate();
3143         self->m_deferred = false;
3144     }
3145 }
3146
3147 bool wxCSConv::IsOk() const
3148 {
3149     CreateConvIfNeeded();
3150
3151     // special case: no convReal created for wxFONTENCODING_ISO8859_1
3152     if ( m_encoding == wxFONTENCODING_ISO8859_1 )
3153         return true; // always ok as we do it ourselves
3154
3155     // m_convReal->IsOk() is called at its own creation, so we know it must
3156     // be ok if m_convReal is non-NULL
3157     return m_convReal != NULL;
3158 }
3159
3160 size_t wxCSConv::ToWChar(wchar_t *dst, size_t dstLen,
3161                          const char *src, size_t srcLen) const
3162 {
3163     CreateConvIfNeeded();
3164
3165     if (m_convReal)
3166         return m_convReal->ToWChar(dst, dstLen, src, srcLen);
3167
3168     // latin-1 (direct)
3169     if ( srcLen == wxNO_LEN )
3170         srcLen = strlen(src) + 1; // take trailing NUL too
3171
3172     if ( dst )
3173     {
3174         if ( dstLen < srcLen )
3175             return wxCONV_FAILED;
3176
3177         for ( size_t n = 0; n < srcLen; n++ )
3178             dst[n] = (unsigned char)(src[n]);
3179     }
3180
3181     return srcLen;
3182 }
3183
3184 size_t wxCSConv::FromWChar(char *dst, size_t dstLen,
3185                            const wchar_t *src, size_t srcLen) const
3186 {
3187     CreateConvIfNeeded();
3188
3189     if (m_convReal)
3190         return m_convReal->FromWChar(dst, dstLen, src, srcLen);
3191
3192     // latin-1 (direct)
3193     if ( srcLen == wxNO_LEN )
3194         srcLen = wxWcslen(src) + 1;
3195
3196     if ( dst )
3197     {
3198         if ( dstLen < srcLen )
3199             return wxCONV_FAILED;
3200
3201         for ( size_t n = 0; n < srcLen; n++ )
3202         {
3203             if ( src[n] > 0xFF )
3204                 return wxCONV_FAILED;
3205
3206             dst[n] = (char)src[n];
3207         }
3208
3209     }
3210     else // still need to check the input validity
3211     {
3212         for ( size_t n = 0; n < srcLen; n++ )
3213         {
3214             if ( src[n] > 0xFF )
3215                 return wxCONV_FAILED;
3216         }
3217     }
3218
3219     return srcLen;
3220 }
3221
3222 size_t wxCSConv::GetMBNulLen() const
3223 {
3224     CreateConvIfNeeded();
3225
3226     if ( m_convReal )
3227     {
3228         return m_convReal->GetMBNulLen();
3229     }
3230
3231     // otherwise, we are ISO-8859-1
3232     return 1;
3233 }
3234
#if wxUSE_UNICODE_UTF8
// Returns whether the real converter is an UTF-8 one; without a real
// converter we are ISO-8859-1 and so the answer is always false.
bool wxCSConv::IsUTF8() const
{
    CreateConvIfNeeded();

    return m_convReal && m_convReal->IsUTF8();
}
#endif // wxUSE_UNICODE_UTF8
3249
3250
3251 #if wxUSE_UNICODE
3252
3253 wxWCharBuffer wxSafeConvertMB2WX(const char *s)
3254 {
3255     if ( !s )
3256         return wxWCharBuffer();
3257
3258     wxWCharBuffer wbuf(wxConvLibc.cMB2WX(s));
3259     if ( !wbuf )
3260         wbuf = wxMBConvUTF8().cMB2WX(s);
3261     if ( !wbuf )
3262         wbuf = wxConvISO8859_1.cMB2WX(s);
3263
3264     return wbuf;
3265 }
3266
3267 wxCharBuffer wxSafeConvertWX2MB(const wchar_t *ws)
3268 {
3269     if ( !ws )
3270         return wxCharBuffer();
3271
3272     wxCharBuffer buf(wxConvLibc.cWX2MB(ws));
3273     if ( !buf )
3274         buf = wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL).cWX2MB(ws);
3275
3276     return buf;
3277 }
3278
3279 #endif // wxUSE_UNICODE
3280
3281 // ----------------------------------------------------------------------------
3282 // globals
3283 // ----------------------------------------------------------------------------
3284
// NB: The reason why we create converter objects in this convoluted way,
//     using a factory function instead of a global variable, is that they
//     may be used at static initialization time (some of them are used by
//     wxString ctors and there may be a global wxString object). In other
//     words, possibly _before_ the converter global object would be
//     initialized.
3291
3292 #undef wxConvLibc
3293 #undef wxConvUTF8
3294 #undef wxConvUTF7
3295 #undef wxConvLocal
3296 #undef wxConvISO8859_1
3297
// Defines the global converter called "name":
//  - the exported pointer name##Ptr, initially NULL
//  - the exported function wxGet_##name##Ptr() returning the address of the
//    converter object, which is a function-local static of type impl_klass
//    constructed with ctor_args on the first call
//  - a file-local static initialized by calling this function
//
// klass is the type of the pointers, impl_klass the type of the object.
// The macro must be followed by a semicolon.
#define WX_DEFINE_GLOBAL_CONV2(klass, impl_klass, name, ctor_args)      \
    WXDLLIMPEXP_DATA_BASE(klass*) name##Ptr = NULL;                     \
    WXDLLIMPEXP_BASE klass* wxGet_##name##Ptr()                         \
    {                                                                   \
        static impl_klass name##Obj ctor_args;                          \
        return &name##Obj;                                              \
    }                                                                   \
    /* this ensures that all global converter objects are created */    \
    /* by the time static initialization is done, i.e. before any */    \
    /* thread is launched: */                                           \
    static klass* gs_##name##instance = wxGet_##name##Ptr()

// Same as WX_DEFINE_GLOBAL_CONV2() when the object is of the same type as
// the pointers to it.
#define WX_DEFINE_GLOBAL_CONV(klass, name, ctor_args) \
    WX_DEFINE_GLOBAL_CONV2(klass, klass, name, ctor_args)
3312
// wxConvLibc is implemented by wxMBConv_win32 under Windows and by
// wxMBConvLibc elsewhere
#ifdef __WINDOWS__
    WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConv_win32, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
#else
    WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConvLibc, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
#endif

// NB: we can't use wxEMPTY_PARAMETER_VALUE as final argument here because it's
//     passed to WX_DEFINE_GLOBAL_CONV2 after a macro expansion and so still
//     provokes an error message about "not enough macro parameters"; and we
//     can't use "()" here as the name##Obj declaration would be parsed as a
//     function declaration then, so use a semicolon and live with an extra
//     empty statement (and hope that no compiler warns about this)
WX_DEFINE_GLOBAL_CONV(wxMBConvStrictUTF8, wxConvUTF8, ;);
WX_DEFINE_GLOBAL_CONV(wxMBConvUTF7, wxConvUTF7, ;);

WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvLocal, (wxFONTENCODING_SYSTEM));
WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvISO8859_1, (wxFONTENCODING_ISO8859_1));

// wxConvCurrent initially points to the libc converter and wxConvUI to the
// wxConvLocal one
WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = wxGet_wxConvLibcPtr();
WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvUI = wxGet_wxConvLocalPtr();
3333
#ifdef __DARWIN__
// The xnu kernel always communicates file paths in decomposed UTF-8.
// WARNING: Are we sure that CFString's conversion will cause decomposition?
static wxMBConv_cf wxConvMacUTF8DObj(wxFONTENCODING_UTF8);
#endif

// wxConvFileName points to the wxMBConv_cf UTF-8 converter defined above
// under Darwin and to the libc converter everywhere else
WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName =
#ifdef __DARWIN__
                                    &wxConvMacUTF8DObj;
#else // !__DARWIN__
                                    wxGet_wxConvLibcPtr();
#endif // __DARWIN__/!__DARWIN__
3346
3347 #else // !wxUSE_WCHAR_T
3348
// FIXME-UTF8: remove this, wxUSE_WCHAR_T is required now
// stand-ins in absence of wchar_t: all the converters are plain wxMBConv
// objects in this case
WXDLLIMPEXP_DATA_BASE(wxMBConv) wxConvLibc,
                                wxConvISO8859_1,
                                wxConvLocal,
                                wxConvUTF8;
3355
3356 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T