src/common/strconv.cpp

   1 /////////////////////////////////////////////////////////////////////////////
   2 // Name:        src/common/strconv.cpp
   3 // Purpose:     Unicode conversion classes
   4 // Author:      Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
   5 //              Ryan Norton, Fredrik Roubert (UTF7)
   6 // Modified by:
   7 // Created:     29/01/98
   8 // RCS-ID:      $Id$
   9 // Copyright:   (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
  10 //              (c) 2000-2003 Vadim Zeitlin
  11 //              (c) 2004 Ryan Norton, Fredrik Roubert
  12 // Licence:     wxWindows licence
  13 /////////////////////////////////////////////////////////////////////////////
  14
  15 // For compilers that support precompilation, includes "wx.h".
  16 #include "wx/wxprec.h"
  17
  18 #ifdef __BORLANDC__
  19     #pragma hdrstop
  20 #endif  //__BORLANDC__
  21
  22 #ifndef WX_PRECOMP
  23     #include "wx/intl.h"
  24     #include "wx/log.h"
  25     #include "wx/utils.h"
  26     #include "wx/hashmap.h"
  27 #endif
  28
  29 #include "wx/strconv.h"
  30
  31 #if wxUSE_WCHAR_T
  32
  33 #ifndef __WXWINCE__
  34 #include <errno.h>
  35 #endif
  36
  37 #include <ctype.h>
  38 #include <string.h>
  39 #include <stdlib.h>
  40
  41 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
  42     #include "wx/msw/private.h"
  43     #include "wx/msw/missing.h"
  44     #define wxHAVE_WIN32_MB2WC
  45 #endif
  46
  47 #ifdef HAVE_ICONV
  48     #include <iconv.h>
  49     #include "wx/thread.h"
  50 #endif
  51
  52 #include "wx/encconv.h"
  53 #include "wx/fontmap.h"
  54
  55 #ifdef __DARWIN__
  56 #include "wx/osx/core/private/strconv_cf.h"
  57 #endif //def __DARWIN__
  58
  59
  60 #define TRACE_STRCONV _T("strconv")
  61
  62 // WC_UTF16 is defined only if sizeof(wchar_t) == 2, otherwise it's supposed to
  63 // be 4 bytes
  64 #if SIZEOF_WCHAR_T == 2
  65     #define WC_UTF16
  66 #endif
  67
  68
  69 // ============================================================================
  70 // implementation
  71 // ============================================================================
  72
  73 // helper function of cMB2WC(): check if n bytes at this location are all NUL
  74 static bool NotAllNULs(const char *p, size_t n)
  75 {
  76     while ( n && *p++ == '\0' )
  77         n--;
  78
  79     return n != 0;
  80 }
  81
  82 // ----------------------------------------------------------------------------
  83 // UTF-16 en/decoding to/from UCS-4 with surrogates handling
  84 // ----------------------------------------------------------------------------
  85
  86 static size_t encode_utf16(wxUint32 input, wxUint16 *output)
  87 {
  88     if (input <= 0xffff)
  89     {
  90         if (output)
  91             *output = (wxUint16) input;
  92
  93         return 1;
  94     }
  95     else if (input >= 0x110000)
  96     {
  97         return wxCONV_FAILED;
  98     }
  99     else
 100     {
 101         if (output)
 102         {
 103             *output++ = (wxUint16) ((input >> 10) + 0xd7c0);
 104             *output = (wxUint16) ((input & 0x3ff) + 0xdc00);
 105         }
 106
 107         return 2;
 108     }
 109 }
 110
 111 static size_t decode_utf16(const wxUint16* input, wxUint32& output)
 112 {
 113     if ((*input < 0xd800) || (*input > 0xdfff))
 114     {
 115         output = *input;
 116         return 1;
 117     }
 118     else if ((input[1] < 0xdc00) || (input[1] > 0xdfff))
 119     {
 120         output = *input;
 121         return wxCONV_FAILED;
 122     }
 123     else
 124     {
 125         output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
 126         return 2;
 127     }
 128 }
 129
 130 #ifdef WC_UTF16
 131     typedef wchar_t wxDecodeSurrogate_t;
 132 #else // !WC_UTF16
 133     typedef wxUint16 wxDecodeSurrogate_t;
 134 #endif // WC_UTF16/!WC_UTF16
 135
 136 // returns the next UTF-32 character from the wchar_t buffer and advances the
 137 // pointer to the character after this one
 138 //
 139 // if an invalid character is found, *pSrc is set to NULL, the caller must
 140 // check for this
 141 static wxUint32 wxDecodeSurrogate(const wxDecodeSurrogate_t **pSrc)
 142 {
 143     wxUint32 out;
 144     const size_t
 145         n = decode_utf16(wx_reinterpret_cast(const wxUint16 *, *pSrc), out);
 146     if ( n == wxCONV_FAILED )
 147         *pSrc = NULL;
 148     else
 149         *pSrc += n;
 150
 151     return out;
 152 }
 153
 154 // ----------------------------------------------------------------------------
 155 // wxMBConv
 156 // ----------------------------------------------------------------------------
 157
 158 size_t
 159 wxMBConv::ToWChar(wchar_t *dst, size_t dstLen,
 160                   const char *src, size_t srcLen) const
 161 {
 162     // although new conversion classes are supposed to implement this function
 163     // directly, the existins ones only implement the old MB2WC() and so, to
 164     // avoid to have to rewrite all conversion classes at once, we provide a
 165     // default (but not efficient) implementation of this one in terms of the
 166     // old function by copying the input to ensure that it's NUL-terminated and
 167     // then using MB2WC() to convert it
 168
 169     // the number of chars [which would be] written to dst [if it were not NULL]
 170     size_t dstWritten = 0;
 171
 172     // the number of NULs terminating this string
 173     size_t nulLen = 0;  // not really needed, but just to avoid warnings
 174
 175     // if we were not given the input size we just have to assume that the
 176     // string is properly terminated as we have no way of knowing how long it
 177     // is anyhow, but if we do have the size check whether there are enough
 178     // NULs at the end
 179     wxCharBuffer bufTmp;
 180     const char *srcEnd;
 181     if ( srcLen != wxNO_LEN )
 182     {
 183         // we need to know how to find the end of this string
 184         nulLen = GetMBNulLen();
 185         if ( nulLen == wxCONV_FAILED )
 186             return wxCONV_FAILED;
 187
 188         // if there are enough NULs we can avoid the copy
 189         if ( srcLen < nulLen || NotAllNULs(src + srcLen - nulLen, nulLen) )
 190         {
 191             // make a copy in order to properly NUL-terminate the string
 192             bufTmp = wxCharBuffer(srcLen + nulLen - 1 /* 1 will be added */);
 193             char * const p = bufTmp.data();
 194             memcpy(p, src, srcLen);
 195             for ( char *s = p + srcLen; s < p + srcLen + nulLen; s++ )
 196                 *s = '\0';
 197
 198             src = bufTmp;
 199         }
 200
 201         srcEnd = src + srcLen;
 202     }
 203     else // quit after the first loop iteration
 204     {
 205         srcEnd = NULL;
 206     }
 207
 208     for ( ;; )
 209     {
 210         // try to convert the current chunk
 211         size_t lenChunk = MB2WC(NULL, src, 0);
 212         if ( lenChunk == wxCONV_FAILED )
 213             return wxCONV_FAILED;
 214
 215         lenChunk++; // for the L'\0' at the end of this chunk
 216
 217         dstWritten += lenChunk;
 218
 219         if ( lenChunk == 1 )
 220         {
 221             // nothing left in the input string, conversion succeeded
 222             break;
 223         }
 224
 225         if ( dst )
 226         {
 227             if ( dstWritten > dstLen )
 228                 return wxCONV_FAILED;
 229
 230             if ( MB2WC(dst, src, lenChunk) == wxCONV_FAILED )
 231                 return wxCONV_FAILED;
 232
 233             dst += lenChunk;
 234         }
 235
 236         if ( !srcEnd )
 237         {
 238             // we convert just one chunk in this case as this is the entire
 239             // string anyhow
 240             break;
 241         }
 242
 243         // advance the input pointer past the end of this chunk
 244         while ( NotAllNULs(src, nulLen) )
 245         {
 246             // notice that we must skip over multiple bytes here as we suppose
 247             // that if NUL takes 2 or 4 bytes, then all the other characters do
 248             // too and so if advanced by a single byte we might erroneously
 249             // detect sequences of NUL bytes in the middle of the input
 250             src += nulLen;
 251         }
 252
 253         src += nulLen; // skipping over its terminator as well
 254
 255         // note that ">=" (and not just "==") is needed here as the terminator
 256         // we skipped just above could be inside or just after the buffer
 257         // delimited by inEnd
 258         if ( src >= srcEnd )
 259             break;
 260     }
 261
 262     return dstWritten;
 263 }
 264
 265 size_t
 266 wxMBConv::FromWChar(char *dst, size_t dstLen,
 267                     const wchar_t *src, size_t srcLen) const
 268 {
 269     // the number of chars [which would be] written to dst [if it were not NULL]
 270     size_t dstWritten = 0;
 271
 272     // make a copy of the input string unless it is already properly
 273     // NUL-terminated
 274     //
 275     // if we don't know its length we have no choice but to assume that it is,
 276     // indeed, properly terminated
 277     wxWCharBuffer bufTmp;
 278     if ( srcLen == wxNO_LEN )
 279     {
 280         srcLen = wxWcslen(src) + 1;
 281     }
 282     else if ( srcLen != 0 && src[srcLen - 1] != L'\0' )
 283     {
 284         // make a copy in order to properly NUL-terminate the string
 285         bufTmp = wxWCharBuffer(srcLen);
 286         memcpy(bufTmp.data(), src, srcLen * sizeof(wchar_t));
 287         src = bufTmp;
 288     }
 289
 290     const size_t lenNul = GetMBNulLen();
 291     for ( const wchar_t * const srcEnd = src + srcLen;
 292           src < srcEnd;
 293           src += wxWcslen(src) + 1 /* skip L'\0' too */ )
 294     {
 295         // try to convert the current chunk
 296         size_t lenChunk = WC2MB(NULL, src, 0);
 297
 298         if ( lenChunk == wxCONV_FAILED )
 299             return wxCONV_FAILED;
 300
 301         lenChunk += lenNul;
 302         dstWritten += lenChunk;
 303
 304         if ( dst )
 305         {
 306             if ( dstWritten > dstLen )
 307                 return wxCONV_FAILED;
 308
 309             if ( WC2MB(dst, src, lenChunk) == wxCONV_FAILED )
 310                 return wxCONV_FAILED;
 311
 312             dst += lenChunk;
 313         }
 314     }
 315
 316     return dstWritten;
 317 }
 318
 319 size_t wxMBConv::MB2WC(wchar_t *outBuff, const char *inBuff, size_t outLen) const
 320 {
 321     size_t rc = ToWChar(outBuff, outLen, inBuff);
 322     if ( rc != wxCONV_FAILED )
 323     {
 324         // ToWChar() returns the buffer length, i.e. including the trailing
 325         // NUL, while this method doesn't take it into account
 326         rc--;
 327     }
 328
 329     return rc;
 330 }
 331
 332 size_t wxMBConv::WC2MB(char *outBuff, const wchar_t *inBuff, size_t outLen) const
 333 {
 334     size_t rc = FromWChar(outBuff, outLen, inBuff);
 335     if ( rc != wxCONV_FAILED )
 336     {
 337         rc -= GetMBNulLen();
 338     }
 339
 340     return rc;
 341 }
 342
 343 wxMBConv::~wxMBConv()
 344 {
 345     // nothing to do here (necessary for Darwin linking probably)
 346 }
 347
 348 const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
 349 {
 350     if ( psz )
 351     {
 352         // calculate the length of the buffer needed first
 353         const size_t nLen = ToWChar(NULL, 0, psz);
 354         if ( nLen != wxCONV_FAILED )
 355         {
 356             // now do the actual conversion
 357             wxWCharBuffer buf(nLen - 1 /* +1 added implicitly */);
 358
 359             // +1 for the trailing NULL
 360             if ( ToWChar(buf.data(), nLen, psz) != wxCONV_FAILED )
 361                 return buf;
 362         }
 363     }
 364
 365     return wxWCharBuffer();
 366 }
 367
 368 const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
 369 {
 370     if ( pwz )
 371     {
 372         const size_t nLen = FromWChar(NULL, 0, pwz);
 373         if ( nLen != wxCONV_FAILED )
 374         {
 375             wxCharBuffer buf(nLen - 1);
 376             if ( FromWChar(buf.data(), nLen, pwz) != wxCONV_FAILED )
 377                 return buf;
 378         }
 379     }
 380
 381     return wxCharBuffer();
 382 }
 383
 384 const wxWCharBuffer
 385 wxMBConv::cMB2WC(const char *inBuff, size_t inLen, size_t *outLen) const
 386 {
 387     const size_t dstLen = ToWChar(NULL, 0, inBuff, inLen);
 388     if ( dstLen != wxCONV_FAILED )
 389     {
 390         // notice that we allocate space for dstLen+1 wide characters here
 391         // because we want the buffer to always be NUL-terminated, even if the
 392         // input isn't (as otherwise the caller has no way to know its length)
 393         wxWCharBuffer wbuf(dstLen);
 394         wbuf.data()[dstLen - 1] = L'\0';
 395         if ( ToWChar(wbuf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
 396         {
 397             if ( outLen )
 398             {
 399                 *outLen = dstLen;
 400                 if ( wbuf[dstLen - 1] == L'\0' )
 401                     (*outLen)--;
 402             }
 403
 404             return wbuf;
 405         }
 406     }
 407
 408     if ( outLen )
 409         *outLen = 0;
 410
 411     return wxWCharBuffer();
 412 }
 413
 414 const wxCharBuffer
 415 wxMBConv::cWC2MB(const wchar_t *inBuff, size_t inLen, size_t *outLen) const
 416 {
 417     size_t dstLen = FromWChar(NULL, 0, inBuff, inLen);
 418     if ( dstLen != wxCONV_FAILED )
 419     {
 420         const size_t nulLen = GetMBNulLen();
 421
 422         // as above, ensure that the buffer is always NUL-terminated, even if
 423         // the input is not
 424         wxCharBuffer buf(dstLen + nulLen - 1);
 425         memset(buf.data() + dstLen, 0, nulLen);
 426         if ( FromWChar(buf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
 427         {
 428             if ( outLen )
 429             {
 430                 *outLen = dstLen;
 431
 432                 if ( dstLen >= nulLen &&
 433                         !NotAllNULs(buf.data() + dstLen - nulLen, nulLen) )
 434                 {
 435                     // in this case the output is NUL-terminated and we're not
 436                     // supposed to count NUL
 437                     *outLen -= nulLen;
 438                 }
 439             }
 440
 441             return buf;
 442         }
 443     }
 444
 445     if ( outLen )
 446         *outLen = 0;
 447
 448     return wxCharBuffer();
 449 }
 450
 451 // ----------------------------------------------------------------------------
 452 // wxMBConvLibc
 453 // ----------------------------------------------------------------------------
 454
 455 size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 456 {
 457     return wxMB2WC(buf, psz, n);
 458 }
 459
 460 size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 461 {
 462     return wxWC2MB(buf, psz, n);
 463 }
 464
 465 // ----------------------------------------------------------------------------
 466 // wxConvBrokenFileNames
 467 // ----------------------------------------------------------------------------
 468
 469 #ifdef __UNIX__
 470
 471 wxConvBrokenFileNames::wxConvBrokenFileNames(const wxString& charset)
 472 {
 473     if ( wxStricmp(charset, _T("UTF-8")) == 0 ||
 474          wxStricmp(charset, _T("UTF8")) == 0  )
 475         m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_PUA);
 476     else
 477         m_conv = new wxCSConv(charset);
 478 }
 479
 480 #endif // __UNIX__
 481
 482 // ----------------------------------------------------------------------------
 483 // UTF-7
 484 // ----------------------------------------------------------------------------
 485
 486 // Implementation (C) 2004 Fredrik Roubert
 487 //
 488 // Changes to work in streaming mode (C) 2008 Vadim Zeitlin
 489
 490 //
 491 // BASE64 decoding table
 492 //
 493 static const unsigned char utf7unb64[] =
 494 {
 495     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 496     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 497     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 498     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 499     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 500     0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
 501     0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b,
 502     0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 503     0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
 504     0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
 505     0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
 506     0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
 507     0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
 508     0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
 509     0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30,
 510     0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
 511     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 512     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 513     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 514     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 515     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 516     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 517     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 518     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 519     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 520     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 521     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 522     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 523     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 524     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 525     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 526     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
 527 };
 528
 529 size_t wxMBConvUTF7::ToWChar(wchar_t *dst, size_t dstLen,
 530                              const char *src, size_t srcLen) const
 531 {
 532     DecoderState stateOrig,
 533          *statePtr;
 534     if ( srcLen == wxNO_LEN )
 535     {
 536         // convert the entire string, up to and including the trailing NUL
 537         srcLen = strlen(src) + 1;
 538
 539         // when working on the entire strings we don't update nor use the shift
 540         // state from the previous call
 541         statePtr = &stateOrig;
 542     }
 543     else // when working with partial strings we do use the shift state
 544     {
 545         statePtr = wx_const_cast(DecoderState *, &m_stateDecoder);
 546
 547         // also save the old state to be able to rollback to it on error
 548         stateOrig = m_stateDecoder;
 549     }
 550
 551     // but to simplify the code below we use this variable in both cases
 552     DecoderState& state = *statePtr;
 553
 554
 555     // number of characters [which would have been] written to dst [if it were
 556     // not NULL]
 557     size_t len = 0;
 558
 559     const char * const srcEnd = src + srcLen;
 560
 561     while ( (src < srcEnd) && (!dst || (len < dstLen)) )
 562     {
 563         const unsigned char cc = *src++;
 564
 565         if ( state.IsShifted() )
 566         {
 567             const unsigned char dc = utf7unb64[cc];
 568             if ( dc == 0xff )
 569             {
 570                 // end of encoded part
 571                 state.ToDirect();
 572
 573                 // re-parse this character normally below unless it's '-' which
 574                 // is consumed by the decoder
 575                 if ( cc == '-' )
 576                     continue;
 577             }
 578             else // valid encoded character
 579             {
 580                 // mini base64 decoder: each character is 6 bits
 581                 state.bit += 6;
 582                 state.accum <<= 6;
 583                 state.accum += dc;
 584
 585                 if ( state.bit >= 8 )
 586                 {
 587                     // got the full byte, consume it
 588                     state.bit -= 8;
 589                     unsigned char b = (state.accum >> state.bit) & 0x00ff;
 590
 591                     if ( state.isLSB )
 592                     {
 593                         // we've got the full word, output it
 594                         if ( dst )
 595                             *dst++ = (state.msb << 8) | b;
 596                         len++;
 597                         state.isLSB = false;
 598                     }
 599                     else // MSB
 600                     {
 601                         // just store it while we wait for LSB
 602                         state.msb = b;
 603                         state.isLSB = true;
 604                     }
 605                 }
 606             }
 607         }
 608
 609         if ( state.IsDirect() )
 610         {
 611             // start of an encoded segment?
 612             if ( cc == '+' )
 613             {
 614                 if ( src == srcEnd )
 615                     return wxCONV_FAILED; // can't have '+' at the end
 616
 617                 if ( *src == '-' )
 618                 {
 619                     // just the encoded plus sign, don't switch to shifted mode
 620                     if ( dst )
 621                         *dst++ = '+';
 622                     len++;
 623                     src++;
 624                 }
 625                 else
 626                 {
 627                     state.ToShifted();
 628                 }
 629             }
 630             else // not '+'
 631             {
 632                 // only printable 7 bit ASCII characters (with the exception of
 633                 // NUL, TAB, CR and LF) can be used directly
 634                 if ( cc >= 0x7f || (cc < ' ' &&
 635                       !(cc == '\0' || cc == '\t' || cc == '\r' || cc == '\n')) )
 636                     return wxCONV_FAILED;
 637
 638                 if ( dst )
 639                     *dst++ = cc;
 640                 len++;
 641             }
 642         }
 643     }
 644
 645     if ( !len )
 646     {
 647         // as we didn't read any characters we should be called with the same
 648         // data (followed by some more new data) again later so don't save our
 649         // state
 650         state = stateOrig;
 651
 652         return wxCONV_FAILED;
 653     }
 654
 655     return len;
 656 }
 657
 658 //
 659 // BASE64 encoding table
 660 //
 661 static const unsigned char utf7enb64[] =
 662 {
 663     'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
 664     'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
 665     'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
 666     'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
 667     'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
 668     'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
 669     'w', 'x', 'y', 'z', '0', '1', '2', '3',
 670     '4', '5', '6', '7', '8', '9', '+', '/'
 671 };
 672
 673 //
 674 // UTF-7 encoding table
 675 //
 676 // 0 - Set D (directly encoded characters)
 677 // 1 - Set O (optional direct characters)
 678 // 2 - whitespace characters (optional)
 679 // 3 - special characters
 680 //
 681 static const unsigned char utf7encode[128] =
 682 {
 683     0, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
 684     3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
 685     2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
 686     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
 687     1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 688     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
 689     1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 690     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
 691 };
 692
 693 static inline bool wxIsUTF7Direct(wchar_t wc)
 694 {
 695     return wc < 0x80 && utf7encode[wc] < 1;
 696 }
 697
 698 size_t wxMBConvUTF7::FromWChar(char *dst, size_t dstLen,
 699                                const wchar_t *src, size_t srcLen) const
 700 {
 701     EncoderState stateOrig,
 702                 *statePtr;
 703     if ( srcLen == wxNO_LEN )
 704     {
 705         // we don't apply the stored state when operating on entire strings at
 706         // once
 707         statePtr = &stateOrig;
 708
 709         srcLen = wxWcslen(src) + 1;
 710     }
 711     else // do use the mode we left the output in previously
 712     {
 713         stateOrig = m_stateEncoder;
 714         statePtr = wx_const_cast(EncoderState *, &m_stateEncoder);
 715     }
 716
 717     EncoderState& state = *statePtr;
 718
 719
 720     size_t len = 0;
 721
 722     const wchar_t * const srcEnd = src + srcLen;
 723     while ( src < srcEnd && (!dst || len < dstLen) )
 724     {
 725         wchar_t cc = *src++;
 726         if ( wxIsUTF7Direct(cc) )
 727         {
 728             if ( state.IsShifted() )
 729             {
 730                 // pad with zeros the last encoded block if necessary
 731                 if ( state.bit )
 732                 {
 733                     if ( dst )
 734                         *dst++ = utf7enb64[((state.accum % 16) << (6 - state.bit)) % 64];
 735                     len++;
 736                 }
 737
 738                 state.ToDirect();
 739
 740                 if ( dst )
 741                     *dst++ = '-';
 742                 len++;
 743             }
 744
 745             if ( dst )
 746                 *dst++ = (char)cc;
 747             len++;
 748         }
 749         else if ( cc == '+' && state.IsDirect() )
 750         {
 751             if ( dst )
 752             {
 753                 *dst++ = '+';
 754                 *dst++ = '-';
 755             }
 756
 757             len += 2;
 758         }
 759 #ifndef WC_UTF16
 760         else if (((wxUint32)cc) > 0xffff)
 761         {
 762             // no surrogate pair generation (yet?)
 763             return wxCONV_FAILED;
 764         }
 765 #endif
 766         else
 767         {
 768             if ( state.IsDirect() )
 769             {
 770                 state.ToShifted();
 771
 772                 if ( dst )
 773                     *dst++ = '+';
 774                 len++;
 775             }
 776
 777             // BASE64 encode string
 778             for ( ;; )
 779             {
 780                 for ( unsigned lsb = 0; lsb < 2; lsb++ )
 781                 {
 782                     state.accum <<= 8;
 783                     state.accum += lsb ? cc & 0xff : (cc & 0xff00) >> 8;
 784
 785                     for (state.bit += 8; state.bit >= 6; )
 786                     {
 787                         state.bit -= 6;
 788                         if ( dst )
 789                             *dst++ = utf7enb64[(state.accum >> state.bit) % 64];
 790                         len++;
 791                     }
 792                 }
 793
 794                 if ( src == srcEnd || wxIsUTF7Direct(cc = *src) )
 795                     break;
 796
 797                 src++;
 798             }
 799         }
 800     }
 801
 802     // we need to restore the original encoder state if we were called just to
 803     // calculate the amount of space needed as we will presumably be called
 804     // again to really convert the data now
 805     if ( !dst )
 806         state = stateOrig;
 807
 808     return len;
 809 }
 810
 811 // ----------------------------------------------------------------------------
 812 // UTF-8
 813 // ----------------------------------------------------------------------------
 814
 815 static const wxUint32 utf8_max[]=
 816     { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
 817
 818 // boundaries of the private use area we use to (temporarily) remap invalid
 819 // characters invalid in a UTF-8 encoded string
 820 const wxUint32 wxUnicodePUA = 0x100000;
 821 const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
 822
 823 // this table gives the length of the UTF-8 encoding from its first character:
 824 const unsigned char tableUtf8Lengths[256] = {
 825     // single-byte sequences (ASCII):
 826     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 00..0F
 827     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 10..1F
 828     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 20..2F
 829     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 30..3F
 830     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 40..4F
 831     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 50..5F
 832     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 60..6F
 833     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 70..7F
 834
 835     // these are invalid:
 836     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 80..8F
 837     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 90..9F
 838     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // A0..AF
 839     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // B0..BF
 840     0, 0,                                            // C0,C1
 841
 842     // two-byte sequences:
 843           2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // C2..CF
 844     2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // D0..DF
 845
 846     // three-byte sequences:
 847     3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,  // E0..EF
 848
 849     // four-byte sequences:
 850     4, 4, 4, 4, 4,                                   // F0..F4
 851
 852     // these are invalid again (5- or 6-byte
 853     // sequences and sequences for code points
 854     // above U+10FFFF, as restricted by RFC 3629):
 855                    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0   // F5..FF
 856 };
 857
 858 size_t
 859 wxMBConvStrictUTF8::ToWChar(wchar_t *dst, size_t dstLen,
 860                             const char *src, size_t srcLen) const
 861 {
 862     wchar_t *out = dstLen ? dst : NULL;
 863     size_t written = 0;
 864
 865     if ( srcLen == wxNO_LEN )
 866         srcLen = strlen(src) + 1;
 867
 868     for ( const char *p = src; ; p++ )
 869     {
 870         if ( !(srcLen == wxNO_LEN ? *p : srcLen) )
 871         {
 872             // all done successfully, just add the trailing NULL if we are not
 873             // using explicit length
 874             if ( srcLen == wxNO_LEN )
 875             {
 876                 if ( out )
 877                 {
 878                     if ( !dstLen )
 879                         break;
 880
 881                     *out = L'\0';
 882                 }
 883
 884                 written++;
 885             }
 886
 887             return written;
 888         }
 889
 890         if ( out && !dstLen-- )
 891             break;
 892
 893         wxUint32 code;
 894         unsigned char c = *p;
 895
 896         if ( c < 0x80 )
 897         {
 898             if ( srcLen == 0 ) // the test works for wxNO_LEN too
 899                 break;
 900
 901             if ( srcLen != wxNO_LEN )
 902                 srcLen--;
 903
 904             code = c;
 905         }
 906         else
 907         {
 908             unsigned len = tableUtf8Lengths[c];
 909             if ( !len )
 910                 break;
 911
 912             if ( srcLen < len ) // the test works for wxNO_LEN too
 913                 break;
 914
 915             if ( srcLen != wxNO_LEN )
 916                 srcLen -= len;
 917
 918             //   Char. number range   |        UTF-8 octet sequence
 919             //      (hexadecimal)     |              (binary)
 920             //  ----------------------+----------------------------------------
 921             //  0000 0000 - 0000 007F | 0xxxxxxx
 922             //  0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
 923             //  0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
 924             //  0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 925             //
 926             //  Code point value is stored in bits marked with 'x',
 927             //  lowest-order bit of the value on the right side in the diagram
 928             //  above.                                         (from RFC 3629)
 929
 930             // mask to extract lead byte's value ('x' bits above), by sequence
 931             // length:
 932             static const unsigned char leadValueMask[] = { 0x7F, 0x1F, 0x0F, 0x07 };
 933
 934             // mask and value of lead byte's most significant bits, by length:
 935             static const unsigned char leadMarkerMask[] = { 0x80, 0xE0, 0xF0, 0xF8 };
 936             static const unsigned char leadMarkerVal[] = { 0x00, 0xC0, 0xE0, 0xF0 };
 937
 938             len--; // it's more convenient to work with 0-based length here
 939
 940             // extract the lead byte's value bits:
 941             if ( (c & leadMarkerMask[len]) != leadMarkerVal[len] )
 942                 break;
 943
 944             code = c & leadValueMask[len];
 945
 946             // all remaining bytes, if any, are handled in the same way
 947             // regardless of sequence's length:
 948             for ( ; len; --len )
 949             {
 950                 c = *++p;
 951                 if ( (c & 0xC0) != 0x80 )
 952                     return wxCONV_FAILED;
 953
 954                 code <<= 6;
 955                 code |= c & 0x3F;
 956             }
 957         }
 958
 959 #ifdef WC_UTF16
 960         // cast is ok because wchar_t == wxUint16 if WC_UTF16
 961         if ( encode_utf16(code, (wxUint16 *)out) == 2 )
 962         {
 963             if ( out )
 964                 out++;
 965             written++;
 966         }
 967 #else // !WC_UTF16
 968         if ( out )
 969             *out = code;
 970 #endif // WC_UTF16/!WC_UTF16
 971
 972         if ( out )
 973             out++;
 974
 975         written++;
 976     }
 977
 978     return wxCONV_FAILED;
 979 }
 980
 981 size_t
 982 wxMBConvStrictUTF8::FromWChar(char *dst, size_t dstLen,
 983                               const wchar_t *src, size_t srcLen) const
 984 {
 985     char *out = dstLen ? dst : NULL;
 986     size_t written = 0;
 987
 988     for ( const wchar_t *wp = src; ; wp++ )
 989     {
 990         if ( !(srcLen == wxNO_LEN ? *wp : srcLen) )
 991         {
 992             // all done successfully, just add the trailing NULL if we are not
 993             // using explicit length
 994             if ( srcLen == wxNO_LEN )
 995             {
 996                 if ( out )
 997                 {
 998                     if ( !dstLen )
 999                         break;
1000
1001                     *out = '\0';
1002                 }
1003
1004                 written++;
1005             }
1006
1007             return written;
1008         }
1009
1010         if ( srcLen != wxNO_LEN )
1011             srcLen--;
1012
1013         wxUint32 code;
1014 #ifdef WC_UTF16
1015         // cast is ok for WC_UTF16
1016         if ( decode_utf16((const wxUint16 *)wp, code) == 2 )
1017         {
1018             // skip the next char too as we decoded a surrogate
1019             wp++;
1020         }
1021 #else // wchar_t is UTF-32
1022         code = *wp & 0x7fffffff;
1023 #endif
1024
1025         unsigned len;
1026         if ( code <= 0x7F )
1027         {
1028             len = 1;
1029             if ( out )
1030             {
1031                 if ( dstLen < len )
1032                     break;
1033
1034                 out[0] = (char)code;
1035             }
1036         }
1037         else if ( code <= 0x07FF )
1038         {
1039             len = 2;
1040             if ( out )
1041             {
1042                 if ( dstLen < len )
1043                     break;
1044
1045                 // NB: this line takes 6 least significant bits, encodes them as
1046                 // 10xxxxxx and discards them so that the next byte can be encoded:
1047                 out[1] = 0x80 | (code & 0x3F);  code >>= 6;
1048                 out[0] = 0xC0 | code;
1049             }
1050         }
1051         else if ( code < 0xFFFF )
1052         {
1053             len = 3;
1054             if ( out )
1055             {
1056                 if ( dstLen < len )
1057                     break;
1058
1059                 out[2] = 0x80 | (code & 0x3F);  code >>= 6;
1060                 out[1] = 0x80 | (code & 0x3F);  code >>= 6;
1061                 out[0] = 0xE0 | code;
1062             }
1063         }
1064         else if ( code <= 0x10FFFF )
1065         {
1066             len = 4;
1067             if ( out )
1068             {
1069                 if ( dstLen < len )
1070                     break;
1071
1072                 out[3] = 0x80 | (code & 0x3F);  code >>= 6;
1073                 out[2] = 0x80 | (code & 0x3F);  code >>= 6;
1074                 out[1] = 0x80 | (code & 0x3F);  code >>= 6;
1075                 out[0] = 0xF0 | code;
1076             }
1077         }
1078         else
1079         {
1080             wxFAIL_MSG( _T("trying to encode undefined Unicode character") );
1081             break;
1082         }
1083
1084         if ( out )
1085         {
1086             out += len;
1087             dstLen -= len;
1088         }
1089
1090         written += len;
1091     }
1092
1093     // we only get here if an error occurs during decoding
1094     return wxCONV_FAILED;
1095 }
1096
1097 size_t wxMBConvUTF8::ToWChar(wchar_t *buf, size_t n,
1098                              const char *psz, size_t srcLen) const
1099 {
1100     if ( m_options == MAP_INVALID_UTF8_NOT )
1101         return wxMBConvStrictUTF8::ToWChar(buf, n, psz, srcLen);
1102
1103     size_t len = 0;
1104
1105     while ((srcLen == wxNO_LEN ? *psz : srcLen--) && ((!buf) || (len < n)))
1106     {
1107         const char *opsz = psz;
1108         bool invalid = false;
1109         unsigned char cc = *psz++, fc = cc;
1110         unsigned cnt;
1111         for (cnt = 0; fc & 0x80; cnt++)
1112             fc <<= 1;
1113
1114         if (!cnt)
1115         {
1116             // plain ASCII char
1117             if (buf)
1118                 *buf++ = cc;
1119             len++;
1120
1121             // escape the escape character for octal escapes
1122             if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
1123                     && cc == '\\' && (!buf || len < n))
1124             {
1125                 if (buf)
1126                     *buf++ = cc;
1127                 len++;
1128             }
1129         }
1130         else
1131         {
1132             cnt--;
1133             if (!cnt)
1134             {
1135                 // invalid UTF-8 sequence
1136                 invalid = true;
1137             }
1138             else
1139             {
1140                 unsigned ocnt = cnt - 1;
1141                 wxUint32 res = cc & (0x3f >> cnt);
1142                 while (cnt--)
1143                 {
1144                     cc = *psz;
1145                     if ((cc & 0xC0) != 0x80)
1146                     {
1147                         // invalid UTF-8 sequence
1148                         invalid = true;
1149                         break;
1150                     }
1151
1152                     psz++;
1153                     res = (res << 6) | (cc & 0x3f);
1154                 }
1155
1156                 if (invalid || res <= utf8_max[ocnt])
1157                 {
1158                     // illegal UTF-8 encoding
1159                     invalid = true;
1160                 }
1161                 else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
1162                         res >= wxUnicodePUA && res < wxUnicodePUAEnd)
1163                 {
1164                     // if one of our PUA characters turns up externally
1165                     // it must also be treated as an illegal sequence
1166                     // (a bit like you have to escape an escape character)
1167                     invalid = true;
1168                 }
1169                 else
1170                 {
1171 #ifdef WC_UTF16
1172                     // cast is ok because wchar_t == wxUint16 if WC_UTF16
1173                     size_t pa = encode_utf16(res, (wxUint16 *)buf);
1174                     if (pa == wxCONV_FAILED)
1175                     {
1176                         invalid = true;
1177                     }
1178                     else
1179                     {
1180                         if (buf)
1181                             buf += pa;
1182                         len += pa;
1183                     }
1184 #else // !WC_UTF16
1185                     if (buf)
1186                         *buf++ = (wchar_t)res;
1187                     len++;
1188 #endif // WC_UTF16/!WC_UTF16
1189                 }
1190             }
1191
1192             if (invalid)
1193             {
1194                 if (m_options & MAP_INVALID_UTF8_TO_PUA)
1195                 {
1196                     while (opsz < psz && (!buf || len < n))
1197                     {
1198 #ifdef WC_UTF16
1199                         // cast is ok because wchar_t == wxUuint16 if WC_UTF16
1200                         size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
1201                         wxASSERT(pa != wxCONV_FAILED);
1202                         if (buf)
1203                             buf += pa;
1204                         opsz++;
1205                         len += pa;
1206 #else
1207                         if (buf)
1208                             *buf++ = (wchar_t)(wxUnicodePUA + (unsigned char)*opsz);
1209                         opsz++;
1210                         len++;
1211 #endif
1212                     }
1213                 }
1214                 else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
1215                 {
1216                     while (opsz < psz && (!buf || len < n))
1217                     {
1218                         if ( buf && len + 3 < n )
1219                         {
1220                             unsigned char on = *opsz;
1221                             *buf++ = L'\\';
1222                             *buf++ = (wchar_t)( L'0' + on / 0100 );
1223                             *buf++ = (wchar_t)( L'0' + (on % 0100) / 010 );
1224                             *buf++ = (wchar_t)( L'0' + on % 010 );
1225                         }
1226
1227                         opsz++;
1228                         len += 4;
1229                     }
1230                 }
1231                 else // MAP_INVALID_UTF8_NOT
1232                 {
1233                     return wxCONV_FAILED;
1234                 }
1235             }
1236         }
1237     }
1238
1239     if (srcLen == wxNO_LEN && buf && (len < n))
1240         *buf = 0;
1241
1242     return len + 1;
1243 }
1244
// Check whether the given wide character is an octal digit, i.e. lies in the
// range L'0' to L'7' inclusive.
static inline bool isoctal(wchar_t wch)
{
    if ( wch < L'0' )
        return false;

    return wch <= L'7';
}
1249
1250 size_t wxMBConvUTF8::FromWChar(char *buf, size_t n,
1251                                const wchar_t *psz, size_t srcLen) const
1252 {
1253     if ( m_options == MAP_INVALID_UTF8_NOT )
1254         return wxMBConvStrictUTF8::FromWChar(buf, n, psz, srcLen);
1255
1256     size_t len = 0;
1257
1258     while ((srcLen == wxNO_LEN ? *psz : srcLen--) && ((!buf) || (len < n)))
1259     {
1260         wxUint32 cc;
1261
1262 #ifdef WC_UTF16
1263         // cast is ok for WC_UTF16
1264         size_t pa = decode_utf16((const wxUint16 *)psz, cc);
1265         psz += (pa == wxCONV_FAILED) ? 1 : pa;
1266 #else
1267         cc = (*psz++) & 0x7fffffff;
1268 #endif
1269
1270         if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
1271                 && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
1272         {
1273             if (buf)
1274                 *buf++ = (char)(cc - wxUnicodePUA);
1275             len++;
1276         }
1277         else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL)
1278                     && cc == L'\\' && psz[0] == L'\\' )
1279         {
1280             if (buf)
1281                 *buf++ = (char)cc;
1282             psz++;
1283             len++;
1284         }
1285         else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
1286                     cc == L'\\' &&
1287                         isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
1288         {
1289             if (buf)
1290             {
1291                 *buf++ = (char) ((psz[0] - L'0') * 0100 +
1292                                  (psz[1] - L'0') * 010 +
1293                                  (psz[2] - L'0'));
1294             }
1295
1296             psz += 3;
1297             len++;
1298         }
1299         else
1300         {
1301             unsigned cnt;
1302             for (cnt = 0; cc > utf8_max[cnt]; cnt++)
1303             {
1304             }
1305
1306             if (!cnt)
1307             {
1308                 // plain ASCII char
1309                 if (buf)
1310                     *buf++ = (char) cc;
1311                 len++;
1312             }
1313             else
1314             {
1315                 len += cnt + 1;
1316                 if (buf)
1317                 {
1318                     *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
1319                     while (cnt--)
1320                         *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
1321                 }
1322             }
1323         }
1324     }
1325
1326     if (srcLen == wxNO_LEN && buf && (len < n))
1327         *buf = 0;
1328
1329     return len + 1;
1330 }
1331
1332 // ============================================================================
1333 // UTF-16
1334 // ============================================================================
1335
// The code below is written in terms of "straight" and "swap" converters:
// these macros map them to the big or little endian class depending on
// whether WORDS_BIGENDIAN is defined, so that "straight" is the class whose
// name matches the byte order selected by WORDS_BIGENDIAN and "swap" is the
// other one.
#ifdef WORDS_BIGENDIAN
    #define wxMBConvUTF16straight wxMBConvUTF16BE
    #define wxMBConvUTF16swap     wxMBConvUTF16LE
#else
    #define wxMBConvUTF16swap     wxMBConvUTF16BE
    #define wxMBConvUTF16straight wxMBConvUTF16LE
#endif
1343
1344 /* static */
1345 size_t wxMBConvUTF16Base::GetLength(const char *src, size_t srcLen)
1346 {
1347     if ( srcLen == wxNO_LEN )
1348     {
1349         // count the number of bytes in input, including the trailing NULs
1350         const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1351         for ( srcLen = 1; *inBuff++; srcLen++ )
1352             ;
1353
1354         srcLen *= BYTES_PER_CHAR;
1355     }
1356     else // we already have the length
1357     {
1358         // we can only convert an entire number of UTF-16 characters
1359         if ( srcLen % BYTES_PER_CHAR )
1360             return wxCONV_FAILED;
1361     }
1362
1363     return srcLen;
1364 }
1365
1366 // case when in-memory representation is UTF-16 too
1367 #ifdef WC_UTF16
1368
1369 // ----------------------------------------------------------------------------
1370 // conversions without endianness change
1371 // ----------------------------------------------------------------------------
1372
1373 size_t
1374 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1375                                const char *src, size_t srcLen) const
1376 {
1377     // set up the scene for using memcpy() (which is presumably more efficient
1378     // than copying the bytes one by one)
1379     srcLen = GetLength(src, srcLen);
1380     if ( srcLen == wxNO_LEN )
1381         return wxCONV_FAILED;
1382
1383     const size_t inLen = srcLen / BYTES_PER_CHAR;
1384     if ( dst )
1385     {
1386         if ( dstLen < inLen )
1387             return wxCONV_FAILED;
1388
1389         memcpy(dst, src, srcLen);
1390     }
1391
1392     return inLen;
1393 }
1394
1395 size_t
1396 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1397                                  const wchar_t *src, size_t srcLen) const
1398 {
1399     if ( srcLen == wxNO_LEN )
1400         srcLen = wxWcslen(src) + 1;
1401
1402     srcLen *= BYTES_PER_CHAR;
1403
1404     if ( dst )
1405     {
1406         if ( dstLen < srcLen )
1407             return wxCONV_FAILED;
1408
1409         memcpy(dst, src, srcLen);
1410     }
1411
1412     return srcLen;
1413 }
1414
1415 // ----------------------------------------------------------------------------
1416 // endian-reversing conversions
1417 // ----------------------------------------------------------------------------
1418
1419 size_t
1420 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1421                            const char *src, size_t srcLen) const
1422 {
1423     srcLen = GetLength(src, srcLen);
1424     if ( srcLen == wxNO_LEN )
1425         return wxCONV_FAILED;
1426
1427     srcLen /= BYTES_PER_CHAR;
1428
1429     if ( dst )
1430     {
1431         if ( dstLen < srcLen )
1432             return wxCONV_FAILED;
1433
1434         const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1435         for ( size_t n = 0; n < srcLen; n++, inBuff++ )
1436         {
1437             *dst++ = wxUINT16_SWAP_ALWAYS(*inBuff);
1438         }
1439     }
1440
1441     return srcLen;
1442 }
1443
1444 size_t
1445 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1446                              const wchar_t *src, size_t srcLen) const
1447 {
1448     if ( srcLen == wxNO_LEN )
1449         srcLen = wxWcslen(src) + 1;
1450
1451     srcLen *= BYTES_PER_CHAR;
1452
1453     if ( dst )
1454     {
1455         if ( dstLen < srcLen )
1456             return wxCONV_FAILED;
1457
1458         wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1459         for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
1460         {
1461             *outBuff++ = wxUINT16_SWAP_ALWAYS(*src);
1462         }
1463     }
1464
1465     return srcLen;
1466 }
1467
1468 #else // !WC_UTF16: wchar_t is UTF-32
1469
1470 // ----------------------------------------------------------------------------
1471 // conversions without endianness change
1472 // ----------------------------------------------------------------------------
1473
1474 size_t
1475 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1476                                const char *src, size_t srcLen) const
1477 {
1478     srcLen = GetLength(src, srcLen);
1479     if ( srcLen == wxNO_LEN )
1480         return wxCONV_FAILED;
1481
1482     const size_t inLen = srcLen / BYTES_PER_CHAR;
1483     if ( !dst )
1484     {
1485         // optimization: return maximal space which could be needed for this
1486         // string even if the real size could be smaller if the buffer contains
1487         // any surrogates
1488         return inLen;
1489     }
1490
1491     size_t outLen = 0;
1492     const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1493     for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1494     {
1495         const wxUint32 ch = wxDecodeSurrogate(&inBuff);
1496         if ( !inBuff )
1497             return wxCONV_FAILED;
1498
1499         if ( ++outLen > dstLen )
1500             return wxCONV_FAILED;
1501
1502         *dst++ = ch;
1503     }
1504
1505
1506     return outLen;
1507 }
1508
1509 size_t
1510 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1511                                  const wchar_t *src, size_t srcLen) const
1512 {
1513     if ( srcLen == wxNO_LEN )
1514         srcLen = wxWcslen(src) + 1;
1515
1516     size_t outLen = 0;
1517     wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1518     for ( size_t n = 0; n < srcLen; n++ )
1519     {
1520         wxUint16 cc[2];
1521         const size_t numChars = encode_utf16(*src++, cc);
1522         if ( numChars == wxCONV_FAILED )
1523             return wxCONV_FAILED;
1524
1525         outLen += numChars * BYTES_PER_CHAR;
1526         if ( outBuff )
1527         {
1528             if ( outLen > dstLen )
1529                 return wxCONV_FAILED;
1530
1531             *outBuff++ = cc[0];
1532             if ( numChars == 2 )
1533             {
1534                 // second character of a surrogate
1535                 *outBuff++ = cc[1];
1536             }
1537         }
1538     }
1539
1540     return outLen;
1541 }
1542
1543 // ----------------------------------------------------------------------------
1544 // endian-reversing conversions
1545 // ----------------------------------------------------------------------------
1546
1547 size_t
1548 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1549                            const char *src, size_t srcLen) const
1550 {
1551     srcLen = GetLength(src, srcLen);
1552     if ( srcLen == wxNO_LEN )
1553         return wxCONV_FAILED;
1554
1555     const size_t inLen = srcLen / BYTES_PER_CHAR;
1556     if ( !dst )
1557     {
1558         // optimization: return maximal space which could be needed for this
1559         // string even if the real size could be smaller if the buffer contains
1560         // any surrogates
1561         return inLen;
1562     }
1563
1564     size_t outLen = 0;
1565     const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1566     for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1567     {
1568         wxUint32 ch;
1569         wxUint16 tmp[2];
1570
1571         tmp[0] = wxUINT16_SWAP_ALWAYS(*inBuff);
1572         inBuff++;
1573         tmp[1] = wxUINT16_SWAP_ALWAYS(*inBuff);
1574
1575         const size_t numChars = decode_utf16(tmp, ch);
1576         if ( numChars == wxCONV_FAILED )
1577             return wxCONV_FAILED;
1578
1579         if ( numChars == 2 )
1580             inBuff++;
1581
1582         if ( ++outLen > dstLen )
1583             return wxCONV_FAILED;
1584
1585         *dst++ = ch;
1586     }
1587
1588
1589     return outLen;
1590 }
1591
1592 size_t
1593 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1594                              const wchar_t *src, size_t srcLen) const
1595 {
1596     if ( srcLen == wxNO_LEN )
1597         srcLen = wxWcslen(src) + 1;
1598
1599     size_t outLen = 0;
1600     wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1601     for ( const wchar_t *srcEnd = src + srcLen; src < srcEnd; src++ )
1602     {
1603         wxUint16 cc[2];
1604         const size_t numChars = encode_utf16(*src, cc);
1605         if ( numChars == wxCONV_FAILED )
1606             return wxCONV_FAILED;
1607
1608         outLen += numChars * BYTES_PER_CHAR;
1609         if ( outBuff )
1610         {
1611             if ( outLen > dstLen )
1612                 return wxCONV_FAILED;
1613
1614             *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[0]);
1615             if ( numChars == 2 )
1616             {
1617                 // second character of a surrogate
1618                 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[1]);
1619             }
1620         }
1621     }
1622
1623     return outLen;
1624 }
1625
1626 #endif // WC_UTF16/!WC_UTF16
1627
1628
1629 // ============================================================================
1630 // UTF-32
1631 // ============================================================================
1632
// The code below is written in terms of "straight" and "swap" converters:
// these macros map them to the big or little endian class depending on
// whether WORDS_BIGENDIAN is defined, so that "straight" is the class whose
// name matches the byte order selected by WORDS_BIGENDIAN and "swap" is the
// other one.
#ifdef WORDS_BIGENDIAN
    #define wxMBConvUTF32straight  wxMBConvUTF32BE
    #define wxMBConvUTF32swap      wxMBConvUTF32LE
#else
    #define wxMBConvUTF32swap      wxMBConvUTF32BE
    #define wxMBConvUTF32straight  wxMBConvUTF32LE
#endif


// the little and big endian UTF-32 converter objects
WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
1644
1645 /* static */
1646 size_t wxMBConvUTF32Base::GetLength(const char *src, size_t srcLen)
1647 {
1648     if ( srcLen == wxNO_LEN )
1649     {
1650         // count the number of bytes in input, including the trailing NULs
1651         const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1652         for ( srcLen = 1; *inBuff++; srcLen++ )
1653             ;
1654
1655         srcLen *= BYTES_PER_CHAR;
1656     }
1657     else // we already have the length
1658     {
1659         // we can only convert an entire number of UTF-32 characters
1660         if ( srcLen % BYTES_PER_CHAR )
1661             return wxCONV_FAILED;
1662     }
1663
1664     return srcLen;
1665 }
1666
1667 // case when in-memory representation is UTF-16
1668 #ifdef WC_UTF16
1669
1670 // ----------------------------------------------------------------------------
1671 // conversions without endianness change
1672 // ----------------------------------------------------------------------------
1673
1674 size_t
1675 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1676                                const char *src, size_t srcLen) const
1677 {
1678     srcLen = GetLength(src, srcLen);
1679     if ( srcLen == wxNO_LEN )
1680         return wxCONV_FAILED;
1681
1682     const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1683     const size_t inLen = srcLen / BYTES_PER_CHAR;
1684     size_t outLen = 0;
1685     for ( size_t n = 0; n < inLen; n++ )
1686     {
1687         wxUint16 cc[2];
1688         const size_t numChars = encode_utf16(*inBuff++, cc);
1689         if ( numChars == wxCONV_FAILED )
1690             return wxCONV_FAILED;
1691
1692         outLen += numChars;
1693         if ( dst )
1694         {
1695             if ( outLen > dstLen )
1696                 return wxCONV_FAILED;
1697
1698             *dst++ = cc[0];
1699             if ( numChars == 2 )
1700             {
1701                 // second character of a surrogate
1702                 *dst++ = cc[1];
1703             }
1704         }
1705     }
1706
1707     return outLen;
1708 }
1709
1710 size_t
1711 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1712                                  const wchar_t *src, size_t srcLen) const
1713 {
1714     if ( srcLen == wxNO_LEN )
1715         srcLen = wxWcslen(src) + 1;
1716
1717     if ( !dst )
1718     {
1719         // optimization: return maximal space which could be needed for this
1720         // string instead of the exact amount which could be less if there are
1721         // any surrogates in the input
1722         //
1723         // we consider that surrogates are rare enough to make it worthwhile to
1724         // avoid running the loop below at the cost of slightly extra memory
1725         // consumption
1726         return srcLen * BYTES_PER_CHAR;
1727     }
1728
1729     wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1730     size_t outLen = 0;
1731     for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1732     {
1733         const wxUint32 ch = wxDecodeSurrogate(&src);
1734         if ( !src )
1735             return wxCONV_FAILED;
1736
1737         outLen += BYTES_PER_CHAR;
1738
1739         if ( outLen > dstLen )
1740             return wxCONV_FAILED;
1741
1742         *outBuff++ = ch;
1743     }
1744
1745     return outLen;
1746 }
1747
1748 // ----------------------------------------------------------------------------
1749 // endian-reversing conversions
1750 // ----------------------------------------------------------------------------
1751
1752 size_t
1753 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1754                            const char *src, size_t srcLen) const
1755 {
1756     srcLen = GetLength(src, srcLen);
1757     if ( srcLen == wxNO_LEN )
1758         return wxCONV_FAILED;
1759
1760     const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1761     const size_t inLen = srcLen / BYTES_PER_CHAR;
1762     size_t outLen = 0;
1763     for ( size_t n = 0; n < inLen; n++, inBuff++ )
1764     {
1765         wxUint16 cc[2];
1766         const size_t numChars = encode_utf16(wxUINT32_SWAP_ALWAYS(*inBuff), cc);
1767         if ( numChars == wxCONV_FAILED )
1768             return wxCONV_FAILED;
1769
1770         outLen += numChars;
1771         if ( dst )
1772         {
1773             if ( outLen > dstLen )
1774                 return wxCONV_FAILED;
1775
1776             *dst++ = cc[0];
1777             if ( numChars == 2 )
1778             {
1779                 // second character of a surrogate
1780                 *dst++ = cc[1];
1781             }
1782         }
1783     }
1784
1785     return outLen;
1786 }
1787
1788 size_t
1789 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1790                              const wchar_t *src, size_t srcLen) const
1791 {
1792     if ( srcLen == wxNO_LEN )
1793         srcLen = wxWcslen(src) + 1;
1794
1795     if ( !dst )
1796     {
1797         // optimization: return maximal space which could be needed for this
1798         // string instead of the exact amount which could be less if there are
1799         // any surrogates in the input
1800         //
1801         // we consider that surrogates are rare enough to make it worthwhile to
1802         // avoid running the loop below at the cost of slightly extra memory
1803         // consumption
1804         return srcLen*BYTES_PER_CHAR;
1805     }
1806
1807     wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1808     size_t outLen = 0;
1809     for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1810     {
1811         const wxUint32 ch = wxDecodeSurrogate(&src);
1812         if ( !src )
1813             return wxCONV_FAILED;
1814
1815         outLen += BYTES_PER_CHAR;
1816
1817         if ( outLen > dstLen )
1818             return wxCONV_FAILED;
1819
1820         *outBuff++ = wxUINT32_SWAP_ALWAYS(ch);
1821     }
1822
1823     return outLen;
1824 }
1825
1826 #else // !WC_UTF16: wchar_t is UTF-32
1827
1828 // ----------------------------------------------------------------------------
1829 // conversions without endianness change
1830 // ----------------------------------------------------------------------------
1831
1832 size_t
1833 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1834                                const char *src, size_t srcLen) const
1835 {
1836     // use memcpy() as it should be much faster than hand-written loop
1837     srcLen = GetLength(src, srcLen);
1838     if ( srcLen == wxNO_LEN )
1839         return wxCONV_FAILED;
1840
1841     const size_t inLen = srcLen/BYTES_PER_CHAR;
1842     if ( dst )
1843     {
1844         if ( dstLen < inLen )
1845             return wxCONV_FAILED;
1846
1847         memcpy(dst, src, srcLen);
1848     }
1849
1850     return inLen;
1851 }
1852
1853 size_t
1854 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1855                                  const wchar_t *src, size_t srcLen) const
1856 {
1857     if ( srcLen == wxNO_LEN )
1858         srcLen = wxWcslen(src) + 1;
1859
1860     srcLen *= BYTES_PER_CHAR;
1861
1862     if ( dst )
1863     {
1864         if ( dstLen < srcLen )
1865             return wxCONV_FAILED;
1866
1867         memcpy(dst, src, srcLen);
1868     }
1869
1870     return srcLen;
1871 }
1872
1873 // ----------------------------------------------------------------------------
1874 // endian-reversing conversions
1875 // ----------------------------------------------------------------------------
1876
1877 size_t
1878 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1879                            const char *src, size_t srcLen) const
1880 {
1881     srcLen = GetLength(src, srcLen);
1882     if ( srcLen == wxNO_LEN )
1883         return wxCONV_FAILED;
1884
1885     srcLen /= BYTES_PER_CHAR;
1886
1887     if ( dst )
1888     {
1889         if ( dstLen < srcLen )
1890             return wxCONV_FAILED;
1891
1892         const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1893         for ( size_t n = 0; n < srcLen; n++, inBuff++ )
1894         {
1895             *dst++ = wxUINT32_SWAP_ALWAYS(*inBuff);
1896         }
1897     }
1898
1899     return srcLen;
1900 }
1901
1902 size_t
1903 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1904                              const wchar_t *src, size_t srcLen) const
1905 {
1906     if ( srcLen == wxNO_LEN )
1907         srcLen = wxWcslen(src) + 1;
1908
1909     srcLen *= BYTES_PER_CHAR;
1910
1911     if ( dst )
1912     {
1913         if ( dstLen < srcLen )
1914             return wxCONV_FAILED;
1915
1916         wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1917         for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
1918         {
1919             *outBuff++ = wxUINT32_SWAP_ALWAYS(*src);
1920         }
1921     }
1922
1923     return srcLen;
1924 }
1925
1926 #endif // WC_UTF16/!WC_UTF16
1927
1928
1929 // ============================================================================
1930 // The classes doing conversion using the iconv_xxx() functions
1931 // ============================================================================
1932
1933 #ifdef HAVE_ICONV
1934
1935 // VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
1936 //     E2BIG if output buffer is _exactly_ as big as needed. Such case is
1937 //     (unless there's yet another bug in glibc) the only case when iconv()
1938 //     returns with (size_t)-1 (which means error) and says there are 0 bytes
1939 //     left in the input buffer -- when _real_ error occurs,
1940 //     bytes-left-in-input buffer is non-zero. Hence, this alternative test for
1941 //     iconv() failure.
1942 //     [This bug does not appear in glibc 2.2.]
// ICONV_FAILED(cres, bufLeft): true if the iconv() call which returned cres
// and left bufLeft bytes of input unconsumed should be treated as an error.
// With glibc 2.0/2.1 an E2BIG error with no input left is not counted as a
// failure, otherwise any (size_t)-1 return value is.
#if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
#define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
                                     (errno != E2BIG || bufLeft != 0))
#else
#define ICONV_FAILED(cres, bufLeft)  (cres == (size_t)-1)
#endif

// cast the address of the input pointer to the type used for the second
// iconv() argument; ICONV_CONST is not defined in this file
// (NOTE(review): presumably provided by the build configuration -- confirm)
#define ICONV_CHAR_CAST(x)  ((ICONV_CONST char **)(x))

// value of an iconv_t which doesn't refer to an open conversion descriptor
#define ICONV_T_INVALID ((iconv_t)-1)

// WC_BSWAP reverses the byte order of one wchar_t and WC_ENC is the font
// encoding matching wchar_t, both depend on the wchar_t size
#if SIZEOF_WCHAR_T == 4
    #define WC_BSWAP    wxUINT32_SWAP_ALWAYS
    #define WC_ENC      wxFONTENCODING_UTF32
#elif SIZEOF_WCHAR_T == 2
    #define WC_BSWAP    wxUINT16_SWAP_ALWAYS
    #define WC_ENC      wxFONTENCODING_UTF16
#else // sizeof(wchar_t) != 2 nor 4
    // does this ever happen?
    #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
#endif
1964
1965 // ----------------------------------------------------------------------------
1966 // wxMBConv_iconv: encapsulates an iconv character set
1967 // ----------------------------------------------------------------------------
1968
1969 class wxMBConv_iconv : public wxMBConv
1970 {
1971 public:
1972     wxMBConv_iconv(const char *name);
1973     virtual ~wxMBConv_iconv();
1974
1975     // implement base class virtual methods
1976     virtual size_t ToWChar(wchar_t *dst, size_t dstLen,
1977                            const char *src, size_t srcLen = wxNO_LEN) const;
1978     virtual size_t FromWChar(char *dst, size_t dstLen,
1979                              const wchar_t *src, size_t srcLen = wxNO_LEN) const;
1980     virtual size_t GetMBNulLen() const;
1981
1982 #if wxUSE_UNICODE_UTF8
1983     virtual bool IsUTF8() const;
1984 #endif
1985
1986     virtual wxMBConv *Clone() const
1987     {
1988         wxMBConv_iconv *p = new wxMBConv_iconv(m_name.ToAscii());
1989         p->m_minMBCharWidth = m_minMBCharWidth;
1990         return p;
1991     }
1992
1993     bool IsOk() const
1994         { return (m2w != ICONV_T_INVALID) && (w2m != ICONV_T_INVALID); }
1995
1996 protected:
1997     // the iconv handlers used to translate from multibyte
1998     // to wide char and in the other direction
1999     iconv_t m2w,
2000             w2m;
2001
2002 #if wxUSE_THREADS
2003     // guards access to m2w and w2m objects
2004     wxMutex m_iconvMutex;
2005 #endif
2006
2007 private:
2008     // the name (for iconv_open()) of a wide char charset -- if none is
2009     // available on this machine, it will remain NULL
2010     static wxString ms_wcCharsetName;
2011
2012     // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
2013     // different endian-ness than the native one
2014     static bool ms_wcNeedsSwap;
2015
2016
2017     // name of the encoding handled by this conversion
2018     wxString m_name;
2019
2020     // cached result of GetMBNulLen(); set to 0 meaning "unknown"
2021     // initially
2022     size_t m_minMBCharWidth;
2023 };
2024
2025 // make the constructor available for unit testing
2026 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_iconv( const char* name )
2027 {
2028     wxMBConv_iconv* result = new wxMBConv_iconv( name );
2029     if ( !result->IsOk() )
2030     {
2031         delete result;
2032         return 0;
2033     }
2034
2035     return result;
2036 }
2037
// definitions of the static members shared by all wxMBConv_iconv objects:
// the charset name starts out empty and is filled in, together with the swap
// flag, by the constructor once it finds a usable wide char charset
wxString wxMBConv_iconv::ms_wcCharsetName;
bool wxMBConv_iconv::ms_wcNeedsSwap = false;
2040
2041 wxMBConv_iconv::wxMBConv_iconv(const char *name)
2042               : m_name(name)
2043 {
2044     m_minMBCharWidth = 0;
2045
2046     // check for charset that represents wchar_t:
2047     if ( ms_wcCharsetName.empty() )
2048     {
2049         wxLogTrace(TRACE_STRCONV, _T("Looking for wide char codeset:"));
2050
2051 #if wxUSE_FONTMAP
2052         const wxChar **names = wxFontMapperBase::GetAllEncodingNames(WC_ENC);
2053 #else // !wxUSE_FONTMAP
2054         static const wxChar *names_static[] =
2055         {
2056 #if SIZEOF_WCHAR_T == 4
2057             _T("UCS-4"),
2058 #elif SIZEOF_WCHAR_T = 2
2059             _T("UCS-2"),
2060 #endif
2061             NULL
2062         };
2063         const wxChar **names = names_static;
2064 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
2065
2066         for ( ; *names && ms_wcCharsetName.empty(); ++names )
2067         {
2068             const wxString nameCS(*names);
2069
2070             // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
2071             wxString nameXE(nameCS);
2072
2073 #ifdef WORDS_BIGENDIAN
2074                 nameXE += _T("BE");
2075 #else // little endian
2076                 nameXE += _T("LE");
2077 #endif
2078
2079             wxLogTrace(TRACE_STRCONV, _T("  trying charset \"%s\""),
2080                        nameXE.c_str());
2081
2082             m2w = iconv_open(nameXE.ToAscii(), name);
2083             if ( m2w == ICONV_T_INVALID )
2084             {
2085                 // try charset w/o bytesex info (e.g. "UCS4")
2086                 wxLogTrace(TRACE_STRCONV, _T("  trying charset \"%s\""),
2087                            nameCS.c_str());
2088                 m2w = iconv_open(nameCS.ToAscii(), name);
2089
2090                 // and check for bytesex ourselves:
2091                 if ( m2w != ICONV_T_INVALID )
2092                 {
2093                     char    buf[2], *bufPtr;
2094                     wchar_t wbuf[2];
2095                     size_t  insz, outsz;
2096                     size_t  res;
2097
2098                     buf[0] = 'A';
2099                     buf[1] = 0;
2100                     wbuf[0] = 0;
2101                     insz = 2;
2102                     outsz = SIZEOF_WCHAR_T * 2;
2103                     char* wbufPtr = (char*)wbuf;
2104                     bufPtr = buf;
2105
2106                     res = iconv(
2107                         m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
2108                         &wbufPtr, &outsz);
2109
2110                     if (ICONV_FAILED(res, insz))
2111                     {
2112                         wxLogLastError(wxT("iconv"));
2113                         wxLogError(_("Conversion to charset '%s' doesn't work."),
2114                                    nameCS.c_str());
2115                     }
2116                     else // ok, can convert to this encoding, remember it
2117                     {
2118                         ms_wcCharsetName = nameCS;
2119                         ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
2120                     }
2121                 }
2122             }
2123             else // use charset not requiring byte swapping
2124             {
2125                 ms_wcCharsetName = nameXE;
2126             }
2127         }
2128
2129         wxLogTrace(TRACE_STRCONV,
2130                    wxT("iconv wchar_t charset is \"%s\"%s"),
2131                    ms_wcCharsetName.empty() ? wxString("<none>")
2132                                             : ms_wcCharsetName,
2133                    ms_wcNeedsSwap ? _T(" (needs swap)")
2134                                   : _T(""));
2135     }
2136     else // we already have ms_wcCharsetName
2137     {
2138         m2w = iconv_open(ms_wcCharsetName.ToAscii(), name);
2139     }
2140
2141     if ( ms_wcCharsetName.empty() )
2142     {
2143         w2m = ICONV_T_INVALID;
2144     }
2145     else
2146     {
2147         w2m = iconv_open(name, ms_wcCharsetName.ToAscii());
2148         if ( w2m == ICONV_T_INVALID )
2149         {
2150             wxLogTrace(TRACE_STRCONV,
2151                        wxT("\"%s\" -> \"%s\" works but not the converse!?"),
2152                        ms_wcCharsetName.c_str(), name);
2153         }
2154     }
2155 }
2156
2157 wxMBConv_iconv::~wxMBConv_iconv()
2158 {
2159     if ( m2w != ICONV_T_INVALID )
2160         iconv_close(m2w);
2161     if ( w2m != ICONV_T_INVALID )
2162         iconv_close(w2m);
2163 }
2164
2165 size_t
2166 wxMBConv_iconv::ToWChar(wchar_t *dst, size_t dstLen,
2167                         const char *src, size_t srcLen) const
2168 {
2169     if ( srcLen == wxNO_LEN )
2170     {
2171         // find the string length: notice that must be done differently for
2172         // NUL-terminated strings and UTF-16/32 which are terminated with 2/4
2173         // consecutive NULs
2174         const size_t nulLen = GetMBNulLen();
2175         switch ( nulLen )
2176         {
2177             default:
2178                 return wxCONV_FAILED;
2179
2180             case 1:
2181                 srcLen = strlen(src); // arguably more optimized than our version
2182                 break;
2183
2184             case 2:
2185             case 4:
2186                 // for UTF-16/32 not only we need to have 2/4 consecutive NULs
2187                 // but they also have to start at character boundary and not
2188                 // span two adjacent characters
2189                 const char *p;
2190                 for ( p = src; NotAllNULs(p, nulLen); p += nulLen )
2191                     ;
2192                 srcLen = p - src;
2193                 break;
2194         }
2195
2196         // when we're determining the length of the string ourselves we count
2197         // the terminating NUL(s) as part of it and always NUL-terminate the
2198         // output
2199         srcLen += nulLen;
2200     }
2201
2202     // we express length in the number of (wide) characters but iconv always
2203     // counts buffer sizes it in bytes
2204     dstLen *= SIZEOF_WCHAR_T;
2205
2206 #if wxUSE_THREADS
2207     // NB: iconv() is MT-safe, but each thread must use its own iconv_t handle.
2208     //     Unfortunately there are a couple of global wxCSConv objects such as
2209     //     wxConvLocal that are used all over wx code, so we have to make sure
2210     //     the handle is used by at most one thread at the time. Otherwise
2211     //     only a few wx classes would be safe to use from non-main threads
2212     //     as MB<->WC conversion would fail "randomly".
2213     wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
2214 #endif // wxUSE_THREADS
2215
2216     size_t res, cres;
2217     const char *pszPtr = src;
2218
2219     if ( dst )
2220     {
2221         char* bufPtr = (char*)dst;
2222
2223         // have destination buffer, convert there
2224         size_t dstLenOrig = dstLen;
2225         cres = iconv(m2w,
2226                      ICONV_CHAR_CAST(&pszPtr), &srcLen,
2227                      &bufPtr, &dstLen);
2228
2229         // convert the number of bytes converted as returned by iconv to the
2230         // number of (wide) characters converted that we need
2231         res = (dstLenOrig - dstLen) / SIZEOF_WCHAR_T;
2232
2233         if (ms_wcNeedsSwap)
2234         {
2235             // convert to native endianness
2236             for ( unsigned i = 0; i < res; i++ )
2237                 dst[i] = WC_BSWAP(dst[i]);
2238         }
2239     }
2240     else // no destination buffer
2241     {
2242         // convert using temp buffer to calculate the size of the buffer needed
2243         wchar_t tbuf[8];
2244         res = 0;
2245
2246         do
2247         {
2248             char* bufPtr = (char*)tbuf;
2249             dstLen = 8 * SIZEOF_WCHAR_T;
2250
2251             cres = iconv(m2w,
2252                          ICONV_CHAR_CAST(&pszPtr), &srcLen,
2253                          &bufPtr, &dstLen );
2254
2255             res += 8 - (dstLen / SIZEOF_WCHAR_T);
2256         }
2257         while ((cres == (size_t)-1) && (errno == E2BIG));
2258     }
2259
2260     if (ICONV_FAILED(cres, srcLen))
2261     {
2262         //VS: it is ok if iconv fails, hence trace only
2263         wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
2264         return wxCONV_FAILED;
2265     }
2266
2267     return res;
2268 }
2269
2270 size_t wxMBConv_iconv::FromWChar(char *dst, size_t dstLen,
2271                                  const wchar_t *src, size_t srcLen) const
2272 {
2273 #if wxUSE_THREADS
2274     // NB: explained in MB2WC
2275     wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
2276 #endif
2277
2278     if ( srcLen == wxNO_LEN )
2279         srcLen = wxWcslen(src) + 1;
2280
2281     size_t inbuflen = srcLen * SIZEOF_WCHAR_T;
2282     size_t outbuflen = dstLen;
2283     size_t res, cres;
2284
2285     wchar_t *tmpbuf = 0;
2286
2287     if (ms_wcNeedsSwap)
2288     {
2289         // need to copy to temp buffer to switch endianness
2290         // (doing WC_BSWAP twice on the original buffer won't help, as it
2291         //  could be in read-only memory, or be accessed in some other thread)
2292         tmpbuf = (wchar_t *)malloc(inbuflen + SIZEOF_WCHAR_T);
2293         for ( size_t i = 0; i < srcLen; i++ )
2294             tmpbuf[i] = WC_BSWAP(src[i]);
2295
2296         tmpbuf[srcLen] = L'\0';
2297         src = tmpbuf;
2298     }
2299
2300     char* inbuf = (char*)src;
2301     if ( dst )
2302     {
2303         // have destination buffer, convert there
2304         cres = iconv(w2m, ICONV_CHAR_CAST(&inbuf), &inbuflen, &dst, &outbuflen);
2305
2306         res = dstLen - outbuflen;
2307     }
2308     else // no destination buffer
2309     {
2310         // convert using temp buffer to calculate the size of the buffer needed
2311         char tbuf[16];
2312         res = 0;
2313         do
2314         {
2315             dst = tbuf;
2316             outbuflen = 16;
2317
2318             cres = iconv(w2m, ICONV_CHAR_CAST(&inbuf), &inbuflen, &dst, &outbuflen);
2319
2320             res += 16 - outbuflen;
2321         }
2322         while ((cres == (size_t)-1) && (errno == E2BIG));
2323     }
2324
2325     if (ms_wcNeedsSwap)
2326     {
2327         free(tmpbuf);
2328     }
2329
2330     if (ICONV_FAILED(cres, inbuflen))
2331     {
2332         wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
2333         return wxCONV_FAILED;
2334     }
2335
2336     return res;
2337 }
2338
2339 size_t wxMBConv_iconv::GetMBNulLen() const
2340 {
2341     if ( m_minMBCharWidth == 0 )
2342     {
2343         wxMBConv_iconv * const self = wxConstCast(this, wxMBConv_iconv);
2344
2345 #if wxUSE_THREADS
2346         // NB: explained in MB2WC
2347         wxMutexLocker lock(self->m_iconvMutex);
2348 #endif
2349
2350         const wchar_t *wnul = L"";
2351         char buf[8]; // should be enough for NUL in any encoding
2352         size_t inLen = sizeof(wchar_t),
2353                outLen = WXSIZEOF(buf);
2354         char *inBuff = (char *)wnul;
2355         char *outBuff = buf;
2356         if ( iconv(w2m, ICONV_CHAR_CAST(&inBuff), &inLen, &outBuff, &outLen) == (size_t)-1 )
2357         {
2358             self->m_minMBCharWidth = (size_t)-1;
2359         }
2360         else // ok
2361         {
2362             self->m_minMBCharWidth = outBuff - buf;
2363         }
2364     }
2365
2366     return m_minMBCharWidth;
2367 }
2368
#if wxUSE_UNICODE_UTF8
// Check, ignoring case, whether this conversion was created for one of the
// two spellings of the UTF-8 charset name.
bool wxMBConv_iconv::IsUTF8() const
{
    if ( wxStricmp(m_name, "UTF-8") == 0 )
        return true;

    return wxStricmp(m_name, "UTF8") == 0;
}
#endif
2376
2377 #endif // HAVE_ICONV
2378
2379
2380 // ============================================================================
2381 // Win32 conversion classes
2382 // ============================================================================
2383
2384 #ifdef wxHAVE_WIN32_MB2WC
2385
2386 // from utils.cpp
2387 #if wxUSE_FONTMAP
2388 extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const char *charset);
2389 extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
2390 #endif
2391
// Conversion between a Windows code page and wchar_t implemented using
// ::MultiByteToWideChar() and ::WideCharToMultiByte().
//
// As these functions can silently produce approximate results, both
// directions either ask the API to report invalid/unrepresentable characters
// or, when this isn't possible, verify the result by converting it back and
// comparing it with the input.
class wxMBConv_win32 : public wxMBConv
{
public:
    // use the ANSI code page (CP_ACP)
    wxMBConv_win32()
    {
        m_CodePage = CP_ACP;
        m_minMBCharWidth = 0;
    }

    // copy the code page and the cached NUL length of another converter
    wxMBConv_win32(const wxMBConv_win32& conv)
        : wxMBConv()
    {
        m_CodePage = conv.m_CodePage;
        m_minMBCharWidth = conv.m_minMBCharWidth;
    }

#if wxUSE_FONTMAP
    // use the code page returned by wxCharsetToCodepage() for this charset
    // name, check IsOk() to see if it is usable
    wxMBConv_win32(const char* name)
    {
        m_CodePage = wxCharsetToCodepage(name);
        m_minMBCharWidth = 0;
    }

    // use the code page returned by wxEncodingToCodepage() for this
    // encoding, check IsOk() to see if it is usable
    wxMBConv_win32(wxFontEncoding encoding)
    {
        m_CodePage = wxEncodingToCodepage(encoding);
        m_minMBCharWidth = 0;
    }
#endif // wxUSE_FONTMAP

    // Convert the NUL-terminated multibyte string psz to wide characters.
    //
    // buf may be NULL to only compute the length, n is the size of buf in
    // wide characters. Returns the length of the result not counting the
    // terminating NUL or wxCONV_FAILED on error, including when the
    // conversion is detected to be lossy.
    virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
    {
        // note that we have to use MB_ERR_INVALID_CHARS flag as without it
        // the behaviour is not compatible with the Unix version (using iconv)
        // and break the library itself, e.g. wxTextInputStream::NextChar()
        // wouldn't work if reading an incomplete MB char didn't result in an
        // error
        //
        // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
        // Win XP or newer and it is not supported for UTF-[78] so we always
        // use our own conversions in this case. See
        //     http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
        //     http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
        if ( m_CodePage == CP_UTF8 )
        {
            return wxMBConvUTF8().MB2WC(buf, psz, n);
        }

        if ( m_CodePage == CP_UTF7 )
        {
            return wxMBConvUTF7().MB2WC(buf, psz, n);
        }

        // only ask for strict error checking for the code pages and on the
        // systems where it can be used
        int flags = 0;
        if ( (m_CodePage < 50000 && m_CodePage != CP_SYMBOL) &&
                IsAtLeastWin2kSP4() )
        {
            flags = MB_ERR_INVALID_CHARS;
        }

        const size_t len = ::MultiByteToWideChar
                             (
                                m_CodePage,     // code page
                                flags,          // flags: fail on error
                                psz,            // input string
                                -1,             // its length (NUL-terminated)
                                buf,            // output string
                                buf ? n : 0     // size of output buffer
                             );
        if ( !len )
        {
            // function totally failed
            return wxCONV_FAILED;
        }

        // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
        // check if we succeeded, by doing a double trip:
        if ( !flags && buf )
        {
            const size_t mbLen = strlen(psz);
            wxCharBuffer mbBuf(mbLen);
            if ( ::WideCharToMultiByte
                   (
                      m_CodePage,
                      0,
                      buf,
                      -1,
                      mbBuf.data(),
                      mbLen + 1,        // size in bytes, not length
                      NULL,
                      NULL
                   ) == 0 ||
                  strcmp(mbBuf, psz) != 0 )
            {
                // we didn't obtain the same thing we started from, hence
                // the conversion was lossy and we consider that it failed
                return wxCONV_FAILED;
            }
        }

        // note that it returns count of written chars for buf != NULL and size
        // of the needed buffer for buf == NULL so in either case the length of
        // the string (which never includes the terminating NUL) is one less
        return len - 1;
    }

    // Convert the NUL-terminated wide string pwz to this code page.
    //
    // buf may be NULL to only compute the length, n is the size of buf in
    // bytes. Returns the length of the result not counting the terminating
    // NUL or wxCONV_FAILED on error, including when some characters couldn't
    // be represented exactly.
    virtual size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
    {
        /*
            we have a problem here: by default, WideCharToMultiByte() may
            replace characters unrepresentable in the target code page with bad
            quality approximations such as turning "1/2" symbol (U+00BD) into
            "1" for the code pages which don't have it and we, obviously, want
            to avoid this at any price

            the trouble is that this function does it _silently_, i.e. it won't
            even tell us whether it did or not... Win98/2000 and higher provide
            WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
            we have to resort to a round trip, i.e. check that converting back
            results in the same string -- this is, of course, expensive but
            otherwise we simply can't be sure to not garble the data.
         */

        // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
        // it doesn't work with CJK encodings (which we test for rather roughly
        // here...) nor with UTF-7/8 nor, of course, with Windows versions not
        // supporting it
        BOOL usedDef wxDUMMY_INITIALIZE(false);
        BOOL *pUsedDef;
        int flags;
        if ( CanUseNoBestFit() && m_CodePage < 50000 )
        {
            // it's our lucky day
            flags = WC_NO_BEST_FIT_CHARS;
            pUsedDef = &usedDef;
        }
        else // old system or unsupported encoding
        {
            flags = 0;
            pUsedDef = NULL;
        }

        const size_t len = ::WideCharToMultiByte
                             (
                                m_CodePage,     // code page
                                flags,          // either none or no best fit
                                pwz,            // input string
                                -1,             // it is (wide) NUL-terminated
                                buf,            // output buffer
                                buf ? n : 0,    // and its size
                                NULL,           // default "replacement" char
                                pUsedDef        // [out] was it used?
                             );

        if ( !len )
        {
            // function totally failed
            return wxCONV_FAILED;
        }

        // we did something, check if we really succeeded
        if ( flags )
        {
            // check if the conversion failed, i.e. if any replacements
            // were done
            if ( usedDef )
                return wxCONV_FAILED;
        }
        else // we must resort to double tripping...
        {
            // first we need to ensure that we really have the MB data: this is
            // not the case if we're called with NULL buffer, in which case we
            // need to do the conversion yet again
            wxCharBuffer bufDef;
            if ( !buf )
            {
                bufDef = wxCharBuffer(len);
                buf = bufDef.data();
                if ( !::WideCharToMultiByte(m_CodePage, flags, pwz, -1,
                                            buf, len, NULL, NULL) )
                    return wxCONV_FAILED;
            }

            // convert the result back and compare it with what we started
            // from, using the input length for the buffer size if the caller
            // didn't give us any
            if ( !n )
                n = wcslen(pwz);
            wxWCharBuffer wcBuf(n);
            if ( MB2WC(wcBuf.data(), buf, n + 1) == wxCONV_FAILED ||
                    wcscmp(wcBuf, pwz) != 0 )
            {
                // we didn't obtain the same thing we started from, hence
                // the conversion was lossy and we consider that it failed
                return wxCONV_FAILED;
            }
        }

        // see the comment above for the reason of "len - 1"
        return len - 1;
    }

    // Return the number of bytes used to encode NUL in this code page (1, 2
    // or 4) or (size_t)-1 if it couldn't be determined; the result is cached
    // in m_minMBCharWidth after the first call.
    virtual size_t GetMBNulLen() const
    {
        if ( m_minMBCharWidth == 0 )
        {
            int len = ::WideCharToMultiByte
                        (
                            m_CodePage,     // code page
                            0,              // no flags
                            L"",            // input string
                            1,              // translate just the NUL
                            NULL,           // output buffer
                            0,              // and its size
                            NULL,           // no replacement char
                            NULL            // [out] don't care if it was used
                        );

            wxMBConv_win32 * const self = wxConstCast(this, wxMBConv_win32);
            switch ( len )
            {
                default:
                    wxLogDebug(_T("Unexpected NUL length %d"), len);
                    self->m_minMBCharWidth = (size_t)-1;
                    break;

                case 0:
                    // the conversion itself failed
                    self->m_minMBCharWidth = (size_t)-1;
                    break;

                case 1:
                case 2:
                case 4:
                    self->m_minMBCharWidth = len;
                    break;
            }
        }

        return m_minMBCharWidth;
    }

    virtual wxMBConv *Clone() const { return new wxMBConv_win32(*this); }

    // -1 is the code page value treated as invalid by this class
    bool IsOk() const { return m_CodePage != -1; }

private:
    // Return true if WC_NO_BEST_FIT_CHARS can be used on this system; the
    // OS version check is only done once and its result is remembered.
    static bool CanUseNoBestFit()
    {
        // -1 means "not checked yet", otherwise it's 0 or 1
        static int s_isWin98Or2k = -1;

        if ( s_isWin98Or2k == -1 )
        {
            int verMaj, verMin;
            switch ( wxGetOsVersion(&verMaj, &verMin) )
            {
                case wxOS_WINDOWS_9X:
                    s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
                    break;

                case wxOS_WINDOWS_NT:
                    s_isWin98Or2k = verMaj >= 5;
                    break;

                default:
                    // unknown: be conservative by default
                    s_isWin98Or2k = 0;
                    break;
            }

            wxASSERT_MSG( s_isWin98Or2k != -1, _T("should be set above") );
        }

        return s_isWin98Or2k == 1;
    }

    // Return true if the system is Windows 2000 SP4 or later, as required
    // for using MB_ERR_INVALID_CHARS in MB2WC(); always false under Windows
    // CE. The version check is only done once.
    static bool IsAtLeastWin2kSP4()
    {
#ifdef __WXWINCE__
        return false;
#else
        // -1 means "not checked yet", otherwise it's 0 or 1
        static int s_isAtLeastWin2kSP4 = -1;

        if ( s_isAtLeastWin2kSP4 == -1 )
        {
            OSVERSIONINFOEX ver;

            memset(&ver, 0, sizeof(ver));
            ver.dwOSVersionInfoSize = sizeof(ver);
            GetVersionEx((OSVERSIONINFO*)&ver);

            s_isAtLeastWin2kSP4 =
              ((ver.dwMajorVersion > 5) || // Vista+
               (ver.dwMajorVersion == 5 && ver.dwMinorVersion > 0) || // XP/2003
               (ver.dwMajorVersion == 5 && ver.dwMinorVersion == 0 &&
               ver.wServicePackMajor >= 4)) // 2000 SP4+
              ? 1 : 0;
        }

        return s_isAtLeastWin2kSP4 == 1;
#endif
    }


    // the code page we're working with
    long m_CodePage;

    // cached result of GetMBNulLen(), set to 0 initially meaning
    // "unknown"
    size_t m_minMBCharWidth;
};
2699
2700 #endif // wxHAVE_WIN32_MB2WC
2701
2702
2703 // ============================================================================
2704 // wxEncodingConverter based conversion classes
2705 // ============================================================================
2706
2707 #if wxUSE_FONTMAP
2708
2709 class wxMBConv_wxwin : public wxMBConv
2710 {
2711 private:
2712     void Init()
2713     {
2714         // Refuse to use broken wxEncodingConverter code for Mac-specific encodings.
2715         // The wxMBConv_cf class does a better job.
2716         m_ok = (m_enc < wxFONTENCODING_MACMIN || m_enc > wxFONTENCODING_MACMAX) &&
2717                m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
2718                w2m.Init(wxFONTENCODING_UNICODE, m_enc);
2719     }
2720
2721 public:
2722     // temporarily just use wxEncodingConverter stuff,
2723     // so that it works while a better implementation is built
2724     wxMBConv_wxwin(const char* name)
2725     {
2726         if (name)
2727             m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
2728         else
2729             m_enc = wxFONTENCODING_SYSTEM;
2730
2731         Init();
2732     }
2733
2734     wxMBConv_wxwin(wxFontEncoding enc)
2735     {
2736         m_enc = enc;
2737
2738         Init();
2739     }
2740
2741     size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
2742     {
2743         size_t inbuf = strlen(psz);
2744         if (buf)
2745         {
2746             if (!m2w.Convert(psz, buf))
2747                 return wxCONV_FAILED;
2748         }
2749         return inbuf;
2750     }
2751
2752     size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
2753     {
2754         const size_t inbuf = wxWcslen(psz);
2755         if (buf)
2756         {
2757             if (!w2m.Convert(psz, buf))
2758                 return wxCONV_FAILED;
2759         }
2760
2761         return inbuf;
2762     }
2763
2764     virtual size_t GetMBNulLen() const
2765     {
2766         switch ( m_enc )
2767         {
2768             case wxFONTENCODING_UTF16BE:
2769             case wxFONTENCODING_UTF16LE:
2770                 return 2;
2771
2772             case wxFONTENCODING_UTF32BE:
2773             case wxFONTENCODING_UTF32LE:
2774                 return 4;
2775
2776             default:
2777                 return 1;
2778         }
2779     }
2780
2781     virtual wxMBConv *Clone() const { return new wxMBConv_wxwin(m_enc); }
2782
2783     bool IsOk() const { return m_ok; }
2784
2785 public:
2786     wxFontEncoding m_enc;
2787     wxEncodingConverter m2w, w2m;
2788
2789 private:
2790     // were we initialized successfully?
2791     bool m_ok;
2792
2793     DECLARE_NO_COPY_CLASS(wxMBConv_wxwin)
2794 };
2795
2796 // make the constructors available for unit testing
2797 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_wxwin( const char* name )
2798 {
2799     wxMBConv_wxwin* result = new wxMBConv_wxwin( name );
2800     if ( !result->IsOk() )
2801     {
2802         delete result;
2803         return 0;
2804     }
2805
2806     return result;
2807 }
2808
2809 #endif // wxUSE_FONTMAP
2810
2811 // ============================================================================
2812 // wxCSConv implementation
2813 // ============================================================================
2814
2815 void wxCSConv::Init()
2816 {
2817     m_name = NULL;
2818     m_convReal =  NULL;
2819     m_deferred = true;
2820 }
2821
2822 wxCSConv::wxCSConv(const wxString& charset)
2823 {
2824     Init();
2825
2826     if ( !charset.empty() )
2827     {
2828         SetName(charset.ToAscii());
2829     }
2830
2831 #if wxUSE_FONTMAP
2832     m_encoding = wxFontMapperBase::GetEncodingFromName(charset);
2833 #else
2834     m_encoding = wxFONTENCODING_SYSTEM;
2835 #endif
2836 }
2837
2838 wxCSConv::wxCSConv(wxFontEncoding encoding)
2839 {
2840     if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
2841     {
2842         wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
2843
2844         encoding = wxFONTENCODING_SYSTEM;
2845     }
2846
2847     Init();
2848
2849     m_encoding = encoding;
2850 }
2851
2852 wxCSConv::~wxCSConv()
2853 {
2854     Clear();
2855 }
2856
2857 wxCSConv::wxCSConv(const wxCSConv& conv)
2858         : wxMBConv()
2859 {
2860     Init();
2861
2862     SetName(conv.m_name);
2863     m_encoding = conv.m_encoding;
2864 }
2865
2866 wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
2867 {
2868     Clear();
2869
2870     SetName(conv.m_name);
2871     m_encoding = conv.m_encoding;
2872
2873     return *this;
2874 }
2875
2876 void wxCSConv::Clear()
2877 {
2878     free(m_name);
2879     delete m_convReal;
2880
2881     m_name = NULL;
2882     m_convReal = NULL;
2883 }
2884
2885 void wxCSConv::SetName(const char *charset)
2886 {
2887     if (charset)
2888     {
2889         m_name = wxStrdup(charset);
2890         m_deferred = true;
2891     }
2892 }
2893
2894 #if wxUSE_FONTMAP
2895
// Hash map type associating a charset name with a font encoding.
2896 WX_DECLARE_HASH_MAP( wxFontEncoding, wxString, wxIntegerHash, wxIntegerEqual,
2897                      wxEncodingNameCache );
2898
// Cache used by wxCSConv::DoCreate(): for each encoding it remembers the name
// with which a wxMBConv_iconv object could be successfully created or an
// empty string if none of the names of this encoding worked.
2899 static wxEncodingNameCache gs_nameCache;
2900 #endif
2901
// Try to create the object which will really perform the conversions for
// m_name and/or m_encoding.
//
// Returns a newly allocated converter or NULL. NULL is returned both when no
// conversion object is needed (wxFONTENCODING_ISO8859_1 and
// wxFONTENCODING_DEFAULT, for which wxCSConv does the conversion itself) and
// when no converter could be created; an error is logged if all the methods
// below were tried without success.
2902 wxMBConv *wxCSConv::DoCreate() const
2903 {
2904 #if wxUSE_FONTMAP
2905     wxLogTrace(TRACE_STRCONV,
2906                wxT("creating conversion for %s"),
2907                (m_name ? m_name
2908                        : (const char*)wxFontMapperBase::GetEncodingName(m_encoding).mb_str()));
2909 #endif // wxUSE_FONTMAP
2910
2911     // check for the special case of ASCII or ISO8859-1 charset: as we have
2912     // special knowledge of it anyhow, we don't need to create a special
2913     // conversion object
2914     if ( m_encoding == wxFONTENCODING_ISO8859_1 ||
2915             m_encoding == wxFONTENCODING_DEFAULT )
2916     {
2917         // don't convert at all
2918         return NULL;
2919     }
2920
2921     // we trust OS to do conversion better than we can so try external
2922     // conversion methods first
2923     //
2924     // the full order is:
2925     //      1. OS conversion (iconv() under Unix or Win32 API)
2926     //      2. hard coded conversions for UTF
2927     //      3. wxEncodingConverter as fall back
2928
2929     // step (1)
2930 #ifdef HAVE_ICONV
2931 #if !wxUSE_FONTMAP
    // without wxFontMapper only the name can be used with iconv
2932     if ( m_name )
2933 #endif // !wxUSE_FONTMAP
2934     {
2935 #if wxUSE_FONTMAP
2936         wxFontEncoding encoding(m_encoding);
2937 #endif
2938
        // first try the name exactly as it was given to us
2939         if ( m_name )
2940         {
2941             wxMBConv_iconv *conv = new wxMBConv_iconv(m_name);
2942             if ( conv->IsOk() )
2943                 return conv;
2944
2945             delete conv;
2946
2947 #if wxUSE_FONTMAP
            // the name didn't work, find the encoding corresponding to it
            // to be able to try its other names below
2948             encoding =
2949                 wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
2950 #endif // wxUSE_FONTMAP
2951         }
2952 #if wxUSE_FONTMAP
2953         {
            // check if we already know which name works for this encoding
2954             const wxEncodingNameCache::iterator it = gs_nameCache.find(encoding);
2955             if ( it != gs_nameCache.end() )
2956             {
                // an empty name means that iconv failed for all the names
                // of this encoding before
                //
                // NOTE(review): returning here skips all the remaining
                // methods (including steps 2 and 3 below) although they are
                // tried when the iconv failure is recorded in the cache for
                // the first time -- confirm that this is intended
2957                 if ( it->second.empty() )
2958                     return NULL;
2959
2960                 wxMBConv_iconv *conv = new wxMBConv_iconv(it->second.ToAscii());
2961                 if ( conv->IsOk() )
2962                     return conv;
2963
2964                 delete conv;
2965             }
2966
2967             const wxChar** names = wxFontMapperBase::GetAllEncodingNames(encoding);
2968             // CS : in case this does not return valid names (eg for MacRoman)
2969             // encoding got a 'failure' entry in the cache all the same,
2970             // although it just has to be created using a different method, so
2971             // only store failed iconv creation attempts (or perhaps we
2972             // shouldn't do this at all ?)
2973             if ( names[0] != NULL )
2974             {
                // try all the names in order and remember the first one
                // which works
2975                 for ( ; *names; ++names )
2976                 {
2977                     // FIXME-UTF8: wxFontMapperBase::GetAllEncodingNames()
2978                     //             will need changes that will obsolete this
2979                     wxString name(*names);
2980                     wxMBConv_iconv *conv = new wxMBConv_iconv(name.ToAscii());
2981                     if ( conv->IsOk() )
2982                     {
2983                         gs_nameCache[encoding] = *names;
2984                         return conv;
2985                     }
2986
2987                     delete conv;
2988                 }
2989
2990                 gs_nameCache[encoding] = _T(""); // cache the failure
2991             }
2992         }
2993 #endif // wxUSE_FONTMAP
2994     }
2995 #endif // HAVE_ICONV
2996
2997 #ifdef wxHAVE_WIN32_MB2WC
2998     {
2999 #if wxUSE_FONTMAP
3000         wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
3001                                       : new wxMBConv_win32(m_encoding);
3002         if ( conv->IsOk() )
3003             return conv;
3004
3005         delete conv;
3006 #else
        // NOTE(review): in this configuration the function always returns
        // here, so the code below is never reached
3007         return NULL;
3008 #endif
3009     }
3010 #endif // wxHAVE_WIN32_MB2WC
3011
3012 #ifdef __DARWIN__
3013     {
3014         // leave UTF16 and UTF32 to the built-ins of wx
3015         if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
3016             ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
3017         {
3018 #if wxUSE_FONTMAP
3019             wxMBConv_cf *conv = m_name ? new wxMBConv_cf(m_name)
3020                                           : new wxMBConv_cf(m_encoding);
3021 #else
3022             wxMBConv_cf *conv = new wxMBConv_cf(m_encoding);
3023 #endif
3024
3025             if ( conv->IsOk() )
3026                  return conv;
3027
3028             delete conv;
3029         }
3030     }
3031 #endif // __DARWIN__
3032
3033     // step (2)
3034     wxFontEncoding enc = m_encoding;
3035 #if wxUSE_FONTMAP
    // if only the name is known, find the encoding corresponding to it
3036     if ( enc == wxFONTENCODING_SYSTEM && m_name )
3037     {
3038         // use "false" to suppress interactive dialogs -- we can be called from
3039         // anywhere and popping up a dialog from here is the last thing we want to
3040         // do
3041         enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
3042     }
3043 #endif // wxUSE_FONTMAP
3044
3045     switch ( enc )
3046     {
3047         case wxFONTENCODING_UTF7:
3048              return new wxMBConvUTF7;
3049
3050         case wxFONTENCODING_UTF8:
3051              return new wxMBConvUTF8;
3052
3053         case wxFONTENCODING_UTF16BE:
3054              return new wxMBConvUTF16BE;
3055
3056         case wxFONTENCODING_UTF16LE:
3057              return new wxMBConvUTF16LE;
3058
3059         case wxFONTENCODING_UTF32BE:
3060              return new wxMBConvUTF32BE;
3061
3062         case wxFONTENCODING_UTF32LE:
3063              return new wxMBConvUTF32LE;
3064
3065         default:
3066              // nothing to do but put here to suppress gcc warnings
3067              break;
3068     }
3069
3070     // step (3)
3071 #if wxUSE_FONTMAP
3072     {
3073         wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
3074                                       : new wxMBConv_wxwin(m_encoding);
3075         if ( conv->IsOk() )
3076             return conv;
3077
3078         delete conv;
3079     }
3080 #endif // wxUSE_FONTMAP
3081
3082     // NB: This is a hack to prevent deadlock. What could otherwise happen
3083     //     in Unicode build: wxConvLocal creation ends up being here
3084     //     because of some failure and logs the error. But wxLog will try to
3085     //     attach a timestamp, for which it will need wxConvLocal (to convert
3086     //     time to char* and then wchar_t*), but that fails, tries to log the
3087     //     error, but wxLog has an (already locked) critical section that
3088     //     guards the static buffer.
3089     static bool alreadyLoggingError = false;
3090     if (!alreadyLoggingError)
3091     {
3092         alreadyLoggingError = true;
3093         wxLogError(_("Cannot convert from the charset '%s'!"),
3094                    m_name ? m_name
3095                       :
3096 #if wxUSE_FONTMAP
3097                          (const char*)wxFontMapperBase::GetEncodingDescription(m_encoding).ToAscii()
3098 #else // !wxUSE_FONTMAP
3099                          (const char*)wxString::Format(_("encoding %i"), m_encoding).ToAscii()
3100 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
3101               );
3102
3103         alreadyLoggingError = false;
3104     }
3105
3106     return NULL;
3107 }
3108
3109 void wxCSConv::CreateConvIfNeeded() const
3110 {
3111     if ( m_deferred )
3112     {
3113         wxCSConv *self = (wxCSConv *)this; // const_cast
3114
3115         // if we don't have neither the name nor the encoding, use the default
3116         // encoding for this system
3117         if ( !m_name && m_encoding == wxFONTENCODING_SYSTEM )
3118         {
3119 #if wxUSE_INTL
3120             self->m_encoding = wxLocale::GetSystemEncoding();
3121 #else
3122             // fallback to some reasonable default:
3123             self->m_encoding = wxFONTENCODING_ISO8859_1;
3124 #endif // wxUSE_INTL
3125         }
3126
3127         self->m_convReal = DoCreate();
3128         self->m_deferred = false;
3129     }
3130 }
3131
3132 bool wxCSConv::IsOk() const
3133 {
3134     CreateConvIfNeeded();
3135
3136     // special case: no convReal created for wxFONTENCODING_ISO8859_1
3137     if ( m_encoding == wxFONTENCODING_ISO8859_1 )
3138         return true; // always ok as we do it ourselves
3139
3140     // m_convReal->IsOk() is called at its own creation, so we know it must
3141     // be ok if m_convReal is non-NULL
3142     return m_convReal != NULL;
3143 }
3144
3145 size_t wxCSConv::ToWChar(wchar_t *dst, size_t dstLen,
3146                          const char *src, size_t srcLen) const
3147 {
3148     CreateConvIfNeeded();
3149
3150     if (m_convReal)
3151         return m_convReal->ToWChar(dst, dstLen, src, srcLen);
3152
3153     // latin-1 (direct)
3154     if ( srcLen == wxNO_LEN )
3155         srcLen = strlen(src) + 1; // take trailing NUL too
3156
3157     if ( dst )
3158     {
3159         if ( dstLen < srcLen )
3160             return wxCONV_FAILED;
3161
3162         for ( size_t n = 0; n < srcLen; n++ )
3163             dst[n] = (unsigned char)(src[n]);
3164     }
3165
3166     return srcLen;
3167 }
3168
3169 size_t wxCSConv::FromWChar(char *dst, size_t dstLen,
3170                            const wchar_t *src, size_t srcLen) const
3171 {
3172     CreateConvIfNeeded();
3173
3174     if (m_convReal)
3175         return m_convReal->FromWChar(dst, dstLen, src, srcLen);
3176
3177     // latin-1 (direct)
3178     if ( srcLen == wxNO_LEN )
3179         srcLen = wxWcslen(src) + 1;
3180
3181     if ( dst )
3182     {
3183         if ( dstLen < srcLen )
3184             return wxCONV_FAILED;
3185
3186         for ( size_t n = 0; n < srcLen; n++ )
3187         {
3188             if ( src[n] > 0xFF )
3189                 return wxCONV_FAILED;
3190
3191             dst[n] = (char)src[n];
3192         }
3193
3194     }
3195     else // still need to check the input validity
3196     {
3197         for ( size_t n = 0; n < srcLen; n++ )
3198         {
3199             if ( src[n] > 0xFF )
3200                 return wxCONV_FAILED;
3201         }
3202     }
3203
3204     return srcLen;
3205 }
3206
3207 size_t wxCSConv::GetMBNulLen() const
3208 {
3209     CreateConvIfNeeded();
3210
3211     if ( m_convReal )
3212     {
3213         return m_convReal->GetMBNulLen();
3214     }
3215
3216     // otherwise, we are ISO-8859-1
3217     return 1;
3218 }
3219
#if wxUSE_UNICODE_UTF8
// Return what the real conversion object's IsUTF8() returns if there is one
// and false otherwise.
bool wxCSConv::IsUTF8() const
{
    CreateConvIfNeeded();

    // without the real converter we are ISO-8859-1
    return m_convReal ? m_convReal->IsUTF8() : false;
}
#endif
3234
3235
3236 #if wxUSE_UNICODE
3237
3238 wxWCharBuffer wxSafeConvertMB2WX(const char *s)
3239 {
3240     if ( !s )
3241         return wxWCharBuffer();
3242
3243     wxWCharBuffer wbuf(wxConvLibc.cMB2WX(s));
3244     if ( !wbuf )
3245         wbuf = wxMBConvUTF8().cMB2WX(s);
3246     if ( !wbuf )
3247         wbuf = wxConvISO8859_1.cMB2WX(s);
3248
3249     return wbuf;
3250 }
3251
3252 wxCharBuffer wxSafeConvertWX2MB(const wchar_t *ws)
3253 {
3254     if ( !ws )
3255         return wxCharBuffer();
3256
3257     wxCharBuffer buf(wxConvLibc.cWX2MB(ws));
3258     if ( !buf )
3259         buf = wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL).cWX2MB(ws);
3260
3261     return buf;
3262 }
3263
3264 #endif // wxUSE_UNICODE
3265
3266 // ----------------------------------------------------------------------------
3267 // globals
3268 // ----------------------------------------------------------------------------
3269
3270 // NB: The reason why we create converter objects in this convoluted way,
3271 //     using a factory function instead of a global variable, is that they
3272 //     may be used at static initialization time (some of them are used by
3273 //     wxString ctors and there may be a global wxString object). In other
3274 //     words, possibly _before_ the converter global object would be
3275 //     initialized.
3276
// these names are undefined here to allow using them as the "name" argument
// of the macros below
3277 #undef wxConvLibc
3278 #undef wxConvUTF8
3279 #undef wxConvUTF7
3280 #undef wxConvLocal
3281 #undef wxConvISO8859_1
3282
// Define the global converter "name": this defines the name##Ptr pointer of
// type klass* (initially NULL), the wxGet_##name##Ptr() function returning
// the address of its function-static object of type impl_klass constructed
// with ctor_args and a static variable initialized by calling this function.
3283 #define WX_DEFINE_GLOBAL_CONV2(klass, impl_klass, name, ctor_args)      \
3284     WXDLLIMPEXP_DATA_BASE(klass*) name##Ptr = NULL;                     \
3285     WXDLLIMPEXP_BASE klass* wxGet_##name##Ptr()                         \
3286     {                                                                   \
3287         static impl_klass name##Obj ctor_args;                          \
3288         return &name##Obj;                                              \
3289     }                                                                   \
3290     /* this ensures that all global converter objects are created */    \
3291     /* by the time static initialization is done, i.e. before any */    \
3292     /* thread is launched: */                                           \
3293     static klass* gs_##name##instance = wxGet_##name##Ptr()
3294
// Same as WX_DEFINE_GLOBAL_CONV2() when the object itself is of class klass.
3295 #define WX_DEFINE_GLOBAL_CONV(klass, name, ctor_args) \
3296     WX_DEFINE_GLOBAL_CONV2(klass, klass, name, ctor_args)
3297
3298 #ifdef __WINDOWS__
3299     WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConv_win32, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
3300 #else
3301     WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConvLibc, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
3302 #endif
3303
3304 // NB: we can't use wxEMPTY_PARAMETER_VALUE as final argument here because it's
3305 //     passed to WX_DEFINE_GLOBAL_CONV2 after a macro expansion and so still
3306 //     provokes an error message about "not enough macro parameters"; and we
3307 //     can't use "()" here as the name##Obj declaration would be parsed as a
3308 //     function declaration then, so use a semicolon and live with an extra
3309 //     empty statement (and hope that no compiler warns about this)
3310 WX_DEFINE_GLOBAL_CONV(wxMBConvStrictUTF8, wxConvUTF8, ;);
3311 WX_DEFINE_GLOBAL_CONV(wxMBConvUTF7, wxConvUTF7, ;);
3312
3313 WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvLocal, (wxFONTENCODING_SYSTEM));
3314 WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvISO8859_1, (wxFONTENCODING_ISO8859_1));
3315
// these pointers are initialized with the libc and local converter objects
// defined above
3316 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = wxGet_wxConvLibcPtr();
3317 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvUI = wxGet_wxConvLocalPtr();
3318
3319 #ifdef __DARWIN__
3320 // The xnu kernel always communicates file paths in decomposed UTF-8.
3321 // WARNING: Are we sure that CFString's conversion will cause decomposition?
3322 static wxMBConv_cf wxConvMacUTF8DObj(wxFONTENCODING_UTF8);
3323 #endif
3324
// wxConvFileName points to the object above under Darwin and to the libc
// converter elsewhere
3325 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName =
3326 #ifdef __DARWIN__
3327                                     &wxConvMacUTF8DObj;
3328 #else // !__DARWIN__
3329                                     wxGet_wxConvLibcPtr();
3330 #endif // __DARWIN__/!__DARWIN__
3330 #endif // __DARWIN__/!__DARWIN__
3331
3332 #else // !wxUSE_WCHAR_T
3333
3334 // FIXME-UTF8: remove this, wxUSE_WCHAR_T is required now
3335 // stand-ins in absence of wchar_t: in this case the global converters are
// plain wxMBConv objects
3336 WXDLLIMPEXP_DATA_BASE(wxMBConv) wxConvLibc,
3337                                 wxConvISO8859_1,
3338                                 wxConvLocal,
3339                                 wxConvUTF8;
3340
3341 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T