src/common/strconv.cpp

   1 /////////////////////////////////////////////////////////////////////////////
   2 // Name:        src/common/strconv.cpp
   3 // Purpose:     Unicode conversion classes
   4 // Author:      Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
   5 //              Ryan Norton, Fredrik Roubert (UTF7)
   6 // Modified by:
   7 // Created:     29/01/98
   8 // RCS-ID:      $Id$
   9 // Copyright:   (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
  10 //              (c) 2000-2003 Vadim Zeitlin
  11 //              (c) 2004 Ryan Norton, Fredrik Roubert
  12 // Licence:     wxWindows licence
  13 /////////////////////////////////////////////////////////////////////////////
  14
  15 // For compilers that support precompilation, includes "wx.h".
  16 #include "wx/wxprec.h"
  17
  18 #ifdef __BORLANDC__
  19     #pragma hdrstop
  20 #endif  //__BORLANDC__
  21
  22 #ifndef WX_PRECOMP
  23     #include "wx/intl.h"
  24     #include "wx/log.h"
  25     #include "wx/utils.h"
  26     #include "wx/hashmap.h"
  27 #endif
  28
  29 #include "wx/strconv.h"
  30
  31 #if wxUSE_WCHAR_T
  32
  33 #ifndef __WXWINCE__
  34 #include <errno.h>
  35 #endif
  36
  37 #include <ctype.h>
  38 #include <string.h>
  39 #include <stdlib.h>
  40
  41 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
  42     #include "wx/msw/private.h"
  43     #include "wx/msw/missing.h"
  44     #define wxHAVE_WIN32_MB2WC
  45 #endif
  46
  47 #ifdef HAVE_ICONV
  48     #include <iconv.h>
  49     #include "wx/thread.h"
  50 #endif
  51
  52 #include "wx/encconv.h"
  53 #include "wx/fontmap.h"
  54
  55 #ifdef __DARWIN__
  56 #include "wx/osx/core/private/strconv_cf.h"
  57 #endif //def __DARWIN__
  58
  59
  60 #define TRACE_STRCONV _T("strconv")
  61
  62 // WC_UTF16 is defined only if sizeof(wchar_t) == 2, otherwise it's supposed to
  63 // be 4 bytes
  64 #if SIZEOF_WCHAR_T == 2
  65     #define WC_UTF16
  66 #endif
  67
  68
  69 // ============================================================================
  70 // implementation
  71 // ============================================================================
  72
  73 // helper function of cMB2WC(): check if n bytes at this location are all NUL
  74 static bool NotAllNULs(const char *p, size_t n)
  75 {
  76     while ( n && *p++ == '\0' )
  77         n--;
  78
  79     return n != 0;
  80 }
  81
  82 // ----------------------------------------------------------------------------
  83 // UTF-16 en/decoding to/from UCS-4 with surrogates handling
  84 // ----------------------------------------------------------------------------
  85
  86 static size_t encode_utf16(wxUint32 input, wxUint16 *output)
  87 {
  88     if (input <= 0xffff)
  89     {
  90         if (output)
  91             *output = (wxUint16) input;
  92
  93         return 1;
  94     }
  95     else if (input >= 0x110000)
  96     {
  97         return wxCONV_FAILED;
  98     }
  99     else
 100     {
 101         if (output)
 102         {
 103             *output++ = (wxUint16) ((input >> 10) + 0xd7c0);
 104             *output = (wxUint16) ((input & 0x3ff) + 0xdc00);
 105         }
 106
 107         return 2;
 108     }
 109 }
 110
 111 static size_t decode_utf16(const wxUint16* input, wxUint32& output)
 112 {
 113     if ((*input < 0xd800) || (*input > 0xdfff))
 114     {
 115         output = *input;
 116         return 1;
 117     }
 118     else if ((input[1] < 0xdc00) || (input[1] > 0xdfff))
 119     {
 120         output = *input;
 121         return wxCONV_FAILED;
 122     }
 123     else
 124     {
 125         output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
 126         return 2;
 127     }
 128 }
 129
 130 #ifdef WC_UTF16
 131     typedef wchar_t wxDecodeSurrogate_t;
 132 #else // !WC_UTF16
 133     typedef wxUint16 wxDecodeSurrogate_t;
 134 #endif // WC_UTF16/!WC_UTF16
 135
 136 // returns the next UTF-32 character from the wchar_t buffer and advances the
 137 // pointer to the character after this one
 138 //
 139 // if an invalid character is found, *pSrc is set to NULL, the caller must
 140 // check for this
 141 static wxUint32 wxDecodeSurrogate(const wxDecodeSurrogate_t **pSrc)
 142 {
 143     wxUint32 out;
 144     const size_t
 145         n = decode_utf16(wx_reinterpret_cast(const wxUint16 *, *pSrc), out);
 146     if ( n == wxCONV_FAILED )
 147         *pSrc = NULL;
 148     else
 149         *pSrc += n;
 150
 151     return out;
 152 }
 153
 154 // ----------------------------------------------------------------------------
 155 // wxMBConv
 156 // ----------------------------------------------------------------------------
 157
 158 size_t
 159 wxMBConv::ToWChar(wchar_t *dst, size_t dstLen,
 160                   const char *src, size_t srcLen) const
 161 {
 162     // although new conversion classes are supposed to implement this function
 163     // directly, the existing ones only implement the old MB2WC() and so, to
 164     // avoid to have to rewrite all conversion classes at once, we provide a
 165     // default (but not efficient) implementation of this one in terms of the
 166     // old function by copying the input to ensure that it's NUL-terminated and
 167     // then using MB2WC() to convert it
 168     //
 169     // moreover, some conversion classes simply can't implement ToWChar()
 170     // directly, the primary example is wxConvLibc: mbstowcs() only handles
 171     // NUL-terminated strings
 172
 173     // the number of chars [which would be] written to dst [if it were not NULL]
 174     size_t dstWritten = 0;
 175
 176     // the number of NULs terminating this string
 177     size_t nulLen = 0;  // not really needed, but just to avoid warnings
 178
 179     // if we were not given the input size we just have to assume that the
 180     // string is properly terminated as we have no way of knowing how long it
 181     // is anyhow, but if we do have the size check whether there are enough
 182     // NULs at the end
 183     wxCharBuffer bufTmp;
 184     const char *srcEnd;
 185     if ( srcLen != wxNO_LEN )
 186     {
 187         // we need to know how to find the end of this string
 188         nulLen = GetMBNulLen();
 189         if ( nulLen == wxCONV_FAILED )
 190             return wxCONV_FAILED;
 191
 192         // if there are enough NULs we can avoid the copy
 193         if ( srcLen < nulLen || NotAllNULs(src + srcLen - nulLen, nulLen) )
 194         {
 195             // make a copy in order to properly NUL-terminate the string
 196             bufTmp = wxCharBuffer(srcLen + nulLen - 1 /* 1 will be added */);
 197             char * const p = bufTmp.data();
 198             memcpy(p, src, srcLen);
 199             for ( char *s = p + srcLen; s < p + srcLen + nulLen; s++ )
 200                 *s = '\0';
 201
 202             src = bufTmp;
 203         }
 204
 205         srcEnd = src + srcLen;
 206     }
 207     else // quit after the first loop iteration
 208     {
 209         srcEnd = NULL;
 210     }
 211
 212     // the idea of this code is straightforward: it converts a NUL-terminated
 213     // chunk of the string during each iteration and updates the output buffer
 214     // with the result
 215     //
 216     // all the complication come from the fact that this function, for
 217     // historical reasons, must behave in 2 subtly different ways when it's
 218     // called with a fixed number of characters and when it's called for the
 219     // entire NUL-terminated string: in the former case (srcEnd == NULL) we
 220     // must count all characters we convert, NUL or not; but in the latter we
 221     // do not count the trailing NUL -- but still count all the NULs inside the
 222     // string
 223     //
 224     // so for the (simple) former case we just always count the trailing NUL,
 225     // but for the latter we need to wait until we see if there is going to be
 226     // another loop iteration and only count it then
 227     for ( ;; )
 228     {
 229         // try to convert the current chunk
 230         size_t lenChunk = MB2WC(NULL, src, 0);
 231         if ( lenChunk == wxCONV_FAILED )
 232             return wxCONV_FAILED;
 233
 234         dstWritten += lenChunk;
 235         if ( !srcEnd )
 236             dstWritten++;
 237
 238         if ( !lenChunk )
 239         {
 240             // nothing left in the input string, conversion succeeded
 241             break;
 242         }
 243
 244         if ( dst )
 245         {
 246             if ( dstWritten > dstLen )
 247                 return wxCONV_FAILED;
 248
 249             // +1 is for trailing NUL
 250             if ( MB2WC(dst, src, lenChunk + 1) == wxCONV_FAILED )
 251                 return wxCONV_FAILED;
 252
 253             dst += lenChunk;
 254             if ( !srcEnd )
 255                 dst++;
 256         }
 257
 258         if ( !srcEnd )
 259         {
 260             // we convert just one chunk in this case as this is the entire
 261             // string anyhow
 262             break;
 263         }
 264
 265         // advance the input pointer past the end of this chunk
 266         while ( NotAllNULs(src, nulLen) )
 267         {
 268             // notice that we must skip over multiple bytes here as we suppose
 269             // that if NUL takes 2 or 4 bytes, then all the other characters do
 270             // too and so if advanced by a single byte we might erroneously
 271             // detect sequences of NUL bytes in the middle of the input
 272             src += nulLen;
 273         }
 274
 275         src += nulLen; // skipping over its terminator as well
 276
 277         // note that ">=" (and not just "==") is needed here as the terminator
 278         // we skipped just above could be inside or just after the buffer
 279         // delimited by srcEnd
 280         if ( src >= srcEnd )
 281             break;
 282
 283         // if we got here then this wasn't the last chunk in this string and
 284         // hence we must count an extra char for L'\0' even when converting a
 285         // fixed number of characters
 286         if ( srcEnd )
 287         {
 288             dstWritten++;
 289             if ( dst )
 290                 dst++;
 291         }
 292     }
 293
 294     return dstWritten;
 295 }
 296
 297 size_t
 298 wxMBConv::FromWChar(char *dst, size_t dstLen,
 299                     const wchar_t *src, size_t srcLen) const
 300 {
 301     // the number of chars [which would be] written to dst [if it were not NULL]
 302     size_t dstWritten = 0;
 303
 304     // if we don't know its length we have no choice but to assume that it is
 305     // NUL-terminated (notice that it can still be NUL-terminated even if
 306     // explicit length is given but it doesn't change our return value)
 307     const bool isNulTerminated = srcLen == wxNO_LEN;
 308
 309     // make a copy of the input string unless it is already properly
 310     // NUL-terminated
 311     wxWCharBuffer bufTmp;
 312     if ( isNulTerminated )
 313     {
 314         srcLen = wxWcslen(src) + 1;
 315     }
 316     else if ( srcLen != 0 && src[srcLen - 1] != L'\0' )
 317     {
 318         // make a copy in order to properly NUL-terminate the string
 319         bufTmp = wxWCharBuffer(srcLen);
 320         memcpy(bufTmp.data(), src, srcLen * sizeof(wchar_t));
 321         src = bufTmp;
 322     }
 323
 324     const size_t lenNul = GetMBNulLen();
 325     for ( const wchar_t * const srcEnd = src + srcLen;
 326           src < srcEnd;
 327           src += wxWcslen(src) + 1 /* skip L'\0' too */ )
 328     {
 329         // try to convert the current chunk
 330         size_t lenChunk = WC2MB(NULL, src, 0);
 331
 332         if ( lenChunk == wxCONV_FAILED )
 333             return wxCONV_FAILED;
 334
 335         dstWritten += lenChunk;
 336         if ( isNulTerminated )
 337             dstWritten += lenNul;
 338
 339         if ( dst )
 340         {
 341             if ( dstWritten > dstLen )
 342                 return wxCONV_FAILED;
 343
 344             if ( WC2MB(dst, src, lenChunk + lenNul) == wxCONV_FAILED )
 345                 return wxCONV_FAILED;
 346
 347             dst += lenChunk;
 348             if ( isNulTerminated )
 349                 dst += lenNul;
 350         }
 351     }
 352
 353     return dstWritten;
 354 }
 355
 356 size_t wxMBConv::MB2WC(wchar_t *outBuff, const char *inBuff, size_t outLen) const
 357 {
 358     // add 1 to available buffer length because MB2WC() parameter counts the
 359     // number of non-NUL characters while ToWChar() counts everything
 360     size_t rc = ToWChar(outBuff, outLen + 1, inBuff);
 361     if ( rc != wxCONV_FAILED )
 362     {
 363         // ToWChar() returns the buffer length, i.e. including the trailing
 364         // NUL, while this method doesn't take it into account
 365         rc--;
 366     }
 367
 368     return rc;
 369 }
 370
 371 size_t wxMBConv::WC2MB(char *outBuff, const wchar_t *inBuff, size_t outLen) const
 372 {
 373     const size_t nulLen = GetMBNulLen();
 374
 375     size_t rc = FromWChar(outBuff, outLen + nulLen, inBuff);
 376     if ( rc != wxCONV_FAILED )
 377     {
 378         rc -= nulLen;
 379     }
 380
 381     return rc;
 382 }
 383
 384 wxMBConv::~wxMBConv()
 385 {
 386     // nothing to do here (necessary for Darwin linking probably)
 387 }
 388
 389 const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
 390 {
 391     if ( psz )
 392     {
 393         // calculate the length of the buffer needed first
 394         const size_t nLen = ToWChar(NULL, 0, psz);
 395         if ( nLen != wxCONV_FAILED )
 396         {
 397             // now do the actual conversion
 398             wxWCharBuffer buf(nLen - 1 /* +1 added implicitly */);
 399
 400             // +1 for the trailing NULL
 401             if ( ToWChar(buf.data(), nLen, psz) != wxCONV_FAILED )
 402                 return buf;
 403         }
 404     }
 405
 406     return wxWCharBuffer();
 407 }
 408
 409 const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
 410 {
 411     if ( pwz )
 412     {
 413         const size_t nLen = FromWChar(NULL, 0, pwz);
 414         if ( nLen != wxCONV_FAILED )
 415         {
 416             wxCharBuffer buf(nLen - 1);
 417             if ( FromWChar(buf.data(), nLen, pwz) != wxCONV_FAILED )
 418                 return buf;
 419         }
 420     }
 421
 422     return wxCharBuffer();
 423 }
 424
 425 const wxWCharBuffer
 426 wxMBConv::cMB2WC(const char *inBuff, size_t inLen, size_t *outLen) const
 427 {
 428     const size_t dstLen = ToWChar(NULL, 0, inBuff, inLen);
 429     if ( dstLen != wxCONV_FAILED )
 430     {
 431         // notice that we allocate space for dstLen+1 wide characters here
 432         // because we want the buffer to always be NUL-terminated, even if the
 433         // input isn't (as otherwise the caller has no way to know its length)
 434         wxWCharBuffer wbuf(dstLen);
 435         wbuf.data()[dstLen] = L'\0';
 436         if ( ToWChar(wbuf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
 437         {
 438             if ( outLen )
 439             {
 440                 *outLen = dstLen;
 441
 442                 // we also need to handle NUL-terminated input strings
 443                 // specially: for them the output is the length of the string
 444                 // excluding the trailing NUL, however if we're asked to
 445                 // convert a specific number of characters we return the length
 446                 // of the resulting output even if it's NUL-terminated
 447                 if ( inLen == wxNO_LEN )
 448                     (*outLen)--;
 449             }
 450
 451             return wbuf;
 452         }
 453     }
 454
 455     if ( outLen )
 456         *outLen = 0;
 457
 458     return wxWCharBuffer();
 459 }
 460
 461 const wxCharBuffer
 462 wxMBConv::cWC2MB(const wchar_t *inBuff, size_t inLen, size_t *outLen) const
 463 {
 464     size_t dstLen = FromWChar(NULL, 0, inBuff, inLen);
 465     if ( dstLen != wxCONV_FAILED )
 466     {
 467         const size_t nulLen = GetMBNulLen();
 468
 469         // as above, ensure that the buffer is always NUL-terminated, even if
 470         // the input is not
 471         wxCharBuffer buf(dstLen + nulLen - 1);
 472         memset(buf.data() + dstLen, 0, nulLen);
 473         if ( FromWChar(buf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
 474         {
 475             if ( outLen )
 476             {
 477                 *outLen = dstLen;
 478
 479                 if ( inLen == wxNO_LEN )
 480                 {
 481                     // in this case both input and output are NUL-terminated
 482                     // and we're not supposed to count NUL
 483                     *outLen -= nulLen;
 484                 }
 485             }
 486
 487             return buf;
 488         }
 489     }
 490
 491     if ( outLen )
 492         *outLen = 0;
 493
 494     return wxCharBuffer();
 495 }
 496
 497 // ----------------------------------------------------------------------------
 498 // wxMBConvLibc
 499 // ----------------------------------------------------------------------------
 500
 501 size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 502 {
 503     return wxMB2WC(buf, psz, n);
 504 }
 505
 506 size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 507 {
 508     return wxWC2MB(buf, psz, n);
 509 }
 510
 511 // ----------------------------------------------------------------------------
 512 // wxConvBrokenFileNames
 513 // ----------------------------------------------------------------------------
 514
 515 #ifdef __UNIX__
 516
 517 wxConvBrokenFileNames::wxConvBrokenFileNames(const wxString& charset)
 518 {
 519     if ( wxStricmp(charset, _T("UTF-8")) == 0 ||
 520          wxStricmp(charset, _T("UTF8")) == 0  )
 521         m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_PUA);
 522     else
 523         m_conv = new wxCSConv(charset);
 524 }
 525
 526 #endif // __UNIX__
 527
 528 // ----------------------------------------------------------------------------
 529 // UTF-7
 530 // ----------------------------------------------------------------------------
 531
 532 // Implementation (C) 2004 Fredrik Roubert
 533 //
 534 // Changes to work in streaming mode (C) 2008 Vadim Zeitlin
 535
 536 //
 537 // BASE64 decoding table
 538 //
 539 static const unsigned char utf7unb64[] =
 540 {
 541     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 542     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 543     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 544     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 545     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 546     0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
 547     0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b,
 548     0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 549     0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
 550     0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
 551     0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
 552     0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
 553     0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
 554     0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
 555     0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30,
 556     0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
 557     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 558     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 559     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 560     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 561     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 562     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 563     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 564     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 565     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 566     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 567     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 568     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 569     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 570     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 571     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 572     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
 573 };
 574
 575 size_t wxMBConvUTF7::ToWChar(wchar_t *dst, size_t dstLen,
 576                              const char *src, size_t srcLen) const
 577 {
 578     DecoderState stateOrig,
 579                 *statePtr;
 580     if ( srcLen == wxNO_LEN )
 581     {
 582         // convert the entire string, up to and including the trailing NUL
 583         srcLen = strlen(src) + 1;
 584
 585         // when working on the entire strings we don't update nor use the shift
 586         // state from the previous call
 587         statePtr = &stateOrig;
 588     }
 589     else // when working with partial strings we do use the shift state
 590     {
 591         statePtr = wx_const_cast(DecoderState *, &m_stateDecoder);
 592
 593         // also save the old state to be able to rollback to it on error
 594         stateOrig = m_stateDecoder;
 595     }
 596
 597     // but to simplify the code below we use this variable in both cases
 598     DecoderState& state = *statePtr;
 599
 600
 601     // number of characters [which would have been] written to dst [if it were
 602     // not NULL]
 603     size_t len = 0;
 604
 605     const char * const srcEnd = src + srcLen;
 606
 607     while ( (src < srcEnd) && (!dst || (len < dstLen)) )
 608     {
 609         const unsigned char cc = *src++;
 610
 611         if ( state.IsShifted() )
 612         {
 613             const unsigned char dc = utf7unb64[cc];
 614             if ( dc == 0xff )
 615             {
 616                 // end of encoded part, check that nothing was left: there can
 617                 // be up to 4 bits of 0 padding but nothing else (we also need
 618                 // to check isLSB as we count bits modulo 8 while a valid UTF-7
 619                 // encoded sequence must contain an integral number of UTF-16
 620                 // characters)
 621                 if ( state.isLSB || state.bit > 4 ||
 622                         (state.accum & ((1 << state.bit) - 1)) )
 623                 {
 624                     if ( !len )
 625                         state = stateOrig;
 626
 627                     return wxCONV_FAILED;
 628                 }
 629
 630                 state.ToDirect();
 631
 632                 // re-parse this character normally below unless it's '-' which
 633                 // is consumed by the decoder
 634                 if ( cc == '-' )
 635                     continue;
 636             }
 637             else // valid encoded character
 638             {
 639                 // mini base64 decoder: each character is 6 bits
 640                 state.bit += 6;
 641                 state.accum <<= 6;
 642                 state.accum += dc;
 643
 644                 if ( state.bit >= 8 )
 645                 {
 646                     // got the full byte, consume it
 647                     state.bit -= 8;
 648                     unsigned char b = (state.accum >> state.bit) & 0x00ff;
 649
 650                     if ( state.isLSB )
 651                     {
 652                         // we've got the full word, output it
 653                         if ( dst )
 654                             *dst++ = (state.msb << 8) | b;
 655                         len++;
 656                         state.isLSB = false;
 657                     }
 658                     else // MSB
 659                     {
 660                         // just store it while we wait for LSB
 661                         state.msb = b;
 662                         state.isLSB = true;
 663                     }
 664                 }
 665             }
 666         }
 667
 668         if ( state.IsDirect() )
 669         {
 670             // start of an encoded segment?
 671             if ( cc == '+' )
 672             {
 673                 if ( *src == '-' )
 674                 {
 675                     // just the encoded plus sign, don't switch to shifted mode
 676                     if ( dst )
 677                         *dst++ = '+';
 678                     len++;
 679                     src++;
 680                 }
 681                 else if ( utf7unb64[(unsigned)*src] == 0xff )
 682                 {
 683                     // empty encoded chunks are not allowed
 684                     if ( !len )
 685                         state = stateOrig;
 686
 687                     return wxCONV_FAILED;
 688                 }
 689                 else // base-64 encoded chunk follows
 690                 {
 691                     state.ToShifted();
 692                 }
 693             }
 694             else // not '+'
 695             {
 696                 // only printable 7 bit ASCII characters (with the exception of
 697                 // NUL, TAB, CR and LF) can be used directly
 698                 if ( cc >= 0x7f || (cc < ' ' &&
 699                       !(cc == '\0' || cc == '\t' || cc == '\r' || cc == '\n')) )
 700                     return wxCONV_FAILED;
 701
 702                 if ( dst )
 703                     *dst++ = cc;
 704                 len++;
 705             }
 706         }
 707     }
 708
 709     if ( !len )
 710     {
 711         // as we didn't read any characters we should be called with the same
 712         // data (followed by some more new data) again later so don't save our
 713         // state
 714         state = stateOrig;
 715
 716         return wxCONV_FAILED;
 717     }
 718
 719     return len;
 720 }
 721
 722 //
 723 // BASE64 encoding table
 724 //
 725 static const unsigned char utf7enb64[] =
 726 {
 727     'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
 728     'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
 729     'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
 730     'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
 731     'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
 732     'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
 733     'w', 'x', 'y', 'z', '0', '1', '2', '3',
 734     '4', '5', '6', '7', '8', '9', '+', '/'
 735 };
 736
 737 //
 738 // UTF-7 encoding table
 739 //
 740 // 0 - Set D (directly encoded characters)
 741 // 1 - Set O (optional direct characters)
 742 // 2 - whitespace characters (optional)
 743 // 3 - special characters
 744 //
 745 static const unsigned char utf7encode[128] =
 746 {
 747     0, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
 748     3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
 749     2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
 750     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
 751     1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 752     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
 753     1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 754     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
 755 };
 756
 757 static inline bool wxIsUTF7Direct(wchar_t wc)
 758 {
 759     return wc < 0x80 && utf7encode[wc] < 1;
 760 }
 761
 762 size_t wxMBConvUTF7::FromWChar(char *dst, size_t dstLen,
 763                                const wchar_t *src, size_t srcLen) const
 764 {
 765     EncoderState stateOrig,
 766                 *statePtr;
 767     if ( srcLen == wxNO_LEN )
 768     {
 769         // we don't apply the stored state when operating on entire strings at
 770         // once
 771         statePtr = &stateOrig;
 772
 773         srcLen = wxWcslen(src) + 1;
 774     }
 775     else // do use the mode we left the output in previously
 776     {
 777         stateOrig = m_stateEncoder;
 778         statePtr = wx_const_cast(EncoderState *, &m_stateEncoder);
 779     }
 780
 781     EncoderState& state = *statePtr;
 782
 783
 784     size_t len = 0;
 785
 786     const wchar_t * const srcEnd = src + srcLen;
 787     while ( src < srcEnd && (!dst || len < dstLen) )
 788     {
 789         wchar_t cc = *src++;
 790         if ( wxIsUTF7Direct(cc) )
 791         {
 792             if ( state.IsShifted() )
 793             {
 794                 // pad with zeros the last encoded block if necessary
 795                 if ( state.bit )
 796                 {
 797                     if ( dst )
 798                         *dst++ = utf7enb64[((state.accum % 16) << (6 - state.bit)) % 64];
 799                     len++;
 800                 }
 801
 802                 state.ToDirect();
 803
 804                 if ( dst )
 805                     *dst++ = '-';
 806                 len++;
 807             }
 808
 809             if ( dst )
 810                 *dst++ = (char)cc;
 811             len++;
 812         }
 813         else if ( cc == '+' && state.IsDirect() )
 814         {
 815             if ( dst )
 816             {
 817                 *dst++ = '+';
 818                 *dst++ = '-';
 819             }
 820
 821             len += 2;
 822         }
 823 #ifndef WC_UTF16
 824         else if (((wxUint32)cc) > 0xffff)
 825         {
 826             // no surrogate pair generation (yet?)
 827             return wxCONV_FAILED;
 828         }
 829 #endif
 830         else
 831         {
 832             if ( state.IsDirect() )
 833             {
 834                 state.ToShifted();
 835
 836                 if ( dst )
 837                     *dst++ = '+';
 838                 len++;
 839             }
 840
 841             // BASE64 encode string
 842             for ( ;; )
 843             {
 844                 for ( unsigned lsb = 0; lsb < 2; lsb++ )
 845                 {
 846                     state.accum <<= 8;
 847                     state.accum += lsb ? cc & 0xff : (cc & 0xff00) >> 8;
 848
 849                     for (state.bit += 8; state.bit >= 6; )
 850                     {
 851                         state.bit -= 6;
 852                         if ( dst )
 853                             *dst++ = utf7enb64[(state.accum >> state.bit) % 64];
 854                         len++;
 855                     }
 856                 }
 857
 858                 if ( src == srcEnd || wxIsUTF7Direct(cc = *src) )
 859                     break;
 860
 861                 src++;
 862             }
 863         }
 864     }
 865
 866     // we need to restore the original encoder state if we were called just to
 867     // calculate the amount of space needed as we will presumably be called
 868     // again to really convert the data now
 869     if ( !dst )
 870         state = stateOrig;
 871
 872     return len;
 873 }
 874
 875 // ----------------------------------------------------------------------------
 876 // UTF-8
 877 // ----------------------------------------------------------------------------
 878
 879 static const wxUint32 utf8_max[]=
 880     { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
 881
 882 // boundaries of the private use area we use to (temporarily) remap invalid
 883 // characters invalid in a UTF-8 encoded string
 884 const wxUint32 wxUnicodePUA = 0x100000;
 885 const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
 886
 887 // this table gives the length of the UTF-8 encoding from its first character:
 888 const unsigned char tableUtf8Lengths[256] = {
 889     // single-byte sequences (ASCII):
 890     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 00..0F
 891     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 10..1F
 892     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 20..2F
 893     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 30..3F
 894     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 40..4F
 895     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 50..5F
 896     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 60..6F
 897     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 70..7F
 898
 899     // these are invalid:
 900     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 80..8F
 901     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 90..9F
 902     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // A0..AF
 903     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // B0..BF
 904     0, 0,                                            // C0,C1
 905
 906     // two-byte sequences:
 907           2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // C2..CF
 908     2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // D0..DF
 909
 910     // three-byte sequences:
 911     3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,  // E0..EF
 912
 913     // four-byte sequences:
 914     4, 4, 4, 4, 4,                                   // F0..F4
 915
 916     // these are invalid again (5- or 6-byte
 917     // sequences and sequences for code points
 918     // above U+10FFFF, as restricted by RFC 3629):
 919                    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0   // F5..FF
 920 };
 921
 922 size_t
 923 wxMBConvStrictUTF8::ToWChar(wchar_t *dst, size_t dstLen,
 924                             const char *src, size_t srcLen) const
 925 {
 926     wchar_t *out = dstLen ? dst : NULL;
 927     size_t written = 0;
 928
 929     if ( srcLen == wxNO_LEN )
 930         srcLen = strlen(src) + 1;
 931
 932     for ( const char *p = src; ; p++ )
 933     {
 934         if ( !(srcLen == wxNO_LEN ? *p : srcLen) )
 935         {
 936             // all done successfully, just add the trailing NULL if we are not
 937             // using explicit length
 938             if ( srcLen == wxNO_LEN )
 939             {
 940                 if ( out )
 941                 {
 942                     if ( !dstLen )
 943                         break;
 944
 945                     *out = L'\0';
 946                 }
 947
 948                 written++;
 949             }
 950
 951             return written;
 952         }
 953
 954         if ( out && !dstLen-- )
 955             break;
 956
 957         wxUint32 code;
 958         unsigned char c = *p;
 959
 960         if ( c < 0x80 )
 961         {
 962             if ( srcLen == 0 ) // the test works for wxNO_LEN too
 963                 break;
 964
 965             if ( srcLen != wxNO_LEN )
 966                 srcLen--;
 967
 968             code = c;
 969         }
 970         else
 971         {
 972             unsigned len = tableUtf8Lengths[c];
 973             if ( !len )
 974                 break;
 975
 976             if ( srcLen < len ) // the test works for wxNO_LEN too
 977                 break;
 978
 979             if ( srcLen != wxNO_LEN )
 980                 srcLen -= len;
 981
 982             //   Char. number range   |        UTF-8 octet sequence
 983             //      (hexadecimal)     |              (binary)
 984             //  ----------------------+----------------------------------------
 985             //  0000 0000 - 0000 007F | 0xxxxxxx
 986             //  0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
 987             //  0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
 988             //  0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 989             //
 990             //  Code point value is stored in bits marked with 'x',
 991             //  lowest-order bit of the value on the right side in the diagram
 992             //  above.                                         (from RFC 3629)
 993
 994             // mask to extract lead byte's value ('x' bits above), by sequence
 995             // length:
 996             static const unsigned char leadValueMask[] = { 0x7F, 0x1F, 0x0F, 0x07 };
 997
 998             // mask and value of lead byte's most significant bits, by length:
 999             static const unsigned char leadMarkerMask[] = { 0x80, 0xE0, 0xF0, 0xF8 };
1000             static const unsigned char leadMarkerVal[] = { 0x00, 0xC0, 0xE0, 0xF0 };
1001
1002             len--; // it's more convenient to work with 0-based length here
1003
1004             // extract the lead byte's value bits:
1005             if ( (c & leadMarkerMask[len]) != leadMarkerVal[len] )
1006                 break;
1007
1008             code = c & leadValueMask[len];
1009
1010             // all remaining bytes, if any, are handled in the same way
1011             // regardless of sequence's length:
1012             for ( ; len; --len )
1013             {
1014                 c = *++p;
1015                 if ( (c & 0xC0) != 0x80 )
1016                     return wxCONV_FAILED;
1017
1018                 code <<= 6;
1019                 code |= c & 0x3F;
1020             }
1021         }
1022
1023 #ifdef WC_UTF16
1024         // cast is ok because wchar_t == wxUint16 if WC_UTF16
1025         if ( encode_utf16(code, (wxUint16 *)out) == 2 )
1026         {
1027             if ( out )
1028                 out++;
1029             written++;
1030         }
1031 #else // !WC_UTF16
1032         if ( out )
1033             *out = code;
1034 #endif // WC_UTF16/!WC_UTF16
1035
1036         if ( out )
1037             out++;
1038
1039         written++;
1040     }
1041
1042     return wxCONV_FAILED;
1043 }
1044
1045 size_t
1046 wxMBConvStrictUTF8::FromWChar(char *dst, size_t dstLen,
1047                               const wchar_t *src, size_t srcLen) const
1048 {
1049     char *out = dstLen ? dst : NULL;
1050     size_t written = 0;
1051
1052     for ( const wchar_t *wp = src; ; wp++ )
1053     {
1054         if ( !(srcLen == wxNO_LEN ? *wp : srcLen) )
1055         {
1056             // all done successfully, just add the trailing NULL if we are not
1057             // using explicit length
1058             if ( srcLen == wxNO_LEN )
1059             {
1060                 if ( out )
1061                 {
1062                     if ( !dstLen )
1063                         break;
1064
1065                     *out = '\0';
1066                 }
1067
1068                 written++;
1069             }
1070
1071             return written;
1072         }
1073
1074         if ( srcLen != wxNO_LEN )
1075             srcLen--;
1076
1077         wxUint32 code;
1078 #ifdef WC_UTF16
1079         // cast is ok for WC_UTF16
1080         if ( decode_utf16((const wxUint16 *)wp, code) == 2 )
1081         {
1082             // skip the next char too as we decoded a surrogate
1083             wp++;
1084         }
1085 #else // wchar_t is UTF-32
1086         code = *wp & 0x7fffffff;
1087 #endif
1088
1089         unsigned len;
1090         if ( code <= 0x7F )
1091         {
1092             len = 1;
1093             if ( out )
1094             {
1095                 if ( dstLen < len )
1096                     break;
1097
1098                 out[0] = (char)code;
1099             }
1100         }
1101         else if ( code <= 0x07FF )
1102         {
1103             len = 2;
1104             if ( out )
1105             {
1106                 if ( dstLen < len )
1107                     break;
1108
1109                 // NB: this line takes 6 least significant bits, encodes them as
1110                 // 10xxxxxx and discards them so that the next byte can be encoded:
1111                 out[1] = 0x80 | (code & 0x3F);  code >>= 6;
1112                 out[0] = 0xC0 | code;
1113             }
1114         }
1115         else if ( code < 0xFFFF )
1116         {
1117             len = 3;
1118             if ( out )
1119             {
1120                 if ( dstLen < len )
1121                     break;
1122
1123                 out[2] = 0x80 | (code & 0x3F);  code >>= 6;
1124                 out[1] = 0x80 | (code & 0x3F);  code >>= 6;
1125                 out[0] = 0xE0 | code;
1126             }
1127         }
1128         else if ( code <= 0x10FFFF )
1129         {
1130             len = 4;
1131             if ( out )
1132             {
1133                 if ( dstLen < len )
1134                     break;
1135
1136                 out[3] = 0x80 | (code & 0x3F);  code >>= 6;
1137                 out[2] = 0x80 | (code & 0x3F);  code >>= 6;
1138                 out[1] = 0x80 | (code & 0x3F);  code >>= 6;
1139                 out[0] = 0xF0 | code;
1140             }
1141         }
1142         else
1143         {
1144             wxFAIL_MSG( _T("trying to encode undefined Unicode character") );
1145             break;
1146         }
1147
1148         if ( out )
1149         {
1150             out += len;
1151             dstLen -= len;
1152         }
1153
1154         written += len;
1155     }
1156
1157     // we only get here if an error occurs during decoding
1158     return wxCONV_FAILED;
1159 }
1160
1161 size_t wxMBConvUTF8::ToWChar(wchar_t *buf, size_t n,
1162                              const char *psz, size_t srcLen) const
1163 {
1164     if ( m_options == MAP_INVALID_UTF8_NOT )
1165         return wxMBConvStrictUTF8::ToWChar(buf, n, psz, srcLen);
1166
1167     size_t len = 0;
1168
1169     while ((srcLen == wxNO_LEN ? *psz : srcLen--) && ((!buf) || (len < n)))
1170     {
1171         const char *opsz = psz;
1172         bool invalid = false;
1173         unsigned char cc = *psz++, fc = cc;
1174         unsigned cnt;
1175         for (cnt = 0; fc & 0x80; cnt++)
1176             fc <<= 1;
1177
1178         if (!cnt)
1179         {
1180             // plain ASCII char
1181             if (buf)
1182                 *buf++ = cc;
1183             len++;
1184
1185             // escape the escape character for octal escapes
1186             if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
1187                     && cc == '\\' && (!buf || len < n))
1188             {
1189                 if (buf)
1190                     *buf++ = cc;
1191                 len++;
1192             }
1193         }
1194         else
1195         {
1196             cnt--;
1197             if (!cnt)
1198             {
1199                 // invalid UTF-8 sequence
1200                 invalid = true;
1201             }
1202             else
1203             {
1204                 unsigned ocnt = cnt - 1;
1205                 wxUint32 res = cc & (0x3f >> cnt);
1206                 while (cnt--)
1207                 {
1208                     cc = *psz;
1209                     if ((cc & 0xC0) != 0x80)
1210                     {
1211                         // invalid UTF-8 sequence
1212                         invalid = true;
1213                         break;
1214                     }
1215
1216                     psz++;
1217                     res = (res << 6) | (cc & 0x3f);
1218                 }
1219
1220                 if (invalid || res <= utf8_max[ocnt])
1221                 {
1222                     // illegal UTF-8 encoding
1223                     invalid = true;
1224                 }
1225                 else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
1226                         res >= wxUnicodePUA && res < wxUnicodePUAEnd)
1227                 {
1228                     // if one of our PUA characters turns up externally
1229                     // it must also be treated as an illegal sequence
1230                     // (a bit like you have to escape an escape character)
1231                     invalid = true;
1232                 }
1233                 else
1234                 {
1235 #ifdef WC_UTF16
1236                     // cast is ok because wchar_t == wxUint16 if WC_UTF16
1237                     size_t pa = encode_utf16(res, (wxUint16 *)buf);
1238                     if (pa == wxCONV_FAILED)
1239                     {
1240                         invalid = true;
1241                     }
1242                     else
1243                     {
1244                         if (buf)
1245                             buf += pa;
1246                         len += pa;
1247                     }
1248 #else // !WC_UTF16
1249                     if (buf)
1250                         *buf++ = (wchar_t)res;
1251                     len++;
1252 #endif // WC_UTF16/!WC_UTF16
1253                 }
1254             }
1255
1256             if (invalid)
1257             {
1258                 if (m_options & MAP_INVALID_UTF8_TO_PUA)
1259                 {
1260                     while (opsz < psz && (!buf || len < n))
1261                     {
1262 #ifdef WC_UTF16
1263                         // cast is ok because wchar_t == wxUuint16 if WC_UTF16
1264                         size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
1265                         wxASSERT(pa != wxCONV_FAILED);
1266                         if (buf)
1267                             buf += pa;
1268                         opsz++;
1269                         len += pa;
1270 #else
1271                         if (buf)
1272                             *buf++ = (wchar_t)(wxUnicodePUA + (unsigned char)*opsz);
1273                         opsz++;
1274                         len++;
1275 #endif
1276                     }
1277                 }
1278                 else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
1279                 {
1280                     while (opsz < psz && (!buf || len < n))
1281                     {
1282                         if ( buf && len + 3 < n )
1283                         {
1284                             unsigned char on = *opsz;
1285                             *buf++ = L'\\';
1286                             *buf++ = (wchar_t)( L'0' + on / 0100 );
1287                             *buf++ = (wchar_t)( L'0' + (on % 0100) / 010 );
1288                             *buf++ = (wchar_t)( L'0' + on % 010 );
1289                         }
1290
1291                         opsz++;
1292                         len += 4;
1293                     }
1294                 }
1295                 else // MAP_INVALID_UTF8_NOT
1296                 {
1297                     return wxCONV_FAILED;
1298                 }
1299             }
1300         }
1301     }
1302
1303     if (srcLen == wxNO_LEN && buf && (len < n))
1304         *buf = 0;
1305
1306     return len + 1;
1307 }
1308
// return true if the given character is an octal digit, i.e. one of '0'..'7'
static inline bool isoctal(wchar_t wch)
{
    if ( wch < L'0' )
        return false;

    return wch <= L'7';
}
1313
1314 size_t wxMBConvUTF8::FromWChar(char *buf, size_t n,
1315                                const wchar_t *psz, size_t srcLen) const
1316 {
1317     if ( m_options == MAP_INVALID_UTF8_NOT )
1318         return wxMBConvStrictUTF8::FromWChar(buf, n, psz, srcLen);
1319
1320     size_t len = 0;
1321
1322     while ((srcLen == wxNO_LEN ? *psz : srcLen--) && ((!buf) || (len < n)))
1323     {
1324         wxUint32 cc;
1325
1326 #ifdef WC_UTF16
1327         // cast is ok for WC_UTF16
1328         size_t pa = decode_utf16((const wxUint16 *)psz, cc);
1329         psz += (pa == wxCONV_FAILED) ? 1 : pa;
1330 #else
1331         cc = (*psz++) & 0x7fffffff;
1332 #endif
1333
1334         if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
1335                 && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
1336         {
1337             if (buf)
1338                 *buf++ = (char)(cc - wxUnicodePUA);
1339             len++;
1340         }
1341         else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL)
1342                     && cc == L'\\' && psz[0] == L'\\' )
1343         {
1344             if (buf)
1345                 *buf++ = (char)cc;
1346             psz++;
1347             len++;
1348         }
1349         else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
1350                     cc == L'\\' &&
1351                         isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
1352         {
1353             if (buf)
1354             {
1355                 *buf++ = (char) ((psz[0] - L'0') * 0100 +
1356                                  (psz[1] - L'0') * 010 +
1357                                  (psz[2] - L'0'));
1358             }
1359
1360             psz += 3;
1361             len++;
1362         }
1363         else
1364         {
1365             unsigned cnt;
1366             for (cnt = 0; cc > utf8_max[cnt]; cnt++)
1367             {
1368             }
1369
1370             if (!cnt)
1371             {
1372                 // plain ASCII char
1373                 if (buf)
1374                     *buf++ = (char) cc;
1375                 len++;
1376             }
1377             else
1378             {
1379                 len += cnt + 1;
1380                 if (buf)
1381                 {
1382                     *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
1383                     while (cnt--)
1384                         *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
1385                 }
1386             }
1387         }
1388     }
1389
1390     if (srcLen == wxNO_LEN && buf && (len < n))
1391         *buf = 0;
1392
1393     return len + 1;
1394 }
1395
1396 // ============================================================================
1397 // UTF-16
1398 // ============================================================================
1399
// "straight" is the name used below for the UTF-16 converter class selected
// when WORDS_BIGENDIAN matches its byte order and "swap" is the name of the
// other one
#ifndef WORDS_BIGENDIAN
    #define wxMBConvUTF16straight wxMBConvUTF16LE
    #define wxMBConvUTF16swap     wxMBConvUTF16BE
#else // WORDS_BIGENDIAN
    #define wxMBConvUTF16straight wxMBConvUTF16BE
    #define wxMBConvUTF16swap     wxMBConvUTF16LE
#endif // !WORDS_BIGENDIAN/WORDS_BIGENDIAN
1407
1408 /* static */
1409 size_t wxMBConvUTF16Base::GetLength(const char *src, size_t srcLen)
1410 {
1411     if ( srcLen == wxNO_LEN )
1412     {
1413         // count the number of bytes in input, including the trailing NULs
1414         const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1415         for ( srcLen = 1; *inBuff++; srcLen++ )
1416             ;
1417
1418         srcLen *= BYTES_PER_CHAR;
1419     }
1420     else // we already have the length
1421     {
1422         // we can only convert an entire number of UTF-16 characters
1423         if ( srcLen % BYTES_PER_CHAR )
1424             return wxCONV_FAILED;
1425     }
1426
1427     return srcLen;
1428 }
1429
1430 // case when in-memory representation is UTF-16 too
1431 #ifdef WC_UTF16
1432
1433 // ----------------------------------------------------------------------------
1434 // conversions without endianness change
1435 // ----------------------------------------------------------------------------
1436
1437 size_t
1438 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1439                                const char *src, size_t srcLen) const
1440 {
1441     // set up the scene for using memcpy() (which is presumably more efficient
1442     // than copying the bytes one by one)
1443     srcLen = GetLength(src, srcLen);
1444     if ( srcLen == wxNO_LEN )
1445         return wxCONV_FAILED;
1446
1447     const size_t inLen = srcLen / BYTES_PER_CHAR;
1448     if ( dst )
1449     {
1450         if ( dstLen < inLen )
1451             return wxCONV_FAILED;
1452
1453         memcpy(dst, src, srcLen);
1454     }
1455
1456     return inLen;
1457 }
1458
1459 size_t
1460 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1461                                  const wchar_t *src, size_t srcLen) const
1462 {
1463     if ( srcLen == wxNO_LEN )
1464         srcLen = wxWcslen(src) + 1;
1465
1466     srcLen *= BYTES_PER_CHAR;
1467
1468     if ( dst )
1469     {
1470         if ( dstLen < srcLen )
1471             return wxCONV_FAILED;
1472
1473         memcpy(dst, src, srcLen);
1474     }
1475
1476     return srcLen;
1477 }
1478
1479 // ----------------------------------------------------------------------------
1480 // endian-reversing conversions
1481 // ----------------------------------------------------------------------------
1482
1483 size_t
1484 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1485                            const char *src, size_t srcLen) const
1486 {
1487     srcLen = GetLength(src, srcLen);
1488     if ( srcLen == wxNO_LEN )
1489         return wxCONV_FAILED;
1490
1491     srcLen /= BYTES_PER_CHAR;
1492
1493     if ( dst )
1494     {
1495         if ( dstLen < srcLen )
1496             return wxCONV_FAILED;
1497
1498         const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1499         for ( size_t n = 0; n < srcLen; n++, inBuff++ )
1500         {
1501             *dst++ = wxUINT16_SWAP_ALWAYS(*inBuff);
1502         }
1503     }
1504
1505     return srcLen;
1506 }
1507
1508 size_t
1509 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1510                              const wchar_t *src, size_t srcLen) const
1511 {
1512     if ( srcLen == wxNO_LEN )
1513         srcLen = wxWcslen(src) + 1;
1514
1515     srcLen *= BYTES_PER_CHAR;
1516
1517     if ( dst )
1518     {
1519         if ( dstLen < srcLen )
1520             return wxCONV_FAILED;
1521
1522         wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1523         for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
1524         {
1525             *outBuff++ = wxUINT16_SWAP_ALWAYS(*src);
1526         }
1527     }
1528
1529     return srcLen;
1530 }
1531
1532 #else // !WC_UTF16: wchar_t is UTF-32
1533
1534 // ----------------------------------------------------------------------------
1535 // conversions without endianness change
1536 // ----------------------------------------------------------------------------
1537
1538 size_t
1539 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1540                                const char *src, size_t srcLen) const
1541 {
1542     srcLen = GetLength(src, srcLen);
1543     if ( srcLen == wxNO_LEN )
1544         return wxCONV_FAILED;
1545
1546     const size_t inLen = srcLen / BYTES_PER_CHAR;
1547     if ( !dst )
1548     {
1549         // optimization: return maximal space which could be needed for this
1550         // string even if the real size could be smaller if the buffer contains
1551         // any surrogates
1552         return inLen;
1553     }
1554
1555     size_t outLen = 0;
1556     const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1557     for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1558     {
1559         const wxUint32 ch = wxDecodeSurrogate(&inBuff);
1560         if ( !inBuff )
1561             return wxCONV_FAILED;
1562
1563         if ( ++outLen > dstLen )
1564             return wxCONV_FAILED;
1565
1566         *dst++ = ch;
1567     }
1568
1569
1570     return outLen;
1571 }
1572
1573 size_t
1574 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1575                                  const wchar_t *src, size_t srcLen) const
1576 {
1577     if ( srcLen == wxNO_LEN )
1578         srcLen = wxWcslen(src) + 1;
1579
1580     size_t outLen = 0;
1581     wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1582     for ( size_t n = 0; n < srcLen; n++ )
1583     {
1584         wxUint16 cc[2];
1585         const size_t numChars = encode_utf16(*src++, cc);
1586         if ( numChars == wxCONV_FAILED )
1587             return wxCONV_FAILED;
1588
1589         outLen += numChars * BYTES_PER_CHAR;
1590         if ( outBuff )
1591         {
1592             if ( outLen > dstLen )
1593                 return wxCONV_FAILED;
1594
1595             *outBuff++ = cc[0];
1596             if ( numChars == 2 )
1597             {
1598                 // second character of a surrogate
1599                 *outBuff++ = cc[1];
1600             }
1601         }
1602     }
1603
1604     return outLen;
1605 }
1606
1607 // ----------------------------------------------------------------------------
1608 // endian-reversing conversions
1609 // ----------------------------------------------------------------------------
1610
1611 size_t
1612 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1613                            const char *src, size_t srcLen) const
1614 {
1615     srcLen = GetLength(src, srcLen);
1616     if ( srcLen == wxNO_LEN )
1617         return wxCONV_FAILED;
1618
1619     const size_t inLen = srcLen / BYTES_PER_CHAR;
1620     if ( !dst )
1621     {
1622         // optimization: return maximal space which could be needed for this
1623         // string even if the real size could be smaller if the buffer contains
1624         // any surrogates
1625         return inLen;
1626     }
1627
1628     size_t outLen = 0;
1629     const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1630     for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1631     {
1632         wxUint32 ch;
1633         wxUint16 tmp[2];
1634
1635         tmp[0] = wxUINT16_SWAP_ALWAYS(*inBuff);
1636         inBuff++;
1637         tmp[1] = wxUINT16_SWAP_ALWAYS(*inBuff);
1638
1639         const size_t numChars = decode_utf16(tmp, ch);
1640         if ( numChars == wxCONV_FAILED )
1641             return wxCONV_FAILED;
1642
1643         if ( numChars == 2 )
1644             inBuff++;
1645
1646         if ( ++outLen > dstLen )
1647             return wxCONV_FAILED;
1648
1649         *dst++ = ch;
1650     }
1651
1652
1653     return outLen;
1654 }
1655
1656 size_t
1657 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1658                              const wchar_t *src, size_t srcLen) const
1659 {
1660     if ( srcLen == wxNO_LEN )
1661         srcLen = wxWcslen(src) + 1;
1662
1663     size_t outLen = 0;
1664     wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1665     for ( const wchar_t *srcEnd = src + srcLen; src < srcEnd; src++ )
1666     {
1667         wxUint16 cc[2];
1668         const size_t numChars = encode_utf16(*src, cc);
1669         if ( numChars == wxCONV_FAILED )
1670             return wxCONV_FAILED;
1671
1672         outLen += numChars * BYTES_PER_CHAR;
1673         if ( outBuff )
1674         {
1675             if ( outLen > dstLen )
1676                 return wxCONV_FAILED;
1677
1678             *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[0]);
1679             if ( numChars == 2 )
1680             {
1681                 // second character of a surrogate
1682                 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[1]);
1683             }
1684         }
1685     }
1686
1687     return outLen;
1688 }
1689
1690 #endif // WC_UTF16/!WC_UTF16
1691
1692
1693 // ============================================================================
1694 // UTF-32
1695 // ============================================================================
1696
// "straight" is the name used below for the UTF-32 converter class selected
// when WORDS_BIGENDIAN matches its byte order and "swap" is the name of the
// other one
#ifndef WORDS_BIGENDIAN
    #define wxMBConvUTF32straight  wxMBConvUTF32LE
    #define wxMBConvUTF32swap      wxMBConvUTF32BE
#else // WORDS_BIGENDIAN
    #define wxMBConvUTF32straight  wxMBConvUTF32BE
    #define wxMBConvUTF32swap      wxMBConvUTF32LE
#endif // !WORDS_BIGENDIAN/WORDS_BIGENDIAN
1704
1705
1706 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
1707 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
1708
1709 /* static */
1710 size_t wxMBConvUTF32Base::GetLength(const char *src, size_t srcLen)
1711 {
1712     if ( srcLen == wxNO_LEN )
1713     {
1714         // count the number of bytes in input, including the trailing NULs
1715         const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1716         for ( srcLen = 1; *inBuff++; srcLen++ )
1717             ;
1718
1719         srcLen *= BYTES_PER_CHAR;
1720     }
1721     else // we already have the length
1722     {
1723         // we can only convert an entire number of UTF-32 characters
1724         if ( srcLen % BYTES_PER_CHAR )
1725             return wxCONV_FAILED;
1726     }
1727
1728     return srcLen;
1729 }
1730
1731 // case when in-memory representation is UTF-16
1732 #ifdef WC_UTF16
1733
1734 // ----------------------------------------------------------------------------
1735 // conversions without endianness change
1736 // ----------------------------------------------------------------------------
1737
1738 size_t
1739 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1740                                const char *src, size_t srcLen) const
1741 {
1742     srcLen = GetLength(src, srcLen);
1743     if ( srcLen == wxNO_LEN )
1744         return wxCONV_FAILED;
1745
1746     const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1747     const size_t inLen = srcLen / BYTES_PER_CHAR;
1748     size_t outLen = 0;
1749     for ( size_t n = 0; n < inLen; n++ )
1750     {
1751         wxUint16 cc[2];
1752         const size_t numChars = encode_utf16(*inBuff++, cc);
1753         if ( numChars == wxCONV_FAILED )
1754             return wxCONV_FAILED;
1755
1756         outLen += numChars;
1757         if ( dst )
1758         {
1759             if ( outLen > dstLen )
1760                 return wxCONV_FAILED;
1761
1762             *dst++ = cc[0];
1763             if ( numChars == 2 )
1764             {
1765                 // second character of a surrogate
1766                 *dst++ = cc[1];
1767             }
1768         }
1769     }
1770
1771     return outLen;
1772 }
1773
1774 size_t
1775 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1776                                  const wchar_t *src, size_t srcLen) const
1777 {
1778     if ( srcLen == wxNO_LEN )
1779         srcLen = wxWcslen(src) + 1;
1780
1781     if ( !dst )
1782     {
1783         // optimization: return maximal space which could be needed for this
1784         // string instead of the exact amount which could be less if there are
1785         // any surrogates in the input
1786         //
1787         // we consider that surrogates are rare enough to make it worthwhile to
1788         // avoid running the loop below at the cost of slightly extra memory
1789         // consumption
1790         return srcLen * BYTES_PER_CHAR;
1791     }
1792
1793     wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1794     size_t outLen = 0;
1795     for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1796     {
1797         const wxUint32 ch = wxDecodeSurrogate(&src);
1798         if ( !src )
1799             return wxCONV_FAILED;
1800
1801         outLen += BYTES_PER_CHAR;
1802
1803         if ( outLen > dstLen )
1804             return wxCONV_FAILED;
1805
1806         *outBuff++ = ch;
1807     }
1808
1809     return outLen;
1810 }
1811
1812 // ----------------------------------------------------------------------------
1813 // endian-reversing conversions
1814 // ----------------------------------------------------------------------------
1815
1816 size_t
1817 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1818                            const char *src, size_t srcLen) const
1819 {
1820     srcLen = GetLength(src, srcLen);
1821     if ( srcLen == wxNO_LEN )
1822         return wxCONV_FAILED;
1823
1824     const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1825     const size_t inLen = srcLen / BYTES_PER_CHAR;
1826     size_t outLen = 0;
1827     for ( size_t n = 0; n < inLen; n++, inBuff++ )
1828     {
1829         wxUint16 cc[2];
1830         const size_t numChars = encode_utf16(wxUINT32_SWAP_ALWAYS(*inBuff), cc);
1831         if ( numChars == wxCONV_FAILED )
1832             return wxCONV_FAILED;
1833
1834         outLen += numChars;
1835         if ( dst )
1836         {
1837             if ( outLen > dstLen )
1838                 return wxCONV_FAILED;
1839
1840             *dst++ = cc[0];
1841             if ( numChars == 2 )
1842             {
1843                 // second character of a surrogate
1844                 *dst++ = cc[1];
1845             }
1846         }
1847     }
1848
1849     return outLen;
1850 }
1851
1852 size_t
1853 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1854                              const wchar_t *src, size_t srcLen) const
1855 {
1856     if ( srcLen == wxNO_LEN )
1857         srcLen = wxWcslen(src) + 1;
1858
1859     if ( !dst )
1860     {
1861         // optimization: return maximal space which could be needed for this
1862         // string instead of the exact amount which could be less if there are
1863         // any surrogates in the input
1864         //
1865         // we consider that surrogates are rare enough to make it worthwhile to
1866         // avoid running the loop below at the cost of slightly extra memory
1867         // consumption
1868         return srcLen*BYTES_PER_CHAR;
1869     }
1870
1871     wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1872     size_t outLen = 0;
1873     for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1874     {
1875         const wxUint32 ch = wxDecodeSurrogate(&src);
1876         if ( !src )
1877             return wxCONV_FAILED;
1878
1879         outLen += BYTES_PER_CHAR;
1880
1881         if ( outLen > dstLen )
1882             return wxCONV_FAILED;
1883
1884         *outBuff++ = wxUINT32_SWAP_ALWAYS(ch);
1885     }
1886
1887     return outLen;
1888 }
1889
1890 #else // !WC_UTF16: wchar_t is UTF-32
1891
1892 // ----------------------------------------------------------------------------
1893 // conversions without endianness change
1894 // ----------------------------------------------------------------------------
1895
1896 size_t
1897 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1898                                const char *src, size_t srcLen) const
1899 {
1900     // use memcpy() as it should be much faster than hand-written loop
1901     srcLen = GetLength(src, srcLen);
1902     if ( srcLen == wxNO_LEN )
1903         return wxCONV_FAILED;
1904
1905     const size_t inLen = srcLen/BYTES_PER_CHAR;
1906     if ( dst )
1907     {
1908         if ( dstLen < inLen )
1909             return wxCONV_FAILED;
1910
1911         memcpy(dst, src, srcLen);
1912     }
1913
1914     return inLen;
1915 }
1916
1917 size_t
1918 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1919                                  const wchar_t *src, size_t srcLen) const
1920 {
1921     if ( srcLen == wxNO_LEN )
1922         srcLen = wxWcslen(src) + 1;
1923
1924     srcLen *= BYTES_PER_CHAR;
1925
1926     if ( dst )
1927     {
1928         if ( dstLen < srcLen )
1929             return wxCONV_FAILED;
1930
1931         memcpy(dst, src, srcLen);
1932     }
1933
1934     return srcLen;
1935 }
1936
1937 // ----------------------------------------------------------------------------
1938 // endian-reversing conversions
1939 // ----------------------------------------------------------------------------
1940
1941 size_t
1942 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1943                            const char *src, size_t srcLen) const
1944 {
1945     srcLen = GetLength(src, srcLen);
1946     if ( srcLen == wxNO_LEN )
1947         return wxCONV_FAILED;
1948
1949     srcLen /= BYTES_PER_CHAR;
1950
1951     if ( dst )
1952     {
1953         if ( dstLen < srcLen )
1954             return wxCONV_FAILED;
1955
1956         const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1957         for ( size_t n = 0; n < srcLen; n++, inBuff++ )
1958         {
1959             *dst++ = wxUINT32_SWAP_ALWAYS(*inBuff);
1960         }
1961     }
1962
1963     return srcLen;
1964 }
1965
1966 size_t
1967 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1968                              const wchar_t *src, size_t srcLen) const
1969 {
1970     if ( srcLen == wxNO_LEN )
1971         srcLen = wxWcslen(src) + 1;
1972
1973     srcLen *= BYTES_PER_CHAR;
1974
1975     if ( dst )
1976     {
1977         if ( dstLen < srcLen )
1978             return wxCONV_FAILED;
1979
1980         wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1981         for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
1982         {
1983             *outBuff++ = wxUINT32_SWAP_ALWAYS(*src);
1984         }
1985     }
1986
1987     return srcLen;
1988 }
1989
1990 #endif // WC_UTF16/!WC_UTF16
1991
1992
1993 // ============================================================================
1994 // The classes doing conversion using the iconv_xxx() functions
1995 // ============================================================================
1996
1997 #ifdef HAVE_ICONV
1998
// VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
//     E2BIG if output buffer is _exactly_ as big as needed. Such case is
//     (unless there's yet another bug in glibc) the only case when iconv()
//     returns with (size_t)-1 (which means error) and says there are 0 bytes
//     left in the input buffer -- when _real_ error occurs,
//     bytes-left-in-input buffer is non-zero. Hence, this alternative test for
//     iconv() failure.
//     [This bug does not appear in glibc 2.2.]
//
// NB: the macro arguments are parenthesized so that arbitrary expressions
//     (e.g. using operators with lower precedence than "==") can be passed
//     to these macros safely
#if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
#define ICONV_FAILED(cres, bufLeft) (((cres) == (size_t)-1) && \
                                     (errno != E2BIG || (bufLeft) != 0))
#else
#define ICONV_FAILED(cres, bufLeft)  ((cres) == (size_t)-1)
#endif

// cast the address of the input pointer to the type expected by iconv() on
// this platform (ICONV_CONST is defined elsewhere in the build)
#define ICONV_CHAR_CAST(x)  ((ICONV_CONST char **)(x))

// the value returned by iconv_open() on failure
#define ICONV_T_INVALID ((iconv_t)-1)
2017
2018 #if SIZEOF_WCHAR_T == 4
2019     #define WC_BSWAP    wxUINT32_SWAP_ALWAYS
2020     #define WC_ENC      wxFONTENCODING_UTF32
2021 #elif SIZEOF_WCHAR_T == 2
2022     #define WC_BSWAP    wxUINT16_SWAP_ALWAYS
2023     #define WC_ENC      wxFONTENCODING_UTF16
2024 #else // sizeof(wchar_t) != 2 nor 4
2025     // does this ever happen?
2026     #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
2027 #endif
2028
2029 // ----------------------------------------------------------------------------
2030 // wxMBConv_iconv: encapsulates an iconv character set
2031 // ----------------------------------------------------------------------------
2032
2033 class wxMBConv_iconv : public wxMBConv
2034 {
2035 public:
2036     wxMBConv_iconv(const char *name);
2037     virtual ~wxMBConv_iconv();
2038
2039     // implement base class virtual methods
2040     virtual size_t ToWChar(wchar_t *dst, size_t dstLen,
2041                            const char *src, size_t srcLen = wxNO_LEN) const;
2042     virtual size_t FromWChar(char *dst, size_t dstLen,
2043                              const wchar_t *src, size_t srcLen = wxNO_LEN) const;
2044     virtual size_t GetMBNulLen() const;
2045
2046 #if wxUSE_UNICODE_UTF8
2047     virtual bool IsUTF8() const;
2048 #endif
2049
2050     virtual wxMBConv *Clone() const
2051     {
2052         wxMBConv_iconv *p = new wxMBConv_iconv(m_name.ToAscii());
2053         p->m_minMBCharWidth = m_minMBCharWidth;
2054         return p;
2055     }
2056
2057     bool IsOk() const
2058         { return (m2w != ICONV_T_INVALID) && (w2m != ICONV_T_INVALID); }
2059
2060 protected:
2061     // the iconv handlers used to translate from multibyte
2062     // to wide char and in the other direction
2063     iconv_t m2w,
2064             w2m;
2065
2066 #if wxUSE_THREADS
2067     // guards access to m2w and w2m objects
2068     wxMutex m_iconvMutex;
2069 #endif
2070
2071 private:
2072     // the name (for iconv_open()) of a wide char charset -- if none is
2073     // available on this machine, it will remain NULL
2074     static wxString ms_wcCharsetName;
2075
2076     // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
2077     // different endian-ness than the native one
2078     static bool ms_wcNeedsSwap;
2079
2080
2081     // name of the encoding handled by this conversion
2082     wxString m_name;
2083
2084     // cached result of GetMBNulLen(); set to 0 meaning "unknown"
2085     // initially
2086     size_t m_minMBCharWidth;
2087 };
2088
2089 // make the constructor available for unit testing
2090 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_iconv( const char* name )
2091 {
2092     wxMBConv_iconv* result = new wxMBConv_iconv( name );
2093     if ( !result->IsOk() )
2094     {
2095         delete result;
2096         return 0;
2097     }
2098
2099     return result;
2100 }
2101
2102 wxString wxMBConv_iconv::ms_wcCharsetName;
2103 bool wxMBConv_iconv::ms_wcNeedsSwap = false;
2104
2105 wxMBConv_iconv::wxMBConv_iconv(const char *name)
2106               : m_name(name)
2107 {
2108     m_minMBCharWidth = 0;
2109
2110     // check for charset that represents wchar_t:
2111     if ( ms_wcCharsetName.empty() )
2112     {
2113         wxLogTrace(TRACE_STRCONV, _T("Looking for wide char codeset:"));
2114
2115 #if wxUSE_FONTMAP
2116         const wxChar **names = wxFontMapperBase::GetAllEncodingNames(WC_ENC);
2117 #else // !wxUSE_FONTMAP
2118         static const wxChar *names_static[] =
2119         {
2120 #if SIZEOF_WCHAR_T == 4
2121             _T("UCS-4"),
2122 #elif SIZEOF_WCHAR_T = 2
2123             _T("UCS-2"),
2124 #endif
2125             NULL
2126         };
2127         const wxChar **names = names_static;
2128 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
2129
2130         for ( ; *names && ms_wcCharsetName.empty(); ++names )
2131         {
2132             const wxString nameCS(*names);
2133
2134             // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
2135             wxString nameXE(nameCS);
2136
2137 #ifdef WORDS_BIGENDIAN
2138                 nameXE += _T("BE");
2139 #else // little endian
2140                 nameXE += _T("LE");
2141 #endif
2142
2143             wxLogTrace(TRACE_STRCONV, _T("  trying charset \"%s\""),
2144                        nameXE.c_str());
2145
2146             m2w = iconv_open(nameXE.ToAscii(), name);
2147             if ( m2w == ICONV_T_INVALID )
2148             {
2149                 // try charset w/o bytesex info (e.g. "UCS4")
2150                 wxLogTrace(TRACE_STRCONV, _T("  trying charset \"%s\""),
2151                            nameCS.c_str());
2152                 m2w = iconv_open(nameCS.ToAscii(), name);
2153
2154                 // and check for bytesex ourselves:
2155                 if ( m2w != ICONV_T_INVALID )
2156                 {
2157                     char    buf[2], *bufPtr;
2158                     wchar_t wbuf[2];
2159                     size_t  insz, outsz;
2160                     size_t  res;
2161
2162                     buf[0] = 'A';
2163                     buf[1] = 0;
2164                     wbuf[0] = 0;
2165                     insz = 2;
2166                     outsz = SIZEOF_WCHAR_T * 2;
2167                     char* wbufPtr = (char*)wbuf;
2168                     bufPtr = buf;
2169
2170                     res = iconv(
2171                         m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
2172                         &wbufPtr, &outsz);
2173
2174                     if (ICONV_FAILED(res, insz))
2175                     {
2176                         wxLogLastError(wxT("iconv"));
2177                         wxLogError(_("Conversion to charset '%s' doesn't work."),
2178                                    nameCS.c_str());
2179                     }
2180                     else // ok, can convert to this encoding, remember it
2181                     {
2182                         ms_wcCharsetName = nameCS;
2183                         ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
2184                     }
2185                 }
2186             }
2187             else // use charset not requiring byte swapping
2188             {
2189                 ms_wcCharsetName = nameXE;
2190             }
2191         }
2192
2193         wxLogTrace(TRACE_STRCONV,
2194                    wxT("iconv wchar_t charset is \"%s\"%s"),
2195                    ms_wcCharsetName.empty() ? wxString("<none>")
2196                                             : ms_wcCharsetName,
2197                    ms_wcNeedsSwap ? _T(" (needs swap)")
2198                                   : _T(""));
2199     }
2200     else // we already have ms_wcCharsetName
2201     {
2202         m2w = iconv_open(ms_wcCharsetName.ToAscii(), name);
2203     }
2204
2205     if ( ms_wcCharsetName.empty() )
2206     {
2207         w2m = ICONV_T_INVALID;
2208     }
2209     else
2210     {
2211         w2m = iconv_open(name, ms_wcCharsetName.ToAscii());
2212         if ( w2m == ICONV_T_INVALID )
2213         {
2214             wxLogTrace(TRACE_STRCONV,
2215                        wxT("\"%s\" -> \"%s\" works but not the converse!?"),
2216                        ms_wcCharsetName.c_str(), name);
2217         }
2218     }
2219 }
2220
2221 wxMBConv_iconv::~wxMBConv_iconv()
2222 {
2223     if ( m2w != ICONV_T_INVALID )
2224         iconv_close(m2w);
2225     if ( w2m != ICONV_T_INVALID )
2226         iconv_close(w2m);
2227 }
2228
2229 size_t
2230 wxMBConv_iconv::ToWChar(wchar_t *dst, size_t dstLen,
2231                         const char *src, size_t srcLen) const
2232 {
2233     if ( srcLen == wxNO_LEN )
2234     {
2235         // find the string length: notice that must be done differently for
2236         // NUL-terminated strings and UTF-16/32 which are terminated with 2/4
2237         // consecutive NULs
2238         const size_t nulLen = GetMBNulLen();
2239         switch ( nulLen )
2240         {
2241             default:
2242                 return wxCONV_FAILED;
2243
2244             case 1:
2245                 srcLen = strlen(src); // arguably more optimized than our version
2246                 break;
2247
2248             case 2:
2249             case 4:
2250                 // for UTF-16/32 not only we need to have 2/4 consecutive NULs
2251                 // but they also have to start at character boundary and not
2252                 // span two adjacent characters
2253                 const char *p;
2254                 for ( p = src; NotAllNULs(p, nulLen); p += nulLen )
2255                     ;
2256                 srcLen = p - src;
2257                 break;
2258         }
2259
2260         // when we're determining the length of the string ourselves we count
2261         // the terminating NUL(s) as part of it and always NUL-terminate the
2262         // output
2263         srcLen += nulLen;
2264     }
2265
2266     // we express length in the number of (wide) characters but iconv always
2267     // counts buffer sizes it in bytes
2268     dstLen *= SIZEOF_WCHAR_T;
2269
2270 #if wxUSE_THREADS
2271     // NB: iconv() is MT-safe, but each thread must use its own iconv_t handle.
2272     //     Unfortunately there are a couple of global wxCSConv objects such as
2273     //     wxConvLocal that are used all over wx code, so we have to make sure
2274     //     the handle is used by at most one thread at the time. Otherwise
2275     //     only a few wx classes would be safe to use from non-main threads
2276     //     as MB<->WC conversion would fail "randomly".
2277     wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
2278 #endif // wxUSE_THREADS
2279
2280     size_t res, cres;
2281     const char *pszPtr = src;
2282
2283     if ( dst )
2284     {
2285         char* bufPtr = (char*)dst;
2286
2287         // have destination buffer, convert there
2288         size_t dstLenOrig = dstLen;
2289         cres = iconv(m2w,
2290                      ICONV_CHAR_CAST(&pszPtr), &srcLen,
2291                      &bufPtr, &dstLen);
2292
2293         // convert the number of bytes converted as returned by iconv to the
2294         // number of (wide) characters converted that we need
2295         res = (dstLenOrig - dstLen) / SIZEOF_WCHAR_T;
2296
2297         if (ms_wcNeedsSwap)
2298         {
2299             // convert to native endianness
2300             for ( unsigned i = 0; i < res; i++ )
2301                 dst[i] = WC_BSWAP(dst[i]);
2302         }
2303     }
2304     else // no destination buffer
2305     {
2306         // convert using temp buffer to calculate the size of the buffer needed
2307         wchar_t tbuf[8];
2308         res = 0;
2309
2310         do
2311         {
2312             char* bufPtr = (char*)tbuf;
2313             dstLen = 8 * SIZEOF_WCHAR_T;
2314
2315             cres = iconv(m2w,
2316                          ICONV_CHAR_CAST(&pszPtr), &srcLen,
2317                          &bufPtr, &dstLen );
2318
2319             res += 8 - (dstLen / SIZEOF_WCHAR_T);
2320         }
2321         while ((cres == (size_t)-1) && (errno == E2BIG));
2322     }
2323
2324     if (ICONV_FAILED(cres, srcLen))
2325     {
2326         //VS: it is ok if iconv fails, hence trace only
2327         wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
2328         return wxCONV_FAILED;
2329     }
2330
2331     return res;
2332 }
2333
2334 size_t wxMBConv_iconv::FromWChar(char *dst, size_t dstLen,
2335                                  const wchar_t *src, size_t srcLen) const
2336 {
2337 #if wxUSE_THREADS
2338     // NB: explained in MB2WC
2339     wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
2340 #endif
2341
2342     if ( srcLen == wxNO_LEN )
2343         srcLen = wxWcslen(src) + 1;
2344
2345     size_t inbuflen = srcLen * SIZEOF_WCHAR_T;
2346     size_t outbuflen = dstLen;
2347     size_t res, cres;
2348
2349     wchar_t *tmpbuf = 0;
2350
2351     if (ms_wcNeedsSwap)
2352     {
2353         // need to copy to temp buffer to switch endianness
2354         // (doing WC_BSWAP twice on the original buffer won't help, as it
2355         //  could be in read-only memory, or be accessed in some other thread)
2356         tmpbuf = (wchar_t *)malloc(inbuflen + SIZEOF_WCHAR_T);
2357         for ( size_t i = 0; i < srcLen; i++ )
2358             tmpbuf[i] = WC_BSWAP(src[i]);
2359
2360         tmpbuf[srcLen] = L'\0';
2361         src = tmpbuf;
2362     }
2363
2364     char* inbuf = (char*)src;
2365     if ( dst )
2366     {
2367         // have destination buffer, convert there
2368         cres = iconv(w2m, ICONV_CHAR_CAST(&inbuf), &inbuflen, &dst, &outbuflen);
2369
2370         res = dstLen - outbuflen;
2371     }
2372     else // no destination buffer
2373     {
2374         // convert using temp buffer to calculate the size of the buffer needed
2375         char tbuf[16];
2376         res = 0;
2377         do
2378         {
2379             dst = tbuf;
2380             outbuflen = 16;
2381
2382             cres = iconv(w2m, ICONV_CHAR_CAST(&inbuf), &inbuflen, &dst, &outbuflen);
2383
2384             res += 16 - outbuflen;
2385         }
2386         while ((cres == (size_t)-1) && (errno == E2BIG));
2387     }
2388
2389     if (ms_wcNeedsSwap)
2390     {
2391         free(tmpbuf);
2392     }
2393
2394     if (ICONV_FAILED(cres, inbuflen))
2395     {
2396         wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
2397         return wxCONV_FAILED;
2398     }
2399
2400     return res;
2401 }
2402
2403 size_t wxMBConv_iconv::GetMBNulLen() const
2404 {
2405     if ( m_minMBCharWidth == 0 )
2406     {
2407         wxMBConv_iconv * const self = wxConstCast(this, wxMBConv_iconv);
2408
2409 #if wxUSE_THREADS
2410         // NB: explained in MB2WC
2411         wxMutexLocker lock(self->m_iconvMutex);
2412 #endif
2413
2414         const wchar_t *wnul = L"";
2415         char buf[8]; // should be enough for NUL in any encoding
2416         size_t inLen = sizeof(wchar_t),
2417                outLen = WXSIZEOF(buf);
2418         char *inBuff = (char *)wnul;
2419         char *outBuff = buf;
2420         if ( iconv(w2m, ICONV_CHAR_CAST(&inBuff), &inLen, &outBuff, &outLen) == (size_t)-1 )
2421         {
2422             self->m_minMBCharWidth = (size_t)-1;
2423         }
2424         else // ok
2425         {
2426             self->m_minMBCharWidth = outBuff - buf;
2427         }
2428     }
2429
2430     return m_minMBCharWidth;
2431 }
2432
2433 #if wxUSE_UNICODE_UTF8
2434 bool wxMBConv_iconv::IsUTF8() const
2435 {
2436     return wxStricmp(m_name, "UTF-8") == 0 ||
2437            wxStricmp(m_name, "UTF8") == 0;
2438 }
2439 #endif
2440
2441 #endif // HAVE_ICONV
2442
2443
2444 // ============================================================================
2445 // Win32 conversion classes
2446 // ============================================================================
2447
2448 #ifdef wxHAVE_WIN32_MB2WC
2449
2450 // from utils.cpp
2451 #if wxUSE_FONTMAP
2452 extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const char *charset);
2453 extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
2454 #endif
2455
2456 class wxMBConv_win32 : public wxMBConv
2457 {
2458 public:
2459     wxMBConv_win32()
2460     {
2461         m_CodePage = CP_ACP;
2462         m_minMBCharWidth = 0;
2463     }
2464
2465     wxMBConv_win32(const wxMBConv_win32& conv)
2466         : wxMBConv()
2467     {
2468         m_CodePage = conv.m_CodePage;
2469         m_minMBCharWidth = conv.m_minMBCharWidth;
2470     }
2471
2472 #if wxUSE_FONTMAP
2473     wxMBConv_win32(const char* name)
2474     {
2475         m_CodePage = wxCharsetToCodepage(name);
2476         m_minMBCharWidth = 0;
2477     }
2478
2479     wxMBConv_win32(wxFontEncoding encoding)
2480     {
2481         m_CodePage = wxEncodingToCodepage(encoding);
2482         m_minMBCharWidth = 0;
2483     }
2484 #endif // wxUSE_FONTMAP
2485
2486     virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2487     {
2488         // note that we have to use MB_ERR_INVALID_CHARS flag as it without it
2489         // the behaviour is not compatible with the Unix version (using iconv)
2490         // and break the library itself, e.g. wxTextInputStream::NextChar()
2491         // wouldn't work if reading an incomplete MB char didn't result in an
2492         // error
2493         //
2494         // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
2495         // Win XP or newer and it is not supported for UTF-[78] so we always
2496         // use our own conversions in this case. See
2497         //     http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
2498         //     http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
2499         if ( m_CodePage == CP_UTF8 )
2500         {
2501             return wxMBConvUTF8().MB2WC(buf, psz, n);
2502         }
2503
2504         if ( m_CodePage == CP_UTF7 )
2505         {
2506             return wxMBConvUTF7().MB2WC(buf, psz, n);
2507         }
2508
2509         int flags = 0;
2510         if ( (m_CodePage < 50000 && m_CodePage != CP_SYMBOL) &&
2511                 IsAtLeastWin2kSP4() )
2512         {
2513             flags = MB_ERR_INVALID_CHARS;
2514         }
2515
2516         const size_t len = ::MultiByteToWideChar
2517                              (
2518                                 m_CodePage,     // code page
2519                                 flags,          // flags: fall on error
2520                                 psz,            // input string
2521                                 -1,             // its length (NUL-terminated)
2522                                 buf,            // output string
2523                                 buf ? n : 0     // size of output buffer
2524                              );
2525         if ( !len )
2526         {
2527             // function totally failed
2528             return wxCONV_FAILED;
2529         }
2530
2531         // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
2532         // check if we succeeded, by doing a double trip:
2533         if ( !flags && buf )
2534         {
2535             const size_t mbLen = strlen(psz);
2536             wxCharBuffer mbBuf(mbLen);
2537             if ( ::WideCharToMultiByte
2538                    (
2539                       m_CodePage,
2540                       0,
2541                       buf,
2542                       -1,
2543                       mbBuf.data(),
2544                       mbLen + 1,        // size in bytes, not length
2545                       NULL,
2546                       NULL
2547                    ) == 0 ||
2548                   strcmp(mbBuf, psz) != 0 )
2549             {
2550                 // we didn't obtain the same thing we started from, hence
2551                 // the conversion was lossy and we consider that it failed
2552                 return wxCONV_FAILED;
2553             }
2554         }
2555
2556         // note that it returns count of written chars for buf != NULL and size
2557         // of the needed buffer for buf == NULL so in either case the length of
2558         // the string (which never includes the terminating NUL) is one less
2559         return len - 1;
2560     }
2561
2562     virtual size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
2563     {
2564         /*
2565             we have a problem here: by default, WideCharToMultiByte() may
2566             replace characters unrepresentable in the target code page with bad
2567             quality approximations such as turning "1/2" symbol (U+00BD) into
2568             "1" for the code pages which don't have it and we, obviously, want
2569             to avoid this at any price
2570
2571             the trouble is that this function does it _silently_, i.e. it won't
2572             even tell us whether it did or not... Win98/2000 and higher provide
2573             WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
2574             we have to resort to a round trip, i.e. check that converting back
2575             results in the same string -- this is, of course, expensive but
2576             otherwise we simply can't be sure to not garble the data.
2577          */
2578
2579         // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
2580         // it doesn't work with CJK encodings (which we test for rather roughly
2581         // here...) nor with UTF-7/8 nor, of course, with Windows versions not
2582         // supporting it
2583         BOOL usedDef wxDUMMY_INITIALIZE(false);
2584         BOOL *pUsedDef;
2585         int flags;
2586         if ( CanUseNoBestFit() && m_CodePage < 50000 )
2587         {
2588             // it's our lucky day
2589             flags = WC_NO_BEST_FIT_CHARS;
2590             pUsedDef = &usedDef;
2591         }
2592         else // old system or unsupported encoding
2593         {
2594             flags = 0;
2595             pUsedDef = NULL;
2596         }
2597
2598         const size_t len = ::WideCharToMultiByte
2599                              (
2600                                 m_CodePage,     // code page
2601                                 flags,          // either none or no best fit
2602                                 pwz,            // input string
2603                                 -1,             // it is (wide) NUL-terminated
2604                                 buf,            // output buffer
2605                                 buf ? n : 0,    // and its size
2606                                 NULL,           // default "replacement" char
2607                                 pUsedDef        // [out] was it used?
2608                              );
2609
2610         if ( !len )
2611         {
2612             // function totally failed
2613             return wxCONV_FAILED;
2614         }
2615
2616         // we did something, check if we really succeeded
2617         if ( flags )
2618         {
2619             // check if the conversion failed, i.e. if any replacements
2620             // were done
2621             if ( usedDef )
2622                 return wxCONV_FAILED;
2623         }
2624         else // we must resort to double tripping...
2625         {
2626             // first we need to ensure that we really have the MB data: this is
2627             // not the case if we're called with NULL buffer, in which case we
2628             // need to do the conversion yet again
2629             wxCharBuffer bufDef;
2630             if ( !buf )
2631             {
2632                 bufDef = wxCharBuffer(len);
2633                 buf = bufDef.data();
2634                 if ( !::WideCharToMultiByte(m_CodePage, flags, pwz, -1,
2635                                             buf, len, NULL, NULL) )
2636                     return wxCONV_FAILED;
2637             }
2638
2639             if ( !n )
2640                 n = wcslen(pwz);
2641             wxWCharBuffer wcBuf(n);
2642             if ( MB2WC(wcBuf.data(), buf, n + 1) == wxCONV_FAILED ||
2643                     wcscmp(wcBuf, pwz) != 0 )
2644             {
2645                 // we didn't obtain the same thing we started from, hence
2646                 // the conversion was lossy and we consider that it failed
2647                 return wxCONV_FAILED;
2648             }
2649         }
2650
2651         // see the comment above for the reason of "len - 1"
2652         return len - 1;
2653     }
2654
2655     virtual size_t GetMBNulLen() const
2656     {
2657         if ( m_minMBCharWidth == 0 )
2658         {
2659             int len = ::WideCharToMultiByte
2660                         (
2661                             m_CodePage,     // code page
2662                             0,              // no flags
2663                             L"",            // input string
2664                             1,              // translate just the NUL
2665                             NULL,           // output buffer
2666                             0,              // and its size
2667                             NULL,           // no replacement char
2668                             NULL            // [out] don't care if it was used
2669                         );
2670
2671             wxMBConv_win32 * const self = wxConstCast(this, wxMBConv_win32);
2672             switch ( len )
2673             {
2674                 default:
2675                     wxLogDebug(_T("Unexpected NUL length %d"), len);
2676                     self->m_minMBCharWidth = (size_t)-1;
2677                     break;
2678
2679                 case 0:
2680                     self->m_minMBCharWidth = (size_t)-1;
2681                     break;
2682
2683                 case 1:
2684                 case 2:
2685                 case 4:
2686                     self->m_minMBCharWidth = len;
2687                     break;
2688             }
2689         }
2690
2691         return m_minMBCharWidth;
2692     }
2693
2694     virtual wxMBConv *Clone() const { return new wxMBConv_win32(*this); }
2695
2696     bool IsOk() const { return m_CodePage != -1; }
2697
2698 private:
2699     static bool CanUseNoBestFit()
2700     {
2701         static int s_isWin98Or2k = -1;
2702
2703         if ( s_isWin98Or2k == -1 )
2704         {
2705             int verMaj, verMin;
2706             switch ( wxGetOsVersion(&verMaj, &verMin) )
2707             {
2708                 case wxOS_WINDOWS_9X:
2709                     s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
2710                     break;
2711
2712                 case wxOS_WINDOWS_NT:
2713                     s_isWin98Or2k = verMaj >= 5;
2714                     break;
2715
2716                 default:
2717                     // unknown: be conservative by default
2718                     s_isWin98Or2k = 0;
2719                     break;
2720             }
2721
2722             wxASSERT_MSG( s_isWin98Or2k != -1, _T("should be set above") );
2723         }
2724
2725         return s_isWin98Or2k == 1;
2726     }
2727
2728     static bool IsAtLeastWin2kSP4()
2729     {
2730 #ifdef __WXWINCE__
2731         return false;
2732 #else
2733         static int s_isAtLeastWin2kSP4 = -1;
2734
2735         if ( s_isAtLeastWin2kSP4 == -1 )
2736         {
2737             OSVERSIONINFOEX ver;
2738
2739             memset(&ver, 0, sizeof(ver));
2740             ver.dwOSVersionInfoSize = sizeof(ver);
2741             GetVersionEx((OSVERSIONINFO*)&ver);
2742
2743             s_isAtLeastWin2kSP4 =
2744               ((ver.dwMajorVersion > 5) || // Vista+
2745                (ver.dwMajorVersion == 5 && ver.dwMinorVersion > 0) || // XP/2003
2746                (ver.dwMajorVersion == 5 && ver.dwMinorVersion == 0 &&
2747                ver.wServicePackMajor >= 4)) // 2000 SP4+
2748               ? 1 : 0;
2749         }
2750
2751         return s_isAtLeastWin2kSP4 == 1;
2752 #endif
2753     }
2754
2755
2756     // the code page we're working with
2757     long m_CodePage;
2758
2759     // cached result of GetMBNulLen(), set to 0 initially meaning
2760     // "unknown"
2761     size_t m_minMBCharWidth;
2762 };
2763
2764 #endif // wxHAVE_WIN32_MB2WC
2765
2766
2767 // ============================================================================
2768 // wxEncodingConverter based conversion classes
2769 // ============================================================================
2770
2771 #if wxUSE_FONTMAP
2772
2773 class wxMBConv_wxwin : public wxMBConv
2774 {
2775 private:
2776     void Init()
2777     {
2778         // Refuse to use broken wxEncodingConverter code for Mac-specific encodings.
2779         // The wxMBConv_cf class does a better job.
2780         m_ok = (m_enc < wxFONTENCODING_MACMIN || m_enc > wxFONTENCODING_MACMAX) &&
2781                m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
2782                w2m.Init(wxFONTENCODING_UNICODE, m_enc);
2783     }
2784
2785 public:
2786     // temporarily just use wxEncodingConverter stuff,
2787     // so that it works while a better implementation is built
2788     wxMBConv_wxwin(const char* name)
2789     {
2790         if (name)
2791             m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
2792         else
2793             m_enc = wxFONTENCODING_SYSTEM;
2794
2795         Init();
2796     }
2797
2798     wxMBConv_wxwin(wxFontEncoding enc)
2799     {
2800         m_enc = enc;
2801
2802         Init();
2803     }
2804
2805     size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
2806     {
2807         size_t inbuf = strlen(psz);
2808         if (buf)
2809         {
2810             if (!m2w.Convert(psz, buf))
2811                 return wxCONV_FAILED;
2812         }
2813         return inbuf;
2814     }
2815
2816     size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
2817     {
2818         const size_t inbuf = wxWcslen(psz);
2819         if (buf)
2820         {
2821             if (!w2m.Convert(psz, buf))
2822                 return wxCONV_FAILED;
2823         }
2824
2825         return inbuf;
2826     }
2827
2828     virtual size_t GetMBNulLen() const
2829     {
2830         switch ( m_enc )
2831         {
2832             case wxFONTENCODING_UTF16BE:
2833             case wxFONTENCODING_UTF16LE:
2834                 return 2;
2835
2836             case wxFONTENCODING_UTF32BE:
2837             case wxFONTENCODING_UTF32LE:
2838                 return 4;
2839
2840             default:
2841                 return 1;
2842         }
2843     }
2844
2845     virtual wxMBConv *Clone() const { return new wxMBConv_wxwin(m_enc); }
2846
2847     bool IsOk() const { return m_ok; }
2848
2849 public:
2850     wxFontEncoding m_enc;
2851     wxEncodingConverter m2w, w2m;
2852
2853 private:
2854     // were we initialized successfully?
2855     bool m_ok;
2856
2857     DECLARE_NO_COPY_CLASS(wxMBConv_wxwin)
2858 };
2859
2860 // make the constructors available for unit testing
2861 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_wxwin( const char* name )
2862 {
2863     wxMBConv_wxwin* result = new wxMBConv_wxwin( name );
2864     if ( !result->IsOk() )
2865     {
2866         delete result;
2867         return 0;
2868     }
2869
2870     return result;
2871 }
2872
2873 #endif // wxUSE_FONTMAP
2874
2875 // ============================================================================
2876 // wxCSConv implementation
2877 // ============================================================================
2878
2879 void wxCSConv::Init()
2880 {
2881     m_name = NULL;
2882     m_convReal =  NULL;
2883     m_deferred = true;
2884 }
2885
2886 wxCSConv::wxCSConv(const wxString& charset)
2887 {
2888     Init();
2889
2890     if ( !charset.empty() )
2891     {
2892         SetName(charset.ToAscii());
2893     }
2894
2895 #if wxUSE_FONTMAP
2896     m_encoding = wxFontMapperBase::GetEncodingFromName(charset);
2897     if ( m_encoding == wxFONTENCODING_MAX )
2898     {
2899         // set to unknown/invalid value
2900         m_encoding = wxFONTENCODING_SYSTEM;
2901     }
2902     else if ( m_encoding == wxFONTENCODING_DEFAULT )
2903     {
2904         // wxFONTENCODING_DEFAULT is same as US-ASCII in this context
2905         m_encoding = wxFONTENCODING_ISO8859_1;
2906     }
2907 #else
2908     m_encoding = wxFONTENCODING_SYSTEM;
2909 #endif
2910 }
2911
2912 wxCSConv::wxCSConv(wxFontEncoding encoding)
2913 {
2914     if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
2915     {
2916         wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
2917
2918         encoding = wxFONTENCODING_SYSTEM;
2919     }
2920
2921     Init();
2922
2923     m_encoding = encoding;
2924 }
2925
2926 wxCSConv::~wxCSConv()
2927 {
2928     Clear();
2929 }
2930
2931 wxCSConv::wxCSConv(const wxCSConv& conv)
2932         : wxMBConv()
2933 {
2934     Init();
2935
2936     SetName(conv.m_name);
2937     m_encoding = conv.m_encoding;
2938 }
2939
2940 wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
2941 {
2942     Clear();
2943
2944     SetName(conv.m_name);
2945     m_encoding = conv.m_encoding;
2946
2947     return *this;
2948 }
2949
2950 void wxCSConv::Clear()
2951 {
2952     free(m_name);
2953     delete m_convReal;
2954
2955     m_name = NULL;
2956     m_convReal = NULL;
2957 }
2958
2959 void wxCSConv::SetName(const char *charset)
2960 {
2961     if (charset)
2962     {
2963         m_name = wxStrdup(charset);
2964         m_deferred = true;
2965     }
2966 }
2967
#if wxUSE_FONTMAP

// Map from an encoding to a charset name, used by wxCSConv::DoCreate() to
// remember the name which worked with iconv for this encoding; an empty
// string is stored if none of the names worked.
WX_DECLARE_HASH_MAP( wxFontEncoding, wxString,
                     wxIntegerHash, wxIntegerEqual,
                     wxEncodingNameCache );

static wxEncodingNameCache gs_nameCache;

#endif // wxUSE_FONTMAP
2975
// Creates the object which will really perform the conversions for the
// charset name (m_name) and/or the encoding (m_encoding) of this converter.
//
// Returns a new heap-allocated wxMBConv or NULL. NULL is returned in the
// following cases:
//  - for wxFONTENCODING_ISO8859_1 and wxFONTENCODING_DEFAULT, which don't
//    need any separate conversion object
//  - if a previous failure to create an iconv-based converter for this
//    encoding was cached in gs_nameCache
//  - in Win32 builds without wxUSE_FONTMAP
//  - if none of the methods tried below worked; an error is logged in this
//    case before returning
wxMBConv *wxCSConv::DoCreate() const
{
#if wxUSE_FONTMAP
    wxLogTrace(TRACE_STRCONV,
               wxT("creating conversion for %s"),
               (m_name ? m_name
                       : (const char*)wxFontMapperBase::GetEncodingName(m_encoding).mb_str()));
#endif // wxUSE_FONTMAP

    // check for the special case of ASCII or ISO8859-1 charset: as we have
    // special knowledge of it anyhow, we don't need to create a special
    // conversion object
    if ( m_encoding == wxFONTENCODING_ISO8859_1 ||
            m_encoding == wxFONTENCODING_DEFAULT )
    {
        // don't convert at all
        return NULL;
    }

    // we trust OS to do conversion better than we can so try external
    // conversion methods first
    //
    // the full order is:
    //      1. OS conversion (iconv() under Unix or Win32 API)
    //      2. hard coded conversions for UTF
    //      3. wxEncodingConverter as fall back

    // step (1)
#ifdef HAVE_ICONV
#if !wxUSE_FONTMAP
    if ( m_name )
#endif // !wxUSE_FONTMAP
    {
#if wxUSE_FONTMAP
        wxFontEncoding encoding(m_encoding);
#endif

        // first try the charset name exactly as it was given to us
        if ( m_name )
        {
            wxMBConv_iconv *conv = new wxMBConv_iconv(m_name);
            if ( conv->IsOk() )
                return conv;

            delete conv;

#if wxUSE_FONTMAP
            encoding =
                wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
#endif // wxUSE_FONTMAP
        }
#if wxUSE_FONTMAP
        {
            // then try the name remembered for this encoding, if any: an
            // empty cached name means that all names have already been tried
            // without success
            const wxEncodingNameCache::iterator it = gs_nameCache.find(encoding);
            if ( it != gs_nameCache.end() )
            {
                if ( it->second.empty() )
                    return NULL;

                wxMBConv_iconv *conv = new wxMBConv_iconv(it->second.ToAscii());
                if ( conv->IsOk() )
                    return conv;

                delete conv;
            }

            const wxChar** names = wxFontMapperBase::GetAllEncodingNames(encoding);
            // CS : in case this does not return valid names (eg for MacRoman)
            // encoding got a 'failure' entry in the cache all the same,
            // although it just has to be created using a different method, so
            // only store failed iconv creation attempts (or perhaps we
            // shouldn't do this at all ?)
            if ( names[0] != NULL )
            {
                for ( ; *names; ++names )
                {
                    // FIXME-UTF8: wxFontMapperBase::GetAllEncodingNames()
                    //             will need changes that will obsolete this
                    wxString name(*names);
                    wxMBConv_iconv *conv = new wxMBConv_iconv(name.ToAscii());
                    if ( conv->IsOk() )
                    {
                        gs_nameCache[encoding] = *names;
                        return conv;
                    }

                    delete conv;
                }

                gs_nameCache[encoding] = _T(""); // cache the failure
            }
        }
#endif // wxUSE_FONTMAP
    }
#endif // HAVE_ICONV

#ifdef wxHAVE_WIN32_MB2WC
    {
#if wxUSE_FONTMAP
        wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
                                      : new wxMBConv_win32(m_encoding);
        if ( conv->IsOk() )
            return conv;

        delete conv;
#else
        // NB: the rest of this function is never reached in this
        //     configuration
        return NULL;
#endif
    }
#endif // wxHAVE_WIN32_MB2WC

#ifdef __DARWIN__
    {
        // leave UTF16 and UTF32 to the built-ins of wx
        if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
            ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
        {
#if wxUSE_FONTMAP
            wxMBConv_cf *conv = m_name ? new wxMBConv_cf(m_name)
                                          : new wxMBConv_cf(m_encoding);
#else
            wxMBConv_cf *conv = new wxMBConv_cf(m_encoding);
#endif

            if ( conv->IsOk() )
                 return conv;

            delete conv;
        }
    }
#endif // __DARWIN__

    // step (2)
    wxFontEncoding enc = m_encoding;
#if wxUSE_FONTMAP
    if ( enc == wxFONTENCODING_SYSTEM && m_name )
    {
        // use "false" to suppress interactive dialogs -- we can be called from
        // anywhere and popping up a dialog from here is the last thing we want to
        // do
        enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
    }
#endif // wxUSE_FONTMAP

    switch ( enc )
    {
        case wxFONTENCODING_UTF7:
             return new wxMBConvUTF7;

        case wxFONTENCODING_UTF8:
             return new wxMBConvUTF8;

        case wxFONTENCODING_UTF16BE:
             return new wxMBConvUTF16BE;

        case wxFONTENCODING_UTF16LE:
             return new wxMBConvUTF16LE;

        case wxFONTENCODING_UTF32BE:
             return new wxMBConvUTF32BE;

        case wxFONTENCODING_UTF32LE:
             return new wxMBConvUTF32LE;

        default:
             // nothing to do but put here to suppress gcc warnings
             break;
    }

    // step (3)
#if wxUSE_FONTMAP
    {
        wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
                                      : new wxMBConv_wxwin(m_encoding);
        if ( conv->IsOk() )
            return conv;

        delete conv;
    }
#endif // wxUSE_FONTMAP

    // NB: This is a hack to prevent deadlock. What could otherwise happen
    //     in Unicode build: wxConvLocal creation ends up being here
    //     because of some failure and logs the error. But wxLog will try to
    //     attach a timestamp, for which it will need wxConvLocal (to convert
    //     time to char* and then wchar_t*), but that fails, tries to log the
    //     error, but wxLog has an (already locked) critical section that
    //     guards the static buffer.
    static bool alreadyLoggingError = false;
    if (!alreadyLoggingError)
    {
        alreadyLoggingError = true;
        wxLogError(_("Cannot convert from the charset '%s'!"),
                   m_name ? m_name
                      :
#if wxUSE_FONTMAP
                         (const char*)wxFontMapperBase::GetEncodingDescription(m_encoding).ToAscii()
#else // !wxUSE_FONTMAP
                         (const char*)wxString::Format(_("encoding %i"), m_encoding).ToAscii()
#endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
              );

        alreadyLoggingError = false;
    }

    return NULL;
}
3182
3183 void wxCSConv::CreateConvIfNeeded() const
3184 {
3185     if ( m_deferred )
3186     {
3187         wxCSConv *self = (wxCSConv *)this; // const_cast
3188
3189         // if we don't have neither the name nor the encoding, use the default
3190         // encoding for this system
3191         if ( !m_name && m_encoding == wxFONTENCODING_SYSTEM )
3192         {
3193 #if wxUSE_INTL
3194             self->m_encoding = wxLocale::GetSystemEncoding();
3195 #else
3196             // fallback to some reasonable default:
3197             self->m_encoding = wxFONTENCODING_ISO8859_1;
3198 #endif // wxUSE_INTL
3199         }
3200
3201         self->m_convReal = DoCreate();
3202         self->m_deferred = false;
3203     }
3204 }
3205
3206 bool wxCSConv::IsOk() const
3207 {
3208     CreateConvIfNeeded();
3209
3210     // special case: no convReal created for wxFONTENCODING_ISO8859_1
3211     if ( m_encoding == wxFONTENCODING_ISO8859_1 )
3212         return true; // always ok as we do it ourselves
3213
3214     // m_convReal->IsOk() is called at its own creation, so we know it must
3215     // be ok if m_convReal is non-NULL
3216     return m_convReal != NULL;
3217 }
3218
3219 size_t wxCSConv::ToWChar(wchar_t *dst, size_t dstLen,
3220                          const char *src, size_t srcLen) const
3221 {
3222     CreateConvIfNeeded();
3223
3224     if (m_convReal)
3225         return m_convReal->ToWChar(dst, dstLen, src, srcLen);
3226
3227     // latin-1 (direct)
3228     if ( srcLen == wxNO_LEN )
3229         srcLen = strlen(src) + 1; // take trailing NUL too
3230
3231     if ( dst )
3232     {
3233         if ( dstLen < srcLen )
3234             return wxCONV_FAILED;
3235
3236         for ( size_t n = 0; n < srcLen; n++ )
3237             dst[n] = (unsigned char)(src[n]);
3238     }
3239
3240     return srcLen;
3241 }
3242
3243 size_t wxCSConv::FromWChar(char *dst, size_t dstLen,
3244                            const wchar_t *src, size_t srcLen) const
3245 {
3246     CreateConvIfNeeded();
3247
3248     if (m_convReal)
3249         return m_convReal->FromWChar(dst, dstLen, src, srcLen);
3250
3251     // latin-1 (direct)
3252     if ( srcLen == wxNO_LEN )
3253         srcLen = wxWcslen(src) + 1;
3254
3255     if ( dst )
3256     {
3257         if ( dstLen < srcLen )
3258             return wxCONV_FAILED;
3259
3260         for ( size_t n = 0; n < srcLen; n++ )
3261         {
3262             if ( src[n] > 0xFF )
3263                 return wxCONV_FAILED;
3264
3265             dst[n] = (char)src[n];
3266         }
3267
3268     }
3269     else // still need to check the input validity
3270     {
3271         for ( size_t n = 0; n < srcLen; n++ )
3272         {
3273             if ( src[n] > 0xFF )
3274                 return wxCONV_FAILED;
3275         }
3276     }
3277
3278     return srcLen;
3279 }
3280
3281 size_t wxCSConv::GetMBNulLen() const
3282 {
3283     CreateConvIfNeeded();
3284
3285     if ( m_convReal )
3286     {
3287         return m_convReal->GetMBNulLen();
3288     }
3289
3290     // otherwise, we are ISO-8859-1
3291     return 1;
3292 }
3293
#if wxUSE_UNICODE_UTF8
// Returns the value reported by the real converter if there is one,
// otherwise false as we are ISO-8859-1 then.
bool wxCSConv::IsUTF8() const
{
    CreateConvIfNeeded();

    return m_convReal && m_convReal->IsUTF8();
}
#endif
3308
3309
3310 #if wxUSE_UNICODE
3311
3312 wxWCharBuffer wxSafeConvertMB2WX(const char *s)
3313 {
3314     if ( !s )
3315         return wxWCharBuffer();
3316
3317     wxWCharBuffer wbuf(wxConvLibc.cMB2WX(s));
3318     if ( !wbuf )
3319         wbuf = wxMBConvUTF8().cMB2WX(s);
3320     if ( !wbuf )
3321         wbuf = wxConvISO8859_1.cMB2WX(s);
3322
3323     return wbuf;
3324 }
3325
3326 wxCharBuffer wxSafeConvertWX2MB(const wchar_t *ws)
3327 {
3328     if ( !ws )
3329         return wxCharBuffer();
3330
3331     wxCharBuffer buf(wxConvLibc.cWX2MB(ws));
3332     if ( !buf )
3333         buf = wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL).cWX2MB(ws);
3334
3335     return buf;
3336 }
3337
3338 #endif // wxUSE_UNICODE
3339
3340 // ----------------------------------------------------------------------------
3341 // globals
3342 // ----------------------------------------------------------------------------
3343
// NB: The reason why we create converter objects in this convoluted way,
//     using a factory function instead of a global variable, is that they
//     may be used at static initialization time (some of them are used by
//     wxString ctors and there may be a global wxString object). In other
//     words, possibly _before_ the converter global object would be
//     initialized.
3350
3351 #undef wxConvLibc
3352 #undef wxConvUTF8
3353 #undef wxConvUTF7
3354 #undef wxConvLocal
3355 #undef wxConvISO8859_1
3356
// Defines the global converter called "name". The macro expands to:
//  - the definition of the pointer name##Ptr, initialized with NULL
//  - the function wxGet_##name##Ptr() returning the address of its
//    function-local static object of class impl_klass, constructed with
//    ctor_args, as a pointer to klass
//  - a file-static pointer initialized by calling this function
//
// NB: the macro intentionally doesn't end with a semicolon, it must be
//     provided when using it.
#define WX_DEFINE_GLOBAL_CONV2(klass, impl_klass, name, ctor_args)      \
    WXDLLIMPEXP_DATA_BASE(klass*) name##Ptr = NULL;                     \
    WXDLLIMPEXP_BASE klass* wxGet_##name##Ptr()                         \
    {                                                                   \
        static impl_klass name##Obj ctor_args;                          \
        return &name##Obj;                                              \
    }                                                                   \
    /* this ensures that all global converter objects are created */    \
    /* by the time static initialization is done, i.e. before any */    \
    /* thread is launched: */                                           \
    static klass* gs_##name##instance = wxGet_##name##Ptr()

// Same as WX_DEFINE_GLOBAL_CONV2() for the case when the class of the object
// is the same as the class the accessor function returns a pointer to.
#define WX_DEFINE_GLOBAL_CONV(klass, name, ctor_args) \
    WX_DEFINE_GLOBAL_CONV2(klass, klass, name, ctor_args)
3371
// wxConvLibc is a default-constructed wxMBConv_win32 under Windows and a
// wxMBConvLibc elsewhere, accessed via a wxMBConv pointer in both cases
#ifdef __WINDOWS__
    WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConv_win32, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
#else
    WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConvLibc, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
#endif

// NB: we can't use wxEMPTY_PARAMETER_VALUE as final argument here because it's
//     passed to WX_DEFINE_GLOBAL_CONV2 after a macro expansion and so still
//     provokes an error message about "not enough macro parameters"; and we
//     can't use "()" here as the name##Obj declaration would be parsed as a
//     function declaration then, so use a semicolon and live with an extra
//     empty statement (and hope that no compiler warns about this)
WX_DEFINE_GLOBAL_CONV(wxMBConvStrictUTF8, wxConvUTF8, ;);
WX_DEFINE_GLOBAL_CONV(wxMBConvUTF7, wxConvUTF7, ;);

WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvLocal, (wxFONTENCODING_SYSTEM));
WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvISO8859_1, (wxFONTENCODING_ISO8859_1));

// these pointers initially refer to the wxConvLibc and wxConvLocal objects
// respectively
WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = wxGet_wxConvLibcPtr();
WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvUI = wxGet_wxConvLocalPtr();

#ifdef __DARWIN__
// The xnu kernel always communicates file paths in decomposed UTF-8.
// WARNING: Are we sure that CFString's conversion will cause decomposition?
static wxMBConv_cf wxConvMacUTF8DObj(wxFONTENCODING_UTF8);
#endif

// wxConvFileName initially refers to the UTF-8 wxMBConv_cf object defined
// above under Darwin and to the wxConvLibc object everywhere else
WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName =
#ifdef __DARWIN__
                                    &wxConvMacUTF8DObj;
#else // !__DARWIN__
                                    wxGet_wxConvLibcPtr();
#endif // __DARWIN__/!__DARWIN__
3405
3406 #else // !wxUSE_WCHAR_T
3407
// FIXME-UTF8: remove this, wxUSE_WCHAR_T is required now
// stand-ins in absence of wchar_t: all four converters are defined as plain
// wxMBConv objects in this case
WXDLLIMPEXP_DATA_BASE(wxMBConv) wxConvLibc,
                                wxConvISO8859_1,
                                wxConvLocal,
                                wxConvUTF8;
3414
3415 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T