src/common/strconv.cpp

   1 /////////////////////////////////////////////////////////////////////////////
   2 // Name:        src/common/strconv.cpp
   3 // Purpose:     Unicode conversion classes
   4 // Author:      Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
   5 //              Ryan Norton, Fredrik Roubert (UTF7)
   6 // Modified by:
   7 // Created:     29/01/98
   8 // RCS-ID:      $Id$
   9 // Copyright:   (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
  10 //              (c) 2000-2003 Vadim Zeitlin
  11 //              (c) 2004 Ryan Norton, Fredrik Roubert
  12 // Licence:     wxWindows licence
  13 /////////////////////////////////////////////////////////////////////////////
  14
  15 // For compilers that support precompilation, includes "wx.h".
  16 #include "wx/wxprec.h"
  17
  18 #ifdef __BORLANDC__
  19     #pragma hdrstop
  20 #endif  //__BORLANDC__
  21
  22 #ifndef WX_PRECOMP
  23     #include "wx/intl.h"
  24     #include "wx/log.h"
  25     #include "wx/utils.h"
  26     #include "wx/hashmap.h"
  27 #endif
  28
  29 #include "wx/strconv.h"
  30
  31 #ifndef __WXWINCE__
  32 #include <errno.h>
  33 #endif
  34
  35 #include <ctype.h>
  36 #include <string.h>
  37 #include <stdlib.h>
  38
  39 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
  40     #include "wx/msw/private.h"
  41     #include "wx/msw/missing.h"
  42     #define wxHAVE_WIN32_MB2WC
  43 #endif
  44
  45 #ifdef HAVE_ICONV
  46     #include <iconv.h>
  47     #include "wx/thread.h"
  48 #endif
  49
  50 #include "wx/encconv.h"
  51 #include "wx/fontmap.h"
  52
  53 #ifdef __DARWIN__
  54 #include "wx/osx/core/private/strconv_cf.h"
  55 #endif //def __DARWIN__
  56
  57
  58 #define TRACE_STRCONV wxT("strconv")
  59
  60 // WC_UTF16 is defined only if sizeof(wchar_t) == 2, otherwise it's supposed to
  61 // be 4 bytes
  62 #if SIZEOF_WCHAR_T == 2
  63     #define WC_UTF16
  64 #endif
  65
  66
  67 // ============================================================================
  68 // implementation
  69 // ============================================================================
  70
  71 // helper function of cMB2WC(): check if n bytes at this location are all NUL
  72 static bool NotAllNULs(const char *p, size_t n)
  73 {
  74     while ( n && *p++ == '\0' )
  75         n--;
  76
  77     return n != 0;
  78 }
  79
  80 // ----------------------------------------------------------------------------
  81 // UTF-16 en/decoding to/from UCS-4 with surrogates handling
  82 // ----------------------------------------------------------------------------
  83
  84 static size_t encode_utf16(wxUint32 input, wxUint16 *output)
  85 {
  86     if (input <= 0xffff)
  87     {
  88         if (output)
  89             *output = (wxUint16) input;
  90
  91         return 1;
  92     }
  93     else if (input >= 0x110000)
  94     {
  95         return wxCONV_FAILED;
  96     }
  97     else
  98     {
  99         if (output)
 100         {
 101             *output++ = (wxUint16) ((input >> 10) + 0xd7c0);
 102             *output = (wxUint16) ((input & 0x3ff) + 0xdc00);
 103         }
 104
 105         return 2;
 106     }
 107 }
 108
 109 static size_t decode_utf16(const wxUint16* input, wxUint32& output)
 110 {
 111     if ((*input < 0xd800) || (*input > 0xdfff))
 112     {
 113         output = *input;
 114         return 1;
 115     }
 116     else if ((input[1] < 0xdc00) || (input[1] > 0xdfff))
 117     {
 118         output = *input;
 119         return wxCONV_FAILED;
 120     }
 121     else
 122     {
 123         output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
 124         return 2;
 125     }
 126 }
 127
 128 #ifdef WC_UTF16
 129     typedef wchar_t wxDecodeSurrogate_t;
 130 #else // !WC_UTF16
 131     typedef wxUint16 wxDecodeSurrogate_t;
 132 #endif // WC_UTF16/!WC_UTF16
 133
 134 // returns the next UTF-32 character from the wchar_t buffer and advances the
 135 // pointer to the character after this one
 136 //
 137 // if an invalid character is found, *pSrc is set to NULL, the caller must
 138 // check for this
 139 static wxUint32 wxDecodeSurrogate(const wxDecodeSurrogate_t **pSrc)
 140 {
 141     wxUint32 out;
 142     const size_t
 143         n = decode_utf16(reinterpret_cast<const wxUint16 *>(*pSrc), out);
 144     if ( n == wxCONV_FAILED )
 145         *pSrc = NULL;
 146     else
 147         *pSrc += n;
 148
 149     return out;
 150 }
 151
 152 // ----------------------------------------------------------------------------
 153 // wxMBConv
 154 // ----------------------------------------------------------------------------
 155
// Default implementation of ToWChar() in terms of the old MB2WC().
//
// Parameters:
//  dst     output buffer or NULL to only compute the needed length
//  dstLen  size of dst in wide characters (only checked if dst != NULL)
//  src     the multi-byte input
//  srcLen  length of src in bytes or wxNO_LEN if src is NUL-terminated
//
// Returns the number of wide characters [which would be] written to dst or
// wxCONV_FAILED if GetMBNulLen() or MB2WC() fails or if dst is too small. When
// srcLen is wxNO_LEN the returned value includes the trailing NUL (this is
// what wxMBConv::MB2WC() relies on when it decrements the result).
size_t
wxMBConv::ToWChar(wchar_t *dst, size_t dstLen,
                  const char *src, size_t srcLen) const
{
    // although new conversion classes are supposed to implement this function
    // directly, the existing ones only implement the old MB2WC() and so, to
    // avoid having to rewrite all conversion classes at once, we provide a
    // default (but not efficient) implementation of this one in terms of the
    // old function by copying the input to ensure that it's NUL-terminated and
    // then using MB2WC() to convert it
    //
    // moreover, some conversion classes simply can't implement ToWChar()
    // directly, the primary example is wxConvLibc: mbstowcs() only handles
    // NUL-terminated strings

    // the number of chars [which would be] written to dst [if it were not NULL]
    size_t dstWritten = 0;

    // the number of NULs terminating this string
    size_t nulLen = 0;  // not really needed, but just to avoid warnings

    // if we were not given the input size we just have to assume that the
    // string is properly terminated as we have no way of knowing how long it
    // is anyhow, but if we do have the size check whether there are enough
    // NULs at the end
    wxCharBuffer bufTmp;
    const char *srcEnd;
    if ( srcLen != wxNO_LEN )
    {
        // we need to know how to find the end of this string
        nulLen = GetMBNulLen();
        if ( nulLen == wxCONV_FAILED )
            return wxCONV_FAILED;

        // if there are enough NULs we can avoid the copy
        if ( srcLen < nulLen || NotAllNULs(src + srcLen - nulLen, nulLen) )
        {
            // make a copy in order to properly NUL-terminate the string: it
            // holds the srcLen input bytes followed by nulLen NUL bytes
            bufTmp = wxCharBuffer(srcLen + nulLen - 1 /* 1 will be added */);
            char * const p = bufTmp.data();
            memcpy(p, src, srcLen);
            for ( char *s = p + srcLen; s < p + srcLen + nulLen; s++ )
                *s = '\0';

            src = bufTmp;
        }

        srcEnd = src + srcLen;
    }
    else // quit after the first loop iteration
    {
        // NULL srcEnd is used below as the "no explicit length" indicator
        srcEnd = NULL;
    }

    // the idea of this code is straightforward: it converts a NUL-terminated
    // chunk of the string during each iteration and updates the output buffer
    // with the result
    //
    // all the complications come from the fact that this function, for
    // historical reasons, must behave in 2 subtly different ways when it's
    // called with a fixed number of characters and when it's called for the
    // entire NUL-terminated string:
    //
    //  - when there is no explicit length (srcEnd == NULL), a single chunk is
    //    converted and its trailing NUL is counted immediately
    //
    //  - when the length is given (srcEnd != NULL), the terminator of a chunk
    //    is only counted after skipping over the chunk and only if it lies
    //    inside the buffer being converted, i.e. embedded NULs are counted
    //    but a terminator starting at srcEnd is not
    for ( ;; )
    {
        // try to convert the current chunk
        size_t lenChunk = MB2WC(NULL, src, 0);
        if ( lenChunk == wxCONV_FAILED )
            return wxCONV_FAILED;

        dstWritten += lenChunk;
        if ( !srcEnd )
            dstWritten++;

        if ( !lenChunk )
        {
            // the chunk is empty: stop here, nothing is written to dst for it
            break;
        }

        if ( dst )
        {
            if ( dstWritten > dstLen )
                return wxCONV_FAILED;

            // +1 is for trailing NUL
            if ( MB2WC(dst, src, lenChunk + 1) == wxCONV_FAILED )
                return wxCONV_FAILED;

            dst += lenChunk;
            if ( !srcEnd )
                dst++;
        }

        if ( !srcEnd )
        {
            // we convert just one chunk in this case as this is the entire
            // string anyhow (its trailing NUL has already been counted above)
            break;
        }

        // advance the input pointer past the end of this chunk: notice that we
        // will always stop before srcEnd because we know that the chunk is
        // always properly NUL-terminated
        while ( NotAllNULs(src, nulLen) )
        {
            // notice that we must skip over multiple bytes here as we suppose
            // that if NUL takes 2 or 4 bytes, then all the other characters do
            // too and so if advanced by a single byte we might erroneously
            // detect sequences of NUL bytes in the middle of the input
            src += nulLen;
        }

        // if the buffer ends before this NUL, we shouldn't count it in our
        // output so skip the code below
        if ( src == srcEnd )
            break;

        // do count this terminator as it's inside the buffer we convert
        dstWritten++;
        if ( dst )
            dst++;

        src += nulLen; // skip the terminator itself

        if ( src >= srcEnd )
            break;
    }

    return dstWritten;
}
 293
// Default implementation of FromWChar() in terms of WC2MB().
//
// Parameters:
//  dst     output buffer or NULL to only compute the needed length
//  dstLen  size of dst in bytes (only checked if dst != NULL)
//  src     the wide character input
//  srcLen  length of src in wide characters or wxNO_LEN if it is
//          NUL-terminated
//
// Returns the number of bytes [which would be] written to dst, including the
// GetMBNulLen() bytes of each NUL found inside the converted range, or
// wxCONV_FAILED if WC2MB() fails or if dst is too small.
size_t
wxMBConv::FromWChar(char *dst, size_t dstLen,
                    const wchar_t *src, size_t srcLen) const
{
    // the number of chars [which would be] written to dst [if it were not NULL]
    size_t dstWritten = 0;

    // if we don't know its length we have no choice but to assume that it is
    // NUL-terminated (notice that it can still be NUL-terminated even if
    // explicit length is given but it doesn't change our return value)
    const bool isNulTerminated = srcLen == wxNO_LEN;

    // make a copy of the input string unless it is already properly
    // NUL-terminated
    wxWCharBuffer bufTmp;
    if ( isNulTerminated )
    {
        // from now on srcLen includes the trailing NUL
        srcLen = wxWcslen(src) + 1;
    }
    else if ( srcLen != 0 && src[srcLen - 1] != L'\0' )
    {
        // make a copy in order to properly NUL-terminate the string
        //
        // NOTE(review): only srcLen characters are copied, this presumably
        // relies on wxWCharBuffer(srcLen) providing the terminating NUL --
        // confirm against the buffer class
        bufTmp = wxWCharBuffer(srcLen);
        memcpy(bufTmp.data(), src, srcLen * sizeof(wchar_t));
        src = bufTmp;
    }

    const size_t lenNul = GetMBNulLen();
    for ( const wchar_t * const srcEnd = src + srcLen;
          src < srcEnd;
          src++ /* skip L'\0' too */ )
    {
        // try to convert the current chunk
        size_t lenChunk = WC2MB(NULL, src, 0);
        if ( lenChunk == wxCONV_FAILED )
            return wxCONV_FAILED;

        dstWritten += lenChunk;

        // the position of the NUL ending the current chunk
        const wchar_t * const
            chunkEnd = isNulTerminated ? srcEnd - 1 : src + wxWcslen(src);

        // our return value accounts for the trailing NUL(s), unlike that of
        // WC2MB(), however don't do it for the last NUL we artificially added
        // ourselves above
        if ( chunkEnd < srcEnd )
            dstWritten += lenNul;

        if ( dst )
        {
            if ( dstWritten > dstLen )
                return wxCONV_FAILED;

            // if we know that there is enough space in the destination buffer
            // (because we accounted for lenNul in dstWritten above), we can
            // convert directly in place -- but otherwise we need another
            // temporary buffer to ensure that we don't overwrite the output
            wxCharBuffer dstBuf;
            char *dstTmp;
            if ( chunkEnd == srcEnd )
            {
                dstBuf = wxCharBuffer(lenChunk + lenNul - 1);
                dstTmp = dstBuf.data();
            }
            else
            {
                dstTmp = dst;
            }

            if ( WC2MB(dstTmp, src, lenChunk + lenNul) == wxCONV_FAILED )
                return wxCONV_FAILED;

            if ( dstTmp != dst )
            {
                // copy everything up to but excluding the terminating NUL(s)
                // into the real output buffer
                memcpy(dst, dstTmp, lenChunk);

                // micro-optimization: if dstTmp != dst it means that chunkEnd
                // == srcEnd and so we're done, no need to update anything below
                break;
            }

            dst += lenChunk;
            if ( chunkEnd < srcEnd )
                dst += lenNul;
        }

        // continue after the end of this chunk (the loop increment skips the
        // NUL itself)
        src = chunkEnd;
    }

    return dstWritten;
}
 387
 388 size_t wxMBConv::MB2WC(wchar_t *outBuff, const char *inBuff, size_t outLen) const
 389 {
 390     size_t rc = ToWChar(outBuff, outLen, inBuff);
 391     if ( rc != wxCONV_FAILED )
 392     {
 393         // ToWChar() returns the buffer length, i.e. including the trailing
 394         // NUL, while this method doesn't take it into account
 395         rc--;
 396     }
 397
 398     return rc;
 399 }
 400
 401 size_t wxMBConv::WC2MB(char *outBuff, const wchar_t *inBuff, size_t outLen) const
 402 {
 403     size_t rc = FromWChar(outBuff, outLen, inBuff);
 404     if ( rc != wxCONV_FAILED )
 405     {
 406         rc -= GetMBNulLen();
 407     }
 408
 409     return rc;
 410 }
 411
// Out-of-line destructor with an intentionally empty body.
wxMBConv::~wxMBConv()
{
    // nothing to do here (defining it in this file is probably necessary for
    // Darwin linking)
}
 416
 417 const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
 418 {
 419     if ( psz )
 420     {
 421         // calculate the length of the buffer needed first
 422         const size_t nLen = ToWChar(NULL, 0, psz);
 423         if ( nLen != wxCONV_FAILED )
 424         {
 425             // now do the actual conversion
 426             wxWCharBuffer buf(nLen - 1 /* +1 added implicitly */);
 427
 428             // +1 for the trailing NULL
 429             if ( ToWChar(buf.data(), nLen, psz) != wxCONV_FAILED )
 430                 return buf;
 431         }
 432     }
 433
 434     return wxWCharBuffer();
 435 }
 436
 437 const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
 438 {
 439     if ( pwz )
 440     {
 441         const size_t nLen = FromWChar(NULL, 0, pwz);
 442         if ( nLen != wxCONV_FAILED )
 443         {
 444             wxCharBuffer buf(nLen - 1);
 445             if ( FromWChar(buf.data(), nLen, pwz) != wxCONV_FAILED )
 446                 return buf;
 447         }
 448     }
 449
 450     return wxCharBuffer();
 451 }
 452
 453 const wxWCharBuffer
 454 wxMBConv::cMB2WC(const char *inBuff, size_t inLen, size_t *outLen) const
 455 {
 456     const size_t dstLen = ToWChar(NULL, 0, inBuff, inLen);
 457     if ( dstLen != wxCONV_FAILED )
 458     {
 459         // notice that we allocate space for dstLen+1 wide characters here
 460         // because we want the buffer to always be NUL-terminated, even if the
 461         // input isn't (as otherwise the caller has no way to know its length)
 462         wxWCharBuffer wbuf(dstLen);
 463         if ( ToWChar(wbuf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
 464         {
 465             if ( outLen )
 466             {
 467                 *outLen = dstLen;
 468
 469                 // we also need to handle NUL-terminated input strings
 470                 // specially: for them the output is the length of the string
 471                 // excluding the trailing NUL, however if we're asked to
 472                 // convert a specific number of characters we return the length
 473                 // of the resulting output even if it's NUL-terminated
 474                 if ( inLen == wxNO_LEN )
 475                     (*outLen)--;
 476             }
 477
 478             return wbuf;
 479         }
 480     }
 481
 482     if ( outLen )
 483         *outLen = 0;
 484
 485     return wxWCharBuffer();
 486 }
 487
 488 const wxCharBuffer
 489 wxMBConv::cWC2MB(const wchar_t *inBuff, size_t inLen, size_t *outLen) const
 490 {
 491     size_t dstLen = FromWChar(NULL, 0, inBuff, inLen);
 492     if ( dstLen != wxCONV_FAILED )
 493     {
 494         const size_t nulLen = GetMBNulLen();
 495
 496         // as above, ensure that the buffer is always NUL-terminated, even if
 497         // the input is not
 498         wxCharBuffer buf(dstLen + nulLen - 1);
 499         memset(buf.data() + dstLen, 0, nulLen);
 500         if ( FromWChar(buf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
 501         {
 502             if ( outLen )
 503             {
 504                 *outLen = dstLen;
 505
 506                 if ( inLen == wxNO_LEN )
 507                 {
 508                     // in this case both input and output are NUL-terminated
 509                     // and we're not supposed to count NUL
 510                     *outLen -= nulLen;
 511                 }
 512             }
 513
 514             return buf;
 515         }
 516     }
 517
 518     if ( outLen )
 519         *outLen = 0;
 520
 521     return wxCharBuffer();
 522 }
 523
 524 const wxWCharBuffer wxMBConv::cMB2WC(const wxScopedCharBuffer& buf) const
 525 {
 526     const size_t srcLen = buf.length();
 527     if ( srcLen )
 528     {
 529         const size_t dstLen = ToWChar(NULL, 0, buf, srcLen);
 530         if ( dstLen != wxCONV_FAILED )
 531         {
 532             wxWCharBuffer wbuf(dstLen);
 533             wbuf.data()[dstLen] = L'\0';
 534             if ( ToWChar(wbuf.data(), dstLen, buf, srcLen) != wxCONV_FAILED )
 535                 return wbuf;
 536         }
 537     }
 538
 539     return wxScopedWCharBuffer::CreateNonOwned(L"", 0);
 540 }
 541
 542 const wxCharBuffer wxMBConv::cWC2MB(const wxScopedWCharBuffer& wbuf) const
 543 {
 544     const size_t srcLen = wbuf.length();
 545     if ( srcLen )
 546     {
 547         const size_t dstLen = FromWChar(NULL, 0, wbuf, srcLen);
 548         if ( dstLen != wxCONV_FAILED )
 549         {
 550             wxCharBuffer buf(dstLen);
 551             buf.data()[dstLen] = '\0';
 552             if ( FromWChar(buf.data(), dstLen, wbuf, srcLen) != wxCONV_FAILED )
 553                 return buf;
 554         }
 555     }
 556
 557     return wxScopedCharBuffer::CreateNonOwned("", 0);
 558 }
 559
 560 // ----------------------------------------------------------------------------
 561 // wxMBConvLibc
 562 // ----------------------------------------------------------------------------
 563
// Converts the multi-byte string psz to wide characters by forwarding all
// arguments unchanged to wxMB2WC() and returning its result.
//
// NOTE(review): wxMB2WC() is presumably a wrapper around the C library
// mbstowcs() -- confirm against its definition, it is not visible here
size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
{
    return wxMB2WC(buf, psz, n);
}
 568
// Converts the wide string psz to multi-byte characters by forwarding all
// arguments unchanged to wxWC2MB() and returning its result.
//
// NOTE(review): wxWC2MB() is presumably a wrapper around the C library
// wcstombs() -- confirm against its definition, it is not visible here
size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
{
    return wxWC2MB(buf, psz, n);
}
 573
 574 // ----------------------------------------------------------------------------
 575 // wxConvBrokenFileNames
 576 // ----------------------------------------------------------------------------
 577
 578 #ifdef __UNIX__
 579
 580 wxConvBrokenFileNames::wxConvBrokenFileNames(const wxString& charset)
 581 {
 582     if ( wxStricmp(charset, wxT("UTF-8")) == 0 ||
 583          wxStricmp(charset, wxT("UTF8")) == 0  )
 584         m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_PUA);
 585     else
 586         m_conv = new wxCSConv(charset);
 587 }
 588
 589 #endif // __UNIX__
 590
 591 // ----------------------------------------------------------------------------
 592 // UTF-7
 593 // ----------------------------------------------------------------------------
 594
 595 // Implementation (C) 2004 Fredrik Roubert
 596 //
 597 // Changes to work in streaming mode (C) 2008 Vadim Zeitlin
 598
//
// BASE64 decoding table
//
// This table has 256 entries and is indexed by the byte value: 'A'..'Z' map
// to 0x00..0x19, 'a'..'z' to 0x1a..0x33, '0'..'9' to 0x34..0x3d, '+' to 0x3e
// and '/' to 0x3f. All the other bytes map to 0xff which is used as "not a
// base64 character" marker.
//
static const unsigned char utf7unb64[] =
{
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
    0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b,
    0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
    0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
    0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
    0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
    0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
    0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30,
    0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
};
 637
 638 size_t wxMBConvUTF7::ToWChar(wchar_t *dst, size_t dstLen,
 639                              const char *src, size_t srcLen) const
 640 {
 641     DecoderState stateOrig,
 642                 *statePtr;
 643     if ( srcLen == wxNO_LEN )
 644     {
 645         // convert the entire string, up to and including the trailing NUL
 646         srcLen = strlen(src) + 1;
 647
 648         // when working on the entire strings we don't update nor use the shift
 649         // state from the previous call
 650         statePtr = &stateOrig;
 651     }
 652     else // when working with partial strings we do use the shift state
 653     {
 654         statePtr = const_cast<DecoderState *>(&m_stateDecoder);
 655
 656         // also save the old state to be able to rollback to it on error
 657         stateOrig = m_stateDecoder;
 658     }
 659
 660     // but to simplify the code below we use this variable in both cases
 661     DecoderState& state = *statePtr;
 662
 663
 664     // number of characters [which would have been] written to dst [if it were
 665     // not NULL]
 666     size_t len = 0;
 667
 668     const char * const srcEnd = src + srcLen;
 669
 670     while ( (src < srcEnd) && (!dst || (len < dstLen)) )
 671     {
 672         const unsigned char cc = *src++;
 673
 674         if ( state.IsShifted() )
 675         {
 676             const unsigned char dc = utf7unb64[cc];
 677             if ( dc == 0xff )
 678             {
 679                 // end of encoded part, check that nothing was left: there can
 680                 // be up to 4 bits of 0 padding but nothing else (we also need
 681                 // to check isLSB as we count bits modulo 8 while a valid UTF-7
 682                 // encoded sequence must contain an integral number of UTF-16
 683                 // characters)
 684                 if ( state.isLSB || state.bit > 4 ||
 685                         (state.accum & ((1 << state.bit) - 1)) )
 686                 {
 687                     if ( !len )
 688                         state = stateOrig;
 689
 690                     return wxCONV_FAILED;
 691                 }
 692
 693                 state.ToDirect();
 694
 695                 // re-parse this character normally below unless it's '-' which
 696                 // is consumed by the decoder
 697                 if ( cc == '-' )
 698                     continue;
 699             }
 700             else // valid encoded character
 701             {
 702                 // mini base64 decoder: each character is 6 bits
 703                 state.bit += 6;
 704                 state.accum <<= 6;
 705                 state.accum += dc;
 706
 707                 if ( state.bit >= 8 )
 708                 {
 709                     // got the full byte, consume it
 710                     state.bit -= 8;
 711                     unsigned char b = (state.accum >> state.bit) & 0x00ff;
 712
 713                     if ( state.isLSB )
 714                     {
 715                         // we've got the full word, output it
 716                         if ( dst )
 717                             *dst++ = (state.msb << 8) | b;
 718                         len++;
 719                         state.isLSB = false;
 720                     }
 721                     else // MSB
 722                     {
 723                         // just store it while we wait for LSB
 724                         state.msb = b;
 725                         state.isLSB = true;
 726                     }
 727                 }
 728             }
 729         }
 730
 731         if ( state.IsDirect() )
 732         {
 733             // start of an encoded segment?
 734             if ( cc == '+' )
 735             {
 736                 if ( *src == '-' )
 737                 {
 738                     // just the encoded plus sign, don't switch to shifted mode
 739                     if ( dst )
 740                         *dst++ = '+';
 741                     len++;
 742                     src++;
 743                 }
 744                 else if ( utf7unb64[(unsigned)*src] == 0xff )
 745                 {
 746                     // empty encoded chunks are not allowed
 747                     if ( !len )
 748                         state = stateOrig;
 749
 750                     return wxCONV_FAILED;
 751                 }
 752                 else // base-64 encoded chunk follows
 753                 {
 754                     state.ToShifted();
 755                 }
 756             }
 757             else // not '+'
 758             {
 759                 // only printable 7 bit ASCII characters (with the exception of
 760                 // NUL, TAB, CR and LF) can be used directly
 761                 if ( cc >= 0x7f || (cc < ' ' &&
 762                       !(cc == '\0' || cc == '\t' || cc == '\r' || cc == '\n')) )
 763                     return wxCONV_FAILED;
 764
 765                 if ( dst )
 766                     *dst++ = cc;
 767                 len++;
 768             }
 769         }
 770     }
 771
 772     if ( !len )
 773     {
 774         // as we didn't read any characters we should be called with the same
 775         // data (followed by some more new data) again later so don't save our
 776         // state
 777         state = stateOrig;
 778
 779         return wxCONV_FAILED;
 780     }
 781
 782     return len;
 783 }
 784
//
// BASE64 encoding table
//
// This table has 64 entries giving the character used to encode each 6 bit
// value: 'A'..'Z', then 'a'..'z', then '0'..'9', then '+' and '/'.
//
static const unsigned char utf7enb64[] =
{
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
    'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
    'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
    'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
    'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
    'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
    'w', 'x', 'y', 'z', '0', '1', '2', '3',
    '4', '5', '6', '7', '8', '9', '+', '/'
};
 799
 800 //
 801 // UTF-7 encoding table
 802 //
 803 // 0 - Set D (directly encoded characters)
 804 // 1 - Set O (optional direct characters)
 805 // 2 - whitespace characters (optional)
 806 // 3 - special characters
 807 //
 808 static const unsigned char utf7encode[128] =
 809 {
 810     0, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
 811     3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
 812     2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
 813     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
 814     1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 815     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
 816     1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 817     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
 818 };
 819
 820 static inline bool wxIsUTF7Direct(wchar_t wc)
 821 {
 822     return wc < 0x80 && utf7encode[wc] < 1;
 823 }
 824
// Encodes wide characters as UTF-7.
//
// Parameters:
//  dst     output buffer or NULL to only compute the needed length
//  dstLen  size of dst in bytes (only checked if dst != NULL)
//  src     the wide character input
//  srcLen  length of src in wide characters or wxNO_LEN to convert the
//          entire NUL-terminated string, including its trailing NUL
//
// When srcLen is wxNO_LEN the conversion starts from a fresh encoder state
// and doesn't modify the stored one, otherwise it continues from
// m_stateEncoder and updates it unless dst is NULL.
//
// Returns the number of bytes [which would be] written to dst or
// wxCONV_FAILED if wchar_t is not UTF-16 and a character above 0xffff is
// encountered.
size_t wxMBConvUTF7::FromWChar(char *dst, size_t dstLen,
                               const wchar_t *src, size_t srcLen) const
{
    EncoderState stateOrig,
                *statePtr;
    if ( srcLen == wxNO_LEN )
    {
        // we don't apply the stored state when operating on entire strings at
        // once
        statePtr = &stateOrig;

        srcLen = wxWcslen(src) + 1;
    }
    else // do use the mode we left the output in previously
    {
        stateOrig = m_stateEncoder;
        statePtr = const_cast<EncoderState *>(&m_stateEncoder);
    }

    EncoderState& state = *statePtr;


    // number of bytes [which would be] written to dst [if it were not NULL]
    size_t len = 0;

    // NOTE(review): dstLen is only checked here, once per iteration of this
    // loop, while a single iteration can output several bytes below
    const wchar_t * const srcEnd = src + srcLen;
    while ( src < srcEnd && (!dst || len < dstLen) )
    {
        wchar_t cc = *src++;
        if ( wxIsUTF7Direct(cc) )
        {
            if ( state.IsShifted() )
            {
                // pad with zeros the last encoded block if necessary
                if ( state.bit )
                {
                    if ( dst )
                        *dst++ = utf7enb64[((state.accum % 16) << (6 - state.bit)) % 64];
                    len++;
                }

                state.ToDirect();

                // explicitly terminate the encoded part
                if ( dst )
                    *dst++ = '-';
                len++;
            }

            if ( dst )
                *dst++ = (char)cc;
            len++;
        }
        else if ( cc == '+' && state.IsDirect() )
        {
            // the plus sign itself is output as "+-" in direct mode
            if ( dst )
            {
                *dst++ = '+';
                *dst++ = '-';
            }

            len += 2;
        }
#ifndef WC_UTF16
        else if (((wxUint32)cc) > 0xffff)
        {
            // no surrogate pair generation (yet?)
            return wxCONV_FAILED;
        }
#endif
        else
        {
            if ( state.IsDirect() )
            {
                // start the encoded part
                state.ToShifted();

                if ( dst )
                    *dst++ = '+';
                len++;
            }

            // BASE64 encode string
            for ( ;; )
            {
                // feed the 2 bytes of this character, most significant one
                // first, into the accumulator and output a base64 character
                // for each group of 6 bits available in it
                for ( unsigned lsb = 0; lsb < 2; lsb++ )
                {
                    state.accum <<= 8;
                    state.accum += lsb ? cc & 0xff : (cc & 0xff00) >> 8;

                    for (state.bit += 8; state.bit >= 6; )
                    {
                        state.bit -= 6;
                        if ( dst )
                            *dst++ = utf7enb64[(state.accum >> state.bit) % 64];
                        len++;
                    }
                }

                // stop at the end of input or at the first character which
                // can be output directly: it is not consumed here but handled
                // by the next iteration of the outer loop
                if ( src == srcEnd || wxIsUTF7Direct(cc = *src) )
                    break;

                src++;
            }
        }
    }

    // we need to restore the original encoder state if we were called just to
    // calculate the amount of space needed as we will presumably be called
    // again to really convert the data now
    if ( !dst )
        state = stateOrig;

    return len;
}
 937
 938 // ----------------------------------------------------------------------------
 939 // UTF-8
 940 // ----------------------------------------------------------------------------
 941
 // utf8_max[i] is the largest value which can be stored in a sequence of i + 1
 // bytes in the original (up to 6 bytes long) UTF-8 scheme; the last entry is
 // 0xffffffff so that a search through the table for any 32 bit value always
 // terminates
 942 static const wxUint32 utf8_max[]=
 943     { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
 944
 945 // boundaries of the private use area we use to (temporarily) remap bytes
 946 // that are invalid in a UTF-8 encoded string: such a byte b is represented
     // by the code point wxUnicodePUA + b, hence the 256 code points wide range
     // [wxUnicodePUA, wxUnicodePUAEnd)
 947 const wxUint32 wxUnicodePUA = 0x100000;
 948 const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
 949
 950 // this table gives the length of the UTF-8 encoding from its first character:
     // it is indexed by the (unsigned) value of the lead byte and an entry of 0
     // means that this byte can't start a valid sequence
 951 const unsigned char tableUtf8Lengths[256] = {
 952     // single-byte sequences (ASCII):
 953     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 00..0F
 954     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 10..1F
 955     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 20..2F
 956     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 30..3F
 957     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 40..4F
 958     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 50..5F
 959     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 60..6F
 960     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 70..7F
 961
 962     // these are invalid as lead bytes: 80..BF are continuation bytes and
         // C0, C1 could only start overlong encodings of ASCII characters:
 963     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 80..8F
 964     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 90..9F
 965     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // A0..AF
 966     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // B0..BF
 967     0, 0,                                            // C0,C1
 968
 969     // two-byte sequences:
 970           2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // C2..CF
 971     2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // D0..DF
 972
 973     // three-byte sequences:
 974     3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,  // E0..EF
 975
 976     // four-byte sequences:
 977     4, 4, 4, 4, 4,                                   // F0..F4
 978
 979     // these are invalid again (5- or 6-byte
 980     // sequences and sequences for code points
 981     // above U+10FFFF, as restricted by RFC 3629):
 982                    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0   // F5..FF
 983 };
 984
 985 size_t
 986 wxMBConvStrictUTF8::ToWChar(wchar_t *dst, size_t dstLen,
 987                             const char *src, size_t srcLen) const
 988 {
 989     wchar_t *out = dstLen ? dst : NULL;
 990     size_t written = 0;
 991
 992     if ( srcLen == wxNO_LEN )
 993         srcLen = strlen(src) + 1;
 994
 995     for ( const char *p = src; ; p++ )
 996     {
 997         if ( (srcLen == wxNO_LEN ? !*p : !srcLen) )
 998         {
 999             // all done successfully, just add the trailing NULL if we are not
1000             // using explicit length
1001             if ( srcLen == wxNO_LEN )
1002             {
1003                 if ( out )
1004                 {
1005                     if ( !dstLen )
1006                         break;
1007
1008                     *out = L'\0';
1009                 }
1010
1011                 written++;
1012             }
1013
1014             return written;
1015         }
1016
1017         if ( out && !dstLen-- )
1018             break;
1019
1020         wxUint32 code;
1021         unsigned char c = *p;
1022
1023         if ( c < 0x80 )
1024         {
1025             if ( srcLen == 0 ) // the test works for wxNO_LEN too
1026                 break;
1027
1028             if ( srcLen != wxNO_LEN )
1029                 srcLen--;
1030
1031             code = c;
1032         }
1033         else
1034         {
1035             unsigned len = tableUtf8Lengths[c];
1036             if ( !len )
1037                 break;
1038
1039             if ( srcLen < len ) // the test works for wxNO_LEN too
1040                 break;
1041
1042             if ( srcLen != wxNO_LEN )
1043                 srcLen -= len;
1044
1045             //   Char. number range   |        UTF-8 octet sequence
1046             //      (hexadecimal)     |              (binary)
1047             //  ----------------------+----------------------------------------
1048             //  0000 0000 - 0000 007F | 0xxxxxxx
1049             //  0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
1050             //  0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
1051             //  0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
1052             //
1053             //  Code point value is stored in bits marked with 'x',
1054             //  lowest-order bit of the value on the right side in the diagram
1055             //  above.                                         (from RFC 3629)
1056
1057             // mask to extract lead byte's value ('x' bits above), by sequence
1058             // length:
1059             static const unsigned char leadValueMask[] = { 0x7F, 0x1F, 0x0F, 0x07 };
1060
1061             // mask and value of lead byte's most significant bits, by length:
1062             static const unsigned char leadMarkerMask[] = { 0x80, 0xE0, 0xF0, 0xF8 };
1063             static const unsigned char leadMarkerVal[] = { 0x00, 0xC0, 0xE0, 0xF0 };
1064
1065             len--; // it's more convenient to work with 0-based length here
1066
1067             // extract the lead byte's value bits:
1068             if ( (c & leadMarkerMask[len]) != leadMarkerVal[len] )
1069                 break;
1070
1071             code = c & leadValueMask[len];
1072
1073             // all remaining bytes, if any, are handled in the same way
1074             // regardless of sequence's length:
1075             for ( ; len; --len )
1076             {
1077                 c = *++p;
1078                 if ( (c & 0xC0) != 0x80 )
1079                     return wxCONV_FAILED;
1080
1081                 code <<= 6;
1082                 code |= c & 0x3F;
1083             }
1084         }
1085
1086 #ifdef WC_UTF16
1087         // cast is ok because wchar_t == wxUint16 if WC_UTF16
1088         if ( encode_utf16(code, (wxUint16 *)out) == 2 )
1089         {
1090             if ( out )
1091                 out++;
1092             written++;
1093         }
1094 #else // !WC_UTF16
1095         if ( out )
1096             *out = code;
1097 #endif // WC_UTF16/!WC_UTF16
1098
1099         if ( out )
1100             out++;
1101
1102         written++;
1103     }
1104
1105     return wxCONV_FAILED;
1106 }
1107
1108 size_t
1109 wxMBConvStrictUTF8::FromWChar(char *dst, size_t dstLen,
1110                               const wchar_t *src, size_t srcLen) const
1111 {
1112     char *out = dstLen ? dst : NULL;
1113     size_t written = 0;
1114
1115     for ( const wchar_t *wp = src; ; wp++ )
1116     {
1117         if ( (srcLen == wxNO_LEN ? !*wp : !srcLen) )
1118         {
1119             // all done successfully, just add the trailing NULL if we are not
1120             // using explicit length
1121             if ( srcLen == wxNO_LEN )
1122             {
1123                 if ( out )
1124                 {
1125                     if ( !dstLen )
1126                         break;
1127
1128                     *out = '\0';
1129                 }
1130
1131                 written++;
1132             }
1133
1134             return written;
1135         }
1136
1137         if ( srcLen != wxNO_LEN )
1138             srcLen--;
1139
1140         wxUint32 code;
1141 #ifdef WC_UTF16
1142         // cast is ok for WC_UTF16
1143         if ( decode_utf16((const wxUint16 *)wp, code) == 2 )
1144         {
1145             // skip the next char too as we decoded a surrogate
1146             wp++;
1147             if ( srcLen != wxNO_LEN )
1148                 srcLen--;
1149         }
1150 #else // wchar_t is UTF-32
1151         code = *wp & 0x7fffffff;
1152 #endif
1153
1154         unsigned len;
1155         if ( code <= 0x7F )
1156         {
1157             len = 1;
1158             if ( out )
1159             {
1160                 if ( dstLen < len )
1161                     break;
1162
1163                 out[0] = (char)code;
1164             }
1165         }
1166         else if ( code <= 0x07FF )
1167         {
1168             len = 2;
1169             if ( out )
1170             {
1171                 if ( dstLen < len )
1172                     break;
1173
1174                 // NB: this line takes 6 least significant bits, encodes them as
1175                 // 10xxxxxx and discards them so that the next byte can be encoded:
1176                 out[1] = 0x80 | (code & 0x3F);  code >>= 6;
1177                 out[0] = 0xC0 | code;
1178             }
1179         }
1180         else if ( code < 0xFFFF )
1181         {
1182             len = 3;
1183             if ( out )
1184             {
1185                 if ( dstLen < len )
1186                     break;
1187
1188                 out[2] = 0x80 | (code & 0x3F);  code >>= 6;
1189                 out[1] = 0x80 | (code & 0x3F);  code >>= 6;
1190                 out[0] = 0xE0 | code;
1191             }
1192         }
1193         else if ( code <= 0x10FFFF )
1194         {
1195             len = 4;
1196             if ( out )
1197             {
1198                 if ( dstLen < len )
1199                     break;
1200
1201                 out[3] = 0x80 | (code & 0x3F);  code >>= 6;
1202                 out[2] = 0x80 | (code & 0x3F);  code >>= 6;
1203                 out[1] = 0x80 | (code & 0x3F);  code >>= 6;
1204                 out[0] = 0xF0 | code;
1205             }
1206         }
1207         else
1208         {
1209             wxFAIL_MSG( wxT("trying to encode undefined Unicode character") );
1210             break;
1211         }
1212
1213         if ( out )
1214         {
1215             out += len;
1216             dstLen -= len;
1217         }
1218
1219         written += len;
1220     }
1221
1222     // we only get here if an error occurs during decoding
1223     return wxCONV_FAILED;
1224 }
1225
1226 size_t wxMBConvUTF8::ToWChar(wchar_t *buf, size_t n,
1227                              const char *psz, size_t srcLen) const
1228 {
1229     if ( m_options == MAP_INVALID_UTF8_NOT )
1230         return wxMBConvStrictUTF8::ToWChar(buf, n, psz, srcLen);
1231
1232     size_t len = 0;
1233
1234     // The length can be either given explicitly or computed implicitly for the
1235     // NUL-terminated strings.
1236     const bool isNulTerminated = srcLen == wxNO_LEN;
1237     while ((isNulTerminated ? *psz : srcLen--) && ((!buf) || (len < n)))
1238     {
1239         const char *opsz = psz;
1240         bool invalid = false;
1241         unsigned char cc = *psz++, fc = cc;
1242         unsigned cnt;
1243         for (cnt = 0; fc & 0x80; cnt++)
1244             fc <<= 1;
1245
1246         if (!cnt)
1247         {
1248             // plain ASCII char
1249             if (buf)
1250                 *buf++ = cc;
1251             len++;
1252
1253             // escape the escape character for octal escapes
1254             if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
1255                     && cc == '\\' && (!buf || len < n))
1256             {
1257                 if (buf)
1258                     *buf++ = cc;
1259                 len++;
1260             }
1261         }
1262         else
1263         {
1264             cnt--;
1265             if (!cnt)
1266             {
1267                 // invalid UTF-8 sequence
1268                 invalid = true;
1269             }
1270             else
1271             {
1272                 unsigned ocnt = cnt - 1;
1273                 wxUint32 res = cc & (0x3f >> cnt);
1274                 while (cnt--)
1275                 {
1276                     cc = *psz;
1277                     if ((cc & 0xC0) != 0x80)
1278                     {
1279                         // invalid UTF-8 sequence
1280                         invalid = true;
1281                         break;
1282                     }
1283
1284                     psz++;
1285                     res = (res << 6) | (cc & 0x3f);
1286                 }
1287
1288                 if (invalid || res <= utf8_max[ocnt])
1289                 {
1290                     // illegal UTF-8 encoding
1291                     invalid = true;
1292                 }
1293                 else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
1294                         res >= wxUnicodePUA && res < wxUnicodePUAEnd)
1295                 {
1296                     // if one of our PUA characters turns up externally
1297                     // it must also be treated as an illegal sequence
1298                     // (a bit like you have to escape an escape character)
1299                     invalid = true;
1300                 }
1301                 else
1302                 {
1303 #ifdef WC_UTF16
1304                     // cast is ok because wchar_t == wxUint16 if WC_UTF16
1305                     size_t pa = encode_utf16(res, (wxUint16 *)buf);
1306                     if (pa == wxCONV_FAILED)
1307                     {
1308                         invalid = true;
1309                     }
1310                     else
1311                     {
1312                         if (buf)
1313                             buf += pa;
1314                         len += pa;
1315                     }
1316 #else // !WC_UTF16
1317                     if (buf)
1318                         *buf++ = (wchar_t)res;
1319                     len++;
1320 #endif // WC_UTF16/!WC_UTF16
1321                 }
1322             }
1323
1324             if (invalid)
1325             {
1326                 if (m_options & MAP_INVALID_UTF8_TO_PUA)
1327                 {
1328                     while (opsz < psz && (!buf || len < n))
1329                     {
1330 #ifdef WC_UTF16
1331                         // cast is ok because wchar_t == wxUuint16 if WC_UTF16
1332                         size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
1333                         wxASSERT(pa != wxCONV_FAILED);
1334                         if (buf)
1335                             buf += pa;
1336                         opsz++;
1337                         len += pa;
1338 #else
1339                         if (buf)
1340                             *buf++ = (wchar_t)(wxUnicodePUA + (unsigned char)*opsz);
1341                         opsz++;
1342                         len++;
1343 #endif
1344                     }
1345                 }
1346                 else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
1347                 {
1348                     while (opsz < psz && (!buf || len < n))
1349                     {
1350                         if ( buf && len + 3 < n )
1351                         {
1352                             unsigned char on = *opsz;
1353                             *buf++ = L'\\';
1354                             *buf++ = (wchar_t)( L'0' + on / 0100 );
1355                             *buf++ = (wchar_t)( L'0' + (on % 0100) / 010 );
1356                             *buf++ = (wchar_t)( L'0' + on % 010 );
1357                         }
1358
1359                         opsz++;
1360                         len += 4;
1361                     }
1362                 }
1363                 else // MAP_INVALID_UTF8_NOT
1364                 {
1365                     return wxCONV_FAILED;
1366                 }
1367             }
1368         }
1369     }
1370
1371     if ( isNulTerminated )
1372     {
1373         // Add the trailing NUL in this case if we have a large enough buffer.
1374         if ( buf && (len < n) )
1375             *buf = 0;
1376
1377         // And count it in any case.
1378         len++;
1379     }
1380
1381     return len;
1382 }
1383
// check if the given wide character is an octal digit, i.e. one of '0'..'7'
static inline bool isoctal(wchar_t wch)
{
    if ( wch < L'0' )
        return false;

    return wch <= L'7';
}
1388
1389 size_t wxMBConvUTF8::FromWChar(char *buf, size_t n,
1390                                const wchar_t *psz, size_t srcLen) const
1391 {
1392     if ( m_options == MAP_INVALID_UTF8_NOT )
1393         return wxMBConvStrictUTF8::FromWChar(buf, n, psz, srcLen);
1394
1395     size_t len = 0;
1396
1397     // The length can be either given explicitly or computed implicitly for the
1398     // NUL-terminated strings.
1399     const bool isNulTerminated = srcLen == wxNO_LEN;
1400     while ((isNulTerminated ? *psz : srcLen--) && ((!buf) || (len < n)))
1401     {
1402         wxUint32 cc;
1403
1404 #ifdef WC_UTF16
1405         // cast is ok for WC_UTF16
1406         size_t pa = decode_utf16((const wxUint16 *)psz, cc);
1407         psz += (pa == wxCONV_FAILED) ? 1 : pa;
1408 #else
1409         cc = (*psz++) & 0x7fffffff;
1410 #endif
1411
1412         if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
1413                 && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
1414         {
1415             if (buf)
1416                 *buf++ = (char)(cc - wxUnicodePUA);
1417             len++;
1418         }
1419         else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL)
1420                     && cc == L'\\' && psz[0] == L'\\' )
1421         {
1422             if (buf)
1423                 *buf++ = (char)cc;
1424             psz++;
1425             len++;
1426         }
1427         else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
1428                     cc == L'\\' &&
1429                         isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
1430         {
1431             if (buf)
1432             {
1433                 *buf++ = (char) ((psz[0] - L'0') * 0100 +
1434                                  (psz[1] - L'0') * 010 +
1435                                  (psz[2] - L'0'));
1436             }
1437
1438             psz += 3;
1439             len++;
1440         }
1441         else
1442         {
1443             unsigned cnt;
1444             for (cnt = 0; cc > utf8_max[cnt]; cnt++)
1445             {
1446             }
1447
1448             if (!cnt)
1449             {
1450                 // plain ASCII char
1451                 if (buf)
1452                     *buf++ = (char) cc;
1453                 len++;
1454             }
1455             else
1456             {
1457                 len += cnt + 1;
1458                 if (buf)
1459                 {
1460                     *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
1461                     while (cnt--)
1462                         *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
1463                 }
1464             }
1465         }
1466     }
1467
1468     if ( isNulTerminated )
1469     {
1470         // Add the trailing NUL in this case if we have a large enough buffer.
1471         if ( buf && (len < n) )
1472             *buf = 0;
1473
1474         // And count it in any case.
1475         len++;
1476     }
1477
1478     return len;
1479 }
1480
1481 // ============================================================================
1482 // UTF-16
1483 // ============================================================================
1484
// the conversions in this section are implemented for the "straight"
// converter, which doesn't change the byte order of the UTF-16 data, and the
// "swap" one, which reverses it: map these names to the classes for the big
// and little endian encodings according to the endianness of this build
1485 #ifdef WORDS_BIGENDIAN
1486     #define wxMBConvUTF16straight wxMBConvUTF16BE
1487     #define wxMBConvUTF16swap     wxMBConvUTF16LE
1488 #else
1489     #define wxMBConvUTF16swap     wxMBConvUTF16BE
1490     #define wxMBConvUTF16straight wxMBConvUTF16LE
1491 #endif
1492
1493 /* static */
1494 size_t wxMBConvUTF16Base::GetLength(const char *src, size_t srcLen)
1495 {
1496     if ( srcLen == wxNO_LEN )
1497     {
1498         // count the number of bytes in input, including the trailing NULs
1499         const wxUint16 *inBuff = reinterpret_cast<const wxUint16 *>(src);
1500         for ( srcLen = 1; *inBuff++; srcLen++ )
1501             ;
1502
1503         srcLen *= BYTES_PER_CHAR;
1504     }
1505     else // we already have the length
1506     {
1507         // we can only convert an entire number of UTF-16 characters
1508         if ( srcLen % BYTES_PER_CHAR )
1509             return wxCONV_FAILED;
1510     }
1511
1512     return srcLen;
1513 }
1514
1515 // case when in-memory representation is UTF-16 too
1516 #ifdef WC_UTF16
1517
1518 // ----------------------------------------------------------------------------
1519 // conversions without endianness change
1520 // ----------------------------------------------------------------------------
1521
1522 size_t
1523 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1524                                const char *src, size_t srcLen) const
1525 {
1526     // set up the scene for using memcpy() (which is presumably more efficient
1527     // than copying the bytes one by one)
1528     srcLen = GetLength(src, srcLen);
1529     if ( srcLen == wxNO_LEN )
1530         return wxCONV_FAILED;
1531
1532     const size_t inLen = srcLen / BYTES_PER_CHAR;
1533     if ( dst )
1534     {
1535         if ( dstLen < inLen )
1536             return wxCONV_FAILED;
1537
1538         memcpy(dst, src, srcLen);
1539     }
1540
1541     return inLen;
1542 }
1543
1544 size_t
1545 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1546                                  const wchar_t *src, size_t srcLen) const
1547 {
1548     if ( srcLen == wxNO_LEN )
1549         srcLen = wxWcslen(src) + 1;
1550
1551     srcLen *= BYTES_PER_CHAR;
1552
1553     if ( dst )
1554     {
1555         if ( dstLen < srcLen )
1556             return wxCONV_FAILED;
1557
1558         memcpy(dst, src, srcLen);
1559     }
1560
1561     return srcLen;
1562 }
1563
1564 // ----------------------------------------------------------------------------
1565 // endian-reversing conversions
1566 // ----------------------------------------------------------------------------
1567
1568 size_t
1569 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1570                            const char *src, size_t srcLen) const
1571 {
1572     srcLen = GetLength(src, srcLen);
1573     if ( srcLen == wxNO_LEN )
1574         return wxCONV_FAILED;
1575
1576     srcLen /= BYTES_PER_CHAR;
1577
1578     if ( dst )
1579     {
1580         if ( dstLen < srcLen )
1581             return wxCONV_FAILED;
1582
1583         const wxUint16 *inBuff = reinterpret_cast<const wxUint16 *>(src);
1584         for ( size_t n = 0; n < srcLen; n++, inBuff++ )
1585         {
1586             *dst++ = wxUINT16_SWAP_ALWAYS(*inBuff);
1587         }
1588     }
1589
1590     return srcLen;
1591 }
1592
1593 size_t
1594 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1595                              const wchar_t *src, size_t srcLen) const
1596 {
1597     if ( srcLen == wxNO_LEN )
1598         srcLen = wxWcslen(src) + 1;
1599
1600     srcLen *= BYTES_PER_CHAR;
1601
1602     if ( dst )
1603     {
1604         if ( dstLen < srcLen )
1605             return wxCONV_FAILED;
1606
1607         wxUint16 *outBuff = reinterpret_cast<wxUint16 *>(dst);
1608         for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
1609         {
1610             *outBuff++ = wxUINT16_SWAP_ALWAYS(*src);
1611         }
1612     }
1613
1614     return srcLen;
1615 }
1616
1617 #else // !WC_UTF16: wchar_t is UTF-32
1618
1619 // ----------------------------------------------------------------------------
1620 // conversions without endianness change
1621 // ----------------------------------------------------------------------------
1622
1623 size_t
1624 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1625                                const char *src, size_t srcLen) const
1626 {
1627     srcLen = GetLength(src, srcLen);
1628     if ( srcLen == wxNO_LEN )
1629         return wxCONV_FAILED;
1630
1631     const size_t inLen = srcLen / BYTES_PER_CHAR;
1632     if ( !dst )
1633     {
1634         // optimization: return maximal space which could be needed for this
1635         // string even if the real size could be smaller if the buffer contains
1636         // any surrogates
1637         return inLen;
1638     }
1639
1640     size_t outLen = 0;
1641     const wxUint16 *inBuff = reinterpret_cast<const wxUint16 *>(src);
1642     for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1643     {
1644         const wxUint32 ch = wxDecodeSurrogate(&inBuff);
1645         if ( !inBuff )
1646             return wxCONV_FAILED;
1647
1648         if ( ++outLen > dstLen )
1649             return wxCONV_FAILED;
1650
1651         *dst++ = ch;
1652     }
1653
1654
1655     return outLen;
1656 }
1657
1658 size_t
1659 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1660                                  const wchar_t *src, size_t srcLen) const
1661 {
1662     if ( srcLen == wxNO_LEN )
1663         srcLen = wxWcslen(src) + 1;
1664
1665     size_t outLen = 0;
1666     wxUint16 *outBuff = reinterpret_cast<wxUint16 *>(dst);
1667     for ( size_t n = 0; n < srcLen; n++ )
1668     {
1669         wxUint16 cc[2] = { 0 };
1670         const size_t numChars = encode_utf16(*src++, cc);
1671         if ( numChars == wxCONV_FAILED )
1672             return wxCONV_FAILED;
1673
1674         outLen += numChars * BYTES_PER_CHAR;
1675         if ( outBuff )
1676         {
1677             if ( outLen > dstLen )
1678                 return wxCONV_FAILED;
1679
1680             *outBuff++ = cc[0];
1681             if ( numChars == 2 )
1682             {
1683                 // second character of a surrogate
1684                 *outBuff++ = cc[1];
1685             }
1686         }
1687     }
1688
1689     return outLen;
1690 }
1691
1692 // ----------------------------------------------------------------------------
1693 // endian-reversing conversions
1694 // ----------------------------------------------------------------------------
1695
1696 size_t
1697 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1698                            const char *src, size_t srcLen) const
1699 {
1700     srcLen = GetLength(src, srcLen);
1701     if ( srcLen == wxNO_LEN )
1702         return wxCONV_FAILED;
1703
1704     const size_t inLen = srcLen / BYTES_PER_CHAR;
1705     if ( !dst )
1706     {
1707         // optimization: return maximal space which could be needed for this
1708         // string even if the real size could be smaller if the buffer contains
1709         // any surrogates
1710         return inLen;
1711     }
1712
1713     size_t outLen = 0;
1714     const wxUint16 *inBuff = reinterpret_cast<const wxUint16 *>(src);
1715     for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1716     {
1717         wxUint32 ch;
1718         wxUint16 tmp[2];
1719
1720         tmp[0] = wxUINT16_SWAP_ALWAYS(*inBuff);
1721         inBuff++;
1722         tmp[1] = wxUINT16_SWAP_ALWAYS(*inBuff);
1723
1724         const size_t numChars = decode_utf16(tmp, ch);
1725         if ( numChars == wxCONV_FAILED )
1726             return wxCONV_FAILED;
1727
1728         if ( numChars == 2 )
1729             inBuff++;
1730
1731         if ( ++outLen > dstLen )
1732             return wxCONV_FAILED;
1733
1734         *dst++ = ch;
1735     }
1736
1737
1738     return outLen;
1739 }
1740
1741 size_t
1742 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1743                              const wchar_t *src, size_t srcLen) const
1744 {
1745     if ( srcLen == wxNO_LEN )
1746         srcLen = wxWcslen(src) + 1;
1747
1748     size_t outLen = 0;
1749     wxUint16 *outBuff = reinterpret_cast<wxUint16 *>(dst);
1750     for ( const wchar_t *srcEnd = src + srcLen; src < srcEnd; src++ )
1751     {
1752         wxUint16 cc[2] = { 0 };
1753         const size_t numChars = encode_utf16(*src, cc);
1754         if ( numChars == wxCONV_FAILED )
1755             return wxCONV_FAILED;
1756
1757         outLen += numChars * BYTES_PER_CHAR;
1758         if ( outBuff )
1759         {
1760             if ( outLen > dstLen )
1761                 return wxCONV_FAILED;
1762
1763             *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[0]);
1764             if ( numChars == 2 )
1765             {
1766                 // second character of a surrogate
1767                 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[1]);
1768             }
1769         }
1770     }
1771
1772     return outLen;
1773 }
1774
1775 #endif // WC_UTF16/!WC_UTF16
1776
1777
1778 // ============================================================================
1779 // UTF-32
1780 // ============================================================================
1781
// the conversions in this section are implemented for the "straight"
// converter, which doesn't change the byte order of the UTF-32 data, and the
// "swap" one, which reverses it: map these names to the classes for the big
// and little endian encodings according to the endianness of this build
1782 #ifdef WORDS_BIGENDIAN
1783     #define wxMBConvUTF32straight  wxMBConvUTF32BE
1784     #define wxMBConvUTF32swap      wxMBConvUTF32LE
1785 #else
1786     #define wxMBConvUTF32swap      wxMBConvUTF32BE
1787     #define wxMBConvUTF32straight  wxMBConvUTF32LE
1788 #endif
1789
1790
     // definitions of the global converter objects for both UTF-32 byte orders
1791 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
1792 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
1793
1794 /* static */
1795 size_t wxMBConvUTF32Base::GetLength(const char *src, size_t srcLen)
1796 {
1797     if ( srcLen == wxNO_LEN )
1798     {
1799         // count the number of bytes in input, including the trailing NULs
1800         const wxUint32 *inBuff = reinterpret_cast<const wxUint32 *>(src);
1801         for ( srcLen = 1; *inBuff++; srcLen++ )
1802             ;
1803
1804         srcLen *= BYTES_PER_CHAR;
1805     }
1806     else // we already have the length
1807     {
1808         // we can only convert an entire number of UTF-32 characters
1809         if ( srcLen % BYTES_PER_CHAR )
1810             return wxCONV_FAILED;
1811     }
1812
1813     return srcLen;
1814 }
1815
1816 // case when in-memory representation is UTF-16
1817 #ifdef WC_UTF16
1818
1819 // ----------------------------------------------------------------------------
1820 // conversions without endianness change
1821 // ----------------------------------------------------------------------------
1822
1823 size_t
1824 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1825                                const char *src, size_t srcLen) const
1826 {
1827     srcLen = GetLength(src, srcLen);
1828     if ( srcLen == wxNO_LEN )
1829         return wxCONV_FAILED;
1830
1831     const wxUint32 *inBuff = reinterpret_cast<const wxUint32 *>(src);
1832     const size_t inLen = srcLen / BYTES_PER_CHAR;
1833     size_t outLen = 0;
1834     for ( size_t n = 0; n < inLen; n++ )
1835     {
1836         wxUint16 cc[2] = { 0 };
1837         const size_t numChars = encode_utf16(*inBuff++, cc);
1838         if ( numChars == wxCONV_FAILED )
1839             return wxCONV_FAILED;
1840
1841         outLen += numChars;
1842         if ( dst )
1843         {
1844             if ( outLen > dstLen )
1845                 return wxCONV_FAILED;
1846
1847             *dst++ = cc[0];
1848             if ( numChars == 2 )
1849             {
1850                 // second character of a surrogate
1851                 *dst++ = cc[1];
1852             }
1853         }
1854     }
1855
1856     return outLen;
1857 }
1858
1859 size_t
1860 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1861                                  const wchar_t *src, size_t srcLen) const
1862 {
1863     if ( srcLen == wxNO_LEN )
1864         srcLen = wxWcslen(src) + 1;
1865
1866     if ( !dst )
1867     {
1868         // optimization: return maximal space which could be needed for this
1869         // string instead of the exact amount which could be less if there are
1870         // any surrogates in the input
1871         //
1872         // we consider that surrogates are rare enough to make it worthwhile to
1873         // avoid running the loop below at the cost of slightly extra memory
1874         // consumption
1875         return srcLen * BYTES_PER_CHAR;
1876     }
1877
1878     wxUint32 *outBuff = reinterpret_cast<wxUint32 *>(dst);
1879     size_t outLen = 0;
1880     for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1881     {
1882         const wxUint32 ch = wxDecodeSurrogate(&src);
1883         if ( !src )
1884             return wxCONV_FAILED;
1885
1886         outLen += BYTES_PER_CHAR;
1887
1888         if ( outLen > dstLen )
1889             return wxCONV_FAILED;
1890
1891         *outBuff++ = ch;
1892     }
1893
1894     return outLen;
1895 }
1896
1897 // ----------------------------------------------------------------------------
1898 // endian-reversing conversions
1899 // ----------------------------------------------------------------------------
1900
1901 size_t
1902 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1903                            const char *src, size_t srcLen) const
1904 {
1905     srcLen = GetLength(src, srcLen);
1906     if ( srcLen == wxNO_LEN )
1907         return wxCONV_FAILED;
1908
1909     const wxUint32 *inBuff = reinterpret_cast<const wxUint32 *>(src);
1910     const size_t inLen = srcLen / BYTES_PER_CHAR;
1911     size_t outLen = 0;
1912     for ( size_t n = 0; n < inLen; n++, inBuff++ )
1913     {
1914         wxUint16 cc[2] = { 0 };
1915         const size_t numChars = encode_utf16(wxUINT32_SWAP_ALWAYS(*inBuff), cc);
1916         if ( numChars == wxCONV_FAILED )
1917             return wxCONV_FAILED;
1918
1919         outLen += numChars;
1920         if ( dst )
1921         {
1922             if ( outLen > dstLen )
1923                 return wxCONV_FAILED;
1924
1925             *dst++ = cc[0];
1926             if ( numChars == 2 )
1927             {
1928                 // second character of a surrogate
1929                 *dst++ = cc[1];
1930             }
1931         }
1932     }
1933
1934     return outLen;
1935 }
1936
1937 size_t
1938 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1939                              const wchar_t *src, size_t srcLen) const
1940 {
1941     if ( srcLen == wxNO_LEN )
1942         srcLen = wxWcslen(src) + 1;
1943
1944     if ( !dst )
1945     {
1946         // optimization: return maximal space which could be needed for this
1947         // string instead of the exact amount which could be less if there are
1948         // any surrogates in the input
1949         //
1950         // we consider that surrogates are rare enough to make it worthwhile to
1951         // avoid running the loop below at the cost of slightly extra memory
1952         // consumption
1953         return srcLen*BYTES_PER_CHAR;
1954     }
1955
1956     wxUint32 *outBuff = reinterpret_cast<wxUint32 *>(dst);
1957     size_t outLen = 0;
1958     for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1959     {
1960         const wxUint32 ch = wxDecodeSurrogate(&src);
1961         if ( !src )
1962             return wxCONV_FAILED;
1963
1964         outLen += BYTES_PER_CHAR;
1965
1966         if ( outLen > dstLen )
1967             return wxCONV_FAILED;
1968
1969         *outBuff++ = wxUINT32_SWAP_ALWAYS(ch);
1970     }
1971
1972     return outLen;
1973 }
1974
1975 #else // !WC_UTF16: wchar_t is UTF-32
1976
1977 // ----------------------------------------------------------------------------
1978 // conversions without endianness change
1979 // ----------------------------------------------------------------------------
1980
1981 size_t
1982 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1983                                const char *src, size_t srcLen) const
1984 {
1985     // use memcpy() as it should be much faster than hand-written loop
1986     srcLen = GetLength(src, srcLen);
1987     if ( srcLen == wxNO_LEN )
1988         return wxCONV_FAILED;
1989
1990     const size_t inLen = srcLen/BYTES_PER_CHAR;
1991     if ( dst )
1992     {
1993         if ( dstLen < inLen )
1994             return wxCONV_FAILED;
1995
1996         memcpy(dst, src, srcLen);
1997     }
1998
1999     return inLen;
2000 }
2001
2002 size_t
2003 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
2004                                  const wchar_t *src, size_t srcLen) const
2005 {
2006     if ( srcLen == wxNO_LEN )
2007         srcLen = wxWcslen(src) + 1;
2008
2009     srcLen *= BYTES_PER_CHAR;
2010
2011     if ( dst )
2012     {
2013         if ( dstLen < srcLen )
2014             return wxCONV_FAILED;
2015
2016         memcpy(dst, src, srcLen);
2017     }
2018
2019     return srcLen;
2020 }
2021
2022 // ----------------------------------------------------------------------------
2023 // endian-reversing conversions
2024 // ----------------------------------------------------------------------------
2025
2026 size_t
2027 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
2028                            const char *src, size_t srcLen) const
2029 {
2030     srcLen = GetLength(src, srcLen);
2031     if ( srcLen == wxNO_LEN )
2032         return wxCONV_FAILED;
2033
2034     srcLen /= BYTES_PER_CHAR;
2035
2036     if ( dst )
2037     {
2038         if ( dstLen < srcLen )
2039             return wxCONV_FAILED;
2040
2041         const wxUint32 *inBuff = reinterpret_cast<const wxUint32 *>(src);
2042         for ( size_t n = 0; n < srcLen; n++, inBuff++ )
2043         {
2044             *dst++ = wxUINT32_SWAP_ALWAYS(*inBuff);
2045         }
2046     }
2047
2048     return srcLen;
2049 }
2050
2051 size_t
2052 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
2053                              const wchar_t *src, size_t srcLen) const
2054 {
2055     if ( srcLen == wxNO_LEN )
2056         srcLen = wxWcslen(src) + 1;
2057
2058     srcLen *= BYTES_PER_CHAR;
2059
2060     if ( dst )
2061     {
2062         if ( dstLen < srcLen )
2063             return wxCONV_FAILED;
2064
2065         wxUint32 *outBuff = reinterpret_cast<wxUint32 *>(dst);
2066         for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
2067         {
2068             *outBuff++ = wxUINT32_SWAP_ALWAYS(*src);
2069         }
2070     }
2071
2072     return srcLen;
2073 }
2074
2075 #endif // WC_UTF16/!WC_UTF16
2076
2077
2078 // ============================================================================
2079 // The classes doing conversion using the iconv_xxx() functions
2080 // ============================================================================
2081
2082 #ifdef HAVE_ICONV
2083
2084 // VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
2085 //     E2BIG if output buffer is _exactly_ as big as needed. Such case is
2086 //     (unless there's yet another bug in glibc) the only case when iconv()
2087 //     returns with (size_t)-1 (which means error) and says there are 0 bytes
2088 //     left in the input buffer -- when _real_ error occurs,
2089 //     bytes-left-in-input buffer is non-zero. Hence, this alternative test for
2090 //     iconv() failure.
2091 //     [This bug does not appear in glibc 2.2.]
// ICONV_FAILED(cres, bufLeft): true if an iconv() call which returned cres
// and left bufLeft bytes of input unconverted should be considered as having
// failed.
//
// Both arguments are fully parenthesized so that arbitrary expressions can be
// passed to the macro safely.
#if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
#define ICONV_FAILED(cres, bufLeft) (((cres) == (size_t)-1) && \
                                     (errno != E2BIG || (bufLeft) != 0))
#else
#define ICONV_FAILED(cres, bufLeft)  ((cres) == (size_t)-1)
#endif

// cast the address of an input pointer to the type expected by the second
// iconv() parameter: depending on the platform it may or not be const, which
// is what ICONV_CONST accounts for
#define ICONV_CHAR_CAST(x)  ((ICONV_CONST char **)(x))

// the value used for the conversion descriptors which couldn't be opened
#define ICONV_T_INVALID ((iconv_t)-1)
2102
// Select the helpers matching the size of wchar_t:
//  - WC_BSWAP is the macro reversing the byte order of a single wchar_t
//  - WC_ENC is the wxFontEncoding value used to look up the names of the
//    charset representing wchar_t
#if SIZEOF_WCHAR_T == 4
    #define WC_BSWAP    wxUINT32_SWAP_ALWAYS
    #define WC_ENC      wxFONTENCODING_UTF32
#elif SIZEOF_WCHAR_T == 2
    #define WC_BSWAP    wxUINT16_SWAP_ALWAYS
    #define WC_ENC      wxFONTENCODING_UTF16
#else // sizeof(wchar_t) != 2 nor 4
    // does this ever happen?
    #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
#endif
2113
2114 // ----------------------------------------------------------------------------
2115 // wxMBConv_iconv: encapsulates an iconv character set
2116 // ----------------------------------------------------------------------------
2117
// Conversion between a multibyte charset known to iconv and wchar_t.
//
// An object holds two iconv descriptors, one for each direction, opened for
// the charset name passed to the constructor. Use IsOk() to check that both
// of them could be opened before using the object.
class wxMBConv_iconv : public wxMBConv
{
public:
    // name is the charset name as understood by iconv_open(); a copy of it
    // is kept by the object
    wxMBConv_iconv(const char *name);
    virtual ~wxMBConv_iconv();

    // implement base class virtual methods
    virtual size_t ToWChar(wchar_t *dst, size_t dstLen,
                           const char *src, size_t srcLen = wxNO_LEN) const;
    virtual size_t FromWChar(char *dst, size_t dstLen,
                             const wchar_t *src, size_t srcLen = wxNO_LEN) const;
    virtual size_t GetMBNulLen() const;

#if wxUSE_UNICODE_UTF8
    virtual bool IsUTF8() const;
#endif

    // create a new object for the same charset: the iconv descriptors are
    // opened anew and only the cached NUL length is copied over
    virtual wxMBConv *Clone() const
    {
        wxMBConv_iconv *p = new wxMBConv_iconv(m_name);
        p->m_minMBCharWidth = m_minMBCharWidth;
        return p;
    }

    // true only if the conversions in both directions are available
    bool IsOk() const
        { return (m2w != ICONV_T_INVALID) && (w2m != ICONV_T_INVALID); }

protected:
    // the iconv handlers used to translate from multibyte
    // to wide char and in the other direction
    iconv_t m2w,
            w2m;

#if wxUSE_THREADS
    // guards access to m2w and w2m objects
    wxMutex m_iconvMutex;
#endif

private:
    // the name (for iconv_open()) of a wide char charset -- if none is
    // available on this machine, it will remain empty
    static wxString ms_wcCharsetName;

    // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
    // different endian-ness than the native one
    static bool ms_wcNeedsSwap;


    // name of the encoding handled by this conversion: this is a copy
    // allocated in the constructor and freed in the destructor
    const char *m_name;

    // cached result of GetMBNulLen(); set to 0 meaning "unknown"
    // initially
    size_t m_minMBCharWidth;
};
2173
2174 // make the constructor available for unit testing
2175 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_iconv( const char* name )
2176 {
2177     wxMBConv_iconv* result = new wxMBConv_iconv( name );
2178     if ( !result->IsOk() )
2179     {
2180         delete result;
2181         return 0;
2182     }
2183
2184     return result;
2185 }
2186
// Definitions of the static members: the charset name is initially empty and
// is filled in by the wxMBConv_iconv constructor when it finds a charset
// usable for wchar_t, the swap flag is set at the same time.
wxString wxMBConv_iconv::ms_wcCharsetName;
bool wxMBConv_iconv::ms_wcNeedsSwap = false;
2189
2190 wxMBConv_iconv::wxMBConv_iconv(const char *name)
2191               : m_name(wxStrdup(name))
2192 {
2193     m_minMBCharWidth = 0;
2194
2195     // check for charset that represents wchar_t:
2196     if ( ms_wcCharsetName.empty() )
2197     {
2198         wxLogTrace(TRACE_STRCONV, wxT("Looking for wide char codeset:"));
2199
2200 #if wxUSE_FONTMAP
2201         const wxChar *const *names = wxFontMapperBase::GetAllEncodingNames(WC_ENC);
2202 #else // !wxUSE_FONTMAP
2203         static const wxChar *const names_static[] =
2204         {
2205 #if SIZEOF_WCHAR_T == 4
2206             wxT("UCS-4"),
2207 #elif SIZEOF_WCHAR_T == 2
2208             wxT("UCS-2"),
2209 #endif
2210             NULL
2211         };
2212         const wxChar *const *names = names_static;
2213 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
2214
2215         for ( ; *names && ms_wcCharsetName.empty(); ++names )
2216         {
2217             const wxString nameCS(*names);
2218
2219             // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
2220             wxString nameXE(nameCS);
2221
2222 #ifdef WORDS_BIGENDIAN
2223                 nameXE += wxT("BE");
2224 #else // little endian
2225                 nameXE += wxT("LE");
2226 #endif
2227
2228             wxLogTrace(TRACE_STRCONV, wxT("  trying charset \"%s\""),
2229                        nameXE.c_str());
2230
2231             m2w = iconv_open(nameXE.ToAscii(), name);
2232             if ( m2w == ICONV_T_INVALID )
2233             {
2234                 // try charset w/o bytesex info (e.g. "UCS4")
2235                 wxLogTrace(TRACE_STRCONV, wxT("  trying charset \"%s\""),
2236                            nameCS.c_str());
2237                 m2w = iconv_open(nameCS.ToAscii(), name);
2238
2239                 // and check for bytesex ourselves:
2240                 if ( m2w != ICONV_T_INVALID )
2241                 {
2242                     char    buf[2], *bufPtr;
2243                     wchar_t wbuf[2];
2244                     size_t  insz, outsz;
2245                     size_t  res;
2246
2247                     buf[0] = 'A';
2248                     buf[1] = 0;
2249                     wbuf[0] = 0;
2250                     insz = 2;
2251                     outsz = SIZEOF_WCHAR_T * 2;
2252                     char* wbufPtr = (char*)wbuf;
2253                     bufPtr = buf;
2254
2255                     res = iconv(
2256                         m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
2257                         &wbufPtr, &outsz);
2258
2259                     if (ICONV_FAILED(res, insz))
2260                     {
2261                         wxLogLastError(wxT("iconv"));
2262                         wxLogError(_("Conversion to charset '%s' doesn't work."),
2263                                    nameCS.c_str());
2264                     }
2265                     else // ok, can convert to this encoding, remember it
2266                     {
2267                         ms_wcCharsetName = nameCS;
2268                         ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
2269                     }
2270                 }
2271             }
2272             else // use charset not requiring byte swapping
2273             {
2274                 ms_wcCharsetName = nameXE;
2275             }
2276         }
2277
2278         wxLogTrace(TRACE_STRCONV,
2279                    wxT("iconv wchar_t charset is \"%s\"%s"),
2280                    ms_wcCharsetName.empty() ? wxString("<none>")
2281                                             : ms_wcCharsetName,
2282                    ms_wcNeedsSwap ? wxT(" (needs swap)")
2283                                   : wxT(""));
2284     }
2285     else // we already have ms_wcCharsetName
2286     {
2287         m2w = iconv_open(ms_wcCharsetName.ToAscii(), name);
2288     }
2289
2290     if ( ms_wcCharsetName.empty() )
2291     {
2292         w2m = ICONV_T_INVALID;
2293     }
2294     else
2295     {
2296         w2m = iconv_open(name, ms_wcCharsetName.ToAscii());
2297         if ( w2m == ICONV_T_INVALID )
2298         {
2299             wxLogTrace(TRACE_STRCONV,
2300                        wxT("\"%s\" -> \"%s\" works but not the converse!?"),
2301                        ms_wcCharsetName.c_str(), name);
2302         }
2303     }
2304 }
2305
2306 wxMBConv_iconv::~wxMBConv_iconv()
2307 {
2308     free(const_cast<char *>(m_name));
2309
2310     if ( m2w != ICONV_T_INVALID )
2311         iconv_close(m2w);
2312     if ( w2m != ICONV_T_INVALID )
2313         iconv_close(w2m);
2314 }
2315
2316 size_t
2317 wxMBConv_iconv::ToWChar(wchar_t *dst, size_t dstLen,
2318                         const char *src, size_t srcLen) const
2319 {
2320     if ( srcLen == wxNO_LEN )
2321     {
2322         // find the string length: notice that must be done differently for
2323         // NUL-terminated strings and UTF-16/32 which are terminated with 2/4
2324         // consecutive NULs
2325         const size_t nulLen = GetMBNulLen();
2326         switch ( nulLen )
2327         {
2328             default:
2329                 return wxCONV_FAILED;
2330
2331             case 1:
2332                 srcLen = strlen(src); // arguably more optimized than our version
2333                 break;
2334
2335             case 2:
2336             case 4:
2337                 // for UTF-16/32 not only we need to have 2/4 consecutive NULs
2338                 // but they also have to start at character boundary and not
2339                 // span two adjacent characters
2340                 const char *p;
2341                 for ( p = src; NotAllNULs(p, nulLen); p += nulLen )
2342                     ;
2343                 srcLen = p - src;
2344                 break;
2345         }
2346
2347         // when we're determining the length of the string ourselves we count
2348         // the terminating NUL(s) as part of it and always NUL-terminate the
2349         // output
2350         srcLen += nulLen;
2351     }
2352
2353     // we express length in the number of (wide) characters but iconv always
2354     // counts buffer sizes it in bytes
2355     dstLen *= SIZEOF_WCHAR_T;
2356
2357 #if wxUSE_THREADS
2358     // NB: iconv() is MT-safe, but each thread must use its own iconv_t handle.
2359     //     Unfortunately there are a couple of global wxCSConv objects such as
2360     //     wxConvLocal that are used all over wx code, so we have to make sure
2361     //     the handle is used by at most one thread at the time. Otherwise
2362     //     only a few wx classes would be safe to use from non-main threads
2363     //     as MB<->WC conversion would fail "randomly".
2364     wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
2365 #endif // wxUSE_THREADS
2366
2367     size_t res, cres;
2368     const char *pszPtr = src;
2369
2370     if ( dst )
2371     {
2372         char* bufPtr = (char*)dst;
2373
2374         // have destination buffer, convert there
2375         size_t dstLenOrig = dstLen;
2376         cres = iconv(m2w,
2377                      ICONV_CHAR_CAST(&pszPtr), &srcLen,
2378                      &bufPtr, &dstLen);
2379
2380         // convert the number of bytes converted as returned by iconv to the
2381         // number of (wide) characters converted that we need
2382         res = (dstLenOrig - dstLen) / SIZEOF_WCHAR_T;
2383
2384         if (ms_wcNeedsSwap)
2385         {
2386             // convert to native endianness
2387             for ( unsigned i = 0; i < res; i++ )
2388                 dst[i] = WC_BSWAP(dst[i]);
2389         }
2390     }
2391     else // no destination buffer
2392     {
2393         // convert using temp buffer to calculate the size of the buffer needed
2394         wchar_t tbuf[256];
2395         res = 0;
2396
2397         do
2398         {
2399             char* bufPtr = (char*)tbuf;
2400             dstLen = 8 * SIZEOF_WCHAR_T;
2401
2402             cres = iconv(m2w,
2403                          ICONV_CHAR_CAST(&pszPtr), &srcLen,
2404                          &bufPtr, &dstLen );
2405
2406             res += 8 - (dstLen / SIZEOF_WCHAR_T);
2407         }
2408         while ((cres == (size_t)-1) && (errno == E2BIG));
2409     }
2410
2411     if (ICONV_FAILED(cres, srcLen))
2412     {
2413         //VS: it is ok if iconv fails, hence trace only
2414         wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
2415         return wxCONV_FAILED;
2416     }
2417
2418     return res;
2419 }
2420
2421 size_t wxMBConv_iconv::FromWChar(char *dst, size_t dstLen,
2422                                  const wchar_t *src, size_t srcLen) const
2423 {
2424 #if wxUSE_THREADS
2425     // NB: explained in MB2WC
2426     wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
2427 #endif
2428
2429     if ( srcLen == wxNO_LEN )
2430         srcLen = wxWcslen(src) + 1;
2431
2432     size_t inbuflen = srcLen * SIZEOF_WCHAR_T;
2433     size_t outbuflen = dstLen;
2434     size_t res, cres;
2435
2436     wchar_t *tmpbuf = 0;
2437
2438     if (ms_wcNeedsSwap)
2439     {
2440         // need to copy to temp buffer to switch endianness
2441         // (doing WC_BSWAP twice on the original buffer won't work, as it
2442         //  could be in read-only memory, or be accessed in some other thread)
2443         tmpbuf = (wchar_t *)malloc(inbuflen);
2444         for ( size_t i = 0; i < srcLen; i++ )
2445             tmpbuf[i] = WC_BSWAP(src[i]);
2446
2447         src = tmpbuf;
2448     }
2449
2450     char* inbuf = (char*)src;
2451     if ( dst )
2452     {
2453         // have destination buffer, convert there
2454         cres = iconv(w2m, ICONV_CHAR_CAST(&inbuf), &inbuflen, &dst, &outbuflen);
2455
2456         res = dstLen - outbuflen;
2457     }
2458     else // no destination buffer
2459     {
2460         // convert using temp buffer to calculate the size of the buffer needed
2461         char tbuf[256];
2462         res = 0;
2463         do
2464         {
2465             dst = tbuf;
2466             outbuflen = WXSIZEOF(tbuf);
2467
2468             cres = iconv(w2m, ICONV_CHAR_CAST(&inbuf), &inbuflen, &dst, &outbuflen);
2469
2470             res += WXSIZEOF(tbuf) - outbuflen;
2471         }
2472         while ((cres == (size_t)-1) && (errno == E2BIG));
2473     }
2474
2475     if (ms_wcNeedsSwap)
2476     {
2477         free(tmpbuf);
2478     }
2479
2480     if (ICONV_FAILED(cres, inbuflen))
2481     {
2482         wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
2483         return wxCONV_FAILED;
2484     }
2485
2486     return res;
2487 }
2488
2489 size_t wxMBConv_iconv::GetMBNulLen() const
2490 {
2491     if ( m_minMBCharWidth == 0 )
2492     {
2493         wxMBConv_iconv * const self = wxConstCast(this, wxMBConv_iconv);
2494
2495 #if wxUSE_THREADS
2496         // NB: explained in MB2WC
2497         wxMutexLocker lock(self->m_iconvMutex);
2498 #endif
2499
2500         const wchar_t *wnul = L"";
2501         char buf[8]; // should be enough for NUL in any encoding
2502         size_t inLen = sizeof(wchar_t),
2503                outLen = WXSIZEOF(buf);
2504         char *inBuff = (char *)wnul;
2505         char *outBuff = buf;
2506         if ( iconv(w2m, ICONV_CHAR_CAST(&inBuff), &inLen, &outBuff, &outLen) == (size_t)-1 )
2507         {
2508             self->m_minMBCharWidth = (size_t)-1;
2509         }
2510         else // ok
2511         {
2512             self->m_minMBCharWidth = outBuff - buf;
2513         }
2514     }
2515
2516     return m_minMBCharWidth;
2517 }
2518
2519 #if wxUSE_UNICODE_UTF8
2520 bool wxMBConv_iconv::IsUTF8() const
2521 {
2522     return wxStricmp(m_name, "UTF-8") == 0 ||
2523            wxStricmp(m_name, "UTF8") == 0;
2524 }
2525 #endif
2526
2527 #endif // HAVE_ICONV
2528
2529
2530 // ============================================================================
2531 // Win32 conversion classes
2532 // ============================================================================
2533
2534 #ifdef wxHAVE_WIN32_MB2WC
2535
2536 // from utils.cpp
2537 #if wxUSE_FONTMAP
// these functions are used by the wxMBConv_win32 constructors below to obtain
// the code page for a charset name or a font encoding; wxMBConv_win32::IsOk()
// treats the code page -1 as invalid
extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const char *charset);
extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
2540 #endif
2541
2542 class wxMBConv_win32 : public wxMBConv
2543 {
2544 public:
2545     wxMBConv_win32()
2546     {
2547         m_CodePage = CP_ACP;
2548         m_minMBCharWidth = 0;
2549     }
2550
2551     wxMBConv_win32(const wxMBConv_win32& conv)
2552         : wxMBConv()
2553     {
2554         m_CodePage = conv.m_CodePage;
2555         m_minMBCharWidth = conv.m_minMBCharWidth;
2556     }
2557
2558 #if wxUSE_FONTMAP
2559     wxMBConv_win32(const char* name)
2560     {
2561         m_CodePage = wxCharsetToCodepage(name);
2562         m_minMBCharWidth = 0;
2563     }
2564
2565     wxMBConv_win32(wxFontEncoding encoding)
2566     {
2567         m_CodePage = wxEncodingToCodepage(encoding);
2568         m_minMBCharWidth = 0;
2569     }
2570 #endif // wxUSE_FONTMAP
2571
2572     virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2573     {
2574         // note that we have to use MB_ERR_INVALID_CHARS flag as it without it
2575         // the behaviour is not compatible with the Unix version (using iconv)
2576         // and break the library itself, e.g. wxTextInputStream::NextChar()
2577         // wouldn't work if reading an incomplete MB char didn't result in an
2578         // error
2579         //
2580         // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
2581         // Win XP or newer and it is not supported for UTF-[78] so we always
2582         // use our own conversions in this case. See
2583         //     http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
2584         //     http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
2585         if ( m_CodePage == CP_UTF8 )
2586         {
2587             return wxMBConvUTF8().MB2WC(buf, psz, n);
2588         }
2589
2590         if ( m_CodePage == CP_UTF7 )
2591         {
2592             return wxMBConvUTF7().MB2WC(buf, psz, n);
2593         }
2594
2595         int flags = 0;
2596         if ( (m_CodePage < 50000 && m_CodePage != CP_SYMBOL) &&
2597                 IsAtLeastWin2kSP4() )
2598         {
2599             flags = MB_ERR_INVALID_CHARS;
2600         }
2601
2602         const size_t len = ::MultiByteToWideChar
2603                              (
2604                                 m_CodePage,     // code page
2605                                 flags,          // flags: fall on error
2606                                 psz,            // input string
2607                                 -1,             // its length (NUL-terminated)
2608                                 buf,            // output string
2609                                 buf ? n : 0     // size of output buffer
2610                              );
2611         if ( !len )
2612         {
2613             // function totally failed
2614             return wxCONV_FAILED;
2615         }
2616
2617         // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
2618         // check if we succeeded, by doing a double trip:
2619         if ( !flags && buf )
2620         {
2621             const size_t mbLen = strlen(psz);
2622             wxCharBuffer mbBuf(mbLen);
2623             if ( ::WideCharToMultiByte
2624                    (
2625                       m_CodePage,
2626                       0,
2627                       buf,
2628                       -1,
2629                       mbBuf.data(),
2630                       mbLen + 1,        // size in bytes, not length
2631                       NULL,
2632                       NULL
2633                    ) == 0 ||
2634                   strcmp(mbBuf, psz) != 0 )
2635             {
2636                 // we didn't obtain the same thing we started from, hence
2637                 // the conversion was lossy and we consider that it failed
2638                 return wxCONV_FAILED;
2639             }
2640         }
2641
2642         // note that it returns count of written chars for buf != NULL and size
2643         // of the needed buffer for buf == NULL so in either case the length of
2644         // the string (which never includes the terminating NUL) is one less
2645         return len - 1;
2646     }
2647
2648     virtual size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
2649     {
2650         /*
2651             we have a problem here: by default, WideCharToMultiByte() may
2652             replace characters unrepresentable in the target code page with bad
2653             quality approximations such as turning "1/2" symbol (U+00BD) into
2654             "1" for the code pages which don't have it and we, obviously, want
2655             to avoid this at any price
2656
2657             the trouble is that this function does it _silently_, i.e. it won't
2658             even tell us whether it did or not... Win98/2000 and higher provide
2659             WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
2660             we have to resort to a round trip, i.e. check that converting back
2661             results in the same string -- this is, of course, expensive but
2662             otherwise we simply can't be sure to not garble the data.
2663          */
2664
2665         // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
2666         // it doesn't work with CJK encodings (which we test for rather roughly
2667         // here...) nor with UTF-7/8 nor, of course, with Windows versions not
2668         // supporting it
2669         BOOL usedDef wxDUMMY_INITIALIZE(false);
2670         BOOL *pUsedDef;
2671         int flags;
2672         if ( CanUseNoBestFit() && m_CodePage < 50000 )
2673         {
2674             // it's our lucky day
2675             flags = WC_NO_BEST_FIT_CHARS;
2676             pUsedDef = &usedDef;
2677         }
2678         else // old system or unsupported encoding
2679         {
2680             flags = 0;
2681             pUsedDef = NULL;
2682         }
2683
2684         const size_t len = ::WideCharToMultiByte
2685                              (
2686                                 m_CodePage,     // code page
2687                                 flags,          // either none or no best fit
2688                                 pwz,            // input string
2689                                 -1,             // it is (wide) NUL-terminated
2690                                 buf,            // output buffer
2691                                 buf ? n : 0,    // and its size
2692                                 NULL,           // default "replacement" char
2693                                 pUsedDef        // [out] was it used?
2694                              );
2695
2696         if ( !len )
2697         {
2698             // function totally failed
2699             return wxCONV_FAILED;
2700         }
2701
2702         // we did something, check if we really succeeded
2703         if ( flags )
2704         {
2705             // check if the conversion failed, i.e. if any replacements
2706             // were done
2707             if ( usedDef )
2708                 return wxCONV_FAILED;
2709         }
2710         else // we must resort to double tripping...
2711         {
2712             // first we need to ensure that we really have the MB data: this is
2713             // not the case if we're called with NULL buffer, in which case we
2714             // need to do the conversion yet again
2715             wxCharBuffer bufDef;
2716             if ( !buf )
2717             {
2718                 bufDef = wxCharBuffer(len);
2719                 buf = bufDef.data();
2720                 if ( !::WideCharToMultiByte(m_CodePage, flags, pwz, -1,
2721                                             buf, len, NULL, NULL) )
2722                     return wxCONV_FAILED;
2723             }
2724
2725             if ( !n )
2726                 n = wcslen(pwz);
2727             wxWCharBuffer wcBuf(n);
2728             if ( MB2WC(wcBuf.data(), buf, n + 1) == wxCONV_FAILED ||
2729                     wcscmp(wcBuf, pwz) != 0 )
2730             {
2731                 // we didn't obtain the same thing we started from, hence
2732                 // the conversion was lossy and we consider that it failed
2733                 return wxCONV_FAILED;
2734             }
2735         }
2736
2737         // see the comment above for the reason of "len - 1"
2738         return len - 1;
2739     }
2740
2741     virtual size_t GetMBNulLen() const
2742     {
2743         if ( m_minMBCharWidth == 0 )
2744         {
2745             int len = ::WideCharToMultiByte
2746                         (
2747                             m_CodePage,     // code page
2748                             0,              // no flags
2749                             L"",            // input string
2750                             1,              // translate just the NUL
2751                             NULL,           // output buffer
2752                             0,              // and its size
2753                             NULL,           // no replacement char
2754                             NULL            // [out] don't care if it was used
2755                         );
2756
2757             wxMBConv_win32 * const self = wxConstCast(this, wxMBConv_win32);
2758             switch ( len )
2759             {
2760                 default:
2761                     wxLogDebug(wxT("Unexpected NUL length %d"), len);
2762                     self->m_minMBCharWidth = (size_t)-1;
2763                     break;
2764
2765                 case 0:
2766                     self->m_minMBCharWidth = (size_t)-1;
2767                     break;
2768
2769                 case 1:
2770                 case 2:
2771                 case 4:
2772                     self->m_minMBCharWidth = len;
2773                     break;
2774             }
2775         }
2776
2777         return m_minMBCharWidth;
2778     }
2779
2780     virtual wxMBConv *Clone() const { return new wxMBConv_win32(*this); }
2781
2782     bool IsOk() const { return m_CodePage != -1; }
2783
2784 private:
2785     static bool CanUseNoBestFit()
2786     {
2787         static int s_isWin98Or2k = -1;
2788
2789         if ( s_isWin98Or2k == -1 )
2790         {
2791             int verMaj, verMin;
2792             switch ( wxGetOsVersion(&verMaj, &verMin) )
2793             {
2794                 case wxOS_WINDOWS_9X:
2795                     s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
2796                     break;
2797
2798                 case wxOS_WINDOWS_NT:
2799                     s_isWin98Or2k = verMaj >= 5;
2800                     break;
2801
2802                 default:
2803                     // unknown: be conservative by default
2804                     s_isWin98Or2k = 0;
2805                     break;
2806             }
2807
2808             wxASSERT_MSG( s_isWin98Or2k != -1, wxT("should be set above") );
2809         }
2810
2811         return s_isWin98Or2k == 1;
2812     }
2813
2814     static bool IsAtLeastWin2kSP4()
2815     {
2816 #ifdef __WXWINCE__
2817         return false;
2818 #else
2819         static int s_isAtLeastWin2kSP4 = -1;
2820
2821         if ( s_isAtLeastWin2kSP4 == -1 )
2822         {
2823             OSVERSIONINFOEX ver;
2824
2825             memset(&ver, 0, sizeof(ver));
2826             ver.dwOSVersionInfoSize = sizeof(ver);
2827             GetVersionEx((OSVERSIONINFO*)&ver);
2828
2829             s_isAtLeastWin2kSP4 =
2830               ((ver.dwMajorVersion > 5) || // Vista+
2831                (ver.dwMajorVersion == 5 && ver.dwMinorVersion > 0) || // XP/2003
2832                (ver.dwMajorVersion == 5 && ver.dwMinorVersion == 0 &&
2833                ver.wServicePackMajor >= 4)) // 2000 SP4+
2834               ? 1 : 0;
2835         }
2836
2837         return s_isAtLeastWin2kSP4 == 1;
2838 #endif
2839     }
2840
2841
2842     // the code page we're working with
2843     long m_CodePage;
2844
2845     // cached result of GetMBNulLen(), set to 0 initially meaning
2846     // "unknown"
2847     size_t m_minMBCharWidth;
2848 };
2849
2850 #endif // wxHAVE_WIN32_MB2WC
2851
2852
2853 // ============================================================================
2854 // wxEncodingConverter based conversion classes
2855 // ============================================================================
2856
2857 #if wxUSE_FONTMAP
2858
2859 class wxMBConv_wxwin : public wxMBConv
2860 {
2861 private:
2862     void Init()
2863     {
2864         // Refuse to use broken wxEncodingConverter code for Mac-specific encodings.
2865         // The wxMBConv_cf class does a better job.
2866         m_ok = (m_enc < wxFONTENCODING_MACMIN || m_enc > wxFONTENCODING_MACMAX) &&
2867                m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
2868                w2m.Init(wxFONTENCODING_UNICODE, m_enc);
2869     }
2870
2871 public:
2872     // temporarily just use wxEncodingConverter stuff,
2873     // so that it works while a better implementation is built
2874     wxMBConv_wxwin(const char* name)
2875     {
2876         if (name)
2877             m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
2878         else
2879             m_enc = wxFONTENCODING_SYSTEM;
2880
2881         Init();
2882     }
2883
2884     wxMBConv_wxwin(wxFontEncoding enc)
2885     {
2886         m_enc = enc;
2887
2888         Init();
2889     }
2890
2891     size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
2892     {
2893         size_t inbuf = strlen(psz);
2894         if (buf)
2895         {
2896             if (!m2w.Convert(psz, buf))
2897                 return wxCONV_FAILED;
2898         }
2899         return inbuf;
2900     }
2901
2902     size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
2903     {
2904         const size_t inbuf = wxWcslen(psz);
2905         if (buf)
2906         {
2907             if (!w2m.Convert(psz, buf))
2908                 return wxCONV_FAILED;
2909         }
2910
2911         return inbuf;
2912     }
2913
2914     virtual size_t GetMBNulLen() const
2915     {
2916         switch ( m_enc )
2917         {
2918             case wxFONTENCODING_UTF16BE:
2919             case wxFONTENCODING_UTF16LE:
2920                 return 2;
2921
2922             case wxFONTENCODING_UTF32BE:
2923             case wxFONTENCODING_UTF32LE:
2924                 return 4;
2925
2926             default:
2927                 return 1;
2928         }
2929     }
2930
2931     virtual wxMBConv *Clone() const { return new wxMBConv_wxwin(m_enc); }
2932
2933     bool IsOk() const { return m_ok; }
2934
2935 public:
2936     wxFontEncoding m_enc;
2937     wxEncodingConverter m2w, w2m;
2938
2939 private:
2940     // were we initialized successfully?
2941     bool m_ok;
2942
2943     wxDECLARE_NO_COPY_CLASS(wxMBConv_wxwin);
2944 };
2945
2946 // make the constructors available for unit testing
2947 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_wxwin( const char* name )
2948 {
2949     wxMBConv_wxwin* result = new wxMBConv_wxwin( name );
2950     if ( !result->IsOk() )
2951     {
2952         delete result;
2953         return 0;
2954     }
2955
2956     return result;
2957 }
2958
2959 #endif // wxUSE_FONTMAP
2960
2961 // ============================================================================
2962 // wxCSConv implementation
2963 // ============================================================================
2964
2965 void wxCSConv::Init()
2966 {
2967     m_name = NULL;
2968     m_convReal =  NULL;
2969 }
2970
2971 void wxCSConv::SetEncoding(wxFontEncoding encoding)
2972 {
2973     switch ( encoding )
2974     {
2975         case wxFONTENCODING_MAX:
2976         case wxFONTENCODING_SYSTEM:
2977             if ( m_name )
2978             {
2979                 // It's ok to not have encoding value if we have a name for it.
2980                 m_encoding = wxFONTENCODING_SYSTEM;
2981             }
2982             else // No name neither.
2983             {
2984                 // Fall back to the system default encoding in this case (not
2985                 // sure how much sense does this make but this is how the old
2986                 // code used to behave).
2987 #if wxUSE_INTL
2988                 m_encoding = wxLocale::GetSystemEncoding();
2989                 if ( m_encoding == wxFONTENCODING_SYSTEM )
2990 #endif // wxUSE_INTL
2991                     m_encoding = wxFONTENCODING_ISO8859_1;
2992             }
2993             break;
2994
2995         case wxFONTENCODING_DEFAULT:
2996             // wxFONTENCODING_DEFAULT is same as US-ASCII in this context
2997             m_encoding = wxFONTENCODING_ISO8859_1;
2998             break;
2999
3000         default:
3001             // Just use the provided encoding.
3002             m_encoding = encoding;
3003     }
3004 }
3005
3006 wxCSConv::wxCSConv(const wxString& charset)
3007 {
3008     Init();
3009
3010     if ( !charset.empty() )
3011     {
3012         SetName(charset.ToAscii());
3013     }
3014
3015 #if wxUSE_FONTMAP
3016     SetEncoding(wxFontMapperBase::GetEncodingFromName(charset));
3017 #else
3018     SetEncoding(wxFONTENCODING_SYSTEM);
3019 #endif
3020
3021     m_convReal = DoCreate();
3022 }
3023
3024 wxCSConv::wxCSConv(wxFontEncoding encoding)
3025 {
3026     if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
3027     {
3028         wxFAIL_MSG( wxT("invalid encoding value in wxCSConv ctor") );
3029
3030         encoding = wxFONTENCODING_SYSTEM;
3031     }
3032
3033     Init();
3034
3035     SetEncoding(encoding);
3036
3037     m_convReal = DoCreate();
3038 }
3039
3040 wxCSConv::~wxCSConv()
3041 {
3042     Clear();
3043 }
3044
3045 wxCSConv::wxCSConv(const wxCSConv& conv)
3046         : wxMBConv()
3047 {
3048     Init();
3049
3050     SetName(conv.m_name);
3051     SetEncoding(conv.m_encoding);
3052
3053     m_convReal = DoCreate();
3054 }
3055
3056 wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
3057 {
3058     Clear();
3059
3060     SetName(conv.m_name);
3061     SetEncoding(conv.m_encoding);
3062
3063     m_convReal = DoCreate();
3064
3065     return *this;
3066 }
3067
3068 void wxCSConv::Clear()
3069 {
3070     free(m_name);
3071     m_name = NULL;
3072
3073     wxDELETE(m_convReal);
3074 }
3075
3076 void wxCSConv::SetName(const char *charset)
3077 {
3078     if ( charset )
3079         m_name = wxStrdup(charset);
3080 }
3081
#if wxUSE_FONTMAP

// Map from an encoding to a charset name, used by wxCSConv::DoCreate() to
// remember the outcome of looking for an iconv converter for this encoding:
// the value is the name for which the converter could be created or an empty
// string if none of the names of the encoding worked.
WX_DECLARE_HASH_MAP( wxFontEncoding, wxString, wxIntegerHash, wxIntegerEqual,
                     wxEncodingNameCache );

// the unique instance of the cache
static wxEncodingNameCache gs_nameCache;
#endif
3089
3090 wxMBConv *wxCSConv::DoCreate() const
3091 {
3092 #if wxUSE_FONTMAP
3093     wxLogTrace(TRACE_STRCONV,
3094                wxT("creating conversion for %s"),
3095                (m_name ? m_name
3096                        : (const char*)wxFontMapperBase::GetEncodingName(m_encoding).mb_str()));
3097 #endif // wxUSE_FONTMAP
3098
3099     // check for the special case of ASCII or ISO8859-1 charset: as we have
3100     // special knowledge of it anyhow, we don't need to create a special
3101     // conversion object
3102     if ( m_encoding == wxFONTENCODING_ISO8859_1 )
3103     {
3104         // don't convert at all
3105         return NULL;
3106     }
3107
3108     // we trust OS to do conversion better than we can so try external
3109     // conversion methods first
3110     //
3111     // the full order is:
3112     //      1. OS conversion (iconv() under Unix or Win32 API)
3113     //      2. hard coded conversions for UTF
3114     //      3. wxEncodingConverter as fall back
3115
3116     // step (1)
3117 #ifdef HAVE_ICONV
3118 #if !wxUSE_FONTMAP
3119     if ( m_name )
3120 #endif // !wxUSE_FONTMAP
3121     {
3122 #if wxUSE_FONTMAP
3123         wxFontEncoding encoding(m_encoding);
3124 #endif
3125
3126         if ( m_name )
3127         {
3128             wxMBConv_iconv *conv = new wxMBConv_iconv(m_name);
3129             if ( conv->IsOk() )
3130                 return conv;
3131
3132             delete conv;
3133
3134 #if wxUSE_FONTMAP
3135             encoding =
3136                 wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
3137 #endif // wxUSE_FONTMAP
3138         }
3139 #if wxUSE_FONTMAP
3140         {
3141             const wxEncodingNameCache::iterator it = gs_nameCache.find(encoding);
3142             if ( it != gs_nameCache.end() )
3143             {
3144                 if ( it->second.empty() )
3145                     return NULL;
3146
3147                 wxMBConv_iconv *conv = new wxMBConv_iconv(it->second.ToAscii());
3148                 if ( conv->IsOk() )
3149                     return conv;
3150
3151                 delete conv;
3152             }
3153
3154             const wxChar* const* names = wxFontMapperBase::GetAllEncodingNames(encoding);
3155             // CS : in case this does not return valid names (eg for MacRoman)
3156             // encoding got a 'failure' entry in the cache all the same,
3157             // although it just has to be created using a different method, so
3158             // only store failed iconv creation attempts (or perhaps we
3159             // shoulnd't do this at all ?)
3160             if ( names[0] != NULL )
3161             {
3162                 for ( ; *names; ++names )
3163                 {
3164                     // FIXME-UTF8: wxFontMapperBase::GetAllEncodingNames()
3165                     //             will need changes that will obsolete this
3166                     wxString name(*names);
3167                     wxMBConv_iconv *conv = new wxMBConv_iconv(name.ToAscii());
3168                     if ( conv->IsOk() )
3169                     {
3170                         gs_nameCache[encoding] = *names;
3171                         return conv;
3172                     }
3173
3174                     delete conv;
3175                 }
3176
3177                 gs_nameCache[encoding] = wxT(""); // cache the failure
3178             }
3179         }
3180 #endif // wxUSE_FONTMAP
3181     }
3182 #endif // HAVE_ICONV
3183
3184 #ifdef wxHAVE_WIN32_MB2WC
3185     {
3186 #if wxUSE_FONTMAP
3187         wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
3188                                       : new wxMBConv_win32(m_encoding);
3189         if ( conv->IsOk() )
3190             return conv;
3191
3192         delete conv;
3193 #else
3194         return NULL;
3195 #endif
3196     }
3197 #endif // wxHAVE_WIN32_MB2WC
3198
3199 #ifdef __DARWIN__
3200     {
3201         // leave UTF16 and UTF32 to the built-ins of wx
3202         if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
3203             ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
3204         {
3205 #if wxUSE_FONTMAP
3206             wxMBConv_cf *conv = m_name ? new wxMBConv_cf(m_name)
3207                                           : new wxMBConv_cf(m_encoding);
3208 #else
3209             wxMBConv_cf *conv = new wxMBConv_cf(m_encoding);
3210 #endif
3211
3212             if ( conv->IsOk() )
3213                  return conv;
3214
3215             delete conv;
3216         }
3217     }
3218 #endif // __DARWIN__
3219
3220     // step (2)
3221     wxFontEncoding enc = m_encoding;
3222 #if wxUSE_FONTMAP
3223     if ( enc == wxFONTENCODING_SYSTEM && m_name )
3224     {
3225         // use "false" to suppress interactive dialogs -- we can be called from
3226         // anywhere and popping up a dialog from here is the last thing we want to
3227         // do
3228         enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
3229     }
3230 #endif // wxUSE_FONTMAP
3231
3232     switch ( enc )
3233     {
3234         case wxFONTENCODING_UTF7:
3235              return new wxMBConvUTF7;
3236
3237         case wxFONTENCODING_UTF8:
3238              return new wxMBConvUTF8;
3239
3240         case wxFONTENCODING_UTF16BE:
3241              return new wxMBConvUTF16BE;
3242
3243         case wxFONTENCODING_UTF16LE:
3244              return new wxMBConvUTF16LE;
3245
3246         case wxFONTENCODING_UTF32BE:
3247              return new wxMBConvUTF32BE;
3248
3249         case wxFONTENCODING_UTF32LE:
3250              return new wxMBConvUTF32LE;
3251
3252         default:
3253              // nothing to do but put here to suppress gcc warnings
3254              break;
3255     }
3256
3257     // step (3)
3258 #if wxUSE_FONTMAP
3259     {
3260         wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
3261                                       : new wxMBConv_wxwin(m_encoding);
3262         if ( conv->IsOk() )
3263             return conv;
3264
3265         delete conv;
3266     }
3267
3268     wxLogTrace(TRACE_STRCONV,
3269                wxT("encoding \"%s\" is not supported by this system"),
3270                (m_name ? wxString(m_name)
3271                        : wxFontMapperBase::GetEncodingName(m_encoding)));
3272 #endif // wxUSE_FONTMAP
3273
3274     return NULL;
3275 }
3276
3277 bool wxCSConv::IsOk() const
3278 {
3279     // special case: no convReal created for wxFONTENCODING_ISO8859_1
3280     if ( m_encoding == wxFONTENCODING_ISO8859_1 )
3281         return true; // always ok as we do it ourselves
3282
3283     // m_convReal->IsOk() is called at its own creation, so we know it must
3284     // be ok if m_convReal is non-NULL
3285     return m_convReal != NULL;
3286 }
3287
3288 size_t wxCSConv::ToWChar(wchar_t *dst, size_t dstLen,
3289                          const char *src, size_t srcLen) const
3290 {
3291     if (m_convReal)
3292         return m_convReal->ToWChar(dst, dstLen, src, srcLen);
3293
3294     // latin-1 (direct)
3295     if ( srcLen == wxNO_LEN )
3296         srcLen = strlen(src) + 1; // take trailing NUL too
3297
3298     if ( dst )
3299     {
3300         if ( dstLen < srcLen )
3301             return wxCONV_FAILED;
3302
3303         for ( size_t n = 0; n < srcLen; n++ )
3304             dst[n] = (unsigned char)(src[n]);
3305     }
3306
3307     return srcLen;
3308 }
3309
3310 size_t wxCSConv::FromWChar(char *dst, size_t dstLen,
3311                            const wchar_t *src, size_t srcLen) const
3312 {
3313     if (m_convReal)
3314         return m_convReal->FromWChar(dst, dstLen, src, srcLen);
3315
3316     // latin-1 (direct)
3317     if ( srcLen == wxNO_LEN )
3318         srcLen = wxWcslen(src) + 1;
3319
3320     if ( dst )
3321     {
3322         if ( dstLen < srcLen )
3323             return wxCONV_FAILED;
3324
3325         for ( size_t n = 0; n < srcLen; n++ )
3326         {
3327             if ( src[n] > 0xFF )
3328                 return wxCONV_FAILED;
3329
3330             dst[n] = (char)src[n];
3331         }
3332
3333     }
3334     else // still need to check the input validity
3335     {
3336         for ( size_t n = 0; n < srcLen; n++ )
3337         {
3338             if ( src[n] > 0xFF )
3339                 return wxCONV_FAILED;
3340         }
3341     }
3342
3343     return srcLen;
3344 }
3345
3346 size_t wxCSConv::GetMBNulLen() const
3347 {
3348     if ( m_convReal )
3349         return m_convReal->GetMBNulLen();
3350
3351     // otherwise, we are ISO-8859-1
3352     return 1;
3353 }
3354
3355 #if wxUSE_UNICODE_UTF8
3356 bool wxCSConv::IsUTF8() const
3357 {
3358     if ( m_convReal )
3359         return m_convReal->IsUTF8();
3360
3361     // otherwise, we are ISO-8859-1
3362     return false;
3363 }
3364 #endif
3365
3366
3367 #if wxUSE_UNICODE
3368
3369 wxWCharBuffer wxSafeConvertMB2WX(const char *s)
3370 {
3371     if ( !s )
3372         return wxWCharBuffer();
3373
3374     wxWCharBuffer wbuf(wxConvLibc.cMB2WX(s));
3375     if ( !wbuf )
3376         wbuf = wxMBConvUTF8().cMB2WX(s);
3377     if ( !wbuf )
3378         wbuf = wxConvISO8859_1.cMB2WX(s);
3379
3380     return wbuf;
3381 }
3382
3383 wxCharBuffer wxSafeConvertWX2MB(const wchar_t *ws)
3384 {
3385     if ( !ws )
3386         return wxCharBuffer();
3387
3388     wxCharBuffer buf(wxConvLibc.cWX2MB(ws));
3389     if ( !buf )
3390         buf = wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL).cWX2MB(ws);
3391
3392     return buf;
3393 }
3394
3395 #endif // wxUSE_UNICODE
3396
3397 // ----------------------------------------------------------------------------
3398 // globals
3399 // ----------------------------------------------------------------------------
3400
// NB: The reason why we create the converter objects in this convoluted way,
//     using a factory function instead of a global variable, is that they
//     may be used at static initialization time (some of them are used by
//     wxString ctors and there may be a global wxString object). In other
//     words, possibly _before_ the global converter object would be
//     initialized.
3407
3408 #undef wxConvLibc
3409 #undef wxConvUTF8
3410 #undef wxConvUTF7
3411 #undef wxConvLocal
3412 #undef wxConvISO8859_1
3413
// Define a global converter called "name".
//
// This expands into 3 definitions:
//  - exported name##Ptr pointer of type klass* initialized with NULL
//  - exported wxGet_##name##Ptr() function returning the address of its
//    function-level static object of type impl_klass constructed with the
//    given ctor_args
//  - file-level static gs_##name##instance pointer initialized by calling
//    this function
//
// The macro invocation must be followed by a semicolon as the last
// definition doesn't include it.
#define WX_DEFINE_GLOBAL_CONV2(klass, impl_klass, name, ctor_args)      \
    WXDLLIMPEXP_DATA_BASE(klass*) name##Ptr = NULL;                     \
    WXDLLIMPEXP_BASE klass* wxGet_##name##Ptr()                         \
    {                                                                   \
        static impl_klass name##Obj ctor_args;                          \
        return &name##Obj;                                              \
    }                                                                   \
    /* this ensures that all global converter objects are created */    \
    /* by the time static initialization is done, i.e. before any */    \
    /* thread is launched: */                                           \
    static klass* gs_##name##instance = wxGet_##name##Ptr()

// Simplified version of the macro above for the common case when the class
// of the object is the same as the type of the pointer to it.
#define WX_DEFINE_GLOBAL_CONV(klass, name, ctor_args) \
    WX_DEFINE_GLOBAL_CONV2(klass, klass, name, ctor_args)
3428
3429 #ifdef __INTELC__
3430     // disable warning "variable 'xxx' was declared but never referenced"
3431     #pragma warning(disable: 177)
3432 #endif // Intel C++
3433
// wxConvLibc is implemented by wxMBConv_win32 under Windows and by
// wxMBConvLibc elsewhere (the wxMBConv_cf-based branch is disabled by
// "#elif 0")
#ifdef __WINDOWS__
    WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConv_win32, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
#elif 0 // defined(__WXOSX__)
    WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConv_cf, wxConvLibc,  (wxFONTENCODING_UTF8));
#else
    WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConvLibc, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
#endif

// NB: we can't use wxEMPTY_PARAMETER_VALUE as final argument here because it's
//     passed to WX_DEFINE_GLOBAL_CONV2 after a macro expansion and so still
//     provokes an error message about "not enough macro parameters"; and we
//     can't use "()" here as the name##Obj declaration would be parsed as a
//     function declaration then, so use a semicolon and live with an extra
//     empty statement (and hope that no compiler warns about this)
WX_DEFINE_GLOBAL_CONV(wxMBConvStrictUTF8, wxConvUTF8, ;);
WX_DEFINE_GLOBAL_CONV(wxMBConvUTF7, wxConvUTF7, ;);

// converters for the system encoding and for ISO8859-1
WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvLocal, (wxFONTENCODING_SYSTEM));
WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvISO8859_1, (wxFONTENCODING_ISO8859_1));

// these pointers initially refer to the wxConvLibc and wxConvLocal objects
// defined above
WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = wxGet_wxConvLibcPtr();
WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvUI = wxGet_wxConvLocalPtr();

#ifdef __DARWIN__
// It is important to use this conversion object under Darwin as it ensures
// that Unicode strings are (re)composed correctly even though xnu kernel uses
// decomposed form internally (at least for the file names).
static wxMBConv_cf wxConvMacUTF8DObj(wxFONTENCODING_UTF8);
#endif

// the converter used for the file names: the special object defined above
// under Darwin and the same one as wxConvLibc elsewhere
WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName =
#ifdef __DARWIN__
                                    &wxConvMacUTF8DObj;
#else // !__DARWIN__
                                    wxGet_wxConvLibcPtr();
#endif // __DARWIN__/!__DARWIN__