src/common/strconv.cpp

   1 /////////////////////////////////////////////////////////////////////////////
   2 // Name:        src/common/strconv.cpp
   3 // Purpose:     Unicode conversion classes
   4 // Author:      Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
   5 //              Ryan Norton, Fredrik Roubert (UTF7)
   6 // Modified by:
   7 // Created:     29/01/98
   8 // RCS-ID:      $Id$
   9 // Copyright:   (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
  10 //              (c) 2000-2003 Vadim Zeitlin
  11 //              (c) 2004 Ryan Norton, Fredrik Roubert
  12 // Licence:     wxWindows licence
  13 /////////////////////////////////////////////////////////////////////////////
  14
  15 // For compilers that support precompilation, includes "wx.h".
  16 #include "wx/wxprec.h"
  17
  18 #ifdef __BORLANDC__
  19     #pragma hdrstop
  20 #endif  //__BORLANDC__
  21
  22 #ifndef WX_PRECOMP
  23     #include "wx/intl.h"
  24     #include "wx/log.h"
  25     #include "wx/utils.h"
  26     #include "wx/hashmap.h"
  27 #endif
  28
  29 #include "wx/strconv.h"
  30
  31 #ifndef __WXWINCE__
  32 #include <errno.h>
  33 #endif
  34
  35 #include <ctype.h>
  36 #include <string.h>
  37 #include <stdlib.h>
  38
  39 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
  40     #include "wx/msw/private.h"
  41     #include "wx/msw/missing.h"
  42     #define wxHAVE_WIN32_MB2WC
  43 #endif
  44
  45 #ifdef HAVE_ICONV
  46     #include <iconv.h>
  47     #include "wx/thread.h"
  48 #endif
  49
  50 #include "wx/encconv.h"
  51 #include "wx/fontmap.h"
  52
  53 #ifdef __DARWIN__
  54 #include "wx/osx/core/private/strconv_cf.h"
  55 #endif //def __DARWIN__
  56
  57
  58 #define TRACE_STRCONV wxT("strconv")
  59
  60 // WC_UTF16 is defined only if sizeof(wchar_t) == 2, otherwise it's supposed to
  61 // be 4 bytes
  62 #if SIZEOF_WCHAR_T == 2
  63     #define WC_UTF16
  64 #endif
  65
  66
  67 // ============================================================================
  68 // implementation
  69 // ============================================================================
  70
  71 // helper function of cMB2WC(): check if n bytes at this location are all NUL
  72 static bool NotAllNULs(const char *p, size_t n)
  73 {
  74     while ( n && *p++ == '\0' )
  75         n--;
  76
  77     return n != 0;
  78 }
  79
  80 // ----------------------------------------------------------------------------
  81 // UTF-16 en/decoding to/from UCS-4 with surrogates handling
  82 // ----------------------------------------------------------------------------
  83
  84 static size_t encode_utf16(wxUint32 input, wxUint16 *output)
  85 {
  86     if (input <= 0xffff)
  87     {
  88         if (output)
  89             *output = (wxUint16) input;
  90
  91         return 1;
  92     }
  93     else if (input >= 0x110000)
  94     {
  95         return wxCONV_FAILED;
  96     }
  97     else
  98     {
  99         if (output)
 100         {
 101             *output++ = (wxUint16) ((input >> 10) + 0xd7c0);
 102             *output = (wxUint16) ((input & 0x3ff) + 0xdc00);
 103         }
 104
 105         return 2;
 106     }
 107 }
 108
 109 static size_t decode_utf16(const wxUint16* input, wxUint32& output)
 110 {
 111     if ((*input < 0xd800) || (*input > 0xdfff))
 112     {
 113         output = *input;
 114         return 1;
 115     }
 116     else if ((input[1] < 0xdc00) || (input[1] > 0xdfff))
 117     {
 118         output = *input;
 119         return wxCONV_FAILED;
 120     }
 121     else
 122     {
 123         output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
 124         return 2;
 125     }
 126 }
 127
 128 #ifdef WC_UTF16
 129     typedef wchar_t wxDecodeSurrogate_t;
 130 #else // !WC_UTF16
 131     typedef wxUint16 wxDecodeSurrogate_t;
 132 #endif // WC_UTF16/!WC_UTF16
 133
 134 // returns the next UTF-32 character from the wchar_t buffer and advances the
 135 // pointer to the character after this one
 136 //
 137 // if an invalid character is found, *pSrc is set to NULL, the caller must
 138 // check for this
 139 static wxUint32 wxDecodeSurrogate(const wxDecodeSurrogate_t **pSrc)
 140 {
 141     wxUint32 out;
 142     const size_t
 143         n = decode_utf16(reinterpret_cast<const wxUint16 *>(*pSrc), out);
 144     if ( n == wxCONV_FAILED )
 145         *pSrc = NULL;
 146     else
 147         *pSrc += n;
 148
 149     return out;
 150 }
 151
 152 // ----------------------------------------------------------------------------
 153 // wxMBConv
 154 // ----------------------------------------------------------------------------
 155
// Default implementation of ToWChar() in terms of the legacy MB2WC().
//
// Parameters:
//  dst     output buffer or NULL to only compute the required size
//  dstLen  size of dst in wide characters (only checked if dst is not NULL)
//  src     the multibyte input
//  srcLen  length of src in bytes or wxNO_LEN if src is NUL-terminated
//
// Returns the number of wide characters written to dst (or which would be
// written to it if it were not NULL) or wxCONV_FAILED if the conversion
// failed or dst is too small.
size_t
wxMBConv::ToWChar(wchar_t *dst, size_t dstLen,
                  const char *src, size_t srcLen) const
{
    // although new conversion classes are supposed to implement this function
    // directly, the existing ones only implement the old MB2WC() and so, to
    // avoid to have to rewrite all conversion classes at once, we provide a
    // default (but not efficient) implementation of this one in terms of the
    // old function by copying the input to ensure that it's NUL-terminated and
    // then using MB2WC() to convert it
    //
    // moreover, some conversion classes simply can't implement ToWChar()
    // directly, the primary example is wxConvLibc: mbstowcs() only handles
    // NUL-terminated strings

    // the number of chars [which would be] written to dst [if it were not NULL]
    size_t dstWritten = 0;

    // the number of NULs terminating this string
    size_t nulLen = 0;  // not really needed, but just to avoid warnings

    // if we were not given the input size we just have to assume that the
    // string is properly terminated as we have no way of knowing how long it
    // is anyhow, but if we do have the size check whether there are enough
    // NULs at the end
    wxCharBuffer bufTmp;
    const char *srcEnd;
    if ( srcLen != wxNO_LEN )
    {
        // we need to know how to find the end of this string
        nulLen = GetMBNulLen();
        if ( nulLen == wxCONV_FAILED )
            return wxCONV_FAILED;

        // if there are enough NULs we can avoid the copy
        if ( srcLen < nulLen || NotAllNULs(src + srcLen - nulLen, nulLen) )
        {
            // make a copy in order to properly NUL-terminate the string
            bufTmp = wxCharBuffer(srcLen + nulLen - 1 /* 1 will be added */);
            char * const p = bufTmp.data();
            memcpy(p, src, srcLen);

            // append nulLen NUL bytes after the copied data
            for ( char *s = p + srcLen; s < p + srcLen + nulLen; s++ )
                *s = '\0';

            src = bufTmp;
        }

        srcEnd = src + srcLen;
    }
    else // quit after the first loop iteration
    {
        // NULL srcEnd is used below as the "input is NUL-terminated" flag
        srcEnd = NULL;
    }

    // the idea of this code is straightforward: it converts a NUL-terminated
    // chunk of the string during each iteration and updates the output buffer
    // with the result
    //
    // all the complications come from the fact that this function, for
    // historical reasons, must behave in 2 subtly different ways when it's
    // called with a fixed number of characters and when it's called for the
    // entire NUL-terminated string: in the former case (srcEnd != NULL) we
    // must count all characters we convert, NUL or not; but in the latter we
    // do not count the trailing NUL -- but still count all the NULs inside the
    // string
    //
    // so for the (simple) former case we just always count the trailing NUL,
    // but for the latter we need to wait until we see if there is going to be
    // another loop iteration and only count it then
    for ( ;; )
    {
        // try to convert the current chunk
        size_t lenChunk = MB2WC(NULL, src, 0);
        if ( lenChunk == wxCONV_FAILED )
            return wxCONV_FAILED;

        dstWritten += lenChunk;

        // when no length was given, the terminator of the (only) chunk is
        // accounted for right here
        if ( !srcEnd )
            dstWritten++;

        if ( !lenChunk )
        {
            // nothing left in the input string, conversion succeeded
            break;
        }

        if ( dst )
        {
            if ( dstWritten > dstLen )
                return wxCONV_FAILED;

            // +1 is for trailing NUL
            if ( MB2WC(dst, src, lenChunk + 1) == wxCONV_FAILED )
                return wxCONV_FAILED;

            dst += lenChunk;
            if ( !srcEnd )
                dst++;
        }

        if ( !srcEnd )
        {
            // we convert just one chunk in this case as this is the entire
            // string anyhow (and we don't count the trailing NUL in this case)
            break;
        }

        // advance the input pointer past the end of this chunk: notice that we
        // will always stop before srcEnd because we know that the chunk is
        // always properly NUL-terminated
        while ( NotAllNULs(src, nulLen) )
        {
            // notice that we must skip over multiple bytes here as we suppose
            // that if NUL takes 2 or 4 bytes, then all the other characters do
            // too and so if advanced by a single byte we might erroneously
            // detect sequences of NUL bytes in the middle of the input
            src += nulLen;
        }

        // if the buffer ends before this NUL, we shouldn't count it in our
        // output so skip the code below
        if ( src == srcEnd )
            break;

        // do count this terminator as it's inside the buffer we convert
        dstWritten++;
        if ( dst )
            dst++;

        src += nulLen; // skip the terminator itself

        if ( src >= srcEnd )
            break;
    }

    return dstWritten;
}
 293
// Default implementation of FromWChar() in terms of the legacy WC2MB().
//
// Parameters:
//  dst     output buffer or NULL to only compute the required size
//  dstLen  size of dst in bytes (only checked if dst is not NULL)
//  src     the wide character input
//  srcLen  length of src in wide characters or wxNO_LEN if it is
//          NUL-terminated
//
// Returns the number of bytes written to dst (or which would be written to it
// if it were not NULL) or wxCONV_FAILED if the conversion failed or dst is
// too small.
size_t
wxMBConv::FromWChar(char *dst, size_t dstLen,
                    const wchar_t *src, size_t srcLen) const
{
    // the number of chars [which would be] written to dst [if it were not NULL]
    size_t dstWritten = 0;

    // if we don't know its length we have no choice but to assume that it is
    // NUL-terminated (notice that it can still be NUL-terminated even if
    // explicit length is given but it doesn't change our return value)
    const bool isNulTerminated = srcLen == wxNO_LEN;

    // make a copy of the input string unless it is already properly
    // NUL-terminated
    wxWCharBuffer bufTmp;
    if ( isNulTerminated )
    {
        // from now on srcLen includes the terminating NUL
        srcLen = wxWcslen(src) + 1;
    }
    else if ( srcLen != 0 && src[srcLen - 1] != L'\0' )
    {
        // make a copy in order to properly NUL-terminate the string
        bufTmp = wxWCharBuffer(srcLen);
        memcpy(bufTmp.data(), src, srcLen * sizeof(wchar_t));
        src = bufTmp;
    }

    // NOTE(review): unlike in ToWChar(), the result of GetMBNulLen() is not
    // compared with wxCONV_FAILED here -- confirm that this is intentional
    const size_t lenNul = GetMBNulLen();

    // each iteration converts one NUL-terminated chunk of the input
    for ( const wchar_t * const srcEnd = src + srcLen;
          src < srcEnd;
          src++ /* skip L'\0' too */ )
    {
        // try to convert the current chunk
        size_t lenChunk = WC2MB(NULL, src, 0);
        if ( lenChunk == wxCONV_FAILED )
            return wxCONV_FAILED;

        dstWritten += lenChunk;

        // position of the NUL terminating the current chunk
        const wchar_t * const
            chunkEnd = isNulTerminated ? srcEnd - 1 : src + wxWcslen(src);

        // our return value accounts for the trailing NUL(s), unlike that of
        // WC2MB(), however don't do it for the last NUL we artificially added
        // ourselves above
        if ( chunkEnd < srcEnd )
            dstWritten += lenNul;

        if ( dst )
        {
            if ( dstWritten > dstLen )
                return wxCONV_FAILED;

            // if we know that there is enough space in the destination buffer
            // (because we accounted for lenNul in dstWritten above), we can
            // convert directly in place -- but otherwise we need another
            // temporary buffer to ensure that we don't overwrite the output
            wxCharBuffer dstBuf;
            char *dstTmp;
            if ( chunkEnd == srcEnd )
            {
                dstBuf = wxCharBuffer(lenChunk + lenNul - 1);
                dstTmp = dstBuf.data();
            }
            else
            {
                dstTmp = dst;
            }

            if ( WC2MB(dstTmp, src, lenChunk + lenNul) == wxCONV_FAILED )
                return wxCONV_FAILED;

            if ( dstTmp != dst )
            {
                // copy everything up to but excluding the terminating NUL(s)
                // into the real output buffer
                memcpy(dst, dstTmp, lenChunk);

                // micro-optimization: if dstTmp != dst it means that chunkEnd
                // == srcEnd and so we're done, no need to update anything below
                break;
            }

            dst += lenChunk;
            if ( chunkEnd < srcEnd )
                dst += lenNul;
        }

        // continue from the terminator of this chunk (the loop increment
        // then skips over it)
        src = chunkEnd;
    }

    return dstWritten;
}
 387
 388 size_t wxMBConv::MB2WC(wchar_t *outBuff, const char *inBuff, size_t outLen) const
 389 {
 390     size_t rc = ToWChar(outBuff, outLen, inBuff);
 391     if ( rc != wxCONV_FAILED )
 392     {
 393         // ToWChar() returns the buffer length, i.e. including the trailing
 394         // NUL, while this method doesn't take it into account
 395         rc--;
 396     }
 397
 398     return rc;
 399 }
 400
 401 size_t wxMBConv::WC2MB(char *outBuff, const wchar_t *inBuff, size_t outLen) const
 402 {
 403     size_t rc = FromWChar(outBuff, outLen, inBuff);
 404     if ( rc != wxCONV_FAILED )
 405     {
 406         rc -= GetMBNulLen();
 407     }
 408
 409     return rc;
 410 }
 411
// Empty destructor, deliberately defined out of line in this file.
wxMBConv::~wxMBConv()
{
    // nothing to do here (necessary for Darwin linking probably)
}
 416
 417 const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
 418 {
 419     if ( psz )
 420     {
 421         // calculate the length of the buffer needed first
 422         const size_t nLen = ToWChar(NULL, 0, psz);
 423         if ( nLen != wxCONV_FAILED )
 424         {
 425             // now do the actual conversion
 426             wxWCharBuffer buf(nLen - 1 /* +1 added implicitly */);
 427
 428             // +1 for the trailing NULL
 429             if ( ToWChar(buf.data(), nLen, psz) != wxCONV_FAILED )
 430                 return buf;
 431         }
 432     }
 433
 434     return wxWCharBuffer();
 435 }
 436
 437 const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
 438 {
 439     if ( pwz )
 440     {
 441         const size_t nLen = FromWChar(NULL, 0, pwz);
 442         if ( nLen != wxCONV_FAILED )
 443         {
 444             wxCharBuffer buf(nLen - 1);
 445             if ( FromWChar(buf.data(), nLen, pwz) != wxCONV_FAILED )
 446                 return buf;
 447         }
 448     }
 449
 450     return wxCharBuffer();
 451 }
 452
 453 const wxWCharBuffer
 454 wxMBConv::cMB2WC(const char *inBuff, size_t inLen, size_t *outLen) const
 455 {
 456     const size_t dstLen = ToWChar(NULL, 0, inBuff, inLen);
 457     if ( dstLen != wxCONV_FAILED )
 458     {
 459         // notice that we allocate space for dstLen+1 wide characters here
 460         // because we want the buffer to always be NUL-terminated, even if the
 461         // input isn't (as otherwise the caller has no way to know its length)
 462         wxWCharBuffer wbuf(dstLen);
 463         wbuf.data()[dstLen] = L'\0';
 464         if ( ToWChar(wbuf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
 465         {
 466             if ( outLen )
 467             {
 468                 *outLen = dstLen;
 469
 470                 // we also need to handle NUL-terminated input strings
 471                 // specially: for them the output is the length of the string
 472                 // excluding the trailing NUL, however if we're asked to
 473                 // convert a specific number of characters we return the length
 474                 // of the resulting output even if it's NUL-terminated
 475                 if ( inLen == wxNO_LEN )
 476                     (*outLen)--;
 477             }
 478
 479             return wbuf;
 480         }
 481     }
 482
 483     if ( outLen )
 484         *outLen = 0;
 485
 486     return wxWCharBuffer();
 487 }
 488
 489 const wxCharBuffer
 490 wxMBConv::cWC2MB(const wchar_t *inBuff, size_t inLen, size_t *outLen) const
 491 {
 492     size_t dstLen = FromWChar(NULL, 0, inBuff, inLen);
 493     if ( dstLen != wxCONV_FAILED )
 494     {
 495         const size_t nulLen = GetMBNulLen();
 496
 497         // as above, ensure that the buffer is always NUL-terminated, even if
 498         // the input is not
 499         wxCharBuffer buf(dstLen + nulLen - 1);
 500         memset(buf.data() + dstLen, 0, nulLen);
 501         if ( FromWChar(buf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
 502         {
 503             if ( outLen )
 504             {
 505                 *outLen = dstLen;
 506
 507                 if ( inLen == wxNO_LEN )
 508                 {
 509                     // in this case both input and output are NUL-terminated
 510                     // and we're not supposed to count NUL
 511                     *outLen -= nulLen;
 512                 }
 513             }
 514
 515             return buf;
 516         }
 517     }
 518
 519     if ( outLen )
 520         *outLen = 0;
 521
 522     return wxCharBuffer();
 523 }
 524
 525 const wxWCharBuffer wxMBConv::cMB2WC(const wxScopedCharBuffer& buf) const
 526 {
 527     const size_t srcLen = buf.length();
 528     if ( srcLen )
 529     {
 530         const size_t dstLen = ToWChar(NULL, 0, buf, srcLen);
 531         if ( dstLen != wxCONV_FAILED )
 532         {
 533             wxWCharBuffer wbuf(dstLen);
 534             wbuf.data()[dstLen] = L'\0';
 535             if ( ToWChar(wbuf.data(), dstLen, buf, srcLen) != wxCONV_FAILED )
 536                 return wbuf;
 537         }
 538     }
 539
 540     return wxScopedWCharBuffer::CreateNonOwned(L"", 0);
 541 }
 542
 543 const wxCharBuffer wxMBConv::cWC2MB(const wxScopedWCharBuffer& wbuf) const
 544 {
 545     const size_t srcLen = wbuf.length();
 546     if ( srcLen )
 547     {
 548         const size_t dstLen = FromWChar(NULL, 0, wbuf, srcLen);
 549         if ( dstLen != wxCONV_FAILED )
 550         {
 551             wxCharBuffer buf(dstLen);
 552             buf.data()[dstLen] = '\0';
 553             if ( FromWChar(buf.data(), dstLen, wbuf, srcLen) != wxCONV_FAILED )
 554                 return buf;
 555         }
 556     }
 557
 558     return wxScopedCharBuffer::CreateNonOwned("", 0);
 559 }
 560
 561 // ----------------------------------------------------------------------------
 562 // wxMBConvLibc
 563 // ----------------------------------------------------------------------------
 564
// Multibyte to wide character conversion: forwards all its arguments to
// wxMB2WC() and returns its result unchanged.
size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
{
    return wxMB2WC(buf, psz, n);
}
 569
// Wide character to multibyte conversion: forwards all its arguments to
// wxWC2MB() and returns its result unchanged.
size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
{
    return wxWC2MB(buf, psz, n);
}
 574
 575 // ----------------------------------------------------------------------------
 576 // wxConvBrokenFileNames
 577 // ----------------------------------------------------------------------------
 578
 579 #ifdef __UNIX__
 580
 581 wxConvBrokenFileNames::wxConvBrokenFileNames(const wxString& charset)
 582 {
 583     if ( wxStricmp(charset, wxT("UTF-8")) == 0 ||
 584          wxStricmp(charset, wxT("UTF8")) == 0  )
 585         m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_PUA);
 586     else
 587         m_conv = new wxCSConv(charset);
 588 }
 589
 590 #endif // __UNIX__
 591
 592 // ----------------------------------------------------------------------------
 593 // UTF-7
 594 // ----------------------------------------------------------------------------
 595
 596 // Implementation (C) 2004 Fredrik Roubert
 597 //
 598 // Changes to work in streaming mode (C) 2008 Vadim Zeitlin
 599
//
// BASE64 decoding table
//
// Indexed by the byte value (0..255): gives the 6 bit value of the bytes
// which are BASE64 characters and 0xff for all the other ones.
//
static const unsigned char utf7unb64[] =
{
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 0x00..0x07
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 0x08..0x0f
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 0x10..0x17
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 0x18..0x1f
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 0x20..0x27
    0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f, // '+' and '/'
    0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, // '0'..'7'
    0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // '8', '9'
    0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, // 'A'..'G'
    0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, // 'H'..'O'
    0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, // 'P'..'W'
    0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff, // 'X'..'Z'
    0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20, // 'a'..'g'
    0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, // 'h'..'o'
    0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30, // 'p'..'w'
    0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff, // 'x'..'z'
    // 0x80..0xff: none of these bytes is a BASE64 character
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
};
 638
 639 size_t wxMBConvUTF7::ToWChar(wchar_t *dst, size_t dstLen,
 640                              const char *src, size_t srcLen) const
 641 {
 642     DecoderState stateOrig,
 643                 *statePtr;
 644     if ( srcLen == wxNO_LEN )
 645     {
 646         // convert the entire string, up to and including the trailing NUL
 647         srcLen = strlen(src) + 1;
 648
 649         // when working on the entire strings we don't update nor use the shift
 650         // state from the previous call
 651         statePtr = &stateOrig;
 652     }
 653     else // when working with partial strings we do use the shift state
 654     {
 655         statePtr = const_cast<DecoderState *>(&m_stateDecoder);
 656
 657         // also save the old state to be able to rollback to it on error
 658         stateOrig = m_stateDecoder;
 659     }
 660
 661     // but to simplify the code below we use this variable in both cases
 662     DecoderState& state = *statePtr;
 663
 664
 665     // number of characters [which would have been] written to dst [if it were
 666     // not NULL]
 667     size_t len = 0;
 668
 669     const char * const srcEnd = src + srcLen;
 670
 671     while ( (src < srcEnd) && (!dst || (len < dstLen)) )
 672     {
 673         const unsigned char cc = *src++;
 674
 675         if ( state.IsShifted() )
 676         {
 677             const unsigned char dc = utf7unb64[cc];
 678             if ( dc == 0xff )
 679             {
 680                 // end of encoded part, check that nothing was left: there can
 681                 // be up to 4 bits of 0 padding but nothing else (we also need
 682                 // to check isLSB as we count bits modulo 8 while a valid UTF-7
 683                 // encoded sequence must contain an integral number of UTF-16
 684                 // characters)
 685                 if ( state.isLSB || state.bit > 4 ||
 686                         (state.accum & ((1 << state.bit) - 1)) )
 687                 {
 688                     if ( !len )
 689                         state = stateOrig;
 690
 691                     return wxCONV_FAILED;
 692                 }
 693
 694                 state.ToDirect();
 695
 696                 // re-parse this character normally below unless it's '-' which
 697                 // is consumed by the decoder
 698                 if ( cc == '-' )
 699                     continue;
 700             }
 701             else // valid encoded character
 702             {
 703                 // mini base64 decoder: each character is 6 bits
 704                 state.bit += 6;
 705                 state.accum <<= 6;
 706                 state.accum += dc;
 707
 708                 if ( state.bit >= 8 )
 709                 {
 710                     // got the full byte, consume it
 711                     state.bit -= 8;
 712                     unsigned char b = (state.accum >> state.bit) & 0x00ff;
 713
 714                     if ( state.isLSB )
 715                     {
 716                         // we've got the full word, output it
 717                         if ( dst )
 718                             *dst++ = (state.msb << 8) | b;
 719                         len++;
 720                         state.isLSB = false;
 721                     }
 722                     else // MSB
 723                     {
 724                         // just store it while we wait for LSB
 725                         state.msb = b;
 726                         state.isLSB = true;
 727                     }
 728                 }
 729             }
 730         }
 731
 732         if ( state.IsDirect() )
 733         {
 734             // start of an encoded segment?
 735             if ( cc == '+' )
 736             {
 737                 if ( *src == '-' )
 738                 {
 739                     // just the encoded plus sign, don't switch to shifted mode
 740                     if ( dst )
 741                         *dst++ = '+';
 742                     len++;
 743                     src++;
 744                 }
 745                 else if ( utf7unb64[(unsigned)*src] == 0xff )
 746                 {
 747                     // empty encoded chunks are not allowed
 748                     if ( !len )
 749                         state = stateOrig;
 750
 751                     return wxCONV_FAILED;
 752                 }
 753                 else // base-64 encoded chunk follows
 754                 {
 755                     state.ToShifted();
 756                 }
 757             }
 758             else // not '+'
 759             {
 760                 // only printable 7 bit ASCII characters (with the exception of
 761                 // NUL, TAB, CR and LF) can be used directly
 762                 if ( cc >= 0x7f || (cc < ' ' &&
 763                       !(cc == '\0' || cc == '\t' || cc == '\r' || cc == '\n')) )
 764                     return wxCONV_FAILED;
 765
 766                 if ( dst )
 767                     *dst++ = cc;
 768                 len++;
 769             }
 770         }
 771     }
 772
 773     if ( !len )
 774     {
 775         // as we didn't read any characters we should be called with the same
 776         // data (followed by some more new data) again later so don't save our
 777         // state
 778         state = stateOrig;
 779
 780         return wxCONV_FAILED;
 781     }
 782
 783     return len;
 784 }
 785
//
// BASE64 encoding table
//
// Indexed by a 6 bit value (0..63): gives the character used to encode it.
//
static const unsigned char utf7enb64[] =
{
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
    'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
    'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
    'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
    'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
    'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
    'w', 'x', 'y', 'z', '0', '1', '2', '3',
    '4', '5', '6', '7', '8', '9', '+', '/'
};
 800
 801 //
 802 // UTF-7 encoding table
 803 //
 804 // 0 - Set D (directly encoded characters)
 805 // 1 - Set O (optional direct characters)
 806 // 2 - whitespace characters (optional)
 807 // 3 - special characters
 808 //
 809 static const unsigned char utf7encode[128] =
 810 {
 811     0, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
 812     3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
 813     2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
 814     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
 815     1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 816     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
 817     1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 818     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
 819 };
 820
 821 static inline bool wxIsUTF7Direct(wchar_t wc)
 822 {
 823     return wc < 0x80 && utf7encode[wc] < 1;
 824 }
 825
 826 size_t wxMBConvUTF7::FromWChar(char *dst, size_t dstLen,
 827                                const wchar_t *src, size_t srcLen) const
 828 {
 829     EncoderState stateOrig,
 830                 *statePtr;
 831     if ( srcLen == wxNO_LEN )
 832     {
 833         // we don't apply the stored state when operating on entire strings at
 834         // once
 835         statePtr = &stateOrig;
 836
 837         srcLen = wxWcslen(src) + 1;
 838     }
 839     else // do use the mode we left the output in previously
 840     {
 841         stateOrig = m_stateEncoder;
 842         statePtr = const_cast<EncoderState *>(&m_stateEncoder);
 843     }
 844
 845     EncoderState& state = *statePtr;
 846
 847
 848     size_t len = 0;
 849
 850     const wchar_t * const srcEnd = src + srcLen;
 851     while ( src < srcEnd && (!dst || len < dstLen) )
 852     {
 853         wchar_t cc = *src++;
 854         if ( wxIsUTF7Direct(cc) )
 855         {
 856             if ( state.IsShifted() )
 857             {
 858                 // pad with zeros the last encoded block if necessary
 859                 if ( state.bit )
 860                 {
 861                     if ( dst )
 862                         *dst++ = utf7enb64[((state.accum % 16) << (6 - state.bit)) % 64];
 863                     len++;
 864                 }
 865
 866                 state.ToDirect();
 867
 868                 if ( dst )
 869                     *dst++ = '-';
 870                 len++;
 871             }
 872
 873             if ( dst )
 874                 *dst++ = (char)cc;
 875             len++;
 876         }
 877         else if ( cc == '+' && state.IsDirect() )
 878         {
 879             if ( dst )
 880             {
 881                 *dst++ = '+';
 882                 *dst++ = '-';
 883             }
 884
 885             len += 2;
 886         }
 887 #ifndef WC_UTF16
 888         else if (((wxUint32)cc) > 0xffff)
 889         {
 890             // no surrogate pair generation (yet?)
 891             return wxCONV_FAILED;
 892         }
 893 #endif
 894         else
 895         {
 896             if ( state.IsDirect() )
 897             {
 898                 state.ToShifted();
 899
 900                 if ( dst )
 901                     *dst++ = '+';
 902                 len++;
 903             }
 904
 905             // BASE64 encode string
 906             for ( ;; )
 907             {
 908                 for ( unsigned lsb = 0; lsb < 2; lsb++ )
 909                 {
 910                     state.accum <<= 8;
 911                     state.accum += lsb ? cc & 0xff : (cc & 0xff00) >> 8;
 912
 913                     for (state.bit += 8; state.bit >= 6; )
 914                     {
 915                         state.bit -= 6;
 916                         if ( dst )
 917                             *dst++ = utf7enb64[(state.accum >> state.bit) % 64];
 918                         len++;
 919                     }
 920                 }
 921
 922                 if ( src == srcEnd || wxIsUTF7Direct(cc = *src) )
 923                     break;
 924
 925                 src++;
 926             }
 927         }
 928     }
 929
 930     // we need to restore the original encoder state if we were called just to
 931     // calculate the amount of space needed as we will presumably be called
 932     // again to really convert the data now
 933     if ( !dst )
 934         state = stateOrig;
 935
 936     return len;
 937 }
 938
 939 // ----------------------------------------------------------------------------
 940 // UTF-8
 941 // ----------------------------------------------------------------------------
 942
// Upper bounds used by the lenient wxMBConvUTF8 converter below: when
// encoding, the index of the first entry which is not smaller than the code
// value gives the number of continuation bytes to emit; when decoding, a value
// which doesn't exceed the entry for the next shorter sequence is treated as
// an illegal encoding. Any 32 bit value is <= the last entry, so the search
// for the matching entry always terminates.
static const wxUint32 utf8_max[]=
    { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
 945
// boundaries of the private use area range we use to (temporarily) remap the
// bytes which are invalid in a UTF-8 encoded string: such a byte is mapped to
// wxUnicodePUA plus the byte value, hence the range is 256 code points wide
// (wxUnicodePUAEnd itself is not part of it)
const wxUint32 wxUnicodePUA = 0x100000;
const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
 950
// this table gives the length of the UTF-8 encoding from its first character:
// it is indexed by the value of the first byte of a sequence and contains the
// total number of bytes in this sequence (1 to 4) or 0 if the byte can't start
// a sequence at all
const unsigned char tableUtf8Lengths[256] = {
    // single-byte sequences (ASCII):
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 00..0F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 10..1F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 20..2F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 30..3F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 40..4F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 50..5F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 60..6F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 70..7F

    // these are invalid:
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 80..8F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 90..9F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // A0..AF
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // B0..BF
    0, 0,                                            // C0,C1

    // two-byte sequences:
          2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // C2..CF
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // D0..DF

    // three-byte sequences:
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,  // E0..EF

    // four-byte sequences:
    4, 4, 4, 4, 4,                                   // F0..F4

    // these are invalid again (5- or 6-byte
    // sequences and sequences for code points
    // above U+10FFFF, as restricted by RFC 3629):
                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0   // F5..FF
};
 985
// Convert UTF-8 input to wide characters, failing on malformed input.
//
// dst and dstLen describe the output buffer (dstLen is in wchar_t units); if
// dstLen is 0 nothing is stored and only the output length is computed.
//
// src and srcLen describe the input; if srcLen is wxNO_LEN the input is
// NUL-terminated and its length, including the terminating NUL, is computed
// with strlen().
//
// Returns the number of wchar_t stored (or which would be stored) or
// wxCONV_FAILED if the output buffer is too small, a lead byte is invalid, a
// sequence is truncated or a continuation byte is malformed.
size_t
wxMBConvStrictUTF8::ToWChar(wchar_t *dst, size_t dstLen,
                            const char *src, size_t srcLen) const
{
    // a NULL "out" means that we are only computing the length
    wchar_t *out = dstLen ? dst : NULL;
    size_t written = 0;

    if ( srcLen == wxNO_LEN )
        srcLen = strlen(src) + 1;

    // NOTE(review): srcLen has just been replaced by an explicit length above,
    // so the tests for wxNO_LEN inside this loop look unreachable now
    for ( const char *p = src; ; p++ )
    {
        if ( (srcLen == wxNO_LEN ? !*p : !srcLen) )
        {
            // all done successfully, just add the trailing NULL if we are not
            // using explicit length
            if ( srcLen == wxNO_LEN )
            {
                if ( out )
                {
                    if ( !dstLen )
                        break;

                    *out = L'\0';
                }

                written++;
            }

            return written;
        }

        // reserve one output element for the character decoded below, fail if
        // the buffer is already full
        if ( out && !dstLen-- )
            break;

        wxUint32 code;
        unsigned char c = *p;

        if ( c < 0x80 )
        {
            if ( srcLen == 0 ) // the test works for wxNO_LEN too
                break;

            if ( srcLen != wxNO_LEN )
                srcLen--;

            code = c;
        }
        else
        {
            // total length of the sequence starting with this byte, 0 if the
            // byte can't start one
            unsigned len = tableUtf8Lengths[c];
            if ( !len )
                break;

            // fail if the sequence would extend beyond the end of the input
            if ( srcLen < len ) // the test works for wxNO_LEN too
                break;

            if ( srcLen != wxNO_LEN )
                srcLen -= len;

            //   Char. number range   |        UTF-8 octet sequence
            //      (hexadecimal)     |              (binary)
            //  ----------------------+----------------------------------------
            //  0000 0000 - 0000 007F | 0xxxxxxx
            //  0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
            //  0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
            //  0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
            //
            //  Code point value is stored in bits marked with 'x',
            //  lowest-order bit of the value on the right side in the diagram
            //  above.                                         (from RFC 3629)

            // mask to extract lead byte's value ('x' bits above), by sequence
            // length:
            static const unsigned char leadValueMask[] = { 0x7F, 0x1F, 0x0F, 0x07 };

            // mask and value of lead byte's most significant bits, by length:
            static const unsigned char leadMarkerMask[] = { 0x80, 0xE0, 0xF0, 0xF8 };
            static const unsigned char leadMarkerVal[] = { 0x00, 0xC0, 0xE0, 0xF0 };

            len--; // it's more convenient to work with 0-based length here

            // extract the lead byte's value bits:
            if ( (c & leadMarkerMask[len]) != leadMarkerVal[len] )
                break;

            code = c & leadValueMask[len];

            // all remaining bytes, if any, are handled in the same way
            // regardless of sequence's length:
            //
            // NOTE(review): only the 10xxxxxx marker of the continuation bytes
            // is checked, the decoded value itself is not: e.g. E0 80 80 or
            // the encodings of the surrogates seem to be accepted -- confirm
            // whether this is intended for the "strict" converter
            for ( ; len; --len )
            {
                c = *++p;
                if ( (c & 0xC0) != 0x80 )
                    return wxCONV_FAILED;

                code <<= 6;
                code |= c & 0x3F;
            }
        }

#ifdef WC_UTF16
        // cast is ok because wchar_t == wxUint16 if WC_UTF16
        //
        // NOTE(review): only one output element was reserved in dstLen for
        // this character above, presumably encode_utf16() stores two of them
        // when it returns 2 -- verify that this can't overflow the buffer
        if ( encode_utf16(code, (wxUint16 *)out) == 2 )
        {
            if ( out )
                out++;
            written++;
        }
#else // !WC_UTF16
        if ( out )
            *out = code;
#endif // WC_UTF16/!WC_UTF16

        if ( out )
            out++;

        written++;
    }

    // we only get here if the loop was left because of an error
    return wxCONV_FAILED;
}
1108
1109 size_t
1110 wxMBConvStrictUTF8::FromWChar(char *dst, size_t dstLen,
1111                               const wchar_t *src, size_t srcLen) const
1112 {
1113     char *out = dstLen ? dst : NULL;
1114     size_t written = 0;
1115
1116     for ( const wchar_t *wp = src; ; wp++ )
1117     {
1118         if ( (srcLen == wxNO_LEN ? !*wp : !srcLen) )
1119         {
1120             // all done successfully, just add the trailing NULL if we are not
1121             // using explicit length
1122             if ( srcLen == wxNO_LEN )
1123             {
1124                 if ( out )
1125                 {
1126                     if ( !dstLen )
1127                         break;
1128
1129                     *out = '\0';
1130                 }
1131
1132                 written++;
1133             }
1134
1135             return written;
1136         }
1137
1138         if ( srcLen != wxNO_LEN )
1139             srcLen--;
1140
1141         wxUint32 code;
1142 #ifdef WC_UTF16
1143         // cast is ok for WC_UTF16
1144         if ( decode_utf16((const wxUint16 *)wp, code) == 2 )
1145         {
1146             // skip the next char too as we decoded a surrogate
1147             wp++;
1148             if ( srcLen != wxNO_LEN )
1149                 srcLen--;
1150         }
1151 #else // wchar_t is UTF-32
1152         code = *wp & 0x7fffffff;
1153 #endif
1154
1155         unsigned len;
1156         if ( code <= 0x7F )
1157         {
1158             len = 1;
1159             if ( out )
1160             {
1161                 if ( dstLen < len )
1162                     break;
1163
1164                 out[0] = (char)code;
1165             }
1166         }
1167         else if ( code <= 0x07FF )
1168         {
1169             len = 2;
1170             if ( out )
1171             {
1172                 if ( dstLen < len )
1173                     break;
1174
1175                 // NB: this line takes 6 least significant bits, encodes them as
1176                 // 10xxxxxx and discards them so that the next byte can be encoded:
1177                 out[1] = 0x80 | (code & 0x3F);  code >>= 6;
1178                 out[0] = 0xC0 | code;
1179             }
1180         }
1181         else if ( code < 0xFFFF )
1182         {
1183             len = 3;
1184             if ( out )
1185             {
1186                 if ( dstLen < len )
1187                     break;
1188
1189                 out[2] = 0x80 | (code & 0x3F);  code >>= 6;
1190                 out[1] = 0x80 | (code & 0x3F);  code >>= 6;
1191                 out[0] = 0xE0 | code;
1192             }
1193         }
1194         else if ( code <= 0x10FFFF )
1195         {
1196             len = 4;
1197             if ( out )
1198             {
1199                 if ( dstLen < len )
1200                     break;
1201
1202                 out[3] = 0x80 | (code & 0x3F);  code >>= 6;
1203                 out[2] = 0x80 | (code & 0x3F);  code >>= 6;
1204                 out[1] = 0x80 | (code & 0x3F);  code >>= 6;
1205                 out[0] = 0xF0 | code;
1206             }
1207         }
1208         else
1209         {
1210             wxFAIL_MSG( wxT("trying to encode undefined Unicode character") );
1211             break;
1212         }
1213
1214         if ( out )
1215         {
1216             out += len;
1217             dstLen -= len;
1218         }
1219
1220         written += len;
1221     }
1222
1223     // we only get here if an error occurs during decoding
1224     return wxCONV_FAILED;
1225 }
1226
1227 size_t wxMBConvUTF8::ToWChar(wchar_t *buf, size_t n,
1228                              const char *psz, size_t srcLen) const
1229 {
1230     if ( m_options == MAP_INVALID_UTF8_NOT )
1231         return wxMBConvStrictUTF8::ToWChar(buf, n, psz, srcLen);
1232
1233     size_t len = 0;
1234
1235     // The length can be either given explicitly or computed implicitly for the
1236     // NUL-terminated strings.
1237     const bool isNulTerminated = srcLen == wxNO_LEN;
1238     while ((isNulTerminated ? *psz : srcLen--) && ((!buf) || (len < n)))
1239     {
1240         const char *opsz = psz;
1241         bool invalid = false;
1242         unsigned char cc = *psz++, fc = cc;
1243         unsigned cnt;
1244         for (cnt = 0; fc & 0x80; cnt++)
1245             fc <<= 1;
1246
1247         if (!cnt)
1248         {
1249             // plain ASCII char
1250             if (buf)
1251                 *buf++ = cc;
1252             len++;
1253
1254             // escape the escape character for octal escapes
1255             if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
1256                     && cc == '\\' && (!buf || len < n))
1257             {
1258                 if (buf)
1259                     *buf++ = cc;
1260                 len++;
1261             }
1262         }
1263         else
1264         {
1265             cnt--;
1266             if (!cnt)
1267             {
1268                 // invalid UTF-8 sequence
1269                 invalid = true;
1270             }
1271             else
1272             {
1273                 unsigned ocnt = cnt - 1;
1274                 wxUint32 res = cc & (0x3f >> cnt);
1275                 while (cnt--)
1276                 {
1277                     cc = *psz;
1278                     if ((cc & 0xC0) != 0x80)
1279                     {
1280                         // invalid UTF-8 sequence
1281                         invalid = true;
1282                         break;
1283                     }
1284
1285                     psz++;
1286                     res = (res << 6) | (cc & 0x3f);
1287                 }
1288
1289                 if (invalid || res <= utf8_max[ocnt])
1290                 {
1291                     // illegal UTF-8 encoding
1292                     invalid = true;
1293                 }
1294                 else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
1295                         res >= wxUnicodePUA && res < wxUnicodePUAEnd)
1296                 {
1297                     // if one of our PUA characters turns up externally
1298                     // it must also be treated as an illegal sequence
1299                     // (a bit like you have to escape an escape character)
1300                     invalid = true;
1301                 }
1302                 else
1303                 {
1304 #ifdef WC_UTF16
1305                     // cast is ok because wchar_t == wxUint16 if WC_UTF16
1306                     size_t pa = encode_utf16(res, (wxUint16 *)buf);
1307                     if (pa == wxCONV_FAILED)
1308                     {
1309                         invalid = true;
1310                     }
1311                     else
1312                     {
1313                         if (buf)
1314                             buf += pa;
1315                         len += pa;
1316                     }
1317 #else // !WC_UTF16
1318                     if (buf)
1319                         *buf++ = (wchar_t)res;
1320                     len++;
1321 #endif // WC_UTF16/!WC_UTF16
1322                 }
1323             }
1324
1325             if (invalid)
1326             {
1327                 if (m_options & MAP_INVALID_UTF8_TO_PUA)
1328                 {
1329                     while (opsz < psz && (!buf || len < n))
1330                     {
1331 #ifdef WC_UTF16
1332                         // cast is ok because wchar_t == wxUuint16 if WC_UTF16
1333                         size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
1334                         wxASSERT(pa != wxCONV_FAILED);
1335                         if (buf)
1336                             buf += pa;
1337                         opsz++;
1338                         len += pa;
1339 #else
1340                         if (buf)
1341                             *buf++ = (wchar_t)(wxUnicodePUA + (unsigned char)*opsz);
1342                         opsz++;
1343                         len++;
1344 #endif
1345                     }
1346                 }
1347                 else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
1348                 {
1349                     while (opsz < psz && (!buf || len < n))
1350                     {
1351                         if ( buf && len + 3 < n )
1352                         {
1353                             unsigned char on = *opsz;
1354                             *buf++ = L'\\';
1355                             *buf++ = (wchar_t)( L'0' + on / 0100 );
1356                             *buf++ = (wchar_t)( L'0' + (on % 0100) / 010 );
1357                             *buf++ = (wchar_t)( L'0' + on % 010 );
1358                         }
1359
1360                         opsz++;
1361                         len += 4;
1362                     }
1363                 }
1364                 else // MAP_INVALID_UTF8_NOT
1365                 {
1366                     return wxCONV_FAILED;
1367                 }
1368             }
1369         }
1370     }
1371
1372     if ( isNulTerminated )
1373     {
1374         // Add the trailing NUL in this case if we have a large enough buffer.
1375         if ( buf && (len < n) )
1376             *buf = 0;
1377
1378         // And count it in any case.
1379         len++;
1380     }
1381
1382     return len;
1383 }
1384
// check if the given wide character is one of the octal digits '0'..'7'
static inline bool isoctal(wchar_t wch)
{
    if ( wch < L'0' )
        return false;

    return wch <= L'7';
}
1389
1390 size_t wxMBConvUTF8::FromWChar(char *buf, size_t n,
1391                                const wchar_t *psz, size_t srcLen) const
1392 {
1393     if ( m_options == MAP_INVALID_UTF8_NOT )
1394         return wxMBConvStrictUTF8::FromWChar(buf, n, psz, srcLen);
1395
1396     size_t len = 0;
1397
1398     // The length can be either given explicitly or computed implicitly for the
1399     // NUL-terminated strings.
1400     const bool isNulTerminated = srcLen == wxNO_LEN;
1401     while ((isNulTerminated ? *psz : srcLen--) && ((!buf) || (len < n)))
1402     {
1403         wxUint32 cc;
1404
1405 #ifdef WC_UTF16
1406         // cast is ok for WC_UTF16
1407         size_t pa = decode_utf16((const wxUint16 *)psz, cc);
1408         psz += (pa == wxCONV_FAILED) ? 1 : pa;
1409 #else
1410         cc = (*psz++) & 0x7fffffff;
1411 #endif
1412
1413         if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
1414                 && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
1415         {
1416             if (buf)
1417                 *buf++ = (char)(cc - wxUnicodePUA);
1418             len++;
1419         }
1420         else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL)
1421                     && cc == L'\\' && psz[0] == L'\\' )
1422         {
1423             if (buf)
1424                 *buf++ = (char)cc;
1425             psz++;
1426             len++;
1427         }
1428         else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
1429                     cc == L'\\' &&
1430                         isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
1431         {
1432             if (buf)
1433             {
1434                 *buf++ = (char) ((psz[0] - L'0') * 0100 +
1435                                  (psz[1] - L'0') * 010 +
1436                                  (psz[2] - L'0'));
1437             }
1438
1439             psz += 3;
1440             len++;
1441         }
1442         else
1443         {
1444             unsigned cnt;
1445             for (cnt = 0; cc > utf8_max[cnt]; cnt++)
1446             {
1447             }
1448
1449             if (!cnt)
1450             {
1451                 // plain ASCII char
1452                 if (buf)
1453                     *buf++ = (char) cc;
1454                 len++;
1455             }
1456             else
1457             {
1458                 len += cnt + 1;
1459                 if (buf)
1460                 {
1461                     *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
1462                     while (cnt--)
1463                         *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
1464                 }
1465             }
1466         }
1467     }
1468
1469     if ( isNulTerminated )
1470     {
1471         // Add the trailing NUL in this case if we have a large enough buffer.
1472         if ( buf && (len < n) )
1473             *buf = 0;
1474
1475         // And count it in any case.
1476         len++;
1477     }
1478
1479     return len;
1480 }
1481
1482 // ============================================================================
1483 // UTF-16
1484 // ============================================================================
1485
// The "straight" converter is implemented without changing the byte order of
// the data and the "swap" one reverses it: map these internal names to the
// BE/LE classes depending on whether WORDS_BIGENDIAN is defined.
#ifdef WORDS_BIGENDIAN
    #define wxMBConvUTF16straight wxMBConvUTF16BE
    #define wxMBConvUTF16swap     wxMBConvUTF16LE
#else
    #define wxMBConvUTF16swap     wxMBConvUTF16BE
    #define wxMBConvUTF16straight wxMBConvUTF16LE
#endif
1493
1494 /* static */
1495 size_t wxMBConvUTF16Base::GetLength(const char *src, size_t srcLen)
1496 {
1497     if ( srcLen == wxNO_LEN )
1498     {
1499         // count the number of bytes in input, including the trailing NULs
1500         const wxUint16 *inBuff = reinterpret_cast<const wxUint16 *>(src);
1501         for ( srcLen = 1; *inBuff++; srcLen++ )
1502             ;
1503
1504         srcLen *= BYTES_PER_CHAR;
1505     }
1506     else // we already have the length
1507     {
1508         // we can only convert an entire number of UTF-16 characters
1509         if ( srcLen % BYTES_PER_CHAR )
1510             return wxCONV_FAILED;
1511     }
1512
1513     return srcLen;
1514 }
1515
1516 // case when in-memory representation is UTF-16 too
1517 #ifdef WC_UTF16
1518
1519 // ----------------------------------------------------------------------------
1520 // conversions without endianness change
1521 // ----------------------------------------------------------------------------
1522
1523 size_t
1524 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1525                                const char *src, size_t srcLen) const
1526 {
1527     // set up the scene for using memcpy() (which is presumably more efficient
1528     // than copying the bytes one by one)
1529     srcLen = GetLength(src, srcLen);
1530     if ( srcLen == wxNO_LEN )
1531         return wxCONV_FAILED;
1532
1533     const size_t inLen = srcLen / BYTES_PER_CHAR;
1534     if ( dst )
1535     {
1536         if ( dstLen < inLen )
1537             return wxCONV_FAILED;
1538
1539         memcpy(dst, src, srcLen);
1540     }
1541
1542     return inLen;
1543 }
1544
1545 size_t
1546 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1547                                  const wchar_t *src, size_t srcLen) const
1548 {
1549     if ( srcLen == wxNO_LEN )
1550         srcLen = wxWcslen(src) + 1;
1551
1552     srcLen *= BYTES_PER_CHAR;
1553
1554     if ( dst )
1555     {
1556         if ( dstLen < srcLen )
1557             return wxCONV_FAILED;
1558
1559         memcpy(dst, src, srcLen);
1560     }
1561
1562     return srcLen;
1563 }
1564
1565 // ----------------------------------------------------------------------------
1566 // endian-reversing conversions
1567 // ----------------------------------------------------------------------------
1568
1569 size_t
1570 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1571                            const char *src, size_t srcLen) const
1572 {
1573     srcLen = GetLength(src, srcLen);
1574     if ( srcLen == wxNO_LEN )
1575         return wxCONV_FAILED;
1576
1577     srcLen /= BYTES_PER_CHAR;
1578
1579     if ( dst )
1580     {
1581         if ( dstLen < srcLen )
1582             return wxCONV_FAILED;
1583
1584         const wxUint16 *inBuff = reinterpret_cast<const wxUint16 *>(src);
1585         for ( size_t n = 0; n < srcLen; n++, inBuff++ )
1586         {
1587             *dst++ = wxUINT16_SWAP_ALWAYS(*inBuff);
1588         }
1589     }
1590
1591     return srcLen;
1592 }
1593
1594 size_t
1595 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1596                              const wchar_t *src, size_t srcLen) const
1597 {
1598     if ( srcLen == wxNO_LEN )
1599         srcLen = wxWcslen(src) + 1;
1600
1601     srcLen *= BYTES_PER_CHAR;
1602
1603     if ( dst )
1604     {
1605         if ( dstLen < srcLen )
1606             return wxCONV_FAILED;
1607
1608         wxUint16 *outBuff = reinterpret_cast<wxUint16 *>(dst);
1609         for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
1610         {
1611             *outBuff++ = wxUINT16_SWAP_ALWAYS(*src);
1612         }
1613     }
1614
1615     return srcLen;
1616 }
1617
1618 #else // !WC_UTF16: wchar_t is UTF-32
1619
1620 // ----------------------------------------------------------------------------
1621 // conversions without endianness change
1622 // ----------------------------------------------------------------------------
1623
1624 size_t
1625 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1626                                const char *src, size_t srcLen) const
1627 {
1628     srcLen = GetLength(src, srcLen);
1629     if ( srcLen == wxNO_LEN )
1630         return wxCONV_FAILED;
1631
1632     const size_t inLen = srcLen / BYTES_PER_CHAR;
1633     if ( !dst )
1634     {
1635         // optimization: return maximal space which could be needed for this
1636         // string even if the real size could be smaller if the buffer contains
1637         // any surrogates
1638         return inLen;
1639     }
1640
1641     size_t outLen = 0;
1642     const wxUint16 *inBuff = reinterpret_cast<const wxUint16 *>(src);
1643     for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1644     {
1645         const wxUint32 ch = wxDecodeSurrogate(&inBuff);
1646         if ( !inBuff )
1647             return wxCONV_FAILED;
1648
1649         if ( ++outLen > dstLen )
1650             return wxCONV_FAILED;
1651
1652         *dst++ = ch;
1653     }
1654
1655
1656     return outLen;
1657 }
1658
1659 size_t
1660 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1661                                  const wchar_t *src, size_t srcLen) const
1662 {
1663     if ( srcLen == wxNO_LEN )
1664         srcLen = wxWcslen(src) + 1;
1665
1666     size_t outLen = 0;
1667     wxUint16 *outBuff = reinterpret_cast<wxUint16 *>(dst);
1668     for ( size_t n = 0; n < srcLen; n++ )
1669     {
1670         wxUint16 cc[2] = { 0 };
1671         const size_t numChars = encode_utf16(*src++, cc);
1672         if ( numChars == wxCONV_FAILED )
1673             return wxCONV_FAILED;
1674
1675         outLen += numChars * BYTES_PER_CHAR;
1676         if ( outBuff )
1677         {
1678             if ( outLen > dstLen )
1679                 return wxCONV_FAILED;
1680
1681             *outBuff++ = cc[0];
1682             if ( numChars == 2 )
1683             {
1684                 // second character of a surrogate
1685                 *outBuff++ = cc[1];
1686             }
1687         }
1688     }
1689
1690     return outLen;
1691 }
1692
1693 // ----------------------------------------------------------------------------
1694 // endian-reversing conversions
1695 // ----------------------------------------------------------------------------
1696
1697 size_t
1698 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1699                            const char *src, size_t srcLen) const
1700 {
1701     srcLen = GetLength(src, srcLen);
1702     if ( srcLen == wxNO_LEN )
1703         return wxCONV_FAILED;
1704
1705     const size_t inLen = srcLen / BYTES_PER_CHAR;
1706     if ( !dst )
1707     {
1708         // optimization: return maximal space which could be needed for this
1709         // string even if the real size could be smaller if the buffer contains
1710         // any surrogates
1711         return inLen;
1712     }
1713
1714     size_t outLen = 0;
1715     const wxUint16 *inBuff = reinterpret_cast<const wxUint16 *>(src);
1716     for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1717     {
1718         wxUint32 ch;
1719         wxUint16 tmp[2];
1720
1721         tmp[0] = wxUINT16_SWAP_ALWAYS(*inBuff);
1722         inBuff++;
1723         tmp[1] = wxUINT16_SWAP_ALWAYS(*inBuff);
1724
1725         const size_t numChars = decode_utf16(tmp, ch);
1726         if ( numChars == wxCONV_FAILED )
1727             return wxCONV_FAILED;
1728
1729         if ( numChars == 2 )
1730             inBuff++;
1731
1732         if ( ++outLen > dstLen )
1733             return wxCONV_FAILED;
1734
1735         *dst++ = ch;
1736     }
1737
1738
1739     return outLen;
1740 }
1741
1742 size_t
1743 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1744                              const wchar_t *src, size_t srcLen) const
1745 {
1746     if ( srcLen == wxNO_LEN )
1747         srcLen = wxWcslen(src) + 1;
1748
1749     size_t outLen = 0;
1750     wxUint16 *outBuff = reinterpret_cast<wxUint16 *>(dst);
1751     for ( const wchar_t *srcEnd = src + srcLen; src < srcEnd; src++ )
1752     {
1753         wxUint16 cc[2] = { 0 };
1754         const size_t numChars = encode_utf16(*src, cc);
1755         if ( numChars == wxCONV_FAILED )
1756             return wxCONV_FAILED;
1757
1758         outLen += numChars * BYTES_PER_CHAR;
1759         if ( outBuff )
1760         {
1761             if ( outLen > dstLen )
1762                 return wxCONV_FAILED;
1763
1764             *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[0]);
1765             if ( numChars == 2 )
1766             {
1767                 // second character of a surrogate
1768                 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[1]);
1769             }
1770         }
1771     }
1772
1773     return outLen;
1774 }
1775
1776 #endif // WC_UTF16/!WC_UTF16
1777
1778
1779 // ============================================================================
1780 // UTF-32
1781 // ============================================================================
1782
// The "straight" converter is implemented without changing the byte order of
// the data and the "swap" one reverses it: map these internal names to the
// BE/LE classes depending on whether WORDS_BIGENDIAN is defined.
#ifdef WORDS_BIGENDIAN
    #define wxMBConvUTF32straight  wxMBConvUTF32BE
    #define wxMBConvUTF32swap      wxMBConvUTF32LE
#else
    #define wxMBConvUTF32swap      wxMBConvUTF32BE
    #define wxMBConvUTF32straight  wxMBConvUTF32LE
#endif


// definitions of the global little and big endian UTF-32 converter objects
WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
1794
1795 /* static */
1796 size_t wxMBConvUTF32Base::GetLength(const char *src, size_t srcLen)
1797 {
1798     if ( srcLen == wxNO_LEN )
1799     {
1800         // count the number of bytes in input, including the trailing NULs
1801         const wxUint32 *inBuff = reinterpret_cast<const wxUint32 *>(src);
1802         for ( srcLen = 1; *inBuff++; srcLen++ )
1803             ;
1804
1805         srcLen *= BYTES_PER_CHAR;
1806     }
1807     else // we already have the length
1808     {
1809         // we can only convert an entire number of UTF-32 characters
1810         if ( srcLen % BYTES_PER_CHAR )
1811             return wxCONV_FAILED;
1812     }
1813
1814     return srcLen;
1815 }
1816
1817 // case when in-memory representation is UTF-16
1818 #ifdef WC_UTF16
1819
1820 // ----------------------------------------------------------------------------
1821 // conversions without endianness change
1822 // ----------------------------------------------------------------------------
1823
1824 size_t
1825 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1826                                const char *src, size_t srcLen) const
1827 {
1828     srcLen = GetLength(src, srcLen);
1829     if ( srcLen == wxNO_LEN )
1830         return wxCONV_FAILED;
1831
1832     const wxUint32 *inBuff = reinterpret_cast<const wxUint32 *>(src);
1833     const size_t inLen = srcLen / BYTES_PER_CHAR;
1834     size_t outLen = 0;
1835     for ( size_t n = 0; n < inLen; n++ )
1836     {
1837         wxUint16 cc[2] = { 0 };
1838         const size_t numChars = encode_utf16(*inBuff++, cc);
1839         if ( numChars == wxCONV_FAILED )
1840             return wxCONV_FAILED;
1841
1842         outLen += numChars;
1843         if ( dst )
1844         {
1845             if ( outLen > dstLen )
1846                 return wxCONV_FAILED;
1847
1848             *dst++ = cc[0];
1849             if ( numChars == 2 )
1850             {
1851                 // second character of a surrogate
1852                 *dst++ = cc[1];
1853             }
1854         }
1855     }
1856
1857     return outLen;
1858 }
1859
1860 size_t
1861 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1862                                  const wchar_t *src, size_t srcLen) const
1863 {
1864     if ( srcLen == wxNO_LEN )
1865         srcLen = wxWcslen(src) + 1;
1866
1867     if ( !dst )
1868     {
1869         // optimization: return maximal space which could be needed for this
1870         // string instead of the exact amount which could be less if there are
1871         // any surrogates in the input
1872         //
1873         // we consider that surrogates are rare enough to make it worthwhile to
1874         // avoid running the loop below at the cost of slightly extra memory
1875         // consumption
1876         return srcLen * BYTES_PER_CHAR;
1877     }
1878
1879     wxUint32 *outBuff = reinterpret_cast<wxUint32 *>(dst);
1880     size_t outLen = 0;
1881     for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1882     {
1883         const wxUint32 ch = wxDecodeSurrogate(&src);
1884         if ( !src )
1885             return wxCONV_FAILED;
1886
1887         outLen += BYTES_PER_CHAR;
1888
1889         if ( outLen > dstLen )
1890             return wxCONV_FAILED;
1891
1892         *outBuff++ = ch;
1893     }
1894
1895     return outLen;
1896 }
1897
1898 // ----------------------------------------------------------------------------
1899 // endian-reversing conversions
1900 // ----------------------------------------------------------------------------
1901
1902 size_t
1903 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1904                            const char *src, size_t srcLen) const
1905 {
1906     srcLen = GetLength(src, srcLen);
1907     if ( srcLen == wxNO_LEN )
1908         return wxCONV_FAILED;
1909
1910     const wxUint32 *inBuff = reinterpret_cast<const wxUint32 *>(src);
1911     const size_t inLen = srcLen / BYTES_PER_CHAR;
1912     size_t outLen = 0;
1913     for ( size_t n = 0; n < inLen; n++, inBuff++ )
1914     {
1915         wxUint16 cc[2] = { 0 };
1916         const size_t numChars = encode_utf16(wxUINT32_SWAP_ALWAYS(*inBuff), cc);
1917         if ( numChars == wxCONV_FAILED )
1918             return wxCONV_FAILED;
1919
1920         outLen += numChars;
1921         if ( dst )
1922         {
1923             if ( outLen > dstLen )
1924                 return wxCONV_FAILED;
1925
1926             *dst++ = cc[0];
1927             if ( numChars == 2 )
1928             {
1929                 // second character of a surrogate
1930                 *dst++ = cc[1];
1931             }
1932         }
1933     }
1934
1935     return outLen;
1936 }
1937
1938 size_t
1939 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1940                              const wchar_t *src, size_t srcLen) const
1941 {
1942     if ( srcLen == wxNO_LEN )
1943         srcLen = wxWcslen(src) + 1;
1944
1945     if ( !dst )
1946     {
1947         // optimization: return maximal space which could be needed for this
1948         // string instead of the exact amount which could be less if there are
1949         // any surrogates in the input
1950         //
1951         // we consider that surrogates are rare enough to make it worthwhile to
1952         // avoid running the loop below at the cost of slightly extra memory
1953         // consumption
1954         return srcLen*BYTES_PER_CHAR;
1955     }
1956
1957     wxUint32 *outBuff = reinterpret_cast<wxUint32 *>(dst);
1958     size_t outLen = 0;
1959     for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1960     {
1961         const wxUint32 ch = wxDecodeSurrogate(&src);
1962         if ( !src )
1963             return wxCONV_FAILED;
1964
1965         outLen += BYTES_PER_CHAR;
1966
1967         if ( outLen > dstLen )
1968             return wxCONV_FAILED;
1969
1970         *outBuff++ = wxUINT32_SWAP_ALWAYS(ch);
1971     }
1972
1973     return outLen;
1974 }
1975
1976 #else // !WC_UTF16: wchar_t is UTF-32
1977
1978 // ----------------------------------------------------------------------------
1979 // conversions without endianness change
1980 // ----------------------------------------------------------------------------
1981
1982 size_t
1983 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1984                                const char *src, size_t srcLen) const
1985 {
1986     // use memcpy() as it should be much faster than hand-written loop
1987     srcLen = GetLength(src, srcLen);
1988     if ( srcLen == wxNO_LEN )
1989         return wxCONV_FAILED;
1990
1991     const size_t inLen = srcLen/BYTES_PER_CHAR;
1992     if ( dst )
1993     {
1994         if ( dstLen < inLen )
1995             return wxCONV_FAILED;
1996
1997         memcpy(dst, src, srcLen);
1998     }
1999
2000     return inLen;
2001 }
2002
2003 size_t
2004 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
2005                                  const wchar_t *src, size_t srcLen) const
2006 {
2007     if ( srcLen == wxNO_LEN )
2008         srcLen = wxWcslen(src) + 1;
2009
2010     srcLen *= BYTES_PER_CHAR;
2011
2012     if ( dst )
2013     {
2014         if ( dstLen < srcLen )
2015             return wxCONV_FAILED;
2016
2017         memcpy(dst, src, srcLen);
2018     }
2019
2020     return srcLen;
2021 }
2022
2023 // ----------------------------------------------------------------------------
2024 // endian-reversing conversions
2025 // ----------------------------------------------------------------------------
2026
2027 size_t
2028 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
2029                            const char *src, size_t srcLen) const
2030 {
2031     srcLen = GetLength(src, srcLen);
2032     if ( srcLen == wxNO_LEN )
2033         return wxCONV_FAILED;
2034
2035     srcLen /= BYTES_PER_CHAR;
2036
2037     if ( dst )
2038     {
2039         if ( dstLen < srcLen )
2040             return wxCONV_FAILED;
2041
2042         const wxUint32 *inBuff = reinterpret_cast<const wxUint32 *>(src);
2043         for ( size_t n = 0; n < srcLen; n++, inBuff++ )
2044         {
2045             *dst++ = wxUINT32_SWAP_ALWAYS(*inBuff);
2046         }
2047     }
2048
2049     return srcLen;
2050 }
2051
2052 size_t
2053 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
2054                              const wchar_t *src, size_t srcLen) const
2055 {
2056     if ( srcLen == wxNO_LEN )
2057         srcLen = wxWcslen(src) + 1;
2058
2059     srcLen *= BYTES_PER_CHAR;
2060
2061     if ( dst )
2062     {
2063         if ( dstLen < srcLen )
2064             return wxCONV_FAILED;
2065
2066         wxUint32 *outBuff = reinterpret_cast<wxUint32 *>(dst);
2067         for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
2068         {
2069             *outBuff++ = wxUINT32_SWAP_ALWAYS(*src);
2070         }
2071     }
2072
2073     return srcLen;
2074 }
2075
2076 #endif // WC_UTF16/!WC_UTF16
2077
2078
2079 // ============================================================================
2080 // The classes doing conversion using the iconv_xxx() functions
2081 // ============================================================================
2082
2083 #ifdef HAVE_ICONV
2084
2085 // VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
2086 //     E2BIG if output buffer is _exactly_ as big as needed. Such case is
2087 //     (unless there's yet another bug in glibc) the only case when iconv()
2088 //     returns with (size_t)-1 (which means error) and says there are 0 bytes
2089 //     left in the input buffer -- when _real_ error occurs,
2090 //     bytes-left-in-input buffer is non-zero. Hence, this alternative test for
2091 //     iconv() failure.
2092 //     [This bug does not appear in glibc 2.2.]
// ICONV_FAILED(cres, bufLeft) tells whether an iconv() call which returned
// cres and left bufLeft bytes of input unconverted should be considered as
// having failed.
//
// The macro arguments are parenthesized so that arbitrary expressions can be
// passed without being broken up by operator precedence.
#if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
#define ICONV_FAILED(cres, bufLeft) (((cres) == (size_t)-1) && \
                                     (errno != E2BIG || (bufLeft) != 0))
#else
#define ICONV_FAILED(cres, bufLeft)  ((cres) == (size_t)-1)
#endif

// cast a pointer to the input buffer pointer to the type used for the second
// iconv() parameter, ICONV_CONST accounts for its constness
#define ICONV_CHAR_CAST(x)  ((ICONV_CONST char **)(x))

// the value indicating an invalid conversion descriptor
#define ICONV_T_INVALID ((iconv_t)-1)
2103
2104 #if SIZEOF_WCHAR_T == 4
2105     #define WC_BSWAP    wxUINT32_SWAP_ALWAYS
2106     #define WC_ENC      wxFONTENCODING_UTF32
2107 #elif SIZEOF_WCHAR_T == 2
2108     #define WC_BSWAP    wxUINT16_SWAP_ALWAYS
2109     #define WC_ENC      wxFONTENCODING_UTF16
2110 #else // sizeof(wchar_t) != 2 nor 4
2111     // does this ever happen?
2112     #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
2113 #endif
2114
2115 // ----------------------------------------------------------------------------
2116 // wxMBConv_iconv: encapsulates an iconv character set
2117 // ----------------------------------------------------------------------------
2118
2119 class wxMBConv_iconv : public wxMBConv
2120 {
2121 public:
2122     wxMBConv_iconv(const char *name);
2123     virtual ~wxMBConv_iconv();
2124
2125     // implement base class virtual methods
2126     virtual size_t ToWChar(wchar_t *dst, size_t dstLen,
2127                            const char *src, size_t srcLen = wxNO_LEN) const;
2128     virtual size_t FromWChar(char *dst, size_t dstLen,
2129                              const wchar_t *src, size_t srcLen = wxNO_LEN) const;
2130     virtual size_t GetMBNulLen() const;
2131
2132 #if wxUSE_UNICODE_UTF8
2133     virtual bool IsUTF8() const;
2134 #endif
2135
2136     virtual wxMBConv *Clone() const
2137     {
2138         wxMBConv_iconv *p = new wxMBConv_iconv(m_name);
2139         p->m_minMBCharWidth = m_minMBCharWidth;
2140         return p;
2141     }
2142
2143     bool IsOk() const
2144         { return (m2w != ICONV_T_INVALID) && (w2m != ICONV_T_INVALID); }
2145
2146 protected:
2147     // the iconv handlers used to translate from multibyte
2148     // to wide char and in the other direction
2149     iconv_t m2w,
2150             w2m;
2151
2152 #if wxUSE_THREADS
2153     // guards access to m2w and w2m objects
2154     wxMutex m_iconvMutex;
2155 #endif
2156
2157 private:
2158     // the name (for iconv_open()) of a wide char charset -- if none is
2159     // available on this machine, it will remain NULL
2160     static wxString ms_wcCharsetName;
2161
2162     // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
2163     // different endian-ness than the native one
2164     static bool ms_wcNeedsSwap;
2165
2166
2167     // name of the encoding handled by this conversion
2168     const char *m_name;
2169
2170     // cached result of GetMBNulLen(); set to 0 meaning "unknown"
2171     // initially
2172     size_t m_minMBCharWidth;
2173 };
2174
2175 // make the constructor available for unit testing
2176 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_iconv( const char* name )
2177 {
2178     wxMBConv_iconv* result = new wxMBConv_iconv( name );
2179     if ( !result->IsOk() )
2180     {
2181         delete result;
2182         return 0;
2183     }
2184
2185     return result;
2186 }
2187
2188 wxString wxMBConv_iconv::ms_wcCharsetName;
2189 bool wxMBConv_iconv::ms_wcNeedsSwap = false;
2190
2191 wxMBConv_iconv::wxMBConv_iconv(const char *name)
2192               : m_name(wxStrdup(name))
2193 {
2194     m_minMBCharWidth = 0;
2195
2196     // check for charset that represents wchar_t:
2197     if ( ms_wcCharsetName.empty() )
2198     {
2199         wxLogTrace(TRACE_STRCONV, wxT("Looking for wide char codeset:"));
2200
2201 #if wxUSE_FONTMAP
2202         const wxChar *const *names = wxFontMapperBase::GetAllEncodingNames(WC_ENC);
2203 #else // !wxUSE_FONTMAP
2204         static const wxChar *const names_static[] =
2205         {
2206 #if SIZEOF_WCHAR_T == 4
2207             wxT("UCS-4"),
2208 #elif SIZEOF_WCHAR_T == 2
2209             wxT("UCS-2"),
2210 #endif
2211             NULL
2212         };
2213         const wxChar *const *names = names_static;
2214 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
2215
2216         for ( ; *names && ms_wcCharsetName.empty(); ++names )
2217         {
2218             const wxString nameCS(*names);
2219
2220             // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
2221             wxString nameXE(nameCS);
2222
2223 #ifdef WORDS_BIGENDIAN
2224                 nameXE += wxT("BE");
2225 #else // little endian
2226                 nameXE += wxT("LE");
2227 #endif
2228
2229             wxLogTrace(TRACE_STRCONV, wxT("  trying charset \"%s\""),
2230                        nameXE.c_str());
2231
2232             m2w = iconv_open(nameXE.ToAscii(), name);
2233             if ( m2w == ICONV_T_INVALID )
2234             {
2235                 // try charset w/o bytesex info (e.g. "UCS4")
2236                 wxLogTrace(TRACE_STRCONV, wxT("  trying charset \"%s\""),
2237                            nameCS.c_str());
2238                 m2w = iconv_open(nameCS.ToAscii(), name);
2239
2240                 // and check for bytesex ourselves:
2241                 if ( m2w != ICONV_T_INVALID )
2242                 {
2243                     char    buf[2], *bufPtr;
2244                     wchar_t wbuf[2];
2245                     size_t  insz, outsz;
2246                     size_t  res;
2247
2248                     buf[0] = 'A';
2249                     buf[1] = 0;
2250                     wbuf[0] = 0;
2251                     insz = 2;
2252                     outsz = SIZEOF_WCHAR_T * 2;
2253                     char* wbufPtr = (char*)wbuf;
2254                     bufPtr = buf;
2255
2256                     res = iconv(
2257                         m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
2258                         &wbufPtr, &outsz);
2259
2260                     if (ICONV_FAILED(res, insz))
2261                     {
2262                         wxLogLastError(wxT("iconv"));
2263                         wxLogError(_("Conversion to charset '%s' doesn't work."),
2264                                    nameCS.c_str());
2265                     }
2266                     else // ok, can convert to this encoding, remember it
2267                     {
2268                         ms_wcCharsetName = nameCS;
2269                         ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
2270                     }
2271                 }
2272             }
2273             else // use charset not requiring byte swapping
2274             {
2275                 ms_wcCharsetName = nameXE;
2276             }
2277         }
2278
2279         wxLogTrace(TRACE_STRCONV,
2280                    wxT("iconv wchar_t charset is \"%s\"%s"),
2281                    ms_wcCharsetName.empty() ? wxString("<none>")
2282                                             : ms_wcCharsetName,
2283                    ms_wcNeedsSwap ? wxT(" (needs swap)")
2284                                   : wxT(""));
2285     }
2286     else // we already have ms_wcCharsetName
2287     {
2288         m2w = iconv_open(ms_wcCharsetName.ToAscii(), name);
2289     }
2290
2291     if ( ms_wcCharsetName.empty() )
2292     {
2293         w2m = ICONV_T_INVALID;
2294     }
2295     else
2296     {
2297         w2m = iconv_open(name, ms_wcCharsetName.ToAscii());
2298         if ( w2m == ICONV_T_INVALID )
2299         {
2300             wxLogTrace(TRACE_STRCONV,
2301                        wxT("\"%s\" -> \"%s\" works but not the converse!?"),
2302                        ms_wcCharsetName.c_str(), name);
2303         }
2304     }
2305 }
2306
2307 wxMBConv_iconv::~wxMBConv_iconv()
2308 {
2309     free(const_cast<char *>(m_name));
2310
2311     if ( m2w != ICONV_T_INVALID )
2312         iconv_close(m2w);
2313     if ( w2m != ICONV_T_INVALID )
2314         iconv_close(w2m);
2315 }
2316
2317 size_t
2318 wxMBConv_iconv::ToWChar(wchar_t *dst, size_t dstLen,
2319                         const char *src, size_t srcLen) const
2320 {
2321     if ( srcLen == wxNO_LEN )
2322     {
2323         // find the string length: notice that must be done differently for
2324         // NUL-terminated strings and UTF-16/32 which are terminated with 2/4
2325         // consecutive NULs
2326         const size_t nulLen = GetMBNulLen();
2327         switch ( nulLen )
2328         {
2329             default:
2330                 return wxCONV_FAILED;
2331
2332             case 1:
2333                 srcLen = strlen(src); // arguably more optimized than our version
2334                 break;
2335
2336             case 2:
2337             case 4:
2338                 // for UTF-16/32 not only we need to have 2/4 consecutive NULs
2339                 // but they also have to start at character boundary and not
2340                 // span two adjacent characters
2341                 const char *p;
2342                 for ( p = src; NotAllNULs(p, nulLen); p += nulLen )
2343                     ;
2344                 srcLen = p - src;
2345                 break;
2346         }
2347
2348         // when we're determining the length of the string ourselves we count
2349         // the terminating NUL(s) as part of it and always NUL-terminate the
2350         // output
2351         srcLen += nulLen;
2352     }
2353
2354     // we express length in the number of (wide) characters but iconv always
2355     // counts buffer sizes it in bytes
2356     dstLen *= SIZEOF_WCHAR_T;
2357
2358 #if wxUSE_THREADS
2359     // NB: iconv() is MT-safe, but each thread must use its own iconv_t handle.
2360     //     Unfortunately there are a couple of global wxCSConv objects such as
2361     //     wxConvLocal that are used all over wx code, so we have to make sure
2362     //     the handle is used by at most one thread at the time. Otherwise
2363     //     only a few wx classes would be safe to use from non-main threads
2364     //     as MB<->WC conversion would fail "randomly".
2365     wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
2366 #endif // wxUSE_THREADS
2367
2368     size_t res, cres;
2369     const char *pszPtr = src;
2370
2371     if ( dst )
2372     {
2373         char* bufPtr = (char*)dst;
2374
2375         // have destination buffer, convert there
2376         size_t dstLenOrig = dstLen;
2377         cres = iconv(m2w,
2378                      ICONV_CHAR_CAST(&pszPtr), &srcLen,
2379                      &bufPtr, &dstLen);
2380
2381         // convert the number of bytes converted as returned by iconv to the
2382         // number of (wide) characters converted that we need
2383         res = (dstLenOrig - dstLen) / SIZEOF_WCHAR_T;
2384
2385         if (ms_wcNeedsSwap)
2386         {
2387             // convert to native endianness
2388             for ( unsigned i = 0; i < res; i++ )
2389                 dst[i] = WC_BSWAP(dst[i]);
2390         }
2391     }
2392     else // no destination buffer
2393     {
2394         // convert using temp buffer to calculate the size of the buffer needed
2395         wchar_t tbuf[256];
2396         res = 0;
2397
2398         do
2399         {
2400             char* bufPtr = (char*)tbuf;
2401             dstLen = 8 * SIZEOF_WCHAR_T;
2402
2403             cres = iconv(m2w,
2404                          ICONV_CHAR_CAST(&pszPtr), &srcLen,
2405                          &bufPtr, &dstLen );
2406
2407             res += 8 - (dstLen / SIZEOF_WCHAR_T);
2408         }
2409         while ((cres == (size_t)-1) && (errno == E2BIG));
2410     }
2411
2412     if (ICONV_FAILED(cres, srcLen))
2413     {
2414         //VS: it is ok if iconv fails, hence trace only
2415         wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
2416         return wxCONV_FAILED;
2417     }
2418
2419     return res;
2420 }
2421
2422 size_t wxMBConv_iconv::FromWChar(char *dst, size_t dstLen,
2423                                  const wchar_t *src, size_t srcLen) const
2424 {
2425 #if wxUSE_THREADS
2426     // NB: explained in MB2WC
2427     wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
2428 #endif
2429
2430     if ( srcLen == wxNO_LEN )
2431         srcLen = wxWcslen(src) + 1;
2432
2433     size_t inbuflen = srcLen * SIZEOF_WCHAR_T;
2434     size_t outbuflen = dstLen;
2435     size_t res, cres;
2436
2437     wchar_t *tmpbuf = 0;
2438
2439     if (ms_wcNeedsSwap)
2440     {
2441         // need to copy to temp buffer to switch endianness
2442         // (doing WC_BSWAP twice on the original buffer won't work, as it
2443         //  could be in read-only memory, or be accessed in some other thread)
2444         tmpbuf = (wchar_t *)malloc(inbuflen);
2445         for ( size_t i = 0; i < srcLen; i++ )
2446             tmpbuf[i] = WC_BSWAP(src[i]);
2447
2448         src = tmpbuf;
2449     }
2450
2451     char* inbuf = (char*)src;
2452     if ( dst )
2453     {
2454         // have destination buffer, convert there
2455         cres = iconv(w2m, ICONV_CHAR_CAST(&inbuf), &inbuflen, &dst, &outbuflen);
2456
2457         res = dstLen - outbuflen;
2458     }
2459     else // no destination buffer
2460     {
2461         // convert using temp buffer to calculate the size of the buffer needed
2462         char tbuf[256];
2463         res = 0;
2464         do
2465         {
2466             dst = tbuf;
2467             outbuflen = WXSIZEOF(tbuf);
2468
2469             cres = iconv(w2m, ICONV_CHAR_CAST(&inbuf), &inbuflen, &dst, &outbuflen);
2470
2471             res += WXSIZEOF(tbuf) - outbuflen;
2472         }
2473         while ((cres == (size_t)-1) && (errno == E2BIG));
2474     }
2475
2476     if (ms_wcNeedsSwap)
2477     {
2478         free(tmpbuf);
2479     }
2480
2481     if (ICONV_FAILED(cres, inbuflen))
2482     {
2483         wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
2484         return wxCONV_FAILED;
2485     }
2486
2487     return res;
2488 }
2489
2490 size_t wxMBConv_iconv::GetMBNulLen() const
2491 {
2492     if ( m_minMBCharWidth == 0 )
2493     {
2494         wxMBConv_iconv * const self = wxConstCast(this, wxMBConv_iconv);
2495
2496 #if wxUSE_THREADS
2497         // NB: explained in MB2WC
2498         wxMutexLocker lock(self->m_iconvMutex);
2499 #endif
2500
2501         const wchar_t *wnul = L"";
2502         char buf[8]; // should be enough for NUL in any encoding
2503         size_t inLen = sizeof(wchar_t),
2504                outLen = WXSIZEOF(buf);
2505         char *inBuff = (char *)wnul;
2506         char *outBuff = buf;
2507         if ( iconv(w2m, ICONV_CHAR_CAST(&inBuff), &inLen, &outBuff, &outLen) == (size_t)-1 )
2508         {
2509             self->m_minMBCharWidth = (size_t)-1;
2510         }
2511         else // ok
2512         {
2513             self->m_minMBCharWidth = outBuff - buf;
2514         }
2515     }
2516
2517     return m_minMBCharWidth;
2518 }
2519
2520 #if wxUSE_UNICODE_UTF8
2521 bool wxMBConv_iconv::IsUTF8() const
2522 {
2523     return wxStricmp(m_name, "UTF-8") == 0 ||
2524            wxStricmp(m_name, "UTF8") == 0;
2525 }
2526 #endif
2527
2528 #endif // HAVE_ICONV
2529
2530
2531 // ============================================================================
2532 // Win32 conversion classes
2533 // ============================================================================
2534
2535 #ifdef wxHAVE_WIN32_MB2WC
2536
2537 // from utils.cpp
2538 #if wxUSE_FONTMAP
2539 extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const char *charset);
2540 extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
2541 #endif
2542
2543 class wxMBConv_win32 : public wxMBConv
2544 {
2545 public:
2546     wxMBConv_win32()
2547     {
2548         m_CodePage = CP_ACP;
2549         m_minMBCharWidth = 0;
2550     }
2551
2552     wxMBConv_win32(const wxMBConv_win32& conv)
2553         : wxMBConv()
2554     {
2555         m_CodePage = conv.m_CodePage;
2556         m_minMBCharWidth = conv.m_minMBCharWidth;
2557     }
2558
2559 #if wxUSE_FONTMAP
2560     wxMBConv_win32(const char* name)
2561     {
2562         m_CodePage = wxCharsetToCodepage(name);
2563         m_minMBCharWidth = 0;
2564     }
2565
2566     wxMBConv_win32(wxFontEncoding encoding)
2567     {
2568         m_CodePage = wxEncodingToCodepage(encoding);
2569         m_minMBCharWidth = 0;
2570     }
2571 #endif // wxUSE_FONTMAP
2572
2573     virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2574     {
2575         // note that we have to use MB_ERR_INVALID_CHARS flag as it without it
2576         // the behaviour is not compatible with the Unix version (using iconv)
2577         // and break the library itself, e.g. wxTextInputStream::NextChar()
2578         // wouldn't work if reading an incomplete MB char didn't result in an
2579         // error
2580         //
2581         // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
2582         // Win XP or newer and it is not supported for UTF-[78] so we always
2583         // use our own conversions in this case. See
2584         //     http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
2585         //     http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
2586         if ( m_CodePage == CP_UTF8 )
2587         {
2588             return wxMBConvUTF8().MB2WC(buf, psz, n);
2589         }
2590
2591         if ( m_CodePage == CP_UTF7 )
2592         {
2593             return wxMBConvUTF7().MB2WC(buf, psz, n);
2594         }
2595
2596         int flags = 0;
2597         if ( (m_CodePage < 50000 && m_CodePage != CP_SYMBOL) &&
2598                 IsAtLeastWin2kSP4() )
2599         {
2600             flags = MB_ERR_INVALID_CHARS;
2601         }
2602
2603         const size_t len = ::MultiByteToWideChar
2604                              (
2605                                 m_CodePage,     // code page
2606                                 flags,          // flags: fall on error
2607                                 psz,            // input string
2608                                 -1,             // its length (NUL-terminated)
2609                                 buf,            // output string
2610                                 buf ? n : 0     // size of output buffer
2611                              );
2612         if ( !len )
2613         {
2614             // function totally failed
2615             return wxCONV_FAILED;
2616         }
2617
2618         // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
2619         // check if we succeeded, by doing a double trip:
2620         if ( !flags && buf )
2621         {
2622             const size_t mbLen = strlen(psz);
2623             wxCharBuffer mbBuf(mbLen);
2624             if ( ::WideCharToMultiByte
2625                    (
2626                       m_CodePage,
2627                       0,
2628                       buf,
2629                       -1,
2630                       mbBuf.data(),
2631                       mbLen + 1,        // size in bytes, not length
2632                       NULL,
2633                       NULL
2634                    ) == 0 ||
2635                   strcmp(mbBuf, psz) != 0 )
2636             {
2637                 // we didn't obtain the same thing we started from, hence
2638                 // the conversion was lossy and we consider that it failed
2639                 return wxCONV_FAILED;
2640             }
2641         }
2642
2643         // note that it returns count of written chars for buf != NULL and size
2644         // of the needed buffer for buf == NULL so in either case the length of
2645         // the string (which never includes the terminating NUL) is one less
2646         return len - 1;
2647     }
2648
2649     virtual size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
2650     {
2651         /*
2652             we have a problem here: by default, WideCharToMultiByte() may
2653             replace characters unrepresentable in the target code page with bad
2654             quality approximations such as turning "1/2" symbol (U+00BD) into
2655             "1" for the code pages which don't have it and we, obviously, want
2656             to avoid this at any price
2657
2658             the trouble is that this function does it _silently_, i.e. it won't
2659             even tell us whether it did or not... Win98/2000 and higher provide
2660             WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
2661             we have to resort to a round trip, i.e. check that converting back
2662             results in the same string -- this is, of course, expensive but
2663             otherwise we simply can't be sure to not garble the data.
2664          */
2665
2666         // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
2667         // it doesn't work with CJK encodings (which we test for rather roughly
2668         // here...) nor with UTF-7/8 nor, of course, with Windows versions not
2669         // supporting it
2670         BOOL usedDef wxDUMMY_INITIALIZE(false);
2671         BOOL *pUsedDef;
2672         int flags;
2673         if ( CanUseNoBestFit() && m_CodePage < 50000 )
2674         {
2675             // it's our lucky day
2676             flags = WC_NO_BEST_FIT_CHARS;
2677             pUsedDef = &usedDef;
2678         }
2679         else // old system or unsupported encoding
2680         {
2681             flags = 0;
2682             pUsedDef = NULL;
2683         }
2684
2685         const size_t len = ::WideCharToMultiByte
2686                              (
2687                                 m_CodePage,     // code page
2688                                 flags,          // either none or no best fit
2689                                 pwz,            // input string
2690                                 -1,             // it is (wide) NUL-terminated
2691                                 buf,            // output buffer
2692                                 buf ? n : 0,    // and its size
2693                                 NULL,           // default "replacement" char
2694                                 pUsedDef        // [out] was it used?
2695                              );
2696
2697         if ( !len )
2698         {
2699             // function totally failed
2700             return wxCONV_FAILED;
2701         }
2702
2703         // we did something, check if we really succeeded
2704         if ( flags )
2705         {
2706             // check if the conversion failed, i.e. if any replacements
2707             // were done
2708             if ( usedDef )
2709                 return wxCONV_FAILED;
2710         }
2711         else // we must resort to double tripping...
2712         {
2713             // first we need to ensure that we really have the MB data: this is
2714             // not the case if we're called with NULL buffer, in which case we
2715             // need to do the conversion yet again
2716             wxCharBuffer bufDef;
2717             if ( !buf )
2718             {
2719                 bufDef = wxCharBuffer(len);
2720                 buf = bufDef.data();
2721                 if ( !::WideCharToMultiByte(m_CodePage, flags, pwz, -1,
2722                                             buf, len, NULL, NULL) )
2723                     return wxCONV_FAILED;
2724             }
2725
2726             if ( !n )
2727                 n = wcslen(pwz);
2728             wxWCharBuffer wcBuf(n);
2729             if ( MB2WC(wcBuf.data(), buf, n + 1) == wxCONV_FAILED ||
2730                     wcscmp(wcBuf, pwz) != 0 )
2731             {
2732                 // we didn't obtain the same thing we started from, hence
2733                 // the conversion was lossy and we consider that it failed
2734                 return wxCONV_FAILED;
2735             }
2736         }
2737
2738         // see the comment above for the reason of "len - 1"
2739         return len - 1;
2740     }
2741
2742     virtual size_t GetMBNulLen() const
2743     {
2744         if ( m_minMBCharWidth == 0 )
2745         {
2746             int len = ::WideCharToMultiByte
2747                         (
2748                             m_CodePage,     // code page
2749                             0,              // no flags
2750                             L"",            // input string
2751                             1,              // translate just the NUL
2752                             NULL,           // output buffer
2753                             0,              // and its size
2754                             NULL,           // no replacement char
2755                             NULL            // [out] don't care if it was used
2756                         );
2757
2758             wxMBConv_win32 * const self = wxConstCast(this, wxMBConv_win32);
2759             switch ( len )
2760             {
2761                 default:
2762                     wxLogDebug(wxT("Unexpected NUL length %d"), len);
2763                     self->m_minMBCharWidth = (size_t)-1;
2764                     break;
2765
2766                 case 0:
2767                     self->m_minMBCharWidth = (size_t)-1;
2768                     break;
2769
2770                 case 1:
2771                 case 2:
2772                 case 4:
2773                     self->m_minMBCharWidth = len;
2774                     break;
2775             }
2776         }
2777
2778         return m_minMBCharWidth;
2779     }
2780
2781     virtual wxMBConv *Clone() const { return new wxMBConv_win32(*this); }
2782
2783     bool IsOk() const { return m_CodePage != -1; }
2784
2785 private:
2786     static bool CanUseNoBestFit()
2787     {
2788         static int s_isWin98Or2k = -1;
2789
2790         if ( s_isWin98Or2k == -1 )
2791         {
2792             int verMaj, verMin;
2793             switch ( wxGetOsVersion(&verMaj, &verMin) )
2794             {
2795                 case wxOS_WINDOWS_9X:
2796                     s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
2797                     break;
2798
2799                 case wxOS_WINDOWS_NT:
2800                     s_isWin98Or2k = verMaj >= 5;
2801                     break;
2802
2803                 default:
2804                     // unknown: be conservative by default
2805                     s_isWin98Or2k = 0;
2806                     break;
2807             }
2808
2809             wxASSERT_MSG( s_isWin98Or2k != -1, wxT("should be set above") );
2810         }
2811
2812         return s_isWin98Or2k == 1;
2813     }
2814
2815     static bool IsAtLeastWin2kSP4()
2816     {
2817 #ifdef __WXWINCE__
2818         return false;
2819 #else
2820         static int s_isAtLeastWin2kSP4 = -1;
2821
2822         if ( s_isAtLeastWin2kSP4 == -1 )
2823         {
2824             OSVERSIONINFOEX ver;
2825
2826             memset(&ver, 0, sizeof(ver));
2827             ver.dwOSVersionInfoSize = sizeof(ver);
2828             GetVersionEx((OSVERSIONINFO*)&ver);
2829
2830             s_isAtLeastWin2kSP4 =
2831               ((ver.dwMajorVersion > 5) || // Vista+
2832                (ver.dwMajorVersion == 5 && ver.dwMinorVersion > 0) || // XP/2003
2833                (ver.dwMajorVersion == 5 && ver.dwMinorVersion == 0 &&
2834                ver.wServicePackMajor >= 4)) // 2000 SP4+
2835               ? 1 : 0;
2836         }
2837
2838         return s_isAtLeastWin2kSP4 == 1;
2839 #endif
2840     }
2841
2842
2843     // the code page we're working with
2844     long m_CodePage;
2845
2846     // cached result of GetMBNulLen(), set to 0 initially meaning
2847     // "unknown"
2848     size_t m_minMBCharWidth;
2849 };
2850
2851 #endif // wxHAVE_WIN32_MB2WC
2852
2853
2854 // ============================================================================
2855 // wxEncodingConverter based conversion classes
2856 // ============================================================================
2857
2858 #if wxUSE_FONTMAP
2859
2860 class wxMBConv_wxwin : public wxMBConv
2861 {
2862 private:
2863     void Init()
2864     {
2865         // Refuse to use broken wxEncodingConverter code for Mac-specific encodings.
2866         // The wxMBConv_cf class does a better job.
2867         m_ok = (m_enc < wxFONTENCODING_MACMIN || m_enc > wxFONTENCODING_MACMAX) &&
2868                m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
2869                w2m.Init(wxFONTENCODING_UNICODE, m_enc);
2870     }
2871
2872 public:
2873     // temporarily just use wxEncodingConverter stuff,
2874     // so that it works while a better implementation is built
2875     wxMBConv_wxwin(const char* name)
2876     {
2877         if (name)
2878             m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
2879         else
2880             m_enc = wxFONTENCODING_SYSTEM;
2881
2882         Init();
2883     }
2884
2885     wxMBConv_wxwin(wxFontEncoding enc)
2886     {
2887         m_enc = enc;
2888
2889         Init();
2890     }
2891
2892     size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
2893     {
2894         size_t inbuf = strlen(psz);
2895         if (buf)
2896         {
2897             if (!m2w.Convert(psz, buf))
2898                 return wxCONV_FAILED;
2899         }
2900         return inbuf;
2901     }
2902
2903     size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
2904     {
2905         const size_t inbuf = wxWcslen(psz);
2906         if (buf)
2907         {
2908             if (!w2m.Convert(psz, buf))
2909                 return wxCONV_FAILED;
2910         }
2911
2912         return inbuf;
2913     }
2914
2915     virtual size_t GetMBNulLen() const
2916     {
2917         switch ( m_enc )
2918         {
2919             case wxFONTENCODING_UTF16BE:
2920             case wxFONTENCODING_UTF16LE:
2921                 return 2;
2922
2923             case wxFONTENCODING_UTF32BE:
2924             case wxFONTENCODING_UTF32LE:
2925                 return 4;
2926
2927             default:
2928                 return 1;
2929         }
2930     }
2931
2932     virtual wxMBConv *Clone() const { return new wxMBConv_wxwin(m_enc); }
2933
2934     bool IsOk() const { return m_ok; }
2935
2936 public:
2937     wxFontEncoding m_enc;
2938     wxEncodingConverter m2w, w2m;
2939
2940 private:
2941     // were we initialized successfully?
2942     bool m_ok;
2943
2944     wxDECLARE_NO_COPY_CLASS(wxMBConv_wxwin);
2945 };
2946
2947 // make the constructors available for unit testing
2948 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_wxwin( const char* name )
2949 {
2950     wxMBConv_wxwin* result = new wxMBConv_wxwin( name );
2951     if ( !result->IsOk() )
2952     {
2953         delete result;
2954         return 0;
2955     }
2956
2957     return result;
2958 }
2959
2960 #endif // wxUSE_FONTMAP
2961
2962 // ============================================================================
2963 // wxCSConv implementation
2964 // ============================================================================
2965
2966 void wxCSConv::Init()
2967 {
2968     m_name = NULL;
2969     m_convReal =  NULL;
2970 }
2971
2972 void wxCSConv::SetEncoding(wxFontEncoding encoding)
2973 {
2974     switch ( encoding )
2975     {
2976         case wxFONTENCODING_MAX:
2977         case wxFONTENCODING_SYSTEM:
2978             if ( m_name )
2979             {
2980                 // It's ok to not have encoding value if we have a name for it.
2981                 m_encoding = wxFONTENCODING_SYSTEM;
2982             }
2983             else // No name neither.
2984             {
2985                 // Fall back to the system default encoding in this case (not
2986                 // sure how much sense does this make but this is how the old
2987                 // code used to behave).
2988 #if wxUSE_INTL
2989                 m_encoding = wxLocale::GetSystemEncoding();
2990                 if ( m_encoding == wxFONTENCODING_SYSTEM )
2991 #endif // wxUSE_INTL
2992                     m_encoding = wxFONTENCODING_ISO8859_1;
2993             }
2994             break;
2995
2996         case wxFONTENCODING_DEFAULT:
2997             // wxFONTENCODING_DEFAULT is same as US-ASCII in this context
2998             m_encoding = wxFONTENCODING_ISO8859_1;
2999             break;
3000
3001         default:
3002             // Just use the provided encoding.
3003             m_encoding = encoding;
3004     }
3005 }
3006
3007 wxCSConv::wxCSConv(const wxString& charset)
3008 {
3009     Init();
3010
3011     if ( !charset.empty() )
3012     {
3013         SetName(charset.ToAscii());
3014     }
3015
3016 #if wxUSE_FONTMAP
3017     SetEncoding(wxFontMapperBase::GetEncodingFromName(charset));
3018 #else
3019     SetEncoding(wxFONTENCODING_SYSTEM);
3020 #endif
3021
3022     m_convReal = DoCreate();
3023 }
3024
3025 wxCSConv::wxCSConv(wxFontEncoding encoding)
3026 {
3027     if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
3028     {
3029         wxFAIL_MSG( wxT("invalid encoding value in wxCSConv ctor") );
3030
3031         encoding = wxFONTENCODING_SYSTEM;
3032     }
3033
3034     Init();
3035
3036     SetEncoding(encoding);
3037
3038     m_convReal = DoCreate();
3039 }
3040
3041 wxCSConv::~wxCSConv()
3042 {
3043     Clear();
3044 }
3045
3046 wxCSConv::wxCSConv(const wxCSConv& conv)
3047         : wxMBConv()
3048 {
3049     Init();
3050
3051     SetName(conv.m_name);
3052     SetEncoding(conv.m_encoding);
3053
3054     m_convReal = DoCreate();
3055 }
3056
3057 wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
3058 {
3059     Clear();
3060
3061     SetName(conv.m_name);
3062     SetEncoding(conv.m_encoding);
3063
3064     m_convReal = DoCreate();
3065
3066     return *this;
3067 }
3068
3069 void wxCSConv::Clear()
3070 {
3071     free(m_name);
3072     m_name = NULL;
3073
3074     wxDELETE(m_convReal);
3075 }
3076
3077 void wxCSConv::SetName(const char *charset)
3078 {
3079     if ( charset )
3080         m_name = wxStrdup(charset);
3081 }
3082
3083 #if wxUSE_FONTMAP
3084
// Hash map type associating a string with a font encoding.
WX_DECLARE_HASH_MAP( wxFontEncoding, wxString, wxIntegerHash, wxIntegerEqual,
                     wxEncodingNameCache );

// Cache used by wxCSConv::DoCreate() when trying iconv: for each encoding
// already tried, it stores the charset name iconv accepted, or an empty
// string if none of the known names worked.
static wxEncodingNameCache gs_nameCache;
3089 #endif
3090
3091 wxMBConv *wxCSConv::DoCreate() const
3092 {
3093 #if wxUSE_FONTMAP
3094     wxLogTrace(TRACE_STRCONV,
3095                wxT("creating conversion for %s"),
3096                (m_name ? m_name
3097                        : (const char*)wxFontMapperBase::GetEncodingName(m_encoding).mb_str()));
3098 #endif // wxUSE_FONTMAP
3099
3100     // check for the special case of ASCII or ISO8859-1 charset: as we have
3101     // special knowledge of it anyhow, we don't need to create a special
3102     // conversion object
3103     if ( m_encoding == wxFONTENCODING_ISO8859_1 )
3104     {
3105         // don't convert at all
3106         return NULL;
3107     }
3108
3109     // we trust OS to do conversion better than we can so try external
3110     // conversion methods first
3111     //
3112     // the full order is:
3113     //      1. OS conversion (iconv() under Unix or Win32 API)
3114     //      2. hard coded conversions for UTF
3115     //      3. wxEncodingConverter as fall back
3116
3117     // step (1)
3118 #ifdef HAVE_ICONV
3119 #if !wxUSE_FONTMAP
3120     if ( m_name )
3121 #endif // !wxUSE_FONTMAP
3122     {
3123 #if wxUSE_FONTMAP
3124         wxFontEncoding encoding(m_encoding);
3125 #endif
3126
3127         if ( m_name )
3128         {
3129             wxMBConv_iconv *conv = new wxMBConv_iconv(m_name);
3130             if ( conv->IsOk() )
3131                 return conv;
3132
3133             delete conv;
3134
3135 #if wxUSE_FONTMAP
3136             encoding =
3137                 wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
3138 #endif // wxUSE_FONTMAP
3139         }
3140 #if wxUSE_FONTMAP
3141         {
3142             const wxEncodingNameCache::iterator it = gs_nameCache.find(encoding);
3143             if ( it != gs_nameCache.end() )
3144             {
3145                 if ( it->second.empty() )
3146                     return NULL;
3147
3148                 wxMBConv_iconv *conv = new wxMBConv_iconv(it->second.ToAscii());
3149                 if ( conv->IsOk() )
3150                     return conv;
3151
3152                 delete conv;
3153             }
3154
3155             const wxChar* const* names = wxFontMapperBase::GetAllEncodingNames(encoding);
3156             // CS : in case this does not return valid names (eg for MacRoman)
3157             // encoding got a 'failure' entry in the cache all the same,
3158             // although it just has to be created using a different method, so
3159             // only store failed iconv creation attempts (or perhaps we
3160             // shoulnd't do this at all ?)
3161             if ( names[0] != NULL )
3162             {
3163                 for ( ; *names; ++names )
3164                 {
3165                     // FIXME-UTF8: wxFontMapperBase::GetAllEncodingNames()
3166                     //             will need changes that will obsolete this
3167                     wxString name(*names);
3168                     wxMBConv_iconv *conv = new wxMBConv_iconv(name.ToAscii());
3169                     if ( conv->IsOk() )
3170                     {
3171                         gs_nameCache[encoding] = *names;
3172                         return conv;
3173                     }
3174
3175                     delete conv;
3176                 }
3177
3178                 gs_nameCache[encoding] = wxT(""); // cache the failure
3179             }
3180         }
3181 #endif // wxUSE_FONTMAP
3182     }
3183 #endif // HAVE_ICONV
3184
3185 #ifdef wxHAVE_WIN32_MB2WC
3186     {
3187 #if wxUSE_FONTMAP
3188         wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
3189                                       : new wxMBConv_win32(m_encoding);
3190         if ( conv->IsOk() )
3191             return conv;
3192
3193         delete conv;
3194 #else
3195         return NULL;
3196 #endif
3197     }
3198 #endif // wxHAVE_WIN32_MB2WC
3199
3200 #ifdef __DARWIN__
3201     {
3202         // leave UTF16 and UTF32 to the built-ins of wx
3203         if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
3204             ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
3205         {
3206 #if wxUSE_FONTMAP
3207             wxMBConv_cf *conv = m_name ? new wxMBConv_cf(m_name)
3208                                           : new wxMBConv_cf(m_encoding);
3209 #else
3210             wxMBConv_cf *conv = new wxMBConv_cf(m_encoding);
3211 #endif
3212
3213             if ( conv->IsOk() )
3214                  return conv;
3215
3216             delete conv;
3217         }
3218     }
3219 #endif // __DARWIN__
3220
3221     // step (2)
3222     wxFontEncoding enc = m_encoding;
3223 #if wxUSE_FONTMAP
3224     if ( enc == wxFONTENCODING_SYSTEM && m_name )
3225     {
3226         // use "false" to suppress interactive dialogs -- we can be called from
3227         // anywhere and popping up a dialog from here is the last thing we want to
3228         // do
3229         enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
3230     }
3231 #endif // wxUSE_FONTMAP
3232
3233     switch ( enc )
3234     {
3235         case wxFONTENCODING_UTF7:
3236              return new wxMBConvUTF7;
3237
3238         case wxFONTENCODING_UTF8:
3239              return new wxMBConvUTF8;
3240
3241         case wxFONTENCODING_UTF16BE:
3242              return new wxMBConvUTF16BE;
3243
3244         case wxFONTENCODING_UTF16LE:
3245              return new wxMBConvUTF16LE;
3246
3247         case wxFONTENCODING_UTF32BE:
3248              return new wxMBConvUTF32BE;
3249
3250         case wxFONTENCODING_UTF32LE:
3251              return new wxMBConvUTF32LE;
3252
3253         default:
3254              // nothing to do but put here to suppress gcc warnings
3255              break;
3256     }
3257
3258     // step (3)
3259 #if wxUSE_FONTMAP
3260     {
3261         wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
3262                                       : new wxMBConv_wxwin(m_encoding);
3263         if ( conv->IsOk() )
3264             return conv;
3265
3266         delete conv;
3267     }
3268
3269     wxLogTrace(TRACE_STRCONV,
3270                wxT("encoding \"%s\" is not supported by this system"),
3271                (m_name ? wxString(m_name)
3272                        : wxFontMapperBase::GetEncodingName(m_encoding)));
3273 #endif // wxUSE_FONTMAP
3274
3275     return NULL;
3276 }
3277
3278 bool wxCSConv::IsOk() const
3279 {
3280     // special case: no convReal created for wxFONTENCODING_ISO8859_1
3281     if ( m_encoding == wxFONTENCODING_ISO8859_1 )
3282         return true; // always ok as we do it ourselves
3283
3284     // m_convReal->IsOk() is called at its own creation, so we know it must
3285     // be ok if m_convReal is non-NULL
3286     return m_convReal != NULL;
3287 }
3288
3289 size_t wxCSConv::ToWChar(wchar_t *dst, size_t dstLen,
3290                          const char *src, size_t srcLen) const
3291 {
3292     if (m_convReal)
3293         return m_convReal->ToWChar(dst, dstLen, src, srcLen);
3294
3295     // latin-1 (direct)
3296     if ( srcLen == wxNO_LEN )
3297         srcLen = strlen(src) + 1; // take trailing NUL too
3298
3299     if ( dst )
3300     {
3301         if ( dstLen < srcLen )
3302             return wxCONV_FAILED;
3303
3304         for ( size_t n = 0; n < srcLen; n++ )
3305             dst[n] = (unsigned char)(src[n]);
3306     }
3307
3308     return srcLen;
3309 }
3310
3311 size_t wxCSConv::FromWChar(char *dst, size_t dstLen,
3312                            const wchar_t *src, size_t srcLen) const
3313 {
3314     if (m_convReal)
3315         return m_convReal->FromWChar(dst, dstLen, src, srcLen);
3316
3317     // latin-1 (direct)
3318     if ( srcLen == wxNO_LEN )
3319         srcLen = wxWcslen(src) + 1;
3320
3321     if ( dst )
3322     {
3323         if ( dstLen < srcLen )
3324             return wxCONV_FAILED;
3325
3326         for ( size_t n = 0; n < srcLen; n++ )
3327         {
3328             if ( src[n] > 0xFF )
3329                 return wxCONV_FAILED;
3330
3331             dst[n] = (char)src[n];
3332         }
3333
3334     }
3335     else // still need to check the input validity
3336     {
3337         for ( size_t n = 0; n < srcLen; n++ )
3338         {
3339             if ( src[n] > 0xFF )
3340                 return wxCONV_FAILED;
3341         }
3342     }
3343
3344     return srcLen;
3345 }
3346
3347 size_t wxCSConv::GetMBNulLen() const
3348 {
3349     if ( m_convReal )
3350         return m_convReal->GetMBNulLen();
3351
3352     // otherwise, we are ISO-8859-1
3353     return 1;
3354 }
3355
3356 #if wxUSE_UNICODE_UTF8
3357 bool wxCSConv::IsUTF8() const
3358 {
3359     if ( m_convReal )
3360         return m_convReal->IsUTF8();
3361
3362     // otherwise, we are ISO-8859-1
3363     return false;
3364 }
3365 #endif
3366
3367
3368 #if wxUSE_UNICODE
3369
3370 wxWCharBuffer wxSafeConvertMB2WX(const char *s)
3371 {
3372     if ( !s )
3373         return wxWCharBuffer();
3374
3375     wxWCharBuffer wbuf(wxConvLibc.cMB2WX(s));
3376     if ( !wbuf )
3377         wbuf = wxMBConvUTF8().cMB2WX(s);
3378     if ( !wbuf )
3379         wbuf = wxConvISO8859_1.cMB2WX(s);
3380
3381     return wbuf;
3382 }
3383
3384 wxCharBuffer wxSafeConvertWX2MB(const wchar_t *ws)
3385 {
3386     if ( !ws )
3387         return wxCharBuffer();
3388
3389     wxCharBuffer buf(wxConvLibc.cWX2MB(ws));
3390     if ( !buf )
3391         buf = wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL).cWX2MB(ws);
3392
3393     return buf;
3394 }
3395
3396 #endif // wxUSE_UNICODE
3397
3398 // ----------------------------------------------------------------------------
3399 // globals
3400 // ----------------------------------------------------------------------------
3401
// NB: The reason why we create the converter objects in this convoluted way,
//     using a factory function instead of a global variable, is that they
//     may be used at static initialization time (some of them are used by
//     wxString ctors and there may be a global wxString object). In other
//     words, possibly _before_ the global converter object would be
//     initialized.
3408
// Undefine these names (presumably defined as macros in the header, to be
// confirmed there) before they are used as the "name" argument of the macros
// below.
#undef wxConvLibc
#undef wxConvUTF8
#undef wxConvUTF7
#undef wxConvLocal
#undef wxConvISO8859_1

// Define everything needed for the global converter called "name":
//  - the name##Ptr global pointer of type klass*, initially NULL
//  - the wxGet_##name##Ptr() function returning the address of a
//    function-local static object of type impl_klass constructed with
//    ctor_args
//  - a file-static pointer whose initialization calls this function
#define WX_DEFINE_GLOBAL_CONV2(klass, impl_klass, name, ctor_args)      \
    WXDLLIMPEXP_DATA_BASE(klass*) name##Ptr = NULL;                     \
    WXDLLIMPEXP_BASE klass* wxGet_##name##Ptr()                         \
    {                                                                   \
        static impl_klass name##Obj ctor_args;                          \
        return &name##Obj;                                              \
    }                                                                   \
    /* this ensures that all global converter objects are created */    \
    /* by the time static initialization is done, i.e. before any */    \
    /* thread is launched: */                                           \
    static klass* gs_##name##instance = wxGet_##name##Ptr()

// Same as above for the common case when the object has the same type as
// the pointer used to access it.
#define WX_DEFINE_GLOBAL_CONV(klass, name, ctor_args) \
    WX_DEFINE_GLOBAL_CONV2(klass, klass, name, ctor_args)

#ifdef __INTELC__
    // disable warning "variable 'xxx' was declared but never referenced"
    #pragma warning(disable: 177)
#endif // Intel C++

// wxConvLibc is implemented by wxMBConv_win32 under Windows and by
// wxMBConvLibc elsewhere (the wxMBConv_cf branch is disabled by "#elif 0").
#ifdef __WINDOWS__
    WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConv_win32, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
#elif 0 // defined(__WXOSX__)
    WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConv_cf, wxConvLibc,  (wxFONTENCODING_UTF8));
#else
    WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConvLibc, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
#endif

// NB: we can't use wxEMPTY_PARAMETER_VALUE as final argument here because it's
//     passed to WX_DEFINE_GLOBAL_CONV2 after a macro expansion and so still
//     provokes an error message about "not enough macro parameters"; and we
//     can't use "()" here as the name##Obj declaration would be parsed as a
//     function declaration then, so use a semicolon and live with an extra
//     empty statement (and hope that no compiler warns about this)
WX_DEFINE_GLOBAL_CONV(wxMBConvStrictUTF8, wxConvUTF8, ;);
WX_DEFINE_GLOBAL_CONV(wxMBConvUTF7, wxConvUTF7, ;);

WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvLocal, (wxFONTENCODING_SYSTEM));
WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvISO8859_1, (wxFONTENCODING_ISO8859_1));

// the converters initially pointed to by these globals: the libc one for
// wxConvCurrent and the local one for wxConvUI
WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = wxGet_wxConvLibcPtr();
WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvUI = wxGet_wxConvLocalPtr();

#ifdef __DARWIN__
// It is important to use this conversion object under Darwin as it ensures
// that Unicode strings are (re)composed correctly even though xnu kernel uses
// decomposed form internally (at least for the file names).
static wxMBConv_cf wxConvMacUTF8DObj(wxFONTENCODING_UTF8);
#endif

// the converter used for file names: the object above under Darwin and the
// same one as wxConvLibc everywhere else
WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName =
#ifdef __DARWIN__
                                    &wxConvMacUTF8DObj;
#else // !__DARWIN__
                                    wxGet_wxConvLibcPtr();
#endif // __DARWIN__/!__DARWIN__