src/common/strconv.cpp

   1 /////////////////////////////////////////////////////////////////////////////
   2 // Name:        src/common/strconv.cpp
   3 // Purpose:     Unicode conversion classes
   4 // Author:      Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
   5 //              Ryan Norton, Fredrik Roubert (UTF7)
   6 // Modified by:
   7 // Created:     29/01/98
   8 // RCS-ID:      $Id$
   9 // Copyright:   (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
  10 //              (c) 2000-2003 Vadim Zeitlin
  11 //              (c) 2004 Ryan Norton, Fredrik Roubert
  12 // Licence:     wxWindows licence
  13 /////////////////////////////////////////////////////////////////////////////
  14
  15 // For compilers that support precompilation, includes "wx.h".
  16 #include "wx/wxprec.h"
  17
  18 #ifdef __BORLANDC__
  19     #pragma hdrstop
  20 #endif  //__BORLANDC__
  21
  22 #ifndef WX_PRECOMP
  23     #include "wx/intl.h"
  24     #include "wx/log.h"
  25     #include "wx/utils.h"
  26     #include "wx/hashmap.h"
  27 #endif
  28
  29 #include "wx/strconv.h"
  30
  31 #ifndef __WXWINCE__
  32 #include <errno.h>
  33 #endif
  34
  35 #include <ctype.h>
  36 #include <string.h>
  37 #include <stdlib.h>
  38
  39 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
  40     #include "wx/msw/private.h"
  41     #include "wx/msw/missing.h"
  42     #define wxHAVE_WIN32_MB2WC
  43 #endif
  44
  45 #ifdef HAVE_ICONV
  46     #include <iconv.h>
  47     #include "wx/thread.h"
  48 #endif
  49
  50 #include "wx/encconv.h"
  51 #include "wx/fontmap.h"
  52
  53 #ifdef __DARWIN__
  54 #include "wx/osx/core/private/strconv_cf.h"
  55 #endif //def __DARWIN__
  56
  57
// name of this file's trace category
// NOTE(review): presumably used as the mask for wxLogTrace() calls further
// down in this file -- no use is visible in this part of it, confirm
#define TRACE_STRCONV wxT("strconv")
  59
// WC_UTF16 is defined only if sizeof(wchar_t) == 2, i.e. when wchar_t holds
// UTF-16 code units; otherwise wchar_t is supposed to be 4 bytes wide and the
// symbol is left undefined
#if SIZEOF_WCHAR_T == 2
    #define WC_UTF16
#endif
  65
  66
  67 // ============================================================================
  68 // implementation
  69 // ============================================================================
  70
  71 // helper function of cMB2WC(): check if n bytes at this location are all NUL
  72 static bool NotAllNULs(const char *p, size_t n)
  73 {
  74     while ( n && *p++ == '\0' )
  75         n--;
  76
  77     return n != 0;
  78 }
  79
  80 // ----------------------------------------------------------------------------
  81 // UTF-16 en/decoding to/from UCS-4 with surrogates handling
  82 // ----------------------------------------------------------------------------
  83
  84 static size_t encode_utf16(wxUint32 input, wxUint16 *output)
  85 {
  86     if (input <= 0xffff)
  87     {
  88         if (output)
  89             *output = (wxUint16) input;
  90
  91         return 1;
  92     }
  93     else if (input >= 0x110000)
  94     {
  95         return wxCONV_FAILED;
  96     }
  97     else
  98     {
  99         if (output)
 100         {
 101             *output++ = (wxUint16) ((input >> 10) + 0xd7c0);
 102             *output = (wxUint16) ((input & 0x3ff) + 0xdc00);
 103         }
 104
 105         return 2;
 106     }
 107 }
 108
 109 static size_t decode_utf16(const wxUint16* input, wxUint32& output)
 110 {
 111     if ((*input < 0xd800) || (*input > 0xdfff))
 112     {
 113         output = *input;
 114         return 1;
 115     }
 116     else if ((input[1] < 0xdc00) || (input[1] > 0xdfff))
 117     {
 118         output = *input;
 119         return wxCONV_FAILED;
 120     }
 121     else
 122     {
 123         output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
 124         return 2;
 125     }
 126 }
 127
// element type of the buffer passed to wxDecodeSurrogate(): wchar_t itself
// when WC_UTF16 is defined (sizeof(wchar_t) == 2) and wxUint16 otherwise
#ifdef WC_UTF16
    typedef wchar_t wxDecodeSurrogate_t;
#else // !WC_UTF16
    typedef wxUint16 wxDecodeSurrogate_t;
#endif // WC_UTF16/!WC_UTF16
 133
 134 // returns the next UTF-32 character from the wchar_t buffer and advances the
 135 // pointer to the character after this one
 136 //
 137 // if an invalid character is found, *pSrc is set to NULL, the caller must
 138 // check for this
 139 static wxUint32 wxDecodeSurrogate(const wxDecodeSurrogate_t **pSrc)
 140 {
 141     wxUint32 out;
 142     const size_t
 143         n = decode_utf16(reinterpret_cast<const wxUint16 *>(*pSrc), out);
 144     if ( n == wxCONV_FAILED )
 145         *pSrc = NULL;
 146     else
 147         *pSrc += n;
 148
 149     return out;
 150 }
 151
 152 // ----------------------------------------------------------------------------
 153 // wxMBConv
 154 // ----------------------------------------------------------------------------
 155
// Default implementation of multibyte to wide character conversion, written
// in terms of the legacy MB2WC().
//
// Parameters:
//  dst     the output buffer or NULL to only compute the required length
//  dstLen  the size of dst in wide characters, only used if dst != NULL
//  src     the multibyte input
//  srcLen  the length of src in bytes or wxNO_LEN if src is NUL-terminated
//
// Returns the number of wide characters written (or which would be written if
// dst is NULL) to dst: with srcLen == wxNO_LEN this count includes the
// trailing NUL, with an explicit length it includes only the NULs lying inside
// the [src, src + srcLen) range. Returns wxCONV_FAILED if the conversion
// fails, if GetMBNulLen() fails or if dstLen is too small.
size_t
wxMBConv::ToWChar(wchar_t *dst, size_t dstLen,
                  const char *src, size_t srcLen) const
{
    // although new conversion classes are supposed to implement this function
    // directly, the existing ones only implement the old MB2WC() and so, to
    // avoid having to rewrite all conversion classes at once, we provide a
    // default (but not efficient) implementation of this one in terms of the
    // old function by copying the input to ensure that it's NUL-terminated and
    // then using MB2WC() to convert it
    //
    // moreover, some conversion classes simply can't implement ToWChar()
    // directly, the primary example is wxConvLibc: mbstowcs() only handles
    // NUL-terminated strings

    // the number of chars [which would be] written to dst [if it were not NULL]
    size_t dstWritten = 0;

    // the number of NULs terminating this string
    size_t nulLen = 0;  // not really needed, but just to avoid warnings

    // if we were not given the input size we just have to assume that the
    // string is properly terminated as we have no way of knowing how long it
    // is anyhow, but if we do have the size check whether there are enough
    // NULs at the end
    wxCharBuffer bufTmp;
    const char *srcEnd;
    if ( srcLen != wxNO_LEN )
    {
        // we need to know how to find the end of this string
        nulLen = GetMBNulLen();
        if ( nulLen == wxCONV_FAILED )
            return wxCONV_FAILED;

        // if there are enough NULs we can avoid the copy
        if ( srcLen < nulLen || NotAllNULs(src + srcLen - nulLen, nulLen) )
        {
            // make a copy in order to properly NUL-terminate the string: the
            // terminator is appended after the srcLen input bytes
            bufTmp = wxCharBuffer(srcLen + nulLen - 1 /* 1 will be added */);
            char * const p = bufTmp.data();
            memcpy(p, src, srcLen);
            for ( char *s = p + srcLen; s < p + srcLen + nulLen; s++ )
                *s = '\0';

            src = bufTmp;
        }

        // notice that srcEnd doesn't include the terminator added above
        srcEnd = src + srcLen;
    }
    else // quit after the first loop iteration
    {
        // NULL srcEnd is used below as the "input is NUL-terminated" flag
        srcEnd = NULL;
    }

    // the idea of this code is straightforward: it converts a NUL-terminated
    // chunk of the string during each iteration and updates the output buffer
    // with the result
    //
    // all the complications come from the fact that this function, for
    // historical reasons, must behave in 2 subtly different ways: when it's
    // called for the entire NUL-terminated string (srcEnd == NULL) a single
    // chunk is converted and its trailing NUL is counted in the result
    // immediately; but when it's called with a fixed number of bytes
    // (srcEnd != NULL) a NUL is only counted if it lies inside the input
    // buffer, which can only be checked after finding the end of the chunk
    for ( ;; )
    {
        // try to convert the current chunk: this call only computes its
        // length, not counting the terminating NUL
        size_t lenChunk = MB2WC(NULL, src, 0);
        if ( lenChunk == wxCONV_FAILED )
            return wxCONV_FAILED;

        dstWritten += lenChunk;
        if ( !srcEnd )
            dstWritten++;

        if ( !lenChunk )
        {
            // nothing left in the input string, conversion succeeded
            //
            // NOTE(review): this is also reached, without writing anything to
            // dst, for an empty chunk at the start of or inside explicit
            // length input, the rest of which is then not converted --
            // confirm that this is intended
            break;
        }

        if ( dst )
        {
            if ( dstWritten > dstLen )
                return wxCONV_FAILED;

            // +1 is for trailing NUL
            //
            // NOTE(review): in the explicit length case this NUL was not
            // accounted for in the dstLen check above, so this presumably
            // relies on dst having room for one extra character -- confirm
            if ( MB2WC(dst, src, lenChunk + 1) == wxCONV_FAILED )
                return wxCONV_FAILED;

            dst += lenChunk;
            if ( !srcEnd )
                dst++;
        }

        if ( !srcEnd )
        {
            // we convert just one chunk in this case as this is the entire
            // string anyhow and its trailing NUL was already counted above
            break;
        }

        // advance the input pointer past the end of this chunk: notice that we
        // will always stop before srcEnd because we know that the chunk is
        // always properly NUL-terminated
        while ( NotAllNULs(src, nulLen) )
        {
            // notice that we must skip over multiple bytes here as we suppose
            // that if NUL takes 2 or 4 bytes, then all the other characters do
            // too and so if advanced by a single byte we might erroneously
            // detect sequences of NUL bytes in the middle of the input
            src += nulLen;
        }

        // if the buffer ends before this NUL, we shouldn't count it in our
        // output so skip the code below
        if ( src == srcEnd )
            break;

        // do count this terminator as it's inside the buffer we convert
        dstWritten++;
        if ( dst )
            dst++;

        src += nulLen; // skip the terminator itself

        // stop if this terminator was the last thing in the buffer
        if ( src >= srcEnd )
            break;
    }

    return dstWritten;
}
 293
// Default implementation of wide character to multibyte conversion, written
// in terms of the legacy WC2MB().
//
// Parameters:
//  dst     the output buffer or NULL to only compute the required length
//  dstLen  the size of dst in bytes, only used if dst != NULL
//  src     the wide character input
//  srcLen  the length of src in wide characters or wxNO_LEN if src is
//          NUL-terminated
//
// Returns the number of bytes written (or which would be written if dst is
// NULL) to dst, including the GetMBNulLen() bytes of each NUL present in the
// input (with wxNO_LEN the trailing NUL counts as part of the input), or
// wxCONV_FAILED if the conversion fails or dstLen is too small.
size_t
wxMBConv::FromWChar(char *dst, size_t dstLen,
                    const wchar_t *src, size_t srcLen) const
{
    // the number of chars [which would be] written to dst [if it were not NULL]
    size_t dstWritten = 0;

    // if we don't know its length we have no choice but to assume that it is
    // NUL-terminated (notice that it can still be NUL-terminated even if
    // explicit length is given but it doesn't change our return value)
    const bool isNulTerminated = srcLen == wxNO_LEN;

    // make a copy of the input string unless it is already properly
    // NUL-terminated
    wxWCharBuffer bufTmp;
    if ( isNulTerminated )
    {
        // from now on srcLen includes the trailing NUL
        srcLen = wxWcslen(src) + 1;
    }
    else if ( srcLen != 0 && src[srcLen - 1] != L'\0' )
    {
        // make a copy in order to properly NUL-terminate the string
        //
        // NOTE(review): this relies on wxWCharBuffer(srcLen) providing a NUL
        // at index srcLen as only srcLen characters are copied -- confirm
        bufTmp = wxWCharBuffer(srcLen);
        memcpy(bufTmp.data(), src, srcLen * sizeof(wchar_t));
        src = bufTmp;
    }

    // NOTE(review): unlike in ToWChar(), the result of GetMBNulLen() is not
    // checked for wxCONV_FAILED here
    const size_t lenNul = GetMBNulLen();
    for ( const wchar_t * const srcEnd = src + srcLen;
          src < srcEnd;
          src++ /* skip L'\0' too */ )
    {
        // try to convert the current chunk: this call only computes its
        // length, not counting the terminating NUL(s)
        size_t lenChunk = WC2MB(NULL, src, 0);
        if ( lenChunk == wxCONV_FAILED )
            return wxCONV_FAILED;

        dstWritten += lenChunk;

        // the position of the NUL terminating this chunk: it is equal to
        // srcEnd only if it is the terminator of the copy made above, i.e.
        // is not part of the input we were asked to convert
        const wchar_t * const
            chunkEnd = isNulTerminated ? srcEnd - 1 : src + wxWcslen(src);

        // our return value accounts for the trailing NUL(s), unlike that of
        // WC2MB(), however don't do it for the last NUL we artificially added
        // ourselves above
        if ( chunkEnd < srcEnd )
            dstWritten += lenNul;

        if ( dst )
        {
            if ( dstWritten > dstLen )
                return wxCONV_FAILED;

            // if we know that there is enough space in the destination buffer
            // (because we accounted for lenNul in dstWritten above), we can
            // convert directly in place -- but otherwise we need another
            // temporary buffer to ensure that we don't overwrite the output
            wxCharBuffer dstBuf;
            char *dstTmp;
            if ( chunkEnd == srcEnd )
            {
                dstBuf = wxCharBuffer(lenChunk + lenNul - 1);
                dstTmp = dstBuf.data();
            }
            else
            {
                dstTmp = dst;
            }

            // the size passed here includes the room for the terminator
            if ( WC2MB(dstTmp, src, lenChunk + lenNul) == wxCONV_FAILED )
                return wxCONV_FAILED;

            if ( dstTmp != dst )
            {
                // copy everything up to but excluding the terminating NUL(s)
                // into the real output buffer
                memcpy(dst, dstTmp, lenChunk);

                // micro-optimization: if dstTmp != dst it means that chunkEnd
                // == srcEnd and so we're done, no need to update anything below
                break;
            }

            dst += lenChunk;
            if ( chunkEnd < srcEnd )
                dst += lenNul;
        }

        // continue with the next chunk, the loop increment skips the NUL
        src = chunkEnd;
    }

    return dstWritten;
}
 387
 388 size_t wxMBConv::MB2WC(wchar_t *outBuff, const char *inBuff, size_t outLen) const
 389 {
 390     size_t rc = ToWChar(outBuff, outLen, inBuff);
 391     if ( rc != wxCONV_FAILED )
 392     {
 393         // ToWChar() returns the buffer length, i.e. including the trailing
 394         // NUL, while this method doesn't take it into account
 395         rc--;
 396     }
 397
 398     return rc;
 399 }
 400
 401 size_t wxMBConv::WC2MB(char *outBuff, const wchar_t *inBuff, size_t outLen) const
 402 {
 403     size_t rc = FromWChar(outBuff, outLen, inBuff);
 404     if ( rc != wxCONV_FAILED )
 405     {
 406         rc -= GetMBNulLen();
 407     }
 408
 409     return rc;
 410 }
 411
// The destructor is intentionally empty, it is only defined out of line in
// this file.
wxMBConv::~wxMBConv()
{
    // nothing to do here (necessary for Darwin linking probably)
}
 416
 417 const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
 418 {
 419     if ( psz )
 420     {
 421         // calculate the length of the buffer needed first
 422         const size_t nLen = ToWChar(NULL, 0, psz);
 423         if ( nLen != wxCONV_FAILED )
 424         {
 425             // now do the actual conversion
 426             wxWCharBuffer buf(nLen - 1 /* +1 added implicitly */);
 427
 428             // +1 for the trailing NULL
 429             if ( ToWChar(buf.data(), nLen, psz) != wxCONV_FAILED )
 430                 return buf;
 431         }
 432     }
 433
 434     return wxWCharBuffer();
 435 }
 436
 437 const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
 438 {
 439     if ( pwz )
 440     {
 441         const size_t nLen = FromWChar(NULL, 0, pwz);
 442         if ( nLen != wxCONV_FAILED )
 443         {
 444             wxCharBuffer buf(nLen - 1);
 445             if ( FromWChar(buf.data(), nLen, pwz) != wxCONV_FAILED )
 446                 return buf;
 447         }
 448     }
 449
 450     return wxCharBuffer();
 451 }
 452
 453 const wxWCharBuffer
 454 wxMBConv::cMB2WC(const char *inBuff, size_t inLen, size_t *outLen) const
 455 {
 456     const size_t dstLen = ToWChar(NULL, 0, inBuff, inLen);
 457     if ( dstLen != wxCONV_FAILED )
 458     {
 459         // notice that we allocate space for dstLen+1 wide characters here
 460         // because we want the buffer to always be NUL-terminated, even if the
 461         // input isn't (as otherwise the caller has no way to know its length)
 462         wxWCharBuffer wbuf(dstLen);
 463         wbuf.data()[dstLen] = L'\0';
 464         if ( ToWChar(wbuf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
 465         {
 466             if ( outLen )
 467             {
 468                 *outLen = dstLen;
 469
 470                 // we also need to handle NUL-terminated input strings
 471                 // specially: for them the output is the length of the string
 472                 // excluding the trailing NUL, however if we're asked to
 473                 // convert a specific number of characters we return the length
 474                 // of the resulting output even if it's NUL-terminated
 475                 if ( inLen == wxNO_LEN )
 476                     (*outLen)--;
 477             }
 478
 479             return wbuf;
 480         }
 481     }
 482
 483     if ( outLen )
 484         *outLen = 0;
 485
 486     return wxWCharBuffer();
 487 }
 488
 489 const wxCharBuffer
 490 wxMBConv::cWC2MB(const wchar_t *inBuff, size_t inLen, size_t *outLen) const
 491 {
 492     size_t dstLen = FromWChar(NULL, 0, inBuff, inLen);
 493     if ( dstLen != wxCONV_FAILED )
 494     {
 495         const size_t nulLen = GetMBNulLen();
 496
 497         // as above, ensure that the buffer is always NUL-terminated, even if
 498         // the input is not
 499         wxCharBuffer buf(dstLen + nulLen - 1);
 500         memset(buf.data() + dstLen, 0, nulLen);
 501         if ( FromWChar(buf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
 502         {
 503             if ( outLen )
 504             {
 505                 *outLen = dstLen;
 506
 507                 if ( inLen == wxNO_LEN )
 508                 {
 509                     // in this case both input and output are NUL-terminated
 510                     // and we're not supposed to count NUL
 511                     *outLen -= nulLen;
 512                 }
 513             }
 514
 515             return buf;
 516         }
 517     }
 518
 519     if ( outLen )
 520         *outLen = 0;
 521
 522     return wxCharBuffer();
 523 }
 524
 525 const wxWCharBuffer wxMBConv::cMB2WC(const wxScopedCharBuffer& buf) const
 526 {
 527     const size_t srcLen = buf.length();
 528     if ( srcLen )
 529     {
 530         const size_t dstLen = ToWChar(NULL, 0, buf, srcLen);
 531         if ( dstLen != wxCONV_FAILED )
 532         {
 533             wxWCharBuffer wbuf(dstLen);
 534             wbuf.data()[dstLen] = L'\0';
 535             if ( ToWChar(wbuf.data(), dstLen, buf, srcLen) != wxCONV_FAILED )
 536                 return wbuf;
 537         }
 538     }
 539
 540     return wxScopedWCharBuffer::CreateNonOwned(L"", 0);
 541 }
 542
 543 const wxCharBuffer wxMBConv::cWC2MB(const wxScopedWCharBuffer& wbuf) const
 544 {
 545     const size_t srcLen = wbuf.length();
 546     if ( srcLen )
 547     {
 548         const size_t dstLen = FromWChar(NULL, 0, wbuf, srcLen);
 549         if ( dstLen != wxCONV_FAILED )
 550         {
 551             wxCharBuffer buf(dstLen);
 552             buf.data()[dstLen] = '\0';
 553             if ( FromWChar(buf.data(), dstLen, wbuf, srcLen) != wxCONV_FAILED )
 554                 return buf;
 555         }
 556     }
 557
 558     return wxScopedCharBuffer::CreateNonOwned("", 0);
 559 }
 560
 561 // ----------------------------------------------------------------------------
 562 // wxMBConvLibc
 563 // ----------------------------------------------------------------------------
 564
// Convert the multibyte string psz to wide characters by simply forwarding
// all the arguments to, and returning the result of, wxMB2WC() which is
// defined elsewhere.
size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
{
    return wxMB2WC(buf, psz, n);
}
 569
// Convert the wide character string psz to multibyte by simply forwarding
// all the arguments to, and returning the result of, wxWC2MB() which is
// defined elsewhere.
size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
{
    return wxWC2MB(buf, psz, n);
}
 574
 575 // ----------------------------------------------------------------------------
 576 // wxConvBrokenFileNames
 577 // ----------------------------------------------------------------------------
 578
 579 #ifdef __UNIX__
 580
 581 wxConvBrokenFileNames::wxConvBrokenFileNames(const wxString& charset)
 582 {
 583     if ( wxStricmp(charset, wxT("UTF-8")) == 0 ||
 584          wxStricmp(charset, wxT("UTF8")) == 0  )
 585         m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_PUA);
 586     else
 587         m_conv = new wxCSConv(charset);
 588 }
 589
 590 #endif // __UNIX__
 591
 592 // ----------------------------------------------------------------------------
 593 // UTF-7
 594 // ----------------------------------------------------------------------------
 595
 596 // Implementation (C) 2004 Fredrik Roubert
 597 //
 598 // Changes to work in streaming mode (C) 2008 Vadim Zeitlin
 599
 600 //
 601 // BASE64 decoding table
 602 //
// This table is indexed by the byte value and has one entry for each of the
// 256 possible values: it contains the 6 bit value (0x00..0x3f) of the bytes
// which are base64 digits ('A'-'Z', 'a'-'z', '0'-'9', '+' and '/') and 0xff
// for all the other ones.
static const unsigned char utf7unb64[] =
{
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
    0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b,
    0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
    0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
    0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
    0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
    0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
    0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30,
    0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
};
 638
 639 size_t wxMBConvUTF7::ToWChar(wchar_t *dst, size_t dstLen,
 640                              const char *src, size_t srcLen) const
 641 {
 642     DecoderState stateOrig,
 643                 *statePtr;
 644     if ( srcLen == wxNO_LEN )
 645     {
 646         // convert the entire string, up to and including the trailing NUL
 647         srcLen = strlen(src) + 1;
 648
 649         // when working on the entire strings we don't update nor use the shift
 650         // state from the previous call
 651         statePtr = &stateOrig;
 652     }
 653     else // when working with partial strings we do use the shift state
 654     {
 655         statePtr = const_cast<DecoderState *>(&m_stateDecoder);
 656
 657         // also save the old state to be able to rollback to it on error
 658         stateOrig = m_stateDecoder;
 659     }
 660
 661     // but to simplify the code below we use this variable in both cases
 662     DecoderState& state = *statePtr;
 663
 664
 665     // number of characters [which would have been] written to dst [if it were
 666     // not NULL]
 667     size_t len = 0;
 668
 669     const char * const srcEnd = src + srcLen;
 670
 671     while ( (src < srcEnd) && (!dst || (len < dstLen)) )
 672     {
 673         const unsigned char cc = *src++;
 674
 675         if ( state.IsShifted() )
 676         {
 677             const unsigned char dc = utf7unb64[cc];
 678             if ( dc == 0xff )
 679             {
 680                 // end of encoded part, check that nothing was left: there can
 681                 // be up to 4 bits of 0 padding but nothing else (we also need
 682                 // to check isLSB as we count bits modulo 8 while a valid UTF-7
 683                 // encoded sequence must contain an integral number of UTF-16
 684                 // characters)
 685                 if ( state.isLSB || state.bit > 4 ||
 686                         (state.accum & ((1 << state.bit) - 1)) )
 687                 {
 688                     if ( !len )
 689                         state = stateOrig;
 690
 691                     return wxCONV_FAILED;
 692                 }
 693
 694                 state.ToDirect();
 695
 696                 // re-parse this character normally below unless it's '-' which
 697                 // is consumed by the decoder
 698                 if ( cc == '-' )
 699                     continue;
 700             }
 701             else // valid encoded character
 702             {
 703                 // mini base64 decoder: each character is 6 bits
 704                 state.bit += 6;
 705                 state.accum <<= 6;
 706                 state.accum += dc;
 707
 708                 if ( state.bit >= 8 )
 709                 {
 710                     // got the full byte, consume it
 711                     state.bit -= 8;
 712                     unsigned char b = (state.accum >> state.bit) & 0x00ff;
 713
 714                     if ( state.isLSB )
 715                     {
 716                         // we've got the full word, output it
 717                         if ( dst )
 718                             *dst++ = (state.msb << 8) | b;
 719                         len++;
 720                         state.isLSB = false;
 721                     }
 722                     else // MSB
 723                     {
 724                         // just store it while we wait for LSB
 725                         state.msb = b;
 726                         state.isLSB = true;
 727                     }
 728                 }
 729             }
 730         }
 731
 732         if ( state.IsDirect() )
 733         {
 734             // start of an encoded segment?
 735             if ( cc == '+' )
 736             {
 737                 if ( *src == '-' )
 738                 {
 739                     // just the encoded plus sign, don't switch to shifted mode
 740                     if ( dst )
 741                         *dst++ = '+';
 742                     len++;
 743                     src++;
 744                 }
 745                 else if ( utf7unb64[(unsigned)*src] == 0xff )
 746                 {
 747                     // empty encoded chunks are not allowed
 748                     if ( !len )
 749                         state = stateOrig;
 750
 751                     return wxCONV_FAILED;
 752                 }
 753                 else // base-64 encoded chunk follows
 754                 {
 755                     state.ToShifted();
 756                 }
 757             }
 758             else // not '+'
 759             {
 760                 // only printable 7 bit ASCII characters (with the exception of
 761                 // NUL, TAB, CR and LF) can be used directly
 762                 if ( cc >= 0x7f || (cc < ' ' &&
 763                       !(cc == '\0' || cc == '\t' || cc == '\r' || cc == '\n')) )
 764                     return wxCONV_FAILED;
 765
 766                 if ( dst )
 767                     *dst++ = cc;
 768                 len++;
 769             }
 770         }
 771     }
 772
 773     if ( !len )
 774     {
 775         // as we didn't read any characters we should be called with the same
 776         // data (followed by some more new data) again later so don't save our
 777         // state
 778         state = stateOrig;
 779
 780         return wxCONV_FAILED;
 781     }
 782
 783     return len;
 784 }
 785
 786 //
 787 // BASE64 encoding table
 788 //
// This table maps each 6 bit value (0..63), used as index, to the base64
// digit representing it.
static const unsigned char utf7enb64[] =
{
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
    'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
    'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
    'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
    'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
    'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
    'w', 'x', 'y', 'z', '0', '1', '2', '3',
    '4', '5', '6', '7', '8', '9', '+', '/'
};
 800
 801 //
 802 // UTF-7 encoding table
 803 //
 804 // 0 - Set D (directly encoded characters)
 805 // 1 - Set O (optional direct characters)
 806 // 2 - whitespace characters (optional)
 807 // 3 - special characters
 808 //
 809 static const unsigned char utf7encode[128] =
 810 {
 811     0, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
 812     3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
 813     2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
 814     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
 815     1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 816     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
 817     1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 818     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
 819 };
 820
 821 static inline bool wxIsUTF7Direct(wchar_t wc)
 822 {
 823     return wc < 0x80 && utf7encode[wc] < 1;
 824 }
 825
 826 size_t wxMBConvUTF7::FromWChar(char *dst, size_t dstLen,
 827                                const wchar_t *src, size_t srcLen) const
 828 {
 829     EncoderState stateOrig,
 830                 *statePtr;
 831     if ( srcLen == wxNO_LEN )
 832     {
 833         // we don't apply the stored state when operating on entire strings at
 834         // once
 835         statePtr = &stateOrig;
 836
 837         srcLen = wxWcslen(src) + 1;
 838     }
 839     else // do use the mode we left the output in previously
 840     {
 841         stateOrig = m_stateEncoder;
 842         statePtr = const_cast<EncoderState *>(&m_stateEncoder);
 843     }
 844
 845     EncoderState& state = *statePtr;
 846
 847
 848     size_t len = 0;
 849
 850     const wchar_t * const srcEnd = src + srcLen;
 851     while ( src < srcEnd && (!dst || len < dstLen) )
 852     {
 853         wchar_t cc = *src++;
 854         if ( wxIsUTF7Direct(cc) )
 855         {
 856             if ( state.IsShifted() )
 857             {
 858                 // pad with zeros the last encoded block if necessary
 859                 if ( state.bit )
 860                 {
 861                     if ( dst )
 862                         *dst++ = utf7enb64[((state.accum % 16) << (6 - state.bit)) % 64];
 863                     len++;
 864                 }
 865
 866                 state.ToDirect();
 867
 868                 if ( dst )
 869                     *dst++ = '-';
 870                 len++;
 871             }
 872
 873             if ( dst )
 874                 *dst++ = (char)cc;
 875             len++;
 876         }
 877         else if ( cc == '+' && state.IsDirect() )
 878         {
 879             if ( dst )
 880             {
 881                 *dst++ = '+';
 882                 *dst++ = '-';
 883             }
 884
 885             len += 2;
 886         }
 887 #ifndef WC_UTF16
 888         else if (((wxUint32)cc) > 0xffff)
 889         {
 890             // no surrogate pair generation (yet?)
 891             return wxCONV_FAILED;
 892         }
 893 #endif
 894         else
 895         {
 896             if ( state.IsDirect() )
 897             {
 898                 state.ToShifted();
 899
 900                 if ( dst )
 901                     *dst++ = '+';
 902                 len++;
 903             }
 904
 905             // BASE64 encode string
 906             for ( ;; )
 907             {
 908                 for ( unsigned lsb = 0; lsb < 2; lsb++ )
 909                 {
 910                     state.accum <<= 8;
 911                     state.accum += lsb ? cc & 0xff : (cc & 0xff00) >> 8;
 912
 913                     for (state.bit += 8; state.bit >= 6; )
 914                     {
 915                         state.bit -= 6;
 916                         if ( dst )
 917                             *dst++ = utf7enb64[(state.accum >> state.bit) % 64];
 918                         len++;
 919                     }
 920                 }
 921
 922                 if ( src == srcEnd || wxIsUTF7Direct(cc = *src) )
 923                     break;
 924
 925                 src++;
 926             }
 927         }
 928     }
 929
 930     // we need to restore the original encoder state if we were called just to
 931     // calculate the amount of space needed as we will presumably be called
 932     // again to really convert the data now
 933     if ( !dst )
 934         state = stateOrig;
 935
 936     return len;
 937 }
 938
 939 // ----------------------------------------------------------------------------
 940 // UTF-8
 941 // ----------------------------------------------------------------------------
 942
// utf8_max[i] is the largest value which can be encoded using i + 1 bytes in
// the original (up to 6 bytes long) UTF-8 scheme; the last element is not
// smaller than any 32 bit value and so stops any search through this table
static const wxUint32 utf8_max[]=
    { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };

// boundaries of the private use area we use to (temporarily) remap the bytes
// which are invalid in a UTF-8 encoded string: an invalid byte b is mapped to
// wxUnicodePUA + b, so the half-open range [wxUnicodePUA, wxUnicodePUAEnd)
// has one slot for each of the 256 possible byte values
const wxUint32 wxUnicodePUA = 0x100000;
const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
 950
// this table gives the length of the UTF-8 encoding from its first character:
// it is indexed by the value of the first byte of a sequence and contains the
// total number of bytes in this sequence (1 to 4) or 0 if this byte can't
// start a sequence at all
const unsigned char tableUtf8Lengths[256] = {
    // single-byte sequences (ASCII):
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 00..0F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 10..1F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 20..2F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 30..3F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 40..4F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 50..5F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 60..6F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 70..7F

    // these are invalid:
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 80..8F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 90..9F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // A0..AF
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // B0..BF
    0, 0,                                            // C0,C1

    // two-byte sequences:
          2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // C2..CF
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // D0..DF

    // three-byte sequences:
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,  // E0..EF

    // four-byte sequences:
    4, 4, 4, 4, 4,                                   // F0..F4

    // these are invalid again (5- or 6-byte
    // sequences and sequences for code points
    // above U+10FFFF, as restricted by RFC 3629):
                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0   // F5..FF
};
 985
// Strictly decode the UTF-8 string src into wide characters.
//
// dst and dstLen describe the output buffer: if dstLen is 0, nothing is
// written and only the number of wide characters needed is computed. srcLen
// is the number of bytes in src or wxNO_LEN if src is NUL-terminated, in
// which case the terminating NUL is converted and counted as well.
//
// Returns the number of wide characters written (or needed) or wxCONV_FAILED
// if the output buffer is too small, if a byte which can't start a sequence
// is found, if a sequence is cut short by the end of the input or if a
// continuation byte doesn't have the 10xxxxxx form.
size_t
wxMBConvStrictUTF8::ToWChar(wchar_t *dst, size_t dstLen,
                            const char *src, size_t srcLen) const
{
    // out remains NULL when we are only computing the required length
    wchar_t *out = dstLen ? dst : NULL;
    size_t written = 0;

    // from now on the length is always explicit and includes the trailing NUL
    //
    // NOTE(review): after this assignment the "srcLen == wxNO_LEN" tests below
    // look like dead code -- confirm before removing them
    if ( srcLen == wxNO_LEN )
        srcLen = strlen(src) + 1;

    for ( const char *p = src; ; p++ )
    {
        if ( !(srcLen == wxNO_LEN ? *p : srcLen) )
        {
            // all done successfully, just add the trailing NULL if we are not
            // using explicit length
            if ( srcLen == wxNO_LEN )
            {
                if ( out )
                {
                    if ( !dstLen )
                        break;

                    *out = L'\0';
                }

                written++;
            }

            return written;
        }

        // check that there is still space left in the output buffer, one slot
        // is reserved per decoded code point
        //
        // NOTE(review): in WC_UTF16 build a code point stored as a surrogate
        // pair advances out twice below while dstLen is only decremented
        // once here -- verify that this can't write beyond the buffer end
        if ( out && !dstLen-- )
            break;

        wxUint32 code;
        unsigned char c = *p;

        if ( c < 0x80 )
        {
            if ( srcLen == 0 ) // the test works for wxNO_LEN too
                break;

            if ( srcLen != wxNO_LEN )
                srcLen--;

            code = c;
        }
        else
        {
            // total length of the sequence starting with this byte, 0 if the
            // byte is not a valid lead byte
            unsigned len = tableUtf8Lengths[c];
            if ( !len )
                break;

            // the entire sequence must be inside the input
            if ( srcLen < len ) // the test works for wxNO_LEN too
                break;

            if ( srcLen != wxNO_LEN )
                srcLen -= len;

            //   Char. number range   |        UTF-8 octet sequence
            //      (hexadecimal)     |              (binary)
            //  ----------------------+----------------------------------------
            //  0000 0000 - 0000 007F | 0xxxxxxx
            //  0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
            //  0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
            //  0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
            //
            //  Code point value is stored in bits marked with 'x',
            //  lowest-order bit of the value on the right side in the diagram
            //  above.                                         (from RFC 3629)

            // mask to extract lead byte's value ('x' bits above), by sequence
            // length:
            static const unsigned char leadValueMask[] = { 0x7F, 0x1F, 0x0F, 0x07 };

            // mask and value of lead byte's most significant bits, by length:
            static const unsigned char leadMarkerMask[] = { 0x80, 0xE0, 0xF0, 0xF8 };
            static const unsigned char leadMarkerVal[] = { 0x00, 0xC0, 0xE0, 0xF0 };

            len--; // it's more convenient to work with 0-based length here

            // extract the lead byte's value bits:
            if ( (c & leadMarkerMask[len]) != leadMarkerVal[len] )
                break;

            code = c & leadValueMask[len];

            // all remaining bytes, if any, are handled in the same way
            // regardless of sequence's length:
            //
            // NOTE(review): nothing here rejects overlong 3- and 4-byte forms
            // or UTF-8 encoded surrogates -- confirm whether the strict
            // converter is expected to refuse them
            for ( ; len; --len )
            {
                c = *++p;
                if ( (c & 0xC0) != 0x80 )
                    return wxCONV_FAILED;

                // each continuation byte contributes 6 more value bits
                code <<= 6;
                code |= c & 0x3F;
            }
        }

#ifdef WC_UTF16
        // cast is ok because wchar_t == wxUint16 if WC_UTF16
        if ( encode_utf16(code, (wxUint16 *)out) == 2 )
        {
            // a surrogate pair takes one more wchar_t than accounted for by
            // the common code below
            if ( out )
                out++;
            written++;
        }
#else // !WC_UTF16
        if ( out )
            *out = code;
#endif // WC_UTF16/!WC_UTF16

        if ( out )
            out++;

        written++;
    }

    // we only get here if an error occurred: all "break" statements above
    // correspond to failures
    return wxCONV_FAILED;
}
1108
1109 size_t
1110 wxMBConvStrictUTF8::FromWChar(char *dst, size_t dstLen,
1111                               const wchar_t *src, size_t srcLen) const
1112 {
1113     char *out = dstLen ? dst : NULL;
1114     size_t written = 0;
1115
1116     for ( const wchar_t *wp = src; ; wp++ )
1117     {
1118         if ( !(srcLen == wxNO_LEN ? *wp : srcLen) )
1119         {
1120             // all done successfully, just add the trailing NULL if we are not
1121             // using explicit length
1122             if ( srcLen == wxNO_LEN )
1123             {
1124                 if ( out )
1125                 {
1126                     if ( !dstLen )
1127                         break;
1128
1129                     *out = '\0';
1130                 }
1131
1132                 written++;
1133             }
1134
1135             return written;
1136         }
1137
1138         if ( srcLen != wxNO_LEN )
1139             srcLen--;
1140
1141         wxUint32 code;
1142 #ifdef WC_UTF16
1143         // cast is ok for WC_UTF16
1144         if ( decode_utf16((const wxUint16 *)wp, code) == 2 )
1145         {
1146             // skip the next char too as we decoded a surrogate
1147             wp++;
1148         }
1149 #else // wchar_t is UTF-32
1150         code = *wp & 0x7fffffff;
1151 #endif
1152
1153         unsigned len;
1154         if ( code <= 0x7F )
1155         {
1156             len = 1;
1157             if ( out )
1158             {
1159                 if ( dstLen < len )
1160                     break;
1161
1162                 out[0] = (char)code;
1163             }
1164         }
1165         else if ( code <= 0x07FF )
1166         {
1167             len = 2;
1168             if ( out )
1169             {
1170                 if ( dstLen < len )
1171                     break;
1172
1173                 // NB: this line takes 6 least significant bits, encodes them as
1174                 // 10xxxxxx and discards them so that the next byte can be encoded:
1175                 out[1] = 0x80 | (code & 0x3F);  code >>= 6;
1176                 out[0] = 0xC0 | code;
1177             }
1178         }
1179         else if ( code < 0xFFFF )
1180         {
1181             len = 3;
1182             if ( out )
1183             {
1184                 if ( dstLen < len )
1185                     break;
1186
1187                 out[2] = 0x80 | (code & 0x3F);  code >>= 6;
1188                 out[1] = 0x80 | (code & 0x3F);  code >>= 6;
1189                 out[0] = 0xE0 | code;
1190             }
1191         }
1192         else if ( code <= 0x10FFFF )
1193         {
1194             len = 4;
1195             if ( out )
1196             {
1197                 if ( dstLen < len )
1198                     break;
1199
1200                 out[3] = 0x80 | (code & 0x3F);  code >>= 6;
1201                 out[2] = 0x80 | (code & 0x3F);  code >>= 6;
1202                 out[1] = 0x80 | (code & 0x3F);  code >>= 6;
1203                 out[0] = 0xF0 | code;
1204             }
1205         }
1206         else
1207         {
1208             wxFAIL_MSG( wxT("trying to encode undefined Unicode character") );
1209             break;
1210         }
1211
1212         if ( out )
1213         {
1214             out += len;
1215             dstLen -= len;
1216         }
1217
1218         written += len;
1219     }
1220
1221     // we only get here if an error occurs during decoding
1222     return wxCONV_FAILED;
1223 }
1224
1225 size_t wxMBConvUTF8::ToWChar(wchar_t *buf, size_t n,
1226                              const char *psz, size_t srcLen) const
1227 {
1228     if ( m_options == MAP_INVALID_UTF8_NOT )
1229         return wxMBConvStrictUTF8::ToWChar(buf, n, psz, srcLen);
1230
1231     size_t len = 0;
1232
1233     while ((srcLen == wxNO_LEN ? *psz : srcLen--) && ((!buf) || (len < n)))
1234     {
1235         const char *opsz = psz;
1236         bool invalid = false;
1237         unsigned char cc = *psz++, fc = cc;
1238         unsigned cnt;
1239         for (cnt = 0; fc & 0x80; cnt++)
1240             fc <<= 1;
1241
1242         if (!cnt)
1243         {
1244             // plain ASCII char
1245             if (buf)
1246                 *buf++ = cc;
1247             len++;
1248
1249             // escape the escape character for octal escapes
1250             if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
1251                     && cc == '\\' && (!buf || len < n))
1252             {
1253                 if (buf)
1254                     *buf++ = cc;
1255                 len++;
1256             }
1257         }
1258         else
1259         {
1260             cnt--;
1261             if (!cnt)
1262             {
1263                 // invalid UTF-8 sequence
1264                 invalid = true;
1265             }
1266             else
1267             {
1268                 unsigned ocnt = cnt - 1;
1269                 wxUint32 res = cc & (0x3f >> cnt);
1270                 while (cnt--)
1271                 {
1272                     cc = *psz;
1273                     if ((cc & 0xC0) != 0x80)
1274                     {
1275                         // invalid UTF-8 sequence
1276                         invalid = true;
1277                         break;
1278                     }
1279
1280                     psz++;
1281                     res = (res << 6) | (cc & 0x3f);
1282                 }
1283
1284                 if (invalid || res <= utf8_max[ocnt])
1285                 {
1286                     // illegal UTF-8 encoding
1287                     invalid = true;
1288                 }
1289                 else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
1290                         res >= wxUnicodePUA && res < wxUnicodePUAEnd)
1291                 {
1292                     // if one of our PUA characters turns up externally
1293                     // it must also be treated as an illegal sequence
1294                     // (a bit like you have to escape an escape character)
1295                     invalid = true;
1296                 }
1297                 else
1298                 {
1299 #ifdef WC_UTF16
1300                     // cast is ok because wchar_t == wxUint16 if WC_UTF16
1301                     size_t pa = encode_utf16(res, (wxUint16 *)buf);
1302                     if (pa == wxCONV_FAILED)
1303                     {
1304                         invalid = true;
1305                     }
1306                     else
1307                     {
1308                         if (buf)
1309                             buf += pa;
1310                         len += pa;
1311                     }
1312 #else // !WC_UTF16
1313                     if (buf)
1314                         *buf++ = (wchar_t)res;
1315                     len++;
1316 #endif // WC_UTF16/!WC_UTF16
1317                 }
1318             }
1319
1320             if (invalid)
1321             {
1322                 if (m_options & MAP_INVALID_UTF8_TO_PUA)
1323                 {
1324                     while (opsz < psz && (!buf || len < n))
1325                     {
1326 #ifdef WC_UTF16
1327                         // cast is ok because wchar_t == wxUuint16 if WC_UTF16
1328                         size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
1329                         wxASSERT(pa != wxCONV_FAILED);
1330                         if (buf)
1331                             buf += pa;
1332                         opsz++;
1333                         len += pa;
1334 #else
1335                         if (buf)
1336                             *buf++ = (wchar_t)(wxUnicodePUA + (unsigned char)*opsz);
1337                         opsz++;
1338                         len++;
1339 #endif
1340                     }
1341                 }
1342                 else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
1343                 {
1344                     while (opsz < psz && (!buf || len < n))
1345                     {
1346                         if ( buf && len + 3 < n )
1347                         {
1348                             unsigned char on = *opsz;
1349                             *buf++ = L'\\';
1350                             *buf++ = (wchar_t)( L'0' + on / 0100 );
1351                             *buf++ = (wchar_t)( L'0' + (on % 0100) / 010 );
1352                             *buf++ = (wchar_t)( L'0' + on % 010 );
1353                         }
1354
1355                         opsz++;
1356                         len += 4;
1357                     }
1358                 }
1359                 else // MAP_INVALID_UTF8_NOT
1360                 {
1361                     return wxCONV_FAILED;
1362                 }
1363             }
1364         }
1365     }
1366
1367     if (srcLen == wxNO_LEN && buf && (len < n))
1368         *buf = 0;
1369
1370     return len + 1;
1371 }
1372
// Return true if wch is one of the octal digits, i.e. is in L'0'..L'7' range.
static inline bool isoctal(wchar_t wch)
{
    if ( wch < L'0' )
        return false;

    return !(wch > L'7');
}
1377
1378 size_t wxMBConvUTF8::FromWChar(char *buf, size_t n,
1379                                const wchar_t *psz, size_t srcLen) const
1380 {
1381     if ( m_options == MAP_INVALID_UTF8_NOT )
1382         return wxMBConvStrictUTF8::FromWChar(buf, n, psz, srcLen);
1383
1384     size_t len = 0;
1385
1386     while ((srcLen == wxNO_LEN ? *psz : srcLen--) && ((!buf) || (len < n)))
1387     {
1388         wxUint32 cc;
1389
1390 #ifdef WC_UTF16
1391         // cast is ok for WC_UTF16
1392         size_t pa = decode_utf16((const wxUint16 *)psz, cc);
1393         psz += (pa == wxCONV_FAILED) ? 1 : pa;
1394 #else
1395         cc = (*psz++) & 0x7fffffff;
1396 #endif
1397
1398         if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
1399                 && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
1400         {
1401             if (buf)
1402                 *buf++ = (char)(cc - wxUnicodePUA);
1403             len++;
1404         }
1405         else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL)
1406                     && cc == L'\\' && psz[0] == L'\\' )
1407         {
1408             if (buf)
1409                 *buf++ = (char)cc;
1410             psz++;
1411             len++;
1412         }
1413         else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
1414                     cc == L'\\' &&
1415                         isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
1416         {
1417             if (buf)
1418             {
1419                 *buf++ = (char) ((psz[0] - L'0') * 0100 +
1420                                  (psz[1] - L'0') * 010 +
1421                                  (psz[2] - L'0'));
1422             }
1423
1424             psz += 3;
1425             len++;
1426         }
1427         else
1428         {
1429             unsigned cnt;
1430             for (cnt = 0; cc > utf8_max[cnt]; cnt++)
1431             {
1432             }
1433
1434             if (!cnt)
1435             {
1436                 // plain ASCII char
1437                 if (buf)
1438                     *buf++ = (char) cc;
1439                 len++;
1440             }
1441             else
1442             {
1443                 len += cnt + 1;
1444                 if (buf)
1445                 {
1446                     *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
1447                     while (cnt--)
1448                         *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
1449                 }
1450             }
1451         }
1452     }
1453
1454     if (srcLen == wxNO_LEN && buf && (len < n))
1455         *buf = 0;
1456
1457     return len + 1;
1458 }
1459
1460 // ============================================================================
1461 // UTF-16
1462 // ============================================================================
1463
// The functions below are written in terms of "straight" and "swap"
// converters: "straight" is the name of the class whose byte order is the
// same as the one of the current platform and "swap" the name of the class
// using the opposite byte order.
#ifdef WORDS_BIGENDIAN
    #define wxMBConvUTF16straight wxMBConvUTF16BE
    #define wxMBConvUTF16swap     wxMBConvUTF16LE
#else
    #define wxMBConvUTF16swap     wxMBConvUTF16BE
    #define wxMBConvUTF16straight wxMBConvUTF16LE
#endif
1471
1472 /* static */
1473 size_t wxMBConvUTF16Base::GetLength(const char *src, size_t srcLen)
1474 {
1475     if ( srcLen == wxNO_LEN )
1476     {
1477         // count the number of bytes in input, including the trailing NULs
1478         const wxUint16 *inBuff = reinterpret_cast<const wxUint16 *>(src);
1479         for ( srcLen = 1; *inBuff++; srcLen++ )
1480             ;
1481
1482         srcLen *= BYTES_PER_CHAR;
1483     }
1484     else // we already have the length
1485     {
1486         // we can only convert an entire number of UTF-16 characters
1487         if ( srcLen % BYTES_PER_CHAR )
1488             return wxCONV_FAILED;
1489     }
1490
1491     return srcLen;
1492 }
1493
1494 // case when in-memory representation is UTF-16 too
1495 #ifdef WC_UTF16
1496
1497 // ----------------------------------------------------------------------------
1498 // conversions without endianness change
1499 // ----------------------------------------------------------------------------
1500
1501 size_t
1502 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1503                                const char *src, size_t srcLen) const
1504 {
1505     // set up the scene for using memcpy() (which is presumably more efficient
1506     // than copying the bytes one by one)
1507     srcLen = GetLength(src, srcLen);
1508     if ( srcLen == wxNO_LEN )
1509         return wxCONV_FAILED;
1510
1511     const size_t inLen = srcLen / BYTES_PER_CHAR;
1512     if ( dst )
1513     {
1514         if ( dstLen < inLen )
1515             return wxCONV_FAILED;
1516
1517         memcpy(dst, src, srcLen);
1518     }
1519
1520     return inLen;
1521 }
1522
1523 size_t
1524 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1525                                  const wchar_t *src, size_t srcLen) const
1526 {
1527     if ( srcLen == wxNO_LEN )
1528         srcLen = wxWcslen(src) + 1;
1529
1530     srcLen *= BYTES_PER_CHAR;
1531
1532     if ( dst )
1533     {
1534         if ( dstLen < srcLen )
1535             return wxCONV_FAILED;
1536
1537         memcpy(dst, src, srcLen);
1538     }
1539
1540     return srcLen;
1541 }
1542
1543 // ----------------------------------------------------------------------------
1544 // endian-reversing conversions
1545 // ----------------------------------------------------------------------------
1546
1547 size_t
1548 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1549                            const char *src, size_t srcLen) const
1550 {
1551     srcLen = GetLength(src, srcLen);
1552     if ( srcLen == wxNO_LEN )
1553         return wxCONV_FAILED;
1554
1555     srcLen /= BYTES_PER_CHAR;
1556
1557     if ( dst )
1558     {
1559         if ( dstLen < srcLen )
1560             return wxCONV_FAILED;
1561
1562         const wxUint16 *inBuff = reinterpret_cast<const wxUint16 *>(src);
1563         for ( size_t n = 0; n < srcLen; n++, inBuff++ )
1564         {
1565             *dst++ = wxUINT16_SWAP_ALWAYS(*inBuff);
1566         }
1567     }
1568
1569     return srcLen;
1570 }
1571
1572 size_t
1573 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1574                              const wchar_t *src, size_t srcLen) const
1575 {
1576     if ( srcLen == wxNO_LEN )
1577         srcLen = wxWcslen(src) + 1;
1578
1579     srcLen *= BYTES_PER_CHAR;
1580
1581     if ( dst )
1582     {
1583         if ( dstLen < srcLen )
1584             return wxCONV_FAILED;
1585
1586         wxUint16 *outBuff = reinterpret_cast<wxUint16 *>(dst);
1587         for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
1588         {
1589             *outBuff++ = wxUINT16_SWAP_ALWAYS(*src);
1590         }
1591     }
1592
1593     return srcLen;
1594 }
1595
1596 #else // !WC_UTF16: wchar_t is UTF-32
1597
1598 // ----------------------------------------------------------------------------
1599 // conversions without endianness change
1600 // ----------------------------------------------------------------------------
1601
1602 size_t
1603 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1604                                const char *src, size_t srcLen) const
1605 {
1606     srcLen = GetLength(src, srcLen);
1607     if ( srcLen == wxNO_LEN )
1608         return wxCONV_FAILED;
1609
1610     const size_t inLen = srcLen / BYTES_PER_CHAR;
1611     if ( !dst )
1612     {
1613         // optimization: return maximal space which could be needed for this
1614         // string even if the real size could be smaller if the buffer contains
1615         // any surrogates
1616         return inLen;
1617     }
1618
1619     size_t outLen = 0;
1620     const wxUint16 *inBuff = reinterpret_cast<const wxUint16 *>(src);
1621     for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1622     {
1623         const wxUint32 ch = wxDecodeSurrogate(&inBuff);
1624         if ( !inBuff )
1625             return wxCONV_FAILED;
1626
1627         if ( ++outLen > dstLen )
1628             return wxCONV_FAILED;
1629
1630         *dst++ = ch;
1631     }
1632
1633
1634     return outLen;
1635 }
1636
1637 size_t
1638 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1639                                  const wchar_t *src, size_t srcLen) const
1640 {
1641     if ( srcLen == wxNO_LEN )
1642         srcLen = wxWcslen(src) + 1;
1643
1644     size_t outLen = 0;
1645     wxUint16 *outBuff = reinterpret_cast<wxUint16 *>(dst);
1646     for ( size_t n = 0; n < srcLen; n++ )
1647     {
1648         wxUint16 cc[2];
1649         const size_t numChars = encode_utf16(*src++, cc);
1650         if ( numChars == wxCONV_FAILED )
1651             return wxCONV_FAILED;
1652
1653         outLen += numChars * BYTES_PER_CHAR;
1654         if ( outBuff )
1655         {
1656             if ( outLen > dstLen )
1657                 return wxCONV_FAILED;
1658
1659             *outBuff++ = cc[0];
1660             if ( numChars == 2 )
1661             {
1662                 // second character of a surrogate
1663                 *outBuff++ = cc[1];
1664             }
1665         }
1666     }
1667
1668     return outLen;
1669 }
1670
1671 // ----------------------------------------------------------------------------
1672 // endian-reversing conversions
1673 // ----------------------------------------------------------------------------
1674
1675 size_t
1676 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1677                            const char *src, size_t srcLen) const
1678 {
1679     srcLen = GetLength(src, srcLen);
1680     if ( srcLen == wxNO_LEN )
1681         return wxCONV_FAILED;
1682
1683     const size_t inLen = srcLen / BYTES_PER_CHAR;
1684     if ( !dst )
1685     {
1686         // optimization: return maximal space which could be needed for this
1687         // string even if the real size could be smaller if the buffer contains
1688         // any surrogates
1689         return inLen;
1690     }
1691
1692     size_t outLen = 0;
1693     const wxUint16 *inBuff = reinterpret_cast<const wxUint16 *>(src);
1694     for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1695     {
1696         wxUint32 ch;
1697         wxUint16 tmp[2];
1698
1699         tmp[0] = wxUINT16_SWAP_ALWAYS(*inBuff);
1700         inBuff++;
1701         tmp[1] = wxUINT16_SWAP_ALWAYS(*inBuff);
1702
1703         const size_t numChars = decode_utf16(tmp, ch);
1704         if ( numChars == wxCONV_FAILED )
1705             return wxCONV_FAILED;
1706
1707         if ( numChars == 2 )
1708             inBuff++;
1709
1710         if ( ++outLen > dstLen )
1711             return wxCONV_FAILED;
1712
1713         *dst++ = ch;
1714     }
1715
1716
1717     return outLen;
1718 }
1719
1720 size_t
1721 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1722                              const wchar_t *src, size_t srcLen) const
1723 {
1724     if ( srcLen == wxNO_LEN )
1725         srcLen = wxWcslen(src) + 1;
1726
1727     size_t outLen = 0;
1728     wxUint16 *outBuff = reinterpret_cast<wxUint16 *>(dst);
1729     for ( const wchar_t *srcEnd = src + srcLen; src < srcEnd; src++ )
1730     {
1731         wxUint16 cc[2];
1732         const size_t numChars = encode_utf16(*src, cc);
1733         if ( numChars == wxCONV_FAILED )
1734             return wxCONV_FAILED;
1735
1736         outLen += numChars * BYTES_PER_CHAR;
1737         if ( outBuff )
1738         {
1739             if ( outLen > dstLen )
1740                 return wxCONV_FAILED;
1741
1742             *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[0]);
1743             if ( numChars == 2 )
1744             {
1745                 // second character of a surrogate
1746                 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[1]);
1747             }
1748         }
1749     }
1750
1751     return outLen;
1752 }
1753
1754 #endif // WC_UTF16/!WC_UTF16
1755
1756
1757 // ============================================================================
1758 // UTF-32
1759 // ============================================================================
1760
// As for UTF-16 above, "straight" is the name of the class whose byte order
// is the same as the one of the current platform and "swap" the name of the
// class using the opposite byte order.
#ifdef WORDS_BIGENDIAN
    #define wxMBConvUTF32straight  wxMBConvUTF32BE
    #define wxMBConvUTF32swap      wxMBConvUTF32LE
#else
    #define wxMBConvUTF32swap      wxMBConvUTF32BE
    #define wxMBConvUTF32straight  wxMBConvUTF32LE
#endif


// global converter objects, one for each byte order
WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
1772
1773 /* static */
1774 size_t wxMBConvUTF32Base::GetLength(const char *src, size_t srcLen)
1775 {
1776     if ( srcLen == wxNO_LEN )
1777     {
1778         // count the number of bytes in input, including the trailing NULs
1779         const wxUint32 *inBuff = reinterpret_cast<const wxUint32 *>(src);
1780         for ( srcLen = 1; *inBuff++; srcLen++ )
1781             ;
1782
1783         srcLen *= BYTES_PER_CHAR;
1784     }
1785     else // we already have the length
1786     {
1787         // we can only convert an entire number of UTF-32 characters
1788         if ( srcLen % BYTES_PER_CHAR )
1789             return wxCONV_FAILED;
1790     }
1791
1792     return srcLen;
1793 }
1794
1795 // case when in-memory representation is UTF-16
1796 #ifdef WC_UTF16
1797
1798 // ----------------------------------------------------------------------------
1799 // conversions without endianness change
1800 // ----------------------------------------------------------------------------
1801
1802 size_t
1803 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1804                                const char *src, size_t srcLen) const
1805 {
1806     srcLen = GetLength(src, srcLen);
1807     if ( srcLen == wxNO_LEN )
1808         return wxCONV_FAILED;
1809
1810     const wxUint32 *inBuff = reinterpret_cast<const wxUint32 *>(src);
1811     const size_t inLen = srcLen / BYTES_PER_CHAR;
1812     size_t outLen = 0;
1813     for ( size_t n = 0; n < inLen; n++ )
1814     {
1815         wxUint16 cc[2];
1816         const size_t numChars = encode_utf16(*inBuff++, cc);
1817         if ( numChars == wxCONV_FAILED )
1818             return wxCONV_FAILED;
1819
1820         outLen += numChars;
1821         if ( dst )
1822         {
1823             if ( outLen > dstLen )
1824                 return wxCONV_FAILED;
1825
1826             *dst++ = cc[0];
1827             if ( numChars == 2 )
1828             {
1829                 // second character of a surrogate
1830                 *dst++ = cc[1];
1831             }
1832         }
1833     }
1834
1835     return outLen;
1836 }
1837
1838 size_t
1839 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1840                                  const wchar_t *src, size_t srcLen) const
1841 {
1842     if ( srcLen == wxNO_LEN )
1843         srcLen = wxWcslen(src) + 1;
1844
1845     if ( !dst )
1846     {
1847         // optimization: return maximal space which could be needed for this
1848         // string instead of the exact amount which could be less if there are
1849         // any surrogates in the input
1850         //
1851         // we consider that surrogates are rare enough to make it worthwhile to
1852         // avoid running the loop below at the cost of slightly extra memory
1853         // consumption
1854         return srcLen * BYTES_PER_CHAR;
1855     }
1856
1857     wxUint32 *outBuff = reinterpret_cast<wxUint32 *>(dst);
1858     size_t outLen = 0;
1859     for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1860     {
1861         const wxUint32 ch = wxDecodeSurrogate(&src);
1862         if ( !src )
1863             return wxCONV_FAILED;
1864
1865         outLen += BYTES_PER_CHAR;
1866
1867         if ( outLen > dstLen )
1868             return wxCONV_FAILED;
1869
1870         *outBuff++ = ch;
1871     }
1872
1873     return outLen;
1874 }
1875
1876 // ----------------------------------------------------------------------------
1877 // endian-reversing conversions
1878 // ----------------------------------------------------------------------------
1879
1880 size_t
1881 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1882                            const char *src, size_t srcLen) const
1883 {
1884     srcLen = GetLength(src, srcLen);
1885     if ( srcLen == wxNO_LEN )
1886         return wxCONV_FAILED;
1887
1888     const wxUint32 *inBuff = reinterpret_cast<const wxUint32 *>(src);
1889     const size_t inLen = srcLen / BYTES_PER_CHAR;
1890     size_t outLen = 0;
1891     for ( size_t n = 0; n < inLen; n++, inBuff++ )
1892     {
1893         wxUint16 cc[2];
1894         const size_t numChars = encode_utf16(wxUINT32_SWAP_ALWAYS(*inBuff), cc);
1895         if ( numChars == wxCONV_FAILED )
1896             return wxCONV_FAILED;
1897
1898         outLen += numChars;
1899         if ( dst )
1900         {
1901             if ( outLen > dstLen )
1902                 return wxCONV_FAILED;
1903
1904             *dst++ = cc[0];
1905             if ( numChars == 2 )
1906             {
1907                 // second character of a surrogate
1908                 *dst++ = cc[1];
1909             }
1910         }
1911     }
1912
1913     return outLen;
1914 }
1915
1916 size_t
1917 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1918                              const wchar_t *src, size_t srcLen) const
1919 {
1920     if ( srcLen == wxNO_LEN )
1921         srcLen = wxWcslen(src) + 1;
1922
1923     if ( !dst )
1924     {
1925         // optimization: return maximal space which could be needed for this
1926         // string instead of the exact amount which could be less if there are
1927         // any surrogates in the input
1928         //
1929         // we consider that surrogates are rare enough to make it worthwhile to
1930         // avoid running the loop below at the cost of slightly extra memory
1931         // consumption
1932         return srcLen*BYTES_PER_CHAR;
1933     }
1934
1935     wxUint32 *outBuff = reinterpret_cast<wxUint32 *>(dst);
1936     size_t outLen = 0;
1937     for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1938     {
1939         const wxUint32 ch = wxDecodeSurrogate(&src);
1940         if ( !src )
1941             return wxCONV_FAILED;
1942
1943         outLen += BYTES_PER_CHAR;
1944
1945         if ( outLen > dstLen )
1946             return wxCONV_FAILED;
1947
1948         *outBuff++ = wxUINT32_SWAP_ALWAYS(ch);
1949     }
1950
1951     return outLen;
1952 }
1953
1954 #else // !WC_UTF16: wchar_t is UTF-32
1955
1956 // ----------------------------------------------------------------------------
1957 // conversions without endianness change
1958 // ----------------------------------------------------------------------------
1959
1960 size_t
1961 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1962                                const char *src, size_t srcLen) const
1963 {
1964     // use memcpy() as it should be much faster than hand-written loop
1965     srcLen = GetLength(src, srcLen);
1966     if ( srcLen == wxNO_LEN )
1967         return wxCONV_FAILED;
1968
1969     const size_t inLen = srcLen/BYTES_PER_CHAR;
1970     if ( dst )
1971     {
1972         if ( dstLen < inLen )
1973             return wxCONV_FAILED;
1974
1975         memcpy(dst, src, srcLen);
1976     }
1977
1978     return inLen;
1979 }
1980
1981 size_t
1982 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1983                                  const wchar_t *src, size_t srcLen) const
1984 {
1985     if ( srcLen == wxNO_LEN )
1986         srcLen = wxWcslen(src) + 1;
1987
1988     srcLen *= BYTES_PER_CHAR;
1989
1990     if ( dst )
1991     {
1992         if ( dstLen < srcLen )
1993             return wxCONV_FAILED;
1994
1995         memcpy(dst, src, srcLen);
1996     }
1997
1998     return srcLen;
1999 }
2000
2001 // ----------------------------------------------------------------------------
2002 // endian-reversing conversions
2003 // ----------------------------------------------------------------------------
2004
2005 size_t
2006 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
2007                            const char *src, size_t srcLen) const
2008 {
2009     srcLen = GetLength(src, srcLen);
2010     if ( srcLen == wxNO_LEN )
2011         return wxCONV_FAILED;
2012
2013     srcLen /= BYTES_PER_CHAR;
2014
2015     if ( dst )
2016     {
2017         if ( dstLen < srcLen )
2018             return wxCONV_FAILED;
2019
2020         const wxUint32 *inBuff = reinterpret_cast<const wxUint32 *>(src);
2021         for ( size_t n = 0; n < srcLen; n++, inBuff++ )
2022         {
2023             *dst++ = wxUINT32_SWAP_ALWAYS(*inBuff);
2024         }
2025     }
2026
2027     return srcLen;
2028 }
2029
2030 size_t
2031 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
2032                              const wchar_t *src, size_t srcLen) const
2033 {
2034     if ( srcLen == wxNO_LEN )
2035         srcLen = wxWcslen(src) + 1;
2036
2037     srcLen *= BYTES_PER_CHAR;
2038
2039     if ( dst )
2040     {
2041         if ( dstLen < srcLen )
2042             return wxCONV_FAILED;
2043
2044         wxUint32 *outBuff = reinterpret_cast<wxUint32 *>(dst);
2045         for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
2046         {
2047             *outBuff++ = wxUINT32_SWAP_ALWAYS(*src);
2048         }
2049     }
2050
2051     return srcLen;
2052 }
2053
2054 #endif // WC_UTF16/!WC_UTF16
2055
2056
2057 // ============================================================================
2058 // The classes doing conversion using the iconv_xxx() functions
2059 // ============================================================================
2060
2061 #ifdef HAVE_ICONV
2062
2063 // VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
2064 //     E2BIG if output buffer is _exactly_ as big as needed. Such case is
2065 //     (unless there's yet another bug in glibc) the only case when iconv()
2066 //     returns with (size_t)-1 (which means error) and says there are 0 bytes
2067 //     left in the input buffer -- when _real_ error occurs,
2068 //     bytes-left-in-input buffer is non-zero. Hence, this alternative test for
2069 //     iconv() failure.
2070 //     [This bug does not appear in glibc 2.2.]
2071 #if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
2072 #define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
2073                                      (errno != E2BIG || bufLeft != 0))
2074 #else
2075 #define ICONV_FAILED(cres, bufLeft)  (cres == (size_t)-1)
2076 #endif
2077
2078 #define ICONV_CHAR_CAST(x)  ((ICONV_CONST char **)(x))
2079
2080 #define ICONV_T_INVALID ((iconv_t)-1)
2081
2082 #if SIZEOF_WCHAR_T == 4
2083     #define WC_BSWAP    wxUINT32_SWAP_ALWAYS
2084     #define WC_ENC      wxFONTENCODING_UTF32
2085 #elif SIZEOF_WCHAR_T == 2
2086     #define WC_BSWAP    wxUINT16_SWAP_ALWAYS
2087     #define WC_ENC      wxFONTENCODING_UTF16
2088 #else // sizeof(wchar_t) != 2 nor 4
2089     // does this ever happen?
2090     #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
2091 #endif
2092
2093 // ----------------------------------------------------------------------------
2094 // wxMBConv_iconv: encapsulates an iconv character set
2095 // ----------------------------------------------------------------------------
2096
2097 class wxMBConv_iconv : public wxMBConv
2098 {
2099 public:
2100     wxMBConv_iconv(const char *name);
2101     virtual ~wxMBConv_iconv();
2102
2103     // implement base class virtual methods
2104     virtual size_t ToWChar(wchar_t *dst, size_t dstLen,
2105                            const char *src, size_t srcLen = wxNO_LEN) const;
2106     virtual size_t FromWChar(char *dst, size_t dstLen,
2107                              const wchar_t *src, size_t srcLen = wxNO_LEN) const;
2108     virtual size_t GetMBNulLen() const;
2109
2110 #if wxUSE_UNICODE_UTF8
2111     virtual bool IsUTF8() const;
2112 #endif
2113
2114     virtual wxMBConv *Clone() const
2115     {
2116         wxMBConv_iconv *p = new wxMBConv_iconv(m_name);
2117         p->m_minMBCharWidth = m_minMBCharWidth;
2118         return p;
2119     }
2120
2121     bool IsOk() const
2122         { return (m2w != ICONV_T_INVALID) && (w2m != ICONV_T_INVALID); }
2123
2124 protected:
2125     // the iconv handlers used to translate from multibyte
2126     // to wide char and in the other direction
2127     iconv_t m2w,
2128             w2m;
2129
2130 #if wxUSE_THREADS
2131     // guards access to m2w and w2m objects
2132     wxMutex m_iconvMutex;
2133 #endif
2134
2135 private:
2136     // the name (for iconv_open()) of a wide char charset -- if none is
2137     // available on this machine, it will remain NULL
2138     static wxString ms_wcCharsetName;
2139
2140     // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
2141     // different endian-ness than the native one
2142     static bool ms_wcNeedsSwap;
2143
2144
2145     // name of the encoding handled by this conversion
2146     const char *m_name;
2147
2148     // cached result of GetMBNulLen(); set to 0 meaning "unknown"
2149     // initially
2150     size_t m_minMBCharWidth;
2151 };
2152
2153 // make the constructor available for unit testing
2154 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_iconv( const char* name )
2155 {
2156     wxMBConv_iconv* result = new wxMBConv_iconv( name );
2157     if ( !result->IsOk() )
2158     {
2159         delete result;
2160         return 0;
2161     }
2162
2163     return result;
2164 }
2165
2166 wxString wxMBConv_iconv::ms_wcCharsetName;
2167 bool wxMBConv_iconv::ms_wcNeedsSwap = false;
2168
2169 wxMBConv_iconv::wxMBConv_iconv(const char *name)
2170               : m_name(wxStrdup(name))
2171 {
2172     m_minMBCharWidth = 0;
2173
2174     // check for charset that represents wchar_t:
2175     if ( ms_wcCharsetName.empty() )
2176     {
2177         wxLogTrace(TRACE_STRCONV, wxT("Looking for wide char codeset:"));
2178
2179 #if wxUSE_FONTMAP
2180         const wxChar *const *names = wxFontMapperBase::GetAllEncodingNames(WC_ENC);
2181 #else // !wxUSE_FONTMAP
2182         static const wxChar *const names_static[] =
2183         {
2184 #if SIZEOF_WCHAR_T == 4
2185             wxT("UCS-4"),
2186 #elif SIZEOF_WCHAR_T == 2
2187             wxT("UCS-2"),
2188 #endif
2189             NULL
2190         };
2191         const wxChar *const *names = names_static;
2192 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
2193
2194         for ( ; *names && ms_wcCharsetName.empty(); ++names )
2195         {
2196             const wxString nameCS(*names);
2197
2198             // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
2199             wxString nameXE(nameCS);
2200
2201 #ifdef WORDS_BIGENDIAN
2202                 nameXE += wxT("BE");
2203 #else // little endian
2204                 nameXE += wxT("LE");
2205 #endif
2206
2207             wxLogTrace(TRACE_STRCONV, wxT("  trying charset \"%s\""),
2208                        nameXE.c_str());
2209
2210             m2w = iconv_open(nameXE.ToAscii(), name);
2211             if ( m2w == ICONV_T_INVALID )
2212             {
2213                 // try charset w/o bytesex info (e.g. "UCS4")
2214                 wxLogTrace(TRACE_STRCONV, wxT("  trying charset \"%s\""),
2215                            nameCS.c_str());
2216                 m2w = iconv_open(nameCS.ToAscii(), name);
2217
2218                 // and check for bytesex ourselves:
2219                 if ( m2w != ICONV_T_INVALID )
2220                 {
2221                     char    buf[2], *bufPtr;
2222                     wchar_t wbuf[2];
2223                     size_t  insz, outsz;
2224                     size_t  res;
2225
2226                     buf[0] = 'A';
2227                     buf[1] = 0;
2228                     wbuf[0] = 0;
2229                     insz = 2;
2230                     outsz = SIZEOF_WCHAR_T * 2;
2231                     char* wbufPtr = (char*)wbuf;
2232                     bufPtr = buf;
2233
2234                     res = iconv(
2235                         m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
2236                         &wbufPtr, &outsz);
2237
2238                     if (ICONV_FAILED(res, insz))
2239                     {
2240                         wxLogLastError(wxT("iconv"));
2241                         wxLogError(_("Conversion to charset '%s' doesn't work."),
2242                                    nameCS.c_str());
2243                     }
2244                     else // ok, can convert to this encoding, remember it
2245                     {
2246                         ms_wcCharsetName = nameCS;
2247                         ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
2248                     }
2249                 }
2250             }
2251             else // use charset not requiring byte swapping
2252             {
2253                 ms_wcCharsetName = nameXE;
2254             }
2255         }
2256
2257         wxLogTrace(TRACE_STRCONV,
2258                    wxT("iconv wchar_t charset is \"%s\"%s"),
2259                    ms_wcCharsetName.empty() ? wxString("<none>")
2260                                             : ms_wcCharsetName,
2261                    ms_wcNeedsSwap ? wxT(" (needs swap)")
2262                                   : wxT(""));
2263     }
2264     else // we already have ms_wcCharsetName
2265     {
2266         m2w = iconv_open(ms_wcCharsetName.ToAscii(), name);
2267     }
2268
2269     if ( ms_wcCharsetName.empty() )
2270     {
2271         w2m = ICONV_T_INVALID;
2272     }
2273     else
2274     {
2275         w2m = iconv_open(name, ms_wcCharsetName.ToAscii());
2276         if ( w2m == ICONV_T_INVALID )
2277         {
2278             wxLogTrace(TRACE_STRCONV,
2279                        wxT("\"%s\" -> \"%s\" works but not the converse!?"),
2280                        ms_wcCharsetName.c_str(), name);
2281         }
2282     }
2283 }
2284
2285 wxMBConv_iconv::~wxMBConv_iconv()
2286 {
2287     free(const_cast<char *>(m_name));
2288
2289     if ( m2w != ICONV_T_INVALID )
2290         iconv_close(m2w);
2291     if ( w2m != ICONV_T_INVALID )
2292         iconv_close(w2m);
2293 }
2294
2295 size_t
2296 wxMBConv_iconv::ToWChar(wchar_t *dst, size_t dstLen,
2297                         const char *src, size_t srcLen) const
2298 {
2299     if ( srcLen == wxNO_LEN )
2300     {
2301         // find the string length: notice that must be done differently for
2302         // NUL-terminated strings and UTF-16/32 which are terminated with 2/4
2303         // consecutive NULs
2304         const size_t nulLen = GetMBNulLen();
2305         switch ( nulLen )
2306         {
2307             default:
2308                 return wxCONV_FAILED;
2309
2310             case 1:
2311                 srcLen = strlen(src); // arguably more optimized than our version
2312                 break;
2313
2314             case 2:
2315             case 4:
2316                 // for UTF-16/32 not only we need to have 2/4 consecutive NULs
2317                 // but they also have to start at character boundary and not
2318                 // span two adjacent characters
2319                 const char *p;
2320                 for ( p = src; NotAllNULs(p, nulLen); p += nulLen )
2321                     ;
2322                 srcLen = p - src;
2323                 break;
2324         }
2325
2326         // when we're determining the length of the string ourselves we count
2327         // the terminating NUL(s) as part of it and always NUL-terminate the
2328         // output
2329         srcLen += nulLen;
2330     }
2331
2332     // we express length in the number of (wide) characters but iconv always
2333     // counts buffer sizes it in bytes
2334     dstLen *= SIZEOF_WCHAR_T;
2335
2336 #if wxUSE_THREADS
2337     // NB: iconv() is MT-safe, but each thread must use its own iconv_t handle.
2338     //     Unfortunately there are a couple of global wxCSConv objects such as
2339     //     wxConvLocal that are used all over wx code, so we have to make sure
2340     //     the handle is used by at most one thread at the time. Otherwise
2341     //     only a few wx classes would be safe to use from non-main threads
2342     //     as MB<->WC conversion would fail "randomly".
2343     wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
2344 #endif // wxUSE_THREADS
2345
2346     size_t res, cres;
2347     const char *pszPtr = src;
2348
2349     if ( dst )
2350     {
2351         char* bufPtr = (char*)dst;
2352
2353         // have destination buffer, convert there
2354         size_t dstLenOrig = dstLen;
2355         cres = iconv(m2w,
2356                      ICONV_CHAR_CAST(&pszPtr), &srcLen,
2357                      &bufPtr, &dstLen);
2358
2359         // convert the number of bytes converted as returned by iconv to the
2360         // number of (wide) characters converted that we need
2361         res = (dstLenOrig - dstLen) / SIZEOF_WCHAR_T;
2362
2363         if (ms_wcNeedsSwap)
2364         {
2365             // convert to native endianness
2366             for ( unsigned i = 0; i < res; i++ )
2367                 dst[i] = WC_BSWAP(dst[i]);
2368         }
2369     }
2370     else // no destination buffer
2371     {
2372         // convert using temp buffer to calculate the size of the buffer needed
2373         wchar_t tbuf[256];
2374         res = 0;
2375
2376         do
2377         {
2378             char* bufPtr = (char*)tbuf;
2379             dstLen = 8 * SIZEOF_WCHAR_T;
2380
2381             cres = iconv(m2w,
2382                          ICONV_CHAR_CAST(&pszPtr), &srcLen,
2383                          &bufPtr, &dstLen );
2384
2385             res += 8 - (dstLen / SIZEOF_WCHAR_T);
2386         }
2387         while ((cres == (size_t)-1) && (errno == E2BIG));
2388     }
2389
2390     if (ICONV_FAILED(cres, srcLen))
2391     {
2392         //VS: it is ok if iconv fails, hence trace only
2393         wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
2394         return wxCONV_FAILED;
2395     }
2396
2397     return res;
2398 }
2399
2400 size_t wxMBConv_iconv::FromWChar(char *dst, size_t dstLen,
2401                                  const wchar_t *src, size_t srcLen) const
2402 {
2403 #if wxUSE_THREADS
2404     // NB: explained in MB2WC
2405     wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
2406 #endif
2407
2408     if ( srcLen == wxNO_LEN )
2409         srcLen = wxWcslen(src) + 1;
2410
2411     size_t inbuflen = srcLen * SIZEOF_WCHAR_T;
2412     size_t outbuflen = dstLen;
2413     size_t res, cres;
2414
2415     wchar_t *tmpbuf = 0;
2416
2417     if (ms_wcNeedsSwap)
2418     {
2419         // need to copy to temp buffer to switch endianness
2420         // (doing WC_BSWAP twice on the original buffer won't work, as it
2421         //  could be in read-only memory, or be accessed in some other thread)
2422         tmpbuf = (wchar_t *)malloc(inbuflen);
2423         for ( size_t i = 0; i < srcLen; i++ )
2424             tmpbuf[i] = WC_BSWAP(src[i]);
2425
2426         src = tmpbuf;
2427     }
2428
2429     char* inbuf = (char*)src;
2430     if ( dst )
2431     {
2432         // have destination buffer, convert there
2433         cres = iconv(w2m, ICONV_CHAR_CAST(&inbuf), &inbuflen, &dst, &outbuflen);
2434
2435         res = dstLen - outbuflen;
2436     }
2437     else // no destination buffer
2438     {
2439         // convert using temp buffer to calculate the size of the buffer needed
2440         char tbuf[256];
2441         res = 0;
2442         do
2443         {
2444             dst = tbuf;
2445             outbuflen = WXSIZEOF(tbuf);
2446
2447             cres = iconv(w2m, ICONV_CHAR_CAST(&inbuf), &inbuflen, &dst, &outbuflen);
2448
2449             res += WXSIZEOF(tbuf) - outbuflen;
2450         }
2451         while ((cres == (size_t)-1) && (errno == E2BIG));
2452     }
2453
2454     if (ms_wcNeedsSwap)
2455     {
2456         free(tmpbuf);
2457     }
2458
2459     if (ICONV_FAILED(cres, inbuflen))
2460     {
2461         wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
2462         return wxCONV_FAILED;
2463     }
2464
2465     return res;
2466 }
2467
2468 size_t wxMBConv_iconv::GetMBNulLen() const
2469 {
2470     if ( m_minMBCharWidth == 0 )
2471     {
2472         wxMBConv_iconv * const self = wxConstCast(this, wxMBConv_iconv);
2473
2474 #if wxUSE_THREADS
2475         // NB: explained in MB2WC
2476         wxMutexLocker lock(self->m_iconvMutex);
2477 #endif
2478
2479         const wchar_t *wnul = L"";
2480         char buf[8]; // should be enough for NUL in any encoding
2481         size_t inLen = sizeof(wchar_t),
2482                outLen = WXSIZEOF(buf);
2483         char *inBuff = (char *)wnul;
2484         char *outBuff = buf;
2485         if ( iconv(w2m, ICONV_CHAR_CAST(&inBuff), &inLen, &outBuff, &outLen) == (size_t)-1 )
2486         {
2487             self->m_minMBCharWidth = (size_t)-1;
2488         }
2489         else // ok
2490         {
2491             self->m_minMBCharWidth = outBuff - buf;
2492         }
2493     }
2494
2495     return m_minMBCharWidth;
2496 }
2497
2498 #if wxUSE_UNICODE_UTF8
2499 bool wxMBConv_iconv::IsUTF8() const
2500 {
2501     return wxStricmp(m_name, "UTF-8") == 0 ||
2502            wxStricmp(m_name, "UTF8") == 0;
2503 }
2504 #endif
2505
2506 #endif // HAVE_ICONV
2507
2508
2509 // ============================================================================
2510 // Win32 conversion classes
2511 // ============================================================================
2512
2513 #ifdef wxHAVE_WIN32_MB2WC
2514
2515 // from utils.cpp
2516 #if wxUSE_FONTMAP
2517 extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const char *charset);
2518 extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
2519 #endif
2520
2521 class wxMBConv_win32 : public wxMBConv
2522 {
2523 public:
2524     wxMBConv_win32()
2525     {
2526         m_CodePage = CP_ACP;
2527         m_minMBCharWidth = 0;
2528     }
2529
2530     wxMBConv_win32(const wxMBConv_win32& conv)
2531         : wxMBConv()
2532     {
2533         m_CodePage = conv.m_CodePage;
2534         m_minMBCharWidth = conv.m_minMBCharWidth;
2535     }
2536
2537 #if wxUSE_FONTMAP
2538     wxMBConv_win32(const char* name)
2539     {
2540         m_CodePage = wxCharsetToCodepage(name);
2541         m_minMBCharWidth = 0;
2542     }
2543
2544     wxMBConv_win32(wxFontEncoding encoding)
2545     {
2546         m_CodePage = wxEncodingToCodepage(encoding);
2547         m_minMBCharWidth = 0;
2548     }
2549 #endif // wxUSE_FONTMAP
2550
2551     virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2552     {
2553         // note that we have to use MB_ERR_INVALID_CHARS flag as it without it
2554         // the behaviour is not compatible with the Unix version (using iconv)
2555         // and break the library itself, e.g. wxTextInputStream::NextChar()
2556         // wouldn't work if reading an incomplete MB char didn't result in an
2557         // error
2558         //
2559         // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
2560         // Win XP or newer and it is not supported for UTF-[78] so we always
2561         // use our own conversions in this case. See
2562         //     http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
2563         //     http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
2564         if ( m_CodePage == CP_UTF8 )
2565         {
2566             return wxMBConvUTF8().MB2WC(buf, psz, n);
2567         }
2568
2569         if ( m_CodePage == CP_UTF7 )
2570         {
2571             return wxMBConvUTF7().MB2WC(buf, psz, n);
2572         }
2573
2574         int flags = 0;
2575         if ( (m_CodePage < 50000 && m_CodePage != CP_SYMBOL) &&
2576                 IsAtLeastWin2kSP4() )
2577         {
2578             flags = MB_ERR_INVALID_CHARS;
2579         }
2580
2581         const size_t len = ::MultiByteToWideChar
2582                              (
2583                                 m_CodePage,     // code page
2584                                 flags,          // flags: fall on error
2585                                 psz,            // input string
2586                                 -1,             // its length (NUL-terminated)
2587                                 buf,            // output string
2588                                 buf ? n : 0     // size of output buffer
2589                              );
2590         if ( !len )
2591         {
2592             // function totally failed
2593             return wxCONV_FAILED;
2594         }
2595
2596         // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
2597         // check if we succeeded, by doing a double trip:
2598         if ( !flags && buf )
2599         {
2600             const size_t mbLen = strlen(psz);
2601             wxCharBuffer mbBuf(mbLen);
2602             if ( ::WideCharToMultiByte
2603                    (
2604                       m_CodePage,
2605                       0,
2606                       buf,
2607                       -1,
2608                       mbBuf.data(),
2609                       mbLen + 1,        // size in bytes, not length
2610                       NULL,
2611                       NULL
2612                    ) == 0 ||
2613                   strcmp(mbBuf, psz) != 0 )
2614             {
2615                 // we didn't obtain the same thing we started from, hence
2616                 // the conversion was lossy and we consider that it failed
2617                 return wxCONV_FAILED;
2618             }
2619         }
2620
2621         // note that it returns count of written chars for buf != NULL and size
2622         // of the needed buffer for buf == NULL so in either case the length of
2623         // the string (which never includes the terminating NUL) is one less
2624         return len - 1;
2625     }
2626
2627     virtual size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
2628     {
2629         /*
2630             we have a problem here: by default, WideCharToMultiByte() may
2631             replace characters unrepresentable in the target code page with bad
2632             quality approximations such as turning "1/2" symbol (U+00BD) into
2633             "1" for the code pages which don't have it and we, obviously, want
2634             to avoid this at any price
2635
2636             the trouble is that this function does it _silently_, i.e. it won't
2637             even tell us whether it did or not... Win98/2000 and higher provide
2638             WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
2639             we have to resort to a round trip, i.e. check that converting back
2640             results in the same string -- this is, of course, expensive but
2641             otherwise we simply can't be sure to not garble the data.
2642          */
2643
2644         // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
2645         // it doesn't work with CJK encodings (which we test for rather roughly
2646         // here...) nor with UTF-7/8 nor, of course, with Windows versions not
2647         // supporting it
2648         BOOL usedDef wxDUMMY_INITIALIZE(false);
2649         BOOL *pUsedDef;
2650         int flags;
2651         if ( CanUseNoBestFit() && m_CodePage < 50000 )
2652         {
2653             // it's our lucky day
2654             flags = WC_NO_BEST_FIT_CHARS;
2655             pUsedDef = &usedDef;
2656         }
2657         else // old system or unsupported encoding
2658         {
2659             flags = 0;
2660             pUsedDef = NULL;
2661         }
2662
2663         const size_t len = ::WideCharToMultiByte
2664                              (
2665                                 m_CodePage,     // code page
2666                                 flags,          // either none or no best fit
2667                                 pwz,            // input string
2668                                 -1,             // it is (wide) NUL-terminated
2669                                 buf,            // output buffer
2670                                 buf ? n : 0,    // and its size
2671                                 NULL,           // default "replacement" char
2672                                 pUsedDef        // [out] was it used?
2673                              );
2674
2675         if ( !len )
2676         {
2677             // function totally failed
2678             return wxCONV_FAILED;
2679         }
2680
2681         // we did something, check if we really succeeded
2682         if ( flags )
2683         {
2684             // check if the conversion failed, i.e. if any replacements
2685             // were done
2686             if ( usedDef )
2687                 return wxCONV_FAILED;
2688         }
2689         else // we must resort to double tripping...
2690         {
2691             // first we need to ensure that we really have the MB data: this is
2692             // not the case if we're called with NULL buffer, in which case we
2693             // need to do the conversion yet again
2694             wxCharBuffer bufDef;
2695             if ( !buf )
2696             {
2697                 bufDef = wxCharBuffer(len);
2698                 buf = bufDef.data();
2699                 if ( !::WideCharToMultiByte(m_CodePage, flags, pwz, -1,
2700                                             buf, len, NULL, NULL) )
2701                     return wxCONV_FAILED;
2702             }
2703
2704             if ( !n )
2705                 n = wcslen(pwz);
2706             wxWCharBuffer wcBuf(n);
2707             if ( MB2WC(wcBuf.data(), buf, n + 1) == wxCONV_FAILED ||
2708                     wcscmp(wcBuf, pwz) != 0 )
2709             {
2710                 // we didn't obtain the same thing we started from, hence
2711                 // the conversion was lossy and we consider that it failed
2712                 return wxCONV_FAILED;
2713             }
2714         }
2715
2716         // see the comment above for the reason of "len - 1"
2717         return len - 1;
2718     }
2719
2720     virtual size_t GetMBNulLen() const
2721     {
2722         if ( m_minMBCharWidth == 0 )
2723         {
2724             int len = ::WideCharToMultiByte
2725                         (
2726                             m_CodePage,     // code page
2727                             0,              // no flags
2728                             L"",            // input string
2729                             1,              // translate just the NUL
2730                             NULL,           // output buffer
2731                             0,              // and its size
2732                             NULL,           // no replacement char
2733                             NULL            // [out] don't care if it was used
2734                         );
2735
2736             wxMBConv_win32 * const self = wxConstCast(this, wxMBConv_win32);
2737             switch ( len )
2738             {
2739                 default:
2740                     wxLogDebug(wxT("Unexpected NUL length %d"), len);
2741                     self->m_minMBCharWidth = (size_t)-1;
2742                     break;
2743
2744                 case 0:
2745                     self->m_minMBCharWidth = (size_t)-1;
2746                     break;
2747
2748                 case 1:
2749                 case 2:
2750                 case 4:
2751                     self->m_minMBCharWidth = len;
2752                     break;
2753             }
2754         }
2755
2756         return m_minMBCharWidth;
2757     }
2758
2759     virtual wxMBConv *Clone() const { return new wxMBConv_win32(*this); }
2760
2761     bool IsOk() const { return m_CodePage != -1; }
2762
2763 private:
2764     static bool CanUseNoBestFit()
2765     {
2766         static int s_isWin98Or2k = -1;
2767
2768         if ( s_isWin98Or2k == -1 )
2769         {
2770             int verMaj, verMin;
2771             switch ( wxGetOsVersion(&verMaj, &verMin) )
2772             {
2773                 case wxOS_WINDOWS_9X:
2774                     s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
2775                     break;
2776
2777                 case wxOS_WINDOWS_NT:
2778                     s_isWin98Or2k = verMaj >= 5;
2779                     break;
2780
2781                 default:
2782                     // unknown: be conservative by default
2783                     s_isWin98Or2k = 0;
2784                     break;
2785             }
2786
2787             wxASSERT_MSG( s_isWin98Or2k != -1, wxT("should be set above") );
2788         }
2789
2790         return s_isWin98Or2k == 1;
2791     }
2792
2793     static bool IsAtLeastWin2kSP4()
2794     {
2795 #ifdef __WXWINCE__
2796         return false;
2797 #else
2798         static int s_isAtLeastWin2kSP4 = -1;
2799
2800         if ( s_isAtLeastWin2kSP4 == -1 )
2801         {
2802             OSVERSIONINFOEX ver;
2803
2804             memset(&ver, 0, sizeof(ver));
2805             ver.dwOSVersionInfoSize = sizeof(ver);
2806             GetVersionEx((OSVERSIONINFO*)&ver);
2807
2808             s_isAtLeastWin2kSP4 =
2809               ((ver.dwMajorVersion > 5) || // Vista+
2810                (ver.dwMajorVersion == 5 && ver.dwMinorVersion > 0) || // XP/2003
2811                (ver.dwMajorVersion == 5 && ver.dwMinorVersion == 0 &&
2812                ver.wServicePackMajor >= 4)) // 2000 SP4+
2813               ? 1 : 0;
2814         }
2815
2816         return s_isAtLeastWin2kSP4 == 1;
2817 #endif
2818     }
2819
2820
2821     // the code page we're working with
2822     long m_CodePage;
2823
2824     // cached result of GetMBNulLen(), set to 0 initially meaning
2825     // "unknown"
2826     size_t m_minMBCharWidth;
2827 };
2828
2829 #endif // wxHAVE_WIN32_MB2WC
2830
2831
2832 // ============================================================================
2833 // wxEncodingConverter based conversion classes
2834 // ============================================================================
2835
2836 #if wxUSE_FONTMAP
2837
2838 class wxMBConv_wxwin : public wxMBConv
2839 {
2840 private:
2841     void Init()
2842     {
2843         // Refuse to use broken wxEncodingConverter code for Mac-specific encodings.
2844         // The wxMBConv_cf class does a better job.
2845         m_ok = (m_enc < wxFONTENCODING_MACMIN || m_enc > wxFONTENCODING_MACMAX) &&
2846                m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
2847                w2m.Init(wxFONTENCODING_UNICODE, m_enc);
2848     }
2849
2850 public:
2851     // temporarily just use wxEncodingConverter stuff,
2852     // so that it works while a better implementation is built
2853     wxMBConv_wxwin(const char* name)
2854     {
2855         if (name)
2856             m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
2857         else
2858             m_enc = wxFONTENCODING_SYSTEM;
2859
2860         Init();
2861     }
2862
2863     wxMBConv_wxwin(wxFontEncoding enc)
2864     {
2865         m_enc = enc;
2866
2867         Init();
2868     }
2869
2870     size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
2871     {
2872         size_t inbuf = strlen(psz);
2873         if (buf)
2874         {
2875             if (!m2w.Convert(psz, buf))
2876                 return wxCONV_FAILED;
2877         }
2878         return inbuf;
2879     }
2880
2881     size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
2882     {
2883         const size_t inbuf = wxWcslen(psz);
2884         if (buf)
2885         {
2886             if (!w2m.Convert(psz, buf))
2887                 return wxCONV_FAILED;
2888         }
2889
2890         return inbuf;
2891     }
2892
2893     virtual size_t GetMBNulLen() const
2894     {
2895         switch ( m_enc )
2896         {
2897             case wxFONTENCODING_UTF16BE:
2898             case wxFONTENCODING_UTF16LE:
2899                 return 2;
2900
2901             case wxFONTENCODING_UTF32BE:
2902             case wxFONTENCODING_UTF32LE:
2903                 return 4;
2904
2905             default:
2906                 return 1;
2907         }
2908     }
2909
2910     virtual wxMBConv *Clone() const { return new wxMBConv_wxwin(m_enc); }
2911
2912     bool IsOk() const { return m_ok; }
2913
2914 public:
2915     wxFontEncoding m_enc;
2916     wxEncodingConverter m2w, w2m;
2917
2918 private:
2919     // were we initialized successfully?
2920     bool m_ok;
2921
2922     wxDECLARE_NO_COPY_CLASS(wxMBConv_wxwin);
2923 };
2924
2925 // make the constructors available for unit testing
2926 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_wxwin( const char* name )
2927 {
2928     wxMBConv_wxwin* result = new wxMBConv_wxwin( name );
2929     if ( !result->IsOk() )
2930     {
2931         delete result;
2932         return 0;
2933     }
2934
2935     return result;
2936 }
2937
2938 #endif // wxUSE_FONTMAP
2939
2940 // ============================================================================
2941 // wxCSConv implementation
2942 // ============================================================================
2943
2944 void wxCSConv::Init()
2945 {
2946     m_name = NULL;
2947     m_convReal =  NULL;
2948 }
2949
2950 void wxCSConv::SetEncoding(wxFontEncoding encoding)
2951 {
2952     switch ( encoding )
2953     {
2954         case wxFONTENCODING_MAX:
2955         case wxFONTENCODING_SYSTEM:
2956             if ( m_name )
2957             {
2958                 // It's ok to not have encoding value if we have a name for it.
2959                 m_encoding = wxFONTENCODING_SYSTEM;
2960             }
2961             else // No name neither.
2962             {
2963                 // Fall back to the system default encoding in this case (not
2964                 // sure how much sense does this make but this is how the old
2965                 // code used to behave).
2966 #if wxUSE_INTL
2967                 m_encoding = wxLocale::GetSystemEncoding();
2968                 if ( m_encoding == wxFONTENCODING_SYSTEM )
2969 #endif // wxUSE_INTL
2970                     m_encoding = wxFONTENCODING_ISO8859_1;
2971             }
2972             break;
2973
2974         case wxFONTENCODING_DEFAULT:
2975             // wxFONTENCODING_DEFAULT is same as US-ASCII in this context
2976             m_encoding = wxFONTENCODING_ISO8859_1;
2977             break;
2978
2979         default:
2980             // Just use the provided encoding.
2981             m_encoding = encoding;
2982     }
2983 }
2984
2985 wxCSConv::wxCSConv(const wxString& charset)
2986 {
2987     Init();
2988
2989     if ( !charset.empty() )
2990     {
2991         SetName(charset.ToAscii());
2992     }
2993
2994 #if wxUSE_FONTMAP
2995     SetEncoding(wxFontMapperBase::GetEncodingFromName(charset));
2996 #else
2997     SetEncoding(wxFONTENCODING_SYSTEM);
2998 #endif
2999
3000     m_convReal = DoCreate();
3001 }
3002
3003 wxCSConv::wxCSConv(wxFontEncoding encoding)
3004 {
3005     if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
3006     {
3007         wxFAIL_MSG( wxT("invalid encoding value in wxCSConv ctor") );
3008
3009         encoding = wxFONTENCODING_SYSTEM;
3010     }
3011
3012     Init();
3013
3014     SetEncoding(encoding);
3015
3016     m_convReal = DoCreate();
3017 }
3018
3019 wxCSConv::~wxCSConv()
3020 {
3021     Clear();
3022 }
3023
3024 wxCSConv::wxCSConv(const wxCSConv& conv)
3025         : wxMBConv()
3026 {
3027     Init();
3028
3029     SetName(conv.m_name);
3030     SetEncoding(conv.m_encoding);
3031
3032     m_convReal = DoCreate();
3033 }
3034
3035 wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
3036 {
3037     Clear();
3038
3039     SetName(conv.m_name);
3040     SetEncoding(conv.m_encoding);
3041
3042     m_convReal = DoCreate();
3043
3044     return *this;
3045 }
3046
3047 void wxCSConv::Clear()
3048 {
3049     free(m_name);
3050     m_name = NULL;
3051
3052     wxDELETE(m_convReal);
3053 }
3054
3055 void wxCSConv::SetName(const char *charset)
3056 {
3057     if ( charset )
3058         m_name = wxStrdup(charset);
3059 }
3060
#if wxUSE_FONTMAP

// Hash map type and the cache itself used by wxCSConv::DoCreate() when iconv
// is available: the cache maps an encoding to the charset name for which a
// wxMBConv_iconv could be created or to an empty string if this failed for all
// names of this encoding.
WX_DECLARE_HASH_MAP( wxFontEncoding, wxString, wxIntegerHash, wxIntegerEqual,
                     wxEncodingNameCache );

static wxEncodingNameCache gs_nameCache;
#endif
3068
3069 wxMBConv *wxCSConv::DoCreate() const
3070 {
3071 #if wxUSE_FONTMAP
3072     wxLogTrace(TRACE_STRCONV,
3073                wxT("creating conversion for %s"),
3074                (m_name ? m_name
3075                        : (const char*)wxFontMapperBase::GetEncodingName(m_encoding).mb_str()));
3076 #endif // wxUSE_FONTMAP
3077
3078     // check for the special case of ASCII or ISO8859-1 charset: as we have
3079     // special knowledge of it anyhow, we don't need to create a special
3080     // conversion object
3081     if ( m_encoding == wxFONTENCODING_ISO8859_1 )
3082     {
3083         // don't convert at all
3084         return NULL;
3085     }
3086
3087     // we trust OS to do conversion better than we can so try external
3088     // conversion methods first
3089     //
3090     // the full order is:
3091     //      1. OS conversion (iconv() under Unix or Win32 API)
3092     //      2. hard coded conversions for UTF
3093     //      3. wxEncodingConverter as fall back
3094
3095     // step (1)
3096 #ifdef HAVE_ICONV
3097 #if !wxUSE_FONTMAP
3098     if ( m_name )
3099 #endif // !wxUSE_FONTMAP
3100     {
3101 #if wxUSE_FONTMAP
3102         wxFontEncoding encoding(m_encoding);
3103 #endif
3104
3105         if ( m_name )
3106         {
3107             wxMBConv_iconv *conv = new wxMBConv_iconv(m_name);
3108             if ( conv->IsOk() )
3109                 return conv;
3110
3111             delete conv;
3112
3113 #if wxUSE_FONTMAP
3114             encoding =
3115                 wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
3116 #endif // wxUSE_FONTMAP
3117         }
3118 #if wxUSE_FONTMAP
3119         {
3120             const wxEncodingNameCache::iterator it = gs_nameCache.find(encoding);
3121             if ( it != gs_nameCache.end() )
3122             {
3123                 if ( it->second.empty() )
3124                     return NULL;
3125
3126                 wxMBConv_iconv *conv = new wxMBConv_iconv(it->second.ToAscii());
3127                 if ( conv->IsOk() )
3128                     return conv;
3129
3130                 delete conv;
3131             }
3132
3133             const wxChar* const* names = wxFontMapperBase::GetAllEncodingNames(encoding);
3134             // CS : in case this does not return valid names (eg for MacRoman)
3135             // encoding got a 'failure' entry in the cache all the same,
3136             // although it just has to be created using a different method, so
3137             // only store failed iconv creation attempts (or perhaps we
3138             // shoulnd't do this at all ?)
3139             if ( names[0] != NULL )
3140             {
3141                 for ( ; *names; ++names )
3142                 {
3143                     // FIXME-UTF8: wxFontMapperBase::GetAllEncodingNames()
3144                     //             will need changes that will obsolete this
3145                     wxString name(*names);
3146                     wxMBConv_iconv *conv = new wxMBConv_iconv(name.ToAscii());
3147                     if ( conv->IsOk() )
3148                     {
3149                         gs_nameCache[encoding] = *names;
3150                         return conv;
3151                     }
3152
3153                     delete conv;
3154                 }
3155
3156                 gs_nameCache[encoding] = wxT(""); // cache the failure
3157             }
3158         }
3159 #endif // wxUSE_FONTMAP
3160     }
3161 #endif // HAVE_ICONV
3162
3163 #ifdef wxHAVE_WIN32_MB2WC
3164     {
3165 #if wxUSE_FONTMAP
3166         wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
3167                                       : new wxMBConv_win32(m_encoding);
3168         if ( conv->IsOk() )
3169             return conv;
3170
3171         delete conv;
3172 #else
3173         return NULL;
3174 #endif
3175     }
3176 #endif // wxHAVE_WIN32_MB2WC
3177
3178 #ifdef __DARWIN__
3179     {
3180         // leave UTF16 and UTF32 to the built-ins of wx
3181         if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
3182             ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
3183         {
3184 #if wxUSE_FONTMAP
3185             wxMBConv_cf *conv = m_name ? new wxMBConv_cf(m_name)
3186                                           : new wxMBConv_cf(m_encoding);
3187 #else
3188             wxMBConv_cf *conv = new wxMBConv_cf(m_encoding);
3189 #endif
3190
3191             if ( conv->IsOk() )
3192                  return conv;
3193
3194             delete conv;
3195         }
3196     }
3197 #endif // __DARWIN__
3198
3199     // step (2)
3200     wxFontEncoding enc = m_encoding;
3201 #if wxUSE_FONTMAP
3202     if ( enc == wxFONTENCODING_SYSTEM && m_name )
3203     {
3204         // use "false" to suppress interactive dialogs -- we can be called from
3205         // anywhere and popping up a dialog from here is the last thing we want to
3206         // do
3207         enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
3208     }
3209 #endif // wxUSE_FONTMAP
3210
3211     switch ( enc )
3212     {
3213         case wxFONTENCODING_UTF7:
3214              return new wxMBConvUTF7;
3215
3216         case wxFONTENCODING_UTF8:
3217              return new wxMBConvUTF8;
3218
3219         case wxFONTENCODING_UTF16BE:
3220              return new wxMBConvUTF16BE;
3221
3222         case wxFONTENCODING_UTF16LE:
3223              return new wxMBConvUTF16LE;
3224
3225         case wxFONTENCODING_UTF32BE:
3226              return new wxMBConvUTF32BE;
3227
3228         case wxFONTENCODING_UTF32LE:
3229              return new wxMBConvUTF32LE;
3230
3231         default:
3232              // nothing to do but put here to suppress gcc warnings
3233              break;
3234     }
3235
3236     // step (3)
3237 #if wxUSE_FONTMAP
3238     {
3239         wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
3240                                       : new wxMBConv_wxwin(m_encoding);
3241         if ( conv->IsOk() )
3242             return conv;
3243
3244         delete conv;
3245     }
3246
3247     wxLogTrace(TRACE_STRCONV,
3248                wxT("encoding \"%s\" is not supported by this system"),
3249                (m_name ? wxString(m_name)
3250                        : wxFontMapperBase::GetEncodingName(m_encoding)));
3251 #endif // wxUSE_FONTMAP
3252
3253     return NULL;
3254 }
3255
3256 bool wxCSConv::IsOk() const
3257 {
3258     // special case: no convReal created for wxFONTENCODING_ISO8859_1
3259     if ( m_encoding == wxFONTENCODING_ISO8859_1 )
3260         return true; // always ok as we do it ourselves
3261
3262     // m_convReal->IsOk() is called at its own creation, so we know it must
3263     // be ok if m_convReal is non-NULL
3264     return m_convReal != NULL;
3265 }
3266
3267 size_t wxCSConv::ToWChar(wchar_t *dst, size_t dstLen,
3268                          const char *src, size_t srcLen) const
3269 {
3270     if (m_convReal)
3271         return m_convReal->ToWChar(dst, dstLen, src, srcLen);
3272
3273     // latin-1 (direct)
3274     if ( srcLen == wxNO_LEN )
3275         srcLen = strlen(src) + 1; // take trailing NUL too
3276
3277     if ( dst )
3278     {
3279         if ( dstLen < srcLen )
3280             return wxCONV_FAILED;
3281
3282         for ( size_t n = 0; n < srcLen; n++ )
3283             dst[n] = (unsigned char)(src[n]);
3284     }
3285
3286     return srcLen;
3287 }
3288
3289 size_t wxCSConv::FromWChar(char *dst, size_t dstLen,
3290                            const wchar_t *src, size_t srcLen) const
3291 {
3292     if (m_convReal)
3293         return m_convReal->FromWChar(dst, dstLen, src, srcLen);
3294
3295     // latin-1 (direct)
3296     if ( srcLen == wxNO_LEN )
3297         srcLen = wxWcslen(src) + 1;
3298
3299     if ( dst )
3300     {
3301         if ( dstLen < srcLen )
3302             return wxCONV_FAILED;
3303
3304         for ( size_t n = 0; n < srcLen; n++ )
3305         {
3306             if ( src[n] > 0xFF )
3307                 return wxCONV_FAILED;
3308
3309             dst[n] = (char)src[n];
3310         }
3311
3312     }
3313     else // still need to check the input validity
3314     {
3315         for ( size_t n = 0; n < srcLen; n++ )
3316         {
3317             if ( src[n] > 0xFF )
3318                 return wxCONV_FAILED;
3319         }
3320     }
3321
3322     return srcLen;
3323 }
3324
3325 size_t wxCSConv::GetMBNulLen() const
3326 {
3327     if ( m_convReal )
3328         return m_convReal->GetMBNulLen();
3329
3330     // otherwise, we are ISO-8859-1
3331     return 1;
3332 }
3333
3334 #if wxUSE_UNICODE_UTF8
3335 bool wxCSConv::IsUTF8() const
3336 {
3337     if ( m_convReal )
3338         return m_convReal->IsUTF8();
3339
3340     // otherwise, we are ISO-8859-1
3341     return false;
3342 }
3343 #endif
3344
3345
3346 #if wxUSE_UNICODE
3347
3348 wxWCharBuffer wxSafeConvertMB2WX(const char *s)
3349 {
3350     if ( !s )
3351         return wxWCharBuffer();
3352
3353     wxWCharBuffer wbuf(wxConvLibc.cMB2WX(s));
3354     if ( !wbuf )
3355         wbuf = wxMBConvUTF8().cMB2WX(s);
3356     if ( !wbuf )
3357         wbuf = wxConvISO8859_1.cMB2WX(s);
3358
3359     return wbuf;
3360 }
3361
3362 wxCharBuffer wxSafeConvertWX2MB(const wchar_t *ws)
3363 {
3364     if ( !ws )
3365         return wxCharBuffer();
3366
3367     wxCharBuffer buf(wxConvLibc.cWX2MB(ws));
3368     if ( !buf )
3369         buf = wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL).cWX2MB(ws);
3370
3371     return buf;
3372 }
3373
3374 #endif // wxUSE_UNICODE
3375
3376 // ----------------------------------------------------------------------------
3377 // globals
3378 // ----------------------------------------------------------------------------
3379
// NB: The reason why we create converter objects in this convoluted way,
//     using a factory function instead of a global variable, is that they
//     may be used at static initialization time (some of them are used by
//     wxString ctors and there may be a global wxString object). In other
//     words, possibly _before_ the converter global object would be
//     initialized.
3386
3387 #undef wxConvLibc
3388 #undef wxConvUTF8
3389 #undef wxConvUTF7
3390 #undef wxConvLocal
3391 #undef wxConvISO8859_1
3392
// Define the global converter with the given name: this expands into
//  - the definition of name##Ptr variable of type klass*, initially NULL
//  - the definition of wxGet_##name##Ptr() function returning the address of
//    a function-local static object of type impl_klass constructed using
//    ctor_args
//  - the definition of a static variable initialized by calling this function
//
// The macro invocation must be followed by a semicolon.
#define WX_DEFINE_GLOBAL_CONV2(klass, impl_klass, name, ctor_args)      \
    WXDLLIMPEXP_DATA_BASE(klass*) name##Ptr = NULL;                     \
    WXDLLIMPEXP_BASE klass* wxGet_##name##Ptr()                         \
    {                                                                   \
        static impl_klass name##Obj ctor_args;                          \
        return &name##Obj;                                              \
    }                                                                   \
    /* this ensures that all global converter objects are created */    \
    /* by the time static initialization is done, i.e. before any */    \
    /* thread is launched: */                                           \
    static klass* gs_##name##instance = wxGet_##name##Ptr()

// Simplified version of the macro above for the common case when the class of
// the object is the same as the class of the pointer returned for it.
#define WX_DEFINE_GLOBAL_CONV(klass, name, ctor_args) \
    WX_DEFINE_GLOBAL_CONV2(klass, klass, name, ctor_args)
3407
3408 #ifdef __INTELC__
3409     // disable warning "variable 'xxx' was declared but never referenced"
3410     #pragma warning(disable: 177)
3411 #endif // Intel C++
3412
// wxConvLibc is implemented by wxMBConv_win32 under Windows and by
// wxMBConvLibc elsewhere (the branch using wxMBConv_cf is currently disabled
// by "#elif 0").
#ifdef __WINDOWS__
    WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConv_win32, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
#elif 0 // defined(__WXOSX__)
    WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConv_cf, wxConvLibc,  (wxFONTENCODING_UTF8));
#else
    WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConvLibc, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
#endif
3420
// NB: we can't use wxEMPTY_PARAMETER_VALUE as the final argument here because
//     it's passed to WX_DEFINE_GLOBAL_CONV2 after a macro expansion and so
//     still provokes an error message about "not enough macro parameters";
//     and we can't use "()" here as the name##Obj declaration would be parsed
//     as a function declaration then, so use a semicolon and live with an
//     extra empty statement (and hope that no compiler warns about this)
WX_DEFINE_GLOBAL_CONV(wxMBConvStrictUTF8, wxConvUTF8, ;);
WX_DEFINE_GLOBAL_CONV(wxMBConvUTF7, wxConvUTF7, ;);

// the wxCSConv-based converters for the system encoding and for ISO8859-1
WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvLocal, (wxFONTENCODING_SYSTEM));
WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvISO8859_1, (wxFONTENCODING_ISO8859_1));

// wxConvCurrent initially points to the wxConvLibc object and wxConvUI to the
// wxConvLocal one
WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = wxGet_wxConvLibcPtr();
WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvUI = wxGet_wxConvLocalPtr();
3435
#ifdef __DARWIN__
// It is important to use this conversion object under Darwin as it ensures
// that Unicode strings are (re)composed correctly even though xnu kernel uses
// decomposed form internally (at least for the file names).
static wxMBConv_cf wxConvMacUTF8DObj(wxFONTENCODING_UTF8);
#endif

// wxConvFileName initially points to the object defined above under Darwin
// and to the wxConvLibc object everywhere else
WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName =
#ifdef __DARWIN__
                                    &wxConvMacUTF8DObj;
#else // !__DARWIN__
                                    wxGet_wxConvLibcPtr();
#endif // __DARWIN__/!__DARWIN__