src/common/strconv.cpp

   1 /////////////////////////////////////////////////////////////////////////////
   2 // Name:        src/common/strconv.cpp
   3 // Purpose:     Unicode conversion classes
   4 // Author:      Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
   5 //              Ryan Norton, Fredrik Roubert (UTF7)
   6 // Modified by:
   7 // Created:     29/01/98
   8 // RCS-ID:      $Id$
   9 // Copyright:   (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
  10 //              (c) 2000-2003 Vadim Zeitlin
  11 //              (c) 2004 Ryan Norton, Fredrik Roubert
  12 // Licence:     wxWindows licence
  13 /////////////////////////////////////////////////////////////////////////////
  14
  15 // For compilers that support precompilation, includes "wx.h".
  16 #include "wx/wxprec.h"
  17
  18 #ifdef __BORLANDC__
  19     #pragma hdrstop
  20 #endif  //__BORLANDC__
  21
  22 #ifndef WX_PRECOMP
  23     #include "wx/intl.h"
  24     #include "wx/log.h"
  25     #include "wx/utils.h"
  26     #include "wx/hashmap.h"
  27 #endif
  28
  29 #include "wx/strconv.h"
  30
  31 #if wxUSE_WCHAR_T
  32
  33 #ifndef __WXWINCE__
  34 #include <errno.h>
  35 #endif
  36
  37 #include <ctype.h>
  38 #include <string.h>
  39 #include <stdlib.h>
  40
  41 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
  42     #include "wx/msw/private.h"
  43     #include "wx/msw/missing.h"
  44     #define wxHAVE_WIN32_MB2WC
  45 #endif
  46
  47 #ifdef __SALFORDC__
  48     #include <clib.h>
  49 #endif
  50
  51 #ifdef HAVE_ICONV
  52     #include <iconv.h>
  53     #include "wx/thread.h"
  54 #endif
  55
  56 #include "wx/encconv.h"
  57 #include "wx/fontmap.h"
  58
  59 #ifdef __WXMAC__
  60 #ifndef __DARWIN__
  61 #include <ATSUnicode.h>
  62 #include <TextCommon.h>
  63 #include <TextEncodingConverter.h>
  64 #endif
  65
  66 // includes Mac headers
  67 #include "wx/mac/private.h"
  68 #endif
  69
  70
  71 #define TRACE_STRCONV _T("strconv")
  72
  73 // WC_UTF16 is defined only if sizeof(wchar_t) == 2, otherwise it's supposed to
  74 // be 4 bytes
  75 #if SIZEOF_WCHAR_T == 2
  76     #define WC_UTF16
  77 #endif
  78
  79
  80 // ============================================================================
  81 // implementation
  82 // ============================================================================
  83
  84 // helper function of cMB2WC(): check if n bytes at this location are all NUL
  85 static bool NotAllNULs(const char *p, size_t n)
  86 {
  87     while ( n && *p++ == '\0' )
  88         n--;
  89
  90     return n != 0;
  91 }
  92
  93 // ----------------------------------------------------------------------------
  94 // UTF-16 en/decoding to/from UCS-4 with surrogates handling
  95 // ----------------------------------------------------------------------------
  96
  97 static size_t encode_utf16(wxUint32 input, wxUint16 *output)
  98 {
  99     if (input <= 0xffff)
 100     {
 101         if (output)
 102             *output = (wxUint16) input;
 103
 104         return 1;
 105     }
 106     else if (input >= 0x110000)
 107     {
 108         return wxCONV_FAILED;
 109     }
 110     else
 111     {
 112         if (output)
 113         {
 114             *output++ = (wxUint16) ((input >> 10) + 0xd7c0);
 115             *output = (wxUint16) ((input & 0x3ff) + 0xdc00);
 116         }
 117
 118         return 2;
 119     }
 120 }
 121
 122 static size_t decode_utf16(const wxUint16* input, wxUint32& output)
 123 {
 124     if ((*input < 0xd800) || (*input > 0xdfff))
 125     {
 126         output = *input;
 127         return 1;
 128     }
 129     else if ((input[1] < 0xdc00) || (input[1] > 0xdfff))
 130     {
 131         output = *input;
 132         return wxCONV_FAILED;
 133     }
 134     else
 135     {
 136         output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
 137         return 2;
 138     }
 139 }
 140
// type of the elements of the buffer passed to wxDecodeSurrogate(): this is
// wchar_t itself if WC_UTF16 is defined and wxUint16 otherwise
#ifdef WC_UTF16
    typedef wchar_t wxDecodeSurrogate_t;
#else // !WC_UTF16
    typedef wxUint16 wxDecodeSurrogate_t;
#endif // WC_UTF16/!WC_UTF16
 146
 147 // returns the next UTF-32 character from the wchar_t buffer and advances the
 148 // pointer to the character after this one
 149 //
 150 // if an invalid character is found, *pSrc is set to NULL, the caller must
 151 // check for this
 152 static wxUint32 wxDecodeSurrogate(const wxDecodeSurrogate_t **pSrc)
 153 {
 154     wxUint32 out;
 155     const size_t
 156         n = decode_utf16(wx_reinterpret_cast(const wxUint16 *, *pSrc), out);
 157     if ( n == wxCONV_FAILED )
 158         *pSrc = NULL;
 159     else
 160         *pSrc += n;
 161
 162     return out;
 163 }
 164
 165 // ----------------------------------------------------------------------------
 166 // wxMBConv
 167 // ----------------------------------------------------------------------------
 168
// Default implementation of the multibyte to wide char conversion in terms of
// MB2WC().
//
// dst is the output buffer or NULL if only the required size is wanted and
// dstLen is its size in wide characters. src is the input and srcLen its
// length in bytes or wxNO_LEN if it is NUL-terminated.
//
// Returns the number of wide characters written (or which would be written
// if dst is NULL), counting the L'\0' following each converted chunk, or
// wxCONV_FAILED if GetMBNulLen() or MB2WC() fails or if dst is too small.
size_t
wxMBConv::ToWChar(wchar_t *dst, size_t dstLen,
                  const char *src, size_t srcLen) const
{
    // although new conversion classes are supposed to implement this function
    // directly, the existing ones only implement the old MB2WC() and so, to
    // avoid having to rewrite all conversion classes at once, we provide a
    // default (but not efficient) implementation of this one in terms of the
    // old function by copying the input to ensure that it's NUL-terminated and
    // then using MB2WC() to convert it

    // the number of chars [which would be] written to dst [if it were not NULL]
    size_t dstWritten = 0;

    // the number of NULs terminating this string
    size_t nulLen = 0;  // not really needed, but just to avoid warnings

    // if we were not given the input size we just have to assume that the
    // string is properly terminated as we have no way of knowing how long it
    // is anyhow, but if we do have the size check whether there are enough
    // NULs at the end
    wxCharBuffer bufTmp;
    const char *srcEnd;
    if ( srcLen != wxNO_LEN )
    {
        // we need to know how to find the end of this string
        nulLen = GetMBNulLen();
        if ( nulLen == wxCONV_FAILED )
            return wxCONV_FAILED;

        // if there are enough NULs we can avoid the copy
        if ( srcLen < nulLen || NotAllNULs(src + srcLen - nulLen, nulLen) )
        {
            // make a copy in order to properly NUL-terminate the string
            bufTmp = wxCharBuffer(srcLen + nulLen - 1 /* 1 will be added */);
            char * const p = bufTmp.data();
            memcpy(p, src, srcLen);
            for ( char *s = p + srcLen; s < p + srcLen + nulLen; s++ )
                *s = '\0';

            src = bufTmp;
        }

        srcEnd = src + srcLen;
    }
    else // quit after the first loop iteration
    {
        srcEnd = NULL;
    }

    // each iteration converts one NUL-terminated chunk of the input
    for ( ;; )
    {
        // try to convert the current chunk: the first call only computes the
        // length of the result
        size_t lenChunk = MB2WC(NULL, src, 0);
        if ( lenChunk == wxCONV_FAILED )
            return wxCONV_FAILED;

        lenChunk++; // for the L'\0' at the end of this chunk

        dstWritten += lenChunk;

        if ( lenChunk == 1 )
        {
            // nothing left in the input string, conversion succeeded
            //
            // notice that the terminating L'\0' of this empty chunk is counted
            // in dstWritten but is not stored in dst
            break;
        }

        if ( dst )
        {
            if ( dstWritten > dstLen )
                return wxCONV_FAILED;

            if ( MB2WC(dst, src, lenChunk) == wxCONV_FAILED )
                return wxCONV_FAILED;

            dst += lenChunk;
        }

        if ( !srcEnd )
        {
            // we convert just one chunk in this case as this is the entire
            // string anyhow
            break;
        }

        // advance the input pointer past the end of this chunk
        while ( NotAllNULs(src, nulLen) )
        {
            // notice that we must skip over multiple bytes here as we suppose
            // that if NUL takes 2 or 4 bytes, then all the other characters do
            // too and so if advanced by a single byte we might erroneously
            // detect sequences of NUL bytes in the middle of the input
            src += nulLen;
        }

        src += nulLen; // skipping over its terminator as well

        // note that ">=" (and not just "==") is needed here as the terminator
        // we skipped just above could be inside or just after the buffer
        // delimited by srcEnd
        if ( src >= srcEnd )
            break;
    }

    return dstWritten;
}
 275
 276 size_t
 277 wxMBConv::FromWChar(char *dst, size_t dstLen,
 278                     const wchar_t *src, size_t srcLen) const
 279 {
 280     // the number of chars [which would be] written to dst [if it were not NULL]
 281     size_t dstWritten = 0;
 282
 283     // make a copy of the input string unless it is already properly
 284     // NUL-terminated
 285     //
 286     // if we don't know its length we have no choice but to assume that it is,
 287     // indeed, properly terminated
 288     wxWCharBuffer bufTmp;
 289     if ( srcLen == wxNO_LEN )
 290     {
 291         srcLen = wxWcslen(src) + 1;
 292     }
 293     else if ( srcLen != 0 && src[srcLen - 1] != L'\0' )
 294     {
 295         // make a copy in order to properly NUL-terminate the string
 296         bufTmp = wxWCharBuffer(srcLen);
 297         memcpy(bufTmp.data(), src, srcLen * sizeof(wchar_t));
 298         src = bufTmp;
 299     }
 300
 301     const size_t lenNul = GetMBNulLen();
 302     for ( const wchar_t * const srcEnd = src + srcLen;
 303           src < srcEnd;
 304           src += wxWcslen(src) + 1 /* skip L'\0' too */ )
 305     {
 306         // try to convert the current chunk
 307         size_t lenChunk = WC2MB(NULL, src, 0);
 308
 309         if ( lenChunk == wxCONV_FAILED )
 310             return wxCONV_FAILED;
 311
 312         lenChunk += lenNul;
 313         dstWritten += lenChunk;
 314
 315         if ( dst )
 316         {
 317             if ( dstWritten > dstLen )
 318                 return wxCONV_FAILED;
 319
 320             if ( WC2MB(dst, src, lenChunk) == wxCONV_FAILED )
 321                 return wxCONV_FAILED;
 322
 323             dst += lenChunk;
 324         }
 325     }
 326
 327     return dstWritten;
 328 }
 329
 330 size_t wxMBConv::MB2WC(wchar_t *outBuff, const char *inBuff, size_t outLen) const
 331 {
 332     size_t rc = ToWChar(outBuff, outLen, inBuff);
 333     if ( rc != wxCONV_FAILED )
 334     {
 335         // ToWChar() returns the buffer length, i.e. including the trailing
 336         // NUL, while this method doesn't take it into account
 337         rc--;
 338     }
 339
 340     return rc;
 341 }
 342
 343 size_t wxMBConv::WC2MB(char *outBuff, const wchar_t *inBuff, size_t outLen) const
 344 {
 345     size_t rc = FromWChar(outBuff, outLen, inBuff);
 346     if ( rc != wxCONV_FAILED )
 347     {
 348         rc -= GetMBNulLen();
 349     }
 350
 351     return rc;
 352 }
 353
 354 wxMBConv::~wxMBConv()
 355 {
 356     // nothing to do here (necessary for Darwin linking probably)
 357 }
 358
 359 const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
 360 {
 361     if ( psz )
 362     {
 363         // calculate the length of the buffer needed first
 364         const size_t nLen = MB2WC(NULL, psz, 0);
 365         if ( nLen != wxCONV_FAILED )
 366         {
 367             // now do the actual conversion
 368             wxWCharBuffer buf(nLen /* +1 added implicitly */);
 369
 370             // +1 for the trailing NULL
 371             if ( MB2WC(buf.data(), psz, nLen + 1) != wxCONV_FAILED )
 372                 return buf;
 373         }
 374     }
 375
 376     return wxWCharBuffer();
 377 }
 378
 379 const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
 380 {
 381     if ( pwz )
 382     {
 383         const size_t nLen = WC2MB(NULL, pwz, 0);
 384         if ( nLen != wxCONV_FAILED )
 385         {
 386             // extra space for trailing NUL(s)
 387             static const size_t extraLen = GetMaxMBNulLen();
 388
 389             wxCharBuffer buf(nLen + extraLen - 1);
 390             if ( WC2MB(buf.data(), pwz, nLen + extraLen) != wxCONV_FAILED )
 391                 return buf;
 392         }
 393     }
 394
 395     return wxCharBuffer();
 396 }
 397
 398 const wxWCharBuffer
 399 wxMBConv::cMB2WC(const char *inBuff, size_t inLen, size_t *outLen) const
 400 {
 401     const size_t dstLen = ToWChar(NULL, 0, inBuff, inLen);
 402     if ( dstLen != wxCONV_FAILED )
 403     {
 404         wxWCharBuffer wbuf(dstLen - 1);
 405         if ( ToWChar(wbuf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
 406         {
 407             if ( outLen )
 408             {
 409                 *outLen = dstLen;
 410                 if ( wbuf[dstLen - 1] == L'\0' )
 411                     (*outLen)--;
 412             }
 413
 414             return wbuf;
 415         }
 416     }
 417
 418     if ( outLen )
 419         *outLen = 0;
 420
 421     return wxWCharBuffer();
 422 }
 423
 424 const wxCharBuffer
 425 wxMBConv::cWC2MB(const wchar_t *inBuff, size_t inLen, size_t *outLen) const
 426 {
 427     size_t dstLen = FromWChar(NULL, 0, inBuff, inLen);
 428     if ( dstLen != wxCONV_FAILED )
 429     {
 430         // special case of empty input: can't allocate 0 size buffer below as
 431         // wxCharBuffer insists on NUL-terminating it
 432         wxCharBuffer buf(dstLen ? dstLen - 1 : 1);
 433         if ( FromWChar(buf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
 434         {
 435             if ( outLen )
 436             {
 437                 *outLen = dstLen;
 438
 439                 const size_t nulLen = GetMBNulLen();
 440                 if ( dstLen >= nulLen &&
 441                         !NotAllNULs(buf.data() + dstLen - nulLen, nulLen) )
 442                 {
 443                     // in this case the output is NUL-terminated and we're not
 444                     // supposed to count NUL
 445                     *outLen -= nulLen;
 446                 }
 447             }
 448
 449             return buf;
 450         }
 451     }
 452
 453     if ( outLen )
 454         *outLen = 0;
 455
 456     return wxCharBuffer();
 457 }
 458
 459 // ----------------------------------------------------------------------------
 460 // wxMBConvLibc
 461 // ----------------------------------------------------------------------------
 462
 463 size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 464 {
 465     return wxMB2WC(buf, psz, n);
 466 }
 467
 468 size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 469 {
 470     return wxWC2MB(buf, psz, n);
 471 }
 472
 473 // ----------------------------------------------------------------------------
 474 // wxConvBrokenFileNames
 475 // ----------------------------------------------------------------------------
 476
 477 #ifdef __UNIX__
 478
 479 wxConvBrokenFileNames::wxConvBrokenFileNames(const wxChar *charset)
 480 {
 481     if ( !charset || wxStricmp(charset, _T("UTF-8")) == 0
 482                   || wxStricmp(charset, _T("UTF8")) == 0  )
 483         m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_PUA);
 484     else
 485         m_conv = new wxCSConv(charset);
 486 }
 487
 488 #endif // __UNIX__
 489
 490 // ----------------------------------------------------------------------------
 491 // UTF-7
 492 // ----------------------------------------------------------------------------
 493
 494 // Implementation (C) 2004 Fredrik Roubert
 495
//
// BASE64 decoding table
//
// This table is indexed by the byte value and contains the 6 bit value
// (0..0x3f) which this byte represents in BASE64 or 0xff if the byte is not
// a BASE64 character: 'A'..'Z' map to 0x00..0x19, 'a'..'z' to 0x1a..0x33,
// '0'..'9' to 0x34..0x3d, '+' to 0x3e and '/' to 0x3f.
//
static const unsigned char utf7unb64[] =
{
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
    0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b,
    0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
    0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
    0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
    0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
    0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
    0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30,
    0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
};
 534
// Converts the NUL-terminated UTF-7 string psz to wide characters.
//
// If buf is NULL, only the length of the result is computed, otherwise the
// result is stored in buf and conversion stops when n characters have been
// produced. A terminating NUL is only stored if there is room for it
// (len < n) and is not counted in the return value.
//
// Returns the number of wide characters produced or wxCONV_FAILED if a '+'
// is followed neither by '-' nor by enough BASE64 data to decode a byte.
//
// NOTE(review): the limit n is only checked between the top level items, the
// BASE64 decoding loop stores characters in buf without comparing len with n
// -- confirm that the callers always provide a big enough buffer.
size_t wxMBConvUTF7::MB2WC(wchar_t *buf, const char *psz, size_t n) const
{
    size_t len = 0;

    while ( *psz && (!buf || (len < n)) )
    {
        unsigned char cc = *psz++;
        if (cc != '+')
        {
            // plain ASCII char
            if (buf)
                *buf++ = cc;
            len++;
        }
        else if (*psz == '-')
        {
            // encoded plus sign: "+-" stands for a single '+'
            if (buf)
                *buf++ = cc;
            len++;
            psz++;
        }
        else // start of BASE64 encoded string
        {
            // d accumulates the decoded bits and l is the number of bits in
            // it not yet consumed; lsb tells whether the next byte extracted
            // is the low byte of a 16 bit character (the high one comes
            // first) and ok becomes true once at least one byte was decoded
            bool lsb, ok;
            unsigned int d, l;
            for ( ok = lsb = false, d = 0, l = 0;
                  (cc = utf7unb64[(unsigned char)*psz]) != 0xff;
                  psz++ )
            {
                // each BASE64 character contributes 6 bits
                d <<= 6;
                d += cc;
                for (l += 6; l >= 8; lsb = !lsb)
                {
                    // extract the next complete byte
                    unsigned char c = (unsigned char)((d >> (l -= 8)) % 256);
                    if (lsb)
                    {
                        // low byte completes the character started below
                        if (buf)
                            *buf++ |= c;
                        len ++;
                    }
                    else
                    {
                        // high byte starts a new character
                        if (buf)
                            *buf = (wchar_t)(c << 8);
                    }

                    ok = true;
                }
            }

            if ( !ok )
            {
                // in valid UTF7 we should have valid characters after '+'
                return wxCONV_FAILED;
            }

            // skip the optional '-' terminating the BASE64 sequence
            if (*psz == '-')
                psz++;
        }
    }

    if ( buf && (len < n) )
        *buf = '\0';

    return len;
}
 602
//
// BASE64 encoding table
//
// This table is indexed by the 6 bit value (0..63) and contains the
// character used to represent it in BASE64.
//
static const unsigned char utf7enb64[] =
{
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
    'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
    'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
    'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
    'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
    'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
    'w', 'x', 'y', 'z', '0', '1', '2', '3',
    '4', '5', '6', '7', '8', '9', '+', '/'
};
 617
//
// UTF-7 encoding table
//
// This table is indexed by the ASCII character code (0..127) and contains
// the class of the character:
//
// 0 - Set D (directly encoded characters)
// 1 - Set O (optional direct characters)
// 2 - whitespace characters (optional)
// 3 - special characters
//
// wxMBConvUTF7::WC2MB() copies only the characters of class 0 to the output
// directly and BASE64-encodes all the others.
//
static const unsigned char utf7encode[128] =
{
    3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
};
 637
// Converts the NUL-terminated wide character string psz to UTF-7.
//
// If buf is NULL, only the length of the result is computed, otherwise the
// result is stored in buf and conversion stops when n bytes have been
// produced. A terminating NUL is only stored if there is room for it
// (len < n) and is not counted in the return value.
//
// Returns the number of bytes produced or, if WC_UTF16 is not defined,
// wxCONV_FAILED if a character greater than 0xffff is encountered.
//
// NOTE(review): the limit n is only checked between the top level items, the
// BASE64 encoding loop stores bytes in buf without comparing len with n --
// confirm that the callers always provide a big enough buffer.
size_t wxMBConvUTF7::WC2MB(char *buf, const wchar_t *psz, size_t n) const
{
    size_t len = 0;

    while (*psz && ((!buf) || (len < n)))
    {
        wchar_t cc = *psz++;
        if (cc < 0x80 && utf7encode[cc] < 1)
        {
            // plain ASCII char
            if (buf)
                *buf++ = (char)cc;

            len++;
        }
#ifndef WC_UTF16
        else if (((wxUint32)cc) > 0xffff)
        {
            // no surrogate pair generation (yet?)
            return wxCONV_FAILED;
        }
#endif
        else
        {
            // all the other characters are written as '+', optional BASE64
            // data and '-'; in particular '+' itself becomes just "+-"
            if (buf)
                *buf++ = '+';

            len++;
            if (cc != '+')
            {
                // BASE64 encode string
                //
                // d accumulates the bits to encode and l is the number of
                // bits in it not yet written out
                unsigned int lsb, d, l;
                for (d = 0, l = 0; /*nothing*/; psz++)
                {
                    // add the high byte of the character and then the low one
                    for (lsb = 0; lsb < 2; lsb ++)
                    {
                        d <<= 8;
                        d += lsb ? cc & 0xff : (cc & 0xff00) >> 8;

                        // write out all the complete groups of 6 bits
                        for (l += 8; l >= 6; )
                        {
                            l -= 6;
                            if (buf)
                                *buf++ = utf7enb64[(d >> l) % 64];
                            len++;
                        }
                    }

                    // continue with the next character unless it is the end
                    // of the string or a directly encoded character
                    cc = *psz;
                    if (!(cc) || (cc < 0x80 && utf7encode[cc] < 1))
                        break;
                }

                // write out the remaining bits, if any, padded with zeroes
                if (l != 0)
                {
                    if (buf)
                        *buf++ = utf7enb64[((d % 16) << (6 - l)) % 64];

                    len++;
                }
            }

            if (buf)
                *buf++ = '-';
            len++;
        }
    }

    if (buf && (len < n))
        *buf = 0;

    return len;
}
 711
 712 // ----------------------------------------------------------------------------
 713 // UTF-8
 714 // ----------------------------------------------------------------------------
 715
// utf8_max[i] is the greatest value which the UTF-8 conversion functions
// below consider to be representable using i continuation bytes, i.e. i + 1
// bytes in total; the last element stops the search for the sequence length
// in wxMBConvUTF8::WC2MB()
static wxUint32 utf8_max[]=
    { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };

// boundaries of the private use area we use to (temporarily) remap the bytes
// which are invalid in a UTF-8 encoded string: the byte b is mapped to
// wxUnicodePUA + b, wxUnicodePUAEnd is one past the last value used
const wxUint32 wxUnicodePUA = 0x100000;
const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
 723
// Converts the NUL-terminated UTF-8 string psz to wide characters.
//
// If buf is NULL, only the length of the result is computed, otherwise the
// result is stored in buf and conversion stops when n characters have been
// produced. A terminating NUL is only stored if there is room for it
// (len < n) and is not counted in the return value.
//
// What happens with the invalid UTF-8 sequences depends on m_options:
//  - with MAP_INVALID_UTF8_TO_PUA each byte b of the sequence is converted to
//    the character wxUnicodePUA + b
//  - with MAP_INVALID_UTF8_TO_OCTAL each byte of the sequence is converted to
//    a backslash followed by 3 octal digits (and the backslashes in the input
//    are doubled)
//  - otherwise the conversion fails and wxCONV_FAILED is returned
//
// Returns the number of wide characters produced or wxCONV_FAILED.
size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const
{
    size_t len = 0;

    while (*psz && ((!buf) || (len < n)))
    {
        // remember the start of this sequence to be able to go back to it if
        // it turns out to be invalid
        const char *opsz = psz;
        bool invalid = false;
        unsigned char cc = *psz++, fc = cc;

        // count the number of leading 1 bits in the first byte
        unsigned cnt;
        for (cnt = 0; fc & 0x80; cnt++)
            fc <<= 1;

        if (!cnt)
        {
            // plain ASCII char
            if (buf)
                *buf++ = cc;
            len++;

            // escape the escape character for octal escapes
            if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
                    && cc == '\\' && (!buf || len < n))
            {
                if (buf)
                    *buf++ = cc;
                len++;
            }
        }
        else
        {
            // cnt is now the number of continuation bytes expected
            cnt--;
            if (!cnt)
            {
                // invalid UTF-8 sequence: continuation byte without a lead one
                invalid = true;
            }
            else
            {
                unsigned ocnt = cnt - 1;
                wxUint32 res = cc & (0x3f >> cnt);
                while (cnt--)
                {
                    cc = *psz;
                    if ((cc & 0xC0) != 0x80)
                    {
                        // invalid UTF-8 sequence
                        invalid = true;
                        break;
                    }

                    psz++;
                    res = (res << 6) | (cc & 0x3f);
                }

                // the value must not fit in a sequence which is one byte
                // shorter than the one which was used for it
                if (invalid || res <= utf8_max[ocnt])
                {
                    // illegal UTF-8 encoding
                    invalid = true;
                }
                else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
                        res >= wxUnicodePUA && res < wxUnicodePUAEnd)
                {
                    // if one of our PUA characters turns up externally
                    // it must also be treated as an illegal sequence
                    // (a bit like you have to escape an escape character)
                    invalid = true;
                }
                else
                {
#ifdef WC_UTF16
                    // cast is ok because wchar_t == wxUint16 if WC_UTF16
                    size_t pa = encode_utf16(res, (wxUint16 *)buf);
                    if (pa == wxCONV_FAILED)
                    {
                        invalid = true;
                    }
                    else
                    {
                        if (buf)
                            buf += pa;
                        len += pa;
                    }
#else // !WC_UTF16
                    if (buf)
                        *buf++ = (wchar_t)res;
                    len++;
#endif // WC_UTF16/!WC_UTF16
                }
            }

            if (invalid)
            {
                // convert all the bytes of the sequence consumed so far, i.e.
                // the ones between opsz and psz, one by one
                if (m_options & MAP_INVALID_UTF8_TO_PUA)
                {
                    while (opsz < psz && (!buf || len < n))
                    {
#ifdef WC_UTF16
                        // cast is ok because wchar_t == wxUint16 if WC_UTF16
                        size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
                        wxASSERT(pa != wxCONV_FAILED);
                        if (buf)
                            buf += pa;
                        opsz++;
                        len += pa;
#else
                        if (buf)
                            *buf++ = (wchar_t)(wxUnicodePUA + (unsigned char)*opsz);
                        opsz++;
                        len++;
#endif
                    }
                }
                else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
                {
                    while (opsz < psz && (!buf || len < n))
                    {
                        // the escape is only written if there is enough room
                        // for it but its 4 characters are counted in any case
                        if ( buf && len + 3 < n )
                        {
                            unsigned char on = *opsz;
                            *buf++ = L'\\';
                            *buf++ = (wchar_t)( L'0' + on / 0100 );
                            *buf++ = (wchar_t)( L'0' + (on % 0100) / 010 );
                            *buf++ = (wchar_t)( L'0' + on % 010 );
                        }

                        opsz++;
                        len += 4;
                    }
                }
                else // MAP_INVALID_UTF8_NOT
                {
                    return wxCONV_FAILED;
                }
            }
        }
    }

    if (buf && (len < n))
        *buf = 0;

    return len;
}
 867
 868 static inline bool isoctal(wchar_t wch)
 869 {
 870     return L'0' <= wch && wch <= L'7';
 871 }
 872
// Converts the NUL-terminated wide character string psz to UTF-8.
//
// If buf is NULL, only the length of the result is computed, otherwise the
// result is stored in buf and conversion stops when n bytes have been
// produced. A terminating NUL is only stored if there is room for it
// (len < n) and is not counted in the return value.
//
// The special representations of the invalid UTF-8 bytes are converted back
// to the original bytes depending on m_options:
//  - with MAP_INVALID_UTF8_TO_PUA the characters in the range
//    [wxUnicodePUA, wxUnicodePUAEnd) are converted to single bytes
//  - with MAP_INVALID_UTF8_TO_OCTAL a double backslash is converted to a
//    single one and a backslash followed by 3 octal digits to the byte with
//    this value
//
// Returns the number of bytes produced.
size_t wxMBConvUTF8::WC2MB(char *buf, const wchar_t *psz, size_t n) const
{
    size_t len = 0;

    while (*psz && ((!buf) || (len < n)))
    {
        wxUint32 cc;

#ifdef WC_UTF16
        // cast is ok for WC_UTF16
        //
        // if the surrogate can't be decoded we just skip over one unit and
        // use whatever decode_utf16() stored in cc
        size_t pa = decode_utf16((const wxUint16 *)psz, cc);
        psz += (pa == wxCONV_FAILED) ? 1 : pa;
#else
        cc = (*psz++) & 0x7fffffff;
#endif

        if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
                && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
        {
            // restore the original byte
            if (buf)
                *buf++ = (char)(cc - wxUnicodePUA);
            len++;
        }
        else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL)
                    && cc == L'\\' && psz[0] == L'\\' )
        {
            // escaped backslash
            if (buf)
                *buf++ = (char)cc;
            psz++;
            len++;
        }
        else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
                    cc == L'\\' &&
                        isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
        {
            // octal escape: restore the original byte
            if (buf)
            {
                *buf++ = (char) ((psz[0] - L'0') * 0100 +
                                 (psz[1] - L'0') * 010 +
                                 (psz[2] - L'0'));
            }

            psz += 3;
            len++;
        }
        else
        {
            // find the number of continuation bytes needed for this character
            unsigned cnt;
            for (cnt = 0; cc > utf8_max[cnt]; cnt++)
            {
            }

            if (!cnt)
            {
                // plain ASCII char
                if (buf)
                    *buf++ = (char) cc;
                len++;
            }
            else
            {
                len += cnt + 1;
                if (buf)
                {
                    // the lead byte has cnt + 1 high bits set followed by the
                    // most significant bits of the character
                    *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));

                    // and each of the continuation bytes contains 6 more bits
                    while (cnt--)
                        *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
                }
            }
        }
    }

    if (buf && (len < n))
        *buf = 0;

    return len;
}
 950
 951 // ============================================================================
 952 // UTF-16
 953 // ============================================================================
 954
// wxMBConvUTF16straight is defined as the UTF-16 conversion class
// (wxMBConvUTF16BE if WORDS_BIGENDIAN is defined, wxMBConvUTF16LE otherwise)
// implemented below without changing the endianness of the data and
// wxMBConvUTF16swap as the other one
#ifdef WORDS_BIGENDIAN
    #define wxMBConvUTF16straight wxMBConvUTF16BE
    #define wxMBConvUTF16swap     wxMBConvUTF16LE
#else
    #define wxMBConvUTF16swap     wxMBConvUTF16BE
    #define wxMBConvUTF16straight wxMBConvUTF16LE
#endif
 962
 963 /* static */
 964 size_t wxMBConvUTF16Base::GetLength(const char *src, size_t srcLen)
 965 {
 966     if ( srcLen == wxNO_LEN )
 967     {
 968         // count the number of bytes in input, including the trailing NULs
 969         const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
 970         for ( srcLen = 1; *inBuff++; srcLen++ )
 971             ;
 972
 973         srcLen *= BYTES_PER_CHAR;
 974     }
 975     else // we already have the length
 976     {
 977         // we can only convert an entire number of UTF-16 characters
 978         if ( srcLen % BYTES_PER_CHAR )
 979             return wxCONV_FAILED;
 980     }
 981
 982     return srcLen;
 983 }
 984
 985 // case when in-memory representation is UTF-16 too
 986 #ifdef WC_UTF16
 987
 988 // ----------------------------------------------------------------------------
 989 // conversions without endianness change
 990 // ----------------------------------------------------------------------------
 991
 992 size_t
 993 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
 994                                const char *src, size_t srcLen) const
 995 {
 996     // set up the scene for using memcpy() (which is presumably more efficient
 997     // than copying the bytes one by one)
 998     srcLen = GetLength(src, srcLen);
 999     if ( srcLen == wxNO_LEN )
1000         return wxCONV_FAILED;
1001
1002     const size_t inLen = srcLen / BYTES_PER_CHAR;
1003     if ( dst )
1004     {
1005         if ( dstLen < inLen )
1006             return wxCONV_FAILED;
1007
1008         memcpy(dst, src, srcLen);
1009     }
1010
1011     return inLen;
1012 }
1013
1014 size_t
1015 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1016                                  const wchar_t *src, size_t srcLen) const
1017 {
1018     if ( srcLen == wxNO_LEN )
1019         srcLen = wxWcslen(src) + 1;
1020
1021     srcLen *= BYTES_PER_CHAR;
1022
1023     if ( dst )
1024     {
1025         if ( dstLen < srcLen )
1026             return wxCONV_FAILED;
1027
1028         memcpy(dst, src, srcLen);
1029     }
1030
1031     return srcLen;
1032 }
1033
1034 // ----------------------------------------------------------------------------
1035 // endian-reversing conversions
1036 // ----------------------------------------------------------------------------
1037
1038 size_t
1039 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1040                            const char *src, size_t srcLen) const
1041 {
1042     srcLen = GetLength(src, srcLen);
1043     if ( srcLen == wxNO_LEN )
1044         return wxCONV_FAILED;
1045
1046     srcLen /= BYTES_PER_CHAR;
1047
1048     if ( dst )
1049     {
1050         if ( dstLen < srcLen )
1051             return wxCONV_FAILED;
1052
1053         const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1054         for ( size_t n = 0; n < srcLen; n++, inBuff++ )
1055         {
1056             *dst++ = wxUINT16_SWAP_ALWAYS(*inBuff);
1057         }
1058     }
1059
1060     return srcLen;
1061 }
1062
1063 size_t
1064 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1065                              const wchar_t *src, size_t srcLen) const
1066 {
1067     if ( srcLen == wxNO_LEN )
1068         srcLen = wxWcslen(src) + 1;
1069
1070     srcLen *= BYTES_PER_CHAR;
1071
1072     if ( dst )
1073     {
1074         if ( dstLen < srcLen )
1075             return wxCONV_FAILED;
1076
1077         wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1078         for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
1079         {
1080             *outBuff++ = wxUINT16_SWAP_ALWAYS(*src);
1081         }
1082     }
1083
1084     return srcLen;
1085 }
1086
1087 #else // !WC_UTF16: wchar_t is UTF-32
1088
1089 // ----------------------------------------------------------------------------
1090 // conversions without endianness change
1091 // ----------------------------------------------------------------------------
1092
1093 size_t
1094 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1095                                const char *src, size_t srcLen) const
1096 {
1097     srcLen = GetLength(src, srcLen);
1098     if ( srcLen == wxNO_LEN )
1099         return wxCONV_FAILED;
1100
1101     const size_t inLen = srcLen / BYTES_PER_CHAR;
1102     if ( !dst )
1103     {
1104         // optimization: return maximal space which could be needed for this
1105         // string even if the real size could be smaller if the buffer contains
1106         // any surrogates
1107         return inLen;
1108     }
1109
1110     size_t outLen = 0;
1111     const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1112     for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1113     {
1114         const wxUint32 ch = wxDecodeSurrogate(&inBuff);
1115         if ( !inBuff )
1116             return wxCONV_FAILED;
1117
1118         if ( ++outLen > dstLen )
1119             return wxCONV_FAILED;
1120
1121         *dst++ = ch;
1122     }
1123
1124
1125     return outLen;
1126 }
1127
1128 size_t
1129 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1130                                  const wchar_t *src, size_t srcLen) const
1131 {
1132     if ( srcLen == wxNO_LEN )
1133         srcLen = wxWcslen(src) + 1;
1134
1135     size_t outLen = 0;
1136     wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1137     for ( size_t n = 0; n < srcLen; n++ )
1138     {
1139         wxUint16 cc[2];
1140         const size_t numChars = encode_utf16(*src++, cc);
1141         if ( numChars == wxCONV_FAILED )
1142             return wxCONV_FAILED;
1143
1144         outLen += numChars * BYTES_PER_CHAR;
1145         if ( outBuff )
1146         {
1147             if ( outLen > dstLen )
1148                 return wxCONV_FAILED;
1149
1150             *outBuff++ = cc[0];
1151             if ( numChars == 2 )
1152             {
1153                 // second character of a surrogate
1154                 *outBuff++ = cc[1];
1155             }
1156         }
1157     }
1158
1159     return outLen;
1160 }
1161
1162 // ----------------------------------------------------------------------------
1163 // endian-reversing conversions
1164 // ----------------------------------------------------------------------------
1165
1166 size_t
1167 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1168                            const char *src, size_t srcLen) const
1169 {
1170     srcLen = GetLength(src, srcLen);
1171     if ( srcLen == wxNO_LEN )
1172         return wxCONV_FAILED;
1173
1174     const size_t inLen = srcLen / BYTES_PER_CHAR;
1175     if ( !dst )
1176     {
1177         // optimization: return maximal space which could be needed for this
1178         // string even if the real size could be smaller if the buffer contains
1179         // any surrogates
1180         return inLen;
1181     }
1182
1183     size_t outLen = 0;
1184     const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1185     for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1186     {
1187         wxUint32 ch;
1188         wxUint16 tmp[2];
1189
1190         tmp[0] = wxUINT16_SWAP_ALWAYS(*inBuff);
1191         inBuff++;
1192         tmp[1] = wxUINT16_SWAP_ALWAYS(*inBuff);
1193
1194         const size_t numChars = decode_utf16(tmp, ch);
1195         if ( numChars == wxCONV_FAILED )
1196             return wxCONV_FAILED;
1197
1198         if ( numChars == 2 )
1199             inBuff++;
1200
1201         if ( ++outLen > dstLen )
1202             return wxCONV_FAILED;
1203
1204         *dst++ = ch;
1205     }
1206
1207
1208     return outLen;
1209 }
1210
1211 size_t
1212 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1213                              const wchar_t *src, size_t srcLen) const
1214 {
1215     if ( srcLen == wxNO_LEN )
1216         srcLen = wxWcslen(src) + 1;
1217
1218     size_t outLen = 0;
1219     wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1220     for ( const wchar_t *srcEnd = src + srcLen; src < srcEnd; src++ )
1221     {
1222         wxUint16 cc[2];
1223         const size_t numChars = encode_utf16(*src, cc);
1224         if ( numChars == wxCONV_FAILED )
1225             return wxCONV_FAILED;
1226
1227         outLen += numChars * BYTES_PER_CHAR;
1228         if ( outBuff )
1229         {
1230             if ( outLen > dstLen )
1231                 return wxCONV_FAILED;
1232
1233             *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[0]);
1234             if ( numChars == 2 )
1235             {
1236                 // second character of a surrogate
1237                 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[1]);
1238             }
1239         }
1240     }
1241
1242     return outLen;
1243 }
1244
1245 #endif // WC_UTF16/!WC_UTF16
1246
1247
1248 // ============================================================================
1249 // UTF-32
1250 // ============================================================================
1251
// As for UTF-16 above, the implementations below are written in terms of a
// "straight" converter (32 bit units used unchanged) and a "swap" converter
// (every unit byte-swapped). Which of the BE/LE classes plays which role
// depends on whether WORDS_BIGENDIAN is defined for this build.
#ifdef WORDS_BIGENDIAN
    #define wxMBConvUTF32straight  wxMBConvUTF32BE
    #define wxMBConvUTF32swap      wxMBConvUTF32LE
#else
    #define wxMBConvUTF32swap      wxMBConvUTF32BE
    #define wxMBConvUTF32straight  wxMBConvUTF32LE
#endif


// global instances of the little and big endian UTF-32 converters
WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
1263
1264 /* static */
1265 size_t wxMBConvUTF32Base::GetLength(const char *src, size_t srcLen)
1266 {
1267     if ( srcLen == wxNO_LEN )
1268     {
1269         // count the number of bytes in input, including the trailing NULs
1270         const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1271         for ( srcLen = 1; *inBuff++; srcLen++ )
1272             ;
1273
1274         srcLen *= BYTES_PER_CHAR;
1275     }
1276     else // we already have the length
1277     {
1278         // we can only convert an entire number of UTF-32 characters
1279         if ( srcLen % BYTES_PER_CHAR )
1280             return wxCONV_FAILED;
1281     }
1282
1283     return srcLen;
1284 }
1285
1286 // case when in-memory representation is UTF-16
1287 #ifdef WC_UTF16
1288
1289 // ----------------------------------------------------------------------------
1290 // conversions without endianness change
1291 // ----------------------------------------------------------------------------
1292
1293 size_t
1294 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1295                                const char *src, size_t srcLen) const
1296 {
1297     srcLen = GetLength(src, srcLen);
1298     if ( srcLen == wxNO_LEN )
1299         return wxCONV_FAILED;
1300
1301     const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1302     const size_t inLen = srcLen / BYTES_PER_CHAR;
1303     size_t outLen = 0;
1304     for ( size_t n = 0; n < inLen; n++ )
1305     {
1306         wxUint16 cc[2];
1307         const size_t numChars = encode_utf16(*inBuff++, cc);
1308         if ( numChars == wxCONV_FAILED )
1309             return wxCONV_FAILED;
1310
1311         outLen += numChars;
1312         if ( dst )
1313         {
1314             if ( outLen > dstLen )
1315                 return wxCONV_FAILED;
1316
1317             *dst++ = cc[0];
1318             if ( numChars == 2 )
1319             {
1320                 // second character of a surrogate
1321                 *dst++ = cc[1];
1322             }
1323         }
1324     }
1325
1326     return outLen;
1327 }
1328
1329 size_t
1330 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1331                                  const wchar_t *src, size_t srcLen) const
1332 {
1333     if ( srcLen == wxNO_LEN )
1334         srcLen = wxWcslen(src) + 1;
1335
1336     if ( !dst )
1337     {
1338         // optimization: return maximal space which could be needed for this
1339         // string instead of the exact amount which could be less if there are
1340         // any surrogates in the input
1341         //
1342         // we consider that surrogates are rare enough to make it worthwhile to
1343         // avoid running the loop below at the cost of slightly extra memory
1344         // consumption
1345         return srcLen * BYTES_PER_CHAR;
1346     }
1347
1348     wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1349     size_t outLen = 0;
1350     for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1351     {
1352         const wxUint32 ch = wxDecodeSurrogate(&src);
1353         if ( !src )
1354             return wxCONV_FAILED;
1355
1356         outLen += BYTES_PER_CHAR;
1357
1358         if ( outLen > dstLen )
1359             return wxCONV_FAILED;
1360
1361         *outBuff++ = ch;
1362     }
1363
1364     return outLen;
1365 }
1366
1367 // ----------------------------------------------------------------------------
1368 // endian-reversing conversions
1369 // ----------------------------------------------------------------------------
1370
1371 size_t
1372 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1373                            const char *src, size_t srcLen) const
1374 {
1375     srcLen = GetLength(src, srcLen);
1376     if ( srcLen == wxNO_LEN )
1377         return wxCONV_FAILED;
1378
1379     const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1380     const size_t inLen = srcLen / BYTES_PER_CHAR;
1381     size_t outLen = 0;
1382     for ( size_t n = 0; n < inLen; n++, inBuff++ )
1383     {
1384         wxUint16 cc[2];
1385         const size_t numChars = encode_utf16(wxUINT32_SWAP_ALWAYS(*inBuff), cc);
1386         if ( numChars == wxCONV_FAILED )
1387             return wxCONV_FAILED;
1388
1389         outLen += numChars;
1390         if ( dst )
1391         {
1392             if ( outLen > dstLen )
1393                 return wxCONV_FAILED;
1394
1395             *dst++ = cc[0];
1396             if ( numChars == 2 )
1397             {
1398                 // second character of a surrogate
1399                 *dst++ = cc[1];
1400             }
1401         }
1402     }
1403
1404     return outLen;
1405 }
1406
1407 size_t
1408 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1409                              const wchar_t *src, size_t srcLen) const
1410 {
1411     if ( srcLen == wxNO_LEN )
1412         srcLen = wxWcslen(src) + 1;
1413
1414     if ( !dst )
1415     {
1416         // optimization: return maximal space which could be needed for this
1417         // string instead of the exact amount which could be less if there are
1418         // any surrogates in the input
1419         //
1420         // we consider that surrogates are rare enough to make it worthwhile to
1421         // avoid running the loop below at the cost of slightly extra memory
1422         // consumption
1423         return srcLen*BYTES_PER_CHAR;
1424     }
1425
1426     wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1427     size_t outLen = 0;
1428     for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1429     {
1430         const wxUint32 ch = wxDecodeSurrogate(&src);
1431         if ( !src )
1432             return wxCONV_FAILED;
1433
1434         outLen += BYTES_PER_CHAR;
1435
1436         if ( outLen > dstLen )
1437             return wxCONV_FAILED;
1438
1439         *outBuff++ = wxUINT32_SWAP_ALWAYS(ch);
1440     }
1441
1442     return outLen;
1443 }
1444
1445 #else // !WC_UTF16: wchar_t is UTF-32
1446
1447 // ----------------------------------------------------------------------------
1448 // conversions without endianness change
1449 // ----------------------------------------------------------------------------
1450
1451 size_t
1452 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1453                                const char *src, size_t srcLen) const
1454 {
1455     // use memcpy() as it should be much faster than hand-written loop
1456     srcLen = GetLength(src, srcLen);
1457     if ( srcLen == wxNO_LEN )
1458         return wxCONV_FAILED;
1459
1460     const size_t inLen = srcLen/BYTES_PER_CHAR;
1461     if ( dst )
1462     {
1463         if ( dstLen < inLen )
1464             return wxCONV_FAILED;
1465
1466         memcpy(dst, src, srcLen);
1467     }
1468
1469     return inLen;
1470 }
1471
1472 size_t
1473 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1474                                  const wchar_t *src, size_t srcLen) const
1475 {
1476     if ( srcLen == wxNO_LEN )
1477         srcLen = wxWcslen(src) + 1;
1478
1479     srcLen *= BYTES_PER_CHAR;
1480
1481     if ( dst )
1482     {
1483         if ( dstLen < srcLen )
1484             return wxCONV_FAILED;
1485
1486         memcpy(dst, src, srcLen);
1487     }
1488
1489     return srcLen;
1490 }
1491
1492 // ----------------------------------------------------------------------------
1493 // endian-reversing conversions
1494 // ----------------------------------------------------------------------------
1495
1496 size_t
1497 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1498                            const char *src, size_t srcLen) const
1499 {
1500     srcLen = GetLength(src, srcLen);
1501     if ( srcLen == wxNO_LEN )
1502         return wxCONV_FAILED;
1503
1504     srcLen /= BYTES_PER_CHAR;
1505
1506     if ( dst )
1507     {
1508         if ( dstLen < srcLen )
1509             return wxCONV_FAILED;
1510
1511         const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1512         for ( size_t n = 0; n < srcLen; n++, inBuff++ )
1513         {
1514             *dst++ = wxUINT32_SWAP_ALWAYS(*inBuff);
1515         }
1516     }
1517
1518     return srcLen;
1519 }
1520
1521 size_t
1522 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1523                              const wchar_t *src, size_t srcLen) const
1524 {
1525     if ( srcLen == wxNO_LEN )
1526         srcLen = wxWcslen(src) + 1;
1527
1528     srcLen *= BYTES_PER_CHAR;
1529
1530     if ( dst )
1531     {
1532         if ( dstLen < srcLen )
1533             return wxCONV_FAILED;
1534
1535         wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1536         for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
1537         {
1538             *outBuff++ = wxUINT32_SWAP_ALWAYS(*src);
1539         }
1540     }
1541
1542     return srcLen;
1543 }
1544
1545 #endif // WC_UTF16/!WC_UTF16
1546
1547
1548 // ============================================================================
1549 // The classes doing conversion using the iconv_xxx() functions
1550 // ============================================================================
1551
1552 #ifdef HAVE_ICONV
1553
1554 // VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
1555 //     E2BIG if output buffer is _exactly_ as big as needed. Such case is
1556 //     (unless there's yet another bug in glibc) the only case when iconv()
1557 //     returns with (size_t)-1 (which means error) and says there are 0 bytes
1558 //     left in the input buffer -- when _real_ error occurs,
1559 //     bytes-left-in-input buffer is non-zero. Hence, this alternative test for
1560 //     iconv() failure.
1561 //     [This bug does not appear in glibc 2.2.]
// ICONV_FAILED(cres, bufLeft): true if the iconv() call which returned cres
// and left bufLeft bytes of input unconverted should be treated as an error;
// for glibc 2.0/2.1 the spurious E2BIG described above is not counted as one
#if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
#define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
                                     (errno != E2BIG || bufLeft != 0))
#else
#define ICONV_FAILED(cres, bufLeft)  (cres == (size_t)-1)
#endif

// cast the address of an input pointer to the type used for the input buffer
// argument of iconv() (ICONV_CONST is defined outside of this file section)
#define ICONV_CHAR_CAST(x)  ((ICONV_CONST char **)(x))

// the value iconv_open() returns on failure
#define ICONV_T_INVALID ((iconv_t)-1)

// WC_BSWAP reverses the bytes of a wchar_t and WC_ENC is the font encoding
// used to look up the names of the charset corresponding to wchar_t, both are
// selected according to the size of wchar_t
#if SIZEOF_WCHAR_T == 4
    #define WC_BSWAP    wxUINT32_SWAP_ALWAYS
    #define WC_ENC      wxFONTENCODING_UTF32
#elif SIZEOF_WCHAR_T == 2
    #define WC_BSWAP    wxUINT16_SWAP_ALWAYS
    #define WC_ENC      wxFONTENCODING_UTF16
#else // sizeof(wchar_t) != 2 nor 4
    // does this ever happen?
    #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
#endif
1583
1584 // ----------------------------------------------------------------------------
1585 // wxMBConv_iconv: encapsulates an iconv character set
1586 // ----------------------------------------------------------------------------
1587
1588 class wxMBConv_iconv : public wxMBConv
1589 {
1590 public:
1591     wxMBConv_iconv(const wxChar *name);
1592     virtual ~wxMBConv_iconv();
1593
1594     virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const;
1595     virtual size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const;
1596
1597     // classify this encoding as explained in wxMBConv::GetMBNulLen() comment
1598     virtual size_t GetMBNulLen() const;
1599
1600 #if wxUSE_UNICODE_UTF8
1601     virtual bool IsUTF8() const;
1602 #endif
1603
1604     virtual wxMBConv *Clone() const
1605     {
1606         wxMBConv_iconv *p = new wxMBConv_iconv(m_name);
1607         p->m_minMBCharWidth = m_minMBCharWidth;
1608         return p;
1609     }
1610
1611     bool IsOk() const
1612         { return (m2w != ICONV_T_INVALID) && (w2m != ICONV_T_INVALID); }
1613
1614 protected:
1615     // the iconv handlers used to translate from multibyte
1616     // to wide char and in the other direction
1617     iconv_t m2w,
1618             w2m;
1619
1620 #if wxUSE_THREADS
1621     // guards access to m2w and w2m objects
1622     wxMutex m_iconvMutex;
1623 #endif
1624
1625 private:
1626     // the name (for iconv_open()) of a wide char charset -- if none is
1627     // available on this machine, it will remain NULL
1628     static wxString ms_wcCharsetName;
1629
1630     // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
1631     // different endian-ness than the native one
1632     static bool ms_wcNeedsSwap;
1633
1634
1635     // name of the encoding handled by this conversion
1636     wxString m_name;
1637
1638     // cached result of GetMBNulLen(); set to 0 meaning "unknown"
1639     // initially
1640     size_t m_minMBCharWidth;
1641 };
1642
1643 // make the constructor available for unit testing
1644 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_iconv( const wxChar* name )
1645 {
1646     wxMBConv_iconv* result = new wxMBConv_iconv( name );
1647     if ( !result->IsOk() )
1648     {
1649         delete result;
1650         return 0;
1651     }
1652
1653     return result;
1654 }
1655
// definitions of the static members shared by all wxMBConv_iconv objects: the
// charset name starts out empty and is filled in by the constructor once a
// working wide char charset has been found
wxString wxMBConv_iconv::ms_wcCharsetName;
bool wxMBConv_iconv::ms_wcNeedsSwap = false;
1658
1659 wxMBConv_iconv::wxMBConv_iconv(const wxChar *name)
1660               : m_name(name)
1661 {
1662     m_minMBCharWidth = 0;
1663
1664     // iconv operates with chars, not wxChars, but luckily it uses only ASCII
1665     // names for the charsets
1666     const wxCharBuffer cname(wxString(name).ToAscii());
1667
1668     // check for charset that represents wchar_t:
1669     if ( ms_wcCharsetName.empty() )
1670     {
1671         wxLogTrace(TRACE_STRCONV, _T("Looking for wide char codeset:"));
1672
1673 #if wxUSE_FONTMAP
1674         const wxChar **names = wxFontMapperBase::GetAllEncodingNames(WC_ENC);
1675 #else // !wxUSE_FONTMAP
1676         static const wxChar *names_static[] =
1677         {
1678 #if SIZEOF_WCHAR_T == 4
1679             _T("UCS-4"),
1680 #elif SIZEOF_WCHAR_T = 2
1681             _T("UCS-2"),
1682 #endif
1683             NULL
1684         };
1685         const wxChar **names = names_static;
1686 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
1687
1688         for ( ; *names && ms_wcCharsetName.empty(); ++names )
1689         {
1690             const wxString nameCS(*names);
1691
1692             // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
1693             wxString nameXE(nameCS);
1694
1695 #ifdef WORDS_BIGENDIAN
1696                 nameXE += _T("BE");
1697 #else // little endian
1698                 nameXE += _T("LE");
1699 #endif
1700
1701             wxLogTrace(TRACE_STRCONV, _T("  trying charset \"%s\""),
1702                        nameXE.c_str());
1703
1704             m2w = iconv_open(nameXE.ToAscii(), cname);
1705             if ( m2w == ICONV_T_INVALID )
1706             {
1707                 // try charset w/o bytesex info (e.g. "UCS4")
1708                 wxLogTrace(TRACE_STRCONV, _T("  trying charset \"%s\""),
1709                            nameCS.c_str());
1710                 m2w = iconv_open(nameCS.ToAscii(), cname);
1711
1712                 // and check for bytesex ourselves:
1713                 if ( m2w != ICONV_T_INVALID )
1714                 {
1715                     char    buf[2], *bufPtr;
1716                     wchar_t wbuf[2], *wbufPtr;
1717                     size_t  insz, outsz;
1718                     size_t  res;
1719
1720                     buf[0] = 'A';
1721                     buf[1] = 0;
1722                     wbuf[0] = 0;
1723                     insz = 2;
1724                     outsz = SIZEOF_WCHAR_T * 2;
1725                     wbufPtr = wbuf;
1726                     bufPtr = buf;
1727
1728                     res = iconv(
1729                         m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
1730                         (char**)&wbufPtr, &outsz);
1731
1732                     if (ICONV_FAILED(res, insz))
1733                     {
1734                         wxLogLastError(wxT("iconv"));
1735                         wxLogError(_("Conversion to charset '%s' doesn't work."),
1736                                    nameCS.c_str());
1737                     }
1738                     else // ok, can convert to this encoding, remember it
1739                     {
1740                         ms_wcCharsetName = nameCS;
1741                         ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
1742                     }
1743                 }
1744             }
1745             else // use charset not requiring byte swapping
1746             {
1747                 ms_wcCharsetName = nameXE;
1748             }
1749         }
1750
1751         wxLogTrace(TRACE_STRCONV,
1752                    wxT("iconv wchar_t charset is \"%s\"%s"),
1753                    ms_wcCharsetName.empty() ? _T("<none>")
1754                                             : ms_wcCharsetName.c_str(),
1755                    ms_wcNeedsSwap ? _T(" (needs swap)")
1756                                   : _T(""));
1757     }
1758     else // we already have ms_wcCharsetName
1759     {
1760         m2w = iconv_open(ms_wcCharsetName.ToAscii(), cname);
1761     }
1762
1763     if ( ms_wcCharsetName.empty() )
1764     {
1765         w2m = ICONV_T_INVALID;
1766     }
1767     else
1768     {
1769         w2m = iconv_open(cname, ms_wcCharsetName.ToAscii());
1770         if ( w2m == ICONV_T_INVALID )
1771         {
1772             wxLogTrace(TRACE_STRCONV,
1773                        wxT("\"%s\" -> \"%s\" works but not the converse!?"),
1774                        ms_wcCharsetName.c_str(), cname.data());
1775         }
1776     }
1777 }
1778
1779 wxMBConv_iconv::~wxMBConv_iconv()
1780 {
1781     if ( m2w != ICONV_T_INVALID )
1782         iconv_close(m2w);
1783     if ( w2m != ICONV_T_INVALID )
1784         iconv_close(w2m);
1785 }
1786
1787 size_t wxMBConv_iconv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1788 {
1789     // find the string length: notice that must be done differently for
1790     // NUL-terminated strings and UTF-16/32 which are terminated with 2/4 NULs
1791     size_t inbuf;
1792     const size_t nulLen = GetMBNulLen();
1793     switch ( nulLen )
1794     {
1795         default:
1796             return wxCONV_FAILED;
1797
1798         case 1:
1799             inbuf = strlen(psz); // arguably more optimized than our version
1800             break;
1801
1802         case 2:
1803         case 4:
1804             // for UTF-16/32 not only we need to have 2/4 consecutive NULs but
1805             // they also have to start at character boundary and not span two
1806             // adjacent characters
1807             const char *p;
1808             for ( p = psz; NotAllNULs(p, nulLen); p += nulLen )
1809                 ;
1810             inbuf = p - psz;
1811             break;
1812     }
1813
1814 #if wxUSE_THREADS
1815     // NB: iconv() is MT-safe, but each thread must use its own iconv_t handle.
1816     //     Unfortunately there are a couple of global wxCSConv objects such as
1817     //     wxConvLocal that are used all over wx code, so we have to make sure
1818     //     the handle is used by at most one thread at the time. Otherwise
1819     //     only a few wx classes would be safe to use from non-main threads
1820     //     as MB<->WC conversion would fail "randomly".
1821     wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1822 #endif // wxUSE_THREADS
1823
1824     size_t outbuf = n * SIZEOF_WCHAR_T;
1825     size_t res, cres;
1826     // VS: Use these instead of psz, buf because iconv() modifies its arguments:
1827     wchar_t *bufPtr = buf;
1828     const char *pszPtr = psz;
1829
1830     if (buf)
1831     {
1832         // have destination buffer, convert there
1833         cres = iconv(m2w,
1834                      ICONV_CHAR_CAST(&pszPtr), &inbuf,
1835                      (char**)&bufPtr, &outbuf);
1836         res = n - (outbuf / SIZEOF_WCHAR_T);
1837
1838         if (ms_wcNeedsSwap)
1839         {
1840             // convert to native endianness
1841             for ( unsigned i = 0; i < res; i++ )
1842                 buf[n] = WC_BSWAP(buf[i]);
1843         }
1844
1845         // NUL-terminate the string if there is any space left
1846         if (res < n)
1847             buf[res] = 0;
1848     }
1849     else
1850     {
1851         // no destination buffer... convert using temp buffer
1852         // to calculate destination buffer requirement
1853         wchar_t tbuf[8];
1854         res = 0;
1855
1856         do
1857         {
1858             bufPtr = tbuf;
1859             outbuf = 8 * SIZEOF_WCHAR_T;
1860
1861             cres = iconv(m2w,
1862                          ICONV_CHAR_CAST(&pszPtr), &inbuf,
1863                          (char**)&bufPtr, &outbuf );
1864
1865             res += 8 - (outbuf / SIZEOF_WCHAR_T);
1866         }
1867         while ((cres == (size_t)-1) && (errno == E2BIG));
1868     }
1869
1870     if (ICONV_FAILED(cres, inbuf))
1871     {
1872         //VS: it is ok if iconv fails, hence trace only
1873         wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1874         return wxCONV_FAILED;
1875     }
1876
1877     return res;
1878 }
1879
1880 size_t wxMBConv_iconv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1881 {
1882 #if wxUSE_THREADS
1883     // NB: explained in MB2WC
1884     wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1885 #endif
1886
1887     size_t inlen = wxWcslen(psz);
1888     size_t inbuf = inlen * SIZEOF_WCHAR_T;
1889     size_t outbuf = n;
1890     size_t res, cres;
1891
1892     wchar_t *tmpbuf = 0;
1893
1894     if (ms_wcNeedsSwap)
1895     {
1896         // need to copy to temp buffer to switch endianness
1897         // (doing WC_BSWAP twice on the original buffer won't help, as it
1898         //  could be in read-only memory, or be accessed in some other thread)
1899         tmpbuf = (wchar_t *)malloc(inbuf + SIZEOF_WCHAR_T);
1900         for ( size_t i = 0; i < inlen; i++ )
1901             tmpbuf[n] = WC_BSWAP(psz[i]);
1902
1903         tmpbuf[inlen] = L'\0';
1904         psz = tmpbuf;
1905     }
1906
1907     if (buf)
1908     {
1909         // have destination buffer, convert there
1910         cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1911
1912         res = n - outbuf;
1913
1914         // NB: iconv was given only wcslen(psz) characters on input, and so
1915         //     it couldn't convert the trailing zero. Let's do it ourselves
1916         //     if there's some room left for it in the output buffer.
1917         if (res < n)
1918             buf[0] = 0;
1919     }
1920     else
1921     {
1922         // no destination buffer: convert using temp buffer
1923         // to calculate destination buffer requirement
1924         char tbuf[16];
1925         res = 0;
1926         do
1927         {
1928             buf = tbuf;
1929             outbuf = 16;
1930
1931             cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1932
1933             res += 16 - outbuf;
1934         }
1935         while ((cres == (size_t)-1) && (errno == E2BIG));
1936     }
1937
1938     if (ms_wcNeedsSwap)
1939     {
1940         free(tmpbuf);
1941     }
1942
1943     if (ICONV_FAILED(cres, inbuf))
1944     {
1945         wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1946         return wxCONV_FAILED;
1947     }
1948
1949     return res;
1950 }
1951
1952 size_t wxMBConv_iconv::GetMBNulLen() const
1953 {
1954     if ( m_minMBCharWidth == 0 )
1955     {
1956         wxMBConv_iconv * const self = wxConstCast(this, wxMBConv_iconv);
1957
1958 #if wxUSE_THREADS
1959         // NB: explained in MB2WC
1960         wxMutexLocker lock(self->m_iconvMutex);
1961 #endif
1962
1963         wchar_t *wnul = L"";
1964         char buf[8]; // should be enough for NUL in any encoding
1965         size_t inLen = sizeof(wchar_t),
1966                outLen = WXSIZEOF(buf);
1967         char *inBuff = (char *)wnul;
1968         char *outBuff = buf;
1969         if ( iconv(w2m, ICONV_CHAR_CAST(&inBuff), &inLen, &outBuff, &outLen) == (size_t)-1 )
1970         {
1971             self->m_minMBCharWidth = (size_t)-1;
1972         }
1973         else // ok
1974         {
1975             self->m_minMBCharWidth = outBuff - buf;
1976         }
1977     }
1978
1979     return m_minMBCharWidth;
1980 }
1981
1982 #if wxUSE_UNICODE_UTF8
1983 bool wxMBConv_iconv::IsUTF8() const
1984 {
1985     return wxStricmp(m_name, "UTF-8") == 0 ||
1986            wxStricmp(m_name, "UTF8") == 0;
1987 }
1988 #endif
1989
1990 #endif // HAVE_ICONV
1991
1992
1993 // ============================================================================
1994 // Win32 conversion classes
1995 // ============================================================================
1996
1997 #ifdef wxHAVE_WIN32_MB2WC
1998
1999 // from utils.cpp
2000 #if wxUSE_FONTMAP
2001 extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const wxChar *charset);
2002 extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
2003 #endif
2004
2005 class wxMBConv_win32 : public wxMBConv
2006 {
2007 public:
2008     wxMBConv_win32()
2009     {
2010         m_CodePage = CP_ACP;
2011         m_minMBCharWidth = 0;
2012     }
2013
2014     wxMBConv_win32(const wxMBConv_win32& conv)
2015         : wxMBConv()
2016     {
2017         m_CodePage = conv.m_CodePage;
2018         m_minMBCharWidth = conv.m_minMBCharWidth;
2019     }
2020
2021 #if wxUSE_FONTMAP
2022     wxMBConv_win32(const wxChar* name)
2023     {
2024         m_CodePage = wxCharsetToCodepage(name);
2025         m_minMBCharWidth = 0;
2026     }
2027
2028     wxMBConv_win32(wxFontEncoding encoding)
2029     {
2030         m_CodePage = wxEncodingToCodepage(encoding);
2031         m_minMBCharWidth = 0;
2032     }
2033 #endif // wxUSE_FONTMAP
2034
2035     virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2036     {
2037         // note that we have to use MB_ERR_INVALID_CHARS flag as it without it
2038         // the behaviour is not compatible with the Unix version (using iconv)
2039         // and break the library itself, e.g. wxTextInputStream::NextChar()
2040         // wouldn't work if reading an incomplete MB char didn't result in an
2041         // error
2042         //
2043         // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
2044         // Win XP or newer and it is not supported for UTF-[78] so we always
2045         // use our own conversions in this case. See
2046         //     http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
2047         //     http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
2048         if ( m_CodePage == CP_UTF8 )
2049         {
2050             return wxMBConvUTF8().MB2WC(buf, psz, n);
2051         }
2052
2053         if ( m_CodePage == CP_UTF7 )
2054         {
2055             return wxMBConvUTF7().MB2WC(buf, psz, n);
2056         }
2057
2058         int flags = 0;
2059         if ( (m_CodePage < 50000 && m_CodePage != CP_SYMBOL) &&
2060                 IsAtLeastWin2kSP4() )
2061         {
2062             flags = MB_ERR_INVALID_CHARS;
2063         }
2064
2065         const size_t len = ::MultiByteToWideChar
2066                              (
2067                                 m_CodePage,     // code page
2068                                 flags,          // flags: fall on error
2069                                 psz,            // input string
2070                                 -1,             // its length (NUL-terminated)
2071                                 buf,            // output string
2072                                 buf ? n : 0     // size of output buffer
2073                              );
2074         if ( !len )
2075         {
2076             // function totally failed
2077             return wxCONV_FAILED;
2078         }
2079
2080         // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
2081         // check if we succeeded, by doing a double trip:
2082         if ( !flags && buf )
2083         {
2084             const size_t mbLen = strlen(psz);
2085             wxCharBuffer mbBuf(mbLen);
2086             if ( ::WideCharToMultiByte
2087                    (
2088                       m_CodePage,
2089                       0,
2090                       buf,
2091                       -1,
2092                       mbBuf.data(),
2093                       mbLen + 1,        // size in bytes, not length
2094                       NULL,
2095                       NULL
2096                    ) == 0 ||
2097                   strcmp(mbBuf, psz) != 0 )
2098             {
2099                 // we didn't obtain the same thing we started from, hence
2100                 // the conversion was lossy and we consider that it failed
2101                 return wxCONV_FAILED;
2102             }
2103         }
2104
2105         // note that it returns count of written chars for buf != NULL and size
2106         // of the needed buffer for buf == NULL so in either case the length of
2107         // the string (which never includes the terminating NUL) is one less
2108         return len - 1;
2109     }
2110
2111     virtual size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
2112     {
2113         /*
2114             we have a problem here: by default, WideCharToMultiByte() may
2115             replace characters unrepresentable in the target code page with bad
2116             quality approximations such as turning "1/2" symbol (U+00BD) into
2117             "1" for the code pages which don't have it and we, obviously, want
2118             to avoid this at any price
2119
2120             the trouble is that this function does it _silently_, i.e. it won't
2121             even tell us whether it did or not... Win98/2000 and higher provide
2122             WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
2123             we have to resort to a round trip, i.e. check that converting back
2124             results in the same string -- this is, of course, expensive but
2125             otherwise we simply can't be sure to not garble the data.
2126          */
2127
2128         // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
2129         // it doesn't work with CJK encodings (which we test for rather roughly
2130         // here...) nor with UTF-7/8 nor, of course, with Windows versions not
2131         // supporting it
2132         BOOL usedDef wxDUMMY_INITIALIZE(false);
2133         BOOL *pUsedDef;
2134         int flags;
2135         if ( CanUseNoBestFit() && m_CodePage < 50000 )
2136         {
2137             // it's our lucky day
2138             flags = WC_NO_BEST_FIT_CHARS;
2139             pUsedDef = &usedDef;
2140         }
2141         else // old system or unsupported encoding
2142         {
2143             flags = 0;
2144             pUsedDef = NULL;
2145         }
2146
2147         const size_t len = ::WideCharToMultiByte
2148                              (
2149                                 m_CodePage,     // code page
2150                                 flags,          // either none or no best fit
2151                                 pwz,            // input string
2152                                 -1,             // it is (wide) NUL-terminated
2153                                 buf,            // output buffer
2154                                 buf ? n : 0,    // and its size
2155                                 NULL,           // default "replacement" char
2156                                 pUsedDef        // [out] was it used?
2157                              );
2158
2159         if ( !len )
2160         {
2161             // function totally failed
2162             return wxCONV_FAILED;
2163         }
2164
2165         // if we were really converting, check if we succeeded
2166         if ( buf )
2167         {
2168             if ( flags )
2169             {
2170                 // check if the conversion failed, i.e. if any replacements
2171                 // were done
2172                 if ( usedDef )
2173                     return wxCONV_FAILED;
2174             }
2175             else // we must resort to double tripping...
2176             {
2177                 wxWCharBuffer wcBuf(n);
2178                 if ( MB2WC(wcBuf.data(), buf, n) == wxCONV_FAILED ||
2179                         wcscmp(wcBuf, pwz) != 0 )
2180                 {
2181                     // we didn't obtain the same thing we started from, hence
2182                     // the conversion was lossy and we consider that it failed
2183                     return wxCONV_FAILED;
2184                 }
2185             }
2186         }
2187
2188         // see the comment above for the reason of "len - 1"
2189         return len - 1;
2190     }
2191
2192     virtual size_t GetMBNulLen() const
2193     {
2194         if ( m_minMBCharWidth == 0 )
2195         {
2196             int len = ::WideCharToMultiByte
2197                         (
2198                             m_CodePage,     // code page
2199                             0,              // no flags
2200                             L"",            // input string
2201                             1,              // translate just the NUL
2202                             NULL,           // output buffer
2203                             0,              // and its size
2204                             NULL,           // no replacement char
2205                             NULL            // [out] don't care if it was used
2206                         );
2207
2208             wxMBConv_win32 * const self = wxConstCast(this, wxMBConv_win32);
2209             switch ( len )
2210             {
2211                 default:
2212                     wxLogDebug(_T("Unexpected NUL length %d"), len);
2213                     self->m_minMBCharWidth = (size_t)-1;
2214                     break;
2215
2216                 case 0:
2217                     self->m_minMBCharWidth = (size_t)-1;
2218                     break;
2219
2220                 case 1:
2221                 case 2:
2222                 case 4:
2223                     self->m_minMBCharWidth = len;
2224                     break;
2225             }
2226         }
2227
2228         return m_minMBCharWidth;
2229     }
2230
2231     virtual wxMBConv *Clone() const { return new wxMBConv_win32(*this); }
2232
2233     bool IsOk() const { return m_CodePage != -1; }
2234
2235 private:
2236     static bool CanUseNoBestFit()
2237     {
2238         static int s_isWin98Or2k = -1;
2239
2240         if ( s_isWin98Or2k == -1 )
2241         {
2242             int verMaj, verMin;
2243             switch ( wxGetOsVersion(&verMaj, &verMin) )
2244             {
2245                 case wxOS_WINDOWS_9X:
2246                     s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
2247                     break;
2248
2249                 case wxOS_WINDOWS_NT:
2250                     s_isWin98Or2k = verMaj >= 5;
2251                     break;
2252
2253                 default:
2254                     // unknown: be conservative by default
2255                     s_isWin98Or2k = 0;
2256                     break;
2257             }
2258
2259             wxASSERT_MSG( s_isWin98Or2k != -1, _T("should be set above") );
2260         }
2261
2262         return s_isWin98Or2k == 1;
2263     }
2264
2265     static bool IsAtLeastWin2kSP4()
2266     {
2267 #ifdef __WXWINCE__
2268         return false;
2269 #else
2270         static int s_isAtLeastWin2kSP4 = -1;
2271
2272         if ( s_isAtLeastWin2kSP4 == -1 )
2273         {
2274             OSVERSIONINFOEX ver;
2275
2276             memset(&ver, 0, sizeof(ver));
2277             ver.dwOSVersionInfoSize = sizeof(ver);
2278             GetVersionEx((OSVERSIONINFO*)&ver);
2279
2280             s_isAtLeastWin2kSP4 =
2281               ((ver.dwMajorVersion > 5) || // Vista+
2282                (ver.dwMajorVersion == 5 && ver.dwMinorVersion > 0) || // XP/2003
2283                (ver.dwMajorVersion == 5 && ver.dwMinorVersion == 0 &&
2284                ver.wServicePackMajor >= 4)) // 2000 SP4+
2285               ? 1 : 0;
2286         }
2287
2288         return s_isAtLeastWin2kSP4 == 1;
2289 #endif
2290     }
2291
2292
2293     // the code page we're working with
2294     long m_CodePage;
2295
2296     // cached result of GetMBNulLen(), set to 0 initially meaning
2297     // "unknown"
2298     size_t m_minMBCharWidth;
2299 };
2300
2301 #endif // wxHAVE_WIN32_MB2WC
2302
2303 // ============================================================================
2304 // Cocoa conversion classes
2305 // ============================================================================
2306
2307 #if defined(__WXCOCOA__)
2308
// RN: There is no UTF-32 support in either Core Foundation or Cocoa.
// Strangely enough, Core Foundation uses UTF-32 internally quite a
// bit - it's just not public (yet).
2312
2313 #include <CoreFoundation/CFString.h>
2314 #include <CoreFoundation/CFStringEncodingExt.h>
2315
2316 CFStringEncoding wxCFStringEncFromFontEnc(wxFontEncoding encoding)
2317 {
2318     CFStringEncoding enc = kCFStringEncodingInvalidId ;
2319
2320     switch (encoding)
2321     {
2322         case wxFONTENCODING_DEFAULT :
2323             enc = CFStringGetSystemEncoding();
2324             break ;
2325
2326         case wxFONTENCODING_ISO8859_1 :
2327             enc = kCFStringEncodingISOLatin1 ;
2328             break ;
2329         case wxFONTENCODING_ISO8859_2 :
2330             enc = kCFStringEncodingISOLatin2;
2331             break ;
2332         case wxFONTENCODING_ISO8859_3 :
2333             enc = kCFStringEncodingISOLatin3 ;
2334             break ;
2335         case wxFONTENCODING_ISO8859_4 :
2336             enc = kCFStringEncodingISOLatin4;
2337             break ;
2338         case wxFONTENCODING_ISO8859_5 :
2339             enc = kCFStringEncodingISOLatinCyrillic;
2340             break ;
2341         case wxFONTENCODING_ISO8859_6 :
2342             enc = kCFStringEncodingISOLatinArabic;
2343             break ;
2344         case wxFONTENCODING_ISO8859_7 :
2345             enc = kCFStringEncodingISOLatinGreek;
2346             break ;
2347         case wxFONTENCODING_ISO8859_8 :
2348             enc = kCFStringEncodingISOLatinHebrew;
2349             break ;
2350         case wxFONTENCODING_ISO8859_9 :
2351             enc = kCFStringEncodingISOLatin5;
2352             break ;
2353         case wxFONTENCODING_ISO8859_10 :
2354             enc = kCFStringEncodingISOLatin6;
2355             break ;
2356         case wxFONTENCODING_ISO8859_11 :
2357             enc = kCFStringEncodingISOLatinThai;
2358             break ;
2359         case wxFONTENCODING_ISO8859_13 :
2360             enc = kCFStringEncodingISOLatin7;
2361             break ;
2362         case wxFONTENCODING_ISO8859_14 :
2363             enc = kCFStringEncodingISOLatin8;
2364             break ;
2365         case wxFONTENCODING_ISO8859_15 :
2366             enc = kCFStringEncodingISOLatin9;
2367             break ;
2368
2369         case wxFONTENCODING_KOI8 :
2370             enc = kCFStringEncodingKOI8_R;
2371             break ;
2372         case wxFONTENCODING_ALTERNATIVE : // MS-DOS CP866
2373             enc = kCFStringEncodingDOSRussian;
2374             break ;
2375
2376 //      case wxFONTENCODING_BULGARIAN :
2377 //          enc = ;
2378 //          break ;
2379
2380         case wxFONTENCODING_CP437 :
2381             enc = kCFStringEncodingDOSLatinUS ;
2382             break ;
2383         case wxFONTENCODING_CP850 :
2384             enc = kCFStringEncodingDOSLatin1;
2385             break ;
2386         case wxFONTENCODING_CP852 :
2387             enc = kCFStringEncodingDOSLatin2;
2388             break ;
2389         case wxFONTENCODING_CP855 :
2390             enc = kCFStringEncodingDOSCyrillic;
2391             break ;
2392         case wxFONTENCODING_CP866 :
2393             enc = kCFStringEncodingDOSRussian ;
2394             break ;
2395         case wxFONTENCODING_CP874 :
2396             enc = kCFStringEncodingDOSThai;
2397             break ;
2398         case wxFONTENCODING_CP932 :
2399             enc = kCFStringEncodingDOSJapanese;
2400             break ;
2401         case wxFONTENCODING_CP936 :
2402             enc = kCFStringEncodingDOSChineseSimplif ;
2403             break ;
2404         case wxFONTENCODING_CP949 :
2405             enc = kCFStringEncodingDOSKorean;
2406             break ;
2407         case wxFONTENCODING_CP950 :
2408             enc = kCFStringEncodingDOSChineseTrad;
2409             break ;
2410         case wxFONTENCODING_CP1250 :
2411             enc = kCFStringEncodingWindowsLatin2;
2412             break ;
2413         case wxFONTENCODING_CP1251 :
2414             enc = kCFStringEncodingWindowsCyrillic ;
2415             break ;
2416         case wxFONTENCODING_CP1252 :
2417             enc = kCFStringEncodingWindowsLatin1 ;
2418             break ;
2419         case wxFONTENCODING_CP1253 :
2420             enc = kCFStringEncodingWindowsGreek;
2421             break ;
2422         case wxFONTENCODING_CP1254 :
2423             enc = kCFStringEncodingWindowsLatin5;
2424             break ;
2425         case wxFONTENCODING_CP1255 :
2426             enc = kCFStringEncodingWindowsHebrew ;
2427             break ;
2428         case wxFONTENCODING_CP1256 :
2429             enc = kCFStringEncodingWindowsArabic ;
2430             break ;
2431         case wxFONTENCODING_CP1257 :
2432             enc = kCFStringEncodingWindowsBalticRim;
2433             break ;
2434 //   This only really encodes to UTF7 (if that) evidently
2435 //        case wxFONTENCODING_UTF7 :
2436 //            enc = kCFStringEncodingNonLossyASCII ;
2437 //            break ;
2438         case wxFONTENCODING_UTF8 :
2439             enc = kCFStringEncodingUTF8 ;
2440             break ;
2441         case wxFONTENCODING_EUC_JP :
2442             enc = kCFStringEncodingEUC_JP;
2443             break ;
2444         case wxFONTENCODING_UTF16 :
2445             enc = kCFStringEncodingUnicode ;
2446             break ;
2447         case wxFONTENCODING_MACROMAN :
2448             enc = kCFStringEncodingMacRoman ;
2449             break ;
2450         case wxFONTENCODING_MACJAPANESE :
2451             enc = kCFStringEncodingMacJapanese ;
2452             break ;
2453         case wxFONTENCODING_MACCHINESETRAD :
2454             enc = kCFStringEncodingMacChineseTrad ;
2455             break ;
2456         case wxFONTENCODING_MACKOREAN :
2457             enc = kCFStringEncodingMacKorean ;
2458             break ;
2459         case wxFONTENCODING_MACARABIC :
2460             enc = kCFStringEncodingMacArabic ;
2461             break ;
2462         case wxFONTENCODING_MACHEBREW :
2463             enc = kCFStringEncodingMacHebrew ;
2464             break ;
2465         case wxFONTENCODING_MACGREEK :
2466             enc = kCFStringEncodingMacGreek ;
2467             break ;
2468         case wxFONTENCODING_MACCYRILLIC :
2469             enc = kCFStringEncodingMacCyrillic ;
2470             break ;
2471         case wxFONTENCODING_MACDEVANAGARI :
2472             enc = kCFStringEncodingMacDevanagari ;
2473             break ;
2474         case wxFONTENCODING_MACGURMUKHI :
2475             enc = kCFStringEncodingMacGurmukhi ;
2476             break ;
2477         case wxFONTENCODING_MACGUJARATI :
2478             enc = kCFStringEncodingMacGujarati ;
2479             break ;
2480         case wxFONTENCODING_MACORIYA :
2481             enc = kCFStringEncodingMacOriya ;
2482             break ;
2483         case wxFONTENCODING_MACBENGALI :
2484             enc = kCFStringEncodingMacBengali ;
2485             break ;
2486         case wxFONTENCODING_MACTAMIL :
2487             enc = kCFStringEncodingMacTamil ;
2488             break ;
2489         case wxFONTENCODING_MACTELUGU :
2490             enc = kCFStringEncodingMacTelugu ;
2491             break ;
2492         case wxFONTENCODING_MACKANNADA :
2493             enc = kCFStringEncodingMacKannada ;
2494             break ;
2495         case wxFONTENCODING_MACMALAJALAM :
2496             enc = kCFStringEncodingMacMalayalam ;
2497             break ;
2498         case wxFONTENCODING_MACSINHALESE :
2499             enc = kCFStringEncodingMacSinhalese ;
2500             break ;
2501         case wxFONTENCODING_MACBURMESE :
2502             enc = kCFStringEncodingMacBurmese ;
2503             break ;
2504         case wxFONTENCODING_MACKHMER :
2505             enc = kCFStringEncodingMacKhmer ;
2506             break ;
2507         case wxFONTENCODING_MACTHAI :
2508             enc = kCFStringEncodingMacThai ;
2509             break ;
2510         case wxFONTENCODING_MACLAOTIAN :
2511             enc = kCFStringEncodingMacLaotian ;
2512             break ;
2513         case wxFONTENCODING_MACGEORGIAN :
2514             enc = kCFStringEncodingMacGeorgian ;
2515             break ;
2516         case wxFONTENCODING_MACARMENIAN :
2517             enc = kCFStringEncodingMacArmenian ;
2518             break ;
2519         case wxFONTENCODING_MACCHINESESIMP :
2520             enc = kCFStringEncodingMacChineseSimp ;
2521             break ;
2522         case wxFONTENCODING_MACTIBETAN :
2523             enc = kCFStringEncodingMacTibetan ;
2524             break ;
2525         case wxFONTENCODING_MACMONGOLIAN :
2526             enc = kCFStringEncodingMacMongolian ;
2527             break ;
2528         case wxFONTENCODING_MACETHIOPIC :
2529             enc = kCFStringEncodingMacEthiopic ;
2530             break ;
2531         case wxFONTENCODING_MACCENTRALEUR :
2532             enc = kCFStringEncodingMacCentralEurRoman ;
2533             break ;
2534         case wxFONTENCODING_MACVIATNAMESE :
2535             enc = kCFStringEncodingMacVietnamese ;
2536             break ;
2537         case wxFONTENCODING_MACARABICEXT :
2538             enc = kCFStringEncodingMacExtArabic ;
2539             break ;
2540         case wxFONTENCODING_MACSYMBOL :
2541             enc = kCFStringEncodingMacSymbol ;
2542             break ;
2543         case wxFONTENCODING_MACDINGBATS :
2544             enc = kCFStringEncodingMacDingbats ;
2545             break ;
2546         case wxFONTENCODING_MACTURKISH :
2547             enc = kCFStringEncodingMacTurkish ;
2548             break ;
2549         case wxFONTENCODING_MACCROATIAN :
2550             enc = kCFStringEncodingMacCroatian ;
2551             break ;
2552         case wxFONTENCODING_MACICELANDIC :
2553             enc = kCFStringEncodingMacIcelandic ;
2554             break ;
2555         case wxFONTENCODING_MACROMANIAN :
2556             enc = kCFStringEncodingMacRomanian ;
2557             break ;
2558         case wxFONTENCODING_MACCELTIC :
2559             enc = kCFStringEncodingMacCeltic ;
2560             break ;
2561         case wxFONTENCODING_MACGAELIC :
2562             enc = kCFStringEncodingMacGaelic ;
2563             break ;
2564 //      case wxFONTENCODING_MACKEYBOARD :
2565 //          enc = kCFStringEncodingMacKeyboardGlyphs ;
2566 //          break ;
2567
2568         default :
2569             // because gcc is picky
2570             break ;
2571     }
2572
2573     return enc ;
2574 }
2575
2576 class wxMBConv_cocoa : public wxMBConv
2577 {
2578 public:
2579     wxMBConv_cocoa()
2580     {
2581         Init(CFStringGetSystemEncoding()) ;
2582     }
2583
2584     wxMBConv_cocoa(const wxMBConv_cocoa& conv)
2585     {
2586         m_encoding = conv.m_encoding;
2587     }
2588
2589 #if wxUSE_FONTMAP
2590     wxMBConv_cocoa(const wxChar* name)
2591     {
2592         Init( wxCFStringEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) ) ;
2593     }
2594 #endif
2595
2596     wxMBConv_cocoa(wxFontEncoding encoding)
2597     {
2598         Init( wxCFStringEncFromFontEnc(encoding) );
2599     }
2600
2601     virtual ~wxMBConv_cocoa()
2602     {
2603     }
2604
2605     void Init( CFStringEncoding encoding)
2606     {
2607         m_encoding = encoding ;
2608     }
2609
2610     size_t MB2WC(wchar_t * szOut, const char * szUnConv, size_t nOutSize) const
2611     {
2612         wxASSERT(szUnConv);
2613
2614         CFStringRef theString = CFStringCreateWithBytes (
2615                                                 NULL, //the allocator
2616                                                 (const UInt8*)szUnConv,
2617                                                 strlen(szUnConv),
2618                                                 m_encoding,
2619                                                 false //no BOM/external representation
2620                                                 );
2621
2622         wxASSERT(theString);
2623
2624         size_t nOutLength = CFStringGetLength(theString);
2625
2626         if (szOut == NULL)
2627         {
2628             CFRelease(theString);
2629             return nOutLength;
2630         }
2631
2632         CFRange theRange = { 0, nOutSize };
2633
2634 #if SIZEOF_WCHAR_T == 4
2635         UniChar* szUniCharBuffer = new UniChar[nOutSize];
2636 #endif
2637
2638         CFStringGetCharacters(theString, theRange, szUniCharBuffer);
2639
2640         CFRelease(theString);
2641
2642         szUniCharBuffer[nOutLength] = '\0';
2643
2644 #if SIZEOF_WCHAR_T == 4
2645         wxMBConvUTF16 converter;
2646         converter.MB2WC( szOut, (const char*)szUniCharBuffer, nOutSize );
2647         delete [] szUniCharBuffer;
2648 #endif
2649
2650         return nOutLength;
2651     }
2652
2653     size_t WC2MB(char *szOut, const wchar_t *szUnConv, size_t nOutSize) const
2654     {
2655         wxASSERT(szUnConv);
2656
2657         size_t nRealOutSize;
2658         size_t nBufSize = wxWcslen(szUnConv);
2659         UniChar* szUniBuffer = (UniChar*) szUnConv;
2660
2661 #if SIZEOF_WCHAR_T == 4
2662         wxMBConvUTF16 converter ;
2663         nBufSize = converter.WC2MB( NULL, szUnConv, 0 );
2664         szUniBuffer = new UniChar[ (nBufSize / sizeof(UniChar)) + 1];
2665         converter.WC2MB( (char*) szUniBuffer, szUnConv, nBufSize + sizeof(UniChar));
2666         nBufSize /= sizeof(UniChar);
2667 #endif
2668
2669         CFStringRef theString = CFStringCreateWithCharactersNoCopy(
2670                                 NULL, //allocator
2671                                 szUniBuffer,
2672                                 nBufSize,
2673                                 kCFAllocatorNull //deallocator - we want to deallocate it ourselves
2674                             );
2675
2676         wxASSERT(theString);
2677
2678         //Note that CER puts a BOM when converting to unicode
2679         //so we  check and use getchars instead in that case
2680         if (m_encoding == kCFStringEncodingUnicode)
2681         {
2682             if (szOut != NULL)
2683                 CFStringGetCharacters(theString, CFRangeMake(0, nOutSize - 1), (UniChar*) szOut);
2684
2685             nRealOutSize = CFStringGetLength(theString) + 1;
2686         }
2687         else
2688         {
2689             CFStringGetBytes(
2690                 theString,
2691                 CFRangeMake(0, CFStringGetLength(theString)),
2692                 m_encoding,
2693                 0, //what to put in characters that can't be converted -
2694                     //0 tells CFString to return NULL if it meets such a character
2695                 false, //not an external representation
2696                 (UInt8*) szOut,
2697                 nOutSize,
2698                 (CFIndex*) &nRealOutSize
2699                         );
2700         }
2701
2702         CFRelease(theString);
2703
2704 #if SIZEOF_WCHAR_T == 4
2705         delete[] szUniBuffer;
2706 #endif
2707
2708         return  nRealOutSize - 1;
2709     }
2710
2711     virtual wxMBConv *Clone() const { return new wxMBConv_cocoa(*this); }
2712
2713     bool IsOk() const
2714     {
2715         return m_encoding != kCFStringEncodingInvalidId &&
2716               CFStringIsEncodingAvailable(m_encoding);
2717     }
2718
2719 private:
2720     CFStringEncoding m_encoding ;
2721 };
2722
2723 #endif // defined(__WXCOCOA__)
2724
2725 // ============================================================================
2726 // Mac conversion classes
2727 // ============================================================================
2728
2729 #if defined(__WXMAC__) && defined(TARGET_CARBON)
2730
2731 class wxMBConv_mac : public wxMBConv
2732 {
2733 public:
2734     wxMBConv_mac()
2735     {
2736         Init(CFStringGetSystemEncoding()) ;
2737     }
2738
2739     wxMBConv_mac(const wxMBConv_mac& conv)
2740     {
2741         Init(conv.m_char_encoding);
2742     }
2743
2744 #if wxUSE_FONTMAP
2745     wxMBConv_mac(const wxChar* name)
2746     {
2747         Init( wxMacGetSystemEncFromFontEnc( wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) );
2748     }
2749 #endif
2750
2751     wxMBConv_mac(wxFontEncoding encoding)
2752     {
2753         Init( wxMacGetSystemEncFromFontEnc(encoding) );
2754     }
2755
2756     virtual ~wxMBConv_mac()
2757     {
2758         OSStatus status = noErr ;
2759         if (m_MB2WC_converter)
2760             status = TECDisposeConverter(m_MB2WC_converter);
2761         if (m_WC2MB_converter)
2762             status = TECDisposeConverter(m_WC2MB_converter);
2763     }
2764
2765     void Init( TextEncodingBase encoding,TextEncodingVariant encodingVariant = kTextEncodingDefaultVariant ,
2766             TextEncodingFormat encodingFormat = kTextEncodingDefaultFormat)
2767     {
2768         m_MB2WC_converter = NULL ;
2769         m_WC2MB_converter = NULL ;
2770         m_char_encoding = CreateTextEncoding(encoding, encodingVariant, encodingFormat) ;
2771         m_unicode_encoding = CreateTextEncoding(kTextEncodingUnicodeDefault, 0, kUnicode16BitFormat) ;
2772     }
2773
2774     virtual void CreateIfNeeded() const
2775     {
2776         if ( m_MB2WC_converter == NULL && m_WC2MB_converter == NULL )
2777         {
2778             OSStatus status = noErr ;
2779             status = TECCreateConverter(&m_MB2WC_converter,
2780                                     m_char_encoding,
2781                                     m_unicode_encoding);
2782             wxASSERT_MSG( status == noErr , _("Unable to create TextEncodingConverter")) ;
2783             status = TECCreateConverter(&m_WC2MB_converter,
2784                                     m_unicode_encoding,
2785                                     m_char_encoding);
2786             wxASSERT_MSG( status == noErr , _("Unable to create TextEncodingConverter")) ;
2787         }
2788     }
2789
2790     size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2791     {
2792         CreateIfNeeded() ;
2793         OSStatus status = noErr ;
2794         ByteCount byteOutLen ;
2795         ByteCount byteInLen = strlen(psz) + 1;
2796         wchar_t *tbuf = NULL ;
2797         UniChar* ubuf = NULL ;
2798         size_t res = 0 ;
2799
2800         if (buf == NULL)
2801         {
2802             // Apple specs say at least 32
2803             n = wxMax( 32, byteInLen ) ;
2804             tbuf = (wchar_t*) malloc( n * SIZEOF_WCHAR_T ) ;
2805         }
2806
2807         ByteCount byteBufferLen = n * sizeof( UniChar ) ;
2808
2809 #if SIZEOF_WCHAR_T == 4
2810         ubuf = (UniChar*) malloc( byteBufferLen + 2 ) ;
2811 #else
2812         ubuf = (UniChar*) (buf ? buf : tbuf) ;
2813 #endif
2814
2815         status = TECConvertText(
2816             m_MB2WC_converter, (ConstTextPtr) psz, byteInLen, &byteInLen,
2817             (TextPtr) ubuf, byteBufferLen, &byteOutLen);
2818
2819 #if SIZEOF_WCHAR_T == 4
2820         // we have to terminate here, because n might be larger for the trailing zero, and if UniChar
2821         // is not properly terminated we get random characters at the end
2822         ubuf[byteOutLen / sizeof( UniChar ) ] = 0 ;
2823         wxMBConvUTF16 converter ;
2824         res = converter.MB2WC( (buf ? buf : tbuf), (const char*)ubuf, n ) ;
2825         free( ubuf ) ;
2826 #else
2827         res = byteOutLen / sizeof( UniChar ) ;
2828 #endif
2829
2830         if ( buf == NULL )
2831              free(tbuf) ;
2832
2833         if ( buf  && res < n)
2834             buf[res] = 0;
2835
2836         return res ;
2837     }
2838
2839     size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const
2840     {
2841         CreateIfNeeded() ;
2842         OSStatus status = noErr ;
2843         ByteCount byteOutLen ;
2844         ByteCount byteInLen = wxWcslen(psz) * SIZEOF_WCHAR_T ;
2845
2846         char *tbuf = NULL ;
2847
2848         if (buf == NULL)
2849         {
2850             // Apple specs say at least 32
2851             n = wxMax( 32, ((byteInLen / SIZEOF_WCHAR_T) * 8) + SIZEOF_WCHAR_T );
2852             tbuf = (char*) malloc( n ) ;
2853         }
2854
2855         ByteCount byteBufferLen = n ;
2856         UniChar* ubuf = NULL ;
2857
2858 #if SIZEOF_WCHAR_T == 4
2859         wxMBConvUTF16 converter ;
2860         size_t unicharlen = converter.WC2MB( NULL, psz, 0 ) ;
2861         byteInLen = unicharlen ;
2862         ubuf = (UniChar*) malloc( byteInLen + 2 ) ;
2863         converter.WC2MB( (char*) ubuf, psz, unicharlen + 2 ) ;
2864 #else
2865         ubuf = (UniChar*) psz ;
2866 #endif
2867
2868         status = TECConvertText(
2869             m_WC2MB_converter, (ConstTextPtr) ubuf, byteInLen, &byteInLen,
2870             (TextPtr) (buf ? buf : tbuf), byteBufferLen, &byteOutLen);
2871
2872 #if SIZEOF_WCHAR_T == 4
2873         free( ubuf ) ;
2874 #endif
2875
2876         if ( buf == NULL )
2877             free(tbuf) ;
2878
2879         size_t res = byteOutLen ;
2880         if ( buf  && res < n)
2881         {
2882             buf[res] = 0;
2883
2884             //we need to double-trip to verify it didn't insert any ? in place
2885             //of bogus characters
2886             wxWCharBuffer wcBuf(n);
2887             size_t pszlen = wxWcslen(psz);
2888             if ( MB2WC(wcBuf.data(), buf, n) == wxCONV_FAILED ||
2889                         wxWcslen(wcBuf) != pszlen ||
2890                         memcmp(wcBuf, psz, pszlen * sizeof(wchar_t)) != 0 )
2891             {
2892                 // we didn't obtain the same thing we started from, hence
2893                 // the conversion was lossy and we consider that it failed
2894                 return wxCONV_FAILED;
2895             }
2896         }
2897
2898         return res ;
2899     }
2900
2901     virtual wxMBConv *Clone() const { return new wxMBConv_mac(*this); }
2902
2903     bool IsOk() const
2904     {
2905         CreateIfNeeded() ;
2906         return m_MB2WC_converter != NULL && m_WC2MB_converter != NULL;
2907     }
2908
protected :
    // the converter objects for the two directions: they are NULL after
    // Init() and are created by CreateIfNeeded(), which is const, hence the
    // "mutable"
    mutable TECObjectRef m_MB2WC_converter;
    mutable TECObjectRef m_WC2MB_converter;

    // the multibyte and Unicode text encodings as computed by
    // CreateTextEncoding() in Init()
    TextEncodingBase m_char_encoding;
    TextEncodingBase m_unicode_encoding;
2915 };
2916
2917 // MB is decomposed (D) normalized UTF8
2918
2919 class wxMBConv_macUTF8D : public wxMBConv_mac
2920 {
2921 public :
2922     wxMBConv_macUTF8D()
2923     {
2924         Init( kTextEncodingUnicodeDefault , kUnicodeNoSubset , kUnicodeUTF8Format ) ;
2925         m_uni = NULL;
2926         m_uniBack = NULL ;
2927     }
2928
2929     virtual ~wxMBConv_macUTF8D()
2930     {
2931         if (m_uni!=NULL)
2932             DisposeUnicodeToTextInfo(&m_uni);
2933         if (m_uniBack!=NULL)
2934             DisposeUnicodeToTextInfo(&m_uniBack);
2935     }
2936
2937     size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const
2938     {
2939         CreateIfNeeded() ;
2940         OSStatus status = noErr ;
2941         ByteCount byteOutLen ;
2942         ByteCount byteInLen = wxWcslen(psz) * SIZEOF_WCHAR_T ;
2943
2944         char *tbuf = NULL ;
2945
2946         if (buf == NULL)
2947         {
2948             // Apple specs say at least 32
2949             n = wxMax( 32, ((byteInLen / SIZEOF_WCHAR_T) * 8) + SIZEOF_WCHAR_T );
2950             tbuf = (char*) malloc( n ) ;
2951         }
2952
2953         ByteCount byteBufferLen = n ;
2954         UniChar* ubuf = NULL ;
2955
2956 #if SIZEOF_WCHAR_T == 4
2957         wxMBConvUTF16 converter ;
2958         size_t unicharlen = converter.WC2MB( NULL, psz, 0 ) ;
2959         byteInLen = unicharlen ;
2960         ubuf = (UniChar*) malloc( byteInLen + 2 ) ;
2961         converter.WC2MB( (char*) ubuf, psz, unicharlen + 2 ) ;
2962 #else
2963         ubuf = (UniChar*) psz ;
2964 #endif
2965
2966         // ubuf is a non-decomposed UniChar buffer
2967
2968         ByteCount dcubuflen = byteInLen * 2 + 2 ;
2969         ByteCount dcubufread , dcubufwritten ;
2970         UniChar *dcubuf = (UniChar*) malloc( dcubuflen ) ;
2971
2972         ConvertFromUnicodeToText( m_uni , byteInLen , ubuf ,
2973             kUnicodeDefaultDirectionMask, 0, NULL, NULL, NULL, dcubuflen  , &dcubufread , &dcubufwritten , dcubuf ) ;
2974
2975         // we now convert that decomposed buffer into UTF8
2976
2977         status = TECConvertText(
2978             m_WC2MB_converter, (ConstTextPtr) dcubuf, dcubufwritten, &dcubufread,
2979             (TextPtr) (buf ? buf : tbuf), byteBufferLen, &byteOutLen);
2980
2981         free( dcubuf );
2982
2983 #if SIZEOF_WCHAR_T == 4
2984         free( ubuf ) ;
2985 #endif
2986
2987         if ( buf == NULL )
2988             free(tbuf) ;
2989
2990         size_t res = byteOutLen ;
2991         if ( buf  && res < n)
2992         {
2993             buf[res] = 0;
2994             // don't test for round-trip fidelity yet, we cannot guarantee it yet
2995         }
2996
2997         return res ;
2998     }
2999
3000     size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
3001     {
3002         CreateIfNeeded() ;
3003         OSStatus status = noErr ;
3004         ByteCount byteOutLen ;
3005         ByteCount byteInLen = strlen(psz) + 1;
3006         wchar_t *tbuf = NULL ;
3007         UniChar* ubuf = NULL ;
3008         size_t res = 0 ;
3009
3010         if (buf == NULL)
3011         {
3012             // Apple specs say at least 32
3013             n = wxMax( 32, byteInLen ) ;
3014             tbuf = (wchar_t*) malloc( n * SIZEOF_WCHAR_T ) ;
3015         }
3016
3017         ByteCount byteBufferLen = n * sizeof( UniChar ) ;
3018
3019 #if SIZEOF_WCHAR_T == 4
3020         ubuf = (UniChar*) malloc( byteBufferLen + 2 ) ;
3021 #else
3022         ubuf = (UniChar*) (buf ? buf : tbuf) ;
3023 #endif
3024
3025         ByteCount dcubuflen = byteBufferLen * 2 + 2 ;
3026         ByteCount dcubufread , dcubufwritten ;
3027         UniChar *dcubuf = (UniChar*) malloc( dcubuflen ) ;
3028
3029         status = TECConvertText(
3030                                 m_MB2WC_converter, (ConstTextPtr) psz, byteInLen, &byteInLen,
3031                                 (TextPtr) dcubuf, dcubuflen, &byteOutLen);
3032         // we have to terminate here, because n might be larger for the trailing zero, and if UniChar
3033         // is not properly terminated we get random characters at the end
3034         dcubuf[byteOutLen / sizeof( UniChar ) ] = 0 ;
3035
3036         // now from the decomposed UniChar to properly composed uniChar
3037         ConvertFromUnicodeToText( m_uniBack , byteOutLen , dcubuf ,
3038                                   kUnicodeDefaultDirectionMask, 0, NULL, NULL, NULL, dcubuflen  , &dcubufread , &dcubufwritten , ubuf ) ;
3039
3040         free( dcubuf );
3041         byteOutLen = dcubufwritten ;
3042         ubuf[byteOutLen / sizeof( UniChar ) ] = 0 ;
3043
3044
3045 #if SIZEOF_WCHAR_T == 4
3046         wxMBConvUTF16 converter ;
3047         res = converter.MB2WC( (buf ? buf : tbuf), (const char*)ubuf, n ) ;
3048         free( ubuf ) ;
3049 #else
3050         res = byteOutLen / sizeof( UniChar ) ;
3051 #endif
3052
3053         if ( buf == NULL )
3054             free(tbuf) ;
3055
3056         if ( buf  && res < n)
3057             buf[res] = 0;
3058
3059         return res ;
3060     }
3061
3062     virtual void CreateIfNeeded() const
3063     {
3064         wxMBConv_mac::CreateIfNeeded() ;
3065         if ( m_uni == NULL )
3066         {
3067             m_map.unicodeEncoding = CreateTextEncoding(kTextEncodingUnicodeDefault,
3068                 kUnicodeNoSubset, kTextEncodingDefaultFormat);
3069             m_map.otherEncoding = CreateTextEncoding(kTextEncodingUnicodeDefault,
3070                 kUnicodeCanonicalDecompVariant, kTextEncodingDefaultFormat);
3071             m_map.mappingVersion = kUnicodeUseLatestMapping;
3072
3073             OSStatus err = CreateUnicodeToTextInfo(&m_map, &m_uni);
3074             wxASSERT_MSG( err == noErr , _(" Couldn't create the UnicodeConverter")) ;
3075
3076             m_map.unicodeEncoding = CreateTextEncoding(kTextEncodingUnicodeDefault,
3077                                                        kUnicodeNoSubset, kTextEncodingDefaultFormat);
3078             m_map.otherEncoding = CreateTextEncoding(kTextEncodingUnicodeDefault,
3079                                                      kUnicodeCanonicalCompVariant, kTextEncodingDefaultFormat);
3080             m_map.mappingVersion = kUnicodeUseLatestMapping;
3081             err = CreateUnicodeToTextInfo(&m_map, &m_uniBack);
3082             wxASSERT_MSG( err == noErr , _(" Couldn't create the UnicodeConverter")) ;
3083         }
3084     }
3085 protected :
3086     mutable UnicodeToTextInfo   m_uni;
3087     mutable UnicodeToTextInfo   m_uniBack;
3088     mutable UnicodeMapping      m_map;
3089 };
3090 #endif // defined(__WXMAC__) && defined(TARGET_CARBON)
3091
3092 // ============================================================================
3093 // wxEncodingConverter based conversion classes
3094 // ============================================================================
3095
3096 #if wxUSE_FONTMAP
3097
3098 class wxMBConv_wxwin : public wxMBConv
3099 {
3100 private:
3101     void Init()
3102     {
3103         m_ok = m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
3104                w2m.Init(wxFONTENCODING_UNICODE, m_enc);
3105     }
3106
3107 public:
3108     // temporarily just use wxEncodingConverter stuff,
3109     // so that it works while a better implementation is built
3110     wxMBConv_wxwin(const wxChar* name)
3111     {
3112         if (name)
3113             m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
3114         else
3115             m_enc = wxFONTENCODING_SYSTEM;
3116
3117         Init();
3118     }
3119
3120     wxMBConv_wxwin(wxFontEncoding enc)
3121     {
3122         m_enc = enc;
3123
3124         Init();
3125     }
3126
3127     size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
3128     {
3129         size_t inbuf = strlen(psz);
3130         if (buf)
3131         {
3132             if (!m2w.Convert(psz, buf))
3133                 return wxCONV_FAILED;
3134         }
3135         return inbuf;
3136     }
3137
3138     size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
3139     {
3140         const size_t inbuf = wxWcslen(psz);
3141         if (buf)
3142         {
3143             if (!w2m.Convert(psz, buf))
3144                 return wxCONV_FAILED;
3145         }
3146
3147         return inbuf;
3148     }
3149
3150     virtual size_t GetMBNulLen() const
3151     {
3152         switch ( m_enc )
3153         {
3154             case wxFONTENCODING_UTF16BE:
3155             case wxFONTENCODING_UTF16LE:
3156                 return 2;
3157
3158             case wxFONTENCODING_UTF32BE:
3159             case wxFONTENCODING_UTF32LE:
3160                 return 4;
3161
3162             default:
3163                 return 1;
3164         }
3165     }
3166
3167     virtual wxMBConv *Clone() const { return new wxMBConv_wxwin(m_enc); }
3168
3169     bool IsOk() const { return m_ok; }
3170
3171 public:
3172     wxFontEncoding m_enc;
3173     wxEncodingConverter m2w, w2m;
3174
3175 private:
3176     // were we initialized successfully?
3177     bool m_ok;
3178
3179     DECLARE_NO_COPY_CLASS(wxMBConv_wxwin)
3180 };
3181
3182 // make the constructors available for unit testing
3183 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_wxwin( const wxChar* name )
3184 {
3185     wxMBConv_wxwin* result = new wxMBConv_wxwin( name );
3186     if ( !result->IsOk() )
3187     {
3188         delete result;
3189         return 0;
3190     }
3191
3192     return result;
3193 }
3194
3195 #endif // wxUSE_FONTMAP
3196
3197 // ============================================================================
3198 // wxCSConv implementation
3199 // ============================================================================
3200
3201 void wxCSConv::Init()
3202 {
3203     m_name = NULL;
3204     m_convReal =  NULL;
3205     m_deferred = true;
3206 }
3207
3208 wxCSConv::wxCSConv(const wxChar *charset)
3209 {
3210     Init();
3211
3212     if ( charset )
3213     {
3214         SetName(charset);
3215     }
3216
3217 #if wxUSE_FONTMAP
3218     m_encoding = wxFontMapperBase::GetEncodingFromName(charset);
3219 #else
3220     m_encoding = wxFONTENCODING_SYSTEM;
3221 #endif
3222 }
3223
3224 wxCSConv::wxCSConv(wxFontEncoding encoding)
3225 {
3226     if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
3227     {
3228         wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
3229
3230         encoding = wxFONTENCODING_SYSTEM;
3231     }
3232
3233     Init();
3234
3235     m_encoding = encoding;
3236 }
3237
3238 wxCSConv::~wxCSConv()
3239 {
3240     Clear();
3241 }
3242
3243 wxCSConv::wxCSConv(const wxCSConv& conv)
3244         : wxMBConv()
3245 {
3246     Init();
3247
3248     SetName(conv.m_name);
3249     m_encoding = conv.m_encoding;
3250 }
3251
3252 wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
3253 {
3254     Clear();
3255
3256     SetName(conv.m_name);
3257     m_encoding = conv.m_encoding;
3258
3259     return *this;
3260 }
3261
3262 void wxCSConv::Clear()
3263 {
3264     free(m_name);
3265     delete m_convReal;
3266
3267     m_name = NULL;
3268     m_convReal = NULL;
3269 }
3270
3271 void wxCSConv::SetName(const wxChar *charset)
3272 {
3273     if (charset)
3274     {
3275         m_name = wxStrdup(charset);
3276         m_deferred = true;
3277     }
3278 }
3279
3280 #if wxUSE_FONTMAP
3281
// Cache used by wxCSConv::DoCreate(): for each encoding it stores the name
// with which an iconv-based converter was successfully created, or an empty
// string if none of the names of this encoding worked.
WX_DECLARE_HASH_MAP( wxFontEncoding, wxString, wxIntegerHash, wxIntegerEqual,
                     wxEncodingNameCache );

static wxEncodingNameCache gs_nameCache;
3286 #endif
3287
// Creates the converter object which really does the conversion for the name
// and/or encoding of this object.
//
// Returns a new converter or NULL. NULL is returned when no converter is
// needed (wxFONTENCODING_ISO8859_1 and wxFONTENCODING_DEFAULT), when a
// previous iconv failure for this encoding is found in the cache and, after
// logging an error, when none of the methods tried below worked.
wxMBConv *wxCSConv::DoCreate() const
{
#if wxUSE_FONTMAP
    wxLogTrace(TRACE_STRCONV,
               wxT("creating conversion for %s"),
               (m_name ? m_name
                       : (const wxChar*)wxFontMapperBase::GetEncodingName(m_encoding).c_str()));
#endif // wxUSE_FONTMAP

    // check for the special case of ASCII or ISO8859-1 charset: as we have
    // special knowledge of it anyhow, we don't need to create a special
    // conversion object
    if ( m_encoding == wxFONTENCODING_ISO8859_1 ||
            m_encoding == wxFONTENCODING_DEFAULT )
    {
        // don't convert at all
        return NULL;
    }

    // we trust OS to do conversion better than we can so try external
    // conversion methods first
    //
    // the full order is:
    //      1. OS conversion (iconv() under Unix or Win32 API)
    //      2. hard coded conversions for UTF
    //      3. wxEncodingConverter as fall back

    // step (1)
#ifdef HAVE_ICONV
#if !wxUSE_FONTMAP
    if ( m_name )
#endif // !wxUSE_FONTMAP
    {
        wxString name(m_name);
#if wxUSE_FONTMAP
        wxFontEncoding encoding(m_encoding);
#endif

        // first try the name we were given as is
        if ( !name.empty() )
        {
            wxMBConv_iconv *conv = new wxMBConv_iconv(name);
            if ( conv->IsOk() )
                return conv;

            delete conv;

#if wxUSE_FONTMAP
            encoding =
                wxFontMapperBase::Get()->CharsetToEncoding(name, false);
#endif // wxUSE_FONTMAP
        }
#if wxUSE_FONTMAP
        {
            // then check if we already know which name works for this
            // encoding, or that none of them does
            const wxEncodingNameCache::iterator it = gs_nameCache.find(encoding);
            if ( it != gs_nameCache.end() )
            {
                // an empty name is a cached failure
                if ( it->second.empty() )
                    return NULL;

                wxMBConv_iconv *conv = new wxMBConv_iconv(it->second);
                if ( conv->IsOk() )
                    return conv;

                delete conv;
            }

            // finally try all the names of this encoding in turn
            const wxChar** names = wxFontMapperBase::GetAllEncodingNames(encoding);
            // CS : in case this does not return valid names (eg for MacRoman) encoding
            // got a 'failure' entry in the cache all the same, although it just has to
            // be created using a different method, so only store failed iconv creation
            // attempts (or perhaps we shouldn't do this at all ?)
            if ( names[0] != NULL )
            {
                for ( ; *names; ++names )
                {
                    wxMBConv_iconv *conv = new wxMBConv_iconv(*names);
                    if ( conv->IsOk() )
                    {
                        gs_nameCache[encoding] = *names;
                        return conv;
                    }

                    delete conv;
                }

                gs_nameCache[encoding] = _T(""); // cache the failure
            }
        }
#endif // wxUSE_FONTMAP
    }
#endif // HAVE_ICONV

#ifdef wxHAVE_WIN32_MB2WC
    {
#if wxUSE_FONTMAP
        wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
                                      : new wxMBConv_win32(m_encoding);
        if ( conv->IsOk() )
            return conv;

        delete conv;
#else
        return NULL;
#endif
    }
#endif // wxHAVE_WIN32_MB2WC

#if defined(__WXMAC__)
    {
        // leave UTF16 and UTF32 to the built-ins of wx
        if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
            ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
        {
#if wxUSE_FONTMAP
            wxMBConv_mac *conv = m_name ? new wxMBConv_mac(m_name)
                                        : new wxMBConv_mac(m_encoding);
#else
            wxMBConv_mac *conv = new wxMBConv_mac(m_encoding);
#endif
            if ( conv->IsOk() )
                 return conv;

            delete conv;
        }
    }
#endif

#if defined(__WXCOCOA__)
    {
        if ( m_name || ( m_encoding <= wxFONTENCODING_UTF16 ) )
        {
#if wxUSE_FONTMAP
            wxMBConv_cocoa *conv = m_name ? new wxMBConv_cocoa(m_name)
                                          : new wxMBConv_cocoa(m_encoding);
#else
            wxMBConv_cocoa *conv = new wxMBConv_cocoa(m_encoding);
#endif

            if ( conv->IsOk() )
                 return conv;

            delete conv;
        }
    }
#endif
    // step (2)
    wxFontEncoding enc = m_encoding;
#if wxUSE_FONTMAP
    if ( enc == wxFONTENCODING_SYSTEM && m_name )
    {
        // use "false" to suppress interactive dialogs -- we can be called from
        // anywhere and popping up a dialog from here is the last thing we want to
        // do
        enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
    }
#endif // wxUSE_FONTMAP

    switch ( enc )
    {
        case wxFONTENCODING_UTF7:
             return new wxMBConvUTF7;

        case wxFONTENCODING_UTF8:
             return new wxMBConvUTF8;

        case wxFONTENCODING_UTF16BE:
             return new wxMBConvUTF16BE;

        case wxFONTENCODING_UTF16LE:
             return new wxMBConvUTF16LE;

        case wxFONTENCODING_UTF32BE:
             return new wxMBConvUTF32BE;

        case wxFONTENCODING_UTF32LE:
             return new wxMBConvUTF32LE;

        default:
             // nothing to do but put here to suppress gcc warnings
             break;
    }

    // step (3)
#if wxUSE_FONTMAP
    {
        wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
                                      : new wxMBConv_wxwin(m_encoding);
        if ( conv->IsOk() )
            return conv;

        delete conv;
    }
#endif // wxUSE_FONTMAP

    // NB: This is a hack to prevent deadlock. What could otherwise happen
    //     in Unicode build: wxConvLocal creation ends up being here
    //     because of some failure and logs the error. But wxLog will try to
    //     attach a timestamp, for which it will need wxConvLocal (to convert
    //     time to char* and then wchar_t*), but that fails, tries to log the
    //     error, but wxLog has an (already locked) critical section that
    //     guards the static buffer.
    static bool alreadyLoggingError = false;
    if (!alreadyLoggingError)
    {
        alreadyLoggingError = true;
        wxLogError(_("Cannot convert from the charset '%s'!"),
                   m_name ? m_name
                      :
#if wxUSE_FONTMAP
                         (const wxChar*)wxFontMapperBase::GetEncodingDescription(m_encoding).c_str()
#else // !wxUSE_FONTMAP
                         (const wxChar*)wxString::Format(_("encoding %i"), m_encoding).c_str()
#endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
              );

        alreadyLoggingError = false;
    }

    return NULL;
}
3508
3509 void wxCSConv::CreateConvIfNeeded() const
3510 {
3511     if ( m_deferred )
3512     {
3513         wxCSConv *self = (wxCSConv *)this; // const_cast
3514
3515         // if we don't have neither the name nor the encoding, use the default
3516         // encoding for this system
3517         if ( !m_name && m_encoding == wxFONTENCODING_SYSTEM )
3518         {
3519 #if wxUSE_INTL
3520             self->m_encoding = wxLocale::GetSystemEncoding();
3521 #else
3522             // fallback to some reasonable default:
3523             self->m_encoding = wxFONTENCODING_ISO8859_1;
3524 #endif // wxUSE_INTL
3525         }
3526
3527         self->m_convReal = DoCreate();
3528         self->m_deferred = false;
3529     }
3530 }
3531
3532 bool wxCSConv::IsOk() const
3533 {
3534     CreateConvIfNeeded();
3535
3536     // special case: no convReal created for wxFONTENCODING_ISO8859_1
3537     if ( m_encoding == wxFONTENCODING_ISO8859_1 )
3538         return true; // always ok as we do it ourselves
3539
3540     // m_convReal->IsOk() is called at its own creation, so we know it must
3541     // be ok if m_convReal is non-NULL
3542     return m_convReal != NULL;
3543 }
3544
3545 size_t wxCSConv::ToWChar(wchar_t *dst, size_t dstLen,
3546                          const char *src, size_t srcLen) const
3547 {
3548     CreateConvIfNeeded();
3549
3550     if (m_convReal)
3551         return m_convReal->ToWChar(dst, dstLen, src, srcLen);
3552
3553     // latin-1 (direct)
3554     return wxMBConv::ToWChar(dst, dstLen, src, srcLen);
3555 }
3556
3557 size_t wxCSConv::FromWChar(char *dst, size_t dstLen,
3558                            const wchar_t *src, size_t srcLen) const
3559 {
3560     CreateConvIfNeeded();
3561
3562     if (m_convReal)
3563         return m_convReal->FromWChar(dst, dstLen, src, srcLen);
3564
3565     // latin-1 (direct)
3566     return wxMBConv::FromWChar(dst, dstLen, src, srcLen);
3567 }
3568
3569 size_t wxCSConv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
3570 {
3571     CreateConvIfNeeded();
3572
3573     if (m_convReal)
3574         return m_convReal->MB2WC(buf, psz, n);
3575
3576     // latin-1 (direct)
3577     size_t len = strlen(psz);
3578
3579     if (buf)
3580     {
3581         for (size_t c = 0; c <= len; c++)
3582             buf[c] = (unsigned char)(psz[c]);
3583     }
3584
3585     return len;
3586 }
3587
3588 size_t wxCSConv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
3589 {
3590     CreateConvIfNeeded();
3591
3592     if (m_convReal)
3593         return m_convReal->WC2MB(buf, psz, n);
3594
3595     // latin-1 (direct)
3596     const size_t len = wxWcslen(psz);
3597     if (buf)
3598     {
3599         for (size_t c = 0; c <= len; c++)
3600         {
3601             if (psz[c] > 0xFF)
3602                 return wxCONV_FAILED;
3603
3604             buf[c] = (char)psz[c];
3605         }
3606     }
3607     else
3608     {
3609         for (size_t c = 0; c <= len; c++)
3610         {
3611             if (psz[c] > 0xFF)
3612                 return wxCONV_FAILED;
3613         }
3614     }
3615
3616     return len;
3617 }
3618
3619 size_t wxCSConv::GetMBNulLen() const
3620 {
3621     CreateConvIfNeeded();
3622
3623     if ( m_convReal )
3624     {
3625         return m_convReal->GetMBNulLen();
3626     }
3627
3628     // otherwise, we are ISO-8859-1
3629     return 1;
3630 }
3631
3632 #if wxUSE_UNICODE_UTF8
3633 bool wxCSConv::IsUTF8() const
3634 {
3635     CreateConvIfNeeded();
3636
3637     if ( m_convReal )
3638     {
3639         return m_convReal->IsUTF8();
3640     }
3641
3642     // otherwise, we are ISO-8859-1
3643     return false;
3644 }
3645 #endif
3646
3647
3648 #if wxUSE_UNICODE
3649
3650 wxWCharBuffer wxSafeConvertMB2WX(const char *s)
3651 {
3652     if ( !s )
3653         return wxWCharBuffer();
3654
3655     wxWCharBuffer wbuf(wxConvLibc.cMB2WX(s));
3656     if ( !wbuf )
3657         wbuf = wxMBConvUTF8().cMB2WX(s);
3658     if ( !wbuf )
3659         wbuf = wxConvISO8859_1.cMB2WX(s);
3660
3661     return wbuf;
3662 }
3663
3664 wxCharBuffer wxSafeConvertWX2MB(const wchar_t *ws)
3665 {
3666     if ( !ws )
3667         return wxCharBuffer();
3668
3669     wxCharBuffer buf(wxConvLibc.cWX2MB(ws));
3670     if ( !buf )
3671         buf = wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL).cWX2MB(ws);
3672
3673     return buf;
3674 }
3675
3676 #endif // wxUSE_UNICODE
3677
3678 // ----------------------------------------------------------------------------
3679 // globals
3680 // ----------------------------------------------------------------------------
3681
// NB: The reason why we create converter objects in this convoluted way,
//     using a factory function instead of a global variable, is that they
//     may be used at static initialization time (some of them are used by
//     wxString ctors and there may be a global wxString object). In other
//     words, possibly _before_ the converter global object would be
//     initialized.
3688
3689 #undef wxConvLibc
3690 #undef wxConvUTF8
3691 #undef wxConvUTF7
3692 #undef wxConvLocal
3693 #undef wxConvISO8859_1
3694
// Defines a global converter called name: a name##Ptr variable initialized
// to NULL, a wxGet_##name##Ptr() function returning a pointer (of type
// klass*) to a function-static object of class impl_klass constructed with
// ctor_args, and a static variable initialized by calling this function.
#define WX_DEFINE_GLOBAL_CONV2(klass, impl_klass, name, ctor_args)      \
    WXDLLIMPEXP_DATA_BASE(klass*) name##Ptr = NULL;                     \
    WXDLLIMPEXP_BASE klass* wxGet_##name##Ptr()                         \
    {                                                                   \
        static impl_klass name##Obj ctor_args;                          \
        return &name##Obj;                                              \
    }                                                                   \
    /* this ensures that all global converter objects are created */    \
    /* by the time static initialization is done, i.e. before any */    \
    /* thread is launched: */                                           \
    static klass* gs_##name##instance = wxGet_##name##Ptr()

// Same as above for the common case when the object is of the same class as
// the type of the pointer returned.
#define WX_DEFINE_GLOBAL_CONV(klass, name, ctor_args) \
    WX_DEFINE_GLOBAL_CONV2(klass, klass, name, ctor_args)
3709
// wxConvLibc is implemented by a different class depending on the platform
#ifdef __WINDOWS__
    WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConv_win32, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
#elif defined(__WXMAC__) && !defined(__MACH__)
    WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConv_mac, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
#else
    WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConvLibc, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
#endif

WX_DEFINE_GLOBAL_CONV(wxMBConvUTF8, wxConvUTF8, wxEMPTY_PARAMETER_VALUE);
WX_DEFINE_GLOBAL_CONV(wxMBConvUTF7, wxConvUTF7, wxEMPTY_PARAMETER_VALUE);

WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvLocal, (wxFONTENCODING_SYSTEM));
WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvISO8859_1, (wxFONTENCODING_ISO8859_1));

// these pointers initially refer to the wxConvLibc and wxConvLocal objects
WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = wxGet_wxConvLibcPtr();
WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvUI = wxGet_wxConvLocalPtr();

#if defined(__WXMAC__) && defined(TARGET_CARBON)
static wxMBConv_macUTF8D wxConvMacUTF8DObj;
#endif
// the converter for file names: the decomposed UTF-8 converter under Carbon,
// wxConvUTF8 in the other __WXOSX__ builds and wxConvLibc elsewhere
WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName =
#ifdef __WXOSX__
#if defined(__WXMAC__) && defined(TARGET_CARBON)
                                    &wxConvMacUTF8DObj;
#else
                                    wxGet_wxConvUTF8Ptr();
#endif
#else // !__WXOSX__
                                    wxGet_wxConvLibcPtr();
#endif // __WXOSX__/!__WXOSX__
3741 #else // !wxUSE_WCHAR_T
3742
// FIXME-UTF8: remove this, wxUSE_WCHAR_T is required now
// stand-ins in absence of wchar_t: the converters are plain wxMBConv objects
// in this case
WXDLLIMPEXP_DATA_BASE(wxMBConv) wxConvLibc,
                                wxConvISO8859_1,
                                wxConvLocal,
                                wxConvUTF8;
3749
3750 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T