src/common/strconv.cpp

   1 /////////////////////////////////////////////////////////////////////////////
   2 // Name:        src/common/strconv.cpp
   3 // Purpose:     Unicode conversion classes
   4 // Author:      Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
   5 //              Ryan Norton, Fredrik Roubert (UTF7)
   6 // Modified by:
   7 // Created:     29/01/98
   8 // RCS-ID:      $Id$
   9 // Copyright:   (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
  10 //              (c) 2000-2003 Vadim Zeitlin
  11 //              (c) 2004 Ryan Norton, Fredrik Roubert
  12 // Licence:     wxWindows licence
  13 /////////////////////////////////////////////////////////////////////////////
  14
  15 // For compilers that support precompilation, includes "wx.h".
  16 #include "wx/wxprec.h"
  17
  18 #ifndef WX_PRECOMP
  19     #include "wx/intl.h"
  20     #include "wx/log.h"
  21     #include "wx/utils.h"
  22 #endif
  23
  24 #include "wx/strconv.h"
  25
  26 #if wxUSE_WCHAR_T
  27
  28 #ifdef __WINDOWS__
  29     #include "wx/msw/private.h"
  30     #include "wx/msw/missing.h"
  31 #endif
  32
  33 #ifndef __WXWINCE__
  34 #include <errno.h>
  35 #endif
  36
  37 #include <ctype.h>
  38 #include <string.h>
  39 #include <stdlib.h>
  40
  41 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
  42     #define wxHAVE_WIN32_MB2WC
  43 #endif
  44
  45 #ifdef __SALFORDC__
  46     #include <clib.h>
  47 #endif
  48
  49 #ifdef HAVE_ICONV
  50     #include <iconv.h>
  51     #include "wx/thread.h"
  52 #endif
  53
  54 #include "wx/encconv.h"
  55 #include "wx/fontmap.h"
  56
  57 #ifdef __WXMAC__
  58 #ifndef __DARWIN__
  59 #include <ATSUnicode.h>
  60 #include <TextCommon.h>
  61 #include <TextEncodingConverter.h>
  62 #endif
  63
  64 // includes Mac headers
  65 #include "wx/mac/private.h"
  66 #endif
  67
  68
// name of the trace mask associated with this file
// (presumably meant to be passed to wxLogTrace() -- TODO confirm, no use of it
// is visible in this part of the file)
#define TRACE_STRCONV _T("strconv")

// WC_UTF16 is defined only if sizeof(wchar_t) == 2, i.e. when wchar_t strings
// are themselves UTF-16 encoded and surrogate pairs must be handled;
// otherwise wchar_t is supposed to be 4 bytes
#if SIZEOF_WCHAR_T == 2
    #define WC_UTF16
#endif
  76
  77
  78 // ============================================================================
  79 // implementation
  80 // ============================================================================
  81
  82 // helper function of cMB2WC(): check if n bytes at this location are all NUL
  83 static bool NotAllNULs(const char *p, size_t n)
  84 {
  85     while ( n && *p++ == '\0' )
  86         n--;
  87
  88     return n != 0;
  89 }
  90
  91 // ----------------------------------------------------------------------------
  92 // UTF-16 en/decoding to/from UCS-4 with surrogates handling
  93 // ----------------------------------------------------------------------------
  94
  95 static size_t encode_utf16(wxUint32 input, wxUint16 *output)
  96 {
  97     if (input <= 0xffff)
  98     {
  99         if (output)
 100             *output = (wxUint16) input;
 101
 102         return 1;
 103     }
 104     else if (input >= 0x110000)
 105     {
 106         return wxCONV_FAILED;
 107     }
 108     else
 109     {
 110         if (output)
 111         {
 112             *output++ = (wxUint16) ((input >> 10) + 0xd7c0);
 113             *output = (wxUint16) ((input & 0x3ff) + 0xdc00);
 114         }
 115
 116         return 2;
 117     }
 118 }
 119
 120 static size_t decode_utf16(const wxUint16* input, wxUint32& output)
 121 {
 122     if ((*input < 0xd800) || (*input > 0xdfff))
 123     {
 124         output = *input;
 125         return 1;
 126     }
 127     else if ((input[1] < 0xdc00) || (input[1] > 0xdfff))
 128     {
 129         output = *input;
 130         return wxCONV_FAILED;
 131     }
 132     else
 133     {
 134         output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
 135         return 2;
 136     }
 137 }
 138
 139 #ifdef WC_UTF16
 140     typedef wchar_t wxDecodeSurrogate_t;
 141 #else // !WC_UTF16
 142     typedef wxUint16 wxDecodeSurrogate_t;
 143 #endif // WC_UTF16/!WC_UTF16
 144
 145 // returns the next UTF-32 character from the wchar_t buffer and advances the
 146 // pointer to the character after this one
 147 //
 148 // if an invalid character is found, *pSrc is set to NULL, the caller must
 149 // check for this
 150 static wxUint32 wxDecodeSurrogate(const wxDecodeSurrogate_t **pSrc)
 151 {
 152     wxUint32 out;
 153     const size_t
 154         n = decode_utf16(wx_reinterpret_cast(const wxUint16 *, *pSrc), out);
 155     if ( n == wxCONV_FAILED )
 156         *pSrc = NULL;
 157     else
 158         *pSrc += n;
 159
 160     return out;
 161 }
 162
 163 // ----------------------------------------------------------------------------
 164 // wxMBConv
 165 // ----------------------------------------------------------------------------
 166
// Default implementation of the multibyte to wide character conversion in
// terms of the old MB2WC().
//
//  dst     the output buffer or NULL to only compute the required size
//  dstLen  the size of dst in wide characters (only checked if dst != NULL)
//  src     the input multibyte string
//  srcLen  the length of src in bytes or wxNO_LEN if it is NUL-terminated
//
// Returns the number of wide characters [which would be] written, counting
// one L'\0' per converted chunk, or wxCONV_FAILED if the conversion fails or
// dst is not big enough.
size_t
wxMBConv::ToWChar(wchar_t *dst, size_t dstLen,
                  const char *src, size_t srcLen) const
{
    // although new conversion classes are supposed to implement this function
    // directly, the existing ones only implement the old MB2WC() and so, to
    // avoid to have to rewrite all conversion classes at once, we provide a
    // default (but not efficient) implementation of this one in terms of the
    // old function by copying the input to ensure that it's NUL-terminated and
    // then using MB2WC() to convert it

    // the number of chars [which would be] written to dst [if it were not NULL]
    size_t dstWritten = 0;

    // the number of NULs terminating this string
    size_t nulLen = 0;  // not really needed, but just to avoid warnings

    // if we were not given the input size we just have to assume that the
    // string is properly terminated as we have no way of knowing how long it
    // is anyhow, but if we do have the size check whether there are enough
    // NULs at the end
    wxCharBuffer bufTmp;
    const char *srcEnd;
    if ( srcLen != wxNO_LEN )
    {
        // we need to know how to find the end of this string
        nulLen = GetMBNulLen();
        if ( nulLen == wxCONV_FAILED )
            return wxCONV_FAILED;

        // if there are enough NULs we can avoid the copy
        if ( srcLen < nulLen || NotAllNULs(src + srcLen - nulLen, nulLen) )
        {
            // make a copy in order to properly NUL-terminate the string
            bufTmp = wxCharBuffer(srcLen + nulLen - 1 /* 1 will be added */);
            char * const p = bufTmp.data();
            memcpy(p, src, srcLen);
            for ( char *s = p + srcLen; s < p + srcLen + nulLen; s++ )
                *s = '\0';

            src = bufTmp;
        }

        srcEnd = src + srcLen;
    }
    else // quit after the first loop iteration
    {
        srcEnd = NULL;
    }

    // convert the input chunk by chunk, a chunk being the part of the input up
    // to and including its terminating NUL(s)
    for ( ;; )
    {
        // try to convert the current chunk
        size_t lenChunk = MB2WC(NULL, src, 0);
        if ( lenChunk == wxCONV_FAILED )
            return wxCONV_FAILED;

        lenChunk++; // for the L'\0' at the end of this chunk

        dstWritten += lenChunk;

        if ( lenChunk == 1 )
        {
            // nothing left in the input string, conversion succeeded
            //
            // NOTE(review): we leave the loop here before anything is stored
            // in dst for this (empty) chunk although its L'\0' was counted in
            // dstWritten just above, and an empty chunk in the middle of the
            // input ends the conversion too -- confirm that this is intended
            break;
        }

        if ( dst )
        {
            if ( dstWritten > dstLen )
                return wxCONV_FAILED;

            if ( MB2WC(dst, src, lenChunk) == wxCONV_FAILED )
                return wxCONV_FAILED;

            dst += lenChunk;
        }

        if ( !srcEnd )
        {
            // we convert just one chunk in this case as this is the entire
            // string anyhow
            break;
        }

        // advance the input pointer past the end of this chunk
        while ( NotAllNULs(src, nulLen) )
        {
            // notice that we must skip over multiple bytes here as we suppose
            // that if NUL takes 2 or 4 bytes, then all the other characters do
            // too and so if advanced by a single byte we might erroneously
            // detect sequences of NUL bytes in the middle of the input
            src += nulLen;
        }

        src += nulLen; // skipping over its terminator as well

        // note that ">=" (and not just "==") is needed here as the terminator
        // we skipped just above could be inside or just after the buffer
        // delimited by srcEnd
        if ( src >= srcEnd )
            break;
    }

    return dstWritten;
}
 273
 274 size_t
 275 wxMBConv::FromWChar(char *dst, size_t dstLen,
 276                     const wchar_t *src, size_t srcLen) const
 277 {
 278     // the number of chars [which would be] written to dst [if it were not NULL]
 279     size_t dstWritten = 0;
 280
 281     // make a copy of the input string unless it is already properly
 282     // NUL-terminated
 283     //
 284     // if we don't know its length we have no choice but to assume that it is,
 285     // indeed, properly terminated
 286     wxWCharBuffer bufTmp;
 287     if ( srcLen == wxNO_LEN )
 288     {
 289         srcLen = wxWcslen(src) + 1;
 290     }
 291     else if ( srcLen != 0 && src[srcLen - 1] != L'\0' )
 292     {
 293         // make a copy in order to properly NUL-terminate the string
 294         bufTmp = wxWCharBuffer(srcLen);
 295         memcpy(bufTmp.data(), src, srcLen * sizeof(wchar_t));
 296         src = bufTmp;
 297     }
 298
 299     const size_t lenNul = GetMBNulLen();
 300     for ( const wchar_t * const srcEnd = src + srcLen;
 301           src < srcEnd;
 302           src += wxWcslen(src) + 1 /* skip L'\0' too */ )
 303     {
 304         // try to convert the current chunk
 305         size_t lenChunk = WC2MB(NULL, src, 0);
 306
 307         if ( lenChunk == wxCONV_FAILED )
 308             return wxCONV_FAILED;
 309
 310         lenChunk += lenNul;
 311         dstWritten += lenChunk;
 312
 313         if ( dst )
 314         {
 315             if ( dstWritten > dstLen )
 316                 return wxCONV_FAILED;
 317
 318             if ( WC2MB(dst, src, lenChunk) == wxCONV_FAILED )
 319                 return wxCONV_FAILED;
 320
 321             dst += lenChunk;
 322         }
 323     }
 324
 325     return dstWritten;
 326 }
 327
 328 size_t wxMBConv::MB2WC(wchar_t *outBuff, const char *inBuff, size_t outLen) const
 329 {
 330     size_t rc = ToWChar(outBuff, outLen, inBuff);
 331     if ( rc != wxCONV_FAILED )
 332     {
 333         // ToWChar() returns the buffer length, i.e. including the trailing
 334         // NUL, while this method doesn't take it into account
 335         rc--;
 336     }
 337
 338     return rc;
 339 }
 340
 341 size_t wxMBConv::WC2MB(char *outBuff, const wchar_t *inBuff, size_t outLen) const
 342 {
 343     size_t rc = FromWChar(outBuff, outLen, inBuff);
 344     if ( rc != wxCONV_FAILED )
 345     {
 346         rc -= GetMBNulLen();
 347     }
 348
 349     return rc;
 350 }
 351
// Out of line destructor.
wxMBConv::~wxMBConv()
{
    // nothing to do here (necessary for Darwin linking probably)
}
 356
 357 const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
 358 {
 359     if ( psz )
 360     {
 361         // calculate the length of the buffer needed first
 362         const size_t nLen = MB2WC(NULL, psz, 0);
 363         if ( nLen != wxCONV_FAILED )
 364         {
 365             // now do the actual conversion
 366             wxWCharBuffer buf(nLen /* +1 added implicitly */);
 367
 368             // +1 for the trailing NULL
 369             if ( MB2WC(buf.data(), psz, nLen + 1) != wxCONV_FAILED )
 370                 return buf;
 371         }
 372     }
 373
 374     return wxWCharBuffer();
 375 }
 376
 377 const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
 378 {
 379     if ( pwz )
 380     {
 381         const size_t nLen = WC2MB(NULL, pwz, 0);
 382         if ( nLen != wxCONV_FAILED )
 383         {
 384             // extra space for trailing NUL(s)
 385             static const size_t extraLen = GetMaxMBNulLen();
 386
 387             wxCharBuffer buf(nLen + extraLen - 1);
 388             if ( WC2MB(buf.data(), pwz, nLen + extraLen) != wxCONV_FAILED )
 389                 return buf;
 390         }
 391     }
 392
 393     return wxCharBuffer();
 394 }
 395
 396 const wxWCharBuffer
 397 wxMBConv::cMB2WC(const char *inBuff, size_t inLen, size_t *outLen) const
 398 {
 399     const size_t dstLen = ToWChar(NULL, 0, inBuff, inLen);
 400     if ( dstLen != wxCONV_FAILED )
 401     {
 402         wxWCharBuffer wbuf(dstLen - 1);
 403         if ( ToWChar(wbuf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
 404         {
 405             if ( outLen )
 406             {
 407                 *outLen = dstLen;
 408                 if ( wbuf[dstLen - 1] == L'\0' )
 409                     (*outLen)--;
 410             }
 411
 412             return wbuf;
 413         }
 414     }
 415
 416     if ( outLen )
 417         *outLen = 0;
 418
 419     return wxWCharBuffer();
 420 }
 421
 422 const wxCharBuffer
 423 wxMBConv::cWC2MB(const wchar_t *inBuff, size_t inLen, size_t *outLen) const
 424 {
 425     size_t dstLen = FromWChar(NULL, 0, inBuff, inLen);
 426     if ( dstLen != wxCONV_FAILED )
 427     {
 428         if ( !dstLen )
 429         {
 430             // special case: can't allocate 0 size buffer below
 431             dstLen++;
 432         }
 433
 434         wxCharBuffer buf(dstLen - 1);
 435         if ( FromWChar(buf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
 436         {
 437             if ( outLen )
 438             {
 439                 *outLen = dstLen;
 440
 441                 const size_t nulLen = GetMBNulLen();
 442                 if ( dstLen >= nulLen &&
 443                         !NotAllNULs(buf.data() + dstLen - nulLen, nulLen) )
 444                 {
 445                     // in this case the output is NUL-terminated and we're not
 446                     // supposed to count NUL
 447                     *outLen -= nulLen;
 448                 }
 449             }
 450
 451             return buf;
 452         }
 453     }
 454
 455     if ( outLen )
 456         *outLen = 0;
 457
 458     return wxCharBuffer();
 459 }
 460
 461 // ----------------------------------------------------------------------------
 462 // wxMBConvLibc
 463 // ----------------------------------------------------------------------------
 464
// Multibyte to wide character conversion: simply forwards all its arguments
// to wxMB2WC() and returns its result unchanged.
size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
{
    return wxMB2WC(buf, psz, n);
}
 469
// Wide character to multibyte conversion: simply forwards all its arguments
// to wxWC2MB() and returns its result unchanged.
size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
{
    return wxWC2MB(buf, psz, n);
}
 474
 475 // ----------------------------------------------------------------------------
 476 // wxConvBrokenFileNames
 477 // ----------------------------------------------------------------------------
 478
 479 #ifdef __UNIX__
 480
 481 wxConvBrokenFileNames::wxConvBrokenFileNames(const wxChar *charset)
 482 {
 483     if ( !charset || wxStricmp(charset, _T("UTF-8")) == 0
 484                   || wxStricmp(charset, _T("UTF8")) == 0  )
 485         m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL);
 486     else
 487         m_conv = new wxCSConv(charset);
 488 }
 489
 490 #endif // __UNIX__
 491
 492 // ----------------------------------------------------------------------------
 493 // UTF-7
 494 // ----------------------------------------------------------------------------
 495
 496 // Implementation (C) 2004 Fredrik Roubert
 497
//
// BASE64 decoding table
//
// It is indexed by the byte value and contains the 6 bit value of the
// corresponding BASE64 digit ('A'-'Z' are 0x00-0x19, 'a'-'z' are 0x1a-0x33,
// '0'-'9' are 0x34-0x3d, '+' is 0x3e and '/' is 0x3f) or 0xff for all the
// bytes which are not BASE64 digits.
//
static const unsigned char utf7unb64[] =
{
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
    0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b,
    0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
    0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
    0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
    0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
    0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
    0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30,
    0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
};
 536
// Decode the NUL-terminated UTF-7 string psz.
//
// If buf is NULL, nothing is stored and only the length of the result is
// computed, otherwise the main loop stops as soon as n wide characters have
// been produced.
//
// Returns the number of wide characters in the result, not counting the
// terminating NUL (which is only stored if there is still room for it), or
// wxCONV_FAILED if a '+' is not followed by at least one byte worth of BASE64
// encoded data.
size_t wxMBConvUTF7::MB2WC(wchar_t *buf, const char *psz, size_t n) const
{
    size_t len = 0;

    while ( *psz && (!buf || (len < n)) )
    {
        unsigned char cc = *psz++;
        if (cc != '+')
        {
            // plain ASCII char (in fact, any byte other than '+' is copied to
            // the output as is)
            if (buf)
                *buf++ = cc;
            len++;
        }
        else if (*psz == '-')
        {
            // encoded plus sign
            if (buf)
                *buf++ = cc;
            len++;
            psz++;
        }
        else // start of BASE64 encoded string
        {
            // d accumulates the decoded bits, 6 per input character, and l is
            // the number of bits in it which were not consumed yet
            //
            // the decoded bytes are alternatively the high and the low byte
            // of a 16 bit unit: lsb is true if the next byte is the low one
            //
            // ok becomes true once at least one byte has been decoded
            //
            // NOTE(review): the stores to buf inside this loop are not guarded
            // by a "len < n" check, only the outer loop is -- confirm that the
            // callers always pass a buffer big enough for the entire run
            bool lsb, ok;
            unsigned int d, l;
            for ( ok = lsb = false, d = 0, l = 0;
                  (cc = utf7unb64[(unsigned char)*psz]) != 0xff;
                  psz++ )
            {
                d <<= 6;
                d += cc;
                for (l += 6; l >= 8; lsb = !lsb)
                {
                    // extract the 8 topmost bits not consumed yet
                    unsigned char c = (unsigned char)((d >> (l -= 8)) % 256);
                    if (lsb)
                    {
                        // low byte: the character is complete now
                        if (buf)
                            *buf++ |= c;
                        len ++;
                    }
                    else
                    {
                        // high byte: start a new character
                        if (buf)
                            *buf = (wchar_t)(c << 8);
                    }

                    ok = true;
                }
            }

            if ( !ok )
            {
                // in valid UTF7 we should have valid characters after '+'
                return wxCONV_FAILED;
            }

            // skip the (optional) '-' ending the BASE64 encoded run
            if (*psz == '-')
                psz++;
        }
    }

    if ( buf && (len < n) )
        *buf = '\0';

    return len;
}
 604
//
// BASE64 encoding table
//
// It is indexed by a 6 bit value and contains the corresponding BASE64 digit.
//
static const unsigned char utf7enb64[] =
{
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
    'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
    'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
    'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
    'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
    'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
    'w', 'x', 'y', 'z', '0', '1', '2', '3',
    '4', '5', '6', '7', '8', '9', '+', '/'
};
 619
//
// UTF-7 encoding table
//
// It is indexed by the (7 bit) character code and contains the class of the
// corresponding character:
//
// 0 - Set D (directly encoded characters)
// 1 - Set O (optional direct characters)
// 2 - whitespace characters (optional)
// 3 - special characters
//
// The encoder below only writes the characters of class 0 directly.
//
static const unsigned char utf7encode[128] =
{
    3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
};
 639
// Encode the NUL-terminated wide character string psz in UTF-7.
//
// If buf is NULL, nothing is stored and only the length of the result is
// computed, otherwise the main loop stops as soon as n bytes have been
// produced.
//
// Returns the number of bytes in the result, not counting the terminating NUL
// (which is only stored if there is still room for it), or wxCONV_FAILED if,
// when wchar_t is not UTF-16, a character greater than 0xffff starts a run of
// characters to encode.
//
// NOTE(review): the stores to buf inside the BASE64 encoding loops are not
// guarded by a "len < n" check, only the outer loop is -- confirm that the
// callers always pass a buffer big enough for the entire run.
size_t wxMBConvUTF7::WC2MB(char *buf, const wchar_t *psz, size_t n) const
{
    size_t len = 0;

    while (*psz && ((!buf) || (len < n)))
    {
        wchar_t cc = *psz++;

        // NOTE(review): if wchar_t is a signed type, a negative cc passes the
        // "cc < 0x80" test and is used as the table index -- verify
        if (cc < 0x80 && utf7encode[cc] < 1)
        {
            // plain ASCII char
            if (buf)
                *buf++ = (char)cc;

            len++;
        }
#ifndef WC_UTF16
        else if (((wxUint32)cc) > 0xffff)
        {
            // no surrogate pair generation (yet?)
            return wxCONV_FAILED;
        }
#endif
        else
        {
            // anything else is written as '+', BASE64 encoded data, '-'
            if (buf)
                *buf++ = '+';

            len++;
            if (cc != '+')
            {
                // BASE64 encode string
                //
                // d accumulates the bits to encode, 16 per input character
                // (high byte first), and l is the number of bits in it which
                // were not output yet
                //
                // NOTE(review): the characters following the first one of the
                // run are not checked against 0xffff, only their 16 low bits
                // are encoded -- confirm that this is acceptable
                unsigned int lsb, d, l;
                for (d = 0, l = 0; /*nothing*/; psz++)
                {
                    for (lsb = 0; lsb < 2; lsb ++)
                    {
                        d <<= 8;
                        d += lsb ? cc & 0xff : (cc & 0xff00) >> 8;

                        // output all complete groups of 6 bits
                        for (l += 8; l >= 6; )
                        {
                            l -= 6;
                            if (buf)
                                *buf++ = utf7enb64[(d >> l) % 64];
                            len++;
                        }
                    }

                    // the run ends at the end of the string or at the first
                    // directly encoded character, which is left for the outer
                    // loop to process
                    cc = *psz;
                    if (!(cc) || (cc < 0x80 && utf7encode[cc] < 1))
                        break;
                }

                if (l != 0)
                {
                    // output the remaining bits padded with zeroes on the
                    // right to make a full BASE64 digit
                    if (buf)
                        *buf++ = utf7enb64[((d % 16) << (6 - l)) % 64];

                    len++;
                }
            }

            // ends the encoded run, the plus sign itself is encoded as "+-"
            if (buf)
                *buf++ = '-';
            len++;
        }
    }

    if (buf && (len < n))
        *buf = 0;

    return len;
}
 713
 714 // ----------------------------------------------------------------------------
 715 // UTF-8
 716 // ----------------------------------------------------------------------------
 717
// utf8_max[i] is the greatest value which can be encoded using i + 1 bytes,
// the last element makes the loops searching this table always terminate
static wxUint32 utf8_max[]=
    { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };

// boundaries of the private use area we use to (temporarily) remap the bytes
// of the sequences which are invalid in a UTF-8 encoded string: such a byte b
// is represented by the character wxUnicodePUA + b
const wxUint32 wxUnicodePUA = 0x100000;
const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
 725
// Decode the NUL-terminated UTF-8 string psz.
//
// If buf is NULL, nothing is stored and only the length of the result is
// computed, otherwise the main loop stops as soon as n wide characters have
// been produced.
//
// The handling of the invalid sequences depends on m_options: their bytes are
// either mapped to the characters starting at wxUnicodePUA
// (MAP_INVALID_UTF8_TO_PUA) or replaced with backslash followed by 3 octal
// digits (MAP_INVALID_UTF8_TO_OCTAL, the backslashes present in input are
// doubled in this case) or result in an error otherwise.
//
// Returns the number of wide characters in the result, not counting the
// terminating NUL (which is only stored if there is still room for it), or
// wxCONV_FAILED.
size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const
{
    size_t len = 0;

    while (*psz && ((!buf) || (len < n)))
    {
        // remember the start of this sequence to be able to go back to it if
        // it turns out to be invalid
        const char *opsz = psz;
        bool invalid = false;
        unsigned char cc = *psz++, fc = cc;

        // count the leading 1 bits of the first byte
        unsigned cnt;
        for (cnt = 0; fc & 0x80; cnt++)
            fc <<= 1;

        if (!cnt)
        {
            // plain ASCII char
            if (buf)
                *buf++ = cc;
            len++;

            // escape the escape character for octal escapes
            if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
                    && cc == '\\' && (!buf || len < n))
            {
                if (buf)
                    *buf++ = cc;
                len++;
            }
        }
        else
        {
            // from now on cnt is the number of the continuation bytes
            cnt--;
            if (!cnt)
            {
                // invalid UTF-8 sequence
                // (a continuation byte can't start a sequence)
                invalid = true;
            }
            else
            {
                unsigned ocnt = cnt - 1;

                // the payload bits of the first byte
                wxUint32 res = cc & (0x3f >> cnt);
                while (cnt--)
                {
                    cc = *psz;
                    if ((cc & 0xC0) != 0x80)
                    {
                        // invalid UTF-8 sequence
                        invalid = true;
                        break;
                    }

                    psz++;
                    res = (res << 6) | (cc & 0x3f);
                }

                // a value which could have been encoded using fewer bytes
                // must be rejected
                if (invalid || res <= utf8_max[ocnt])
                {
                    // illegal UTF-8 encoding
                    invalid = true;
                }
                else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
                        res >= wxUnicodePUA && res < wxUnicodePUAEnd)
                {
                    // if one of our PUA characters turns up externally
                    // it must also be treated as an illegal sequence
                    // (a bit like you have to escape an escape character)
                    invalid = true;
                }
                else
                {
#ifdef WC_UTF16
                    // cast is ok because wchar_t == wxUint16 if WC_UTF16
                    //
                    // NOTE(review): this may store 2 units without checking
                    // that there is room for both of them in buf -- verify
                    size_t pa = encode_utf16(res, (wxUint16 *)buf);
                    if (pa == wxCONV_FAILED)
                    {
                        invalid = true;
                    }
                    else
                    {
                        if (buf)
                            buf += pa;
                        len += pa;
                    }
#else // !WC_UTF16
                    if (buf)
                        *buf++ = (wchar_t)res;
                    len++;
#endif // WC_UTF16/!WC_UTF16
                }
            }

            if (invalid)
            {
                // all the bytes from the start of the sequence up to (but
                // excluding) the current position are processed individually
                if (m_options & MAP_INVALID_UTF8_TO_PUA)
                {
                    while (opsz < psz && (!buf || len < n))
                    {
#ifdef WC_UTF16
                        // cast is ok because wchar_t == wxUint16 if WC_UTF16
                        size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
                        wxASSERT(pa != wxCONV_FAILED);
                        if (buf)
                            buf += pa;
                        opsz++;
                        len += pa;
#else
                        if (buf)
                            *buf++ = (wchar_t)(wxUnicodePUA + (unsigned char)*opsz);
                        opsz++;
                        len++;
#endif
                    }
                }
                else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
                {
                    while (opsz < psz && (!buf || len < n))
                    {
                        // the escape is only stored if it fits entirely but is
                        // always counted
                        if ( buf && len + 3 < n )
                        {
                            unsigned char on = *opsz;
                            *buf++ = L'\\';
                            *buf++ = (wchar_t)( L'0' + on / 0100 );
                            *buf++ = (wchar_t)( L'0' + (on % 0100) / 010 );
                            *buf++ = (wchar_t)( L'0' + on % 010 );
                        }

                        opsz++;
                        len += 4;
                    }
                }
                else // MAP_INVALID_UTF8_NOT
                {
                    return wxCONV_FAILED;
                }
            }
        }
    }

    if (buf && (len < n))
        *buf = 0;

    return len;
}
 869
 870 static inline bool isoctal(wchar_t wch)
 871 {
 872     return L'0' <= wch && wch <= L'7';
 873 }
 874
// Encode the NUL-terminated wide character string psz in UTF-8.
//
// If buf is NULL, nothing is stored and only the length of the result is
// computed, otherwise the main loop stops as soon as n bytes have been
// produced.
//
// Depending on m_options, the characters used to represent the bytes of the
// invalid UTF-8 sequences (i.e. either the characters in the range starting
// at wxUnicodePUA or the octal escapes and the doubled backslashes) are
// converted back to the single byte they stand for.
//
// Returns the number of bytes in the result, not counting the terminating NUL
// (which is only stored if there is still room for it).
//
// NOTE(review): a multibyte sequence is stored entirely as soon as len < n,
// without checking that all of its bytes fit into buf -- confirm that the
// callers always pass a big enough buffer.
size_t wxMBConvUTF8::WC2MB(char *buf, const wchar_t *psz, size_t n) const
{
    size_t len = 0;

    while (*psz && ((!buf) || (len < n)))
    {
        wxUint32 cc;

#ifdef WC_UTF16
        // cast is ok for WC_UTF16
        size_t pa = decode_utf16((const wxUint16 *)psz, cc);

        // an invalid unit is not an error here: it is stored in cc by
        // decode_utf16() and is encoded on its own
        psz += (pa == wxCONV_FAILED) ? 1 : pa;
#else
        cc = (*psz++) & 0x7fffffff;
#endif

        if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
                && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
        {
            // restore the original byte
            if (buf)
                *buf++ = (char)(cc - wxUnicodePUA);
            len++;
        }
        else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL)
                    && cc == L'\\' && psz[0] == L'\\' )
        {
            // doubled backslash stands for a single one
            if (buf)
                *buf++ = (char)cc;
            psz++;
            len++;
        }
        else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
                    cc == L'\\' &&
                        isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
        {
            // octal escape: restore the original byte
            //
            // NOTE(review): values greater than 0377 are not rejected, they
            // are just truncated by the cast to char
            if (buf)
            {
                *buf++ = (char) ((psz[0] - L'0') * 0100 +
                                 (psz[1] - L'0') * 010 +
                                 (psz[2] - L'0'));
            }

            psz += 3;
            len++;
        }
        else
        {
            // cnt is the number of the continuation bytes needed
            unsigned cnt;
            for (cnt = 0; cc > utf8_max[cnt]; cnt++)
            {
            }

            if (!cnt)
            {
                // plain ASCII char
                if (buf)
                    *buf++ = (char) cc;
                len++;
            }
            else
            {
                len += cnt + 1;
                if (buf)
                {
                    // the first byte starts with cnt + 1 bits set to 1
                    // followed by a 0 and then the most significant bits of
                    // the character; -128 >> cnt relies on the right shift of
                    // a negative value filling the top bits with 1s
                    *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));

                    // each of the continuation bytes contains 6 more bits
                    while (cnt--)
                        *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
                }
            }
        }
    }

    if (buf && (len < n))
        *buf = 0;

    return len;
}
 952
 953 // ============================================================================
 954 // UTF-16
 955 // ============================================================================
 956
// wxMBConvUTF16straight is the name of the conversion class for the UTF-16
// variant using the same byte order as this machine and wxMBConvUTF16swap is
// the name of the one for the variant using the opposite byte order
#ifdef WORDS_BIGENDIAN
    #define wxMBConvUTF16straight wxMBConvUTF16BE
    #define wxMBConvUTF16swap     wxMBConvUTF16LE
#else
    #define wxMBConvUTF16swap     wxMBConvUTF16BE
    #define wxMBConvUTF16straight wxMBConvUTF16LE
#endif
 964
 965 /* static */
 966 size_t wxMBConvUTF16Base::GetLength(const char *src, size_t srcLen)
 967 {
 968     if ( srcLen == wxNO_LEN )
 969     {
 970         // count the number of bytes in input, including the trailing NULs
 971         const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
 972         for ( srcLen = 1; *inBuff++; srcLen++ )
 973             ;
 974
 975         srcLen *= BYTES_PER_CHAR;
 976     }
 977     else // we already have the length
 978     {
 979         // we can only convert an entire number of UTF-16 characters
 980         if ( srcLen % BYTES_PER_CHAR )
 981             return wxCONV_FAILED;
 982     }
 983
 984     return srcLen;
 985 }
 986
 987 // case when in-memory representation is UTF-16 too
 988 #ifdef WC_UTF16
 989
 990 // ----------------------------------------------------------------------------
 991 // conversions without endianness change
 992 // ----------------------------------------------------------------------------
 993
 994 size_t
 995 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
 996                                const char *src, size_t srcLen) const
 997 {
 998     // set up the scene for using memcpy() (which is presumably more efficient
 999     // than copying the bytes one by one)
1000     srcLen = GetLength(src, srcLen);
1001     if ( srcLen == wxNO_LEN )
1002         return wxCONV_FAILED;
1003
1004     const size_t inLen = srcLen / BYTES_PER_CHAR;
1005     if ( dst )
1006     {
1007         if ( dstLen < inLen )
1008             return wxCONV_FAILED;
1009
1010         memcpy(dst, src, srcLen);
1011     }
1012
1013     return inLen;
1014 }
1015
1016 size_t
1017 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1018                                  const wchar_t *src, size_t srcLen) const
1019 {
1020     if ( srcLen == wxNO_LEN )
1021         srcLen = wxWcslen(src) + 1;
1022
1023     srcLen *= BYTES_PER_CHAR;
1024
1025     if ( dst )
1026     {
1027         if ( dstLen < srcLen )
1028             return wxCONV_FAILED;
1029
1030         memcpy(dst, src, srcLen);
1031     }
1032
1033     return srcLen;
1034 }
1035
1036 // ----------------------------------------------------------------------------
1037 // endian-reversing conversions
1038 // ----------------------------------------------------------------------------
1039
1040 size_t
1041 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1042                            const char *src, size_t srcLen) const
1043 {
1044     srcLen = GetLength(src, srcLen);
1045     if ( srcLen == wxNO_LEN )
1046         return wxCONV_FAILED;
1047
1048     srcLen /= BYTES_PER_CHAR;
1049
1050     if ( dst )
1051     {
1052         if ( dstLen < srcLen )
1053             return wxCONV_FAILED;
1054
1055         const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1056         for ( size_t n = 0; n < srcLen; n++, inBuff++ )
1057         {
1058             *dst++ = wxUINT16_SWAP_ALWAYS(*inBuff);
1059         }
1060     }
1061
1062     return srcLen;
1063 }
1064
1065 size_t
1066 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1067                              const wchar_t *src, size_t srcLen) const
1068 {
1069     if ( srcLen == wxNO_LEN )
1070         srcLen = wxWcslen(src) + 1;
1071
1072     srcLen *= BYTES_PER_CHAR;
1073
1074     if ( dst )
1075     {
1076         if ( dstLen < srcLen )
1077             return wxCONV_FAILED;
1078
1079         wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1080         for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
1081         {
1082             *outBuff++ = wxUINT16_SWAP_ALWAYS(*src);
1083         }
1084     }
1085
1086     return srcLen;
1087 }
1088
1089 #else // !WC_UTF16: wchar_t is UTF-32
1090
1091 // ----------------------------------------------------------------------------
1092 // conversions without endianness change
1093 // ----------------------------------------------------------------------------
1094
1095 size_t
1096 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1097                                const char *src, size_t srcLen) const
1098 {
1099     srcLen = GetLength(src, srcLen);
1100     if ( srcLen == wxNO_LEN )
1101         return wxCONV_FAILED;
1102
1103     const size_t inLen = srcLen / BYTES_PER_CHAR;
1104     if ( !dst )
1105     {
1106         // optimization: return maximal space which could be needed for this
1107         // string even if the real size could be smaller if the buffer contains
1108         // any surrogates
1109         return inLen;
1110     }
1111
1112     size_t outLen = 0;
1113     const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1114     for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1115     {
1116         const wxUint32 ch = wxDecodeSurrogate(&inBuff);
1117         if ( !inBuff )
1118             return wxCONV_FAILED;
1119
1120         if ( ++outLen > dstLen )
1121             return wxCONV_FAILED;
1122
1123         *dst++ = ch;
1124     }
1125
1126
1127     return outLen;
1128 }
1129
1130 size_t
1131 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1132                                  const wchar_t *src, size_t srcLen) const
1133 {
1134     if ( srcLen == wxNO_LEN )
1135         srcLen = wxWcslen(src) + 1;
1136
1137     size_t outLen = 0;
1138     wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1139     for ( size_t n = 0; n < srcLen; n++ )
1140     {
1141         wxUint16 cc[2];
1142         const size_t numChars = encode_utf16(*src++, cc);
1143         if ( numChars == wxCONV_FAILED )
1144             return wxCONV_FAILED;
1145
1146         outLen += numChars * BYTES_PER_CHAR;
1147         if ( outBuff )
1148         {
1149             if ( outLen > dstLen )
1150                 return wxCONV_FAILED;
1151
1152             *outBuff++ = cc[0];
1153             if ( numChars == 2 )
1154             {
1155                 // second character of a surrogate
1156                 *outBuff++ = cc[1];
1157             }
1158         }
1159     }
1160
1161     return outLen;
1162 }
1163
1164 // ----------------------------------------------------------------------------
1165 // endian-reversing conversions
1166 // ----------------------------------------------------------------------------
1167
1168 size_t
1169 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1170                            const char *src, size_t srcLen) const
1171 {
1172     srcLen = GetLength(src, srcLen);
1173     if ( srcLen == wxNO_LEN )
1174         return wxCONV_FAILED;
1175
1176     const size_t inLen = srcLen / BYTES_PER_CHAR;
1177     if ( !dst )
1178     {
1179         // optimization: return maximal space which could be needed for this
1180         // string even if the real size could be smaller if the buffer contains
1181         // any surrogates
1182         return inLen;
1183     }
1184
1185     size_t outLen = 0;
1186     const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1187     for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1188     {
1189         wxUint32 ch;
1190         wxUint16 tmp[2];
1191
1192         tmp[0] = wxUINT16_SWAP_ALWAYS(*inBuff);
1193         inBuff++;
1194         tmp[1] = wxUINT16_SWAP_ALWAYS(*inBuff);
1195
1196         const size_t numChars = decode_utf16(tmp, ch);
1197         if ( numChars == wxCONV_FAILED )
1198             return wxCONV_FAILED;
1199
1200         if ( numChars == 2 )
1201             inBuff++;
1202
1203         if ( ++outLen > dstLen )
1204             return wxCONV_FAILED;
1205
1206         *dst++ = ch;
1207     }
1208
1209
1210     return outLen;
1211 }
1212
1213 size_t
1214 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1215                              const wchar_t *src, size_t srcLen) const
1216 {
1217     if ( srcLen == wxNO_LEN )
1218         srcLen = wxWcslen(src) + 1;
1219
1220     size_t outLen = 0;
1221     wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1222     for ( const wchar_t *srcEnd = src + srcLen; src < srcEnd; src++ )
1223     {
1224         wxUint16 cc[2];
1225         const size_t numChars = encode_utf16(*src, cc);
1226         if ( numChars == wxCONV_FAILED )
1227             return wxCONV_FAILED;
1228
1229         outLen += numChars * BYTES_PER_CHAR;
1230         if ( outBuff )
1231         {
1232             if ( outLen > dstLen )
1233                 return wxCONV_FAILED;
1234
1235             *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[0]);
1236             if ( numChars == 2 )
1237             {
1238                 // second character of a surrogate
1239                 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[1]);
1240             }
1241         }
1242     }
1243
1244     return outLen;
1245 }
1246
1247 #endif // WC_UTF16/!WC_UTF16
1248
1249
1250 // ============================================================================
1251 // UTF-32
1252 // ============================================================================
1253
// As for UTF-16 above, the code below uses two endianness-relative aliases:
// "straight" names the UTF-32 converter class selected for this build's byte
// order (BE when WORDS_BIGENDIAN is defined, LE otherwise) and "swap" names
// the other one.
#ifdef WORDS_BIGENDIAN
    #define wxMBConvUTF32straight  wxMBConvUTF32BE
    #define wxMBConvUTF32swap      wxMBConvUTF32LE
#else
    #define wxMBConvUTF32swap      wxMBConvUTF32BE
    #define wxMBConvUTF32straight  wxMBConvUTF32LE
#endif


// the wxConvUTF32LE and wxConvUTF32BE converter objects
WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
1265
1266 /* static */
1267 size_t wxMBConvUTF32Base::GetLength(const char *src, size_t srcLen)
1268 {
1269     if ( srcLen == wxNO_LEN )
1270     {
1271         // count the number of bytes in input, including the trailing NULs
1272         const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1273         for ( srcLen = 1; *inBuff++; srcLen++ )
1274             ;
1275
1276         srcLen *= BYTES_PER_CHAR;
1277     }
1278     else // we already have the length
1279     {
1280         // we can only convert an entire number of UTF-32 characters
1281         if ( srcLen % BYTES_PER_CHAR )
1282             return wxCONV_FAILED;
1283     }
1284
1285     return srcLen;
1286 }
1287
1288 // case when in-memory representation is UTF-16
1289 #ifdef WC_UTF16
1290
1291 // ----------------------------------------------------------------------------
1292 // conversions without endianness change
1293 // ----------------------------------------------------------------------------
1294
1295 size_t
1296 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1297                                const char *src, size_t srcLen) const
1298 {
1299     srcLen = GetLength(src, srcLen);
1300     if ( srcLen == wxNO_LEN )
1301         return wxCONV_FAILED;
1302
1303     const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1304     const size_t inLen = srcLen / BYTES_PER_CHAR;
1305     size_t outLen = 0;
1306     for ( size_t n = 0; n < inLen; n++ )
1307     {
1308         wxUint16 cc[2];
1309         const size_t numChars = encode_utf16(*inBuff++, cc);
1310         if ( numChars == wxCONV_FAILED )
1311             return wxCONV_FAILED;
1312
1313         outLen += numChars;
1314         if ( dst )
1315         {
1316             if ( outLen > dstLen )
1317                 return wxCONV_FAILED;
1318
1319             *dst++ = cc[0];
1320             if ( numChars == 2 )
1321             {
1322                 // second character of a surrogate
1323                 *dst++ = cc[1];
1324             }
1325         }
1326     }
1327
1328     return outLen;
1329 }
1330
1331 size_t
1332 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1333                                  const wchar_t *src, size_t srcLen) const
1334 {
1335     if ( srcLen == wxNO_LEN )
1336         srcLen = wxWcslen(src) + 1;
1337
1338     if ( !dst )
1339     {
1340         // optimization: return maximal space which could be needed for this
1341         // string instead of the exact amount which could be less if there are
1342         // any surrogates in the input
1343         //
1344         // we consider that surrogates are rare enough to make it worthwhile to
1345         // avoid running the loop below at the cost of slightly extra memory
1346         // consumption
1347         return srcLen * BYTES_PER_CHAR;
1348     }
1349
1350     wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1351     size_t outLen = 0;
1352     for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1353     {
1354         const wxUint32 ch = wxDecodeSurrogate(&src);
1355         if ( !src )
1356             return wxCONV_FAILED;
1357
1358         outLen += BYTES_PER_CHAR;
1359
1360         if ( outLen > dstLen )
1361             return wxCONV_FAILED;
1362
1363         *outBuff++ = ch;
1364     }
1365
1366     return outLen;
1367 }
1368
1369 // ----------------------------------------------------------------------------
1370 // endian-reversing conversions
1371 // ----------------------------------------------------------------------------
1372
1373 size_t
1374 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1375                            const char *src, size_t srcLen) const
1376 {
1377     srcLen = GetLength(src, srcLen);
1378     if ( srcLen == wxNO_LEN )
1379         return wxCONV_FAILED;
1380
1381     const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1382     const size_t inLen = srcLen / BYTES_PER_CHAR;
1383     size_t outLen = 0;
1384     for ( size_t n = 0; n < inLen; n++, inBuff++ )
1385     {
1386         wxUint16 cc[2];
1387         const size_t numChars = encode_utf16(wxUINT32_SWAP_ALWAYS(*inBuff), cc);
1388         if ( numChars == wxCONV_FAILED )
1389             return wxCONV_FAILED;
1390
1391         outLen += numChars;
1392         if ( dst )
1393         {
1394             if ( outLen > dstLen )
1395                 return wxCONV_FAILED;
1396
1397             *dst++ = cc[0];
1398             if ( numChars == 2 )
1399             {
1400                 // second character of a surrogate
1401                 *dst++ = cc[1];
1402             }
1403         }
1404     }
1405
1406     return outLen;
1407 }
1408
1409 size_t
1410 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1411                              const wchar_t *src, size_t srcLen) const
1412 {
1413     if ( srcLen == wxNO_LEN )
1414         srcLen = wxWcslen(src) + 1;
1415
1416     if ( !dst )
1417     {
1418         // optimization: return maximal space which could be needed for this
1419         // string instead of the exact amount which could be less if there are
1420         // any surrogates in the input
1421         //
1422         // we consider that surrogates are rare enough to make it worthwhile to
1423         // avoid running the loop below at the cost of slightly extra memory
1424         // consumption
1425         return srcLen*BYTES_PER_CHAR;
1426     }
1427
1428     wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1429     size_t outLen = 0;
1430     for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1431     {
1432         const wxUint32 ch = wxDecodeSurrogate(&src);
1433         if ( !src )
1434             return wxCONV_FAILED;
1435
1436         outLen += BYTES_PER_CHAR;
1437
1438         if ( outLen > dstLen )
1439             return wxCONV_FAILED;
1440
1441         *outBuff++ = wxUINT32_SWAP_ALWAYS(ch);
1442     }
1443
1444     return outLen;
1445 }
1446
1447 #else // !WC_UTF16: wchar_t is UTF-32
1448
1449 // ----------------------------------------------------------------------------
1450 // conversions without endianness change
1451 // ----------------------------------------------------------------------------
1452
1453 size_t
1454 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1455                                const char *src, size_t srcLen) const
1456 {
1457     // use memcpy() as it should be much faster than hand-written loop
1458     srcLen = GetLength(src, srcLen);
1459     if ( srcLen == wxNO_LEN )
1460         return wxCONV_FAILED;
1461
1462     const size_t inLen = srcLen/BYTES_PER_CHAR;
1463     if ( dst )
1464     {
1465         if ( dstLen < inLen )
1466             return wxCONV_FAILED;
1467
1468         memcpy(dst, src, srcLen);
1469     }
1470
1471     return inLen;
1472 }
1473
1474 size_t
1475 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1476                                  const wchar_t *src, size_t srcLen) const
1477 {
1478     if ( srcLen == wxNO_LEN )
1479         srcLen = wxWcslen(src) + 1;
1480
1481     srcLen *= BYTES_PER_CHAR;
1482
1483     if ( dst )
1484     {
1485         if ( dstLen < srcLen )
1486             return wxCONV_FAILED;
1487
1488         memcpy(dst, src, srcLen);
1489     }
1490
1491     return srcLen;
1492 }
1493
1494 // ----------------------------------------------------------------------------
1495 // endian-reversing conversions
1496 // ----------------------------------------------------------------------------
1497
1498 size_t
1499 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1500                            const char *src, size_t srcLen) const
1501 {
1502     srcLen = GetLength(src, srcLen);
1503     if ( srcLen == wxNO_LEN )
1504         return wxCONV_FAILED;
1505
1506     srcLen /= BYTES_PER_CHAR;
1507
1508     if ( dst )
1509     {
1510         if ( dstLen < srcLen )
1511             return wxCONV_FAILED;
1512
1513         const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1514         for ( size_t n = 0; n < srcLen; n++, inBuff++ )
1515         {
1516             *dst++ = wxUINT32_SWAP_ALWAYS(*inBuff);
1517         }
1518     }
1519
1520     return srcLen;
1521 }
1522
1523 size_t
1524 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1525                              const wchar_t *src, size_t srcLen) const
1526 {
1527     if ( srcLen == wxNO_LEN )
1528         srcLen = wxWcslen(src) + 1;
1529
1530     srcLen *= BYTES_PER_CHAR;
1531
1532     if ( dst )
1533     {
1534         if ( dstLen < srcLen )
1535             return wxCONV_FAILED;
1536
1537         wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1538         for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
1539         {
1540             *outBuff++ = wxUINT32_SWAP_ALWAYS(*src);
1541         }
1542     }
1543
1544     return srcLen;
1545 }
1546
1547 #endif // WC_UTF16/!WC_UTF16
1548
1549
1550 // ============================================================================
1551 // The classes doing conversion using the iconv_xxx() functions
1552 // ============================================================================
1553
1554 #ifdef HAVE_ICONV
1555
1556 // VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
1557 //     E2BIG if output buffer is _exactly_ as big as needed. Such case is
1558 //     (unless there's yet another bug in glibc) the only case when iconv()
1559 //     returns with (size_t)-1 (which means error) and says there are 0 bytes
1560 //     left in the input buffer -- when _real_ error occurs,
1561 //     bytes-left-in-input buffer is non-zero. Hence, this alternative test for
1562 //     iconv() failure.
1563 //     [This bug does not appear in glibc 2.2.]
// ICONV_FAILED(cres, bufLeft): test whether an iconv() call failed, given its
// return value and the number of input bytes left unconverted. For glibc 2.0
// and 2.1 an E2BIG error with no input left is not treated as a failure.
#if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
#define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
                                     (errno != E2BIG || bufLeft != 0))
#else
#define ICONV_FAILED(cres, bufLeft)  (cres == (size_t)-1)
#endif

// cast a pointer to the input buffer pointer to the type used for the second
// argument of iconv() in this file (ICONV_CONST is not defined in the code
// shown here -- presumably by the build configuration)
#define ICONV_CHAR_CAST(x)  ((ICONV_CONST char **)(x))

// value used in this file to mark an iconv_t handle which is not open
#define ICONV_T_INVALID ((iconv_t)-1)

// WC_BSWAP swaps the bytes of one wchar_t and WC_ENC is the wxFontEncoding
// constant selected for the size of wchar_t
#if SIZEOF_WCHAR_T == 4
    #define WC_BSWAP    wxUINT32_SWAP_ALWAYS
    #define WC_ENC      wxFONTENCODING_UTF32
#elif SIZEOF_WCHAR_T == 2
    #define WC_BSWAP    wxUINT16_SWAP_ALWAYS
    #define WC_ENC      wxFONTENCODING_UTF16
#else // sizeof(wchar_t) != 2 nor 4
    // does this ever happen?
    #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
#endif
1585
1586 // ----------------------------------------------------------------------------
1587 // wxMBConv_iconv: encapsulates an iconv character set
1588 // ----------------------------------------------------------------------------
1589
1590 class wxMBConv_iconv : public wxMBConv
1591 {
1592 public:
1593     wxMBConv_iconv(const wxChar *name);
1594     virtual ~wxMBConv_iconv();
1595
1596     virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const;
1597     virtual size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const;
1598
1599     // classify this encoding as explained in wxMBConv::GetMBNulLen() comment
1600     virtual size_t GetMBNulLen() const;
1601
1602     virtual wxMBConv *Clone() const
1603     {
1604         wxMBConv_iconv *p = new wxMBConv_iconv(m_name);
1605         p->m_minMBCharWidth = m_minMBCharWidth;
1606         return p;
1607     }
1608
1609     bool IsOk() const
1610         { return (m2w != ICONV_T_INVALID) && (w2m != ICONV_T_INVALID); }
1611
1612 protected:
1613     // the iconv handlers used to translate from multibyte
1614     // to wide char and in the other direction
1615     iconv_t m2w,
1616             w2m;
1617
1618 #if wxUSE_THREADS
1619     // guards access to m2w and w2m objects
1620     wxMutex m_iconvMutex;
1621 #endif
1622
1623 private:
1624     // the name (for iconv_open()) of a wide char charset -- if none is
1625     // available on this machine, it will remain NULL
1626     static wxString ms_wcCharsetName;
1627
1628     // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
1629     // different endian-ness than the native one
1630     static bool ms_wcNeedsSwap;
1631
1632
1633     // name of the encoding handled by this conversion
1634     wxString m_name;
1635
1636     // cached result of GetMBNulLen(); set to 0 meaning "unknown"
1637     // initially
1638     size_t m_minMBCharWidth;
1639 };
1640
1641 // make the constructor available for unit testing
1642 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_iconv( const wxChar* name )
1643 {
1644     wxMBConv_iconv* result = new wxMBConv_iconv( name );
1645     if ( !result->IsOk() )
1646     {
1647         delete result;
1648         return 0;
1649     }
1650
1651     return result;
1652 }
1653
// definitions of the static members: the wide char charset name starts out
// empty and is filled in by the constructor the first time a usable charset
// is found, together with the flag telling whether its output must be
// byte-swapped
wxString wxMBConv_iconv::ms_wcCharsetName;
bool wxMBConv_iconv::ms_wcNeedsSwap = false;
1656
1657 wxMBConv_iconv::wxMBConv_iconv(const wxChar *name)
1658               : m_name(name)
1659 {
1660     m_minMBCharWidth = 0;
1661
1662     // iconv operates with chars, not wxChars, but luckily it uses only ASCII
1663     // names for the charsets
1664     const wxCharBuffer cname(wxString(name).ToAscii());
1665
1666     // check for charset that represents wchar_t:
1667     if ( ms_wcCharsetName.empty() )
1668     {
1669         wxLogTrace(TRACE_STRCONV, _T("Looking for wide char codeset:"));
1670
1671 #if wxUSE_FONTMAP
1672         const wxChar **names = wxFontMapperBase::GetAllEncodingNames(WC_ENC);
1673 #else // !wxUSE_FONTMAP
1674         static const wxChar *names[] =
1675         {
1676 #if SIZEOF_WCHAR_T == 4
1677             _T("UCS-4"),
1678 #elif SIZEOF_WCHAR_T = 2
1679             _T("UCS-2"),
1680 #endif
1681             NULL
1682         };
1683 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
1684
1685         for ( ; *names && ms_wcCharsetName.empty(); ++names )
1686         {
1687             const wxString nameCS(*names);
1688
1689             // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
1690             wxString nameXE(nameCS);
1691
1692 #ifdef WORDS_BIGENDIAN
1693                 nameXE += _T("BE");
1694 #else // little endian
1695                 nameXE += _T("LE");
1696 #endif
1697
1698             wxLogTrace(TRACE_STRCONV, _T("  trying charset \"%s\""),
1699                        nameXE.c_str());
1700
1701             m2w = iconv_open(nameXE.ToAscii(), cname);
1702             if ( m2w == ICONV_T_INVALID )
1703             {
1704                 // try charset w/o bytesex info (e.g. "UCS4")
1705                 wxLogTrace(TRACE_STRCONV, _T("  trying charset \"%s\""),
1706                            nameCS.c_str());
1707                 m2w = iconv_open(nameCS.ToAscii(), cname);
1708
1709                 // and check for bytesex ourselves:
1710                 if ( m2w != ICONV_T_INVALID )
1711                 {
1712                     char    buf[2], *bufPtr;
1713                     wchar_t wbuf[2], *wbufPtr;
1714                     size_t  insz, outsz;
1715                     size_t  res;
1716
1717                     buf[0] = 'A';
1718                     buf[1] = 0;
1719                     wbuf[0] = 0;
1720                     insz = 2;
1721                     outsz = SIZEOF_WCHAR_T * 2;
1722                     wbufPtr = wbuf;
1723                     bufPtr = buf;
1724
1725                     res = iconv(
1726                         m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
1727                         (char**)&wbufPtr, &outsz);
1728
1729                     if (ICONV_FAILED(res, insz))
1730                     {
1731                         wxLogLastError(wxT("iconv"));
1732                         wxLogError(_("Conversion to charset '%s' doesn't work."),
1733                                    nameCS.c_str());
1734                     }
1735                     else // ok, can convert to this encoding, remember it
1736                     {
1737                         ms_wcCharsetName = nameCS;
1738                         ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
1739                     }
1740                 }
1741             }
1742             else // use charset not requiring byte swapping
1743             {
1744                 ms_wcCharsetName = nameXE;
1745             }
1746         }
1747
1748         wxLogTrace(TRACE_STRCONV,
1749                    wxT("iconv wchar_t charset is \"%s\"%s"),
1750                    ms_wcCharsetName.empty() ? _T("<none>")
1751                                             : ms_wcCharsetName.c_str(),
1752                    ms_wcNeedsSwap ? _T(" (needs swap)")
1753                                   : _T(""));
1754     }
1755     else // we already have ms_wcCharsetName
1756     {
1757         m2w = iconv_open(ms_wcCharsetName.ToAscii(), cname);
1758     }
1759
1760     if ( ms_wcCharsetName.empty() )
1761     {
1762         w2m = ICONV_T_INVALID;
1763     }
1764     else
1765     {
1766         w2m = iconv_open(cname, ms_wcCharsetName.ToAscii());
1767         if ( w2m == ICONV_T_INVALID )
1768         {
1769             wxLogTrace(TRACE_STRCONV,
1770                        wxT("\"%s\" -> \"%s\" works but not the converse!?"),
1771                        ms_wcCharsetName.c_str(), cname.data());
1772         }
1773     }
1774 }
1775
1776 wxMBConv_iconv::~wxMBConv_iconv()
1777 {
1778     if ( m2w != ICONV_T_INVALID )
1779         iconv_close(m2w);
1780     if ( w2m != ICONV_T_INVALID )
1781         iconv_close(w2m);
1782 }
1783
1784 size_t wxMBConv_iconv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1785 {
1786     // find the string length: notice that must be done differently for
1787     // NUL-terminated strings and UTF-16/32 which are terminated with 2/4 NULs
1788     size_t inbuf;
1789     const size_t nulLen = GetMBNulLen();
1790     switch ( nulLen )
1791     {
1792         default:
1793             return wxCONV_FAILED;
1794
1795         case 1:
1796             inbuf = strlen(psz); // arguably more optimized than our version
1797             break;
1798
1799         case 2:
1800         case 4:
1801             // for UTF-16/32 not only we need to have 2/4 consecutive NULs but
1802             // they also have to start at character boundary and not span two
1803             // adjacent characters
1804             const char *p;
1805             for ( p = psz; NotAllNULs(p, nulLen); p += nulLen )
1806                 ;
1807             inbuf = p - psz;
1808             break;
1809     }
1810
1811 #if wxUSE_THREADS
1812     // NB: iconv() is MT-safe, but each thread must use it's own iconv_t handle.
1813     //     Unfortunately there is a couple of global wxCSConv objects such as
1814     //     wxConvLocal that are used all over wx code, so we have to make sure
1815     //     the handle is used by at most one thread at the time. Otherwise
1816     //     only a few wx classes would be safe to use from non-main threads
1817     //     as MB<->WC conversion would fail "randomly".
1818     wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1819 #endif // wxUSE_THREADS
1820
1821     size_t outbuf = n * SIZEOF_WCHAR_T;
1822     size_t res, cres;
1823     // VS: Use these instead of psz, buf because iconv() modifies its arguments:
1824     wchar_t *bufPtr = buf;
1825     const char *pszPtr = psz;
1826
1827     if (buf)
1828     {
1829         // have destination buffer, convert there
1830         cres = iconv(m2w,
1831                      ICONV_CHAR_CAST(&pszPtr), &inbuf,
1832                      (char**)&bufPtr, &outbuf);
1833         res = n - (outbuf / SIZEOF_WCHAR_T);
1834
1835         if (ms_wcNeedsSwap)
1836         {
1837             // convert to native endianness
1838             for ( unsigned i = 0; i < res; i++ )
1839                 buf[n] = WC_BSWAP(buf[i]);
1840         }
1841
1842         // NUL-terminate the string if there is any space left
1843         if (res < n)
1844             buf[res] = 0;
1845     }
1846     else
1847     {
1848         // no destination buffer... convert using temp buffer
1849         // to calculate destination buffer requirement
1850         wchar_t tbuf[8];
1851         res = 0;
1852
1853         do
1854         {
1855             bufPtr = tbuf;
1856             outbuf = 8 * SIZEOF_WCHAR_T;
1857
1858             cres = iconv(m2w,
1859                          ICONV_CHAR_CAST(&pszPtr), &inbuf,
1860                          (char**)&bufPtr, &outbuf );
1861
1862             res += 8 - (outbuf / SIZEOF_WCHAR_T);
1863         }
1864         while ((cres == (size_t)-1) && (errno == E2BIG));
1865     }
1866
1867     if (ICONV_FAILED(cres, inbuf))
1868     {
1869         //VS: it is ok if iconv fails, hence trace only
1870         wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1871         return wxCONV_FAILED;
1872     }
1873
1874     return res;
1875 }
1876
1877 size_t wxMBConv_iconv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1878 {
1879 #if wxUSE_THREADS
1880     // NB: explained in MB2WC
1881     wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1882 #endif
1883
1884     size_t inlen = wxWcslen(psz);
1885     size_t inbuf = inlen * SIZEOF_WCHAR_T;
1886     size_t outbuf = n;
1887     size_t res, cres;
1888
1889     wchar_t *tmpbuf = 0;
1890
1891     if (ms_wcNeedsSwap)
1892     {
1893         // need to copy to temp buffer to switch endianness
1894         // (doing WC_BSWAP twice on the original buffer won't help, as it
1895         //  could be in read-only memory, or be accessed in some other thread)
1896         tmpbuf = (wchar_t *)malloc(inbuf + SIZEOF_WCHAR_T);
1897         for ( size_t i = 0; i < inlen; i++ )
1898             tmpbuf[n] = WC_BSWAP(psz[i]);
1899
1900         tmpbuf[inlen] = L'\0';
1901         psz = tmpbuf;
1902     }
1903
1904     if (buf)
1905     {
1906         // have destination buffer, convert there
1907         cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1908
1909         res = n - outbuf;
1910
1911         // NB: iconv was given only wcslen(psz) characters on input, and so
1912         //     it couldn't convert the trailing zero. Let's do it ourselves
1913         //     if there's some room left for it in the output buffer.
1914         if (res < n)
1915             buf[0] = 0;
1916     }
1917     else
1918     {
1919         // no destination buffer: convert using temp buffer
1920         // to calculate destination buffer requirement
1921         char tbuf[16];
1922         res = 0;
1923         do
1924         {
1925             buf = tbuf;
1926             outbuf = 16;
1927
1928             cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1929
1930             res += 16 - outbuf;
1931         }
1932         while ((cres == (size_t)-1) && (errno == E2BIG));
1933     }
1934
1935     if (ms_wcNeedsSwap)
1936     {
1937         free(tmpbuf);
1938     }
1939
1940     if (ICONV_FAILED(cres, inbuf))
1941     {
1942         wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1943         return wxCONV_FAILED;
1944     }
1945
1946     return res;
1947 }
1948
1949 size_t wxMBConv_iconv::GetMBNulLen() const
1950 {
1951     if ( m_minMBCharWidth == 0 )
1952     {
1953         wxMBConv_iconv * const self = wxConstCast(this, wxMBConv_iconv);
1954
1955 #if wxUSE_THREADS
1956         // NB: explained in MB2WC
1957         wxMutexLocker lock(self->m_iconvMutex);
1958 #endif
1959
1960         wchar_t *wnul = L"";
1961         char buf[8]; // should be enough for NUL in any encoding
1962         size_t inLen = sizeof(wchar_t),
1963                outLen = WXSIZEOF(buf);
1964         char *inBuff = (char *)wnul;
1965         char *outBuff = buf;
1966         if ( iconv(w2m, ICONV_CHAR_CAST(&inBuff), &inLen, &outBuff, &outLen) == (size_t)-1 )
1967         {
1968             self->m_minMBCharWidth = (size_t)-1;
1969         }
1970         else // ok
1971         {
1972             self->m_minMBCharWidth = outBuff - buf;
1973         }
1974     }
1975
1976     return m_minMBCharWidth;
1977 }
1978
1979 #endif // HAVE_ICONV
1980
1981
1982 // ============================================================================
1983 // Win32 conversion classes
1984 // ============================================================================
1985
1986 #ifdef wxHAVE_WIN32_MB2WC
1987
1988 // from utils.cpp
1989 #if wxUSE_FONTMAP
1990 extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const wxChar *charset);
1991 extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
1992 #endif
1993
1994 class wxMBConv_win32 : public wxMBConv
1995 {
1996 public:
1997     wxMBConv_win32()
1998     {
1999         m_CodePage = CP_ACP;
2000         m_minMBCharWidth = 0;
2001     }
2002
2003     wxMBConv_win32(const wxMBConv_win32& conv)
2004     {
2005         m_CodePage = conv.m_CodePage;
2006         m_minMBCharWidth = conv.m_minMBCharWidth;
2007     }
2008
2009 #if wxUSE_FONTMAP
2010     wxMBConv_win32(const wxChar* name)
2011     {
2012         m_CodePage = wxCharsetToCodepage(name);
2013         m_minMBCharWidth = 0;
2014     }
2015
2016     wxMBConv_win32(wxFontEncoding encoding)
2017     {
2018         m_CodePage = wxEncodingToCodepage(encoding);
2019         m_minMBCharWidth = 0;
2020     }
2021 #endif // wxUSE_FONTMAP
2022
2023     virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2024     {
2025         // note that we have to use MB_ERR_INVALID_CHARS flag as it without it
2026         // the behaviour is not compatible with the Unix version (using iconv)
2027         // and break the library itself, e.g. wxTextInputStream::NextChar()
2028         // wouldn't work if reading an incomplete MB char didn't result in an
2029         // error
2030         //
2031         // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
2032         // Win XP or newer and it is not supported for UTF-[78] so we always
2033         // use our own conversions in this case. See
2034         //     http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
2035         //     http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
2036         if ( m_CodePage == CP_UTF8 )
2037         {
2038             return wxConvUTF8.MB2WC(buf, psz, n);
2039         }
2040
2041         if ( m_CodePage == CP_UTF7 )
2042         {
2043             return wxConvUTF7.MB2WC(buf, psz, n);
2044         }
2045
2046         int flags = 0;
2047         if ( (m_CodePage < 50000 && m_CodePage != CP_SYMBOL) &&
2048                 IsAtLeastWin2kSP4() )
2049         {
2050             flags = MB_ERR_INVALID_CHARS;
2051         }
2052
2053         const size_t len = ::MultiByteToWideChar
2054                              (
2055                                 m_CodePage,     // code page
2056                                 flags,          // flags: fall on error
2057                                 psz,            // input string
2058                                 -1,             // its length (NUL-terminated)
2059                                 buf,            // output string
2060                                 buf ? n : 0     // size of output buffer
2061                              );
2062         if ( !len )
2063         {
2064             // function totally failed
2065             return wxCONV_FAILED;
2066         }
2067
2068         // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
2069         // check if we succeeded, by doing a double trip:
2070         if ( !flags && buf )
2071         {
2072             const size_t mbLen = strlen(psz);
2073             wxCharBuffer mbBuf(mbLen);
2074             if ( ::WideCharToMultiByte
2075                    (
2076                       m_CodePage,
2077                       0,
2078                       buf,
2079                       -1,
2080                       mbBuf.data(),
2081                       mbLen + 1,        // size in bytes, not length
2082                       NULL,
2083                       NULL
2084                    ) == 0 ||
2085                   strcmp(mbBuf, psz) != 0 )
2086             {
2087                 // we didn't obtain the same thing we started from, hence
2088                 // the conversion was lossy and we consider that it failed
2089                 return wxCONV_FAILED;
2090             }
2091         }
2092
2093         // note that it returns count of written chars for buf != NULL and size
2094         // of the needed buffer for buf == NULL so in either case the length of
2095         // the string (which never includes the terminating NUL) is one less
2096         return len - 1;
2097     }
2098
2099     virtual size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
2100     {
2101         /*
2102             we have a problem here: by default, WideCharToMultiByte() may
2103             replace characters unrepresentable in the target code page with bad
2104             quality approximations such as turning "1/2" symbol (U+00BD) into
2105             "1" for the code pages which don't have it and we, obviously, want
2106             to avoid this at any price
2107
2108             the trouble is that this function does it _silently_, i.e. it won't
2109             even tell us whether it did or not... Win98/2000 and higher provide
2110             WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
2111             we have to resort to a round trip, i.e. check that converting back
2112             results in the same string -- this is, of course, expensive but
2113             otherwise we simply can't be sure to not garble the data.
2114          */
2115
2116         // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
2117         // it doesn't work with CJK encodings (which we test for rather roughly
2118         // here...) nor with UTF-7/8 nor, of course, with Windows versions not
2119         // supporting it
2120         BOOL usedDef wxDUMMY_INITIALIZE(false);
2121         BOOL *pUsedDef;
2122         int flags;
2123         if ( CanUseNoBestFit() && m_CodePage < 50000 )
2124         {
2125             // it's our lucky day
2126             flags = WC_NO_BEST_FIT_CHARS;
2127             pUsedDef = &usedDef;
2128         }
2129         else // old system or unsupported encoding
2130         {
2131             flags = 0;
2132             pUsedDef = NULL;
2133         }
2134
2135         const size_t len = ::WideCharToMultiByte
2136                              (
2137                                 m_CodePage,     // code page
2138                                 flags,          // either none or no best fit
2139                                 pwz,            // input string
2140                                 -1,             // it is (wide) NUL-terminated
2141                                 buf,            // output buffer
2142                                 buf ? n : 0,    // and its size
2143                                 NULL,           // default "replacement" char
2144                                 pUsedDef        // [out] was it used?
2145                              );
2146
2147         if ( !len )
2148         {
2149             // function totally failed
2150             return wxCONV_FAILED;
2151         }
2152
2153         // if we were really converting, check if we succeeded
2154         if ( buf )
2155         {
2156             if ( flags )
2157             {
2158                 // check if the conversion failed, i.e. if any replacements
2159                 // were done
2160                 if ( usedDef )
2161                     return wxCONV_FAILED;
2162             }
2163             else // we must resort to double tripping...
2164             {
2165                 wxWCharBuffer wcBuf(n);
2166                 if ( MB2WC(wcBuf.data(), buf, n) == wxCONV_FAILED ||
2167                         wcscmp(wcBuf, pwz) != 0 )
2168                 {
2169                     // we didn't obtain the same thing we started from, hence
2170                     // the conversion was lossy and we consider that it failed
2171                     return wxCONV_FAILED;
2172                 }
2173             }
2174         }
2175
2176         // see the comment above for the reason of "len - 1"
2177         return len - 1;
2178     }
2179
2180     virtual size_t GetMBNulLen() const
2181     {
2182         if ( m_minMBCharWidth == 0 )
2183         {
2184             int len = ::WideCharToMultiByte
2185                         (
2186                             m_CodePage,     // code page
2187                             0,              // no flags
2188                             L"",            // input string
2189                             1,              // translate just the NUL
2190                             NULL,           // output buffer
2191                             0,              // and its size
2192                             NULL,           // no replacement char
2193                             NULL            // [out] don't care if it was used
2194                         );
2195
2196             wxMBConv_win32 * const self = wxConstCast(this, wxMBConv_win32);
2197             switch ( len )
2198             {
2199                 default:
2200                     wxLogDebug(_T("Unexpected NUL length %d"), len);
2201                     self->m_minMBCharWidth = (size_t)-1;
2202                     break;
2203
2204                 case 0:
2205                     self->m_minMBCharWidth = (size_t)-1;
2206                     break;
2207
2208                 case 1:
2209                 case 2:
2210                 case 4:
2211                     self->m_minMBCharWidth = len;
2212                     break;
2213             }
2214         }
2215
2216         return m_minMBCharWidth;
2217     }
2218
2219     virtual wxMBConv *Clone() const { return new wxMBConv_win32(*this); }
2220
2221     bool IsOk() const { return m_CodePage != -1; }
2222
2223 private:
2224     static bool CanUseNoBestFit()
2225     {
2226         static int s_isWin98Or2k = -1;
2227
2228         if ( s_isWin98Or2k == -1 )
2229         {
2230             int verMaj, verMin;
2231             switch ( wxGetOsVersion(&verMaj, &verMin) )
2232             {
2233                 case wxWIN95:
2234                     s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
2235                     break;
2236
2237                 case wxWINDOWS_NT:
2238                     s_isWin98Or2k = verMaj >= 5;
2239                     break;
2240
2241                 default:
2242                     // unknown: be conservative by default
2243                     s_isWin98Or2k = 0;
2244                     break;
2245             }
2246
2247             wxASSERT_MSG( s_isWin98Or2k != -1, _T("should be set above") );
2248         }
2249
2250         return s_isWin98Or2k == 1;
2251     }
2252
2253     static bool IsAtLeastWin2kSP4()
2254     {
2255 #ifdef __WXWINCE__
2256         return false;
2257 #else
2258         static int s_isAtLeastWin2kSP4 = -1;
2259
2260         if ( s_isAtLeastWin2kSP4 == -1 )
2261         {
2262             OSVERSIONINFOEX ver;
2263
2264             memset(&ver, 0, sizeof(ver));
2265             ver.dwOSVersionInfoSize = sizeof(ver);
2266             GetVersionEx((OSVERSIONINFO*)&ver);
2267
2268             s_isAtLeastWin2kSP4 =
2269               ((ver.dwMajorVersion > 5) || // Vista+
2270                (ver.dwMajorVersion == 5 && ver.dwMinorVersion > 0) || // XP/2003
2271                (ver.dwMajorVersion == 5 && ver.dwMinorVersion == 0 &&
2272                ver.wServicePackMajor >= 4)) // 2000 SP4+
2273               ? 1 : 0;
2274         }
2275
2276         return s_isAtLeastWin2kSP4 == 1;
2277 #endif
2278     }
2279
2280
2281     // the code page we're working with
2282     long m_CodePage;
2283
2284     // cached result of GetMBNulLen(), set to 0 initially meaning
2285     // "unknown"
2286     size_t m_minMBCharWidth;
2287 };
2288
2289 #endif // wxHAVE_WIN32_MB2WC
2290
2291 // ============================================================================
2292 // Cocoa conversion classes
2293 // ============================================================================
2294
2295 #if defined(__WXCOCOA__)
2296
// RN: There is no UTF-32 support in either Core Foundation or Cocoa.
// Strangely enough, Core Foundation uses UTF-32 internally quite a
// bit - it's just not public (yet).
2300
2301 #include <CoreFoundation/CFString.h>
2302 #include <CoreFoundation/CFStringEncodingExt.h>
2303
2304 CFStringEncoding wxCFStringEncFromFontEnc(wxFontEncoding encoding)
2305 {
2306     CFStringEncoding enc = kCFStringEncodingInvalidId ;
2307
2308     switch (encoding)
2309     {
2310         case wxFONTENCODING_DEFAULT :
2311             enc = CFStringGetSystemEncoding();
2312             break ;
2313
2314         case wxFONTENCODING_ISO8859_1 :
2315             enc = kCFStringEncodingISOLatin1 ;
2316             break ;
2317         case wxFONTENCODING_ISO8859_2 :
2318             enc = kCFStringEncodingISOLatin2;
2319             break ;
2320         case wxFONTENCODING_ISO8859_3 :
2321             enc = kCFStringEncodingISOLatin3 ;
2322             break ;
2323         case wxFONTENCODING_ISO8859_4 :
2324             enc = kCFStringEncodingISOLatin4;
2325             break ;
2326         case wxFONTENCODING_ISO8859_5 :
2327             enc = kCFStringEncodingISOLatinCyrillic;
2328             break ;
2329         case wxFONTENCODING_ISO8859_6 :
2330             enc = kCFStringEncodingISOLatinArabic;
2331             break ;
2332         case wxFONTENCODING_ISO8859_7 :
2333             enc = kCFStringEncodingISOLatinGreek;
2334             break ;
2335         case wxFONTENCODING_ISO8859_8 :
2336             enc = kCFStringEncodingISOLatinHebrew;
2337             break ;
2338         case wxFONTENCODING_ISO8859_9 :
2339             enc = kCFStringEncodingISOLatin5;
2340             break ;
2341         case wxFONTENCODING_ISO8859_10 :
2342             enc = kCFStringEncodingISOLatin6;
2343             break ;
2344         case wxFONTENCODING_ISO8859_11 :
2345             enc = kCFStringEncodingISOLatinThai;
2346             break ;
2347         case wxFONTENCODING_ISO8859_13 :
2348             enc = kCFStringEncodingISOLatin7;
2349             break ;
2350         case wxFONTENCODING_ISO8859_14 :
2351             enc = kCFStringEncodingISOLatin8;
2352             break ;
2353         case wxFONTENCODING_ISO8859_15 :
2354             enc = kCFStringEncodingISOLatin9;
2355             break ;
2356
2357         case wxFONTENCODING_KOI8 :
2358             enc = kCFStringEncodingKOI8_R;
2359             break ;
2360         case wxFONTENCODING_ALTERNATIVE : // MS-DOS CP866
2361             enc = kCFStringEncodingDOSRussian;
2362             break ;
2363
2364 //      case wxFONTENCODING_BULGARIAN :
2365 //          enc = ;
2366 //          break ;
2367
2368         case wxFONTENCODING_CP437 :
2369             enc = kCFStringEncodingDOSLatinUS ;
2370             break ;
2371         case wxFONTENCODING_CP850 :
2372             enc = kCFStringEncodingDOSLatin1;
2373             break ;
2374         case wxFONTENCODING_CP852 :
2375             enc = kCFStringEncodingDOSLatin2;
2376             break ;
2377         case wxFONTENCODING_CP855 :
2378             enc = kCFStringEncodingDOSCyrillic;
2379             break ;
2380         case wxFONTENCODING_CP866 :
2381             enc = kCFStringEncodingDOSRussian ;
2382             break ;
2383         case wxFONTENCODING_CP874 :
2384             enc = kCFStringEncodingDOSThai;
2385             break ;
2386         case wxFONTENCODING_CP932 :
2387             enc = kCFStringEncodingDOSJapanese;
2388             break ;
2389         case wxFONTENCODING_CP936 :
2390             enc = kCFStringEncodingDOSChineseSimplif ;
2391             break ;
2392         case wxFONTENCODING_CP949 :
2393             enc = kCFStringEncodingDOSKorean;
2394             break ;
2395         case wxFONTENCODING_CP950 :
2396             enc = kCFStringEncodingDOSChineseTrad;
2397             break ;
2398         case wxFONTENCODING_CP1250 :
2399             enc = kCFStringEncodingWindowsLatin2;
2400             break ;
2401         case wxFONTENCODING_CP1251 :
2402             enc = kCFStringEncodingWindowsCyrillic ;
2403             break ;
2404         case wxFONTENCODING_CP1252 :
2405             enc = kCFStringEncodingWindowsLatin1 ;
2406             break ;
2407         case wxFONTENCODING_CP1253 :
2408             enc = kCFStringEncodingWindowsGreek;
2409             break ;
2410         case wxFONTENCODING_CP1254 :
2411             enc = kCFStringEncodingWindowsLatin5;
2412             break ;
2413         case wxFONTENCODING_CP1255 :
2414             enc = kCFStringEncodingWindowsHebrew ;
2415             break ;
2416         case wxFONTENCODING_CP1256 :
2417             enc = kCFStringEncodingWindowsArabic ;
2418             break ;
2419         case wxFONTENCODING_CP1257 :
2420             enc = kCFStringEncodingWindowsBalticRim;
2421             break ;
2422 //   This only really encodes to UTF7 (if that) evidently
2423 //        case wxFONTENCODING_UTF7 :
2424 //            enc = kCFStringEncodingNonLossyASCII ;
2425 //            break ;
2426         case wxFONTENCODING_UTF8 :
2427             enc = kCFStringEncodingUTF8 ;
2428             break ;
2429         case wxFONTENCODING_EUC_JP :
2430             enc = kCFStringEncodingEUC_JP;
2431             break ;
2432         case wxFONTENCODING_UTF16 :
2433             enc = kCFStringEncodingUnicode ;
2434             break ;
2435         case wxFONTENCODING_MACROMAN :
2436             enc = kCFStringEncodingMacRoman ;
2437             break ;
2438         case wxFONTENCODING_MACJAPANESE :
2439             enc = kCFStringEncodingMacJapanese ;
2440             break ;
2441         case wxFONTENCODING_MACCHINESETRAD :
2442             enc = kCFStringEncodingMacChineseTrad ;
2443             break ;
2444         case wxFONTENCODING_MACKOREAN :
2445             enc = kCFStringEncodingMacKorean ;
2446             break ;
2447         case wxFONTENCODING_MACARABIC :
2448             enc = kCFStringEncodingMacArabic ;
2449             break ;
2450         case wxFONTENCODING_MACHEBREW :
2451             enc = kCFStringEncodingMacHebrew ;
2452             break ;
2453         case wxFONTENCODING_MACGREEK :
2454             enc = kCFStringEncodingMacGreek ;
2455             break ;
2456         case wxFONTENCODING_MACCYRILLIC :
2457             enc = kCFStringEncodingMacCyrillic ;
2458             break ;
2459         case wxFONTENCODING_MACDEVANAGARI :
2460             enc = kCFStringEncodingMacDevanagari ;
2461             break ;
2462         case wxFONTENCODING_MACGURMUKHI :
2463             enc = kCFStringEncodingMacGurmukhi ;
2464             break ;
2465         case wxFONTENCODING_MACGUJARATI :
2466             enc = kCFStringEncodingMacGujarati ;
2467             break ;
2468         case wxFONTENCODING_MACORIYA :
2469             enc = kCFStringEncodingMacOriya ;
2470             break ;
2471         case wxFONTENCODING_MACBENGALI :
2472             enc = kCFStringEncodingMacBengali ;
2473             break ;
2474         case wxFONTENCODING_MACTAMIL :
2475             enc = kCFStringEncodingMacTamil ;
2476             break ;
2477         case wxFONTENCODING_MACTELUGU :
2478             enc = kCFStringEncodingMacTelugu ;
2479             break ;
2480         case wxFONTENCODING_MACKANNADA :
2481             enc = kCFStringEncodingMacKannada ;
2482             break ;
2483         case wxFONTENCODING_MACMALAJALAM :
2484             enc = kCFStringEncodingMacMalayalam ;
2485             break ;
2486         case wxFONTENCODING_MACSINHALESE :
2487             enc = kCFStringEncodingMacSinhalese ;
2488             break ;
2489         case wxFONTENCODING_MACBURMESE :
2490             enc = kCFStringEncodingMacBurmese ;
2491             break ;
2492         case wxFONTENCODING_MACKHMER :
2493             enc = kCFStringEncodingMacKhmer ;
2494             break ;
2495         case wxFONTENCODING_MACTHAI :
2496             enc = kCFStringEncodingMacThai ;
2497             break ;
2498         case wxFONTENCODING_MACLAOTIAN :
2499             enc = kCFStringEncodingMacLaotian ;
2500             break ;
2501         case wxFONTENCODING_MACGEORGIAN :
2502             enc = kCFStringEncodingMacGeorgian ;
2503             break ;
2504         case wxFONTENCODING_MACARMENIAN :
2505             enc = kCFStringEncodingMacArmenian ;
2506             break ;
2507         case wxFONTENCODING_MACCHINESESIMP :
2508             enc = kCFStringEncodingMacChineseSimp ;
2509             break ;
2510         case wxFONTENCODING_MACTIBETAN :
2511             enc = kCFStringEncodingMacTibetan ;
2512             break ;
2513         case wxFONTENCODING_MACMONGOLIAN :
2514             enc = kCFStringEncodingMacMongolian ;
2515             break ;
2516         case wxFONTENCODING_MACETHIOPIC :
2517             enc = kCFStringEncodingMacEthiopic ;
2518             break ;
2519         case wxFONTENCODING_MACCENTRALEUR :
2520             enc = kCFStringEncodingMacCentralEurRoman ;
2521             break ;
2522         case wxFONTENCODING_MACVIATNAMESE :
2523             enc = kCFStringEncodingMacVietnamese ;
2524             break ;
2525         case wxFONTENCODING_MACARABICEXT :
2526             enc = kCFStringEncodingMacExtArabic ;
2527             break ;
2528         case wxFONTENCODING_MACSYMBOL :
2529             enc = kCFStringEncodingMacSymbol ;
2530             break ;
2531         case wxFONTENCODING_MACDINGBATS :
2532             enc = kCFStringEncodingMacDingbats ;
2533             break ;
2534         case wxFONTENCODING_MACTURKISH :
2535             enc = kCFStringEncodingMacTurkish ;
2536             break ;
2537         case wxFONTENCODING_MACCROATIAN :
2538             enc = kCFStringEncodingMacCroatian ;
2539             break ;
2540         case wxFONTENCODING_MACICELANDIC :
2541             enc = kCFStringEncodingMacIcelandic ;
2542             break ;
2543         case wxFONTENCODING_MACROMANIAN :
2544             enc = kCFStringEncodingMacRomanian ;
2545             break ;
2546         case wxFONTENCODING_MACCELTIC :
2547             enc = kCFStringEncodingMacCeltic ;
2548             break ;
2549         case wxFONTENCODING_MACGAELIC :
2550             enc = kCFStringEncodingMacGaelic ;
2551             break ;
2552 //      case wxFONTENCODING_MACKEYBOARD :
2553 //          enc = kCFStringEncodingMacKeyboardGlyphs ;
2554 //          break ;
2555
2556         default :
2557             // because gcc is picky
2558             break ;
2559     }
2560
2561     return enc ;
2562 }
2563
2564 class wxMBConv_cocoa : public wxMBConv
2565 {
2566 public:
2567     wxMBConv_cocoa()
2568     {
2569         Init(CFStringGetSystemEncoding()) ;
2570     }
2571
2572     wxMBConv_cocoa(const wxMBConv_cocoa& conv)
2573     {
2574         m_encoding = conv.m_encoding;
2575     }
2576
2577 #if wxUSE_FONTMAP
2578     wxMBConv_cocoa(const wxChar* name)
2579     {
2580         Init( wxCFStringEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) ) ;
2581     }
2582 #endif
2583
2584     wxMBConv_cocoa(wxFontEncoding encoding)
2585     {
2586         Init( wxCFStringEncFromFontEnc(encoding) );
2587     }
2588
2589     ~wxMBConv_cocoa()
2590     {
2591     }
2592
2593     void Init( CFStringEncoding encoding)
2594     {
2595         m_encoding = encoding ;
2596     }
2597
2598     size_t MB2WC(wchar_t * szOut, const char * szUnConv, size_t nOutSize) const
2599     {
2600         wxASSERT(szUnConv);
2601
2602         CFStringRef theString = CFStringCreateWithBytes (
2603                                                 NULL, //the allocator
2604                                                 (const UInt8*)szUnConv,
2605                                                 strlen(szUnConv),
2606                                                 m_encoding,
2607                                                 false //no BOM/external representation
2608                                                 );
2609
2610         wxASSERT(theString);
2611
2612         size_t nOutLength = CFStringGetLength(theString);
2613
2614         if (szOut == NULL)
2615         {
2616             CFRelease(theString);
2617             return nOutLength;
2618         }
2619
2620         CFRange theRange = { 0, nOutSize };
2621
2622 #if SIZEOF_WCHAR_T == 4
2623         UniChar* szUniCharBuffer = new UniChar[nOutSize];
2624 #endif
2625
2626         CFStringGetCharacters(theString, theRange, szUniCharBuffer);
2627
2628         CFRelease(theString);
2629
2630         szUniCharBuffer[nOutLength] = '\0';
2631
2632 #if SIZEOF_WCHAR_T == 4
2633         wxMBConvUTF16 converter;
2634         converter.MB2WC( szOut, (const char*)szUniCharBuffer, nOutSize );
2635         delete [] szUniCharBuffer;
2636 #endif
2637
2638         return nOutLength;
2639     }
2640
2641     size_t WC2MB(char *szOut, const wchar_t *szUnConv, size_t nOutSize) const
2642     {
2643         wxASSERT(szUnConv);
2644
2645         size_t nRealOutSize;
2646         size_t nBufSize = wxWcslen(szUnConv);
2647         UniChar* szUniBuffer = (UniChar*) szUnConv;
2648
2649 #if SIZEOF_WCHAR_T == 4
2650         wxMBConvUTF16 converter ;
2651         nBufSize = converter.WC2MB( NULL, szUnConv, 0 );
2652         szUniBuffer = new UniChar[ (nBufSize / sizeof(UniChar)) + 1];
2653         converter.WC2MB( (char*) szUniBuffer, szUnConv, nBufSize + sizeof(UniChar));
2654         nBufSize /= sizeof(UniChar);
2655 #endif
2656
2657         CFStringRef theString = CFStringCreateWithCharactersNoCopy(
2658                                 NULL, //allocator
2659                                 szUniBuffer,
2660                                 nBufSize,
2661                                 kCFAllocatorNull //deallocator - we want to deallocate it ourselves
2662                             );
2663
2664         wxASSERT(theString);
2665
2666         //Note that CER puts a BOM when converting to unicode
2667         //so we  check and use getchars instead in that case
2668         if (m_encoding == kCFStringEncodingUnicode)
2669         {
2670             if (szOut != NULL)
2671                 CFStringGetCharacters(theString, CFRangeMake(0, nOutSize - 1), (UniChar*) szOut);
2672
2673             nRealOutSize = CFStringGetLength(theString) + 1;
2674         }
2675         else
2676         {
2677             CFStringGetBytes(
2678                 theString,
2679                 CFRangeMake(0, CFStringGetLength(theString)),
2680                 m_encoding,
2681                 0, //what to put in characters that can't be converted -
2682                     //0 tells CFString to return NULL if it meets such a character
2683                 false, //not an external representation
2684                 (UInt8*) szOut,
2685                 nOutSize,
2686                 (CFIndex*) &nRealOutSize
2687                         );
2688         }
2689
2690         CFRelease(theString);
2691
2692 #if SIZEOF_WCHAR_T == 4
2693         delete[] szUniBuffer;
2694 #endif
2695
2696         return  nRealOutSize - 1;
2697     }
2698
2699     virtual wxMBConv *Clone() const { return new wxMBConv_cocoa(*this); }
2700
2701     bool IsOk() const
2702     {
2703         return m_encoding != kCFStringEncodingInvalidId &&
2704               CFStringIsEncodingAvailable(m_encoding);
2705     }
2706
2707 private:
2708     CFStringEncoding m_encoding ;
2709 };
2710
2711 #endif // defined(__WXCOCOA__)
2712
2713 // ============================================================================
2714 // Mac conversion classes
2715 // ============================================================================
2716
2717 #if defined(__WXMAC__) && defined(TARGET_CARBON)
2718
2719 class wxMBConv_mac : public wxMBConv
2720 {
2721 public:
2722     wxMBConv_mac()
2723     {
2724         Init(CFStringGetSystemEncoding()) ;
2725     }
2726
2727     wxMBConv_mac(const wxMBConv_mac& conv)
2728     {
2729         Init(conv.m_char_encoding);
2730     }
2731
2732 #if wxUSE_FONTMAP
2733     wxMBConv_mac(const wxChar* name)
2734     {
2735         Init( wxMacGetSystemEncFromFontEnc( wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) );
2736     }
2737 #endif
2738
2739     wxMBConv_mac(wxFontEncoding encoding)
2740     {
2741         Init( wxMacGetSystemEncFromFontEnc(encoding) );
2742     }
2743
2744     ~wxMBConv_mac()
2745     {
2746         OSStatus status = noErr ;
2747         status = TECDisposeConverter(m_MB2WC_converter);
2748         status = TECDisposeConverter(m_WC2MB_converter);
2749     }
2750
2751
2752     void Init( TextEncodingBase encoding)
2753     {
2754         OSStatus status = noErr ;
2755         m_char_encoding = encoding ;
2756         m_unicode_encoding = CreateTextEncoding(kTextEncodingUnicodeDefault, 0, kUnicode16BitFormat) ;
2757
2758         status = TECCreateConverter(&m_MB2WC_converter,
2759                                     m_char_encoding,
2760                                     m_unicode_encoding);
2761         status = TECCreateConverter(&m_WC2MB_converter,
2762                                     m_unicode_encoding,
2763                                     m_char_encoding);
2764     }
2765
2766     size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2767     {
2768         OSStatus status = noErr ;
2769         ByteCount byteOutLen ;
2770         ByteCount byteInLen = strlen(psz) + 1;
2771         wchar_t *tbuf = NULL ;
2772         UniChar* ubuf = NULL ;
2773         size_t res = 0 ;
2774
2775         if (buf == NULL)
2776         {
2777             // Apple specs say at least 32
2778             n = wxMax( 32, byteInLen ) ;
2779             tbuf = (wchar_t*) malloc( n * SIZEOF_WCHAR_T ) ;
2780         }
2781
2782         ByteCount byteBufferLen = n * sizeof( UniChar ) ;
2783
2784 #if SIZEOF_WCHAR_T == 4
2785         ubuf = (UniChar*) malloc( byteBufferLen + 2 ) ;
2786 #else
2787         ubuf = (UniChar*) (buf ? buf : tbuf) ;
2788 #endif
2789
2790         status = TECConvertText(
2791             m_MB2WC_converter, (ConstTextPtr) psz, byteInLen, &byteInLen,
2792             (TextPtr) ubuf, byteBufferLen, &byteOutLen);
2793
2794 #if SIZEOF_WCHAR_T == 4
2795         // we have to terminate here, because n might be larger for the trailing zero, and if UniChar
2796         // is not properly terminated we get random characters at the end
2797         ubuf[byteOutLen / sizeof( UniChar ) ] = 0 ;
2798         wxMBConvUTF16 converter ;
2799         res = converter.MB2WC( (buf ? buf : tbuf), (const char*)ubuf, n ) ;
2800         free( ubuf ) ;
2801 #else
2802         res = byteOutLen / sizeof( UniChar ) ;
2803 #endif
2804
2805         if ( buf == NULL )
2806              free(tbuf) ;
2807
2808         if ( buf  && res < n)
2809             buf[res] = 0;
2810
2811         return res ;
2812     }
2813
2814     size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const
2815     {
2816         OSStatus status = noErr ;
2817         ByteCount byteOutLen ;
2818         ByteCount byteInLen = wxWcslen(psz) * SIZEOF_WCHAR_T ;
2819
2820         char *tbuf = NULL ;
2821
2822         if (buf == NULL)
2823         {
2824             // Apple specs say at least 32
2825             n = wxMax( 32, ((byteInLen / SIZEOF_WCHAR_T) * 8) + SIZEOF_WCHAR_T );
2826             tbuf = (char*) malloc( n ) ;
2827         }
2828
2829         ByteCount byteBufferLen = n ;
2830         UniChar* ubuf = NULL ;
2831
2832 #if SIZEOF_WCHAR_T == 4
2833         wxMBConvUTF16 converter ;
2834         size_t unicharlen = converter.WC2MB( NULL, psz, 0 ) ;
2835         byteInLen = unicharlen ;
2836         ubuf = (UniChar*) malloc( byteInLen + 2 ) ;
2837         converter.WC2MB( (char*) ubuf, psz, unicharlen + 2 ) ;
2838 #else
2839         ubuf = (UniChar*) psz ;
2840 #endif
2841
2842         status = TECConvertText(
2843             m_WC2MB_converter, (ConstTextPtr) ubuf, byteInLen, &byteInLen,
2844             (TextPtr) (buf ? buf : tbuf), byteBufferLen, &byteOutLen);
2845
2846 #if SIZEOF_WCHAR_T == 4
2847         free( ubuf ) ;
2848 #endif
2849
2850         if ( buf == NULL )
2851             free(tbuf) ;
2852
2853         size_t res = byteOutLen ;
2854         if ( buf  && res < n)
2855         {
2856             buf[res] = 0;
2857
2858             //we need to double-trip to verify it didn't insert any ? in place
2859             //of bogus characters
2860             wxWCharBuffer wcBuf(n);
2861             size_t pszlen = wxWcslen(psz);
2862             if ( MB2WC(wcBuf.data(), buf, n) == wxCONV_FAILED ||
2863                         wxWcslen(wcBuf) != pszlen ||
2864                         memcmp(wcBuf, psz, pszlen * sizeof(wchar_t)) != 0 )
2865             {
2866                 // we didn't obtain the same thing we started from, hence
2867                 // the conversion was lossy and we consider that it failed
2868                 return wxCONV_FAILED;
2869             }
2870         }
2871
2872         return res ;
2873     }
2874
2875     virtual wxMBConv *Clone() const { return new wxMBConv_mac(*this); }
2876
2877     bool IsOk() const
2878         { return m_MB2WC_converter != NULL && m_WC2MB_converter != NULL; }
2879
2880 private:
2881     TECObjectRef m_MB2WC_converter;
2882     TECObjectRef m_WC2MB_converter;
2883
2884     TextEncodingBase m_char_encoding;
2885     TextEncodingBase m_unicode_encoding;
2886 };
2887
2888 #endif // defined(__WXMAC__) && defined(TARGET_CARBON)
2889
2890 // ============================================================================
2891 // wxEncodingConverter based conversion classes
2892 // ============================================================================
2893
2894 #if wxUSE_FONTMAP
2895
2896 class wxMBConv_wxwin : public wxMBConv
2897 {
2898 private:
2899     void Init()
2900     {
2901         m_ok = m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
2902                w2m.Init(wxFONTENCODING_UNICODE, m_enc);
2903     }
2904
2905 public:
2906     // temporarily just use wxEncodingConverter stuff,
2907     // so that it works while a better implementation is built
2908     wxMBConv_wxwin(const wxChar* name)
2909     {
2910         if (name)
2911             m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
2912         else
2913             m_enc = wxFONTENCODING_SYSTEM;
2914
2915         Init();
2916     }
2917
2918     wxMBConv_wxwin(wxFontEncoding enc)
2919     {
2920         m_enc = enc;
2921
2922         Init();
2923     }
2924
2925     size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
2926     {
2927         size_t inbuf = strlen(psz);
2928         if (buf)
2929         {
2930             if (!m2w.Convert(psz, buf))
2931                 return wxCONV_FAILED;
2932         }
2933         return inbuf;
2934     }
2935
2936     size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
2937     {
2938         const size_t inbuf = wxWcslen(psz);
2939         if (buf)
2940         {
2941             if (!w2m.Convert(psz, buf))
2942                 return wxCONV_FAILED;
2943         }
2944
2945         return inbuf;
2946     }
2947
2948     virtual size_t GetMBNulLen() const
2949     {
2950         switch ( m_enc )
2951         {
2952             case wxFONTENCODING_UTF16BE:
2953             case wxFONTENCODING_UTF16LE:
2954                 return 2;
2955
2956             case wxFONTENCODING_UTF32BE:
2957             case wxFONTENCODING_UTF32LE:
2958                 return 4;
2959
2960             default:
2961                 return 1;
2962         }
2963     }
2964
2965     virtual wxMBConv *Clone() const { return new wxMBConv_wxwin(m_enc); }
2966
2967     bool IsOk() const { return m_ok; }
2968
2969 public:
2970     wxFontEncoding m_enc;
2971     wxEncodingConverter m2w, w2m;
2972
2973 private:
2974     // were we initialized successfully?
2975     bool m_ok;
2976
2977     DECLARE_NO_COPY_CLASS(wxMBConv_wxwin)
2978 };
2979
2980 // make the constructors available for unit testing
2981 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_wxwin( const wxChar* name )
2982 {
2983     wxMBConv_wxwin* result = new wxMBConv_wxwin( name );
2984     if ( !result->IsOk() )
2985     {
2986         delete result;
2987         return 0;
2988     }
2989
2990     return result;
2991 }
2992
2993 #endif // wxUSE_FONTMAP
2994
2995 // ============================================================================
2996 // wxCSConv implementation
2997 // ============================================================================
2998
2999 void wxCSConv::Init()
3000 {
3001     m_name = NULL;
3002     m_convReal =  NULL;
3003     m_deferred = true;
3004 }
3005
3006 wxCSConv::wxCSConv(const wxChar *charset)
3007 {
3008     Init();
3009
3010     if ( charset )
3011     {
3012         SetName(charset);
3013     }
3014
3015 #if wxUSE_FONTMAP
3016     m_encoding = wxFontMapperBase::GetEncodingFromName(charset);
3017 #else
3018     m_encoding = wxFONTENCODING_SYSTEM;
3019 #endif
3020 }
3021
3022 wxCSConv::wxCSConv(wxFontEncoding encoding)
3023 {
3024     if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
3025     {
3026         wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
3027
3028         encoding = wxFONTENCODING_SYSTEM;
3029     }
3030
3031     Init();
3032
3033     m_encoding = encoding;
3034 }
3035
// Releases the charset name and the real conversion object by calling
// Clear().
wxCSConv::~wxCSConv()
{
    Clear();
}
3040
3041 wxCSConv::wxCSConv(const wxCSConv& conv)
3042         : wxMBConv()
3043 {
3044     Init();
3045
3046     SetName(conv.m_name);
3047     m_encoding = conv.m_encoding;
3048 }
3049
3050 wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
3051 {
3052     Clear();
3053
3054     SetName(conv.m_name);
3055     m_encoding = conv.m_encoding;
3056
3057     return *this;
3058 }
3059
3060 void wxCSConv::Clear()
3061 {
3062     free(m_name);
3063     delete m_convReal;
3064
3065     m_name = NULL;
3066     m_convReal = NULL;
3067 }
3068
3069 void wxCSConv::SetName(const wxChar *charset)
3070 {
3071     if (charset)
3072     {
3073         m_name = wxStrdup(charset);
3074         m_deferred = true;
3075     }
3076 }
3077
#if wxUSE_FONTMAP
#include "wx/hashmap.h"

// hash map type associating a charset name with a font encoding
WX_DECLARE_HASH_MAP( wxFontEncoding, wxString, wxIntegerHash, wxIntegerEqual,
                     wxEncodingNameCache );

// Cache used by wxCSConv::DoCreate() when iconv is available: for each
// encoding already tried it holds the name under which iconv accepted it or
// an empty string if none of the known names worked.
static wxEncodingNameCache gs_nameCache;
#endif
3086
3087 wxMBConv *wxCSConv::DoCreate() const
3088 {
3089 #if wxUSE_FONTMAP
3090     wxLogTrace(TRACE_STRCONV,
3091                wxT("creating conversion for %s"),
3092                (m_name ? m_name
3093                        : wxFontMapperBase::GetEncodingName(m_encoding).c_str()));
3094 #endif // wxUSE_FONTMAP
3095
3096     // check for the special case of ASCII or ISO8859-1 charset: as we have
3097     // special knowledge of it anyhow, we don't need to create a special
3098     // conversion object
3099     if ( m_encoding == wxFONTENCODING_ISO8859_1 ||
3100             m_encoding == wxFONTENCODING_DEFAULT )
3101     {
3102         // don't convert at all
3103         return NULL;
3104     }
3105
3106     // we trust OS to do conversion better than we can so try external
3107     // conversion methods first
3108     //
3109     // the full order is:
3110     //      1. OS conversion (iconv() under Unix or Win32 API)
3111     //      2. hard coded conversions for UTF
3112     //      3. wxEncodingConverter as fall back
3113
3114     // step (1)
3115 #ifdef HAVE_ICONV
3116 #if !wxUSE_FONTMAP
3117     if ( m_name )
3118 #endif // !wxUSE_FONTMAP
3119     {
3120         wxString name(m_name);
3121         wxFontEncoding encoding(m_encoding);
3122
3123         if ( !name.empty() )
3124         {
3125             wxMBConv_iconv *conv = new wxMBConv_iconv(name);
3126             if ( conv->IsOk() )
3127                 return conv;
3128
3129             delete conv;
3130
3131 #if wxUSE_FONTMAP
3132             encoding =
3133                 wxFontMapperBase::Get()->CharsetToEncoding(name, false);
3134 #endif // wxUSE_FONTMAP
3135         }
3136 #if wxUSE_FONTMAP
3137         {
3138             const wxEncodingNameCache::iterator it = gs_nameCache.find(encoding);
3139             if ( it != gs_nameCache.end() )
3140             {
3141                 if ( it->second.empty() )
3142                     return NULL;
3143
3144                 wxMBConv_iconv *conv = new wxMBConv_iconv(it->second);
3145                 if ( conv->IsOk() )
3146                     return conv;
3147
3148                 delete conv;
3149             }
3150
3151             const wxChar** names = wxFontMapperBase::GetAllEncodingNames(encoding);
3152
3153             for ( ; *names; ++names )
3154             {
3155                 wxMBConv_iconv *conv = new wxMBConv_iconv(*names);
3156                 if ( conv->IsOk() )
3157                 {
3158                     gs_nameCache[encoding] = *names;
3159                     return conv;
3160                 }
3161
3162                 delete conv;
3163             }
3164
3165             gs_nameCache[encoding] = _T(""); // cache the failure
3166         }
3167 #endif // wxUSE_FONTMAP
3168     }
3169 #endif // HAVE_ICONV
3170
3171 #ifdef wxHAVE_WIN32_MB2WC
3172     {
3173 #if wxUSE_FONTMAP
3174         wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
3175                                       : new wxMBConv_win32(m_encoding);
3176         if ( conv->IsOk() )
3177             return conv;
3178
3179         delete conv;
3180 #else
3181         return NULL;
3182 #endif
3183     }
3184 #endif // wxHAVE_WIN32_MB2WC
3185
3186 #if defined(__WXMAC__)
3187     {
3188         // leave UTF16 and UTF32 to the built-ins of wx
3189         if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
3190             ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
3191         {
3192 #if wxUSE_FONTMAP
3193             wxMBConv_mac *conv = m_name ? new wxMBConv_mac(m_name)
3194                                         : new wxMBConv_mac(m_encoding);
3195 #else
3196             wxMBConv_mac *conv = new wxMBConv_mac(m_encoding);
3197 #endif
3198             if ( conv->IsOk() )
3199                  return conv;
3200
3201             delete conv;
3202         }
3203     }
3204 #endif
3205
3206 #if defined(__WXCOCOA__)
3207     {
3208         if ( m_name || ( m_encoding <= wxFONTENCODING_UTF16 ) )
3209         {
3210 #if wxUSE_FONTMAP
3211             wxMBConv_cocoa *conv = m_name ? new wxMBConv_cocoa(m_name)
3212                                           : new wxMBConv_cocoa(m_encoding);
3213 #else
3214             wxMBConv_cocoa *conv = new wxMBConv_cocoa(m_encoding);
3215 #endif
3216
3217             if ( conv->IsOk() )
3218                  return conv;
3219
3220             delete conv;
3221         }
3222     }
3223 #endif
3224     // step (2)
3225     wxFontEncoding enc = m_encoding;
3226 #if wxUSE_FONTMAP
3227     if ( enc == wxFONTENCODING_SYSTEM && m_name )
3228     {
3229         // use "false" to suppress interactive dialogs -- we can be called from
3230         // anywhere and popping up a dialog from here is the last thing we want to
3231         // do
3232         enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
3233     }
3234 #endif // wxUSE_FONTMAP
3235
3236     switch ( enc )
3237     {
3238         case wxFONTENCODING_UTF7:
3239              return new wxMBConvUTF7;
3240
3241         case wxFONTENCODING_UTF8:
3242              return new wxMBConvUTF8;
3243
3244         case wxFONTENCODING_UTF16BE:
3245              return new wxMBConvUTF16BE;
3246
3247         case wxFONTENCODING_UTF16LE:
3248              return new wxMBConvUTF16LE;
3249
3250         case wxFONTENCODING_UTF32BE:
3251              return new wxMBConvUTF32BE;
3252
3253         case wxFONTENCODING_UTF32LE:
3254              return new wxMBConvUTF32LE;
3255
3256         default:
3257              // nothing to do but put here to suppress gcc warnings
3258              break;
3259     }
3260
3261     // step (3)
3262 #if wxUSE_FONTMAP
3263     {
3264         wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
3265                                       : new wxMBConv_wxwin(m_encoding);
3266         if ( conv->IsOk() )
3267             return conv;
3268
3269         delete conv;
3270     }
3271 #endif // wxUSE_FONTMAP
3272
3273     // NB: This is a hack to prevent deadlock. What could otherwise happen
3274     //     in Unicode build: wxConvLocal creation ends up being here
3275     //     because of some failure and logs the error. But wxLog will try to
3276     //     attach timestamp, for which it will need wxConvLocal (to convert
3277     //     time to char* and then wchar_t*), but that fails, tries to log
3278     //     error, but wxLog has a (already locked) critical section that
3279     //     guards static buffer.
3280     static bool alreadyLoggingError = false;
3281     if (!alreadyLoggingError)
3282     {
3283         alreadyLoggingError = true;
3284         wxLogError(_("Cannot convert from the charset '%s'!"),
3285                    m_name ? m_name
3286                       :
3287 #if wxUSE_FONTMAP
3288                          wxFontMapperBase::GetEncodingDescription(m_encoding).c_str()
3289 #else // !wxUSE_FONTMAP
3290                          wxString::Format(_("encoding %s"), m_encoding).c_str()
3291 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
3292               );
3293
3294         alreadyLoggingError = false;
3295     }
3296
3297     return NULL;
3298 }
3299
3300 void wxCSConv::CreateConvIfNeeded() const
3301 {
3302     if ( m_deferred )
3303     {
3304         wxCSConv *self = (wxCSConv *)this; // const_cast
3305
3306 #if wxUSE_INTL
3307         // if we don't have neither the name nor the encoding, use the default
3308         // encoding for this system
3309         if ( !m_name && m_encoding == wxFONTENCODING_SYSTEM )
3310         {
3311             self->m_name = wxStrdup(wxLocale::GetSystemEncodingName());
3312         }
3313 #endif // wxUSE_INTL
3314
3315         self->m_convReal = DoCreate();
3316         self->m_deferred = false;
3317     }
3318 }
3319
3320 size_t wxCSConv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
3321 {
3322     CreateConvIfNeeded();
3323
3324     if (m_convReal)
3325         return m_convReal->MB2WC(buf, psz, n);
3326
3327     // latin-1 (direct)
3328     size_t len = strlen(psz);
3329
3330     if (buf)
3331     {
3332         for (size_t c = 0; c <= len; c++)
3333             buf[c] = (unsigned char)(psz[c]);
3334     }
3335
3336     return len;
3337 }
3338
3339 size_t wxCSConv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
3340 {
3341     CreateConvIfNeeded();
3342
3343     if (m_convReal)
3344         return m_convReal->WC2MB(buf, psz, n);
3345
3346     // latin-1 (direct)
3347     const size_t len = wxWcslen(psz);
3348     if (buf)
3349     {
3350         for (size_t c = 0; c <= len; c++)
3351         {
3352             if (psz[c] > 0xFF)
3353                 return wxCONV_FAILED;
3354
3355             buf[c] = (char)psz[c];
3356         }
3357     }
3358     else
3359     {
3360         for (size_t c = 0; c <= len; c++)
3361         {
3362             if (psz[c] > 0xFF)
3363                 return wxCONV_FAILED;
3364         }
3365     }
3366
3367     return len;
3368 }
3369
3370 size_t wxCSConv::GetMBNulLen() const
3371 {
3372     CreateConvIfNeeded();
3373
3374     if ( m_convReal )
3375     {
3376         return m_convReal->GetMBNulLen();
3377     }
3378
3379     return 1;
3380 }
3381
3382 // ----------------------------------------------------------------------------
3383 // globals
3384 // ----------------------------------------------------------------------------
3385
// the object behind wxConvLibc: its class depends on the platform
#ifdef __WINDOWS__
    static wxMBConv_win32 wxConvLibcObj;
#elif defined(__WXMAC__) && !defined(__MACH__)
    static wxMBConv_mac wxConvLibcObj ;
#else
    static wxMBConvLibc wxConvLibcObj;
#endif

// the objects behind the other predefined conversions
static wxCSConv wxConvLocalObj(wxFONTENCODING_SYSTEM);
static wxCSConv wxConvISO8859_1Obj(wxFONTENCODING_ISO8859_1);
static wxMBConvUTF7 wxConvUTF7Obj;
static wxMBConvUTF8 wxConvUTF8Obj;

// the exported references to the objects above
WXDLLIMPEXP_DATA_BASE(wxMBConv&) wxConvLibc = wxConvLibcObj;
WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvLocal = wxConvLocalObj;
WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvISO8859_1 = wxConvISO8859_1Obj;
WXDLLIMPEXP_DATA_BASE(wxMBConvUTF7&) wxConvUTF7 = wxConvUTF7Obj;
WXDLLIMPEXP_DATA_BASE(wxMBConvUTF8&) wxConvUTF8 = wxConvUTF8Obj;

// the exported conversion pointers: wxConvCurrent initially points to the
// libc conversion and wxConvUI to the local one, while wxConvFileName points
// to the UTF-8 conversion under __WXOSX__ and to the libc one elsewhere
WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = &wxConvLibcObj;
WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvUI = &wxConvLocal;
WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName = &
#ifdef __WXOSX__
                                    wxConvUTF8Obj;
#else
                                    wxConvLibcObj;
#endif
3412
3413 #else // !wxUSE_WCHAR_T
3414
// stand-ins in absence of wchar_t: when wxUSE_WCHAR_T is 0 the predefined
// conversions are defined as plain wxMBConv objects
WXDLLIMPEXP_DATA_BASE(wxMBConv) wxConvLibc,
                                wxConvISO8859_1,
                                wxConvLocal,
                                wxConvUTF8;
3420
3421 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T