src/common/strconv.cpp

   1 /////////////////////////////////////////////////////////////////////////////
   2 // Name:        src/common/strconv.cpp
   3 // Purpose:     Unicode conversion classes
   4 // Author:      Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
   5 //              Ryan Norton, Fredrik Roubert (UTF7)
   6 // Modified by:
   7 // Created:     29/01/98
   8 // RCS-ID:      $Id$
   9 // Copyright:   (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
  10 //              (c) 2000-2003 Vadim Zeitlin
  11 //              (c) 2004 Ryan Norton, Fredrik Roubert
  12 // Licence:     wxWindows licence
  13 /////////////////////////////////////////////////////////////////////////////
  14
  15 // For compilers that support precompilation, includes "wx.h".
  16 #include "wx/wxprec.h"
  17
  18 #ifndef WX_PRECOMP
  19     #include "wx/intl.h"
  20     #include "wx/log.h"
  21     #include "wx/utils.h"
  22 #endif
  23
  24 #include "wx/strconv.h"
  25
  26 #if wxUSE_WCHAR_T
  27
  28 #ifdef __WINDOWS__
  29     #include "wx/msw/private.h"
  30     #include "wx/msw/missing.h"
  31 #endif
  32
  33 #ifndef __WXWINCE__
  34 #include <errno.h>
  35 #endif
  36
  37 #include <ctype.h>
  38 #include <string.h>
  39 #include <stdlib.h>
  40
  41 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
  42     #define wxHAVE_WIN32_MB2WC
  43 #endif
  44
  45 #ifdef __SALFORDC__
  46     #include <clib.h>
  47 #endif
  48
  49 #ifdef HAVE_ICONV
  50     #include <iconv.h>
  51     #include "wx/thread.h"
  52 #endif
  53
  54 #include "wx/encconv.h"
  55 #include "wx/fontmap.h"
  56
  57 #ifdef __WXMAC__
  58 #ifndef __DARWIN__
  59 #include <ATSUnicode.h>
  60 #include <TextCommon.h>
  61 #include <TextEncodingConverter.h>
  62 #endif
  63
  64 // includes Mac headers
  65 #include "wx/mac/private.h"
  66 #endif
  67
  68
  69 #define TRACE_STRCONV _T("strconv")
  70
  71 // WC_UTF16 is defined only if sizeof(wchar_t) == 2, otherwise it's supposed to
  72 // be 4 bytes
  73 #if SIZEOF_WCHAR_T == 2
  74     #define WC_UTF16
  75 #endif
  76
  77
  78 // ============================================================================
  79 // implementation
  80 // ============================================================================
  81
  82 // helper function of cMB2WC(): check if n bytes at this location are all NUL
  83 static bool NotAllNULs(const char *p, size_t n)
  84 {
  85     while ( n && *p++ == '\0' )
  86         n--;
  87
  88     return n != 0;
  89 }
  90
  91 // ----------------------------------------------------------------------------
  92 // UTF-16 en/decoding to/from UCS-4 with surrogates handling
  93 // ----------------------------------------------------------------------------
  94
  95 static size_t encode_utf16(wxUint32 input, wxUint16 *output)
  96 {
  97     if (input <= 0xffff)
  98     {
  99         if (output)
 100             *output = (wxUint16) input;
 101
 102         return 1;
 103     }
 104     else if (input >= 0x110000)
 105     {
 106         return wxCONV_FAILED;
 107     }
 108     else
 109     {
 110         if (output)
 111         {
 112             *output++ = (wxUint16) ((input >> 10) + 0xd7c0);
 113             *output = (wxUint16) ((input & 0x3ff) + 0xdc00);
 114         }
 115
 116         return 2;
 117     }
 118 }
 119
 120 static size_t decode_utf16(const wxUint16* input, wxUint32& output)
 121 {
 122     if ((*input < 0xd800) || (*input > 0xdfff))
 123     {
 124         output = *input;
 125         return 1;
 126     }
 127     else if ((input[1] < 0xdc00) || (input[1] > 0xdfff))
 128     {
 129         output = *input;
 130         return wxCONV_FAILED;
 131     }
 132     else
 133     {
 134         output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
 135         return 2;
 136     }
 137 }
 138
 139 #ifdef WC_UTF16
 140     typedef wchar_t wxDecodeSurrogate_t;
 141 #else // !WC_UTF16
 142     typedef wxUint16 wxDecodeSurrogate_t;
 143 #endif // WC_UTF16/!WC_UTF16
 144
 145 // returns the next UTF-32 character from the wchar_t buffer and advances the
 146 // pointer to the character after this one
 147 //
 148 // if an invalid character is found, *pSrc is set to NULL, the caller must
 149 // check for this
 150 static wxUint32 wxDecodeSurrogate(const wxDecodeSurrogate_t **pSrc)
 151 {
 152     wxUint32 out;
 153     const size_t
 154         n = decode_utf16(wx_reinterpret_cast(const wxUint16 *, *pSrc), out);
 155     if ( n == wxCONV_FAILED )
 156         *pSrc = NULL;
 157     else
 158         *pSrc += n;
 159
 160     return out;
 161 }
 162
 163 // ----------------------------------------------------------------------------
 164 // wxMBConv
 165 // ----------------------------------------------------------------------------
 166
// Converts the multibyte string src to wide characters.
//
//  dst     output buffer or NULL to only compute the required length
//  dstLen  size of dst in wide characters, only checked if dst is not NULL
//  src     the input string
//  srcLen  length of src in bytes or wxNO_LEN if src is NUL-terminated
//
// Returns the number of wide characters written to dst (or which would be
// written if it were not NULL), counting the L'\0' terminating each converted
// chunk, or wxCONV_FAILED if the conversion fails or dst is too small.
size_t
wxMBConv::ToWChar(wchar_t *dst, size_t dstLen,
                  const char *src, size_t srcLen) const
{
    // although new conversion classes are supposed to implement this function
    // directly, the existing ones only implement the old MB2WC() and so, to
    // avoid having to rewrite all conversion classes at once, we provide a
    // default (but not efficient) implementation of this one in terms of the
    // old function by copying the input to ensure that it's NUL-terminated and
    // then using MB2WC() to convert it

    // the number of chars [which would be] written to dst [if it were not NULL]
    size_t dstWritten = 0;

    // the number of NULs terminating this string
    size_t nulLen = 0;  // not really needed, but just to avoid warnings

    // if we were not given the input size we just have to assume that the
    // string is properly terminated as we have no way of knowing how long it
    // is anyhow, but if we do have the size check whether there are enough
    // NULs at the end
    wxCharBuffer bufTmp;
    const char *srcEnd;
    if ( srcLen != wxNO_LEN )
    {
        // we need to know how to find the end of this string
        nulLen = GetMBNulLen();
        if ( nulLen == wxCONV_FAILED )
            return wxCONV_FAILED;

        // if there are enough NULs we can avoid the copy
        if ( srcLen < nulLen || NotAllNULs(src + srcLen - nulLen, nulLen) )
        {
            // make a copy in order to properly NUL-terminate the string
            bufTmp = wxCharBuffer(srcLen + nulLen - 1 /* 1 will be added */);
            char * const p = bufTmp.data();
            memcpy(p, src, srcLen);
            for ( char *s = p + srcLen; s < p + srcLen + nulLen; s++ )
                *s = '\0';

            // from now on work with the terminated copy, it stays alive until
            // the end of this function as bufTmp owns it
            src = bufTmp;
        }

        srcEnd = src + srcLen;
    }
    else // quit after the first loop iteration
    {
        srcEnd = NULL;
    }

    // each iteration converts one NUL-terminated chunk of the input
    for ( ;; )
    {
        // try to convert the current chunk: the first call only computes the
        // length needed for it
        size_t lenChunk = MB2WC(NULL, src, 0);
        if ( lenChunk == wxCONV_FAILED )
            return wxCONV_FAILED;

        lenChunk++; // for the L'\0' at the end of this chunk

        dstWritten += lenChunk;

        if ( lenChunk == 1 )
        {
            // nothing left in the input string, conversion succeeded
            //
            // notice that the L'\0' of this empty chunk is counted in
            // dstWritten but is not stored in dst by this function
            break;
        }

        if ( dst )
        {
            if ( dstWritten > dstLen )
                return wxCONV_FAILED;

            if ( MB2WC(dst, src, lenChunk) == wxCONV_FAILED )
                return wxCONV_FAILED;

            dst += lenChunk;
        }

        if ( !srcEnd )
        {
            // we convert just one chunk in this case as this is the entire
            // string anyhow
            break;
        }

        // advance the input pointer past the end of this chunk
        while ( NotAllNULs(src, nulLen) )
        {
            // notice that we must skip over multiple bytes here as we suppose
            // that if NUL takes 2 or 4 bytes, then all the other characters do
            // too and so if advanced by a single byte we might erroneously
            // detect sequences of NUL bytes in the middle of the input
            src += nulLen;
        }

        src += nulLen; // skipping over its terminator as well

        // note that ">=" (and not just "==") is needed here as the terminator
        // we skipped just above could be inside or just after the buffer
        // delimited by srcEnd
        if ( src >= srcEnd )
            break;
    }

    return dstWritten;
}
 273
 274 size_t
 275 wxMBConv::FromWChar(char *dst, size_t dstLen,
 276                     const wchar_t *src, size_t srcLen) const
 277 {
 278     // the number of chars [which would be] written to dst [if it were not NULL]
 279     size_t dstWritten = 0;
 280
 281     // make a copy of the input string unless it is already properly
 282     // NUL-terminated
 283     //
 284     // if we don't know its length we have no choice but to assume that it is,
 285     // indeed, properly terminated
 286     wxWCharBuffer bufTmp;
 287     if ( srcLen == wxNO_LEN )
 288     {
 289         srcLen = wxWcslen(src) + 1;
 290     }
 291     else if ( srcLen != 0 && src[srcLen - 1] != L'\0' )
 292     {
 293         // make a copy in order to properly NUL-terminate the string
 294         bufTmp = wxWCharBuffer(srcLen);
 295         memcpy(bufTmp.data(), src, srcLen * sizeof(wchar_t));
 296         src = bufTmp;
 297     }
 298
 299     const size_t lenNul = GetMBNulLen();
 300     for ( const wchar_t * const srcEnd = src + srcLen;
 301           src < srcEnd;
 302           src += wxWcslen(src) + 1 /* skip L'\0' too */ )
 303     {
 304         // try to convert the current chunk
 305         size_t lenChunk = WC2MB(NULL, src, 0);
 306
 307         if ( lenChunk == wxCONV_FAILED )
 308             return wxCONV_FAILED;
 309
 310         lenChunk += lenNul;
 311         dstWritten += lenChunk;
 312
 313         if ( dst )
 314         {
 315             if ( dstWritten > dstLen )
 316                 return wxCONV_FAILED;
 317
 318             if ( WC2MB(dst, src, lenChunk) == wxCONV_FAILED )
 319                 return wxCONV_FAILED;
 320
 321             dst += lenChunk;
 322         }
 323     }
 324
 325     return dstWritten;
 326 }
 327
 328 size_t wxMBConv::MB2WC(wchar_t *outBuff, const char *inBuff, size_t outLen) const
 329 {
 330     size_t rc = ToWChar(outBuff, outLen, inBuff);
 331     if ( rc != wxCONV_FAILED )
 332     {
 333         // ToWChar() returns the buffer length, i.e. including the trailing
 334         // NUL, while this method doesn't take it into account
 335         rc--;
 336     }
 337
 338     return rc;
 339 }
 340
 341 size_t wxMBConv::WC2MB(char *outBuff, const wchar_t *inBuff, size_t outLen) const
 342 {
 343     size_t rc = FromWChar(outBuff, outLen, inBuff);
 344     if ( rc != wxCONV_FAILED )
 345     {
 346         rc -= GetMBNulLen();
 347     }
 348
 349     return rc;
 350 }
 351
// Empty destructor, deliberately defined out of line in this file.
wxMBConv::~wxMBConv()
{
    // nothing to do here (necessary for Darwin linking probably)
}
 356
 357 const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
 358 {
 359     if ( psz )
 360     {
 361         // calculate the length of the buffer needed first
 362         const size_t nLen = MB2WC(NULL, psz, 0);
 363         if ( nLen != wxCONV_FAILED )
 364         {
 365             // now do the actual conversion
 366             wxWCharBuffer buf(nLen /* +1 added implicitly */);
 367
 368             // +1 for the trailing NULL
 369             if ( MB2WC(buf.data(), psz, nLen + 1) != wxCONV_FAILED )
 370                 return buf;
 371         }
 372     }
 373
 374     return wxWCharBuffer();
 375 }
 376
 377 const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
 378 {
 379     if ( pwz )
 380     {
 381         const size_t nLen = WC2MB(NULL, pwz, 0);
 382         if ( nLen != wxCONV_FAILED )
 383         {
 384             // extra space for trailing NUL(s)
 385             static const size_t extraLen = GetMaxMBNulLen();
 386
 387             wxCharBuffer buf(nLen + extraLen - 1);
 388             if ( WC2MB(buf.data(), pwz, nLen + extraLen) != wxCONV_FAILED )
 389                 return buf;
 390         }
 391     }
 392
 393     return wxCharBuffer();
 394 }
 395
 396 const wxWCharBuffer
 397 wxMBConv::cMB2WC(const char *inBuff, size_t inLen, size_t *outLen) const
 398 {
 399     const size_t dstLen = ToWChar(NULL, 0, inBuff, inLen);
 400     if ( dstLen != wxCONV_FAILED )
 401     {
 402         wxWCharBuffer wbuf(dstLen - 1);
 403         if ( ToWChar(wbuf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
 404         {
 405             if ( outLen )
 406             {
 407                 *outLen = dstLen;
 408                 if ( wbuf[dstLen - 1] == L'\0' )
 409                     (*outLen)--;
 410             }
 411
 412             return wbuf;
 413         }
 414     }
 415
 416     if ( outLen )
 417         *outLen = 0;
 418
 419     return wxWCharBuffer();
 420 }
 421
 422 const wxCharBuffer
 423 wxMBConv::cWC2MB(const wchar_t *inBuff, size_t inLen, size_t *outLen) const
 424 {
 425     size_t dstLen = FromWChar(NULL, 0, inBuff, inLen);
 426     if ( dstLen != wxCONV_FAILED )
 427     {
 428         // special case of empty input: can't allocate 0 size buffer below as
 429         // wxCharBuffer insists on NUL-terminating it
 430         wxCharBuffer buf(dstLen ? dstLen - 1 : 1);
 431         if ( FromWChar(buf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
 432         {
 433             if ( outLen )
 434             {
 435                 *outLen = dstLen;
 436
 437                 const size_t nulLen = GetMBNulLen();
 438                 if ( dstLen >= nulLen &&
 439                         !NotAllNULs(buf.data() + dstLen - nulLen, nulLen) )
 440                 {
 441                     // in this case the output is NUL-terminated and we're not
 442                     // supposed to count NUL
 443                     *outLen -= nulLen;
 444                 }
 445             }
 446
 447             return buf;
 448         }
 449     }
 450
 451     if ( outLen )
 452         *outLen = 0;
 453
 454     return wxCharBuffer();
 455 }
 456
 457 // ----------------------------------------------------------------------------
 458 // wxMBConvLibc
 459 // ----------------------------------------------------------------------------
 460
// Multibyte to wide char conversion: simply forwards all of its arguments to
// wxMB2WC() and returns its result.
size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
{
    return wxMB2WC(buf, psz, n);
}
 465
// Wide char to multibyte conversion: simply forwards all of its arguments to
// wxWC2MB() and returns its result.
size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
{
    return wxWC2MB(buf, psz, n);
}
 470
 471 // ----------------------------------------------------------------------------
 472 // wxConvBrokenFileNames
 473 // ----------------------------------------------------------------------------
 474
 475 #ifdef __UNIX__
 476
 477 wxConvBrokenFileNames::wxConvBrokenFileNames(const wxChar *charset)
 478 {
 479     if ( !charset || wxStricmp(charset, _T("UTF-8")) == 0
 480                   || wxStricmp(charset, _T("UTF8")) == 0  )
 481         m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL);
 482     else
 483         m_conv = new wxCSConv(charset);
 484 }
 485
 486 #endif // __UNIX__
 487
 488 // ----------------------------------------------------------------------------
 489 // UTF-7
 490 // ----------------------------------------------------------------------------
 491
 492 // Implementation (C) 2004 Fredrik Roubert
 493
//
// BASE64 decoding table
//
// Indexed by the value of an input byte: the entries for 'A'-'Z', 'a'-'z',
// '0'-'9', '+' and '/' contain the 6 bit value the character stands for
// (0x00-0x3f) and all the other ones contain 0xff, which is used by the
// decoder to detect the end of a BASE64 sequence.
//
static const unsigned char utf7unb64[] =
{
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
    0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b,
    0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
    0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
    0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
    0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
    0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
    0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30,
    0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
};
 532
// Decodes the NUL-terminated UTF-7 string psz.
//
//  buf  output buffer or NULL to only compute the length of the result
//  psz  the input string
//  n    size of buf in wide characters, ignored if buf is NULL
//
// Returns the number of wide characters produced, not counting the
// terminating L'\0' (which is only stored if there is room left for it), or
// wxCONV_FAILED if a '+' not followed by '-' doesn't start a BASE64 sequence
// yielding at least one byte.
//
// NOTE(review): n is only checked before starting to decode each input
// character or BASE64 run, not while decoding a run, so a run may produce
// more than n characters -- callers should size buf using a first call with
// NULL buf.
size_t wxMBConvUTF7::MB2WC(wchar_t *buf, const char *psz, size_t n) const
{
    size_t len = 0;

    while ( *psz && (!buf || (len < n)) )
    {
        unsigned char cc = *psz++;
        if (cc != '+')
        {
            // plain ASCII char
            //
            // NOTE(review): any byte other than '+' is copied as is, there is
            // no check that it really is ASCII
            if (buf)
                *buf++ = cc;
            len++;
        }
        else if (*psz == '-')
        {
            // encoded plus sign: "+-" stands for a single '+'
            if (buf)
                *buf++ = cc;
            len++;
            psz++;
        }
        else // start of BASE64 encoded string
        {
            // d accumulates the decoded bits, l is the number of bits in d
            // not yet consumed, lsb indicates whether the next byte extracted
            // is the low (true) or the high (false) byte of a 16 bit unit and
            // ok is set as soon as at least one byte was extracted
            bool lsb, ok;
            unsigned int d, l;
            for ( ok = lsb = false, d = 0, l = 0;
                  (cc = utf7unb64[(unsigned char)*psz]) != 0xff;
                  psz++ )
            {
                // append the 6 bits of this BASE64 character
                d <<= 6;
                d += cc;
                for (l += 6; l >= 8; lsb = !lsb)
                {
                    // extract the next complete byte
                    unsigned char c = (unsigned char)((d >> (l -= 8)) % 256);
                    if (lsb)
                    {
                        // low byte: it completes the current character
                        if (buf)
                            *buf++ |= c;
                        len ++;
                    }
                    else
                    {
                        // high byte: start a new character, it is only
                        // counted when its low byte is seen
                        if (buf)
                            *buf = (wchar_t)(c << 8);
                    }

                    ok = true;
                }
            }

            if ( !ok )
            {
                // in valid UTF7 we should have valid characters after '+'
                return wxCONV_FAILED;
            }

            // the '-' terminating the BASE64 sequence is optional, skip it
            // if it is present
            if (*psz == '-')
                psz++;
        }
    }

    if ( buf && (len < n) )
        *buf = '\0';

    return len;
}
 600
//
// BASE64 encoding table
//
// Indexed by a 6 bit value (0-63), contains the BASE64 character used to
// represent this value.
//
static const unsigned char utf7enb64[] =
{
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
    'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
    'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
    'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
    'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
    'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
    'w', 'x', 'y', 'z', '0', '1', '2', '3',
    '4', '5', '6', '7', '8', '9', '+', '/'
};
 615
//
// UTF-7 encoding table
//
// Indexed by an ASCII character code (0-127), gives the class of the
// character:
//
// 0 - Set D (directly encoded characters)
// 1 - Set O (optional direct characters)
// 2 - whitespace characters (optional)
// 3 - special characters
//
// The encoder in this file only outputs the characters of class 0 directly,
// all the other ones are BASE64 encoded.
//
static const unsigned char utf7encode[128] =
{
    3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
};
 635
// Encodes the L'\0'-terminated wide string psz in UTF-7.
//
//  buf  output buffer or NULL to only compute the length of the result
//  psz  the input string
//  n    size of buf in bytes, ignored if buf is NULL
//
// Returns the number of bytes produced, not counting the terminating NUL
// (which is only stored if there is room left for it), or wxCONV_FAILED if,
// when wchar_t is not 16 bit, a character greater than 0xffff is found at the
// start of a run.
//
// NOTE(review): n is only checked before starting to encode each character
// or BASE64 run, so more than n bytes may be written to buf -- callers should
// size it using a first call with NULL buf.
size_t wxMBConvUTF7::WC2MB(char *buf, const wchar_t *psz, size_t n) const
{
    size_t len = 0;

    while (*psz && ((!buf) || (len < n)))
    {
        wchar_t cc = *psz++;
        if (cc < 0x80 && utf7encode[cc] < 1)
        {
            // plain ASCII char
            if (buf)
                *buf++ = (char)cc;

            len++;
        }
#ifndef WC_UTF16
        else if (((wxUint32)cc) > 0xffff)
        {
            // no surrogate pair generation (yet?)
            return wxCONV_FAILED;
        }
#endif
        else
        {
            // everything else is written as '+' BASE64-data '-', with '+'
            // itself being represented by the empty sequence "+-"
            if (buf)
                *buf++ = '+';

            len++;
            if (cc != '+')
            {
                // BASE64 encode string
                //
                // d accumulates the bits to encode and l is the number of
                // bits in d not yet output
                unsigned int lsb, d, l;
                for (d = 0, l = 0; /*nothing*/; psz++)
                {
                    // feed the 2 bytes of this character, high one first
                    for (lsb = 0; lsb < 2; lsb ++)
                    {
                        d <<= 8;
                        d += lsb ? cc & 0xff : (cc & 0xff00) >> 8;

                        // and output all the complete groups of 6 bits
                        for (l += 8; l >= 6; )
                        {
                            l -= 6;
                            if (buf)
                                *buf++ = utf7enb64[(d >> l) % 64];
                            len++;
                        }
                    }

                    // continue the run while the following characters can't
                    // be encoded directly neither; psz is only advanced past
                    // the next character if it is included in the run
                    cc = *psz;
                    if (!(cc) || (cc < 0x80 && utf7encode[cc] < 1))
                        break;
                }

                if (l != 0)
                {
                    // flush the remaining bits, padded with 0s on the right
                    if (buf)
                        *buf++ = utf7enb64[((d % 16) << (6 - l)) % 64];

                    len++;
                }
            }

            // always explicitly terminate the sequence
            if (buf)
                *buf++ = '-';
            len++;
        }
    }

    if (buf && (len < n))
        *buf = 0;

    return len;
}
 709
 710 // ----------------------------------------------------------------------------
 711 // UTF-8
 712 // ----------------------------------------------------------------------------
 713
// utf8_max[i] is the maximal value which can be encoded using i + 1 bytes;
// the last element is greater than or equal to any 32 bit value and so
// terminates the searches in this table
static wxUint32 utf8_max[]=
    { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };

// boundaries of the private use area we use to (temporarily) remap the bytes
// which are invalid in a UTF-8 encoded string: the byte b is mapped to the
// character wxUnicodePUA + b
const wxUint32 wxUnicodePUA = 0x100000;
const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
 721
// Decodes the NUL-terminated UTF-8 string psz.
//
//  buf  output buffer or NULL to only compute the length of the result
//  psz  the input string
//  n    size of buf in wide characters, ignored if buf is NULL
//
// Returns the number of wide characters produced, not counting the
// terminating L'\0' (which is only stored if there is room left for it).
//
// Invalid input is handled according to m_options: each byte of an invalid
// sequence is either mapped to a private use area character
// (MAP_INVALID_UTF8_TO_PUA) or to a backslash followed by 3 octal digits
// (MAP_INVALID_UTF8_TO_OCTAL, in which case backslashes in the input are
// doubled), otherwise wxCONV_FAILED is returned.
size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const
{
    size_t len = 0;

    while (*psz && ((!buf) || (len < n)))
    {
        // remember the start of this sequence to be able to output its bytes
        // one by one if it turns out to be invalid
        const char *opsz = psz;
        bool invalid = false;
        unsigned char cc = *psz++, fc = cc;

        // count the number of leading 1 bits in the first byte
        unsigned cnt;
        for (cnt = 0; fc & 0x80; cnt++)
            fc <<= 1;

        if (!cnt)
        {
            // plain ASCII char
            if (buf)
                *buf++ = cc;
            len++;

            // escape the escape character for octal escapes
            if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
                    && cc == '\\' && (!buf || len < n))
            {
                if (buf)
                    *buf++ = cc;
                len++;
            }
        }
        else
        {
            // from now on cnt is the number of bytes which must follow the
            // first one
            cnt--;
            if (!cnt)
            {
                // invalid UTF-8 sequence: a byte with a single leading 1 bit
                // can't start a sequence
                invalid = true;
            }
            else
            {
                // ocnt is the index in utf8_max of the maximal value which
                // could have been encoded using one byte less
                unsigned ocnt = cnt - 1;

                // extract the payload bits of the first byte
                wxUint32 res = cc & (0x3f >> cnt);
                while (cnt--)
                {
                    cc = *psz;
                    if ((cc & 0xC0) != 0x80)
                    {
                        // invalid UTF-8 sequence
                        //
                        // psz is not advanced so this byte (which may be the
                        // terminating NUL) is examined again by the outer loop
                        invalid = true;
                        break;
                    }

                    psz++;
                    res = (res << 6) | (cc & 0x3f);
                }

                if (invalid || res <= utf8_max[ocnt])
                {
                    // illegal UTF-8 encoding: either the sequence is
                    // truncated or a shorter one could have been used for
                    // this value
                    invalid = true;
                }
                else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
                        res >= wxUnicodePUA && res < wxUnicodePUAEnd)
                {
                    // if one of our PUA characters turns up externally
                    // it must also be treated as an illegal sequence
                    // (a bit like you have to escape an escape character)
                    invalid = true;
                }
                else
                {
#ifdef WC_UTF16
                    // cast is ok because wchar_t == wxUint16 if WC_UTF16
                    size_t pa = encode_utf16(res, (wxUint16 *)buf);
                    if (pa == wxCONV_FAILED)
                    {
                        invalid = true;
                    }
                    else
                    {
                        if (buf)
                            buf += pa;
                        len += pa;
                    }
#else // !WC_UTF16
                    if (buf)
                        *buf++ = (wchar_t)res;
                    len++;
#endif // WC_UTF16/!WC_UTF16
                }
            }

            if (invalid)
            {
                // output all the bytes of the invalid sequence consumed so
                // far, i.e. those between opsz and psz, one by one
                if (m_options & MAP_INVALID_UTF8_TO_PUA)
                {
                    while (opsz < psz && (!buf || len < n))
                    {
#ifdef WC_UTF16
                        // cast is ok because wchar_t == wxUint16 if WC_UTF16
                        size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
                        wxASSERT(pa != wxCONV_FAILED);
                        if (buf)
                            buf += pa;
                        opsz++;
                        len += pa;
#else
                        if (buf)
                            *buf++ = (wchar_t)(wxUnicodePUA + (unsigned char)*opsz);
                        opsz++;
                        len++;
#endif
                    }
                }
                else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
                {
                    while (opsz < psz && (!buf || len < n))
                    {
                        // the escape is only stored if all 4 of its
                        // characters fit into the buffer but is always
                        // counted in the returned length
                        if ( buf && len + 3 < n )
                        {
                            unsigned char on = *opsz;
                            *buf++ = L'\\';
                            *buf++ = (wchar_t)( L'0' + on / 0100 );
                            *buf++ = (wchar_t)( L'0' + (on % 0100) / 010 );
                            *buf++ = (wchar_t)( L'0' + on % 010 );
                        }

                        opsz++;
                        len += 4;
                    }
                }
                else // MAP_INVALID_UTF8_NOT
                {
                    return wxCONV_FAILED;
                }
            }
        }
    }

    if (buf && (len < n))
        *buf = 0;

    return len;
}
 865
 866 static inline bool isoctal(wchar_t wch)
 867 {
 868     return L'0' <= wch && wch <= L'7';
 869 }
 870
// Encodes the L'\0'-terminated wide string psz in UTF-8.
//
//  buf  output buffer or NULL to only compute the length of the result
//  psz  the input string
//  n    size of buf in bytes, ignored if buf is NULL
//
// Returns the number of bytes produced, not counting the terminating NUL
// (which is only stored if there is room left for it).
//
// Depending on m_options, the private use area characters or the octal
// escapes are converted back to the single bytes they stand for.
//
// NOTE(review): n is only checked before starting to encode each character,
// so a multibyte sequence may be written past the first n bytes of buf --
// callers should size it using a first call with NULL buf.
size_t wxMBConvUTF8::WC2MB(char *buf, const wchar_t *psz, size_t n) const
{
    size_t len = 0;

    while (*psz && ((!buf) || (len < n)))
    {
        wxUint32 cc;

#ifdef WC_UTF16
        // cast is ok for WC_UTF16
        size_t pa = decode_utf16((const wxUint16 *)psz, cc);

        // an invalid surrogate is not an error here: just skip the single
        // unit, which decode_utf16() stored in cc, and encode it as is
        psz += (pa == wxCONV_FAILED) ? 1 : pa;
#else
        cc = (*psz++) & 0x7fffffff;
#endif

        if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
                && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
        {
            // map the private use area character back to the byte it
            // represents
            if (buf)
                *buf++ = (char)(cc - wxUnicodePUA);
            len++;
        }
        else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL)
                    && cc == L'\\' && psz[0] == L'\\' )
        {
            // an escaped backslash: output just one of them
            if (buf)
                *buf++ = (char)cc;
            psz++;
            len++;
        }
        else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
                    cc == L'\\' &&
                        isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
        {
            // an octal escape: output the byte with the value given by the 3
            // digits following the backslash
            if (buf)
            {
                *buf++ = (char) ((psz[0] - L'0') * 0100 +
                                 (psz[1] - L'0') * 010 +
                                 (psz[2] - L'0'));
            }

            psz += 3;
            len++;
        }
        else
        {
            // find the number of bytes, in addition to the first one, needed
            // to encode this character
            unsigned cnt;
            for (cnt = 0; cc > utf8_max[cnt]; cnt++)
            {
            }

            if (!cnt)
            {
                // plain ASCII char
                if (buf)
                    *buf++ = (char) cc;
                len++;
            }
            else
            {
                len += cnt + 1;
                if (buf)
                {
                    // the first byte consists of the length marker followed
                    // by the most significant bits of the character
                    //
                    // NOTE(review): the marker computation presumably relies
                    // on the right shift of a negative value being arithmetic
                    *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));

                    // each of the following bytes carries the next 6 bits
                    while (cnt--)
                        *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
                }
            }
        }
    }

    if (buf && (len < n))
        *buf = 0;

    return len;
}
 948
 949 // ============================================================================
 950 // UTF-16
 951 // ============================================================================
 952
// wxMBConvUTF16straight is defined as the name of the class for the UTF-16
// variant selected by WORDS_BIGENDIAN (BE if it is defined, LE otherwise)
// and wxMBConvUTF16swap as the name of the class for the other variant
#ifdef WORDS_BIGENDIAN
    #define wxMBConvUTF16straight wxMBConvUTF16BE
    #define wxMBConvUTF16swap     wxMBConvUTF16LE
#else
    #define wxMBConvUTF16swap     wxMBConvUTF16BE
    #define wxMBConvUTF16straight wxMBConvUTF16LE
#endif
 960
 961 /* static */
 962 size_t wxMBConvUTF16Base::GetLength(const char *src, size_t srcLen)
 963 {
 964     if ( srcLen == wxNO_LEN )
 965     {
 966         // count the number of bytes in input, including the trailing NULs
 967         const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
 968         for ( srcLen = 1; *inBuff++; srcLen++ )
 969             ;
 970
 971         srcLen *= BYTES_PER_CHAR;
 972     }
 973     else // we already have the length
 974     {
 975         // we can only convert an entire number of UTF-16 characters
 976         if ( srcLen % BYTES_PER_CHAR )
 977             return wxCONV_FAILED;
 978     }
 979
 980     return srcLen;
 981 }
 982
 983 // case when in-memory representation is UTF-16 too
 984 #ifdef WC_UTF16
 985
 986 // ----------------------------------------------------------------------------
 987 // conversions without endianness change
 988 // ----------------------------------------------------------------------------
 989
 990 size_t
 991 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
 992                                const char *src, size_t srcLen) const
 993 {
 994     // set up the scene for using memcpy() (which is presumably more efficient
 995     // than copying the bytes one by one)
 996     srcLen = GetLength(src, srcLen);
 997     if ( srcLen == wxNO_LEN )
 998         return wxCONV_FAILED;
 999
1000     const size_t inLen = srcLen / BYTES_PER_CHAR;
1001     if ( dst )
1002     {
1003         if ( dstLen < inLen )
1004             return wxCONV_FAILED;
1005
1006         memcpy(dst, src, srcLen);
1007     }
1008
1009     return inLen;
1010 }
1011
1012 size_t
1013 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1014                                  const wchar_t *src, size_t srcLen) const
1015 {
1016     if ( srcLen == wxNO_LEN )
1017         srcLen = wxWcslen(src) + 1;
1018
1019     srcLen *= BYTES_PER_CHAR;
1020
1021     if ( dst )
1022     {
1023         if ( dstLen < srcLen )
1024             return wxCONV_FAILED;
1025
1026         memcpy(dst, src, srcLen);
1027     }
1028
1029     return srcLen;
1030 }
1031
1032 // ----------------------------------------------------------------------------
1033 // endian-reversing conversions
1034 // ----------------------------------------------------------------------------
1035
1036 size_t
1037 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1038                            const char *src, size_t srcLen) const
1039 {
1040     srcLen = GetLength(src, srcLen);
1041     if ( srcLen == wxNO_LEN )
1042         return wxCONV_FAILED;
1043
1044     srcLen /= BYTES_PER_CHAR;
1045
1046     if ( dst )
1047     {
1048         if ( dstLen < srcLen )
1049             return wxCONV_FAILED;
1050
1051         const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1052         for ( size_t n = 0; n < srcLen; n++, inBuff++ )
1053         {
1054             *dst++ = wxUINT16_SWAP_ALWAYS(*inBuff);
1055         }
1056     }
1057
1058     return srcLen;
1059 }
1060
1061 size_t
1062 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1063                              const wchar_t *src, size_t srcLen) const
1064 {
1065     if ( srcLen == wxNO_LEN )
1066         srcLen = wxWcslen(src) + 1;
1067
1068     srcLen *= BYTES_PER_CHAR;
1069
1070     if ( dst )
1071     {
1072         if ( dstLen < srcLen )
1073             return wxCONV_FAILED;
1074
1075         wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1076         for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
1077         {
1078             *outBuff++ = wxUINT16_SWAP_ALWAYS(*src);
1079         }
1080     }
1081
1082     return srcLen;
1083 }
1084
1085 #else // !WC_UTF16: wchar_t is UTF-32
1086
1087 // ----------------------------------------------------------------------------
1088 // conversions without endianness change
1089 // ----------------------------------------------------------------------------
1090
1091 size_t
1092 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1093                                const char *src, size_t srcLen) const
1094 {
1095     srcLen = GetLength(src, srcLen);
1096     if ( srcLen == wxNO_LEN )
1097         return wxCONV_FAILED;
1098
1099     const size_t inLen = srcLen / BYTES_PER_CHAR;
1100     if ( !dst )
1101     {
1102         // optimization: return maximal space which could be needed for this
1103         // string even if the real size could be smaller if the buffer contains
1104         // any surrogates
1105         return inLen;
1106     }
1107
1108     size_t outLen = 0;
1109     const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1110     for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1111     {
1112         const wxUint32 ch = wxDecodeSurrogate(&inBuff);
1113         if ( !inBuff )
1114             return wxCONV_FAILED;
1115
1116         if ( ++outLen > dstLen )
1117             return wxCONV_FAILED;
1118
1119         *dst++ = ch;
1120     }
1121
1122
1123     return outLen;
1124 }
1125
1126 size_t
1127 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1128                                  const wchar_t *src, size_t srcLen) const
1129 {
1130     if ( srcLen == wxNO_LEN )
1131         srcLen = wxWcslen(src) + 1;
1132
1133     size_t outLen = 0;
1134     wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1135     for ( size_t n = 0; n < srcLen; n++ )
1136     {
1137         wxUint16 cc[2];
1138         const size_t numChars = encode_utf16(*src++, cc);
1139         if ( numChars == wxCONV_FAILED )
1140             return wxCONV_FAILED;
1141
1142         outLen += numChars * BYTES_PER_CHAR;
1143         if ( outBuff )
1144         {
1145             if ( outLen > dstLen )
1146                 return wxCONV_FAILED;
1147
1148             *outBuff++ = cc[0];
1149             if ( numChars == 2 )
1150             {
1151                 // second character of a surrogate
1152                 *outBuff++ = cc[1];
1153             }
1154         }
1155     }
1156
1157     return outLen;
1158 }
1159
1160 // ----------------------------------------------------------------------------
1161 // endian-reversing conversions
1162 // ----------------------------------------------------------------------------
1163
1164 size_t
1165 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1166                            const char *src, size_t srcLen) const
1167 {
1168     srcLen = GetLength(src, srcLen);
1169     if ( srcLen == wxNO_LEN )
1170         return wxCONV_FAILED;
1171
1172     const size_t inLen = srcLen / BYTES_PER_CHAR;
1173     if ( !dst )
1174     {
1175         // optimization: return maximal space which could be needed for this
1176         // string even if the real size could be smaller if the buffer contains
1177         // any surrogates
1178         return inLen;
1179     }
1180
1181     size_t outLen = 0;
1182     const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1183     for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1184     {
1185         wxUint32 ch;
1186         wxUint16 tmp[2];
1187
1188         tmp[0] = wxUINT16_SWAP_ALWAYS(*inBuff);
1189         inBuff++;
1190         tmp[1] = wxUINT16_SWAP_ALWAYS(*inBuff);
1191
1192         const size_t numChars = decode_utf16(tmp, ch);
1193         if ( numChars == wxCONV_FAILED )
1194             return wxCONV_FAILED;
1195
1196         if ( numChars == 2 )
1197             inBuff++;
1198
1199         if ( ++outLen > dstLen )
1200             return wxCONV_FAILED;
1201
1202         *dst++ = ch;
1203     }
1204
1205
1206     return outLen;
1207 }
1208
1209 size_t
1210 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1211                              const wchar_t *src, size_t srcLen) const
1212 {
1213     if ( srcLen == wxNO_LEN )
1214         srcLen = wxWcslen(src) + 1;
1215
1216     size_t outLen = 0;
1217     wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1218     for ( const wchar_t *srcEnd = src + srcLen; src < srcEnd; src++ )
1219     {
1220         wxUint16 cc[2];
1221         const size_t numChars = encode_utf16(*src, cc);
1222         if ( numChars == wxCONV_FAILED )
1223             return wxCONV_FAILED;
1224
1225         outLen += numChars * BYTES_PER_CHAR;
1226         if ( outBuff )
1227         {
1228             if ( outLen > dstLen )
1229                 return wxCONV_FAILED;
1230
1231             *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[0]);
1232             if ( numChars == 2 )
1233             {
1234                 // second character of a surrogate
1235                 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[1]);
1236             }
1237         }
1238     }
1239
1240     return outLen;
1241 }
1242
1243 #endif // WC_UTF16/!WC_UTF16
1244
1245
1246 // ============================================================================
1247 // UTF-32
1248 // ============================================================================
1249
// Map the endianness-neutral helper names used below onto the concrete
// converter classes: the "straight" variant is the one whose byte order is
// selected as the host's by WORDS_BIGENDIAN, "swap" is the opposite one.
#ifdef WORDS_BIGENDIAN
    #define wxMBConvUTF32straight  wxMBConvUTF32BE
    #define wxMBConvUTF32swap      wxMBConvUTF32LE
#else
    #define wxMBConvUTF32swap      wxMBConvUTF32BE
    #define wxMBConvUTF32straight  wxMBConvUTF32LE
#endif


// definitions of the global UTF-32 converter objects, one per byte order
WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
1261
1262 /* static */
1263 size_t wxMBConvUTF32Base::GetLength(const char *src, size_t srcLen)
1264 {
1265     if ( srcLen == wxNO_LEN )
1266     {
1267         // count the number of bytes in input, including the trailing NULs
1268         const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1269         for ( srcLen = 1; *inBuff++; srcLen++ )
1270             ;
1271
1272         srcLen *= BYTES_PER_CHAR;
1273     }
1274     else // we already have the length
1275     {
1276         // we can only convert an entire number of UTF-32 characters
1277         if ( srcLen % BYTES_PER_CHAR )
1278             return wxCONV_FAILED;
1279     }
1280
1281     return srcLen;
1282 }
1283
1284 // case when in-memory representation is UTF-16
1285 #ifdef WC_UTF16
1286
1287 // ----------------------------------------------------------------------------
1288 // conversions without endianness change
1289 // ----------------------------------------------------------------------------
1290
1291 size_t
1292 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1293                                const char *src, size_t srcLen) const
1294 {
1295     srcLen = GetLength(src, srcLen);
1296     if ( srcLen == wxNO_LEN )
1297         return wxCONV_FAILED;
1298
1299     const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1300     const size_t inLen = srcLen / BYTES_PER_CHAR;
1301     size_t outLen = 0;
1302     for ( size_t n = 0; n < inLen; n++ )
1303     {
1304         wxUint16 cc[2];
1305         const size_t numChars = encode_utf16(*inBuff++, cc);
1306         if ( numChars == wxCONV_FAILED )
1307             return wxCONV_FAILED;
1308
1309         outLen += numChars;
1310         if ( dst )
1311         {
1312             if ( outLen > dstLen )
1313                 return wxCONV_FAILED;
1314
1315             *dst++ = cc[0];
1316             if ( numChars == 2 )
1317             {
1318                 // second character of a surrogate
1319                 *dst++ = cc[1];
1320             }
1321         }
1322     }
1323
1324     return outLen;
1325 }
1326
1327 size_t
1328 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1329                                  const wchar_t *src, size_t srcLen) const
1330 {
1331     if ( srcLen == wxNO_LEN )
1332         srcLen = wxWcslen(src) + 1;
1333
1334     if ( !dst )
1335     {
1336         // optimization: return maximal space which could be needed for this
1337         // string instead of the exact amount which could be less if there are
1338         // any surrogates in the input
1339         //
1340         // we consider that surrogates are rare enough to make it worthwhile to
1341         // avoid running the loop below at the cost of slightly extra memory
1342         // consumption
1343         return srcLen * BYTES_PER_CHAR;
1344     }
1345
1346     wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1347     size_t outLen = 0;
1348     for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1349     {
1350         const wxUint32 ch = wxDecodeSurrogate(&src);
1351         if ( !src )
1352             return wxCONV_FAILED;
1353
1354         outLen += BYTES_PER_CHAR;
1355
1356         if ( outLen > dstLen )
1357             return wxCONV_FAILED;
1358
1359         *outBuff++ = ch;
1360     }
1361
1362     return outLen;
1363 }
1364
1365 // ----------------------------------------------------------------------------
1366 // endian-reversing conversions
1367 // ----------------------------------------------------------------------------
1368
1369 size_t
1370 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1371                            const char *src, size_t srcLen) const
1372 {
1373     srcLen = GetLength(src, srcLen);
1374     if ( srcLen == wxNO_LEN )
1375         return wxCONV_FAILED;
1376
1377     const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1378     const size_t inLen = srcLen / BYTES_PER_CHAR;
1379     size_t outLen = 0;
1380     for ( size_t n = 0; n < inLen; n++, inBuff++ )
1381     {
1382         wxUint16 cc[2];
1383         const size_t numChars = encode_utf16(wxUINT32_SWAP_ALWAYS(*inBuff), cc);
1384         if ( numChars == wxCONV_FAILED )
1385             return wxCONV_FAILED;
1386
1387         outLen += numChars;
1388         if ( dst )
1389         {
1390             if ( outLen > dstLen )
1391                 return wxCONV_FAILED;
1392
1393             *dst++ = cc[0];
1394             if ( numChars == 2 )
1395             {
1396                 // second character of a surrogate
1397                 *dst++ = cc[1];
1398             }
1399         }
1400     }
1401
1402     return outLen;
1403 }
1404
1405 size_t
1406 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1407                              const wchar_t *src, size_t srcLen) const
1408 {
1409     if ( srcLen == wxNO_LEN )
1410         srcLen = wxWcslen(src) + 1;
1411
1412     if ( !dst )
1413     {
1414         // optimization: return maximal space which could be needed for this
1415         // string instead of the exact amount which could be less if there are
1416         // any surrogates in the input
1417         //
1418         // we consider that surrogates are rare enough to make it worthwhile to
1419         // avoid running the loop below at the cost of slightly extra memory
1420         // consumption
1421         return srcLen*BYTES_PER_CHAR;
1422     }
1423
1424     wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1425     size_t outLen = 0;
1426     for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1427     {
1428         const wxUint32 ch = wxDecodeSurrogate(&src);
1429         if ( !src )
1430             return wxCONV_FAILED;
1431
1432         outLen += BYTES_PER_CHAR;
1433
1434         if ( outLen > dstLen )
1435             return wxCONV_FAILED;
1436
1437         *outBuff++ = wxUINT32_SWAP_ALWAYS(ch);
1438     }
1439
1440     return outLen;
1441 }
1442
1443 #else // !WC_UTF16: wchar_t is UTF-32
1444
1445 // ----------------------------------------------------------------------------
1446 // conversions without endianness change
1447 // ----------------------------------------------------------------------------
1448
1449 size_t
1450 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1451                                const char *src, size_t srcLen) const
1452 {
1453     // use memcpy() as it should be much faster than hand-written loop
1454     srcLen = GetLength(src, srcLen);
1455     if ( srcLen == wxNO_LEN )
1456         return wxCONV_FAILED;
1457
1458     const size_t inLen = srcLen/BYTES_PER_CHAR;
1459     if ( dst )
1460     {
1461         if ( dstLen < inLen )
1462             return wxCONV_FAILED;
1463
1464         memcpy(dst, src, srcLen);
1465     }
1466
1467     return inLen;
1468 }
1469
1470 size_t
1471 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1472                                  const wchar_t *src, size_t srcLen) const
1473 {
1474     if ( srcLen == wxNO_LEN )
1475         srcLen = wxWcslen(src) + 1;
1476
1477     srcLen *= BYTES_PER_CHAR;
1478
1479     if ( dst )
1480     {
1481         if ( dstLen < srcLen )
1482             return wxCONV_FAILED;
1483
1484         memcpy(dst, src, srcLen);
1485     }
1486
1487     return srcLen;
1488 }
1489
1490 // ----------------------------------------------------------------------------
1491 // endian-reversing conversions
1492 // ----------------------------------------------------------------------------
1493
1494 size_t
1495 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1496                            const char *src, size_t srcLen) const
1497 {
1498     srcLen = GetLength(src, srcLen);
1499     if ( srcLen == wxNO_LEN )
1500         return wxCONV_FAILED;
1501
1502     srcLen /= BYTES_PER_CHAR;
1503
1504     if ( dst )
1505     {
1506         if ( dstLen < srcLen )
1507             return wxCONV_FAILED;
1508
1509         const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1510         for ( size_t n = 0; n < srcLen; n++, inBuff++ )
1511         {
1512             *dst++ = wxUINT32_SWAP_ALWAYS(*inBuff);
1513         }
1514     }
1515
1516     return srcLen;
1517 }
1518
1519 size_t
1520 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1521                              const wchar_t *src, size_t srcLen) const
1522 {
1523     if ( srcLen == wxNO_LEN )
1524         srcLen = wxWcslen(src) + 1;
1525
1526     srcLen *= BYTES_PER_CHAR;
1527
1528     if ( dst )
1529     {
1530         if ( dstLen < srcLen )
1531             return wxCONV_FAILED;
1532
1533         wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1534         for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
1535         {
1536             *outBuff++ = wxUINT32_SWAP_ALWAYS(*src);
1537         }
1538     }
1539
1540     return srcLen;
1541 }
1542
1543 #endif // WC_UTF16/!WC_UTF16
1544
1545
1546 // ============================================================================
1547 // The classes doing conversion using the iconv_xxx() functions
1548 // ============================================================================
1549
1550 #ifdef HAVE_ICONV
1551
// VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
//     E2BIG if output buffer is _exactly_ as big as needed. Such case is
//     (unless there's yet another bug in glibc) the only case when iconv()
//     returns with (size_t)-1 (which means error) and says there are 0 bytes
//     left in the input buffer -- when _real_ error occurs,
//     bytes-left-in-input buffer is non-zero. Hence, this alternative test for
//     iconv() failure.
//     [This bug does not appear in glibc 2.2.]
//
// ICONV_FAILED(cres, bufLeft) takes the value returned by iconv() and the
// number of input bytes it left unconverted and evaluates to true if the call
// should be treated as having failed.
#if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
#define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
                                     (errno != E2BIG || bufLeft != 0))
#else
#define ICONV_FAILED(cres, bufLeft)  (cres == (size_t)-1)
#endif

// casts the address of an input pointer to the type used for the input
// buffer argument in the iconv() calls in this file
#define ICONV_CHAR_CAST(x)  ((ICONV_CONST char **)(x))

// value of an iconv_t which doesn't refer to any open conversion descriptor
#define ICONV_T_INVALID ((iconv_t)-1)

// WC_BSWAP reverses the byte order of a single wchar_t and WC_ENC is the wx
// font encoding corresponding to wchar_t, both depend on the wchar_t size
#if SIZEOF_WCHAR_T == 4
    #define WC_BSWAP    wxUINT32_SWAP_ALWAYS
    #define WC_ENC      wxFONTENCODING_UTF32
#elif SIZEOF_WCHAR_T == 2
    #define WC_BSWAP    wxUINT16_SWAP_ALWAYS
    #define WC_ENC      wxFONTENCODING_UTF16
#else // sizeof(wchar_t) != 2 nor 4
    // does this ever happen?
    #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
#endif
1581
1582 // ----------------------------------------------------------------------------
1583 // wxMBConv_iconv: encapsulates an iconv character set
1584 // ----------------------------------------------------------------------------
1585
1586 class wxMBConv_iconv : public wxMBConv
1587 {
1588 public:
1589     wxMBConv_iconv(const wxChar *name);
1590     virtual ~wxMBConv_iconv();
1591
1592     virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const;
1593     virtual size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const;
1594
1595     // classify this encoding as explained in wxMBConv::GetMBNulLen() comment
1596     virtual size_t GetMBNulLen() const;
1597
1598     virtual wxMBConv *Clone() const
1599     {
1600         wxMBConv_iconv *p = new wxMBConv_iconv(m_name);
1601         p->m_minMBCharWidth = m_minMBCharWidth;
1602         return p;
1603     }
1604
1605     bool IsOk() const
1606         { return (m2w != ICONV_T_INVALID) && (w2m != ICONV_T_INVALID); }
1607
1608 protected:
1609     // the iconv handlers used to translate from multibyte
1610     // to wide char and in the other direction
1611     iconv_t m2w,
1612             w2m;
1613
1614 #if wxUSE_THREADS
1615     // guards access to m2w and w2m objects
1616     wxMutex m_iconvMutex;
1617 #endif
1618
1619 private:
1620     // the name (for iconv_open()) of a wide char charset -- if none is
1621     // available on this machine, it will remain NULL
1622     static wxString ms_wcCharsetName;
1623
1624     // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
1625     // different endian-ness than the native one
1626     static bool ms_wcNeedsSwap;
1627
1628
1629     // name of the encoding handled by this conversion
1630     wxString m_name;
1631
1632     // cached result of GetMBNulLen(); set to 0 meaning "unknown"
1633     // initially
1634     size_t m_minMBCharWidth;
1635 };
1636
1637 // make the constructor available for unit testing
1638 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_iconv( const wxChar* name )
1639 {
1640     wxMBConv_iconv* result = new wxMBConv_iconv( name );
1641     if ( !result->IsOk() )
1642     {
1643         delete result;
1644         return 0;
1645     }
1646
1647     return result;
1648 }
1649
// static members shared by all wxMBConv_iconv objects: the iconv name of the
// charset used for wchar_t (empty until one is found) and whether data in
// this charset must be byte-swapped to get the native representation
wxString wxMBConv_iconv::ms_wcCharsetName;
bool wxMBConv_iconv::ms_wcNeedsSwap = false;
1652
1653 wxMBConv_iconv::wxMBConv_iconv(const wxChar *name)
1654               : m_name(name)
1655 {
1656     m_minMBCharWidth = 0;
1657
1658     // iconv operates with chars, not wxChars, but luckily it uses only ASCII
1659     // names for the charsets
1660     const wxCharBuffer cname(wxString(name).ToAscii());
1661
1662     // check for charset that represents wchar_t:
1663     if ( ms_wcCharsetName.empty() )
1664     {
1665         wxLogTrace(TRACE_STRCONV, _T("Looking for wide char codeset:"));
1666
1667 #if wxUSE_FONTMAP
1668         const wxChar **names = wxFontMapperBase::GetAllEncodingNames(WC_ENC);
1669 #else // !wxUSE_FONTMAP
1670         static const wxChar *names[] =
1671         {
1672 #if SIZEOF_WCHAR_T == 4
1673             _T("UCS-4"),
1674 #elif SIZEOF_WCHAR_T = 2
1675             _T("UCS-2"),
1676 #endif
1677             NULL
1678         };
1679 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
1680
1681         for ( ; *names && ms_wcCharsetName.empty(); ++names )
1682         {
1683             const wxString nameCS(*names);
1684
1685             // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
1686             wxString nameXE(nameCS);
1687
1688 #ifdef WORDS_BIGENDIAN
1689                 nameXE += _T("BE");
1690 #else // little endian
1691                 nameXE += _T("LE");
1692 #endif
1693
1694             wxLogTrace(TRACE_STRCONV, _T("  trying charset \"%s\""),
1695                        nameXE.c_str());
1696
1697             m2w = iconv_open(nameXE.ToAscii(), cname);
1698             if ( m2w == ICONV_T_INVALID )
1699             {
1700                 // try charset w/o bytesex info (e.g. "UCS4")
1701                 wxLogTrace(TRACE_STRCONV, _T("  trying charset \"%s\""),
1702                            nameCS.c_str());
1703                 m2w = iconv_open(nameCS.ToAscii(), cname);
1704
1705                 // and check for bytesex ourselves:
1706                 if ( m2w != ICONV_T_INVALID )
1707                 {
1708                     char    buf[2], *bufPtr;
1709                     wchar_t wbuf[2], *wbufPtr;
1710                     size_t  insz, outsz;
1711                     size_t  res;
1712
1713                     buf[0] = 'A';
1714                     buf[1] = 0;
1715                     wbuf[0] = 0;
1716                     insz = 2;
1717                     outsz = SIZEOF_WCHAR_T * 2;
1718                     wbufPtr = wbuf;
1719                     bufPtr = buf;
1720
1721                     res = iconv(
1722                         m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
1723                         (char**)&wbufPtr, &outsz);
1724
1725                     if (ICONV_FAILED(res, insz))
1726                     {
1727                         wxLogLastError(wxT("iconv"));
1728                         wxLogError(_("Conversion to charset '%s' doesn't work."),
1729                                    nameCS.c_str());
1730                     }
1731                     else // ok, can convert to this encoding, remember it
1732                     {
1733                         ms_wcCharsetName = nameCS;
1734                         ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
1735                     }
1736                 }
1737             }
1738             else // use charset not requiring byte swapping
1739             {
1740                 ms_wcCharsetName = nameXE;
1741             }
1742         }
1743
1744         wxLogTrace(TRACE_STRCONV,
1745                    wxT("iconv wchar_t charset is \"%s\"%s"),
1746                    ms_wcCharsetName.empty() ? _T("<none>")
1747                                             : ms_wcCharsetName.c_str(),
1748                    ms_wcNeedsSwap ? _T(" (needs swap)")
1749                                   : _T(""));
1750     }
1751     else // we already have ms_wcCharsetName
1752     {
1753         m2w = iconv_open(ms_wcCharsetName.ToAscii(), cname);
1754     }
1755
1756     if ( ms_wcCharsetName.empty() )
1757     {
1758         w2m = ICONV_T_INVALID;
1759     }
1760     else
1761     {
1762         w2m = iconv_open(cname, ms_wcCharsetName.ToAscii());
1763         if ( w2m == ICONV_T_INVALID )
1764         {
1765             wxLogTrace(TRACE_STRCONV,
1766                        wxT("\"%s\" -> \"%s\" works but not the converse!?"),
1767                        ms_wcCharsetName.c_str(), cname.data());
1768         }
1769     }
1770 }
1771
1772 wxMBConv_iconv::~wxMBConv_iconv()
1773 {
1774     if ( m2w != ICONV_T_INVALID )
1775         iconv_close(m2w);
1776     if ( w2m != ICONV_T_INVALID )
1777         iconv_close(w2m);
1778 }
1779
1780 size_t wxMBConv_iconv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1781 {
1782     // find the string length: notice that must be done differently for
1783     // NUL-terminated strings and UTF-16/32 which are terminated with 2/4 NULs
1784     size_t inbuf;
1785     const size_t nulLen = GetMBNulLen();
1786     switch ( nulLen )
1787     {
1788         default:
1789             return wxCONV_FAILED;
1790
1791         case 1:
1792             inbuf = strlen(psz); // arguably more optimized than our version
1793             break;
1794
1795         case 2:
1796         case 4:
1797             // for UTF-16/32 not only we need to have 2/4 consecutive NULs but
1798             // they also have to start at character boundary and not span two
1799             // adjacent characters
1800             const char *p;
1801             for ( p = psz; NotAllNULs(p, nulLen); p += nulLen )
1802                 ;
1803             inbuf = p - psz;
1804             break;
1805     }
1806
1807 #if wxUSE_THREADS
1808     // NB: iconv() is MT-safe, but each thread must use it's own iconv_t handle.
1809     //     Unfortunately there is a couple of global wxCSConv objects such as
1810     //     wxConvLocal that are used all over wx code, so we have to make sure
1811     //     the handle is used by at most one thread at the time. Otherwise
1812     //     only a few wx classes would be safe to use from non-main threads
1813     //     as MB<->WC conversion would fail "randomly".
1814     wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1815 #endif // wxUSE_THREADS
1816
1817     size_t outbuf = n * SIZEOF_WCHAR_T;
1818     size_t res, cres;
1819     // VS: Use these instead of psz, buf because iconv() modifies its arguments:
1820     wchar_t *bufPtr = buf;
1821     const char *pszPtr = psz;
1822
1823     if (buf)
1824     {
1825         // have destination buffer, convert there
1826         cres = iconv(m2w,
1827                      ICONV_CHAR_CAST(&pszPtr), &inbuf,
1828                      (char**)&bufPtr, &outbuf);
1829         res = n - (outbuf / SIZEOF_WCHAR_T);
1830
1831         if (ms_wcNeedsSwap)
1832         {
1833             // convert to native endianness
1834             for ( unsigned i = 0; i < res; i++ )
1835                 buf[n] = WC_BSWAP(buf[i]);
1836         }
1837
1838         // NUL-terminate the string if there is any space left
1839         if (res < n)
1840             buf[res] = 0;
1841     }
1842     else
1843     {
1844         // no destination buffer... convert using temp buffer
1845         // to calculate destination buffer requirement
1846         wchar_t tbuf[8];
1847         res = 0;
1848
1849         do
1850         {
1851             bufPtr = tbuf;
1852             outbuf = 8 * SIZEOF_WCHAR_T;
1853
1854             cres = iconv(m2w,
1855                          ICONV_CHAR_CAST(&pszPtr), &inbuf,
1856                          (char**)&bufPtr, &outbuf );
1857
1858             res += 8 - (outbuf / SIZEOF_WCHAR_T);
1859         }
1860         while ((cres == (size_t)-1) && (errno == E2BIG));
1861     }
1862
1863     if (ICONV_FAILED(cres, inbuf))
1864     {
1865         //VS: it is ok if iconv fails, hence trace only
1866         wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1867         return wxCONV_FAILED;
1868     }
1869
1870     return res;
1871 }
1872
1873 size_t wxMBConv_iconv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1874 {
1875 #if wxUSE_THREADS
1876     // NB: explained in MB2WC
1877     wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1878 #endif
1879
1880     size_t inlen = wxWcslen(psz);
1881     size_t inbuf = inlen * SIZEOF_WCHAR_T;
1882     size_t outbuf = n;
1883     size_t res, cres;
1884
1885     wchar_t *tmpbuf = 0;
1886
1887     if (ms_wcNeedsSwap)
1888     {
1889         // need to copy to temp buffer to switch endianness
1890         // (doing WC_BSWAP twice on the original buffer won't help, as it
1891         //  could be in read-only memory, or be accessed in some other thread)
1892         tmpbuf = (wchar_t *)malloc(inbuf + SIZEOF_WCHAR_T);
1893         for ( size_t i = 0; i < inlen; i++ )
1894             tmpbuf[n] = WC_BSWAP(psz[i]);
1895
1896         tmpbuf[inlen] = L'\0';
1897         psz = tmpbuf;
1898     }
1899
1900     if (buf)
1901     {
1902         // have destination buffer, convert there
1903         cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1904
1905         res = n - outbuf;
1906
1907         // NB: iconv was given only wcslen(psz) characters on input, and so
1908         //     it couldn't convert the trailing zero. Let's do it ourselves
1909         //     if there's some room left for it in the output buffer.
1910         if (res < n)
1911             buf[0] = 0;
1912     }
1913     else
1914     {
1915         // no destination buffer: convert using temp buffer
1916         // to calculate destination buffer requirement
1917         char tbuf[16];
1918         res = 0;
1919         do
1920         {
1921             buf = tbuf;
1922             outbuf = 16;
1923
1924             cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1925
1926             res += 16 - outbuf;
1927         }
1928         while ((cres == (size_t)-1) && (errno == E2BIG));
1929     }
1930
1931     if (ms_wcNeedsSwap)
1932     {
1933         free(tmpbuf);
1934     }
1935
1936     if (ICONV_FAILED(cres, inbuf))
1937     {
1938         wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1939         return wxCONV_FAILED;
1940     }
1941
1942     return res;
1943 }
1944
1945 size_t wxMBConv_iconv::GetMBNulLen() const
1946 {
1947     if ( m_minMBCharWidth == 0 )
1948     {
1949         wxMBConv_iconv * const self = wxConstCast(this, wxMBConv_iconv);
1950
1951 #if wxUSE_THREADS
1952         // NB: explained in MB2WC
1953         wxMutexLocker lock(self->m_iconvMutex);
1954 #endif
1955
1956         wchar_t *wnul = L"";
1957         char buf[8]; // should be enough for NUL in any encoding
1958         size_t inLen = sizeof(wchar_t),
1959                outLen = WXSIZEOF(buf);
1960         char *inBuff = (char *)wnul;
1961         char *outBuff = buf;
1962         if ( iconv(w2m, ICONV_CHAR_CAST(&inBuff), &inLen, &outBuff, &outLen) == (size_t)-1 )
1963         {
1964             self->m_minMBCharWidth = (size_t)-1;
1965         }
1966         else // ok
1967         {
1968             self->m_minMBCharWidth = outBuff - buf;
1969         }
1970     }
1971
1972     return m_minMBCharWidth;
1973 }
1974
1975 #endif // HAVE_ICONV
1976
1977
1978 // ============================================================================
1979 // Win32 conversion classes
1980 // ============================================================================
1981
1982 #ifdef wxHAVE_WIN32_MB2WC
1983
1984 // from utils.cpp
1985 #if wxUSE_FONTMAP
1986 extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const wxChar *charset);
1987 extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
1988 #endif
1989
1990 class wxMBConv_win32 : public wxMBConv
1991 {
1992 public:
1993     wxMBConv_win32()
1994     {
1995         m_CodePage = CP_ACP;
1996         m_minMBCharWidth = 0;
1997     }
1998
1999     wxMBConv_win32(const wxMBConv_win32& conv)
2000     {
2001         m_CodePage = conv.m_CodePage;
2002         m_minMBCharWidth = conv.m_minMBCharWidth;
2003     }
2004
2005 #if wxUSE_FONTMAP
2006     wxMBConv_win32(const wxChar* name)
2007     {
2008         m_CodePage = wxCharsetToCodepage(name);
2009         m_minMBCharWidth = 0;
2010     }
2011
2012     wxMBConv_win32(wxFontEncoding encoding)
2013     {
2014         m_CodePage = wxEncodingToCodepage(encoding);
2015         m_minMBCharWidth = 0;
2016     }
2017 #endif // wxUSE_FONTMAP
2018
2019     virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2020     {
2021         // note that we have to use MB_ERR_INVALID_CHARS flag as it without it
2022         // the behaviour is not compatible with the Unix version (using iconv)
2023         // and break the library itself, e.g. wxTextInputStream::NextChar()
2024         // wouldn't work if reading an incomplete MB char didn't result in an
2025         // error
2026         //
2027         // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
2028         // Win XP or newer and it is not supported for UTF-[78] so we always
2029         // use our own conversions in this case. See
2030         //     http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
2031         //     http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
2032         if ( m_CodePage == CP_UTF8 )
2033         {
2034             return wxConvUTF8.MB2WC(buf, psz, n);
2035         }
2036
2037         if ( m_CodePage == CP_UTF7 )
2038         {
2039             return wxConvUTF7.MB2WC(buf, psz, n);
2040         }
2041
2042         int flags = 0;
2043         if ( (m_CodePage < 50000 && m_CodePage != CP_SYMBOL) &&
2044                 IsAtLeastWin2kSP4() )
2045         {
2046             flags = MB_ERR_INVALID_CHARS;
2047         }
2048
2049         const size_t len = ::MultiByteToWideChar
2050                              (
2051                                 m_CodePage,     // code page
2052                                 flags,          // flags: fall on error
2053                                 psz,            // input string
2054                                 -1,             // its length (NUL-terminated)
2055                                 buf,            // output string
2056                                 buf ? n : 0     // size of output buffer
2057                              );
2058         if ( !len )
2059         {
2060             // function totally failed
2061             return wxCONV_FAILED;
2062         }
2063
2064         // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
2065         // check if we succeeded, by doing a double trip:
2066         if ( !flags && buf )
2067         {
2068             const size_t mbLen = strlen(psz);
2069             wxCharBuffer mbBuf(mbLen);
2070             if ( ::WideCharToMultiByte
2071                    (
2072                       m_CodePage,
2073                       0,
2074                       buf,
2075                       -1,
2076                       mbBuf.data(),
2077                       mbLen + 1,        // size in bytes, not length
2078                       NULL,
2079                       NULL
2080                    ) == 0 ||
2081                   strcmp(mbBuf, psz) != 0 )
2082             {
2083                 // we didn't obtain the same thing we started from, hence
2084                 // the conversion was lossy and we consider that it failed
2085                 return wxCONV_FAILED;
2086             }
2087         }
2088
2089         // note that it returns count of written chars for buf != NULL and size
2090         // of the needed buffer for buf == NULL so in either case the length of
2091         // the string (which never includes the terminating NUL) is one less
2092         return len - 1;
2093     }
2094
2095     virtual size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
2096     {
2097         /*
2098             we have a problem here: by default, WideCharToMultiByte() may
2099             replace characters unrepresentable in the target code page with bad
2100             quality approximations such as turning "1/2" symbol (U+00BD) into
2101             "1" for the code pages which don't have it and we, obviously, want
2102             to avoid this at any price
2103
2104             the trouble is that this function does it _silently_, i.e. it won't
2105             even tell us whether it did or not... Win98/2000 and higher provide
2106             WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
2107             we have to resort to a round trip, i.e. check that converting back
2108             results in the same string -- this is, of course, expensive but
2109             otherwise we simply can't be sure to not garble the data.
2110          */
2111
2112         // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
2113         // it doesn't work with CJK encodings (which we test for rather roughly
2114         // here...) nor with UTF-7/8 nor, of course, with Windows versions not
2115         // supporting it
2116         BOOL usedDef wxDUMMY_INITIALIZE(false);
2117         BOOL *pUsedDef;
2118         int flags;
2119         if ( CanUseNoBestFit() && m_CodePage < 50000 )
2120         {
2121             // it's our lucky day
2122             flags = WC_NO_BEST_FIT_CHARS;
2123             pUsedDef = &usedDef;
2124         }
2125         else // old system or unsupported encoding
2126         {
2127             flags = 0;
2128             pUsedDef = NULL;
2129         }
2130
2131         const size_t len = ::WideCharToMultiByte
2132                              (
2133                                 m_CodePage,     // code page
2134                                 flags,          // either none or no best fit
2135                                 pwz,            // input string
2136                                 -1,             // it is (wide) NUL-terminated
2137                                 buf,            // output buffer
2138                                 buf ? n : 0,    // and its size
2139                                 NULL,           // default "replacement" char
2140                                 pUsedDef        // [out] was it used?
2141                              );
2142
2143         if ( !len )
2144         {
2145             // function totally failed
2146             return wxCONV_FAILED;
2147         }
2148
2149         // if we were really converting, check if we succeeded
2150         if ( buf )
2151         {
2152             if ( flags )
2153             {
2154                 // check if the conversion failed, i.e. if any replacements
2155                 // were done
2156                 if ( usedDef )
2157                     return wxCONV_FAILED;
2158             }
2159             else // we must resort to double tripping...
2160             {
2161                 wxWCharBuffer wcBuf(n);
2162                 if ( MB2WC(wcBuf.data(), buf, n) == wxCONV_FAILED ||
2163                         wcscmp(wcBuf, pwz) != 0 )
2164                 {
2165                     // we didn't obtain the same thing we started from, hence
2166                     // the conversion was lossy and we consider that it failed
2167                     return wxCONV_FAILED;
2168                 }
2169             }
2170         }
2171
2172         // see the comment above for the reason of "len - 1"
2173         return len - 1;
2174     }
2175
2176     virtual size_t GetMBNulLen() const
2177     {
2178         if ( m_minMBCharWidth == 0 )
2179         {
2180             int len = ::WideCharToMultiByte
2181                         (
2182                             m_CodePage,     // code page
2183                             0,              // no flags
2184                             L"",            // input string
2185                             1,              // translate just the NUL
2186                             NULL,           // output buffer
2187                             0,              // and its size
2188                             NULL,           // no replacement char
2189                             NULL            // [out] don't care if it was used
2190                         );
2191
2192             wxMBConv_win32 * const self = wxConstCast(this, wxMBConv_win32);
2193             switch ( len )
2194             {
2195                 default:
2196                     wxLogDebug(_T("Unexpected NUL length %d"), len);
2197                     self->m_minMBCharWidth = (size_t)-1;
2198                     break;
2199
2200                 case 0:
2201                     self->m_minMBCharWidth = (size_t)-1;
2202                     break;
2203
2204                 case 1:
2205                 case 2:
2206                 case 4:
2207                     self->m_minMBCharWidth = len;
2208                     break;
2209             }
2210         }
2211
2212         return m_minMBCharWidth;
2213     }
2214
2215     virtual wxMBConv *Clone() const { return new wxMBConv_win32(*this); }
2216
2217     bool IsOk() const { return m_CodePage != -1; }
2218
2219 private:
2220     static bool CanUseNoBestFit()
2221     {
2222         static int s_isWin98Or2k = -1;
2223
2224         if ( s_isWin98Or2k == -1 )
2225         {
2226             int verMaj, verMin;
2227             switch ( wxGetOsVersion(&verMaj, &verMin) )
2228             {
2229                 case wxWIN95:
2230                     s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
2231                     break;
2232
2233                 case wxWINDOWS_NT:
2234                     s_isWin98Or2k = verMaj >= 5;
2235                     break;
2236
2237                 default:
2238                     // unknown: be conservative by default
2239                     s_isWin98Or2k = 0;
2240                     break;
2241             }
2242
2243             wxASSERT_MSG( s_isWin98Or2k != -1, _T("should be set above") );
2244         }
2245
2246         return s_isWin98Or2k == 1;
2247     }
2248
2249     static bool IsAtLeastWin2kSP4()
2250     {
2251 #ifdef __WXWINCE__
2252         return false;
2253 #else
2254         static int s_isAtLeastWin2kSP4 = -1;
2255
2256         if ( s_isAtLeastWin2kSP4 == -1 )
2257         {
2258             OSVERSIONINFOEX ver;
2259
2260             memset(&ver, 0, sizeof(ver));
2261             ver.dwOSVersionInfoSize = sizeof(ver);
2262             GetVersionEx((OSVERSIONINFO*)&ver);
2263
2264             s_isAtLeastWin2kSP4 =
2265               ((ver.dwMajorVersion > 5) || // Vista+
2266                (ver.dwMajorVersion == 5 && ver.dwMinorVersion > 0) || // XP/2003
2267                (ver.dwMajorVersion == 5 && ver.dwMinorVersion == 0 &&
2268                ver.wServicePackMajor >= 4)) // 2000 SP4+
2269               ? 1 : 0;
2270         }
2271
2272         return s_isAtLeastWin2kSP4 == 1;
2273 #endif
2274     }
2275
2276
2277     // the code page we're working with
2278     long m_CodePage;
2279
2280     // cached result of GetMBNulLen(), set to 0 initially meaning
2281     // "unknown"
2282     size_t m_minMBCharWidth;
2283 };
2284
2285 #endif // wxHAVE_WIN32_MB2WC
2286
2287 // ============================================================================
2288 // Cocoa conversion classes
2289 // ============================================================================
2290
2291 #if defined(__WXCOCOA__)
2292
// RN: There is no UTF-32 support in either Core Foundation or Cocoa.
// Strangely enough, Core Foundation uses UTF-32 internally quite a bit -
// it's just not public (yet).
2296
2297 #include <CoreFoundation/CFString.h>
2298 #include <CoreFoundation/CFStringEncodingExt.h>
2299
2300 CFStringEncoding wxCFStringEncFromFontEnc(wxFontEncoding encoding)
2301 {
2302     CFStringEncoding enc = kCFStringEncodingInvalidId ;
2303
2304     switch (encoding)
2305     {
2306         case wxFONTENCODING_DEFAULT :
2307             enc = CFStringGetSystemEncoding();
2308             break ;
2309
2310         case wxFONTENCODING_ISO8859_1 :
2311             enc = kCFStringEncodingISOLatin1 ;
2312             break ;
2313         case wxFONTENCODING_ISO8859_2 :
2314             enc = kCFStringEncodingISOLatin2;
2315             break ;
2316         case wxFONTENCODING_ISO8859_3 :
2317             enc = kCFStringEncodingISOLatin3 ;
2318             break ;
2319         case wxFONTENCODING_ISO8859_4 :
2320             enc = kCFStringEncodingISOLatin4;
2321             break ;
2322         case wxFONTENCODING_ISO8859_5 :
2323             enc = kCFStringEncodingISOLatinCyrillic;
2324             break ;
2325         case wxFONTENCODING_ISO8859_6 :
2326             enc = kCFStringEncodingISOLatinArabic;
2327             break ;
2328         case wxFONTENCODING_ISO8859_7 :
2329             enc = kCFStringEncodingISOLatinGreek;
2330             break ;
2331         case wxFONTENCODING_ISO8859_8 :
2332             enc = kCFStringEncodingISOLatinHebrew;
2333             break ;
2334         case wxFONTENCODING_ISO8859_9 :
2335             enc = kCFStringEncodingISOLatin5;
2336             break ;
2337         case wxFONTENCODING_ISO8859_10 :
2338             enc = kCFStringEncodingISOLatin6;
2339             break ;
2340         case wxFONTENCODING_ISO8859_11 :
2341             enc = kCFStringEncodingISOLatinThai;
2342             break ;
2343         case wxFONTENCODING_ISO8859_13 :
2344             enc = kCFStringEncodingISOLatin7;
2345             break ;
2346         case wxFONTENCODING_ISO8859_14 :
2347             enc = kCFStringEncodingISOLatin8;
2348             break ;
2349         case wxFONTENCODING_ISO8859_15 :
2350             enc = kCFStringEncodingISOLatin9;
2351             break ;
2352
2353         case wxFONTENCODING_KOI8 :
2354             enc = kCFStringEncodingKOI8_R;
2355             break ;
2356         case wxFONTENCODING_ALTERNATIVE : // MS-DOS CP866
2357             enc = kCFStringEncodingDOSRussian;
2358             break ;
2359
2360 //      case wxFONTENCODING_BULGARIAN :
2361 //          enc = ;
2362 //          break ;
2363
2364         case wxFONTENCODING_CP437 :
2365             enc = kCFStringEncodingDOSLatinUS ;
2366             break ;
2367         case wxFONTENCODING_CP850 :
2368             enc = kCFStringEncodingDOSLatin1;
2369             break ;
2370         case wxFONTENCODING_CP852 :
2371             enc = kCFStringEncodingDOSLatin2;
2372             break ;
2373         case wxFONTENCODING_CP855 :
2374             enc = kCFStringEncodingDOSCyrillic;
2375             break ;
2376         case wxFONTENCODING_CP866 :
2377             enc = kCFStringEncodingDOSRussian ;
2378             break ;
2379         case wxFONTENCODING_CP874 :
2380             enc = kCFStringEncodingDOSThai;
2381             break ;
2382         case wxFONTENCODING_CP932 :
2383             enc = kCFStringEncodingDOSJapanese;
2384             break ;
2385         case wxFONTENCODING_CP936 :
2386             enc = kCFStringEncodingDOSChineseSimplif ;
2387             break ;
2388         case wxFONTENCODING_CP949 :
2389             enc = kCFStringEncodingDOSKorean;
2390             break ;
2391         case wxFONTENCODING_CP950 :
2392             enc = kCFStringEncodingDOSChineseTrad;
2393             break ;
2394         case wxFONTENCODING_CP1250 :
2395             enc = kCFStringEncodingWindowsLatin2;
2396             break ;
2397         case wxFONTENCODING_CP1251 :
2398             enc = kCFStringEncodingWindowsCyrillic ;
2399             break ;
2400         case wxFONTENCODING_CP1252 :
2401             enc = kCFStringEncodingWindowsLatin1 ;
2402             break ;
2403         case wxFONTENCODING_CP1253 :
2404             enc = kCFStringEncodingWindowsGreek;
2405             break ;
2406         case wxFONTENCODING_CP1254 :
2407             enc = kCFStringEncodingWindowsLatin5;
2408             break ;
2409         case wxFONTENCODING_CP1255 :
2410             enc = kCFStringEncodingWindowsHebrew ;
2411             break ;
2412         case wxFONTENCODING_CP1256 :
2413             enc = kCFStringEncodingWindowsArabic ;
2414             break ;
2415         case wxFONTENCODING_CP1257 :
2416             enc = kCFStringEncodingWindowsBalticRim;
2417             break ;
2418 //   This only really encodes to UTF7 (if that) evidently
2419 //        case wxFONTENCODING_UTF7 :
2420 //            enc = kCFStringEncodingNonLossyASCII ;
2421 //            break ;
2422         case wxFONTENCODING_UTF8 :
2423             enc = kCFStringEncodingUTF8 ;
2424             break ;
2425         case wxFONTENCODING_EUC_JP :
2426             enc = kCFStringEncodingEUC_JP;
2427             break ;
2428         case wxFONTENCODING_UTF16 :
2429             enc = kCFStringEncodingUnicode ;
2430             break ;
2431         case wxFONTENCODING_MACROMAN :
2432             enc = kCFStringEncodingMacRoman ;
2433             break ;
2434         case wxFONTENCODING_MACJAPANESE :
2435             enc = kCFStringEncodingMacJapanese ;
2436             break ;
2437         case wxFONTENCODING_MACCHINESETRAD :
2438             enc = kCFStringEncodingMacChineseTrad ;
2439             break ;
2440         case wxFONTENCODING_MACKOREAN :
2441             enc = kCFStringEncodingMacKorean ;
2442             break ;
2443         case wxFONTENCODING_MACARABIC :
2444             enc = kCFStringEncodingMacArabic ;
2445             break ;
2446         case wxFONTENCODING_MACHEBREW :
2447             enc = kCFStringEncodingMacHebrew ;
2448             break ;
2449         case wxFONTENCODING_MACGREEK :
2450             enc = kCFStringEncodingMacGreek ;
2451             break ;
2452         case wxFONTENCODING_MACCYRILLIC :
2453             enc = kCFStringEncodingMacCyrillic ;
2454             break ;
2455         case wxFONTENCODING_MACDEVANAGARI :
2456             enc = kCFStringEncodingMacDevanagari ;
2457             break ;
2458         case wxFONTENCODING_MACGURMUKHI :
2459             enc = kCFStringEncodingMacGurmukhi ;
2460             break ;
2461         case wxFONTENCODING_MACGUJARATI :
2462             enc = kCFStringEncodingMacGujarati ;
2463             break ;
2464         case wxFONTENCODING_MACORIYA :
2465             enc = kCFStringEncodingMacOriya ;
2466             break ;
2467         case wxFONTENCODING_MACBENGALI :
2468             enc = kCFStringEncodingMacBengali ;
2469             break ;
2470         case wxFONTENCODING_MACTAMIL :
2471             enc = kCFStringEncodingMacTamil ;
2472             break ;
2473         case wxFONTENCODING_MACTELUGU :
2474             enc = kCFStringEncodingMacTelugu ;
2475             break ;
2476         case wxFONTENCODING_MACKANNADA :
2477             enc = kCFStringEncodingMacKannada ;
2478             break ;
2479         case wxFONTENCODING_MACMALAJALAM :
2480             enc = kCFStringEncodingMacMalayalam ;
2481             break ;
2482         case wxFONTENCODING_MACSINHALESE :
2483             enc = kCFStringEncodingMacSinhalese ;
2484             break ;
2485         case wxFONTENCODING_MACBURMESE :
2486             enc = kCFStringEncodingMacBurmese ;
2487             break ;
2488         case wxFONTENCODING_MACKHMER :
2489             enc = kCFStringEncodingMacKhmer ;
2490             break ;
2491         case wxFONTENCODING_MACTHAI :
2492             enc = kCFStringEncodingMacThai ;
2493             break ;
2494         case wxFONTENCODING_MACLAOTIAN :
2495             enc = kCFStringEncodingMacLaotian ;
2496             break ;
2497         case wxFONTENCODING_MACGEORGIAN :
2498             enc = kCFStringEncodingMacGeorgian ;
2499             break ;
2500         case wxFONTENCODING_MACARMENIAN :
2501             enc = kCFStringEncodingMacArmenian ;
2502             break ;
2503         case wxFONTENCODING_MACCHINESESIMP :
2504             enc = kCFStringEncodingMacChineseSimp ;
2505             break ;
2506         case wxFONTENCODING_MACTIBETAN :
2507             enc = kCFStringEncodingMacTibetan ;
2508             break ;
2509         case wxFONTENCODING_MACMONGOLIAN :
2510             enc = kCFStringEncodingMacMongolian ;
2511             break ;
2512         case wxFONTENCODING_MACETHIOPIC :
2513             enc = kCFStringEncodingMacEthiopic ;
2514             break ;
2515         case wxFONTENCODING_MACCENTRALEUR :
2516             enc = kCFStringEncodingMacCentralEurRoman ;
2517             break ;
2518         case wxFONTENCODING_MACVIATNAMESE :
2519             enc = kCFStringEncodingMacVietnamese ;
2520             break ;
2521         case wxFONTENCODING_MACARABICEXT :
2522             enc = kCFStringEncodingMacExtArabic ;
2523             break ;
2524         case wxFONTENCODING_MACSYMBOL :
2525             enc = kCFStringEncodingMacSymbol ;
2526             break ;
2527         case wxFONTENCODING_MACDINGBATS :
2528             enc = kCFStringEncodingMacDingbats ;
2529             break ;
2530         case wxFONTENCODING_MACTURKISH :
2531             enc = kCFStringEncodingMacTurkish ;
2532             break ;
2533         case wxFONTENCODING_MACCROATIAN :
2534             enc = kCFStringEncodingMacCroatian ;
2535             break ;
2536         case wxFONTENCODING_MACICELANDIC :
2537             enc = kCFStringEncodingMacIcelandic ;
2538             break ;
2539         case wxFONTENCODING_MACROMANIAN :
2540             enc = kCFStringEncodingMacRomanian ;
2541             break ;
2542         case wxFONTENCODING_MACCELTIC :
2543             enc = kCFStringEncodingMacCeltic ;
2544             break ;
2545         case wxFONTENCODING_MACGAELIC :
2546             enc = kCFStringEncodingMacGaelic ;
2547             break ;
2548 //      case wxFONTENCODING_MACKEYBOARD :
2549 //          enc = kCFStringEncodingMacKeyboardGlyphs ;
2550 //          break ;
2551
2552         default :
2553             // because gcc is picky
2554             break ;
2555     }
2556
2557     return enc ;
2558 }
2559
2560 class wxMBConv_cocoa : public wxMBConv
2561 {
2562 public:
2563     wxMBConv_cocoa()
2564     {
2565         Init(CFStringGetSystemEncoding()) ;
2566     }
2567
2568     wxMBConv_cocoa(const wxMBConv_cocoa& conv)
2569     {
2570         m_encoding = conv.m_encoding;
2571     }
2572
2573 #if wxUSE_FONTMAP
2574     wxMBConv_cocoa(const wxChar* name)
2575     {
2576         Init( wxCFStringEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) ) ;
2577     }
2578 #endif
2579
2580     wxMBConv_cocoa(wxFontEncoding encoding)
2581     {
2582         Init( wxCFStringEncFromFontEnc(encoding) );
2583     }
2584
2585     ~wxMBConv_cocoa()
2586     {
2587     }
2588
2589     void Init( CFStringEncoding encoding)
2590     {
2591         m_encoding = encoding ;
2592     }
2593
2594     size_t MB2WC(wchar_t * szOut, const char * szUnConv, size_t nOutSize) const
2595     {
2596         wxASSERT(szUnConv);
2597
2598         CFStringRef theString = CFStringCreateWithBytes (
2599                                                 NULL, //the allocator
2600                                                 (const UInt8*)szUnConv,
2601                                                 strlen(szUnConv),
2602                                                 m_encoding,
2603                                                 false //no BOM/external representation
2604                                                 );
2605
2606         wxASSERT(theString);
2607
2608         size_t nOutLength = CFStringGetLength(theString);
2609
2610         if (szOut == NULL)
2611         {
2612             CFRelease(theString);
2613             return nOutLength;
2614         }
2615
2616         CFRange theRange = { 0, nOutSize };
2617
2618 #if SIZEOF_WCHAR_T == 4
2619         UniChar* szUniCharBuffer = new UniChar[nOutSize];
2620 #endif
2621
2622         CFStringGetCharacters(theString, theRange, szUniCharBuffer);
2623
2624         CFRelease(theString);
2625
2626         szUniCharBuffer[nOutLength] = '\0';
2627
2628 #if SIZEOF_WCHAR_T == 4
2629         wxMBConvUTF16 converter;
2630         converter.MB2WC( szOut, (const char*)szUniCharBuffer, nOutSize );
2631         delete [] szUniCharBuffer;
2632 #endif
2633
2634         return nOutLength;
2635     }
2636
2637     size_t WC2MB(char *szOut, const wchar_t *szUnConv, size_t nOutSize) const
2638     {
2639         wxASSERT(szUnConv);
2640
2641         size_t nRealOutSize;
2642         size_t nBufSize = wxWcslen(szUnConv);
2643         UniChar* szUniBuffer = (UniChar*) szUnConv;
2644
2645 #if SIZEOF_WCHAR_T == 4
2646         wxMBConvUTF16 converter ;
2647         nBufSize = converter.WC2MB( NULL, szUnConv, 0 );
2648         szUniBuffer = new UniChar[ (nBufSize / sizeof(UniChar)) + 1];
2649         converter.WC2MB( (char*) szUniBuffer, szUnConv, nBufSize + sizeof(UniChar));
2650         nBufSize /= sizeof(UniChar);
2651 #endif
2652
2653         CFStringRef theString = CFStringCreateWithCharactersNoCopy(
2654                                 NULL, //allocator
2655                                 szUniBuffer,
2656                                 nBufSize,
2657                                 kCFAllocatorNull //deallocator - we want to deallocate it ourselves
2658                             );
2659
2660         wxASSERT(theString);
2661
2662         //Note that CER puts a BOM when converting to unicode
2663         //so we  check and use getchars instead in that case
2664         if (m_encoding == kCFStringEncodingUnicode)
2665         {
2666             if (szOut != NULL)
2667                 CFStringGetCharacters(theString, CFRangeMake(0, nOutSize - 1), (UniChar*) szOut);
2668
2669             nRealOutSize = CFStringGetLength(theString) + 1;
2670         }
2671         else
2672         {
2673             CFStringGetBytes(
2674                 theString,
2675                 CFRangeMake(0, CFStringGetLength(theString)),
2676                 m_encoding,
2677                 0, //what to put in characters that can't be converted -
2678                     //0 tells CFString to return NULL if it meets such a character
2679                 false, //not an external representation
2680                 (UInt8*) szOut,
2681                 nOutSize,
2682                 (CFIndex*) &nRealOutSize
2683                         );
2684         }
2685
2686         CFRelease(theString);
2687
2688 #if SIZEOF_WCHAR_T == 4
2689         delete[] szUniBuffer;
2690 #endif
2691
2692         return  nRealOutSize - 1;
2693     }
2694
2695     virtual wxMBConv *Clone() const { return new wxMBConv_cocoa(*this); }
2696
2697     bool IsOk() const
2698     {
2699         return m_encoding != kCFStringEncodingInvalidId &&
2700               CFStringIsEncodingAvailable(m_encoding);
2701     }
2702
2703 private:
2704     CFStringEncoding m_encoding ;
2705 };
2706
2707 #endif // defined(__WXCOCOA__)
2708
2709 // ============================================================================
2710 // Mac conversion classes
2711 // ============================================================================
2712
2713 #if defined(__WXMAC__) && defined(TARGET_CARBON)
2714
2715 class wxMBConv_mac : public wxMBConv
2716 {
2717 public:
2718     wxMBConv_mac()
2719     {
2720         Init(CFStringGetSystemEncoding()) ;
2721     }
2722
2723     wxMBConv_mac(const wxMBConv_mac& conv)
2724     {
2725         Init(conv.m_char_encoding);
2726     }
2727
2728 #if wxUSE_FONTMAP
2729     wxMBConv_mac(const wxChar* name)
2730     {
2731         Init( wxMacGetSystemEncFromFontEnc( wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) );
2732     }
2733 #endif
2734
2735     wxMBConv_mac(wxFontEncoding encoding)
2736     {
2737         Init( wxMacGetSystemEncFromFontEnc(encoding) );
2738     }
2739
2740     ~wxMBConv_mac()
2741     {
2742         OSStatus status = noErr ;
2743         status = TECDisposeConverter(m_MB2WC_converter);
2744         status = TECDisposeConverter(m_WC2MB_converter);
2745     }
2746
2747
2748     void Init( TextEncodingBase encoding)
2749     {
2750         OSStatus status = noErr ;
2751         m_char_encoding = encoding ;
2752         m_unicode_encoding = CreateTextEncoding(kTextEncodingUnicodeDefault, 0, kUnicode16BitFormat) ;
2753
2754         status = TECCreateConverter(&m_MB2WC_converter,
2755                                     m_char_encoding,
2756                                     m_unicode_encoding);
2757         status = TECCreateConverter(&m_WC2MB_converter,
2758                                     m_unicode_encoding,
2759                                     m_char_encoding);
2760     }
2761
2762     size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2763     {
2764         OSStatus status = noErr ;
2765         ByteCount byteOutLen ;
2766         ByteCount byteInLen = strlen(psz) + 1;
2767         wchar_t *tbuf = NULL ;
2768         UniChar* ubuf = NULL ;
2769         size_t res = 0 ;
2770
2771         if (buf == NULL)
2772         {
2773             // Apple specs say at least 32
2774             n = wxMax( 32, byteInLen ) ;
2775             tbuf = (wchar_t*) malloc( n * SIZEOF_WCHAR_T ) ;
2776         }
2777
2778         ByteCount byteBufferLen = n * sizeof( UniChar ) ;
2779
2780 #if SIZEOF_WCHAR_T == 4
2781         ubuf = (UniChar*) malloc( byteBufferLen + 2 ) ;
2782 #else
2783         ubuf = (UniChar*) (buf ? buf : tbuf) ;
2784 #endif
2785
2786         status = TECConvertText(
2787             m_MB2WC_converter, (ConstTextPtr) psz, byteInLen, &byteInLen,
2788             (TextPtr) ubuf, byteBufferLen, &byteOutLen);
2789
2790 #if SIZEOF_WCHAR_T == 4
2791         // we have to terminate here, because n might be larger for the trailing zero, and if UniChar
2792         // is not properly terminated we get random characters at the end
2793         ubuf[byteOutLen / sizeof( UniChar ) ] = 0 ;
2794         wxMBConvUTF16 converter ;
2795         res = converter.MB2WC( (buf ? buf : tbuf), (const char*)ubuf, n ) ;
2796         free( ubuf ) ;
2797 #else
2798         res = byteOutLen / sizeof( UniChar ) ;
2799 #endif
2800
2801         if ( buf == NULL )
2802              free(tbuf) ;
2803
2804         if ( buf  && res < n)
2805             buf[res] = 0;
2806
2807         return res ;
2808     }
2809
2810     size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const
2811     {
2812         OSStatus status = noErr ;
2813         ByteCount byteOutLen ;
2814         ByteCount byteInLen = wxWcslen(psz) * SIZEOF_WCHAR_T ;
2815
2816         char *tbuf = NULL ;
2817
2818         if (buf == NULL)
2819         {
2820             // Apple specs say at least 32
2821             n = wxMax( 32, ((byteInLen / SIZEOF_WCHAR_T) * 8) + SIZEOF_WCHAR_T );
2822             tbuf = (char*) malloc( n ) ;
2823         }
2824
2825         ByteCount byteBufferLen = n ;
2826         UniChar* ubuf = NULL ;
2827
2828 #if SIZEOF_WCHAR_T == 4
2829         wxMBConvUTF16 converter ;
2830         size_t unicharlen = converter.WC2MB( NULL, psz, 0 ) ;
2831         byteInLen = unicharlen ;
2832         ubuf = (UniChar*) malloc( byteInLen + 2 ) ;
2833         converter.WC2MB( (char*) ubuf, psz, unicharlen + 2 ) ;
2834 #else
2835         ubuf = (UniChar*) psz ;
2836 #endif
2837
2838         status = TECConvertText(
2839             m_WC2MB_converter, (ConstTextPtr) ubuf, byteInLen, &byteInLen,
2840             (TextPtr) (buf ? buf : tbuf), byteBufferLen, &byteOutLen);
2841
2842 #if SIZEOF_WCHAR_T == 4
2843         free( ubuf ) ;
2844 #endif
2845
2846         if ( buf == NULL )
2847             free(tbuf) ;
2848
2849         size_t res = byteOutLen ;
2850         if ( buf  && res < n)
2851         {
2852             buf[res] = 0;
2853
2854             //we need to double-trip to verify it didn't insert any ? in place
2855             //of bogus characters
2856             wxWCharBuffer wcBuf(n);
2857             size_t pszlen = wxWcslen(psz);
2858             if ( MB2WC(wcBuf.data(), buf, n) == wxCONV_FAILED ||
2859                         wxWcslen(wcBuf) != pszlen ||
2860                         memcmp(wcBuf, psz, pszlen * sizeof(wchar_t)) != 0 )
2861             {
2862                 // we didn't obtain the same thing we started from, hence
2863                 // the conversion was lossy and we consider that it failed
2864                 return wxCONV_FAILED;
2865             }
2866         }
2867
2868         return res ;
2869     }
2870
2871     virtual wxMBConv *Clone() const { return new wxMBConv_mac(*this); }
2872
2873     bool IsOk() const
2874         { return m_MB2WC_converter != NULL && m_WC2MB_converter != NULL; }
2875
private:
    // TEC converter from m_char_encoding to m_unicode_encoding, see Init()
    TECObjectRef m_MB2WC_converter;
    // TEC converter from m_unicode_encoding to m_char_encoding, see Init()
    TECObjectRef m_WC2MB_converter;

    // the multibyte encoding passed to Init()
    TextEncodingBase m_char_encoding;
    // the 16 bit Unicode encoding created in Init()
    TextEncodingBase m_unicode_encoding;
2882 };
2883
2884 #endif // defined(__WXMAC__) && defined(TARGET_CARBON)
2885
2886 // ============================================================================
2887 // wxEncodingConverter based conversion classes
2888 // ============================================================================
2889
2890 #if wxUSE_FONTMAP
2891
2892 class wxMBConv_wxwin : public wxMBConv
2893 {
2894 private:
2895     void Init()
2896     {
2897         m_ok = m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
2898                w2m.Init(wxFONTENCODING_UNICODE, m_enc);
2899     }
2900
2901 public:
2902     // temporarily just use wxEncodingConverter stuff,
2903     // so that it works while a better implementation is built
2904     wxMBConv_wxwin(const wxChar* name)
2905     {
2906         if (name)
2907             m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
2908         else
2909             m_enc = wxFONTENCODING_SYSTEM;
2910
2911         Init();
2912     }
2913
2914     wxMBConv_wxwin(wxFontEncoding enc)
2915     {
2916         m_enc = enc;
2917
2918         Init();
2919     }
2920
2921     size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
2922     {
2923         size_t inbuf = strlen(psz);
2924         if (buf)
2925         {
2926             if (!m2w.Convert(psz, buf))
2927                 return wxCONV_FAILED;
2928         }
2929         return inbuf;
2930     }
2931
2932     size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
2933     {
2934         const size_t inbuf = wxWcslen(psz);
2935         if (buf)
2936         {
2937             if (!w2m.Convert(psz, buf))
2938                 return wxCONV_FAILED;
2939         }
2940
2941         return inbuf;
2942     }
2943
2944     virtual size_t GetMBNulLen() const
2945     {
2946         switch ( m_enc )
2947         {
2948             case wxFONTENCODING_UTF16BE:
2949             case wxFONTENCODING_UTF16LE:
2950                 return 2;
2951
2952             case wxFONTENCODING_UTF32BE:
2953             case wxFONTENCODING_UTF32LE:
2954                 return 4;
2955
2956             default:
2957                 return 1;
2958         }
2959     }
2960
2961     virtual wxMBConv *Clone() const { return new wxMBConv_wxwin(m_enc); }
2962
2963     bool IsOk() const { return m_ok; }
2964
2965 public:
2966     wxFontEncoding m_enc;
2967     wxEncodingConverter m2w, w2m;
2968
2969 private:
2970     // were we initialized successfully?
2971     bool m_ok;
2972
2973     DECLARE_NO_COPY_CLASS(wxMBConv_wxwin)
2974 };
2975
2976 // make the constructors available for unit testing
2977 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_wxwin( const wxChar* name )
2978 {
2979     wxMBConv_wxwin* result = new wxMBConv_wxwin( name );
2980     if ( !result->IsOk() )
2981     {
2982         delete result;
2983         return 0;
2984     }
2985
2986     return result;
2987 }
2988
2989 #endif // wxUSE_FONTMAP
2990
2991 // ============================================================================
2992 // wxCSConv implementation
2993 // ============================================================================
2994
2995 void wxCSConv::Init()
2996 {
2997     m_name = NULL;
2998     m_convReal =  NULL;
2999     m_deferred = true;
3000 }
3001
3002 wxCSConv::wxCSConv(const wxChar *charset)
3003 {
3004     Init();
3005
3006     if ( charset )
3007     {
3008         SetName(charset);
3009     }
3010
3011 #if wxUSE_FONTMAP
3012     m_encoding = wxFontMapperBase::GetEncodingFromName(charset);
3013 #else
3014     m_encoding = wxFONTENCODING_SYSTEM;
3015 #endif
3016 }
3017
3018 wxCSConv::wxCSConv(wxFontEncoding encoding)
3019 {
3020     if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
3021     {
3022         wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
3023
3024         encoding = wxFONTENCODING_SYSTEM;
3025     }
3026
3027     Init();
3028
3029     m_encoding = encoding;
3030 }
3031
// Frees the charset name and the real converter, if any, by calling Clear().
wxCSConv::~wxCSConv()
{
    Clear();
}
3036
3037 wxCSConv::wxCSConv(const wxCSConv& conv)
3038         : wxMBConv()
3039 {
3040     Init();
3041
3042     SetName(conv.m_name);
3043     m_encoding = conv.m_encoding;
3044 }
3045
3046 wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
3047 {
3048     Clear();
3049
3050     SetName(conv.m_name);
3051     m_encoding = conv.m_encoding;
3052
3053     return *this;
3054 }
3055
3056 void wxCSConv::Clear()
3057 {
3058     free(m_name);
3059     delete m_convReal;
3060
3061     m_name = NULL;
3062     m_convReal = NULL;
3063 }
3064
3065 void wxCSConv::SetName(const wxChar *charset)
3066 {
3067     if (charset)
3068     {
3069         m_name = wxStrdup(charset);
3070         m_deferred = true;
3071     }
3072 }
3073
#if wxUSE_FONTMAP
#include "wx/hashmap.h"

// hash map type associating a charset name with a font encoding
WX_DECLARE_HASH_MAP( wxFontEncoding, wxString, wxIntegerHash, wxIntegerEqual,
                     wxEncodingNameCache );

// cache used by wxCSConv::DoCreate(): for each encoding it contains the name
// for which an iconv-based converter could be created or an empty string if
// none of the names of this encoding worked
static wxEncodingNameCache gs_nameCache;
#endif
3082
3083 wxMBConv *wxCSConv::DoCreate() const
3084 {
3085 #if wxUSE_FONTMAP
3086     wxLogTrace(TRACE_STRCONV,
3087                wxT("creating conversion for %s"),
3088                (m_name ? m_name
3089                        : wxFontMapperBase::GetEncodingName(m_encoding).c_str()));
3090 #endif // wxUSE_FONTMAP
3091
3092     // check for the special case of ASCII or ISO8859-1 charset: as we have
3093     // special knowledge of it anyhow, we don't need to create a special
3094     // conversion object
3095     if ( m_encoding == wxFONTENCODING_ISO8859_1 ||
3096             m_encoding == wxFONTENCODING_DEFAULT )
3097     {
3098         // don't convert at all
3099         return NULL;
3100     }
3101
3102     // we trust OS to do conversion better than we can so try external
3103     // conversion methods first
3104     //
3105     // the full order is:
3106     //      1. OS conversion (iconv() under Unix or Win32 API)
3107     //      2. hard coded conversions for UTF
3108     //      3. wxEncodingConverter as fall back
3109
3110     // step (1)
3111 #ifdef HAVE_ICONV
3112 #if !wxUSE_FONTMAP
3113     if ( m_name )
3114 #endif // !wxUSE_FONTMAP
3115     {
3116         wxString name(m_name);
3117         wxFontEncoding encoding(m_encoding);
3118
3119         if ( !name.empty() )
3120         {
3121             wxMBConv_iconv *conv = new wxMBConv_iconv(name);
3122             if ( conv->IsOk() )
3123                 return conv;
3124
3125             delete conv;
3126
3127 #if wxUSE_FONTMAP
3128             encoding =
3129                 wxFontMapperBase::Get()->CharsetToEncoding(name, false);
3130 #endif // wxUSE_FONTMAP
3131         }
3132 #if wxUSE_FONTMAP
3133         {
3134             const wxEncodingNameCache::iterator it = gs_nameCache.find(encoding);
3135             if ( it != gs_nameCache.end() )
3136             {
3137                 if ( it->second.empty() )
3138                     return NULL;
3139
3140                 wxMBConv_iconv *conv = new wxMBConv_iconv(it->second);
3141                 if ( conv->IsOk() )
3142                     return conv;
3143
3144                 delete conv;
3145             }
3146
3147             const wxChar** names = wxFontMapperBase::GetAllEncodingNames(encoding);
3148
3149             for ( ; *names; ++names )
3150             {
3151                 wxMBConv_iconv *conv = new wxMBConv_iconv(*names);
3152                 if ( conv->IsOk() )
3153                 {
3154                     gs_nameCache[encoding] = *names;
3155                     return conv;
3156                 }
3157
3158                 delete conv;
3159             }
3160
3161             gs_nameCache[encoding] = _T(""); // cache the failure
3162         }
3163 #endif // wxUSE_FONTMAP
3164     }
3165 #endif // HAVE_ICONV
3166
3167 #ifdef wxHAVE_WIN32_MB2WC
3168     {
3169 #if wxUSE_FONTMAP
3170         wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
3171                                       : new wxMBConv_win32(m_encoding);
3172         if ( conv->IsOk() )
3173             return conv;
3174
3175         delete conv;
3176 #else
3177         return NULL;
3178 #endif
3179     }
3180 #endif // wxHAVE_WIN32_MB2WC
3181
3182 #if defined(__WXMAC__)
3183     {
3184         // leave UTF16 and UTF32 to the built-ins of wx
3185         if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
3186             ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
3187         {
3188 #if wxUSE_FONTMAP
3189             wxMBConv_mac *conv = m_name ? new wxMBConv_mac(m_name)
3190                                         : new wxMBConv_mac(m_encoding);
3191 #else
3192             wxMBConv_mac *conv = new wxMBConv_mac(m_encoding);
3193 #endif
3194             if ( conv->IsOk() )
3195                  return conv;
3196
3197             delete conv;
3198         }
3199     }
3200 #endif
3201
3202 #if defined(__WXCOCOA__)
3203     {
3204         if ( m_name || ( m_encoding <= wxFONTENCODING_UTF16 ) )
3205         {
3206 #if wxUSE_FONTMAP
3207             wxMBConv_cocoa *conv = m_name ? new wxMBConv_cocoa(m_name)
3208                                           : new wxMBConv_cocoa(m_encoding);
3209 #else
3210             wxMBConv_cocoa *conv = new wxMBConv_cocoa(m_encoding);
3211 #endif
3212
3213             if ( conv->IsOk() )
3214                  return conv;
3215
3216             delete conv;
3217         }
3218     }
3219 #endif
3220     // step (2)
3221     wxFontEncoding enc = m_encoding;
3222 #if wxUSE_FONTMAP
3223     if ( enc == wxFONTENCODING_SYSTEM && m_name )
3224     {
3225         // use "false" to suppress interactive dialogs -- we can be called from
3226         // anywhere and popping up a dialog from here is the last thing we want to
3227         // do
3228         enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
3229     }
3230 #endif // wxUSE_FONTMAP
3231
3232     switch ( enc )
3233     {
3234         case wxFONTENCODING_UTF7:
3235              return new wxMBConvUTF7;
3236
3237         case wxFONTENCODING_UTF8:
3238              return new wxMBConvUTF8;
3239
3240         case wxFONTENCODING_UTF16BE:
3241              return new wxMBConvUTF16BE;
3242
3243         case wxFONTENCODING_UTF16LE:
3244              return new wxMBConvUTF16LE;
3245
3246         case wxFONTENCODING_UTF32BE:
3247              return new wxMBConvUTF32BE;
3248
3249         case wxFONTENCODING_UTF32LE:
3250              return new wxMBConvUTF32LE;
3251
3252         default:
3253              // nothing to do but put here to suppress gcc warnings
3254              break;
3255     }
3256
3257     // step (3)
3258 #if wxUSE_FONTMAP
3259     {
3260         wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
3261                                       : new wxMBConv_wxwin(m_encoding);
3262         if ( conv->IsOk() )
3263             return conv;
3264
3265         delete conv;
3266     }
3267 #endif // wxUSE_FONTMAP
3268
3269     // NB: This is a hack to prevent deadlock. What could otherwise happen
3270     //     in Unicode build: wxConvLocal creation ends up being here
3271     //     because of some failure and logs the error. But wxLog will try to
3272     //     attach timestamp, for which it will need wxConvLocal (to convert
3273     //     time to char* and then wchar_t*), but that fails, tries to log
3274     //     error, but wxLog has a (already locked) critical section that
3275     //     guards static buffer.
3276     static bool alreadyLoggingError = false;
3277     if (!alreadyLoggingError)
3278     {
3279         alreadyLoggingError = true;
3280         wxLogError(_("Cannot convert from the charset '%s'!"),
3281                    m_name ? m_name
3282                       :
3283 #if wxUSE_FONTMAP
3284                          wxFontMapperBase::GetEncodingDescription(m_encoding).c_str()
3285 #else // !wxUSE_FONTMAP
3286                          wxString::Format(_("encoding %s"), m_encoding).c_str()
3287 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
3288               );
3289
3290         alreadyLoggingError = false;
3291     }
3292
3293     return NULL;
3294 }
3295
3296 void wxCSConv::CreateConvIfNeeded() const
3297 {
3298     if ( m_deferred )
3299     {
3300         wxCSConv *self = (wxCSConv *)this; // const_cast
3301
3302 #if wxUSE_INTL
3303         // if we don't have neither the name nor the encoding, use the default
3304         // encoding for this system
3305         if ( !m_name && m_encoding == wxFONTENCODING_SYSTEM )
3306         {
3307             self->m_name = wxStrdup(wxLocale::GetSystemEncodingName());
3308         }
3309 #endif // wxUSE_INTL
3310
3311         self->m_convReal = DoCreate();
3312         self->m_deferred = false;
3313     }
3314 }
3315
3316 size_t wxCSConv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
3317 {
3318     CreateConvIfNeeded();
3319
3320     if (m_convReal)
3321         return m_convReal->MB2WC(buf, psz, n);
3322
3323     // latin-1 (direct)
3324     size_t len = strlen(psz);
3325
3326     if (buf)
3327     {
3328         for (size_t c = 0; c <= len; c++)
3329             buf[c] = (unsigned char)(psz[c]);
3330     }
3331
3332     return len;
3333 }
3334
3335 size_t wxCSConv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
3336 {
3337     CreateConvIfNeeded();
3338
3339     if (m_convReal)
3340         return m_convReal->WC2MB(buf, psz, n);
3341
3342     // latin-1 (direct)
3343     const size_t len = wxWcslen(psz);
3344     if (buf)
3345     {
3346         for (size_t c = 0; c <= len; c++)
3347         {
3348             if (psz[c] > 0xFF)
3349                 return wxCONV_FAILED;
3350
3351             buf[c] = (char)psz[c];
3352         }
3353     }
3354     else
3355     {
3356         for (size_t c = 0; c <= len; c++)
3357         {
3358             if (psz[c] > 0xFF)
3359                 return wxCONV_FAILED;
3360         }
3361     }
3362
3363     return len;
3364 }
3365
3366 size_t wxCSConv::GetMBNulLen() const
3367 {
3368     CreateConvIfNeeded();
3369
3370     if ( m_convReal )
3371     {
3372         return m_convReal->GetMBNulLen();
3373     }
3374
3375     return 1;
3376 }
3377
3378 // ----------------------------------------------------------------------------
3379 // globals
3380 // ----------------------------------------------------------------------------
3381
// the object behind wxConvLibc: its class depends on the platform
#ifdef __WINDOWS__
    static wxMBConv_win32 wxConvLibcObj;
#elif defined(__WXMAC__) && !defined(__MACH__)
    static wxMBConv_mac wxConvLibcObj ;
#else
    static wxMBConvLibc wxConvLibcObj;
#endif

// the objects behind the other predefined conversions
static wxCSConv wxConvLocalObj(wxFONTENCODING_SYSTEM);
static wxCSConv wxConvISO8859_1Obj(wxFONTENCODING_ISO8859_1);
static wxMBConvUTF7 wxConvUTF7Obj;
static wxMBConvUTF8 wxConvUTF8Obj;

// the exported references to the file-static objects defined above
WXDLLIMPEXP_DATA_BASE(wxMBConv&) wxConvLibc = wxConvLibcObj;
WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvLocal = wxConvLocalObj;
WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvISO8859_1 = wxConvISO8859_1Obj;
WXDLLIMPEXP_DATA_BASE(wxMBConvUTF7&) wxConvUTF7 = wxConvUTF7Obj;
WXDLLIMPEXP_DATA_BASE(wxMBConvUTF8&) wxConvUTF8 = wxConvUTF8Obj;

// the exported pointers: wxConvCurrent initially points to the libc
// conversion and wxConvUI to the local one
WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = &wxConvLibcObj;
WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvUI = &wxConvLocal;

// wxConvFileName points to the UTF-8 conversion if __WXOSX__ is defined and
// to the libc one otherwise
WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName = &
#ifdef __WXOSX__
                                    wxConvUTF8Obj;
#else
                                    wxConvLibcObj;
#endif
3408
3409 #else // !wxUSE_WCHAR_T
3410
// stand-ins in absence of wchar_t: the predefined conversions are plain
// wxMBConv objects in this case
WXDLLIMPEXP_DATA_BASE(wxMBConv) wxConvLibc,
                                wxConvISO8859_1,
                                wxConvLocal,
                                wxConvUTF8;
3416
3417 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T